From d4ba532e5afac1c6ef3ecba4a20a6c4d30b97d3e Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Wed, 8 Apr 2026 12:53:15 +0800 Subject: [PATCH 01/12] DAOS-18705 rebuild: stop refreshing aggregation epoch while rebuilding - stop refreshing aggregation epoch while rebuilding - set rebuilding flag before setting rebuild fence Signed-off-by: Liang Zhen --- src/container/srv_container.c | 8 +++++++- src/container/srv_target.c | 21 ++++++++++++--------- src/include/daos_srv/container.h | 4 ++-- src/include/daos_srv/pool.h | 10 ++++------ src/object/srv_obj.c | 2 +- src/rebuild/scan.c | 6 +++--- src/rebuild/srv.c | 7 ++++++- 7 files changed, 35 insertions(+), 23 deletions(-) diff --git a/src/container/srv_container.c b/src/container/srv_container.c index 8d8b40c2368..dc706df36d7 100644 --- a/src/container/srv_container.c +++ b/src/container/srv_container.c @@ -1945,6 +1945,12 @@ cont_refresh_track_eph_one(void *data) if (rc) return rc; + /* temporarily stop refreshing sc_ec_agg_eph_boundary when rebuild is running, + * so VOS aggregation can't progress to higher epoch. 
+ */ + if (ds_pool_is_rebuilding(cont_child->sc_pool->spc_pool)) + goto out; + D_DEBUG(DB_MD, DF_CONT": %s ec agg boundary eph "DF_X64"->"DF_X64", " ": %s stable eph "DF_X64"->"DF_X64"\n", DP_CONT(arg->pool_uuid, arg->cont_uuid), @@ -1967,7 +1973,7 @@ cont_refresh_track_eph_one(void *data) else rc = 0; } - +out: ds_cont_child_put(cont_child); return rc; } diff --git a/src/container/srv_target.c b/src/container/srv_target.c index fbc640283b3..4aa7c6a473d 100644 --- a/src/container/srv_target.c +++ b/src/container/srv_target.c @@ -210,7 +210,7 @@ cont_aggregate_runnable(struct ds_cont_child *cont, struct sched_request *req, if (ds_pool_is_rebuilding(pool) && !vos_agg) { D_DEBUG(DB_EPC, DF_CONT ": skip EC aggregation during rebuild %d, %d.\n", DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid), - atomic_load(&pool->sp_rebuilding), pool->sp_rebuild_scan); + atomic_load(&pool->sp_rebuilding), atomic_load(&pool->sp_rebuild_scanning)); return false; } @@ -331,16 +331,19 @@ cont_child_aggregate(struct ds_cont_child *cont, cont_aggregate_cb_t agg_cb, change_hlc = max(cont->sc_snapshot_delete_hlc, cont->sc_pool->spc_rebuild_end_hlc); - if (param->ap_full_scan_hlc < change_hlc) { - /* Snapshot has been deleted or rebuild happens since the last - * aggregation, let's restart from 0. - */ + + if (param->ap_epc_aggregated < cont->sc_snapshot_delete_hlc) { + /* Snapshot deleted: always full scan from epoch 0 */ epoch_min = 0; flags |= VOS_AGG_FL_FORCE_SCAN; - D_DEBUG(DB_EPC, "change hlc "DF_X64" > full "DF_X64"\n", - change_hlc, param->ap_full_scan_hlc); + D_DEBUG(DB_EPC, + "%s full scan (snap delete): change hlc " DF_X64 " > full " DF_X64 "\n", + param->ap_vos_agg ? 
"VOS" : "EC", change_hlc, param->ap_epc_aggregated); + } else { epoch_min = get_hae(cont, param->ap_vos_agg); + if (param->ap_epc_aggregated < cont->sc_pool->spc_rebuild_end_hlc) + flags |= VOS_AGG_FL_FORCE_SCAN; } if (unlikely(DAOS_FAIL_CHECK(DAOS_FORCE_EC_AGG) || @@ -463,8 +466,8 @@ cont_child_aggregate(struct ds_cont_child *cont, cont_aggregate_cb_t agg_cb, flags &= ~VOS_AGG_FL_FORCE_MERGE; rc = agg_cb(cont, &epoch_range, flags, param); out: - if (rc == 0 && epoch_min == 0) - param->ap_full_scan_hlc = hlc; + if (rc == 0) + param->ap_epc_aggregated = hlc; D_DEBUG(DB_EPC, DF_CONT "[%d]: Aggregating finished. %d\n", DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid), tgt_id, rc); diff --git a/src/include/daos_srv/container.h b/src/include/daos_srv/container.h index 0480e6603b6..b9c8a55c3c6 100644 --- a/src/include/daos_srv/container.h +++ b/src/include/daos_srv/container.h @@ -129,7 +129,7 @@ struct ds_cont_child { * VOS aggregation will use this boundary. We will optimize it later. */ uint64_t sc_ec_agg_eph_boundary; - /* The current EC aggregate epoch for this xstream */ + /* The local EC aggregation epoch for this xstream */ uint64_t sc_ec_agg_eph; /* Used by ds_cont_eph_report() to query the minimum ec_agg_eph and stable_eph * from all local VOS. 
@@ -160,7 +160,7 @@ struct ds_cont_child { struct agg_param { void *ap_data; struct ds_cont_child *ap_cont; - daos_epoch_t ap_full_scan_hlc; + daos_epoch_t ap_epc_aggregated; bool ap_vos_agg; }; diff --git a/src/include/daos_srv/pool.h b/src/include/daos_srv/pool.h index 3fbaae93810..28e059dd1e9 100644 --- a/src/include/daos_srv/pool.h +++ b/src/include/daos_srv/pool.h @@ -96,11 +96,8 @@ struct ds_pool { uint32_t sp_rebuild_gen; ATOMIC int sp_rebuilding; ATOMIC int sp_discarding; - /** - * someone has already messaged this pool to for rebuild scan, - * NB: all xstreams can do lockless-write on it but it's OK - */ - int sp_rebuild_scan; + /* someone has already messaged this pool to for rebuild scan */ + ATOMIC int sp_rebuild_scanning; int sp_discard_status; /** path to ephemeral metrics */ @@ -219,7 +216,8 @@ struct ds_pool_svc_op_val { static inline bool ds_pool_is_rebuilding(struct ds_pool *pool) { - return (atomic_load(&pool->sp_rebuilding) > 0 || pool->sp_rebuild_scan > 0); + return (atomic_load(&pool->sp_rebuilding) > 0 || + atomic_load(&pool->sp_rebuild_scanning) > 0); } /* encode metadata RPC operation key: HLC time first, in network order, for keys sorted by time. diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index 1b26846fbb1..e8c23393883 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -3439,7 +3439,7 @@ obj_local_enum(struct obj_io_context *ioc, crt_rpc_t *rpc, * by setting this flag. * NB: it's a lockess write to shared data structure and it's harmless. 
*/ - ioc->ioc_coc->sc_pool->spc_pool->sp_rebuild_scan = 1; + atomic_fetch_add(&ioc->ioc_coc->sc_pool->spc_pool->sp_rebuild_scanning, 1); flags = DTX_FOR_MIGRATION; } diff --git a/src/rebuild/scan.c b/src/rebuild/scan.c index ff3b6968825..0c299ff610f 100644 --- a/src/rebuild/scan.c +++ b/src/rebuild/scan.c @@ -1316,8 +1316,6 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) D_GOTO(out, rc); } - atomic_fetch_add(&rpt->rt_pool->sp_rebuilding, 1); /* reset in rebuild_tgt_fini */ - rpt_get(rpt); /* step-3: start scan leader */ rc = dss_ult_create(rebuild_scan_leader, rpt, DSS_XS_SELF, 0, 0, NULL); @@ -1331,8 +1329,10 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) tls->rebuild_pool_status = rc; if (rpt) { - if (rc) + if (rc) { + atomic_fetch_sub(&rpt->rt_pool->sp_rebuilding, 1); rpt_delete(rpt); + } rpt_put(rpt); } rout = crt_reply_get(rpc); diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index 0c3885486e9..9b9cc51b802 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -2900,7 +2900,9 @@ rebuild_tgt_fini(struct rebuild_tgt_pool_tracker *rpt) D_ASSERT(atomic_load(&rpt->rt_pool->sp_rebuilding) > 0); atomic_fetch_sub(&rpt->rt_pool->sp_rebuilding, 1); - rpt->rt_pool->sp_rebuild_scan = 0; + + D_ASSERT(atomic_load(&rpt->rt_pool->sp_rebuild_scanning) > 0); + atomic_store(&rpt->rt_pool->sp_rebuild_scanning, 0); ABT_mutex_lock(rpt->rt_lock); ABT_cond_signal(rpt->rt_global_dtx_wait_cond); @@ -3213,6 +3215,8 @@ rebuild_tgt_prepare(crt_rpc_t *rpc, struct rebuild_tgt_pool_tracker **p_rpt) DL_ERROR(rc, DF_RB " cannot find pool", DP_RB_RSI(rsi)); return rc; } + /* must set rebuild flag before setting rt_rebuild_fence, it's reset in rebuild_tgt_fini */ + atomic_fetch_add(&pool->sp_rebuilding, 1); if (ds_pool_get_version(pool) < rsi->rsi_rebuild_ver) { D_INFO(DF_RB " map %u < rsi_rebuild_ver %u\n", DP_RB_RSI(rsi), @@ -3299,6 +3303,7 @@ rebuild_tgt_prepare(crt_rpc_t *rpc, struct rebuild_tgt_pool_tracker **p_rpt) } rpt_put(rpt); } + atomic_fetch_sub(&pool->sp_rebuilding, 1); 
ds_pool_put(pool); } daos_prop_fini(&prop); From 00dfa69f49f9f7033aaad0356e72a72c8e109cf7 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Wed, 8 Apr 2026 10:53:07 +0000 Subject: [PATCH 02/12] refine codes Signed-off-by: Wang Shilong --- src/container/srv_container.c | 8 +---- src/container/srv_target.c | 59 ++++++++------------------------ src/include/daos_srv/container.h | 2 +- src/include/daos_srv/pool.h | 11 +----- src/rebuild/rebuild_internal.h | 8 +++-- src/rebuild/srv.c | 34 +++++++----------- 6 files changed, 36 insertions(+), 86 deletions(-) diff --git a/src/container/srv_container.c b/src/container/srv_container.c index dc706df36d7..8d8b40c2368 100644 --- a/src/container/srv_container.c +++ b/src/container/srv_container.c @@ -1945,12 +1945,6 @@ cont_refresh_track_eph_one(void *data) if (rc) return rc; - /* temporarily stop refreshing sc_ec_agg_eph_boundary when rebuild is running, - * so VOS aggregation can't progress to higher epoch. - */ - if (ds_pool_is_rebuilding(cont_child->sc_pool->spc_pool)) - goto out; - D_DEBUG(DB_MD, DF_CONT": %s ec agg boundary eph "DF_X64"->"DF_X64", " ": %s stable eph "DF_X64"->"DF_X64"\n", DP_CONT(arg->pool_uuid, arg->cont_uuid), @@ -1973,7 +1967,7 @@ cont_refresh_track_eph_one(void *data) else rc = 0; } -out: + ds_cont_child_put(cont_child); return rc; } diff --git a/src/container/srv_target.c b/src/container/srv_target.c index 4aa7c6a473d..b4471ee926b 100644 --- a/src/container/srv_target.c +++ b/src/container/srv_target.c @@ -319,8 +319,7 @@ cont_child_aggregate(struct ds_cont_child *cont, cont_aggregate_cb_t agg_cb, daos_epoch_t epoch_max, epoch_min; daos_epoch_range_t epoch_range; struct sched_request *req = cont2req(cont, param->ap_vos_agg); - uint64_t hlc = d_hlc_get(); - uint64_t change_hlc; + uint64_t hlc = d_hlc_get(); uint64_t interval; uint64_t snapshots_local[MAX_SNAPSHOT_LOCAL] = { 0 }; uint64_t *snapshots = NULL; @@ -329,21 +328,16 @@ cont_child_aggregate(struct ds_cont_child *cont, cont_aggregate_cb_t 
agg_cb, uint32_t flags = 0; int i, rc = 0; - change_hlc = max(cont->sc_snapshot_delete_hlc, - cont->sc_pool->spc_rebuild_end_hlc); - - if (param->ap_epc_aggregated < cont->sc_snapshot_delete_hlc) { - /* Snapshot deleted: always full scan from epoch 0 */ + if (param->ap_full_scan_hlc < cont->sc_snapshot_delete_hlc) { + /* Snapshot has been deleted since the last + * aggregation, let's restart from 0. + */ epoch_min = 0; flags |= VOS_AGG_FL_FORCE_SCAN; - D_DEBUG(DB_EPC, - "%s full scan (snap delete): change hlc " DF_X64 " > full " DF_X64 "\n", - param->ap_vos_agg ? "VOS" : "EC", change_hlc, param->ap_epc_aggregated); - + D_DEBUG(DB_EPC, "snapshot del hlc " DF_X64 " > full " DF_X64 "\n", + cont->sc_snapshot_delete_hlc, param->ap_full_scan_hlc); } else { epoch_min = get_hae(cont, param->ap_vos_agg); - if (param->ap_epc_aggregated < cont->sc_pool->spc_rebuild_end_hlc) - flags |= VOS_AGG_FL_FORCE_SCAN; } if (unlikely(DAOS_FAIL_CHECK(DAOS_FORCE_EC_AGG) || @@ -381,41 +375,18 @@ cont_child_aggregate(struct ds_cont_child *cont, cont_aggregate_cb_t agg_cb, D_DEBUG(DB_EPC, "hlc "DF_X64" epoch "DF_X64"/"DF_X64" agg max "DF_X64"\n", hlc, epoch_max, epoch_min, cont->sc_aggregation_max); - if (cont->sc_snapshots_nr + 1 < MAX_SNAPSHOT_LOCAL) { + if (cont->sc_snapshots_nr < MAX_SNAPSHOT_LOCAL) { snapshots = snapshots_local; } else { - D_ALLOC(snapshots, (cont->sc_snapshots_nr + 1) * - sizeof(daos_epoch_t)); + D_ALLOC(snapshots, cont->sc_snapshots_nr * sizeof(daos_epoch_t)); if (snapshots == NULL) return -DER_NOMEM; } - if (cont->sc_pool->spc_rebuild_fence != 0) { - uint64_t rebuild_fence = cont->sc_pool->spc_rebuild_fence; - int j; - int insert_idx; - - /* insert rebuild_fetch into the snapshot list */ - D_DEBUG(DB_EPC, "rebuild fence "DF_X64"\n", rebuild_fence); - for (j = 0, insert_idx = 0; j < cont->sc_snapshots_nr; j++) { - if (cont->sc_snapshots[j] < rebuild_fence) { - snapshots[j] = cont->sc_snapshots[j]; - insert_idx++; - } else { - snapshots[j + 1] = cont->sc_snapshots[j]; - 
} - } - snapshots[insert_idx] = rebuild_fence; - snapshots_nr = cont->sc_snapshots_nr + 1; - } else { - /* Since sc_snapshots might be freed by other ULT, let's - * always copy here. - */ - snapshots_nr = cont->sc_snapshots_nr; - if (snapshots_nr > 0) - memcpy(snapshots, cont->sc_snapshots, - snapshots_nr * sizeof(daos_epoch_t)); - } + /* Since sc_snapshots might be freed by other ULT, let's always copy here. */ + snapshots_nr = cont->sc_snapshots_nr; + if (snapshots_nr > 0) + memcpy(snapshots, cont->sc_snapshots, snapshots_nr * sizeof(daos_epoch_t)); /* Find highest snapshot less than last aggregated epoch. */ for (i = 0; i < snapshots_nr && snapshots[i] < epoch_min; ++i) @@ -466,8 +437,8 @@ cont_child_aggregate(struct ds_cont_child *cont, cont_aggregate_cb_t agg_cb, flags &= ~VOS_AGG_FL_FORCE_MERGE; rc = agg_cb(cont, &epoch_range, flags, param); out: - if (rc == 0) - param->ap_epc_aggregated = hlc; + if (rc == 0 && epoch_min == 0) + param->ap_full_scan_hlc = hlc; D_DEBUG(DB_EPC, DF_CONT "[%d]: Aggregating finished. %d\n", DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid), tgt_id, rc); diff --git a/src/include/daos_srv/container.h b/src/include/daos_srv/container.h index b9c8a55c3c6..f7c84d807e1 100644 --- a/src/include/daos_srv/container.h +++ b/src/include/daos_srv/container.h @@ -160,7 +160,7 @@ struct ds_cont_child { struct agg_param { void *ap_data; struct ds_cont_child *ap_cont; - daos_epoch_t ap_epc_aggregated; + daos_epoch_t ap_full_scan_hlc; bool ap_vos_agg; }; diff --git a/src/include/daos_srv/pool.h b/src/include/daos_srv/pool.h index 28e059dd1e9..63b68c3b48c 100644 --- a/src/include/daos_srv/pool.h +++ b/src/include/daos_srv/pool.h @@ -175,16 +175,7 @@ struct ds_pool_child { d_list_t spc_cont_list; d_list_t spc_srv_cont_hdl; /* Single server cont handle */ - /* The current maxim rebuild epoch, (0 if there is no rebuild), so - * vos aggregation can not cross this epoch during rebuild to avoid - * interfering rebuild process. 
- */ - uint64_t spc_rebuild_fence; - - /* The HLC when current rebuild ends, which will be used to compare - * with the aggregation full scan start HLC to know whether the - * aggregation needs to be restarted from 0. */ - uint64_t spc_rebuild_end_hlc; + uint64_t spc_rebuild_start; uint32_t spc_map_version; int spc_ref; ABT_eventual spc_ref_eventual; diff --git a/src/rebuild/rebuild_internal.h b/src/rebuild/rebuild_internal.h index b95cd81e479..458b6e709c5 100644 --- a/src/rebuild/rebuild_internal.h +++ b/src/rebuild/rebuild_internal.h @@ -78,10 +78,12 @@ struct rebuild_tgt_pool_tracker { /* Only used by reclaim job to discard those half-rebuild data */ uint64_t rt_reclaim_epoch; - /* local rebuild epoch mainly to constrain the VOS aggregation - * to make sure aggregation will not cross the epoch + /* + * XX: remove this. + * rebuild_fini_one() compare this value against rt_rebuild_start to + * decide whether this rebuild still owns this vos pool's rebuild. */ - uint64_t rt_rebuild_fence; + uint64_t rt_rebuild_start; uint32_t rt_leader_rank; diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index 9b9cc51b802..3e87d5d0cf2 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -2869,20 +2869,15 @@ rebuild_fini_one(void *arg) if (dpc == NULL) return 0; - /* Reset rebuild epoch, then reset the aggregation epoch, so - * it can aggregate the rebuild epoch. 
- */ - D_ASSERT(rpt->rt_rebuild_fence != 0); - if (rpt->rt_rebuild_fence == dpc->spc_rebuild_fence) { - dpc->spc_rebuild_fence = 0; - dpc->spc_rebuild_end_hlc = d_hlc_get(); - D_DEBUG(DB_REBUILD, DF_RB ": Reset aggregation end hlc " DF_U64 "\n", - DP_RB_RPT(rpt), dpc->spc_rebuild_end_hlc); + D_ASSERT(rpt->rt_rebuild_start != 0); + if (rpt->rt_rebuild_start == dpc->spc_rebuild_start) { + dpc->spc_rebuild_start = 0; + D_DEBUG(DB_REBUILD, DF_RB ": Reset rebuild start epoch\n", DP_RB_RPT(rpt)); } else { D_DEBUG(DB_REBUILD, - DF_RB ": pool is still being rebuilt rt_rebuild_fence " DF_U64 - " spc_rebuild_fence " DF_U64 "\n", - DP_RB_RPT(rpt), rpt->rt_rebuild_fence, dpc->spc_rebuild_fence); + DF_RB ": pool is still being rebuilt rt_rebuild_start " DF_U64 + " spc_rebuild_start " DF_U64 "\n", + DP_RB_RPT(rpt), rpt->rt_rebuild_start, dpc->spc_rebuild_start); } ds_pool_child_put(dpc); @@ -3129,13 +3124,10 @@ rebuild_prepare_one(void *data) D_ASSERT(dss_get_module_info()->dmi_xs_id != 0); - /* Set the rebuild epoch per VOS container, so VOS aggregation will not - * cross the epoch to cause problem. 
- */ - D_ASSERT(rpt->rt_rebuild_fence != 0); - dpc->spc_rebuild_fence = rpt->rt_rebuild_fence; + D_ASSERT(rpt->rt_rebuild_start != 0); + dpc->spc_rebuild_start = rpt->rt_rebuild_start; D_DEBUG(DB_REBUILD, DF_RB " open local container " DF_UUID " rebuild eph " DF_X64 "\n", - DP_RB_RPT(rpt), DP_UUID(rpt->rt_coh_uuid), rpt->rt_rebuild_fence); + DP_RB_RPT(rpt), DP_UUID(rpt->rt_coh_uuid), rpt->rt_rebuild_start); put: ds_pool_child_put(dpc); @@ -3215,7 +3207,7 @@ rebuild_tgt_prepare(crt_rpc_t *rpc, struct rebuild_tgt_pool_tracker **p_rpt) DL_ERROR(rc, DF_RB " cannot find pool", DP_RB_RSI(rsi)); return rc; } - /* must set rebuild flag before setting rt_rebuild_fence, it's reset in rebuild_tgt_fini */ + /* must set rebuild flag before yield, it's reset in rebuild_tgt_fini */ atomic_fetch_add(&pool->sp_rebuilding, 1); if (ds_pool_get_version(pool) < rsi->rsi_rebuild_ver) { @@ -3279,12 +3271,12 @@ rebuild_tgt_prepare(crt_rpc_t *rpc, struct rebuild_tgt_pool_tracker **p_rpt) if (pool_tls == NULL) D_GOTO(out, rc = -DER_NOMEM); - rpt->rt_rebuild_fence = d_hlc_get(); + rpt->rt_rebuild_start = d_hlc_get(); rc = ds_pool_task_collective(rpt->rt_pool_uuid, PO_COMP_ST_NEW | PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT, rebuild_prepare_one, rpt, 0); if (rc) { - rpt->rt_rebuild_fence = 0; + rpt->rt_rebuild_start = 0; rebuild_pool_tls_destroy(pool_tls); D_GOTO(out, rc); } From 171ecd4efa9791686588ebfc64bd2abf8526f294 Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Thu, 9 Apr 2026 21:24:15 +0800 Subject: [PATCH 03/12] DAOS: code cleanup Signed-off-by: Liang Zhen --- src/container/srv_target.c | 6 +++--- src/rebuild/rebuild_internal.h | 3 ++- src/rebuild/scan.c | 29 ++++++++++++++++++++--------- src/rebuild/srv.c | 25 ++++++------------------- 4 files changed, 31 insertions(+), 32 deletions(-) diff --git a/src/container/srv_target.c b/src/container/srv_target.c index b4471ee926b..3bc53d2ede1 100644 --- a/src/container/srv_target.c +++ b/src/container/srv_target.c @@ -375,16 +375,16 @@ 
cont_child_aggregate(struct ds_cont_child *cont, cont_aggregate_cb_t agg_cb, D_DEBUG(DB_EPC, "hlc "DF_X64" epoch "DF_X64"/"DF_X64" agg max "DF_X64"\n", hlc, epoch_max, epoch_min, cont->sc_aggregation_max); - if (cont->sc_snapshots_nr < MAX_SNAPSHOT_LOCAL) { + snapshots_nr = cont->sc_snapshots_nr; + if (snapshots_nr < MAX_SNAPSHOT_LOCAL) { snapshots = snapshots_local; } else { - D_ALLOC(snapshots, cont->sc_snapshots_nr * sizeof(daos_epoch_t)); + D_ALLOC(snapshots, snapshots_nr * sizeof(daos_epoch_t)); if (snapshots == NULL) return -DER_NOMEM; } /* Since sc_snapshots might be freed by other ULT, let's always copy here. */ - snapshots_nr = cont->sc_snapshots_nr; if (snapshots_nr > 0) memcpy(snapshots, cont->sc_snapshots, snapshots_nr * sizeof(daos_epoch_t)); diff --git a/src/rebuild/rebuild_internal.h b/src/rebuild/rebuild_internal.h index 458b6e709c5..c4cc64a041a 100644 --- a/src/rebuild/rebuild_internal.h +++ b/src/rebuild/rebuild_internal.h @@ -374,7 +374,8 @@ void rebuild_tgt_status_check_ult(void *arg); int -rebuild_tgt_prepare(crt_rpc_t *rpc, struct rebuild_tgt_pool_tracker **p_rpt); +rebuild_tgt_prepare(struct ds_pool *pool, struct rebuild_scan_in *rsi, + struct rebuild_tgt_pool_tracker **p_rpt); bool rebuild_status_match(struct rebuild_tgt_pool_tracker *rpt, diff --git a/src/rebuild/scan.c b/src/rebuild/scan.c index 0c299ff610f..66a714ec8a5 100644 --- a/src/rebuild/scan.c +++ b/src/rebuild/scan.c @@ -1201,6 +1201,7 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) struct rebuild_scan_out *rout; struct rebuild_pool_tls *tls = NULL; struct rebuild_tgt_pool_tracker *rpt = NULL; + struct ds_pool *pool = NULL; int rc; rsi = crt_req_get(rpc); @@ -1208,6 +1209,13 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) D_INFO(DF_RB "\n", DP_RB_RSI(rsi)); + rc = ds_pool_lookup(rsi->rsi_pool_uuid, &pool); + if (rc) { + DL_ERROR(rc, DF_RB " cannot find pool", DP_RB_RSI(rsi)); + D_GOTO(out, rc); + } + atomic_fetch_add(&pool->sp_rebuilding, 1); + /* If PS leader has been changed, and 
rebuild version is also increased * due to adding new failure targets for rebuild, let's abort previous * rebuild. @@ -1304,7 +1312,7 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) if (daos_fail_check(DAOS_REBUILD_TGT_START_FAIL)) D_GOTO(out, rc = -DER_INVAL); - rc = rebuild_tgt_prepare(rpc, &rpt); + rc = rebuild_tgt_prepare(pool, rsi, &rpt); if (rc) D_GOTO(out, rc); @@ -1325,16 +1333,19 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) } out: - if (tls && tls->rebuild_pool_status == 0 && rc != 0) - tls->rebuild_pool_status = rc; - - if (rpt) { - if (rc) { - atomic_fetch_sub(&rpt->rt_pool->sp_rebuilding, 1); + if (rc != 0) { + if (tls && tls->rebuild_pool_status == 0) + tls->rebuild_pool_status = rc; + if (pool) + atomic_fetch_sub(&pool->sp_rebuilding, 1); + if (rpt) rpt_delete(rpt); - } - rpt_put(rpt); } + if (pool) + ds_pool_put(pool); + if (rpt) + rpt_put(rpt); + rout = crt_reply_get(rpc); rout->rso_status = rc; rout->rso_stable_epoch = d_hlc_get(); diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index 3e87d5d0cf2..16ad609f9b4 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -3189,10 +3189,9 @@ rpt_create(struct ds_pool *pool, uint32_t master_rank, uint32_t pm_ver, * each target get the scan rpc from the master. 
*/ int -rebuild_tgt_prepare(crt_rpc_t *rpc, struct rebuild_tgt_pool_tracker **p_rpt) +rebuild_tgt_prepare(struct ds_pool *pool, struct rebuild_scan_in *rsi, + struct rebuild_tgt_pool_tracker **p_rpt) { - struct rebuild_scan_in *rsi = crt_req_get(rpc); - struct ds_pool *pool; struct rebuild_tgt_pool_tracker *rpt = NULL; struct rebuild_pool_tls *pool_tls; daos_prop_t prop = { 0 }; @@ -3202,14 +3201,6 @@ rebuild_tgt_prepare(crt_rpc_t *rpc, struct rebuild_tgt_pool_tracker **p_rpt) D_DEBUG(DB_REBUILD, DF_RB " prepare rebuild\n", DP_RB_RSI(rsi)); - rc = ds_pool_lookup(rsi->rsi_pool_uuid, &pool); - if (rc) { - DL_ERROR(rc, DF_RB " cannot find pool", DP_RB_RSI(rsi)); - return rc; - } - /* must set rebuild flag before yield, it's reset in rebuild_tgt_fini */ - atomic_fetch_add(&pool->sp_rebuilding, 1); - if (ds_pool_get_version(pool) < rsi->rsi_rebuild_ver) { D_INFO(DF_RB " map %u < rsi_rebuild_ver %u\n", DP_RB_RSI(rsi), ds_pool_get_version(pool), rsi->rsi_rebuild_ver); @@ -3287,16 +3278,12 @@ rebuild_tgt_prepare(crt_rpc_t *rpc, struct rebuild_tgt_pool_tracker **p_rpt) *p_rpt = rpt; out: - if (rc) { - if (rpt) { - if (!d_list_empty(&rpt->rt_list)) { - rpt_delete(rpt); - rpt_put(rpt); - } + if (rc && rpt) { + if (!d_list_empty(&rpt->rt_list)) { + rpt_delete(rpt); rpt_put(rpt); } - atomic_fetch_sub(&pool->sp_rebuilding, 1); - ds_pool_put(pool); + rpt_put(rpt); } daos_prop_fini(&prop); From cddcc5d6a51865f12440ff125d8d635df40a7769 Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Fri, 10 Apr 2026 20:49:53 +0800 Subject: [PATCH 04/12] DAOS: remove unused epoch Signed-off-by: Liang Zhen --- src/container/srv_target.c | 5 +---- src/include/daos_srv/pool.h | 1 - src/rebuild/rebuild_internal.h | 9 +-------- src/rebuild/srv.c | 27 +++++++-------------------- 4 files changed, 9 insertions(+), 33 deletions(-) diff --git a/src/container/srv_target.c b/src/container/srv_target.c index 3bc53d2ede1..de4910f537d 100644 --- a/src/container/srv_target.c +++ b/src/container/srv_target.c @@ 
-392,10 +392,7 @@ cont_child_aggregate(struct ds_cont_child *cont, cont_aggregate_cb_t agg_cb, for (i = 0; i < snapshots_nr && snapshots[i] < epoch_min; ++i) ; - if (i == 0) - epoch_range.epr_lo = 0; - else - epoch_range.epr_lo = snapshots[i - 1] + 1; + epoch_range.epr_lo = epoch_min != 0 ? epoch_min + 1 : 0; if (epoch_range.epr_lo >= epoch_max) D_GOTO(free, rc = 0); diff --git a/src/include/daos_srv/pool.h b/src/include/daos_srv/pool.h index 63b68c3b48c..9e1763d2dde 100644 --- a/src/include/daos_srv/pool.h +++ b/src/include/daos_srv/pool.h @@ -175,7 +175,6 @@ struct ds_pool_child { d_list_t spc_cont_list; d_list_t spc_srv_cont_hdl; /* Single server cont handle */ - uint64_t spc_rebuild_start; uint32_t spc_map_version; int spc_ref; ABT_eventual spc_ref_eventual; diff --git a/src/rebuild/rebuild_internal.h b/src/rebuild/rebuild_internal.h index c4cc64a041a..2b8ff817be3 100644 --- a/src/rebuild/rebuild_internal.h +++ b/src/rebuild/rebuild_internal.h @@ -77,14 +77,7 @@ struct rebuild_tgt_pool_tracker { uint64_t rt_stable_epoch; /* Only used by reclaim job to discard those half-rebuild data */ - uint64_t rt_reclaim_epoch; - /* - * XX: remove this. - * rebuild_fini_one() compare this value against rt_rebuild_start to - * decide whether this rebuild still owns this vos pool's rebuild. 
- */ - uint64_t rt_rebuild_start; - + uint64_t rt_reclaim_epoch; uint32_t rt_leader_rank; /* Global dtx resync version */ diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index 16ad609f9b4..f524b16447a 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -2869,17 +2869,8 @@ rebuild_fini_one(void *arg) if (dpc == NULL) return 0; - D_ASSERT(rpt->rt_rebuild_start != 0); - if (rpt->rt_rebuild_start == dpc->spc_rebuild_start) { - dpc->spc_rebuild_start = 0; - D_DEBUG(DB_REBUILD, DF_RB ": Reset rebuild start epoch\n", DP_RB_RPT(rpt)); - } else { - D_DEBUG(DB_REBUILD, - DF_RB ": pool is still being rebuilt rt_rebuild_start " DF_U64 - " spc_rebuild_start " DF_U64 "\n", - DP_RB_RPT(rpt), rpt->rt_rebuild_start, dpc->spc_rebuild_start); - } - + D_DEBUG(DB_REBUILD, DF_RB ": rebuild fini for stable epoch " DF_U64 "\n", DP_RB_RPT(rpt), + rpt->rt_stable_epoch); ds_pool_child_put(dpc); return 0; } @@ -3124,10 +3115,8 @@ rebuild_prepare_one(void *data) D_ASSERT(dss_get_module_info()->dmi_xs_id != 0); - D_ASSERT(rpt->rt_rebuild_start != 0); - dpc->spc_rebuild_start = rpt->rt_rebuild_start; - D_DEBUG(DB_REBUILD, DF_RB " open local container " DF_UUID " rebuild eph " DF_X64 "\n", - DP_RB_RPT(rpt), DP_UUID(rpt->rt_coh_uuid), rpt->rt_rebuild_start); + D_DEBUG(DB_REBUILD, DF_RB " open local container " DF_UUID " stable eph " DF_X64 "\n", + DP_RB_RPT(rpt), DP_UUID(rpt->rt_coh_uuid), rpt->rt_stable_epoch); put: ds_pool_child_put(dpc); @@ -3262,12 +3251,10 @@ rebuild_tgt_prepare(struct ds_pool *pool, struct rebuild_scan_in *rsi, if (pool_tls == NULL) D_GOTO(out, rc = -DER_NOMEM); - rpt->rt_rebuild_start = d_hlc_get(); - rc = ds_pool_task_collective(rpt->rt_pool_uuid, - PO_COMP_ST_NEW | PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT, - rebuild_prepare_one, rpt, 0); + rc = ds_pool_task_collective(rpt->rt_pool_uuid, + PO_COMP_ST_NEW | PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT, + rebuild_prepare_one, rpt, 0); if (rc) { - rpt->rt_rebuild_start = 0; rebuild_pool_tls_destroy(pool_tls); D_GOTO(out, rc); } 
From 57e7d4195e48bf2d3e380d55e82eefb09e4fd804 Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Sat, 11 Apr 2026 00:05:12 +0800 Subject: [PATCH 05/12] DAOS: fix the ds_pool refcount Signed-off-by: Liang Zhen --- src/object/srv_obj.c | 2 +- src/rebuild/scan.c | 4 ++-- src/rebuild/srv.c | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index e8c23393883..ff71575fc5e 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -3439,7 +3439,7 @@ obj_local_enum(struct obj_io_context *ioc, crt_rpc_t *rpc, * by setting this flag. * NB: it's a lockess write to shared data structure and it's harmless. */ - atomic_fetch_add(&ioc->ioc_coc->sc_pool->spc_pool->sp_rebuild_scanning, 1); + atomic_store(&ioc->ioc_coc->sc_pool->spc_pool->sp_rebuild_scanning, 1); flags = DTX_FOR_MIGRATION; } diff --git a/src/rebuild/scan.c b/src/rebuild/scan.c index 66a714ec8a5..667bc2c462f 100644 --- a/src/rebuild/scan.c +++ b/src/rebuild/scan.c @@ -1336,10 +1336,10 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) if (rc != 0) { if (tls && tls->rebuild_pool_status == 0) tls->rebuild_pool_status = rc; - if (pool) - atomic_fetch_sub(&pool->sp_rebuilding, 1); if (rpt) rpt_delete(rpt); + else if (pool) /* otherwise rpt_put() will decrease this for me */ + atomic_fetch_sub(&pool->sp_rebuilding, 1); } if (pool) ds_pool_put(pool); diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index f524b16447a..bce129d9dbb 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -3260,6 +3260,7 @@ rebuild_tgt_prepare(struct ds_pool *pool, struct rebuild_scan_in *rsi, } ABT_mutex_lock(rpt->rt_lock); + ds_pool_get(pool); rpt->rt_pool = pool; /* pin it */ ABT_mutex_unlock(rpt->rt_lock); From a39765808412f1e561d4a12e07c82044d1759842 Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Sat, 11 Apr 2026 11:42:42 +0800 Subject: [PATCH 06/12] DAOS: remove false assertion Signed-off-by: Liang Zhen --- src/container/srv_target.c | 2 +- src/include/daos_srv/pool.h | 9 
++++----- src/object/srv_obj.c | 3 +-- src/rebuild/srv.c | 3 +-- 4 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/container/srv_target.c b/src/container/srv_target.c index de4910f537d..1cc68865af8 100644 --- a/src/container/srv_target.c +++ b/src/container/srv_target.c @@ -210,7 +210,7 @@ cont_aggregate_runnable(struct ds_cont_child *cont, struct sched_request *req, if (ds_pool_is_rebuilding(pool) && !vos_agg) { D_DEBUG(DB_EPC, DF_CONT ": skip EC aggregation during rebuild %d, %d.\n", DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid), - atomic_load(&pool->sp_rebuilding), atomic_load(&pool->sp_rebuild_scanning)); + atomic_load(&pool->sp_rebuilding), atomic_load(&pool->sp_rebuild_enum)); return false; } diff --git a/src/include/daos_srv/pool.h b/src/include/daos_srv/pool.h index 9e1763d2dde..cb1fe8d1eb5 100644 --- a/src/include/daos_srv/pool.h +++ b/src/include/daos_srv/pool.h @@ -94,10 +94,10 @@ struct ds_pool { * rebuild job. */ uint32_t sp_rebuild_gen; - ATOMIC int sp_rebuilding; ATOMIC int sp_discarding; - /* someone has already messaged this pool to for rebuild scan */ - ATOMIC int sp_rebuild_scanning; + ATOMIC int sp_rebuilding; + /* someone has already messaged this pool to for rebuild object/key enumeration */ + ATOMIC int sp_rebuild_enum; int sp_discard_status; /** path to ephemeral metrics */ @@ -206,8 +206,7 @@ struct ds_pool_svc_op_val { static inline bool ds_pool_is_rebuilding(struct ds_pool *pool) { - return (atomic_load(&pool->sp_rebuilding) > 0 || - atomic_load(&pool->sp_rebuild_scanning) > 0); + return (atomic_load(&pool->sp_rebuilding) > 0 || atomic_load(&pool->sp_rebuild_enum) > 0); } /* encode metadata RPC operation key: HLC time first, in network order, for keys sorted by time. 
diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index ff71575fc5e..7a1cf6228db 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -3437,9 +3437,8 @@ obj_local_enum(struct obj_io_context *ioc, crt_rpc_t *rpc, if (oei->oei_flags & ORF_FOR_MIGRATION) { /* just in case ds_pool::sp_rebuilding is not set, pause my local EC aggregation * by setting this flag. - * NB: it's a lockess write to shared data structure and it's harmless. */ - atomic_store(&ioc->ioc_coc->sc_pool->spc_pool->sp_rebuild_scanning, 1); + atomic_store(&ioc->ioc_coc->sc_pool->spc_pool->sp_rebuild_enum, 1); flags = DTX_FOR_MIGRATION; } diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index bce129d9dbb..cb44b164fae 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -2887,8 +2887,7 @@ rebuild_tgt_fini(struct rebuild_tgt_pool_tracker *rpt) D_ASSERT(atomic_load(&rpt->rt_pool->sp_rebuilding) > 0); atomic_fetch_sub(&rpt->rt_pool->sp_rebuilding, 1); - D_ASSERT(atomic_load(&rpt->rt_pool->sp_rebuild_scanning) > 0); - atomic_store(&rpt->rt_pool->sp_rebuild_scanning, 0); + atomic_store(&rpt->rt_pool->sp_rebuild_enum, 0); ABT_mutex_lock(rpt->rt_lock); ABT_cond_signal(rpt->rt_global_dtx_wait_cond); From 510c9ddf742efe3ea5f547d9ab9ab3c1c3d95bd9 Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Sun, 12 Apr 2026 22:54:44 +0800 Subject: [PATCH 07/12] DAOS: more code cleanup Signed-off-by: Liang Zhen --- src/rebuild/scan.c | 22 +++++++++++++--------- src/rebuild/srv.c | 20 ++++++++++++-------- 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/src/rebuild/scan.c b/src/rebuild/scan.c index 667bc2c462f..8b61b5a27d2 100644 --- a/src/rebuild/scan.c +++ b/src/rebuild/scan.c @@ -1202,6 +1202,7 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) struct rebuild_pool_tls *tls = NULL; struct rebuild_tgt_pool_tracker *rpt = NULL; struct ds_pool *pool = NULL; + bool checker = false; int rc; rsi = crt_req_get(rpc); @@ -1323,6 +1324,7 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) rpt_put(rpt); 
D_GOTO(out, rc); } + checker = true; rpt_get(rpt); /* step-3: start scan leader */ @@ -1333,18 +1335,20 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) } out: - if (rc != 0) { - if (tls && tls->rebuild_pool_status == 0) - tls->rebuild_pool_status = rc; - if (rpt) - rpt_delete(rpt); - else if (pool) /* otherwise rpt_put() will decrease this for me */ + if (rc != 0 && tls && tls->rebuild_pool_status == 0) + tls->rebuild_pool_status = rc; + + if (pool) { + if (!checker) atomic_fetch_sub(&pool->sp_rebuilding, 1); - } - if (pool) ds_pool_put(pool); - if (rpt) + } + + if (rpt) { + if (!checker) + rpt_delete(rpt); rpt_put(rpt); + } rout = crt_reply_get(rpc); rout->rso_status = rc; diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index cb44b164fae..5e7d3cfa34d 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -374,6 +374,8 @@ static void rpt_insert(struct rebuild_tgt_pool_tracker *rpt) { D_ASSERT(dss_get_module_info()->dmi_xs_id == 0); + + rpt_get(rpt); ABT_rwlock_wrlock(rebuild_gst.rg_ttl_rwlock); d_list_add(&rpt->rt_list, &rebuild_gst.rg_tgt_tracker_list); ABT_rwlock_unlock(rebuild_gst.rg_ttl_rwlock); @@ -382,10 +384,17 @@ rpt_insert(struct rebuild_tgt_pool_tracker *rpt) void rpt_delete(struct rebuild_tgt_pool_tracker *rpt) { + bool decref = false; + D_ASSERT(dss_get_module_info()->dmi_xs_id == 0); ABT_rwlock_wrlock(rebuild_gst.rg_ttl_rwlock); - d_list_del_init(&rpt->rt_list); + if (!d_list_empty(&rpt->rt_list)) { + d_list_del_init(&rpt->rt_list); + decref = true; + } ABT_rwlock_unlock(rebuild_gst.rg_ttl_rwlock); + if (decref) + rpt_put(rpt); } struct rebuild_tgt_pool_tracker * @@ -2920,7 +2929,6 @@ rebuild_tgt_fini(struct rebuild_tgt_pool_tracker *rpt) /* No one should access rpt after rebuild_fini_one. 
*/ D_INFO(DF_RB " Finalized rebuild\n", DP_RB_RPT(rpt)); rpt_delete(rpt); - rpt_put(rpt); } void @@ -3079,8 +3087,8 @@ rebuild_tgt_status_check_ult(void *arg) sched_req_put(rpt->rt_ult); rpt->rt_ult = NULL; out: - rpt_put(rpt); rebuild_tgt_fini(rpt); + rpt_put(rpt); } /** @@ -3223,7 +3231,6 @@ rebuild_tgt_prepare(struct ds_pool *pool, struct rebuild_scan_in *rsi, /* Let's add the rpt to the tracker list before IV fetch, which might yield, * to make sure the new coming request can find the rpt in the list. */ - rpt_get(rpt); rpt_insert(rpt); rc = ds_pool_iv_srv_hdl_fetch(pool, &rpt->rt_poh_uuid, &rpt->rt_coh_uuid); @@ -3266,10 +3273,7 @@ rebuild_tgt_prepare(struct ds_pool *pool, struct rebuild_scan_in *rsi, *p_rpt = rpt; out: if (rc && rpt) { - if (!d_list_empty(&rpt->rt_list)) { - rpt_delete(rpt); - rpt_put(rpt); - } + rpt_delete(rpt); rpt_put(rpt); } daos_prop_fini(&prop); From b797371637e94a5b68de6dfa7e64bd93248cd512 Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Mon, 13 Apr 2026 10:08:06 +0800 Subject: [PATCH 08/12] DAOS: clang format fix Signed-off-by: Liang Zhen --- src/rebuild/scan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rebuild/scan.c b/src/rebuild/scan.c index 8b61b5a27d2..845bece1b9a 100644 --- a/src/rebuild/scan.c +++ b/src/rebuild/scan.c @@ -1201,7 +1201,7 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) struct rebuild_scan_out *rout; struct rebuild_pool_tls *tls = NULL; struct rebuild_tgt_pool_tracker *rpt = NULL; - struct ds_pool *pool = NULL; + struct ds_pool *pool = NULL; bool checker = false; int rc; From c14b94741b31fdc93411be472a37a1692c87c37c Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Mon, 13 Apr 2026 15:35:42 +0800 Subject: [PATCH 09/12] DAOS: revert the epoch range change Signed-off-by: Liang Zhen --- src/container/srv_target.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/container/srv_target.c b/src/container/srv_target.c index 058f7e1d6a9..aa4d49b6bb6 100644 --- 
a/src/container/srv_target.c +++ b/src/container/srv_target.c @@ -392,7 +392,10 @@ cont_child_aggregate(struct ds_cont_child *cont, cont_aggregate_cb_t agg_cb, for (i = 0; i < snapshots_nr && snapshots[i] < epoch_min; ++i) ; - epoch_range.epr_lo = epoch_min != 0 ? epoch_min + 1 : 0; + if (i == 0) + epoch_range.epr_lo = 0; + else + epoch_range.epr_lo = snapshots[i - 1] + 1; if (epoch_range.epr_lo >= epoch_max) D_GOTO(free, rc = 0); From 4915ab80c2552c83f2bf76ebb4c5061e19374369 Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Wed, 15 Apr 2026 17:19:43 +0800 Subject: [PATCH 10/12] DAOS: fix the wait condition in rebuild_tgt_fini Signed-off-by: Liang Zhen --- src/rebuild/srv.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index 8868f973ef4..4c7877b97b8 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -385,17 +385,14 @@ rpt_insert(struct rebuild_tgt_pool_tracker *rpt) void rpt_delete(struct rebuild_tgt_pool_tracker *rpt) { - bool decref = false; - D_ASSERT(dss_get_module_info()->dmi_xs_id == 0); + D_ASSERT(!d_list_empty(&rpt->rt_list)); + ABT_rwlock_wrlock(rebuild_gst.rg_ttl_rwlock); - if (!d_list_empty(&rpt->rt_list)) { - d_list_del_init(&rpt->rt_list); - decref = true; - } + d_list_del_init(&rpt->rt_list); ABT_rwlock_unlock(rebuild_gst.rg_ttl_rwlock); - if (decref) - rpt_put(rpt); + + rpt_put(rpt); } struct rebuild_tgt_pool_tracker * @@ -2904,14 +2901,13 @@ rebuild_tgt_fini(struct rebuild_tgt_pool_tracker *rpt) D_ASSERT(rpt->rt_refcount > 0); rpt->rt_finishing = 1; /* Wait until all ult/tasks finish and release the rpt. - * NB: Because rebuild_tgt_fini will be only called in - * rebuild_tgt_status_check_ult, which will make sure when - * rt_refcount reaches to 1, either all rebuild is done or - * all ult/task has been aborted by rt_abort, i.e. no new - * ULT/task will be created after this check. So it is safe - * to destroy the rpt after this. 
+ * NB: Because rebuild_tgt_fini is only called in rebuild_tgt_status_check_ult, + * which will make sure when rt_refcount reaches 2 (one by check ULT, the other by + * track list), either all rebuild is done or all ult/task has been aborted by rt_abort, + * i.e. no new ULT/task will be created after this check. So it is safe to destroy + * the rpt after this. */ - if (rpt->rt_refcount > 1) + if (rpt->rt_refcount > 2) ABT_cond_wait(rpt->rt_fini_cond, rpt->rt_lock); ABT_mutex_unlock(rpt->rt_lock); From 82a7a0f1bbfdd4dab7ff7010bc769d2f3011a849 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Thu, 16 Apr 2026 01:40:20 +0000 Subject: [PATCH 11/12] Fix rpt wakeup and assertion Signed-off-by: Wang Shilong --- src/rebuild/scan.c | 35 +++++++++++++++++------------------ src/rebuild/srv.c | 2 +- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/src/rebuild/scan.c b/src/rebuild/scan.c index 55d705707b1..f989e5102cb 100644 --- a/src/rebuild/scan.c +++ b/src/rebuild/scan.c @@ -1266,7 +1266,7 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) rc = ds_pool_lookup(rsi->rsi_pool_uuid, &pool); if (rc) { DL_ERROR(rc, DF_RB " cannot find pool", DP_RB_RSI(rsi)); - D_GOTO(out, rc); + D_GOTO(out_put, rc); } atomic_fetch_add(&pool->sp_rebuilding, 1); @@ -1299,7 +1299,7 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) if (rpt != NULL && rpt->rt_rebuild_op == rsi->rsi_rebuild_op) { if (rpt->rt_global_done) { D_WARN("previous not cleaned up yet " DF_RBF "\n", DP_RBF_RPT(rpt)); - D_GOTO(out, rc = -DER_BUSY); + D_GOTO(out_put, rc = -DER_BUSY); } /* Rebuild should never skip the version */ @@ -1332,7 +1332,7 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) * an old or same leader. 
*/ if (rsi->rsi_leader_term <= rpt->rt_leader_term) - D_GOTO(out, rc = 0); + D_GOTO(out_put, rc = 0); if (rpt->rt_leader_rank != rsi->rsi_master_rank) { D_DEBUG(DB_REBUILD, "new leader existing " DF_RBF "-> req " DF_RBF "\n", @@ -1349,7 +1349,7 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) rpt->rt_leader_term = rsi->rsi_leader_term; - D_GOTO(out, rc = 0); + D_GOTO(out_put, rc = 0); } else if (rpt != NULL) { rpt_put(rpt); rpt = NULL; @@ -1360,22 +1360,22 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) rsi->rsi_rebuild_gen); if (tls != NULL) { D_WARN("previous not cleaned up yet " DF_RBF, DP_RBF_RSI(rsi)); - D_GOTO(out, rc = -DER_BUSY); + D_GOTO(out_delete, rc = -DER_BUSY); } if (daos_fail_check(DAOS_REBUILD_TGT_START_FAIL)) - D_GOTO(out, rc = -DER_INVAL); + D_GOTO(out_delete, rc = -DER_INVAL); rc = rebuild_tgt_prepare(pool, rsi, &rpt); if (rc) - D_GOTO(out, rc); + D_GOTO(out_delete, rc); rpt_get(rpt); rc = dss_ult_create(rebuild_tgt_status_check_ult, rpt, DSS_XS_SELF, 0, DSS_DEEP_STACK_SZ, NULL); if (rc) { rpt_put(rpt); - D_GOTO(out, rc); + D_GOTO(out_delete, rc); } checker = true; @@ -1384,24 +1384,23 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) rc = dss_ult_create(rebuild_scan_leader, rpt, DSS_XS_SELF, 0, 0, NULL); if (rc != 0) { rpt_put(rpt); - D_GOTO(out, rc); + D_GOTO(out_delete, rc); } -out: - if (rc != 0 && tls && tls->rebuild_pool_status == 0) - tls->rebuild_pool_status = rc; +out_delete: + if (rpt && !checker) + rpt_delete(rpt); +out_put: + if (rpt) + rpt_put(rpt); if (pool) { if (!checker) atomic_fetch_sub(&pool->sp_rebuilding, 1); ds_pool_put(pool); } - - if (rpt) { - if (!checker) - rpt_delete(rpt); - rpt_put(rpt); - } + if (rc != 0 && tls && tls->rebuild_pool_status == 0) + tls->rebuild_pool_status = rc; rout = crt_reply_get(rpc); rout->rso_status = rc; diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index 4c7877b97b8..580f21b9338 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -1552,7 +1552,7 @@ rpt_put(struct rebuild_tgt_pool_tracker *rpt) 
D_ASSERT(rpt->rt_refcount >= 0); D_DEBUG(DB_REBUILD, DF_RB ": rpt %p ref %d finishing %d\n", DP_RB_RPT(rpt), rpt, rpt->rt_refcount, rpt->rt_finishing); - if (rpt->rt_refcount == 1 && rpt->rt_finishing) + if (rpt->rt_refcount == 2 && rpt->rt_finishing) ABT_cond_signal(rpt->rt_fini_cond); zombie = (rpt->rt_refcount == 0); ABT_mutex_unlock(rpt->rt_lock); From 9e128f4ef7bfc85fa26dc43d2f999219ea555609 Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Fri, 17 Apr 2026 12:03:52 +0800 Subject: [PATCH 12/12] DAOS: code cleanup Signed-off-by: Liang Zhen --- src/rebuild/srv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index 580f21b9338..61eb444911d 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -1552,7 +1552,7 @@ rpt_put(struct rebuild_tgt_pool_tracker *rpt) D_ASSERT(rpt->rt_refcount >= 0); D_DEBUG(DB_REBUILD, DF_RB ": rpt %p ref %d finishing %d\n", DP_RB_RPT(rpt), rpt, rpt->rt_refcount, rpt->rt_finishing); - if (rpt->rt_refcount == 2 && rpt->rt_finishing) + if (rpt->rt_finishing) ABT_cond_signal(rpt->rt_fini_cond); zombie = (rpt->rt_refcount == 0); ABT_mutex_unlock(rpt->rt_lock); @@ -2907,7 +2907,7 @@ rebuild_tgt_fini(struct rebuild_tgt_pool_tracker *rpt) * i.e. no new ULT/task will be created after this check. So it is safe to destroy * the rpt after this. */ - if (rpt->rt_refcount > 2) + while (rpt->rt_refcount > 2) ABT_cond_wait(rpt->rt_fini_cond, rpt->rt_lock); ABT_mutex_unlock(rpt->rt_lock);