From e84a9af599632a7c480f7d7d6732d15c19381d78 Mon Sep 17 00:00:00 2001 From: Fan Yong Date: Sat, 18 Apr 2026 10:22:06 +0800 Subject: [PATCH 1/3] DAOS-18690 vos: handle DTX commit under space pressure If we cannot normally allocate space to hold committed DTX table, then release some old DTX entries from current container to hold new committed ones. The patch also preallocates some space for TX snapshots. Related logic, such as DTX commit and maybe GC, will switch to emergency mode and use the preallocated buffer in case of space pressure. Signed-off-by: Fan Yong --- src/common/mem.c | 92 +++++++-- src/dtx/dtx_common.c | 28 +++ src/dtx/dtx_internal.h | 5 +- src/dtx/dtx_rpc.c | 4 +- src/dtx/dtx_srv.c | 19 +- src/include/daos/common.h | 1 + src/include/daos/dtx.h | 4 +- src/include/daos/mem.h | 38 ++++ src/object/srv_obj.c | 16 +- src/tests/suite/daos_base_tx.c | 79 ++++++++ src/vos/vos_dtx.c | 335 ++++++++++++++++++++++++--------- src/vos/vos_layout.h | 11 +- src/vos/vos_pool.c | 10 +- 13 files changed, 505 insertions(+), 137 deletions(-) diff --git a/src/common/mem.c b/src/common/mem.c index e473cb8bfe5..7b2c7c24d19 100644 --- a/src/common/mem.c +++ b/src/common/mem.c @@ -997,6 +997,56 @@ pmem_defer_free(struct umem_instance *umm, umem_off_t off, void *act) pmemobj_defer_free(pop, id, (struct pobj_action *)act); } +static void +pmem_tx_set_failure_behavior(enum umem_tx_failure_behavior behavior) +{ + switch (behavior) { + case TX_FAILURE_ABORT: + pmemobj_tx_set_failure_behavior(POBJ_TX_FAILURE_ABORT); + break; + case TX_FAILURE_RETURN: + pmemobj_tx_set_failure_behavior(POBJ_TX_FAILURE_RETURN); + break; + default: + D_ASSERTF(0, "Unknown TX failure behavior %d\n", behavior); + } +} + +static int +pmem_tx_get_failure_behavior(void) +{ + enum pobj_tx_failure_behavior behavior; + + behavior = pmemobj_tx_get_failure_behavior(); + + switch (behavior) { + case POBJ_TX_FAILURE_ABORT: + return TX_FAILURE_ABORT; + case POBJ_TX_FAILURE_RETURN: + return TX_FAILURE_RETURN; + 
default: + D_ERROR("Unknown TX failure behavior %d\n", behavior); + return -DER_INVAL; + } +} + +static int +pmem_tx_set_snapbuf(struct umem_instance *umm, umem_off_t snapbuf, size_t size) +{ + void *buf = umem_off2ptr(umm, snapbuf); + int rc; + + rc = pmemobj_tx_log_append_buffer(TX_LOG_TYPE_SNAPSHOT, buf, size); + if (rc != 0) + return rc; + + rc = pmemobj_tx_log_auto_alloc(TX_LOG_TYPE_SNAPSHOT, 0); + if (rc != 0) + return rc; + + return 0; +} + static int pmem_tx_stage(void) { @@ -1135,28 +1185,30 @@ umem_tx_add_cb(struct umem_instance *umm, struct umem_tx_stage_data *txd, return 0; } -static umem_ops_t pmem_ops = { - .mo_tx_free = pmem_tx_free, - .mo_tx_alloc = pmem_tx_alloc, - .mo_tx_add = pmem_tx_add, - .mo_tx_xadd = pmem_tx_xadd, - .mo_tx_add_ptr = pmem_tx_add_ptr, - .mo_tx_abort = pmem_tx_abort, - .mo_tx_begin = pmem_tx_begin, - .mo_tx_commit = pmem_tx_commit, - .mo_tx_stage = pmem_tx_stage, - .mo_reserve = pmem_reserve, - .mo_defer_free = pmem_defer_free, - .mo_cancel = pmem_cancel, - .mo_tx_publish = pmem_tx_publish, - .mo_atomic_copy = pmem_atomic_copy, - .mo_atomic_alloc = pmem_atomic_alloc, - .mo_atomic_free = pmem_atomic_free, - .mo_atomic_flush = pmem_atomic_flush, - .mo_tx_add_callback = umem_tx_add_cb, +static umem_ops_t pmem_ops = { + .mo_tx_free = pmem_tx_free, + .mo_tx_alloc = pmem_tx_alloc, + .mo_tx_add = pmem_tx_add, + .mo_tx_xadd = pmem_tx_xadd, + .mo_tx_add_ptr = pmem_tx_add_ptr, + .mo_tx_abort = pmem_tx_abort, + .mo_tx_begin = pmem_tx_begin, + .mo_tx_commit = pmem_tx_commit, + .mo_tx_set_failure_behavior = pmem_tx_set_failure_behavior, + .mo_tx_get_failure_behavior = pmem_tx_get_failure_behavior, + .mo_tx_set_snapbuf = pmem_tx_set_snapbuf, + .mo_tx_stage = pmem_tx_stage, + .mo_reserve = pmem_reserve, + .mo_defer_free = pmem_defer_free, + .mo_cancel = pmem_cancel, + .mo_tx_publish = pmem_tx_publish, + .mo_atomic_copy = pmem_atomic_copy, + .mo_atomic_alloc = pmem_atomic_alloc, + .mo_atomic_free = pmem_atomic_free, + .mo_atomic_flush = 
pmem_atomic_flush, + .mo_tx_add_callback = umem_tx_add_cb, }; - /** BMEM operations (depends on dav) */ static int diff --git a/src/dtx/dtx_common.c b/src/dtx/dtx_common.c index 169c2b2d6a2..d27963b5bf8 100644 --- a/src/dtx/dtx_common.c +++ b/src/dtx/dtx_common.c @@ -1032,6 +1032,50 @@ dtx_sub_init(struct dtx_handle *dth, daos_unit_oid_t *oid, uint64_t dkey_hash) return rc; } +/** + * Commit DTX entries in batches of at most DTX_YIELD_CYCLE per vos_dtx_commit() call. + * + * On -DER_NOSPACE or -DER_OVERFLOW the batch size is halved and the same range is retried; + * any other error, or such an error once the batch is down to one entry, is returned as is. + * + * \param[in] coh Container handle. + * \param[in] dtis The DTX identifiers to commit. + * \param[in] cnt Number of entries in \a dtis. + * \param[in] keep_act Forwarded to vos_dtx_commit(). + * \param rm_cos Optional array indexed like \a dtis, forwarded to vos_dtx_commit(); may be NULL. + * + * \return Sum of the vos_dtx_commit() results on success, negative value on error. + */ +int +dtx_commit_large(daos_handle_t coh, struct dtx_id *dtis, int cnt, bool keep_act, bool *rm_cos) +{ + int step = DTX_YIELD_CYCLE; + int committed = 0; + int rc = 0; + int i = 0; + + while (i < cnt) { + if (i + step > cnt) + step = cnt - i; + + /* rm_cos is indexed in parallel with dtis, so it must advance with the batch. */ + rc = vos_dtx_commit(coh, dtis + i, step, keep_act, + rm_cos != NULL ? rm_cos + i : NULL); + if (rc >= 0) { + committed += rc; + i += step; + } else { + if ((rc != -DER_NOSPACE && rc != -DER_OVERFLOW) || step <= 1) + return rc; + + /* If out of space, reduce TX size and retry. */ + step >>= 1; + } + } + + return committed; +} + /** * Prepare the leader DTX handle in DRAM. * diff --git a/src/dtx/dtx_internal.h b/src/dtx/dtx_internal.h index 1298b4b350a..49fafa83523 100644 --- a/src/dtx/dtx_internal.h +++ b/src/dtx/dtx_internal.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2019-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -101,7 +101,7 @@ CRT_RPC_DECLARE(dtx, DAOS_ISEQ_DTX, DAOS_OSEQ_DTX); CRT_RPC_DECLARE(dtx_coll, DAOS_ISEQ_COLL_DTX, DAOS_OSEQ_COLL_DTX); -#define DTX_YIELD_CYCLE (DTX_THRESHOLD_COUNT >> 3) +#define DTX_YIELD_CYCLE DTX_PIGGYBACK_COUNT /* The count threshold (per pool) for triggering DTX aggregation. 
*/ #define DTX_AGG_THD_CNT_MAX (1 << 24) @@ -263,6 +263,7 @@ extern btr_ops_t dtx_btr_cos_ops; /* dtx_common.c */ int dtx_handle_reinit(struct dtx_handle *dth); void dtx_batched_commit(void *arg); +int dtx_commit_large(daos_handle_t coh, struct dtx_id *dtis, int cnt, bool keep_act, bool *rm_cos); void dtx_aggregation_main(void *arg); int start_dtx_reindex_ult(struct ds_cont_child *cont); void dtx_merge_check_result(int *tgt, int src); diff --git a/src/dtx/dtx_rpc.c b/src/dtx/dtx_rpc.c index ec5b3ec016d..d63f0fe26e6 100644 --- a/src/dtx/dtx_rpc.c +++ b/src/dtx/dtx_rpc.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2019-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -879,7 +879,7 @@ dtx_commit(struct ds_cont_child *cont, struct dtx_entry **dtes, * the DTX entries (in the dtis) as "PARTIAL_COMMITTED" and re-commit them later. * It is harmless to re-commit the DTX that has ever been committed. */ - rc1 = vos_dtx_commit(cont->sc_hdl, dca.dca_dtis, count, rc != 0, rm_cos); + rc1 = dtx_commit_large(cont->sc_hdl, dca.dca_dtis, count, rc != 0, rm_cos); if (rc1 > 0) { dra->dra_committed += rc1; rc1 = 0; diff --git a/src/dtx/dtx_srv.c b/src/dtx/dtx_srv.c index 5b68645cf54..67ebe8b885b 100644 --- a/src/dtx/dtx_srv.c +++ b/src/dtx/dtx_srv.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2019-2024 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -196,19 +196,12 @@ dtx_handler(crt_rpc_t *rpc) if (unlikely(din->di_epoch == 1)) D_GOTO(out, rc = -DER_IO); - while (i < din->di_dtx_array.ca_count) { - if (i + count > din->di_dtx_array.ca_count) - count = din->di_dtx_array.ca_count - i; + rc1 = dtx_commit_large(cont->sc_hdl, (struct dtx_id *)(din->di_dtx_array.ca_arrays), + din->di_dtx_array.ca_count, false, NULL); + if (rc1 < 0) + D_GOTO(out, rc = rc1); - dtis = (struct dtx_id *)din->di_dtx_array.ca_arrays + i; - rc1 = vos_dtx_commit(cont->sc_hdl, dtis, count, false, NULL); - if (rc1 > 0) - committed += rc1; - else if (rc == 0 && rc1 < 0) - rc = rc1; - - i += count; - } + committed += rc1; if (din->di_flags.ca_count > 0) flags = din->di_flags.ca_arrays; diff --git a/src/include/daos/common.h b/src/include/daos/common.h index b1b8ba73b0b..5d09a0c50f9 100644 --- a/src/include/daos/common.h +++ b/src/include/daos/common.h @@ -860,6 +860,7 @@ enum { #define DAOS_OBJ_COLL_SPARSE (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x4d) #define DAOS_DTX_RESEND_NONLEADER (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x4e) +#define DAOS_DTX_NOSPACE_NOREFRESH (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x4f) #define DAOS_NVME_FAULTY (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x50) #define DAOS_NVME_WRITE_ERR (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x51) diff --git a/src/include/daos/dtx.h b/src/include/daos/dtx.h index b91e5a1084c..db175503646 100644 --- a/src/include/daos/dtx.h +++ b/src/include/daos/dtx.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2019-2023 Intel Corporation. 
- * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -25,6 +25,8 @@ */ #define DTX_THRESHOLD_COUNT (1 << 9) +#define DTX_PIGGYBACK_COUNT (1 << 6) + /* The time (in second) threshold for batched DTX commit. */ #define DTX_COMMIT_THRESHOLD_AGE 10 diff --git a/src/include/daos/mem.h b/src/include/daos/mem.h index d451f26704d..ee8e40a42c6 100644 --- a/src/include/daos/mem.h +++ b/src/include/daos/mem.h @@ -190,6 +190,11 @@ struct umem_pool { struct umem_slab_desc up_slabs[0]; }; +enum umem_tx_failure_behavior { + TX_FAILURE_ABORT, + TX_FAILURE_RETURN, +}; + #ifdef DAOS_PMEM_BUILD #define UMEM_CACHE_PAGE_SZ_SHIFT 24 /* 16MB */ #define UMEM_CACHE_PAGE_SZ (1 << UMEM_CACHE_PAGE_SZ_SHIFT) @@ -750,7 +755,14 @@ typedef struct { /** commit memory transaction */ int (*mo_tx_commit)(struct umem_instance *umm, void *data); + /** set TX_FAILURE_ABORT or TX_FAILURE_RETURN when hit failure during TX. */ + void (*mo_tx_set_failure_behavior)(enum umem_tx_failure_behavior behavior); + + /** query the failure behavior for current TX. */ + int (*mo_tx_get_failure_behavior)(void); #ifdef DAOS_PMEM_BUILD + /** Set emergency buffer for transaction snapshot */ + int (*mo_tx_set_snapbuf)(struct umem_instance *umm, umem_off_t snap_buf, size_t size); /** get TX stage */ int (*mo_tx_stage)(void); @@ -1074,12 +1086,42 @@ umem_tx_end(struct umem_instance *umm, int err) return umem_tx_end_ex(umm, err, NULL); } +/** + * Set the failure behavior (TX_FAILURE_ABORT or TX_FAILURE_RETURN) used when a failure + * is hit during the TX. No-op when the backend has no mo_tx_set_failure_behavior hook. + */ +static inline void +umem_tx_set_failure_behavior(struct umem_instance *umm, enum umem_tx_failure_behavior behavior) +{ + if (umm->umm_ops->mo_tx_set_failure_behavior) + umm->umm_ops->mo_tx_set_failure_behavior(behavior); +} + +static inline int +umem_tx_get_failure_behavior(struct umem_instance *umm) +{ + if (umm->umm_ops->mo_tx_get_failure_behavior) + return umm->umm_ops->mo_tx_get_failure_behavior(); + else + /* Abort TX on failure by default. 
*/ + return TX_FAILURE_ABORT; +} + #ifdef DAOS_PMEM_BUILD bool umem_tx_inprogress(struct umem_instance *umm); bool umem_tx_none(struct umem_instance *umm); int umem_tx_errno(int err); +static inline int +umem_tx_set_snapbuf(struct umem_instance *umm, umem_off_t snap_buf, size_t size) +{ + if (umm->umm_ops->mo_tx_set_snapbuf) + return umm->umm_ops->mo_tx_set_snapbuf(umm, snap_buf, size); + else + return 0; +} + static inline int umem_tx_stage(struct umem_instance *umm) { diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index 29083dd02da..ae1ecec3876 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -2093,14 +2093,18 @@ obj_local_rw_internal_wrap(crt_rpc_t *rpc, struct obj_io_context *ioc, struct dt static int obj_local_rw(crt_rpc_t *rpc, struct obj_io_context *ioc, struct dtx_handle *dth) { - struct obj_rw_in *orw = crt_req_get(rpc); - struct dtx_share_peer *dsp; - uint32_t retry = 0; - int rc; + struct obj_rw_in *orw = crt_req_get(rpc); + struct dtx_share_peer *dsp; + uint32_t retry = 0; + uint32_t opc = opc_get(rpc->cr_opc); + int rc; again: rc = obj_local_rw_internal_wrap(rpc, ioc, dth); if (dth != NULL && obj_dtx_need_refresh(dth, rc)) { + if (opc == DAOS_OBJ_RPC_FETCH && DAOS_FAIL_CHECK(DAOS_DTX_NOSPACE_NOREFRESH)) + return -DER_NONEXIST; + if (++retry < 3) { rc = dtx_refresh(dth, ioc->ioc_coc); if (rc == 0) @@ -3166,7 +3170,7 @@ ds_obj_rw_handler(crt_rpc_t *rpc) */ D_FREE(dti_cos); dti_cos_cnt = dtx_cos_get_piggyback(ioc.ioc_coc, &orw->orw_oid, orw->orw_dkey_hash, - DTX_THRESHOLD_COUNT, &dti_cos); + DTX_PIGGYBACK_COUNT, &dti_cos); if (dti_cos_cnt < 0) D_GOTO(out, rc = dti_cos_cnt); @@ -4073,7 +4077,7 @@ ds_obj_punch_handler(crt_rpc_t *rpc) */ D_FREE(dti_cos); dti_cos_cnt = dtx_cos_get_piggyback(ioc.ioc_coc, &opi->opi_oid, opi->opi_dkey_hash, - DTX_THRESHOLD_COUNT, &dti_cos); + DTX_PIGGYBACK_COUNT, &dti_cos); if (dti_cos_cnt < 0) D_GOTO(out, rc = dti_cos_cnt); diff --git a/src/tests/suite/daos_base_tx.c b/src/tests/suite/daos_base_tx.c 
index 0c6bd80d71c..410f14cbf4e 100644 --- a/src/tests/suite/daos_base_tx.c +++ b/src/tests/suite/daos_base_tx.c @@ -1024,6 +1024,83 @@ dtx_23(void **state) ioreq_fini(&req); } +#define TSIZE 8 + +static void +dtx_24(void **state) +{ + test_arg_t *arg = *state; + const char *dkey = dts_dtx_dkey; + char akey[TSIZE] = {0}; + char wbuf[TSIZE]; + char rbuf[TSIZE]; + daos_obj_id_t oid; + struct ioreq req; + char i; + char j; + + FAULT_INJECTION_REQUIRED(); + + print_message("DTX24: DTX commit under space pressure\n"); + + if (!test_runable(arg, dts_dtx_replica_cnt)) + return; + + oid = daos_test_oid_gen(arg->coh, dts_dtx_class, 0, 0, arg->myrank); + ioreq_init(&req, arg->coh, oid, DAOS_IOD_ARRAY, arg); + + print_message("Filling DTX committed table...\n"); + + for (i = 'a'; i <= 'z'; i++) { + snprintf(akey, TSIZE - 1, "akey-%c", i); + for (j = 'A'; j <= 'Z'; j++) { + memset(wbuf, j, TSIZE); + insert_single(dkey, akey, (j - 'A') * TSIZE, wbuf, TSIZE, DAOS_TX_NONE, + &req); + } + } + + /* Wait for batched commit. */ + sleep(DTX_COMMIT_THRESHOLD_AGE + 3); + + /* Simulate DER_NOSPACE when DTX commit via DAOS_DTX_NOSPACE_NOREFRESH. */ + dtx_set_fail_loc(arg, DAOS_DTX_NOSPACE_NOREFRESH | DAOS_FAIL_ALWAYS); + + print_message("Writing more after space pressure...\n"); + + for (i = 'N'; i <= 'Z'; i++) { + snprintf(akey, TSIZE - 1, "akey-%c", i); + for (j = 'a'; j <= 'm'; j++) { + memset(wbuf, j, TSIZE); + insert_single(dkey, akey, (j - 'a') * TSIZE, wbuf, TSIZE, DAOS_TX_NONE, + &req); + } + } + + /* Wait for batched commit. */ + sleep(DTX_COMMIT_THRESHOLD_AGE + 3); + + print_message("Verifying all written data...\n"); + + /* + * DAOS_DTX_NOSPACE_NOREFRESH will prevent DTX refresh. If former batched commit failed + * to commit some DTX because of simulated space exhaustion, then related lookup will + * return IO failure and the fetch result will be different from former write one. 
+ */ + for (i = 'N'; i <= 'Z'; i++) { + snprintf(akey, TSIZE - 1, "akey-%c", i); + for (j = 'a'; j <= 'm'; j++) { + memset(wbuf, j, TSIZE); + lookup_single(dkey, akey, (j - 'a') * TSIZE, rbuf, TSIZE, DAOS_TX_NONE, + &req); + assert_memory_equal(wbuf, rbuf, TSIZE); + } + } + + dtx_set_fail_loc(arg, 0); + ioreq_fini(&req); +} + static int dtx_base_rf0_setup(void **state) { @@ -1092,6 +1169,8 @@ static const struct CMUnitTest dtx_tests[] = { dtx_22, NULL, test_case_teardown}, {"DTX23: Resend with lost reply from non-leader", dtx_23, NULL, test_case_teardown}, + {"DTX24: DTX under space pressure", + dtx_24, NULL, test_case_teardown}, }; /* clang-format on */ diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c index d50f2b87cc8..5d69af670a9 100644 --- a/src/vos/vos_dtx.c +++ b/src/vos/vos_dtx.c @@ -420,6 +420,31 @@ static btr_ops_t dtx_committed_btr_ops = { .to_rec_update = dtx_cmt_ent_update, }; +static inline int +vos_dtx_add_ptr(struct vos_pool *pool, void *ptr, size_t size) +{ + struct umem_instance *umm = &pool->vp_umm; + int rc; + + rc = umem_tx_add_ptr(umm, ptr, size); +#ifdef DAOS_PMEM_BUILD + if (unlikely(rc == -DER_NOSPACE)) { + struct vos_pool_ext_df *ext_df = umem_off2ptr(umm, pool->vp_pool_df->pd_ext); + int behavior = umem_tx_get_failure_behavior(umm); + + if (ext_df != NULL && !UMOFF_IS_NULL(ext_df->ped_emerg_buf) && + behavior == TX_FAILURE_RETURN) { + rc = umem_tx_set_snapbuf(umm, ext_df->ped_emerg_buf, VOS_SNAPBUF_EMERG); + if (rc == 0) + rc = umem_tx_add_ptr(umm, ptr, size); + else + rc = -DER_NOSPACE; + } + } +#endif + return rc; +} + int vos_dtx_table_register(void) { @@ -473,7 +498,7 @@ vos_dtx_table_destroy(struct umem_instance *umm, struct vos_cont_df *cont_df) /* cd_dtx_active_tail is next to cd_dtx_active_head */ rc = umem_tx_add_ptr(umm, &cont_df->cd_dtx_active_head, sizeof(cont_df->cd_dtx_active_head) + - sizeof(cont_df->cd_dtx_active_tail)); + sizeof(cont_df->cd_dtx_active_tail)); if (rc != 0) return rc; @@ -571,7 +596,7 @@ 
do_dtx_rec_release(struct umem_instance *umm, struct vos_container *cont, } if (abort) { if (DAE_INDEX(dae) != DTX_INDEX_INVAL) { - rc = umem_tx_add_ptr(umm, &svt->ir_dtx, + rc = vos_dtx_add_ptr(cont->vc_pool, &svt->ir_dtx, sizeof(svt->ir_dtx)); if (rc != 0) return rc; @@ -579,8 +604,7 @@ do_dtx_rec_release(struct umem_instance *umm, struct vos_container *cont, dtx_set_aborted(&svt->ir_dtx); } else { - rc = umem_tx_add_ptr(umm, &svt->ir_dtx, - sizeof(svt->ir_dtx)); + rc = vos_dtx_add_ptr(cont->vc_pool, &svt->ir_dtx, sizeof(svt->ir_dtx)); if (rc != 0) return rc; @@ -600,7 +624,7 @@ do_dtx_rec_release(struct umem_instance *umm, struct vos_container *cont, if (abort) { if (DAE_INDEX(dae) != DTX_INDEX_INVAL) { - rc = umem_tx_add_ptr(umm, &evt->dc_dtx, + rc = vos_dtx_add_ptr(cont->vc_pool, &evt->dc_dtx, sizeof(evt->dc_dtx)); if (rc != 0) return rc; @@ -608,8 +632,7 @@ do_dtx_rec_release(struct umem_instance *umm, struct vos_container *cont, dtx_set_aborted(&evt->dc_dtx); } else { - rc = umem_tx_add_ptr(umm, &evt->dc_dtx, - sizeof(evt->dc_dtx)); + rc = vos_dtx_add_ptr(cont->vc_pool, &evt->dc_dtx, sizeof(evt->dc_dtx)); if (rc != 0) return rc; @@ -708,7 +731,8 @@ dtx_rec_release(struct vos_container *cont, struct vos_dtx_act_ent *dae, bool ab return rc; if (!invalid && keep_act) { - rc = umem_tx_add_ptr(umm, &dae_df->dae_rec_off, sizeof(dae_df->dae_rec_off)); + rc = vos_dtx_add_ptr(cont->vc_pool, &dae_df->dae_rec_off, + sizeof(dae_df->dae_rec_off)); if (rc != 0) return rc; dae_df->dae_rec_off = UMOFF_NULL; @@ -718,7 +742,7 @@ dtx_rec_release(struct vos_container *cont, struct vos_dtx_act_ent *dae, bool ab if (!invalid && keep_act) { /* When re-commit partial committed DTX, the count can be zero. 
*/ if (dae_df->dae_rec_cnt > 0) { - rc = umem_tx_add_ptr(umm, &dae_df->dae_rec_cnt, + rc = vos_dtx_add_ptr(cont->vc_pool, &dae_df->dae_rec_cnt, sizeof(dae_df->dae_rec_cnt)); if (rc != 0) return rc; @@ -733,7 +757,7 @@ dtx_rec_release(struct vos_container *cont, struct vos_dtx_act_ent *dae, bool ab if (DAE_FLAGS(dae) & DTE_PARTIAL_COMMITTED) return 0; - rc = umem_tx_add_ptr(umm, &dae_df->dae_flags, sizeof(dae_df->dae_flags)); + rc = vos_dtx_add_ptr(cont->vc_pool, &dae_df->dae_flags, sizeof(dae_df->dae_flags)); if (rc != 0) return rc; @@ -753,11 +777,11 @@ dtx_rec_release(struct vos_container *cont, struct vos_dtx_act_ent *dae, bool ab } if (dbd->dbd_count > 1 || dbd->dbd_index < dbd->dbd_cap) { - rc = umem_tx_add_ptr(umm, &dae_df->dae_flags, sizeof(dae_df->dae_flags)); + rc = vos_dtx_add_ptr(cont->vc_pool, &dae_df->dae_flags, sizeof(dae_df->dae_flags)); if (rc != 0) return rc; - rc = umem_tx_add_ptr(umm, &dbd->dbd_count, sizeof(dbd->dbd_count)); + rc = vos_dtx_add_ptr(cont->vc_pool, &dbd->dbd_count, sizeof(dbd->dbd_count)); if (rc != 0) return rc; @@ -772,8 +796,7 @@ dtx_rec_release(struct vos_container *cont, struct vos_dtx_act_ent *dae, bool ab dbd_off = umem_ptr2off(umm, dbd); tmp = umem_off2ptr(umm, dbd->dbd_prev); if (tmp != NULL) { - rc = umem_tx_add_ptr(umm, &tmp->dbd_next, - sizeof(tmp->dbd_next)); + rc = vos_dtx_add_ptr(cont->vc_pool, &tmp->dbd_next, sizeof(tmp->dbd_next)); if (rc != 0) return rc; @@ -782,8 +805,7 @@ dtx_rec_release(struct vos_container *cont, struct vos_dtx_act_ent *dae, bool ab tmp = umem_off2ptr(umm, dbd->dbd_next); if (tmp != NULL) { - rc = umem_tx_add_ptr(umm, &tmp->dbd_prev, - sizeof(tmp->dbd_prev)); + rc = vos_dtx_add_ptr(cont->vc_pool, &tmp->dbd_prev, sizeof(tmp->dbd_prev)); if (rc != 0) return rc; @@ -791,8 +813,8 @@ dtx_rec_release(struct vos_container *cont, struct vos_dtx_act_ent *dae, bool ab } if (cont_df->cd_dtx_active_head == dbd_off) { - rc = umem_tx_add_ptr(umm, &cont_df->cd_dtx_active_head, - 
sizeof(cont_df->cd_dtx_active_head)); + rc = vos_dtx_add_ptr(cont->vc_pool, &cont_df->cd_dtx_active_head, + sizeof(cont_df->cd_dtx_active_head)); if (rc != 0) return rc; @@ -800,8 +822,8 @@ dtx_rec_release(struct vos_container *cont, struct vos_dtx_act_ent *dae, bool ab } if (cont_df->cd_dtx_active_tail == dbd_off) { - rc = umem_tx_add_ptr(umm, &cont_df->cd_dtx_active_tail, - sizeof(cont_df->cd_dtx_active_tail)); + rc = vos_dtx_add_ptr(cont->vc_pool, &cont_df->cd_dtx_active_tail, + sizeof(cont_df->cd_dtx_active_tail)); if (rc != 0) return rc; @@ -987,23 +1009,22 @@ vos_dtx_extend_act_table(struct vos_container *cont) D_ASSERT(UMOFF_IS_NULL(cont_df->cd_dtx_active_head)); /* cd_dtx_active_tail is next to cd_dtx_active_head */ - rc = umem_tx_add_ptr(umm, &cont_df->cd_dtx_active_head, + rc = vos_dtx_add_ptr(cont->vc_pool, &cont_df->cd_dtx_active_head, sizeof(cont_df->cd_dtx_active_head) + - sizeof(cont_df->cd_dtx_active_tail)); + sizeof(cont_df->cd_dtx_active_tail)); if (rc != 0) goto out; cont_df->cd_dtx_active_head = dbd_off; } else { - rc = umem_tx_add_ptr(umm, &tmp->dbd_next, - sizeof(tmp->dbd_next)); + rc = vos_dtx_add_ptr(cont->vc_pool, &tmp->dbd_next, sizeof(tmp->dbd_next)); if (rc != 0) goto out; tmp->dbd_next = dbd_off; - dbd->dbd_prev = cont_df->cd_dtx_active_tail; - rc = umem_tx_add_ptr(umm, &cont_df->cd_dtx_active_tail, + + rc = vos_dtx_add_ptr(cont->vc_pool, &cont_df->cd_dtx_active_tail, sizeof(cont_df->cd_dtx_active_tail)); if (rc != 0) goto out; @@ -1018,6 +1039,169 @@ vos_dtx_extend_act_table(struct vos_container *cont) return rc; } +static int +vos_dtx_extend_cmt_table(struct vos_container *cont) +{ + struct umem_instance *umm = vos_cont2umm(cont); + struct vos_cont_df *cont_df = cont->vc_cont_df; + struct vos_dtx_blob_df *dbd = NULL; + struct vos_dtx_blob_df *head; + struct vos_dtx_blob_df *tail; + struct vos_dtx_cmt_ent_df *dce_df; + daos_epoch_t epoch; + umem_off_t dbd_off = UMOFF_NULL; + d_iov_t kiov; + uint32_t count; + int rc; + int i; + bool 
allocated = false; + + if (!DAOS_FAIL_CHECK(DAOS_DTX_NOSPACE_NOREFRESH)) + dbd_off = umem_zalloc(umm, DTX_CMT_BLOB_SIZE); + + if (UMOFF_IS_NULL(dbd_off)) { + /* + * We almost run out of space. Under such case, we have to release some space to + * make current DTX commit to go ahead; otherwise, non-committed DTX will affect + * VOS aggregation as to no more space can be released via VOS aggregation. That + * will make things to be worse. The most direct solution is to release some old + * committed DTX blob from current container. That maybe unfair since some other + * container may have more old committed DTX entries, but scanning the target to + * find out the pool/container that have the oldest DTX entry may take sometime, + * depends on the pool/container count on the target. Under space pressure, such + * behavior may be not worth, we just choose current container as the victim to + * release space. It may be optimized in the future. DAOS-18690 + */ + + if (unlikely(cont_df->cd_dtx_committed_head == cont_df->cd_dtx_committed_tail)) { + D_ERROR("No space when extend commit DTX table.\n"); + return -DER_NOSPACE; + } + + dbd_off = cont_df->cd_dtx_committed_head; + head = umem_off2ptr(umm, dbd_off); + D_ASSERT(head != NULL); + + for (i = 0, epoch = 0, count = 0; i < head->dbd_count; i++) { + dce_df = &head->dbd_committed_data[i]; + d_iov_set(&kiov, &dce_df->dce_xid, sizeof(dce_df->dce_xid)); + rc = dbtree_delete(cont->vc_dtx_committed_hdl, BTR_PROBE_EQ, &kiov, NULL); + if (rc == 0) + count++; + if (rc == -DER_NONEXIST) + rc = 0; + if (unlikely(rc != 0)) { + D_ERROR("Failed to remove DTX entry " DF_DTI " for urgent DTX " + "aggregation " UMOFF_PF ": " DF_RC "\n", + DP_DTI(&dce_df->dce_xid), UMOFF_P(dbd_off), DP_RC(rc)); + goto out; + } + + if (epoch < dce_df->dce_epoch) + epoch = dce_df->dce_epoch; + } + + tail = umem_off2ptr(umm, cont_df->cd_dtx_committed_tail); + D_ASSERT(tail != NULL); + + dbd = umem_off2ptr(umm, head->dbd_next); + D_ASSERT(dbd != NULL); + + rc = 
vos_dtx_add_ptr(cont->vc_pool, head, DTX_CMT_BLOB_SIZE); + if (rc != 0) + goto out; + + /* dbd_next is next to dbd_prev */ + rc = vos_dtx_add_ptr(cont->vc_pool, &head->dbd_prev, + sizeof(head->dbd_prev) + sizeof(head->dbd_next)); + if (rc != 0) + goto out; + + rc = vos_dtx_add_ptr(cont->vc_pool, &tail->dbd_next, sizeof(tail->dbd_next)); + if (rc != 0) + goto out; + + rc = vos_dtx_add_ptr(cont->vc_pool, &dbd->dbd_prev, sizeof(dbd->dbd_prev)); + if (rc != 0) + goto out; + + /* cd_dtx_committed_tail is next to cd_dtx_committed_head */ + rc = vos_dtx_add_ptr(cont->vc_pool, &cont_df->cd_dtx_committed_head, + sizeof(cont_df->cd_dtx_committed_head) + + sizeof(cont_df->cd_dtx_committed_tail)); + if (rc != 0) + goto out; + + /* Current @head will be reused, move it after @tail, @dbd will be the new head. */ + + dbd->dbd_prev = head->dbd_prev; + head->dbd_prev = cont_df->cd_dtx_committed_tail; + tail->dbd_next = cont_df->cd_dtx_committed_head; + cont_df->cd_dtx_committed_head = head->dbd_next; + head->dbd_next = UMOFF_NULL; + cont_df->cd_dtx_committed_tail = dbd_off; + dbd = head; + + if (count > 0) { + D_ASSERTF(cont->vc_dtx_committed_count >= count, + "Unexpected committed DTX entries count for " DF_UUID ": %u/%u\n", + DP_UUID(cont->vc_id), cont->vc_dtx_committed_count, count); + + cont->vc_dtx_committed_count -= count; + cont->vc_pool->vp_dtx_committed_count -= count; + } + } else { + allocated = true; + + dbd = umem_off2ptr(umm, dbd_off); + tail = umem_off2ptr(umm, cont_df->cd_dtx_committed_tail); + if (tail == NULL) { + D_ASSERT(UMOFF_IS_NULL(cont_df->cd_dtx_committed_head)); + + /* cd_dtx_committed_tail is next to cd_dtx_committed_head */ + rc = vos_dtx_add_ptr(cont->vc_pool, &cont_df->cd_dtx_committed_head, + sizeof(cont_df->cd_dtx_committed_head) + + sizeof(cont_df->cd_dtx_committed_tail)); + if (rc != 0) + goto out; + + cont_df->cd_dtx_committed_head = dbd_off; + } else { + rc = + vos_dtx_add_ptr(cont->vc_pool, &tail->dbd_next, sizeof(tail->dbd_next)); + if (rc != 
0) + goto out; + + rc = vos_dtx_add_ptr(cont->vc_pool, &cont_df->cd_dtx_committed_tail, + sizeof(cont_df->cd_dtx_committed_tail)); + if (rc != 0) + goto out; + + tail->dbd_next = dbd_off; + dbd->dbd_prev = cont_df->cd_dtx_committed_tail; + } + + cont_df->cd_dtx_committed_tail = dbd_off; + } + + dbd->dbd_magic = DTX_CMT_BLOB_MAGIC; + dbd->dbd_cap = (DTX_CMT_BLOB_SIZE - sizeof(struct vos_dtx_blob_df)) / + sizeof(struct vos_dtx_cmt_ent_df); + dbd->dbd_count = 0; + dbd->dbd_index = 0; + +out: + if (allocated) + DL_CDEBUG(rc == 0, DB_IO, DLOG_ERR, rc, + "Allocate DTX committed blob %p (" UMOFF_PF ") for cont " DF_UUID, dbd, + UMOFF_P(dbd_off), DP_UUID(cont->vc_id)); + else + DL_CDEBUG(rc == 0, DLOG_WARN, DLOG_ERR, rc, + "SPACE PRESSURE! reuse DTX blob %p (" UMOFF_PF ") for cont " DF_UUID, dbd, + UMOFF_P(dbd_off), DP_UUID(cont->vc_id)); + return rc; +} + static int vos_dtx_alloc(struct umem_instance *umm, struct dtx_handle *dth) { @@ -2209,19 +2393,17 @@ vos_dtx_commit_internal(struct vos_container *cont, struct dtx_id dtis[], int count, daos_epoch_t epoch, bool keep_act, bool rm_cos[], struct vos_dtx_act_ent **daes, struct vos_dtx_cmt_ent **dces) { - struct vos_cont_df *cont_df = cont->vc_cont_df; - struct umem_instance *umm = vos_cont2umm(cont); - struct vos_dtx_blob_df *dbd; - struct vos_dtx_blob_df *dbd_prev; - umem_off_t dbd_off; - uint64_t cmt_time = daos_wallclock_secs(); - int committed = 0; - int rc = 0; - int p = 0; - int i = 0; - int j; - int k; - bool allocated = false; + struct vos_dtx_blob_df *dbd; + struct vos_cont_df *cont_df = cont->vc_cont_df; + struct umem_instance *umm = vos_cont2umm(cont); + uint64_t cmt_time = daos_wallclock_secs(); + int committed = 0; + int rc = 0; + int p = 0; + int i = 0; + int j; + int k; + bool allocated = false; dbd = umem_off2ptr(umm, cont_df->cd_dtx_committed_tail); if (dbd == NULL) @@ -2263,7 +2445,8 @@ vos_dtx_commit_internal(struct vos_container *cont, struct dtx_id dtis[], goto out; /* Only need to add range for the 
first partial blob. */ - rc = umem_tx_add_ptr(umm, &dbd->dbd_count, sizeof(dbd->dbd_count)); + rc = + vos_dtx_add_ptr(cont->vc_pool, &dbd->dbd_count, sizeof(dbd->dbd_count)); if (rc != 0) goto out; } @@ -2283,52 +2466,17 @@ vos_dtx_commit_internal(struct vos_container *cont, struct dtx_id dtis[], goto out; new_blob: - dbd_prev = dbd; - /* Need new @dbd */ - dbd_off = umem_zalloc(umm, DTX_CMT_BLOB_SIZE); - if (UMOFF_IS_NULL(dbd_off)) { - D_ERROR("No space to store committed DTX %d "DF_DTI"\n", - count, DP_DTI(&dtis[i])); - D_GOTO(out, rc = -DER_NOSPACE); - } - - dbd = umem_off2ptr(umm, dbd_off); - dbd->dbd_magic = DTX_CMT_BLOB_MAGIC; - dbd->dbd_cap = (DTX_CMT_BLOB_SIZE - sizeof(struct vos_dtx_blob_df)) / - sizeof(struct vos_dtx_cmt_ent_df); - dbd->dbd_prev = umem_ptr2off(umm, dbd_prev); - - if (dbd_prev == NULL) { - D_ASSERT(UMOFF_IS_NULL(cont_df->cd_dtx_committed_head)); - D_ASSERT(UMOFF_IS_NULL(cont_df->cd_dtx_committed_tail)); - - /* cd_dtx_committed_tail is next to cd_dtx_committed_head */ - rc = umem_tx_add_ptr(umm, &cont_df->cd_dtx_committed_head, - sizeof(cont_df->cd_dtx_committed_head) + - sizeof(cont_df->cd_dtx_committed_tail)); - if (rc != 0) - goto out; - - cont_df->cd_dtx_committed_head = dbd_off; - } else { - rc = umem_tx_add_ptr(umm, &dbd_prev->dbd_next, - sizeof(dbd_prev->dbd_next)); - if (rc != 0) - goto out; + if (unlikely(allocated)) + D_GOTO(out, rc = -DER_OVERFLOW); - dbd_prev->dbd_next = dbd_off; + rc = vos_dtx_extend_cmt_table(cont); + if (rc != 0) + goto out; - rc = umem_tx_add_ptr(umm, &cont_df->cd_dtx_committed_tail, - sizeof(cont_df->cd_dtx_committed_tail)); - if (rc != 0) - goto out; - } + allocated = true; - D_DEBUG(DB_IO, "Allocated DTX committed blob %p ("UMOFF_PF") for cont "DF_UUID"\n", - dbd, UMOFF_P(dbd_off), DP_UUID(cont->vc_id)); + dbd = umem_off2ptr(umm, cont_df->cd_dtx_committed_tail); - cont_df->cd_dtx_committed_tail = dbd_off; - allocated = true; goto again; out: @@ -2618,6 +2766,8 @@ vos_dtx_commit(daos_handle_t coh, struct 
dtx_id dtis[], int count, bool keep_act /* Commit multiple DTXs via single local transaction. */ rc = umem_tx_begin(vos_cont2umm(cont), NULL); if (rc == 0) { + umem_tx_set_failure_behavior(vos_cont2umm(cont), TX_FAILURE_RETURN); + committed = vos_dtx_commit_internal(cont, &dtis[idx], pinned, 0, keep_act, rm_cos != NULL ? &rm_cos[idx] : NULL, &daes[idx], &dces[idx]); @@ -2691,6 +2841,8 @@ vos_dtx_abort_internal(struct vos_container *cont, struct vos_dtx_act_ent *dae, if (rc != 0) goto out; + umem_tx_set_failure_behavior(umm, TX_FAILURE_RETURN); + if (dth != NULL) { D_ASSERT(dth->dth_ent == dae || dth->dth_ent == NULL); /* Not allow dtx_abort against solo DTX. */ @@ -2975,7 +3127,7 @@ vos_dtx_set_flags_one(struct vos_container *cont, struct dtx_id *dti, uint32_t f dae_df = umem_off2ptr(umm, dae->dae_df_off); D_ASSERT(dae_df != NULL); - rc = umem_tx_add_ptr(umm, &dae_df->dae_flags, sizeof(dae_df->dae_flags)); + rc = vos_dtx_add_ptr(cont->vc_pool, &dae_df->dae_flags, sizeof(dae_df->dae_flags)); if (rc == 0) { dae_df->dae_flags |= flags; DAE_FLAGS(dae) |= flags; @@ -3014,6 +3166,8 @@ vos_dtx_set_flags(daos_handle_t coh, struct dtx_id dtis[], int count, uint32_t f if (rc != 0) goto out; + umem_tx_set_failure_behavior(umm, TX_FAILURE_RETURN); + for (i = 0; i < count; i++) { rc = vos_dtx_set_flags_one(cont, &dtis[i], flags); if (rc != 0) @@ -3052,6 +3206,8 @@ dtx_blob_aggregate(struct umem_instance *umm, struct vos_tls *tls, struct vos_co goto out; } + umem_tx_set_failure_behavior(umm, TX_FAILURE_RETURN); + for (i = 0; i < dbd->dbd_count; i++) { struct vos_dtx_cmt_ent_df *dce_df; d_iov_t kiov; @@ -3080,7 +3236,7 @@ dtx_blob_aggregate(struct umem_instance *umm, struct vos_tls *tls, struct vos_co } if (epoch != cont_df->cd_newest_aggregated) { - rc = umem_tx_add_ptr(umm, &cont_df->cd_newest_aggregated, + rc = vos_dtx_add_ptr(cont->vc_pool, &cont_df->cd_newest_aggregated, sizeof(cont_df->cd_newest_aggregated)); if (unlikely(rc != 0)) { D_ERROR("Failed to refresh epoch for DTX 
aggregation " UMOFF_PF ": " DF_RC @@ -3094,7 +3250,7 @@ dtx_blob_aggregate(struct umem_instance *umm, struct vos_tls *tls, struct vos_co if (dbd->dbd_count - dtx_aggr_count > 0) { size_t buf_len; - rc = umem_tx_add_ptr(umm, &dbd->dbd_committed_data[0], + rc = vos_dtx_add_ptr(cont->vc_pool, &dbd->dbd_committed_data[0], sizeof(dbd->dbd_committed_data[0]) * dbd->dbd_count); if (unlikely(rc != 0)) { D_ERROR("Failed update committed DTX blob " UMOFF_PF ": " DF_RC "\n", @@ -3105,7 +3261,7 @@ dtx_blob_aggregate(struct umem_instance *umm, struct vos_tls *tls, struct vos_co memmove(&dbd->dbd_committed_data[0], &dbd->dbd_committed_data[dtx_aggr_count], buf_len); - rc = umem_tx_add_ptr(umm, &dbd->dbd_count, sizeof(dbd->dbd_count)); + rc = vos_dtx_add_ptr(cont->vc_pool, &dbd->dbd_count, sizeof(dbd->dbd_count)); if (unlikely(rc != 0)) { D_ERROR("Failed update committed DTX count " UMOFF_PF ": " DF_RC "\n", UMOFF_P(dbd_off), DP_RC(rc)); @@ -3122,7 +3278,7 @@ dtx_blob_aggregate(struct umem_instance *umm, struct vos_tls *tls, struct vos_co D_ASSERT(UMOFF_IS_NULL(dbd_prev_off)); D_ASSERT(dbd_off == cont_df->cd_dtx_committed_head); - rc = umem_tx_add_ptr(umm, &cont_df->cd_dtx_committed_head, + rc = vos_dtx_add_ptr(cont->vc_pool, &cont_df->cd_dtx_committed_head, sizeof(cont_df->cd_dtx_committed_head)); if (unlikely(rc != 0)) { D_ERROR("Failed to update head for DTX aggregation " UMOFF_PF ": " DF_RC @@ -3134,7 +3290,7 @@ dtx_blob_aggregate(struct umem_instance *umm, struct vos_tls *tls, struct vos_co if (dbd_next == NULL) { D_ASSERT(dbd_off == cont_df->cd_dtx_committed_tail); - rc = umem_tx_add_ptr(umm, &cont_df->cd_dtx_committed_tail, + rc = vos_dtx_add_ptr(cont->vc_pool, &cont_df->cd_dtx_committed_tail, sizeof(cont_df->cd_dtx_committed_tail)); if (unlikely(rc != 0)) { D_ERROR("Failed to update tail for DTX aggregation " UMOFF_PF @@ -3144,7 +3300,8 @@ dtx_blob_aggregate(struct umem_instance *umm, struct vos_tls *tls, struct vos_co } cont_df->cd_dtx_committed_tail = dbd_prev_off; } 
else { - rc = umem_tx_add_ptr(umm, &dbd_next->dbd_prev, sizeof(dbd_next->dbd_prev)); + rc = vos_dtx_add_ptr(cont->vc_pool, &dbd_next->dbd_prev, + sizeof(dbd_next->dbd_prev)); if (unlikely(rc != 0)) { D_ERROR("Failed to update previous DTXs blob for DTX " "aggregation " UMOFF_PF ": " DF_RC "\n", diff --git a/src/vos/vos_layout.h b/src/vos/vos_layout.h index 19335f3df6e..59c40e00556 100644 --- a/src/vos/vos_layout.h +++ b/src/vos/vos_layout.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -116,7 +116,12 @@ struct vos_gc_bkt_df { /** 2.8 features */ #define VOS_POOL_FEAT_2_8 (VOS_POOL_FEAT_GANG_SV) -#define VOS_POOL_EXT_DF_PADDING_SIZE 53 +#define VOS_POOL_EXT_DF_PADDING_SIZE 52 + +/* 512K preallocated buffer for TX snapshot in case of space emergency. + * NB: workload of GC/DTX is deterministic (tree operations), 512K should be sufficient. 
+ */ +#define VOS_SNAPBUF_EMERG (1 << 19) /* VOS pool durable format extension */ struct vos_pool_ext_df { @@ -124,6 +129,8 @@ struct vos_pool_ext_df { struct vos_gc_bkt_df ped_gc_bkt; /* Memory file size for md-on-ssd phase2 pool */ uint64_t ped_mem_sz; + /* emergency buffer for GC */ + umem_off_t ped_emerg_buf; /* Paddings for other potential new feature */ uint64_t ped_paddings[VOS_POOL_EXT_DF_PADDING_SIZE]; /* Reserved for future extension */ diff --git a/src/vos/vos_pool.c b/src/vos/vos_pool.c index a6e06d6e026..c969d16abfd 100644 --- a/src/vos/vos_pool.c +++ b/src/vos/vos_pool.c @@ -1442,8 +1442,14 @@ vos_pool_create_ex(const char *path, uuid_t uuid, daos_size_t scm_sz, daos_size_ pool_df->pd_version = version; /* pd_ext is newly allocated, no need to call tx_add_ptr() */ - pd_ext_df = umem_off2ptr(&umem, pool_df->pd_ext); - pd_ext_df->ped_mem_sz = scm_sz; + pd_ext_df = umem_off2ptr(&umem, pool_df->pd_ext); + pd_ext_df->ped_mem_sz = scm_sz; + pd_ext_df->ped_emerg_buf = umem_zalloc(&umem, VOS_SNAPBUF_EMERG); + if (UMOFF_IS_NULL(pd_ext_df->ped_emerg_buf)) { + D_ERROR("Failed to allocate pool emergency buffer.\n"); + rc = -DER_NOSPACE; + } + end: /** * The transaction can in reality be aborted From 5d3fac810c75bfaf446bc9ddef425602acb0ad75 Mon Sep 17 00:00:00 2001 From: Fan Yong Date: Thu, 23 Apr 2026 16:15:05 +0800 Subject: [PATCH 2/3] DAOS-18690 vos: fixes for review feedback Signed-off-by: Fan Yong --- src/common/mem.c | 11 +- src/dtx/dtx_srv.c | 5 +- src/include/daos/mem.h | 2 +- src/vos/vos_dtx.c | 239 +++++++++++++++++++++-------------------- src/vos/vos_layout.h | 6 +- src/vos/vos_pool.c | 2 +- 6 files changed, 135 insertions(+), 130 deletions(-) diff --git a/src/common/mem.c b/src/common/mem.c index 7b2c7c24d19..f64d744be72 100644 --- a/src/common/mem.c +++ b/src/common/mem.c @@ -1025,7 +1025,7 @@ pmem_tx_get_failure_behavior(void) case POBJ_TX_FAILURE_RETURN: return TX_FAILURE_RETURN; default: - D_ERROR("Unknown TX failure behavior %d\n", behavior); + 
D_ASSERTF(0, "Unknown TX failure behavior %d\n", behavior); return -DER_INVAL; } } @@ -1037,14 +1037,7 @@ pmem_tx_set_snapbuf(struct umem_instance *umm, umem_off_t snapbuf, size_t size) int rc; rc = pmemobj_tx_log_append_buffer(TX_LOG_TYPE_SNAPSHOT, buf, size); - if (rc != 0) - return rc; - - rc = pmemobj_tx_log_auto_alloc(TX_LOG_TYPE_SNAPSHOT, 0); - if (rc != 0) - return rc; - - return 0; + return rc ? umem_tx_errno(rc) : 0; } static int diff --git a/src/dtx/dtx_srv.c b/src/dtx/dtx_srv.c index 67ebe8b885b..328144ed2ab 100644 --- a/src/dtx/dtx_srv.c +++ b/src/dtx/dtx_srv.c @@ -196,8 +196,9 @@ dtx_handler(crt_rpc_t *rpc) if (unlikely(din->di_epoch == 1)) D_GOTO(out, rc = -DER_IO); - rc1 = dtx_commit_large(cont->sc_hdl, (struct dtx_id *)(din->di_dtx_array.ca_arrays), - din->di_dtx_array.ca_count, false, NULL); + /* The count of DTX entries will not exceed DTX_THRESHOLD_COUNT. */ + rc1 = dtx_commit_large(cont->sc_hdl, (struct dtx_id *)din->di_dtx_array.ca_arrays, + (int)din->di_dtx_array.ca_count, false, NULL); if (rc1 < 0) D_GOTO(out, rc = rc1); diff --git a/src/include/daos/mem.h b/src/include/daos/mem.h index ee8e40a42c6..eb4bdbf3552 100644 --- a/src/include/daos/mem.h +++ b/src/include/daos/mem.h @@ -1090,7 +1090,7 @@ static inline void umem_tx_set_failure_behavior(struct umem_instance *umm, enum umem_tx_failure_behavior behavior) { if (umm->umm_ops->mo_tx_set_failure_behavior) - return umm->umm_ops->mo_tx_set_failure_behavior(behavior); + umm->umm_ops->mo_tx_set_failure_behavior(behavior); } static inline int diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c index 5d69af670a9..a8a5d64a680 100644 --- a/src/vos/vos_dtx.c +++ b/src/vos/vos_dtx.c @@ -434,7 +434,8 @@ vos_dtx_add_ptr(struct vos_pool *pool, void *ptr, size_t size) if (ext_df != NULL && !UMOFF_IS_NULL(ext_df->ped_emerg_buf) && behavior == TX_FAILURE_RETURN) { - rc = umem_tx_set_snapbuf(umm, ext_df->ped_emerg_buf, VOS_SNAPBUF_EMERG); + rc = + umem_tx_set_snapbuf(umm, ext_df->ped_emerg_buf, 
VOS_SNAPBUF_EMERG_SIZE); if (rc == 0) rc = umem_tx_add_ptr(umm, ptr, size); else @@ -1040,7 +1041,7 @@ vos_dtx_extend_act_table(struct vos_container *cont) } static int -vos_dtx_extend_cmt_table(struct vos_container *cont) +vos_dtx_reuse_cmt_blob(struct vos_container *cont) { struct umem_instance *umm = vos_cont2umm(cont); struct vos_cont_df *cont_df = cont->vc_cont_df; @@ -1054,135 +1055,150 @@ vos_dtx_extend_cmt_table(struct vos_container *cont) uint32_t count; int rc; int i; - bool allocated = false; - if (!DAOS_FAIL_CHECK(DAOS_DTX_NOSPACE_NOREFRESH)) - dbd_off = umem_zalloc(umm, DTX_CMT_BLOB_SIZE); + /* + * Space is almost exhausted. In that case, we must reclaim space so that the current + * DTX commit can proceed; otherwise, uncommitted DTX may block VOS aggregation and + * thereby prevent further space release. The most direct approach is to reclaim an old + * blob from the current container's committed DTX table. It may be unfair because other + * containers could hold older committed DTX entries. However, it may not be worth + * scanning all pools/containers on the target to find the globally oldest committed DTX + * blob under space pressure. For now, we select the current container as the victim. + * + * This can be optimized later. DAOS-18690. + */ - if (UMOFF_IS_NULL(dbd_off)) { - /* - * We almost run out of space. Under such case, we have to release some space to - * make current DTX commit to go ahead; otherwise, non-committed DTX will affect - * VOS aggregation as to no more space can be released via VOS aggregation. That - * will make things to be worse. The most direct solution is to release some old - * committed DTX blob from current container. That maybe unfair since some other - * container may have more old committed DTX entries, but scanning the target to - * find out the pool/container that have the oldest DTX entry may take sometime, - * depends on the pool/container count on the target. 
Under space pressure, such - * behavior may be not worth, we just choose current container as the victim to - * release space. It may be optimized in the future. DAOS-18690 - */ + if (unlikely(cont_df->cd_dtx_committed_head == cont_df->cd_dtx_committed_tail)) { + D_ERROR("No space when extending commit DTX table.\n"); + return -DER_NOSPACE; + } - if (unlikely(cont_df->cd_dtx_committed_head == cont_df->cd_dtx_committed_tail)) { - D_ERROR("No space when extend commit DTX table.\n"); - return -DER_NOSPACE; + dbd_off = cont_df->cd_dtx_committed_head; + head = umem_off2ptr(umm, dbd_off); + D_ASSERT(head != NULL); + + for (i = 0, epoch = 0, count = 0; i < head->dbd_count; i++) { + dce_df = &head->dbd_committed_data[i]; + d_iov_set(&kiov, &dce_df->dce_xid, sizeof(dce_df->dce_xid)); + rc = dbtree_delete(cont->vc_dtx_committed_hdl, BTR_PROBE_EQ, &kiov, NULL); + if (rc == 0) + count++; + if (rc == -DER_NONEXIST) + rc = 0; + if (unlikely(rc != 0)) { + D_ERROR("Failed to remove DTX entry " + DF_DTI " for urgent DTX aggregation " UMOFF_PF ": " DF_RC "\n", + DP_DTI(&dce_df->dce_xid), UMOFF_P(dbd_off), DP_RC(rc)); + goto out; } - dbd_off = cont_df->cd_dtx_committed_head; - head = umem_off2ptr(umm, dbd_off); - D_ASSERT(head != NULL); + if (epoch < dce_df->dce_epoch) + epoch = dce_df->dce_epoch; + } - for (i = 0, epoch = 0, count = 0; i < head->dbd_count; i++) { - dce_df = &head->dbd_committed_data[i]; - d_iov_set(&kiov, &dce_df->dce_xid, sizeof(dce_df->dce_xid)); - rc = dbtree_delete(cont->vc_dtx_committed_hdl, BTR_PROBE_EQ, &kiov, NULL); - if (rc == 0) - count++; - if (rc == -DER_NONEXIST) - rc = 0; - if (unlikely(rc != 0)) { - D_ERROR("Failed to remove DTX entry " DF_DTI " for urgent DTX " - "aggregation " UMOFF_PF ": " DF_RC "\n", - DP_DTI(&dce_df->dce_xid), UMOFF_P(dbd_off), DP_RC(rc)); - goto out; - } + tail = umem_off2ptr(umm, cont_df->cd_dtx_committed_tail); + D_ASSERT(tail != NULL); - if (epoch < dce_df->dce_epoch) - epoch = dce_df->dce_epoch; - } + dbd = umem_off2ptr(umm, 
head->dbd_next); + D_ASSERT(dbd != NULL); - tail = umem_off2ptr(umm, cont_df->cd_dtx_committed_tail); - D_ASSERT(tail != NULL); + rc = vos_dtx_add_ptr(cont->vc_pool, head, DTX_CMT_BLOB_SIZE); + if (rc != 0) + goto out; - dbd = umem_off2ptr(umm, head->dbd_next); - D_ASSERT(dbd != NULL); + /* dbd_next is next to dbd_prev */ + rc = vos_dtx_add_ptr(cont->vc_pool, &head->dbd_prev, + sizeof(head->dbd_prev) + sizeof(head->dbd_next)); + if (rc != 0) + goto out; - rc = vos_dtx_add_ptr(cont->vc_pool, head, DTX_CMT_BLOB_SIZE); - if (rc != 0) - goto out; + rc = vos_dtx_add_ptr(cont->vc_pool, &tail->dbd_next, sizeof(tail->dbd_next)); + if (rc != 0) + goto out; - /* dbd_next is next to dbd_prev */ - rc = vos_dtx_add_ptr(cont->vc_pool, &head->dbd_prev, - sizeof(head->dbd_prev) + sizeof(head->dbd_next)); - if (rc != 0) - goto out; + rc = vos_dtx_add_ptr(cont->vc_pool, &dbd->dbd_prev, sizeof(dbd->dbd_prev)); + if (rc != 0) + goto out; - rc = vos_dtx_add_ptr(cont->vc_pool, &tail->dbd_next, sizeof(tail->dbd_next)); - if (rc != 0) - goto out; + /* cd_dtx_committed_tail is next to cd_dtx_committed_head */ + rc = vos_dtx_add_ptr(cont->vc_pool, &cont_df->cd_dtx_committed_head, + sizeof(cont_df->cd_dtx_committed_head) + + sizeof(cont_df->cd_dtx_committed_tail)); + if (rc != 0) + goto out; - rc = vos_dtx_add_ptr(cont->vc_pool, &dbd->dbd_prev, sizeof(dbd->dbd_prev)); - if (rc != 0) - goto out; + /* Current @head will be reused, move it after @tail, @dbd will be the new head. 
*/ - /* cd_dtx_committed_tail is next to cd_dtx_committed_head */ - rc = vos_dtx_add_ptr(cont->vc_pool, &cont_df->cd_dtx_committed_head, - sizeof(cont_df->cd_dtx_committed_head) + - sizeof(cont_df->cd_dtx_committed_tail)); - if (rc != 0) - goto out; + dbd->dbd_prev = head->dbd_prev; + head->dbd_prev = cont_df->cd_dtx_committed_tail; + tail->dbd_next = cont_df->cd_dtx_committed_head; + cont_df->cd_dtx_committed_head = head->dbd_next; + head->dbd_next = UMOFF_NULL; + cont_df->cd_dtx_committed_tail = dbd_off; - /* Current @head will be reused, move it after @tail, @dbd will be the new head. */ + dbd = head; + dbd->dbd_count = 0; + dbd->dbd_index = 0; - dbd->dbd_prev = head->dbd_prev; - head->dbd_prev = cont_df->cd_dtx_committed_tail; - tail->dbd_next = cont_df->cd_dtx_committed_head; - cont_df->cd_dtx_committed_head = head->dbd_next; - head->dbd_next = UMOFF_NULL; - cont_df->cd_dtx_committed_tail = dbd_off; - dbd = head; + if (count > 0) { + D_ASSERTF(cont->vc_dtx_committed_count >= count, + "Unexpected committed DTX entries count for " DF_UUID ": %u/%u\n", + DP_UUID(cont->vc_id), cont->vc_dtx_committed_count, count); - if (count > 0) { - D_ASSERTF(cont->vc_dtx_committed_count >= count, - "Unexpected committed DTX entries count for " DF_UUID ": %u/%u\n", - DP_UUID(cont->vc_id), cont->vc_dtx_committed_count, count); + cont->vc_dtx_committed_count -= count; + cont->vc_pool->vp_dtx_committed_count -= count; + } - cont->vc_dtx_committed_count -= count; - cont->vc_pool->vp_dtx_committed_count -= count; - } - } else { - allocated = true; +out: + DL_CDEBUG(rc == 0, DLOG_WARN, DLOG_ERR, rc, + "Reused DTX blob %p (" UMOFF_PF ") under space pressure for " DF_UUID "/" DF_UUID, + dbd, UMOFF_P(dbd_off), DP_UUID(cont->vc_pool->vp_id), DP_UUID(cont->vc_id)); + return rc; +} - dbd = umem_off2ptr(umm, dbd_off); - tail = umem_off2ptr(umm, cont_df->cd_dtx_committed_tail); - if (tail == NULL) { - D_ASSERT(UMOFF_IS_NULL(cont_df->cd_dtx_committed_head)); +static int 
+vos_dtx_extend_cmt_table(struct vos_container *cont) +{ + struct umem_instance *umm = vos_cont2umm(cont); + struct vos_cont_df *cont_df = cont->vc_cont_df; + struct vos_dtx_blob_df *dbd = NULL; + struct vos_dtx_blob_df *tail; + umem_off_t dbd_off = UMOFF_NULL; + int rc; - /* cd_dtx_committed_tail is next to cd_dtx_committed_head */ - rc = vos_dtx_add_ptr(cont->vc_pool, &cont_df->cd_dtx_committed_head, - sizeof(cont_df->cd_dtx_committed_head) + - sizeof(cont_df->cd_dtx_committed_tail)); - if (rc != 0) - goto out; + if (!DAOS_FAIL_CHECK(DAOS_DTX_NOSPACE_NOREFRESH)) + dbd_off = umem_zalloc(umm, DTX_CMT_BLOB_SIZE); - cont_df->cd_dtx_committed_head = dbd_off; - } else { - rc = - vos_dtx_add_ptr(cont->vc_pool, &tail->dbd_next, sizeof(tail->dbd_next)); - if (rc != 0) - goto out; + if (UMOFF_IS_NULL(dbd_off)) + return vos_dtx_reuse_cmt_blob(cont); - rc = vos_dtx_add_ptr(cont->vc_pool, &cont_df->cd_dtx_committed_tail, - sizeof(cont_df->cd_dtx_committed_tail)); - if (rc != 0) - goto out; + dbd = umem_off2ptr(umm, dbd_off); + tail = umem_off2ptr(umm, cont_df->cd_dtx_committed_tail); + if (tail == NULL) { + D_ASSERT(UMOFF_IS_NULL(cont_df->cd_dtx_committed_head)); - tail->dbd_next = dbd_off; - dbd->dbd_prev = cont_df->cd_dtx_committed_tail; - } + /* cd_dtx_committed_tail is next to cd_dtx_committed_head */ + rc = vos_dtx_add_ptr(cont->vc_pool, &cont_df->cd_dtx_committed_head, + sizeof(cont_df->cd_dtx_committed_head) + + sizeof(cont_df->cd_dtx_committed_tail)); + if (rc != 0) + goto out; - cont_df->cd_dtx_committed_tail = dbd_off; + cont_df->cd_dtx_committed_head = dbd_off; + } else { + rc = vos_dtx_add_ptr(cont->vc_pool, &tail->dbd_next, sizeof(tail->dbd_next)); + if (rc != 0) + goto out; + + rc = vos_dtx_add_ptr(cont->vc_pool, &cont_df->cd_dtx_committed_tail, + sizeof(cont_df->cd_dtx_committed_tail)); + if (rc != 0) + goto out; + + tail->dbd_next = dbd_off; + dbd->dbd_prev = cont_df->cd_dtx_committed_tail; } + cont_df->cd_dtx_committed_tail = dbd_off; dbd->dbd_magic = 
DTX_CMT_BLOB_MAGIC; dbd->dbd_cap = (DTX_CMT_BLOB_SIZE - sizeof(struct vos_dtx_blob_df)) / @@ -1191,14 +1207,9 @@ vos_dtx_extend_cmt_table(struct vos_container *cont) dbd->dbd_index = 0; out: - if (allocated) - DL_CDEBUG(rc == 0, DB_IO, DLOG_ERR, rc, - "Allocate DTX committed blob %p (" UMOFF_PF ") for cont " DF_UUID, dbd, - UMOFF_P(dbd_off), DP_UUID(cont->vc_id)); - else - DL_CDEBUG(rc == 0, DLOG_WARN, DLOG_ERR, rc, - "SPACE PRESSURE! reuse DTX blob %p (" UMOFF_PF ") for cont " DF_UUID, dbd, - UMOFF_P(dbd_off), DP_UUID(cont->vc_id)); + DL_CDEBUG(rc == 0, DB_IO, DLOG_ERR, rc, + "Allocated DTX committed blob %p (" UMOFF_PF ") for " DF_UUID "/" DF_UUID, dbd, + UMOFF_P(dbd_off), DP_UUID(cont->vc_pool->vp_id), DP_UUID(cont->vc_id)); return rc; } diff --git a/src/vos/vos_layout.h b/src/vos/vos_layout.h index 59c40e00556..6c40711f6b7 100644 --- a/src/vos/vos_layout.h +++ b/src/vos/vos_layout.h @@ -118,10 +118,10 @@ struct vos_gc_bkt_df { #define VOS_POOL_EXT_DF_PADDING_SIZE 52 -/* 512K preallocated buffer for TX snapshot in case of space emergency. +/* Preallocate 512KB buffer for backend transaction snapshots under space pressure. * NB: workload of GC/DTX is deterministic (tree operations), 512K should be sufficient. 
*/ -#define VOS_SNAPBUF_EMERG (1 << 19) +#define VOS_SNAPBUF_EMERG_SIZE (1 << 19) /* VOS pool durable format extension */ struct vos_pool_ext_df { @@ -129,7 +129,7 @@ struct vos_pool_ext_df { struct vos_gc_bkt_df ped_gc_bkt; /* Memory file size for md-on-ssd phase2 pool */ uint64_t ped_mem_sz; - /* emergency buffer for GC */ + /* emergency buffer for TX snapshots under space pressure */ umem_off_t ped_emerg_buf; /* Paddings for other potential new feature */ uint64_t ped_paddings[VOS_POOL_EXT_DF_PADDING_SIZE]; diff --git a/src/vos/vos_pool.c b/src/vos/vos_pool.c index c969d16abfd..3d40c89684f 100644 --- a/src/vos/vos_pool.c +++ b/src/vos/vos_pool.c @@ -1444,7 +1444,7 @@ vos_pool_create_ex(const char *path, uuid_t uuid, daos_size_t scm_sz, daos_size_ /* pd_ext is newly allocated, no need to call tx_add_ptr() */ pd_ext_df = umem_off2ptr(&umem, pool_df->pd_ext); pd_ext_df->ped_mem_sz = scm_sz; - pd_ext_df->ped_emerg_buf = umem_zalloc(&umem, VOS_SNAPBUF_EMERG); + pd_ext_df->ped_emerg_buf = umem_zalloc(&umem, VOS_SNAPBUF_EMERG_SIZE); if (UMOFF_IS_NULL(pd_ext_df->ped_emerg_buf)) { D_ERROR("Failed to allocate pool emergency buffer.\n"); rc = -DER_NOSPACE; From 3bc77c6b2e870c637be8d670ea346dc3621a72a1 Mon Sep 17 00:00:00 2001 From: Fan Yong Date: Fri, 24 Apr 2026 10:13:27 +0800 Subject: [PATCH 3/3] DAOS-18690 vos: code format adjustment Signed-off-by: Fan Yong --- src/vos/vos_dtx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c index a8a5d64a680..79e98956138 100644 --- a/src/vos/vos_dtx.c +++ b/src/vos/vos_dtx.c @@ -1086,8 +1086,8 @@ vos_dtx_reuse_cmt_blob(struct vos_container *cont) if (rc == -DER_NONEXIST) rc = 0; if (unlikely(rc != 0)) { - D_ERROR("Failed to remove DTX entry " - DF_DTI " for urgent DTX aggregation " UMOFF_PF ": " DF_RC "\n", + D_ERROR("Failed to remove DTX entry " DF_DTI + " for urgent DTX aggregation " UMOFF_PF ": " DF_RC "\n", DP_DTI(&dce_df->dce_xid), UMOFF_P(dbd_off), DP_RC(rc)); goto out; 
}