From f778563410f4efcaf1916f2cfa193d882649745c Mon Sep 17 00:00:00 2001 From: Fan Yong Date: Sun, 10 May 2026 16:07:17 +0800 Subject: [PATCH] DAOS-18888 client: validation check when event completion It is suspected that some event completion callbacks may be triggered repeatedly with the UCX provider in case of network trouble. If that is true, it will mislead the subsequent owner of the thread-private event. The patch checks for such a completion against a stale event and skips it. Signed-off-by: Fan Yong --- src/client/api/client_internal.h | 11 +++++++---- src/client/api/event.c | 29 +++++++++++++++++++++++++++++ src/client/api/task.c | 20 ++++++++++++++++++-- src/include/daos/event.h | 13 +++++++++++++ 4 files changed, 67 insertions(+), 6 deletions(-) diff --git a/src/client/api/client_internal.h b/src/client/api/client_internal.h index cfbca3afa68..fbf488f4fe2 100644 --- a/src/client/api/client_internal.h +++ b/src/client/api/client_internal.h @@ -1,5 +1,6 @@ /** * (C) Copyright 2016-2022 Intel Corporation. + * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -44,6 +45,9 @@ struct daos_event_callback { d_list_t evx_comp_list; }; +#define DAOS_EVENT_GEN_BITS (30) +#define DAOS_EVENT_GEN_MAX ((1 << DAOS_EVENT_GEN_BITS) - 1) + struct daos_event_private { daos_handle_t evx_eqh; d_list_t evx_link; @@ -52,10 +56,9 @@ struct daos_event_private { unsigned int evx_nchild; unsigned int evx_nchild_running; unsigned int evx_nchild_comp; - /** flag to indicate whether event is a barrier event */ - unsigned int is_barrier:1; - /** flag to indicate whether to convert DER to errno */ - unsigned int is_errno:1; + + unsigned int evx_gen : DAOS_EVENT_GEN_BITS, /* Generation for validation when completion. */ + is_barrier : 1, /* Barrier event. */ is_errno : 1; /* Whether convert DER to errno. 
*/ unsigned int evx_flags; ATOMIC daos_ev_status_t evx_status; diff --git a/src/client/api/event.c b/src/client/api/event.c index b75d3475348..a335e1272de 100644 --- a/src/client/api/event.c +++ b/src/client/api/event.c @@ -521,6 +521,22 @@ daos_event_complete(struct daos_event *ev, int rc) } } +void +daos_event_complete_with_check(struct daos_event *ev, int rc, void *data) +{ + struct daos_event_private *evx = daos_ev2evx(ev); + unsigned int gen; + + D_ASSERT(data != NULL); + + gen = *(unsigned int *)data; + + if (unlikely(evx->evx_gen != gen)) + D_WARN("Skip completion against stale event %p: %u vs %u\n", ev, evx->evx_gen, gen); + else + daos_event_complete(ev, rc); +} + struct ev_progress_arg { struct daos_eq_private *eqx; struct daos_event_private *evx; @@ -1004,6 +1020,19 @@ daos_event_destroy_children(struct daos_event *ev, bool force) return rc; } +unsigned int +daos_event_bump_gen(daos_event_t *ev) +{ + struct daos_event_private *evx = daos_ev2evx(ev); + + if (likely(evx->evx_gen < DAOS_EVENT_GEN_MAX)) + evx->evx_gen++; + else + evx->evx_gen = 1; + + return evx->evx_gen; +} + /** * Add the event to the event queue, and if there is parent, add * it to its child list as well. diff --git a/src/client/api/task.c b/src/client/api/task.c index d6685080015..a20724e5bc8 100644 --- a/src/client/api/task.c +++ b/src/client/api/task.c @@ -1,5 +1,6 @@ /** * (C) Copyright 2017-2021 Intel Corporation. 
+ * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -35,8 +36,16 @@ task_is_valid(tse_task_t *task) static int task_comp_event(tse_task_t *task, void *data) { + daos_event_t *ev; + D_ASSERT(task_is_valid(task)); - daos_event_complete(task_ptr2args(task)->ta_ev, task->dt_result); + + ev = task_ptr2args(task)->ta_ev; + if (daos_event_is_priv(ev)) + daos_event_complete_with_check(ev, task->dt_result, data); + else + daos_event_complete(ev, task->dt_result); + return 0; } @@ -53,6 +62,8 @@ dc_task_create(tse_task_func_t func, tse_sched_t *sched, daos_event_t *ev, { struct daos_task_args *args; tse_task_t *task; + unsigned int gen; + size_t size = 0; int rc; if (sched == NULL) { @@ -71,8 +82,13 @@ dc_task_create(tse_task_func_t func, tse_sched_t *sched, daos_event_t *ev, args = task_ptr2args(task); args->ta_magic = DAOS_TASK_MAGIC; if (ev) { + if (daos_event_is_priv(ev)) { + gen = daos_event_bump_gen(ev); + size = sizeof(gen); + } + /** register a comp cb on the task to complete the event */ - rc = tse_task_register_comp_cb(task, task_comp_event, NULL, 0); + rc = tse_task_register_comp_cb(task, task_comp_event, size ? &gen : NULL, size); if (rc != 0) D_GOTO(failed, rc); args->ta_ev = ev; diff --git a/src/include/daos/event.h b/src/include/daos/event.h index 12c5c4933ec..1089793c34c 100644 --- a/src/include/daos/event.h +++ b/src/include/daos/event.h @@ -1,5 +1,6 @@ /** * (C) Copyright 2015-2024 Intel Corporation. + * (C) Copyright 2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -83,6 +84,16 @@ int daos_event_init_adv(struct daos_event *ev, enum daos_ev_flags flags, void daos_event_complete(daos_event_t *ev, int rc); +/** + * Mark the event completed with validation check firstly. + * + * \param ev [IN] event to complete. + * \param rc [IN] operation return code. + * \param data [IN] argument for completion callback verification. 
+ */ +void +daos_event_complete_with_check(daos_event_t *ev, int rc, void *data); + /** * Mark the event launched, i.e. move this event to running list. * @@ -120,6 +131,8 @@ daos_event_destroy(struct daos_event *ev, bool force); int daos_event_destroy_children(struct daos_event *ev, bool force); +unsigned int +daos_event_bump_gen(daos_event_t *ev); /** * Wait for completion of the private event