Skip to content

Commit 2060032

Browse files
committed
DAOS-18981 mercury: Fix handling of connection reset.
Test-provider: ucx+dc_x
Test-provider-hw-large: ucx+dc_x
Test-provider-hw-medium: ucx+dc_x
Test-provider-ucx: ucx+dc_x
Skip-func-hw-test-medium-verbs-provider: true
Skip-func-test-vm: true
Allow-unstable-test: true
Signed-off-by: Joseph Moore <joseph.moore@hpe.com>
1 parent 8cc63a9 commit 2060032

4 files changed

Lines changed: 27 additions & 3 deletions

File tree

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
diff --git a/src/na/na_ucx.c b/src/na/na_ucx.c
2+
index 96501b2..f97ce69 100644
3+
--- a/src/na/na_ucx.c
4+
+++ b/src/na/na_ucx.c
5+
@@ -3251,16 +3251,10 @@ na_ucx_rma(struct na_ucx_class NA_UNUSED *na_ucx_class, na_context_t *context,
6+
7+
/* There is no need to have a fully resolved address to start an RMA.
8+
* This is only necessary for two-sided communication. */
9+
- /* The above assumption is now in question, so the following will resolve
10+
- * the address if required. */
11+
+ NA_CHECK_SUBSYS_WARNING(rma,
12+
+ (!(hg_atomic_get32(&na_ucx_addr->status) & NA_UCX_ADDR_RESOLVED)),
13+
+ "Connection is not resolved for rma operation");
14+
15+
- /* Check addr to ensure the EP for that addr is still valid */
16+
- if (!(hg_atomic_get32(&na_ucx_addr->status) & NA_UCX_ADDR_RESOLVED)) {
17+
- ret = na_ucx_addr_map_update(
18+
- na_ucx_class, &na_ucx_class->addr_map, na_ucx_addr);
19+
- NA_CHECK_SUBSYS_NA_ERROR(
20+
- addr, error, ret, "Could not update NA UCX address");
21+
- }
22+
NA_CHECK_SUBSYS_ERROR(rma, na_ucx_addr->ucp_ep == NULL, error, ret,
23+
NA_ADDRNOTAVAIL, "UCP endpoint is NULL for that address");
24+

src/tests/ftest/util/general_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -545,7 +545,7 @@ def get_errors_count(log, hostlist, file_glob):
545545
"""
546546
# Get the Client side Error from client_log file.
547547
cmd = "cat {} | sed -n -E -e ".format(get_log_file(file_glob))
548-
cmd += r"'/^.+[[:space:]]ERR[[:space:]].+[[:space:]]DER_[^(]+\([^)]+\).+$/"
548+
cmd += r"'/^(ERR|.+[[:space:]]ERR)[[:space:]].+[[:space:]]DER_[^(]+\([^)]+\).+$/"
549549
cmd += r"s/^.+[[:space:]]DER_[^(]+\((-[[:digit:]]+)\).+$/\1/p'"
550550
result = run_remote(log, hostlist, cmd, verbose=False)
551551
errors_count = {}

utils/build.config

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,6 @@ ucx=https://github.com/openucx/ucx.git
2727

2828
[patch_versions]
2929
spdk=0001_external_isal_v26.01.patch
30-
mercury=0001_dep_versions.patch,0002_ofi_counters.patch,0003_ofi_auth_key.patch
30+
mercury=0001_dep_versions.patch,0002_ofi_counters.patch,0003_ofi_auth_key.patch,0004_ucx_connection_reset.patch
3131
pmdk=https://github.com/daos-stack/pmdk/commit/bb048d67ccd07609f86a5e8b3c6ad54414d593ee.diff,https://github.com/daos-stack/pmdk/commit/69925cf455ef672c4cbdbdb13bef7ae581e67045.diff,https://github.com/daos-stack/pmdk/commit/6805ed4f8d1a4e4c6070bf8b68f0dffef08b9c99.diff
3232
argobots=0001_411e5b344642ebc82190fd8b125db512e5b449d1.diff,0002_bb0c908abfac4bfe37852eee621930634183c6aa.diff

utils/rpms/package_info.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ export libfabric_version="1.22.0"
4545
export libfabric_release="5${distro_name}"
4646
export libfabric_full="${libfabric_version}-${libfabric_release}"
4747
export mercury_version="2.4.1"
48-
export mercury_release="2${distro_name}"
48+
export mercury_release="3${distro_name}"
4949
export mercury_full="${mercury_version}-${mercury_release}"
5050
export argobots_version="1.2"
5151
export argobots_release="4${distro_name}"

0 commit comments

Comments
 (0)