Skip to content

Commit 1941908

Browse files
committed
DAOS-18981 mercury: Fix handling of connection reset.
Test-provider: ucx+dc_x
Test-provider-hw-large: ucx+dc_x
Test-provider-hw-medium: ucx+dc_x
Test-provider-ucx: ucx+dc_x
Skip-func-hw-test-medium-verbs-provider: true
Skip-func-test-vm: true
Allow-unstable-test: true
Signed-off-by: Joseph Moore <joseph.moore@hpe.com>
1 parent 2060032 commit 1941908

1 file changed

Lines changed: 51 additions & 17 deletions

File tree

Lines changed: 51 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -1,24 +1,58 @@
11
diff --git a/src/na/na_ucx.c b/src/na/na_ucx.c
2-
index 96501b2..f97ce69 100644
2+
index 7c0ac8d4..662c3a79 100644
33
--- a/src/na/na_ucx.c
44
+++ b/src/na/na_ucx.c
5-
@@ -3251,16 +3251,10 @@ na_ucx_rma(struct na_ucx_class NA_UNUSED *na_ucx_class, na_context_t *context,
5+
@@ -144,6 +144,7 @@ struct na_ucx_addr {
6+
void *close_request; /* Close request */
7+
hg_atomic_int32_t refcount; /* Reference counter */
8+
hg_atomic_int32_t status; /* Connection state */
9+
+ bool connect; /* Issued from a connection request */
10+
};
611

7-
/* There is no need to have a fully resolved address to start an RMA.
8-
* This is only necessary for two-sided communication. */
9-
- /* The above assumption is now in question, so the following will resolve
10-
- * the address if required. */
11-
+ NA_CHECK_SUBSYS_WARNING(rma,
12-
+ (!(hg_atomic_get32(&na_ucx_addr->status) & NA_UCX_ADDR_RESOLVED)),
13-
+ "Connection is not resolved for rma operation");
12+
/* Map (used to cache addresses) */
13+
@@ -2746,6 +2747,7 @@ na_ucx_addr_map_insert(struct na_ucx_class *na_ucx_class,
14+
(void *) na_ucx_addr, &na_ucx_addr->ucp_ep);
15+
NA_CHECK_SUBSYS_NA_ERROR(
16+
addr, error, ret, "Could not connect UCP endpoint");
17+
+ na_ucx_addr->connect = true;
18+
}
19+
NA_LOG_SUBSYS_DEBUG(addr, "UCP ep for addr %p is %p", (void *) na_ucx_addr,
20+
(void *) na_ucx_addr->ucp_ep);
21+
@@ -3105,6 +3107,7 @@ na_ucx_addr_reset(struct na_ucx_addr *na_ucx_addr, ucs_sock_addr_t *addr_key)
22+
na_ucx_addr->ucp_ep = NULL;
23+
hg_atomic_init32(&na_ucx_addr->refcount, 1);
24+
hg_atomic_init32(&na_ucx_addr->status, 0);
25+
+ na_ucx_addr->connect = false;
1426

15-
- /* Check addr to ensure the EP for that addr is still valid */
27+
if (addr_key && addr_key->addr) {
28+
memcpy(&na_ucx_addr->ss_addr, addr_key->addr, addr_key->addrlen);
29+
@@ -3255,7 +3258,8 @@ na_ucx_rma(struct na_ucx_class NA_UNUSED *na_ucx_class, na_context_t *context,
30+
* the address if required. */
31+
32+
/* Check addr to ensure the EP for that addr is still valid */
33+
- if (!(hg_atomic_get32(&na_ucx_addr->status) & NA_UCX_ADDR_RESOLVED)) {
34+
+ if (na_ucx_addr->connect &&
35+
+ !(hg_atomic_get32(&na_ucx_addr->status) & NA_UCX_ADDR_RESOLVED)) {
36+
ret = na_ucx_addr_map_update(
37+
na_ucx_class, &na_ucx_class->addr_map, na_ucx_addr);
38+
NA_CHECK_SUBSYS_NA_ERROR(
39+
@@ -4076,7 +4080,8 @@ na_ucx_msg_send_unexpected(na_class_t NA_UNUSED *na_class,
40+
na_cb_type_to_string(na_ucx_op_id->completion_data.callback_info.type));
41+
42+
/* Check addr to ensure the EP for that addr is still valid */
43+
- if (!(hg_atomic_get32(&na_ucx_addr->status) & NA_UCX_ADDR_RESOLVED)) {
44+
+ if (na_ucx_addr->connect &&
45+
+ !(hg_atomic_get32(&na_ucx_addr->status) & NA_UCX_ADDR_RESOLVED)) {
46+
struct na_ucx_class *na_ucx_class = NA_UCX_CLASS(na_class);
47+
48+
ret = na_ucx_addr_map_update(
49+
@@ -4159,7 +4164,8 @@ na_ucx_msg_send_expected(na_class_t NA_UNUSED *na_class, na_context_t *context,
50+
na_cb_type_to_string(na_ucx_op_id->completion_data.callback_info.type));
51+
52+
/* Check addr to ensure the EP for that addr is still valid */
1653
- if (!(hg_atomic_get32(&na_ucx_addr->status) & NA_UCX_ADDR_RESOLVED)) {
17-
- ret = na_ucx_addr_map_update(
18-
- na_ucx_class, &na_ucx_class->addr_map, na_ucx_addr);
19-
- NA_CHECK_SUBSYS_NA_ERROR(
20-
- addr, error, ret, "Could not update NA UCX address");
21-
- }
22-
NA_CHECK_SUBSYS_ERROR(rma, na_ucx_addr->ucp_ep == NULL, error, ret,
23-
NA_ADDRNOTAVAIL, "UCP endpoint is NULL for that address");
54+
+ if (na_ucx_addr->connect &&
55+
+ !(hg_atomic_get32(&na_ucx_addr->status) & NA_UCX_ADDR_RESOLVED)) {
56+
struct na_ucx_class *na_ucx_class = NA_UCX_CLASS(na_class);
2457

58+
ret = na_ucx_addr_map_update(

0 commit comments

Comments
 (0)