Description
Background information
What version of Open MPI are you using? (e.g., v4.1.6, v5.0.1, git branch name and hash, etc.)
v4.1.7rc1
Describe how Open MPI was installed (e.g., from a source/distribution tarball, from a git clone, from an operating system distribution package, etc.)
Git clone.
If you are building/installing from a git clone, please copy-n-paste the output from git submodule status
.
Please describe the system on which you are running
- Operating system/version: Ubuntu 24.04.2 LTS (Noble Numbat), Kernel 6.8.0-57-generic
- Computer hardware:
- Network type: IPoIB
Details of the problem
With the following reproducer with ucc_perftest :
$ srun -A admin -p admin -N64 --mpi=pmix --ntasks-per-node=8 --container-image=<container-image> env UCX_TLS=self,tcp ucc_perftest -c alltoall -m host -b 1048576 -e 2147483648 -n 2
srun: job 357102 queued and waiting for resources
srun: job 357102 has been allocated resources
[1744257072.724104] [node0190:1259783:0] sock.c:334 UCX ERROR connect(fd=252, dest_addr=<ip address>) failed: Connection timed out
[node0190.<domain>:1259783] pml_ucx.c:424 Error: ucp_ep_create(proc=496) failed: Destination is unreachable
[node0190.<domain>:1259783] pml_ucx.c:477 Error: Failed to resolve UCX endpoint for rank 496 [LOG_CAT_COMMPATTERNS] isend failed in comm_allreduce_pml at iterations 7
.................
that leads to corefiles being created on several nodes.
Thus I have been able to dig into these corefiles to try to understand the reason of the SEGVs, likely to be caused by a wrong/missing error-path in OMPI/UCC code, when triggering the original error/msgs above upon some IPoIB networking issue.
Here are my findings.
The fully unwound stack is as follows:
#0 ompi_request_default_test_all (count=2, requests=0x555555a2f228, completed=0x7fffffffc5c4, statuses=0x0) at request/req_test.c:187
#1 0x00007ffff50139ac in oob_allgather_test (req=0x555555a2f200) at coll_ucc_module.c:182
#2 0x00007ffff7f8ea5c in ucc_core_addr_exchange (context=context@entry=0x555555a2e990, oob=oob@entry=0x555555a2e9a8, addr_storage=addr_storage@entry=0x555555a2eaa0) at core/ucc_context.c:461
#3 0x00007ffff7f8f657 in ucc_context_create_proc_info (lib=0x5555559d12b0, params=params@entry=0x7fffffffc960, config=0x555555a2e840, context=context@entry=0x7ffff50213c8 <mca_coll_ucc_component+392>, proc_info=0x7ffff7fbca60 <ucc_local_proc>)
at core/ucc_context.c:723
#4 0x00007ffff7f901f0 in ucc_context_create (lib=<optimized out>, params=params@entry=0x7fffffffc960, config=<optimized out>, context=context@entry=0x7ffff50213c8 <mca_coll_ucc_component+392>) at core/ucc_context.c:866
#5 0x00007ffff5013cb1 in mca_coll_ucc_init_ctx () at coll_ucc_module.c:302
#6 0x00007ffff501583f in mca_coll_ucc_comm_query (comm=0x55555557d240 <ompi_mpi_comm_world>, priority=0x7fffffffcb6c) at coll_ucc_module.c:488
#7 0x00007ffff7ee5e4c in query_2_0_0 (module=<synthetic pointer>, priority=0x7fffffffcb6c, comm=0x55555557d240 <ompi_mpi_comm_world>, component=0x7ffff5021240 <mca_coll_ucc_component>) at base/coll_base_comm_select.c:540
#8 query (module=<synthetic pointer>, priority=0x7fffffffcb6c, comm=<optimized out>, component=0x7ffff5021240 <mca_coll_ucc_component>) at base/coll_base_comm_select.c:523
#9 check_one_component (module=<synthetic pointer>, component=0x7ffff5021240 <mca_coll_ucc_component>, comm=<optimized out>) at base/coll_base_comm_select.c:486
#10 check_components (comm=comm@entry=0x55555557d240 <ompi_mpi_comm_world>, components=<optimized out>) at base/coll_base_comm_select.c:406
#11 0x00007ffff7ee6446 in mca_coll_base_comm_select (comm=0x55555557d240 <ompi_mpi_comm_world>) at base/coll_base_comm_select.c:114
#12 0x00007ffff7f33613 in ompi_mpi_init (argc=<optimized out>, argc@entry=0, argv=<optimized out>, argv@entry=0x0, requested=0, provided=0x7fffffffcdf4, reinit_ok=reinit_ok@entry=false) at runtime/ompi_mpi_init.c:957
#13 0x00007ffff7ed6c2c in PMPI_Init (argc=0x0, argv=0x0) at pinit.c:69
#14 0x000055555555dbf4 in ucc_pt_bootstrap_mpi::ucc_pt_bootstrap_mpi() ()
#15 0x0000555555565666 in ucc_pt_comm::ucc_pt_comm(ucc_pt_comm_config) ()
#16 0x0000555555558f2a in main ()
where you can see that the unresolved symbol/frame in the previously detailed stack is in fact in oob_allgather_test().
And the reason of the SEGV is because :
(gdb) p/x *(oob_allgather_req_t *)0x555555a2f200
$1 = {sbuf = 0x555555a2ea00, rbuf = 0x555555a710c0, oob_coll_ctx = 0x55555557d240, msglen = 0x8, iter = 0x1, reqs = {0x726568, 0x555555a8fa48}}
where reqs[0] is garbage when being dereferenced :
(gdb) p/x $rip
$3 = 0x7ffff7eb39e8
(gdb) x/10i ($rip - 0x18)
0x7ffff7eb39d0 <ompi_request_default_test_all+48>: cmpq $0x1,0x58(%rax)
0x7ffff7eb39d5 <ompi_request_default_test_all+53>: je 0x7ffff7eb39f0 <ompi_request_default_test_all+80>
0x7ffff7eb39d7 <ompi_request_default_test_all+55>: lea 0x1(%r12),%rax
0x7ffff7eb39dc <ompi_request_default_test_all+60>: cmp %rax,%rdi
0x7ffff7eb39df <ompi_request_default_test_all+63>: je 0x7ffff7eb39fe <ompi_request_default_test_all+94>
0x7ffff7eb39e1 <ompi_request_default_test_all+65>: mov %rax,%r12
0x7ffff7eb39e4 <ompi_request_default_test_all+68>: mov (%rbx,%r12,8),%rax
=> 0x7ffff7eb39e8 <ompi_request_default_test_all+72>: mov 0x60(%rax),%esi
0x7ffff7eb39eb <ompi_request_default_test_all+75>: cmp $0x1,%esi
0x7ffff7eb39ee <ompi_request_default_test_all+78>: jne 0x7ffff7eb39d0 <ompi_request_default_test_all+48>
(gdb) x/gx ($rax + 0x60)
0x7265c8: Cannot access memory at address 0x7265c8
(gdb) p/x $rbx + $r12 * 0x8
$4 = 0x555555a2f228
(gdb) x/gx ($rbx + $r12 * 0x8)
0x555555a2f228: 0x0000000000726568
(gdb) p/x $rax
$5 = 0x726568
(gdb) x/gx ($rax + 0x60)
0x7265c8: Cannot access memory at address 0x7265c8
(gdb)
Looking at the corresponding source code in "ompi/mca/coll/ucc/coll_ucc_module.c" :
141
142 typedef struct oob_allgather_req{
143 void *sbuf;
144 void *rbuf;
145 void *oob_coll_ctx;
146 size_t msglen;
147 int iter;
148 ompi_request_t *reqs[2];
149 } oob_allgather_req_t;
150
151 static ucc_status_t oob_allgather_test(void *req)
152 {
153 oob_allgather_req_t *oob_req = (oob_allgather_req_t*)req;
154 ompi_communicator_t *comm = (ompi_communicator_t *)oob_req->oob_coll_ctx;
155 char *tmpsend = NULL;
156 char *tmprecv = NULL;
157 size_t msglen = oob_req->msglen;
158 int probe_count = 5;
159 int rank, size, sendto, recvfrom, recvdatafrom,
160 senddatafrom, completed, probe;
161
162 size = ompi_comm_size(comm);
163 rank = ompi_comm_rank(comm);
164 if (oob_req->iter == 0) {
165 tmprecv = (char*) oob_req->rbuf + (ptrdiff_t)rank * (ptrdiff_t)msglen;
166 memcpy(tmprecv, oob_req->sbuf, msglen);
167 }
168 sendto = (rank + 1) % size;
169 recvfrom = (rank - 1 + size) % size;
170 for (; oob_req->iter < size - 1; oob_req->iter++) {
171 if (oob_req->iter > 0) { <<<< iter is 0 for 1st loop ...
172 probe = 0;
173 do {
174 ompi_request_test_all(2, oob_req->reqs, &completed, MPI_STATUS_IGNORE);
<<<<<< during 2nd loop (iter == 1) , ompi_request_test_all() is called with garbled reqs[0] !!
175 probe++;
176 } while (!completed && probe < probe_count);
177 if (!completed) {
178 return UCC_INPROGRESS;
179 }
180 }
181 recvdatafrom = (rank - oob_req->iter - 1 + size) % size;
182 senddatafrom = (rank - oob_req->iter + size) % size;
183 tmprecv = (char*)oob_req->rbuf + (ptrdiff_t)recvdatafrom * (ptrdiff_t)msglen;
184 tmpsend = (char*)oob_req->rbuf + (ptrdiff_t)senddatafrom * (ptrdiff_t)msglen;
185 MCA_PML_CALL(isend(tmpsend, msglen, MPI_BYTE, sendto, MCA_COLL_BASE_TAG_UCC,
186 MCA_PML_BASE_SEND_STANDARD, comm, &oob_req->reqs[0]));
<<<<<< isend triggers an error so reqs[0] is not populated !!
187 MCA_PML_CALL(irecv(tmprecv, msglen, MPI_BYTE, recvfrom,
188 MCA_COLL_BASE_TAG_UCC, comm, &oob_req->reqs[1]));
<<<<<< irecv does not report an error, so reqs[1] is populated.
189 }
190 probe = 0;
191 do {
192 ompi_request_test_all(2, oob_req->reqs, &completed, MPI_STATUS_IGNORE);
193 probe++;
194 } while (!completed && probe < probe_count);
195 if (!completed) {
196 return UCC_INPROGRESS;
197 }
198 return UCC_OK;
199 }
200
201 static ucc_status_t oob_allgather_free(void *req)
202 {
203 free(req);
204 return UCC_OK;
205 }
206
207 static ucc_status_t oob_allgather(void *sbuf, void *rbuf, size_t msglen,
208 void *oob_coll_ctx, void **req)
209 {
210 oob_allgather_req_t *oob_req = malloc(sizeof(*oob_req));
211 oob_req->sbuf = sbuf;
212 oob_req->rbuf = rbuf;
213 oob_req->msglen = msglen;
214 oob_req->oob_coll_ctx = oob_coll_ctx;
215 oob_req->iter = 0;
216 *req = oob_req;
217 return UCC_OK;
218 }
219
"ompi/mca/coll/ucc/coll_ucc_module.c" 528 lines --41%-- 219,0-1 37%
and just to be complete :
#define ompi_request_test_all (ompi_request_functions.req_test_all)
"ompi/request/request.h" 504L, 19446B 407,1 83%
(gdb) x/i ompi_request_functions.req_test_all
0x7ffff7eb39a0 <ompi_request_default_test_all>: endbr64
Based on all of this, it appears that the following patch/correction (against v4.1.7rc1, the quite recent OMPI version we are running) would allow OMPI/UCC to avoid the coredump by gracefully handling any error during isend/irecv :
~/ompi$ git status
HEAD detached at v4.1.7rc1
Changes not staged for commit:
(use "git add <file>..." to update what will be committed)
(use "git restore <file>..." to discard changes in working directory)
modified: ompi/mca/coll/ucc/coll_ucc_module.c
no changes added to commit (use "git add" and/or "git commit -a")
~/ompi$ git diff
diff --git a/ompi/mca/coll/ucc/coll_ucc_module.c b/ompi/mca/coll/ucc/coll_ucc_module.c
index 1686697618..dfa2674a3d 100644
--- a/ompi/mca/coll/ucc/coll_ucc_module.c
+++ b/ompi/mca/coll/ucc/coll_ucc_module.c
@@ -158,6 +158,7 @@ static ucc_status_t oob_allgather_test(void *req)
int probe_count = 5;
int rank, size, sendto, recvfrom, recvdatafrom,
senddatafrom, completed, probe;
+ int rc;
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
@@ -182,10 +183,12 @@ static ucc_status_t oob_allgather_test(void *req)
senddatafrom = (rank - oob_req->iter + size) % size;
tmprecv = (char*)oob_req->rbuf + (ptrdiff_t)recvdatafrom * (ptrdiff_t)msglen;
tmpsend = (char*)oob_req->rbuf + (ptrdiff_t)senddatafrom * (ptrdiff_t)msglen;
- MCA_PML_CALL(isend(tmpsend, msglen, MPI_BYTE, sendto, MCA_COLL_BASE_TAG_UCC,
+ rc = MCA_PML_CALL(isend(tmpsend, msglen, MPI_BYTE, sendto, MCA_COLL_BASE_TAG_UCC,
MCA_PML_BASE_SEND_STANDARD, comm, &oob_req->reqs[0]));
- MCA_PML_CALL(irecv(tmprecv, msglen, MPI_BYTE, recvfrom,
+ if (OMPI_SUCCESS != rc) return UCC_ERR_NO_MESSAGE;
+ rc = MCA_PML_CALL(irecv(tmprecv, msglen, MPI_BYTE, recvfrom,
MCA_COLL_BASE_TAG_UCC, comm, &oob_req->reqs[1]));
+ if (OMPI_SUCCESS != rc) return UCC_ERR_NO_MESSAGE;
}
probe = 0;
do {
@@ -213,6 +216,8 @@ static ucc_status_t oob_allgather(void *sbuf, void *rbuf, size_t msglen,
oob_req->msglen = msglen;
oob_req->oob_coll_ctx = oob_coll_ctx;
oob_req->iter = 0;
+ oob_req->reqs[0] = NULL;
+ oob_req->reqs[1] = NULL;
*req = oob_req;
return UCC_OK;
}
~/ompi$