Skip to content

SEGV in ompi_request_default_test_all() when triggering IPoIB networking problem during ucc_perftest run #13191

Open
@bfaccini

Description

@bfaccini

Background information

What version of Open MPI are you using? (e.g., v4.1.6, v5.0.1, git branch name and hash, etc.)

v4.1.7rc1

Describe how Open MPI was installed (e.g., from a source/distribution tarball, from a git clone, from an operating system distribution package, etc.)

Git clone.

If you are building/installing from a git clone, please copy-n-paste the output from git submodule status.

Please describe the system on which you are running

  • Operating system/version: Ubuntu 24.04.2 LTS (Noble Numbat), Kernel 6.8.0-57-generic
  • Computer hardware:
  • Network type: IPoIB

Details of the problem

The problem can be reproduced with the following ucc_perftest command:
​​​​​​

$ srun -A admin -p admin -N64 --mpi=pmix --ntasks-per-node=8 --container-image=<container-image> env UCX_TLS=self,tcp ucc_perftest -c alltoall -m host -b 1048576 -e 2147483648 -n 2 
srun: job 357102 queued and waiting for resources 
srun: job 357102 has been allocated resources 
[1744257072.724104] [node0190:1259783:0] sock.c:334 UCX ERROR connect(fd=252, dest_addr=<ip address>) failed: Connection timed out 
[node0190.<domain>:1259783] pml_ucx.c:424 Error: ucp_ep_create(proc=496) failed: Destination is unreachable 
[node0190.<domain>:1259783] pml_ucx.c:477 Error: Failed to resolve UCX endpoint for rank 496 [LOG_CAT_COMMPATTERNS] isend failed in comm_allreduce_pml at iterations 7
.................

This leads to corefiles being created on several nodes.
I was able to dig into these corefiles to understand the reason for the SEGVs. They are most likely caused by a wrong or missing error path in the OMPI/UCC code, which is exercised when the original errors/messages above are triggered by an IPoIB networking issue.

Here are my findings.
The fully unwound stack is as follows:

#0  ompi_request_default_test_all (count=2, requests=0x555555a2f228, completed=0x7fffffffc5c4, statuses=0x0) at request/req_test.c:187

#1  0x00007ffff50139ac in oob_allgather_test (req=0x555555a2f200) at coll_ucc_module.c:182

#2  0x00007ffff7f8ea5c in ucc_core_addr_exchange (context=context@entry=0x555555a2e990, oob=oob@entry=0x555555a2e9a8, addr_storage=addr_storage@entry=0x555555a2eaa0) at core/ucc_context.c:461

#3  0x00007ffff7f8f657 in ucc_context_create_proc_info (lib=0x5555559d12b0, params=params@entry=0x7fffffffc960, config=0x555555a2e840, context=context@entry=0x7ffff50213c8 <mca_coll_ucc_component+392>, proc_info=0x7ffff7fbca60 <ucc_local_proc>)

    at core/ucc_context.c:723

#4  0x00007ffff7f901f0 in ucc_context_create (lib=<optimized out>, params=params@entry=0x7fffffffc960, config=<optimized out>, context=context@entry=0x7ffff50213c8 <mca_coll_ucc_component+392>) at core/ucc_context.c:866

#5  0x00007ffff5013cb1 in mca_coll_ucc_init_ctx () at coll_ucc_module.c:302

#6  0x00007ffff501583f in mca_coll_ucc_comm_query (comm=0x55555557d240 <ompi_mpi_comm_world>, priority=0x7fffffffcb6c) at coll_ucc_module.c:488

#7  0x00007ffff7ee5e4c in query_2_0_0 (module=<synthetic pointer>, priority=0x7fffffffcb6c, comm=0x55555557d240 <ompi_mpi_comm_world>, component=0x7ffff5021240 <mca_coll_ucc_component>) at base/coll_base_comm_select.c:540

#8  query (module=<synthetic pointer>, priority=0x7fffffffcb6c, comm=<optimized out>, component=0x7ffff5021240 <mca_coll_ucc_component>) at base/coll_base_comm_select.c:523

#9  check_one_component (module=<synthetic pointer>, component=0x7ffff5021240 <mca_coll_ucc_component>, comm=<optimized out>) at base/coll_base_comm_select.c:486

#10 check_components (comm=comm@entry=0x55555557d240 <ompi_mpi_comm_world>, components=<optimized out>) at base/coll_base_comm_select.c:406

#11 0x00007ffff7ee6446 in mca_coll_base_comm_select (comm=0x55555557d240 <ompi_mpi_comm_world>) at base/coll_base_comm_select.c:114

#12 0x00007ffff7f33613 in ompi_mpi_init (argc=<optimized out>, argc@entry=0, argv=<optimized out>, argv@entry=0x0, requested=0, provided=0x7fffffffcdf4, reinit_ok=reinit_ok@entry=false) at runtime/ompi_mpi_init.c:957

#13 0x00007ffff7ed6c2c in PMPI_Init (argc=0x0, argv=0x0) at pinit.c:69

#14 0x000055555555dbf4 in ucc_pt_bootstrap_mpi::ucc_pt_bootstrap_mpi() ()

#15 0x0000555555565666 in ucc_pt_comm::ucc_pt_comm(ucc_pt_comm_config) ()

#16 0x0000555555558f2a in main ()

where you can see that the unresolved symbol/frame in the previously detailed stack is in fact oob_allgather_test().

The reason for the SEGV is the following:

(gdb) p/x *(oob_allgather_req_t *)0x555555a2f200
​​​​​​$1 = {sbuf = 0x555555a2ea00, rbuf = 0x555555a710c0, oob_coll_ctx = 0x55555557d240, msglen = 0x8, iter = 0x1, reqs = {0x726568, 0x555555a8fa48}}

where reqs[0] (0x726568) is a garbage pointer that faults when it is dereferenced:

(gdb) p/x $rip

$3 = 0x7ffff7eb39e8

(gdb) x/10i ($rip - 0x18)

   0x7ffff7eb39d0 <ompi_request_default_test_all+48>:   cmpq   $0x1,0x58(%rax)

   0x7ffff7eb39d5 <ompi_request_default_test_all+53>:   je     0x7ffff7eb39f0 <ompi_request_default_test_all+80>

   0x7ffff7eb39d7 <ompi_request_default_test_all+55>:   lea    0x1(%r12),%rax

   0x7ffff7eb39dc <ompi_request_default_test_all+60>:   cmp    %rax,%rdi

   0x7ffff7eb39df <ompi_request_default_test_all+63>:   je     0x7ffff7eb39fe <ompi_request_default_test_all+94>

   0x7ffff7eb39e1 <ompi_request_default_test_all+65>:   mov    %rax,%r12

   0x7ffff7eb39e4 <ompi_request_default_test_all+68>:   mov    (%rbx,%r12,8),%rax

=> 0x7ffff7eb39e8 <ompi_request_default_test_all+72>:   mov    0x60(%rax),%esi

   0x7ffff7eb39eb <ompi_request_default_test_all+75>:   cmp    $0x1,%esi

   0x7ffff7eb39ee <ompi_request_default_test_all+78>:   jne    0x7ffff7eb39d0 <ompi_request_default_test_all+48>

(gdb) x/gx ($rax + 0x60)

0x7265c8:       Cannot access memory at address 0x7265c8

(gdb) p/x $rbx + $r12 * 0x8

$4 = 0x555555a2f228

(gdb) x/gx ($rbx + $r12 * 0x8)

0x555555a2f228: 0x0000000000726568

(gdb) p/x $rax

$5 = 0x726568

(gdb) x/gx ($rax + 0x60)

0x7265c8:       Cannot access memory at address 0x7265c8

(gdb) 

Looking at the corresponding source code in "ompi/mca/coll/ucc/coll_ucc_module.c" :

141         

142 typedef struct oob_allgather_req{

143     void           *sbuf;

144     void           *rbuf;  

145     void           *oob_coll_ctx;

146     size_t          msglen;

147     int             iter;

148     ompi_request_t *reqs[2];

149 } oob_allgather_req_t;

150         

151 static ucc_status_t oob_allgather_test(void *req)

152 {   

153     oob_allgather_req_t *oob_req = (oob_allgather_req_t*)req;

154     ompi_communicator_t *comm    = (ompi_communicator_t *)oob_req->oob_coll_ctx;

155     char                *tmpsend = NULL;

156     char                *tmprecv = NULL;

157     size_t               msglen  = oob_req->msglen;

158     int                  probe_count = 5;

159     int rank, size, sendto, recvfrom, recvdatafrom,

160         senddatafrom, completed, probe;

161     

162     size = ompi_comm_size(comm);

163     rank = ompi_comm_rank(comm);

164     if (oob_req->iter == 0) {

165         tmprecv = (char*) oob_req->rbuf + (ptrdiff_t)rank * (ptrdiff_t)msglen;

166         memcpy(tmprecv, oob_req->sbuf, msglen);

167     }

168     sendto   = (rank + 1) % size;

169     recvfrom = (rank - 1 + size) % size;

170     for (; oob_req->iter < size - 1; oob_req->iter++) {

171         if (oob_req->iter > 0) {             <<<< iter is 0 for 1st loop ...

172             probe = 0;

173             do {​​​​​​

174                 ompi_request_test_all(2, oob_req->reqs, &completed, MPI_STATUS_IGNORE);
<<<<<< during the 2nd loop iteration (iter == 1), ompi_request_test_all() is called with a garbage reqs[0] !!
175                 probe++;

176             } while (!completed && probe < probe_count);

177             if (!completed) {

178                 return UCC_INPROGRESS;

179             }

180         }

181         recvdatafrom = (rank - oob_req->iter - 1 + size) % size;

182         senddatafrom = (rank - oob_req->iter + size) % size;

183         tmprecv = (char*)oob_req->rbuf + (ptrdiff_t)recvdatafrom * (ptrdiff_t)msglen;

184         tmpsend = (char*)oob_req->rbuf + (ptrdiff_t)senddatafrom * (ptrdiff_t)msglen;

185         MCA_PML_CALL(isend(tmpsend, msglen, MPI_BYTE, sendto, MCA_COLL_BASE_TAG_UCC,

186                            MCA_PML_BASE_SEND_STANDARD, comm, &oob_req->reqs[0]));
<<<<<< isend triggers an error so reqs[0] is not populated !!

187         MCA_PML_CALL(irecv(tmprecv, msglen, MPI_BYTE, recvfrom,

188                            MCA_COLL_BASE_TAG_UCC, comm, &oob_req->reqs[1]));
<<<<<< irecv does not report an error, so reqs[1] is populated.
189     }

190     probe = 0;

191     do {

192         ompi_request_test_all(2, oob_req->reqs, &completed, MPI_STATUS_IGNORE);

193         probe++;

194     } while (!completed && probe < probe_count);

195     if (!completed) {

196         return UCC_INPROGRESS;

197     }

198     return UCC_OK;

199 }

200 

201 static ucc_status_t oob_allgather_free(void *req)

202 {

203     free(req);

204     return UCC_OK;

205 }

206 

207 static ucc_status_t oob_allgather(void *sbuf, void *rbuf, size_t msglen,

208                                   void *oob_coll_ctx, void **req)

209 {

210     oob_allgather_req_t *oob_req = malloc(sizeof(*oob_req));

211     oob_req->sbuf                = sbuf;

212     oob_req->rbuf                = rbuf;

213     oob_req->msglen              = msglen;

214     oob_req->oob_coll_ctx        = oob_coll_ctx;

215     oob_req->iter                = 0;​​​​​​

216     *req                         = oob_req;

217     return UCC_OK;

218 }

219 
​​​​​​"ompi/mca/coll/ucc/coll_ucc_module.c" 528 lines --41%--                              219,0-1       37%

and just to be complete :

​​​​​​#define ompi_request_test_all   (ompi_request_functions.req_test_all)
​​​​​​"ompi/request/request.h" 504L, 19446B              407,1         83%

​​​​​​

(gdb) x/i ompi_request_functions.req_test_all

   0x7ffff7eb39a0 <ompi_request_default_test_all>:      endbr64

Based on all of this, it appears that the following patch (against v4.1.7rc1, the fairly recent OMPI version we are running) would stop OMPI/UCC from dumping core, by gracefully handling any error returned by isend/irecv:

~/ompi$ git status

HEAD detached at v4.1.7rc1

Changes not staged for commit:

  (use "git add <file>..." to update what will be committed)

  (use "git restore <file>..." to discard changes in working directory)

        modified:   ompi/mca/coll/ucc/coll_ucc_module.c

no changes added to commit (use "git add" and/or "git commit -a")

~/ompi$ git diff 

diff --git a/ompi/mca/coll/ucc/coll_ucc_module.c b/ompi/mca/coll/ucc/coll_ucc_module.c

index 1686697618..dfa2674a3d 100644

--- a/ompi/mca/coll/ucc/coll_ucc_module.c

+++ b/ompi/mca/coll/ucc/coll_ucc_module.c

@@ -158,6 +158,7 @@ static ucc_status_t oob_allgather_test(void *req)

     int                  probe_count = 5;

     int rank, size, sendto, recvfrom, recvdatafrom,

         senddatafrom, completed, probe;

+    int rc;

 

     size = ompi_comm_size(comm);

     rank = ompi_comm_rank(comm);

@@ -182,10 +183,12 @@ static ucc_status_t oob_allgather_test(void *req)

         senddatafrom = (rank - oob_req->iter + size) % size;

         tmprecv = (char*)oob_req->rbuf + (ptrdiff_t)recvdatafrom * (ptrdiff_t)msglen;

         tmpsend = (char*)oob_req->rbuf + (ptrdiff_t)senddatafrom * (ptrdiff_t)msglen;

-        MCA_PML_CALL(isend(tmpsend, msglen, MPI_BYTE, sendto, MCA_COLL_BASE_TAG_UCC,

+        rc = MCA_PML_CALL(isend(tmpsend, msglen, MPI_BYTE, sendto, MCA_COLL_BASE_TAG_UCC,

                            MCA_PML_BASE_SEND_STANDARD, comm, &oob_req->reqs[0]));

-        MCA_PML_CALL(irecv(tmprecv, msglen, MPI_BYTE, recvfrom,

+       if (OMPI_SUCCESS != rc) return rc

+        rc = MCA_PML_CALL(irecv(tmprecv, msglen, MPI_BYTE, recvfrom,

                            MCA_COLL_BASE_TAG_UCC, comm, &oob_req->reqs[1]));

+       if (OMPI_SUCCESS != rc) return rc

     }

     probe = 0;

     do {

@@ -213,6 +216,8 @@ static ucc_status_t oob_allgather(void *sbuf, void *rbuf, size_t msglen,

     oob_req->msglen              = msglen;

     oob_req->oob_coll_ctx        = oob_coll_ctx;

     oob_req->iter                = 0;

+    oob_req->reqs[0]             = NULL;

+    oob_req->reqs[1]             = NULL;

     *req                         = oob_req;

     return UCC_OK;

 }

~/ompi$ 

Metadata

Metadata

Assignees

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions