@@ -164,7 +164,7 @@ __device__ __forceinline__ void recvImplNonContig(
     int groupIdx,
     int ngroups,
     size_t maxRecvcount,
-    bool nonContigIndices) {
+    bool combine) {
   const auto localRank = statex->localRank();
   const auto nLocalRanks = statex->nLocalRanks();

@@ -189,7 +189,7 @@ __device__ __forceinline__ void recvImplNonContig(
   // writes it to the recvCountsTmpbufGPU buffer.
   devSyncWaitStep(sync, groupIdx, 0);
   mySendIndicesBlockLength = recvIndicesPeerAllToAllvDynamicBufsMap[0];
-  if (threadIdx.x == 0 && groupIdx == 0 && !nonContigIndices) {
+  if (threadIdx.x == 0 && groupIdx == 0 && !combine) {
     for (int i = 0; i < sendcountsLength; i++) {
       recvCountsTmpbufGPU[recvPeerGlobal * sendcountsLength + i] =
           recvcountsPeerAllToAllvDynamicBufsMap[i];
@@ -198,7 +198,7 @@ __device__ __forceinline__ void recvImplNonContig(
   devSyncSetStep(sync, groupIdx, CTRAN_ALGO_STEP_RESET);

   size_t recvOffsets = 0, lastRecvIndex = 0;
-  if (nonContigIndices) {
+  if (combine) {
     lastRecvIndex = sendcountsLength * statex->rank() / statex->nRanks();
   }
   for (int i = 0; i < mySendIndicesBlockLength; i++) {
@@ -304,7 +304,7 @@ __device__ __forceinline__ void selfCopyNonContig(
     int groupIdx,
     bool groupType,
     size_t maxRecvcount,
-    bool nonContigIndices) {
+    bool combine) {
   // Now we calculate the startSendIndex on-the-fly,
   // which may not be efficient. If the inputChunkCountPerRank can be
   // on CPU, we can calculate it on CPU and pass it to GPU.
@@ -317,7 +317,7 @@ __device__ __forceinline__ void selfCopyNonContig(
     startSendIndex += inputChunkCountPerRank[i];
   }

-  if (!nonContigIndices && groupIdx == 0 && groupType == GROUP_RECV) {
+  if (!combine && groupIdx == 0 && groupType == GROUP_RECV) {
     ctranKernCopy<size_t>(
         sendcounts,
         recvCountsTmpbufGPU + rank * sendcountsLength,
@@ -326,7 +326,7 @@ __device__ __forceinline__ void selfCopyNonContig(
         1);
   }

-  if (nonContigIndices) {
+  if (combine) {
     curOffsetIndex = sendcountsLength * rank / nRanks;
   }

@@ -383,7 +383,7 @@ __device__ __forceinline__ void ncclKernelAllToAllvDynamicCommon(
     int* flag,
     CtranKernelAllToAllvDynamicArgs args,
     ALGOTYPE algoType,
-    bool nonContigIndices = false) {
+    bool combine = false) {
   const auto gtIdx = blockDim.x * blockIdx.x + threadIdx.x;

   const auto rank = statex->rank();
@@ -461,7 +461,7 @@ __device__ __forceinline__ void ncclKernelAllToAllvDynamicCommon(
         groupIdx,
         groupType,
         args.nonContig.maxRecvcount,
-        nonContigIndices);
+        combine);
     if (groupType == GROUP_RECV) {
       recvImplNonContig(
           recvbuffs,
@@ -471,7 +471,7 @@ __device__ __forceinline__ void ncclKernelAllToAllvDynamicCommon(
           groupIdx,
           ngroups,
           args.nonContig.maxRecvcount,
-          nonContigIndices);
+          combine);
     } else {
       sendImplNonContig(
           sendbuffs,
@@ -510,7 +510,7 @@ __device__ __forceinline__ void ncclKernelAllToAllvDynamicCommon(
   // Copy back to recvcounts for DYNAMIC and DYNAMIC_SPLIT
   // or if it is first a2a for DYNAMIC_SPLIT_NON_CONTIG
   if (groupIdx == 0 && groupType == GROUP_RECV &&
-      (algoType != DYNAMIC_SPLIT_NON_CONTIG || !nonContigIndices)) {
+      (algoType != DYNAMIC_SPLIT_NON_CONTIG || !combine)) {
     ctranKernCopy<size_t>(
         recvCountsTmpbufGPU,
         reinterpret_cast<size_t*>(args.actualRecvcounts),
@@ -528,7 +528,7 @@ __device__ __forceinline__ void ncclKernelAllToAllvDynamicCommon(
 template <typename T>
 __device__ __forceinline__ void generateSendbuffs(
     CtranKernelAllToAllvDynamicArgs& args,
-    bool nonContigIndices = false) {
+    bool combine = false) {
   const auto gtIdx = blockDim.x * blockIdx.x + threadIdx.x;
   const size_t* sendSplitLengths = (size_t*)args.sendcounts;
   args.split.sendbuffsPtrShmDev =
@@ -548,7 +548,7 @@ __device__ __forceinline__ void generateSendbuffs(
     // and hence need to reset the sendbuff offset.
     // The length of each rank is equal to maxsendcounts/ranks.
     // i / numCountsPerRank is the rank number.
-    if (nonContigIndices && (i % numCountsPerRank == 0)) {
+    if (combine && (i % numCountsPerRank == 0)) {
       sendbuffsGPU[i] = sendbuffsGPU[0] +
           (args.nonContig.maxSendcount / statex->nRanks()) *
               (i / numCountsPerRank);
@@ -592,14 +592,14 @@ __global__ void ncclKernelAllToAllvDynamicSplitNonContig(
     CtranKernelAllToAllvDynamicArgs args) {
   devStateLoadToShm(devState);

-  bool nonContigIndices = false;
   int totalSendIndicesLength = 0;
   for (int i = 0; i < statex->nRanks(); i++) {
     totalSendIndicesLength += args.nonContig.inputChunkCountPerRank[i];
   }
-  nonContigIndices = (totalSendIndicesLength < args.sendcountsLength);

-  generateSendbuffs<T>(args, nonContigIndices);
+  bool combine = args.nonContig.combine;
+
+  generateSendbuffs<T>(args, combine);

   ctranKernCopy<size_t>(
       args.nonContig.inputChunkIndices,
@@ -630,7 +630,7 @@ __global__ void ncclKernelAllToAllvDynamicSplitNonContig(
   }

   ncclKernelAllToAllvDynamicCommon<T>(
-      flag, args, DYNAMIC_SPLIT_NON_CONTIG, nonContigIndices);
+      flag, args, DYNAMIC_SPLIT_NON_CONTIG, combine);
 }

 #define DECL_CTRAN_ALLTOALLVDYNAMIC_KERN(T) \
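Note: the diff above stops inferring the flag on the device (the removed totalSendIndicesLength < args.sendcountsLength check) and instead reads an explicit combine field from args.nonContig. The sketch below is a minimal, hypothetical illustration of that pattern only; ToyNonContigArgs, toyKernel, and the host-side derivation are made-up names for illustration, not part of the ctran API, and it assumes the host computes the flag the same way the removed device code did.

// Hypothetical sketch: decide the flag once on the host and pass it through
// kernel arguments instead of re-deriving it inside the kernel.
#include <cstdio>
#include <numeric>
#include <vector>

// Toy stand-in for the relevant piece of the real kernel-args struct.
struct ToyNonContigArgs {
  bool combine;
};

__global__ void toyKernel(ToyNonContigArgs args) {
  // Every thread sees the precomputed flag; no per-kernel re-derivation.
  if (threadIdx.x == 0 && blockIdx.x == 0) {
    printf("combine = %d\n", args.combine ? 1 : 0);
  }
}

int main() {
  // Per-rank chunk counts; stands in for inputChunkCountPerRank.
  std::vector<int> inputChunkCountPerRank = {1, 2, 1, 1};
  size_t sendcountsLength = 8;

  // Assumption: the host derives the flag the same way the removed device code did.
  int totalSendIndicesLength = std::accumulate(
      inputChunkCountPerRank.begin(), inputChunkCountPerRank.end(), 0);
  ToyNonContigArgs args{totalSendIndicesLength <
                        static_cast<int>(sendcountsLength)};

  toyKernel<<<1, 32>>>(args);
  cudaDeviceSynchronize();
  return 0;
}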