Skip to content

Commit 968bf7e

Browse files
committed
Add a configure option to enable GPU-aware communications.
Signed-off-by: George Bosilca <gbosilca@nvidia.com>
1 parent 3ecdb1a commit 968bf7e

File tree

6 files changed

+32
-6
lines changed

6 files changed

+32
-6
lines changed

CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,9 @@ option(PARSEC_DIST_WITH_MPI
165165
if(PARSEC_DIST_WITH_MPI AND 0)
166166
message(FATAL_ERROR "PARSEC_DIST_WITH_MPI and PARSEC_DIST_WITH_OTHER are mutually exclusive, please select only one")
167167
endif()
168+
option(PARSEC_MPI_IS_GPU_AWARE
169+
"Build PaRSEC assuming the MPI library is GPU-aware, aka. can move data directly to and from GPU memory.\
170+
As of today (mid 2024) while most MPI support such an option, they require a single process per GPU" ON)
168171
option(PARSEC_DIST_THREAD
169172
"Use an extra thread to progress the data movements" ON)
170173
option(PARSEC_DIST_PRIORITIES

parsec/include/parsec/parsec_options.h.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@
7171

7272
/* Communication engine */
7373
#cmakedefine PARSEC_DIST_WITH_MPI
74+
#cmakedefine PARSEC_MPI_IS_GPU_AWARE
7475
#cmakedefine PARSEC_DIST_THREAD
7576
#cmakedefine PARSEC_DIST_PRIORITIES
7677
#cmakedefine PARSEC_DIST_COLLECTIVES

parsec/mca/device/device_gpu.c

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -954,9 +954,9 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device,
954954

955955
/* Skip CTL flows only */
956956
if(PARSEC_FLOW_ACCESS_NONE == (PARSEC_FLOW_ACCESS_MASK & flow->flow_flags)) {
957-
gpu_task->flow_nb_elts[i] = 0; /* assume there is nothing to transfer to the GPU */
957+
gpu_task->flow_nb_elts[i] = 0; /* assume there is nothing to transfer to the GPU */
958958
continue;
959-
}
959+
}
960960

961961
PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
962962
"GPU[%d:%s]:%s: Investigating flow %s:%d",
@@ -971,7 +971,7 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device,
971971
gpu_device->super.device_index, gpu_device->super.name, task_name,
972972
flow->name, i, gpu_elem,
973973
this_task->data[i].data_in->data_transfer_status == PARSEC_DATA_STATUS_UNDER_TRANSFER ? " [in transfer]" : "");
974-
this_task->data[i].data_out = this_task->data[i].data_in;
974+
this_task->data[i].data_out = this_task->data[i].data_in;
975975
continue;
976976
}
977977
master = this_task->data[i].data_in->original;
@@ -2466,7 +2466,10 @@ parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device,
24662466
gpu_copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED;
24672467
assert(PARSEC_DATA_STATUS_UNDER_TRANSFER == cpu_copy->data_transfer_status);
24682468
cpu_copy->data_transfer_status = PARSEC_DATA_STATUS_COMPLETE_TRANSFER;
2469-
2469+
if( 0 == (parsec_mpi_allow_gpu_memory_communications & PARSEC_RUNTIME_SEND_FROM_GPU_MEMORY) ) {
2470+
/* Report the CPU copy as the output of the task. */
2471+
this_task->data[i].data_out = cpu_copy;
2472+
}
24702473
PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
24712474
"GPU copy %p [ref_count %d] moved to the read LRU in %s",
24722475
gpu_copy, gpu_copy->super.super.obj_reference_count, __func__);

parsec/parsec_internal.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,9 @@ PARSEC_DECLSPEC PARSEC_OBJ_CLASS_DECLARATION(parsec_taskpool_t);
200200
#define PARSEC_DEPENDENCIES_STARTUP_TASK ((parsec_dependency_t)(1<<29))
201201
#define PARSEC_DEPENDENCIES_BITMASK (~(PARSEC_DEPENDENCIES_TASK_DONE|PARSEC_DEPENDENCIES_IN_DONE|PARSEC_DEPENDENCIES_STARTUP_TASK))
202202

203+
/* Bit of parsec_mpi_allow_gpu_memory_communications. When this bit is clear, the GPU
 * kernel epilog reports the CPU copy (instead of the GPU copy) as the task's data_out. */
#define PARSEC_RUNTIME_SEND_FROM_GPU_MEMORY 0x00000002
204+
/* Bit of parsec_mpi_allow_gpu_memory_communications. When this bit is set, the receive
 * buffer for remote data is allocated on the dependency's preferred device; when clear,
 * it is allocated on device 0 (presumably the CPU device -- TODO confirm). */
#define PARSEC_RUNTIME_RECV_FROM_GPU_MEMORY 0x00000001
205+
203206
/**
204207
* This structure is used internally by the parsec_dependencies_t structures
205208
*/
@@ -492,6 +495,12 @@ PARSEC_DECLSPEC extern int parsec_slow_bind_warning;
492495
* the scheduler, but can provide a better cache reuse.
493496
*/
494497
PARSEC_DECLSPEC extern int parsec_runtime_keep_highest_priority_task;
498+
/**
499+
* Global configuration mask controlling whether data may be sent directly from,
500+
* or received directly into, GPU memory. Its value is a bitwise OR of any combination
501+
* of PARSEC_RUNTIME_SEND_FROM_GPU_MEMORY and PARSEC_RUNTIME_RECV_FROM_GPU_MEMORY.
502+
*/
503+
PARSEC_DECLSPEC extern int parsec_mpi_allow_gpu_memory_communications;
495504

496505
/**
497506
* Description of the state of the task. It indicates what will be the next

parsec/parsec_mpi_funnelled.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,8 @@ parsec_list_t mpi_funnelled_dynamic_sendreq_fifo; /* ordered non threaded fifo *
201201
parsec_list_t mpi_funnelled_dynamic_recvreq_fifo; /* ordered non threaded fifo */
202202
parsec_mempool_t *mpi_funnelled_dynamic_req_mempool = NULL;
203203

204+
/* Bitmask of the GPU-memory communications the runtime is allowed to perform. The
 * initial value 3 sets both PARSEC_RUNTIME_SEND_FROM_GPU_MEMORY (0x2) and
 * PARSEC_RUNTIME_RECV_FROM_GPU_MEMORY (0x1); mpi_funneled_init_once resets it to 0 when
 * PARSEC_MPI_IS_GPU_AWARE is not defined, then registers it as the "mpi"/"gpu_aware"
 * MCA parameter. */
int parsec_mpi_allow_gpu_memory_communications = 3;
205+
204206
/* This structure is used to save all the information necessary to
205207
* invoke a callback after a MPI_Request is satisfied
206208
*/
@@ -506,6 +508,14 @@ static int mpi_funneled_init_once(parsec_context_t* context)
506508
MAX_MPI_TAG, (unsigned int)MAX_MPI_TAG, MAX_MPI_TAG / MAX_DEP_OUT_COUNT);
507509
}
508510

511+
#if !defined(PARSEC_MPI_IS_GPU_AWARE)
512+
parsec_mpi_allow_gpu_memory_communications = 0;
513+
#endif
514+
parsec_mca_param_reg_int_name("mpi", "gpu_aware",
515+
"Enabled if PaRSEC should allow MPI to move data directly from or to GPU memory. Otherwise, all data"
516+
" movements will transit through CPU memory, and will always have a backup copy there. Accepted values "
517+
"are ORed between 1 for receiving into GPU memory and 2 for sending from GPU memory",
518+
false, false, parsec_mpi_allow_gpu_memory_communications, &parsec_mpi_allow_gpu_memory_communications);
509519
(void)context;
510520
return 0;
511521
}

parsec/remote_dep_mpi.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2110,8 +2110,8 @@ static void remote_dep_mpi_get_start(parsec_execution_stream_t* es,
21102110
/* prepare the local receiving data */
21112111
assert(NULL == deps->output[k].data.data); /* we do not support in-place tiles now, make sure it doesn't happen yet */
21122112
if(NULL == deps->output[k].data.data) {
2113-
deps->output[k].data.data = remote_dep_copy_allocate(&deps->output[k].data.remote,
2114-
deps->output[k].data.preferred_device);
2113+
int best_device = (parsec_mpi_allow_gpu_memory_communications & PARSEC_RUNTIME_RECV_FROM_GPU_MEMORY) ? deps->output[k].data.preferred_device : 0;
2114+
deps->output[k].data.data = remote_dep_copy_allocate(&deps->output[k].data.remote, best_device);
21152115
}
21162116
/* Mark the data as under transfer */
21172117
deps->output[k].data.data->data_transfer_status = PARSEC_DATA_STATUS_UNDER_TRANSFER;

0 commit comments

Comments
 (0)