Skip to content

Commit 022e468

Browse files
committed
Transfer data to and from GPU.
This is a multi-part patch that allows the CPU to prepare a data copy mapped onto a device. 1. The first question is how such a device is selected. The allocation of such a copy happens well before the scheduler is invoked for a task, in fact before the task is even ready. Thus, we need to decide on the location of this copy based only on static information, such as the task affinity. Therefore, this approach only works for owner-compute types of tasks, where the task will be executed on the device that owns the data used for the task affinity. 2. Pass the correct data copy across the entire system, instead of falling back to the data copy of device 0 (CPU memory). Signed-off-by: George Bosilca <gbosilca@nvidia.com>
1 parent 1e89cbd commit 022e468

File tree

14 files changed

+361
-98
lines changed

14 files changed

+361
-98
lines changed

parsec/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,7 @@ if( BUILD_PARSEC )
238238
$<$<BOOL:${PARSEC_HAVE_OTF2}>:OTF2::OTF2>
239239
$<$<BOOL:${MPI_C_FOUND}>:MPI::MPI_C>
240240
$<$<BOOL:${PARSEC_HAVE_CUDA}>:CUDA::cudart>
241+
$<$<BOOL:${PARSEC_HAVE_CUDA}>:cuda>
241242
$<$<BOOL:${PARSEC_HAVE_HIP}>:hip::host>
242243
${EXTRA_LIBS}
243244
INTERFACE

parsec/arena.c

Lines changed: 89 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -235,43 +235,108 @@ int parsec_arena_allocate_device_private(parsec_data_copy_t *copy,
235235
return PARSEC_SUCCESS;
236236
}
237237

238-
parsec_data_copy_t *parsec_arena_get_copy(parsec_arena_t *arena,
239-
size_t count, int device,
240-
parsec_datatype_t dtt)
238+
#include "parsec/utils/zone_malloc.h"
239+
#include "mca/device/device_gpu.h"
240+
241+
static inline parsec_data_copy_t *
242+
parsec_arena_internal_copy_new(parsec_arena_t *arena,
243+
parsec_data_t *data,
244+
size_t count, int device,
245+
parsec_datatype_t dtt)
241246
{
242-
parsec_data_t *data;
243-
parsec_data_copy_t *copy;
244-
int rc;
245-
246-
247-
data = parsec_data_new();
247+
parsec_data_copy_t *copy = NULL;
248+
parsec_data_t* ldata = data;
248249
if( NULL == data ) {
250+
ldata = parsec_data_new();
251+
if( NULL == ldata ) {
252+
return NULL;
253+
}
254+
}
255+
if( 0 == device ) {
256+
copy = parsec_data_copy_new(ldata, device, dtt,
257+
PARSEC_DATA_FLAG_PARSEC_OWNED | PARSEC_DATA_FLAG_PARSEC_MANAGED | PARSEC_DATA_FLAG_ARENA);
258+
if (NULL == copy) {
259+
goto free_and_return;
260+
}
261+
int rc = parsec_arena_allocate_device_private(copy, arena, count, device, dtt);
262+
if (PARSEC_SUCCESS != rc) {
263+
goto free_and_return;
264+
}
265+
return copy;
266+
}
267+
/**
268+
* This part is not really nice, it breaks the separation between devices, and how their memory is
269+
* managed. But, it should give nice performance improvements if the communication layer is
270+
* capable of sending or receiving data directly to and from the accelerator memory. The only drawback
271+
* is that once the GPU memory is full, this will fail, so the software will fall back to the
272+
* prior behavior, going through the CPU memory.
273+
*
274+
* The zone deallocation is not symmetric, it will happen in the GPU management, when the data copies
275+
* are released from the different LRU lists.
276+
*/
277+
parsec_device_gpu_module_t *gpu_device = (parsec_device_gpu_module_t *)parsec_mca_device_get(device);
278+
if (NULL == gpu_device) {
249279
return NULL;
250280
}
281+
size_t size = count * arena->elem_size;
282+
void* device_private = zone_malloc(gpu_device->memory, size);
283+
if( NULL == device_private ) {
284+
PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Arena:\tallocate data copy on device %d of size %zu from zone %p failed (out of memory)\n",
285+
device, size, (void *)copy->arena_chunk);
286+
goto free_and_return;
287+
}
288+
copy = parsec_data_copy_new(ldata, device, dtt,
289+
PARSEC_DATA_FLAG_PARSEC_OWNED | PARSEC_DATA_FLAG_PARSEC_MANAGED);
290+
if (NULL == copy) {
291+
PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Arena:\tallocate data copy on device %d of size %zu from zone %p failed to allocate copy (out of memory)\n",
292+
device, size, (void *)copy->arena_chunk);
293+
zone_free(gpu_device->memory, device_private);
294+
goto free_and_return;
295+
}
296+
copy->dtt = dtt;
297+
copy->device_private = device_private;
298+
copy->arena_chunk = (parsec_arena_chunk_t*)gpu_device->memory;
299+
PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Arena:\tallocate data copy on device %d of size %zu from zone %p, "
300+
"data ptr %p",
301+
device, size, (void*)copy->arena_chunk, (void*)copy->device_private);
302+
copy->version = 0;
303+
copy->coherency_state = PARSEC_DATA_COHERENCY_INVALID;
304+
copy->original->owner_device = device;
305+
copy->original->preferred_device = device;
306+
return copy;
307+
free_and_return:
308+
if( NULL != copy )
309+
PARSEC_OBJ_RELEASE(copy);
310+
if( NULL == data)
311+
PARSEC_OBJ_RELEASE(ldata); /* release the locally allocated data */
312+
return NULL;
313+
}
251314

252-
copy = parsec_data_copy_new( data, device, dtt,
253-
PARSEC_DATA_FLAG_ARENA |
254-
PARSEC_DATA_FLAG_PARSEC_OWNED |
255-
PARSEC_DATA_FLAG_PARSEC_MANAGED);
315+
parsec_data_copy_t *parsec_arena_get_copy(parsec_arena_t *arena,
316+
size_t count, int device,
317+
parsec_datatype_t dtt)
318+
{
319+
parsec_data_copy_t *dev0_copy, *copy;
256320

257-
if(NULL == copy) {
258-
PARSEC_OBJ_RELEASE(data);
321+
dev0_copy = parsec_arena_internal_copy_new(arena, NULL, count, 0 /* first allocate the copy on the device 0 */, dtt);
322+
if( NULL == dev0_copy ) {
259323
return NULL;
260324
}
325+
dev0_copy->coherency_state = PARSEC_DATA_COHERENCY_INVALID;
326+
dev0_copy->version = 0; /* start from somewhere */
327+
if( 0 == device ) {
328+
return dev0_copy;
329+
}
261330

262-
rc = parsec_arena_allocate_device_private(copy, arena, count, device, dtt);
263-
331+
copy = parsec_arena_internal_copy_new(arena, dev0_copy->original, count, device, dtt);
332+
if( NULL == copy ) {
333+
copy = dev0_copy; /* return the main memory data copy */
334+
}
264335
/* This data is going to be released once all copies are released
265336
* It does not exist without at least a copy, and we don't give the
266337
* pointer to the user, so we must remove our retain from it
267338
*/
268-
PARSEC_OBJ_RELEASE(data);
269-
270-
if( PARSEC_SUCCESS != rc ) {
271-
PARSEC_OBJ_RELEASE(copy);
272-
return NULL;
273-
}
274-
339+
PARSEC_OBJ_RELEASE(dev0_copy->original);
275340
return copy;
276341
}
277342

parsec/data.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -330,6 +330,12 @@ int parsec_data_start_transfer_ownership_to_copy(parsec_data_t* data,
330330
copy = data->device_copies[device];
331331
assert( NULL != copy );
332332

333+
if( valid_copy == device ) {
334+
PARSEC_DEBUG_VERBOSE(10, parsec_debug_output,
335+
"DEV[%d]: already has ownership of data %p to copy %p in mode %d",
336+
device, data, copy, access_mode);
337+
goto bookkeeping;
338+
}
333339
PARSEC_DEBUG_VERBOSE(10, parsec_debug_output,
334340
"DEV[%d]: start transfer ownership of data %p to copy %p in mode %d",
335341
device, data, copy, access_mode);
@@ -417,6 +423,7 @@ int parsec_data_start_transfer_ownership_to_copy(parsec_data_t* data,
417423
}
418424
}
419425

426+
bookkeeping:
420427
if( PARSEC_FLOW_ACCESS_READ & access_mode ) {
421428
copy->readers++;
422429
}

parsec/data.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,9 @@ typedef uint8_t parsec_data_coherency_t;
3131
#define PARSEC_DATA_COHERENCY_SHARED ((parsec_data_coherency_t)0x4)
3232

3333
typedef uint8_t parsec_data_status_t;
34-
#define PARSEC_DATA_STATUS_NOT_TRANSFER ((parsec_data_coherency_t)0x0)
35-
#define PARSEC_DATA_STATUS_UNDER_TRANSFER ((parsec_data_coherency_t)0x1)
36-
#define PARSEC_DATA_STATUS_COMPLETE_TRANSFER ((parsec_data_coherency_t)0x2)
34+
#define PARSEC_DATA_STATUS_NOT_TRANSFER ((parsec_data_status_t)0x0)
35+
#define PARSEC_DATA_STATUS_UNDER_TRANSFER ((parsec_data_status_t)0x1)
36+
#define PARSEC_DATA_STATUS_COMPLETE_TRANSFER ((parsec_data_status_t)0x2)
3737
/**
3838
* Data copies have three levels of 'ownership':
3939
* - a data copy can be owned and managed by PaRSEC.

parsec/data_dist/matrix/two_dim_rectangle_cyclic.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,6 @@ typedef struct parsec_matrix_block_cyclic {
4545
* @param dc matrix description structure, already allocated, that will be initialize
4646
* @param mtype type of data used for this matrix
4747
* @param storage type of storage of data
48-
* @param nodes number of nodes
4948
* @param myrank rank of the local node (as of mpi rank)
5049
* @param mb number of row in a tile
5150
* @param nb number of column in a tile

parsec/interfaces/ptg/ptg-compiler/jdf2c.c

Lines changed: 31 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -672,6 +672,17 @@ static char* dump_local_assignments( void** elem, void* arg )
672672
if( dos > 0 ) {
673673
string_arena_init(info->sa);
674674
string_arena_add_string(info->sa, "const int %s = %s%s.value;", def->name, info->holder, def->name);
675+
#if 0
676+
jdf_expr_t* type_str = jdf_find_property( def->properties, "type", NULL );
677+
if( NULL == type_str ) {
678+
string_arena_add_string(info->sa, "const int %s = %s%s.value;", def->name, info->holder, def->name);
679+
} else {
680+
expr_info_t expr_info = {.sa = info->sa, .prefix = "", .suffix = "", .assignments = "locals"};
681+
string_arena_add_string(info->sa, "const %s %s = %s%s.value;",
682+
dump_expr((void**)type_str, &expr_info),
683+
def->name, info->holder, def->name);
684+
}
685+
#endif
675686
if( dos > 1 )
676687
string_arena_add_string(info->sa, " (void)%s;", def->name);
677688
return string_arena_get_string(info->sa);
@@ -5649,12 +5660,19 @@ jdf_generate_code_call_initialization(const jdf_t *jdf, const jdf_call_t *call,
56495660

56505661
/* Code to create & fulfill a reshape promise locally in case this input dependency is typed */
56515662
jdf_generate_code_reshape_input_from_dep(jdf, f, flow, dl, spaces);
5652-
coutput("%s this_task->data._f_%s.data_out = parsec_data_get_copy(chunk->original, target_device);\n"
5653-
"#if defined(PARSEC_PROF_GRAPHER) && defined(PARSEC_PROF_TRACE)\n"
5663+
/* TODO: Setting the data_out here is kind of random, especially as some copy of the input flow. The only thing
5664+
* that would make sense here is to set the data_out to the dep outputs back into the user memory (output
5665+
* dep with a target into a data collection), to give the opportunity to the accelerator components to
5666+
* do a pushout to the desired location (instead of the current approach that will do a pushout to the
5667+
* data_copy on device 0 followed by a memcpy into the desired location).
5668+
*/
5669+
//coutput("%s this_task->data._f_%s.data_out = parsec_data_get_copy(chunk->original, target_device);\n",
5670+
// spaces, flow->varname);
5671+
5672+
coutput("#if defined(PARSEC_PROF_GRAPHER) && defined(PARSEC_PROF_TRACE)\n"
56545673
"%s parsec_prof_grapher_data_input(chunk->original, (parsec_task_t*)this_task, &%s, 0);\n"
56555674
"#endif\n"
56565675
"%s }\n",
5657-
spaces, flow->varname,
56585676
spaces, JDF_OBJECT_ONAME( flow ),
56595677
spaces);
56605678
}
@@ -6512,10 +6530,10 @@ jdf_generate_code_data_lookup(const jdf_t *jdf,
65126530
* This way, it's only retained once during release_deps.
65136531
*/
65146532
coutput(" if( NULL == this_task->repo_entry ){\n"
6515-
" this_task->repo_entry = data_repo_lookup_entry_and_create(es, %s_repo, "
6533+
" this_task->repo_entry = data_repo_lookup_entry_and_create(es, %s_repo, \n"
65166534
" %s((const parsec_taskpool_t*)__parsec_tp, (const parsec_assignment_t*)&this_task->locals));\n"
6517-
" data_repo_entry_addto_usage_limit(%s_repo, this_task->repo_entry->ht_item.key, 1);"
6518-
" this_task->repo_entry ->generator = (void*)this_task; /* for AYU */\n"
6535+
" data_repo_entry_addto_usage_limit(%s_repo, this_task->repo_entry->ht_item.key, 1);\n"
6536+
" this_task->repo_entry->generator = (void*)this_task; /* for AYU */\n"
65196537
"#if defined(PARSEC_SIM)\n"
65206538
" assert(this_task->repo_entry ->sim_exec_date == 0);\n"
65216539
" this_task->repo_entry ->sim_exec_date = this_task->sim_exec_date;\n"
@@ -6525,7 +6543,7 @@ jdf_generate_code_data_lookup(const jdf_t *jdf,
65256543
jdf_property_get_string(f->properties, JDF_PROP_UD_MAKE_KEY_FN_NAME, NULL),
65266544
f->fname);
65276545

6528-
coutput(" /* The reshape repo is the current task repo. */"
6546+
coutput(" /* The reshape repo is the current task repo. */\n"
65296547
" reshape_repo = %s_repo;\n"
65306548
" reshape_entry_key = %s((const parsec_taskpool_t*)__parsec_tp, (const parsec_assignment_t*)&this_task->locals) ;\n"
65316549
" reshape_entry = this_task->repo_entry;\n",
@@ -7033,6 +7051,12 @@ static void jdf_generate_code_hook(const jdf_t *jdf,
70337051
output = UTIL_DUMP_LIST(sa, f->dataflow, next,
70347052
dump_data_initialization_from_data_array, &ai2, "", "", "", "");
70357053
if( 0 != strlen(output) ) {
7054+
coutput("/* Make sure we have the data_out set to the data_in */\n");
7055+
for( fl = f->dataflow; fl != NULL; fl = fl->next) {
7056+
if( fl->flow_flags & JDF_FLOW_TYPE_CTL ) continue;
7057+
coutput(" this_task->data._f_%s.data_out = this_task->data._f_%s.data_in;\n",
7058+
fl->varname, fl->varname);
7059+
}
70367060
coutput(" /** Declare the variables that will hold the data, and all the accounting for each */\n"
70377061
"%s\n",
70387062
output);

0 commit comments

Comments
 (0)