Skip to content

Commit 2c004b1

Browse files
committed
Add support for batched tasks.
The idea is the following: - task incarnations (aka. BODY) can be marked with the "batch" property, allowing the runtime to provide the task with the entire list of ready tasks of the execution stream instead of just extracting the head. - this list of ready tasks is in fact a ring, which can then be trimmed by the kernel and divided into a batch and the rest. The rest of the tasks will be left in the ring, while the batch group will be submitted for execution. - the kernel also needs to provide a callback into the gpu_task complete_stage, such that the runtime can call the specialized function able to complete all batched tasks. Signed-off-by: George Bosilca <gbosilca@nvidia.com>
1 parent dcba0c0 commit 2c004b1

File tree

20 files changed

+184
-63
lines changed

20 files changed

+184
-63
lines changed

CMakeLists.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,8 @@ option(PARSEC_GPU_ALLOC_PER_TILE
181181
mark_as_advanced(PARSEC_GPU_ALLOC_PER_TILE)
182182
option(PARSEC_GPU_WITH_CUDA
183183
"Enable GPU support using CUDA kernels" ON)
184+
option(PARSEC_GPU_WITH_CUDA_BATCH
185+
"Enable the runtime support for batched kernels" ON)
184186
option(PARSEC_GPU_WITH_HIP
185187
"Enable GPU support using HIP kernels" ON)
186188
option(PARSEC_GPU_WITH_LEVEL_ZERO
@@ -729,6 +731,12 @@ int main(int argc, char *argv[]) {
729731
endif (CUDAToolkit_FOUND)
730732
set(PARSEC_HAVE_CU_COMPILER ${CMAKE_CUDA_COMPILER} CACHE BOOL "True if PaRSEC provide support for compiling .cu files")
731733
endif( PARSEC_GPU_WITH_CUDA )
734+
if( PARSEC_GPU_WITH_CUDA_BATCH )
735+
if( NOT PARSEC_HAVE_CUDA)
736+
message(FATAL_ERROR "PARSEC_GPU_WITH_CUDA_BATCH requires PARSEC_GPU_WITH_CUDA. Enable both or none")
737+
endif( NOT PARSEC_HAVE_CUDA)
738+
set(PARSEC_HAVE_CUDA_BATCH True CACHE BOOL "True if support for batched CUDA has been enabled")
739+
endif( PARSEC_GPU_WITH_CUDA_BATCH )
732740

733741
if( PARSEC_GPU_WITH_HIP )
734742
# This is kinda ugly but the PATH and HINTS don't get transmitted to sub-dependents

cmake_modules/PaRSECConfig.cmake.in

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,10 @@ endif(@PARSEC_DIST_WITH_MPI@)
6565
if(@PARSEC_HAVE_CUDA@)
6666
find_package(CUDAToolkit REQUIRED)
6767
set(PARSEC_HAVE_CUDA TRUE)
68+
69+
if(@PARSEC_HAVE_CUDA_BATCH@)
70+
set(PARSEC_HAVE_CUDA_BATCH TRUE)
71+
endif(@PARSEC_HAVE_CUDA_BATCH@)
6872
endif(@PARSEC_HAVE_CUDA@)
6973

7074
if(@PARSEC_HAVE_HIP@)

parsec/include/parsec/parsec_options.h.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@
130130
#cmakedefine PARSEC_HAVE_DEV_CPU_SUPPORT
131131
#cmakedefine PARSEC_HAVE_DEV_RECURSIVE_SUPPORT
132132
#cmakedefine PARSEC_HAVE_DEV_CUDA_SUPPORT
133+
#cmakedefine PARSEC_HAVE_DEV_CUDA_BATCH_SUPPORT
133134
#cmakedefine PARSEC_HAVE_DEV_HIP_SUPPORT
134135
#cmakedefine PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT
135136
#cmakedefine PARSEC_HAVE_DEV_OPENCL_SUPPORT

parsec/interfaces/dtd/insert_function.c

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1477,7 +1477,7 @@ parsec_dtd_startup(parsec_context_t *context,
14771477
if( !(tp->devices_index_mask & (1 << device->device_index))) continue; /* not supported */
14781478
// If CUDA is enabled, let the CUDA device activated for this
14791479
// taskpool.
1480-
if( PARSEC_DEV_CUDA == device->type ) continue;
1480+
if( PARSEC_DEV_CUDA & device->type ) continue;
14811481
if( NULL != device->taskpool_register )
14821482
if( PARSEC_SUCCESS !=
14831483
device->taskpool_register(device, (parsec_taskpool_t *)tp)) {
@@ -2355,8 +2355,8 @@ int parsec_dtd_task_class_add_chore(parsec_taskpool_t *tp,
23552355
/* We assume that incarnations is big enough, because it has been pre-allocated
23562356
* with PARSEC_DEV_MAX_NB_TYPE+1 chores, as this is a DTD task class */
23572357
incarnations = (__parsec_chore_t*)dtd_tc->super.incarnations;
2358-
for(i = 0; i < PARSEC_DEV_MAX_NB_TYPE && incarnations[i].type != PARSEC_DEV_NONE; i++) {
2359-
if( incarnations[i].type == device_type ) {
2358+
for(i = 0; i < PARSEC_DEV_MAX_NB_TYPE && (incarnations[i].type & PARSEC_DEV_ANY_TYPE) != PARSEC_DEV_NONE; i++) {
2359+
if( incarnations[i].type & PARSEC_DEV_ANY_TYPE & device_type ) {
23602360
parsec_warning("A chore for this device type has already been added to task class '%s'\n",
23612361
tc->name);
23622362
return PARSEC_ERROR;
@@ -2369,7 +2369,7 @@ int parsec_dtd_task_class_add_chore(parsec_taskpool_t *tp,
23692369
}
23702370

23712371
incarnations[i].type = device_type;
2372-
if(PARSEC_DEV_CUDA == device_type) {
2372+
if(PARSEC_DEV_CUDA & device_type) {
23732373
incarnations[i].hook = parsec_dtd_gpu_task_submit;
23742374
dtd_tc->gpu_func_ptr = (parsec_advance_task_function_t)function;
23752375
}
@@ -3258,19 +3258,20 @@ __parsec_dtd_taskpool_create_task(parsec_taskpool_t *tp,
32583258
dtd_tc = parsec_dtd_create_task_classv(name_of_kernel, nb_params, params);
32593259
tc = &dtd_tc->super;
32603260

3261-
__parsec_chore_t **incarnations = (__parsec_chore_t **)&tc->incarnations;
3262-
(*incarnations)[0].type = device_type;
3263-
if( device_type == PARSEC_DEV_CUDA ) {
3261+
__parsec_chore_t *incarnations = (__parsec_chore_t *)tc->incarnations;
3262+
incarnations[0].type = device_type;
3263+
if( device_type & PARSEC_DEV_CUDA ) {
32643264
/* Special case for CUDA: we need an intermediate */
3265-
(*incarnations)[0].hook = parsec_dtd_gpu_task_submit;
3265+
incarnations[0].hook = parsec_dtd_gpu_task_submit;
32663266
dtd_tc->gpu_func_ptr = (parsec_advance_task_function_t)fpointer;
32673267
}
32683268
else {
32693269
/* Default case: the user-provided function is directly the hook to call */
3270-
(*incarnations)[0].hook = fpointer; // We can directly call the CPU hook
3270+
incarnations[0].hook = fpointer; // We can directly call the CPU hook
32713271
dtd_tc->cpu_func_ptr = fpointer;
32723272
}
3273-
(*incarnations)[1].type = PARSEC_DEV_NONE;
3273+
incarnations[1].type = PARSEC_DEV_NONE;
3274+
incarnations[1].hook = NULL;
32743275

32753276
/* Bookkeeping of the task class */
32763277
parsec_dtd_register_task_class(&dtd_tp->super, fkey, tc);

parsec/interfaces/ptg/ptg-compiler/jdf2c.c

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3938,25 +3938,40 @@ jdf_generate_function_incarnation_list( const jdf_t *jdf,
39383938
jdf_def_list_t* dyld_property;
39393939
jdf_def_list_t* evaluate_property = NULL;
39403940
jdf_def_list_t* device_property = NULL;
3941+
jdf_def_list_t* batch_property = NULL;
39413942

39423943
(void)jdf;
39433944
string_arena_add_string(sa, "static const __parsec_chore_t __%s_chores[] ={\n", base_name);
39443945
do {
39453946
jdf_find_property(body->properties, "type", &type_property);
39463947
jdf_find_property(body->properties, "dyld", &dyld_property);
39473948
jdf_find_property(body->properties, JDF_BODY_PROP_EVALUATE, &evaluate_property);
3948-
if( NULL == type_property) {
3949+
jdf_find_property(body->properties, "batch", &batch_property);
3950+
if (NULL == type_property)
3951+
{
39493952
string_arena_add_string(sa, "#if defined(PARSEC_HAVE_DEV_CPU_SUPPORT)\n");
39503953
string_arena_add_string(sa, " { .type = PARSEC_DEV_CPU,\n");
39513954
string_arena_add_string(sa, " .evaluate = (parsec_evaluate_function_t*)%s,\n",
39523955
(NULL == evaluate_property) ? "NULL" : evaluate_property->expr->jdf_c_code.fname);
39533956
string_arena_add_string(sa, " .hook = (parsec_hook_t*)hook_of_%s },\n", base_name);
39543957
string_arena_add_string(sa, "#endif /* defined(PARSEC_HAVE_DEV_CPU_SUPPORT) */\n");
3955-
} else {
3958+
if( NULL != batch_property ) {
3959+
fprintf(stderr,
3960+
"Error: batched property (%s) not allowed for devices other than accelerators in body of task %s at line %d\n",
3961+
batch_property->expr->jdf_var, f->fname, JDF_OBJECT_LINENO(body));
3962+
assert( NULL != batch_property );
3963+
}
3964+
}
3965+
else
3966+
{
39563967
char* dev_upper = strdup_upper(type_property->expr->jdf_var);
39573968

39583969
string_arena_add_string(sa, "#if defined(PARSEC_HAVE_DEV_%s_SUPPORT)\n", dev_upper);
3959-
string_arena_add_string(sa, " { .type = PARSEC_DEV_%s,\n", dev_upper);
3970+
string_arena_add_string(sa, " { .type = PARSEC_DEV_%s", dev_upper);
3971+
if( NULL != batch_property) {
3972+
string_arena_add_string(sa, " | PARSEC_DEV_CHORE_ALLOW_BATCH");
3973+
}
3974+
string_arena_add_string(sa, ",\n");
39603975
if( NULL == dyld_property ) {
39613976
string_arena_add_string(sa, " .dyld = NULL,\n");
39623977
} else {
@@ -4491,7 +4506,7 @@ static void jdf_generate_startup_hook( const jdf_t *jdf )
44914506
" parsec_task_class_t* tc = (parsec_task_class_t*)__parsec_tp->super.super.task_classes_array[i];\n"
44924507
" __parsec_chore_t* chores = (__parsec_chore_t*)tc->incarnations;\n"
44934508
" uint32_t idx = 0, j;\n"
4494-
" for( j = 0; PARSEC_DEV_NONE != chores[j].type; j++ ) {\n"
4509+
" for( j = 0; PARSEC_DEV_NONE != (chores[j].type & PARSEC_DEV_ANY_TYPE); j++ ) {\n"
44954510
" if( !(supported_dev & chores[j].type) ) continue;\n"
44964511
" if( j != idx ) {\n"
44974512
" chores[idx] = chores[j];\n"
@@ -4680,7 +4695,7 @@ static void jdf_generate_constructor( const jdf_t* jdf )
46804695
coutput(" for( i = 0; i < __parsec_tp->super.super.nb_task_classes; i++ ) {\n"
46814696
" __parsec_tp->super.super.task_classes_array[i] = tc = malloc(sizeof(parsec_task_class_t));\n"
46824697
" memcpy(tc, %s_task_classes[i], sizeof(parsec_task_class_t));\n"
4683-
" for( j = 0; PARSEC_DEV_NONE != tc->incarnations[j].type; j++); /* compute the number of incarnations */\n"
4698+
" for( j = 0; PARSEC_DEV_NONE != (tc->incarnations[j].type & PARSEC_DEV_ANY_TYPE); j++); /* compute the number of incarnations */\n"
46844699
" tc->incarnations = (__parsec_chore_t*)malloc((j+1) * sizeof(__parsec_chore_t));\n "
46854700
" memcpy((__parsec_chore_t*)tc->incarnations, %s_task_classes[i]->incarnations, (j+1) * sizeof(__parsec_chore_t));\n\n"
46864701
" /* Add a placeholder for initialization and startup task */\n"
@@ -6731,8 +6746,8 @@ static void jdf_generate_code_hook_gpu(const jdf_t *jdf,
67316746
coutput(" /* Pointer to dynamic gpu function */\n"
67326747
" {\n"
67336748
" int chore_idx = 0;\n"
6734-
" for ( ; PARSEC_DEV_NONE != this_task->task_class->incarnations[chore_idx].type; ++chore_idx) {\n"
6735-
" if (this_task->task_class->incarnations[chore_idx].type == PARSEC_DEV_%s) break;\n"
6749+
" for ( ; PARSEC_DEV_NONE != (this_task->task_class->incarnations[chore_idx].type & PARSEC_DEV_ANY_TYPE); ++chore_idx) {\n"
6750+
" if (this_task->task_class->incarnations[chore_idx].type & PARSEC_DEV_%s) break;\n"
67366751
" }\n"
67376752
" /* The void* cast prevents the compiler from complaining about the type change */\n"
67386753
" parsec_body.dyld_fn = (%s)(void*)this_task->task_class->incarnations[chore_idx].dyld_fn;\n"
@@ -6983,7 +6998,7 @@ static void jdf_generate_code_hook(const jdf_t *jdf,
69836998
coutput("#if defined(PARSEC_HAVE_DEV_%s_SUPPORT)\n", type_upper);
69846999
if( NULL != type_property) {
69857000

6986-
if (!strcasecmp(type_property->expr->jdf_var, "cuda")
7001+
if (!strncasecmp(type_property->expr->jdf_var, "cuda", 4) /* for batched */
69877002
|| !strcasecmp(type_property->expr->jdf_var, "hip")) {
69887003
jdf_generate_code_hook_gpu(jdf, f, body, name);
69897004
goto hook_end_block;

parsec/mca/device/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@ set(PARSEC_HAVE_DEV_RECURSIVE_SUPPORT 0 CACHE BOOL "PaRSEC has support for Recu
1414
if(PARSEC_HAVE_CUDA)
1515
set(PARSEC_HAVE_DEV_CUDA_SUPPORT 1 CACHE BOOL "PaRSEC support for CUDA")
1616
endif(PARSEC_HAVE_CUDA)
17+
if(PARSEC_HAVE_CUDA_BATCH)
18+
set(PARSEC_HAVE_DEV_CUDA_BATCH_SUPPORT 1 CACHE BOOL "PaRSEC support for batched CUDA")
19+
endif(PARSEC_HAVE_CUDA_BATCH)
1720
if(PARSEC_HAVE_HIP)
1821
set(PARSEC_HAVE_DEV_HIP_SUPPORT 1 CACHE BOOL "PaRSEC support for HIP")
1922
endif(PARSEC_HAVE_HIP)

parsec/mca/device/cuda/device_cuda_component.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -276,7 +276,7 @@ static int device_cuda_component_close(void)
276276
/* Check that no CUDA devices are still registered with PaRSEC */
277277
for(i = 0; i < parsec_mca_device_enabled(); i++) {
278278
if( NULL == (cdev = (parsec_device_cuda_module_t*)parsec_mca_device_get(i)) ) continue;
279-
if(PARSEC_DEV_CUDA != cdev->super.super.type) continue;
279+
if(PARSEC_DEV_CUDA & cdev->super.super.type) continue;
280280

281281
PARSEC_DEBUG_VERBOSE(0, parsec_gpu_output_stream,
282282
"GPU[%d:%s] CUDA device %d still registered with PaRSEC at the end of CUDA finalize.\n"

parsec/mca/device/device.c

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ int parsec_select_best_device( parsec_task_t* this_task ) {
107107

108108
/* Run the evaluates for the incarnation types to determine if they can
109109
* execute this task */
110-
for(chore_id = 0; PARSEC_DEV_NONE != tc->incarnations[chore_id].type; chore_id++) {
110+
for(chore_id = 0; PARSEC_DEV_NONE != (tc->incarnations[chore_id].type & PARSEC_DEV_ANY_TYPE); chore_id++) {
111111
if( 0 == (this_task->chore_mask & (1<<chore_id)) ) continue;
112112
if( NULL == tc->incarnations[chore_id].hook ) continue; /* dyld hook not found during initialization */
113113

@@ -116,15 +116,15 @@ int parsec_select_best_device( parsec_task_t* this_task ) {
116116
if( PARSEC_HOOK_RETURN_DONE != rc ) {
117117
if( PARSEC_HOOK_RETURN_NEXT != rc ) {
118118
PARSEC_DEBUG_VERBOSE(5, parsec_device_output, "Failed to evaluate %s[%d] chore %d",
119-
tmp, tc->incarnations[chore_id].type,
119+
tmp, tc->incarnations[chore_id].type & PARSEC_DEV_ANY_TYPE,
120120
chore_id);
121121
}
122122
/* Mark this chore as tested */
123123
this_task->chore_mask &= ~( 1<<chore_id );
124124
continue;
125125
}
126126
}
127-
valid_types |= tc->incarnations[chore_id].type; /* the eval accepted the type, but no device specified yet */
127+
valid_types |= (tc->incarnations[chore_id].type & PARSEC_DEV_ANY_TYPE); /* the eval accepted the type, but no device specified yet */
128128
/* Evaluate may have picked a device, abide by it */
129129
if( NULL != this_task->selected_device ) {
130130
assert( this_task->selected_device->type & valid_types );
@@ -140,7 +140,7 @@ int parsec_select_best_device( parsec_task_t* this_task ) {
140140
if (PARSEC_DEV_CPU == valid_types) { /* shortcut for CPU only tasks */
141141
this_task->selected_device = dev = parsec_mca_device_get(0);
142142
this_task->load = 0;
143-
for(chore_id = 0; tc->incarnations[chore_id].type != PARSEC_DEV_CPU; chore_id++);
143+
for(chore_id = 0; !(tc->incarnations[chore_id].type & PARSEC_DEV_CPU); chore_id++);
144144
this_task->selected_chore = chore_id;
145145
PARSEC_DEBUG_VERBOSE(80, parsec_device_output, "%s: Task %s cpu-only task set selected_device %d:%s",
146146
__func__, tmp, dev->device_index, dev->name);
@@ -226,7 +226,7 @@ int parsec_select_best_device( parsec_task_t* this_task ) {
226226
/* Skip the device if no incarnations for its type */
227227
if(!(dev->type & valid_types)) continue;
228228
/* Skip recursive devices: time estimates are computed on the associated CPU device */
229-
if(dev->type == PARSEC_DEV_RECURSIVE) continue;
229+
if(dev->type & PARSEC_DEV_RECURSIVE) continue;
230230

231231
eta = dev->device_load + time_estimate(this_task, dev);
232232
if( best_eta > eta ) {
@@ -244,14 +244,14 @@ int parsec_select_best_device( parsec_task_t* this_task ) {
244244
goto no_valid_device;
245245

246246
this_task->selected_device = parsec_mca_device_get(best_index);
247-
assert( this_task->selected_device->type != PARSEC_DEV_RECURSIVE );
247+
assert( !(this_task->selected_device->type & PARSEC_DEV_RECURSIVE) );
248248
}
249249

250250
device_selected:
251251
dev = this_task->selected_device;
252252
assert( NULL != dev );
253253
assert( tp->devices_index_mask & (1 << dev->device_index) );
254-
for(chore_id = 0; tc->incarnations[chore_id].type != dev->type; chore_id++)
254+
for(chore_id = 0; !(tc->incarnations[chore_id].type & dev->type); chore_id++)
255255
assert(PARSEC_DEV_NONE != tc->incarnations[chore_id].type /* we have selected this device, so there *must* be an incarnation that matches */);
256256
this_task->selected_chore = chore_id;
257257
this_task->load = time_estimate(this_task, dev);
@@ -748,8 +748,8 @@ int parsec_mca_device_registration_complete(parsec_context_t* context)
748748
for( uint32_t i = 0; i < parsec_nb_devices; i++ ) {
749749
parsec_device_module_t* device = parsec_devices[i];
750750
if( NULL == device ) continue;
751-
if( PARSEC_DEV_RECURSIVE == device->type ) continue;
752-
if( PARSEC_DEV_CPU == device->type ) {
751+
if( PARSEC_DEV_RECURSIVE & device->type ) continue;
752+
if( PARSEC_DEV_CPU & device->type ) {
753753
c = 0;
754754
for(int p = 0; p < context->nb_vp; p++)
755755
c += context->virtual_processes[p]->nb_cores;
@@ -768,7 +768,7 @@ int parsec_mca_device_registration_complete(parsec_context_t* context)
768768
for( uint32_t i = 0; i < parsec_nb_devices; i++ ) {
769769
parsec_device_module_t* device = parsec_devices[i];
770770
if( NULL == device ) continue;
771-
if( PARSEC_DEV_RECURSIVE == device->type ) continue;
771+
if( PARSEC_DEV_RECURSIVE & device->type ) continue;
772772
device->time_estimate_default = total_gflops_fp64/(double)device->gflops_fp64;
773773
parsec_debug_verbose(6, parsec_device_output, " Dev[%d] default-time-estimate %-4"PRId64" <- double %-8"PRId64" single %-8"PRId64" tensor %-8"PRId64" half %-8"PRId64" %s",
774774
i, device->time_estimate_default, device->gflops_fp64, device->gflops_fp32, device->gflops_tf32, device->gflops_fp16, device->gflops_guess? "GUESSED": "");
@@ -933,7 +933,7 @@ device_taskpool_register_static(parsec_device_module_t* device, parsec_taskpool_
933933
continue;
934934
__parsec_chore_t* chores = (__parsec_chore_t*)tc->incarnations;
935935
for( j = 0; NULL != chores[j].hook; j++ ) {
936-
if( chores[j].type != device->type )
936+
if( !(chores[j].type & device->type) )
937937
continue;
938938
if( NULL != chores[j].dyld_fn ) {
939939
continue; /* the function has been set for another device of the same type */

parsec/mca/device/device.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,10 +65,13 @@ typedef struct parsec_device_base_component_2_0_0 parsec_device_base_component_t
6565
#define PARSEC_DEV_CUDA ((uint8_t)(1 << 2))
6666
#define PARSEC_DEV_HIP ((uint8_t)(1 << 3))
6767
#define PARSEC_DEV_LEVEL_ZERO ((uint8_t)(1 << 4))
68+
#define PARSEC_DEV_CUDA_BATCH ((uint8_t)(1 << 5))
6869
#define PARSEC_DEV_TEMPLATE ((uint8_t)(1 << 7))
6970
#define PARSEC_DEV_ANY_TYPE ((uint8_t) 0x3f)
7071
#define PARSEC_DEV_ALL ((uint8_t) 0x3f)
7172
#define PARSEC_DEV_MAX_NB_TYPE (7)
73+
/* The following flags are extensions to the device type */
74+
#define PARSEC_DEV_CHORE_ALLOW_BATCH ((uint32_t)0x00000100)
7275

7376
#define PARSEC_DEV_GPU_MASK (PARSEC_DEV_CUDA|PARSEC_DEV_HIP|PARSEC_DEV_LEVEL_ZERO)
7477
#define PARSEC_DEV_IS_GPU(t) (0 != ((t) & PARSEC_DEV_GPU_MASK))

0 commit comments

Comments
 (0)