Skip to content

Commit e3a7829

Browse files
committed
Decouple device flow count from parameters
Make PARSEC_MAX_DEVICE_FLOWS configurable and select a proper integer type, up to int128_t. Make sure the flow mask is properly checked. Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
1 parent a9ab33d commit e3a7829

7 files changed

Lines changed: 62 additions & 40 deletions

File tree

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@ set(PARSEC_MAX_LOCAL_COUNT 20 CACHE STRING "Number of local variables for tasks
151151
set(PARSEC_MAX_PARAM_COUNT 20 CACHE STRING "Number of parameters for tasks (default 20)")
152152
set(PARSEC_MAX_DEP_IN_COUNT 10 CACHE STRING "Number of input flows for each task (default 10)")
153153
set(PARSEC_MAX_DEP_OUT_COUNT 10 CACHE STRING "Number of output flows for each task (default 10)")
154+
set(PARSEC_MAX_DEVICE_FLOWS ${PARSEC_MAX_PARAM_COUNT} CACHE STRING "Number of flows handled by device tasks (default: same as PARSEC_MAX_PARAM_COUNT)")
154155

155156
### PaRSEC PP options
156157
set(PARSEC_PTGPP_FLAGS "--noline" CACHE STRING "Additional parsec-ptgpp precompiling flags (separate flags with ';')" )

parsec/include/parsec/parsec_config_bottom.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,24 @@ typedef int32_t parsec_dependency_t;
160160
typedef int32_t parsec_dependency_t;
161161
#endif
162162

163+
#if ((MAX_PARAM_COUNT <= 16) && (PARSEC_MAX_DEVICE_FLOWS <= 16))
164+
typedef uint16_t parsec_flow_mask_t;
165+
#elif ((MAX_PARAM_COUNT <= 32) && (PARSEC_MAX_DEVICE_FLOWS <= 32))
166+
typedef uint32_t parsec_flow_mask_t;
167+
#elif ((MAX_PARAM_COUNT <= 64) && (PARSEC_MAX_DEVICE_FLOWS <= 64))
168+
typedef uint64_t parsec_flow_mask_t;
169+
#elif ((MAX_PARAM_COUNT <= 128) && (PARSEC_MAX_DEVICE_FLOWS <= 128)) && defined(PARSEC_HAVE_INT128)
170+
typedef __int128_t parsec_flow_mask_t;
171+
#else
172+
#error Failed to find proper type for PaRSEC flow mask type. \
173+
Make sure MAX_PARAM_COUNT and PARSEC_MAX_DEVICE_FLOWS \
174+
are at most 128, or 64 if 128-bit integers are not supported.
175+
#endif
176+
177+
#define PARSEC_FLOW_MASK(_id) (((parsec_flow_mask_t)1) << (_id))
178+
#define PARSEC_CHECK_FLOW_MASK(_mask, _id) (!!((_mask) & PARSEC_FLOW_MASK(_id)))
179+
180+
163181
/*
164182
* A set of constants defining the capabilities of the underlying
165183
* runtime.

parsec/include/parsec/parsec_options.h.in

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,9 @@
152152
/* The max number of output dependencies (not flows) for each task */
153153
#define MAX_DEP_OUT_COUNT @PARSEC_MAX_DEP_OUT_COUNT@
154154

155+
/* The max number of flows handled by device tasks */
156+
#define PARSEC_MAX_DEVICE_FLOWS @PARSEC_MAX_DEVICE_FLOWS@
157+
155158
#include "parsec/parsec_config_bottom.h"
156159

157160
#endif /* PARSEC_CONFIG_H_HAS_BEEN_INCLUDED */

parsec/mca/device/device_gpu.c

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -841,7 +841,7 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device,
841841
parsec_gpu_task_t *gpu_task )
842842
{
843843
parsec_task_t *this_task = gpu_task->ec;
844-
parsec_gpu_data_copy_t* temp_loc[MAX_PARAM_COUNT], *gpu_elem, *lru_gpu_elem;
844+
parsec_gpu_data_copy_t* temp_loc[PARSEC_MAX_DEVICE_FLOWS], *gpu_elem, *lru_gpu_elem;
845845
parsec_data_t* master, *oldmaster;
846846
const parsec_flow_t *flow;
847847
int i, j, data_avail_epoch = 0, copy_readers_update = 0;
@@ -1163,7 +1163,7 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device,
11631163
*/
11641164
int
11651165
parsec_default_gpu_stage_in(parsec_gpu_task_t *gtask,
1166-
uint32_t flow_mask,
1166+
parsec_flow_mask_t flow_mask,
11671167
parsec_gpu_exec_stream_t *gpu_stream)
11681168
{
11691169
int ret;
@@ -1176,7 +1176,7 @@ parsec_default_gpu_stage_in(parsec_gpu_task_t *gtask,
11761176
parsec_device_transfer_direction_t dir;
11771177

11781178
for(int i = 0; i < task->task_class->nb_flows; i++) {
1179-
if( !(flow_mask & (1U << i)) ) continue;
1179+
if( !PARSEC_CHECK_FLOW_MASK(flow_mask, i) ) continue;
11801180
source = gtask->sources[i];
11811181
dest = task->data[i].data_out;
11821182
src_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(source->device_index);
@@ -1213,7 +1213,7 @@ parsec_default_gpu_stage_in(parsec_gpu_task_t *gtask,
12131213
*/
12141214
int
12151215
parsec_default_gpu_stage_out(parsec_gpu_task_t *gtask,
1216-
uint32_t flow_mask,
1216+
parsec_flow_mask_t flow_mask,
12171217
parsec_gpu_exec_stream_t *gpu_stream)
12181218
{
12191219
int ret;
@@ -1225,7 +1225,7 @@ parsec_default_gpu_stage_out(parsec_gpu_task_t *gtask,
12251225
parsec_device_transfer_direction_t dir;
12261226
int i;
12271227
for(i = 0; i < task->task_class->nb_flows; i++){
1228-
if(flow_mask & (1U << i)){
1228+
if( PARSEC_CHECK_FLOW_MASK(flow_mask, i) ){
12291229
source = task->data[i].data_out;
12301230
dest = source->original->device_copies[0];
12311231
dst_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(dest->device_index);
@@ -1497,7 +1497,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device,
14971497
#endif
14981498
gpu_task->sources[flow->flow_index] = candidate; /* save the candidate for release on transfer completion */
14991499
/* Push data into the GPU from the source device */
1500-
int rc = gpu_task->stage_in ? gpu_task->stage_in(gpu_task, (1U << flow->flow_index), gpu_stream): PARSEC_SUCCESS;
1500+
int rc = gpu_task->stage_in ? gpu_task->stage_in(gpu_task, PARSEC_FLOW_MASK(flow->flow_index), gpu_stream): PARSEC_SUCCESS;
15011501
if(PARSEC_SUCCESS != rc) {
15021502
parsec_warning( "GPU[%d:%s]: gpu_task->stage_in to device rc=%d @%s:%d\n"
15031503
"\t<<%p on device %d:%s>> -> <<%p on device %d:%s>> [%zu, %s]",
@@ -2117,7 +2117,7 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device,
21172117
/* If the gpu copy is not owned by parsec, we don't manage it at all */
21182118
if( 0 == (gpu_copy->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ) continue;
21192119
original = gpu_copy->original;
2120-
rc = gpu_task->stage_out? gpu_task->stage_out(gpu_task, (1U << i), gpu_stream): PARSEC_SUCCESS;
2120+
rc = gpu_task->stage_out? gpu_task->stage_out(gpu_task, PARSEC_FLOW_MASK(i), gpu_stream): PARSEC_SUCCESS;
21212121
if(PARSEC_SUCCESS != rc) {
21222122
parsec_warning( "GPU[%d:%s]: gpu_task->stage_out from device rc=%d @%s:%d\n"
21232123
"\tdata %s <<%p>> -> <<%p>>\n",
@@ -2206,7 +2206,7 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device,
22062206
assert( ((parsec_list_item_t*)gpu_copy)->list_prev == (parsec_list_item_t*)gpu_copy );
22072207

22082208
assert( PARSEC_DATA_COHERENCY_OWNED == gpu_copy->coherency_state );
2209-
if( gpu_task->pushout & (1 << i) ) {
2209+
if( PARSEC_CHECK_FLOW_MASK(gpu_task->pushout, i) ) {
22102210
/* TODO: make sure no readers are working on the CPU version */
22112211
original = gpu_copy->original;
22122212
PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream,
@@ -2238,7 +2238,7 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device,
22382238
}
22392239
#endif
22402240
/* Move the data back into main memory */
2241-
rc = gpu_task->stage_out? gpu_task->stage_out(gpu_task, (1U << flow->flow_index), gpu_stream): PARSEC_SUCCESS;
2241+
rc = gpu_task->stage_out? gpu_task->stage_out(gpu_task, PARSEC_FLOW_MASK(flow->flow_index), gpu_stream): PARSEC_SUCCESS;
22422242
if(PARSEC_SUCCESS != rc) {
22432243
parsec_warning( "GPU[%d:%s]: gpu_task->stage_out from device rc=%d @%s:%d\n"
22442244
"\tdata %s <<%p>> -> <<%p>>\n",
@@ -2342,7 +2342,7 @@ parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device,
23422342

23432343
assert( 0 <= gpu_copy->readers );
23442344

2345-
if( gpu_task->pushout & (1 << i) ) {
2345+
if( PARSEC_CHECK_FLOW_MASK(gpu_task->pushout, i) ) {
23462346
PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
23472347
"GPU copy %p [ref_count %d] moved to the read LRU in %s",
23482348
gpu_copy, gpu_copy->super.super.obj_reference_count, __func__);

parsec/mca/device/device_gpu.h

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ typedef int (*parsec_advance_task_function_t)(parsec_device_gpu_module_t *gpu_d
6060
*
6161
*/
6262
typedef int (parsec_stage_in_function_t)(parsec_gpu_task_t *gtask,
63-
uint32_t flow_mask,
63+
parsec_flow_mask_t flow_mask,
6464
parsec_gpu_exec_stream_t *gpu_stream);
6565

6666

@@ -74,7 +74,7 @@ typedef int (parsec_stage_in_function_t)(parsec_gpu_task_t *gtask,
7474
*
7575
*/
7676
typedef int (parsec_stage_out_function_t)(parsec_gpu_task_t *gtask,
77-
uint32_t flow_mask,
77+
parsec_flow_mask_t flow_mask,
7878
parsec_gpu_exec_stream_t *gpu_stream);
7979

8080
/* Function type for releasing a device task. The DSL is responsible for allocating such tasks,
@@ -86,8 +86,8 @@ typedef void (*parsec_release_device_task_function_t)(void*);
8686
struct parsec_gpu_task_s {
8787
parsec_list_item_t list_item;
8888
uint16_t task_type;
89-
uint16_t pushout;
9089
int32_t last_status;
90+
parsec_flow_mask_t pushout;
9191
parsec_advance_task_function_t submit;
9292
parsec_complete_stage_function_t complete_stage;
9393
parsec_stage_in_function_t *stage_in;
@@ -102,23 +102,23 @@ struct parsec_gpu_task_s {
102102
struct {
103103
parsec_task_t *ec;
104104
uint64_t last_data_check_epoch;
105-
const parsec_flow_t *flow[MAX_PARAM_COUNT]; /* There is no consistent way to access the flows from the task_class,
106-
* so the DSL need to provide these flows here.
107-
*/
108-
size_t flow_nb_elts[MAX_PARAM_COUNT]; /* for each flow, size of the data to be allocated
109-
* on the GPU.
110-
*/
111-
parsec_data_collection_t *flow_dc[MAX_PARAM_COUNT]; /* for each flow, data collection from which the data
112-
* to be transferred logically belongs to.
113-
* This gives the user the chance to indicate on the JDF
114-
* a data collection to inspect during GPU transfer.
115-
* User may want info from the DC (e.g. mtype),
116-
* & otherwise remote copies don't have any info.
117-
*/
105+
const parsec_flow_t *flow[PARSEC_MAX_DEVICE_FLOWS]; /* There is no consistent way to access the flows from the task_class,
106+
* so the DSL need to provide these flows here.
107+
*/
108+
size_t flow_nb_elts[PARSEC_MAX_DEVICE_FLOWS]; /* for each flow, size of the data to be allocated
109+
* on the GPU.
110+
*/
111+
parsec_data_collection_t *flow_dc[PARSEC_MAX_DEVICE_FLOWS]; /* for each flow, data collection from which the data
112+
* to be transferred logically belongs to.
113+
* This gives the user the chance to indicate on the JDF
114+
* a data collection to inspect during GPU transfer.
115+
* User may want info from the DC (e.g. mtype),
116+
* & otherwise remote copies don't have any info.
117+
*/
118118
/* These are private and should not be used outside the device driver */
119-
parsec_data_copy_t *sources[MAX_PARAM_COUNT]; /* If the driver decides to acquire the data from a different
120-
* source, it will temporary store the best candidate here.
121-
*/
119+
parsec_data_copy_t *sources[PARSEC_MAX_DEVICE_FLOWS]; /* If the driver decides to acquire the data from a different
120+
* source, it will temporary store the best candidate here.
121+
*/
122122
};
123123
struct {
124124
parsec_data_copy_t *copy;
@@ -376,7 +376,7 @@ parsec_device_kernel_scheduler( parsec_device_module_t *module,
376376
*/
377377
int
378378
parsec_default_gpu_stage_in(parsec_gpu_task_t *gtask,
379-
uint32_t flow_mask,
379+
parsec_flow_mask_t flow_mask,
380380
parsec_gpu_exec_stream_t *gpu_stream);
381381

382382
/* Default stage_out function to transfer data from the GPU device.
@@ -390,7 +390,7 @@ parsec_default_gpu_stage_in(parsec_gpu_task_t *gtask,
390390
*/
391391
int
392392
parsec_default_gpu_stage_out(parsec_gpu_task_t *gtask,
393-
uint32_t flow_mask,
393+
parsec_flow_mask_t flow_mask,
394394
parsec_gpu_exec_stream_t *gpu_stream);
395395

396396
END_C_DECLS

parsec/mca/device/transfer_gpu.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ static int
106106
datatype_lookup_of_gpu_d2h_task( parsec_execution_stream_t * es,
107107
const parsec_gpu_d2h_task_t* this_task,
108108
const parsec_task_t * parent_task,
109-
uint32_t * flow_mask,
109+
parsec_flow_mask_t * flow_mask,
110110
parsec_dep_data_description_t * data)
111111
{
112112
(void)es; (void)this_task; (void)parent_task; (void)flow_mask; (void)data;
@@ -183,9 +183,9 @@ int32_t parsec_gpu_d2h_max_flows = 0;
183183
static const parsec_task_class_t parsec_gpu_d2h_task_class = {
184184
.name = "GPU D2H data transfer",
185185
.task_class_id = 0,
186-
.nb_flows = MAX_PARAM_COUNT, /* This value will have an impact on the duration of the
187-
* search for additional data to move. As this search is linear
188-
* we need to keep this upper bound set to a reasonable value. */
186+
.nb_flows = PARSEC_MAX_DEVICE_FLOWS, /* This value will have an impact on the duration of the
187+
* search for additional data to move. As this search is linear
188+
* we need to keep this upper bound set to a reasonable value. */
189189
.nb_parameters = 1,
190190
.nb_locals = 0,
191191
.params = {&symb_gpu_d2h_task_param},
@@ -217,7 +217,7 @@ static const parsec_task_class_t parsec_gpu_d2h_task_class = {
217217

218218

219219
/**
220-
* Transfer at most the MAX_PARAM_COUNT oldest data from the GPU back
220+
* Transfer at most the PARSEC_MAX_DEVICE_FLOWS oldest data from the GPU back
221221
* to main memory. Create a single task to move them all out, then switch the
222222
* GPU data copy in shared mode.
223223
*/

tests/runtime/cuda/stage_custom.jdf

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ extern "C" %{
2727

2828
static int
2929
stage_stride_in(parsec_gpu_task_t *gtask,
30-
uint32_t flow_mask,
30+
parsec_flow_mask_t flow_mask,
3131
parsec_gpu_exec_stream_t *gpu_stream){
3232
parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t *)gpu_stream;
3333
cudaError_t ret = 0;
@@ -39,7 +39,7 @@ stage_stride_in(parsec_gpu_task_t *gtask,
3939
int elem_sz;
4040
int i;
4141
for(i = 0; i < task->task_class->nb_flows; i++){
42-
if(flow_mask & (1U << i)){
42+
if(PARSEC_CHECK_FLOW_MASK(flow_mask, i)){
4343
copy_in = task->data[i].data_in;
4444
copy_out = task->data[i].data_out;
4545
dc = (parsec_tiled_matrix_t*)gtask->flow_dc[i];
@@ -75,7 +75,7 @@ stage_stride_in(parsec_gpu_task_t *gtask,
7575

7676
static int
7777
stage_stride_out(parsec_gpu_task_t *gtask,
78-
uint32_t flow_mask,
78+
parsec_flow_mask_t flow_mask,
7979
parsec_gpu_exec_stream_t *gpu_stream){
8080
parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t*)gpu_stream;
8181
cudaError_t ret;
@@ -86,7 +86,7 @@ stage_stride_out(parsec_gpu_task_t *gtask,
8686
int elem_sz;
8787
int i;
8888
for(i = 0; i < task->task_class->nb_flows; i++){
89-
if(flow_mask & (1U << i)){
89+
if(PARSEC_CHECK_FLOW_MASK(flow_mask, i)){
9090
copy_in = task->data[i].data_out;
9191
copy_out = copy_in->original->device_copies[0];
9292
dc = (parsec_tiled_matrix_t*)gtask->flow_dc[i];

0 commit comments

Comments
 (0)