Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions parsec/mca/device/device.c
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,12 @@ static float load_balance_skew;
*/
static int parsec_device_load_balance_allow_cpu = 0;

/**
 * Whether to skip events on input, output, and execution streams that
 * are not strictly necessary (enabled by default).
 *
 * Used as a boolean: any non-zero value enables the shortcut, 0 disables it.
 * parsec_mca_device_init() registers it as the "skip_empty_events" parameter
 * of the "device" group, passing this variable's address as the storage
 * argument (presumably so the registered value is written back here).
 * The symbol is not static because the GPU stream progress code reads it
 * when a step function reports that it scheduled nothing (return value 0).
 */
int parsec_device_skip_empty_events = 1;

/**
* @brief Estimates how many nanoseconds this_task will run on devid
*
Expand Down Expand Up @@ -322,6 +328,11 @@ int parsec_mca_device_init(void)
(void)parsec_mca_param_reg_int_name("device", "load_balance_skew",
"Allow load balancing to skew by x%% to favor data reuse",
false, false, parsec_device_load_balance_skew, NULL);
(void)parsec_mca_param_reg_int_name("device", "skip_empty_events",
"Skip recording events on input and output streams "
"that are not strictly necessary (enabled)",
false, false, parsec_device_skip_empty_events,
&parsec_device_skip_empty_events);
(void)parsec_mca_param_reg_int_name("device", "load_balance_allow_cpu",
"Allow load balancing tasks with GPU incarnations to CPU cores",
false, false, parsec_device_load_balance_allow_cpu, NULL);
Expand Down
6 changes: 6 additions & 0 deletions parsec/mca/device/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,12 @@ PARSEC_OBJ_CLASS_DECLARATION(parsec_device_module_t);
extern uint32_t parsec_nb_devices;
extern int parsec_device_output;

/**
 * Whether to skip events on input, output, and execution streams that
 * are not strictly necessary (enabled by default).
 *
 * Used as a boolean: any non-zero value enables skipping, 0 disables it.
 * This is only the declaration; the variable is defined and initialized
 * to 1 in device.c.
 */
extern int parsec_device_skip_empty_events;

/**
* @brief Find the best device to execute the kernel based on the compute
* capability of the device.
Expand Down
40 changes: 33 additions & 7 deletions parsec/mca/device/device_gpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,13 @@ static int parsec_gpu_profiling_initiated = 0;
int parsec_gpu_output_stream = -1;
int parsec_gpu_verbosity;

/**
 * Signature of the per-stream step functions that
 * parsec_device_progress_stream() calls to advance a task on a stream.
 *
 * The return value of these functions is either a parsec_hook_return_t for <= 0 values,
 * or a positive number which represents that something has been scheduled on the gpu_stream.
 *
 * A return value of exactly 0 means nothing was scheduled on the stream. When
 * parsec_device_skip_empty_events is set, the caller then records no event on
 * the stream and invokes the task's complete_stage callback (if any) right away.
 *
 * gpu_device: device the task is being progressed on.
 * gpu_task:   task to advance.
 * gpu_stream: stream on which the step may schedule work.
 */
typedef int(*parsec_gpu_step_function_t)(parsec_device_gpu_module_t *gpu_device,
parsec_gpu_task_t *gpu_task,
parsec_gpu_exec_stream_t *gpu_stream);

static inline int
parsec_device_check_space_needed(parsec_device_gpu_module_t *gpu_device,
parsec_gpu_task_t *gpu_task)
Expand Down Expand Up @@ -1461,7 +1468,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device,
PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream,
"GPU[%d:%s]:\tThere is a potential alternative source for data_in %p [ref_count %d] in original %p to go in copy %p [ref_count %d], but it is not ready, falling back on CPU source",
gpu_device->super.device_index, gpu_device->super.name, task_data->data_in, task_data->data_in->super.super.obj_reference_count, original, gpu_elem, gpu_elem->super.super.obj_reference_count);
//return PARSEC_HOOK_RETURN_NEXT;
//return PARSEC_HOOK_RETURN_AGAIN;
}

/* We fall back on the CPU copy */
Expand Down Expand Up @@ -1878,7 +1885,7 @@ parsec_device_callback_complete_push(parsec_device_gpu_module_t *gpu_device,
static inline int
parsec_device_progress_stream( parsec_device_gpu_module_t* gpu_device,
parsec_gpu_exec_stream_t* stream,
parsec_advance_task_function_t progress_fct,
parsec_gpu_step_function_t progress_fct,
parsec_gpu_task_t* task,
parsec_gpu_task_t** out_task )
{
Expand Down Expand Up @@ -1937,6 +1944,7 @@ parsec_device_progress_stream( parsec_device_gpu_module_t* gpu_device,
}

grab_a_task:
assert(NULL == task);
if( NULL == stream->tasks[stream->start] ) { /* there is room on the stream */
task = (parsec_gpu_task_t*)parsec_list_pop_front(stream->fifo_pending); /* get the best task */
}
Expand All @@ -1949,6 +1957,20 @@ parsec_device_progress_stream( parsec_device_gpu_module_t* gpu_device,

schedule_task:
rc = progress_fct( gpu_device, task, stream );
if( 0 == rc && parsec_device_skip_empty_events ) {
#if defined(PARSEC_PROF_TRACE)
if( stream->prof_event_track_enable ) {
if( task->prof_key_end != -1 ) {
PARSEC_PROFILING_TRACE(stream->profiling, task->prof_key_end, task->prof_event_id, task->prof_tp_id, NULL);
}
}
#endif
/* If progress_fct added nothing on that stream, we skip scheduling a record on the GPU stream */
if( task->complete_stage )
rc = task->complete_stage(gpu_device, &task, stream);
*out_task = task;
return rc;
}
if( 0 > rc ) {
if( PARSEC_HOOK_RETURN_AGAIN != rc ) {
if( PARSEC_HOOK_RETURN_NEXT == rc ) {
Expand Down Expand Up @@ -1978,7 +2000,6 @@ parsec_device_progress_stream( parsec_device_gpu_module_t* gpu_device,
"trigger the task will be handled accordingly",
gpu_device->super.device_index, gpu_device->super.name, (void*)task);
}
task->last_status = rc;
/**
* Do not skip the gpu event generation. The problem is that some of the inputs
* might be in the pipe of being transferred to the GPU. If we activate this task
Expand Down Expand Up @@ -2018,7 +2039,7 @@ parsec_device_kernel_push( parsec_device_gpu_module_t *gpu_device,
{
parsec_task_t *this_task = gpu_task->ec;
const parsec_flow_t *flow;
int i, ret = 0;
int i, ret = 0, how_many = 0;
#if defined(PARSEC_DEBUG_NOISIER)
char tmp[MAX_TASK_STRLEN];
#endif
Expand Down Expand Up @@ -2089,6 +2110,7 @@ parsec_device_kernel_push( parsec_device_gpu_module_t *gpu_device,
gpu_task->last_status = ret;
return ret;
}
how_many += ret;
}

PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream,
Expand All @@ -2099,7 +2121,7 @@ parsec_device_kernel_push( parsec_device_gpu_module_t *gpu_device,
#if defined(PARSEC_PROF_TRACE)
gpu_task->prof_key_end = -1; /* We do not log that event as the completion of this task */
#endif
return PARSEC_HOOK_RETURN_DONE;
return how_many;
}

/**
Expand All @@ -2116,6 +2138,7 @@ parsec_device_kernel_exec( parsec_device_gpu_module_t *gpu_device,
{
parsec_advance_task_function_t progress_fct = gpu_task->submit;
parsec_task_t* this_task = gpu_task->ec;
int rc;

#if defined(PARSEC_DEBUG_NOISIER)
char tmp[MAX_TASK_STRLEN];
Expand Down Expand Up @@ -2152,7 +2175,9 @@ parsec_device_kernel_exec( parsec_device_gpu_module_t *gpu_device,
#endif /* defined(PARSEC_DEBUG_PARANOID) */

(void)this_task;
return progress_fct( gpu_device, gpu_task, gpu_stream );
rc = progress_fct( gpu_device, gpu_task, gpu_stream );
gpu_task->last_status = rc;
return 1;
}

/**
Expand Down Expand Up @@ -2193,8 +2218,9 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device,
return_code = PARSEC_HOOK_RETURN_DISABLE;
goto release_and_return_error;
}
how_many++;
}
return PARSEC_HOOK_RETURN_DONE;
return how_many;
}

PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream,
Expand Down
Loading