Skip to content

Commit 397ca9b

Browse files
committed
Walk forward and backward when creating a w2r task
Discarded data sits toward the end of the LRU while the data to be evicted is at the front. We walk both forward and backward to collect the discarded data from the back, until we either meet the pivot or have found enough data to evict. If we discarded data, we don't evict. Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
1 parent 14934a5 commit 397ca9b

File tree

4 files changed

+99
-41
lines changed

4 files changed

+99
-41
lines changed

parsec/mca/device/cuda/device_cuda_component.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,9 @@ static int device_cuda_component_register(void)
161161
(void)parsec_mca_param_reg_int_name("device_cuda", "max_number_of_ejected_data",
162162
"Sets up the maximum number of blocks that can be ejected from GPU memory",
163163
false, false, MAX_PARAM_COUNT, &parsec_gpu_d2h_max_flows);
164+
(void)parsec_mca_param_reg_int_name("device_cuda", "max_number_of_discarded_data",
165+
"Sets up the maximum number of discarded blocks to be collected at once",
166+
false, false, MAX_PARAM_COUNT, &parsec_gpu_d2h_max_discarded);
164167
(void)parsec_mca_param_reg_int_name("device_cuda", "max_streams",
165168
"Maximum number of Streams to use for the GPU engine; 2 streams are used for communication between host and device, so the minimum is 3",
166169
false, false, PARSEC_GPU_MAX_STREAMS, &parsec_cuda_max_streams);

parsec/mca/device/device_gpu.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,7 @@ typedef struct parsec_gpu_workspace_s {
279279
PARSEC_DECLSPEC extern int parsec_gpu_output_stream;
280280
PARSEC_DECLSPEC extern int parsec_gpu_verbosity;
281281
PARSEC_DECLSPEC extern int32_t parsec_gpu_d2h_max_flows;
282+
PARSEC_DECLSPEC extern int32_t parsec_gpu_d2h_max_discarded;
282283

283284
/**
284285
* Debugging functions.

parsec/mca/device/level_zero/device_level_zero_component.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,9 @@ static int device_level_zero_component_register(void)
271271
(void)parsec_mca_param_reg_int_name("device_level_zero", "max_number_of_ejected_data",
272272
"Sets up the maximum number of blocks that can be ejected from GPU memory",
273273
false, false, MAX_PARAM_COUNT, &parsec_gpu_d2h_max_flows);
274+
(void)parsec_mca_param_reg_int_name("device_level_zero", "max_number_of_discarded_data",
275+
"Sets up the maximum number of discarded blocks to be collected at once",
276+
false, false, MAX_PARAM_COUNT, &parsec_gpu_d2h_max_discarded);
274277
(void)parsec_mca_param_reg_int_name("device_level_zero", "max_streams",
275278
"Maximum number of Streams to use for the GPU engine; 2 streams are used for communication between host and device, so the minimum is 3",
276279
false, false, PARSEC_GPU_MAX_STREAMS, &parsec_level_zero_max_streams);

parsec/mca/device/transfer_gpu.c

Lines changed: 92 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,7 @@ static const parsec_symbol_t symb_gpu_d2h_task_param = {
179179
};
180180

181181
int32_t parsec_gpu_d2h_max_flows = 0;
182+
int32_t parsec_gpu_d2h_max_discarded = 0;
182183

183184
static const parsec_task_class_t parsec_gpu_d2h_task_class = {
184185
.name = "GPU D2H data transfer",
@@ -215,6 +216,16 @@ static const parsec_task_class_t parsec_gpu_d2h_task_class = {
215216
#endif
216217
};
217218

219+
/**
 * Detach a discarded GPU data copy from the list ring it currently sits in
 * (the device's owned LRU) and hand it to the device layer for release.
 *
 * @param gpu_device  device that owns the copy; used for logging and for
 *                    parsec_device_release_gpu_copy().
 * @param gpu_copy    the GPU-side copy of data flagged as discarded.
 *
 * NOTE(review): the pre-change inline version performed this work while the
 * owner's original->lock was held (unlocking just before the release call);
 * callers of this helper do not visibly hold that lock — confirm the
 * intended locking protocol at the call sites.
 */
static inline void release_discarded_data(parsec_device_gpu_module_t *gpu_device, parsec_gpu_data_copy_t* gpu_copy)
220+
{
221+
/* Unlink the copy from whatever ring it is chained into. */
parsec_list_item_ring_chop((parsec_list_item_t*)gpu_copy);
222+
/* Reset its linkage so it is a standalone (singleton) list item. */
PARSEC_LIST_ITEM_SINGLETON(gpu_copy);
223+
PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream,
224+
"D2H[%d:%s] GPU data copy %p of discarded data %p will be released",
225+
gpu_device->super.device_index, gpu_device->super.name, gpu_copy, gpu_copy->original);
226+
/* Return the copy (and its device memory) to the device layer. */
parsec_device_release_gpu_copy(gpu_device, gpu_copy);
227+
228+
}
218229

219230
/**
220231
* Transfer at most the MAX_PARAM_COUNT oldest data from the GPU back
@@ -227,58 +238,98 @@ parsec_gpu_create_w2r_task(parsec_device_gpu_module_t *gpu_device,
227238
{
228239
parsec_gpu_task_t *w2r_task = NULL;
229240
parsec_gpu_d2h_task_t *d2h_task = NULL;
230-
parsec_gpu_data_copy_t *gpu_copy, *cpu_copy;
231-
parsec_list_item_t* item = (parsec_list_item_t*)gpu_device->gpu_mem_owned_lru.ghost_element.list_next;
241+
parsec_gpu_data_copy_t *fwd_gpu_copy = NULL, *fwd_cpu_copy = NULL, *rev_gpu_copy = NULL, *rev_cpu_copy = NULL;
242+
parsec_list_item_t* fwd = (parsec_list_item_t*)gpu_device->gpu_mem_owned_lru.ghost_element.list_next;
243+
parsec_list_item_t* rev = (parsec_list_item_t*)gpu_device->gpu_mem_owned_lru.ghost_element.list_prev;
232244
int nb_cleaned = 0;
245+
int nb_discarded = 0;
246+
int nb_candidates = 0;
247+
const int max_flows = (parsec_gpu_d2h_max_flows < MAX_PARAM_COUNT) ? parsec_gpu_d2h_max_flows : MAX_PARAM_COUNT;
248+
/* store candidates in an array without unlinking them so we can easily abandon them */
249+
parsec_gpu_data_copy_t *candidates[MAX_PARAM_COUNT];
233250

234251
/* Find a data copy that has no pending users on the GPU, and can be
235-
* safely moved back on the main memory */
236-
while(nb_cleaned < parsec_gpu_d2h_max_flows) {
252+
* safely moved back on the main memory.
253+
* Also look for data that was discarded and can be released immediately.
254+
*
255+
* Observation: data to be evicted is more likely at the front of the list
256+
* while data that is discarded is more likely at the end
257+
* (since it was likely discarded shortly after being used)
258+
* so we search from the front and the back. */
259+
while(nb_candidates < max_flows &&
260+
/* allow discarding to be disabled */
261+
(parsec_gpu_d2h_max_discarded == 0 || nb_discarded < parsec_gpu_d2h_max_discarded)) {
237262
/* Break at the end of the list */
238-
if( item == &(gpu_device->gpu_mem_owned_lru.ghost_element) ) {
263+
if( fwd == &gpu_device->gpu_mem_owned_lru.ghost_element ) {
239264
break;
240265
}
241-
gpu_copy = (parsec_gpu_data_copy_t*)item;
242-
cpu_copy = gpu_copy->original->device_copies[0];
243-
parsec_atomic_lock( &gpu_copy->original->lock );
244-
/* get the next item before altering the next pointer */
245-
item = (parsec_list_item_t*)item->list_next; /* conversion needed for volatile */
246-
if (cpu_copy->flags & PARSEC_DATA_FLAG_DISCARDED) {
247-
parsec_list_item_ring_chop((parsec_list_item_t*)gpu_copy);
248-
PARSEC_LIST_ITEM_SINGLETON(gpu_copy);
249-
PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream,
250-
"D2H[%d:%s] GPU data copy %p of discarded data %p will be released",
251-
gpu_device->super.device_index, gpu_device->super.name, gpu_copy, gpu_copy->original);
252-
parsec_atomic_unlock( &gpu_copy->original->lock );
253-
parsec_device_release_gpu_copy(gpu_device, gpu_copy);
254-
} else if( 0 == gpu_copy->readers ) {
255-
if( PARSEC_UNLIKELY(NULL == d2h_task) ) { /* allocate on-demand */
256-
d2h_task = (parsec_gpu_d2h_task_t*)parsec_thread_mempool_allocate(es->context_mempool);
257-
if( PARSEC_UNLIKELY(NULL == d2h_task) ) { /* we're running out of memory. Bail out. */
258-
parsec_atomic_unlock( &gpu_copy->original->lock );
259-
return NULL;
260-
}
261-
PARSEC_OBJ_CONSTRUCT(d2h_task, parsec_task_t);
262-
}
263-
parsec_list_item_ring_chop((parsec_list_item_t*)gpu_copy);
264-
PARSEC_LIST_ITEM_SINGLETON(gpu_copy);
265-
gpu_copy->readers++;
266-
d2h_task->data[nb_cleaned].data_out = gpu_copy;
267-
gpu_copy->data_transfer_status = PARSEC_DATA_STATUS_UNDER_TRANSFER; /* mark the copy as in transfer */
268-
parsec_atomic_unlock( &gpu_copy->original->lock );
269-
PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "D2H[%d:%s] task %p:\tdata %d -> %p [%p] readers %d",
270-
gpu_device->super.device_index, gpu_device->super.name, (void*)d2h_task,
271-
nb_cleaned, gpu_copy, gpu_copy->original, gpu_copy->readers);
272-
nb_cleaned++;
273-
if (MAX_PARAM_COUNT == nb_cleaned)
266+
if (fwd == rev || fwd->list_next == rev) {
267+
/* break at median if we discarded data */
268+
if (nb_discarded > 0) {
274269
break;
275-
} else {
276-
parsec_atomic_unlock( &gpu_copy->original->lock );
270+
}
271+
/* otherwise stop walking backwards because we already
272+
* looked for discarded data on the way */
273+
rev = NULL;
274+
rev_gpu_copy = NULL;
275+
rev_cpu_copy = NULL;
277276
}
277+
278+
fwd_gpu_copy = (parsec_gpu_data_copy_t*)fwd;
279+
fwd_cpu_copy = fwd_gpu_copy->original->device_copies[0];
280+
/* get the next item before altering the next pointer */
281+
fwd = (parsec_list_item_t*)fwd->list_next; /* conversion needed for volatile */
282+
if (NULL != rev) {
283+
rev_gpu_copy = (parsec_gpu_data_copy_t*)rev;
284+
rev_cpu_copy = rev_gpu_copy->original->device_copies[0];
285+
rev = (parsec_list_item_t*)rev->list_prev; // cast for volatile
286+
}
287+
if (parsec_gpu_d2h_max_discarded && fwd_cpu_copy->flags & PARSEC_DATA_FLAG_DISCARDED) {
288+
release_discarded_data(gpu_device, fwd_gpu_copy);
289+
++nb_discarded;
290+
} else if( max_flows > nb_candidates && 0 == fwd_gpu_copy->readers ) {
291+
/* store the candidates but leave them in the LRU */
292+
candidates[nb_candidates] = fwd_gpu_copy;
293+
nb_candidates++;
294+
}
295+
if (parsec_gpu_d2h_max_discarded &&
296+
NULL != rev_cpu_copy &&
297+
rev_cpu_copy->flags & PARSEC_DATA_FLAG_DISCARDED) {
298+
release_discarded_data(gpu_device, rev_gpu_copy);
299+
++nb_discarded;
300+
}
301+
}
302+
303+
if( nb_discarded > 0 || nb_candidates == 0 ) {
304+
/* we discarded some data, don't bother pushing out */
305+
return NULL;
278306
}
279307

280-
if( 0 == nb_cleaned )
308+
d2h_task = (parsec_gpu_d2h_task_t*)parsec_thread_mempool_allocate(es->context_mempool);
309+
if( PARSEC_UNLIKELY(NULL == d2h_task) ) { /* we're running out of memory. Bail out. */
281310
return NULL;
311+
}
312+
PARSEC_OBJ_CONSTRUCT(d2h_task, parsec_task_t);
313+
314+
for (int i = 0; i < nb_candidates; ++i) {
315+
parsec_gpu_data_copy_t *gpu_copy = candidates[i];
316+
parsec_atomic_lock( &gpu_copy->original->lock );
317+
if (PARSEC_UNLIKELY(gpu_copy->readers != 0)) {
318+
/* gained a reader, ignore */
319+
parsec_atomic_unlock( &gpu_copy->original->lock );
320+
continue;
321+
}
322+
parsec_list_item_ring_chop((parsec_list_item_t*)gpu_copy);
323+
PARSEC_LIST_ITEM_SINGLETON(gpu_copy);
324+
gpu_copy->readers++;
325+
d2h_task->data[nb_cleaned].data_out = gpu_copy;
326+
gpu_copy->data_transfer_status = PARSEC_DATA_STATUS_UNDER_TRANSFER; /* mark the copy as in transfer */
327+
parsec_atomic_unlock( &gpu_copy->original->lock );
328+
nb_cleaned++;
329+
PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "D2H[%d:%s] task %p:\tdata %d -> %p [%p] readers %d",
330+
gpu_device->super.device_index, gpu_device->super.name, (void*)d2h_task,
331+
nb_cleaned, gpu_copy, gpu_copy->original, gpu_copy->readers);
332+
}
282333

283334
d2h_task->priority = INT32_MAX;
284335
d2h_task->task_class = &parsec_gpu_d2h_task_class;

0 commit comments

Comments
 (0)