@@ -179,6 +179,7 @@ static const parsec_symbol_t symb_gpu_d2h_task_param = {
179179};
180180
/* Upper bound on the number of eviction candidates gathered for a single
 * device-to-host write-back task; the LRU scan further clamps this value
 * to MAX_PARAM_COUNT.
 * NOTE(review): the 0 default is presumably overridden elsewhere before
 * use -- confirm where it is registered. */
181181int32_t parsec_gpu_d2h_max_flows = 0 ;
/* Upper bound on the number of discarded GPU copies released during one
 * LRU scan. A value of 0 turns the release of discarded copies off
 * entirely (the scan then neither tests the discarded flag nor counts
 * against this limit). */
182+ int32_t parsec_gpu_d2h_max_discarded = 0 ;
182183
183184static const parsec_task_class_t parsec_gpu_d2h_task_class = {
184185 .name = "GPU D2H data transfer" ,
@@ -215,6 +216,16 @@ static const parsec_task_class_t parsec_gpu_d2h_task_class = {
215216#endif
216217};
217218
219+ static inline void release_discarded_data (parsec_device_gpu_module_t * gpu_device , parsec_gpu_data_copy_t * gpu_copy )
220+ {
221+ parsec_list_item_ring_chop ((parsec_list_item_t * )gpu_copy );
222+ PARSEC_LIST_ITEM_SINGLETON (gpu_copy );
223+ PARSEC_DEBUG_VERBOSE (10 , parsec_gpu_output_stream ,
224+ "D2H[%d:%s] GPU data copy %p of discarded data %p will be released" ,
225+ gpu_device -> super .device_index , gpu_device -> super .name , gpu_copy , gpu_copy -> original );
226+ parsec_device_release_gpu_copy (gpu_device , gpu_copy );
227+
228+ }
218229
219230/**
220231 * Transfer at most the MAX_PARAM_COUNT oldest data from the GPU back
@@ -227,58 +238,98 @@ parsec_gpu_create_w2r_task(parsec_device_gpu_module_t *gpu_device,
227238{
228239 parsec_gpu_task_t * w2r_task = NULL ;
229240 parsec_gpu_d2h_task_t * d2h_task = NULL ;
230- parsec_gpu_data_copy_t * gpu_copy , * cpu_copy ;
231- parsec_list_item_t * item = (parsec_list_item_t * )gpu_device -> gpu_mem_owned_lru .ghost_element .list_next ;
241+ parsec_gpu_data_copy_t * fwd_gpu_copy = NULL , * fwd_cpu_copy = NULL , * rev_gpu_copy = NULL , * rev_cpu_copy = NULL ;
242+ parsec_list_item_t * fwd = (parsec_list_item_t * )gpu_device -> gpu_mem_owned_lru .ghost_element .list_next ;
243+ parsec_list_item_t * rev = (parsec_list_item_t * )gpu_device -> gpu_mem_owned_lru .ghost_element .list_prev ;
232244 int nb_cleaned = 0 ;
245+ int nb_discarded = 0 ;
246+ int nb_candidates = 0 ;
247+ const int max_flows = (parsec_gpu_d2h_max_flows < MAX_PARAM_COUNT ) ? parsec_gpu_d2h_max_flows : MAX_PARAM_COUNT ;
248+ /* store candidates in an array without unlinking them so we can easily abandon them */
249+ parsec_gpu_data_copy_t * candidates [MAX_PARAM_COUNT ];
233250
234251 /* Find a data copy that has no pending users on the GPU, and can be
235- * safely moved back on the main memory */
236- while (nb_cleaned < parsec_gpu_d2h_max_flows ) {
252+ * safely moved back on the main memory.
253+ * Also look for data that was discarded and can be released immediately.
254+ *
255+ * Observation: data to be evicted is more likely at the front of the list
256+ * while data that is discarded is more likely at the end
257+ * (since it was likely discarded shortly after being used)
258+ * so we search from the front and the back. */
259+ while (nb_candidates < max_flows &&
260+ /* allow discarding to be disabled */
261+ (parsec_gpu_d2h_max_discarded == 0 || nb_discarded < parsec_gpu_d2h_max_discarded )) {
237262 /* Break at the end of the list */
238- if ( item == & ( gpu_device -> gpu_mem_owned_lru .ghost_element ) ) {
263+ if ( fwd == & gpu_device -> gpu_mem_owned_lru .ghost_element ) {
239264 break ;
240265 }
241- gpu_copy = (parsec_gpu_data_copy_t * )item ;
242- cpu_copy = gpu_copy -> original -> device_copies [0 ];
243- parsec_atomic_lock ( & gpu_copy -> original -> lock );
244- /* get the next item before altering the next pointer */
245- item = (parsec_list_item_t * )item -> list_next ; /* conversion needed for volatile */
246- if (cpu_copy -> flags & PARSEC_DATA_FLAG_DISCARDED ) {
247- parsec_list_item_ring_chop ((parsec_list_item_t * )gpu_copy );
248- PARSEC_LIST_ITEM_SINGLETON (gpu_copy );
249- PARSEC_DEBUG_VERBOSE (10 , parsec_gpu_output_stream ,
250- "D2H[%d:%s] GPU data copy %p of discarded data %p will be released" ,
251- gpu_device -> super .device_index , gpu_device -> super .name , gpu_copy , gpu_copy -> original );
252- parsec_atomic_unlock ( & gpu_copy -> original -> lock );
253- parsec_device_release_gpu_copy (gpu_device , gpu_copy );
254- } else if ( 0 == gpu_copy -> readers ) {
255- if ( PARSEC_UNLIKELY (NULL == d2h_task ) ) { /* allocate on-demand */
256- d2h_task = (parsec_gpu_d2h_task_t * )parsec_thread_mempool_allocate (es -> context_mempool );
257- if ( PARSEC_UNLIKELY (NULL == d2h_task ) ) { /* we're running out of memory. Bail out. */
258- parsec_atomic_unlock ( & gpu_copy -> original -> lock );
259- return NULL ;
260- }
261- PARSEC_OBJ_CONSTRUCT (d2h_task , parsec_task_t );
262- }
263- parsec_list_item_ring_chop ((parsec_list_item_t * )gpu_copy );
264- PARSEC_LIST_ITEM_SINGLETON (gpu_copy );
265- gpu_copy -> readers ++ ;
266- d2h_task -> data [nb_cleaned ].data_out = gpu_copy ;
267- gpu_copy -> data_transfer_status = PARSEC_DATA_STATUS_UNDER_TRANSFER ; /* mark the copy as in transfer */
268- parsec_atomic_unlock ( & gpu_copy -> original -> lock );
269- PARSEC_DEBUG_VERBOSE (10 , parsec_gpu_output_stream , "D2H[%d:%s] task %p:\tdata %d -> %p [%p] readers %d" ,
270- gpu_device -> super .device_index , gpu_device -> super .name , (void * )d2h_task ,
271- nb_cleaned , gpu_copy , gpu_copy -> original , gpu_copy -> readers );
272- nb_cleaned ++ ;
273- if (MAX_PARAM_COUNT == nb_cleaned )
266+ if (fwd == rev || fwd -> list_next == rev ) {
267+ /* break at median if we discarded data */
268+ if (nb_discarded > 0 ) {
274269 break ;
275- } else {
276- parsec_atomic_unlock ( & gpu_copy -> original -> lock );
270+ }
271+ /* otherwise stop walking backwards because we already
272+ * looked for discarded data on the way */
273+ rev = NULL ;
274+ rev_gpu_copy = NULL ;
275+ rev_cpu_copy = NULL ;
277276 }
277+
278+ fwd_gpu_copy = (parsec_gpu_data_copy_t * )fwd ;
279+ fwd_cpu_copy = fwd_gpu_copy -> original -> device_copies [0 ];
280+ /* get the next item before altering the next pointer */
281+ fwd = (parsec_list_item_t * )fwd -> list_next ; /* conversion needed for volatile */
282+ if (NULL != rev ) {
283+ rev_gpu_copy = (parsec_gpu_data_copy_t * )rev ;
284+ rev_cpu_copy = rev_gpu_copy -> original -> device_copies [0 ];
285+ rev = (parsec_list_item_t * )rev -> list_prev ; // cast for volatile
286+ }
287+ if (parsec_gpu_d2h_max_discarded && fwd_cpu_copy -> flags & PARSEC_DATA_FLAG_DISCARDED ) {
288+ release_discarded_data (gpu_device , fwd_gpu_copy );
289+ ++ nb_discarded ;
290+ } else if ( max_flows > nb_candidates && 0 == fwd_gpu_copy -> readers ) {
291+ /* store the candidates but leave them in the LRU */
292+ candidates [nb_candidates ] = fwd_gpu_copy ;
293+ nb_candidates ++ ;
294+ }
295+ if (parsec_gpu_d2h_max_discarded &&
296+ NULL != rev_cpu_copy &&
297+ rev_cpu_copy -> flags & PARSEC_DATA_FLAG_DISCARDED ) {
298+ release_discarded_data (gpu_device , rev_gpu_copy );
299+ ++ nb_discarded ;
300+ }
301+ }
302+
303+ if ( nb_discarded > 0 || nb_candidates == 0 ) {
304+ /* we discarded some data or found no candidate, don't bother pushing out */
305+ return NULL ;
278306 }
279307
280- if ( 0 == nb_cleaned )
308+ d2h_task = (parsec_gpu_d2h_task_t * )parsec_thread_mempool_allocate (es -> context_mempool );
309+ if ( PARSEC_UNLIKELY (NULL == d2h_task ) ) { /* we're running out of memory. Bail out. */
281310 return NULL ;
311+ }
312+ PARSEC_OBJ_CONSTRUCT (d2h_task , parsec_task_t );
313+
314+ for (int i = 0 ; i < nb_candidates ; ++ i ) {
315+ parsec_gpu_data_copy_t * gpu_copy = candidates [i ];
316+ parsec_atomic_lock ( & gpu_copy -> original -> lock );
317+ if (PARSEC_UNLIKELY (gpu_copy -> readers != 0 )) {
318+ /* gained a reader, ignore */
319+ parsec_atomic_unlock ( & gpu_copy -> original -> lock );
320+ continue ;
321+ }
322+ parsec_list_item_ring_chop ((parsec_list_item_t * )gpu_copy );
323+ PARSEC_LIST_ITEM_SINGLETON (gpu_copy );
324+ gpu_copy -> readers ++ ;
325+ d2h_task -> data [nb_cleaned ].data_out = gpu_copy ;
326+ gpu_copy -> data_transfer_status = PARSEC_DATA_STATUS_UNDER_TRANSFER ; /* mark the copy as in transfer */
327+ parsec_atomic_unlock ( & gpu_copy -> original -> lock );
328+ nb_cleaned ++ ;
329+ PARSEC_DEBUG_VERBOSE (10 , parsec_gpu_output_stream , "D2H[%d:%s] task %p:\tdata %d -> %p [%p] readers %d" ,
330+ gpu_device -> super .device_index , gpu_device -> super .name , (void * )d2h_task ,
331+ nb_cleaned , gpu_copy , gpu_copy -> original , gpu_copy -> readers );
332+ }
282333
283334 d2h_task -> priority = INT32_MAX ;
284335 d2h_task -> task_class = & parsec_gpu_d2h_task_class ;
0 commit comments