33 * Copyright (c) 2021-2024 The University of Tennessee and The University
44 * of Tennessee Research Foundation. All rights
55 * reserved.
6- * Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
6+ * Copyright (c) 2024-2026 NVIDIA Corporation. All rights reserved.
77 */
88
99#include "parsec/parsec_config.h"
@@ -2457,11 +2457,11 @@ parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device,
24572457 int i ;
24582458
24592459#if defined(PARSEC_DEBUG_NOISIER )
2460- char tmp [MAX_TASK_STRLEN ];
2460+ char task_str [MAX_TASK_STRLEN ];
2461+ (void )parsec_task_snprintf (task_str , MAX_TASK_STRLEN , this_task );
24612462 PARSEC_DEBUG_VERBOSE (10 , parsec_gpu_output_stream ,
24622463 "GPU[%d:%s]: Epilog of %s" ,
2463- gpu_device -> super .device_index , gpu_device -> super .name ,
2464- parsec_task_snprintf (tmp , MAX_TASK_STRLEN , this_task ) );
2464+ gpu_device -> super .device_index , gpu_device -> super .name , task_str );
24652465#endif
24662466
24672467 for ( i = 0 ; i < this_task -> task_class -> nb_flows ; i ++ ) {
@@ -2504,9 +2504,9 @@ parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device,
25042504 cpu_copy -> coherency_state = PARSEC_DATA_COHERENCY_SHARED ;
25052505
25062506 cpu_copy -> version = gpu_copy -> version ;
2507- PARSEC_DEBUG_VERBOSE (10 , parsec_gpu_output_stream ,
2508- "GPU[%d:%s]: CPU copy %p [ref_count %d] gets the same version %d as GPU copy %p [ref_count %d]" ,
2509- gpu_device -> super .device_index , gpu_device -> super .name ,
2507+ PARSEC_DEBUG_VERBOSE (20 , parsec_gpu_output_stream ,
2508+ "GPU[%d:%s]: %s: CPU copy %p [ref_count %d] gets the same version %d as GPU copy %p [ref_count %d]" ,
2509+ gpu_device -> super .device_index , gpu_device -> super .name , task_str ,
25102510 cpu_copy , cpu_copy -> super .super .obj_reference_count , cpu_copy -> version , gpu_copy , gpu_copy -> super .super .obj_reference_count );
25112511
25122512 /**
@@ -2528,19 +2528,32 @@ parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device,
25282528 gpu_copy -> coherency_state = PARSEC_DATA_COHERENCY_SHARED ;
25292529 assert (PARSEC_DATA_STATUS_UNDER_TRANSFER == cpu_copy -> data_transfer_status );
25302530 cpu_copy -> data_transfer_status = PARSEC_DATA_STATUS_COMPLETE_TRANSFER ;
2531- if ( 0 == (parsec_mpi_allow_gpu_memory_communications & PARSEC_RUNTIME_SEND_GPU_MEMORY ) ) {
2531+ /* This condition should be checked more strictly. It is one thing to be able to send the
2532+ * data directly from the GPU via the communication engine, but if we have local successors
2533+ * that are not executing on the GPU (i.e., the pushout has been set) we need to propagate
2534+ * the CPU copy instead. Note: the leading `1 ||` below makes this branch unconditional.
2535+ */
2536+ if ( 1 || (0 == (parsec_mpi_allow_gpu_memory_communications & PARSEC_RUNTIME_SEND_GPU_MEMORY )) ) {
25322537 /* Report the CPU copy as the output of the task. */
25332538 this_task -> data [i ].data_out = cpu_copy ;
2539+ PARSEC_DEBUG_VERBOSE (100 , parsec_gpu_output_stream ,
2540+ "GPU[%d:%s]: %s: GPU copy %p [ref_count %d] replaced by the CPU copy %p [ref_count %d] in %s" ,
2541+ gpu_device -> super .device_index , gpu_device -> super .name , task_str ,
2542+ gpu_copy , gpu_copy -> super .super .obj_reference_count ,
2543+ cpu_copy , cpu_copy -> super .super .obj_reference_count , __func__ );
25342544 }
25352545 PARSEC_DEBUG_VERBOSE (20 , parsec_gpu_output_stream ,
2536- "GPU copy %p [ref_count %d] moved to the read LRU in %s" ,
2546+ "GPU[%d:%s]: %s: GPU copy %p [ref_count %d] moved to the read LRU in %s" ,
2547+ gpu_device -> super .device_index , gpu_device -> super .name , task_str ,
25372548 gpu_copy , gpu_copy -> super .super .obj_reference_count , __func__ );
25382549 parsec_list_item_ring_chop ((parsec_list_item_t * )gpu_copy );
25392550 PARSEC_LIST_ITEM_SINGLETON (gpu_copy );
25402551 parsec_list_push_back (& gpu_device -> gpu_mem_lru , (parsec_list_item_t * )gpu_copy );
25412552 } else {
2553+ /* No need to detach the GPU copy: it does not belong to any list because it was owned by the task. */
25422554 PARSEC_DEBUG_VERBOSE (20 , parsec_gpu_output_stream ,
2543- "GPU copy %p [ref_count %d] moved to the owned LRU in %s" ,
2555+ "GPU[%d:%s]: %s: GPU copy %p [ref_count %d] moved to the owned LRU in %s" ,
2556+ gpu_device -> super .device_index , gpu_device -> super .name , task_str ,
25442557 gpu_copy , gpu_copy -> super .super .obj_reference_count , __func__ );
25452558 parsec_list_push_back (& gpu_device -> gpu_mem_owned_lru , (parsec_list_item_t * )gpu_copy );
25462559 }
0 commit comments