Skip to content

Commit 76bc47f

Browse files
committed
Always propagate the CPU copy to successors.
This disables communications from GPU memory, but it is necessary for proper tracking of reference counts on the data copies. At this point I don't think we can fiddle with the code to allow for device copy propagation; there are too many corner cases for that to be workable. Instead, we should rethink the entire data copy framework and allow tasks to fetch their inputs from devices as needed. Signed-off-by: George Bosilca <gbosilca@nvidia.com>
1 parent daf90eb commit 76bc47f

1 file changed

Lines changed: 23 additions & 10 deletions

File tree

parsec/mca/device/device_gpu.c

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2021-2024 The University of Tennessee and The University
44
* of Tennessee Research Foundation. All rights
55
* reserved.
6-
* Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
6+
* Copyright (c) 2024-2026 NVIDIA Corporation. All rights reserved.
77
*/
88

99
#include "parsec/parsec_config.h"
@@ -2457,11 +2457,11 @@ parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device,
24572457
int i;
24582458

24592459
#if defined(PARSEC_DEBUG_NOISIER)
2460-
char tmp[MAX_TASK_STRLEN];
2460+
char task_str[MAX_TASK_STRLEN];
2461+
(void)parsec_task_snprintf(task_str, MAX_TASK_STRLEN, this_task);
24612462
PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream,
24622463
"GPU[%d:%s]: Epilog of %s",
2463-
gpu_device->super.device_index, gpu_device->super.name,
2464-
parsec_task_snprintf(tmp, MAX_TASK_STRLEN, this_task) );
2464+
gpu_device->super.device_index, gpu_device->super.name, task_str);
24652465
#endif
24662466

24672467
for( i = 0; i < this_task->task_class->nb_flows; i++ ) {
@@ -2504,9 +2504,9 @@ parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device,
25042504
cpu_copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED;
25052505

25062506
cpu_copy->version = gpu_copy->version;
2507-
PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream,
2508-
"GPU[%d:%s]: CPU copy %p [ref_count %d] gets the same version %d as GPU copy %p [ref_count %d]",
2509-
gpu_device->super.device_index, gpu_device->super.name,
2507+
PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
2508+
"GPU[%d:%s]: %s: CPU copy %p [ref_count %d] gets the same version %d as GPU copy %p [ref_count %d]",
2509+
gpu_device->super.device_index, gpu_device->super.name, task_str,
25102510
cpu_copy, cpu_copy->super.super.obj_reference_count, cpu_copy->version, gpu_copy, gpu_copy->super.super.obj_reference_count);
25112511

25122512
/**
@@ -2528,19 +2528,32 @@ parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device,
25282528
gpu_copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED;
25292529
assert(PARSEC_DATA_STATUS_UNDER_TRANSFER == cpu_copy->data_transfer_status);
25302530
cpu_copy->data_transfer_status = PARSEC_DATA_STATUS_COMPLETE_TRANSFER;
2531-
if( 0 == (parsec_mpi_allow_gpu_memory_communications & PARSEC_RUNTIME_SEND_GPU_MEMORY) ) {
2531+
/* This condition should be checked more strictly. It is one thing to be able to send the
2532+
* data directly from the GPU via the communication engine, but if we have local successors
2533+
* that are not executing on the GPU (aka. the pushout has been set) we need to propagate
2534+
* the CPU copy instead.
2535+
*/
2536+
if( 1 || (0 == (parsec_mpi_allow_gpu_memory_communications & PARSEC_RUNTIME_SEND_GPU_MEMORY)) ) {
25322537
/* Report the CPU copy as the output of the task. */
25332538
this_task->data[i].data_out = cpu_copy;
2539+
PARSEC_DEBUG_VERBOSE(100, parsec_gpu_output_stream,
2540+
"GPU[%d:%s]: %s: GPU copy %p [ref_count %d] replaced by the CPU copy %p [ref_count %d] in %s",
2541+
gpu_device->super.device_index, gpu_device->super.name, task_str,
2542+
gpu_copy, gpu_copy->super.super.obj_reference_count,
2543+
cpu_copy, cpu_copy->super.super.obj_reference_count, __func__);
25342544
}
25352545
PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
2536-
"GPU copy %p [ref_count %d] moved to the read LRU in %s",
2546+
"GPU[%d:%s]: %s: GPU copy %p [ref_count %d] moved to the read LRU in %s",
2547+
gpu_device->super.device_index, gpu_device->super.name, task_str,
25372548
gpu_copy, gpu_copy->super.super.obj_reference_count, __func__);
25382549
parsec_list_item_ring_chop((parsec_list_item_t*)gpu_copy);
25392550
PARSEC_LIST_ITEM_SINGLETON(gpu_copy);
25402551
parsec_list_push_back(&gpu_device->gpu_mem_lru, (parsec_list_item_t*)gpu_copy);
25412552
} else {
2553+
/* No need to detach the GPU copy: it does not belong to any list because it was owned by the task */
25422554
PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
2543-
"GPU copy %p [ref_count %d] moved to the owned LRU in %s",
2555+
"GPU[%d:%s]: %s: GPU copy %p [ref_count %d] moved to the owned LRU in %s",
2556+
gpu_device->super.device_index, gpu_device->super.name, task_str,
25442557
gpu_copy, gpu_copy->super.super.obj_reference_count, __func__);
25452558
parsec_list_push_back(&gpu_device->gpu_mem_owned_lru, (parsec_list_item_t*)gpu_copy);
25462559
}

0 commit comments

Comments
 (0)