diff --git a/src/components/cuda/cupti_event_and_metric.c b/src/components/cuda/cupti_event_and_metric.c index 5498dd2f9..a9130914d 100644 --- a/src/components/cuda/cupti_event_and_metric.c +++ b/src/components/cuda/cupti_event_and_metric.c @@ -807,7 +807,7 @@ int cuptie_ctx_create(cuptic_info_t thr_info, cuptie_control_t *pstate, uint32_t CUcontext internalContext; cudaArtCheckErrors( cudaSetDevicePtr(native_event_info.device), return PAPI_EMISC ); cudaArtCheckErrors( cudaFreePtr(NULL), return PAPI_EMISC ); - cudaCheckErrors( cuCtxGetCurrentPtr(&internalContext), return PAPI_EMISC); + cudaCheckErrors( cuCtxGetCurrentPtr(&internalContext), return PAPI_EMISC); thr_info[native_event_info.device].ctx = internalContext; // Pop the context off so verify_user_added_event_or_metric functions properly cudaCheckErrors( cuCtxPopCurrentPtr(&internalContext), return PAPI_EMISC ); @@ -867,6 +867,12 @@ int cuptie_ctx_start(cuptie_control_t state) { SUBDBG("ENTERING: Setting up profiling for the Event and Metric APIs.\n"); + CUcontext currentUserContext; + cudaCheckErrors( cuCtxGetCurrentPtr(&currentUserContext), return PAPI_EMISC); + if (currentUserContext != NULL) { + cudaCheckErrors( cuCtxPopCurrentPtr(&currentUserContext), return PAPI_EMISC ); + } + int deviceIdx; for (deviceIdx = 0; deviceIdx < numDevicesOnMachine; deviceIdx++) { cuptie_gpu_state_t *gpu_ctl = &(state->gpu_ctl[deviceIdx]); @@ -874,6 +880,13 @@ int cuptie_ctx_start(cuptie_control_t state) continue; } + int papi_errno = cuptic_device_acquire(gpu_ctl->added_events, API_LEGACY); + if (papi_errno != PAPI_OK) { + SUBDBG("Profiling the same gpu from multiple event sets is not allowed.\n"); + return papi_errno; + } + + cudaCheckErrors( cuCtxSetCurrentPtr(state->info[deviceIdx].ctx), return PAPI_EMISC ); // Calculate the total number of user added events @@ -924,6 +937,10 @@ int cuptie_ctx_start(cuptie_control_t state) cuptiCheckErrors( cuCtxPopCurrentPtr(&state->info[deviceIdx].ctx), return PAPI_EMISC ); } + if 
(currentUserContext != NULL) { + cudaCheckErrors( cuCtxPushCurrentPtr(currentUserContext), return PAPI_EMISC ); + } + SUBDBG("EXITING: Profiling setup completed.\n"); return PAPI_OK; } @@ -941,6 +958,12 @@ int cuptie_ctx_read(cuptie_control_t state, long long **counterValues) { SUBDBG("ENTERING: Reading values for the Event and Metric APIs.\n"); + CUcontext currentUserContext; + cudaCheckErrors( cuCtxGetCurrentPtr(&currentUserContext), return PAPI_EMISC); + if (currentUserContext != NULL) { + cuptiCheckErrors( cuCtxPopCurrentPtr(&currentUserContext), return PAPI_EMISC ); + } + int numCountersRead = 0; long long *readCounterValues = state->counters; @@ -1117,6 +1140,10 @@ int cuptie_ctx_read(cuptie_control_t state, long long **counterValues) state->read_count = numCountersRead; *counterValues = readCounterValues; + if (currentUserContext != NULL) { + cuptiCheckErrors( cuCtxPushCurrentPtr(currentUserContext), return PAPI_EMISC ); + } + SUBDBG("EXITING: Reading values completed.\n"); return PAPI_OK; } @@ -1131,6 +1158,12 @@ int cuptie_ctx_stop(cuptie_control_t state) { SUBDBG("ENTERING: Disabling and destroying the event group sets created. 
Collection of events will be stopped.\n"); + CUcontext currentUserContext; + cudaCheckErrors( cuCtxGetCurrentPtr(&currentUserContext), return PAPI_EMISC); + if (currentUserContext != NULL) { + cudaCheckErrors( cuCtxPopCurrentPtr(&currentUserContext), return PAPI_EMISC ); + } + int deviceIdx; for (deviceIdx = 0; deviceIdx < numDevicesOnMachine; deviceIdx++) { cuptie_gpu_state_t *gpu_ctl = &(state->gpu_ctl[deviceIdx]); @@ -1146,9 +1179,18 @@ int cuptie_ctx_stop(cuptie_control_t state) cuptiCheckErrors( cuptiEventGroupSetDisablePtr(eventGroupSet), return PAPI_EMISC ); cuptiCheckErrors( cuptiEventGroupSetsDestroyPtr(eventGroupSets), return PAPI_EMISC ); + int papi_errno = cuptic_device_release(gpu_ctl->added_events, API_LEGACY); + if (papi_errno != PAPI_OK) { + return papi_errno; + } + cudaCheckErrors( cuCtxPopCurrentPtr(&state->info[deviceIdx].ctx), return PAPI_EMISC ); } + if (currentUserContext != NULL) { + cudaCheckErrors( cuCtxPushCurrentPtr(currentUserContext), return PAPI_EMISC ); + } + SUBDBG("EXITING: Disabling event group sets completed.\n"); return PAPI_OK; } @@ -1164,6 +1206,12 @@ int cuptie_ctx_reset(cuptie_control_t state) { SUBDBG("ENTERING: Resetting counter values.\n"); + CUcontext currentUserContext; + cudaCheckErrors( cuCtxGetCurrentPtr(&currentUserContext), return PAPI_EMISC); + if (currentUserContext != NULL) { + cudaCheckErrors( cuCtxPopCurrentPtr(&currentUserContext), return PAPI_EMISC ); + } + int counterIdx; for (counterIdx = 0; counterIdx < state->read_count; counterIdx++) { state->counters[counterIdx] = 0; @@ -1192,6 +1240,10 @@ int cuptie_ctx_reset(cuptie_control_t state) cudaCheckErrors( cuCtxPopCurrentPtr(&state->info[deviceIdx].ctx), return PAPI_EMISC ); } + if (currentUserContext != NULL) { + cudaCheckErrors( cuCtxPushCurrentPtr(currentUserContext), return PAPI_EMISC ); + } + SUBDBG("EXITING: Resetting counter values completed.\n"); return PAPI_OK; } @@ -1425,11 +1477,16 @@ static int verify_user_added_event_or_metric(uint32_t *events_id, int num_events } 
totalNumberOfUserAddedEvents++; state->gpu_ctl[native_event_info.device].added_events->totalNumberOfUserAddedNativeEvents = totalNumberOfUserAddedEvents; + // For a specific device table, get the current event index + int idx = state->gpu_ctl[native_event_info.device].added_events->count; + state->gpu_ctl[native_event_info.device].added_events->cuda_devs[idx] = native_event_info.device; + state->gpu_ctl[native_event_info.device].added_events->count++; // Pop off the set context cudaCheckErrors( cuCtxPopCurrentPtr(&thr_info[native_event_info.device].ctx), return PAPI_EMISC ); } + SUBDBG("EXITING: Checking user added a valid event completed.\n"); return PAPI_OK; } @@ -1491,6 +1548,7 @@ static int create_event_and_metric_table(int totalNumberOfEntries, cuptiu_event_ goto fn_fail; } + eventTable->count = 0; eventTable->capacity = totalNumberOfEntries; eventTable->startTimeStampNs = 0; eventTable->totalNumberOfUserAddedNativeEvents = 0; diff --git a/src/components/cuda/cupti_event_and_metric.h b/src/components/cuda/cupti_event_and_metric.h index 1db3699dd..d1e301673 100644 --- a/src/components/cuda/cupti_event_and_metric.h +++ b/src/components/cuda/cupti_event_and_metric.h @@ -56,6 +56,7 @@ typedef struct event_and_metric_record_s { typedef struct event_and_metric_table_s { unsigned int count; unsigned int capacity; + int cuda_devs[30]; CUpti_EventGroupSets *eventGroupSets; CUpti_MetricID metricIDs[PAPI_CUDA_MAX_COUNTERS]; int *idsThatMakeupAUserAddedEventArray[PAPI_CUDA_MAX_COUNTERS]; diff --git a/src/components/cuda/cupti_profiler.c b/src/components/cuda/cupti_profiler.c index c66203e19..16f27bb0e 100644 --- a/src/components/cuda/cupti_profiler.c +++ b/src/components/cuda/cupti_profiler.c @@ -817,7 +817,7 @@ int cuptip_ctx_start(cuptip_control_t state) } LOGDBG("Device num %d: event_count %d, rmr count %d\n", dev_id, gpu_ctl->added_events->count, gpu_ctl->numberOfRawMetricRequests); - papi_errno = cuptic_device_acquire(state->gpu_ctl[dev_id].added_events); + 
papi_errno = cuptic_device_acquire(state->gpu_ctl[dev_id].added_events, API_PERFWORKS); if (papi_errno != PAPI_OK) { ERRDBG("Profiling same gpu from multiple event sets not allowed.\n"); return papi_errno; @@ -1134,7 +1134,7 @@ int cuptip_ctx_stop(cuptip_control_t state) return papi_errno; } - papi_errno = cuptic_device_release(state->gpu_ctl[dev_id].added_events); + papi_errno = cuptic_device_release(state->gpu_ctl[dev_id].added_events, API_PERFWORKS); if (papi_errno != PAPI_OK) { return papi_errno; } diff --git a/src/components/cuda/papi_cupti_common.c b/src/components/cuda/papi_cupti_common.c index 921783d58..75b17e462 100644 --- a/src/components/cuda/papi_cupti_common.c +++ b/src/components/cuda/papi_cupti_common.c @@ -15,6 +15,7 @@ #include "cupti_config.h" #include "papi_cupti_common.h" +#include "cupti_event_and_metric.h" static void *dl_drv, *dl_rt; @@ -848,47 +849,77 @@ int cuptic_ctxarr_destroy(cuptic_info_t *pinfo) return PAPI_OK; } -int _devmask_events_get(cuptiu_event_table_t *evt_table, gpu_occupancy_t *bitmask) +int cuptic_device_acquire(void *evt_table, int flag) { - gpu_occupancy_t acq_mask = 0; - long i; - for (i = 0; i < evt_table->count; i++) { - acq_mask |= (1 << evt_table->cuda_devs[i]); + int i; + gpu_occupancy_t bitmask = 0; + switch(flag) { + case API_LEGACY: + { + cuptiu_event_and_metric_table_t *legacy_evt_table = (cuptiu_event_and_metric_table_t *) evt_table; + for (i = 0; i < legacy_evt_table->count; i++) { + bitmask |= (1 << legacy_evt_table->cuda_devs[i]); + } + break; + } + case API_PERFWORKS: + { + cuptiu_event_table_t *perfworks_evt_table = (cuptiu_event_table_t *) evt_table; + for (i = 0; i < perfworks_evt_table->count; i++) { + bitmask |= (1 << perfworks_evt_table->cuda_devs[i]); + } + break; + } + default: + SUBDBG("Provided flag is not accounted for in this switch statement. 
Code needs to be updated.\n"); + return PAPI_EBUG; } - *bitmask = acq_mask; - return PAPI_OK; -} - -int cuptic_device_acquire(cuptiu_event_table_t *evt_table) -{ - gpu_occupancy_t bitmask; - int papi_errno = _devmask_events_get(evt_table, &bitmask); - if (papi_errno != PAPI_OK) { - return papi_errno; - } if (bitmask & global_gpu_bitmask) { return PAPI_ECNFLCT; } + _papi_hwi_lock(_cuda_lock); global_gpu_bitmask |= bitmask; _papi_hwi_unlock(_cuda_lock); + return PAPI_OK; } -int cuptic_device_release(cuptiu_event_table_t *evt_table) +int cuptic_device_release(void *evt_table, int flag) { - gpu_occupancy_t bitmask; - int papi_errno = _devmask_events_get(evt_table, &bitmask); - if (papi_errno != PAPI_OK) { - return papi_errno; + int i; + gpu_occupancy_t bitmask = 0; + switch(flag) { + case API_LEGACY: + { + cuptiu_event_and_metric_table_t *legacy_evt_table = (cuptiu_event_and_metric_table_t *) evt_table; + for (i = 0; i < legacy_evt_table->count; i++) { + bitmask |= (1 << legacy_evt_table->cuda_devs[i]); + } + break; + } + case API_PERFWORKS: + { + cuptiu_event_table_t *perfworks_evt_table = (cuptiu_event_table_t *) evt_table; + for (i = 0; i < perfworks_evt_table->count; i++) { + bitmask |= (1 << perfworks_evt_table->cuda_devs[i]); + } + break; + } + default: + SUBDBG("Provided flag is not accounted for in this switch statement. 
Code needs to be updated.\n"); + return PAPI_EBUG; } + if ((bitmask & global_gpu_bitmask) != bitmask) { return PAPI_EMISC; } + _papi_hwi_lock(_cuda_lock); global_gpu_bitmask ^= bitmask; _papi_hwi_unlock(_cuda_lock); + return PAPI_OK; } diff --git a/src/components/cuda/papi_cupti_common.h b/src/components/cuda/papi_cupti_common.h index 5d325ea3a..e7d857b21 100644 --- a/src/components/cuda/papi_cupti_common.h +++ b/src/components/cuda/papi_cupti_common.h @@ -76,8 +76,8 @@ int cuptic_ctxarr_get_ctx(cuptic_info_t info, int dev_id, CUcontext *ctx); int cuptic_ctxarr_destroy(cuptic_info_t *pinfo); /* functions to track the occupancy of gpu counters in event sets */ -int cuptic_device_acquire(cuptiu_event_table_t *evt_table); -int cuptic_device_release(cuptiu_event_table_t *evt_table); +int cuptic_device_acquire(void *evt_table, int flag); +int cuptic_device_release(void *evt_table, int flag); /* device qualifier interfaces */ int cuptiu_dev_set(cuptiu_bitmap_t *bitmap, int i); diff --git a/src/components/cuda/tests/HelloWorld.cu b/src/components/cuda/tests/HelloWorld.cu index 77b7f69b0..2982f6a51 100644 --- a/src/components/cuda/tests/HelloWorld.cu +++ b/src/components/cuda/tests/HelloWorld.cu @@ -1,399 +1,318 @@ -/****************************/ -/* THIS IS OPEN SOURCE CODE */ -/****************************/ - /** - * @file HelloWorld.cu - * @author Heike Jagode - * jagode@eecs.utk.edu - * Mods: Anustuv Pal - * anustuv@icl.utk.edu - * Mods: - * - * test case for Example component - * - * - * @brief - * This file is a very simple HelloWorld C example which serves (together - * with its Makefile) as a guideline on how to add tests to components. - * The papi configure and papi Makefile will take care of the compilation - * of the component tests (if all tests are added to a directory named - * 'tests' in the specific component dir). - * See components/README for more details. - * - * The string "Hello World!" is mangled and then restored. 
- * - * CUDA Context notes for CUPTI_11: Although a cudaSetDevice() will create a - * primary context for the device that allows kernel execution; PAPI cannot - * use a primary context to control the Nvidia Performance Profiler. - * Applications must create a context using cuCtxCreate() that will execute - * the kernel, this must be done prior to the PAPI_add_events() invocation in - * the code below. If multiple GPUs are in use, each requires its own context, - * and that context should be active when PAPI_events are added for each - * device. Which means using Seperate PAPI_add_events() for each device. For - * an example see simpleMultiGPU.cu. - * - * There are three points below where cuCtxCreate() is called, this code works - * if any one of them is used alone. - */ +* @file HelloWorld.cu +* @brief This test serves as a very simple hello world c example where the string +* "Hello World!" is mangled and then restored. cuCtxCreate is used for context +* creation. +* +* Note: The cuda component supports being partially disabled, meaning that certain devices +* will not be "enabled" to profile on. If PAPI_CUDA_API is not set, then devices with +* CC's >= 7.0 will be used and if PAPI_CUDA_API is set to LEGACY then devices with +* CC's <= 7.0 will be used. +*/ -#include +// Standard library headers #include #include +#include -#ifdef PAPI +// Cuda Toolkit headers +#include + +// Internal headers +#include "cuda_tests_helper.h" #include "papi.h" #include "papi_test.h" -#endif -#define STEP_BY_STEP_DEBUG 0 /* helps debug CUcontext issues. */ -#define PRINT(quiet, format, args...) 
{if (!quiet) {fprintf(stderr, format, ## args);}} +// Aid in debugging Cuda contexts +#define STEP_BY_STEP_DEBUG 0 -// Device kernel -__global__ void helloWorld(char* str) +static void print_help_message(void) { - // determine where in the thread grid we are - int idx = blockIdx.x * blockDim.x + threadIdx.x; - // unmangle output - str[idx] += idx; + printf("./HelloWorld --device [nvidia device index] --cuda-native-event-names [list of cuda native event names separated by a comma].\n" + "Notes:\n" + "1. The device index must match the device qualifier if provided.\n"); } -/** @class add_events_from_command_line - * @brief Try and add each event provided on the command line by the user. - * - * @param EventSet - * A PAPI eventset. - * @param totalEventCount - * Number of events from the command line. - * @param **eventNamesFromCommandLine - * Events provided on the command line. - * @param *numEventsSuccessfullyAdded - * Total number of successfully added events. - * @param **eventsSuccessfullyAdded - * Events that we are able to add to the EventSet. - * @param *numMultipassEvents - * Counter to see if a multiple pass event was provided on the command line. 
-*/ -static void add_events_from_command_line(int EventSet, int totalEventCount, char **eventNamesFromCommandLine, int *numEventsSuccessfullyAdded, char **eventsSuccessfullyAdded, int *numMultipassEvents) +static void parse_and_assign_args(int argc, char *argv[], int *device_index, char ***cuda_native_event_names, int *total_event_count) { - int i; - for (i = 0; i < totalEventCount; i++) { - int papi_errno = PAPI_add_named_event(EventSet, eventNamesFromCommandLine[i]); - if (papi_errno != PAPI_OK) { - if (papi_errno != PAPI_EMULPASS) { - fprintf(stderr, "Unable to add event %s to the EventSet with error code %d.\n", eventNamesFromCommandLine[i], papi_errno); - test_skip(__FILE__, __LINE__, "", 0); + int num_device_indices = 0, *event_device_indices = NULL; + int i, device_arg_found = 0, cuda_native_event_name_arg_found = 0; + for (i = 1; i < argc; ++i) + { + char *arg = argv[i]; + if (strcmp(arg, "--help") == 0) + { + print_help_message(); + exit(EXIT_SUCCESS); + } + else if (strcmp(arg, "--device") == 0) + { + if (!argv[i + 1]) + { + printf("ERROR!! Add a nvidia device index.\n"); + exit(EXIT_FAILURE); + } + *device_index = atoi(argv[i + 1]); + device_arg_found++; + i++; + } + else if (strcmp(arg, "--cuda-native-event-names") == 0) + { + if (!argv[i + 1]) + { + printf("ERROR!! 
--cuda-native-event-names given, but no events listed.\n"); + exit(EXIT_FAILURE); } - // Handle multiple pass events - (*numMultipassEvents)++; - continue; + char **cmd_line_native_event_names = NULL; + const char *cuda_native_event_name = strtok(argv[i+1], ","); + while (cuda_native_event_name != NULL) + { + const char *device_substring = strstr(cuda_native_event_name, ":device="); + if (device_substring != NULL) { + event_device_indices = (int *) realloc(event_device_indices, (num_device_indices + 1) * sizeof(int)); + event_device_indices[num_device_indices++] = atoi(device_substring + strlen(":device=")); + } + + cmd_line_native_event_names = (char **) realloc(cmd_line_native_event_names, ((*total_event_count) + 1) * sizeof(char *)); + check_memory_allocation_call(cmd_line_native_event_names); + + cmd_line_native_event_names[(*total_event_count)] = (char *) malloc(PAPI_2MAX_STR_LEN * sizeof(char)); + check_memory_allocation_call(cmd_line_native_event_names[(*total_event_count)]); + + int strLen = snprintf(cmd_line_native_event_names[(*total_event_count)], PAPI_2MAX_STR_LEN, "%s", cuda_native_event_name); + if (strLen < 0 || strLen >= PAPI_2MAX_STR_LEN) { + fprintf(stderr, "Failed to fully write cuda native event name.\n"); + exit(EXIT_FAILURE); + } + + (*total_event_count)++; + cuda_native_event_name_arg_found++; + cuda_native_event_name = strtok(NULL, ","); + } + i++; + *cuda_native_event_names = cmd_line_native_event_names; + } + else + { + print_help_message(); + exit(EXIT_FAILURE); } + } - // Handle successfully added events - int strLen = snprintf(eventsSuccessfullyAdded[(*numEventsSuccessfullyAdded)], PAPI_MAX_STR_LEN, "%s", eventNamesFromCommandLine[i]); - if (strLen < 0 || strLen >= PAPI_MAX_STR_LEN) { - fprintf(stderr, "Failed to fully write successfully added event.\n"); - test_skip(__FILE__, __LINE__, "", 0); + if (device_arg_found == 0 || cuda_native_event_name_arg_found == 0) { + fprintf(stderr, "You must use both the --device arg and 
--cuda-native-event-names arg in conjunction.\n"); + exit(EXIT_FAILURE); + } + + for (i = 0; i < num_device_indices; i++) { + if ((*device_index) != event_device_indices[i]) { + fprintf(stderr, "The device qualifier index %d does not match the index %d provided by --device.\n", event_device_indices[i], *device_index); + exit(EXIT_FAILURE); } - (*numEventsSuccessfullyAdded)++; } + free(event_device_indices); +} - return; +// Device kernel +__global__ void helloWorld(char* str) +{ + // determine where in the thread grid we are + int idx = blockIdx.x * blockDim.x + threadIdx.x; + // unmangle output + str[idx] += idx; } // Host function int main(int argc, char** argv) { - int quiet = 0; - CUcontext getCtx=NULL, sessionCtx=NULL; - cudaError_t cudaError; - CUresult cuError; (void) cuError; - - cuError = cuInit(0); - if (cuError != CUDA_SUCCESS) { - fprintf(stderr, "Failed to initialize the CUDA driver API.\n"); - exit(1); - } - -#ifdef PAPI - char *test_quiet = getenv("PAPI_CUDA_TEST_QUIET"); - if (test_quiet) - quiet = (int) strtol(test_quiet, (char**) NULL, 10); + check_cuda_driver_api_call( cuInit(0) ); - /* PAPI Initialization */ - int papi_errno = PAPI_library_init( PAPI_VER_CURRENT ); - if( papi_errno != PAPI_VER_CURRENT ) { - test_fail(__FILE__,__LINE__, "PAPI_library_init failed", 0 ); + // Determine the number of Cuda capable devices + int num_devices = 0; + check_cuda_runtime_api_call( cudaGetDeviceCount(&num_devices) ); + // No devices detected on the machine, exit + if (num_devices < 1) { + fprintf(stderr, "No NVIDIA devices found on the machine. This is required for the test to run.\n"); + exit(EXIT_FAILURE); } - printf( "PAPI_VERSION : %4d %6d %7d\n", - PAPI_VERSION_MAJOR( PAPI_VERSION ), - PAPI_VERSION_MINOR( PAPI_VERSION ), - PAPI_VERSION_REVISION( PAPI_VERSION ) ); - - int i; - int EventSet = PAPI_NULL; - int eventCount = argc - 1; - - /* if no events passed at command line, just report test skipped. 
*/ - if (eventCount == 0) { - fprintf(stderr, "No eventnames specified at command line."); - test_skip(__FILE__, __LINE__, "", 0); + int suppress_output = 0; + char *user_defined_suppress_output = getenv("PAPI_CUDA_TEST_QUIET"); + if (user_defined_suppress_output) { + suppress_output = (int) strtol(user_defined_suppress_output, (char**) NULL, 10); } + PRINT(suppress_output, "Running the cuda component test HelloWorld.cu\n"); - long long *values = (long long *) calloc(eventCount, sizeof (long long)); - if (values == NULL) { - test_fail(__FILE__, __LINE__, "Failed to allocate memory for values.\n", 0); + int cuda_device_index = -1; + char **cuda_native_event_names = NULL; + // If command line arguments are provided then get their values. + int total_event_count = 0; + if (argc > 1) { + parse_and_assign_args(argc, argv, &cuda_device_index, &cuda_native_event_names, &total_event_count); } - int *events = (int *) calloc(eventCount, sizeof (int)); - if (events == NULL) { - test_fail(__FILE__, __LINE__, "Failed to allocate memory for events.\n", 0); + // Initialize the PAPI library + int papi_errno = PAPI_library_init( PAPI_VER_CURRENT ); + if( papi_errno != PAPI_VER_CURRENT ) { + test_fail(__FILE__,__LINE__, "PAPI_library_init()", papi_errno); } + PRINT(suppress_output, "PAPI version being used for this test: %d.%d.%d\n", + PAPI_VERSION_MAJOR(PAPI_VERSION), + PAPI_VERSION_MINOR(PAPI_VERSION), + PAPI_VERSION_REVISION(PAPI_VERSION)); - if (STEP_BY_STEP_DEBUG) { - cuCtxGetCurrent(&getCtx); - fprintf(stderr, "%s:%s:%i before PAPI_create_eventset() getCtx=%p.\n", __FILE__, __func__, __LINE__, getCtx); + int cuda_cmp_idx = PAPI_get_component_index("cuda"); + if (cuda_cmp_idx < 0) { + test_fail(__FILE__, __LINE__, "PAPI_get_component_index()", cuda_cmp_idx); } + PRINT(suppress_output, "The cuda component is assigned to component index: %d\n", cuda_cmp_idx); - papi_errno = PAPI_create_eventset( &EventSet ); - if( papi_errno != PAPI_OK ) { - test_fail(__FILE__,__LINE__,"Cannot 
create eventset",papi_errno); + // If a user does not provide an event or events, then we go get an event to add + if (total_event_count == 0) { + enumerate_and_store_cuda_native_events(&cuda_native_event_names, &total_event_count, &cuda_device_index); } - if (STEP_BY_STEP_DEBUG) { - cuCtxGetCurrent(&getCtx); - fprintf(stderr, "%s:%s:%i after PAPI_create_eventset() getCtx=%p.\n", __FILE__, __func__, __LINE__, getCtx); - } + int EventSet = PAPI_NULL; + check_papi_api_call( PAPI_create_eventset( &EventSet ) ); // If multiple GPUs/contexts were being used, you'd need to // create contexts for each device. See, for example, // simpleMultiGPU.cu. + CUcontext sessionCtx = NULL; int flags = 0; - CUdevice device = 0; + CUdevice device = cuda_device_index; #if defined(CUDA_TOOLKIT_GE_13) - cuError = cuCtxCreate(&sessionCtx, (CUctxCreateParams*)0, flags, device); - if (cuError != CUDA_SUCCESS) { - fprintf(stderr, "Failed to create Cuda context for a Cuda Toolkit version >= 13: %d\n", cuError); - exit(1); - } + check_cuda_driver_api_call( cuCtxCreate(&sessionCtx, (CUctxCreateParams*)0, flags, device) ); #else - cuError = cuCtxCreate(&sessionCtx, flags, device); - if (cuError != CUDA_SUCCESS) { - fprintf(stderr, "Failed to create Cuda context for a Cuda Toolkit version < 13: %d\n", cuError); - exit(1); - } + check_cuda_driver_api_call( cuCtxCreate(&sessionCtx, flags, device) ); #endif + CUcontext getCtx; if (STEP_BY_STEP_DEBUG) { - cuCtxGetCurrent(&getCtx); - fprintf(stderr, "%s:%s:%i after cuCtxCreate(&sessionCtx), about to PAPI_start(), sessionCtx=%p, getCtx=%p.\n", __FILE__, __func__, __LINE__, sessionCtx, getCtx); + check_cuda_driver_api_call( cuCtxGetCurrent(&getCtx) ); + fprintf(stderr, "Address of Cuda context after call to cuCtxCreate is %p\n", getCtx); } - // Handle the events from the command line - int numEventsSuccessfullyAdded = 0, numMultipassEvents = 0; - char **eventsSuccessfullyAdded, **metricNames = argv + 1; - eventsSuccessfullyAdded = (char **) 
malloc(eventCount * sizeof(char *)); - if (eventsSuccessfullyAdded == NULL) { - fprintf(stderr, "Failed to allocate memory for successfully added events.\n"); - test_skip(__FILE__, __LINE__, "", 0); - } - for (i = 0; i < eventCount; i++) { - eventsSuccessfullyAdded[i] = (char *) malloc(PAPI_MAX_STR_LEN * sizeof(char)); - if (eventsSuccessfullyAdded[i] == NULL) { - fprintf(stderr, "Failed to allocate memory for command line argument.\n"); - test_skip(__FILE__, __LINE__, "", 0); - } - } + int num_events_successfully_added = 0, numMultipassEvents = 0; + char **events_successfully_added = (char **) malloc(total_event_count * sizeof(char *)); + check_memory_allocation_call( events_successfully_added ); + + int event_idx; + for (event_idx = 0; event_idx < total_event_count; event_idx++) { + events_successfully_added[event_idx] = (char *) malloc(PAPI_MAX_STR_LEN * sizeof(char)); + check_memory_allocation_call( events_successfully_added[event_idx] ); - add_events_from_command_line(EventSet, eventCount, metricNames, &numEventsSuccessfullyAdded, eventsSuccessfullyAdded, &numMultipassEvents); + add_cuda_native_events(EventSet, cuda_native_event_names[event_idx], &num_events_successfully_added, events_successfully_added, &numMultipassEvents); + } // Only multiple pass events were provided on the command line - if (numEventsSuccessfullyAdded == 0) { + if (num_events_successfully_added == 0) { fprintf(stderr, "Events provided on the command line could not be added to an EventSet as they require multiple passes.\n"); - test_skip(__FILE__, __LINE__, "", 0); + exit(EXIT_FAILURE); } if (STEP_BY_STEP_DEBUG) { - cuCtxGetCurrent(&getCtx); - fprintf(stderr, "%s:%s:%i before PAPI_start(), getCtx=%p.\n", __FILE__, __func__, __LINE__, getCtx); + check_cuda_driver_api_call( cuCtxGetCurrent(&getCtx) ); + fprintf(stderr, "Address of Cuda context after events have been added is %p\n", getCtx); } - papi_errno = PAPI_start( EventSet ); - if( papi_errno != PAPI_OK ) { - test_fail(__FILE__, 
__LINE__, "PAPI_start failed.", papi_errno); - } + check_papi_api_call( PAPI_start(EventSet) ); if (STEP_BY_STEP_DEBUG) { - cuCtxGetCurrent(&getCtx); - fprintf(stderr, "%s:%s:%i after PAPI_start(), getCtx=%p.\n", __FILE__, __func__, __LINE__, getCtx); + check_cuda_driver_api_call( cuCtxGetCurrent(&getCtx) ); + fprintf(stderr, "Address of Cuda context after call to PAPI_start is %p\n", getCtx); } -#endif - - int j; - - // desired output + // Mangle contents of output + // The null character is left intact for simplicity char str[] = "Hello World!"; - - // mangle contents of output - // the null character is left intact for simplicity - for(j = 0; j < 12; j++) { - str[j] -= j; + int i; + for (i = 0; i < strlen(str); i++) { + str[i] -= i; } + PRINT(suppress_output, "mangled str=%s\n", str); - PRINT(quiet, "mangled str=%s\n", str); - - // allocate memory on the device + // Allocate memory on the device char *d_str; size_t size = sizeof(str); - cudaMalloc((void**)&d_str, size); + check_cuda_runtime_api_call( cudaMalloc((void**)&d_str, size) ); + check_memory_allocation_call( d_str ); - if (STEP_BY_STEP_DEBUG) { - cuCtxGetCurrent(&getCtx); - fprintf(stderr, "%s:%s:%i after cudaMalloc() getCtx=%p.\n", __FILE__, __func__, __LINE__, getCtx); - } - - // copy the string to the device - cudaMemcpy(d_str, str, size, cudaMemcpyHostToDevice); - - if (STEP_BY_STEP_DEBUG) { - cuCtxGetCurrent(&getCtx); - fprintf(stderr, "%s:%s:%i after cudaMemcpy(ToDevice) getCtx=%p.\n", __FILE__, __func__, __LINE__, getCtx); - } + // Copy the string to the device + check_cuda_runtime_api_call( cudaMemcpy(d_str, str, size, cudaMemcpyHostToDevice) ); - // set the grid and block sizes - dim3 dimGrid(2); // one block per word - dim3 dimBlock(6); // one thread per character + // Set the grid and block sizes + dim3 dimGrid(2); // One block per word + dim3 dimBlock(6); // One thread per character - // invoke the kernel + // Invoke the kernel helloWorld<<< dimGrid, dimBlock >>>(d_str); + 
check_cuda_runtime_api_call( cudaGetLastError() ); - cudaError = cudaGetLastError(); - if (STEP_BY_STEP_DEBUG) { - fprintf(stderr, "%s:%s:%i Kernel Return Code: %s.\n", __FILE__, __func__, __LINE__, cudaGetErrorString(cudaError)); - } - - if (STEP_BY_STEP_DEBUG) { - cuCtxGetCurrent(&getCtx); - fprintf(stderr, "%s:%s:%i After Kernel Execution: getCtx=%p.\n", __FILE__, __func__, __LINE__, getCtx); - } - - // retrieve the results from the device - cudaMemcpy(str, d_str, size, cudaMemcpyDeviceToHost); - - if (STEP_BY_STEP_DEBUG) { - cuCtxGetCurrent(&getCtx); - fprintf(stderr, "%s:%s:%i after cudaMemcpy(ToHost) getCtx=%p.\n", __FILE__, __func__, __LINE__, getCtx); - } + // Retrieve the results from the device + check_cuda_runtime_api_call( cudaMemcpy(str, d_str, size, cudaMemcpyDeviceToHost) ); // free up the allocated memory on the device - cudaFree(d_str); + check_cuda_runtime_api_call( cudaFree(d_str) ); - if (STEP_BY_STEP_DEBUG) { - cuCtxGetCurrent(&getCtx); - fprintf(stderr, "%s:%s:%i after cudaFree() getCtx=%p.\n", __FILE__, __func__, __LINE__, getCtx); - } + long long *cuda_counter_values = (long long *) calloc(total_event_count, sizeof (long long)); + check_memory_allocation_call(cuda_counter_values); -#ifdef PAPI - papi_errno = PAPI_read( EventSet, values ); - if( papi_errno != PAPI_OK ) { - test_fail(__FILE__, __LINE__, "PAPI_read failed", papi_errno); + check_papi_api_call( PAPI_read(EventSet, cuda_counter_values) ); + for (event_idx = 0; event_idx < num_events_successfully_added; event_idx++ ) { + PRINT(suppress_output, "After PAPI_read, the event %s produced the value: \t\t%lld\n", events_successfully_added[event_idx], cuda_counter_values[event_idx]); } if (STEP_BY_STEP_DEBUG) { - cuCtxGetCurrent(&getCtx); - fprintf(stderr, "%s:%s:%i after PAPI_read getCtx=%p.\n", __FILE__, __func__, __LINE__, getCtx); - } - - for( i = 0; i < numEventsSuccessfullyAdded; i++ ) { - PRINT( quiet, "read: %12lld \t=0X%016llX \t\t --> %s \n", values[i], values[i], 
eventsSuccessfullyAdded[i] ); + check_cuda_driver_api_call( cuCtxGetCurrent(&getCtx) ); + fprintf(stderr, "Address of Cuda context after call to PAPI_read is %p\n", getCtx); } - papi_errno = cuCtxPopCurrent(&getCtx); - if( papi_errno != CUDA_SUCCESS) { - fprintf( stderr, "cuCtxPopCurrent failed, papi_errno=%d (%s)\n", papi_errno, PAPI_strerror(papi_errno) ); - exit(1); + check_papi_api_call( PAPI_stop(EventSet, cuda_counter_values) ); + for (event_idx = 0; event_idx < num_events_successfully_added; event_idx++ ) { + PRINT(suppress_output, "After PAPI_stop, the event %s produced the value: \t\t%lld\n", events_successfully_added[event_idx], cuda_counter_values[event_idx]); } if (STEP_BY_STEP_DEBUG) { - cuCtxGetCurrent(&getCtx); - fprintf(stderr, "%s:%s:%i after cuCtxPopCurrent() getCtx=%p.\n", __FILE__, __func__, __LINE__, getCtx); + check_cuda_driver_api_call( cuCtxGetCurrent(&getCtx) ); + fprintf(stderr, "Address of Cuda context after call to PAPI_stop is %p\n", getCtx); } - papi_errno = PAPI_stop( EventSet, values ); - if( papi_errno != PAPI_OK ) { - test_fail(__FILE__, __LINE__, "PAPI_stop failed", papi_errno); - } + check_papi_api_call( PAPI_cleanup_eventset(EventSet) ); - if (STEP_BY_STEP_DEBUG) { - cuCtxGetCurrent(&getCtx); - fprintf(stderr, "%s:%s:%i after PAPI_stop getCtx=%p.\n", __FILE__, __func__, __LINE__, getCtx); - } - - papi_errno = PAPI_cleanup_eventset(EventSet); - if( papi_errno != PAPI_OK ) { - test_fail(__FILE__, __LINE__, "PAPI_cleanup_eventset failed", papi_errno); - } + check_papi_api_call( PAPI_destroy_eventset(&EventSet) ); if (STEP_BY_STEP_DEBUG) { - cuCtxGetCurrent(&getCtx); - fprintf(stderr, "%s:%s:%i after PAPI_cleanup_eventset getCtx=%p.\n", __FILE__, __func__, __LINE__, getCtx); - } - papi_errno = PAPI_destroy_eventset(&EventSet); - if (papi_errno != PAPI_OK) { - test_fail(__FILE__, __LINE__, "PAPI_destroy_eventset failed", papi_errno); - } - - if (STEP_BY_STEP_DEBUG) { - cuCtxGetCurrent(&getCtx); - fprintf(stderr, "%s:%s:%i after 
PAPI_destroy_eventset getCtx=%p.\n", __FILE__, __func__, __LINE__, getCtx); - } - - for( i = 0; i < numEventsSuccessfullyAdded; i++ ) { - PRINT( quiet, "stop: %12lld \t=0X%016llX \t\t --> %s \n", values[i], values[i], eventsSuccessfullyAdded[i] ); - } -#endif - - if (STEP_BY_STEP_DEBUG) { fprintf(stderr, "%s:%s:%i before cuCtxDestroy sessionCtx=%p.\n", __FILE__, __func__, __LINE__, sessionCtx); } - // Test destroying the session Context. - if (sessionCtx != NULL) { - cuCtxDestroy(sessionCtx); - } + // Destroy the context used for this test + check_cuda_driver_api_call( cuCtxDestroy(sessionCtx) ); - if (STEP_BY_STEP_DEBUG) { - cuCtxGetCurrent(&getCtx); - fprintf(stderr, "%s:%s:%i after cuCtxDestroy(%p) getCtx=%p.\n", __FILE__, __func__, __LINE__, sessionCtx, getCtx); + // Free allocated memory + free(cuda_counter_values); + + for (event_idx = 0; event_idx < total_event_count; event_idx++) { + free(cuda_native_event_names[event_idx]); } + free(cuda_native_event_names); - // Free allocated memory - free(values); - free(events); - for (i = 0; i < eventCount; i++) { - free(eventsSuccessfullyAdded[i]); + for (event_idx = 0; event_idx < num_events_successfully_added; event_idx++) { + free(events_successfully_added[event_idx]); } - free(eventsSuccessfullyAdded); + free(events_successfully_added); -#ifdef PAPI PAPI_shutdown(); - if (STEP_BY_STEP_DEBUG) { - cuCtxGetCurrent(&getCtx); - fprintf(stderr, "%s:%s:%i after PAPI_shutdown getCtx=%p.\n", __FILE__, __func__, __LINE__, getCtx); - } - // Output a note that a multiple pass event was provided on the command line if (numMultipassEvents > 0) { - PRINT(quiet, "\033[0;33mNOTE: From the events provided on the command line, an event or events requiring multiple passes was detected and not added to the EventSet. 
Check your events with utils/papi_native_avail.\n\033[0m"); + PRINT(suppress_output, "\033[0;33mNOTE: From the events provided on the command line, an event or events requiring multiple passes was detected and not added to the EventSet. Check your events with utils/papi_native_avail.\n\033[0m"); } test_pass(__FILE__); -#endif + return 0; } diff --git a/src/components/cuda/tests/HelloWorld_noCuCtx.cu b/src/components/cuda/tests/HelloWorld_noCuCtx.cu index 20ebc96aa..2b18efedc 100644 --- a/src/components/cuda/tests/HelloWorld_noCuCtx.cu +++ b/src/components/cuda/tests/HelloWorld_noCuCtx.cu @@ -1,53 +1,84 @@ -/****************************/ -/* THIS IS OPEN SOURCE CODE */ -/****************************/ - /** - * @file HelloWorld_noCuCtx.cu - * @author Heike Jagode - * jagode@eecs.utk.edu - * Mods: Anustuv Pal - * anustuv@icl.utk.edu - * Mods: - * - * test case for cuda component - * - * - * @brief - * This file is a very simple HelloWorld C example which serves (together - * with its Makefile) as a guideline on how to add tests to components. - * The papi configure and papi Makefile will take care of the compilation - * of the component tests (if all tests are added to a directory named - * 'tests' in the specific component dir). - * See components/README for more details. - * - * The string "Hello World!" is mangled and then restored. - * - * CUDA Context notes for CUPTI_11: Although a cudaSetDevice() will create a - * primary context for the device that allows kernel execution; PAPI cannot - * use a primary context to control the Nvidia Performance Profiler. - * Applications must create a context using cuCtxCreate() that will execute - * the kernel, this must be done prior to the PAPI_add_events() invocation in - * the code below. If multiple GPUs are in use, each requires its own context, - * and that context should be active when PAPI_events are added for each - * device. Which means using Seperate PAPI_add_events() for each device. 
For - * an example see simpleMultiGPU.cu. - * - * There are three points below where cuCtxCreate() is called, this code works - * if any one of them is used alone. - */ +* @file HelloWorld_noCuCtx.cu +* @brief This test serves as a very simple hello world c example where the string +* "Hello World!" is mangled and then restored. +* +* Note: The cuda component supports being partially disabled, meaning that certain devices +* will not be "enabled" to profile on. If PAPI_CUDA_API is not set, then devices with +* CC's >= 7.0 will be used and if PAPI_CUDA_API is set to LEGACY then devices with +* CC's <= 7.0 will be used. +*/ -#include +// Standard library headers #include #include +#include + +// Cuda Toolkit headers +#include -#ifdef PAPI +// Internal headers +#include "cuda_tests_helper.h" #include "papi.h" #include "papi_test.h" -#endif -#define STEP_BY_STEP_DEBUG 0 /* helps debug CUcontext issues. */ -#define PRINT(quiet, format, args...) {if (!quiet) {fprintf(stderr, format, ## args);}} +static void print_help_message(void) +{ + printf("./HelloWorld_noCuCtx --cuda-native-event-names [list of cuda native event names separated by a comma].\n" + "Notes:\n" + "1. A device qualifier must be provided otherwise a context will not be created.\n"); +} + +static void parse_and_assign_args(int argc, char *argv[], int *device_index, char ***cuda_native_event_names, int *total_event_count) +{ + int i; + for (i = 1; i < argc; ++i) + { + char *arg = argv[i]; + if (strcmp(arg, "--help") == 0) + { + print_help_message(); + exit(EXIT_SUCCESS); + } + else if (strcmp(arg, "--cuda-native-event-names") == 0) + { + if (!argv[i + 1]) + { + printf("ERROR!! 
--cuda-native-event-names given, but no events listed.\n"); + exit(EXIT_FAILURE); + } + + char **cmd_line_native_event_names = NULL; + const char *cuda_native_event_name = strtok(argv[i+1], ","); + while (cuda_native_event_name != NULL) + { + cmd_line_native_event_names = (char **) realloc(cmd_line_native_event_names, ((*total_event_count) + 1) * sizeof(char *)); + check_memory_allocation_call(cmd_line_native_event_names); + + cmd_line_native_event_names[(*total_event_count)] = (char *) malloc(PAPI_2MAX_STR_LEN * sizeof(char)); + check_memory_allocation_call(cmd_line_native_event_names[(*total_event_count)]); + + int strLen = snprintf(cmd_line_native_event_names[(*total_event_count)], PAPI_2MAX_STR_LEN, "%s", cuda_native_event_name); + if (strLen < 0 || strLen >= PAPI_2MAX_STR_LEN) { + fprintf(stderr, "Failed to fully write cuda native event name.\n"); + exit(EXIT_FAILURE); + } + + (*total_event_count)++; + cuda_native_event_name = strtok(NULL, ","); + } + i++; + *cuda_native_event_names = cmd_line_native_event_names; + } + else + { + print_help_message(); + exit(EXIT_FAILURE); + } + } +} + + // Device kernel __global__ void @@ -59,216 +90,147 @@ helloWorld(char* str) str[idx] += idx; } -/** @class add_events_from_command_line - * @brief Try and add each event provided on the command line by the user. - * - * @param EventSet - * A PAPI eventset. - * @param totalEventCount - * Number of events from the command line. - * @param eventNamesFromCommandLine - * Events provided on the command line. - * @param *numEventsSuccessfullyAdded - * Total number of successfully added events. - * @param **eventsSuccessfullyAdded - * Events that we are able to add to the EventSet. - * @param *numMultipassEvents - * Counter to see if a multiple pass event was provided on the command line. 
-*/ -static void add_events_from_command_line(int EventSet, int totalEventCount, char **eventNamesFromCommandLine, int *numEventsSuccessfullyAdded, char **eventsSuccessfullyAdded, int *numMultipassEvents) -{ - int i; - for (i = 0; i < totalEventCount; i++) { - int papi_errno = PAPI_add_named_event(EventSet, eventNamesFromCommandLine[i]); - if (papi_errno != PAPI_OK) { - if (papi_errno != PAPI_EMULPASS) { - fprintf(stderr, "Unable to add event %s to the EventSet with error code %d.\n", eventNamesFromCommandLine[i], papi_errno); - test_skip(__FILE__, __LINE__, "", 0); - } - - // Handle multiple pass events - (*numMultipassEvents)++; - continue; - } - - // Handle successfully added events - int strLen = snprintf(eventsSuccessfullyAdded[(*numEventsSuccessfullyAdded)], PAPI_MAX_STR_LEN, "%s", eventNamesFromCommandLine[i]); - if (strLen < 0 || strLen >= PAPI_MAX_STR_LEN) { - fprintf(stderr, "Failed to fully write successfully added event.\n"); - test_skip(__FILE__, __LINE__, "", 0); - } - (*numEventsSuccessfullyAdded)++; - } - - return; -} - -// Host function int main(int argc, char** argv) { - int quiet = 0; - cudaError_t cudaError; - CUresult cuError; (void) cuError; - - cuInit(0); + check_cuda_driver_api_call( cuInit(0) ); + + // Determine the number of Cuda capable devices + int num_devices = 0; + check_cuda_runtime_api_call( cudaGetDeviceCount(&num_devices) ); + // No devices detected on the machine, exit + if (num_devices < 1) { + fprintf(stderr, "No NVIDIA devices found on the machine. 
This is required for the test to run.\n"); + exit(EXIT_FAILURE); + } + + int suppress_output = 0; + char *user_defined_suppress_output = getenv("PAPI_CUDA_TEST_QUIET"); + if (user_defined_suppress_output) { + suppress_output = (int) strtol(user_defined_suppress_output, (char**) NULL, 10); + } + PRINT(suppress_output, "Running the cuda component test HelloWorld_noCuCtx.cu\n"); + + int cuda_device_index = -1; + char **cuda_native_event_names = NULL; + // If command line arguments are provided then get their values. + int total_event_count = 0; + if (argc > 1) { + parse_and_assign_args(argc, argv, &cuda_device_index, &cuda_native_event_names, &total_event_count); + } + + // Initialize the PAPI library + int papi_errno = PAPI_library_init(PAPI_VER_CURRENT); + if(papi_errno != PAPI_VER_CURRENT) { + test_fail(__FILE__,__LINE__, "PAPI_library_init()", papi_errno); + } + PRINT(suppress_output, "PAPI version being used for this test: %d.%d.%d\n", + PAPI_VERSION_MAJOR(PAPI_VERSION), + PAPI_VERSION_MINOR(PAPI_VERSION), + PAPI_VERSION_REVISION(PAPI_VERSION)); + + int cuda_cmp_idx = PAPI_get_component_index("cuda"); + if (cuda_cmp_idx < 0) { + test_fail(__FILE__, __LINE__, "PAPI_get_component_index()", cuda_cmp_idx); + } + PRINT(suppress_output, "The cuda component is assigned to component index: %d\n", cuda_cmp_idx); + + // If a user does not provide an event or events, then we go get an event to add + if (total_event_count == 0) { + enumerate_and_store_cuda_native_events(&cuda_native_event_names, &total_event_count, &cuda_device_index); + } -#ifdef PAPI - char *test_quiet = getenv("PAPI_CUDA_TEST_QUIET"); - if (test_quiet) - quiet = (int) strtol(test_quiet, (char**) NULL, 10); - - /* PAPI Initialization */ - int papi_errno = PAPI_library_init( PAPI_VER_CURRENT ); - if( papi_errno != PAPI_VER_CURRENT ) { - test_fail(__FILE__,__LINE__, "PAPI_library_init failed", 0); - } - - printf( "PAPI_VERSION : %4d %6d %7d\n", - PAPI_VERSION_MAJOR( PAPI_VERSION ), - PAPI_VERSION_MINOR( 
PAPI_VERSION ), - PAPI_VERSION_REVISION( PAPI_VERSION ) ); - - int i; int EventSet = PAPI_NULL; - int eventCount = argc - 1; - - /* if no events passed at command line, just report test skipped. */ - if (eventCount == 0) { - fprintf(stderr, "No events specified at command line."); - test_skip(__FILE__,__LINE__, "", 0); - } - - long long *values = (long long *) calloc(eventCount, sizeof (long long)); - if (values == NULL) { - test_fail(__FILE__, __LINE__, "Failed to allocate memory for values.\n", 0); - } - - int *events = (int *) calloc(eventCount, sizeof (int)); - if (events == NULL) { - test_fail(__FILE__, __LINE__, "Failed to allocate memory for events.\n", 0); - } - - papi_errno = PAPI_create_eventset( &EventSet ); - if( papi_errno != PAPI_OK ) { - test_fail(__FILE__,__LINE__,"Cannot create eventset",papi_errno); - } + check_papi_api_call( PAPI_create_eventset(&EventSet) ); // Handle the events from the command line - int numEventsSuccessfullyAdded = 0, numMultipassEvents = 0; - char **eventsSuccessfullyAdded, **metricNames = argv + 1; - eventsSuccessfullyAdded = (char **) malloc(eventCount * sizeof(char *)); - if (eventsSuccessfullyAdded == NULL) { - fprintf(stderr, "Failed to allocate memory for successfully added events.\n"); - test_skip(__FILE__, __LINE__, "", 0); - } - for (i = 0; i < eventCount; i++) { - eventsSuccessfullyAdded[i] = (char *) malloc(PAPI_MAX_STR_LEN * sizeof(char)); - if (eventsSuccessfullyAdded[i] == NULL) { - fprintf(stderr, "Failed to allocate memory for command line argument.\n"); - test_skip(__FILE__, __LINE__, "", 0); - } - } + int num_events_successfully_added = 0, numMultipassEvents = 0; + char **events_successfully_added = (char **) malloc(total_event_count * sizeof(char *)); + check_memory_allocation_call( events_successfully_added ); - add_events_from_command_line(EventSet, eventCount, metricNames, &numEventsSuccessfullyAdded, eventsSuccessfullyAdded, &numMultipassEvents); + int event_idx; + for (event_idx = 0; event_idx < 
total_event_count; event_idx++) { + events_successfully_added[event_idx] = (char *) malloc(PAPI_MAX_STR_LEN * sizeof(char)); + check_memory_allocation_call( events_successfully_added[event_idx] ); + + add_cuda_native_events(EventSet, cuda_native_event_names[event_idx], &num_events_successfully_added, events_successfully_added, &numMultipassEvents); + } // Only multiple pass events were provided on the command line - if (numEventsSuccessfullyAdded == 0) { + if (num_events_successfully_added == 0) { fprintf(stderr, "Events provided on the command line could not be added to an EventSet as they require multiple passes.\n"); test_skip(__FILE__, __LINE__, "", 0); } - papi_errno = PAPI_start( EventSet ); - if( papi_errno != PAPI_OK ) { - test_fail(__FILE__, __LINE__, "PAPI_start failed.", papi_errno); - } - -#endif - - int j; - - // desired output - char str[] = "Hello World!"; + check_papi_api_call( PAPI_start(EventSet) ); // mangle contents of output // the null character is left intact for simplicity - for(j = 0; j < 12; j++) { - str[j] -= j; + char str[] = "Hello World!"; // Desired Output + int i; + for (i = 0; i < strlen(str); i++) { + str[i] -= i; } - - PRINT( quiet, "mangled str=%s\n", str ); + PRINT(suppress_output, "mangled str=%s\n", str); // allocate memory on the device char *d_str; size_t size = sizeof(str); - cudaMalloc((void**)&d_str, size); + check_cuda_runtime_api_call( cudaMalloc((void**)&d_str, size) ); + check_memory_allocation_call( d_str ); - // copy the string to the device - cudaMemcpy(d_str, str, size, cudaMemcpyHostToDevice); + // Copy the string to the device + check_cuda_runtime_api_call( cudaMemcpy(d_str, str, size, cudaMemcpyHostToDevice) ); - // set the grid and block sizes - dim3 dimGrid(2); // one block per word - dim3 dimBlock(6); // one thread per character + // Set the grid and block sizes + dim3 dimGrid(2); // One block per word + dim3 dimBlock(6); // One thread per character // invoke the kernel helloWorld<<< dimGrid, dimBlock 
>>>(d_str); + check_cuda_runtime_api_call( cudaGetLastError() ); - cudaError = cudaGetLastError(); - if (STEP_BY_STEP_DEBUG) { - fprintf(stderr, "%s:%s:%i Kernel Return Code: %s.\n", __FILE__, __func__, __LINE__, cudaGetErrorString(cudaError)); - } + // Retrieve the results from the device + check_cuda_runtime_api_call( cudaMemcpy(str, d_str, size, cudaMemcpyDeviceToHost) ); - // retrieve the results from the device - cudaMemcpy(str, d_str, size, cudaMemcpyDeviceToHost); + // Free up the allocated memory on the device + check_cuda_runtime_api_call( cudaFree(d_str) ); - // free up the allocated memory on the device - cudaFree(d_str); + long long *cuda_counter_values = (long long *) calloc(total_event_count, sizeof (long long)); + check_memory_allocation_call( cuda_counter_values ); -#ifdef PAPI - papi_errno = PAPI_read( EventSet, values ); - if( papi_errno != PAPI_OK ) { - test_fail(__FILE__, __LINE__, "PAPI_read failed", papi_errno); - } + check_papi_api_call( PAPI_read(EventSet, cuda_counter_values) ); - for( i = 0; i < numEventsSuccessfullyAdded; i++ ) { - PRINT( quiet, "read: %12lld \t=0X%016llX \t\t --> %s \n", values[i], values[i], eventsSuccessfullyAdded[i] ); + for (event_idx = 0; event_idx < num_events_successfully_added; event_idx++ ) { + PRINT(suppress_output, "After PAPI_read, the event %s produced the value: \t\t%lld\n", events_successfully_added[event_idx], cuda_counter_values[event_idx]); } - papi_errno = PAPI_stop( EventSet, values ); - if( papi_errno != PAPI_OK ) { - test_fail(__FILE__, __LINE__, "PAPI_stop failed", papi_errno); - } + check_papi_api_call( PAPI_stop(EventSet, cuda_counter_values) ); - papi_errno = PAPI_cleanup_eventset(EventSet); - if( papi_errno != PAPI_OK ) { - test_fail(__FILE__, __LINE__, "PAPI_cleanup_eventset failed", papi_errno); - } + for (event_idx = 0; event_idx < num_events_successfully_added; event_idx++) { + PRINT(suppress_output, "After PAPI_stop, the event %s produced the value: \t\t%lld\n", 
events_successfully_added[event_idx], cuda_counter_values[event_idx]); + } - papi_errno = PAPI_destroy_eventset(&EventSet); - if (papi_errno != PAPI_OK) { - test_fail(__FILE__, __LINE__, "PAPI_destroy_eventset failed", papi_errno); - } + check_papi_api_call( PAPI_cleanup_eventset(EventSet) ); - for( i = 0; i < numEventsSuccessfullyAdded; i++ ) { - PRINT( quiet, "stop: %12lld \t=0X%016llX \t\t --> %s \n", values[i], values[i], eventsSuccessfullyAdded[i] ); + check_papi_api_call( PAPI_destroy_eventset(&EventSet) ); + + // Output a note that a multiple pass event was provided on the command line + if (numMultipassEvents > 0) { + PRINT(suppress_output, "\033[0;33mNOTE: From the events provided on the command line, an event or events requiring multiple passes was detected and not added to the EventSet. Check your events with utils/papi_native_avail.\n\033[0m"); } // Free allocated memory - free(values); - free(events); - for (i = 0; i < eventCount; i++) { - free(eventsSuccessfullyAdded[i]); + free(cuda_counter_values); + for (event_idx = 0; event_idx < total_event_count; event_idx++) { + free(cuda_native_event_names[event_idx]); + free(events_successfully_added[event_idx]); } - free(eventsSuccessfullyAdded); + free(cuda_native_event_names); + free(events_successfully_added); PAPI_shutdown(); - // Output a note that a multiple pass event was provided on the command line - if (numMultipassEvents > 0) { - PRINT(quiet, "\033[0;33mNOTE: From the events provided on the command line, an event or events requiring multiple passes was detected and not added to the EventSet. 
Check your events with utils/papi_native_avail.\n\033[0m"); - } - test_pass(__FILE__); -#endif - return 0; + return 0; } diff --git a/src/components/cuda/tests/Makefile b/src/components/cuda/tests/Makefile index 5bded9135..e52440d41 100644 --- a/src/components/cuda/tests/Makefile +++ b/src/components/cuda/tests/Makefile @@ -37,55 +37,58 @@ ifeq ($(BUILD_SHARED_LIB),yes) NVCFLAGS += -Xcompiler -fpic endif CFLAGS += -g $(PAPI_FLAG) -INCLUDE += -I$(PAPI_CUDA_ROOT)/include +INCLUDE += -I$(PAPI_CUDA_ROOT)/include -I$(PAPI_CUDA_ROOT)/extras/CUPTI/include CUDALIBS = -L$(PAPI_CUDA_ROOT)/lib64 -lcudart -lcuda cuda_tests: $(TESTS) $(TESTS_NOCTX) %.o:%.cu - $(NVCC) $(INCLUDE) $(NVCFLAGS) $(CUDA_CPPFLAGS) -c -o $@ $< + $(NVCC) $(INCLUDE) $(NVCFLAGS) $(CUDA_CPPFLAGS) -Xcompiler -fopenmp -c -o $@ $< %.mac:%.cu - $(NVCC) $(INCLUDE) $(NVCFLAGS) $(CUDA_CPPFLAGS) -E -c -o $@ $< + $(NVCC) $(INCLUDE) $(NVCFLAGS) $(CUDA_CPPFLAGS) -Xcompiler -fopenmp -E -c -o $@ $< -test_multi_read_and_reset: test_multi_read_and_reset.o $(UTILOBJS) - $(CXX) $(CFLAGS) -o test_multi_read_and_reset test_multi_read_and_reset.o $(UTILOBJS) $(PAPILIB) $(CUDALIBS) $(LDFLAGS) +cuda_tests_helper.o: cuda_tests_helper.c + $(CXX) $(CFLAGS) $(INCLUDE) -o $@ -c $^ + +test_multi_read_and_reset: test_multi_read_and_reset.o cuda_tests_helper.o $(UTILOBJS) + $(CXX) $(CFLAGS) cuda_tests_helper.o -o test_multi_read_and_reset test_multi_read_and_reset.o $(UTILOBJS) $(PAPILIB) $(CUDALIBS) $(LDFLAGS) concurrent_profiling: concurrent_profiling.o $(UTILOBJS) - $(CXX) $(CFLAGS) -pthread -o concurrent_profiling concurrent_profiling.o $(UTILOBJS) $(PAPILIB) $(CUDALIBS) $(LDFLAGS) + $(CXX) $(CFLAGS) -pthread -o concurrent_profiling concurrent_profiling.o $(UTILOBJS) $(PAPILIB) $(CUDALIBS) $(LDFLAGS) -L$(PAPI_CUDA_ROOT)/extras/CUPTI/lib64 -lcupti concurrent_profiling_noCuCtx: concurrent_profiling_noCuCtx.o $(UTILOBJS) - $(CXX) $(CFLAGS) -pthread -o concurrent_profiling_noCuCtx concurrent_profiling_noCuCtx.o $(UTILOBJS) $(PAPILIB) 
$(CUDALIBS) $(LDFLAGS) + $(CXX) $(CFLAGS) -pthread -o concurrent_profiling_noCuCtx concurrent_profiling_noCuCtx.o $(UTILOBJS) $(PAPILIB) $(CUDALIBS) $(LDFLAGS) -L$(PAPI_CUDA_ROOT)/extras/CUPTI/lib64 -lcupti -pthreads: pthreads.o - $(CXX) $(CFLAGS) -pthread -o pthreads pthreads.o $(UTILOBJS) $(PAPILIB) $(CUDALIBS) $(LDFLAGS) +pthreads: pthreads.o cuda_tests_helper.o + $(CXX) $(CFLAGS) -pthread -o pthreads pthreads.o cuda_tests_helper.o $(UTILOBJS) $(PAPILIB) $(CUDALIBS) $(LDFLAGS) -pthreads_noCuCtx: pthreads_noCuCtx.o - $(CXX) $(CFLAGS) -pthread -o pthreads_noCuCtx pthreads_noCuCtx.o $(UTILOBJS) $(PAPILIB) $(CUDALIBS) $(LDFLAGS) +pthreads_noCuCtx: pthreads_noCuCtx.o cuda_tests_helper.o + $(CXX) $(CFLAGS) -pthread -o pthreads_noCuCtx pthreads_noCuCtx.o cuda_tests_helper.o $(UTILOBJS) $(PAPILIB) $(CUDALIBS) $(LDFLAGS) -cudaOpenMP: cudaOpenMP.o - $(CXX) $(CFLAGS) -o cudaOpenMP cudaOpenMP.o -lgomp -fopenmp $(UTILOBJS) $(PAPILIB) $(CUDALIBS) $(LDFLAGS) +cudaOpenMP: cudaOpenMP.o cuda_tests_helper.o + $(CXX) $(CFLAGS) -fopenmp -o cudaOpenMP cudaOpenMP.o cuda_tests_helper.o -lgomp $(UTILOBJS) $(PAPILIB) $(CUDALIBS) $(LDFLAGS) -cudaOpenMP_noCuCtx: cudaOpenMP_noCuCtx.o - $(CXX) $(CFLAGS) -o cudaOpenMP_noCuCtx cudaOpenMP_noCuCtx.o -lgomp -fopenmp $(UTILOBJS) $(PAPILIB) $(CUDALIBS) $(LDFLAGS) +cudaOpenMP_noCuCtx: cudaOpenMP_noCuCtx.o cuda_tests_helper.o + $(CXX) $(CFLAGS) -o cudaOpenMP_noCuCtx cudaOpenMP_noCuCtx.o cuda_tests_helper.o -lgomp -fopenmp $(UTILOBJS) $(PAPILIB) $(CUDALIBS) $(LDFLAGS) test_multipass_event_fail: test_multipass_event_fail.o $(UTILOBJS) $(CXX) $(CFLAGS) -o test_multipass_event_fail test_multipass_event_fail.o $(INCLUDE) $(UTILOBJS) $(PAPILIB) $(LDFLAGS) $(CUDALIBS) -test_2thr_1gpu_not_allowed: test_2thr_1gpu_not_allowed.o - $(CXX) $(CFLAGS) -pthread -o test_2thr_1gpu_not_allowed test_2thr_1gpu_not_allowed.o $(UTILOBJS) $(PAPILIB) $(CUDALIBS) $(LDFLAGS) +test_2thr_1gpu_not_allowed: test_2thr_1gpu_not_allowed.o cuda_tests_helper.o + $(CXX) $(CFLAGS) 
-pthread cuda_tests_helper.o -o test_2thr_1gpu_not_allowed test_2thr_1gpu_not_allowed.o $(UTILOBJS) $(PAPILIB) $(CUDALIBS) $(LDFLAGS) -HelloWorld: HelloWorld.o $(UTILOBJS) - $(CXX) $(CFLAGS) -o HelloWorld HelloWorld.o $(UTILOBJS) $(PAPILIB) $(CUDALIBS) $(LDFLAGS) +HelloWorld: HelloWorld.o cuda_tests_helper.o $(UTILOBJS) + $(CXX) $(CFLAGS) cuda_tests_helper.o HelloWorld.o -o HelloWorld $(UTILOBJS) $(PAPILIB) $(CUDALIBS) $(LDFLAGS) -HelloWorld_noCuCtx: HelloWorld_noCuCtx.o $(UTILOBJS) - $(CXX) $(CFLAGS) -o HelloWorld_noCuCtx HelloWorld_noCuCtx.o $(UTILOBJS) $(PAPILIB) $(CUDALIBS) $(LDFLAGS) +HelloWorld_noCuCtx: HelloWorld_noCuCtx.o cuda_tests_helper.o $(UTILOBJS) + $(CXX) $(CFLAGS) cuda_tests_helper.o -o HelloWorld_noCuCtx HelloWorld_noCuCtx.o $(UTILOBJS) $(PAPILIB) $(CUDALIBS) $(LDFLAGS) -simpleMultiGPU: simpleMultiGPU.o $(UTILOBJS) - $(CXX) $(CFLAGS) -o simpleMultiGPU simpleMultiGPU.o $(UTILOBJS) $(PAPILIB) $(CUDALIBS) $(LDFLAGS) +simpleMultiGPU: simpleMultiGPU.o cuda_tests_helper.o $(UTILOBJS) + $(CXX) $(CFLAGS) -o simpleMultiGPU simpleMultiGPU.o cuda_tests_helper.o $(UTILOBJS) $(PAPILIB) $(CUDALIBS) $(LDFLAGS) -simpleMultiGPU_noCuCtx: simpleMultiGPU_noCuCtx.o $(UTILOBJS) - $(CXX) $(CFLAGS) -o simpleMultiGPU_noCuCtx simpleMultiGPU_noCuCtx.o $(UTILOBJS) $(PAPILIB) $(CUDALIBS) $(LDFLAGS) +simpleMultiGPU_noCuCtx: simpleMultiGPU_noCuCtx.o cuda_tests_helper.o $(UTILOBJS) + $(CXX) $(CFLAGS) -o simpleMultiGPU_noCuCtx simpleMultiGPU_noCuCtx.o cuda_tests_helper.o $(UTILOBJS) $(PAPILIB) $(CUDALIBS) $(LDFLAGS) clean: rm -f *.o $(TESTS) $(TESTS_NOCTX) diff --git a/src/components/cuda/tests/concurrent_profiling.cu b/src/components/cuda/tests/concurrent_profiling.cu index f8fbac81a..3b13bbe27 100644 --- a/src/components/cuda/tests/concurrent_profiling.cu +++ b/src/components/cuda/tests/concurrent_profiling.cu @@ -1,102 +1,60 @@ -// Copyright 2021 NVIDIA Corporation. 
All rights reserved -// -// This sample demonstrates two ways to use the CUPTI Profiler API with concurrent kernels. -// By taking the ratio of runtimes for a consecutive series of kernels, compared -// to a series of concurrent kernels, one can difinitively demonstrate that concurrent -// kernels were running while metrics were gathered and the User Replay mechanism was in use. -// -// Example: -// 4 kernel launches, with 1x, 2x, 3x, and 4x amounts of work, each sized to one SM (one warp -// of threads, one thread block). -// When run synchronously, this comes to 10x amount of work. -// When run concurrently, the longest (4x) kernel should be the only measured time (it hides the others). -// Thus w/ 4 kernels, the concurrent : consecutive time ratio should be 4:10. -// On test hardware this does simplify to 3.998:10. As the test is affected by memory layout, this may not -// hold for certain architectures where, for example, cache sizes may optimize certain kernel calls. -// -// After demonstrating concurrency using multpile streams, this then demonstrates using multiple devices. -// In this 3rd configuration, the same concurrent workload with streams is then duplicated and run -// on each device concurrently using streams. -// In this case, the wallclock time to launch, run, and join the threads should be roughly the same as the -// wallclock time to run the single device case. If concurrency was not working, the wallcock time -// would be (num devices) times the single device concurrent case. -// -// * If the multiple devices have different performance, the runtime may be significantly different between -// devices, but this does not mean concurrent profiling is not happening. 
- -// This code has been adapted to PAPI from -// `/extras/CUPTI/samples/concurrent_profiling/cpncurrent_profiling.cu` - -#ifdef PAPI -extern "C" { - #include - #include "papi_test.h" -} -#endif - -// Standard CUDA, CUPTI, Profiler, NVPW headers -#include "cuda.h" +/** +* @file concurrent_profiling.cu +* @brief This test utilizes concurrent kernels by taking the ratio of runtimes for a consecutive series of kernels, +* compared to a series of concurrent kernels, one can definitively demonstrate that concurrent kernels +* were running while native events were gathered and the user replay mechanism was in use. Cuda contexts are +* created with calls to cuCtxCreate. +* +* Example: 4 kernel launches, with 1x, 2x, 3x, and 4x amounts of work, each sized to one SM (one warp +* of threads, one thread block). When run synchronously, this comes to 10x amount of work. When run +* concurrently, the longest (4x) kernel should be the only measured time (it hides the others). +* Thus w/4 kernels, the concurrent : consecutive time ratio should be 4:10. +* On test hardware this does simplify to 3.998:10. As the test is affected by memory layout, this +* may not hold for certain architectures where, for example, cache sizes may optimize certain kernel +* calls. +* +* After demonstrating concurrency using multiple streams, this test then demonstrates using multiple devices. +* In this 3rd configuration, the same concurrent workload with streams is then duplicated and run +* on each device concurrently using streams. In this case, the wallclock time to launch, run, and join +* threads should be roughly the same as the wallclock time to run the single device case. If concurrency +* was not working, the wallclock time would be (number of devices) times the single device concurrent case. +* +* Notes: +* - This test only works with CC's >= 7.0 which follows exactly what is done +* for the concurrent_profiling.cu test in extras/CUPTI/samples/concurrent_profiling. 
+* +* - If the multiple devices have different performance, the runtime may be significantly different between +* devices, but this does not mean concurrent profiling is not happening. +*/ -// Standard STL headers +// Standard library headers #include #include #include - #include using ::std::string; - #include using ::std::thread; - #include using ::std::vector; - #include using ::std::find; -#define PRINT(quiet, format, args...) {if (!quiet) {fprintf(stderr, format, ## args);}} -int quiet; - -#ifdef PAPI -#define PAPI_CALL(apiFuncCall) \ -do { \ - int _status = apiFuncCall; \ - if (_status != PAPI_OK) { \ - fprintf(stderr, "error: function %s failed.", #apiFuncCall); \ - test_fail(__FILE__, __LINE__, "", _status); \ - } \ -} while (0) -#endif +// Cuda Toolkit headers +#include +#include + +// Internal headers +extern "C" { + #include "cuda_tests_helper.h" + #include "papi.h" + #include "papi_test.h" +} -// Helpful error handlers for standard CUPTI and CUDA runtime calls -#define RUNTIME_API_CALL(apiFuncCall) \ -do { \ - cudaError_t _status = apiFuncCall; \ - if (_status != cudaSuccess) { \ - fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \ - __FILE__, __LINE__, #apiFuncCall, cudaGetErrorString(_status));\ - exit(EXIT_FAILURE); \ - } \ -} while (0) - -#define MEMORY_ALLOCATION_CALL(var) \ -do { \ - if (var == NULL) { \ - fprintf(stderr, "%s:%d: Error: Memory Allocation Failed \n", \ - __FILE__, __LINE__); \ - exit(EXIT_FAILURE); \ - } \ -} while (0) - -#define DRIVER_API_CALL(apiFuncCall) \ -do { \ - CUresult _status = apiFuncCall; \ - if (_status != CUDA_SUCCESS) { \ - fprintf(stderr, "%s:%d: error: function %s failed with error %d.\n", \ - __FILE__, __LINE__, #apiFuncCall, _status); \ - exit(EXIT_FAILURE); \ - } \ -} while (0) +// Currently we are only adding cuda:::sm__cycles_active:stat=sum and cuda:::sm__cycles_elapsed:stat=max +#define MAX_EVENTS_TO_ADD 2 + +int global_suppress_output; typedef struct { @@ -107,12 +65,19 @@ typedef 
struct // Per-device configuration, buffers, stream and device information, and device pointers typedef struct { - int deviceID; - profilingConfig config; // Each device (or each context) needs its own CUPTI profiling config - vector streams; // Each device needs its own streams - vector d_x; // And device memory allocation - vector d_y; // .. - long long values[100]; // Capture PAPI measured values for each device + // For each device, store the range name + vector range_name; + // For each device, store the successfully added events + vector events_successfully_added; + // For each device (or each context) store its CUPTI profiling config + profilingConfig config; + // For each device, store its streams + vector streams; + // For each device, allocate memory + vector d_x; + vector d_y; + // For each event for a device, store PAPI counter values + vector> cuda_counter_values; } perDeviceData; #define DAXPY_REPEAT 32768 @@ -136,7 +101,7 @@ int threadBlocks = 1; // Configurable number of kernels (streams, when running concurrently) int const numKernels = 4; -int const numStreams = numKernels; +int const num_streams = numKernels; vector elements(numKernels); // Each kernel call allocates and computes (call number) * (blockSize) elements @@ -144,46 +109,38 @@ vector elements(numKernels); int const blockSize = 4 * 1024; // Globals for successfully added and multiple pass events -int numMultipassEvents = 0; -vector eventsSuccessfullyAdded; +int global_num_multipass_events; -/** @class add_events_from_command_line +/** @class add_cuda_native_events * @brief Try and add each event provided on the command line by the user. * - * @param d - * Per device data. * @param EventSet * A PAPI eventset. - * @param metricNames - * Events provided on the command line. - * @param successfullyAddedEvents - * Events successfully added to the EventSet. + * @param cuda_native_event_name + * Event to add to the EventSet. + * @param &device_data + * Per device configuration. 
* @param *numMultipassEvents * Counter to see if a multiple pass event was provided on the command line. */ -static void add_events_from_command_line(perDeviceData &d, int EventSet, vector const &metricNames, vector successfullyAddedEvents, int *numMultipassEvents) -{ - int i; - for (i = 0; i < metricNames.size(); i++) { - string evt_name = metricNames[i] + std::to_string(d.config.device); - int papi_errno = PAPI_add_named_event(EventSet, evt_name.c_str()); - if (papi_errno != PAPI_OK) { - if (papi_errno != PAPI_EMULPASS) { - fprintf(stderr, "Unable to add event %s to the EventSet with error code %d.\n", evt_name.c_str(), papi_errno); - test_skip(__FILE__, __LINE__, "", 0); - } - - // Handle multiple pass events - (*numMultipassEvents)++; - continue; - } - - // Handle successfully added events - if (find(eventsSuccessfullyAdded.begin(), eventsSuccessfullyAdded.end(), metricNames[i]) == eventsSuccessfullyAdded.end()) { - eventsSuccessfullyAdded.push_back(metricNames[i]); - } - } - +static void add_cuda_native_events_concurrent(int EventSet, string cuda_native_event_name, perDeviceData &device_data, int *numMultipassEvents) +{ + int papi_errno = PAPI_add_named_event(EventSet, cuda_native_event_name.c_str()); + if (papi_errno != PAPI_OK) { + if (papi_errno != PAPI_EMULPASS) { + fprintf(stderr, "Unable to add event %s to the EventSet with error code %d.\n", cuda_native_event_name.c_str(), papi_errno); + exit(EXIT_FAILURE); + } + + // Handle multiple pass events + (*numMultipassEvents)++; + } + + // Handle successfully added events + if (find(device_data.events_successfully_added.begin(), device_data.events_successfully_added.end(), cuda_native_event_name.c_str()) == device_data.events_successfully_added.end()) { + device_data.events_successfully_added.push_back(cuda_native_event_name.c_str()); + } + return; } @@ -191,28 +148,37 @@ static void add_events_from_command_line(perDeviceData &d, int EventSet, vector< // The device streams vector is used to control which stream each 
call is made on // If 'serial' is non-zero, the device streams are ignored and instead the default stream is used void profileKernels(perDeviceData &d, - vector const &metricNames, + vector const &base_cuda_native_event_names_with_stat_qual, char const * const rangeName, bool serial) { // Switch to desired device - RUNTIME_API_CALL(cudaSetDevice(d.config.device)); // Orig code has mistake here - DRIVER_API_CALL(cuCtxSetCurrent(d.config.context)); -#ifdef PAPI - int eventset = PAPI_NULL; - PAPI_CALL(PAPI_create_eventset(&eventset)); + check_cuda_runtime_api_call( cudaSetDevice(d.config.device) ); // Orig code has mistake here + + check_cuda_driver_api_call( cuCtxSetCurrent(d.config.context) ); + + int EventSet = PAPI_NULL; + check_papi_api_call( PAPI_create_eventset(&EventSet) ); + + global_num_multipass_events = 0; + int event_idx; + for (event_idx = 0; event_idx < base_cuda_native_event_names_with_stat_qual.size(); event_idx++) { + string tmp_event_name = base_cuda_native_event_names_with_stat_qual[event_idx] + ":device=" + std::to_string(d.config.device); + add_cuda_native_events_concurrent(EventSet, tmp_event_name, d, &global_num_multipass_events); + } - add_events_from_command_line(d, eventset, metricNames, eventsSuccessfullyAdded, &numMultipassEvents); + //add_cuda_native_events(d, EventSet, base_cuda_native_event_names_with_stat_qual, &global_num_multipass_events); // Only multiple pass events were provided on the command line - if (eventsSuccessfullyAdded.size() == 0) { - fprintf(stderr, "Events provided on the command line could not be added to an EventSet as they require multiple passes.\n"); + if (d.events_successfully_added.size() == 0) { + fprintf(stderr, "Both cuda:::sm__cycles_active:stat=sum and cuda:::sm__cycles_elapsed:stat=max were unable to be added. 
This may be due to the architecture you are running on.\n"); test_skip(__FILE__, __LINE__, "", 0); } - PAPI_CALL(PAPI_start(eventset)); -#endif + // Internally at PAPI_start we push a range; therefore, users do not push a range + check_papi_api_call( PAPI_start(EventSet) ); - for (unsigned int stream = 0; stream < d.streams.size(); stream++) + unsigned int stream; + for (stream = 0; stream < d.streams.size(); stream++) { cudaStream_t streamId = (serial ? 0 : d.streams[stream]); daxpyKernel <<>> (elements[stream], a, d.d_x[stream], d.d_y[stream]); @@ -221,124 +187,162 @@ void profileKernels(perDeviceData &d, // After launching all work, synchronize all streams if (serial == false) { - for (unsigned int stream = 0; stream < d.streams.size(); stream++) + for (stream = 0; stream < d.streams.size(); stream++) { - RUNTIME_API_CALL(cudaStreamSynchronize(d.streams[stream])); + check_cuda_runtime_api_call( cudaStreamSynchronize(d.streams[stream]) ); } } else { - RUNTIME_API_CALL(cudaStreamSynchronize(0)); + check_cuda_runtime_api_call( cudaStreamSynchronize(0) ); } -#ifdef PAPI - PAPI_CALL(PAPI_stop(eventset, d.values)); - PAPI_CALL(PAPI_cleanup_eventset(eventset)); - PAPI_CALL(PAPI_destroy_eventset(&eventset)); -#endif + + // Internally at PAPI_stop we pop the range; therefore, users do not pop a range + long long values[MAX_EVENTS_TO_ADD]; + check_papi_api_call( PAPI_stop(EventSet, values) ); + + for (event_idx = 0; event_idx < d.events_successfully_added.size(); event_idx++) { + d.cuda_counter_values[event_idx].push_back(values[event_idx]); + } + + check_papi_api_call( PAPI_cleanup_eventset(EventSet) ); + + check_papi_api_call( PAPI_destroy_eventset(&EventSet) ); + + // Keep track of the range name, again PAPI internally has defined a range name, but + // for the sake of following the CUPTI test, we will use their range names. 
+ d.range_name.push_back(rangeName); } -void print_measured_values(perDeviceData &d, vector const &metricNames) +void print_measured_values(perDeviceData &d) { - string evt_name; - PRINT(quiet, "PAPI event name\t\t\t\t\t\t\tMeasured value\n"); - PRINT(quiet, "%s\n", std::string(80, '-').c_str()); - for (int i=0; i < metricNames.size(); i++) { - evt_name = metricNames[i] + std::to_string(d.config.device); - PRINT(quiet, "%s\t\t\t%lld\n", evt_name.c_str(), d.values[i]); + PRINT(global_suppress_output, "%s\n", std::string(200, '-').c_str()); + int event_idx; + for (event_idx = 0; event_idx < d.events_successfully_added.size(); event_idx++) { + int range_idx; + for (range_idx = 0; range_idx < d.range_name.size(); range_idx++) { + PRINT(global_suppress_output, "Range %s with event %s produced the value:\t\t%lld\n", d.range_name[range_idx].c_str(), d.events_successfully_added[event_idx].c_str(), d.cuda_counter_values[event_idx][range_idx]); + } } } +static void print_help_message(void) +{ + printf("./concurrent_profiling\n"); + printf("Notes:\n" + "1. This test is specifically designed to use devices that support CUPTI Profiling i.e. devices with CCs >= 7.0.\n" + "2. No events are accepted from the command line as cuda:::sm_cycles_active:stat=sum, cuda:::sm__cycles_elapsed:stat=max,\n" + " and cuda:::smsp__sass_thread_inst_executed_op_dfma_pred_on:stat=sum are required events.\n"); +} + int main(int argc, char **argv) { - quiet = 0; - int i; -#ifdef PAPI - char *test_quiet = getenv("PAPI_CUDA_TEST_QUIET"); - if (test_quiet) - quiet = (int) strtol(test_quiet, (char**) NULL, 10); - - int event_count = argc - 1; - /* if no events passed at command line, just report test skipped. */ - if (event_count == 0) { - fprintf(stderr, "No eventnames specified at command line.\n"); + char *papi_cuda_api = getenv("PAPI_CUDA_API"); + if (papi_cuda_api != NULL) { + fprintf(stderr, "The concurrent_profiling test only works with the Perfworks Metrics API. 
Unset the environment variable PAPI_CUDA_API.\n"); + test_skip(__FILE__, __LINE__, "", 0); + } + + // Determine the number of Cuda capable devices + int num_devices = 0; + check_cuda_runtime_api_call( cudaGetDeviceCount(&num_devices) ); + + // No devices detected on the machine, exit + if (num_devices < 1) { + fprintf(stderr, "No NVIDIA devices found on the machine. This is required for the test to run.\n"); test_skip(__FILE__, __LINE__, "", 0); } - vector metricNames; - for (i=0; i < event_count; i++) { - metricNames.push_back(argv[i+1]); + global_suppress_output = 0; + char *user_defined_suppress_output = getenv("PAPI_CUDA_TEST_QUIET"); + if (user_defined_suppress_output) { + global_suppress_output = (int) strtol(user_defined_suppress_output, (char**) NULL, 10); + } + PRINT(global_suppress_output, "Running the cuda component test concurrent_profiling.cu\n") + + // User either provided --help or an argument that would not be useful to this test + if (argc > 1) { + print_help_message(); + exit(EXIT_SUCCESS); } // Initialize the PAPI library - if (PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT) { - test_fail(__FILE__, __LINE__, "PAPI_library_init failed.", 0); + int papi_errno = PAPI_library_init(PAPI_VER_CURRENT); + if (papi_errno != PAPI_VER_CURRENT) { + test_fail(__FILE__, __LINE__, "PAPI_library_init()", papi_errno); } -#else - vector metricNames = {""}; -#endif + PRINT(global_suppress_output, "PAPI version being used for this test: %d.%d.%d\n", + PAPI_VERSION_MAJOR(PAPI_VERSION), + PAPI_VERSION_MINOR(PAPI_VERSION), + PAPI_VERSION_REVISION(PAPI_VERSION)); - int numDevices; - RUNTIME_API_CALL(cudaGetDeviceCount(&numDevices)); + int cuda_cmp_idx = PAPI_get_component_index("cuda"); + if (cuda_cmp_idx < 0) { + test_fail(__FILE__, __LINE__, "PAPI_get_component_index()", cuda_cmp_idx); + } + PRINT(global_suppress_output, "The cuda component is assigned to component index: %d\n", cuda_cmp_idx); // Per-device information vector device_ids; - - // Find all 
devices capable of running CUPTI Profiling (Compute Capability >= 7.0) - for (i = 0; i < numDevices; i++) + // Find all devices capable of running CUPTI Profiling (CC >= 7.0) + int dev_idx; + for (dev_idx = 0; dev_idx < num_devices; dev_idx++) { - // Get device properties + // Obtain major compute capability int major; - RUNTIME_API_CALL(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, i)); - if (major >= 7) - { - // Record device number - device_ids.push_back(i); + check_cuda_runtime_api_call( cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev_idx) ); + if (major >= 7) { + PRINT(global_suppress_output, "--> Device %d is compatible with the concurrent_profiling test\n", dev_idx); + device_ids.push_back(dev_idx); } } - - numDevices = device_ids.size(); - PRINT(quiet, "Found %d compatible devices\n", numDevices); - - // Ensure we found at least one device - if (numDevices == 0) - { - fprintf(stderr, "No devices detected compatible with CUPTI Profiling (Compute Capability >= 7.0)\n"); -#ifdef PAPI + if (device_ids.size() == 0) { + fprintf(stderr, "No devices on the machine detected that have CC >= 7.0 and support CUPTI Profiling.\n"); test_skip(__FILE__, __LINE__, "", 0); -#endif } + + // Overwrite num_devices with the number of devices this test actually supports + num_devices = device_ids.size(); // Initialize kernel input to some known numbers vector h_x(blockSize * numKernels); vector h_y(blockSize * numKernels); - for (size_t i = 0; i < blockSize * numKernels; i++) - { + size_t i; + for (i = 0; i < blockSize * numKernels; i++) { h_x[i] = 1.5 * i; h_y[i] = 2.0 * (i - 3000); } // Initialize a vector of 'default stream' values to demonstrate serialized kernels - vector defaultStreams(numStreams); - for (int stream = 0; stream < numStreams; stream++) - { - defaultStreams[stream] = 0; + vector default_streams(num_streams); + int stream; + for (stream = 0; stream < num_streams; stream++) { + default_streams[stream] = 0; } // Scale 
per-kernel work by stream number - for (int stream = 0; stream < numStreams; stream++) - { + for (stream = 0; stream < num_streams; stream++) { elements[stream] = blockSize * (stream + 1); } - // For each device, configure profiling, set up buffers, copy kernel data - vector deviceData(numDevices); + // These metrics below are hardcoded in the CUPTI provided sample code and therefore this test was + // written with them in mind. No command line arguments will be accepted. The test will be skipped + // if none of them can be successfully added. + vector base_cuda_native_event_names_with_stat_qual; + // The below two metrics will demonstrate whether kernels within a Range were run serially or concurrently. + base_cuda_native_event_names_with_stat_qual.push_back("cuda:::sm__cycles_active:stat=sum"); + base_cuda_native_event_names_with_stat_qual.push_back("cuda:::sm__cycles_elapsed:stat=max"); + // This metric shows that the same number of flops were executed on each run. + //base_cuda_native_event_names_with_stat_qual.push_back("cuda:::smsp__sass_thread_inst_executed_op_dfma_pred_on:stat=sum"); - for (int device = 0; device < numDevices; device++) + // For each device, configure profiling, set up buffers, copy kernel data + vector device_data(num_devices); + int device; + for (device = 0; device < num_devices; device++) { int device_id = device_ids[device]; - RUNTIME_API_CALL(cudaSetDevice(device_id)); - PRINT(quiet, "Configuring device %d\n", device_id); - deviceData[device].deviceID = device_id; + check_cuda_runtime_api_call( cudaSetDevice(device_id) ); + PRINT(global_suppress_output, "--> Configuring device %d\n", device_id); // Required CUPTI Profiling configuration & initialization // Can be done ahead of time or immediately before startSession() call @@ -346,105 +350,113 @@ int main(int argc, char **argv) // For simplicity's sake, in this sample, a single config struct is created per device and passed to each CUPTI Profiler API call // For more complex cases, 
each combination of CUPTI Profiler Session and Config requires additional initialization profilingConfig config; - config.device = device_id; // Device ID, used to get device name for metrics enumeration + // Device ID, used to get device name for metrics enumeration + config.device = device_id; // config.maxLaunchesPerPass = 1; // Must be >= maxRangesPerPass. Set this to the largest count of kernel launches which may be encountered in any Pass in this Session // // Device 0 has max of 3 passes; other devices only run one pass in this sample code int flags = 0; #if defined(CUDA_TOOLKIT_GE_13) - DRIVER_API_CALL( cuCtxCreate(&(config.context), (CUctxCreateParams*)0, flags, device) ); + check_cuda_driver_api_call( cuCtxCreate(&(config.context), (CUctxCreateParams*)0, flags, device) ); #else - DRIVER_API_CALL( cuCtxCreate(&(config.context), flags, device) ); + check_cuda_driver_api_call( cuCtxCreate(&(config.context), flags, device) ); #endif - deviceData[device].config = config;// Save this device config + device_data[device].config = config;// Save this device config // Initialize CUPTI Profiling structures - // targetInitProfiling(deviceData[device], metricNames); + // targetInitProfiling(device_data[device], base_cuda_native_event_names_with_stat_qual); // Per-stream initialization & memory allocation - copy from constant host array to each device array - deviceData[device].streams.resize(numStreams); - deviceData[device].d_x.resize(numStreams); - deviceData[device].d_y.resize(numStreams); - for (int stream = 0; stream < numStreams; stream++) + device_data[device].streams.resize(num_streams); + device_data[device].d_x.resize(num_streams); + device_data[device].d_y.resize(num_streams); + // Resize the vector of vectors for the number of events we have + device_data[device].cuda_counter_values.resize( base_cuda_native_event_names_with_stat_qual.size()); + for (stream = 0; stream < num_streams; stream++) { - 
RUNTIME_API_CALL(cudaStreamCreate(&(deviceData[device].streams[stream]))); + // Create an asynchronous stream + check_cuda_runtime_api_call( cudaStreamCreate(&(device_data[device].streams[stream])) ); // Each kernel does (stream #) * blockSize work on doubles size_t size = elements[stream] * sizeof(double); - RUNTIME_API_CALL(cudaMalloc(&(deviceData[device].d_x[stream]), size)); - MEMORY_ALLOCATION_CALL(deviceData[device].d_x[stream]); // Validate pointer - RUNTIME_API_CALL(cudaMemcpy(deviceData[device].d_x[stream], h_x.data(), size, cudaMemcpyHostToDevice)); + check_cuda_runtime_api_call( cudaMalloc(&(device_data[device].d_x[stream]), size) ); + check_memory_allocation_call( device_data[device].d_x[stream] ); + check_cuda_runtime_api_call( cudaMemcpy(device_data[device].d_x[stream], h_x.data(), size, cudaMemcpyHostToDevice) ); - RUNTIME_API_CALL(cudaMalloc(&(deviceData[device].d_y[stream]), size)); - MEMORY_ALLOCATION_CALL(deviceData[device].d_y[stream]); // Validate pointer - RUNTIME_API_CALL(cudaMemcpy(deviceData[device].d_y[stream], h_x.data(), size, cudaMemcpyHostToDevice)); + check_cuda_runtime_api_call( cudaMalloc(&(device_data[device].d_y[stream]), size) ); + check_memory_allocation_call( device_data[device].d_y[stream] ); + check_cuda_runtime_api_call( cudaMemcpy(device_data[device].d_y[stream], h_x.data(), size, cudaMemcpyHostToDevice) ); } } - // - // First version - single device, kernel calls serialized on default stream - // + // Formatting print statement + PRINT(global_suppress_output, "%s\n", std::string(200, '-').c_str()); + //////////////////////////////////////////////////////////////////////////////// + // First Version - single device, kernel calls serialized on default stream. 
// + ////////////////////////////////////////////////////////////////////////////// + // Use wallclock time to measure performance auto begin_time = ::std::chrono::high_resolution_clock::now(); // Run on first device and use default streams, which run serially - profileKernels(deviceData[0], metricNames, "single_device_serial", true); + profileKernels(device_data[0], base_cuda_native_event_names_with_stat_qual, "single_device_serial", true); auto end_time = ::std::chrono::high_resolution_clock::now(); - int elapsed_serial_ms = ::std::chrono::duration_cast<::std::chrono::milliseconds>(end_time - begin_time).count(); + int elapsed_serial_ms = ::std::chrono::duration_cast<::std::chrono::milliseconds>(end_time - begin_time).count(); int numBlocks = 0; - for (int i = 1; i <= numKernels; i++) + for (i = 1; i <= numKernels; i++) { numBlocks += i; } - PRINT(quiet, "It took %d ms on the host to profile %d kernels in serial.", elapsed_serial_ms, numKernels); + PRINT(global_suppress_output, "It took %d ms on the host to profile %d kernels in serial.\n", elapsed_serial_ms, numKernels); - // - // Second version - same kernel calls as before on the same device, but now using separate streams for concurrency - // (Should be limited by the longest running kernel) - // + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Second version - same kernel calls as before on the same device, but now using separate streams for concurrency, // + // Should be limited by the longest running kernel. 
// + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // User wallclock time to measure performance begin_time = ::std::chrono::high_resolution_clock::now(); // Still only use first device, but this time use its allocated streams for parallelism - profileKernels(deviceData[0], metricNames, "single_device_async", false); + profileKernels(device_data[0], base_cuda_native_event_names_with_stat_qual, "single_device_async", false); end_time = ::std::chrono::high_resolution_clock::now(); int elapsed_single_device_ms = ::std::chrono::duration_cast<::std::chrono::milliseconds>(end_time - begin_time).count(); - PRINT(quiet, "It took %d ms on the host to profile %d kernels on a single device on separate streams.", elapsed_single_device_ms, numKernels); - PRINT(quiet, "--> If the separate stream wallclock time is less than the serial version, the streams were profiling concurrently.\n"); + PRINT(global_suppress_output, "It took %d ms on the host to profile %d kernels on a single device on separate streams.\n", elapsed_single_device_ms, numKernels); + PRINT(global_suppress_output, "--> If the separate stream wallclock time is less than the serial version, the streams were profiling concurrently.\n"); - // - // Third version - same as the second case, but duplicates the concurrent work across devices to show cross-device concurrency - // This is done using devices so no serialization is needed between devices - // (Should have roughly the same wallclock time as second case if the devices have similar performance) - // + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Third version - same as the second case, but duplicates the concurrent work across devices to show cross-device concurrency. // + // This is done using devices so no serialization is needed between devices. 
// + // Should have roughly the same wallclock time as the second case if the devices have similar performance. // + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - if (numDevices == 1) + // The third version can only be ran if we have more than one compatible device found + if (device_ids.size() == 1) { - PRINT(quiet, "Only one compatible device found; skipping the multi-threaded test.\n"); + PRINT(global_suppress_output, "Only one compatible device found; skipping the multi-threaded test.\n"); } else { -#ifdef PAPI - int papi_errno = PAPI_thread_init((unsigned long (*)(void)) std::this_thread::get_id); - if ( papi_errno != PAPI_OK ) { - test_fail(__FILE__, __LINE__, "Error setting thread id function.\n", papi_errno); - } -#endif + // Initialize PAPI thread support + check_papi_api_call( PAPI_thread_init((unsigned long (*)(void)) std::this_thread::get_id) ); - PRINT(quiet, "Running on %d devices, one thread per device.\n", numDevices); + // Formatting print statement + PRINT(global_suppress_output, "\n"); + PRINT(global_suppress_output, "Running on %d devices, one thread per device.\n", num_devices); // Time creation of the same multiple streams (on multiple devices, if possible) vector<::std::thread> threads; begin_time = ::std::chrono::high_resolution_clock::now(); // Now launch parallel thread work, duplicated on one thread per device - for (int thread = 0; thread < numDevices; thread++) + int thread; + for (thread = 0; thread < num_devices; thread++) { - threads.push_back(::std::thread(profileKernels, ::std::ref(deviceData[thread]), metricNames, "multi_device_async", false)); + threads.push_back(::std::thread(profileKernels, ::std::ref(device_data[thread]), base_cuda_native_event_names_with_stat_qual, "multi_device_async", false)); } // Wait for all threads to finish @@ -456,49 +468,50 @@ int main(int argc, char **argv) // Record time used when launching on multiple devices 
end_time = ::std::chrono::high_resolution_clock::now(); int elapsed_multiple_device_ms = ::std::chrono::duration_cast<::std::chrono::milliseconds>(end_time - begin_time).count(); - PRINT(quiet, "It took %d ms on the host to profile the same %d kernels on each of the %d devices in parallel\n", elapsed_multiple_device_ms, numKernels, numDevices); - PRINT(quiet, "--> Wallclock ratio of parallel device launch to single device launch is %f\n", elapsed_multiple_device_ms / (double) elapsed_single_device_ms); - PRINT(quiet, "--> If the ratio is close to 1, that means there was little overhead to profile in parallel on multiple devices compared to profiling on a single device.\n"); - PRINT(quiet, "--> If the devices have different performance, the ratio may not be close to one, and this should be limited by the slowest device.\n"); + PRINT(global_suppress_output, "It took %d ms on the host to profile the same %d kernels on each of the %d devices in parallel\n", elapsed_multiple_device_ms, numKernels, num_devices); + PRINT(global_suppress_output, "--> Wallclock ratio of parallel device launch to single device launch is %f\n", elapsed_multiple_device_ms / (double) elapsed_single_device_ms); + PRINT(global_suppress_output, "--> If the ratio is close to 1, that means there was little overhead to profile in parallel on multiple devices compared to profiling on a single device.\n"); + PRINT(global_suppress_output, "--> If the devices have different performance, the ratio may not be close to one, and this should be limited by the slowest device.\n"); } // Free stream memory for each device - for (int i = 0; i < numDevices; i++) + for (i = 0; i < num_devices; i++) { - for (int j = 0; j < numKernels; j++) + int j; + for (j = 0; j < numKernels; j++) { - RUNTIME_API_CALL(cudaFree(deviceData[i].d_x[j])); - RUNTIME_API_CALL(cudaFree(deviceData[i].d_y[j])); + check_cuda_runtime_api_call( cudaFree(device_data[i].d_x[j]) ); + check_cuda_runtime_api_call( cudaFree(device_data[i].d_y[j]) ); 
} } -#ifdef PAPI // Display metric values - PRINT(quiet, "\nMetrics for device #0:\n"); - PRINT(quiet, "Look at the sm__cycles_elapsed.max values for each test.\n"); - PRINT(quiet, "This value represents the time spent on device to run the kernels in each case, and should be longest for the serial range, and roughly equal for the single and multi device concurrent ranges.\n"); - print_measured_values(deviceData[0], eventsSuccessfullyAdded); + PRINT(global_suppress_output, "\nMetrics for device #0:\n"); + PRINT(global_suppress_output, "Look at the cuda:::sm__cycles_elapsed:stat=max values for each test.\n"); + PRINT(global_suppress_output, "This value represents the time spent on device to run the kernels in each case, and should be longest for the serial range, and roughly equal for the single and multi device concurrent ranges.\n"); + print_measured_values(device_data[0]); // Only display next device info if needed - if (numDevices > 1) + if (num_devices > 1) { - PRINT(quiet, "\nMetrics for the remaining devices only display the multi device async case and should all be similar to the first device's values if the device has similar performance characteristics.\n"); - PRINT(quiet, "If devices have different performance characteristics, the runtime cycles calculation may vary by device.\n"); + PRINT(global_suppress_output, "\nMetrics for the remaining devices only display the multi device async case and should all be similar to the first device's values if the device has similar performance characteristics.\n"); + PRINT(global_suppress_output, "If devices have different performance characteristics, the runtime cycles calculation may vary by device.\n"); } - for (int i = 1; i < numDevices; i++) + + for (i = 1; i < num_devices; i++) { - PRINT(quiet, "\nMetrics for device #%d:\n", i); - print_measured_values(deviceData[i], eventsSuccessfullyAdded); + PRINT(global_suppress_output, "\nMetrics for device #%d:\n", i); + print_measured_values(device_data[i]); } - 
PAPI_shutdown(); - // Output a note that a multiple pass event was provided on the command line - if (numMultipassEvents > 0) { - PRINT(quiet, "\033[0;33mNOTE: From the events provided on the command line, an event or events requiring multiple passes was detected and not added to the EventSet. Check your events with utils/papi_native_avail.\n\033[0m"); + if (global_num_multipass_events > 0) { + PRINT(global_suppress_output, "\033[0;33mNOTE: From the events provided on the command line, an event or events requiring multiple passes was detected and not added to the EventSet. Check your events with utils/papi_native_avail.\n\033[0m"); } + PAPI_shutdown(); + test_pass(__FILE__); -#endif + return 0; } diff --git a/src/components/cuda/tests/concurrent_profiling_noCuCtx.cu b/src/components/cuda/tests/concurrent_profiling_noCuCtx.cu index 756602467..fda7f267b 100644 --- a/src/components/cuda/tests/concurrent_profiling_noCuCtx.cu +++ b/src/components/cuda/tests/concurrent_profiling_noCuCtx.cu @@ -1,102 +1,60 @@ -// Copyright 2021 NVIDIA Corporation. All rights reserved -// -// This sample demonstrates two ways to use the CUPTI Profiler API with concurrent kernels. -// By taking the ratio of runtimes for a consecutive series of kernels, compared -// to a series of concurrent kernels, one can difinitively demonstrate that concurrent -// kernels were running while metrics were gathered and the User Replay mechanism was in use. -// -// Example: -// 4 kernel launches, with 1x, 2x, 3x, and 4x amounts of work, each sized to one SM (one warp -// of threads, one thread block). -// When run synchronously, this comes to 10x amount of work. -// When run concurrently, the longest (4x) kernel should be the only measured time (it hides the others). -// Thus w/ 4 kernels, the concurrent : consecutive time ratio should be 4:10. -// On test hardware this does simplify to 3.998:10. 
As the test is affected by memory layout, this may not -// hold for certain architectures where, for example, cache sizes may optimize certain kernel calls. -// -// After demonstrating concurrency using multpile streams, this then demonstrates using multiple devices. -// In this 3rd configuration, the same concurrent workload with streams is then duplicated and run -// on each device concurrently using streams. -// In this case, the wallclock time to launch, run, and join the threads should be roughly the same as the -// wallclock time to run the single device case. If concurrency was not working, the wallcock time -// would be (num devices) times the single device concurrent case. -// -// * If the multiple devices have different performance, the runtime may be significantly different between -// devices, but this does not mean concurrent profiling is not happening. - -// This code has been adapted to PAPI from -// `/extras/CUPTI/samples/concurrent_profiling/cpncurrent_profiling.cu` - -#ifdef PAPI -extern "C" { - #include - #include "papi_test.h" -} -#endif - -// Standard CUDA, CUPTI, Profiler, NVPW headers -#include "cuda.h" +/** +* @file concurrent_profiling.cu +* @brief This test utilizes concurrent kernels by taking the ratio of runtimes for a consecutive series of kernels, +* compared to a series of concurrent kernels, one can definitively demonstrate that concurrent kernels +* were running while native events were gathered and the user replay mechanism was in use. cudaSetDevice +* is used to set the device which device executions will be done on. +* +* Example: 4 kernel launches, with 1x, 2x, 3x, and 4x amounts of work, each sized to one SM (one warp +* of threads, one thread block). When run synchronously, this comes to 10x amount of work. When run +* concurrently, the longest (4x) kernel should be the only measured time (it hides the others). +* Thus w/4 kernels, the concurrent : consecutive time ratio should be 4:10. 
+* On test hardware this does simplify to 3.998:10. As the test is affected by memory layout, this +* may not hold for certain architectures where, for example, cache sizes may optimize certain kernel +* calls. +* +* After demonstrating concurrent usign multiple streams, this test then demonstrates using multiple devices. +* In this 3rd configuration, the same concurrent workflow with streams is then duplicated and run +* on each device concurrently using streams. In this, case, the wallclock time to launch, run, and join +* threads should be roughly the same as the wallclock time to run the single device case. If concurrency +* was not working, the wallclock time would be (number of deivces) times the single device concurrent case. +* +* Notes: +* - This test only works with CC's >= 7.0 which follows exactly what is done +* for the concurrent_profiling.cu test in extras/CUPTI/samples/concurrent_profiling. +* +* - If the multiple devices have different performance, the runtime may be significantly different between +* devices, but this does not mean concurrent profiling is not happening. +*/ -// Standard STL headers +// Standard library headers #include #include #include - #include using ::std::string; - #include using ::std::thread; - #include using ::std::vector; - #include using ::std::find; -#define PRINT(quiet, format, args...) 
{if (!quiet) {fprintf(stderr, format, ## args);}} -int quiet; - -#ifdef PAPI -#define PAPI_CALL(apiFuncCall) \ -do { \ - int _status = apiFuncCall; \ - if (_status != PAPI_OK) { \ - fprintf(stderr, "error: function %s failed.", #apiFuncCall); \ - test_fail(__FILE__, __LINE__, "", _status); \ - } \ -} while (0) -#endif - -// Helpful error handlers for standard CUPTI and CUDA runtime calls -#define RUNTIME_API_CALL(apiFuncCall) \ -do { \ - cudaError_t _status = apiFuncCall; \ - if (_status != cudaSuccess) { \ - fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \ - __FILE__, __LINE__, #apiFuncCall, cudaGetErrorString(_status));\ - exit(EXIT_FAILURE); \ - } \ -} while (0) - -#define MEMORY_ALLOCATION_CALL(var) \ -do { \ - if (var == NULL) { \ - fprintf(stderr, "%s:%d: Error: Memory Allocation Failed \n", \ - __FILE__, __LINE__); \ - exit(EXIT_FAILURE); \ - } \ -} while (0) - -#define DRIVER_API_CALL(apiFuncCall) \ -do { \ - CUresult _status = apiFuncCall; \ - if (_status != CUDA_SUCCESS) { \ - fprintf(stderr, "%s:%d: error: function %s failed with error %d.\n", \ - __FILE__, __LINE__, #apiFuncCall, _status); \ - exit(EXIT_FAILURE); \ - } \ -} while (0) +// Cuda Toolkit headers +#include +#include + +// Internal headers +extern "C" { + #include "cuda_tests_helper.h" + #include "papi.h" + #include "papi_test.h" +} + +// Currently we are only adding cuda:::sm__cycles_active:stat=sum and cuda:::sm__cycles_elapsed:stat=max +#define MAX_EVENTS_TO_ADD 2 + +int global_suppress_output; typedef struct { @@ -106,12 +64,19 @@ typedef struct // Per-device configuration, buffers, stream and device information, and device pointers typedef struct { - int deviceID; - profilingConfig config; // Each device (or each context) needs its own CUPTI profiling config - vector streams; // Each device needs its own streams - vector d_x; // And device memory allocation - vector d_y; // .. 
- long long values[100]; // Capture PAPI measured values for each device + // For each device, store the range name + vector range_name; + // For each device, store the successfully added events + vector events_successfully_added; + // For each device (or each context) store its CUPTI profiling config + profilingConfig config; + // For each device, store its streams + vector streams; + // For each device, allocate memory + vector d_x; + vector d_y; + // For each event for a device, store PAPI counter values + vector> cuda_counter_values; } perDeviceData; #define DAXPY_REPEAT 32768 @@ -135,54 +100,46 @@ int threadBlocks = 1; // Configurable number of kernels (streams, when running concurrently) int const numKernels = 4; -int const numStreams = numKernels; +int const num_streams = numKernels; vector elements(numKernels); // Each kernel call allocates and computes (call number) * (blockSize) elements // For 4 calls, this is 4k elements * 2 arrays * (1 + 2 + 3 + 4 stream mul) * 8B/elem =~ 640KB int const blockSize = 4 * 1024; -// Globals for successfully added and multiple pass events -int numMultipassEvents = 0; -vector eventsSuccessfullyAdded; +// Globals for multiple pass events +int global_num_multipass_events = 0; -/** @class add_events_from_command_line +/** @class add_cuda_native_events * @brief Try and add each event provided on the command line by the user. * - * @param d - * Per device data. * @param EventSet * A PAPI eventset. - * @param metricNames - * Events provided on the command line. - * @param successfullyAddedEvents - * Events successfully added to the EventSet. + * @param cuda_native_event_name + * Event to add to the EventSet. + * @param &device_data + * Per device configuration. * @param *numMultipassEvents * Counter to see if a multiple pass event was provided on the command line. 
*/ -static void add_events_from_command_line(perDeviceData &d, int EventSet, vector const &metricNames, vector successfullyAddedEvents, int *numMultipassEvents) -{ - int i; - for (i = 0; i < metricNames.size(); i++) { - string evt_name = metricNames[i] + std::to_string(d.config.device); - int papi_errno = PAPI_add_named_event(EventSet, evt_name.c_str()); - if (papi_errno != PAPI_OK) { - if (papi_errno != PAPI_EMULPASS) { - fprintf(stderr, "Unable to add event %s to the EventSet with error code %d.\n", evt_name.c_str(), papi_errno); - test_skip(__FILE__, __LINE__, "", 0); - } - - // Handle multiple pass events - (*numMultipassEvents)++; - continue; - } - - // Handle successfully added events - if (find(eventsSuccessfullyAdded.begin(), eventsSuccessfullyAdded.end(), metricNames[i]) == eventsSuccessfullyAdded.end()) { - eventsSuccessfullyAdded.push_back(metricNames[i]); - } - } - +static void add_cuda_native_events_concurrent(int EventSet, string cuda_native_event_name, perDeviceData &device_data, int *numMultipassEvents) +{ + int papi_errno = PAPI_add_named_event(EventSet, cuda_native_event_name.c_str()); + if (papi_errno != PAPI_OK) { + if (papi_errno != PAPI_EMULPASS) { + fprintf(stderr, "Unable to add event %s to the EventSet with error code %d.\n", cuda_native_event_name, papi_errno); + exit(EXIT_FAILURE); + } + + // Handle multiple pass events + (*numMultipassEvents)++; + } + + // Handle successfully added events + if (find(device_data.events_successfully_added.begin(), device_data.events_successfully_added.end(), cuda_native_event_name.c_str()) == device_data.events_successfully_added.end()) { + device_data.events_successfully_added.push_back(cuda_native_event_name.c_str()); + } + return; } @@ -190,152 +147,197 @@ static void add_events_from_command_line(perDeviceData &d, int EventSet, vector< // The device streams vector is used to control which stream each call is made on // If 'serial' is non-zero, the device streams are ignored and instead the default 
stream is used void profileKernels(perDeviceData &d, - vector const &metricNames, + vector const &base_cuda_native_event_names_with_stat_qual, char const * const rangeName, bool serial) { - RUNTIME_API_CALL(cudaSetDevice(d.config.device)); // Orig code has mistake here -#ifdef PAPI - int eventset = PAPI_NULL; - PAPI_CALL(PAPI_create_eventset(&eventset)); + // Switch to desired device + check_cuda_runtime_api_call( cudaSetDevice(d.config.device) ); // Orig code has mistake here - add_events_from_command_line(d, eventset, metricNames, eventsSuccessfullyAdded, &numMultipassEvents); + int EventSet = PAPI_NULL; + check_papi_api_call( PAPI_create_eventset(&EventSet) ); + + global_num_multipass_events = 0; + int event_idx; + for (event_idx = 0; event_idx < base_cuda_native_event_names_with_stat_qual.size(); event_idx++) { + string tmp_event_name = base_cuda_native_event_names_with_stat_qual[event_idx] + ":device=" + std::to_string(d.config.device); + add_cuda_native_events_concurrent(EventSet, tmp_event_name, d, &global_num_multipass_events); + } // Only multiple pass events were provided on the command line - if (eventsSuccessfullyAdded.size() == 0) { - fprintf(stderr, "Events provided on the command line could not be added to an EventSet as they require multiple passes.\n"); - test_skip(__FILE__, __LINE__, "", 0); - } + if (d.events_successfully_added.size() == 0) { + fprintf(stderr, "Both cuda:::sm__cycles_active:stat=sum and cuda:::sm__cycles_elapsed:stat=max were unable to be added. This may be due to the architecture you are running on.\n"); + test_skip(__FILE__, __LINE__, "", 0); + } + // Internally at PAPI_start we push a range; therefore, users do not push a range + check_papi_api_call( PAPI_start(EventSet) ); - PAPI_CALL(PAPI_start(eventset)); -#endif - for (unsigned int stream = 0; stream < d.streams.size(); stream++) - { + unsigned int stream; + for (stream = 0; stream < d.streams.size(); stream++) + { cudaStream_t streamId = (serial ? 
0 : d.streams[stream]); daxpyKernel <<>> (elements[stream], a, d.d_x[stream], d.d_y[stream]); - } + } // After launching all work, synchronize all streams if (serial == false) - { - for (unsigned int stream = 0; stream < d.streams.size(); stream++) - { - RUNTIME_API_CALL(cudaStreamSynchronize(d.streams[stream])); - } - } + { + for (stream = 0; stream < d.streams.size(); stream++) + { + check_cuda_runtime_api_call( cudaStreamSynchronize(d.streams[stream]) ); + } + } else - { - RUNTIME_API_CALL(cudaStreamSynchronize(0)); - } -#ifdef PAPI - PAPI_CALL(PAPI_stop(eventset, d.values)); - PAPI_CALL(PAPI_cleanup_eventset(eventset)); - PAPI_CALL(PAPI_destroy_eventset(&eventset)); -#endif + { + check_cuda_runtime_api_call( cudaStreamSynchronize(0) ); + } + + // Internally at PAPI_stop we pop the range; therefore, users do not pop a range + long long values[MAX_EVENTS_TO_ADD]; + check_papi_api_call( PAPI_stop(EventSet, values) ); + + for (event_idx = 0; event_idx < d.events_successfully_added.size(); event_idx++) { + d.cuda_counter_values[event_idx].push_back(values[event_idx]); + } + + check_papi_api_call( PAPI_cleanup_eventset(EventSet) ); + + check_papi_api_call( PAPI_destroy_eventset(&EventSet) ); + + // Keep track of the range name, again PAPI internally has defined a range name, but + // for the sake of following the CUPTI test, we will use their range names. 
+ d.range_name.push_back(rangeName); } -void print_measured_values(perDeviceData &d, vector const &metricNames) +void print_measured_values(perDeviceData &d) { - string evt_name; - PRINT(quiet, "PAPI event name\t\t\t\t\t\t\tMeasured value\n"); - PRINT(quiet, "%s\n", std::string(80, '-').c_str()); - for (int i=0; i < metricNames.size(); i++) { - evt_name = metricNames[i] + std::to_string(d.config.device); - PRINT(quiet, "%s\t\t\t%lld\n", evt_name.c_str(), d.values[i]); + PRINT(global_suppress_output, "%s\n", std::string(200, '-').c_str()); + int event_idx; + for (event_idx = 0; event_idx < d.events_successfully_added.size(); event_idx++) { + int range_idx; + for (range_idx = 0; range_idx < d.range_name.size(); range_idx++) { + PRINT(global_suppress_output, "Range %s with event %s produced the value:\t\t%lld\n", d.range_name[range_idx].c_str(), d.events_successfully_added[event_idx].c_str(), d.cuda_counter_values[event_idx][range_idx]); + } } } +static void print_help_message(void) +{ + printf("./concurrent_profiling_noCuCtx\n"); + printf("Notes:\n" + "1. This test is specifically designed to use devices that support CUPTI Profiling i.e. devices with CCs >= 7.0.\n" + "2. No events are accepted from the command line as cuda:::sm_cycles_active:stat=sum, cuda:::sm__cycles_elapsed:stat=max,\n" + " and cuda:::smsp__sass_thread_inst_executed_op_dfma_pred_on:stat=sum are required events.\n"); +} + int main(int argc, char **argv) { - quiet = 0; - int i; -#ifdef PAPI - char *test_quiet = getenv("PAPI_CUDA_TEST_QUIET"); - if (test_quiet) - quiet = (int) strtol(test_quiet, (char**) NULL, 10); - - int event_count = argc - 1; - /* if no events passed at command line, just report test skipped. 
*/ - if (event_count == 0) { - fprintf(stderr, "No eventnames specified at command line.\n"); - test_skip(__FILE__, __LINE__, "", 0); + char *papi_cuda_api = getenv("PAPI_CUDA_API"); + if (papi_cuda_api != NULL) { + fprintf(stderr, "The concurrent_profiling test only works with the Perfworks Metrics API. Unset the environment variable PAPI_CUDA_API.\n"); + test_skip(__FILE__, __LINE__, "", 0); + } + + // Determine the number of Cuda capable devices + int num_devices = 0; + check_cuda_runtime_api_call( cudaGetDeviceCount(&num_devices) ); + + // No devices detected on the machine, exit + if (num_devices < 1) { + fprintf(stderr, "No NVIDIA devices found on the machine. This is required for the test to run.\n"); + test_skip(__FILE__, __LINE__, "", 0); } - vector metricNames; - for (i=0; i < event_count; i++) { - metricNames.push_back(argv[i+1]); + global_suppress_output = 0; + char *user_defined_suppress_output = getenv("PAPI_CUDA_TEST_QUIET"); + if (user_defined_suppress_output) { + global_suppress_output = (int) strtol(user_defined_suppress_output, (char**) NULL, 10); } + PRINT(global_suppress_output, "Running the cuda component test concurrent_profiling_noCuCtx.cu\n") + + // User either provided --help or an argument that would not be useful to this test + if (argc > 1) { + print_help_message(); + exit(EXIT_SUCCESS); + } // Initialize the PAPI library - if (PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT) { - test_fail(__FILE__, __LINE__, "PAPI_library_init failed.", 0); + int papi_errno = PAPI_library_init(PAPI_VER_CURRENT); + if (papi_errno != PAPI_VER_CURRENT) { + test_fail(__FILE__, __LINE__, "PAPI_library_init()", papi_errno); } -#else - vector metricNames = {""}; -#endif + PRINT(global_suppress_output, "PAPI version being used for this test: %d.%d.%d\n", + PAPI_VERSION_MAJOR(PAPI_VERSION), + PAPI_VERSION_MINOR(PAPI_VERSION), + PAPI_VERSION_REVISION(PAPI_VERSION)); - int numDevices; - RUNTIME_API_CALL(cudaGetDeviceCount(&numDevices)); + int cuda_cmp_idx 
= PAPI_get_component_index("cuda"); + if (cuda_cmp_idx < 0) { + test_fail(__FILE__, __LINE__, "PAPI_get_component_index()", cuda_cmp_idx); + } + PRINT(global_suppress_output, "The cuda component is assigned to component index: %d\n", cuda_cmp_idx); // Per-device information vector device_ids; - - // Find all devices capable of running CUPTI Profiling (Compute Capability >= 7.0) - for (i = 0; i < numDevices; i++) + // Find all devices capable of running CUPTI Profiling (CC >= 7.0) + int dev_idx; + for (dev_idx = 0; dev_idx < num_devices; dev_idx++) { - // Get device properties + // Obtain major compute capability int major; - RUNTIME_API_CALL(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, i)); - if (major >= 7) - { - // Record device number - device_ids.push_back(i); - } - } - - numDevices = device_ids.size(); - PRINT(quiet, "Found %d compatible devices\n", numDevices); - - // Ensure we found at least one device - if (numDevices == 0) - { - fprintf(stderr, "No devices detected compatible with CUPTI Profiling (Compute Capability >= 7.0)\n"); -#ifdef PAPI - test_skip(__FILE__, __LINE__, "", 0); -#endif - } + check_cuda_runtime_api_call( cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev_idx) ); + if (major >= 7) { + PRINT(global_suppress_output, "--> Device %d is compatible with the concurrent_profiling_noCuCtx test\n", dev_idx); + device_ids.push_back(dev_idx); + } + } + if (device_ids.size() == 0) { + fprintf(stderr, "No devices on the machine detected that have CC >= 7.0 and support CUPTI Profiling.\n"); + test_skip(__FILE__, __LINE__, "", 0); + } + + // Overwrite num_devices with the number of devices this test actually supports + num_devices = device_ids.size(); // Initialize kernel input to some known numbers vector h_x(blockSize * numKernels); vector h_y(blockSize * numKernels); - for (size_t i = 0; i < blockSize * numKernels; i++) - { + size_t i; + for (i = 0; i < blockSize * numKernels; i++) { h_x[i] = 1.5 * i; h_y[i] = 
2.0 * (i - 3000); } // Initialize a vector of 'default stream' values to demonstrate serialized kernels - vector defaultStreams(numStreams); - for (int stream = 0; stream < numStreams; stream++) - { - defaultStreams[stream] = 0; + vector default_streams(num_streams); + int stream; + for (stream = 0; stream < num_streams; stream++) { + default_streams[stream] = 0; } // Scale per-kernel work by stream number - for (int stream = 0; stream < numStreams; stream++) - { + for (stream = 0; stream < num_streams; stream++) { elements[stream] = blockSize * (stream + 1); } - // For each device, configure profiling, set up buffers, copy kernel data - vector deviceData(numDevices); + // These metrics below are hardcoded in the CUPTI provided sample code and therefore this test was + // written with them in mind. No command line arguments will be accepted. The test will be skipped + // if none of them can be successfully added. + vector base_cuda_native_event_names_with_stat_qual; + // The below two metrics will demonstrate whether kernels within a Range were run serially or concurrently. + base_cuda_native_event_names_with_stat_qual.push_back("cuda:::sm__cycles_active:stat=sum"); + base_cuda_native_event_names_with_stat_qual.push_back("cuda:::sm__cycles_elapsed:stat=max"); + // This metric shows that the same number of flops were executed on each run. 
+ //base_cuda_native_event_names_with_stat_qual.push_back("cuda:::smsp__sass_thread_inst_executed_op_dfma_pred_on:stat=sum"); - for (int device = 0; device < numDevices; device++) + // For each device, configure profiling, set up buffers, copy kernel data + vector device_data(num_devices); + int device; + for (device = 0; device < num_devices; device++) { int device_id = device_ids[device]; - RUNTIME_API_CALL(cudaSetDevice(device_id)); - PRINT(quiet, "Configuring device %d\n", device_id); - deviceData[device].deviceID = device_id; + check_cuda_runtime_api_call( cudaSetDevice(device_id) ); + PRINT(global_suppress_output, "--> Configuring device %d\n", device_id); // Required CUPTI Profiling configuration & initialization // Can be done ahead of time or immediately before startSession() call @@ -343,151 +345,162 @@ int main(int argc, char **argv) // For simplicity's sake, in this sample, a single config struct is created per device and passed to each CUPTI Profiler API call // For more complex cases, each combination of CUPTI Profiler Session and Config requires additional initialization profilingConfig config; - config.device = device_id; // Device ID, used to get device name for metrics enumeration + // Device ID, used to get device name for metrics enumeration + config.device = device_id; // config.maxLaunchesPerPass = 1; // Must be >= maxRangesPerPass. 
Set this to the largest count of kernel launches which may be encountered in any Pass in this Session - // // Device 0 has max of 3 passes; other devices only run one pass in this sample code - deviceData[device].config = config;// Save this device config + // Device 0 has max of 3 passes; other devices only run one pass in this sample code + device_data[device].config = config;// Save this device config // Initialize CUPTI Profiling structures + // targetInitProfiling(device_data[device], base_cuda_native_event_names_with_stat_qual); + // Per-stream initialization & memory allocation - copy from constant host array to each device array - deviceData[device].streams.resize(numStreams); - deviceData[device].d_x.resize(numStreams); - deviceData[device].d_y.resize(numStreams); - for (int stream = 0; stream < numStreams; stream++) + device_data[device].streams.resize(num_streams); + device_data[device].d_x.resize(num_streams); + device_data[device].d_y.resize(num_streams); + // Resize the vector of vectors for the number of events we have + device_data[device].cuda_counter_values.resize( base_cuda_native_event_names_with_stat_qual.size()); + for (stream = 0; stream < num_streams; stream++) { - RUNTIME_API_CALL(cudaStreamCreate(&(deviceData[device].streams[stream]))); + // Create an asynchronous stream + check_cuda_runtime_api_call( cudaStreamCreate(&(device_data[device].streams[stream])) ); // Each kernel does (stream #) * blockSize work on doubles size_t size = elements[stream] * sizeof(double); - RUNTIME_API_CALL(cudaMalloc(&(deviceData[device].d_x[stream]), size)); - MEMORY_ALLOCATION_CALL(deviceData[device].d_x[stream]); // Validate pointer - RUNTIME_API_CALL(cudaMemcpy(deviceData[device].d_x[stream], h_x.data(), size, cudaMemcpyHostToDevice)); + check_cuda_runtime_api_call( cudaMalloc(&(device_data[device].d_x[stream]), size) ); + check_memory_allocation_call( device_data[device].d_x[stream] ); + check_cuda_runtime_api_call( 
cudaMemcpy(device_data[device].d_x[stream], h_x.data(), size, cudaMemcpyHostToDevice) ); - RUNTIME_API_CALL(cudaMalloc(&(deviceData[device].d_y[stream]), size)); - MEMORY_ALLOCATION_CALL(deviceData[device].d_y[stream]); // Validate pointer - RUNTIME_API_CALL(cudaMemcpy(deviceData[device].d_y[stream], h_x.data(), size, cudaMemcpyHostToDevice)); + check_cuda_runtime_api_call( cudaMalloc(&(device_data[device].d_y[stream]), size) ); + check_memory_allocation_call( device_data[device].d_y[stream] ); + check_cuda_runtime_api_call( cudaMemcpy(device_data[device].d_y[stream], h_x.data(), size, cudaMemcpyHostToDevice) ); } } - // - // First version - single device, kernel calls serialized on default stream - // + // Formatting print statement + PRINT(global_suppress_output, "%s\n", std::string(200, '-').c_str()); + //////////////////////////////////////////////////////////////////////////////// + // First Version - single device, kernel calls serialized on default stream. // + ////////////////////////////////////////////////////////////////////////////// + // Use wallclock time to measure performance auto begin_time = ::std::chrono::high_resolution_clock::now(); // Run on first device and use default streams, which run serially - profileKernels(deviceData[0], metricNames, "single_device_serial", true); + profileKernels(device_data[0], base_cuda_native_event_names_with_stat_qual, "single_device_serial", true); auto end_time = ::std::chrono::high_resolution_clock::now(); - int elapsed_serial_ms = ::std::chrono::duration_cast<::std::chrono::milliseconds>(end_time - begin_time).count(); + auto elapsed_serial_ms = ::std::chrono::duration_cast<::std::chrono::milliseconds>(end_time - begin_time).count(); int numBlocks = 0; - for (int i = 1; i <= numKernels; i++) - { + for (i = 1; i <= numKernels; i++) + { numBlocks += i; - } - PRINT(quiet, "It took %d ms on the host to profile %d kernels in serial.", elapsed_serial_ms, numKernels); + } + PRINT(global_suppress_output, "It took %d 
ms on the host to profile %d kernels in serial.\n", elapsed_serial_ms, numKernels); - // - // Second version - same kernel calls as before on the same device, but now using separate streams for concurrency - // (Should be limited by the longest running kernel) - // + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Second version - same kernel calls as before on the same device, but now using separate streams for concurrency, // + // Should be limited by the longest running kernel. // + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // User wallclock time to measure performance begin_time = ::std::chrono::high_resolution_clock::now(); // Still only use first device, but this time use its allocated streams for parallelism - profileKernels(deviceData[0], metricNames, "single_device_async", false); + profileKernels(device_data[0], base_cuda_native_event_names_with_stat_qual, "single_device_async", false); end_time = ::std::chrono::high_resolution_clock::now(); int elapsed_single_device_ms = ::std::chrono::duration_cast<::std::chrono::milliseconds>(end_time - begin_time).count(); - PRINT(quiet, "It took %d ms on the host to profile %d kernels on a single device on separate streams.", elapsed_single_device_ms, numKernels); - PRINT(quiet, "--> If the separate stream wallclock time is less than the serial version, the streams were profiling concurrently.\n"); - - // - // Third version - same as the second case, but duplicates the concurrent work across devices to show cross-device concurrency - // This is done using devices so no serialization is needed between devices - // (Should have roughly the same wallclock time as second case if the devices have similar performance) - // - - if (numDevices == 1) - { - PRINT(quiet, "Only one compatible device found; skipping the multi-threaded test.\n"); - } + 
PRINT(global_suppress_output, "It took %d ms on the host to profile %d kernels on a single device on separate streams.\n", elapsed_single_device_ms, numKernels); + PRINT(global_suppress_output, "--> If the separate stream wallclock time is less than the serial version, the streams were profiling concurrently.\n"); + + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Third version - same as the second case, but duplicates the concurrent work across devices to show cross-device concurrency. // + // This is done using devices so no serialization is needed between devices. // + // Should have roughly the same wallclock time as the second case if the devices have similar performance. // + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + // The third version can only be ran if we have more than one compatible device found + if (device_ids.size() == 1) + { + PRINT(global_suppress_output, "Only one compatible device found; skipping the multi-threaded test.\n"); + } else - { -#ifdef PAPI - int papi_errno = PAPI_thread_init((unsigned long (*)(void)) std::this_thread::get_id); - if ( papi_errno != PAPI_OK ) { - test_fail(__FILE__, __LINE__, "Error setting thread id function.\n", papi_errno); - } -#endif - PRINT(quiet, "Running on %d devices, one thread per device.\n", numDevices); + { + // Initialize PAPI thread support + check_papi_api_call( PAPI_thread_init((unsigned long (*)(void)) std::this_thread::get_id) ); + + // Formatting print statement + PRINT(global_suppress_output, "\n"); + PRINT(global_suppress_output, "Running on %d devices, one thread per device.\n", num_devices); // Time creation of the same multiple streams (on multiple devices, if possible) vector<::std::thread> threads; begin_time = ::std::chrono::high_resolution_clock::now(); // Now launch parallel thread work, duplicated on one thread 
per device - for (int thread = 0; thread < numDevices; thread++) - { - threads.push_back(::std::thread(profileKernels, ::std::ref(deviceData[thread]), metricNames, "multi_device_async", false)); - } + int thread; + for (thread = 0; thread < num_devices; thread++) + { + threads.push_back(::std::thread(profileKernels, ::std::ref(device_data[thread]), base_cuda_native_event_names_with_stat_qual, "multi_device_async", false)); + } // Wait for all threads to finish for (auto &t: threads) - { + { t.join(); - } + } // Record time used when launching on multiple devices end_time = ::std::chrono::high_resolution_clock::now(); int elapsed_multiple_device_ms = ::std::chrono::duration_cast<::std::chrono::milliseconds>(end_time - begin_time).count(); - double ratio = elapsed_multiple_device_ms / (double) elapsed_single_device_ms; - PRINT(quiet, "It took %d ms on the host to profile the same %d kernels on each of the %d devices in parallel\n", elapsed_multiple_device_ms, numKernels, numDevices); - PRINT(quiet, "--> Wallclock ratio of parallel device launch to single device launch is %f\n", ratio); - PRINT(quiet, "--> If the ratio is close to 1, that means there was little overhead to profile in parallel on multiple devices compared to profiling on a single device.\n"); - PRINT(quiet, "--> If the devices have different performance, the ratio may not be close to one, and this should be limited by the slowest device.\n"); - } + PRINT(global_suppress_output, "It took %d ms on the host to profile the same %d kernels on each of the %d devices in parallel\n", elapsed_multiple_device_ms, numKernels, num_devices); + PRINT(global_suppress_output, "--> Wallclock ratio of parallel device launch to single device launch is %f\n", elapsed_multiple_device_ms / (double) elapsed_single_device_ms); + PRINT(global_suppress_output, "--> If the ratio is close to 1, that means there was little overhead to profile in parallel on multiple devices compared to profiling on a single device.\n"); + 
PRINT(global_suppress_output, "--> If the devices have different performance, the ratio may not be close to one, and this should be limited by the slowest device.\n"); + } // Free stream memory for each device - for (int i = 0; i < numDevices; i++) - { - for (int j = 0; j < numKernels; j++) - { - RUNTIME_API_CALL(cudaFree(deviceData[i].d_x[j])); - RUNTIME_API_CALL(cudaFree(deviceData[i].d_y[j])); - } - } + for (i = 0; i < num_devices; i++) + { + int j; + for (j = 0; j < numKernels; j++) + { + check_cuda_runtime_api_call( cudaFree(device_data[i].d_x[j]) ); + check_cuda_runtime_api_call( cudaFree(device_data[i].d_y[j]) ); + } + } -#ifdef PAPI // Display metric values - PRINT(quiet, "\nMetrics for device #0:\n"); - PRINT(quiet, "Look at the sm__cycles_elapsed.max values for each test.\n"); - PRINT(quiet, "This value represents the time spent on device to run the kernels in each case, and should be longest for the serial range, and roughly equal for the single and multi device concurrent ranges.\n"); - print_measured_values(deviceData[0], eventsSuccessfullyAdded); + PRINT(global_suppress_output, "\nMetrics for device #0:\n"); + PRINT(global_suppress_output, "Look at the cuda:::sm__cycles_elapsed:stat=max values for each test.\n"); + PRINT(global_suppress_output, "This value represents the time spent on device to run the kernels in each case, and should be longest for the serial range, and roughly equal for the single and multi device concurrent ranges.\n"); + print_measured_values(device_data[0]); // Only display next device info if needed - if (numDevices > 1) - { - PRINT(quiet, "\nMetrics for the remaining devices only display the multi device async case and should all be similar to the first device's values if the device has similar performance characteristics.\n"); - PRINT(quiet, "If devices have different performance characteristics, the runtime cycles calculation may vary by device.\n"); - } - for (int i = 1; i < numDevices; i++) - { - PRINT(quiet, "\nMetrics for 
device #%d:\n", i); - print_measured_values(deviceData[i], eventsSuccessfullyAdded); - } - - PAPI_shutdown(); + if (num_devices > 1) + { + PRINT(global_suppress_output, "\nMetrics for the remaining devices only display the multi device async case and should all be similar to the first device's values if the device has similar performance characteristics.\n"); + PRINT(global_suppress_output, "If devices have different performance characteristics, the runtime cycles calculation may vary by device.\n"); + } + + for (i = 1; i < num_devices; i++) + { + PRINT(global_suppress_output, "\nMetrics for device #%d:\n", i); + print_measured_values(device_data[i]); + } // Output a note that a multiple pass event was provided on the command line - if (numMultipassEvents > 0) { - PRINT(quiet, "\033[0;33mNOTE: From the events provided on the command line, an event or events requiring multiple passes was detected and not added to the EventSet. Check your events with utils/papi_native_avail.\n\033[0m"); - } + if (global_num_multipass_events > 0) { + PRINT(global_suppress_output, "\033[0;33mNOTE: From the events provided on the command line, an event or events requiring multiple passes was detected and not added to the EventSet. Check your events with utils/papi_native_avail.\n\033[0m"); + } + + PAPI_shutdown(); test_pass(__FILE__); -#endif + return 0; } diff --git a/src/components/cuda/tests/cudaOpenMP.cu b/src/components/cuda/tests/cudaOpenMP.cu index c84e10f9d..44a74f339 100644 --- a/src/components/cuda/tests/cudaOpenMP.cu +++ b/src/components/cuda/tests/cudaOpenMP.cu @@ -1,283 +1,267 @@ -/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of NVIDIA CORPORATION nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - * Multi-GPU sample using OpenMP for threading on the CPU side - * needs a compiler that supports OpenMP 2.0 - */ - -#ifdef PAPI -#include -#include "papi_test.h" - -#define PAPI_CALL(apiFuncCall) \ -do { \ - int _status = apiFuncCall; \ - if (_status != PAPI_OK) { \ - fprintf(stderr, "error: function %s failed.", #apiFuncCall); \ - test_fail(__FILE__, __LINE__, "", _status); \ - } \ -} while (0) - -#endif +/** +* @file cudaOpenMP.cu +* @brief For all NVIDIA devices detected on the machine create a matching thread +* for it using OpenMP. Even though a thread is created for all NVIDIA devices, +* cuCtxCreate will be called only for enabled devices. 
+* +* Note: The cuda component supports being partially disabled, meaning that certain devices +* will not be "enabled" to profile on. If PAPI_CUDA_API is not set, then devices with +* CC's >= 7.0 will be used and if PAPI_CUDA_API is set to LEGACY then devices with +* CC's <= 7.0 will be used. +* +* For each enabled device, their matching thread will have a workflow of: +* 1. Creating an EventSet +* 2. Adding events to the EventSet +* 3. Starting the EventSet +* 4. Stopping the EventSet +* +* Finally, a compiler that supports OpenMP 2.0 is needed. +*/ +// Standard library headers +#include +#include +// Internal headers +#include "cuda_tests_helper.h" #include "gpu_work.h" -#include -#include // stdio functions are used since C++ streams aren't necessarily thread safe - -#define PRINT(quiet, format, args...) {if (!quiet) {fprintf(stderr, format, ## args);}} -int quiet; - -#define RUNTIME_API_CALL(apiFuncCall) \ -do { \ - cudaError_t _status = apiFuncCall; \ - if (_status != cudaSuccess) { \ - fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \ - __FILE__, __LINE__, #apiFuncCall, cudaGetErrorString(_status));\ - exit(EXIT_FAILURE); \ - } \ -} while (0) - -#define DRIVER_API_CALL(apiFuncCall) \ -do { \ - CUresult _status = apiFuncCall; \ - if (_status != CUDA_SUCCESS) { \ - fprintf(stderr, "%s:%d: error: function %s failed with error %d.\n", \ - __FILE__, __LINE__, #apiFuncCall, _status); \ - exit(EXIT_FAILURE); \ - } \ -} while (0) +#include "papi.h" +#include "papi_test.h" #define MAX_THREADS (32) -/** @class add_events_from_command_line - * @brief Try and add each event provided on the command line by the user. - * - * @param EventSet - * A PAPI eventset. - * @param totalEventCount - * Number of events from the command line. - * @param gpu_id - * NVIDIA device index. - * @param **eventNamesFromCommandLine - * Events provided on the command line. - * @param *numEventsSuccessfullyAdded - * Total number of successfully added events. 
- * @param **eventsSuccessfullyAdded - * Events that we are able to add to the EventSet. - * @param *numMultipassEvents - * Counter to see if a multiple pass event was provided on the command line. -*/ -static void add_events_from_command_line(int EventSet, int totalEventCount, int gpu_id, char **eventNamesFromCommandLine, int *numEventsSuccessfullyAdded, char **eventsSuccessfullyAdded, int *numMultipassEvents) +static void print_help_message(void) +{ + printf("./cudaOpenMP --cuda-native-event-names [list of cuda native event names separated by a comma].\n" + "Notes:\n" + "1. Native event names must not have the device qualifier appended.\n"); +} + +static void parse_and_assign_args(int argc, char *argv[], char ***cuda_native_event_names, int *total_event_count) { int i; - for (i = 0; i < totalEventCount; i++) { - char tmpEventName[PAPI_MAX_STR_LEN]; - int strLen = snprintf(tmpEventName, PAPI_MAX_STR_LEN, "%s:device=%d", eventNamesFromCommandLine[i], gpu_id); - if (strLen < 0 || strLen >= PAPI_MAX_STR_LEN) { - fprintf(stderr, "Failed to fully write event name with appended device qualifier.\n"); - test_skip(__FILE__, __LINE__, "", 0); + for (i = 1; i < argc; ++i) + { + char *arg = argv[i]; + if (strcmp(arg, "--help") == 0) + { + print_help_message(); + exit(EXIT_SUCCESS); } - - int papi_errno = PAPI_add_named_event(EventSet, tmpEventName); - if (papi_errno != PAPI_OK) { - if (papi_errno != PAPI_EMULPASS) { - fprintf(stderr, "Unable to add event %s to the EventSet with error code %d.\n", tmpEventName, papi_errno); - test_skip(__FILE__, __LINE__, "", 0); + else if (strcmp(arg, "--cuda-native-event-names") == 0) + { + if (!argv[i + 1]) + { + printf("ERROR!! 
--cuda-native-event-names given, but no events listed.\n"); + exit(EXIT_FAILURE); } - // Handle multiple pass events - (*numMultipassEvents)++; - continue; + char **cmd_line_native_event_names = NULL; + const char *cuda_native_event_name = strtok(argv[i+1], ","); + while (cuda_native_event_name != NULL) + { + if (strstr(cuda_native_event_name, ":device")) { + fprintf(stderr, "Cuda native event name must not have a device qualifier appended for this test, i.e. no :device=#.\n"); + print_help_message(); + exit(EXIT_FAILURE); + } + + cmd_line_native_event_names = (char **) realloc(cmd_line_native_event_names, ((*total_event_count) + 1) * sizeof(char *)); + check_memory_allocation_call(cmd_line_native_event_names); + + cmd_line_native_event_names[(*total_event_count)] = (char *) malloc(PAPI_2MAX_STR_LEN * sizeof(char)); + check_memory_allocation_call(cmd_line_native_event_names[(*total_event_count)]); + + int strLen = snprintf(cmd_line_native_event_names[(*total_event_count)], PAPI_2MAX_STR_LEN, "%s", cuda_native_event_name); + if (strLen < 0 || strLen >= PAPI_2MAX_STR_LEN) { + fprintf(stderr, "Failed to fully write cuda native event name.\n"); + exit(EXIT_FAILURE); + } + + (*total_event_count)++; + cuda_native_event_name = strtok(NULL, ","); + } + i++; + *cuda_native_event_names = cmd_line_native_event_names; } - - // Handle successfully added events - strLen = snprintf(eventsSuccessfullyAdded[(*numEventsSuccessfullyAdded)], PAPI_MAX_STR_LEN, "%s", tmpEventName); - if (strLen < 0 || strLen >= PAPI_MAX_STR_LEN) { - fprintf(stderr, "Failed to fully write successfully added event.\n"); - test_skip(__FILE__, __LINE__, "", 0); + else + { + print_help_message(); + exit(EXIT_FAILURE); } - (*numEventsSuccessfullyAdded)++; } - - return; } int main(int argc, char *argv[]) { - quiet = 0; -#ifdef PAPI - char *test_quiet = getenv("PAPI_CUDA_TEST_QUIET"); - if (test_quiet) - quiet = (int) strtol(test_quiet, (char**) NULL, 10); - - int event_count = argc - 1; - /* if no events 
passed at command line, just report test skipped. */ - if (event_count == 0) { - fprintf(stderr, "No eventnames specified at command line.\n"); - test_skip(__FILE__, __LINE__, "", 0); + // Determine the number of Cuda capable devices + int num_devices = 0; + check_cuda_runtime_api_call( cudaGetDeviceCount(&num_devices) ); + + // No devices detected on the machine, exit + if (num_devices < 1) { + fprintf(stderr, "No NVIDIA devices found on the machine. This is required for the test to run.\n"); + exit(EXIT_FAILURE); } -#endif - - int num_gpus = 0, i; - CUcontext ctx_arr[MAX_THREADS]; - RUNTIME_API_CALL(cudaGetDeviceCount(&num_gpus)); // determine the number of CUDA capable GPUs + int suppress_output = 0; + char *user_defined_suppress_output = getenv("PAPI_CUDA_TEST_QUIET"); + if (user_defined_suppress_output) { + suppress_output = (int) strtol(user_defined_suppress_output, (char**) NULL, 10); + } + PRINT(suppress_output, "Running the cuda component test cudaOpenMP.cu\n"); + + char **cuda_native_event_names = NULL; + // If command line arguments are provided then get their values. 
+ int total_event_count = 0; + if (argc > 1) { + parse_and_assign_args(argc, argv, &cuda_native_event_names, &total_event_count); + } - if (num_gpus < 1) { - fprintf(stderr, "no CUDA capable devices were detected\n"); -#ifdef PAPI - test_skip(__FILE__, __LINE__, "", 0); -#endif - return 0; + // Initialize the PAPI library + int papi_errno = PAPI_library_init( PAPI_VER_CURRENT ); + if ( papi_errno != PAPI_VER_CURRENT ) { + test_fail(__FILE__, __LINE__, "PAPI_library_init failed()", papi_errno); } - ///////////////////////////////////////////////////////////////// - // display CPU and GPU configuration - // - PRINT(quiet, "number of host CPUs:\t%d\n", omp_get_num_procs()); - PRINT(quiet, "number of CUDA devices:\t%d\n", num_gpus); - - for (i = 0; i < num_gpus; i++) { - cudaDeviceProp dprop; - RUNTIME_API_CALL(cudaGetDeviceProperties(&dprop, i)); - PRINT(quiet, " %d: %s\n", i, dprop.name); + PRINT(suppress_output, "PAPI version being used for this test: %d.%d.%d\n", + PAPI_VERSION_MAJOR(PAPI_VERSION), + PAPI_VERSION_MINOR(PAPI_VERSION), + PAPI_VERSION_REVISION(PAPI_VERSION)); + + // Initialize thread support in PAPI + check_papi_api_call( PAPI_thread_init((unsigned long (*)(void)) omp_get_thread_num) ); + + // Verify the cuda component has been compiled in + int cuda_cmp_idx = PAPI_get_component_index("cuda"); + if (cuda_cmp_idx < 0) { + test_fail(__FILE__, __LINE__, "PAPI_get_component_index()", cuda_cmp_idx); } - int num_threads = (num_gpus > MAX_THREADS) ? 
MAX_THREADS : num_gpus; - // Create a gpu context for every thread - for (i=0; i < num_threads; i++) { - int flags = 0; - CUdevice device = i % num_gpus; -#if defined(CUDA_TOOLKIT_GE_13) - DRIVER_API_CALL( cuCtxCreate(&(ctx_arr[i]), (CUctxCreateParams*)0, flags, device) ); -#else - DRIVER_API_CALL( cuCtxCreate(&(ctx_arr[i]), flags, device) ); -#endif - DRIVER_API_CALL(cuCtxPopCurrent(&(ctx_arr[i]))); + PRINT(suppress_output, "The cuda component is assigned to component index: %d\n", cuda_cmp_idx); + + // Initialize the Cuda component + int cuda_eventcode = 0 | PAPI_NATIVE_MASK; + check_papi_api_call( PAPI_enum_cmp_event(&cuda_eventcode, PAPI_ENUM_FIRST, cuda_cmp_idx) ); + + // If we have not gotten an event via the command line, use the event obtained from PAPI_enum_cmp_event + if (total_event_count == 0) { + int num_spaces_to_allocate = 1; + cuda_native_event_names = (char **) malloc(num_spaces_to_allocate * sizeof(char *)); + check_memory_allocation_call( cuda_native_event_names ); + + cuda_native_event_names[total_event_count] = (char *) malloc(PAPI_2MAX_STR_LEN * sizeof(char)); + check_memory_allocation_call( cuda_native_event_names[total_event_count] ); + + check_papi_api_call( PAPI_event_code_to_name(cuda_eventcode, cuda_native_event_names[total_event_count++]) ); } - PRINT(quiet, "---------------------------\n"); -#ifdef PAPI - int papi_errno = PAPI_library_init( PAPI_VER_CURRENT ); - if ( papi_errno != PAPI_VER_CURRENT ) { - test_fail(__FILE__, __LINE__, "PAPI_library_init failed.", 0); + const PAPI_component_info_t *cmpInfo = PAPI_get_component_info(cuda_cmp_idx); + if (cmpInfo == NULL) { + fprintf(stderr, "Call to PAPI_get_component_info failed.\n"); + exit(EXIT_FAILURE); + } + + // Check to see if the Cuda component is partially disabled + if (cmpInfo->partially_disabled) { + const char *cc_support = (getenv("PAPI_CUDA_API") != NULL) ? "<=7.0" : ">=7.0"; + PRINT(suppress_output, "\033[33mThe cuda component is partially disabled. 
Only support for CC's %s are enabled.\033[0m\n", cc_support); } - PAPI_CALL(PAPI_thread_init((unsigned long (*)(void)) omp_get_thread_num)); -#endif - omp_lock_t lock; - omp_init_lock(&lock); + // Determine the number of threads we will launch based off the number of + // Cuda devices on the machine (max of 32). + int num_threads_and_devs = (num_devices > MAX_THREADS) ? MAX_THREADS : num_devices; + omp_set_num_threads(num_threads_and_devs); + PRINT(suppress_output, "Total number of threads to be launched: %d\n", num_threads_and_devs); + int i, thread_and_dev_idx, event_idx, numMultipassEvents = 0; + #pragma omp parallel for private(event_idx) reduction(+:numMultipassEvents) + for (thread_and_dev_idx = 0; thread_and_dev_idx < num_threads_and_devs; thread_and_dev_idx++) { + const PAPI_component_info_t *cmpInfo = PAPI_get_component_info(cuda_cmp_idx); + if (cmpInfo == NULL) { + fprintf(stderr, "Call to PAPI_get_component_info failed.\n"); + exit(EXIT_FAILURE); + } - PRINT(quiet, "Launching %d threads.\n", num_threads); - omp_set_num_threads(num_threads); // create as many CPU threads as there are CUDA devices - int numMultipassEvents = 0; -#pragma omp parallel - { - unsigned int cpu_thread_id = omp_get_thread_num(); - unsigned int num_cpu_threads = omp_get_num_threads(); - PRINT(quiet, "cpu_thread_id %u, num_cpu_threads %u, num_threads %d, num_gpus %d\n", cpu_thread_id, num_cpu_threads, num_threads, num_gpus); + if (cmpInfo->partially_disabled) { + // Device is not enabled continue + if (determine_if_device_is_enabled(thread_and_dev_idx) == 0) { + continue; + } + } - DRIVER_API_CALL(cuCtxPushCurrent(ctx_arr[cpu_thread_id])); -#ifdef PAPI - int gpu_id = cpu_thread_id % num_gpus; + CUcontext ctx; + int flags = 0; + CUdevice device = thread_and_dev_idx; +#if defined(CUDA_TOOLKIT_GE_13) + check_cuda_driver_api_call( cuCtxCreate(&ctx, (CUctxCreateParams*)0, flags, device) ); +#else + check_cuda_driver_api_call( cuCtxCreate(&ctx, flags, device) ); +#endif int EventSet = PAPI_NULL; - long long values[MAX_THREADS]; - int j, 
errno; - PAPI_CALL(PAPI_create_eventset(&EventSet)); - PRINT(quiet, "CPU thread %d (of %d) uses CUDA device %d with context %p @ eventset %d\n", cpu_thread_id, num_cpu_threads, gpu_id, ctx_arr[cpu_thread_id], EventSet); - - int numEventsSuccessfullyAdded = 0; - char **eventsSuccessfullyAdded, **metricNames = argv + 1; - eventsSuccessfullyAdded = (char **) malloc(event_count * sizeof(char *)); - if (eventsSuccessfullyAdded == NULL) { - fprintf(stderr, "Failed to allocate memory for successfully added events.\n"); - test_skip(__FILE__, __LINE__, "", 0); - } - for (i = 0; i < event_count; i++) { - eventsSuccessfullyAdded[i] = (char *) malloc(PAPI_MAX_STR_LEN * sizeof(char)); - if (eventsSuccessfullyAdded[i] == NULL) { - fprintf(stderr, "Failed to allocate memory for command line argument.\n"); - test_skip(__FILE__, __LINE__, "", 0); + check_papi_api_call( PAPI_create_eventset(&EventSet) ); + + int num_events_successfully_added = 0; + char **events_successfully_added = (char **) malloc(total_event_count * sizeof(char *)); + check_memory_allocation_call(events_successfully_added); + + for (event_idx = 0; event_idx < total_event_count; event_idx++) { + char tmp_event_name[PAPI_MAX_STR_LEN]; + int strLen = snprintf(tmp_event_name, PAPI_MAX_STR_LEN, "%s:device=%d", cuda_native_event_names[event_idx], thread_and_dev_idx); + if (strLen < 0 || strLen >= PAPI_MAX_STR_LEN) { + fprintf(stderr, "Failed to fully write event name with appended device qualifier.\n"); + exit(EXIT_FAILURE); } - } - add_events_from_command_line(EventSet, event_count, gpu_id, metricNames, &numEventsSuccessfullyAdded, eventsSuccessfullyAdded, &numMultipassEvents); + events_successfully_added[event_idx] = (char *) malloc(PAPI_MAX_STR_LEN * sizeof(char)); + check_memory_allocation_call(events_successfully_added[event_idx]); + + add_cuda_native_events(EventSet, tmp_event_name, &num_events_successfully_added, events_successfully_added, &numMultipassEvents); + } // Only multiple pass events were provided on 
the command line - if (numEventsSuccessfullyAdded == 0) { + if (num_events_successfully_added == 0) { fprintf(stderr, "Events provided on the command line could not be added to an EventSet as they require multiple passes.\n"); - test_skip(__FILE__, __LINE__, "", 0); + exit(EXIT_FAILURE); } - PAPI_CALL(PAPI_start(EventSet)); -#endif - VectorAddSubtract(50000*(cpu_thread_id+1), quiet); // gpu work -#ifdef PAPI - PAPI_CALL(PAPI_stop(EventSet, values)); + check_papi_api_call( PAPI_start(EventSet) ); + + // Work for the device + VectorAddSubtract(50000 * (thread_and_dev_idx + 1), suppress_output); + + long long cuda_counter_values[MAX_THREADS]; + check_papi_api_call( PAPI_stop(EventSet, cuda_counter_values) ); - PRINT(quiet, "User measured values.\n"); - for (j = 0; j < numEventsSuccessfullyAdded; j++) { - PRINT(quiet, "%s\t\t%lld\n", eventsSuccessfullyAdded[j], values[j]); + PRINT(suppress_output, "num_events_successfully: %d\n", num_events_successfully_added); + for (event_idx = 0; event_idx < num_events_successfully_added; event_idx++) { + PRINT(suppress_output, "Event %s on thread and device id %d produced the value:\t\t%lld\n", events_successfully_added[event_idx], thread_and_dev_idx, cuda_counter_values[event_idx]); } // Free allocated memory - for (i = 0; i < event_count; i++) { - free(eventsSuccessfullyAdded[i]); + for (event_idx = 0; event_idx < num_events_successfully_added; event_idx++) { + free(events_successfully_added[event_idx]); } - free(eventsSuccessfullyAdded); + free(events_successfully_added); - DRIVER_API_CALL(cuCtxPopCurrent(&(ctx_arr[gpu_id]))); + check_papi_api_call( PAPI_cleanup_eventset(EventSet) ); - errno = PAPI_cleanup_eventset(EventSet); - if (errno != PAPI_OK) { - fprintf(stderr, "PAPI_cleanup_eventset(%d) failed with error %d", EventSet, errno); - test_fail(__FILE__, __LINE__, "", errno); - } - PAPI_CALL(PAPI_destroy_eventset(&EventSet)); -#endif - } // omp parallel region end + check_papi_api_call( PAPI_destroy_eventset(&EventSet) ); + + 
check_cuda_driver_api_call( cuCtxDestroy(ctx) ); + } // End omp parallel for loop region - for (i = 0; i < num_threads; i++) { - DRIVER_API_CALL(cuCtxDestroy(ctx_arr[i])); + // Output a note that a multiple pass event was provided on the command line + if (numMultipassEvents > 0) { + PRINT(suppress_output, "\033[0;33mNOTE: From the events provided on the command line, an event or events requiring multiple passes was detected and not added to the EventSet. Check your events with utils/papi_native_avail.\n\033[0m"); } - if (cudaSuccess != cudaGetLastError()) - fprintf(stderr, "%s\n", cudaGetErrorString(cudaGetLastError())); + // Free allocated memory + for (event_idx = 0; event_idx < total_event_count; event_idx++) { + free(cuda_native_event_names[event_idx]); + } + free(cuda_native_event_names); - omp_destroy_lock(&lock); -#ifdef PAPI PAPI_shutdown(); - // Output a note that a multiple pass event was provided on the command line - if (numMultipassEvents > 0) { - PRINT(quiet, "\033[0;33mNOTE: From the events provided on the command line, an event or events requiring multiple passes was detected and not added to the EventSet. Check your events with utils/papi_native_avail.\n\033[0m"); - } + test_pass(__FILE__); - test_pass(__FILE__); -#endif return 0; } diff --git a/src/components/cuda/tests/cudaOpenMP_noCuCtx.cu b/src/components/cuda/tests/cudaOpenMP_noCuCtx.cu index a57283fec..efae1b3ff 100644 --- a/src/components/cuda/tests/cudaOpenMP_noCuCtx.cu +++ b/src/components/cuda/tests/cudaOpenMP_noCuCtx.cu @@ -1,269 +1,260 @@ -/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of NVIDIA CORPORATION nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - * Multi-GPU sample using OpenMP for threading on the CPU side - * needs a compiler that supports OpenMP 2.0 - */ - -#ifdef PAPI -#include -#include "papi_test.h" - -#define PAPI_CALL(apiFuncCall) \ -do { \ - int _status = apiFuncCall; \ - if (_status != PAPI_OK) { \ - fprintf(stderr, "error: function %s failed.", #apiFuncCall); \ - test_fail(__FILE__, __LINE__, "", _status); \ - } \ -} while (0) +/** +* @file cudaOpenMP_noCuCtx.cu +* @brief For all NVIDIA devices detected on the machine create a matching thread +* for it using OpenMP. Even though a thread is created for all NVIDIA devices, +* cudaSetDevice will be called only for enabled devices. cudaSetDevice determines +* which device executions will be done on. 
+* +* Note: The cuda component supports being partially disabled, meaning that certain devices +* will not be "enabled" to profile on. If PAPI_CUDA_API is not set, then devices with +* CC's >= 7.0 will be used and if PAPI_CUDA_API is set to LEGACY then devices with +* CC's <= 7.0 will be used +* +* For each enabled device, their matching thread will have a workflow of: +* 1. Creating an EventSet +* 2. Adding events to the EventSet +* 3. Starting the EventSet +* 4. Stopping the EventSet +* +* Finally, a compiler that supports OpenMP 2.0 is needed. +*/ -#endif +// Standard library headers +#include +#include +// Internal headers +#include "cuda_tests_helper.h" #include "gpu_work.h" -#include -#include // stdio functions are used since C++ streams aren't necessarily thread safe - -#define PRINT(quiet, format, args...) {if (!quiet) {fprintf(stderr, format, ## args);}} -int quiet; - -#define RUNTIME_API_CALL(apiFuncCall) \ -do { \ - cudaError_t _status = apiFuncCall; \ - if (_status != cudaSuccess) { \ - fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \ - __FILE__, __LINE__, #apiFuncCall, cudaGetErrorString(_status));\ - exit(EXIT_FAILURE); \ - } \ -} while (0) - -#define DRIVER_API_CALL(apiFuncCall) \ -do { \ - CUresult _status = apiFuncCall; \ - if (_status != CUDA_SUCCESS) { \ - fprintf(stderr, "%s:%d: error: function %s failed with error %d.\n", \ - __FILE__, __LINE__, #apiFuncCall, _status); \ - exit(EXIT_FAILURE); \ - } \ -} while (0) +#include "papi.h" +#include "papi_test.h" #define MAX_THREADS (32) +static void print_help_message(void) +{ + printf("./cudaOpenMP_noCuCtx --cuda-native-event-names [list of cuda native event names separated by a comma].\n" + "Notes:\n" + "1. Native event names must not have the device qualifier appended.\n"); +} -/** @class add_events_from_command_line - * @brief Try and add each event provided on the command line by the user. - * - * @param EventSet - * A PAPI eventset. 
- * @param totalEventCount - * Number of events from the command line. - * @param gpu_id - * NVIDIA device index. - * @param eventNamesFromCommandLine - * Events provided on the command line. - * @param *numEventsSuccessfullyAdded - * Total number of successfully added events. - * @param **eventsSuccessfullyAdded - * Events that we are able to add to the EventSet. - * @param *numMultipassEvents - * Counter to see if a multiple pass event was provided on the command line. -*/ -static void add_events_from_command_line(int EventSet, int totalEventCount, int gpu_id, char **eventNamesFromCommandLine, int *numEventsSuccessfullyAdded, char **eventsSuccessfullyAdded, int *numMultipassEvents) +static void parse_and_assign_args(int argc, char *argv[], char ***cuda_native_event_names, int *total_event_count) { int i; - for (i = 0; i < totalEventCount; i++) { - char tmpEventName[PAPI_MAX_STR_LEN]; - int strLen = snprintf(tmpEventName, PAPI_MAX_STR_LEN, "%s:device=%d", eventNamesFromCommandLine[i], gpu_id); - if (strLen < 0 || strLen >= PAPI_MAX_STR_LEN) { - fprintf(stderr, "Failed to fully write event name with appended device qualifier.\n"); - test_skip(__FILE__, __LINE__, "", 0); - } - - int papi_errno = PAPI_add_named_event(EventSet, tmpEventName); - if (papi_errno != PAPI_OK) { - if (papi_errno != PAPI_EMULPASS) { - fprintf(stderr, "Unable to add event %s to the EventSet with error code %d.\n", tmpEventName, papi_errno); - test_skip(__FILE__, __LINE__, "", 0); - } - - // Handle multiple pass events - (*numMultipassEvents)++; - continue; - } - - // Handle successfully added events - strLen = snprintf(eventsSuccessfullyAdded[(*numEventsSuccessfullyAdded)], PAPI_MAX_STR_LEN, "%s", tmpEventName); - if (strLen < 0 || strLen >= PAPI_MAX_STR_LEN) { - fprintf(stderr, "Failed to fully write successfully added event.\n"); - test_skip(__FILE__, __LINE__, "", 0); - } - (*numEventsSuccessfullyAdded)++; - } - - return; + for (i = 1; i < argc; ++i) + { + char *arg = argv[i]; + if 
(strcmp(arg, "--help") == 0) + { + print_help_message(); + exit(EXIT_SUCCESS); + } + else if (strcmp(arg, "--cuda-native-event-names") == 0) + { + if (!argv[i + 1]) + { + printf("ERROR!! --cuda-native-event-names given, but no events listed.\n"); + exit(EXIT_FAILURE); + } + + char **cmd_line_native_event_names = NULL; + const char *cuda_native_event_name = strtok(argv[i+1], ","); + while (cuda_native_event_name != NULL) + { + if (strstr(cuda_native_event_name, ":device")) { + fprintf(stderr, "Cuda native event name must not have a device qualifier appended for this test, i.e. no :device=#.\n"); + print_help_message(); + exit(EXIT_FAILURE); + } + + cmd_line_native_event_names = (char **) realloc(cmd_line_native_event_names, ((*total_event_count) + 1) * sizeof(char *)); + check_memory_allocation_call(cmd_line_native_event_names); + + cmd_line_native_event_names[(*total_event_count)] = (char *) malloc(PAPI_2MAX_STR_LEN * sizeof(char)); + check_memory_allocation_call(cmd_line_native_event_names[(*total_event_count)]); + + int strLen = snprintf(cmd_line_native_event_names[(*total_event_count)], PAPI_2MAX_STR_LEN, "%s", cuda_native_event_name); + if (strLen < 0 || strLen >= PAPI_2MAX_STR_LEN) { + fprintf(stderr, "Failed to fully write cuda native event name.\n"); + exit(EXIT_FAILURE); + } + + (*total_event_count)++; + cuda_native_event_name = strtok(NULL, ","); + } + i++; + *cuda_native_event_names = cmd_line_native_event_names; + } + else + { + print_help_message(); + exit(EXIT_FAILURE); + } + } } int main(int argc, char *argv[]) { - quiet = 0; -#ifdef PAPI - char *test_quiet = getenv("PAPI_CUDA_TEST_QUIET"); - if (test_quiet) - quiet = (int) strtol(test_quiet, (char**) NULL, 10); - - int event_count = argc - 1; - /* if no events passed at command line, just report test skipped. 
*/ - if (event_count == 0) { - fprintf(stderr, "No eventnames specified at command line.\n"); - test_skip(__FILE__, __LINE__, "", 0); + // Determine the number of Cuda capable devices + int num_devices = 0; + check_cuda_runtime_api_call( cudaGetDeviceCount(&num_devices) ); + + // No devices detected on the machine, exit + if (num_devices < 1) { + fprintf(stderr, "No NVIDIA devices found on the machine. This is required for the test to run.\n"); + exit(EXIT_FAILURE); } -#endif - - int num_gpus = 0, i; - - RUNTIME_API_CALL(cudaGetDeviceCount(&num_gpus)); // determine the number of CUDA capable GPUs - if (num_gpus < 1) { - fprintf(stderr, "no CUDA capable devices were detected\n"); -#ifdef PAPI - test_skip(__FILE__, __LINE__, "", 0); -#endif - return 0; + int suppress_output = 0; + char *user_defined_suppress_output = getenv("PAPI_CUDA_TEST_QUIET"); + if (user_defined_suppress_output) { + suppress_output = (int) strtol(user_defined_suppress_output, (char**) NULL, 10); + } + PRINT(suppress_output, "Running the cuda component test cudaOpenMP_noCuCtx.cu\n"); + + char **cuda_native_event_names = NULL; + // If command line arguments are provided then get their values. + int total_event_count = 0; + if (argc > 1) { + parse_and_assign_args(argc, argv, &cuda_native_event_names, &total_event_count); } - ///////////////////////////////////////////////////////////////// - // display CPU and GPU configuration - // - PRINT(quiet, "number of host CPUs:\t%d\n", omp_get_num_procs()); - PRINT(quiet, "number of CUDA devices:\t%d\n", num_gpus); - - for (i = 0; i < num_gpus; i++) { - cudaDeviceProp dprop; - RUNTIME_API_CALL(cudaGetDeviceProperties(&dprop, i)); - PRINT(quiet, " %d: %s\n", i, dprop.name); - - RUNTIME_API_CALL(cudaSetDevice(i)); - RUNTIME_API_CALL(cudaFree(NULL)); - } - - int num_threads = (num_gpus > MAX_THREADS) ? 
MAX_THREADS : num_gpus; - PRINT(quiet, "---------------------------\n"); -#ifdef PAPI - int papi_errno = PAPI_library_init( PAPI_VER_CURRENT ); + // Initialize the PAPI library + int papi_errno = PAPI_library_init(PAPI_VER_CURRENT); if( papi_errno != PAPI_VER_CURRENT ) { - test_fail(__FILE__, __LINE__, "PAPI_library_init failed.", 0); + test_fail(__FILE__, __LINE__, "PAPI_library_init()", 0); } - PAPI_CALL(PAPI_thread_init((unsigned long (*)(void)) omp_get_thread_num)); -#endif - - omp_lock_t lock; - omp_init_lock(&lock); - - omp_set_num_threads(num_threads); // create as many CPU threads as there are CUDA devices - int numMultipassEvents = 0; -#pragma omp parallel - { - unsigned int cpu_thread_id = omp_get_thread_num(); - unsigned int num_cpu_threads = omp_get_num_threads(); - int gpu_id = cpu_thread_id % num_gpus; - RUNTIME_API_CALL(cudaSetDevice(gpu_id)); - -#ifdef PAPI + PRINT(suppress_output, "PAPI version being used for this test: %d.%d.%d\n", + PAPI_VERSION_MAJOR(PAPI_VERSION), + PAPI_VERSION_MINOR(PAPI_VERSION), + PAPI_VERSION_REVISION(PAPI_VERSION)); + + // Initialize thread support in PAPI + check_papi_api_call( PAPI_thread_init((unsigned long (*)(void)) omp_get_thread_num) ); + + // Verify the cuda component was compiled in + int cuda_cmp_idx = PAPI_get_component_index("cuda"); + if (cuda_cmp_idx < 0) { + test_fail(__FILE__, __LINE__, "PAPI_get_component_index()", cuda_cmp_idx); + } + PRINT(suppress_output, "The cuda component is assigned to component index: %d\n", cuda_cmp_idx); + + // Initialize the cuda component + int cuda_eventcode = 0 | PAPI_NATIVE_MASK; + check_papi_api_call( PAPI_enum_cmp_event(&cuda_eventcode, PAPI_ENUM_FIRST, cuda_cmp_idx) ); + + // If we have not gotten an event via the command line, use the event obtained from PAPI_enum_cmp_event + if (total_event_count == 0) { + int num_spaces_to_allocate = 1; + cuda_native_event_names = (char **) malloc(num_spaces_to_allocate * sizeof(char *)); + check_memory_allocation_call( 
cuda_native_event_names ); + + cuda_native_event_names[total_event_count] = (char *) malloc(PAPI_2MAX_STR_LEN * sizeof(char)); + check_memory_allocation_call( cuda_native_event_names[total_event_count] ); + + check_papi_api_call( PAPI_event_code_to_name(cuda_eventcode, cuda_native_event_names[total_event_count++]) ); + } + + const PAPI_component_info_t *cmpInfo = PAPI_get_component_info(cuda_cmp_idx); + if (cmpInfo == NULL) { + fprintf(stderr, "Call to PAPI_get_component_info failed.\n"); + exit(EXIT_FAILURE); + } + + // Check to see if the Cuda component is partially disabled + if (cmpInfo->partially_disabled) { + const char *cc_support = (getenv("PAPI_CUDA_API") != NULL) ? "<=7.0" : ">=7.0"; + PRINT(suppress_output, "\033[33mThe cuda component is partially disabled. Only support for CC's %s are enabled.\033[0m\n", cc_support); + } + + // Determine the number of threads we will launch based off the number of + // Cuda devices on the machine (max of 32). + int num_threads_and_devs = (num_devices > MAX_THREADS) ? 
MAX_THREADS : num_devices; + omp_set_num_threads(num_threads_and_devs); + PRINT(suppress_output, "Total number of threads to be launched: %d\n", num_threads_and_devs); + int thread_and_dev_idx, event_idx, numMultipassEvents = 0; + #pragma omp parallel for private(event_idx) reduction(+:numMultipassEvents) + for (thread_and_dev_idx = 0; thread_and_dev_idx < num_threads_and_devs; thread_and_dev_idx++) { + const PAPI_component_info_t *cmpInfo = PAPI_get_component_info(cuda_cmp_idx); + if (cmpInfo == NULL) { + fprintf(stderr, "Call to PAPI_get_component_info failed.\n"); + exit(EXIT_FAILURE); + } + + if (cmpInfo->partially_disabled) { + // Device is not enabled continue + if (determine_if_device_is_enabled(thread_and_dev_idx) == 0) { + continue; + } + } + + check_cuda_runtime_api_call( cudaSetDevice(thread_and_dev_idx) ); + int EventSet = PAPI_NULL; - long long values[MAX_THREADS]; - int j, errno; - PAPI_CALL(PAPI_create_eventset(&EventSet)); - - PRINT(quiet, "CPU thread %d (of %d) uses CUDA device %d @ eventset %d\n", cpu_thread_id, num_cpu_threads, gpu_id, EventSet); - - int numEventsSuccessfullyAdded = 0; - char **eventsSuccessfullyAdded, **metricNames = argv + 1; - eventsSuccessfullyAdded = (char **) malloc(event_count * sizeof(char *)); - if (eventsSuccessfullyAdded == NULL) { - fprintf(stderr, "Failed to allocate memory for successfully added events.\n"); - test_skip(__FILE__, __LINE__, "", 0); - } - for (i = 0; i < event_count; i++) { - eventsSuccessfullyAdded[i] = (char *) malloc(PAPI_MAX_STR_LEN * sizeof(char)); - if (eventsSuccessfullyAdded[i] == NULL) { - fprintf(stderr, "Failed to allocate memory for command line argument.\n"); - test_skip(__FILE__, __LINE__, "", 0); - } - } + check_papi_api_call( PAPI_create_eventset(&EventSet) ); + + int num_events_successfully_added = 0; + char **events_successfully_added = (char **) malloc(total_event_count * sizeof(char *)); + check_memory_allocation_call(events_successfully_added); + + for (event_idx = 0; event_idx < total_event_count; event_idx++) { + char 
tmp_event_name[PAPI_MAX_STR_LEN]; + int strLen = snprintf(tmp_event_name, PAPI_MAX_STR_LEN, "%s:device=%d", cuda_native_event_names[event_idx], thread_and_dev_idx); + if (strLen < 0 || strLen >= PAPI_MAX_STR_LEN) { + fprintf(stderr, "Failed to fully write event name with appended device qualifier.\n"); + exit(EXIT_FAILURE); + } - add_events_from_command_line(EventSet, event_count, gpu_id, metricNames, &numEventsSuccessfullyAdded, eventsSuccessfullyAdded, &numMultipassEvents); + events_successfully_added[event_idx] = (char *) malloc(PAPI_MAX_STR_LEN * sizeof(char)); + check_memory_allocation_call(events_successfully_added[event_idx]); + + + add_cuda_native_events(EventSet, tmp_event_name, &num_events_successfully_added, events_successfully_added, &numMultipassEvents); + } // Only multiple pass events were provided on the command line - if (numEventsSuccessfullyAdded == 0) { + if (num_events_successfully_added == 0) { fprintf(stderr, "Events provided on the command line could not be added to an EventSet as they require multiple passes.\n"); - test_skip(__FILE__, __LINE__, "", 0); + exit(EXIT_FAILURE); } - PAPI_CALL(PAPI_start(EventSet)); -#endif - VectorAddSubtract(50000*(cpu_thread_id+1), quiet); // gpu work -#ifdef PAPI - PAPI_CALL(PAPI_stop(EventSet, values)); + check_papi_api_call( PAPI_start(EventSet) ); - PRINT(quiet, "User measured values.\n"); - for (j = 0; j < numEventsSuccessfullyAdded; j++) { - PRINT(quiet, "%s\t\t%lld\n", eventsSuccessfullyAdded[j], values[j]); - } + // Work for the device + VectorAddSubtract(50000 * (thread_and_dev_idx + 1), suppress_output); - // Free allocated memory - for (i = 0; i < event_count; i++) { - free(eventsSuccessfullyAdded[i]); - } - free(eventsSuccessfullyAdded); + long long cuda_counter_values[MAX_THREADS]; + check_papi_api_call( PAPI_stop(EventSet, cuda_counter_values) ); - errno = PAPI_cleanup_eventset(EventSet); - if (errno != PAPI_OK) { - fprintf(stderr, "PAPI_cleanup_eventset(%d) failed with error %d", EventSet, 
errno); - test_fail(__FILE__, __LINE__, "", errno); + for (event_idx = 0; event_idx < num_events_successfully_added; event_idx++) { + PRINT(suppress_output, "Event %s on thread and device id %d produced the value:\t\t%lld\n", events_successfully_added[event_idx], thread_and_dev_idx, cuda_counter_values[event_idx]); } - PAPI_CALL(PAPI_destroy_eventset(&EventSet)); -#endif - } // omp parallel region end + // Free allocated memory + for (event_idx = 0; event_idx < total_event_count; event_idx++) { + free(events_successfully_added[event_idx]); + } + free(events_successfully_added); - if (cudaSuccess != cudaGetLastError()) - fprintf(stderr, "%s\n", cudaGetErrorString(cudaGetLastError())); + check_papi_api_call( PAPI_cleanup_eventset(EventSet) ); - omp_destroy_lock(&lock); -#ifdef PAPI - PAPI_shutdown(); + check_papi_api_call( PAPI_destroy_eventset(&EventSet) ); + } // omp parallel region end // Output a note that a multiple pass event was provided on the command line if (numMultipassEvents > 0) { - PRINT(quiet, "\033[0;33mNOTE: From the events provided on the command line, an event or events requiring multiple passes was detected and not added to the EventSet. Check your events with utils/papi_native_avail.\n\033[0m"); + PRINT(suppress_output, "\033[0;33mNOTE: From the events provided on the command line, an event or events requiring multiple passes was detected and not added to the EventSet. 
Check your events with utils/papi_native_avail.\n\033[0m"); } + // Free allocated memory + for (event_idx = 0; event_idx < total_event_count; event_idx++) { + free(cuda_native_event_names[event_idx]); + } + free(cuda_native_event_names); + + PAPI_shutdown(); + test_pass(__FILE__); -#endif + return 0; } diff --git a/src/components/cuda/tests/cuda_tests_helper.c b/src/components/cuda/tests/cuda_tests_helper.c new file mode 100644 index 000000000..bd204b636 --- /dev/null +++ b/src/components/cuda/tests/cuda_tests_helper.c @@ -0,0 +1,177 @@ +// Standard library headers +#include +#include +#include + +// Cuda Toolkit headers +#include + +// Internal headers +#include "papi.h" +#include "papi_test.h" +#include "cuda_tests_helper.h" + +/** @class add_cuda_native_events + * @brief Try and add each event provided on the command line by the user. + * + * @param EventSet + * A PAPI eventset. + * @param *cuda_native_event_name + * Event to add to the EventSet. + * @param *num_events_successfully_added + * Total number of successfully added events. + * @param **events_successfully_added + * Events that we are able to add to the EventSet. + * @param *numMultipassEvents + * Counter to see if a multiple pass event was provided on the command line. 
+*/ +void add_cuda_native_events(int EventSet, const char *cuda_native_event_name, int *num_events_successfully_added, char **events_successfully_added, int *numMultipassEvents) +{ + int papi_errno = PAPI_add_named_event(EventSet, cuda_native_event_name); + if (papi_errno != PAPI_OK) { + if (papi_errno != PAPI_EMULPASS) { + fprintf(stderr, "Unable to add event %s to the EventSet with error code %d.\n", cuda_native_event_name, papi_errno); + exit(EXIT_FAILURE); + } + // Handle multiple pass events + (*numMultipassEvents)++; + return; + } + + // Handle successfully added events + int strLen = snprintf(events_successfully_added[(*num_events_successfully_added)], PAPI_MAX_STR_LEN, "%s", cuda_native_event_name); + if (strLen < 0 || strLen >= PAPI_MAX_STR_LEN) { + fprintf(stderr, "Failed to fully write successfully added event.\n"); + exit(EXIT_FAILURE); + } + (*num_events_successfully_added)++; + + return; +} + +/** @class determine_if_device_is_enabled + * @brief If a machine has mixed compute capabilites determine which devices + * are available to be used. + * + * @param device_idx + * Index of the device on the machine. 
+*/ +int determine_if_device_is_enabled(int device_idx) +{ + cudaDeviceProp device_prop; + cudaError_t cudaError = cudaGetDeviceProperties(&device_prop, device_idx); + if (cudaError != cudaSuccess) { + fprintf(stderr, "Call to cudaGetDeviceProperties failed with error code: %d.\n", cudaError); + exit(EXIT_FAILURE); + } + + int device_enabled = 1; + char *cudaApi = getenv("PAPI_CUDA_API"); + // Perfworks API is enabled + if (cudaApi == NULL) { + // Perfworks Metrics API supports CC's >= 7 + if (device_prop.major < 7) { + device_enabled = 0; + } + } + // Legacy API is enabled + else { + // Legacy API supports CC's <= 7 + if (device_prop.major > 7) { + device_enabled = 0; + } + } + + return device_enabled; + +} + +/** @class enumerate_and_store_cuda_native_events + * @brief For the case users do not add an event on the command line, enumerate through + * the available cuda native events and store one to be used for profiling. + * + * @param ***cuda_native_event_names + * Stores the enumerated event name to be used for profiling. + * @param *total_event_count + * Number of events that were stored. + * @param *cuda_device_index + * Device index that will be used to create a cuda context. +*/ +void enumerate_and_store_cuda_native_events(char ***cuda_native_event_names, int *total_event_count, int *cuda_device_index) +{ + // Get the first cuda native event on the architecture. 
+ int cuda_cmp_idx = PAPI_get_component_index("cuda"); + if (cuda_cmp_idx < 0) { + test_fail(__FILE__, __LINE__, "PAPI_get_component_index", cuda_cmp_idx); + } + + int modifier = PAPI_ENUM_FIRST; + int cuda_eventcode = 0 | PAPI_NATIVE_MASK; + int papi_errno = PAPI_enum_cmp_event(&cuda_eventcode, modifier, cuda_cmp_idx); + if (papi_errno != PAPI_OK) { + test_fail(__FILE__, __LINE__, "PAPI_enum_cmp_event", papi_errno); + } + + int num_spaces_to_allocate = 1; + char **enumerated_cuda_native_event_name = (char **) malloc(num_spaces_to_allocate * sizeof(char *)); + check_memory_allocation_call(enumerated_cuda_native_event_name); + + enumerated_cuda_native_event_name[(*total_event_count)] = (char *) malloc(PAPI_2MAX_STR_LEN * sizeof(char)); + check_memory_allocation_call(enumerated_cuda_native_event_name[(*total_event_count)]); + + // Convert the first cuda native event code to a name, the name will + // be in the format of cuda:::basename with no qualifiers appended. + char basename[PAPI_MAX_STR_LEN]; + papi_errno = PAPI_event_code_to_name(cuda_eventcode, basename); + if (papi_errno != PAPI_OK) { + test_fail(__FILE__, __LINE__, "PAPI_event_code_to_name", papi_errno); + } + + // Begin reconstructing the Cuda native event name with qualifiers + int strLen = snprintf(enumerated_cuda_native_event_name[(*total_event_count)], PAPI_2MAX_STR_LEN, "%s", basename); + if (strLen < 0 || strLen >= PAPI_2MAX_STR_LEN) { + fprintf(stderr, "Failed to fully write event name."); + exit(EXIT_FAILURE); + } + + // Enumerate through the available default qualifiers. + // The Legacy API only has the device qualifiers + // while the Perfworks Metrics API has a stat and device + // qualifier. 
+ modifier = PAPI_NTV_ENUM_UMASKS; + papi_errno = PAPI_enum_cmp_event(&cuda_eventcode, modifier, cuda_cmp_idx); + if (papi_errno != PAPI_OK) { + test_fail(__FILE__, __LINE__, "PAPI_enum_cmp_event", papi_errno); + } + + do { + PAPI_event_info_t info; + papi_errno = PAPI_get_event_info(cuda_eventcode, &info); + if (papi_errno != PAPI_OK) { + test_fail(__FILE__, __LINE__, "PAPI_get_event_info", papi_errno); + } + + char *qualifier = strstr(info.symbol + strlen("cuda:::"), ":"); + if (strncmp(qualifier, ":device=", 8) == 0) { + (*cuda_device_index) = strtol(qualifier + strlen(":device="), NULL, 10); + } + + int strLen = snprintf(enumerated_cuda_native_event_name[(*total_event_count)] + strlen(enumerated_cuda_native_event_name[(*total_event_count)]), PAPI_2MAX_STR_LEN - strlen(enumerated_cuda_native_event_name[(*total_event_count)]), "%s", qualifier); + if (strLen < 0 || strLen >= PAPI_2MAX_STR_LEN - strlen(enumerated_cuda_native_event_name[(*total_event_count)])) { + fprintf(stderr, "Unable to append qualifier to cuda native event name.\n"); + exit(EXIT_FAILURE); + } + + } while (PAPI_enum_cmp_event(&cuda_eventcode, modifier, cuda_cmp_idx) == PAPI_OK); + + // Safety net, this should never be triggered + if ((*cuda_device_index) == -1) { + fprintf(stderr, "A device qualifier is needed to continue or a device index must be provided on the command line.\n"); + exit(EXIT_FAILURE); + } + + (*total_event_count)++; + *cuda_native_event_names = enumerated_cuda_native_event_name; + + return; +} diff --git a/src/components/cuda/tests/cuda_tests_helper.h b/src/components/cuda/tests/cuda_tests_helper.h new file mode 100644 index 000000000..48fd32b50 --- /dev/null +++ b/src/components/cuda/tests/cuda_tests_helper.h @@ -0,0 +1,57 @@ +#ifndef CUDA_TESTS_HELPER_H +#define CUDA_TESTS_HELPER_H +#include +#include + +void add_cuda_native_events(int EventSet, const char *cuda_native_event_name, int *num_events_successfully_added, char **events_successfully_added, int *numMultipassEvents); 
+int determine_if_device_is_enabled(int device_idx); +void enumerate_and_store_cuda_native_events(char ***cuda_native_event_names, int *total_event_count, int *cuda_device_index); + +// Define to handle suppress print output for the cuda component tests +#define PRINT(global_suppress_output, format, args...) \ +{ \ + if (!global_suppress_output) { \ + fprintf(stderr, format, ## args); \ + } \ +} \ + +// Define to handle memory allocation checks +#define check_memory_allocation_call(var) \ +do { \ + if (var == NULL) { \ + fprintf(stderr, "%s:%d: Error: Memory Allocation Failed \n", \ + __FILE__, __LINE__); \ + exit(EXIT_FAILURE); \ + } \ +} while (0) + +// Define to handle PAPI api calls +#define check_papi_api_call(apiFuncCall) \ +do { \ + int papi_errno = apiFuncCall; \ + if (papi_errno != PAPI_OK) { \ + test_fail(__FILE__, __LINE__, #apiFuncCall, papi_errno); \ + } \ +} while (0) + +// Define's to handle Cuda API calls +#define check_cuda_runtime_api_call(apiFuncCall) \ +do { \ + cudaError_t _status = apiFuncCall; \ + if (_status != cudaSuccess) { \ + fprintf(stderr, "Call to %s on line %d failed with error code %d.\n", \ + #apiFuncCall, __LINE__, _status); \ + exit(EXIT_FAILURE); \ + } \ +} while (0) + +#define check_cuda_driver_api_call(apiFuncCall) \ +do { \ + CUresult _status = apiFuncCall; \ + if (_status != CUDA_SUCCESS) { \ + fprintf(stderr, "Call to %s on line %d failed with error code %d.\n", \ + #apiFuncCall, __LINE__, _status); \ + exit(EXIT_FAILURE); \ + } \ +} while (0) +#endif // CUDA_TESTS_HELPER_H diff --git a/src/components/cuda/tests/pthreads.cu b/src/components/cuda/tests/pthreads.cu index 3858a5ce7..087540294 100644 --- a/src/components/cuda/tests/pthreads.cu +++ b/src/components/cuda/tests/pthreads.cu @@ -1,280 +1,327 @@ /** - * @file pthreads.cu - * @author Anustuv Pal - * anustuv@icl.utk.edu - */ +* @file pthreads.cu +* @brief For each enabled NVIDIA device detected on the machine a matching thread will be created +* using pthread_create. 
For each thread, cuCtxCreate will be called which will +* create a Cuda context. +* +* Note: The cuda component supports being partially disabled, meaning that certain devices +* will not be "enabled" to profile on. If PAPI_CUDA_API is not set, then devices with +* CC's >= 7.0 will be used and if PAPI_CUDA_API is set to LEGACY then devices with +* CC's <= 7.0 will be used. +* +* For each enabled device, their matching thread will have a workflow of: +* 1. Creating an EventSet +* 2. Adding events to the EventSet +* 3. Starting the EventSet +* 4. Stopping the EventSet +*/ +// Standard library headers #include #include #include #include -#include "gpu_work.h" -#ifdef PAPI -#include +// Cuda Toolkit headers +#include + +// Internal headers +#include "cuda_tests_helper.h" +#include "gpu_work.h" +#include "papi.h" #include "papi_test.h" -#define PAPI_CALL(apiFuncCall) \ -do { \ - int _status = apiFuncCall; \ - if (_status != PAPI_OK) { \ - fprintf(stderr, "error: function %s failed.", #apiFuncCall); \ - test_fail(__FILE__, __LINE__, "", _status); \ - } \ -} while (0) -#endif +#define MAX_THREADS (32) -#define PRINT(quiet, format, args...) 
{if (!quiet) {fprintf(stderr, format, ## args);}} -int quiet; - -#define RUNTIME_API_CALL(apiFuncCall) \ -do { \ - cudaError_t _status = apiFuncCall; \ - if (_status != cudaSuccess) { \ - fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \ - __FILE__, __LINE__, #apiFuncCall, cudaGetErrorString(_status));\ - exit(EXIT_FAILURE); \ - } \ -} while (0) - -#define DRIVER_API_CALL(apiFuncCall) \ -do { \ - CUresult _status = apiFuncCall; \ - if (_status != CUDA_SUCCESS) { \ - fprintf(stderr, "%s:%d: error: function %s failed with error %d.\n", \ - __FILE__, __LINE__, #apiFuncCall, _status); \ - exit(EXIT_FAILURE); \ - } \ -} while (0) +int global_suppress_output; +int global_total_event_count; +char **global_cuda_native_event_names = NULL; +int global_num_multipass_events; -#define MAX_THREADS (32) +static void print_help_message(void) +{ + printf("./pthreads --cuda-native-event-names [list of cuda native event names separated by a comma].\n" + "Notes:\n" + "1. Native event names must not have the device qualifier appended.\n"); +} -int numGPUs; -int g_event_count; -char **g_evt_names; - -static volatile int global_thread_count = 0; -pthread_mutex_t global_mutex; -pthread_t tidarr[MAX_THREADS]; -CUcontext cuCtx[MAX_THREADS]; -pthread_mutex_t lock; - -// Globals for multiple pass events -int numMultipassEvents = 0; - -/** @class add_events_from_command_line - * @brief Try and add each event provided on the command line by the user. - * - * @param EventSet - * A PAPI eventset. - * @param totalEventCount - * Number of events from the command line. - * @param **eventNamesFromCommandLine - * Events provided on the command line. - * @param gpu_id - * NVIDIA device index. - * @param *numEventsSuccessfullyAdded - * Total number of successfully added events. - * @param **eventsSuccessfullyAdded - * Events that we are able to add to the EventSet. - * @param *numMultipassEvents - * Counter to see if a multiple pass event was provided on the command line. 
-*/ -static void add_events_from_command_line(int EventSet, int totalEventCount, char **eventNamesFromCommandLine, int gpu_id, int *numEventsSuccessfullyAdded, char **eventsSuccessfullyAdded, int *numMultipassEvents) +static void parse_and_assign_args(int argc, char *argv[], char ***cuda_native_event_names, int *total_event_count) { int i; - for (i = 0; i < totalEventCount; i++) { - char tmpEventName[PAPI_MAX_STR_LEN]; - int strLen = snprintf(tmpEventName, PAPI_MAX_STR_LEN, "%s:device=%d", eventNamesFromCommandLine[i], gpu_id); - if (strLen < 0 || strLen >= PAPI_MAX_STR_LEN) { - fprintf(stderr, "Failed to fully write event name with appended device qualifier.\n"); - test_skip(__FILE__, __LINE__, "", 0); - } - - int papi_errno = PAPI_add_named_event(EventSet, tmpEventName); - if (papi_errno != PAPI_OK) { - if (papi_errno != PAPI_EMULPASS) { - fprintf(stderr, "Unable to add event %s to the EventSet with error code %d.\n", tmpEventName, papi_errno); - test_skip(__FILE__, __LINE__, "", 0); - } - - // Handle multiple pass events - (*numMultipassEvents)++; - continue; - } - - // Handle successfully added events - strLen = snprintf(eventsSuccessfullyAdded[(*numEventsSuccessfullyAdded)], PAPI_MAX_STR_LEN, "%s", tmpEventName); - if (strLen < 0 || strLen >= PAPI_MAX_STR_LEN) { - fprintf(stderr, "Failed to fully write successfully added event.\n"); - test_skip(__FILE__, __LINE__, "", 0); - } - (*numEventsSuccessfullyAdded)++; - } - - return; + for (i = 1; i < argc; ++i) + { + char *arg = argv[i]; + if (strcmp(arg, "--help") == 0) + { + print_help_message(); + exit(EXIT_SUCCESS); + } + else if (strcmp(arg, "--cuda-native-event-names") == 0) + { + if (!argv[i + 1]) + { + printf("ERROR!! 
--cuda-native-event-names given, but no events listed.\n"); + exit(EXIT_FAILURE); + } + + char **cmd_line_native_event_names = NULL; + const char *cuda_native_event_name = strtok(argv[i+1], ","); + while (cuda_native_event_name != NULL) + { + if (strstr(cuda_native_event_name, ":device")) { + fprintf(stderr, "Cuda native event name must not have a device qualifier appended for this test, i.e. no :device=#.\n"); + print_help_message(); + exit(EXIT_FAILURE); + } + + cmd_line_native_event_names = (char **) realloc(cmd_line_native_event_names, ((*total_event_count) + 1) * sizeof(char *)); + check_memory_allocation_call(cmd_line_native_event_names); + + cmd_line_native_event_names[(*total_event_count)] = (char *) malloc(PAPI_2MAX_STR_LEN * sizeof(char)); + check_memory_allocation_call(cmd_line_native_event_names[(*total_event_count)]); + + int strLen = snprintf(cmd_line_native_event_names[(*total_event_count)], PAPI_2MAX_STR_LEN, "%s", cuda_native_event_name); + if (strLen < 0 || strLen >= PAPI_2MAX_STR_LEN) { + fprintf(stderr, "Failed to fully write cuda native event name.\n"); + exit(EXIT_FAILURE); + } + + (*total_event_count)++; + cuda_native_event_name = strtok(NULL, ","); + } + i++; + *cuda_native_event_names = cmd_line_native_event_names; + } + else + { + print_help_message(); + exit(EXIT_FAILURE); + } + } } -void *thread_gpu(void * idx) +void *thread_gpu(void *thread_and_dev_idx) { - int tid = *((int*) idx); - unsigned long gettid = (unsigned long) pthread_self(); - -#ifdef PAPI - int gpuid = tid % numGPUs; - int i; + int curr_thread_and_dev_idx = *(int *) thread_and_dev_idx; + // Create a Cuda context for the current thread + CUcontext ctx; + int flags = 0; + CUdevice device = curr_thread_and_dev_idx; +#if defined(CUDA_TOOLKIT_GE_13) + check_cuda_driver_api_call( cuCtxCreate(&ctx, (CUctxCreateParams*)0, flags, device) ); +#else + check_cuda_driver_api_call( cuCtxCreate(&ctx, flags, device) ); +#endif int EventSet = PAPI_NULL; - long long values[MAX_THREADS]; - 
PAPI_CALL(PAPI_create_eventset(&EventSet)); - - DRIVER_API_CALL(cuCtxSetCurrent(cuCtx[tid])); - PRINT(quiet, "This is idx %d thread %lu - using GPU %d context %p!\n", - tid, gettid, gpuid, cuCtx[tid]); - - int numEventsSuccessfullyAdded = 0; - char **eventsSuccessfullyAdded; - eventsSuccessfullyAdded = (char **) malloc(g_event_count * sizeof(char *)); - if (eventsSuccessfullyAdded == NULL) { - fprintf(stderr, "Failed to allocate memory for successfully added events.\n"); - test_skip(__FILE__, __LINE__, "", 0); - } - for (i = 0; i < g_event_count; i++) { - eventsSuccessfullyAdded[i] = (char *) malloc(PAPI_MAX_STR_LEN * sizeof(char)); - if (eventsSuccessfullyAdded[i] == NULL) { - fprintf(stderr, "Failed to allocate memory for command line argument.\n"); - test_skip(__FILE__, __LINE__, "", 0); - } - } + check_papi_api_call( PAPI_create_eventset(&EventSet) ); + + int num_events_successfully_added = 0; + char **events_successfully_added = (char **) malloc(global_total_event_count * sizeof(char *)); + check_memory_allocation_call(events_successfully_added); - pthread_mutex_lock(&global_mutex); + int event_idx; + for (event_idx = 0; event_idx < global_total_event_count; event_idx++) { + char tmp_event_name[PAPI_MAX_STR_LEN]; + int strLen = snprintf(tmp_event_name, PAPI_MAX_STR_LEN, "%s:device=%d", global_cuda_native_event_names[event_idx], curr_thread_and_dev_idx); + if (strLen < 0 || strLen >= PAPI_MAX_STR_LEN) { + fprintf(stderr, "Failed to fully write event name with appended device qualifier.\n"); + exit(EXIT_FAILURE); + } - add_events_from_command_line(EventSet, g_event_count, g_evt_names, gpuid, &numEventsSuccessfullyAdded, eventsSuccessfullyAdded, &numMultipassEvents); + events_successfully_added[event_idx] = (char *) malloc(PAPI_MAX_STR_LEN * sizeof(char)); + check_memory_allocation_call(events_successfully_added[event_idx]); + + add_cuda_native_events(EventSet, tmp_event_name, &num_events_successfully_added, events_successfully_added, 
&global_num_multipass_events); + } // Only multiple pass events were provided on the command line - if (numEventsSuccessfullyAdded == 0) { + if (num_events_successfully_added == 0) { fprintf(stderr, "Events provided on the command line could not be added to an EventSet as they require multiple passes.\n"); - test_skip(__FILE__, __LINE__, "", 0); + exit(EXIT_FAILURE); } - ++global_thread_count; - pthread_mutex_unlock(&global_mutex); + check_papi_api_call( PAPI_start(EventSet) ); - while(global_thread_count < numGPUs); + // Work for the device + VectorAddSubtract(50000 * (curr_thread_and_dev_idx + 1), global_suppress_output); - PAPI_CALL(PAPI_start(EventSet)); -#endif - - VectorAddSubtract(50000*(tid+1), quiet); // gpu work - -#ifdef PAPI - PAPI_CALL(PAPI_stop(EventSet, values)); + long long cuda_counter_values[MAX_THREADS]; + check_papi_api_call( PAPI_stop(EventSet, cuda_counter_values) ); - PRINT(quiet, "User measured values in thread id %d.\n", tid); - for (i = 0; i < numEventsSuccessfullyAdded; i++) { - PRINT(quiet, "%s\t\t%lld\n", eventsSuccessfullyAdded[i], values[i]); + for (event_idx = 0; event_idx < num_events_successfully_added; event_idx++) { + PRINT(global_suppress_output, "Event %s on thread and device id %d produced the value:\t\t%lld\n", events_successfully_added[event_idx], curr_thread_and_dev_idx, cuda_counter_values[event_idx]); } // Free allocated memory - for (i = 0; i < g_event_count; i++) { - free(eventsSuccessfullyAdded[i]); + for (event_idx = 0; event_idx < num_events_successfully_added; event_idx++) { + free(events_successfully_added[event_idx]); } - free(eventsSuccessfullyAdded); + free(events_successfully_added); + + check_papi_api_call( PAPI_cleanup_eventset(EventSet) ); + + check_papi_api_call( PAPI_destroy_eventset(&EventSet) ); + + check_cuda_driver_api_call( cuCtxDestroy(ctx) ); - PAPI_CALL(PAPI_cleanup_eventset(EventSet)); - PAPI_CALL(PAPI_destroy_eventset(&EventSet)); -#endif return NULL; } int main(int argc, char **argv) { - quiet = 
0; -#ifdef PAPI - char *test_quiet = getenv("PAPI_CUDA_TEST_QUIET"); - if (test_quiet) - quiet = (int) strtol(test_quiet, (char**) NULL, 10); - - g_event_count = argc - 1; - /* if no events passed at command line, just report test skipped. */ - if (g_event_count == 0) { - fprintf(stderr, "No eventnames specified at command line.\n"); - test_skip(__FILE__, __LINE__, "", 0); + // Determine the number of Cuda capable devices + int num_devices = 0; + check_cuda_runtime_api_call( cudaGetDeviceCount(&num_devices) ); + + // No devices detected on the machine, exit + if (num_devices < 1) { + fprintf(stderr, "No NVIDIA devices found on the machine. This is required for the test to run.\n"); + exit(EXIT_FAILURE); + } + + global_suppress_output = 0; + char *global_user_defined_suppress_output = getenv("PAPI_CUDA_TEST_QUIET"); + if (global_user_defined_suppress_output) { + global_suppress_output = (int) strtol(global_user_defined_suppress_output, (char**) NULL, 10); } - g_evt_names = argv + 1; -#endif - int rc, i; - int tid[MAX_THREADS]; - - RUNTIME_API_CALL(cudaGetDeviceCount(&numGPUs)); - PRINT(quiet, "No. of GPUs = %d\n", numGPUs); - if (numGPUs < 1) { - fprintf(stderr, "No GPUs found on system.\n"); -#ifdef PAPI - test_skip(__FILE__, __LINE__, "", 0); -#endif - return 0; + PRINT(global_suppress_output, "Running the cuda component test pthreads.cu\n"); + + // If command line arguments are provided then get their values. + global_total_event_count = 0; + if (argc > 1) { + parse_and_assign_args(argc, argv, &global_cuda_native_event_names, &global_total_event_count); } - if (numGPUs > MAX_THREADS) - numGPUs = MAX_THREADS; - PRINT(quiet, "No. 
of threads to launch = %d\n", numGPUs); -#ifdef PAPI - pthread_mutex_init(&global_mutex, NULL); - int papi_errno = PAPI_library_init( PAPI_VER_CURRENT ); + // Initialize the PAPI library + int papi_errno = PAPI_library_init(PAPI_VER_CURRENT); if( papi_errno != PAPI_VER_CURRENT ) { - test_fail(__FILE__, __LINE__, "PAPI_library_init failed.", 0); + test_fail(__FILE__, __LINE__, "PAPI_library_init()", papi_errno); } - // Point PAPI to function that gets the thread id - PAPI_CALL(PAPI_thread_init((unsigned long (*)(void)) pthread_self)); -#endif - // Launch the threads - for(i = 0; i < numGPUs; i++) - { - tid[i] = i; + PRINT(global_suppress_output, "PAPI version being used for this test: %d.%d.%d\n", + PAPI_VERSION_MAJOR(PAPI_VERSION), + PAPI_VERSION_MINOR(PAPI_VERSION), + PAPI_VERSION_REVISION(PAPI_VERSION)); + + // Initialize thread support in PAPI + check_papi_api_call( PAPI_thread_init((unsigned long (*)(void)) pthread_self) ); + + // Verify that the cuda component is compiled in + int cuda_cmp_idx = PAPI_get_component_index("cuda"); + if (cuda_cmp_idx < 0) { + test_fail(__FILE__, __LINE__, "PAPI_get_component_index()", cuda_cmp_idx); + } + PRINT(global_suppress_output, "The cuda component is assigned to component index: %d\n", cuda_cmp_idx); - int flags = 0; - CUdevice device = i % numGPUs; -#if defined(CUDA_TOOLKIT_GE_13) - DRIVER_API_CALL( cuCtxCreate(&(cuCtx[i]), (CUctxCreateParams*)0, flags, device) ); -#else - DRIVER_API_CALL( cuCtxCreate(&(cuCtx[i]), flags, device) ); -#endif - DRIVER_API_CALL(cuCtxPopCurrent(&(cuCtx[i]))); + // Initialize the Cuda component + int cuda_eventcode = 0 | PAPI_NATIVE_MASK; + check_papi_api_call( PAPI_enum_cmp_event(&cuda_eventcode, PAPI_ENUM_FIRST, cuda_cmp_idx) ); - rc = pthread_create(&tidarr[i], NULL, thread_gpu, &(tid[i])); - if(rc) - { - fprintf(stderr, "\n ERROR: return code from pthread_create is %d \n", rc); - exit(1); - } + // If we have not gotten an event via the command line, use the event obtained from 
PAPI_enum_cmp_event + if (global_total_event_count == 0) { + int num_spaces_to_allocate = 1; + global_cuda_native_event_names = (char **) malloc(num_spaces_to_allocate * sizeof(char *)); + check_memory_allocation_call( global_cuda_native_event_names ); + global_cuda_native_event_names[global_total_event_count] = (char *) malloc(PAPI_2MAX_STR_LEN * sizeof(char)); + check_memory_allocation_call( global_cuda_native_event_names[global_total_event_count] ); - PRINT(quiet, "\n Main thread %lu. Created new thread (%lu) in iteration %d ...\n", - (unsigned long)pthread_self(), (unsigned long)tidarr[i], i); + check_papi_api_call( PAPI_event_code_to_name(cuda_eventcode, global_cuda_native_event_names[global_total_event_count++]) ); } - // Join all threads when complete - for (i = 0; i < numGPUs; i++) { - pthread_join(tidarr[i], NULL); - PRINT(quiet, "IDX: %d: TID: %lu: Done! Joined main thread.\n", i, (unsigned long)tidarr[i]); + const PAPI_component_info_t *cmpInfo = PAPI_get_component_info(cuda_cmp_idx); + if (cmpInfo == NULL) { + fprintf(stderr, "Call to PAPI_get_component_info failed.\n"); + exit(EXIT_FAILURE); + } + + // Check to see if the Cuda component is partially disabled + if (cmpInfo->partially_disabled) { + const char *cc_support = (getenv("PAPI_CUDA_API") != NULL) ? "<=7.0" : ">=7.0"; + PRINT(global_suppress_output, "\033[33mThe cuda component is partially disabled. 
Only support for CC's %s are enabled.\033[0m\n", cc_support); + } + + // Cap the number of devices to the max allowed number of threads + if (num_devices > MAX_THREADS) { + num_devices = MAX_THREADS; } - // Destroy all CUDA contexts for all threads/GPUs - for (i = 0; i < numGPUs; i++) { - DRIVER_API_CALL(cuCtxDestroy(cuCtx[i])); + // Allocate memory for all the gpus found on the machine to keep track of threads and thread args + pthread_t *tinfo = (pthread_t *) calloc(num_devices, sizeof(pthread_t)); + check_memory_allocation_call(tinfo); + + int *thread_args = (int *) calloc(num_devices, sizeof(int)); + check_memory_allocation_call(thread_args); + + PRINT(global_suppress_output, "Total number of threads to be launched: %d\n", num_devices); + // For the number of devices detected on the machine, launch a thread + int thread_and_dev_idx, thread_errno, global_num_multipass_events = 0; + for(thread_and_dev_idx = 0; thread_and_dev_idx < num_devices; thread_and_dev_idx++) + { + const PAPI_component_info_t *cmpInfo = PAPI_get_component_info(cuda_cmp_idx); + if (cmpInfo == NULL) { + fprintf(stderr, "Call to PAPI_get_component_info failed.\n"); + exit(EXIT_FAILURE); + } + + if (cmpInfo->partially_disabled) { + // Device is not enabled continue + if (determine_if_device_is_enabled(thread_and_dev_idx) == 0) { + continue; + } + } + + // Store thread information to later use pthread_join + tinfo[thread_and_dev_idx] = thread_and_dev_idx; + // Store thread args so we do not increment the looping variable while in thread_gpu + thread_args[thread_and_dev_idx] = thread_and_dev_idx; + + thread_errno = pthread_create(&tinfo[thread_and_dev_idx], NULL, thread_gpu, &thread_args[thread_and_dev_idx]); + if(thread_errno != 0) { + fprintf(stderr, "Call to pthread_create failed for thread %d with error code %d.\n", thread_and_dev_idx, thread_errno); + exit(EXIT_FAILURE); + } } -#ifdef PAPI - PAPI_shutdown(); - PRINT(quiet, "Main thread exit!\n"); + // Now join with each thread + for 
(thread_and_dev_idx = 0; thread_and_dev_idx < num_devices; thread_and_dev_idx++) { + const PAPI_component_info_t *cmpInfo = PAPI_get_component_info(cuda_cmp_idx); + if (cmpInfo == NULL) { + fprintf(stderr, "Call to PAPI_get_component_info failed.\n"); + exit(EXIT_FAILURE); + } + + if (cmpInfo->partially_disabled) { + // Device is not enabled continue + if (determine_if_device_is_enabled(thread_and_dev_idx) == 0) { + continue; + } + } + + thread_errno = pthread_join(tinfo[thread_and_dev_idx], NULL); + if (thread_errno != 0) { + fprintf(stderr, "Call to pthread_join failed for thread %d with error code %d.\n", thread_and_dev_idx, thread_errno); + exit(EXIT_FAILURE); + } + } // Output a note that a multiple pass event was provided on the command line - if (numMultipassEvents > 0) { - PRINT(quiet, "\033[0;33mNOTE: From the events provided on the command line, an event or events requiring multiple passes was detected and not added to the EventSet. Check your events with utils/papi_native_avail.\n\033[0m"); + if (global_num_multipass_events > 0) { + PRINT(global_suppress_output, "\033[0;33mNOTE: From the events provided on the command line, an event or events requiring multiple passes was detected and not added to the EventSet. 
Check your events with utils/papi_native_avail.\n\033[0m"); + } + + // Free allocated memory + int event_idx; + for (event_idx = 0; event_idx < global_total_event_count; event_idx++) { + free(global_cuda_native_event_names[event_idx]); } + free(global_cuda_native_event_names); + free(tinfo); + free(thread_args); + + PAPI_shutdown(); test_pass(__FILE__); -#endif + return 0; } diff --git a/src/components/cuda/tests/pthreads_noCuCtx.cu b/src/components/cuda/tests/pthreads_noCuCtx.cu index ac824ef77..12740bfa8 100644 --- a/src/components/cuda/tests/pthreads_noCuCtx.cu +++ b/src/components/cuda/tests/pthreads_noCuCtx.cu @@ -1,267 +1,319 @@ /** - * @file pthreads_noCuCtx.cu - * @author Anustuv Pal - * anustuv@icl.utk.edu - */ +* @file pthreads_noCuCtx.cu +* @brief For each enabled NVIDIA device detected on the machine a matching thread will be created +* using pthread_create. For each thread, cudaSetDevice will be called which determines which +* device executions will be done on. +* +* Note: The cuda component supports being partially disabled, meaning that certain devices +* will not be "enabled" to profile on. If PAPI_CUDA_API is not set, then devices with +* CC's >= 7.0 will be used and if PAPI_CUDA_API is set to LEGACY then devices with +* CC's <= 7.0 will be used. +* +* For each enabled device, their matching thread will have a workflow of: +* 1. Creating an EventSet +* 2. Adding events to the EventSet +* 3. Starting the EventSet +* 4. Stopping the EventSet +*/ +// Standard library headers #include #include #include #include -#include "gpu_work.h" -#ifdef PAPI -#include -#include "papi_test.h" +// Cuda Toolkit headers +#include -#define PAPI_CALL(apiFuncCall) \ -do { \ - int _status = apiFuncCall; \ - if (_status != PAPI_OK) { \ - fprintf(stderr, "error: function %s failed.", #apiFuncCall); \ - test_fail(__FILE__, __LINE__, "", _status); \ - } \ -} while (0) -#endif - -#define PRINT(quiet, format, args...) 
{if (!quiet) {fprintf(stderr, format, ## args);}} -int quiet; - -#define RUNTIME_API_CALL(apiFuncCall) \ -do { \ - cudaError_t _status = apiFuncCall; \ - if (_status != cudaSuccess) { \ - fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \ - __FILE__, __LINE__, #apiFuncCall, cudaGetErrorString(_status));\ - exit(EXIT_FAILURE); \ - } \ -} while (0) - -#define DRIVER_API_CALL(apiFuncCall) \ -do { \ - CUresult _status = apiFuncCall; \ - if (_status != CUDA_SUCCESS) { \ - fprintf(stderr, "%s:%d: error: function %s failed with error %d.\n", \ - __FILE__, __LINE__, #apiFuncCall, _status); \ - exit(EXIT_FAILURE); \ - } \ -} while (0) +// Internal headers +#include "cuda_tests_helper.h" +#include "gpu_work.h" +#include "papi.h" +#include "papi_test.h" #define MAX_THREADS (32) -int numGPUs; -int g_event_count; -char **g_evt_names; - -static volatile int global_thread_count = 0; -pthread_mutex_t global_mutex; -pthread_t tidarr[MAX_THREADS]; -CUcontext cuCtx[MAX_THREADS]; -pthread_mutex_t lock; - -// Globals for multiple pass events -int numMultipassEvents = 0; - -/** @class add_events_from_command_line - * @brief Try and add each event provided on the command line by the user. - * - * @param EventSet - * A PAPI eventset. - * @param totalEventCount - * Number of events from the command line. - * @param **eventNamesFromCommandLine - * Events provided on the command line. - * @param gpu_id - * NVIDIA device index. - * @param *numEventsSuccessfullyAdded - * Total number of successfully added events. - * @param **eventsSuccessfullyAdded - * Events that we are able to add to the EventSet. - * @param *numMultipassEvents - * Counter to see if a multiple pass event was provided on the command line. 
-*/ -static void add_events_from_command_line(int EventSet, int totalEventCount, char **eventNamesFromCommandLine, int gpu_id, int *numEventsSuccessfullyAdded, char **eventsSuccessfullyAdded, int *numMultipassEvents) -{ - int i; - for (i = 0; i < totalEventCount; i++) { - char tmpEventName[PAPI_MAX_STR_LEN]; - int strLen = snprintf(tmpEventName, PAPI_MAX_STR_LEN, "%s:device=%d", eventNamesFromCommandLine[i], gpu_id); - if (strLen < 0 || strLen >= PAPI_MAX_STR_LEN) { - fprintf(stderr, "Failed to fully write event name with appended device qualifier.\n"); - test_skip(__FILE__, __LINE__, "", 0); - } - - int papi_errno = PAPI_add_named_event(EventSet, tmpEventName); - if (papi_errno != PAPI_OK) { - if (papi_errno != PAPI_EMULPASS) { - fprintf(stderr, "Unable to add event %s to the EventSet with error code %d.\n", tmpEventName, papi_errno); - test_skip(__FILE__, __LINE__, "", 0); - } +int global_suppress_output; +int global_total_event_count; +char **global_cuda_native_event_names = NULL; +int global_num_multipass_events = 0; - // Handle multiple pass events - (*numMultipassEvents)++; - continue; - } - - // Handle successfully added events - strLen = snprintf(eventsSuccessfullyAdded[(*numEventsSuccessfullyAdded)], PAPI_MAX_STR_LEN, "%s", tmpEventName); - if (strLen < 0 || strLen >= PAPI_MAX_STR_LEN) { - fprintf(stderr, "Failed to fully write successfully added event.\n"); - test_skip(__FILE__, __LINE__, "", 0); - } - (*numEventsSuccessfullyAdded)++; - } +static void print_help_message(void) +{ + printf("./pthreads_noCuCtx --cuda-native-event-names [list of cuda native event names separated by a comma].\n" + "Notes:\n" + "1. 
Native event names must not have the device qualifier appended.\n"); +} - return; +static void parse_and_assign_args(int argc, char *argv[], char ***cuda_native_event_names, int *total_event_count) +{ + int i; + for (i = 1; i < argc; ++i) + { + char *arg = argv[i]; + if (strcmp(arg, "--help") == 0) + { + print_help_message(); + exit(EXIT_SUCCESS); + } + else if (strcmp(arg, "--cuda-native-event-names") == 0) + { + if (!argv[i + 1]) + { + printf("ERROR!! --cuda-native-event-names given, but no events listed.\n"); + exit(EXIT_FAILURE); + } + + char **cmd_line_native_event_names = NULL; + const char *cuda_native_event_name = strtok(argv[i+1], ","); + while (cuda_native_event_name != NULL) + { + if (strstr(cuda_native_event_name, ":device")) { + fprintf(stderr, "Cuda native event name must not have a device qualifier appended for this test, i.e. no :device=#.\n"); + print_help_message(); + exit(EXIT_FAILURE); + } + + cmd_line_native_event_names = (char **) realloc(cmd_line_native_event_names, ((*total_event_count) + 1) * sizeof(char *)); + check_memory_allocation_call(cmd_line_native_event_names); + + cmd_line_native_event_names[(*total_event_count)] = (char *) malloc(PAPI_2MAX_STR_LEN * sizeof(char)); + check_memory_allocation_call(cmd_line_native_event_names[(*total_event_count)]); + + int strLen = snprintf(cmd_line_native_event_names[(*total_event_count)], PAPI_2MAX_STR_LEN, "%s", cuda_native_event_name); + if (strLen < 0 || strLen >= PAPI_2MAX_STR_LEN) { + fprintf(stderr, "Failed to fully write cuda native event name.\n"); + exit(EXIT_FAILURE); + } + + (*total_event_count)++; + cuda_native_event_name = strtok(NULL, ","); + } + i++; + *cuda_native_event_names = cmd_line_native_event_names; + } + else + { + print_help_message(); + exit(EXIT_FAILURE); + } + } } -void *thread_gpu(void * idx) +void *thread_gpu(void *thread_and_dev_idx) { - int tid = *((int*) idx); - unsigned long gettid = (unsigned long) pthread_self(); + int curr_thread_and_dev_idx = *(int *) 
thread_and_dev_idx; + + check_cuda_runtime_api_call( cudaSetDevice(curr_thread_and_dev_idx) ); + check_cuda_runtime_api_call( cudaFree(NULL) ); -#ifdef PAPI - int gpuid = tid % numGPUs; - int i; int EventSet = PAPI_NULL; - long long values[MAX_THREADS]; - PAPI_CALL(PAPI_create_eventset(&EventSet)); - - RUNTIME_API_CALL(cudaSetDevice(gpuid)); - PRINT(quiet, "This is idx %d thread %lu - using GPU %d\n", - tid, gettid, gpuid); - - int numEventsSuccessfullyAdded = 0; - char **eventsSuccessfullyAdded; - eventsSuccessfullyAdded = (char **) malloc(g_event_count * sizeof(char *)); - if (eventsSuccessfullyAdded == NULL) { - fprintf(stderr, "Failed to allocate memory for successfully added events.\n"); - test_skip(__FILE__, __LINE__, "", 0); - } - for (i = 0; i < g_event_count; i++) { - eventsSuccessfullyAdded[i] = (char *) malloc(PAPI_MAX_STR_LEN * sizeof(char)); - if (eventsSuccessfullyAdded[i] == NULL) { - fprintf(stderr, "Failed to allocate memory for command line argument.\n"); - test_skip(__FILE__, __LINE__, "", 0); - } - } + check_papi_api_call( PAPI_create_eventset(&EventSet) ); - pthread_mutex_lock(&global_mutex); + int num_events_successfully_added = 0; + char **events_successfully_added = (char **) malloc(global_total_event_count * sizeof(char *)); + check_memory_allocation_call( events_successfully_added ); - add_events_from_command_line(EventSet, g_event_count, g_evt_names, gpuid, &numEventsSuccessfullyAdded, eventsSuccessfullyAdded, &numMultipassEvents); + int event_idx; + for (event_idx = 0; event_idx < global_total_event_count; event_idx++) { + char tmp_event_name[PAPI_MAX_STR_LEN]; + int strLen = snprintf(tmp_event_name, PAPI_MAX_STR_LEN, "%s:device=%d", global_cuda_native_event_names[event_idx], curr_thread_and_dev_idx); + if (strLen < 0 || strLen >= PAPI_MAX_STR_LEN) { + fprintf(stderr, "Failed to fully write event name with appended device qualifier.\n"); + exit(EXIT_FAILURE); + } + + events_successfully_added[event_idx] = (char *) malloc(PAPI_MAX_STR_LEN 
* sizeof(char)); + check_memory_allocation_call(events_successfully_added[event_idx]); + + add_cuda_native_events(EventSet, tmp_event_name, &num_events_successfully_added, events_successfully_added, &global_num_multipass_events); + } // Only multiple pass events were provided on the command line - if (numEventsSuccessfullyAdded == 0) { + if (num_events_successfully_added == 0) { fprintf(stderr, "Events provided on the command line could not be added to an EventSet as they require multiple passes.\n"); - test_skip(__FILE__, __LINE__, "", 0); + exit(EXIT_FAILURE); } - ++global_thread_count; - pthread_mutex_unlock(&global_mutex); + check_papi_api_call( PAPI_start(EventSet) ); - while(global_thread_count < numGPUs); + // Work for the device + VectorAddSubtract(50000 * (curr_thread_and_dev_idx + 1), global_suppress_output); - PAPI_CALL(PAPI_start(EventSet)); -#endif + long long cuda_counter_values[MAX_THREADS]; + check_papi_api_call( PAPI_stop(EventSet, cuda_counter_values) ); - VectorAddSubtract(50000*(tid+1), quiet); // gpu work - -#ifdef PAPI - PAPI_CALL(PAPI_stop(EventSet, values)); - - PRINT(quiet, "User measured values in thread id %d.\n", tid); - for (i = 0; i < numEventsSuccessfullyAdded; i++) { - PRINT(quiet, "%s\t\t%lld\n", eventsSuccessfullyAdded[i], values[i]); + for (event_idx = 0; event_idx < num_events_successfully_added; event_idx++) { + PRINT(global_suppress_output, "Event %s on thread and device id %d produced the value:\t\t%lld\n", events_successfully_added[event_idx], curr_thread_and_dev_idx, cuda_counter_values[event_idx]); } // Free allocated memory - for (i = 0; i < g_event_count; i++) { - free(eventsSuccessfullyAdded[i]); + for (event_idx = 0; event_idx < num_events_successfully_added; event_idx++) { + free(events_successfully_added[event_idx]); } - free(eventsSuccessfullyAdded); + free(events_successfully_added); + + check_papi_api_call(PAPI_cleanup_eventset(EventSet)); + + check_papi_api_call(PAPI_destroy_eventset(&EventSet)); - 
PAPI_CALL(PAPI_cleanup_eventset(EventSet)); - PAPI_CALL(PAPI_destroy_eventset(&EventSet)); -#endif return NULL; } int main(int argc, char **argv) { - quiet = 0; -#ifdef PAPI - char *test_quiet = getenv("PAPI_CUDA_TEST_QUIET"); - if (test_quiet) - quiet = (int) strtol(test_quiet, (char**) NULL, 10); - - g_event_count = argc - 1; - /* if no events passed at command line, just report test skipped. */ - if (g_event_count == 0) { - fprintf(stderr, "No eventnames specified at command line.\n"); - test_skip(__FILE__, __LINE__, "", 0); + // Determine the number of Cuda capable devices + int num_devices = 0; + check_cuda_runtime_api_call( cudaGetDeviceCount(&num_devices) ); + + // No devices detected on the machine, exit + if (num_devices < 1) { + fprintf(stderr, "No NVIDIA devices found on the machine. This is required for the test to run.\n"); + exit(EXIT_FAILURE); } - g_evt_names = argv + 1; -#endif - int rc, i; - int tid[MAX_THREADS]; - - RUNTIME_API_CALL(cudaGetDeviceCount(&numGPUs)); - PRINT(quiet, "No. of GPUs = %d\n", numGPUs); - if (numGPUs < 1) { - fprintf(stderr, "No GPUs found on system.\n"); -#ifdef PAPI - test_skip(__FILE__, __LINE__, "", 0); -#endif - return 0; + + global_suppress_output = 0; + char *global_user_defined_suppress_output = getenv("PAPI_CUDA_TEST_QUIET"); + if (global_user_defined_suppress_output) { + global_suppress_output = (int) strtol(global_user_defined_suppress_output, (char**) NULL, 10); + } + PRINT(global_suppress_output, "Running the cuda component test pthreads_noCuCtx.cu\n"); + + // If command line arguments are provided then get their values. + global_total_event_count = 0; + if (argc > 1) { + parse_and_assign_args(argc, argv, &global_cuda_native_event_names, &global_total_event_count); } - if (numGPUs > MAX_THREADS) - numGPUs = MAX_THREADS; - PRINT(quiet, "No. 
of threads to launch = %d\n", numGPUs); -#ifdef PAPI - pthread_mutex_init(&global_mutex, NULL); + // Initialize the PAPI library int papi_errno = PAPI_library_init( PAPI_VER_CURRENT ); if( papi_errno != PAPI_VER_CURRENT ) { - test_fail(__FILE__, __LINE__, "PAPI_library_init failed.", 0); + test_fail(__FILE__, __LINE__, "PAPI_library_init()", papi_errno); + } + PRINT(global_suppress_output, "PAPI version being used for this test: %d.%d.%d\n", + PAPI_VERSION_MAJOR(PAPI_VERSION), + PAPI_VERSION_MINOR(PAPI_VERSION), + PAPI_VERSION_REVISION(PAPI_VERSION)); + + // Initialize thread support in PAPI + check_papi_api_call(PAPI_thread_init((unsigned long (*)(void)) pthread_self)); + + // Verify the cuda component is compiled in + int cuda_cmp_idx = PAPI_get_component_index("cuda"); + if (cuda_cmp_idx < 0) { + test_fail(__FILE__, __LINE__, "PAPI_get_component_index()", cuda_cmp_idx); } - // Point PAPI to function that gets the thread id - PAPI_CALL(PAPI_thread_init((unsigned long (*)(void)) pthread_self)); -#endif + PRINT(global_suppress_output, "The cuda component is assigned to component index: %d\n", cuda_cmp_idx); - // Launch the threads - for(i = 0; i < numGPUs; i++) - { - tid[i] = i; - RUNTIME_API_CALL(cudaSetDevice(tid[i] % numGPUs)); - RUNTIME_API_CALL(cudaFree(NULL)); - - rc = pthread_create(&tidarr[i], NULL, thread_gpu, &(tid[i])); - if(rc) - { - fprintf(stderr, "\n ERROR: return code from pthread_create is %d \n", rc); - exit(1); - } - PRINT(quiet, "\n Main thread %lu. 
Created new thread (%lu) in iteration %d ...\n", - (unsigned long)pthread_self(), (unsigned long)tidarr[i], i); + // Initialize the Cuda component + int cuda_eventcode = 0 | PAPI_NATIVE_MASK; + check_papi_api_call( PAPI_enum_cmp_event(&cuda_eventcode, PAPI_ENUM_FIRST, cuda_cmp_idx) ); + + // If we have not gotten an event via the command line, use the event obtained from PAPI_enum_cmp_event + if (global_total_event_count == 0) { + int num_spaces_to_allocate = 1; + global_cuda_native_event_names = (char **) malloc(num_spaces_to_allocate * sizeof(char *)); + check_memory_allocation_call( global_cuda_native_event_names ); + + global_cuda_native_event_names[global_total_event_count] = (char *) malloc(PAPI_2MAX_STR_LEN * sizeof(char)); + check_memory_allocation_call( global_cuda_native_event_names[global_total_event_count] ); + + check_papi_api_call( PAPI_event_code_to_name(cuda_eventcode, global_cuda_native_event_names[global_total_event_count++]) ); } - // Join all threads when complete - for (i = 0; i < numGPUs; i++) { - pthread_join(tidarr[i], NULL); - PRINT(quiet, "IDX: %d: TID: %lu: Done! Joined main thread.\n", i, (unsigned long)tidarr[i]); + const PAPI_component_info_t *cmpInfo = PAPI_get_component_info(cuda_cmp_idx); + if (cmpInfo == NULL) { + fprintf(stderr, "Call to PAPI_get_component_info failed.\n"); + exit(EXIT_FAILURE); } -#ifdef PAPI - PAPI_shutdown(); + // Check to see if the Cuda component is partially disabled + if (cmpInfo->partially_disabled) { + const char *cc_support = (getenv("PAPI_CUDA_API") != NULL) ? "<=7.0" : ">=7.0"; + PRINT(global_suppress_output, "\033[33mThe cuda component is partially disabled. 
Only support for CC's %s are enabled.\033[0m\n", cc_support); + } + + // Cap the number of devices to the max allowed number of threads + if (num_devices > MAX_THREADS) { + num_devices = MAX_THREADS; + } + + // Allocate memory for all the gpus found on the machine to keep track of threads and thread args + pthread_t *tinfo = (pthread_t *) calloc(num_devices, sizeof(pthread_t)); + check_memory_allocation_call( tinfo ); - PRINT(quiet, "Main thread exit!\n"); + int *thread_args = (int *) calloc(num_devices, sizeof(int)); + check_memory_allocation_call( thread_args ); + + PRINT(global_suppress_output, "Total number of threads to be launched: %d\n", num_devices); + // For the number of devices detected on the machine, launch a thread + int thread_and_dev_idx, thread_errno; + for(thread_and_dev_idx = 0; thread_and_dev_idx < num_devices; thread_and_dev_idx++) + { + const PAPI_component_info_t *cmpInfo = PAPI_get_component_info(cuda_cmp_idx); + if (cmpInfo == NULL) { + fprintf(stderr, "Call to PAPI_get_component_info failed.\n"); + exit(EXIT_FAILURE); + } + + if (cmpInfo->partially_disabled) { + // Device is not enabled continue + if (determine_if_device_is_enabled(thread_and_dev_idx) == 0) { + continue; + } + } + + // Store thread information to later use pthread_join + tinfo[thread_and_dev_idx] = thread_and_dev_idx; + // Store thread args so we do not increment the looping variable while in thread_gpu + thread_args[thread_and_dev_idx] = thread_and_dev_idx; + + thread_errno = pthread_create(&tinfo[thread_and_dev_idx], NULL, thread_gpu, &thread_args[thread_and_dev_idx]); + if(thread_errno != 0) { + fprintf(stderr, "Call to pthread_create failed for thread %d with error code %d.\n", thread_and_dev_idx, thread_errno); + exit(EXIT_FAILURE); + } + } + + // Now join each thread + for (thread_and_dev_idx = 0; thread_and_dev_idx < num_devices; thread_and_dev_idx++) { + const PAPI_component_info_t *cmpInfo = PAPI_get_component_info(cuda_cmp_idx); + 
if (cmpInfo == NULL) { + fprintf(stderr, "Call to PAPI_get_component_info failed.\n"); + exit(EXIT_FAILURE); + } + + if (cmpInfo->partially_disabled) { + // Device is not enabled continue + if (determine_if_device_is_enabled(thread_and_dev_idx) == 0) { + continue; + } + } + + thread_errno = pthread_join(tinfo[thread_and_dev_idx], NULL); + if (thread_errno != 0) { + fprintf(stderr, "Call to pthread_join failed for thread %d with error code %d.\n", thread_and_dev_idx, thread_errno); + exit(EXIT_FAILURE); + } + } // Output a note that a multiple pass event was provided on the command line - if (numMultipassEvents > 0) { - PRINT(quiet, "\033[0;33mNOTE: From the events provided on the command line, an event or events requiring multiple passes was detected and not added to the EventSet. Check your events with utils/papi_native_avail.\n\033[0m"); + if (global_num_multipass_events > 0) { + PRINT(global_suppress_output, "\033[0;33mNOTE: From the events provided on the command line, an event or events requiring multiple passes was detected and not added to the EventSet. Check your events with utils/papi_native_avail.\n\033[0m"); + } + + // Free allocated memory + int event_idx; + for (event_idx = 0; event_idx < global_total_event_count; event_idx++) { + free(global_cuda_native_event_names[event_idx]); } + free(global_cuda_native_event_names); + free(tinfo); + free(thread_args); + + PAPI_shutdown(); test_pass(__FILE__); -#endif + return 0; } diff --git a/src/components/cuda/tests/run_cuda_tests.sh b/src/components/cuda/tests/run_cuda_tests.sh new file mode 100644 index 000000000..7b04f8b06 --- /dev/null +++ b/src/components/cuda/tests/run_cuda_tests.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +# Add -verbose to show output, i.e. sh run_sde_tests.sh -verbose. 
+if [ "$1" = "--suppress-output" ]; then + export PAPI_CUDA_TEST_QUIET=1 +fi + +make_cuda_test_targets=( + "test_multi_read_and_reset" + "concurrent_profiling" + "concurrent_profiling_noCuCtx" + "pthreads" + "pthreads_noCuCtx" + "cudaOpenMP" + "cudaOpenMP_noCuCtx" + "test_multipass_event_fail" + "test_2thr_1gpu_not_allowed" + "HelloWorld" + "HelloWorld_noCuCtx" + "simpleMultiGPU" + "simpleMultiGPU_noCuCtx" +) + +for cuda_test in ${make_cuda_test_targets[@]}; do + echo "make $cuda_test:" + make $cuda_test + + printf "\n" + + echo "Running $cuda_test:" + ./$cuda_test + + echo "-------------------------------------" +done diff --git a/src/components/cuda/tests/runtest.sh b/src/components/cuda/tests/runtest.sh deleted file mode 100644 index 224f1e63b..000000000 --- a/src/components/cuda/tests/runtest.sh +++ /dev/null @@ -1,84 +0,0 @@ -#!/bin/bash - -export PAPI_CUDA_TEST_QUIET=1 # Comment this line to see standard output from tests - -evt_names=("cuda:::dram__bytes_read:stat=sum:device=0" \ - "cuda:::sm__cycles_active:stat=sum:device=0" \ - "cuda:::smsp__warps_launched:stat=sum:device=0") - -multi_gpu_evt_names=("cuda:::dram__bytes_read:stat=sum" \ - "cuda:::sm__cycles_active:stat=sum" \ - "cuda:::smsp__warps_launched:stat=sum") - -multi_pass_evt_name="cuda:::gpu__compute_memory_access_throughput_internal_activity.pct_of_peak_sustained_elapsed:stat=max:device=0" - -concurrent_evt_names=("cuda:::sm__cycles_active:stat=sum:device=" \ - "cuda:::sm__cycles_elapsed:stat=max:device=") - -make test_multipass_event_fail -echo -e "Running: \e[36m./test_multipass_event_fail\e[0m" "${evt_names[@]}" $multi_pass_evt_name -./test_multipass_event_fail "${evt_names[@]}" $multi_pass_evt_name -echo -e "-------------------------------------\n" - -make test_multi_read_and_reset -echo -e "Running: \e[36m./test_multi_read_and_reset\e[0m" "${evt_names[@]}" -./test_multi_read_and_reset "${evt_names[@]}" -echo -e "-------------------------------------\n" - -make test_2thr_1gpu_not_allowed 
-echo -e "Running: \e[36m./test_2thr_1gpu_not_allowed\e[0m" "${evt_names[@]:0:2}" -./test_2thr_1gpu_not_allowed "${evt_names[@]:0:2}" -echo -e "-------------------------------------\n" - -make HelloWorld -echo -e "Running: \e[36m./HelloWorld\e[0m" "${evt_names[@]}" -./HelloWorld "${evt_names[@]}" -echo -e "-------------------------------------\n" - -make HelloWorld_noCuCtx -echo -e "Running: \e[36m./HelloWorld_noCuCtx\e[0m" "${evt_names[@]}" -./HelloWorld_noCuCtx "${evt_names[@]}" -echo -e "-------------------------------------\n" - -make simpleMultiGPU -echo -e "Running: \e[36m./simpleMultiGPU\e[0m" "${multi_gpu_evt_names[@]}" -./simpleMultiGPU "${multi_gpu_evt_names[@]}" -echo -e "-------------------------------------\n" - -make simpleMultiGPU_noCuCtx -echo -e "Running: \e[36m./simpleMultiGPU_noCuCtx\e[0m" "${multi_gpu_evt_names[@]}" -./simpleMultiGPU_noCuCtx "${multi_gpu_evt_names[@]}" -echo -e "-------------------------------------\n" - -make pthreads_noCuCtx -echo -e "Running: \e[36m./pthreads_noCuCtx\e[0m" "${multi_gpu_evt_names[@]}" -./pthreads_noCuCtx "${multi_gpu_evt_names[@]}" -echo -e "-------------------------------------\n" - -make pthreads -echo -e "Running: \e[36m./pthreads\e[0m" "${multi_gpu_evt_names[@]}" -./pthreads "${multi_gpu_evt_names[@]}" -echo -e "-------------------------------------\n" - -make cudaOpenMP -echo -e "Running: \e[36m./cudaOpenMP\e[0m" "${multi_gpu_evt_names[@]}" -./cudaOpenMP "${multi_gpu_evt_names[@]}" -echo -e "-------------------------------------\n" - -make cudaOpenMP_noCuCtx -echo -e "Running: \e[36m./cudaOpenMP_noCuCtx\e[0m" "${multi_gpu_evt_names[@]}" -./cudaOpenMP_noCuCtx "${multi_gpu_evt_names[@]}" -echo -e "-------------------------------------\n" - -make concurrent_profiling_noCuCtx -echo -e "Running: \e[36m./concurrent_profiling_noCuCtx\e[0m" "${concurrent_evt_names[@]}" -./concurrent_profiling_noCuCtx "${concurrent_evt_names[@]}" -echo -e "-------------------------------------\n" - -make concurrent_profiling -echo 
-e "Running: \e[36m./concurrent_profiling\e[0m" "${concurrent_evt_names[@]}" -./concurrent_profiling "${concurrent_evt_names[@]}" -echo -e "-------------------------------------\n" - -# Finalize tests -unset PAPI_CUDA_TEST_QUIET diff --git a/src/components/cuda/tests/simpleMultiGPU.cu b/src/components/cuda/tests/simpleMultiGPU.cu index 35e3c6276..84b088bb6 100644 --- a/src/components/cuda/tests/simpleMultiGPU.cu +++ b/src/components/cuda/tests/simpleMultiGPU.cu @@ -1,92 +1,37 @@ -/* - * PAPI Multiple GPU example. This example is taken from the NVIDIA - * documentation (Copyright 1993-2013 NVIDIA Corporation) and has been - * adapted to show the use of CUPTI and PAPI in collecting event - * counters for multiple GPU contexts. PAPI Team (2015) - * - * Update, July/2021, for CUPTI 11. This version is for the CUPTI 11 - * API, which PAPI uses for Nvidia GPUs with Compute Capability >= - * 7.0. It will only work on cuda distributions of 10.0 or better. - * Similar to legacy CUpti API, PAPI is informed of the CUcontexts - * that will be used to execute kernels at the time of adding PAPI - * events for that device; as shown below. - */ - -/* - * This software contains source code provided by NVIDIA Corporation - * - * According to the Nvidia EULA (compute 5.5 version) - * http://developer.download.nvidia.com/compute/cuda/5_5/rel/docs/EULA.pdf - * - * Chapter 2. NVIDIA CORPORATION CUDA SAMPLES END USER LICENSE AGREEMENT - * 2.1.1. Source Code - * Developer shall have the right to modify and create derivative works with the Source - * Code. Developer shall own any derivative works ("Derivatives") it creates to the Source - * Code, provided that Developer uses the Materials in accordance with the terms and - * conditions of this Agreement. 
Developer may distribute the Derivatives, provided that - * all NVIDIA copyright notices and trademarks are propagated and used properly and - * the Derivatives include the following statement: “This software contains source code - * provided by NVIDIA Corporation.” - */ - -/* - * This application demonstrates how to use the CUDA API to use multiple GPUs, - * with an emphasis on simple illustration of the techniques (not on performance). - * - * Note that in order to detect multiple GPUs in your system you have to disable - * SLI in the nvidia control panel. Otherwise only one GPU is visible to the - * application. On the other side, you can still extend your desktop to screens - * attached to both GPUs. - * - * CUDA Context notes for CUPTI_11: Although a cudaSetDevice() will create a - * primary context for the device that allows kernel execution; PAPI cannot - * use a primary context to control the Nvidia Performance Profiler. - * Applications must create a context using cuCtxCreate() that will execute - * the kernel, this must be done prior to the PAPI_add_events() invocation in - * the code below. When multiple GPUs are in use, each requires its own - * context, and that context should be active when PAPI_events are added for - * each device. This means using seperate PAPI_add_events() for each device, - * as we do here. - */ - -// System includes +/** +* @file simpleMultiGPU.cu +* @brief For all enabled NVIDIA devices detected on the machine a matching Cuda context +* will be created and work will be done on that device. +* +* Note: The cuda component supports being partially disabled, meaning that certain devices +* will not be "enabled" to profile on. If PAPI_CUDA_API is not set, then devices with +* CC's >= 7.0 will be used and if PAPI_CUDA_API is set to LEGACY then devices with +* CC's <= 7.0 will be used. 
+*/ + +// Standard library headers #include +#include -// CUDA runtime +// Cuda Toolkit headers #include -#include -#ifdef PAPI +// Internal headers +#include "cuda_tests_helper.h" #include "papi.h" #include "papi_test.h" -#endif +#include "simpleMultiGPU.h" #ifndef MAX #define MAX(a,b) (a > b ? a : b) #endif -#include "simpleMultiGPU.h" - // ////////////////////////////////////////////////////////////////////////////// // Data configuration // ////////////////////////////////////////////////////////////////////////////// const int MAX_GPU_COUNT = 32; const int DATA_N = 48576 * 32; -#ifdef PAPI const int MAX_NUM_EVENTS = 32; -#endif - -#define CHECK_CU_ERROR(err, cufunc) \ - if (err != CUDA_SUCCESS) { fprintf (stderr, "Error %d for CUDA Driver API function '%s'\n", err, cufunc); return -1; } - -#define CHECK_CUDA_ERROR(err) \ - if (err != cudaSuccess) { fprintf (stderr, "%s:%i Error %d for CUDA [%s]\n", __FILE__, __LINE__, err, cudaGetErrorString(err) ); return -1; } - -#define CHECK_CUPTI_ERROR(err, cuptifunc) \ - if (err != CUPTI_SUCCESS) { const char *errStr; cuptiGetResultString(err, &errStr); \ - fprintf (stderr, "%s:%i Error %d [%s] for CUPTI API function '%s'\n", __FILE__, __LINE__, err, errStr, cuptifunc); return -1; } - -#define PRINT(quiet, format, args...) {if (!quiet) {fprintf(stderr, format, ## args);}} // ////////////////////////////////////////////////////////////////////////////// // Simple reduction kernel. @@ -105,363 +50,397 @@ __global__ static void reduceKernel( float *d_Result, float *d_Input, int N ) d_Result[tid] = sum; } -/** @class add_events_from_command_line - * @brief Try and add each event provided on the command line by the user. - * - * @param EventSet - * A PAPI eventset. - * @param totalEventCount - * Number of events from the command line. - * @param **eventsFromCommandLine - * Events provided on the command line. - * @param gpu_id - * NVIDIA device index. 
- * @param *numEventsSuccessfullyAdded - * Total number of successfully added events. - * @param **eventsSuccessfullyAdded - * Events that we are able to add to the EventSet. - * @param *numMultipassEvents - * Counter to see if a multiple pass event was provided on the command line. -*/ -static void add_events_from_command_line(int EventSet, int totalEventCount, char **eventNamesFromCommandLine, int gpu_id, int *numEventsSuccessfullyAdded, char **eventsSuccessfullyAdded, int *numMultipassEvents) +static void print_help_message(void) { - int i; - for (i = 0; i < totalEventCount; i++) { - char tmpEventName[PAPI_MAX_STR_LEN]; - int strLen = snprintf(tmpEventName, PAPI_MAX_STR_LEN, "%s:device=%d", eventNamesFromCommandLine[i], gpu_id); - if (strLen < 0 || strLen >= PAPI_MAX_STR_LEN) { - fprintf(stderr, "Failed to fully write event name with appended device qualifier.\n"); - test_skip(__FILE__, __LINE__, "", 0); - } - - int papi_errno = PAPI_add_named_event(EventSet, tmpEventName); - if (papi_errno != PAPI_OK) { - if (papi_errno != PAPI_EMULPASS) { - fprintf(stderr, "Unable to add event %s to the EventSet with error code %d.\n", tmpEventName, papi_errno); - test_skip(__FILE__, __LINE__, "", 0); - } + printf("./simpleMultiGPU --cuda-native-event-names [list of cuda native event names separated by a comma].\n" + "Notes:\n" + "1. Native event names must not have the device qualifier appended.\n"); +} - // Handle multiple pass events - (*numMultipassEvents)++; - continue; - } +static void parse_and_assign_args(int argc, char *argv[], char ***cuda_native_event_names, int *total_event_count) +{ + int i; + for (i = 1; i < argc; ++i) + { + char *arg = argv[i]; + if (strcmp(arg, "--help") == 0) + { + print_help_message(); + exit(EXIT_SUCCESS); + } + else if (strcmp(arg, "--cuda-native-event-names") == 0) + { + if (!argv[i + 1]) + { + printf("ERROR!! 
--cuda-native-event-names given, but no events listed.\n"); + exit(EXIT_FAILURE); + } + + char **cmd_line_native_event_names = NULL; + const char *cuda_native_event_name = strtok(argv[i+1], ","); + while (cuda_native_event_name != NULL) + { + if (strstr(cuda_native_event_name, ":device")) { + fprintf(stderr, "Cuda native event name must not have a device qualifier appended for this test, i.e. no :device=#.\n"); + print_help_message(); + exit(EXIT_FAILURE); + } + + cmd_line_native_event_names = (char **) realloc(cmd_line_native_event_names, ((*total_event_count) + 1) * sizeof(char *)); + check_memory_allocation_call(cmd_line_native_event_names); + + cmd_line_native_event_names[(*total_event_count)] = (char *) malloc(PAPI_2MAX_STR_LEN * sizeof(char)); + check_memory_allocation_call(cmd_line_native_event_names[(*total_event_count)]); + + int strLen = snprintf(cmd_line_native_event_names[(*total_event_count)], PAPI_2MAX_STR_LEN, "%s", cuda_native_event_name); + if (strLen < 0 || strLen >= PAPI_2MAX_STR_LEN) { + fprintf(stderr, "Failed to fully write cuda native event name.\n"); + exit(EXIT_FAILURE); + } + + (*total_event_count)++; + cuda_native_event_name = strtok(NULL, ","); + } + i++; + *cuda_native_event_names = cmd_line_native_event_names; + } + else + { + print_help_message(); + exit(EXIT_FAILURE); + } + } +} - // Handle successfully added events - strLen = snprintf(eventsSuccessfullyAdded[(*numEventsSuccessfullyAdded)], PAPI_MAX_STR_LEN, "%s", tmpEventName); - if (strLen < 0 || strLen >= PAPI_MAX_STR_LEN) { - fprintf(stderr, "Failed to fully write successfully added event.\n"); - test_skip(__FILE__, __LINE__, "", 0); - } - (*numEventsSuccessfullyAdded)++; +int main(int argc, char **argv) +{ + // Determine the number of Cuda capable devices + int num_devices = 0; + check_cuda_runtime_api_call( cudaGetDeviceCount(&num_devices) ); + + // No devices detected on the machine, exit + if (num_devices < 1) { + fprintf(stderr, "No NVIDIA devices found on the machine. 
This is required for the test to run.\n"); + test_skip(__FILE__, __LINE__, "", 0); } - return; -} + char *user_defined_suppress_output = getenv("PAPI_CUDA_TEST_QUIET"); + int suppress_output = 0; + if (user_defined_suppress_output) { + suppress_output = (int) strtol(user_defined_suppress_output, (char**) NULL, 10); + } + PRINT(suppress_output, "Running the Cuda component test simpleMultiGPU.cu\n"); -// ////////////////////////////////////////////////////////////////////////////// -// Program main -// ////////////////////////////////////////////////////////////////////////////// -int main( int argc, char **argv ) -{ - // Solver config - TGPUplan plan[MAX_GPU_COUNT]; - // GPU reduction results - float h_SumGPU[MAX_GPU_COUNT]; - float sumGPU; - double sumCPU, diff; - int i, j, gpuBase, num_gpus; + char **cuda_native_event_names = NULL; + // If command line arguments are provided then get their values. + int total_event_count = 0; + if (argc > 1) { + parse_and_assign_args(argc, argv, &cuda_native_event_names, &total_event_count); + } - const int BLOCK_N = 32; - const int THREAD_N = 256; - const int ACCUM_N = BLOCK_N * THREAD_N; + // Initialize PAPI library + int papi_errno = PAPI_library_init( PAPI_VER_CURRENT ); + if( papi_errno != PAPI_VER_CURRENT ) { + test_fail(__FILE__, __LINE__, "PAPI_library_init()", papi_errno); + } + PRINT(suppress_output, "PAPI version being used for this test: %d.%d.%d\n", + PAPI_VERSION_MAJOR(PAPI_VERSION), + PAPI_VERSION_MINOR(PAPI_VERSION), + PAPI_VERSION_REVISION(PAPI_VERSION)); + + int cuda_cmp_idx = PAPI_get_component_index("cuda"); + if (cuda_cmp_idx < 0) { + test_fail(__FILE__, __LINE__, "PAPI_get_component_index()", cuda_cmp_idx); + } + PRINT(suppress_output, "The cuda component is assigned to component index: %d\n", cuda_cmp_idx); - CUcontext ctx[MAX_GPU_COUNT]; - CUcontext poppedCtx; + // Initialize the Cuda component + int cuda_eventcode = 0 | PAPI_NATIVE_MASK; + check_papi_api_call( PAPI_enum_cmp_event(&cuda_eventcode, 
PAPI_ENUM_FIRST, cuda_cmp_idx) ); - char *test_quiet = getenv("PAPI_CUDA_TEST_QUIET"); - int quiet = 0; - if (test_quiet) - quiet = (int) strtol(test_quiet, (char**) NULL, 10); + // If we have not gotten an event via the command line, use the event obtained from PAPI_enum_cmp_event + if (total_event_count == 0) { + int num_spaces_to_allocate = 1; + cuda_native_event_names = (char **) malloc(num_spaces_to_allocate * sizeof(char *)); + check_memory_allocation_call( cuda_native_event_names ); - PRINT( quiet, "Starting simpleMultiGPU\n" ); + cuda_native_event_names[total_event_count] = (char *) malloc(PAPI_2MAX_STR_LEN * sizeof(char)); + check_memory_allocation_call( cuda_native_event_names[total_event_count] ); -#ifdef PAPI - int event_count = argc - 1; + check_papi_api_call( PAPI_event_code_to_name(cuda_eventcode, cuda_native_event_names[total_event_count++]) ); + } - /* if no events passed at command line, just report test skipped. */ - if (event_count == 0) { - fprintf(stderr, "No eventnames specified at command line.\n"); - test_skip(__FILE__, __LINE__, "", 0); + const PAPI_component_info_t *cmpInfo = PAPI_get_component_info(cuda_cmp_idx); + if (cmpInfo == NULL) { + fprintf(stderr, "Call to PAPI_get_component_info failed.\n"); + exit(EXIT_FAILURE); } - /* PAPI Initialization must occur before any context creation/manipulation. */ - /* This is to ensure PAPI can monitor CUpti library calls. */ - int papi_errno = PAPI_library_init( PAPI_VER_CURRENT ); - if( papi_errno != PAPI_VER_CURRENT ) { - fprintf( stderr, "PAPI_library_init failed\n" ); - exit(-1); + // Check to see if the Cuda component is partially disabled + if (cmpInfo->partially_disabled) { + const char *cc_support = (getenv("PAPI_CUDA_API") != NULL) ? "<=7.0" : ">=7.0"; + PRINT(suppress_output, "\033[33mThe cuda component is partially disabled. 
Only support for CC's %s are enabled.\033[0m\n", cc_support); } - printf( "PAPI version: %d.%d.%d\n", PAPI_VERSION_MAJOR( PAPI_VERSION ), PAPI_VERSION_MINOR( PAPI_VERSION ), PAPI_VERSION_REVISION( PAPI_VERSION ) ); -#endif - - // Report on the available CUDA devices - int computeCapabilityMajor = 0, computeCapabilityMinor = 0; - int runtimeVersion = 0, driverVersion = 0; - char deviceName[PAPI_MIN_STR_LEN]; + check_cuda_runtime_api_call( cudaGetDeviceCount( &num_devices ) ); CUdevice device[MAX_GPU_COUNT]; - CHECK_CUDA_ERROR( cudaGetDeviceCount( &num_gpus ) ); - if( num_gpus > MAX_GPU_COUNT ) num_gpus = MAX_GPU_COUNT; - PRINT( quiet, "CUDA-capable device count: %i\n", num_gpus ); - for ( i=0; ipartially_disabled) { + // Device is not enabled continue + if (determine_if_device_is_enabled(dev_idx) == 0) { + continue; + } + } + int flags = 0; + check_cuda_driver_api_call( cuDeviceGet(&device[dev_idx], dev_idx) ); #if defined(CUDA_TOOLKIT_GE_13) - CHECK_CU_ERROR( cuCtxCreate(&(ctx[i]), (CUctxCreateParams*)0, flags, device[i]), "cuCtxCreate" ); + check_cuda_driver_api_call( cuCtxCreate(&(ctx[dev_idx]), (CUctxCreateParams*)0, flags, device[dev_idx]) ); #else - CHECK_CU_ERROR( cuCtxCreate(&(ctx[i]), flags, device[i]), "cuCtxCreate" ); + check_cuda_driver_api_call( cuCtxCreate(&(ctx[dev_idx]), flags, device[dev_idx]) ); #endif - CHECK_CU_ERROR( cuCtxPopCurrent(&poppedCtx), "cuCtxPopCurrent" ); // ... so take it off. 
- } + plan[dev_idx].dataN = DATA_N / num_devices; + // Take into account odd data sizes and increment + if (plan[dev_idx].dataN % 2) { + plan[dev_idx].dataN++; + } - PRINT( quiet, "Generating input data...\n" ); - - // Subdividing input data across GPUs - // Get data sizes for each GPU - for( i = 0; i < num_gpus; i++ ) - plan[i].dataN = DATA_N / num_gpus; - // Take into account "odd" data sizes - for( i = 0; i < DATA_N % num_gpus; i++ ) - plan[i].dataN++; - - // Assign data ranges to GPUs - gpuBase = 0; - for( i = 0; i < num_gpus; i++ ) { - plan[i].h_Sum = h_SumGPU + i; // point within h_SumGPU array - gpuBase += plan[i].dataN; - } + plan[dev_idx].h_Sum = h_SumGPU + dev_idx; // point within h_SumGPU array + gpuBase += plan[dev_idx].dataN; - // Create streams for issuing GPU command asynchronously and allocate memory (GPU and System page-locked) - for( i = 0; i < num_gpus; i++ ) { - CHECK_CU_ERROR(cuCtxPushCurrent(ctx[i]), "cuCtxPushCurrent"); - CHECK_CUDA_ERROR( cudaStreamCreate( &plan[i].stream ) ); - CHECK_CUDA_ERROR( cudaMalloc( ( void ** ) &plan[i].d_Data, plan[i].dataN * sizeof( float ) ) ); - CHECK_CUDA_ERROR( cudaMalloc( ( void ** ) &plan[i].d_Sum, ACCUM_N * sizeof( float ) ) ); - CHECK_CUDA_ERROR( cudaMallocHost( ( void ** ) &plan[i].h_Sum_from_device, ACCUM_N * sizeof( float ) ) ); - CHECK_CUDA_ERROR( cudaMallocHost( ( void ** ) &plan[i].h_Data, plan[i].dataN * sizeof( float ) ) ); - for( j = 0; j < plan[i].dataN; j++ ) { - plan[i].h_Data[j] = ( float ) rand() / ( float ) RAND_MAX; + // Create an asynchronous stream + check_cuda_runtime_api_call( cudaStreamCreate( &plan[dev_idx].stream ) ); + // Allocate memory on the device + check_cuda_runtime_api_call( cudaMalloc((void **) &plan[dev_idx].d_Data, plan[dev_idx].dataN * sizeof(float)) ); + check_cuda_runtime_api_call( cudaMalloc((void **) &plan[dev_idx].d_Sum, ACCUM_N * sizeof(float)) ); + // Allocates page locked memory on the host + check_cuda_runtime_api_call( cudaMallocHost((void **) 
&plan[dev_idx].h_Sum_from_device, ACCUM_N * sizeof(float)) ); + check_cuda_runtime_api_call( cudaMallocHost((void **) &plan[dev_idx].h_Data, plan[dev_idx].dataN * sizeof(float)) ); + + for (j = 0; j < plan[dev_idx].dataN; j++) { + plan[dev_idx].h_Data[j] = ( float ) rand() / ( float ) RAND_MAX; } - CHECK_CU_ERROR( cuCtxPopCurrent(&poppedCtx), "cuCtxPopCurrent" ); + check_cuda_driver_api_call( cuCtxPopCurrent(&poppedCtx) ); } -#ifdef PAPI - PRINT(quiet, "Setup PAPI counters internally (PAPI)\n"); int EventSet = PAPI_NULL; - int NUM_EVENTS = MAX_GPU_COUNT*MAX_NUM_EVENTS; - long long values[NUM_EVENTS]; - - int cid = PAPI_get_component_index("cuda"); - if (cid < 0) { - PAPI_shutdown(); - test_fail(__FILE__, __LINE__, "Failed to get index of cuda component.", PAPI_ECMP); - } - - PRINT(quiet, "Found CUDA Component at id %d\n", cid); + check_papi_api_call( PAPI_create_eventset(&EventSet) ); - papi_errno = PAPI_create_eventset(&EventSet); - if (papi_errno != PAPI_OK) { - test_fail(__FILE__, __LINE__, "PAPI_create_eventset failed.", papi_errno); - } - - papi_errno = PAPI_assign_eventset_component(EventSet, cid); - if (papi_errno != PAPI_OK) { - test_fail(__FILE__, __LINE__, "PAPI_assign_eventset_component failed.", papi_errno); - } + // Handle the events from the command line + int num_events_successfully_added = 0, numMultipassEvents = 0; + int NUM_EVENTS = MAX_GPU_COUNT * MAX_NUM_EVENTS; + char **events_successfully_added = (char **) malloc(NUM_EVENTS * sizeof(char *)); + check_memory_allocation_call( events_successfully_added ); + + int event_idx; + for (dev_idx = 0; dev_idx < num_devices; dev_idx++) { + if (cmpInfo->partially_disabled) { + // Device is not enabled continue + if (determine_if_device_is_enabled(dev_idx) == 0) { + continue; + } + } - // In this example measure events from each GPU - // Add events at a GPU specific level ... 
eg cuda:::device:2:elapsed_cycles_sm - // Similar to legacy CUpti API, we must change the contexts to the appropriate device to - // add events to inform PAPI of the context that will run the kernels. + for (event_idx = 0; event_idx < total_event_count; event_idx++) { + char tmp_event_name[PAPI_MAX_STR_LEN]; + int strLen = snprintf(tmp_event_name, PAPI_MAX_STR_LEN, "%s:device=%d", cuda_native_event_names[event_idx], dev_idx); + if (strLen < 0 || strLen >= PAPI_MAX_STR_LEN) { + fprintf(stderr, "Failed to fully write event name with appended device qualifier.\n"); + exit(EXIT_FAILURE); + } - // Save current context, will restore after adding events. - CUcontext userContext; - CHECK_CU_ERROR(cuCtxGetCurrent(&userContext), "cuCtxGetCurrent"); + events_successfully_added[num_events_successfully_added] = (char *) malloc(PAPI_MAX_STR_LEN * sizeof(char)); + check_memory_allocation_call( events_successfully_added[event_idx] ); - // Handle the events from the command line - int numEventsSuccessfullyAdded = 0, numMultipassEvents = 0; - char **eventsSuccessfullyAdded, **metricNames = argv + 1; - eventsSuccessfullyAdded = (char **) malloc(NUM_EVENTS * sizeof(char *)); - if (eventsSuccessfullyAdded == NULL) { - fprintf(stderr, "Failed to allocate memory for successfully added events.\n"); - test_skip(__FILE__, __LINE__, "", 0); - } - for (i = 0; i < NUM_EVENTS; i++) { - eventsSuccessfullyAdded[i] = (char *) malloc(PAPI_MAX_STR_LEN * sizeof(char)); - if (eventsSuccessfullyAdded[i] == NULL) { - fprintf(stderr, "Failed to allocate memory for command line argument.\n"); - test_skip(__FILE__, __LINE__, "", 0); + // We must change contexts to the appropriate device to add events to inform PAPI of the context that will run the kernels + check_cuda_driver_api_call( cuCtxSetCurrent(ctx[dev_idx]) ); + add_cuda_native_events(EventSet, tmp_event_name, &num_events_successfully_added, events_successfully_added, &numMultipassEvents); } } - int gpu_id; - for (gpu_id = 0; gpu_id < num_gpus; 
gpu_id++) { - CHECK_CU_ERROR(cuCtxSetCurrent(ctx[gpu_id]), "cuCtxSetCurrent"); - add_events_from_command_line(EventSet, event_count, metricNames, gpu_id, &numEventsSuccessfullyAdded, eventsSuccessfullyAdded, &numMultipassEvents); - } - // Only multiple pass events were provided on the command line - if (numEventsSuccessfullyAdded == 0) { + if (num_events_successfully_added == 0) { fprintf(stderr, "Events provided on the command line could not be added to an EventSet as they require multiple passes.\n"); test_skip(__FILE__, __LINE__, "", 0); } - // Restore user context. - - CHECK_CU_ERROR(cuCtxSetCurrent(userContext), "cuCtxSetCurrent"); - // Invoke PAPI_start(). - papi_errno = PAPI_start( EventSet ); - if( papi_errno != PAPI_OK ) { - test_fail(__FILE__, __LINE__, "PAPI_start failed", papi_errno); - } -#endif + check_papi_api_call( PAPI_start(EventSet) ); - // Start timing and compute on GPU(s) - PRINT( quiet, "Computing with %d GPUs...\n", num_gpus ); + // Start timing StartTimer(); // Copy data to GPU, launch the kernel and copy data back. All asynchronously - for (i = 0; i < num_gpus; i++) { + for (dev_idx = 0; dev_idx < num_devices; dev_idx++) { + if (cmpInfo->partially_disabled) { + // Device is not enabled continue + if (determine_if_device_is_enabled(dev_idx) == 0) { + continue; + } + } // Pushing a context implicitly sets the device for which it was created. 
- CHECK_CU_ERROR(cuCtxPushCurrent(ctx[i]), "cuCtxPushCurrent"); + check_cuda_driver_api_call( cuCtxPushCurrent(ctx[dev_idx]) ); // Copy input data from CPU - CHECK_CUDA_ERROR( cudaMemcpyAsync( plan[i].d_Data, plan[i].h_Data, plan[i].dataN * sizeof( float ), cudaMemcpyHostToDevice, plan[i].stream ) ); + check_cuda_runtime_api_call( cudaMemcpyAsync( plan[dev_idx].d_Data, plan[dev_idx].h_Data, plan[dev_idx].dataN * sizeof( float ), cudaMemcpyHostToDevice, plan[dev_idx].stream ) ); // Perform GPU computations - reduceKernel <<< BLOCK_N, THREAD_N, 0, plan[i].stream >>> ( plan[i].d_Sum, plan[i].d_Data, plan[i].dataN ); - if ( cudaGetLastError() != cudaSuccess ) { printf( "reduceKernel() execution failed (GPU %d).\n", i ); exit(EXIT_FAILURE); } + reduceKernel <<< BLOCK_N, THREAD_N, 0, plan[dev_idx].stream >>> ( plan[dev_idx].d_Sum, plan[dev_idx].d_Data, plan[dev_idx].dataN ); + check_cuda_runtime_api_call( cudaGetLastError() ); // Read back GPU results - CHECK_CUDA_ERROR( cudaMemcpyAsync( plan[i].h_Sum_from_device, plan[i].d_Sum, ACCUM_N * sizeof( float ), cudaMemcpyDeviceToHost, plan[i].stream ) ); + check_cuda_runtime_api_call( cudaMemcpyAsync( plan[dev_idx].h_Sum_from_device, plan[dev_idx].d_Sum, ACCUM_N * sizeof( float ), cudaMemcpyDeviceToHost, plan[dev_idx].stream ) ); // Popping a context can change the device to match the previous context. - CHECK_CU_ERROR( cuCtxPopCurrent(&(ctx[i])), "cuCtxPopCurrent" ); + check_cuda_driver_api_call( cuCtxPopCurrent(&(ctx[dev_idx])) ); } // Process GPU results - PRINT( quiet, "Process GPU results on %d GPUs...\n", num_gpus ); - for( i = 0; i < num_gpus; i++ ) { + PRINT(suppress_output, "Process GPU results...\n"); + for(dev_idx = 0; dev_idx < num_devices; dev_idx++) { + if (cmpInfo->partially_disabled) { + // Device is not enabled continue + if (determine_if_device_is_enabled(dev_idx) == 0) { + continue; + } + } float sum; // Pushing a context implicitly sets the device for which it was created. 
- CHECK_CU_ERROR(cuCtxPushCurrent(ctx[i]), "cuCtxPushCurrent"); + check_cuda_driver_api_call( cuCtxPushCurrent(ctx[dev_idx]) ); // Wait for all operations to finish - cudaStreamSynchronize( plan[i].stream ); + cudaStreamSynchronize( plan[dev_idx].stream ); // Finalize GPU reduction for current subvector sum = 0; - for( j = 0; j < ACCUM_N; j++ ) { - sum += plan[i].h_Sum_from_device[j]; + for (j = 0; j < ACCUM_N; j++) { + sum += plan[dev_idx].h_Sum_from_device[j]; } - *( plan[i].h_Sum ) = ( float ) sum; + *( plan[dev_idx].h_Sum ) = ( float ) sum; // Popping a context can change the device to match the previous context. - CHECK_CU_ERROR( cuCtxPopCurrent(&(ctx[i])), "cuCtxPopCurrent" ); + check_cuda_driver_api_call( cuCtxPopCurrent(&(ctx[dev_idx])) ); } double gpuTime = GetTimer(); -#ifdef PAPI - for ( i=0; ipartially_disabled) { + // Device is not enabled continue + if (determine_if_device_is_enabled(dev_idx) == 0) { + continue; + } + } // Pushing a context implicitly sets the device for which it was created. - CHECK_CU_ERROR(cuCtxPushCurrent(ctx[i]), "cuCtxPushCurrent"); - CHECK_CU_ERROR( cuCtxSynchronize( ), "cuCtxSynchronize" ); + check_cuda_driver_api_call( cuCtxPushCurrent(ctx[dev_idx]) ); + check_cuda_driver_api_call( cuCtxSynchronize( ) ); // Popping a context may change the current device to match the new current context. - CHECK_CU_ERROR( cuCtxPopCurrent(&(ctx[i])), "cuCtxPopCurrent" ); + check_cuda_driver_api_call( cuCtxPopCurrent(&(ctx[dev_idx])) ); + } + + long long cuda_counter_values[NUM_EVENTS]; + check_papi_api_call( PAPI_stop(EventSet, cuda_counter_values) ); + + for(event_idx = 0; event_idx < num_events_successfully_added; event_idx++) { + PRINT(suppress_output, "Event %s produced the value:\t\t%lld\n", events_successfully_added[event_idx], cuda_counter_values[event_idx]); } - papi_errno = PAPI_stop( EventSet, values ); // Stop (will read values). 
- if( papi_errno != PAPI_OK ) fprintf( stderr, "PAPI_stop failed\n" ); - for( i = 0; i < numEventsSuccessfullyAdded; i++ ) - PRINT( quiet, "PAPI counterValue %12lld \t\t --> %s \n", values[i], eventsSuccessfullyAdded[i] ); + check_papi_api_call( PAPI_cleanup_eventset(EventSet) ); + + check_papi_api_call( PAPI_destroy_eventset(&EventSet) ); - papi_errno = PAPI_cleanup_eventset( EventSet ); - if( papi_errno != PAPI_OK ) fprintf( stderr, "PAPI_cleanup_eventset failed\n" ); - papi_errno = PAPI_destroy_eventset( &EventSet ); - if( papi_errno != PAPI_OK ) fprintf( stderr, "PAPI_destroy_eventset failed\n" ); PAPI_shutdown(); -#endif - sumGPU = 0; - for( i = 0; i < num_gpus; i++ ) { - sumGPU += h_SumGPU[i]; + float sumGPU = 0.0; + for(dev_idx = 0; dev_idx < num_devices; dev_idx++) { + if (cmpInfo->partially_disabled) { + // Device is not enabled continue + if (determine_if_device_is_enabled(dev_idx) == 0) { + continue; + } + } + sumGPU += h_SumGPU[dev_idx]; } - PRINT( quiet, " GPU Processing time: %f (ms)\n", gpuTime ); + PRINT(suppress_output, " GPU Processing time: %f (ms)\n", gpuTime); // Compute on Host CPU - PRINT( quiet, "Computing the same result with Host CPU...\n" ); + PRINT(suppress_output, "Computing the same result with Host CPU...\n"); StartTimer(); - sumCPU = 0; - for( i = 0; i < num_gpus; i++ ) { - for( j = 0; j < plan[i].dataN; j++ ) { - sumCPU += plan[i].h_Data[j]; + double sumCPU = 0.0; + for(dev_idx = 0; dev_idx < num_devices; dev_idx++) { + if (cmpInfo->partially_disabled) { + // Device is not enabled continue + if (determine_if_device_is_enabled(dev_idx) == 0) { + continue; + } + } + + for (j = 0; j < plan[dev_idx].dataN; j++) { + sumCPU += plan[dev_idx].h_Data[j]; } } + double cpuTime = GetTimer(); if (gpuTime > 0) { - PRINT( quiet, " CPU Processing time: %f (ms) (speedup %.2fX)\n", cpuTime, (cpuTime/gpuTime) ); + PRINT(suppress_output, " CPU Processing time: %f (ms) (speedup %.2fX)\n", cpuTime, (cpuTime/gpuTime)); } else { - PRINT( quiet, " CPU 
Processing time: %f (ms)\n", cpuTime); + PRINT(suppress_output, " CPU Processing time: %f (ms)\n", cpuTime); } // Compare GPU and CPU results - PRINT( quiet, "Comparing GPU and Host CPU results...\n" ); - diff = fabs( sumCPU - sumGPU ) / fabs( sumCPU ); - PRINT( quiet, " GPU sum: %f\n CPU sum: %f\n", sumGPU, sumCPU ); - PRINT( quiet, " Relative difference: %E \n", diff ); + PRINT(suppress_output, "Comparing GPU and Host CPU results...\n"); + double diff = fabs( sumCPU - sumGPU ) / fabs( sumCPU ); + PRINT(suppress_output, " GPU sum: %f\n CPU sum: %f\n", sumGPU, sumCPU); + PRINT(suppress_output, " Relative difference: %E \n", diff); + + // Output a note that a multiple pass event was provided on the command line + if (numMultipassEvents > 0) { + PRINT(suppress_output, "\033[0;33mNOTE: From the events provided on the command line, an event or events requiring multiple passes was detected and not added to the EventSet. Check your events with utils/papi_native_avail.\n\033[0m"); + } // Cleanup and shutdown - for( i = 0; i < num_gpus; i++ ) { - CHECK_CUDA_ERROR( cudaFreeHost( plan[i].h_Sum_from_device ) ); - CHECK_CUDA_ERROR( cudaFreeHost( plan[i].h_Data ) ); - CHECK_CUDA_ERROR( cudaFree( plan[i].d_Sum ) ); - CHECK_CUDA_ERROR( cudaFree( plan[i].d_Data ) ); - // Shut down this GPU - CHECK_CUDA_ERROR( cudaStreamDestroy( plan[i].stream ) ); - CHECK_CU_ERROR( cuCtxDestroy(ctx[i]), "cuCtxDestroy"); + for(dev_idx = 0; dev_idx < num_devices; dev_idx++ ) { + if (cmpInfo->partially_disabled) { + // Device is not enabled continue + if (determine_if_device_is_enabled(dev_idx) == 0) { + continue; + } + } + // Free page-locked memory + check_cuda_runtime_api_call( cudaFreeHost(plan[dev_idx].h_Sum_from_device) ); + check_cuda_runtime_api_call( cudaFreeHost(plan[dev_idx].h_Data) ); + // Free memory on the device + check_cuda_runtime_api_call( cudaFree(plan[dev_idx].d_Sum) ); + check_cuda_runtime_api_call( cudaFree(plan[dev_idx].d_Data) ); + // Destroys and cleans up asynchronous stream 
+ check_cuda_runtime_api_call( cudaStreamDestroy(plan[dev_idx].stream) ); + // Destroy Cuda context + check_cuda_driver_api_call( cuCtxDestroy(ctx[dev_idx]) ); } //Free allocated memory - for (i = 0; i < event_count; i++) { - free(eventsSuccessfullyAdded[i]); - } - free(eventsSuccessfullyAdded); - -#ifdef PAPI - // Output a note that a multiple pass event was provided on the command line - if (numMultipassEvents > 0) { - PRINT(quiet, "\033[0;33mNOTE: From the events provided on the command line, an event or events requiring multiple passes was detected and not added to the EventSet. Check your events with utils/papi_native_avail.\n\033[0m"); + for (event_idx = 0; event_idx < total_event_count; event_idx++) { + free(cuda_native_event_names[event_idx]); } + free(cuda_native_event_names); + + for (event_idx = 0; event_idx < num_events_successfully_added; event_idx++) { + free(events_successfully_added[event_idx]); + } + free(events_successfully_added); - if ( diff < 1e-5 ) + if (diff < 1e-5) { test_pass(__FILE__); - else + } + else { test_fail(__FILE__, __LINE__, "Result of GPU calculation doesn't match CPU.", PAPI_EINVAL); -#endif + } + return 0; } diff --git a/src/components/cuda/tests/simpleMultiGPU_noCuCtx.cu b/src/components/cuda/tests/simpleMultiGPU_noCuCtx.cu index 416bac61d..1fcc1753a 100644 --- a/src/components/cuda/tests/simpleMultiGPU_noCuCtx.cu +++ b/src/components/cuda/tests/simpleMultiGPU_noCuCtx.cu @@ -1,92 +1,37 @@ -/* - * PAPI Multiple GPU example. This example is taken from the NVIDIA - * documentation (Copyright 1993-2013 NVIDIA Corporation) and has been - * adapted to show the use of CUPTI and PAPI in collecting event - * counters for multiple GPU contexts. PAPI Team (2015) - * - * Update, July/2021, for CUPTI 11. This version is for the CUPTI 11 - * API, which PAPI uses for Nvidia GPUs with Compute Capability >= - * 7.0. It will only work on cuda distributions of 10.0 or better. 
- * Similar to legacy CUpti API, PAPI is informed of the CUcontexts - * that will be used to execute kernels at the time of adding PAPI - * events for that device; as shown below. - */ - -/* - * This software contains source code provided by NVIDIA Corporation - * - * According to the Nvidia EULA (compute 5.5 version) - * http://developer.download.nvidia.com/compute/cuda/5_5/rel/docs/EULA.pdf - * - * Chapter 2. NVIDIA CORPORATION CUDA SAMPLES END USER LICENSE AGREEMENT - * 2.1.1. Source Code - * Developer shall have the right to modify and create derivative works with the Source - * Code. Developer shall own any derivative works ("Derivatives") it creates to the Source - * Code, provided that Developer uses the Materials in accordance with the terms and - * conditions of this Agreement. Developer may distribute the Derivatives, provided that - * all NVIDIA copyright notices and trademarks are propagated and used properly and - * the Derivatives include the following statement: “This software contains source code - * provided by NVIDIA Corporation.” - */ - -/* - * This application demonstrates how to use the CUDA API to use multiple GPUs, - * with an emphasis on simple illustration of the techniques (not on performance). - * - * Note that in order to detect multiple GPUs in your system you have to disable - * SLI in the nvidia control panel. Otherwise only one GPU is visible to the - * application. On the other side, you can still extend your desktop to screens - * attached to both GPUs. - * - * CUDA Context notes for CUPTI_11: Although a cudaSetDevice() will create a - * primary context for the device that allows kernel execution; PAPI cannot - * use a primary context to control the Nvidia Performance Profiler. - * Applications must create a context using cuCtxCreate() that will execute - * the kernel, this must be done prior to the PAPI_add_events() invocation in - * the code below. 
When multiple GPUs are in use, each requires its own - * context, and that context should be active when PAPI_events are added for - * each device. This means using seperate PAPI_add_events() for each device, - * as we do here. - */ - -// System includes +/** +* @file simpleMultiGPU_noCuCtx.cu +* @brief For all enabled NVIDIA devices detected on the machine a matching call to cudaSetDevice +* will be made. cudaSetDevice determines which device executions will be done on. +* +* Note: The cuda component supports being partially disabled, meaning that certain devices +* will not be "enabled" to profile on. If PAPI_CUDA_API is not set, then devices with +* CC's >= 7.0 will be used and if PAPI_CUDA_API is set to LEGACY then devices with +* CC's <= 7.0 will be used. +*/ + +// Standard library headers #include +#include -// CUDA runtime +// Cuda Toolkit headers #include -#include -#ifdef PAPI +// Internal headers +#include "cuda_tests_helper.h" #include "papi.h" #include "papi_test.h" -#endif +#include "simpleMultiGPU.h" #ifndef MAX #define MAX(a,b) (a > b ? 
a : b) #endif -#include "simpleMultiGPU.h" - // ////////////////////////////////////////////////////////////////////////////// // Data configuration // ////////////////////////////////////////////////////////////////////////////// const int MAX_GPU_COUNT = 32; const int DATA_N = 48576 * 32; -#ifdef PAPI const int MAX_NUM_EVENTS = 32; -#endif - -#define CHECK_CU_ERROR(err, cufunc) \ - if (err != CUDA_SUCCESS) { fprintf (stderr, "Error %d for CUDA Driver API function '%s'\n", err, cufunc); return -1; } - -#define CHECK_CUDA_ERROR(err) \ - if (err != cudaSuccess) { fprintf (stderr, "%s:%i Error %d for CUDA [%s]\n", __FILE__, __LINE__, err, cudaGetErrorString(err) ); return -1; } - -#define CHECK_CUPTI_ERROR(err, cuptifunc) \ - if (err != CUPTI_SUCCESS) { const char *errStr; cuptiGetResultString(err, &errStr); \ - fprintf (stderr, "%s:%i Error %d [%s] for CUPTI API function '%s'\n", __FILE__, __LINE__, err, errStr, cuptifunc); return -1; } - -#define PRINT(quiet, format, args...) {if (!quiet) {fprintf(stderr, format, ## args);}} // ////////////////////////////////////////////////////////////////////////////// // Simple reduction kernel. @@ -105,333 +50,376 @@ __global__ static void reduceKernel( float *d_Result, float *d_Input, int N ) d_Result[tid] = sum; } -/** @class add_events_from_command_line - * @brief Try and add each event provided on the command line by the user. - * - * @param EventSet - * A PAPI eventset. - * @param totalEventCount - * Number of events from the command line. - * @param **eventsFromCommandLine - * Events provided on the command line. - * @param gpu_id - * Current gpu id. - * @param *numEventsSuccessfullyAdded - * Total number of successfully added events. - * @param **eventsSuccessfullyAdded - * Events that we are able to add to the EventSet. - * @param *numMultipassEvents - * Counter to see if a multiple pass event was provided on the command line. 
-*/ -static void add_events_from_command_line(int EventSet, int totalEventCount, char **eventNamesFromCommandLine, int gpu_id, int *numEventsSuccessfullyAdded, char **eventsSuccessfullyAdded, int *numMultipassEvents) +static void print_help_message(void) { - int i; - for (i = 0; i < totalEventCount; i++) { - char tmpEventName[PAPI_MAX_STR_LEN]; - int strLen = snprintf(tmpEventName, PAPI_MAX_STR_LEN, "%s:device=%d", eventNamesFromCommandLine[i], gpu_id); - if (strLen < 0 || strLen >= PAPI_MAX_STR_LEN) { - fprintf(stderr, "Failed to fully write event name with appended device qualifier.\n"); - test_skip(__FILE__, __LINE__, "", 0); - } - - int papi_errno = PAPI_add_named_event(EventSet, tmpEventName); - if (papi_errno != PAPI_OK) { - if (papi_errno != PAPI_EMULPASS) { - fprintf(stderr, "Unable to add event %s to the EventSet with error code %d.\n", tmpEventName, papi_errno); - test_skip(__FILE__, __LINE__, "", 0); - } - - // Handle multiple pass events - (*numMultipassEvents)++; - continue; - } - - // Handle successfully added events - strLen = snprintf(eventsSuccessfullyAdded[(*numEventsSuccessfullyAdded)], PAPI_MAX_STR_LEN, "%s", tmpEventName); - if (strLen < 0 || strLen >= PAPI_MAX_STR_LEN) { - fprintf(stderr, "Failed to fully write successfully added event.\n"); - test_skip(__FILE__, __LINE__, "", 0); - } - (*numEventsSuccessfullyAdded)++; - } - - return; + printf("./simpleMultiGPU_noCuCtx --cuda-native-event-names [list of cuda native event names separated by a comma].\n" + "Notes:\n" + "1. 
Native event names must not have the device qualifier appended.\n"); } -// ////////////////////////////////////////////////////////////////////////////// -// Program main -// ////////////////////////////////////////////////////////////////////////////// -int main( int argc, char **argv ) +static void parse_and_assign_args(int argc, char *argv[], char ***cuda_native_event_names, int *total_event_count) { - // Solver config - TGPUplan plan[MAX_GPU_COUNT]; - // GPU reduction results - float h_SumGPU[MAX_GPU_COUNT]; - float sumGPU; - double sumCPU, diff; - int i, j, gpuBase, num_gpus; - - const int BLOCK_N = 32; - const int THREAD_N = 256; - const int ACCUM_N = BLOCK_N * THREAD_N; - - char *test_quiet = getenv("PAPI_CUDA_TEST_QUIET"); - int quiet = 0; - if (test_quiet) - quiet = (int) strtol(test_quiet, (char**) NULL, 10); - - PRINT( quiet, "Starting simpleMultiGPU\n" ); - -#ifdef PAPI - int event_count = argc - 1; + int i; + for (i = 1; i < argc; ++i) + { + char *arg = argv[i]; + if (strcmp(arg, "--help") == 0) + { + print_help_message(); + exit(EXIT_SUCCESS); + } + else if (strcmp(arg, "--cuda-native-event-names") == 0) + { + if (!argv[i + 1]) + { + printf("ERROR!! --cuda-native-event-names given, but no events listed.\n"); + exit(EXIT_FAILURE); + } + + char **cmd_line_native_event_names = NULL; + const char *cuda_native_event_name = strtok(argv[i+1], ","); + while (cuda_native_event_name != NULL) + { + if (strstr(cuda_native_event_name, ":device")) { + fprintf(stderr, "Cuda native event name must not have a device qualifier appended for this test, i.e. 
no :device=#.\n"); + print_help_message(); + exit(EXIT_FAILURE); + } + + cmd_line_native_event_names = (char **) realloc(cmd_line_native_event_names, ((*total_event_count) + 1) * sizeof(char *)); + check_memory_allocation_call(cmd_line_native_event_names); + + cmd_line_native_event_names[(*total_event_count)] = (char *) malloc(PAPI_2MAX_STR_LEN * sizeof(char)); + check_memory_allocation_call(cmd_line_native_event_names[(*total_event_count)]); + + int strLen = snprintf(cmd_line_native_event_names[(*total_event_count)], PAPI_2MAX_STR_LEN, "%s", cuda_native_event_name); + if (strLen < 0 || strLen >= PAPI_2MAX_STR_LEN) { + fprintf(stderr, "Failed to fully write cuda native event name.\n"); + exit(EXIT_FAILURE); + } + + (*total_event_count)++; + cuda_native_event_name = strtok(NULL, ","); + } + i++; + *cuda_native_event_names = cmd_line_native_event_names; + } + else + { + print_help_message(); + exit(EXIT_FAILURE); + } + } +} - /* if no events passed at command line, just report test skipped. */ - if (event_count == 0) { - fprintf(stderr, "No eventnames specified at command line.\n"); - test_skip(__FILE__, __LINE__, "", 0); +int main(int argc, char **argv) +{ + // Determine the number of Cuda capable devices + int num_devices = 0; + check_cuda_runtime_api_call( cudaGetDeviceCount(&num_devices) ); + + // No devices detected on the machine, exit + if (num_devices < 1) { + fprintf(stderr, "No NVIDIA devices found on the machine. This is required for the test to run.\n"); + test_skip(__FILE__, __LINE__, "", 0); + } + + char *user_defined_suppress_output = getenv("PAPI_CUDA_TEST_QUIET"); + int suppress_output = 0; + if (user_defined_suppress_output) { + suppress_output = (int) strtol(user_defined_suppress_output, (char**) NULL, 10); + } + PRINT(suppress_output, "Running the Cuda component test simpleMultiGPU_noCuCtx.cu\n"); + + char **cuda_native_event_names = NULL; + // If command line arguments are provided then get their values. 
+ int total_event_count = 0; + if (argc > 1) { + parse_and_assign_args(argc, argv, &cuda_native_event_names, &total_event_count); } - /* PAPI Initialization must occur before any context creation/manipulation. */ - /* This is to ensure PAPI can monitor CUpti library calls. */ + // Initialize PAPI int papi_errno = PAPI_library_init( PAPI_VER_CURRENT ); if( papi_errno != PAPI_VER_CURRENT ) { - fprintf( stderr, "PAPI_library_init failed\n" ); - exit(-1); + test_fail(__FILE__, __LINE__, "PAPI_library_init()", papi_errno); } - - printf( "PAPI version: %d.%d.%d\n", PAPI_VERSION_MAJOR( PAPI_VERSION ), PAPI_VERSION_MINOR( PAPI_VERSION ), PAPI_VERSION_REVISION( PAPI_VERSION ) ); -#endif - - // Report on the available CUDA devices - int computeCapabilityMajor = 0, computeCapabilityMinor = 0; - int runtimeVersion = 0, driverVersion = 0; - char deviceName[PAPI_MIN_STR_LEN]; - CUdevice device[MAX_GPU_COUNT]; - CHECK_CUDA_ERROR( cudaGetDeviceCount( &num_gpus ) ); - if( num_gpus > MAX_GPU_COUNT ) num_gpus = MAX_GPU_COUNT; - PRINT( quiet, "CUDA-capable device count: %i\n", num_gpus ); - for ( i=0; ipartially_disabled) { + const char *cc_support = (getenv("PAPI_CUDA_API") != NULL) ? "<=7.0" : ">=7.0"; + PRINT(suppress_output, "\033[33mThe cuda component is partially disabled. 
Only support for CC's %s are enabled.\033[0m\n", cc_support); } - PRINT(quiet, "Found CUDA Component at id %d\n", cid); + check_cuda_runtime_api_call( cudaGetDeviceCount( &num_devices ) ); + TGPUplan plan[MAX_GPU_COUNT]; + float h_SumGPU[MAX_GPU_COUNT]; + int gpuBase = 0; + const int BLOCK_N = 32; + const int THREAD_N = 256; + const int ACCUM_N = BLOCK_N * THREAD_N; + // Create streams for issuing GPU command asynchronously and allocate memory (GPU and System page-locked) + int j; + int dev_idx; + for (dev_idx = 0; dev_idx < num_devices; dev_idx++) { + if (cmpInfo->partially_disabled) { + // Device is not enabled continue + if (determine_if_device_is_enabled(dev_idx) == 0) { + continue; + } + } + + check_cuda_runtime_api_call( cudaSetDevice(dev_idx) ); - papi_errno = PAPI_create_eventset(&EventSet); - if (papi_errno != PAPI_OK) { - test_fail(__FILE__, __LINE__, "PAPI_create_eventset failed.", papi_errno); - } + plan[dev_idx].dataN = DATA_N / num_devices; + // Take into account odd data sizes and increment + if (plan[dev_idx].dataN % 2) { + plan[dev_idx].dataN++; + } - papi_errno = PAPI_assign_eventset_component(EventSet, cid); - if (papi_errno != PAPI_OK) { - test_fail(__FILE__, __LINE__, "PAPI_assign_eventset_component failed.", papi_errno); - } + plan[dev_idx].h_Sum = h_SumGPU + dev_idx; // point within h_SumGPU array + gpuBase += plan[dev_idx].dataN; - // In this example measure events from each GPU - // Add events at a GPU specific level ... eg cuda:::device:2:elapsed_cycles_sm - // Similar to legacy CUpti API, we must change the contexts to the appropriate device to - // add events to inform PAPI of the context that will run the kernels. 
+ // Create an asynchronous stream + check_cuda_runtime_api_call( cudaStreamCreate( &plan[dev_idx].stream ) ); + // Allocate memory on the device + check_cuda_runtime_api_call( cudaMalloc((void **) &plan[dev_idx].d_Data, plan[dev_idx].dataN * sizeof(float)) ); + check_cuda_runtime_api_call( cudaMalloc((void **) &plan[dev_idx].d_Sum, ACCUM_N * sizeof(float)) ); + // Allocates page locked memory on the host + check_cuda_runtime_api_call( cudaMallocHost((void **) &plan[dev_idx].h_Sum_from_device, ACCUM_N * sizeof(float)) ); + check_cuda_runtime_api_call( cudaMallocHost((void **) &plan[dev_idx].h_Data, plan[dev_idx].dataN * sizeof(float)) ); - // Handle the events from the command line - int numEventsSuccessfullyAdded = 0, numMultipassEvents = 0; - char **eventsSuccessfullyAdded, **metricNames = argv + 1; - eventsSuccessfullyAdded = (char **) malloc(NUM_EVENTS * sizeof(char *)); - if (eventsSuccessfullyAdded == NULL) { - fprintf(stderr, "Failed to allocate memory for successfully added events.\n"); - test_skip(__FILE__, __LINE__, "", 0); - } - for (i = 0; i < NUM_EVENTS; i++) { - eventsSuccessfullyAdded[i] = (char *) malloc(PAPI_MAX_STR_LEN * sizeof(char)); - if (eventsSuccessfullyAdded[i] == NULL) { - fprintf(stderr, "Failed to allocate memory for command line argument.\n"); - test_skip(__FILE__, __LINE__, "", 0); + for(j = 0; j < plan[dev_idx].dataN; j++) { + plan[dev_idx].h_Data[j] = ( float ) rand() / ( float ) RAND_MAX; } } - int gpu_id; - for (gpu_id = 0; gpu_id < num_gpus; gpu_id++) { - CHECK_CUDA_ERROR(cudaSetDevice(device[gpu_id])); - add_events_from_command_line(EventSet, event_count, metricNames, gpu_id, &numEventsSuccessfullyAdded, eventsSuccessfullyAdded, &numMultipassEvents); - } + int EventSet = PAPI_NULL; + check_papi_api_call( PAPI_create_eventset(&EventSet) ); + + // Handle the events from the command line + int num_events_successfully_added = 0, numMultipassEvents = 0; + int NUM_EVENTS = MAX_GPU_COUNT * MAX_NUM_EVENTS; + char 
**events_successfully_added = (char **) malloc(NUM_EVENTS * sizeof(char *)); + check_memory_allocation_call( events_successfully_added ); + + int event_idx; + for (dev_idx = 0; dev_idx < num_devices; dev_idx++) { + if (cmpInfo->partially_disabled) { + // Device is not enabled continue + if (determine_if_device_is_enabled(dev_idx) == 0) { + continue; + } + } + + for (event_idx = 0; event_idx < total_event_count; event_idx++) { + char tmp_event_name[PAPI_MAX_STR_LEN]; + int strLen = snprintf(tmp_event_name, PAPI_MAX_STR_LEN, "%s:device=%d", cuda_native_event_names[event_idx], dev_idx); + if (strLen < 0 || strLen >= PAPI_MAX_STR_LEN) { + fprintf(stderr, "Failed to fully write event name with appended device qualifier.\n"); + exit(EXIT_FAILURE); + } + + events_successfully_added[num_events_successfully_added] = (char *) malloc(PAPI_MAX_STR_LEN * sizeof(char)); + check_memory_allocation_call( events_successfully_added[event_idx] ); + + check_cuda_runtime_api_call( cudaSetDevice(dev_idx) ); + add_cuda_native_events(EventSet, tmp_event_name, &num_events_successfully_added, events_successfully_added, &numMultipassEvents); + } + } // Only multiple pass events were provided on the command line - if (numEventsSuccessfullyAdded == 0) { + if (num_events_successfully_added == 0) { fprintf(stderr, "Events provided on the command line could not be added to an EventSet as they require multiple passes.\n"); test_skip(__FILE__, __LINE__, "", 0); } - + // Invoke PAPI_start(). - papi_errno = PAPI_start( EventSet ); - if( papi_errno != PAPI_OK ) { - test_fail(__FILE__, __LINE__, "PAPI_start failed", papi_errno); - } -#endif - - // Start timing and compute on GPU(s) - PRINT( quiet, "Computing with %d GPUs...\n", num_gpus ); + check_papi_api_call( PAPI_start(EventSet) ); + + // Start timing StartTimer(); // Copy data to GPU, launch the kernel and copy data back. 
All asynchronously - for (i = 0; i < num_gpus; i++) { - // Pushing a context implicitly sets the device for which it was created. - CHECK_CUDA_ERROR(cudaSetDevice(device[i])); + for (dev_idx = 0; dev_idx < num_devices; dev_idx++) { + if (cmpInfo->partially_disabled) { + // Device is not enabled continue + if (determine_if_device_is_enabled(dev_idx) == 0) { + continue; + } + } + // Set device to be used for GPU executions + check_cuda_runtime_api_call( cudaSetDevice(dev_idx) ); // Copy input data from CPU - CHECK_CUDA_ERROR( cudaMemcpyAsync( plan[i].d_Data, plan[i].h_Data, plan[i].dataN * sizeof( float ), cudaMemcpyHostToDevice, plan[i].stream ) ); + check_cuda_runtime_api_call( cudaMemcpyAsync( plan[dev_idx].d_Data, plan[dev_idx].h_Data, plan[dev_idx].dataN * sizeof( float ), cudaMemcpyHostToDevice, plan[dev_idx].stream ) ); // Perform GPU computations - reduceKernel <<< BLOCK_N, THREAD_N, 0, plan[i].stream >>> ( plan[i].d_Sum, plan[i].d_Data, plan[i].dataN ); - if ( cudaGetLastError() != cudaSuccess ) { printf( "reduceKernel() execution failed (GPU %d).\n", i ); exit(EXIT_FAILURE); } + reduceKernel <<< BLOCK_N, THREAD_N, 0, plan[dev_idx].stream >>> ( plan[dev_idx].d_Sum, plan[dev_idx].d_Data, plan[dev_idx].dataN ); + check_cuda_runtime_api_call( cudaGetLastError() ); // Read back GPU results - CHECK_CUDA_ERROR( cudaMemcpyAsync( plan[i].h_Sum_from_device, plan[i].d_Sum, ACCUM_N * sizeof( float ), cudaMemcpyDeviceToHost, plan[i].stream ) ); + check_cuda_runtime_api_call( cudaMemcpyAsync( plan[dev_idx].h_Sum_from_device, plan[dev_idx].d_Sum, ACCUM_N * sizeof( float ), cudaMemcpyDeviceToHost, plan[dev_idx].stream ) ); } // Process GPU results - PRINT( quiet, "Process GPU results on %d GPUs...\n", num_gpus ); - for( i = 0; i < num_gpus; i++ ) { + PRINT(suppress_output, "Process GPU results on %d GPUs...\n", num_devices); + for(dev_idx = 0; dev_idx < num_devices; dev_idx++) { + if (cmpInfo->partially_disabled) { + // Device is not enabled continue + if 
(determine_if_device_is_enabled(dev_idx) == 0) { + continue; + } + } float sum; - // Pushing a context implicitly sets the device for which it was created. - CHECK_CUDA_ERROR(cudaSetDevice(device[i])); + // Set device to be used for GPU executions + check_cuda_runtime_api_call( cudaSetDevice(dev_idx) ); // Wait for all operations to finish - cudaStreamSynchronize( plan[i].stream ); + cudaStreamSynchronize( plan[dev_idx].stream ); // Finalize GPU reduction for current subvector sum = 0; - for( j = 0; j < ACCUM_N; j++ ) { - sum += plan[i].h_Sum_from_device[j]; + for (j = 0; j < ACCUM_N; j++) { + sum += plan[dev_idx].h_Sum_from_device[j]; } - *( plan[i].h_Sum ) = ( float ) sum; + *( plan[dev_idx].h_Sum ) = ( float ) sum; } double gpuTime = GetTimer(); -#ifdef PAPI - for ( i=0; ipartially_disabled) { + // Device is not enabled continue + if (determine_if_device_is_enabled(dev_idx) == 0) { + continue; + } + } + // Set device to be used for GPU executions + check_cuda_runtime_api_call( cudaSetDevice(dev_idx) ); + check_cuda_driver_api_call( cuCtxSynchronize() ); + } + + long long cuda_counter_values[NUM_EVENTS]; + check_papi_api_call( PAPI_stop(EventSet, cuda_counter_values) ); + + for(event_idx = 0; event_idx < num_events_successfully_added; event_idx++) { + PRINT(suppress_output, "Event %s produced the value:\t\t%lld\n", events_successfully_added[event_idx], cuda_counter_values[event_idx]); } - papi_errno = PAPI_stop( EventSet, values ); // Stop (will read values). 
- if( papi_errno != PAPI_OK ) fprintf( stderr, "PAPI_stop failed\n" ); - for( i = 0; i < numEventsSuccessfullyAdded; i++ ) - PRINT( quiet, "PAPI counterValue %12lld \t\t --> %s \n", values[i], eventsSuccessfullyAdded[i] ); + check_papi_api_call( PAPI_cleanup_eventset(EventSet) ); + + check_papi_api_call( PAPI_destroy_eventset(&EventSet) ); - papi_errno = PAPI_cleanup_eventset( EventSet ); - if( papi_errno != PAPI_OK ) fprintf( stderr, "PAPI_cleanup_eventset failed\n" ); - papi_errno = PAPI_destroy_eventset( &EventSet ); - if( papi_errno != PAPI_OK ) fprintf( stderr, "PAPI_destroy_eventset failed\n" ); PAPI_shutdown(); -#endif - sumGPU = 0; - for( i = 0; i < num_gpus; i++ ) { - sumGPU += h_SumGPU[i]; + float sumGPU = 0.0; + for(dev_idx = 0; dev_idx < num_devices; dev_idx++) { + if (cmpInfo->partially_disabled) { + // Device is not enabled continue + if (determine_if_device_is_enabled(dev_idx) == 0) { + continue; + } + } + sumGPU += h_SumGPU[dev_idx]; } - PRINT( quiet, " GPU Processing time: %f (ms)\n", gpuTime ); + PRINT(suppress_output, " GPU Processing time: %f (ms)\n", gpuTime); // Compute on Host CPU - PRINT( quiet, "Computing the same result with Host CPU...\n" ); + PRINT(suppress_output, "Computing the same result with Host CPU...\n"); StartTimer(); - sumCPU = 0; - for( i = 0; i < num_gpus; i++ ) { - for( j = 0; j < plan[i].dataN; j++ ) { - sumCPU += plan[i].h_Data[j]; + double sumCPU = 0.0; + for(dev_idx = 0; dev_idx < num_devices; dev_idx++) { + if (cmpInfo->partially_disabled) { + // Device is not enabled continue + if (determine_if_device_is_enabled(dev_idx) == 0) { + continue; + } + } + + for (j = 0; j < plan[dev_idx].dataN; j++) { + sumCPU += plan[dev_idx].h_Data[j]; } } double cpuTime = GetTimer(); if (gpuTime > 0) { - PRINT( quiet, " CPU Processing time: %f (ms) (speedup %.2fX)\n", cpuTime, (cpuTime/gpuTime) ); + PRINT(suppress_output, " CPU Processing time: %f (ms) (speedup %.2fX)\n", cpuTime, (cpuTime/gpuTime)); } else { - PRINT( quiet, " CPU 
Processing time: %f (ms)\n", cpuTime); + PRINT(suppress_output, " CPU Processing time: %f (ms)\n", cpuTime); } // Compare GPU and CPU results - PRINT( quiet, "Comparing GPU and Host CPU results...\n" ); - diff = fabs( sumCPU - sumGPU ) / fabs( sumCPU ); - PRINT( quiet, " GPU sum: %f\n CPU sum: %f\n", sumGPU, sumCPU ); - PRINT( quiet, " Relative difference: %E \n", diff ); + PRINT(suppress_output, "Comparing GPU and Host CPU results...\n"); + double diff = fabs( sumCPU - sumGPU ) / fabs( sumCPU ); + PRINT(suppress_output, " GPU sum: %f\n CPU sum: %f\n", sumGPU, sumCPU); + PRINT(suppress_output, " Relative difference: %E \n", diff); + + // Output a note that a multiple pass event was provided on the command line + if (numMultipassEvents > 0) { + PRINT(suppress_output, "\033[0;33mNOTE: From the events provided on the command line, an event or events requiring multiple passes was detected and not added to the EventSet. Check your events with utils/papi_native_avail.\n\033[0m"); + } // Cleanup and shutdown - for( i = 0; i < num_gpus; i++ ) { - CHECK_CUDA_ERROR( cudaFreeHost( plan[i].h_Sum_from_device ) ); - CHECK_CUDA_ERROR( cudaFreeHost( plan[i].h_Data ) ); - CHECK_CUDA_ERROR( cudaFree( plan[i].d_Sum ) ); - CHECK_CUDA_ERROR( cudaFree( plan[i].d_Data ) ); - // Shut down this GPU - CHECK_CUDA_ERROR( cudaStreamDestroy( plan[i].stream ) ); + for (dev_idx = 0; dev_idx < num_devices; dev_idx++) { + if (cmpInfo->partially_disabled) { + // Device is not enabled continue + if (determine_if_device_is_enabled(dev_idx) == 0) { + continue; + } + } + // Free page-locked memory + check_cuda_runtime_api_call( cudaFreeHost(plan[dev_idx].h_Sum_from_device) ); + check_cuda_runtime_api_call( cudaFreeHost(plan[dev_idx].h_Data) ); + // Free memory on the device + check_cuda_runtime_api_call( cudaFree(plan[dev_idx].d_Sum) ); + check_cuda_runtime_api_call( cudaFree(plan[dev_idx].d_Data) ); + // Destroys and cleans up asynchronous stream + check_cuda_runtime_api_call( 
cudaStreamDestroy(plan[dev_idx].stream) ); } //Free allocated memory - for (i = 0; i < event_count; i++) { - free(eventsSuccessfullyAdded[i]); + for (event_idx = 0; event_idx < total_event_count; event_idx++) { + free(cuda_native_event_names[event_idx]); } - free(eventsSuccessfullyAdded); + free(cuda_native_event_names); -#ifdef PAPI - // Output a note that a multiple pass event was provided on the command line - if (numMultipassEvents > 0) { - PRINT(quiet, "\033[0;33mNOTE: From the events provided on the command line, an event or events requiring multiple passes was detected and not added to the EventSet. Check your events with utils/papi_native_avail.\n\033[0m"); + for (event_idx = 0; event_idx < num_events_successfully_added; event_idx++) { + free(events_successfully_added[event_idx]); } + free(events_successfully_added); - if ( diff < 1e-5 ) + if (diff < 1e-5) { test_pass(__FILE__); - else + } + else { test_fail(__FILE__, __LINE__, "Result of GPU calculation doesn't match CPU.", PAPI_EINVAL); -#endif + } + return 0; } diff --git a/src/components/cuda/tests/test_2thr_1gpu_not_allowed.cu b/src/components/cuda/tests/test_2thr_1gpu_not_allowed.cu index 62d32f250..df0e7a4c1 100644 --- a/src/components/cuda/tests/test_2thr_1gpu_not_allowed.cu +++ b/src/components/cuda/tests/test_2thr_1gpu_not_allowed.cu @@ -1,57 +1,35 @@ /** - * @file test_2thr_1gpu_not_allowed.cu - * @author Anustuv Pal - * anustuv@icl.utk.edu - */ +* @file test_2thr_1gpu_not_allowed.cu +* @brief Verify that we do not allow multiple threads on a single device. PAPI_ECNFLCT +* should be returned if this occurs. +* +* Note: The cuda component supports being partially disabled, meaning that certain devices +* will not be "enabled" to profile on. If PAPI_CUDA_API is not set, then devices with +* CC's >= 7.0 will be used and if PAPI_CUDA_API is set to LEGACY then devices with +* CC's <= 7.0 will be used. 
+*/ +// Standard library headers #include #include #include #include -#include "gpu_work.h" -#ifdef PAPI -#include -#include - -#define PAPI_CALL(apiFuncCall) \ -do { \ - int _status = apiFuncCall; \ - if (_status != PAPI_OK) { \ - fprintf(stderr, "error: function %s failed.", #apiFuncCall); \ - test_fail(__FILE__, __LINE__, "", _status); \ - } \ -} while (0) -#endif +// Cuda Toolkit headers +#include -#define PRINT(quiet, format, args...) {if (!quiet) {fprintf(stderr, format, ## args);}} -int quiet; - -#define RUNTIME_API_CALL(apiFuncCall) \ -do { \ - cudaError_t _status = apiFuncCall; \ - if (_status != cudaSuccess) { \ - fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \ - __FILE__, __LINE__, #apiFuncCall, cudaGetErrorString(_status));\ - exit(EXIT_FAILURE); \ - } \ -} while (0) - -#define DRIVER_API_CALL(apiFuncCall) \ -do { \ - CUresult _status = apiFuncCall; \ - if (_status != CUDA_SUCCESS) { \ - fprintf(stderr, "%s:%d: error: function %s failed with error %d.\n", \ - __FILE__, __LINE__, #apiFuncCall, _status); \ - exit(EXIT_FAILURE); \ - } \ -} while (0) +// Internal headers +#include "cuda_tests_helper.h" +#include "gpu_work.h" +#include "papi.h" +#include "papi_test.h" #define NUM_THREADS 2 -int numGPUs; -int g_event_count; -char **g_evt_names; +int global_suppress_output; +int global_num_devices; +int global_total_event_count; +char **global_cuda_native_event_names = NULL; typedef struct pthread_params_s { pthread_t tid; @@ -60,141 +38,318 @@ typedef struct pthread_params_s { int retval; } pthread_params_t; +static void print_help_message(void) +{ + printf("./test_2thr_1gpu_not_allowed --device [nvidia device index] --cuda-native-event-names [list of cuda native event names separated by a comma].\n" + "Notes:\n" + "1. 
Must provide exactly two native event names on the command line with matching device qualifiers.\n"); +} + +static void parse_and_assign_args(int argc, char *argv[], int *device_index, char ***cuda_native_event_names, int *total_event_count) +{ + int num_device_indices = 0, *event_device_indices = NULL; + int i, device_arg_found = 0, cuda_native_event_name_arg_found = 0; + for (i = 1; i < argc; ++i) + { + char *arg = argv[i]; + if (strcmp(arg, "--help") == 0) + { + print_help_message(); + exit(EXIT_SUCCESS); + } + else if (strcmp(arg, "--device") == 0) + { + if (!argv[i + 1]) + { + printf("ERROR!! Add a nvidia device index.\n"); + exit(EXIT_FAILURE); + } + *device_index = atoi(argv[i + 1]); + device_arg_found++; + i++; + } + else if (strcmp(arg, "--cuda-native-event-names") == 0) + { + if (!argv[i + 1]) + { + printf("ERROR!! --cuda-native-event-names given, but no events listed.\n"); + exit(EXIT_FAILURE); + } + + char **cmd_line_native_event_names = NULL; + const char *cuda_native_event_name = strtok(argv[i+1], ","); + while (cuda_native_event_name != NULL) + { + const char *device_substring = strstr(cuda_native_event_name, ":device="); + if (device_substring != NULL) { + event_device_indices = (int *) realloc(event_device_indices, (num_device_indices + 1) * sizeof(int)); + event_device_indices[num_device_indices++] = atoi(device_substring + strlen(":device=")); + } + + cmd_line_native_event_names = (char **) realloc(cmd_line_native_event_names, ((*total_event_count) + 1) * sizeof(char *)); + check_memory_allocation_call(cmd_line_native_event_names); + + cmd_line_native_event_names[(*total_event_count)] = (char *) malloc(PAPI_2MAX_STR_LEN * sizeof(char)); + check_memory_allocation_call(cmd_line_native_event_names[(*total_event_count)]); + + int strLen = snprintf(cmd_line_native_event_names[(*total_event_count)], PAPI_2MAX_STR_LEN, "%s", cuda_native_event_name); + if (strLen < 0 || strLen >= PAPI_2MAX_STR_LEN) { + fprintf(stderr, "Failed to fully write cuda native 
event name.\n"); + exit(EXIT_FAILURE); + } + + (*total_event_count)++; + cuda_native_event_name_arg_found++; + cuda_native_event_name = strtok(NULL, ","); + } + i++; + *cuda_native_event_names = cmd_line_native_event_names; + } + else + { + print_help_message(); + exit(EXIT_FAILURE); + } + } + + if (device_arg_found == 0 || cuda_native_event_name_arg_found == 0) { + fprintf(stderr, "You must use both the --device arg and --cuda-native-event-names arg in conjunction.\n"); + exit(EXIT_FAILURE); + } + + for (i = 0; i < num_device_indices; i++) { + if ((*device_index) != event_device_indices[i]) { + fprintf(stderr, "The device qualifier index %d does not match the index %d provided by --device.\n", event_device_indices[i], *device_index); + exit(EXIT_FAILURE); + } + } + free(event_device_indices); +} + void *thread_gpu(void * ptinfo) { pthread_params_t *tinfo = (pthread_params_t *) ptinfo; - int idx = tinfo->idx; - int gpuid = idx % numGPUs; + int thread_idx = tinfo->idx; unsigned long gettid = (unsigned long) pthread_self(); - DRIVER_API_CALL(cuCtxSetCurrent(tinfo->cuCtx)); - PRINT(quiet, "This is idx %d thread %lu - using GPU %d context %p!\n", - idx, gettid, gpuid, tinfo->cuCtx); + check_cuda_driver_api_call( cuCtxSetCurrent(tinfo->cuCtx) ); + + CUdevice deviceId; + check_cuda_driver_api_call( cuCtxGetDevice(&deviceId) ); + PRINT(global_suppress_output, "Attempting to run on thread %d (%lu) with device %d.\n", thread_idx, gettid, deviceId); -#ifdef PAPI - int papi_errno; int EventSet = PAPI_NULL; - long long values[1]; - PAPI_CALL(PAPI_create_eventset(&EventSet)); + check_papi_api_call( PAPI_create_eventset(&EventSet) ); - papi_errno = PAPI_add_named_event(EventSet, g_evt_names[idx]); + int papi_errno = PAPI_add_named_event(EventSet, global_cuda_native_event_names[thread_idx]); if (papi_errno != PAPI_OK) { if (papi_errno == PAPI_EMULPASS) { - fprintf(stderr, "Event %s requires multiple passes and cannot be added to an EventSet. 
Two single pass events are needed for this test see utils/papi_native_avail for more Cuda native events.\n", g_evt_names[idx]); + fprintf(stderr, "Event %s requires multiple passes and cannot be added to an EventSet. Two single pass events are needed for this test see utils/papi_native_avail for more Cuda native events.\n", global_cuda_native_event_names[thread_idx]); test_skip(__FILE__, __LINE__, "", 0); } else { - fprintf(stderr, "Unable to add event %s to the EventSet with error code %d.\n", g_evt_names[idx], papi_errno); + fprintf(stderr, "Unable to add event %s to the EventSet with error code %d.\n", global_cuda_native_event_names[thread_idx], papi_errno); test_skip(__FILE__, __LINE__, "", 0); } } papi_errno = PAPI_start(EventSet); if (papi_errno == PAPI_ECNFLCT) { - PRINT(quiet, "Thread %d was not allowed to start profiling on same GPU.\n", tinfo->idx); + PRINT(global_suppress_output, "\033[0;32mThread %d was not allowed to start profiling on the same GPU.\n\n\033[0m", thread_idx); tinfo->retval = papi_errno; return NULL; } -#endif - VectorAddSubtract(5000000*(idx+1), quiet); // gpu work + VectorAddSubtract(5000000 * (thread_idx + 1), global_suppress_output); // gpu work -#ifdef PAPI - PAPI_CALL(PAPI_stop(EventSet, values)); + long long cuda_counter_value; + check_papi_api_call( PAPI_stop(EventSet, &cuda_counter_value) ); - PRINT(quiet, "User measured values in thread id %d.\n", idx); - PRINT(quiet, "%s\t\t%lld\n", g_evt_names[idx], values[0]); + PRINT(global_suppress_output, "User measured values in thread id %d.\n", thread_idx); + PRINT(global_suppress_output, "%s\t\t%lld\n\n", global_cuda_native_event_names[thread_idx], cuda_counter_value); tinfo->retval = PAPI_OK; - PAPI_CALL(PAPI_cleanup_eventset(EventSet)); - PAPI_CALL(PAPI_destroy_eventset(&EventSet)); -#endif + check_papi_api_call( PAPI_cleanup_eventset(EventSet) ); + + check_papi_api_call( PAPI_destroy_eventset(&EventSet) ); + return NULL; } int main(int argc, char **argv) { - quiet = 0; -#ifdef PAPI 
- char *test_quiet = getenv("PAPI_CUDA_TEST_QUIET"); - if (test_quiet) - quiet = (int) strtol(test_quiet, (char**) NULL, 10); - g_event_count = argc - 1; - /* if no events passed at command line, just report test skipped. */ - if (g_event_count == 0) { - fprintf(stderr, "No eventnames specified at command line.\n"); - test_skip(__FILE__, __LINE__, "", 0); - } - else if (g_event_count != 2) { - fprintf(stderr, "Two single pass events are needed for this test to run properly.\n"); - test_skip(__FILE__, __LINE__, "", 0); - } - - g_evt_names = argv + 1; -#endif - int rc, i; - pthread_params_t data[NUM_THREADS]; + // Determine the number of Cuda capable devices + global_num_devices = 0; + check_cuda_runtime_api_call( cudaGetDeviceCount(&global_num_devices) ); + // No devices detected on the machine, exit + if (global_num_devices < 1) { + fprintf(stderr, "No NVIDIA devices found on the machine. This is required for the test to run.\n"); + exit(EXIT_FAILURE); + } + + global_suppress_output = 0; + char *user_defined_suppress_output = getenv("PAPI_CUDA_TEST_QUIET"); + if (user_defined_suppress_output) { + global_suppress_output = (int) strtol(user_defined_suppress_output, (char**) NULL, 10); + } + PRINT(global_suppress_output, "Running the cuda component test test_2thr_1gpu_not_allowed.cu\n"); - RUNTIME_API_CALL(cudaGetDeviceCount(&numGPUs)); - PRINT(quiet, "No. of GPUs = %d\n", numGPUs); - PRINT(quiet, "No. of threads to launch = %d\n", NUM_THREADS); + int cuda_device_index = -1; + // If command line arguments are provided then get their values. 
+ global_total_event_count = 0; + if (argc > 1) { + parse_and_assign_args(argc, argv, &cuda_device_index, &global_cuda_native_event_names, &global_total_event_count); + if (global_total_event_count != 2) { + fprintf(stderr, "Must provide two single pass Cuda native events on the command line for this test to run properoly.\n"); + test_skip(__FILE__, __LINE__, "", 0); + } + } -#ifdef PAPI - int papi_errno = PAPI_library_init( PAPI_VER_CURRENT ); + // Initialize the PAPI library + int papi_errno = PAPI_library_init(PAPI_VER_CURRENT); if( papi_errno != PAPI_VER_CURRENT ) { - test_fail(__FILE__, __LINE__, "PAPI_library_init failed.", 0); + test_fail(__FILE__, __LINE__, "PAPI_library_init()", papi_errno); } - // Point PAPI to function that gets the thread id - PAPI_CALL(PAPI_thread_init((unsigned long (*)(void)) pthread_self)); -#endif + PRINT(global_suppress_output, "PAPI version being used for this test: %d.%d.%d\n", + PAPI_VERSION_MAJOR(PAPI_VERSION), + PAPI_VERSION_MINOR(PAPI_VERSION), + PAPI_VERSION_REVISION(PAPI_VERSION)); + + // Verify the cuda component has been compiled in + int cuda_cmp_idx = PAPI_get_component_index("cuda"); + if (cuda_cmp_idx < 0 ) { + test_fail(__FILE__, __LINE__, "PAPI_get_component_index()", cuda_cmp_idx); + } + PRINT(global_suppress_output, "The cuda component is assigned to component index: %d\n", cuda_cmp_idx); + + // No events were provided on the command line + if (global_total_event_count == 0) { + int num_spaces_to_allocate = 2; + global_cuda_native_event_names = (char **) malloc(num_spaces_to_allocate * sizeof(char *)); + check_memory_allocation_call(global_cuda_native_event_names); + + int modifier = PAPI_ENUM_FIRST; + int cuda_eventcode = 0 | PAPI_NATIVE_MASK; + // Enumerate until we get two Cuda native events + while (PAPI_enum_cmp_event(&cuda_eventcode, modifier, cuda_cmp_idx) == PAPI_OK && global_total_event_count < num_spaces_to_allocate) { + global_cuda_native_event_names[global_total_event_count] = (char *) 
malloc(PAPI_2MAX_STR_LEN * sizeof(char)); + check_memory_allocation_call( global_cuda_native_event_names[global_total_event_count] ); + + // Convert the first cuda native event code to a name, the name will + // be in the format of cuda:::basename with no qualifiers appended. + char basename[PAPI_MAX_STR_LEN]; + check_papi_api_call( PAPI_event_code_to_name(cuda_eventcode, basename) ); + + // Begin reconstructing the Cuda native event name with qualifiers + int strLen = snprintf(global_cuda_native_event_names[global_total_event_count], PAPI_2MAX_STR_LEN, "%s", basename); + if (strLen < 0 || strLen >= PAPI_MAX_STR_LEN) { + fprintf(stderr, "Failed to fully write event name."); + exit(EXIT_FAILURE); + } + + // Enumerate through the available default qualifiers. + // The Legacy API only has the device qualifiers + // while the Perfworks Metrics API has a stat and device + // qualifier. + modifier = PAPI_NTV_ENUM_UMASKS; + check_papi_api_call( PAPI_enum_cmp_event(&cuda_eventcode, modifier, cuda_cmp_idx) ); + + do { + PAPI_event_info_t info; + papi_errno = PAPI_get_event_info(cuda_eventcode, &info); + check_papi_api_call( PAPI_get_event_info(cuda_eventcode, &info) ); + + char *qualifier = strstr(info.symbol + strlen("cuda:::"), ":"); + if (strncmp(qualifier, ":device=", 8) == 0) { + cuda_device_index = strtol(qualifier + strlen(":device="), NULL, 10); + } + + int strLen = snprintf(global_cuda_native_event_names[global_total_event_count] + strlen(global_cuda_native_event_names[global_total_event_count]), PAPI_2MAX_STR_LEN - strlen(global_cuda_native_event_names[global_total_event_count]), "%s", qualifier); + if (strLen < 0 || strLen >= PAPI_2MAX_STR_LEN - strlen(global_cuda_native_event_names[global_total_event_count])) { + fprintf(stderr, "Unable to construct cuda native event name.\n"); + exit(EXIT_FAILURE); + } + + } while (PAPI_enum_cmp_event(&cuda_eventcode, modifier, cuda_cmp_idx) == PAPI_OK); + + global_total_event_count++; + // Change modifier for the outer loop + 
modifier = PAPI_ENUM_EVENTS; + } + + // Safety net, this should never be triggered + if (cuda_device_index == -1) { + fprintf(stderr, "A device qualifier is needed to continue or a device index must be provided on the command line.\n"); + exit(EXIT_FAILURE); + } + + } + + // Initialize PAPI thread support + check_papi_api_call( PAPI_thread_init((unsigned long (*)(void)) pthread_self) ); + // Launch the threads - for(i = 0; i < NUM_THREADS; i++) + pthread_params_t data[NUM_THREADS]; + int thread_idx, thread_errno; + for(thread_idx = 0; thread_idx < NUM_THREADS; thread_idx++) { - data[i].idx = i; + data[thread_idx].idx = thread_idx; int flags = 0; - CUdevice device = 0; + CUdevice device = cuda_device_index; #if defined(CUDA_TOOLKIT_GE_13) - DRIVER_API_CALL( cuCtxCreate(&(data[i].cuCtx), (CUctxCreateParams*)0, flags, device) ); + check_cuda_driver_api_call( cuCtxCreate(&(data[thread_idx].cuCtx), (CUctxCreateParams*)0, flags, device) ); #else - DRIVER_API_CALL( cuCtxCreate(&(data[i].cuCtx), flags, device) ); + check_cuda_driver_api_call( cuCtxCreate(&(data[thread_idx].cuCtx), flags, device) ); #endif - DRIVER_API_CALL(cuCtxPopCurrent(&(data[i].cuCtx))); + check_cuda_driver_api_call( cuCtxPopCurrent(&(data[thread_idx].cuCtx)) ); - rc = pthread_create(&data[i].tid, NULL, thread_gpu, &(data[i])); - if(rc) - { - fprintf(stderr, "\n ERROR: return code from pthread_create is %d \n", rc); - exit(1); + thread_errno = pthread_create(&data[thread_idx].tid, NULL, thread_gpu, &(data[thread_idx])); + if(thread_errno != 0) { + fprintf(stderr, "Call to pthread_create failed for thread %d with error code %d.\n", thread_idx, thread_errno); + exit(EXIT_FAILURE); } - PRINT(quiet, "\n Main thread %lu. Created new thread (%lu) in iteration %d ...\n", - (unsigned long)pthread_self(), (unsigned long) data[i].tid, i); } // Join all threads when complete - for (i=0; i= 7.0 will be used and if PAPI_CUDA_API is set to LEGACY then devices with +* CC's <= 7.0 will be used. 
+*/ +// Standard library headers #include -#include "gpu_work.h" +#include -#ifdef PAPI -#include +// Internal headers +#include "cuda_tests_helper.h" +#include "gpu_work.h" +#include "papi.h" #include "papi_test.h" -#endif -#define COMP_NAME "cuda" #define MAX_EVENT_COUNT (32) -#define PRINT(quiet, format, args...) {if (!quiet) {fprintf(stderr, format, ## args);}} -int quiet; +int suppress_output; + +static void print_help_message(void) +{ + printf("./test_multi_read_and_reset --device [nvidia device index] --cuda-native-event-names [list of cuda native event names separated by a comma].\n" + "Notes:\n" + "1. The device index must match the device qualifier if provided.\n"); +} + +static void parse_and_assign_args(int argc, char *argv[], int *device_index, char ***cuda_native_event_names, int *total_event_count) +{ + int num_device_indices = 0, *event_device_indices = NULL; + int i, device_arg_found = 0, cuda_native_event_name_arg_found = 0; + for (i = 1; i < argc; ++i) + { + char *arg = argv[i]; + if (strcmp(arg, "--help") == 0) + { + print_help_message(); + exit(EXIT_SUCCESS); + } + else if (strcmp(arg, "--device") == 0) + { + if (!argv[i + 1]) + { + printf("ERROR!! Add a nvidia device index.\n"); + exit(EXIT_FAILURE); + } + *device_index = atoi(argv[i + 1]); + device_arg_found++; + i++; + } + else if (strcmp(arg, "--cuda-native-event-names") == 0) + { + if (!argv[i + 1]) + { + printf("ERROR!! 
--cuda-native-event-names given, but no events listed.\n"); + exit(EXIT_FAILURE); + } + + char **cmd_line_native_event_names = NULL; + const char *cuda_native_event_name = strtok(argv[i+1], ","); + while (cuda_native_event_name != NULL) + { + const char *device_substring = strstr(cuda_native_event_name, ":device="); + if (device_substring != NULL) { + event_device_indices = (int *) realloc(event_device_indices, (num_device_indices + 1) * sizeof(int)); + event_device_indices[num_device_indices++] = atoi(device_substring + strlen(":device=")); + } + + cmd_line_native_event_names = (char **) realloc(cmd_line_native_event_names, ((*total_event_count) + 1) * sizeof(char *)); + check_memory_allocation_call(cmd_line_native_event_names); + + cmd_line_native_event_names[(*total_event_count)] = (char *) malloc(PAPI_2MAX_STR_LEN * sizeof(char)); + check_memory_allocation_call(cmd_line_native_event_names[(*total_event_count)]); + + int strLen = snprintf(cmd_line_native_event_names[(*total_event_count)], PAPI_2MAX_STR_LEN, "%s", cuda_native_event_name); + if (strLen < 0 || strLen >= PAPI_2MAX_STR_LEN) { + fprintf(stderr, "Failed to fully write cuda native event name.\n"); + exit(EXIT_FAILURE); + } + + (*total_event_count)++; + cuda_native_event_name_arg_found++; + cuda_native_event_name = strtok(NULL, ","); + } + i++; + *cuda_native_event_names = cmd_line_native_event_names; + } + else + { + print_help_message(); + exit(EXIT_FAILURE); + } + } + + if (device_arg_found == 0 || cuda_native_event_name_arg_found == 0) { + fprintf(stderr, "You must use both the --device arg and --cuda-native-event-names arg in conjunction.\n"); + exit(EXIT_FAILURE); + } + + for (i = 0; i < num_device_indices; i++) { + if ((*device_index) != event_device_indices[i]) { + fprintf(stderr, "The device qualifier index %d does not match the index %d provided by --device.\n", event_device_indices[i], *device_index); + exit(EXIT_FAILURE); + } + } + + free(event_device_indices); +} int approx_equal(long long 
v1, long long v2) { @@ -26,407 +122,288 @@ int approx_equal(long long v1, long long v2) } // Globals for successfully added and multiple pass events -int numEventsSuccessfullyAdded = 0, numMultipassEvents = 0; - -/** @class add_events_from_command_line - * @brief Try and add each event provided on the command line by the user. - * - * @param EventSet - * A PAPI eventset. - * @param totalEventCount - * Number of events from the command line. - * @param **eventNamesFromCommandLine - * Events provided on the command line. - * @param *numEventsSuccessfullyAdded - * Total number of successfully added events. - * @param **eventsSuccessfullyAdded - * Events that we are able to add to the EventSet. - * @param *numMultipassEvents - * Counter to see if a multiple pass event was provided on the command line. -*/ -static void add_events_from_command_line(int EventSet, int totalEventCount, char **eventNamesFromCommandLine, int *numEventsSuccessfullyAdded, char **eventsSuccessfullyAdded, int *numMultipassEvents) -{ - int i; - for (i = 0; i < totalEventCount; i++) { - int strLen; - int papi_errno = PAPI_add_named_event(EventSet, eventNamesFromCommandLine[i]); - if (papi_errno != PAPI_OK) { - if (papi_errno != PAPI_EMULPASS) { - fprintf(stderr, "Unable to add event %s to the EventSet with error code %d.\n", eventNamesFromCommandLine[i], papi_errno); - test_skip(__FILE__, __LINE__, "", 0); - } - - // Handle multiple pass events - (*numMultipassEvents)++; - continue; - } - - // Handle successfully added events - strLen = snprintf(eventsSuccessfullyAdded[(*numEventsSuccessfullyAdded)], PAPI_MAX_STR_LEN, "%s", eventNamesFromCommandLine[i]); - if (strLen < 0 || strLen >= PAPI_MAX_STR_LEN) { - fprintf(stderr, "Failed to fully write successfully added event.\n"); - test_skip(__FILE__, __LINE__, "", 0); - } - (*numEventsSuccessfullyAdded)++; - } - - return; -} +int global_num_events_successfully_added = 0, global_num_multipass_events = 0; -void multi_reset(int event_count, char 
**evt_names, long long *values) +void multi_reset(int total_event_count, char **cuda_native_event_names, long long *cuda_counter_values, int cuda_device_index) { CUcontext ctx; - int papi_errno, i; - - CUresult cuError; int flags = 0; - CUdevice device = 0; + CUdevice device = cuda_device_index; #if defined(CUDA_TOOLKIT_GE_13) - cuError = cuCtxCreate(&ctx, (CUctxCreateParams*)0, flags, device); - if (cuError != CUDA_SUCCESS) { - fprintf(stderr, "Failed to create Cuda context for a Cuda Toolkit version >= 13: %d\n", cuError); - exit(1); - } + check_cuda_driver_api_call( cuCtxCreate(&ctx, (CUctxCreateParams*)0, flags, device) ); #else - cuError = cuCtxCreate(&ctx, flags, device); - if (cuError != CUDA_SUCCESS) { - fprintf(stderr, "Failed to create Cuda context for a Cuda Toolkit version < 13: %d\n", cuError); - exit(1); - } + check_cuda_driver_api_call( cuCtxCreate(&ctx, flags, device) ); #endif -#ifdef PAPI int EventSet = PAPI_NULL; - int j; - papi_errno = PAPI_create_eventset(&EventSet); - if (papi_errno != PAPI_OK) { - test_fail(__FILE__, __LINE__, "Failed to create eventset.", papi_errno); - } + check_papi_api_call( PAPI_create_eventset(&EventSet) ); // Handle the events from the command line - numEventsSuccessfullyAdded = 0; - numMultipassEvents = 0; - char **eventsSuccessfullyAdded; - eventsSuccessfullyAdded = (char **) malloc(event_count * sizeof(char *)); - if (eventsSuccessfullyAdded == NULL) { - fprintf(stderr, "Failed to allocate memory for successfully added events.\n"); - test_skip(__FILE__, __LINE__, "", 0); - } - for (i = 0; i < event_count; i++) { - eventsSuccessfullyAdded[i] = (char *) malloc(PAPI_MAX_STR_LEN * sizeof(char)); - if (eventsSuccessfullyAdded[i] == NULL) { - fprintf(stderr, "Failed to allocate memory for command line argument.\n"); - test_skip(__FILE__, __LINE__, "", 0); - } + global_num_events_successfully_added = 0; + global_num_multipass_events = 0; + char **events_successfully_added = (char **) malloc(total_event_count * sizeof(char 
*)); + check_memory_allocation_call(events_successfully_added); + + int event_idx; + for (event_idx = 0; event_idx < total_event_count; event_idx++) { + events_successfully_added[event_idx] = (char *) malloc(PAPI_MAX_STR_LEN * sizeof(char)); + check_memory_allocation_call(events_successfully_added[event_idx]); + + add_cuda_native_events(EventSet, cuda_native_event_names[event_idx], &global_num_events_successfully_added, events_successfully_added, &global_num_multipass_events); } - add_events_from_command_line(EventSet, event_count, evt_names, &numEventsSuccessfullyAdded, eventsSuccessfullyAdded, &numMultipassEvents); // Only multiple pass events were provided on the command line - if (numEventsSuccessfullyAdded == 0) { + if (global_num_events_successfully_added == 0) { fprintf(stderr, "Events provided on the command line could not be added to an EventSet as they require multiple passes.\n"); test_skip(__FILE__, __LINE__, "", 0); } - papi_errno = PAPI_start(EventSet); - if (papi_errno != PAPI_OK) { - test_fail(__FILE__, __LINE__, "PAPI_start error.", papi_errno); - } -#endif + check_papi_api_call( PAPI_start(EventSet) ); - for (i=0; i<10; i++) { - VectorAddSubtract(100000, quiet); -#ifdef PAPI - papi_errno = PAPI_read(EventSet, values); - if (papi_errno != PAPI_OK) { - test_fail(__FILE__, __LINE__, "PAPI_read error.", papi_errno); - } - PRINT(quiet, "Measured values iter %d\n", i); - for (j=0; j < numEventsSuccessfullyAdded; j++) { - PRINT(quiet, "%s\t\t%lld\n", eventsSuccessfullyAdded[j], values[j]); - } - papi_errno = PAPI_reset(EventSet); - if (papi_errno != PAPI_OK) { - test_fail(__FILE__, __LINE__, "PAPI_reset error.", papi_errno); + int iter; + for (iter = 0; iter < 10; iter++) { + VectorAddSubtract(100000, suppress_output); + + check_papi_api_call( PAPI_read(EventSet, cuda_counter_values) ); + + for (event_idx = 0; event_idx < global_num_events_successfully_added; event_idx++) { + PRINT(suppress_output, "Event %s for iter %d produced the value:\t\t%lld\n", 
events_successfully_added[event_idx], iter, cuda_counter_values[event_idx]); } -#endif - } -#ifdef PAPI - papi_errno = PAPI_stop(EventSet, values); - if (papi_errno != PAPI_OK) { - test_fail(__FILE__, __LINE__, "PAPI_stop error.", papi_errno); - } - papi_errno = PAPI_cleanup_eventset(EventSet); - if (papi_errno != PAPI_OK) { - test_fail(__FILE__, __LINE__, "PAPI_cleanup_eventset error.", papi_errno); + check_papi_api_call( PAPI_reset(EventSet) ); } - papi_errno = PAPI_destroy_eventset(&EventSet); - if (papi_errno != PAPI_OK) { - test_fail(__FILE__, __LINE__, "PAPI_destroy_eventset error.", papi_errno); - } -#endif - papi_errno = cuCtxDestroy(ctx); - if (papi_errno != CUDA_SUCCESS) { - fprintf(stderr, "cude error: failed to destroy context.\n"); - exit(1); - } + check_papi_api_call( PAPI_stop(EventSet, cuda_counter_values) ); + + check_papi_api_call( PAPI_cleanup_eventset(EventSet) ); + + check_papi_api_call( PAPI_destroy_eventset(&EventSet) ); + + check_cuda_driver_api_call( cuCtxDestroy(ctx) ); // Free allocated memory - for (i = 0; i < event_count; i++) { - free(eventsSuccessfullyAdded[i]); + for (event_idx = 0; event_idx < total_event_count; event_idx++) { + free(events_successfully_added[event_idx]); } - free(eventsSuccessfullyAdded); + free(events_successfully_added); } -void multi_read(int event_count, char **evt_names, long long *values) +void multi_read(int total_event_count, char **cuda_native_event_names, long long *cuda_counter_values, int cuda_device_index) { CUcontext ctx; - int papi_errno, i; - - CUresult cuError; int flags = 0; - CUdevice device = 0; + CUdevice device = cuda_device_index; #if defined(CUDA_TOOLKIT_GE_13) - cuError = cuCtxCreate(&ctx, (CUctxCreateParams*)0, flags, device); - if (cuError != CUDA_SUCCESS) { - fprintf(stderr, "Failed to create Cuda context for a Cuda Toolkit version >= 13: %d\n", cuError); - exit(1); - } + check_cuda_driver_api_call( cuCtxCreate(&ctx, (CUctxCreateParams*)0, flags, device) ); #else - cuError = 
cuCtxCreate(&ctx, flags, device); - if (cuError != CUDA_SUCCESS) { - fprintf(stderr, "Failed to create Cuda context for a Cuda Toolkit version < 13: %d\n", cuError); - exit(1); - } + check_cuda_driver_api_call( cuCtxCreate(&ctx, flags, device) ); #endif -#ifdef PAPI - int EventSet = PAPI_NULL, j; - papi_errno = PAPI_create_eventset(&EventSet); - if (papi_errno != PAPI_OK) { - test_fail(__FILE__, __LINE__, "Failed to create eventset.", papi_errno); - } + int EventSet = PAPI_NULL; + check_papi_api_call( PAPI_create_eventset(&EventSet) ); // Handle the events from the command line - numEventsSuccessfullyAdded = 0; - numMultipassEvents = 0; - char **eventsSuccessfullyAdded; - eventsSuccessfullyAdded = (char **) malloc(event_count * sizeof(char *)); - if (eventsSuccessfullyAdded == NULL) { - fprintf(stderr, "Failed to allocate memory for successfully added events.\n"); - test_skip(__FILE__, __LINE__, "", 0); - } - for (i = 0; i < event_count; i++) { - eventsSuccessfullyAdded[i] = (char *) malloc(PAPI_MAX_STR_LEN * sizeof(char)); - if (eventsSuccessfullyAdded[i] == NULL) { - fprintf(stderr, "Failed to allocate memory for command line argument.\n"); - test_skip(__FILE__, __LINE__, "", 0); - } + global_num_events_successfully_added = 0; + global_num_multipass_events = 0; + char **events_successfully_added = (char **) malloc(total_event_count * sizeof(char *)); + check_memory_allocation_call(events_successfully_added); + + int event_idx; + for (event_idx = 0; event_idx < total_event_count; event_idx++) { + events_successfully_added[event_idx] = (char *) malloc(PAPI_MAX_STR_LEN * sizeof(char)); + check_memory_allocation_call(events_successfully_added[event_idx]); + + add_cuda_native_events(EventSet, cuda_native_event_names[event_idx], &global_num_events_successfully_added, events_successfully_added, &global_num_multipass_events); } - add_events_from_command_line(EventSet, event_count, evt_names, &numEventsSuccessfullyAdded, eventsSuccessfullyAdded, &numMultipassEvents); // 
Only multiple pass events were provided on the command line - if (numEventsSuccessfullyAdded == 0) { + if (global_num_events_successfully_added == 0) { fprintf(stderr, "Events provided on the command line could not be added to an EventSet as they require multiple passes.\n"); test_skip(__FILE__, __LINE__, "", 0); } - papi_errno = PAPI_start(EventSet); - if (papi_errno != PAPI_OK) { - test_fail(__FILE__, __LINE__, "PAPI_start error.", papi_errno); - } -#endif - for (i=0; i<10; i++) { - VectorAddSubtract(100000, quiet); -#ifdef PAPI - papi_errno = PAPI_read(EventSet, values); - if (papi_errno != PAPI_OK) { - test_fail(__FILE__, __LINE__, "PAPI_start error.", papi_errno); - } - PRINT(quiet, "Measured values iter %d\n", i); - for (j=0; j < numEventsSuccessfullyAdded; j++) { - PRINT(quiet, "%s\t\t%lld\n", eventsSuccessfullyAdded[j], values[j]); + check_papi_api_call( PAPI_start(EventSet) ); + + int iter; + for (iter = 0; iter < 10; iter++) { + VectorAddSubtract(100000, suppress_output); + + check_papi_api_call( PAPI_read(EventSet, cuda_counter_values) ); + + for (event_idx = 0; event_idx < global_num_events_successfully_added; event_idx++) { + PRINT(suppress_output, "Event %s for iter %d produced the value:\t\t%lld\n", events_successfully_added[event_idx], iter, cuda_counter_values[event_idx]); } } - papi_errno = PAPI_stop(EventSet, values); - if (papi_errno != PAPI_OK) { - test_fail(__FILE__, __LINE__, "PAPI_stop error.", papi_errno); - } - papi_errno = PAPI_cleanup_eventset(EventSet); - if (papi_errno != PAPI_OK) { - test_fail(__FILE__, __LINE__, "PAPI_cleanup_eventset error.", papi_errno); - } - papi_errno = PAPI_destroy_eventset(&EventSet); - if (papi_errno != PAPI_OK) { - test_fail(__FILE__, __LINE__, "PAPI_destroy_eventset error.", papi_errno); -#endif - } - papi_errno = cuCtxDestroy(ctx); - if (papi_errno != CUDA_SUCCESS) { - fprintf(stderr, "cude error: failed to destroy context.\n"); - exit(1); - } + check_papi_api_call( PAPI_stop(EventSet, cuda_counter_values) 
); + + check_papi_api_call( PAPI_cleanup_eventset(EventSet) ); + + check_papi_api_call( PAPI_destroy_eventset(&EventSet) ); + + check_cuda_driver_api_call( cuCtxDestroy(ctx) ); // Free allocated memory - for (i = 0; i < event_count; i++) { - free(eventsSuccessfullyAdded[i]); + for (event_idx = 0; event_idx < total_event_count; event_idx++) { + free(events_successfully_added[event_idx]); } - free(eventsSuccessfullyAdded); + free(events_successfully_added); } -void single_read(int event_count, char **evt_names, long long *values, char ***addedEvents) +void single_read(int total_event_count, char **cuda_native_event_names, long long *cuda_counter_values, char ***addedEvents, int cuda_device_index) { - int papi_errno, i; CUcontext ctx; - - CUresult cuError; int flags = 0; - CUdevice device = 0; + CUdevice device = cuda_device_index; #if defined(CUDA_TOOLKIT_GE_13) - cuError = cuCtxCreate(&ctx, (CUctxCreateParams*)0, flags, device); - if (cuError != CUDA_SUCCESS) { - fprintf(stderr, "Failed to create Cuda context for a Cuda Toolkit version >= 13: %d\n", cuError); - exit(1); - } + check_cuda_driver_api_call( cuCtxCreate(&ctx, (CUctxCreateParams*)0, flags, device) ); #else - cuError = cuCtxCreate(&ctx, flags, device); - if (cuError != CUDA_SUCCESS) { - fprintf(stderr, "Failed to create Cuda context for a Cuda Toolkit version < 13: %d\n", cuError); - exit(1); - } + check_cuda_driver_api_call( cuCtxCreate(&ctx, flags, device) ); #endif -#ifdef PAPI - int EventSet = PAPI_NULL, j; - papi_errno = PAPI_create_eventset(&EventSet); - if (papi_errno != PAPI_OK) { - test_fail(__FILE__, __LINE__, "Failed to create eventset.", papi_errno); - } + int EventSet = PAPI_NULL; + check_papi_api_call( PAPI_create_eventset(&EventSet) ); // Handle the events from the command line - numEventsSuccessfullyAdded = 0; - numMultipassEvents = 0; - char **eventsSuccessfullyAdded; - eventsSuccessfullyAdded = (char **) malloc(event_count * sizeof(char *)); - if (eventsSuccessfullyAdded == NULL) { - 
fprintf(stderr, "Failed to allocate memory for successfully added events.\n"); - test_skip(__FILE__, __LINE__, "", 0); - } - for (i = 0; i < event_count; i++) { - eventsSuccessfullyAdded[i] = (char *) malloc(PAPI_MAX_STR_LEN * sizeof(char)); - if (eventsSuccessfullyAdded[i] == NULL) { - fprintf(stderr, "Failed to allocate memory for command line argument.\n"); - test_skip(__FILE__, __LINE__, "", 0); - } + global_num_events_successfully_added = 0; + global_num_multipass_events = 0; + char **events_successfully_added = (char **) malloc(total_event_count * sizeof(char *)); + check_memory_allocation_call(events_successfully_added); + + int event_idx; + for (event_idx = 0; event_idx < total_event_count; event_idx++) { + events_successfully_added[event_idx] = (char *) malloc(PAPI_MAX_STR_LEN * sizeof(char)); + check_memory_allocation_call(events_successfully_added[event_idx]); + add_cuda_native_events(EventSet, cuda_native_event_names[event_idx], &global_num_events_successfully_added, events_successfully_added, &global_num_multipass_events); } - add_events_from_command_line(EventSet, event_count, evt_names, &numEventsSuccessfullyAdded, eventsSuccessfullyAdded, &numMultipassEvents); // Only multiple pass events were provided on the command line - if (numEventsSuccessfullyAdded == 0) { + if (global_num_events_successfully_added == 0) { fprintf(stderr, "Events provided on the command line could not be added to an EventSet as they require multiple passes.\n"); test_skip(__FILE__, __LINE__, "", 0); } - papi_errno = PAPI_start(EventSet); - if (papi_errno != PAPI_OK) { - test_fail(__FILE__, __LINE__, "PAPI_start error.", papi_errno); - } -#endif - for (i=0; i<10; i++) { - VectorAddSubtract(100000, quiet); - } -#ifdef PAPI - papi_errno = PAPI_stop(EventSet, values); - if (papi_errno != PAPI_OK) { - test_fail(__FILE__, __LINE__, "PAPI_stop error.", papi_errno); - } - PRINT(quiet, "Measured values from single read\n"); - for (j=0; j < numEventsSuccessfullyAdded; j++) { - 
PRINT(quiet, "%s\t\t%lld\n", eventsSuccessfullyAdded[j], values[j]); - } - papi_errno = PAPI_cleanup_eventset(EventSet); - if (papi_errno != PAPI_OK) { - test_fail(__FILE__, __LINE__, "PAPI_cleanup_eventset error.", papi_errno); - } - papi_errno = PAPI_destroy_eventset(&EventSet); - if (papi_errno != PAPI_OK) { - test_fail(__FILE__, __LINE__, "PAPI_destroy_eventset error.", papi_errno); + check_papi_api_call( PAPI_start(EventSet) ); + + int iter; + for (iter = 0; iter < 10; iter++) { + VectorAddSubtract(100000, suppress_output); } -#endif - papi_errno = cuCtxDestroy(ctx); - if (papi_errno != CUDA_SUCCESS) { - fprintf(stderr, "cuda error: failed to destroy cuda context.\n"); - exit(1); + + check_papi_api_call( PAPI_stop(EventSet, cuda_counter_values) ); + + for (event_idx = 0; event_idx < global_num_events_successfully_added; event_idx++) { + PRINT(suppress_output, "Event %s for a single read produced the value:\t\t%lld\n", events_successfully_added[event_idx], cuda_counter_values[event_idx]); } - *addedEvents = eventsSuccessfullyAdded; -} + check_papi_api_call( PAPI_cleanup_eventset(EventSet) ); -int main(int argc, char **argv) -{ - cuInit(0); + check_papi_api_call( PAPI_destroy_eventset(&EventSet) ); - quiet = 0; -#ifdef PAPI - int papi_errno; - char *test_quiet = getenv("PAPI_CUDA_TEST_QUIET"); - if (test_quiet) - quiet = (int) strtol(test_quiet, (char**) NULL, 10); + check_cuda_driver_api_call( cuCtxDestroy(ctx) ); - int event_count = argc - 1; + *addedEvents = events_successfully_added; +} - /* if no events passed at command line, just report test skipped. 
*/ - if (event_count == 0) { - fprintf(stderr, "No eventnames specified at command line.\n"); - test_skip(__FILE__, __LINE__, "", 0); - } - papi_errno = PAPI_library_init(PAPI_VER_CURRENT); +int main(int argc, char **argv) +{ + check_cuda_driver_api_call( cuInit(0) ); + + // Determine the number of Cuda capable devices + int num_devices = 0; + check_cuda_runtime_api_call( cudaGetDeviceCount(&num_devices) ); + // No devices detected on the machine, exit + if (num_devices < 1) { + fprintf(stderr, "No NVIDIA devices found on the machine. This is required for the test to run.\n"); + exit(EXIT_FAILURE); + } + + suppress_output = 0; + char *user_defined_suppress_output = getenv("PAPI_CUDA_TEST_QUIET"); + if (user_defined_suppress_output) { + suppress_output = (int) strtol(user_defined_suppress_output, (char**) NULL, 10); + } + PRINT(suppress_output, "Running the cuda component test test_multi_read_and_reset.cu\n"); + + int cuda_device_index = -1; + char **cuda_native_event_names = NULL; + // If command line arguments are provided then get their values. 
+ int total_event_count = 0; + if (argc > 1) { + parse_and_assign_args(argc, argv, &cuda_device_index, &cuda_native_event_names, &total_event_count); + } + + // Initialize the PAPI library + int papi_errno = PAPI_library_init(PAPI_VER_CURRENT); if (papi_errno != PAPI_VER_CURRENT) { - test_fail(__FILE__, __LINE__, "Failed to initialize PAPI.", 0); + test_fail(__FILE__, __LINE__, "PAPI_library_init()", papi_errno); + } + PRINT(suppress_output, "PAPI version being used for this test: %d.%d.%d\n", + PAPI_VERSION_MAJOR(PAPI_VERSION), + PAPI_VERSION_MINOR(PAPI_VERSION), + PAPI_VERSION_REVISION(PAPI_VERSION)); + + // Verify the cuda component has been compiled in + int cuda_cmp_idx = PAPI_get_component_index("cuda"); + if (cuda_cmp_idx < 0) { + test_fail(__FILE__, __LINE__, "PAPI_get_component_index()", cuda_cmp_idx); + } + PRINT(suppress_output, "The cuda component is assigned to component index: %d\n", cuda_cmp_idx); + + // If a user does not provide an event or events, then we go get an event to add + if (total_event_count == 0) { + enumerate_and_store_cuda_native_events(&cuda_native_event_names, &total_event_count, &cuda_device_index); + } + + PRINT(suppress_output, "Running Multi Reset\n"); + PRINT(suppress_output, "----------------------------------------\n"); + long long cuda_counter_values_multi_reset[MAX_EVENT_COUNT]; + multi_reset(total_event_count, cuda_native_event_names, cuda_counter_values_multi_reset, cuda_device_index); + PRINT(suppress_output, "----------------------------------------\n"); + + PRINT(suppress_output, "\nRunning Multi Read\n"); + PRINT(suppress_output, "----------------------------------------\n"); + long long cuda_counter_values_multi_read[MAX_EVENT_COUNT]; + multi_read(total_event_count, cuda_native_event_names, cuda_counter_values_multi_read, cuda_device_index); + PRINT(suppress_output, "----------------------------------------\n"); + + PRINT(suppress_output, "\nRunning Single Read\n"); + PRINT(suppress_output, 
"----------------------------------------\n"); + long long cuda_counter_values_single_read[MAX_EVENT_COUNT]; + char **events_successfully_added = { 0 }; + single_read(total_event_count, cuda_native_event_names, cuda_counter_values_single_read, &events_successfully_added, cuda_device_index); + PRINT(suppress_output, "----------------------------------------\n"); + + int event_idx; + PRINT(suppress_output, "\nFinal Measured Cuda Counter Values\n"); + PRINT(suppress_output, "----------------------------------------\n"); + PRINT(suppress_output, "Event Name\t\t\t\t\t\tMulti Read\tSingle Read\n"); + for (event_idx = 0; event_idx < global_num_events_successfully_added; event_idx++) { + PRINT(suppress_output, "%s\t\t\t%lld\t\t%lld\n", events_successfully_added[event_idx], cuda_counter_values_multi_read[event_idx], cuda_counter_values_single_read[event_idx]); + if ( !approx_equal(cuda_counter_values_multi_read[event_idx], cuda_counter_values_single_read[event_idx]) ) + printf("\033[33mWARNING: Multi read and single read do not match for %s\033[0m\n", events_successfully_added[event_idx]); } - papi_errno = PAPI_get_component_index(COMP_NAME); - if (papi_errno < 0) { - test_fail(__FILE__, __LINE__, "Failed to get index of cuda component.", PAPI_ECMP); - } - long long values_multi_reset[MAX_EVENT_COUNT]; - long long values_multi_read[MAX_EVENT_COUNT]; - long long values_single_read[MAX_EVENT_COUNT]; - - PRINT(quiet, "Running multi_reset.\n"); - multi_reset(event_count, argv + 1, values_multi_reset); - PRINT(quiet, "\nRunning multi_read.\n"); - multi_read(event_count, argv + 1, values_multi_read); - PRINT(quiet, "\nRunning single_read.\n"); - char **eventsSuccessfullyAdded = { 0 }; - single_read(event_count, argv + 1, values_single_read, &eventsSuccessfullyAdded); - - int i; - PRINT(quiet, "Final measured values\nEvent_name\t\t\t\t\t\tMulti_read\tsingle_read\n"); - for (i=0; i < numEventsSuccessfullyAdded; i++) { - PRINT(quiet, "%s\t\t\t%lld\t\t%lld\n", 
eventsSuccessfullyAdded[i], values_multi_read[i], values_single_read[i]); - if ( !approx_equal(values_multi_read[i], values_single_read[i]) ) - test_warn(__FILE__, __LINE__, "Measured values from multi read and single read don't match.", PAPI_OK); + + // Output a note that a multiple pass event was provided on the command line + if (global_num_multipass_events > 0) { + PRINT(suppress_output, "\033[0;33mNOTE: From the events provided on the command line, an event or events requiring multiple passes was detected and not added to the EventSet. Check your events with utils/papi_native_avail.\n\033[0m"); } // Free allocated memory - for (i = 0; i < event_count; i++) { - free(eventsSuccessfullyAdded[i]); + for (event_idx = 0; event_idx < total_event_count; event_idx++) { + free(cuda_native_event_names[event_idx]); } - free(eventsSuccessfullyAdded); - - PAPI_shutdown(); + free(cuda_native_event_names); - // Output a note that a multiple pass event was provided on the command line - if (numMultipassEvents > 0) { - PRINT(quiet, "\033[0;33mNOTE: From the events provided on the command line, an event or events requiring multiple passes was detected and not added to the EventSet. 
Check your events with utils/papi_native_avail.\n\033[0m"); + for (event_idx = 0; event_idx < global_num_events_successfully_added; event_idx++) { + free(events_successfully_added[event_idx]); } + free(events_successfully_added); + + PAPI_shutdown(); test_pass(__FILE__); -#else - fprintf(stderr, "Please compile with -DPAPI to test this feature.\n"); -#endif + return 0; } diff --git a/src/components/cuda/tests/test_multipass_event_fail.cu b/src/components/cuda/tests/test_multipass_event_fail.cu index c2decf089..12fad108f 100644 --- a/src/components/cuda/tests/test_multipass_event_fail.cu +++ b/src/components/cuda/tests/test_multipass_event_fail.cu @@ -1,182 +1,189 @@ /** - * @file test_multipass_event_fail.cu - * @author Anustuv Pal - * anustuv@icl.utk.edu - */ - +* @file test_multipass_event_fail.cu +* @brief Test to see if a cuda native event requires multiple passes to profile. +* If it does PAPI_EMULPASS will be returned. +* +* Note: The cuda component supports being partially disabled, meaning that certain devices +* will not be "enabled" to profile on. If PAPI_CUDA_API is not set, then devices with +* CC's >= 7.0 will be used and if PAPI_CUDA_API is set to LEGACY then devices with +* CC's <= 7.0 will be used. +*/ + +// Standard library headers #include +#include -#ifdef PAPI +// Internal headers +#include "cuda_tests_helper.h" #include "papi.h" #include "papi_test.h" -#define PASS 1 -#define FAIL 0 -#define MAX_EVENT_COUNT (32) -#define PRINT(quiet, format, args...) 
{if (!quiet) {fprintf(stderr, format, ## args);}} -int quiet; - -int test_PAPI_add_named_event(int *EventSet, int numEvents, char **EventName) { - int i, papi_errno; - PRINT(quiet, "LOG: %s: Entering.\n", __func__); - for (i=0; i= PAPI_2MAX_STR_LEN) { + fprintf(stderr, "Failed to fully write event name %s.\n", cuda_native_event_name); + exit(EXIT_FAILURE); + } + + (*total_event_count)++; + cuda_native_event_name = strtok(NULL, ","); + } + i++; + *cuda_native_event_names = cmd_line_native_event_names; + + } + else + { + print_help_message(); + exit(EXIT_FAILURE); + } + + } } -#endif int main(int argc, char **argv) { -#ifdef PAPI - int papi_errno, pass; - int event_set; - - quiet = 0; - char *test_quiet = getenv("PAPI_CUDA_TEST_QUIET"); - if (test_quiet) - quiet = (int) strtol(test_quiet, (char**) NULL, 10); - - int event_count = argc - 1; - - /* if no events passed at command line, just report test skipped. */ - if (event_count == 0) { - fprintf(stderr, "No eventnames specified at command line.\n"); - test_skip(__FILE__, __LINE__, "", 0); - } - - papi_errno = PAPI_library_init( PAPI_VER_CURRENT ); + // Determine the number of Cuda capable devices + int num_devices = 0; + check_cuda_runtime_api_call( cudaGetDeviceCount(&num_devices) ); + // No devices detected on the machine, exit + if (num_devices < 1) { + fprintf(stderr, "No NVIDIA devices found on the machine. 
This is required for the test to run.\n"); + exit(EXIT_FAILURE); + } + + int suppress_output = 0; + char *user_defined_suppress_output = getenv("PAPI_CUDA_TEST_QUIET"); + if (user_defined_suppress_output) { + suppress_output = (int) strtol(user_defined_suppress_output, (char**) NULL, 10); + } + PRINT(suppress_output, "Running the cuda component test test_multipass_event_fail.cu\n"); + + char **cuda_native_event_names = NULL; + // See if a metric was passed on the command line + int total_event_count = 0; + if (argc > 1) { + parse_and_assign_args(argc, argv, &cuda_native_event_names, &total_event_count); + } + + // Initialize the PAPI library + int papi_errno = PAPI_library_init(PAPI_VER_CURRENT); if (papi_errno != PAPI_VER_CURRENT) { - test_fail(__FILE__, __LINE__, "PAPI_library_init() failed", 0); - } - - papi_errno = PAPI_get_component_index("cuda"); - if (papi_errno < 0 ) { - test_fail(__FILE__, __LINE__, "CUDA component not configured", 0); - } - - event_set = PAPI_NULL; - papi_errno = PAPI_create_eventset( &event_set ); - if (papi_errno != PAPI_OK) { - test_fail(__FILE__, __LINE__, "PAPI_create_eventset() failed!", 0); - } - - // Keep track of the number of events from the command line we can actually add - // This is done to properly check the test in the function test_PAPI_add_events - int numEventsSuccessfullyAdded = 0; - pass = test_PAPI_add_event(&event_set, argc-1, argv+1, &numEventsSuccessfullyAdded); - papi_errno = PAPI_cleanup_eventset(event_set); - if (papi_errno != PAPI_OK) { - test_fail(__FILE__, __LINE__, "PAPI_cleanup_eventset() failed!", 0); - } - - papi_errno = PAPI_destroy_eventset(&event_set); - if (papi_errno != PAPI_OK) { - test_fail(__FILE__, __LINE__, "PAPI_destroy_eventset() failed!", 0); - } - - event_set = PAPI_NULL; - papi_errno = PAPI_create_eventset( &event_set ); - if (papi_errno != PAPI_OK) { - test_fail(__FILE__, __LINE__, "PAPI_create_eventset() failed!", 0); - } + test_fail(__FILE__, __LINE__, "PAPI_library_init()", 
papi_errno); + } + PRINT(suppress_output, "PAPI version being used for this test: %d.%d.%d\n", + PAPI_VERSION_MAJOR(PAPI_VERSION), + PAPI_VERSION_MINOR(PAPI_VERSION), + PAPI_VERSION_REVISION(PAPI_VERSION)); + + // Verify the cuda component has been compiled in + int cuda_cmp_idx = PAPI_get_component_index("cuda"); + if (cuda_cmp_idx < 0 ) { + test_fail(__FILE__, __LINE__, "PAPI_get_component_index()", cuda_cmp_idx); + } + PRINT(suppress_output, "The cuda component is assigned to component index: %d\n", cuda_cmp_idx); + + int EventSet = PAPI_NULL; + check_papi_api_call( PAPI_create_eventset(&EventSet) ); + + // An event has been added on the command line. + int event_idx; + if (total_event_count > 0) { + for (event_idx = 0; event_idx < total_event_count; event_idx++) { + // First check if the cuda native event even requires multiple passes + papi_errno = PAPI_add_named_event(EventSet, cuda_native_event_names[event_idx]); + if (papi_errno != PAPI_OK && papi_errno != PAPI_EMULPASS) { + test_fail(__FILE__, __LINE__, "PAPI_add_named_event()", papi_errno); + } + else if (papi_errno == PAPI_EMULPASS) { + PRINT(suppress_output, "%s requires multiple passes.\n", cuda_native_event_names[event_idx]); + } + else { + PRINT(suppress_output, "%s does not require multiple passes.\n", cuda_native_event_names[event_idx]); - pass += test_PAPI_add_named_event(&event_set, argc-1, argv+1); - papi_errno = PAPI_cleanup_eventset(event_set); - if (papi_errno != PAPI_OK) { - test_fail(__FILE__, __LINE__, "PAPI_cleanup_eventset() failed!", 0); + check_papi_api_call( PAPI_remove_named_event(EventSet, cuda_native_event_names[event_idx]) ); + } + } } - - papi_errno = PAPI_destroy_eventset(&event_set); - if (papi_errno != PAPI_OK) { - test_fail(__FILE__, __LINE__, "PAPI_destroy_eventset() failed!", 0); + // No event has been added on the command line. 
+ else { + int modifier = PAPI_ENUM_FIRST; + int cuda_eventcode = 0 | PAPI_NATIVE_MASK; + check_papi_api_call( PAPI_enum_cmp_event(&cuda_eventcode, modifier, cuda_cmp_idx); ); + + int multipass_event_found = 0; + modifier = PAPI_ENUM_EVENTS; + do { + char cuda_eventname[PAPI_2MAX_STR_LEN]; + check_papi_api_call( PAPI_event_code_to_name(cuda_eventcode, cuda_eventname) ); + + papi_errno = PAPI_add_named_event(EventSet, cuda_eventname); + if (papi_errno != PAPI_OK && papi_errno != PAPI_EMULPASS) { + test_fail(__FILE__, __LINE__, "PAPI_add_named_event()", papi_errno); + } + else if (papi_errno == PAPI_EMULPASS) { + multipass_event_found++; + PRINT(suppress_output, "%s requires multiple passes.\n", cuda_eventname); + } + else { + check_papi_api_call( PAPI_remove_named_event(EventSet, cuda_eventname) ); + } + + } while (PAPI_enum_cmp_event(&cuda_eventcode, modifier, cuda_cmp_idx) == PAPI_OK && multipass_event_found == 0); + + if (multipass_event_found == 0) { + PRINT(suppress_output, "\033[33mNo multipass event found for this architecture. 
Verify that this indeed holds true.\033[0m\n"); + } } - event_set = PAPI_NULL; - papi_errno = PAPI_create_eventset( &event_set ); - if (papi_errno != PAPI_OK) { - test_fail(__FILE__, __LINE__, "PAPI_create_eventset() failed!", 0); - } + check_papi_api_call( PAPI_destroy_eventset(&EventSet) ); - pass += test_PAPI_add_events(&event_set, argc-1, argv+1, numEventsSuccessfullyAdded); - papi_errno = PAPI_cleanup_eventset(event_set); - if (papi_errno != PAPI_OK) { - test_fail(__FILE__, __LINE__, "PAPI_cleanup_eventset() failed!", 0); + // Free allocated memory + if (cuda_native_event_names != NULL) { + for (event_idx = 0; event_idx < total_event_count; event_idx++) { + free(cuda_native_event_names[event_idx]); + } + free(cuda_native_event_names); } - papi_errno = PAPI_destroy_eventset(&event_set); - if (papi_errno != PAPI_OK) { - test_fail(__FILE__, __LINE__, "PAPI_destroy_eventset() failed!", 0); - } + PAPI_shutdown(); - if (pass != 3) - test_fail(__FILE__, __LINE__, "CUDA framework multipass event test failed.", 0); - else - test_pass(__FILE__); + test_pass(__FILE__); - PAPI_shutdown(); -#else - fprintf(stderr, "Please compile with -DPAPI to test this feature.\n"); -#endif return 0; }