Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions src/components/amd_smi/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -107,3 +107,10 @@ After changing `PAPI_AMDSMI_ROOT` or related library paths, rerun make clobber &

## Hardware and Software Support
To see the `amd_smi` component's current supported hardware and software please visit the GitHub wiki page [Hardware and Software Support - AMD\_SMI Component](https://github.com/icl-utk-edu/papi/wiki/Hardware-and-Software-Support-%E2%80%90-AMD_SMI-Component).

## Known Limitations

* For AMD devices older than the AMD Instinct MI300A, PAPI should not be configured with both `rocm_smi` and `amd_smi`.
If both components are configured, then `rocm_smi` will be active by default for ROCm < 6.4.0; `amd_smi` will be active by default for ROCm >= 6.4.0.
Users can override this when running an application by setting `export PAPI_DISABLE_COMPONENTS=rocm_smi` when `rocm_smi` is active by default, or
`export PAPI_DISABLE_COMPONENTS=amd_smi` when `amd_smi` is active by default.
40 changes: 34 additions & 6 deletions src/components/amd_smi/linux-amd-smi.c
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,40 @@ static int _amd_smi_init_component(int cidx) {
_amd_smi_vector.cmp_info.num_mpx_cntrs = -1;
_amd_smi_lock = PAPI_NUM_LOCK + NUM_INNER_LOCK + cidx;

CHECK_SNPRINTF(_amd_smi_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN,
"Not initialized. Access an AMD SMI event to initialize.");
_amd_smi_vector.cmp_info.disabled = PAPI_EDELAY_INIT;
/* Manage contension between rocm_smi and amd_smi components. */
int use_amd_smi = 0;
#if defined(DEFAULT_TO_AMD_SMI)
use_amd_smi = 1;
#endif
#if defined(DEFAULT_TO_ROCM_SMI)
char *disabledComps = getenv("PAPI_DISABLE_COMPONENTS");
if (disabledComps != NULL) {
char *penv = strdup(disabledComps);
char *p;
for (p = strtok (penv, ",:"); p != NULL; p = strtok (NULL, ",:")) {
if(!strcmp(p, "rocm_smi")) use_amd_smi = 1;
}
free(penv);
} else {
SUBDBG("amd_smi: getenv(PAPI_DISABLE_COMPONENTS) was not set.\n");
}
#endif

int papi_errno;
if (use_amd_smi) {
CHECK_SNPRINTF(_amd_smi_vector.cmp_info.disabled_reason, PAPI_HUGE_STR_LEN,
"Not initialized. Access an AMD SMI event to initialize.");
papi_errno = PAPI_EDELAY_INIT;
_amd_smi_vector.cmp_info.disabled = papi_errno;
return papi_errno;
} else {
CHECK_SNPRINTF(_amd_smi_vector.cmp_info.disabled_reason, PAPI_HUGE_STR_LEN,
"Not active while rocm_smi component is active. Set 'export PAPI_DISABLE_COMPONENTS=rocm_smi' to override.");
papi_errno = PAPI_ECOMBO;
_amd_smi_vector.cmp_info.disabled = papi_errno;
return papi_errno;
}

return PAPI_EDELAY_INIT;
}

static int evt_get_count(int *count) {
Expand Down Expand Up @@ -93,8 +122,7 @@ static int _amd_smi_init_private(void) {
if (!error_str || !error_str[0])
error_str = "AMD SMI component initialization failed";
CHECK_SNPRINTF(_amd_smi_vector.cmp_info.disabled_reason,
sizeof _amd_smi_vector.cmp_info.disabled_reason, "%s",
error_str);
PAPI_HUGE_STR_LEN, "%s", error_str);
goto fn_fail;
}

Expand Down
3 changes: 3 additions & 0 deletions src/components/rocm/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@ setting the ROCP\_TOOL\_LIB to the PAPI library as follows:
Please instead use the [`rocp_sdk`](https://github.com/icl-utk-edu/papi/blob/master/src/components/rocp_sdk/README.md) component.

* For AMD devices older than the AMD Instinct MI300A, PAPI should not be configured with both `rocm` and `rocp_sdk`.
If both components are configured, then `rocm` will be active by default for ROCm < 6.3.2; `rocp_sdk` will be active by default for ROCm >= 6.3.2.
Users can override this when running an application by setting `export PAPI_DISABLE_COMPONENTS=rocm` when `rocm` is active by default, or
`export PAPI_DISABLE_COMPONENTS=rocp_sdk` when `rocp_sdk` is active by default.

* For ROCm >= 6.2.0, the environment variable `AQLPROFILE_READ_API` should be set to 0 for intercept mode and 1 (or unset) for sampling mode.
Otherwise, counter values in intercept mode will return 0. See PAPI Issue #457 for more details.
Expand Down
69 changes: 51 additions & 18 deletions src/components/rocm/rocm.c
Original file line number Diff line number Diff line change
Expand Up @@ -133,25 +133,58 @@ rocm_init_component(int cid)
_rocm_lock = PAPI_NUM_LOCK + NUM_INNER_LOCK + cid;
SUBDBG("ENTER: cid: %d\n", cid);

int papi_errno = rocd_init_environment();
if (papi_errno != PAPI_OK) {
_rocm_vector.cmp_info.initialized = 1;
/* Manage contension between rocm and rocp_sdk components. */
int use_rocm = 0;
#if defined(DEFAULT_TO_ROCM)
use_rocm = 1;
#endif
#if defined(DEFAULT_TO_ROCP_SDK)
char *disabledComps = getenv("PAPI_DISABLE_COMPONENTS");
if (disabledComps != NULL) {
char *penv = strdup(disabledComps);
char *p;
for (p = strtok (penv, ",:"); p != NULL; p = strtok (NULL, ",:")) {
if(!strcmp(p, "rocp_sdk")) use_rocm = 1;
}
free(penv);
} else {
SUBDBG("rocm: getenv(PAPI_DISABLE_COMPONENTS) was not set.\n");
}
#endif

int papi_errno, expect;
if (use_rocm) {
papi_errno = rocd_init_environment();
if (papi_errno != PAPI_OK) {
_rocm_vector.cmp_info.initialized = 1;
_rocm_vector.cmp_info.disabled = papi_errno;
const char *err_string;
rocd_err_get_last(&err_string);
expect = snprintf(_rocm_vector.cmp_info.disabled_reason,
PAPI_HUGE_STR_LEN, "%s", err_string);
if (expect < 0 || expect >= PAPI_HUGE_STR_LEN) {
SUBDBG("disabled_reason truncated");
}
goto fn_fail;
}

expect = snprintf(_rocm_vector.cmp_info.disabled_reason, PAPI_HUGE_STR_LEN, "%s",
"Not initialized. Access component events to initialize it.");
if (expect < 0 || expect >= PAPI_HUGE_STR_LEN) {
SUBDBG("disabled_reason truncated");
}
papi_errno = PAPI_EDELAY_INIT;
_rocm_vector.cmp_info.disabled = papi_errno;
const char *err_string;
rocd_err_get_last(&err_string);
int expect = snprintf(_rocm_vector.cmp_info.disabled_reason,
PAPI_MAX_STR_LEN, "%s", err_string);
if (expect > PAPI_MAX_STR_LEN) {
} else {
expect = snprintf(_rocm_vector.cmp_info.disabled_reason, PAPI_HUGE_STR_LEN, "%s",
"Not active while rocp_sdk component is active. Set 'export PAPI_DISABLE_COMPONENTS=rocp_sdk' to override.");
if (expect < 0 || expect >= PAPI_HUGE_STR_LEN) {
SUBDBG("disabled_reason truncated");
}
goto fn_fail;
papi_errno = PAPI_ECOMBO;
_rocm_vector.cmp_info.disabled = papi_errno;
}

sprintf(_rocm_vector.cmp_info.disabled_reason,
"Not initialized. Access component events to initialize it.");
papi_errno = PAPI_EDELAY_INIT;
_rocm_vector.cmp_info.disabled = papi_errno;

fn_exit:
SUBDBG("EXIT: %s\n", PAPI_strerror(papi_errno));
return papi_errno;
Expand Down Expand Up @@ -209,8 +242,8 @@ rocm_init_private(void)
const char *err_string;
rocd_err_get_last(&err_string);
int expect = snprintf(_rocm_vector.cmp_info.disabled_reason,
PAPI_MAX_STR_LEN, "%s", err_string);
if (expect > PAPI_MAX_STR_LEN) {
PAPI_HUGE_STR_LEN, "%s", err_string);
if (expect > PAPI_HUGE_STR_LEN) {
SUBDBG("disabled_reason truncated");
}

Expand All @@ -222,8 +255,8 @@ rocm_init_private(void)
_rocm_vector.cmp_info.num_native_events = count;
_rocm_vector.cmp_info.num_cntrs = count;
_rocm_vector.cmp_info.initialized = 1;
int strLen = snprintf(_rocm_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN, "%s", "");
if (strLen < 0 || strLen >= PAPI_MAX_STR_LEN) {
int strLen = snprintf(_rocm_vector.cmp_info.disabled_reason, PAPI_HUGE_STR_LEN, "%s", "");
if (strLen < 0 || strLen >= PAPI_HUGE_STR_LEN) {
SUBDBG("Failed to fully write disabled_reason.\n");
}

Expand Down
5 changes: 5 additions & 0 deletions src/components/rocm_smi/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ In both cases, the directory specified by `PAPI_ROCMSMI_ROOT` **must contain** t

## Known Limitations

* For AMD devices older than the AMD Instinct MI300A, PAPI should not be configured with both `rocm_smi` and `amd_smi`.
If both components are configured, then `rocm_smi` will be active by default for ROCm < 6.4.0; `amd_smi` will be active by default for ROCm >= 6.4.0.
Users can override this when running an application by setting `export PAPI_DISABLE_COMPONENTS=rocm_smi` when `rocm_smi` is active by default, or
`export PAPI_DISABLE_COMPONENTS=amd_smi` when `amd_smi` is active by default.

* Only sets of metrics and events that can be gathered in a single pass are supported.

* Although AMD metrics may be floating point, all values are recast and returned as long long integers.
Expand Down
55 changes: 49 additions & 6 deletions src/components/rocm_smi/linux-rocm-smi.c
Original file line number Diff line number Diff line change
Expand Up @@ -68,11 +68,45 @@ _rocm_smi_init_component(int cidx)
_rocm_smi_vector.cmp_info.num_cntrs = -1;
_rocm_smi_lock = PAPI_NUM_LOCK + NUM_INNER_LOCK + cidx;

sprintf(_rocm_smi_vector.cmp_info.disabled_reason,
"Not initialized. Access component events to initialize it.");
_rocm_smi_vector.cmp_info.disabled = PAPI_EDELAY_INIT;

return PAPI_EDELAY_INIT;
/* Manage contension between rocm_smi and amd_smi components. */
int use_rocm_smi = 0;
#if defined(DEFAULT_TO_ROCM_SMI)
use_rocm_smi = 1;
#endif
#if defined(DEFAULT_TO_AMD_SMI)
char *disabledComps = getenv("PAPI_DISABLE_COMPONENTS");
if (disabledComps != NULL) {
char *penv = strdup(disabledComps);
char *p;
for (p = strtok (penv, ",:"); p != NULL; p = strtok (NULL, ",:")) {
if(!strcmp(p, "amd_smi")) use_rocm_smi = 1;
}
free(penv);
} else {
SUBDBG("rocm_smi: getenv(PAPI_DISABLE_COMPONENTS) was not set.\n");
}
#endif

int papi_errno, expect;
if (use_rocm_smi) {
expect = snprintf(_rocm_smi_vector.cmp_info.disabled_reason, PAPI_HUGE_STR_LEN, "%s",
"Not initialized. Access component events to initialize it.");
if (expect < 0 || expect >= PAPI_HUGE_STR_LEN) {
SUBDBG("disabled_reason truncated");
}
papi_errno = PAPI_EDELAY_INIT;
_rocm_smi_vector.cmp_info.disabled = papi_errno;
return papi_errno;
} else {
expect = snprintf(_rocm_smi_vector.cmp_info.disabled_reason, PAPI_HUGE_STR_LEN, "%s",
"Not active while amd_smi component is active. Set 'export PAPI_DISABLE_COMPONENTS=amd_smi' to override.");
if (expect < 0 || expect >= PAPI_HUGE_STR_LEN) {
SUBDBG("disabled_reason truncated");
}
papi_errno = PAPI_ECOMBO;
_rocm_smi_vector.cmp_info.disabled = papi_errno;
return papi_errno;
}
}

static int
Expand Down Expand Up @@ -108,7 +142,10 @@ _rocm_smi_init_private(void)
_rocm_smi_vector.cmp_info.disabled = papi_errno;
const char *error_str;
rocs_err_get_last(&error_str);
sprintf(_rocm_smi_vector.cmp_info.disabled_reason, "%s", error_str);
int expect = snprintf(_rocm_smi_vector.cmp_info.disabled_reason, PAPI_HUGE_STR_LEN, "%s", error_str);
if (expect < 0 || expect >= PAPI_HUGE_STR_LEN) {
SUBDBG("disabled_reason truncated");
}
goto fn_fail;
}

Expand All @@ -125,6 +162,12 @@ _rocm_smi_init_private(void)

fn_exit:
_rocm_smi_vector.cmp_info.disabled = papi_errno;
if(PAPI_OK == papi_errno) {
int expect = snprintf(_rocm_smi_vector.cmp_info.disabled_reason, PAPI_HUGE_STR_LEN, "%s", "");
if (expect < 0 || expect >= PAPI_HUGE_STR_LEN) {
SUBDBG("disabled_reason truncated");
}
}
PAPI_unlock(COMPONENT_LOCK);
return papi_errno;
fn_fail:
Expand Down
4 changes: 4 additions & 0 deletions src/components/rocp_sdk/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,5 +65,9 @@ To see the ROCP\_SDK component's current supported hardware and software please

## Known Limitations

* For AMD devices older than the AMD Instinct MI300A, PAPI should not be configured with both `rocm` and `rocp_sdk`.
If both components are configured, then `rocm` will be active by default for ROCm < 6.3.2; `rocp_sdk` will be active by default for ROCm >= 6.3.2.
Users can override this when running an application by setting `export PAPI_DISABLE_COMPONENTS=rocm` when `rocm` is active by default, or
`export PAPI_DISABLE_COMPONENTS=rocp_sdk` when `rocp_sdk` is active by default.
* In dispatch mode, PAPI may read zeros if reading takes place immediately after the return of a GPU kernel. This is not a PAPI bug. It may occur because calls such as hipDeviceSynchronize() do not guarantee that ROCprofiler has been called and all counter buffers have been flushed. Therefore, it is recommended that the user code adds a delay between the return of a kernel and calls to PAPI_read(), PAPI_stop(), etc.
* If an application is linked against the static PAPI library libpapi.a, then the application must call PAPI_library_init() before calling any hip routines (e.g. hipInit(), hipGetDeviceCount(), hipLaunchKernelGGL(), etc). If the application is linked against the dynamic library libpapi.so, then the order of operations does not matter.
65 changes: 51 additions & 14 deletions src/components/rocp_sdk/rocp_sdk.c
Original file line number Diff line number Diff line change
Expand Up @@ -133,23 +133,57 @@ rocp_sdk_init_component(int cid)
_rocp_sdk_vector.cmp_info.num_cntrs = -1;
_rocp_sdk_lock = PAPI_NUM_LOCK + NUM_INNER_LOCK + cid;

// We set this env variable to silence some unnecessary ROCprofiler-SDK debug messages.
// It is not critical, so if it fails to be set, we can safely ignore the error.
(void)setenv("ROCPROFILER_LOG_LEVEL","fatal",0);
/* Manage contension between rocm and rocp_sdk components. */
int use_rocp_sdk = 0;
#if defined(DEFAULT_TO_ROCP_SDK)
use_rocp_sdk = 1;
#endif
#if defined(DEFAULT_TO_ROCM)
char *disabledComps = getenv("PAPI_DISABLE_COMPONENTS");
if (disabledComps != NULL) {
char *penv = strdup(disabledComps);
char *p;
for (p = strtok (penv, ",:"); p != NULL; p = strtok (NULL, ",:")) {
if(!strcmp(p, "rocm")) use_rocp_sdk = 1;
}
free(penv);
} else {
SUBDBG("rocp_sdk: getenv(PAPI_DISABLE_COMPONENTS) was not set.\n");
}
#endif

int papi_errno = rocprofiler_sdk_init_pre();
if (papi_errno != PAPI_OK) {
_rocp_sdk_vector.cmp_info.initialized = 1;
int papi_errno, expect;
if( use_rocp_sdk) {
// We set this env variable to silence some unnecessary ROCprofiler-SDK debug messages.
// It is not critical, so if it fails to be set, we can safely ignore the error.
(void)setenv("ROCPROFILER_LOG_LEVEL","fatal",0);

papi_errno = rocprofiler_sdk_init_pre();
if (papi_errno != PAPI_OK) {
_rocp_sdk_vector.cmp_info.initialized = 1;
_rocp_sdk_vector.cmp_info.disabled = papi_errno;
const char *err_string;
rocprofiler_sdk_err_get_last(&err_string);
expect = snprintf(_rocp_sdk_vector.cmp_info.disabled_reason, PAPI_HUGE_STR_LEN, "%s", err_string);
if (expect < 0 || expect >= PAPI_HUGE_STR_LEN) {
SUBDBG("disabled_reason truncated");
}
return papi_errno;
}

// This component needs to be fully initialized from the beginning,
// because interleaving hip calls and PAPI calls leads to errors.
return check_n_initialize();
} else {
expect = snprintf(_rocp_sdk_vector.cmp_info.disabled_reason, PAPI_HUGE_STR_LEN, "%s",
"Not active while rocm component is active. Set 'export PAPI_DISABLE_COMPONENTS=rocm' to override.");
if (expect < 0 || expect >= PAPI_HUGE_STR_LEN) {
SUBDBG("disabled_reason truncated");
}
papi_errno = PAPI_ECOMBO;
_rocp_sdk_vector.cmp_info.disabled = papi_errno;
const char *err_string;
rocprofiler_sdk_err_get_last(&err_string);
snprintf(_rocp_sdk_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN, "%s", err_string);
return papi_errno;
}

// This component needs to be fully initialized from the beginning,
// because interleaving hip calls and PAPI calls leads to errors.
return check_n_initialize();
}

int
Expand Down Expand Up @@ -205,7 +239,10 @@ rocp_sdk_init_private(void)
_rocp_sdk_vector.cmp_info.disabled = papi_errno;
const char *err_string;
rocprofiler_sdk_err_get_last(&err_string);
snprintf(_rocp_sdk_vector.cmp_info.disabled_reason, PAPI_MAX_STR_LEN, "%s", err_string);
int expect = snprintf(_rocp_sdk_vector.cmp_info.disabled_reason, PAPI_HUGE_STR_LEN, "%s", err_string);
if (expect < 0 || expect >= PAPI_HUGE_STR_LEN) {
SUBDBG("disabled_reason truncated");
}
goto fn_fail;
}

Expand Down
Loading