diff --git a/libkineto/CMakeLists.txt b/libkineto/CMakeLists.txt index 266bead06..bf8d55ea9 100644 --- a/libkineto/CMakeLists.txt +++ b/libkineto/CMakeLists.txt @@ -79,6 +79,50 @@ else() set(LIBKINETO_NOXPUPTI ON) endif() +# Detect ROCM Version +if(NOT LIBKINETO_NOROCTRACER) + if(NOT ROCM_INCLUDE_DIRS) + set(ROCM_INCLUDE_DIRS "${ROCM_SOURCE_DIR}/include") + endif() + + find_file(ROCM_VERSION_HEADER_PATH + NAMES rocm-core/rocm_version.h + NO_DEFAULT_PATH + PATHS ${ROCM_INCLUDE_DIRS} + ) + + if(EXISTS ${ROCM_VERSION_HEADER_PATH}) + set(ROCM_HEADER_FILE ${ROCM_VERSION_HEADER_PATH}) + endif() + + # Read the ROCM headerfile into a variable + message(STATUS "Reading ROCM version from: ${ROCM_HEADER_FILE}") + file(READ "${ROCM_HEADER_FILE}" ROCM_HEADER_CONTENT) + + string(REGEX MATCH "ROCM_VERSION_MAJOR[ ]+[0-9]+" TEMP1 ${ROCM_HEADER_CONTENT}) + string(REPLACE "ROCM_VERSION_MAJOR" "" TEMP2 ${TEMP1}) + string(STRIP ${TEMP2} ROCM_VERSION_DEV_MAJOR) + string(REGEX MATCH "ROCM_VERSION_MINOR[ ]+[0-9]+" TEMP1 ${ROCM_HEADER_CONTENT}) + string(REPLACE "ROCM_VERSION_MINOR" "" TEMP2 ${TEMP1}) + string(STRIP ${TEMP2} ROCM_VERSION_DEV_MINOR) + string(REGEX MATCH "ROCM_VERSION_PATCH[ ]+[0-9]+" TEMP1 ${ROCM_HEADER_CONTENT}) + string(REPLACE "ROCM_VERSION_PATCH" "" TEMP2 ${TEMP1}) + string(STRIP ${TEMP2} ROCM_VERSION_DEV_PATCH) + + message(STATUS "ROCM major: ${ROCM_VERSION_DEV_MAJOR}") + message(STATUS "ROCM minor: ${ROCM_VERSION_DEV_MINOR}") + message(STATUS "ROCM patch: ${ROCM_VERSION_DEV_PATCH}") + +# Use rocprofiler-sdk for rocm version 6.4 forward + if ((${ROCM_VERSION_DEV_MAJOR} GREATER_EQUAL 7) OR (${ROCM_VERSION_DEV_MAJOR} GREATER_EQUAL 6 AND ${ROCM_VERSION_DEV_MINOR} GREATER_EQUAL 4)) + set(USE_ROCPROFILER_SDK ON) + endif() + if (${USE_ROCPROFILER_SDK}) + message(STATUS "Building with: rocprofiler-sdk") + else() + message(STATUS "Building with: libroctracer") + endif() +endif() if(NOT DEFINED LIBKINETO_NOAIUPTI) message(INFO " LIBKINETO_NOAIUPTI NOT DEFINED adding 
subdirectory(src/plugin/aiupti)") add_subdirectory(src/plugin/aiupti) @@ -90,8 +134,14 @@ if(LIBKINETO_NOCUPTI AND LIBKINETO_NOROCTRACER AND LIBKINETO_NOXPUPTI AND LIBKIN message(STATUS " CUPTI unavailable or disabled - not building GPU profilers") else() if(NOT LIBKINETO_NOROCTRACER) - get_filelist("get_libkineto_roctracer_srcs(with_api=False)" LIBKINETO_roc_SRCS) - message(STATUS " Building with roctracer") + if (${USE_ROCPROFILER_SDK}) + get_filelist("get_libkineto_rocprofiler_srcs(with_api=False)" LIBKINETO_roc_SRCS) + message(STATUS " Building with rocprofiler-sdk") + else() + get_filelist("get_libkineto_roctracer_srcs(with_api=False)" LIBKINETO_roc_SRCS) + add_compile_options(-DROCTRACER_FALLBACK) + message(STATUS " Building with roctracer") + endif() elseif(NOT LIBKINETO_NOCUPTI) get_filelist("get_libkineto_cupti_srcs(with_api=False)" LIBKINETO_cuda_SRCS) endif() @@ -228,9 +278,14 @@ target_include_directories(kineto PUBLIC $) if(NOT LIBKINETO_NOROCTRACER) - find_library(ROCTRACER_LIBRARY NAMES libroctracer64.so HINTS +if (${USE_ROCPROFILER_SDK}) + find_library(ROCPROF_LIBRARY NAMES librocprofiler-sdk.so HINTS ${ROCM_SOURCE_DIR}/lib) - target_link_libraries(kineto "${ROCTRACER_LIBRARY}") +else() + find_library(ROCPROF_LIBRARY NAMES libroctracer64.so HINTS + ${ROCM_SOURCE_DIR}/lib) +endif() + target_link_libraries(kineto "${ROCPROF_LIBRARY}") find_library(KINETO_HIP_LIBRARY NAMES libamdhip64.so HINTS ${ROCM_SOURCE_DIR}/lib) target_link_libraries(kineto "${KINETO_HIP_LIBRARY}") diff --git a/libkineto/libkineto_defs.bzl b/libkineto/libkineto_defs.bzl index e0f5bf972..e93fa460f 100644 --- a/libkineto/libkineto_defs.bzl +++ b/libkineto/libkineto_defs.bzl @@ -27,10 +27,18 @@ def get_libkineto_cupti_srcs(with_api = True): "src/cupti_strings.cpp", ] + (get_libkineto_cpu_only_srcs(with_api)) +def get_libkineto_rocprofiler_srcs(with_api = True): + return [ + "src/RocprofActivityApi.cpp", + "src/RocprofLogger.cpp", + "src/RocLogger.cpp", + ] + 
(get_libkineto_cpu_only_srcs(with_api)) + def get_libkineto_roctracer_srcs(with_api = True): return [ "src/RoctracerActivityApi.cpp", "src/RoctracerLogger.cpp", + "src/RocLogger.cpp", ] + (get_libkineto_cpu_only_srcs(with_api)) def get_libkineto_xpupti_srcs(with_api = True): diff --git a/libkineto/src/ActivityProfilerController.cpp b/libkineto/src/ActivityProfilerController.cpp index d1814ca37..e9867fcaf 100644 --- a/libkineto/src/ActivityProfilerController.cpp +++ b/libkineto/src/ActivityProfilerController.cpp @@ -18,8 +18,12 @@ #include "CuptiActivityApi.h" #ifdef HAS_ROCTRACER +#ifndef ROCTRACER_FALLBACK +#include "RocprofActivityApi.h" +#else #include "RoctracerActivityApi.h" #endif +#endif #include "ThreadUtil.h" #include "output_json.h" @@ -67,8 +71,13 @@ ActivityProfilerController::ActivityProfilerController( #endif // !USE_GOOGLE_LOG #ifdef HAS_ROCTRACER +#ifndef ROCTRACER_FALLBACK + profiler_ = std::make_unique( + RocprofActivityApi::singleton(), cpuOnly); +#else profiler_ = std::make_unique( RoctracerActivityApi::singleton(), cpuOnly); +#endif #else profiler_ = std::make_unique( CuptiActivityApi::singleton(), cpuOnly); diff --git a/libkineto/src/CuptiActivityProfiler.cpp b/libkineto/src/CuptiActivityProfiler.cpp index 8c7e7924c..2846a4ab1 100644 --- a/libkineto/src/CuptiActivityProfiler.cpp +++ b/libkineto/src/CuptiActivityProfiler.cpp @@ -25,7 +25,7 @@ #ifdef HAS_CUPTI #include #elif defined(HAS_ROCTRACER) -#include +#include #endif #include "Config.h" @@ -39,9 +39,14 @@ #include "KernelRegistry.h" #endif // HAS_CUPTI #ifdef HAS_ROCTRACER +#include "RocLogger.h" +#ifndef ROCTRACER_FALLBACK +#include "RocprofActivity.h" +#include "RocprofActivityApi.h" +#else #include "RoctracerActivity.h" #include "RoctracerActivityApi.h" -#include "RoctracerLogger.h" +#endif #endif #ifdef HAS_XPUPTI #include "plugin/xpupti/XpuptiActivityProfiler.h" @@ -214,9 +219,15 @@ void CuptiActivityProfiler::transferCpuTrace( } #ifdef HAS_ROCTRACER +#ifndef ROCTRACER_FALLBACK 
+CuptiActivityProfiler::CuptiActivityProfiler( + RocprofActivityApi& cupti, + bool cpuOnly) +#else CuptiActivityProfiler::CuptiActivityProfiler( RoctracerActivityApi& cupti, bool cpuOnly) +#endif #else CuptiActivityProfiler::CuptiActivityProfiler( CuptiActivityApi& cupti, @@ -256,23 +267,23 @@ void CuptiActivityProfiler::logGpuVersions() { addVersionMetadata("cuda_driver_version", std::to_string(cudaDriverVersion)); #elif defined(HAS_ROCTRACER) - uint32_t majorVersion = roctracer_version_major(); - uint32_t minorVersion = roctracer_version_minor(); + uint32_t majorVersion = ROCPROFILER_VERSION_MAJOR; + uint32_t minorVersion = ROCPROFILER_VERSION_MINOR; std::string roctracerVersion = std::to_string(majorVersion) + "." + std::to_string(minorVersion); int hipRuntimeVersion = 0, hipDriverVersion = 0; CUDA_CALL(hipRuntimeGetVersion(&hipRuntimeVersion)); CUDA_CALL(hipDriverGetVersion(&hipDriverVersion)); - LOG(INFO) << "HIP versions. Roctracer: " << roctracerVersion + LOG(INFO) << "HIP versions. 
Rocprofiler-sdk: " << roctracerVersion << "; Runtime: " << hipRuntimeVersion << "; Driver: " << hipDriverVersion; - LOGGER_OBSERVER_ADD_METADATA("roctracer_version", roctracerVersion); + LOGGER_OBSERVER_ADD_METADATA("rocprofiler-sdk_version", roctracerVersion); LOGGER_OBSERVER_ADD_METADATA( "hip_runtime_version", std::to_string(hipRuntimeVersion)); LOGGER_OBSERVER_ADD_METADATA( "hip_driver_version", std::to_string(hipDriverVersion)); - addVersionMetadata("roctracer_version", roctracerVersion); + addVersionMetadata("rocprofiler-sdk_version", roctracerVersion); addVersionMetadata("hip_runtime_version", std::to_string(hipRuntimeVersion)); addVersionMetadata("hip_driver_version", std::to_string(hipDriverVersion)); @@ -372,7 +383,7 @@ void CuptiActivityProfiler::processTraceInternal(ActivityLogger& logger) { VLOG(0) << "Retrieving GPU activity buffers"; const int count = cupti_.processActivities( std::bind( - &CuptiActivityProfiler::handleRoctracerActivity, + &CuptiActivityProfiler::handleRocprofActivity, this, std::placeholders::_1, &logger), @@ -480,10 +491,10 @@ inline void CuptiActivityProfiler::handleCorrelationActivity( inline void CuptiActivityProfiler::handleCorrelationActivity( uint64_t correlationId, uint64_t externalId, - RoctracerLogger::CorrelationDomain externalKind) { - if (externalKind == RoctracerLogger::CorrelationDomain::Domain0) { + RocLogger::CorrelationDomain externalKind) { + if (externalKind == RocLogger::CorrelationDomain::Domain0) { cpuCorrelationMap_[correlationId] = externalId; - } else if (externalKind == RoctracerLogger::CorrelationDomain::Domain1) { + } else if (externalKind == RocLogger::CorrelationDomain::Domain1) { userCorrelationMap_[correlationId] = externalId; } else { LOG(WARNING) @@ -960,7 +971,7 @@ void CuptiActivityProfiler::handleRuntimeActivity( } inline void CuptiActivityProfiler::handleGpuActivity( - const roctracerAsyncRow* act, + const rocprofAsyncRow* act, ActivityLogger* logger) { const ITraceActivity* linked = 
linkedActivity(act->id, cpuCorrelationMap_); const auto& gpu_activity = @@ -968,29 +979,29 @@ inline void CuptiActivityProfiler::handleGpuActivity( handleGpuActivity(gpu_activity, logger); } -void CuptiActivityProfiler::handleRoctracerActivity( - const roctracerBase* record, +void CuptiActivityProfiler::handleRocprofActivity( + const rocprofBase* record, ActivityLogger* logger) { switch (record->type) { case ROCTRACER_ACTIVITY_DEFAULT: handleRuntimeActivity( - reinterpret_cast(record), logger); + reinterpret_cast(record), logger); break; case ROCTRACER_ACTIVITY_KERNEL: handleRuntimeActivity( - reinterpret_cast(record), logger); + reinterpret_cast(record), logger); break; case ROCTRACER_ACTIVITY_COPY: handleRuntimeActivity( - reinterpret_cast(record), logger); + reinterpret_cast(record), logger); break; case ROCTRACER_ACTIVITY_MALLOC: handleRuntimeActivity( - reinterpret_cast(record), logger); + reinterpret_cast(record), logger); break; case ROCTRACER_ACTIVITY_ASYNC: handleGpuActivity( - reinterpret_cast(record), logger); + reinterpret_cast(record), logger); break; case ROCTRACER_ACTIVITY_NONE: default: @@ -1571,8 +1582,13 @@ void CuptiActivityProfiler::pushCorrelationId(uint64_t id) { id, CuptiActivityApi::CorrelationFlowType::Default); #endif // HAS_CUPTI #ifdef HAS_ROCTRACER +#ifndef ROCTRACER_FALLBACK + RocprofActivityApi::pushCorrelationID( + id, RocprofActivityApi::CorrelationFlowType::Default); +#else RoctracerActivityApi::pushCorrelationID( id, RoctracerActivityApi::CorrelationFlowType::Default); +#endif #endif for (auto& session : sessions_) { session->pushCorrelationId(id); @@ -1585,8 +1601,13 @@ void CuptiActivityProfiler::popCorrelationId() { CuptiActivityApi::CorrelationFlowType::Default); #endif // HAS_CUPTI #ifdef HAS_ROCTRACER +#ifndef ROCTRACER_FALLBACK + RocprofActivityApi::popCorrelationID( + RocprofActivityApi::CorrelationFlowType::Default); +#else RoctracerActivityApi::popCorrelationID( RoctracerActivityApi::CorrelationFlowType::Default); 
+#endif #endif for (auto& session : sessions_) { session->popCorrelationId(); @@ -1599,8 +1620,13 @@ void CuptiActivityProfiler::pushUserCorrelationId(uint64_t id) { id, CuptiActivityApi::CorrelationFlowType::User); #endif // HAS_CUPTI #ifdef HAS_ROCTRACER +#ifndef ROCTRACER_FALLBACK + RocprofActivityApi::pushCorrelationID( + id, RocprofActivityApi::CorrelationFlowType::User); +#else RoctracerActivityApi::pushCorrelationID( id, RoctracerActivityApi::CorrelationFlowType::User); +#endif #endif for (auto& session : sessions_) { session->pushUserCorrelationId(id); @@ -1613,8 +1639,13 @@ void CuptiActivityProfiler::popUserCorrelationId() { CuptiActivityApi::CorrelationFlowType::User); #endif // HAS_CUPTI #ifdef HAS_ROCTRACER +#ifndef ROCTRACER_FALLBACK + RocprofActivityApi::popCorrelationID( + RocprofActivityApi::CorrelationFlowType::User); +#else RoctracerActivityApi::popCorrelationID( RoctracerActivityApi::CorrelationFlowType::User); +#endif #endif for (auto& session : sessions_) { session->popUserCorrelationId(); diff --git a/libkineto/src/CuptiActivityProfiler.h b/libkineto/src/CuptiActivityProfiler.h index 379fa9078..7d308cdb9 100644 --- a/libkineto/src/CuptiActivityProfiler.h +++ b/libkineto/src/CuptiActivityProfiler.h @@ -30,7 +30,11 @@ #endif // HAS_CUPTI #ifdef HAS_ROCTRACER +#ifndef ROCTRACER_FALLBACK +#include "RocprofLogger.h" +#else #include "RoctracerLogger.h" +#endif #endif // HAS_ROCTRACER #include "GenericTraceActivity.h" @@ -45,6 +49,7 @@ namespace KINETO_NAMESPACE { class Config; class CuptiActivityApi; +class RocprofActivityApi; class RoctracerActivityApi; // This struct is a derived snapshot of the Config. 
And should not @@ -121,7 +126,8 @@ inline size_t hash_combine(size_t seed, size_t value) { class CuptiActivityProfiler { public: CuptiActivityProfiler(CuptiActivityApi& cupti, bool cpuOnly); - CuptiActivityProfiler(RoctracerActivityApi& rai, bool cpuOnly); + CuptiActivityProfiler(RocprofActivityApi& rai, bool cpuOnly); + CuptiActivityProfiler(RoctracerActivityApi& rtai, bool cpuOnly); CuptiActivityProfiler(const CuptiActivityProfiler&) = delete; CuptiActivityProfiler& operator=(const CuptiActivityProfiler&) = delete; ~CuptiActivityProfiler(); @@ -411,20 +417,16 @@ class CuptiActivityProfiler { #endif // HAS_CUPTI #ifdef HAS_ROCTRACER - // Process generic RocTracer activity - void handleRoctracerActivity( - const roctracerBase* record, - ActivityLogger* logger); + // Process generic RocProf activity + void handleRocprofActivity(const rocprofBase* record, ActivityLogger* logger); void handleCorrelationActivity( uint64_t correlationId, uint64_t externalId, - RoctracerLogger::CorrelationDomain externalKind); + RocLogger::CorrelationDomain externalKind); // Process specific GPU activity types template void handleRuntimeActivity(const T* activity, ActivityLogger* logger); - void handleGpuActivity( - const roctracerAsyncRow* record, - ActivityLogger* logger); + void handleGpuActivity(const rocprofAsyncRow* record, ActivityLogger* logger); #endif // HAS_ROCTRACER void resetTraceData(); @@ -457,7 +459,11 @@ class CuptiActivityProfiler { // Calls to CUPTI is encapsulated behind this interface #ifdef HAS_ROCTRACER - RoctracerActivityApi& cupti_; // Design failure here +#ifndef ROCTRACER_FALLBACK + RocprofActivityApi& cupti_; // Design failure here +#else + RoctracerActivityApi& cupti_; +#endif #else CuptiActivityApi& cupti_; #endif diff --git a/libkineto/src/DeviceUtil.h b/libkineto/src/DeviceUtil.h index f5d772742..2fbec448c 100644 --- a/libkineto/src/DeviceUtil.h +++ b/libkineto/src/DeviceUtil.h @@ -48,7 +48,7 @@ #elif defined(HAS_ROCTRACER) #include -#include +#include 
#define CUDA_CALL(call) \ { \ diff --git a/libkineto/src/RocLogger.cpp b/libkineto/src/RocLogger.cpp new file mode 100644 index 000000000..b1290640b --- /dev/null +++ b/libkineto/src/RocLogger.cpp @@ -0,0 +1,32 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "RocLogger.h" + +ApiIdList::ApiIdList() : invert_(true) {} + +void ApiIdList::add(const std::string& apiName) { + uint32_t cid = mapName(apiName); + if (cid > 0) + filter_[cid] = 1; +} + +void ApiIdList::remove(const std::string& apiName) { + uint32_t cid = mapName(apiName); + if (cid > 0) + filter_.erase(cid); +} + +bool ApiIdList::loadUserPrefs() { + // FIXME: check an ENV variable that points to an exclude file + return false; +} + +bool ApiIdList::contains(uint32_t apiId) { + return (filter_.find(apiId) != filter_.end()) ? !invert_ : invert_; // XOR +} diff --git a/libkineto/src/RocLogger.h b/libkineto/src/RocLogger.h new file mode 100644 index 000000000..4626751a9 --- /dev/null +++ b/libkineto/src/RocLogger.h @@ -0,0 +1,229 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace libkineto { +class RocprofActivityApi; +} + +typedef uint64_t timestamp_t; + +inline timestamp_t timespec_to_ns(const timespec& time) { + return ((timestamp_t)time.tv_sec * 1000000000) + time.tv_nsec; +} + +namespace RocLogger { +enum CorrelationDomain { + begin, + Default = begin, + Domain0 = begin, + Domain1, + end, + size = end +}; +} // namespace RocLogger + +class ApiIdList { + public: + ApiIdList(); + virtual ~ApiIdList() {} + bool invertMode() { + return invert_; + } + void setInvertMode(bool invert) { + invert_ = invert; + } + void add(const std::string& apiName); + void remove(const std::string& apiName); + bool loadUserPrefs(); + + // Map api string to cnid enum + virtual uint32_t mapName(const std::string& apiName) = 0; + + bool contains(uint32_t apiId); + const std::unordered_map& filterList() { + return filter_; + } + + private: + std::unordered_map filter_; + bool invert_; +}; + +typedef enum { + ROCTRACER_ACTIVITY_DEFAULT = 0, + ROCTRACER_ACTIVITY_KERNEL, + ROCTRACER_ACTIVITY_COPY, + ROCTRACER_ACTIVITY_MALLOC, + ROCTRACER_ACTIVITY_ASYNC, + ROCTRACER_ACTIVITY_NONE +} rocprof_activity_types; + +struct rocprofBase { + rocprofBase( + uint64_t id, + uint32_t domain, + uint64_t begin, + uint64_t end, + rocprof_activity_types type = ROCTRACER_ACTIVITY_NONE) + : id(id), begin(begin), end(end), domain(domain), type(type) {} + uint64_t id; // correlation_id + uint64_t begin; + uint64_t end; + uint32_t domain; + rocprof_activity_types type; +}; + +struct rocprofRow : public rocprofBase { + rocprofRow( + uint64_t id, + uint32_t domain, + uint32_t cid, + uint32_t pid, + uint32_t tid, + uint64_t begin, + uint64_t end, + rocprof_activity_types type = ROCTRACER_ACTIVITY_DEFAULT) + : rocprofBase(id, domain, begin, end, type), + cid(cid), + pid(pid), + tid(tid) {} + uint32_t cid; + uint32_t pid; + uint32_t tid; 
+}; + +struct rocprofKernelRow : public rocprofRow { + rocprofKernelRow( + uint64_t id, + uint32_t domain, + uint32_t cid, + uint32_t pid, + uint32_t tid, + uint64_t begin, + uint64_t end, + const void* faddr, + hipFunction_t function, + unsigned int gx, + unsigned int gy, + unsigned int gz, + unsigned int wx, + unsigned int wy, + unsigned int wz, + size_t gss, + hipStream_t stream, + rocprof_activity_types type = ROCTRACER_ACTIVITY_KERNEL) + : rocprofRow(id, domain, cid, pid, tid, begin, end, type), + functionAddr(faddr), + function(function), + gridX(gx), + gridY(gy), + gridZ(gz), + workgroupX(wx), + workgroupY(wy), + workgroupZ(wz), + groupSegmentSize(gss), + stream(stream) {} + const void* functionAddr; + hipFunction_t function; + unsigned int gridX; + unsigned int gridY; + unsigned int gridZ; + unsigned int workgroupX; + unsigned int workgroupY; + unsigned int workgroupZ; + size_t groupSegmentSize; + hipStream_t stream; +}; + +struct rocprofCopyRow : public rocprofRow { + rocprofCopyRow( + uint64_t id, + uint32_t domain, + uint32_t cid, + uint32_t pid, + uint32_t tid, + uint64_t begin, + uint64_t end, + const void* src, + const void* dst, + size_t size, + hipMemcpyKind kind, + hipStream_t stream, + rocprof_activity_types type = ROCTRACER_ACTIVITY_COPY) + : rocprofRow(id, domain, cid, pid, tid, begin, end, type), + src(src), + dst(dst), + size(size), + kind(kind), + stream(stream) {} + const void* src; + const void* dst; + size_t size; + hipMemcpyKind kind; + hipStream_t stream; +}; + +struct rocprofMallocRow : public rocprofRow { + rocprofMallocRow( + uint64_t id, + uint32_t domain, + uint32_t cid, + uint32_t pid, + uint32_t tid, + uint64_t begin, + uint64_t end, + const void* ptr, + size_t size, + rocprof_activity_types type = ROCTRACER_ACTIVITY_MALLOC) + : rocprofRow(id, domain, cid, pid, tid, begin, end, type), + ptr(ptr), + size(size) {} + const void* ptr; + size_t size; +}; + +struct rocprofAsyncRow : public rocprofBase { + rocprofAsyncRow( + uint64_t id, 
+ uint32_t domain, + uint32_t kind, + uint32_t op, + int device, + uint64_t queue, + uint64_t begin, + uint64_t end, + const std::string& kernelName, + rocprof_activity_types type = ROCTRACER_ACTIVITY_ASYNC) + : rocprofBase(id, domain, begin, end, type), + kind(kind), + op(op), + device(device), + queue(queue), + kernelName(kernelName) {} + uint32_t kind; + uint32_t op; + int device; + uint64_t queue; + std::string kernelName; +}; diff --git a/libkineto/src/RocprofActivity.h b/libkineto/src/RocprofActivity.h new file mode 100644 index 000000000..296578072 --- /dev/null +++ b/libkineto/src/RocprofActivity.h @@ -0,0 +1,165 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +// TODO(T90238193) +// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude +#include "GenericTraceActivity.h" +#include "ITraceActivity.h" +#include "RocprofLogger.h" +#include "ThreadUtil.h" + +#include +#include + +namespace libkineto { +class ActivityLogger; +} + +namespace KINETO_NAMESPACE { + +using namespace libkineto; +struct TraceSpan; + +// These classes wrap the various Rocprof activity types +// into subclasses of ITraceActivity so that they can all be accessed +// using the ITraceActivity interface and logged via ActivityLogger. + +// Abstract base class, templated on Rocprof activity type +template +struct RocprofActivity : public ITraceActivity { + explicit RocprofActivity(const T* activity, const ITraceActivity* linked) + : activity_(*activity), linked_(linked) {} + // Our stored timestamps (from rocprof and generated) are in CLOCK_MONOTONIC + // domain (in ns). Convert the timestamps. 
+ int64_t timestamp() const override { + return activity_.begin; + } + int64_t duration() const override { + return activity_.end - activity_.begin; + } + int64_t correlationId() const override { + return 0; + } + int32_t getThreadId() const override { + return 0; + } + const ITraceActivity* linkedActivity() const override { + return linked_; + } + int flowType() const override { + return kLinkAsyncCpuGpu; + } + int64_t flowId() const override { + return correlationId(); + } + const T& raw() const { + return activity_; + } + const TraceSpan* traceSpan() const override { + return nullptr; + } + const std::string getMetadataValue(const std::string& key) const override { + auto it = metadata_.find(key); + if (it != metadata_.end()) { + return it->second; + } + return ""; + } + + protected: + const T& activity_; + const ITraceActivity* linked_{nullptr}; + std::unordered_map metadata_; +}; + +// rocprofAsyncRow - Rocprof GPU activities +struct GpuActivity : public RocprofActivity { + explicit GpuActivity( + const rocprofAsyncRow* activity, + const ITraceActivity* linked) + : RocprofActivity(activity, linked) { + switch (activity_.domain) { + case ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY: + type_ = ActivityType::GPU_MEMCPY; + break; + case ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH: + default: + type_ = ActivityType::CONCURRENT_KERNEL; + break; + } + } + int64_t correlationId() const override { + return activity_.id; + } + int64_t deviceId() const override { + return activity_.device; + } + int64_t resourceId() const override { + return activity_.queue; + } + ActivityType type() const override { + return type_; + }; + bool flowStart() const override { + return false; + } + const std::string name() const override; + void log(ActivityLogger& logger) const override; + const std::string metadataJson() const override; + + // Add small buffer to fix visual error created by + // https://github.com/ROCm/roctracer/issues/105 Once this is resolved we can + // use ifdef to handle 
having this buffer or not based on version + int64_t timestamp() const override { + return activity_.begin + 1; + } + int64_t duration() const override { + return activity_.end - (activity_.begin + 1); + } + + private: + ActivityType type_; +}; + +// rocprofRow, rocprofKernelRow, rocprofCopyRow, rocprofMallocRow - +// Rocprof runtime activities +template +struct RuntimeActivity : public RocprofActivity { + explicit RuntimeActivity(const T* activity, const ITraceActivity* linked) + : RocprofActivity(activity, linked) {} + int64_t correlationId() const override { + return raw().id; + } + int64_t deviceId() const override { + return raw().pid; + } + int64_t resourceId() const override { + return raw().tid; + } + ActivityType type() const override { + return ActivityType::CUDA_RUNTIME; + } + bool flowStart() const override; + const std::string name() const override { + return RocprofLogger::opString( + ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API, raw().cid); + } + void log(ActivityLogger& logger) const override; + const std::string metadataJson() const override; + const T& raw() const { + return RocprofActivity::raw(); + } +}; + +} // namespace KINETO_NAMESPACE + +// Include the implementation detail of this header file. +// The *_inl.h helps separate header interface from implementation details. +#include "RocprofActivity_inl.h" diff --git a/libkineto/src/RocprofActivityApi.cpp b/libkineto/src/RocprofActivityApi.cpp new file mode 100644 index 000000000..60a243b9b --- /dev/null +++ b/libkineto/src/RocprofActivityApi.cpp @@ -0,0 +1,196 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include "RocprofActivityApi.h" + +#include +#include +#include +#include +#include "ApproximateClock.h" +#include "Demangle.h" +#include "Logger.h" +#include "ThreadUtil.h" +#include "output_base.h" + +using namespace std::chrono; + +namespace KINETO_NAMESPACE { + +RocprofActivityApi& RocprofActivityApi::singleton() { + static RocprofActivityApi instance; + return instance; +} + +RocprofActivityApi::RocprofActivityApi() : d(&RocprofLogger::singleton()) {} + +RocprofActivityApi::~RocprofActivityApi() { + disableActivities(std::set()); +} + +void RocprofActivityApi::pushCorrelationID(int id, CorrelationFlowType type) { +#ifdef HAS_ROCTRACER + if (!singleton().d->externalCorrelationEnabled_) { + return; + } + singleton().d->pushCorrelationID( + id, static_cast(type)); +#endif +} + +void RocprofActivityApi::popCorrelationID(CorrelationFlowType type) { +#ifdef HAS_ROCTRACER + if (!singleton().d->externalCorrelationEnabled_) { + return; + } + singleton().d->popCorrelationID( + static_cast(type)); +#endif +} + +void RocprofActivityApi::setMaxEvents(uint32_t maxEvents) { + d->setMaxEvents(maxEvents); +} + +void RocprofActivityApi::setMaxBufferSize(int size) { + // FIXME: implement? + // maxGpuBufferCount_ = 1 + size / kBufSize; +} + +inline bool inRange(int64_t start, int64_t end, int64_t stamp) { + return ((stamp > start) && (stamp < end)); +} + +inline bool RocprofActivityApi::isLogged(libkineto::ActivityType atype) const { + return activityMaskSnapshot_ & (1 << static_cast(atype)); +} + +timestamp_t getTimeOffset() { + int64_t t0, t00; + timespec t1; + t0 = libkineto::getApproximateTime(); + clock_gettime(CLOCK_MONOTONIC, &t1); + t00 = libkineto::getApproximateTime(); + + // Convert to ns (if necessary) + t0 = libkineto::get_time_converter()(t0); + t00 = libkineto::get_time_converter()(t00); + + // Our stored timestamps (from roctracer and generated) are in CLOCK_MONOTONIC + // domain (in ns). 
+ return (t0 >> 1) + (t00 >> 1) - timespec_to_ns(t1); +} + +int RocprofActivityApi::processActivities( + std::function handler, + std::function + correlationHandler) { + // Find offset to map from monotonic clock to system clock. + // This will break time-ordering of events but is status quo. + + int count = 0; + + // Process all external correlation pairs + for (int it = RocLogger::CorrelationDomain::begin; + it < RocLogger::CorrelationDomain::end; + ++it) { + auto& externalCorrelations = d->externalCorrelations_[it]; + for (auto& item : externalCorrelations) { + correlationHandler( + item.first, + item.second, + static_cast(it)); + } + std::lock_guard lock(d->externalCorrelationsMutex_); + externalCorrelations.clear(); + } + + // Async ops are in CLOCK_MONOTONIC rather than junk clock. + // Convert these timestamps, poorly. + // These accurate timestamps will skew when converted to approximate time + // The time_converter is not available at collection time. Or we could do a + // much better job. + auto toffset = getTimeOffset(); + + // All collected rows; only async GPU ops are filtered by copy/kernel type + for (auto& item : d->rows_) { + bool filtered = false; + if (item->type != ROCTRACER_ACTIVITY_ASYNC && + !isLogged(ActivityType::CUDA_RUNTIME)) { + filtered = true; + } else if (item->type == ROCTRACER_ACTIVITY_ASYNC) { + switch (item->domain) { + case ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY: + if (!isLogged(ActivityType::GPU_MEMCPY)) + filtered = true; + break; + case ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH: + default: + if (!isLogged(ActivityType::CONCURRENT_KERNEL)) + filtered = true; + break; + } + } + if (!filtered) { + // Convert the begin and end timestamps from monotonic clock to system + // clock. 
+ if (item->type == ROCTRACER_ACTIVITY_ASYNC) { + // Async ops are in CLOCK_MONOTONIC, apply offset to converted + // approximate + item->begin += toffset; + item->end += toffset; + } else { + // Runtime ranges are in approximate clock, just apply conversion + item->begin = libkineto::get_time_converter()(item->begin); + item->end = libkineto::get_time_converter()(item->end); + } + handler(item); + ++count; + } + } + return count; +} + +// TODO: implement the actual flush with roctracer_flush_activity +void RocprofActivityApi::flushActivities() {} + +void RocprofActivityApi::clearActivities() { + d->clearLogs(); +} + +void RocprofActivityApi::enableActivities( + const std::set& selected_activities) { +#ifdef HAS_ROCTRACER + d->startLogging(); + + for (const auto& activity : selected_activities) { + activityMask_ |= (1 << static_cast(activity)); + if (activity == ActivityType::EXTERNAL_CORRELATION) { + d->externalCorrelationEnabled_ = true; + } + } +#endif +} + +void RocprofActivityApi::disableActivities( + const std::set& selected_activities) { +#ifdef HAS_ROCTRACER + d->stopLogging(); + + activityMaskSnapshot_ = activityMask_; + + for (const auto& activity : selected_activities) { + activityMask_ &= ~(1 << static_cast(activity)); + if (activity == ActivityType::EXTERNAL_CORRELATION) { + d->externalCorrelationEnabled_ = false; + } + } +#endif +} + +} // namespace KINETO_NAMESPACE diff --git a/libkineto/src/RocprofActivityApi.h b/libkineto/src/RocprofActivityApi.h new file mode 100644 index 000000000..08a2a2a63 --- /dev/null +++ b/libkineto/src/RocprofActivityApi.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once +#ifdef HAS_ROCTRACER + +#include +#include +#include + +#include "RocprofLogger.h" + +#include "ActivityType.h" +#include "GenericTraceActivity.h" + +class RocprofLogger; + +namespace KINETO_NAMESPACE { + +using namespace libkineto; + +class RocprofActivityApi { + public: + enum CorrelationFlowType { Default, User }; + + RocprofActivityApi(); + RocprofActivityApi(const RocprofActivityApi&) = delete; + RocprofActivityApi& operator=(const RocprofActivityApi&) = delete; + + virtual ~RocprofActivityApi(); + + static RocprofActivityApi& singleton(); + + static void pushCorrelationID(int id, CorrelationFlowType type); + static void popCorrelationID(CorrelationFlowType type); + + void enableActivities(const std::set& selected_activities); + void disableActivities(const std::set& selected_activities); + void flushActivities(); + void clearActivities(); + void teardownContext() {} + void setTimeOffset(timestamp_t toffset); + void setMaxEvents(uint32_t maxEvents); + + virtual int processActivities( + std::function handler, + std::function + correlationHandler); + + void setMaxBufferSize(int size); + + std::atomic_bool stopCollection{false}; + + private: + bool registered_{false}; + timestamp_t toffset_{0}; + + // Enabled Activity Filters + uint32_t activityMask_{0}; + uint32_t activityMaskSnapshot_{0}; + bool isLogged(libkineto::ActivityType atype) const; + + RocprofLogger* d; +}; + +} // namespace KINETO_NAMESPACE +#endif diff --git a/libkineto/src/RocprofActivity_inl.h b/libkineto/src/RocprofActivity_inl.h new file mode 100644 index 000000000..e5e58ab80 --- /dev/null +++ b/libkineto/src/RocprofActivity_inl.h @@ -0,0 +1,261 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include "RocprofActivity.h" + +#include +#include +#include + +#include "Demangle.h" +#include "output_base.h" + +namespace KINETO_NAMESPACE { + +using namespace libkineto; + +namespace { +thread_local std::unordered_map correlationToGrid; +thread_local std::unordered_map correlationToBlock; +thread_local std::unordered_map correlationToSize; +} // namespace + +const char* getGpuActivityKindString(uint32_t domain, uint32_t op) { + if (domain == ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH) + return "Dispatch Kernel"; + else if (domain == ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY) { + switch (op) { + case ROCPROFILER_MEMORY_COPY_HOST_TO_HOST: + return "HtoH"; + case ROCPROFILER_MEMORY_COPY_HOST_TO_DEVICE: + return "HtoD"; + case ROCPROFILER_MEMORY_COPY_DEVICE_TO_HOST: + return "DtoH"; + case ROCPROFILER_MEMORY_COPY_DEVICE_TO_DEVICE: + return "DtoD"; + } + } + return ""; +} + +void getMemcpySrcDstString(uint32_t kind, std::string& src, std::string& dst) { + switch (kind) { + case ROCPROFILER_MEMORY_COPY_HOST_TO_HOST: + src = "Host"; + dst = "Host"; + break; + case ROCPROFILER_MEMORY_COPY_DEVICE_TO_HOST: + src = "Device"; + dst = "Host"; + break; + case ROCPROFILER_MEMORY_COPY_HOST_TO_DEVICE: + src = "Host"; + dst = "Device"; + break; + case ROCPROFILER_MEMORY_COPY_DEVICE_TO_DEVICE: + src = "Device"; + dst = "Device"; + break; + default: + src = "?"; + dst = "?"; + break; + } +} + +// GPU Activities + +inline const std::string GpuActivity::name() const { + if (type_ == ActivityType::CONCURRENT_KERNEL) { + auto op = raw().op; + auto domain = raw().domain; + std::string opString = RocprofLogger::opString( + static_cast(domain), op); + const char* name = opString.c_str(); + return demangle( + raw().kernelName.length() > 0 ? 
raw().kernelName : std::string(name)); + } else if (type_ == ActivityType::GPU_MEMSET) { + return fmt::format( + "Memset ({})", getGpuActivityKindString(raw().domain, raw().op)); + } else if (type_ == ActivityType::GPU_MEMCPY) { + std::string src = ""; + std::string dst = ""; + getMemcpySrcDstString(raw().op, src, dst); + return fmt::format( + "Memcpy {} ({} -> {})", + getGpuActivityKindString(raw().domain, raw().op), + src, + dst); + } else { + return ""; + } + return ""; +} + +inline void GpuActivity::log(ActivityLogger& logger) const { + logger.handleActivity(*this); +} + +static inline std::string bandwidth(size_t bytes, uint64_t duration) { + return duration == 0 ? "\"N/A\"" : fmt::format("{}", bytes * 1.0 / duration); +} + +inline const std::string GpuActivity::metadataJson() const { + const auto& gpuActivity = raw(); + // clang-format off + + // if memcpy or memset, add size + if (correlationToSize.count(gpuActivity.id) > 0) { + size_t size = correlationToSize[gpuActivity.id]; + std::string bandwidth_gib = (bandwidth(size, gpuActivity.end - gpuActivity.begin)); + return fmt::format(R"JSON( + "device": {}, "stream": {}, + "correlation": {}, "kind": "{}", + "bytes": {}, "memory bandwidth (GB/s)": {})JSON", + gpuActivity.device, gpuActivity.queue, + gpuActivity.id, getGpuActivityKindString(gpuActivity.domain, gpuActivity.op), + size, bandwidth_gib); + } + + // if compute kernel, add grid and block + else if (correlationToGrid.count(gpuActivity.id) > 0) { + return fmt::format(R"JSON( + "device": {}, "stream": {}, + "correlation": {}, "kind": "{}", + "grid": {}, "block": {})JSON", + gpuActivity.device, gpuActivity.queue, + gpuActivity.id, getGpuActivityKindString(gpuActivity.domain, gpuActivity.op), + correlationToGrid[gpuActivity.id], correlationToBlock[gpuActivity.id]); + } else { + return fmt::format(R"JSON( + "device": {}, "stream": {}, + "correlation": {}, "kind": "{}")JSON", + gpuActivity.device, gpuActivity.queue, + gpuActivity.id, 
getGpuActivityKindString(gpuActivity.domain, gpuActivity.op)); + } + // clang-format on +} + +// Runtime Activities + +template +inline bool RuntimeActivity::flowStart() const { + bool should_correlate = + raw().cid == ROCPROFILER_HIP_RUNTIME_API_ID_hipLaunchKernel || + raw().cid == ROCPROFILER_HIP_RUNTIME_API_ID_hipExtLaunchKernel || + raw().cid == ROCPROFILER_HIP_RUNTIME_API_ID_hipLaunchCooperativeKernel || + raw().cid == ROCPROFILER_HIP_RUNTIME_API_ID_hipHccModuleLaunchKernel || + raw().cid == ROCPROFILER_HIP_RUNTIME_API_ID_hipModuleLaunchKernel || + raw().cid == ROCPROFILER_HIP_RUNTIME_API_ID_hipExtModuleLaunchKernel || + raw().cid == ROCPROFILER_HIP_RUNTIME_API_ID_hipMalloc || + raw().cid == ROCPROFILER_HIP_RUNTIME_API_ID_hipFree || + raw().cid == ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpy || + raw().cid == ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyAsync || + raw().cid == ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyWithStream; + return should_correlate; +} + +template +inline void RuntimeActivity::log(ActivityLogger& logger) const { + logger.handleActivity(*this); +} + +template <> +inline const std::string RuntimeActivity::metadataJson() + const { + std::string kernel = ""; + if ((raw().functionAddr != nullptr)) { + kernel = fmt::format( + R"JSON( + "kernel": "{}", )JSON", + demangle(hipKernelNameRefByPtr(raw().functionAddr, raw().stream))); + } else if ((raw().function != nullptr)) { + kernel = fmt::format( + R"JSON( + "kernel": "{}", )JSON", + demangle(hipKernelNameRef(raw().function))); + } + // cache grid and block so we can pass it into async activity (GPU track) + correlationToGrid[raw().id] = fmt::format( + R"JSON( + [{}, {}, {}])JSON", + raw().gridX, + raw().gridY, + raw().gridZ); + + correlationToBlock[raw().id] = fmt::format( + R"JSON( + [{}, {}, {}])JSON", + raw().workgroupX, + raw().workgroupY, + raw().workgroupZ); + + return fmt::format( + R"JSON( + {}"cid": {}, "correlation": {}, + "grid": [{}, {}, {}], + "block": [{}, {}, {}], + "shared memory": 
{})JSON", + kernel, + raw().cid, + raw().id, + raw().gridX, + raw().gridY, + raw().gridZ, + raw().workgroupX, + raw().workgroupY, + raw().workgroupZ, + raw().groupSegmentSize); +} + +template <> +inline const std::string RuntimeActivity::metadataJson() const { + correlationToSize[raw().id] = raw().size; + return fmt::format( + R"JSON( + "cid": {}, "correlation": {}, "src": "{}", "dst": "{}", "bytes": "{}", "kind": "{}")JSON", + raw().cid, + raw().id, + raw().src, + raw().dst, + raw().size, + fmt::underlying(raw().kind)); +} + +template <> +inline const std::string RuntimeActivity::metadataJson() + const { + correlationToSize[raw().id] = raw().size; + std::string size = ""; + if (raw().cid == ROCPROFILER_HIP_RUNTIME_API_ID_hipMalloc) { + size = fmt::format( + R"JSON( + "bytes": {}, )JSON", + raw().size); + } + return fmt::format( + R"JSON( + {}"cid": {}, "correlation": {}, "ptr": "{}")JSON", + size, + raw().cid, + raw().id, + raw().ptr); +} + +template +inline const std::string RuntimeActivity::metadataJson() const { + return fmt::format( + R"JSON( + "cid": {}, "correlation": {})JSON", + raw().cid, + raw().id); +} + +} // namespace KINETO_NAMESPACE diff --git a/libkineto/src/RocprofLogger.cpp b/libkineto/src/RocprofLogger.cpp new file mode 100644 index 000000000..1d65ae1a0 --- /dev/null +++ b/libkineto/src/RocprofLogger.cpp @@ -0,0 +1,799 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include "RocprofLogger.h" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "ApproximateClock.h" +#include "Demangle.h" +#include "Logger.h" +#include "ThreadUtil.h" + +using namespace libkineto; +using namespace std::chrono; +using namespace RocLogger; + +class RocprofLoggerShared; + +namespace { +RocprofLoggerShared* s{nullptr}; +using kernel_symbol_data_t = + rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t; +using kernel_symbol_map_t = + std::unordered_map; +using kernel_name_map_t = + std::unordered_map; +using rocprofiler::sdk::buffer_name_info; +using rocprofiler::sdk::callback_name_info; +using agent_info_map_t = std::unordered_map; + +// extract copy args +struct copy_args { + const char* dst{""}; + const char* src{""}; + size_t size{0}; + const char* copyKindStr{""}; + hipMemcpyKind copyKind{hipMemcpyDefault}; + hipStream_t stream{nullptr}; + rocprofiler_callback_tracing_kind_t kind; + rocprofiler_tracing_operation_t operation; +}; +auto extract_copy_args = [](rocprofiler_callback_tracing_kind_t, + rocprofiler_tracing_operation_t, + uint32_t arg_num, + const void* const arg_value_addr, + int32_t indirection_count, + const char* arg_type, + const char* arg_name, + const char* arg_value_str, + int32_t dereference_count, + void* cb_data) -> int { + auto& args = *(static_cast(cb_data)); + if (strcmp("dst", arg_name) == 0) { + args.dst = arg_value_str; + } else if (strcmp("src", arg_name) == 0) { + args.src = arg_value_str; + } else if (strcmp("sizeBytes", arg_name) == 0) { + args.size = *(reinterpret_cast(arg_value_addr)); + } else if (strcmp("kind", arg_name) == 0) { + args.copyKindStr = arg_value_str; + args.copyKind = *(reinterpret_cast(arg_value_addr)); + } else if (strcmp("stream", arg_name) == 0) { + args.stream = *(reinterpret_cast(arg_value_addr)); + } + return 0; +}; + +// extract kernel args +struct kernel_args { + // const char *stream; + 
hipStream_t stream{nullptr}; + uint32_t privateSize{0}; + uint32_t groupSize{0}; + rocprofiler_dim3_t workgroupSize{0}; + rocprofiler_dim3_t gridSize{0}; + rocprofiler_callback_tracing_kind_t kind; + rocprofiler_tracing_operation_t operation; +}; +auto extract_kernel_args = [](rocprofiler_callback_tracing_kind_t, + rocprofiler_tracing_operation_t, + uint32_t arg_num, + const void* const arg_value_addr, + int32_t indirection_count, + const char* arg_type, + const char* arg_name, + const char* arg_value_str, + int32_t dereference_count, + void* cb_data) -> int { + auto& args = *(static_cast(cb_data)); + if (strcmp("stream", arg_name) == 0) + args.stream = *(reinterpret_cast(arg_value_addr)); + else if (strcmp("numBlocks", arg_name) == 0) + args.workgroupSize = + *(reinterpret_cast(arg_value_addr)); + else if (strcmp("dimBlocks", arg_name) == 0) + args.gridSize = + *(reinterpret_cast(arg_value_addr)); + else if (strcmp("sharedMemBytes", arg_name) == 0) + args.groupSize = *(reinterpret_cast(arg_value_addr)); + else if (strcmp("globalWorkSizeX", arg_name) == 0) + args.workgroupSize.x = + *(reinterpret_cast(arg_value_addr)); + else if (strcmp("globalWorkSizeY", arg_name) == 0) + args.workgroupSize.y = + *(reinterpret_cast(arg_value_addr)); + else if (strcmp("globalWorkSizeZ", arg_name) == 0) + args.workgroupSize.z = + *(reinterpret_cast(arg_value_addr)); + else if (strcmp("localWorkSizeX", arg_name) == 0) + args.gridSize.x = + *(reinterpret_cast(arg_value_addr)); + else if (strcmp("localWorkSizeY", arg_name) == 0) + args.gridSize.y = + *(reinterpret_cast(arg_value_addr)); + else if (strcmp("localWorkSizeZ", arg_name) == 0) + args.gridSize.z = + *(reinterpret_cast(arg_value_addr)); + return 0; +}; + +// extract malloc args +struct malloc_args { + const char* ptr; + size_t size; +}; +auto extract_malloc_args = [](rocprofiler_callback_tracing_kind_t, + rocprofiler_tracing_operation_t, + uint32_t arg_num, + const void* const arg_value_addr, + int32_t indirection_count, + 
const char* arg_type, + const char* arg_name, + const char* arg_value_str, + int32_t dereference_count, + void* cb_data) -> int { + auto& args = *(static_cast(cb_data)); + if (strcmp("ptr", arg_name) == 0) { + args.ptr = arg_value_str; + } + if (strcmp("size", arg_name) == 0) { + args.size = *(reinterpret_cast(arg_value_addr)); + } + return 0; +}; + +// copy api calls +bool isCopyApi(uint32_t id) { + switch (id) { + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpy: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpy2D: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpy2DAsync: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpy2DFromArray: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpy2DFromArrayAsync: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpy2DToArray: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpy2DToArrayAsync: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpy3D: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpy3DAsync: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyAsync: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyAtoH: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyDtoD: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyDtoDAsync: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyDtoH: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyDtoHAsync: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyFromArray: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyFromSymbol: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyFromSymbolAsync: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyHtoA: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyHtoD: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyHtoDAsync: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyParam2D: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyParam2DAsync: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyPeer: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyPeerAsync: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyToArray: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyToSymbol: + case 
ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyToSymbolAsync: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyWithStream: + return true; + break; + default:; + } + return false; +} + +// kernel api calls +bool isKernelApi(uint32_t id) { + switch (id) { + case ROCPROFILER_HIP_RUNTIME_API_ID_hipExtLaunchKernel: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipExtLaunchMultiKernelMultiDevice: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipLaunchCooperativeKernel: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipLaunchCooperativeKernelMultiDevice: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipLaunchKernel: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipModuleLaunchCooperativeKernel: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipModuleLaunchCooperativeKernelMultiDevice: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipModuleLaunchKernel: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipExtModuleLaunchKernel: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipHccModuleLaunchKernel: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipLaunchCooperativeKernel_spt: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipLaunchKernel_spt: + return true; + break; + default:; + } + return false; +} + +// malloc api calls +bool isMallocApi(uint32_t id) { + switch (id) { + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMalloc: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipFree: + return true; + break; + default:; + } + return false; +} + +class RocprofApiIdList : public ApiIdList { + public: + RocprofApiIdList(callback_name_info& names); + uint32_t mapName(const std::string& apiName) override; + std::vector allEnabled(); + + private: + std::unordered_map nameMap_; +}; + +} // namespace + +class RocprofLoggerShared { + public: + static RocprofLoggerShared& singleton(); + + rocprofiler_client_id_t* clientId{nullptr}; + rocprofiler_tool_configure_result_t cfg = rocprofiler_tool_configure_result_t{ + sizeof(rocprofiler_tool_configure_result_t), + &RocprofLogger::toolInit, + &RocprofLogger::toolFinialize, + nullptr}; + + // Contexts + rocprofiler_context_id_t utilityContext = {0}; + 
rocprofiler_context_id_t context = {0}; + + // Buffers + rocprofiler_buffer_id_t buffer = {}; + + // Manage kernel names - #betterThanRoctracer + kernel_symbol_map_t kernel_info = {}; + kernel_name_map_t kernel_names = {}; + std::mutex kernel_lock; + + // Manage buffer name - #betterThanRoctracer + callback_name_info name_info = {}; + buffer_name_info buff_name_info = {}; + + // Agent info + // + agent_info_map_t agents = {}; + + std::map kernelargs; + std::map copyargs; + + private: + RocprofLoggerShared() { + s = this; + } + ~RocprofLoggerShared() { + s = nullptr; + } +}; + +RocprofLoggerShared& RocprofLoggerShared::singleton() { + static RocprofLoggerShared* instance = new RocprofLoggerShared(); // Leak this + return *instance; +} + +std::vector get_gpu_device_agents() { + std::vector agents; + + // Callback used by rocprofiler_query_available_agents to return + // agents on the device. This can include CPU agents as well. We + // select GPU agents only (i.e. type == ROCPROFILER_AGENT_TYPE_GPU) + rocprofiler_query_available_agents_cb_t iterate_cb = + [](rocprofiler_agent_version_t agents_ver, + const void** agents_arr, + size_t num_agents, + void* udata) { + if (agents_ver != ROCPROFILER_AGENT_INFO_VERSION_0) + throw std::runtime_error{"unexpected rocprofiler agent version"}; + auto* agents_v = + static_cast*>(udata); + for (size_t i = 0; i < num_agents; ++i) { + const auto* agent = + static_cast(agents_arr[i]); + // if(agent->type == ROCPROFILER_AGENT_TYPE_GPU) + // agents_v->emplace_back(*agent); + agents_v->emplace_back(*agent); + } + return ROCPROFILER_STATUS_SUCCESS; + }; + + // Query the agents, only a single callback is made that contains a vector + // of all agents. 
+ rocprofiler_query_available_agents( + ROCPROFILER_AGENT_INFO_VERSION_0, + iterate_cb, + sizeof(rocprofiler_agent_t), + const_cast(static_cast(&agents))); + return agents; +} + +// +// Static setup +// +extern "C" rocprofiler_tool_configure_result_t* rocprofiler_configure( + uint32_t version, + const char* runtime_version, + uint32_t priority, + rocprofiler_client_id_t* id) { + RocprofLoggerShared::singleton(); // CRITICAL: static init + std::cout << "RocprofLogger::rocprofiler_configure" << std::endl; + + id->name = "kineto"; + s->clientId = id; + + // return pointer to configure data + return &s->cfg; +} + +int RocprofLogger::toolInit( + rocprofiler_client_finalize_t finialize_func, + void* tool_data) { + // Gather api names + s->name_info = rocprofiler::sdk::get_callback_tracing_names(); + s->buff_name_info = rocprofiler::sdk::get_buffer_tracing_names(); + + // Gather agent info + auto agent_info = get_gpu_device_agents(); + for (auto agent : agent_info) { + s->agents[agent.id.handle] = agent; + } + + // + // Setup utility context to gather code object info + // + rocprofiler_create_context(&s->utilityContext); + auto code_object_ops = std::vector{ + ROCPROFILER_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER}; + + rocprofiler_configure_callback_tracing_service( + s->utilityContext, + ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, + code_object_ops.data(), + code_object_ops.size(), + RocprofLogger::code_object_callback, + nullptr); + { + int isValid = 0; + rocprofiler_context_is_valid(s->utilityContext, &isValid); + if (isValid == 0) { + s->utilityContext.handle = 0; // Can't destroy it, so leak it + return -1; + } + } + rocprofiler_start_context(s->utilityContext); + + // + // select some api calls to omit, in the most inconvenient way possible + // #betterThanRoctracer + RocprofApiIdList apiList(s->name_info); + apiList.setInvertMode(true); // Omit the specified api + apiList.add("hipGetDevice"); + apiList.add("hipSetDevice"); + apiList.add("hipGetLastError"); + 
apiList.add("__hipPushCallConfiguration"); + apiList.add("__hipPopCallConfiguration"); + apiList.add("hipCtxSetCurrent"); + apiList.add("hipEventRecord"); + apiList.add("hipEventQuery"); + apiList.add("hipGetDeviceProperties"); + apiList.add("hipPeekAtLastError"); + apiList.add("hipModuleGetFunction"); + apiList.add("hipEventCreateWithFlags"); + + // Get a vector of the enabled api calls + auto apis = apiList.allEnabled(); + + // + // Setup main context to collect runtime and kernel info + // + rocprofiler_create_context(&s->context); + + // Collect api info via callback + rocprofiler_configure_callback_tracing_service( + s->context, + ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API, + apis.data(), + apis.size(), + api_callback, + nullptr); + + // Collect async ops via buffers + constexpr auto buffer_size_bytes = 0x40000; + constexpr auto buffer_watermark_bytes = buffer_size_bytes / 2; + + rocprofiler_create_buffer( + s->context, + buffer_size_bytes, + buffer_watermark_bytes, + ROCPROFILER_BUFFER_POLICY_LOSSLESS, + RocprofLogger::buffer_callback, + nullptr, + &s->buffer); + + rocprofiler_configure_buffer_tracing_service( + s->context, + ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH, + nullptr, + 0, + s->buffer); + + rocprofiler_configure_buffer_tracing_service( + s->context, + ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, + nullptr, + 0, + s->buffer); + { + int isValid = 0; + rocprofiler_context_is_valid(s->context, &isValid); + if (isValid == 0) { + s->context.handle = 0; // Can't destroy it, so leak it + return -1; + } + } + rocprofiler_stop_context(s->context); + + return 0; +} + +void RocprofLogger::toolFinialize(void* tool_data) { + rocprofiler_stop_context(s->utilityContext); + s->utilityContext.handle = 0; + rocprofiler_stop_context(s->context); + s->context.handle = 0; +} + +class Flush { + public: + std::mutex mutex_; + std::atomic maxCorrelationId_; + uint64_t maxCompletedCorrelationId_{0}; + void reportCorrelation(const uint64_t& cid) { + uint64_t prev = 
maxCorrelationId_; + while (prev < cid && !maxCorrelationId_.compare_exchange_weak(prev, cid)) { + } + } +}; + +RocprofLogger& RocprofLogger::singleton() { + static RocprofLogger instance; + return instance; +} + +RocprofLogger::RocprofLogger() {} + +RocprofLogger::~RocprofLogger() { + stopLogging(); + endTracing(); +} + +namespace { +thread_local std::deque + t_externalIds[RocLogger::CorrelationDomain::size]; +} + +void RocprofLogger::pushCorrelationID(uint64_t id, CorrelationDomain type) { + if (!singleton().externalCorrelationEnabled_) { + return; + } + t_externalIds[type].push_back(id); +} + +void RocprofLogger::popCorrelationID(CorrelationDomain type) { + if (!singleton().externalCorrelationEnabled_) { + return; + } + if (!t_externalIds[type].empty()) { + t_externalIds[type].pop_back(); + } else { + LOG(ERROR) + << "Attempt to popCorrelationID from an empty external Ids stack"; + } +} + +void RocprofLogger::clearLogs() { + // CuptiActivityProfiler clears this before the output Loggers use the data + // for (auto &row : rows_) + // delete row; + rows_.clear(); + for (int i = 0; i < CorrelationDomain::size; ++i) { + externalCorrelations_[i].clear(); + } +} + +void RocprofLogger::insert_row_to_buffer(rocprofBase* row) { + RocprofLogger* dis = &singleton(); + std::lock_guard lock(dis->rowsMutex_); + if (dis->rows_.size() >= dis->maxBufferSize_) { + LOG_FIRST_N(WARNING, 10) + << "Exceeded max GPU buffer count (" << dis->rows_.size() << " > " + << dis->maxBufferSize_ << ") - terminating tracing"; + return; + } + dis->rows_.push_back(row); +} + +void RocprofLogger::code_object_callback( + rocprofiler_callback_tracing_record_t record, + rocprofiler_user_data_t* user_data, + void* callback_data) { + if (record.kind == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT && + record.operation == ROCPROFILER_CODE_OBJECT_LOAD) { + if (record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD) { + // flush the buffer to ensure that any lookups for the client kernel names + // for the code 
object are completed NOTE: not using buffer ATM
+    }
+  } else if (
+      record.kind == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT &&
+      record.operation ==
+          ROCPROFILER_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER) {
+    auto* data = static_cast<kernel_symbol_data_t*>(record.payload);
+    if (record.phase == ROCPROFILER_CALLBACK_PHASE_LOAD) {
+      // Remember the symbol and its demangled name, keyed by kernel id, so
+      // buffer_callback can name dispatches later.
+      std::lock_guard<std::mutex> lock(s->kernel_lock);
+      s->kernel_info.emplace(data->kernel_id, *data);
+      s->kernel_names.emplace(data->kernel_id, demangle(data->kernel_name));
+    } else if (record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD) {
+      // FIXME: clear these? At minimum need kernel names at shutdown, async
+      // completion
+      // s->kernel_info.erase(data->kernel_id);
+      // s->kernel_names.erase(data->kernel_id);
+    }
+  }
+}
+
+// Callback-tracing handler for HIP runtime API calls.
+//
+// On the ENTER phase the approximate-clock time is stored in a thread-local
+// map keyed by the internal correlation id. On any other phase (treated as
+// EXIT) the stored start time is paired with the current time and one row is
+// appended via insert_row_to_buffer(): a kernel, copy, or malloc row when the
+// operation matches isKernelApi / isCopyApi / isMallocApi, otherwise a plain
+// rocprofRow. Records of any other tracing kind are ignored.
+//
+// user_data and callback_data are not used.
+void RocprofLogger::api_callback(
+    rocprofiler_callback_tracing_record_t record,
+    rocprofiler_user_data_t* user_data,
+    void* callback_data) {
+  thread_local std::unordered_map<uint64_t, uint64_t> timestamps;
+
+  if (record.kind == ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API) {
+    if (record.phase == ROCPROFILER_CALLBACK_PHASE_ENTER) {
+      timestamps[record.correlation_id.internal] = getApproximateTime();
+    } // ROCPROFILER_CALLBACK_PHASE_ENTER
+    else { // ROCPROFILER_CALLBACK_PHASE_EXIT
+      // Consume the matching ENTER timestamp; stays 0 if ENTER was never seen
+      // on this thread.
+      uint64_t startTime = 0;
+      const auto enter = timestamps.find(record.correlation_id.internal);
+      if (enter != timestamps.end()) {
+        startTime = enter->second;
+        timestamps.erase(enter);
+      }
+      uint64_t endTime = getApproximateTime();
+
+      // Kernel Launch Records
+      if (isKernelApi(record.operation)) {
+        kernel_args args;
+        rocprofiler_iterate_callback_tracing_kind_operation_args(
+            record,
+            extract_kernel_args,
+            1 /*max_deref*/
+            ,
+            &args);
+
+        // NOTE(review): extract_kernel_args stores "numBlocks" in
+        // workgroupSize and "dimBlocks" in gridSize; confirm the argument
+        // order below matches what rocprofKernelRow's constructor expects.
+        rocprofKernelRow* row = new rocprofKernelRow(
+            record.correlation_id.internal,
+            record.kind,
+            record.operation,
+            processId(),
+            systemThreadId(),
+            startTime,
+            endTime,
+            nullptr,
+            nullptr,
+            args.workgroupSize.x,
+            args.workgroupSize.y,
+            args.workgroupSize.z,
+            args.gridSize.x,
+            args.gridSize.y,
+            args.gridSize.z,
+            args.groupSize,
+            args.stream);
+        insert_row_to_buffer(row);
+
+      }
+      // Copy Records
+      else if (isCopyApi(record.operation)) {
+        copy_args args;
+        rocprofiler_iterate_callback_tracing_kind_operation_args(
+            record,
+            extract_copy_args,
+            1 /*max_deref*/
+            ,
+            &args);
+
+        // Take kind/operation from the record itself: copy_args::kind and
+        // copy_args::operation are never written by extract_copy_args, so
+        // reading them would pass indeterminate values to the row.
+        rocprofCopyRow* row = new rocprofCopyRow(
+            record.correlation_id.internal,
+            record.kind,
+            record.operation,
+            processId(),
+            systemThreadId(),
+            startTime,
+            endTime,
+            args.src,
+            args.dst,
+            args.size,
+            args.copyKind,
+            args.stream);
+        insert_row_to_buffer(row);
+      }
+      // Malloc Records
+      else if (isMallocApi(record.operation)) {
+        malloc_args args;
+        // malloc_args has no default member initializers; give both fields a
+        // defined value in case the callback does not report them.
+        args.ptr = "";
+        args.size = 0;
+        rocprofiler_iterate_callback_tracing_kind_operation_args(
+            record,
+            extract_malloc_args,
+            1 /*max_deref*/
+            ,
+            &args);
+        rocprofMallocRow* row = new rocprofMallocRow(
+            record.correlation_id.internal,
+            record.kind,
+            record.operation,
+            processId(),
+            systemThreadId(),
+            startTime,
+            endTime,
+            args.ptr,
+            args.size);
+        insert_row_to_buffer(row);
+      }
+      // Default Records
+      else {
+        rocprofRow* row = new rocprofRow(
+            record.correlation_id.internal,
+            record.kind,
+            record.operation,
+            processId(),
+            systemThreadId(),
+            startTime,
+            endTime);
+        insert_row_to_buffer(row);
+      }
+    } // ROCPROFILER_CALLBACK_PHASE_EXIT
+  } // ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API
+}
+
+// Buffer-tracing handler: turns each kernel-dispatch and memory-copy tracing
+// record in `headers` into a rocprofAsyncRow and appends it via
+// insert_row_to_buffer(). Headers of other categories/kinds are skipped.
+// context, buffer_id, user_data and drop_count are not used.
+void RocprofLogger::buffer_callback(
+    rocprofiler_context_id_t context,
+    rocprofiler_buffer_id_t buffer_id,
+    rocprofiler_record_header_t** headers,
+    size_t num_headers,
+    void* user_data,
+    uint64_t drop_count) {
+  for (size_t i = 0; i < num_headers; ++i) {
+    auto* header = headers[i];
+
+    if (header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING) {
+      if (header->kind == ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH) {
+        auto& record =
+            *(static_cast<rocprofiler_buffer_tracing_kernel_dispatch_record_t*>(
+                header->payload));
+        auto& dispatch = record.dispatch_info;
+
+        rocprofAsyncRow* row = new rocprofAsyncRow(
+            record.correlation_id.internal,
+            record.kind,
+            record.operation,
+            record.operation, // shared op - No longer a thing. 
Placeholder
+            s->agents.at(dispatch.agent_id.handle).logical_node_type_id,
+            dispatch.queue_id.handle,
+            record.start_timestamp,
+            record.end_timestamp,
+            s->kernel_names.at(dispatch.kernel_id));
+        insert_row_to_buffer(row);
+      } else if (header->kind == ROCPROFILER_BUFFER_TRACING_MEMORY_COPY) {
+        auto& record =
+            *(static_cast<rocprofiler_buffer_tracing_memory_copy_record_t*>(
+                header->payload));
+        rocprofAsyncRow* row = new rocprofAsyncRow(
+            record.correlation_id.internal,
+            record.kind,
+            record.operation,
+            record.operation, // shared op - No longer a thing. Placeholder
+            s->agents.at(record.dst_agent_id.handle).logical_node_type_id,
+            0,
+            record.start_timestamp,
+            record.end_timestamp,
+            "");
+        insert_row_to_buffer(row);
+      }
+    }
+  }
+}
+
+// Returns the name of operation `op` within callback-tracing kind `kind`,
+// looked up in the shared callback name table.
+std::string RocprofLogger::opString(
+    rocprofiler_callback_tracing_kind_t kind,
+    rocprofiler_tracing_operation_t op) {
+  return std::string(RocprofLoggerShared::singleton().name_info[kind][op]);
+}
+
+// Returns the name of operation `op` within buffer-tracing kind `kind`,
+// looked up in the shared buffer name table.
+std::string RocprofLogger::opString(
+    rocprofiler_buffer_tracing_kind_t kind,
+    rocprofiler_tracing_operation_t op) {
+  return std::string(RocprofLoggerShared::singleton().buff_name_info[kind][op]);
+}
+
+// Sets the maximum number of rows kept before insert_row_to_buffer() starts
+// dropping new ones. The update is made while holding the singleton's rows
+// mutex.
+void RocprofLogger::setMaxEvents(uint32_t maxBufferSize) {
+  RocprofLogger* dis = &singleton();
+  std::lock_guard<std::mutex> lock(dis->rowsMutex_);
+  maxBufferSize_ = maxBufferSize;
+}
+
+// Marks logging as active and starts the main rocprofiler context. If the
+// shared tool state was never created, only a warning is logged; logging_ is
+// still set, so stopLogging() must cope with a null shared state.
+void RocprofLogger::startLogging() {
+  // NOTE(review): empty branch - no registration step is performed here.
+  if (!registered_) {
+  }
+
+  externalCorrelationEnabled_ = true;
+  logging_ = true;
+  if (s != nullptr)
+    rocprofiler_start_context(s->context);
+  else
+    LOG(WARNING) << "Rocprofiler not configured";
+}
+
+// Stops logging: flushes the tracing buffer, then stops the main context.
+// Does nothing if logging is not active. Also called from the destructor.
+void RocprofLogger::stopLogging() {
+  if (logging_ == false)
+    return;
+  logging_ = false;
+
+  // startLogging() sets logging_ even when the shared state is missing, so
+  // check it before touching s->buffer or s->context.
+  if (s == nullptr)
+    return;
+
+  // Flush buffers
+  rocprofiler_flush_buffer(s->buffer);
+
+  rocprofiler_stop_context(s->context);
+}
+
+void RocprofLogger::endTracing() {
+  // This should be handled in RocprofLogger::toolFinialize
+}
+
+//
+// ApiIdList
+// Jump through some extra hoops
+//
+//
+RocprofApiIdList::RocprofApiIdList(callback_name_info& names) : nameMap_() {
auto& hipapis = + names[ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API].operations; + + for (size_t i = 0; i < hipapis.size(); ++i) { + nameMap_.emplace(hipapis[i], i); + } +} + +uint32_t RocprofApiIdList::mapName(const std::string& apiName) { + auto it = nameMap_.find(apiName); + if (it != nameMap_.end()) { + return it->second; + } + return 0; +} + +std::vector RocprofApiIdList::allEnabled() { + std::vector oplist; + for (auto& it : nameMap_) { + if (contains(it.second)) + oplist.push_back(it.second); + } + return oplist; +} +// +// +// diff --git a/libkineto/src/RocprofLogger.h b/libkineto/src/RocprofLogger.h new file mode 100644 index 000000000..2d7325951 --- /dev/null +++ b/libkineto/src/RocprofLogger.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "RocLogger.h" + +class RocprofLogger { + public: + RocprofLogger(); + RocprofLogger(const RocprofLogger&) = delete; + RocprofLogger& operator=(const RocprofLogger&) = delete; + + virtual ~RocprofLogger(); + + static RocprofLogger& singleton(); + + static void pushCorrelationID(uint64_t id, RocLogger::CorrelationDomain type); + static void popCorrelationID(RocLogger::CorrelationDomain type); + + void startLogging(); + void stopLogging(); + void clearLogs(); + void setMaxEvents(uint32_t maxBufferSize); + + static int toolInit( + rocprofiler_client_finalize_t finalize_func, + void* tool_data); + static void toolFinialize(void* tool_data); + + static std::string opString( + rocprofiler_callback_tracing_kind_t kind, + rocprofiler_tracing_operation_t op); + + static std::string opString( + rocprofiler_buffer_tracing_kind_t kind, + rocprofiler_tracing_operation_t op); + + private: + 
bool registered_{false}; + void endTracing(); + + static void insert_row_to_buffer(rocprofBase* row); + + // + static void api_callback( + rocprofiler_callback_tracing_record_t record, + rocprofiler_user_data_t* user_data, + void* callback_data); + static void buffer_callback( + rocprofiler_context_id_t context, + rocprofiler_buffer_id_t buffer_id, + rocprofiler_record_header_t** headers, + size_t num_headers, + void* user_data, + uint64_t drop_count); + static void code_object_callback( + rocprofiler_callback_tracing_record_t record, + rocprofiler_user_data_t* user_data, + void* callback_data); + + // Api callback data + uint32_t maxBufferSize_{5000000}; // 5M GPU runtime/kernel events. + std::vector rows_; + std::mutex rowsMutex_; + + // This vector collects pairs of correlationId and their respective + // externalCorrelationId for each CorrelationDomain. This will be used + // to populate the Correlation maps during post processing. + std::vector> + externalCorrelations_[RocLogger::CorrelationDomain::size]; + std::mutex externalCorrelationsMutex_; + + bool externalCorrelationEnabled_{true}; + bool logging_{false}; + + friend class libkineto::RocprofActivityApi; +}; diff --git a/libkineto/src/RoctracerActivity.h b/libkineto/src/RoctracerActivity.h index 72083ab6b..615eef52e 100644 --- a/libkineto/src/RoctracerActivity.h +++ b/libkineto/src/RoctracerActivity.h @@ -81,10 +81,10 @@ struct RoctracerActivity : public ITraceActivity { std::unordered_map metadata_; }; -// roctracerAsyncRow - Roctracer GPU activities -struct GpuActivity : public RoctracerActivity { +// rocprofAsyncRow - Roctracer GPU activities +struct GpuActivity : public RoctracerActivity { explicit GpuActivity( - const roctracerAsyncRow* activity, + const rocprofAsyncRow* activity, const ITraceActivity* linked) : RoctracerActivity(activity, linked) { switch (activity_.kind) { diff --git a/libkineto/src/RoctracerActivityApi.cpp b/libkineto/src/RoctracerActivityApi.cpp index 0bec09d4a..6a9ebcd9a 100644 
--- a/libkineto/src/RoctracerActivityApi.cpp +++ b/libkineto/src/RoctracerActivityApi.cpp @@ -40,7 +40,7 @@ void RoctracerActivityApi::pushCorrelationID(int id, CorrelationFlowType type) { return; } singleton().d->pushCorrelationID( - id, static_cast(type)); + id, static_cast(type)); #endif } @@ -50,7 +50,7 @@ void RoctracerActivityApi::popCorrelationID(CorrelationFlowType type) { return; } singleton().d->popCorrelationID( - static_cast(type)); + static_cast(type)); #endif } @@ -85,8 +85,8 @@ timestamp_t getTimeOffset() { } int RoctracerActivityApi::processActivities( - std::function handler, - std::function + std::function handler, + std::function correlationHandler) { // Find offset to map from monotonic clock to system clock. // This will break time-ordering of events but is status quo. @@ -94,15 +94,15 @@ int RoctracerActivityApi::processActivities( int count = 0; // Process all external correlations pairs - for (int it = RoctracerLogger::CorrelationDomain::begin; - it < RoctracerLogger::CorrelationDomain::end; + for (int it = RocLogger::CorrelationDomain::begin; + it < RocLogger::CorrelationDomain::end; ++it) { auto& externalCorrelations = d->externalCorrelations_[it]; for (auto& item : externalCorrelations) { correlationHandler( item.first, item.second, - static_cast(it)); + static_cast(it)); } std::lock_guard lock(d->externalCorrelationsMutex_); externalCorrelations.clear(); @@ -122,7 +122,7 @@ int RoctracerActivityApi::processActivities( !isLogged(ActivityType::CUDA_RUNTIME)) { filtered = true; } else { - switch (reinterpret_cast(item)->kind) { + switch (reinterpret_cast(item)->kind) { case HIP_OP_COPY_KIND_DEVICE_TO_HOST_: case HIP_OP_COPY_KIND_HOST_TO_DEVICE_: case HIP_OP_COPY_KIND_DEVICE_TO_DEVICE_: @@ -142,8 +142,7 @@ int RoctracerActivityApi::processActivities( if (!isLogged(ActivityType::CONCURRENT_KERNEL)) filtered = true; // Don't record barriers/markers - if (reinterpret_cast(item)->op == - HIP_OP_ID_BARRIER) + if (reinterpret_cast(item)->op == 
HIP_OP_ID_BARRIER) filtered = true; break; } diff --git a/libkineto/src/RoctracerActivityApi.h b/libkineto/src/RoctracerActivityApi.h index 54bf03f73..415d5b436 100644 --- a/libkineto/src/RoctracerActivityApi.h +++ b/libkineto/src/RoctracerActivityApi.h @@ -48,9 +48,8 @@ class RoctracerActivityApi { void setMaxEvents(uint32_t maxEvents); virtual int processActivities( - std::function handler, - std::function< - void(uint64_t, uint64_t, RoctracerLogger::CorrelationDomain)> + std::function handler, + std::function correlationHandler); void setMaxBufferSize(int size); diff --git a/libkineto/src/RoctracerActivity_inl.h b/libkineto/src/RoctracerActivity_inl.h index 56d54f6a9..30b189944 100644 --- a/libkineto/src/RoctracerActivity_inl.h +++ b/libkineto/src/RoctracerActivity_inl.h @@ -161,7 +161,7 @@ inline void RuntimeActivity::log(ActivityLogger& logger) const { } template <> -inline const std::string RuntimeActivity::metadataJson() +inline const std::string RuntimeActivity::metadataJson() const { std::string kernel = ""; if ((raw().functionAddr != nullptr)) { @@ -209,8 +209,7 @@ inline const std::string RuntimeActivity::metadataJson() } template <> -inline const std::string RuntimeActivity::metadataJson() - const { +inline const std::string RuntimeActivity::metadataJson() const { correlationToSize[raw().id] = raw().size; return fmt::format( R"JSON( @@ -224,7 +223,7 @@ inline const std::string RuntimeActivity::metadataJson() } template <> -inline const std::string RuntimeActivity::metadataJson() +inline const std::string RuntimeActivity::metadataJson() const { correlationToSize[raw().id] = raw().size; std::string size = ""; diff --git a/libkineto/src/RoctracerLogger.cpp b/libkineto/src/RoctracerLogger.cpp index 725c4d0b0..2e2b696af 100644 --- a/libkineto/src/RoctracerLogger.cpp +++ b/libkineto/src/RoctracerLogger.cpp @@ -35,6 +35,12 @@ class Flush { }; static Flush s_flush; +uint32_t RoctracerApiIdList::mapName(const std::string& apiName) { + uint32_t cid = 0; + 
roctracer_op_code(ACTIVITY_DOMAIN_HIP_API, apiName.c_str(), &cid, nullptr); + return cid; +} + RoctracerLogger& RoctracerLogger::singleton() { static RoctracerLogger instance; return instance; @@ -49,17 +55,19 @@ RoctracerLogger::~RoctracerLogger() { namespace { thread_local std::deque - t_externalIds[RoctracerLogger::CorrelationDomain::size]; + t_externalIds[RocLogger::CorrelationDomain::size]; } -void RoctracerLogger::pushCorrelationID(uint64_t id, CorrelationDomain type) { +void RoctracerLogger::pushCorrelationID( + uint64_t id, + RocLogger::CorrelationDomain type) { if (!singleton().externalCorrelationEnabled_) { return; } t_externalIds[type].push_back(id); } -void RoctracerLogger::popCorrelationID(CorrelationDomain type) { +void RoctracerLogger::popCorrelationID(RocLogger::CorrelationDomain type) { if (!singleton().externalCorrelationEnabled_) { return; } @@ -73,12 +81,12 @@ void RoctracerLogger::popCorrelationID(CorrelationDomain type) { void RoctracerLogger::clearLogs() { rows_.clear(); - for (int i = 0; i < CorrelationDomain::size; ++i) { + for (int i = 0; i < RocLogger::CorrelationDomain::size; ++i) { externalCorrelations_[i].clear(); } } -void RoctracerLogger::insert_row_to_buffer(roctracerBase* row) { +void RoctracerLogger::insert_row_to_buffer(rocprofBase* row) { RoctracerLogger* dis = &singleton(); std::lock_guard lock(dis->rowsMutex_); if (dis->rows_.size() >= dis->maxBufferSize_) { @@ -119,7 +127,7 @@ void RoctracerLogger::api_callback( { s_flush.reportCorrelation(data->correlation_id); auto& args = data->args.hipLaunchKernel; - roctracerKernelRow* row = new roctracerKernelRow( + rocprofKernelRow* row = new rocprofKernelRow( data->correlation_id, domain, cid, @@ -144,7 +152,7 @@ void RoctracerLogger::api_callback( case HIP_API_ID_hipExtModuleLaunchKernel: { s_flush.reportCorrelation(data->correlation_id); auto& args = data->args.hipModuleLaunchKernel; - roctracerKernelRow* row = new roctracerKernelRow( + rocprofKernelRow* row = new rocprofKernelRow( 
data->correlation_id, domain, cid, @@ -169,7 +177,7 @@ void RoctracerLogger::api_callback( #if 0 { auto &args = data->args.hipLaunchCooperativeKernelMultiDevice.launchParamsList__val; - roctracerKernelRow* row = new roctracerKernelRow( + rocprofKernelRow* row = new rocprofKernelRow( data->correlation_id, domain, cid, @@ -193,7 +201,7 @@ void RoctracerLogger::api_callback( #endif break; case HIP_API_ID_hipMalloc: { - roctracerMallocRow* row = new roctracerMallocRow( + rocprofMallocRow* row = new rocprofMallocRow( data->correlation_id, domain, cid, @@ -206,7 +214,7 @@ void RoctracerLogger::api_callback( insert_row_to_buffer(row); } break; case HIP_API_ID_hipFree: { - roctracerMallocRow* row = new roctracerMallocRow( + rocprofMallocRow* row = new rocprofMallocRow( data->correlation_id, domain, cid, @@ -220,7 +228,7 @@ void RoctracerLogger::api_callback( } break; case HIP_API_ID_hipMemcpy: { auto& args = data->args.hipMemcpy; - roctracerCopyRow* row = new roctracerCopyRow( + rocprofCopyRow* row = new rocprofCopyRow( data->correlation_id, domain, cid, @@ -239,7 +247,7 @@ void RoctracerLogger::api_callback( case HIP_API_ID_hipMemcpyAsync: case HIP_API_ID_hipMemcpyWithStream: { auto& args = data->args.hipMemcpyAsync; - roctracerCopyRow* row = new roctracerCopyRow( + rocprofCopyRow* row = new rocprofCopyRow( data->correlation_id, domain, cid, @@ -255,7 +263,7 @@ void RoctracerLogger::api_callback( insert_row_to_buffer(row); } break; default: { - roctracerRow* row = new roctracerRow( + rocprofRow* row = new rocprofRow( data->correlation_id, domain, cid, @@ -267,7 +275,8 @@ void RoctracerLogger::api_callback( } break; } // switch // External correlation - for (int it = CorrelationDomain::begin; it < CorrelationDomain::end; + for (int it = RocLogger::CorrelationDomain::begin; + it < RocLogger::CorrelationDomain::end; ++it) { if (t_externalIds[it].size() > 0) { std::lock_guard lock(dis->externalCorrelationsMutex_); @@ -293,7 +302,7 @@ void RoctracerLogger::activity_callback( 
if (record->correlation_id > s_flush.maxCompletedCorrelationId_) { s_flush.maxCompletedCorrelationId_ = record->correlation_id; } - roctracerAsyncRow* row = new roctracerAsyncRow( + rocprofAsyncRow* row = new rocprofAsyncRow( record->correlation_id, record->domain, record->kind, @@ -425,30 +434,3 @@ void RoctracerLogger::endTracing() { hccPool_ = nullptr; } } - -ApiIdList::ApiIdList() : invert_(true) {} - -void ApiIdList::add(const std::string& apiName) { - uint32_t cid = 0; - if (roctracer_op_code( - ACTIVITY_DOMAIN_HIP_API, apiName.c_str(), &cid, nullptr) == - ROCTRACER_STATUS_SUCCESS) { - filter_[cid] = 1; - } -} -void ApiIdList::remove(const std::string& apiName) { - uint32_t cid = 0; - if (roctracer_op_code( - ACTIVITY_DOMAIN_HIP_API, apiName.c_str(), &cid, nullptr) == - ROCTRACER_STATUS_SUCCESS) { - filter_.erase(cid); - } -} - -bool ApiIdList::loadUserPrefs() { - // placeholder - return false; -} -bool ApiIdList::contains(uint32_t apiId) { - return (filter_.find(apiId) != filter_.end()) ? !invert_ : invert_; // XOR -} diff --git a/libkineto/src/RoctracerLogger.h b/libkineto/src/RoctracerLogger.h index d28a9f08b..2e5387585 100644 --- a/libkineto/src/RoctracerLogger.h +++ b/libkineto/src/RoctracerLogger.h @@ -24,6 +24,8 @@ #include #include +#include "RocLogger.h" + // Local copy of hip op types. 
These are public (and stable) in later rocm // releases typedef enum { @@ -56,203 +58,13 @@ namespace libkineto { class RoctracerActivityApi; } -typedef uint64_t timestamp_t; - -static timestamp_t timespec_to_ns(const timespec& time) { - return ((timestamp_t)time.tv_sec * 1000000000) + time.tv_nsec; -} - -class ApiIdList { +class RoctracerApiIdList : public ApiIdList { public: - ApiIdList(); - bool invertMode() { - return invert_; - } - void setInvertMode(bool invert) { - invert_ = invert; - } - void add(const std::string& apiName); - void remove(const std::string& apiName); - bool loadUserPrefs(); - bool contains(uint32_t apiId); - const std::unordered_map& filterList() { - return filter_; - } - - private: - std::unordered_map filter_; - bool invert_; -}; - -typedef enum { - ROCTRACER_ACTIVITY_DEFAULT = 0, - ROCTRACER_ACTIVITY_KERNEL, - ROCTRACER_ACTIVITY_COPY, - ROCTRACER_ACTIVITY_MALLOC, - ROCTRACER_ACTIVITY_ASYNC, - ROCTRACER_ACTIVITY_NONE -} roctracer_activity_types; - -struct roctracerBase { - roctracerBase( - uint64_t id, - uint32_t domain, - uint64_t begin, - uint64_t end, - roctracer_activity_types type = ROCTRACER_ACTIVITY_NONE) - : id(id), begin(begin), end(end), domain(domain), type(type) {} - uint64_t id; // correlation_id - uint64_t begin; - uint64_t end; - uint32_t domain; - roctracer_activity_types type; -}; - -struct roctracerRow : public roctracerBase { - roctracerRow( - uint64_t id, - uint32_t domain, - uint32_t cid, - uint32_t pid, - uint32_t tid, - uint64_t begin, - uint64_t end, - roctracer_activity_types type = ROCTRACER_ACTIVITY_DEFAULT) - : roctracerBase(id, domain, begin, end, type), - cid(cid), - pid(pid), - tid(tid) {} - uint32_t cid; - uint32_t pid; - uint32_t tid; -}; - -struct roctracerKernelRow : public roctracerRow { - roctracerKernelRow( - uint64_t id, - uint32_t domain, - uint32_t cid, - uint32_t pid, - uint32_t tid, - uint64_t begin, - uint64_t end, - const void* faddr, - hipFunction_t function, - unsigned int gx, - unsigned int 
gy, - unsigned int gz, - unsigned int wx, - unsigned int wy, - unsigned int wz, - size_t gss, - hipStream_t stream, - roctracer_activity_types type = ROCTRACER_ACTIVITY_KERNEL) - : roctracerRow(id, domain, cid, pid, tid, begin, end, type), - functionAddr(faddr), - function(function), - gridX(gx), - gridY(gy), - gridZ(gz), - workgroupX(wx), - workgroupY(wy), - workgroupZ(wz), - groupSegmentSize(gss), - stream(stream) {} - const void* functionAddr; - hipFunction_t function; - unsigned int gridX; - unsigned int gridY; - unsigned int gridZ; - unsigned int workgroupX; - unsigned int workgroupY; - unsigned int workgroupZ; - size_t groupSegmentSize; - hipStream_t stream; -}; - -struct roctracerCopyRow : public roctracerRow { - roctracerCopyRow( - uint64_t id, - uint32_t domain, - uint32_t cid, - uint32_t pid, - uint32_t tid, - uint64_t begin, - uint64_t end, - const void* src, - const void* dst, - size_t size, - hipMemcpyKind kind, - hipStream_t stream, - roctracer_activity_types type = ROCTRACER_ACTIVITY_COPY) - : roctracerRow(id, domain, cid, pid, tid, begin, end, type), - src(src), - dst(dst), - size(size), - kind(kind), - stream(stream) {} - const void* src; - const void* dst; - size_t size; - hipMemcpyKind kind; - hipStream_t stream; -}; - -struct roctracerMallocRow : public roctracerRow { - roctracerMallocRow( - uint64_t id, - uint32_t domain, - uint32_t cid, - uint32_t pid, - uint32_t tid, - uint64_t begin, - uint64_t end, - const void* ptr, - size_t size, - roctracer_activity_types type = ROCTRACER_ACTIVITY_MALLOC) - : roctracerRow(id, domain, cid, pid, tid, begin, end, type), - ptr(ptr), - size(size) {} - const void* ptr; - size_t size; -}; - -struct roctracerAsyncRow : public roctracerBase { - roctracerAsyncRow( - uint64_t id, - uint32_t domain, - uint32_t kind, - uint32_t op, - int device, - uint64_t queue, - uint64_t begin, - uint64_t end, - const std::string& kernelName, - roctracer_activity_types type = ROCTRACER_ACTIVITY_ASYNC) - : roctracerBase(id, domain, 
begin, end, type), - kind(kind), - op(op), - device(device), - queue(queue), - kernelName(kernelName) {} - uint32_t kind; - uint32_t op; - int device; - uint64_t queue; - std::string kernelName; + uint32_t mapName(const std::string& apiName) override; }; class RoctracerLogger { public: - enum CorrelationDomain { - begin, - Default = begin, - Domain0 = begin, - Domain1, - end, - size = end - }; - RoctracerLogger(); RoctracerLogger(const RoctracerLogger&) = delete; RoctracerLogger& operator=(const RoctracerLogger&) = delete; @@ -261,8 +73,8 @@ class RoctracerLogger { static RoctracerLogger& singleton(); - static void pushCorrelationID(uint64_t id, CorrelationDomain type); - static void popCorrelationID(CorrelationDomain type); + static void pushCorrelationID(uint64_t id, RocLogger::CorrelationDomain type); + static void popCorrelationID(RocLogger::CorrelationDomain type); void startLogging(); void stopLogging(); @@ -274,7 +86,7 @@ class RoctracerLogger { void endTracing(); roctracer_pool_t* hccPool_{NULL}; - static void insert_row_to_buffer(roctracerBase* row); + static void insert_row_to_buffer(rocprofBase* row); static void api_callback( uint32_t domain, uint32_t cid, @@ -282,18 +94,18 @@ class RoctracerLogger { void* arg); static void activity_callback(const char* begin, const char* end, void* arg); - ApiIdList loggedIds_; + RoctracerApiIdList loggedIds_; // Api callback data uint32_t maxBufferSize_{5000000}; // 5M GPU runtime/kernel events. - std::vector rows_; + std::vector rows_; std::mutex rowsMutex_; // This vector collects pairs of correlationId and their respective // externalCorrelationId for each CorrelationDomain. This will be used // to populate the Correlation maps during post processing. std::vector> - externalCorrelations_[CorrelationDomain::size]; + externalCorrelations_[RocLogger::CorrelationDomain::size]; std::mutex externalCorrelationsMutex_; bool externalCorrelationEnabled_{true};