From 8ee0e9dccb5c42a2784b8aa60e15e904e6359436 Mon Sep 17 00:00:00 2001 From: Michael Wootton Date: Mon, 18 Aug 2025 16:05:11 -0500 Subject: [PATCH 1/7] Remove roctracer implementation --- libkineto/src/RoctracerActivity.h | 176 ---------- libkineto/src/RoctracerActivityApi.cpp | 214 ------------ libkineto/src/RoctracerActivityApi.h | 72 ---- libkineto/src/RoctracerActivity_inl.h | 255 -------------- libkineto/src/RoctracerLogger.cpp | 454 ------------------------- libkineto/src/RoctracerLogger.h | 304 ----------------- 6 files changed, 1475 deletions(-) delete mode 100644 libkineto/src/RoctracerActivity.h delete mode 100644 libkineto/src/RoctracerActivityApi.cpp delete mode 100644 libkineto/src/RoctracerActivityApi.h delete mode 100644 libkineto/src/RoctracerActivity_inl.h delete mode 100644 libkineto/src/RoctracerLogger.cpp delete mode 100644 libkineto/src/RoctracerLogger.h diff --git a/libkineto/src/RoctracerActivity.h b/libkineto/src/RoctracerActivity.h deleted file mode 100644 index 72083ab6b..000000000 --- a/libkineto/src/RoctracerActivity.h +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include -#include -#include - -// TODO(T90238193) -// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude -#include "GenericTraceActivity.h" -#include "ITraceActivity.h" -#include "RoctracerLogger.h" -#include "ThreadUtil.h" - -namespace libkineto { -class ActivityLogger; -} - -namespace KINETO_NAMESPACE { - -using namespace libkineto; -struct TraceSpan; - -// These classes wrap the various Roctracer activity types -// into subclasses of ITraceActivity so that they can all be accessed -// using the ITraceActivity interface and logged via ActivityLogger. 
- -// Abstract base class, templated on Roctracer activity type -template -struct RoctracerActivity : public ITraceActivity { - explicit RoctracerActivity(const T* activity, const ITraceActivity* linked) - : activity_(*activity), linked_(linked) {} - // Our stored timestamps (from roctracer and generated) are in CLOCK_MONOTONIC - // domain (in ns). Convert the timestamps. - int64_t timestamp() const override { - return activity_.begin; - } - int64_t duration() const override { - return activity_.end - activity_.begin; - } - int64_t correlationId() const override { - return 0; - } - int32_t getThreadId() const override { - return 0; - } - const ITraceActivity* linkedActivity() const override { - return linked_; - } - int flowType() const override { - return kLinkAsyncCpuGpu; - } - int64_t flowId() const override { - return correlationId(); - } - const T& raw() const { - return activity_; - } - const TraceSpan* traceSpan() const override { - return nullptr; - } - const std::string getMetadataValue(const std::string& key) const override { - auto it = metadata_.find(key); - if (it != metadata_.end()) { - return it->second; - } - return ""; - } - - protected: - const T& activity_; - const ITraceActivity* linked_{nullptr}; - std::unordered_map metadata_; -}; - -// roctracerAsyncRow - Roctracer GPU activities -struct GpuActivity : public RoctracerActivity { - explicit GpuActivity( - const roctracerAsyncRow* activity, - const ITraceActivity* linked) - : RoctracerActivity(activity, linked) { - switch (activity_.kind) { - case HIP_OP_COPY_KIND_DEVICE_TO_HOST_: - case HIP_OP_COPY_KIND_HOST_TO_DEVICE_: - case HIP_OP_COPY_KIND_DEVICE_TO_DEVICE_: - case HIP_OP_COPY_KIND_DEVICE_TO_HOST_2D_: - case HIP_OP_COPY_KIND_HOST_TO_DEVICE_2D_: - case HIP_OP_COPY_KIND_DEVICE_TO_DEVICE_2D_: - type_ = ActivityType::GPU_MEMCPY; - break; - case HIP_OP_COPY_KIND_FILL_BUFFER_: - type_ = ActivityType::GPU_MEMSET; - break; - case HIP_OP_DISPATCH_KIND_KERNEL_: - case HIP_OP_DISPATCH_KIND_TASK_: - 
default: - type_ = ActivityType::CONCURRENT_KERNEL; - break; - } - } - int64_t correlationId() const override { - return activity_.id; - } - int64_t deviceId() const override { - return activity_.device; - } - int64_t resourceId() const override { - return activity_.queue; - } - ActivityType type() const override { - return type_; - }; - bool flowStart() const override { - return false; - } - const std::string name() const override; - void log(ActivityLogger& logger) const override; - const std::string metadataJson() const override; - - // Add small buffer to fix visual error created by - // https://github.com/ROCm/roctracer/issues/105 Once this is resolved we can - // use ifdef to handle having this buffer or not based on version - int64_t timestamp() const override { - return activity_.begin + 1; - } - int64_t duration() const override { - return activity_.end - (activity_.begin + 1); - } - - private: - ActivityType type_; -}; - -// roctracerRow, roctracerKernelRow, roctracerCopyRow, roctracerMallocRow - -// Roctracer runtime activities -template -struct RuntimeActivity : public RoctracerActivity { - explicit RuntimeActivity(const T* activity, const ITraceActivity* linked) - : RoctracerActivity(activity, linked) {} - int64_t correlationId() const override { - return raw().id; - } - int64_t deviceId() const override { - return raw().pid; - } - int64_t resourceId() const override { - return raw().tid; - } - ActivityType type() const override { - return ActivityType::CUDA_RUNTIME; - } - bool flowStart() const override; - const std::string name() const override { - return std::string( - roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, raw().cid, 0)); - } - void log(ActivityLogger& logger) const override; - const std::string metadataJson() const override; - const T& raw() const { - return RoctracerActivity::raw(); - } -}; - -} // namespace KINETO_NAMESPACE - -// Include the implementation detail of this header file. 
-// The *_inl.h helps separate header interface from implementation details. -#include "RoctracerActivity_inl.h" diff --git a/libkineto/src/RoctracerActivityApi.cpp b/libkineto/src/RoctracerActivityApi.cpp deleted file mode 100644 index 0bec09d4a..000000000 --- a/libkineto/src/RoctracerActivityApi.cpp +++ /dev/null @@ -1,214 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include "RoctracerActivityApi.h" - -#include -#include -#include -#include -#include "ApproximateClock.h" -#include "Demangle.h" -#include "Logger.h" -#include "ThreadUtil.h" -#include "output_base.h" - -using namespace std::chrono; - -namespace KINETO_NAMESPACE { - -RoctracerActivityApi& RoctracerActivityApi::singleton() { - static RoctracerActivityApi instance; - return instance; -} - -RoctracerActivityApi::RoctracerActivityApi() - : d(&RoctracerLogger::singleton()) {} - -RoctracerActivityApi::~RoctracerActivityApi() { - disableActivities(std::set()); -} - -void RoctracerActivityApi::pushCorrelationID(int id, CorrelationFlowType type) { -#ifdef HAS_ROCTRACER - if (!singleton().d->externalCorrelationEnabled_) { - return; - } - singleton().d->pushCorrelationID( - id, static_cast(type)); -#endif -} - -void RoctracerActivityApi::popCorrelationID(CorrelationFlowType type) { -#ifdef HAS_ROCTRACER - if (!singleton().d->externalCorrelationEnabled_) { - return; - } - singleton().d->popCorrelationID( - static_cast(type)); -#endif -} - -void RoctracerActivityApi::setMaxBufferSize(int size) { - // FIXME: implement? 
- // maxGpuBufferCount_ = 1 + size / kBufSize; -} - -inline bool inRange(int64_t start, int64_t end, int64_t stamp) { - return ((stamp > start) && (stamp < end)); -} - -inline bool RoctracerActivityApi::isLogged( - libkineto::ActivityType atype) const { - return activityMaskSnapshot_ & (1 << static_cast(atype)); -} - -timestamp_t getTimeOffset() { - int64_t t0, t00; - timespec t1; - t0 = libkineto::getApproximateTime(); - clock_gettime(CLOCK_MONOTONIC, &t1); - t00 = libkineto::getApproximateTime(); - - // Confvert to ns (if necessary) - t0 = libkineto::get_time_converter()(t0); - t00 = libkineto::get_time_converter()(t00); - - // Our stored timestamps (from roctracer and generated) are in CLOCK_MONOTONIC - // domain (in ns). - return (t0 >> 1) + (t00 >> 1) - timespec_to_ns(t1); -} - -int RoctracerActivityApi::processActivities( - std::function handler, - std::function - correlationHandler) { - // Find offset to map from monotonic clock to system clock. - // This will break time-ordering of events but is status quo. - - int count = 0; - - // Process all external correlations pairs - for (int it = RoctracerLogger::CorrelationDomain::begin; - it < RoctracerLogger::CorrelationDomain::end; - ++it) { - auto& externalCorrelations = d->externalCorrelations_[it]; - for (auto& item : externalCorrelations) { - correlationHandler( - item.first, - item.second, - static_cast(it)); - } - std::lock_guard lock(d->externalCorrelationsMutex_); - externalCorrelations.clear(); - } - - // Async ops are in CLOCK_MONOTONIC rather than junk clock. - // Convert these timestamps, poorly. - // These accurate timestamps will skew when converted to approximate time - // The time_converter is not available at collection time. Or we could do a - // much better job. 
- auto toffset = getTimeOffset(); - - // All Runtime API Calls - for (auto& item : d->rows_) { - bool filtered = false; - if (item->type != ROCTRACER_ACTIVITY_ASYNC && - !isLogged(ActivityType::CUDA_RUNTIME)) { - filtered = true; - } else { - switch (reinterpret_cast(item)->kind) { - case HIP_OP_COPY_KIND_DEVICE_TO_HOST_: - case HIP_OP_COPY_KIND_HOST_TO_DEVICE_: - case HIP_OP_COPY_KIND_DEVICE_TO_DEVICE_: - case HIP_OP_COPY_KIND_DEVICE_TO_HOST_2D_: - case HIP_OP_COPY_KIND_HOST_TO_DEVICE_2D_: - case HIP_OP_COPY_KIND_DEVICE_TO_DEVICE_2D_: - if (!isLogged(ActivityType::GPU_MEMCPY)) - filtered = true; - break; - case HIP_OP_COPY_KIND_FILL_BUFFER_: - if (!isLogged(ActivityType::GPU_MEMSET)) - filtered = true; - break; - case HIP_OP_DISPATCH_KIND_KERNEL_: - case HIP_OP_DISPATCH_KIND_TASK_: - default: - if (!isLogged(ActivityType::CONCURRENT_KERNEL)) - filtered = true; - // Don't record barriers/markers - if (reinterpret_cast(item)->op == - HIP_OP_ID_BARRIER) - filtered = true; - break; - } - } - if (!filtered) { - // Convert the begin and end timestamps from monotonic clock to system - // clock. 
- if (item->type == ROCTRACER_ACTIVITY_ASYNC) { - // Async ops are in CLOCK_MONOTONIC, apply offset to converted - // approximate - item->begin += toffset; - item->end += toffset; - } else { - // Runtime ranges are in approximate clock, just apply conversion - item->begin = libkineto::get_time_converter()(item->begin); - item->end = libkineto::get_time_converter()(item->end); - } - handler(item); - ++count; - } - } - return count; -} - -// TODO: implement the actual flush with roctracer_flush_activity -void RoctracerActivityApi::flushActivities() {} - -void RoctracerActivityApi::clearActivities() { - d->clearLogs(); -} - -void RoctracerActivityApi::setMaxEvents(uint32_t maxEvents) { -#ifdef HAS_ROCTRACER - d->setMaxEvents(maxEvents); -#endif -} - -void RoctracerActivityApi::enableActivities( - const std::set& selected_activities) { -#ifdef HAS_ROCTRACER - d->startLogging(); - - for (const auto& activity : selected_activities) { - activityMask_ |= (1 << static_cast(activity)); - if (activity == ActivityType::EXTERNAL_CORRELATION) { - d->externalCorrelationEnabled_ = true; - } - } -#endif -} - -void RoctracerActivityApi::disableActivities( - const std::set& selected_activities) { -#ifdef HAS_ROCTRACER - d->stopLogging(); - - activityMaskSnapshot_ = activityMask_; - - for (const auto& activity : selected_activities) { - activityMask_ &= ~(1 << static_cast(activity)); - if (activity == ActivityType::EXTERNAL_CORRELATION) { - d->externalCorrelationEnabled_ = false; - } - } -#endif -} - -} // namespace KINETO_NAMESPACE diff --git a/libkineto/src/RoctracerActivityApi.h b/libkineto/src/RoctracerActivityApi.h deleted file mode 100644 index 54bf03f73..000000000 --- a/libkineto/src/RoctracerActivityApi.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once -#ifdef HAS_ROCTRACER - -#include -#include -#include - -#include -#include "RoctracerLogger.h" - -#include "ActivityType.h" -#include "GenericTraceActivity.h" - -class RoctracerLogger; - -namespace KINETO_NAMESPACE { - -using namespace libkineto; - -class RoctracerActivityApi { - public: - enum CorrelationFlowType { Default, User }; - - RoctracerActivityApi(); - RoctracerActivityApi(const RoctracerActivityApi&) = delete; - RoctracerActivityApi& operator=(const RoctracerActivityApi&) = delete; - - virtual ~RoctracerActivityApi(); - - static RoctracerActivityApi& singleton(); - - static void pushCorrelationID(int id, CorrelationFlowType type); - static void popCorrelationID(CorrelationFlowType type); - - void enableActivities(const std::set& selected_activities); - void disableActivities(const std::set& selected_activities); - void clearActivities(); - void flushActivities(); - void teardownContext() {} - void setMaxEvents(uint32_t maxEvents); - - virtual int processActivities( - std::function handler, - std::function< - void(uint64_t, uint64_t, RoctracerLogger::CorrelationDomain)> - correlationHandler); - - void setMaxBufferSize(int size); - - std::atomic_bool stopCollection{false}; - - private: - bool registered_{false}; - - // Enabled Activity Filters - uint32_t activityMask_{0}; - uint32_t activityMaskSnapshot_{0}; - bool isLogged(libkineto::ActivityType atype) const; - - RoctracerLogger* d; -}; - -} // namespace KINETO_NAMESPACE -#endif diff --git a/libkineto/src/RoctracerActivity_inl.h b/libkineto/src/RoctracerActivity_inl.h deleted file mode 100644 index 56d54f6a9..000000000 --- a/libkineto/src/RoctracerActivity_inl.h +++ /dev/null @@ -1,255 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once - -#include "RoctracerActivity.h" - -#include -#include -#include - -#include "Demangle.h" -#include "output_base.h" - -namespace KINETO_NAMESPACE { - -using namespace libkineto; - -namespace { -thread_local std::unordered_map correlationToGrid; -thread_local std::unordered_map correlationToBlock; -thread_local std::unordered_map correlationToSize; -} // namespace - -const char* getGpuActivityKindString(uint32_t kind) { - switch (kind) { - case HIP_OP_COPY_KIND_DEVICE_TO_HOST_: - case HIP_OP_COPY_KIND_DEVICE_TO_HOST_2D_: - return "DtoH"; - case HIP_OP_COPY_KIND_HOST_TO_DEVICE_: - case HIP_OP_COPY_KIND_HOST_TO_DEVICE_2D_: - return "HtoD"; - case HIP_OP_COPY_KIND_DEVICE_TO_DEVICE_: - case HIP_OP_COPY_KIND_DEVICE_TO_DEVICE_2D_: - return "DtoD"; - case HIP_OP_COPY_KIND_FILL_BUFFER_: - return "Device"; - case HIP_OP_DISPATCH_KIND_KERNEL_: - return "Dispatch Kernel"; - case HIP_OP_DISPATCH_KIND_TASK_: - return "Dispatch Task"; - default: - break; - } - return ""; -} - -void getMemcpySrcDstString(uint32_t kind, std::string& src, std::string& dst) { - switch (kind) { - case HIP_OP_COPY_KIND_DEVICE_TO_HOST_: - case HIP_OP_COPY_KIND_DEVICE_TO_HOST_2D_: - src = "Device"; - dst = "Host"; - break; - case HIP_OP_COPY_KIND_HOST_TO_DEVICE_: - case HIP_OP_COPY_KIND_HOST_TO_DEVICE_2D_: - src = "Host"; - dst = "Device"; - break; - case HIP_OP_COPY_KIND_DEVICE_TO_DEVICE_: - case HIP_OP_COPY_KIND_DEVICE_TO_DEVICE_2D_: - src = "Device"; - dst = "Device"; - break; - default: - src = "?"; - dst = "?"; - break; - } -} - -// GPU Activities - -inline const std::string GpuActivity::name() const { - if (type_ == ActivityType::CONCURRENT_KERNEL) { - const char* name = roctracer_op_string(raw().domain, raw().op, raw().kind); - return demangle( - raw().kernelName.length() > 0 ? 
raw().kernelName : std::string(name)); - } else if (type_ == ActivityType::GPU_MEMSET) { - return fmt::format("Memset ({})", getGpuActivityKindString(raw().kind)); - } else if (type_ == ActivityType::GPU_MEMCPY) { - std::string src = ""; - std::string dst = ""; - getMemcpySrcDstString(raw().kind, src, dst); - return fmt::format( - "Memcpy {} ({} -> {})", getGpuActivityKindString(raw().kind), src, dst); - } else { - return ""; - } -} - -inline void GpuActivity::log(ActivityLogger& logger) const { - logger.handleActivity(*this); -} - -static inline std::string bandwidth(size_t bytes, uint64_t duration) { - return duration == 0 ? "\"N/A\"" : fmt::format("{}", bytes * 1.0 / duration); -} - -inline const std::string GpuActivity::metadataJson() const { - const auto& gpuActivity = raw(); - // clang-format off - - // if memcpy or memset, add size - if (correlationToSize.count(gpuActivity.id) > 0) { - size_t size = correlationToSize[gpuActivity.id]; - std::string bandwidth_gib = (bandwidth(size, gpuActivity.end - gpuActivity.begin)); - return fmt::format(R"JSON( - "device": {}, "stream": {}, - "correlation": {}, "kind": "{}", - "bytes": {}, "memory bandwidth (GB/s)": {})JSON", - gpuActivity.device, gpuActivity.queue, - gpuActivity.id, getGpuActivityKindString(gpuActivity.kind), - size, bandwidth_gib); - } - - // if compute kernel, add grid and block - else if (correlationToGrid.count(gpuActivity.id) > 0) { - return fmt::format(R"JSON( - "device": {}, "stream": {}, - "correlation": {}, "kind": "{}", - "grid": {}, "block": {})JSON", - gpuActivity.device, gpuActivity.queue, - gpuActivity.id, getGpuActivityKindString(gpuActivity.kind), - correlationToGrid[gpuActivity.id], correlationToBlock[gpuActivity.id]); - } else { - return fmt::format(R"JSON( - "device": {}, "stream": {}, - "correlation": {}, "kind": "{}")JSON", - gpuActivity.device, gpuActivity.queue, - gpuActivity.id, getGpuActivityKindString(gpuActivity.kind)); - } - // clang-format on -} - -// Runtime Activities - 
-template -inline bool RuntimeActivity::flowStart() const { - bool should_correlate = raw().cid == HIP_API_ID_hipLaunchKernel || - raw().cid == HIP_API_ID_hipExtLaunchKernel || - raw().cid == HIP_API_ID_hipLaunchCooperativeKernel || - raw().cid == HIP_API_ID_hipHccModuleLaunchKernel || - raw().cid == HIP_API_ID_hipModuleLaunchKernel || - raw().cid == HIP_API_ID_hipExtModuleLaunchKernel || - raw().cid == HIP_API_ID_hipMalloc || raw().cid == HIP_API_ID_hipFree || - raw().cid == HIP_API_ID_hipMemcpy || - raw().cid == HIP_API_ID_hipMemcpyAsync || - raw().cid == HIP_API_ID_hipMemcpyWithStream; - return should_correlate; -} - -template -inline void RuntimeActivity::log(ActivityLogger& logger) const { - logger.handleActivity(*this); -} - -template <> -inline const std::string RuntimeActivity::metadataJson() - const { - std::string kernel = ""; - if ((raw().functionAddr != nullptr)) { - kernel = fmt::format( - R"JSON( - "kernel": "{}", )JSON", - demangle(hipKernelNameRefByPtr(raw().functionAddr, raw().stream))); - } else if ((raw().function != nullptr)) { - kernel = fmt::format( - R"JSON( - "kernel": "{}", )JSON", - demangle(hipKernelNameRef(raw().function))); - } - // cache grid and block so we can pass it into async activity (GPU track) - correlationToGrid[raw().id] = fmt::format( - R"JSON( - [{}, {}, {}])JSON", - raw().gridX, - raw().gridY, - raw().gridZ); - - correlationToBlock[raw().id] = fmt::format( - R"JSON( - [{}, {}, {}])JSON", - raw().workgroupX, - raw().workgroupY, - raw().workgroupZ); - - return fmt::format( - R"JSON( - {}"cid": {}, "correlation": {}, - "grid": [{}, {}, {}], - "block": [{}, {}, {}], - "shared memory": {})JSON", - kernel, - raw().cid, - raw().id, - raw().gridX, - raw().gridY, - raw().gridZ, - raw().workgroupX, - raw().workgroupY, - raw().workgroupZ, - raw().groupSegmentSize); -} - -template <> -inline const std::string RuntimeActivity::metadataJson() - const { - correlationToSize[raw().id] = raw().size; - return fmt::format( - R"JSON( - "cid": 
{}, "correlation": {}, "src": "{}", "dst": "{}", "bytes": "{}", "kind": "{}")JSON", - raw().cid, - raw().id, - raw().src, - raw().dst, - raw().size, - fmt::underlying(raw().kind)); -} - -template <> -inline const std::string RuntimeActivity::metadataJson() - const { - correlationToSize[raw().id] = raw().size; - std::string size = ""; - if (raw().cid == HIP_API_ID_hipMalloc) { - size = fmt::format( - R"JSON( - "bytes": {}, )JSON", - raw().size); - } - return fmt::format( - R"JSON( - {}"cid": {}, "correlation": {}, "ptr": "{}")JSON", - size, - raw().cid, - raw().id, - raw().ptr); -} - -template -inline const std::string RuntimeActivity::metadataJson() const { - return fmt::format( - R"JSON( - "cid": {}, "correlation": {})JSON", - raw().cid, - raw().id); -} - -} // namespace KINETO_NAMESPACE diff --git a/libkineto/src/RoctracerLogger.cpp b/libkineto/src/RoctracerLogger.cpp deleted file mode 100644 index 725c4d0b0..000000000 --- a/libkineto/src/RoctracerLogger.cpp +++ /dev/null @@ -1,454 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include "RoctracerLogger.h" - -#include -#include -#include -#include -#include - -#include "ApproximateClock.h" -#include "Demangle.h" -#include "Logger.h" -#include "ThreadUtil.h" - -using namespace libkineto; -using namespace std::chrono; - -class Flush { - public: - std::mutex mutex_; - std::atomic maxCorrelationId_; - uint64_t maxCompletedCorrelationId_{0}; - void reportCorrelation(const uint64_t& cid) { - uint64_t prev = maxCorrelationId_; - while (prev < cid && !maxCorrelationId_.compare_exchange_weak(prev, cid)) { - } - } -}; -static Flush s_flush; - -RoctracerLogger& RoctracerLogger::singleton() { - static RoctracerLogger instance; - return instance; -} - -RoctracerLogger::RoctracerLogger() {} - -RoctracerLogger::~RoctracerLogger() { - stopLogging(); - endTracing(); -} - -namespace { -thread_local std::deque - t_externalIds[RoctracerLogger::CorrelationDomain::size]; -} - -void RoctracerLogger::pushCorrelationID(uint64_t id, CorrelationDomain type) { - if (!singleton().externalCorrelationEnabled_) { - return; - } - t_externalIds[type].push_back(id); -} - -void RoctracerLogger::popCorrelationID(CorrelationDomain type) { - if (!singleton().externalCorrelationEnabled_) { - return; - } - if (!t_externalIds[type].empty()) { - t_externalIds[type].pop_back(); - } else { - LOG(ERROR) - << "Attempt to popCorrelationID from an empty external Ids stack"; - } -} - -void RoctracerLogger::clearLogs() { - rows_.clear(); - for (int i = 0; i < CorrelationDomain::size; ++i) { - externalCorrelations_[i].clear(); - } -} - -void RoctracerLogger::insert_row_to_buffer(roctracerBase* row) { - RoctracerLogger* dis = &singleton(); - std::lock_guard lock(dis->rowsMutex_); - if (dis->rows_.size() >= dis->maxBufferSize_) { - LOG_FIRST_N(WARNING, 10) - << "Exceeded max GPU buffer count (" << dis->rows_.size() << " > " - << dis->maxBufferSize_ << ") - terminating tracing"; - return; - } - dis->rows_.push_back(row); -} - -void RoctracerLogger::api_callback( - uint32_t domain, - 
uint32_t cid, - const void* callback_data, - void* arg) { - RoctracerLogger* dis = &singleton(); - - if (domain == ACTIVITY_DOMAIN_HIP_API && dis->loggedIds_.contains(cid)) { - const hip_api_data_t* data = (const hip_api_data_t*)(callback_data); - - // Pack callbacks into row structures - - thread_local std::unordered_map - timestamps; - - if (data->phase == ACTIVITY_API_PHASE_ENTER) { - timestamps[data->correlation_id] = getApproximateTime(); - } else { // (data->phase == ACTIVITY_API_PHASE_EXIT) - uint64_t startTime = timestamps[data->correlation_id]; - timestamps.erase(data->correlation_id); - uint64_t endTime = getApproximateTime(); - - switch (cid) { - case HIP_API_ID_hipLaunchKernel: - case HIP_API_ID_hipExtLaunchKernel: - case HIP_API_ID_hipLaunchCooperativeKernel: // Should work here - { - s_flush.reportCorrelation(data->correlation_id); - auto& args = data->args.hipLaunchKernel; - roctracerKernelRow* row = new roctracerKernelRow( - data->correlation_id, - domain, - cid, - processId(), - systemThreadId(), - startTime, - endTime, - args.function_address, - nullptr, - args.numBlocks.x, - args.numBlocks.y, - args.numBlocks.z, - args.dimBlocks.x, - args.dimBlocks.y, - args.dimBlocks.z, - args.sharedMemBytes, - args.stream); - insert_row_to_buffer(row); - } break; - case HIP_API_ID_hipHccModuleLaunchKernel: - case HIP_API_ID_hipModuleLaunchKernel: - case HIP_API_ID_hipExtModuleLaunchKernel: { - s_flush.reportCorrelation(data->correlation_id); - auto& args = data->args.hipModuleLaunchKernel; - roctracerKernelRow* row = new roctracerKernelRow( - data->correlation_id, - domain, - cid, - processId(), - systemThreadId(), - startTime, - endTime, - nullptr, - args.f, - args.gridDimX, - args.gridDimY, - args.gridDimZ, - args.blockDimX, - args.blockDimY, - args.blockDimZ, - args.sharedMemBytes, - args.stream); - insert_row_to_buffer(row); - } break; - case HIP_API_ID_hipLaunchCooperativeKernelMultiDevice: - case HIP_API_ID_hipExtLaunchMultiKernelMultiDevice: -#if 0 - { - 
auto &args = data->args.hipLaunchCooperativeKernelMultiDevice.launchParamsList__val; - roctracerKernelRow* row = new roctracerKernelRow( - data->correlation_id, - domain, - cid, - processId(), - systemThreadId(), - startTime, - endTime, - args.function_address, - nullptr, - args.numBlocks.x, - args.numBlocks.y, - args.numBlocks.z, - args.dimBlocks.x, - args.dimBlocks.y, - args.dimBlocks.z, - args.sharedMemBytes, - args.stream - ); - insert_row_to_buffer(row); - } -#endif - break; - case HIP_API_ID_hipMalloc: { - roctracerMallocRow* row = new roctracerMallocRow( - data->correlation_id, - domain, - cid, - processId(), - systemThreadId(), - startTime, - endTime, - data->args.hipMalloc.ptr__val, - data->args.hipMalloc.size); - insert_row_to_buffer(row); - } break; - case HIP_API_ID_hipFree: { - roctracerMallocRow* row = new roctracerMallocRow( - data->correlation_id, - domain, - cid, - processId(), - systemThreadId(), - startTime, - endTime, - data->args.hipFree.ptr, - 0); - insert_row_to_buffer(row); - } break; - case HIP_API_ID_hipMemcpy: { - auto& args = data->args.hipMemcpy; - roctracerCopyRow* row = new roctracerCopyRow( - data->correlation_id, - domain, - cid, - processId(), - systemThreadId(), - startTime, - endTime, - args.src, - args.dst, - args.sizeBytes, - args.kind, - static_cast(0) // use placeholder? 
- ); - insert_row_to_buffer(row); - } break; - case HIP_API_ID_hipMemcpyAsync: - case HIP_API_ID_hipMemcpyWithStream: { - auto& args = data->args.hipMemcpyAsync; - roctracerCopyRow* row = new roctracerCopyRow( - data->correlation_id, - domain, - cid, - processId(), - systemThreadId(), - startTime, - endTime, - args.src, - args.dst, - args.sizeBytes, - args.kind, - args.stream); - insert_row_to_buffer(row); - } break; - default: { - roctracerRow* row = new roctracerRow( - data->correlation_id, - domain, - cid, - processId(), - systemThreadId(), - startTime, - endTime); - insert_row_to_buffer(row); - } break; - } // switch - // External correlation - for (int it = CorrelationDomain::begin; it < CorrelationDomain::end; - ++it) { - if (t_externalIds[it].size() > 0) { - std::lock_guard lock(dis->externalCorrelationsMutex_); - dis->externalCorrelations_[it].emplace_back( - data->correlation_id, t_externalIds[it].back()); - } - } - } // phase exit - } -} - -void RoctracerLogger::activity_callback( - const char* begin, - const char* end, - void* arg) { - // Log latest completed correlation id. Used to ensure we have flushed all - // data on stop - std::unique_lock lock(s_flush.mutex_); - const roctracer_record_t* record = (const roctracer_record_t*)(begin); - const roctracer_record_t* end_record = (const roctracer_record_t*)(end); - - while (record < end_record) { - if (record->correlation_id > s_flush.maxCompletedCorrelationId_) { - s_flush.maxCompletedCorrelationId_ = record->correlation_id; - } - roctracerAsyncRow* row = new roctracerAsyncRow( - record->correlation_id, - record->domain, - record->kind, - record->op, - record->device_id, - record->queue_id, - record->begin_ns, - record->end_ns, - ((record->kind == HIP_OP_DISPATCH_KIND_KERNEL_) || - (record->kind == HIP_OP_DISPATCH_KIND_TASK_)) - ? 
demangle(record->kernel_name) - : std::string()); - insert_row_to_buffer(row); - roctracer_next_record(record, &record); - } -} - -void RoctracerLogger::setMaxEvents(uint32_t maxBufferSize) { -#ifdef HAS_ROCTRACER - RoctracerLogger* dis = &singleton(); - std::lock_guard lock(dis->rowsMutex_); - maxBufferSize_ = maxBufferSize; -#endif -} - -void RoctracerLogger::startLogging() { - if (!registered_) { - roctracer_set_properties( - ACTIVITY_DOMAIN_HIP_API, nullptr); // Magic encantation - - // Set some api calls to ignore - loggedIds_.setInvertMode(true); // Omit the specified api - loggedIds_.add("hipGetDevice"); - loggedIds_.add("hipSetDevice"); - loggedIds_.add("hipGetLastError"); - loggedIds_.add("__hipPushCallConfiguration"); - loggedIds_.add("__hipPopCallConfiguration"); - loggedIds_.add("hipCtxSetCurrent"); - loggedIds_.add("hipEventRecord"); - loggedIds_.add("hipEventQuery"); - loggedIds_.add("hipGetDeviceProperties"); - loggedIds_.add("hipPeekAtLastError"); - loggedIds_.add("hipModuleGetFunction"); - loggedIds_.add("hipEventCreateWithFlags"); - loggedIds_.add("hipGetDeviceCount"); - loggedIds_.add("hipDevicePrimaryCtxGetState"); - - // Enable API callbacks - if (loggedIds_.invertMode() == true) { - // exclusion list - enable entire domain and turn off things in list - roctracer_enable_domain_callback( - ACTIVITY_DOMAIN_HIP_API, api_callback, nullptr); - const std::unordered_map& filter = - loggedIds_.filterList(); - for (auto it = filter.begin(); it != filter.end(); ++it) { - roctracer_disable_op_callback(ACTIVITY_DOMAIN_HIP_API, it->first); - } - } else { - // inclusion list - only enable things in the list - const std::unordered_map& filter = - loggedIds_.filterList(); - roctracer_disable_domain_callback(ACTIVITY_DOMAIN_HIP_API); - for (auto it = filter.begin(); it != filter.end(); ++it) { - roctracer_enable_op_callback( - ACTIVITY_DOMAIN_HIP_API, it->first, api_callback, nullptr); - } - } - // roctracer_enable_domain_callback(ACTIVITY_DOMAIN_ROCTX, 
api_callback, - // nullptr); - - // Allocate default tracing pool - roctracer_properties_t properties; - memset(&properties, 0, sizeof(roctracer_properties_t)); - properties.buffer_size = 0x1000; - roctracer_open_pool(&properties); - - // Enable async op collection - roctracer_properties_t hcc_cb_properties; - memset(&hcc_cb_properties, 0, sizeof(roctracer_properties_t)); - hcc_cb_properties.buffer_size = 0x4000; - hcc_cb_properties.buffer_callback_fun = activity_callback; - roctracer_open_pool_expl(&hcc_cb_properties, &hccPool_); - roctracer_enable_domain_activity_expl(ACTIVITY_DOMAIN_HCC_OPS, hccPool_); - - registered_ = true; - } - - externalCorrelationEnabled_ = true; - logging_ = true; - roctracer_start(); -} - -void RoctracerLogger::stopLogging() { - if (logging_ == false) - return; - logging_ = false; - - hipError_t err = hipDeviceSynchronize(); - if (err != hipSuccess) { - LOG(ERROR) << "hipDeviceSynchronize failed with code " << err; - } - roctracer_flush_activity_expl(hccPool_); - - // If we are stopping the tracer, implement reliable flushing - std::unique_lock lock(s_flush.mutex_); - - auto correlationId = - s_flush.maxCorrelationId_.load(); // load ending id from the running max - - // Poll on the worker finding the final correlation id - int timeout = 50; - while ((s_flush.maxCompletedCorrelationId_ < correlationId) && --timeout) { - lock.unlock(); - roctracer_flush_activity_expl(hccPool_); - usleep(1000); - lock.lock(); - } - - roctracer_stop(); -} - -void RoctracerLogger::endTracing() { - if (registered_ == true) { - roctracer_disable_domain_callback(ACTIVITY_DOMAIN_HIP_API); - // roctracer_disable_domain_callback(ACTIVITY_DOMAIN_ROCTX); - - roctracer_disable_domain_activity(ACTIVITY_DOMAIN_HCC_OPS); - roctracer_close_pool_expl(hccPool_); - hccPool_ = nullptr; - } -} - -ApiIdList::ApiIdList() : invert_(true) {} - -void ApiIdList::add(const std::string& apiName) { - uint32_t cid = 0; - if (roctracer_op_code( - ACTIVITY_DOMAIN_HIP_API, 
apiName.c_str(), &cid, nullptr) == - ROCTRACER_STATUS_SUCCESS) { - filter_[cid] = 1; - } -} -void ApiIdList::remove(const std::string& apiName) { - uint32_t cid = 0; - if (roctracer_op_code( - ACTIVITY_DOMAIN_HIP_API, apiName.c_str(), &cid, nullptr) == - ROCTRACER_STATUS_SUCCESS) { - filter_.erase(cid); - } -} - -bool ApiIdList::loadUserPrefs() { - // placeholder - return false; -} -bool ApiIdList::contains(uint32_t apiId) { - return (filter_.find(apiId) != filter_.end()) ? !invert_ : invert_; // XOR -} diff --git a/libkineto/src/RoctracerLogger.h b/libkineto/src/RoctracerLogger.h deleted file mode 100644 index d28a9f08b..000000000 --- a/libkineto/src/RoctracerLogger.h +++ /dev/null @@ -1,304 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -// Local copy of hip op types. 
These are public (and stable) in later rocm -// releases -typedef enum { - HIP_OP_COPY_KIND_UNKNOWN_ = 0, - HIP_OP_COPY_KIND_DEVICE_TO_HOST_ = 0x11F3, - HIP_OP_COPY_KIND_HOST_TO_DEVICE_ = 0x11F4, - HIP_OP_COPY_KIND_DEVICE_TO_DEVICE_ = 0x11F5, - HIP_OP_COPY_KIND_DEVICE_TO_HOST_2D_ = 0x1201, - HIP_OP_COPY_KIND_HOST_TO_DEVICE_2D_ = 0x1202, - HIP_OP_COPY_KIND_DEVICE_TO_DEVICE_2D_ = 0x1203, - HIP_OP_COPY_KIND_FILL_BUFFER_ = 0x1207 -} hip_op_copy_kind_t_; - -typedef enum { - HIP_OP_DISPATCH_KIND_UNKNOWN_ = 0, - HIP_OP_DISPATCH_KIND_KERNEL_ = 0x11F0, - HIP_OP_DISPATCH_KIND_TASK_ = 0x11F1 -} hip_op_dispatch_kind_t_; - -typedef enum { HIP_OP_BARRIER_KIND_UNKNOWN_ = 0 } hip_op_barrier_kind_t_; -// end hip op defines - -namespace onnxruntime { -namespace profiling { -class RocmProfiler; -} -} // namespace onnxruntime - -namespace libkineto { -class RoctracerActivityApi; -} - -typedef uint64_t timestamp_t; - -static timestamp_t timespec_to_ns(const timespec& time) { - return ((timestamp_t)time.tv_sec * 1000000000) + time.tv_nsec; -} - -class ApiIdList { - public: - ApiIdList(); - bool invertMode() { - return invert_; - } - void setInvertMode(bool invert) { - invert_ = invert; - } - void add(const std::string& apiName); - void remove(const std::string& apiName); - bool loadUserPrefs(); - bool contains(uint32_t apiId); - const std::unordered_map& filterList() { - return filter_; - } - - private: - std::unordered_map filter_; - bool invert_; -}; - -typedef enum { - ROCTRACER_ACTIVITY_DEFAULT = 0, - ROCTRACER_ACTIVITY_KERNEL, - ROCTRACER_ACTIVITY_COPY, - ROCTRACER_ACTIVITY_MALLOC, - ROCTRACER_ACTIVITY_ASYNC, - ROCTRACER_ACTIVITY_NONE -} roctracer_activity_types; - -struct roctracerBase { - roctracerBase( - uint64_t id, - uint32_t domain, - uint64_t begin, - uint64_t end, - roctracer_activity_types type = ROCTRACER_ACTIVITY_NONE) - : id(id), begin(begin), end(end), domain(domain), type(type) {} - uint64_t id; // correlation_id - uint64_t begin; - uint64_t end; - uint32_t domain; - 
roctracer_activity_types type; -}; - -struct roctracerRow : public roctracerBase { - roctracerRow( - uint64_t id, - uint32_t domain, - uint32_t cid, - uint32_t pid, - uint32_t tid, - uint64_t begin, - uint64_t end, - roctracer_activity_types type = ROCTRACER_ACTIVITY_DEFAULT) - : roctracerBase(id, domain, begin, end, type), - cid(cid), - pid(pid), - tid(tid) {} - uint32_t cid; - uint32_t pid; - uint32_t tid; -}; - -struct roctracerKernelRow : public roctracerRow { - roctracerKernelRow( - uint64_t id, - uint32_t domain, - uint32_t cid, - uint32_t pid, - uint32_t tid, - uint64_t begin, - uint64_t end, - const void* faddr, - hipFunction_t function, - unsigned int gx, - unsigned int gy, - unsigned int gz, - unsigned int wx, - unsigned int wy, - unsigned int wz, - size_t gss, - hipStream_t stream, - roctracer_activity_types type = ROCTRACER_ACTIVITY_KERNEL) - : roctracerRow(id, domain, cid, pid, tid, begin, end, type), - functionAddr(faddr), - function(function), - gridX(gx), - gridY(gy), - gridZ(gz), - workgroupX(wx), - workgroupY(wy), - workgroupZ(wz), - groupSegmentSize(gss), - stream(stream) {} - const void* functionAddr; - hipFunction_t function; - unsigned int gridX; - unsigned int gridY; - unsigned int gridZ; - unsigned int workgroupX; - unsigned int workgroupY; - unsigned int workgroupZ; - size_t groupSegmentSize; - hipStream_t stream; -}; - -struct roctracerCopyRow : public roctracerRow { - roctracerCopyRow( - uint64_t id, - uint32_t domain, - uint32_t cid, - uint32_t pid, - uint32_t tid, - uint64_t begin, - uint64_t end, - const void* src, - const void* dst, - size_t size, - hipMemcpyKind kind, - hipStream_t stream, - roctracer_activity_types type = ROCTRACER_ACTIVITY_COPY) - : roctracerRow(id, domain, cid, pid, tid, begin, end, type), - src(src), - dst(dst), - size(size), - kind(kind), - stream(stream) {} - const void* src; - const void* dst; - size_t size; - hipMemcpyKind kind; - hipStream_t stream; -}; - -struct roctracerMallocRow : public roctracerRow { - 
roctracerMallocRow( - uint64_t id, - uint32_t domain, - uint32_t cid, - uint32_t pid, - uint32_t tid, - uint64_t begin, - uint64_t end, - const void* ptr, - size_t size, - roctracer_activity_types type = ROCTRACER_ACTIVITY_MALLOC) - : roctracerRow(id, domain, cid, pid, tid, begin, end, type), - ptr(ptr), - size(size) {} - const void* ptr; - size_t size; -}; - -struct roctracerAsyncRow : public roctracerBase { - roctracerAsyncRow( - uint64_t id, - uint32_t domain, - uint32_t kind, - uint32_t op, - int device, - uint64_t queue, - uint64_t begin, - uint64_t end, - const std::string& kernelName, - roctracer_activity_types type = ROCTRACER_ACTIVITY_ASYNC) - : roctracerBase(id, domain, begin, end, type), - kind(kind), - op(op), - device(device), - queue(queue), - kernelName(kernelName) {} - uint32_t kind; - uint32_t op; - int device; - uint64_t queue; - std::string kernelName; -}; - -class RoctracerLogger { - public: - enum CorrelationDomain { - begin, - Default = begin, - Domain0 = begin, - Domain1, - end, - size = end - }; - - RoctracerLogger(); - RoctracerLogger(const RoctracerLogger&) = delete; - RoctracerLogger& operator=(const RoctracerLogger&) = delete; - - virtual ~RoctracerLogger(); - - static RoctracerLogger& singleton(); - - static void pushCorrelationID(uint64_t id, CorrelationDomain type); - static void popCorrelationID(CorrelationDomain type); - - void startLogging(); - void stopLogging(); - void clearLogs(); - void setMaxEvents(uint32_t maxBufferSize); - - private: - bool registered_{false}; - void endTracing(); - - roctracer_pool_t* hccPool_{NULL}; - static void insert_row_to_buffer(roctracerBase* row); - static void api_callback( - uint32_t domain, - uint32_t cid, - const void* callback_data, - void* arg); - static void activity_callback(const char* begin, const char* end, void* arg); - - ApiIdList loggedIds_; - - // Api callback data - uint32_t maxBufferSize_{5000000}; // 5M GPU runtime/kernel events. 
- std::vector rows_; - std::mutex rowsMutex_; - - // This vector collects pairs of correlationId and their respective - // externalCorrelationId for each CorrelationDomain. This will be used - // to populate the Correlation maps during post processing. - std::vector> - externalCorrelations_[CorrelationDomain::size]; - std::mutex externalCorrelationsMutex_; - - bool externalCorrelationEnabled_{true}; - bool logging_{false}; - - friend class onnxruntime::profiling::RocmProfiler; - friend class libkineto::RoctracerActivityApi; -}; From dcfe1a47d0f9cacb3aa34e09e67392c57563fadb Mon Sep 17 00:00:00 2001 From: Michael Wootton Date: Wed, 20 Aug 2025 05:37:22 -0500 Subject: [PATCH 2/7] Add rocprofiler-sdk support --- libkineto/CMakeLists.txt | 4 +- libkineto/libkineto_defs.bzl | 5 +- libkineto/src/ActivityProfilerController.cpp | 4 +- libkineto/src/CuptiActivityProfiler.cpp | 60 +- libkineto/src/CuptiActivityProfiler.h | 18 +- libkineto/src/RocLogger.cpp | 32 + libkineto/src/RocLogger.h | 229 ++++++ libkineto/src/RocprofActivity.h | 165 ++++ libkineto/src/RocprofActivityApi.cpp | 196 +++++ libkineto/src/RocprofActivityApi.h | 72 ++ libkineto/src/RocprofActivity_inl.h | 261 +++++++ libkineto/src/RocprofLogger.cpp | 778 +++++++++++++++++++ libkineto/src/RocprofLogger.h | 97 +++ 13 files changed, 1876 insertions(+), 45 deletions(-) create mode 100644 libkineto/src/RocLogger.cpp create mode 100644 libkineto/src/RocLogger.h create mode 100644 libkineto/src/RocprofActivity.h create mode 100644 libkineto/src/RocprofActivityApi.cpp create mode 100644 libkineto/src/RocprofActivityApi.h create mode 100644 libkineto/src/RocprofActivity_inl.h create mode 100644 libkineto/src/RocprofLogger.cpp create mode 100644 libkineto/src/RocprofLogger.h diff --git a/libkineto/CMakeLists.txt b/libkineto/CMakeLists.txt index 13509b2fd..4d025a7cd 100644 --- a/libkineto/CMakeLists.txt +++ b/libkineto/CMakeLists.txt @@ -238,9 +238,9 @@ target_include_directories(kineto PUBLIC $) if(NOT 
LIBKINETO_NOROCTRACER) - find_library(ROCTRACER_LIBRARY NAMES libroctracer64.so HINTS + find_library(ROCPROF_LIBRARY NAMES librocprofiler-sdk.so HINTS ${ROCM_SOURCE_DIR}/lib) - target_link_libraries(kineto "${ROCTRACER_LIBRARY}") + target_link_libraries(kineto "${ROCPROF_LIBRARY}") find_library(KINETO_HIP_LIBRARY NAMES libamdhip64.so HINTS ${ROCM_SOURCE_DIR}/lib) target_link_libraries(kineto "${KINETO_HIP_LIBRARY}") diff --git a/libkineto/libkineto_defs.bzl b/libkineto/libkineto_defs.bzl index 6721fcb35..f22633d0a 100644 --- a/libkineto/libkineto_defs.bzl +++ b/libkineto/libkineto_defs.bzl @@ -29,8 +29,9 @@ def get_libkineto_cupti_srcs(with_api = True): def get_libkineto_roctracer_srcs(with_api = True): return [ - "src/RoctracerActivityApi.cpp", - "src/RoctracerLogger.cpp", + "src/RocprofActivityApi.cpp", + "src/RocprofLogger.cpp", + "src/RocLogger.cpp", ] + (get_libkineto_cpu_only_srcs(with_api)) def get_libkineto_xpupti_srcs(with_api = True): diff --git a/libkineto/src/ActivityProfilerController.cpp b/libkineto/src/ActivityProfilerController.cpp index b75dc72b9..4c45c6b85 100644 --- a/libkineto/src/ActivityProfilerController.cpp +++ b/libkineto/src/ActivityProfilerController.cpp @@ -18,7 +18,7 @@ #include "CuptiActivityApi.h" #ifdef HAS_ROCTRACER -#include "RoctracerActivityApi.h" +#include "RocprofActivityApi.h" #endif #include "ThreadUtil.h" @@ -68,7 +68,7 @@ ActivityProfilerController::ActivityProfilerController( #ifdef HAS_ROCTRACER profiler_ = std::make_unique( - RoctracerActivityApi::singleton(), cpuOnly); + RocprofActivityApi::singleton(), cpuOnly); #else profiler_ = std::make_unique( CuptiActivityApi::singleton(), cpuOnly); diff --git a/libkineto/src/CuptiActivityProfiler.cpp b/libkineto/src/CuptiActivityProfiler.cpp index 975e736ee..bcb97844e 100644 --- a/libkineto/src/CuptiActivityProfiler.cpp +++ b/libkineto/src/CuptiActivityProfiler.cpp @@ -25,7 +25,7 @@ #ifdef HAS_CUPTI #include #elif defined(HAS_ROCTRACER) -#include +#include #endif #include 
"Config.h" @@ -39,9 +39,9 @@ #include "KernelRegistry.h" #endif // HAS_CUPTI #ifdef HAS_ROCTRACER -#include "RoctracerActivity.h" -#include "RoctracerActivityApi.h" -#include "RoctracerLogger.h" +#include "RocLogger.h" +#include "RocprofActivity.h" +#include "RocprofActivityApi.h" #endif #ifdef HAS_XPUPTI #include "plugin/xpupti/XpuptiActivityProfiler.h" @@ -215,7 +215,7 @@ void CuptiActivityProfiler::transferCpuTrace( #ifdef HAS_ROCTRACER CuptiActivityProfiler::CuptiActivityProfiler( - RoctracerActivityApi& cupti, + RocprofActivityApi& cupti, bool cpuOnly) #else CuptiActivityProfiler::CuptiActivityProfiler( @@ -256,23 +256,23 @@ void CuptiActivityProfiler::logGpuVersions() { addVersionMetadata("cuda_driver_version", std::to_string(cudaDriverVersion)); #elif defined(HAS_ROCTRACER) - uint32_t majorVersion = roctracer_version_major(); - uint32_t minorVersion = roctracer_version_minor(); + uint32_t majorVersion = ROCPROFILER_VERSION_MAJOR; + uint32_t minorVersion = ROCPROFILER_VERSION_MINOR; std::string roctracerVersion = std::to_string(majorVersion) + "." + std::to_string(minorVersion); int hipRuntimeVersion = 0, hipDriverVersion = 0; CUDA_CALL(hipRuntimeGetVersion(&hipRuntimeVersion)); CUDA_CALL(hipDriverGetVersion(&hipDriverVersion)); - LOG(INFO) << "HIP versions. Roctracer: " << roctracerVersion + LOG(INFO) << "HIP versions. 
Rocprofiler-sdk: " << roctracerVersion << "; Runtime: " << hipRuntimeVersion << "; Driver: " << hipDriverVersion; - LOGGER_OBSERVER_ADD_METADATA("roctracer_version", roctracerVersion); + LOGGER_OBSERVER_ADD_METADATA("rocprofiler-sdk_version", roctracerVersion); LOGGER_OBSERVER_ADD_METADATA( "hip_runtime_version", std::to_string(hipRuntimeVersion)); LOGGER_OBSERVER_ADD_METADATA( "hip_driver_version", std::to_string(hipDriverVersion)); - addVersionMetadata("roctracer_version", roctracerVersion); + addVersionMetadata("rocprofiler-sdk_version", roctracerVersion); addVersionMetadata("hip_runtime_version", std::to_string(hipRuntimeVersion)); addVersionMetadata("hip_driver_version", std::to_string(hipDriverVersion)); @@ -372,7 +372,7 @@ void CuptiActivityProfiler::processTraceInternal(ActivityLogger& logger) { VLOG(0) << "Retrieving GPU activity buffers"; const int count = cupti_.processActivities( std::bind( - &CuptiActivityProfiler::handleRoctracerActivity, + &CuptiActivityProfiler::handleRocprofActivity, this, std::placeholders::_1, &logger), @@ -480,10 +480,10 @@ inline void CuptiActivityProfiler::handleCorrelationActivity( inline void CuptiActivityProfiler::handleCorrelationActivity( uint64_t correlationId, uint64_t externalId, - RoctracerLogger::CorrelationDomain externalKind) { - if (externalKind == RoctracerLogger::CorrelationDomain::Domain0) { + RocLogger::CorrelationDomain externalKind) { + if (externalKind == RocLogger::CorrelationDomain::Domain0) { cpuCorrelationMap_[correlationId] = externalId; - } else if (externalKind == RoctracerLogger::CorrelationDomain::Domain1) { + } else if (externalKind == RocLogger::CorrelationDomain::Domain1) { userCorrelationMap_[correlationId] = externalId; } else { LOG(WARNING) @@ -958,7 +958,7 @@ void CuptiActivityProfiler::handleRuntimeActivity( } inline void CuptiActivityProfiler::handleGpuActivity( - const roctracerAsyncRow* act, + const rocprofAsyncRow* act, ActivityLogger* logger) { const ITraceActivity* linked = 
linkedActivity(act->id, cpuCorrelationMap_); const auto& gpu_activity = @@ -966,29 +966,29 @@ inline void CuptiActivityProfiler::handleGpuActivity( handleGpuActivity(gpu_activity, logger); } -void CuptiActivityProfiler::handleRoctracerActivity( - const roctracerBase* record, +void CuptiActivityProfiler::handleRocprofActivity( + const rocprofBase* record, ActivityLogger* logger) { switch (record->type) { case ROCTRACER_ACTIVITY_DEFAULT: handleRuntimeActivity( - reinterpret_cast(record), logger); + reinterpret_cast(record), logger); break; case ROCTRACER_ACTIVITY_KERNEL: handleRuntimeActivity( - reinterpret_cast(record), logger); + reinterpret_cast(record), logger); break; case ROCTRACER_ACTIVITY_COPY: handleRuntimeActivity( - reinterpret_cast(record), logger); + reinterpret_cast(record), logger); break; case ROCTRACER_ACTIVITY_MALLOC: handleRuntimeActivity( - reinterpret_cast(record), logger); + reinterpret_cast(record), logger); break; case ROCTRACER_ACTIVITY_ASYNC: handleGpuActivity( - reinterpret_cast(record), logger); + reinterpret_cast(record), logger); break; case ROCTRACER_ACTIVITY_NONE: default: @@ -1569,8 +1569,8 @@ void CuptiActivityProfiler::pushCorrelationId(uint64_t id) { id, CuptiActivityApi::CorrelationFlowType::Default); #endif // HAS_CUPTI #ifdef HAS_ROCTRACER - RoctracerActivityApi::pushCorrelationID( - id, RoctracerActivityApi::CorrelationFlowType::Default); + RocprofActivityApi::pushCorrelationID( + id, RocprofActivityApi::CorrelationFlowType::Default); #endif for (auto& session : sessions_) { session->pushCorrelationId(id); @@ -1583,8 +1583,8 @@ void CuptiActivityProfiler::popCorrelationId() { CuptiActivityApi::CorrelationFlowType::Default); #endif // HAS_CUPTI #ifdef HAS_ROCTRACER - RoctracerActivityApi::popCorrelationID( - RoctracerActivityApi::CorrelationFlowType::Default); + RocprofActivityApi::popCorrelationID( + RocprofActivityApi::CorrelationFlowType::Default); #endif for (auto& session : sessions_) { session->popCorrelationId(); @@ 
-1597,8 +1597,8 @@ void CuptiActivityProfiler::pushUserCorrelationId(uint64_t id) { id, CuptiActivityApi::CorrelationFlowType::User); #endif // HAS_CUPTI #ifdef HAS_ROCTRACER - RoctracerActivityApi::pushCorrelationID( - id, RoctracerActivityApi::CorrelationFlowType::User); + RocprofActivityApi::pushCorrelationID( + id, RocprofActivityApi::CorrelationFlowType::User); #endif for (auto& session : sessions_) { session->pushUserCorrelationId(id); @@ -1611,8 +1611,8 @@ void CuptiActivityProfiler::popUserCorrelationId() { CuptiActivityApi::CorrelationFlowType::User); #endif // HAS_CUPTI #ifdef HAS_ROCTRACER - RoctracerActivityApi::popCorrelationID( - RoctracerActivityApi::CorrelationFlowType::User); + RocprofActivityApi::popCorrelationID( + RocprofActivityApi::CorrelationFlowType::User); #endif for (auto& session : sessions_) { session->popUserCorrelationId(); diff --git a/libkineto/src/CuptiActivityProfiler.h b/libkineto/src/CuptiActivityProfiler.h index 379fa9078..187ee80f9 100644 --- a/libkineto/src/CuptiActivityProfiler.h +++ b/libkineto/src/CuptiActivityProfiler.h @@ -30,7 +30,7 @@ #endif // HAS_CUPTI #ifdef HAS_ROCTRACER -#include "RoctracerLogger.h" +#include "RocprofLogger.h" #endif // HAS_ROCTRACER #include "GenericTraceActivity.h" @@ -45,7 +45,7 @@ namespace KINETO_NAMESPACE { class Config; class CuptiActivityApi; -class RoctracerActivityApi; +class RocprofActivityApi; // This struct is a derived snapshot of the Config. And should not // be mutable after construction. 
@@ -121,7 +121,7 @@ inline size_t hash_combine(size_t seed, size_t value) { class CuptiActivityProfiler { public: CuptiActivityProfiler(CuptiActivityApi& cupti, bool cpuOnly); - CuptiActivityProfiler(RoctracerActivityApi& rai, bool cpuOnly); + CuptiActivityProfiler(RocprofActivityApi& rai, bool cpuOnly); CuptiActivityProfiler(const CuptiActivityProfiler&) = delete; CuptiActivityProfiler& operator=(const CuptiActivityProfiler&) = delete; ~CuptiActivityProfiler(); @@ -411,19 +411,19 @@ class CuptiActivityProfiler { #endif // HAS_CUPTI #ifdef HAS_ROCTRACER - // Process generic RocTracer activity - void handleRoctracerActivity( - const roctracerBase* record, + // Process generic RocProf activity + void handleRocprofActivity( + const rocprofBase* record, ActivityLogger* logger); void handleCorrelationActivity( uint64_t correlationId, uint64_t externalId, - RoctracerLogger::CorrelationDomain externalKind); + RocLogger::CorrelationDomain externalKind); // Process specific GPU activity types template void handleRuntimeActivity(const T* activity, ActivityLogger* logger); void handleGpuActivity( - const roctracerAsyncRow* record, + const rocprofAsyncRow* record, ActivityLogger* logger); #endif // HAS_ROCTRACER @@ -457,7 +457,7 @@ class CuptiActivityProfiler { // Calls to CUPTI is encapsulated behind this interface #ifdef HAS_ROCTRACER - RoctracerActivityApi& cupti_; // Design failure here + RocprofActivityApi& cupti_; // Design failure here #else CuptiActivityApi& cupti_; #endif diff --git a/libkineto/src/RocLogger.cpp b/libkineto/src/RocLogger.cpp new file mode 100644 index 000000000..b1290640b --- /dev/null +++ b/libkineto/src/RocLogger.cpp @@ -0,0 +1,32 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include "RocLogger.h" + +ApiIdList::ApiIdList() : invert_(true) {} + +void ApiIdList::add(const std::string& apiName) { + uint32_t cid = mapName(apiName); + if (cid > 0) + filter_[cid] = 1; +} + +void ApiIdList::remove(const std::string& apiName) { + uint32_t cid = mapName(apiName); + if (cid > 0) + filter_.erase(cid); +} + +bool ApiIdList::loadUserPrefs() { + // FIXME: check an ENV variable that points to an exclude file + return false; +} + +bool ApiIdList::contains(uint32_t apiId) { + return (filter_.find(apiId) != filter_.end()) ? !invert_ : invert_; // XOR +} diff --git a/libkineto/src/RocLogger.h b/libkineto/src/RocLogger.h new file mode 100644 index 000000000..a057a4c4a --- /dev/null +++ b/libkineto/src/RocLogger.h @@ -0,0 +1,229 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace libkineto { +class RocprofActivityApi; +} + +typedef uint64_t timestamp_t; + +static timestamp_t timespec_to_ns(const timespec& time) { + return ((timestamp_t)time.tv_sec * 1000000000) + time.tv_nsec; +} + +namespace RocLogger { +enum CorrelationDomain { + begin, + Default = begin, + Domain0 = begin, + Domain1, + end, + size = end +}; +} // namespace RocLogger + +class ApiIdList { + public: + ApiIdList(); + virtual ~ApiIdList() {} + bool invertMode() { + return invert_; + } + void setInvertMode(bool invert) { + invert_ = invert; + } + void add(const std::string& apiName); + void remove(const std::string& apiName); + bool loadUserPrefs(); + + // Map api string to cnid enum + virtual uint32_t mapName(const std::string& apiName) = 0; + + bool contains(uint32_t apiId); + const std::unordered_map& filterList() { + return filter_; + } + + private: + 
std::unordered_map filter_; + bool invert_; +}; + +typedef enum { + ROCTRACER_ACTIVITY_DEFAULT = 0, + ROCTRACER_ACTIVITY_KERNEL, + ROCTRACER_ACTIVITY_COPY, + ROCTRACER_ACTIVITY_MALLOC, + ROCTRACER_ACTIVITY_ASYNC, + ROCTRACER_ACTIVITY_NONE +} rocprof_activity_types; + +struct rocprofBase { + rocprofBase( + uint64_t id, + uint32_t domain, + uint64_t begin, + uint64_t end, + rocprof_activity_types type = ROCTRACER_ACTIVITY_NONE) + : id(id), begin(begin), end(end), domain(domain), type(type) {} + uint64_t id; // correlation_id + uint64_t begin; + uint64_t end; + uint32_t domain; + rocprof_activity_types type; +}; + +struct rocprofRow : public rocprofBase { + rocprofRow( + uint64_t id, + uint32_t domain, + uint32_t cid, + uint32_t pid, + uint32_t tid, + uint64_t begin, + uint64_t end, + rocprof_activity_types type = ROCTRACER_ACTIVITY_DEFAULT) + : rocprofBase(id, domain, begin, end, type), + cid(cid), + pid(pid), + tid(tid) {} + uint32_t cid; + uint32_t pid; + uint32_t tid; +}; + +struct rocprofKernelRow : public rocprofRow { + rocprofKernelRow( + uint64_t id, + uint32_t domain, + uint32_t cid, + uint32_t pid, + uint32_t tid, + uint64_t begin, + uint64_t end, + const void* faddr, + hipFunction_t function, + unsigned int gx, + unsigned int gy, + unsigned int gz, + unsigned int wx, + unsigned int wy, + unsigned int wz, + size_t gss, + hipStream_t stream, + rocprof_activity_types type = ROCTRACER_ACTIVITY_KERNEL) + : rocprofRow(id, domain, cid, pid, tid, begin, end, type), + functionAddr(faddr), + function(function), + gridX(gx), + gridY(gy), + gridZ(gz), + workgroupX(wx), + workgroupY(wy), + workgroupZ(wz), + groupSegmentSize(gss), + stream(stream) {} + const void* functionAddr; + hipFunction_t function; + unsigned int gridX; + unsigned int gridY; + unsigned int gridZ; + unsigned int workgroupX; + unsigned int workgroupY; + unsigned int workgroupZ; + size_t groupSegmentSize; + hipStream_t stream; +}; + +struct rocprofCopyRow : public rocprofRow { + rocprofCopyRow( + 
uint64_t id, + uint32_t domain, + uint32_t cid, + uint32_t pid, + uint32_t tid, + uint64_t begin, + uint64_t end, + const void* src, + const void* dst, + size_t size, + hipMemcpyKind kind, + hipStream_t stream, + rocprof_activity_types type = ROCTRACER_ACTIVITY_COPY) + : rocprofRow(id, domain, cid, pid, tid, begin, end, type), + src(src), + dst(dst), + size(size), + kind(kind), + stream(stream) {} + const void* src; + const void* dst; + size_t size; + hipMemcpyKind kind; + hipStream_t stream; +}; + +struct rocprofMallocRow : public rocprofRow { + rocprofMallocRow( + uint64_t id, + uint32_t domain, + uint32_t cid, + uint32_t pid, + uint32_t tid, + uint64_t begin, + uint64_t end, + const void* ptr, + size_t size, + rocprof_activity_types type = ROCTRACER_ACTIVITY_MALLOC) + : rocprofRow(id, domain, cid, pid, tid, begin, end, type), + ptr(ptr), + size(size) {} + const void* ptr; + size_t size; +}; + +struct rocprofAsyncRow : public rocprofBase { + rocprofAsyncRow( + uint64_t id, + uint32_t domain, + uint32_t kind, + uint32_t op, + int device, + uint64_t queue, + uint64_t begin, + uint64_t end, + const std::string& kernelName, + rocprof_activity_types type = ROCTRACER_ACTIVITY_ASYNC) + : rocprofBase(id, domain, begin, end, type), + kind(kind), + op(op), + device(device), + queue(queue), + kernelName(kernelName) {} + uint32_t kind; + uint32_t op; + int device; + uint64_t queue; + std::string kernelName; +}; diff --git a/libkineto/src/RocprofActivity.h b/libkineto/src/RocprofActivity.h new file mode 100644 index 000000000..296578072 --- /dev/null +++ b/libkineto/src/RocprofActivity.h @@ -0,0 +1,165 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +// TODO(T90238193) +// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude +#include "GenericTraceActivity.h" +#include "ITraceActivity.h" +#include "RocprofLogger.h" +#include "ThreadUtil.h" + +#include +#include + +namespace libkineto { +class ActivityLogger; +} + +namespace KINETO_NAMESPACE { + +using namespace libkineto; +struct TraceSpan; + +// These classes wrap the various Rocprof activity types +// into subclasses of ITraceActivity so that they can all be accessed +// using the ITraceActivity interface and logged via ActivityLogger. + +// Abstract base class, templated on Rocprof activity type +template +struct RocprofActivity : public ITraceActivity { + explicit RocprofActivity(const T* activity, const ITraceActivity* linked) + : activity_(*activity), linked_(linked) {} + // Our stored timestamps (from rocprof and generated) are in CLOCK_MONOTONIC + // domain (in ns). Convert the timestamps. + int64_t timestamp() const override { + return activity_.begin; + } + int64_t duration() const override { + return activity_.end - activity_.begin; + } + int64_t correlationId() const override { + return 0; + } + int32_t getThreadId() const override { + return 0; + } + const ITraceActivity* linkedActivity() const override { + return linked_; + } + int flowType() const override { + return kLinkAsyncCpuGpu; + } + int64_t flowId() const override { + return correlationId(); + } + const T& raw() const { + return activity_; + } + const TraceSpan* traceSpan() const override { + return nullptr; + } + const std::string getMetadataValue(const std::string& key) const override { + auto it = metadata_.find(key); + if (it != metadata_.end()) { + return it->second; + } + return ""; + } + + protected: + const T& activity_; + const ITraceActivity* linked_{nullptr}; + std::unordered_map metadata_; +}; + +// rocprofAsyncRow - Rocprof GPU activities +struct GpuActivity : public RocprofActivity { + explicit GpuActivity( + const rocprofAsyncRow* activity, + 
const ITraceActivity* linked) + : RocprofActivity(activity, linked) { + switch (activity_.domain) { + case ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY: + type_ = ActivityType::GPU_MEMCPY; + break; + case ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH: + default: + type_ = ActivityType::CONCURRENT_KERNEL; + break; + } + } + int64_t correlationId() const override { + return activity_.id; + } + int64_t deviceId() const override { + return activity_.device; + } + int64_t resourceId() const override { + return activity_.queue; + } + ActivityType type() const override { + return type_; + }; + bool flowStart() const override { + return false; + } + const std::string name() const override; + void log(ActivityLogger& logger) const override; + const std::string metadataJson() const override; + + // Add small buffer to fix visual error created by + // https://github.com/ROCm/rocprof/issues/105 Once this is resolved we can + // use ifdef to handle having this buffer or not based on version + int64_t timestamp() const override { + return activity_.begin + 1; + } + int64_t duration() const override { + return activity_.end - (activity_.begin + 1); + } + + private: + ActivityType type_; +}; + +// rocprofRow, rocprofKernelRow, rocprofCopyRow, rocprofMallocRow - +// Rocprof runtime activities +template +struct RuntimeActivity : public RocprofActivity { + explicit RuntimeActivity(const T* activity, const ITraceActivity* linked) + : RocprofActivity(activity, linked) {} + int64_t correlationId() const override { + return raw().id; + } + int64_t deviceId() const override { + return raw().pid; + } + int64_t resourceId() const override { + return raw().tid; + } + ActivityType type() const override { + return ActivityType::CUDA_RUNTIME; + } + bool flowStart() const override; + const std::string name() const override { + return RocprofLogger::opString( + ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API, raw().cid); + } + void log(ActivityLogger& logger) const override; + const std::string 
metadataJson() const override; + const T& raw() const { + return RocprofActivity::raw(); + } +}; + +} // namespace KINETO_NAMESPACE + +// Include the implementation detail of this header file. +// The *_inl.h helps separate header interface from implementation details. +#include "RocprofActivity_inl.h" diff --git a/libkineto/src/RocprofActivityApi.cpp b/libkineto/src/RocprofActivityApi.cpp new file mode 100644 index 000000000..60a243b9b --- /dev/null +++ b/libkineto/src/RocprofActivityApi.cpp @@ -0,0 +1,196 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "RocprofActivityApi.h" + +#include +#include +#include +#include +#include "ApproximateClock.h" +#include "Demangle.h" +#include "Logger.h" +#include "ThreadUtil.h" +#include "output_base.h" + +using namespace std::chrono; + +namespace KINETO_NAMESPACE { + +RocprofActivityApi& RocprofActivityApi::singleton() { + static RocprofActivityApi instance; + return instance; +} + +RocprofActivityApi::RocprofActivityApi() : d(&RocprofLogger::singleton()) {} + +RocprofActivityApi::~RocprofActivityApi() { + disableActivities(std::set()); +} + +void RocprofActivityApi::pushCorrelationID(int id, CorrelationFlowType type) { +#ifdef HAS_ROCTRACER + if (!singleton().d->externalCorrelationEnabled_) { + return; + } + singleton().d->pushCorrelationID( + id, static_cast(type)); +#endif +} + +void RocprofActivityApi::popCorrelationID(CorrelationFlowType type) { +#ifdef HAS_ROCTRACER + if (!singleton().d->externalCorrelationEnabled_) { + return; + } + singleton().d->popCorrelationID( + static_cast(type)); +#endif +} + +void RocprofActivityApi::setMaxEvents(uint32_t maxEvents) { + d->setMaxEvents(maxEvents); +} + +void RocprofActivityApi::setMaxBufferSize(int size) { + // FIXME: implement? 
+ // maxGpuBufferCount_ = 1 + size / kBufSize; +} + +inline bool inRange(int64_t start, int64_t end, int64_t stamp) { + return ((stamp > start) && (stamp < end)); +} + +inline bool RocprofActivityApi::isLogged(libkineto::ActivityType atype) const { + return activityMaskSnapshot_ & (1 << static_cast(atype)); +} + +timestamp_t getTimeOffset() { + int64_t t0, t00; + timespec t1; + t0 = libkineto::getApproximateTime(); + clock_gettime(CLOCK_MONOTONIC, &t1); + t00 = libkineto::getApproximateTime(); + + // Convert to ns (if necessary) + t0 = libkineto::get_time_converter()(t0); + t00 = libkineto::get_time_converter()(t00); + + // Our stored timestamps (from rocprof and generated) are in CLOCK_MONOTONIC + // domain (in ns). + return (t0 >> 1) + (t00 >> 1) - timespec_to_ns(t1); +} + +int RocprofActivityApi::processActivities( + std::function handler, + std::function + correlationHandler) { + // Find offset to map from monotonic clock to system clock. + // This will break time-ordering of events but is status quo. + + int count = 0; + + // Emit external correlation pairs (read without the mutex; clear is locked) + for (int it = RocLogger::CorrelationDomain::begin; + it < RocLogger::CorrelationDomain::end; + ++it) { + auto& externalCorrelations = d->externalCorrelations_[it]; + for (auto& item : externalCorrelations) { + correlationHandler( + item.first, + item.second, + static_cast(it)); + } + std::lock_guard lock(d->externalCorrelationsMutex_); + externalCorrelations.clear(); + } + + // Async ops are in CLOCK_MONOTONIC rather than junk clock. + // Convert these timestamps, poorly. + // These accurate timestamps will skew when converted to approximate time + // The time_converter is not available at collection time. Or we could do a + // much better job.
+ auto toffset = getTimeOffset(); + + // All Runtime API Calls + for (auto& item : d->rows_) { + bool filtered = false; + if (item->type != ROCTRACER_ACTIVITY_ASYNC && + !isLogged(ActivityType::CUDA_RUNTIME)) { + filtered = true; + } else { + switch (reinterpret_cast(item)->domain) { + case ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY: + if (!isLogged(ActivityType::GPU_MEMCPY)) + filtered = true; + break; + case ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH: + default: + if (!isLogged(ActivityType::CONCURRENT_KERNEL)) + filtered = true; + break; + } + } + if (!filtered) { + // Convert the begin and end timestamps from monotonic clock to system + // clock. + if (item->type == ROCTRACER_ACTIVITY_ASYNC) { + // Async ops are in CLOCK_MONOTONIC, apply offset to converted + // approximate + item->begin += toffset; + item->end += toffset; + } else { + // Runtime ranges are in approximate clock, just apply conversion + item->begin = libkineto::get_time_converter()(item->begin); + item->end = libkineto::get_time_converter()(item->end); + } + handler(item); + ++count; + } + } + return count; +} + +// TODO: implement the actual flush with roctracer_flush_activity +void RocprofActivityApi::flushActivities() {} + +void RocprofActivityApi::clearActivities() { + d->clearLogs(); +} + +void RocprofActivityApi::enableActivities( + const std::set& selected_activities) { +#ifdef HAS_ROCTRACER + d->startLogging(); + + for (const auto& activity : selected_activities) { + activityMask_ |= (1 << static_cast(activity)); + if (activity == ActivityType::EXTERNAL_CORRELATION) { + d->externalCorrelationEnabled_ = true; + } + } +#endif +} + +void RocprofActivityApi::disableActivities( + const std::set& selected_activities) { +#ifdef HAS_ROCTRACER + d->stopLogging(); + + activityMaskSnapshot_ = activityMask_; + + for (const auto& activity : selected_activities) { + activityMask_ &= ~(1 << static_cast(activity)); + if (activity == ActivityType::EXTERNAL_CORRELATION) { + d->externalCorrelationEnabled_ 
= false; + } + } +#endif +} + +} // namespace KINETO_NAMESPACE diff --git a/libkineto/src/RocprofActivityApi.h b/libkineto/src/RocprofActivityApi.h new file mode 100644 index 000000000..08a2a2a63 --- /dev/null +++ b/libkineto/src/RocprofActivityApi.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once +#ifdef HAS_ROCTRACER + +#include +#include +#include + +#include "RocprofLogger.h" + +#include "ActivityType.h" +#include "GenericTraceActivity.h" + +class RocprofLogger; + +namespace KINETO_NAMESPACE { + +using namespace libkineto; + +class RocprofActivityApi { + public: + enum CorrelationFlowType { Default, User }; + + RocprofActivityApi(); + RocprofActivityApi(const RocprofActivityApi&) = delete; + RocprofActivityApi& operator=(const RocprofActivityApi&) = delete; + + virtual ~RocprofActivityApi(); + + static RocprofActivityApi& singleton(); + + static void pushCorrelationID(int id, CorrelationFlowType type); + static void popCorrelationID(CorrelationFlowType type); + + void enableActivities(const std::set& selected_activities); + void disableActivities(const std::set& selected_activities); + void flushActivities(); + void clearActivities(); + void teardownContext() {} + void setTimeOffset(timestamp_t toffset); + void setMaxEvents(uint32_t maxEvents); + + virtual int processActivities( + std::function handler, + std::function + correlationHandler); + + void setMaxBufferSize(int size); + + std::atomic_bool stopCollection{false}; + + private: + bool registered_{false}; + timestamp_t toffset_{0}; + + // Enabled Activity Filters + uint32_t activityMask_{0}; + uint32_t activityMaskSnapshot_{0}; + bool isLogged(libkineto::ActivityType atype) const; + + RocprofLogger* d; +}; + +} // namespace KINETO_NAMESPACE +#endif diff --git a/libkineto/src/RocprofActivity_inl.h 
b/libkineto/src/RocprofActivity_inl.h new file mode 100644 index 000000000..e5e58ab80 --- /dev/null +++ b/libkineto/src/RocprofActivity_inl.h @@ -0,0 +1,263 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include "RocprofActivity.h" + +#include +#include +#include + +#include "Demangle.h" +#include "output_base.h" + +namespace KINETO_NAMESPACE { + +using namespace libkineto; + +namespace { +thread_local std::unordered_map correlationToGrid; +thread_local std::unordered_map correlationToBlock; +thread_local std::unordered_map correlationToSize; +} // namespace + +// Label for a GPU activity, "" if unmapped. inline: defined in a header. +inline const char* getGpuActivityKindString(uint32_t domain, uint32_t op) { + if (domain == ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH) + return "Dispatch Kernel"; + else if (domain == ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY) { + switch (op) { + case ROCPROFILER_MEMORY_COPY_HOST_TO_HOST: + return "HtoH"; + case ROCPROFILER_MEMORY_COPY_HOST_TO_DEVICE: + return "HtoD"; + case ROCPROFILER_MEMORY_COPY_DEVICE_TO_HOST: + return "DtoH"; + case ROCPROFILER_MEMORY_COPY_DEVICE_TO_DEVICE: + return "DtoD"; + } + } + return ""; +} + +// Sets src/dst to "Host"/"Device" ("?" if unknown). inline: header-defined. +inline void getMemcpySrcDstString(uint32_t kind, std::string& src, std::string& dst) { + switch (kind) { + case ROCPROFILER_MEMORY_COPY_HOST_TO_HOST: + src = "Host"; + dst = "Host"; + break; + case ROCPROFILER_MEMORY_COPY_DEVICE_TO_HOST: + src = "Device"; + dst = "Host"; + break; + case ROCPROFILER_MEMORY_COPY_HOST_TO_DEVICE: + src = "Host"; + dst = "Device"; + break; + case ROCPROFILER_MEMORY_COPY_DEVICE_TO_DEVICE: + src = "Device"; + dst = "Device"; + break; + default: + src = "?"; + dst = "?"; + break; + } +} + +// GPU Activities + +inline const std::string GpuActivity::name() const { + if (type_ == ActivityType::CONCURRENT_KERNEL) { + auto op = raw().op; + auto domain = raw().domain; + std::string
opString = RocprofLogger::opString( + static_cast(domain), op); + const char* name = opString.c_str(); + return demangle( + raw().kernelName.length() > 0 ? raw().kernelName : std::string(name)); + } else if (type_ == ActivityType::GPU_MEMSET) { + return fmt::format( + "Memset ({})", getGpuActivityKindString(raw().domain, raw().op)); + } else if (type_ == ActivityType::GPU_MEMCPY) { + std::string src = ""; + std::string dst = ""; + getMemcpySrcDstString(raw().op, src, dst); + return fmt::format( + "Memcpy {} ({} -> {})", + getGpuActivityKindString(raw().domain, raw().op), + src, + dst); + } else { + return ""; + } + return ""; +} + +inline void GpuActivity::log(ActivityLogger& logger) const { + logger.handleActivity(*this); +} + +static inline std::string bandwidth(size_t bytes, uint64_t duration) { + return duration == 0 ? "\"N/A\"" : fmt::format("{}", bytes * 1.0 / duration); +} + +inline const std::string GpuActivity::metadataJson() const { + const auto& gpuActivity = raw(); + // clang-format off + + // if memcpy or memset, add size + if (correlationToSize.count(gpuActivity.id) > 0) { + size_t size = correlationToSize[gpuActivity.id]; + std::string bandwidth_gib = (bandwidth(size, gpuActivity.end - gpuActivity.begin)); + return fmt::format(R"JSON( + "device": {}, "stream": {}, + "correlation": {}, "kind": "{}", + "bytes": {}, "memory bandwidth (GB/s)": {})JSON", + gpuActivity.device, gpuActivity.queue, + gpuActivity.id, getGpuActivityKindString(gpuActivity.domain, gpuActivity.op), + size, bandwidth_gib); + } + + // if compute kernel, add grid and block + else if (correlationToGrid.count(gpuActivity.id) > 0) { + return fmt::format(R"JSON( + "device": {}, "stream": {}, + "correlation": {}, "kind": "{}", + "grid": {}, "block": {})JSON", + gpuActivity.device, gpuActivity.queue, + gpuActivity.id, getGpuActivityKindString(gpuActivity.domain, gpuActivity.op), + correlationToGrid[gpuActivity.id], correlationToBlock[gpuActivity.id]); + } else { + return 
fmt::format(R"JSON( + "device": {}, "stream": {}, + "correlation": {}, "kind": "{}")JSON", + gpuActivity.device, gpuActivity.queue, + gpuActivity.id, getGpuActivityKindString(gpuActivity.domain, gpuActivity.op)); + } + // clang-format on +} + +// Runtime Activities + +template +inline bool RuntimeActivity::flowStart() const { + bool should_correlate = + raw().cid == ROCPROFILER_HIP_RUNTIME_API_ID_hipLaunchKernel || + raw().cid == ROCPROFILER_HIP_RUNTIME_API_ID_hipExtLaunchKernel || + raw().cid == ROCPROFILER_HIP_RUNTIME_API_ID_hipLaunchCooperativeKernel || + raw().cid == ROCPROFILER_HIP_RUNTIME_API_ID_hipHccModuleLaunchKernel || + raw().cid == ROCPROFILER_HIP_RUNTIME_API_ID_hipModuleLaunchKernel || + raw().cid == ROCPROFILER_HIP_RUNTIME_API_ID_hipExtModuleLaunchKernel || + raw().cid == ROCPROFILER_HIP_RUNTIME_API_ID_hipMalloc || + raw().cid == ROCPROFILER_HIP_RUNTIME_API_ID_hipFree || + raw().cid == ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpy || + raw().cid == ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyAsync || + raw().cid == ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyWithStream; + return should_correlate; +} + +template +inline void RuntimeActivity::log(ActivityLogger& logger) const { + logger.handleActivity(*this); +} + +template <> +inline const std::string RuntimeActivity::metadataJson() + const { + std::string kernel = ""; + if ((raw().functionAddr != nullptr)) { + kernel = fmt::format( + R"JSON( + "kernel": "{}", )JSON", + demangle(hipKernelNameRefByPtr(raw().functionAddr, raw().stream))); + } else if ((raw().function != nullptr)) { + kernel = fmt::format( + R"JSON( + "kernel": "{}", )JSON", + demangle(hipKernelNameRef(raw().function))); + } + // cache grid and block so we can pass it into async activity (GPU track) + correlationToGrid[raw().id] = fmt::format( + R"JSON( + [{}, {}, {}])JSON", + raw().gridX, + raw().gridY, + raw().gridZ); + + correlationToBlock[raw().id] = fmt::format( + R"JSON( + [{}, {}, {}])JSON", + raw().workgroupX, + raw().workgroupY, + 
raw().workgroupZ); + + return fmt::format( + R"JSON( + {}"cid": {}, "correlation": {}, + "grid": [{}, {}, {}], + "block": [{}, {}, {}], + "shared memory": {})JSON", + kernel, + raw().cid, + raw().id, + raw().gridX, + raw().gridY, + raw().gridZ, + raw().workgroupX, + raw().workgroupY, + raw().workgroupZ, + raw().groupSegmentSize); +} + +template <> +inline const std::string RuntimeActivity::metadataJson() const { + correlationToSize[raw().id] = raw().size; + return fmt::format( + R"JSON( + "cid": {}, "correlation": {}, "src": "{}", "dst": "{}", "bytes": "{}", "kind": "{}")JSON", + raw().cid, + raw().id, + raw().src, + raw().dst, + raw().size, + fmt::underlying(raw().kind)); +} + +template <> +inline const std::string RuntimeActivity::metadataJson() + const { + correlationToSize[raw().id] = raw().size; + std::string size = ""; + if (raw().cid == ROCPROFILER_HIP_RUNTIME_API_ID_hipMalloc) { + size = fmt::format( + R"JSON( + "bytes": {}, )JSON", + raw().size); + } + return fmt::format( + R"JSON( + {}"cid": {}, "correlation": {}, "ptr": "{}")JSON", + size, + raw().cid, + raw().id, + raw().ptr); +} + +template +inline const std::string RuntimeActivity::metadataJson() const { + return fmt::format( + R"JSON( + "cid": {}, "correlation": {})JSON", + raw().cid, + raw().id); +} + +} // namespace KINETO_NAMESPACE diff --git a/libkineto/src/RocprofLogger.cpp b/libkineto/src/RocprofLogger.cpp new file mode 100644 index 000000000..401220541 --- /dev/null +++ b/libkineto/src/RocprofLogger.cpp @@ -0,0 +1,778 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include "RocprofLogger.h" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "Demangle.h" +#include "Logger.h" +#include "ThreadUtil.h" +#include "ApproximateClock.h" + +using namespace libkineto; +using namespace std::chrono; +using namespace RocLogger; + +class RocprofLoggerShared; + +namespace { +RocprofLoggerShared* s{nullptr}; +using kernel_symbol_data_t = + rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t; +using kernel_symbol_map_t = + std::unordered_map; +using kernel_name_map_t = + std::unordered_map; +using rocprofiler::sdk::buffer_name_info; +using rocprofiler::sdk::callback_name_info; +using agent_info_map_t = std::unordered_map; + +// extract copy args +struct copy_args { + const char* dst{""}; + const char* src{""}; + size_t size{0}; + const char* copyKindStr{""}; + hipMemcpyKind copyKind{hipMemcpyDefault}; + hipStream_t stream{nullptr}; + rocprofiler_callback_tracing_kind_t kind; + rocprofiler_tracing_operation_t operation; +}; +auto extract_copy_args = [](rocprofiler_callback_tracing_kind_t, + rocprofiler_tracing_operation_t, + uint32_t arg_num, + const void* const arg_value_addr, + int32_t indirection_count, + const char* arg_type, + const char* arg_name, + const char* arg_value_str, + int32_t dereference_count, + void* cb_data) -> int { + auto& args = *(static_cast(cb_data)); + if (strcmp("dst", arg_name) == 0) { + args.dst = arg_value_str; + } else if (strcmp("src", arg_name) == 0) { + args.src = arg_value_str; + } else if (strcmp("sizeBytes", arg_name) == 0) { + args.size = *(reinterpret_cast(arg_value_addr)); + } else if (strcmp("kind", arg_name) == 0) { + args.copyKindStr = arg_value_str; + args.copyKind = *(reinterpret_cast(arg_value_addr)); + } else if (strcmp("stream", arg_name) == 0) { + args.stream = *(reinterpret_cast(arg_value_addr)); + } + return 0; +}; + +// extract kernel args +struct kernel_args { + // const char *stream; + 
hipStream_t stream{nullptr}; + uint32_t privateSize {0}; + uint32_t groupSize {0}; + rocprofiler_dim3_t workgroupSize {0}; + rocprofiler_dim3_t gridSize {0}; + rocprofiler_callback_tracing_kind_t kind; + rocprofiler_tracing_operation_t operation; +}; +auto extract_kernel_args = [](rocprofiler_callback_tracing_kind_t, + rocprofiler_tracing_operation_t, + uint32_t arg_num, + const void* const arg_value_addr, + int32_t indirection_count, + const char* arg_type, + const char* arg_name, + const char* arg_value_str, + int32_t dereference_count, + void* cb_data) -> int { + auto& args = *(static_cast(cb_data)); + if (strcmp("stream", arg_name) == 0) + args.stream = *(reinterpret_cast(arg_value_addr)); + else if (strcmp("numBlocks", arg_name) == 0) + args.workgroupSize = *(reinterpret_cast(arg_value_addr)); + else if (strcmp("dimBlocks", arg_name) == 0) + args.gridSize = *(reinterpret_cast(arg_value_addr)); + else if (strcmp("sharedMemBytes", arg_name) == 0) + args.groupSize = *(reinterpret_cast(arg_value_addr)); + else if (strcmp("globalWorkSizeX", arg_name) == 0) + args.workgroupSize.x = *(reinterpret_cast(arg_value_addr)); + else if (strcmp("globalWorkSizeY", arg_name) == 0) + args.workgroupSize.y = *(reinterpret_cast(arg_value_addr)); + else if (strcmp("globalWorkSizeZ", arg_name) == 0) + args.workgroupSize.z = *(reinterpret_cast(arg_value_addr)); + else if (strcmp("localWorkSizeX", arg_name) == 0) + args.gridSize.x = *(reinterpret_cast(arg_value_addr)); + else if (strcmp("localWorkSizeY", arg_name) == 0) + args.gridSize.y = *(reinterpret_cast(arg_value_addr)); + else if (strcmp("localWorkSizeZ", arg_name) == 0) + args.gridSize.z = *(reinterpret_cast(arg_value_addr)); + return 0; +}; + +// extract malloc args +struct malloc_args { + const char* ptr; + size_t size; +}; +auto extract_malloc_args = [](rocprofiler_callback_tracing_kind_t, + rocprofiler_tracing_operation_t, + uint32_t arg_num, + const void* const arg_value_addr, + int32_t indirection_count, + const char* 
arg_type, + const char* arg_name, + const char* arg_value_str, + int32_t dereference_count, + void* cb_data) -> int { + auto& args = *(static_cast(cb_data)); + if (strcmp("ptr", arg_name) == 0) { + args.ptr = arg_value_str; + } + if (strcmp("size", arg_name) == 0) { + args.size = *(reinterpret_cast(arg_value_addr)); + } + return 0; +}; + +// copy api calls +bool isCopyApi(uint32_t id) { + switch (id) { + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpy: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpy2D: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpy2DAsync: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpy2DFromArray: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpy2DFromArrayAsync: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpy2DToArray: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpy2DToArrayAsync: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpy3D: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpy3DAsync: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyAsync: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyAtoH: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyDtoD: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyDtoDAsync: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyDtoH: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyDtoHAsync: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyFromArray: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyFromSymbol: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyFromSymbolAsync: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyHtoA: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyHtoD: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyHtoDAsync: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyParam2D: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyParam2DAsync: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyPeer: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyPeerAsync: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyToArray: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyToSymbol: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyToSymbolAsync: + case 
ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyWithStream: + return true; + break; + default:; + } + return false; +} + +// kernel api calls +bool isKernelApi(uint32_t id) { + switch (id) { + case ROCPROFILER_HIP_RUNTIME_API_ID_hipExtLaunchKernel: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipExtLaunchMultiKernelMultiDevice: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipLaunchCooperativeKernel: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipLaunchCooperativeKernelMultiDevice: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipLaunchKernel: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipModuleLaunchCooperativeKernel: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipModuleLaunchCooperativeKernelMultiDevice: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipModuleLaunchKernel: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipExtModuleLaunchKernel: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipHccModuleLaunchKernel: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipLaunchCooperativeKernel_spt: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipLaunchKernel_spt: + return true; + break; + default:; + } + return false; +} + +// malloc api calls +bool isMallocApi(uint32_t id) { + switch (id) { + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMalloc: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipFree: + return true; + break; + default:; + } + return false; +} + +class RocprofApiIdList : public ApiIdList { + public: + RocprofApiIdList(callback_name_info& names); + uint32_t mapName(const std::string& apiName) override; + std::vector allEnabled(); + + private: + std::unordered_map nameMap_; +}; + +} // namespace + +class RocprofLoggerShared { + public: + static RocprofLoggerShared& singleton(); + + rocprofiler_client_id_t* clientId{nullptr}; + rocprofiler_tool_configure_result_t cfg = rocprofiler_tool_configure_result_t{ + sizeof(rocprofiler_tool_configure_result_t), + &RocprofLogger::toolInit, + &RocprofLogger::toolFinialize, + nullptr}; + + // Contexts + rocprofiler_context_id_t utilityContext = {0}; + rocprofiler_context_id_t context = {0}; + + // Buffers + 
rocprofiler_buffer_id_t buffer = {}; + + // Manage kernel names - #betterThanRoctracer + kernel_symbol_map_t kernel_info = {}; + kernel_name_map_t kernel_names = {}; + std::mutex kernel_lock; + + // Manage buffer name - #betterThanRoctracer + callback_name_info name_info = {}; + buffer_name_info buff_name_info = {}; + + // Agent info + // + agent_info_map_t agents = {}; + + std::map kernelargs; + std::map copyargs; + + private: + RocprofLoggerShared() { + s = this; + } + ~RocprofLoggerShared() { + s = nullptr; + } +}; + +RocprofLoggerShared& RocprofLoggerShared::singleton() { + static RocprofLoggerShared* instance = new RocprofLoggerShared(); // Leak this + return *instance; +} + +std::vector get_gpu_device_agents() { + std::vector agents; + + // Callback used by rocprofiler_query_available_agents to return + // agents on the device. This can include CPU agents as well. We + // select GPU agents only (i.e. type == ROCPROFILER_AGENT_TYPE_GPU) + rocprofiler_query_available_agents_cb_t iterate_cb = + [](rocprofiler_agent_version_t agents_ver, + const void** agents_arr, + size_t num_agents, + void* udata) { + if (agents_ver != ROCPROFILER_AGENT_INFO_VERSION_0) + throw std::runtime_error{"unexpected rocprofiler agent version"}; + auto* agents_v = + static_cast*>(udata); + for (size_t i = 0; i < num_agents; ++i) { + const auto* agent = + static_cast(agents_arr[i]); + // if(agent->type == ROCPROFILER_AGENT_TYPE_GPU) + // agents_v->emplace_back(*agent); + agents_v->emplace_back(*agent); + } + return ROCPROFILER_STATUS_SUCCESS; + }; + + // Query the agents, only a single callback is made that contains a vector + // of all agents. 
+ rocprofiler_query_available_agents( + ROCPROFILER_AGENT_INFO_VERSION_0, + iterate_cb, + sizeof(rocprofiler_agent_t), + const_cast(static_cast(&agents))); + return agents; +} + +// +// Static setup +// +extern "C" rocprofiler_tool_configure_result_t* rocprofiler_configure( + uint32_t version, + const char* runtime_version, + uint32_t priority, + rocprofiler_client_id_t* id) { + RocprofLoggerShared::singleton(); // CRITICAL: static init + + id->name = "kineto"; + s->clientId = id; + + // return pointer to configure data + return &s->cfg; +} + +int RocprofLogger::toolInit( + rocprofiler_client_finalize_t finialize_func, + void* tool_data) { + // Gather api names + s->name_info = rocprofiler::sdk::get_callback_tracing_names(); + s->buff_name_info = rocprofiler::sdk::get_buffer_tracing_names(); + + // Gather agent info + auto agent_info = get_gpu_device_agents(); + for (auto agent : agent_info) { + s->agents[agent.id.handle] = agent; + } + + // + // Setup utility context to gather code object info + // + rocprofiler_create_context(&s->utilityContext); + auto code_object_ops = std::vector{ + ROCPROFILER_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER}; + + rocprofiler_configure_callback_tracing_service( + s->utilityContext, + ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, + code_object_ops.data(), + code_object_ops.size(), + RocprofLogger::code_object_callback, + nullptr); + { + int isValid = 0; + rocprofiler_context_is_valid(s->utilityContext, &isValid); + if (isValid == 0) { + s->utilityContext.handle = 0; // Can't destroy it, so leak it + return -1; + } + } + rocprofiler_start_context(s->utilityContext); + + // + // select some api calls to omit, in the most inconvenient way possible + // #betterThanRoctracer + RocprofApiIdList apiList(s->name_info); + apiList.setInvertMode(true); // Omit the specified api + apiList.add("hipGetDevice"); + apiList.add("hipSetDevice"); + apiList.add("hipGetLastError"); + apiList.add("__hipPushCallConfiguration"); + 
apiList.add("__hipPopCallConfiguration"); + apiList.add("hipCtxSetCurrent"); + apiList.add("hipEventRecord"); + apiList.add("hipEventQuery"); + apiList.add("hipGetDeviceProperties"); + apiList.add("hipPeekAtLastError"); + apiList.add("hipModuleGetFunction"); + apiList.add("hipEventCreateWithFlags"); + + // Get a vector of the enabled api calls + auto apis = apiList.allEnabled(); + + // + // Setup main context to collect runtime and kernel info + // + rocprofiler_create_context(&s->context); + + // Collect api info via callback + rocprofiler_configure_callback_tracing_service( + s->context, + ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API, + apis.data(), + apis.size(), + api_callback, + nullptr); + + // Collect async ops via buffers + constexpr auto buffer_size_bytes = 0x40000; + constexpr auto buffer_watermark_bytes = buffer_size_bytes / 2; + + rocprofiler_create_buffer(s->context, + buffer_size_bytes, + buffer_watermark_bytes, + ROCPROFILER_BUFFER_POLICY_LOSSLESS, + RocprofLogger::buffer_callback, + nullptr, + &s->buffer); + + rocprofiler_configure_buffer_tracing_service(s->context, + ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH, + nullptr, + 0, + s->buffer); + + rocprofiler_configure_buffer_tracing_service(s->context, + ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, + nullptr, + 0, + s->buffer); + { + int isValid = 0; + rocprofiler_context_is_valid(s->context, &isValid); + if (isValid == 0) { + s->context.handle = 0; // Can't destroy it, so leak it + return -1; + } + } + rocprofiler_stop_context(s->context); + + return 0; +} + +void RocprofLogger::toolFinialize(void* tool_data) { + rocprofiler_stop_context(s->utilityContext); + s->utilityContext.handle = 0; + rocprofiler_stop_context(s->context); + s->context.handle = 0; +} + +class Flush { + public: + std::mutex mutex_; + std::atomic maxCorrelationId_; + uint64_t maxCompletedCorrelationId_{0}; + void reportCorrelation(const uint64_t& cid) { + uint64_t prev = maxCorrelationId_; + while (prev < cid && 
!maxCorrelationId_.compare_exchange_weak(prev, cid)) { + } + } +}; + +RocprofLogger& RocprofLogger::singleton() { + static RocprofLogger instance; + return instance; +} + +RocprofLogger::RocprofLogger() {} + +RocprofLogger::~RocprofLogger() { + stopLogging(); + endTracing(); +} + +namespace { +thread_local std::deque + t_externalIds[RocLogger::CorrelationDomain::size]; +} + +void RocprofLogger::pushCorrelationID(uint64_t id, CorrelationDomain type) { + if (!singleton().externalCorrelationEnabled_) { + return; + } + t_externalIds[type].push_back(id); +} + +void RocprofLogger::popCorrelationID(CorrelationDomain type) { + if (!singleton().externalCorrelationEnabled_) { + return; + } + if (!t_externalIds[type].empty()) { + t_externalIds[type].pop_back(); + } else { + LOG(ERROR) + << "Attempt to popCorrelationID from an empty external Ids stack"; + } +} + +void RocprofLogger::clearLogs() { + rows_.clear(); + for (int i = 0; i < CorrelationDomain::size; ++i) { + externalCorrelations_[i].clear(); + } +} + +void RocprofLogger::insert_row_to_buffer(rocprofBase* row) { + RocprofLogger* dis = &singleton(); + std::lock_guard lock(dis->rowsMutex_); + if (dis->rows_.size() >= dis->maxBufferSize_) { + LOG_FIRST_N(WARNING, 10) + << "Exceeded max GPU buffer count (" << dis->rows_.size() << " > " + << dis->maxBufferSize_ << ") - terminating tracing"; + return; + } + dis->rows_.push_back(row); +} + +void RocprofLogger::code_object_callback( + rocprofiler_callback_tracing_record_t record, + rocprofiler_user_data_t* user_data, + void* callback_data) { + if (record.kind == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT && + record.operation == ROCPROFILER_CODE_OBJECT_LOAD) { + if (record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD) { + // flush the buffer to ensure that any lookups for the client kernel names + // for the code object are completed NOTE: not using buffer ATM + } + } else if ( + record.kind == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT && + record.operation == + 
ROCPROFILER_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER) { + auto* data = static_cast(record.payload); + if (record.phase == ROCPROFILER_CALLBACK_PHASE_LOAD) { + std::lock_guard lock(s->kernel_lock); + s->kernel_info.emplace(data->kernel_id, *data); + s->kernel_names.emplace(data->kernel_id, demangle(data->kernel_name)); + } else if (record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD) { + // FIXME: clear these? At minimum need kernel names at shutdown, async + // completion + // s->kernel_info.erase(data->kernel_id); + // s->kernel_names.erase(data->kernel_id); + } + } +} + +void RocprofLogger::api_callback( + rocprofiler_callback_tracing_record_t record, + rocprofiler_user_data_t* user_data, + void* callback_data) { + thread_local std::unordered_map timestamps; + + if (record.kind == ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API) { + if (record.phase == ROCPROFILER_CALLBACK_PHASE_ENTER) { + timestamps[record.correlation_id.internal] = getApproximateTime(); + } // ROCPROFILER_CALLBACK_PHASE_ENTER + else { // ROCPROFILER_CALLBACK_PHASE_EXIT + uint64_t startTime = timestamps[record.correlation_id.internal]; + timestamps.erase(record.correlation_id.internal); + uint64_t endTime = getApproximateTime(); + + // Kernel Launch Records + if (isKernelApi(record.operation)) { + kernel_args args; + rocprofiler_iterate_callback_tracing_kind_operation_args( + record, + extract_kernel_args, + 1 /*max_deref*/ + , + &args); + + rocprofKernelRow* row = new rocprofKernelRow( + record.correlation_id.internal, + record.kind, + record.operation, + processId(), + systemThreadId(), + startTime, + endTime, + nullptr, + nullptr, + args.workgroupSize.x, + args.workgroupSize.y, + args.workgroupSize.z, + args.gridSize.x, + args.gridSize.y, + args.gridSize.z, + args.groupSize, + args.stream); + insert_row_to_buffer(row); + + } + // Copy Records + else if (isCopyApi(record.operation)) { + copy_args args; + rocprofiler_iterate_callback_tracing_kind_operation_args( + record, + extract_copy_args, + 1 
/*max_deref*/ + , + &args); + + rocprofCopyRow* row = new rocprofCopyRow( + record.correlation_id.internal, + args.kind, + args.operation, + processId(), + systemThreadId(), + startTime, + endTime, + args.src, + args.dst, + args.size, + args.copyKind, + args.stream); + insert_row_to_buffer(row); + } + // Malloc Records + else if (isMallocApi(record.operation)) { + malloc_args args; + args.size = 0; + rocprofiler_iterate_callback_tracing_kind_operation_args( + record, + extract_malloc_args, + 1 /*max_deref*/ + , + &args); + rocprofMallocRow* row = new rocprofMallocRow( + record.correlation_id.internal, + record.kind, + record.operation, + processId(), + systemThreadId(), + startTime, + endTime, + args.ptr, + args.size); + insert_row_to_buffer(row); + } + // Default Records + else { + rocprofRow* row = new rocprofRow( + record.correlation_id.internal, + record.kind, + record.operation, + processId(), + systemThreadId(), + startTime, + endTime); + insert_row_to_buffer(row); + } + } // ROCPROFILER_CALLBACK_PHASE_EXIT + } // ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API +} + + +void RocprofLogger::buffer_callback(rocprofiler_context_id_t context, rocprofiler_buffer_id_t buffer_id, rocprofiler_record_header_t** headers, size_t num_headers, void* user_data, uint64_t drop_count) +{ + for (size_t i = 0; i < num_headers; ++i) { + auto* header = headers[i]; + + if (header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING) { + if (header->kind == ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH) { + auto& record = *(static_cast(header->payload)); + auto& dispatch = record.dispatch_info; + + rocprofAsyncRow* row = new rocprofAsyncRow( + record.correlation_id.internal, + record.kind, + record.operation, + record.operation, // shared op - No longer a thing. 
Placeholder + s->agents.at(dispatch.agent_id.handle).logical_node_type_id, + dispatch.queue_id.handle, + record.start_timestamp, + record.end_timestamp, + s->kernel_names.at(dispatch.kernel_id)); + insert_row_to_buffer(row); + } + else if (header->kind == ROCPROFILER_BUFFER_TRACING_MEMORY_COPY) { + auto &record = *(static_cast(header->payload)); + rocprofAsyncRow* row = new rocprofAsyncRow( + record.correlation_id.internal, + record.kind, + record.operation, + record.operation, // shared op - No longer a thing. Placeholder + s->agents.at(record.dst_agent_id.handle).logical_node_type_id, + 0, + record.start_timestamp, + record.end_timestamp, + ""); + insert_row_to_buffer(row); + } + } + } +} + + +std::string RocprofLogger::opString( + rocprofiler_callback_tracing_kind_t kind, + rocprofiler_tracing_operation_t op) { + return std::string(RocprofLoggerShared::singleton().name_info[kind][op]); +} + +std::string RocprofLogger::opString( + rocprofiler_buffer_tracing_kind_t kind, + rocprofiler_tracing_operation_t op) { + return std::string(RocprofLoggerShared::singleton().buff_name_info[kind][op]); +} + +void RocprofLogger::setMaxEvents(uint32_t maxBufferSize) { + RocprofLogger* dis = &singleton(); + std::lock_guard lock(dis->rowsMutex_); + maxBufferSize_ = maxBufferSize; +} + +void RocprofLogger::startLogging() { + if (!registered_) { + } + + externalCorrelationEnabled_ = true; + logging_ = true; + if (s != nullptr) + rocprofiler_start_context(s->context); + else + LOG(WARNING) << "Rocprofiler not configured"; +} + +void RocprofLogger::stopLogging() { + if (logging_ == false) + return; + logging_ = false; + + // Flush buffers + rocprofiler_flush_buffer(s->buffer); + + if (s != nullptr) + rocprofiler_stop_context(s->context); +} + +void RocprofLogger::endTracing() { + // This should be handled in RocprofLogger::toolFinialize +} + +// +// ApiIdList +// Jump through some extra hoops +// +// +RocprofApiIdList::RocprofApiIdList(callback_name_info& names) : nameMap_() { + auto& 
hipapis = + names[ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API].operations; + + for (size_t i = 0; i < hipapis.size(); ++i) { + nameMap_.emplace(hipapis[i], i); + } +} + +uint32_t RocprofApiIdList::mapName(const std::string& apiName) { + auto it = nameMap_.find(apiName); + if (it != nameMap_.end()) { + return it->second; + } + return 0; +} + +std::vector RocprofApiIdList::allEnabled() { + std::vector oplist; + for (auto& it : nameMap_) { + if (contains(it.second)) + oplist.push_back(it.second); + } + return oplist; +} +// +// +// diff --git a/libkineto/src/RocprofLogger.h b/libkineto/src/RocprofLogger.h new file mode 100644 index 000000000..4628a7ab1 --- /dev/null +++ b/libkineto/src/RocprofLogger.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "RocLogger.h" + +class RocprofLogger { + public: + RocprofLogger(); + RocprofLogger(const RocprofLogger&) = delete; + RocprofLogger& operator=(const RocprofLogger&) = delete; + + virtual ~RocprofLogger(); + + static RocprofLogger& singleton(); + + static void pushCorrelationID(uint64_t id, RocLogger::CorrelationDomain type); + static void popCorrelationID(RocLogger::CorrelationDomain type); + + void startLogging(); + void stopLogging(); + void clearLogs(); + void setMaxEvents(uint32_t maxBufferSize); + + static int toolInit( + rocprofiler_client_finalize_t finalize_func, + void* tool_data); + static void toolFinialize(void* tool_data); + + static std::string opString( + rocprofiler_callback_tracing_kind_t kind, + rocprofiler_tracing_operation_t op); + + static std::string opString( + rocprofiler_buffer_tracing_kind_t kind, + rocprofiler_tracing_operation_t op); + + private: + bool 
registered_{false}; + void endTracing(); + + static void insert_row_to_buffer(rocprofBase* row); + + // + static void api_callback( + rocprofiler_callback_tracing_record_t record, + rocprofiler_user_data_t* user_data, + void* callback_data); + static void buffer_callback( + rocprofiler_context_id_t context, + rocprofiler_buffer_id_t buffer_id, + rocprofiler_record_header_t** headers, + size_t num_headers, + void* user_data, + uint64_t drop_count); + static void code_object_callback( + rocprofiler_callback_tracing_record_t record, + rocprofiler_user_data_t* user_data, + void* callback_data); + + // Api callback data + uint32_t maxBufferSize_{1000000}; // 1M GPU runtime/kernel events. + std::vector rows_; + std::mutex rowsMutex_; + + // This vector collects pairs of correlationId and their respective + // externalCorrelationId for each CorrelationDomain. This will be used + // to populate the Correlation maps during post processing. + std::vector> + externalCorrelations_[RocLogger::CorrelationDomain::size]; + std::mutex externalCorrelationsMutex_; + + bool externalCorrelationEnabled_{true}; + bool logging_{false}; + + friend class libkineto::RocprofActivityApi; +}; From 2ad040a733f2c27be2f09a86d1853b088b1ea6e0 Mon Sep 17 00:00:00 2001 From: Michael Wootton Date: Mon, 25 Aug 2025 11:15:29 -0500 Subject: [PATCH 3/7] Rocprof: update maxBufferSize_. 
Delete records on clear --- libkineto/src/RocprofLogger.cpp | 2 ++ libkineto/src/RocprofLogger.h | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/libkineto/src/RocprofLogger.cpp b/libkineto/src/RocprofLogger.cpp index 401220541..5b6b67fb7 100644 --- a/libkineto/src/RocprofLogger.cpp +++ b/libkineto/src/RocprofLogger.cpp @@ -500,6 +500,8 @@ void RocprofLogger::popCorrelationID(CorrelationDomain type) { } void RocprofLogger::clearLogs() { + for (auto &row : rows_) + delete row; rows_.clear(); for (int i = 0; i < CorrelationDomain::size; ++i) { externalCorrelations_[i].clear(); diff --git a/libkineto/src/RocprofLogger.h b/libkineto/src/RocprofLogger.h index 4628a7ab1..d5ef9747f 100644 --- a/libkineto/src/RocprofLogger.h +++ b/libkineto/src/RocprofLogger.h @@ -79,7 +79,7 @@ class RocprofLogger { void* callback_data); // Api callback data - uint32_t maxBufferSize_{1000000}; // 1M GPU runtime/kernel events. + uint32_t maxBufferSize_{5000000}; // 5M GPU runtime/kernel events. 
std::vector rows_; std::mutex rowsMutex_; From 194c49fc2f62144ee78ee6383c052f1130981751 Mon Sep 17 00:00:00 2001 From: Michael Wootton Date: Fri, 5 Sep 2025 09:45:17 -0500 Subject: [PATCH 4/7] Restore Roctracer files --- libkineto/src/RoctracerActivity.h | 176 ++++++++++ libkineto/src/RoctracerActivityApi.cpp | 214 ++++++++++++ libkineto/src/RoctracerActivityApi.h | 72 ++++ libkineto/src/RoctracerActivity_inl.h | 255 ++++++++++++++ libkineto/src/RoctracerLogger.cpp | 454 +++++++++++++++++++++++++ libkineto/src/RoctracerLogger.h | 304 +++++++++++++++++ 6 files changed, 1475 insertions(+) create mode 100644 libkineto/src/RoctracerActivity.h create mode 100644 libkineto/src/RoctracerActivityApi.cpp create mode 100644 libkineto/src/RoctracerActivityApi.h create mode 100644 libkineto/src/RoctracerActivity_inl.h create mode 100644 libkineto/src/RoctracerLogger.cpp create mode 100644 libkineto/src/RoctracerLogger.h diff --git a/libkineto/src/RoctracerActivity.h b/libkineto/src/RoctracerActivity.h new file mode 100644 index 000000000..72083ab6b --- /dev/null +++ b/libkineto/src/RoctracerActivity.h @@ -0,0 +1,176 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include + +// TODO(T90238193) +// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude +#include "GenericTraceActivity.h" +#include "ITraceActivity.h" +#include "RoctracerLogger.h" +#include "ThreadUtil.h" + +namespace libkineto { +class ActivityLogger; +} + +namespace KINETO_NAMESPACE { + +using namespace libkineto; +struct TraceSpan; + +// These classes wrap the various Roctracer activity types +// into subclasses of ITraceActivity so that they can all be accessed +// using the ITraceActivity interface and logged via ActivityLogger. 
+ +// Abstract base class, templated on Roctracer activity type +template +struct RoctracerActivity : public ITraceActivity { + explicit RoctracerActivity(const T* activity, const ITraceActivity* linked) + : activity_(*activity), linked_(linked) {} + // Our stored timestamps (from roctracer and generated) are in CLOCK_MONOTONIC + // domain (in ns). Convert the timestamps. + int64_t timestamp() const override { + return activity_.begin; + } + int64_t duration() const override { + return activity_.end - activity_.begin; + } + int64_t correlationId() const override { + return 0; + } + int32_t getThreadId() const override { + return 0; + } + const ITraceActivity* linkedActivity() const override { + return linked_; + } + int flowType() const override { + return kLinkAsyncCpuGpu; + } + int64_t flowId() const override { + return correlationId(); + } + const T& raw() const { + return activity_; + } + const TraceSpan* traceSpan() const override { + return nullptr; + } + const std::string getMetadataValue(const std::string& key) const override { + auto it = metadata_.find(key); + if (it != metadata_.end()) { + return it->second; + } + return ""; + } + + protected: + const T& activity_; + const ITraceActivity* linked_{nullptr}; + std::unordered_map metadata_; +}; + +// roctracerAsyncRow - Roctracer GPU activities +struct GpuActivity : public RoctracerActivity { + explicit GpuActivity( + const roctracerAsyncRow* activity, + const ITraceActivity* linked) + : RoctracerActivity(activity, linked) { + switch (activity_.kind) { + case HIP_OP_COPY_KIND_DEVICE_TO_HOST_: + case HIP_OP_COPY_KIND_HOST_TO_DEVICE_: + case HIP_OP_COPY_KIND_DEVICE_TO_DEVICE_: + case HIP_OP_COPY_KIND_DEVICE_TO_HOST_2D_: + case HIP_OP_COPY_KIND_HOST_TO_DEVICE_2D_: + case HIP_OP_COPY_KIND_DEVICE_TO_DEVICE_2D_: + type_ = ActivityType::GPU_MEMCPY; + break; + case HIP_OP_COPY_KIND_FILL_BUFFER_: + type_ = ActivityType::GPU_MEMSET; + break; + case HIP_OP_DISPATCH_KIND_KERNEL_: + case HIP_OP_DISPATCH_KIND_TASK_: + 
default: + type_ = ActivityType::CONCURRENT_KERNEL; + break; + } + } + int64_t correlationId() const override { + return activity_.id; + } + int64_t deviceId() const override { + return activity_.device; + } + int64_t resourceId() const override { + return activity_.queue; + } + ActivityType type() const override { + return type_; + }; + bool flowStart() const override { + return false; + } + const std::string name() const override; + void log(ActivityLogger& logger) const override; + const std::string metadataJson() const override; + + // Add small buffer to fix visual error created by + // https://github.com/ROCm/roctracer/issues/105 Once this is resolved we can + // use ifdef to handle having this buffer or not based on version + int64_t timestamp() const override { + return activity_.begin + 1; + } + int64_t duration() const override { + return activity_.end - (activity_.begin + 1); + } + + private: + ActivityType type_; +}; + +// roctracerRow, roctracerKernelRow, roctracerCopyRow, roctracerMallocRow - +// Roctracer runtime activities +template +struct RuntimeActivity : public RoctracerActivity { + explicit RuntimeActivity(const T* activity, const ITraceActivity* linked) + : RoctracerActivity(activity, linked) {} + int64_t correlationId() const override { + return raw().id; + } + int64_t deviceId() const override { + return raw().pid; + } + int64_t resourceId() const override { + return raw().tid; + } + ActivityType type() const override { + return ActivityType::CUDA_RUNTIME; + } + bool flowStart() const override; + const std::string name() const override { + return std::string( + roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, raw().cid, 0)); + } + void log(ActivityLogger& logger) const override; + const std::string metadataJson() const override; + const T& raw() const { + return RoctracerActivity::raw(); + } +}; + +} // namespace KINETO_NAMESPACE + +// Include the implementation detail of this header file. 
+// The *_inl.h helps separate header interface from implementation details. +#include "RoctracerActivity_inl.h" diff --git a/libkineto/src/RoctracerActivityApi.cpp b/libkineto/src/RoctracerActivityApi.cpp new file mode 100644 index 000000000..0bec09d4a --- /dev/null +++ b/libkineto/src/RoctracerActivityApi.cpp @@ -0,0 +1,214 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "RoctracerActivityApi.h" + +#include +#include +#include +#include +#include "ApproximateClock.h" +#include "Demangle.h" +#include "Logger.h" +#include "ThreadUtil.h" +#include "output_base.h" + +using namespace std::chrono; + +namespace KINETO_NAMESPACE { + +RoctracerActivityApi& RoctracerActivityApi::singleton() { + static RoctracerActivityApi instance; + return instance; +} + +RoctracerActivityApi::RoctracerActivityApi() + : d(&RoctracerLogger::singleton()) {} + +RoctracerActivityApi::~RoctracerActivityApi() { + disableActivities(std::set()); +} + +void RoctracerActivityApi::pushCorrelationID(int id, CorrelationFlowType type) { +#ifdef HAS_ROCTRACER + if (!singleton().d->externalCorrelationEnabled_) { + return; + } + singleton().d->pushCorrelationID( + id, static_cast(type)); +#endif +} + +void RoctracerActivityApi::popCorrelationID(CorrelationFlowType type) { +#ifdef HAS_ROCTRACER + if (!singleton().d->externalCorrelationEnabled_) { + return; + } + singleton().d->popCorrelationID( + static_cast(type)); +#endif +} + +void RoctracerActivityApi::setMaxBufferSize(int size) { + // FIXME: implement? 
+ // maxGpuBufferCount_ = 1 + size / kBufSize; +} + +inline bool inRange(int64_t start, int64_t end, int64_t stamp) { + return ((stamp > start) && (stamp < end)); +} + +inline bool RoctracerActivityApi::isLogged( + libkineto::ActivityType atype) const { + return activityMaskSnapshot_ & (1 << static_cast(atype)); +} + +timestamp_t getTimeOffset() { + int64_t t0, t00; + timespec t1; + t0 = libkineto::getApproximateTime(); + clock_gettime(CLOCK_MONOTONIC, &t1); + t00 = libkineto::getApproximateTime(); + + // Confvert to ns (if necessary) + t0 = libkineto::get_time_converter()(t0); + t00 = libkineto::get_time_converter()(t00); + + // Our stored timestamps (from roctracer and generated) are in CLOCK_MONOTONIC + // domain (in ns). + return (t0 >> 1) + (t00 >> 1) - timespec_to_ns(t1); +} + +int RoctracerActivityApi::processActivities( + std::function handler, + std::function + correlationHandler) { + // Find offset to map from monotonic clock to system clock. + // This will break time-ordering of events but is status quo. + + int count = 0; + + // Process all external correlations pairs + for (int it = RoctracerLogger::CorrelationDomain::begin; + it < RoctracerLogger::CorrelationDomain::end; + ++it) { + auto& externalCorrelations = d->externalCorrelations_[it]; + for (auto& item : externalCorrelations) { + correlationHandler( + item.first, + item.second, + static_cast(it)); + } + std::lock_guard lock(d->externalCorrelationsMutex_); + externalCorrelations.clear(); + } + + // Async ops are in CLOCK_MONOTONIC rather than junk clock. + // Convert these timestamps, poorly. + // These accurate timestamps will skew when converted to approximate time + // The time_converter is not available at collection time. Or we could do a + // much better job. 
+ auto toffset = getTimeOffset(); + + // All Runtime API Calls + for (auto& item : d->rows_) { + bool filtered = false; + if (item->type != ROCTRACER_ACTIVITY_ASYNC && + !isLogged(ActivityType::CUDA_RUNTIME)) { + filtered = true; + } else { + switch (reinterpret_cast(item)->kind) { + case HIP_OP_COPY_KIND_DEVICE_TO_HOST_: + case HIP_OP_COPY_KIND_HOST_TO_DEVICE_: + case HIP_OP_COPY_KIND_DEVICE_TO_DEVICE_: + case HIP_OP_COPY_KIND_DEVICE_TO_HOST_2D_: + case HIP_OP_COPY_KIND_HOST_TO_DEVICE_2D_: + case HIP_OP_COPY_KIND_DEVICE_TO_DEVICE_2D_: + if (!isLogged(ActivityType::GPU_MEMCPY)) + filtered = true; + break; + case HIP_OP_COPY_KIND_FILL_BUFFER_: + if (!isLogged(ActivityType::GPU_MEMSET)) + filtered = true; + break; + case HIP_OP_DISPATCH_KIND_KERNEL_: + case HIP_OP_DISPATCH_KIND_TASK_: + default: + if (!isLogged(ActivityType::CONCURRENT_KERNEL)) + filtered = true; + // Don't record barriers/markers + if (reinterpret_cast(item)->op == + HIP_OP_ID_BARRIER) + filtered = true; + break; + } + } + if (!filtered) { + // Convert the begin and end timestamps from monotonic clock to system + // clock. 
+ if (item->type == ROCTRACER_ACTIVITY_ASYNC) { + // Async ops are in CLOCK_MONOTONIC, apply offset to converted + // approximate + item->begin += toffset; + item->end += toffset; + } else { + // Runtime ranges are in approximate clock, just apply conversion + item->begin = libkineto::get_time_converter()(item->begin); + item->end = libkineto::get_time_converter()(item->end); + } + handler(item); + ++count; + } + } + return count; +} + +// TODO: implement the actual flush with roctracer_flush_activity +void RoctracerActivityApi::flushActivities() {} + +void RoctracerActivityApi::clearActivities() { + d->clearLogs(); +} + +void RoctracerActivityApi::setMaxEvents(uint32_t maxEvents) { +#ifdef HAS_ROCTRACER + d->setMaxEvents(maxEvents); +#endif +} + +void RoctracerActivityApi::enableActivities( + const std::set& selected_activities) { +#ifdef HAS_ROCTRACER + d->startLogging(); + + for (const auto& activity : selected_activities) { + activityMask_ |= (1 << static_cast(activity)); + if (activity == ActivityType::EXTERNAL_CORRELATION) { + d->externalCorrelationEnabled_ = true; + } + } +#endif +} + +void RoctracerActivityApi::disableActivities( + const std::set& selected_activities) { +#ifdef HAS_ROCTRACER + d->stopLogging(); + + activityMaskSnapshot_ = activityMask_; + + for (const auto& activity : selected_activities) { + activityMask_ &= ~(1 << static_cast(activity)); + if (activity == ActivityType::EXTERNAL_CORRELATION) { + d->externalCorrelationEnabled_ = false; + } + } +#endif +} + +} // namespace KINETO_NAMESPACE diff --git a/libkineto/src/RoctracerActivityApi.h b/libkineto/src/RoctracerActivityApi.h new file mode 100644 index 000000000..54bf03f73 --- /dev/null +++ b/libkineto/src/RoctracerActivityApi.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once +#ifdef HAS_ROCTRACER + +#include +#include +#include + +#include +#include "RoctracerLogger.h" + +#include "ActivityType.h" +#include "GenericTraceActivity.h" + +class RoctracerLogger; + +namespace KINETO_NAMESPACE { + +using namespace libkineto; + +class RoctracerActivityApi { + public: + enum CorrelationFlowType { Default, User }; + + RoctracerActivityApi(); + RoctracerActivityApi(const RoctracerActivityApi&) = delete; + RoctracerActivityApi& operator=(const RoctracerActivityApi&) = delete; + + virtual ~RoctracerActivityApi(); + + static RoctracerActivityApi& singleton(); + + static void pushCorrelationID(int id, CorrelationFlowType type); + static void popCorrelationID(CorrelationFlowType type); + + void enableActivities(const std::set& selected_activities); + void disableActivities(const std::set& selected_activities); + void clearActivities(); + void flushActivities(); + void teardownContext() {} + void setMaxEvents(uint32_t maxEvents); + + virtual int processActivities( + std::function handler, + std::function< + void(uint64_t, uint64_t, RoctracerLogger::CorrelationDomain)> + correlationHandler); + + void setMaxBufferSize(int size); + + std::atomic_bool stopCollection{false}; + + private: + bool registered_{false}; + + // Enabled Activity Filters + uint32_t activityMask_{0}; + uint32_t activityMaskSnapshot_{0}; + bool isLogged(libkineto::ActivityType atype) const; + + RoctracerLogger* d; +}; + +} // namespace KINETO_NAMESPACE +#endif diff --git a/libkineto/src/RoctracerActivity_inl.h b/libkineto/src/RoctracerActivity_inl.h new file mode 100644 index 000000000..56d54f6a9 --- /dev/null +++ b/libkineto/src/RoctracerActivity_inl.h @@ -0,0 +1,255 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include "RoctracerActivity.h" + +#include +#include +#include + +#include "Demangle.h" +#include "output_base.h" + +namespace KINETO_NAMESPACE { + +using namespace libkineto; + +namespace { +thread_local std::unordered_map correlationToGrid; +thread_local std::unordered_map correlationToBlock; +thread_local std::unordered_map correlationToSize; +} // namespace + +const char* getGpuActivityKindString(uint32_t kind) { + switch (kind) { + case HIP_OP_COPY_KIND_DEVICE_TO_HOST_: + case HIP_OP_COPY_KIND_DEVICE_TO_HOST_2D_: + return "DtoH"; + case HIP_OP_COPY_KIND_HOST_TO_DEVICE_: + case HIP_OP_COPY_KIND_HOST_TO_DEVICE_2D_: + return "HtoD"; + case HIP_OP_COPY_KIND_DEVICE_TO_DEVICE_: + case HIP_OP_COPY_KIND_DEVICE_TO_DEVICE_2D_: + return "DtoD"; + case HIP_OP_COPY_KIND_FILL_BUFFER_: + return "Device"; + case HIP_OP_DISPATCH_KIND_KERNEL_: + return "Dispatch Kernel"; + case HIP_OP_DISPATCH_KIND_TASK_: + return "Dispatch Task"; + default: + break; + } + return ""; +} + +void getMemcpySrcDstString(uint32_t kind, std::string& src, std::string& dst) { + switch (kind) { + case HIP_OP_COPY_KIND_DEVICE_TO_HOST_: + case HIP_OP_COPY_KIND_DEVICE_TO_HOST_2D_: + src = "Device"; + dst = "Host"; + break; + case HIP_OP_COPY_KIND_HOST_TO_DEVICE_: + case HIP_OP_COPY_KIND_HOST_TO_DEVICE_2D_: + src = "Host"; + dst = "Device"; + break; + case HIP_OP_COPY_KIND_DEVICE_TO_DEVICE_: + case HIP_OP_COPY_KIND_DEVICE_TO_DEVICE_2D_: + src = "Device"; + dst = "Device"; + break; + default: + src = "?"; + dst = "?"; + break; + } +} + +// GPU Activities + +inline const std::string GpuActivity::name() const { + if (type_ == ActivityType::CONCURRENT_KERNEL) { + const char* name = roctracer_op_string(raw().domain, raw().op, raw().kind); + return demangle( + raw().kernelName.length() > 0 ? 
raw().kernelName : std::string(name)); + } else if (type_ == ActivityType::GPU_MEMSET) { + return fmt::format("Memset ({})", getGpuActivityKindString(raw().kind)); + } else if (type_ == ActivityType::GPU_MEMCPY) { + std::string src = ""; + std::string dst = ""; + getMemcpySrcDstString(raw().kind, src, dst); + return fmt::format( + "Memcpy {} ({} -> {})", getGpuActivityKindString(raw().kind), src, dst); + } else { + return ""; + } +} + +inline void GpuActivity::log(ActivityLogger& logger) const { + logger.handleActivity(*this); +} + +static inline std::string bandwidth(size_t bytes, uint64_t duration) { + return duration == 0 ? "\"N/A\"" : fmt::format("{}", bytes * 1.0 / duration); +} + +inline const std::string GpuActivity::metadataJson() const { + const auto& gpuActivity = raw(); + // clang-format off + + // if memcpy or memset, add size + if (correlationToSize.count(gpuActivity.id) > 0) { + size_t size = correlationToSize[gpuActivity.id]; + std::string bandwidth_gib = (bandwidth(size, gpuActivity.end - gpuActivity.begin)); + return fmt::format(R"JSON( + "device": {}, "stream": {}, + "correlation": {}, "kind": "{}", + "bytes": {}, "memory bandwidth (GB/s)": {})JSON", + gpuActivity.device, gpuActivity.queue, + gpuActivity.id, getGpuActivityKindString(gpuActivity.kind), + size, bandwidth_gib); + } + + // if compute kernel, add grid and block + else if (correlationToGrid.count(gpuActivity.id) > 0) { + return fmt::format(R"JSON( + "device": {}, "stream": {}, + "correlation": {}, "kind": "{}", + "grid": {}, "block": {})JSON", + gpuActivity.device, gpuActivity.queue, + gpuActivity.id, getGpuActivityKindString(gpuActivity.kind), + correlationToGrid[gpuActivity.id], correlationToBlock[gpuActivity.id]); + } else { + return fmt::format(R"JSON( + "device": {}, "stream": {}, + "correlation": {}, "kind": "{}")JSON", + gpuActivity.device, gpuActivity.queue, + gpuActivity.id, getGpuActivityKindString(gpuActivity.kind)); + } + // clang-format on +} + +// Runtime Activities + 
+template +inline bool RuntimeActivity::flowStart() const { + bool should_correlate = raw().cid == HIP_API_ID_hipLaunchKernel || + raw().cid == HIP_API_ID_hipExtLaunchKernel || + raw().cid == HIP_API_ID_hipLaunchCooperativeKernel || + raw().cid == HIP_API_ID_hipHccModuleLaunchKernel || + raw().cid == HIP_API_ID_hipModuleLaunchKernel || + raw().cid == HIP_API_ID_hipExtModuleLaunchKernel || + raw().cid == HIP_API_ID_hipMalloc || raw().cid == HIP_API_ID_hipFree || + raw().cid == HIP_API_ID_hipMemcpy || + raw().cid == HIP_API_ID_hipMemcpyAsync || + raw().cid == HIP_API_ID_hipMemcpyWithStream; + return should_correlate; +} + +template +inline void RuntimeActivity::log(ActivityLogger& logger) const { + logger.handleActivity(*this); +} + +template <> +inline const std::string RuntimeActivity::metadataJson() + const { + std::string kernel = ""; + if ((raw().functionAddr != nullptr)) { + kernel = fmt::format( + R"JSON( + "kernel": "{}", )JSON", + demangle(hipKernelNameRefByPtr(raw().functionAddr, raw().stream))); + } else if ((raw().function != nullptr)) { + kernel = fmt::format( + R"JSON( + "kernel": "{}", )JSON", + demangle(hipKernelNameRef(raw().function))); + } + // cache grid and block so we can pass it into async activity (GPU track) + correlationToGrid[raw().id] = fmt::format( + R"JSON( + [{}, {}, {}])JSON", + raw().gridX, + raw().gridY, + raw().gridZ); + + correlationToBlock[raw().id] = fmt::format( + R"JSON( + [{}, {}, {}])JSON", + raw().workgroupX, + raw().workgroupY, + raw().workgroupZ); + + return fmt::format( + R"JSON( + {}"cid": {}, "correlation": {}, + "grid": [{}, {}, {}], + "block": [{}, {}, {}], + "shared memory": {})JSON", + kernel, + raw().cid, + raw().id, + raw().gridX, + raw().gridY, + raw().gridZ, + raw().workgroupX, + raw().workgroupY, + raw().workgroupZ, + raw().groupSegmentSize); +} + +template <> +inline const std::string RuntimeActivity::metadataJson() + const { + correlationToSize[raw().id] = raw().size; + return fmt::format( + R"JSON( + "cid": 
{}, "correlation": {}, "src": "{}", "dst": "{}", "bytes": "{}", "kind": "{}")JSON", + raw().cid, + raw().id, + raw().src, + raw().dst, + raw().size, + fmt::underlying(raw().kind)); +} + +template <> +inline const std::string RuntimeActivity::metadataJson() + const { + correlationToSize[raw().id] = raw().size; + std::string size = ""; + if (raw().cid == HIP_API_ID_hipMalloc) { + size = fmt::format( + R"JSON( + "bytes": {}, )JSON", + raw().size); + } + return fmt::format( + R"JSON( + {}"cid": {}, "correlation": {}, "ptr": "{}")JSON", + size, + raw().cid, + raw().id, + raw().ptr); +} + +template +inline const std::string RuntimeActivity::metadataJson() const { + return fmt::format( + R"JSON( + "cid": {}, "correlation": {})JSON", + raw().cid, + raw().id); +} + +} // namespace KINETO_NAMESPACE diff --git a/libkineto/src/RoctracerLogger.cpp b/libkineto/src/RoctracerLogger.cpp new file mode 100644 index 000000000..725c4d0b0 --- /dev/null +++ b/libkineto/src/RoctracerLogger.cpp @@ -0,0 +1,454 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include "RoctracerLogger.h" + +#include +#include +#include +#include +#include + +#include "ApproximateClock.h" +#include "Demangle.h" +#include "Logger.h" +#include "ThreadUtil.h" + +using namespace libkineto; +using namespace std::chrono; + +class Flush { + public: + std::mutex mutex_; + std::atomic maxCorrelationId_; + uint64_t maxCompletedCorrelationId_{0}; + void reportCorrelation(const uint64_t& cid) { + uint64_t prev = maxCorrelationId_; + while (prev < cid && !maxCorrelationId_.compare_exchange_weak(prev, cid)) { + } + } +}; +static Flush s_flush; + +RoctracerLogger& RoctracerLogger::singleton() { + static RoctracerLogger instance; + return instance; +} + +RoctracerLogger::RoctracerLogger() {} + +RoctracerLogger::~RoctracerLogger() { + stopLogging(); + endTracing(); +} + +namespace { +thread_local std::deque + t_externalIds[RoctracerLogger::CorrelationDomain::size]; +} + +void RoctracerLogger::pushCorrelationID(uint64_t id, CorrelationDomain type) { + if (!singleton().externalCorrelationEnabled_) { + return; + } + t_externalIds[type].push_back(id); +} + +void RoctracerLogger::popCorrelationID(CorrelationDomain type) { + if (!singleton().externalCorrelationEnabled_) { + return; + } + if (!t_externalIds[type].empty()) { + t_externalIds[type].pop_back(); + } else { + LOG(ERROR) + << "Attempt to popCorrelationID from an empty external Ids stack"; + } +} + +void RoctracerLogger::clearLogs() { + rows_.clear(); + for (int i = 0; i < CorrelationDomain::size; ++i) { + externalCorrelations_[i].clear(); + } +} + +void RoctracerLogger::insert_row_to_buffer(roctracerBase* row) { + RoctracerLogger* dis = &singleton(); + std::lock_guard lock(dis->rowsMutex_); + if (dis->rows_.size() >= dis->maxBufferSize_) { + LOG_FIRST_N(WARNING, 10) + << "Exceeded max GPU buffer count (" << dis->rows_.size() << " > " + << dis->maxBufferSize_ << ") - terminating tracing"; + return; + } + dis->rows_.push_back(row); +} + +void RoctracerLogger::api_callback( + uint32_t domain, + 
uint32_t cid, + const void* callback_data, + void* arg) { + RoctracerLogger* dis = &singleton(); + + if (domain == ACTIVITY_DOMAIN_HIP_API && dis->loggedIds_.contains(cid)) { + const hip_api_data_t* data = (const hip_api_data_t*)(callback_data); + + // Pack callbacks into row structures + + thread_local std::unordered_map + timestamps; + + if (data->phase == ACTIVITY_API_PHASE_ENTER) { + timestamps[data->correlation_id] = getApproximateTime(); + } else { // (data->phase == ACTIVITY_API_PHASE_EXIT) + uint64_t startTime = timestamps[data->correlation_id]; + timestamps.erase(data->correlation_id); + uint64_t endTime = getApproximateTime(); + + switch (cid) { + case HIP_API_ID_hipLaunchKernel: + case HIP_API_ID_hipExtLaunchKernel: + case HIP_API_ID_hipLaunchCooperativeKernel: // Should work here + { + s_flush.reportCorrelation(data->correlation_id); + auto& args = data->args.hipLaunchKernel; + roctracerKernelRow* row = new roctracerKernelRow( + data->correlation_id, + domain, + cid, + processId(), + systemThreadId(), + startTime, + endTime, + args.function_address, + nullptr, + args.numBlocks.x, + args.numBlocks.y, + args.numBlocks.z, + args.dimBlocks.x, + args.dimBlocks.y, + args.dimBlocks.z, + args.sharedMemBytes, + args.stream); + insert_row_to_buffer(row); + } break; + case HIP_API_ID_hipHccModuleLaunchKernel: + case HIP_API_ID_hipModuleLaunchKernel: + case HIP_API_ID_hipExtModuleLaunchKernel: { + s_flush.reportCorrelation(data->correlation_id); + auto& args = data->args.hipModuleLaunchKernel; + roctracerKernelRow* row = new roctracerKernelRow( + data->correlation_id, + domain, + cid, + processId(), + systemThreadId(), + startTime, + endTime, + nullptr, + args.f, + args.gridDimX, + args.gridDimY, + args.gridDimZ, + args.blockDimX, + args.blockDimY, + args.blockDimZ, + args.sharedMemBytes, + args.stream); + insert_row_to_buffer(row); + } break; + case HIP_API_ID_hipLaunchCooperativeKernelMultiDevice: + case HIP_API_ID_hipExtLaunchMultiKernelMultiDevice: +#if 0 + { + 
auto &args = data->args.hipLaunchCooperativeKernelMultiDevice.launchParamsList__val; + roctracerKernelRow* row = new roctracerKernelRow( + data->correlation_id, + domain, + cid, + processId(), + systemThreadId(), + startTime, + endTime, + args.function_address, + nullptr, + args.numBlocks.x, + args.numBlocks.y, + args.numBlocks.z, + args.dimBlocks.x, + args.dimBlocks.y, + args.dimBlocks.z, + args.sharedMemBytes, + args.stream + ); + insert_row_to_buffer(row); + } +#endif + break; + case HIP_API_ID_hipMalloc: { + roctracerMallocRow* row = new roctracerMallocRow( + data->correlation_id, + domain, + cid, + processId(), + systemThreadId(), + startTime, + endTime, + data->args.hipMalloc.ptr__val, + data->args.hipMalloc.size); + insert_row_to_buffer(row); + } break; + case HIP_API_ID_hipFree: { + roctracerMallocRow* row = new roctracerMallocRow( + data->correlation_id, + domain, + cid, + processId(), + systemThreadId(), + startTime, + endTime, + data->args.hipFree.ptr, + 0); + insert_row_to_buffer(row); + } break; + case HIP_API_ID_hipMemcpy: { + auto& args = data->args.hipMemcpy; + roctracerCopyRow* row = new roctracerCopyRow( + data->correlation_id, + domain, + cid, + processId(), + systemThreadId(), + startTime, + endTime, + args.src, + args.dst, + args.sizeBytes, + args.kind, + static_cast(0) // use placeholder? 
+ ); + insert_row_to_buffer(row); + } break; + case HIP_API_ID_hipMemcpyAsync: + case HIP_API_ID_hipMemcpyWithStream: { + auto& args = data->args.hipMemcpyAsync; + roctracerCopyRow* row = new roctracerCopyRow( + data->correlation_id, + domain, + cid, + processId(), + systemThreadId(), + startTime, + endTime, + args.src, + args.dst, + args.sizeBytes, + args.kind, + args.stream); + insert_row_to_buffer(row); + } break; + default: { + roctracerRow* row = new roctracerRow( + data->correlation_id, + domain, + cid, + processId(), + systemThreadId(), + startTime, + endTime); + insert_row_to_buffer(row); + } break; + } // switch + // External correlation + for (int it = CorrelationDomain::begin; it < CorrelationDomain::end; + ++it) { + if (t_externalIds[it].size() > 0) { + std::lock_guard lock(dis->externalCorrelationsMutex_); + dis->externalCorrelations_[it].emplace_back( + data->correlation_id, t_externalIds[it].back()); + } + } + } // phase exit + } +} + +void RoctracerLogger::activity_callback( + const char* begin, + const char* end, + void* arg) { + // Log latest completed correlation id. Used to ensure we have flushed all + // data on stop + std::unique_lock lock(s_flush.mutex_); + const roctracer_record_t* record = (const roctracer_record_t*)(begin); + const roctracer_record_t* end_record = (const roctracer_record_t*)(end); + + while (record < end_record) { + if (record->correlation_id > s_flush.maxCompletedCorrelationId_) { + s_flush.maxCompletedCorrelationId_ = record->correlation_id; + } + roctracerAsyncRow* row = new roctracerAsyncRow( + record->correlation_id, + record->domain, + record->kind, + record->op, + record->device_id, + record->queue_id, + record->begin_ns, + record->end_ns, + ((record->kind == HIP_OP_DISPATCH_KIND_KERNEL_) || + (record->kind == HIP_OP_DISPATCH_KIND_TASK_)) + ? 
demangle(record->kernel_name) + : std::string()); + insert_row_to_buffer(row); + roctracer_next_record(record, &record); + } +} + +void RoctracerLogger::setMaxEvents(uint32_t maxBufferSize) { +#ifdef HAS_ROCTRACER + RoctracerLogger* dis = &singleton(); + std::lock_guard lock(dis->rowsMutex_); + maxBufferSize_ = maxBufferSize; +#endif +} + +void RoctracerLogger::startLogging() { + if (!registered_) { + roctracer_set_properties( + ACTIVITY_DOMAIN_HIP_API, nullptr); // Magic encantation + + // Set some api calls to ignore + loggedIds_.setInvertMode(true); // Omit the specified api + loggedIds_.add("hipGetDevice"); + loggedIds_.add("hipSetDevice"); + loggedIds_.add("hipGetLastError"); + loggedIds_.add("__hipPushCallConfiguration"); + loggedIds_.add("__hipPopCallConfiguration"); + loggedIds_.add("hipCtxSetCurrent"); + loggedIds_.add("hipEventRecord"); + loggedIds_.add("hipEventQuery"); + loggedIds_.add("hipGetDeviceProperties"); + loggedIds_.add("hipPeekAtLastError"); + loggedIds_.add("hipModuleGetFunction"); + loggedIds_.add("hipEventCreateWithFlags"); + loggedIds_.add("hipGetDeviceCount"); + loggedIds_.add("hipDevicePrimaryCtxGetState"); + + // Enable API callbacks + if (loggedIds_.invertMode() == true) { + // exclusion list - enable entire domain and turn off things in list + roctracer_enable_domain_callback( + ACTIVITY_DOMAIN_HIP_API, api_callback, nullptr); + const std::unordered_map& filter = + loggedIds_.filterList(); + for (auto it = filter.begin(); it != filter.end(); ++it) { + roctracer_disable_op_callback(ACTIVITY_DOMAIN_HIP_API, it->first); + } + } else { + // inclusion list - only enable things in the list + const std::unordered_map& filter = + loggedIds_.filterList(); + roctracer_disable_domain_callback(ACTIVITY_DOMAIN_HIP_API); + for (auto it = filter.begin(); it != filter.end(); ++it) { + roctracer_enable_op_callback( + ACTIVITY_DOMAIN_HIP_API, it->first, api_callback, nullptr); + } + } + // roctracer_enable_domain_callback(ACTIVITY_DOMAIN_ROCTX, 
api_callback, + // nullptr); + + // Allocate default tracing pool + roctracer_properties_t properties; + memset(&properties, 0, sizeof(roctracer_properties_t)); + properties.buffer_size = 0x1000; + roctracer_open_pool(&properties); + + // Enable async op collection + roctracer_properties_t hcc_cb_properties; + memset(&hcc_cb_properties, 0, sizeof(roctracer_properties_t)); + hcc_cb_properties.buffer_size = 0x4000; + hcc_cb_properties.buffer_callback_fun = activity_callback; + roctracer_open_pool_expl(&hcc_cb_properties, &hccPool_); + roctracer_enable_domain_activity_expl(ACTIVITY_DOMAIN_HCC_OPS, hccPool_); + + registered_ = true; + } + + externalCorrelationEnabled_ = true; + logging_ = true; + roctracer_start(); +} + +void RoctracerLogger::stopLogging() { + if (logging_ == false) + return; + logging_ = false; + + hipError_t err = hipDeviceSynchronize(); + if (err != hipSuccess) { + LOG(ERROR) << "hipDeviceSynchronize failed with code " << err; + } + roctracer_flush_activity_expl(hccPool_); + + // If we are stopping the tracer, implement reliable flushing + std::unique_lock lock(s_flush.mutex_); + + auto correlationId = + s_flush.maxCorrelationId_.load(); // load ending id from the running max + + // Poll on the worker finding the final correlation id + int timeout = 50; + while ((s_flush.maxCompletedCorrelationId_ < correlationId) && --timeout) { + lock.unlock(); + roctracer_flush_activity_expl(hccPool_); + usleep(1000); + lock.lock(); + } + + roctracer_stop(); +} + +void RoctracerLogger::endTracing() { + if (registered_ == true) { + roctracer_disable_domain_callback(ACTIVITY_DOMAIN_HIP_API); + // roctracer_disable_domain_callback(ACTIVITY_DOMAIN_ROCTX); + + roctracer_disable_domain_activity(ACTIVITY_DOMAIN_HCC_OPS); + roctracer_close_pool_expl(hccPool_); + hccPool_ = nullptr; + } +} + +ApiIdList::ApiIdList() : invert_(true) {} + +void ApiIdList::add(const std::string& apiName) { + uint32_t cid = 0; + if (roctracer_op_code( + ACTIVITY_DOMAIN_HIP_API, 
apiName.c_str(), &cid, nullptr) == + ROCTRACER_STATUS_SUCCESS) { + filter_[cid] = 1; + } +} +void ApiIdList::remove(const std::string& apiName) { + uint32_t cid = 0; + if (roctracer_op_code( + ACTIVITY_DOMAIN_HIP_API, apiName.c_str(), &cid, nullptr) == + ROCTRACER_STATUS_SUCCESS) { + filter_.erase(cid); + } +} + +bool ApiIdList::loadUserPrefs() { + // placeholder + return false; +} +bool ApiIdList::contains(uint32_t apiId) { + return (filter_.find(apiId) != filter_.end()) ? !invert_ : invert_; // XOR +} diff --git a/libkineto/src/RoctracerLogger.h b/libkineto/src/RoctracerLogger.h new file mode 100644 index 000000000..d28a9f08b --- /dev/null +++ b/libkineto/src/RoctracerLogger.h @@ -0,0 +1,304 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +// Local copy of hip op types. 
These are public (and stable) in later rocm +// releases +typedef enum { + HIP_OP_COPY_KIND_UNKNOWN_ = 0, + HIP_OP_COPY_KIND_DEVICE_TO_HOST_ = 0x11F3, + HIP_OP_COPY_KIND_HOST_TO_DEVICE_ = 0x11F4, + HIP_OP_COPY_KIND_DEVICE_TO_DEVICE_ = 0x11F5, + HIP_OP_COPY_KIND_DEVICE_TO_HOST_2D_ = 0x1201, + HIP_OP_COPY_KIND_HOST_TO_DEVICE_2D_ = 0x1202, + HIP_OP_COPY_KIND_DEVICE_TO_DEVICE_2D_ = 0x1203, + HIP_OP_COPY_KIND_FILL_BUFFER_ = 0x1207 +} hip_op_copy_kind_t_; + +typedef enum { + HIP_OP_DISPATCH_KIND_UNKNOWN_ = 0, + HIP_OP_DISPATCH_KIND_KERNEL_ = 0x11F0, + HIP_OP_DISPATCH_KIND_TASK_ = 0x11F1 +} hip_op_dispatch_kind_t_; + +typedef enum { HIP_OP_BARRIER_KIND_UNKNOWN_ = 0 } hip_op_barrier_kind_t_; +// end hip op defines + +namespace onnxruntime { +namespace profiling { +class RocmProfiler; +} +} // namespace onnxruntime + +namespace libkineto { +class RoctracerActivityApi; +} + +typedef uint64_t timestamp_t; + +static timestamp_t timespec_to_ns(const timespec& time) { + return ((timestamp_t)time.tv_sec * 1000000000) + time.tv_nsec; +} + +class ApiIdList { + public: + ApiIdList(); + bool invertMode() { + return invert_; + } + void setInvertMode(bool invert) { + invert_ = invert; + } + void add(const std::string& apiName); + void remove(const std::string& apiName); + bool loadUserPrefs(); + bool contains(uint32_t apiId); + const std::unordered_map& filterList() { + return filter_; + } + + private: + std::unordered_map filter_; + bool invert_; +}; + +typedef enum { + ROCTRACER_ACTIVITY_DEFAULT = 0, + ROCTRACER_ACTIVITY_KERNEL, + ROCTRACER_ACTIVITY_COPY, + ROCTRACER_ACTIVITY_MALLOC, + ROCTRACER_ACTIVITY_ASYNC, + ROCTRACER_ACTIVITY_NONE +} roctracer_activity_types; + +struct roctracerBase { + roctracerBase( + uint64_t id, + uint32_t domain, + uint64_t begin, + uint64_t end, + roctracer_activity_types type = ROCTRACER_ACTIVITY_NONE) + : id(id), begin(begin), end(end), domain(domain), type(type) {} + uint64_t id; // correlation_id + uint64_t begin; + uint64_t end; + uint32_t domain; + 
roctracer_activity_types type; +}; + +struct roctracerRow : public roctracerBase { + roctracerRow( + uint64_t id, + uint32_t domain, + uint32_t cid, + uint32_t pid, + uint32_t tid, + uint64_t begin, + uint64_t end, + roctracer_activity_types type = ROCTRACER_ACTIVITY_DEFAULT) + : roctracerBase(id, domain, begin, end, type), + cid(cid), + pid(pid), + tid(tid) {} + uint32_t cid; + uint32_t pid; + uint32_t tid; +}; + +struct roctracerKernelRow : public roctracerRow { + roctracerKernelRow( + uint64_t id, + uint32_t domain, + uint32_t cid, + uint32_t pid, + uint32_t tid, + uint64_t begin, + uint64_t end, + const void* faddr, + hipFunction_t function, + unsigned int gx, + unsigned int gy, + unsigned int gz, + unsigned int wx, + unsigned int wy, + unsigned int wz, + size_t gss, + hipStream_t stream, + roctracer_activity_types type = ROCTRACER_ACTIVITY_KERNEL) + : roctracerRow(id, domain, cid, pid, tid, begin, end, type), + functionAddr(faddr), + function(function), + gridX(gx), + gridY(gy), + gridZ(gz), + workgroupX(wx), + workgroupY(wy), + workgroupZ(wz), + groupSegmentSize(gss), + stream(stream) {} + const void* functionAddr; + hipFunction_t function; + unsigned int gridX; + unsigned int gridY; + unsigned int gridZ; + unsigned int workgroupX; + unsigned int workgroupY; + unsigned int workgroupZ; + size_t groupSegmentSize; + hipStream_t stream; +}; + +struct roctracerCopyRow : public roctracerRow { + roctracerCopyRow( + uint64_t id, + uint32_t domain, + uint32_t cid, + uint32_t pid, + uint32_t tid, + uint64_t begin, + uint64_t end, + const void* src, + const void* dst, + size_t size, + hipMemcpyKind kind, + hipStream_t stream, + roctracer_activity_types type = ROCTRACER_ACTIVITY_COPY) + : roctracerRow(id, domain, cid, pid, tid, begin, end, type), + src(src), + dst(dst), + size(size), + kind(kind), + stream(stream) {} + const void* src; + const void* dst; + size_t size; + hipMemcpyKind kind; + hipStream_t stream; +}; + +struct roctracerMallocRow : public roctracerRow { + 
roctracerMallocRow( + uint64_t id, + uint32_t domain, + uint32_t cid, + uint32_t pid, + uint32_t tid, + uint64_t begin, + uint64_t end, + const void* ptr, + size_t size, + roctracer_activity_types type = ROCTRACER_ACTIVITY_MALLOC) + : roctracerRow(id, domain, cid, pid, tid, begin, end, type), + ptr(ptr), + size(size) {} + const void* ptr; + size_t size; +}; + +struct roctracerAsyncRow : public roctracerBase { + roctracerAsyncRow( + uint64_t id, + uint32_t domain, + uint32_t kind, + uint32_t op, + int device, + uint64_t queue, + uint64_t begin, + uint64_t end, + const std::string& kernelName, + roctracer_activity_types type = ROCTRACER_ACTIVITY_ASYNC) + : roctracerBase(id, domain, begin, end, type), + kind(kind), + op(op), + device(device), + queue(queue), + kernelName(kernelName) {} + uint32_t kind; + uint32_t op; + int device; + uint64_t queue; + std::string kernelName; +}; + +class RoctracerLogger { + public: + enum CorrelationDomain { + begin, + Default = begin, + Domain0 = begin, + Domain1, + end, + size = end + }; + + RoctracerLogger(); + RoctracerLogger(const RoctracerLogger&) = delete; + RoctracerLogger& operator=(const RoctracerLogger&) = delete; + + virtual ~RoctracerLogger(); + + static RoctracerLogger& singleton(); + + static void pushCorrelationID(uint64_t id, CorrelationDomain type); + static void popCorrelationID(CorrelationDomain type); + + void startLogging(); + void stopLogging(); + void clearLogs(); + void setMaxEvents(uint32_t maxBufferSize); + + private: + bool registered_{false}; + void endTracing(); + + roctracer_pool_t* hccPool_{NULL}; + static void insert_row_to_buffer(roctracerBase* row); + static void api_callback( + uint32_t domain, + uint32_t cid, + const void* callback_data, + void* arg); + static void activity_callback(const char* begin, const char* end, void* arg); + + ApiIdList loggedIds_; + + // Api callback data + uint32_t maxBufferSize_{5000000}; // 5M GPU runtime/kernel events. 
+ std::vector rows_; + std::mutex rowsMutex_; + + // This vector collects pairs of correlationId and their respective + // externalCorrelationId for each CorrelationDomain. This will be used + // to populate the Correlation maps during post processing. + std::vector> + externalCorrelations_[CorrelationDomain::size]; + std::mutex externalCorrelationsMutex_; + + bool externalCorrelationEnabled_{true}; + bool logging_{false}; + + friend class onnxruntime::profiling::RocmProfiler; + friend class libkineto::RoctracerActivityApi; +}; From 76c1f1e1d119c828ba2b07de195849830db086b7 Mon Sep 17 00:00:00 2001 From: Michael Wootton Date: Tue, 9 Sep 2025 11:30:33 -0500 Subject: [PATCH 5/7] Rocprof: Fall back to roctracer for rocm < 6.4 --- libkineto/CMakeLists.txt | 60 +++++- libkineto/libkineto_defs.bzl | 9 +- libkineto/src/ActivityProfilerController.cpp | 9 + libkineto/src/CuptiActivityProfiler.cpp | 31 +++ libkineto/src/CuptiActivityProfiler.h | 12 +- libkineto/src/RocprofLogger.cpp | 5 +- libkineto/src/RoctracerActivity.h | 6 +- libkineto/src/RoctracerActivityApi.cpp | 18 +- libkineto/src/RoctracerActivityApi.h | 4 +- libkineto/src/RoctracerActivity_inl.h | 6 +- libkineto/src/RoctracerLogger.cpp | 65 ++---- libkineto/src/RoctracerLogger.h | 208 +------------------ 12 files changed, 170 insertions(+), 263 deletions(-) diff --git a/libkineto/CMakeLists.txt b/libkineto/CMakeLists.txt index 4d025a7cd..ccd6ed552 100644 --- a/libkineto/CMakeLists.txt +++ b/libkineto/CMakeLists.txt @@ -110,14 +110,65 @@ else() set(LIBKINETO_NOXPUPTI ON) endif() +# Detect ROCM Version +if(NOT LIBKINETO_NOROCTRACER) + if(NOT ROCM_INCLUDE_DIRS) + set(ROCM_INCLUDE_DIRS "${ROCM_SOURCE_DIR}/include") + endif() + + find_file(ROCM_VERSION_HEADER_PATH + NAMES rocm-core/rocm_version.h + NO_DEFAULT_PATH + PATHS ${ROCM_INCLUDE_DIRS} + ) + + if(EXISTS ${ROCM_VERSION_HEADER_PATH}) + set(ROCM_HEADER_FILE ${ROCM_VERSION_HEADER_PATH}) + endif() + + # Read the ROCM headerfile into a variable + message(STATUS 
"Reading ROCM version from: ${ROCM_HEADER_FILE}") + file(READ "${ROCM_HEADER_FILE}" ROCM_HEADER_CONTENT) + + string(REGEX MATCH "ROCM_VERSION_MAJOR[ ]+[0-9]+" TEMP1 ${ROCM_HEADER_CONTENT}) + string(REPLACE "ROCM_VERSION_MAJOR" "" TEMP2 ${TEMP1}) + string(STRIP ${TEMP2} ROCM_VERSION_DEV_MAJOR) + string(REGEX MATCH "ROCM_VERSION_MINOR[ ]+[0-9]+" TEMP1 ${ROCM_HEADER_CONTENT}) + string(REPLACE "ROCM_VERSION_MINOR" "" TEMP2 ${TEMP1}) + string(STRIP ${TEMP2} ROCM_VERSION_DEV_MINOR) + string(REGEX MATCH "ROCM_VERSION_PATCH[ ]+[0-9]+" TEMP1 ${ROCM_HEADER_CONTENT}) + string(REPLACE "ROCM_VERSION_PATCH" "" TEMP2 ${TEMP1}) + string(STRIP ${TEMP2} ROCM_VERSION_DEV_PATCH) + + message(STATUS "ROCM major: ${ROCM_VERSION_DEV_MAJOR}") + message(STATUS "ROCM minor: ${ROCM_VERSION_DEV_MINOR}") + message(STATUS "ROCM patch: ${ROCM_VERSION_DEV_PATCH}") + +# Use rocprofiler-sdk for rocm version 6.4 forward + if ((${ROCM_VERSION_DEV_MAJOR} GREATER_EQUAL 7) OR (${ROCM_VERSION_DEV_MAJOR} GREATER_EQUAL 6 AND ${ROCM_VERSION_DEV_MINOR} GREATER_EQUAL 4)) + set(USE_ROCPROFILER_SDK ON) + endif() + if (${USE_ROCPROFILER_SDK}) + message(STATUS "Building with: rocprofiler-sdk") + else() + message(STATUS "Building with: libroctracer") + endif() +endif() + # Define file lists if (LIBKINETO_NOCUPTI AND LIBKINETO_NOROCTRACER AND LIBKINETO_NOXPUPTI) get_filelist("get_libkineto_cpu_only_srcs(with_api=False)" LIBKINETO_SRCS) message(STATUS " CUPTI unavailable or disabled - not building GPU profilers") else() if(NOT LIBKINETO_NOROCTRACER) - get_filelist("get_libkineto_roctracer_srcs(with_api=False)" LIBKINETO_roc_SRCS) - message(STATUS " Building with roctracer") + if (${USE_ROCPROFILER_SDK}) + get_filelist("get_libkineto_rocprofiler_srcs(with_api=False)" LIBKINETO_roc_SRCS) + message(STATUS " Building with rocprofiler-sdk") + else() + get_filelist("get_libkineto_roctracer_srcs(with_api=False)" LIBKINETO_roc_SRCS) + add_compile_options(-DROCTRACER_FALLBACK) + message(STATUS " Building with roctracer") + 
endif() elseif(NOT LIBKINETO_NOCUPTI) get_filelist("get_libkineto_cupti_srcs(with_api=False)" LIBKINETO_cuda_SRCS) endif() @@ -238,8 +289,13 @@ target_include_directories(kineto PUBLIC $) if(NOT LIBKINETO_NOROCTRACER) +if (${USE_ROCPROFILER_SDK}) find_library(ROCPROF_LIBRARY NAMES librocprofiler-sdk.so HINTS ${ROCM_SOURCE_DIR}/lib) +else() + find_library(ROCPROF_LIBRARY NAMES libroctracer64.so HINTS + ${ROCM_SOURCE_DIR}/lib) +endif() target_link_libraries(kineto "${ROCPROF_LIBRARY}") find_library(KINETO_HIP_LIBRARY NAMES libamdhip64.so HINTS ${ROCM_SOURCE_DIR}/lib) diff --git a/libkineto/libkineto_defs.bzl b/libkineto/libkineto_defs.bzl index f22633d0a..7f717df45 100644 --- a/libkineto/libkineto_defs.bzl +++ b/libkineto/libkineto_defs.bzl @@ -27,13 +27,20 @@ def get_libkineto_cupti_srcs(with_api = True): "src/cupti_strings.cpp", ] + (get_libkineto_cpu_only_srcs(with_api)) -def get_libkineto_roctracer_srcs(with_api = True): +def get_libkineto_rocprofiler_srcs(with_api = True): return [ "src/RocprofActivityApi.cpp", "src/RocprofLogger.cpp", "src/RocLogger.cpp", ] + (get_libkineto_cpu_only_srcs(with_api)) +def get_libkineto_roctracer_srcs(with_api = True): + return [ + "src/RoctracerActivityApi.cpp", + "src/RoctracerLogger.cpp", + "src/RocLogger.cpp", + ] + (get_libkineto_cpu_only_srcs(with_api)) + def get_libkineto_xpupti_srcs(with_api = True): return [ "src/plugin/xpupti/XpuptiActivityApi.cpp", diff --git a/libkineto/src/ActivityProfilerController.cpp b/libkineto/src/ActivityProfilerController.cpp index 4c45c6b85..24dd6e1bb 100644 --- a/libkineto/src/ActivityProfilerController.cpp +++ b/libkineto/src/ActivityProfilerController.cpp @@ -18,7 +18,11 @@ #include "CuptiActivityApi.h" #ifdef HAS_ROCTRACER + #ifndef ROCTRACER_FALLBACK #include "RocprofActivityApi.h" + #else +#include "RoctracerActivityApi.h" + #endif #endif #include "ThreadUtil.h" @@ -67,8 +71,13 @@ ActivityProfilerController::ActivityProfilerController( #endif // !USE_GOOGLE_LOG #ifdef HAS_ROCTRACER + 
#ifndef ROCTRACER_FALLBACK profiler_ = std::make_unique( RocprofActivityApi::singleton(), cpuOnly); + #else + profiler_ = std::make_unique( + RoctracerActivityApi::singleton(), cpuOnly); + #endif #else profiler_ = std::make_unique( CuptiActivityApi::singleton(), cpuOnly); diff --git a/libkineto/src/CuptiActivityProfiler.cpp b/libkineto/src/CuptiActivityProfiler.cpp index bcb97844e..ecefbbf41 100644 --- a/libkineto/src/CuptiActivityProfiler.cpp +++ b/libkineto/src/CuptiActivityProfiler.cpp @@ -40,8 +40,13 @@ #endif // HAS_CUPTI #ifdef HAS_ROCTRACER #include "RocLogger.h" + #ifndef ROCTRACER_FALLBACK #include "RocprofActivity.h" #include "RocprofActivityApi.h" + #else +#include "RoctracerActivity.h" +#include "RoctracerActivityApi.h" + #endif #endif #ifdef HAS_XPUPTI #include "plugin/xpupti/XpuptiActivityProfiler.h" @@ -214,9 +219,15 @@ void CuptiActivityProfiler::transferCpuTrace( } #ifdef HAS_ROCTRACER + #ifndef ROCTRACER_FALLBACK CuptiActivityProfiler::CuptiActivityProfiler( RocprofActivityApi& cupti, bool cpuOnly) + #else +CuptiActivityProfiler::CuptiActivityProfiler( + RoctracerActivityApi& cupti, + bool cpuOnly) + #endif #else CuptiActivityProfiler::CuptiActivityProfiler( CuptiActivityApi& cupti, @@ -1569,8 +1580,13 @@ void CuptiActivityProfiler::pushCorrelationId(uint64_t id) { id, CuptiActivityApi::CorrelationFlowType::Default); #endif // HAS_CUPTI #ifdef HAS_ROCTRACER + #ifndef ROCTRACER_FALLBACK RocprofActivityApi::pushCorrelationID( id, RocprofActivityApi::CorrelationFlowType::Default); + #else + RoctracerActivityApi::pushCorrelationID( + id, RoctracerActivityApi::CorrelationFlowType::Default); + #endif #endif for (auto& session : sessions_) { session->pushCorrelationId(id); @@ -1583,8 +1599,13 @@ void CuptiActivityProfiler::popCorrelationId() { CuptiActivityApi::CorrelationFlowType::Default); #endif // HAS_CUPTI #ifdef HAS_ROCTRACER + #ifndef ROCTRACER_FALLBACK RocprofActivityApi::popCorrelationID( RocprofActivityApi::CorrelationFlowType::Default); + 
#else + RoctracerActivityApi::popCorrelationID( + RoctracerActivityApi::CorrelationFlowType::Default); + #endif #endif for (auto& session : sessions_) { session->popCorrelationId(); @@ -1597,8 +1618,13 @@ void CuptiActivityProfiler::pushUserCorrelationId(uint64_t id) { id, CuptiActivityApi::CorrelationFlowType::User); #endif // HAS_CUPTI #ifdef HAS_ROCTRACER + #ifndef ROCTRACER_FALLBACK RocprofActivityApi::pushCorrelationID( id, RocprofActivityApi::CorrelationFlowType::User); + #else + RoctracerActivityApi::pushCorrelationID( + id, RoctracerActivityApi::CorrelationFlowType::User); + #endif #endif for (auto& session : sessions_) { session->pushUserCorrelationId(id); @@ -1611,8 +1637,13 @@ void CuptiActivityProfiler::popUserCorrelationId() { CuptiActivityApi::CorrelationFlowType::User); #endif // HAS_CUPTI #ifdef HAS_ROCTRACER + #ifndef ROCTRACER_FALLBACK RocprofActivityApi::popCorrelationID( RocprofActivityApi::CorrelationFlowType::User); + #else + RoctracerActivityApi::popCorrelationID( + RoctracerActivityApi::CorrelationFlowType::User); + #endif #endif for (auto& session : sessions_) { session->popUserCorrelationId(); diff --git a/libkineto/src/CuptiActivityProfiler.h b/libkineto/src/CuptiActivityProfiler.h index 187ee80f9..165aea49e 100644 --- a/libkineto/src/CuptiActivityProfiler.h +++ b/libkineto/src/CuptiActivityProfiler.h @@ -30,7 +30,11 @@ #endif // HAS_CUPTI #ifdef HAS_ROCTRACER + #ifndef ROCTRACER_FALLBACK #include "RocprofLogger.h" + #else +#include "RoctracerLogger.h" + #endif #endif // HAS_ROCTRACER #include "GenericTraceActivity.h" @@ -46,6 +50,7 @@ namespace KINETO_NAMESPACE { class Config; class CuptiActivityApi; class RocprofActivityApi; +class RoctracerActivityApi; // This struct is a derived snapshot of the Config. And should not // be mutable after construction. 
@@ -122,6 +127,7 @@ class CuptiActivityProfiler { public: CuptiActivityProfiler(CuptiActivityApi& cupti, bool cpuOnly); CuptiActivityProfiler(RocprofActivityApi& rai, bool cpuOnly); + CuptiActivityProfiler(RoctracerActivityApi& rtai, bool cpuOnly); CuptiActivityProfiler(const CuptiActivityProfiler&) = delete; CuptiActivityProfiler& operator=(const CuptiActivityProfiler&) = delete; ~CuptiActivityProfiler(); @@ -457,7 +463,11 @@ class CuptiActivityProfiler { // Calls to CUPTI is encapsulated behind this interface #ifdef HAS_ROCTRACER - RocprofActivityApi& cupti_; // Design failure here + #ifndef ROCTRACER_FALLBACK + RocprofActivityApi& cupti_; // Design failure here + #else + RoctracerActivityApi& cupti_; + #endif #else CuptiActivityApi& cupti_; #endif diff --git a/libkineto/src/RocprofLogger.cpp b/libkineto/src/RocprofLogger.cpp index 5b6b67fb7..c9202481f 100644 --- a/libkineto/src/RocprofLogger.cpp +++ b/libkineto/src/RocprofLogger.cpp @@ -500,8 +500,9 @@ void RocprofLogger::popCorrelationID(CorrelationDomain type) { } void RocprofLogger::clearLogs() { - for (auto &row : rows_) - delete row; + // CuptiActivityProfiler clears this before the output Loggers use the data + //for (auto &row : rows_) + // delete row; rows_.clear(); for (int i = 0; i < CorrelationDomain::size; ++i) { externalCorrelations_[i].clear(); diff --git a/libkineto/src/RoctracerActivity.h b/libkineto/src/RoctracerActivity.h index 72083ab6b..615eef52e 100644 --- a/libkineto/src/RoctracerActivity.h +++ b/libkineto/src/RoctracerActivity.h @@ -81,10 +81,10 @@ struct RoctracerActivity : public ITraceActivity { std::unordered_map metadata_; }; -// roctracerAsyncRow - Roctracer GPU activities -struct GpuActivity : public RoctracerActivity { +// rocprofAsyncRow - Roctracer GPU activities +struct GpuActivity : public RoctracerActivity { explicit GpuActivity( - const roctracerAsyncRow* activity, + const rocprofAsyncRow* activity, const ITraceActivity* linked) : RoctracerActivity(activity, linked) { switch 
(activity_.kind) { diff --git a/libkineto/src/RoctracerActivityApi.cpp b/libkineto/src/RoctracerActivityApi.cpp index 0bec09d4a..7a1ac3e76 100644 --- a/libkineto/src/RoctracerActivityApi.cpp +++ b/libkineto/src/RoctracerActivityApi.cpp @@ -40,7 +40,7 @@ void RoctracerActivityApi::pushCorrelationID(int id, CorrelationFlowType type) { return; } singleton().d->pushCorrelationID( - id, static_cast(type)); + id, static_cast(type)); #endif } @@ -50,7 +50,7 @@ void RoctracerActivityApi::popCorrelationID(CorrelationFlowType type) { return; } singleton().d->popCorrelationID( - static_cast(type)); + static_cast(type)); #endif } @@ -85,8 +85,8 @@ timestamp_t getTimeOffset() { } int RoctracerActivityApi::processActivities( - std::function handler, - std::function + std::function handler, + std::function correlationHandler) { // Find offset to map from monotonic clock to system clock. // This will break time-ordering of events but is status quo. @@ -94,15 +94,15 @@ int RoctracerActivityApi::processActivities( int count = 0; // Process all external correlations pairs - for (int it = RoctracerLogger::CorrelationDomain::begin; - it < RoctracerLogger::CorrelationDomain::end; + for (int it = RocLogger::CorrelationDomain::begin; + it < RocLogger::CorrelationDomain::end; ++it) { auto& externalCorrelations = d->externalCorrelations_[it]; for (auto& item : externalCorrelations) { correlationHandler( item.first, item.second, - static_cast(it)); + static_cast(it)); } std::lock_guard lock(d->externalCorrelationsMutex_); externalCorrelations.clear(); @@ -122,7 +122,7 @@ int RoctracerActivityApi::processActivities( !isLogged(ActivityType::CUDA_RUNTIME)) { filtered = true; } else { - switch (reinterpret_cast(item)->kind) { + switch (reinterpret_cast(item)->kind) { case HIP_OP_COPY_KIND_DEVICE_TO_HOST_: case HIP_OP_COPY_KIND_HOST_TO_DEVICE_: case HIP_OP_COPY_KIND_DEVICE_TO_DEVICE_: @@ -142,7 +142,7 @@ int RoctracerActivityApi::processActivities( if (!isLogged(ActivityType::CONCURRENT_KERNEL)) 
filtered = true; // Don't record barriers/markers - if (reinterpret_cast(item)->op == + if (reinterpret_cast(item)->op == HIP_OP_ID_BARRIER) filtered = true; break; diff --git a/libkineto/src/RoctracerActivityApi.h b/libkineto/src/RoctracerActivityApi.h index 54bf03f73..849e4db0d 100644 --- a/libkineto/src/RoctracerActivityApi.h +++ b/libkineto/src/RoctracerActivityApi.h @@ -48,9 +48,9 @@ class RoctracerActivityApi { void setMaxEvents(uint32_t maxEvents); virtual int processActivities( - std::function handler, + std::function handler, std::function< - void(uint64_t, uint64_t, RoctracerLogger::CorrelationDomain)> + void(uint64_t, uint64_t, RocLogger::CorrelationDomain)> correlationHandler); void setMaxBufferSize(int size); diff --git a/libkineto/src/RoctracerActivity_inl.h b/libkineto/src/RoctracerActivity_inl.h index 56d54f6a9..c92addb55 100644 --- a/libkineto/src/RoctracerActivity_inl.h +++ b/libkineto/src/RoctracerActivity_inl.h @@ -161,7 +161,7 @@ inline void RuntimeActivity::log(ActivityLogger& logger) const { } template <> -inline const std::string RuntimeActivity::metadataJson() +inline const std::string RuntimeActivity::metadataJson() const { std::string kernel = ""; if ((raw().functionAddr != nullptr)) { @@ -209,7 +209,7 @@ inline const std::string RuntimeActivity::metadataJson() } template <> -inline const std::string RuntimeActivity::metadataJson() +inline const std::string RuntimeActivity::metadataJson() const { correlationToSize[raw().id] = raw().size; return fmt::format( @@ -224,7 +224,7 @@ inline const std::string RuntimeActivity::metadataJson() } template <> -inline const std::string RuntimeActivity::metadataJson() +inline const std::string RuntimeActivity::metadataJson() const { correlationToSize[raw().id] = raw().size; std::string size = ""; diff --git a/libkineto/src/RoctracerLogger.cpp b/libkineto/src/RoctracerLogger.cpp index 725c4d0b0..0fdf42913 100644 --- a/libkineto/src/RoctracerLogger.cpp +++ b/libkineto/src/RoctracerLogger.cpp @@ -35,6 
+35,14 @@ class Flush { }; static Flush s_flush; +uint32_t RoctracerApiIdList::mapName(const std::string& apiName) +{ + uint32_t cid = 0; + roctracer_op_code( + ACTIVITY_DOMAIN_HIP_API, apiName.c_str(), &cid, nullptr); + return cid; +} + RoctracerLogger& RoctracerLogger::singleton() { static RoctracerLogger instance; return instance; @@ -49,17 +57,17 @@ RoctracerLogger::~RoctracerLogger() { namespace { thread_local std::deque - t_externalIds[RoctracerLogger::CorrelationDomain::size]; + t_externalIds[RocLogger::CorrelationDomain::size]; } -void RoctracerLogger::pushCorrelationID(uint64_t id, CorrelationDomain type) { +void RoctracerLogger::pushCorrelationID(uint64_t id, RocLogger::CorrelationDomain type) { if (!singleton().externalCorrelationEnabled_) { return; } t_externalIds[type].push_back(id); } -void RoctracerLogger::popCorrelationID(CorrelationDomain type) { +void RoctracerLogger::popCorrelationID(RocLogger::CorrelationDomain type) { if (!singleton().externalCorrelationEnabled_) { return; } @@ -73,12 +81,12 @@ void RoctracerLogger::popCorrelationID(CorrelationDomain type) { void RoctracerLogger::clearLogs() { rows_.clear(); - for (int i = 0; i < CorrelationDomain::size; ++i) { + for (int i = 0; i < RocLogger::CorrelationDomain::size; ++i) { externalCorrelations_[i].clear(); } } -void RoctracerLogger::insert_row_to_buffer(roctracerBase* row) { +void RoctracerLogger::insert_row_to_buffer(rocprofBase* row) { RoctracerLogger* dis = &singleton(); std::lock_guard lock(dis->rowsMutex_); if (dis->rows_.size() >= dis->maxBufferSize_) { @@ -119,7 +127,7 @@ void RoctracerLogger::api_callback( { s_flush.reportCorrelation(data->correlation_id); auto& args = data->args.hipLaunchKernel; - roctracerKernelRow* row = new roctracerKernelRow( + rocprofKernelRow* row = new rocprofKernelRow( data->correlation_id, domain, cid, @@ -144,7 +152,7 @@ void RoctracerLogger::api_callback( case HIP_API_ID_hipExtModuleLaunchKernel: { s_flush.reportCorrelation(data->correlation_id); auto& 
args = data->args.hipModuleLaunchKernel; - roctracerKernelRow* row = new roctracerKernelRow( + rocprofKernelRow* row = new rocprofKernelRow( data->correlation_id, domain, cid, @@ -169,7 +177,7 @@ void RoctracerLogger::api_callback( #if 0 { auto &args = data->args.hipLaunchCooperativeKernelMultiDevice.launchParamsList__val; - roctracerKernelRow* row = new roctracerKernelRow( + rocprofKernelRow* row = new rocprofKernelRow( data->correlation_id, domain, cid, @@ -193,7 +201,7 @@ void RoctracerLogger::api_callback( #endif break; case HIP_API_ID_hipMalloc: { - roctracerMallocRow* row = new roctracerMallocRow( + rocprofMallocRow* row = new rocprofMallocRow( data->correlation_id, domain, cid, @@ -206,7 +214,7 @@ void RoctracerLogger::api_callback( insert_row_to_buffer(row); } break; case HIP_API_ID_hipFree: { - roctracerMallocRow* row = new roctracerMallocRow( + rocprofMallocRow* row = new rocprofMallocRow( data->correlation_id, domain, cid, @@ -220,7 +228,7 @@ void RoctracerLogger::api_callback( } break; case HIP_API_ID_hipMemcpy: { auto& args = data->args.hipMemcpy; - roctracerCopyRow* row = new roctracerCopyRow( + rocprofCopyRow* row = new rocprofCopyRow( data->correlation_id, domain, cid, @@ -239,7 +247,7 @@ void RoctracerLogger::api_callback( case HIP_API_ID_hipMemcpyAsync: case HIP_API_ID_hipMemcpyWithStream: { auto& args = data->args.hipMemcpyAsync; - roctracerCopyRow* row = new roctracerCopyRow( + rocprofCopyRow* row = new rocprofCopyRow( data->correlation_id, domain, cid, @@ -255,7 +263,7 @@ void RoctracerLogger::api_callback( insert_row_to_buffer(row); } break; default: { - roctracerRow* row = new roctracerRow( + rocprofRow* row = new rocprofRow( data->correlation_id, domain, cid, @@ -267,7 +275,7 @@ void RoctracerLogger::api_callback( } break; } // switch // External correlation - for (int it = CorrelationDomain::begin; it < CorrelationDomain::end; + for (int it = RocLogger::CorrelationDomain::begin; it < RocLogger::CorrelationDomain::end; ++it) { if 
(t_externalIds[it].size() > 0) { std::lock_guard lock(dis->externalCorrelationsMutex_); @@ -293,7 +301,7 @@ void RoctracerLogger::activity_callback( if (record->correlation_id > s_flush.maxCompletedCorrelationId_) { s_flush.maxCompletedCorrelationId_ = record->correlation_id; } - roctracerAsyncRow* row = new roctracerAsyncRow( + rocprofAsyncRow* row = new rocprofAsyncRow( record->correlation_id, record->domain, record->kind, @@ -425,30 +433,3 @@ void RoctracerLogger::endTracing() { hccPool_ = nullptr; } } - -ApiIdList::ApiIdList() : invert_(true) {} - -void ApiIdList::add(const std::string& apiName) { - uint32_t cid = 0; - if (roctracer_op_code( - ACTIVITY_DOMAIN_HIP_API, apiName.c_str(), &cid, nullptr) == - ROCTRACER_STATUS_SUCCESS) { - filter_[cid] = 1; - } -} -void ApiIdList::remove(const std::string& apiName) { - uint32_t cid = 0; - if (roctracer_op_code( - ACTIVITY_DOMAIN_HIP_API, apiName.c_str(), &cid, nullptr) == - ROCTRACER_STATUS_SUCCESS) { - filter_.erase(cid); - } -} - -bool ApiIdList::loadUserPrefs() { - // placeholder - return false; -} -bool ApiIdList::contains(uint32_t apiId) { - return (filter_.find(apiId) != filter_.end()) ? !invert_ : invert_; // XOR -} diff --git a/libkineto/src/RoctracerLogger.h b/libkineto/src/RoctracerLogger.h index d28a9f08b..2e5387585 100644 --- a/libkineto/src/RoctracerLogger.h +++ b/libkineto/src/RoctracerLogger.h @@ -24,6 +24,8 @@ #include #include +#include "RocLogger.h" + // Local copy of hip op types. 
These are public (and stable) in later rocm // releases typedef enum { @@ -56,203 +58,13 @@ namespace libkineto { class RoctracerActivityApi; } -typedef uint64_t timestamp_t; - -static timestamp_t timespec_to_ns(const timespec& time) { - return ((timestamp_t)time.tv_sec * 1000000000) + time.tv_nsec; -} - -class ApiIdList { +class RoctracerApiIdList : public ApiIdList { public: - ApiIdList(); - bool invertMode() { - return invert_; - } - void setInvertMode(bool invert) { - invert_ = invert; - } - void add(const std::string& apiName); - void remove(const std::string& apiName); - bool loadUserPrefs(); - bool contains(uint32_t apiId); - const std::unordered_map& filterList() { - return filter_; - } - - private: - std::unordered_map filter_; - bool invert_; -}; - -typedef enum { - ROCTRACER_ACTIVITY_DEFAULT = 0, - ROCTRACER_ACTIVITY_KERNEL, - ROCTRACER_ACTIVITY_COPY, - ROCTRACER_ACTIVITY_MALLOC, - ROCTRACER_ACTIVITY_ASYNC, - ROCTRACER_ACTIVITY_NONE -} roctracer_activity_types; - -struct roctracerBase { - roctracerBase( - uint64_t id, - uint32_t domain, - uint64_t begin, - uint64_t end, - roctracer_activity_types type = ROCTRACER_ACTIVITY_NONE) - : id(id), begin(begin), end(end), domain(domain), type(type) {} - uint64_t id; // correlation_id - uint64_t begin; - uint64_t end; - uint32_t domain; - roctracer_activity_types type; -}; - -struct roctracerRow : public roctracerBase { - roctracerRow( - uint64_t id, - uint32_t domain, - uint32_t cid, - uint32_t pid, - uint32_t tid, - uint64_t begin, - uint64_t end, - roctracer_activity_types type = ROCTRACER_ACTIVITY_DEFAULT) - : roctracerBase(id, domain, begin, end, type), - cid(cid), - pid(pid), - tid(tid) {} - uint32_t cid; - uint32_t pid; - uint32_t tid; -}; - -struct roctracerKernelRow : public roctracerRow { - roctracerKernelRow( - uint64_t id, - uint32_t domain, - uint32_t cid, - uint32_t pid, - uint32_t tid, - uint64_t begin, - uint64_t end, - const void* faddr, - hipFunction_t function, - unsigned int gx, - unsigned int 
gy, - unsigned int gz, - unsigned int wx, - unsigned int wy, - unsigned int wz, - size_t gss, - hipStream_t stream, - roctracer_activity_types type = ROCTRACER_ACTIVITY_KERNEL) - : roctracerRow(id, domain, cid, pid, tid, begin, end, type), - functionAddr(faddr), - function(function), - gridX(gx), - gridY(gy), - gridZ(gz), - workgroupX(wx), - workgroupY(wy), - workgroupZ(wz), - groupSegmentSize(gss), - stream(stream) {} - const void* functionAddr; - hipFunction_t function; - unsigned int gridX; - unsigned int gridY; - unsigned int gridZ; - unsigned int workgroupX; - unsigned int workgroupY; - unsigned int workgroupZ; - size_t groupSegmentSize; - hipStream_t stream; -}; - -struct roctracerCopyRow : public roctracerRow { - roctracerCopyRow( - uint64_t id, - uint32_t domain, - uint32_t cid, - uint32_t pid, - uint32_t tid, - uint64_t begin, - uint64_t end, - const void* src, - const void* dst, - size_t size, - hipMemcpyKind kind, - hipStream_t stream, - roctracer_activity_types type = ROCTRACER_ACTIVITY_COPY) - : roctracerRow(id, domain, cid, pid, tid, begin, end, type), - src(src), - dst(dst), - size(size), - kind(kind), - stream(stream) {} - const void* src; - const void* dst; - size_t size; - hipMemcpyKind kind; - hipStream_t stream; -}; - -struct roctracerMallocRow : public roctracerRow { - roctracerMallocRow( - uint64_t id, - uint32_t domain, - uint32_t cid, - uint32_t pid, - uint32_t tid, - uint64_t begin, - uint64_t end, - const void* ptr, - size_t size, - roctracer_activity_types type = ROCTRACER_ACTIVITY_MALLOC) - : roctracerRow(id, domain, cid, pid, tid, begin, end, type), - ptr(ptr), - size(size) {} - const void* ptr; - size_t size; -}; - -struct roctracerAsyncRow : public roctracerBase { - roctracerAsyncRow( - uint64_t id, - uint32_t domain, - uint32_t kind, - uint32_t op, - int device, - uint64_t queue, - uint64_t begin, - uint64_t end, - const std::string& kernelName, - roctracer_activity_types type = ROCTRACER_ACTIVITY_ASYNC) - : roctracerBase(id, domain, 
begin, end, type), - kind(kind), - op(op), - device(device), - queue(queue), - kernelName(kernelName) {} - uint32_t kind; - uint32_t op; - int device; - uint64_t queue; - std::string kernelName; + uint32_t mapName(const std::string& apiName) override; }; class RoctracerLogger { public: - enum CorrelationDomain { - begin, - Default = begin, - Domain0 = begin, - Domain1, - end, - size = end - }; - RoctracerLogger(); RoctracerLogger(const RoctracerLogger&) = delete; RoctracerLogger& operator=(const RoctracerLogger&) = delete; @@ -261,8 +73,8 @@ class RoctracerLogger { static RoctracerLogger& singleton(); - static void pushCorrelationID(uint64_t id, CorrelationDomain type); - static void popCorrelationID(CorrelationDomain type); + static void pushCorrelationID(uint64_t id, RocLogger::CorrelationDomain type); + static void popCorrelationID(RocLogger::CorrelationDomain type); void startLogging(); void stopLogging(); @@ -274,7 +86,7 @@ class RoctracerLogger { void endTracing(); roctracer_pool_t* hccPool_{NULL}; - static void insert_row_to_buffer(roctracerBase* row); + static void insert_row_to_buffer(rocprofBase* row); static void api_callback( uint32_t domain, uint32_t cid, @@ -282,18 +94,18 @@ class RoctracerLogger { void* arg); static void activity_callback(const char* begin, const char* end, void* arg); - ApiIdList loggedIds_; + RoctracerApiIdList loggedIds_; // Api callback data uint32_t maxBufferSize_{5000000}; // 5M GPU runtime/kernel events. - std::vector rows_; + std::vector rows_; std::mutex rowsMutex_; // This vector collects pairs of correlationId and their respective // externalCorrelationId for each CorrelationDomain. This will be used // to populate the Correlation maps during post processing. 
std::vector> - externalCorrelations_[CorrelationDomain::size]; + externalCorrelations_[RocLogger::CorrelationDomain::size]; std::mutex externalCorrelationsMutex_; bool externalCorrelationEnabled_{true}; From d1a1531e3be1d2dcc035fca1cbf7d92f42ddb906 Mon Sep 17 00:00:00 2001 From: Shivam Raikundalia Date: Wed, 8 Oct 2025 12:03:16 -0700 Subject: [PATCH 6/7] lint --- libkineto/src/ActivityProfilerController.cpp | 12 +- libkineto/src/CuptiActivityProfiler.cpp | 36 +++--- libkineto/src/CuptiActivityProfiler.h | 24 ++-- libkineto/src/RocLogger.h | 2 +- libkineto/src/RocprofLogger.cpp | 129 +++++++++++-------- libkineto/src/RocprofLogger.h | 2 +- libkineto/src/RoctracerActivityApi.cpp | 3 +- libkineto/src/RoctracerActivityApi.h | 3 +- libkineto/src/RoctracerActivity_inl.h | 3 +- libkineto/src/RoctracerLogger.cpp | 13 +- 10 files changed, 119 insertions(+), 108 deletions(-) diff --git a/libkineto/src/ActivityProfilerController.cpp b/libkineto/src/ActivityProfilerController.cpp index cfd980be6..e9867fcaf 100644 --- a/libkineto/src/ActivityProfilerController.cpp +++ b/libkineto/src/ActivityProfilerController.cpp @@ -18,11 +18,11 @@ #include "CuptiActivityApi.h" #ifdef HAS_ROCTRACER - #ifndef ROCTRACER_FALLBACK +#ifndef ROCTRACER_FALLBACK #include "RocprofActivityApi.h" - #else +#else #include "RoctracerActivityApi.h" - #endif +#endif #endif #include "ThreadUtil.h" @@ -71,13 +71,13 @@ ActivityProfilerController::ActivityProfilerController( #endif // !USE_GOOGLE_LOG #ifdef HAS_ROCTRACER - #ifndef ROCTRACER_FALLBACK +#ifndef ROCTRACER_FALLBACK profiler_ = std::make_unique( RocprofActivityApi::singleton(), cpuOnly); - #else +#else profiler_ = std::make_unique( RoctracerActivityApi::singleton(), cpuOnly); - #endif +#endif #else profiler_ = std::make_unique( CuptiActivityApi::singleton(), cpuOnly); diff --git a/libkineto/src/CuptiActivityProfiler.cpp b/libkineto/src/CuptiActivityProfiler.cpp index f5aefb24c..2846a4ab1 100644 --- a/libkineto/src/CuptiActivityProfiler.cpp +++ 
b/libkineto/src/CuptiActivityProfiler.cpp @@ -40,13 +40,13 @@ #endif // HAS_CUPTI #ifdef HAS_ROCTRACER #include "RocLogger.h" - #ifndef ROCTRACER_FALLBACK +#ifndef ROCTRACER_FALLBACK #include "RocprofActivity.h" #include "RocprofActivityApi.h" - #else +#else #include "RoctracerActivity.h" #include "RoctracerActivityApi.h" - #endif +#endif #endif #ifdef HAS_XPUPTI #include "plugin/xpupti/XpuptiActivityProfiler.h" @@ -219,15 +219,15 @@ void CuptiActivityProfiler::transferCpuTrace( } #ifdef HAS_ROCTRACER - #ifndef ROCTRACER_FALLBACK +#ifndef ROCTRACER_FALLBACK CuptiActivityProfiler::CuptiActivityProfiler( RocprofActivityApi& cupti, bool cpuOnly) - #else +#else CuptiActivityProfiler::CuptiActivityProfiler( RoctracerActivityApi& cupti, bool cpuOnly) - #endif +#endif #else CuptiActivityProfiler::CuptiActivityProfiler( CuptiActivityApi& cupti, @@ -1582,13 +1582,13 @@ void CuptiActivityProfiler::pushCorrelationId(uint64_t id) { id, CuptiActivityApi::CorrelationFlowType::Default); #endif // HAS_CUPTI #ifdef HAS_ROCTRACER - #ifndef ROCTRACER_FALLBACK +#ifndef ROCTRACER_FALLBACK RocprofActivityApi::pushCorrelationID( id, RocprofActivityApi::CorrelationFlowType::Default); - #else +#else RoctracerActivityApi::pushCorrelationID( id, RoctracerActivityApi::CorrelationFlowType::Default); - #endif +#endif #endif for (auto& session : sessions_) { session->pushCorrelationId(id); @@ -1601,13 +1601,13 @@ void CuptiActivityProfiler::popCorrelationId() { CuptiActivityApi::CorrelationFlowType::Default); #endif // HAS_CUPTI #ifdef HAS_ROCTRACER - #ifndef ROCTRACER_FALLBACK +#ifndef ROCTRACER_FALLBACK RocprofActivityApi::popCorrelationID( RocprofActivityApi::CorrelationFlowType::Default); - #else +#else RoctracerActivityApi::popCorrelationID( RoctracerActivityApi::CorrelationFlowType::Default); - #endif +#endif #endif for (auto& session : sessions_) { session->popCorrelationId(); @@ -1620,13 +1620,13 @@ void CuptiActivityProfiler::pushUserCorrelationId(uint64_t id) { id, 
CuptiActivityApi::CorrelationFlowType::User); #endif // HAS_CUPTI #ifdef HAS_ROCTRACER - #ifndef ROCTRACER_FALLBACK +#ifndef ROCTRACER_FALLBACK RocprofActivityApi::pushCorrelationID( id, RocprofActivityApi::CorrelationFlowType::User); - #else +#else RoctracerActivityApi::pushCorrelationID( id, RoctracerActivityApi::CorrelationFlowType::User); - #endif +#endif #endif for (auto& session : sessions_) { session->pushUserCorrelationId(id); @@ -1639,13 +1639,13 @@ void CuptiActivityProfiler::popUserCorrelationId() { CuptiActivityApi::CorrelationFlowType::User); #endif // HAS_CUPTI #ifdef HAS_ROCTRACER - #ifndef ROCTRACER_FALLBACK +#ifndef ROCTRACER_FALLBACK RocprofActivityApi::popCorrelationID( RocprofActivityApi::CorrelationFlowType::User); - #else +#else RoctracerActivityApi::popCorrelationID( RoctracerActivityApi::CorrelationFlowType::User); - #endif +#endif #endif for (auto& session : sessions_) { session->popUserCorrelationId(); diff --git a/libkineto/src/CuptiActivityProfiler.h b/libkineto/src/CuptiActivityProfiler.h index 165aea49e..7d308cdb9 100644 --- a/libkineto/src/CuptiActivityProfiler.h +++ b/libkineto/src/CuptiActivityProfiler.h @@ -30,11 +30,11 @@ #endif // HAS_CUPTI #ifdef HAS_ROCTRACER - #ifndef ROCTRACER_FALLBACK +#ifndef ROCTRACER_FALLBACK #include "RocprofLogger.h" - #else +#else #include "RoctracerLogger.h" - #endif +#endif #endif // HAS_ROCTRACER #include "GenericTraceActivity.h" @@ -418,9 +418,7 @@ class CuptiActivityProfiler { #ifdef HAS_ROCTRACER // Process generic RocProf activity - void handleRocprofActivity( - const rocprofBase* record, - ActivityLogger* logger); + void handleRocprofActivity(const rocprofBase* record, ActivityLogger* logger); void handleCorrelationActivity( uint64_t correlationId, uint64_t externalId, @@ -428,9 +426,7 @@ class CuptiActivityProfiler { // Process specific GPU activity types template void handleRuntimeActivity(const T* activity, ActivityLogger* logger); - void handleGpuActivity( - const rocprofAsyncRow* record, - 
ActivityLogger* logger); + void handleGpuActivity(const rocprofAsyncRow* record, ActivityLogger* logger); #endif // HAS_ROCTRACER void resetTraceData(); @@ -463,11 +459,11 @@ class CuptiActivityProfiler { // Calls to CUPTI is encapsulated behind this interface #ifdef HAS_ROCTRACER - #ifndef ROCTRACER_FALLBACK - RocprofActivityApi& cupti_; // Design failure here - #else - RoctracerActivityApi& cupti_; - #endif +#ifndef ROCTRACER_FALLBACK + RocprofActivityApi& cupti_; // Design failure here +#else + RoctracerActivityApi& cupti_; +#endif #else CuptiActivityApi& cupti_; #endif diff --git a/libkineto/src/RocLogger.h b/libkineto/src/RocLogger.h index a057a4c4a..4626751a9 100644 --- a/libkineto/src/RocLogger.h +++ b/libkineto/src/RocLogger.h @@ -16,9 +16,9 @@ #include #include #include +#include #include #include -#include #include diff --git a/libkineto/src/RocprofLogger.cpp b/libkineto/src/RocprofLogger.cpp index c9202481f..d0820fc7c 100644 --- a/libkineto/src/RocprofLogger.cpp +++ b/libkineto/src/RocprofLogger.cpp @@ -21,10 +21,10 @@ #include #include +#include "ApproximateClock.h" #include "Demangle.h" #include "Logger.h" #include "ThreadUtil.h" -#include "ApproximateClock.h" using namespace libkineto; using namespace std::chrono; @@ -85,10 +85,10 @@ auto extract_copy_args = [](rocprofiler_callback_tracing_kind_t, struct kernel_args { // const char *stream; hipStream_t stream{nullptr}; - uint32_t privateSize {0}; - uint32_t groupSize {0}; - rocprofiler_dim3_t workgroupSize {0}; - rocprofiler_dim3_t gridSize {0}; + uint32_t privateSize{0}; + uint32_t groupSize{0}; + rocprofiler_dim3_t workgroupSize{0}; + rocprofiler_dim3_t gridSize{0}; rocprofiler_callback_tracing_kind_t kind; rocprofiler_tracing_operation_t operation; }; @@ -106,23 +106,31 @@ auto extract_kernel_args = [](rocprofiler_callback_tracing_kind_t, if (strcmp("stream", arg_name) == 0) args.stream = *(reinterpret_cast(arg_value_addr)); else if (strcmp("numBlocks", arg_name) == 0) - args.workgroupSize = 
*(reinterpret_cast(arg_value_addr)); + args.workgroupSize = + *(reinterpret_cast(arg_value_addr)); else if (strcmp("dimBlocks", arg_name) == 0) - args.gridSize = *(reinterpret_cast(arg_value_addr)); + args.gridSize = + *(reinterpret_cast(arg_value_addr)); else if (strcmp("sharedMemBytes", arg_name) == 0) args.groupSize = *(reinterpret_cast(arg_value_addr)); else if (strcmp("globalWorkSizeX", arg_name) == 0) - args.workgroupSize.x = *(reinterpret_cast(arg_value_addr)); + args.workgroupSize.x = + *(reinterpret_cast(arg_value_addr)); else if (strcmp("globalWorkSizeY", arg_name) == 0) - args.workgroupSize.y = *(reinterpret_cast(arg_value_addr)); + args.workgroupSize.y = + *(reinterpret_cast(arg_value_addr)); else if (strcmp("globalWorkSizeZ", arg_name) == 0) - args.workgroupSize.z = *(reinterpret_cast(arg_value_addr)); + args.workgroupSize.z = + *(reinterpret_cast(arg_value_addr)); else if (strcmp("localWorkSizeX", arg_name) == 0) - args.gridSize.x = *(reinterpret_cast(arg_value_addr)); + args.gridSize.x = + *(reinterpret_cast(arg_value_addr)); else if (strcmp("localWorkSizeY", arg_name) == 0) - args.gridSize.y = *(reinterpret_cast(arg_value_addr)); + args.gridSize.y = + *(reinterpret_cast(arg_value_addr)); else if (strcmp("localWorkSizeZ", arg_name) == 0) - args.gridSize.z = *(reinterpret_cast(arg_value_addr)); + args.gridSize.z = + *(reinterpret_cast(arg_value_addr)); return 0; }; @@ -409,28 +417,31 @@ int RocprofLogger::toolInit( nullptr); // Collect async ops via buffers - constexpr auto buffer_size_bytes = 0x40000; + constexpr auto buffer_size_bytes = 0x40000; constexpr auto buffer_watermark_bytes = buffer_size_bytes / 2; - rocprofiler_create_buffer(s->context, - buffer_size_bytes, - buffer_watermark_bytes, - ROCPROFILER_BUFFER_POLICY_LOSSLESS, - RocprofLogger::buffer_callback, - nullptr, - &s->buffer); - - rocprofiler_configure_buffer_tracing_service(s->context, - ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH, - nullptr, - 0, - s->buffer); - - 
rocprofiler_configure_buffer_tracing_service(s->context, - ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, - nullptr, - 0, - s->buffer); + rocprofiler_create_buffer( + s->context, + buffer_size_bytes, + buffer_watermark_bytes, + ROCPROFILER_BUFFER_POLICY_LOSSLESS, + RocprofLogger::buffer_callback, + nullptr, + &s->buffer); + + rocprofiler_configure_buffer_tracing_service( + s->context, + ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH, + nullptr, + 0, + s->buffer); + + rocprofiler_configure_buffer_tracing_service( + s->context, + ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, + nullptr, + 0, + s->buffer); { int isValid = 0; rocprofiler_context_is_valid(s->context, &isValid); @@ -501,7 +512,7 @@ void RocprofLogger::popCorrelationID(CorrelationDomain type) { void RocprofLogger::clearLogs() { // CuptiActivityProfiler clears this before the output Loggers use the data - //for (auto &row : rows_) + // for (auto &row : rows_) // delete row; rows_.clear(); for (int i = 0; i < CorrelationDomain::size; ++i) { @@ -605,20 +616,20 @@ void RocprofLogger::api_callback( , &args); - rocprofCopyRow* row = new rocprofCopyRow( - record.correlation_id.internal, - args.kind, - args.operation, - processId(), - systemThreadId(), - startTime, - endTime, - args.src, - args.dst, - args.size, - args.copyKind, - args.stream); - insert_row_to_buffer(row); + rocprofCopyRow* row = new rocprofCopyRow( + record.correlation_id.internal, + args.kind, + args.operation, + processId(), + systemThreadId(), + startTime, + endTime, + args.src, + args.dst, + args.size, + args.copyKind, + args.stream); + insert_row_to_buffer(row); } // Malloc Records else if (isMallocApi(record.operation)) { @@ -658,15 +669,21 @@ void RocprofLogger::api_callback( } // ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API } - -void RocprofLogger::buffer_callback(rocprofiler_context_id_t context, rocprofiler_buffer_id_t buffer_id, rocprofiler_record_header_t** headers, size_t num_headers, void* user_data, uint64_t drop_count) -{ +void 
RocprofLogger::buffer_callback( + rocprofiler_context_id_t context, + rocprofiler_buffer_id_t buffer_id, + rocprofiler_record_header_t** headers, + size_t num_headers, + void* user_data, + uint64_t drop_count) { for (size_t i = 0; i < num_headers; ++i) { auto* header = headers[i]; if (header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING) { if (header->kind == ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH) { - auto& record = *(static_cast(header->payload)); + auto& record = + *(static_cast( + header->payload)); auto& dispatch = record.dispatch_info; rocprofAsyncRow* row = new rocprofAsyncRow( @@ -680,9 +697,10 @@ void RocprofLogger::buffer_callback(rocprofiler_context_id_t context, rocprofile record.end_timestamp, s->kernel_names.at(dispatch.kernel_id)); insert_row_to_buffer(row); - } - else if (header->kind == ROCPROFILER_BUFFER_TRACING_MEMORY_COPY) { - auto &record = *(static_cast(header->payload)); + } else if (header->kind == ROCPROFILER_BUFFER_TRACING_MEMORY_COPY) { + auto& record = + *(static_cast( + header->payload)); rocprofAsyncRow* row = new rocprofAsyncRow( record.correlation_id.internal, record.kind, @@ -699,7 +717,6 @@ void RocprofLogger::buffer_callback(rocprofiler_context_id_t context, rocprofile } } - std::string RocprofLogger::opString( rocprofiler_callback_tracing_kind_t kind, rocprofiler_tracing_operation_t op) { diff --git a/libkineto/src/RocprofLogger.h b/libkineto/src/RocprofLogger.h index d5ef9747f..2d7325951 100644 --- a/libkineto/src/RocprofLogger.h +++ b/libkineto/src/RocprofLogger.h @@ -16,9 +16,9 @@ #include #include #include +#include #include #include -#include #include diff --git a/libkineto/src/RoctracerActivityApi.cpp b/libkineto/src/RoctracerActivityApi.cpp index 7a1ac3e76..6a9ebcd9a 100644 --- a/libkineto/src/RoctracerActivityApi.cpp +++ b/libkineto/src/RoctracerActivityApi.cpp @@ -142,8 +142,7 @@ int RoctracerActivityApi::processActivities( if (!isLogged(ActivityType::CONCURRENT_KERNEL)) filtered = true; // Don't record 
barriers/markers - if (reinterpret_cast(item)->op == - HIP_OP_ID_BARRIER) + if (reinterpret_cast(item)->op == HIP_OP_ID_BARRIER) filtered = true; break; } diff --git a/libkineto/src/RoctracerActivityApi.h b/libkineto/src/RoctracerActivityApi.h index 849e4db0d..415d5b436 100644 --- a/libkineto/src/RoctracerActivityApi.h +++ b/libkineto/src/RoctracerActivityApi.h @@ -49,8 +49,7 @@ class RoctracerActivityApi { virtual int processActivities( std::function handler, - std::function< - void(uint64_t, uint64_t, RocLogger::CorrelationDomain)> + std::function correlationHandler); void setMaxBufferSize(int size); diff --git a/libkineto/src/RoctracerActivity_inl.h b/libkineto/src/RoctracerActivity_inl.h index c92addb55..30b189944 100644 --- a/libkineto/src/RoctracerActivity_inl.h +++ b/libkineto/src/RoctracerActivity_inl.h @@ -209,8 +209,7 @@ inline const std::string RuntimeActivity::metadataJson() } template <> -inline const std::string RuntimeActivity::metadataJson() - const { +inline const std::string RuntimeActivity::metadataJson() const { correlationToSize[raw().id] = raw().size; return fmt::format( R"JSON( diff --git a/libkineto/src/RoctracerLogger.cpp b/libkineto/src/RoctracerLogger.cpp index 0fdf42913..2e2b696af 100644 --- a/libkineto/src/RoctracerLogger.cpp +++ b/libkineto/src/RoctracerLogger.cpp @@ -35,11 +35,9 @@ class Flush { }; static Flush s_flush; -uint32_t RoctracerApiIdList::mapName(const std::string& apiName) -{ +uint32_t RoctracerApiIdList::mapName(const std::string& apiName) { uint32_t cid = 0; - roctracer_op_code( - ACTIVITY_DOMAIN_HIP_API, apiName.c_str(), &cid, nullptr); + roctracer_op_code(ACTIVITY_DOMAIN_HIP_API, apiName.c_str(), &cid, nullptr); return cid; } @@ -60,7 +58,9 @@ thread_local std::deque t_externalIds[RocLogger::CorrelationDomain::size]; } -void RoctracerLogger::pushCorrelationID(uint64_t id, RocLogger::CorrelationDomain type) { +void RoctracerLogger::pushCorrelationID( + uint64_t id, + RocLogger::CorrelationDomain type) { if 
(!singleton().externalCorrelationEnabled_) { return; } @@ -275,7 +275,8 @@ void RoctracerLogger::api_callback( } break; } // switch // External correlation - for (int it = RocLogger::CorrelationDomain::begin; it < RocLogger::CorrelationDomain::end; + for (int it = RocLogger::CorrelationDomain::begin; + it < RocLogger::CorrelationDomain::end; ++it) { if (t_externalIds[it].size() > 0) { std::lock_guard lock(dis->externalCorrelationsMutex_); From 685f93f25173e0d810e1c609cea0ccdcf55ac27d Mon Sep 17 00:00:00 2001 From: Shivam Raikundalia Date: Wed, 8 Oct 2025 12:57:25 -0700 Subject: [PATCH 7/7] header --- libkineto/src/DeviceUtil.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libkineto/src/DeviceUtil.h b/libkineto/src/DeviceUtil.h index f5d772742..2fbec448c 100644 --- a/libkineto/src/DeviceUtil.h +++ b/libkineto/src/DeviceUtil.h @@ -48,7 +48,7 @@ #elif defined(HAS_ROCTRACER) #include -#include +#include #define CUDA_CALL(call) \ { \