Skip to content

Commit f7a8bf2

Browse files
Ilia Cherniavskii authored and facebook-github-bot committed
Use libkineto in profiler (pytorch#46470)
Summary: Pull Request resolved: pytorch#46470 Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py python test/test_autograd.py -k test_profile python test/test_autograd.py -k test_record ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Memcpy HtoD (Pageable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 33.33% 2.000us 1.000us 2 sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 33.33% 2.000us 2.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 1.000us 16.67% 1.000us 1.000us 1 Memcpy DtoH (Device -> Pageable) 0.00% 0.000us 0.00% 0.000us 0.000us 1.000us 16.67% 1.000us 1.000us 1 aten::randn 5.17% 74.000us 6.71% 96.000us 48.000us 0.000us 0.00% 0.000us 0.000us 2 aten::empty 1.33% 19.000us 1.33% 19.000us 4.750us 0.000us 0.00% 0.000us 0.000us 4 aten::normal_ 1.05% 15.000us 1.05% 15.000us 7.500us 0.000us 0.00% 0.000us 0.000us 2 aten::to 77.90% 1.114ms 91.61% 1.310ms 436.667us 0.000us 0.00% 3.000us 1.000us 3 aten::empty_strided 2.52% 36.000us 2.52% 36.000us 12.000us 0.000us 0.00% 0.000us 0.000us 3 aten::copy_ 2.73% 39.000us 11.19% 160.000us 53.333us 0.000us 0.00% 3.000us 1.000us 3 cudaMemcpyAsync 4.34% 62.000us 4.34% 62.000us 20.667us 0.000us 0.00% 0.000us 0.000us 3 cudaStreamSynchronize 1.61% 23.000us 1.61% 23.000us 7.667us 0.000us 0.00% 0.000us 0.000us 3 aten::mm 0.21% 3.000us 7.20% 103.000us 103.000us 0.000us 0.00% 2.000us 2.000us 1 aten::stride 0.21% 3.000us 0.21% 3.000us 1.000us 0.000us 0.00% 0.000us 0.000us 3 cudaLaunchKernel 2.45% 35.000us 2.45% 35.000us 17.500us 0.000us 0.00% 0.000us 0.000us 2 aten::add 0.49% 7.000us 4.27% 61.000us 61.000us 0.000us 0.00% 1.000us 1.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` benchmark: https://gist.github.com/ilia-cher/a5a9eb6b68504542a3cad5150fc39b1a Reviewed By: Chillee Differential Revision: D25142223 Pulled By: ilia-cher fbshipit-source-id: b0dff46c28da5fb0a8e01cf548aa4f2b723fde80
1 parent e9efd8d commit f7a8bf2

26 files changed

+2040
-1017
lines changed

aten/src/ATen/record_function.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,13 @@ namespace {
1010

1111
// Used to generate unique callback handles
1212
CallbackHandle next_unique_callback_handle() {
13-
static std::atomic<uint64_t> unique_cb_id {0};
14-
return CallbackHandle(++unique_cb_id);
13+
static std::atomic<uint64_t> unique_cb_id {1};
14+
return CallbackHandle(unique_cb_id++);
1515
}
1616

1717
RecordFunctionHandle next_unique_record_function_handle() {
18-
static std::atomic<uint64_t> unique_rf_id {0};
19-
return RecordFunctionHandle(++unique_rf_id);
18+
static std::atomic<uint64_t> unique_rf_id {1};
19+
return RecordFunctionHandle(unique_rf_id++);
2020
}
2121

2222
thread_local RecordFunctionTLS rf_tls_;

benchmarks/profiler_benchmark/profiler_bench.py

Lines changed: 17 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,9 @@
11
import argparse
2-
import statistics
32
import sys
43
import timeit
54
import torch
65

7-
from torch.utils._benchmark import Timer
6+
from torch.utils.benchmark import Timer
87

98
PARALLEL_TASKS_NUM = 4
109
INTERNAL_ITER = None
@@ -34,29 +33,30 @@ def parallel_task(x):
3433
parser.add_argument('--with_cuda', action='store_true')
3534
parser.add_argument('--with_stack', action='store_true')
3635
parser.add_argument('--use_script', action='store_true')
36+
parser.add_argument('--use_kineto', action='store_true')
3737
parser.add_argument('--profiling_tensor_size', default=1, type=int)
3838
parser.add_argument('--workload', default='loop', type=str)
3939
parser.add_argument('--internal_iter', default=256, type=int)
40-
parser.add_argument('--n', default=100, type=int)
41-
parser.add_argument('--use_timer', action='store_true')
42-
parser.add_argument('--timer_min_run_time', default=100, type=int)
40+
parser.add_argument('--timer_min_run_time', default=10, type=int)
41+
parser.add_argument('--cuda_only', action='store_true')
4342

4443
args = parser.parse_args()
4544

4645
if args.with_cuda and not torch.cuda.is_available():
4746
print("No CUDA available")
4847
sys.exit()
4948

50-
print("Payload: {}; {} iterations, N = {}\n".format(
51-
args.workload, args.internal_iter, args.n))
49+
print("Payload: {}, {} iterations; timer min. runtime = {}\n".format(
50+
args.workload, args.internal_iter, args.timer_min_run_time))
5251
INTERNAL_ITER = args.internal_iter
5352

5453
for profiling_enabled in [False, True]:
55-
print("Profiling {}, tensor size {}x{}, use cuda: {}, with stacks: {}, use script: {}".format(
54+
print("Profiling {}, tensor size {}x{}, use cuda: {}, use kineto: {}, with stacks: {}, use script: {}".format(
5655
"enabled" if profiling_enabled else "disabled",
5756
args.profiling_tensor_size,
5857
args.profiling_tensor_size,
5958
args.with_cuda,
59+
args.use_kineto,
6060
args.with_stack,
6161
args.use_script))
6262

@@ -83,27 +83,18 @@ def payload():
8383
x = None
8484
with torch.autograd.profiler.profile(
8585
use_cuda=args.with_cuda,
86-
with_stack=args.with_stack) as prof:
86+
with_stack=args.with_stack,
87+
use_kineto=args.use_kineto,
88+
use_cpu=not args.cuda_only) as prof:
8789
x = workload(input_x)
8890
return x
8991
else:
9092
def payload():
9193
return workload(input_x)
9294

93-
if args.use_timer:
94-
t = Timer(
95-
"payload()",
96-
globals={"payload": payload},
97-
timer=timeit.default_timer,
98-
).blocked_autorange(min_run_time=args.timer_min_run_time)
99-
print(t)
100-
else:
101-
runtimes = timeit.repeat(payload, repeat=args.n, number=1)
102-
avg_time = statistics.mean(runtimes) * 1000.0
103-
stddev_time = statistics.stdev(runtimes) * 1000.0
104-
print("\tavg. time: {:.3f} ms, stddev: {:.3f} ms".format(
105-
avg_time, stddev_time))
106-
if args.workload == "loop":
107-
print("\ttime per iteration: {:.3f} ms".format(
108-
avg_time / args.internal_iter))
109-
print()
95+
t = Timer(
96+
"payload()",
97+
globals={"payload": payload},
98+
timer=timeit.default_timer,
99+
).blocked_autorange(min_run_time=args.timer_min_run_time)
100+
print(t)

cmake/Dependencies.cmake

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1751,7 +1751,8 @@ endif()
17511751
#
17521752
# End ATen checks
17531753
#
1754-
1754+
set(TEMP_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS})
1755+
set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libs" FORCE)
17551756
add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/fmt)
17561757

17571758
# Disable compiler feature checks for `fmt`.
@@ -1764,6 +1765,7 @@ add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/fmt)
17641765
set_target_properties(fmt-header-only PROPERTIES INTERFACE_COMPILE_FEATURES "")
17651766

17661767
list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only)
1768+
set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS} CACHE BOOL "Build shared libs" FORCE)
17671769

17681770
# ---[ Kineto
17691771
if(USE_KINETO)
@@ -1774,8 +1776,34 @@ if(USE_KINETO)
17741776
set(KINETO_LIBRARY_TYPE "static" CACHE STRING "")
17751777
set(CUDA_SOURCE_DIR "${CUDA_TOOLKIT_ROOT_DIR}" CACHE STRING "")
17761778

1779+
message(STATUS "Configuring Kineto dependency:")
1780+
message(STATUS " KINETO_SOURCE_DIR = ${KINETO_SOURCE_DIR}")
1781+
message(STATUS " KINETO_BUILD_TESTS = ${KINETO_BUILD_TESTS}")
1782+
message(STATUS " KINETO_LIBRARY_TYPE = ${KINETO_LIBRARY_TYPE}")
1783+
message(STATUS " CUDA_SOURCE_DIR = ${CUDA_SOURCE_DIR}")
1784+
1785+
if(EXISTS ${CUDA_SOURCE_DIR}/extras/CUPTI/include)
1786+
set(CUPTI_INCLUDE_DIR "${CUDA_SOURCE_DIR}/extras/CUPTI/include")
1787+
elseif(EXISTS ${CUDA_SOURCE_DIR}/include/cupti.h)
1788+
set(CUPTI_INCLUDE_DIR "${CUDA_SOURCE_DIR}/include")
1789+
endif()
1790+
1791+
if((NOT DEFINED CUDA_cupti_LIBRARY) OR (${CUDA_cupti_LIBRARY} STREQUAL "CUDA_cupti_LIBRARY-NOTFOUND"))
1792+
if(EXISTS ${CUDA_SOURCE_DIR}/extras/CUPTI/lib64/libcupti_static.a)
1793+
set(CUDA_cupti_LIBRARY "${CUDA_SOURCE_DIR}/extras/CUPTI/lib64/libcupti_static.a")
1794+
elseif(EXISTS ${CUDA_SOURCE_DIR}/lib64/libcupti_static.a)
1795+
set(CUDA_cupti_LIBRARY "${CUDA_SOURCE_DIR}/lib64/libcupti_static.a")
1796+
elseif(EXISTS ${CUDA_SOURCE_DIR}/extras/CUPTI/lib64/libcupti.so)
1797+
set(CUDA_cupti_LIBRARY "${CUDA_SOURCE_DIR}/extras/CUPTI/lib64/libcupti.so")
1798+
elseif(EXISTS ${CUDA_SOURCE_DIR}/lib64/libcupti.so)
1799+
set(CUDA_cupti_LIBRARY "${CUDA_SOURCE_DIR}/lib64/libcupti.so")
1800+
endif()
1801+
endif()
1802+
message(STATUS " CUDA_cupti_LIBRARY = ${CUDA_cupti_LIBRARY}")
1803+
message(STATUS " CUPTI_INCLUDE_DIR = ${CUPTI_INCLUDE_DIR}")
1804+
17771805
add_subdirectory("${KINETO_SOURCE_DIR}")
1778-
message(STATUS "Configured libkineto as a dependency.")
1806+
message(STATUS "Configured Kineto as a dependency.")
17791807
endif()
17801808

17811809
list(APPEND Caffe2_DEPENDENCY_LIBS kineto)

test/cpp/jit/test_misc.cpp

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2163,7 +2163,7 @@ TEST(TLSFutureCallbacksTest, Basic) {
21632163
// test running callbacks with propagation of TLS state.
21642164
{
21652165
// Enable the profiler in this thread
2166-
torch::autograd::profiler::enableProfiler(
2166+
torch::autograd::profiler::enableProfilerLegacy(
21672167
torch::autograd::profiler::ProfilerConfig(
21682168
torch::autograd::profiler::ProfilerState::CPU, false, false));
21692169
auto s1 = c10::make_intrusive<Future>(IntType::get());
@@ -2172,12 +2172,12 @@ TEST(TLSFutureCallbacksTest, Basic) {
21722172
// Since we join here, we can ensure that all callbacks corresponding to
21732173
// markCompleted() have finished.
21742174
t.join();
2175-
torch::autograd::profiler::disableProfiler();
2175+
torch::autograd::profiler::disableProfilerLegacy();
21762176
}
21772177
// then() with TLS State
21782178
{
21792179
// Enable the profiler in this thread
2180-
torch::autograd::profiler::enableProfiler(
2180+
torch::autograd::profiler::enableProfilerLegacy(
21812181
torch::autograd::profiler::ProfilerConfig(
21822182
torch::autograd::profiler::ProfilerState::CPU, false, false));
21832183
auto s1 = c10::make_intrusive<Future>(IntType::get());
@@ -2190,7 +2190,7 @@ TEST(TLSFutureCallbacksTest, Basic) {
21902190
std::thread t([s1 = std::move(s1)]() { s1->markCompleted(); });
21912191
t.join();
21922192
s2->wait();
2193-
torch::autograd::profiler::disableProfiler();
2193+
torch::autograd::profiler::disableProfilerLegacy();
21942194
}
21952195
}
21962196

@@ -2199,7 +2199,7 @@ TEST(ProfilerDisableInCallbackTest, Basic) {
21992199
auto profilerEnabledCb = []() {
22002200
ASSERT_TRUE(torch::autograd::profiler::profilerEnabled());
22012201
};
2202-
torch::autograd::profiler::enableProfiler(
2202+
torch::autograd::profiler::enableProfilerLegacy(
22032203
torch::autograd::profiler::ProfilerConfig(
22042204
torch::autograd::profiler::ProfilerState::CPU, false, false));
22052205
auto s1 = c10::make_intrusive<Future>(IntType::get());
@@ -2212,10 +2212,10 @@ TEST(ProfilerDisableInCallbackTest, Basic) {
22122212
// Don't cleanup TLSState, and just consolidate.
22132213
auto opts = torch::autograd::profiler::ProfilerDisableOptions(false, true);
22142214
auto thread_event_lists =
2215-
torch::autograd::profiler::disableProfiler(std::move(opts));
2215+
torch::autograd::profiler::disableProfilerLegacy(std::move(opts));
22162216
// Ensure that the events from this thread are still profiled and we obtain
22172217
// the expected in events in our consolidated list when calling
2218-
// disableProfiler().
2218+
// disableProfilerLegacy().
22192219
bool found_ones = false;
22202220
bool found_add = false;
22212221
for (const auto& li : thread_event_lists) {
@@ -2237,21 +2237,21 @@ TEST(ProfilerDisableInCallbackTest, Basic) {
22372237
s1->addCallback(verifyProfilerCb);
22382238
// Disable the profiler, but do not consolidate results in the main thread.
22392239
auto opts = torch::autograd::profiler::ProfilerDisableOptions(true, false);
2240-
torch::autograd::profiler::disableProfiler(std::move(opts));
2240+
torch::autograd::profiler::disableProfilerLegacy(std::move(opts));
22412241
std::thread t([s1 = std::move(s1)]() { s1->markCompleted(at::IValue(1)); });
22422242
t.join();
22432243

22442244
// Similar to above test, but verifies correctness in the case where
22452245
// continuation runs on the main thread.
2246-
torch::autograd::profiler::enableProfiler(
2246+
torch::autograd::profiler::enableProfilerLegacy(
22472247
torch::autograd::profiler::ProfilerConfig(
22482248
torch::autograd::profiler::ProfilerState::CPU, false, false));
22492249
s1 = c10::make_intrusive<Future>(IntType::get());
22502250
s1->addCallback(verifyProfilerCb);
22512251
// Runs callback inline
22522252
s1->markCompleted(at::IValue(1));
22532253
opts = torch::autograd::profiler::ProfilerDisableOptions(true, false);
2254-
torch::autograd::profiler::disableProfiler(std::move(opts));
2254+
torch::autograd::profiler::disableProfilerLegacy(std::move(opts));
22552255
}
22562256

22572257
TEST(IValueKWargsTest, Basic) {

0 commit comments

Comments
 (0)