Skip to content

Commit ca50462

Browse files
authored
Revert "Revert "[Profiler] Make timer_create-based CPU profiler default (#7322)"" (#7578)
## Summary of changes Reverts #7427 and supersedes #7444 ## Reason for change Another try to make `timer_create`-based CPU profiler the default. What's the difference between this PR and the last one(s): - The RingBuffer is now long-lived and does not have its lifecycle tied to the provider. ## Implementation details - Only register managed threads that are assigned to native threads. - Have the ring buffer as a field of the CorProfilerCallback class so it can outlive the customer(s).
1 parent 45217f4 commit ca50462

File tree

16 files changed

+178
-60
lines changed

16 files changed

+178
-60
lines changed

profiler/build/CpuWallTime.linux.json

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
"DD_PROFILING_ENABLED": "1",
2727
"DD_PROFILING_CPU_ENABLED": "1",
2828
"DD_INTERNAL_USE_BACKTRACE2": "1",
29+
"DD_INTERNAL_CPU_PROFILER_TYPE": "TimerCreate",
2930
"DD_TRACE_ENABLED" : "0"
3031
}
3132
},
@@ -35,17 +36,19 @@
3536
"DD_CLR_ENABLE_NGEN": "true",
3637
"DD_PROFILING_ENABLED": "1",
3738
"DD_PROFILING_CPU_ENABLED": "1",
39+
"DD_INTERNAL_CPU_PROFILER_TYPE": "TimerCreate",
3840
"DD_INTERNAL_USE_BACKTRACE2": "0",
3941
"DD_TRACE_ENABLED" : "0"
4042
}
4143
},
4244
{
43-
"name": "Profiler_cpu_walltime_timer_create",
45+
"name": "Profiler_cpu_walltime_manual",
4446
"environmentVariables": {
4547
"DD_CLR_ENABLE_NGEN": "true",
4648
"DD_PROFILING_ENABLED": "1",
4749
"DD_PROFILING_CPU_ENABLED": "1",
48-
"DD_INTERNAL_CPU_PROFILER_TYPE": "TimerCreate",
50+
"DD_INTERNAL_USE_BACKTRACE2": "1",
51+
"DD_INTERNAL_CPU_PROFILER_TYPE": "ManualCpuTime",
4952
"DD_TRACE_ENABLED" : "0"
5053
}
5154
}

profiler/src/ProfilerEngine/Datadog.Profiler.Native.Linux/CpuSampleProvider.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,10 @@
1111
CpuSampleProvider::CpuSampleProvider(
1212
SampleValueTypeProvider& valueTypeProvider,
1313
RawSampleTransformer* rawSampleTransformer,
14-
std::unique_ptr<RingBuffer> ringBuffer,
14+
RingBuffer* ringBuffer,
1515
MetricsRegistry& metricsRegistry
1616
)
1717
:
18-
RawSampleCollectorBase<RawCpuSample>("CpuSampleProvider", valueTypeProvider.GetOrRegister(CpuTimeProvider::SampleTypeDefinitions), rawSampleTransformer, std::move(ringBuffer), metricsRegistry)
18+
RawSampleCollectorBase<RawCpuSample>("CpuSampleProvider", valueTypeProvider.GetOrRegister(CpuTimeProvider::SampleTypeDefinitions), rawSampleTransformer, ringBuffer, metricsRegistry)
1919
{
2020
}

profiler/src/ProfilerEngine/Datadog.Profiler.Native.Linux/CpuSampleProvider.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,6 @@ class CpuSampleProvider
2323
CpuSampleProvider(
2424
SampleValueTypeProvider& valueTypeProvider,
2525
RawSampleTransformer* rawSampleTransformer,
26-
std::unique_ptr<RingBuffer> ringBuffer,
26+
RingBuffer* ringBuffer,
2727
MetricsRegistry& metricsRegistry);
2828
};

profiler/src/ProfilerEngine/Datadog.Profiler.Native.Linux/ProfilerSignalManager.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,22 @@ bool ProfilerSignalManager::UnRegisterHandler()
9393
return true;
9494
}
9595

96+
bool ProfilerSignalManager::IgnoreSignal() {
97+
struct sigaction sampleAction;
98+
sampleAction.sa_handler = SIG_IGN;
99+
sigemptyset(&sampleAction.sa_mask);
100+
sampleAction.sa_flags = 0;
101+
102+
int32_t result = sigaction(_signalToSend, &sampleAction, nullptr);
103+
if (result != 0)
104+
{
105+
Log::Error("ProfilerSignalManager::IgnoreSignal: Failed mark ", strsignal(_signalToSend), " as ignored. Reason: ",
106+
strerror(errno), ".");
107+
return false;
108+
}
109+
return true;
110+
}
111+
96112
void ProfilerSignalManager::SetSignal(int32_t signal)
97113
{
98114
_signalToSend = signal;

profiler/src/ProfilerEngine/Datadog.Profiler.Native.Linux/ProfilerSignalManager.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ class ProfilerSignalManager
1818

1919
bool RegisterHandler(HandlerFn_t handler);
2020
bool UnRegisterHandler();
21+
bool IgnoreSignal();
2122
int32_t SendSignal(pid_t threadId);
2223
bool CheckSignalHandler();
2324
bool IsHandlerInPlace() const;

profiler/src/ProfilerEngine/Datadog.Profiler.Native.Linux/RawSampleCollectorBase.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -96,12 +96,12 @@ class RawSampleCollectorBase : public ServiceBase,
9696
const char* name,
9797
std::vector<SampleValueTypeProvider::Offset> valueOffsets,
9898
RawSampleTransformer* rawSampleTransformer,
99-
std::unique_ptr<RingBuffer> ringBuffer,
99+
RingBuffer* ringBuffer,
100100
MetricsRegistry& metricsRegistry) :
101101
ProviderBase(name),
102102
_valueOffsets{std::move(valueOffsets)},
103103
_rawSampleTransformer{rawSampleTransformer},
104-
_collectedSamples{std::move(ringBuffer)},
104+
_collectedSamples{ringBuffer},
105105
_failedReservationMetric{metricsRegistry.GetOrRegister<DiscardMetrics>("dotnet_raw_sample_failed_allocation")}
106106
{
107107
}
@@ -110,7 +110,7 @@ class RawSampleCollectorBase : public ServiceBase,
110110

111111
SampleHolder GetRawSample()
112112
{
113-
return SampleHolder(_collectedSamples.get(), _failedReservationMetric.get());
113+
return SampleHolder(_collectedSamples, _failedReservationMetric.get());
114114
}
115115

116116
const char* GetName() override
@@ -182,6 +182,6 @@ class RawSampleCollectorBase : public ServiceBase,
182182

183183
std::vector<SampleValueTypeProvider::Offset> _valueOffsets;
184184
RawSampleTransformer* _rawSampleTransformer;
185-
std::unique_ptr<RingBuffer> _collectedSamples;
185+
RingBuffer* _collectedSamples;
186186
std::shared_ptr<DiscardMetrics> _failedReservationMetric;
187187
};

profiler/src/ProfilerEngine/Datadog.Profiler.Native.Linux/TimerCreateCpuProfiler.cpp

Lines changed: 41 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
#include <ucontext.h>
1818
#include <unistd.h>
1919

20-
TimerCreateCpuProfiler* TimerCreateCpuProfiler::Instance = nullptr;
20+
std::atomic<TimerCreateCpuProfiler*> TimerCreateCpuProfiler::Instance = nullptr;
2121

2222
TimerCreateCpuProfiler::TimerCreateCpuProfiler(
2323
IConfiguration* pConfiguration,
@@ -29,7 +29,8 @@ TimerCreateCpuProfiler::TimerCreateCpuProfiler(
2929
_pSignalManager{pSignalManager}, // put it as parameter for better testing
3030
_pManagedThreadsList{pManagedThreadsList},
3131
_pProvider{pProvider},
32-
_samplingInterval{pConfiguration->GetCpuProfilingInterval()}
32+
_samplingInterval{pConfiguration->GetCpuProfilingInterval()},
33+
_nbThreadsInSignalHandler{0}
3334
{
3435
Log::Info("Cpu profiling interval: ", _samplingInterval.count(), "ms");
3536
Log::Info("timer_create Cpu profiler is enabled");
@@ -83,14 +84,29 @@ bool TimerCreateCpuProfiler::StartImpl()
8384
Instance = this;
8485

8586
// Create and start timer for all threads.
86-
_pManagedThreadsList->ForEach([this](ManagedThreadInfo* thread) { RegisterThreadImpl(thread); });
87+
_pManagedThreadsList->ForEach([this](ManagedThreadInfo* thread) {
88+
// only register threads that have an OS thread id
89+
if (thread->GetOsThreadId() != 0)
90+
{
91+
RegisterThreadImpl(thread);
92+
}
93+
});
8794
}
8895

8996
return registered;
9097
}
9198

9299
bool TimerCreateCpuProfiler::StopImpl()
93100
{
101+
Instance = nullptr;
102+
// we cannot unregister. We would replace the current action by the default one
103+
// which will cause the termination of the process
104+
// Instead, mark SIGPROF as ignored.
105+
if (!_pSignalManager->IgnoreSignal())
106+
{
107+
Log::Warn("Failed to mark the signal SIGPROF as ignored.");
108+
}
109+
94110
{
95111
std::unique_lock lock(_registerLock);
96112

@@ -99,15 +115,27 @@ bool TimerCreateCpuProfiler::StopImpl()
99115
_pManagedThreadsList->ForEach([this](ManagedThreadInfo* thread) { UnregisterThreadImpl(thread); });
100116
}
101117

102-
Instance = nullptr;
103-
_pSignalManager->UnRegisterHandler();
118+
std::uint64_t nbThreadsInSignalHandler = _nbThreadsInSignalHandler;
119+
if (nbThreadsInSignalHandler != 0)
120+
{
121+
Log::Info("Waiting for all threads exiting the signal handler (#threads ", _nbThreadsInSignalHandler, ")");
122+
123+
// TODO: for now we sleep.
124+
std::this_thread::sleep_for(500ms);
125+
if (_nbThreadsInSignalHandler != 0)
126+
{
127+
Log::Warn("There are threads that are still executing the signal handler: ", _nbThreadsInSignalHandler);
128+
return false;
129+
}
130+
Log::Info("All threads exited the signal handler");
131+
}
104132

105133
return true;
106134
}
107135

108136
bool TimerCreateCpuProfiler::CollectStackSampleSignalHandler(int sig, siginfo_t* info, void* ucontext)
109137
{
110-
auto instance = Instance;
138+
auto instance = Instance.load();
111139
if (instance == nullptr)
112140
{
113141
return false;
@@ -122,7 +150,6 @@ extern "C" unsigned long long dd_inside_wrapped_functions() __attribute__((weak)
122150

123151
bool TimerCreateCpuProfiler::CanCollect(void* ctx)
124152
{
125-
// TODO (in another PR): add metrics about reasons we could not collect
126153
if (dd_inside_wrapped_functions != nullptr && dd_inside_wrapped_functions() != 0)
127154
{
128155
_discardMetrics->Incr<DiscardReason::InsideWrappedFunction>();
@@ -189,12 +216,14 @@ struct StackWalkLock
189216

190217
bool TimerCreateCpuProfiler::Collect(void* ctx)
191218
{
219+
_nbThreadsInSignalHandler++;
192220
_totalSampling->Incr();
193221

194222
auto threadInfo = ManagedThreadInfo::CurrentThreadInfo;
195223
if (threadInfo == nullptr)
196224
{
197225
_discardMetrics->Incr<DiscardReason::UnknownThread>();
226+
_nbThreadsInSignalHandler--;
198227
// Ooops should never happen
199228
return false;
200229
}
@@ -203,11 +232,13 @@ bool TimerCreateCpuProfiler::Collect(void* ctx)
203232
if (!l.IsLockAcquired())
204233
{
205234
_discardMetrics->Incr<DiscardReason::FailedAcquiringLock>();
235+
_nbThreadsInSignalHandler--;
206236
return false;
207237
}
208238

209239
if (!CanCollect(ctx))
210240
{
241+
_nbThreadsInSignalHandler--;
211242
return false;
212243
}
213244

@@ -217,6 +248,7 @@ bool TimerCreateCpuProfiler::Collect(void* ctx)
217248
auto rawCpuSample = _pProvider->GetRawSample();
218249
if (!rawCpuSample)
219250
{
251+
_nbThreadsInSignalHandler--;
220252
return false;
221253
}
222254

@@ -229,6 +261,7 @@ bool TimerCreateCpuProfiler::Collect(void* ctx)
229261
{
230262
rawCpuSample.Discard();
231263
_discardMetrics->Incr<DiscardReason::EmptyBacktrace>();
264+
_nbThreadsInSignalHandler--;
232265
return false;
233266
}
234267

@@ -240,6 +273,7 @@ bool TimerCreateCpuProfiler::Collect(void* ctx)
240273
rawCpuSample->AppDomainId = threadInfo->GetAppDomainId();
241274
rawCpuSample->ThreadInfo = std::move(threadInfo);
242275
rawCpuSample->Duration = _samplingInterval;
276+
_nbThreadsInSignalHandler--;
243277
return true;
244278
}
245279

profiler/src/ProfilerEngine/Datadog.Profiler.Native.Linux/TimerCreateCpuProfiler.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ class TimerCreateCpuProfiler : public ServiceBase
4242

4343
private:
4444
static bool CollectStackSampleSignalHandler(int sig, siginfo_t* info, void* ucontext);
45-
static TimerCreateCpuProfiler* Instance;
45+
static std::atomic<TimerCreateCpuProfiler*> Instance;
4646

4747
bool CanCollect(void* context);
4848
bool Collect(void* ucontext);
@@ -59,4 +59,5 @@ class TimerCreateCpuProfiler : public ServiceBase
5959
std::shared_mutex _registerLock;
6060
std::shared_ptr<CounterMetric> _totalSampling;
6161
std::shared_ptr<DiscardMetrics> _discardMetrics;
62+
std::atomic<std::uint64_t> _nbThreadsInSignalHandler;
6263
};

profiler/src/ProfilerEngine/Datadog.Profiler.Native/Configuration.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,12 @@ std::string const Configuration::DefaultEmptyString = "";
2626
std::chrono::seconds const Configuration::DefaultDevUploadInterval = 20s;
2727
std::chrono::seconds const Configuration::DefaultProdUploadInterval = 60s;
2828
std::chrono::milliseconds const Configuration::DefaultCpuProfilingInterval = 9ms;
29+
CpuProfilerType const Configuration::DefaultCpuProfilerType =
30+
#ifdef _WINDOWS
31+
CpuProfilerType::ManualCpuTime;
32+
#else
33+
CpuProfilerType::TimerCreate;
34+
#endif
2935

3036
Configuration::Configuration()
3137
{
@@ -110,7 +116,7 @@ Configuration::Configuration()
110116
);
111117
_httpRequestDurationThreshold = ExtractHttpRequestDurationThreshold();
112118
_forceHttpSampling = GetEnvironmentValue(EnvironmentVariables::ForceHttpSampling, false);
113-
_cpuProfilerType = GetEnvironmentValue(EnvironmentVariables::CpuProfilerType, CpuProfilerType::ManualCpuTime);
119+
_cpuProfilerType = GetEnvironmentValue(EnvironmentVariables::CpuProfilerType, DefaultCpuProfilerType);
114120
_isWaitHandleProfilingEnabled = GetEnvironmentValue(EnvironmentVariables::WaitHandleProfilingEnabled, false);
115121
}
116122

profiler/src/ProfilerEngine/Datadog.Profiler.Native/Configuration.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@ class Configuration final : public IConfiguration
122122
static std::chrono::seconds const DefaultDevUploadInterval;
123123
static std::chrono::seconds const DefaultProdUploadInterval;
124124
static std::chrono::milliseconds const DefaultCpuProfilingInterval;
125+
static CpuProfilerType const DefaultCpuProfilerType;
125126

126127
bool _isProfilingEnabled;
127128
bool _isCpuProfilingEnabled;

0 commit comments

Comments
 (0)