Skip to content

Commit 906cc88

Browse files
authored
Merge pull request #13220 from xokdvium/flamegraph
Stack sampling flamegraph profiler
2 parents 5b4806a + 2190bf2 commit 906cc88

14 files changed

+508
-3
lines changed

doc/manual/rl-next/eval-profiler.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
---
2+
synopsis: Add stack sampling evaluation profiler
3+
prs: [13220]
4+
---
5+
6+
Nix evaluator now supports stack sampling evaluation profiling via `--eval-profiler flamegraph` setting.
7+
It collects collapsed call stack information to output file specified by
8+
`--eval-profile-file` (`nix.profile` by default) in a format directly consumable
9+
by `flamegraph.pl` and compatible tools like [speedscope](https://speedscope.app/).
10+
Sampling frequency can be configured via `--eval-profiler-frequency` (99 Hz by default).
11+
12+
Unlike existing `--trace-function-calls` this profiler includes the name of the function
13+
being called when it's available.

doc/manual/source/SUMMARY.md.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
- [Tuning Cores and Jobs](advanced-topics/cores-vs-jobs.md)
5858
- [Verifying Build Reproducibility](advanced-topics/diff-hook.md)
5959
- [Using the `post-build-hook`](advanced-topics/post-build-hook.md)
60+
- [Evaluation profiler](advanced-topics/eval-profiler.md)
6061
- [Command Reference](command-ref/index.md)
6162
- [Common Options](command-ref/opt-common.md)
6263
- [Common Environment Variables](command-ref/env-common.md)
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# Using the `eval-profiler`
2+
3+
Nix evaluator supports [evaluation](@docroot@/language/evaluation.md)
4+
[profiling](<https://en.wikipedia.org/wiki/Profiling_(computer_programming)>)
5+
compatible with `flamegraph.pl`. The profiler samples the nix
6+
function call stack at regular intervals. It can be enabled with the
7+
[`eval-profiler`](@docroot@/command-ref/conf-file.md#conf-eval-profiler)
8+
setting:
9+
10+
```console
11+
$ nix-instantiate "<nixpkgs>" -A hello --eval-profiler flamegraph
12+
```
13+
14+
Stack sampling frequency and the output file path can be configured with
15+
[`eval-profile-file`](@docroot@/command-ref/conf-file.md#conf-eval-profile-file)
16+
and [`eval-profiler-frequency`](@docroot@/command-ref/conf-file.md#conf-eval-profiler-frequency).
17+
By default the collected profile is saved to `nix.profile` file in the current working directory.
18+
19+
The collected profile can be directly consumed by `flamegraph.pl`:
20+
21+
```console
22+
$ flamegraph.pl nix.profile > flamegraph.svg
23+
```
24+
25+
The line information in the profile contains the location of the [call
26+
site](https://en.wikipedia.org/wiki/Call_site) position and the name of the
27+
function being called (when available). For example:
28+
29+
```
30+
/nix/store/x9wnkly3k1gkq580m90jjn32q9f05q2v-source/pkgs/top-level/default.nix:167:5:primop import
31+
```
32+
33+
Here `import` primop is called at `/nix/store/x9wnkly3k1gkq580m90jjn32q9f05q2v-source/pkgs/top-level/default.nix:167:5`.

src/libexpr/eval-profiler-settings.cc

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
#include "nix/expr/eval-profiler-settings.hh"
2+
#include "nix/util/configuration.hh"
3+
#include "nix/util/logging.hh" /* Needs to be included before config-impl.hh */
4+
#include "nix/util/config-impl.hh"
5+
#include "nix/util/abstract-setting-to-json.hh"
6+
7+
#include <nlohmann/json.hpp>
8+
9+
namespace nix {
10+
11+
template<>
12+
EvalProfilerMode BaseSetting<EvalProfilerMode>::parse(const std::string & str) const
13+
{
14+
if (str == "disabled")
15+
return EvalProfilerMode::disabled;
16+
else if (str == "flamegraph")
17+
return EvalProfilerMode::flamegraph;
18+
else
19+
throw UsageError("option '%s' has invalid value '%s'", name, str);
20+
}
21+
22+
template<>
23+
struct BaseSetting<EvalProfilerMode>::trait
24+
{
25+
static constexpr bool appendable = false;
26+
};
27+
28+
template<>
29+
std::string BaseSetting<EvalProfilerMode>::to_string() const
30+
{
31+
if (value == EvalProfilerMode::disabled)
32+
return "disabled";
33+
else if (value == EvalProfilerMode::flamegraph)
34+
return "flamegraph";
35+
else
36+
unreachable();
37+
}
38+
39+
NLOHMANN_JSON_SERIALIZE_ENUM(
40+
EvalProfilerMode,
41+
{
42+
{EvalProfilerMode::disabled, "disabled"},
43+
{EvalProfilerMode::flamegraph, "flamegraph"},
44+
});
45+
46+
/* Explicit instantiation of templates */
47+
template class BaseSetting<EvalProfilerMode>;
48+
49+
}

src/libexpr/eval-profiler.cc

Lines changed: 265 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
#include "nix/expr/eval-profiler.hh"
22
#include "nix/expr/nixexpr.hh"
3+
#include "nix/expr/eval.hh"
4+
#include "nix/util/lru-cache.hh"
35

46
namespace nix {
57

@@ -45,4 +47,267 @@ void MultiEvalProfiler::addProfiler(ref<EvalProfiler> profiler)
4547
invalidateNeededHooks();
4648
}
4749

50+
namespace {
51+
52+
class PosCache : private LRUCache<PosIdx, Pos>
53+
{
54+
const EvalState & state;
55+
56+
public:
57+
PosCache(const EvalState & state)
58+
: LRUCache(524288) /* ~40MiB */
59+
, state(state)
60+
{
61+
}
62+
63+
Pos lookup(PosIdx posIdx)
64+
{
65+
auto posOrNone = LRUCache::get(posIdx);
66+
if (posOrNone)
67+
return *posOrNone;
68+
69+
auto pos = state.positions[posIdx];
70+
upsert(posIdx, pos);
71+
return pos;
72+
}
73+
};
74+
75+
struct LambdaFrameInfo
76+
{
77+
ExprLambda * expr;
78+
/** Position where the lambda has been called. */
79+
PosIdx callPos = noPos;
80+
std::ostream & symbolize(const EvalState & state, std::ostream & os, PosCache & posCache) const;
81+
auto operator<=>(const LambdaFrameInfo & rhs) const = default;
82+
};
83+
84+
/** Primop call. */
85+
struct PrimOpFrameInfo
86+
{
87+
const PrimOp * expr;
88+
/** Position where the primop has been called. */
89+
PosIdx callPos = noPos;
90+
std::ostream & symbolize(const EvalState & state, std::ostream & os, PosCache & posCache) const;
91+
auto operator<=>(const PrimOpFrameInfo & rhs) const = default;
92+
};
93+
94+
/** Used for functor calls (attrset with __functor attr). */
95+
struct FunctorFrameInfo
96+
{
97+
PosIdx pos;
98+
std::ostream & symbolize(const EvalState & state, std::ostream & os, PosCache & posCache) const;
99+
auto operator<=>(const FunctorFrameInfo & rhs) const = default;
100+
};
101+
102+
/** Fallback frame info. */
103+
struct GenericFrameInfo
104+
{
105+
PosIdx pos;
106+
std::ostream & symbolize(const EvalState & state, std::ostream & os, PosCache & posCache) const;
107+
auto operator<=>(const GenericFrameInfo & rhs) const = default;
108+
};
109+
110+
using FrameInfo = std::variant<LambdaFrameInfo, PrimOpFrameInfo, FunctorFrameInfo, GenericFrameInfo>;
111+
using FrameStack = std::vector<FrameInfo>;
112+
113+
/**
114+
* Stack sampling profiler.
115+
*/
116+
class SampleStack : public EvalProfiler
117+
{
118+
/* How often stack profiles should be flushed to file. This avoids the need
119+
to persist stack samples across the whole evaluation at the cost
120+
of periodically flushing data to disk. */
121+
static constexpr std::chrono::microseconds profileDumpInterval = std::chrono::milliseconds(2000);
122+
123+
Hooks getNeededHooksImpl() const override
124+
{
125+
return Hooks().set(preFunctionCall).set(postFunctionCall);
126+
}
127+
128+
public:
129+
SampleStack(const EvalState & state, std::filesystem::path profileFile, std::chrono::nanoseconds period)
130+
: state(state)
131+
, sampleInterval(period)
132+
, profileFd([&]() {
133+
AutoCloseFD fd = toDescriptor(open(profileFile.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0660));
134+
if (!fd)
135+
throw SysError("opening file %s", profileFile);
136+
return fd;
137+
}())
138+
, posCache(state)
139+
{
140+
}
141+
142+
[[gnu::noinline]] void
143+
preFunctionCallHook(const EvalState & state, const Value & v, std::span<Value *> args, const PosIdx pos) override;
144+
[[gnu::noinline]] void
145+
postFunctionCallHook(const EvalState & state, const Value & v, std::span<Value *> args, const PosIdx pos) override;
146+
147+
void maybeSaveProfile(std::chrono::time_point<std::chrono::high_resolution_clock> now);
148+
void saveProfile();
149+
FrameInfo getFrameInfoFromValueAndPos(const Value & v, PosIdx pos);
150+
151+
SampleStack(SampleStack &&) = default;
152+
SampleStack & operator=(SampleStack &&) = delete;
153+
SampleStack(const SampleStack &) = delete;
154+
SampleStack & operator=(const SampleStack &) = delete;
155+
~SampleStack();
156+
157+
private:
158+
/** Hold on to an instance of EvalState for symbolizing positions. */
159+
const EvalState & state;
160+
std::chrono::nanoseconds sampleInterval;
161+
AutoCloseFD profileFd;
162+
FrameStack stack;
163+
std::map<FrameStack, uint32_t> callCount;
164+
std::chrono::time_point<std::chrono::high_resolution_clock> lastStackSample =
165+
std::chrono::high_resolution_clock::now();
166+
std::chrono::time_point<std::chrono::high_resolution_clock> lastDump = std::chrono::high_resolution_clock::now();
167+
PosCache posCache;
168+
};
169+
170+
FrameInfo SampleStack::getFrameInfoFromValueAndPos(const Value & v, PosIdx pos)
171+
{
172+
/* NOTE: No actual references to garbage collected values are not held in
173+
the profiler. */
174+
if (v.isLambda())
175+
return LambdaFrameInfo{.expr = v.payload.lambda.fun, .callPos = pos};
176+
else if (v.isPrimOp())
177+
return PrimOpFrameInfo{.expr = v.primOp(), .callPos = pos};
178+
else if (v.isPrimOpApp())
179+
/* Resolve primOp eagerly. Must not hold on to a reference to a Value. */
180+
return PrimOpFrameInfo{.expr = v.primOpAppPrimOp(), .callPos = pos};
181+
else if (state.isFunctor(v)) {
182+
const auto functor = v.attrs()->get(state.sFunctor);
183+
if (auto pos_ = posCache.lookup(pos); std::holds_alternative<std::monostate>(pos_.origin))
184+
/* HACK: In case callsite position is unresolved. */
185+
return FunctorFrameInfo{.pos = functor->pos};
186+
return FunctorFrameInfo{.pos = pos};
187+
} else
188+
/* NOTE: Add a stack frame even for invalid cases (e.g. when calling a non-function). This is what
189+
* trace-function-calls does. */
190+
return GenericFrameInfo{.pos = pos};
191+
}
192+
193+
[[gnu::noinline]] void SampleStack::preFunctionCallHook(
194+
const EvalState & state, const Value & v, [[maybe_unused]] std::span<Value *> args, const PosIdx pos)
195+
{
196+
stack.push_back(getFrameInfoFromValueAndPos(v, pos));
197+
198+
auto now = std::chrono::high_resolution_clock::now();
199+
200+
if (now - lastStackSample > sampleInterval) {
201+
callCount[stack] += 1;
202+
lastStackSample = now;
203+
}
204+
205+
/* Do this in preFunctionCallHook because we might throw an exception, but
206+
callFunction uses Finally, which doesn't play well with exceptions. */
207+
maybeSaveProfile(now);
208+
}
209+
210+
[[gnu::noinline]] void
211+
SampleStack::postFunctionCallHook(const EvalState & state, const Value & v, std::span<Value *> args, const PosIdx pos)
212+
{
213+
214+
if (!stack.empty())
215+
stack.pop_back();
216+
}
217+
218+
std::ostream & LambdaFrameInfo::symbolize(const EvalState & state, std::ostream & os, PosCache & posCache) const
219+
{
220+
if (auto pos = posCache.lookup(callPos); std::holds_alternative<std::monostate>(pos.origin))
221+
/* HACK: To avoid dubious «none»:0 in the generated profile if the origin can't be resolved
222+
resort to printing the lambda location instead of the callsite position. */
223+
os << posCache.lookup(expr->getPos());
224+
else
225+
os << pos;
226+
if (expr->name)
227+
os << ":" << state.symbols[expr->name];
228+
return os;
229+
}
230+
231+
std::ostream & GenericFrameInfo::symbolize(const EvalState & state, std::ostream & os, PosCache & posCache) const
232+
{
233+
os << posCache.lookup(pos);
234+
return os;
235+
}
236+
237+
std::ostream & FunctorFrameInfo::symbolize(const EvalState & state, std::ostream & os, PosCache & posCache) const
238+
{
239+
os << posCache.lookup(pos) << ":functor";
240+
return os;
241+
}
242+
243+
std::ostream & PrimOpFrameInfo::symbolize(const EvalState & state, std::ostream & os, PosCache & posCache) const
244+
{
245+
/* Sometimes callsite position can have an unresolved origin, which
246+
leads to confusing «none»:0 locations in the profile. */
247+
auto pos = posCache.lookup(callPos);
248+
if (!std::holds_alternative<std::monostate>(pos.origin))
249+
os << posCache.lookup(callPos) << ":";
250+
os << *expr;
251+
return os;
252+
}
253+
254+
void SampleStack::maybeSaveProfile(std::chrono::time_point<std::chrono::high_resolution_clock> now)
255+
{
256+
if (now - lastDump >= profileDumpInterval)
257+
saveProfile();
258+
else
259+
return;
260+
261+
/* Save the last dump timepoint. Do this after actually saving data to file
262+
to not account for the time doing the flushing to disk. */
263+
lastDump = std::chrono::high_resolution_clock::now();
264+
265+
/* Free up memory used for stack sampling. This might be very significant for
266+
long-running evaluations, so we shouldn't hog too much memory. */
267+
callCount.clear();
268+
}
269+
270+
void SampleStack::saveProfile()
271+
{
272+
auto os = std::ostringstream{};
273+
for (auto & [stack, count] : callCount) {
274+
auto first = true;
275+
for (auto & pos : stack) {
276+
if (first)
277+
first = false;
278+
else
279+
os << ";";
280+
281+
std::visit([&](auto && info) { info.symbolize(state, os, posCache); }, pos);
282+
}
283+
os << " " << count;
284+
writeLine(profileFd.get(), std::move(os).str());
285+
/* Clear ostringstream. */
286+
os.str("");
287+
os.clear();
288+
}
289+
}
290+
291+
SampleStack::~SampleStack()
292+
{
293+
/* Guard against cases when we are already unwinding the stack. */
294+
try {
295+
saveProfile();
296+
} catch (...) {
297+
ignoreExceptionInDestructor();
298+
}
299+
}
300+
301+
} // namespace
302+
303+
ref<EvalProfiler>
304+
makeSampleStackProfiler(const EvalState & state, std::filesystem::path profileFile, uint64_t frequency)
305+
{
306+
/* 0 is a special value for sampling stack after each call. */
307+
std::chrono::nanoseconds period = frequency == 0
308+
? std::chrono::nanoseconds{0}
309+
: std::chrono::nanoseconds{std::nano::den / frequency / std::nano::num};
310+
return make_ref<SampleStack>(state, profileFile, period);
311+
}
312+
48313
}

0 commit comments

Comments
 (0)