Skip to content

Commit 8cda6ae

Browse files
authored
gh-151613: Fix remote debugging frame cache ABA (#151614)
The remote debugging frame cache previously used only the last_profiled_frame address as its cache anchor. If a frame returned and a later frame reused the same _PyInterpreterFrame address, the profiler could accept a stale cache entry and splice parent frames from a different call chain into the current stack. This adds a last_profiled_frame_seq counter next to last_profiled_frame, increments it when the anchor advances, stores it in frame cache entries, and validates cache hits against both the frame address and the sequence. Cache miss walks now copy stack chunks before storing new cache entries so stored continuations come from a stable snapshot. The new regression test exercises alternating call chains and checks that cached stacks never contain frames from both branches.
1 parent 876c06c commit 8cda6ae

13 files changed

Lines changed: 182 additions & 56 deletions

File tree

Include/cpython/pystate.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,7 @@ struct _ts {
143143
struct _PyInterpreterFrame *base_frame;
144144

145145
struct _PyInterpreterFrame *last_profiled_frame;
146+
uintptr_t last_profiled_frame_seq;
146147

147148
Py_tracefunc c_profilefunc;
148149
Py_tracefunc c_tracefunc;

Include/internal/pycore_debug_offsets.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ typedef struct _Py_DebugOffsets {
104104
uint64_t current_frame;
105105
uint64_t base_frame;
106106
uint64_t last_profiled_frame;
107+
uint64_t last_profiled_frame_seq;
107108
uint64_t thread_id;
108109
uint64_t native_thread_id;
109110
uint64_t datastack_chunk;
@@ -294,6 +295,7 @@ typedef struct _Py_DebugOffsets {
294295
.current_frame = offsetof(PyThreadState, current_frame), \
295296
.base_frame = offsetof(PyThreadState, base_frame), \
296297
.last_profiled_frame = offsetof(PyThreadState, last_profiled_frame), \
298+
.last_profiled_frame_seq = offsetof(PyThreadState, last_profiled_frame_seq), \
297299
.thread_id = offsetof(PyThreadState, thread_id), \
298300
.native_thread_id = offsetof(PyThreadState, native_thread_id), \
299301
.datastack_chunk = offsetof(PyThreadState, datastack_chunk), \

Include/internal/pycore_interpframe.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -319,12 +319,15 @@ _PyThreadState_GetFrame(PyThreadState *tstate)
319319
// This avoids corrupting the cache when transient frames (called and returned
320320
// between profiler samples) update last_profiled_frame to addresses the
321321
// profiler never saw.
322+
// The sequence distinguishes this anchor from a later frame that reuses the
323+
// same _PyInterpreterFrame address.
322324
#define _PyThreadState_UpdateLastProfiledFrame(tstate, frame, previous) \
323325
do { \
324326
PyThreadState *tstate_ = (tstate); \
325327
_PyInterpreterFrame *frame_ = (frame); \
326328
if (tstate_->last_profiled_frame == frame_) { \
327329
tstate_->last_profiled_frame = (previous); \
330+
tstate_->last_profiled_frame_seq++; \
328331
} \
329332
} while (0)
330333

InternalDocs/frames.md

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -142,22 +142,26 @@ since the frame chain may have been in an inconsistent state due to concurrent u
142142

143143
### Remote Profiling Frame Cache
144144

145-
The `last_profiled_frame` field in `PyThreadState` supports an optimization for
146-
remote profilers that sample call stacks from external processes. When a remote
147-
profiler reads the call stack, it writes the current frame address to this field.
148-
The eval loop then keeps this pointer valid by updating it to the parent frame
149-
whenever a frame returns (in `_PyEval_FrameClearAndPop`).
145+
The `last_profiled_frame` and `last_profiled_frame_seq` fields in
146+
`PyThreadState` support an optimization for remote profilers that sample call
147+
stacks from external processes. When a remote profiler reads the call stack, it
148+
writes the current frame address to `last_profiled_frame`. The eval loop then
149+
keeps this pointer valid by updating it to the parent frame whenever a frame
150+
returns (in `_PyEval_FrameClearAndPop`) and increments the sequence.
150151

151152
This creates a "high-water mark" that always points to a frame still on the stack.
152153
On subsequent samples, the profiler can walk from `current_frame` until it reaches
153-
`last_profiled_frame`, knowing that frames from that point downward are unchanged
154-
and can be retrieved from a cache. This significantly reduces the amount of remote
155-
memory reads needed when call stacks are deep and stable at their base.
156-
157-
The update in `_PyEval_FrameClearAndPop` is guarded: it only writes when
158-
`last_profiled_frame` is non-NULL AND matches the frame being popped. This
159-
prevents transient frames (called and returned between profiler samples) from
160-
corrupting the cache pointer, while avoiding any overhead when profiling is inactive.
154+
`last_profiled_frame`, then validate the pointer and sequence before using cached
155+
callers. This prevents a later frame that reuses the same `_PyInterpreterFrame`
156+
address from being mistaken for the sampled frame. The cache significantly
157+
reduces the amount of remote memory reads needed when call stacks are deep and
158+
stable at their base.
159+
160+
The update in `_PyEval_FrameClearAndPop` is guarded: it only advances the
161+
pointer and sequence when `last_profiled_frame` is non-NULL AND matches the
162+
frame being popped. This prevents transient frames (called and returned between
163+
profiler samples) from corrupting the cache anchor, while avoiding any overhead
164+
when profiling is inactive.
161165

162166

163167
### The Instruction Pointer
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
Fix skewed stack trackes in the Tachyon profiler when caching is enabled and
1+
Fix skewed stack traces in the Tachyon profiler when caching is enabled and
22
when generators and coroutines are profiled, by updating
33
``tstate->last_profiled_frame`` at every frame-removal site. The issue resulted
44
in total erasure of some callers. Patch by Maurycy Pawłowski-Wieroński.
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Fix another way the Tachyon profiler frame cache could produce impossible
2+
mixed stack traces when ``_PyInterpreterFrame`` addresses are reused, by
3+
validating cached frame anchors with a sequence counter.

Modules/_remote_debugging/_remote_debugging.h

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -225,9 +225,15 @@ typedef struct {
225225
#define FRAME_CACHE_MAX_THREADS 32
226226
#define FRAME_CACHE_MAX_FRAMES 1024
227227

228+
typedef struct {
229+
uintptr_t frame;
230+
uintptr_t seq;
231+
} FrameCacheAnchor;
232+
228233
typedef struct {
229234
uint64_t thread_id; // 0 = empty slot
230235
uintptr_t thread_state_addr;
236+
uintptr_t last_profiled_frame_seq; // sequence paired with addrs[0]
231237
uintptr_t addrs[FRAME_CACHE_MAX_FRAMES];
232238
Py_ssize_t num_addrs;
233239
PyObject *thread_id_obj; // owned reference, NULL if empty
@@ -434,7 +440,7 @@ typedef struct {
434440
uintptr_t thread_state_addr; // Owning thread state address
435441
uintptr_t base_frame_addr; // Sentinel at bottom (for validation)
436442
uintptr_t gc_frame; // GC frame address (0 if not tracking)
437-
uintptr_t last_profiled_frame; // Last cached frame (0 if no cache)
443+
FrameCacheAnchor last_profiled; // Last cached frame anchor
438444
StackChunkList *chunks; // Pre-copied stack chunks
439445
int skip_first_frame; // Skip frame_addr itself (continue from its caller)
440446
RemoteReadPrefetch prefetch; // Optional already-read thread/frame buffers
@@ -622,15 +628,21 @@ extern void frame_cache_cleanup(RemoteUnwinderObject *unwinder);
622628
extern FrameCacheEntry *frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id);
623629
extern FrameCacheEntry *frame_cache_find_by_tstate(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr);
624630
extern int clear_last_profiled_frames(RemoteUnwinderObject *unwinder);
631+
extern int set_last_profiled_frame(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr, uintptr_t frame_addr);
625632
extern void frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result);
626633
extern int frame_cache_lookup_and_extend(
627634
RemoteUnwinderObject *unwinder,
628635
uint64_t thread_id,
629-
uintptr_t last_profiled_frame,
636+
uintptr_t thread_state_addr,
637+
FrameCacheAnchor anchor,
630638
PyObject *frame_info,
631639
uintptr_t *frame_addrs,
632640
Py_ssize_t *num_addrs,
633641
Py_ssize_t max_addrs);
642+
extern int frame_cache_anchor_matches(
643+
RemoteUnwinderObject *unwinder,
644+
uintptr_t thread_state_addr,
645+
FrameCacheAnchor anchor);
634646
// Returns: 1 = stored, 0 = not stored (graceful), -1 = error
635647
// Only stores complete stacks that reach base_frame_addr
636648
extern int frame_cache_store(
@@ -640,6 +652,7 @@ extern int frame_cache_store(
640652
const uintptr_t *addrs,
641653
Py_ssize_t num_addrs,
642654
uintptr_t thread_state_addr,
655+
uintptr_t last_profiled_frame_seq,
643656
uintptr_t base_frame_addr,
644657
uintptr_t last_frame_visited);
645658

Modules/_remote_debugging/debug_offsets_validation.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
#define FIELD_SIZE(type, member) sizeof(((type *)0)->member)
3232

3333
enum {
34-
PY_REMOTE_DEBUG_OFFSETS_TOTAL_SIZE = 880,
34+
PY_REMOTE_DEBUG_OFFSETS_TOTAL_SIZE = 888,
3535
PY_REMOTE_ASYNC_DEBUG_OFFSETS_TOTAL_SIZE = 104,
3636
};
3737

@@ -261,7 +261,8 @@ validate_fixed_field(
261261
APPLY(thread_state, next, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size); \
262262
APPLY(thread_state, current_frame, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size); \
263263
APPLY(thread_state, base_frame, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size); \
264-
APPLY(thread_state, last_profiled_frame, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size)
264+
APPLY(thread_state, last_profiled_frame, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size); \
265+
APPLY(thread_state, last_profiled_frame_seq, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size)
265266

266267
#define PY_REMOTE_DEBUG_INTERPRETER_STATE_FIELDS(APPLY, buffer_size) \
267268
APPLY(interpreter_state, id, sizeof(int64_t), _Alignof(int64_t), buffer_size); \

Modules/_remote_debugging/frame_cache.c

Lines changed: 85 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -147,50 +147,119 @@ frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result)
147147
Py_CLEAR(unwinder->frame_cache[i].frame_list);
148148
unwinder->frame_cache[i].thread_id = 0;
149149
unwinder->frame_cache[i].thread_state_addr = 0;
150+
unwinder->frame_cache[i].last_profiled_frame_seq = 0;
150151
unwinder->frame_cache[i].num_addrs = 0;
151152
STATS_INC(unwinder, stale_cache_invalidations);
152153
}
153154
}
154155
}
155156

157+
static int
158+
read_last_profiled_anchor(RemoteUnwinderObject *unwinder,
159+
uintptr_t thread_state_addr,
160+
FrameCacheAnchor *anchor)
161+
{
162+
uintptr_t frame_offset = (uintptr_t)unwinder->debug_offsets.thread_state.last_profiled_frame;
163+
uintptr_t seq_offset = (uintptr_t)unwinder->debug_offsets.thread_state.last_profiled_frame_seq;
164+
165+
// These fields are adjacent in PyThreadState. Read them together when the
166+
// layout allows it so validation uses a pointer and sequence from the same
167+
// remote-memory read.
168+
if (seq_offset == frame_offset + sizeof(uintptr_t)) {
169+
uintptr_t live_anchor[2];
170+
if (_Py_RemoteDebug_ReadRemoteMemory(&unwinder->handle,
171+
thread_state_addr + frame_offset,
172+
sizeof(live_anchor),
173+
live_anchor) < 0) {
174+
return -1;
175+
}
176+
anchor->frame = live_anchor[0];
177+
anchor->seq = live_anchor[1];
178+
return 0;
179+
}
180+
181+
if (read_ptr(unwinder, thread_state_addr + frame_offset, &anchor->frame) < 0) {
182+
return -1;
183+
}
184+
return read_ptr(unwinder, thread_state_addr + seq_offset, &anchor->seq);
185+
}
186+
187+
static Py_ssize_t
188+
find_cached_frame_addr(const FrameCacheEntry *entry, uintptr_t frame_addr,
189+
uintptr_t *real_pops)
190+
{
191+
*real_pops = 0;
192+
for (Py_ssize_t i = 0; i < entry->num_addrs; i++) {
193+
if (entry->addrs[i] == frame_addr) {
194+
return i;
195+
}
196+
if (entry->addrs[i] != 0) {
197+
(*real_pops)++;
198+
}
199+
}
200+
return -1;
201+
}
202+
203+
int
204+
frame_cache_anchor_matches(
205+
RemoteUnwinderObject *unwinder,
206+
uintptr_t thread_state_addr,
207+
FrameCacheAnchor anchor)
208+
{
209+
FrameCacheAnchor live_anchor = {0, 0};
210+
if (read_last_profiled_anchor(unwinder, thread_state_addr, &live_anchor) < 0) {
211+
PyErr_Clear();
212+
return 0;
213+
}
214+
return live_anchor.frame == anchor.frame && live_anchor.seq == anchor.seq;
215+
}
216+
156217
// Find last_profiled_frame in cache and extend frame_info with cached continuation
157218
// If frame_addrs is provided (not NULL), also extends it with cached addresses
158219
int
159220
frame_cache_lookup_and_extend(
160221
RemoteUnwinderObject *unwinder,
161222
uint64_t thread_id,
162-
uintptr_t last_profiled_frame,
223+
uintptr_t thread_state_addr,
224+
FrameCacheAnchor anchor,
163225
PyObject *frame_info,
164226
uintptr_t *frame_addrs,
165227
Py_ssize_t *num_addrs,
166228
Py_ssize_t max_addrs)
167229
{
168-
if (!unwinder->frame_cache || last_profiled_frame == 0) {
230+
if (!unwinder->frame_cache || anchor.frame == 0) {
169231
return 0;
170232
}
171233

172234
FrameCacheEntry *entry = frame_cache_find(unwinder, thread_id);
173235
if (!entry || !entry->frame_list) {
174236
return 0;
175237
}
238+
if (entry->thread_state_addr != thread_state_addr) {
239+
return 0;
240+
}
176241

177242
assert(entry->num_addrs >= 0 && entry->num_addrs <= FRAME_CACHE_MAX_FRAMES);
178243

179-
// Find the index where last_profiled_frame matches
180-
Py_ssize_t start_idx = -1;
181-
for (Py_ssize_t i = 0; i < entry->num_addrs; i++) {
182-
if (entry->addrs[i] == last_profiled_frame) {
183-
start_idx = i;
184-
break;
185-
}
186-
}
187-
244+
uintptr_t real_pops = 0;
245+
Py_ssize_t start_idx = find_cached_frame_addr(entry, anchor.frame, &real_pops);
188246
if (start_idx < 0) {
189247
return 0; // Not found
190248
}
191249
assert(start_idx < entry->num_addrs);
192250

251+
// Synthetic marker frames (<native>/<GC>) are stored as addr-0 entries but
252+
// never increment last_profiled_frame_seq in the target (only real frame
253+
// pops do). Count the real frames before start_idx so the sequence check is
254+
// not thrown off by markers sitting between the leaf and the anchor.
255+
if (entry->last_profiled_frame_seq + real_pops != anchor.seq) {
256+
return 0;
257+
}
258+
193259
Py_ssize_t num_frames = PyList_GET_SIZE(entry->frame_list);
260+
if (start_idx >= num_frames) {
261+
return 0;
262+
}
194263

195264
// Extend frame_info with frames ABOVE start_idx (not including it).
196265
// The frame at start_idx (last_profiled_frame) was the executing frame
@@ -200,6 +269,9 @@ frame_cache_lookup_and_extend(
200269
if (cache_start >= num_frames) {
201270
return 0; // Nothing above last_profiled_frame to extend with
202271
}
272+
if (!frame_cache_anchor_matches(unwinder, thread_state_addr, anchor)) {
273+
return 0;
274+
}
203275

204276
PyObject *slice = PyList_GetSlice(entry->frame_list, cache_start, num_frames);
205277
if (!slice) {
@@ -235,6 +307,7 @@ frame_cache_store(
235307
const uintptr_t *addrs,
236308
Py_ssize_t num_addrs,
237309
uintptr_t thread_state_addr,
310+
uintptr_t last_profiled_frame_seq,
238311
uintptr_t base_frame_addr,
239312
uintptr_t last_frame_visited)
240313
{
@@ -277,6 +350,7 @@ frame_cache_store(
277350
}
278351
entry->thread_id = thread_id;
279352
entry->thread_state_addr = thread_state_addr;
353+
entry->last_profiled_frame_seq = last_profiled_frame_seq;
280354
if (entry->thread_id_obj == NULL) {
281355
entry->thread_id_obj = PyLong_FromUnsignedLongLong(thread_id);
282356
if (entry->thread_id_obj == NULL) {

0 commit comments

Comments
 (0)