diff --git a/tests/ir_lowering/call_operands.ll b/tests/ir_lowering/call_operands.ll
index 9e8a6131f..4448cec13 100644
--- a/tests/ir_lowering/call_operands.ll
+++ b/tests/ir_lowering/call_operands.ll
@@ -6,8 +6,7 @@
 ; call f(1i32, 2i32, 3i32) [safepoint: 1i64, ()]
 ; ret 0i32
 ; }
-;
-; func llvm.experimental.stackmap(%arg0: i64, %arg1: i32, ...);
+; ...

 ; Check a call instruction lowers and prints correctly.
 ;
diff --git a/tests/ir_lowering/empty.ll b/tests/ir_lowering/empty.ll
index a7b2c9b40..f64c8c795 100644
--- a/tests/ir_lowering/empty.ll
+++ b/tests/ir_lowering/empty.ll
@@ -1,15 +1,16 @@
 ; Dump:
 ; stdout:
 ; # IR format version: 0
-; # Num funcs: 1
+; # Num funcs: 2
 ; # Num consts: 0
 ; # Num global decls: 0
-; # Num types: 2
+; # Num types: 4
 ;
 ; func main() {
 ; bb0:
 ; ret
 ; }
+; ...

 ; The simplest test you could write. Checks an empty module lowers correctly.
diff --git a/tests/ir_lowering/gepoperand.ll b/tests/ir_lowering/gepoperand.ll
index ae9e87fc8..70c8e5bb5 100644
--- a/tests/ir_lowering/gepoperand.ll
+++ b/tests/ir_lowering/gepoperand.ll
@@ -7,6 +7,7 @@
 ; %0_1: i32 = load %0_0
 ; ret
 ; }
+; ...

 ; Check that GEP operands are rewritten to GEP instructions which in turn are
 ; lowered to a ptr_add and a load.
diff --git a/tests/ir_lowering/mem_access.ll b/tests/ir_lowering/mem_access.ll
index f700caa68..c7e234268 100644
--- a/tests/ir_lowering/mem_access.ll
+++ b/tests/ir_lowering/mem_access.ll
@@ -16,6 +16,7 @@
 ; unimplemented << store i16 0, ptr %0, align 1>>
 ; ret
 ; }
+; ...

 ; Check that loads and stores lower OK.
diff --git a/tests/ir_lowering/null_ptr.ll b/tests/ir_lowering/null_ptr.ll
index 0b79a5c2a..1e9f21484 100644
--- a/tests/ir_lowering/null_ptr.ll
+++ b/tests/ir_lowering/null_ptr.ll
@@ -5,6 +5,7 @@
 ; bb0:
 ; ret 0x0
 ; }
+; ...

 ; Check null pointer constants lower OK.
diff --git a/tests/ir_lowering/struct.ll b/tests/ir_lowering/struct.ll
index c1a941c0f..530d949b3 100644
--- a/tests/ir_lowering/struct.ll
+++ b/tests/ir_lowering/struct.ll
@@ -7,6 +7,7 @@
 ; %0_1: {0: i32, 64: i64} = insert_val %0_0, 100i32
 ; ret
 ; }
+; ...

 ; Check that a structure type lowers correctly.
diff --git a/tests/ir_lowering/unsupported_variants.ll b/tests/ir_lowering/unsupported_variants.ll
index ed5a8a80f..ae4c6d8df 100644
--- a/tests/ir_lowering/unsupported_variants.ll
+++ b/tests/ir_lowering/unsupported_variants.ll
@@ -2,49 +2,48 @@
 ; stdout:
 ; ...
 ; func main(...
-; bb0:
+; bb{{_}}:
 ; ...
-; %{{_}}: ?ty<<8 x ptr>> = unimplemented << %{{4}} = getelementptr i32, <8 x ptr> %{{1}}, i32 1>>
+; %{{_}}: ?ty<<8 x ptr>> = unimplemented << %{{4}} = getelementptr i32, <8 x ptr> %{{1}}, i32 1, ...
 ; %{{_}}: ptr = unimplemented << %{{_}} = getelementptr [8 x i8], ptr %{{_}}, i512 %{{_}}>>
-; br bb1
-; bb1:
-; %{{_}}: ptr = unimplemented << %{{6}} = alloca inalloca i32, align 4>>
+; br bb{{_}}
+; bb{{_}}:
+; %{{_}}: ptr = unimplemented << %{{6}} = alloca inalloca i32, align 4, ...
 ; %{{_}}: ptr = unimplemented << %{{7}} = alloca i32, align 4, addrspace(4)>>
 ; %{{_}}: ptr = unimplemented << %{{8}} = alloca i32, i32 %2, align 4>>
-; br bb2
-; bb2:
-; %{{_}}: float = unimplemented << %{{13}} = fadd nnan float %{{3}}, %{{3}}>>
+; br bb{{_}}
+; bb{{_}}:
+; %{{_}}: float = unimplemented << %{{13}} = fadd nnan float %{{3}}, %{{3}}, ...
 ; %{{_}}: ?ty<<4 x i32>> = unimplemented << %{{15}} = add <4 x i32> %{{44}}, %{{44}}>>
-; br bb3
-; bb3:
-; %{{_}}: i32 = unimplemented << %{{17}} = call i32 @f(i32 swiftself 5) >>
+; br bb{{_}}
+; bb{{_}}:
+; %{{_}}: i32 = unimplemented << %{{17}} = call i32 @f(i32 swiftself 5), !yk-swt-bb-purpose !4 >>
 ; %{{_}}: i32 = unimplemented << %{{18}} = call inreg i32 @f(i32 5) >>
 ; %{{_}}: i32 = unimplemented << %{{19}} = call i32 @f(i32 5) #{{0}} >>
 ; %{{_}}: float = unimplemented << %{{20}} = call nnan float @g() >>
 ; %{{_}}: i32 = unimplemented << %{{21}} = call ghccc i32 @f(i32 5) >>
 ; %{{_}}: i32 = unimplemented << %{{22}} = call i32 @f(i32 5) [ "kcfi"(i32 1234) ] >>
 ; %{{_}}: ptr = unimplemented << %{{23}} = call addrspace(6) ptr @p() >>
-; br bb4
-; bb4:
-; %{{_}}: ?ty<<8 x i8>> = unimplemented << %{{26}} = ptrtoint <8 x ptr> %{{ptrs}} to <8 x i8>>>
+; br bb{{_}}
+; bb{{_}}:
+; %{{_}}: ?ty<<8 x i8>> = unimplemented << %{{26}} = ptrtoint <8 x ptr> %{{ptrs}} to <8 x i8>, ...
 ; %{{_}}: ?ty<<4 x i64>> = unimplemented << %{{_}} = sext <4 x i32> %{{_}} to <4 x i64>>>
 ; %{{_}}: ?ty<<4 x i64>> = unimplemented << %{{_}} = zext <4 x i32> %{{_}} to <4 x i64>>>
 ; %{{_}}: ?ty<<4 x i8>> = unimplemented << %{{_}} = trunc <4 x i32> %{{_}} to <4 x i8>>>
-; br bb5
-; bb5:
-; %{{_}}: ?ty<<4 x i1>> = unimplemented << %{{27}} = icmp ne <4 x i32> %{{444}}, zeroinitializer>>
-; br bb6
-; bb6:
+; br bb{{_}}
+; bb{{_}}:
+; %{{_}}: ?ty<<4 x i1>> = unimplemented << %{{27}} = icmp ne <4 x i32> %{{444}}, zeroinitializer, ...
+; br bb{{_}}
+; bb{{_}}:
 ; %{{_}}: i32 = load %0_0
 ; %{{_}}: i32 = unimplemented << %{{_}} = load i32, ptr addrspace(10) %{{_}}, align 4 >>
 ; %{{_}}: i32 = load %0_0
 ; br ...
 ; ...
-; bb10:
 ; %{{_}}: float = unimplemented << %{{_}} = phi nnan float...
-; br bb11
-; bb11:
-; unimplemented << store atomic i32 0, ptr %0 release, align 4>>
+; br bb{{_}}
+; bb{{_}}:
+; unimplemented << store atomic i32 0, ptr %0 release, align 4, ...
 ; unimplemented << store i32 0, ptr addrspace(10) %5, align 4 >>
 ; unimplemented << store i32 0, ptr %0, align 2>>
 ; ret
diff --git a/tests/langtest_ir_lowering.rs b/tests/langtest_ir_lowering.rs
index 93bce0513..75a0e855e 100644
--- a/tests/langtest_ir_lowering.rs
+++ b/tests/langtest_ir_lowering.rs
@@ -34,6 +34,12 @@ fn main() {
         // We don't use yk-config here, as we are testing one very specific functionality that
         // requires only one special flag.
         let mut compiler = Command::new(ykllvm_bin("clang"));
+        let md = env::var("CARGO_MANIFEST_DIR").unwrap();
+        let profile = full_cargo_profile();
+        let ykcapi_path = [&md, "..", "target", &profile, "deps"]
+            .iter()
+            .collect::<PathBuf>();
+        let ykcapi_linkdir = format!("-L{}", ykcapi_path.to_str().unwrap());
         compiler.args([
             "-flto",
             "-fuse-ld=lld",
             "-o",
             exe.to_str().unwrap(),
             "-Wl,-mllvm=--yk-embed-ir",
+            // The serialiser now assumes that we are doing software tracing.
+            "-Wl,--mllvm=--yk-basicblock-tracer",
+            // Link libykcapi so that the tests inherit the necessary software tracing symbols.
+            &ykcapi_linkdir,
+            "-lykcapi",
             p.to_str().unwrap(),
         ]);
-
-        let md = env::var("CARGO_MANIFEST_DIR").unwrap();
-        let profile = full_cargo_profile();
         let dumper_path = [&md, "..", "target", &profile, "dump_ir"]
             .iter()
             .collect::<PathBuf>();
diff --git a/ykcapi/src/lib.rs b/ykcapi/src/lib.rs
index 690021402..0e5b4d805 100644
--- a/ykcapi/src/lib.rs
+++ b/ykcapi/src/lib.rs
@@ -71,11 +71,29 @@ pub extern "C" fn __ykrt_control_point(
     // FIXME: We could get rid of this entire function if we pass the frame's base pointer into the
     // control point from the interpreter.
     std::arch::naked_asm!(
-        // Pass the interpreter frame's base pointer via the 4th argument register.
-        "sub rsp, 8", // Alignment
-        "mov rcx, rbp", // Pass interpreter frame's base pointer via 4th argument register.
+        "sub rsp, 8", // Alignment
+        // Push the callee-save registers to the stack. This is required so that traces can read
+        // live variables from them.
+        "push rbp",
+        "push rbx",
+        "push r12",
+        "push r13",
+        "push r14",
+        "push r15",
+        // Pass interpreter frame's base pointer via 4th argument register.
+        "mov rcx, rbp",
+        // Do the call.
         "call __ykrt_control_point_real",
-        "add rsp, 8",
+        // Restore callee-save registers.
+        "pop r15",
+        "pop r14",
+        "pop r13",
+        "pop r12",
+        "pop rbx",
+        "pop rbp",
+        "add rsp, 8", // Alignment.
+        // NOTE! If the control point determined that a trace needs to be executed, then the return
+        // address has been overwritten and this `ret` will jump to the trace's entry point!
         "ret",
     );
 }
diff --git a/ykllvm b/ykllvm
index 58db46cfc..8d5a4c6af 160000
--- a/ykllvm
+++ b/ykllvm
@@ -1 +1 @@
-Subproject commit 58db46cfc016d6d1bd350ea861c6c5258a674ae2
+Subproject commit 8d5a4c6af26d83615ee518e04c0ddda0d4ebf176
diff --git a/ykrt/src/compile/jitc_yk/trace_builder.rs b/ykrt/src/compile/jitc_yk/trace_builder.rs
index ad79eae15..f0a75fc15 100644
--- a/ykrt/src/compile/jitc_yk/trace_builder.rs
+++ b/ykrt/src/compile/jitc_yk/trace_builder.rs
@@ -18,11 +18,13 @@ use crate::{
     mt::{MT, TraceId},
     trace::{AOTTraceIterator, TraceAction},
 };
+use smallvec::SmallVec;
 use std::{collections::HashMap, ffi::CString, sync::Arc};
 use ykaddr::addr::symbol_to_ptr;

-/// Caller-saved registers in DWARF notation.
-static CALLER_CLOBBER_REG: [u16; 9] = [0, 1, 2, 4, 5, 8, 9, 10, 11];
+/// SysV x86_64 callee-saved registers in DWARF notation.
+#[cfg(target_arch = "x86_64")]
+static CALLEE_SAVE_REGS: [u16; 6] = [6, 3, 12, 13, 14, 15];

 /// Given an execution trace and AOT IR, creates a JIT IR trace.
 pub(crate) struct TraceBuilder {
@@ -135,6 +137,12 @@ impl TraceBuilder {
         &mut self,
         blk: &'static aot_ir::BBlock,
     ) -> Result<(), CompilationError> {
+        // This code assumes that the trace starts from a call to the control point. In other
+        // words, that it's a root trace.
+        assert!(!matches!(
+            self.jit_mod.tracekind(),
+            TraceKind::Sidetrace(..)
+        ));
         // Find the control point call to retrieve the live variables from its safepoint.
         let safepoint = match self.jit_mod.tracekind() {
             TraceKind::HeaderOnly | TraceKind::HeaderAndBody | TraceKind::DifferentFrames => {
@@ -172,29 +180,67 @@
                 todo!("Deal with multi register locations");
             }

-            // Rewrite registers to their spill locations. We need to do this as we no longer
-            // push/pop registers around the control point to reduce its overhead. We know that
-            // for every live variable in a caller-saved register there must exist a spill offset
-            // in that location's extras.
+            // Rewrite live values in caller-save registers to their saved locations.
+            //
+            // Due to the unconventional control flow involved in executing a root trace, we cannot
+            // guarantee that caller-save registers have their values preserved between entering
+            // the control point and executing a trace.
+            //
+            // When the interpreter calls the control point, and a trace is going to be executed,
+            // this is the sequence of events:
+            //
+            //  - the interpreter saves its caller-save registers before calling
+            //    `__ykrt_control_point`.
+            //  - `__ykrt_control_point` saves the callee-save registers before running all of the
+            //    Rust internals of the control point.
+            //  - eventually the control point internals determine that a trace needs to be
+            //    executed and call `__yk_exec_trace`.
+            //  - `__yk_exec_trace` *overwrites the return address* of the control point with the
+            //    trace entry point, before returning.
+            //  - Rust frames return normally until we reach the frame for `__ykrt_control_point`:
+            //    the frame that had its return address overwritten.
+            //  - `__ykrt_control_point` returns to *the trace*.
+            //
+            // After this, the stack looks as though the interpreter directly called the trace, but
+            // this isn't the case at all, and the caller-save registers remain in their "probably
+            // clobbered" state. Yet sometimes traces expect to be able to read from a caller-save
+            // register, as though the control point had never run.
+            //
+            // For every live variable in a caller-save register we *must* find the saved copy of
+            // that value created during the call sequence to the control point. For each such live
+            // variable there must be a copy either spilled to the stack, or in a *callee-save*
+            // register. Here we tell the trace to use one of those alternative locations instead.
             let loc = match &var[0] {
-                yksmp::Location::Register(reg, size, v) => {
+                yksmp::Location::Register(_, size, v) => {
                     let mut newloc = None;
-                    for offset in v {
-                        if *offset < 0 {
-                            newloc = Some(yksmp::Location::Indirect(6, i32::from(*offset), *size));
-                            break;
+                    for extra in v {
+                        if *extra > 0 && CALLEE_SAVE_REGS.contains(&u16::try_from(*extra).unwrap())
+                        {
+                            newloc = Some(yksmp::Location::Register(
+                                u16::try_from(*extra).unwrap(),
+                                *size,
+                                SmallVec::new(),
+                            ));
+                            break; // Stop now, a register save is the best-case scenario.
+                        } else if *extra < 0 && newloc.is_none() {
+                            newloc = Some(yksmp::Location::Indirect(6, i32::from(*extra), *size));
+                            // No `break`. Keep trying, in case we find the value in a register,
+                            // which is preferable.
                         }
                     }
                     if let Some(loc) = newloc {
                         loc
-                    } else if CALLER_CLOBBER_REG.contains(reg) {
-                        panic!("No spill offset for caller-saved register.")
                     } else {
                         var[0].clone()
                     }
                 }
                 _ => var[0].clone(),
             };
+            if let yksmp::Location::Register(r, ..) = loc {
+                // If we plan to read a live value from a register, it must be from a callee-save
+                // register now. See the large comment above for an explanation.
+                assert!(CALLEE_SAVE_REGS.contains(&r));
+            }
             let param_inst = jit_ir::ParamInst::new(ParamIdx::try_from(idx)?, input_tyidx).into();
             self.jit_mod.push(param_inst)?;
diff --git a/ykrt/src/lib.rs b/ykrt/src/lib.rs
index b968ab96e..bfee69b77 100644
--- a/ykrt/src/lib.rs
+++ b/ykrt/src/lib.rs
@@ -4,6 +4,7 @@
 #![feature(assert_matches)]
 #![feature(int_roundings)]
 #![feature(trim_prefix_suffix)]
+#![feature(thread_local)]
 #![allow(clippy::too_many_arguments)]
 #![allow(clippy::type_complexity)]
 #![allow(clippy::upper_case_acronyms)]
diff --git a/ykrt/src/mt.rs b/ykrt/src/mt.rs
index f4206233c..ffe561cff 100644
--- a/ykrt/src/mt.rs
+++ b/ykrt/src/mt.rs
@@ -14,7 +14,6 @@ use std::{
     },
 };

-use atomic_enum::atomic_enum;
 use parking_lot::Mutex;
 #[cfg(not(all(feature = "yk_testing", not(test))))]
 use parking_lot_core::SpinWait;
@@ -74,11 +73,17 @@ thread_local! {
     /// This thread's [MTThread]. Do not access this directly: use [MTThread::with_borrow] or
     /// [MTThread::with_borrow_mut].
     static THREAD_MTTHREAD: RefCell<MTThread> = RefCell::new(MTThread::new());
-    /// Is this thread tracing something? Do not access this directly: use [MTThread::is_tracing]
-    /// and friends.
-    static THREAD_IS_TRACING: AtomicIsTracing = const { AtomicIsTracing::new(IsTracing::None) };
 }

+// The current thread's tracing state.
+//
+// Note: this thread local is shared with code generated by our BasicBlockTracer ykllvm pass,
+// hence we use "native" TLS instead of `thread_local!`.
+#[allow(non_upper_case_globals)]
+#[unsafe(no_mangle)]
+#[thread_local]
+static mut __yk_thread_tracing_state: IsTracing = IsTracing::None;
+
 /// A meta-tracer. This is always passed around stored in an [Arc].
 ///
 /// When you are finished with this meta-tracer, it is best to explicitly call [MT::shutdown] to
@@ -1319,17 +1324,18 @@ impl MTThread {
     /// Is this thread currently tracing something?
     pub(crate) fn is_tracing() -> bool {
-        THREAD_IS_TRACING.with(|x| x.load(Ordering::Relaxed) != IsTracing::None)
+        unsafe { __yk_thread_tracing_state != IsTracing::None }
     }

     /// What kind of tracing (if any!) is this thread undertaking?
     fn tracing_kind() -> IsTracing {
-        THREAD_IS_TRACING.with(|x| x.load(Ordering::Relaxed))
+        let raw = &raw mut __yk_thread_tracing_state;
+        unsafe { std::ptr::read(raw) }
     }

-    /// Mark this thread as currently tracing something.
+    /// Set this thread's tracing state.
     fn set_tracing(kind: IsTracing) {
-        THREAD_IS_TRACING.with(|x| x.store(kind, Ordering::Relaxed));
+        unsafe { __yk_thread_tracing_state = kind }
     }

     /// Call `f` with a `&` reference to this thread's [MTThread] instance.
@@ -1483,7 +1489,6 @@ impl MTThread {
     }
 }

-#[atomic_enum]
 #[derive(PartialEq)]
 enum IsTracing {
     None,
diff --git a/ykrt/src/trace/swt/mod.rs b/ykrt/src/trace/swt/mod.rs
index d50c10efa..2433c3477 100644
--- a/ykrt/src/trace/swt/mod.rs
+++ b/ykrt/src/trace/swt/mod.rs
@@ -17,10 +17,9 @@ thread_local! {
     static BASIC_BLOCKS: RefCell<Vec<TracingBBlock>> = const { RefCell::new(vec![]) };
 }

-/// If the current thread is tracing, records the specified basicblock into the software tracing
-/// buffer.
+/// Records the specified basic block into the software tracing buffer.
 ///
-/// If the current thread is not tracing, it does nothing.
+/// This must only be called if the current thread is tracing.
 ///
 /// # Arguments
 /// * `function_index` - The index of the function to which the basic block belongs.
@@ -28,14 +27,13 @@ thread_local! {
 #[cfg(tracer_swt)]
 #[unsafe(no_mangle)]
 pub extern "C" fn __yk_trace_basicblock(function_index: usize, block_index: usize) {
-    if MTThread::is_tracing() {
-        BASIC_BLOCKS.with(|v| {
-            v.borrow_mut().push(TracingBBlock {
-                function_index,
-                block_index,
-            });
-        })
-    }
+    debug_assert!(MTThread::is_tracing());
+    BASIC_BLOCKS.with(|v| {
+        v.borrow_mut().push(TracingBBlock {
+            function_index,
+            block_index,
+        });
+    })
 }

 pub(crate) struct SWTracer {}
diff --git a/yksmp/src/lib.rs b/yksmp/src/lib.rs
index fa67db806..e8eb63525 100644
--- a/yksmp/src/lib.rs
+++ b/yksmp/src/lib.rs
@@ -173,6 +173,10 @@ impl StackMapParser<'_> {
         for mut r in records.drain(..) {
             // Calculate the absolute offset for this record in the binary.
             r.offset += f.addr;
+            // `u64::MAX` means that LLVM doesn't know the size of the frame statically. This
+            // happens when there is an `alloca` instruction in a non-entry block. We can't work
+            // with such frames.
+            assert_ne!(f.stack_size, u64::MAX);
             r.size = f.stack_size;
             let idx = usize::try_from(r.id).unwrap();
             all_records[idx] = (r, i);
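
The mt.rs and swt/mod.rs hunks above only work in tandem with the instrumentation that ykllvm's BasicBlockTracer pass inserts into the interpreter binary; that generated code is not part of this diff. As a rough conceptual sketch only -- the helper name and the assumption that `IsTracing::None` is encoded as zero are illustrative, not taken from the yk sources -- the per-basic-block guard the pass is expected to emit behaves like the following C:

    /* Illustrative sketch only; not the actual BasicBlockTracer output. */
    #include <stddef.h>

    /* Native TLS state defined (and exported via #[unsafe(no_mangle)]) in ykrt/src/mt.rs.
       Treating it as a byte with IsTracing::None == 0 is an assumption for this sketch. */
    extern _Thread_local unsigned char __yk_thread_tracing_state;

    /* Recorder defined in ykrt/src/trace/swt/mod.rs; it now only debug_asserts the state. */
    extern void __yk_trace_basicblock(size_t function_index, size_t block_index);

    /* Conceptually prepended to every basic block of the interpreter: the tracing check now
       happens at the call site, via a direct TLS read, rather than inside the callee. */
    static inline void yk_swt_hook(size_t fidx, size_t bidx) { /* hypothetical helper name */
        if (__yk_thread_tracing_state != 0 /* != IsTracing::None, by assumption */) {
            __yk_trace_basicblock(fidx, bidx);
        }
    }

This is also why tests/langtest_ir_lowering.rs now links libykcapi: once the basic-block tracer is enabled at link time, the instrumented test binaries need the software tracing symbols it provides.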