diff --git a/clang/test/Yk/idempotent_inlined_promotions.c b/clang/test/Yk/idempotent_inlined_promotions.c
index 47c8437695043..505ec359339fe 100644
--- a/clang/test/Yk/idempotent_inlined_promotions.c
+++ b/clang/test/Yk/idempotent_inlined_promotions.c
@@ -1,7 +1,7 @@
 // Checks the compiler borks if a __yk_promote_* gets inlined into a
 // yk_idempotent function.
 //
-// RUN: not %clang -O2 -mllvm --yk-embed-ir -mllvm --yk-insert-stackmaps %s 2>&1 | FileCheck %s
+// RUN: not %clang -O2 -mllvm --yk-embed-ir -mllvm --yk-insert-stackmaps -mllvm --yk-basicblock-tracer %s 2>&1 | FileCheck %s
 
 void *__yk_promote_ptr(void *);
 
diff --git a/clang/test/Yk/outline_inlined_promotions.c b/clang/test/Yk/outline_inlined_promotions.c
index 1a3b41e1eb4df..fe4b0fda3fc3b 100644
--- a/clang/test/Yk/outline_inlined_promotions.c
+++ b/clang/test/Yk/outline_inlined_promotions.c
@@ -1,7 +1,7 @@
 // Checks the compiler borks if a __yk_promote_* gets inlined into a
 // yk_outline function.
 //
-// RUN: not %clang -O2 -mllvm --yk-embed-ir -mllvm --yk-insert-stackmaps %s 2>&1 | FileCheck %s
+// RUN: not %clang -O2 -mllvm --yk-embed-ir -mllvm --yk-insert-stackmaps -mllvm --yk-basicblock-tracer %s 2>&1 | FileCheck %s
 
 void *__yk_promote_ptr(void *);
 
diff --git a/llvm/include/llvm/Transforms/Yk/BasicBlockTracer.h b/llvm/include/llvm/Transforms/Yk/BasicBlockTracer.h
index 0e4511af577ca..61dfa240d3ace 100644
--- a/llvm/include/llvm/Transforms/Yk/BasicBlockTracer.h
+++ b/llvm/include/llvm/Transforms/Yk/BasicBlockTracer.h
@@ -6,6 +6,9 @@
 // The name of the trace function - used in swt tracing.
 #define YK_TRACE_FUNCTION "__yk_trace_basicblock"
 
+// The name of the thread tracing state thread local.
+#define YK_THREAD_TRACING_STATE_TL "__yk_thread_tracing_state"
+
 // The name of the dummy (noop) trace function - used in multi-module swt
 // tracing.
 #define YK_TRACE_FUNCTION_DUMMY "__yk_trace_basicblock_dummy"
diff --git a/llvm/lib/Transforms/Yk/BasicBlockTracer.cpp b/llvm/lib/Transforms/Yk/BasicBlockTracer.cpp
index 7c999e8073d29..ca6101908825a 100644
--- a/llvm/lib/Transforms/Yk/BasicBlockTracer.cpp
+++ b/llvm/lib/Transforms/Yk/BasicBlockTracer.cpp
@@ -1,3 +1,24 @@
+//===- The Basic Block Tracer Pass -------------------------------------===//
+//
+// For each basic block, the IR is modified such that it has the following
+// control flow (pseudo-code):
+//
+// ```
+// tracing_check:
+//   %t <- load the "is this thread tracing?" thread local
+//   %dont_record <- %t == 0
+//   condbr %dont_record, done, record
+//
+// record:
+//   call __yk_trace_basicblock(...)
+//   br done
+//
+// done:
+//   ...original contents of block...
+// ```
+//
+//===-------------------------------------------------------------------===//
+//
 #include "llvm/Transforms/Yk/BasicBlockTracer.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Function.h"
@@ -5,6 +26,7 @@
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
@@ -13,6 +35,8 @@
 #include "llvm/YkIR/YkIRWriter.h"
 
 #define DEBUG_TYPE "yk-basicblock-tracer-pass"
+const uint8_t ThreadTracingStateNone = 0;
+
 using namespace llvm;
 
 namespace llvm {
@@ -29,20 +53,44 @@ struct YkBasicBlockTracer : public ModulePass {
 
   bool runOnModule(Module &M) override {
     LLVMContext &Context = M.getContext();
-    // Create externally linked function declaration:
-    // void __yk_trace_basicblock(int functionIndex, int blockIndex)
+
+    // Declare the thread tracing state thread local (if not already present --
+    // the input program could have already defined it extern).
+    llvm::Type *I8Ty = llvm::Type::getInt8Ty(Context);
+    GlobalVariable *ThreadTracingTL =
+        M.getNamedGlobal(YK_THREAD_TRACING_STATE_TL);
+    if (!ThreadTracingTL) {
+      ThreadTracingTL = new llvm::GlobalVariable(
+          M, I8Ty, false, llvm::GlobalValue::ExternalLinkage, nullptr,
+          YK_THREAD_TRACING_STATE_TL);
+      ThreadTracingTL->setThreadLocalMode(
+          llvm::GlobalValue::GeneralDynamicTLSModel);
+      ThreadTracingTL->setAlignment(Align(1));
+    }
+
+    // Trace function is used to trace the execution of the program.
     Type *ReturnType = Type::getVoidTy(Context);
     Type *FunctionIndexArgType = Type::getInt32Ty(Context);
     Type *BlockIndexArgType = Type::getInt32Ty(Context);
     FunctionType *FType = FunctionType::get(
         ReturnType, {FunctionIndexArgType, BlockIndexArgType}, false);
-
-    // Trace function is used to trace the execution of the program.
     Function *TraceFunc = Function::Create(
         FType, GlobalVariable::ExternalLinkage, YK_TRACE_FUNCTION, M);
 
-    IRBuilder<> builder(Context);
+    // Metadata used to help the serialiser identify the purpose of a block.
+    //
+    // This block is an "are we tracing?" check:
+    MDNode *TracingCheckBBMD =
+        MDNode::get(Context, MDString::get(Context, "swt-tracing-check-bb"));
+    // This block records the block:
+    MDNode *RecordBBMD =
+        MDNode::get(Context, MDString::get(Context, "swt-record-bb"));
+    // This is a block we will serialise (the above two we don't):
+    MDNode *SerialiseBBMD =
+        MDNode::get(Context, MDString::get(Context, "swt-serialise-bb"));
+
+    IRBuilder<> Builder(Context);
     uint32_t FunctionIndex = 0;
     for (auto &F : M) {
      // If we won't ever trace this, don't insert calls to the tracer, as it
@@ -56,11 +104,77 @@ struct YkBasicBlockTracer : public ModulePass {
         continue;
       }
 
-      uint32_t BlockIndex = 0;
+      // Collect *original* blocks that require instrumentation.
+      std::vector<BasicBlock *> BBs;
       for (auto &BB : F) {
-        builder.SetInsertPoint(&*BB.getFirstInsertionPt());
-        builder.CreateCall(TraceFunc, {builder.getInt32(FunctionIndex),
-                                       builder.getInt32(BlockIndex)});
+        BBs.push_back(&BB);
+      }
+
+      uint32_t BlockIndex = 0;
+      for (BasicBlock *BB : BBs) {
+        // If there are allocas in an entry block, then they have to stay
+        // there, otherwise stackmaps will consider the frame to have dynamic
+        // size (and we won't know how big the frame is at runtime).
+        std::vector<AllocaInst *> EntryAllocas;
+        if (BlockIndex == 0) {
+          for (Instruction &I : *BB) {
+            if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
+              EntryAllocas.push_back(AI);
+            }
+          }
+          // We also move the allocas to be first in the block to simplify
+          // serialisation. `llvm::reverse` ensures they appear in the same
+          // order.
+          //
+          // Note: There can be no PHI nodes in an entry block, so we don't
+          // need to check they appear first.
+          Builder.SetInsertPoint(&*BB->getFirstInsertionPt());
+          for (AllocaInst *AI : llvm::reverse(EntryAllocas)) {
+            AI->moveBefore(&*BB->getFirstInsertionPt());
+          }
+        }
+
+        // Insert an "are we tracing?" check.
+        //
+        // It's actually an "are we NOT tracing?" check so that the branch
+        // predictor has an easier time for the common case (that we are not
+        // tracing).
+        //
+        // If this is an entry block with allocas, the check comes after the
+        // allocas, otherwise the check comes first in the block.
+        if (EntryAllocas.size() > 0) {
+          Builder.SetInsertPoint(EntryAllocas.back()->getNextNode());
+        } else {
+          Builder.SetInsertPoint(&*BB->getFirstInsertionPt());
+        }
+        Instruction *ThreadTracingState =
+            Builder.CreateLoad(I8Ty, ThreadTracingTL);
+        Value *DontRec = Builder.CreateICmpEQ(
+            ThreadTracingState, ConstantInt::get(I8Ty, ThreadTracingStateNone));
+
+        // Split off the remainder of the block.
+        BasicBlock *RestBB =
+            BB->splitBasicBlock(cast<Instruction>(DontRec)->getNextNode());
+
+        // Make the block that calls the recorder.
+        BasicBlock *RecBB = llvm::BasicBlock::Create(Context, "", &F, RestBB);
+        Builder.SetInsertPoint(RecBB);
+        Builder.CreateCall(TraceFunc, {Builder.getInt32(FunctionIndex),
+                                       Builder.getInt32(BlockIndex)});
+        Builder.CreateBr(RestBB);
+
+        // Update the terminator of the "are we tracing?" block. We jump over
+        // the recorder block if we are not tracing.
+        Instruction *OldTerm = BB->getTerminator();
+        Builder.SetInsertPoint(OldTerm);
+        Builder.CreateCondBr(DontRec, RestBB, RecBB);
+        OldTerm->eraseFromParent();
+
+        // Attach metadata to the first instruction of each of the blocks so
+        // that we can more easily identify their purpose in the serialiser.
+        BB->front().setMetadata("yk-swt-bb-purpose", TracingCheckBBMD);
+        RecBB->front().setMetadata("yk-swt-bb-purpose", RecordBBMD);
+        RestBB->front().setMetadata("yk-swt-bb-purpose", SerialiseBBMD);
 
         assert(BlockIndex != UINT32_MAX &&
                "Expected BlockIndex to not overflow");
diff --git a/llvm/lib/YkIR/YkIRWriter.cpp b/llvm/lib/YkIR/YkIRWriter.cpp
index 375836b0b936e..8f58f16562605 100644
--- a/llvm/lib/YkIR/YkIRWriter.cpp
+++ b/llvm/lib/YkIR/YkIRWriter.cpp
@@ -2,6 +2,9 @@
 //
 // Converts an LLVM module into Yk's on-disk AOT IR.
 //
+// Note that this serialiser now assumes that the yk basic block tracer pass
+// has already been run.
+//
 //===-------------------------------------------------------------------===//
 
 #include "llvm/YkIR/YkIRWriter.h"
@@ -278,6 +281,32 @@ class FuncLowerCtxt {
 
   bool vlMapContains(Instruction *I) { return VLMap.count(I) == 1; }
 };
 
+// Identifies the "purpose" of a basic block.
+enum BBPurpose {
+  // This block is an "is the current thread tracing?" check.
+  BBPurposeTracingCheck,
+  // This block records a basic block, if we are tracing.
+  BBPurposeRecord,
+  // This block is serialised (into yk AOT IR).
+  BBPurposeSerialise,
+};
+
+// An entry in the basic block cache.
+struct BBCacheEntry {
+  // The purpose of the block.
+  BBPurpose Purpose;
+  // The yk AOT IR basic block index that the above purpose applies to.
+  size_t BBIdx;
+  // The `BBPurposeSerialise` LLVM IR block that this entry corresponds with.
+  BasicBlock *SerBB;
+};
+
+// An entry in the function cache.
+struct FunctionCacheEntry {
+  size_t FuncIdx;
+  size_t NumSerBBs;
+};
+
 // The class responsible for serialising our IR into the interpreter binary.
 //
 // It walks over the LLVM IR, lowering each function, block, instruction, etc.
@@ -313,8 +342,8 @@ class YkIRWriter {
   // File paths.
   vector<string> Paths;
 
-  llvm::DenseMap<Function *, size_t> FunctionCache;
-  llvm::DenseMap<BasicBlock *, size_t> BBCache;
+  llvm::DenseMap<Function *, FunctionCacheEntry> FunctionCache;
+  llvm::DenseMap<BasicBlock *, BBCacheEntry> BBCache;
 
   // Line-level debug line info for the instructions of the module.
   //
@@ -345,9 +374,19 @@ class YkIRWriter {
     return Idx;
   }
 
-  size_t getIndex(Function *F) { return FunctionCache.at(F); }
+  size_t getIndex(Function *F) { return FunctionCache.at(F).FuncIdx; }
 
-  size_t getIndex(BasicBlock *BB) { return BBCache.at(BB); }
+  size_t getIndex(BasicBlock *BB) {
+    const BBCacheEntry &BCE = BBCache.at(BB);
+    // The serialiser should only ever need to query indices of blocks with:
+    //  - The `BBPurposeTracingCheck` purpose: because branches in the blocks
+    //    we serialise go to these kinds of block.
+    //  - The `BBPurposeSerialise` purpose: because the incoming edges of PHI
+    //    nodes will go to these.
+    assert(BCE.Purpose == BBPurposeTracingCheck ||
+           BCE.Purpose == BBPurposeSerialise);
+    return BCE.BBIdx;
+  }
 
   // Return the index of the LLVM constant `C`, inserting a new entry if
   // necessary.
@@ -425,7 +464,6 @@ class YkIRWriter {
   }
 
   void serialiseBlockLabel(BasicBlock *BB) {
-    // Basic block indices are the same in both LLVM IR and our IR.
     OutStreamer.emitSizeT(getIndex(BB));
   }
 
@@ -1465,7 +1503,10 @@ class YkIRWriter {
     OutStreamer.emitSizeT(NumIncoming);
     // incoming_bbs:
     for (size_t J = 0; J < NumIncoming; J++) {
-      serialiseBlockLabel(I->getIncomingBlock(J));
+      BasicBlock *IB = I->getIncomingBlock(J);
+      const BBCacheEntry &IBCE = BBCache.at(IB);
+      assert(IBCE.Purpose == BBPurposeSerialise);
+      serialiseBlockLabel(IB);
     }
     // incoming_vals:
     for (size_t J = 0; J < NumIncoming; J++) {
@@ -1643,7 +1684,8 @@ class YkIRWriter {
   }
 
   void serialiseBlock(BasicBlock &BB, FuncLowerCtxt &FLCtxt, unsigned &BBIdx,
-                      Function &F) {
+                      Function &F, std::vector<AllocaInst *> *EntryAllocas,
+                      std::vector<PHINode *> *PhiNodes) {
     auto ShouldSkipInstr = [](Instruction *I) {
       // Skip non-semantic instrucitons for now.
       //
@@ -1675,6 +1717,11 @@ class YkIRWriter {
     // instrs:
     unsigned InstIdx = 0;
 
+    // Serialise any PHI nodes.
+    for (PHINode *PN : *PhiNodes) {
+      serialiseInst(PN, FLCtxt, BBIdx, InstIdx);
+    }
+
     // Insert LoadArg instructions for each argument of this function and
     // replace all Argument operands with their respective LoadArg instruction.
     // This ensures we won't have to deal with argument operands in the yk
@@ -1685,6 +1732,10 @@ class YkIRWriter {
        FLCtxt.ArgumentMap[Arg] = InstIdx;
        InstIdx++;
      }
+      // Serialise any entry block allocas.
+      for (AllocaInst *AI : *EntryAllocas) {
+        serialiseInst(AI, FLCtxt, BBIdx, InstIdx);
+      }
    }
 
    for (Instruction &I : BB) {
@@ -1771,13 +1822,47 @@
     if ((!F.hasFnAttribute(YK_OUTLINE_FNATTR)) || (containsControlPoint(F))) {
       // Emit a function *definition*.
       // num_blocks:
-      OutStreamer.emitSizeT(F.size());
+      //
+      // Note, this is not the same as the number of blocks in the LLVM IR
+      // because we only serialise blocks that have the `BBPurposeSerialise`
+      // purpose.
+      OutStreamer.emitSizeT(FunctionCache.at(&F).NumSerBBs);
       // blocks:
       unsigned BBIdx = 0;
       FuncLowerCtxt FLCtxt;
       std::vector V;
+      std::vector<AllocaInst *> EntryAllocas;
+      std::vector<PHINode *> PhiNodes;
       for (BasicBlock &BB : F) {
-        serialiseBlock(BB, FLCtxt, BBIdx, F);
+        const BBCacheEntry &BCE = BBCache.at(&BB);
+        // Cache entry block allocas that we have to serialise as though they
+        // appear in the `BBPurposeSerialise` block that will come later.
+        //
+        // Why didn't we just move the allocas into that block in the LLVM IR
+        // when we added software tracing instrumentation? Because then LLVM
+        // would consider the frame dynamically sized (because allocas exist
+        // that aren't in the entry block).
+        if (BB.isEntryBlock()) {
+          assert(BCE.Purpose == BBPurposeTracingCheck);
+          for (Instruction &I : BB) {
+            if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
+              EntryAllocas.push_back(AI);
+            }
+          }
+        }
+        // Similarly, cache PHI nodes (if any) at the start of any
+        // `BBPurposeTracingCheck` block. We will need to serialise those as
+        // though they appear in the `BBPurposeSerialise` block to follow.
+        if (BCE.Purpose == BBPurposeTracingCheck) {
+          for (Instruction &I : BB) {
+            if (PHINode *PN = dyn_cast<PHINode>(&I)) {
+              PhiNodes.push_back(PN);
+            }
+          }
+        } else if (BCE.Purpose == BBPurposeSerialise) {
+          serialiseBlock(BB, FLCtxt, BBIdx, F, &EntryAllocas, &PhiNodes);
+          PhiNodes.clear();
+        }
       }
       FLCtxt.patchLocalVarIdxs(OutStreamer);
     } else {
@@ -1968,6 +2053,72 @@ class YkIRWriter {
   YkIRWriter(Module &M, MCStreamer &OutStreamer)
       : M(M), OutStreamer(OutStreamer), DL(&M) {}
 
+  // Return the purpose of a basic block.
+  BBPurpose getBBPurpose(BasicBlock *BB) {
+    if (MDNode *MD = BB->front().getMetadata("yk-swt-bb-purpose")) {
+      if (auto *S = dyn_cast<MDString>(MD->getOperand(0))) {
+        StringRef PS = S->getString();
+        if (PS == "swt-tracing-check-bb") {
+          return BBPurposeTracingCheck;
+        } else if (PS == "swt-record-bb") {
+          return BBPurposeRecord;
+        } else if (PS == "swt-serialise-bb") {
+          return BBPurposeSerialise;
+        } else {
+          llvm::report_fatal_error("encountered block with unknown purpose");
+        }
+      }
+    } else {
+      llvm::report_fatal_error("encountered block with no purpose: has the "
+                               "basic block tracer pass been run?");
+    }
+    llvm_unreachable("failed to get bb purpose");
+  }
+
+  // Create the basic block cache and the function cache.
+  size_t createCaches() {
+    size_t FuncIdx = 0;
+    for (auto &F : M) {
+      size_t BBIdx = 0;
+      // Only make block cache entries for functions that can be traced.
+      if ((!F.hasFnAttribute(YK_OUTLINE_FNATTR)) || (containsControlPoint(F))) {
+        // The expected purpose of the next block.
+        BBPurpose Expect = BBPurposeTracingCheck;
+        std::optional<BasicBlock *> TracingCheckBB;
+        std::optional<BasicBlock *> RecordBB;
+        for (BasicBlock &BB : F) {
+          BBPurpose BP = getBBPurpose(&BB);
+          assert(BP == Expect);
+          switch (BP) {
+          case BBPurposeTracingCheck:
+            assert(!TracingCheckBB.has_value());
+            TracingCheckBB = &BB;
+            Expect = BBPurposeRecord;
+            break;
+          case BBPurposeRecord:
+            assert(!RecordBB.has_value());
+            RecordBB = &BB;
+            Expect = BBPurposeSerialise;
+            break;
+          case BBPurposeSerialise:
+            BBCache[TracingCheckBB.value()] =
+                BBCacheEntry{BBPurposeTracingCheck, BBIdx, &BB};
+            BBCache[RecordBB.value()] =
+                BBCacheEntry{BBPurposeRecord, BBIdx, &BB};
+            BBCache[&BB] = BBCacheEntry{BBPurposeSerialise, BBIdx, &BB};
+            Expect = BBPurposeTracingCheck;
+            TracingCheckBB = nullopt;
+            RecordBB = nullopt;
+            BBIdx++;
+            break;
+          }
+        }
+      }
+      FunctionCache[&F] = {FuncIdx++, BBIdx};
+    }
+    return FuncIdx;
+  }
+
   // Entry point for IR serialisation.
 //
 // The order of serialisation matters.
@@ -1986,17 +2137,9 @@
     assert(IdxBitWidth <= 0xff);
     OutStreamer.emitInt8(IdxBitWidth);
 
-    // Precompute func and bb indices
-    size_t FuncIdx = 0;
-    for (auto &F : M) {
-      FunctionCache[&F] = FuncIdx++;
-      size_t BBIdx = 0;
-      for (BasicBlock &BB : F) {
-        BBCache[&BB] = BBIdx++;
-      }
-    }
+    size_t NumFuncs = createCaches();
 
     // Emit the number of functions
-    OutStreamer.emitSizeT(FuncIdx);
+    OutStreamer.emitSizeT(NumFuncs);
 
     // funcs:
     for (llvm::Function &F : M) {
       serialiseFunc(F);
diff --git a/llvm/test/Transforms/Yk/BasicBlockTracer.ll b/llvm/test/Transforms/Yk/BasicBlockTracer.ll
index 98861d9d08a25..8516816bbf559 100644
--- a/llvm/test/Transforms/Yk/BasicBlockTracer.ll
+++ b/llvm/test/Transforms/Yk/BasicBlockTracer.ll
@@ -1,8 +1,7 @@
 ; RUNNING TEST EXAMPLE: llvm-lit llvm/test/Transforms/Yk/BasicBlockTracer.ll
 ; RUN: llc -stop-after yk-basicblock-tracer-pass --yk-basicblock-tracer < %s | FileCheck %s
 
-; CHECK-LABEL: define dso_local noundef i32 @main()
-; CHECK-NEXT: call void @__yk_trace_basicblock(i32 0, i32 0)
+; CHECK: call void @__yk_trace_basicblock(i32 0, i32 0)
 define dso_local noundef i32 @main() #0 {
   %1 = alloca i32, align 4
   %2 = alloca i32, align 4
@@ -12,55 +11,47 @@ define dso_local noundef i32 @main() #0 {
   store i32 0, i32* %3, align 4
   br label %4
 
-; CHECK-LABEL: 4:{{.*}}
-; CHECK-NEXT: call void @__yk_trace_basicblock(i32 0, i32 1)
+; CHECK: call void @__yk_trace_basicblock(i32 0, i32 1)
 4:                                                ; preds = %13, %0
   %5 = load i32, i32* %3, align 4
   %6 = icmp slt i32 %5, 43
   br i1 %6, label %7, label %16
 
-; CHECK-LABEL: 7:{{.*}}
-; CHECK-NEXT: call void @__yk_trace_basicblock(i32 0, i32 2)
+; CHECK: call void @__yk_trace_basicblock(i32 0, i32 2)
 7:                                                ; preds = %4
   %8 = load i32, i32* %3, align 4
   %9 = icmp eq i32 %8, 42
   br i1 %9, label %10, label %12
 
-; CHECK-LABEL: 10:{{.*}}
-; CHECK-NEXT: call void @__yk_trace_basicblock(i32 0, i32 3)
+; CHECK: call void @__yk_trace_basicblock(i32 0, i32 3)
 10:                                               ; preds = %7
   %11 = load i32, i32* %3, align 4
   store i32 %11, i32* %1, align 4
   br label %17
 
-; CHECK-LABEL: 12:{{.*}}
-; CHECK-NEXT: call void @__yk_trace_basicblock(i32 0, i32 4)
+; CHECK: call void @__yk_trace_basicblock(i32 0, i32 4)
 12:                                               ; preds = %7
   br label %13
 
-; CHECK-LABEL: 13:{{.*}}
-; CHECK-NEXT: call void @__yk_trace_basicblock(i32 0, i32 5)
+; CHECK: call void @__yk_trace_basicblock(i32 0, i32 5)
 13:                                               ; preds = %12
   %14 = load i32, i32* %3, align 4
   %15 = add nsw i32 %14, 1
   store i32 %15, i32* %3, align 4
   br label %4, !llvm.loop !6
 
-; CHECK-LABEL: 16:{{.*}}
-; CHECK-NEXT: call void @__yk_trace_basicblock(i32 0, i32 6)
+; CHECK: call void @__yk_trace_basicblock(i32 0, i32 6)
 16:                                               ; preds = %4
   store i32 0, i32* %1, align 4
   br label %17
 
-; CHECK-LABEL: 17:{{.*}}
-; CHECK-NEXT: call void @__yk_trace_basicblock(i32 0, i32 7)
+; CHECK: call void @__yk_trace_basicblock(i32 0, i32 7)
 17:                                               ; preds = %16, %10
   %18 = load i32, i32* %1, align 4
   ret i32 %18
 }
 
-; CHECK-LABEL: define dso_local noundef i32 @_Z5checki(i32 noundef %0)
-; CHECK-NEXT: call void @__yk_trace_basicblock(i32 1, i32 0)
+; CHECK: call void @__yk_trace_basicblock(i32 1, i32 0)
 define dso_local noundef i32 @_Z5checki(i32 noundef %0) #1 {
   %2 = alloca i32, align 4
   store i32 %0, i32* %2, align 4
diff --git a/llvm/test/YkIR/yk_idempotent_not_ykoutline.ll b/llvm/test/YkIR/yk_idempotent_not_ykoutline.ll
index aae8dcc6fcb29..815993d6359e2 100644
--- a/llvm/test/YkIR/yk_idempotent_not_ykoutline.ll
+++ b/llvm/test/YkIR/yk_idempotent_not_ykoutline.ll
@@ -1,7 +1,7 @@
 ; Checks the compiler borks if there's a __yk_promote_* in a yk_outline
 ; function.
 ;
-; RUN: not llc --yk-embed-ir < %s 2>&1 | FileCheck %s
+; RUN: not llc --yk-embed-ir --yk-basicblock-tracer < %s 2>&1 | FileCheck %s
 
 declare ptr @__yk_promote_ptr(ptr)