-
Notifications
You must be signed in to change notification settings - Fork 13.7k
[BOLT] Support pre-aggregated returns #143296
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: users/aaupov/spr/main.bolt-support-pre-aggregated-returns
Are you sure you want to change the base?
[BOLT] Support pre-aggregated returns #143296
Conversation
Created using spr 1.3.4
Created using spr 1.3.4
@llvm/pr-subscribers-bolt Author: Amir Ayupov (aaupov) Changes: Intel's Architectural LBR supports capturing branch type information as part of the LBR stack (SDM Vol 3B, part 2, October 2024).
The Linux kernel can preserve the branch type when save_type is enabled, even if the CPU does not support Architectural LBR: > - save_type: save branch type during sampling in case binary is not available later. This information is needed to disambiguate external returns (from DSO/JIT) to an entry point or a landing pad, when BOLT can't disassemble the branch source. This patch adds a new pre-aggregated trace type (R). Test Plan: updated callcont-fallthru.s Full diff: https://github.com/llvm/llvm-project/pull/143296.diff 4 Files Affected:
diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h
index 96969cf53baca..ae66c58e127cd 100644
--- a/bolt/include/bolt/Profile/DataAggregator.h
+++ b/bolt/include/bolt/Profile/DataAggregator.h
@@ -109,6 +109,7 @@ class DataAggregator : public DataReader {
static constexpr const uint64_t BR_ONLY = -1ULL;
static constexpr const uint64_t FT_ONLY = -1ULL;
static constexpr const uint64_t FT_EXTERNAL_ORIGIN = -2ULL;
+ static constexpr const uint64_t BR_EXTERNAL_RETURN = -3ULL;
uint64_t Branch;
uint64_t From;
@@ -388,7 +389,7 @@ class DataAggregator : public DataReader {
/// File format syntax:
/// E <event>
/// S <start> <count>
- /// T <start> <end> <ft_end> <count>
+ /// [TR] <start> <end> <ft_end> <count>
/// B <start> <end> <count> <mispred_count>
/// [Ff] <start> <end> <count>
///
@@ -403,6 +404,7 @@ class DataAggregator : public DataReader {
/// jump to the block
/// T - an aggregated trace: branch from <start> to <end> with a fall-through
/// to <ft_end>
+ /// R - an aggregated trace originating at a return
///
/// <id> - build id of the object containing the address. We can skip it for
/// the main binary and use "X" for an unknown object. This will save some
@@ -532,6 +534,9 @@ inline raw_ostream &operator<<(raw_ostream &OS,
case DataAggregator::Trace::FT_ONLY:
case DataAggregator::Trace::FT_EXTERNAL_ORIGIN:
break;
+ case DataAggregator::Trace::BR_EXTERNAL_RETURN:
+ OS << "0 -> ";
+ break;
default:
OS << Twine::utohexstr(T.Branch) << " -> ";
}
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index 11d282e98413b..c28dd6e57f8e4 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -1194,6 +1194,7 @@ std::error_code DataAggregator::parseAggregatedLBREntry() {
INVALID = 0,
EVENT_NAME, // E
TRACE, // T
+ RETURN, // R
SAMPLE, // S
BRANCH, // B
FT, // F
@@ -1224,6 +1225,7 @@ std::error_code DataAggregator::parseAggregatedLBREntry() {
Type = StringSwitch<AggregatedLBREntry>(Str)
.Case("T", TRACE)
+ .Case("R", RETURN)
.Case("S", SAMPLE)
.Case("E", EVENT_NAME)
.Case("B", BRANCH)
@@ -1237,7 +1239,7 @@ std::error_code DataAggregator::parseAggregatedLBREntry() {
}
using SSI = StringSwitch<int>;
- AddrNum = SSI(Str).Case("T", 3).Case("S", 1).Case("E", 0).Default(2);
+ AddrNum = SSI(Str).Cases("T", "R", 3).Case("S", 1).Case("E", 0).Default(2);
CounterNum = SSI(Str).Case("B", 2).Case("E", 0).Default(1);
}
@@ -1295,8 +1297,13 @@ std::error_code DataAggregator::parseAggregatedLBREntry() {
Addr[0] = Location(Type == FT ? Trace::FT_ONLY : Trace::FT_EXTERNAL_ORIGIN);
}
- if (Type == BRANCH) {
+ if (Type == BRANCH)
Addr[2] = Location(Trace::BR_ONLY);
+
+ if (Type == RETURN) {
+ if (!Addr[0]->Offset)
+ Addr[0]->Offset = Trace::BR_EXTERNAL_RETURN;
+ Returns.emplace(Addr[0]->Offset);
}
Trace T{Addr[0]->Offset, Addr[1]->Offset, Addr[2]->Offset};
diff --git a/bolt/test/X86/callcont-fallthru.s b/bolt/test/X86/callcont-fallthru.s
index c2ef024db9475..63142903c80d2 100644
--- a/bolt/test/X86/callcont-fallthru.s
+++ b/bolt/test/X86/callcont-fallthru.s
@@ -10,6 +10,8 @@
# RUN: link_fdata %s %t %t.pa-ret PREAGG-RET
# Trace from an external location to a landing pad/entry point call continuation
# RUN: link_fdata %s %t %t.pa-ext PREAGG-EXT
+# Return trace to a landing pad/entry point call continuation
+# RUN: link_fdata %s %t %t.pa-pret PREAGG-PRET
# RUN-DISABLED: link_fdata %s %t %t.pa-plt PREAGG-PLT
# RUN: llvm-strip --strip-unneeded %t -o %t.strip
@@ -38,6 +40,15 @@
# RUN: llvm-bolt %t.strip --pa -p %t.pa-ext -o %t.out \
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-SKIP
+## Check pre-aggregated return traces from external location attach call
+## continuation fallthrough count to secondary entry point (unstripped)
+# RUN: llvm-bolt %t --pa -p %t.pa-pret -o %t.out \
+# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-ATTACH
+## Check pre-aggregated return traces from external location attach call
+## continuation fallthrough count to landing pad (stripped, landing pad)
+# RUN: llvm-bolt %t.strip --pa -p %t.pa-pret -o %t.out \
+# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-ATTACH
+
## Check pre-aggregated traces don't report zero-sized PLT fall-through as
## invalid trace
# RUN-DISABLED: llvm-bolt %t.strip --pa -p %t.pa-plt -o %t.out | FileCheck %s \
@@ -92,6 +103,8 @@ Ltmp4_br:
# PREAGG-RET: T #Lfoo_ret# #Ltmp3# #Ltmp3_br# 1
## Target is a secondary entry point (unstripped) or a landing pad (stripped)
# PREAGG-EXT: T X:0 #Ltmp3# #Ltmp3_br# 1
+## Pre-aggregated return trace
+# PREAGG-PRET: R X:0 #Ltmp3# #Ltmp3_br# 1
# CHECK-ATTACH: callq foo
# CHECK-ATTACH-NEXT: count: 1
diff --git a/bolt/test/link_fdata.py b/bolt/test/link_fdata.py
index 5a9752068bb9f..cb6b3c7baaab5 100755
--- a/bolt/test/link_fdata.py
+++ b/bolt/test/link_fdata.py
@@ -36,9 +36,9 @@
fdata_pat = re.compile(r"([01].*) (?P<mispred>\d+) (?P<exec>\d+)")
# Pre-aggregated profile:
-# {T|S|E|B|F|f} <start> [<end>] [<ft_end>] <count> [<mispred_count>]
+# {T|R|S|E|B|F|f} <start> [<end>] [<ft_end>] <count> [<mispred_count>]
# <loc>: [<id>:]<offset>
-preagg_pat = re.compile(r"(?P<type>[TSBFf]) (?P<offsets_count>.*)")
+preagg_pat = re.compile(r"(?P<type>[TRSBFf]) (?P<offsets_count>.*)")
# No-LBR profile:
# <is symbol?> <closest elf symbol or DSO name> <relative address> <count>
|
Intel's Architectural LBR supports capturing branch type information
as part of LBR stack (SDM Vol 3B, part 2, October 2024):
Linux kernel can preserve branch type when
save_type
is enabled, even if CPU does not support Architectural LBR:
https://github.com/torvalds/linux/blob/f09079bd04a924c72d555cd97942d5f8d7eca98c/tools/perf/Documentation/perf-record.txt#L457-L460
This information is needed to disambiguate external returns (from
DSO/JIT) to an entry point or a landing pad, when BOLT can't
disassemble the branch source.
This patch adds a new pre-aggregated trace type (R).
Depends on #143295.
Test Plan: updated callcont-fallthru.s