mirror of
https://github.com/intel/llvm.git
synced 2026-01-15 12:25:46 +08:00
[perf2bolt] Pre-aggregate LBR samples
Summary: Pre-aggregating LBR data cuts perf2bolt processing times in half. (cherry picked from FBD10420286)
This commit is contained in:
@@ -278,6 +278,9 @@ public:
|
||||
uint64_t MissedMacroFusionPairs{0};
|
||||
uint64_t MissedMacroFusionExecCount{0};
|
||||
|
||||
// Address of the first allocated segment.
|
||||
uint64_t FirstAllocAddress{std::numeric_limits<uint64_t>::max()};
|
||||
|
||||
/// Track next available address for new allocatable sections. RewriteInstance
|
||||
/// sets this prior to running BOLT passes, so layout passes are aware of the
|
||||
/// final addresses functions will have.
|
||||
@@ -568,6 +571,11 @@ public:
|
||||
Sections.end()));
|
||||
}
|
||||
|
||||
/// Check if the address belongs to this binary's static allocation space.
|
||||
bool containsAddress(uint64_t Address) const {
|
||||
return Address >= FirstAllocAddress && Address < LayoutStartAddress;
|
||||
}
|
||||
|
||||
/// Return section name containing the given \p Address.
|
||||
ErrorOr<StringRef> getSectionNameForAddress(uint64_t Address) const;
|
||||
|
||||
|
||||
@@ -24,6 +24,7 @@
|
||||
#include "llvm/Support/Regex.h"
|
||||
#include "llvm/Support/Timer.h"
|
||||
#include <map>
|
||||
#include <unordered_map>
|
||||
|
||||
#include <unistd.h>
|
||||
|
||||
@@ -550,6 +551,9 @@ bool DataAggregator::aggregate(BinaryContext &BC,
|
||||
|
||||
BinaryFunction *
|
||||
DataAggregator::getBinaryFunctionContainingAddress(uint64_t Address) {
|
||||
if (!BC->containsAddress(Address))
|
||||
return nullptr;
|
||||
|
||||
auto FI = BFs->upper_bound(Address);
|
||||
if (FI == BFs->begin())
|
||||
return nullptr;
|
||||
@@ -964,6 +968,37 @@ std::error_code DataAggregator::parseBranchEvents() {
|
||||
uint64_t NumEntries{0};
|
||||
uint64_t NumSamples{0};
|
||||
uint64_t NumTraces{0};
|
||||
|
||||
/// A (From, To) address pair used as the key when pre-aggregating LBR
/// entries. Two locations are equal when both addresses match.
struct Location {
  uint64_t From;
  uint64_t To;
  Location(uint64_t From, uint64_t To)
    : From(From), To(To) {}
  bool operator==(const Location &Other) const {
    return From == Other.From && To == Other.To;
  }
};

/// Hash functor for Location, suitable for std::unordered_map.
struct LocationHash {
  /// Return a hash that depends on every bit of both L.From and L.To.
  size_t operator()(const Location &L) const {
    // Hash each address on its own and then mix the two results. Packing
    // From into the upper half of a single 64-bit word would shift its upper
    // 32 bits out and let To's upper bits overlap it, so addresses at or
    // above 4 GiB would collide systematically.
    const size_t FromHash = std::hash<uint64_t>()(L.From);
    const size_t ToHash = std::hash<uint64_t>()(L.To);
    // Asymmetric mix so that (A, B) and (B, A) do not hash alike.
    return FromHash ^ (ToHash + static_cast<size_t>(0x9e3779b97f4a7c15ULL) +
                       (FromHash << 6) + (FromHash >> 2));
  }
};
|
||||
|
||||
/// Counters accumulated for one fall-through trace. Both start at zero.
struct TraceInfo {
  /// Bumped when the function containing the trace also contains the source
  /// of the branch that led into it.
  uint64_t InternCount = 0;
  /// Bumped otherwise, i.e. the preceding branch came from outside that
  /// function.
  uint64_t ExternCount = 0;
};
|
||||
|
||||
/// Counters accumulated for one (From, To) branch. Both start at zero.
struct BranchInfo {
  /// Number of LBR entries seen for this branch.
  uint64_t TakenCount = 0;
  /// Sum of the Mispred flags of those entries.
  uint64_t MispredCount = 0;
};
|
||||
|
||||
/// Map location to counters.
|
||||
std::unordered_map<Location, BranchInfo, LocationHash> BranchLBRs;
|
||||
std::unordered_map<Location, TraceInfo, LocationHash> FallthroughLBRs;
|
||||
|
||||
while (hasData()) {
|
||||
auto SampleRes = parseBranchSample();
|
||||
if (std::error_code EC = SampleRes.getError())
|
||||
@@ -981,13 +1016,62 @@ std::error_code DataAggregator::parseBranchEvents() {
|
||||
const LBREntry *NextLBR{nullptr};
|
||||
for (const auto &LBR : Sample.LBR) {
|
||||
if (NextLBR) {
|
||||
doTrace(LBR, *NextLBR);
|
||||
// Record fall-through trace.
|
||||
const auto TraceFrom = LBR.To;
|
||||
const auto TraceTo = NextLBR->From;
|
||||
const auto *TraceBF = getBinaryFunctionContainingAddress(TraceFrom);
|
||||
if (TraceBF && TraceBF->containsAddress(TraceTo)) {
|
||||
auto &Info = FallthroughLBRs[Location(TraceFrom, TraceTo)];
|
||||
if (TraceBF->containsAddress(LBR.From)) {
|
||||
++Info.InternCount;
|
||||
} else {
|
||||
++Info.ExternCount;
|
||||
}
|
||||
} else {
|
||||
if (TraceBF && getBinaryFunctionContainingAddress(TraceTo)) {
|
||||
++NumInvalidTraces;
|
||||
} else {
|
||||
++NumLongRangeTraces;
|
||||
}
|
||||
}
|
||||
++NumTraces;
|
||||
}
|
||||
doBranch(LBR.From, LBR.To, 1, LBR.Mispred);
|
||||
NextLBR = &LBR;
|
||||
|
||||
auto From = LBR.From;
|
||||
if (!getBinaryFunctionContainingAddress(From))
|
||||
From = 0;
|
||||
auto To = LBR.To;
|
||||
if (!getBinaryFunctionContainingAddress(To))
|
||||
To = 0;
|
||||
if (!From && !To)
|
||||
continue;
|
||||
auto &Info = BranchLBRs[Location(From, To)];
|
||||
++Info.TakenCount;
|
||||
Info.MispredCount += LBR.Mispred;
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto &AggrLBR : FallthroughLBRs) {
|
||||
auto &Loc = AggrLBR.first;
|
||||
auto &Info = AggrLBR.second;
|
||||
LBREntry First{Loc.From, Loc.From, false};
|
||||
LBREntry Second{Loc.To, Loc.To, false};
|
||||
if (Info.InternCount) {
|
||||
doTrace(First, Second, Info.InternCount);
|
||||
}
|
||||
if (Info.ExternCount) {
|
||||
First.From = 0;
|
||||
doTrace(First, Second, Info.ExternCount);
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto &AggrLBR : BranchLBRs) {
|
||||
auto &Loc = AggrLBR.first;
|
||||
auto &Info = AggrLBR.second;
|
||||
doBranch(Loc.From, Loc.To, Info.TakenCount, Info.MispredCount);
|
||||
}
|
||||
|
||||
outs() << "PERF2BOLT: Read " << NumSamples << " samples and "
|
||||
<< NumEntries << " LBR entries\n";
|
||||
outs() << "PERF2BOLT: Traces mismatching disassembled function contents: "
|
||||
|
||||
@@ -124,6 +124,8 @@ convert(const BinaryFunction &BF, yaml::bolt::BinaryFunctionProfile &YamlBF) {
|
||||
}
|
||||
}
|
||||
|
||||
std::sort(YamlBB.CallSites.begin(), YamlBB.CallSites.end());
|
||||
|
||||
// Skip printing if there's no profile data for non-entry basic block.
|
||||
// Include landing pads with non-zero execution count.
|
||||
if (YamlBB.CallSites.empty() &&
|
||||
|
||||
@@ -37,9 +37,27 @@ struct CallSiteInfo {
|
||||
DestId == Other.DestId &&
|
||||
EntryDiscriminator == Other.EntryDiscriminator;
|
||||
}
|
||||
|
||||
/// Inequality, defined as the negation of operator==.
bool operator!=(const CallSiteInfo &Other) const {
  const bool Same = (*this == Other);
  return !Same;
}
|
||||
|
||||
/// Lexicographic ordering on (Offset, DestId, EntryDiscriminator).
bool operator<(const CallSiteInfo &Other) const {
  // The first field that differs decides the result.
  if (Offset != Other.Offset)
    return Offset < Other.Offset;

  if (DestId != Other.DestId)
    return DestId < Other.DestId;

  // Everything else ties, so the last field decides; equal objects are not
  // less than each other.
  return EntryDiscriminator < Other.EntryDiscriminator;
}
|
||||
};
|
||||
} // end namespace bolt
|
||||
|
||||
|
||||
@@ -810,16 +810,13 @@ void RewriteInstance::discoverStorage() {
|
||||
|
||||
EntryPoint = Obj->getHeader()->e_entry;
|
||||
|
||||
// This is where the first segment and ELF header were allocated.
|
||||
uint64_t FirstAllocAddress = std::numeric_limits<uint64_t>::max();
|
||||
|
||||
NextAvailableAddress = 0;
|
||||
uint64_t NextAvailableOffset = 0;
|
||||
auto PHs = cantFail(Obj->program_headers(), "program_headers() failed");
|
||||
for (const auto &Phdr : PHs) {
|
||||
if (Phdr.p_type == ELF::PT_LOAD) {
|
||||
FirstAllocAddress = std::min(FirstAllocAddress,
|
||||
static_cast<uint64_t>(Phdr.p_vaddr));
|
||||
BC->FirstAllocAddress = std::min(BC->FirstAllocAddress,
|
||||
static_cast<uint64_t>(Phdr.p_vaddr));
|
||||
NextAvailableAddress = std::max(NextAvailableAddress,
|
||||
Phdr.p_vaddr + Phdr.p_memsz);
|
||||
NextAvailableOffset = std::max(NextAvailableOffset,
|
||||
@@ -856,7 +853,7 @@ void RewriteInstance::discoverStorage() {
|
||||
"no PT_LOAD pheader seen");
|
||||
|
||||
outs() << "BOLT-INFO: first alloc address is 0x"
|
||||
<< Twine::utohexstr(FirstAllocAddress) << '\n';
|
||||
<< Twine::utohexstr(BC->FirstAllocAddress) << '\n';
|
||||
|
||||
FirstNonAllocatableOffset = NextAvailableOffset;
|
||||
|
||||
@@ -874,13 +871,13 @@ void RewriteInstance::discoverStorage() {
|
||||
//
|
||||
// NB: bfd's strip command cannot do the above and will corrupt the
|
||||
// binary during the process of stripping non-allocatable sections.
|
||||
if (NextAvailableOffset <= NextAvailableAddress - FirstAllocAddress) {
|
||||
NextAvailableOffset = NextAvailableAddress - FirstAllocAddress;
|
||||
if (NextAvailableOffset <= NextAvailableAddress - BC->FirstAllocAddress) {
|
||||
NextAvailableOffset = NextAvailableAddress - BC->FirstAllocAddress;
|
||||
} else {
|
||||
NextAvailableAddress = NextAvailableOffset + FirstAllocAddress;
|
||||
NextAvailableAddress = NextAvailableOffset + BC->FirstAllocAddress;
|
||||
}
|
||||
assert(NextAvailableOffset == NextAvailableAddress - FirstAllocAddress &&
|
||||
"PHDR table address calculation error");
|
||||
assert(NextAvailableOffset == NextAvailableAddress - BC->FirstAllocAddress
|
||||
&& "PHDR table address calculation error");
|
||||
|
||||
outs() << "BOLT-INFO: creating new program header table at address 0x"
|
||||
<< Twine::utohexstr(NextAvailableAddress) << ", offset 0x"
|
||||
|
||||
Reference in New Issue
Block a user