[perf2bolt] Pre-aggregate LBR samples

Summary: Pre-aggregating LBR data cuts perf2bolt processing times in half.

(cherry picked from FBD10420286)
This commit is contained in:
Maksim Panchenko
2018-10-02 17:16:26 -07:00
parent 74a71c6812
commit a76b13d48e
5 changed files with 122 additions and 13 deletions

View File

@@ -278,6 +278,9 @@ public:
uint64_t MissedMacroFusionPairs{0};
uint64_t MissedMacroFusionExecCount{0};
// Address of the first allocated segment.
uint64_t FirstAllocAddress{std::numeric_limits<uint64_t>::max()};
/// Track next available address for new allocatable sections. RewriteInstance
/// sets this prior to running BOLT passes, so layout passes are aware of the
/// final addresses functions will have.
@@ -568,6 +571,11 @@ public:
Sections.end()));
}
/// Return true if \p Address lies inside this binary's static allocation
/// space: at or above the first allocated segment and below the start of
/// the area reserved for newly laid-out sections.
bool containsAddress(uint64_t Address) const {
  if (Address < FirstAllocAddress)
    return false;
  return Address < LayoutStartAddress;
}
/// Return section name containing the given \p Address.
ErrorOr<StringRef> getSectionNameForAddress(uint64_t Address) const;

View File

@@ -24,6 +24,7 @@
#include "llvm/Support/Regex.h"
#include "llvm/Support/Timer.h"
#include <map>
#include <unordered_map>
#include <unistd.h>
@@ -550,6 +551,9 @@ bool DataAggregator::aggregate(BinaryContext &BC,
BinaryFunction *
DataAggregator::getBinaryFunctionContainingAddress(uint64_t Address) {
if (!BC->containsAddress(Address))
return nullptr;
auto FI = BFs->upper_bound(Address);
if (FI == BFs->begin())
return nullptr;
@@ -964,6 +968,37 @@ std::error_code DataAggregator::parseBranchEvents() {
uint64_t NumEntries{0};
uint64_t NumSamples{0};
uint64_t NumTraces{0};
/// A (source, destination) address pair identifying a single LBR branch
/// or fall-through trace.
struct Location {
  uint64_t From;
  uint64_t To;
  Location(uint64_t From, uint64_t To)
      : From(From), To(To) {}
  /// Two locations are equal iff both endpoints match.
  bool operator==(const Location &Other) const {
    return !(From != Other.From || To != Other.To);
  }
};
/// Hash functor for Location.
/// The previous scheme packed both addresses into one value as
/// (From << 32 | To), which silently discarded the upper 32 bits of From
/// and let any To wider than 32 bits clobber From's contribution —
/// producing avoidable collisions for 64-bit addresses. Mix both fields
/// with a boost-style hash_combine instead.
struct LocationHash {
  size_t operator()(const Location &L) const {
    const std::hash<uint64_t> Hasher;
    size_t Seed = Hasher(L.From);
    // hash_combine: golden-ratio constant plus shifts spreads entropy
    // from both operands across the whole word.
    Seed ^= Hasher(L.To) + 0x9e3779b97f4a7c15ULL + (Seed << 6) + (Seed >> 2);
    return Seed;
  }
};
/// Aggregated counters for one fall-through trace between two addresses.
struct TraceInfo {
  uint64_t InternCount = 0; // trace preceded by a branch from within the function
  uint64_t ExternCount = 0; // trace preceded by a branch from outside it
};
/// Aggregated counters for one taken branch (From, To) location.
struct BranchInfo {
  uint64_t TakenCount = 0;   // how many times the branch was recorded taken
  uint64_t MispredCount = 0; // how many of those were mispredicted
};
/// Map location to counters.
std::unordered_map<Location, BranchInfo, LocationHash> BranchLBRs;
std::unordered_map<Location, TraceInfo, LocationHash> FallthroughLBRs;
while (hasData()) {
auto SampleRes = parseBranchSample();
if (std::error_code EC = SampleRes.getError())
@@ -981,13 +1016,62 @@ std::error_code DataAggregator::parseBranchEvents() {
const LBREntry *NextLBR{nullptr};
for (const auto &LBR : Sample.LBR) {
if (NextLBR) {
doTrace(LBR, *NextLBR);
// Record fall-through trace.
const auto TraceFrom = LBR.To;
const auto TraceTo = NextLBR->From;
const auto *TraceBF = getBinaryFunctionContainingAddress(TraceFrom);
if (TraceBF && TraceBF->containsAddress(TraceTo)) {
auto &Info = FallthroughLBRs[Location(TraceFrom, TraceTo)];
if (TraceBF->containsAddress(LBR.From)) {
++Info.InternCount;
} else {
++Info.ExternCount;
}
} else {
if (TraceBF && getBinaryFunctionContainingAddress(TraceTo)) {
++NumInvalidTraces;
} else {
++NumLongRangeTraces;
}
}
++NumTraces;
}
doBranch(LBR.From, LBR.To, 1, LBR.Mispred);
NextLBR = &LBR;
auto From = LBR.From;
if (!getBinaryFunctionContainingAddress(From))
From = 0;
auto To = LBR.To;
if (!getBinaryFunctionContainingAddress(To))
To = 0;
if (!From && !To)
continue;
auto &Info = BranchLBRs[Location(From, To)];
++Info.TakenCount;
Info.MispredCount += LBR.Mispred;
}
}
for (const auto &AggrLBR : FallthroughLBRs) {
auto &Loc = AggrLBR.first;
auto &Info = AggrLBR.second;
LBREntry First{Loc.From, Loc.From, false};
LBREntry Second{Loc.To, Loc.To, false};
if (Info.InternCount) {
doTrace(First, Second, Info.InternCount);
}
if (Info.ExternCount) {
First.From = 0;
doTrace(First, Second, Info.ExternCount);
}
}
for (const auto &AggrLBR : BranchLBRs) {
auto &Loc = AggrLBR.first;
auto &Info = AggrLBR.second;
doBranch(Loc.From, Loc.To, Info.TakenCount, Info.MispredCount);
}
outs() << "PERF2BOLT: Read " << NumSamples << " samples and "
<< NumEntries << " LBR entries\n";
outs() << "PERF2BOLT: Traces mismatching disassembled function contents: "

View File

@@ -124,6 +124,8 @@ convert(const BinaryFunction &BF, yaml::bolt::BinaryFunctionProfile &YamlBF) {
}
}
std::sort(YamlBB.CallSites.begin(), YamlBB.CallSites.end());
// Skip printing if there's no profile data for non-entry basic block.
// Include landing pads with non-zero execution count.
if (YamlBB.CallSites.empty() &&

View File

@@ -37,9 +37,27 @@ struct CallSiteInfo {
DestId == Other.DestId &&
EntryDiscriminator == Other.EntryDiscriminator;
}
/// Inequality is defined as the logical negation of equality.
bool operator!=(const CallSiteInfo &Other) const {
  const bool IsEqual = (*this == Other);
  return !IsEqual;
}
/// Strict weak ordering: lexicographic over
/// (Offset, DestId, EntryDiscriminator).
bool operator<(const CallSiteInfo &Other) const {
  if (Offset != Other.Offset)
    return Offset < Other.Offset;
  if (DestId != Other.DestId)
    return DestId < Other.DestId;
  return EntryDiscriminator < Other.EntryDiscriminator;
}
};
} // end namespace bolt

View File

@@ -810,16 +810,13 @@ void RewriteInstance::discoverStorage() {
EntryPoint = Obj->getHeader()->e_entry;
// This is where the first segment and ELF header were allocated.
uint64_t FirstAllocAddress = std::numeric_limits<uint64_t>::max();
NextAvailableAddress = 0;
uint64_t NextAvailableOffset = 0;
auto PHs = cantFail(Obj->program_headers(), "program_headers() failed");
for (const auto &Phdr : PHs) {
if (Phdr.p_type == ELF::PT_LOAD) {
FirstAllocAddress = std::min(FirstAllocAddress,
static_cast<uint64_t>(Phdr.p_vaddr));
BC->FirstAllocAddress = std::min(BC->FirstAllocAddress,
static_cast<uint64_t>(Phdr.p_vaddr));
NextAvailableAddress = std::max(NextAvailableAddress,
Phdr.p_vaddr + Phdr.p_memsz);
NextAvailableOffset = std::max(NextAvailableOffset,
@@ -856,7 +853,7 @@ void RewriteInstance::discoverStorage() {
"no PT_LOAD pheader seen");
outs() << "BOLT-INFO: first alloc address is 0x"
<< Twine::utohexstr(FirstAllocAddress) << '\n';
<< Twine::utohexstr(BC->FirstAllocAddress) << '\n';
FirstNonAllocatableOffset = NextAvailableOffset;
@@ -874,13 +871,13 @@ void RewriteInstance::discoverStorage() {
//
// NB: bfd's strip command cannot do the above and will corrupt the
// binary during the process of stripping non-allocatable sections.
if (NextAvailableOffset <= NextAvailableAddress - FirstAllocAddress) {
NextAvailableOffset = NextAvailableAddress - FirstAllocAddress;
if (NextAvailableOffset <= NextAvailableAddress - BC->FirstAllocAddress) {
NextAvailableOffset = NextAvailableAddress - BC->FirstAllocAddress;
} else {
NextAvailableAddress = NextAvailableOffset + FirstAllocAddress;
NextAvailableAddress = NextAvailableOffset + BC->FirstAllocAddress;
}
assert(NextAvailableOffset == NextAvailableAddress - FirstAllocAddress &&
"PHDR table address calculation error");
assert(NextAvailableOffset == NextAvailableAddress - BC->FirstAllocAddress
&& "PHDR table address calculation error");
outs() << "BOLT-INFO: creating new program header table at address 0x"
<< Twine::utohexstr(NextAvailableAddress) << ", offset 0x"