mirror of
https://github.com/intel/llvm.git
synced 2026-01-27 06:06:34 +08:00
[CSSPGO] Track and use context-sensitive post-optimization function size to drive global pre-inliner in llvm-profgen
This change enables llvm-profgen to use accurate context-sensitive post-optimization function byte size as a cost proxy to drive global preinline decisions. To do this, BinarySizeContextTracker is introduced to track function byte size under different inline contexts during disassembling. In the preinliner, we can now query context byte size under the switch `use-context-cost-for-preinliner`. The tracker uses a reverse trie to keep the size of functions under different contexts (callee as parent, caller as child), and it can give the best/longest possible matching context size for a given input context. The new size cost is off by default. There are a few TODOs that need to be addressed: 1) avoid dangling strings from `Offset2LocStackMap`, which will be addressed in the split context work; 2) use the inlinee's entry probe to make sure we have a correct zero size for an inlinee that's completely optimized away after inlining. Some tuning is also needed. Differential Revision: https://reviews.llvm.org/D108180
This commit is contained in:
@@ -430,6 +430,16 @@ public:
|
||||
return ContextStr.split(" @ ");
|
||||
}
|
||||
|
||||
// Split the leaf context frame (right-most substr) from context.
|
||||
static std::pair<StringRef, StringRef>
|
||||
rsplitContextString(StringRef ContextStr) {
|
||||
auto ContextSplit = ContextStr.rsplit(" @ ");
|
||||
if (ContextSplit.second.empty()) {
|
||||
std::swap(ContextSplit.first, ContextSplit.second);
|
||||
}
|
||||
return ContextSplit;
|
||||
}
|
||||
|
||||
// Reconstruct a new context with the last k frames, return the context-less
|
||||
// name if K = 1
|
||||
StringRef getContextWithLastKFrames(uint32_t K) {
|
||||
|
||||
@@ -37,30 +37,33 @@ class ContextTrieNode {
|
||||
public:
|
||||
ContextTrieNode(ContextTrieNode *Parent = nullptr,
|
||||
StringRef FName = StringRef(),
|
||||
FunctionSamples *FSamples = nullptr,
|
||||
FunctionSamples *FSamples = nullptr, uint32_t FSize = 0,
|
||||
LineLocation CallLoc = {0, 0})
|
||||
: ParentContext(Parent), FuncName(FName), FuncSamples(FSamples),
|
||||
CallSiteLoc(CallLoc){};
|
||||
FuncSize(FSize), CallSiteLoc(CallLoc){};
|
||||
ContextTrieNode *getChildContext(const LineLocation &CallSite,
|
||||
StringRef CalleeName);
|
||||
StringRef ChildName);
|
||||
ContextTrieNode *getHottestChildContext(const LineLocation &CallSite);
|
||||
ContextTrieNode *getOrCreateChildContext(const LineLocation &CallSite,
|
||||
StringRef CalleeName,
|
||||
StringRef ChildName,
|
||||
bool AllowCreate = true);
|
||||
|
||||
ContextTrieNode &moveToChildContext(const LineLocation &CallSite,
|
||||
ContextTrieNode &&NodeToMove,
|
||||
StringRef ContextStrToRemove,
|
||||
bool DeleteNode = true);
|
||||
void removeChildContext(const LineLocation &CallSite, StringRef CalleeName);
|
||||
void removeChildContext(const LineLocation &CallSite, StringRef ChildName);
|
||||
std::map<uint32_t, ContextTrieNode> &getAllChildContext();
|
||||
StringRef getFuncName() const;
|
||||
FunctionSamples *getFunctionSamples() const;
|
||||
void setFunctionSamples(FunctionSamples *FSamples);
|
||||
uint32_t getFunctionSize() const;
|
||||
void setFunctionSize(uint32_t FSize);
|
||||
LineLocation getCallSiteLoc() const;
|
||||
ContextTrieNode *getParentContext() const;
|
||||
void setParentContext(ContextTrieNode *Parent);
|
||||
void dump();
|
||||
void dumpNode();
|
||||
void dumpTree();
|
||||
|
||||
private:
|
||||
static uint32_t nodeHash(StringRef ChildName, const LineLocation &Callsite);
|
||||
@@ -77,6 +80,9 @@ private:
|
||||
// Function Samples for current context
|
||||
FunctionSamples *FuncSamples;
|
||||
|
||||
// Function size for current context
|
||||
uint32_t FuncSize;
|
||||
|
||||
// Callsite location in parent context
|
||||
LineLocation CallSiteLoc;
|
||||
};
|
||||
|
||||
@@ -127,6 +127,10 @@ void ContextTrieNode::setFunctionSamples(FunctionSamples *FSamples) {
|
||||
FuncSamples = FSamples;
|
||||
}
|
||||
|
||||
// Function size recorded for this context node.
uint32_t ContextTrieNode::getFunctionSize() const {
  return FuncSize;
}
|
||||
|
||||
// Overwrite the function size recorded for this context node.
void ContextTrieNode::setFunctionSize(uint32_t FSize) {
  FuncSize = FSize;
}
|
||||
|
||||
// Callsite location stored on this node.
LineLocation ContextTrieNode::getCallSiteLoc() const {
  return CallSiteLoc;
}
|
||||
|
||||
ContextTrieNode *ContextTrieNode::getParentContext() const {
|
||||
@@ -137,9 +141,10 @@ void ContextTrieNode::setParentContext(ContextTrieNode *Parent) {
|
||||
ParentContext = Parent;
|
||||
}
|
||||
|
||||
void ContextTrieNode::dump() {
|
||||
void ContextTrieNode::dumpNode() {
|
||||
dbgs() << "Node: " << FuncName << "\n"
|
||||
<< " Callsite: " << CallSiteLoc << "\n"
|
||||
<< " Size: " << FuncSize << "\n"
|
||||
<< " Children:\n";
|
||||
|
||||
for (auto &It : AllChildContext) {
|
||||
@@ -147,6 +152,23 @@ void ContextTrieNode::dump() {
|
||||
}
|
||||
}
|
||||
|
||||
void ContextTrieNode::dumpTree() {
|
||||
dbgs() << "Context Profile Tree:\n";
|
||||
std::queue<ContextTrieNode *> NodeQueue;
|
||||
NodeQueue.push(this);
|
||||
|
||||
while (!NodeQueue.empty()) {
|
||||
ContextTrieNode *Node = NodeQueue.front();
|
||||
NodeQueue.pop();
|
||||
Node->dumpNode();
|
||||
|
||||
for (auto &It : Node->getAllChildContext()) {
|
||||
ContextTrieNode *ChildNode = &It.second;
|
||||
NodeQueue.push(ChildNode);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t ContextTrieNode::nodeHash(StringRef ChildName,
|
||||
const LineLocation &Callsite) {
|
||||
// We still use child's name for child hash, this is
|
||||
@@ -171,7 +193,8 @@ ContextTrieNode *ContextTrieNode::getOrCreateChildContext(
|
||||
if (!AllowCreate)
|
||||
return nullptr;
|
||||
|
||||
AllChildContext[Hash] = ContextTrieNode(this, CalleeName, nullptr, CallSite);
|
||||
AllChildContext[Hash] =
|
||||
ContextTrieNode(this, CalleeName, nullptr, 0, CallSite);
|
||||
return &AllChildContext[Hash];
|
||||
}
|
||||
|
||||
@@ -385,22 +408,7 @@ ContextTrieNode &SampleContextTracker::promoteMergeContextSamplesTree(
|
||||
ContextStrToRemove);
|
||||
}
|
||||
|
||||
void SampleContextTracker::dump() {
|
||||
dbgs() << "Context Profile Tree:\n";
|
||||
std::queue<ContextTrieNode *> NodeQueue;
|
||||
NodeQueue.push(&RootContext);
|
||||
|
||||
while (!NodeQueue.empty()) {
|
||||
ContextTrieNode *Node = NodeQueue.front();
|
||||
NodeQueue.pop();
|
||||
Node->dump();
|
||||
|
||||
for (auto &It : Node->getAllChildContext()) {
|
||||
ContextTrieNode *ChildNode = &It.second;
|
||||
NodeQueue.push(ChildNode);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Dump the whole context trie by delegating to the root node.
void SampleContextTracker::dump() {
  RootContext.dumpTree();
}
|
||||
|
||||
ContextTrieNode *
|
||||
SampleContextTracker::getContextFor(const SampleContext &Context) {
|
||||
|
||||
BIN
llvm/test/tools/llvm-profgen/Inputs/cs-preinline-cost.perfbin
Executable file
BIN
llvm/test/tools/llvm-profgen/Inputs/cs-preinline-cost.perfbin
Executable file
Binary file not shown.
3000
llvm/test/tools/llvm-profgen/Inputs/cs-preinline-cost.perfscript
Normal file
3000
llvm/test/tools/llvm-profgen/Inputs/cs-preinline-cost.perfscript
Normal file
File diff suppressed because it is too large
Load Diff
66
llvm/test/tools/llvm-profgen/cs-preinline-cost.test
Normal file
66
llvm/test/tools/llvm-profgen/cs-preinline-cost.test
Normal file
@@ -0,0 +1,66 @@
|
||||
; REQUIRES: asserts
|
||||
; Test default using size of profile as a proxy
|
||||
; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/cs-preinline-cost.perfscript --binary=%S/Inputs/cs-preinline-cost.perfbin --csspgo-preinliner --debug-only=cs-preinliner --output=/dev/null 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT
|
||||
|
||||
; Test use-context-cost-for-preinliner using inlinee's byte size as context-sensitive inline cost
|
||||
; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/cs-preinline-cost.perfscript --binary=%S/Inputs/cs-preinline-cost.perfbin --csspgo-preinliner --debug-only=cs-preinliner --use-context-cost-for-preinliner --output=/dev/null 2>&1 | FileCheck %s --check-prefix=CHECK-CSCOST
|
||||
|
||||
CHECK-DEFAULT: Process main for context-sensitive pre-inlining (pre-inline size: 9, size limit: 108)
|
||||
CHECK-DEFAULT-NEXT: Inlined context profile for: main:9 @ _Z3fooi (callee size: 2, call count:545)
|
||||
CHECK-DEFAULT-NEXT: Inlined context profile for: main:7 @ _Z3fooi (callee size: 14, call count:545)
|
||||
CHECK-DEFAULT-NEXT: Inlined context profile for: main:8 @ _Z3fooi (callee size: 4, call count:544)
|
||||
|
||||
CHECK-CSCOST: Process main for context-sensitive pre-inlining (pre-inline size: 69, size limit: 828)
|
||||
CHECK-CSCOST-NEXT: Inlined context profile for: main:9 @ _Z3fooi (callee size: 264, call count:545)
|
||||
CHECK-CSCOST-NEXT: Inlined context profile for: main:7 @ _Z3fooi (callee size: 279, call count:545)
|
||||
CHECK-CSCOST-NEXT: Inlined context profile for: main:8 @ _Z3fooi (callee size: 44, call count:544)
|
||||
|
||||
; binary is built with the source below using the following command line:
|
||||
; clang -O3 -g -fpseudo-probe-for-profiling -fexperimental-new-pass-manager test.cpp
|
||||
;
|
||||
;#include <stdio.h>
|
||||
;
|
||||
;volatile int state = 9000;
|
||||
;
|
||||
;int foo(int x) {
|
||||
; if (x == 0) {
|
||||
; return 7;
|
||||
; }
|
||||
;
|
||||
; if ((x & 1) == 0) {
|
||||
; state--;
|
||||
; return 9;
|
||||
; }
|
||||
;
|
||||
; if (state > 5000) {
|
||||
; while (state > 5000) {
|
||||
; for (int i = 50; i >= 0; i--) {
|
||||
; state *= 6;
|
||||
; state /= 7;
|
||||
; state -= 1;
|
||||
; }
|
||||
; }
|
||||
; }
|
||||
; else {
|
||||
; while (state < 5000) {
|
||||
; for (int i = 50; i >= 0; i--) {
|
||||
; state *= 6;
|
||||
; state /= 5;
|
||||
; state += 1;
|
||||
; }
|
||||
; }
|
||||
; }
|
||||
;
|
||||
; return state;
|
||||
;}
|
||||
;
|
||||
;volatile int cnt = 10000000;//10000000;
|
||||
;int main() {
|
||||
; int r = 0;
|
||||
; for (int i = 0; i < cnt; i++) {
|
||||
; r += foo(i);
|
||||
; r -= foo(i & (~1));
|
||||
; r += foo(0);
|
||||
; }
|
||||
; return r;
|
||||
;}
|
||||
@@ -7,6 +7,7 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "CSPreInliner.h"
|
||||
#include "ProfiledBinary.h"
|
||||
#include "llvm/ADT/SCCIterator.h"
|
||||
#include <cstdint>
|
||||
#include <queue>
|
||||
@@ -25,15 +26,26 @@ extern cl::opt<int> ProfileInlineGrowthLimit;
|
||||
extern cl::opt<int> ProfileInlineLimitMin;
|
||||
extern cl::opt<int> ProfileInlineLimitMax;
|
||||
|
||||
cl::opt<bool> EnableCSPreInliner(
|
||||
"csspgo-preinliner", cl::Hidden, cl::init(false),
|
||||
cl::desc("Run a global pre-inliner to merge context profile based on "
|
||||
"estimated global top-down inline decisions"));
|
||||
|
||||
cl::opt<bool> UseContextCostForPreInliner(
|
||||
"use-context-cost-for-preinliner", cl::Hidden, cl::init(false),
|
||||
cl::desc("Use context-sensitive byte size cost for preinliner decisions"));
|
||||
|
||||
static cl::opt<bool> SamplePreInlineReplay(
|
||||
"csspgo-replay-preinline", cl::Hidden, cl::init(false),
|
||||
cl::desc(
|
||||
"Replay previous inlining and adjust context profile accordingly"));
|
||||
|
||||
CSPreInliner::CSPreInliner(StringMap<FunctionSamples> &Profiles,
|
||||
uint64_t HotThreshold, uint64_t ColdThreshold)
|
||||
: ContextTracker(Profiles), ProfileMap(Profiles),
|
||||
HotCountThreshold(HotThreshold), ColdCountThreshold(ColdThreshold) {}
|
||||
ProfiledBinary &Binary, uint64_t HotThreshold,
|
||||
uint64_t ColdThreshold)
|
||||
: UseContextCost(UseContextCostForPreInliner), ContextTracker(Profiles),
|
||||
ProfileMap(Profiles), Binary(Binary), HotCountThreshold(HotThreshold),
|
||||
ColdCountThreshold(ColdThreshold) {}
|
||||
|
||||
std::vector<StringRef> CSPreInliner::buildTopDownOrder() {
|
||||
std::vector<StringRef> Order;
|
||||
@@ -87,12 +99,22 @@ bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue,
|
||||
// TODO: call site and callee entry count should be mostly consistent, add
|
||||
// check for that.
|
||||
HasNewCandidate = true;
|
||||
CQueue.emplace(CalleeSamples, std::max(CallsiteCount, CalleeEntryCount));
|
||||
uint32_t CalleeSize = getFuncSize(*CalleeSamples);
|
||||
CQueue.emplace(CalleeSamples, std::max(CallsiteCount, CalleeEntryCount),
|
||||
CalleeSize);
|
||||
}
|
||||
|
||||
return HasNewCandidate;
|
||||
}
|
||||
|
||||
uint32_t CSPreInliner::getFuncSize(const FunctionSamples &FSamples) {
|
||||
if (UseContextCost) {
|
||||
return Binary.getFuncSizeForContext(FSamples.getContext());
|
||||
}
|
||||
|
||||
return FSamples.getBodySamples().size();
|
||||
}
|
||||
|
||||
bool CSPreInliner::shouldInline(ProfiledInlineCandidate &Candidate) {
|
||||
// If replay inline is requested, simply follow the inline decision of the
|
||||
// profiled binary.
|
||||
@@ -115,21 +137,20 @@ bool CSPreInliner::shouldInline(ProfiledInlineCandidate &Candidate) {
|
||||
}
|
||||
|
||||
void CSPreInliner::processFunction(const StringRef Name) {
|
||||
LLVM_DEBUG(dbgs() << "Process " << Name
|
||||
<< " for context-sensitive pre-inlining\n");
|
||||
|
||||
FunctionSamples *FSamples = ContextTracker.getBaseSamplesFor(Name);
|
||||
if (!FSamples)
|
||||
return;
|
||||
|
||||
// Use the number of lines/probes as proxy for function size for now.
|
||||
// TODO: retrieve accurate size from dwarf or binary instead.
|
||||
unsigned FuncSize = FSamples->getBodySamples().size();
|
||||
unsigned FuncSize = getFuncSize(*FSamples);
|
||||
unsigned FuncFinalSize = FuncSize;
|
||||
unsigned SizeLimit = FuncSize * ProfileInlineGrowthLimit;
|
||||
SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
|
||||
SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);
|
||||
|
||||
LLVM_DEBUG(dbgs() << "Process " << Name
|
||||
<< " for context-sensitive pre-inlining (pre-inline size: "
|
||||
<< FuncSize << ", size limit: " << SizeLimit << ")\n");
|
||||
|
||||
ProfiledCandidateQueue CQueue;
|
||||
getInlineCandidates(CQueue, FSamples);
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
#ifndef LLVM_TOOLS_LLVM_PROFGEN_PGOINLINEADVISOR_H
|
||||
#define LLVM_TOOLS_LLVM_PROFGEN_PGOINLINEADVISOR_H
|
||||
|
||||
#include "ProfiledBinary.h"
|
||||
#include "llvm/ADT/PriorityQueue.h"
|
||||
#include "llvm/ProfileData/ProfileCommon.h"
|
||||
#include "llvm/ProfileData/SampleProf.h"
|
||||
@@ -23,9 +24,9 @@ namespace sampleprof {
|
||||
|
||||
// Inline candidate seen from profile
|
||||
struct ProfiledInlineCandidate {
|
||||
ProfiledInlineCandidate(const FunctionSamples *Samples, uint64_t Count)
|
||||
: CalleeSamples(Samples), CallsiteCount(Count),
|
||||
SizeCost(Samples->getBodySamples().size()) {}
|
||||
ProfiledInlineCandidate(const FunctionSamples *Samples, uint64_t Count,
|
||||
uint32_t Size)
|
||||
: CalleeSamples(Samples), CallsiteCount(Count), SizeCost(Size) {}
|
||||
// Context-sensitive function profile for inline candidate
|
||||
const FunctionSamples *CalleeSamples;
|
||||
// Call site count for an inline candidate
|
||||
@@ -33,7 +34,6 @@ struct ProfiledInlineCandidate {
|
||||
// target count for corresponding call are consistent.
|
||||
uint64_t CallsiteCount;
|
||||
// Size proxy for function under particular call context.
|
||||
// TODO: use post-inline callee size from debug info.
|
||||
uint64_t SizeCost;
|
||||
};
|
||||
|
||||
@@ -67,8 +67,8 @@ using ProfiledCandidateQueue =
|
||||
// size by only keep context that is estimated to be inlined.
|
||||
class CSPreInliner {
|
||||
public:
|
||||
CSPreInliner(StringMap<FunctionSamples> &Profiles, uint64_t HotThreshold,
|
||||
uint64_t ColdThreshold);
|
||||
CSPreInliner(StringMap<FunctionSamples> &Profiles, ProfiledBinary &Binary,
|
||||
uint64_t HotThreshold, uint64_t ColdThreshold);
|
||||
void run();
|
||||
|
||||
private:
|
||||
@@ -77,8 +77,11 @@ private:
|
||||
std::vector<StringRef> buildTopDownOrder();
|
||||
void processFunction(StringRef Name);
|
||||
bool shouldInline(ProfiledInlineCandidate &Candidate);
|
||||
uint32_t getFuncSize(const FunctionSamples &FSamples);
|
||||
bool UseContextCost;
|
||||
SampleContextTracker ContextTracker;
|
||||
StringMap<FunctionSamples> &ProfileMap;
|
||||
ProfiledBinary &Binary;
|
||||
|
||||
// Count thresholds to answer isHotCount and isColdCount queries.
|
||||
// Mirrors the threshold in ProfileSummaryInfo.
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "ProfileGenerator.h"
|
||||
#include "ProfiledBinary.h"
|
||||
#include "llvm/ProfileData/ProfileCommon.h"
|
||||
#include <unordered_set>
|
||||
|
||||
@@ -55,11 +56,6 @@ static cl::opt<int, true> CSProfMaxContextDepth(
|
||||
"depth limit."),
|
||||
cl::location(llvm::sampleprof::CSProfileGenerator::MaxContextDepth));
|
||||
|
||||
static cl::opt<bool> EnableCSPreInliner(
|
||||
"csspgo-preinliner", cl::Hidden, cl::init(false),
|
||||
cl::desc("Run a global pre-inliner to merge context profile based on "
|
||||
"estimated global top-down inline decisions"));
|
||||
|
||||
extern cl::opt<int> ProfileSummaryCutoffCold;
|
||||
|
||||
using namespace llvm;
|
||||
@@ -406,8 +402,10 @@ void CSProfileGenerator::postProcessProfiles() {
|
||||
|
||||
// Run global pre-inliner to adjust/merge context profile based on estimated
|
||||
// inline decisions.
|
||||
if (EnableCSPreInliner)
|
||||
CSPreInliner(ProfileMap, HotCountThreshold, ColdCountThreshold).run();
|
||||
if (EnableCSPreInliner) {
|
||||
CSPreInliner(ProfileMap, *Binary, HotCountThreshold, ColdCountThreshold)
|
||||
.run();
|
||||
}
|
||||
|
||||
// Trim and merge cold context profile using cold threshold above;
|
||||
SampleContextTrimmer(ProfileMap)
|
||||
|
||||
@@ -52,6 +52,60 @@ static const Target *getTarget(const ObjectFile *Obj) {
|
||||
return TheTarget;
|
||||
}
|
||||
|
||||
void BinarySizeContextTracker::addInstructionForContext(
|
||||
const FrameLocationStack &Context, uint32_t InstrSize) {
|
||||
ContextTrieNode *CurNode = &RootContext;
|
||||
bool IsLeaf = true;
|
||||
for (const auto &Callsite : reverse(Context)) {
|
||||
StringRef CallerName = Callsite.first;
|
||||
LineLocation CallsiteLoc = IsLeaf ? LineLocation(0, 0) : Callsite.second;
|
||||
CurNode = CurNode->getOrCreateChildContext(CallsiteLoc, CallerName);
|
||||
IsLeaf = false;
|
||||
}
|
||||
|
||||
CurNode->setFunctionSize(CurNode->getFunctionSize() + InstrSize);
|
||||
}
|
||||
|
||||
uint32_t
|
||||
BinarySizeContextTracker::getFuncSizeForContext(const SampleContext &Context) {
|
||||
ContextTrieNode *CurrNode = &RootContext;
|
||||
ContextTrieNode *PrevNode = nullptr;
|
||||
StringRef ContextRemain = Context;
|
||||
StringRef ChildContext;
|
||||
StringRef CallerName;
|
||||
uint32_t Size = 0;
|
||||
|
||||
// Start from top-level context-less function, travese down the reverse
|
||||
// context trie to find the best/longest match for given context, then
|
||||
// retrieve the size.
|
||||
while (CurrNode && !ContextRemain.empty()) {
|
||||
// rsplit so we process from leaf function to callers (added to context).
|
||||
auto ContextSplit = SampleContext::rsplitContextString(ContextRemain);
|
||||
ChildContext = ContextSplit.second;
|
||||
ContextRemain = ContextSplit.first;
|
||||
LineLocation CallSiteLoc(0, 0);
|
||||
SampleContext::decodeContextString(ChildContext, CallerName, CallSiteLoc);
|
||||
PrevNode = CurrNode;
|
||||
CurrNode = CurrNode->getChildContext(CallSiteLoc, CallerName);
|
||||
if (CurrNode && CurrNode->getFunctionSize())
|
||||
Size = CurrNode->getFunctionSize();
|
||||
}
|
||||
|
||||
// If we traversed all nodes along the path of the context and haven't
|
||||
// found a size yet, pivot to look for size from sibling nodes, i.e size
|
||||
// of inlinee under different context.
|
||||
if (!Size) {
|
||||
if (!CurrNode)
|
||||
CurrNode = PrevNode;
|
||||
while (!Size && CurrNode) {
|
||||
CurrNode = &CurrNode->getAllChildContext().begin()->second;
|
||||
Size = CurrNode->getFunctionSize();
|
||||
}
|
||||
}
|
||||
|
||||
return Size;
|
||||
}
|
||||
|
||||
void ProfiledBinary::load() {
|
||||
// Attempt to open the binary.
|
||||
OwningBinary<Binary> OBinary = unwrapOrError(createBinary(Path), Path);
|
||||
@@ -253,7 +307,8 @@ bool ProfiledBinary::dissassembleSymbol(std::size_t SI, ArrayRef<uint8_t> Bytes,
|
||||
if (Cur < 40)
|
||||
outs().indent(40 - Cur);
|
||||
InstructionPointer IP(this, Offset);
|
||||
outs() << getReversedLocWithContext(symbolize(IP, ShowCanonicalFnName));
|
||||
outs() << getReversedLocWithContext(
|
||||
symbolize(IP, ShowCanonicalFnName, ShowPseudoProbe));
|
||||
}
|
||||
outs() << "\n";
|
||||
}
|
||||
@@ -263,12 +318,21 @@ bool ProfiledBinary::dissassembleSymbol(std::size_t SI, ArrayRef<uint8_t> Bytes,
|
||||
// Populate a vector of the symbolized callsite at this location
|
||||
// We don't need symbolized info for probe-based profile, just use an
|
||||
// empty stack as an entry to indicate a valid binary offset
|
||||
FrameLocationStack SymbolizedCallStack;
|
||||
if (!UsePseudoProbes) {
|
||||
|
||||
if (!UsePseudoProbes || TrackFuncContextSize) {
|
||||
InstructionPointer IP(this, Offset);
|
||||
SymbolizedCallStack = symbolize(IP, true);
|
||||
// TODO: reallocation of Offset2LocStackMap will lead to dangling
|
||||
// strings. We need ProfiledBinary to own these strings.
|
||||
Offset2LocStackMap[Offset] = symbolize(IP, true, UsePseudoProbes);
|
||||
FrameLocationStack &SymbolizedCallStack = Offset2LocStackMap[Offset];
|
||||
// Record instruction size for the corresponding context
|
||||
if (TrackFuncContextSize && !SymbolizedCallStack.empty())
|
||||
FuncSizeTracker.addInstructionForContext(Offset2LocStackMap[Offset],
|
||||
Size);
|
||||
} else {
|
||||
Offset2LocStackMap[Offset] = FrameLocationStack();
|
||||
}
|
||||
Offset2LocStackMap[Offset] = SymbolizedCallStack;
|
||||
|
||||
// Populate address maps.
|
||||
CodeAddrs.push_back(Offset);
|
||||
if (MCDesc.isCall())
|
||||
@@ -411,7 +475,8 @@ void ProfiledBinary::setupSymbolizer() {
|
||||
}
|
||||
|
||||
FrameLocationStack ProfiledBinary::symbolize(const InstructionPointer &IP,
|
||||
bool UseCanonicalFnName) {
|
||||
bool UseCanonicalFnName,
|
||||
bool UseProbeDiscriminator) {
|
||||
assert(this == IP.Binary &&
|
||||
"Binary should only symbolize its own instruction");
|
||||
auto Addr = object::SectionedAddress{IP.Offset + getPreferredBaseAddress(),
|
||||
@@ -420,18 +485,28 @@ FrameLocationStack ProfiledBinary::symbolize(const InstructionPointer &IP,
|
||||
unwrapOrError(Symbolizer->symbolizeInlinedCode(Path, Addr), getName());
|
||||
|
||||
FrameLocationStack CallStack;
|
||||
|
||||
for (int32_t I = InlineStack.getNumberOfFrames() - 1; I >= 0; I--) {
|
||||
const auto &CallerFrame = InlineStack.getFrame(I);
|
||||
if (CallerFrame.FunctionName == "<invalid>")
|
||||
break;
|
||||
|
||||
StringRef FunctionName(CallerFrame.FunctionName);
|
||||
if (UseCanonicalFnName)
|
||||
FunctionName = FunctionSamples::getCanonicalFnName(FunctionName);
|
||||
LineLocation Line(CallerFrame.Line - CallerFrame.StartLine,
|
||||
DILocation::getBaseDiscriminatorFromDiscriminator(
|
||||
CallerFrame.Discriminator,
|
||||
/* IsFSDiscriminator */ false));
|
||||
|
||||
uint32_t Discriminator = CallerFrame.Discriminator;
|
||||
uint32_t LineOffset = CallerFrame.Line - CallerFrame.StartLine;
|
||||
if (UseProbeDiscriminator) {
|
||||
LineOffset =
|
||||
PseudoProbeDwarfDiscriminator::extractProbeIndex(Discriminator);
|
||||
Discriminator = 0;
|
||||
} else {
|
||||
Discriminator = DILocation::getBaseDiscriminatorFromDiscriminator(
|
||||
CallerFrame.Discriminator,
|
||||
/* IsFSDiscriminator */ false);
|
||||
}
|
||||
|
||||
LineLocation Line(LineOffset, Discriminator);
|
||||
FrameLocation Callsite(FunctionName.str(), Line);
|
||||
CallStack.push_back(Callsite);
|
||||
}
|
||||
|
||||
@@ -29,6 +29,7 @@
|
||||
#include "llvm/ProfileData/SampleProf.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/Path.h"
|
||||
#include "llvm/Transforms/IPO/SampleContextTracker.h"
|
||||
#include <list>
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
@@ -37,6 +38,9 @@
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
|
||||
extern cl::opt<bool> EnableCSPreInliner;
|
||||
extern cl::opt<bool> UseContextCostForPreInliner;
|
||||
|
||||
using namespace llvm;
|
||||
using namespace sampleprof;
|
||||
using namespace llvm::object;
|
||||
@@ -95,6 +99,36 @@ struct PrologEpilogTracker {
|
||||
}
|
||||
};
|
||||
|
||||
// Track function byte size under different context (outlined version as well as
|
||||
// various inlined versions). It also provides query support to get function
|
||||
// size with the best matching context, which is used to help pre-inliner use
|
||||
// accurate post-optimization size to make decisions.
|
||||
// TODO: If an inlinee is completely optimized away, ideally we should have zero
|
||||
// for its context size, currently we would misss such context since it doesn't
|
||||
// have instructions. To fix this, we need to mark all inlinee with entry probe
|
||||
// but without instructions as having zero size.
|
||||
class BinarySizeContextTracker {
|
||||
public:
|
||||
// Add instruction with given size to a context
|
||||
void addInstructionForContext(const FrameLocationStack &Context,
|
||||
uint32_t InstrSize);
|
||||
|
||||
// Get function size with a specific context. When there's no exact match
|
||||
// for the given context, try to retrieve the size of that function from
|
||||
// closest matching context.
|
||||
uint32_t getFuncSizeForContext(const SampleContext &Context);
|
||||
|
||||
void dump() { RootContext.dumpTree(); }
|
||||
|
||||
private:
|
||||
// Root node for context trie tree, node that this is a reverse context trie
|
||||
// with callee as parent and caller as child. This way we can traverse from
|
||||
// root to find the best/longest matching context if an exact match does not
|
||||
// exist. It gives us the best possible estimate for function's post-inline,
|
||||
// post-optimization byte size.
|
||||
ContextTrieNode RootContext;
|
||||
};
|
||||
|
||||
class ProfiledBinary {
|
||||
// Absolute path of the binary.
|
||||
std::string Path;
|
||||
@@ -130,8 +164,12 @@ class ProfiledBinary {
|
||||
// A set of return instruction offsets. Used by virtual unwinding.
|
||||
std::unordered_set<uint64_t> RetAddrs;
|
||||
|
||||
// Estimate and track function prolog and epilog ranges.
|
||||
PrologEpilogTracker ProEpilogTracker;
|
||||
|
||||
// Track function sizes under different context
|
||||
BinarySizeContextTracker FuncSizeTracker;
|
||||
|
||||
// The symbolizer used to get inline context for an instruction.
|
||||
std::unique_ptr<symbolize::LLVMSymbolizer> Symbolizer;
|
||||
|
||||
@@ -140,6 +178,9 @@ class ProfiledBinary {
|
||||
|
||||
bool UsePseudoProbes = false;
|
||||
|
||||
// Whether we need to symbolize all instructions to get function context size.
|
||||
bool TrackFuncContextSize = false;
|
||||
|
||||
// Indicate if the base loading address is parsed from the mmap event or uses
|
||||
// the preferred address
|
||||
bool IsLoadedByMMap = false;
|
||||
@@ -165,7 +206,8 @@ class ProfiledBinary {
|
||||
SectionSymbolsTy &Symbols, const SectionRef &Section);
|
||||
/// Symbolize a given instruction pointer and return a full call context.
|
||||
FrameLocationStack symbolize(const InstructionPointer &IP,
|
||||
bool UseCanonicalFnName = false);
|
||||
bool UseCanonicalFnName = false,
|
||||
bool UseProbeDiscriminator = false);
|
||||
|
||||
/// Decode the interesting parts of the binary and build internal data
|
||||
/// structures. On high level, the parts of interest are:
|
||||
@@ -183,7 +225,10 @@ class ProfiledBinary {
|
||||
}
|
||||
|
||||
public:
|
||||
ProfiledBinary(const StringRef Path) : Path(Path), ProEpilogTracker(this) {
|
||||
ProfiledBinary(const StringRef Path)
|
||||
: Path(Path), ProEpilogTracker(this),
|
||||
TrackFuncContextSize(EnableCSPreInliner &&
|
||||
UseContextCostForPreInliner) {
|
||||
setupSymbolizer();
|
||||
load();
|
||||
}
|
||||
@@ -249,6 +294,10 @@ public:
|
||||
return FuncStartAddrMap[Offset];
|
||||
}
|
||||
|
||||
uint32_t getFuncSizeForContext(SampleContext &Context) {
|
||||
return FuncSizeTracker.getFuncSizeForContext(Context);
|
||||
}
|
||||
|
||||
Optional<FrameLocation> getInlineLeafFrameLoc(uint64_t Offset) {
|
||||
const auto &Stack = getFrameLocationStack(Offset);
|
||||
if (Stack.empty())
|
||||
|
||||
Reference in New Issue
Block a user