Inlining fixes/enhancements

Summary:
A number of fixes/enhancements to inline-small-functions
- Fixed estimateHotSize to compute the size with computeCodeSize instead of using the original layout offsets.
- Added -print-inline option to dump CFGs for functions that have been modified by inlining.
- Added the -force-inline flag to force consideration of the listed functions even when they have no profiling info (mostly for testing).
- Updated debug line info for inlined functions.
- Ignore the number of pseudo instructions when checking for candidates of suitable size.

Misc changes
- Moved most print flags to BinaryPasses.cpp

(cherry picked from FBD3812658)
This commit is contained in:
Bill Nell
2016-09-02 11:58:53 -07:00
committed by Maksim Panchenko
parent 1cf200107e
commit dcaffe64d3
6 changed files with 198 additions and 105 deletions

View File

@@ -152,6 +152,22 @@ void findSubprograms(DWARFCompileUnit *Unit,
} // namespace
/// Register a file name taken from the line table of the compile unit at
/// offset \p SrcCUID with the compile unit identified by \p DestCUID.
///
/// \p FileIndex is a 1-based index into the source unit's file name table.
/// Returns whatever MCContext::getDwarfFile() returns for the request; a
/// file number of 0 is passed so the number is chosen by the MCContext.
unsigned BinaryContext::addDebugFilenameToUnit(const uint32_t DestCUID,
                                               const uint32_t SrcCUID,
                                               unsigned FileIndex) {
  auto SourceUnit = DwCtx->getCompileUnitForOffset(SrcCUID);
  auto SourceLineTable = DwCtx->getLineTableForUnit(SourceUnit);
  const auto &FileNames = SourceLineTable->Prologue.FileNames;

  // File numbers are 1-based, so the valid range is [1, FileNames.size()].
  assert(FileIndex > 0 && FileIndex <= FileNames.size() &&
         "FileIndex out of range for the compilation unit.");
  const auto &Entry = FileNames[FileIndex - 1];

  // Directory indexes are 1-based as well; index 0 stands for "no directory"
  // and is represented by an empty string.
  const char *Dir = "";
  if (Entry.DirIdx)
    Dir = SourceLineTable->Prologue.IncludeDirectories[Entry.DirIdx - 1];

  return Ctx->getDwarfFile(Dir, Entry.Name, 0, DestCUID);
}
void BinaryContext::preprocessDebugInfo(
std::map<uint64_t, BinaryFunction> &BinaryFunctions) {
// Populate MCContext with DWARF files.
@@ -165,7 +181,7 @@ void BinaryContext::preprocessDebugInfo(
const char *Dir = FileNames[I].DirIdx ?
LineTable->Prologue.IncludeDirectories[FileNames[I].DirIdx - 1] :
"";
Ctx->getDwarfFile(Dir, FileNames[I].Name, I + 1, CUID);
Ctx->getDwarfFile(Dir, FileNames[I].Name, 0, CUID);
}
}

View File

@@ -193,6 +193,11 @@ public:
void preprocessFunctionDebugInfo(
std::map<uint64_t, BinaryFunction> &BinaryFunctions);
/// Add a filename entry from SrcCUID to DestCUID.
unsigned addDebugFilenameToUnit(const uint32_t DestCUID,
const uint32_t SrcCUID,
unsigned FileIndex);
/// Compute the native code size for a range of instructions.
/// Note: this can be imprecise wrt the final binary since happening prior to
/// relaxation, as well as wrt the original binary because of opcode

View File

@@ -1080,11 +1080,13 @@ public:
/// This is a very rough estimate, as with C++ exceptions there are
/// blocks we don't move, and it makes no attempt at estimating the size
/// of the added/removed branch instructions.
/// Note that this size is optimistic and the actual size may increase
/// after relaxation.
size_t estimateHotSize() const {
size_t Estimate = 0;
for (const auto *BB : BasicBlocksLayout) {
if (BB->ExecutionCount != 0) {
Estimate += getBasicBlockOriginalSize(BB);
Estimate += BC.computeCodeSize(BB->begin(), BB->end());
}
}
return Estimate;

View File

@@ -15,50 +15,94 @@
#define DEBUG_TYPE "bolt"
using namespace llvm;
namespace opts {
extern llvm::cl::opt<unsigned> Verbosity;
extern llvm::cl::opt<bool> PrintAll;
extern llvm::cl::opt<bool> DumpDotAll;
extern llvm::cl::opt<bool> PrintReordered;
extern llvm::cl::opt<bool> PrintEHRanges;
extern llvm::cl::opt<bool> PrintUCE;
extern llvm::cl::opt<bool> PrintPeepholes;
extern llvm::cl::opt<bool> PrintSimplifyROLoads;
extern llvm::cl::opt<bool> PrintICF;
extern llvm::cl::opt<llvm::bolt::BinaryFunction::SplittingType> SplitFunctions;
extern bool shouldProcess(const llvm::bolt::BinaryFunction &Function);
extern cl::opt<unsigned> Verbosity;
extern cl::opt<bool> PrintAll;
extern cl::opt<bool> DumpDotAll;
extern cl::opt<llvm::bolt::BinaryFunction::SplittingType> SplitFunctions;
extern bool shouldProcess(const bolt::BinaryFunction &Function);
static llvm::cl::opt<llvm::bolt::BinaryFunction::LayoutType>
static cl::opt<bool>
PrintReordered("print-reordered",
cl::desc("print functions after layout optimization"),
cl::Hidden);
static cl::opt<bool>
PrintEHRanges("print-eh-ranges",
cl::desc("print function with updated exception ranges"),
cl::Hidden);
static cl::opt<bool>
PrintUCE("print-uce",
cl::desc("print functions after unreachable code elimination"),
cl::Hidden);
static cl::opt<bool>
PrintPeepholes("print-peepholes",
cl::desc("print functions after peephole optimization"),
cl::Hidden);
static cl::opt<bool>
PrintSimplifyROLoads("print-simplify-rodata-loads",
cl::desc("print functions after simplification of RO data"
" loads"),
cl::Hidden);
static cl::opt<bool>
PrintICF("print-icf",
cl::desc("print functions after ICF optimization"),
cl::Hidden);
static cl::opt<bool>
PrintInline("print-inline",
cl::desc("print functions after inlining optimization"),
cl::Hidden);
static cl::list<std::string>
ForceInlineFunctions("force-inline",
cl::CommaSeparated,
cl::desc("list of functions to always consider "
"for inlining"),
cl::value_desc("func1,func2,func3,..."));
static cl::opt<bool>
AggressiveInlining("aggressive-inlining",
cl::desc("perform aggressive inlining"),
cl::Hidden);
static cl::opt<bolt::BinaryFunction::LayoutType>
ReorderBlocks(
"reorder-blocks",
llvm::cl::desc("change layout of basic blocks in a function"),
llvm::cl::init(llvm::bolt::BinaryFunction::LT_NONE),
llvm::cl::values(clEnumValN(llvm::bolt::BinaryFunction::LT_NONE,
"none",
"do not reorder basic blocks"),
clEnumValN(llvm::bolt::BinaryFunction::LT_REVERSE,
"reverse",
"layout blocks in reverse order"),
clEnumValN(llvm::bolt::BinaryFunction::LT_OPTIMIZE,
"normal",
"perform optimal layout based on profile"),
clEnumValN(llvm::bolt::BinaryFunction::LT_OPTIMIZE_BRANCH,
"branch-predictor",
"perform optimal layout prioritizing branch "
"predictions"),
clEnumValN(llvm::bolt::BinaryFunction::LT_OPTIMIZE_CACHE,
"cache",
"perform optimal layout prioritizing I-cache "
"behavior"),
clEnumValEnd));
cl::desc("change layout of basic blocks in a function"),
cl::init(bolt::BinaryFunction::LT_NONE),
cl::values(clEnumValN(bolt::BinaryFunction::LT_NONE,
"none",
"do not reorder basic blocks"),
clEnumValN(bolt::BinaryFunction::LT_REVERSE,
"reverse",
"layout blocks in reverse order"),
clEnumValN(bolt::BinaryFunction::LT_OPTIMIZE,
"normal",
"perform optimal layout based on profile"),
clEnumValN(bolt::BinaryFunction::LT_OPTIMIZE_BRANCH,
"branch-predictor",
"perform optimal layout prioritizing branch "
"predictions"),
clEnumValN(bolt::BinaryFunction::LT_OPTIMIZE_CACHE,
"cache",
"perform optimal layout prioritizing I-cache "
"behavior"),
clEnumValEnd));
static llvm::cl::opt<bool>
static cl::opt<bool>
MinBranchClusters(
"min-branch-clusters",
llvm::cl::desc("use a modified clustering algorithm geared towards "
"minimizing branches"),
llvm::cl::Hidden);
cl::desc("use a modified clustering algorithm geared towards "
"minimizing branches"),
cl::Hidden);
} // namespace opts
@@ -152,10 +196,8 @@ void InlineSmallFunctions::findInliningCandidates(
auto &BB = *Function.begin();
const auto &LastInstruction = *BB.rbegin();
// Check if the function is small enough and doesn't do a tail call.
// The size we use includes pseudo-instructions but here they shouldn't
// matter. So some opportunities may be missed because of this.
if (BB.size() > 0 &&
BB.size() <= kMaxInstructions &&
(BB.size() - BB.getNumPseudos()) <= kMaxInstructions &&
BC.MIA->isReturn(LastInstruction) &&
!BC.MIA->isTailCall(LastInstruction)) {
InliningCandidates.insert(&Function);
@@ -351,8 +393,11 @@ InlineSmallFunctions::inlineCall(
const MCSymbol *OldFTLabel = nullptr;
MCInst *CondBranch = nullptr;
MCInst *UncondBranch = nullptr;
assert(BC.MIA->analyzeBranch(Instruction, OldTargetLabel, OldFTLabel,
CondBranch, UncondBranch));
const bool Result = BC.MIA->analyzeBranch(Instruction, OldTargetLabel,
OldFTLabel, CondBranch,
UncondBranch);
assert(Result &&
"analyzeBranch failed on instruction guaranteed to be a branch");
assert(OldTargetLabel);
const MCSymbol *NewTargetLabel = nullptr;
for (const auto SuccBB : InlinedFunctionBB->successors()) {
@@ -543,7 +588,7 @@ bool InlineSmallFunctions::inlineCallsInFunction(
for (auto InstIt = BB->begin(), End = BB->end(); InstIt != End; ++InstIt) {
auto &Inst = *InstIt;
if (BC.MIA->isCall(Inst)) {
totalDynamicCalls += BB->getExecutionCount();
TotalDynamicCalls += BB->getExecutionCount();
}
}
}
@@ -569,12 +614,12 @@ bool InlineSmallFunctions::inlineCallsInFunction(
bool CallToInlineableFunction =
InliningCandidates.count(TargetFunction);
totalInlineableCalls +=
TotalInlineableCalls +=
CallToInlineableFunction * BB->getExecutionCount();
if (CallToInlineableFunction &&
TargetFunction->getSize() + ExtraSize
+ Function.estimateHotSize() < Function.getMaxSize()) {
+ Function.estimateHotSize() < Function.getMaxSize()) {
auto NextInstIt = std::next(InstIt);
inlineCall(BC, *BB, &Inst, *TargetFunction->begin());
DidInlining = true;
@@ -583,7 +628,7 @@ bool InlineSmallFunctions::inlineCallsInFunction(
<< Function << "\n");
InstIt = NextInstIt;
ExtraSize += TargetFunction->getSize();
inlinedDynamicCalls += BB->getExecutionCount();
InlinedDynamicCalls += BB->getExecutionCount();
continue;
}
}
@@ -611,7 +656,7 @@ bool InlineSmallFunctions::inlineCallsInFunctionAggressive(
for (auto InstIt = BB->begin(), End = BB->end(); InstIt != End; ++InstIt) {
auto &Inst = *InstIt;
if (BC.MIA->isCall(Inst)) {
totalDynamicCalls += BB->getExecutionCount();
TotalDynamicCalls += BB->getExecutionCount();
}
}
}
@@ -638,7 +683,7 @@ bool InlineSmallFunctions::inlineCallsInFunctionAggressive(
bool CallToInlineableFunction =
InliningCandidates.count(TargetFunction);
totalInlineableCalls +=
TotalInlineableCalls +=
CallToInlineableFunction * BB->getExecutionCount();
if (CallToInlineableFunction &&
@@ -655,7 +700,7 @@ bool InlineSmallFunctions::inlineCallsInFunctionAggressive(
InstIndex = NextBB == BB ? NextInstIndex : BB->size();
InstIt = NextBB == BB ? BB->begin() + NextInstIndex : BB->end();
ExtraSize += TargetFunction->getSize();
inlinedDynamicCalls += BB->getExecutionCount();
InlinedDynamicCalls += BB->getExecutionCount();
continue;
}
}
@@ -669,20 +714,35 @@ bool InlineSmallFunctions::inlineCallsInFunctionAggressive(
return DidInlining;
}
/// Return true if \p BF matches, via BinaryFunction::hasName(), any of the
/// names supplied through the -force-inline command line list.
bool InlineSmallFunctions::mustConsider(const BinaryFunction &BF) {
  bool IsForced = false;
  for (auto &ForcedName : opts::ForceInlineFunctions) {
    if (BF.hasName(ForcedName)) {
      // One match is enough; no need to look at the remaining names.
      IsForced = true;
      break;
    }
  }
  return IsForced;
}
void InlineSmallFunctions::runOnFunctions(
BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &) {
findInliningCandidates(BC, BFs);
if (opts::AggressiveInlining)
findInliningCandidatesAggressive(BC, BFs);
else
findInliningCandidates(BC, BFs);
std::vector<BinaryFunction *> ConsideredFunctions;
std::vector<bool> Modified;
for (auto &It : BFs) {
auto &Function = It.second;
if (!Function.isSimple() || !opts::shouldProcess(Function))
continue;
if (Function.getExecutionCount() == BinaryFunction::COUNT_NO_PROFILE)
if (Function.getExecutionCount() == BinaryFunction::COUNT_NO_PROFILE &&
!mustConsider(Function))
continue;
ConsideredFunctions.push_back(&Function);
Modified.push_back(false);
}
std::sort(ConsideredFunctions.begin(), ConsideredFunctions.end(),
[](BinaryFunction *A, BinaryFunction *B) {
@@ -692,14 +752,34 @@ void InlineSmallFunctions::runOnFunctions(
for (unsigned i = 0; i < ConsideredFunctions.size() &&
ModifiedFunctions <= kMaxFunctions; ++i) {
auto &Function = *ConsideredFunctions[i];
if (inlineCallsInFunction(BC, Function))
const bool DidInline = opts::AggressiveInlining
? inlineCallsInFunctionAggressive(BC, Function)
: inlineCallsInFunction(BC, Function);
if (DidInline) {
Modified[i] = true;
++ModifiedFunctions;
}
}
DEBUG(dbgs() << "BOLT-DEBUG: Inlined " << inlinedDynamicCalls << " of "
<< totalDynamicCalls << " function calls in the profile.\n");
DEBUG(dbgs() << "BOLT-DEBUG: Inlined calls represent "
<< (100.0 * inlinedDynamicCalls / totalInlineableCalls)
if (opts::PrintAll || opts::PrintInline || opts::DumpDotAll) {
for (unsigned i = 0; i < ConsideredFunctions.size(); ++i) {
if (Modified[i]) {
const auto *Function = ConsideredFunctions[i];
if (opts::PrintAll || opts::PrintInline)
Function->print(errs(), "after inlining", true);
if (opts::DumpDotAll)
Function->dumpGraphForPass("inlining");
}
}
}
DEBUG(dbgs() << "BOLT-INFO: Inlined " << InlinedDynamicCalls << " of "
<< TotalDynamicCalls << " function calls in the profile.\n"
<< "BOLT-INFO: Inlined calls represent "
<< format("%.1f", 100.0 * InlinedDynamicCalls / TotalInlineableCalls)
<< "% of all inlineable calls in the profile.\n");
}

View File

@@ -72,9 +72,11 @@ private:
static const unsigned kMaxFunctions = 30000;
/// Statistics collected for debugging.
uint64_t totalDynamicCalls = 0;
uint64_t inlinedDynamicCalls = 0;
uint64_t totalInlineableCalls = 0;
uint64_t TotalDynamicCalls = 0;
uint64_t InlinedDynamicCalls = 0;
uint64_t TotalInlineableCalls = 0;
static bool mustConsider(const BinaryFunction &BF);
void findInliningCandidates(BinaryContext &BC,
const std::map<uint64_t, BinaryFunction> &BFs);

View File

@@ -172,41 +172,10 @@ static cl::opt<bool>
PrintLoopInfo("print-loops", cl::desc("print loop related information"),
cl::Hidden);
cl::opt<bool>
PrintUCE("print-uce",
cl::desc("print functions after unreachable code elimination"),
cl::Hidden);
cl::opt<bool>
PrintPeepholes("print-peepholes",
cl::desc("print functions after peephole optimization"),
cl::Hidden);
static cl::opt<bool>
PrintDisasm("print-disasm", cl::desc("print function after disassembly"),
cl::Hidden);
cl::opt<bool>
PrintEHRanges("print-eh-ranges",
cl::desc("print function with updated exception ranges"),
cl::Hidden);
cl::opt<bool>
PrintSimplifyROLoads("print-simplify-rodata-loads",
cl::desc("print functions after simplification of RO data"
" loads"),
cl::Hidden);
cl::opt<bool>
PrintReordered("print-reordered",
cl::desc("print functions after layout optimization"),
cl::Hidden);
cl::opt<bool>
PrintICF("print-icf",
cl::desc("print functions after ICF optimization"),
cl::Hidden);
static cl::opt<bool>
KeepTmp("keep-tmp",
cl::desc("preserve intermediate .o file"),
@@ -1234,6 +1203,7 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function,
}
// Emit code.
auto ULT = Function.getDWARFUnitLineTable();
int64_t CurrentGnuArgsSize = 0;
for (auto BB : Function.layout()) {
if (EmitColdPart != BB->isCold())
@@ -1264,28 +1234,46 @@ void emitFunction(MCStreamer &Streamer, BinaryFunction &Function,
auto RowReference = DebugLineTableRowRef::fromSMLoc(Instr.getLoc());
if (RowReference != DebugLineTableRowRef::NULL_ROW &&
Instr.getLoc().getPointer() != LastLocSeen.getPointer()) {
auto ULT = Function.getDWARFUnitLineTable();
auto Unit = ULT.first;
auto OriginalLineTable = ULT.second;
const auto OrigUnitID = Unit->getOffset();
unsigned NewFilenum = 0;
// If the CU id from the current instruction location does not
// match the CU id from the current function, it means that we
// have come across some inlined code. We must look up the CU
// for the instruction's original function and get the line table
// from that. We also update the current CU debug info with the
// filename of the inlined function.
if (RowReference.DwCompileUnitIndex != OrigUnitID) {
Unit =
BC.DwCtx->getCompileUnitForOffset(RowReference.DwCompileUnitIndex);
OriginalLineTable = BC.DwCtx->getLineTableForUnit(Unit);
const auto Filenum =
OriginalLineTable->Rows[RowReference.RowIndex - 1].File;
NewFilenum =
BC.addDebugFilenameToUnit(OrigUnitID,
RowReference.DwCompileUnitIndex,
Filenum);
}
assert(Unit && OriginalLineTable &&
"Invalid CU offset set in instruction debug info.");
assert(RowReference.DwCompileUnitIndex == Unit->getOffset() &&
"DWARF compile unit mismatch");
const auto &OriginalRow =
OriginalLineTable->Rows[RowReference.RowIndex - 1];
OriginalLineTable->Rows[RowReference.RowIndex - 1];
BC.Ctx->setCurrentDwarfLoc(
OriginalRow.File,
OriginalRow.Line,
OriginalRow.Column,
(DWARF2_FLAG_IS_STMT * OriginalRow.IsStmt) |
(DWARF2_FLAG_BASIC_BLOCK * OriginalRow.BasicBlock) |
(DWARF2_FLAG_PROLOGUE_END * OriginalRow.PrologueEnd) |
(DWARF2_FLAG_EPILOGUE_BEGIN * OriginalRow.EpilogueBegin),
OriginalRow.Isa,
OriginalRow.Discriminator);
BC.Ctx->setDwarfCompileUnitID(Unit->getOffset());
NewFilenum == 0 ? OriginalRow.File : NewFilenum,
OriginalRow.Line,
OriginalRow.Column,
(DWARF2_FLAG_IS_STMT * OriginalRow.IsStmt) |
(DWARF2_FLAG_BASIC_BLOCK * OriginalRow.BasicBlock) |
(DWARF2_FLAG_PROLOGUE_END * OriginalRow.PrologueEnd) |
(DWARF2_FLAG_EPILOGUE_BEGIN * OriginalRow.EpilogueBegin),
OriginalRow.Isa,
OriginalRow.Discriminator);
BC.Ctx->setDwarfCompileUnitID(OrigUnitID);
LastLocSeen = Instr.getLoc();
}
}