[BOLT] Add shrink wrapping pass

Summary:
Add an implementation for shrink wrapping, a frame optimization
that moves callee-saved register spills from hot prologues to cold
successors.
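
As a sketch of the transformation (hypothetical x86-64 shape, not taken
from this patch):

  Before (save/restore execute on every path):
    foo:  push %rbx ; ... hot code ... ; pop %rbx ; ret
  After (spill sunk into the cold block that actually clobbers %rbx):
    foo:  ... hot code ... ; ret
    cold: push %rbx ; ... code using %rbx ... ; pop %rbx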

(cherry picked from FBD4983706)
Rafael Auler
2017-05-01 16:52:54 -07:00
committed by Maksim Panchenko
parent 4b485f4167
commit d850ca3622
32 changed files with 3609 additions and 844 deletions


@@ -148,8 +148,9 @@ BinaryBasicBlock *BinaryBasicBlock::getLandingPad(const MCSymbol *Label) const {
}
int32_t BinaryBasicBlock::getCFIStateAtInstr(const MCInst *Instr) const {
assert(getFunction()->getState() == BinaryFunction::State::CFG &&
"can only calculate CFI state when function is in active CFG state");
assert(
getFunction()->getState() >= BinaryFunction::State::CFG &&
"can only calculate CFI state when function is in or past the CFG state");
const auto &FDEProgram = getFunction()->getFDEProgram();
@@ -316,6 +317,38 @@ bool BinaryBasicBlock::analyzeBranch(const MCSymbol *&TBB,
return MIA->analyzeBranch(Instructions, TBB, FBB, CondBranch, UncondBranch);
}
MCInst *BinaryBasicBlock::getTerminatorBefore(MCInst *Pos) {
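// Walk backwards from the end of the block. Once Pos is reached (or right
// away, if Pos is null), record every terminator seen, so the final value
// is the first terminator, in layout order, that precedes Pos.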
auto &BC = Function->getBinaryContext();
auto Itr = rbegin();
bool Check = Pos == nullptr;
MCInst *FirstTerminator{nullptr};
while (Itr != rend()) {
if (!Check) {
if (&*Itr == Pos)
Check = true;
++Itr;
continue;
}
if (BC.MIA->isTerminator(*Itr))
FirstTerminator = &*Itr;
++Itr;
}
return FirstTerminator;
}
bool BinaryBasicBlock::hasTerminatorAfter(MCInst *Pos) {
auto &BC = Function->getBinaryContext();
auto Itr = rbegin();
while (Itr != rend()) {
if (&*Itr == Pos)
return false;
if (BC.MIA->isTerminator(*Itr))
return true;
++Itr;
}
return false;
}
bool BinaryBasicBlock::swapConditionalSuccessors() {
if (succ_size() != 2)
return false;


@@ -617,20 +617,26 @@ public:
return Instructions.erase(II);
}
/// Retrieve iterator for \p Inst or return end iterator if instruction is not
/// from this basic block.
decltype(Instructions)::iterator findInstruction(const MCInst *Inst) {
if (Instructions.empty())
return Instructions.end();
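// Instructions are stored contiguously, so the iterator can be recovered
// with pointer arithmetic; an out-of-range index means Inst does not belong
// to this basic block.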
size_t Index = Inst - &Instructions[0];
return Index >= Instructions.size() ? Instructions.end()
: Instructions.begin() + Index;
}
/// Replace an instruction with a sequence of instructions. Returns true
/// if the instruction to be replaced was found and replaced.
template <typename Itr>
bool replaceInstruction(const MCInst *Inst, Itr Begin, Itr End) {
auto I = Instructions.end();
auto B = Instructions.begin();
while (I > B) {
--I;
if (&*I == Inst) {
adjustNumPseudos(*Inst, -1);
Instructions.insert(Instructions.erase(I), Begin, End);
adjustNumPseudos(Begin, End, 1);
return true;
}
auto I = findInstruction(Inst);
if (I != Instructions.end()) {
adjustNumPseudos(*Inst, -1);
Instructions.insert(Instructions.erase(I), Begin, End);
adjustNumPseudos(Begin, End, 1);
return true;
}
return false;
}
@@ -640,6 +646,23 @@ public:
return replaceInstruction(Inst, Replacement.begin(), Replacement.end());
}
/// Insert \p NewInst before \p At, which must be an existing instruction in
/// this BB. Return a pointer to the newly inserted instruction.
iterator insertInstruction(iterator At, MCInst &&NewInst) {
adjustNumPseudos(NewInst, 1);
return Instructions.emplace(At, std::move(NewInst));
}
/// Helper to retrieve any terminators in \p BB before \p Pos. This is used
/// to skip CFI instructions and to retrieve the first terminator instruction
/// in basic blocks with two terminators (conditional jump and unconditional
/// jump).
MCInst *getTerminatorBefore(MCInst *Pos);
/// Used to identify whether an instruction is before a terminator and whether
/// moving it to the end of the BB would render it dead code.
bool hasTerminatorAfter(MCInst *Pos);
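
A minimal usage sketch for the two helpers above (a hypothetical caller, not
part of this commit; assumes NewInst is a fully formed instruction): insert
an instruction at the logical end of a block while keeping it ahead of any
terminators, so it neither becomes dead code nor follows a branch.

void insertBeforeTerminators(BinaryBasicBlock &BB, MCInst &&NewInst) {
  // First terminator in layout order, skipping CFI pseudos; null if none.
  if (MCInst *FirstTerm = BB.getTerminatorBefore(nullptr)) {
    BB.insertInstruction(BB.findInstruction(FirstTerm), std::move(NewInst));
    return;
  }
  BB.insertInstruction(BB.end(), std::move(NewInst));
}
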
/// Split apart the instructions in this basic block starting at Inst.
/// The instructions following Inst are removed and returned in a vector.
std::vector<MCInst> splitInstructions(const MCInst *Inst) {


@@ -239,24 +239,57 @@ void BinaryContext::preprocessDebugInfo(
}
}
void BinaryContext::printCFI(raw_ostream &OS, uint32_t Operation) {
switch(Operation) {
case MCCFIInstruction::OpSameValue: OS << "OpSameValue"; break;
case MCCFIInstruction::OpRememberState: OS << "OpRememberState"; break;
case MCCFIInstruction::OpRestoreState: OS << "OpRestoreState"; break;
case MCCFIInstruction::OpOffset: OS << "OpOffset"; break;
case MCCFIInstruction::OpDefCfaRegister: OS << "OpDefCfaRegister"; break;
case MCCFIInstruction::OpDefCfaOffset: OS << "OpDefCfaOffset"; break;
case MCCFIInstruction::OpDefCfa: OS << "OpDefCfa"; break;
case MCCFIInstruction::OpRelOffset: OS << "OpRelOffset"; break;
case MCCFIInstruction::OpAdjustCfaOffset: OS << "OfAdjustCfaOffset"; break;
case MCCFIInstruction::OpEscape: OS << "OpEscape"; break;
case MCCFIInstruction::OpRestore: OS << "OpRestore"; break;
case MCCFIInstruction::OpUndefined: OS << "OpUndefined"; break;
case MCCFIInstruction::OpRegister: OS << "OpRegister"; break;
case MCCFIInstruction::OpWindowSave: OS << "OpWindowSave"; break;
case MCCFIInstruction::OpGnuArgsSize: OS << "OpGnuArgsSize"; break;
default: OS << "Op#" << Operation; break;
void BinaryContext::printCFI(raw_ostream &OS, const MCCFIInstruction &Inst) {
uint32_t Operation = Inst.getOperation();
switch (Operation) {
case MCCFIInstruction::OpSameValue:
OS << "OpSameValue Reg" << Inst.getRegister();
break;
case MCCFIInstruction::OpRememberState:
OS << "OpRememberState";
break;
case MCCFIInstruction::OpRestoreState:
OS << "OpRestoreState";
break;
case MCCFIInstruction::OpOffset:
OS << "OpOffset Reg" << Inst.getRegister() << " " << Inst.getOffset();
break;
case MCCFIInstruction::OpDefCfaRegister:
OS << "OpDefCfaRegister Reg" << Inst.getRegister();
break;
case MCCFIInstruction::OpDefCfaOffset:
OS << "OpDefCfaOffset " << Inst.getOffset();
break;
case MCCFIInstruction::OpDefCfa:
OS << "OpDefCfa Reg" << Inst.getRegister() << " " << Inst.getOffset();
break;
case MCCFIInstruction::OpRelOffset:
OS << "OpRelOffset";
break;
case MCCFIInstruction::OpAdjustCfaOffset:
OS << "OfAdjustCfaOffset";
break;
case MCCFIInstruction::OpEscape:
OS << "OpEscape";
break;
case MCCFIInstruction::OpRestore:
OS << "OpRestore";
break;
case MCCFIInstruction::OpUndefined:
OS << "OpUndefined";
break;
case MCCFIInstruction::OpRegister:
OS << "OpRegister";
break;
case MCCFIInstruction::OpWindowSave:
OS << "OpWindowSave";
break;
case MCCFIInstruction::OpGnuArgsSize:
OS << "OpGnuArgsSize";
break;
default:
OS << "Op#" << Operation;
break;
}
}
@@ -274,7 +307,7 @@ void BinaryContext::printInstruction(raw_ostream &OS,
uint32_t Offset = Instruction.getOperand(0).getImm();
OS << "\t!CFI\t$" << Offset << "\t; ";
if (Function)
printCFI(OS, Function->getCFIFor(Instruction)->getOperation());
printCFI(OS, *Function->getCFIFor(Instruction));
OS << "\n";
return;
}


@@ -143,6 +143,12 @@ public:
const DataReader &DR;
/// Sum of execution count of all functions
uint64_t SumExecutionCount{0};
/// Number of functions with profile information
uint64_t NumProfiledFuncs{0};
BinaryContext(std::unique_ptr<MCContext> Ctx,
std::unique_ptr<DWARFContext> DwCtx,
std::unique_ptr<Triple> TheTriple,
@@ -262,8 +268,19 @@ public:
return Size;
}
/// Return a function execution count threshold for determining whether the
/// function is 'hot'. Consider it hot if its count is above the average exec
/// count of profiled functions.
uint64_t getHotThreshold() const {
static uint64_t Threshold{0};
if (Threshold == 0) {
Threshold = NumProfiledFuncs ? SumExecutionCount / NumProfiledFuncs : 1;
}
return Threshold;
}
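
For instance (a sketch mirroring the FOP_HOT gating added later in this
commit; BC is the BinaryContext and BF is assumed to be a profiled
BinaryFunction): with 1,000 profiled functions totaling 2,000,000
executions, the threshold is 2,000, and a pass can gate itself as follows.

if (BF.getKnownExecutionCount() < BC.getHotThreshold())
  return; // colder than the average profiled function; skip it
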
/// Print the string name for a CFI operation.
static void printCFI(raw_ostream &OS, uint32_t Operation);
static void printCFI(raw_ostream &OS, const MCCFIInstruction &Inst);
/// Print a single MCInst in native format. If Function is non-null,
/// the instruction will be annotated with CFI and possibly DWARF line table


@@ -150,7 +150,7 @@ constexpr unsigned NoRegister = 0;
constexpr const char *DynoStats::Desc[];
constexpr unsigned BinaryFunction::MinAlign;
namespace {
/// Gets debug line information for the instruction located at the given
@@ -535,8 +535,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation,
for (auto &Elmt : OffsetToCFI) {
OS << format(" %08x:\t", Elmt.first);
assert(Elmt.second < FrameInstructions.size() && "Incorrect CFI offset");
BinaryContext::printCFI(OS,
FrameInstructions[Elmt.second].getOperation());
BinaryContext::printCFI(OS, FrameInstructions[Elmt.second]);
OS << "\n";
}
} else {
@@ -544,7 +543,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation,
for (uint32_t I = 0, E = FrameInstructions.size(); I != E; ++I) {
const MCCFIInstruction &CFI = FrameInstructions[I];
OS << format(" %d:\t", I);
BinaryContext::printCFI(OS, CFI.getOperation());
BinaryContext::printCFI(OS, CFI);
OS << "\n";
}
}
@@ -3442,6 +3441,54 @@ void BinaryFunction::updateLayout(LayoutType Type,
updateLayoutIndices();
}
bool BinaryFunction::replaceJumpTableEntryIn(BinaryBasicBlock *BB,
BinaryBasicBlock *OldDest,
BinaryBasicBlock *NewDest) {
auto *Instr = BB->getLastNonPseudoInstr();
if (!Instr || !BC.MIA->isIndirectBranch(*Instr))
return false;
auto JTAddress = BC.MIA->getJumpTable(*Instr);
assert(JTAddress && "Invalid jump table address");
auto *JT = getJumpTableContainingAddress(JTAddress);
assert(JT && "No jump table structure for this indirect branch");
bool Patched = JT->replaceDestination(JTAddress, OldDest->getLabel(),
NewDest->getLabel());
assert(Patched && "Invalid entry to be replaced in jump table");
return true;
}
BinaryBasicBlock *BinaryFunction::splitEdge(BinaryBasicBlock *From,
BinaryBasicBlock *To) {
// Create intermediate BB
MCSymbol *Tmp = BC.Ctx->createTempSymbol("SplitEdge", true);
auto NewBB = createBasicBlock(0, Tmp);
auto NewBBPtr = NewBB.get();
// Update "From" BB
auto I = From->succ_begin();
auto BI = From->branch_info_begin();
for (; I != From->succ_end(); ++I) {
if (*I == To)
break;
++BI;
}
assert(I != From->succ_end() && "Invalid CFG edge in splitEdge!");
uint64_t OrigCount{BI->Count};
uint64_t OrigMispreds{BI->MispredictedCount};
replaceJumpTableEntryIn(From, To, NewBBPtr);
From->replaceSuccessor(To, NewBBPtr, OrigCount, OrigMispreds);
NewBB->addSuccessor(To, OrigCount, OrigMispreds);
NewBB->setExecutionCount(OrigCount);
NewBB->setIsCold(From->isCold());
// Update CFI and BB layout with new intermediate BB
std::vector<std::unique_ptr<BinaryBasicBlock>> NewBBs;
NewBBs.emplace_back(std::move(NewBB));
insertBasicBlocks(From, std::move(NewBBs), true, true);
return NewBBPtr;
}
bool BinaryFunction::isSymbolValidInScope(const SymbolRef &Symbol,
uint64_t SymbolSize) const {
// Some symbols are tolerated inside function bodies, others are not.
@@ -3578,6 +3625,22 @@ BinaryFunction::JumpTable::getEntriesForAddress(const uint64_t Addr) const {
return std::make_pair(StartIndex, EndIndex);
}
bool BinaryFunction::JumpTable::replaceDestination(uint64_t JTAddress,
const MCSymbol *OldDest,
MCSymbol *NewDest) {
bool Patched{false};
const auto Range = getEntriesForAddress(JTAddress);
for (auto I = &Entries[Range.first], E = &Entries[Range.second];
I != E; ++I) {
auto &Entry = *I;
if (Entry == OldDest) {
Patched = true;
Entry = NewDest;
}
}
return Patched;
}
void BinaryFunction::JumpTable::updateOriginal(BinaryContext &BC) {
// In non-relocation mode we have to emit jump tables in local sections.
// This way we only overwrite them when a corresponding function is


@@ -624,6 +624,11 @@ public:
/// Total number of times this jump table was used.
uint64_t Count{0};
/// Change all entries of the jump table in \p JTAddress pointing to
/// \p OldDest to \p NewDest. Return false if unsuccessful.
bool replaceDestination(uint64_t JTAddress, const MCSymbol *OldDest,
MCSymbol *NewDest);
/// Update jump table at its original location.
void updateOriginal(BinaryContext &BC);
@@ -1368,6 +1373,21 @@ public:
/// new blocks into the CFG. This must be called after updateLayout.
void updateCFIState(BinaryBasicBlock *Start, const unsigned NumNewBlocks);
/// Change \p OldDest to \p NewDest in the jump table used at the end of
/// \p BB. Returns false if \p OldDest couldn't be found as a valid target
/// and no replacement took place.
bool replaceJumpTableEntryIn(BinaryBasicBlock *BB,
BinaryBasicBlock *OldDest,
BinaryBasicBlock *NewDest);
/// Split the CFG edge <From, To> by inserting an intermediate basic block.
/// Returns a pointer to this new intermediate basic block. BB "From" will be
/// updated to jump to the intermediate block, which in turn will have an
/// unconditional branch to BB "To".
/// User needs to manually call fixBranches(). This function only creates the
/// correct CFG edges.
BinaryBasicBlock *splitEdge(BinaryBasicBlock *From, BinaryBasicBlock *To);
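
A short usage sketch (hypothetical caller; BF is the enclosing
BinaryFunction, and From/To are blocks joined by a CFG edge): shrink
wrapping uses this to carve out a spot on a cold edge before filling it
with spill code.

auto *Frontier = BF.splitEdge(From, To);
// ... insert saves/restores into Frontier here ...
BF.fixBranches(); // splitEdge leaves the branch instructions themselves stale
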
/// Determine direction of the branch based on the current layout.
/// The caller is responsible for updating basic block indices prior to using
/// this function (e.g. by calling BinaryFunction::updateLayoutIndices()).


@@ -10,6 +10,7 @@
//===----------------------------------------------------------------------===//
#include "BinaryPassManager.h"
#include "Passes/AllocCombiner.h"
#include "Passes/FrameOptimizer.h"
#include "Passes/IndirectCallPromotion.h"
#include "Passes/Inliner.h"
@@ -62,12 +63,6 @@ OptimizeBodylessFunctions("optimize-bodyless-functions",
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
static cl::opt<bool>
OptimizeFrameAccesses("frame-opt",
cl::desc("optimize stack frame accesses"),
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
static cl::opt<bool>
Peepholes("peepholes",
cl::desc("run peephole optimizations"),
@@ -331,9 +326,6 @@ void BinaryFunctionPassManager::runAllPasses(
// fix branches consistency internally.
Manager.registerPass(llvm::make_unique<FixupBranches>(PrintAfterBranchFixup));
Manager.registerPass(llvm::make_unique<FrameOptimizerPass>(PrintFOP),
OptimizeFrameAccesses);
// This pass should come close to last since it uses the estimated hot
// size of a function to determine the order. It should definitely
// also happen after any changes to the call graph are made, e.g. inlining.
@@ -356,6 +348,14 @@ void BinaryFunctionPassManager::runAllPasses(
// This pass should always run last.*
Manager.registerPass(llvm::make_unique<FinalizeFunctions>(PrintFinalized));
// FrameOptimizer has an implicit dependency on FinalizeFunctions.
// FrameOptimizer moves values around and needs to update CFIs. To do this, it
// must read CFI, interpret it and rewrite it, so CFIs need to be correctly
// placed according to the final layout.
Manager.registerPass(llvm::make_unique<FrameOptimizerPass>(PrintFOP));
Manager.registerPass(llvm::make_unique<AllocCombinerPass>(PrintFOP));
// *except for this pass. This pass turns tail calls into jumps which
// makes them invisible to function reordering.
Manager.registerPass(


@@ -0,0 +1,116 @@
#include "AllocCombiner.h"
#define DEBUG_TYPE "alloccombiner"
using namespace llvm;
namespace opts {
extern bool shouldProcess(const bolt::BinaryFunction &Function);
extern cl::opt<bolt::FrameOptimizationType> FrameOptimization;
} // end namespace opts
namespace llvm {
namespace bolt {
namespace {
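// Evaluate Inst symbolically with the stack pointer seeded to zero; on
// success, Adjustment holds the net displacement Inst applies to SP.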
bool getStackAdjustmentSize(const BinaryContext &BC, const MCInst &Inst,
int64_t &Adjustment) {
return BC.MIA->evaluateSimple(Inst, Adjustment,
std::make_pair(BC.MIA->getStackPointer(), 0LL),
std::make_pair(0, 0LL));
}
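// Return true if Inst neither defines nor uses the stack pointer (CFI
// pseudos never do), i.e. it cannot interfere with merging the stack
// adjustments around it.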
bool isIndifferentToSP(const MCInst &Inst, const BinaryContext &BC) {
if (BC.MIA->isCFI(Inst))
return true;
const auto II = BC.MII->get(Inst.getOpcode());
if (BC.MIA->isTerminator(Inst) ||
II.hasImplicitDefOfPhysReg(BC.MIA->getStackPointer(), BC.MRI.get()) ||
II.hasImplicitUseOfPhysReg(BC.MIA->getStackPointer()))
return false;
for (int I = 0, E = Inst.getNumOperands(); I != E; ++I) {
const auto &Operand = Inst.getOperand(I);
if (Operand.isReg() && Operand.getReg() == BC.MIA->getStackPointer()) {
return false;
}
}
return true;
}
bool shouldProc(BinaryFunction &Function) {
return Function.isSimple() && Function.hasCFG() &&
opts::shouldProcess(Function) && (Function.getSize() > 0);
}
void runForAllWeCare(std::map<uint64_t, BinaryFunction> &BFs,
std::function<void(BinaryFunction &)> Task) {
for (auto &It : BFs) {
auto &Function = It.second;
if (shouldProc(Function))
Task(Function);
}
}
} // end anonymous namespace
void AllocCombinerPass::combineAdjustments(BinaryContext &BC,
BinaryFunction &BF) {
for (auto &BB : BF) {
MCInst *Prev = nullptr;
for (auto I = BB.rbegin(), E = BB.rend(); I != E; ++I) {
auto &Inst = *I;
if (isIndifferentToSP(Inst, BC))
continue; // Skip updating Prev
int64_t Adjustment{0LL};
if (!Prev || !BC.MIA->isStackAdjustment(Inst) ||
!BC.MIA->isStackAdjustment(*Prev) ||
!getStackAdjustmentSize(BC, *Prev, Adjustment)) {
Prev = &Inst;
continue;
}
DEBUG({
dbgs() << "At \"" << BF.getPrintName() << "\", combining: \n";
Inst.dump();
Prev->dump();
dbgs() << "Adjustment: " << Adjustment << "\n";
});
if (BC.MIA->isSUB(Inst))
Adjustment = -Adjustment;
BC.MIA->addToImm(Inst, Adjustment, BC.Ctx.get());
DEBUG({
dbgs() << "After adjustment:\n";
Inst.dump();
});
BB.eraseInstruction(Prev);
++NumCombined;
Prev = &Inst;
}
}
}
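
As an illustration of the rewrite combineAdjustments performs (a sketch;
exact opcodes and immediates depend on the target):

  sub $0x18, %rsp        =>   sub $0x20, %rsp
  sub $0x8,  %rsp             (second adjustment erased)
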
void AllocCombinerPass::runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) {
if (opts::FrameOptimization == FOP_NONE)
return;
runForAllWeCare(
BFs, [&](BinaryFunction &Function) { combineAdjustments(BC, Function); });
outs() << "BOLT-INFO: Allocation combiner: " << NumCoalesced
<< " empty spaces coalesced.\n";
}
} // end namespace bolt
} // end namespace llvm


@@ -0,0 +1,48 @@
//===--- Passes/AllocCombiner.h -------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_ALLOCCOMBINER_H
#define LLVM_TOOLS_LLVM_BOLT_PASSES_ALLOCCOMBINER_H
#include "BinaryPasses.h"
#include "DataflowInfoManager.h"
namespace llvm {
namespace bolt {
class AllocCombinerPass : public BinaryFunctionPass {
/// Stats aggregating variables
uint64_t NumCombined{0};
uint64_t NumCoalesced{0};
void combineAdjustments(BinaryContext &BC, BinaryFunction &BF);
void coalesceEmptySpace(BinaryContext &BC, BinaryFunction &BF,
DataflowInfoManager &Info, FrameAnalysis &FA);
public:
explicit AllocCombinerPass(const cl::opt<bool> &PrintPass)
: BinaryFunctionPass(PrintPass) {}
const char *getName() const override {
return "alloc-combiner";
}
/// Pass entry point
void runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &LargeFunctions) override;
};
} // namespace bolt
} // namespace llvm
#endif


@@ -584,9 +584,11 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC,
auto BI = PredBB->branch_info_begin();
std::swap(*BI, *(BI + 1));
} else {
// Change destination of the unconditional branch.
// Change destination of the conditional branch.
MIA->replaceBranchTarget(*CondBranch, CalleeSymbol, BC.Ctx.get());
}
// Annotate it, so "isCall" returns true for this jcc
MIA->addAnnotation(BC.Ctx.get(), *CondBranch, "IsCTC", true);
// Remove the unused successor which may be eliminated later
// if there are no other users.


@@ -359,6 +359,12 @@ public:
std::set<uint64_t> &LargeFunctions) override;
};
enum FrameOptimizationType : char {
FOP_NONE, /// Don't perform FOP.
FOP_HOT, /// Perform FOP on hot functions.
FOP_ALL /// Perform FOP on all functions.
};
} // namespace bolt
} // namespace llvm


@@ -1,4 +1,5 @@
add_llvm_library(LLVMBOLTPasses
AllocCombiner.cpp
BinaryPasses.cpp
BinaryFunctionCallGraph.cpp
CallGraph.cpp
@@ -14,7 +15,11 @@ add_llvm_library(LLVMBOLTPasses
PettisAndHansen.cpp
ReorderAlgorithm.cpp
ReorderFunctions.cpp
ShrinkWrapping.cpp
StackAllocationAnalysis.cpp
StackAvailableExpressions.cpp
StackPointerTracking.cpp
StackReachingUses.cpp
)
include_directories( ${LLVM_MAIN_SRC_DIR}/tools/llvm-bolt )


@@ -265,12 +265,13 @@ public:
return getStateAt(*Point.getInst());
}
/// Relies on a pointer map to fetch the previous instruction and then
/// retrieve state. WARNING: watch out for invalidated pointers. Do not use
/// this function if you invalidated pointers after the analysis completed.
ErrorOr<const StateTy &> getStateBefore(const MCInst &Point) {
return getStateAt(PrevPoint[&Point]);
}
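
A sketch of the hazard this warning describes (hypothetical pass code;
Analysis is a completed dataflow analysis, Inst an instruction of BB, and
NewInst an instruction being inserted):

auto StateOr = Analysis.getStateBefore(Inst); // keyed internally on &Inst
BB.insertInstruction(BB.findInstruction(&Inst), std::move(NewInst));
// The insertion may reallocate the block's instruction storage; &Inst and
// the analysis' pointer map are now stale, so re-run the analysis before
// querying state again.
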
/// Return the in set (out set) of a given program point if the direction of
/// the dataflow is forward (backward).
ErrorOr<const StateTy &>getStateBefore(ProgramPoint Point) {
if (Point.isBB())
return getStateAt(*Point.getBB());
@@ -491,6 +492,25 @@ public:
/// Maps expressions defs (MCInsts) to its index in the Expressions vector
std::unordered_map<const MCInst *, uint64_t> ExprToIdx;
/// Return whether \p Expr is in the state set at \p Point
bool count(ProgramPoint Point, const MCInst &Expr) const {
auto IdxIter = ExprToIdx.find(&Expr);
assert(IdxIter != ExprToIdx.end() && "Invalid Expr");
return (*this->getStateAt(Point))[IdxIter->second];
}
bool count(const MCInst &Point, const MCInst &Expr) const {
auto IdxIter = ExprToIdx.find(&Expr);
assert(IdxIter != ExprToIdx.end() && "Invalid Expr");
return (*this->getStateAt(Point))[IdxIter->second];
}
/// Return whether \p Expr is in the state set at the instr of index
/// \p PointIdx
bool count(unsigned PointIdx, const MCInst &Expr) const {
return count(*Expressions[PointIdx], Expr);
}
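
A small sketch of a client-side query (assuming RD is an analysis derived
from InstrsDataflowAnalysis that has already run, P a program point, and Def
a tracked instruction):

if (RD.count(P, Def)) {
  // the tracked definition Def is in the dataflow state at point P
}
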
InstrsDataflowAnalysis(const BinaryContext &BC, BinaryFunction &BF)
: DataflowAnalysis<Derived, BitVector, Backward>(BC, BF) {}
virtual ~InstrsDataflowAnalysis() {}


@@ -20,10 +20,7 @@ ReachingDefOrUse</*Def=*/true> &DataflowInfoManager::getReachingDefs() {
return *RD;
assert(FA && "FrameAnalysis required");
RD.reset(new ReachingDefOrUse<true>(*FA, BC, BF));
{
NamedRegionTimer T1("RD", "Dataflow", true);
RD->run();
}
RD->run();
return *RD;
}
@@ -36,10 +33,7 @@ ReachingDefOrUse</*Def=*/false> &DataflowInfoManager::getReachingUses() {
return *RU;
assert(FA && "FrameAnalysis required");
RU.reset(new ReachingDefOrUse<false>(*FA, BC, BF));
{
NamedRegionTimer T1("RU", "Dataflow", true);
RU->run();
}
RU->run();
return *RU;
}
@@ -52,10 +46,7 @@ LivenessAnalysis &DataflowInfoManager::getLivenessAnalysis() {
return *LA;
assert(FA && "FrameAnalysis required");
LA.reset(new LivenessAnalysis(*FA, BC, BF));
{
NamedRegionTimer T1("LA", "Dataflow", true);
LA->run();
}
LA->run();
return *LA;
}
@@ -63,14 +54,24 @@ void DataflowInfoManager::invalidateLivenessAnalysis() {
LA.reset(nullptr);
}
StackReachingUses &DataflowInfoManager::getStackReachingUses() {
if (SRU)
return *SRU;
assert(FA && "FrameAnalysis required");
SRU.reset(new StackReachingUses(*FA, BC, BF));
SRU->run();
return *SRU;
}
void DataflowInfoManager::invalidateStackReachingUses() {
SRU.reset(nullptr);
}
DominatorAnalysis<false> &DataflowInfoManager::getDominatorAnalysis() {
if (DA)
return *DA;
DA.reset(new DominatorAnalysis<false>(BC, BF));
{
NamedRegionTimer T1("DA", "Dataflow", true);
DA->run();
}
DA->run();
return *DA;
}
@@ -82,10 +83,7 @@ DominatorAnalysis<true> &DataflowInfoManager::getPostDominatorAnalysis() {
if (PDA)
return *PDA;
PDA.reset(new DominatorAnalysis<true>(BC, BF));
{
NamedRegionTimer T1("PDA", "Dataflow", true);
PDA->run();
}
PDA->run();
return *PDA;
}
@@ -97,14 +95,12 @@ StackPointerTracking &DataflowInfoManager::getStackPointerTracking() {
if (SPT)
return *SPT;
SPT.reset(new StackPointerTracking(BC, BF));
{
NamedRegionTimer T1("SPT", "Dataflow", true);
SPT->run();
}
SPT->run();
return *SPT;
}
void DataflowInfoManager::invalidateStackPointerTracking() {
invalidateStackAllocationAnalysis();
SPT.reset(nullptr);
}
@@ -112,10 +108,7 @@ ReachingInsns<false> &DataflowInfoManager::getReachingInsns() {
if (RI)
return *RI;
RI.reset(new ReachingInsns<false>(BC, BF));
{
NamedRegionTimer T1("RI", "Dataflow", true);
RI->run();
}
RI->run();
return *RI;
}
@@ -127,10 +120,7 @@ ReachingInsns<true> &DataflowInfoManager::getReachingInsnsBackwards() {
if (RIB)
return *RIB;
RIB.reset(new ReachingInsns<true>(BC, BF));
{
NamedRegionTimer T1("RIB", "Dataflow", true);
RIB->run();
}
RIB->run();
return *RIB;
}
@@ -138,6 +128,18 @@ void DataflowInfoManager::invalidateReachingInsnsBackwards() {
RIB.reset(nullptr);
}
StackAllocationAnalysis &DataflowInfoManager::getStackAllocationAnalysis() {
if (SAA)
return *SAA;
SAA.reset(new StackAllocationAnalysis(BC, BF, getStackPointerTracking()));
SAA->run();
return *SAA;
}
void DataflowInfoManager::invalidateStackAllocationAnalysis() {
SAA.reset(nullptr);
}
std::unordered_map<const MCInst *, BinaryBasicBlock *> &
DataflowInfoManager::getInsnToBBMap() {
if (InsnToBB)
@@ -158,11 +160,13 @@ void DataflowInfoManager::invalidateAll() {
invalidateReachingDefs();
invalidateReachingUses();
invalidateLivenessAnalysis();
invalidateStackReachingUses();
invalidateDominatorAnalysis();
invalidatePostDominatorAnalysis();
invalidateStackPointerTracking();
invalidateReachingInsns();
invalidateReachingInsnsBackwards();
invalidateStackAllocationAnalysis();
invalidateInsnToBBMap();
}


@@ -14,10 +14,12 @@
#include "FrameAnalysis.h"
#include "ReachingDefOrUse.h"
#include "StackReachingUses.h"
#include "DominatorAnalysis.h"
#include "StackPointerTracking.h"
#include "ReachingInsns.h"
#include "LivenessAnalysis.h"
#include "StackAllocationAnalysis.h"
namespace llvm {
namespace bolt {
@@ -33,11 +35,13 @@ class DataflowInfoManager {
std::unique_ptr<ReachingDefOrUse</*Def=*/true>> RD;
std::unique_ptr<ReachingDefOrUse</*Def=*/false>> RU;
std::unique_ptr<LivenessAnalysis> LA;
std::unique_ptr<StackReachingUses> SRU;
std::unique_ptr<DominatorAnalysis</*Bwd=*/false>> DA;
std::unique_ptr<DominatorAnalysis</*Bwd=*/true>> PDA;
std::unique_ptr<StackPointerTracking> SPT;
std::unique_ptr<ReachingInsns<false>> RI;
std::unique_ptr<ReachingInsns<true>> RIB;
std::unique_ptr<StackAllocationAnalysis> SAA;
std::unique_ptr<std::unordered_map<const MCInst *, BinaryBasicBlock *>>
InsnToBB;
@@ -45,12 +49,20 @@ public:
DataflowInfoManager(const FrameAnalysis *FA, const BinaryContext &BC,
BinaryFunction &BF) : FA(FA), BC(BC), BF(BF) {};
/// Helper function to fetch the parent BB associated with a program point.
/// If PP is a BB itself, then return it (cast to a BinaryBasicBlock).
BinaryBasicBlock *getParentBB(ProgramPoint PP) {
return PP.isBB() ? PP.getBB() : getInsnToBBMap()[PP.getInst()];
}
ReachingDefOrUse</*Def=*/true> &getReachingDefs();
void invalidateReachingDefs();
ReachingDefOrUse</*Def=*/false> &getReachingUses();
void invalidateReachingUses();
LivenessAnalysis &getLivenessAnalysis();
void invalidateLivenessAnalysis();
StackReachingUses &getStackReachingUses();
void invalidateStackReachingUses();
DominatorAnalysis<false> &getDominatorAnalysis();
void invalidateDominatorAnalysis();
DominatorAnalysis<true> &getPostDominatorAnalysis();
@@ -61,6 +73,8 @@ public:
void invalidateReachingInsns();
ReachingInsns<true> &getReachingInsnsBackwards();
void invalidateReachingInsnsBackwards();
StackAllocationAnalysis &getStackAllocationAnalysis();
void invalidateStackAllocationAnalysis();
std::unordered_map<const MCInst *, BinaryBasicBlock *> &getInsnToBBMap();
void invalidateInsnToBBMap();
void invalidateAll();


@@ -13,6 +13,7 @@
#define LLVM_TOOLS_LLVM_BOLT_PASSES_DOMINATORANALYSIS_H
#include "DataflowAnalysis.h"
#include "llvm/Support/Timer.h"
namespace llvm {
namespace bolt {
@@ -60,13 +61,21 @@ public:
return Result;
}
bool doesADominatesB(const MCInst &A, const MCInst &B) {
return (*this->getStateAt(B))[this->ExprToIdx[&A]];
bool doesADominateB(const MCInst &A, unsigned BIdx) {
return this->count(BIdx, A);
}
bool doesADominatesB(ProgramPoint A, const MCInst &B) {
bool doesADominateB(const MCInst &A, const MCInst &B) {
return this->count(B, A);
}
bool doesADominateB(const MCInst &A, ProgramPoint B) {
return this->count(B, A);
}
bool doesADominateB(ProgramPoint A, const MCInst &B) {
if (A.isInst())
return doesADominatesB(*A.getInst(), B);
return doesADominateB(*A.getInst(), B);
// This analysis keeps track of which instructions dominate other
// instructions; it doesn't keep track of BBs. So we need a non-empty
@@ -79,7 +88,7 @@ public:
BB = *BB->succ_begin();
}
const MCInst &InstA = *BB->begin();
return doesADominatesB(InstA, B);
return doesADominateB(InstA, B);
}
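
A usage sketch (assuming DA is a DominatorAnalysis<false> that has been
run): shrink wrapping relies on this query to prove that a chosen save point
covers a given frame access.

if (DA.doesADominateB(SavePoint, FrameAccess)) {
  // every path reaching FrameAccess passes through SavePoint first
}
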
void doForAllDominators(const MCInst &Inst,
@@ -89,6 +98,11 @@ public:
}
}
void run() {
NamedRegionTimer T1("DA", "Dataflow", true);
InstrsDataflowAnalysis<DominatorAnalysis<Backward>, Backward>::run();
}
private:
void preflight() {
// Populate our universe of tracked expressions with all instructions


@@ -215,6 +215,12 @@ public:
void FrameAnalysis::addArgAccessesFor(const BinaryContext &BC, MCInst &Inst,
ArgAccesses &&AA) {
if (auto OldAA = getArgAccessesFor(BC, Inst)) {
if (OldAA->AssumeEverything)
return;
*OldAA = std::move(AA);
return;
}
if (AA.AssumeEverything) {
// Index 0 in ArgAccessesVector represents an "AssumeEverything" entry
BC.MIA->addAnnotation(BC.Ctx.get(), Inst, "ArgAccessEntry", 0U);
@@ -222,7 +228,7 @@ void FrameAnalysis::addArgAccessesFor(const BinaryContext &BC, MCInst &Inst,
}
BC.MIA->addAnnotation(BC.Ctx.get(), Inst, "ArgAccessEntry",
(unsigned)ArgAccessesVector.size());
ArgAccessesVector.emplace_back(AA);
ArgAccessesVector.emplace_back(std::move(AA));
}
void FrameAnalysis::addArgInStackAccessFor(const BinaryContext &BC,
@@ -329,29 +335,39 @@ BitVector FrameAnalysis::getFunctionClobberList(const BinaryContext &BC,
void FrameAnalysis::buildClobberMap(const BinaryContext &BC) {
std::queue<BinaryFunction *> Queue;
std::set<BinaryFunction *> InQueue;
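// Worklist-driven fixed point over the call graph: whenever a callee's
// clobber set or argument-access info changes, its callers are re-queued;
// InQueue keeps each function in the queue at most once.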
for (auto *Func : TopologicalCGOrder) {
Queue.push(Func);
InQueue.insert(Func);
}
while (!Queue.empty()) {
auto *Func = Queue.front();
Queue.pop();
InQueue.erase(Func);
BitVector RegsKilled = getFunctionClobberList(BC, Func);
bool Updated = ClobberAnalysisOnly ? false : computeArgsAccessed(BC, *Func);
bool ArgsUpdated = ClobberAnalysisOnly ? false : computeArgsAccessed(BC, *Func);
bool RegsUpdated = false;
if (RegsKilledMap.find(Func) == RegsKilledMap.end()) {
RegsKilledMap[Func] = std::move(RegsKilled);
continue;
} else {
RegsUpdated = RegsKilledMap[Func] != RegsKilled;
if (RegsUpdated)
RegsKilledMap[Func] = std::move(RegsKilled);
}
if (RegsKilledMap[Func] != RegsKilled || Updated) {
if (RegsUpdated || ArgsUpdated) {
for (auto Caller : Cg.predecessors(Cg.getNodeId(Func))) {
Queue.push(Cg.nodeIdToFunc(Caller));
BinaryFunction *CallerFunc = Cg.nodeIdToFunc(Caller);
if (!InQueue.count(CallerFunc)) {
InQueue.insert(CallerFunc);
Queue.push(CallerFunc);
}
}
}
RegsKilledMap[Func] = std::move(RegsKilled);
}
if (opts::Verbosity == 0) {
@@ -453,10 +469,11 @@ bool FrameAnalysis::updateArgsTouchedFor(const BinaryContext &BC,
break;
}
DEBUG(dbgs() << "Added arg in stack access annotation "
<< CurOffset + Elem.first << "\n");
<< CurOffset + Elem.first << "\n");
addArgInStackAccessFor(
BC, Inst, ArgInStackAccess{/*StackOffset=*/CurOffset + Elem.first,
/*Size=*/Elem.second});
BC, Inst,
ArgInStackAccess{/*StackOffset=*/CurOffset + Elem.first,
/*Size=*/Elem.second});
}
return Changed;
}


@@ -10,6 +10,11 @@
//===----------------------------------------------------------------------===//
#include "FrameOptimizer.h"
#include "FrameAnalysis.h"
#include "ShrinkWrapping.h"
#include "StackAvailableExpressions.h"
#include "StackReachingUses.h"
#include "llvm/Support/Timer.h"
#include <queue>
#include <unordered_map>
@@ -19,616 +24,34 @@ using namespace llvm;
namespace opts {
extern cl::opt<unsigned> Verbosity;
}
extern cl::OptionCategory BoltOptCategory;
using namespace bolt;
cl::opt<FrameOptimizationType>
FrameOptimization("frame-opt",
cl::init(FOP_NONE),
cl::desc("optimize stack frame accesses"),
cl::values(
clEnumValN(FOP_NONE, "none", "do not perform frame optimization"),
clEnumValN(FOP_HOT, "hot", "perform FOP on hot functions"),
clEnumValN(FOP_ALL, "all", "perform FOP on all functions"),
clEnumValEnd),
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
} // namespace opts
namespace llvm {
namespace bolt {
void FrameOptimizerPass::getInstClobberList(const BinaryContext &BC,
const MCInst &Inst,
BitVector &KillSet) const {
if (!BC.MIA->isCall(Inst)) {
BC.MIA->getClobberedRegs(Inst, KillSet, *BC.MRI);
return;
}
const auto *TargetSymbol = BC.MIA->getTargetSymbol(Inst);
// If indirect call, kill set should have all elements
if (TargetSymbol == nullptr) {
KillSet.set(0, KillSet.size());
return;
}
const auto *Function = BC.getFunctionForSymbol(TargetSymbol);
if (Function == nullptr) {
// Call to a function without a BinaryFunction object.
// This should be a call to a PLT entry, and since it is a trampoline to
// a DSO, we can't really know the code in advance. Conservatively assume
// everything is clobbered.
KillSet.set(0, KillSet.size());
return;
}
auto BV = RegsKilledMap.find(Function);
if (BV != RegsKilledMap.end()) {
KillSet |= BV->second;
return;
}
// Ignore calls to function whose clobber list wasn't yet calculated. This
// instruction will be evaluated again once we have info for the callee.
return;
}
BitVector
FrameOptimizerPass::getFunctionClobberList(const BinaryContext &BC,
const BinaryFunction *Func) {
BitVector RegsKilled = BitVector(BC.MRI->getNumRegs(), false);
if (!Func->isSimple() || !shouldOptimize(*Func)) {
RegsKilled.set(0, RegsKilled.size());
return RegsKilled;
}
for (const auto &BB : *Func) {
for (const auto &Inst : BB) {
getInstClobberList(BC, Inst, RegsKilled);
}
}
return RegsKilled;
}
void FrameOptimizerPass::buildClobberMap(const BinaryContext &BC) {
std::queue<const BinaryFunction *> Queue;
for (auto *Func : TopologicalCGOrder) {
Queue.push(Func);
}
while (!Queue.empty()) {
auto *Func = Queue.front();
Queue.pop();
BitVector RegsKilled = getFunctionClobberList(BC, Func);
if (RegsKilledMap.find(Func) == RegsKilledMap.end()) {
RegsKilledMap[Func] = std::move(RegsKilled);
continue;
}
if (RegsKilledMap[Func] != RegsKilled) {
for (auto Caller : Cg.predecessors(Cg.getNodeId(Func))) {
Queue.push(Cg.nodeIdToFunc(Caller));
}
}
RegsKilledMap[Func] = std::move(RegsKilled);
}
if (opts::Verbosity == 0) {
#ifndef NDEBUG
if (!DebugFlag || !isCurrentDebugType("fop"))
return;
#else
return;
#endif
}
// This loop is for computing statistics only
for (auto *Func : TopologicalCGOrder) {
auto Iter = RegsKilledMap.find(Func);
assert(Iter != RegsKilledMap.end() &&
"Failed to compute all clobbers list");
if (Iter->second.all()) {
auto Count = Func->getExecutionCount();
if (Count != BinaryFunction::COUNT_NO_PROFILE)
CountFunctionsAllClobber += Count;
++NumFunctionsAllClobber;
}
DEBUG_WITH_TYPE("fop",
dbgs() << "Killed regs set for func: " << Func->getPrintName() << "\n";
const BitVector &RegsKilled = Iter->second;
int RegIdx = RegsKilled.find_first();
while (RegIdx != -1) {
dbgs() << "\tREG" << RegIdx;
RegIdx = RegsKilled.find_next(RegIdx);
};
dbgs() << "\n";
);
}
}
namespace {
template <typename StateTy>
class ForwardDataflow {
protected:
/// Reference to the function being analysed
const BinaryFunction &Func;
/// Tracks the set of available exprs at the end of each MCInst in this
/// function
std::unordered_map<const MCInst *, StateTy> StateAtPoint;
/// Tracks the set of available exprs at basic block start
std::unordered_map<const BinaryBasicBlock *, StateTy> StateAtBBEntry;
virtual void preflight() = 0;
virtual StateTy getStartingStateAtBB(const BinaryBasicBlock &BB) = 0;
virtual StateTy getStartingStateAtPoint(const MCInst &Point) = 0;
virtual void doConfluence(StateTy &StateOut, const StateTy &StateIn) = 0;
virtual StateTy computeNext(const MCInst &Point, const StateTy &Cur) = 0;
public:
ForwardDataflow(const BinaryFunction &BF) : Func(BF) {}
virtual ~ForwardDataflow() {}
ErrorOr<const StateTy &>getStateAt(const BinaryBasicBlock &BB) const {
auto Iter = StateAtBBEntry.find(&BB);
if (Iter == StateAtBBEntry.end())
return make_error_code(errc::result_out_of_range);
return Iter->second;
}
ErrorOr<const StateTy &>getStateAt(const MCInst &Point) const {
auto Iter = StateAtPoint.find(&Point);
if (Iter == StateAtPoint.end())
return make_error_code(errc::result_out_of_range);
return Iter->second;
}
void run() {
preflight();
// Initialize state for all points of the function
for (auto &BB : Func) {
StateAtBBEntry[&BB] = getStartingStateAtBB(BB);
for (auto &Inst : BB) {
StateAtPoint[&Inst] = getStartingStateAtPoint(Inst);
}
}
assert(Func.begin() != Func.end() && "Unexpected empty function");
std::queue<const BinaryBasicBlock *> Worklist;
// TODO: Pushing this in a DFS ordering will greatly speed up the dataflow
// performance.
for (auto &BB : Func) {
Worklist.push(&BB);
}
// Main dataflow loop
while (!Worklist.empty()) {
auto *BB = Worklist.front();
Worklist.pop();
DEBUG(dbgs() << "\tNow at BB " << BB->getName() << "\n");
// Calculate state at the entry of first instruction in BB
StateTy &StateAtEntry = StateAtBBEntry[BB];
for (auto I = BB->pred_begin(), E = BB->pred_end(); I != E; ++I) {
auto Last = (*I)->rbegin();
if (Last != (*I)->rend()) {
doConfluence(StateAtEntry, StateAtPoint[&*Last]);
} else {
doConfluence(StateAtEntry, StateAtBBEntry[*I]);
}
}
// Skip empty
if (BB->begin() == BB->end())
continue;
// Propagate information from first instruction down to the last one
bool Changed = false;
StateTy *PrevState = &StateAtEntry;
const MCInst *LAST = &*BB->rbegin();
for (auto &Inst : *BB) {
DEBUG(dbgs() << "\t\tNow at ");
DEBUG(Inst.dump());
StateTy CurState = computeNext(Inst, *PrevState);
if (StateAtPoint[&Inst] != CurState) {
StateAtPoint[&Inst] = CurState;
if (&Inst == LAST)
Changed = true;
}
PrevState = &StateAtPoint[&Inst];
}
if (Changed) {
for (auto I = BB->succ_begin(), E = BB->succ_end(); I != E; ++I) {
Worklist.push(*I);
}
}
}
}
};
class StackAvailableExpressions : public ForwardDataflow<BitVector> {
public:
StackAvailableExpressions(const FrameOptimizerPass &FOP,
const BinaryContext &BC, const BinaryFunction &BF)
: ForwardDataflow(BF), FOP(FOP), FrameIndexMap(FOP.FrameIndexMap),
BC(BC) {}
virtual ~StackAvailableExpressions() {}
/// Define an iterator for navigating the expressions calculated by the
/// dataflow at each program point
class ExprIterator
: public std::iterator<std::forward_iterator_tag, const MCInst *> {
public:
ExprIterator &operator++() {
assert(Idx != -1 && "Iterator already at the end");
Idx = BV->find_next(Idx);
return *this;
}
ExprIterator operator++(int) {
assert(Idx != -1 && "Iterator already at the end");
ExprIterator Ret = *this;
++(*this);
return Ret;
}
bool operator==(ExprIterator Other) const { return Idx == Other.Idx; }
bool operator!=(ExprIterator Other) const { return Idx != Other.Idx; }
const MCInst *operator*() {
assert(Idx != -1 && "Invalid access to end iterator");
return Expressions[Idx];
}
ExprIterator(const BitVector *BV, const std::vector<const MCInst *> &Exprs)
: BV(BV), Expressions(Exprs) {
Idx = BV->find_first();
}
ExprIterator(const BitVector *BV, const std::vector<const MCInst *> &Exprs,
int Idx)
: BV(BV), Expressions(Exprs), Idx(Idx) {}
private:
const BitVector *BV;
const std::vector<const MCInst *> &Expressions;
public:
int Idx;
};
ExprIterator expr_begin(const BitVector &BV) const {
return ExprIterator(&BV, Expressions);
}
ExprIterator expr_begin(const MCInst &Point) const {
auto Iter = StateAtPoint.find(&Point);
if (Iter == StateAtPoint.end())
return expr_end();
return ExprIterator(&Iter->second, Expressions);
}
ExprIterator expr_begin(const BinaryBasicBlock &BB) const {
auto Iter = StateAtBBEntry.find(&BB);
if (Iter == StateAtBBEntry.end())
return expr_end();
return ExprIterator(&Iter->second, Expressions);
}
ExprIterator expr_end() const {
return ExprIterator(nullptr, Expressions, -1);
}
private:
/// Reference to the result of stack frame analysis
const FrameOptimizerPass &FOP;
const FrameOptimizerPass::FrameIndexMapTy &FrameIndexMap;
const BinaryContext &BC;
/// Used to size the set of expressions/definitions being tracked by the
/// dataflow analysis
uint64_t NumInstrs{0};
/// We put every MCInst we want to track (which one representing an
/// expression/def) into a vector because we need to associate them with
/// small numbers. They will be tracked via BitVectors throughout the
/// dataflow analysis.
std::vector<const MCInst *> Expressions;
/// Maps expressions defs (MCInsts) to its index in the Expressions vector
std::unordered_map<const MCInst *, uint64_t> ExprToIdx;
void preflight() override {
DEBUG(dbgs() << "Starting StackAvailableExpressions on \""
<< Func.getPrintName() << "\"\n");
// Populate our universe of tracked expressions. We are interested in
// tracking available stores to frame position at any given point of the
// program.
for (auto &BB : Func) {
for (auto &Inst : BB) {
auto FIEIter = FrameIndexMap.find(&Inst);
if (FIEIter == FrameIndexMap.end())
continue;
const auto &FIE = FIEIter->second;
if (FIE.IsLoad == false && FIE.IsSimple == true) {
Expressions.push_back(&Inst);
ExprToIdx[&Inst] = NumInstrs++;
}
}
}
}
BitVector getStartingStateAtBB(const BinaryBasicBlock &BB) override {
// Entry points start with empty set (Function entry and landing pads).
// All others start with the full set.
if (BB.pred_size() == 0)
return BitVector(NumInstrs, false);
return BitVector(NumInstrs, true);
}
BitVector getStartingStateAtPoint(const MCInst &Point) override {
return BitVector(NumInstrs, true);
}
void doConfluence(BitVector &StateOut, const BitVector &StateIn) override {
StateOut &= StateIn;
}
/// Define the function computing the kill set -- whether expression Y, a
/// tracked expression, will be considered to be dead after executing X.
bool doesXKillsY(const MCInst *X, const MCInst *Y) {
// if both are stores, and both store to the same stack location, return
// true
auto FIEIterX = FrameIndexMap.find(X);
auto FIEIterY = FrameIndexMap.find(Y);
if (FIEIterX != FrameIndexMap.end() && FIEIterY != FrameIndexMap.end()) {
const FrameOptimizerPass::FrameIndexEntry &FIEX = FIEIterX->second;
const FrameOptimizerPass::FrameIndexEntry &FIEY = FIEIterY->second;
if (FIEX.IsLoad == 0 && FIEY.IsLoad == 0 &&
FIEX.StackOffset + FIEX.Size > FIEY.StackOffset &&
FIEX.StackOffset < FIEY.StackOffset + FIEY.Size)
return true;
}
// getClobberedRegs for X and Y. If they intersect, return true
BitVector XClobbers = BitVector(BC.MRI->getNumRegs(), false);
BitVector YClobbers = BitVector(BC.MRI->getNumRegs(), false);
FOP.getInstClobberList(BC, *X, XClobbers);
// If Y is a store to stack, its clobber list is its source reg. This is
// different than the rest because we want to check if the store source
// reaches its corresponding load untouched.
if (FIEIterY != FrameIndexMap.end() && FIEIterY->second.IsLoad == 0 &&
FIEIterY->second.IsStoreFromReg) {
YClobbers.set(FIEIterY->second.RegOrImm);
} else {
FOP.getInstClobberList(BC, *Y, YClobbers);
}
XClobbers &= YClobbers;
return XClobbers.any();
}
BitVector computeNext(const MCInst &Point, const BitVector &Cur) override {
BitVector Next = Cur;
// Kill
for (auto I = expr_begin(Next), E = expr_end(); I != E; ++I) {
assert(*I != nullptr && "Lost pointers");
DEBUG(dbgs() << "\t\t\tDoes it kill ");
DEBUG((*I)->dump());
if (doesXKillsY(&Point, *I)) {
DEBUG(dbgs() << "\t\t\t\tYes\n");
Next.reset(I.Idx);
}
};
// Gen
auto FIEIter = FrameIndexMap.find(&Point);
if (FIEIter != FrameIndexMap.end() &&
FIEIter->second.IsLoad == false &&
FIEIter->second.IsSimple == true)
Next.set(ExprToIdx[&Point]);
return Next;
}
};
class StackPointerTracking : public ForwardDataflow<int> {
const BinaryContext &BC;
void preflight() override {
DEBUG(dbgs() << "Starting StackPointerTracking on \""
<< Func.getPrintName() << "\"\n");
}
int getStartingStateAtBB(const BinaryBasicBlock &BB) override {
// Entry BB start with offset 8 from CFA.
// All others start with EMPTY (meaning we don't know anything).
if (BB.isEntryPoint())
return -8;
return EMPTY;
}
int getStartingStateAtPoint(const MCInst &Point) override {
return EMPTY;
}
void doConfluence(int &StateOut, const int &StateIn) override {
if (StateOut == EMPTY) {
StateOut = StateIn;
return;
}
if (StateIn == EMPTY || StateIn == StateOut)
return;
// We can't agree on a specific value from this point on
StateOut = SUPERPOSITION;
}
int computeNext(const MCInst &Point, const int &Cur) override {
const auto &MIA = BC.MIA;
if (Cur == EMPTY || Cur == SUPERPOSITION)
return Cur;
if (int Sz = MIA->getPushSize(Point))
return Cur - Sz;
if (int Sz = MIA->getPopSize(Point))
return Cur + Sz;
if (BC.MII->get(Point.getOpcode())
.hasDefOfPhysReg(Point, MIA->getStackPointer(), *BC.MRI)) {
int64_t Offset = Cur;
if (!MIA->evaluateSimple(Point, Offset, std::make_pair(0, 0),
std::make_pair(0, 0)))
return SUPERPOSITION;
return static_cast<int>(Offset);
}
return Cur;
}
public:
StackPointerTracking(const BinaryContext &BC, const BinaryFunction &BF)
: ForwardDataflow(BF), BC(BC) {}
virtual ~StackPointerTracking() {}
static constexpr int SUPERPOSITION = std::numeric_limits<int>::max();
static constexpr int EMPTY = std::numeric_limits<int>::min();
};
} // anonymous namespace
bool FrameOptimizerPass::restoreFrameIndex(const BinaryContext &BC,
const BinaryFunction &BF) {
StackPointerTracking SPT(BC, BF);
SPT.run();
// Vars used for storing useful CFI info to give us a hint about how the stack
// is used in this function
int64_t CfaOffset{-8};
uint16_t CfaReg{7};
bool CfaRegLocked{false};
uint16_t CfaRegLockedVal{0};
std::stack<std::pair<int64_t, uint16_t>> CFIStack;
DEBUG(dbgs() << "Restoring frame indices for \"" << BF.getPrintName()
<< "\"\n");
// TODO: Implement SP tracking and improve this analysis
for (auto &BB : BF) {
DEBUG(dbgs() <<"\tNow at BB " << BB.getName() << "\n");
const MCInst *Prev = nullptr;
for (const auto &Inst : BB) {
int SPOffset = (Prev ? *SPT.getStateAt(*Prev) : *SPT.getStateAt(BB));
DEBUG({
dbgs() << "\t\tNow at ";
Inst.dump();
dbgs() << "\t\t\tSP offset is " << SPOffset << "\n";
});
Prev = &Inst;
// Use CFI information to keep track of which register is being used to
// access the frame
if (BC.MIA->isCFI(Inst)) {
const auto *CFI = BF.getCFIFor(Inst);
switch (CFI->getOperation()) {
case MCCFIInstruction::OpDefCfa:
CfaOffset = CFI->getOffset();
// Fall-through
case MCCFIInstruction::OpDefCfaRegister:
CfaReg = CFI->getRegister();
break;
case MCCFIInstruction::OpDefCfaOffset:
CfaOffset = CFI->getOffset();
break;
case MCCFIInstruction::OpRememberState:
CFIStack.push(std::make_pair(CfaOffset, CfaReg));
break;
case MCCFIInstruction::OpRestoreState: {
assert(!CFIStack.empty() && "Corrupt CFI stack");
auto &Elem = CFIStack.top();
CFIStack.pop();
CfaOffset = Elem.first;
CfaReg = Elem.second;
break;
}
case MCCFIInstruction::OpAdjustCfaOffset:
llvm_unreachable("Unhandled AdjustCfaOffset");
break;
default:
break;
}
continue;
}
if (BC.MIA->leaksStackAddress(Inst, *BC.MRI, false)) {
DEBUG(dbgs() << "Leaked stack address, giving up on this function.\n");
DEBUG(dbgs() << "Blame insn: ");
DEBUG(Inst.dump());
return false;
}
bool IsLoad = false;
bool IsStore = false;
bool IsStoreFromReg = false;
bool IsSimple = false;
int32_t SrcImm{0};
MCPhysReg Reg{0};
MCPhysReg StackPtrReg{0};
int64_t StackOffset{0};
uint8_t Size{0};
bool IsIndexed = false;
if (BC.MIA->isStackAccess(Inst, IsLoad, IsStore, IsStoreFromReg, Reg,
SrcImm, StackPtrReg, StackOffset, Size,
IsSimple, IsIndexed)) {
assert(Size != 0);
if (CfaRegLocked && CfaRegLockedVal != CfaReg) {
DEBUG(dbgs() << "CFA reg changed, giving up on this function.\n");
return false;
}
if (StackPtrReg != BC.MRI->getLLVMRegNum(CfaReg, /*isEH=*/false)) {
if (StackPtrReg != BC.MIA->getStackPointer() ||
SPOffset == SPT.EMPTY || SPOffset == SPT.SUPERPOSITION) {
DEBUG(dbgs()
<< "Found stack access with reg different than cfa reg.\n");
DEBUG(dbgs() << "\tCurrent CFA reg: " << CfaReg
<< "\n\tStack access reg: " << StackPtrReg << "\n");
DEBUG(dbgs() << "Blame insn: ");
DEBUG(Inst.dump());
return false;
}
DEBUG(dbgs() << "Adding access via SP while CFA reg is another one\n");
if (IsStoreFromReg || IsLoad)
SrcImm = Reg;
// Ignore accesses to the previous stack frame
if (SPOffset + StackOffset >= 0)
continue;
FrameIndexMap.emplace(
&Inst, FrameIndexEntry{IsLoad, IsStoreFromReg, SrcImm,
SPOffset + StackOffset, Size, IsSimple});
} else {
CfaRegLocked = true;
CfaRegLockedVal = CfaReg;
if (IsStoreFromReg || IsLoad)
SrcImm = Reg;
// Ignore accesses to the previous stack frame
if (CfaOffset + StackOffset >= 0)
continue;
FrameIndexMap.emplace(
&Inst, FrameIndexEntry{IsLoad, IsStoreFromReg, SrcImm,
CfaOffset + StackOffset, Size, IsSimple});
}
DEBUG_WITH_TYPE("fop",
dbgs() << "Frame index annotation added to:\n";
BC.printInstruction(dbgs(), Inst, 0, &BF, true);
dbgs() << " FrameIndexEntry <IsLoad:" << IsLoad << " StackOffset:";
if (FrameIndexMap[&Inst].StackOffset < 0)
dbgs() << "-" << Twine::utohexstr(-FrameIndexMap[&Inst].StackOffset);
else
dbgs() << "+" << Twine::utohexstr(FrameIndexMap[&Inst].StackOffset);
dbgs() << " IsStoreFromReg:" << FrameIndexMap[&Inst].IsStoreFromReg
<< " RegOrImm:" << FrameIndexMap[&Inst].RegOrImm << ">\n";
);
}
}
}
return true;
}
void FrameOptimizerPass::removeUnnecessarySpills(const BinaryContext &BC,
BinaryFunction &BF) {
StackAvailableExpressions SAE(*this, BC, BF);
void FrameOptimizerPass::removeUnnecessaryLoads(const FrameAnalysis &FA,
const BinaryContext &BC,
BinaryFunction &BF) {
StackAvailableExpressions SAE(FA, BC, BF);
SAE.run();
DEBUG(dbgs() << "Performing frame optimization\n");
DEBUG(dbgs() << "Performing unnecessary loads removal\n");
std::deque<std::pair<BinaryBasicBlock *, MCInst *>> ToErase;
bool Changed = false;
const auto ExprEnd = SAE.expr_end();
@@ -648,16 +71,16 @@ void FrameOptimizerPass::removeUnnecessarySpills(const BinaryContext &BC,
// if Inst is a load from stack and the current available expressions show
// this value is available in a register or immediate, replace this load
// with move from register or from immediate.
const auto Iter = FrameIndexMap.find(&Inst);
if (Iter == FrameIndexMap.end()) {
auto FIEX = FA.getFIEFor(BC, Inst);
if (!FIEX) {
Prev = &Inst;
continue;
}
const FrameIndexEntry &FIEX = Iter->second;
// FIXME: Change to remove IsSimple == 0. We're being conservative here,
// but once replaceMemOperandWithReg is ready, we should feed it with all
// sorts of complex instructions.
if (FIEX.IsLoad == 0 || FIEX.IsSimple == 0) {
if (FIEX->IsLoad == false || FIEX->IsSimple == false ||
FIEX->StackOffset >= 0) {
Prev = &Inst;
continue;
}
@@ -665,13 +88,14 @@ void FrameOptimizerPass::removeUnnecessarySpills(const BinaryContext &BC,
for (auto I = Prev ? SAE.expr_begin(*Prev) : SAE.expr_begin(BB);
I != ExprEnd; ++I) {
const MCInst *AvailableInst = *I;
const auto Iter = FrameIndexMap.find(AvailableInst);
if (Iter == FrameIndexMap.end())
auto FIEY = FA.getFIEFor(BC, *AvailableInst);
if (!FIEY)
continue;
const FrameIndexEntry &FIEY = Iter->second;
assert(FIEY.IsLoad == 0 && FIEY.IsSimple != 0);
if (FIEX.StackOffset != FIEY.StackOffset || FIEX.Size != FIEY.Size)
assert(FIEY->IsStore && FIEY->IsSimple);
if (FIEX->StackOffset != FIEY->StackOffset || FIEX->Size != FIEY->Size)
continue;
// TODO: Change push/pops to stack adjustment instruction
if (BC.MIA->isPop(Inst))
continue;
++NumRedundantLoads;
@@ -682,12 +106,13 @@ void FrameOptimizerPass::removeUnnecessarySpills(const BinaryContext &BC,
DEBUG(AvailableInst->dump());
DEBUG(dbgs() << "@BB: " << BB.getName() << "\n");
// Replace load
if (FIEY.IsStoreFromReg) {
if (!BC.MIA->replaceMemOperandWithReg(Inst, FIEY.RegOrImm)) {
if (FIEY->IsStoreFromReg) {
if (!BC.MIA->replaceMemOperandWithReg(Inst, FIEY->RegOrImm)) {
DEBUG(dbgs() << "FAILED to change operand to a reg\n");
break;
}
++NumLoadsChangedToReg;
BC.MIA->removeAnnotation(Inst, "FrameAccessEntry");
DEBUG(dbgs() << "Changed operand to a reg\n");
if (BC.MIA->isRedundantMove(Inst)) {
++NumLoadsDeleted;
@@ -697,12 +122,13 @@ void FrameOptimizerPass::removeUnnecessarySpills(const BinaryContext &BC,
}
} else {
char Buf[8] = {0, 0, 0, 0, 0, 0, 0, 0};
support::ulittle64_t::ref(Buf + 0) = FIEY.RegOrImm;
support::ulittle64_t::ref(Buf + 0) = FIEY->RegOrImm;
DEBUG(dbgs() << "Changing operand to an imm... ");
if (!BC.MIA->replaceMemOperandWithImm(Inst, StringRef(Buf, 8), 0)) {
DEBUG(dbgs() << "FAILED\n");
} else {
++NumLoadsChangedToImm;
BC.MIA->removeAnnotation(Inst, "FrameAccessEntry");
DEBUG(dbgs() << "Ok\n");
}
}
@@ -716,71 +142,130 @@ void FrameOptimizerPass::removeUnnecessarySpills(const BinaryContext &BC,
if (Changed) {
DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n");
}
// TODO: Implement an interface of eraseInstruction that works out the
// complete list of elements to remove.
for (auto I : ToErase) {
I.first->eraseInstruction(I.second);
}
}
void FrameOptimizerPass::removeUnusedStores(const FrameAnalysis &FA,
const BinaryContext &BC,
BinaryFunction &BF) {
StackReachingUses SRU(FA, BC, BF);
SRU.run();
DEBUG(dbgs() << "Performing unused stores removal\n");
std::vector<std::pair<BinaryBasicBlock *, MCInst *>> ToErase;
bool Changed = false;
for (auto &BB : BF) {
DEBUG(dbgs() <<"\tNow at BB " << BB.getName() << "\n");
const MCInst *Prev = nullptr;
for (auto I = BB.rbegin(), E = BB.rend(); I != E; ++I) {
auto &Inst = *I;
DEBUG({
dbgs() << "\t\tNow at ";
Inst.dump();
for (auto I = Prev ? SRU.expr_begin(*Prev) : SRU.expr_begin(BB);
I != SRU.expr_end(); ++I) {
dbgs() << "\t\t\tReached by: ";
(*I)->dump();
}
});
auto FIEX = FA.getFIEFor(BC, Inst);
if (!FIEX) {
Prev = &Inst;
continue;
}
if (FIEX->IsLoad || !FIEX->IsSimple || FIEX->StackOffset >= 0) {
Prev = &Inst;
continue;
}
if (SRU.isStoreUsed(*FIEX,
Prev ? SRU.expr_begin(*Prev) : SRU.expr_begin(BB))) {
Prev = &Inst;
continue;
}
// TODO: Change push/pops to stack adjustment instruction
if (BC.MIA->isPush(Inst))
continue;
++NumRedundantStores;
Changed = true;
DEBUG(dbgs() << "Unused store instruction: ");
DEBUG(Inst.dump());
DEBUG(dbgs() << "@BB: " << BB.getName() << "\n");
// Delete it!
ToErase.push_back(std::make_pair(&BB, &Inst));
Prev = &Inst;
}
}
for (auto I : ToErase) {
I.first->eraseInstruction(I.second);
}
if (Changed) {
DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n");
}
}
void FrameOptimizerPass::runOnFunctions(BinaryContext &BC,
std::map<uint64_t, BinaryFunction> &BFs,
std::set<uint64_t> &) {
uint64_t NumFunctionsNotOptimized{0};
uint64_t NumFunctionsFailedRestoreFI{0};
uint64_t CountFunctionsNotOptimized{0};
uint64_t CountFunctionsFailedRestoreFI{0};
uint64_t CountDenominator{0};
Cg = buildCallGraph(BC, BFs);
TopologicalCGOrder = Cg.buildTraversalOrder();
buildClobberMap(BC);
std::set<uint64_t> &LargeFunctions) {
if (opts::FrameOptimization == FOP_NONE)
return;
// Run FrameAnalysis pass
FrameAnalysis FA(PrintPass);
FA.runOnFunctions(BC, BFs, LargeFunctions);
// Our main loop: perform caller-saved register optimizations, then
// callee-saved register optimizations (shrink wrapping).
for (auto &I : BFs) {
auto Count = I.second.getExecutionCount();
if (Count != BinaryFunction::COUNT_NO_PROFILE)
CountDenominator += Count;
if (!shouldOptimize(I.second)) {
++NumFunctionsNotOptimized;
if (Count != BinaryFunction::COUNT_NO_PROFILE)
CountFunctionsNotOptimized += Count;
if (!FA.hasFrameInfo(I.second))
continue;
// Restrict pass execution if user asked to only run on hot functions
if (opts::FrameOptimization == FOP_HOT) {
if (I.second.getKnownExecutionCount() < BC.getHotThreshold())
continue;
DEBUG(dbgs() << "Considering " << I.second.getPrintName()
<< " for frame optimizations because its execution count ( "
<< I.second.getKnownExecutionCount()
<< " ) exceeds our hotness threshold ( "
<< BC.getHotThreshold() << " )\n");
}
if (!restoreFrameIndex(BC, I.second)) {
++NumFunctionsFailedRestoreFI;
auto Count = I.second.getExecutionCount();
if (Count != BinaryFunction::COUNT_NO_PROFILE)
CountFunctionsFailedRestoreFI += Count;
{
NamedRegionTimer T1("remove loads", "FOP breakdown", true);
removeUnnecessaryLoads(FA, BC, I.second);
}
{
NamedRegionTimer T1("remove stores", "FOP breakdown", true);
removeUnusedStores(FA, BC, I.second);
}
// Don't even start shrink wrapping if no profiling info is available
if (I.second.getKnownExecutionCount() == 0)
continue;
{
NamedRegionTimer T1("move spills", "FOP breakdown", true);
DataflowInfoManager Info(&FA, BC, I.second);
ShrinkWrapping SW(FA, BC, I.second, Info);
SW.perform();
}
removeUnnecessarySpills(BC, I.second);
}
FA.cleanAnnotations(BC, BFs);
outs() << "BOLT-INFO: FOP optimized " << NumRedundantLoads
<< " redundant load(s).\n";
if (opts::Verbosity == 0) {
#ifndef NDEBUG
if (!DebugFlag || !isCurrentDebugType("fop"))
return;
#else
return;
#endif
}
<< " redundant load(s) and " << NumRedundantStores
<< " unused store(s)\n";
outs() << "BOLT-INFO: FOP changed " << NumLoadsChangedToReg
<< " load(s) to use a register instead of a stack access, and "
<< NumLoadsChangedToImm << " to use an immediate.\n"
<< "BOLT-INFO: FOP deleted " << NumLoadsDeleted << " load(s).\n"
<< "BOLT-INFO: FOP: Number of functions conservatively treated as "
"clobbering all registers: "
<< NumFunctionsAllClobber
<< format(" (%.1lf%% dyn cov)\n",
(100.0 * CountFunctionsAllClobber / CountDenominator))
<< "BOLT-INFO: FOP: " << NumFunctionsNotOptimized << " function(s) "
<< format("(%.1lf%% dyn cov)",
(100.0 * CountFunctionsNotOptimized / CountDenominator))
<< " were not optimized.\n"
<< "BOLT-INFO: FOP: " << NumFunctionsFailedRestoreFI << " function(s) "
<< format("(%.1lf%% dyn cov)",
(100.0 * CountFunctionsFailedRestoreFI / CountDenominator))
<< " could not have its frame indices restored.\n";
<< "BOLT-INFO: FOP deleted " << NumLoadsDeleted << " load(s) and "
<< NumRedundantStores << " store(s).\n";
FA.printStats();
ShrinkWrapping::printStats();
}
} // namespace bolt


@@ -13,31 +13,40 @@
#define LLVM_TOOLS_LLVM_BOLT_PASSES_FRAMEOPTIMIZER_H
#include "BinaryPasses.h"
#include "BinaryFunctionCallGraph.h"
#include "FrameAnalysis.h"
namespace llvm {
namespace bolt {
/// FrameOptimizerPass strives for removing or moving stack frame accesses to
/// less frequently executed basic blocks, reducing the pressure on icache
/// usage as well as dynamic instruction count.
///
/// This is accomplished by analyzing both caller-saved register spills and
/// callee-saved register spills. This class handles the former while delegating
/// the latter to the class ShrinkWrapping. We discuss caller-saved register
/// spills optimization below.
///
/// Caller-saved registers must be conservatively pushed to the stack because
/// the callee may write to these registers. If we can prove the callee will
/// never touch these registers, we can remove this spill.
///
/// This optimization analyzes the call graph and first computes the set of
/// registers that may get overwritten when executing a function (this includes
/// the set of registers touched by all functions this function may call during
/// its execution) -- see the FrameAnalysis class for implementation details.
///
/// The second step is to perform an analysis to disambiguate which stack
/// position is being accessed by each load/store instruction -- see the
/// FrameAnalysis class.
///
/// The third step performs a forward dataflow analysis, using intersection as
/// the confluence operator, to propagate information about available
/// stack definitions at each point of the program. See the
/// StackAvailableExpressions class. This definition shows an equivalence
/// between the value in a stack position and the value of a register or
/// immediate. To have those preserved, both register and the value in the stack
/// position cannot be touched by another instruction.
/// These definitions we are tracking occur in the form:
///
/// stack def: MEM[FRAME - 0x5c] <= RAX
@@ -62,86 +71,29 @@ namespace bolt {
/// In this example, since the store source register is the same as the load
/// destination register, this creates a redundant MOV that can be deleted.
///
/// Finally, another analysis propagates information about which instructions
/// are using (loading from) a stack position -- see StackReachingUses. If a
/// store sees no use of the value it is storing, it is eliminated.
///
class FrameOptimizerPass : public BinaryFunctionPass {
/// Stats aggregating variables
uint64_t NumRedundantLoads{0};
uint64_t NumRedundantStores{0};
uint64_t NumLoadsChangedToReg{0};
uint64_t NumLoadsChangedToImm{0};
uint64_t NumLoadsDeleted{0};
  /// Perform a dataflow analysis in \p BF to reveal unnecessary reloads from
  /// the frame. Use the analysis to convert memory loads to register moves or
  /// immediate loads. Delete redundant register moves.
  void removeUnnecessaryLoads(const FrameAnalysis &FA,
                              const BinaryContext &BC,
                              BinaryFunction &BF);

  /// Use information from stack frame usage to delete unused stores.
  void removeUnusedStores(const FrameAnalysis &FA,
                          const BinaryContext &BC,
                          BinaryFunction &BF);

public:
explicit FrameOptimizerPass(const cl::opt<bool> &PrintPass)
@@ -158,6 +110,7 @@ public:
};
} // namespace bolt
} // namespace llvm
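
// To make the header comment's third step concrete, here is a hedged,
// self-contained model of a forward dataflow that uses intersection as the
// confluence operator over a diamond-shaped CFG: a stack definition is
// "available" only if it survives along every incoming path. The Block struct,
// the gen/kill encoding and the def numbering are invented for this example;
// BOLT's real machinery lives in DataflowAnalysis.h and
// StackAvailableExpressions.
#include <bitset>
#include <cstddef>
#include <iostream>
#include <vector>

constexpr size_t NumDefs = 4; // tracked "stack def" expressions

struct Block {
  std::vector<int> Preds;
  std::bitset<NumDefs> Gen;  // defs this block creates
  std::bitset<NumDefs> Kill; // defs this block clobbers
};

int main() {
  // CFG: 0 -> 1, 0 -> 2, 1 -> 3, 2 -> 3 (a diamond).
  std::vector<Block> CFG(4);
  CFG[1].Preds = {0};
  CFG[2].Preds = {0};
  CFG[3].Preds = {1, 2};
  CFG[0].Gen.set(0);  // def 0 made in the entry block
  CFG[1].Gen.set(1);  // def 1 made only on the left path
  CFG[2].Kill.set(0); // the right path clobbers def 0

  std::vector<std::bitset<NumDefs>> Out(4);
  // Non-entry blocks start "full" so intersection can only remove defs.
  for (size_t I = 1; I < Out.size(); ++I)
    Out[I].set();
  bool Changed = true;
  while (Changed) { // iterate to a fixed point
    Changed = false;
    for (size_t I = 0; I < CFG.size(); ++I) {
      std::bitset<NumDefs> In;
      if (!CFG[I].Preds.empty()) {
        In.set();
        for (int P : CFG[I].Preds)
          In &= Out[P]; // intersection confluence
      }
      auto NewOut = (In & ~CFG[I].Kill) | CFG[I].Gen;
      if (NewOut != Out[I]) {
        Out[I] = NewOut;
        Changed = true;
      }
    }
  }
  // Def 0 is killed on one path and def 1 exists on only one path, so
  // neither is available when block 3 is reached.
  std::cout << "available at exit of block 3: " << Out[3] << "\n";
}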


@@ -14,6 +14,7 @@
#include "DataflowAnalysis.h"
#include "FrameAnalysis.h"
#include "llvm/Support/Timer.h"
namespace llvm {
namespace bolt {
@@ -29,6 +30,18 @@ public:
NumRegs(BC.MRI->getNumRegs()) {}
virtual ~LivenessAnalysis();
bool isAlive(ProgramPoint PP, MCPhysReg Reg) const {
BitVector BV = (*this->getStateAt(PP));
const BitVector &RegAliases = BC.MIA->getAliases(Reg, *BC.MRI);
BV &= RegAliases;
return BV.any();
}
void run() {
NamedRegionTimer T1("LA", "Dataflow", true);
DataflowAnalysis<LivenessAnalysis, BitVector, true>::run();
}
protected:
/// Reference to the result of stack frame analysis
const FrameAnalysis &FA;
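
// A hedged illustration of the isAlive() query above: because architectural
// registers alias their sub-registers, the check intersects the live set with
// the full alias set of the queried register. The register numbering and the
// alias table below are invented; only the bitvector intersection trick is
// the point.
#include <bitset>
#include <iostream>

constexpr size_t NumRegs = 8;
enum Reg { RAX = 0, EAX = 1, AX = 2, AL = 3, RBX = 4 };

std::bitset<NumRegs> getAliases(Reg R) {
  std::bitset<NumRegs> A;
  if (R <= AL) { // the RAX family all alias each other
    A.set(RAX); A.set(EAX); A.set(AX); A.set(AL);
  } else {
    A.set(R);
  }
  return A;
}

bool isAlive(const std::bitset<NumRegs> &Live, Reg R) {
  return (Live & getAliases(R)).any();
}

int main() {
  std::bitset<NumRegs> Live;
  Live.set(AL); // only the low byte is live at this point
  std::cout << isAlive(Live, RAX) << "\n"; // 1: RAX aliases AL
  std::cout << isAlive(Live, RBX) << "\n"; // 0
}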


@@ -13,6 +13,7 @@
#define LLVM_TOOLS_LLVM_BOLT_PASSES_REACHINGDEFORUSE_H
#include "DataflowAnalysis.h"
#include "llvm/Support/Timer.h"
namespace llvm {
namespace bolt {
@@ -50,6 +51,11 @@ public:
return (*this->getStateAt(B))[this->ExprToIdx[&A]];
}
void run() {
NamedRegionTimer T1("RD", "Dataflow", true);
InstrsDataflowAnalysis<ReachingDefOrUse<Def>, !Def>::run();
}
protected:
/// Reference to the result of stack frame analysis
const FrameAnalysis &FA;


@@ -12,6 +12,9 @@
#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_REACHINGINSNS_H
#define LLVM_TOOLS_LLVM_BOLT_PASSES_REACHINGINSNS_H
#include "DataflowAnalysis.h"
#include "llvm/Support/Timer.h"
namespace llvm {
namespace bolt {
@@ -37,6 +40,11 @@ public:
return isInLoop(*BB);
}
void run() {
NamedRegionTimer T1("RI", "Dataflow", true);
InstrsDataflowAnalysis<ReachingInsns<Backward>, Backward>::run();
}
protected:
std::unordered_map<const MCInst *, BinaryBasicBlock *> InsnToBB;

File diff suppressed because it is too large


@@ -0,0 +1,477 @@
//===--- Passes/ShrinkWrapping.h ------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_SHRINKWRAPPING_H
#define LLVM_TOOLS_LLVM_BOLT_PASSES_SHRINKWRAPPING_H
#include "BinaryPasses.h"
#include "FrameAnalysis.h"
#include "DataflowInfoManager.h"
namespace llvm {
namespace bolt {
/// Encapsulates the logic required to analyze a binary function and detect
/// which registers are saved as callee-saved, where these saves occur, and at
/// which points the original register values are restored.
class CalleeSavedAnalysis {
const FrameAnalysis &FA;
const BinaryContext &BC;
BinaryFunction &BF;
DataflowInfoManager &Info;
/// Compute all stores of callee-saved regs: those that store a register
/// whose definition is not local.
void analyzeSaves();
/// Similar to analyzeSaves, tries to determine all instructions that recover
/// the original value of the callee-saved register before exiting the
/// function.
void analyzeRestores();
/// Returns the identifying string used to annotate instructions with metadata
/// for this analysis. These are deleted in the destructor.
static StringRef getSaveTag() {
return StringRef("CSA-SavedReg");
}
static StringRef getRestoreTag() {
return StringRef("CSA-RestoredReg");
}
public:
BitVector CalleeSaved;
std::vector<int64_t> OffsetsByReg;
BitVector HasRestores;
std::vector<uint64_t> SavingCost;
std::vector<const FrameIndexEntry*> SaveFIEByReg;
std::vector<const FrameIndexEntry*> LoadFIEByReg;
CalleeSavedAnalysis(const FrameAnalysis &FA, const BinaryContext &BC,
BinaryFunction &BF, DataflowInfoManager &Info)
: FA(FA), BC(BC), BF(BF), Info(Info),
CalleeSaved(BC.MRI->getNumRegs(), false),
OffsetsByReg(BC.MRI->getNumRegs(), 0LL),
HasRestores(BC.MRI->getNumRegs(), false),
SavingCost(BC.MRI->getNumRegs(), 0ULL),
SaveFIEByReg(BC.MRI->getNumRegs(), nullptr),
LoadFIEByReg(BC.MRI->getNumRegs(), nullptr) {}
~CalleeSavedAnalysis();
void compute() {
analyzeSaves();
analyzeRestores();
}
/// Retrieves the callee-saved register saved by this instruction, or 0 if
/// this is not a CSR save instruction.
uint16_t getSavedReg(const MCInst &Inst) {
auto Val = BC.MIA->tryGetAnnotationAs<decltype(FrameIndexEntry::RegOrImm)>(
Inst, getSaveTag());
if (Val)
return *Val;
return 0;
}
/// Retrieves the callee-saved register restored by this instruction, or 0 if
/// this is not a CSR restore instruction.
uint16_t getRestoredReg(const MCInst &Inst) {
auto Val = BC.MIA->tryGetAnnotationAs<decltype(FrameIndexEntry::RegOrImm)>(
Inst, getRestoreTag());
if (Val)
return *Val;
return 0;
}
/// Routines to compute all saves/restores for a Reg (needs to traverse all
/// instructions).
std::vector<MCInst *> getSavesByReg(uint16_t Reg);
std::vector<MCInst *> getRestoresByReg(uint16_t Reg);
};
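
// A hedged, standalone sketch of the analyzeSaves() idea documented above: a
// store of a register counts as a callee-saved save only when the register
// still holds the caller's value, i.e. it has not been locally defined before
// the store. The toy Inst encoding is hypothetical; BOLT derives the same fact
// from its reaching-definitions analysis.
#include <bitset>
#include <iostream>
#include <vector>

constexpr size_t NumRegs = 16;

struct Inst {
  bool IsPush; // true: stores Reg to the stack; false: defines Reg
  int Reg;
};

std::bitset<NumRegs> findCalleeSavedSaves(const std::vector<Inst> &Body) {
  std::bitset<NumRegs> Defined; // regs overwritten so far
  std::bitset<NumRegs> Saves;
  for (const Inst &I : Body) {
    if (I.IsPush && !Defined.test(I.Reg))
      Saves.set(I.Reg); // saving the caller's value: a CSR save
    else if (!I.IsPush)
      Defined.set(I.Reg); // local definition; later pushes of Reg aren't saves
  }
  return Saves;
}

int main() {
  // push r12; mov r13, ...; push r13  => only r12 is a CSR save.
  std::vector<Inst> Body = {{true, 12}, {false, 13}, {true, 13}};
  std::cout << findCalleeSavedSaves(Body) << "\n";
}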
/// Identifies all stack regions being used in a given binary function and
/// allows us to edit the layout, removing regions or inserting new ones. When
/// the layout is modified, all affected stack-accessing instructions are
/// updated.
class StackLayoutModifier {
const FrameAnalysis &FA;
const BinaryContext &BC;
BinaryFunction &BF;
DataflowInfoManager &Info;
// Keep track of stack slots we know how to safely move
std::map<int64_t, int64_t> AvailableRegions;
DenseSet<int64_t> CollapsedRegions;
DenseSet<int64_t> InsertedRegions;
// A map of stack memory chunks whose behavior we cannot fully understand and
// therefore must leave untouched.
std::map<int64_t, int64_t> BlacklistedRegions;
// Maps stack slots to the regs that are saved to them
DenseMap<int64_t, std::set<MCPhysReg>> RegionToRegMap;
DenseMap<int, std::set<int64_t>> RegToRegionMap;
// If we can't understand how to move stack slots, IsSimple will be false
bool IsSimple{true};
bool IsInitialized{false};
public:
// Keep a worklist of operations to apply to the function in order to perform
// the requested layout modifications via collapseRegion()/insertRegion().
struct WorklistItem {
enum ActionType : uint8_t {
None = 0,
AdjustLoadStoreOffset,
AdjustCFI,
} Action;
int64_t OffsetUpdate{0};
WorklistItem() : Action(None) {}
WorklistItem(ActionType Action) : Action(Action) {}
WorklistItem(ActionType Action, int OffsetUpdate)
: Action(Action), OffsetUpdate(OffsetUpdate) {}
};
private:
/// Mark the stack region identified by \p Offset and \p Size to be a
/// no-touch zone, whose accesses cannot be relocated to another region.
void blacklistRegion(int64_t Offset, int64_t Size);
/// Check if this region overlaps with blacklisted addresses
bool isRegionBlacklisted(int64_t Offset, int64_t Size);
/// Check if the region identified by \p Offset and \p Size has any conflicts
/// with available regions so far. If it has, blacklist all involved regions
/// and return true.
bool blacklistAllInConflictWith(int64_t Offset, int64_t Size);
/// If \p Point is identified as frame pointer initialization (defining the
/// value of FP with SP), check for non-standard initialization that precludes
/// us from changing the stack layout. If so, update the blacklisted
/// regions.
void checkFramePointerInitialization(MCInst &Point);
/// Determine which stack offsets we can freely change
void classifyStackAccesses();
void classifyCFIs();
/// Used to keep track of modifications to the function that will later be
/// performed by performChanges();
void scheduleChange(MCInst &Inst, WorklistItem Item);
static StringRef getTodoTagName() {
return StringRef("SLM-TodoTag");
}
static StringRef getSlotTagName() {
return StringRef("SLM-SlotTag");
}
static StringRef getOffsetCFIRegTagName() {
return StringRef("SLM-OffsetCFIReg");
}
public:
StackLayoutModifier(const FrameAnalysis &FA, const BinaryContext &BC,
BinaryFunction &BF, DataflowInfoManager &Info)
: FA(FA), BC(BC), BF(BF), Info(Info) {}
~StackLayoutModifier() {
for (auto &BB : BF) {
for (auto &Inst : BB) {
BC.MIA->removeAnnotation(Inst, getTodoTagName());
BC.MIA->removeAnnotation(Inst, getSlotTagName());
BC.MIA->removeAnnotation(Inst, getOffsetCFIRegTagName());
}
}
}
/// Retrieves the register referenced by the offset CFI annotation attached to
/// this instruction, or 0 if the instruction carries no such annotation.
uint16_t getOffsetCFIReg(const MCInst &Inst) {
auto Val =
BC.MIA->tryGetAnnotationAs<uint16_t>(Inst, getOffsetCFIRegTagName());
if (Val)
return *Val;
return 0;
}
/// Check if it is possible to delete the push instruction \p DeletedPush.
/// This involves collapsing the region accessed by this push and updating all
/// other instructions that access affected memory regions. Return true if we
/// can update this.
bool canCollapseRegion(int64_t RegionAddr);
bool canCollapseRegion(MCInst *DeletedPush);
/// Notify the layout manager that \p DeletedPush was deleted and that it
/// needs to update other affected stack-accessing instructions.
bool collapseRegion(MCInst *Alloc, int64_t RegionAddr, int64_t RegionSize);
bool collapseRegion(MCInst *DeletedPush);
/// Set the new stack address difference for load/store instructions that
/// referenced a stack location that was deleted via collapseRegion.
void setOffsetForCollapsedAccesses(int64_t NewOffset);
/// Check if it is possible to insert a push instruction at point \p P.
/// This involves inserting a new region in the stack, possibly affecting
/// instructions that access the frame. Return true if we can update them all.
bool canInsertRegion(ProgramPoint P);
/// Notify the layout manager that a new push instruction has been inserted
/// at point \p P and that it will need to update relevant instructions.
bool insertRegion(ProgramPoint P, int64_t RegionSz);
/// Perform all changes scheduled by collapseRegion()/insertRegion()
void performChanges();
/// Perform initial assessment of the function trying to understand its stack
/// accesses.
void initialize();
};
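
// A minimal model of what collapseRegion()/setOffsetForCollapsedAccesses()
// must do, under the assumption of CFA-relative negative offsets: when a slot
// is deleted, every access that lives below the collapsed region is rebased by
// the region's size. All names are hypothetical, and the real pass also
// rewrites CFI and validates safety before touching anything.
#include <cstdint>
#include <iostream>
#include <vector>

struct StackAccess {
  int64_t Offset; // negative, relative to the CFA
  uint8_t Size;
};

void collapseRegion(std::vector<StackAccess> &Accesses, int64_t RegionOffset,
                    int64_t RegionSize) {
  for (StackAccess &A : Accesses)
    if (A.Offset < RegionOffset) // below the deleted slot: shift up
      A.Offset += RegionSize;
}

int main() {
  // Slots at -8 and -24; collapse the 8-byte slot at -16.
  std::vector<StackAccess> Accesses = {{-8, 8}, {-24, 8}};
  collapseRegion(Accesses, -16, 8);
  for (const StackAccess &A : Accesses)
    std::cout << A.Offset << "\n"; // -8 stays, -24 becomes -16
}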
/// Implements a pass to optimize callee-saved register spills. These spills
/// typically happen at function prologue/epilogue. When these are hot basic
/// blocks, this pass will try to move these spills to cold blocks whenever
/// possible.
class ShrinkWrapping {
const FrameAnalysis &FA;
const BinaryContext &BC;
BinaryFunction &BF;
DataflowInfoManager &Info;
StackLayoutModifier SLM;
/// For each CSR, store a vector of all CFI indexes deleted as a consequence
/// of moving this Callee-Saved Reg
DenseMap<unsigned, std::vector<uint32_t>> DeletedPushCFIs;
DenseMap<unsigned, std::vector<uint32_t>> DeletedPopCFIs;
std::vector<bool> HasDeletedOffsetCFIs;
SmallPtrSet<const MCCFIInstruction *, 16> UpdatedCFIs;
std::vector<BitVector> UsesByReg;
std::vector<int64_t> PushOffsetByReg;
std::vector<int64_t> PopOffsetByReg;
std::vector<MCPhysReg> DomOrder;
CalleeSavedAnalysis CSA;
std::vector<SmallPtrSet<MCInst *, 4>> SavePos;
std::vector<uint64_t> BestSaveCount;
std::vector<MCInst *> BestSavePos;
/// Pass stats
static uint64_t SpillsMovedRegularMode;
static uint64_t SpillsMovedPushPopMode;
/// Allow our custom worklist-sensitive analysis
/// PredictiveStackPointerTracking to access WorklistItem
public:
struct WorklistItem {
enum ActionType : uint8_t {
Erase = 0,
ChangeToAdjustment,
InsertLoadOrStore,
InsertPushOrPop
} Action;
FrameIndexEntry FIEToInsert;
unsigned AffectedReg;
int Adjustment{0};
WorklistItem(ActionType Action, unsigned AffectedReg)
: Action(Action), FIEToInsert(), AffectedReg(AffectedReg) {}
WorklistItem(ActionType Action, unsigned AffectedReg, int Adjustment)
: Action(Action), FIEToInsert(), AffectedReg(AffectedReg),
Adjustment(Adjustment) {}
WorklistItem(ActionType Action, const FrameIndexEntry &FIE,
unsigned AffectedReg)
: Action(Action), FIEToInsert(FIE), AffectedReg(AffectedReg) {}
};
/// Insertion todo items scheduled to happen at the end of BBs. Since we
/// can't annotate BBs we maintain this bookkeeping here.
DenseMap<BinaryBasicBlock*, std::vector<WorklistItem>> Todo;
/// Annotation name used to tag instructions with removal or insertion actions
static StringRef getAnnotationName() {
return StringRef("ShrinkWrap-Todo");
}
private:
using BBIterTy = BinaryBasicBlock::iterator;
/// Calculate all possible uses/defs of these callee-saved regs
void classifyCSRUses();
// Ensure we don't work on cases where there are no uses of the callee-saved
// register. These unnecessary spills should have been removed by previous
// passes.
void pruneUnwantedCSRs();
// Map regs to their candidate save locations (at the start of these BBs)
void computeSaveLocations();
/// Look into the best save location found for saving callee-saved reg
/// \p CSR and evaluate whether we would benefit by moving the spill to this
/// new save location. Return true if it is profitable to perform the move.
bool validateBestSavePos(unsigned CSR, MCInst *&BestPosSave,
uint64_t &TotalEstimatedWin);
/// Populate the Todo map with worklistitems to change the function
template <typename ...T>
void scheduleChange(ProgramPoint PP, T&& ...Item) {
if (PP.isInst()) {
auto &WList = BC.MIA->getOrCreateAnnotationAs<std::vector<WorklistItem>>(
BC.Ctx.get(), *PP.getInst(), getAnnotationName());
WList.emplace_back(std::forward<T>(Item)...);
return;
}
// Avoid inserting on BBs with no instructions because we have a dataflow
// analysis that depends on insertions happening before real instructions
// (PredictiveStackPointerTracking)
BinaryBasicBlock *BB = PP.getBB();
if (BB->size() != 0) {
Todo[BB].emplace_back(std::forward<T>(Item)...);
return;
}
while (BB->size() == 0) {
assert (BB->succ_size() == 1);
BB = *BB->succ_begin();
}
auto &WList = BC.MIA->getOrCreateAnnotationAs<std::vector<WorklistItem>>(
BC.Ctx.get(), *BB->begin(), getAnnotationName());
WList.emplace_back(std::forward<T>(Item)...);
}
/// Determine the POP ordering according to which CSR save is the dominator.
void computeDomOrder();
/// Check that the best possible location for a spill save (as determined by
/// computeSaveLocations) is cold enough to be worth moving the save to it.
/// \p CSR is the callee-saved register number, \p BestPosSave returns the
/// pointer to the cold location in case the function returns true, while
/// \p TotalEstimatedWin contains the estimated dynamic instruction count
/// reduction after moving.
bool isBestSavePosCold(unsigned CSR, MCInst *&BestPosSave,
uint64_t &TotalEstimatedWin);
/// Auxiliary function used to create basic blocks for critical edges and
/// update the dominance frontier with these new locations
void splitFrontierCritEdges(
BinaryFunction *Func, SmallVector<ProgramPoint, 4> &Frontier,
const SmallVector<bool, 4> &IsCritEdge,
const SmallVector<BinaryBasicBlock *, 4> &From,
const SmallVector<SmallVector<BinaryBasicBlock *, 4>, 4> &To);
/// After the best save location for a spill has been established in
/// \p BestPosSave for reg \p CSR, compute adequate locations to restore
/// the spilled value. This will be at the dominance frontier.
/// Returns an empty vector if we failed; on success, returns the program
/// points where the restores should be placed.
SmallVector<ProgramPoint, 4> doRestorePlacement(MCInst *BestPosSave,
unsigned CSR,
uint64_t TotalEstimatedWin);
/// Checks whether using pushes and pops (instead of the longer load-store
/// counterparts) is correct for reg \p CSR
bool validatePushPopsMode(unsigned CSR, MCInst *BestPosSave,
int64_t SaveOffset);
/// Adjust restore locations to the correct SP offset if we are using POPs
/// instead of random-access load instructions.
SmallVector<ProgramPoint, 4>
fixPopsPlacements(const SmallVector<ProgramPoint, 4> &RestorePoints,
int64_t SaveOffset, unsigned CSR);
/// When moving spills, mark all old spill locations to be deleted
void scheduleOldSaveRestoresRemoval(unsigned CSR, bool UsePushPops);
/// Return true if \p Inst uses reg \p CSR
bool doesInstUsesCSR(const MCInst &Inst, uint16_t CSR);
/// When moving spills, mark all new spill locations for insertion
void
scheduleSaveRestoreInsertions(unsigned CSR, MCInst *BestPosSave,
SmallVector<ProgramPoint, 4> &RestorePoints,
bool UsePushPops);
/// Coordinate the replacement of callee-saved spills from their original
/// place (at prologue and epilogues) to colder basic blocks as determined
/// by computeSaveLocations().
void moveSaveRestores();
/// After the spill locations for reg \p CSR have been moved and all affected
/// CFI has been removed, insert new updated CFI information for these
/// locations.
void insertUpdatedCFI(unsigned CSR, int SPValPush, int SPValPop);
/// In case the function anchors the CFA reg to SP and we inserted pushes/pops,
/// insert def_cfa_offsets at appropriate places (and delete the old
/// def_cfa_offsets)
void rebuildCFIForSP();
/// Rebuild all CFI for affected Callee-Saved Registers.
void rebuildCFI();
/// Create a load-store instruction (depending on the contents of \p FIE).
/// If \p CreatePushOrPop is true, create a push/pop instead. Current SP/FP
/// values, as determined by StackPointerTracking, should be supplied via
/// \p SPVal and \p FPVal in order to emit the correct offset from SP/FP.
MCInst createStackAccess(int SPVal, int FPVal, const FrameIndexEntry &FIE,
bool CreatePushOrPop);
/// Update the CFI referenced by \p Inst with \p NewOffset, if the CFI has
/// an offset.
void updateCFIInstOffset(MCInst &Inst, int64_t NewOffset);
/// Insert any CFI that should be attached to a register spill save/restore.
BBIterTy insertCFIsForPushOrPop(BinaryBasicBlock &BB, BBIterTy Pos,
unsigned Reg, bool isPush, int Sz,
int64_t NewOffset);
/// Auxiliary function to processInsertionsList, adding a new instruction
/// before \p InsertionPoint as requested by \p Item. Return an updated
/// InsertionPoint for other instructions that need to be inserted at the same
/// original location, since this insertion may have invalidated the previous
/// location.
BBIterTy processInsertion(BBIterTy InsertionPoint, BinaryBasicBlock *CurBB,
const WorklistItem &Item, int64_t SPVal,
int64_t FPVal);
/// Auxiliary function to processInsertions(), performing all the insertion
/// tasks in the todo list associated with a single insertion point. Returns
/// the updated insertion point.
BBIterTy processInsertionsList(BBIterTy InsertionPoint,
BinaryBasicBlock *CurBB,
std::vector<WorklistItem> &TodoList,
int64_t SPVal, int64_t FPVal);
/// Apply all insertion todo tasks that add new stores/loads or push/pops at
/// annotated points. Return false if the entire function has no annotated
/// todo tasks and this pass has nothing to do.
bool processInsertions();
/// Apply all deletion todo tasks (or tasks to change a push/pop to a memory
/// access no-op)
void processDeletions();
public:
ShrinkWrapping(const FrameAnalysis &FA, const BinaryContext &BC,
BinaryFunction &BF, DataflowInfoManager &Info)
: FA(FA), BC(BC), BF(BF), Info(Info), SLM(FA, BC, BF, Info),
CSA(FA, BC, BF, Info) {}
~ShrinkWrapping() {
for (auto &BB : BF) {
for (auto &Inst : BB) {
BC.MIA->removeAnnotation(Inst, getAnnotationName());
}
}
}
void perform();
static void printStats();
};
} // end namespace bolt
} // end namespace llvm
#endif
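
// A hedged sketch of the profitability check behind computeSaveLocations()
// and isBestSavePosCold(): among points that dominate all uses of the
// callee-saved register, pick the least executed one; the win is the
// difference in execution counts versus the entry block where the spill
// currently lives, and it must be strictly positive. Everything below is an
// invented simplification of that cost model, not BOLT's API.
#include <cstdint>
#include <iostream>
#include <vector>

struct CandidatePos {
  const char *Name;
  uint64_t ExecCount; // profile count of this dominating point
};

// Returns the estimated dynamic saving, or 0 if moving is not profitable.
uint64_t bestSaveWin(uint64_t EntryCount,
                     const std::vector<CandidatePos> &DominatingPoints,
                     const CandidatePos *&Best) {
  Best = nullptr;
  uint64_t BestCount = EntryCount;
  for (const CandidatePos &C : DominatingPoints) {
    if (C.ExecCount < BestCount) { // strictly colder than what we have
      BestCount = C.ExecCount;
      Best = &C;
    }
  }
  return Best ? EntryCount - BestCount : 0;
}

int main() {
  std::vector<CandidatePos> Doms = {{"entry", 1000}, {"cold.path", 10}};
  const CandidatePos *Best = nullptr;
  uint64_t Win = bestSaveWin(1000, Doms, Best);
  if (Best)
    std::cout << "move spill to " << Best->Name << ", win = " << Win << "\n";
}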


@@ -0,0 +1,153 @@
//===--- Passes/StackAllocationAnalysis.cpp -------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#include "StackAllocationAnalysis.h"
#include "llvm/Support/Debug.h"
#define DEBUG_TYPE "saa"
namespace llvm {
namespace bolt {
void StackAllocationAnalysis::preflight() {
DEBUG(dbgs() << "Starting StackAllocationAnalysis on \""
<< Func.getPrintName() << "\"\n");
for (auto &BB : this->Func) {
for (auto &Inst : BB) {
MCPhysReg From, To;
if (!BC.MIA->isPush(Inst) && (!BC.MIA->isRegToRegMove(Inst, From, To) ||
To != BC.MIA->getStackPointer() ||
From != BC.MIA->getFramePointer()) &&
!BC.MII->get(Inst.getOpcode())
.hasDefOfPhysReg(Inst, BC.MIA->getStackPointer(), *BC.MRI))
continue;
this->Expressions.push_back(&Inst);
this->ExprToIdx[&Inst] = this->NumInstrs++;
}
}
}
BitVector
StackAllocationAnalysis::getStartingStateAtBB(const BinaryBasicBlock &BB) {
return BitVector(this->NumInstrs, false);
}
BitVector
StackAllocationAnalysis::getStartingStateAtPoint(const MCInst &Point) {
return BitVector(this->NumInstrs, false);
}
void StackAllocationAnalysis::doConfluence(BitVector &StateOut,
const BitVector &StateIn) {
StateOut |= StateIn;
}
BitVector StackAllocationAnalysis::doKill(const MCInst &Point,
const BitVector &StateIn,
int DeallocSize) {
int64_t SPOffset = SPT.getStateAt(Point)->first;
BitVector Next = StateIn;
if (SPOffset == SPT.SUPERPOSITION || SPOffset == SPT.EMPTY)
return Next;
for (auto I = this->expr_begin(Next), E = this->expr_end(); I != E; ++I) {
const MCInst *Instr = *I;
int64_t InstrOffset = SPT.getStateAt(*Instr)->first;
if (InstrOffset == SPT.SUPERPOSITION || InstrOffset == SPT.EMPTY)
continue;
if (InstrOffset < SPOffset) {
Next.reset(I.getBitVectorIndex());
DEBUG({
dbgs() << "SAA FYI: Killed: ";
Instr->dump();
dbgs() << "by: ";
Point.dump();
dbgs() << " (more info: Killed instr offset = " << InstrOffset
<< ". SPOffset = " << SPOffset
<< "; DeallocSize= " << DeallocSize << "\n";
});
}
}
return Next;
}
void StackAllocationAnalysis::doConfluenceWithLP(BitVector &StateOut,
const BitVector &StateIn,
const MCInst &Invoke) {
BitVector NewIn = StateIn;
for (const auto &Operand : Invoke) {
if (Operand.isGnuArgsSize()) {
auto ArgsSize = Operand.getGnuArgsSize();
NewIn = doKill(Invoke, NewIn, ArgsSize);
}
}
StateOut |= NewIn;
}
BitVector StackAllocationAnalysis::computeNext(const MCInst &Point,
const BitVector &Cur) {
const auto &MIA = BC.MIA;
BitVector Next = Cur;
if (int Sz = MIA->getPopSize(Point)) {
Next = doKill(Point, Next, Sz);
return Next;
}
if (MIA->isPush(Point)) {
Next.set(this->ExprToIdx[&Point]);
return Next;
}
MCPhysReg From, To;
int64_t SPOffset, FPOffset;
std::tie(SPOffset, FPOffset) = *SPT.getStateBefore(Point);
if (MIA->isRegToRegMove(Point, From, To) && To == MIA->getStackPointer() &&
From == MIA->getFramePointer()) {
if (MIA->isLeave(Point))
FPOffset += 8;
if (SPOffset < FPOffset) {
Next = doKill(Point, Next, FPOffset - SPOffset);
return Next;
}
if (SPOffset > FPOffset) {
Next.set(this->ExprToIdx[&Point]);
return Next;
}
}
if (BC.MII->get(Point.getOpcode())
.hasDefOfPhysReg(Point, MIA->getStackPointer(), *BC.MRI)) {
std::pair<MCPhysReg, int64_t> SP;
if (SPOffset != SPT.EMPTY && SPOffset != SPT.SUPERPOSITION)
SP = std::make_pair(MIA->getStackPointer(), SPOffset);
else
SP = std::make_pair(0, 0);
std::pair<MCPhysReg, int64_t> FP;
if (FPOffset != SPT.EMPTY && FPOffset != SPT.SUPERPOSITION)
FP = std::make_pair(MIA->getFramePointer(), FPOffset);
else
FP = std::make_pair(0, 0);
int64_t Output;
if (!MIA->evaluateSimple(Point, Output, SP, FP))
return Next;
if (SPOffset < Output) {
Next = doKill(Point, Next, Output - SPOffset);
return Next;
}
if (SPOffset > Output) {
Next.set(this->ExprToIdx[&Point]);
return Next;
}
}
return Next;
}
} // end namespace bolt
} // end namespace llvm
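
// The doKill() rule above can be paraphrased with a toy model, assuming
// CFA-relative SP offsets that grow more negative as the stack grows down:
// when a deallocation raises SP back to some offset, every tracked allocation
// made below that level is dead. The Alloc struct and killDeallocated() are
// hypothetical stand-ins for the BitVector dataflow state.
#include <cstdint>
#include <iostream>
#include <vector>

struct Alloc {
  const char *Name;
  int64_t SPOffsetAtAlloc; // CFA-relative SP right after the allocation
};

void killDeallocated(std::vector<Alloc> &Live, int64_t NewSPOffset) {
  for (size_t I = Live.size(); I-- > 0;)
    if (Live[I].SPOffsetAtAlloc < NewSPOffset) { // now above SP: freed
      std::cout << "killed " << Live[I].Name << "\n";
      Live.erase(Live.begin() + I);
    }
}

int main() {
  std::vector<Alloc> Live = {{"push rbp", -16}, {"sub rsp, 32", -48}};
  killDeallocated(Live, -16); // "add rsp, 32" brings SP back to -16
}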


@@ -0,0 +1,68 @@
//===--- Passes/StackAllocationAnalysis.h ---------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_STACKALLOCATIONANALYSIS_H
#define LLVM_TOOLS_LLVM_BOLT_PASSES_STACKALLOCATIONANALYSIS_H
#include "DataflowAnalysis.h"
#include "StackPointerTracking.h"
#include "llvm/Support/Timer.h"
namespace llvm {
namespace bolt {
/// Perform a dataflow analysis to track which stack-allocating instructions
/// still have their allocated stack regions live at each point of the program.
class StackAllocationAnalysis
: public InstrsDataflowAnalysis<StackAllocationAnalysis,
/*Backward=*/false> {
friend class DataflowAnalysis<StackAllocationAnalysis, BitVector>;
StackPointerTracking &SPT;
public:
StackAllocationAnalysis(const BinaryContext &BC, BinaryFunction &BF,
StackPointerTracking &SPT)
: InstrsDataflowAnalysis<StackAllocationAnalysis, false>(BC, BF),
SPT(SPT) {}
virtual ~StackAllocationAnalysis() {}
void run() {
NamedRegionTimer T1("SAA", "Dataflow", true);
InstrsDataflowAnalysis<StackAllocationAnalysis, false>::run();
}
protected:
void preflight();
BitVector getStartingStateAtBB(const BinaryBasicBlock &BB);
BitVector getStartingStateAtPoint(const MCInst &Point);
void doConfluence(BitVector &StateOut, const BitVector &StateIn);
BitVector doKill(const MCInst &Point, const BitVector &StateIn,
int DeallocSize);
void doConfluenceWithLP(BitVector &StateOut, const BitVector &StateIn,
const MCInst &Invoke);
BitVector computeNext(const MCInst &Point, const BitVector &Cur);
StringRef getAnnotationName() const {
return StringRef("StackAllocationAnalysis");
}
};
} // end namespace bolt
} // end namespace llvm
#endif


@@ -0,0 +1,132 @@
//===--- Passes/StackAvailableExpressions.cpp -----------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#include "StackAvailableExpressions.h"
#include "FrameAnalysis.h"
#define DEBUG_TYPE "sae"
namespace llvm {
namespace bolt {
StackAvailableExpressions::StackAvailableExpressions(const FrameAnalysis &FA,
const BinaryContext &BC,
BinaryFunction &BF)
: InstrsDataflowAnalysis(BC, BF), FA(FA) {}
void StackAvailableExpressions::preflight() {
DEBUG(dbgs() << "Starting StackAvailableExpressions on \""
<< Func.getPrintName() << "\"\n");
// Populate our universe of tracked expressions. We are interested in
// tracking available stores to frame positions at any given point of the
// program.
for (auto &BB : Func) {
for (auto &Inst : BB) {
auto FIE = FA.getFIEFor(BC, Inst);
if (!FIE)
continue;
if (FIE->IsStore && FIE->IsSimple) {
Expressions.push_back(&Inst);
ExprToIdx[&Inst] = NumInstrs++;
}
}
}
}
BitVector
StackAvailableExpressions::getStartingStateAtBB(const BinaryBasicBlock &BB) {
// Entry points start with the empty set.
// All others start with the full set.
if (BB.pred_size() == 0 && BB.throw_size() == 0)
return BitVector(NumInstrs, false);
return BitVector(NumInstrs, true);
}
BitVector
StackAvailableExpressions::getStartingStateAtPoint(const MCInst &Point) {
return BitVector(NumInstrs, true);
}
void StackAvailableExpressions::doConfluence(BitVector &StateOut,
const BitVector &StateIn) {
StateOut &= StateIn;
}
namespace {
bool isLoadRedundant(const FrameIndexEntry &LoadFIE,
const FrameIndexEntry &StoreFIE) {
if (!LoadFIE.IsLoad || !LoadFIE.IsSimple)
  return false;
return LoadFIE.StackOffset == StoreFIE.StackOffset &&
       LoadFIE.Size == StoreFIE.Size;
}
}
bool StackAvailableExpressions::doesXKillsY(const MCInst *X, const MCInst *Y) {
// If both are stores and their accessed stack regions overlap, return
// true.
auto FIEX = FA.getFIEFor(BC, *X);
auto FIEY = FA.getFIEFor(BC, *Y);
if (FIEX && FIEY) {
if (isLoadRedundant(*FIEX, *FIEY))
return false;
if (FIEX->IsStore && FIEY->IsStore &&
FIEX->StackOffset + FIEX->Size > FIEY->StackOffset &&
FIEX->StackOffset < FIEY->StackOffset + FIEY->Size)
return true;
}
// getClobberedRegs for X and Y. If they intersect, return true
BitVector XClobbers = BitVector(BC.MRI->getNumRegs(), false);
BitVector YClobbers = BitVector(BC.MRI->getNumRegs(), false);
FA.getInstClobberList(BC, *X, XClobbers);
// If Y is a store to stack, its clobber list is its source reg. This is
// different than the rest because we want to check if the store source
// reaches its corresponding load untouched.
if (FIEY && FIEY->IsStore && FIEY->IsStoreFromReg) {
YClobbers.set(FIEY->RegOrImm);
} else {
FA.getInstClobberList(BC, *Y, YClobbers);
}
XClobbers &= YClobbers;
return XClobbers.any();
}
BitVector StackAvailableExpressions::computeNext(const MCInst &Point,
const BitVector &Cur) {
BitVector Next = Cur;
// Kill
for (auto I = expr_begin(Next), E = expr_end(); I != E; ++I) {
assert(*I != nullptr && "Lost pointers");
DEBUG(dbgs() << "\t\t\tDoes it kill ");
DEBUG((*I)->dump());
if (doesXKillsY(&Point, *I)) {
DEBUG(dbgs() << "\t\t\t\tKilling ");
DEBUG((*I)->dump());
Next.reset(I.getBitVectorIndex());
}
}
// Gen
if (auto FIE = FA.getFIEFor(BC, Point)) {
if (FIE->IsStore && FIE->IsSimple)
Next.set(ExprToIdx[&Point]);
}
return Next;
}
} // namespace bolt
} // namespace llvm
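
// A hedged distillation of the two kill rules implemented in doesXKillsY()
// above: a tracked stack definition dies when its bytes are overwritten by
// another store, or when any register it mirrors is clobbered. The Store
// struct is an invented simplification of FrameIndexEntry, and the clobber
// bitset stands in for getInstClobberList().
#include <bitset>
#include <cstdint>
#include <iostream>

constexpr size_t NumRegs = 16;

struct Store {
  int64_t Offset;
  uint8_t Size;
  int SrcReg; // the equivalence "stack slot == SrcReg" we want preserved
};

bool bytesOverlap(const Store &A, const Store &B) {
  return A.Offset + A.Size > B.Offset && A.Offset < B.Offset + B.Size;
}

bool kills(const Store &Y, const Store *XStore,
           const std::bitset<NumRegs> &XClobbers) {
  if (XStore && bytesOverlap(*XStore, Y))
    return true;                   // rule 1: the slot itself is overwritten
  return XClobbers.test(Y.SrcReg); // rule 2: the mirrored register changes
}

int main() {
  Store Y{-16, 8, /*SrcReg=*/12};
  std::bitset<NumRegs> Clobbers;
  Clobbers.set(12);
  std::cout << kills(Y, nullptr, Clobbers) << "\n"; // 1: r12 is overwritten
}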


@@ -0,0 +1,58 @@
//===--- Passes/StackAvailableExpressions.h -------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_STACKAVAILABLEEXPRESSIONS_H
#define LLVM_TOOLS_LLVM_BOLT_PASSES_STACKAVAILABLEEXPRESSIONS_H
#include "DataflowAnalysis.h"
#include "llvm/Support/Timer.h"
namespace llvm {
namespace bolt {
class FrameAnalysis;
class StackAvailableExpressions
: public InstrsDataflowAnalysis<StackAvailableExpressions> {
friend class DataflowAnalysis<StackAvailableExpressions, BitVector>;
public:
StackAvailableExpressions(const FrameAnalysis &FA,
const BinaryContext &BC, BinaryFunction &BF);
virtual ~StackAvailableExpressions() {}
void run() {
NamedRegionTimer T1("SAE", "Dataflow", true);
InstrsDataflowAnalysis<StackAvailableExpressions>::run();
}
protected:
/// Reference to the result of stack frame analysis
const FrameAnalysis &FA;
void preflight();
BitVector getStartingStateAtBB(const BinaryBasicBlock &BB);
BitVector getStartingStateAtPoint(const MCInst &Point);
void doConfluence(BitVector &StateOut, const BitVector &StateIn);
/// Define the function computing the kill set -- whether expression Y, a
/// tracked expression, will be considered to be dead after executing X.
bool doesXKillsY(const MCInst *X, const MCInst *Y);
BitVector computeNext(const MCInst &Point, const BitVector &Cur);
StringRef getAnnotationName() const {
return StringRef("StackAvailableExpressions");
}
};
} // namespace bolt
} // namespace llvm
#endif


@@ -13,6 +13,7 @@
#define LLVM_TOOLS_LLVM_BOLT_PASSES_STACKPOINTERTRACKING_H
#include "DataflowAnalysis.h"
#include "llvm/Support/Timer.h"
namespace llvm {
namespace bolt {
@@ -190,6 +191,11 @@ class StackPointerTracking
public:
StackPointerTracking(const BinaryContext &BC, BinaryFunction &BF);
virtual ~StackPointerTracking() {}
void run() {
NamedRegionTimer T1("SPT", "Dataflow", true);
StackPointerTrackingBase<StackPointerTracking>::run();
}
};
} // end namespace bolt


@@ -0,0 +1,112 @@
//===--- Passes/StackReachingUses.cpp -------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#include "StackReachingUses.h"
#include "FrameAnalysis.h"
#define DEBUG_TYPE "sru"
namespace llvm {
namespace bolt {
bool StackReachingUses::isStoreUsed(const FrameIndexEntry &StoreFIE,
ExprIterator Candidates,
bool IncludeLocalAccesses) const {
for (auto I = Candidates; I != expr_end(); ++I) {
const MCInst *ReachingInst = *I;
if (IncludeLocalAccesses) {
if (auto FIEY = FA.getFIEFor(BC, *ReachingInst)) {
assert(FIEY->IsLoad);
if (StoreFIE.StackOffset + StoreFIE.Size > FIEY->StackOffset &&
StoreFIE.StackOffset < FIEY->StackOffset + FIEY->Size) {
return true;
}
}
}
auto Args = FA.getArgAccessesFor(BC, *ReachingInst);
if (!Args)
continue;
if (Args->AssumeEverything) {
return true;
}
for (auto FIEY : Args->Set) {
if (StoreFIE.StackOffset + StoreFIE.Size > FIEY.StackOffset &&
StoreFIE.StackOffset < FIEY.StackOffset + FIEY.Size) {
return true;
}
}
}
return false;
}
void StackReachingUses::preflight() {
DEBUG(dbgs() << "Starting StackReachingUses on \"" << Func.getPrintName()
<< "\"\n");
// Populate our universe of tracked expressions. We are interested in
// tracking reaching loads from frame positions at any given point of the
// program.
for (auto &BB : Func) {
for (auto &Inst : BB) {
if (auto FIE = FA.getFIEFor(BC, Inst)) {
if (FIE->IsLoad) {
Expressions.push_back(&Inst);
ExprToIdx[&Inst] = NumInstrs++;
continue;
}
}
auto AA = FA.getArgAccessesFor(BC, Inst);
if (AA && (!AA->Set.empty() || AA->AssumeEverything)) {
Expressions.push_back(&Inst);
ExprToIdx[&Inst] = NumInstrs++;
}
}
}
}
bool StackReachingUses::doesXKillsY(const MCInst *X, const MCInst *Y) {
// If X is a store to the same stack location and the bytes it writes are a
// superset of those read by the load in Y, return true.
auto FIEX = FA.getFIEFor(BC, *X);
auto FIEY = FA.getFIEFor(BC, *Y);
if (FIEX && FIEY) {
if (FIEX->IsStore && FIEY->IsLoad &&
FIEX->StackOffset <= FIEY->StackOffset &&
FIEX->StackOffset + FIEX->Size >= FIEY->StackOffset + FIEY->Size)
return true;
}
return false;
}
BitVector StackReachingUses::computeNext(const MCInst &Point,
const BitVector &Cur) {
BitVector Next = Cur;
// Kill
for (auto I = expr_begin(Next), E = expr_end(); I != E; ++I) {
assert(*I != nullptr && "Lost pointers");
if (doesXKillsY(&Point, *I)) {
DEBUG(dbgs() << "\t\t\tKilling ");
DEBUG((*I)->dump());
Next.reset(I.getBitVectorIndex());
}
}
// Gen
if (auto FIE = FA.getFIEFor(BC, Point)) {
if (FIE->IsLoad)
Next.set(ExprToIdx[&Point]);
}
auto AA = FA.getArgAccessesFor(BC, Point);
if (AA && (!AA->Set.empty() || AA->AssumeEverything))
Next.set(ExprToIdx[&Point]);
return Next;
}
} // namespace bolt
} // namespace llvm
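
// Note the asymmetry between this file's two interval tests, illustrated by
// the hedged standalone snippet below: doesXKillsY() only kills a tracked
// load when an intervening store covers all of its bytes (a superset), while
// isStoreUsed() treats any byte overlap as a use. Both helpers here are
// invented simplifications of those checks.
#include <cstdint>
#include <iostream>

struct Range { int64_t Offset; uint8_t Size; };

bool covers(const Range &Store, const Range &Load) { // superset: safe to kill
  return Store.Offset <= Load.Offset &&
         Store.Offset + Store.Size >= Load.Offset + Load.Size;
}

bool overlaps(const Range &A, const Range &B) { // any shared byte: a use
  return A.Offset + A.Size > B.Offset && A.Offset < B.Offset + B.Size;
}

int main() {
  Range Store{-16, 4}, Load{-16, 8};
  // The 4-byte store overlaps the 8-byte load but does not cover it, so the
  // load still reaches older stores and must keep them alive.
  std::cout << "covers: " << covers(Store, Load)
            << ", overlaps: " << overlaps(Store, Load) << "\n";
}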


@@ -0,0 +1,71 @@
//===--- Passes/StackReachingUses.h ---------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_STACKREACHINGUSES_H
#define LLVM_TOOLS_LLVM_BOLT_PASSES_STACKREACHINGUSES_H
#include "DataflowAnalysis.h"
#include "llvm/Support/Timer.h"
namespace llvm {
namespace bolt {
class FrameAnalysis;
struct FrameIndexEntry;
class StackReachingUses
: public InstrsDataflowAnalysis<StackReachingUses, /*Backward=*/true> {
friend class DataflowAnalysis<StackReachingUses, BitVector, true>;
public:
StackReachingUses(const FrameAnalysis &FA, const BinaryContext &BC,
BinaryFunction &BF)
: InstrsDataflowAnalysis(BC, BF), FA(FA) {}
virtual ~StackReachingUses() {}
bool isStoreUsed(const FrameIndexEntry &StoreFIE, ExprIterator Candidates,
bool IncludeLocalAccesses = true) const;
void run() {
NamedRegionTimer T1("SRU", "Dataflow", true);
InstrsDataflowAnalysis<StackReachingUses, true>::run();
}
protected:
// Reference to the result of stack frame analysis
const FrameAnalysis &FA;
void preflight();
BitVector getStartingStateAtBB(const BinaryBasicBlock &BB) {
return BitVector(NumInstrs, false);
}
BitVector getStartingStateAtPoint(const MCInst &Point) {
return BitVector(NumInstrs, false);
}
void doConfluence(BitVector &StateOut, const BitVector &StateIn) {
StateOut |= StateIn;
}
// Define the function computing the kill set -- whether expression Y, a
// tracked expression, will be considered to be dead after executing X.
bool doesXKillsY(const MCInst *X, const MCInst *Y);
BitVector computeNext(const MCInst &Point, const BitVector &Cur);
StringRef getAnnotationName() const { return StringRef("StackReachingUses"); }
};
} // end namespace bolt
} // end namespace llvm
#endif


@@ -1659,6 +1659,7 @@ void RewriteInstance::readDebugInfo() {
void RewriteInstance::disassembleFunctions() {
// Disassemble every function and build its control flow graph.
TotalScore = 0;
BC->SumExecutionCount = 0;
for (auto &BFI : BinaryFunctions) {
BinaryFunction &Function = BFI.second;
@@ -1803,6 +1804,7 @@ void RewriteInstance::disassembleFunctions() {
}
TotalScore += Function.getFunctionScore();
BC->SumExecutionCount += Function.getKnownExecutionCount();
} // Iterate over all functions
@@ -1821,6 +1823,7 @@ void RewriteInstance::disassembleFunctions() {
else
++NumStaleProfileFunctions;
}
BC->NumProfiledFuncs = ProfiledFunctions.size();
const auto NumAllProfiledFunctions =
ProfiledFunctions.size() + NumStaleProfileFunctions;