From d850ca36222f4ee58bee17bac22385e8c9ffa2d1 Mon Sep 17 00:00:00 2001 From: Rafael Auler Date: Mon, 1 May 2017 16:52:54 -0700 Subject: [PATCH] [BOLT] Add shrink wrapping pass Summary: Add an implementation for shrink wrapping, a frame optimization that moves callee-saved register spills from hot prologues to cold successors. (cherry picked from FBD4983706) --- bolt/BinaryBasicBlock.cpp | 37 +- bolt/BinaryBasicBlock.h | 43 +- bolt/BinaryContext.cpp | 71 +- bolt/BinaryContext.h | 19 +- bolt/BinaryFunction.cpp | 71 +- bolt/BinaryFunction.h | 20 + bolt/BinaryPassManager.cpp | 18 +- bolt/Passes/AllocCombiner.cpp | 116 ++ bolt/Passes/AllocCombiner.h | 48 + bolt/Passes/BinaryPasses.cpp | 4 +- bolt/Passes/BinaryPasses.h | 6 + bolt/Passes/CMakeLists.txt | 5 + bolt/Passes/DataflowAnalysis.h | 24 +- bolt/Passes/DataflowInfoManager.cpp | 68 +- bolt/Passes/DataflowInfoManager.h | 14 + bolt/Passes/DominatorAnalysis.h | 24 +- bolt/Passes/FrameAnalysis.cpp | 35 +- bolt/Passes/FrameOptimizer.cpp | 813 ++-------- bolt/Passes/FrameOptimizer.h | 125 +- bolt/Passes/LivenessAnalysis.h | 13 + bolt/Passes/ReachingDefOrUse.h | 6 + bolt/Passes/ReachingInsns.h | 8 + bolt/Passes/ShrinkWrapping.cpp | 1785 +++++++++++++++++++++ bolt/Passes/ShrinkWrapping.h | 477 ++++++ bolt/Passes/StackAllocationAnalysis.cpp | 153 ++ bolt/Passes/StackAllocationAnalysis.h | 68 + bolt/Passes/StackAvailableExpressions.cpp | 132 ++ bolt/Passes/StackAvailableExpressions.h | 58 + bolt/Passes/StackPointerTracking.h | 6 + bolt/Passes/StackReachingUses.cpp | 112 ++ bolt/Passes/StackReachingUses.h | 71 + bolt/RewriteInstance.cpp | 3 + 32 files changed, 3609 insertions(+), 844 deletions(-) create mode 100644 bolt/Passes/AllocCombiner.cpp create mode 100644 bolt/Passes/AllocCombiner.h create mode 100644 bolt/Passes/ShrinkWrapping.cpp create mode 100644 bolt/Passes/ShrinkWrapping.h create mode 100644 bolt/Passes/StackAllocationAnalysis.cpp create mode 100644 bolt/Passes/StackAllocationAnalysis.h create mode 100644 bolt/Passes/StackAvailableExpressions.cpp create mode 100644 bolt/Passes/StackAvailableExpressions.h create mode 100644 bolt/Passes/StackReachingUses.cpp create mode 100644 bolt/Passes/StackReachingUses.h diff --git a/bolt/BinaryBasicBlock.cpp b/bolt/BinaryBasicBlock.cpp index e580995be8da..8a56beba1835 100644 --- a/bolt/BinaryBasicBlock.cpp +++ b/bolt/BinaryBasicBlock.cpp @@ -148,8 +148,9 @@ BinaryBasicBlock *BinaryBasicBlock::getLandingPad(const MCSymbol *Label) const { } int32_t BinaryBasicBlock::getCFIStateAtInstr(const MCInst *Instr) const { - assert(getFunction()->getState() == BinaryFunction::State::CFG && - "can only calculate CFI state when function is in active CFG state"); + assert( + getFunction()->getState() >= BinaryFunction::State::CFG && + "can only calculate CFI state when function is in or past the CFG state"); const auto &FDEProgram = getFunction()->getFDEProgram(); @@ -316,6 +317,38 @@ bool BinaryBasicBlock::analyzeBranch(const MCSymbol *&TBB, return MIA->analyzeBranch(Instructions, TBB, FBB, CondBranch, UncondBranch); } +MCInst *BinaryBasicBlock::getTerminatorBefore(MCInst *Pos) { + auto &BC = Function->getBinaryContext(); + auto Itr = rbegin(); + bool Check = Pos ? 
false : true; + MCInst *FirstTerminator{nullptr}; + while (Itr != rend()) { + if (!Check) { + if (&*Itr == Pos) + Check = true; + ++Itr; + continue; + } + if (BC.MIA->isTerminator(*Itr)) + FirstTerminator = &*Itr; + ++Itr; + } + return FirstTerminator; +} + +bool BinaryBasicBlock::hasTerminatorAfter(MCInst *Pos) { + auto &BC = Function->getBinaryContext(); + auto Itr = rbegin(); + while (Itr != rend()) { + if (&*Itr == Pos) + return false; + if (BC.MIA->isTerminator(*Itr)) + return true; + ++Itr; + } + return false; +} + bool BinaryBasicBlock::swapConditionalSuccessors() { if (succ_size() != 2) return false; diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h index ad1227279217..151ac321484c 100644 --- a/bolt/BinaryBasicBlock.h +++ b/bolt/BinaryBasicBlock.h @@ -617,20 +617,26 @@ public: return Instructions.erase(II); } + /// Retrieve iterator for \p Inst or return end iterator if instruction is not + /// from this basic block. + decltype(Instructions)::iterator findInstruction(const MCInst *Inst) { + if (Instructions.empty()) + return Instructions.end(); + size_t Index = Inst - &Instructions[0]; + return Index >= Instructions.size() ? Instructions.end() + : Instructions.begin() + Index; + } + /// Replace an instruction with a sequence of instructions. Returns true /// if the instruction to be replaced was found and replaced. template bool replaceInstruction(const MCInst *Inst, Itr Begin, Itr End) { - auto I = Instructions.end(); - auto B = Instructions.begin(); - while (I > B) { - --I; - if (&*I == Inst) { - adjustNumPseudos(*Inst, -1); - Instructions.insert(Instructions.erase(I), Begin, End); - adjustNumPseudos(Begin, End, 1); - return true; - } + auto I = findInstruction(Inst); + if (I != Instructions.end()) { + adjustNumPseudos(*Inst, -1); + Instructions.insert(Instructions.erase(I), Begin, End); + adjustNumPseudos(Begin, End, 1); + return true; } return false; } @@ -640,6 +646,23 @@ public: return replaceInstruction(Inst, Replacement.begin(), Replacement.end()); } + /// Insert \p NewInst before \p At, which must be an existing instruction in + /// this BB. Return a pointer to the newly inserted instruction. + iterator insertInstruction(iterator At, MCInst &&NewInst) { + adjustNumPseudos(NewInst, 1); + return Instructions.emplace(At, std::move(NewInst)); + } + + /// Helper to retrieve any terminators in \p BB before \p Pos. This is used + /// to skip CFI instructions and to retrieve the first terminator instruction + /// in basic blocks with two terminators (conditional jump and unconditional + /// jump). + MCInst *getTerminatorBefore(MCInst *Pos); + + /// Used to identify whether an instruction is before a terminator and whether + /// moving it to the end of the BB would render it dead code. + bool hasTerminatorAfter(MCInst *Pos); + /// Split apart the instructions in this basic block starting at Inst. /// The instructions following Inst are removed and returned in a vector. 
std::vector splitInstructions(const MCInst *Inst) { diff --git a/bolt/BinaryContext.cpp b/bolt/BinaryContext.cpp index 55926e96f608..26f29ab6f71a 100644 --- a/bolt/BinaryContext.cpp +++ b/bolt/BinaryContext.cpp @@ -239,24 +239,57 @@ void BinaryContext::preprocessDebugInfo( } } -void BinaryContext::printCFI(raw_ostream &OS, uint32_t Operation) { - switch(Operation) { - case MCCFIInstruction::OpSameValue: OS << "OpSameValue"; break; - case MCCFIInstruction::OpRememberState: OS << "OpRememberState"; break; - case MCCFIInstruction::OpRestoreState: OS << "OpRestoreState"; break; - case MCCFIInstruction::OpOffset: OS << "OpOffset"; break; - case MCCFIInstruction::OpDefCfaRegister: OS << "OpDefCfaRegister"; break; - case MCCFIInstruction::OpDefCfaOffset: OS << "OpDefCfaOffset"; break; - case MCCFIInstruction::OpDefCfa: OS << "OpDefCfa"; break; - case MCCFIInstruction::OpRelOffset: OS << "OpRelOffset"; break; - case MCCFIInstruction::OpAdjustCfaOffset: OS << "OfAdjustCfaOffset"; break; - case MCCFIInstruction::OpEscape: OS << "OpEscape"; break; - case MCCFIInstruction::OpRestore: OS << "OpRestore"; break; - case MCCFIInstruction::OpUndefined: OS << "OpUndefined"; break; - case MCCFIInstruction::OpRegister: OS << "OpRegister"; break; - case MCCFIInstruction::OpWindowSave: OS << "OpWindowSave"; break; - case MCCFIInstruction::OpGnuArgsSize: OS << "OpGnuArgsSize"; break; - default: OS << "Op#" << Operation; break; +void BinaryContext::printCFI(raw_ostream &OS, const MCCFIInstruction &Inst) { + uint32_t Operation = Inst.getOperation(); + switch (Operation) { + case MCCFIInstruction::OpSameValue: + OS << "OpSameValue Reg" << Inst.getRegister(); + break; + case MCCFIInstruction::OpRememberState: + OS << "OpRememberState"; + break; + case MCCFIInstruction::OpRestoreState: + OS << "OpRestoreState"; + break; + case MCCFIInstruction::OpOffset: + OS << "OpOffset Reg" << Inst.getRegister() << " " << Inst.getOffset(); + break; + case MCCFIInstruction::OpDefCfaRegister: + OS << "OpDefCfaRegister Reg" << Inst.getRegister(); + break; + case MCCFIInstruction::OpDefCfaOffset: + OS << "OpDefCfaOffset " << Inst.getOffset(); + break; + case MCCFIInstruction::OpDefCfa: + OS << "OpDefCfa Reg" << Inst.getRegister() << " " << Inst.getOffset(); + break; + case MCCFIInstruction::OpRelOffset: + OS << "OpRelOffset"; + break; + case MCCFIInstruction::OpAdjustCfaOffset: + OS << "OfAdjustCfaOffset"; + break; + case MCCFIInstruction::OpEscape: + OS << "OpEscape"; + break; + case MCCFIInstruction::OpRestore: + OS << "OpRestore"; + break; + case MCCFIInstruction::OpUndefined: + OS << "OpUndefined"; + break; + case MCCFIInstruction::OpRegister: + OS << "OpRegister"; + break; + case MCCFIInstruction::OpWindowSave: + OS << "OpWindowSave"; + break; + case MCCFIInstruction::OpGnuArgsSize: + OS << "OpGnuArgsSize"; + break; + default: + OS << "Op#" << Operation; + break; } } @@ -274,7 +307,7 @@ void BinaryContext::printInstruction(raw_ostream &OS, uint32_t Offset = Instruction.getOperand(0).getImm(); OS << "\t!CFI\t$" << Offset << "\t; "; if (Function) - printCFI(OS, Function->getCFIFor(Instruction)->getOperation()); + printCFI(OS, *Function->getCFIFor(Instruction)); OS << "\n"; return; } diff --git a/bolt/BinaryContext.h b/bolt/BinaryContext.h index 5b4d3169beae..fcc54e358cfa 100644 --- a/bolt/BinaryContext.h +++ b/bolt/BinaryContext.h @@ -143,6 +143,12 @@ public: const DataReader &DR; + /// Sum of execution count of all functions + uint64_t SumExecutionCount{0}; + + /// Number of functions with profile information + uint64_t 
NumProfiledFuncs{0};
+
   BinaryContext(std::unique_ptr<MCContext> Ctx,
                 std::unique_ptr<DWARFContext> DwCtx,
                 std::unique_ptr<Triple> TheTriple,
@@ -262,8 +268,19 @@ public:
     return Size;
   }
 
+  /// Return a function execution count threshold for determining whether
+  /// the function is 'hot'. Consider it hot if its count is above the
+  /// average execution count of profiled functions.
+  uint64_t getHotThreshold() const {
+    static uint64_t Threshold{0};
+    if (Threshold == 0) {
+      Threshold = NumProfiledFuncs ? SumExecutionCount / NumProfiledFuncs : 1;
+    }
+    return Threshold;
+  }
+
   /// Print the string name for a CFI operation.
-  static void printCFI(raw_ostream &OS, uint32_t Operation);
+  static void printCFI(raw_ostream &OS, const MCCFIInstruction &Inst);
 
   /// Print a single MCInst in native format. If Function is non-null,
   /// the instruction will be annotated with CFI and possibly DWARF line table
diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp
index 6bf637006923..410dc317b880 100644
--- a/bolt/BinaryFunction.cpp
+++ b/bolt/BinaryFunction.cpp
@@ -150,7 +150,7 @@ constexpr unsigned NoRegister = 0;
 constexpr const char *DynoStats::Desc[];
 constexpr unsigned BinaryFunction::MinAlign;
-
+
 namespace {
 
 /// Gets debug line information for the instruction located at the given
@@ -535,8 +535,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation,
     for (auto &Elmt : OffsetToCFI) {
       OS << format(" %08x:\t", Elmt.first);
       assert(Elmt.second < FrameInstructions.size() && "Incorrect CFI offset");
-      BinaryContext::printCFI(OS,
-                              FrameInstructions[Elmt.second].getOperation());
+      BinaryContext::printCFI(OS, FrameInstructions[Elmt.second]);
       OS << "\n";
     }
   } else {
@@ -544,7 +543,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation,
     for (uint32_t I = 0, E = FrameInstructions.size(); I != E; ++I) {
       const MCCFIInstruction &CFI = FrameInstructions[I];
       OS << format(" %d:\t", I);
-      BinaryContext::printCFI(OS, CFI.getOperation());
+      BinaryContext::printCFI(OS, CFI);
       OS << "\n";
     }
   }
@@ -3442,6 +3441,54 @@ void BinaryFunction::updateLayout(LayoutType Type,
   updateLayoutIndices();
 }
 
+bool BinaryFunction::replaceJumpTableEntryIn(BinaryBasicBlock *BB,
+                                             BinaryBasicBlock *OldDest,
+                                             BinaryBasicBlock *NewDest) {
+  auto *Instr = BB->getLastNonPseudoInstr();
+  if (!Instr || !BC.MIA->isIndirectBranch(*Instr))
+    return false;
+  auto JTAddress = BC.MIA->getJumpTable(*Instr);
+  assert(JTAddress && "Invalid jump table address");
+  auto *JT = getJumpTableContainingAddress(JTAddress);
+  assert(JT && "No jump table structure for this indirect branch");
+  bool Patched = JT->replaceDestination(JTAddress, OldDest->getLabel(),
+                                        NewDest->getLabel());
+  assert(Patched && "Invalid entry to be replaced in jump table");
+  return true;
+}
+
+BinaryBasicBlock *BinaryFunction::splitEdge(BinaryBasicBlock *From,
+                                            BinaryBasicBlock *To) {
+  // Create intermediate BB
+  MCSymbol *Tmp = BC.Ctx->createTempSymbol("SplitEdge", true);
+  auto NewBB = createBasicBlock(0, Tmp);
+  auto NewBBPtr = NewBB.get();
+
+  // Update "From" BB
+  auto I = From->succ_begin();
+  auto BI = From->branch_info_begin();
+  for (; I != From->succ_end(); ++I) {
+    if (*I == To)
+      break;
+    ++BI;
+  }
+  assert(I != From->succ_end() && "Invalid CFG edge in splitEdge!");
+  uint64_t OrigCount{BI->Count};
+  uint64_t OrigMispreds{BI->MispredictedCount};
+  replaceJumpTableEntryIn(From, To, NewBBPtr);
+  From->replaceSuccessor(To, NewBBPtr, OrigCount, OrigMispreds);
+
+  NewBB->addSuccessor(To, OrigCount, OrigMispreds);
+  NewBB->setExecutionCount(OrigCount);
+  NewBB->setIsCold(From->isCold());
+
+  // Update CFI and BB layout with new intermediate BB
+  std::vector<std::unique_ptr<BinaryBasicBlock>> NewBBs;
+  NewBBs.emplace_back(std::move(NewBB));
+  insertBasicBlocks(From, std::move(NewBBs), true, true);
+  return NewBBPtr;
+}
+
 bool BinaryFunction::isSymbolValidInScope(const SymbolRef &Symbol,
                                           uint64_t SymbolSize) const {
   // Some symbols are tolerated inside function bodies, others are not.
@@ -3578,6 +3625,22 @@ BinaryFunction::JumpTable::getEntriesForAddress(const uint64_t Addr) const {
   return std::make_pair(StartIndex, EndIndex);
 }
 
+bool BinaryFunction::JumpTable::replaceDestination(uint64_t JTAddress,
+                                                   const MCSymbol *OldDest,
+                                                   MCSymbol *NewDest) {
+  bool Patched{false};
+  const auto Range = getEntriesForAddress(JTAddress);
+  for (auto I = &Entries[Range.first], E = &Entries[Range.second];
+       I != E; ++I) {
+    auto &Entry = *I;
+    if (Entry == OldDest) {
+      Patched = true;
+      Entry = NewDest;
+    }
+  }
+  return Patched;
+}
+
 void BinaryFunction::JumpTable::updateOriginal(BinaryContext &BC) {
   // In non-relocation mode we have to emit jump tables in local sections.
   // This way we only overwrite them when a corresponding function is
diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h
index eeb5c52ed389..affbd39ade44 100644
--- a/bolt/BinaryFunction.h
+++ b/bolt/BinaryFunction.h
@@ -624,6 +624,11 @@ public:
     /// Total number of times this jump table was used.
     uint64_t Count{0};
 
+    /// Change all entries of the jump table at \p JTAddress pointing to
+    /// \p OldDest to \p NewDest. Return false if unsuccessful.
+    bool replaceDestination(uint64_t JTAddress, const MCSymbol *OldDest,
+                            MCSymbol *NewDest);
+
     /// Update jump table at its original location.
     void updateOriginal(BinaryContext &BC);
 
@@ -1368,6 +1373,21 @@ public:
   /// new blocks into the CFG. This must be called after updateLayout.
   void updateCFIState(BinaryBasicBlock *Start, const unsigned NumNewBlocks);
 
+  /// Change \p OldDest to \p NewDest in the jump table used at the end of
+  /// \p BB. Returns false if \p OldDest couldn't be found as a valid target
+  /// and no replacement took place.
+  bool replaceJumpTableEntryIn(BinaryBasicBlock *BB,
+                               BinaryBasicBlock *OldDest,
+                               BinaryBasicBlock *NewDest);
+
+  /// Split the CFG edge <From, To> by inserting an intermediate basic block.
+  /// Returns a pointer to this new intermediate basic block. BB "From" will be
+  /// updated to jump to the intermediate block, which in turn will have an
+  /// unconditional branch to BB "To".
+  /// User needs to manually call fixBranches(). This function only creates the
+  /// correct CFG edges.
+  BinaryBasicBlock *splitEdge(BinaryBasicBlock *From, BinaryBasicBlock *To);
+
   /// Determine direction of the branch based on the current layout.
   /// Callee is responsible of updating basic block indices prior to using
   /// this function (e.g. by calling BinaryFunction::updateLayoutIndices()).
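
The splitEdge/replaceJumpTableEntryIn pair added above is the CFG surgery that
shrink wrapping depends on when it pushes a register save down a specific edge.
A minimal sketch of how a client pass might use it (a hypothetical helper, not
part of this patch; it assumes the pre-existing BinaryBasicBlock::addInstruction
API and a caller that later runs fixBranches()):

  // Sink a callee-saved register spill so it executes only on the
  // (presumably cold) edge Hot -> Cold, instead of in Hot itself.
  void sinkSpillToEdge(BinaryFunction &BF, BinaryBasicBlock *Hot,
                       BinaryBasicBlock *Cold, MCInst &&Spill) {
    // splitEdge rewires the successor list and patches any jump table
    // entry in Hot that pointed to Cold; it does not fix branches, so
    // fixBranches() must run afterwards, per the API comment.
    BinaryBasicBlock *Mid = BF.splitEdge(Hot, Cold);
    // The intermediate block starts out empty: appending here means the
    // spill runs only when the Hot -> Cold edge is taken.
    Mid->addInstruction(std::move(Spill));
  }
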
diff --git a/bolt/BinaryPassManager.cpp b/bolt/BinaryPassManager.cpp
index 0dac7f0b1b04..eda6c575b3bc 100644
--- a/bolt/BinaryPassManager.cpp
+++ b/bolt/BinaryPassManager.cpp
@@ -10,6 +10,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "BinaryPassManager.h"
+#include "Passes/AllocCombiner.h"
 #include "Passes/FrameOptimizer.h"
 #include "Passes/IndirectCallPromotion.h"
 #include "Passes/Inliner.h"
@@ -62,12 +63,6 @@ OptimizeBodylessFunctions("optimize-bodyless-functions",
   cl::ZeroOrMore,
   cl::cat(BoltOptCategory));
 
-static cl::opt<bool>
-OptimizeFrameAccesses("frame-opt",
-  cl::desc("optimize stack frame accesses"),
-  cl::ZeroOrMore,
-  cl::cat(BoltOptCategory));
-
 static cl::opt<bool>
 Peepholes("peepholes",
   cl::desc("run peephole optimizations"),
@@ -331,9 +326,6 @@ void BinaryFunctionPassManager::runAllPasses(
   // fix branches consistency internally.
   Manager.registerPass(llvm::make_unique<FixupBranches>(PrintAfterBranchFixup));
 
-  Manager.registerPass(llvm::make_unique<FrameOptimizerPass>(PrintFOP),
-                       OptimizeFrameAccesses);
-
   // This pass should come close to last since it uses the estimated hot
   // size of a function to determine the order. It should definitely
   // also happen after any changes to the call graph are made, e.g. inlining.
@@ -356,6 +348,14 @@ void BinaryFunctionPassManager::runAllPasses(
   // This pass should always run last.*
   Manager.registerPass(llvm::make_unique<FinalizeFunctions>(PrintFinalized));
 
+  // FrameOptimizer has an implicit dependency on FinalizeFunctions.
+  // FrameOptimizer moves values around and needs to update CFIs. To do this,
+  // it must read CFI, interpret it and rewrite it, so CFIs need to be
+  // correctly placed according to the final layout.
+  Manager.registerPass(llvm::make_unique<FrameOptimizerPass>(PrintFOP));
+
+  Manager.registerPass(llvm::make_unique<AllocCombinerPass>(PrintFOP));
+
   // *except for this pass. This pass turns tail calls into jumps which
   // makes them invisible to function reordering.
Manager.registerPass(
diff --git a/bolt/Passes/AllocCombiner.cpp b/bolt/Passes/AllocCombiner.cpp
new file mode 100644
index 000000000000..6d9c82732012
--- /dev/null
+++ b/bolt/Passes/AllocCombiner.cpp
@@ -0,0 +1,116 @@
+#include "AllocCombiner.h"
+
+#define DEBUG_TYPE "alloccombiner"
+
+using namespace llvm;
+
+namespace opts {
+extern bool shouldProcess(const bolt::BinaryFunction &Function);
+
+extern cl::opt<bolt::FrameOptimizationType> FrameOptimization;
+
+} // end namespace opts
+
+namespace llvm {
+namespace bolt {
+
+namespace {
+
+bool getStackAdjustmentSize(const BinaryContext &BC, const MCInst &Inst,
+                            int64_t &Adjustment) {
+  return BC.MIA->evaluateSimple(Inst, Adjustment,
+                                std::make_pair(BC.MIA->getStackPointer(), 0LL),
+                                std::make_pair(0, 0LL));
+}
+
+bool isIndifferentToSP(const MCInst &Inst, const BinaryContext &BC) {
+  if (BC.MIA->isCFI(Inst))
+    return true;
+
+  const auto II = BC.MII->get(Inst.getOpcode());
+  if (BC.MIA->isTerminator(Inst) ||
+      II.hasImplicitDefOfPhysReg(BC.MIA->getStackPointer(), BC.MRI.get()) ||
+      II.hasImplicitUseOfPhysReg(BC.MIA->getStackPointer()))
+    return false;
+
+  for (int I = 0, E = Inst.getNumOperands(); I != E; ++I) {
+    const auto &Operand = Inst.getOperand(I);
+    if (Operand.isReg() && Operand.getReg() == BC.MIA->getStackPointer()) {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool shouldProc(BinaryFunction &Function) {
+  return Function.isSimple() && Function.hasCFG() &&
+         opts::shouldProcess(Function) && (Function.getSize() > 0);
+}
+
+void runForAllWeCare(std::map<uint64_t, BinaryFunction> &BFs,
+                     std::function<void(BinaryFunction &)> Task) {
+  for (auto &It : BFs) {
+    auto &Function = It.second;
+    if (shouldProc(Function))
+      Task(Function);
+  }
+}
+
+} // end anonymous namespace
+
+void AllocCombinerPass::combineAdjustments(BinaryContext &BC,
+                                           BinaryFunction &BF) {
+  for (auto &BB : BF) {
+    MCInst *Prev = nullptr;
+    for (auto I = BB.rbegin(), E = BB.rend(); I != E; ++I) {
+      auto &Inst = *I;
+      if (isIndifferentToSP(Inst, BC))
+        continue; // Skip updating Prev
+
+      int64_t Adjustment{0LL};
+      if (!Prev || !BC.MIA->isStackAdjustment(Inst) ||
+          !BC.MIA->isStackAdjustment(*Prev) ||
+          !getStackAdjustmentSize(BC, *Prev, Adjustment)) {
+        Prev = &Inst;
+        continue;
+      }
+
+      DEBUG({
+        dbgs() << "At \"" << BF.getPrintName() << "\", combining: \n";
+        Inst.dump();
+        Prev->dump();
+        dbgs() << "Adjustment: " << Adjustment << "\n";
+      });
+
+      if (BC.MIA->isSUB(Inst))
+        Adjustment = -Adjustment;
+
+      BC.MIA->addToImm(Inst, Adjustment, BC.Ctx.get());
+
+      DEBUG({
+        dbgs() << "After adjustment:\n";
+        Inst.dump();
+      });
+
+      BB.eraseInstruction(Prev);
+      ++NumCombined;
+      Prev = &Inst;
+    }
+  }
+}
+
+void AllocCombinerPass::runOnFunctions(BinaryContext &BC,
+                                       std::map<uint64_t, BinaryFunction> &BFs,
+                                       std::set<uint64_t> &LargeFunctions) {
+  if (opts::FrameOptimization == FOP_NONE)
+    return;
+
+  runForAllWeCare(
+      BFs, [&](BinaryFunction &Function) { combineAdjustments(BC, Function); });
+
+  outs() << "BOLT-INFO: Allocation combiner: " << NumCombined
+         << " empty spaces coalesced.\n";
+}
+
+} // end namespace bolt
+} // end namespace llvm
diff --git a/bolt/Passes/AllocCombiner.h b/bolt/Passes/AllocCombiner.h
new file mode 100644
index 000000000000..1be39974be3c
--- /dev/null
+++ b/bolt/Passes/AllocCombiner.h
@@ -0,0 +1,48 @@
+//===--- Passes/AllocCombiner.h -------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_FRAMEDEFRAG_H +#define LLVM_TOOLS_LLVM_BOLT_PASSES_FRAMEDEFRAG_H + +#include "BinaryPasses.h" +#include "DataflowInfoManager.h" + +namespace llvm { +namespace bolt { + +class AllocCombinerPass : public BinaryFunctionPass { + /// Stats aggregating variables + uint64_t NumCombined{0}; + uint64_t NumCoalesced{0}; + + void combineAdjustments(BinaryContext &BC, BinaryFunction &BF); + void coalesceEmptySpace(BinaryContext &BC, BinaryFunction &BF, + DataflowInfoManager &Info, FrameAnalysis &FA); + +public: + explicit AllocCombinerPass(const cl::opt &PrintPass) + : BinaryFunctionPass(PrintPass) {} + + const char *getName() const override { + return "alloc-combiner"; + } + + /// Pass entry point + void runOnFunctions(BinaryContext &BC, + std::map &BFs, + std::set &LargeFunctions) override; +}; + +} // namespace bolt +} // namespace llvm + + +#endif diff --git a/bolt/Passes/BinaryPasses.cpp b/bolt/Passes/BinaryPasses.cpp index d04b64e3955c..0d832cc90b5f 100644 --- a/bolt/Passes/BinaryPasses.cpp +++ b/bolt/Passes/BinaryPasses.cpp @@ -584,9 +584,11 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC, auto BI = PredBB->branch_info_begin(); std::swap(*BI, *(BI + 1)); } else { - // Change destination of the unconditional branch. + // Change destination of the conditional branch. MIA->replaceBranchTarget(*CondBranch, CalleeSymbol, BC.Ctx.get()); } + // Annotate it, so "isCall" returns true for this jcc + MIA->addAnnotation(BC.Ctx.get(), *CondBranch, "IsCTC", true); // Remove the unused successor which may be eliminated later // if there are no other users. diff --git a/bolt/Passes/BinaryPasses.h b/bolt/Passes/BinaryPasses.h index 2a25ec656ebd..d80876ca79b4 100644 --- a/bolt/Passes/BinaryPasses.h +++ b/bolt/Passes/BinaryPasses.h @@ -359,6 +359,12 @@ public: std::set &LargeFunctions) override; }; +enum FrameOptimizationType : char { + FOP_NONE, /// Don't perform FOP. + FOP_HOT, /// Perform FOP on hot functions. + FOP_ALL /// Perform FOP on all functions. +}; + } // namespace bolt } // namespace llvm diff --git a/bolt/Passes/CMakeLists.txt b/bolt/Passes/CMakeLists.txt index 3fc9336f29c6..7d9714893c45 100644 --- a/bolt/Passes/CMakeLists.txt +++ b/bolt/Passes/CMakeLists.txt @@ -1,4 +1,5 @@ add_llvm_library(LLVMBOLTPasses + AllocCombiner.cpp BinaryPasses.cpp BinaryFunctionCallGraph.cpp CallGraph.cpp @@ -14,7 +15,11 @@ add_llvm_library(LLVMBOLTPasses PettisAndHansen.cpp ReorderAlgorithm.cpp ReorderFunctions.cpp + ShrinkWrapping.cpp + StackAllocationAnalysis.cpp + StackAvailableExpressions.cpp StackPointerTracking.cpp + StackReachingUses.cpp ) include_directories( ${LLVM_MAIN_SRC_DIR}/tools/llvm-bolt ) diff --git a/bolt/Passes/DataflowAnalysis.h b/bolt/Passes/DataflowAnalysis.h index 1252be07eaa2..e28d2b085c29 100644 --- a/bolt/Passes/DataflowAnalysis.h +++ b/bolt/Passes/DataflowAnalysis.h @@ -265,12 +265,13 @@ public: return getStateAt(*Point.getInst()); } + /// Relies on a ptr map to fetch the previous instruction and then retrieve + /// state. WARNING: Watch out for invalidated pointers. 
Do not use this + /// function if you invalidated pointers after the analysis has been completed ErrorOr getStateBefore(const MCInst &Point) { return getStateAt(PrevPoint[&Point]); } - /// Return the in set (out set) of a given program point if the direction of - /// the dataflow is forward (backward). ErrorOrgetStateBefore(ProgramPoint Point) { if (Point.isBB()) return getStateAt(*Point.getBB()); @@ -491,6 +492,25 @@ public: /// Maps expressions defs (MCInsts) to its index in the Expressions vector std::unordered_map ExprToIdx; + /// Return whether \p Expr is in the state set at \p Point + bool count(ProgramPoint Point, const MCInst &Expr) const { + auto IdxIter = ExprToIdx.find(&Expr); + assert (IdxIter != ExprToIdx.end() && "Invalid Expr"); + return (*this->getStateAt(Point))[IdxIter->second]; + } + + bool count(const MCInst &Point, const MCInst &Expr) const { + auto IdxIter = ExprToIdx.find(&Expr); + assert (IdxIter != ExprToIdx.end() && "Invalid Expr"); + return (*this->getStateAt(Point))[IdxIter->second]; + } + + /// Return whether \p Expr is in the state set at the instr of index + /// \p PointIdx + bool count(unsigned PointIdx, const MCInst &Expr) const { + return count(*Expressions[PointIdx], Expr); + } + InstrsDataflowAnalysis(const BinaryContext &BC, BinaryFunction &BF) : DataflowAnalysis(BC, BF) {} virtual ~InstrsDataflowAnalysis() {} diff --git a/bolt/Passes/DataflowInfoManager.cpp b/bolt/Passes/DataflowInfoManager.cpp index 0c4cdbe99e06..e280c1554b3d 100644 --- a/bolt/Passes/DataflowInfoManager.cpp +++ b/bolt/Passes/DataflowInfoManager.cpp @@ -20,10 +20,7 @@ ReachingDefOrUse &DataflowInfoManager::getReachingDefs() { return *RD; assert(FA && "FrameAnalysis required"); RD.reset(new ReachingDefOrUse(*FA, BC, BF)); - { - NamedRegionTimer T1("RD", "Dataflow", true); - RD->run(); - } + RD->run(); return *RD; } @@ -36,10 +33,7 @@ ReachingDefOrUse &DataflowInfoManager::getReachingUses() { return *RU; assert(FA && "FrameAnalysis required"); RU.reset(new ReachingDefOrUse(*FA, BC, BF)); - { - NamedRegionTimer T1("RU", "Dataflow", true); - RU->run(); - } + RU->run(); return *RU; } @@ -52,10 +46,7 @@ LivenessAnalysis &DataflowInfoManager::getLivenessAnalysis() { return *LA; assert(FA && "FrameAnalysis required"); LA.reset(new LivenessAnalysis(*FA, BC, BF)); - { - NamedRegionTimer T1("LA", "Dataflow", true); - LA->run(); - } + LA->run(); return *LA; } @@ -63,14 +54,24 @@ void DataflowInfoManager::invalidateLivenessAnalysis() { LA.reset(nullptr); } +StackReachingUses &DataflowInfoManager::getStackReachingUses() { + if (SRU) + return *SRU; + assert(FA && "FrameAnalysis required"); + SRU.reset(new StackReachingUses(*FA, BC, BF)); + SRU->run(); + return *SRU; +} + +void DataflowInfoManager::invalidateStackReachingUses() { + SRU.reset(nullptr); +} + DominatorAnalysis &DataflowInfoManager::getDominatorAnalysis() { if (DA) return *DA; DA.reset(new DominatorAnalysis(BC, BF)); - { - NamedRegionTimer T1("DA", "Dataflow", true); - DA->run(); - } + DA->run(); return *DA; } @@ -82,10 +83,7 @@ DominatorAnalysis &DataflowInfoManager::getPostDominatorAnalysis() { if (PDA) return *PDA; PDA.reset(new DominatorAnalysis(BC, BF)); - { - NamedRegionTimer T1("PDA", "Dataflow", true); - PDA->run(); - } + PDA->run(); return *PDA; } @@ -97,14 +95,12 @@ StackPointerTracking &DataflowInfoManager::getStackPointerTracking() { if (SPT) return *SPT; SPT.reset(new StackPointerTracking(BC, BF)); - { - NamedRegionTimer T1("SPT", "Dataflow", true); - SPT->run(); - } + SPT->run(); return *SPT; } void 
DataflowInfoManager::invalidateStackPointerTracking() { + invalidateStackAllocationAnalysis(); SPT.reset(nullptr); } @@ -112,10 +108,7 @@ ReachingInsns &DataflowInfoManager::getReachingInsns() { if (RI) return *RI; RI.reset(new ReachingInsns(BC, BF)); - { - NamedRegionTimer T1("RI", "Dataflow", true); - RI->run(); - } + RI->run(); return *RI; } @@ -127,10 +120,7 @@ ReachingInsns &DataflowInfoManager::getReachingInsnsBackwards() { if (RIB) return *RIB; RIB.reset(new ReachingInsns(BC, BF)); - { - NamedRegionTimer T1("RIB", "Dataflow", true); - RIB->run(); - } + RIB->run(); return *RIB; } @@ -138,6 +128,18 @@ void DataflowInfoManager::invalidateReachingInsnsBackwards() { RIB.reset(nullptr); } +StackAllocationAnalysis &DataflowInfoManager::getStackAllocationAnalysis() { + if (SAA) + return *SAA; + SAA.reset(new StackAllocationAnalysis(BC, BF, getStackPointerTracking())); + SAA->run(); + return *SAA; +} + +void DataflowInfoManager::invalidateStackAllocationAnalysis() { + SAA.reset(nullptr); +} + std::unordered_map & DataflowInfoManager::getInsnToBBMap() { if (InsnToBB) @@ -158,11 +160,13 @@ void DataflowInfoManager::invalidateAll() { invalidateReachingDefs(); invalidateReachingUses(); invalidateLivenessAnalysis(); + invalidateStackReachingUses(); invalidateDominatorAnalysis(); invalidatePostDominatorAnalysis(); invalidateStackPointerTracking(); invalidateReachingInsns(); invalidateReachingInsnsBackwards(); + invalidateStackAllocationAnalysis(); invalidateInsnToBBMap(); } diff --git a/bolt/Passes/DataflowInfoManager.h b/bolt/Passes/DataflowInfoManager.h index a9ef9f7d897d..34a6b64bef15 100644 --- a/bolt/Passes/DataflowInfoManager.h +++ b/bolt/Passes/DataflowInfoManager.h @@ -14,10 +14,12 @@ #include "FrameAnalysis.h" #include "ReachingDefOrUse.h" +#include "StackReachingUses.h" #include "DominatorAnalysis.h" #include "StackPointerTracking.h" #include "ReachingInsns.h" #include "LivenessAnalysis.h" +#include "StackAllocationAnalysis.h" namespace llvm { namespace bolt { @@ -33,11 +35,13 @@ class DataflowInfoManager { std::unique_ptr> RD; std::unique_ptr> RU; std::unique_ptr LA; + std::unique_ptr SRU; std::unique_ptr> DA; std::unique_ptr> PDA; std::unique_ptr SPT; std::unique_ptr> RI; std::unique_ptr> RIB; + std::unique_ptr SAA; std::unique_ptr> InsnToBB; @@ -45,12 +49,20 @@ public: DataflowInfoManager(const FrameAnalysis *FA, const BinaryContext &BC, BinaryFunction &BF) : FA(FA), BC(BC), BF(BF) {}; + /// Helper function to fetch the parent BB associated with a program point + /// If PP is a BB itself, then return itself (cast to a BinaryBasicBlock) + BinaryBasicBlock *getParentBB(ProgramPoint PP) { + return PP.isBB() ? 
PP.getBB() : getInsnToBBMap()[PP.getInst()]; + } + ReachingDefOrUse &getReachingDefs(); void invalidateReachingDefs(); ReachingDefOrUse &getReachingUses(); void invalidateReachingUses(); LivenessAnalysis &getLivenessAnalysis(); void invalidateLivenessAnalysis(); + StackReachingUses &getStackReachingUses(); + void invalidateStackReachingUses(); DominatorAnalysis &getDominatorAnalysis(); void invalidateDominatorAnalysis(); DominatorAnalysis &getPostDominatorAnalysis(); @@ -61,6 +73,8 @@ public: void invalidateReachingInsns(); ReachingInsns &getReachingInsnsBackwards(); void invalidateReachingInsnsBackwards(); + StackAllocationAnalysis &getStackAllocationAnalysis(); + void invalidateStackAllocationAnalysis(); std::unordered_map &getInsnToBBMap(); void invalidateInsnToBBMap(); void invalidateAll(); diff --git a/bolt/Passes/DominatorAnalysis.h b/bolt/Passes/DominatorAnalysis.h index 87eef5f7662f..4abc508e78f0 100644 --- a/bolt/Passes/DominatorAnalysis.h +++ b/bolt/Passes/DominatorAnalysis.h @@ -13,6 +13,7 @@ #define LLVM_TOOLS_LLVM_BOLT_PASSES_DOMINATORANALYSIS_H #include "DataflowAnalysis.h" +#include "llvm/Support/Timer.h" namespace llvm { namespace bolt { @@ -60,13 +61,21 @@ public: return Result; } - bool doesADominatesB(const MCInst &A, const MCInst &B) { - return (*this->getStateAt(B))[this->ExprToIdx[&A]]; + bool doesADominateB(const MCInst &A, unsigned BIdx) { + return this->count(BIdx, A); } - bool doesADominatesB(ProgramPoint A, const MCInst &B) { + bool doesADominateB(const MCInst &A, const MCInst &B) { + return this->count(B, A); + } + + bool doesADominateB(const MCInst &A, ProgramPoint B) { + return this->count(B, A); + } + + bool doesADominateB(ProgramPoint A, const MCInst &B) { if (A.isInst()) - return doesADominatesB(*A.getInst(), B); + return doesADominateB(*A.getInst(), B); // This analysis keep track of which instructions dominates another // instruction, it doesn't keep track of BBs. 
So we need a non-empty @@ -79,7 +88,7 @@ public: BB = *BB->succ_begin(); } const MCInst &InstA = *BB->begin(); - return doesADominatesB(InstA, B); + return doesADominateB(InstA, B); } void doForAllDominators(const MCInst &Inst, @@ -89,6 +98,11 @@ public: } } + void run() { + NamedRegionTimer T1("DA", "Dataflow", true); + InstrsDataflowAnalysis, Backward>::run(); + } + private: void preflight() { // Populate our universe of tracked expressions with all instructions diff --git a/bolt/Passes/FrameAnalysis.cpp b/bolt/Passes/FrameAnalysis.cpp index 7c0110926381..38d770ad679a 100644 --- a/bolt/Passes/FrameAnalysis.cpp +++ b/bolt/Passes/FrameAnalysis.cpp @@ -215,6 +215,12 @@ public: void FrameAnalysis::addArgAccessesFor(const BinaryContext &BC, MCInst &Inst, ArgAccesses &&AA) { + if (auto OldAA = getArgAccessesFor(BC, Inst)) { + if (OldAA->AssumeEverything) + return; + *OldAA = std::move(AA); + return; + } if (AA.AssumeEverything) { // Index 0 in ArgAccessesVector represents an "assumeeverything" entry BC.MIA->addAnnotation(BC.Ctx.get(), Inst, "ArgAccessEntry", 0U); @@ -222,7 +228,7 @@ void FrameAnalysis::addArgAccessesFor(const BinaryContext &BC, MCInst &Inst, } BC.MIA->addAnnotation(BC.Ctx.get(), Inst, "ArgAccessEntry", (unsigned)ArgAccessesVector.size()); - ArgAccessesVector.emplace_back(AA); + ArgAccessesVector.emplace_back(std::move(AA)); } void FrameAnalysis::addArgInStackAccessFor(const BinaryContext &BC, @@ -329,29 +335,39 @@ BitVector FrameAnalysis::getFunctionClobberList(const BinaryContext &BC, void FrameAnalysis::buildClobberMap(const BinaryContext &BC) { std::queue Queue; + std::set InQueue; for (auto *Func : TopologicalCGOrder) { Queue.push(Func); + InQueue.insert(Func); } while (!Queue.empty()) { auto *Func = Queue.front(); Queue.pop(); + InQueue.erase(Func); BitVector RegsKilled = getFunctionClobberList(BC, Func); - bool Updated = ClobberAnalysisOnly ? false : computeArgsAccessed(BC, *Func); + bool ArgsUpdated = ClobberAnalysisOnly ? 
false : computeArgsAccessed(BC, *Func); + bool RegsUpdated = false; if (RegsKilledMap.find(Func) == RegsKilledMap.end()) { RegsKilledMap[Func] = std::move(RegsKilled); - continue; + } else { + RegsUpdated = RegsKilledMap[Func] != RegsKilled; + if (RegsUpdated) + RegsKilledMap[Func] = std::move(RegsKilled); } - if (RegsKilledMap[Func] != RegsKilled || Updated) { + if (RegsUpdated || ArgsUpdated) { for (auto Caller : Cg.predecessors(Cg.getNodeId(Func))) { - Queue.push(Cg.nodeIdToFunc(Caller)); + BinaryFunction *CallerFunc = Cg.nodeIdToFunc(Caller); + if (!InQueue.count(CallerFunc)) { + InQueue.insert(CallerFunc); + Queue.push(CallerFunc); + } } } - RegsKilledMap[Func] = std::move(RegsKilled); } if (opts::Verbosity == 0) { @@ -453,10 +469,11 @@ bool FrameAnalysis::updateArgsTouchedFor(const BinaryContext &BC, break; } DEBUG(dbgs() << "Added arg in stack access annotation " - << CurOffset + Elem.first << "\n"); + << CurOffset + Elem.first << "\n"); addArgInStackAccessFor( - BC, Inst, ArgInStackAccess{/*StackOffset=*/CurOffset + Elem.first, - /*Size=*/Elem.second}); + BC, Inst, + ArgInStackAccess{/*StackOffset=*/CurOffset + Elem.first, + /*Size=*/Elem.second}); } return Changed; } diff --git a/bolt/Passes/FrameOptimizer.cpp b/bolt/Passes/FrameOptimizer.cpp index 7f02b840ba64..4662cf87515b 100644 --- a/bolt/Passes/FrameOptimizer.cpp +++ b/bolt/Passes/FrameOptimizer.cpp @@ -10,6 +10,11 @@ //===----------------------------------------------------------------------===// #include "FrameOptimizer.h" +#include "FrameAnalysis.h" +#include "ShrinkWrapping.h" +#include "StackAvailableExpressions.h" +#include "StackReachingUses.h" +#include "llvm/Support/Timer.h" #include #include @@ -19,616 +24,34 @@ using namespace llvm; namespace opts { extern cl::opt Verbosity; -} +extern cl::OptionCategory BoltOptCategory; + +using namespace bolt; + +cl::opt +FrameOptimization("frame-opt", + cl::init(FOP_NONE), + cl::desc("optimize stack frame accesses"), + cl::values( + clEnumValN(FOP_NONE, "none", "do not perform frame optimization"), + clEnumValN(FOP_HOT, "hot", "perform FOP on hot functions"), + clEnumValN(FOP_ALL, "all", "perform FOP on all functions"), + clEnumValEnd), + cl::ZeroOrMore, + cl::cat(BoltOptCategory)); + +} // namespace opts namespace llvm { namespace bolt { -void FrameOptimizerPass::getInstClobberList(const BinaryContext &BC, - const MCInst &Inst, - BitVector &KillSet) const { - if (!BC.MIA->isCall(Inst)) { - BC.MIA->getClobberedRegs(Inst, KillSet, *BC.MRI); - return; - } - - const auto *TargetSymbol = BC.MIA->getTargetSymbol(Inst); - // If indirect call, kill set should have all elements - if (TargetSymbol == nullptr) { - KillSet.set(0, KillSet.size()); - return; - } - - const auto *Function = BC.getFunctionForSymbol(TargetSymbol); - if (Function == nullptr) { - // Call to a function without a BinaryFunction object. - // This should be a call to a PLT entry, and since it is a trampoline to - // a DSO, we can't really know the code in advance. Conservatively assume - // everything is clobbered. - KillSet.set(0, KillSet.size()); - return; - } - auto BV = RegsKilledMap.find(Function); - if (BV != RegsKilledMap.end()) { - KillSet |= BV->second; - return; - } - // Ignore calls to function whose clobber list wasn't yet calculated. This - // instruction will be evaluated again once we have info for the callee. 
- return; -} - -BitVector -FrameOptimizerPass::getFunctionClobberList(const BinaryContext &BC, - const BinaryFunction *Func) { - BitVector RegsKilled = BitVector(BC.MRI->getNumRegs(), false); - - if (!Func->isSimple() || !shouldOptimize(*Func)) { - RegsKilled.set(0, RegsKilled.size()); - return RegsKilled; - } - - for (const auto &BB : *Func) { - for (const auto &Inst : BB) { - getInstClobberList(BC, Inst, RegsKilled); - } - } - - return RegsKilled; -} - -void FrameOptimizerPass::buildClobberMap(const BinaryContext &BC) { - std::queue Queue; - - for (auto *Func : TopologicalCGOrder) { - Queue.push(Func); - } - - while (!Queue.empty()) { - auto *Func = Queue.front(); - Queue.pop(); - - BitVector RegsKilled = getFunctionClobberList(BC, Func); - - if (RegsKilledMap.find(Func) == RegsKilledMap.end()) { - RegsKilledMap[Func] = std::move(RegsKilled); - continue; - } - - if (RegsKilledMap[Func] != RegsKilled) { - for (auto Caller : Cg.predecessors(Cg.getNodeId(Func))) { - Queue.push(Cg.nodeIdToFunc(Caller)); - } - } - RegsKilledMap[Func] = std::move(RegsKilled); - } - - if (opts::Verbosity == 0) { -#ifndef NDEBUG - if (!DebugFlag || !isCurrentDebugType("fop")) - return; -#else - return; -#endif - } - - // This loop is for computing statistics only - for (auto *Func : TopologicalCGOrder) { - auto Iter = RegsKilledMap.find(Func); - assert(Iter != RegsKilledMap.end() && - "Failed to compute all clobbers list"); - if (Iter->second.all()) { - auto Count = Func->getExecutionCount(); - if (Count != BinaryFunction::COUNT_NO_PROFILE) - CountFunctionsAllClobber += Count; - ++NumFunctionsAllClobber; - } - DEBUG_WITH_TYPE("fop", - dbgs() << "Killed regs set for func: " << Func->getPrintName() << "\n"; - const BitVector &RegsKilled = Iter->second; - int RegIdx = RegsKilled.find_first(); - while (RegIdx != -1) { - dbgs() << "\tREG" << RegIdx; - RegIdx = RegsKilled.find_next(RegIdx); - }; - dbgs() << "\n"; - ); - } -} - -namespace { - -template -class ForwardDataflow { -protected: - /// Reference to the function being analysed - const BinaryFunction &Func; - - /// Tracks the set of available exprs at the end of each MCInst in this - /// function - std::unordered_map StateAtPoint; - /// Tracks the set of available exprs at basic block start - std::unordered_map StateAtBBEntry; - - virtual void preflight() = 0; - - virtual StateTy getStartingStateAtBB(const BinaryBasicBlock &BB) = 0; - - virtual StateTy getStartingStateAtPoint(const MCInst &Point) = 0; - - virtual void doConfluence(StateTy &StateOut, const StateTy &StateIn) = 0; - - virtual StateTy computeNext(const MCInst &Point, const StateTy &Cur) = 0; - -public: - ForwardDataflow(const BinaryFunction &BF) : Func(BF) {} - virtual ~ForwardDataflow() {} - - ErrorOrgetStateAt(const BinaryBasicBlock &BB) const { - auto Iter = StateAtBBEntry.find(&BB); - if (Iter == StateAtBBEntry.end()) - return make_error_code(errc::result_out_of_range); - return Iter->second; - } - - ErrorOrgetStateAt(const MCInst &Point) const { - auto Iter = StateAtPoint.find(&Point); - if (Iter == StateAtPoint.end()) - return make_error_code(errc::result_out_of_range); - return Iter->second; - } - - void run() { - preflight(); - - // Initialize state for all points of the function - for (auto &BB : Func) { - StateAtBBEntry[&BB] = getStartingStateAtBB(BB); - for (auto &Inst : BB) { - StateAtPoint[&Inst] = getStartingStateAtPoint(Inst); - } - } - assert(Func.begin() != Func.end() && "Unexpected empty function"); - - std::queue Worklist; - // TODO: Pushing this in a DFS ordering will greatly 
speed up the dataflow - // performance. - for (auto &BB : Func) { - Worklist.push(&BB); - } - - // Main dataflow loop - while (!Worklist.empty()) { - auto *BB = Worklist.front(); - Worklist.pop(); - - DEBUG(dbgs() << "\tNow at BB " << BB->getName() << "\n"); - - // Calculate state at the entry of first instruction in BB - StateTy &StateAtEntry = StateAtBBEntry[BB]; - for (auto I = BB->pred_begin(), E = BB->pred_end(); I != E; ++I) { - auto Last = (*I)->rbegin(); - if (Last != (*I)->rend()) { - doConfluence(StateAtEntry, StateAtPoint[&*Last]); - } else { - doConfluence(StateAtEntry, StateAtBBEntry[*I]); - } - } - // Skip empty - if (BB->begin() == BB->end()) - continue; - - // Propagate information from first instruction down to the last one - bool Changed = false; - StateTy *PrevState = &StateAtEntry; - const MCInst *LAST = &*BB->rbegin(); - for (auto &Inst : *BB) { - DEBUG(dbgs() << "\t\tNow at "); - DEBUG(Inst.dump()); - - StateTy CurState = computeNext(Inst, *PrevState); - - if (StateAtPoint[&Inst] != CurState) { - StateAtPoint[&Inst] = CurState; - if (&Inst == LAST) - Changed = true; - } - PrevState = &StateAtPoint[&Inst]; - } - - if (Changed) { - for (auto I = BB->succ_begin(), E = BB->succ_end(); I != E; ++I) { - Worklist.push(*I); - } - } - } - } -}; - -class StackAvailableExpressions : public ForwardDataflow { -public: - StackAvailableExpressions(const FrameOptimizerPass &FOP, - const BinaryContext &BC, const BinaryFunction &BF) - : ForwardDataflow(BF), FOP(FOP), FrameIndexMap(FOP.FrameIndexMap), - BC(BC) {} - virtual ~StackAvailableExpressions() {} - - /// Define an iterator for navigating the expressions calculated by the - /// dataflow at each program point - class ExprIterator - : public std::iterator { - public: - ExprIterator &operator++() { - assert(Idx != -1 && "Iterator already at the end"); - Idx = BV->find_next(Idx); - return *this; - } - ExprIterator operator++(int) { - assert(Idx != -1 && "Iterator already at the end"); - ExprIterator Ret = *this; - ++(*this); - return Ret; - } - bool operator==(ExprIterator Other) const { return Idx == Other.Idx; } - bool operator!=(ExprIterator Other) const { return Idx != Other.Idx; } - const MCInst *operator*() { - assert(Idx != -1 && "Invalid access to end iterator"); - return Expressions[Idx]; - } - ExprIterator(const BitVector *BV, const std::vector &Exprs) - : BV(BV), Expressions(Exprs) { - Idx = BV->find_first(); - } - ExprIterator(const BitVector *BV, const std::vector &Exprs, - int Idx) - : BV(BV), Expressions(Exprs), Idx(Idx) {} - - private: - const BitVector *BV; - const std::vector &Expressions; - public: - int Idx; - }; - ExprIterator expr_begin(const BitVector &BV) const { - return ExprIterator(&BV, Expressions); - } - ExprIterator expr_begin(const MCInst &Point) const { - auto Iter = StateAtPoint.find(&Point); - if (Iter == StateAtPoint.end()) - return expr_end(); - return ExprIterator(&Iter->second, Expressions); - } - ExprIterator expr_begin(const BinaryBasicBlock &BB) const { - auto Iter = StateAtBBEntry.find(&BB); - if (Iter == StateAtBBEntry.end()) - return expr_end(); - return ExprIterator(&Iter->second, Expressions); - } - ExprIterator expr_end() const { - return ExprIterator(nullptr, Expressions, -1); - } - -private: - /// Reference to the result of stack frame analysis - const FrameOptimizerPass &FOP; - const FrameOptimizerPass::FrameIndexMapTy &FrameIndexMap; - const BinaryContext &BC; - - /// Used to size the set of expressions/definitions being tracked by the - /// dataflow analysis - uint64_t NumInstrs{0}; 
- /// We put every MCInst we want to track (which one representing an - /// expression/def) into a vector because we need to associate them with - /// small numbers. They will be tracked via BitVectors throughout the - /// dataflow analysis. - std::vector Expressions; - /// Maps expressions defs (MCInsts) to its index in the Expressions vector - std::unordered_map ExprToIdx; - - void preflight() override { - DEBUG(dbgs() << "Starting StackAvailableExpressions on \"" - << Func.getPrintName() << "\"\n"); - - // Populate our universe of tracked expressions. We are interested in - // tracking available stores to frame position at any given point of the - // program. - for (auto &BB : Func) { - for (auto &Inst : BB) { - auto FIEIter = FrameIndexMap.find(&Inst); - if (FIEIter == FrameIndexMap.end()) - continue; - const auto &FIE = FIEIter->second; - if (FIE.IsLoad == false && FIE.IsSimple == true) { - Expressions.push_back(&Inst); - ExprToIdx[&Inst] = NumInstrs++; - } - } - } - } - - BitVector getStartingStateAtBB(const BinaryBasicBlock &BB) override { - // Entry points start with empty set (Function entry and landing pads). - // All others start with the full set. - if (BB.pred_size() == 0) - return BitVector(NumInstrs, false); - return BitVector(NumInstrs, true); - } - - BitVector getStartingStateAtPoint(const MCInst &Point) override { - return BitVector(NumInstrs, true); - } - - void doConfluence(BitVector &StateOut, const BitVector &StateIn) override { - StateOut &= StateIn; - } - - /// Define the function computing the kill set -- whether expression Y, a - /// tracked expression, will be considered to be dead after executing X. - bool doesXKillsY(const MCInst *X, const MCInst *Y) { - // if both are stores, and both store to the same stack location, return - // true - auto FIEIterX = FrameIndexMap.find(X); - auto FIEIterY = FrameIndexMap.find(Y); - if (FIEIterX != FrameIndexMap.end() && FIEIterY != FrameIndexMap.end()) { - const FrameOptimizerPass::FrameIndexEntry &FIEX = FIEIterX->second; - const FrameOptimizerPass::FrameIndexEntry &FIEY = FIEIterY->second;; - if (FIEX.IsLoad == 0 && FIEY.IsLoad == 0 && - FIEX.StackOffset + FIEX.Size > FIEY.StackOffset && - FIEX.StackOffset < FIEY.StackOffset + FIEY.Size) - return true; - } - // getClobberedRegs for X and Y. If they intersect, return true - BitVector XClobbers = BitVector(BC.MRI->getNumRegs(), false); - BitVector YClobbers = BitVector(BC.MRI->getNumRegs(), false); - FOP.getInstClobberList(BC, *X, XClobbers); - // If Y is a store to stack, its clobber list is its source reg. This is - // different than the rest because we want to check if the store source - // reaches its corresponding load untouched. 
- if (FIEIterY != FrameIndexMap.end() && FIEIterY->second.IsLoad == 0 && - FIEIterY->second.IsStoreFromReg) { - YClobbers.set(FIEIterY->second.RegOrImm); - } else { - FOP.getInstClobberList(BC, *Y, YClobbers); - } - XClobbers &= YClobbers; - return XClobbers.any(); - } - - BitVector computeNext(const MCInst &Point, const BitVector &Cur) override { - BitVector Next = Cur; - // Kill - for (auto I = expr_begin(Next), E = expr_end(); I != E; ++I) { - assert(*I != nullptr && "Lost pointers"); - DEBUG(dbgs() << "\t\t\tDoes it kill "); - DEBUG((*I)->dump()); - if (doesXKillsY(&Point, *I)) { - DEBUG(dbgs() << "\t\t\t\tYes\n"); - Next.reset(I.Idx); - } - }; - // Gen - auto FIEIter = FrameIndexMap.find(&Point); - if (FIEIter != FrameIndexMap.end() && - FIEIter->second.IsLoad == false && - FIEIter->second.IsSimple == true) - Next.set(ExprToIdx[&Point]); - return Next; - } -}; - -class StackPointerTracking : public ForwardDataflow { - const BinaryContext &BC; - - void preflight() override { - DEBUG(dbgs() << "Starting StackPointerTracking on \"" - << Func.getPrintName() << "\"\n"); - } - - int getStartingStateAtBB(const BinaryBasicBlock &BB) override { - // Entry BB start with offset 8 from CFA. - // All others start with EMPTY (meaning we don't know anything). - if (BB.isEntryPoint()) - return -8; - return EMPTY; - } - - int getStartingStateAtPoint(const MCInst &Point) override { - return EMPTY; - } - - void doConfluence(int &StateOut, const int &StateIn) override { - if (StateOut == EMPTY) { - StateOut = StateIn; - return; - } - if (StateIn == EMPTY || StateIn == StateOut) - return; - - // We can't agree on a specific value from this point on - StateOut = SUPERPOSITION; - } - - int computeNext(const MCInst &Point, const int &Cur) override { - const auto &MIA = BC.MIA; - - if (Cur == EMPTY || Cur == SUPERPOSITION) - return Cur; - - if (int Sz = MIA->getPushSize(Point)) - return Cur - Sz; - - if (int Sz = MIA->getPopSize(Point)) - return Cur + Sz; - - if (BC.MII->get(Point.getOpcode()) - .hasDefOfPhysReg(Point, MIA->getStackPointer(), *BC.MRI)) { - int64_t Offset = Cur; - if (!MIA->evaluateSimple(Point, Offset, std::make_pair(0, 0), - std::make_pair(0, 0))) - return SUPERPOSITION; - - return static_cast(Offset); - } - - return Cur; - } -public: - StackPointerTracking(const BinaryContext &BC, const BinaryFunction &BF) - : ForwardDataflow(BF), BC(BC) {} - virtual ~StackPointerTracking() {} - - static constexpr int SUPERPOSITION = std::numeric_limits::max(); - static constexpr int EMPTY = std::numeric_limits::min(); -}; - -} // anonymous namespace - -bool FrameOptimizerPass::restoreFrameIndex(const BinaryContext &BC, - const BinaryFunction &BF) { - StackPointerTracking SPT(BC, BF); - - SPT.run(); - - // Vars used for storing useful CFI info to give us a hint about how the stack - // is used in this function - int64_t CfaOffset{-8}; - uint16_t CfaReg{7}; - bool CfaRegLocked{false}; - uint16_t CfaRegLockedVal{0}; - std::stack> CFIStack; - - DEBUG(dbgs() << "Restoring frame indices for \"" << BF.getPrintName() - << "\"\n"); - - // TODO: Implement SP tracking and improve this analysis - for (auto &BB : BF) { - DEBUG(dbgs() <<"\tNow at BB " << BB.getName() << "\n"); - - const MCInst *Prev = nullptr; - for (const auto &Inst : BB) { - int SPOffset = (Prev ? 
*SPT.getStateAt(*Prev) : *SPT.getStateAt(BB)); - DEBUG({ - dbgs() << "\t\tNow at "; - Inst.dump(); - dbgs() << "\t\t\tSP offset is " << SPOffset << "\n"; - }); - Prev = &Inst; - // Use CFI information to keep track of which register is being used to - // access the frame - if (BC.MIA->isCFI(Inst)) { - const auto *CFI = BF.getCFIFor(Inst); - switch (CFI->getOperation()) { - case MCCFIInstruction::OpDefCfa: - CfaOffset = CFI->getOffset(); - // Fall-through - case MCCFIInstruction::OpDefCfaRegister: - CfaReg = CFI->getRegister(); - break; - case MCCFIInstruction::OpDefCfaOffset: - CfaOffset = CFI->getOffset(); - break; - case MCCFIInstruction::OpRememberState: - CFIStack.push(std::make_pair(CfaOffset, CfaReg)); - break; - case MCCFIInstruction::OpRestoreState: { - assert(!CFIStack.empty() && "Corrupt CFI stack"); - auto &Elem = CFIStack.top(); - CFIStack.pop(); - CfaOffset = Elem.first; - CfaReg = Elem.second; - break; - } - case MCCFIInstruction::OpAdjustCfaOffset: - llvm_unreachable("Unhandled AdjustCfaOffset"); - break; - default: - break; - } - continue; - } - - if (BC.MIA->leaksStackAddress(Inst, *BC.MRI, false)) { - DEBUG(dbgs() << "Leaked stack address, giving up on this function.\n"); - DEBUG(dbgs() << "Blame insn: "); - DEBUG(Inst.dump()); - return false; - } - - bool IsLoad = false; - bool IsStore = false; - bool IsStoreFromReg = false; - bool IsSimple = false; - int32_t SrcImm{0}; - MCPhysReg Reg{0}; - MCPhysReg StackPtrReg{0}; - int64_t StackOffset{0}; - uint8_t Size{0}; - bool IsIndexed = false; - if (BC.MIA->isStackAccess(Inst, IsLoad, IsStore, IsStoreFromReg, Reg, - SrcImm, StackPtrReg, StackOffset, Size, - IsSimple, IsIndexed)) { - assert(Size != 0); - if (CfaRegLocked && CfaRegLockedVal != CfaReg) { - DEBUG(dbgs() << "CFA reg changed, giving up on this function.\n"); - return false; - } - if (StackPtrReg != BC.MRI->getLLVMRegNum(CfaReg, /*isEH=*/false)) { - if (StackPtrReg != BC.MIA->getStackPointer() || - SPOffset == SPT.EMPTY || SPOffset == SPT.SUPERPOSITION) { - DEBUG(dbgs() - << "Found stack access with reg different than cfa reg.\n"); - DEBUG(dbgs() << "\tCurrent CFA reg: " << CfaReg - << "\n\tStack access reg: " << StackPtrReg << "\n"); - DEBUG(dbgs() << "Blame insn: "); - DEBUG(Inst.dump()); - return false; - } - DEBUG(dbgs() << "Adding access via SP while CFA reg is another one\n"); - if (IsStoreFromReg || IsLoad) - SrcImm = Reg; - // Ignore accesses to the previous stack frame - if (SPOffset + StackOffset >= 0) - continue; - FrameIndexMap.emplace( - &Inst, FrameIndexEntry{IsLoad, IsStoreFromReg, SrcImm, - SPOffset + StackOffset, Size, IsSimple}); - } else { - CfaRegLocked = true; - CfaRegLockedVal = CfaReg; - if (IsStoreFromReg || IsLoad) - SrcImm = Reg; - // Ignore accesses to the previous stack frame - if (CfaOffset + StackOffset >= 0) - continue; - FrameIndexMap.emplace( - &Inst, FrameIndexEntry{IsLoad, IsStoreFromReg, SrcImm, - CfaOffset + StackOffset, Size, IsSimple}); - } - - DEBUG_WITH_TYPE("fop", - dbgs() << "Frame index annotation added to:\n"; - BC.printInstruction(dbgs(), Inst, 0, &BF, true); - dbgs() << " FrameIndexEntry \n"; - ); - } - } - } - return true; -} - -void FrameOptimizerPass::removeUnnecessarySpills(const BinaryContext &BC, - BinaryFunction &BF) { - StackAvailableExpressions SAE(*this, BC, BF); - +void FrameOptimizerPass::removeUnnecessaryLoads(const FrameAnalysis &FA, + const BinaryContext &BC, + BinaryFunction &BF) { + StackAvailableExpressions SAE(FA, BC, BF); SAE.run(); - DEBUG(dbgs() << "Performing frame optimization\n"); + DEBUG(dbgs() 
<< "Performing unnecessary loads removal\n"); std::deque> ToErase; bool Changed = false; const auto ExprEnd = SAE.expr_end(); @@ -648,16 +71,16 @@ void FrameOptimizerPass::removeUnnecessarySpills(const BinaryContext &BC, // if Inst is a load from stack and the current available expressions show // this value is available in a register or immediate, replace this load // with move from register or from immediate. - const auto Iter = FrameIndexMap.find(&Inst); - if (Iter == FrameIndexMap.end()) { + auto FIEX = FA.getFIEFor(BC, Inst); + if (!FIEX) { Prev = &Inst; continue; } - const FrameIndexEntry &FIEX = Iter->second; // FIXME: Change to remove IsSimple == 0. We're being conservative here, // but once replaceMemOperandWithReg is ready, we should feed it with all // sorts of complex instructions. - if (FIEX.IsLoad == 0 || FIEX.IsSimple == 0) { + if (FIEX->IsLoad == false || FIEX->IsSimple == false || + FIEX->StackOffset >= 0) { Prev = &Inst; continue; } @@ -665,13 +88,14 @@ void FrameOptimizerPass::removeUnnecessarySpills(const BinaryContext &BC, for (auto I = Prev ? SAE.expr_begin(*Prev) : SAE.expr_begin(BB); I != ExprEnd; ++I) { const MCInst *AvailableInst = *I; - const auto Iter = FrameIndexMap.find(AvailableInst); - if (Iter == FrameIndexMap.end()) + auto FIEY = FA.getFIEFor(BC, *AvailableInst); + if (!FIEY) continue; - - const FrameIndexEntry &FIEY = Iter->second; - assert(FIEY.IsLoad == 0 && FIEY.IsSimple != 0); - if (FIEX.StackOffset != FIEY.StackOffset || FIEX.Size != FIEY.Size) + assert(FIEY->IsStore && FIEY->IsSimple); + if (FIEX->StackOffset != FIEY->StackOffset || FIEX->Size != FIEY->Size) + continue; + // TODO: Change push/pops to stack adjustment instruction + if (BC.MIA->isPop(Inst)) continue; ++NumRedundantLoads; @@ -682,12 +106,13 @@ void FrameOptimizerPass::removeUnnecessarySpills(const BinaryContext &BC, DEBUG(AvailableInst->dump()); DEBUG(dbgs() << "@BB: " << BB.getName() << "\n"); // Replace load - if (FIEY.IsStoreFromReg) { - if (!BC.MIA->replaceMemOperandWithReg(Inst, FIEY.RegOrImm)) { + if (FIEY->IsStoreFromReg) { + if (!BC.MIA->replaceMemOperandWithReg(Inst, FIEY->RegOrImm)) { DEBUG(dbgs() << "FAILED to change operand to a reg\n"); break; } ++NumLoadsChangedToReg; + BC.MIA->removeAnnotation(Inst, "FrameAccessEntry"); DEBUG(dbgs() << "Changed operand to a reg\n"); if (BC.MIA->isRedundantMove(Inst)) { ++NumLoadsDeleted; @@ -697,12 +122,13 @@ void FrameOptimizerPass::removeUnnecessarySpills(const BinaryContext &BC, } } else { char Buf[8] = {0, 0, 0, 0, 0, 0, 0, 0}; - support::ulittle64_t::ref(Buf + 0) = FIEY.RegOrImm; + support::ulittle64_t::ref(Buf + 0) = FIEY->RegOrImm; DEBUG(dbgs() << "Changing operand to an imm... "); if (!BC.MIA->replaceMemOperandWithImm(Inst, StringRef(Buf, 8), 0)) { DEBUG(dbgs() << "FAILED\n"); } else { ++NumLoadsChangedToImm; + BC.MIA->removeAnnotation(Inst, "FrameAccessEntry"); DEBUG(dbgs() << "Ok\n"); } } @@ -716,71 +142,130 @@ void FrameOptimizerPass::removeUnnecessarySpills(const BinaryContext &BC, if (Changed) { DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n"); } + // TODO: Implement an interface of eraseInstruction that works out the + // complete list of elements to remove. 
for (auto I : ToErase) { I.first->eraseInstruction(I.second); } } +void FrameOptimizerPass::removeUnusedStores(const FrameAnalysis &FA, + const BinaryContext &BC, + BinaryFunction &BF) { + StackReachingUses SRU(FA, BC, BF); + SRU.run(); + + DEBUG(dbgs() << "Performing unused stores removal\n"); + std::vector> ToErase; + bool Changed = false; + for (auto &BB : BF) { + DEBUG(dbgs() <<"\tNow at BB " << BB.getName() << "\n"); + const MCInst *Prev = nullptr; + for (auto I = BB.rbegin(), E = BB.rend(); I != E; ++I) { + auto &Inst = *I; + DEBUG({ + dbgs() << "\t\tNow at "; + Inst.dump(); + for (auto I = Prev ? SRU.expr_begin(*Prev) : SRU.expr_begin(BB); + I != SRU.expr_end(); ++I) { + dbgs() << "\t\t\tReached by: "; + (*I)->dump(); + } + }); + auto FIEX = FA.getFIEFor(BC, Inst); + if (!FIEX) { + Prev = &Inst; + continue; + } + if (FIEX->IsLoad || !FIEX->IsSimple || FIEX->StackOffset >= 0) { + Prev = &Inst; + continue; + } + + if (SRU.isStoreUsed(*FIEX, + Prev ? SRU.expr_begin(*Prev) : SRU.expr_begin(BB))) { + Prev = &Inst; + continue; + } + // TODO: Change push/pops to stack adjustment instruction + if (BC.MIA->isPush(Inst)) + continue; + + ++NumRedundantStores; + Changed = true; + DEBUG(dbgs() << "Unused store instruction: "); + DEBUG(Inst.dump()); + DEBUG(dbgs() << "@BB: " << BB.getName() << "\n"); + // Delete it! + ToErase.push_back(std::make_pair(&BB, &Inst)); + Prev = &Inst; + } + } + + for (auto I : ToErase) { + I.first->eraseInstruction(I.second); + } + if (Changed) { + DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n"); + } +} + void FrameOptimizerPass::runOnFunctions(BinaryContext &BC, std::map &BFs, - std::set &) { - uint64_t NumFunctionsNotOptimized{0}; - uint64_t NumFunctionsFailedRestoreFI{0}; - uint64_t CountFunctionsNotOptimized{0}; - uint64_t CountFunctionsFailedRestoreFI{0}; - uint64_t CountDenominator{0}; - Cg = buildCallGraph(BC, BFs); - TopologicalCGOrder = Cg.buildTraversalOrder(); - buildClobberMap(BC); + std::set &LargeFunctions) { + if (opts::FrameOptimization == FOP_NONE) + return; + + // Run FrameAnalysis pass + FrameAnalysis FA(PrintPass); + FA.runOnFunctions(BC, BFs, LargeFunctions); + + // Our main loop: perform caller-saved register optimizations, then + // callee-saved register optimizations (shrink wrapping). 
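
removeUnusedStores above leans on the backward StackReachingUses dataflow:
walking a block bottom-up, the state at each store already summarizes every
load that could observe it. A toy single-block model of that direction choice
(it ignores aliasing, escaping slots, and CFG joins, which the real analysis
handles):

    #include <cstdint>
    #include <set>
    #include <vector>

    struct Access {
      bool IsLoad;  // false means store
      int64_t Slot; // stack offset identifying the slot
    };

    // Scan bottom-up; a store to a slot with no load below it, before the
    // next store to the same slot, has no observable use and is dead.
    std::vector<size_t> findDeadStores(const std::vector<Access> &Block) {
      std::set<int64_t> SlotsReadBelow;
      std::vector<size_t> Dead;
      for (size_t I = Block.size(); I-- > 0;) {
        const Access &A = Block[I];
        if (A.IsLoad) {
          SlotsReadBelow.insert(A.Slot);
          continue;
        }
        if (!SlotsReadBelow.count(A.Slot))
          Dead.push_back(I);          // no use reaches this store
        SlotsReadBelow.erase(A.Slot); // this store shadows earlier ones
      }
      return Dead;
    }

As in the load case, pushes are excluded from deletion because they double as
stack-pointer adjustments.
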
for (auto &I : BFs) { - auto Count = I.second.getExecutionCount(); - if (Count != BinaryFunction::COUNT_NO_PROFILE) - CountDenominator += Count; - if (!shouldOptimize(I.second)) { - ++NumFunctionsNotOptimized; - if (Count != BinaryFunction::COUNT_NO_PROFILE) - CountFunctionsNotOptimized += Count; + if (!FA.hasFrameInfo(I.second)) continue; + // Restrict pass execution if user asked to only run on hot functions + if (opts::FrameOptimization == FOP_HOT) { + if (I.second.getKnownExecutionCount() < BC.getHotThreshold()) + continue; + DEBUG(dbgs() << "Considering " << I.second.getPrintName() + << " for frame optimizations because its execution count ( " + << I.second.getKnownExecutionCount() + << " ) exceeds our hotness threshold ( " + << BC.getHotThreshold() << " )\n"); } - if (!restoreFrameIndex(BC, I.second)) { - ++NumFunctionsFailedRestoreFI; - auto Count = I.second.getExecutionCount(); - if (Count != BinaryFunction::COUNT_NO_PROFILE) - CountFunctionsFailedRestoreFI += Count; + { + NamedRegionTimer T1("remove loads", "FOP breakdown", true); + removeUnnecessaryLoads(FA, BC, I.second); + } + { + NamedRegionTimer T1("remove stores", "FOP breakdown", true); + removeUnusedStores(FA, BC, I.second); + } + // Don't even start shrink wrapping if no profiling info is available + if (I.second.getKnownExecutionCount() == 0) continue; + { + NamedRegionTimer T1("move spills", "FOP breakdown", true); + DataflowInfoManager Info(&FA, BC, I.second); + ShrinkWrapping SW(FA, BC, I.second, Info); + SW.perform(); } - removeUnnecessarySpills(BC, I.second); } + FA.cleanAnnotations(BC, BFs); + outs() << "BOLT-INFO: FOP optimized " << NumRedundantLoads - << " redundant load(s).\n"; - - if (opts::Verbosity == 0) { -#ifndef NDEBUG - if (!DebugFlag || !isCurrentDebugType("fop")) - return; -#else - return; -#endif - } - + << " redundant load(s) and " << NumRedundantStores + << " unused store(s)\n"; outs() << "BOLT-INFO: FOP changed " << NumLoadsChangedToReg << " load(s) to use a register instead of a stack access, and " << NumLoadsChangedToImm << " to use an immediate.\n" - << "BOLT-INFO: FOP deleted " << NumLoadsDeleted << " load(s).\n" - << "BOLT-INFO: FOP: Number of functions conservatively treated as " - "clobbering all registers: " - << NumFunctionsAllClobber - << format(" (%.1lf%% dyn cov)\n", - (100.0 * CountFunctionsAllClobber / CountDenominator)) - << "BOLT-INFO: FOP: " << NumFunctionsNotOptimized << " function(s) " - << format("(%.1lf%% dyn cov)", - (100.0 * CountFunctionsNotOptimized / CountDenominator)) - << " were not optimized.\n" - << "BOLT-INFO: FOP: " << NumFunctionsFailedRestoreFI << " function(s) " - << format("(%.1lf%% dyn cov)", - (100.0 * CountFunctionsFailedRestoreFI / CountDenominator)) - << " could not have its frame indices restored.\n"; + << "BOLT-INFO: FOP deleted " << NumLoadsDeleted << " load(s) and " + << NumRedundantStores << " store(s).\n"; + FA.printStats(); + ShrinkWrapping::printStats(); } } // namespace bolt diff --git a/bolt/Passes/FrameOptimizer.h b/bolt/Passes/FrameOptimizer.h index e3423ad19f42..4ba8e1c2bb56 100644 --- a/bolt/Passes/FrameOptimizer.h +++ b/bolt/Passes/FrameOptimizer.h @@ -13,31 +13,40 @@ #define LLVM_TOOLS_LLVM_BOLT_PASSES_FRAMEOPTIMIZER_H #include "BinaryPasses.h" -#include "BinaryFunctionCallGraph.h" +#include "FrameAnalysis.h" namespace llvm { namespace bolt { -/// FrameOptimizerPass strives for removing unnecessary stack frame accesses. 
-/// For example, caller-saved registers may be conservatively pushed to the
-/// stack because the callee may write to these registers. But if we can prove
-/// the callee will never touch these registers, we can remove this spill.
+/// FrameOptimizerPass strives for removing or moving stack frame accesses to
+/// less frequently executed basic blocks, reducing the pressure on icache
+/// usage as well as dynamic instruction count.
 ///
-/// This optimization analyzes the call graph and first compute the set of
+/// This is accomplished by analyzing both caller-saved register spills and
+/// callee-saved register spills. This class handles the former while
+/// delegating the latter to the ShrinkWrapping class. We discuss the
+/// caller-saved register spill optimization below.
+///
+/// Caller-saved registers must be conservatively pushed to the stack because
+/// the callee may write to these registers. If we can prove the callee will
+/// never touch these registers, we can remove this spill.
+///
+/// This optimization analyzes the call graph and first computes the set of
 /// registers that may get overwritten when executing a function (this includes
 /// the set of registers touched by all functions this function may call during
-/// its execution).
+/// its execution) -- see the FrameAnalysis class for implementation details.
 ///
-/// The second step is to perform an alias analysis to disambiguate which stack
-/// position is being accessed by each load/store instruction, and annotate
-/// these instructions.
+/// The second step is to perform an analysis to disambiguate which stack
+/// position is being accessed by each load/store instruction -- see the
+/// FrameAnalysis class.
 ///
 /// The third step performs a forward dataflow analysis, using intersection as
 /// the confluence operator, to propagate information about available
-/// stack definitions at each point of the program. This definition shows
-/// an equivalence between the value in a stack position and the value of a
-/// register or immediate. To have those preserved, both register and the value
-/// in the stack position cannot be touched by another instruction.
+/// stack definitions at each point of the program. See the
+/// StackAvailableExpressions class. This definition shows an equivalence
+/// between the value in a stack position and the value of a register or
+/// immediate. For the equivalence to be preserved, neither the register nor
+/// the value in the stack position may be touched by another instruction.
 /// These definitions we are tracking occur in the form:
 ///
 ///   stack def: MEM[FRAME - 0x5c] <= RAX
@@ -62,86 +71,29 @@ namespace bolt {
 /// In this example, since the store source register is the same as the load
 /// destination register, this creates a redundant MOV that can be deleted.
 ///
+/// Finally, another analysis propagates information about which instructions
+/// are using (loading from) a stack position -- see StackReachingUses. If a
+/// store sees no use of the value it is storing, it is eliminated.
+///
 class FrameOptimizerPass : public BinaryFunctionPass {
   /// Stats aggregating variables
   uint64_t NumRedundantLoads{0};
+  uint64_t NumRedundantStores{0};
   uint64_t NumLoadsChangedToReg{0};
   uint64_t NumLoadsChangedToImm{0};
   uint64_t NumLoadsDeleted{0};
-  /// Number of functions we conservatively marked as clobbering the entire set
-  /// of registers because we couldn't fully understand it.
- uint64_t NumFunctionsAllClobber{0}; - /// Execution count of those functions to give us an idea of their dynamic - /// coverage - uint64_t CountFunctionsAllClobber{0}; - /// Call graph info - BinaryFunctionCallGraph Cg; + /// Perform a dataflow analysis in \p BF to reveal unnecessary reloads from + /// the frame. Use the analysis to convert memory loads to register moves or + /// immediate loads. Delete redundant register moves. + void removeUnnecessaryLoads(const FrameAnalysis &FA, + const BinaryContext &BC, + BinaryFunction &BF); - /// DFS or reverse post-ordering of the call graph nodes to allow us to - /// traverse the call graph bottom-up - std::deque TopologicalCGOrder; - - /// Map functions to the set of registers they may overwrite starting at when - /// it is called until it returns to the caller. - std::map RegsKilledMap; - -public: - /// Alias analysis information attached to each instruction that accesses a - /// frame position. This is called a "frame index" by LLVM Target libs when - /// it is building a MachineFunction frame, and we use the same name here - /// because we are essentially doing the job of frame reconstruction. - struct FrameIndexEntry { - /// If this is false, this instruction is necessarily a store - bool IsLoad; - /// If a store, this controls whether the store uses a register os an imm - /// as the source value. - bool IsStoreFromReg; - /// If load, this holds the destination register. If store, this holds - /// either the source register or source immediate. - int32_t RegOrImm; - - /// StackOffset and Size are the two aspects that identify this frame access - /// for the purposes of alias analysis. - int64_t StackOffset; - uint8_t Size; - - /// If this is false, we will never atempt to remove or optimize this - /// instruction. We just use it to keep track of stores we don't fully - /// understand but we know it may write to a frame position. - bool IsSimple; - }; - typedef std::unordered_map - FrameIndexMapTy; - FrameIndexMapTy FrameIndexMap; - - /// Compute the set of registers \p Inst may write to, marking them in - /// \p KillSet. If this is a call, try to get the set of registers the call - /// target will write to. - void getInstClobberList(const BinaryContext &BC, const MCInst &Inst, - BitVector &KillSet) const; -private: - /// Compute the set of registers \p Func may write to during its execution, - /// starting at the point when it is called up until when it returns. Returns - /// a BitVector the size of the target number of registers, representing the - /// set of clobbered registers. - BitVector getFunctionClobberList(const BinaryContext &BC, - const BinaryFunction *Func); - - /// Perform the step of building the set of registers clobbered by each - /// function execution, populating RegsKilledMap. - void buildClobberMap(const BinaryContext &BC); - - /// Alias analysis to disambiguate which frame position is accessed by each - /// instruction in function \p BF. Populates FrameIndexMap. - bool restoreFrameIndex(const BinaryContext &BC, const BinaryFunction &BF); - - /// Uses RegsKilledMap and FrameIndexMap to perform a dataflow analysis in - /// \p BF to reveal unnecessary reloads from the frame. Use the analysis - /// to convert memory loads to register moves or immediate loads. Delete - /// redundant register moves. - void removeUnnecessarySpills(const BinaryContext &BC, - BinaryFunction &BF); + /// Use information from stack frame usage to delete unused stores. 
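+  /// For example, a spill slot that is written but never read again on any
+  /// path (or is overwritten first) is a store with no observable use, and
+  /// the store is deleted. Pushes are left alone, since they also move the
+  /// stack pointer.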
+ void removeUnusedStores(const FrameAnalysis &FA, + const BinaryContext &BC, + BinaryFunction &BF); public: explicit FrameOptimizerPass(const cl::opt &PrintPass) @@ -158,6 +110,7 @@ public: }; } // namespace bolt + } // namespace llvm diff --git a/bolt/Passes/LivenessAnalysis.h b/bolt/Passes/LivenessAnalysis.h index f95a9ef12503..ed9e0f00a1e2 100644 --- a/bolt/Passes/LivenessAnalysis.h +++ b/bolt/Passes/LivenessAnalysis.h @@ -14,6 +14,7 @@ #include "DataflowAnalysis.h" #include "FrameAnalysis.h" +#include "llvm/Support/Timer.h" namespace llvm { namespace bolt { @@ -29,6 +30,18 @@ public: NumRegs(BC.MRI->getNumRegs()) {} virtual ~LivenessAnalysis(); + bool isAlive(ProgramPoint PP, MCPhysReg Reg) const { + BitVector BV = (*this->getStateAt(PP)); + const BitVector &RegAliases = BC.MIA->getAliases(Reg, *BC.MRI); + BV &= RegAliases; + return BV.any(); + } + + void run() { + NamedRegionTimer T1("LA", "Dataflow", true); + DataflowAnalysis::run(); + } + protected: /// Reference to the result of stack frame analysis const FrameAnalysis &FA; diff --git a/bolt/Passes/ReachingDefOrUse.h b/bolt/Passes/ReachingDefOrUse.h index ca67389b281a..9b5f8695b3f1 100644 --- a/bolt/Passes/ReachingDefOrUse.h +++ b/bolt/Passes/ReachingDefOrUse.h @@ -13,6 +13,7 @@ #define LLVM_TOOLS_LLVM_BOLT_PASSES_REACHINGDEFORUSE_H #include "DataflowAnalysis.h" +#include "llvm/Support/Timer.h" namespace llvm { namespace bolt { @@ -50,6 +51,11 @@ public: return (*this->getStateAt(B))[this->ExprToIdx[&A]]; } + void run() { + NamedRegionTimer T1("RD", "Dataflow", true); + InstrsDataflowAnalysis, !Def>::run(); + } + protected: /// Reference to the result of stack frame analysis const FrameAnalysis &FA; diff --git a/bolt/Passes/ReachingInsns.h b/bolt/Passes/ReachingInsns.h index 4bcdb3d843dd..ce6cd8ccaa08 100644 --- a/bolt/Passes/ReachingInsns.h +++ b/bolt/Passes/ReachingInsns.h @@ -12,6 +12,9 @@ #ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_REACHINGINSNS_H #define LLVM_TOOLS_LLVM_BOLT_PASSES_REACHINGINSNS_H +#include "DataflowAnalysis.h" +#include "llvm/Support/Timer.h" + namespace llvm { namespace bolt { @@ -37,6 +40,11 @@ public: return isInLoop(*BB); } + void run() { + NamedRegionTimer T1("RI", "Dataflow", true); + InstrsDataflowAnalysis, Backward>::run(); + } + protected: std::unordered_map InsnToBB; diff --git a/bolt/Passes/ShrinkWrapping.cpp b/bolt/Passes/ShrinkWrapping.cpp new file mode 100644 index 000000000000..dcc5b5758c60 --- /dev/null +++ b/bolt/Passes/ShrinkWrapping.cpp @@ -0,0 +1,1785 @@ +//===--- Passes/ShrinkWrapping.cpp ----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "ShrinkWrapping.h"
+#include <numeric>
+
+#define DEBUG_TYPE "shrinkwrapping"
+
+using namespace llvm;
+
+namespace opts {
+
+extern cl::OptionCategory BoltOptCategory;
+
+static cl::opt<unsigned> ShrinkWrappingThreshold(
+    "shrink-wrapping-threshold",
+    cl::desc("Percentage of prologue execution count to use as threshold when"
+             " evaluating whether a block is cold enough to be profitable to"
+             " move eligible spills there"),
+    cl::init(40), cl::ZeroOrMore, cl::cat(BoltOptCategory));
+}
+
+namespace llvm {
+namespace bolt {
+
+void CalleeSavedAnalysis::analyzeSaves() {
+  ReachingDefOrUse</*Def=*/true> &RD = Info.getReachingDefs();
+  StackReachingUses &SRU = Info.getStackReachingUses();
+  auto &InsnToBB = Info.getInsnToBBMap();
+
+  DEBUG(dbgs() << "Checking spill locations\n");
+  for (auto &BB : BF) {
+    DEBUG(dbgs() << "\tNow at BB " << BB.getName() << "\n");
+    const MCInst *Prev = nullptr;
+    for (auto &Inst : BB) {
+      if (auto FIE = FA.getFIEFor(BC, Inst)) {
+        if (!FIE->IsStore || !FIE->IsSimple || !FIE->IsStoreFromReg ||
+            FIE->StackOffset >= 0) {
+          Prev = &Inst;
+          continue;
+        }
+
+        if (RD.isReachedBy(FIE->RegOrImm,
+                           Prev ? RD.expr_begin(*Prev) : RD.expr_begin(BB))) {
+          Prev = &Inst;
+          continue;
+        }
+
+        // If this stack position is accessed in another function, we are
+        // probably dealing with a parameter passed on the stack -- do not
+        // mess with it
+        if (SRU.isStoreUsed(*FIE,
+                            Prev ? SRU.expr_begin(*Prev) : SRU.expr_begin(BB),
+                            /*IncludeLocalAccesses=*/false)) {
+          Prev = &Inst;
+          continue;
+        }
+
+        CalleeSaved.set(FIE->RegOrImm);
+        if (SaveFIEByReg[FIE->RegOrImm] == nullptr)
+          SaveFIEByReg[FIE->RegOrImm] = &*FIE;
+        SavingCost[FIE->RegOrImm] += InsnToBB[&Inst]->getKnownExecutionCount();
+        BC.MIA->addAnnotation(BC.Ctx.get(), Inst, getSaveTag(), FIE->RegOrImm);
+        OffsetsByReg[FIE->RegOrImm] = FIE->StackOffset;
+        DEBUG(dbgs() << "Logging new candidate for Callee-Saved Reg: "
+                     << FIE->RegOrImm << "\n");
+      }
+      Prev = &Inst;
+    }
+  }
+}
+
+void CalleeSavedAnalysis::analyzeRestores() {
+  ReachingDefOrUse</*Def=*/false> &RU = Info.getReachingUses();
+
+  // Now compute all restores of these callee-saved regs
+  for (auto &BB : BF) {
+    const MCInst *Prev = nullptr;
+    for (auto I = BB.rbegin(), E = BB.rend(); I != E; ++I) {
+      auto &Inst = *I;
+      if (auto FIE = FA.getFIEFor(BC, Inst)) {
+        if (!FIE->IsLoad || !FIE->IsSimple || !CalleeSaved[FIE->RegOrImm] ||
+            FIE->StackOffset >= 0) {
+          Prev = &Inst;
+          continue;
+        }
+
+        // If this reg is used locally after a restore, then we are probably
+        // not dealing with a callee-saved reg. Except if this use is by
+        // another store, but we don't cover this case yet.
+        if (RU.isReachedBy(FIE->RegOrImm,
+                           Prev ?
RU.expr_begin(*Prev) : RU.expr_begin(BB))) { + Prev = &Inst; + continue; + } + // If stack offsets between saves/store don't agree with each other, + // we don't completely understand what's happening here + if (FIE->StackOffset != OffsetsByReg[FIE->RegOrImm]) { + CalleeSaved.reset(FIE->RegOrImm); + DEBUG(dbgs() << "Dismissing Callee-Saved Reg because we found a " + "mismatching restore: " + << FIE->RegOrImm << "\n"); + Prev = &Inst; + continue; + } + + DEBUG(dbgs() << "Adding matching restore for: " << FIE->RegOrImm + << "\n"); + if (LoadFIEByReg[FIE->RegOrImm] == nullptr) + LoadFIEByReg[FIE->RegOrImm] = &*FIE; + BC.MIA->addAnnotation(BC.Ctx.get(), Inst, getRestoreTag(), + FIE->RegOrImm); + HasRestores.set(FIE->RegOrImm); + } + Prev = &Inst; + } + } +} + +std::vector CalleeSavedAnalysis::getSavesByReg(uint16_t Reg) { + std::vector Results; + for (auto &BB : BF) { + for (auto &Inst : BB) { + if (getSavedReg(Inst) == Reg) + Results.push_back(&Inst); + } + } + return Results; +} + +std::vector CalleeSavedAnalysis::getRestoresByReg(uint16_t Reg) { + std::vector Results; + for (auto &BB : BF) { + for (auto &Inst : BB) { + if (getRestoredReg(Inst) == Reg) + Results.push_back(&Inst); + } + } + return Results; +} + +CalleeSavedAnalysis::~CalleeSavedAnalysis() { + for (auto &BB : BF) { + for (auto &Inst : BB) { + BC.MIA->removeAnnotation(Inst, getSaveTag()); + BC.MIA->removeAnnotation(Inst, getRestoreTag()); + } + } +} + +void StackLayoutModifier::blacklistRegion(int64_t Offset, int64_t Size) { + if (BlacklistedRegions[Offset] < Size) { + BlacklistedRegions[Offset] = Size; + } +} + +bool StackLayoutModifier::isRegionBlacklisted(int64_t Offset, int64_t Size) { + for (auto Elem : BlacklistedRegions) { + if (Offset + Size > Elem.first && Offset < Elem.first + Elem.second) + return true; + } + return false; +} + +bool StackLayoutModifier::blacklistAllInConflictWith(int64_t Offset, + int64_t Size) { + bool HasConflict = false; + for (auto Iter = AvailableRegions.begin(); Iter != AvailableRegions.end();) { + auto &Elem = *Iter; + if (Offset + Size > Elem.first && Offset < Elem.first + Elem.second && + (Offset != Elem.first || Size != Elem.second)) { + Iter = AvailableRegions.erase(Iter); + HasConflict = true; + continue; + } + ++Iter; + } + if (HasConflict) { + blacklistRegion(Offset, Size); + return true; + } + return false; +} + +void StackLayoutModifier::checkFramePointerInitialization(MCInst &Point) { + auto &SPT = Info.getStackPointerTracking(); + if (!BC.MII->get(Point.getOpcode()) + .hasDefOfPhysReg(Point, BC.MIA->getFramePointer(), *BC.MRI)) + return; + + int SPVal, FPVal; + std::tie(SPVal, FPVal) = *SPT.getStateBefore(Point); + std::pair FP; + + if (FPVal != SPT.EMPTY && FPVal != SPT.SUPERPOSITION) + FP = std::make_pair(BC.MIA->getFramePointer(), FPVal); + else + FP = std::make_pair(0, 0); + std::pair SP; + + if (SPVal != SPT.EMPTY && SPVal != SPT.SUPERPOSITION) + SP = std::make_pair(BC.MIA->getStackPointer(), SPVal); + else + SP = std::make_pair(0, 0); + + int64_t Output; + if (!BC.MIA->evaluateSimple(Point, Output, SP, FP)) + return; + + // Not your regular frame pointer initialization... 
bail + if (Output != SPVal) + blacklistRegion(0, 0); +} + +void StackLayoutModifier::classifyStackAccesses() { + // Understand when stack slots are being used non-locally + auto &SRU = Info.getStackReachingUses(); + + for (auto &BB : BF) { + const MCInst *Prev = nullptr; + for (auto I = BB.rbegin(), E = BB.rend(); I != E; ++I) { + auto &Inst = *I; + checkFramePointerInitialization(Inst); + auto FIEX = FA.getFIEFor(BC, Inst); + if (!FIEX) { + Prev = &Inst; + continue; + } + if (!FIEX->IsSimple || (FIEX->IsStore && !FIEX->IsStoreFromReg)) { + blacklistRegion(FIEX->StackOffset, FIEX->Size); + Prev = &Inst; + continue; + } + // If this stack position is accessed in another function, we are + // probably dealing with a parameter passed in a stack -- do not mess + // with it + if (SRU.isStoreUsed(*FIEX, + Prev ? SRU.expr_begin(*Prev) : SRU.expr_begin(BB), + /*IncludeLocalAccesses=*/false)) { + blacklistRegion(FIEX->StackOffset, FIEX->Size); + Prev = &Inst; + continue; + } + // Now we have a clear stack slot access. Check if its blacklisted or if + // it conflicts with another chunk. + if (isRegionBlacklisted(FIEX->StackOffset, FIEX->Size) || + blacklistAllInConflictWith(FIEX->StackOffset, FIEX->Size)) { + Prev = &Inst; + continue; + } + // We are free to go. Add it as available stack slot which we know how + // to move it. + AvailableRegions[FIEX->StackOffset] = FIEX->Size; + BC.MIA->addAnnotation(BC.Ctx.get(), Inst, getSlotTagName(), + FIEX->StackOffset); + RegionToRegMap[FIEX->StackOffset].insert(FIEX->RegOrImm); + RegToRegionMap[FIEX->RegOrImm].insert(FIEX->StackOffset); + DEBUG(dbgs() << "Adding region " << FIEX->StackOffset << " size " + << (int)FIEX->Size << "\n"); + } + } +} + +void StackLayoutModifier::classifyCFIs() { + std::stack> CFIStack; + int64_t CfaOffset{-8}; + uint16_t CfaReg{7}; + + auto recordAccess = [&](MCInst *Inst, int64_t Offset) { + const uint16_t Reg = BC.MRI->getLLVMRegNum(CfaReg, /*isEH=*/false); + if (Reg == BC.MIA->getStackPointer() || Reg == BC.MIA->getFramePointer()) { + BC.MIA->addAnnotation(BC.Ctx.get(), *Inst, getSlotTagName(), Offset); + DEBUG(dbgs() << "Recording CFI " << Offset << "\n"); + } else { + IsSimple = false; + return; + } + }; + + for (auto &BB : BF.layout()) { + for (auto &Inst : *BB) { + if (!BC.MIA->isCFI(Inst)) + continue; + auto *CFI = BF.getCFIFor(Inst); + switch (CFI->getOperation()) { + case MCCFIInstruction::OpDefCfa: + CfaOffset = CFI->getOffset(); + recordAccess(&Inst, CfaOffset); + // Fall-through + case MCCFIInstruction::OpDefCfaRegister: + CfaReg = CFI->getRegister(); + break; + case MCCFIInstruction::OpDefCfaOffset: + CfaOffset = CFI->getOffset(); + recordAccess(&Inst, CfaOffset); + break; + case MCCFIInstruction::OpOffset: + recordAccess(&Inst, CFI->getOffset()); + BC.MIA->addAnnotation(BC.Ctx.get(), Inst, getOffsetCFIRegTagName(), + BC.MRI->getLLVMRegNum(CFI->getRegister(), + /*isEH=*/false)); + break; + case MCCFIInstruction::OpSameValue: + BC.MIA->addAnnotation(BC.Ctx.get(), Inst, getOffsetCFIRegTagName(), + BC.MRI->getLLVMRegNum(CFI->getRegister(), + /*isEH=*/false)); + break; + case MCCFIInstruction::OpRememberState: + CFIStack.push(std::make_pair(CfaOffset, CfaReg)); + break; + case MCCFIInstruction::OpRestoreState: { + assert(!CFIStack.empty() && "Corrupt CFI stack"); + auto &Elem = CFIStack.top(); + CFIStack.pop(); + CfaOffset = Elem.first; + CfaReg = Elem.second; + break; + } + case MCCFIInstruction::OpRelOffset: + case MCCFIInstruction::OpAdjustCfaOffset: + llvm_unreachable("Unhandled AdjustCfaOffset"); + break; + default: 
+ break; + } + } + } +} + +void StackLayoutModifier::scheduleChange( + MCInst &Inst, StackLayoutModifier::WorklistItem Item) { + auto &WList = BC.MIA->getOrCreateAnnotationAs>( + BC.Ctx.get(), Inst, getTodoTagName()); + WList.push_back(Item); +} + +bool StackLayoutModifier::canCollapseRegion(MCInst *DeletedPush) { + if (!IsSimple || !BC.MIA->isPush(*DeletedPush)) + return false; + + auto FIE = FA.getFIEFor(BC, *DeletedPush); + if (!FIE) + return false; + + return canCollapseRegion(FIE->StackOffset); +} + +bool StackLayoutModifier::canCollapseRegion(int64_t RegionAddr) { + if (!IsInitialized) + initialize(); + if (!IsSimple) + return false; + + if (CollapsedRegions.count(RegionAddr)) + return true; + + // Check if it is possible to readjust all accesses below RegionAddr + if (!BlacklistedRegions.empty()) + return false; + + return true; +} + +bool StackLayoutModifier::collapseRegion(MCInst *DeletedPush) { + auto FIE = FA.getFIEFor(BC, *DeletedPush); + if (!FIE) + return false; + int64_t RegionAddr = FIE->StackOffset; + int64_t RegionSz = FIE->Size; + return collapseRegion(DeletedPush, RegionAddr, RegionSz); +} + +bool StackLayoutModifier::collapseRegion(MCInst *Alloc, int64_t RegionAddr, + int64_t RegionSz) { + if (!canCollapseRegion(RegionAddr)) + return false; + + assert(IsInitialized); + auto &SAA = Info.getStackAllocationAnalysis(); + + for (auto &BB : BF) { + for (auto &Inst : BB) { + if (!BC.MIA->hasAnnotation(Inst, getSlotTagName())) + continue; + auto Slot = + BC.MIA->getAnnotationAs( + Inst, getSlotTagName()); + if (!AvailableRegions.count(Slot)) + continue; + // We need to ensure this access is affected by the deleted push + if (!(*SAA.getStateBefore(Inst))[SAA.ExprToIdx[Alloc]]) + continue; + + if (BC.MIA->isCFI(Inst)) { + if (Slot > RegionAddr) + continue; + scheduleChange(Inst, WorklistItem(WorklistItem::AdjustCFI, RegionSz)); + continue; + } + + if (Slot == RegionAddr) { + BC.MIA->addAnnotation(BC.Ctx.get(), Inst, "AccessesDeletedPos", 0U); + continue; + } + if (BC.MIA->isPush(Inst) || BC.MIA->isPop(Inst)) { + continue; + } + + auto FIE = FA.getFIEFor(BC, Inst); + assert(FIE); + if (FIE->StackPtrReg == BC.MIA->getStackPointer() && Slot < RegionAddr) + continue; + + if (FIE->StackPtrReg == BC.MIA->getFramePointer() && Slot > RegionAddr) + continue; + + scheduleChange( + Inst, WorklistItem(WorklistItem::AdjustLoadStoreOffset, RegionSz)); + } + } + + CollapsedRegions.insert(RegionAddr); + return true; +} + +void StackLayoutModifier::setOffsetForCollapsedAccesses(int64_t NewOffset) { + for (auto &BB : BF) { + for (auto &Inst : BB) { + if (!BC.MIA->hasAnnotation(Inst, "AccessesDeletedPos")) + continue; + BC.MIA->removeAnnotation(Inst, "AccessesDeletedPos"); + scheduleChange( + Inst, WorklistItem(WorklistItem::AdjustLoadStoreOffset, NewOffset)); + } + } +} + +bool StackLayoutModifier::canInsertRegion(ProgramPoint P) { + if (!IsInitialized) + initialize(); + if (!IsSimple) + return false; + + auto &SPT = Info.getStackPointerTracking(); + int64_t RegionAddr = SPT.getStateBefore(P)->first; + if (RegionAddr == SPT.SUPERPOSITION || RegionAddr == SPT.EMPTY) + return false; + + if (InsertedRegions.count(RegionAddr)) + return true; + + // Check if we are going to screw up stack accesses at call sites that + // pass parameters via stack + if (!BlacklistedRegions.empty()) + return false; + + return true; +} + +bool StackLayoutModifier::insertRegion(ProgramPoint P, int64_t RegionSz) { + if (!canInsertRegion(P)) + return false; + + assert(IsInitialized); + auto &SPT = 
Info.getStackPointerTracking(); + // This RegionAddr is slightly different from the one seen in collapseRegion + // This is the value of SP before the allocation the user wants to make. + int64_t RegionAddr = SPT.getStateBefore(P)->first; + if (RegionAddr == SPT.SUPERPOSITION || RegionAddr == SPT.EMPTY) + return false; + + auto &DA = Info.getDominatorAnalysis(); + + for (auto &BB : BF) { + for (auto &Inst : BB) { + if (!BC.MIA->hasAnnotation(Inst, getSlotTagName())) + continue; + auto Slot = + BC.MIA->getAnnotationAs( + Inst, getSlotTagName()); + if (!AvailableRegions.count(Slot)) + continue; + + if (!(DA.doesADominateB(P, Inst))) + continue; + + if (BC.MIA->isCFI(Inst)) { + if (Slot >= RegionAddr) + continue; + scheduleChange(Inst, WorklistItem(WorklistItem::AdjustCFI, -RegionSz)); + continue; + } + + auto FIE = FA.getFIEFor(BC, Inst); + assert(FIE); + if (FIE->StackPtrReg == BC.MIA->getStackPointer() && Slot < RegionAddr) + continue; + if (FIE->StackPtrReg == BC.MIA->getFramePointer() && Slot >= RegionAddr) + continue; + if (BC.MIA->isPush(Inst) || BC.MIA->isPop(Inst)) + continue; + scheduleChange( + Inst, WorklistItem(WorklistItem::AdjustLoadStoreOffset, -RegionSz)); + } + } + + InsertedRegions.insert(RegionAddr); + return true; +} + +void StackLayoutModifier::performChanges() { + std::set ModifiedCFIIndices; + for (auto &BB : BF) { + for (auto I = BB.rbegin(), E = BB.rend(); I != E; ++I) { + auto &Inst = *I; + if (BC.MIA->hasAnnotation(Inst, "AccessesDeletedPos")) { + assert(BC.MIA->isPop(Inst) || BC.MIA->isPush(Inst)); + BC.MIA->removeAnnotation(Inst, "AccessesDeletedPos"); + } + if (!BC.MIA->hasAnnotation(Inst, getTodoTagName())) + continue; + auto &WList = BC.MIA->getAnnotationAs>( + Inst, getTodoTagName()); + int64_t Adjustment = 0; + WorklistItem::ActionType AdjustmentType = WorklistItem::None; + for (auto &WI : WList) { + if (WI.Action == WorklistItem::None) + continue; + assert(WI.Action == WorklistItem::AdjustLoadStoreOffset || + WI.Action == WorklistItem::AdjustCFI); + assert((AdjustmentType == WorklistItem::None || + AdjustmentType == WI.Action) && + "Conflicting actions requested at the same program point"); + AdjustmentType = WI.Action; + Adjustment += WI.OffsetUpdate; + } + if (!Adjustment) + continue; + if (AdjustmentType != WorklistItem::AdjustLoadStoreOffset) { + assert(BC.MIA->isCFI(Inst)); + uint32_t CFINum = Inst.getOperand(0).getImm(); + if (ModifiedCFIIndices.count(CFINum)) + continue; + ModifiedCFIIndices.insert(CFINum); + MCCFIInstruction *CFI = BF.getCFIFor(Inst); + DEBUG(dbgs() << "Changing CFI offset from " << CFI->getOffset() + << " to " << (CFI->getOffset() + Adjustment) << "\n"); + CFI->setOffset(CFI->getOffset() + Adjustment); + continue; + } + int32_t SrcImm{0}; + MCPhysReg Reg{0}; + MCPhysReg StackPtrReg{0}; + int64_t StackOffset{0}; + bool IsIndexed{false}; + bool IsLoad{false}; + bool IsStore{false}; + bool IsSimple{false}; + bool IsStoreFromReg{false}; + uint8_t Size{0}; + bool Success{false}; + Success = BC.MIA->isStackAccess(Inst, IsLoad, IsStore, IsStoreFromReg, + Reg, SrcImm, StackPtrReg, StackOffset, + Size, IsSimple, IsIndexed); + assert(Success && IsSimple && !IsIndexed && (!IsStore || IsStoreFromReg)); + if (StackPtrReg != BC.MIA->getFramePointer()) + Adjustment = -Adjustment; + if (IsLoad) + Success = BC.MIA->createRestoreFromStack( + Inst, StackPtrReg, StackOffset + Adjustment, Reg, Size); + else if (IsStore) + Success = BC.MIA->createSaveToStack( + Inst, StackPtrReg, StackOffset + Adjustment, Reg, Size); + DEBUG({ + dbgs() << "Adjusted 
instruction: "; + Inst.dump(); + }); + assert(Success); + } + } +} + +void StackLayoutModifier::initialize() { + classifyStackAccesses(); + classifyCFIs(); + IsInitialized = true; +} + +uint64_t ShrinkWrapping::SpillsMovedRegularMode = 0; +uint64_t ShrinkWrapping::SpillsMovedPushPopMode = 0; + +using BBIterTy = BinaryBasicBlock::iterator; + +void ShrinkWrapping::classifyCSRUses() { + auto &DA = Info.getDominatorAnalysis(); + auto &SPT = Info.getStackPointerTracking(); + UsesByReg = std::vector(BC.MRI->getNumRegs(), + BitVector(DA.NumInstrs, false)); + + const BitVector &FPAliases = + BC.MIA->getAliases(BC.MIA->getFramePointer(), *BC.MRI); + for (auto &BB : BF) { + for (auto &Inst : BB) { + if (BC.MIA->isCFI(Inst)) + continue; + auto BV = BitVector(BC.MRI->getNumRegs(), false); + BC.MIA->getTouchedRegs(Inst, BV, *BC.MRI); + BV &= CSA.CalleeSaved; + for (int I = BV.find_first(); I != -1; I = BV.find_next(I)) { + if (I == 0) + continue; + if (CSA.getSavedReg(Inst) != I && CSA.getRestoredReg(Inst) != I) + UsesByReg[I].set(DA.ExprToIdx[&Inst]); + } + if (!SPT.HasFramePointer || !BC.MIA->isCall(Inst)) + continue; + BV = CSA.CalleeSaved; + BV &= FPAliases; + for (int I = BV.find_first(); I > 0; I = BV.find_next(I)) { + UsesByReg[I].set(DA.ExprToIdx[&Inst]); + } + } + } +} + +void ShrinkWrapping::pruneUnwantedCSRs() { + BitVector ParamRegs = BC.MIA->getRegsUsedAsParams(*BC.MRI); + for (unsigned I = 0, E = BC.MRI->getNumRegs(); I != E; ++I) { + if (!CSA.CalleeSaved[I]) + continue; + if (ParamRegs[I]) { + CSA.CalleeSaved.reset(I); + continue; + } + if (UsesByReg[I].empty()) { + DEBUG(dbgs() + << "Dismissing Callee-Saved Reg because we found no uses of it:" + << I << "\n"); + CSA.CalleeSaved.reset(I); + continue; + } + if (!CSA.HasRestores[I]) { + DEBUG(dbgs() << "Dismissing Callee-Saved Reg because it does not have " + "restores:" + << I << "\n"); + CSA.CalleeSaved.reset(I); + } + } +} + +void ShrinkWrapping::computeSaveLocations() { + SavePos = std::vector>(BC.MRI->getNumRegs()); + auto &RI = Info.getReachingInsnsBackwards(); + auto &DA = Info.getDominatorAnalysis(); + + DEBUG(dbgs() << "Checking save/restore possibilities\n"); + for (auto &BB : BF) { + DEBUG(dbgs() << "\tNow at BB " << BB.getName() << "\n"); + + MCInst *First = BB.begin() != BB.end() ? &*BB.begin() : nullptr; + if (!First) + continue; + + // Use reaching instructions to detect if we are inside a loop - if we + // are, do not consider this BB as valid placement for saves. + if (RI.isInLoop(BB)) + continue; + + for (unsigned I = 0, E = BC.MRI->getNumRegs(); I != E; ++I) { + if (!CSA.CalleeSaved[I]) + continue; + + auto BBDominatedUses = BitVector(DA.NumInstrs, false); + for (auto J = UsesByReg[I].find_first(); J > 0; + J = UsesByReg[I].find_next(J)) { + if (DA.doesADominateB(*First, J)) + BBDominatedUses.set(J); + } + DEBUG(dbgs() << "\t\tBB " << BB.getName() << " dominates " + << BBDominatedUses.count() << " uses for reg " << I + << ". 
Total uses for reg is " << UsesByReg[I].count() + << "\n"); + BBDominatedUses &= UsesByReg[I]; + if (BBDominatedUses == UsesByReg[I]) { + DEBUG(dbgs() << "\t\t\tAdded " << BB.getName() << " as a save pos for " + << I << "\n"); + SavePos[I].insert(First); + DEBUG({ + dbgs() << "Dominated uses are:\n"; + for (auto J = UsesByReg[I].find_first(); J > 0; + J = UsesByReg[I].find_next(J)) { + dbgs() << "Idx " << J << ": "; + DA.Expressions[J]->dump(); + } + }); + } + } + } + + BestSaveCount = std::vector(BC.MRI->getNumRegs(), + std::numeric_limits::max()); + BestSavePos = std::vector(BC.MRI->getNumRegs(), nullptr); + auto &InsnToBB = Info.getInsnToBBMap(); + for (unsigned I = 0, E = BC.MRI->getNumRegs(); I != E; ++I) { + if (!CSA.CalleeSaved[I]) + continue; + + for (auto *Pos : SavePos[I]) { + auto *BB = InsnToBB[Pos]; + uint64_t Count = BB->getExecutionCount(); + if (Count != BinaryBasicBlock::COUNT_NO_PROFILE && + Count < BestSaveCount[I]) { + BestSavePos[I] = Pos; + BestSaveCount[I] = Count; + } + } + } +} + +void ShrinkWrapping::computeDomOrder() { + std::vector Order; + for (MCPhysReg I = 0, E = BC.MRI->getNumRegs(); I != E; ++I) { + Order.push_back(I); + } + + auto &DA = Info.getDominatorAnalysis(); + auto &InsnToBB = Info.getInsnToBBMap(); + std::sort(Order.begin(), Order.end(), [&](const MCPhysReg &A, + const MCPhysReg &B) { + auto *BBA = BestSavePos[A] ? InsnToBB[BestSavePos[A]] : nullptr; + auto *BBB = BestSavePos[B] ? InsnToBB[BestSavePos[B]] : nullptr; + if (BBA == BBB) + return A < B; + if (!BBA && BBB) + return false; + if (BBA && !BBB) + return true; + if (DA.doesADominateB(*BestSavePos[A], *BestSavePos[B])) + return true; + if (DA.doesADominateB(*BestSavePos[B], *BestSavePos[A])) + return false; + return A < B; + }); + + for (MCPhysReg I = 0, E = BC.MRI->getNumRegs(); I != E; ++I) { + DomOrder[Order[I]] = I; + } +} + +bool ShrinkWrapping::isBestSavePosCold(unsigned CSR, MCInst *&BestPosSave, + uint64_t &TotalEstimatedWin) { + const uint64_t CurSavingCost = CSA.SavingCost[CSR]; + if (!CSA.CalleeSaved[CSR]) + return false; + + uint64_t BestCount = BestSaveCount[CSR]; + BestPosSave = BestSavePos[CSR]; + bool ShouldMove{false}; + if (BestCount != std::numeric_limits::max() && + BestCount < (opts::ShrinkWrappingThreshold / 100.0) * CurSavingCost) { + DEBUG({ + auto &InsnToBB = Info.getInsnToBBMap(); + dbgs() << "Better position for saves found in func " << BF.getPrintName() + << " count << " << BF.getKnownExecutionCount() << "\n"; + dbgs() << "Reg: " << CSR + << "; New BB: " << InsnToBB[BestPosSave]->getName() + << " Freq reduction: " << (CurSavingCost - BestCount) << "\n"; + }); + TotalEstimatedWin += CurSavingCost - BestCount; + ShouldMove = true; + } + + if (!ShouldMove) + return false; + if (!BestPosSave) { + DEBUG({ + dbgs() << "Dropping opportunity because we don't know where to put " + "stores -- total est. 
freq reduc: " + << TotalEstimatedWin << "\n"; + }); + return false; + } + return true; +} + +/// Auxiliar function used to create basic blocks for critical edges and update +/// the dominance frontier with these new locations +void ShrinkWrapping::splitFrontierCritEdges( + BinaryFunction *Func, SmallVector &Frontier, + const SmallVector &IsCritEdge, + const SmallVector &From, + const SmallVector, 4> &To) { + DEBUG(dbgs() << "splitFrontierCritEdges: Now handling func " + << BF.getPrintName() << "\n"); + for (size_t I = 0; I < Frontier.size(); ++I) { + if (!IsCritEdge[I]) + continue; + if (To[I].empty()) + continue; + auto FromBB = From[I]; + DEBUG(dbgs() << " - Now handling FrontierBB " << FromBB->getName() << "\n"); + for (auto DestinationBB : To[I]) { + DEBUG(dbgs() << " - Dest : " << DestinationBB->getName() << "\n"); + auto *NewBB = Func->splitEdge(FromBB, DestinationBB); + // Insert dummy instruction so this BB is never empty (we need this for + // PredictiveStackPointerTracking to work, since it annotates instructions + // and not BBs). + if (NewBB->empty()) { + MCInst NewInst; + BC.MIA->createNoop(NewInst); + NewBB->addInstruction(std::move(NewInst)); + scheduleChange(&*NewBB->begin(), WorklistItem(WorklistItem::Erase, 0)); + } + + // Update frontier + Frontier[I] = ProgramPoint::getLastPointAt(*NewBB); + } + } +} + +SmallVector +ShrinkWrapping::doRestorePlacement(MCInst *BestPosSave, unsigned CSR, + uint64_t TotalEstimatedWin) { + SmallVector Frontier; + SmallVector IsCritEdge; + bool CannotPlace{false}; + auto &DA = Info.getDominatorAnalysis(); + + SmallVector CritEdgesFrom; + SmallVector, 4> CritEdgesTo; + // In case of a critical edge, we need to create extra BBs to host restores + // into edges transitioning to the dominance frontier, otherwise we pull these + // restores to inside the dominated area. + Frontier = DA.getDominanceFrontierFor(*BestPosSave); + for (auto &PP : Frontier) { + bool HasCritEdges{false}; + if (PP.isInst() && BC.MIA->isTerminator(*PP.getInst()) && + doesInstUsesCSR(*PP.getInst(), CSR)) { + CannotPlace = true; + } + BinaryBasicBlock *FrontierBB = Info.getParentBB(PP); + CritEdgesFrom.emplace_back(FrontierBB); + CritEdgesTo.emplace_back(0); + auto &Dests = CritEdgesTo.back(); + bool MayNeedLPSplitting{false}; + // Check for invoke instructions at the dominance frontier, which indicates + // the landing pad is not dominated. + if (PP.isInst() && BC.MIA->isInvoke(*PP.getInst())) + MayNeedLPSplitting = true; + doForAllSuccs(*FrontierBB, [&](ProgramPoint P) { + if (!DA.doesADominateB(*BestPosSave, P)) { + Dests.emplace_back(Info.getParentBB(P)); + return; + } + HasCritEdges = true; + }); + // This confirms LP splitting is necessary to continue. Bail. + if (MayNeedLPSplitting && Dests.empty()) { + DEBUG(dbgs() << "Bailing on restore placement to avoid LP splitting\n"); + Frontier.clear(); + return Frontier; + } + IsCritEdge.push_back(HasCritEdges); + } + if (std::accumulate(IsCritEdge.begin(), IsCritEdge.end(), 0)) { + DEBUG({ + dbgs() << "Now detected critical edges in the following frontier:\n"; + for (auto &PP : Frontier) { + if (PP.isBB()) + dbgs() << " BB: " << PP.getBB()->getName() << "\n"; + else { + dbgs() << " Inst: "; + PP.getInst()->dump(); + } + } + }); + splitFrontierCritEdges(&BF, Frontier, IsCritEdge, CritEdgesFrom, + CritEdgesTo); + // BitVectors that represent all insns of the function are invalid now + // since we changed BBs/Insts. 
Re-run steps that depend on pointers being + // valid + Info.invalidateAll(); + classifyCSRUses(); + } + if (CannotPlace) { + DEBUG({ + dbgs() << "Dropping opportunity because restore placement failed" + " -- total est. freq reduc: " + << TotalEstimatedWin << "\n"; + }); + Frontier.clear(); + return Frontier; + } + return Frontier; +} + +bool ShrinkWrapping::validatePushPopsMode(unsigned CSR, MCInst *BestPosSave, + int64_t SaveOffset) { + if (FA.requiresAlignment(BF)) { + DEBUG({ + dbgs() << "Reg " << CSR << " is not using push/pops due to function " + "alignment requirements.\n"; + }); + return false; + } + for (MCInst *Save : CSA.getSavesByReg(CSR)) { + if (!SLM.canCollapseRegion(Save)) { + DEBUG(dbgs() << "Reg " << CSR << " cannot collapse region.\n"); + return false; + } + } + + auto &SPT = Info.getStackPointerTracking(); + // Abort if we are inserting a push into an entry BB (offset -8) and this + // func sets up a frame pointer. + if (!SLM.canInsertRegion(BestPosSave) || + SaveOffset == SPT.SUPERPOSITION || SaveOffset == SPT.EMPTY || + (SaveOffset == -8 && SPT.HasFramePointer)) { + DEBUG({ + dbgs() << "Reg " << CSR << " cannot insert region or we are " + "trying to insert a push into entry bb.\n"; + }); + return false; + } + return true; +} + +SmallVector ShrinkWrapping::fixPopsPlacements( + const SmallVector &RestorePoints, int64_t SaveOffset, + unsigned CSR) { + SmallVector FixedRestorePoints = RestorePoints; + // Moving pop locations to the correct sp offset + auto &RI = Info.getReachingInsnsBackwards(); + auto &SPT = Info.getStackPointerTracking(); + for (auto &PP : FixedRestorePoints) { + auto *BB = Info.getParentBB(PP); + auto Found = false; + if (SPT.getStateAt(ProgramPoint::getLastPointAt(*BB))->first == + SaveOffset) { + BitVector BV = *RI.getStateAt(ProgramPoint::getLastPointAt(*BB)); + BV &= UsesByReg[CSR]; + if (!BV.any()) { + Found = true; + PP = BB; + continue; + } + } + for (auto RIt = BB->rbegin(), End = BB->rend(); RIt != End; ++RIt) { + if (SPT.getStateBefore(*RIt)->first == SaveOffset) { + BitVector BV = *RI.getStateAt(*RIt); + BV &= UsesByReg[CSR]; + if (!BV.any()) { + Found = true; + PP = &*RIt; + break; + } + } + } + if (!Found) { + DEBUG({ + dbgs() << "Could not find restore insertion point for " << CSR + << ", falling back to load/store mode\n"; + }); + FixedRestorePoints.clear(); + return FixedRestorePoints; + } + } + return FixedRestorePoints; +} + +void ShrinkWrapping::scheduleOldSaveRestoresRemoval(unsigned CSR, + bool UsePushPops) { + + for (auto &BB : BF.layout()) { + std::vector CFIs; + for (auto I = BB->rbegin(), E = BB->rend(); I != E; ++I) { + auto &Inst = *I; + if (BC.MIA->isCFI(Inst)) { + // Delete all offset CFIs related to this CSR + if (SLM.getOffsetCFIReg(Inst) == CSR) { + HasDeletedOffsetCFIs[CSR] = true; + scheduleChange(&Inst, WorklistItem(WorklistItem::Erase, CSR)); + continue; + } + CFIs.push_back(&Inst); + continue; + } + + auto SavedReg = CSA.getSavedReg(Inst); + auto RestoredReg = CSA.getRestoredReg(Inst); + if (SavedReg != CSR && RestoredReg != CSR) { + CFIs.clear(); + continue; + } + + scheduleChange(&Inst, WorklistItem(UsePushPops + ? WorklistItem::Erase + : WorklistItem::ChangeToAdjustment, + CSR)); + + // Delete associated CFIs + const bool RecordDeletedPushCFIs = + SavedReg == CSR && DeletedPushCFIs[CSR].empty(); + const bool RecordDeletedPopCFIs = + RestoredReg == CSR && DeletedPopCFIs[CSR].empty(); + for (MCInst *CFI : CFIs) { + auto *MCCFI = BF.getCFIFor(*CFI); + // Do not touch these... 
+ if (MCCFI->getOperation() == MCCFIInstruction::OpRestoreState || + MCCFI->getOperation() == MCCFIInstruction::OpRememberState) + continue; + scheduleChange(CFI, WorklistItem(WorklistItem::Erase, CSR)); + if (RecordDeletedPushCFIs) { + // Do not record this to be replayed later because we are going to + // rebuild it. + if (MCCFI->getOperation() == MCCFIInstruction::OpDefCfaOffset) + continue; + DeletedPushCFIs[CSR].push_back(CFI->getOperand(0).getImm()); + } + if (RecordDeletedPopCFIs) { + if (MCCFI->getOperation() == MCCFIInstruction::OpDefCfaOffset) + continue; + DeletedPopCFIs[CSR].push_back(CFI->getOperand(0).getImm()); + } + } + CFIs.clear(); + } + } +} + +bool ShrinkWrapping::doesInstUsesCSR(const MCInst &Inst, uint16_t CSR) { + if (BC.MIA->isCFI(Inst) || CSA.getSavedReg(Inst) == CSR || + CSA.getRestoredReg(Inst) == CSR) + return false; + BitVector BV = BitVector(BC.MRI->getNumRegs(), false); + BC.MIA->getTouchedRegs(Inst, BV, *BC.MRI); + return BV[CSR]; +} + +void ShrinkWrapping::scheduleSaveRestoreInsertions( + unsigned CSR, MCInst *BestPosSave, + SmallVector &RestorePoints, bool UsePushPops) { + auto &InsnToBB = Info.getInsnToBBMap(); + auto FIESave = CSA.SaveFIEByReg[CSR]; + auto FIELoad = CSA.LoadFIEByReg[CSR]; + assert(FIESave && FIELoad && "Invalid CSR"); + + DEBUG({ + dbgs() << "Scheduling save insertion at: "; + BestPosSave->dump(); + }); + + scheduleChange(BestPosSave, UsePushPops ? WorklistItem::InsertPushOrPop + : WorklistItem::InsertLoadOrStore, + *FIESave, CSR); + + for (auto &PP : RestorePoints) { + BinaryBasicBlock *FrontierBB = Info.getParentBB(PP); + DEBUG({ + dbgs() << "Scheduling restore insertion at: "; + if (PP.isInst()) + PP.getInst()->dump(); + else { + dbgs() << PP.getBB()->getName() << "\n"; + } + }); + MCInst *Term = + FrontierBB->getTerminatorBefore(PP.isInst() ? PP.getInst() : nullptr); + if (Term) + PP = Term; + if (PP.isInst() && doesInstUsesCSR(*PP.getInst(), CSR)) { + assert(!InsnToBB[PP.getInst()]->hasTerminatorAfter(PP.getInst()) && + "cannot move to end of bb"); + scheduleChange(InsnToBB[PP.getInst()], + UsePushPops ? WorklistItem::InsertPushOrPop + : WorklistItem::InsertLoadOrStore, + *FIELoad, CSR); + continue; + } + scheduleChange(PP, UsePushPops ? 
WorklistItem::InsertPushOrPop + : WorklistItem::InsertLoadOrStore, + *FIELoad, CSR); + } +} + +void ShrinkWrapping::moveSaveRestores() { + bool DisablePushPopMode{false}; + bool UsedPushPopMode{false}; + + for (unsigned I = 0, E = BC.MRI->getNumRegs(); I != E; ++I) { + MCInst *BestPosSave{nullptr}; + uint64_t TotalEstimatedWin{0}; + if (!isBestSavePosCold(I, BestPosSave, TotalEstimatedWin)) + continue; + SmallVector RestorePoints = + doRestorePlacement(BestPosSave, I, TotalEstimatedWin); + if (RestorePoints.empty()) + continue; + + auto FIESave = CSA.SaveFIEByReg[I]; + auto FIELoad = CSA.LoadFIEByReg[I]; + assert(FIESave && FIELoad); + auto &SPT = Info.getStackPointerTracking(); + auto SaveOffset = SPT.getStateBefore(*BestPosSave)->first; + auto SaveSize = FIESave->Size; + + // Operation mode: if true, will insert push/pops instead of loads/restores + bool UsePushPops = validatePushPopsMode(I, BestPosSave, SaveOffset); + + if (UsePushPops) { + auto FixedRestorePoints = fixPopsPlacements(RestorePoints, SaveOffset, I); + if (FixedRestorePoints.empty()) + UsePushPops = false; + else + RestorePoints = FixedRestorePoints; + } + + // Disable push-pop mode for all CSRs in this function + if (!UsePushPops) + DisablePushPopMode = true; + else + UsedPushPopMode = true; + + scheduleOldSaveRestoresRemoval(I, UsePushPops); + scheduleSaveRestoreInsertions(I, BestPosSave, RestorePoints, UsePushPops); + + // Schedule modifications to stack-accessing instructions via + // StackLayoutModifier + if (UsePushPops) { + for (MCInst *Save : CSA.getSavesByReg(I)) { + SLM.collapseRegion(Save); + } + SLM.insertRegion(BestPosSave, SaveSize); + } + + // Stats collection + if (UsePushPops) + ++SpillsMovedPushPopMode; + else + ++SpillsMovedRegularMode; + } + + // Revert push-pop mode if it failed for a single CSR + if (DisablePushPopMode && UsedPushPopMode) { + for (auto &BB : BF) { + auto WRI = Todo.find(&BB); + if (WRI != Todo.end()) { + auto &TodoList = WRI->second; + for (auto &Item : TodoList) { + if (Item.Action == WorklistItem::InsertPushOrPop) + Item.Action = WorklistItem::InsertLoadOrStore; + } + } + for (auto I = BB.rbegin(), E = BB.rend(); I != E; ++I) { + auto &Inst = *I; + auto TodoList = BC.MIA->tryGetAnnotationAs>( + Inst, getAnnotationName()); + if (!TodoList) + continue; + bool isCFI = BC.MIA->isCFI(Inst); + for (auto &Item : *TodoList) { + if (Item.Action == WorklistItem::InsertPushOrPop) + Item.Action = WorklistItem::InsertLoadOrStore; + if (!isCFI && Item.Action == WorklistItem::Erase) + Item.Action = WorklistItem::ChangeToAdjustment; + } + } + } + } +} + +namespace { + +// A special StackPointerTracking that compensates for our future plans +// in removing/adding insn. 
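
The compensation reduces to simple offset arithmetic: erasing a push gives its
bytes back to the tracked SP offset, erasing a pop takes them away, and
scheduled insertions do the reverse. A standalone sketch of that arithmetic
(toy helper, not the pass itself; the stack grows downward, so the tracked
offset becomes more negative as the frame grows):

    #include <cstdint>

    // Mirrors the adjustments performed in compNextAux below.
    int64_t compensateSPOffset(int64_t Offset, bool ErasedPush, bool ErasedPop,
                               bool WillInsertPush, bool WillInsertPop,
                               int64_t Size) {
      if (ErasedPush)     Offset += Size; // allocation no longer happens
      if (ErasedPop)      Offset -= Size; // deallocation no longer happens
      if (WillInsertPush) Offset -= Size; // a new allocation will happen here
      if (WillInsertPop)  Offset += Size; // a new deallocation will happen here
      return Offset;
    }
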
+class PredictiveStackPointerTracking + : public StackPointerTrackingBase { + friend class DataflowAnalysis>; + decltype(ShrinkWrapping::Todo) &TodoMap; + DataflowInfoManager &Info; + +protected: + void compNextAux(const MCInst &Point, + const std::vector &TodoItems, + std::pair &Res) { + for (const auto &Item : TodoItems) { + if (Item.Action == ShrinkWrapping::WorklistItem::Erase && + BC.MIA->isPush(Point)) { + Res.first += BC.MIA->getPushSize(Point); + continue; + } + if (Item.Action == ShrinkWrapping::WorklistItem::Erase && + BC.MIA->isPop(Point)) { + Res.first -= BC.MIA->getPopSize(Point); + continue; + } + if (Item.Action == ShrinkWrapping::WorklistItem::InsertPushOrPop && + Item.FIEToInsert.IsStore) { + Res.first -= Item.FIEToInsert.Size; + continue; + } + if (Item.Action == ShrinkWrapping::WorklistItem::InsertPushOrPop && + Item.FIEToInsert.IsLoad) { + Res.first += Item.FIEToInsert.Size; + continue; + } + } + } + + std::pair computeNext(const MCInst &Point, + const std::pair &Cur) { + std::pair Res = + StackPointerTrackingBase::computeNext( + Point, Cur); + if (Res.first == StackPointerTracking::SUPERPOSITION || + Res.first == StackPointerTracking::EMPTY) + return Res; + auto TodoItems = + BC.MIA->tryGetAnnotationAs>( + Point, ShrinkWrapping::getAnnotationName()); + if (TodoItems) + compNextAux(Point, *TodoItems, Res); + auto &InsnToBBMap = Info.getInsnToBBMap(); + if (&*InsnToBBMap[&Point]->rbegin() != &Point) + return Res; + auto WRI = TodoMap.find(InsnToBBMap[&Point]); + if (WRI == TodoMap.end()) + return Res; + compNextAux(Point, WRI->second, Res); + return Res; + } + + StringRef getAnnotationName() const { + return StringRef("PredictiveStackPointerTracking"); + } + +public: + PredictiveStackPointerTracking(const BinaryContext &BC, BinaryFunction &BF, + decltype(ShrinkWrapping::Todo) &TodoMap, + DataflowInfoManager &Info) + : StackPointerTrackingBase(BC, BF), + TodoMap(TodoMap), Info(Info) {} + + void run() { + NamedRegionTimer T1("PSPT", "Dataflow", true); + StackPointerTrackingBase::run(); + } +}; + +} // end anonymous namespace + +void ShrinkWrapping::insertUpdatedCFI(unsigned CSR, int SPValPush, + int SPValPop) { + MCInst *SavePoint{nullptr}; + for (auto &BB : BF) { + for (auto InstIter = BB.rbegin(), EndIter = BB.rend(); InstIter != EndIter; + ++InstIter) { + int32_t SrcImm{0}; + MCPhysReg Reg{0}; + MCPhysReg StackPtrReg{0}; + int64_t StackOffset{0}; + bool IsIndexed{false}; + bool IsLoad{false}; + bool IsStore{false}; + bool IsSimple{false}; + bool IsStoreFromReg{false}; + uint8_t Size{0}; + if (!BC.MIA->isStackAccess(*InstIter, IsLoad, IsStore, IsStoreFromReg, + Reg, SrcImm, StackPtrReg, StackOffset, Size, + IsSimple, IsIndexed)) + continue; + if (Reg != CSR || !IsStore) + continue; + SavePoint = &*InstIter; + break; + } + if (SavePoint) + break; + } + assert(SavePoint); + DEBUG({ + dbgs() << "Now using as save point for reg " << CSR << " :"; + SavePoint->dump(); + }); + bool PrevAffectedZone{false}; + BinaryBasicBlock *PrevBB{nullptr}; + auto &DA = Info.getDominatorAnalysis(); + for (auto BB : BF.layout()) { + if (BB->size() == 0) + continue; + const bool InAffectedZoneAtEnd = DA.count(*BB->rbegin(), *SavePoint); + const bool InAffectedZoneAtBegin = + (*DA.getStateBefore(*BB->begin()))[DA.ExprToIdx[SavePoint]]; + bool InAffectedZone = InAffectedZoneAtBegin; + for (auto InstIter = BB->begin(); InstIter != BB->end(); ++InstIter) { + const bool CurZone = DA.count(*InstIter, *SavePoint); + if (InAffectedZone != CurZone) { + auto InsertionIter = InstIter; + ++InsertionIter; + 
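+        // (zone boundary: execution is crossing between the region dominated
+        // by the new save point and the rest of the function, so CFIs
+        // describing the callee-saved register state of the side being
+        // entered are emitted here)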
InAffectedZone = CurZone; + if (InAffectedZone) { + InstIter = --insertCFIsForPushOrPop(*BB, InsertionIter, CSR, true, 0, + SPValPop); + } else { + InstIter = --insertCFIsForPushOrPop(*BB, InsertionIter, CSR, false, 0, + SPValPush); + } + } + } + if (InAffectedZoneAtBegin != PrevAffectedZone) { + if (InAffectedZoneAtBegin) { + insertCFIsForPushOrPop(*PrevBB, PrevBB->end(), CSR, true, 0, SPValPush); + } else { + insertCFIsForPushOrPop(*PrevBB, PrevBB->end(), CSR, false, 0, SPValPop); + } + } + PrevAffectedZone = InAffectedZoneAtEnd; + PrevBB = BB; + } +} + +void ShrinkWrapping::rebuildCFIForSP() { + for (auto &BB : BF) { + for (auto &Inst : BB) { + if (!BC.MIA->isCFI(Inst)) + continue; + auto *CFI = BF.getCFIFor(Inst); + if (CFI->getOperation() == MCCFIInstruction::OpDefCfaOffset) + BC.MIA->addAnnotation(BC.Ctx.get(), Inst, "DeleteMe", 0U); + } + } + + int PrevSPVal{-8}; + BinaryBasicBlock *PrevBB{nullptr}; + auto &SPT = Info.getStackPointerTracking(); + for (auto BB : BF.layout()) { + if (BB->size() == 0) + continue; + const int SPValAtEnd = SPT.getStateAt(*BB->rbegin())->first; + const int SPValAtBegin = SPT.getStateBefore(*BB->begin())->first; + int SPVal = SPValAtBegin; + for (auto Iter = BB->begin(); Iter != BB->end(); ++Iter) { + const int CurVal = SPT.getStateAt(*Iter)->first; + if (SPVal != CurVal) { + auto InsertionIter = Iter; + ++InsertionIter; + Iter = BF.addCFIInstruction( + BB, InsertionIter, + MCCFIInstruction::createDefCfaOffset(nullptr, SPVal)); + SPVal = CurVal; + } + } + if (SPValAtBegin != PrevSPVal) { + BF.addCFIInstruction( + PrevBB, PrevBB->end(), + MCCFIInstruction::createDefCfaOffset(nullptr, SPValAtBegin)); + } + PrevSPVal = SPValAtEnd; + PrevBB = BB; + } + + for (auto &BB : BF) + for (auto I = BB.rbegin(), E = BB.rend(); I != E; ++I) + if (BC.MIA->hasAnnotation(*I, "DeleteMe")) + BB.eraseInstruction(&*I); +} + +MCInst ShrinkWrapping::createStackAccess(int SPVal, int FPVal, + const FrameIndexEntry &FIE, + bool CreatePushOrPop) { + MCInst NewInst; + if (SPVal != StackPointerTracking::SUPERPOSITION && + SPVal != StackPointerTracking::EMPTY) { + if (FIE.IsLoad) { + if (!BC.MIA->createRestoreFromStack(NewInst, BC.MIA->getStackPointer(), + FIE.StackOffset - SPVal, FIE.RegOrImm, + FIE.Size)) { + errs() << "createRestoreFromStack: not supported on this platform\n"; + abort(); + } + } else { + if (!BC.MIA->createSaveToStack(NewInst, BC.MIA->getStackPointer(), + FIE.StackOffset - SPVal, FIE.RegOrImm, + FIE.Size)) { + errs() << "createSaveToStack: not supported on this platform\n"; + abort(); + } + } + if (CreatePushOrPop) + BC.MIA->changeToPushOrPop(NewInst); + return NewInst; + } + assert(FPVal != StackPointerTracking::SUPERPOSITION && + FPVal != StackPointerTracking::EMPTY); + + if (FIE.IsLoad) { + if (!BC.MIA->createRestoreFromStack(NewInst, BC.MIA->getFramePointer(), + FIE.StackOffset - FPVal, FIE.RegOrImm, + FIE.Size)) { + errs() << "createRestoreFromStack: not supported on this platform\n"; + abort(); + } + } else { + if (!BC.MIA->createSaveToStack(NewInst, BC.MIA->getFramePointer(), + FIE.StackOffset - FPVal, FIE.RegOrImm, + FIE.Size)) { + errs() << "createSaveToStack: not supported on this platform\n"; + abort(); + } + } + return NewInst; +} + +void ShrinkWrapping::updateCFIInstOffset(MCInst &Inst, int64_t NewOffset) { + auto *CFI = BF.getCFIFor(Inst); + if (UpdatedCFIs.count(CFI)) + return; + + switch (CFI->getOperation()) { + case MCCFIInstruction::OpDefCfa: + case MCCFIInstruction::OpDefCfaRegister: + case MCCFIInstruction::OpDefCfaOffset: + 
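+    // (only CFA-defining CFIs take the recomputed offset; per-register
+    // OpOffset records are re-created by the push/pop CFI replay instead)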
CFI->setOffset(NewOffset); + break; + case MCCFIInstruction::OpOffset: + default: + break; + } + + UpdatedCFIs.insert(CFI); +} + +BBIterTy ShrinkWrapping::insertCFIsForPushOrPop(BinaryBasicBlock &BB, + BBIterTy Pos, unsigned Reg, + bool isPush, int Sz, + int64_t NewOffset) { + if (isPush) { + for (uint32_t Idx : DeletedPushCFIs[Reg]) { + Pos = BF.addCFIPseudo(&BB, Pos, Idx); + updateCFIInstOffset(*Pos++, NewOffset); + } + if (HasDeletedOffsetCFIs[Reg]) { + Pos = ++BF.addCFIInstruction( + &BB, Pos, + MCCFIInstruction::createOffset( + nullptr, BC.MRI->getDwarfRegNum(Reg, false), NewOffset)); + } + } else { + for (uint32_t Idx : DeletedPopCFIs[Reg]) { + Pos = BF.addCFIPseudo(&BB, Pos, Idx); + updateCFIInstOffset(*Pos++, NewOffset); + } + if (HasDeletedOffsetCFIs[Reg]) { + Pos = ++BF.addCFIInstruction( + &BB, Pos, + MCCFIInstruction::createSameValue( + nullptr, BC.MRI->getDwarfRegNum(Reg, false))); + } + } + return Pos; +} + +BBIterTy ShrinkWrapping::processInsertion(BBIterTy InsertionPoint, + BinaryBasicBlock *CurBB, + const WorklistItem &Item, + int64_t SPVal, int64_t FPVal) { + // Trigger CFI reconstruction for this CSR if necessary - writing to + // PushOffsetByReg/PopOffsetByReg *will* trigger CFI update + if ((Item.FIEToInsert.IsStore && + !DeletedPushCFIs[Item.AffectedReg].empty()) || + (Item.FIEToInsert.IsLoad && !DeletedPopCFIs[Item.AffectedReg].empty()) || + HasDeletedOffsetCFIs[Item.AffectedReg]) { + if (Item.Action == WorklistItem::InsertPushOrPop) { + if (Item.FIEToInsert.IsStore) + PushOffsetByReg[Item.AffectedReg] = SPVal - Item.FIEToInsert.Size; + else + PopOffsetByReg[Item.AffectedReg] = SPVal; + } else { + if (Item.FIEToInsert.IsStore) + PushOffsetByReg[Item.AffectedReg] = Item.FIEToInsert.StackOffset; + else + PopOffsetByReg[Item.AffectedReg] = Item.FIEToInsert.StackOffset; + } + } + + DEBUG({ + dbgs() << "Creating stack access with SPVal = " << SPVal + << "; stack offset = " << Item.FIEToInsert.StackOffset + << " Is push = " << (Item.Action == WorklistItem::InsertPushOrPop) + << "\n"; + }); + MCInst NewInst = + createStackAccess(SPVal, FPVal, Item.FIEToInsert, + Item.Action == WorklistItem::InsertPushOrPop); + if (InsertionPoint != CurBB->end()) { + DEBUG({ + dbgs() << "Adding before Inst: "; + InsertionPoint->dump(); + dbgs() << "the following inst: "; + NewInst.dump(); + }); + return ++CurBB->insertInstruction(InsertionPoint, std::move(NewInst)); + } + CurBB->addInstruction(std::move(NewInst)); + DEBUG(dbgs() << "Adding to BB!\n"); + return CurBB->end(); +} + +BBIterTy ShrinkWrapping::processInsertionsList( + BBIterTy InsertionPoint, BinaryBasicBlock *CurBB, + std::vector &TodoList, int64_t SPVal, int64_t FPVal) { + bool HasInsertions{false}; + for (auto &Item : TodoList) { + if (Item.Action == WorklistItem::Erase || + Item.Action == WorklistItem::ChangeToAdjustment) + continue; + HasInsertions = true; + break; + } + + if (!HasInsertions) + return InsertionPoint; + + assert(((SPVal != StackPointerTracking::SUPERPOSITION && + SPVal != StackPointerTracking::EMPTY) || + (FPVal != StackPointerTracking::SUPERPOSITION && + FPVal != StackPointerTracking::EMPTY)) && + "Cannot insert if we have no idea of the stack state here"); + + // Revert the effect of PSPT for this location, we want SP Value before + // insertions + if (InsertionPoint == CurBB->end()) { + for (auto &Item : TodoList) { + if (Item.Action != WorklistItem::InsertPushOrPop) + continue; + if (Item.FIEToInsert.IsStore) + SPVal += Item.FIEToInsert.Size; + if (Item.FIEToInsert.IsLoad) + SPVal -= Item.FIEToInsert.Size; 
+    }
+  }
+
+  // Reorder POPs to obey the correct dominance relation between them
+  std::stable_sort(TodoList.begin(), TodoList.end(),
+                   [&](const WorklistItem &A, const WorklistItem &B) {
+    if ((A.Action != WorklistItem::InsertPushOrPop || !A.FIEToInsert.IsLoad) &&
+        (B.Action != WorklistItem::InsertPushOrPop || !B.FIEToInsert.IsLoad))
+      return false;
+    if ((A.Action != WorklistItem::InsertPushOrPop || !A.FIEToInsert.IsLoad))
+      return false;
+    if ((B.Action != WorklistItem::InsertPushOrPop || !B.FIEToInsert.IsLoad))
+      return true;
+    return DomOrder[B.AffectedReg] < DomOrder[A.AffectedReg];
+  });
+
+  // Process insertions
+  for (auto &Item : TodoList) {
+    if (Item.Action == WorklistItem::Erase ||
+        Item.Action == WorklistItem::ChangeToAdjustment)
+      continue;
+
+    InsertionPoint =
+        processInsertion(InsertionPoint, CurBB, Item, SPVal, FPVal);
+    if (Item.Action == WorklistItem::InsertPushOrPop &&
+        Item.FIEToInsert.IsStore) {
+      SPVal -= Item.FIEToInsert.Size;
+    }
+    if (Item.Action == WorklistItem::InsertPushOrPop &&
+        Item.FIEToInsert.IsLoad) {
+      SPVal += Item.FIEToInsert.Size;
+    }
+  }
+  return InsertionPoint;
+}
+
+bool ShrinkWrapping::processInsertions() {
+  PredictiveStackPointerTracking PSPT(BC, BF, Todo, Info);
+  PSPT.run();
+
+  bool Changes{false};
+  for (auto &BB : BF) {
+    // Process insertions before some inst.
+    for (auto I = BB.begin(); I != BB.end(); ++I) {
+      auto &Inst = *I;
+      auto TodoList = BC.MIA->tryGetAnnotationAs<std::vector<WorklistItem>>(
+          Inst, getAnnotationName());
+      if (!TodoList)
+        continue;
+      Changes = true;
+      auto List = *TodoList;
+      DEBUG({
+        dbgs() << "Now processing insertions in " << BB.getName()
+               << " before inst: ";
+        Inst.dump();
+      });
+      auto Iter = I;
+      auto SPTState =
+          *PSPT.getStateAt(Iter == BB.begin() ? (ProgramPoint)&BB : &*(--Iter));
+      I = processInsertionsList(I, &BB, List, SPTState.first, SPTState.second);
+    }
+    // Process insertions at the end of the BB
+    auto WRI = Todo.find(&BB);
+    if (WRI != Todo.end()) {
+      auto SPTState = *PSPT.getStateAt(*BB.rbegin());
+      processInsertionsList(BB.end(), &BB, WRI->second, SPTState.first,
+                            SPTState.second);
+      Changes = true;
+    }
+  }
+  return Changes;
+}
+
+void ShrinkWrapping::processDeletions() {
+  auto &LA = Info.getLivenessAnalysis();
+  for (auto &BB : BF) {
+    for (auto I = BB.rbegin(), E = BB.rend(); I != E; ++I) {
+      auto &Inst = *I;
+      auto TodoList = BC.MIA->tryGetAnnotationAs<std::vector<WorklistItem>>(
+          Inst, getAnnotationName());
+      if (!TodoList)
+        continue;
+      // Process all deletions
+      for (auto &Item : *TodoList) {
+        if (Item.Action != WorklistItem::Erase &&
+            Item.Action != WorklistItem::ChangeToAdjustment)
+          continue;
+
+        if (Item.Action == WorklistItem::ChangeToAdjustment) {
+          // Is the flag register alive at this point?
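+          // If so, the replacement SP arithmetic must not clobber the flags;
+          // on x86 this typically means emitting a lea instead of an add/sub.
+          // The result is forwarded to the create* callbacks below.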
+          bool DontClobberFlags = LA.isAlive(&Inst, BC.MIA->getFlagsReg());
+          if (auto Sz = BC.MIA->getPushSize(Inst)) {
+            BC.MIA->createStackPointerIncrement(Inst, Sz, DontClobberFlags);
+            continue;
+          }
+          if (auto Sz = BC.MIA->getPopSize(Inst)) {
+            BC.MIA->createStackPointerDecrement(Inst, Sz, DontClobberFlags);
+            continue;
+          }
+        }
+
+        DEBUG({
+          dbgs() << "Erasing: ";
+          Inst.dump();
+        });
+        BB.eraseInstruction(&Inst);
+        break;
+      }
+    }
+  }
+}
+
+void ShrinkWrapping::rebuildCFI() {
+  const bool FP = Info.getStackPointerTracking().HasFramePointer;
+  Info.invalidateAll();
+  if (!FP) {
+    rebuildCFIForSP();
+    Info.invalidateAll();
+  }
+  for (unsigned I = 0, E = BC.MRI->getNumRegs(); I != E; ++I) {
+    if (PushOffsetByReg[I] == 0 || PopOffsetByReg[I] == 0)
+      continue;
+    const int64_t SPValPush = PushOffsetByReg[I];
+    const int64_t SPValPop = PopOffsetByReg[I];
+    insertUpdatedCFI(I, SPValPush, SPValPop);
+    Info.invalidateAll();
+  }
+}
+
+void ShrinkWrapping::perform() {
+  HasDeletedOffsetCFIs = std::vector<bool>(BC.MRI->getNumRegs(), false);
+  PushOffsetByReg = std::vector<int64_t>(BC.MRI->getNumRegs(), 0LL);
+  PopOffsetByReg = std::vector<int64_t>(BC.MRI->getNumRegs(), 0LL);
+  DomOrder = std::vector<unsigned>(BC.MRI->getNumRegs(), 0);
+
+  SLM.initialize();
+  CSA.compute();
+  classifyCSRUses();
+  pruneUnwantedCSRs();
+  computeSaveLocations();
+  computeDomOrder();
+  moveSaveRestores();
+  DEBUG({
+    dbgs() << "Func before shrink-wrapping: \n";
+    BF.dump();
+  });
+  SLM.performChanges();
+  // Early exit if processInsertions doesn't detect any todo items
+  if (!processInsertions())
+    return;
+  processDeletions();
+  rebuildCFI();
+  // We may have split edges, creating BBs that need correct branching
+  BF.fixBranches();
+  DEBUG({
+    dbgs() << "Func after shrink-wrapping: \n";
+    BF.dump();
+  });
+}
+
+void ShrinkWrapping::printStats() {
+  outs() << "BOLT-INFO: Shrink wrapping moved " << SpillsMovedRegularMode
+         << " spills by inserting load/stores and " << SpillsMovedPushPopMode
+         << " spills by inserting push/pops\n";
+}
+
+// Operators necessary as a result of using MCAnnotation
+raw_ostream &operator<<(raw_ostream &OS,
+                        const std::vector<ShrinkWrapping::WorklistItem> &Vec) {
+  OS << "SWTodo[";
+  auto Sep = "";
+  for (const auto &Item : Vec) {
+    OS << Sep;
+    switch (Item.Action) {
+    case ShrinkWrapping::WorklistItem::Erase:
+      OS << "Erase";
+      break;
+    case ShrinkWrapping::WorklistItem::ChangeToAdjustment:
+      OS << "ChangeToAdjustment";
+      break;
+    case ShrinkWrapping::WorklistItem::InsertLoadOrStore:
+      OS << "InsertLoadOrStore";
+      break;
+    case ShrinkWrapping::WorklistItem::InsertPushOrPop:
+      OS << "InsertPushOrPop";
+      break;
+    }
+    Sep = ", ";
+  }
+  OS << "]";
+  return OS;
+}
+
+raw_ostream &
+operator<<(raw_ostream &OS,
+           const std::vector<StackLayoutModifier::WorklistItem> &Vec) {
+  OS << "SLMTodo[";
+  auto Sep = "";
+  for (const auto &Item : Vec) {
+    OS << Sep;
+    switch (Item.Action) {
+    case StackLayoutModifier::WorklistItem::None:
+      OS << "None";
+      break;
+    case StackLayoutModifier::WorklistItem::AdjustLoadStoreOffset:
+      OS << "AdjustLoadStoreOffset";
+      break;
+    case StackLayoutModifier::WorklistItem::AdjustCFI:
+      OS << "AdjustCFI";
+      break;
+    }
+    Sep = ", ";
+  }
+  OS << "]";
+  return OS;
+}
+
+bool operator==(const ShrinkWrapping::WorklistItem &A,
+                const ShrinkWrapping::WorklistItem &B) {
+  return (A.Action == B.Action && A.AffectedReg == B.AffectedReg &&
+          A.Adjustment == B.Adjustment &&
+          A.FIEToInsert.IsLoad == B.FIEToInsert.IsLoad &&
+          A.FIEToInsert.IsStore == B.FIEToInsert.IsStore &&
+          A.FIEToInsert.RegOrImm == B.FIEToInsert.RegOrImm &&
+          A.FIEToInsert.Size == B.FIEToInsert.Size &&
+          A.FIEToInsert.IsSimple == B.FIEToInsert.IsSimple &&
+          A.FIEToInsert.StackOffset == B.FIEToInsert.StackOffset);
+}
+
+bool operator==(const StackLayoutModifier::WorklistItem &A,
+                const StackLayoutModifier::WorklistItem &B) {
+  return (A.Action == B.Action && A.OffsetUpdate == B.OffsetUpdate);
+}
+
+} // end namespace bolt
+} // end namespace llvm
diff --git a/bolt/Passes/ShrinkWrapping.h b/bolt/Passes/ShrinkWrapping.h
new file mode 100644
index 000000000000..7c28dea5ba47
--- /dev/null
+++ b/bolt/Passes/ShrinkWrapping.h
@@ -0,0 +1,477 @@
+//===--- Passes/ShrinkWrapping.h ------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_SHRINKWRAPPING_H
+#define LLVM_TOOLS_LLVM_BOLT_PASSES_SHRINKWRAPPING_H
+
+#include "BinaryPasses.h"
+#include "FrameAnalysis.h"
+#include "DataflowInfoManager.h"
+
+namespace llvm {
+namespace bolt {
+
+/// Encapsulates the logic required to analyze a binary function and detect
+/// which registers are saved as callee-saved, where these saves happen, and
+/// at which points their original values are restored.
+class CalleeSavedAnalysis {
+  const FrameAnalysis &FA;
+  const BinaryContext &BC;
+  BinaryFunction &BF;
+  DataflowInfoManager &Info;
+
+  /// Compute all stores of callee-saved regs. Those are the ones that store
+  /// a register whose definition is not local.
+  void analyzeSaves();
+
+  /// Similar to analyzeSaves, tries to determine all instructions that recover
+  /// the original value of the callee-saved register before exiting the
+  /// function.
+  void analyzeRestores();
+
+  /// Returns the identifying strings used to annotate instructions with
+  /// metadata for this analysis. These annotations are deleted in the
+  /// destructor.
+  static StringRef getSaveTag() {
+    return StringRef("CSA-SavedReg");
+  }
+  static StringRef getRestoreTag() {
+    return StringRef("CSA-RestoredReg");
+  }
+
+public:
+  BitVector CalleeSaved;
+  std::vector<int64_t> OffsetsByReg;
+  BitVector HasRestores;
+  std::vector<uint64_t> SavingCost;
+  std::vector<const FrameIndexEntry *> SaveFIEByReg;
+  std::vector<const FrameIndexEntry *> LoadFIEByReg;
+
+  CalleeSavedAnalysis(const FrameAnalysis &FA, const BinaryContext &BC,
+                      BinaryFunction &BF, DataflowInfoManager &Info)
+      : FA(FA), BC(BC), BF(BF), Info(Info),
+        CalleeSaved(BC.MRI->getNumRegs(), false),
+        OffsetsByReg(BC.MRI->getNumRegs(), 0LL),
+        HasRestores(BC.MRI->getNumRegs(), false),
+        SavingCost(BC.MRI->getNumRegs(), 0ULL),
+        SaveFIEByReg(BC.MRI->getNumRegs(), nullptr),
+        LoadFIEByReg(BC.MRI->getNumRegs(), nullptr) {}
+
+  ~CalleeSavedAnalysis();
+
+  void compute() {
+    analyzeSaves();
+    analyzeRestores();
+  }
+
+  /// Retrieves the value of the callee-saved register that is saved by this
+  /// instruction or 0 if this is not a CSR save instruction.
+  uint16_t getSavedReg(const MCInst &Inst) {
+    auto Val = BC.MIA->tryGetAnnotationAs<decltype(FrameIndexEntry::RegOrImm)>(
+        Inst, getSaveTag());
+    if (Val)
+      return *Val;
+    return 0;
+  }
+
+  /// Retrieves the value of the callee-saved register that is restored by this
+  /// instruction or 0 if this is not a CSR restore instruction.
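+  /// A typical use after compute() has run (illustrative sketch):
+  /// \code
+  ///   for (auto &BB : BF)
+  ///     for (auto &Inst : BB)
+  ///       if (auto Reg = CSA.getRestoredReg(Inst))
+  ///         dbgs() << "restores CSR " << Reg << "\n";
+  /// \endcode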
+  uint16_t getRestoredReg(const MCInst &Inst) {
+    auto Val = BC.MIA->tryGetAnnotationAs<decltype(FrameIndexEntry::RegOrImm)>(
+        Inst, getRestoreTag());
+    if (Val)
+      return *Val;
+    return 0;
+  }
+
+  /// Routines to compute all saves/restores for a given Reg (they need to
+  /// traverse all instructions).
+  std::vector<MCInst *> getSavesByReg(uint16_t Reg);
+  std::vector<MCInst *> getRestoresByReg(uint16_t Reg);
+};
+
+/// Identifies, in a given binary function, all stack regions being used and
+/// allows us to edit the layout, removing regions or inserting new ones. When
+/// the layout is modified, all affected stack-accessing instructions are
+/// updated.
+class StackLayoutModifier {
+  const FrameAnalysis &FA;
+  const BinaryContext &BC;
+  BinaryFunction &BF;
+  DataflowInfoManager &Info;
+
+  // Keep track of stack slots we know how to safely move
+  std::map<int64_t, int64_t> AvailableRegions;
+
+  DenseSet<int64_t> CollapsedRegions;
+  DenseSet<int64_t> InsertedRegions;
+
+  // A map of stack memory chunks whose contents we cannot reason about and
+  // therefore must leave untouched.
+  std::map<int64_t, int64_t> BlacklistedRegions;
+
+  // Maps stack slots to the regs that are saved to them
+  DenseMap<int64_t, std::set<MCPhysReg>> RegionToRegMap;
+  DenseMap<MCPhysReg, std::set<int64_t>> RegToRegionMap;
+
+  // If we can't understand how to move stack slots, IsSimple will be false
+  bool IsSimple{true};
+
+  bool IsInitialized{false};
+
+public:
+  // Worklist of operations to apply to the function in order to perform the
+  // layout modifications requested via collapseRegion()/insertRegion().
+  struct WorklistItem {
+    enum ActionType : uint8_t {
+      None = 0,
+      AdjustLoadStoreOffset,
+      AdjustCFI,
+    } Action;
+
+    int64_t OffsetUpdate{0};
+    WorklistItem() : Action(None) {}
+    WorklistItem(ActionType Action) : Action(Action) {}
+    WorklistItem(ActionType Action, int OffsetUpdate)
+        : Action(Action), OffsetUpdate(OffsetUpdate) {}
+  };
+private:
+
+  /// Mark the stack region identified by \p Offset and \p Size to be a
+  /// no-touch zone, whose accesses cannot be relocated to another region.
+  void blacklistRegion(int64_t Offset, int64_t Size);
+
+  /// Check if this region overlaps with blacklisted addresses
+  bool isRegionBlacklisted(int64_t Offset, int64_t Size);
+
+  /// Check if the region identified by \p Offset and \p Size has any conflicts
+  /// with available regions so far. If it has, blacklist all involved regions
+  /// and return true.
+  bool blacklistAllInConflictWith(int64_t Offset, int64_t Size);
+
+  /// If \p Point is identified as a frame pointer initialization (defining the
+  /// value of FP with SP), check for a non-standard initialization that
+  /// precludes us from changing the stack layout. If so, update the
+  /// blacklisted regions.
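+  /// For example (illustrative, x86 AT&T syntax): the standard form
+  /// \code
+  ///   mov %rsp, %rbp
+  /// \endcode
+  /// is supported, whereas an initialization such as
+  /// \code
+  ///   lea 0x10(%rsp), %rbp
+  /// \endcode
+  /// anchors FP at an offset from SP, so moving slots around could silently
+  /// change what FP-relative accesses refer to.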
+  void checkFramePointerInitialization(MCInst &Point);
+
+  /// Determine which stack offsets we can freely change
+  void classifyStackAccesses();
+  void classifyCFIs();
+
+  /// Used to keep track of modifications to the function that will later be
+  /// performed by performChanges()
+  void scheduleChange(MCInst &Inst, WorklistItem Item);
+  static StringRef getTodoTagName() {
+    return StringRef("SLM-TodoTag");
+  }
+  static StringRef getSlotTagName() {
+    return StringRef("SLM-SlotTag");
+  }
+  static StringRef getOffsetCFIRegTagName() {
+    return StringRef("SLM-OffsetCFIReg");
+  }
+
+public:
+  StackLayoutModifier(const FrameAnalysis &FA, const BinaryContext &BC,
+                      BinaryFunction &BF, DataflowInfoManager &Info)
+      : FA(FA), BC(BC), BF(BF), Info(Info) {}
+
+  ~StackLayoutModifier() {
+    for (auto &BB : BF) {
+      for (auto &Inst : BB) {
+        BC.MIA->removeAnnotation(Inst, getTodoTagName());
+        BC.MIA->removeAnnotation(Inst, getSlotTagName());
+        BC.MIA->removeAnnotation(Inst, getOffsetCFIRegTagName());
+      }
+    }
+  }
+
+  /// Retrieves the register referenced by an offset CFI, as recorded by
+  /// classifyCFIs(), or 0 if this instruction carries no such annotation.
+  uint16_t getOffsetCFIReg(const MCInst &Inst) {
+    auto Val =
+        BC.MIA->tryGetAnnotationAs<uint16_t>(Inst, getOffsetCFIRegTagName());
+    if (Val)
+      return *Val;
+    return 0;
+  }
+
+  /// Check if it is possible to delete the push instruction \p DeletedPush.
+  /// This involves collapsing the region accessed by this push and updating
+  /// all other instructions that access affected memory regions. Return true
+  /// if this update is possible.
+  bool canCollapseRegion(int64_t RegionAddr);
+  bool canCollapseRegion(MCInst *DeletedPush);
+
+  /// Notify the layout manager that \p DeletedPush was deleted and that it
+  /// needs to update other affected stack-accessing instructions.
+  bool collapseRegion(MCInst *Alloc, int64_t RegionAddr, int64_t RegionSize);
+  bool collapseRegion(MCInst *DeletedPush);
+
+  /// Set the new stack address difference for load/store instructions that
+  /// referenced a stack location that was deleted via collapseRegion.
+  void setOffsetForCollapsedAccesses(int64_t NewOffset);
+
+  /// Check if it is possible to insert a push instruction at point \p P.
+  /// This involves inserting a new region in the stack, possibly affecting
+  /// instructions that access the frame. Return true if we can update them
+  /// all.
+  bool canInsertRegion(ProgramPoint P);
+
+  /// Notify the layout manager that a new push instruction has been inserted
+  /// at point \p P and that it will need to update relevant instructions.
+  bool insertRegion(ProgramPoint P, int64_t RegionSz);
+
+  /// Perform all changes scheduled by collapseRegion()/insertRegion()
+  void performChanges();
+
+  /// Perform an initial assessment of the function, trying to understand its
+  /// stack accesses.
+  void initialize();
+};
+
+/// Implements a pass to optimize callee-saved register spills. These spills
+/// typically happen in the function prologue/epilogue. When those blocks are
+/// hot, this pass will try to move the spills to cold blocks whenever
+/// possible.
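+///
+/// A minimal sketch of the transformation (illustrative only; AT&T syntax,
+/// with a hypothetical register and labels). Before, the spill executes even
+/// on the hot path, which never touches %rbx:
+/// \code
+///   foo:
+///     push %rbx          # spill in hot prologue
+///     test %rdi, %rdi
+///     je   .Lret         # hot path
+///     ...                # cold path, the only user of %rbx
+///   .Lret:
+///     pop  %rbx
+///     ret
+/// \endcode
+/// After shrink wrapping, the save/restore pair is sunk into the cold path:
+/// \code
+///   foo:
+///     test %rdi, %rdi
+///     je   .Lret         # hot path, no spill
+///     push %rbx          # spill moved to the cold path
+///     ...
+///     pop  %rbx
+///   .Lret:
+///     ret
+/// \endcode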
+class ShrinkWrapping {
+  const FrameAnalysis &FA;
+  const BinaryContext &BC;
+  BinaryFunction &BF;
+  DataflowInfoManager &Info;
+  StackLayoutModifier SLM;
+  /// For each CSR, store a vector of all CFI indexes deleted as a consequence
+  /// of moving this callee-saved reg
+  DenseMap<unsigned, std::vector<uint32_t>> DeletedPushCFIs;
+  DenseMap<unsigned, std::vector<uint32_t>> DeletedPopCFIs;
+  std::vector<bool> HasDeletedOffsetCFIs;
+  SmallPtrSet<const MCCFIInstruction *, 16> UpdatedCFIs;
+  std::vector<BitVector> UsesByReg;
+  std::vector<int64_t> PushOffsetByReg;
+  std::vector<int64_t> PopOffsetByReg;
+  std::vector<unsigned> DomOrder;
+  CalleeSavedAnalysis CSA;
+  std::vector<SmallSetVector<MCInst *, 4>> SavePos;
+  std::vector<uint64_t> BestSaveCount;
+  std::vector<MCInst *> BestSavePos;
+
+  /// Pass stats
+  static uint64_t SpillsMovedRegularMode;
+  static uint64_t SpillsMovedPushPopMode;
+
+  /// Allow our custom worklist-sensitive analysis
+  /// PredictiveStackPointerTracking to access WorklistItem
+public:
+  struct WorklistItem {
+    enum ActionType : uint8_t {
+      Erase = 0,
+      ChangeToAdjustment,
+      InsertLoadOrStore,
+      InsertPushOrPop
+    } Action;
+    FrameIndexEntry FIEToInsert;
+    unsigned AffectedReg;
+    int Adjustment{0};
+    WorklistItem(ActionType Action, unsigned AffectedReg)
+        : Action(Action), FIEToInsert(), AffectedReg(AffectedReg) {}
+    WorklistItem(ActionType Action, unsigned AffectedReg, int Adjustment)
+        : Action(Action), FIEToInsert(), AffectedReg(AffectedReg),
+          Adjustment(Adjustment) {}
+    WorklistItem(ActionType Action, const FrameIndexEntry &FIE,
+                 unsigned AffectedReg)
+        : Action(Action), FIEToInsert(FIE), AffectedReg(AffectedReg) {}
+  };
+
+  /// Insertion todo items scheduled to happen at the end of BBs. Since we
+  /// can't annotate BBs, we maintain this bookkeeping here.
+  DenseMap<BinaryBasicBlock *, std::vector<WorklistItem>> Todo;
+
+  /// Annotation name used to tag instructions with removal or insertion
+  /// actions
+  static StringRef getAnnotationName() {
+    return StringRef("ShrinkWrap-Todo");
+  }
+private:
+  using BBIterTy = BinaryBasicBlock::iterator;
+
+  /// Calculate all possible uses/defs of these callee-saved regs
+  void classifyCSRUses();
+
+  // Ensure we don't work on cases where there are no uses of the callee-saved
+  // register. These unnecessary spills should have been removed by previous
+  // passes.
+  void pruneUnwantedCSRs();
+
+  // For each reg, compute the set of BBs at whose start the save could be
+  // placed
+  void computeSaveLocations();
+
+  /// Look into the best save location found for saving callee-saved reg
+  /// \p CSR and evaluate whether we would benefit by moving the spill to this
+  /// new save location. Returns true in case it is profitable to perform the
+  /// move.
+  bool validateBestSavePos(unsigned CSR, MCInst *&BestPosSave,
+                           uint64_t &TotalEstimatedWin);
+
+  /// Populate the Todo map with WorklistItems to change the function
+  template <typename... T>
+  void scheduleChange(ProgramPoint PP, T&& ...Item) {
+    if (PP.isInst()) {
+      auto &WList = BC.MIA->getOrCreateAnnotationAs<std::vector<WorklistItem>>(
+          BC.Ctx.get(), *PP.getInst(), getAnnotationName());
+      WList.emplace_back(std::forward<T>(Item)...);
+      return;
+    }
+    // Avoid inserting on BBs with no instructions because we have a dataflow
+    // analysis that depends on insertions happening before real instructions
+    // (PredictiveStackPointerTracking)
+    BinaryBasicBlock *BB = PP.getBB();
+    if (BB->size() != 0) {
+      Todo[BB].emplace_back(std::forward<T>(Item)...);
+      return;
+    }
+    while (BB->size() == 0) {
+      assert(BB->succ_size() == 1);
+      BB = *BB->succ_begin();
+    }
+    auto &WList = BC.MIA->getOrCreateAnnotationAs<std::vector<WorklistItem>>(
+        BC.Ctx.get(), *BB->begin(), getAnnotationName());
+    WList.emplace_back(std::forward<T>(Item)...);
+  }
+
+  /// Determine the POP ordering according to which CSR save is the dominator.
+  void computeDomOrder();
+
+  /// Check that the best possible location for a spill save (as determined by
+  /// computeSaveLocations) is cold enough to be worth moving the save to it.
+  /// \p CSR is the callee-saved register number, \p BestPosSave returns the
+  /// pointer to the cold location in case the function returns true, while
+  /// \p TotalEstimatedWin contains the estimated reduction in dynamic
+  /// instruction count after the move.
+  bool isBestSavePosCold(unsigned CSR, MCInst *&BestPosSave,
+                         uint64_t &TotalEstimatedWin);
+
+  /// Auxiliary function used to create basic blocks for critical edges and
+  /// update the dominance frontier with these new locations
+  void splitFrontierCritEdges(
+      BinaryFunction *Func, SmallVector<ProgramPoint, 4> &Frontier,
+      const SmallVector<bool, 4> &IsCritEdge,
+      const SmallVector<BinaryBasicBlock *, 4> &From,
+      const SmallVector<SmallVector<BinaryBasicBlock *, 4>, 4> &To);
+
+  /// After the best save location for a spill has been established in
+  /// \p BestPosSave for reg \p CSR, compute adequate locations to restore
+  /// the spilled value. This will be at the dominance frontier.
+  /// Returns an empty vector if we failed. In case of success, set
+  /// \p UsePushPops to true if we can operate in the push/pops mode.
+  SmallVector<ProgramPoint, 4> doRestorePlacement(MCInst *BestPosSave,
+                                                  unsigned CSR,
+                                                  uint64_t TotalEstimatedWin);
+
+  /// Checks whether using pushes and pops (instead of the longer load-store
+  /// counterparts) is correct for reg \p CSR
+  bool validatePushPopsMode(unsigned CSR, MCInst *BestPosSave,
+                            int64_t SaveOffset);
+
+  /// Adjust restore locations to the correct SP offset if we are using POPs
+  /// instead of random-access load instructions.
+  SmallVector<ProgramPoint, 4>
+  fixPopsPlacements(const SmallVector<ProgramPoint, 4> &RestorePoints,
+                    int64_t SaveOffset, unsigned CSR);
+
+  /// When moving spills, mark all old spill locations to be deleted
+  void scheduleOldSaveRestoresRemoval(unsigned CSR, bool UsePushPops);
+  /// Return true if \p Inst uses reg \p CSR
+  bool doesInstUsesCSR(const MCInst &Inst, uint16_t CSR);
+  /// When moving spills, mark all new spill locations for insertion
+  void
+  scheduleSaveRestoreInsertions(unsigned CSR, MCInst *BestPosSave,
+                                SmallVector<ProgramPoint, 4> &RestorePoints,
+                                bool UsePushPops);
+
+  /// Coordinate the replacement of callee-saved spills from their original
+  /// locations (in the prologue and epilogues) to colder basic blocks as
+  /// determined by computeSaveLocations().
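+  /// (Roughly, for each CSR: isBestSavePosCold() picks a candidate save
+  /// point, doRestorePlacement() derives restore points at the dominance
+  /// frontier, validatePushPopsMode()/fixPopsPlacements() decide between
+  /// push/pop and load/store mode, and the schedule* helpers record the
+  /// resulting edits.)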
+  void moveSaveRestores();
+
+  /// After the spill locations for reg \p CSR have been moved and all
+  /// affected CFI has been removed, insert new updated CFI information for
+  /// these locations.
+  void insertUpdatedCFI(unsigned CSR, int SPValPush, int SPValPop);
+
+  /// In case the function anchors the CFA reg as SP and we inserted
+  /// pushes/pops, insert def_cfa_offsets at appropriate places (and delete
+  /// old def_cfa_offsets).
+  void rebuildCFIForSP();
+
+  /// Rebuild all CFI for affected callee-saved registers.
+  void rebuildCFI();
+
+  /// Create a load-store instruction (depending on the contents of \p FIE).
+  /// If \p CreatePushOrPop is true, create a push/pop instead. Current SP/FP
+  /// values, as determined by StackPointerTracking, should be informed via
+  /// \p SPVal and \p FPVal in order to emit the correct offset from SP/FP.
+  MCInst createStackAccess(int SPVal, int FPVal, const FrameIndexEntry &FIE,
+                           bool CreatePushOrPop);
+
+  /// Update the CFI referenced by \p Inst with \p NewOffset, if the CFI has
+  /// an offset.
+  void updateCFIInstOffset(MCInst &Inst, int64_t NewOffset);
+
+  /// Insert any CFI that should be attached to a register spill save/restore.
+  BBIterTy insertCFIsForPushOrPop(BinaryBasicBlock &BB, BBIterTy Pos,
+                                  unsigned Reg, bool isPush, int Sz,
+                                  int64_t NewOffset);
+
+  /// Auxiliary function to processInsertionsList, adding a new instruction
+  /// before \p InsertionPoint as requested by \p Item. Return an updated
+  /// InsertionPoint for other instructions that need to be inserted at the
+  /// same original location, since this insertion may have invalidated the
+  /// previous location.
+  BBIterTy processInsertion(BBIterTy InsertionPoint, BinaryBasicBlock *CurBB,
+                            const WorklistItem &Item, int64_t SPVal,
+                            int64_t FPVal);
+
+  /// Auxiliary function to processInsertions(), helping perform all the
+  /// insertion tasks in the todo list associated with a single insertion
+  /// point. Return the updated insertion point for subsequent insertions.
+  BBIterTy processInsertionsList(BBIterTy InsertionPoint,
+                                 BinaryBasicBlock *CurBB,
+                                 std::vector<WorklistItem> &TodoList,
+                                 int64_t SPVal, int64_t FPVal);
+
+  /// Apply all insertion todo tasks regarding insertion of new stores/loads
+  /// or push/pops at annotated points. Return false if the entire function
+  /// had no todo tasks annotation and this pass has nothing to do.
+  bool processInsertions();
+
+  /// Apply all deletion todo tasks (or tasks that change a push/pop into an
+  /// SP adjustment with no memory access)
+  void processDeletions();
+
+public:
+  ShrinkWrapping(const FrameAnalysis &FA, const BinaryContext &BC,
+                 BinaryFunction &BF, DataflowInfoManager &Info)
+      : FA(FA), BC(BC), BF(BF), Info(Info), SLM(FA, BC, BF, Info),
+        CSA(FA, BC, BF, Info) {}
+
+  ~ShrinkWrapping() {
+    for (auto &BB : BF) {
+      for (auto &Inst : BB) {
+        BC.MIA->removeAnnotation(Inst, getAnnotationName());
+      }
+    }
+  }
+
+  void perform();
+
+  static void printStats();
+};
+
+} // end namespace bolt
+} // end namespace llvm
+
+#endif
diff --git a/bolt/Passes/StackAllocationAnalysis.cpp b/bolt/Passes/StackAllocationAnalysis.cpp
new file mode 100644
index 000000000000..89f2d2a1c254
--- /dev/null
+++ b/bolt/Passes/StackAllocationAnalysis.cpp
@@ -0,0 +1,153 @@
+//===--- Passes/StackAllocationAnalysis.cpp -------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "StackAllocationAnalysis.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "saa" + +namespace llvm { +namespace bolt { + +void StackAllocationAnalysis::preflight() { + DEBUG(dbgs() << "Starting StackAllocationAnalysis on \"" + << Func.getPrintName() << "\"\n"); + + for (auto &BB : this->Func) { + for (auto &Inst : BB) { + MCPhysReg From, To; + if (!BC.MIA->isPush(Inst) && (!BC.MIA->isRegToRegMove(Inst, From, To) || + To != BC.MIA->getStackPointer() || + From != BC.MIA->getFramePointer()) && + !BC.MII->get(Inst.getOpcode()) + .hasDefOfPhysReg(Inst, BC.MIA->getStackPointer(), *BC.MRI)) + continue; + this->Expressions.push_back(&Inst); + this->ExprToIdx[&Inst] = this->NumInstrs++; + } + } +} + +BitVector +StackAllocationAnalysis::getStartingStateAtBB(const BinaryBasicBlock &BB) { + return BitVector(this->NumInstrs, false); +} + +BitVector +StackAllocationAnalysis::getStartingStateAtPoint(const MCInst &Point) { + return BitVector(this->NumInstrs, false); +} + +void StackAllocationAnalysis::doConfluence(BitVector &StateOut, + const BitVector &StateIn) { + StateOut |= StateIn; +} + +BitVector StackAllocationAnalysis::doKill(const MCInst &Point, + const BitVector &StateIn, + int DeallocSize) { + int64_t SPOffset = SPT.getStateAt(Point)->first; + BitVector Next = StateIn; + if (SPOffset == SPT.SUPERPOSITION || SPOffset == SPT.EMPTY) + return Next; + for (auto I = this->expr_begin(Next), E = this->expr_end(); I != E; ++I) { + const MCInst *Instr = *I; + int64_t InstrOffset = SPT.getStateAt(*Instr)->first; + if (InstrOffset == SPT.SUPERPOSITION || InstrOffset == SPT.EMPTY) + continue; + if (InstrOffset < SPOffset) { + Next.reset(I.getBitVectorIndex()); + DEBUG({ + dbgs() << "SAA FYI: Killed: "; + Instr->dump(); + dbgs() << "by: "; + Point.dump(); + dbgs() << " (more info: Killed instr offset = " << InstrOffset + << ". 
SPOffset = " << SPOffset
+               << "; DeallocSize = " << DeallocSize << ")\n";
+      });
+    }
+  }
+  return Next;
+}
+
+void StackAllocationAnalysis::doConfluenceWithLP(BitVector &StateOut,
+                                                 const BitVector &StateIn,
+                                                 const MCInst &Invoke) {
+  BitVector NewIn = StateIn;
+  for (const auto &Operand : Invoke) {
+    if (Operand.isGnuArgsSize()) {
+      auto ArgsSize = Operand.getGnuArgsSize();
+      NewIn = doKill(Invoke, NewIn, ArgsSize);
+    }
+  }
+  StateOut |= NewIn;
+}
+
+BitVector StackAllocationAnalysis::computeNext(const MCInst &Point,
+                                               const BitVector &Cur) {
+  const auto &MIA = BC.MIA;
+  BitVector Next = Cur;
+  if (int Sz = MIA->getPopSize(Point)) {
+    Next = doKill(Point, Next, Sz);
+    return Next;
+  }
+  if (MIA->isPush(Point)) {
+    Next.set(this->ExprToIdx[&Point]);
+    return Next;
+  }
+
+  MCPhysReg From, To;
+  int64_t SPOffset, FPOffset;
+  std::tie(SPOffset, FPOffset) = *SPT.getStateBefore(Point);
+  if (MIA->isRegToRegMove(Point, From, To) && To == MIA->getStackPointer() &&
+      From == MIA->getFramePointer()) {
+    if (MIA->isLeave(Point))
+      FPOffset += 8;
+    if (SPOffset < FPOffset) {
+      Next = doKill(Point, Next, FPOffset - SPOffset);
+      return Next;
+    }
+    if (SPOffset > FPOffset) {
+      Next.set(this->ExprToIdx[&Point]);
+      return Next;
+    }
+  }
+  if (BC.MII->get(Point.getOpcode())
+          .hasDefOfPhysReg(Point, MIA->getStackPointer(), *BC.MRI)) {
+    std::pair<MCPhysReg, int64_t> SP;
+    if (SPOffset != SPT.EMPTY && SPOffset != SPT.SUPERPOSITION)
+      SP = std::make_pair(MIA->getStackPointer(), SPOffset);
+    else
+      SP = std::make_pair(0, 0);
+    std::pair<MCPhysReg, int64_t> FP;
+    if (FPOffset != SPT.EMPTY && FPOffset != SPT.SUPERPOSITION)
+      FP = std::make_pair(MIA->getFramePointer(), FPOffset);
+    else
+      FP = std::make_pair(0, 0);
+    int64_t Output;
+    if (!MIA->evaluateSimple(Point, Output, SP, FP))
+      return Next;
+
+    if (SPOffset < Output) {
+      Next = doKill(Point, Next, Output - SPOffset);
+      return Next;
+    }
+    if (SPOffset > Output) {
+      Next.set(this->ExprToIdx[&Point]);
+      return Next;
+    }
+  }
+  return Next;
+}
+
+} // end namespace bolt
+} // end namespace llvm
diff --git a/bolt/Passes/StackAllocationAnalysis.h b/bolt/Passes/StackAllocationAnalysis.h
new file mode 100644
index 000000000000..64fba984fed2
--- /dev/null
+++ b/bolt/Passes/StackAllocationAnalysis.h
@@ -0,0 +1,68 @@
+//===--- Passes/StackAllocationAnalysis.h ---------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_STACKALLOCATIONANALYSIS_H
+#define LLVM_TOOLS_LLVM_BOLT_PASSES_STACKALLOCATIONANALYSIS_H
+
+#include "DataflowAnalysis.h"
+#include "StackPointerTracking.h"
+#include "llvm/Support/Timer.h"
+
+namespace llvm {
+namespace bolt {
+
+/// Perform a dataflow analysis to track, at each program point, the set of
+/// stack-allocating instructions (pushes and other SP decrements) whose
+/// allocated regions are still live.
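+///
+/// For instance (illustrative, x86):
+/// \code
+///   push %rbx          ; allocation A becomes live
+///   sub  $0x20, %rsp   ; allocation B becomes live
+///   ...                ; state here: {A, B}
+///   add  $0x20, %rsp   ; frees B's region, killing B
+///   pop  %rbx          ; frees A's region, killing A
+/// \endcode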
+class StackAllocationAnalysis
+    : public InstrsDataflowAnalysis<StackAllocationAnalysis> {
+  friend class DataflowAnalysis<StackAllocationAnalysis, BitVector>;
+
+  StackPointerTracking &SPT;
+
+public:
+  StackAllocationAnalysis(const BinaryContext &BC, BinaryFunction &BF,
+                          StackPointerTracking &SPT)
+      : InstrsDataflowAnalysis(BC, BF),
+        SPT(SPT) {}
+  virtual ~StackAllocationAnalysis() {}
+
+  void run() {
+    NamedRegionTimer T1("SAA", "Dataflow", true);
+    InstrsDataflowAnalysis::run();
+  }
+
+protected:
+  void preflight();
+
+  BitVector getStartingStateAtBB(const BinaryBasicBlock &BB);
+
+  BitVector getStartingStateAtPoint(const MCInst &Point);
+
+  void doConfluence(BitVector &StateOut, const BitVector &StateIn);
+
+  BitVector doKill(const MCInst &Point, const BitVector &StateIn,
+                   int DeallocSize);
+
+  void doConfluenceWithLP(BitVector &StateOut, const BitVector &StateIn,
+                          const MCInst &Invoke);
+
+  BitVector computeNext(const MCInst &Point, const BitVector &Cur);
+
+  StringRef getAnnotationName() const {
+    return StringRef("StackAllocationAnalysis");
+  }
+};
+
+} // end namespace bolt
+} // end namespace llvm
+
+#endif
diff --git a/bolt/Passes/StackAvailableExpressions.cpp b/bolt/Passes/StackAvailableExpressions.cpp
new file mode 100644
index 000000000000..d0a5f5b1c12a
--- /dev/null
+++ b/bolt/Passes/StackAvailableExpressions.cpp
@@ -0,0 +1,132 @@
+//===--- Passes/StackAvailableExpressions.cpp -----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "StackAvailableExpressions.h"
+#include "FrameAnalysis.h"
+
+#define DEBUG_TYPE "sae"
+
+namespace llvm {
+namespace bolt {
+
+StackAvailableExpressions::StackAvailableExpressions(const FrameAnalysis &FA,
+                                                     const BinaryContext &BC,
+                                                     BinaryFunction &BF)
+    : InstrsDataflowAnalysis(BC, BF), FA(FA) {}
+
+void StackAvailableExpressions::preflight() {
+  DEBUG(dbgs() << "Starting StackAvailableExpressions on \""
+               << Func.getPrintName() << "\"\n");
+
+  // Populate our universe of tracked expressions. We are interested in
+  // tracking available stores to frame positions at any given point of the
+  // program.
+  for (auto &BB : Func) {
+    for (auto &Inst : BB) {
+      auto FIE = FA.getFIEFor(BC, Inst);
+      if (!FIE)
+        continue;
+      if (FIE->IsStore == true && FIE->IsSimple == true) {
+        Expressions.push_back(&Inst);
+        ExprToIdx[&Inst] = NumInstrs++;
+      }
+    }
+  }
+}
+
+BitVector
+StackAvailableExpressions::getStartingStateAtBB(const BinaryBasicBlock &BB) {
+  // Entry points start with an empty set.
+  // All others start with the full set.
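+  // (This is the standard optimistic top value for a forward "must"
+  // analysis: doConfluence() intersects predecessor states, so starting
+  // interior blocks with the full set avoids spuriously killing expressions
+  // along loop back edges.)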
+  if (BB.pred_size() == 0 && BB.throw_size() == 0)
+    return BitVector(NumInstrs, false);
+  return BitVector(NumInstrs, true);
+}
+
+BitVector
+StackAvailableExpressions::getStartingStateAtPoint(const MCInst &Point) {
+  return BitVector(NumInstrs, true);
+}
+
+void StackAvailableExpressions::doConfluence(BitVector &StateOut,
+                                             const BitVector &StateIn) {
+  StateOut &= StateIn;
+}
+
+namespace {
+
+bool isLoadRedundant(const FrameIndexEntry &LoadFIE,
+                     const FrameIndexEntry &StoreFIE) {
+  if (LoadFIE.IsLoad == false || LoadFIE.IsSimple == false) {
+    return false;
+  }
+  if (LoadFIE.StackOffset == StoreFIE.StackOffset &&
+      LoadFIE.Size == StoreFIE.Size) {
+    return true;
+  }
+
+  return false;
+}
+}
+
+bool StackAvailableExpressions::doesXKillsY(const MCInst *X, const MCInst *Y) {
+  // If X and Y are both stores to overlapping stack locations, X kills Y
+  auto FIEX = FA.getFIEFor(BC, *X);
+  auto FIEY = FA.getFIEFor(BC, *Y);
+  if (FIEX && FIEY) {
+    if (isLoadRedundant(*FIEX, *FIEY))
+      return false;
+    if (FIEX->IsStore == true && FIEY->IsStore == true &&
+        FIEX->StackOffset + FIEX->Size > FIEY->StackOffset &&
+        FIEX->StackOffset < FIEY->StackOffset + FIEY->Size)
+      return true;
+  }
+  // getClobberedRegs for X and Y. If they intersect, return true
+  BitVector XClobbers = BitVector(BC.MRI->getNumRegs(), false);
+  BitVector YClobbers = BitVector(BC.MRI->getNumRegs(), false);
+  FA.getInstClobberList(BC, *X, XClobbers);
+  // If Y is a store to stack, its clobber list is its source reg. This is
+  // different from the rest because we want to check if the store source
+  // reaches its corresponding load untouched.
+  if (FIEY && FIEY->IsStore == true && FIEY->IsStoreFromReg) {
+    YClobbers.set(FIEY->RegOrImm);
+  } else {
+    FA.getInstClobberList(BC, *Y, YClobbers);
+  }
+  XClobbers &= YClobbers;
+  return XClobbers.any();
+}
+
+BitVector StackAvailableExpressions::computeNext(const MCInst &Point,
+                                                 const BitVector &Cur) {
+  BitVector Next = Cur;
+  // Kill
+  for (auto I = expr_begin(Next), E = expr_end(); I != E; ++I) {
+    assert(*I != nullptr && "Lost pointers");
+    DEBUG(dbgs() << "\t\t\tDoes it kill ");
+    DEBUG((*I)->dump());
+    if (doesXKillsY(&Point, *I)) {
+      DEBUG(dbgs() << "\t\t\t\tKilling ");
+      DEBUG((*I)->dump());
+      Next.reset(I.getBitVectorIndex());
+    }
+  }
+  // Gen
+  if (auto FIE = FA.getFIEFor(BC, Point)) {
+    if (FIE->IsStore == true && FIE->IsSimple == true)
+      Next.set(ExprToIdx[&Point]);
+  }
+  return Next;
+}
+
+} // namespace bolt
+} // namespace llvm
diff --git a/bolt/Passes/StackAvailableExpressions.h b/bolt/Passes/StackAvailableExpressions.h
new file mode 100644
index 000000000000..6ec3234ff6ad
--- /dev/null
+++ b/bolt/Passes/StackAvailableExpressions.h
@@ -0,0 +1,58 @@
+//===--- Passes/StackAvailableExpressions.h -------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_STACKAVAILABLEEXPRESSIONS_H
+#define LLVM_TOOLS_LLVM_BOLT_PASSES_STACKAVAILABLEEXPRESSIONS_H
+
+#include "DataflowAnalysis.h"
+#include "llvm/Support/Timer.h"
+
+namespace llvm {
+namespace bolt {
+
+class FrameAnalysis;
+
+class StackAvailableExpressions
+    : public InstrsDataflowAnalysis<StackAvailableExpressions> {
+  friend class DataflowAnalysis<StackAvailableExpressions, BitVector>;
+
+public:
+  StackAvailableExpressions(const FrameAnalysis &FA,
+                            const BinaryContext &BC, BinaryFunction &BF);
+  virtual ~StackAvailableExpressions() {}
+
+  void run() {
+    NamedRegionTimer T1("SAE", "Dataflow", true);
+    InstrsDataflowAnalysis::run();
+  }
+
+protected:
+  /// Reference to the result of stack frame analysis
+  const FrameAnalysis &FA;
+
+  void preflight();
+  BitVector getStartingStateAtBB(const BinaryBasicBlock &BB);
+  BitVector getStartingStateAtPoint(const MCInst &Point);
+  void doConfluence(BitVector &StateOut, const BitVector &StateIn);
+  /// Define the function computing the kill set -- whether expression Y, a
+  /// tracked expression, will be considered to be dead after executing X.
+  bool doesXKillsY(const MCInst *X, const MCInst *Y);
+  BitVector computeNext(const MCInst &Point, const BitVector &Cur);
+
+  StringRef getAnnotationName() const {
+    return StringRef("StackAvailableExpressions");
+  }
+};
+
+} // namespace bolt
+} // namespace llvm
+
+#endif
diff --git a/bolt/Passes/StackPointerTracking.h b/bolt/Passes/StackPointerTracking.h
index 7f02e766dfc9..99e4818c2395 100644
--- a/bolt/Passes/StackPointerTracking.h
+++ b/bolt/Passes/StackPointerTracking.h
@@ -13,6 +13,7 @@
 #define LLVM_TOOLS_LLVM_BOLT_PASSES_STACKPOINTERTRACKING_H
 
 #include "DataflowAnalysis.h"
+#include "llvm/Support/Timer.h"
 
 namespace llvm {
 namespace bolt {
@@ -190,6 +191,11 @@ class StackPointerTracking
 public:
   StackPointerTracking(const BinaryContext &BC, BinaryFunction &BF);
   virtual ~StackPointerTracking() {}
+
+  void run() {
+    NamedRegionTimer T1("SPT", "Dataflow", true);
+    StackPointerTrackingBase<StackPointerTracking>::run();
+  }
 };
 
 } // end namespace bolt
diff --git a/bolt/Passes/StackReachingUses.cpp b/bolt/Passes/StackReachingUses.cpp
new file mode 100644
index 000000000000..68e76b1438ff
--- /dev/null
+++ b/bolt/Passes/StackReachingUses.cpp
@@ -0,0 +1,112 @@
+//===--- Passes/StackReachingUses.cpp -------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+#include "StackReachingUses.h"
+#include "FrameAnalysis.h"
+
+#define DEBUG_TYPE "sru"
+
+namespace llvm {
+namespace bolt {
+
+bool StackReachingUses::isStoreUsed(const FrameIndexEntry &StoreFIE,
+                                    ExprIterator Candidates,
+                                    bool IncludeLocalAccesses) const {
+  for (auto I = Candidates; I != expr_end(); ++I) {
+    const MCInst *ReachingInst = *I;
+    if (IncludeLocalAccesses) {
+      if (auto FIEY = FA.getFIEFor(BC, *ReachingInst)) {
+        assert(FIEY->IsLoad == 1);
+        if (StoreFIE.StackOffset + StoreFIE.Size > FIEY->StackOffset &&
+            StoreFIE.StackOffset < FIEY->StackOffset + FIEY->Size) {
+          return true;
+        }
+      }
+    }
+    auto Args = FA.getArgAccessesFor(BC, *ReachingInst);
+    if (!Args)
+      continue;
+    if (Args->AssumeEverything) {
+      return true;
+    }
+    for (auto FIEY : Args->Set) {
+      if (StoreFIE.StackOffset + StoreFIE.Size > FIEY.StackOffset &&
+          StoreFIE.StackOffset < FIEY.StackOffset + FIEY.Size) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+void StackReachingUses::preflight() {
+  DEBUG(dbgs() << "Starting StackReachingUses on \"" << Func.getPrintName()
+               << "\"\n");
+
+  // Populate our universe of tracked expressions. We are interested in
+  // tracking reaching loads from frame positions at any given point of the
+  // program.
+  for (auto &BB : Func) {
+    for (auto &Inst : BB) {
+      if (auto FIE = FA.getFIEFor(BC, Inst)) {
+        if (FIE->IsLoad == true) {
+          Expressions.push_back(&Inst);
+          ExprToIdx[&Inst] = NumInstrs++;
+          continue;
+        }
+      }
+      auto AA = FA.getArgAccessesFor(BC, Inst);
+      if (AA && (!AA->Set.empty() || AA->AssumeEverything)) {
+        Expressions.push_back(&Inst);
+        ExprToIdx[&Inst] = NumInstrs++;
+      }
+    }
+  }
+}
+
+bool StackReachingUses::doesXKillsY(const MCInst *X, const MCInst *Y) {
+  // If X is a store to the same stack location and the bytes it writes are a
+  // superset of those read by the load in Y, return true
+  auto FIEX = FA.getFIEFor(BC, *X);
+  auto FIEY = FA.getFIEFor(BC, *Y);
+  if (FIEX && FIEY) {
+    if (FIEX->IsStore == true && FIEY->IsLoad == true &&
+        FIEX->StackOffset <= FIEY->StackOffset &&
+        FIEX->StackOffset + FIEX->Size >= FIEY->StackOffset + FIEY->Size)
+      return true;
+  }
+  return false;
+}
+
+BitVector StackReachingUses::computeNext(const MCInst &Point,
+                                         const BitVector &Cur) {
+  BitVector Next = Cur;
+  // Kill
+  for (auto I = expr_begin(Next), E = expr_end(); I != E; ++I) {
+    assert(*I != nullptr && "Lost pointers");
+    if (doesXKillsY(&Point, *I)) {
+      DEBUG(dbgs() << "\t\t\tKilling ");
+      DEBUG((*I)->dump());
+      Next.reset(I.getBitVectorIndex());
+    }
+  }
+  // Gen
+  if (auto FIE = FA.getFIEFor(BC, Point)) {
+    if (FIE->IsLoad == true)
+      Next.set(ExprToIdx[&Point]);
+  }
+  auto AA = FA.getArgAccessesFor(BC, Point);
+  if (AA && (!AA->Set.empty() || AA->AssumeEverything))
+    Next.set(ExprToIdx[&Point]);
+  return Next;
+}
+
+} // namespace bolt
+} // namespace llvm
diff --git a/bolt/Passes/StackReachingUses.h b/bolt/Passes/StackReachingUses.h
new file mode 100644
index 000000000000..7ea7094ef6bd
--- /dev/null
+++ b/bolt/Passes/StackReachingUses.h
@@ -0,0 +1,71 @@
+//===--- Passes/StackReachingUses.h ---------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_STACKREACHINGUSES_H
+#define LLVM_TOOLS_LLVM_BOLT_PASSES_STACKREACHINGUSES_H
+
+#include "DataflowAnalysis.h"
+#include "llvm/Support/Timer.h"
+
+namespace llvm {
+namespace bolt {
+
+class FrameAnalysis;
+struct FrameIndexEntry;
+
+class StackReachingUses
+    : public InstrsDataflowAnalysis<StackReachingUses, /*Backward=*/true> {
+  friend class DataflowAnalysis<StackReachingUses, BitVector, true>;
+
+public:
+  StackReachingUses(const FrameAnalysis &FA, const BinaryContext &BC,
+                    BinaryFunction &BF)
+      : InstrsDataflowAnalysis(BC, BF), FA(FA) {}
+  virtual ~StackReachingUses() {}
+
+  bool isStoreUsed(const FrameIndexEntry &StoreFIE, ExprIterator Candidates,
+                   bool IncludeLocalAccesses = true) const;
+
+  void run() {
+    NamedRegionTimer T1("SRU", "Dataflow", true);
+    InstrsDataflowAnalysis::run();
+  }
+
+protected:
+  // Reference to the result of stack frame analysis
+  const FrameAnalysis &FA;
+
+  void preflight();
+
+  BitVector getStartingStateAtBB(const BinaryBasicBlock &BB) {
+    return BitVector(NumInstrs, false);
+  }
+
+  BitVector getStartingStateAtPoint(const MCInst &Point) {
+    return BitVector(NumInstrs, false);
+  }
+
+  void doConfluence(BitVector &StateOut, const BitVector &StateIn) {
+    StateOut |= StateIn;
+  }
+
+  // Define the function computing the kill set -- whether expression Y, a
+  // tracked expression, will be considered to be dead after executing X.
+  bool doesXKillsY(const MCInst *X, const MCInst *Y);
+  BitVector computeNext(const MCInst &Point, const BitVector &Cur);
+
+  StringRef getAnnotationName() const { return StringRef("StackReachingUses"); }
+};
+
+} // end namespace bolt
+} // end namespace llvm
+
+#endif
diff --git a/bolt/RewriteInstance.cpp b/bolt/RewriteInstance.cpp
index 7ef342a5124d..1ee3aa7a75a0 100644
--- a/bolt/RewriteInstance.cpp
+++ b/bolt/RewriteInstance.cpp
@@ -1659,6 +1659,7 @@ void RewriteInstance::readDebugInfo() {
 void RewriteInstance::disassembleFunctions() {
   // Disassemble every function and build it's control flow graph.
   TotalScore = 0;
+  BC->SumExecutionCount = 0;
   for (auto &BFI : BinaryFunctions) {
     BinaryFunction &Function = BFI.second;
@@ -1803,6 +1804,7 @@ void RewriteInstance::disassembleFunctions() {
   }
 
   TotalScore += Function.getFunctionScore();
+  BC->SumExecutionCount += Function.getKnownExecutionCount();
 }
 
 // Iterate over all functions
@@ -1821,6 +1823,7 @@ void RewriteInstance::disassembleFunctions() {
   else
     ++NumStaleProfileFunctions;
 }
+BC->NumProfiledFuncs = ProfiledFunctions.size();
 
 const auto NumAllProfiledFunctions =
     ProfiledFunctions.size() + NumStaleProfileFunctions;