diff --git a/bolt/BinaryBasicBlock.h b/bolt/BinaryBasicBlock.h
index 07962f99d5bb..ea292f26edd2 100644
--- a/bolt/BinaryBasicBlock.h
+++ b/bolt/BinaryBasicBlock.h
@@ -85,9 +85,6 @@ class BinaryBasicBlock {
   /// Each successor has a corresponding BranchInfo entry in the list.
   std::vector<BinaryBranchInfo> BranchInfo;
 
-  typedef std::vector<BinaryBranchInfo>::iterator branch_info_iterator;
-  typedef std::vector<BinaryBranchInfo>::const_iterator
-                                              const_branch_info_iterator;
 
   BinaryBasicBlock() {}
 
@@ -252,6 +249,25 @@ public:
     return iterator_range<const_lp_iterator>(lp_begin(), lp_end());
   }
 
+  // BranchInfo iterators.
+  typedef std::vector<BinaryBranchInfo>::const_iterator
+                                              const_branch_info_iterator;
+
+  const_branch_info_iterator branch_info_begin() const
+                                            { return BranchInfo.begin(); }
+  const_branch_info_iterator branch_info_end() const
+                                            { return BranchInfo.end(); }
+  unsigned branch_info_size() const {
+    return (unsigned)BranchInfo.size();
+  }
+  bool branch_info_empty() const
+                                            { return BranchInfo.empty(); }
+
+  inline iterator_range<const_branch_info_iterator> branch_info() const {
+    return iterator_range<const_branch_info_iterator>(
+        branch_info_begin(), branch_info_end());
+  }
+
   /// Return symbol marking the start of this basic block.
   MCSymbol *getLabel() const {
     return Label;
diff --git a/bolt/BinaryFunction.cpp b/bolt/BinaryFunction.cpp
index 44ceafb744bd..5bf0e07cbf3e 100644
--- a/bolt/BinaryFunction.cpp
+++ b/bolt/BinaryFunction.cpp
@@ -12,6 +12,7 @@
 
 #include "BinaryBasicBlock.h"
 #include "BinaryFunction.h"
+#include "ReorderAlgorithm.h"
 #include "DataReader.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/DebugInfo/DWARF/DWARFContext.h"
@@ -41,9 +42,6 @@ AgressiveSplitting("split-all-cold",
   cl::desc("outline as many cold basic blocks as possible"),
   cl::Optional);
 
-static cl::opt<bool>
-PrintClusters("print-clusters", cl::desc("print clusters"), cl::Optional);
-
 static cl::opt<bool>
 PrintDebugInfo("print-debug-info",
                cl::desc("print debug info when printing functions"),
@@ -1254,378 +1252,47 @@ void BinaryFunction::modifyLayout(LayoutType Type, bool Split) {
   if (BasicBlocksLayout.empty() || Type == LT_NONE)
     return;
 
-  if (Type == LT_REVERSE) {
-    BasicBlockOrderType ReverseOrder;
-    auto FirstBB = BasicBlocksLayout.front();
-    ReverseOrder.push_back(FirstBB);
-    for (auto RBBI = BasicBlocksLayout.rbegin(); *RBBI != FirstBB; ++RBBI)
-      ReverseOrder.push_back(*RBBI);
-    BasicBlocksLayout.swap(ReverseOrder);
-
-    if (Split)
-      splitFunction();
-
-    fixBranches();
-
-    return;
-  }
+  BasicBlockOrderType NewLayout;
+  std::unique_ptr<ReorderAlgorithm> Algo;
 
   // Cannot do optimal layout without profile.
-  if (!hasValidProfile())
+  if (Type != LT_REVERSE && !hasValidProfile())
     return;
 
-  // Work on optimal solution if problem is small enough
-  if (BasicBlocksLayout.size() <= FUNC_SIZE_THRESHOLD)
-    return solveOptimalLayout(Split);
+  if (Type == LT_REVERSE) {
+    Algo.reset(new ReverseReorderAlgorithm());
+  }
+  else if (BasicBlocksLayout.size() <= FUNC_SIZE_THRESHOLD) {
+    // Work on optimal solution if problem is small enough
+    DEBUG(dbgs() << "finding optimal block layout for " << getName() << "\n");
+    Algo.reset(new OptimalReorderAlgorithm());
+  }
+  else {
+    DEBUG(dbgs() << "running block layout heuristics on " << getName() << "\n");
 
-  DEBUG(dbgs() << "running block layout heuristics on " << getName() << "\n");
+    std::unique_ptr<ClusterAlgorithm> CAlgo(new GreedyClusterAlgorithm());
 
-  // Greedy heuristic implementation for the TSP, applied to BB layout. Try to
-  // maximize weight during a path traversing all BBs. In this way, we will
-  // convert the hottest branches into fall-throughs.
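+
+    // Select the layout strategy below; each profile-driven case wraps the
+    // same greedy clustering stage (CAlgo) in a different reorder algorithm.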
+    switch(Type) {
+    case LT_OPTIMIZE:
+      Algo.reset(new OptimizeReorderAlgorithm(std::move(CAlgo)));
+      break;
 
-  // Encode an edge between two basic blocks, source and destination
-  typedef std::pair<BinaryBasicBlock *, BinaryBasicBlock *> EdgeTy;
-  std::map<EdgeTy, uint64_t> Weight;
+    case LT_OPTIMIZE_BRANCH:
+      Algo.reset(new OptimizeBranchReorderAlgorithm(std::move(CAlgo)));
+      break;
 
-  // Define a comparison function to establish SWO between edges
-  auto Comp = [&] (EdgeTy A, EdgeTy B) {
-    // With equal weights, prioritize branches with lower index
-    // source/destination. This helps to keep original block order for blocks
-    // when optimal order cannot be deduced from a profile.
-    if (Weight[A] == Weight[B]) {
-      uint32_t ASrcBBIndex = getIndex(A.first);
-      uint32_t BSrcBBIndex = getIndex(B.first);
-      if (ASrcBBIndex != BSrcBBIndex)
-        return ASrcBBIndex > BSrcBBIndex;
-      return getIndex(A.second) > getIndex(B.second);
-    }
-    return Weight[A] < Weight[B];
-  };
-  std::priority_queue<EdgeTy, std::vector<EdgeTy>, decltype(Comp)> Queue(Comp);
+    case LT_OPTIMIZE_CACHE:
+      Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo)));
+      break;
 
-  typedef std::vector<BinaryBasicBlock *> ClusterTy;
-  typedef std::map<BinaryBasicBlock *, int> BBToClusterMapTy;
-  std::vector<ClusterTy> Clusters;
-  BBToClusterMapTy BBToClusterMap;
-
-  // Encode relative weights between two clusters
-  std::vector<std::map<uint32_t, uint64_t>> ClusterEdges;
-  ClusterEdges.resize(BasicBlocksLayout.size());
-
-  for (auto BB : BasicBlocksLayout) {
-    // Create a cluster for this BB
-    uint32_t I = Clusters.size();
-    Clusters.emplace_back();
-    auto &Cluster = Clusters.back();
-    Cluster.push_back(BB);
-    BBToClusterMap[BB] = I;
-    // Populate priority queue with edges
-    auto BI = BB->BranchInfo.begin();
-    for (auto &I : BB->successors()) {
-      if (BI->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE)
-        Weight[std::make_pair(BB, I)] = BI->Count;
-      Queue.push(std::make_pair(BB, I));
-      ++BI;
+    default:
+      llvm_unreachable("unexpected layout type");
     }
   }
 
-  // Grow clusters in a greedy fashion
-  while (!Queue.empty()) {
-    auto elmt = Queue.top();
-    Queue.pop();
-
-    BinaryBasicBlock *BBSrc = elmt.first;
-    BinaryBasicBlock *BBDst = elmt.second;
-
-    // Case 1: BBSrc and BBDst are the same. Ignore this edge
-    if (BBSrc == BBDst || BBDst == *BasicBlocksLayout.begin())
-      continue;
-
-    int I = BBToClusterMap[BBSrc];
-    int J = BBToClusterMap[BBDst];
-
-    // Case 2: If they are already allocated at the same cluster, just increase
-    // the weight of this cluster
-    if (I == J) {
-      ClusterEdges[I][I] += Weight[elmt];
-      continue;
-    }
-
-    auto &ClusterA = Clusters[I];
-    auto &ClusterB = Clusters[J];
-    if (ClusterA.back() == BBSrc && ClusterB.front() == BBDst) {
-      // Case 3: BBSrc is at the end of a cluster and BBDst is at the start,
-      // allowing us to merge two clusters
-      for (auto BB : ClusterB)
-        BBToClusterMap[BB] = I;
-      ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end());
-      ClusterB.clear();
-      // Iterate through all inter-cluster edges and transfer edges targeting
-      // cluster B to cluster A.
-      // It is bad to have to iterate through all edges when we could have a
-      // list of predecessors for cluster B. However, it's not clear if it is
-      // worth the added code complexity to create a data structure for
-      // clusters that maintains a list of predecessors. Maybe change this if
-      // it becomes a deal breaker.
-      for (uint32_t K = 0, E = ClusterEdges.size(); K != E; ++K)
-        ClusterEdges[K][I] += ClusterEdges[K][J];
-    } else {
-      // Case 4: Both BBSrc and BBDst are allocated in positions where we
-      // cannot merge them.
-      // Annotate the weight of this edge in the weight between clusters to
-      // help us decide ordering between these clusters.
-      ClusterEdges[I][J] += Weight[elmt];
-    }
-  }
-  std::vector<uint32_t> Order; // Cluster layout order
-
-  // Here we have 3 conflicting goals as to how to layout clusters. If we want
-  // to minimize jump offsets, we should put clusters with heavy inter-cluster
-  // dependence as close as possible. If we want to maximize the probability
-  // that all inter-cluster edges are predicted as not-taken, we should enforce
-  // a topological order to make targets appear after sources, creating forward
-  // branches. If we want to separate hot from cold blocks to maximize the
-  // probability that infrequently executed code doesn't pollute the cache, we
-  // should put clusters in descending order of hotness.
-  std::vector<double> AvgFreq;
-  AvgFreq.resize(Clusters.size(), 0.0);
-  for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
-    double Freq = 0.0;
-    for (auto BB : Clusters[I]) {
-      if (!BB->empty() && BB->size() != BB->getNumPseudos())
-        Freq += ((double) BB->getExecutionCount()) /
-                (BB->size() - BB->getNumPseudos());
-    }
-    AvgFreq[I] = Freq;
-  }
-
-  if (opts::PrintClusters) {
-    for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
-      errs() << "Cluster number " << I << " (frequency: " << AvgFreq[I]
-             << ") : ";
-      auto Sep = "";
-      for (auto BB : Clusters[I]) {
-        errs() << Sep << BB->getName();
-        Sep = ", ";
-      }
-      errs() << "\n";
-    };
-  }
-
-  switch(Type) {
-  case LT_OPTIMIZE: {
-    for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
-      if (!Clusters[I].empty())
-        Order.push_back(I);
-    break;
-  }
-  case LT_OPTIMIZE_BRANCH: {
-    // Do a topological sort for clusters, prioritizing frequently-executed BBs
-    // during the traversal.
-    std::stack<uint32_t> Stack;
-    std::vector<uint32_t> Status;
-    std::vector<uint32_t> Parent;
-    Status.resize(Clusters.size(), 0);
-    Parent.resize(Clusters.size(), 0);
-    constexpr uint32_t STACKED = 1;
-    constexpr uint32_t VISITED = 2;
-    Status[0] = STACKED;
-    Stack.push(0);
-    while (!Stack.empty()) {
-      uint32_t I = Stack.top();
-      if (!(Status[I] & VISITED)) {
-        Status[I] |= VISITED;
-        // Order successors by weight
-        auto ClusterComp = [&ClusterEdges, I](uint32_t A, uint32_t B) {
-          return ClusterEdges[I][A] > ClusterEdges[I][B];
-        };
-        std::priority_queue<uint32_t, std::vector<uint32_t>,
-                            decltype(ClusterComp)> SuccQueue(ClusterComp);
-        for (auto &Target: ClusterEdges[I]) {
-          if (Target.second > 0 && !(Status[Target.first] & STACKED) &&
-              !Clusters[Target.first].empty()) {
-            Parent[Target.first] = I;
-            Status[Target.first] = STACKED;
-            SuccQueue.push(Target.first);
-          }
-        }
-        while (!SuccQueue.empty()) {
-          Stack.push(SuccQueue.top());
-          SuccQueue.pop();
-        }
-        continue;
-      }
-      // Already visited this node
-      Stack.pop();
-      Order.push_back(I);
-    }
-    std::reverse(Order.begin(), Order.end());
-    // Put unreachable clusters at the end
-    for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
-      if (!(Status[I] & VISITED) && !Clusters[I].empty())
-        Order.push_back(I);
-
-    // Sort nodes with equal precedence
-    auto Beg = Order.begin();
-    // Don't reorder the first cluster, which contains the function entry point
-    ++Beg;
-    std::stable_sort(Beg, Order.end(),
-                     [&AvgFreq, &Parent](uint32_t A, uint32_t B) {
-                       uint32_t P = Parent[A];
-                       while (Parent[P] != 0) {
-                         if (Parent[P] == B)
-                           return false;
-                         P = Parent[P];
-                       }
-                       P = Parent[B];
-                       while (Parent[P] != 0) {
-                         if (Parent[P] == A)
-                           return true;
-                         P = Parent[P];
-                       }
-                       return AvgFreq[A] > AvgFreq[B];
-                     });
-    break;
-  }
-  case LT_OPTIMIZE_CACHE: {
-    // Order clusters based on average instruction execution frequency
-    for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
-      if (!Clusters[I].empty())
-        Order.push_back(I);
-    auto Beg = Order.begin();
-    // Don't reorder the first cluster, which contains the function entry point
-    ++Beg;
-    std::stable_sort(Beg, Order.end(), [&AvgFreq](uint32_t A, uint32_t B) {
-      return AvgFreq[A] > AvgFreq[B];
-    });
-
-    break;
-  }
-  default:
-    llvm_unreachable("unexpected layout type");
-  }
-
-  if (opts::PrintClusters) {
-    errs() << "New cluster order: ";
-    auto Sep = "";
-    for (auto O : Order) {
-      errs() << Sep << O;
-      Sep = ", ";
-    }
-    errs() << '\n';
-  }
-
+  Algo->reorderBasicBlocks(*this, NewLayout);
   BasicBlocksLayout.clear();
-  for (auto I : Order) {
-    auto &Cluster = Clusters[I];
-    BasicBlocksLayout.insert(BasicBlocksLayout.end(), Cluster.begin(),
-                             Cluster.end());
-  }
-
-  if (Split)
-    splitFunction();
-  fixBranches();
-}
-
-void BinaryFunction::solveOptimalLayout(bool Split) {
-  std::vector<std::vector<uint64_t>> Weight;
-  std::map<BinaryBasicBlock *, int> BBToIndex;
-  std::vector<BinaryBasicBlock *> IndexToBB;
-
-  DEBUG(dbgs() << "finding optimal block layout for " << getName() << "\n");
-
-  unsigned N = BasicBlocksLayout.size();
-  // Populating weight map and index map
-  for (auto BB : BasicBlocksLayout) {
-    BBToIndex[BB] = IndexToBB.size();
-    IndexToBB.push_back(BB);
-  }
-  Weight.resize(N);
-  for (auto BB : BasicBlocksLayout) {
-    auto BI = BB->BranchInfo.begin();
-    Weight[BBToIndex[BB]].resize(N);
-    for (auto I : BB->successors()) {
-      if (BI->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE)
-        Weight[BBToIndex[BB]][BBToIndex[I]] = BI->Count;
-      ++BI;
-    }
-  }
-
-  std::vector<std::vector<int64_t>> DP;
-  DP.resize(1 << N);
-  for (auto &Elmt : DP) {
-    Elmt.resize(N, -1);
-  }
-  // Start with the entry basic block being allocated with cost zero
-  DP[1][0] = 0;
-  // Walk through TSP solutions using a bitmask to represent state (current set
-  // of BBs in the layout)
-  unsigned BestSet = 1;
-  unsigned BestLast = 0;
-  int64_t BestWeight = 0;
-  for (unsigned Set = 1; Set < (1U << N); ++Set) {
-    // Traverse each possibility of Last BB visited in this layout
-    for (unsigned Last = 0; Last < N; ++Last) {
-      // Case 1: There is no possible layout with this BB as Last
-      if (DP[Set][Last] == -1)
-        continue;
-
-      // Case 2: There is a layout with this Set and this Last, and we try
-      // to expand this set with New
-      for (unsigned New = 1; New < N; ++New) {
-        // Case 2a: BB "New" is already in this Set
-        if ((Set & (1 << New)) != 0)
-          continue;
-
-        // Case 2b: BB "New" is not in this set and we add it to this Set and
-        // record total weight of this layout with "New" as the last BB.
-        unsigned NewSet = (Set | (1 << New));
-        if (DP[NewSet][New] == -1)
-          DP[NewSet][New] = DP[Set][Last] + (int64_t)Weight[Last][New];
-        DP[NewSet][New] = std::max(DP[NewSet][New],
-                                   DP[Set][Last] + (int64_t)Weight[Last][New]);
-
-        if (DP[NewSet][New] > BestWeight) {
-          BestWeight = DP[NewSet][New];
-          BestSet = NewSet;
-          BestLast = New;
-        }
-      }
-    }
-  }
-
-  std::vector<BinaryBasicBlock *> PastLayout = BasicBlocksLayout;
-
-  // Define final function layout based on layout that maximizes weight
-  BasicBlocksLayout.clear();
-  unsigned Last = BestLast;
-  unsigned Set = BestSet;
-  std::vector<bool> Visited;
-  Visited.resize(N);
-  Visited[Last] = true;
-  BasicBlocksLayout.push_back(IndexToBB[Last]);
-  Set = Set & ~(1U << Last);
-  while (Set != 0) {
-    int64_t Best = -1;
-    for (unsigned I = 0; I < N; ++I) {
-      if (DP[Set][I] == -1)
-        continue;
-      if (DP[Set][I] > Best) {
-        Last = I;
-        Best = DP[Set][I];
-      }
-    }
-    Visited[Last] = true;
-    BasicBlocksLayout.push_back(IndexToBB[Last]);
-    Set = Set & ~(1U << Last);
-  }
-  std::reverse(BasicBlocksLayout.begin(), BasicBlocksLayout.end());
-
-  // Finalize layout with BBs that weren't assigned to the layout
-  for (auto BB : PastLayout) {
-    if (Visited[BBToIndex[BB]] == false)
-      BasicBlocksLayout.push_back(BB);
-  }
+  BasicBlocksLayout.swap(NewLayout);
 
   if (Split)
     splitFunction();
diff --git a/bolt/BinaryFunction.h b/bolt/BinaryFunction.h
index a739227781f5..93dcf84897fe 100644
--- a/bolt/BinaryFunction.h
+++ b/bolt/BinaryFunction.h
@@ -306,6 +306,9 @@ public:
 
   typedef BasicBlockOrderType::iterator order_iterator;
   typedef BasicBlockOrderType::const_iterator const_order_iterator;
+  typedef BasicBlockOrderType::reverse_iterator reverse_order_iterator;
+  typedef BasicBlockOrderType::const_reverse_iterator
+                                               const_reverse_order_iterator;
 
   // CFG iterators.
   iterator                begin()       { return BasicBlocks.begin(); }
@@ -325,19 +328,39 @@ public:
   const BinaryBasicBlock & back() const  { return *BasicBlocks.back(); }
   BinaryBasicBlock &       back()        { return *BasicBlocks.back(); }
 
-  unsigned layout_size() const {
-    return (unsigned)BasicBlocksLayout.size();
-  }
-  const_order_iterator layout_begin() const {
-    return BasicBlocksLayout.begin();
-  }
-  order_iterator layout_begin() { return BasicBlocksLayout.begin(); }
+  order_iterator       layout_begin()   { return BasicBlocksLayout.begin(); }
+  const_order_iterator layout_begin() const
+                                         { return BasicBlocksLayout.begin(); }
+  order_iterator       layout_end()     { return BasicBlocksLayout.end(); }
+  const_order_iterator layout_end() const
+                                         { return BasicBlocksLayout.end(); }
+  reverse_order_iterator       layout_rbegin()
+                                         { return BasicBlocksLayout.rbegin(); }
+  const_reverse_order_iterator layout_rbegin() const
+                                         { return BasicBlocksLayout.rbegin(); }
+  reverse_order_iterator       layout_rend()
+                                         { return BasicBlocksLayout.rend(); }
+  const_reverse_order_iterator layout_rend() const
+                                         { return BasicBlocksLayout.rend(); }
+  unsigned layout_size() const  { return (unsigned)BasicBlocksLayout.size(); }
+  bool     layout_empty() const { return BasicBlocksLayout.empty(); }
+  const BinaryBasicBlock *layout_front() const
+                                         { return BasicBlocksLayout.front(); }
+  BinaryBasicBlock *layout_front()       { return BasicBlocksLayout.front(); }
+  const BinaryBasicBlock *layout_back() const
+                                         { return BasicBlocksLayout.back(); }
+  BinaryBasicBlock *layout_back()        { return BasicBlocksLayout.back(); }
 
   inline iterator_range<order_iterator> layout() {
     return iterator_range<order_iterator>(BasicBlocksLayout.begin(),
                                           BasicBlocksLayout.end());
   }
 
+  inline iterator_range<const_order_iterator> layout() const {
+    return iterator_range<const_order_iterator>(BasicBlocksLayout.begin(),
+                                                BasicBlocksLayout.end());
+  }
+
   cfi_iterator        cie_begin()       { return CIEFrameInstructions.begin(); }
   const_cfi_iterator  cie_begin() const { return CIEFrameInstructions.begin(); }
   cfi_iterator        cie_end()         { return CIEFrameInstructions.end(); }
@@ -368,14 +391,6 @@ public:
   /// end of basic blocks.
   void modifyLayout(LayoutType Type, bool Split);
 
-  /// Dynamic programming implementation for the TSP, applied to BB layout. Find
-  /// the optimal way to maximize weight during a path traversing all BBs. In
-  /// this way, we will convert the hottest branches into fall-throughs.
-  ///
-  /// Uses exponential amount of memory on the number of basic blocks and should
-  /// only be used for small functions.
-  void solveOptimalLayout(bool Split);
-
   /// View CFG in graphviz program
   void viewGraph();
 
diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt
index 1adf2aaf1e59..53faad6bd59a 100644
--- a/bolt/CMakeLists.txt
+++ b/bolt/CMakeLists.txt
@@ -24,4 +24,5 @@ add_llvm_tool(llvm-bolt
   DebugData.cpp
   Exceptions.cpp
   RewriteInstance.cpp
+  ReorderAlgorithm.cpp
 )
diff --git a/bolt/ReorderAlgorithm.cpp b/bolt/ReorderAlgorithm.cpp
new file mode 100644
index 000000000000..8465b9aff4d1
--- /dev/null
+++ b/bolt/ReorderAlgorithm.cpp
@@ -0,0 +1,436 @@
+//===--- ReorderAlgorithm.cpp - Basic block reordering algorithms ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements different basic block reordering algorithms.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ReorderAlgorithm.h"
+#include "BinaryBasicBlock.h"
+#include "BinaryFunction.h"
+#include "llvm/Support/CommandLine.h"
+#include <queue>
+
+using namespace llvm;
+using namespace bolt;
+
+namespace opts {
+
+static cl::opt<bool>
+PrintClusters("print-clusters", cl::desc("print clusters"), cl::Optional);
+
+} // namespace opts
+
+void ClusterAlgorithm::computeClusterAverageFrequency() {
+  AvgFreq.resize(Clusters.size(), 0.0);
+  for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
+    double Freq = 0.0;
+    for (auto BB : Clusters[I]) {
+      if (!BB->empty() && BB->size() != BB->getNumPseudos())
+        Freq += ((double) BB->getExecutionCount()) /
+                (BB->size() - BB->getNumPseudos());
+    }
+    AvgFreq[I] = Freq;
+  }
+}
+
+void ClusterAlgorithm::printClusters() const {
+  for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
+    errs() << "Cluster number " << I;
+    if (AvgFreq.size() == Clusters.size())
+      errs() << " (frequency: " << AvgFreq[I] << ")";
+    errs() << " : ";
+    auto Sep = "";
+    for (auto BB : Clusters[I]) {
+      errs() << Sep << BB->getName();
+      Sep = ", ";
+    }
+    errs() << "\n";
+  }
+}
+
+void ClusterAlgorithm::reset() {
+  Clusters.clear();
+  ClusterEdges.clear();
+  AvgFreq.clear();
+}
+
+void GreedyClusterAlgorithm::clusterBasicBlocks(const BinaryFunction &BF) {
+  reset();
+
+  // Greedy heuristic implementation for the TSP, applied to BB layout. Try to
+  // maximize weight during a path traversing all BBs. In this way, we will
+  // convert the hottest branches into fall-throughs.
+
+  // Encode an edge between two basic blocks, source and destination
+  typedef std::pair<BinaryBasicBlock *, BinaryBasicBlock *> EdgeTy;
+  std::map<EdgeTy, uint64_t> Weight;
+
+  // Define a comparison function to establish SWO between edges
+  auto Comp = [&] (EdgeTy A, EdgeTy B) {
+    // With equal weights, prioritize branches with lower index
+    // source/destination. This helps to keep original block order for blocks
+    // when optimal order cannot be deduced from a profile.
+    if (Weight[A] == Weight[B]) {
+      uint32_t ASrcBBIndex = BF.getIndex(A.first);
+      uint32_t BSrcBBIndex = BF.getIndex(B.first);
+      if (ASrcBBIndex != BSrcBBIndex)
+        return ASrcBBIndex > BSrcBBIndex;
+      return BF.getIndex(A.second) > BF.getIndex(B.second);
+    }
+    return Weight[A] < Weight[B];
+  };
+  std::priority_queue<EdgeTy, std::vector<EdgeTy>, decltype(Comp)> Queue(Comp);
+
+  typedef std::map<BinaryBasicBlock *, int> BBToClusterMapTy;
+  BBToClusterMapTy BBToClusterMap;
+
+  ClusterEdges.resize(BF.layout_size());
+
+  for (auto BB : BF.layout()) {
+    // Create a cluster for this BB
+    uint32_t I = Clusters.size();
+    Clusters.emplace_back();
+    auto &Cluster = Clusters.back();
+    Cluster.push_back(BB);
+    BBToClusterMap[BB] = I;
+    // Populate priority queue with edges
+    auto BI = BB->branch_info_begin();
+    for (auto &I : BB->successors()) {
+      if (BI->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE)
+        Weight[std::make_pair(BB, I)] = BI->Count;
+      Queue.push(std::make_pair(BB, I));
+      ++BI;
+    }
+  }
+
+  // Grow clusters in a greedy fashion
+  while (!Queue.empty()) {
+    auto elmt = Queue.top();
+    Queue.pop();
+
+    BinaryBasicBlock *BBSrc = elmt.first;
+    BinaryBasicBlock *BBDst = elmt.second;
+
+    // Case 1: BBSrc and BBDst are the same. Ignore this edge
+    if (BBSrc == BBDst || BBDst == *BF.layout_begin())
+      continue;
+
+    int I = BBToClusterMap[BBSrc];
+    int J = BBToClusterMap[BBDst];
+
+    // Case 2: If they are already allocated at the same cluster, just increase
+    // the weight of this cluster
+    if (I == J) {
+      ClusterEdges[I][I] += Weight[elmt];
+      continue;
+    }
+
+    auto &ClusterA = Clusters[I];
+    auto &ClusterB = Clusters[J];
+    if (ClusterA.back() == BBSrc && ClusterB.front() == BBDst) {
+      // Case 3: BBSrc is at the end of a cluster and BBDst is at the start,
+      // allowing us to merge two clusters
+      for (auto BB : ClusterB)
+        BBToClusterMap[BB] = I;
+      ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end());
+      ClusterB.clear();
+      // Iterate through all inter-cluster edges and transfer edges targeting
+      // cluster B to cluster A.
+      // It is bad to have to iterate through all edges when we could have a
+      // list of predecessors for cluster B. However, it's not clear if it is
+      // worth the added code complexity to create a data structure for
+      // clusters that maintains a list of predecessors. Maybe change this if
+      // it becomes a deal breaker.
+      for (uint32_t K = 0, E = ClusterEdges.size(); K != E; ++K)
+        ClusterEdges[K][I] += ClusterEdges[K][J];
+    } else {
+      // Case 4: Both BBSrc and BBDst are allocated in positions where we
+      // cannot merge them. Annotate the weight of this edge in the weight
+      // between clusters to help us decide ordering between these clusters.
+      ClusterEdges[I][J] += Weight[elmt];
+    }
+  }
+}
+
+void OptimalReorderAlgorithm::reorderBasicBlocks(
+    const BinaryFunction &BF, BasicBlockOrder &Order) const {
+  std::vector<std::vector<uint64_t>> Weight;
+  std::map<BinaryBasicBlock *, int> BBToIndex;
+  std::vector<BinaryBasicBlock *> IndexToBB;
+
+  unsigned N = BF.layout_size();
+  // Populating weight map and index map
+  for (auto BB : BF.layout()) {
+    BBToIndex[BB] = IndexToBB.size();
+    IndexToBB.push_back(BB);
+  }
+  Weight.resize(N);
+  for (auto BB : BF.layout()) {
+    auto BI = BB->branch_info_begin();
+    Weight[BBToIndex[BB]].resize(N);
+    for (auto I : BB->successors()) {
+      if (BI->Count != BinaryBasicBlock::COUNT_FALLTHROUGH_EDGE)
+        Weight[BBToIndex[BB]][BBToIndex[I]] = BI->Count;
+      ++BI;
+    }
+  }
+
+  std::vector<std::vector<int64_t>> DP;
+  DP.resize(1 << N);
+  for (auto &Elmt : DP) {
+    Elmt.resize(N, -1);
+  }
+  // Start with the entry basic block being allocated with cost zero
+  DP[1][0] = 0;
+  // Walk through TSP solutions using a bitmask to represent state (current set
+  // of BBs in the layout)
+  unsigned BestSet = 1;
+  unsigned BestLast = 0;
+  int64_t BestWeight = 0;
+  for (unsigned Set = 1; Set < (1U << N); ++Set) {
+    // Traverse each possibility of Last BB visited in this layout
+    for (unsigned Last = 0; Last < N; ++Last) {
+      // Case 1: There is no possible layout with this BB as Last
+      if (DP[Set][Last] == -1)
+        continue;
+
+      // Case 2: There is a layout with this Set and this Last, and we try
+      // to expand this set with New
+      for (unsigned New = 1; New < N; ++New) {
+        // Case 2a: BB "New" is already in this Set
+        if ((Set & (1 << New)) != 0)
+          continue;
+
+        // Case 2b: BB "New" is not in this set and we add it to this Set and
+        // record total weight of this layout with "New" as the last BB.
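+        // This is the Held-Karp style recurrence, maximizing edge weight
+        // rather than minimizing tour length:
+        //   DP[Set + {New}][New] = max(DP[Set + {New}][New],
+        //                              DP[Set][Last] + Weight[Last][New])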
+        unsigned NewSet = (Set | (1 << New));
+        if (DP[NewSet][New] == -1)
+          DP[NewSet][New] = DP[Set][Last] + (int64_t)Weight[Last][New];
+        DP[NewSet][New] = std::max(DP[NewSet][New],
+                                   DP[Set][Last] + (int64_t)Weight[Last][New]);
+
+        if (DP[NewSet][New] > BestWeight) {
+          BestWeight = DP[NewSet][New];
+          BestSet = NewSet;
+          BestLast = New;
+        }
+      }
+    }
+  }
+
+  // Define final function layout based on layout that maximizes weight
+  unsigned Last = BestLast;
+  unsigned Set = BestSet;
+  std::vector<bool> Visited;
+  Visited.resize(N);
+  Visited[Last] = true;
+  Order.push_back(IndexToBB[Last]);
+  Set = Set & ~(1U << Last);
+  while (Set != 0) {
+    int64_t Best = -1;
+    for (unsigned I = 0; I < N; ++I) {
+      if (DP[Set][I] == -1)
+        continue;
+      if (DP[Set][I] > Best) {
+        Last = I;
+        Best = DP[Set][I];
+      }
+    }
+    Visited[Last] = true;
+    Order.push_back(IndexToBB[Last]);
+    Set = Set & ~(1U << Last);
+  }
+  std::reverse(Order.begin(), Order.end());
+
+  // Finalize layout with BBs that weren't assigned to the layout
+  for (auto BB : BF.layout()) {
+    if (Visited[BBToIndex[BB]] == false)
+      Order.push_back(BB);
+  }
+}
+
+void OptimizeReorderAlgorithm::reorderBasicBlocks(
+    const BinaryFunction &BF, BasicBlockOrder &Order) const {
+  if (BF.layout_empty())
+    return;
+
+  // Cluster basic blocks.
+  CAlgo->clusterBasicBlocks(BF);
+
+  if (opts::PrintClusters)
+    CAlgo->printClusters();
+
+  // Arrange basic blocks according to clusters.
+  for (ClusterAlgorithm::ClusterTy &Cluster : CAlgo->Clusters)
+    Order.insert(Order.end(), Cluster.begin(), Cluster.end());
+}
+
+void OptimizeBranchReorderAlgorithm::reorderBasicBlocks(
+    const BinaryFunction &BF, BasicBlockOrder &Order) const {
+  if (BF.layout_empty())
+    return;
+
+  // Cluster basic blocks.
+  CAlgo->clusterBasicBlocks(BF);
+  std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;
+  std::vector<std::map<uint32_t, uint64_t>> &ClusterEdges =
+      CAlgo->ClusterEdges;
+
+  // Compute clusters' average frequencies.
+  CAlgo->computeClusterAverageFrequency();
+  std::vector<double> &AvgFreq = CAlgo->AvgFreq;
+
+  if (opts::PrintClusters)
+    CAlgo->printClusters();
+
+  // Cluster layout order
+  std::vector<uint32_t> ClusterOrder;
+
+  // Do a topological sort for clusters, prioritizing frequently-executed BBs
+  // during the traversal.
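+  // Each cluster starts unmarked, is flagged STACKED when first pushed, and
+  // VISITED once its successors have been queued; clusters are emitted in
+  // post-order, and the order is reversed to obtain the topological sort.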
+  std::stack<uint32_t> Stack;
+  std::vector<uint32_t> Status;
+  std::vector<uint32_t> Parent;
+  Status.resize(Clusters.size(), 0);
+  Parent.resize(Clusters.size(), 0);
+  constexpr uint32_t STACKED = 1;
+  constexpr uint32_t VISITED = 2;
+  Status[0] = STACKED;
+  Stack.push(0);
+  while (!Stack.empty()) {
+    uint32_t I = Stack.top();
+    if (!(Status[I] & VISITED)) {
+      Status[I] |= VISITED;
+      // Order successors by weight
+      auto ClusterComp = [&ClusterEdges, I](uint32_t A, uint32_t B) {
+        return ClusterEdges[I][A] > ClusterEdges[I][B];
+      };
+      std::priority_queue<uint32_t, std::vector<uint32_t>,
+                          decltype(ClusterComp)> SuccQueue(ClusterComp);
+      for (auto &Target: ClusterEdges[I]) {
+        if (Target.second > 0 && !(Status[Target.first] & STACKED) &&
+            !Clusters[Target.first].empty()) {
+          Parent[Target.first] = I;
+          Status[Target.first] = STACKED;
+          SuccQueue.push(Target.first);
+        }
+      }
+      while (!SuccQueue.empty()) {
+        Stack.push(SuccQueue.top());
+        SuccQueue.pop();
+      }
+      continue;
+    }
+    // Already visited this node
+    Stack.pop();
+    ClusterOrder.push_back(I);
+  }
+  std::reverse(ClusterOrder.begin(), ClusterOrder.end());
+  // Put unreachable clusters at the end
+  for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
+    if (!(Status[I] & VISITED) && !Clusters[I].empty())
+      ClusterOrder.push_back(I);
+
+  // Sort nodes with equal precedence
+  auto Beg = ClusterOrder.begin();
+  // Don't reorder the first cluster, which contains the function entry point
+  ++Beg;
+  std::stable_sort(Beg, ClusterOrder.end(),
+                   [&AvgFreq, &Parent](uint32_t A, uint32_t B) {
+                     uint32_t P = Parent[A];
+                     while (Parent[P] != 0) {
+                       if (Parent[P] == B)
+                         return false;
+                       P = Parent[P];
+                     }
+                     P = Parent[B];
+                     while (Parent[P] != 0) {
+                       if (Parent[P] == A)
+                         return true;
+                       P = Parent[P];
+                     }
+                     return AvgFreq[A] > AvgFreq[B];
+                   });
+
+  if (opts::PrintClusters) {
+    errs() << "New cluster order: ";
+    auto Sep = "";
+    for (auto O : ClusterOrder) {
+      errs() << Sep << O;
+      Sep = ", ";
+    }
+    errs() << '\n';
+  }
+
+  // Arrange basic blocks according to cluster order.
+  for (uint32_t ClusterIndex : ClusterOrder) {
+    ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
+    Order.insert(Order.end(), Cluster.begin(), Cluster.end());
+  }
+}
+
+void OptimizeCacheReorderAlgorithm::reorderBasicBlocks(
+    const BinaryFunction &BF, BasicBlockOrder &Order) const {
+  if (BF.layout_empty())
+    return;
+
+  // Cluster basic blocks.
+  CAlgo->clusterBasicBlocks(BF);
+  std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;
+
+  // Compute clusters' average frequencies.
+  CAlgo->computeClusterAverageFrequency();
+  std::vector<double> &AvgFreq = CAlgo->AvgFreq;
+
+  if (opts::PrintClusters)
+    CAlgo->printClusters();
+
+  // Cluster layout order
+  std::vector<uint32_t> ClusterOrder;
+
+  // Order clusters based on average instruction execution frequency
+  for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
+    if (!Clusters[I].empty())
+      ClusterOrder.push_back(I);
+  auto Beg = ClusterOrder.begin();
+  // Don't reorder the first cluster, which contains the function entry point
+  ++Beg;
+  std::stable_sort(Beg, ClusterOrder.end(),
+                   [&AvgFreq](uint32_t A, uint32_t B) {
+    return AvgFreq[A] > AvgFreq[B];
+  });
+
+  if (opts::PrintClusters) {
+    errs() << "New cluster order: ";
+    auto Sep = "";
+    for (auto O : ClusterOrder) {
+      errs() << Sep << O;
+      Sep = ", ";
+    }
+    errs() << '\n';
+  }
+
+  // Arrange basic blocks according to cluster order.
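+  // Blocks keep their relative order within each cluster; only the order of
+  // the clusters themselves changes.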
+  for (uint32_t ClusterIndex : ClusterOrder) {
+    ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
+    Order.insert(Order.end(), Cluster.begin(), Cluster.end());
+  }
+}
+
+void ReverseReorderAlgorithm::reorderBasicBlocks(
+    const BinaryFunction &BF, BasicBlockOrder &Order) const {
+  if (BF.layout_empty())
+    return;
+
+  auto FirstBB = *BF.layout_begin();
+  Order.push_back(FirstBB);
+  for (auto RLI = BF.layout_rbegin(); *RLI != FirstBB; ++RLI)
+    Order.push_back(*RLI);
+}
+
+
diff --git a/bolt/ReorderAlgorithm.h b/bolt/ReorderAlgorithm.h
new file mode 100644
index 000000000000..9ea30ed19f81
--- /dev/null
+++ b/bolt/ReorderAlgorithm.h
@@ -0,0 +1,168 @@
+//===- ReorderAlgorithm.h - Interface for basic block reordering algorithms =//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Interface to different basic block reordering algorithms.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_BOLT_REORDER_ALGORITHM_H
+#define LLVM_TOOLS_LLVM_BOLT_REORDER_ALGORITHM_H
+
+#include "llvm/Support/ErrorHandling.h"
+#include <map>
+#include <memory>
+#include <vector>
+
+
+namespace llvm {
+namespace bolt {
+
+
+class BinaryBasicBlock;
+class BinaryFunction;
+
+/// Objects of this class implement various basic block clustering algorithms.
+/// Basic block clusters are chains of basic blocks that should be laid out
+/// in this order to maximize performance. These algorithms group basic blocks
+/// into clusters using execution profile data and various heuristics.
+class ClusterAlgorithm {
+public:
+  typedef std::vector<BinaryBasicBlock *> ClusterTy;
+  std::vector<ClusterTy> Clusters;
+  std::vector<std::map<uint32_t, uint64_t>> ClusterEdges;
+  std::vector<double> AvgFreq;
+
+  /// Group the basic blocks of the given function into clusters stored in the
+  /// Clusters vector. Also encode relative weights between two clusters in
+  /// the ClusterEdges vector. This vector is indexed by the cluster indices
+  /// in the Clusters vector.
+  virtual void clusterBasicBlocks(const BinaryFunction &BF) = 0;
+
+  /// Compute for each cluster its average execution frequency, that is the
+  /// sum of average frequencies of its blocks (execution count / # instrs).
+  /// The average frequencies are stored in the AvgFreq vector, indexed by the
+  /// cluster indices in the Clusters vector.
+  void computeClusterAverageFrequency();
+
+  /// Clear clusters and related info.
+  void reset();
+
+  void printClusters() const;
+
+  virtual ~ClusterAlgorithm() { }
+};
+
+
+/// This clustering algorithm is based on a greedy heuristic suggested by
+/// Pettis and Hansen (PLDI '90).
+class GreedyClusterAlgorithm : public ClusterAlgorithm {
+public:
+  void clusterBasicBlocks(const BinaryFunction &BF) override;
+};
+
+/// Objects of this class implement various basic block reordering algorithms.
+/// Most of these algorithms depend on a clustering algorithm.
+/// Here we have 3 conflicting goals as to how to layout clusters. If we want
+/// to minimize jump offsets, we should put clusters with heavy inter-cluster
+/// dependence as close as possible. If we want to maximize the probability
+/// that all inter-cluster edges are predicted as not-taken, we should enforce
+/// a topological order to make targets appear after sources, creating forward
+/// branches.
+/// If we want to separate hot from cold blocks to maximize the probability
+/// that infrequently executed code doesn't pollute the cache, we should put
+/// clusters in descending order of hotness.
+class ReorderAlgorithm {
+protected:
+  std::unique_ptr<ClusterAlgorithm> CAlgo;
+
+public:
+  ReorderAlgorithm() { }
+  explicit ReorderAlgorithm(std::unique_ptr<ClusterAlgorithm> CAlgo) :
+    CAlgo(std::move(CAlgo)) { }
+
+  typedef std::vector<BinaryBasicBlock *> BasicBlockOrder;
+
+  /// Reorder the basic blocks of the given function and store the new order
+  /// in the Order vector.
+  virtual void reorderBasicBlocks(
+      const BinaryFunction &BF, BasicBlockOrder &Order) const = 0;
+
+  void setClusterAlgorithm(ClusterAlgorithm *CAlgo) {
+    this->CAlgo.reset(CAlgo);
+  }
+
+  virtual ~ReorderAlgorithm() { }
+};
+
+
+/// Dynamic programming implementation for the TSP, applied to BB layout. Find
+/// the optimal way to maximize weight during a path traversing all BBs. In
+/// this way, we will convert the hottest branches into fall-throughs.
+///
+/// Uses exponential amount of memory on the number of basic blocks and should
+/// only be used for small functions.
+class OptimalReorderAlgorithm : public ReorderAlgorithm {
+public:
+  void reorderBasicBlocks(
+      const BinaryFunction &BF, BasicBlockOrder &Order) const override;
+};
+
+
+/// Simple algorithm that groups basic blocks into clusters and then
+/// lays them out cluster after cluster.
+class OptimizeReorderAlgorithm : public ReorderAlgorithm {
+public:
+  explicit OptimizeReorderAlgorithm(std::unique_ptr<ClusterAlgorithm> CAlgo) :
+    ReorderAlgorithm(std::move(CAlgo)) { }
+
+  void reorderBasicBlocks(
+      const BinaryFunction &BF, BasicBlockOrder &Order) const override;
+};
+
+
+/// This reorder algorithm tries to ensure that all inter-cluster edges are
+/// predicted as not-taken, by enforcing a topological order to make
+/// targets appear after sources, creating forward branches.
+class OptimizeBranchReorderAlgorithm : public ReorderAlgorithm {
+public:
+  explicit OptimizeBranchReorderAlgorithm(
+      std::unique_ptr<ClusterAlgorithm> CAlgo) :
+    ReorderAlgorithm(std::move(CAlgo)) { }
+
+  void reorderBasicBlocks(
+      const BinaryFunction &BF, BasicBlockOrder &Order) const override;
+};
+
+
+/// This reorder algorithm tries to separate hot from cold blocks to maximize
+/// the probability that infrequently executed code doesn't pollute the cache,
+/// by putting clusters in descending order of hotness.
+class OptimizeCacheReorderAlgorithm : public ReorderAlgorithm {
+public:
+  explicit OptimizeCacheReorderAlgorithm(
+      std::unique_ptr<ClusterAlgorithm> CAlgo) :
+    ReorderAlgorithm(std::move(CAlgo)) { }
+
+  void reorderBasicBlocks(
+      const BinaryFunction &BF, BasicBlockOrder &Order) const override;
+};
+
+
+/// Toy example that simply reverses the original basic block order.
+class ReverseReorderAlgorithm : public ReorderAlgorithm {
+public:
+  void reorderBasicBlocks(
+      const BinaryFunction &BF, BasicBlockOrder &Order) const override;
+};
+
+
+} // namespace bolt
+} // namespace llvm
+
+#endif