Files
llvm/bolt/Passes/ReorderAlgorithm.cpp

707 lines
22 KiB
C++
Raw Normal View History

//===--- Passes/ReorderAlgorithm.cpp - Basic block reordering algorithms --===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Implements different basic block reordering algorithms.
//
//===----------------------------------------------------------------------===//
#include "ReorderAlgorithm.h"
#include "BinaryBasicBlock.h"
#include "BinaryFunction.h"
#include "llvm/Support/CommandLine.h"
#include <algorithm>
#include <functional>
#include <queue>
#include <random>
// Override any DEBUG_TYPE inherited from the headers above; the DEBUG(...)
// statements in this file are tagged "bolt".
#undef DEBUG_TYPE
#define DEBUG_TYPE "bolt"
// Names from both namespaces are used unqualified throughout this file.
using namespace llvm;
using namespace bolt;
// Command-line options consumed by the reordering algorithms in this file.
namespace opts {
// Option category that both options below are registered under; it is only
// declared here and must be defined in another translation unit.
extern cl::OptionCategory BoltOptCategory;
// -print-clusters: hidden boolean flag. When set, the reorder algorithms in
// this file print the clusters (and, where applicable, the final cluster
// order) to errs().
static cl::opt<bool>
PrintClusters("print-clusters",
cl::desc("print clusters"),
cl::ZeroOrMore,
cl::Hidden,
cl::cat(BoltOptCategory));
// -bolt-seed: hidden option, 42 by default, used as the seed for the random
// cluster layout. Not static, so it has external linkage.
cl::opt<uint32_t>
RandomSeed("bolt-seed",
cl::desc("seed for randomization"),
cl::init(42),
cl::ZeroOrMore,
cl::Hidden,
cl::cat(BoltOptCategory));
} // namespace opts
namespace {

// Mix the std::hash of Val into the running hash Seed. The mixing step adds
// the constant 0x9e3779b9 and two shifted copies of the old seed to the value
// hash, then XORs the sum into Seed.
template <class T>
inline void hashCombine(size_t &Seed, const T &Val) {
  const size_t ValueHash = std::hash<T>()(Val);
  const size_t Mixed = ValueHash + 0x9e3779b9 + (Seed << 6) + (Seed >> 2);
  Seed ^= Mixed;
}

// Hash functor for std::pair<A, B>: starts from the hash of the first member
// and combines the second member into it with hashCombine().
template <typename A, typename B>
struct HashPair {
  size_t operator()(const std::pair<A, B> &Val) const {
    size_t Result = std::hash<A>()(Val.first);
    hashCombine(Result, Val.second);
    return Result;
  }
};

} // end anonymous namespace
// Fill AvgFreq with one value per cluster. The value for a cluster is the sum,
// over its blocks that contain at least one non-pseudo instruction, of the
// block execution count divided by the block's number of non-pseudo
// instructions.
void ClusterAlgorithm::computeClusterAverageFrequency() {
  AvgFreq.resize(Clusters.size(), 0.0);
  const uint32_t NumClusters = Clusters.size();
  for (uint32_t Idx = 0; Idx < NumClusters; ++Idx) {
    double Total = 0.0;
    for (auto Block : Clusters[Idx]) {
      // Blocks made only of pseudo instructions contribute nothing (and would
      // otherwise divide by zero).
      if (!(Block->getNumNonPseudos() > 0))
        continue;
      const double ExecCount = (double) Block->getExecutionCount();
      Total += ExecCount / Block->getNumNonPseudos();
    }
    AvgFreq[Idx] = Total;
  }
}
// Dump every cluster to errs(): its index, its frequency (only when AvgFreq
// holds exactly one entry per cluster) and the comma-separated names of its
// basic blocks.
void ClusterAlgorithm::printClusters() const {
  const bool HaveFrequencies = AvgFreq.size() == Clusters.size();
  const uint32_t NumClusters = Clusters.size();
  for (uint32_t Idx = 0; Idx < NumClusters; ++Idx) {
    errs() << "Cluster number " << Idx;
    if (HaveFrequencies)
      errs() << " (frequency: " << AvgFreq[Idx] << ")";
    errs() << " : ";
    bool IsFirst = true;
    for (auto Block : Clusters[Idx]) {
      // Separator goes before every block name except the first.
      if (!IsFirst)
        errs() << ", ";
      IsFirst = false;
      errs() << Block->getName();
    }
    errs() << "\n";
  }
}
// Discard the results of any previous clustering run: the clusters, the
// inter-cluster edge weights and the per-cluster frequencies.
void ClusterAlgorithm::reset() {
  AvgFreq.clear();
  ClusterEdges.clear();
  Clusters.clear();
}
// Write the edge to OS as "<src name> -> <dst name>, count: <count>".
void GreedyClusterAlgorithm::EdgeTy::print(raw_ostream &OS) const {
  OS << Src->getName();
  OS << " -> ";
  OS << Dst->getName();
  OS << ", count: " << Count;
}
// Hash an edge by its (source, destination) block pointers only; the count is
// not part of the hash.
size_t GreedyClusterAlgorithm::EdgeHash::operator()(const EdgeTy &E) const {
  using BlockPtr = const BinaryBasicBlock *;
  const std::pair<BlockPtr, BlockPtr> Endpoints(E.Src, E.Dst);
  return HashPair<BlockPtr, BlockPtr>()(Endpoints);
}
// Two edges are equal when they connect the same source and destination
// blocks; the count is not compared.
bool GreedyClusterAlgorithm::EdgeEqual::operator()(
    const EdgeTy &A, const EdgeTy &B) const {
  if (A.Src != B.Src)
    return false;
  return A.Dst == B.Dst;
}
// Group the basic blocks of BF into clusters (sequences of blocks to be laid
// out contiguously). Every block starts in its own cluster; edges are then
// taken from the back of a queue ordered by initQueue()/adjustQueue(), and two
// clusters are concatenated whenever areClustersCompatible() accepts the edge.
// Merged-away clusters are left empty in Clusters rather than erased, so
// cluster indices stay stable. When ComputeEdges is true, ClusterEdges is also
// filled with intra-/inter-cluster edge counts for later cluster ordering.
// All previous state is dropped via reset() first.
void GreedyClusterAlgorithm::clusterBasicBlocks(const BinaryFunction &BF,
bool ComputeEdges) {
reset();
// Greedy heuristic implementation for the TSP, applied to BB layout. Try to
// maximize weight during a path traversing all BBs. In this way, we will
// convert the hottest branches into fall-throughs.
// This is the queue of edges from which we will pop edges and use them to
// cluster basic blocks in a greedy fashion.
std::vector<EdgeTy> Queue;
// Initialize inter-cluster weights (one entry per initial cluster).
if (ComputeEdges)
ClusterEdges.resize(BF.layout_size());
// Initialize clusters and edge queue.
for (auto BB : BF.layout()) {
// Create a singleton cluster for this BB and record its index.
uint32_t I = Clusters.size();
Clusters.emplace_back();
auto &Cluster = Clusters.back();
Cluster.push_back(BB);
BBToClusterMap[BB] = I;
// Populate the queue with one edge per successor. BI walks the branch info
// in lockstep with the successor list.
// NOTE(review): the loop variable below shadows the cluster index I above.
auto BI = BB->branch_info_begin();
for (auto &I : BB->successors()) {
assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
"attempted reordering blocks of function with no profile data");
Queue.emplace_back(EdgeTy(BB, I, BI->Count));
++BI;
}
}
// Sort and adjust the edge queue.
initQueue(Queue, BF);
// Grow clusters in a greedy fashion. Edges are consumed from the back of the
// queue.
while (!Queue.empty()) {
auto E = Queue.back();
Queue.pop_back();
const auto *SrcBB = E.Src;
const auto *DstBB = E.Dst;
DEBUG(dbgs() << "Popped edge ";
E.print(dbgs());
dbgs() << "\n");
// Case 1: SrcBB and DstBB are the same, or DstBB is the first block of the
// layout. Ignore this edge.
if (SrcBB == DstBB || DstBB == *BF.layout_begin()) {
DEBUG(dbgs() << "\tIgnored (same src, dst)\n");
continue;
}
int I = BBToClusterMap[SrcBB];
int J = BBToClusterMap[DstBB];
// Case 2: If they are already allocated at the same cluster, just increase
// the weight of this cluster
if (I == J) {
if (ComputeEdges)
ClusterEdges[I][I] += E.Count;
DEBUG(dbgs() << "\tIgnored (src, dst belong to the same cluster)\n");
continue;
}
auto &ClusterA = Clusters[I];
auto &ClusterB = Clusters[J];
if (areClustersCompatible(ClusterA, ClusterB, E)) {
// Case 3: SrcBB is at the end of a cluster and DstBB is at the start,
// allowing us to merge two clusters. Cluster B is appended to cluster A
// and then emptied; its blocks are remapped to index I.
for (auto BB : ClusterB)
BBToClusterMap[BB] = I;
ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end());
ClusterB.clear();
if (ComputeEdges) {
// Increase the intra-cluster edge count of cluster A with the count of
// this edge as well as with the total count of previously visited edges
// from cluster B to cluster A.
ClusterEdges[I][I] += E.Count;
ClusterEdges[I][I] += ClusterEdges[J][I];
// Iterate through all inter-cluster edges and transfer edges targeting
// cluster B to cluster A.
// NOTE(review): the loop bound E below shadows the edge E popped above.
for (uint32_t K = 0, E = ClusterEdges.size(); K != E; ++K)
ClusterEdges[K][I] += ClusterEdges[K][J];
}
// Adjust the weights of the remaining edges and re-sort the queue.
adjustQueue(Queue, BF);
DEBUG(dbgs() << "\tMerged clusters of src, dst\n");
} else {
// Case 4: Both SrcBB and DstBB are allocated in positions we cannot
// merge them. Add the count of this edge to the inter-cluster edge count
// between clusters A and B to help us decide ordering between these
// clusters.
if (ComputeEdges)
ClusterEdges[I][J] += E.Count;
DEBUG(dbgs() << "\tIgnored (src, dst belong to incompatible clusters)\n");
}
}
}
// Clear the block-to-cluster mapping in addition to the state owned by the
// base class.
void GreedyClusterAlgorithm::reset() {
  BBToClusterMap.clear();
  ClusterAlgorithm::reset();
}
// Sort Queue in increasing profile count order. Ties are broken using the
// original layout order of the sources and, when the sources tie as well, of
// the destinations, so the original block order is kept where the profile
// cannot decide.
void PHGreedyClusterAlgorithm::initQueue(
    std::vector<EdgeTy> &Queue, const BinaryFunction &BF) {
  // Strict weak ordering between edges.
  auto EdgeLess = [&BF](const EdgeTy &LHS, const EdgeTy &RHS) {
    if (LHS.Count != RHS.Count)
      return LHS.Count < RHS.Count;

    // Equal counts: fall back to the relative order in the original layout.
    const auto SrcOrder = BF.getOriginalLayoutRelativeOrder(LHS.Src, RHS.Src);
    if (SrcOrder != 0)
      return SrcOrder > 0;
    return BF.getOriginalLayoutRelativeOrder(LHS.Dst, RHS.Dst) > 0;
  };

  std::sort(Queue.begin(), Queue.end(), EdgeLess);
}
// Hook invoked after two clusters were merged. This algorithm keeps the edge
// order established by initQueue(), so the queue is left untouched.
void PHGreedyClusterAlgorithm::adjustQueue(
    std::vector<EdgeTy> &Queue, const BinaryFunction &BF) {
  // Intentionally empty.
}
// Clusters can be concatenated along E only when E leaves the last block of
// Front and enters the first block of Back.
bool PHGreedyClusterAlgorithm::areClustersCompatible(
    const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const {
  const bool SrcEndsFront = Front.back() == E.Src;
  return SrcEndsFront && Back.front() == E.Dst;
}
int64_t MinBranchGreedyClusterAlgorithm::calculateWeight(
const EdgeTy &E, const BinaryFunction &BF) const {
const BinaryBasicBlock *SrcBB = E.Src;
const BinaryBasicBlock *DstBB = E.Dst;
// Initial weight value.
int64_t W = (int64_t)E.Count;
// Adjust the weight by taking into account other edges with the same source.
auto BI = SrcBB->branch_info_begin();
for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) {
assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
"attempted reordering blocks of function with no profile data");
assert(BI->Count <= std::numeric_limits<int64_t>::max() &&
"overflow detected");
// Ignore edges with same source and destination, edges that target the
// entry block as well as the edge E itself.
if (SuccBB != SrcBB && SuccBB != *BF.layout_begin() && SuccBB != DstBB)
W -= (int64_t)BI->Count;
++BI;
}
// Adjust the weight by taking into account other edges with the same
// destination.
for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) {
// Ignore edges with same source and destination as well as the edge E
// itself.
if (PredBB == DstBB || PredBB == SrcBB)
continue;
auto BI = PredBB->branch_info_begin();
for (const BinaryBasicBlock *SuccBB : PredBB->successors()) {
if (SuccBB == DstBB)
break;
++BI;
}
assert(BI != PredBB->branch_info_end() && "invalid control flow graph");
assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
"attempted reordering blocks of function with no profile data");
assert(BI->Count <= std::numeric_limits<int64_t>::max() &&
"overflow detected");
W -= (int64_t)BI->Count;
}
return W;
}
// Record the initial weight of every queued edge, then let adjustQueue()
// filter the queue and sort it in increasing weight order.
void MinBranchGreedyClusterAlgorithm::initQueue(
    std::vector<EdgeTy> &Queue, const BinaryFunction &BF) {
  for (const EdgeTy &Edge : Queue) {
    const auto EdgeWeight = calculateWeight(Edge, BF);
    Weight.emplace(std::make_pair(Edge, EdgeWeight));
  }

  adjustQueue(Queue, BF);
}
// Rebuild Queue after the clusters changed: edges that can no longer merge two
// clusters (self edges, edges into the first layout block, edges inside one
// cluster, edges between incompatible clusters) are removed, and the weights
// of edges sharing a source or destination with a removed edge are increased
// by the removed edge's count. The surviving edges are then sorted in
// increasing weight order.
void MinBranchGreedyClusterAlgorithm::adjustQueue(
std::vector<EdgeTy> &Queue, const BinaryFunction &BF) {
// Define a comparison function to establish SWO between edges.
// NOTE(review): Weight[...] is operator[] on the weight map; it presumably
// relies on every queued edge already having an entry - confirm.
auto Comp = [&] (const EdgeTy &A, const EdgeTy &B) {
// With equal weights, prioritize branches with lower index
// source/destination. This helps to keep original block order for blocks
// when optimal order cannot be deduced from a profile.
if (Weight[A] == Weight[B]) {
const auto SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src);
return (SrcOrder != 0)
? SrcOrder > 0
: BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0;
}
return Weight[A] < Weight[B];
};
// Iterate through all remaining edges to find edges that have their
// source and destination in the same cluster. Edges that survive are
// collected in NewQueue.
std::vector<EdgeTy> NewQueue;
for (const EdgeTy &E : Queue) {
const auto *SrcBB = E.Src;
const auto *DstBB = E.Dst;
// Case 1: SrcBB and DstBB are the same or DstBB is the entry block. Ignore
// this edge.
if (SrcBB == DstBB || DstBB == *BF.layout_begin()) {
DEBUG(dbgs() << "\tAdjustment: Ignored edge ";
E.print(dbgs());
dbgs() << " (same src, dst)\n");
continue;
}
int I = BBToClusterMap[SrcBB];
int J = BBToClusterMap[DstBB];
auto &ClusterA = Clusters[I];
auto &ClusterB = Clusters[J];
// Case 2: They are already allocated at the same cluster or incompatible
// clusters. Adjust the weights of edges with the same source or
// destination, so that this edge has no effect on them any more, and ignore
// this edge. Also increase the intra- (or inter-) cluster edge count.
if (I == J || !areClustersCompatible(ClusterA, ClusterB, E)) {
// ClusterEdges is only non-empty when edge counts were requested.
if (!ClusterEdges.empty())
ClusterEdges[I][J] += E.Count;
DEBUG(dbgs() << "\tAdjustment: Ignored edge ";
E.print(dbgs());
dbgs() << " (src, dst belong to same cluster or incompatible "
"clusters)\n");
// Give the count of this edge back to the other edges leaving SrcBB. The
// lookup key uses count 0 because edge equality ignores the count.
for (const auto *SuccBB : SrcBB->successors()) {
if (SuccBB == DstBB)
continue;
auto WI = Weight.find(EdgeTy(SrcBB, SuccBB, 0));
assert(WI != Weight.end() && "CFG edge not found in Weight map");
WI->second += (int64_t)E.Count;
}
// Likewise for the other edges entering DstBB.
for (const auto *PredBB : DstBB->predecessors()) {
if (PredBB == SrcBB)
continue;
auto WI = Weight.find(EdgeTy(PredBB, DstBB, 0));
assert(WI != Weight.end() && "CFG edge not found in Weight map");
WI->second += (int64_t)E.Count;
}
continue;
}
// Case 3: None of the previous cases is true, so just keep this edge in
// the queue.
NewQueue.emplace_back(E);
}
// Sort remaining edges in increasing weight order.
Queue.swap(NewQueue);
std::sort(Queue.begin(), Queue.end(), Comp);
}
// Two clusters can be merged along E only if E goes from the last block of
// Front to the first block of Back.
bool MinBranchGreedyClusterAlgorithm::areClustersCompatible(
    const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const {
  if (Front.back() != E.Src)
    return false;
  return Back.front() == E.Dst;
}
// Clear the per-edge weights in addition to the state owned by the base
// greedy algorithm.
void MinBranchGreedyClusterAlgorithm::reset() {
  Weight.clear();
  GreedyClusterAlgorithm::reset();
}
void OptimalReorderAlgorithm::reorderBasicBlocks(
const BinaryFunction &BF, BasicBlockOrder &Order) const {
std::vector<std::vector<uint64_t>> Weight;
std::unordered_map<const BinaryBasicBlock *, int> BBToIndex;
std::vector<BinaryBasicBlock *> IndexToBB;
unsigned N = BF.layout_size();
// Populating weight map and index map
for (auto BB : BF.layout()) {
BBToIndex[BB] = IndexToBB.size();
IndexToBB.push_back(BB);
}
Weight.resize(N);
for (auto BB : BF.layout()) {
auto BI = BB->branch_info_begin();
Weight[BBToIndex[BB]].resize(N);
for (auto I : BB->successors()) {
if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE)
Weight[BBToIndex[BB]][BBToIndex[I]] = BI->Count;
++BI;
}
}
std::vector<std::vector<int64_t>> DP;
DP.resize(1 << N);
for (auto &Elmt : DP) {
Elmt.resize(N, -1);
}
// Start with the entry basic block being allocated with cost zero
DP[1][0] = 0;
// Walk through TSP solutions using a bitmask to represent state (current set
// of BBs in the layout)
unsigned BestSet = 1;
unsigned BestLast = 0;
int64_t BestWeight = 0;
for (unsigned Set = 1; Set < (1U << N); ++Set) {
// Traverse each possibility of Last BB visited in this layout
for (unsigned Last = 0; Last < N; ++Last) {
// Case 1: There is no possible layout with this BB as Last
if (DP[Set][Last] == -1)
continue;
// Case 2: There is a layout with this Set and this Last, and we try
// to expand this set with New
for (unsigned New = 1; New < N; ++New) {
// Case 2a: BB "New" is already in this Set
if ((Set & (1 << New)) != 0)
continue;
// Case 2b: BB "New" is not in this set and we add it to this Set and
// record total weight of this layout with "New" as the last BB.
unsigned NewSet = (Set | (1 << New));
if (DP[NewSet][New] == -1)
DP[NewSet][New] = DP[Set][Last] + (int64_t)Weight[Last][New];
DP[NewSet][New] = std::max(DP[NewSet][New],
DP[Set][Last] + (int64_t)Weight[Last][New]);
if (DP[NewSet][New] > BestWeight) {
BestWeight = DP[NewSet][New];
BestSet = NewSet;
BestLast = New;
}
}
}
}
// Define final function layout based on layout that maximizes weight
unsigned Last = BestLast;
unsigned Set = BestSet;
std::vector<bool> Visited;
Visited.resize(N);
Visited[Last] = true;
Order.push_back(IndexToBB[Last]);
Set = Set & ~(1U << Last);
while (Set != 0) {
int64_t Best = -1;
for (unsigned I = 0; I < N; ++I) {
if (DP[Set][I] == -1)
continue;
if (DP[Set][I] > Best) {
Last = I;
Best = DP[Set][I];
}
}
Visited[Last] = true;
Order.push_back(IndexToBB[Last]);
Set = Set & ~(1U << Last);
}
std::reverse(Order.begin(), Order.end());
// Finalize layout with BBs that weren't assigned to the layout
for (auto BB : BF.layout()) {
if (Visited[BBToIndex[BB]] == false)
Order.push_back(BB);
}
}
void OptimizeReorderAlgorithm::reorderBasicBlocks(
const BinaryFunction &BF, BasicBlockOrder &Order) const {
if (BF.layout_empty())
return;
// Cluster basic blocks.
CAlgo->clusterBasicBlocks(BF);
if (opts::PrintClusters)
CAlgo->printClusters();
// Arrange basic blocks according to clusters.
for (ClusterAlgorithm::ClusterTy &Cluster : CAlgo->Clusters)
Order.insert(Order.end(), Cluster.begin(), Cluster.end());
}
// Cluster the blocks of BF (also collecting inter-cluster edge counts), order
// the clusters with a depth-first traversal of the cluster graph that starts
// at cluster 0 and follows heavier edges first, refine that order with a
// stable sort based on cluster frequency, and append the blocks to Order
// cluster by cluster.
void OptimizeBranchReorderAlgorithm::reorderBasicBlocks(
const BinaryFunction &BF, BasicBlockOrder &Order) const {
if (BF.layout_empty())
return;
// Cluster basic blocks.
CAlgo->clusterBasicBlocks(BF, /* ComputeEdges = */true);
std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;
auto &ClusterEdges = CAlgo->ClusterEdges;
// Compute clusters' average frequencies.
CAlgo->computeClusterAverageFrequency();
std::vector<double> &AvgFreq = CAlgo->AvgFreq;
if (opts::PrintClusters)
CAlgo->printClusters();
// Cluster layout order
std::vector<uint32_t> ClusterOrder;
// Do a topological sort for clusters, prioritizing frequently-executed BBs
// during the traversal.
// Status holds per-cluster flag bits; Parent records the cluster from which
// each cluster was first pushed onto the stack (0 by default).
std::stack<uint32_t> Stack;
std::vector<uint32_t> Status;
std::vector<uint32_t> Parent;
Status.resize(Clusters.size(), 0);
Parent.resize(Clusters.size(), 0);
constexpr uint32_t STACKED = 1;
constexpr uint32_t VISITED = 2;
// The traversal starts at cluster 0.
Status[0] = STACKED;
Stack.push(0);
while (!Stack.empty()) {
uint32_t I = Stack.top();
// First time I is on top of the stack: mark it visited and push its
// successors, leaving I itself on the stack.
if (!(Status[I] & VISITED)) {
Status[I] |= VISITED;
// Order successors by weight. The comparator makes the lightest edge the
// top of the priority queue, so the heaviest successor is pushed onto the
// stack last and is therefore explored first.
auto ClusterComp = [&ClusterEdges, I](uint32_t A, uint32_t B) {
return ClusterEdges[I][A] > ClusterEdges[I][B];
};
std::priority_queue<uint32_t, std::vector<uint32_t>,
decltype(ClusterComp)> SuccQueue(ClusterComp);
// Consider only non-empty target clusters with a positive edge count that
// have not been stacked yet. Assigning STACKED (not OR-ing) is fine here
// because such a cluster has no flag set.
for (auto &Target: ClusterEdges[I]) {
if (Target.second > 0 && !(Status[Target.first] & STACKED) &&
!Clusters[Target.first].empty()) {
Parent[Target.first] = I;
Status[Target.first] = STACKED;
SuccQueue.push(Target.first);
}
}
while (!SuccQueue.empty()) {
Stack.push(SuccQueue.top());
SuccQueue.pop();
}
continue;
}
// Already visited this node: all of its successors were handled, so emit it
// (post-order).
Stack.pop();
ClusterOrder.push_back(I);
}
// Reverse the post-order so that cluster 0 comes first.
std::reverse(ClusterOrder.begin(), ClusterOrder.end());
// Put unreachable clusters at the end
for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
if (!(Status[I] & VISITED) && !Clusters[I].empty())
ClusterOrder.push_back(I);
// Sort nodes with equal precedence
auto Beg = ClusterOrder.begin();
// Don't reorder the first cluster, which contains the function entry point
++Beg;
// The comparator walks the Parent chains until a cluster whose parent is 0 is
// reached: A is not placed before B if B is found among A's ancestors, A is
// placed before B if A is found among B's ancestors, and otherwise the
// cluster with the higher frequency goes first.
// NOTE(review): both walks start comparing at Parent[Parent[X]], so the
// direct parent is never compared - confirm this is intended.
std::stable_sort(Beg, ClusterOrder.end(),
[&AvgFreq, &Parent](uint32_t A, uint32_t B) {
uint32_t P = Parent[A];
while (Parent[P] != 0) {
if (Parent[P] == B)
return false;
P = Parent[P];
}
P = Parent[B];
while (Parent[P] != 0) {
if (Parent[P] == A)
return true;
P = Parent[P];
}
return AvgFreq[A] > AvgFreq[B];
});
if (opts::PrintClusters) {
errs() << "New cluster order: ";
auto Sep = "";
for (auto O : ClusterOrder) {
errs() << Sep << O;
Sep = ", ";
}
errs() << '\n';
}
// Arrange basic blocks according to cluster order.
for (uint32_t ClusterIndex : ClusterOrder) {
ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
Order.insert(Order.end(), Cluster.begin(), Cluster.end());
}
}
void OptimizeCacheReorderAlgorithm::reorderBasicBlocks(
const BinaryFunction &BF, BasicBlockOrder &Order) const {
if (BF.layout_empty())
return;
// Cluster basic blocks.
CAlgo->clusterBasicBlocks(BF);
std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;
// Compute clusters' average frequencies.
CAlgo->computeClusterAverageFrequency();
std::vector<double> &AvgFreq = CAlgo->AvgFreq;
if (opts::PrintClusters)
CAlgo->printClusters();
// Cluster layout order
std::vector<uint32_t> ClusterOrder;
// Order clusters based on average instruction execution frequency
for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
if (!Clusters[I].empty())
ClusterOrder.push_back(I);
// Don't reorder the first cluster, which contains the function entry point
std::stable_sort(std::next(ClusterOrder.begin()),
ClusterOrder.end(),
[&AvgFreq](uint32_t A, uint32_t B) {
return AvgFreq[A] > AvgFreq[B];
});
if (opts::PrintClusters) {
errs() << "New cluster order: ";
auto Sep = "";
for (auto O : ClusterOrder) {
errs() << Sep << O;
Sep = ", ";
}
errs() << '\n';
}
// Arrange basic blocks according to cluster order.
for (uint32_t ClusterIndex : ClusterOrder) {
ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
Order.insert(Order.end(), Cluster.begin(), Cluster.end());
}
}
// Keep the first block of the layout in front and append all other blocks in
// reverse layout order.
void ReverseReorderAlgorithm::reorderBasicBlocks(
    const BinaryFunction &BF, BasicBlockOrder &Order) const {
  if (BF.layout_empty())
    return;

  auto Entry = *BF.layout_begin();
  Order.push_back(Entry);

  // Walk the layout backwards until the entry block is reached again.
  auto It = BF.layout_rbegin();
  while (*It != Entry) {
    Order.push_back(*It);
    ++It;
  }
}
// Cluster the blocks of BF, keep the first non-empty cluster in front and put
// the remaining non-empty clusters in a pseudo-random order derived from
// opts::RandomSeed, then append the blocks to Order cluster by cluster.
//
// The shuffle uses a locally seeded engine with std::shuffle rather than
// std::srand/std::random_shuffle: std::random_shuffle was removed in C++17,
// and a local engine does not disturb the global C random state.
void RandomClusterReorderAlgorithm::reorderBasicBlocks(
    const BinaryFunction &BF, BasicBlockOrder &Order) const {
  if (BF.layout_empty())
    return;

  // Cluster basic blocks.
  CAlgo->clusterBasicBlocks(BF);
  std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;

  if (opts::PrintClusters)
    CAlgo->printClusters();

  // Cluster layout order
  std::vector<uint32_t> ClusterOrder;

  // Collect the indices of all non-empty clusters.
  for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
    if (!Clusters[I].empty())
      ClusterOrder.push_back(I);

  // Shuffle everything but the first cluster, which contains the function
  // entry point. The same seed always yields the same engine state.
  const uint32_t Seed = opts::RandomSeed;
  std::mt19937 Generator(Seed);
  std::shuffle(std::next(ClusterOrder.begin()), ClusterOrder.end(), Generator);

  if (opts::PrintClusters) {
    errs() << "New cluster order: ";
    auto Sep = "";
    for (auto O : ClusterOrder) {
      errs() << Sep << O;
      Sep = ", ";
    }
    errs() << '\n';
  }

  // Arrange basic blocks according to cluster order.
  for (uint32_t ClusterIndex : ClusterOrder) {
    ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
    Order.insert(Order.end(), Cluster.begin(), Cluster.end());
  }
}