Files
llvm/bolt/ReorderAlgorithm.cpp
Maksim Panchenko bc8a456309 ICF improvements.
Summary:
Re-worked the way ICF operates. The pass now checks for more than just
call instructions, but also for all references including function
pointers. Jump tables are handled too.

(cherry picked from FBD4372491)
2016-12-21 17:13:56 -08:00

699 lines
22 KiB
C++

//===--- ReorderAlgorithm.cpp - Basic block reorderng algorithms ----------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Implements different basic block reordering algorithms.
//
//===----------------------------------------------------------------------===//
#include "ReorderAlgorithm.h"
#include "BinaryBasicBlock.h"
#include "BinaryFunction.h"
#include "llvm/Support/CommandLine.h"
#include <queue>
#include <functional>
#undef DEBUG_TYPE
#define DEBUG_TYPE "bolt"
using namespace llvm;
using namespace bolt;
namespace opts {
static cl::opt<bool>
PrintClusters("print-clusters", cl::desc("print clusters"), cl::ZeroOrMore);
static cl::opt<uint32_t>
RandomSeed("bolt-seed",
cl::desc("seed for randomization"),
cl::init(42),
cl::ZeroOrMore);
} // namespace opts
namespace {
template <class T>
inline void hashCombine(size_t &Seed, const T &Val) {
std::hash<T> Hasher;
Seed ^= Hasher(Val) + 0x9e3779b9 + (Seed << 6) + (Seed >> 2);
}
template <typename A, typename B>
struct HashPair {
size_t operator()(const std::pair<A,B>& Val) const {
std::hash<A> Hasher;
size_t Seed = Hasher(Val.first);
hashCombine(Seed, Val.second);
return Seed;
}
};
}
void ClusterAlgorithm::computeClusterAverageFrequency() {
AvgFreq.resize(Clusters.size(), 0.0);
for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
double Freq = 0.0;
for (auto BB : Clusters[I]) {
if (BB->getNumNonPseudos() > 0)
Freq += ((double) BB->getExecutionCount()) / BB->getNumNonPseudos();
}
AvgFreq[I] = Freq;
}
}
void ClusterAlgorithm::printClusters() const {
for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
errs() << "Cluster number " << I;
if (AvgFreq.size() == Clusters.size())
errs() << " (frequency: " << AvgFreq[I] << ")";
errs() << " : ";
auto Sep = "";
for (auto BB : Clusters[I]) {
errs() << Sep << BB->getName();
Sep = ", ";
}
errs() << "\n";
}
}
void ClusterAlgorithm::reset() {
Clusters.clear();
ClusterEdges.clear();
AvgFreq.clear();
}
void GreedyClusterAlgorithm::EdgeTy::print(raw_ostream &OS) const {
OS << Src->getName() << " -> " << Dst->getName() << ", count: " << Count;
}
size_t GreedyClusterAlgorithm::EdgeHash::operator()(const EdgeTy &E) const {
HashPair<const BinaryBasicBlock *, const BinaryBasicBlock *> Hasher;
return Hasher(std::make_pair(E.Src, E.Dst));
}
bool GreedyClusterAlgorithm::EdgeEqual::operator()(
const EdgeTy &A, const EdgeTy &B) const {
return A.Src == B.Src && A.Dst == B.Dst;
}
void GreedyClusterAlgorithm::clusterBasicBlocks(const BinaryFunction &BF,
bool ComputeEdges) {
reset();
// Greedy heuristic implementation for the TSP, applied to BB layout. Try to
// maximize weight during a path traversing all BBs. In this way, we will
// convert the hottest branches into fall-throughs.
// This is the queue of edges from which we will pop edges and use them to
// cluster basic blocks in a greedy fashion.
std::vector<EdgeTy> Queue;
// Initialize inter-cluster weights.
if (ComputeEdges)
ClusterEdges.resize(BF.layout_size());
// Initialize clusters and edge queue.
for (auto BB : BF.layout()) {
// Create a cluster for this BB.
uint32_t I = Clusters.size();
Clusters.emplace_back();
auto &Cluster = Clusters.back();
Cluster.push_back(BB);
BBToClusterMap[BB] = I;
// Populate priority queue with edges.
auto BI = BB->branch_info_begin();
for (auto &I : BB->successors()) {
assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
"attempted reordering blocks of function with no profile data");
Queue.emplace_back(EdgeTy(BB, I, BI->Count));
++BI;
}
}
// Sort and adjust the edge queue.
initQueue(Queue, BF);
// Grow clusters in a greedy fashion.
while (!Queue.empty()) {
auto E = Queue.back();
Queue.pop_back();
const auto *SrcBB = E.Src;
const auto *DstBB = E.Dst;
DEBUG(dbgs() << "Popped edge ";
E.print(dbgs());
dbgs() << "\n");
// Case 1: BBSrc and BBDst are the same. Ignore this edge
if (SrcBB == DstBB || DstBB == *BF.layout_begin()) {
DEBUG(dbgs() << "\tIgnored (same src, dst)\n");
continue;
}
int I = BBToClusterMap[SrcBB];
int J = BBToClusterMap[DstBB];
// Case 2: If they are already allocated at the same cluster, just increase
// the weight of this cluster
if (I == J) {
if (ComputeEdges)
ClusterEdges[I][I] += E.Count;
DEBUG(dbgs() << "\tIgnored (src, dst belong to the same cluster)\n");
continue;
}
auto &ClusterA = Clusters[I];
auto &ClusterB = Clusters[J];
if (areClustersCompatible(ClusterA, ClusterB, E)) {
// Case 3: SrcBB is at the end of a cluster and DstBB is at the start,
// allowing us to merge two clusters.
for (auto BB : ClusterB)
BBToClusterMap[BB] = I;
ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end());
ClusterB.clear();
if (ComputeEdges) {
// Increase the intra-cluster edge count of cluster A with the count of
// this edge as well as with the total count of previously visited edges
// from cluster B cluster A.
ClusterEdges[I][I] += E.Count;
ClusterEdges[I][I] += ClusterEdges[J][I];
// Iterate through all inter-cluster edges and transfer edges targeting
// cluster B to cluster A.
for (uint32_t K = 0, E = ClusterEdges.size(); K != E; ++K)
ClusterEdges[K][I] += ClusterEdges[K][J];
}
// Adjust the weights of the remaining edges and re-sort the queue.
adjustQueue(Queue, BF);
DEBUG(dbgs() << "\tMerged clusters of src, dst\n");
} else {
// Case 4: Both SrcBB and DstBB are allocated in positions we cannot
// merge them. Add the count of this edge to the inter-cluster edge count
// between clusters A and B to help us decide ordering between these
// clusters.
if (ComputeEdges)
ClusterEdges[I][J] += E.Count;
DEBUG(dbgs() << "\tIgnored (src, dst belong to incompatible clusters)\n");
}
}
}
void GreedyClusterAlgorithm::reset() {
ClusterAlgorithm::reset();
BBToClusterMap.clear();
}
void PHGreedyClusterAlgorithm::initQueue(
std::vector<EdgeTy> &Queue, const BinaryFunction &BF) {
// Define a comparison function to establish SWO between edges.
auto Comp = [&BF] (const EdgeTy &A, const EdgeTy &B) {
// With equal weights, prioritize branches with lower index
// source/destination. This helps to keep original block order for blocks
// when optimal order cannot be deducted from a profile.
if (A.Count == B.Count) {
const auto SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src);
return (SrcOrder != 0)
? SrcOrder > 0
: BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0;
}
return A.Count < B.Count;
};
// Sort edges in increasing profile count order.
std::sort(Queue.begin(), Queue.end(), Comp);
}
void PHGreedyClusterAlgorithm::adjustQueue(
std::vector<EdgeTy> &Queue, const BinaryFunction &BF) {
// Nothing to do.
return;
}
bool PHGreedyClusterAlgorithm::areClustersCompatible(
const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const {
return Front.back() == E.Src && Back.front() == E.Dst;
}
int64_t MinBranchGreedyClusterAlgorithm::calculateWeight(
const EdgeTy &E, const BinaryFunction &BF) const {
const BinaryBasicBlock *SrcBB = E.Src;
const BinaryBasicBlock *DstBB = E.Dst;
// Initial weight value.
int64_t W = (int64_t)E.Count;
// Adjust the weight by taking into account other edges with the same source.
auto BI = SrcBB->branch_info_begin();
for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) {
assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
"attempted reordering blocks of function with no profile data");
assert(BI->Count <= std::numeric_limits<int64_t>::max() &&
"overflow detected");
// Ignore edges with same source and destination, edges that target the
// entry block as well as the edge E itself.
if (SuccBB != SrcBB && SuccBB != *BF.layout_begin() && SuccBB != DstBB)
W -= (int64_t)BI->Count;
++BI;
}
// Adjust the weight by taking into account other edges with the same
// destination.
for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) {
// Ignore edges with same source and destination as well as the edge E
// itself.
if (PredBB == DstBB || PredBB == SrcBB)
continue;
auto BI = PredBB->branch_info_begin();
for (const BinaryBasicBlock *SuccBB : PredBB->successors()) {
if (SuccBB == DstBB)
break;
++BI;
}
assert(BI != PredBB->branch_info_end() && "invalid control flow graph");
assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
"attempted reordering blocks of function with no profile data");
assert(BI->Count <= std::numeric_limits<int64_t>::max() &&
"overflow detected");
W -= (int64_t)BI->Count;
}
return W;
}
void MinBranchGreedyClusterAlgorithm::initQueue(
std::vector<EdgeTy> &Queue, const BinaryFunction &BF) {
// Initialize edge weights.
for (const EdgeTy &E : Queue)
Weight.emplace(std::make_pair(E, calculateWeight(E, BF)));
// Sort edges in increasing weight order.
adjustQueue(Queue, BF);
}
void MinBranchGreedyClusterAlgorithm::adjustQueue(
std::vector<EdgeTy> &Queue, const BinaryFunction &BF) {
// Define a comparison function to establish SWO between edges.
auto Comp = [&] (const EdgeTy &A, const EdgeTy &B) {
// With equal weights, prioritize branches with lower index
// source/destination. This helps to keep original block order for blocks
// when optimal order cannot be deduced from a profile.
if (Weight[A] == Weight[B]) {
const auto SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src);
return (SrcOrder != 0)
? SrcOrder > 0
: BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0;
}
return Weight[A] < Weight[B];
};
// Iterate through all remaining edges to find edges that have their
// source and destination in the same cluster.
std::vector<EdgeTy> NewQueue;
for (const EdgeTy &E : Queue) {
const auto *SrcBB = E.Src;
const auto *DstBB = E.Dst;
// Case 1: SrcBB and DstBB are the same or DstBB is the entry block. Ignore
// this edge.
if (SrcBB == DstBB || DstBB == *BF.layout_begin()) {
DEBUG(dbgs() << "\tAdjustment: Ignored edge ";
E.print(dbgs());
dbgs() << " (same src, dst)\n");
continue;
}
int I = BBToClusterMap[SrcBB];
int J = BBToClusterMap[DstBB];
auto &ClusterA = Clusters[I];
auto &ClusterB = Clusters[J];
// Case 2: They are already allocated at the same cluster or incompatible
// clusters. Adjust the weights of edges with the same source or
// destination, so that this edge has no effect on them any more, and ignore
// this edge. Also increase the intra- (or inter-) cluster edge count.
if (I == J || !areClustersCompatible(ClusterA, ClusterB, E)) {
if (!ClusterEdges.empty())
ClusterEdges[I][J] += E.Count;
DEBUG(dbgs() << "\tAdjustment: Ignored edge ";
E.print(dbgs());
dbgs() << " (src, dst belong to same cluster or incompatible "
"clusters)\n");
for (const auto *SuccBB : SrcBB->successors()) {
if (SuccBB == DstBB)
continue;
auto WI = Weight.find(EdgeTy(SrcBB, SuccBB, 0));
assert(WI != Weight.end() && "CFG edge not found in Weight map");
WI->second += (int64_t)E.Count;
}
for (const auto *PredBB : DstBB->predecessors()) {
if (PredBB == SrcBB)
continue;
auto WI = Weight.find(EdgeTy(PredBB, DstBB, 0));
assert(WI != Weight.end() && "CFG edge not found in Weight map");
WI->second += (int64_t)E.Count;
}
continue;
}
// Case 3: None of the previous cases is true, so just keep this edge in
// the queue.
NewQueue.emplace_back(E);
}
// Sort remaining edges in increasing weight order.
Queue.swap(NewQueue);
std::sort(Queue.begin(), Queue.end(), Comp);
}
bool MinBranchGreedyClusterAlgorithm::areClustersCompatible(
const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const {
return Front.back() == E.Src && Back.front() == E.Dst;
}
void MinBranchGreedyClusterAlgorithm::reset() {
GreedyClusterAlgorithm::reset();
Weight.clear();
}
void OptimalReorderAlgorithm::reorderBasicBlocks(
const BinaryFunction &BF, BasicBlockOrder &Order) const {
std::vector<std::vector<uint64_t>> Weight;
std::unordered_map<const BinaryBasicBlock *, int> BBToIndex;
std::vector<BinaryBasicBlock *> IndexToBB;
unsigned N = BF.layout_size();
// Populating weight map and index map
for (auto BB : BF.layout()) {
BBToIndex[BB] = IndexToBB.size();
IndexToBB.push_back(BB);
}
Weight.resize(N);
for (auto BB : BF.layout()) {
auto BI = BB->branch_info_begin();
Weight[BBToIndex[BB]].resize(N);
for (auto I : BB->successors()) {
if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE)
Weight[BBToIndex[BB]][BBToIndex[I]] = BI->Count;
++BI;
}
}
std::vector<std::vector<int64_t>> DP;
DP.resize(1 << N);
for (auto &Elmt : DP) {
Elmt.resize(N, -1);
}
// Start with the entry basic block being allocated with cost zero
DP[1][0] = 0;
// Walk through TSP solutions using a bitmask to represent state (current set
// of BBs in the layout)
unsigned BestSet = 1;
unsigned BestLast = 0;
int64_t BestWeight = 0;
for (unsigned Set = 1; Set < (1U << N); ++Set) {
// Traverse each possibility of Last BB visited in this layout
for (unsigned Last = 0; Last < N; ++Last) {
// Case 1: There is no possible layout with this BB as Last
if (DP[Set][Last] == -1)
continue;
// Case 2: There is a layout with this Set and this Last, and we try
// to expand this set with New
for (unsigned New = 1; New < N; ++New) {
// Case 2a: BB "New" is already in this Set
if ((Set & (1 << New)) != 0)
continue;
// Case 2b: BB "New" is not in this set and we add it to this Set and
// record total weight of this layout with "New" as the last BB.
unsigned NewSet = (Set | (1 << New));
if (DP[NewSet][New] == -1)
DP[NewSet][New] = DP[Set][Last] + (int64_t)Weight[Last][New];
DP[NewSet][New] = std::max(DP[NewSet][New],
DP[Set][Last] + (int64_t)Weight[Last][New]);
if (DP[NewSet][New] > BestWeight) {
BestWeight = DP[NewSet][New];
BestSet = NewSet;
BestLast = New;
}
}
}
}
// Define final function layout based on layout that maximizes weight
unsigned Last = BestLast;
unsigned Set = BestSet;
std::vector<bool> Visited;
Visited.resize(N);
Visited[Last] = true;
Order.push_back(IndexToBB[Last]);
Set = Set & ~(1U << Last);
while (Set != 0) {
int64_t Best = -1;
for (unsigned I = 0; I < N; ++I) {
if (DP[Set][I] == -1)
continue;
if (DP[Set][I] > Best) {
Last = I;
Best = DP[Set][I];
}
}
Visited[Last] = true;
Order.push_back(IndexToBB[Last]);
Set = Set & ~(1U << Last);
}
std::reverse(Order.begin(), Order.end());
// Finalize layout with BBs that weren't assigned to the layout
for (auto BB : BF.layout()) {
if (Visited[BBToIndex[BB]] == false)
Order.push_back(BB);
}
}
void OptimizeReorderAlgorithm::reorderBasicBlocks(
const BinaryFunction &BF, BasicBlockOrder &Order) const {
if (BF.layout_empty())
return;
// Cluster basic blocks.
CAlgo->clusterBasicBlocks(BF);
if (opts::PrintClusters)
CAlgo->printClusters();
// Arrange basic blocks according to clusters.
for (ClusterAlgorithm::ClusterTy &Cluster : CAlgo->Clusters)
Order.insert(Order.end(), Cluster.begin(), Cluster.end());
}
void OptimizeBranchReorderAlgorithm::reorderBasicBlocks(
const BinaryFunction &BF, BasicBlockOrder &Order) const {
if (BF.layout_empty())
return;
// Cluster basic blocks.
CAlgo->clusterBasicBlocks(BF, /* ComputeEdges = */true);
std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;
auto &ClusterEdges = CAlgo->ClusterEdges;
// Compute clusters' average frequencies.
CAlgo->computeClusterAverageFrequency();
std::vector<double> &AvgFreq = CAlgo->AvgFreq;
if (opts::PrintClusters)
CAlgo->printClusters();
// Cluster layout order
std::vector<uint32_t> ClusterOrder;
// Do a topological sort for clusters, prioritizing frequently-executed BBs
// during the traversal.
std::stack<uint32_t> Stack;
std::vector<uint32_t> Status;
std::vector<uint32_t> Parent;
Status.resize(Clusters.size(), 0);
Parent.resize(Clusters.size(), 0);
constexpr uint32_t STACKED = 1;
constexpr uint32_t VISITED = 2;
Status[0] = STACKED;
Stack.push(0);
while (!Stack.empty()) {
uint32_t I = Stack.top();
if (!(Status[I] & VISITED)) {
Status[I] |= VISITED;
// Order successors by weight
auto ClusterComp = [&ClusterEdges, I](uint32_t A, uint32_t B) {
return ClusterEdges[I][A] > ClusterEdges[I][B];
};
std::priority_queue<uint32_t, std::vector<uint32_t>,
decltype(ClusterComp)> SuccQueue(ClusterComp);
for (auto &Target: ClusterEdges[I]) {
if (Target.second > 0 && !(Status[Target.first] & STACKED) &&
!Clusters[Target.first].empty()) {
Parent[Target.first] = I;
Status[Target.first] = STACKED;
SuccQueue.push(Target.first);
}
}
while (!SuccQueue.empty()) {
Stack.push(SuccQueue.top());
SuccQueue.pop();
}
continue;
}
// Already visited this node
Stack.pop();
ClusterOrder.push_back(I);
}
std::reverse(ClusterOrder.begin(), ClusterOrder.end());
// Put unreachable clusters at the end
for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
if (!(Status[I] & VISITED) && !Clusters[I].empty())
ClusterOrder.push_back(I);
// Sort nodes with equal precedence
auto Beg = ClusterOrder.begin();
// Don't reorder the first cluster, which contains the function entry point
++Beg;
std::stable_sort(Beg, ClusterOrder.end(),
[&AvgFreq, &Parent](uint32_t A, uint32_t B) {
uint32_t P = Parent[A];
while (Parent[P] != 0) {
if (Parent[P] == B)
return false;
P = Parent[P];
}
P = Parent[B];
while (Parent[P] != 0) {
if (Parent[P] == A)
return true;
P = Parent[P];
}
return AvgFreq[A] > AvgFreq[B];
});
if (opts::PrintClusters) {
errs() << "New cluster order: ";
auto Sep = "";
for (auto O : ClusterOrder) {
errs() << Sep << O;
Sep = ", ";
}
errs() << '\n';
}
// Arrange basic blocks according to cluster order.
for (uint32_t ClusterIndex : ClusterOrder) {
ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
Order.insert(Order.end(), Cluster.begin(), Cluster.end());
}
}
void OptimizeCacheReorderAlgorithm::reorderBasicBlocks(
const BinaryFunction &BF, BasicBlockOrder &Order) const {
if (BF.layout_empty())
return;
// Cluster basic blocks.
CAlgo->clusterBasicBlocks(BF);
std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;
// Compute clusters' average frequencies.
CAlgo->computeClusterAverageFrequency();
std::vector<double> &AvgFreq = CAlgo->AvgFreq;
if (opts::PrintClusters)
CAlgo->printClusters();
// Cluster layout order
std::vector<uint32_t> ClusterOrder;
// Order clusters based on average instruction execution frequency
for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
if (!Clusters[I].empty())
ClusterOrder.push_back(I);
// Don't reorder the first cluster, which contains the function entry point
std::stable_sort(std::next(ClusterOrder.begin()),
ClusterOrder.end(),
[&AvgFreq](uint32_t A, uint32_t B) {
return AvgFreq[A] > AvgFreq[B];
});
if (opts::PrintClusters) {
errs() << "New cluster order: ";
auto Sep = "";
for (auto O : ClusterOrder) {
errs() << Sep << O;
Sep = ", ";
}
errs() << '\n';
}
// Arrange basic blocks according to cluster order.
for (uint32_t ClusterIndex : ClusterOrder) {
ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
Order.insert(Order.end(), Cluster.begin(), Cluster.end());
}
}
void ReverseReorderAlgorithm::reorderBasicBlocks(
const BinaryFunction &BF, BasicBlockOrder &Order) const {
if (BF.layout_empty())
return;
auto FirstBB = *BF.layout_begin();
Order.push_back(FirstBB);
for (auto RLI = BF.layout_rbegin(); *RLI != FirstBB; ++RLI)
Order.push_back(*RLI);
}
void RandomClusterReorderAlgorithm::reorderBasicBlocks(
const BinaryFunction &BF, BasicBlockOrder &Order) const {
if (BF.layout_empty())
return;
// Cluster basic blocks.
CAlgo->clusterBasicBlocks(BF);
std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;
if (opts::PrintClusters)
CAlgo->printClusters();
// Cluster layout order
std::vector<uint32_t> ClusterOrder;
// Order clusters based on average instruction execution frequency
for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
if (!Clusters[I].empty())
ClusterOrder.push_back(I);
std::srand(opts::RandomSeed);
std::random_shuffle(std::next(ClusterOrder.begin()), ClusterOrder.end());
if (opts::PrintClusters) {
errs() << "New cluster order: ";
auto Sep = "";
for (auto O : ClusterOrder) {
errs() << Sep << O;
Sep = ", ";
}
errs() << '\n';
}
// Arrange basic blocks according to cluster order.
for (uint32_t ClusterIndex : ClusterOrder) {
ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
Order.insert(Order.end(), Cluster.begin(), Cluster.end());
}
}