Files
intel-graphics-compiler/visa/KernelCost.cpp
DianaChen f0962ad863 IGA: Clean-up gcc warnings
Clean-up gcc warnings, such as
-Werror=misleading-indentation
-Werror=catch-value
-Werror=class-memaccess
-Werror=unused-variable
-Werror=unused-but-set-variable
2025-07-01 18:23:24 +02:00

618 lines
18 KiB
C++

/*========================== begin_copyright_notice ============================
Copyright (C) 2017-2021 Intel Corporation
SPDX-License-Identifier: MIT
============================= end_copyright_notice ===========================*/
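// This file contains the implementation of the kernel cost (KernelCost) pass,
// which estimates per-BB execution probabilities and per-function/per-loop
// cost metrics.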
#include "KernelCost.hpp"
#include "G4_IR.hpp"
#include "G4_Kernel.hpp"
#include "FlowGraph.h"
#include <list>
#include <unordered_map>
#include <iostream>
#include <algorithm>
using namespace vISA;
// Builds the analysis state for kernel pK.
//   pK     : kernel whose flow graph is analyzed; its loop info is bound to
//            m_loops.
//   BBInfo : per-BB info indexed by BB id. It must contain exactly one entry
//            per BB of the flow graph; staticCycle seeds each BB's cycle
//            count.
// Every BB starts with probability 0; probabilities are assigned later while
// run() executes.
KernelCost::KernelCost(G4_Kernel* pK, std::vector<VISA_BB_INFO> &BBInfo)
    : m_kernel(pK), m_loops(pK->fg.getLoops()) {
  vISA_ASSERT(BBInfo.size() == pK->fg.getNumBB(),
              "KernelCost(): invalid number of BBs");
  // Set up cost info for each BB
  for (G4_BB *BB : pK->fg.getBBList()) {
    BBCostInfo BCI;
    BCI.m_cycles = BBInfo[BB->getId()].staticCycle;
    BCI.m_prob = 0;
    m_BBCostInfo.insert(std::make_pair(BB, BCI));
  }
}
// Entry point of the analysis. Runs, in order: loop-info refresh, table setup
// (init), probability propagation (calculateProb), metric collection
// (collectPerfMetrics), and finally hands this object to the kernel.
void KernelCost::run() {
// Mark the loop info stale and ask for a recompute before it is queried
// (presumably forces a rebuild; see the loop-analysis class to confirm).
m_loops.setStale();
m_loops.recomputeIfStale();
init();
calculateProb();
collectPerfMetrics();
// save data into G4_Kernel for IGC to access
m_kernel->createKernelCostInfo(this);
}
// DFS_PO: depth-first walk from BB that appends each block to RPOT only after
// everything reached from it has been appended (i.e. post-order).
//   - A block already marked in `visited` is skipped.
//   - From a call block only the physically following block is explored.
//   - From an exit block nothing is explored.
void KernelCost::DFS_PO(G4_BB *BB) {
  auto &seen = visited[BB];
  if (seen == 1)
    return;
  seen = 1;

  if (BB->getBBType() & G4_BB_CALL_TYPE) {
    G4_BB *Next = BB->getPhysicalSucc();
    DFS_PO(Next);
  } else if ((BB->getBBType() & G4_BB_EXIT_TYPE) == 0) {
    for (G4_BB *Succ : BB->Succs)
      DFS_PO(Succ);
  }

  // Blocks are appended in post-order; reversing RPOT afterwards yields the
  // reverse post-order.
  RPOT.push_back(BB);
}
// Rebuilds RPOT as the reverse post-order of the blocks reached from EntryBB
// by DFS_PO(). Both RPOT and `visited` are cleared first; `visited` is then
// repopulated by the traversal.
void KernelCost::doRPOT(G4_BB *EntryBB) {
  RPOT.clear();
  visited.clear();
  DFS_PO(EntryBB);
  std::reverse(RPOT.begin(), RPOT.end());
  // The entry block is appended last by the post-order walk, so it must be
  // first after the reversal. Uses vISA_ASSERT like the rest of this file.
  vISA_ASSERT(!RPOT.empty() && EntryBB == RPOT.front(),
              "doRPOT(): entry BB must be the first BB in RPOT");
}
// Pushes BB's probability onto its successors and marks BB as visited.
// When L is non-null, BB is being handled as part of loop L: the edge going
// back to L's header and every edge leaving L receive nothing.
void KernelCost::propagateBBProb(G4_BB* BB, Loop *L) {
  visited[BB] = 1;

  // Special case: a call block forwards its whole probability to the block
  // that physically follows it.
  if (BB->getBBType() & G4_BB_CALL_TYPE) {
    G4_BB *Next = BB->getPhysicalSucc();
    updateBBProb(Next, getBBProb(BB));
    return;
  }
  // An exit block has nothing to propagate.
  if (BB->getBBType() & G4_BB_EXIT_TYPE)
    return;

  std::vector<ProbType> edgeProbs;
  getSuccEdgeProb(BB, edgeProbs);
  if (edgeProbs.empty()) {
    // No succ that needs propagation
    return;
  }
  vISA_ASSERT(edgeProbs.size() == BB->Succs.size(),
              "Succ's prob has wrong vector size");

  size_t nextEdge = 0;
  for (G4_BB *Succ : BB->Succs) {
    // Edge index always advances, including for skipped successors.
    const size_t edge = nextEdge++;
    // Inside a loop: skip the back-edge (it does not affect the header's
    // prob) and skip successors that are outside the loop.
    const bool skipped =
        L && (Succ == L->getHeader() || !L->contains(Succ));
    if (!skipped)
      updateBBProb(Succ, edgeProbs[edge]);
  }
}
// Propagates probabilities through the blocks of loop L and then assigns
// probabilities to L's immediate exit blocks.
//   L        : loop being processed.
//   RPOT_pos : index in RPOT where the walk starts; callers pass the position
//              of L's header.
// Child loops are handled recursively when their header is reached.
//
// Redo exit bb's prob
//   1. loop's immediate exit is dependent upon the loop header's prob
//   2. where the exit target is does not matter
void KernelCost::propagateLoopProb(Loop *L, int RPOT_pos) {
  // Sum of the edge probabilities entering ExitBB from predecessors whose
  // innermost loop is exactly `loop`.
  auto getLoopExitProb = [this](Loop *loop, G4_BB *ExitBB) {
    ProbType retProb = 0;
    for (G4_BB *B : ExitBB->Preds) {
      Loop *iL = m_loops.getInnerMostLoop(B);
      if (!iL || iL != loop)
        continue;
      std::vector<ProbType> tProb;
      getSuccEdgeProb(B, tProb);
      if (tProb.empty())
        continue;
      size_t i = 0;
      for (G4_BB *S : B->Succs) {
        if (S == ExitBB) {
          // getSuccEdgeProb() may return fewer entries than B has successors
          // (e.g. a single entry for a call block); never index past the end.
          if (i < tProb.size())
            retProb += tProb[i];
          break;
        }
        ++i;
      }
    }
    return retProb;
  };

  G4_BB *H = L->getHeader();
  for (int i = RPOT_pos; i < (int)RPOT.size(); ++i) {
    G4_BB *BB = RPOT[i];
    // Skip BBs that were already visited and BBs outside this loop
    if (visited[BB] || !L->contains(BB))
      continue;
    // Recursively handle immediate child loop
    Loop *CL = L->getOuterMostChildLoop(BB);
    if (CL && CL->getHeader() == BB)
      propagateLoopProb(CL, i);
    else if (!CL)
      propagateBBProb(BB, L);
  }

  // Calculate exits' probability. Only loop's exit BBs that are 1 level up
  // is considered. For any exit that is more than 1 level up, it will be
  // handled when the outer loop exits are handled.
  // NOTE(review): the check below treats an exit as "1 level up" when its
  // innermost loop has nesting level L_level + 1 — confirm this matches the
  // direction in which Loop::getNestingLevel() counts.
  std::vector<G4_BB *> &lpExits = L->getLoopExits();
  std::vector<ProbType> immExitProbs;
  std::vector<G4_BB *> immExitBBs;
  uint32_t L_level = L->getNestingLevel();
  ProbType totalP = 0;
  for (G4_BB *B : lpExits) {
    Loop *tL = m_loops.getInnerMostLoop(B);
    if ((!tL && !L->parent) ||
        (tL && tL->getNestingLevel() == (L_level + 1))) {
      immExitBBs.push_back(B);
      ProbType tProb = getLoopExitProb(L, B);
      immExitProbs.push_back(tProb);
      totalP += tProb;
    }
  }

  // Normalize exit BB's probability so that sum of all prob(immediateExitBB)
  // is the same as prob(H).
  //   L0 :               P=1
  //     L1 :             P=0.5
  //         goto E0
  //         goto E1
  //     E1:
  //   E0
  // The following code will have Prob(E1) = 0.5 and Prob(E0) = 1 by handling
  // E1 for loop L1 and E0 for loop L0 (L1 does not handle E0!).
  //
  if (!immExitProbs.empty()) {
    ProbType P = getBBProb(H);
    // totalP should not be larger than P. Here make sure it is the case.
    totalP = std::min(totalP, P);
    // totalP can be 0 (header prob is 0, or no exit edge originates directly
    // in L). Dividing by it would produce inf/NaN, and converting that to
    // ProbType is undefined. With factor 0 every exit but the first gets 0
    // and the first exit receives all of P, so the sum still equals P.
    const float factor = (totalP != 0) ? (float)P / totalP : 0.0f;
    ProbType P0 = P;
    for (size_t i = 1; i < immExitProbs.size(); ++i) {
      G4_BB *B = immExitBBs[i];
      ProbType tP = (ProbType)(factor * immExitProbs[i]);
      updateBBProb(B, tP);
      // make sure no overflow
      P0 = (P0 >= tP ? P0 - tP : 0);
    }
    // The first exit absorbs whatever is left, including rounding error.
    G4_BB *B0 = immExitBBs[0];
    updateBBProb(B0, P0);
  }
}
void KernelCost::calculateProb() {
// Each function's probability is calculated independently,
// assuming their entry prob is 1.
// Leaf subroutine is calculated first, and kernel is cacluated last.
for (int f = 0; f < (int)m_metrics.size(); ++f) {
G4_BB *StartBB = m_metrics[f].m_funcInfo->getInitBB();
doRPOT(StartBB);
// Entry has prob 1 always.
updateBBProb(StartBB, MAX_PROB_POINTS);
// Start propagation in Rerverse Post-order traveral.
visited.clear();
for (int i = 0; i < (int)RPOT.size(); ++i) {
G4_BB *BB = RPOT[i];
if (visited[BB])
continue;
Loop *L = m_loops.getOuterMostLoop(BB);
if (L && L->getHeader() == BB)
propagateLoopProb(L, i);
else if (!L)
propagateBBProb(BB);
}
}
}
// Appends to SuccEdgeProb the probability carried by the outgoing edges of BB.
//   - single successor or call block : one entry, BB's own probability
//   - no successor or exit block     : nothing is appended
//   - two-way non-uniform branch     : both edges get BB's probability
//   - two-way uniform branch         : [fall-through, target] split using a
//                                      heuristic on the flag-defining compare
//   - anything else                  : even split, remainder goes to entry 0
void KernelCost::getSuccEdgeProb(G4_BB *BB,
                                 std::vector<ProbType>& SuccEdgeProb) {
  const ProbType bbProb = getBBProb(BB);
  const uint32_t numSuccs = (uint32_t)BB->Succs.size();

  if (numSuccs == 1 || (BB->getBBType() & G4_BB_CALL_TYPE)) {
    SuccEdgeProb.push_back(bbProb);
    return;
  }
  if (numSuccs == 0 || (BB->getBBType() & G4_BB_EXIT_TYPE))
    return;

  G4_InstCF *br = BB->getLastCFInst();
  if (!br || numSuccs != 2) {
    // Shouldn't reach this place, but do it in case.
    // Evenly assign probability to each edge
    // (Nice to keep the sum of all succs' prob the same as BB's)
    SuccEdgeProb.resize(numSuccs);
    const ProbType share = (bbProb / numSuccs);
    ProbType handedOut = 0;
    for (int k = 1; k < (int)numSuccs; ++k) {
      SuccEdgeProb[k] = share;
      handedOut += share;
    }
    SuccEdgeProb[0] = (bbProb >= handedOut ? bbProb - handedOut : 0);
    return;
  }

  // common case: a two-way branch
  if (!br->isUniform()) {
    SuccEdgeProb.insert(SuccEdgeProb.end(), numSuccs, bbProb);
    return;
  }

  const float kLikely = 0.9f;
  const float kUnlikely = 0.1f;
  float takenP = 0.5;
  bool defByDst;
  G4_INST *flagDef = getBranchFlagLocalDef(BB, defByDst);
  if (flagDef && !defByDst) {
    // The flag comes from a conditional modifier: guess from the compare.
    G4_Predicate *Pred = br->getPredicate();
    const bool negated = (Pred->getState() == G4_PredState::PredState_Minus);
    G4_CondMod *cMod = flagDef->getCondMod();
    vISA_ASSERT(cMod, "ICE: condMod should be preset\n");
    switch (cMod->getMod()) {
    case G4_CondModifier::Mod_z:
    case G4_CondModifier::Mod_e:
    case G4_CondModifier::Mod_o:
    case G4_CondModifier::Mod_u:
      // ==, overflow, unordered : unlikely
      takenP = negated ? kLikely : kUnlikely;
      break;
    case G4_CondModifier::Mod_nz:
    case G4_CondModifier::Mod_ne:
      // != : likely
      takenP = negated ? kUnlikely : kLikely;
      break;
    default:
      // keep the 50/50 default
      break;
    }
  }

  const ProbType targetProb = (ProbType)(takenP * bbProb);
  const ProbType fallthruProb = (bbProb >= targetProb ? bbProb - targetProb : 0);
  SuccEdgeProb.push_back(fallthruProb);
  SuccEdgeProb.push_back(targetProb);
}
// Scans BB backwards, starting just before its last instruction, for the
// nearest instruction whose destination or conditional modifier is not
// disjoint from the predicate of BB's last control-flow instruction.
// Returns that instruction, or nullptr when BB has no CF instruction, the CF
// instruction has no predicate, or no such instruction is found in BB.
// DefByDst is set to true only when the match came from the instruction's
// (non-null) destination; otherwise it is false.
G4_INST* KernelCost::getBranchFlagLocalDef(G4_BB* BB, bool& DefByDst) {
  DefByDst = false;

  G4_InstCF *br = BB->getLastCFInst();
  G4_Predicate *P = br ? br->getPredicate() : nullptr;
  if (!P)
    return nullptr;

  // Reverse iteration; step once so the walk begins at the instruction
  // preceding the last one.
  auto RI = BB->rbegin();
  ++RI;
  const auto REnd = BB->rend();
  while (RI != REnd) {
    G4_INST *Inst = *RI;
    G4_Operand *DstOpnd = Inst->getDst();
    G4_Operand *CondOpnd = Inst->getCondMod();

    const bool hasRealDst = DstOpnd && !Inst->hasNULLDst();
    if (hasRealDst &&
        P->compareOperand(DstOpnd, Inst->getBuilder()) != Rel_disjoint) {
      DefByDst = true;
      return Inst;
    }
    if (CondOpnd &&
        P->compareOperand(CondOpnd, Inst->getBuilder()) != Rel_disjoint)
      return Inst;

    ++RI;
  }
  return nullptr;
}
// Fills CM with the cost of BB weighted by BB's probability
// (m_prob / MAX_PROB_POINTS):
//   - cycles      : weighted m_cycles, never less than 1
//   - load/store  : weighted sum over BB's send instructions; a non-zero sum
//                   never drops below 1 after weighting
// If BB ends with a call, the callee's estimated cost is added as well.
void KernelCost::calculateBBMetrics(CostMetricsWrapper& CM, G4_BB* BB)
{
  BBCostInfo &info = m_BBCostInfo[BB];
  const float weight = info.m_prob / (float)MAX_PROB_POINTS;

  // make sure no BBs have 0 cycles!
  CM.setCycles(std::max(1u, (uint32_t)(info.m_cycles * weight)));

  // Raw byte counts over all send instructions of the block.
  uint32_t loadBytes = 0;
  uint32_t storeBytes = 0;
  for (auto Inst : BB->getInstList()) {
    if (!Inst->isSend())
      continue;
    uint32_t readBytes = 0, writeBytes = 0;
    collectSendMetrics(Inst->asSendInst(), readBytes, writeBytes);
    loadBytes += readBytes;
    storeBytes += writeBytes;
  }

  // If loads/stores are not 0, make sure the final result is at least 1.
  auto weighted = [weight](uint32_t bytes) -> uint32_t {
    if (bytes == 0)
      return 0;
    return std::max(1u, (uint32_t)(bytes * weight));
  };
  CM.setLoadBytes(weighted(loadBytes));
  CM.setStoreBytes(weighted(storeBytes));

  if (BB->isEndWithCall()) {
    FuncInfo *callee = BB->getCalleeInfo();
    int calleeIdx = m_funcIndex[callee];
    CostMetricsWrapper &calleeCost = m_metrics[calleeIdx].m_estimateCost;
    CM.add(calleeCost, info.m_prob);
  }
}
void KernelCost::init()
{
// Set up info for all subroutines
int numFuncs = (int)m_kernel->fg.sortedFuncTable.size();
if (numFuncs > 0) {
m_metrics.resize(numFuncs);
for (int i = 0; i < numFuncs; ++i) {
FuncInfo *pFInfo = m_kernel->fg.sortedFuncTable[i];
m_metrics[i].m_funcInfo = pFInfo;
m_funcIndex[pFInfo] = i;
}
} else {
m_metrics.resize(1);
FuncInfo *kernelInfo = m_kernel->fg.kernelInfo;
m_metrics[0].m_funcInfo = kernelInfo;
m_funcIndex[kernelInfo] = 0;
numFuncs = 1;
}
// set up loops for kernel/subroutine in program order
BB_LIST &BBs = m_kernel->fg.getBBList();
for (int i = 0; i < numFuncs; ++i) {
FuncCost &FC = m_metrics[i];
FuncInfo *pFI = FC.m_funcInfo;
G4_BB *StartBB = pFI->getInitBB();
G4_BB *EndBB = pFI->getExitBB();
auto IS = std::find(BBs.begin(), BBs.end(), StartBB);
auto IE = std::find(BBs.begin(), BBs.end(), EndBB);
IE = std::next(IE);
for (auto II = IS; II != IE; ++II) {
G4_BB *BB = *II;
// all the loops with the same header : BB
std::list<Loop *> lps;
Loop *L = m_loops.getInnerMostLoop(BB);
while (L && L->getHeader() == BB) {
lps.push_front(L);
L = L->parent;
}
if (!lps.empty()) {
int nextIdx = (int)FC.m_allLoopsInProgramOrder.size();
for (auto al : lps) {
FC.m_allLoopsInProgramOrder.push_back(al);
LoopCost &LC = m_loopCosts[al];
G4_BB *backedgeSrc = al->backEdgeSrc();
int visaid = -1;
if (!al->backEdgeSrc()->empty()) {
G4_INST *br = backedgeSrc->back();
visaid = br->getVISAId();
}
LC.m_backedge_visaId = visaid;
LC.m_loopId = nextIdx;
++nextIdx;
}
}
}
}
}
// Recursively accumulates the cost of loop L into m_loopCosts[L]:
//   m_loopBodyCost.C : sum over L's BBs for which getOuterMostChildLoop()
//                      returns null
//   m_estimateCost   : (that body cost + m_estimateCost of every child loop
//                      whose header is met while walking L's BBs) * 16
void KernelCost::collectLoopMetrics(Loop *L) {
  LoopCost &Cost = m_loopCosts[L];

  for (G4_BB *bb : L->getBBs()) {
    Loop *Child = L->getOuterMostChildLoop(bb);
    if (!Child) {
      // Block is not covered by a child loop: it counts toward L's own body.
      CostMetricsWrapper bbCost;
      calculateBBMetrics(bbCost, bb);
      Cost.m_loopBodyCost.C.add(bbCost);
      continue;
    }
    if (Child->getHeader() != bb)
      continue; // child loop is costed once, when its header is reached

    collectLoopMetrics(Child);
    LoopCost &ChildCost = m_loopCosts[Child];
    Cost.m_estimateCost.add(ChildCost.m_estimateCost);
  }

  Cost.m_estimateCost.add(Cost.m_loopBodyCost.C);
  // Estimate loop cost assuming loop count = 16
  Cost.m_estimateCost.mul(16);
}
void KernelCost::collectPerfMetrics()
{
BB_LIST& BBs = m_kernel->fg.getBBList();
const int numFuncs = (int)m_metrics.size();
for (int i = 0; i < numFuncs; ++i) {
FuncCost &FC = m_metrics[i];
FuncInfo *pFI = FC.m_funcInfo;
G4_BB *StartBB = pFI->getInitBB();
G4_BB *EndBB = pFI->getExitBB();
auto IS = std::find(BBs.begin(), BBs.end(), StartBB);
auto IE = std::find(BBs.begin(), BBs.end(), EndBB);
IE = std::next(IE);
for (auto II = IS; II != IE; ++II) {
G4_BB *BB = *II;
Loop *L = m_loops.getOuterMostLoop(BB);
if (L && L->getHeader() == BB) {
collectLoopMetrics(L);
LoopCost &LC = m_loopCosts[L];
FC.m_estimateCost.add(LC.m_estimateCost);
} else if (!L) {
CostMetricsWrapper BCM;
calculateBBMetrics(BCM, BB);
FC.m_funcCost.C.add(BCM);
}
}
FC.m_estimateCost.add(FC.m_funcCost.C);
// Loop part of cost expr, set up in program order.
for (auto L : FC.m_allLoopsInProgramOrder) {
if (L->getNumImmChildLoops() == 0)
continue;
LoopCost &LC = m_loopCosts[L];
CostExprInternal &lce = LC.m_loopBodyCost;
// make sure to iterate loops in program order
std::list<Loop *> immLoops(L->begin(), L->end());
immLoops.sort([this](Loop *a, Loop *b) {
LoopCost *aLC = &m_loopCosts[a];
LoopCost *bLC = &m_loopCosts[b];
return aLC->m_loopId < bLC->m_loopId;
});
for (auto LI : immLoops) {
Loop *nested = LI;
LoopCost &nestedLC = m_loopCosts[nested];
lce.LoopCosts.push_back(&nestedLC);
}
}
}
if (m_kernel->getOption(vISA_dumpKCI) ||
m_kernel->getOption(vISA_dumpDetailKCI))
print(std::cout);
if (m_kernel->getOption(vISA_dumpKCIForLit)) {
printForLit(std::cout);
}
}
// Reports the bytes loaded (ldBytes) and stored (stBytes) by one send
// instruction, as given by its message descriptor. Both outputs are always
// assigned; a direction the message does not have is reported as 0.
//   load  : descriptor's dst length, when the message is read or read-write
//   store : descriptor's src1 length, when the message is write or read-write
void KernelCost::collectSendMetrics(G4_InstSend* SendI, uint32_t& ldBytes, uint32_t& stBytes)
{
  // Todo: make sure the bytes are calculated correctly!
  G4_SendDesc *desc = SendI->getMsgDesc();

  const bool doesLoad = desc->isRead() || desc->isReadWrite();
  ldBytes = doesLoad ? desc->getDstLenBytes() : 0;

  const bool doesStore = desc->isWrite() || desc->isReadWrite();
  // Make sure this is correct.
  stBytes = doesStore ? desc->getSrc1LenBytes() : 0;
}
void KernelCost::print(std::ostream &OS)
{
if (m_kernel->getOption(vISA_dumpDetailKCI)) {
// Dump G4
m_kernel->dumpToFile("KernelCost", true);
// dump detail info for each BB
OS << "\n\n---- BB Info ----\n\n";
BB_LIST &BBs = m_kernel->fg.getBBList();
for (auto II = BBs.begin(), IE = BBs.end(); II != IE; ++II) {
G4_BB *bb = *II;
BBCostInfo &BInfo = m_BBCostInfo[bb];
OS << "BB" << bb->getId()
<< ": Prob = " << (float)BInfo.m_prob / MAX_PROB_POINTS << " (0x"
<< std::hex << BInfo.m_prob << "/0x" << MAX_PROB_POINTS << std::dec
<< "); cycles = " << BInfo.m_cycles << "\n";
}
}
// Dump summary of cost metrics for each subroutine and kernel
const int numFuncs = (int)m_metrics.size();
for (int i = 0; i < numFuncs; ++i) {
const FuncCost &FC = m_metrics[i];
const CostMetricsWrapper &FC_ec = FC.m_estimateCost;
const char *name;
const char *nameKind = (((i + 1) == numFuncs) ? "kernel " : "subroutine ");
if ((i + 1) == numFuncs) {
name = m_kernel->getName();
} else {
// subroutine, get its entry label name
FuncInfo *pFI = FC.m_funcInfo;
G4_BB *StartBB = pFI->getInitBB();
name = StartBB->getLabel()->getLabelName();
}
OS << "\nEstimated Kernel Cost Metrics for reference only: "
<< nameKind << name << "\n"
<< " Total Cycles = " << FC_ec.getCycles() << "\n"
<< " Total Bytes Loaded = " << FC_ec.getLoadBytes() << "\n"
<< " Total Bytes Stored = " << FC_ec.getStoreBytes() << "\n\n\n";
// Cost expression (output to be written to zeinfo)
const CostMetricsWrapper &FCM = FC.m_funcCost.C;
OS << "\nKernel Cost Metrics : " << nameKind << name << "\n"
<< " Cycles (excluding loops) = " << FCM.getCycles() << "\n"
<< " Bytes Loaded (excluding loops) = " << FCM.getLoadBytes() << "\n"
<< " Bytes Stored (excluding loops) = " << FCM.getStoreBytes()
<< "\n\n";
for (const Loop *aL : FC.m_allLoopsInProgramOrder) {
LoopCost &LC = m_loopCosts[aL];
CostExprInternal &lce = LC.m_loopBodyCost;
int level = aL->getNestingLevel();
std::string indent(4 * level, ' ');
OS << indent << "L[" << LC.m_loopId << "] "
<< "[Header:BB" << aL->getHeader()->getId() << ", Tail:BB"
<< aL->backEdgeSrc()->getId() << ", visaid: " << LC.m_backedge_visaId
<< "]\n";
CostMetricsWrapper &LCM = lce.C;
OS << indent << " Loop body only, excluding nested loops\n"
<< indent << " Cycles = " << LCM.getCycles() << "\n"
<< indent << " Bytes Loaded = " << LCM.getLoadBytes() << "\n"
<< indent << " Bytes Stored = " << LCM.getStoreBytes() << "\n";
if (aL->getNumImmChildLoops() > 0)
OS << "\n";
}
}
}
// Compact dump for the last entry of m_metrics (the entry that print()
// labels as the kernel): one line per loop in program order giving its
// running index, loop id and nesting level, followed by the number of
// remaining m_metrics entries (the subroutines).
void KernelCost::printForLit(std::ostream &OS) {
  const FuncCost &Last = m_metrics.back();

  int seq = 0;
  for (const Loop *aL : Last.m_allLoopsInProgramOrder) {
    const LoopCost &LC = m_loopCosts[aL];
    OS << "Loop " << seq << ": id " << LC.m_loopId;
    OS << ", level " << aL->getNestingLevel() << "\n";
    ++seq;
  }

  OS << "#subroutines = " << m_metrics.size() - 1 << "\n";
}
// Convenience wrapper: prints the cost report to std::cout via print().
void KernelCost::dump() {
print(std::cout);
}
// Const overload of dump(): prints the cost report to std::cout.
// print() is a non-const member, so constness is cast away to call it.
void KernelCost::dump() const {
const_cast<KernelCost*>(this)->print(std::cout);
}
namespace vISA {
// Free-function entry point: builds a KernelCost for kernel pK from the
// per-BB info in BBInfo and runs the analysis. The KernelCost object is a
// local; run() passes it to G4_Kernel::createKernelCostInfo() before it goes
// out of scope.
void collectKernelCostInfo(G4_Kernel* pK, std::vector<VISA_BB_INFO> &BBInfo) {
KernelCost KCA(pK, BBInfo);
KCA.run();
}
}