Files
intel-graphics-compiler/visa/LocalRA.cpp
Cheng, Bu Qi 82a1986c0a RA change
_OS_DESCRIPTION
2025-07-18 02:38:09 +02:00

3068 lines
100 KiB
C++

/*========================== begin_copyright_notice ============================
Copyright (C) 2017-2021 Intel Corporation
SPDX-License-Identifier: MIT
============================= end_copyright_notice ===========================*/
#include "LocalRA.h"
#include "Assertions.h"
#include "DebugInfo.h"
#include "common.h"
#include <fstream>
#include <tuple>
using namespace vISA;
// #define _DEBUG
// #define cout cerr
#define NUM_PREGS_FOR_UNIQUE_ASSIGN 50
#define SPLIT_REF_CNT_THRESHOLD 3
#define SPLIT_USE_CNT_THRESHOLD 2
#define SPLIT_USE_DISTANCE_THRESHOLD 100
void getForbiddenGRFs(std::vector<unsigned int> &regNum, G4_Kernel &kernel,
unsigned stackCallRegSize, unsigned reserveSpillSize,
unsigned reservedRegNum);
void getCallerSaveGRF(std::vector<unsigned int> &regNum, G4_Kernel *kernel);
// Constructs the local register allocator for the kernel owned by g.
// kernel and builder are initialized from the GlobalRA instance; bc and gra
// are initialized from the bank-conflict pass b and from g itself.
LocalRA::LocalRA(BankConflictPass &b, GlobalRA &g)
: kernel(g.kernel), builder(g.builder), bc(b), gra(g) {}
// Translates the bank-conflict class GlobalRA recorded for dcl into the bank
// alignment requested when unique registers are handed out.
// FIXME: this code is rather suspicious (we return even alignment
// for first_half_odd???), should revisit
BankAlign LocalRA::getBankAlignForUniqueAssign(G4_Declare *dcl) {
  const auto conflictKind = gra.getBankConflict(dcl);

  const bool inFirstHalf = conflictKind == BANK_CONFLICT_FIRST_HALF_EVEN ||
                           conflictKind == BANK_CONFLICT_FIRST_HALF_ODD;
  if (inFirstHalf) {
    if (builder.oneGRFBankDivision()) {
      return BankAlign::Even;
    }
    return BankAlign::Even2GRF;
  }

  const bool inSecondHalf = conflictKind == BANK_CONFLICT_SECOND_HALF_EVEN ||
                            conflictKind == BANK_CONFLICT_SECOND_HALF_ODD;
  if (inSecondHalf) {
    if (builder.oneGRFBankDivision()) {
      return BankAlign::Odd;
    }
    return BankAlign::Odd2GRF;
  }

  // Any other classification imposes no bank preference.
  return BankAlign::Either;
}
// Returns true if the kernel contains a back edge, i.e. a CFG edge whose
// source block id is greater than or equal to its successor's block id.
// A call/return edge may also be considered a back edge depending on layout.
bool LocalRA::hasBackEdge() {
  for (auto curBB : kernel.fg) {
    // The error being guarded against is an empty block that nevertheless has
    // multiple successors. The condition must therefore accept empty blocks
    // only when they have at most one successor; the previous "> 1" test
    // accepted exactly the erroneous case and rejected the harmless one.
    vISA_ASSERT(curBB->size() > 0 || curBB->Succs.size() <= 1,
                "Error in detecting back-edge: Basic block found with no inst "
                "list and multiple successors.");

    // Empty blocks contribute no edges to the check below.
    if (curBB->size() == 0) {
      continue;
    }

    for (auto succ : curBB->Succs) {
      vISA_ASSERT(succ->size() > 0,
                  "Error in detecting back-edge: Destination basic block of "
                  "a jmp has no instructions.");
      if (curBB->getId() >= succ->getId()) {
        return true;
      }
    }
  }
  return false;
}
// Computes how many GRF rows a variable of `size` words occupies.
//   size        - variable size in words
//   nrows       - [out] total number of rows, counting a partial last row
//   lastRowSize - [out] number of words used in the last row; written only
//                 when the variable needs more than one row
//   builder     - supplies the number of words per GRF
void LocalRA::getRowInfo(int size, int &nrows, int &lastRowSize,
                         const IR_Builder &builder) {
  // Single-row case: lastRowSize is deliberately left untouched.
  if (size <= (int)builder.numEltPerGRF<Type_UW>()) {
    nrows = 1;
    return;
  }

  const auto wordsPerRow = builder.numEltPerGRF<Type_UW>();
  nrows = size / wordsPerRow;
  lastRowSize = size % wordsPerRow;
  if (lastRowSize != 0) {
    // A partially filled trailing row still counts as a row.
    nrows++;
  } else {
    // Exact multiple: the last row is completely used.
    lastRowSize = wordsPerRow;
  }
}
// For 3D kernels with more than two basic blocks: applies augmentation
// alignment when required, refreshes sub-register alignment and finally clears
// the per-declare masks that were borrowed while doing so.
void LocalRA::specialAlign() {
  const bool is3DTarget =
      kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D;
  if (!is3DTarget || !(kernel.fg.size() > 2)) {
    return;
  }

  if (gra.use4GRFAlign) {
    gra.augAlign();
  } else {
    // Generic augmentation alignment triggers at SIMD size >= dwords per GRF,
    // the non-generic variant only when strictly greater.
    const bool genericAug =
        GlobalRA::useGenericAugAlign(kernel.getPlatformGeneration());
    const bool simdWideEnough =
        genericAug ? kernel.getSimdSize() >= kernel.numEltPerGRF<Type_UD>()
                   : kernel.getSimdSize() > kernel.numEltPerGRF<Type_UD>();
    if (simdWideEnough) {
      // Set alignment of all GRF candidates
      // to 2GRF except for NoMask variables
#ifdef DEBUG_VERBOSE_ON
      DEBUG_VERBOSE("Updating alignment to Even for GRF candidates"
                    << "\n");
#endif
      gra.augAlign();
    }
  }

  gra.updateSubRegAlignment(builder.getGRFAlign());
  // The mask field of G4_Declare is piggy-backed on above, so it has to be
  // reset before anything else looks at it.
  resetMasks();
}
/*
* Pre-localRA analysis including:
* 1) Marking references to create live ranges, 2) alignment handling,
* 3) calculating and marking the registers available to local RA.
*
* Side effects visible in this function: sets numRegLRA, may clear
* gra.useFastRA / gra.useHybridRAwithSpill, and marks GRFs in pregs as
* unavailable or forbidden.
*/
void LocalRA::preLocalRAAnalysis() {
unsigned int numRowsEOT = 0;
bool lifetimeOpFound = false;
int numGRF = kernel.getNumRegTotal();
// Clear LocalLiveRange* computed preRA
gra.clearStaleLiveRanges();
// Mark references made to decls to sieve local from global ranges
markReferences(numRowsEOT, lifetimeOpFound);
// Mark those physical registers unavailable that are declared with Output
// attribute
blockOutputPhyRegs();
if (lifetimeOpFound == true) {
// Check whether pseudo_kill/lifetime.end are the only references
// for their respective variables. Remove them if so. Doing this
// helps reduce the number of variables in the symbol table, increasing
// the chances of skipping global RA.
removeUnrequiredLifetimeOps();
}
// Rows needed by EOT sources are excluded from what local RA may use.
unsigned int numRowsReserved = numRowsEOT;
// Remove unreferenced dcls
gra.removeUnreferencedDcls();
if (gra.useFastRA || gra.useHybridRAwithSpill) {
// Fast/hybrid RA wants GRFs set aside for spill code. If that reservation
// is at least as large as the callee-save register count, both modes are
// switched off and nothing extra is reserved.
unsigned reserveSpillSize = 0;
unsigned int spillRegSize = 0;
unsigned int indrSpillRegSize = 0;
gra.determineSpillRegSize(spillRegSize, indrSpillRegSize);
reserveSpillSize = spillRegSize + indrSpillRegSize;
if (reserveSpillSize >= kernel.stackCall.getNumCalleeSaveRegs()) {
gra.useFastRA = false;
gra.useHybridRAwithSpill = false;
numRegLRA = numGRF - numRowsReserved;
} else {
numRegLRA = numGRF - numRowsReserved - reserveSpillSize -
4; // spill Header, scratch offset, a0.2, WA
}
} else {
numRegLRA = numGRF - numRowsReserved;
}
bool isStackCall =
(kernel.fg.getHasStackCalls() || kernel.fg.getIsStackCallFunc());
unsigned int stackCallRegSize = isStackCall ? kernel.stackCall.numReservedABIGRF() : 0;
unsigned int reservedGRFNum =
builder.getOptions()->getuInt32Option(vISA_ReservedGRFNum);
if (isStackCall || reservedGRFNum || builder.getOption(vISA_Debug)) {
// Every GRF reported by getForbiddenGRFs is taken away from local RA.
std::vector<unsigned int> forbiddenRegs;
getForbiddenGRFs(forbiddenRegs, kernel, stackCallRegSize, 0,
reservedGRFNum);
for (unsigned int i = 0, size = forbiddenRegs.size(); i < size; i++) {
unsigned int regNum = forbiddenRegs[i];
pregs->setGRFUnavailable(regNum);
}
// FIXME: this will break if # of GRF is not 128.
if (isStackCall) {
// Set r60 to r99 as unavailable for local RA since these registers are
// callee saved. Local RA will use only caller save registers for
// allocation.
// NOTE(review): the helper is named getCallerSaveGRF although the comment
// above describes callee-saved registers — confirm against its definition.
std::vector<unsigned int> callerSaveRegs;
getCallerSaveGRF(callerSaveRegs, &kernel);
for (unsigned int i = 0, size = callerSaveRegs.size(); i < size; i++) {
unsigned int regNum = callerSaveRegs[i];
pregs->setGRFUnavailable(regNum);
}
// NOTE(review): this overwrites the numRegLRA computed above, including
// any spill reservation that was subtracted — confirm this is intended.
numRegLRA = (kernel.getNumRegTotal() - 3) - numRowsReserved;
}
if (builder.getOption(vISA_Debug)) {
// Since LocalRA is not undone when debug info generation is required,
// for keeping compile time low, we allow fewer physical registers
// as assignable candidates. Without this, we could run in to a
// situation where very few physical registers are available for GRA
// and it is unable to assign registers even with spilling.
#define USABLE_GRFS_WITH_DEBUG_INFO 80
// Largest dst/src0/src1 register length over all send instructions.
int maxSendReg = 0;
for (auto bb : kernel.fg) {
for (auto inst : *bb) {
if (inst->isSend() || inst->isSplitSend()) {
maxSendReg = ((int)inst->getMsgDesc()->getDstLenRegs() > maxSendReg)
? ((int)inst->getMsgDesc()->getDstLenRegs())
: maxSendReg;
maxSendReg =
((int)inst->getMsgDesc()->getSrc0LenRegs() > maxSendReg)
? ((int)inst->getMsgDesc()->getSrc0LenRegs())
: maxSendReg;
maxSendReg =
((int)inst->getMsgDesc()->getSrc1LenRegs() > maxSendReg)
? ((int)inst->getMsgDesc()->getSrc1LenRegs())
: maxSendReg;
}
}
}
// Shrink the usable range further if the largest send payload would not
// fit in the registers left over above USABLE_GRFS_WITH_DEBUG_INFO.
int maxRegsToUse = USABLE_GRFS_WITH_DEBUG_INFO;
if (maxSendReg > (numGRF - USABLE_GRFS_WITH_DEBUG_INFO)) {
maxRegsToUse = (numGRF - maxSendReg) - 10;
}
// Also check max size of addressed GRF
unsigned int maxAddressedRows = 0;
for (auto dcl : kernel.Declares) {
if (dcl->getAddressed() && maxAddressedRows < dcl->getNumRows()) {
maxAddressedRows = dcl->getNumRows();
}
}
// Assume indirect operand of maxAddressedRows exists
// on dst, src0, src1. This is overly conservative but
// should work for general cases.
if ((numGRF - maxRegsToUse) / 3 < (int)maxAddressedRows) {
maxRegsToUse = (numGRF - (maxAddressedRows * 3));
if (maxRegsToUse < 0)
maxRegsToUse = 0;
}
// Everything from maxRegsToUse upward is withheld from local RA.
for (int i = maxRegsToUse; i < numGRF; i++) {
pregs->setGRFUnavailable(i);
}
}
} else {
// No stack calls, no reserved GRFs and no debug mode: only r0/r1
// restrictions apply.
pregs->setSimpleGRFAvailable(true);
if (kernel.getInt32KernelAttr(Attributes::ATTR_Target) != VISA_3D ||
!builder.canWriteR0()) {
pregs->setR0Forbidden();
}
if (builder.mustReserveR1()) {
pregs->setR1Forbidden();
}
}
return;
}
// Attempts the trivial allocation in which every unallocated range receives
// its own register.
//   needGlobalRA         - [out] result of assignUniqueRegisters(); false when
//                          the trivial assignment covered everything
//   threeSourceCandidate - enables the bank-conflict aware variant and selects
//                          which RA type is recorded on success
void LocalRA::trivialAssignRA(bool &needGlobalRA, bool threeSourceCandidate) {
  RA_TRACE(std::cout << "\t--trivial RA\n");

  needGlobalRA = assignUniqueRegisters(threeSourceCandidate,
                                       builder.lowHighBundle(), false);
  if (needGlobalRA) {
    return;
  }

  kernel.setRAType(threeSourceCandidate ? RA_Type::TRIVIAL_BC_RA
                                        : RA_Type::TRIVIAL_RA);
}
// Entry point to local RA
bool LocalRA::localRAPass(bool doRoundRobin, bool doSplitLLR) {
bool needGlobalRA = true;
PhyRegsLocalRA localPregs = *pregs;
// Iterate over each BB and perform linear scan
VISA_DEBUG_VERBOSE({
localPregs.printBusyRegs();
printAddressTakenDecls();
printLocalRACandidates();
});
if (kernel.fg.funcInfoTable.empty() && !hasBackEdge()) {
calculateInputIntervals();
}
VISA_DEBUG_VERBOSE(printInputLiveIntervals());
for (auto curBB : kernel.fg) {
PhyRegsManager pregManager(builder, localPregs, doBCR);
std::vector<LocalLiveRange *> liveIntervals;
PhyRegSummary *summary = gra.createPhyRegSummary();
calculateLiveIntervals(curBB, liveIntervals);
VISA_DEBUG_VERBOSE(printLocalLiveIntervals(curBB, liveIntervals));
LinearScan ra(gra, liveIntervals, inputIntervals, pregManager, localPregs,
summary, numRegLRA, globalLRSize, doRoundRobin, doBCR,
highInternalConflict, doSplitLLR, kernel.getSimdSize());
ra.run(curBB, builder, LLRUseMap);
VISA_DEBUG_VERBOSE({
std::cout << "BB" << curBB->getId() << "\n";
summary->printBusyRegs();
});
// Tag summary of physical registers used by local RA
gra.addBBLRASummary(curBB, summary);
if (kernel.getOptions()->getOption(vISA_GenerateDebugInfo)) {
updateDebugInfo(kernel, liveIntervals);
}
}
if (unassignedRangeFound() == false) {
needGlobalRA = false;
VISA_DEBUG_VERBOSE(
std::cout
<< "Skipping global RA because local RA allocated all live-ranges.\n");
}
VISA_DEBUG_VERBOSE(localRAOptReport());
if (needGlobalRA == true &&
!(kernel.fg.getHasStackCalls() || kernel.fg.getIsStackCallFunc())) {
// Check whether unique physical registers can be assigned
// to global ranges.
bool twoBanksAssign = false;
if (builder.oneGRFBankDivision()) {
twoBanksAssign =
highInternalConflict || kernel.getSimdSize() == g4::SIMD16;
} else {
twoBanksAssign = highInternalConflict;
}
needGlobalRA = assignUniqueRegisters(
doBCR, twoBanksAssign,
gra.useHybridRAwithSpill && !doRoundRobin);
}
if (needGlobalRA && (doRoundRobin || gra.twoSrcBundleBCR)) {
undoLocalRAAssignments(true);
}
if (needGlobalRA == false) {
VISA_DEBUG(std::cout << "Allocated 100% GRF ranges without graph coloring."
<< "\n");
}
return needGlobalRA;
}
// Top-level driver of local register allocation.
// Runs the pre-analysis, optional bank-conflict setup, the trivial assignment
// and then the round-robin / first-fit passes in sequence.
// Returns true when local RA allocated everything, i.e. global RA is NOT
// needed (note the negation on the return statement).
bool LocalRA::localRA() {
RA_TRACE(std::cout << "--local RA--\n");
bool doRoundRobin = builder.getOption(vISA_LocalRARoundRobin);
int numGRF = kernel.getNumRegTotal();
// NOTE(review): pregs is pointed at this function-local object, so it must
// not be dereferenced after localRA() returns.
PhyRegsLocalRA phyRegs(builder, numGRF);
pregs = &phyRegs;
globalLRSize = 0;
bool reduceBCInTAandFF = false;
bool needGlobalRA = true;
// Live-range splitting is only considered for single-block 3D kernels when
// the option is enabled.
doSplitLLR = (builder.getOption(vISA_SpiltLLR) && kernel.fg.size() == 1 &&
kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D);
preLocalRAAnalysis();
bool reduceBCInRR = false;
if (builder.getOption(vISA_LocalBankConflictReduction) &&
(builder.hasBankCollision() || gra.twoSrcBundleBCR)) {
reduceBCInRR = bc.setupBankConflictsForKernel(
doRoundRobin, reduceBCInTAandFF, numRegLRA, highInternalConflict);
}
// Trivial assignment is skipped for stack-call kernels/functions and when
// split intrinsics were seen while marking references.
if (!kernel.fg.getHasStackCalls() && !kernel.fg.getIsStackCallFunc() &&
!hasSplitInsts) {
trivialAssignRA(needGlobalRA, reduceBCInTAandFF);
}
// countLiveIntervals() decides whether round-robin stays enabled.
if (doRoundRobin) {
doRoundRobin = countLiveIntervals();
}
if ((!doRoundRobin && reduceBCInTAandFF) ||
(doRoundRobin && reduceBCInRR)) // To avoid the scheduling issue for send
{
doBCR = true;
}
// Local RA passes:
// RR+BC --> FF+BC -->FF-->Hybrids
// or
// RR --> FF --> Hybrid
if (doRoundRobin) {
RA_TRACE(std::cout << "\t--round-robin " << (doBCR ? "BCR " : "")
<< "RA\n");
needGlobalRA = localRAPass(true, false);
// A failed round-robin pass falls through to the first-fit passes below.
if (needGlobalRA == true) {
doRoundRobin = false;
}
}
if (!doRoundRobin) {
if ((gra.forceBCR || gra.twoSrcBundleBCR) && doBCR) {
RA_TRACE(std::cout << "\t--first-fit BCR RA\n");
needGlobalRA = localRAPass(false, doSplitLLR);
}
if (needGlobalRA) {
RA_TRACE(std::cout << "\t--first-fit RA\n");
globalLRSize = 0;
if (gra.useHybridRAwithSpill) {
countLiveIntervals();
} else {
// Redundant with the reset just above; kept as in the original.
globalLRSize = 0;
}
specialAlign();
needGlobalRA = localRAPass(false, doSplitLLR);
}
}
// Record which flavour of local RA succeeded.
if (needGlobalRA == false) {
if (doRoundRobin) {
kernel.setRAType(doBCR ? RA_Type::LOCAL_ROUND_ROBIN_BC_RA
: RA_Type::LOCAL_ROUND_ROBIN_RA);
} else {
kernel.setRAType(doBCR ? RA_Type::LOCAL_FIRST_FIT_BC_RA
: RA_Type::LOCAL_FIRST_FIT_RA);
}
}
setLexicalID(true);
return !needGlobalRA;
}
// Clears the mask stored in GlobalRA for every declare of the kernel.
void LocalRA::resetMasks() {
  for (auto decl : kernel.Declares) {
    gra.setMask(decl, {});
  }
}
unsigned int LocalRA::getLargestInputGRF() {
unsigned int largestInput = 0;
for (auto dcl : kernel.Declares) {
// Find out the largest GRF regsiter occupied by input variable
// In case overlap with reserved registers
if ((dcl->getRegFile() == G4_INPUT || dcl->isLiveIn()) &&
dcl->isOutput() == false && !(dcl->getRegVar()->isAreg()) &&
dcl->getRegVar()->isPhyRegAssigned()) {
G4_RegVar *var = dcl->getRegVar();
unsigned int regNum = var->getPhyReg()->asGreg()->getRegNum();
unsigned int regOff = var->getPhyRegOff();
unsigned int largestGRF =
(regNum * kernel.numEltPerGRF<Type_UW>() +
(regOff * dcl->getElemSize()) / G4_WSIZE + dcl->getWordSize() - 1) /
(kernel.numEltPerGRF<Type_UW>());
largestInput = largestInput < largestGRF ? largestGRF : largestInput;
}
}
return largestInput;
}
// Marks the physical registers of every declare that is both an input and an
// output as occupied in pregs.
void LocalRA::blockOutputPhyRegs() {
  for (auto dcl : kernel.Declares) {
    if (!dcl->isOutput() || !dcl->isInput()) {
      continue;
    }
    // Such a live-range may never be referenced in the CFG, so its registers
    // have to be blocked explicitly here. This may be true for FC.
    pregs->markPhyRegs(dcl);
  }
}
class isLifetimeCandidateOpCandidateForRemoval {
public:
isLifetimeCandidateOpCandidateForRemoval(GlobalRA &g) : gra(g) {}
GlobalRA &gra;
bool operator()(G4_INST *inst) {
if (inst->isPseudoKill() || inst->isLifeTimeEnd()) {
G4_Declare *topdcl;
if (inst->isPseudoKill()) {
topdcl = GetTopDclFromRegRegion(inst->getDst());
} else {
topdcl = GetTopDclFromRegRegion(inst->getSrc(0));
}
if (gra.getNumRefs(topdcl) == 0 && (topdcl->getRegFile() == G4_GRF ||
topdcl->getRegFile() == G4_INPUT)) {
// Remove this lifetime op
return true;
}
}
return false;
}
};
void LocalRA::removeUnrequiredLifetimeOps() {
// Iterate over all instructions and inspect only
// pseudo_kills/lifetime.end instructions. Remove
// instructions that have no other useful instruction.
for (BB_LIST_ITER bb_it = kernel.fg.begin(); bb_it != kernel.fg.end();
bb_it++) {
G4_BB *bb = (*bb_it);
bb->erase(
std::remove_if(bb->begin(), bb->end(),
isLifetimeCandidateOpCandidateForRemoval(this->gra)),
bb->end());
}
}
// Advances register index i (in place) to the next index that satisfies the
// requested bank alignment, searching upward.
//   i         - [in,out] candidate GRF index
//   align     - bank alignment to satisfy
//   evenAlign - for the 2GRF alignments, additionally require an even index
void PhyRegsLocalRA::findRegisterCandiateWithAlignForward(int &i,
                                                          BankAlign align,
                                                          bool evenAlign) {
  switch (align) {
  case BankAlign::Even:
    if (i % 2 != 0) {
      i++;
    }
    break;
  case BankAlign::Odd:
    if (i % 2 == 0) {
      i++;
    }
    break;
  case BankAlign::QuadGRF:
    if (i % 4 != 0) {
      i += (4 - (i % 4));
    }
    break;
  case BankAlign::Even2GRF:
    // Lower two registers of each group of four.
    while ((i % 4 >= 2) || (evenAlign && (i % 2 != 0))) {
      i++;
    }
    break;
  case BankAlign::Odd2GRF:
    // Upper two registers of each group of four.
    while ((i % 4 < 2) || (evenAlign && (i % 2 != 0))) {
      i++;
    }
    break;
  default:
    // No adjustment for any other alignment.
    break;
  }
}
// Returns the bundle index of register (baseReg + offset). The register is
// reduced modulo a platform-dependent window and divided by the number of
// registers per bundle; both values are selected from builder queries.
unsigned int PhyRegsLocalRA::get_bundle(unsigned int baseReg, int offset) {
  const unsigned int reg = baseReg + offset;

  // Defaults apply when none of the platform queries below match.
  unsigned int window = 64;
  unsigned int regsPerBundle = 4;
  if (builder.has64bundleSize2GRFPerBank()) {
    window = 32;
    regsPerBundle = 4;
  } else if (builder.hasPartialInt64Support()) {
    window = 32;
    regsPerBundle = 2;
  } else if (builder.has64bundleSize()) {
    window = 16;
    regsPerBundle = 2;
  }
  return (reg % window) / regsPerBundle;
}
// Starting at curReg, searches upward for a register whose bundle and whose
// successor's bundle are both absent from occupiedBundles, honouring the
// requested alignment. Returns curReg unchanged when the search runs past
// endReg.
int PhyRegsLocalRA::findBundleConflictFreeRegister(
    int curReg, int endReg, unsigned short occupiedBundles, BankAlign align,
    bool evenAlign) {
  // True when reg or reg + 1 falls into an already occupied bundle.
  auto hitsOccupiedBundle = [&](int reg) {
    return (occupiedBundles & (1 << get_bundle(reg, 0))) ||
           (occupiedBundles & (1 << get_bundle(reg, 1)));
  };

  int candidate = curReg;
  while (hitsOccupiedBundle(candidate)) {
    ++candidate;
    findRegisterCandiateWithAlignForward(candidate, align, evenAlign);
    if (candidate > endReg) {
      // Out of bounds: give up on avoiding the bundle conflict.
      return curReg;
    }
  }
  return candidate;
}
// Moves register index i (in place) down to the nearest index that satisfies
// the requested bank alignment.
//   i         - [in,out] candidate GRF index; may become negative
//   align     - bank alignment to satisfy
//   evenAlign - for the 2GRF alignments, additionally require an even index
void PhyRegsLocalRA::findRegisterCandiateWithAlignBackward(int &i,
                                                           BankAlign align,
                                                           bool evenAlign) {
  switch (align) {
  case BankAlign::Even:
    if (i % 2 != 0) {
      i--;
    }
    break;
  case BankAlign::Odd:
    if (i % 2 == 0) {
      i--;
    }
    break;
  case BankAlign::QuadGRF:
    if (i % 4 != 0) {
      if (i > 0) {
        i -= (i % 4);
      } else {
        i -= (4 - (-i % 4));
      }
    }
    break;
  case BankAlign::Even2GRF:
    // The 2GRF searches stop once the index drops below zero.
    while (i >= 0 && ((i % 4 >= 2) || (evenAlign && (i % 2 != 0)))) {
      i--;
    }
    break;
  case BankAlign::Odd2GRF:
    while (i >= 0 && ((i % 4 < 2) || (evenAlign && (i % 2 != 0)))) {
      i--;
    }
    break;
  default:
    // No adjustment for any other alignment.
    break;
  }
}
// Builds a bit mask of the bundles already taken by the variables dcl has a
// bundle conflict with, and suggests the less-used bank through preferBank.
//   preferBank - [out] only written when at least one conflicting variable
//                already has a physical register
// Returns 0 when DPAS bundle-conflict reduction is unavailable or disabled,
// or when more than 12 distinct bundles are occupied.
inline static unsigned short getOccupiedBundle(IR_Builder &builder,
                                               GlobalRA &gra, G4_Declare *dcl,
                                               BankAlign &preferBank) {
  if (!builder.hasDPAS() ||
      !builder.getOption(vISA_EnableDPASBundleConflictReduction)) {
    return 0;
  }

  unsigned short bundleMask = 0;
  unsigned distinctBundles = 0;
  unsigned int evenBankHits = 0;
  unsigned int oddBankHits = 0;

  for (const BundleConflict &conflict : gra.getBundleConflicts(dcl)) {
    LocalLiveRange *conflictLR = gra.getLocalLR(conflict.dcl);
    if (conflictLR == nullptr) {
      continue;
    }

    // Prefer the register recorded on the live range; fall back to the one
    // stored on the top declare's RegVar.
    int subRegIgnored;
    G4_VarBase *assignedReg = conflictLR->getPhyReg(subRegIgnored);
    if (assignedReg == NULL) {
      assignedReg = conflictLR->getTopDcl()->getRegVar()->getPhyReg();
    }
    if (assignedReg == NULL) {
      // The conflicting variable has no register yet.
      continue;
    }

    const int offset = conflict.offset;
    const unsigned int reg = assignedReg->asGreg()->getRegNum();
    const unsigned int bank = gra.get_bank(reg, offset);
    if (bank) {
      oddBankHits++;
    } else {
      evenBankHits++;
    }

    const unsigned int firstBundle = gra.get_bundle(reg, offset);
    const unsigned int secondBundle = gra.get_bundle(reg, offset + 1);
    // Only the first bundle participates in the distinct-bundle count.
    if (!(bundleMask & ((unsigned short)1 << firstBundle))) {
      distinctBundles++;
    }
    bundleMask |= (unsigned short)1 << firstBundle;
    bundleMask |= (unsigned short)1 << secondBundle;
  }

  if (distinctBundles > 12) {
    bundleMask = 0;
  }
  if (evenBankHits || oddBankHits) {
    preferBank =
        (evenBankHits < oddBankHits) ? BankAlign::Even : BankAlign::Odd;
  }
  return bundleMask;
}
// Tries to give every still-unallocated root GRF declare its own physical
// register.
//   twoBanksRA          - passed to PhyRegsManager; also enables use of the
//                         bank alignment derived from each declare's bank
//                         conflict
//   twoDirectionsAssign - with twoBanksRA, alternates between allocating from
//                         the front and from the back of the register range
//   hybridWithSpill     - only declares marked do-not-spill are assigned, and
//                         the function reports that global RA is still needed
//                         even if all of those assignments succeed
// Returns true when global RA is still required.
bool LocalRA::assignUniqueRegisters(bool twoBanksRA, bool twoDirectionsAssign,
bool hybridWithSpill) {
// Iterate over all dcls and calculate number of rows
// required if each unallocated dcl had its own physical
// register. Check number of registers used up by local RA.
// If unique assignments are possible, then assign and set
// needGlobalRA to false.
unsigned int numRows = 0;
std::vector<G4_Declare *> unallocatedRanges;
bool needGlobalRA = true;
// EOT sources are placed starting at the first register above the local RA
// range.
uint32_t nextEOTGRF = numRegLRA;
std::unordered_set<unsigned int> emptyForbidden;
unsigned int largestInput = getLargestInputGRF();
if (((nextEOTGRF < (kernel.getNumRegTotal() - 16)) ||
(nextEOTGRF < largestInput)) &&
builder.hasEOTGRFBinding()) {
// we can't guarantee unique assignments for all the EOT sources
// punt to global RA
return true;
}
// Identify global ranges not having an allocation
for (auto dcl : kernel.Declares) {
if (dcl->getAliasDeclare() == NULL && dcl->getRegFile() == G4_GRF &&
dcl->getRegVar()->isPhyRegAssigned() == false) {
auto dclLR = gra.getLocalLR(dcl);
if (dclLR && dclLR->isEOT() && builder.hasEOTGRFBinding()) {
// For EOT use r112 - r127
// for simplicity we assign each EOT source unique GRFs
G4_Greg *phyReg = builder.phyregpool.getGreg(nextEOTGRF);
dcl->getRegVar()->setPhyReg(phyReg, 0);
dclLR->setPhyReg(phyReg, 0);
dclLR->setAssigned(true);
// we have to make sure src0 and src1 of split send get different GRFs
nextEOTGRF += dcl->getNumRows();
if (kernel.getOption(vISA_GenerateDebugInfo)) {
// The debug range runs from 0 to the vISA id of the last instruction of
// the last non-empty basic block.
uint32_t start = 0, end = 0;
for (auto rit = kernel.fg.rbegin(); rit != kernel.fg.rend(); rit++) {
G4_BB *bb = (*rit);
if (bb->size() > 0) {
end = bb->back()->getVISAId();
break;
}
}
updateDebugInfo(kernel, dcl, start, end);
}
} else {
// Rows are counted for every unallocated range, but in hybrid mode only
// do-not-spill declares are queued for assignment.
numRows += dcl->getNumRows();
if (hybridWithSpill) {
if (dcl->isDoNotSpill()) {
unallocatedRanges.push_back(dcl);
}
} else {
unallocatedRanges.push_back(dcl);
}
}
}
}
if (numRows < numRegLRA || hybridWithSpill) {
// Get superset of registers used by local RA
// in all basic blocks.
PhyRegsManager phyRegMgr(builder, *pregs, twoBanksRA);
if (!hybridWithSpill && kernel.getOption(vISA_Debug)) {
// when doing trivial RA, if any kernel input registers
// are unreferenced, they're overwritten by other
// variables. when running in -O0 mode, which is usually
// for debug info generation, mark those input
// registers as busy so they're not overwritten.
// this is a problem for trivial assignments because
// live-range computation is not done when trivial RA
// succeeds. all variables are expected to be available
// throughout the program since they're trivial assignments.
for (auto dcl : kernel.Declares) {
if (!dcl->getRootDeclare())
continue;
if (!dcl->isInput())
continue;
if (gra.getNumRefs(dcl) == 0) {
// this is an unreferenced input
auto phyReg = dcl->getRegVar()->getPhyReg();
if (!phyReg || !phyReg->isGreg())
continue;
phyRegMgr.getAvailableRegs()->markPhyRegs(dcl);
}
}
}
// Fold the per-block summaries into the manager so that every register
// local RA used in any block is treated as busy.
for (auto bb : kernel.fg) {
PhyRegSummary *summary = gra.getBBLRASummary(bb);
if (summary) {
for (unsigned int i = 0; i < numRegLRA; i++) {
if (summary->isGRFBusy(i)) {
phyRegMgr.getAvailableRegs()->setGRFBusy(i, 1);
}
}
}
}
// In hybrid mode global RA is needed regardless of what happens below.
needGlobalRA = hybridWithSpill;
bool assignFromFront = true;
#ifdef DEBUG_VERBOSE_ON
COUT_ERROR << "Trival RA: "
<< "\n";
#endif
for (auto dcl : unallocatedRanges) {
int regNum = 0;
int subregNum = 0;
int sizeInWords = dcl->getWordSize();
int nrows = 0;
BankAlign bankAlign = BankAlign::Either;
if (twoBanksRA && gra.getBankConflict(dcl) != BANK_CONFLICT_NONE) {
bankAlign = getBankAlignForUniqueAssign(dcl);
// Even-bank variables go to the front; odd-bank variables go to the
// back when two-directional assignment is enabled.
if (bankAlign == BankAlign::Even || bankAlign == BankAlign::Even2GRF) {
assignFromFront = true;
}
if (twoDirectionsAssign &&
(bankAlign == BankAlign::Odd || bankAlign == BankAlign::Odd2GRF)) {
assignFromFront = false;
}
}
// Quad / even alignment requirements take precedence over the bank
// alignment chosen above.
auto assignAlign = gra.isQuadAligned(dcl) ? BankAlign::QuadGRF
: gra.isEvenAligned(dcl) ? BankAlign::Even
: bankAlign;
G4_SubReg_Align subAlign =
builder.GRFAlign() ? builder.getGRFAlign() : gra.getSubRegAlign(dcl);
auto dclLR = gra.getLocalLR(dcl);
if (assignFromFront) {
// Only the front-to-back search takes occupied bundles into account;
// preferBank is computed but not used further here.
BankAlign preferBank = BankAlign::Either;
unsigned short occupiedBundles =
getOccupiedBundle(builder, gra, dcl, preferBank);
nrows = phyRegMgr.findFreeRegs(
sizeInWords, assignAlign, subAlign, regNum, subregNum, 0,
numRegLRA - 1, occupiedBundles, 0, false,
dclLR ? dclLR->getForbidden() : emptyForbidden);
} else {
nrows = phyRegMgr.findFreeRegs(
sizeInWords, assignAlign, subAlign, regNum, subregNum,
numRegLRA - 1, 0, 0, 0, false,
dclLR ? dclLR->getForbidden() : emptyForbidden);
}
if (nrows) {
G4_Greg *phyReg = builder.phyregpool.getGreg(regNum);
// adjust subreg num to type size
if (dcl->getElemSize() == 1) {
subregNum *= 2;
} else {
subregNum /= (dcl->getElemSize() / 2);
}
dcl->getRegVar()->setPhyReg(phyReg, subregNum);
} else {
// No free registers for this declare: give up on unique assignment.
needGlobalRA = true;
break;
}
if (twoBanksRA && twoDirectionsAssign) {
assignFromFront = !assignFromFront;
}
}
if (needGlobalRA) {
// Roll back: in hybrid mode do-not-spill declares keep their registers,
// everything else is reset.
for (auto dcl : unallocatedRanges) {
if (!hybridWithSpill) {
dcl->getRegVar()->resetPhyReg();
} else if (!dcl->isDoNotSpill()) {
dcl->getRegVar()->resetPhyReg();
}
}
} else {
if (kernel.getOption(vISA_GenerateDebugInfo)) {
uint32_t start = 0, end = UNMAPPABLE_VISA_INDEX;
// NOTE(review): the inner loop looks for any instruction with a mappable
// vISA id but then records the id of bb->back(); if the last instruction
// is unmappable, end stays UNMAPPABLE_VISA_INDEX and the search moves on
// to the previous block — confirm this is the intended behaviour.
for (auto rit = kernel.fg.rbegin(); rit != kernel.fg.rend(); rit++) {
G4_BB *bb = (*rit);
if (bb->size() > 0) {
for (auto inst : *bb) {
if (inst->getVISAId() != UNMAPPABLE_VISA_INDEX) {
end = bb->back()->getVISAId();
break;
}
}
if (end != UNMAPPABLE_VISA_INDEX)
break;
}
}
for (auto dcl : unallocatedRanges) {
updateDebugInfo(kernel, dcl, start, end);
}
// unallocatedRanges doesn't contain input dcls
for (auto dcl : kernel.Declares) {
if (dcl->isInput()) {
// if an input is never referenced in the program, its
// GRF storage may be overwritten in -O2 mode. only in
// presence of -debug switch does compiler preserve
// kernel inputs registers even when unreferenced with
// trivial RA.
if (gra.getNumRefs(dcl) > 0 || kernel.getOption(vISA_Debug))
updateDebugInfo(kernel, dcl, start, end);
}
}
}
}
} else {
// More rows are required than local RA may use.
needGlobalRA = true;
}
#ifdef DEBUG_VERBOSE_ON
COUT_ERROR << "\n";
#endif
return needGlobalRA;
}
// Drops GRF / INPUT declares that have a zero reference count and no physical
// register from kernel.Declares. A few builder-owned declares are always
// kept. Alias declares first inherit the reference count of the declare they
// alias.
void GlobalRA::removeUnreferencedDcls() {
  // Propagate the aliased declare's reference count to each alias.
  for (auto dcl : kernel.Declares) {
    auto aliased = dcl->getAliasDeclare();
    if (aliased != nullptr) {
      setNumRefs(dcl, getNumRefs(aliased));
    }
  }

  auto isUnrefDcl = [this](G4_Declare *dcl) -> bool {
    if (dcl->getRegFile() != G4_GRF && dcl->getRegFile() != G4_INPUT) {
      return false;
    }
    if (getNumRefs(dcl) != 0) {
      return false;
    }
    if (dcl->getRegVar()->isPhyRegAssigned() != false) {
      return false;
    }
    // Builder-owned declares that must survive even when unreferenced.
    if (dcl == kernel.fg.builder->getBuiltinR0()) {
      return false;
    }
    if (dcl == kernel.fg.builder->getSpillSurfaceOffset()) {
      return false;
    }
    // With code patching enabled, aliases of the input R1 are kept as well.
    if (!kernel.fg.builder->getOptions()->getuInt32Option(vISA_CodePatch)) {
      return true;
    }
    return dcl->getAliasDeclare() != kernel.fg.builder->getInputR1();
  };

  auto firstRemoved = std::remove_if(kernel.Declares.begin(),
                                     kernel.Declares.end(), isUnrefDcl);
  kernel.Declares.erase(firstRemoved, kernel.Declares.end());
}
// Returns true if at least one non-alias GRF declare still has no physical
// register assigned.
bool LocalRA::unassignedRangeFound() {
  for (auto dcl : kernel.Declares) {
    const bool isRootGRF =
        dcl->getAliasDeclare() == NULL && dcl->getRegFile() == G4_GRF;
    if (isRootGRF && dcl->getRegVar()->isPhyRegAssigned() == false) {
      return true;
    }
  }
  return false;
}
// Raises numRegsUsed to the highest GRF index that summary reports as busy;
// numRegsUsed is left alone when no busy register lies above it.
void LocalRA::updateRegUsage(PhyRegSummary *summary,
                             unsigned int &numRegsUsed) {
  const uint32_t totalGRF = summary->getNumGRF();
  for (uint32_t reg = 0; reg < totalGRF; ++reg) {
    if (numRegsUsed >= reg) {
      continue;
    }
    if (summary->isGRFBusy(reg) == true) {
      numRegsUsed = reg;
    }
  }
}
// Prints a short report on how many GRF ranges local RA managed to allocate.
void LocalRA::localRAOptReport() {
  unsigned int allGRFRanges = 0;
  unsigned int locallyAllocated = 0;

  for (auto dcl : kernel.Declares) {
    // Local live ranges that ended up with a GRF.
    auto localLR = gra.getLocalLR(dcl);
    if (localLR && localLR->isLiveRangeLocal() &&
        localLR->isGRFRegAssigned()) {
      ++locallyAllocated;
    }

    // Every non-alias GRF / INPUT declare counts towards the total.
    const bool isGRFLike =
        dcl->getRegFile() == G4_GRF || dcl->getRegFile() == G4_INPUT;
    if (isGRFLike && dcl->getAliasDeclare() == NULL) {
      ++allGRFRanges;
    }
  }

  VISA_DEBUG({
    std::cout << "\n";
    std::cout << "Total GRF ranges: " << allGRFRanges << "\n";
    std::cout << "GRF ranges allocated by local RA: " << locallyAllocated
              << "\n";
    if (allGRFRanges) {
      std::cout << (int)(locallyAllocated * 100 / allGRFRanges)
                << "% allocated by local RA\n\n";
    }
  });
}
// Walks from a src/dst register region to its base variable and returns that
// variable's root declare, with all aliases resolved. Returns nullptr when
// the operand has no base or the base is not a RegVar.
// opnd->topdcl should give the same result in most cases, but for spill code,
// if an address register spills, opnd->topdcl does not return the expected
// dcl for the fill.
G4_Declare *GetTopDclFromRegRegion(G4_Operand *opnd) {
  vISA_ASSERT(opnd->isRegRegion(),
              "Operand is not a register region so cannot have a top dcl");

  G4_VarBase *regBase = opnd->getBase();
  if (!regBase || !regBase->isRegVar()) {
    return nullptr;
  }
  return regBase->asRegVar()->getDeclare()->getRootDeclare();
}
// Converts a sub-register offset given in words into units of dcl's element
// size (a word being two bytes).
unsigned int LocalRA::convertSubRegOffFromWords(G4_Declare *dcl,
                                                int subregnuminwords) {
  return (subregnuminwords * 2) / dcl->getElemSize();
}
// Converts a sub-register offset given in units of dcl's element size into
// words (a word being two bytes).
unsigned int LocalRA::convertSubRegOffToWords(G4_Declare *dcl, int subregnum) {
  return subregnum * dcl->getElemSize() / 2;
}
void LocalRA::undoLocalRAAssignments(bool clearInterval) {
VISA_DEBUG(std::cout << "Undoing local RA assignments\n");
// Undo all assignments made by local RA
for (auto dcl : kernel.Declares) {
LocalLiveRange *lr = gra.getLocalLR(dcl);
if (lr != NULL) {
if (lr->getAssigned() == true) {
// Undo the assignment
lr->setAssigned(false);
lr->getTopDcl()->getRegVar()->resetPhyReg();
if (kernel.getOption(vISA_GenerateDebugInfo)) {
kernel.getKernelDebugInfo()
->getLiveIntervalInfo(lr->getTopDcl(), false)
->clearLiveIntervals();
}
}
if (clearInterval) {
if (lr->isLiveRangeLocal()) {
lr->setFirstRef(NULL, 0);
}
if (lr->getTopDcl()->getRegFile() == G4_INPUT) {
lr->setLastRef(NULL, 0);
}
}
}
}
// Delete summary information stored with each basic block
gra.clearBBLRASummaries();
}
// Returns the local live range registered for topdcl, creating one and
// registering it first if none exists yet.
LocalLiveRange *GlobalRA::GetOrCreateLocalLiveRange(G4_Declare *topdcl) {
  LocalLiveRange *range = getLocalLR(topdcl);
  const bool alreadyKnown = (range != nullptr);
  if (!alreadyKnown) {
    // Append a fresh live range and hand out the address of the stored copy.
    localLiveRanges.push_back(LocalLiveRange(builder));
    range = &localLiveRanges.back();
    setLocalLR(topdcl, range);
  }
  vISA_ASSERT(range != NULL, "Local LR could not be created");
  return range;
}
// Records the reference that operand opnd makes to its variable.
//   opnd    - operand to inspect; only register regions and address
//             expressions are handled
//   isEOT   - marks the variable's live range as an EOT source
//   inst_it - iterator to the instruction owning opnd (stored in LLRUseMap)
//   pos     - operand position within that instruction (stored in LLRUseMap)
void LocalRA::markReferencesInOpnd(G4_Operand *opnd, bool isEOT,
                                   INST_LIST_ITER inst_it, unsigned int pos) {
  if (opnd->isSrcRegRegion() || opnd->isDstRegRegion()) {
    G4_Declare *rootDcl = GetTopDclFromRegRegion(opnd);
    if (!rootDcl) {
      return;
    }
    if (rootDcl->getRegFile() != G4_GRF && rootDcl->getRegFile() != G4_INPUT) {
      // Registers other than GRF are not tracked here.
      return;
    }

    vISA_ASSERT(rootDcl->getAliasDeclare() == NULL, "Not topdcl");
    LocalLiveRange *range = gra.GetOrCreateLocalLiveRange(rootDcl);
    range->recordRef(curBB);

    // Source uses are remembered per live range when splitting is enabled.
    if (doSplitLLR && opnd->isSrcRegRegion()) {
      LLR_USE_MAP_ITER useEntry = LLRUseMap.find(range);
      if (useEntry != LLRUseMap.end()) {
        (*useEntry).second.push_back(make_pair(inst_it, pos));
      } else {
        std::vector<std::pair<INST_LIST_ITER, unsigned int>> uses;
        uses.push_back(make_pair(inst_it, pos));
        LLRUseMap.insert(make_pair(range, uses));
      }
    }

    // A pre-assigned GRF is blocked for local RA.
    if (rootDcl->getRegVar() && rootDcl->getRegVar()->isPhyRegAssigned() &&
        rootDcl->getRegVar()->getPhyReg()->isGreg()) {
      pregs->markPhyRegs(rootDcl);
    }

    if (isEOT) {
      range->markEOT();
    }
    gra.recordRef(rootDcl);

    // On 3D targets, a write-enabled (NoMask) write to a non-addressed
    // variable gives it a non-default augmentation mask.
    if (kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D &&
        opnd->isDstRegRegion() && !rootDcl->getAddressed()) {
      if (opnd->getInst()->isWriteEnableInst()) {
        gra.setAugmentationMask(rootDcl, AugmentationMasks::NonDefault);
      }
    }
    return;
  }

  if (!opnd->isAddrExp()) {
    return;
  }

  // Address expression: resolve to the root declare and mark it as
  // indirectly referenced.
  G4_AddrExp *addrExp = opnd->asAddrExp();
  G4_Declare *rootDcl = addrExp->getRegVar()->getDeclare();
  while (rootDcl->getAliasDeclare() != NULL) {
    rootDcl = rootDcl->getAliasDeclare();
  }
  vISA_ASSERT(rootDcl != NULL, "Top dcl was null for addr exp opnd");

  LocalLiveRange *range = gra.GetOrCreateLocalLiveRange(rootDcl);
  range->recordRef(curBB);
  range->markIndirectRef();
  if (rootDcl->getRegVar() && rootDcl->getRegVar()->isPhyRegAssigned() &&
      rootDcl->getRegVar()->getPhyReg()->isGreg()) {
    pregs->markPhyRegs(rootDcl);
  }
  gra.recordRef(rootDcl);
}
// Records the references made by the destination and every source operand of
// the instruction at inst_it. Sources are flagged as EOT operands when the
// instruction is an EOT.
void LocalRA::markReferencesInInst(INST_LIST_ITER inst_it) {
  auto inst = (*inst_it);

  // Destination, if the instruction has one.
  if (inst->getNumDst() > 0) {
    if (G4_Operand *dstOpnd = inst->getDst()) {
      markReferencesInOpnd(dstOpnd, false, inst_it, 0);
    }
  }

  // Sources, skipping empty slots.
  const int numSrcs = inst->getNumSrc();
  for (int srcIdx = 0; srcIdx < numSrcs; ++srcIdx) {
    G4_Operand *srcOpnd = inst->getSrc(srcIdx);
    if (!srcOpnd) {
      continue;
    }
    markReferencesInOpnd(srcOpnd, inst->isEOT(), inst_it, srcIdx);
  }
}
// Numbers all instructions of the kernel in layout order.
//   includePseudo - when false, pseudo_kill / lifetime.end instructions get
//                   the id of the next real instruction and do not consume
//                   an id themselves
void LocalRA::setLexicalID(bool includePseudo) {
  unsigned int nextId = 0;
  for (auto bb : kernel.fg) {
    for (auto inst : *bb) {
      const bool sharesId =
          !includePseudo && (inst->isPseudoKill() || inst->isLifeTimeEnd());
      inst->setLexicalId(nextId);
      if (!sharesId) {
        ++nextId;
      }
    }
  }
}
// Keeps dcl away from the end of the register file when operand opnd reaches
// beyond dcl's own storage.
//   dcl  - root declare accessed by the operand; ignored if it has no local
//          live range
//   opnd - operand whose linearized end may lie past dcl's last GRF
// If the operand extends k GRFs past the declare, the last k GRFs of the
// register file (N-1, N-2, ..., N-k with N = total GRF count) are added to
// the live range's forbidden set, so the overrun cannot leave the register
// file.
void LocalRA::addOutOfBoundForbidden(G4_Declare *dcl, G4_Operand *opnd) {
  LocalLiveRange *lr = gra.getLocalLR(dcl);
  if (!lr) {
    return;
  }

  int opndEndGRF = opnd->getLinearizedEnd() / builder.numEltPerGRF<Type_UB>();
  int dclEndGRF = (dcl->getByteSize() - 1) / builder.numEltPerGRF<Type_UB>();
  if (opndEndGRF <= dclEndGRF) {
    // The operand stays within the declare.
    return;
  }

  const int overrunGRFs = opndEndGRF - dclEndGRF;
  for (int i = 0; i < overrunGRFs; i++) {
    // Valid GRF numbers are 0 .. N-1. The previous "- i + 1" forbade
    // N+1, N, ... which are not registers at all, leaving the real tail of
    // the register file allocatable.
    lr->addForbidden(kernel.getNumRegTotal() - i - 1);
  }
}
// For SVM scatter read/write sends with an execution size below SIMD8, adds
// out-of-bound forbidden registers for the destination and every source
// operand (see addOutOfBoundForbidden). All other instructions are ignored.
void LocalRA::markSpecialForbidden(INST_LIST_ITER inst_it) {
  G4_INST *inst = (*inst_it);
  if (!inst->isSend() || !inst->asSendInst()->isSVMScatterRW() ||
      inst->getExecSize() >= g4::SIMD8) {
    return;
  }

  // Destination, unless it is absent or the null register.
  G4_DstRegRegion *dst = inst->getDst();
  if (dst && !dst->isNullReg()) {
    if (G4_Declare *dstDcl = inst->getDst()->getTopDcl()) {
      addOutOfBoundForbidden(dstDcl->getRootDeclare(), dst);
    }
  }

  // Sources.
  const unsigned numSrc = inst->getNumSrc();
  for (unsigned srcIdx = 0; srcIdx < numSrc; ++srcIdx) {
    G4_Operand *src = inst->getSrc(srcIdx);
    if (!src) {
      continue;
    }
    G4_Declare *srcDcl = src->getTopDcl();
    if (!srcDcl) {
      continue;
    }
    addOutOfBoundForbidden(srcDcl->getRootDeclare(), src);
  }
}
void LocalRA::markReferences(unsigned int &numRowsEOT, bool &lifetimeOpFound) {
unsigned int id = 0;
// Iterate over all BBs
for (auto _curBB : kernel.fg) {
curBB = _curBB;
// Iterate over all insts
for (INST_LIST_ITER inst_it = curBB->begin(), inst_end = curBB->end();
inst_it != inst_end; ++inst_it) {
G4_INST *curInst = (*inst_it);
if (curInst->isPseudoKill() || curInst->isLifeTimeEnd()) {
curInst->setLexicalId(id);
lifetimeOpFound = true;
if (curInst->isLifeTimeEnd()) {
markReferencesInInst(inst_it);
}
continue;
}
if (curInst->isSplitIntrinsic())
hasSplitInsts = true;
// set lexical ID
curInst->setLexicalId(id++);
if (curInst->isEOT() && kernel.fg.builder->hasEOTGRFBinding()) {
numRowsEOT += curInst->getSrc(0)->getTopDcl()->getNumRows();
if (curInst->isSplitSend() && !curInst->getSrc(1)->isNullReg()) {
// both src0 and src1 have to be >=r112
numRowsEOT += curInst->getSrc(1)->getTopDcl()->getNumRows();
}
}
markReferencesInInst(inst_it);
markSpecialForbidden(inst_it);
}
}
}
// Function to calculate input register live intervals,
// which are now tracked at word granularity because
// there may be overlap between two different input variables.
//
// The kernel is scanned backwards (last BB to first, last instruction to
// first), so the first time a variable is met is its last reference in
// lexical order. For every such variable, each of its words that has not
// been claimed yet (inputRegLastRef entry still UINT_MAX) and whose GRF is
// available gets an InputLiveRange pushed to the FRONT of inputIntervals.
// Because ids only decrease during the walk, ranges found later (i.e. ending
// earlier in the program) end up nearer the front of the list.
void LocalRA::calculateInputIntervals() {
// Renumber instructions; pseudo-kill / lifetime-end do not consume an id.
setLexicalID(false);
int numGRF = kernel.getNumRegTotal();
// One entry per word of the register file; UINT_MAX means "no interval
// recorded for this word yet".
std::vector<uint32_t> inputRegLastRef(numGRF * kernel.numEltPerGRF<Type_UW>(),
UINT_MAX);
for (BB_LIST_RITER bb_it = kernel.fg.rbegin(), bb_rend = kernel.fg.rend();
bb_it != bb_rend; bb_it++) {
G4_BB *bb = (*bb_it);
for (INST_LIST_RITER inst_it = bb->rbegin(), inst_rend = bb->rend();
inst_it != inst_rend; inst_it++) {
G4_INST *curInst = (*inst_it);
G4_Declare *topdcl = NULL;
LocalLiveRange *lr = NULL;
// scan dst operand (may be unnecessary but added for safety)
if (curInst->getDst() != NULL) {
// Scan dst
G4_DstRegRegion *dst = curInst->getDst();
topdcl = GetTopDclFromRegRegion(dst);
if (topdcl && (lr = gra.getLocalLR(topdcl))) {
// Only plain inputs qualify here: G4_INPUT file, not an address register
// operand, not an output, never accessed indirectly, not a pre-defined arg.
if (topdcl->getRegFile() == G4_INPUT && !(dst->isAreg()) &&
topdcl->isOutput() == false && lr->hasIndirectAccess() == false &&
!builder.isPreDefArg(topdcl)) {
unsigned int lastRef = 0;
vISA_ASSERT(lr->isGRFRegAssigned(),
"Input variable has no pre-assigned physical register");
// A NULL last reference means this is the first (reverse-order) sighting.
if (lr->getLastRef(lastRef) == NULL) {
unsigned int curInstId = curInst->getLexicalId();
lr->setLastRef(curInst, curInstId);
G4_RegVar *var = topdcl->getRegVar();
unsigned int regNum = var->getPhyReg()->asGreg()->getRegNum();
unsigned int regOff = var->getPhyRegOff();
// idx starts at the LAST word of the variable and is walked downwards.
unsigned int idx = regNum * kernel.numEltPerGRF<Type_UW>() +
(regOff * topdcl->getElemSize()) / G4_WSIZE +
topdcl->getWordSize() - 1;
unsigned int numWords = topdcl->getWordSize();
for (int i = numWords - 1; i >= 0; --i, --idx) {
if (inputRegLastRef[idx] == UINT_MAX &&
// Check for registers that are marked as forbidden,
// e.g., BuiltinR0 and those reserved for stack call
pregs->isGRFAvailable(idx /
kernel.numEltPerGRF<Type_UW>())) {
inputRegLastRef[idx] = curInstId;
inputIntervals.push_front(InputLiveRange(idx, curInstId));
if (kernel.getOptions()->getOption(vISA_GenerateDebugInfo)) {
updateDebugInfo(kernel, topdcl, 0, curInst->getVISAId());
}
}
}
}
}
}
}
// Scan src operands
for (int i = 0, nSrcs = curInst->getNumSrc(); i < nSrcs; i++) {
G4_Operand *src = curInst->getSrc(i);
if (src && src->getTopDcl()) {
topdcl = GetTopDclFromRegRegion(src);
if (topdcl && (lr = gra.getLocalLR(topdcl))) {
// Check whether it is input
// (unlike the dst scan, live-in declares are accepted here as well)
if ((topdcl->getRegFile() == G4_INPUT || topdcl->isLiveIn()) &&
!(src->isAreg()) && topdcl->isOutput() == false &&
lr->hasIndirectAccess() == false &&
!builder.isPreDefArg(topdcl)) {
unsigned int lastRef = 0;
vISA_ASSERT(
lr->isGRFRegAssigned(),
"Input variable has no pre-assigned physical register");
if (lr->getLastRef(lastRef) == NULL) {
unsigned int curInstId = curInst->getLexicalId();
lr->setLastRef(curInst, curInstId);
G4_RegVar *var = topdcl->getRegVar();
unsigned int regNum = var->getPhyReg()->asGreg()->getRegNum();
unsigned int regOff = var->getPhyRegOff();
unsigned int idx =
regNum * kernel.numEltPerGRF<Type_UW>() +
(regOff * TypeSize(topdcl->getElemType())) / G4_WSIZE +
topdcl->getWordSize() - 1;
unsigned int numWords = topdcl->getWordSize();
// NOTE: this inner `i` shadows the source-operand index `i` of the
// enclosing loop; the outer index is not used inside this loop body.
for (int i = numWords - 1; i >= 0; --i, --idx) {
if (inputRegLastRef[idx] == UINT_MAX &&
// Check for registers that are marked as forbidden,
// e.g., BuiltinR0 and those reserved for stack call
pregs->isGRFAvailable(idx /
kernel.numEltPerGRF<Type_UW>())) {
inputRegLastRef[idx] = curInstId;
// When dst/src overlap must be avoided and this source could overlap the
// destination, the interval is extended by one id past this instruction.
if (builder.avoidDstSrcOverlap() &&
curInst->getDst() != NULL &&
hasDstSrcOverlapPotential(curInst->getDst(),
src->asSrcRegRegion())) {
inputIntervals.push_front(
InputLiveRange(idx, curInstId + 1));
} else {
inputIntervals.push_front(InputLiveRange(idx, curInstId));
}
if (kernel.getOptions()->getOption(
vISA_GenerateDebugInfo)) {
updateDebugInfo(kernel, topdcl, 0, curInst->getVISAId());
}
}
}
}
}
}
}
}
}
}
#ifdef _DEBUG
// Debug-only consistency check: no word of an output variable with an
// assigned GRF may also have been recorded as an input word above.
for (DECLARE_LIST_ITER dcl_it = kernel.Declares.begin();
dcl_it != kernel.Declares.end(); dcl_it++) {
G4_Declare *curDcl = (*dcl_it);
if (curDcl->isOutput() && gra.getLocalLR(curDcl) &&
gra.getLocalLR(curDcl)->isGRFRegAssigned()) {
G4_RegVar *var = curDcl->getRegVar();
unsigned int regNum = var->getPhyReg()->asGreg()->getRegNum();
unsigned int regOff = var->getPhyRegOff();
unsigned int idx = regNum * kernel.numEltPerGRF<Type_UW>() +
(regOff * TypeSize(curDcl->getElemType())) / G4_WSIZE;
unsigned int numWords = curDcl->getWordSize();
for (unsigned int i = 0; i < numWords; ++i, ++idx) {
if (inputRegLastRef[idx] != UINT_MAX) {
vISA_ASSERT(
false,
"Invalid output variable with overlapping input variable!");
}
}
}
}
#endif
}
// Returns true when `dst` and `src` of one instruction could end up
// overlapping, judged from their extents and alignment.
//
// The answer is false when the dst base is not a RegVar, when that RegVar has
// no declare, or when `src` is null / not a source region / not RegVar based.
// Otherwise the answer is true if at least one of the two operands extends
// past one GRF (see below), unless ALL of the following hold: both declares
// are even aligned, both operands extend past one GRF, and their GRF offsets
// have the same parity.
bool LocalRA::hasDstSrcOverlapPotential(G4_DstRegRegion *dst,
G4_SrcRegRegion *src) {
// Despite the name this is a flag: "dst extends past one GRF".
bool dstOpndNumRows = false;
if (dst->getBase()->isRegVar()) {
G4_Declare *dstDcl = dst->getBase()->asRegVar()->getDeclare();
if (dstDcl != nullptr) {
// GRF row of the operand start: (declare offset + left bound) / GRF bytes.
int dstOffset = (dstDcl->getOffsetFromBase() + dst->getLeftBound()) /
kernel.numEltPerGRF<Type_UB>();
G4_DstRegRegion *dstRgn = dst;
// Sub-register byte offset plus the operand's byte length, compared with
// the GRF size in bytes.
dstOpndNumRows = dstRgn->getSubRegOff() * dstRgn->getTypeSize() +
dstRgn->getLinearizedEnd() -
dstRgn->getLinearizedStart() + 1 >
kernel.numEltPerGRF<Type_UB>();
if (src != NULL && src->isSrcRegRegion() &&
src->asSrcRegRegion()->getBase()->isRegVar()) {
G4_SrcRegRegion *srcRgn = src->asSrcRegRegion();
// NOTE(review): srcDcl is dereferenced without a null check, unlike dstDcl.
G4_Declare *srcDcl = src->getBase()->asRegVar()->getDeclare();
int srcOffset = (srcDcl->getOffsetFromBase() + src->getLeftBound()) /
kernel.numEltPerGRF<Type_UB>();
// Same "extends past one GRF" test as for the destination.
bool srcOpndNumRows = srcRgn->getSubRegOff() * srcRgn->getTypeSize() +
srcRgn->getLinearizedEnd() -
srcRgn->getLinearizedStart() + 1 >
kernel.numEltPerGRF<Type_UB>();
if (dstOpndNumRows || srcOpndNumRows) {
if (!(gra.isEvenAligned(dstDcl) && gra.isEvenAligned(srcDcl) &&
srcOffset % 2 == dstOffset % 2 && dstOpndNumRows &&
srcOpndNumRows)) {
return true;
}
}
}
}
}
return false;
}
// Function to calculate local live intervals in ascending order of starting
// point in basic block bb.
//
// Positions are expressed as an index that advances by 2 per instruction
// (pseudo-kill / lifetime-end instructions are skipped but still advance it).
// A source that must not share registers with the destination of the same
// instruction gets its last reference recorded at the odd position idx + 1.
// Candidate ranges are appended to liveIntervals at their first definition.
//
// FIXME: current implementation cannot handle partial use-before-define case as
// shown below:
//
// loop_start:
// V1 (1) =
// P1 =
// (P1) (16) = V1
// ...
// V1 (16) =
// ...
// (P2) Jmp loop_start
//
// One way to fix this is to check the emask and simd size of local def/use and
// make sure they satisfy the following rules: 1) The SIMD size of a use should
// be equal to or smaller than earlier def seen for that variable. 2) The Emask
// of use should be equal to or narrower than def seen earlier. 3) Each use
// should be fully defined in current BB (e.g., if a use is predicated,
// the def should be either non-predicated or have the same predicate).
//
void LocalRA::calculateLiveIntervals(
G4_BB *bb, std::vector<LocalLiveRange *> &liveIntervals) {
int idx = 0;
// brk is part of the loop condition but is never modified in this function.
bool brk = false;
for (INST_LIST_ITER inst_it = bb->begin(), bbend = bb->end();
inst_it != bbend && !brk; inst_it++, idx += 2) {
G4_INST *curInst = (*inst_it);
G4_Declare *topdcl = NULL;
LocalLiveRange *lr = NULL;
if (curInst->isPseudoKill() || curInst->isLifeTimeEnd()) {
continue;
}
// Scan srcs
for (int i = 0, nSrcs = curInst->getNumSrc(); i < nSrcs; i++) {
G4_Operand *src = curInst->getSrc(i);
if (src && src->getTopDcl()) {
// Scan all srcs
topdcl = GetTopDclFromRegRegion(src);
LocalLiveRange *topdclLR = nullptr;
if (topdcl && (topdclLR = gra.getLocalLR(topdcl))) {
lr = topdclLR;
// Check whether local LR is a candidate
if (lr->isLiveRangeLocal() && lr->isGRFRegAssigned() == false) {
unsigned int startIdx;
// Use seen before any definition in this BB.
if (lr->getFirstRef(startIdx) == NULL) {
// Skip this lr and update its ref count,
// So it will be treated as a global LR
lr->recordRef(NULL);
continue;
}
vISA_ASSERT(idx > 0,
"Candidate use found in first inst of basic block");
// Cases where this source must stay live past the instruction's dst write:
// send src0 / split-send src1 under the send src-dst overlap workaround,
// DPAS src1, or a potential dst/src overlap that must be avoided.
if ((builder.WaDisableSendSrcDstOverlap() &&
((curInst->isSend() && i == 0) ||
(curInst->isSplitSend() && i == 1))) ||
(curInst->isDpas() &&
i == 1) // For DPAS, as part of same instruction, src1 should
// not have overlap with dst. Src0 and src2 are okay to
// have overlap
|| (builder.avoidDstSrcOverlap() && curInst->getDst() != NULL &&
hasDstSrcOverlapPotential(curInst->getDst(),
src->asSrcRegRegion()))) {
#ifdef DEBUG_VERBOSE_ON
if (curInst->getDst() != NULL &&
hasDstSrcOverlapPotential(curInst->getDst(),
src->asSrcRegRegion())) {
curInst->dump();
}
#endif
lr->setLastRef(curInst, idx + 1);
} else {
lr->setLastRef(curInst, idx);
}
}
}
}
}
// Destinations are only considered for opcodes whose table entry declares
// exactly one dst, or for split intrinsics.
if (G4_Inst_Table[curInst->opcode()].n_dst == 1 ||
curInst->isSplitIntrinsic()) {
// Scan dst
G4_DstRegRegion *dst = curInst->getDst();
if (dst) {
topdcl = GetTopDclFromRegRegion(dst);
if (topdcl && (lr = gra.getLocalLR(topdcl))) {
// Check whether local LR is a candidate
if (lr->isLiveRangeLocal() && lr->isGRFRegAssigned() == false) {
unsigned int startIdx;
// First definition: open the interval and publish it to the caller.
if (lr->getFirstRef(startIdx) == NULL) {
lr->setFirstRef(curInst, idx);
liveIntervals.push_back(lr);
}
lr->setLastRef(curInst, idx);
}
}
}
}
}
}
void LocalRA::printAddressTakenDecls() {
DEBUG_VERBOSE("Address taken decls:"
<< "\n");
for (DECLARE_LIST_ITER dcl_it = kernel.Declares.begin();
dcl_it != kernel.Declares.end(); dcl_it++) {
G4_Declare *curDcl = (*dcl_it);
if (curDcl->getAliasDeclare() != NULL) {
vISA_ASSERT(gra.getLocalLR(curDcl) == NULL,
"Local LR found for alias declare");
continue;
}
if (gra.getLocalLR(curDcl) &&
gra.getLocalLR(curDcl)->hasIndirectAccess() == true) {
DEBUG_VERBOSE(curDcl->getName() << ", ");
}
}
DEBUG_VERBOSE("\n");
}
void LocalRA::printLocalRACandidates() {
DEBUG_VERBOSE("Local RA candidates:"
<< "\n");
for (DECLARE_LIST_ITER dcl_it = kernel.Declares.begin();
dcl_it != kernel.Declares.end(); dcl_it++) {
G4_Declare *curDcl = (*dcl_it);
if (curDcl->getAliasDeclare() != NULL) {
vISA_ASSERT(gra.getLocalLR(curDcl) == NULL,
"Local LR found for alias declare");
continue;
}
if (gra.getLocalLR(curDcl) && gra.getLocalLR(curDcl)->isLiveRangeLocal() &&
gra.getLocalLR(curDcl)->isGRFRegAssigned() == false) {
DEBUG_VERBOSE(curDcl->getName() << ", ");
}
}
DEBUG_VERBOSE("\n");
}
// Debug helper: prints one line per live interval of basic block `bb` in the
// form name(firstRefIdx, lastRefIdx, byteSize).
void LocalRA::printLocalLiveIntervals(
    G4_BB *bb, std::vector<LocalLiveRange *> &liveIntervals) {
  DEBUG_VERBOSE("Live intervals for bb " << bb->getId() << "\n");

  for (LocalLiveRange *interval : liveIntervals) {
    unsigned int firstIdx, lastIdx;
    interval->getFirstRef(firstIdx);
    interval->getLastRef(lastIdx);

    DEBUG_VERBOSE(interval->getTopDcl()->getName()
                  << "(" << firstIdx << ", " << lastIdx << ", "
                  << interval->getTopDcl()->getByteSize() << ")");
    DEBUG_VERBOSE("\n");
  }

  DEBUG_VERBOSE("\n");
}
void LocalRA::printInputLiveIntervals() {
DEBUG_VERBOSE("\n"
<< "Input Live intervals "
<< "\n");
for (auto it = inputIntervals.begin(); it != inputIntervals.end(); it++) {
[[maybe_unused]] unsigned int regWordIdx, lrEndIdx, regNum, subRegInWord;
InputLiveRange lr = *it;
regWordIdx = lr.getRegWordIdx();
regNum = regWordIdx / kernel.numEltPerGRF<Type_UW>();
subRegInWord = regWordIdx % kernel.numEltPerGRF<Type_UW>();
lrEndIdx = lr.getLrEndIdx();
DEBUG_VERBOSE("r" << regNum << "." << subRegInWord << " " << lrEndIdx);
DEBUG_VERBOSE("\n");
}
DEBUG_VERBOSE("\n");
}
// Classifies the given live intervals by the size of their top declare and
// prints the resulting histogram:
//   - more than one row                          -> "2 GRFs or more"
//   - one row, >1 element, size in (grf/2, grf]   -> "1 GRF"
//   - one row, >1 element, size <= grf/2          -> "Half GRF"
//   - one row, exactly one element                -> "Scalar"
// grfSize is the GRF size in the same unit as elemCount * elemSize.
void LocalRA::countLocalLiveIntervals(
    std::vector<LocalLiveRange *> &liveIntervals, unsigned grfSize) {
  unsigned int numScalars = 0;
  unsigned int numHalfGRF = 0;
  unsigned int numOneGRF = 0;
  unsigned int numTwoOrMoreGRF = 0;
  unsigned int numTotal = 0;

  for (LocalLiveRange *interval : liveIntervals) {
    G4_Declare *dcl = interval->getTopDcl();
    const unsigned rowCount = dcl->getNumRows();
    const unsigned byteSize = dcl->getNumElems() * dcl->getElemSize();
    ++numTotal;

    if (rowCount > 1) {
      ++numTwoOrMoreGRF;
      continue;
    }

    const unsigned halfGRF = grfSize / 2u;
    if (dcl->getNumElems() > 1 && byteSize > halfGRF && byteSize <= grfSize) {
      ++numOneGRF;
    } else if (dcl->getNumElems() > 1 && byteSize <= halfGRF) {
      ++numHalfGRF;
    } else if (dcl->getNumElems() == 1) {
      ++numScalars;
    } else {
      vISA_ASSERT_UNREACHABLE("Unknown size");
    }
  }

  printLocalLiveIntervalDistribution(numScalars, numHalfGRF, numOneGRF,
                                     numTwoOrMoreGRF, numTotal);
}
// Debug helper: prints the live-range size histogram.
//
// Parameters (all are counts of live ranges):
//   numScalars      - ranges reported under "Scalar"
//   numHalfGRF      - ranges reported under "Half GRF"
//   numOneGRF       - ranges reported under "1 GRF"
//   numTwoOrMoreGRF - ranges reported under "2 GRFs or more"
//   numTotal        - total number of ranges
void LocalRA::printLocalLiveIntervalDistribution(unsigned int numScalars,
unsigned int numHalfGRF,
unsigned int numOneGRF,
unsigned int numTwoOrMoreGRF,
unsigned int numTotal) {
DEBUG_VERBOSE("In units of num of live ranges:"
<< "\n");
DEBUG_VERBOSE("Total local live ranges: " << numTotal << "\n");
DEBUG_VERBOSE("2 GRFs or more: " << numTwoOrMoreGRF << "\n");
DEBUG_VERBOSE("1 GRF: " << numOneGRF << "\n");
DEBUG_VERBOSE("Half GRF: " << numHalfGRF << "\n");
DEBUG_VERBOSE("Scalar: " << numScalars << "\n");
DEBUG_VERBOSE("\n");
}
// return false if roundrobin RA should be disabled
bool LocalRA::countLiveIntervals() {
int globalRows = 0;
uint32_t localRows = 0;
for (auto curDcl : kernel.Declares) {
LocalLiveRange *curDclLR = nullptr;
if (curDcl->getAliasDeclare() == NULL && curDcl->getRegFile() == G4_GRF &&
(curDclLR = gra.getLocalLR(curDcl)) &&
curDclLR->isGRFRegAssigned() == false) {
if (curDclLR->isLiveRangeGlobal()) {
globalRows += curDcl->getNumRows();
} else if (curDclLR->isLiveRangeLocal()) {
localRows += curDcl->getNumRows();
}
}
}
int scaledUniqueAssign = kernel.getScaledGRFSize(NUM_PREGS_FOR_UNIQUE_ASSIGN);
if (globalRows <= scaledUniqueAssign) {
globalLRSize = globalRows;
} else {
if (localRows < (numRegLRA - scaledUniqueAssign)) {
globalLRSize = scaledUniqueAssign;
} else {
return false;
}
}
return true;
}
// ********* LocalLiveRange class implementation *********
// Notes a reference to this live range from basic block `bb`. The reference
// counter numRefsInFG is bumped only when `bb` differs from the block of the
// previously recorded reference; `bb` then becomes the remembered block.
void LocalLiveRange::recordRef(G4_BB *bb) {
  const bool differentBlock = (prevBBRef != bb);
  prevBBRef = bb;
  if (differentBlock) {
    ++numRefsInFG;
  }
}
// A live range is local when it is not accessed indirectly, has exactly one
// recorded block reference, is not an EOT payload, is neither a pre-defined
// argument nor a pre-defined return variable, and is not an output.
bool LocalLiveRange::isLiveRangeLocal() const {
  if (isIndirectAccess != false || numRefsInFG != 1 || eot != false) {
    return false;
  }
  if (builder.isPreDefArg(topdcl) || builder.isPreDefRet(topdcl)) {
    return false;
  }
  return topdcl->isOutput() == false;
}
// A live range is global when it is accessed indirectly, or has more than one
// recorded block reference while not being an EOT payload, or is an output.
bool LocalLiveRange::isLiveRangeGlobal() {
  const bool multiBlockNonEOT = (numRefsInFG > 1 && eot == false);
  return isIndirectAccess == true || multiBlockNonEOT ||
         topdcl->isOutput() == true;
}
bool LocalLiveRange::isGRFRegAssigned() {
vISA_ASSERT(topdcl != NULL, "Top dcl not set");
G4_RegVar *rvar = topdcl->getRegVar();
bool isPhyRegAssigned = false;
if (rvar) {
if (rvar->isPhyRegAssigned())
isPhyRegAssigned = true;
}
return isPhyRegAssigned;
}
unsigned int LocalLiveRange::getSizeInWords() {
int nrows = getTopDcl()->getNumRows();
int elemsize = getTopDcl()->getElemSize();
int nelems = getTopDcl()->getNumElems();
int words = 0;
if (nrows > 1) {
// If sizeInWords is set, use it otherwise consider entire row reserved
unsigned int sizeInWords = getTopDcl()->getWordSize();
if (sizeInWords > 0)
words = sizeInWords;
else
words = nrows * builder.numEltPerGRF<Type_UW>();
} else if (nrows == 1) {
int nbytesperword = 2;
words = (nelems * elemsize + 1) / nbytesperword;
}
return words;
}
// ********* PhyRegsLocalRA class implementation *********
// Marks every word of GRF `which` busy. In two-bank mode the available
// register count of the bank containing `which` is decremented. Asserts that
// the register is available.
void PhyRegsLocalRA::setGRFBusy(int which) {
  vISA_ASSERT(isGRFAvailable(which), "Invalid register");

  // One bit per word of the register: (GRF size in bytes / 2) low bits set.
  const uint64_t allWordsBusy = (1ULL << (builder.getGRFSize() / 2)) - 1;
  regBusyVector[which] = (uint32_t)allWordsBusy;

  if (!twoBanksRA) {
    return;
  }
  if (which < SECOND_HALF_BANK_START_GRF) {
    bank1AvailableRegNum--;
  } else {
    bank2AvailableRegNum--;
  }
}
// ********* PhyRegsLocalRA class implementation *********
// Marks `howmany` consecutive GRFs, starting at `which`, entirely busy.
void PhyRegsLocalRA::setGRFBusy(int which, int howmany) {
  int offset = 0;
  while (offset < howmany) {
    setGRFBusy(which + offset);
    ++offset;
  }
}
// Marks a single word of GRF `whichgrf` busy. In two-bank mode, if the GRF
// had no busy word before this call, the available register count of its bank
// is decremented. Asserts that the register is available and the word index
// is within the GRF.
void PhyRegsLocalRA::setWordBusy(int whichgrf, int word) {
  vISA_ASSERT(isGRFAvailable(whichgrf), "Invalid register");
  vISA_ASSERT(word < (int)builder.numEltPerGRF<Type_UW>(), "Invalid word");

  // The GRF stops being fully free with this call.
  if (twoBanksRA && regBusyVector[whichgrf] == 0) {
    if (whichgrf < SECOND_HALF_BANK_START_GRF) {
      bank1AvailableRegNum--;
    } else {
      bank2AvailableRegNum--;
    }
  }

  regBusyVector[whichgrf] |= (WORD_BUSY << word);
}
// Marks `howmany` consecutive words of GRF `whichgrf`, starting at `word`,
// busy.
void PhyRegsLocalRA::setWordBusy(int whichgrf, int word, int howmany) {
  int offset = 0;
  while (offset < howmany) {
    setWordBusy(whichgrf, word + offset);
    ++offset;
  }
}
// Marks every word of GRF `which` free. In two-bank mode the bank's last-use
// sum is adjusted (old last-use value out, instID in) and its available
// register count is incremented. A non-zero instID is stored as the
// register's new last-use value. Asserts that the register is available.
void PhyRegsLocalRA::setGRFNotBusy(int which, int instID) {
  vISA_ASSERT(isGRFAvailable(which), "Invalid register");

  regBusyVector[which] = 0;

  if (twoBanksRA) {
    const bool inFirstBank = which < SECOND_HALF_BANK_START_GRF;
    if (inFirstBank) {
      lastUseSum1 -= regLastUse[which];
      lastUseSum1 += instID;
      bank1AvailableRegNum++;
    } else {
      lastUseSum2 -= regLastUse[which];
      lastUseSum2 += instID;
      bank2AvailableRegNum++;
    }
  }

  // Must come after the sums above, which still need the old value.
  if (instID != 0) {
    regLastUse[which] = instID;
  }
}
// Marks a single word of GRF `whichgrf` free. In two-bank mode the bank's
// last-use sum is adjusted (old last-use value out, instID in). A non-zero
// instID is stored as the register's new last-use value. Asserts that the
// register is available and the word index is within the GRF.
//
// NOTE(review): the bank's available-register counter is incremented when
// regBusyVector[whichgrf] is zero BEFORE the word's bit is cleared below,
// i.e. when the GRF was already fully free on entry. setWordBusy decrements
// the counter on the free -> busy transition, so the mirror image would test
// after clearing. Confirm whether the current order is intended before
// changing it; the counters feed bank-selection decisions.
void PhyRegsLocalRA::setWordNotBusy(int whichgrf, int word, int instID) {
vISA_ASSERT(isGRFAvailable(whichgrf), "Invalid register");
vISA_ASSERT(word < (int)builder.numEltPerGRF<Type_UW>(), "Invalid word");
if (twoBanksRA) {
if (whichgrf < SECOND_HALF_BANK_START_GRF) {
lastUseSum1 -= regLastUse[whichgrf];
lastUseSum1 += instID;
if (regBusyVector[whichgrf] == 0) {
bank1AvailableRegNum++;
}
} else {
lastUseSum2 -= regLastUse[whichgrf];
lastUseSum2 += instID;
if (regBusyVector[whichgrf] == 0) {
bank2AvailableRegNum++;
}
}
}
// Clear only this word's busy bit.
uint32_t mask = ~(1 << word);
regBusyVector[whichgrf] &= mask;
// Updated last, because the sums above need the previous last-use value.
if (instID) {
regLastUse[whichgrf] = instID;
}
}
// Returns true if the given word of GRF `whichgrf` is marked busy. Asserts
// that the register is available and the word index is within the GRF.
inline bool PhyRegsLocalRA::isWordBusy(int whichgrf, int word) {
  vISA_ASSERT(isGRFAvailable(whichgrf), "Invalid register");
  vISA_ASSERT(word < (int)builder.numEltPerGRF<Type_UW>(), "Invalid word");

  return (regBusyVector[whichgrf] & (WORD_BUSY << word)) != 0;
}
// Returns true if any of the `howmany` consecutive words of GRF `whichgrf`
// starting at `word` is busy. Stops at the first busy word; returns false
// when howmany <= 0.
bool PhyRegsLocalRA::isWordBusy(int whichgrf, int word, int howmany) {
  for (int offset = 0; offset < howmany; ++offset) {
    if (isWordBusy(whichgrf, word + offset)) {
      return true;
    }
  }
  return false;
}
// Searches upwards from regIdx for `nrows` consecutive GRFs that can hold a
// multi-row variable.
//
// Parameters:
//   regIdx          - first GRF to consider
//   align           - bank alignment requested for the start register
//   regnum          - [out] start GRF of the run, written only on success
//   nrows           - number of GRF rows needed
//   lastRowSize     - words used in the last row; if not a multiple of the
//                     words per GRF, the last row is only partially needed
//   endReg          - last allowed START register; the scan itself may read
//                     up to endReg + nrows - 1
//   occupiedBundles - passed through to findBundleConflictFreeRegister
//   instID          - current position, used for the hybrid-allocation window
//   isHybridAlloc   - when true a GRF also qualifies only if it was never
//                     used (regLastUse == 0) or its last use is at least
//                     LraFFWindowSize away, measured as (instID - lastUse) / 2
//   forbidden       - GRFs that must not be used
// Returns true if a run was found. This function does not mark anything busy.
bool PhyRegsLocalRA::findFreeMultipleRegsForward(
int regIdx, BankAlign align, int &regnum, int nrows, int lastRowSize,
int endReg, unsigned short occupiedBundles, int instID, bool isHybridAlloc,
std::unordered_set<unsigned int> &forbidden) {
int foundItem = 0;
int startReg = 0;
int i = regIdx;
int grfRows = 0;
bool multiSteps = nrows > 1;
// grfRows = number of rows that must be completely free.
if (lastRowSize % builder.numEltPerGRF<Type_UW>() == 0) {
grfRows = nrows;
} else {
grfRows = nrows - 1;
}
// presumably advances i to the next register satisfying `align` (the result
// is consumed through i) — TODO confirm against the helper.
findRegisterCandiateWithAlignForward(i, align, multiSteps);
i = findBundleConflictFreeRegister(i, endReg, occupiedBundles, align,
multiSteps);
startReg = i;
while (i <= endReg + nrows - 1) {
// A fully free, allowed GRF extends the current run.
if (isGRFAvailable(i) && forbidden.find(i) == forbidden.end() &&
regBusyVector[i] == 0 &&
(!isHybridAlloc ||
(((instID - regLastUse[i]) / 2 >= LraFFWindowSize) ||
(regLastUse[i] == 0)))) {
foundItem++;
} else if (foundItem < grfRows) {
// Run broken before it was long enough: restart from the next candidate.
foundItem = 0;
i++;
findRegisterCandiateWithAlignForward(i, align, multiSteps);
i = findBundleConflictFreeRegister(i, endReg, occupiedBundles, align,
multiSteps);
startReg = i;
continue;
}
if (foundItem == grfRows) {
if (lastRowSize % builder.numEltPerGRF<Type_UW>() == 0) {
regnum = startReg;
return true;
} else {
// The partially used last row only needs its first lastRowSize words free.
if (i + 1 <= endReg + nrows - 1 && isGRFAvailable(i + 1) &&
forbidden.find(i + 1) == forbidden.end() &&
(isWordBusy(i + 1, 0, lastRowSize) == false) &&
(!isHybridAlloc ||
(((instID - regLastUse[i + 1]) / 2 >= LraFFWindowSize) ||
(regLastUse[i + 1] == 0)))) {
regnum = startReg;
return true;
} else {
foundItem = 0;
i++;
findRegisterCandiateWithAlignForward(i, align, multiSteps);
i = findBundleConflictFreeRegister(i, endReg, occupiedBundles, align,
multiSteps);
startReg = i;
continue;
}
}
}
i++;
}
return false;
}
// Searches downwards for `nrows` consecutive GRFs that can hold a multi-row
// variable. Each candidate run is still checked in ascending order (i++);
// when a candidate fails, the start moves down by nrows and is re-aligned.
//
// Parameters:
//   regIdx        - start register of the first candidate run
//   align         - bank alignment requested for the start register
//   regnum        - [out] start GRF of the run, written only on success
//   nrows         - number of GRF rows needed
//   lastRowSize   - words used in the last row; if not a multiple of the
//                   words per GRF, the last row is only partially needed
//   endReg        - lower bound of the scan (loop runs while i >= endReg)
//   instID        - current position, used for the hybrid-allocation window
//   isHybridAlloc - when true a GRF also qualifies only if it was never used
//                   (regLastUse == 0) or its last use is at least
//                   LraFFWindowSize away, measured as (instID - lastUse) / 2
//   forbidden     - GRFs that must not be used
// Returns true if a run was found. This function does not mark anything busy.
bool PhyRegsLocalRA::findFreeMultipleRegsBackward(
int regIdx, BankAlign align, int &regnum, int nrows, int lastRowSize,
int endReg, int instID, bool isHybridAlloc,
std::unordered_set<unsigned int> &forbidden) {
int foundItem = 0;
int startReg = 0;
int grfRows = 0;
int i = regIdx;
bool multiSteps = nrows > 1;
// grfRows = number of rows that must be completely free.
if (lastRowSize % builder.numEltPerGRF<Type_UW>() == 0) {
grfRows = nrows;
} else {
grfRows = nrows - 1;
}
// presumably moves i down to the nearest register satisfying `align` (the
// result is consumed through i) — TODO confirm against the helper.
findRegisterCandiateWithAlignBackward(i, align, multiSteps);
startReg = i;
while (i >= endReg && i >= 0) {
if (isGRFAvailable(i) && forbidden.find(i) == forbidden.end() &&
regBusyVector[i] == 0 &&
(!isHybridAlloc || (((instID - regLastUse[i]) / 2 >= LraFFWindowSize) ||
(regLastUse[i] == 0)))) {
foundItem++;
} else if (foundItem < grfRows) {
// Candidate failed: step the start down by a whole variable and retry.
foundItem = 0;
i -= nrows;
findRegisterCandiateWithAlignBackward(i, align, multiSteps);
startReg = i;
continue;
}
if (foundItem == grfRows) {
if (lastRowSize % builder.numEltPerGRF<Type_UW>() == 0) {
regnum = startReg;
return true;
} else {
// NOTE(review): endReg is the LOWER bound of this scan and the loop
// guarantees i >= endReg, so `i + 1 <= endReg` cannot hold here; as written
// a partially used last row never succeeds in the backward search. Confirm
// whether a different upper bound was intended.
if (i + 1 <= endReg && isGRFAvailable(i + 1) &&
forbidden.find(i + 1) == forbidden.end() &&
(isWordBusy(i + 1, 0, lastRowSize) == false) &&
(!isHybridAlloc ||
(((instID - regLastUse[i + 1]) / 2 >= LraFFWindowSize) ||
(regLastUse[i + 1] == 0)))) {
regnum = startReg;
return true;
} else {
foundItem = 0;
i -= nrows;
findRegisterCandiateWithAlignBackward(i, align, multiSteps);
startReg = i;
continue;
}
}
}
i++;
}
return false;
}
// Searches for a single GRF that has `size` free words at a sub-register
// position compatible with `subalign`, scanning from regIdx towards endReg
// (inclusive) in the direction given by `forward`.
//
// Parameters:
//   regIdx        - first GRF to consider
//   size          - number of words needed
//   align         - bank alignment filter applied to the GRF number
//   subalign      - sub-register alignment, forwarded to the per-GRF search
//   regnum        - [out] GRF found, written only on success
//   subregnum     - [out] word offset inside that GRF, written only on success
//   endReg        - last GRF to consider
//   instID        - current position, used for the hybrid-allocation window
//   isHybridAlloc - when true a GRF qualifies only if it was never used
//                   (regLastUse == 0) or its last use is at least
//                   LraFFWindowSize away, measured as (instID - lastUse) / 2
//   forward       - true to scan upwards, false to scan downwards
//   forbidden     - GRFs that must not be used
// Returns true if a slot was found. This function does not mark anything busy.
bool PhyRegsLocalRA::findFreeSingleReg(
int regIdx, int size, BankAlign align, G4_SubReg_Align subalign,
int &regnum, int &subregnum, int endReg, int instID, bool isHybridAlloc,
bool forward, std::unordered_set<unsigned int> &forbidden) {
int i = regIdx;
bool found = false;
while (!found) {
// Stop once the scan has moved past endReg in its direction of travel.
if (forward) {
if (i > endReg) //<= works
break;
} else {
if (i < endReg) //>= works
break;
}
// Align GRF
// Even/Odd select by parity; QuadGRF requires a multiple of 4; Even2GRF
// accepts i % 4 in {0, 1}; Odd2GRF accepts i % 4 in {2, 3}.
if ((align == BankAlign::Even) && (i % 2 != 0)) {
i += forward ? 1 : -1;
continue;
} else if ((align == BankAlign::Odd) && (i % 2 == 0)) {
i += forward ? 1 : -1;
continue;
} else if ((align == BankAlign::QuadGRF) && (i % 4 != 0)) {
i += forward ? (4 - (i % 4)) : -(i % 4);
continue;
} else if ((align == BankAlign::Even2GRF) && ((i % 4 >= 2))) {
i += forward ? 1 : -1;
continue;
} else if ((align == BankAlign::Odd2GRF) && ((i % 4 < 2))) {
i += forward ? 1 : -1;
continue;
}
if (isGRFAvailable(i, 1) && forbidden.find(i) == forbidden.end() &&
(!isHybridAlloc || (((instID - regLastUse[i]) / 2 >= LraFFWindowSize) ||
(regLastUse[i] == 0)))) {
// Look for a suitably aligned run of free words inside this GRF.
found = findFreeSingleReg(i, subalign, regnum, subregnum, size);
if (found) {
return true;
}
}
i += forward ? 1 : -1;
}
return false;
}
// Debug helper: prints "r<grf>.<word>:w, " for every busy word of every
// available GRF.
void PhyRegsLocalRA::printBusyRegs() {
  const int grfCount = (int)numRegs;
  for (int grf = 0; grf < grfCount; ++grf) {
    if (isGRFAvailable(grf) == false) {
      continue;
    }

    for (int word = 0; word < (int)builder.numEltPerGRF<Type_UW>(); ++word) {
      if (isWordBusy(grf, word) != true) {
        continue;
      }
      DEBUG_VERBOSE("r" << grf << "." << word << ":w, "
                        << "\n");
    }
  }
}
// Based on RegVar, mark available registers as busy
//
// Does nothing if the declare's RegVar has no physical register or is not a
// GRF. Single-row declares are marked word by word starting at their
// sub-register offset (spilling into following GRFs if they cross a GRF
// boundary); multi-row declares have each of their rows marked entirely busy.
// Registers that are not available are left untouched.
void PhyRegsLocalRA::markPhyRegs(G4_Declare *topdcl) {
G4_RegVar *rvar = topdcl->getRegVar();
unsigned numrows = topdcl->getNumRows();
unsigned numwords = 0;
unsigned int regnum = 0;
if (!rvar->getPhyReg() || !rvar->isGreg()) {
return;
}
// Calculate number of physical registers required by this dcl
if (numrows == 1) {
// Byte size rounded up to whole words.
unsigned numbytes = topdcl->getElemSize() * topdcl->getNumElems();
numwords = (numbytes + 1) / 2;
unsigned int subReg = rvar->getPhyRegOff();
// Sub-register offset converted from elements to words.
unsigned int subRegInWord = subReg * rvar->getDeclare()->getElemSize() / 2;
regnum = rvar->getPhyReg()->asGreg()->getRegNum();
if (isGRFAvailable(regnum) == true) {
for (unsigned int i = 0; i < numwords; i++) {
// In case across one GRF
// Move to the next GRF and restart counting from its word 0: the words
// already marked are subtracted from numwords and i is reset. Stops early
// if that next GRF is not available.
if (subRegInWord + i >= builder.numEltPerGRF<Type_UW>()) {
regnum++;
if (!isGRFAvailable(regnum)) {
break;
}
numwords -= i;
i = 0;
subRegInWord = 0;
}
// Starting from first word, mark each consecutive word busy
setWordBusy(regnum, (subRegInWord + i));
}
}
} else {
regnum = rvar->getPhyReg()->asGreg()->getRegNum();
for (unsigned int i = 0; i < topdcl->getNumRows(); i++) {
if (isGRFAvailable(i + regnum) == true) {
// Set entire GRF busy
setGRFBusy(regnum + i);
}
}
}
}
// ********* PhyRegsManager class implementation *********
// Looks inside GRF `regIdx` for `size` consecutive free words starting at an
// offset permitted by `subalign`.
//
// Parameters:
//   regIdx    - GRF to inspect
//   subalign  - required sub-register alignment
//   regnum    - [out] set to regIdx on success
//   subregnum - [out] word offset of the slot on success
//   size      - number of words needed
// Returns true if a slot was found. Nothing is marked busy here.
bool PhyRegsLocalRA::findFreeSingleReg(int regIdx, G4_SubReg_Align subalign,
int &regnum, int &subregnum, int size) {
bool found = false;
if (subalign == builder.getGRFAlign()) {
// GRF aligned: only offset 0 is acceptable.
if (isWordBusy(regIdx, 0, size) == false) {
subregnum = 0;
found = true;
}
} else if (subalign == builder.getHalfGRFAlign()) {
// Half-GRF aligned: try offset 0, then the middle of the GRF if the
// request fits in half a GRF.
if (isWordBusy(regIdx, 0, size) == false) {
subregnum = 0;
found = true;
} else if (size <= (int)builder.numEltPerGRF<Type_UW>() / 2 &&
isWordBusy(regIdx, builder.numEltPerGRF<Type_UW>() / 2, size) ==
false) {
subregnum = builder.numEltPerGRF<Type_UW>() / 2;
found = true;
}
} else {
// ToDo: check if dynamic step size has compile time impact
// Try offsets 0, step, 2*step, ... below upBound; upBound is chosen so the
// request does not run past the end of the GRF.
int step = 1;
int upBound = builder.numEltPerGRF<Type_UW>() - size + 1;
switch (subalign) {
case Eight_Word:
step = 8;
break;
case Four_Word:
step = 4;
break;
case Even_Word:
step = 2;
break;
case Any:
// For `Any` the bound is one smaller, so the placement that would end on
// the GRF's last word is never tried (see the FIXME below).
upBound = builder.numEltPerGRF<Type_UW>() -
size; // FIXME, why not the last word
step = 1;
break;
default:
vISA_ASSERT_UNREACHABLE("unexpected alignment");
}
for (int j = 0; j < upBound; j += step) {
if (!isWordBusy(regIdx, j, size)) {
subregnum = j;
found = true;
break;
}
}
}
if (found) {
regnum = regIdx;
}
return found;
}
// Finds free registers for a variable of `size` words and marks them busy.
//
// The scan runs forward when startRegNum <= endRegNum and backward otherwise;
// the scan bounds are adjusted by the row count so that the whole variable
// fits inside the requested range. Requests of at least one full GRF use the
// multi-register search and always get sub-register 0; smaller requests are
// placed inside a single GRF.
//
// On success regnum/subregnum describe the slot (subregnum in words) and the
// number of rows is returned. On failure 0 is returned and nothing is marked.
int PhyRegsManager::findFreeRegs(int size, BankAlign align,
                                 G4_SubReg_Align subalign, int &regnum,
                                 int &subregnum, int startRegNum, int endRegNum,
                                 unsigned short occupiedBundles,
                                 unsigned int instID, bool isHybridAlloc,
                                 std::unordered_set<unsigned int> &forbidden) {
  int nrows = 0;
  int lastRowSize = 0;
  LocalRA::getRowInfo(size, nrows, lastRowSize, builder);

  const bool forward = startRegNum <= endRegNum;
  const int startReg = forward ? startRegNum : startRegNum - nrows + 1;
  const int endReg = forward ? endRegNum - nrows + 1 : endRegNum;

  // Sub-GRF request: a single register is enough.
  if (!(size >= (int)builder.numEltPerGRF<Type_UW>())) {
    const bool placed = availableRegs.findFreeSingleReg(
        startReg, size, align, subalign, regnum, subregnum, endReg, instID,
        isHybridAlloc, forward, forbidden);
    if (!placed) {
      return 0;
    }
    availableRegs.setWordBusy(regnum, subregnum, size);
    return nrows;
  }

  // One GRF or more.
  const bool placed =
      forward ? availableRegs.findFreeMultipleRegsForward(
                    startReg, align, regnum, nrows, lastRowSize, endReg,
                    occupiedBundles, instID, isHybridAlloc, forbidden)
              : availableRegs.findFreeMultipleRegsBackward(
                    startReg, align, regnum, nrows, lastRowSize, endReg,
                    instID, isHybridAlloc, forbidden);
  if (!placed) {
    return 0;
  }

  subregnum = 0;
  if (size % builder.numEltPerGRF<Type_UW>() == 0) {
    availableRegs.setGRFBusy(regnum, nrows);
  } else {
    // Whole rows first, then only the used words of the last row.
    availableRegs.setGRFBusy(regnum, nrows - 1);
    availableRegs.setWordBusy(regnum + nrows - 1, 0, lastRowSize);
  }
  return nrows;
}
// Releases `numwords` words starting at GRF `regnum`.
// subregnum parameter is expected to be in units of word
//
// As long as at least a full GRF worth of words remains, whole GRFs are
// released (subregnum is not consulted for these). The remaining words are
// released one at a time starting at `subregnum`, wrapping into the next GRF
// when the end of a register is reached. instID is passed on as the last-use
// position.
void PhyRegsManager::freeRegs(int regnum, int subregnum, int numwords,
                              int instID) {
  // Whole registers.
  for (; numwords >= (int)builder.numEltPerGRF<Type_UW>(); ++regnum) {
    availableRegs.setGRFNotBusy(regnum, instID);
    numwords -= builder.numEltPerGRF<Type_UW>();
  }

  // Leftover words.
  for (; numwords >= 1; --numwords) {
    availableRegs.setWordNotBusy(regnum, subregnum, instID);
    ++subregnum;
    if (subregnum >= (int)builder.numEltPerGRF<Type_UW>()) {
      subregnum = 0;
      ++regnum;
    }
  }
}
// ********* LinearScan class implementation *********
// Sets up a linear-scan allocation run.
//
// Besides storing its arguments, the constructor lays out two register banks:
//   - bank 1 starts at GRF 0 and ends below SECOND_HALF_BANK_START_GRF, with
//     globalLRSize / 2 registers held back at its top;
//   - bank 2 lies above SECOND_HALF_BANK_START_GRF, with
//     (globalLRSize + 1) / 2 registers held back at its bottom. With
//     round-robin it is walked from the middle to the back, otherwise from
//     the back to the middle (start > end).
// It then counts the available, non-busy GRFs of each half and stores the
// counts in the physical register state.
//
// When the kernel has fewer than 128 GRFs both banks are set to the full
// range [0, numRegLRA - 1] and the per-bank counting is skipped.
LinearScan::LinearScan(GlobalRA &g,
std::vector<LocalLiveRange *> &localLiveIntervals,
std::list<InputLiveRange> &inputLivelIntervals,
PhyRegsManager &pregMgr, PhyRegsLocalRA &pregs,
PhyRegSummary *s, unsigned int numReg, unsigned int glrs,
bool roundRobin, bool bankConflict,
bool internalConflict, bool splitLLR, unsigned int simdS)
: gra(g), builder(g.builder), pregManager(pregMgr), initPregs(pregs),
liveIntervals(localLiveIntervals), inputIntervals(inputLivelIntervals),
summary(s),
pregs(g.kernel.getNumRegTotal() * g.kernel.numEltPerGRF<Type_UW>(),
false),
simdSize(simdS), globalLRSize(glrs), numRegLRA(numReg),
useRoundRobin(roundRobin), doBankConflict(bankConflict),
highInternalConflict(internalConflict), doSplitLLR(splitLLR) {
// Note: in the initializer list above, the constructor PARAMETER `pregs` is
// stored in initPregs, while the MEMBER `pregs` is initialized with one
// `false` entry per word of the register file.
// FIXME: this entire class is a mess, whole thing needs a rewrite
if (g.kernel.getNumRegTotal() < 128) {
// code below can segfault if user specifies #GRF < 128. We just hack it so
// bank1 == bank2
bank1StartGRFReg = bank1_start = 0;
bank2StartGRFReg = bank2_start = 0;
bank1_end = bank2_end = numRegLRA - 1;
startGRFReg = &bank1StartGRFReg;
return;
}
// register number boundaries
bank1_start = 0;
bank1_end = SECOND_HALF_BANK_START_GRF - globalLRSize / 2 - 1;
if (useRoundRobin) { // From middle to back
bank2_start = SECOND_HALF_BANK_START_GRF + (globalLRSize + 1) / 2;
bank2_end = numRegLRA - 1;
} else { // From back to middle
bank2_start = numRegLRA - 1;
bank2_end = SECOND_HALF_BANK_START_GRF + (globalLRSize + 1) / 2;
}
// register number pointers
bank1StartGRFReg = bank1_start;
bank2StartGRFReg = bank2_start;
// register pointer
startGRFReg = &bank1StartGRFReg;
// Count the GRFs of the lower half that are available and not busy.
int bank1AvailableRegNum = 0;
for (int i = 0; i < SECOND_HALF_BANK_START_GRF; i++) {
if (pregManager.getAvailableRegs()->isGRFAvailable(i) &&
!pregManager.getAvailableRegs()->isGRFBusy(i)) {
bank1AvailableRegNum++;
}
}
pregManager.getAvailableRegs()->setBank1AvailableRegNum(bank1AvailableRegNum);
// Same for the upper half, up to numRegLRA.
int bank2AvailableRegNum = 0;
for (unsigned int i = SECOND_HALF_BANK_START_GRF; i < numRegLRA; i++) {
if (pregManager.getAvailableRegs()->isGRFAvailable(i) &&
!pregManager.getAvailableRegs()->isGRFBusy(i)) {
bank2AvailableRegNum++;
}
}
pregManager.getAvailableRegs()->setBank2AvailableRegNum(bank2AvailableRegNum);
}
// Linear scan implementation
//
// Processes the live intervals of basic block `bb` in the order they appear
// in liveIntervals. Before each interval is allocated, active ranges and
// input ranges that have ended by its first reference are expired. Allocation
// goes through the bank-aware allocator when bank-conflict handling applies,
// otherwise through the general allocator. Successfully allocated ranges
// join the active list; once all intervals are processed the remaining active
// ranges are expired.
void LinearScan::run(G4_BB *bb, IR_Builder &builder, LLR_USE_MAP &LLRUseMap) {
unsigned int idx = 0;
bool allocateRegResult = false;
// Lexical id of the first interval's first reference; passed to
// expireInputRanges as first_idx for every interval.
unsigned int firstLocalIdx = 0, firstGlobalIdx = 0;
if (liveIntervals.size() > 0) {
LocalLiveRange *firstLr = (*liveIntervals.begin());
G4_INST *firstInst = firstLr->getFirstRef(firstLocalIdx);
firstGlobalIdx = firstInst->getLexicalId();
}
for (auto lr : liveIntervals) {
// idx receives the BB-local position of this interval's first reference.
G4_INST *currInst = lr->getFirstRef(idx);
expireRanges(idx);
expireInputRanges(currInst->getLexicalId(), idx, firstGlobalIdx);
if (doBankConflict && builder.lowHighBundle() &&
(highInternalConflict ||
(simdSize >= 16 && builder.oneGRFBankDivision()))) {
allocateRegResult = allocateRegsFromBanks(lr);
} else {
allocateRegResult = allocateRegs(lr, bb, builder, LLRUseMap);
}
if (allocateRegResult) {
updateActiveList(lr);
#ifdef _DEBUG
// Debug-only: recompute the assigned register span for the sanity assert
// and the verbose trace below.
int startregnum, endregnum, startsregnum, endsregnum;
G4_VarBase *op;
op = lr->getPhyReg(startsregnum);
startregnum = endregnum = op->asGreg()->getRegNum();
endsregnum = startsregnum +
(lr->getTopDcl()->getNumElems() *
lr->getTopDcl()->getElemSize() / 2) -
1;
vISA_ASSERT(((unsigned int)startregnum + lr->getTopDcl()->getNumRows() -
1) < numRegLRA,
"Linear scan allocated unavailable physical GRF");
if (lr->getTopDcl()->getNumRows() > 1) {
endregnum = startregnum + lr->getTopDcl()->getNumRows() - 1;
// NOTE(review): the literal 15 below is the last word index of a 16-word
// GRF; it looks hard-coded rather than derived from numEltPerGRF — confirm
// for platforms with larger GRFs. It only affects the debug trace.
if (lr->getTopDcl()->getWordSize() > 0) {
endsregnum =
lr->getTopDcl()->getWordSize() % builder.numEltPerGRF<Type_UW>() -
1;
if (endsregnum < 0)
endsregnum = 15;
} else
endsregnum = 15; // last word in GRF
}
#ifdef DEBUG_VERBOSE_ON
DEBUG_VERBOSE("Assigned physical register to "
<< lr->getTopDcl()->getName() << " (r" << startregnum << "."
<< startsregnum << ":w - "
<< "r" << endregnum << "." << endsregnum << ":w)"
<< "\n");
#endif
#endif
}
}
expireAllActive();
}
// Inserts `lr` into the active list, keeping the list ordered by ascending
// last-reference index: lr goes in front of the first active range whose last
// reference is strictly later than lr's, or at the end if there is none.
void LinearScan::updateActiveList(LocalLiveRange *lr) {
  unsigned int newEnd;
  lr->getLastRef(newEnd);

  auto insertPos = active.begin();
  while (insertPos != active.end()) {
    unsigned int activeEnd;
    (*insertPos)->getLastRef(activeEnd);
    if (activeEnd > newEnd) {
      break;
    }
    insertPos++;
  }

  if (insertPos != active.end()) {
    active.insert(insertPos, lr);
  } else {
    active.push_back(lr);
  }
}
// Expires every range still on the active list by expiring up to the last
// reference of the list's final entry.
void LinearScan::expireAllActive() {
  if (!(active.size() > 0)) {
    return;
  }

  // Expire any remaining ranges
  unsigned int finalEnd;
  active.back()->getLastRef(finalEnd);
  expireRanges(finalEnd);
}
// idx is the current position being processed.
// Expires the active ranges whose last reference is at or before idx.
//
// Ranges are taken from the front of the active list, which
// updateActiveList keeps ordered by ascending last-reference index, so the
// loop stops at the first range that ends after idx. For each expired range
// that has a physical register, the assignment is committed to the declare's
// RegVar, the range is flagged as assigned and, if a summary is attached, the
// register is recorded there. Its registers are then released and the range
// is removed from the list.
void LinearScan::expireRanges(unsigned int idx) {
  while (active.size() > 0) {
    LocalLiveRange *lr = active.front();
    unsigned int endIdx;
    lr->getLastRef(endIdx);

    // As soon as we find first range that ends after idx break loop
    if (endIdx > idx) {
      break;
    }

    int subregnumword;
    G4_VarBase *preg = lr->getPhyReg(subregnumword);
    if (preg) {
      int subregnum =
          LocalRA::convertSubRegOffFromWords(lr->getTopDcl(), subregnumword);
      // Mark the RegVar object of dcl as assigned to physical register
      lr->getTopDcl()->getRegVar()->setPhyReg(preg, subregnum);
      lr->setAssigned(true);
      if (summary) {
        // mark physical register as used atleast once
        summary->markPhyRegs(preg, lr->getSizeInWords());
      }
    }

    // Free physical regs marked for this range
    freeAllocedRegs(lr, true);
#ifdef DEBUG_VERBOSE_ON
    DEBUG_VERBOSE("Expiring range " << lr->getTopDcl()->getName() << "\n");
#endif

    // Remove range from active list
    active.pop_front();
  }
}
void LinearScan::expireInputRanges(unsigned int global_idx,
unsigned int local_idx,
unsigned int first_idx) {
while (inputIntervals.size() > 0) {
InputLiveRange lr = inputIntervals.front();
unsigned int endIdx = lr.getLrEndIdx();
if (endIdx <= global_idx) {
unsigned int regnum =
lr.getRegWordIdx() / builder.numEltPerGRF<Type_UW>();
unsigned int subRegInWord =
lr.getRegWordIdx() % builder.numEltPerGRF<Type_UW>();
int inputIdx =
(endIdx < first_idx) ? 0 : (local_idx - (global_idx - endIdx) * 2);
// Free physical regs marked for this range
pregManager.freeRegs(regnum, subRegInWord, 1, inputIdx);
initPregs.setWordNotBusy(regnum, subRegInWord, 0);
#ifdef DEBUG_VERBOSE_ON
DEBUG_VERBOSE("Expiring input r" << regnum << "." << subRegInWord
<< "\n");
#endif
// Remove range from inputIntervals list
inputIntervals.pop_front();
} else {
// As soon as we find first range that ends after ids break loop
break;
}
}
}
// Try to assign physical GRFs to the local live range lr.
//   lr        - range to allocate; its first reference supplies the
//               instruction ID handed to pregManager.findFreeRegs().
//   bb        - basic block that receives the copy instructions created when
//               an evicted range is split.
//   builder   - IR builder used to create the declares, operands and movs of
//               a split.
//   LLRUseMap - maps a live range to its (instruction iterator, source
//               operand position) use points; consulted only when splitting.
// When no registers are free, the active list is searched from the back for
// a range that is at least as large as lr and whose last reference is later
// than lr's. That range loses its registers and is removed from the active
// list (optionally being split when doSplitLLR is set), and the allocation
// is retried recursively.
// Returns true if registers were found and recorded with lr->setPhyReg(),
// false if no allocation is possible (the debug message describes this
// outcome as lr being spilled).
bool LinearScan::allocateRegs(LocalLiveRange *lr, G4_BB *bb,
IR_Builder &builder, LLR_USE_MAP &LLRUseMap) {
int regnum, subregnum;
unsigned int localRABound = 0;
unsigned int instID;
// First instruction referencing lr; instID receives its index.
G4_INST *currInst = lr->getFirstRef(instID);
// Let local RA allocate only those ranges that need < 10 GRFs
// Larger ranges are not many and are best left to global RA
// as it can make a better judgment by considering the
// spill cost.
// NOTE(review): no size check is performed in this function; presumably the
// filtering described above happens in the caller - confirm.
int nrows = 0;
int size = lr->getSizeInWords();
G4_Declare *dcl = lr->getTopDcl();
G4_SubReg_Align subalign = gra.getSubRegAlign(dcl);
BankAlign preBank = BankAlign::Either;
// NOTE(review): preBank is read again below, so getOccupiedBundle presumably
// takes it by reference and may update it - confirm.
unsigned short occupiedBundles =
getOccupiedBundle(builder, gra, dcl, preBank);
localRABound = numRegLRA - globalLRSize -
1; //-1, localRABound will be counted in findFreeRegs()
// Explicit quad/even alignment of the declare wins. Otherwise use the bank
// alignment from gra when doBankConflict is set, else fall back to preBank.
BankAlign bankAlign = gra.isQuadAligned(dcl) ? BankAlign::QuadGRF
: gra.isEvenAligned(dcl) ? BankAlign::Even
: BankAlign::Either;
if (bankAlign == BankAlign::Either && doBankConflict) {
bankAlign = gra.getBankAlign(dcl);
} else {
if (bankAlign == BankAlign::Either) {
bankAlign = preBank;
}
}
// Search [*startGRFReg, localRABound]. Round-robin mode does one search with
// the second-to-last argument false; otherwise a search with that argument
// true is tried first and the false variant is the fallback.
// NOTE(review): allocateRegsFromBanks refers to the false variant as
// "without window" - confirm the exact meaning of this flag.
if (useRoundRobin) {
nrows = pregManager.findFreeRegs(
size, bankAlign, subalign, regnum, subregnum, *startGRFReg,
localRABound, occupiedBundles, instID, false, lr->getForbidden());
} else {
nrows = pregManager.findFreeRegs(
size, bankAlign, subalign, regnum, subregnum, *startGRFReg,
localRABound, occupiedBundles, instID, true, lr->getForbidden());
if (!nrows) {
nrows = pregManager.findFreeRegs(
size, bankAlign, subalign, regnum, subregnum, *startGRFReg,
localRABound, occupiedBundles, instID, false, lr->getForbidden());
}
}
// Round-robin bookkeeping: on success the next search starts right after the
// rows just taken (wrapping at localRABound); on failure the search wraps
// around and covers GRF 0 up to the old start position.
if (useRoundRobin) {
if (nrows) {
*startGRFReg = (regnum + nrows) % (localRABound);
} else {
// nrows is 0 on this path, so endGRFReg equals *startGRFReg before the
// clamp to localRABound.
unsigned int endGRFReg = *startGRFReg + nrows;
endGRFReg = (endGRFReg > localRABound) ? localRABound : endGRFReg;
nrows = pregManager.findFreeRegs(size, bankAlign, subalign, regnum,
subregnum, 0, endGRFReg, occupiedBundles,
instID, false, lr->getForbidden());
if (nrows) {
*startGRFReg = (regnum + nrows) % (localRABound);
}
}
}
// No free registers: try to evict an active range, otherwise return false
if (!nrows) {
// Check if any range in active is longer and wider than lr.
// Spill it if it is and call self again.
// The list is walked from the back, i.e. starting with the entries that
// updateActiveList() placed last (latest last reference).
for (auto active_it = active.rbegin(); active_it != active.rend();
active_it++) {
LocalLiveRange *activeLR = (*active_it);
// Candidate must be at least as large as lr; with splitting enabled it
// must also have more than SPLIT_REF_CNT_THRESHOLD references.
if (activeLR->getSizeInWords() >= lr->getSizeInWords() &&
(!doSplitLLR ||
gra.getNumRefs(activeLR->getTopDcl()) > SPLIT_REF_CNT_THRESHOLD)) {
// Another active range is larger than lr
unsigned int active_lr_end, lr_end;
activeLR->getLastRef(active_lr_end);
lr->getLastRef(lr_end);
if (active_lr_end > lr_end) {
// Range in active is longer, so spill it
#ifdef DEBUG_VERBOSE_ON
DEBUG_VERBOSE("Spilling range " << activeLR->getTopDcl()->getName()
<< "\n");
#endif
freeAllocedRegs(activeLR, false);
// base() of a reverse iterator points one past the element it refers to,
// hence the decrement. The iterator is not used again: this branch always
// ends in the return below.
active.erase(--(active_it.base()));
if (doSplitLLR) {
G4_Declare *oldDcl = activeLR->getTopDcl();
unsigned short totalElems = oldDcl->getTotalElems();
std::vector<std::pair<INST_LIST_ITER, unsigned int>> *useList =
nullptr;
auto Iter = LLRUseMap.find(activeLR);
if (Iter != LLRUseMap.end())
useList = &(Iter->second);
// Split only when there are more than two recorded uses, the reference
// count exceeds the use count by exactly one (presumably a single
// definition - confirm), and the declare has 8, 16 or 32 elements.
if (useList && useList->size() > 2 &&
gra.getNumRefs(oldDcl) - useList->size() == 1 &&
(totalElems == 8 || totalElems == 16 || totalElems == 32)) {
bool split = false;
unsigned int useCnt = 0;
INST_LIST_ITER lastUseIt;
G4_Declare *newDcl = NULL;
for (const auto &usePoint : *useList) {
INST_LIST_ITER useIt = usePoint.first;
G4_INST *useInst = *useIt;
useCnt++;
unsigned int currId = currInst->getLexicalId();
// Uses that precede lr's first reference, and the very first use
// (useCnt < SPLIT_USE_CNT_THRESHOLD), only advance lastUseIt. The first
// iteration always takes this branch, so lastUseIt is set before it is
// dereferenced below.
if (useInst->getLexicalId() < currId ||
useCnt < SPLIT_USE_CNT_THRESHOLD) {
lastUseIt = useIt;
} else if (split == false &&
(useInst->getLexicalId() -
(*lastUseIt)->getLexicalId()) >
SPLIT_USE_DISTANCE_THRESHOLD &&
!(oldDcl->getElemSize() >= 8 &&
oldDcl->getTotalElems() >= 16)) {
// First use that is more than SPLIT_USE_DISTANCE_THRESHOLD lexical IDs
// away from the previous use: split here. Declares with element size
// >= 8 and at least 16 elements are excluded (see FIXME below).
G4_Declare *splitDcl = NULL;
const char *splitDclName =
builder.getNameString(16, "split_%s", oldDcl->getName());
splitDcl = builder.createDeclare(
splitDclName, G4_GRF, oldDcl->getNumElems(),
oldDcl->getNumRows(), oldDcl->getElemType());
splitDcl->copyAlign(oldDcl);
LocalLiveRange *splitLR =
gra.GetOrCreateLocalLiveRange(splitDcl);
splitLR->markSplit();
// The copy oldDcl -> splitDcl is inserted right after the last use.
INST_LIST_ITER iter = lastUseIt;
iter++;
// FIXME: The following code does not handle QWord variable
// spilling completely. See the last condition of this
// if-stmt.
G4_DstRegRegion *dst =
builder.createDstRegRegion(splitDcl, 1);
G4_SrcRegRegion *src = builder.createSrcRegRegion(
oldDcl, builder.getRegionStride1());
// Execution size is capped at 16; 32-element declares get a second mov
// below.
G4_ExecSize splitInstExecSize(oldDcl->getTotalElems() > 16
? 16
: oldDcl->getTotalElems());
G4_INST *splitInst = builder.createMov(
splitInstExecSize, dst, src, InstOpt_WriteEnable, false);
bb->insertBefore(iter, splitInst);
// NOTE(review): idx stays 0 for the reference updates below; no index is
// computed for the newly inserted instructions in this function.
unsigned int idx = 0;
gra.getLocalLR(oldDcl)->setLastRef(splitInst, idx);
gra.setNumRefs(oldDcl, useCnt);
splitLR->setFirstRef(splitInst, idx);
gra.setNumRefs(splitDcl, 2);
if (totalElems == 32) {
// Second SIMD16 mov covering the remaining elements, addressed at
// register offset 2 of both declares.
G4_DstRegRegion *dst = builder.createDst(
splitDcl->getRegVar(), 2, 0, 1, oldDcl->getElemType());
auto src = builder.createSrc(oldDcl->getRegVar(), 2, 0,
builder.getRegionStride1(),
oldDcl->getElemType());
G4_INST *splitInst2 = builder.createMov(
g4::SIMD16, dst, src, InstOpt_WriteEnable, false);
bb->insertBefore(iter, splitInst2);
}
// newDcl ("copy_<name>") replaces oldDcl at this use and every later one;
// it is filled from splitDcl by a mov inserted immediately before this use.
const char *newDclName =
builder.getNameString(16, "copy_%s", oldDcl->getName());
newDcl = builder.createDeclare(
newDclName, G4_GRF, oldDcl->getNumElems(),
oldDcl->getNumRows(), oldDcl->getElemType());
newDcl->copyAlign(oldDcl);
unsigned int oldRefs = gra.getNumRefs(oldDcl);
LocalLiveRange *newLR = gra.GetOrCreateLocalLiveRange(newDcl);
iter = useIt;
dst = builder.createDstRegRegion(newDcl, 1);
src = builder.createSrcRegRegion(splitDcl,
builder.getRegionStride1());
G4_INST *movInst = builder.createMov(
G4_ExecSize(splitDcl->getTotalElems() > 16
? 16
: splitDcl->getTotalElems()),
dst, src, InstOpt_WriteEnable, false);
bb->insertBefore(iter, movInst);
splitLR->setLastRef(movInst, idx);
// NOTE(review): if gra.getLocalLR(oldDcl) is activeLR, its last reference
// was already replaced by splitInst above - confirm that this returns the
// intended instruction.
G4_INST *old_last_use = activeLR->getLastRef(idx);
// NOTE(review): newDcl is first written by movInst, yet splitInst is
// recorded as its first reference - confirm this is intended.
newLR->setFirstRef(splitInst, idx);
newLR->setLastRef(old_last_use, idx);
gra.setNumRefs(newDcl, oldRefs - useCnt + 1);
if (totalElems == 32) {
// Second SIMD16 mov for the remaining elements of the 32-element copy.
G4_DstRegRegion *dst = builder.createDst(
newDcl->getRegVar(), 2, 0, 1, splitDcl->getElemType());
auto src = builder.createSrc(splitDcl->getRegVar(), 2, 0,
builder.getRegionStride1(),
splitDcl->getElemType());
G4_INST *movInst2 = builder.createMov(
g4::SIMD16, dst, src, InstOpt_WriteEnable, false);
bb->insertBefore(iter, movInst2);
}
// Rewrite the source operand of this use. If the operand's declare is an
// alias, a matching "copy_" declare with the same alias target and offset
// is created and used instead of newDcl.
unsigned int pos = usePoint.second;
G4_SrcRegRegion *oldSrc =
useInst->getSrc(pos)->asSrcRegRegion();
G4_Declare *oldSrcDcl =
oldSrc->getBase()->asRegVar()->getDeclare();
G4_Declare *newSrcDcl = newDcl;
G4_Declare *aliasOldSrcDcl = oldSrcDcl->getAliasDeclare();
if (aliasOldSrcDcl != NULL) {
const char *newSrcDclName = builder.getNameString(
16, "copy_%s", oldSrcDcl->getName());
newSrcDcl = builder.createDeclare(
newSrcDclName, G4_GRF, oldSrcDcl->getNumElems(),
oldSrcDcl->getNumRows(), oldSrcDcl->getElemType());
newSrcDcl->copyAlign(oldSrcDcl);
newSrcDcl->setAliasDeclare(aliasOldSrcDcl,
oldSrcDcl->getAliasOffset());
}
auto newSrc = builder.createSrcRegRegion(
oldSrc->getModifier(), Direct, newSrcDcl->getRegVar(),
oldSrc->getRegOff(), oldSrc->getSubRegOff(),
oldSrc->getRegion(), oldSrc->getType());
useInst->setSrc(newSrc, pos);
// Walk the rest of the alias chain up to oldDcl, creating a "copy_"
// declare per intermediate alias. The declares created in this loop are
// not attached to any operand here.
while (aliasOldSrcDcl && aliasOldSrcDcl != oldDcl) {
oldSrcDcl = aliasOldSrcDcl;
const char *newSrcDclName = builder.getNameString(
16, "copy_%s", oldSrcDcl->getName());
newSrcDcl = builder.createDeclare(
newSrcDclName, G4_GRF, oldSrcDcl->getNumElems(),
oldSrcDcl->getNumRows(), oldSrcDcl->getElemType());
newSrcDcl->copyAlign(oldSrcDcl);
aliasOldSrcDcl = oldSrcDcl->getAliasDeclare();
vISA_ASSERT(aliasOldSrcDcl != NULL, "Invalid alias decl");
newSrcDcl->setAliasDeclare(aliasOldSrcDcl,
oldSrcDcl->getAliasOffset());
}
split = true;
} else if (split) {
// The split already happened at an earlier use: redirect this use to
// newDcl as well, with the same alias handling as above.
unsigned int pos = usePoint.second;
G4_SrcRegRegion *oldSrc =
useInst->getSrc(pos)->asSrcRegRegion();
G4_Declare *oldSrcDcl =
oldSrc->getBase()->asRegVar()->getDeclare();
G4_Declare *newSrcDcl = newDcl;
G4_Declare *aliasOldSrcDcl = oldSrcDcl->getAliasDeclare();
if (aliasOldSrcDcl != NULL) {
const char *newSrcDclName = builder.getNameString(
16, "copy_%s", oldSrcDcl->getName());
newSrcDcl = builder.createDeclare(
newSrcDclName, G4_GRF, oldSrcDcl->getNumElems(),
oldSrcDcl->getNumRows(), oldSrcDcl->getElemType());
newSrcDcl->copyAlign(oldSrcDcl);
newSrcDcl->setAliasDeclare(aliasOldSrcDcl,
oldSrcDcl->getAliasOffset());
}
auto newSrc = builder.createSrcRegRegion(
oldSrc->getModifier(), Direct, newSrcDcl->getRegVar(),
oldSrc->getRegOff(), oldSrc->getSubRegOff(),
oldSrc->getRegion(), oldSrc->getType());
;
useInst->setSrc(newSrc, pos);
while (aliasOldSrcDcl && aliasOldSrcDcl != oldDcl) {
oldSrcDcl = aliasOldSrcDcl;
const char *newSrcDclName = builder.getNameString(
16, "copy_%s", oldSrcDcl->getName());
newSrcDcl = builder.createDeclare(
newSrcDclName, G4_GRF, oldSrcDcl->getNumElems(),
oldSrcDcl->getNumRows(), oldSrcDcl->getElemType());
newSrcDcl->copyAlign(oldSrcDcl);
aliasOldSrcDcl = oldSrcDcl->getAliasDeclare();
vISA_ASSERT(aliasOldSrcDcl != NULL, "Invalid alias decl");
newSrcDcl->setAliasDeclare(aliasOldSrcDcl,
oldSrcDcl->getAliasOffset());
}
} else {
// No split yet and this use is not far enough from the previous one:
// remember it as the latest use.
lastUseIt = useIt;
}
}
}
}
// Try again
return allocateRegs(lr, bb, builder, LLRUseMap);
}
}
}
#ifdef DEBUG_VERBOSE_ON
DEBUG_VERBOSE("Spilling range " << lr->getTopDcl()->getName() << "\n");
#endif
// No active range qualified for eviction, so no allocation could be found
return false;
}
#ifdef DEBUG_VERBOSE_ON
COUT_ERROR << lr->getTopDcl()->getName() << ":r" << regnum
<< " BANK: " << gra.getBankConflict(lr->getTopDcl()) << "\n";
#endif
// Record the GRF and word sub-register that were found on the live range.
lr->setPhyReg(builder.phyregpool.getGreg(regnum), subregnum);
return true;
}
// Allocate physical GRFs for lr out of one of two register banks.
// A bank is chosen first: by the bank-conflict class of lr's top declare
// when it has one and is not even-aligned, otherwise by comparing the two
// banks (average last-use per available register in round-robin mode,
// number of available registers otherwise). The chosen bank's start
// pointer and end bound then drive the search; on failure the other bank
// is tried.
// Returns true when registers were found and recorded with
// lr->setPhyReg(), false otherwise.
bool LinearScan::allocateRegsFromBanks(LocalLiveRange *lr) {
  vISA_ASSERT(!gra.use4GRFAlign, "not expecting 4GRF align");

  int grfNum, subRegWord;
  unsigned int bankBound = 0;
  int rowsFound = 0;
  int lastUseSum1 = pregManager.getAvailableRegs()->getLastUseSum1();
  int lastUseSum2 = pregManager.getAvailableRegs()->getLastUseSum2();
  int bank1AvailableRegNum =
      pregManager.getAvailableRegs()->getBank1AvailableRegNum();
  int bank2AvailableRegNum =
      pregManager.getAvailableRegs()->getBank2AvailableRegNum();
  int size = lr->getSizeInWords();

  G4_Declare *topDcl = lr->getTopDcl();
  // Alignment demanded by the declare itself, independent of bank choice.
  const BankAlign declAlign =
      gra.isEvenAligned(topDcl) ? BankAlign::Even : BankAlign::Either;
  BankAlign align = declAlign;
  G4_SubReg_Align subalign = gra.getSubRegAlign(topDcl);
  unsigned int instID;
  lr->getFirstRef(instID);
  auto lrBC = gra.getBankConflict(topDcl);

  // Point the search at bank 1 / bank 2 (start pointer plus end bound).
  const auto useBank1 = [&]() {
    startGRFReg = &bank1StartGRFReg;
    bankBound = bank1_end;
  };
  const auto useBank2 = [&]() {
    startGRFReg = &bank2StartGRFReg;
    bankBound = bank2_end;
  };
  // Switch to whichever bank is not currently selected.
  const auto useOtherBank = [&]() {
    if (startGRFReg == &bank2StartGRFReg) {
      useBank1();
    } else {
      useBank2();
    }
  };
  // Single search in the currently selected bank. No occupied bundles are
  // passed (0); windowed is forwarded as the second-to-last argument.
  const auto searchBank = [&](BankAlign wantedAlign, bool windowed) {
    return pregManager.findFreeRegs(size, wantedAlign, subalign, grfNum,
                                    subRegWord, *startGRFReg, bankBound, 0,
                                    instID, windowed, lr->getForbidden());
  };

  // Reverse the order for FF two banks RA.
  // For FF, second bank register allocated from back to front
  if (align == BankAlign::Either && lrBC != BANK_CONFLICT_NONE) {
    switch (lrBC) {
    case BANK_CONFLICT_FIRST_HALF_EVEN:
    case BANK_CONFLICT_FIRST_HALF_ODD:
      align = BankAlign::Even;
      useBank1();
      break;
    case BANK_CONFLICT_SECOND_HALF_EVEN:
    case BANK_CONFLICT_SECOND_HALF_ODD:
      // Odd alignment is requested only in round-robin mode.
      if (useRoundRobin) {
        align = BankAlign::Odd;
      }
      useBank2();
      break;
    default:
      // Any other class leaves the previous bank selection untouched.
      break;
    }
  } else if (useRoundRobin) {
    if (bank1AvailableRegNum == 0 && bank2AvailableRegNum == 0) {
      return false;
    }
    // A bank without available registers gets a huge average so that it
    // is not preferred.
    float bank1Average = static_cast<float>(0x7fffffff);
    if (bank1AvailableRegNum != 0) {
      bank1Average = static_cast<float>(lastUseSum1) / bank1AvailableRegNum;
    }
    float bank2Average = static_cast<float>(0x7fffffff);
    if (bank2AvailableRegNum != 0) {
      bank2Average = static_cast<float>(lastUseSum2) / bank2AvailableRegNum;
    }
    if (bank1Average > bank2Average) {
      useBank2();
    } else {
      useBank1();
    }
  } else {
    // Prefer the bank with more available registers; ties go to bank 1.
    if (bank1AvailableRegNum < bank2AvailableRegNum) {
      useBank2();
    } else {
      useBank1();
    }
  }

  if (useRoundRobin) {
    rowsFound = searchBank(align, false);
    if (rowsFound) {
      // Wrapper handling
      if (bankBound) {
        *startGRFReg = (grfNum + rowsFound) % (bankBound);
      }
      if (startGRFReg == &bank2StartGRFReg &&
          *startGRFReg < SECOND_HALF_BANK_START_GRF) {
        *startGRFReg += bank2_start;
        if (*startGRFReg >= numRegLRA) {
          *startGRFReg = bank2_start;
          return false;
        }
      }
    } else {
      // record the original endReg in case failed to allocate in bank switch
      unsigned int endReg = *startGRFReg + rowsFound;
      // If failed, try another bank first.
      useOtherBank();
      rowsFound = searchBank(align, false);
      if (!rowsFound) {
        // Still fail, wrapper round in original bank: restart at that
        // bank's first register and search up to the recorded end.
        if (startGRFReg == &bank1StartGRFReg) {
          // switch back to before
          startGRFReg = &bank2StartGRFReg;
          *startGRFReg = bank2_start;
          bankBound = ((endReg) > (bank2_end)) ? bank2_end : endReg;
        } else {
          startGRFReg = &bank1StartGRFReg;
          *startGRFReg = bank1_start;
          bankBound = (endReg > bank1_end) ? bank1_end : endReg;
        }
        rowsFound = searchBank(align, false);
      }
    }
  } else {
    rowsFound = searchBank(align, true);
    if (!rowsFound) {
      // Try without window, no even/odd alignment for bank, but
      // still low and high(keep in same bank)
      rowsFound = searchBank(declAlign, false);
    }
    if (!rowsFound) {
      // Try another bank, no even/odd alignment for bank, also no low and high
      useOtherBank();
      rowsFound = searchBank(declAlign, false);
    }
  }

  // If allocations fails, return false
  if (!rowsFound) {
    return false;
  }

  lr->setPhyReg(builder.phyregpool.getGreg(grfNum), subRegWord);
#ifdef DEBUG_VERBOSE_ON
  COUT_ERROR << lr->getTopDcl()->getName() << ":r" << grfNum
             << " BANK: " << gra.getBankConflict(lr->getTopDcl()) << "\n";
#endif
  return true;
}
// Return the physical registers held by lr to pregManager.
//   lr        - live range that must already have a physical register.
//   setInstID - when true, lr's last-reference index is passed along with
//               the freed registers; when false, 0 is passed instead.
void LinearScan::freeAllocedRegs(LocalLiveRange *lr, bool setInstID) {
  int wordOffset;
  G4_VarBase *phyReg = lr->getPhyReg(wordOffset);
  vISA_ASSERT(
      phyReg != nullptr,
      "Physical register not assigned to live range. Cannot free regs.");

  unsigned int releaseIdx = 0;
  if (setInstID) {
    lr->getLastRef(releaseIdx);
  }

  pregManager.freeRegs(phyReg->asGreg()->getRegNum(), wordOffset,
                       lr->getSizeInWords(), releaseIdx);
}
// ********* PhyRegSummary class implementation *********
// Flag every GRF covered by an allocation as used.
//   pr   - first physical register of the allocation; must be a GRF.
//   size - allocation size in words (Type_UW elements).
// The number of GRFs is size divided by the words per GRF, rounded up.
// An allocation that does not fit in a single GRF is assumed to start at a
// GRF boundary.
void PhyRegSummary::markPhyRegs(G4_VarBase *pr, unsigned int size) {
  vISA_ASSERT(pr->isGreg(), "Expecting GRF as operand");
  vISA_ASSERT(builder != nullptr, "ir builder should be set");

  const auto wordsPerGRF = builder->numEltPerGRF<Type_UW>();
  int rowCount = size / wordsPerGRF;
  unsigned int firstGRF = pr->asGreg()->getRegNum();
  if (size % wordsPerGRF != 0) {
    // A partially filled trailing GRF still counts as used.
    ++rowCount;
  }

  for (int row = 0; row < rowCount; ++row) {
    GRFUsage[firstGRF + row] = true;
  }
}
// Print the numbers of all GRFs for which isGRFBusy() is true to std::cout,
// as a single "GRFs used: r<n>, r<m>, " line followed by a blank line.
// The whole body is wrapped in VISA_DEBUG_VERBOSE.
void PhyRegSummary::printBusyRegs() {
  VISA_DEBUG_VERBOSE({
    std::cout << "GRFs used: ";
    const int grfCount = static_cast<int>(GRFUsage.size());
    for (int grf = 0; grf < grfCount; ++grf) {
      if (!isGRFBusy(grf)) {
        continue;
      }
      std::cout << "r" << grf << ", ";
    }
    std::cout << "\n\n";
  });
}