/*========================== begin_copyright_notice ============================

Copyright (C) 2017-2023 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

#include "GraphColor.h"
|
|
#include "BuildIR.h"
|
|
#include "DebugInfo.h"
|
|
#include "FlagSpillCleanup.h"
|
|
#include "FlowGraph.h"
|
|
#include "LinearScanRA.h"
|
|
#include "LocalRA.h"
|
|
#include "Optimizer.h"
|
|
#include "PointsToAnalysis.h"
|
|
#include "RADebug.h"
|
|
#include "RPE.h"
|
|
#include "Rematerialization.h"
|
|
#include "SCCAnalysis.h"
|
|
#include "SpillCleanup.h"
|
|
#include "SpillCode.h"
|
|
#include "SplitAlignedScalars.h"
|
|
#include "Timer.h"
|
|
|
|
#include <algorithm>
|
|
#include <cmath> // sqrt
|
|
#include <fstream>
|
|
#include <iostream>
|
|
#include <list>
|
|
#include <sstream>
|
|
#include <optional>
|
|
|
|
#include "common/LLVMWarningsPush.hpp"
|
|
#include <llvm/ADT/SmallString.h>
|
|
#include <llvm/ADT/StringRef.h>
|
|
#include "common/LLVMWarningsPop.hpp"
|
|
|
|
using namespace vISA;
|
|
|
|
#define GRAPH_COLOR_MEM_SIZE 16 * 1024
|
|
#define SCRATCH_MSG_LIMIT (128 * 1024)
|
|
#define SCRATCH_COMPRESS_THRESHOLD (12 * 1024)
|
|
|
|
const RAVarInfo GlobalRA::defaultValues;
|
|
const char GlobalRA::StackCallStr[] = "StackCall";
|
|
|
|
static const unsigned IN_LOOP_REFERENCE_COUNT_FACTOR = 4;
|
|
|
|
#define BANK_CONFLICT_HEURISTIC_INST 0.04
|
|
#define BANK_CONFLICT_HEURISTIC_REF_COUNT 0.25
|
|
#define BANK_CONFLICT_HEURISTIC_LOOP_ITERATION 5
|
|
#define BANK_CONFLICT_SEND_INST_CYCLE \
|
|
60 // Some send 200, some 400 we choose the small one
|
|
#define BANK_CONFLICT_SIMD8_OVERHEAD_CYCLE 1
|
|
#define BANK_CONFLICT_SIMD16_OVERHEAD_CYCLE 2
|
|
#define INTERNAL_CONFLICT_RATIO_HEURISTIC 0.25
|
|
|
|
#define NOMASK_BYTE 0x80
|
|
|
|
Interference::Interference(const LivenessAnalysis *l, GlobalRA &g)
|
|
: gra(g), kernel(g.kernel), lrs(gra.incRA.getLRs()),
|
|
builder(*g.kernel.fg.builder), maxId(l->getNumSelectedVar()),
|
|
rowSize(maxId / BITS_DWORD + 1),
|
|
splitStartId(l->getNumSplitStartID()), splitNum(l->getNumSplitVar()),
|
|
liveAnalysis(l), aug(*this, *l, g), incRA(g.incRA),
|
|
sparseIntf(g.intfStorage.sparseIntf), sparseMatrix(g.intfStorage.sparseMatrix) {
|
|
denseMatrixLimit = builder.getuint32Option(vISA_DenseMatrixLimit);
|
|
incRA.registerNextIter((G4_RegFileKind)l->getSelectedRF(), l, this);
|
|
}
|
|
|
|
criticalCmpForEndInterval::criticalCmpForEndInterval(GlobalRA &g) : gra(g) {}
|
|
bool criticalCmpForEndInterval::operator()(const QueueEntry &A, const QueueEntry &B) const {
|
|
return A.interval.end->getLexicalId() > B.interval.end->getLexicalId();
|
|
}
|
|
AugmentPriorityQueue::AugmentPriorityQueue(criticalCmpForEndInterval cmp)
|
|
: std::priority_queue<QueueEntry, std::vector<QueueEntry>,
|
|
criticalCmpForEndInterval>(cmp) {}
|
|
|
|
inline bool Interference::varSplitCheckBeforeIntf(unsigned v1,
|
|
unsigned v2) const {
|
|
const LiveRange *l1 = lrs[v1];
|
|
const LiveRange *l2 = lrs[v2];
|
|
|
|
if (!l1->getIsPartialDcl() && !l2->getIsPartialDcl()) {
|
|
return false;
|
|
}
|
|
|
|
// Don't add interference between two split (partial) declares
|
|
if (l1->getIsPartialDcl() && l2->getIsPartialDcl()) {
|
|
return true;
|
|
}
|
|
|
|
unsigned p1 = v1;
|
|
unsigned p2 = v2;
|
|
// Don't add interference between child and parent declares
|
|
if (l1->getIsPartialDcl()) {
|
|
p1 = l1->getParentLRID();
|
|
}
|
|
|
|
if (l2->getIsPartialDcl()) {
|
|
p2 = l2->getParentLRID();
|
|
}
|
|
|
|
if (p1 == p2) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
BankConflict BankConflictPass::setupBankAccordingToSiblingOperand(
|
|
BankConflict assignedBank, unsigned offset, bool oneGRFBank) {
|
|
BankConflict tgtBank;
|
|
|
|
vISA_ASSERT(assignedBank != BANK_CONFLICT_NONE,
|
|
"sibling bank is not assigned");
|
|
|
|
// Set according to sibling
|
|
tgtBank = (assignedBank == BANK_CONFLICT_FIRST_HALF_EVEN ||
|
|
assignedBank == BANK_CONFLICT_FIRST_HALF_ODD)
|
|
? (assignedBank == BANK_CONFLICT_FIRST_HALF_EVEN
|
|
? BANK_CONFLICT_SECOND_HALF_ODD
|
|
: BANK_CONFLICT_SECOND_HALF_EVEN)
|
|
: (assignedBank == BANK_CONFLICT_SECOND_HALF_EVEN
|
|
? BANK_CONFLICT_FIRST_HALF_ODD
|
|
: BANK_CONFLICT_FIRST_HALF_EVEN);
|
|
|
|
// Adjust according to the offset
|
|
if (oneGRFBank) {
|
|
if (offset % 2) {
|
|
if (tgtBank == BANK_CONFLICT_SECOND_HALF_EVEN ||
|
|
tgtBank == BANK_CONFLICT_FIRST_HALF_EVEN) {
|
|
tgtBank = (tgtBank == BANK_CONFLICT_FIRST_HALF_EVEN)
|
|
? BANK_CONFLICT_FIRST_HALF_ODD
|
|
: BANK_CONFLICT_SECOND_HALF_ODD;
|
|
} else {
|
|
tgtBank = (tgtBank == BANK_CONFLICT_FIRST_HALF_ODD)
|
|
? BANK_CONFLICT_FIRST_HALF_EVEN
|
|
: BANK_CONFLICT_SECOND_HALF_EVEN;
|
|
}
|
|
}
|
|
} else {
|
|
if (offset % 4 >= 2) {
|
|
if (tgtBank == BANK_CONFLICT_SECOND_HALF_EVEN ||
|
|
tgtBank == BANK_CONFLICT_FIRST_HALF_EVEN) {
|
|
tgtBank = (tgtBank == BANK_CONFLICT_FIRST_HALF_EVEN)
|
|
? BANK_CONFLICT_FIRST_HALF_ODD
|
|
: BANK_CONFLICT_SECOND_HALF_ODD;
|
|
} else {
|
|
tgtBank = (tgtBank == BANK_CONFLICT_FIRST_HALF_ODD)
|
|
? BANK_CONFLICT_FIRST_HALF_EVEN
|
|
: BANK_CONFLICT_SECOND_HALF_EVEN;
|
|
}
|
|
}
|
|
}
|
|
|
|
return tgtBank;
|
|
}
|
|
|
|
void refNumBasedSort(const unsigned *refNum, unsigned *index) {
|
|
if (refNum[2] > refNum[1]) {
|
|
index[0] = 2;
|
|
index[1] = 1;
|
|
} else {
|
|
index[0] = 1;
|
|
index[1] = 2;
|
|
}
|
|
|
|
index[2] = 0;
|
|
|
|
return;
|
|
}
|
|
|
|
bool BankConflictPass::hasInternalConflict3Srcs(BankConflict *srcBC) {
|
|
if (((srcBC[0] == BANK_CONFLICT_SECOND_HALF_EVEN ||
|
|
srcBC[0] == BANK_CONFLICT_FIRST_HALF_EVEN) &&
|
|
(srcBC[1] == BANK_CONFLICT_SECOND_HALF_EVEN ||
|
|
srcBC[1] == BANK_CONFLICT_FIRST_HALF_EVEN) &&
|
|
(srcBC[2] == BANK_CONFLICT_SECOND_HALF_EVEN ||
|
|
srcBC[2] == BANK_CONFLICT_FIRST_HALF_EVEN)) ||
|
|
((srcBC[0] == BANK_CONFLICT_SECOND_HALF_ODD ||
|
|
srcBC[0] == BANK_CONFLICT_FIRST_HALF_ODD) &&
|
|
(srcBC[1] == BANK_CONFLICT_SECOND_HALF_ODD ||
|
|
srcBC[1] == BANK_CONFLICT_FIRST_HALF_ODD) &&
|
|
(srcBC[2] == BANK_CONFLICT_SECOND_HALF_ODD ||
|
|
srcBC[2] == BANK_CONFLICT_FIRST_HALF_ODD))) {
|
|
return true;
|
|
}
|
|
if ((srcBC[0] < BANK_CONFLICT_SECOND_HALF_EVEN &&
|
|
srcBC[1] < BANK_CONFLICT_SECOND_HALF_EVEN &&
|
|
srcBC[2] < BANK_CONFLICT_SECOND_HALF_EVEN) ||
|
|
(srcBC[0] >= BANK_CONFLICT_SECOND_HALF_EVEN &&
|
|
srcBC[1] >= BANK_CONFLICT_SECOND_HALF_EVEN &&
|
|
srcBC[2] >= BANK_CONFLICT_SECOND_HALF_EVEN)) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
void BankConflictPass::setupEvenOddBankConflictsForDecls(
|
|
G4_Declare *dcl_1, G4_Declare *dcl_2, unsigned offset1, unsigned offset2,
|
|
BankConflict &srcBC1, BankConflict &srcBC2) {
|
|
vISA_ASSERT(srcBC1 == BANK_CONFLICT_NONE, "Wrong Bank initial value");
|
|
vISA_ASSERT(srcBC2 == BANK_CONFLICT_NONE, "Wrong Bank initial value");
|
|
|
|
unsigned refNum1 = gra.getNumRefs(dcl_1);
|
|
unsigned refNum2 = gra.getNumRefs(dcl_2);
|
|
|
|
BankConflict bank1 = BANK_CONFLICT_NONE;
|
|
BankConflict bank2 = BANK_CONFLICT_NONE;
|
|
|
|
bank1 = (refNum1 >= refNum2) ? BANK_CONFLICT_FIRST_HALF_EVEN
|
|
: BANK_CONFLICT_SECOND_HALF_ODD;
|
|
bank2 = (bank1 == BANK_CONFLICT_FIRST_HALF_EVEN)
|
|
? BANK_CONFLICT_SECOND_HALF_ODD
|
|
: BANK_CONFLICT_FIRST_HALF_EVEN;
|
|
|
|
srcBC1 = bank1;
|
|
srcBC2 = bank2;
|
|
|
|
// Adjust only for the single bank allocation
|
|
if ((offset1 + offset2) % 2) {
|
|
if (refNum1 >= refNum2) {
|
|
bank2 = (bank2 == BANK_CONFLICT_FIRST_HALF_EVEN)
|
|
? BANK_CONFLICT_SECOND_HALF_ODD
|
|
: BANK_CONFLICT_FIRST_HALF_EVEN;
|
|
} else {
|
|
bank1 = (bank1 == BANK_CONFLICT_FIRST_HALF_EVEN)
|
|
? BANK_CONFLICT_SECOND_HALF_ODD
|
|
: BANK_CONFLICT_FIRST_HALF_EVEN;
|
|
}
|
|
}
|
|
|
|
gra.setBankConflict(dcl_1, bank1);
|
|
gra.setBankConflict(dcl_2, bank2);
|
|
|
|
return;
|
|
}
|
|
|
|
//
// inst opcode is G4_mad. This function sets up a simple state machine to
// prevent conflicts between src1 and src2 of a mad inst. The GRF file is
// divided into banks as follows:
//   bank-block A = 0, 2, 4, 6, ..., 62
//   bank-block B = 1, 3, 5, 7, ..., 63
//   bank-block C = 64, 66, 68, ..., 126
//   bank-block D = 65, 67, 69, ..., 127
//
// For ternary ops, if src1 and src2 go to the same bank there will be an
// access collision. Unary and binary ops have no collision, no matter which
// registers they use, because the second and third src operands are read in
// the same clock cycle, which is different from when the src0 operand is
// read. This holds up to pre-SKL platforms.
//
// Bank conflict heuristics:
// 1. Try to balance the registers used in the two banks for potentially
//    conflicting registers.
// 2. The reference count decides which register gets assigned first.
// 3. When a conflict is detected, the bank can be updated according to the
//    reference count.
//
|
|
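// A minimal sketch of the bank-block mapping described above, assuming a
// 128-GRF register file whose second half starts at GRF 64 (i.e.
// SECOND_HALF_BANK_START_GRF == 64). It is not part of the pass and is kept
// inactive; it only illustrates how a physical GRF number falls into the
// blocks A/B/C/D listed in the comment.
#if 0
enum class BankBlock { A, B, C, D };

// Classify a physical GRF number into one of the four bank-blocks.
static BankBlock classifyGRF(unsigned grf) {
  const bool secondHalf = grf >= 64;          // assumed second-half start
  const bool odd = (grf % 2) != 0;
  if (!secondHalf)
    return odd ? BankBlock::B : BankBlock::A; // 1,3,...,63 vs 0,2,...,62
  return odd ? BankBlock::D : BankBlock::C;   // 65,67,...,127 vs 64,66,...,126
}
#endif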
void BankConflictPass::setupBankConflictsOneGRFOld(G4_INST *inst,
|
|
int &bank1RegNum,
|
|
int &bank2RegNum,
|
|
float GRFRatio,
|
|
unsigned &internalConflict) {
|
|
BankConflict srcBC[3];
|
|
unsigned regNum[3];
|
|
unsigned refNum[3];
|
|
unsigned offset[3];
|
|
G4_Declare *dcls[3];
|
|
G4_Declare *opndDcls[3];
|
|
int bank_num = 0;
|
|
|
|
for (int i = 0; i < 3; i++) {
|
|
dcls[i] = nullptr;
|
|
opndDcls[i] = nullptr;
|
|
|
|
G4_Operand *src = inst->getSrc(i);
|
|
if (!src || !src->isSrcRegRegion() || src->isAccReg()) {
|
|
// bank conflict not possible
|
|
return;
|
|
}
|
|
|
|
dcls[i] = GetTopDclFromRegRegion(src);
|
|
opndDcls[i] = src->getBase()->asRegVar()->getDeclare();
|
|
|
|
regNum[i] = dcls[i]->getNumRows();
|
|
refNum[i] = gra.getNumRefs(dcls[i]);
|
|
offset[i] = (opndDcls[i]->getOffsetFromBase() + src->getLeftBound()) /
|
|
gra.kernel.numEltPerGRF<Type_UB>();
|
|
srcBC[i] = gra.getBankConflict(dcls[i]);
|
|
|
|
if (src->getBase()->asRegVar()->isPhyRegAssigned()) {
|
|
unsigned reg =
|
|
src->getBase()->asRegVar()->getPhyReg()->asGreg()->getRegNum();
|
|
if ((reg + offset[i]) < SECOND_HALF_BANK_START_GRF) {
|
|
srcBC[i] = ((reg + offset[i]) % 2) ? BANK_CONFLICT_FIRST_HALF_ODD
|
|
: BANK_CONFLICT_FIRST_HALF_EVEN;
|
|
} else {
|
|
srcBC[i] = ((reg + offset[i]) % 2) ? BANK_CONFLICT_SECOND_HALF_ODD
|
|
: BANK_CONFLICT_SECOND_HALF_EVEN;
|
|
}
|
|
if (reg < SECOND_HALF_BANK_START_GRF) {
|
|
bank1RegNum += regNum[i];
|
|
} else {
|
|
bank2RegNum += regNum[i];
|
|
}
|
|
gra.setBankConflict(dcls[i], srcBC[i]);
|
|
} else if (srcBC[i] != BANK_CONFLICT_NONE) {
|
|
if (offset[i] % 2) {
|
|
// Get operand's bank from declare's bank
|
|
if (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN ||
|
|
srcBC[i] == BANK_CONFLICT_FIRST_HALF_ODD) {
|
|
srcBC[i] = (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN)
|
|
? BANK_CONFLICT_FIRST_HALF_ODD
|
|
: BANK_CONFLICT_FIRST_HALF_EVEN;
|
|
} else {
|
|
srcBC[i] = (srcBC[i] == BANK_CONFLICT_SECOND_HALF_EVEN)
|
|
? BANK_CONFLICT_SECOND_HALF_ODD
|
|
: BANK_CONFLICT_SECOND_HALF_EVEN;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (i > 0) {
|
|
bank_num += srcBC[i];
|
|
}
|
|
}
|
|
|
|
// In case src1 and src2 share the same declare, i.e., use the same register
|
|
if (bank_num == 0 && dcls[1] == dcls[2]) {
|
|
BankConflict bank1 = ((bank1RegNum * GRFRatio) > bank2RegNum)
|
|
? BANK_CONFLICT_SECOND_HALF_EVEN
|
|
: BANK_CONFLICT_FIRST_HALF_EVEN;
|
|
|
|
gra.setBankConflict(dcls[1], bank1);
|
|
srcBC[1] = bank1;
|
|
srcBC[2] = bank1;
|
|
bank_num += bank1 * 2;
|
|
if (bank1 < BANK_CONFLICT_SECOND_HALF_EVEN) {
|
|
bank1RegNum += regNum[1];
|
|
} else {
|
|
bank2RegNum += regNum[1];
|
|
}
|
|
}
|
|
|
|
// No bank assigned to src 1, 2;
// assign the two declares into different bundles/banks.
|
|
if (bank_num == 0) {
|
|
BankConflict bank1 = BANK_CONFLICT_NONE;
|
|
BankConflict bank2 = BANK_CONFLICT_NONE;
|
|
bool bank1First = false;
|
|
if (GRFRatio == 1.0) {
|
|
// For global RA: Try to reduce the size of bank 2
|
|
if ((float)refNum[1] / regNum[1] >= (float)refNum[2] / regNum[2]) {
|
|
bank1 = BANK_CONFLICT_SECOND_HALF_EVEN;
|
|
bank2 = BANK_CONFLICT_FIRST_HALF_ODD;
|
|
bank1First = true;
|
|
} else {
|
|
bank2 = BANK_CONFLICT_SECOND_HALF_EVEN;
|
|
bank1 = BANK_CONFLICT_FIRST_HALF_ODD;
|
|
}
|
|
} else {
|
|
// For local RA: Try to balance two banks
|
|
if (refNum[1] >= refNum[2]) {
|
|
bank1 = ((bank1RegNum * GRFRatio) > bank2RegNum)
|
|
? BANK_CONFLICT_SECOND_HALF_EVEN
|
|
: BANK_CONFLICT_FIRST_HALF_EVEN;
|
|
bank2 = (bank1 == BANK_CONFLICT_SECOND_HALF_EVEN)
|
|
? BANK_CONFLICT_FIRST_HALF_ODD
|
|
: BANK_CONFLICT_SECOND_HALF_ODD;
|
|
bank1First = true;
|
|
} else {
|
|
bank2 = (bank1RegNum * GRFRatio) > bank2RegNum
|
|
? BANK_CONFLICT_SECOND_HALF_EVEN
|
|
: BANK_CONFLICT_FIRST_HALF_EVEN;
|
|
bank1 = (bank2 == BANK_CONFLICT_SECOND_HALF_EVEN)
|
|
? BANK_CONFLICT_FIRST_HALF_ODD
|
|
: BANK_CONFLICT_SECOND_HALF_ODD;
|
|
}
|
|
}
|
|
|
|
// Adjust only for the single bank allocation
|
|
if ((offset[1] + offset[2]) % 2) {
|
|
if (bank1First) {
|
|
bank2 = (bank2 == BANK_CONFLICT_FIRST_HALF_ODD)
|
|
? BANK_CONFLICT_FIRST_HALF_EVEN
|
|
: BANK_CONFLICT_SECOND_HALF_EVEN;
|
|
} else {
|
|
bank1 = (bank1 == BANK_CONFLICT_SECOND_HALF_ODD)
|
|
? BANK_CONFLICT_SECOND_HALF_EVEN
|
|
: BANK_CONFLICT_FIRST_HALF_EVEN;
|
|
}
|
|
}
|
|
|
|
if (bank1 >= BANK_CONFLICT_SECOND_HALF_EVEN) {
|
|
bank2RegNum += regNum[1];
|
|
bank1RegNum += regNum[2];
|
|
} else {
|
|
bank1RegNum += regNum[1];
|
|
bank2RegNum += regNum[2];
|
|
}
|
|
|
|
gra.setBankConflict(dcls[1], bank1);
|
|
gra.setBankConflict(dcls[2], bank2);
|
|
} else {
|
|
if (srcBC[1] == BANK_CONFLICT_NONE || srcBC[2] == BANK_CONFLICT_NONE) {
|
|
// One source operand is assigned bank already
|
|
if (srcBC[2] == BANK_CONFLICT_NONE) {
|
|
srcBC[2] =
|
|
setupBankAccordingToSiblingOperand(srcBC[1], offset[2], true);
|
|
gra.setBankConflict(dcls[2], srcBC[2]);
|
|
|
|
if (srcBC[2] < BANK_CONFLICT_SECOND_HALF_EVEN)
|
|
bank1RegNum += regNum[2];
|
|
else
|
|
bank2RegNum += regNum[2];
|
|
} else {
|
|
srcBC[1] =
|
|
setupBankAccordingToSiblingOperand(srcBC[2], offset[1], true);
|
|
gra.setBankConflict(dcls[1], srcBC[1]);
|
|
if (srcBC[1] < BANK_CONFLICT_SECOND_HALF_EVEN)
|
|
bank1RegNum += regNum[1];
|
|
else
|
|
bank2RegNum += regNum[1];
|
|
}
|
|
} else if (dcls[1] != dcls[2]) {
|
|
if (((srcBC[1] == BANK_CONFLICT_SECOND_HALF_EVEN ||
|
|
srcBC[1] == BANK_CONFLICT_FIRST_HALF_EVEN) &&
|
|
(srcBC[2] == BANK_CONFLICT_SECOND_HALF_EVEN ||
|
|
srcBC[2] == BANK_CONFLICT_FIRST_HALF_EVEN)) ||
|
|
((srcBC[1] == BANK_CONFLICT_SECOND_HALF_ODD ||
|
|
srcBC[1] == BANK_CONFLICT_FIRST_HALF_ODD) &&
|
|
(srcBC[2] == BANK_CONFLICT_SECOND_HALF_ODD ||
|
|
srcBC[2] == BANK_CONFLICT_FIRST_HALF_ODD))) {
|
|
internalConflict++;
|
|
}
|
|
if ((srcBC[1] < BANK_CONFLICT_SECOND_HALF_EVEN &&
|
|
srcBC[2] < BANK_CONFLICT_SECOND_HALF_EVEN) ||
|
|
(srcBC[1] >= BANK_CONFLICT_SECOND_HALF_EVEN &&
|
|
srcBC[2] >= BANK_CONFLICT_SECOND_HALF_EVEN)) {
|
|
internalConflict++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void BankConflictPass::getBanks(G4_INST *inst, BankConflict *srcBC,
|
|
G4_Declare **dcls, G4_Declare **opndDcls,
|
|
unsigned *offset) {
|
|
for (int i = 0; i < 3; i++) {
|
|
dcls[i] = nullptr;
|
|
opndDcls[i] = nullptr;
|
|
srcBC[i] = BANK_CONFLICT_NONE;
|
|
|
|
G4_Operand *src = inst->getSrc(i);
|
|
if (!src || !src->isSrcRegRegion() || src->isAccReg()) {
|
|
return;
|
|
}
|
|
|
|
dcls[i] = GetTopDclFromRegRegion(src);
|
|
if (!dcls[i]) {
|
|
continue;
|
|
}
|
|
opndDcls[i] = src->getBase()->asRegVar()->getDeclare();
|
|
|
|
offset[i] = (opndDcls[i]->getOffsetFromBase() + src->getLeftBound()) /
|
|
gra.kernel.numEltPerGRF<Type_UB>();
|
|
srcBC[i] = gra.getBankConflict(dcls[i]);
|
|
|
|
if (src->getBase()->asRegVar()->isPhyRegAssigned()) {
|
|
unsigned reg =
|
|
src->getBase()->asRegVar()->getPhyReg()->asGreg()->getRegNum();
|
|
srcBC[i] = ((reg + offset[i]) % 2) ? BANK_CONFLICT_SECOND_HALF_ODD
|
|
: BANK_CONFLICT_FIRST_HALF_EVEN;
|
|
} else if (srcBC[i] != BANK_CONFLICT_NONE) {
|
|
if (offset[i] % 2) {
|
|
if (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN) {
|
|
srcBC[i] = BANK_CONFLICT_SECOND_HALF_ODD;
|
|
} else {
|
|
srcBC[i] = BANK_CONFLICT_FIRST_HALF_EVEN;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
void BankConflictPass::getPrevBanks(G4_INST *inst, BankConflict *srcBC,
|
|
G4_Declare **dcls, G4_Declare **opndDcls,
|
|
unsigned *offset) {
|
|
// We only care about ALU instructions, which have at most 3 sources.
|
|
int execSize[3];
|
|
|
|
for (int i = 1; i < 3; i++) {
|
|
dcls[i] = nullptr;
|
|
opndDcls[i] = nullptr;
|
|
srcBC[i] = BANK_CONFLICT_NONE;
|
|
|
|
G4_Operand *src = inst->getSrc(i);
|
|
if (!src || !src->isSrcRegRegion()) {
|
|
return;
|
|
}
|
|
dcls[i] = GetTopDclFromRegRegion(src);
|
|
if (dcls[i]->getRegFile() != G4_GRF) {
|
|
return;
|
|
}
|
|
execSize[i] = src->getLinearizedEnd() - src->getLinearizedStart() + 1;
|
|
|
|
opndDcls[i] = src->getBase()->asRegVar()->getDeclare();
|
|
|
|
offset[i] = (opndDcls[i]->getOffsetFromBase() + src->getLeftBound()) /
|
|
gra.kernel.numEltPerGRF<Type_UB>();
|
|
srcBC[i] = gra.getBankConflict(dcls[i]);
|
|
|
|
if (src->getBase()->asRegVar()->isPhyRegAssigned()) {
|
|
unsigned reg =
|
|
src->getBase()->asRegVar()->getPhyReg()->asGreg()->getRegNum();
|
|
srcBC[i] = ((reg + offset[i]) % 2) ? BANK_CONFLICT_SECOND_HALF_ODD
|
|
: BANK_CONFLICT_FIRST_HALF_EVEN;
|
|
} else if (srcBC[i] != BANK_CONFLICT_NONE) {
|
|
if (offset[i] % 2) {
|
|
if (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN) {
|
|
srcBC[i] = BANK_CONFLICT_SECOND_HALF_ODD;
|
|
} else {
|
|
srcBC[i] = BANK_CONFLICT_FIRST_HALF_EVEN;
|
|
}
|
|
}
|
|
}
|
|
if (execSize[i] > 32) {
|
|
srcBC[i] = (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN)
|
|
? BANK_CONFLICT_SECOND_HALF_ODD
|
|
: BANK_CONFLICT_FIRST_HALF_EVEN;
|
|
}
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
void BankConflictPass::setupBankForSrc0(G4_INST *inst, G4_INST *prevInst) {
|
|
BankConflict srcBC[3];
|
|
G4_Declare *dcls[3];
|
|
G4_Declare *opndDcls[3];
|
|
unsigned offset[3];
|
|
|
|
BankConflict prevSrcBC[3];
|
|
G4_Declare *prevDcls[3];
|
|
G4_Declare *prevOpndDcls[3];
|
|
unsigned prevOffset[3];
|
|
|
|
if (prevInst->isSend() || prevInst->isMath()) {
|
|
return;
|
|
}
|
|
|
|
getBanks(inst, srcBC, dcls, opndDcls, offset);
|
|
getPrevBanks(prevInst, prevSrcBC, prevDcls, prevOpndDcls, prevOffset);
|
|
|
|
if (dcls[0] != nullptr && srcBC[0] == BANK_CONFLICT_NONE &&
|
|
prevSrcBC[1] != BANK_CONFLICT_NONE &&
|
|
prevSrcBC[2] != BANK_CONFLICT_NONE) {
|
|
if (prevSrcBC[1] == prevSrcBC[2]) {
|
|
if (prevSrcBC[1] == BANK_CONFLICT_FIRST_HALF_EVEN) {
|
|
srcBC[0] = offset[0] % 2 ? BANK_CONFLICT_FIRST_HALF_EVEN
|
|
: BANK_CONFLICT_SECOND_HALF_ODD;
|
|
} else {
|
|
srcBC[0] = offset[0] % 2 ? BANK_CONFLICT_SECOND_HALF_ODD
|
|
: BANK_CONFLICT_FIRST_HALF_EVEN;
|
|
}
|
|
|
|
gra.setBankConflict(dcls[0], srcBC[0]);
|
|
}
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
void BankConflictPass::setupBankConflictsforTwoGRFs(G4_INST *inst) {
|
|
BankConflict srcBC[3];
|
|
unsigned refNum[3];
|
|
unsigned offset[3];
|
|
G4_Declare *dcls[3];
|
|
G4_Declare *opndDcls[3];
|
|
int bank_num = 0;
|
|
int execSize[3];
|
|
|
|
for (int i = 0; i < 3; i++) {
|
|
dcls[i] = nullptr;
|
|
opndDcls[i] = nullptr;
|
|
execSize[i] = 0;
|
|
|
|
G4_Operand *src = inst->getSrc(i);
|
|
if (!src || !src->isSrcRegRegion() || src->isAccReg()) {
|
|
// bank conflict not possible
|
|
return;
|
|
}
|
|
execSize[i] = src->getLinearizedEnd() - src->getLinearizedStart() + 1;
|
|
|
|
dcls[i] = GetTopDclFromRegRegion(src);
|
|
opndDcls[i] = src->getBase()->asRegVar()->getDeclare();
|
|
|
|
refNum[i] = gra.getNumRefs(dcls[i]);
|
|
offset[i] = (opndDcls[i]->getOffsetFromBase() + src->getLeftBound()) /
|
|
gra.kernel.numEltPerGRF<Type_UB>();
|
|
srcBC[i] = gra.getBankConflict(dcls[i]);
|
|
|
|
if (src->getBase()->asRegVar()->isPhyRegAssigned()) {
|
|
unsigned reg =
|
|
src->getBase()->asRegVar()->getPhyReg()->asGreg()->getRegNum();
|
|
srcBC[i] = ((reg + offset[i]) % 2) ? BANK_CONFLICT_SECOND_HALF_ODD
|
|
: BANK_CONFLICT_FIRST_HALF_EVEN;
|
|
gra.setBankConflict(dcls[i], srcBC[i]);
|
|
} else if (srcBC[i] != BANK_CONFLICT_NONE) {
|
|
if (offset[i] % 2) {
|
|
if (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN) {
|
|
srcBC[i] = BANK_CONFLICT_SECOND_HALF_ODD;
|
|
} else {
|
|
srcBC[i] = BANK_CONFLICT_FIRST_HALF_EVEN;
|
|
}
|
|
}
|
|
}
|
|
if (i != 0) {
|
|
bank_num += srcBC[i];
|
|
}
|
|
}
|
|
|
|
int simd8SrcNum = 0;
|
|
for (int i = 0; i < 3; i++) {
|
|
if (execSize[i] <= 32) {
|
|
simd8SrcNum++;
|
|
}
|
|
}
|
|
|
|
// In case (src0,) src1 and src2 use the same declare, i.e., the same register
|
|
if ((dcls[0] == dcls[1]) && (dcls[1] == dcls[2])) {
|
|
return;
|
|
}
|
|
|
|
// No bank assigned to the src operands;
// assign the two declares into different bundles/banks.
|
|
if (simd8SrcNum <= 1) // All simd16, do even align
|
|
{
|
|
for (int i = 0; i < 3; i++) {
|
|
if (execSize[i] > 32) {
|
|
srcBC[i] = offset[i] % 2 ? BANK_CONFLICT_SECOND_HALF_ODD
|
|
: BANK_CONFLICT_FIRST_HALF_EVEN;
|
|
gra.setBankConflict(dcls[i], srcBC[i]);
|
|
}
|
|
}
|
|
} else if (bank_num == 0) {
|
|
unsigned index[3];
|
|
|
|
refNumBasedSort(refNum, index);
|
|
|
|
if (dcls[index[0]] != dcls[index[1]]) {
|
|
setupEvenOddBankConflictsForDecls(dcls[index[0]], dcls[index[1]],
|
|
offset[index[0]], offset[index[1]],
|
|
srcBC[index[0]], srcBC[index[1]]);
|
|
}
|
|
} else {
|
|
if (srcBC[1] != BANK_CONFLICT_NONE) {
|
|
srcBC[2] = (srcBC[1] == BANK_CONFLICT_FIRST_HALF_EVEN)
|
|
? BANK_CONFLICT_SECOND_HALF_ODD
|
|
: BANK_CONFLICT_FIRST_HALF_EVEN;
|
|
if (offset[2] % 2) {
|
|
srcBC[2] = (srcBC[2] == BANK_CONFLICT_FIRST_HALF_EVEN)
|
|
? BANK_CONFLICT_SECOND_HALF_ODD
|
|
: BANK_CONFLICT_FIRST_HALF_EVEN;
|
|
}
|
|
gra.setBankConflict(dcls[2], srcBC[2]);
|
|
} else {
|
|
srcBC[1] = (srcBC[2] == BANK_CONFLICT_FIRST_HALF_EVEN)
|
|
? BANK_CONFLICT_SECOND_HALF_ODD
|
|
: BANK_CONFLICT_FIRST_HALF_EVEN;
|
|
if (offset[1] % 2) {
|
|
srcBC[1] = (srcBC[1] == BANK_CONFLICT_FIRST_HALF_EVEN)
|
|
? BANK_CONFLICT_SECOND_HALF_ODD
|
|
: BANK_CONFLICT_FIRST_HALF_EVEN;
|
|
}
|
|
gra.setBankConflict(dcls[1], srcBC[1]);
|
|
}
|
|
}
|
|
}
|
|
|
|
bool BankConflictPass::isOddOffset(unsigned offset) const {
|
|
if (gra.kernel.fg.builder->oneGRFBankDivision()) {
|
|
return (offset % 2);
|
|
} else {
|
|
return ((offset % 4) / 2);
|
|
}
|
|
}
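// A small sketch of the parity rule implemented by isOddOffset() above
// (illustration only, not used by the pass): with one-GRF bank division the
// "odd" offsets are 1, 3, 5, ...; with two-GRF bundles the offsets pair up,
// so 0,1 and 4,5 count as even while 2,3 and 6,7 count as odd.
#if 0
static bool isOddOffsetSketch(unsigned offset, bool oneGRFBankDivision) {
  // oneGRF:  0,1,0,1,...      twoGRF: 0,0,1,1,0,0,1,1,...
  return oneGRFBankDivision ? (offset % 2) != 0 : ((offset % 4) / 2) != 0;
}
#endif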
|
|
|
|
void BankConflictPass::setupBankConflictsforDPAS(G4_INST *inst) {
|
|
BankConflict srcBC[3];
|
|
unsigned refNum[3];
|
|
unsigned offset[3];
|
|
G4_Declare *dcls[3];
|
|
G4_Declare *opndDcls[3];
|
|
int bank_num = 0;
|
|
|
|
if (!inst->isDpas()) {
|
|
return;
|
|
}
|
|
|
|
for (int i = 0; i < 3; i += 1) {
|
|
opndDcls[i] = nullptr;
|
|
|
|
G4_Operand *src = inst->getSrc(i);
|
|
|
|
dcls[i] = GetTopDclFromRegRegion(src);
|
|
if (dcls[i]) {
|
|
opndDcls[i] = src->getBase()->asRegVar()->getDeclare();
|
|
|
|
refNum[i] = gra.getNumRefs(dcls[i]);
|
|
offset[i] = (opndDcls[i]->getOffsetFromBase() + src->getLeftBound()) /
|
|
gra.kernel.numEltPerGRF<Type_UB>();
|
|
srcBC[i] = gra.getBankConflict(dcls[i]);
|
|
|
|
if (srcBC[i] != BANK_CONFLICT_NONE) {
|
|
if (isOddOffset(offset[i])) {
|
|
if (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN) {
|
|
srcBC[i] = BANK_CONFLICT_SECOND_HALF_ODD;
|
|
} else {
|
|
srcBC[i] = BANK_CONFLICT_FIRST_HALF_EVEN;
|
|
}
|
|
}
|
|
if (i != 1) {
|
|
bank_num++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if (dcls[0] && dcls[1]) {
|
|
gra.addBundleConflictDcl(dcls[0], dcls[1], offset[0] - offset[1]);
|
|
gra.addBundleConflictDcl(dcls[1], dcls[0], offset[1] - offset[0]);
|
|
}
|
|
if (dcls[1] && dcls[2]) {
|
|
gra.addBundleConflictDcl(dcls[2], dcls[1], offset[2] - offset[1]);
|
|
gra.addBundleConflictDcl(dcls[1], dcls[2], offset[1] - offset[2]);
|
|
}
|
|
// In case src0 and src2 are null or use the same declare, i.e., the same register
|
|
if (dcls[0] == dcls[2] || !dcls[0] || !dcls[2]) {
|
|
return;
|
|
}
|
|
|
|
if (bank_num == 0) {
|
|
srcBC[0] = refNum[0] > refNum[2] ? BANK_CONFLICT_FIRST_HALF_EVEN
|
|
: BANK_CONFLICT_SECOND_HALF_ODD;
|
|
srcBC[2] = refNum[0] > refNum[2] ? BANK_CONFLICT_SECOND_HALF_ODD
|
|
: BANK_CONFLICT_FIRST_HALF_EVEN;
|
|
if (isOddOffset(offset[0])) {
|
|
srcBC[0] = (srcBC[0] == BANK_CONFLICT_FIRST_HALF_EVEN)
|
|
? BANK_CONFLICT_SECOND_HALF_ODD
|
|
: BANK_CONFLICT_FIRST_HALF_EVEN;
|
|
}
|
|
if (isOddOffset(offset[2])) {
|
|
srcBC[2] = (srcBC[2] == BANK_CONFLICT_FIRST_HALF_EVEN)
|
|
? BANK_CONFLICT_SECOND_HALF_ODD
|
|
: BANK_CONFLICT_FIRST_HALF_EVEN;
|
|
}
|
|
gra.setBankConflict(dcls[0], srcBC[0]);
|
|
gra.setBankConflict(dcls[2], srcBC[2]);
|
|
|
|
} else if (bank_num == 1) {
|
|
if (srcBC[0] != BANK_CONFLICT_NONE) {
|
|
srcBC[2] = (srcBC[0] == BANK_CONFLICT_FIRST_HALF_EVEN)
|
|
? BANK_CONFLICT_SECOND_HALF_ODD
|
|
: BANK_CONFLICT_FIRST_HALF_EVEN;
|
|
if (isOddOffset(offset[2])) {
|
|
srcBC[2] = (srcBC[2] == BANK_CONFLICT_FIRST_HALF_EVEN)
|
|
? BANK_CONFLICT_SECOND_HALF_ODD
|
|
: BANK_CONFLICT_FIRST_HALF_EVEN;
|
|
}
|
|
gra.setBankConflict(dcls[2], srcBC[2]);
|
|
} else {
|
|
srcBC[0] = (srcBC[2] == BANK_CONFLICT_FIRST_HALF_EVEN)
|
|
? BANK_CONFLICT_SECOND_HALF_ODD
|
|
: BANK_CONFLICT_FIRST_HALF_EVEN;
|
|
if (offset[0] % 2) {
|
|
srcBC[0] = (srcBC[0] == BANK_CONFLICT_FIRST_HALF_EVEN)
|
|
? BANK_CONFLICT_SECOND_HALF_ODD
|
|
: BANK_CONFLICT_FIRST_HALF_EVEN;
|
|
}
|
|
gra.setBankConflict(dcls[0], srcBC[0]);
|
|
}
|
|
}
|
|
|
|
#ifdef DEBUG_VERBOSE_ON
|
|
for (int i = 0; i < 3; i += 2) {
|
|
if (opndDcls[i]) {
|
|
printf("%s, ", opndDcls[i]->getName());
|
|
|
|
if (gra.getBankConflict(dcls[i]) == BANK_CONFLICT_FIRST_HALF_EVEN) {
|
|
printf("%s\n", "EVEN");
|
|
} else if (gra.getBankConflict(dcls[i]) ==
|
|
BANK_CONFLICT_SECOND_HALF_ODD) {
|
|
printf("%s\n", "ODD");
|
|
} else {
|
|
printf("%s\n", "NONE");
|
|
}
|
|
}
|
|
}
|
|
#endif
|
|
|
|
return;
|
|
}
|
|
|
|
|
|
void BankConflictPass::setupBundleConflictsforTwoSrcsInst(G4_INST *inst) {
|
|
vISA_ASSERT(inst->getNumSrc() == 2, "Only support two source operands instructions");
|
|
|
|
G4_Declare *dcls[2];
|
|
G4_Declare *opndDcls[2];
|
|
unsigned offset[2];
|
|
|
|
for (int i = 0; i < 2; i += 1) {
|
|
dcls[i] = nullptr;
|
|
opndDcls[i] = nullptr;
|
|
|
|
G4_Operand *src = inst->getSrc(i);
|
|
if (!src || !src->isSrcRegRegion() || src->isAreg()) {
|
|
// bank conflict not possible
|
|
continue;
|
|
}
|
|
|
|
dcls[i] = GetTopDclFromRegRegion(src);
|
|
opndDcls[i] = src->getBase()->asRegVar()->getDeclare();
|
|
offset[i] = (opndDcls[i]->getOffsetFromBase() + src->getLeftBound()) /
|
|
gra.kernel.numEltPerGRF<Type_UB>();
|
|
}
|
|
|
|
// Add potential bundle conflicts
|
|
if (dcls[0] && dcls[1]) {
|
|
gra.addBundleConflictDcl(dcls[0], dcls[1], offset[0] - offset[1]);
|
|
gra.addBundleConflictDcl(dcls[1], dcls[0], offset[1] - offset[0]);
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
void BankConflictPass::setupBankConflictsforMad(G4_INST *inst) {
|
|
BankConflict srcBC[3];
|
|
unsigned offset[3];
|
|
G4_Declare *dcls[3];
|
|
G4_Declare *opndDcls[3];
|
|
BankConflict assignedBank = BANK_CONFLICT_NONE; // Flip for next
|
|
|
|
for (int i = 0; i < 3; i += 1) {
|
|
dcls[i] = nullptr;
|
|
opndDcls[i] = nullptr;
|
|
|
|
G4_Operand *src = inst->getSrc(i);
|
|
if (!src || !src->isSrcRegRegion() || src->isAreg()) {
|
|
// bank conflict not possible
|
|
continue;
|
|
}
|
|
|
|
dcls[i] = GetTopDclFromRegRegion(src);
|
|
opndDcls[i] = src->getBase()->asRegVar()->getDeclare();
|
|
offset[i] = (opndDcls[i]->getOffsetFromBase() + src->getLeftBound()) /
|
|
gra.kernel.numEltPerGRF<Type_UB>();
|
|
srcBC[i] = gra.getBankConflict(dcls[i]);
|
|
|
|
if (srcBC[i] != BANK_CONFLICT_NONE) {
|
|
if (isOddOffset(offset[i])) {
|
|
if (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN) {
|
|
srcBC[i] = BANK_CONFLICT_SECOND_HALF_ODD;
|
|
} else {
|
|
srcBC[i] = BANK_CONFLICT_FIRST_HALF_EVEN;
|
|
}
|
|
}
|
|
if (assignedBank != BANK_CONFLICT_SECOND_HALF_EVEN) {
|
|
if (assignedBank == BANK_CONFLICT_NONE) {
|
|
assignedBank = srcBC[i];
|
|
} else if (assignedBank != srcBC[i]) {
|
|
assignedBank =
|
|
BANK_CONFLICT_SECOND_HALF_EVEN; // BANK_CONFLICT_SECOND_HALF_EVEN
|
|
// is used to represent all banks
|
|
// are assigned
|
|
}
|
|
}
|
|
}
|
|
}
|
|
// Add potential bundle conflicts so that RA can handle them when option
// -enableBundleCR is set to 2 or 3.
|
|
if (gra.kernel.getuInt32Option(vISA_enableBundleCR) & 2) {
|
|
if (dcls[0] && dcls[1]) {
|
|
gra.addBundleConflictDcl(dcls[0], dcls[1], offset[0] - offset[1]);
|
|
gra.addBundleConflictDcl(dcls[1], dcls[0], offset[1] - offset[0]);
|
|
}
|
|
if (dcls[1] && dcls[2]) {
|
|
gra.addBundleConflictDcl(dcls[2], dcls[1], offset[2] - offset[1]);
|
|
gra.addBundleConflictDcl(dcls[1], dcls[2], offset[1] - offset[2]);
|
|
}
|
|
}
|
|
|
|
for (int k = 0; k < 2; k++) {
|
|
for (int i = 2; i != -1; i--) {
|
|
if (!dcls[i]) {
|
|
continue;
|
|
}
|
|
|
|
LocalLiveRange *lr = gra.getLocalLR(dcls[i]);
|
|
if (!lr || (k == 0 && !lr->isLiveRangeLocal())) {
|
|
continue;
|
|
}
|
|
|
|
if (k == 1 && lr->isLiveRangeLocal()) {
|
|
continue;
|
|
}
|
|
|
|
if (assignedBank == BANK_CONFLICT_SECOND_HALF_EVEN) {
|
|
continue;
|
|
}
|
|
|
|
srcBC[i] = gra.getBankConflict(dcls[i]);
|
|
if (srcBC[i] != BANK_CONFLICT_NONE) {
|
|
if (isOddOffset(offset[i])) {
|
|
if (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN) {
|
|
srcBC[i] = BANK_CONFLICT_SECOND_HALF_ODD;
|
|
} else {
|
|
srcBC[i] = BANK_CONFLICT_FIRST_HALF_EVEN;
|
|
}
|
|
}
|
|
|
|
if (assignedBank == BANK_CONFLICT_NONE) {
|
|
assignedBank = srcBC[i];
|
|
} else if (srcBC[i] != assignedBank) {
|
|
assignedBank = BANK_CONFLICT_SECOND_HALF_EVEN;
|
|
}
|
|
|
|
continue;
|
|
}
|
|
|
|
if (assignedBank == BANK_CONFLICT_NONE) {
|
|
srcBC[i] = BANK_CONFLICT_FIRST_HALF_EVEN;
|
|
assignedBank = srcBC[i];
|
|
if (isOddOffset(offset[i])) {
|
|
srcBC[i] = (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN)
|
|
? BANK_CONFLICT_SECOND_HALF_ODD
|
|
: BANK_CONFLICT_FIRST_HALF_EVEN;
|
|
}
|
|
gra.setBankConflict(dcls[i], srcBC[i]);
|
|
} else {
|
|
srcBC[i] = (assignedBank == BANK_CONFLICT_FIRST_HALF_EVEN)
|
|
? BANK_CONFLICT_SECOND_HALF_ODD
|
|
: BANK_CONFLICT_FIRST_HALF_EVEN;
|
|
if (isOddOffset(offset[i])) {
|
|
srcBC[i] = (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN)
|
|
? BANK_CONFLICT_SECOND_HALF_ODD
|
|
: BANK_CONFLICT_FIRST_HALF_EVEN;
|
|
}
|
|
gra.setBankConflict(dcls[i], srcBC[i]);
|
|
assignedBank = BANK_CONFLICT_SECOND_HALF_EVEN;
|
|
}
|
|
}
|
|
}
|
|
|
|
#ifdef DEBUG_VERBOSE_ON
|
|
printf("$%d:\n", inst->getVISAId());
|
|
for (int i = 0; i < 3; i++) {
|
|
if (dcls[i]) {
|
|
printf("%s, ", dcls[i]->getName());
|
|
|
|
if (gra.getBankConflict(dcls[i]) == BANK_CONFLICT_FIRST_HALF_EVEN) {
|
|
printf("%s\n", "EVEN");
|
|
} else if (gra.getBankConflict(dcls[i]) ==
|
|
BANK_CONFLICT_SECOND_HALF_ODD) {
|
|
printf("%s\n", "ODD");
|
|
} else {
|
|
printf("%s\n", "NONE");
|
|
}
|
|
}
|
|
}
|
|
printf("\n");
|
|
#endif
|
|
|
|
return;
|
|
}
|
|
|
|
void BankConflictPass::setupBankConflictsForBB(G4_BB *bb,
|
|
unsigned &threeSourceInstNum,
|
|
unsigned &sendInstNum,
|
|
unsigned numRegLRA,
|
|
unsigned &internalConflict) {
|
|
int bank1RegNum = 0;
|
|
int bank2RegNum = 0;
|
|
float GRFRatio = 0;
|
|
G4_INST *prevInst = nullptr;
|
|
|
|
if (numRegLRA) {
|
|
GRFRatio = ((float)(numRegLRA - SECOND_HALF_BANK_START_GRF)) /
|
|
SECOND_HALF_BANK_START_GRF;
|
|
}
|
|
|
|
for (auto i = bb->rbegin(), rend = bb->rend(); i != rend; i++) {
|
|
G4_INST *inst = (*i);
|
|
if (inst->getNumSrc() == 3 && !inst->isSend()) {
|
|
threeSourceInstNum++;
|
|
setupBankConflictsOneGRFOld(inst, bank1RegNum, bank2RegNum, GRFRatio,
|
|
internalConflict);
|
|
}
|
|
if (inst->isSend() && !inst->isEOT()) {
|
|
// Why do only data port reads cause an issue?
|
|
if (inst->getMsgDesc()->isRead()) {
|
|
sendInstNum++;
|
|
}
|
|
}
|
|
}
|
|
|
|
if ((float)threeSourceInstNum / bb->size() > 0.1) {
|
|
if (!gra.kernel.fg.builder->lowHighBundle() &&
|
|
gra.kernel.fg.builder->hasEarlyGRFRead()) {
|
|
for (G4_INST *inst : *bb) {
|
|
if (prevInst && inst->getNumSrc() == 3 && !inst->isSend()) {
|
|
setupBankForSrc0(inst, prevInst);
|
|
}
|
|
prevInst = inst;
|
|
}
|
|
}
|
|
}
|
|
}
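// A worked example of the GRFRatio computed at the top of the function above,
// assuming SECOND_HALF_BANK_START_GRF == 64:
//   numRegLRA == 128  ->  GRFRatio = (128 - 64) / 64 = 1.0, so both halves are
//                         weighted equally when balancing bank1RegNum against
//                         bank2RegNum;
//   numRegLRA == 96   ->  GRFRatio = (96 - 64) / 64 = 0.5, so bank1RegNum is
//                         halved before the comparison and the larger first
//                         half ends up receiving proportionally more operands.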
|
|
|
|
void BankConflictPass::setupBankConflictsForBBTGL(G4_BB *bb,
|
|
unsigned &threeSourceInstNum,
|
|
unsigned &sendInstNum,
|
|
unsigned numRegLRA,
|
|
unsigned &internalConflict) {
|
|
G4_INST *prevInst = nullptr;
|
|
|
|
for (auto i = bb->rbegin(), rend = bb->rend(); i != rend; i++) {
|
|
G4_INST *inst = (*i);
|
|
if (inst->isSend() || inst->isCFInst() || inst->isLabel() ||
|
|
inst->isOptBarrier()) {
|
|
if (inst->isSend() && !inst->isEOT()) {
|
|
// Why do only data port reads cause an issue?
|
|
if (inst->getMsgDesc()->isRead()) {
|
|
sendInstNum++;
|
|
}
|
|
}
|
|
continue;
|
|
}
|
|
if (inst->getNumSrc() >= 3) {
|
|
threeSourceInstNum++;
|
|
if (inst->isDpas()) {
|
|
threeSourceInstNum += 8;
|
|
hasDpasInst = true;
|
|
setupBankConflictsforDPAS(inst);
|
|
} else {
|
|
setupBankConflictsforMad(inst);
|
|
}
|
|
} else if (!forGlobal &&
|
|
inst->getNumSrc() == 2) {
|
|
if (gra.forceBCR) {
|
|
threeSourceInstNum++;
|
|
setupBankConflictsforMad(inst);
|
|
}
|
|
if (gra.twoSrcBundleBCR) {
|
|
threeSourceInstNum++;
|
|
setupBundleConflictsforTwoSrcsInst(inst);
|
|
}
|
|
}
|
|
}
|
|
|
|
if ((float)threeSourceInstNum / bb->size() > 0.1) {
|
|
if (!gra.kernel.fg.builder->lowHighBundle() &&
|
|
gra.kernel.fg.builder->hasEarlyGRFRead()) {
|
|
for (G4_INST *inst : *bb) {
|
|
if (prevInst && inst->getNumSrc() == 3 && !inst->isSend()) {
|
|
setupBankForSrc0(inst, prevInst);
|
|
}
|
|
prevInst = inst;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Used for sorting BBs according to loop nest level and BB size.
|
|
bool compareBBLoopLevel(G4_BB *bb1, G4_BB *bb2) {
|
|
if (bb1->getNestLevel() > bb2->getNestLevel()) {
|
|
return true;
|
|
} else if (bb1->getNestLevel() == bb2->getNestLevel()) {
|
|
return bb1->size() > bb2->size();
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/*
 * Output:
 *   threeSourceCandidate - set if there are enough three-source instructions.
 *   return value - whether to apply bank conflict reduction in RR RA.
 */
|
|
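// A worked example of the three-source density check performed in
// setupBankConflictsForKernel below (arithmetic sketch only): a BB of 200
// instructions at nest level 1 gets loopNestLevel = 2, so its weight is
// 2 * BANK_CONFLICT_HEURISTIC_LOOP_ITERATION = 10, i.e. 2000 weighted
// instructions; 10 three-source instructions in that BB weigh 100, and the
// kernel-wide ratio 100 / 2000 = 0.05 exceeds BANK_CONFLICT_HEURISTIC_INST
// (0.04), so bank conflict reduction is attempted.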
bool BankConflictPass::setupBankConflictsForKernel(bool doLocalRR,
|
|
bool &threeSourceCandidate,
|
|
unsigned numRegLRA,
|
|
bool &highInternalConflict) {
|
|
unsigned threeSourceInstNumInKernel = 0;
|
|
unsigned internalConflict = 0;
|
|
unsigned instNumInKernel = 0;
|
|
unsigned sendInstNumInKernel = 0;
|
|
|
|
std::vector<G4_BB *> orderedBBs(gra.kernel.fg.cbegin(), gra.kernel.fg.cend());
|
|
std::sort(orderedBBs.begin(), orderedBBs.end(), compareBBLoopLevel);
|
|
|
|
for (auto bb : orderedBBs) {
|
|
unsigned instNum = 0;
|
|
unsigned sendInstNum = 0;
|
|
unsigned threeSourceInstNum = 0;
|
|
unsigned conflicts = 0;
|
|
|
|
unsigned loopNestLevel = 0;
|
|
|
|
if (gra.kernel.fg.builder->lowHighBundle()) {
|
|
setupBankConflictsForBB(bb, threeSourceInstNum, sendInstNum, numRegLRA,
|
|
conflicts);
|
|
} else {
|
|
setupBankConflictsForBBTGL(bb, threeSourceInstNum, sendInstNum, numRegLRA,
|
|
conflicts);
|
|
}
|
|
|
|
loopNestLevel = bb->getNestLevel() + 1;
|
|
|
|
if (threeSourceInstNum) {
|
|
instNum = (uint32_t)bb->size() * loopNestLevel *
|
|
BANK_CONFLICT_HEURISTIC_LOOP_ITERATION;
|
|
threeSourceInstNum = threeSourceInstNum * loopNestLevel *
|
|
BANK_CONFLICT_HEURISTIC_LOOP_ITERATION;
|
|
sendInstNum =
|
|
sendInstNum * loopNestLevel * BANK_CONFLICT_HEURISTIC_LOOP_ITERATION;
|
|
conflicts =
|
|
conflicts * loopNestLevel * BANK_CONFLICT_HEURISTIC_LOOP_ITERATION;
|
|
internalConflict += conflicts;
|
|
threeSourceInstNumInKernel += threeSourceInstNum;
|
|
instNumInKernel += instNum;
|
|
sendInstNumInKernel += sendInstNum;
|
|
}
|
|
}
|
|
|
|
if (!threeSourceInstNumInKernel ||
|
|
(float)threeSourceInstNumInKernel / instNumInKernel <
|
|
BANK_CONFLICT_HEURISTIC_INST) {
|
|
return false;
|
|
}
|
|
|
|
highInternalConflict =
|
|
((float)internalConflict / threeSourceInstNumInKernel) >
|
|
INTERNAL_CONFLICT_RATIO_HEURISTIC;
|
|
|
|
// Bank conflict reduction is done only when there are enough three-source
// instructions.
|
|
threeSourceCandidate = true;
|
|
|
|
if (doLocalRR && sendInstNumInKernel) {
|
|
if (!hasDpasInst && (sendInstNumInKernel > threeSourceInstNumInKernel)) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool GlobalRA::areAllDefsNoMask(G4_Declare *dcl) {
|
|
bool retval = true;
|
|
auto &maskUsed = getMask(dcl);
|
|
if (maskUsed.size() > 0 &&
|
|
getAugmentationMask(dcl) != AugmentationMasks::NonDefault) {
|
|
auto byteSize = dcl->getByteSize();
|
|
for (unsigned i = 0; i < byteSize; i++) {
|
|
if (maskUsed[i] != NOMASK_BYTE) {
|
|
retval = false;
|
|
break;
|
|
}
|
|
}
|
|
} else {
|
|
if (getAugmentationMask(dcl) == AugmentationMasks::NonDefault)
|
|
retval = true;
|
|
else
|
|
retval = false;
|
|
}
|
|
return retval;
|
|
}
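// A minimal sketch of the per-byte scan inside areAllDefsNoMask above
// (illustration only; the real function also consults the augmentation mask):
// every tracked byte must carry the NOMASK_BYTE (0x80) marker for the declare
// to count as defined entirely under NoMask.
#if 0
static bool allBytesNoMask(const unsigned char *maskUsed, unsigned byteSize) {
  for (unsigned i = 0; i < byteSize; i++)
    if (maskUsed[i] != 0x80) // NOMASK_BYTE
      return false;
  return true;
}
#endif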
|
|
|
|
BankAlign GlobalRA::getBankAlign(const G4_Declare *dcl) const {
|
|
const IR_Builder *builder = kernel.fg.builder;
|
|
switch (getBankConflict(dcl)) {
|
|
case BANK_CONFLICT_FIRST_HALF_EVEN:
|
|
case BANK_CONFLICT_SECOND_HALF_EVEN:
|
|
return builder->oneGRFBankDivision() ? BankAlign::Even
|
|
: BankAlign::Even2GRF;
|
|
case BANK_CONFLICT_FIRST_HALF_ODD:
|
|
case BANK_CONFLICT_SECOND_HALF_ODD:
|
|
return builder->oneGRFBankDivision() ? BankAlign::Odd : BankAlign::Odd2GRF;
|
|
default:
|
|
return BankAlign::Either;
|
|
}
|
|
}
|
|
|
|
void GlobalRA::emitFGWithLiveness(const LivenessAnalysis &liveAnalysis) const {
|
|
VISA_DEBUG_VERBOSE({
|
|
for (G4_BB *bb : kernel.fg) {
|
|
std::cout << "\n"
|
|
<< "-------------------------------------------------------"
|
|
"----------";
|
|
std::cout << "\nBB" << bb->getId() << ":";
|
|
std::cout << "\nPreds: ";
|
|
for (const G4_BB *pred : bb->Preds)
|
|
std::cout << "BB" << pred->getId() << ", ";
|
|
std::cout << "\nSuccs: ";
|
|
for (const G4_BB *succ : bb->Succs)
|
|
std::cout << "BB" << succ->getId() << ", ";
|
|
|
|
if (localRAEnable) {
|
|
if (auto summary = getBBLRASummary(bb)) {
|
|
std::cout << "\nLocal RA: ";
|
|
for (unsigned i = 0; i < kernel.getNumRegTotal(); i++) {
|
|
if (summary->isGRFBusy(i))
|
|
std::cout << "r" << i << ", ";
|
|
}
|
|
}
|
|
}
|
|
|
|
std::cout << "\nGen: ";
|
|
for (const G4_Declare *dcl : kernel.Declares) {
|
|
if (dcl->getAliasDeclare())
|
|
continue;
|
|
|
|
if (dcl->getRegVar()->isRegAllocPartaker()) {
|
|
if (liveAnalysis.use_gen[bb->getId()].test(
|
|
dcl->getRegVar()->getId())) {
|
|
std::cout << dcl->getName() << ", ";
|
|
}
|
|
}
|
|
}
|
|
|
|
std::cout << "\nKill: ";
|
|
for (const G4_Declare *dcl : kernel.Declares) {
|
|
if (dcl->getAliasDeclare())
|
|
continue;
|
|
|
|
if (dcl->getRegVar()->isRegAllocPartaker()) {
|
|
if (liveAnalysis.use_kill[bb->getId()].test(
|
|
dcl->getRegVar()->getId())) {
|
|
std::cout << dcl->getName() << ", ";
|
|
}
|
|
}
|
|
}
|
|
|
|
std::cout << "\nLive-in: ";
|
|
for (const G4_Declare *dcl : kernel.Declares) {
|
|
if (dcl->getAliasDeclare())
|
|
continue;
|
|
|
|
if (dcl->getRegVar()->isRegAllocPartaker()) {
|
|
if (liveAnalysis.isLiveAtEntry(bb, dcl->getRegVar()->getId())) {
|
|
std::cout << dcl->getName() << ", ";
|
|
}
|
|
}
|
|
}
|
|
|
|
std::cout << "\nLive-out: ";
|
|
for (const G4_Declare *dcl : kernel.Declares) {
|
|
if (dcl->getAliasDeclare())
|
|
continue;
|
|
|
|
if (dcl->getRegVar()->isRegAllocPartaker()) {
|
|
if (liveAnalysis.isLiveAtExit(bb, dcl->getRegVar()->getId())) {
|
|
std::cout << dcl->getName() << ", ";
|
|
}
|
|
}
|
|
}
|
|
std::cout << "\n";
|
|
bb->emit(COUT_ERROR);
|
|
}
|
|
});
|
|
}
|
|
|
|
void GlobalRA::reportSpillInfo(const LivenessAnalysis &liveness,
|
|
const GraphColor &coloring) const {
|
|
// Emit out interference graph of each spill candidate
|
|
// and if a spill candidate is a local range, emit its
|
|
// start and end line number in file.
|
|
const auto& lrs = coloring.getLiveRanges();
|
|
|
|
for (const vISA::LiveRange *slr : coloring.getSpilledLiveRanges()) {
|
|
if (slr->getRegKind() == G4_GRF) {
|
|
const G4_RegVar *spillVar = slr->getVar();
|
|
VISA_DEBUG_VERBOSE({
|
|
std::cout << "Spill candidate " << spillVar->getName() << " intf:";
|
|
std::cout << "\t(" << spillVar->getDeclare()->getTotalElems()
|
|
<< "):" << TypeSymbol(spillVar->getDeclare()->getElemType())
|
|
<< "\n";
|
|
});
|
|
|
|
if (getLocalLR(spillVar->getDeclare())) {
|
|
if (getLocalLR(spillVar->getDeclare())->isLiveRangeLocal()) {
|
|
[[maybe_unused]] int start, end;
|
|
unsigned dummy;
|
|
start = getLocalLR(spillVar->getDeclare())
|
|
->getFirstRef(dummy)
|
|
->getLineNo();
|
|
end = getLocalLR(spillVar->getDeclare())
|
|
->getLastRef(dummy)
|
|
->getLineNo();
|
|
VISA_DEBUG_VERBOSE(std::cout
|
|
<< "(Liverange is local starting at line #"
|
|
<< start << " and ending at line #" << end << ")"
|
|
<< "\n");
|
|
|
|
}
|
|
}
|
|
|
|
const Interference *intf = coloring.getIntf();
|
|
unsigned spillVarId = slr->getVar()->getId();
|
|
|
|
for (int i = 0; i < (int)liveness.getNumSelectedVar(); i++) {
|
|
if (intf->interfereBetween(spillVarId, i)) {
|
|
const G4_RegVar *intfRangeVar = lrs[i]->getVar();
|
|
(void)intfRangeVar;
|
|
VISA_DEBUG_VERBOSE(
|
|
std::cout << "\t" << intfRangeVar->getName() << "("
|
|
<< intfRangeVar->getDeclare()->getTotalElems() << "):"
|
|
<< TypeSymbol(
|
|
intfRangeVar->getDeclare()->getElemType()));
|
|
|
|
if (!lrs[i]->getPhyReg()) {
|
|
VISA_DEBUG_VERBOSE(std::cout << " --- spilled");
|
|
}
|
|
VISA_DEBUG_VERBOSE(std::cout << ",\n");
|
|
}
|
|
}
|
|
|
|
VISA_DEBUG_VERBOSE(std::cout << "\n\n");
|
|
}
|
|
}
|
|
}
|
|
|
|
LiveRange::LiveRange(G4_RegVar *v, GlobalRA &g)
|
|
: var(v), dcl(v->getDeclare()), regKind(dcl->getRegFile()), gra(g),
|
|
numRegNeeded(dcl->getNumRegNeeded()) {
|
|
isCandidate = true;
|
|
}
|
|
|
|
void LiveRange::initializeForbidden() {
|
|
auto rf = gra.incRA.getSelectedRF();
|
|
if (LivenessAnalysis::livenessClass(rf, G4_ADDRESS)) {
|
|
setForbidden(forbiddenKind::FBD_ADDR);
|
|
} else if (LivenessAnalysis::livenessClass(rf, G4_FLAG)) {
|
|
setForbidden(forbiddenKind::FBD_FLAG);
|
|
} else if (LivenessAnalysis::livenessClass(rf, G4_SCALAR)) {
|
|
setForbidden(forbiddenKind::FBD_SCALAR);
|
|
} else {
|
|
setForbidden(forbiddenKind::FBD_RESERVEDGRF);
|
|
};
|
|
|
|
bool hasStackCall =
|
|
gra.kernel.fg.getHasStackCalls() || gra.kernel.fg.getIsStackCallFunc();
|
|
setCallerSaveBias(hasStackCall);
|
|
if (getRegKind() == G4_GRF) {
|
|
if (gra.kernel.fg.isPseudoVCADcl(dcl)) {
|
|
setForbidden(forbiddenKind::FBD_CALLERSAVE);
|
|
} else if (gra.kernel.fg.isPseudoVCEDcl(dcl)) {
|
|
setForbidden(forbiddenKind::FBD_CALLEESAVE);
|
|
} else if (dcl == gra.getOldFPDcl()) {
|
|
setForbidden(forbiddenKind::FBD_CALLERSAVE);
|
|
}
|
|
}
|
|
}
|
|
|
|
void LiveRange::initialize() {
|
|
if (gra.kernel.fg.isPseudoDcl(dcl)) {
|
|
setIsPseudoNode();
|
|
}
|
|
if (dcl->getIsPartialDcl()) {
|
|
if (G4_Declare *parentDcl = gra.getSplittedDeclare(dcl)) {
|
|
setParentLRID(parentDcl->getRegVar()->getId());
|
|
setIsPartialDcl();
|
|
}
|
|
}
|
|
if (dcl->getIsSplittedDcl()) {
|
|
setIsSplittedDcl(true);
|
|
}
|
|
setBC(gra.getBankConflict(dcl));
|
|
|
|
initializeForbidden();
|
|
}
|
|
|
|
LiveRange *LiveRange::createNewLiveRange(G4_Declare *dcl, GlobalRA &gra) {
|
|
auto &IncRAMem = gra.incRA.mem;
|
|
G4_RegVar *var = dcl->getRegVar();
|
|
vISA_ASSERT(!dcl->getAliasDeclare(),
|
|
"error: attempt to create LiveRange for non-root dcl");
|
|
auto *lr = new (IncRAMem) LiveRange(var, gra);
|
|
|
|
lr->initialize();
|
|
|
|
return lr;
|
|
}
|
|
|
|
void LiveRange::checkForInfiniteSpillCost(
|
|
G4_BB *bb, std::list<G4_INST *>::reverse_iterator &it) {
|
|
// G4_INST at *it defines liverange object (this ptr)
|
|
// If the next instruction of the iterator uses the same liverange, then
// it may be a potential infinite spill cost candidate.
// To confirm, the following requirements should be fulfilled:
|
|
// a. this liverange is not a global
|
|
// b. this liverange is defined/used in these 2 instructions only
|
|
//
|
|
// The idea is for ranges marked with infinite spill cost,
|
|
// coloring will attempt to put them on top of stack so they
|
|
// have higher chance of getting a color. If a range that should
|
|
// be infinite spill cost is not marked as being so, the only
|
|
// downside is extra compile time spent in inserting spill code
|
|
// and then punting out when later spilled code will cause
|
|
// even more spills.
|
|
//
|
|
// The assumption is that current live-range is a current register
|
|
// allocation candidate.
|
|
//
|
|
G4_INST *curInst = (*it);
|
|
|
|
// Skip the check if curInst is a pseudoKill
|
|
// Otherwise, it may invalidate a previously marked infinite
|
|
// spill cost candidate, e.g.,
|
|
// pseudo_kill (1) P1(0,0)[1]:uw [Align1]
|
|
// mov (1) P1(0,0)[1]:uw TV1(8,0)[0;1,0]:uw [Align1, NoMask]
|
|
// (+P1.0) sel (16) V65(0,0)[1]:f TV0(0,0)[0;1,0]:f 0:f [Align1, H1]
|
|
if (curInst->isPseudoKill()) {
|
|
return;
|
|
}
|
|
|
|
// Check whether dst variable is a global
|
|
if (gra.isBlockLocal(this->getDcl()) == false) {
|
|
isCandidate = false;
|
|
isInfiniteCost = false;
|
|
|
|
return;
|
|
}
|
|
|
|
G4_DstRegRegion *dst = curInst->getDst();
|
|
// If cur instruction dst is indirect write then return
|
|
if (dst && dst->getRegAccess() == IndirGRF &&
|
|
dst->getBase()->asRegVar()->getId() == this->getVar()->getId()) {
|
|
return;
|
|
}
|
|
|
|
// isCandidate is set to true only for first definition ever seen.
|
|
// If more than 1 def if found this gets set to false.
|
|
const std::list<G4_INST *>::reverse_iterator rbegin = bb->rbegin();
|
|
if (this->isCandidate == true && it != rbegin) {
|
|
G4_INST *nextInst = NULL;
|
|
if (this->getRefCount() != 2 || (this->getRegKind() == G4_GRF &&
|
|
this->getDcl()->getAddressed() == true)) {
|
|
// If a liverange has > 2 refs then it
|
|
// cannot be a candidate.
|
|
// Also an address taken GRF is not a candidate.
|
|
// This represents an early exit.
|
|
isCandidate = false;
|
|
isInfiniteCost = false;
|
|
|
|
return;
|
|
}
|
|
|
|
// Skip all pseudo kills
|
|
std::list<G4_INST *>::reverse_iterator next = it;
|
|
while (true) {
|
|
if (next == rbegin) {
|
|
isCandidate = isInfiniteCost = false;
|
|
return;
|
|
}
|
|
--next;
|
|
|
|
// This is not a pseudo-kill instruction, then find
|
|
// the desired next instruction. Otherwise, continue.
|
|
nextInst = *next;
|
|
if (!(nextInst->isPseudoKill()))
|
|
break;
|
|
}
|
|
|
|
// Check whether this liverange is used in nextInst
|
|
for (unsigned i = 0, numSrc = nextInst->getNumSrc(); i < numSrc; i++) {
|
|
G4_Operand *src = nextInst->getSrc(i);
|
|
|
|
if (src && src->isSrcRegRegion() &&
|
|
src->getBase()->isRegAllocPartaker()) {
|
|
// src can be Direct/Indirect
|
|
G4_SrcRegRegion *srcRgn = src->asSrcRegRegion();
|
|
|
|
if (srcRgn->getRegAccess() == Direct && srcRgn->getBase()->isRegVar() &&
|
|
srcRgn->getBase()->asRegVar()->getId() == this->getVar()->getId()) {
|
|
// Def-use found back-to-back
|
|
isInfiniteCost = true;
|
|
// Identify no more candidates
|
|
isCandidate = false;
|
|
} else if (this->getRegKind() == G4_ADDRESS &&
|
|
srcRgn->getRegAccess() == IndirGRF &&
|
|
srcRgn->getBase()->isRegVar() &&
|
|
srcRgn->getBase()->asRegVar()->getId() ==
|
|
this->getVar()->getId()) {
|
|
// Def-use found back-to-back
|
|
isInfiniteCost = true;
|
|
// Identify no more candidates
|
|
isCandidate = false;
|
|
}
|
|
}
|
|
}
|
|
|
|
G4_DstRegRegion *nextDst = nextInst->getDst();
|
|
if (isCandidate == true && this->getRegKind() == G4_ADDRESS && nextDst &&
|
|
nextDst->getRegAccess() == IndirGRF && nextDst->getBase()->isRegVar() &&
|
|
nextDst->getBase()->asRegVar()->isRegAllocPartaker() &&
|
|
nextDst->getBase()->asRegVar()->getId() == this->getVar()->getId()) {
|
|
// Pattern found:
|
|
// A0=
|
|
// r[A0]=
|
|
isInfiniteCost = true;
|
|
// Identify no more candidates
|
|
isCandidate = false;
|
|
}
|
|
|
|
if (isCandidate == true && this->getRegKind() == G4_FLAG &&
|
|
nextInst->getPredicate() && nextInst->getPredicate()->getBase() &&
|
|
nextInst->getPredicate()->getBase()->isRegVar() &&
|
|
nextInst->getPredicate()->getBase()->asRegVar()->isRegAllocPartaker() &&
|
|
nextInst->getPredicate()->getBase()->asRegVar()->getId() ==
|
|
this->getVar()->getId()) {
|
|
// Pattern found:
|
|
// P0 = or cmp.P0 = <-- P0 defined
|
|
// (P0) ... <-- P0 used as predicate
|
|
isInfiniteCost = true;
|
|
// Identify no more candidates
|
|
isCandidate = false;
|
|
}
|
|
|
|
VISA_DEBUG_VERBOSE({
|
|
if (isInfiniteCost == true) {
|
|
std::cout
|
|
<< "Marking " << this->getDcl()->getName()
|
|
<< " as having infinite spill cost due to back-to-back def-use"
|
|
<< "\n";
|
|
}
|
|
});
|
|
|
|
// Once a def is seen, stop looking for more defs
|
|
isCandidate = false;
|
|
} else {
|
|
VISA_DEBUG_VERBOSE({
|
|
if (isInfiniteCost == true) {
|
|
std::cout << "Unmarking " << this->getDcl()->getName()
|
|
<< " as having infinite spill cost"
|
|
<< "\n";
|
|
}
|
|
});
|
|
isCandidate = false;
|
|
isInfiniteCost = false;
|
|
}
|
|
}
|
|
|
|
//
|
|
// return true, if live ranges v1 and v2 interfere
|
|
//
|
|
bool Interference::interfereBetween(unsigned v1, unsigned v2) const {
|
|
if (v1 > v2) {
|
|
std::swap(v1, v2);
|
|
}
|
|
|
|
if (useDenseMatrix()) {
|
|
unsigned col = v2 / BITS_DWORD;
|
|
return matrix[v1 * rowSize + col] & (1 << (v2 % BITS_DWORD));
|
|
} else {
|
|
auto &set1 = sparseMatrix[v1];
|
|
return set1.test(v2);
|
|
}
|
|
}
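// A minimal sketch of the dense upper-triangular lookup above, assuming
// BITS_DWORD == 32: row v1 holds rowSize dwords, and bit (v2 % 32) of column
// (v2 / 32) records whether v1 and v2 interfere. Kept inactive; it only
// illustrates the indexing.
#if 0
static bool denseTest(const unsigned *matrix, unsigned rowSize, unsigned v1,
                      unsigned v2) {
  if (v1 > v2) { // only the upper triangle (v1 < v2) is stored
    unsigned t = v1;
    v1 = v2;
    v2 = t;
  }
  unsigned col = v2 / 32;                          // dword column holding bit v2
  return matrix[v1 * rowSize + col] & (1u << (v2 % 32));
}
#endif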
|
|
|
|
//
|
|
// init live vector with all live ranges that are live at the exit
|
|
// also set the next seq use of any live range that is live across to be INT_MAX
|
|
// to indicate that this live range does not have exclusive sequential uses and
|
|
// hence is not a candidate for being marked with an infinite spill cost.
|
|
//
|
|
void Interference::buildInterferenceAtBBExit(const G4_BB *bb,
|
|
SparseBitVector &live) {
|
|
|
|
// live must be empty at this point
|
|
live = liveAnalysis->use_out[bb->getId()];
|
|
live &= liveAnalysis->def_out[bb->getId()];
|
|
}
|
|
|
|
//
|
|
// Filter out partial or split declares in batch interference.
|
|
//
|
|
inline void Interference::filterSplitDclares(unsigned startIdx, unsigned endIdx,
|
|
unsigned n, unsigned col,
|
|
unsigned &elt, bool is_partial) {
|
|
|
|
if (is_partial) // Don't add interference with the parent
|
|
{
|
|
unsigned rowSplited = n / BITS_DWORD;
|
|
if (rowSplited == col) {
|
|
elt &= ~(1 << (n % BITS_DWORD));
|
|
}
|
|
}
|
|
|
|
// If current is a split dcl, don't add interference with any of its child nodes.
// If current is a partial dcl, don't add interference with any other child nodes.
|
|
if (col >= startIdx / BITS_DWORD && col < (endIdx / BITS_DWORD + 1)) {
|
|
unsigned selt = 0;
|
|
unsigned start_id = col * BITS_DWORD > startIdx ? 0 : startIdx % BITS_DWORD;
|
|
unsigned end_id =
|
|
(col + 1) * BITS_DWORD > endIdx ? endIdx % BITS_DWORD : BITS_DWORD;
|
|
|
|
for (unsigned i = start_id; i < end_id; i++) {
|
|
selt |= 1 << i;
|
|
}
|
|
elt &= ~selt;
|
|
}
|
|
|
|
return;
|
|
}
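// A minimal sketch of the range mask built in filterSplitDclares above
// (illustration only): set the bits for ids [start_id, end_id) within one
// 32-bit column and clear them from elt, so those ids are excluded from the
// interference row.
#if 0
static unsigned clearIdRange(unsigned elt, unsigned start_id, unsigned end_id) {
  unsigned selt = 0;
  for (unsigned i = start_id; i < end_id; i++)
    selt |= 1u << i;    // bits for ids start_id .. end_id-1
  return elt & ~selt;   // drop those ids from the interference column
}
#endif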
|
|
|
|
//
// Set interference for all live ranges that are currently live.
// For partial declares, the following rules are applied:
//   a. The current partial declare does not interfere with any other partial
//      declare.
//   b. The current parent declare does not interfere with its child declares
//      (can child declares interfere with the parent declare?).
//   c. The current partial declare does not interfere with hybrid declares
//      added by local RA; the reason is simple: those declares have already
//      been assigned registers.
//
|
|
void Interference::buildInterferenceWithLive(const SparseBitVector &live,
|
|
unsigned i) {
|
|
// Set interference between the variable with index "i" and the variables set
// in "live"; j iterates over the set bit indexes in "live".
|
|
for (unsigned j : live) {
|
|
if (!varSplitCheckBeforeIntf(i, j)) {
|
|
if (j < i) {
|
|
safeSetInterference(j, i);
|
|
} else if (j > i) {
|
|
safeSetInterference(i, j);
|
|
}
|
|
}
|
|
}
|
|
const LiveRange *lr = lrs[i];
|
|
bool is_partial = lr->getIsPartialDcl();
|
|
bool is_splitted = lr->getIsSplittedDcl();
|
|
unsigned n = 0;
|
|
unsigned start_idx = 0; // The variable index of the first child declare; the
                        // child variables' indexes are contiguous.
|
|
unsigned end_idx = 0; // The variable index of the last child declare
|
|
if (is_splitted) // If current is a split dcl, don't add interference with any
                 // of its child nodes.
|
|
{
|
|
start_idx = lr->getDcl()->getSplitVarStartID();
|
|
end_idx = start_idx + gra.getSplitVarNum(lr->getDcl());
|
|
}
|
|
if (is_partial) // If current is a partial dcl, don't add interference with any
                // other partial dcls or with its parent dcl.
|
|
{
|
|
// n is the variable ID of the split (parent) declare
|
|
n = gra.getSplittedDeclare(lr->getDcl())->getRegVar()->getId();
|
|
start_idx = splitStartId;
|
|
end_idx = splitStartId + splitNum;
|
|
}
|
|
|
|
if (is_partial) { // Don't add interference with the parent
|
|
if (i < n) {
|
|
safeClearInterference(i, n);
|
|
} else {
|
|
safeClearInterference(n, i);
|
|
}
|
|
}
|
|
for (unsigned j = start_idx; j < end_idx; j++) { // Don't add interference with the children
|
|
if (j < i) {
|
|
safeClearInterference(j, i);
|
|
} else {
|
|
safeClearInterference(i, j);
|
|
}
|
|
}
|
|
}
|
|
|
|
void Interference::buildInterferenceWithSubDcl(unsigned lr_id, G4_Operand *opnd,
|
|
SparseBitVector &live, bool setLive,
|
|
bool setIntf) {
|
|
|
|
const G4_Declare *dcl = lrs[lr_id]->getDcl();
|
|
for (const G4_Declare *subDcl : gra.getSubDclList(dcl)) {
|
|
unsigned leftBound = gra.getSubOffset(subDcl);
|
|
unsigned rightBound = leftBound + subDcl->getByteSize() - 1;
|
|
if (!(opnd->getRightBound() < leftBound ||
|
|
rightBound < opnd->getLeftBound())) {
|
|
int subID = subDcl->getRegVar()->getId();
|
|
|
|
if (setIntf) {
|
|
buildInterferenceWithLive(live, subID);
|
|
}
|
|
if (setLive) {
|
|
live.set(subID);
|
|
}
|
|
}
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
void Interference::buildInterferenceWithAllSubDcl(unsigned v1, unsigned v2) {
|
|
const G4_Declare *d1 = lrs[v1]->getDcl();
|
|
const G4_Declare *d2 = lrs[v2]->getDcl();
|
|
|
|
if (d1->getIsSplittedDcl() && !d2->getIsPartialDcl()) {
|
|
for (const G4_Declare *subDcl : gra.getSubDclList(d1)) {
|
|
int subID = subDcl->getRegVar()->getId();
|
|
checkAndSetIntf(v2, subID);
|
|
}
|
|
}
|
|
|
|
if (d2->getIsSplittedDcl() && !d1->getIsPartialDcl()) {
|
|
for (const G4_Declare *subDcl : gra.getSubDclList(d2)) {
|
|
int subID = subDcl->getRegVar()->getId();
|
|
checkAndSetIntf(v1, subID);
|
|
}
|
|
}
|
|
|
|
return;
|
|
}
|
|
//
|
|
// Bias the live ranges in "live" to be assigned the callee-save registers as
|
|
// they are live through a stack call. Exclude file scope variables as they are
|
|
// always saved/restored before/after the call and are better assigned to the
|
|
// caller-save space.
|
|
//
|
|
void Interference::addCalleeSaveBias(const SparseBitVector &live) {
|
|
for (unsigned i = 0; i < maxId; i++) {
|
|
if (live.test(i)) {
|
|
lrs[i]->setCallerSaveBias(false);
|
|
lrs[i]->setCalleeSaveBias(true);
|
|
}
|
|
}
|
|
}
|
|
|
|
void Interference::buildInterferenceAmongLiveOuts() {
|
|
// Mark interference between dcls marked as Output.
|
|
//
|
|
// Interference computation marks interference for a
|
|
// variable only when definition for that variable is
|
|
// seen, not otherwise.
|
|
//
|
|
// This method is useful when definitions of such
// "Output" variables are emitted to the program post-RA.
|
|
//
|
|
// It is safe to mark interference between all "Output"
|
|
// dcls even when their definition is present in the program.
|
|
|
|
// First gather all Output dcls in a vector to avoid an O(N^2)
|
|
// lookup. Number of OutputDcls should be small.
|
|
std::vector<G4_Declare *> OutputDcls;
|
|
for (auto dcl : kernel.Declares) {
|
|
if (!dcl->getRegVar()->isRegAllocPartaker() || !dcl->isOutput())
|
|
continue;
|
|
|
|
OutputDcls.push_back(dcl);
|
|
}
|
|
|
|
for (auto dcl1 : OutputDcls) {
|
|
// dcl1 is an RA partaker and is marked as Output
|
|
for (auto dcl2 : OutputDcls) {
|
|
if (dcl1 == dcl2)
|
|
continue;
|
|
|
|
checkAndSetIntf(dcl1->getRegVar()->getId(), dcl2->getRegVar()->getId());
|
|
}
|
|
}
|
|
}
|
|
|
|
void Interference::buildInterferenceAmongLiveIns() {
|
|
//
|
|
// Build interference between all live-ins. If all live-ins are only
|
|
// read, then their interference will be skipped in an earlier phase.
// For example, args and globals are both live-in, and both may have only
// uses in the function and no def.
|
|
//
|
|
const G4_BB *entryBB = kernel.fg.getEntryBB();
|
|
|
|
for (auto it = liveAnalysis->globalVars.begin();
|
|
it != liveAnalysis->globalVars.end(); ++it) {
|
|
auto i = (*it);
|
|
if (liveAnalysis->isLiveAtEntry(entryBB, i)) {
|
|
// Mark reference cannot guarantee that all the variables are local; update
// here.
|
|
if (lrs[i]->getDcl()->getIsSplittedDcl()) {
|
|
lrs[i]->getDcl()->setIsSplittedDcl(false);
|
|
lrs[i]->setIsSplittedDcl(false);
|
|
}
|
|
|
|
auto nextIt = it;
|
|
for (auto nit = ++nextIt; nit != liveAnalysis->globalVars.end(); ++nit) {
|
|
auto j = (*nit);
|
|
if (liveAnalysis->isLiveAtEntry(entryBB, j)) {
|
|
if (lrs[i]->getDcl()->getRegFile() == G4_INPUT &&
|
|
lrs[i]->getVar()->getPhyReg() != NULL &&
|
|
lrs[j]->getDcl()->getRegFile() == G4_INPUT &&
|
|
lrs[j]->getVar()->getPhyReg() != NULL) {
|
|
continue;
|
|
} else {
|
|
if (!varSplitCheckBeforeIntf(i, j)) {
|
|
checkAndSetIntf(i, j);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void Interference::markInterferenceForSend(G4_BB *bb, G4_INST *inst,
|
|
G4_DstRegRegion *dst) {
|
|
bool isDstRegAllocPartaker = false;
|
|
bool isDstLocallyAssigned = false;
|
|
unsigned dstId = 0;
|
|
int dstPreg = 0, dstNumRows = 0;
|
|
|
|
if (dst->getBase()->isRegVar()) {
|
|
if (dst->getBase()->isRegAllocPartaker()) {
|
|
G4_DstRegRegion *dstRgn = dst;
|
|
isDstRegAllocPartaker = true;
|
|
dstId = ((G4_RegVar *)dstRgn->getBase())->getId();
|
|
} else if (gra.useLocalRA) {
|
|
LocalLiveRange *localLR = NULL;
|
|
G4_Declare *topdcl = GetTopDclFromRegRegion(dst);
|
|
|
|
if (topdcl)
|
|
localLR = gra.getLocalLR(topdcl);
|
|
|
|
if (localLR && localLR->getAssigned()) {
|
|
int sreg;
|
|
G4_VarBase *preg = localLR->getPhyReg(sreg);
|
|
|
|
vISA_ASSERT(preg->isGreg(), "Register in dst was not GRF");
|
|
|
|
isDstLocallyAssigned = true;
|
|
dstPreg = preg->asGreg()->getRegNum();
|
|
dstNumRows = localLR->getTopDcl()->getNumRows();
|
|
}
|
|
}
|
|
|
|
if (isDstRegAllocPartaker || isDstLocallyAssigned) {
|
|
for (unsigned j = 0, numSrc = inst->getNumSrc(); j < numSrc; j++) {
|
|
G4_Operand *src = inst->getSrc(j);
|
|
if (src && src->isSrcRegRegion() &&
|
|
src->asSrcRegRegion()->getBase()->isRegVar()) {
|
|
if (src->asSrcRegRegion()->getBase()->isRegAllocPartaker()) {
|
|
unsigned srcId =
|
|
src->asSrcRegRegion()->getBase()->asRegVar()->getId();
|
|
|
|
if (isDstRegAllocPartaker) {
|
|
if (!varSplitCheckBeforeIntf(dstId, srcId)) {
|
|
checkAndSetIntf(dstId, srcId);
|
|
buildInterferenceWithAllSubDcl(dstId, srcId);
|
|
}
|
|
} else {
|
|
for (int j = dstPreg, sum = dstPreg + dstNumRows; j < sum; j++) {
|
|
int k = getGRFDclForHRA(j)->getRegVar()->getId();
|
|
if (!varSplitCheckBeforeIntf(k, srcId)) {
|
|
checkAndSetIntf(k, srcId);
|
|
buildInterferenceWithAllSubDcl(k, srcId);
|
|
}
|
|
}
|
|
}
|
|
} else if (gra.useLocalRA && isDstRegAllocPartaker) {
|
|
LocalLiveRange *localLR = nullptr;
|
|
const G4_Declare *topdcl = GetTopDclFromRegRegion(src);
|
|
|
|
if (topdcl)
|
|
localLR = gra.getLocalLR(topdcl);
|
|
|
|
if (localLR && localLR->getAssigned()) {
|
|
int sreg;
|
|
G4_VarBase *preg = localLR->getPhyReg(sreg);
|
|
int numrows = localLR->getTopDcl()->getNumRows();
|
|
|
|
vISA_ASSERT(preg->isGreg(), "Register in src was not GRF");
|
|
|
|
int reg = preg->asGreg()->getRegNum();
|
|
|
|
for (int j = reg, sum = reg + numrows; j < sum; j++) {
|
|
int k = getGRFDclForHRA(j)->getRegVar()->getId();
|
|
if (!varSplitCheckBeforeIntf(dstId, k)) {
|
|
checkAndSetIntf(dstId, k);
|
|
buildInterferenceWithAllSubDcl(dstId, k);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void Interference::setOutOfBoundForbidden(G4_Operand *opnd) {
|
|
G4_Declare *dcl = opnd->getBaseRegVarRootDeclare();
|
|
vISA_ASSERT(dcl, "NULL declare");
|
|
int dclEndGRF = (dcl->getByteSize() - 1) / builder.numEltPerGRF<Type_UB>();
|
|
int opndEndGRF = opnd->getLinearizedEnd() / builder.numEltPerGRF<Type_UB>();
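  // Worked example (assuming a 32-byte GRF): a 64-byte dcl has
  // dclEndGRF = (64 - 1) / 32 = 1; if the operand's linearized end falls in
  // row 2, then opndEndGRF = 2 > dclEndGRF, and the live range below gets
  // FBD_LASTGRF so the extra row accessed by svm gather stays within bounds.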
unsigned lrId = ((G4_RegVar *)opnd->getBase())->getId();
|
|
LiveRange *lr = lrs[lrId];
|
|
|
|
if (lr && (opndEndGRF > dclEndGRF)) {
|
|
vISA_ASSERT((opndEndGRF - dclEndGRF) == 1,
|
|
"More register reservation required for svm gather");
|
|
lr->setForbidden(forbiddenKind::FBD_LASTGRF);
|
|
}
|
|
}
|
|
|
|
void Interference::setForbiddenGRFNumForSVMScatter(G4_INST *inst) {
|
|
G4_DstRegRegion *dst = inst->getDst();
|
|
|
|
if (dst && dst->getBase()->isRegVar()) {
|
|
if (dst->getBase()->isRegAllocPartaker()) {
|
|
setOutOfBoundForbidden(dst);
|
|
}
|
|
}
|
|
|
|
for (unsigned j = 0, numSrc = inst->getNumSrc(); j < numSrc; j++) {
|
|
G4_Operand *src = inst->getSrc(j);
|
|
if (src && src->isSrcRegRegion() &&
|
|
src->asSrcRegRegion()->getBase()->isRegVar()) {
|
|
if (src->asSrcRegRegion()->getBase()->isRegAllocPartaker()) {
|
|
setOutOfBoundForbidden(src);
|
|
}
|
|
}
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
void Interference::markInterferenceToAvoidDstSrcOverlap(G4_BB *bb,
|
|
G4_INST *inst) {
|
|
bool isDstRegAllocPartaker = false;
|
|
bool isDstLocallyAssigned = false;
|
|
unsigned dstId = 0;
|
|
int dstPreg = 0, dstNumRows = 0;
|
|
bool dstOpndNumRows = false;
|
|
|
|
G4_DstRegRegion *dst = inst->getDst();
|
|
if (dst->getBase()->isRegVar() &&
|
|
(dst->getTopDcl()->getRegFile() == G4_GRF)) {
|
|
G4_Declare *dstDcl = dst->getTopDcl();
|
|
int dstOffset = dst->getLeftBound() / kernel.numEltPerGRF<Type_UB>();
|
|
bool isDstEvenAlign = gra.isEvenAligned(dstDcl);
|
|
|
|
if (dst->getBase()->isRegAllocPartaker()) {
|
|
isDstRegAllocPartaker = true;
|
|
dstId = ((G4_RegVar *)dst->getBase())->getId();
|
|
dstOpndNumRows = dst->getSubRegOff() * dst->getTypeSize() +
|
|
dst->getLinearizedEnd() - dst->getLinearizedStart() +
|
|
1 >
|
|
kernel.numEltPerGRF<Type_UB>();
|
|
} else if (gra.useLocalRA) {
|
|
LocalLiveRange *localLR = NULL;
|
|
G4_Declare *topdcl = GetTopDclFromRegRegion(dst);
|
|
|
|
if (topdcl)
|
|
localLR = gra.getLocalLR(topdcl);
|
|
if (localLR && localLR->getAssigned()) {
|
|
int sreg;
|
|
G4_VarBase *preg = localLR->getPhyReg(sreg);
|
|
|
|
vISA_ASSERT(preg->isGreg(), "Register in dst was not GRF");
|
|
|
|
isDstLocallyAssigned = true;
|
|
dstPreg = preg->asGreg()->getRegNum();
|
|
dstNumRows = localLR->getTopDcl()->getNumRows();
|
|
dstOpndNumRows = dst->getSubRegOff() * dst->getTypeSize() +
|
|
dst->getLinearizedEnd() - dst->getLinearizedStart() + 1 >
|
|
kernel.numEltPerGRF<Type_UB>();
|
|
isDstEvenAlign = (dstPreg % 2 == 0);
|
|
}
|
|
}
|
|
|
|
if (isDstRegAllocPartaker || isDstLocallyAssigned) {
|
|
for (unsigned j = 0, numSrc = inst->getNumSrc(); j < numSrc; j++) {
|
|
if (inst->isDpas() && j != 1)
|
|
continue;
|
|
G4_Operand *src = inst->getSrc(j);
|
|
if (src != NULL && src->isSrcRegRegion() &&
|
|
src->asSrcRegRegion()->getBase()->isRegVar()) {
|
|
G4_SrcRegRegion *srcRgn = src->asSrcRegRegion();
|
|
G4_Declare *srcDcl = src->getTopDcl();
|
|
if (srcRgn->getRegAccess() == Direct &&
|
|
(src->getTopDcl()->getRegFile() == G4_GRF ||
|
|
src->getTopDcl()->getRegFile() == G4_INPUT)) {
|
|
int srcOffset =
|
|
src->getLeftBound() / kernel.numEltPerGRF<Type_UB>();
|
|
bool srcOpndNumRows =
|
|
srcRgn->getSubRegOff() * srcRgn->getTypeSize() +
|
|
srcRgn->getLinearizedEnd() -
|
|
srcRgn->getLinearizedStart() + 1 >
|
|
kernel.numEltPerGRF<Type_UB>();
|
|
|
|
int srcReg = 0;
|
|
bool isSrcEvenAlign = gra.isEvenAligned(srcDcl);
|
|
if (!src->asSrcRegRegion()->getBase()->isRegAllocPartaker() &&
|
|
gra.useLocalRA) {
|
|
int sreg;
|
|
LocalLiveRange *localLR = NULL;
|
|
G4_Declare *topdcl = GetTopDclFromRegRegion(src);
|
|
|
|
if (topdcl)
|
|
localLR = gra.getLocalLR(topdcl);
|
|
if (localLR && localLR->getAssigned()) {
|
|
G4_VarBase *preg = localLR->getPhyReg(sreg);
|
|
|
|
vISA_ASSERT(preg->isGreg(), "Register in src was not GRF");
|
|
srcReg = preg->asGreg()->getRegNum();
|
|
isSrcEvenAlign = (srcReg % 2 == 0);
|
|
}
|
|
}
|
|
|
|
if (srcDcl->getRegFile() == G4_INPUT &&
|
|
srcDcl->getRegVar()->getPhyReg() != NULL &&
|
|
srcDcl->getRegVar()->getPhyReg()->isGreg()) {
|
|
srcReg = srcDcl->getRegVar()->getPhyReg()->asGreg()->getRegNum();
|
|
isSrcEvenAlign = (srcReg % 2 == 0);
|
|
}
|
|
|
|
if (dstOpndNumRows || srcOpndNumRows) {
|
|
if (!(isDstEvenAlign && isSrcEvenAlign &&
|
|
srcOffset % 2 == dstOffset % 2 && dstOpndNumRows &&
|
|
srcOpndNumRows)) {
|
|
if (src->asSrcRegRegion()->getBase()->isRegAllocPartaker()) {
|
|
unsigned srcId =
|
|
src->asSrcRegRegion()->getBase()->asRegVar()->getId();
|
|
#ifdef DEBUG_VERBOSE_ON
|
|
printf("Src%d ", j);
|
|
inst->dump();
|
|
#endif
|
|
if (isDstRegAllocPartaker) {
|
|
if (!varSplitCheckBeforeIntf(dstId, srcId)) {
|
|
checkAndSetIntf(dstId, srcId);
|
|
buildInterferenceWithAllSubDcl(dstId, srcId);
|
|
}
|
|
} else {
|
|
for (int j = dstPreg, sum = dstPreg + dstNumRows; j < sum;
|
|
j++) {
|
|
int k = getGRFDclForHRA(j)->getRegVar()->getId();
|
|
if (!varSplitCheckBeforeIntf(k, srcId)) {
|
|
checkAndSetIntf(k, srcId);
|
|
buildInterferenceWithAllSubDcl(k, srcId);
|
|
}
|
|
}
|
|
}
|
|
} else if (gra.useLocalRA &&
|
|
isDstRegAllocPartaker) {
|
|
LocalLiveRange *localLR = NULL;
|
|
G4_Declare *topdcl = GetTopDclFromRegRegion(src);
|
|
|
|
if (topdcl)
|
|
localLR = gra.getLocalLR(topdcl);
|
|
|
|
if (localLR && localLR->getAssigned()) {
|
|
int reg, sreg, numrows;
|
|
G4_VarBase *preg = localLR->getPhyReg(sreg);
|
|
numrows = localLR->getTopDcl()->getNumRows();
|
|
|
|
vISA_ASSERT(preg->isGreg(), "Register in src was not GRF");
|
|
|
|
reg = preg->asGreg()->getRegNum();
|
|
#ifdef DEBUG_VERBOSE_ON
|
|
printf("Src%d ", j);
|
|
inst->dump();
|
|
#endif
|
|
for (int j = reg, sum = reg + numrows; j < sum; j++) {
|
|
int k = getGRFDclForHRA(j)->getRegVar()->getId();
|
|
if (!varSplitCheckBeforeIntf(dstId, k)) {
|
|
checkAndSetIntf(dstId, k);
|
|
buildInterferenceWithAllSubDcl(dstId, k);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
} else if (srcRgn->getRegAccess() == IndirGRF) {
|
|
// make every var in points-to set live
|
|
const REGVAR_VECTOR &pointsToSet =
|
|
liveAnalysis->getPointsToAnalysis().getAllInPointsToOrIndrUse(
|
|
srcRgn, bb);
|
|
for (auto &pt : pointsToSet) {
|
|
if (pt.var->isRegAllocPartaker()) {
|
|
unsigned srcId = pt.var->getId();
|
|
if (isDstRegAllocPartaker) {
|
|
if (!varSplitCheckBeforeIntf(dstId, srcId)) {
|
|
checkAndSetIntf(dstId, srcId);
|
|
buildInterferenceWithAllSubDcl(dstId, srcId);
|
|
}
|
|
} else {
|
|
for (int j = dstPreg, sum = dstPreg + dstNumRows; j < sum;
|
|
j++) {
|
|
int k = getGRFDclForHRA(j)->getRegVar()->getId();
|
|
if (!varSplitCheckBeforeIntf(k, srcId)) {
|
|
checkAndSetIntf(k, srcId);
|
|
buildInterferenceWithAllSubDcl(k, srcId);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
uint32_t GlobalRA::getRefCount(int loopNestLevel) {
|
|
if (loopNestLevel == 0) {
|
|
return 1;
|
|
}
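  // Illustrative weighting (not from any particular kernel): with
  // IN_LOOP_REFERENCE_COUNT_FACTOR == 4, a reference at loop nest level 2 is
  // weighted 4^2 = 16, and nesting deeper than level 8 saturates at 4^8.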
return (uint32_t)std::pow(IN_LOOP_REFERENCE_COUNT_FACTOR,
|
|
std::min(loopNestLevel, 8));
|
|
}
|
|
|
|
// handle return value interference for fcall
|
|
void Interference::buildInterferenceForFcall(
|
|
G4_BB *bb, SparseBitVector &live, G4_INST *inst,
|
|
std::list<G4_INST *>::reverse_iterator i, const G4_VarBase *regVar) {
|
|
vISA_ASSERT(inst->opcode() == G4_pseudo_fcall, "expect fcall inst");
|
|
if (regVar->isRegAllocPartaker()) {
|
|
unsigned id = static_cast<const G4_RegVar *>(regVar)->getId();
|
|
|
|
buildInterferenceWithLive(live, id);
|
|
updateLiveness(live, id, false);
|
|
}
|
|
}
|
|
|
|
bool GlobalRA::canIncreaseGRF(unsigned spillSize, bool infCostSpilled) {
|
|
// If we estimate insufficient # GRFs early on, we may end up
|
|
  // spilling an infinite spill cost variable. As a last-ditch effort,
  // we bump up # GRFs and retry compilation. If we estimate the GRF
|
|
// config well, then we should never see infCostSpilled == true.
|
|
|
|
// Conditions to increase #GRFs assuming first RA iteration did not succeed:
|
|
// - Variable with inf spill cost, or
|
|
// - #GRFs selected and next larger one has same number of threads, or
|
|
// - Spill size is above threshold
|
|
if ((infCostSpilled || kernel.grfMode.hasLargerGRFSameThreads() ||
|
|
spillSize >= kernel.grfMode.getSpillThreshold()) &&
|
|
!didGRFIncrease) {
|
|
if (kernel.updateKernelToLargerGRF()) {
|
|
// GRF successfully increased
|
|
RA_TRACE(std::cout << "\t--new GRF size " << kernel.getNumRegTotal()
|
|
<< ". Re-run RA\n ");
|
|
didGRFIncrease = true;
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
void Interference::buildInterferenceForDst(
|
|
G4_BB *bb, SparseBitVector &live, G4_INST *inst,
|
|
std::list<G4_INST *>::reverse_iterator i, G4_DstRegRegion *dst) {
|
|
|
|
if (dst->getBase()->isRegAllocPartaker()) {
|
|
unsigned id = ((G4_RegVar *)dst->getBase())->getId();
|
|
//
|
|
// In following code,
|
|
// pseudo_kill V10
|
|
// mov (8) V10, V11
|
|
//
|
|
// V10 and V11 do not interfere and can be assigned
|
|
// same register.
|
|
//
|
|
// Following condition skips marking interference for
|
|
// pseudo_kill nodes.
|
|
//
|
|
if (!inst->isPseudoKill() && !inst->isLifeTimeEnd()) {
|
|
buildInterferenceWithLive(live, id);
|
|
if (lrs[id]->getIsSplittedDcl()) {
|
|
buildInterferenceWithSubDcl(id, (G4_Operand *)dst, live, false, true);
|
|
}
|
|
}
|
|
|
|
//
|
|
// if the write does not cover the whole dst region, we should continue let
|
|
// the liveness propagate upwards
|
|
//
|
|
if (liveAnalysis->writeWholeRegion(bb, inst, dst) || inst->isPseudoKill()) {
|
|
updateLiveness(live, id, false);
|
|
|
|
if (lrs[id]->getIsSplittedDcl()) {
|
|
for (unsigned i = lrs[id]->getDcl()->getSplitVarStartID();
|
|
i < lrs[id]->getDcl()->getSplitVarStartID() +
|
|
gra.getSplitVarNum(lrs[id]->getDcl());
|
|
i++) {
|
|
          live.reset(i); // kill all children; unused children created by
                         // splitting are killed as well
|
|
}
|
|
}
|
|
}
|
|
} else if (dst->isIndirect() && liveAnalysis->livenessClass(G4_GRF)) {
|
|
//
|
|
// add interferences to the list of potential indirect destination accesses.
|
|
//
|
|
const REGVAR_VECTOR &pointsToSet =
|
|
liveAnalysis->getPointsToAnalysis().getAllInPointsToOrIndrUse(dst, bb);
|
|
for (auto &pt : pointsToSet) {
|
|
if (pt.var->isRegAllocPartaker()) {
|
|
buildInterferenceWithLive(live, pt.var->getId());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void Interference::buildInterferenceWithinBB(G4_BB *bb, SparseBitVector &live) {
|
|
DebugInfoState state;
|
|
|
|
for (auto i = bb->rbegin(); i != bb->rend(); i++) {
|
|
G4_INST *inst = (*i);
|
|
|
|
G4_DstRegRegion *dst = inst->getDst();
|
|
if (dst) {
|
|
buildInterferenceForDst(bb, live, inst, i, dst);
|
|
}
|
|
|
|
if (inst->opcode() == G4_pseudo_fcall) {
|
|
if (liveAnalysis->livenessClass(G4_GRF)) {
|
|
auto fcall = kernel.fg.builder->getFcallInfo(bb->back());
|
|
G4_Declare *arg = kernel.fg.builder->getStackCallArg();
|
|
G4_Declare *ret = kernel.fg.builder->getStackCallRet();
|
|
vISA_ASSERT(fcall != std::nullopt, "fcall info not found");
|
|
uint16_t retSize = fcall->getRetSize();
|
|
uint16_t argSize = fcall->getArgSize();
|
|
if (ret && retSize > 0 && ret->getRegVar()) {
|
|
buildInterferenceForFcall(bb, live, inst, i, ret->getRegVar());
|
|
}
|
|
if (arg && argSize > 0 && arg->getRegVar()) {
|
|
auto id = arg->getRegVar()->getId();
|
|
updateLiveness(live, id, true);
|
|
}
|
|
} else if (liveAnalysis->livenessClass(G4_ADDRESS)) {
|
|
// assume callee will use A0
|
|
auto A0Dcl = kernel.fg.fcallToPseudoDclMap[inst->asCFInst()].A0;
|
|
buildInterferenceWithLive(live, A0Dcl->getRegVar()->getId());
|
|
} else if (liveAnalysis->livenessClass(G4_FLAG)) {
|
|
// assume callee will use both F0 and F1
|
|
auto flagDcl = kernel.fg.fcallToPseudoDclMap[inst->asCFInst()].Flag;
|
|
buildInterferenceWithLive(live, flagDcl->getRegVar()->getId());
|
|
}
|
|
}
|
|
|
|
if (inst->isSend() && inst->asSendInst()->isSVMScatterRW() &&
|
|
inst->getExecSize() < g4::SIMD8) {
|
|
setForbiddenGRFNumForSVMScatter(inst);
|
|
}
|
|
|
|
if ((inst->isSend() || inst->isFillIntrinsic()) && !dst->isNullReg() &&
|
|
kernel.fg.builder->WaDisableSendSrcDstOverlap()) {
|
|
markInterferenceForSend(bb, inst, dst);
|
|
} else if (kernel.fg.builder->avoidDstSrcOverlap() && dst &&
|
|
!dst->isNullReg()) {
|
|
markInterferenceToAvoidDstSrcOverlap(bb, inst);
|
|
}
|
|
|
|
if (inst->isSplitSend() && !inst->getSrc(1)->isNullReg()) {
|
|
G4_SrcRegRegion *src0 = inst->getSrc(0)->asSrcRegRegion();
|
|
G4_SrcRegRegion *src1 = inst->getSrc(1)->asSrcRegRegion();
|
|
|
|
if (src0->getBase()->isRegAllocPartaker() &&
|
|
src1->getBase()->isRegAllocPartaker()) {
|
|
// src0 and src1 of split send may not overlap. In normal cases this is
|
|
// handled automatically as we add interference edge when we reach
|
|
// src0/src1's def. If one source is an undefined variable (this can
|
|
// happen for URB write payload) and the other an input, however, we
|
|
// could miss the interference edge between the two. So we add it
|
|
// explicitly here
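      // Illustrative case (variable names are hypothetical):
      //   sends (8) null V_hdr V_undef_payload ...
      // If V_undef_payload has no def in the program, no interference would
      // be added at a def point, so the src0/src1 edge is added here instead.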
int src0Id = src0->getBase()->asRegVar()->getId();
|
|
int src1Id = src1->getBase()->asRegVar()->getId();
|
|
|
|
checkAndSetIntf(src0Id, src1Id);
|
|
buildInterferenceWithAllSubDcl(src0Id, src1Id);
|
|
}
|
|
}
|
|
|
|
    // DPAS: within the same instruction, src1 must not overlap with dst.
    // Src0 and src2 are okay to overlap.
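    // Illustrative shape of the constraint (operand names are hypothetical):
    //   dpas.8x8 (8) V_dst, V_src0, V_src1, V_src2
    // V_dst and V_src1 must be assigned disjoint GRFs, while V_dst may still
    // share GRFs with V_src0 or V_src2.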
if (inst->isDpas() && !inst->getSrc(1)->isNullReg()) {
|
|
G4_SrcRegRegion *src1 = inst->getSrc(1)->asSrcRegRegion();
|
|
if (dst->getBase()->isRegAllocPartaker() &&
|
|
src1->getBase()->isRegAllocPartaker()) {
|
|
int dstId = dst->getBase()->asRegVar()->getId();
|
|
int src1Id = src1->getBase()->asRegVar()->getId();
|
|
checkAndSetIntf(dstId, src1Id);
|
|
buildInterferenceWithAllSubDcl(dstId, src1Id);
|
|
}
|
|
}
|
|
|
|
//
|
|
// process each source operand
|
|
//
|
|
for (unsigned j = 0, numSrc = inst->getNumSrc(); j < numSrc; j++) {
|
|
G4_Operand *src = inst->getSrc(j);
|
|
if (!src)
|
|
continue;
|
|
if (src->isSrcRegRegion()) {
|
|
G4_SrcRegRegion *srcRegion = src->asSrcRegRegion();
|
|
if (srcRegion->getBase()->isRegAllocPartaker()) {
|
|
unsigned id = ((G4_RegVar *)(srcRegion)->getBase())->getId();
|
|
|
|
if (!inst->isLifeTimeEnd()) {
|
|
updateLiveness(live, id, true);
|
|
if (lrs[id]->getIsSplittedDcl()) {
|
|
buildInterferenceWithSubDcl(id, src, live, true, false);
|
|
}
|
|
}
|
|
} else if (srcRegion->isIndirect() &&
|
|
liveAnalysis->livenessClass(G4_GRF)) {
|
|
// make every var in points-to set live
|
|
const REGVAR_VECTOR &pointsToSet =
|
|
liveAnalysis->getPointsToAnalysis().getAllInPointsToOrIndrUse(
|
|
srcRegion, bb);
|
|
for (auto &pt : pointsToSet) {
|
|
if (pt.var->isRegAllocPartaker()) {
|
|
updateLiveness(live, pt.var->getId(), true);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
//
|
|
// Process register-indirect destination uses of ARF.
|
|
//
|
|
if (dst) {
|
|
if (dst->getBase()->isRegAllocPartaker() &&
|
|
dst->getRegAccess() != Direct) {
|
|
live.set(dst->getBase()->asRegVar()->getId());
|
|
}
|
|
}
|
|
|
|
//
|
|
// Process condMod
|
|
//
|
|
G4_CondMod *mod = inst->getCondMod();
|
|
if (mod != NULL) {
|
|
G4_VarBase *flagReg = mod->getBase();
|
|
if (flagReg != NULL) {
|
|
unsigned id = flagReg->asRegVar()->getId();
|
|
if (flagReg->asRegVar()->isRegAllocPartaker()) {
|
|
buildInterferenceWithLive(live, id);
|
|
|
|
if (liveAnalysis->writeWholeRegion(bb, inst, flagReg)) {
|
|
updateLiveness(live, id, false);
|
|
}
|
|
}
|
|
} else {
|
|
vISA_ASSERT((inst->opcode() == G4_sel || inst->opcode() == G4_csel) &&
|
|
inst->getCondMod() != NULL,
|
|
"Invalid CondMod");
|
|
}
|
|
}
|
|
|
|
//
|
|
// Process predicate
|
|
//
|
|
G4_Predicate *predicate = inst->getPredicate();
|
|
if (predicate != NULL) {
|
|
G4_VarBase *flagReg = predicate->getBase();
|
|
unsigned id = flagReg->asRegVar()->getId();
|
|
if (flagReg->asRegVar()->isRegAllocPartaker()) {
|
|
live.set(id);
|
|
}
|
|
}
|
|
|
|
// Update debug info intervals based on live set
|
|
if (builder.getOption(vISA_GenerateDebugInfo)) {
|
|
updateDebugInfo(kernel, inst, *liveAnalysis, lrs, live, &state,
|
|
inst == bb->front());
|
|
}
|
|
}
|
|
}
|
|
|
|
void Interference::applyPartitionBias() {
|
|
// Any variable that interferes with a VCA dcl is live through an fcall.
|
|
// This function makes such variables callee save biased to avoid save/restore
|
|
// code around fcall. Save/restore may still be needed in case this is a
|
|
// stack call function (vs kernel), but a single save/restore sequence can
|
|
// free the callee save register throughout the function.
|
|
for (auto i : liveAnalysis->globalVars) {
|
|
if (kernel.fg.isPseudoVCADcl(lrs[i]->getDcl())) {
|
|
const auto &intfs = sparseIntf[i];
|
|
for (const auto edge : intfs) {
|
|
// no point adding bias to any variable already assigned
|
|
if (lrs[edge]->getPhyReg())
|
|
continue;
|
|
|
|
lrs[edge]->setCalleeSaveBias(true);
|
|
lrs[edge]->setCallerSaveBias(false);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Any setting of LiveRange property that is discovered during interference
|
|
// must be done here. Because with incremental RA, we may not run interference
|
|
// computation for all BBs.
|
|
void Interference::setupLRs(G4_BB *bb) {
|
|
unsigned refCount = GlobalRA::getRefCount(
|
|
kernel.getOption(vISA_ConsiderLoopInfoInRA) ? bb->getNestLevel() : 0);
|
|
bool incSpillCostAddrTaken = kernel.getOption(vISA_IncSpillCostAllAddrTaken);
|
|
|
|
for (auto i = bb->rbegin(); i != bb->rend(); i++) {
|
|
G4_INST *inst = (*i);
|
|
|
|
auto dst = inst->getDst();
|
|
if (dst) {
|
|
if (dst->getBase()->isRegAllocPartaker()) {
|
|
unsigned id = ((G4_RegVar *)dst->getBase())->getId();
|
|
if (!inst->isPseudoKill() && !inst->isLifeTimeEnd()) {
|
|
lrs[id]->setRefCount(lrs[id]->getRefCount() +
|
|
refCount); // update reference count
|
|
}
|
|
lrs[id]->checkForInfiniteSpillCost(bb, i);
|
|
} else if (dst->isIndirect() && liveAnalysis->livenessClass(G4_GRF)) {
|
|
const REGVAR_VECTOR &pointsToSet =
|
|
liveAnalysis->getPointsToAnalysis().getAllInPointsToOrIndrUse(dst,
|
|
bb);
|
|
for (auto &pt : pointsToSet) {
|
|
if (!pt.var->isRegAllocPartaker() || !incSpillCostAddrTaken)
|
|
continue;
|
|
|
|
lrs[pt.var->getId()]->setRefCount(
|
|
lrs[pt.var->getId()]->getRefCount() + refCount);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (inst->opcode() == G4_pseudo_fcall &&
|
|
liveAnalysis->livenessClass(G4_GRF)) {
|
|
auto fcall = kernel.fg.builder->getFcallInfo(bb->back());
|
|
G4_Declare *ret = kernel.fg.builder->getStackCallRet();
|
|
vISA_ASSERT(fcall != std::nullopt, "fcall info not found");
|
|
uint16_t retSize = fcall->getRetSize();
|
|
if (ret && retSize > 0 && ret->getRegVar() &&
|
|
ret->getRegVar()->isRegAllocPartaker()) {
|
|
unsigned id = static_cast<const G4_RegVar *>(ret->getRegVar())->getId();
|
|
lrs[id]->setRefCount(lrs[id]->getRefCount() + refCount);
|
|
}
|
|
}
|
|
|
|
bool isSend =
|
|
inst->isSend() || inst->isFillIntrinsic() || inst->isSpillIntrinsic();
|
|
if (isSend && !dst->isNullReg()) {
|
|
      // r127 must not be used for the return address when there is a src and
      // dst overlap in a send instruction. This applies to split-send as well.
|
|
if (kernel.fg.builder->needsToReserveR127() &&
|
|
liveAnalysis->livenessClass(G4_GRF) &&
|
|
dst->getBase()->isRegAllocPartaker() &&
|
|
!dst->getBase()->asRegVar()->isPhyRegAssigned()) {
|
|
int dstId = dst->getBase()->asRegVar()->getId();
|
|
lrs[dstId]->setForbidden(forbiddenKind::FBD_LASTGRF);
|
|
}
|
|
}
|
|
|
|
|
|
//
|
|
// process each source operand
|
|
//
|
|
for (unsigned j = 0, numSrc = inst->getNumSrc(); j < numSrc; j++) {
|
|
G4_Operand *src = inst->getSrc(j);
|
|
if (!src || !src->isSrcRegRegion())
|
|
continue;
|
|
|
|
G4_SrcRegRegion *srcRegion = src->asSrcRegRegion();
|
|
if (srcRegion->getBase()->isRegAllocPartaker()) {
|
|
unsigned id = ((G4_RegVar *)(srcRegion)->getBase())->getId();
|
|
|
|
lrs[id]->setRefCount(lrs[id]->getRefCount() + refCount);
|
|
if (inst->isEOT() && liveAnalysis->livenessClass(G4_GRF)) {
|
|
// mark the liveRange as the EOT source
|
|
lrs[id]->setEOTSrc();
|
|
if (builder.hasEOTGRFBinding()) {
|
|
lrs[id]->setForbidden(forbiddenKind::FBD_EOT);
|
|
}
|
|
}
|
|
if (inst->isReturn()) {
|
|
lrs[id]->setRetIp();
|
|
}
|
|
} else if (srcRegion->isIndirect() &&
|
|
liveAnalysis->livenessClass(G4_GRF)) {
|
|
// make every var in points-to set live
|
|
const REGVAR_VECTOR &pointsToSet =
|
|
liveAnalysis->getPointsToAnalysis().getAllInPointsToOrIndrUse(
|
|
srcRegion, bb);
|
|
for (auto &pt : pointsToSet) {
|
|
if (!pt.var->isRegAllocPartaker() || !incSpillCostAddrTaken)
|
|
continue;
|
|
|
|
lrs[pt.var->getId()]->setRefCount(
|
|
lrs[pt.var->getId()]->getRefCount() + refCount);
|
|
}
|
|
}
|
|
}
|
|
|
|
//
|
|
// Process condMod
|
|
//
|
|
if (auto mod = inst->getCondMod()) {
|
|
G4_VarBase *flagReg = mod->getBase();
|
|
if (flagReg) {
|
|
unsigned id = flagReg->asRegVar()->getId();
|
|
if (flagReg->asRegVar()->isRegAllocPartaker()) {
|
|
lrs[id]->setRefCount(lrs[id]->getRefCount() + refCount);
|
|
lrs[id]->checkForInfiniteSpillCost(bb, i);
|
|
}
|
|
} else {
|
|
vISA_ASSERT((inst->opcode() == G4_sel || inst->opcode() == G4_csel) &&
|
|
inst->getCondMod() != NULL,
|
|
"Invalid CondMod");
|
|
}
|
|
}
|
|
|
|
//
|
|
// Process predicate
|
|
//
|
|
if (auto predicate = inst->getPredicate()) {
|
|
G4_VarBase *flagReg = predicate->getBase();
|
|
unsigned id = flagReg->asRegVar()->getId();
|
|
if (flagReg->asRegVar()->isRegAllocPartaker()) {
|
|
lrs[id]->setRefCount(lrs[id]->getRefCount() + refCount);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void Interference::computeInterference() {
|
|
startTimer(TimerID::INTERFERENCE);
|
|
|
|
for (auto bb : kernel.fg) {
|
|
// Initialize LR properties like ref count and forbidden here.
|
|
// This method is invoked for all BBs even with incremental RA.
|
|
setupLRs(bb);
|
|
}
|
|
|
|
//
|
|
// create bool vector, live, to track live ranges that are currently live
|
|
//
|
|
SparseBitVector live;
|
|
|
|
buildInterferenceAmongLiveOuts();
|
|
|
|
for (G4_BB *bb : kernel.fg) {
|
|
if (!incRA.intfNeededForBB(bb)) {
|
|
continue;
|
|
}
|
|
//
|
|
// mark all live ranges dead
|
|
//
|
|
live.clear();
|
|
//
|
|
// start with all live ranges that are live at the exit of BB
|
|
//
|
|
buildInterferenceAtBBExit(bb, live);
|
|
//
|
|
// traverse inst in the reverse order
|
|
//
|
|
buildInterferenceWithinBB(bb, live);
|
|
}
|
|
|
|
buildInterferenceAmongLiveIns();
|
|
|
|
//
|
|
// Build interference with physical registers assigned by local RA
|
|
//
|
|
if (gra.useLocalRA) {
|
|
for (auto curBB : kernel.fg) {
|
|
buildInterferenceWithLocalRA(curBB);
|
|
}
|
|
}
|
|
|
|
RA_TRACE({
|
|
RPE rpe(gra, liveAnalysis);
|
|
rpe.run();
|
|
std::cout << "\t--max RP: " << rpe.getMaxRP() << "\n";
|
|
});
|
|
|
|
if ((builder.getOption(vISA_RATrace) ||
|
|
builder.getOption(vISA_DumpPerfStatsVerbose)) &&
|
|
builder.getJitInfo()->statsVerbose.RAIterNum == 1) {
|
|
getNormIntfNum();
|
|
}
|
|
|
|
  // Augment interference graph to accommodate non-default masks
|
|
aug.augmentIntfGraph();
|
|
|
|
generateSparseIntfGraph();
|
|
|
|
countNeighbors();
|
|
|
|
if (IncrementalRA::isEnabled(kernel)) {
|
|
// Incremental interference was computed for current iteration.
|
|
// Now prepare for incremental temps in next iteration.
|
|
gra.incRA.clearCandidates();
|
|
}
|
|
|
|
// apply callee save bias after augmentation as interference graph is
|
|
// up-to-date.
|
|
if (kernel.fg.getHasStackCalls()) {
|
|
applyPartitionBias();
|
|
}
|
|
stopTimer(TimerID::INTERFERENCE);
|
|
}
|
|
|
|
void Interference::getNormIntfNum() {
|
|
unsigned numVars = liveAnalysis->getNumSelectedVar();
|
|
uint32_t numEdges = 0;
|
|
|
|
if (useDenseMatrix()) {
|
|
// Iterate over intf graph matrix
|
|
for (unsigned row = 0; row < numVars; row++) {
|
|
unsigned rowOffset = row * rowSize;
|
|
unsigned colStart = (row + 1) / BITS_DWORD;
|
|
for (unsigned j = colStart; j < rowSize; j++) {
|
|
unsigned intfBlk = getInterferenceBlk(rowOffset + j);
|
|
if (intfBlk == 0) {
|
|
continue;
|
|
}
|
|
for (unsigned k = 0; k < BITS_DWORD; k++) {
|
|
if (!(intfBlk & (1 << k))) {
|
|
continue;
|
|
}
|
|
unsigned v2 = (j * BITS_DWORD) + k;
|
|
if (v2 != row) {
|
|
numEdges++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
for (uint32_t v1 = 0; v1 < maxId; ++v1) {
|
|
auto &intfSet = sparseMatrix[v1];
|
|
numEdges += intfSet.count();
|
|
}
|
|
}
|
|
|
|
builder.getJitInfo()->statsVerbose.normIntfNum = numEdges;
|
|
RA_TRACE(std::cout << "\t--normal edge #: " << numEdges << "\n");
|
|
}
|
|
|
|
#define SPARSE_INTF_VEC_SIZE 64
|
|
|
|
void Interference::generateSparseIntfGraph() {
|
|
// Generate sparse intf graph from the dense one
|
|
unsigned numVars = liveAnalysis->getNumSelectedVar();
|
|
|
|
sparseIntf.resize(numVars);
|
|
|
|
for (unsigned row = 0; row < numVars; row++) {
|
|
sparseIntf[row].reserve(SPARSE_INTF_VEC_SIZE);
|
|
}
|
|
|
|
if (useDenseMatrix()) {
|
|
// Iterate over intf graph matrix
|
|
for (unsigned row = 0; row < numVars; row++) {
|
|
unsigned rowOffset = row * rowSize;
|
|
unsigned colStart = (row + 1) / BITS_DWORD;
|
|
for (unsigned j = colStart; j < rowSize; j++) {
|
|
unsigned intfBlk = getInterferenceBlk(rowOffset + j);
|
|
if (intfBlk != 0) {
|
|
for (unsigned k = 0; k < BITS_DWORD; k++) {
|
|
if (intfBlk & (1 << k)) {
|
|
unsigned v2 = (j * BITS_DWORD) + k;
|
|
if (v2 != row) {
|
|
sparseIntf[v2].emplace_back(row);
|
|
sparseIntf[row].emplace_back(v2);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
for (uint32_t v1 = 0; v1 < maxId; ++v1) {
|
|
auto &intfSet = sparseMatrix[v1];
|
|
for (uint32_t v2 : intfSet) {
|
|
sparseIntf[v1].emplace_back(v2);
|
|
sparseIntf[v2].emplace_back(v1);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void Interference::countNeighbors() {
|
|
if (!builder.getOption(vISA_RATrace) &&
|
|
!builder.getOption(vISA_DumpPerfStatsVerbose))
|
|
return;
|
|
|
|
uint32_t numNeighbor = 0;
|
|
uint32_t maxNeighbor = 0;
|
|
[[maybe_unused]] uint32_t maxIndex = 0;
|
|
uint32_t numEdges = 0;
|
|
for (int i = 0, numVar = (int)sparseIntf.size(); i < numVar; ++i) {
|
|
if (lrs[i]->getPhyReg() == nullptr) {
|
|
auto &intf = sparseIntf[i];
|
|
numNeighbor += (uint32_t)intf.size();
|
|
maxNeighbor = std::max(maxNeighbor, numNeighbor);
|
|
if (maxNeighbor == numNeighbor)
|
|
maxIndex = i;
|
|
}
|
|
numEdges += (uint32_t)sparseIntf[i].size();
|
|
}
|
|
float avgNeighbor = ((float)numNeighbor) / sparseIntf.size();
|
|
if (builder.getJitInfo()->statsVerbose.RAIterNum == 1) {
|
|
builder.getJitInfo()->statsVerbose.avgNeighbors = avgNeighbor;
|
|
builder.getJitInfo()->statsVerbose.maxNeighbors = maxNeighbor;
|
|
builder.getJitInfo()->statsVerbose.augIntfNum =
|
|
(numEdges / 2) - builder.getJitInfo()->statsVerbose.normIntfNum;
|
|
}
|
|
RA_TRACE({
|
|
std::cout << "\t--avg # neighbors: " << std::setprecision(6) << avgNeighbor
|
|
<< "\n";
|
|
std::cout << "\t--max # neighbors: " << maxNeighbor << " ("
|
|
<< lrs[maxIndex]->getDcl()->getName() << ")\n";
|
|
if (builder.getJitInfo()->statsVerbose.RAIterNum == 1) {
|
|
std::cout << "\t--aug edge #: "
|
|
<< builder.getJitInfo()->statsVerbose.augIntfNum << "\n";
|
|
}
|
|
});
|
|
}
|
|
|
|
// This function can be invoked before local RA or after augmentation.
|
|
// This function will update sub-reg data only for non-NoMask vars and
|
|
// leave others unchanged, i.e. their value will remain as set by HW
// conformity or an earlier phase.
|
|
void GlobalRA::updateSubRegAlignment(G4_SubReg_Align subAlign) {
|
|
// Update alignment of all GRF declares to sub-align
|
|
for (auto dcl : kernel.Declares) {
|
|
if (dcl->getRegFile() & G4_GRF && !dcl->getIsPartialDcl()) {
|
|
G4_Declare *topdcl = dcl->getRootDeclare();
|
|
|
|
if (!areAllDefsNoMask(topdcl) &&
|
|
getAugmentationMask(topdcl) != AugmentationMasks::NonDefault) {
|
|
dcl->setSubRegAlign(subAlign);
|
|
setSubRegAlign(dcl, subAlign);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
int GlobalRA::getAlignFromAugBucket(G4_Declare *dcl) {
|
|
if (GlobalRA::useGenericAugAlign(builder.getPlatformGeneration())) {
|
|
// Return 0 if no special alignment is needed
|
|
// Return 2 if even alignment is needed
|
|
// Return 4 if quad alignment is needed
|
|
|
|
    // Even align is needed if, for the given SIMD size and elem type,
    // a complete def uses between 1 and 2 GRFs.
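    // Illustrative case (assuming a 32-byte GRF): a SIMD16 :f bucket spans
    // 16 * 4 = 64 bytes, i.e. exactly 2 GRFs, so it gets even alignment,
    // while a SIMD8 :f bucket fits in a single GRF and needs none.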
auto kernelSimdSizeToUse = kernel.getSimdSizeWithSlicing();
|
|
G4_Declare *topdcl = dcl->getRootDeclare();
|
|
auto topdclAugMask = getAugmentationMask(topdcl);
|
|
|
|
if (!areAllDefsNoMask(topdcl) && !topdcl->getIsPartialDcl() &&
|
|
topdclAugMask != AugmentationMasks::NonDefault) {
|
|
auto elemSizeToUse = topdcl->getElemSize();
|
|
if (elemSizeToUse < 4 && topdclAugMask == AugmentationMasks::Default32Bit)
|
|
// :uw with hstride 2 can also be Default32Bit and hence needs even
|
|
// alignment
|
|
elemSizeToUse = 4;
|
|
else if (elemSizeToUse < 8 &&
|
|
topdclAugMask == AugmentationMasks::Default64Bit)
|
|
elemSizeToUse = 8;
|
|
|
|
auto totalByteSize = elemSizeToUse * kernelSimdSizeToUse;
|
|
auto bucketSpans2GRFs = [&]() {
|
|
return totalByteSize > (unsigned)kernel.numEltPerGRF<Type_UB>() &&
|
|
totalByteSize <= (unsigned)(2 * kernel.numEltPerGRF<Type_UB>());
|
|
};
|
|
|
|
if (!(!builder.canReadR0() && dcl == kernel.fg.builder->getBuiltinR0())) {
|
|
if (use4GRFAlign) {
|
|
if (topdclAugMask == AugmentationMasks::Default16Bit ||
|
|
topdclAugMask == AugmentationMasks::Default32Bit) {
|
|
if (bucketSpans2GRFs())
|
|
return 2;
|
|
} else if (topdclAugMask == AugmentationMasks::Default64Bit) {
|
|
if (bucketSpans2GRFs())
|
|
// :df SIMD16
|
|
return 2;
|
|
|
|
// :df SIMD32
|
|
return 4;
|
|
} else if (topdclAugMask == AugmentationMasks::Undetermined) {
|
|
// Local RA will take this path as augmentation buckets are set
|
|
// to Undetermined. Although this is conservative, hybrid RA
|
|
// will run augmentation and compute buckets to fill in "holes".
|
|
          // For example, mov (32|M0) V10<2>:f should use 4GRF alignment
          // since it is a Default64Bit variable, although elem size is :f.
|
|
return 4;
|
|
}
|
|
} else {
|
|
// Even align if size is between 1-2 GRFs, for >2GRF sizes.
|
|
if (bucketSpans2GRFs())
|
|
return 2;
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
if (dcl->getRegFile() & G4_GRF) {
|
|
G4_Declare *topdcl = dcl->getRootDeclare();
|
|
auto topdclAugMask = getAugmentationMask(topdcl);
|
|
|
|
if (!areAllDefsNoMask(topdcl) && !topdcl->getIsPartialDcl() &&
|
|
topdclAugMask != AugmentationMasks::NonDefault &&
|
|
topdclAugMask != AugmentationMasks::Default64Bit) {
|
|
if ((topdcl->getElemSize() >= 4 ||
|
|
topdclAugMask == AugmentationMasks::Default32Bit) &&
|
|
topdcl->getByteSize() >= kernel.numEltPerGRF<Type_UB>() &&
|
|
!(!builder.canReadR0() &&
|
|
dcl == kernel.fg.builder->getBuiltinR0())) {
|
|
return 2;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
void GlobalRA::augAlign() {
|
|
// Update alignment of all GRF declares based on
|
|
// augmentation bucket and platform.
|
|
for (auto dcl : kernel.Declares) {
|
|
if (dcl->getRegFile() & G4_GRF) {
|
|
unsigned int align = getAlignFromAugBucket(dcl);
|
|
if (align == 4) {
|
|
if (incRA.isEnabled() && !isQuadAligned(dcl)) {
|
|
incRA.evenAlignUpdate(dcl);
|
|
}
|
|
forceQuadAlign(dcl);
|
|
} else if (align == 2) {
|
|
if (incRA.isEnabled() && !isEvenAligned(dcl)) {
|
|
incRA.evenAlignUpdate(dcl);
|
|
}
|
|
setEvenAligned(dcl, true);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void GlobalRA::getBankAlignment(LiveRange *lr, BankAlign &align) {
|
|
G4_Declare *dcl = lr->getDcl();
|
|
if (kernel.getSimdSize() < g4::SIMD16) {
|
|
return;
|
|
}
|
|
|
|
if (dcl->getRegFile() & G4_GRF) {
|
|
G4_Declare *topdcl = dcl->getRootDeclare();
|
|
auto topdclBC = getBankConflict(topdcl);
|
|
|
|
if (topdclBC != BANK_CONFLICT_NONE) {
|
|
if (topdcl->getElemSize() >= 4 && topdcl->getNumRows() > 1 &&
|
|
!(!builder.canReadR0() && dcl == kernel.fg.builder->getBuiltinR0())) {
|
|
if (topdclBC == BANK_CONFLICT_SECOND_HALF_EVEN ||
|
|
topdclBC == BANK_CONFLICT_SECOND_HALF_ODD) {
|
|
align = BankAlign::Odd;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Compute homeFunc for dcl. Following rules are used:
|
|
// 1. A variable that's defined or used in a single function has
|
|
// that function as its home function.
|
|
// 2. A variable that's defined or used across functions (e.g.,
//    args, retval) has its home function set to nullptr.
|
|
// 3. homeFunc is set only on root G4_Declare.
|
|
FuncInfo *Augmentation::computeHomeFunc(G4_Declare *dcl) {
|
|
vISA_ASSERT(!dcl->getAliasDeclare(), "root dcl expected");
|
|
// If there are no subroutines then all dcls have kernel as home function
|
|
if (!hasSubroutines)
|
|
return kernel.fg.kernelInfo;
|
|
|
|
if (hasUniqueFuncHome(dcl))
|
|
return getUniqueFuncHome(dcl);
|
|
|
|
FuncInfo *homeFunction = nullptr;
|
|
// Live-ins to kernel are modeled as being implicitly defined in kernel.
|
|
if (dcl->isInput())
|
|
homeFunction = kernel.fg.kernelInfo;
|
|
auto *defs = refs.getDefs(dcl);
|
|
if (defs) {
|
|
for (auto &def : *defs) {
|
|
auto *bb = std::get<1>(def);
|
|
auto *curDefFunc = bb->getFuncInfo();
|
|
if (!homeFunction) {
|
|
homeFunction = curDefFunc;
|
|
continue;
|
|
} else if (homeFunction != curDefFunc) {
|
|
return nullptr;
|
|
}
|
|
}
|
|
}
|
|
|
|
auto *uses = refs.getUses(dcl);
|
|
if (uses) {
|
|
for (auto &use : *uses) {
|
|
auto *bb = std::get<1>(use);
|
|
auto *curUseFunc = bb->getFuncInfo();
|
|
if (!homeFunction) {
|
|
homeFunction = curUseFunc;
|
|
continue;
|
|
} else if (homeFunction != curUseFunc) {
|
|
return nullptr;
|
|
}
|
|
}
|
|
}
|
|
|
|
return homeFunction;
|
|
}
|
|
|
|
void Augmentation::populateFuncMaps() {
|
|
vISA_ASSERT(kernel.fg.getBBList().back()->size() > 0, "last BB empty");
|
|
instToFunc.resize(kernel.fg.getBBList().back()->back()->getLexicalId() + 1);
|
|
for (auto &func : kernel.fg.sortedFuncTable) {
|
|
for (auto &bb : func->getBBList()) {
|
|
for (auto *inst : *bb) {
|
|
instToFunc[inst->getLexicalId()] = func;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void Augmentation::populateHomeFunc() {
|
|
// Assume last G4_Declare has max declId
|
|
homeFunc.resize(kernel.Declares.back()->getDeclId() + 1);
|
|
for (auto dcl : kernel.Declares) {
|
|
if (dcl->getAliasDeclare())
|
|
dcl = dcl->getRootDeclare();
|
|
auto *func = computeHomeFunc(dcl);
|
|
vISA_ASSERT(!hasUniqueFuncHome(dcl) || getUniqueFuncHome(dcl) == func,
|
|
"different home func set");
|
|
homeFunc[dcl->getDeclId()] = func;
|
|
}
|
|
}
|
|
|
|
Augmentation::Augmentation(Interference &i, const LivenessAnalysis &l,
|
|
GlobalRA &g)
|
|
: kernel(g.kernel), intf(i), gra(g), liveAnalysis(l), lrs(g.incRA.getLRs()),
|
|
fcallRetMap(g.fcallRetMap),
|
|
refs(g.kernel, false, false, true, &g.pointsToAnalysis),
|
|
hasSubroutines(kernel.fg.sortedFuncTable.size() > 0 &&
|
|
g.kernel.getOption(vISA_NewAugmentation)) {
|
|
useGenericAugAlign =
|
|
GlobalRA::useGenericAugAlign(kernel.getPlatformGeneration());
|
|
}
|
|
|
|
// For a scatter read, the channels are not handled like a block read.
// Update the emask according to the vISA definition.
|
|
bool Augmentation::updateDstMaskForGather(G4_INST *inst,
|
|
std::vector<unsigned char> &mask) {
|
|
G4_InstSend *sendInst = reinterpret_cast<G4_InstSend *>(inst);
|
|
G4_SendDesc *msgDesc = sendInst->getMsgDesc();
|
|
|
|
if (msgDesc->isRaw()) {
|
|
return updateDstMaskForGatherRaw(
|
|
inst, mask, reinterpret_cast<const G4_SendDescRaw *>(msgDesc));
|
|
}
|
|
vISA_ASSERT_UNREACHABLE("unexpected descriptor");
|
|
return false;
|
|
}
|
|
|
|
static void updateMaskSIMT(unsigned char curEMBit, unsigned char execSize,
|
|
std::vector<unsigned char> &mask,
|
|
unsigned dataSizeBytes, unsigned vecElems) {
|
|
unsigned blockSize = dataSizeBytes;
|
|
unsigned blockNum = vecElems;
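  // Illustrative layout (hypothetical values): for execSize = 16,
  // dataSizeBytes = 4 and vecElems = 2, element i of vector j occupies bytes
  // (j * 16 + i) * 4 .. +3, each stamped with the EM byte for channel i
  // (or NOMASK_BYTE for NoMask accesses).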
for (unsigned i = 0; i < execSize; i++) {
|
|
for (unsigned j = 0; j < blockNum; j++) {
|
|
for (unsigned k = 0; k < blockSize; k++) {
|
|
mask[(j * execSize + i) * blockSize + k] = curEMBit;
|
|
}
|
|
}
|
|
if (curEMBit != NOMASK_BYTE) {
|
|
curEMBit++;
|
|
vISA_ASSERT(curEMBit <= 32, "Illegal mask channel");
|
|
}
|
|
}
|
|
}
|
|
|
|
bool Augmentation::updateDstMaskForGatherRaw(G4_INST *inst,
|
|
std::vector<unsigned char> &mask,
|
|
const G4_SendDescRaw *msgDesc) {
|
|
unsigned char execSize = inst->getExecSize();
|
|
const G4_DstRegRegion *dst = inst->getDst();
|
|
unsigned char curEMBit = (unsigned char)inst->getMaskOffset();
|
|
unsigned short elemSize = dst->getElemSize();
|
|
|
|
if (inst->isWriteEnableInst() ||
|
|
kernel.fg.builder->hasGatherReadSuppressionWARA()) {
|
|
curEMBit = NOMASK_BYTE;
|
|
}
|
|
|
|
SFID funcID = msgDesc->getFuncId();
|
|
|
|
switch (funcID) {
|
|
case SFID::RTHW:
|
|
// Mark RT send dst to be NonDefault, even when it doesn't have WriteEnable
|
|
if (kernel.getPlatform() >= Xe2) {
|
|
for (auto &elem : mask)
|
|
elem = NOMASK_BYTE;
|
|
return true;
|
|
}
|
|
break;
|
|
|
|
case SFID::DP_DC1:
|
|
switch (msgDesc->getHdcMessageType()) {
|
|
case DC1_A64_SCATTERED_READ: // a64 scattered read: svm_gather
|
|
{
|
|
unsigned blockNum = msgDesc->getElemsPerAddr();
|
|
unsigned blockSize = msgDesc->getElemSize();
|
|
|
|
for (unsigned i = 0; i < execSize; i++) {
|
|
for (unsigned j = 0; j < blockNum; j++) {
|
|
for (unsigned k = 0; k < blockSize; k++) {
|
|
mask[(j * execSize + i) * blockSize + k] = curEMBit;
|
|
}
|
|
}
|
|
if (curEMBit != NOMASK_BYTE) {
|
|
curEMBit++;
|
|
vISA_ASSERT(curEMBit <= 32, "Illegal mask channel");
|
|
}
|
|
}
|
|
return true;
|
|
} break;
|
|
|
|
case DC1_A64_UNTYPED_SURFACE_READ: // SVM gather 4
|
|
case DC1_UNTYPED_SURFACE_READ: // VISA gather 4
|
|
case DC1_TYPED_SURFACE_READ: // Gather 4 typed
|
|
{
|
|
unsigned channelNum = msgDesc->getEnabledChannelNum();
|
|
if (channelNum == 0) {
|
|
return false;
|
|
}
|
|
if (elemSize < 4) {
|
|
elemSize = 4;
|
|
}
|
|
|
|
for (unsigned i = 0; i < channelNum; i++) {
|
|
for (unsigned j = 0; j < execSize; j++) {
|
|
for (unsigned k = 0; k < elemSize; k++) {
|
|
mask[(i * execSize + j) * elemSize + k] = curEMBit;
|
|
}
|
|
if (curEMBit != NOMASK_BYTE) {
|
|
curEMBit++;
|
|
vISA_ASSERT(curEMBit <= 32, "Illegal mask channel");
|
|
}
|
|
}
|
|
if (curEMBit != NOMASK_BYTE) {
|
|
curEMBit = (unsigned char)inst->getMaskOffset();
|
|
}
|
|
}
|
|
return true;
|
|
} break;
|
|
|
|
default:
|
|
return false;
|
|
}
|
|
break;
|
|
case SFID::DP_DC2:
|
|
switch (msgDesc->getHdcMessageType()) {
|
|
case DC2_UNTYPED_SURFACE_READ: // gather 4 scaled
|
|
case DC2_A64_UNTYPED_SURFACE_READ: // SVM gather 4 scaled
|
|
{
|
|
unsigned channelNum = msgDesc->getEnabledChannelNum();
|
|
if (channelNum == 0) {
|
|
return false;
|
|
}
|
|
if (elemSize < 4) {
|
|
elemSize = 4;
|
|
}
|
|
|
|
for (unsigned i = 0; i < channelNum; i++) {
|
|
for (unsigned j = 0; j < execSize; j++) {
|
|
for (unsigned k = 0; k < elemSize; k++) {
|
|
mask[(i * execSize + j) * elemSize + k] = curEMBit;
|
|
}
|
|
if (curEMBit != NOMASK_BYTE) {
|
|
curEMBit++;
|
|
vISA_ASSERT(curEMBit <= 32, "Illegal mask channel");
|
|
}
|
|
}
|
|
if (curEMBit != NOMASK_BYTE) {
|
|
curEMBit = (unsigned char)inst->getMaskOffset();
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
case DC2_BYTE_SCATTERED_READ: // scaled byte scattered read: gather_scaled,
|
|
// handled as block read write
|
|
default:
|
|
return false;
|
|
}
|
|
break;
|
|
case SFID::DP_DC0:
|
|
switch (msgDesc->getHdcMessageType()) {
|
|
case DC_DWORD_SCATTERED_READ: // dword scattered read: gather(dword),
|
|
// handled as block read write
|
|
case DC_BYTE_SCATTERED_READ: // byte scattered read: gather(byte), handled
|
|
// as block read write
|
|
default:
|
|
return false;
|
|
}
|
|
break;
|
|
|
|
case SFID::SAMPLER: {
|
|
unsigned respLength = msgDesc->ResponseLength();
|
|
if (respLength * kernel.numEltPerGRF<Type_UB>() !=
|
|
dst->getTopDcl()->getByteSize() &&
|
|
msgDesc->isFence()) {
|
|
// since send dst size is not exactly equal to ResponseLength encoded in
|
|
// the descriptor, conservatively treat the send as being non-default
|
|
auto sz = dst->getTopDcl()->getByteSize();
|
|
for (unsigned int i = 0; i != sz; ++i)
|
|
mask[i] = NOMASK_BYTE;
|
|
return true;
|
|
}
|
|
elemSize = msgDesc->is16BitReturn() ? 2 : 4;
|
|
unsigned warpNum =
|
|
respLength * kernel.numEltPerGRF<Type_UB>() / (execSize * elemSize);
|
|
if (inst->isWriteEnableInst()) {
|
|
curEMBit = NOMASK_BYTE;
|
|
}
|
|
for (unsigned i = 0; i < warpNum; i++) {
|
|
for (unsigned j = 0; j < execSize; j++) {
|
|
for (unsigned k = 0; k < elemSize; k++) {
|
|
mask[(i * execSize + j) * elemSize + k] = curEMBit;
|
|
}
|
|
if (curEMBit != NOMASK_BYTE) {
|
|
curEMBit++;
|
|
vISA_ASSERT(curEMBit <= 32, "Illegal mask channel");
|
|
}
|
|
}
|
|
if (curEMBit != NOMASK_BYTE) {
|
|
curEMBit = (unsigned char)inst->getMaskOffset();
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
break;
|
|
|
|
case SFID::UGM:
|
|
case SFID::UGML:
|
|
case SFID::SLM: {
|
|
uint32_t desc = msgDesc->getDesc();
|
|
uint32_t op = (desc & 0x3F); // [5:0]
|
|
uint32_t dszEncd = (desc >> 9) & 0x7; // [11:9]
|
|
bool isTranspose = ((desc >> 15) & 0x1) != 0; // [15]
|
|
if (op == LSC_LOAD && !isTranspose) { // transpose not supported yet
|
|
int dataSzReg = 0;
|
|
switch (dszEncd) { // dat size [11:9]
|
|
case 0:
|
|
dataSzReg = 1;
|
|
break; // d8
|
|
case 1:
|
|
dataSzReg = 2;
|
|
break; // d16
|
|
default:
|
|
dataSzReg = 4;
|
|
break; // d32, d8u32, d16u32, d16u32h
|
|
case 3:
|
|
dataSzReg = 8;
|
|
break; // d64
|
|
}
|
|
int vecSz = 0;
|
|
int vecSzEncd = (desc >> 12) & 0x7; // [14:12]
|
|
if (vecSzEncd <= 3) {
|
|
vecSz = vecSzEncd + 1; // V1, V2, V3, V4
|
|
} else {
|
|
vecSz = 4 << (vecSzEncd - 3); // V8, V16, V32, V64
|
|
}
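      // Example decode (hypothetical descriptor): dszEncd == 2 and
      // vecSzEncd == 5 give d32 elements (dataSzReg = 4) with vector
      // length V16 (4 << (5 - 3) = 16).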
updateMaskSIMT(curEMBit, execSize, mask, (unsigned)dataSzReg,
|
|
(unsigned)vecSz);
|
|
return true;
|
|
}
|
|
}
|
|
default:
|
|
return false;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
|
|
// The value stored at each byte in mask determines which EM bits
// enable that byte for writing. When checkCmodOnly is set, dst is
// ignored and the mask is set only for the cmod. For flag declares,
// the mask is at bit granularity rather than byte granularity.
// The function updates the mask field in the declaration of the
// corresponding variable - dst or cmod.
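// For example, an (8|M0) :d write starting at byte 0 stamps bytes 0-3 with
// EM value 0, bytes 4-7 with 1, ..., bytes 28-31 with 7, while a NoMask
// write stamps every covered byte with NOMASK_BYTE (0x80). (Illustrative
// only; actual offsets depend on the dst region.)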
void Augmentation::updateDstMask(G4_INST *inst, bool checkCmodOnly) {
|
|
G4_DstRegRegion *dst = inst->getDst();
|
|
G4_CondMod *cmod = inst->getCondMod();
|
|
|
|
if ((checkCmodOnly == false && dst && dst->getBase() &&
|
|
dst->getBase()->isRegVar()) ||
|
|
(checkCmodOnly == true && cmod != NULL && cmod->getBase() != NULL)) {
|
|
int dclOffset = 0;
|
|
G4_Declare *topdcl = NULL;
|
|
|
|
if (checkCmodOnly == false) {
|
|
topdcl = dst->getBase()->asRegVar()->getDeclare();
|
|
} else {
|
|
topdcl = cmod->asCondMod()->getTopDcl();
|
|
}
|
|
|
|
while (topdcl->getAliasDeclare() != nullptr) {
|
|
dclOffset += topdcl->getAliasOffset();
|
|
topdcl = topdcl->getAliasDeclare();
|
|
}
|
|
|
|
auto &mask = const_cast<std::vector<unsigned char> &>(gra.getMask(topdcl));
|
|
|
|
unsigned size = topdcl->getByteSize();
|
|
if (checkCmodOnly == true || dst->isFlag()) {
|
|
size *= BITS_PER_BYTE;
|
|
}
|
|
|
|
if (mask.size() == 0) {
|
|
mask.resize(size);
|
|
}
|
|
|
|
vISA_ASSERT(mask.size() > 0, "Valid mask not found for dcl %s",
|
|
topdcl->getName());
|
|
|
|
unsigned short hstride, elemSize;
|
|
short row, subReg;
|
|
unsigned startByte;
|
|
|
|
if (checkCmodOnly == false) {
|
|
hstride = dst->getHorzStride();
|
|
|
|
row = dst->getRegOff();
|
|
subReg = dst->getSubRegOff();
|
|
elemSize = dst->getElemSize();
|
|
|
|
if (inst->isSend() && !inst->isEOT()) {
|
|
if (updateDstMaskForGather(inst, mask)) {
|
|
return;
|
|
}
|
|
}
|
|
|
|
if (dst->isFlag()) {
|
|
elemSize = 1;
|
|
}
|
|
|
|
startByte = (row * kernel.getGRFSize()) + (subReg * elemSize);
|
|
|
|
if (dst->isFlag()) {
|
|
startByte = (row * 32) + (subReg * 8);
|
|
}
|
|
} else {
|
|
hstride = 1;
|
|
row = 0;
|
|
elemSize = 1;
|
|
startByte = cmod->asCondMod()->getLeftBound();
|
|
}
|
|
|
|
unsigned rb = 0xffffffff;
|
|
|
|
if (checkCmodOnly == true) {
|
|
rb = cmod->asCondMod()->getRightBound();
|
|
} else {
|
|
rb = dst->getRightBound();
|
|
}
|
|
|
|
unsigned char curEMBit = (unsigned char)inst->getMaskOffset();
|
|
if (inst->isWriteEnableInst()) {
|
|
curEMBit = NOMASK_BYTE;
|
|
}
|
|
|
|
for (unsigned i = dclOffset + startByte; i <= rb;
|
|
i += (hstride * elemSize)) {
|
|
for (int j = 0; j < elemSize; j++) {
|
|
vISA_ASSERT(i + j < size,
|
|
"updateDstMask writing past end of mask array size: %d",
|
|
size);
|
|
mask[i + j] |= curEMBit;
|
|
}
|
|
if (curEMBit != NOMASK_BYTE) {
|
|
curEMBit++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
unsigned Augmentation::getByteSizeFromMask(AugmentationMasks type) {
|
|
if (type == AugmentationMasks::Default16Bit) {
|
|
return 2;
|
|
} else if (type == AugmentationMasks::Default32Bit) {
|
|
return 4;
|
|
} else if (type == AugmentationMasks::Default64Bit) {
|
|
return 8;
|
|
}
|
|
|
|
vISA_ASSERT_UNREACHABLE("Unexpected type of mask");
|
|
|
|
return 0;
|
|
}
|
|
|
|
bool Augmentation::isDefaultMaskDcl(G4_Declare *dcl, unsigned simdSize,
|
|
AugmentationMasks type) {
|
|
  // The default mask is one where dst's hstride is 1 and
  // elem size is 4.
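  // Illustrative default pattern: for a SIMD8 Default32Bit dcl the expected
  // per-byte mask is 0,0,0,0, 1,1,1,1, ..., 7,7,7,7, wrapping every
  // simdSize * byteSize = 32 bytes.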
bool isDefault = false;
|
|
auto &mask = gra.getMask(dcl);
|
|
|
|
unsigned byteSize = getByteSizeFromMask(type);
|
|
|
|
  // Treat SIMD32 as SIMD16 when the program is split into 2 SIMD16.
  // When a SIMD32 program is not split into 2 SIMD16 but some sends
  // are broken into 2 SIMD16, treat those SIMD16 sends as non-default.
|
|
if (simdSize == 32 && kernel.getChannelSlicing()) {
|
|
simdSize = 16;
|
|
}
|
|
if (mask.size() > 0) {
|
|
G4_Declare *topdcl = dcl->getRootDeclare();
|
|
bool isFlagDcl = (topdcl->getRegFile() == G4_FLAG);
|
|
|
|
unsigned size = topdcl->getByteSize();
|
|
unsigned char curEMBit = 0;
|
|
bool found = true;
|
|
unsigned wrapAround = simdSize * byteSize;
|
|
|
|
if (isFlagDcl == true) {
|
|
size *= BITS_PER_BYTE;
|
|
wrapAround = 16;
|
|
}
|
|
|
|
for (unsigned i = 0; i < size; i += 1) {
|
|
if (isFlagDcl == true) {
|
|
curEMBit++;
|
|
} else {
|
|
if (byteSize && i % byteSize == 0) {
|
|
curEMBit++;
|
|
}
|
|
}
|
|
|
|
if (i % wrapAround == 0) {
|
|
// Wrap around based on simd size
|
|
// For SIMD8 wrap around each row,
|
|
// for SIMD16 wrap around every other row
|
|
curEMBit = 0;
|
|
}
|
|
|
|
if (mask[i] != curEMBit &&
|
|
// For flags, we set bytesize = 2 although
|
|
// the kernel is SIMD8. This means higher 8
|
|
// bits of mask will be set to 0 since those
|
|
// bits are never defined. Such masks need
|
|
// not be considered non-default.
|
|
!(isFlagDcl == true && mask[i] == 0)) {
|
|
found = false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (found == true) {
|
|
isDefault = true;
|
|
}
|
|
}
|
|
|
|
return isDefault;
|
|
}
|
|
|
|
bool Augmentation::isDefaultMaskSubDeclare(unsigned char *mask, unsigned lb,
|
|
unsigned rb, G4_Declare *dcl,
|
|
unsigned simdSize) {
|
|
bool isDefault = false;
|
|
|
|
  // Treat SIMD32 as SIMD16 since the instruction is always split into 2 SIMD16.
|
|
if (simdSize == 32) {
|
|
simdSize = 16;
|
|
}
|
|
|
|
if (mask != NULL) {
|
|
unsigned size = dcl->getByteSize();
|
|
unsigned char curEMBit = 0;
|
|
bool found = true;
|
|
unsigned wrapAround = simdSize * 4;
|
|
unsigned leftBound = gra.getSubOffset(dcl);
|
|
unsigned rightBound = leftBound + size - 1;
|
|
|
|
vISA_ASSERT(rightBound <= rb, "Wrong sub declare right bound!");
|
|
|
|
for (unsigned i = lb; i < rightBound + 1; i += 1) {
|
|
if ((i - lb) % 4 == 0) {
|
|
curEMBit++;
|
|
}
|
|
|
|
if ((i - lb) % wrapAround == 0) {
|
|
curEMBit = 0;
|
|
}
|
|
|
|
if (i >= leftBound) {
|
|
if (mask[i] != curEMBit) {
|
|
found = false;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (found == true) {
|
|
isDefault = true;
|
|
}
|
|
}
|
|
|
|
return isDefault;
|
|
}
|
|
|
|
bool Augmentation::verifyMaskIfInit(G4_Declare *dcl, AugmentationMasks mask) {
|
|
// Return true if dcl mask is either undetermined or same as mask
|
|
auto m = gra.getAugmentationMask(dcl);
|
|
if (m == mask || m == AugmentationMasks::Undetermined) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool Augmentation::checkGRFPattern2(G4_Declare *dcl, G4_DstRegRegion *dst,
|
|
unsigned maskOff, unsigned lb, unsigned rb,
|
|
unsigned execSize) {
|
|
auto opndByteSize = dst->getTypeSize();
|
|
unsigned modWith = opndByteSize * kernel.getSimdSize();
|
|
if (lb % modWith - (maskOff * opndByteSize * dst->getHorzStride()) <=
|
|
opndByteSize) {
|
|
if ((lb +
|
|
(execSize * opndByteSize * dst->getHorzStride() -
|
|
dst->getHorzStride()) -
|
|
rb) < opndByteSize) {
|
|
if (opndByteSize == 2 &&
|
|
verifyMaskIfInit(dcl, AugmentationMasks::Default32Bit)) {
|
|
gra.setAugmentationMask(dcl, AugmentationMasks::Default32Bit);
|
|
return true;
|
|
} else if (opndByteSize == 4 &&
|
|
verifyMaskIfInit(dcl, AugmentationMasks::Default64Bit)) {
|
|
gra.setAugmentationMask(dcl, AugmentationMasks::Default64Bit);
|
|
return true;
|
|
} else {
|
|
gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
// Returns true if dcl mask deemed to be non-default, false otherwise.
|
|
bool Augmentation::checkGRFPattern1(G4_Declare *dcl, G4_DstRegRegion *dst,
|
|
unsigned maskOff, unsigned lb, unsigned rb,
|
|
unsigned execSize) {
|
|
auto opndByteSize = dst->getTypeSize();
|
|
unsigned modWith = opndByteSize * kernel.getSimdSize();
|
|
if (dst->getHorzStride() == 1) {
|
|
if ((lb % modWith == (maskOff * opndByteSize) &&
|
|
rb == (lb + (execSize * opndByteSize) - 1))) {
|
|
// This will be taken only when hstride = 1
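      // Example match (hypothetical): in a SIMD16 kernel, a :d dst at M0 with
      // hstride 1 has modWith = 64; a write with lb % 64 == 0 and
      // rb == lb + 16 * 4 - 1 satisfies the check and, when its mask is still
      // undetermined, is classified Default32Bit below.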
if (opndByteSize == 2 &&
|
|
verifyMaskIfInit(dcl, AugmentationMasks::Default16Bit)) {
|
|
gra.setAugmentationMask(dcl, AugmentationMasks::Default16Bit);
|
|
return true;
|
|
} else if (opndByteSize == 4 &&
|
|
verifyMaskIfInit(dcl, AugmentationMasks::Default32Bit)) {
|
|
gra.setAugmentationMask(dcl, AugmentationMasks::Default32Bit);
|
|
return true;
|
|
} else if (opndByteSize == 8 &&
|
|
verifyMaskIfInit(dcl, AugmentationMasks::Default64Bit)) {
|
|
gra.setAugmentationMask(dcl, AugmentationMasks::Default64Bit);
|
|
return true;
|
|
} else {
|
|
gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
void Augmentation::markNonDefaultDstRgn(G4_INST *inst, G4_Operand *opnd) {
|
|
if (inst->isPseudoKill()) {
|
|
return;
|
|
}
|
|
|
|
G4_DstRegRegion *dst = nullptr;
|
|
G4_CondMod *condMod = nullptr;
|
|
if (opnd->isDstRegRegion()) {
|
|
dst = opnd->asDstRegRegion();
|
|
} else if (opnd->isCondMod()) {
|
|
condMod = opnd->asCondMod();
|
|
} else {
|
|
vISA_ASSERT(false, "Don't know how to handle this type of operand");
|
|
}
|
|
|
|
// Handle condMod
|
|
if (condMod && condMod->getBase()) {
|
|
G4_Declare *dcl = condMod->getTopDcl();
|
|
dcl = dcl->getRootDeclare();
|
|
|
|
if (inst->isWriteEnableInst() ||
|
|
opnd->getLeftBound() != inst->getMaskOffset()) {
|
|
gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
|
|
return;
|
|
}
|
|
|
|
if (verifyMaskIfInit(dcl, AugmentationMasks::DefaultPredicateMask)) {
|
|
gra.setAugmentationMask(dcl, AugmentationMasks::DefaultPredicateMask);
|
|
}
|
|
return;
|
|
}
|
|
|
|
// Handle dst
|
|
if (dst && (inst->isCall() || inst->isCallerSave())) {
|
|
const G4_Declare *dcl = dst->getBase()->asRegVar()->getDeclare();
|
|
if (dcl && liveAnalysis.livenessClass(dcl->getRegFile())) {
|
|
gra.setAugmentationMask(dcl->getRootDeclare(),
|
|
AugmentationMasks::NonDefault);
|
|
}
|
|
return;
|
|
}
|
|
|
|
bool isFlagRA = liveAnalysis.livenessClass(G4_FLAG);
|
|
if (dst && dst->getBase() && dst->getBase()->isRegVar()) {
|
|
G4_Declare *dcl = dst->getBase()->asRegVar()->getDeclare();
|
|
if (!liveAnalysis.livenessClass(dcl->getRegFile())) {
|
|
return;
|
|
}
|
|
unsigned offTopDcl = 0;
|
|
while (dcl->getAliasDeclare()) {
|
|
offTopDcl += dcl->getAliasOffset();
|
|
dcl = dcl->getAliasDeclare();
|
|
}
|
|
|
|
// NoMask instructions's dst is always non-default
|
|
if (inst->isWriteEnableInst()) {
|
|
gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
|
|
return;
|
|
}
|
|
|
|
if (gra.getAugmentationMask(dcl) == AugmentationMasks::NonDefault)
|
|
return;
|
|
|
|
unsigned maskOff = inst->getMaskOffset();
|
|
unsigned lb = dst->getLeftBound() + offTopDcl;
|
|
unsigned rb = dst->getRightBound() + offTopDcl;
|
|
unsigned execSize = inst->getExecSize();
|
|
|
|
if (dcl->getAddressed()) {
|
|
gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
|
|
return;
|
|
}
|
|
|
|
if (!isFlagRA) {
|
|
      // Treat send as a special case because updating the mask for
      // scatter has some special checks.
|
|
if (inst->isSend()) {
|
|
if (gra.getAugmentationMask(dcl) == AugmentationMasks::NonDefault) {
|
|
return;
|
|
}
|
|
|
|
updateDstMask(inst, false);
|
|
if (isDefaultMaskDcl(dcl, kernel.getSimdSize(),
|
|
AugmentationMasks::Default16Bit)) {
|
|
gra.setAugmentationMask(dcl, AugmentationMasks::Default16Bit);
|
|
} else if (isDefaultMaskDcl(dcl, kernel.getSimdSize(),
|
|
AugmentationMasks::Default32Bit)) {
|
|
gra.setAugmentationMask(dcl, AugmentationMasks::Default32Bit);
|
|
} else if (isDefaultMaskDcl(dcl, kernel.getSimdSize(),
|
|
AugmentationMasks::Default64Bit)) {
|
|
bool useNonDefault = false;
|
|
|
|
// TODO: Why?
|
|
useNonDefault |=
|
|
(kernel.getSimdSize() >= g4::SIMD16 && dcl->getTotalElems() > 8);
|
|
useNonDefault |=
|
|
(kernel.getSimdSize() == g4::SIMD8 && dcl->getTotalElems() > 4);
|
|
|
|
gra.setAugmentationMask(dcl, useNonDefault
|
|
? AugmentationMasks::NonDefault
|
|
: AugmentationMasks::Default64Bit);
|
|
} else {
|
|
gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
|
|
return;
|
|
}
|
|
} else {
|
|
bool found = false;
|
|
// default one
|
|
found |= checkGRFPattern1(dcl, dst, maskOff, lb, rb, execSize);
|
|
if (!found ||
|
|
gra.getAugmentationMask(dcl) == AugmentationMasks::Undetermined) {
|
|
// hstride = 2 case
|
|
found |= checkGRFPattern2(dcl, dst, maskOff, lb, rb, execSize);
|
|
}
|
|
|
|
if (!found ||
|
|
gra.getAugmentationMask(dcl) == AugmentationMasks::Undetermined) {
|
|
gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
|
|
}
|
|
}
|
|
} else {
|
|
// Handle flag register as destination here
|
|
if (!(lb == maskOff && rb == (lb + execSize - 1))) {
|
|
gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
|
|
return;
|
|
}
|
|
|
|
if (verifyMaskIfInit(dcl, AugmentationMasks::DefaultPredicateMask)) {
|
|
gra.setAugmentationMask(dcl, AugmentationMasks::DefaultPredicateMask);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Returns true if any inst is found using a non-default mask.
// This function also sets up the lexical id of all instructions.
bool Augmentation::markNonDefaultMaskDef() {
|
|
  // Iterate over the dcls list and mark obvious ones as non-default.
  // An obvious non-default is a 1-element, i.e. uniform, dcl.
for (auto dcl : kernel.Declares) {
|
|
auto dclRegFile = dcl->getRegFile();
|
|
if (!liveAnalysis.livenessClass(dclRegFile))
|
|
continue;
|
|
|
|
if (dclRegFile == G4_GRF || dclRegFile == G4_INPUT ||
|
|
dclRegFile == G4_ADDRESS) {
|
|
if (dcl->getTotalElems() < 8 || dclRegFile == G4_INPUT) {
|
|
gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
|
|
}
|
|
} else if (dclRegFile == G4_FLAG) {
|
|
// Flags are processed when processing instructions
|
|
}
|
|
}
|
|
|
|
unsigned id = 0;
|
|
bool isFlagRA = liveAnalysis.livenessClass(G4_FLAG);
|
|
|
|
for (auto bb : kernel.fg) {
|
|
for (auto inst : *bb) {
|
|
inst->setLexicalId(id++);
|
|
|
|
G4_DstRegRegion *dst = inst->getDst();
|
|
|
|
if (dst) {
|
|
markNonDefaultDstRgn(inst, dst);
|
|
}
|
|
|
|
if (isFlagRA && inst->getCondMod()) {
|
|
markNonDefaultDstRgn(inst, inst->getCondMod());
|
|
}
|
|
}
|
|
}
|
|
|
|
// Update whether each dcl is default/not
|
|
AugmentationMasks prevAugMask = AugmentationMasks::Undetermined;
|
|
bool nonDefaultMaskDefFound = false;
|
|
|
|
for (auto dcl : kernel.Declares) {
|
|
if (liveAnalysis.livenessClass(dcl->getRegFile())) {
|
|
if (gra.getAugmentationMask(dcl) == AugmentationMasks::Undetermined) {
|
|
gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
|
|
nonDefaultMaskDefFound = true;
|
|
}
|
|
|
|
if (kernel.getOption(vISA_forceBCR) &&
|
|
gra.getBankConflict(dcl) != BANK_CONFLICT_NONE) {
|
|
gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
|
|
nonDefaultMaskDefFound = true;
|
|
}
|
|
|
|
if (!nonDefaultMaskDefFound &&
|
|
gra.getAugmentationMask(dcl) != prevAugMask &&
|
|
prevAugMask != AugmentationMasks::Undetermined) {
|
|
nonDefaultMaskDefFound = true;
|
|
}
|
|
|
|
prevAugMask = gra.getAugmentationMask(dcl);
|
|
}
|
|
|
|
bool checkLRAAlign = false;
|
|
if (liveAnalysis.livenessClass(G4_GRF)) {
|
|
if (GlobalRA::useGenericAugAlign(kernel.getPlatformGeneration()) &&
|
|
gra.getAlignFromAugBucket(dcl) > 0)
|
|
checkLRAAlign = true;
|
|
else if (gra.getAugmentationMask(dcl) ==
|
|
AugmentationMasks::Default32Bit &&
|
|
kernel.getSimdSize() > kernel.numEltPerGRF<Type_UD>())
|
|
checkLRAAlign = true;
|
|
}
|
|
|
|
if (checkLRAAlign) {
|
|
auto dclLR = gra.getLocalLR(dcl);
|
|
if (dclLR) {
|
|
int s;
|
|
auto phyReg = dclLR->getPhyReg(s);
|
|
unsigned int maxAlign = 2;
|
|
if (gra.use4GRFAlign && gra.getAugmentationMask(dcl) == AugmentationMasks::Default64Bit) {
|
|
maxAlign = 4;
|
|
}
|
|
if (phyReg && phyReg->asGreg()->getRegNum() % maxAlign != 0) {
|
|
// If LRA assignment is not aligned as expected then
|
|
// mark it as non-default. GRA candidates cannot fully
|
|
// overlap with such ranges. Partial overlap is illegal.
|
|
|
|
// TODO: There's a bug here. This branch should execute only if
|
|
// dclLR->getAssigned() == true. If this is false, then
|
|
// dclLR->getPhyReg() is invalid. Once this is fixed, we can
|
|
// re-enable following assert.
|
|
//
|
|
//vISA_ASSERT(!gra.use4GRFAlign,
|
|
// "expecting LRA allocation to be aligned");
|
|
gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
|
|
nonDefaultMaskDefFound = true;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return nonDefaultMaskDefFound;
|
|
}
|
|
|
|
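// For each sub-declare of a split dcl that overlaps opnd's byte range,
// widen the sub-declare's last live-interval so that it covers curInst.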
void Augmentation::updateStartIntervalForSubDcl(G4_Declare *dcl,
|
|
G4_INST *curInst,
|
|
G4_Operand *opnd) {
|
|
for (const G4_Declare *subDcl : gra.getSubDclList(dcl)) {
|
|
unsigned leftBound = gra.getSubOffset(subDcl);
|
|
unsigned rightBound = leftBound + subDcl->getByteSize() - 1;
|
|
if (!(opnd->getRightBound() < leftBound ||
|
|
rightBound < opnd->getLeftBound())) {
|
|
auto subDclStartInterval = gra.getLastStartInterval(subDcl);
|
|
if (subDclStartInterval == NULL ||
|
|
(subDclStartInterval->getLexicalId() > curInst->getLexicalId())) {
|
|
gra.setLastStartInterval(subDcl, curInst);
|
|
}
|
|
|
|
auto subDclEndIntrval = gra.getLastEndInterval(subDcl);
|
|
if (subDclEndIntrval == NULL ||
|
|
(subDclEndIntrval->getLexicalId() < curInst->getLexicalId())) {
|
|
gra.setLastEndInterval(subDcl, curInst);
|
|
}
|
|
}
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
void Augmentation::updateEndIntervalForSubDcl(G4_Declare *dcl, G4_INST *curInst,
|
|
G4_Operand *opnd) {
|
|
for (const G4_Declare *subDcl : gra.getSubDclList(dcl)) {
|
|
unsigned leftBound = gra.getSubOffset(subDcl);
|
|
unsigned rightBound = leftBound + subDcl->getByteSize() - 1;
|
|
if (!(opnd->getRightBound() < leftBound ||
|
|
rightBound < opnd->getLeftBound())) {
|
|
auto subDclEndInterval = gra.getLastEndInterval(subDcl);
|
|
if (subDclEndInterval == NULL ||
|
|
(subDclEndInterval->getLexicalId() < curInst->getLexicalId())) {
|
|
gra.setLastEndInterval(subDcl, curInst);
|
|
}
|
|
|
|
auto subDclStartInterval = gra.getLastStartInterval(subDcl);
|
|
if (subDclStartInterval == NULL ||
|
|
(subDclStartInterval->getLexicalId() > curInst->getLexicalId())) {
|
|
gra.setLastStartInterval(subDcl, curInst);
|
|
}
|
|
}
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
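// Extend dcl's last live-interval so that it starts no later than curInst.
// When subroutines are present, a fresh interval is started instead if
// curInst lies in a different subroutine than the interval's current start,
// so that no interval straddles subroutine boundaries.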
void Augmentation::updateStartInterval(const G4_Declare *dcl,
|
|
G4_INST *curInst) {
|
|
auto dclStartInterval = gra.getLastStartInterval(dcl);
|
|
if (dclStartInterval && hasSubroutines) {
|
|
    // If dcl is being extended in a different subroutine than the one its
    // current interval belongs to, create a new interval for the new
    // subroutine. This ensures an interval never straddles subroutines.
auto *funcCurInst = instToFunc[curInst->getLexicalId()];
|
|
auto *funcStartInterval = instToFunc[dclStartInterval->getLexicalId()];
|
|
if (funcCurInst != funcStartInterval) {
|
|
gra.pushBackNewInterval(dcl);
|
|
dclStartInterval = nullptr;
|
|
}
|
|
}
|
|
if (!dclStartInterval ||
|
|
(dclStartInterval->getLexicalId() > curInst->getLexicalId())) {
|
|
gra.setLastStartInterval(dcl, curInst);
|
|
}
|
|
|
|
auto dclEndInterval = gra.getLastEndInterval(dcl);
|
|
if (!dclEndInterval ||
|
|
(dclEndInterval->getLexicalId() < curInst->getLexicalId())) {
|
|
gra.setLastEndInterval(dcl, curInst);
|
|
}
|
|
}
|
|
|
|
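// Extend dcl's last live-interval so that it ends no earlier than curInst,
// starting a fresh interval when curInst lies in a different subroutine
// than the interval's current end.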
void Augmentation::updateEndInterval(const G4_Declare *dcl, G4_INST *curInst) {
|
|
auto dclEndInterval = gra.getLastEndInterval(dcl);
|
|
if (dclEndInterval && hasSubroutines) {
|
|
auto *funcCurInst = instToFunc[curInst->getLexicalId()];
|
|
auto *funcEndInterval = instToFunc[dclEndInterval->getLexicalId()];
|
|
if (funcCurInst != funcEndInterval) {
|
|
gra.pushBackNewInterval(dcl);
|
|
dclEndInterval = nullptr;
|
|
}
|
|
}
|
|
if (!dclEndInterval ||
|
|
(dclEndInterval->getLexicalId() < curInst->getLexicalId())) {
|
|
gra.setLastEndInterval(dcl, curInst);
|
|
}
|
|
|
|
auto dclStartInterval = gra.getLastStartInterval(dcl);
|
|
if (!dclStartInterval ||
|
|
(dclStartInterval->getLexicalId() > curInst->getLexicalId())) {
|
|
gra.setLastStartInterval(dcl, curInst);
|
|
}
|
|
}
|
|
|
|
void Augmentation::updateStartIntervalForLocal(G4_Declare *dcl,
|
|
G4_INST *curInst,
|
|
G4_Operand *opnd) {
|
|
updateStartInterval(dcl, curInst);
|
|
if (dcl->getIsSplittedDcl()) {
|
|
updateStartIntervalForSubDcl(dcl, curInst, opnd);
|
|
}
|
|
}
|
|
|
|
void Augmentation::updateEndIntervalForLocal(G4_Declare *dcl, G4_INST *curInst,
|
|
G4_Operand *opnd) {
|
|
updateEndInterval(dcl, curInst);
|
|
if (dcl->getIsSplittedDcl()) {
|
|
updateEndIntervalForSubDcl(dcl, curInst, opnd);
|
|
}
|
|
}
|
|
|
|
void GlobalRA::printLiveIntervals() {
|
|
for (const G4_Declare *dcl : kernel.Declares) {
|
|
if (getLastStartInterval(dcl) != nullptr || getLastEndInterval(dcl) != nullptr) {
|
|
DEBUG_VERBOSE(dcl->getName() << " (");
|
|
|
|
if (getLastStartInterval(dcl) != nullptr) {
|
|
DEBUG_VERBOSE(getLastStartInterval(dcl)->getLexicalId());
|
|
} else {
|
|
DEBUG_VERBOSE("*");
|
|
}
|
|
|
|
DEBUG_VERBOSE(", ");
|
|
|
|
if (getLastEndInterval(dcl) != nullptr) {
|
|
DEBUG_VERBOSE(getLastEndInterval(dcl)->getLexicalId());
|
|
} else {
|
|
DEBUG_VERBOSE("*");
|
|
}
|
|
|
|
DEBUG_VERBOSE("] "
|
|
<< "\n");
|
|
}
|
|
}
|
|
}
|
|
|
|
bool Augmentation::isUnknownArgOrRetval(G4_Declare *dcl) const {
|
|
if (!argsRetVal.count(dcl))
|
|
return false;
|
|
return isUnknownArg(dcl) || isUnknownRetVal(dcl);
|
|
}
|
|
|
|
bool Augmentation::isUnknownRetVal(G4_Declare *dcl) const {
|
|
return isRetvalType<RetValType::Unknown>(dcl);
|
|
}
|
|
|
|
bool Augmentation::isRegularRetVal(G4_Declare *dcl) const {
|
|
return isRetvalType<RetValType::Regular>(dcl);
|
|
}
|
|
|
|
bool Augmentation::isUnknownArg(G4_Declare *dcl) const {
|
|
return isArgType<ArgType::Unknown>(dcl);
|
|
}
|
|
|
|
bool Augmentation::isDefBeforeEachCallArg(G4_Declare *dcl) const {
|
|
return isArgType<ArgType::DefBeforeEachCall>(dcl);
|
|
}
|
|
|
|
bool Augmentation::isLiveThroughArg(G4_Declare *dcl) const {
|
|
return isArgType<ArgType::LiveThrough>(dcl);
|
|
}
|
|
|
|
void Augmentation::buildUnknownArgRetval() {
|
|
  // Args and retvals of Unknown type could have inaccurate SIMD
  // liveness. So we treat them conservatively by extending their
  // liveness throughout the functions such variables are referenced
  // in. These variables may be live through subroutines that don't
  // reference them, but that should be captured either by SIMT
  // liveness or by SIMD liveness when we mark the callee's mayDef
  // with overlapping intervals at call sites.
if (!hasSubroutines)
|
|
return;
|
|
|
|
std::unordered_map<FuncInfo *, std::pair<G4_INST *, G4_INST *>> funcStartEnd;
|
|
for (auto *func : kernel.fg.sortedFuncTable) {
|
|
vISA_ASSERT(!func->getInitBB()->empty(), "expecting non-empty init bb");
|
|
vISA_ASSERT(!func->getExitBB()->empty(), "expecting non-empty exit bb");
|
|
auto *start = func->getInitBB()->front();
|
|
auto *end = func->getExitBB()->back();
|
|
funcStartEnd[func] = std::make_pair(start, end);
|
|
}
|
|
|
|
|
|
// We've a dcl of unknown arg/retval type and a list of subroutines
|
|
// the dcl is referenced in, directly or indirectly. We create live-intervals
|
|
// for dcl spanning each subroutine it's referenced in. Since live-intervals
|
|
// cannot straddle subroutines, we create 1 entry per subroutine.
|
|
auto attachIntervals = [&](G4_Declare *dcl,
|
|
std::unordered_set<FuncInfo *> &subroutines) {
|
|
vISA_ASSERT(gra.getNumIntervals(dcl) == 0,
|
|
"found > 0 intervals for unknown");
|
|
for (auto &startEnd : funcStartEnd) {
|
|
auto *func = startEnd.first;
|
|
if (!subroutines.count(func))
|
|
continue;
|
|
gra.pushBackNewInterval(dcl);
|
|
gra.setLastStartInterval(dcl, startEnd.second.first);
|
|
gra.setLastEndInterval(dcl, startEnd.second.second);
|
|
}
|
|
};
|
|
|
|
if (!liveAnalysis.livenessClass(G4_GRF)) {
|
|
for (auto &var : nonGRFRefs) {
|
|
if (var.second.size() < 2)
|
|
continue;
|
|
|
|
// Non-GRF variables that are referenced in multiple subroutines
|
|
// are considered as unknown type.
|
|
auto dcl = var.first;
|
|
gra.clearIntervals(dcl);
|
|
attachIntervals(dcl, var.second);
|
|
}
|
|
} else {
|
|
// Now gather all GRF Unknown arg, retval
|
|
for (const auto &info : argsRetVal) {
|
|
auto dcl = info.first;
|
|
|
|
if (isUnknownArgOrRetval(dcl)) {
|
|
if (!unknownArgRetvalRefs.count(dcl))
|
|
continue;
|
|
auto &allRefs = unknownArgRetvalRefs.at(dcl);
|
|
gra.clearIntervals(dcl);
|
|
attachIntervals(dcl, allRefs);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Verify that no interval straddles function boundaries
|
|
if (gra.verifyAugmentation) {
|
|
[[maybe_unused]] auto getFunc = [&](G4_INST *inst) {
|
|
unsigned int lexId = inst->getLexicalId();
|
|
|
|
int funcId = 0;
|
|
for (auto &func : funcStartEnd) {
|
|
if (lexId >= func.second.first->getLexicalId() &&
|
|
lexId <= func.second.second->getLexicalId())
|
|
return funcId;
|
|
funcId++;
|
|
}
|
|
return funcId;
|
|
};
|
|
for (G4_Declare *dcl : kernel.Declares) {
|
|
auto &allIntervals = gra.getAllIntervals(dcl);
|
|
for (auto &interval : allIntervals) {
|
|
[[maybe_unused]] auto start = interval.start;
|
|
[[maybe_unused]] auto end = interval.end;
|
|
vISA_ASSERT(getFunc(start) == getFunc(end),
|
|
"interval straddles functions");
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
bool Augmentation::hasUniqueFuncHome(G4_Declare *dcl) const {
|
|
  // Home functions are computed before computing args/retvals.
  // By definition, args/retvals have no home function.
if (argsRetVal.count(dcl))
|
|
return false;
|
|
auto *homeFunction = homeFunc[dcl->getDeclId()];
|
|
return homeFunction != nullptr;
|
|
}
|
|
|
|
FuncInfo* Augmentation::getUniqueFuncHome(G4_Declare* dcl) const {
|
|
vISA_ASSERT(hasUniqueFuncHome(dcl), "expecting unique home func");
|
|
return homeFunc[dcl->getDeclId()];
|
|
}
|
|
|
|
void Augmentation::startIntervalForLiveIn(FuncInfo *funcInfo, G4_BB *bb) {
|
|
// Start live-in intervals
|
|
auto liveInBB = liveAnalysis.getLiveAtEntry(bb) & liveAnalysis.globalVars;
|
|
for (auto i : liveInBB) {
|
|
G4_Declare *dcl = lrs[i]->getDcl()->getRootDeclare();
|
|
if (isUnknownArgOrRetval(dcl))
|
|
continue;
|
|
|
|
if (hasUniqueFuncHome(dcl) && getUniqueFuncHome(dcl) != funcInfo)
|
|
continue;
|
|
|
|
vISA_ASSERT(bb->size() > 0, "empty instlist");
|
|
vISA_ASSERT(funcInfo == kernel.fg.kernelInfo ||
|
|
argsPerSub.count(funcInfo) > 0 ||
|
|
!liveAnalysis.livenessClass(G4_GRF),
|
|
"didnt find callee entry");
|
|
updateStartInterval(dcl, bb->front());
|
|
}
|
|
}
|
|
|
|
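// Model a call site: create a pseudo SCALL dcl with a point interval at the
// call instruction, end the intervals of DefBeforeEachCall args at the call,
// and start new intervals for regular retvals at the call.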
void Augmentation::handleCallSite(G4_BB *curBB, unsigned int &funcCnt) {
|
|
const char *name =
|
|
kernel.fg.builder->getNameString(32, "SCALL_%d", funcCnt++);
|
|
G4_Declare *scallDcl =
|
|
kernel.fg.builder->createDeclare(name, G4_GRF, 1, 1, Type_UD);
|
|
gra.addVarToRA(scallDcl);
|
|
|
|
auto *inst = curBB->back();
|
|
vISA_ASSERT(inst->isCall(), "expecting call instruction");
|
|
|
|
updateStartInterval(scallDcl, inst);
|
|
updateEndInterval(scallDcl, inst);
|
|
|
|
std::pair<G4_INST *, G4_BB *> callInfo(inst, curBB);
|
|
callDclMap.emplace(scallDcl, callInfo);
|
|
|
|
if (liveAnalysis.livenessClass(G4_GRF)) {
|
|
auto *retLocDcl = inst->getDst()->getTopDcl();
|
|
// RET__loc dcl starts and ends at call site.
|
|
// If a function has multiple call sites to same
|
|
// callee then there would be as many trivial
|
|
// live-intervals for corresponding RET__loc dcl.
|
|
|
|
// RET__loc dcl in entryBB is identified as LiveThrough
|
|
// rather than DefBeforeEachCall. LiveThrough variable's
|
|
// interference is fully handled by SIMT. So we don't
|
|
// need to create the short interval for RET__loc at
|
|
// call site.
|
|
if (isDefBeforeEachCallArg(retLocDcl))
|
|
gra.pushBackNewInterval(retLocDcl);
|
|
|
|
auto *callee = curBB->getCalleeInfo();
|
|
vISA_ASSERT(argsPerSub.count(callee) > 0, "didnt find entry for sub");
|
|
auto &args = argsPerSub.at(callee);
|
|
// Terminate any arg with type DefBeforeEachCall
|
|
for (auto *arg : args) {
|
|
if (!isDefBeforeEachCallArg(arg))
|
|
continue;
|
|
updateEndInterval(arg, inst);
|
|
}
|
|
|
|
vISA_ASSERT(retValPerSub.count(callee) > 0, "didnt find callee entry");
|
|
// Start regular retval live-interval at call
|
|
const auto &retvals = retValPerSub[callee];
|
|
for (auto *retvalDcl : retvals) {
|
|
if (isRegularRetVal(retvalDcl)) {
|
|
gra.pushBackNewInterval(retvalDcl);
|
|
updateStartInterval(retvalDcl, curBB->back());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
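// Update live-intervals for variables accessed through inst's dst: start
// intervals for variables written directly, and for indirect destinations
// update the intervals of the address variable or the pointed-to variables
// as appropriate.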
void Augmentation::handleDstOpnd(FuncInfo *funcInfo, G4_BB *curBB,
|
|
G4_INST *inst) {
|
|
G4_DstRegRegion *dst = inst->getDst();
|
|
|
|
if (dst && dst->getRegAccess() == Direct && dst->getBase()) {
|
|
// Destination
|
|
G4_Declare *defdcl = GetTopDclFromRegRegion(dst);
|
|
|
|
if (dst->getBase()->isRegAllocPartaker()) {
|
|
if (defdcl) {
|
|
if (!liveAnalysis.livenessClass(G4_GRF))
|
|
nonGRFRefs[defdcl].insert(funcInfo);
|
|
|
|
if (isDefBeforeEachCallArg(defdcl)) {
|
|
vISA_ASSERT(!defdcl->getIsSplittedDcl(),
|
|
"not expecting to see split on arg");
|
|
// Check if previous interval ended at an earlier call.
|
|
// If not continue it, otherwise start new one.
|
|
auto *prevEnd = gra.getLastEndInterval(defdcl);
|
|
if (prevEnd && prevEnd->isCall())
|
|
gra.pushBackNewInterval(defdcl);
|
|
} else if (isUnknownArgOrRetval(defdcl)) {
|
|
unknownArgRetvalRefs[defdcl].insert(funcInfo);
|
|
}
|
|
|
|
if (gra.getLocalLR(defdcl)) {
|
|
updateStartIntervalForLocal(defdcl, inst, dst);
|
|
} else {
|
|
updateStartInterval(defdcl, inst);
|
|
}
|
|
}
|
|
} else if (liveAnalysis.livenessClass(G4_GRF)) {
|
|
LocalLiveRange *defdclLR;
|
|
|
|
// Handle ranges allocated by local RA
|
|
if (defdcl && (defdclLR = gra.getLocalLR(defdcl)) &&
|
|
defdclLR->getAssigned() == true && !defdclLR->isEOT()) {
|
|
vISA_ASSERT(!hasSubroutines || argsRetVal.count(defdcl) > 0,
|
|
"didnt expect arg to be allocated by LRA");
|
|
updateStartInterval(defdcl, inst);
|
|
}
|
|
}
|
|
} else if (liveAnalysis.livenessClass(G4_ADDRESS) && dst &&
|
|
dst->getRegAccess() == IndirGRF && dst->getBase() &&
|
|
dst->getBase()->isRegVar()) {
|
|
// Destination is indirect
|
|
G4_Declare *defdcl = dst->getBaseRegVarRootDeclare();
|
|
nonGRFRefs[defdcl].insert(funcInfo);
|
|
updateEndInterval(defdcl, inst);
|
|
} else if (liveAnalysis.livenessClass(G4_GRF) && dst && dst->isIndirect()) {
|
|
const REGVAR_VECTOR &pointsToSet =
|
|
liveAnalysis.getPointsToAnalysis().getAllInPointsToOrIndrUse(dst,
|
|
curBB);
|
|
for (const auto& pointsToVar : pointsToSet) {
|
|
if (pointsToVar.var->isRegAllocPartaker()) {
|
|
updateStartInterval(pointsToVar.var->getDeclare()->getRootDeclare(),
|
|
inst);
|
|
auto dcl = pointsToVar.var->getDeclare()->getRootDeclare();
|
|
if (isUnknownArgOrRetval(dcl))
|
|
unknownArgRetvalRefs[dcl].insert(funcInfo);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
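// For flag RA, start the live-interval of the condition modifier's dcl at
// inst.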
void Augmentation::handleCondMod(FuncInfo* funcInfo, G4_INST *inst) {
|
|
if (liveAnalysis.livenessClass(G4_FLAG)) {
|
|
G4_CondMod *cmod = inst->getCondMod();
|
|
|
|
if (cmod && cmod->getBase()) {
|
|
// Conditional modifier
|
|
G4_Declare *dcl = cmod->getBaseRegVarRootDeclare();
|
|
nonGRFRefs[dcl].insert(funcInfo);
|
|
updateStartInterval(dcl, inst);
|
|
}
|
|
}
|
|
}
|
|
|
|
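// End (or extend) live-intervals for variables read by a source operand,
// including variables reached through an indirect source.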
void Augmentation::handleSrcOpnd(FuncInfo *funcInfo, G4_BB *curBB,
|
|
G4_Operand *src) {
|
|
G4_SrcRegRegion *srcRegion = src->asSrcRegRegion();
|
|
auto *inst = srcRegion->getInst();
|
|
if (srcRegion->getRegAccess() == Direct && srcRegion->getBase()) {
|
|
G4_Declare *usedcl = GetTopDclFromRegRegion(src);
|
|
|
|
if (isUnknownArg(usedcl) || isUnknownRetVal(usedcl))
|
|
unknownArgRetvalRefs[usedcl].insert(funcInfo);
|
|
|
|
if (srcRegion->getBase()->isRegAllocPartaker()) {
|
|
if (!liveAnalysis.livenessClass(G4_GRF))
|
|
nonGRFRefs[usedcl].insert(funcInfo);
|
|
|
|
if (gra.getLocalLR(usedcl)) {
|
|
updateEndIntervalForLocal(usedcl, inst, src);
|
|
} else {
|
|
updateEndInterval(usedcl, inst);
|
|
}
|
|
} else if (liveAnalysis.livenessClass(G4_GRF)) {
|
|
LocalLiveRange *usedclLR = nullptr;
|
|
if (usedcl && (usedclLR = gra.getLocalLR(usedcl)) &&
|
|
usedclLR->getAssigned() == true && !usedclLR->isEOT()) {
|
|
updateEndInterval(usedcl, inst);
|
|
}
|
|
}
|
|
} else if (liveAnalysis.livenessClass(G4_GRF) && srcRegion->isIndirect()) {
|
|
const REGVAR_VECTOR &pointsToSet =
|
|
liveAnalysis.getPointsToAnalysis().getAllInPointsToOrIndrUse(srcRegion,
|
|
curBB);
|
|
for (const auto& pointsToVar : pointsToSet) {
|
|
if (pointsToVar.var->isRegAllocPartaker()) {
|
|
updateEndInterval(pointsToVar.var->getDeclare()->getRootDeclare(),
|
|
inst);
|
|
auto dcl = pointsToVar.var->getDeclare()->getRootDeclare();
|
|
if (isUnknownArgOrRetval(dcl))
|
|
unknownArgRetvalRefs[dcl].insert(funcInfo);
|
|
}
|
|
}
|
|
} else if (liveAnalysis.livenessClass(G4_ADDRESS) &&
|
|
srcRegion->getRegAccess() == IndirGRF && srcRegion->getBase() &&
|
|
srcRegion->getBase()->isRegVar()) {
|
|
G4_Declare *usedcl = src->getBaseRegVarRootDeclare();
|
|
nonGRFRefs[usedcl].insert(funcInfo);
|
|
updateEndInterval(usedcl, inst);
|
|
}
|
|
}
|
|
|
|
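// For flag RA, end the live-interval of the predicate's dcl at inst.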
void Augmentation::handlePred(FuncInfo* funcInfo, G4_INST *inst) {
|
|
if (liveAnalysis.livenessClass(G4_FLAG)) {
|
|
G4_Predicate *pred = inst->getPredicate();
|
|
|
|
if (pred) {
|
|
// Predicate
|
|
G4_Declare *dcl = pred->getBaseRegVarRootDeclare();
|
|
nonGRFRefs[dcl].insert(funcInfo);
|
|
updateEndInterval(dcl, inst);
|
|
}
|
|
}
|
|
}
|
|
|
|
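// Extend intervals of variables live-out of bb up to bb's last instruction.
// At a BB ending with a call, RET__loc and DefBeforeEachCall args are
// skipped because their call-site intervals are created in handleCallSite().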
void Augmentation::endIntervalForLiveOut(FuncInfo* funcInfo, G4_BB *bb) {
|
|
auto liveOutBB = liveAnalysis.getLiveAtExit(bb) & liveAnalysis.globalVars;
|
|
if (bb->isEndWithCall() && liveAnalysis.livenessClass(G4_GRF)) {
|
|
// reset bit for RET__loc as we handle it specially later to
|
|
// create point intervals at call site.
|
|
auto retLocVarId = bb->back()->getDst()->getTopDcl()->getRegVar()->getId();
|
|
liveOutBB.reset(retLocVarId);
|
|
// Default subroutine argument has to start at definition and
|
|
// end at call site. A caller may have multiple call sites for
|
|
// a callee. We want to create multiple live-intervals, one
|
|
// per call site. Creation of a live-interval per call site
|
|
// is handled in handleCallSite() already, so we skip extending
|
|
// them here.
|
|
auto *callee = bb->getCalleeInfo();
|
|
if (callee && argsPerSub.count(callee)) {
|
|
auto &argsForCallee = argsPerSub.at(callee);
|
|
for (auto *arg : argsForCallee) {
|
|
if (isDefBeforeEachCallArg(arg))
|
|
liveOutBB.reset(arg->getRegVar()->getId());
|
|
}
|
|
}
|
|
}
|
|
|
|
// Extend live-out interval to BB
|
|
for (auto i : liveOutBB) {
|
|
G4_Declare *dcl = lrs[i]->getDcl()->getRootDeclare();
|
|
if (isUnknownArgOrRetval(dcl))
|
|
continue;
|
|
|
|
if (hasUniqueFuncHome(dcl) && getUniqueFuncHome(dcl) != funcInfo)
|
|
continue;
|
|
|
|
vISA_ASSERT(bb->size() > 0, "empty instlist");
|
|
updateEndInterval(dcl, bb->back());
|
|
}
|
|
}
|
|
|
|
// Handle live-range extension for non-reducible CFG
|
|
void Augmentation::handleNonReducibleExtension(FuncInfo *funcInfo) {
|
|
// use SCC instead
|
|
// FIXME: does augmentation work in the presence of subroutine? neither
|
|
// SCCAnalysis nor findNaturalLoops considers the call graph
|
|
SCCAnalysis SCCFinder(kernel.fg);
|
|
SCCFinder.run();
|
|
for (auto iter = SCCFinder.SCC_begin(), iterEnd = SCCFinder.SCC_end();
|
|
iter != iterEnd; ++iter) {
|
|
auto &&anSCC = *iter;
|
|
std::unordered_set<G4_BB *> SCCSucc; // any successor BB of the SCC
|
|
G4_BB *headBB = anSCC.getEarliestBB();
|
|
if (hasSubroutines && headBB->getFuncInfo() != funcInfo)
|
|
continue;
|
|
for (auto BI = anSCC.body_begin(), BIEnd = anSCC.body_end(); BI != BIEnd;
|
|
++BI) {
|
|
G4_BB *bb = *BI;
|
|
for (auto succ : bb->Succs) {
|
|
if (!anSCC.isMember(succ)) {
|
|
SCCSucc.insert(succ);
|
|
}
|
|
}
|
|
}
|
|
for (auto exitBB : SCCSucc) {
|
|
extendVarLiveness(funcInfo, exitBB, headBB->front());
|
|
}
|
|
}
|
|
}
|
|
|
|
std::unordered_set<G4_BB *>
|
|
Augmentation::getAllJIPTargetBBs(FuncInfo *funcInfo) {
|
|
// Any BB that has join as first non-label instruction is a JIP target.
|
|
std::unordered_set<G4_BB *> JIPTargetBBs;
|
|
|
|
for (auto *BB : funcInfo->getBBList()) {
|
|
if (BB->empty())
|
|
continue;
|
|
auto InstIt = BB->begin();
|
|
if ((*InstIt)->isLabel())
|
|
++InstIt;
|
|
if (InstIt != BB->end() && (*InstIt)->opcode() == G4_join)
|
|
JIPTargetBBs.insert(BB);
|
|
}
|
|
|
|
return JIPTargetBBs;
|
|
}
|
|
|
|
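// Collect backward jmpi/goto edges in funcInfo that are not natural-loop
// back-edges.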
std::vector<std::pair<G4_BB *, G4_BB *>>
|
|
Augmentation::getNonLoopBackEdges(FuncInfo *funcInfo) {
|
|
auto &LoopBackEdges = kernel.fg.getAllNaturalLoops();
|
|
std::vector<std::pair<G4_BB *, G4_BB *>> NonLoopBackEdges;
|
|
|
|
for (auto *BB : funcInfo->getBBList()) {
|
|
if (BB->empty())
|
|
continue;
|
|
if (BB->back()->opcode() != G4_jmpi && BB->back()->opcode() != G4_goto)
|
|
continue;
|
|
auto LastInstLexId = BB->back()->getLexicalId();
|
|
for (auto *Succ : BB->Succs) {
|
|
vISA_ASSERT(!Succ->empty(), "expecting non-empty succ BB");
|
|
auto SuccInstLexId = Succ->front()->getLexicalId();
|
|
// Forward edge
|
|
if (SuccInstLexId > LastInstLexId)
|
|
continue;
|
|
// Check if this is a loop edge
|
|
auto Edge = std::pair(BB, Succ);
|
|
if (LoopBackEdges.find(Edge) == LoopBackEdges.end())
|
|
NonLoopBackEdges.push_back(Edge);
|
|
}
|
|
}
|
|
|
|
return NonLoopBackEdges;
|
|
}
|
|
|
|
void Augmentation::handleNonLoopBackEdges(FuncInfo *funcInfo) {
|
|
|
|
// up:
|
|
// (W) P5 =
|
|
// ...
|
|
// goto Later
|
|
// ...
|
|
// <other BBs>
|
|
// join down
|
|
//
|
|
// BB1:
|
|
// P21 = ...
|
|
// (P38) goto down
|
|
//
|
|
// otherBB:
|
|
// ...
|
|
// (W) jmpi (M1, 1) up
|
|
//
|
|
// down:
|
|
// = P21
|
|
//
|
|
// Later:
|
|
//
|
|
  // In the above snippet, the following path may be taken:
  // BB1, otherBB, up, down, Later
  //
  // P21 is defined in BB1. If P5 uses the same register
  // then it can clobber P21 before it gets used in
  // down. So the P5 and P21 intervals must overlap.
  //
  // If we have a non-loop backedge in an interval and there is
  // an incoming JIP edge within that interval, then we should
  // extend the interval up to the backedge destination. In the
  // above snippet, that means extending P21 to "up" so that it
  // overlaps with P5.
|
|
auto AllJIPTargetBBs = getAllJIPTargetBBs(funcInfo);
|
|
|
|
// Return true if there's any JIP incoming edge within interval
|
|
auto hasIncomingJIPEdge = [&AllJIPTargetBBs](const vISA::Interval &Interval) {
|
|
for (auto *JIPTargetBB : AllJIPTargetBBs) {
|
|
vISA::Interval Temp(JIPTargetBB->front(), JIPTargetBB->front());
|
|
if (Interval.intervalsOverlap(Temp))
|
|
return true;
|
|
}
|
|
return false;
|
|
};
|
|
|
|
auto NonLoopBackEdges = getNonLoopBackEdges(funcInfo);
|
|
if (NonLoopBackEdges.empty()) {
|
|
VISA_DEBUG_VERBOSE({ std::cout << "No non-loop backedges found\n"; });
|
|
return;
|
|
}
|
|
|
|
auto getNonLoopBackEdgesInInterval =
|
|
[&NonLoopBackEdges](const vISA::Interval &Interval) {
|
|
std::vector<std::pair<G4_BB *, G4_BB *>> NonLoopBackEdgesInInterval;
|
|
|
|
for (auto &NonLoopBackEdge : NonLoopBackEdges) {
|
|
vISA::Interval Temp(NonLoopBackEdge.first->back(),
|
|
NonLoopBackEdge.first->back());
|
|
if (Interval.intervalsOverlap(Temp))
|
|
NonLoopBackEdgesInInterval.push_back(NonLoopBackEdge);
|
|
}
|
|
|
|
return NonLoopBackEdgesInInterval;
|
|
};
|
|
|
|
for (G4_Declare *Dcl : kernel.Declares) {
|
|
auto &All = gra.getAllIntervals(Dcl);
|
|
    // We shouldn't need to consider special variables like args and retvals
    // because such variables are not defined/used in the same function.
if (All.size() != 1)
|
|
continue;
|
|
auto &Interval = All[0];
|
|
bool Change = false;
|
|
// Handle transitive backwards branches
|
|
// TODO: Handle forward branch from interval that later jump backwards
|
|
// and cause JIP edge to be taken in the middle of the interval.
|
|
do {
|
|
Change = false;
|
|
auto Start = Interval.start;
|
|
if (hasSubroutines && instToFunc[Start->getLexicalId()] != funcInfo)
|
|
continue;
|
|
if (!hasIncomingJIPEdge(Interval))
|
|
continue;
|
|
std::vector<std::pair<G4_BB *, G4_BB *>> NonLoopBackEdges =
|
|
getNonLoopBackEdgesInInterval(Interval);
|
|
|
|
for (auto &NonLoopBackEdge : NonLoopBackEdges) {
|
|
if (NonLoopBackEdge.second) {
|
|
vISA_ASSERT(NonLoopBackEdge.second->size() > 0,
|
|
"expecting backedge target to be non-empty");
|
|
auto StartLexId = Interval.start->getLexicalId();
|
|
if (StartLexId > NonLoopBackEdge.second->front()->getLexicalId()) {
|
|
VISA_DEBUG_VERBOSE({
|
|
std::cout << "Updating start interval for " << Dcl->getName()
|
|
<< " from " << StartLexId << " to "
|
|
<< NonLoopBackEdge.second->front()->getLexicalId()
|
|
<< " - ";
|
|
NonLoopBackEdge.second->front()->dump();
|
|
});
|
|
auto OldInterval = Interval;
|
|
updateStartInterval(Dcl, NonLoopBackEdge.second->front());
|
|
Change = (OldInterval != Interval);
|
|
}
|
|
}
|
|
}
|
|
} while (Change);
|
|
}
|
|
}
|
|
|
|
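// For each natural loop in funcInfo, extend variables that are live at the
// loop exits so they start at the loop head, and extend variables that are
// live around the back-edge so they end at the latch BB, keeping loop-carried
// values alive across iterations.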
void Augmentation::handleLoopExtension(FuncInfo *funcInfo) {
|
|
// process each natural loop
|
|
for (auto &iter : kernel.fg.getAllNaturalLoops()) {
|
|
auto &backEdge = iter.first;
|
|
// Check whether loop is in current function
|
|
if (hasSubroutines &&
|
|
funcInfo != backEdge.first->getFuncInfo())
|
|
continue;
|
|
G4_INST *startInst = (backEdge.second)->front();
|
|
const std::set<G4_BB *> &loopBody = iter.second;
|
|
|
|
for (auto block : loopBody) {
|
|
// FIXME: this may process a BB multiple times
|
|
for (auto succBB : block->Succs) {
|
|
// A subroutine call BB's successor is callee's INIT BB.
|
|
// Loop data structure doesn't include callee BB. So
|
|
// succBB not part of loop may still be INIT BB of callee.
|
|
// Such an INIT BB shouldn't be treated as a loop exit
|
|
// for live-range extension. If we don't check for INIT BB
|
|
// we end up extending RET__loc range to loop header
|
|
// which isn't correct.
|
|
if (loopBody.find(succBB) == loopBody.end() &&
|
|
(succBB->getBBType() & G4_BB_INIT_TYPE) == 0) {
|
|
G4_BB *exitBB = succBB;
|
|
|
|
unsigned latchBBId = (backEdge.first)->getId();
|
|
unsigned exitBBId = succBB->getId();
|
|
if (exitBBId < latchBBId && succBB->Succs.size() == 1) {
|
|
exitBB = succBB->Succs.front();
|
|
}
|
|
VISA_DEBUG_VERBOSE({
|
|
std::cout << "==> Extend live-in for BB" << exitBB->getId() << "\n";
|
|
exitBB->emit(std::cout);
|
|
});
|
|
extendVarLiveness(funcInfo, exitBB, startInst);
|
|
}
|
|
}
|
|
}
|
|
|
|
G4_BB *startBB = backEdge.second;
|
|
G4_BB *endBB = backEdge.first;
|
|
|
|
auto liveInStartBB = liveAnalysis.getLiveAtEntry(startBB);
|
|
auto liveOutEndBB = liveAnalysis.getLiveAtExit(endBB);
|
|
auto globalsLiveInAndLiveOut =
|
|
liveInStartBB & liveOutEndBB & liveAnalysis.globalVars;
|
|
|
|
for (auto i : globalsLiveInAndLiveOut) {
|
|
auto *dcl = lrs[i]->getDcl()->getRootDeclare();
|
|
// If dcl has non-nullptr home function then extend liveness only
|
|
// in same function.
|
|
if (hasUniqueFuncHome(dcl) && getUniqueFuncHome(dcl) != funcInfo)
|
|
continue;
|
|
|
|
updateEndInterval(dcl, endBB->back());
|
|
VISA_DEBUG_VERBOSE({
|
|
unsigned oldEnd = gra.getLastEndInterval(dcl)->getLexicalId();
|
|
if (oldEnd < gra.getLastEndInterval(dcl)->getLexicalId()) {
|
|
std::cout << "Extending " << dcl->getName() << " from old end "
|
|
<< oldEnd << " to "
|
|
<< gra.getEndInterval(dcl)->getLexicalId()
|
|
<< " due to back-edge"
|
|
<< "\n";
|
|
}
|
|
});
|
|
}
|
|
}
|
|
}
|
|
|
|
// Extend all variables that are live at bb entry to the given inst
|
|
void Augmentation::extendVarLiveness(FuncInfo *funcInfo, G4_BB *bb,
|
|
G4_INST *inst) {
|
|
auto liveAtEntryBB =
|
|
liveAnalysis.getLiveAtEntry(bb) & liveAnalysis.globalVars;
|
|
for (auto i : liveAtEntryBB) {
|
|
G4_Declare *dcl = lrs[i]->getDcl()->getRootDeclare();
|
|
// If dcl has non-nullptr home function then extend liveness only
|
|
// in same function.
|
|
if (hasUniqueFuncHome(dcl) && getUniqueFuncHome(dcl) != funcInfo)
|
|
continue;
|
|
|
|
if (!kernel.fg.isPseudoDcl(dcl)) {
|
|
// Extend ith live-interval
|
|
updateStartInterval(dcl, inst);
|
|
VISA_DEBUG_VERBOSE({
|
|
unsigned oldStart = gra.getLastStartInterval(dcl)->getLexicalId();
|
|
if (oldStart > gra.getLastStartInterval(dcl)->getLexicalId()) {
|
|
std::cout << "Extending " << dcl->getName() << " from old start "
|
|
<< oldStart << " to "
|
|
<< gra.getLastStartInterval(dcl)->getLexicalId()
|
|
<< " due to back-edge"
|
|
<< "\n";
|
|
}
|
|
});
|
|
}
|
|
}
|
|
}
|
|
|
|
// Build live-intervals for given subroutine and store them per subroutine.
|
|
// Arg/Retval are specially treated. We construct live-intervals with holes
|
|
// for such special variables to avoid unnecessary overlaps.
|
|
void Augmentation::buildLiveIntervals(FuncInfo* funcInfo) {
|
|
unsigned funcCnt = 0;
|
|
for (G4_BB *curBB : funcInfo->getBBList()) {
|
|
if (!curBB->empty()) {
|
|
startIntervalForLiveIn(funcInfo, curBB);
|
|
endIntervalForLiveOut(funcInfo, curBB);
|
|
}
|
|
|
|
for (G4_INST *inst : *curBB) {
|
|
if (inst->isPseudoKill() == true)
|
|
continue;
|
|
|
|
if (inst->isCall()) {
|
|
handleCallSite(curBB, funcCnt);
|
|
continue;
|
|
}
|
|
|
|
handleDstOpnd(funcInfo, curBB, inst);
|
|
|
|
handleCondMod(funcInfo, inst);
|
|
|
|
for (unsigned i = 0, numSrc = inst->getNumSrc(); i < numSrc; i++) {
|
|
G4_Operand *src = inst->getSrc(i);
|
|
if (!src || !src->isSrcRegRegion()) {
|
|
continue;
|
|
}
|
|
handleSrcOpnd(funcInfo, curBB, src);
|
|
}
|
|
|
|
handlePred(funcInfo, inst);
|
|
}
|
|
}
|
|
|
|
handleNonLoopBackEdges(funcInfo);
|
|
|
|
  // A variable may be defined in each divergent loop iteration and used
  // outside the loop. SIMT liveness can detect the variable as KILL and
  // this makes the variable non-loop carried. However, channel enable
  // behavior may differ across loop iterations. So a channel may be defined
  // in an earlier iteration and that channel could stay disabled until the
  // end of the loop, while getting re-enabled outside the loop. This means
  // we need to preserve the value of the variable in each loop iteration
  // and treat the variable as loop carried. Following is pseudo-code:
//
|
|
// loop_header:
|
|
// (W) V1 =
|
|
// = V1
|
|
// V2:d = {Q1}
|
|
// (P) goto loop_header
|
|
//
|
|
// outside_loop:
|
|
// = V2
|
|
//
|
|
  // In the above case, V2 should be treated as loop carried as it's defined
  // using the Q1 EM and belongs to the Default32Bit bucket. It cannot share
  // storage with V1 because V1 uses (W) and that could destroy the value of
  // V2 computed in an earlier iteration.
|
|
|
|
if (!kernel.fg.isReducible()) {
|
|
handleNonReducibleExtension(funcInfo);
|
|
} else {
|
|
handleLoopExtension(funcInfo);
|
|
}
|
|
|
|
#ifdef DEBUG_VERBOSE_ON
|
|
// Print calculated live-ranges
|
|
gra.printLiveIntervals();
|
|
#endif
|
|
}
|
|
|
|
// FIXME: Used by old augmentation only where no holes are modeled.
|
|
void Augmentation::buildLiveIntervals() {
|
|
// Treat variables live-in to program first
|
|
G4_BB *entryBB = kernel.fg.getEntryBB();
|
|
|
|
// Live-in variables have their start interval start with
|
|
// first instruction of entry BB
|
|
for (auto i : liveAnalysis.globalVars) {
|
|
if (liveAnalysis.isLiveAtEntry(entryBB, i)) {
|
|
const G4_Declare *dcl = lrs[i]->getDcl()->getRootDeclare();
|
|
|
|
updateStartInterval(dcl, entryBB->front());
|
|
}
|
|
}
|
|
|
|
unsigned funcCnt = 0;
|
|
|
|
for (G4_BB *curBB : kernel.fg) {
|
|
for (G4_INST *inst : *curBB) {
|
|
if (inst->isPseudoKill() == true) {
|
|
continue;
|
|
}
|
|
|
|
G4_DstRegRegion *dst = inst->getDst();
|
|
|
|
if (inst->isCall()) {
|
|
const char *name =
|
|
kernel.fg.builder->getNameString(32, "SCALL_%d", funcCnt++);
|
|
G4_Declare *scallDcl =
|
|
kernel.fg.builder->createDeclare(name, G4_GRF, 1, 1, Type_UD);
|
|
gra.addVarToRA(scallDcl);
|
|
|
|
updateStartInterval(scallDcl, inst);
|
|
updateEndInterval(scallDcl, inst);
|
|
|
|
std::pair<G4_INST *, G4_BB *> callInfo(inst, curBB);
|
|
callDclMap.emplace(scallDcl, callInfo);
|
|
|
|
continue;
|
|
}
|
|
|
|
if (dst && dst->getRegAccess() == Direct && dst->getBase()) {
|
|
// Destination
|
|
G4_Declare *defdcl = GetTopDclFromRegRegion(dst);
|
|
|
|
if (dst->getBase()->isRegAllocPartaker()) {
|
|
if (defdcl) {
|
|
if (gra.getLocalLR(defdcl)) {
|
|
updateStartIntervalForLocal(defdcl, inst, dst);
|
|
} else {
|
|
updateStartInterval(defdcl, inst);
|
|
}
|
|
}
|
|
} else if (liveAnalysis.livenessClass(G4_GRF)) {
|
|
LocalLiveRange *defdclLR;
|
|
|
|
// Handle ranges allocated by local RA
|
|
if (defdcl && (defdclLR = gra.getLocalLR(defdcl)) &&
|
|
defdclLR->getAssigned() == true && !defdclLR->isEOT()) {
|
|
updateStartInterval(defdcl, inst);
|
|
}
|
|
}
|
|
} else if (liveAnalysis.livenessClass(G4_ADDRESS) && dst &&
|
|
dst->getRegAccess() == IndirGRF && dst->getBase() &&
|
|
dst->getBase()->isRegVar()) {
|
|
// Destination is indirect
|
|
G4_Declare *defdcl = dst->getBaseRegVarRootDeclare();
|
|
|
|
updateEndInterval(defdcl, inst);
|
|
} else if (liveAnalysis.livenessClass(G4_GRF) && dst &&
|
|
dst->isIndirect()) {
|
|
const REGVAR_VECTOR &pointsToSet =
|
|
liveAnalysis.getPointsToAnalysis().getAllInPointsToOrIndrUse(dst,
|
|
curBB);
|
|
for (const auto &pointsToVar : pointsToSet) {
|
|
if (pointsToVar.var->isRegAllocPartaker()) {
|
|
updateStartInterval(pointsToVar.var->getDeclare()->getRootDeclare(),
|
|
inst);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (liveAnalysis.livenessClass(G4_FLAG)) {
|
|
G4_CondMod *cmod = inst->getCondMod();
|
|
|
|
if (cmod != nullptr && cmod->getBase() != nullptr) {
|
|
// Conditional modifier
|
|
G4_Declare *dcl = cmod->getBaseRegVarRootDeclare();
|
|
|
|
updateStartInterval(dcl, inst);
|
|
}
|
|
}
|
|
|
|
for (unsigned i = 0, numSrc = inst->getNumSrc(); i < numSrc; i++) {
|
|
G4_Operand *src = inst->getSrc(i);
|
|
if (!src || !src->isSrcRegRegion()) {
|
|
continue;
|
|
}
|
|
G4_SrcRegRegion *srcRegion = src->asSrcRegRegion();
|
|
|
|
if (srcRegion->getRegAccess() == Direct && srcRegion->getBase()) {
|
|
G4_Declare *usedcl = GetTopDclFromRegRegion(src);
|
|
|
|
if (srcRegion->getBase()->isRegAllocPartaker()) {
|
|
if (gra.getLocalLR(usedcl)) {
|
|
updateEndIntervalForLocal(usedcl, inst, src);
|
|
} else {
|
|
updateEndInterval(usedcl, inst);
|
|
}
|
|
} else if (liveAnalysis.livenessClass(G4_GRF)) {
|
|
LocalLiveRange *usedclLR = nullptr;
|
|
if (usedcl && (usedclLR = gra.getLocalLR(usedcl)) &&
|
|
usedclLR->getAssigned() == true && !usedclLR->isEOT()) {
|
|
updateEndInterval(usedcl, inst);
|
|
}
|
|
}
|
|
} else if (liveAnalysis.livenessClass(G4_GRF) &&
|
|
srcRegion->isIndirect()) {
|
|
const REGVAR_VECTOR &pointsToSet =
|
|
liveAnalysis.getPointsToAnalysis().getAllInPointsToOrIndrUse(
|
|
srcRegion, curBB);
|
|
for (const auto &pointsToVar : pointsToSet) {
|
|
if (pointsToVar.var->isRegAllocPartaker()) {
|
|
updateEndInterval(pointsToVar.var->getDeclare()->getRootDeclare(),
|
|
inst);
|
|
}
|
|
}
|
|
} else if (liveAnalysis.livenessClass(G4_ADDRESS) &&
|
|
srcRegion->getRegAccess() == IndirGRF &&
|
|
srcRegion->getBase() && srcRegion->getBase()->isRegVar()) {
|
|
G4_Declare *usedcl = src->getBaseRegVarRootDeclare();
|
|
|
|
updateEndInterval(usedcl, inst);
|
|
}
|
|
}
|
|
|
|
if (liveAnalysis.livenessClass(G4_FLAG)) {
|
|
G4_Predicate *pred = inst->getPredicate();
|
|
|
|
if (pred != NULL) {
|
|
// Predicate
|
|
G4_Declare *dcl = pred->getBaseRegVarRootDeclare();
|
|
|
|
updateEndInterval(dcl, inst);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// extend all variables that are live at bb entry to the given inst
|
|
  // TODO: this seems very slow when the number of variables is large; should
  // look for a sparse implementation
auto extendVarLiveness = [this](G4_BB *bb, G4_INST *inst) {
|
|
for (auto i : liveAnalysis.globalVars) {
|
|
if (liveAnalysis.isLiveAtEntry(bb, i) == true &&
|
|
!kernel.fg.isPseudoDcl(lrs[i]->getDcl())) {
|
|
// Extend ith live-interval
|
|
G4_Declare *dcl = lrs[i]->getDcl()->getRootDeclare();
|
|
updateStartInterval(dcl, inst);
|
|
VISA_DEBUG_VERBOSE({
|
|
unsigned oldStart = gra.getStartInterval(dcl)->getLexicalId();
|
|
if (oldStart > gra.getStartInterval(dcl)->getLexicalId()) {
|
|
std::cout << "Extending " << dcl->getName() << " from old start "
|
|
<< oldStart << " to "
|
|
<< gra.getStartInterval(dcl)->getLexicalId()
|
|
<< " due to back-edge"
|
|
<< "\n";
|
|
}
|
|
});
|
|
}
|
|
}
|
|
};
|
|
|
|
if (!kernel.fg.isReducible()) {
|
|
// use SCC instead
|
|
// FIXME: does augmentation work in the presence of subroutine? neither
|
|
// SCCAnalysis nor findNaturalLoops considers the call graph
|
|
SCCAnalysis SCCFinder(kernel.fg);
|
|
SCCFinder.run();
|
|
for (auto iter = SCCFinder.SCC_begin(), iterEnd = SCCFinder.SCC_end();
|
|
iter != iterEnd; ++iter) {
|
|
auto &&anSCC = *iter;
|
|
std::unordered_set<G4_BB *> SCCSucc; // any successor BB of the SCC
|
|
G4_BB *headBB = anSCC.getEarliestBB();
|
|
for (auto BI = anSCC.body_begin(), BIEnd = anSCC.body_end(); BI != BIEnd;
|
|
++BI) {
|
|
G4_BB *bb = *BI;
|
|
for (auto succ : bb->Succs) {
|
|
if (!anSCC.isMember(succ)) {
|
|
SCCSucc.insert(succ);
|
|
}
|
|
}
|
|
}
|
|
for (auto exitBB : SCCSucc) {
|
|
extendVarLiveness(exitBB, headBB->front());
|
|
}
|
|
}
|
|
} else {
|
|
// process each natural loop
|
|
for (auto &&iter : kernel.fg.getAllNaturalLoops()) {
|
|
auto &&backEdge = iter.first;
|
|
G4_INST *startInst = (backEdge.second)->front();
|
|
const std::set<G4_BB *> &loopBody = iter.second;
|
|
|
|
for (auto block : loopBody) {
|
|
// FIXME: this may process a BB multiple times
|
|
for (auto succBB : block->Succs) {
|
|
// A subroutine call BB's successor is callee's INIT BB.
|
|
// Loop data structure doesn't include callee BB. So
|
|
// succBB not part of loop may still be INIT BB of callee.
|
|
// Such an INIT BB shouldn't be treated as a loop exit
|
|
// for live-range extension. If we don't check for INIT BB
|
|
// we end up extending RET__loc range to loop header
|
|
// which isn't correct.
|
|
if (loopBody.find(succBB) == loopBody.end() &&
|
|
(succBB->getBBType() & G4_BB_INIT_TYPE) == 0) {
|
|
G4_BB *exitBB = succBB;
|
|
|
|
unsigned latchBBId = (backEdge.first)->getId();
|
|
unsigned exitBBId = succBB->getId();
|
|
if (exitBBId < latchBBId && succBB->Succs.size() == 1) {
|
|
exitBB = succBB->Succs.front();
|
|
}
|
|
VISA_DEBUG_VERBOSE({
|
|
std::cout << "==> Extend live-in for BB" << exitBB->getId()
|
|
<< "\n";
|
|
exitBB->emit(std::cout);
|
|
});
|
|
extendVarLiveness(exitBB, startInst);
|
|
}
|
|
}
|
|
}
|
|
|
|
G4_BB *startBB = backEdge.second;
|
|
G4_BB *EndBB = backEdge.first;
|
|
|
|
for (auto i : liveAnalysis.globalVars) {
|
|
if (liveAnalysis.isLiveAtEntry(startBB, i) == true &&
|
|
liveAnalysis.isLiveAtExit(EndBB, i) == true) {
|
|
const G4_Declare *dcl = lrs[i]->getDcl()->getRootDeclare();
|
|
unsigned oldEnd = gra.getEndInterval(dcl)->getLexicalId();
|
|
(void)oldEnd;
|
|
updateEndInterval(dcl, EndBB->back());
|
|
VISA_DEBUG_VERBOSE({
|
|
if (oldEnd < gra.getEndInterval(dcl)->getLexicalId()) {
|
|
std::cout << "Extending " << dcl->getName() << " from old end "
|
|
<< oldEnd << " to "
|
|
<< gra.getEndInterval(dcl)->getLexicalId()
|
|
<< " due to back-edge"
|
|
<< "\n";
|
|
}
|
|
});
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
#ifdef DEBUG_VERBOSE_ON
|
|
// Print calculated live-ranges
|
|
gra.printLiveIntervals();
|
|
#endif
|
|
}
|
|
|
|
Augmentation::~Augmentation() {
|
|
// Clear out calculated information so that subsequent RA
|
|
// iterations don't have stale information
|
|
for (DECLARE_LIST_ITER dcl_it = kernel.Declares.begin(),
|
|
end = kernel.Declares.end();
|
|
dcl_it != end; dcl_it++) {
|
|
gra.clearIntervals(*dcl_it);
|
|
gra.setMask(*dcl_it, {});
|
|
gra.setAugmentationMask(*dcl_it, AugmentationMasks::Undetermined);
|
|
}
|
|
}
|
|
|
|
class compareInterval {
|
|
public:
|
|
GlobalRA &gra;
|
|
|
|
compareInterval(GlobalRA &g) : gra(g) {}
|
|
|
|
// Used to store live-intervals in stable sorted order. Sorting is
|
|
// done first on start lexical id, so live-ranges are stored in
|
|
// ascending order of start. For stable order, we use secondary
|
|
// check on dcl id.
|
|
bool operator()(const QueueEntry &s1, const QueueEntry &s2) {
|
|
auto s1Start = gra.getIntervalStart(s1.interval)->getLexicalId();
|
|
auto s2Start = gra.getIntervalStart(s2.interval)->getLexicalId();
|
|
|
|
if (s1Start == s2Start)
|
|
return s1.dcl->getDeclId() < s2.dcl->getDeclId();
|
|
|
|
return s1Start < s2Start;
|
|
}
|
|
};
|
|
|
|
void Augmentation::sortLiveIntervals() {
|
|
// Sort all intervals in kernel based on their starting point in
|
|
// ascending order and return them in sortedIntervals vector
|
|
// This is actually more efficient (at least according to vTune) than the O(N)
|
|
// bucket sort algorithm below, since it avoids most of the malloc/free
|
|
// overhead from the vector.resize()
|
|
for (G4_Declare *dcl : kernel.Declares) {
|
|
auto &all = gra.getAllIntervals(dcl);
|
|
for (auto &interval : all) {
|
|
if (gra.getIntervalEnd(interval))
|
|
sortedIntervals.push_back(QueueEntry(dcl, interval));
|
|
}
|
|
}
|
|
|
|
std::sort(sortedIntervals.begin(), sortedIntervals.end(),
|
|
compareInterval(gra));
|
|
|
|
VISA_DEBUG_VERBOSE({
|
|
std::cout << "Live-intervals in sorted order:\n";
|
|
for (auto &entry : sortedIntervals) {
|
|
      auto *dcl = entry.dcl;
      const auto &interval = entry.interval;
std::cout << dcl->getName() << " - "
|
|
<< "(" << gra.getIntervalStart(interval)->getLexicalId() << ", "
|
|
<< gra.getIntervalEnd(interval)->getLexicalId() << "]"
|
|
<< "\n";
|
|
}
|
|
});
|
|
|
|
if (kernel.getOption(vISA_VerifyAugmentation)) {
|
|
dumpSortedIntervals();
|
|
}
|
|
}
|
|
|
|
unsigned Augmentation::getEnd(const G4_Declare *dcl) const {
|
|
return gra.getLastEndInterval(dcl)->getLexicalId();
|
|
}
|
|
|
|
// Mark interference between dcls. Either one of dcls may have
|
|
// register assigned by local RA so handle those cases too.
|
|
// Re-entrant function.
|
|
void Augmentation::handleSIMDIntf(G4_Declare *firstDcl, G4_Declare *secondDcl,
|
|
bool isCall) {
|
|
auto markIntfWithLRAAssignment = [](const G4_Declare *firstDcl,
|
|
const G4_Declare *lraAssigned,
|
|
Interference &intf) {
|
|
unsigned numRows = lraAssigned->getNumRows();
|
|
const G4_VarBase *preg = lraAssigned->getRegVar()->getPhyReg();
|
|
vISA_ASSERT(preg->isGreg(),
|
|
"Expecting a physical register during building interference "
|
|
"among incompatible masks");
|
|
unsigned start = preg->asGreg()->getRegNum();
|
|
|
|
for (unsigned i = start; i < (start + numRows); i++) {
|
|
auto GRFDcl = intf.getGRFDclForHRA(i);
|
|
intf.checkAndSetIntf(firstDcl->getRegVar()->getId(),
|
|
GRFDcl->getRegVar()->getId());
|
|
VISA_DEBUG_VERBOSE(std::cout << "Marking interference between "
|
|
<< firstDcl->getName() << " and "
|
|
<< GRFDcl->getName() << "\n");
|
|
}
|
|
};
|
|
|
|
auto firstRegVar = firstDcl->getRegVar();
|
|
auto secondRegVar = secondDcl->getRegVar();
|
|
if (firstDcl->getRegFile() == G4_INPUT && firstRegVar->getPhyReg() &&
|
|
secondDcl->getRegFile() == G4_INPUT && secondRegVar->getPhyReg()) {
|
|
return;
|
|
}
|
|
|
|
bool isFirstDcl = true;
|
|
bool isPseudoVCADcl = kernel.fg.isPseudoVCADcl(firstDcl);
|
|
if (!isPseudoVCADcl){
|
|
isPseudoVCADcl = kernel.fg.isPseudoVCADcl(secondDcl);
|
|
isFirstDcl = false;
|
|
}
|
|
|
|
if (isPseudoVCADcl) {
|
|
// Mark intf for following pattern:
|
|
// V33 =
|
|
// ...
|
|
// if
|
|
// = V33
|
|
// fcall
|
|
// ...
|
|
// else
|
|
// = V33
|
|
// endif
|
|
//
|
|
// V33 will interfere with VCA_SAVE pseudo node.
|
|
// It also needs to interfere with retval to
|
|
// ensure V33 and retval don't get same allocation.
|
|
// Note that if V33 is actually live after fcall
|
|
// then graph coloring will do this for us. In this
|
|
// case however we need to rely on augmentation.
|
|
auto retIter =
|
|
isFirstDcl ? fcallRetMap.find(firstDcl) : fcallRetMap.find(secondDcl);
|
|
if (retIter != fcallRetMap.end()) {
|
|
G4_Declare *retVar = retIter->second;
|
|
LocalLiveRange *otherDclLR;
|
|
G4_Declare *otherDcl = isFirstDcl ? secondDcl : firstDcl;
|
|
if (otherDcl->getRegVar()->isRegAllocPartaker())
|
|
intf.checkAndSetIntf(otherDcl->getRegVar()->getId(),
|
|
retVar->getRegVar()->getId());
|
|
else if ((otherDclLR = gra.getLocalLR(otherDcl)) &&
|
|
otherDclLR->getAssigned() && !otherDclLR->isEOT()) {
|
|
markIntfWithLRAAssignment(retVar, otherDcl, intf);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (firstRegVar->isRegAllocPartaker() &&
|
|
secondRegVar->isRegAllocPartaker()) {
|
|
if (!intf.varSplitCheckBeforeIntf(firstRegVar->getId(),
|
|
secondRegVar->getId())) {
|
|
intf.checkAndSetIntf(firstRegVar->getId(),
|
|
secondRegVar->getId());
|
|
if (isCall) {
|
|
intf.buildInterferenceWithAllSubDcl(firstRegVar->getId(),
|
|
secondRegVar->getId());
|
|
}
|
|
VISA_DEBUG_VERBOSE(std::cout << "Marking interference between "
|
|
<< firstDcl->getName() << " and "
|
|
<< secondDcl->getName() << "\n");
|
|
}
|
|
} else if (liveAnalysis.livenessClass(G4_GRF)) {
|
|
LocalLiveRange *secondDclLR = nullptr, *firstDclLR = nullptr;
|
|
|
|
if (firstRegVar->isRegAllocPartaker() &&
|
|
(secondDclLR = gra.getLocalLR(secondDcl)) &&
|
|
secondDclLR->getAssigned() && !secondDclLR->isEOT()) {
|
|
// secondDcl was assigned by local RA and it uses
|
|
markIntfWithLRAAssignment(firstDcl, secondDcl, intf);
|
|
} else if (secondRegVar->isRegAllocPartaker() &&
|
|
(firstDclLR = gra.getLocalLR(firstDcl)) &&
|
|
firstDclLR->getAssigned() && !firstDclLR->isEOT()) {
|
|
// Call self with reversed parameters instead of re-implementing
|
|
// above code
|
|
handleSIMDIntf(secondDcl, firstDcl, isCall);
|
|
}
|
|
}
|
|
}
|
|
|
|
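// Return true if the first 'size' bytes of dcl's emask were all written
// under NoMask.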
bool Augmentation::isNoMask(const G4_Declare *dcl, unsigned size) const {
|
|
auto &mask = gra.getMask(dcl);
|
|
bool result = false;
|
|
|
|
if (mask.size() > 0) {
|
|
result = true;
|
|
|
|
for (unsigned i = 0; i < size; i++) {
|
|
if (mask[i] != NOMASK_BYTE) {
|
|
result = false;
|
|
}
|
|
}
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
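// Return true if, for the first 'size' bytes of dcl's emask, byte i was
// written by EM bit i (i.e. the mask bytes form the sequence 0, 1, 2, ...).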
bool Augmentation::isConsecutiveBits(const G4_Declare *dcl,
|
|
unsigned size) const {
|
|
auto &mask = gra.getMask(dcl);
|
|
bool result = false;
|
|
|
|
if (mask.size() > 0) {
|
|
result = true;
|
|
|
|
for (unsigned i = 0; i < size; i++) {
|
|
if (mask[i] != i) {
|
|
result = false;
|
|
}
|
|
}
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
bool Augmentation::isCompatible(const G4_Declare *testDcl,
|
|
const G4_Declare *biggerDcl) const {
|
|
bool compatible = false;
|
|
|
|
unsigned testSize = testDcl->getRegVar()->isFlag()
|
|
? testDcl->getNumberFlagElements()
|
|
: testDcl->getByteSize();
|
|
unsigned biggerSize = biggerDcl->getRegVar()->isFlag()
|
|
? biggerDcl->getNumberFlagElements()
|
|
: biggerDcl->getByteSize();
|
|
unsigned size = (testSize < biggerSize ? testSize : biggerSize);
|
|
|
|
// Masks are compatible when:
|
|
// i. Both decls have exactly 1 EM bit defining each byte
|
|
// (This means a dcl with Q1 in one inst and Q2 in another
|
|
// instruction writing same subregisters is not a candidate
|
|
// for next step).
|
|
// ii. Bytes at common indices are enabled by same EM bit
|
|
// (This means NoMask dcl is compatible with NoMask dcl and
|
|
// not with any other dcl).
|
|
// UPDATE: (ii) above is now altered such that NoMask dcls
|
|
// that overlap are considered to be incompatible. This is to
|
|
// handle removal of JIP edges (then->else edge).
|
|
|
|
auto &testMask = gra.getMask(testDcl);
|
|
auto &biggerMask = gra.getMask(biggerDcl);
|
|
|
|
if (testMask.size() > 0 && biggerMask.size() > 0) {
|
|
    // Let's pattern match
if (testDcl->getRegFile() == G4_FLAG) {
|
|
if (isConsecutiveBits(testDcl, size) &&
|
|
isConsecutiveBits(biggerDcl, size)) {
|
|
compatible = true;
|
|
}
|
|
} else {
|
|
// Add another pattern to check here
|
|
}
|
|
}
|
|
|
|
return compatible;
|
|
}
|
|
|
|
void Augmentation::expireIntervals(unsigned startIdx) {
|
|
  // Expire entries from both priority queues whose intervals end at or
  // before startIdx.
while (defaultMaskQueue.size() > 0) {
|
|
if (defaultMaskQueue.top().interval.end->getLexicalId() <=
|
|
startIdx) {
|
|
      VISA_DEBUG_VERBOSE(std::cout << "Expiring "
                                   << defaultMaskQueue.top().dcl->getName()
                                   << "\n");
|
|
defaultMaskQueue.pop();
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
while (nonDefaultMaskQueue.size() > 0) {
|
|
if (nonDefaultMaskQueue.top().interval.end->getLexicalId() <=
|
|
startIdx) {
|
|
      VISA_DEBUG_VERBOSE(std::cout << "Expiring "
                                   << nonDefaultMaskQueue.top().dcl->getName()
                                   << "\n");
|
|
nonDefaultMaskQueue.pop();
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Return true if edge between dcl1 and dcl2 is strong.
|
|
bool Interference::isStrongEdgeBetween(const G4_Declare *dcl1,
|
|
const G4_Declare *dcl2) const {
|
|
auto dcl1RegVar = dcl1->getRegVar();
|
|
auto dcl2RegVar = dcl2->getRegVar();
|
|
auto dcl1RAPartaker = dcl1RegVar->isRegAllocPartaker();
|
|
auto dcl2RAPartaker = dcl2RegVar->isRegAllocPartaker();
|
|
|
|
if (dcl1RAPartaker && dcl2RAPartaker) {
|
|
if (interfereBetween(dcl1RegVar->getId(), dcl2RegVar->getId())) {
|
|
return true;
|
|
} else {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
if (dcl1RAPartaker) {
|
|
auto dcl2NumRows = dcl2->getNumRows();
|
|
auto startPhyReg = dcl2RegVar->getPhyReg()->asGreg()->getRegNum();
|
|
auto dcl2LR = gra.getLocalLR(dcl2);
|
|
|
|
if (dcl2LR && dcl2LR->getAssigned()) {
|
|
bool allEdgesStrong = true;
|
|
for (unsigned i = startPhyReg; i < (startPhyReg + dcl2NumRows); i++) {
|
|
const G4_Declare *lraPreg = getGRFDclForHRA(i);
|
|
allEdgesStrong &= interfereBetween(lraPreg->getRegVar()->getId(),
|
|
dcl1RegVar->getId());
|
|
}
|
|
|
|
if (allEdgesStrong)
|
|
return true;
|
|
}
|
|
} else {
|
|
return isStrongEdgeBetween(dcl2, dcl1);
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
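// Decide whether a weak edge, rather than a regular interference edge, is
// needed between an active default-mask dcl and a newly started dcl. With
// generic augmentation alignment this is the case when the new dcl's default
// bucket (32-bit or 64-bit) spans more than 2 GRFs at the current SIMD size.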
bool Augmentation::weakEdgeNeeded(AugmentationMasks defaultDclMask,
|
|
AugmentationMasks newDclMask) {
|
|
if (gra.use4GRFAlign)
|
|
return false;
|
|
if (useGenericAugAlign) {
|
|
// Weak edge needed in case #GRF exceeds 2
|
|
if (newDclMask == AugmentationMasks::Default64Bit)
|
|
return (TypeSize(Type_Q) * kernel.getSimdSizeWithSlicing()) >
|
|
(unsigned)(2 * kernel.numEltPerGRF<Type_UB>());
|
|
|
|
if (newDclMask == AugmentationMasks::Default32Bit) {
|
|
// Even align up to 2 GRFs size variable, use weak edges beyond
|
|
return (TypeSize(Type_D) * kernel.getSimdSizeWithSlicing()) >
|
|
(unsigned)(2 * kernel.numEltPerGRF<Type_UB>());
|
|
}
|
|
} else {
|
|
return (defaultDclMask == AugmentationMasks::Default64Bit &&
|
|
newDclMask == AugmentationMasks::Default64Bit);
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
// This method is invoked when building SIMD intf and current variable
|
|
// is the artificial variable created to model call. Live-intervals in
|
|
// default set and non-default set are ones that overlap with call site
|
|
// at end of callBB. The idea here is to mark every such active interval
|
|
// with mask associated with func. Later, we'll mark interference with
|
|
// each live-interval bit set here and maydef of func.
|
|
void Augmentation::addSIMDIntfDclForCallSite(
|
|
G4_BB *callBB, const std::vector<bool> &globalVars) {
|
|
FuncInfo *func = callBB->getCalleeInfo();
|
|
auto isLiveThroughFunc = [&](unsigned int id) {
|
|
if (liveAnalysis.isLiveAtExit(callBB, id)) {
|
|
auto retBB = func->getExitBB();
|
|
if (liveAnalysis.isLiveAtExit(retBB, id))
|
|
return true;
|
|
}
|
|
return false;
|
|
};
|
|
|
|
auto &overlapDeclares = overlapDclsWithFunc[func];
|
|
for (auto &defaultEntry : defaultMaskQueue) {
|
|
auto defaultDcl = defaultEntry.dcl;
|
|
auto id = defaultDcl->getRegVar()->getId();
|
|
if (!isLiveThroughFunc(id) && globalVars[id])
|
|
overlapDeclares.first.insert(id);
|
|
}
|
|
|
|
for (auto &nonDefaultEntry : nonDefaultMaskQueue) {
|
|
auto nonDefaultDcl = nonDefaultEntry.dcl;
|
|
auto id = nonDefaultDcl->getRegVar()->getId();
|
|
if (!isLiveThroughFunc(id) && globalVars[id])
|
|
overlapDeclares.second.insert(id);
|
|
}
|
|
}
|
|
|
|
void Augmentation::addSIMDIntfForRetDclares(
|
|
G4_Declare *newDcl, const std::vector<bool> &globalVars) {
|
|
auto dclIt = retDeclares.find(newDcl);
|
|
MaskDeclares *mask = nullptr;
|
|
if (dclIt == retDeclares.end()) {
|
|
MaskDeclares newMask;
|
|
retDeclares[newDcl] = std::move(newMask);
|
|
mask = &retDeclares[newDcl];
|
|
} else {
|
|
mask = &dclIt->second;
|
|
}
|
|
|
|
for (auto& defaultSeg : defaultMaskQueue) {
|
|
auto defaultDcl = defaultSeg.dcl;
|
|
auto id = defaultDcl->getRegVar()->getId();
|
|
if (globalVars[id])
|
|
mask->first.insert(id);
|
|
}
|
|
|
|
for (auto& nonDefaultSeg : nonDefaultMaskQueue) {
|
|
auto nonDefaultDcl = nonDefaultSeg.dcl;
|
|
auto id = nonDefaultDcl->getRegVar()->getId();
|
|
if (globalVars[id])
|
|
mask->second.insert(id);
|
|
}
|
|
}
|
|
|
|
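// Classify a retval of func: Regular when all of its defs are inside func
// and every use is in a BB immediately following a call site of func;
// Unknown otherwise (e.g. when the variable is address taken).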
Augmentation::RetValType Augmentation::computeRetValType(FuncInfo *func,
|
|
G4_Declare *retVal) {
|
|
if (retVal->getAddressed())
|
|
return Augmentation::RetValType::Unknown;
|
|
|
|
const auto *defs = refs.getDefs(retVal);
|
|
if (defs) {
|
|
// All defs must be in func only
|
|
for (const auto &def : *defs) {
|
|
auto *bb = std::get<1>(def);
|
|
if (!func->contains(bb))
|
|
return Augmentation::RetValType::Unknown;
|
|
}
|
|
}
|
|
|
|
// All uses must be in BB immediately following call site
|
|
const auto *uses = refs.getUses(retVal);
|
|
if (uses) {
|
|
for (const auto &use : *uses) {
|
|
auto *bb = std::get<1>(use);
|
|
auto *pred = bb->getPhysicalPred();
|
|
if (pred->isSpecialEmptyBB())
|
|
pred = pred->getPhysicalPred();
|
|
if (!pred->isEndWithCall() || pred->getCalleeInfo() != func)
|
|
return Augmentation::RetValType::Unknown;
|
|
}
|
|
}
|
|
|
|
return Augmentation::RetValType::Regular;
|
|
}
|
|
|
|
Augmentation::ArgType Augmentation::computeArgType(FuncInfo *func,
|
|
G4_Declare *arg) {
|
|
if (arg->getAddressed())
|
|
return Augmentation::ArgType::Unknown;
|
|
|
|
// Trivial case where argument is input to kernel and no defs of
|
|
// the variable exist in the program.
|
|
const auto *defs = refs.getDefs(arg);
|
|
if (!defs || defs->size() == 0)
|
|
return Augmentation::ArgType::LiveThrough;
|
|
|
|
// Check if all defs of arg are in kernel entry BB
|
|
bool allDefsInEntryBB = true;
|
|
for (const auto &def : *defs) {
|
|
auto *bb = std::get<1>(def);
|
|
if (kernel.fg.getEntryBB() != bb) {
|
|
allDefsInEntryBB = false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (allDefsInEntryBB)
|
|
return Augmentation::ArgType::LiveThrough;
|
|
|
|
// Check if use of subroutine arg exists in a BB that doesn't belong
|
|
// to the subroutine.
|
|
const auto *uses = refs.getUses(arg);
|
|
if (uses) {
|
|
for (const auto &use : *uses) {
|
|
auto bb = std::get<1>(use);
|
|
if (!func->contains(bb))
|
|
return Augmentation::ArgType::Unknown;
|
|
}
|
|
}
|
|
|
|
// Check if all defs are in same BB as call site
|
|
std::unordered_set<G4_BB *> funcCallSitesMatched;
|
|
for (auto bb : kernel.fg.getBBList()) {
|
|
if (!bb->isEndWithCall() || bb->getCalleeInfo() != func)
|
|
continue;
|
|
funcCallSitesMatched.insert(bb);
|
|
}
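  // DefBeforeEachCall holds only when every def of arg sits in a BB that calls
  // func, at least one such def fully kills arg, and every call site of func
  // has such a killing def; any other pattern conservatively falls back to
  // Unknown.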
|
|
|
|
bool killFound = false;
|
|
for (const auto &def : *defs) {
|
|
auto *bb = std::get<1>(def);
|
|
if (bb->isEndWithCall() && bb->getCalleeInfo() == func) {
|
|
auto *inst = std::get<0>(def);
|
|
if (liveAnalysis.isLiveAtEntry(bb, arg->getRegVar()->getId())) {
|
|
return Augmentation::ArgType::Unknown;
|
|
}
|
|
|
|
if (inst->isPseudoKill() ||
|
|
liveAnalysis.writeWholeRegion(bb, inst, inst->getDst())) {
|
|
funcCallSitesMatched.erase(bb);
|
|
killFound = true;
|
|
}
|
|
continue;
|
|
}
|
|
return Augmentation::ArgType::Unknown;
|
|
}
|
|
if (!killFound || funcCallSitesMatched.size() > 0)
|
|
return Augmentation::ArgType::Unknown;
|
|
|
|
return Augmentation::ArgType::DefBeforeEachCall;
|
|
}
|
|
|
|
void Augmentation::discoverRetVal(FuncInfo *func) {
|
|
if (!liveAnalysis.livenessClass(G4_GRF))
|
|
return;
|
|
|
|
vISA_ASSERT(retValPerSub.count(func) == 0, "already saw sub");
|
|
retValPerSub[func] = {};
|
|
|
|
if (func == kernel.fg.kernelInfo)
|
|
return;
|
|
|
|
SparseBitVector subRetVal = liveAnalysis.retVal.at(func);
|
|
|
|
for (auto i : subRetVal) {
|
|
auto *dcl = lrs[i]->getDcl();
|
|
auto &retValInfo = argsRetVal[dcl];
|
|
retValInfo.subroutines.insert(func);
|
|
if (retValInfo.retValType != RetValType::Unknown)
|
|
retValInfo.retValType = computeRetValType(func, dcl);
|
|
vISA_ASSERT(retValInfo.retValType != RetValType::Init,
|
|
"expecting non-init retval type");
|
|
retValPerSub[func].insert(dcl);
|
|
if (retValInfo.subroutines.size() > 1)
|
|
retValInfo.retValType = RetValType::Unknown;
|
|
vISA_ASSERT(!hasUniqueFuncHome(dcl),
|
|
"retval cannot have non-nullptr home function");
|
|
}
|
|
|
|
if (kernel.getOption(vISA_VerifyAugmentation)) {
|
|
dumpRetVal(subRetVal);
|
|
}
|
|
}
|
|
|
|
void Augmentation::discoverArgs(FuncInfo *func) {
|
|
if (!liveAnalysis.livenessClass(G4_GRF))
|
|
return;
|
|
|
|
vISA_ASSERT(argsPerSub.count(func) == 0, "already saw sub");
|
|
argsPerSub[func] = {};
|
|
|
|
SparseBitVector subArgs;
|
|
if (func == kernel.fg.kernelInfo)
|
|
subArgs = liveAnalysis.use_in[kernel.fg.getEntryBB()->getId()] &
|
|
liveAnalysis.def_in[kernel.fg.getEntryBB()->getId()];
|
|
else
|
|
subArgs = liveAnalysis.args.at(func);
|
|
|
|
for (auto i : subArgs) {
|
|
auto *dcl = lrs[i]->getDcl();
|
|
auto &argInfo = argsRetVal[dcl];
|
|
argInfo.subroutines.insert(func);
|
|
if (argInfo.argType != ArgType::Unknown)
|
|
argInfo.argType = computeArgType(func, dcl);
|
|
vISA_ASSERT(argInfo.argType != ArgType::Init,
|
|
"expecting non-init arg type");
|
|
argsPerSub[func].insert(dcl);
|
|
// Same arg cannot be shared between 2 subroutines
|
|
if (argInfo.subroutines.size() > 1 &&
|
|
argInfo.argType == ArgType::DefBeforeEachCall)
|
|
argInfo.argType = ArgType::Unknown;
|
|
vISA_ASSERT(
|
|
argInfo.argType != ArgType::DefBeforeEachCall ||
|
|
!hasUniqueFuncHome(dcl),
|
|
"def before each call arg cannot have non-nullptr home function");
|
|
}
|
|
|
|
|
|
if (kernel.getOption(vISA_VerifyAugmentation)) {
|
|
func->dump(std::cout);
|
|
dumpArgs(subArgs);
|
|
}
|
|
}
|
|
|
|
void Augmentation::dumpSortedIntervals() {
|
|
if (kernel.getOption(vISA_DumpProgramWithLexicalId)) {
|
|
for (auto bb : kernel.fg.getBBList()) {
|
|
for (auto inst : *bb) {
|
|
std::cout << inst->getLexicalId() << ":\t";
|
|
inst->print(std::cout);
|
|
}
|
|
}
|
|
}
|
|
|
|
std::cout << "Started dumping sorted intervals:\n";
|
|
std::unordered_map<G4_Declare *, std::vector<Interval>> intervalsPerVar;
|
|
for (auto &entry : sortedIntervals) {
|
|
intervalsPerVar[entry.dcl].push_back(entry.interval);
|
|
}
|
|
|
|
for (auto &entry : sortedIntervals) {
|
|
auto &interval = entry.interval;
|
|
auto *dcl = entry.dcl;
|
|
std::cout << dcl->getName();
|
|
if (isUnknownArg(dcl))
|
|
std::cout << " (Unknown arg)";
|
|
else if (isUnknownRetVal(dcl))
|
|
std::cout << " (Unknown retval)";
|
|
else if (isDefBeforeEachCallArg(dcl))
|
|
std::cout << " (DefBeforeEachCallArg)";
|
|
else if (isLiveThroughArg(dcl))
|
|
std::cout << " (LiveThroughArg)";
|
|
else if (isRegularRetVal(dcl))
|
|
std::cout << " (RegularRetVal)";
|
|
if (dcl->getDeclId() >= homeFunc.size()) {
|
|
std::cout << " @ (new var)";
|
|
}
|
|
else {
|
|
auto *homeFunction = homeFunc[dcl->getDeclId()];
|
|
if (!homeFunction)
|
|
std::cout << " @ (global)";
|
|
else
|
|
std::cout << " @ (func " << (int)homeFunction->getId() << ")";
|
|
}
|
|
std::cout << " - (" << gra.getIntervalStart(interval)->getLexicalId()
|
|
<< ", " << gra.getIntervalEnd(interval)->getLexicalId() << "]";
|
|
if (intervalsPerVar[dcl].size() > 1) {
|
|
auto &allIntervals = intervalsPerVar[dcl];
|
|
std::cout << " other intervals: ";
|
|
for (auto &otherInterval : allIntervals) {
|
|
if (otherInterval == interval)
|
|
continue;
|
|
std::cout << "(" << gra.getIntervalStart(otherInterval)->getLexicalId()
|
|
<< ", " << gra.getIntervalEnd(otherInterval)->getLexicalId()
|
|
<< "] ";
|
|
}
|
|
}
|
|
std::cout << "\n";
|
|
}
|
|
std::cout << "Ended dumping sorted intervals:\n";
|
|
}
|
|
|
|
void Augmentation::dumpRetVal(SparseBitVector &subRetVal) {
|
|
auto getRetValType = [](RetValType retValType) {
|
|
if (retValType == Augmentation::RetValType::Init)
|
|
return "Init";
|
|
else if (retValType == Augmentation::RetValType::Regular)
|
|
return "Regular";
|
|
else if (retValType == Augmentation::RetValType::Unknown)
|
|
return "Unknown";
|
|
return "???";
|
|
};
|
|
|
|
for (auto i : subRetVal) {
|
|
printf("Retval = %s (%d) - %s\n",
|
|
gra.incRA.getLRs()[i]->getDcl()->getName(), i,
|
|
getRetValType(argsRetVal[lrs[i]->getDcl()].retValType));
|
|
}
|
|
printf("\n\n");
|
|
}
|
|
|
|
void Augmentation::dumpArgs(SparseBitVector& subArgs)
|
|
{
|
|
printf("\n");
|
|
|
|
printf("\n");
|
|
auto getArgType = [](ArgType argType) {
|
|
if (argType == Augmentation::ArgType::DefBeforeEachCall)
|
|
return "DefBeforeCall";
|
|
else if (argType == Augmentation::ArgType::Init)
|
|
return "Init";
|
|
else if (argType == Augmentation::ArgType::LiveThrough)
|
|
return "LiveThrough";
|
|
else if (argType == Augmentation::ArgType::Unknown)
|
|
return "Unknown";
|
|
return "???";
|
|
};
|
|
for (auto i : subArgs) {
|
|
printf("Arg = %s (%d) - %s\n", gra.incRA.getLRs()[i]->getDcl()->getName(),
|
|
i, getArgType(argsRetVal[lrs[i]->getDcl()].argType));
|
|
}
|
|
printf("\n");
|
|
}
|
|
|
|
//
|
|
// Mark interference between newDcl and other incompatible dcls in current
|
|
// active lists.
|
|
//
|
|
void Augmentation::buildSIMDIntfDcl(G4_Declare *newDcl) {
|
|
auto newDclAugMask = gra.getAugmentationMask(newDcl);
|
|
auto intfNeededForNewDcl =
|
|
(gra.incRA.isEnabled() && gra.incRA.hasAnyCandidates())
|
|
? gra.incRA.intfNeededForVar(newDcl)
|
|
: true;
|
|
auto id1 = newDcl->getRegVar()->getId();
|
|
auto newDclRAPartaker = newDcl->getRegVar()->isRegAllocPartaker();
|
|
|
|
auto intfNeeded = [&](G4_Declare *otherDcl) {
|
|
if (!intfNeededForNewDcl && !gra.incRA.intfNeededForVar(otherDcl)) {
|
|
return false;
|
|
}
|
|
|
|
auto otherRegVar = otherDcl->getRegVar();
|
|
if (newDclRAPartaker && otherRegVar->isRegAllocPartaker()) {
|
|
auto id2 = otherRegVar->getId();
|
|
|
|
if (intf.interfereBetween(id1, id2)) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
};
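  // The filter above lets us skip SIMD interference work when incremental RA
  // says neither dcl needs a fresh interference computation, or when both are
  // RA partakers that already have a strong interference edge recorded.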
|
|
|
|
if (newDclAugMask == AugmentationMasks::NonDefault)
|
|
{
|
|
for (auto& defaultEntry : defaultMaskQueue) {
|
|
auto defaultDcl = defaultEntry.dcl;
|
|
if (!intfNeeded(defaultDcl))
|
|
continue;
|
|
|
|
handleSIMDIntf(defaultDcl, newDcl, false);
|
|
}
|
|
} else {
|
|
for (auto &defaultEntry : defaultMaskQueue) {
|
|
auto defaultDcl = defaultEntry.dcl;
|
|
auto defaultDclAugMask = gra.getAugmentationMask(defaultDcl);
|
|
if (defaultDclAugMask != newDclAugMask) {
|
|
if (!intfNeeded(defaultDcl))
|
|
continue;
|
|
|
|
handleSIMDIntf(defaultDcl, newDcl, false);
|
|
} else {
|
|
if (liveAnalysis.livenessClass(G4_GRF) &&
|
|
// Populate compatible sparse intf data structure
|
|
// only for weak edges.
|
|
weakEdgeNeeded(defaultDclAugMask, newDclAugMask)) {
|
|
if (defaultDcl->getRegVar()->isPhyRegAssigned() &&
|
|
newDcl->getRegVar()->isPhyRegAssigned()) {
|
|
continue;
|
|
}
|
|
|
|
if (intf.isStrongEdgeBetween(defaultDcl, newDcl)) {
|
|
// No need to add weak edge
|
|
continue;
|
|
}
|
|
|
|
// defaultDcl and newDcl are compatible live-ranges and can have weak
|
|
// edge in intf graph
|
|
intf.compatibleSparseIntf[defaultDcl].insert(newDcl);
|
|
intf.compatibleSparseIntf[newDcl].insert(defaultDcl);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Mark interference among non-default mask variables
|
|
for (auto &nonDefaultEntry : nonDefaultMaskQueue) {
|
|
auto nonDefaultDcl = nonDefaultEntry.dcl;
|
|
auto isAugNeeded = [&]() {
|
|
if (newDclAugMask != AugmentationMasks::NonDefault)
|
|
return true;
|
|
|
|
// Skip augmentation check if both dcls are infinite spill cost tmp dcls
|
|
// generated by RA. Such dcls have their interference correctly computed
|
|
// by conventional interference computation. In case of address taken
|
|
// spill/fill dcls, applying augmentation on them causes unexpected
|
|
// interference edges.
|
|
//
|
|
// Unexpected intf shows up because we reuse dcl for address taken
|
|
// spill/fill across BBs. As per generated code, such address taken
|
|
// spill/fill dcl ranges are live only around the indirect operand. Also,
|
|
// these ranges are never live across BBs. As augmentation models
|
|
// live-intervals without holes, it ends up with unnecessary
|
|
// interferences. Here is such an example of unnecessary interference
|
|
// edge:
|
|
//
|
|
// BB1:
|
|
// A0 = &ADDR_SP_FL_1 + offset
|
|
// (W) Fill ADDR_SP_FL_1
|
|
// r[A0] = ...
|
|
// (W) Spill ADD_SP_FL_1
|
|
//
|
|
// BB2:
|
|
// (W) Fill FL_V10
|
|
// = FL_V10
|
|
//
|
|
// BB10:
|
|
// (W) Fill ADDR_SP_FL_1
|
|
// r[A0] = ...
|
|
// (W) Spill ADD_SP_FL_1
|
|
//
|
|
      // ADDR_SP_FL_1 and FL_V10 shouldn't interfere. Without the logic below,
      // they would interfere, making RA results worse.
|
|
|
|
auto regVar1 = nonDefaultDcl->getRegVar();
|
|
auto regVar2 = newDcl->getRegVar();
|
|
if (!((regVar1->isRegVarTmp() || regVar1->isRegVarTransient() ||
|
|
regVar1->isRegVarCoalesced()) &&
|
|
(regVar2->isRegVarTmp() || regVar2->isRegVarTransient() ||
|
|
regVar2->isRegVarCoalesced())))
|
|
return true;
|
|
|
|
// Both dcls are RA tmps. Ordinarily they're never live-out of any BB. If
|
|
// any of them is live across BBs then it's possible they don't interfere
|
|
// as per scalar liveness but they may interfere due to divergent CF.
|
|
// For example:
|
|
//
|
|
// if(cond)
|
|
// (W) V1 = ...
|
|
// else
|
|
// (W) V2 = ...
|
|
// = V2
|
|
// endif
|
|
//
|
|
// = V1
|
|
//
|
|
      // In the above example, V1 doesn't interfere with V2 as per scalar
      // liveness, but it should if the branch were divergent. For correctness
      // we need to mark V1 and V2 as interfering. Since they're never live
      // together as per scalar liveness, they need to be handled in
      // augmentation. This case shouldn't occur for RA tmps as RA-generated
      // spill/fill tmps are transient and never live-out of any BB. Still
      // adding the check to be safe.
|
|
|
|
auto id1 = regVar1->getId();
|
|
auto id2 = regVar2->getId();
|
|
|
|
for (auto bb : kernel.fg.getBBList()) {
|
|
if (liveAnalysis.isLiveAtExit(bb, id1) ||
|
|
liveAnalysis.isLiveAtExit(bb, id2))
|
|
return true;
|
|
}
|
|
|
|
// Conventional intf construction correctly handles the scenario when V1
|
|
// and V2 are referenced in single (same) BB.
|
|
|
|
return false;
|
|
};
|
|
|
|
if (!isAugNeeded())
|
|
continue;
|
|
|
|
if (!intfNeeded(nonDefaultDcl))
|
|
continue;
|
|
|
|
// Non-default masks are different so mark interference.
|
|
// SIMD interference for call sites is handled separately.
|
|
handleSIMDIntf(nonDefaultDcl, newDcl, false);
|
|
}
|
|
}
|
|
|
|
//
|
|
// Mark interference between newDcl and other incompatible dcls in current
|
|
// active lists. If newDcl was created for a subroutine call, do this for all
// variables in the function summary.
|
|
//
|
|
void Augmentation::storeOverlapWithCallRet(G4_Declare *newDcl,
|
|
const std::vector<bool>& globalVars) {
|
|
vISA_ASSERT(callDclMap.count(newDcl) > 0, "expecting newDcl in map");
|
|
auto& callDclData = callDclMap[newDcl];
|
|
|
|
if (liveAnalysis.livenessClass(G4_GRF)) // For return value
|
|
{
|
|
G4_INST *callInst = callDclData.first;
|
|
auto* varDcl = callInst->getDst()->getTopDcl();
|
|
addSIMDIntfForRetDclares(varDcl, globalVars);
|
|
}
|
|
|
|
auto *callBB = callDclData.second;
|
|
addSIMDIntfDclForCallSite(callBB, globalVars);
|
|
}
|
|
|
|
//
|
|
// Perform linear scan and mark interference between conflicting dcls with
|
|
// incompatible masks.
|
|
//
|
|
void Augmentation::buildInterferenceIncompatibleMask() {
|
|
// Collect global vars in unordered_set for quick lookup
|
|
std::vector<bool> globalVars(liveAnalysis.getNumSelectedVar(), false);
|
|
if (!kernel.fg.funcInfoTable.empty()) {
|
|
for (auto bit : liveAnalysis.globalVars)
|
|
globalVars[bit] = true;
|
|
}
|
|
|
|
// Create 2 active lists - 1 for holding active live-intervals
|
|
  // with non-default mask and the other for default mask
|
|
for (auto &interval : sortedIntervals) {
|
|
auto *newDcl = interval.dcl;
|
|
unsigned startIdx = interval.interval.start->getLexicalId();
|
|
VISA_DEBUG_VERBOSE(std::cout << "New idx " << startIdx << "\n");
|
|
expireIntervals(startIdx);
|
|
|
|
if (callDclMap.count(newDcl) > 0) {
|
|
storeOverlapWithCallRet(newDcl, globalVars);
|
|
} else {
|
|
buildSIMDIntfDcl(newDcl);
|
|
}
|
|
|
|
// Add newDcl to correct list
|
|
if (gra.getHasNonDefaultMaskDef(newDcl) || newDcl->getAddressed() == true) {
|
|
nonDefaultMaskQueue.push(interval);
|
|
VISA_DEBUG_VERBOSE(std::cout << "Adding " << newDcl->getName()
|
|
<< " to non-default list\n");
|
|
} else {
|
|
defaultMaskQueue.push(interval);
|
|
VISA_DEBUG_VERBOSE(std::cout << "Adding " << newDcl->getName()
|
|
<< " to default list\n");
|
|
}
|
|
}
|
|
|
|
for (auto func : kernel.fg.funcInfoTable) {
|
|
buildInteferenceForCallsite(func);
|
|
}
|
|
buildInteferenceForRetDeclares();
|
|
}
|
|
|
|
void Augmentation::buildInteferenceForCallSiteOrRetDeclare(std::vector<G4_Declare*>& dcls,
|
|
MaskDeclares *mask) {
|
|
for (auto newDcl : dcls) {
|
|
auto newDclAugMask = gra.getAugmentationMask(newDcl);
|
|
auto intfNeededForNewDcl =
|
|
(gra.incRA.isEnabled() && gra.incRA.hasAnyCandidates())
|
|
? gra.incRA.intfNeededForVar(newDcl)
|
|
: true;
|
|
auto id1 = newDcl->getRegVar()->getId();
|
|
auto newDclRAPartaker = newDcl->getRegVar()->isRegAllocPartaker();
|
|
|
|
auto intfNeeded = [&](G4_Declare *otherDcl) {
|
|
if (!intfNeededForNewDcl && !gra.incRA.intfNeededForVar(otherDcl)) {
|
|
return false;
|
|
}
|
|
|
|
auto otherRegVar = otherDcl->getRegVar();
|
|
if (newDclRAPartaker && otherRegVar->isRegAllocPartaker()) {
|
|
auto id2 = otherRegVar->getId();
|
|
if (intf.interfereBetween(id1, id2)) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
};
|
|
|
|
if (newDclAugMask == AugmentationMasks::NonDefault) {
|
|
for (auto i : mask->first) {
|
|
G4_Declare *defaultDcl = lrs[i]->getDcl();
|
|
if (!intfNeeded(defaultDcl))
|
|
continue;
|
|
handleSIMDIntf(defaultDcl, newDcl, true);
|
|
}
|
|
} else {
|
|
for (auto i : mask->first) {
|
|
G4_Declare *defaultDcl = lrs[i]->getDcl();
|
|
if (gra.getAugmentationMask(defaultDcl) != newDclAugMask) {
|
|
if (!intfNeeded(defaultDcl))
|
|
continue;
|
|
handleSIMDIntf(defaultDcl, newDcl, true);
|
|
} else {
|
|
if (liveAnalysis.livenessClass(G4_GRF) &&
|
|
// Populate compatible sparse intf data structure
|
|
// only for weak edges.
|
|
weakEdgeNeeded(gra.getAugmentationMask(defaultDcl),
|
|
newDclAugMask)) {
|
|
if (defaultDcl->getRegVar()->isPhyRegAssigned() &&
|
|
newDcl->getRegVar()->isPhyRegAssigned()) {
|
|
continue;
|
|
}
|
|
|
|
if (intf.isStrongEdgeBetween(defaultDcl, newDcl)) {
|
|
// No need to add weak edge
|
|
continue;
|
|
}
|
|
|
|
// defaultDcl and newDcl are compatible live-ranges and can have
|
|
// weak edge in intf graph
|
|
intf.compatibleSparseIntf[defaultDcl].insert(newDcl);
|
|
intf.compatibleSparseIntf[newDcl].insert(defaultDcl);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
for (auto i : mask->second) {
|
|
if (!intfNeeded(lrs[i]->getDcl()))
|
|
continue;
|
|
// Mark interference among non-default mask variables
|
|
G4_Declare *nonDefaultDcl = lrs[i]->getDcl();
|
|
// Non-default masks are different so mark interference
|
|
handleSIMDIntf(nonDefaultDcl, newDcl, true);
|
|
}
|
|
}
|
|
}
|
|
|
|
std::vector<G4_Declare *> SBitToVector(SparseBitVector *sparseBitVector,
                                       const LiveRangeVec &lrs) {
  std::vector<G4_Declare *> retVector;
  for (auto bit : *sparseBitVector) {
    auto *varDcl = lrs[bit]->getDcl();
    retVector.push_back(varDcl);
  }

  return retVector;
}
|
|
|
|
// This method is invoked once per subroutine func.
|
|
void Augmentation::buildInteferenceForCallsite(FuncInfo *func) {
|
|
auto maydefConst = liveAnalysis.subroutineMaydef.find(func);
|
|
if (maydefConst != liveAnalysis.subroutineMaydef.end()) {
|
|
auto *maydef = const_cast<SparseBitVector *>(&maydefConst->second);
|
|
std::vector<G4_Declare *> maydefDcls(SBitToVector(maydef, lrs));
|
|
buildInteferenceForCallSiteOrRetDeclare(maydefDcls, &overlapDclsWithFunc[func]);
|
|
}
|
|
if (gra.useLocalRA) {
|
|
std::vector<G4_Declare *> lraDcls;
|
|
for (uint32_t j = 0; j < kernel.getNumRegTotal(); j++) {
|
|
if (localSummaryOfCallee[func].isGRFBusy(j)) {
|
|
G4_Declare *varDcl = gra.getGRFDclForHRA(j);
|
|
lraDcls.push_back(varDcl);
|
|
}
|
|
}
|
|
buildInteferenceForCallSiteOrRetDeclare(lraDcls, &overlapDclsWithFunc[func]);
|
|
}
|
|
}
|
|
|
|
void Augmentation::buildInteferenceForRetDeclares() {
  for (auto &retDclIt : retDeclares) {
    std::vector<G4_Declare *> retDclVec({retDclIt.first});
    buildInteferenceForCallSiteOrRetDeclare(retDclVec, &retDclIt.second);
  }
}
|
|
|
|
void Augmentation::buildSummaryForCallees() {
|
|
int totalGRFNum = kernel.getNumRegTotal();
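  // For each subroutine, gather the union of GRFs reported busy by local RA in
  // its own BBs and in everything it calls; callers later treat these GRFs as
  // busy across the call site. This relies on sortedFuncTable listing callees
  // before their callers so that localSummaryOfCallee is already populated.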
|
|
|
|
for (auto func : kernel.fg.sortedFuncTable) {
|
|
unsigned fid = func->getId();
|
|
if (fid == UINT_MAX) {
|
|
// entry kernel
|
|
continue;
|
|
}
|
|
PhyRegSummary funcSummary(kernel.fg.builder, totalGRFNum);
|
|
for (auto &&bb : func->getBBList()) {
|
|
if (auto summary = gra.getBBLRASummary(bb)) {
|
|
for (int i = 0; i < totalGRFNum; i++) {
|
|
if (summary->isGRFBusy(i)) {
|
|
funcSummary.setGRFBusy(i);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
for (auto &&callee : func->getCallees()) {
|
|
PhyRegSummary *summary = &localSummaryOfCallee[callee];
|
|
if (summary) {
|
|
for (int i = 0; i < totalGRFNum; i++) {
|
|
if (summary->isGRFBusy(i)) {
|
|
funcSummary.setGRFBusy(i);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
localSummaryOfCallee[func] = std::move(funcSummary);
|
|
}
|
|
}
|
|
|
|
void Augmentation::augmentIntfGraph() {
|
|
if (!(kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D &&
|
|
!liveAnalysis.livenessClass(G4_ADDRESS) && kernel.fg.size() > 2)) {
|
|
if (!kernel.getOption(vISA_DumpRegChart)) {
|
|
return;
|
|
}
|
|
}
|
|
|
|
if (gra.useLocalRA) {
|
|
buildSummaryForCallees();
|
|
}
|
|
|
|
bool augWithHoles = kernel.getOption(vISA_NewAugmentation);
|
|
|
|
// First check whether any definitions exist with incompatible mask
|
|
bool nonDefaultMaskDef = markNonDefaultMaskDef();
|
|
|
|
if (nonDefaultMaskDef == true) {
|
|
if (augWithHoles) {
|
|
if (kernel.fg.getNumFuncs() > 0)
|
|
populateFuncMaps();
|
|
|
|
populateHomeFunc();
|
|
|
|
      // At least one definition with a non-default mask was found, so
      // perform steps to augment the intf graph with such defs
|
|
|
|
// Discover and store subroutine arguments
|
|
if (hasSubroutines) {
|
|
for (auto &subroutine : kernel.fg.sortedFuncTable) {
|
|
discoverArgs(subroutine);
|
|
discoverRetVal(subroutine);
|
|
|
|
// Now build live-intervals per subroutine. This function will
|
|
// calculate live-intervals and assign start/end inst for
|
|
// respective declares.
|
|
buildLiveIntervals(subroutine);
|
|
}
|
|
} else {
|
|
buildLiveIntervals(kernel.fg.kernelInfo);
|
|
}
|
|
|
|
// Create live-intervals for Unknown arg and retval
|
|
buildUnknownArgRetval();
|
|
} else {
|
|
buildLiveIntervals();
|
|
}
|
|
|
|
// Sort live-intervals based on their start
|
|
sortLiveIntervals();
|
|
|
|
if (kernel.getOption(vISA_DumpLiveRanges)) {
|
|
dumpLiveRanges(gra, sortedIntervals);
|
|
}
|
|
|
|
if (kernel.getOption(vISA_DumpRegChart)) {
|
|
gra.regChart = std::make_unique<RegChartDump>(gra);
|
|
gra.regChart->recordLiveIntervals(sortedIntervals);
|
|
}
|
|
|
|
if (gra.verifyAugmentation) {
|
|
gra.verifyAugmentation->loadAugData(
|
|
sortedIntervals, lrs, callDclMap,
|
|
intf.liveAnalysis->getNumSelectedVar(), &intf, gra);
|
|
}
|
|
|
|
if (kernel.getOption(vISA_SpillAnalysis)) {
|
|
if (gra.spillAnalysis.get())
|
|
gra.spillAnalysis->LoadAugIntervals(sortedIntervals, gra);
|
|
}
|
|
|
|
if (kernel.fg.builder->getOption(vISA_GenerateDebugInfo)) {
|
|
      // The following is done to avoid passing GlobalRA to the debug info
      // function, keeping its interface clean.
|
|
std::vector<std::tuple<G4_Declare *, G4_INST *, G4_INST *>> dclIntervals;
|
|
dclIntervals.reserve(sortedIntervals.size());
|
|
for (auto &interval : sortedIntervals) {
|
|
auto dcl = interval.dcl;
|
|
dclIntervals.push_back(std::make_tuple(dcl, interval.interval.start,
|
|
interval.interval.end));
|
|
}
|
|
updateDebugInfo(kernel, std::move(dclIntervals));
|
|
}
|
|
|
|
// Perform linear scan to augment graph
|
|
buildInterferenceIncompatibleMask();
|
|
|
|
if (liveAnalysis.livenessClass(G4_GRF)) {
|
|
if ((GlobalRA::useGenericAugAlign(kernel.getPlatformGeneration()) &&
|
|
kernel.getSimdSize() >= kernel.numEltPerGRF<Type_UD>()) ||
|
|
(!GlobalRA::useGenericAugAlign(kernel.getPlatformGeneration()) &&
|
|
kernel.getSimdSize() > kernel.numEltPerGRF<Type_UD>())) {
|
|
// Set alignment of all GRF candidates
|
|
// to 2GRF except for NoMask variables
|
|
VISA_DEBUG_VERBOSE(std::cout
|
|
<< "Kernel size is SIMD" << kernel.getSimdSize()
|
|
<< " so updating all GRFs to aug align"
|
|
<< "\n");
|
|
gra.augAlign();
|
|
}
|
|
gra.updateSubRegAlignment(kernel.getGRFAlign());
|
|
}
|
|
}
|
|
}
|
|
|
|
void Interference::buildInterferenceWithLocalRA(G4_BB *bb) {
|
|
auto LRASummary = gra.getBBLRASummary(bb);
|
|
if (LRASummary == nullptr) {
|
|
return;
|
|
}
|
|
|
|
BitSet cur(kernel.getNumRegTotal(), true);
|
|
SparseBitVector live;
|
|
std::vector<int> curUpdate;
|
|
|
|
buildInterferenceAtBBExit(bb, live);
|
|
VISA_DEBUG_VERBOSE(std::cout << "BB" << bb->getId() << "\n");
|
|
|
|
for (INST_LIST_RITER rit = bb->rbegin(), rend = bb->rend(); rit != rend;
|
|
rit++) {
|
|
bool update = false;
|
|
G4_INST *inst = (*rit);
|
|
curUpdate.clear();
|
|
VISA_DEBUG_VERBOSE({
|
|
inst->emit(std::cout);
|
|
std::cout << "\n";
|
|
});
|
|
|
|
// Any physical registers defined will be marked available if
|
|
// current inst is first def or if complete region is written
|
|
G4_DstRegRegion *dst = inst->getDst();
|
|
|
|
if (dst && dst->getBase()->isRegVar()) {
|
|
LocalLiveRange *localLR = NULL;
|
|
G4_Declare *topdcl = GetTopDclFromRegRegion(dst);
|
|
unsigned t;
|
|
|
|
if (topdcl)
|
|
localLR = gra.getLocalLR(topdcl);
|
|
|
|
if (localLR && localLR->getAssigned() && !localLR->isEOT()) {
|
|
int reg, sreg, numrows;
|
|
G4_VarBase *preg = localLR->getPhyReg(sreg);
|
|
numrows = localLR->getTopDcl()->getNumRows();
|
|
|
|
vISA_ASSERT(preg->isGreg(), "Register in dst was not GRF");
|
|
|
|
reg = preg->asGreg()->getRegNum();
|
|
|
|
// Check whether the dst physical register is busy/available.
|
|
// If it is available, and we still see a def that means there was no
|
|
// corresponding use. In such cases mark the physical register as
|
|
// busy, so interference building can take place correctly.
|
|
for (int j = reg, sum = reg + numrows; j < sum; j++) {
|
|
int k = getGRFDclForHRA(j)->getRegVar()->getId();
|
|
|
|
if (cur.isSet(j) == true) {
|
|
buildInterferenceWithLive(live, k);
|
|
VISA_DEBUG_VERBOSE(
|
|
std::cout << "Found no use for r" << j
|
|
<< ".0 so marking it as interfering with live set"
|
|
<< "\n");
|
|
}
|
|
}
|
|
|
|
if ((localLR->getFirstRef(t) == inst) ||
|
|
liveAnalysis->writeWholeRegion(bb, inst, dst)) {
|
|
          // The last row may be only partially used by the current dcl, so we
          // still need to pessimistically mark the last range as busy, because
          // some other src opnd that is live may still be using the remaining
          // GRF.
|
|
if (localLR->getSizeInWords() % kernel.numEltPerGRF<Type_UW>() != 0)
|
|
numrows--;
|
|
|
|
for (int j = reg, sum = reg + numrows; j < sum; j++) {
|
|
cur.set(j, true);
|
|
VISA_DEBUG_VERBOSE(std::cout << "Setting r" << j << ".0 available"
|
|
<< "\n");
|
|
}
|
|
|
|
          // Build interference only for point ranges, which ideally shouldn't
          // exist. These are ranges that have a def, but no use.
|
|
if (localLR->getFirstRef(t) == localLR->getLastRef(t)) {
|
|
for (int j = reg; j < reg + localLR->getTopDcl()->getNumRows();
|
|
j++) {
|
|
int k = getGRFDclForHRA(j)->getRegVar()->getId();
|
|
buildInterferenceWithLive(live, k);
|
|
}
|
|
}
|
|
}
|
|
} else if (dst->getBase()->isRegAllocPartaker()) {
|
|
// Global range
|
|
|
|
// In bottom-up order if the live-range has not started then
|
|
// a use was not seen for this def. But we need to ensure this
|
|
// variable interferes with all other live vars.
|
|
bool isPointRange = !live.test(dst->getBase()->asRegVar()->getId());
|
|
|
|
if (isPointRange) {
|
|
// Mark interference with all busy physical registers
|
|
for (unsigned i = 0; i < kernel.getNumRegTotal(); i++) {
|
|
if (cur.isSet(i) == false) {
|
|
int k = getGRFDclForHRA(i)->getRegVar()->getId();
|
|
checkAndSetIntf(dst->getBase()->asRegVar()->getId(), k);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (liveAnalysis->writeWholeRegion(bb, inst, dst) ||
|
|
inst->isPseudoKill()) {
|
|
// Whole write or first def found so mark this operand as not live for
|
|
// earlier instructions
|
|
auto id = dst->getBase()->asRegVar()->getId();
|
|
updateLiveness(live, id, false);
|
|
}
|
|
} else if (dst->isIndirect() && liveAnalysis->livenessClass(G4_GRF)) {
|
|
// make every var in points-to set live
|
|
const REGVAR_VECTOR &pointsToSet =
|
|
liveAnalysis->getPointsToAnalysis().getAllInPointsToOrIndrUse(dst,
|
|
bb);
|
|
for (auto &pt : pointsToSet) {
|
|
if (pt.var->isRegAllocPartaker()) {
|
|
updateLiveness(live, pt.var->getId(), true);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Any physical registers used by src opnds will be busy before the current
|
|
// inst
|
|
for (int i = 0, numSrc = inst->getNumSrc(); i < numSrc; i++) {
|
|
G4_Operand *src = inst->getSrc(i);
|
|
|
|
if (src && src->isSrcRegRegion() &&
|
|
src->asSrcRegRegion()->getBase()->isRegVar()) {
|
|
LocalLiveRange *localLR = NULL;
|
|
G4_Declare *topdcl = GetTopDclFromRegRegion(src);
|
|
|
|
if (topdcl)
|
|
localLR = gra.getLocalLR(topdcl);
|
|
|
|
if (localLR && localLR->getAssigned() && !localLR->isEOT()) {
|
|
int sreg;
|
|
G4_VarBase *preg = localLR->getPhyReg(sreg);
|
|
int numrows = localLR->getTopDcl()->getNumRows();
|
|
|
|
vISA_ASSERT(preg->isGreg(), "Register in src was not GRF");
|
|
|
|
int reg = preg->asGreg()->getRegNum();
|
|
|
|
for (int j = reg, sum = reg + numrows; j < sum; j++) {
|
|
int k = getGRFDclForHRA(j)->getRegVar()->getId();
|
|
|
|
if (cur.isSet(j) == true) {
|
|
// G4_RegVar with id k was marked free, but becomes
|
|
// busy at this instruction. For incremental updates
|
|
// push this to a vector and use it while updating
|
|
// interference graph incrementally.
|
|
curUpdate.push_back(k);
|
|
}
|
|
|
|
cur.set(j, false);
|
|
VISA_DEBUG_VERBOSE(std::cout << "Setting r" << j << ".0 busy\n");
|
|
}
|
|
} else if (src->asSrcRegRegion()->getBase()->isRegAllocPartaker()) {
|
|
if (live.test(
|
|
src->asSrcRegRegion()->getBase()->asRegVar()->getId()) ==
|
|
false)
|
|
update = true;
|
|
|
|
// Mark operand as live from this inst upwards
|
|
auto id = src->asSrcRegRegion()->getBase()->asRegVar()->getId();
|
|
updateLiveness(live, id, true);
|
|
} else if (src->asSrcRegRegion()->isIndirect() &&
|
|
liveAnalysis->livenessClass(G4_GRF)) {
|
|
// make every var in points-to set live
|
|
const REGVAR_VECTOR &pointsToSet =
|
|
liveAnalysis->getPointsToAnalysis().getAllInPointsToOrIndrUse(
|
|
src->asSrcRegRegion(), bb);
|
|
for (auto &pt : pointsToSet) {
|
|
if (pt.var->isRegAllocPartaker()) {
|
|
if (live.test(pt.var->getId()) == false)
|
|
update = true;
|
|
|
|
updateLiveness(live, pt.var->getId(), true);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if (update == true) {
|
|
// Mark interference with all live
|
|
for (unsigned i = 0; i < kernel.getNumRegTotal(); i++) {
|
|
if (cur.isSet(i) == false) {
|
|
int k = getGRFDclForHRA(i)->getRegVar()->getId();
|
|
buildInterferenceWithLive(live, k);
|
|
}
|
|
}
|
|
} else {
|
|
if (curUpdate.size() > 0) {
|
|
// Perform incremental update. This code is executed when:
|
|
        // 1) live set is unchanged, i.e., no new global range was started in inst
        // 2) cur set has changed, i.e., an earlier free GRF has become busy
|
|
// Any new busy GRFs will have to be marked as interfering with
|
|
// currently live-ranges. There is no need to iterate over all
|
|
// busy GRFs. Instead only those GRFs that have got busy in this
|
|
// iteration can be considered for incremental updates.
|
|
for (int k : curUpdate) {
|
|
buildInterferenceWithLive(live, k);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
for (unsigned i = 0; i < maxId; i++) {
|
|
bool isAddrSensitive = liveAnalysis->isAddressSensitive(i);
|
|
|
|
// If a range is Address taken AND (live-in or live-out or killed)
|
|
// mark it to interfere with all physical registers used by local RA
|
|
// FIXME: need to check if this is actually needed
|
|
if (isAddrSensitive) {
|
|
bool assigned = (lrs[i]->getVar()->getPhyReg() != NULL);
|
|
if (!assigned) {
|
|
bool isLiveIn = liveAnalysis->isLiveAtEntry(bb, i);
|
|
bool isLiveOut = liveAnalysis->isLiveAtExit(bb, i);
|
|
bool isKilled = liveAnalysis->use_kill[bb->getId()].test(i);
|
|
if (isLiveIn || isLiveOut || isKilled) {
|
|
// Make it to interfere with all physical registers used in the BB
|
|
for (uint32_t j = 0, numReg = kernel.getNumRegTotal(); j < numReg;
|
|
j++) {
|
|
if (LRASummary->isGRFBusy(j)) {
|
|
int k = getGRFDclForHRA(j)->getRegVar()->getId();
|
|
checkAndSetIntf(i, k);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
GraphColor::GraphColor(LivenessAnalysis &live, bool hybrid, bool forceSpill_)
|
|
: gra(live.gra), totalGRFRegCount(gra.kernel.getNumRegTotal()),
|
|
numVar(live.getNumSelectedVar()), intf(&live, gra), regPool(gra.regPool),
|
|
builder(gra.builder), lrs(live.gra.incRA.getLRs()), isHybrid(hybrid),
|
|
forceSpill(forceSpill_), GCMem(GRAPH_COLOR_MEM_SIZE), kernel(gra.kernel),
|
|
liveAnalysis(live)
|
|
{
|
|
spAddrRegSig.resize(builder.getNumAddrRegisters(), 0);
|
|
m_options = builder.getOptions();
|
|
}
|
|
|
|
//
|
|
// lrs[i] gives the live range whose id is i
|
|
//
|
|
void GraphColor::createLiveRanges() {
|
|
lrs.resize(numVar);
|
|
for (auto dcl : gra.kernel.Declares) {
|
|
G4_RegVar *var = dcl->getRegVar();
|
|
// Do not include alias var in liverange creation
|
|
if (!var->isRegAllocPartaker() || dcl->getAliasDeclare() != NULL) {
|
|
continue;
|
|
}
|
|
lrs[var->getId()] = LiveRange::createNewLiveRange(dcl, gra);
|
|
}
|
|
}
|
|
|
|
template<bool Support4GRFAlign>
|
|
void GraphColor::computeDegreeForGRF() {
|
|
for (unsigned i = 0; i < numVar; i++) {
|
|
unsigned degree = 0;
|
|
|
|
if (!(lrs[i]->getIsPseudoNode()) && !(lrs[i]->getIsPartialDcl())) {
|
|
const std::vector<unsigned> &intfs = intf.getSparseIntfForVar(i);
|
|
unsigned bankDegree = 0;
|
|
auto lraBC = lrs[i]->getBC();
|
|
bool isOdd = (lraBC == BANK_CONFLICT_SECOND_HALF_EVEN ||
|
|
lraBC == BANK_CONFLICT_SECOND_HALF_ODD);
|
|
|
|
|
|
auto computeDegree = [&](LiveRange *lr1) {
|
|
if (!lr1->getIsPartialDcl()) {
|
|
unsigned edgeDegree = edgeWeightGRF<Support4GRFAlign>(lrs[i], lr1);
|
|
|
|
degree += edgeDegree;
|
|
|
|
auto lrsitBC = lr1->getBC();
|
|
bool isOddBC = (lrsitBC == BANK_CONFLICT_SECOND_HALF_EVEN ||
|
|
lrsitBC == BANK_CONFLICT_SECOND_HALF_ODD);
|
|
|
|
if ((isOdd && isOddBC) || (!isOdd && !isOddBC)) {
|
|
bankDegree += edgeDegree;
|
|
}
|
|
}
|
|
};
|
|
|
|
for (auto it : intfs) {
|
|
computeDegree(lrs[it]);
|
|
}
|
|
|
|
// consider weak edges in degree computation
|
|
auto *weakEdges = intf.getCompatibleSparseIntf(lrs[i]->getDcl());
|
|
if (weakEdges) {
|
|
vISA_ASSERT(!gra.use4GRFAlign, "not expecting weak edges");
|
|
for (auto weakNeighbor : *weakEdges) {
|
|
if (!weakNeighbor->getRegVar()->isRegAllocPartaker())
|
|
continue;
|
|
|
|
computeDegree(lrs[weakNeighbor->getRegVar()->getId()]);
|
|
}
|
|
}
|
|
|
|
if (isOdd) {
|
|
oddTotalDegree += bankDegree; // std::max(bankDegree, oddMaxDegree);
|
|
oddTotalRegNum += lrs[i]->getNumRegNeeded();
|
|
oddMaxRegNum = std::max(oddMaxRegNum, lrs[i]->getNumRegNeeded());
|
|
} else {
|
|
evenTotalDegree += bankDegree; // std::max(bankDegree, evenMaxDegree);
|
|
evenTotalRegNum += lrs[i]->getNumRegNeeded();
|
|
evenMaxRegNum = std::max(evenMaxRegNum, lrs[i]->getNumRegNeeded());
|
|
}
|
|
}
|
|
|
|
lrs[i]->setDegree(degree);
|
|
}
|
|
|
|
if (kernel.getOption(vISA_SpillAnalysis)) {
|
|
for (unsigned int i = 0; i != numVar; ++i) {
|
|
auto dcl = lrs[i]->getDcl();
|
|
auto degree = lrs[i]->getDegree();
|
|
gra.spillAnalysis->LoadDegree(dcl, degree);
|
|
}
|
|
}
|
|
}
|
|
|
|
void GraphColor::computeDegreeForARF() {
|
|
for (unsigned i = 0; i < numVar; i++) {
|
|
unsigned degree = 0;
|
|
|
|
if (!(lrs[i]->getIsPseudoNode())) {
|
|
const std::vector<unsigned> &intfs = intf.getSparseIntfForVar(i);
|
|
for (auto it : intfs) {
|
|
degree += edgeWeightARF(lrs[i], lrs[it]);
|
|
}
|
|
}
|
|
|
|
lrs[i]->setDegree(degree);
|
|
}
|
|
}
|
|
|
|
void GraphColor::computeSpillCosts(bool useSplitLLRHeuristic, const RPE *rpe) {
|
|
LiveRangeVec addressSensitiveVars;
|
|
float maxNormalCost = 0.0f;
|
|
VarReferences directRefs(kernel, true, false);
|
|
std::unordered_map<G4_Declare *, std::list<std::pair<G4_INST *, G4_BB *>>>
|
|
indirectRefs;
|
|
// when reg pressure is not very high in iter0, use spill cost function
|
|
// that favors allocating large variables
|
|
bool useNewSpillCost =
|
|
(builder.getOption(vISA_NewSpillCostFunctionISPC) ||
|
|
builder.getOption(vISA_NewSpillCostFunction)) &&
|
|
rpe &&
|
|
!(gra.getIterNo() == 0 &&
|
|
(float)rpe->getMaxRP() < (float)kernel.getNumRegTotal() * 0.80f);
|
|
|
|
RA_TRACE({
|
|
if (useNewSpillCost)
|
|
std::cout << "\t--using new spill cost function\n";
|
|
});
|
|
|
|
if (useNewSpillCost && liveAnalysis.livenessClass(G4_GRF)) {
|
|
// gather all instructions with indirect operands
|
|
// for ref count computation once.
|
|
for (auto bb : kernel.fg.getBBList()) {
|
|
for (auto inst : *bb) {
|
|
auto dst = inst->getDst();
|
|
if (dst && dst->isIndirect()) {
|
|
auto pointsTo = liveAnalysis.getPointsToAnalysis().getAllInPointsTo(
|
|
dst->getBase()
|
|
->asRegVar()
|
|
->getDeclare()
|
|
->getRootDeclare()
|
|
->getRegVar());
|
|
if (pointsTo) {
|
|
for (auto &pointee : *pointsTo)
|
|
indirectRefs[pointee.var->getDeclare()->getRootDeclare()]
|
|
.push_back(std::make_pair(inst, bb));
|
|
}
|
|
continue;
|
|
}
|
|
|
|
for (unsigned int i = 0; i != inst->getNumSrc(); ++i) {
|
|
auto src = inst->getSrc(i);
|
|
if (!src || !src->isSrcRegRegion() ||
|
|
!src->asSrcRegRegion()->isIndirect()) {
|
|
continue;
|
|
}
|
|
auto pointsTo = liveAnalysis.getPointsToAnalysis().getAllInPointsTo(
|
|
src->asSrcRegRegion()
|
|
->getBase()
|
|
->asRegVar()
|
|
->getDeclare()
|
|
->getRootDeclare()
|
|
->getRegVar());
|
|
if (pointsTo) {
|
|
for (auto &pointee : *pointsTo)
|
|
indirectRefs[pointee.var->getDeclare()->getRootDeclare()]
|
|
.push_back(std::make_pair(inst, bb));
|
|
}
|
|
continue;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
auto getWeightedRefCount = [&](G4_Declare *dcl, unsigned int useWt = 1,
|
|
unsigned int defWt = 1) {
|
|
auto defs = directRefs.getDefs(dcl);
|
|
auto uses = directRefs.getUses(dcl);
|
|
auto &loops = kernel.fg.getLoops();
|
|
unsigned int refCount = 0;
|
|
const unsigned int assumeLoopIter = 10;
|
|
|
|
if (defs) {
|
|
for (auto &def : *defs) {
|
|
auto *bb = std::get<1>(def);
|
|
auto *innerMostLoop = loops.getInnerMostLoop(bb);
|
|
if (innerMostLoop) {
|
|
auto nestingLevel = innerMostLoop->getNestingLevel();
|
|
refCount += (unsigned int)std::pow(assumeLoopIter, nestingLevel);
|
|
} else
|
|
refCount += defWt;
|
|
}
|
|
}
|
|
|
|
if (uses) {
|
|
for (auto &use : *uses) {
|
|
auto *bb = std::get<1>(use);
|
|
auto *innerMostLoop = loops.getInnerMostLoop(bb);
|
|
if (innerMostLoop) {
|
|
auto nestingLevel = innerMostLoop->getNestingLevel();
|
|
refCount += (unsigned int)std::pow(assumeLoopIter, nestingLevel);
|
|
} else
|
|
refCount += useWt;
|
|
}
|
|
}
|
|
|
|
if (dcl->getAddressed()) {
|
|
auto indirectRefsIt = indirectRefs.find(dcl);
|
|
if (indirectRefsIt != indirectRefs.end()) {
|
|
auto &dclIndirRefs = (*indirectRefsIt).second;
|
|
for (auto &item : dclIndirRefs) {
|
|
auto bb = item.second;
|
|
|
|
auto *innerMostLoop = loops.getInnerMostLoop(bb);
|
|
if (innerMostLoop) {
|
|
auto nestingLevel = innerMostLoop->getNestingLevel();
|
|
refCount += (unsigned int)std::pow(assumeLoopIter, nestingLevel);
|
|
} else
|
|
refCount += useWt;
|
|
}
|
|
}
|
|
}
|
|
|
|
return refCount == 0 ? 1 : refCount;
|
|
};
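  // Illustrative example (hypothetical dcl): with assumeLoopIter = 10, a def
  // or use inside a doubly nested loop contributes 10^2 = 100 to the weighted
  // ref count, while a reference outside any loop contributes only defWt/useWt
  // (1 by default). A dcl with no references at all reports 1 so the derived
  // spill cost never degenerates to zero.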
|
|
|
|
std::unordered_map<const G4_Declare *, std::vector<G4_Declare *>>
|
|
addrTakenMap;
|
|
std::unordered_map<G4_Declare *, std::vector<const G4_Declare *>>
|
|
revAddrTakenMap;
|
|
bool addrMapsComputed = false;
|
|
auto incSpillCostCandidate = [&](LiveRange *lr) {
|
|
if (kernel.getOption(vISA_IncSpillCostAllAddrTaken))
|
|
return true;
|
|
if (!addrMapsComputed) {
|
|
const_cast<PointsToAnalysis &>(liveAnalysis.getPointsToAnalysis())
|
|
.getPointsToMap(addrTakenMap);
|
|
const_cast<PointsToAnalysis &>(liveAnalysis.getPointsToAnalysis())
|
|
.getRevPointsToMap(revAddrTakenMap);
|
|
addrMapsComputed = true;
|
|
}
|
|
|
|
    // This condition is a safety measure and isn't expected to be true.
|
|
auto it = revAddrTakenMap.find(lr->getDcl());
|
|
if (it == revAddrTakenMap.end())
|
|
return true;
|
|
|
|
for (auto &addrVar : (*it).second) {
|
|
if (addrTakenMap.count(addrVar) > 1)
|
|
return true;
|
|
}
|
|
return false;
|
|
};
|
|
|
|
for (unsigned i = 0; i < numVar; i++) {
|
|
G4_Declare *dcl = lrs[i]->getDcl();
|
|
|
|
if (dcl->getIsPartialDcl()) {
|
|
continue;
|
|
}
|
|
//
|
|
// The spill cost of pseudo nodes inserted to aid generation of save/restore
|
|
    // code must be the minimum so that such nodes go to the bottom of the color
|
|
// stack.
|
|
//
|
|
if (builder.kernel.fg.isPseudoDcl(dcl)) {
|
|
if (builder.kernel.fg.isPseudoVCADcl(dcl)) {
|
|
lrs[i]->setSpillCost(MINSPILLCOST + 1);
|
|
} else {
|
|
lrs[i]->setSpillCost(MINSPILLCOST);
|
|
}
|
|
}
|
|
|
|
auto dclLR = gra.getLocalLR(dcl);
|
|
if (dclLR != NULL && dclLR->getSplit()) {
|
|
lrs[i]->setSpillCost(MINSPILLCOST + 2);
|
|
}
|
|
//
|
|
// Give the tiny spill/fill ranges an infinite spill cost, so that they are
|
|
// picked first for coloring.
|
|
// Also ARF live ranges with exclusively sequential references within the
|
|
// code are assigned an infinite spill cost as spilling them will not lower
|
|
// the register pressure in the region they are referenced. This does not
|
|
// necessarily hold for GRF live ranges are these are potentially large in
|
|
// size but the portions accessed by each sequential use are limited to 2
|
|
// registers for general instructions and 8 registers for SEND instructions.
|
|
//
|
|
else if (gra.isAddrFlagSpillDcl(dcl) || lrs[i]->isRetIp() ||
|
|
lrs[i]->getIsInfiniteSpillCost() == true ||
|
|
((lrs[i]->getVar()->isRegVarTransient() == true ||
|
|
lrs[i]->getVar()->isRegVarTmp() == true) &&
|
|
lrs[i]->getVar()->isSpilled() == false) ||
|
|
dcl == gra.getOldFPDcl() ||
|
|
(!builder.canReadR0() && dcl == builder.getBuiltinR0())) {
|
|
lrs[i]->setSpillCost(MAXSPILLCOST);
|
|
} else if (dcl->isDoNotSpill()) {
|
|
lrs[i]->setSpillCost(MAXSPILLCOST);
|
|
}
|
|
//
|
|
// Calculate spill costs of regular nodes.
|
|
//
|
|
else {
|
|
float spillCost = 0.0f;
|
|
// NOTE: Add 1 to degree to avoid divide-by-0, as a live range may have no
|
|
// neighbors
|
|
if (builder.kernel.getInt32KernelAttr(Attributes::ATTR_Target) ==
|
|
VISA_3D) {
|
|
if (useSplitLLRHeuristic) {
|
|
spillCost = 1.0f * lrs[i]->getRefCount() / (lrs[i]->getDegree() + 1);
|
|
} else {
|
|
vASSERT(lrs[i]->getDcl()->getTotalElems() > 0);
|
|
if (!liveAnalysis.livenessClass(G4_GRF) || !useNewSpillCost) {
|
|
// address or flag variables
|
|
unsigned short numRows = lrs[i]->getDcl()->getNumRows();
|
|
spillCost = 1.0f * lrs[i]->getRefCount() * lrs[i]->getRefCount() *
|
|
lrs[i]->getDcl()->getByteSize() *
|
|
(float)sqrt(lrs[i]->getDcl()->getByteSize()) /
|
|
((float)sqrt(lrs[i]->getDegree() + 1) *
|
|
(float)(sqrt(sqrt(numRows))));
|
|
} else {
|
|
// GRF variables
|
|
|
|
auto refCount = getWeightedRefCount(lrs[i]->getDcl());
|
|
spillCost = 1.0f * refCount * refCount * refCount /
|
|
((float)(lrs[i]->getDegree() + 1) *
|
|
(float)(lrs[i]->getDegree() + 1));
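            // Illustrative comparison (hypothetical numbers): a dcl with
            // weighted refCount 10 and degree 9 gets 10^3 / 10^2 = 10, while
            // one with the same refCount and degree 99 gets 10^3 / 100^2 =
            // 0.1, so heavily interfering ranges become relatively cheaper
            // to spill.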
|
|
}
|
|
}
|
|
} else {
|
|
if (!useNewSpillCost) {
|
|
spillCost = liveAnalysis.livenessClass(G4_GRF)
|
|
? lrs[i]->getDegree()
|
|
: 1.0f * lrs[i]->getRefCount() *
|
|
lrs[i]->getRefCount() /
|
|
(lrs[i]->getDegree() + 1);
|
|
} else {
|
|
auto refCount = getWeightedRefCount(lrs[i]->getDcl());
|
|
spillCost = 1.0f * refCount * refCount * refCount /
|
|
((float)(lrs[i]->getDegree() + 1) *
|
|
(float)(lrs[i]->getDegree() + 1));
|
|
}
|
|
}
|
|
|
|
lrs[i]->setSpillCost(spillCost);
|
|
// Track address sensitive live range.
|
|
if (liveAnalysis.isAddressSensitive(i) && incSpillCostCandidate(lrs[i])) {
|
|
addressSensitiveVars.push_back(lrs[i]);
|
|
} else {
|
|
// Set the spill cost of all other normal live ranges, and
|
|
// track the max normal cost.
|
|
if (maxNormalCost < spillCost) {
|
|
maxNormalCost = spillCost;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
//
|
|
// Set the spill cost of address sensitive live ranges above all the
|
|
// normal live ranges, so that they get colored before all the normal
|
|
// live ranges.
|
|
//
|
|
for (LiveRange *lr : addressSensitiveVars) {
|
|
if (lr->getSpillCost() != MAXSPILLCOST) {
|
|
lr->setSpillCost(maxNormalCost + lr->getSpillCost());
|
|
}
|
|
}
|
|
}
|
|
|
|
//
|
|
// subtract lr's neighbors that are still in work list
|
|
//
|
|
void GraphColor::relaxNeighborDegreeGRF(LiveRange *lr) {
|
|
if (lr->getIsPseudoNode() || lr->getIsPartialDcl())
|
|
return;
|
|
|
|
unsigned lr_id = lr->getVar()->getId();
|
|
unsigned lr2_nreg = lr->getNumRegNeeded();
|
|
|
|
const std::vector<unsigned> &intfs = intf.getSparseIntfForVar(lr_id);
|
|
if (gra.use4GRFAlign) {
|
|
unsigned int lr2AugAlign = gra.getAugAlign(lr->getDcl());
|
|
for (auto it : intfs) {
|
|
LiveRange *lr1 = lrs[it];
|
|
if (lr1->getActive() && !lr1->getIsPseudoNode() &&
|
|
!(lr1->getIsPartialDcl())) {
|
|
unsigned lr1_nreg = lr1->getNumRegNeeded();
|
|
unsigned int lr1AugAlign = gra.getAugAlign(lr1->getDcl());
|
|
auto w =
|
|
edgeWeightWith4GRF(lr1AugAlign, lr2AugAlign, lr1_nreg, lr2_nreg);
|
|
relax(lr1, w);
|
|
}
|
|
}
|
|
return;
|
|
}
|
|
|
|
// Handle case where 4GRF align is unsupported
|
|
bool lr2EvenAlign = gra.isEvenAligned(lr->getDcl());
|
|
for (auto it : intfs) {
|
|
LiveRange *lr1 = lrs[it];
|
|
if (lr1->getActive() && !lr1->getIsPseudoNode() &&
|
|
!(lr1->getIsPartialDcl())) {
|
|
unsigned lr1_nreg = lr1->getNumRegNeeded();
|
|
unsigned w = 0;
|
|
bool lr1EvenAlign = gra.isEvenAligned(lr1->getDcl());
|
|
w = edgeWeightGRF(lr1EvenAlign, lr2EvenAlign, lr1_nreg, lr2_nreg);
|
|
relax(lr1, w);
|
|
}
|
|
}
|
|
|
|
// Weak edges are supported only when 4GRF align is unsupported
|
|
auto *weakEdges = intf.getCompatibleSparseIntf(lr->getDcl());
|
|
if (weakEdges) {
|
|
for (auto weakNeighbor : *weakEdges) {
|
|
if (!weakNeighbor->getRegVar()->isRegAllocPartaker())
|
|
continue;
|
|
auto lr1 = lrs[weakNeighbor->getRegVar()->getId()];
|
|
if (lr1->getActive() && !lr1->getIsPseudoNode() &&
|
|
!(lr1->getIsPartialDcl())) {
|
|
unsigned lr1_nreg = lr1->getNumRegNeeded();
|
|
bool lr1EvenAlign = gra.isEvenAligned(lr1->getDcl());
|
|
auto w = edgeWeightGRF(lr1EvenAlign, lr2EvenAlign, lr1_nreg, lr2_nreg);
|
|
relax(lr1, w);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void GraphColor::relaxNeighborDegreeARF(LiveRange *lr) {
|
|
if (!(lr->getIsPseudoNode())) {
|
|
unsigned lr_id = lr->getVar()->getId();
|
|
const std::vector<unsigned> &intfs = intf.getSparseIntfForVar(lr_id);
|
|
for (auto it : intfs) {
|
|
LiveRange *lrs_it = lrs[it];
|
|
|
|
if (lrs_it->getActive() && !lrs_it->getIsPseudoNode()) {
|
|
unsigned w = edgeWeightARF(lrs_it, lr);
|
|
VISA_DEBUG_VERBOSE({
|
|
std::cout << "\t relax ";
|
|
lrs_it->dump();
|
|
std::cout << " degree(" << lrs_it->getDegree() << ") - " << w << "\n";
|
|
});
|
|
lrs_it->subtractDegree(w);
|
|
|
|
unsigned availColor = numColor;
|
|
|
|
if (lrs_it->getDegree() + lrs_it->getNumRegNeeded() <= availColor) {
|
|
unconstrainedWorklist.push_back(lrs_it);
|
|
lrs_it->setActive(false);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
static bool compareSpillCost(LiveRange *lr1, LiveRange *lr2) {
|
|
return lr1->getSpillCost() < lr2->getSpillCost() ||
|
|
(lr1->getSpillCost() == lr2->getSpillCost() &&
|
|
lr1->getVar()->getId() < lr2->getVar()->getId());
|
|
}
|
|
|
|
//
|
|
// All nodes in the work list are constrained (degree > max color).
// Find one constrained node and move it to the order list.
|
|
//
|
|
void GraphColor::removeConstrained() {
|
|
if (!constrainedWorklist.empty()) {
|
|
LiveRange *lr = constrainedWorklist.front();
|
|
constrainedWorklist.pop_front();
|
|
|
|
if (lr->getActive()) {
|
|
VISA_DEBUG_VERBOSE({
|
|
std::cout << ".... Remove Constrained ";
|
|
lr->dump();
|
|
std::cout << "\n";
|
|
});
|
|
|
|
if (liveAnalysis.livenessClass(G4_GRF)) {
|
|
relaxNeighborDegreeGRF(lr);
|
|
} else {
|
|
relaxNeighborDegreeARF(lr);
|
|
}
|
|
colorOrder.push_back(lr);
|
|
lr->setActive(false);
|
|
}
|
|
}
|
|
}
|
|
|
|
void GraphColor::determineColorOrdering() {
|
|
numColor = 0;
|
|
if (liveAnalysis.livenessClass(G4_GRF))
|
|
numColor = totalGRFRegCount - reserveSpillGRFCount;
|
|
else if (liveAnalysis.livenessClass(G4_ADDRESS))
|
|
numColor = builder.getNumAddrRegisters();
|
|
else if (liveAnalysis.livenessClass(G4_FLAG))
|
|
numColor = builder.getNumFlagRegisters();
|
|
|
|
unsigned numUnassignedVar = liveAnalysis.getNumUnassignedVar();
|
|
|
|
//
|
|
// create an array for sorting live ranges
|
|
//
|
|
LiveRangeVec sorted;
|
|
sorted.reserve(numUnassignedVar);
|
|
unsigned j = 0;
|
|
for (unsigned i = 0; i < numVar; i++) {
|
|
if (lrs[i]->getPhyReg() == nullptr && !lrs[i]->getIsPartialDcl()) {
|
|
sorted.push_back(lrs[i]);
|
|
j++;
|
|
}
|
|
}
|
|
|
|
if (gra.incRA.isEnabledWithVerification(kernel)) {
|
|
gra.incRA.computeLeftOverUnassigned(sorted, liveAnalysis);
|
|
}
|
|
|
|
vISA_ASSERT(j == numUnassignedVar, ERROR_GRAPHCOLOR);
|
|
|
|
//
|
|
// sort the live range array
|
|
//
|
|
std::sort(sorted.begin(), sorted.end(), compareSpillCost);
|
|
  // This will not change the order unless SPGSS is turned on
|
|
builder.getFreqInfoManager().sortBasedOnFreq(sorted);
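  // Briggs-style test below: a live range whose degree plus the registers it
  // needs fits within the colors left after removing its forbidden ones is
  // always colorable, so it goes on the unconstrained worklist; everything
  // else starts out constrained.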
|
|
|
|
for (unsigned i = 0; i < numUnassignedVar; i++) {
|
|
LiveRange *lr = sorted[i];
|
|
unsigned availColor = numColor;
|
|
availColor = numColor - lr->getNumForbidden();
|
|
|
|
if (lr->getDegree() + lr->getNumRegNeeded() <= availColor) {
|
|
unconstrainedWorklist.push_back(lr);
|
|
lr->setActive(false);
|
|
if (lr->getRegKind() == G4_GRF) {
|
|
// Mark current lr as unconstrained, which means RR algorithm can always
|
|
// be applied to the variable.
|
|
lr->setUnconstrained(true);
|
|
}
|
|
} else {
|
|
constrainedWorklist.push_back(lr);
|
|
lr->setActive(true);
|
|
}
|
|
}
|
|
|
|
VISA_DEBUG_VERBOSE({
|
|
std::cout << "\nSPILL COST\n";
|
|
for (unsigned i = 0; i < numUnassignedVar; i++) {
|
|
sorted[i]->dump();
|
|
std::cout << "\t spillCost=" << sorted[i]->getSpillCost();
|
|
std::cout << "\t degree=" << sorted[i]->getDegree();
|
|
std::cout << "\t refCnt=" << sorted[i]->getRefCount();
|
|
std::cout << "\t size=" << sorted[i]->getDcl()->getByteSize();
|
|
std::cout << "\t active=" << sorted[i]->getActive();
|
|
std::cout << "\n";
|
|
}
|
|
std::cout << "\n";
|
|
});
|
|
|
|
while (!constrainedWorklist.empty() || !unconstrainedWorklist.empty()) {
|
|
while (!unconstrainedWorklist.empty()) {
|
|
LiveRange *lr = unconstrainedWorklist.front();
|
|
unconstrainedWorklist.pop_front();
|
|
|
|
VISA_DEBUG_VERBOSE({
|
|
std::cout << ".... Remove Unconstrained ";
|
|
lr->dump();
|
|
std::cout << "\n";
|
|
});
|
|
|
|
if (liveAnalysis.livenessClass(G4_GRF)) {
|
|
relaxNeighborDegreeGRF(lr);
|
|
} else {
|
|
relaxNeighborDegreeARF(lr);
|
|
}
|
|
colorOrder.push_back(lr);
|
|
}
|
|
|
|
removeConstrained();
|
|
}
|
|
}
|
|
|
|
void PhyRegUsage::updateRegUsage(LiveRange *lr) {
|
|
G4_Declare *dcl = lr->getDcl();
|
|
G4_VarBase *pr;
|
|
if (lr->getIsPartialDcl()) {
|
|
pr = lrs[lr->getParentLRID()]->getPhyReg();
|
|
} else {
|
|
pr = lr->getPhyReg();
|
|
}
|
|
|
|
if (!pr) {
|
|
return;
|
|
}
|
|
if (pr->isGreg()) {
|
|
if (dcl->getIsPartialDcl()) {
|
|
// Assumptions:
|
|
// 1. the offset of the sub declare must be G4_WSIZE aligned
|
|
// 2. the size of the subdeclare must be G4_WSIZE aligned
|
|
markBusyForDclSplit(G4_GRF, ((G4_Greg *)pr)->getRegNum(),
|
|
(lrs[lr->getParentLRID()]->getPhyRegOff() *
|
|
TypeSize(dcl->getElemType()) +
|
|
gra.getSubOffset(dcl)) /
|
|
G4_WSIZE,
|
|
dcl->getByteSize() / G4_WSIZE, dcl->getNumRows());
|
|
} else {
|
|
markBusyGRF(
|
|
((G4_Greg *)pr)->getRegNum(),
|
|
PhyRegUsage::offsetAllocUnit(lr->getPhyRegOff(), dcl->getElemType()),
|
|
dcl->getWordSize(), lr->getNumRegNeeded(), dcl->isPreDefinedVar());
|
|
}
|
|
} else if (pr->isFlag()) {
|
|
auto flagWordOffset = lr->getPhyReg()->asAreg()->getFlagNum() * 2;
|
|
markBusyFlag(
|
|
0,
|
|
PhyRegUsage::offsetAllocUnit(flagWordOffset + lr->getPhyRegOff(),
|
|
dcl->getElemType()),
|
|
PhyRegUsage::numAllocUnit(dcl->getNumElems(), dcl->getElemType()),
|
|
dcl->getNumRows());
|
|
} else if (pr->isA0()) {
|
|
markBusyAddress(
|
|
0, PhyRegUsage::offsetAllocUnit(lr->getPhyRegOff(), dcl->getElemType()),
|
|
PhyRegUsage::numAllocUnit(dcl->getNumElems(), dcl->getElemType()),
|
|
dcl->getNumRows());
|
|
}
|
|
else if (pr->isS0()) {
|
|
markBusyScalar(
|
|
0, PhyRegUsage::offsetAllocUnit(lr->getPhyRegOff(), dcl->getElemType()),
|
|
PhyRegUsage::numAllocUnit(dcl->getNumElems(), dcl->getElemType()),
|
|
dcl->getNumRows());
|
|
}
|
|
else {
|
|
vISA_ASSERT(false, ERROR_GRAPHCOLOR); // un-handled reg type
|
|
}
|
|
}
|
|
|
|
bool GraphColor::assignColors(ColorHeuristic colorHeuristicGRF,
|
|
bool doBankConflict, bool highInternalConflict,
|
|
bool doBundleConflict) {
|
|
RA_TRACE(std::cout << "\t--"
|
|
<< (colorHeuristicGRF == ROUND_ROBIN ? "round-robin"
|
|
: "first-fit")
|
|
<< (doBankConflict ? " BCR" : "") << " graph coloring\n");
|
|
|
|
unsigned bank1_end = 0;
|
|
unsigned bank2_end = totalGRFRegCount - 1;
|
|
unsigned bank1_start = 0;
|
|
unsigned bank2_start = totalGRFRegCount - 1;
|
|
unsigned totalGRFNum = kernel.getNumRegTotal();
|
|
bool oneGRFBankDivision = gra.kernel.fg.builder->oneGRFBankDivision();
|
|
bool allocFromBanks =
|
|
liveAnalysis.livenessClass(G4_GRF) && builder.lowHighBundle() &&
|
|
!builder.getOptions()->getuInt32Option(vISA_ReservedGRFNum) &&
|
|
doBankConflict &&
|
|
((oneGRFBankDivision && gra.kernel.getSimdSize() >= g4::SIMD16) ||
|
|
(!oneGRFBankDivision && highInternalConflict));
|
|
|
|
if (allocFromBanks && (colorHeuristicGRF == ROUND_ROBIN)) {
|
|
bank1_end = (unsigned)((totalGRFRegCount - 1) *
|
|
(((float)evenTotalDegree / evenTotalRegNum) /
|
|
(((float)evenTotalDegree / evenTotalRegNum) +
|
|
((float)oddTotalDegree / oddTotalRegNum))));
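    // The GRF file is split between the two banks in proportion to each
    // bank's interference pressure per register (total degree / total regs).
    // For example (hypothetical numbers), if the even ratio is 3.0 and the
    // odd ratio is 1.0, bank1 gets roughly 3/4 of the registers.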
|
|
if (bank1_end < evenMaxRegNum ||
|
|
totalGRFRegCount - bank1_end < oddMaxRegNum ||
|
|
bank1_end == totalGRFRegCount - 1 || bank1_end == 0) {
|
|
// FIXME: How can we early return without assigning???
|
|
return false;
|
|
}
|
|
|
|
bank2_end = bank1_end + 1;
|
|
}
|
|
|
|
G4_RegFileKind rFile = G4_GRF;
|
|
if (liveAnalysis.livenessClass(G4_FLAG))
|
|
rFile = G4_FLAG;
|
|
else if (liveAnalysis.livenessClass(G4_ADDRESS))
|
|
rFile = G4_ADDRESS;
|
|
else if (liveAnalysis.livenessClass(G4_SCALAR))
|
|
rFile = G4_SCALAR;
|
|
|
|
FreePhyRegs FPR(kernel);
|
|
|
|
unsigned maxGRFCanBeUsed = totalGRFRegCount;
|
|
// FIXME: the bank configs should be computed in PhyRegAllocationState instead
|
|
  // of passed in, but the strange early return from above prevents this.
|
|
PhyRegAllocationState parms(gra, lrs, rFile, maxGRFCanBeUsed, bank1_start,
|
|
bank1_end, bank2_start, bank2_end, doBankConflict,
|
|
doBundleConflict);
|
|
bool noIndirForceSpills = builder.getOption(vISA_NoIndirectForceSpills);
|
|
|
|
// Returns true when valid assignment is found or when lr is added to spilled
|
|
// set. Adding to spill set happens only if heuristic is not round_robin (FF
|
|
// may not spill). Parameter returnFalseOnFail is set when the function is
|
|
// required to return false on assignment failure. When parameter spillAllowed
|
|
// is set to true, this function adds lr to spilled set. If spillAllowed is
|
|
// false, the lr is not added to spill set. This logic is useful to try
|
|
// re-allocation of a child/parent dcl when split is enabled.
|
|
// ignoreChildrenIntf is set to true when all children are assigned to
|
|
// consecutive ranges and we want to get fully coalesceable assignment for
|
|
// parent. In such circumstance, we don't want to account for interference
|
|
// between parent/child since doing so cannot result in a coalesceable
|
|
// assignment.
|
|
auto assignColor = [&](LiveRange *lr) {
|
|
auto lrVar = lr->getVar();
|
|
|
|
//
|
|
// assign register to live ranges
|
|
//
|
|
if (lr->getPhyReg() == NULL && !lrVar->isSpilled() &&
|
|
!lr->getIsPartialDcl()) // no assigned register yet and not spilled
|
|
{
|
|
unsigned lr_id = lrVar->getId();
|
|
//
|
|
// compute what registers are already assigned
|
|
//
|
|
PhyRegUsage regUsage(parms, FPR);
|
|
|
|
const std::vector<unsigned> &intfs = intf.getSparseIntfForVar(lr_id);
|
|
auto weakEdgeSet =
|
|
intf.getCompatibleSparseIntf(lrVar->getDeclare()->getRootDeclare());
|
|
for (auto it : intfs) {
|
|
LiveRange *lrTemp = lrs[it];
|
|
if (lrTemp->getPhyReg() != nullptr || lrTemp->getIsPartialDcl()) {
|
|
if (lrTemp->getIsSplittedDcl()) {
|
|
// Only interfere with children declares
|
|
continue;
|
|
}
|
|
|
|
regUsage.updateRegUsage(lrTemp);
|
|
}
|
|
}
|
|
|
|
if (weakEdgeSet) {
|
|
regUsage.runOverlapTest(true);
|
|
for (auto weakDcl : *weakEdgeSet) {
|
|
auto regVar = weakDcl->getRootDeclare()->getRegVar();
|
|
unsigned pvar = 0, numRegs = 0;
|
|
if (regVar->isPhyRegAssigned()) {
|
|
// This branch will be taken for dcls assigned
|
|
// regs by LRA.
|
|
pvar = regVar->getPhyReg()->asGreg()->getRegNum();
|
|
numRegs = weakDcl->getNumRows();
|
|
} else {
|
|
// For dcls not assigned regs by LRA, lookup temp
|
|
// registers assigned to LiveRange instances.
|
|
auto id = regVar->getId();
|
|
auto lr = lrs[id];
|
|
auto phyReg = lr->getPhyReg();
|
|
if (phyReg) {
|
|
pvar = phyReg->asGreg()->getRegNum();
|
|
numRegs = weakDcl->getNumRows();
|
|
}
|
|
}
|
|
|
|
// For now it is assumed only 8-byte types will appear
|
|
          // here. If other sized types can also appear, then the
          // augmentation mask also needs to be sent in the
          // weak edge data structure below.
|
|
for (unsigned r = pvar; r < (pvar + numRegs); r++) {
|
|
auto use = regUsage.getWeakEdgeUse(r);
|
|
if (use == 0 || use == (r - pvar + 1)) {
|
|
regUsage.setWeakEdgeUse(r, r - pvar + 1);
|
|
} else {
|
|
              // Indicates that two neighbors use a physical
              // register with different overlaps.
|
|
regUsage.setWeakEdgeUse(r, 0xff);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
ColorHeuristic heuristic = colorHeuristicGRF;
|
|
|
|
bool failed_alloc = false;
|
|
G4_Declare *dcl = lrVar->getDeclare();
|
|
|
|
if (!(noIndirForceSpills && liveAnalysis.isAddressSensitive(lr_id)) &&
|
|
forceSpill &&
|
|
(dcl->getRegFile() == G4_GRF || dcl->getRegFile() == G4_FLAG) &&
|
|
lr->getRefCount() != 0 && lr->getSpillCost() != MAXSPILLCOST) {
|
|
failed_alloc = true;
|
|
}
|
|
|
|
if ((dcl->getNumRows() > totalGRFNum) ||
|
|
(dcl->isForceSpilled() && (lr->getSpillCost() != MAXSPILLCOST))) {
|
|
// we sure as hell won't get an assignment
|
|
failed_alloc = true;
|
|
}
|
|
|
|
if (kernel.getOption(vISA_GCRRInFF)) {
|
|
if (lr->getRegKind() != G4_GRF) {
|
|
          // Non-GRF assignment, keep the single FF or RR algorithm
|
|
if (heuristic == FIRST_FIT) {
|
|
parms.setStartGRF(0);
|
|
}
|
|
} else if (heuristic == FIRST_FIT && !lr->getIsUnconstrained()) {
|
|
          // GRF assignment, start GRF is always 0 if the first-fit algorithm
          // is used and the variable is constrained
|
|
parms.setStartGRF(0);
|
|
}
|
|
}
|
|
|
|
if (!failed_alloc) {
|
|
// When evenAlignNeeded is true, it is binding for correctness
|
|
bool evenAlignNeeded = gra.isEvenAligned(lrVar->getDeclare());
|
|
bool quadAlignNeeded = gra.isQuadAligned(lrVar->getDeclare());
|
|
BankAlign align = BankAlign::Either;
|
|
if (quadAlignNeeded)
|
|
align = BankAlign::QuadGRF;
|
|
else if (evenAlignNeeded)
|
|
align = BankAlign::Even;
|
|
|
|
if (allocFromBanks) {
|
|
vISA_ASSERT(align != BankAlign::QuadGRF, "unexpected value");
|
|
if (!isHybrid && oneGRFBankDivision &&
|
|
(!evenAlignNeeded ||
|
|
builder.getPlatformGeneration() == PlatformGen::GEN9)) {
|
|
gra.getBankAlignment(lr, align);
|
|
}
|
|
failed_alloc |= !regUsage.assignGRFRegsFromBanks(
|
|
lr, align, lr->getForbidden(), heuristic, oneGRFBankDivision);
|
|
} else {
|
|
failed_alloc |= !regUsage.assignRegs(
|
|
highInternalConflict, lr, lr->getForbidden(), align,
|
|
gra.getSubRegAlign(lrVar->getDeclare()), heuristic,
|
|
lr->getSpillCost());
|
|
}
|
|
}
|
|
|
|
//
|
|
// assign unused color
|
|
//
|
|
if (failed_alloc) {
|
|
//
|
|
// for GRF register assignment, if we are performing round-robin (1st
|
|
// pass) then abort on spill
|
|
//
|
|
if ((heuristic == ROUND_ROBIN ||
|
|
(doBankConflict && !kernel.getOption(vISA_forceBCR))) &&
|
|
(lr->getRegKind() == G4_GRF || lr->getRegKind() == G4_FLAG)) {
|
|
return false;
|
|
} else if (kernel.fg.isPseudoDcl(dcl)) {
|
|
// these pseudo dcls are not (and cannot be) spilled, but instead
|
|
// save/restore code will be inserted in stack call prolog/epilog
|
|
} else {
|
|
// for first-fit register assignment track spilled live ranges
|
|
spilledLRs.push_back(lr);
|
|
lr->setSpilled(true);
|
|
}
|
|
}
|
|
}
|
|
VISA_DEBUG_VERBOSE({
|
|
lr->dump();
|
|
std::cout << "\n";
|
|
});
|
|
return true;
|
|
};
|
|
|
|
// colorOrder is in reverse order (unconstrained at front)
|
|
for (auto iter = colorOrder.rbegin(), iterEnd = colorOrder.rend();
|
|
iter != iterEnd; ++iter) {
|
|
auto lr = (*iter);
|
|
|
|
// in case child/parent was already spilled earlier, don't recolor
|
|
if (lr->isSpilled())
|
|
continue;
|
|
|
|
bool ret = assignColor(lr);
|
|
|
|
// early exit
|
|
if (!ret)
|
|
return false;
|
|
}
|
|
|
|
if (failSafeIter) {
|
|
// As per spec, EOT has to be allocated to r112+.
|
|
// When fail safe iteration is run, upper GRFs are
|
|
    // reserved. It's possible that the # of reserved
    // GRFs is too large and the r112+ allocation restriction
    // on EOT cannot be fulfilled (e.g., r116-r127 are reserved
    // and the EOT src operand size is 8 GRFs). This causes the EOT var
|
|
// to spill and then the spill range faces the same
|
|
// restriction. The fix here is to check whether
|
|
// reserved GRF restriction can be eased for EOT.
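    // For example (numbers illustrative): with 128 GRFs total and 12 reserved
    // (r116-r127), an EOT variable needing 8 GRFs can still be placed at
    // r120-r127, which satisfies the r112+ restriction, so it is assigned
    // directly below instead of being spilled.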
|
|
auto hasSpilledNeighbor = [&](unsigned int id) {
|
|
for (const auto *spillLR : spilledLRs) {
|
|
if (id != spillLR->getVar()->getId() &&
|
|
getIntf()->interfereBetween(id, spillLR->getVar()->getId()))
|
|
return true;
|
|
}
|
|
return false;
|
|
};
|
|
|
|
if (gra.useHybridRAwithSpill) {
|
|
// This local analysis is skipped in favor of
|
|
// compile time in global RA loop, so run it here
|
|
// when needed.
|
|
gra.markGraphBlockLocalVars();
|
|
}
|
|
|
|
for (auto lrIt = spilledLRs.begin(); lrIt != spilledLRs.end(); ++lrIt) {
|
|
auto lr = (*lrIt);
|
|
bool needsEOTGRF = lr->getEOTSrc() && builder.hasEOTGRFBinding();
|
|
if (needsEOTGRF && gra.isBlockLocal(lr->getDcl()) &&
|
|
(totalGRFRegCount - reserveSpillGRFCount + lr->getNumRegNeeded()) <=
|
|
kernel.getNumRegTotal() &&
|
|
!hasSpilledNeighbor(lr->getVar()->getId())) {
|
|
// Following conditions true:
|
|
// 1. EOT range spilled that needs r112-r127 assignment,
|
|
// 2. Variable is local to a BB,
|
|
// 3. Reserved GRF start + # EOT GRFs fits within total GRFs,
|
|
// 4. Has no spilled neighbor
|
|
//
|
|
// This makes it safe to directly assign a reserved GRF to this
|
|
// variable than spill it.
|
|
lr->setPhyReg(builder.phyregpool.getGreg(kernel.getNumRegTotal() -
|
|
lr->getNumRegNeeded()),
|
|
0);
|
|
spilledLRs.erase(lrIt);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
// record RA type
|
|
if (liveAnalysis.livenessClass(G4_GRF)) {
|
|
if (colorHeuristicGRF == ROUND_ROBIN) {
|
|
kernel.setRAType(doBankConflict ? RA_Type::GRAPH_COLORING_RR_BC_RA
|
|
: RA_Type::GRAPH_COLORING_RR_RA);
|
|
} else {
|
|
kernel.setRAType(doBankConflict ? RA_Type::GRAPH_COLORING_FF_BC_RA
|
|
: RA_Type::GRAPH_COLORING_FF_RA);
|
|
}
|
|
}
|
|
|
|
#ifdef _DEBUG
|
|
// Verify that spilledLRs has no duplicate
|
|
for (auto item : spilledLRs) {
|
|
unsigned count = 0;
|
|
for (auto checkItem : spilledLRs) {
|
|
if (checkItem == item) {
|
|
vISA_ASSERT(count == 0, "Duplicate entry found in spilledLRs");
|
|
count++;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Verify that none of spilledLRs have an allocation
|
|
for (auto lr : spilledLRs) {
|
|
vISA_ASSERT(lr->getPhyReg() == nullptr,
|
|
"Spilled LR contains valid allocation");
|
|
}
|
|
|
|
// Verify that all spilled LRs are synced
|
|
for (auto lr : spilledLRs) {
|
|
vISA_ASSERT(lr->isSpilled(),
|
|
"LR not marked as spilled, but inserted in spilledLRs list");
|
|
}
|
|
|
|
// Verify if all LRs have either an allocation or are spilled
|
|
for (auto lr : colorOrder) {
|
|
if (!kernel.fg.isPseudoDcl(lr->getDcl())) {
|
|
vISA_ASSERT(lr->isSpilled() || lr->getPhyReg() ||
|
|
lr->getDcl()->isSpilled(),
|
|
"Range without allocation and not spilled");
|
|
}
|
|
}
|
|
#endif
|
|
|
|
return true;
|
|
}
|
|
|
|
template <class REGION_TYPE>
|
|
unsigned GlobalRA::getRegionDisp(REGION_TYPE *region, const IR_Builder &irb) {
|
|
unsigned rowOffset = irb.numEltPerGRF<Type_UB>() * region->getRegOff();
|
|
unsigned columnOffset = region->getSubRegOff() * region->getElemSize();
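  // E.g. (assuming 32-byte GRFs): regOff = 1, subRegOff = 3, elemSize = 4
  // gives disp = 32 + 12 = 44 bytes from the start of the variable.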
|
|
return rowOffset + columnOffset;
|
|
}
|
|
|
|
void GlobalRA::addEUFusionCallWAInst(G4_INST *inst) {
|
|
if (EUFusionCallWANeeded())
|
|
EUFusionCallWAInsts.insert(inst);
|
|
}
|
|
|
|
void GlobalRA::addEUFusionNoMaskWAInst(G4_BB *BB, G4_INST *Inst) {
|
|
if (EUFusionNoMaskWANeeded() && (BB->getBBType() & G4_BB_NM_WA_TYPE) != 0) {
|
|
EUFusionNoMaskWAInsts.insert(Inst);
|
|
Inst->setNeedPostRA(true);
|
|
}
|
|
}
|
|
|
|
void GlobalRA::removeEUFusionNoMaskWAInst(G4_INST *Inst) {
|
|
if (EUFusionNoMaskWANeeded()) {
|
|
if (EUFusionNoMaskWAInsts.erase(Inst) > 0) {
|
|
Inst->setNeedPostRA(false);
|
|
}
|
|
}
|
|
}
|
|
|
|
unsigned GlobalRA::getRegionByteSize(G4_DstRegRegion *region,
|
|
unsigned execSize) {
|
|
unsigned size =
|
|
region->getHorzStride() * region->getElemSize() * (execSize - 1) +
|
|
region->getElemSize();
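  // E.g. (illustrative): execSize = 8, elemSize = 4, horzStride = 2
  // gives size = 2 * 4 * (8 - 1) + 4 = 60 bytes.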
|
|
|
|
return size;
|
|
}
|
|
|
|
#define OWORD_BYTE_SIZE 16
|
|
|
|
template <class REGION_TYPE>
|
|
bool GlobalRA::isUnalignedRegion(REGION_TYPE *region, unsigned execSize) {
|
|
unsigned regionDisp = getRegionDisp(region, builder);
|
|
unsigned regionByteSize = getRegionByteSize(region, execSize);
|
|
|
|
if (regionDisp % kernel.numEltPerGRF<Type_UB>() == 0 &&
|
|
regionByteSize % kernel.numEltPerGRF<Type_UB>() == 0) {
|
|
return regionByteSize / kernel.numEltPerGRF<Type_UB>() != 1 &&
|
|
regionByteSize / kernel.numEltPerGRF<Type_UB>() != 2 &&
|
|
regionByteSize / kernel.numEltPerGRF<Type_UB>() != 4;
|
|
}
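  // Anything else (a region that does not start on a GRF boundary, is not a
  // whole multiple of GRFs, or spans 3 or more than 4 GRFs) is treated as
  // unaligned.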
|
|
return true;
|
|
}
|
|
|
|
bool GlobalRA::shouldPreloadDst(G4_INST *instContext, G4_BB *curBB) {
|
|
// Check for partial and unaligned regions and add pre-load code, if
|
|
// necessary.
|
|
auto spilledRangeRegion = instContext->getDst();
|
|
uint8_t execSize = instContext->getExecSize();
|
|
|
|
if (isPartialRegion(spilledRangeRegion, execSize) ||
|
|
isUnalignedRegion(spilledRangeRegion, execSize) ||
|
|
instContext->isPartialWriteForSpill(!curBB->isAllLaneActive(),
|
|
useLscForNonStackCallSpillFill)) {
|
|
return true;
|
|
}
|
|
// No pre-load for whole and aligned region writes
|
|
else {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
bool GlobalRA::livenessCandidate(const G4_Declare *decl) const {
|
|
if (decl->getAliasDeclare()) {
|
|
return false;
|
|
}
|
|
|
|
if ((G4_GRF & decl->getRegFile())) {
|
|
if ((decl->getRegFile() & G4_INPUT) &&
|
|
decl->getRegVar()->isPhyRegAssigned() && !decl->getRegVar()->isGreg()) {
|
|
return false;
|
|
}
|
|
if (decl->getByteSize() == 0) {
|
|
      // regrettably, this can happen for the arg/retval pre-defined variables
|
|
return false;
|
|
}
|
|
return true;
|
|
} else {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
void GlobalRA::determineSpillRegSize(unsigned &spillRegSize,
|
|
unsigned &indrSpillRegSize) {
|
|
// Iterate over all BBs
|
|
for (auto curBB : kernel.fg) {
|
|
// Iterate over all insts
|
|
for (INST_LIST_ITER inst_it = curBB->begin(), iend = curBB->end();
|
|
inst_it != iend; ++inst_it) {
|
|
unsigned currentSpillRegSize = 0;
|
|
unsigned currentIndrSpillRegSize = 0;
|
|
|
|
G4_INST *curInst = (*inst_it);
|
|
|
|
if (curInst->isPseudoKill() || curInst->isLifeTimeEnd() ||
|
|
curInst->opcode() == G4_pseudo_fcall ||
|
|
curInst->opcode() == G4_pseudo_fret) {
|
|
continue;
|
|
}
|
|
|
|
if (curInst->isSend()) {
|
|
G4_SendDesc *msgDesc = curInst->getMsgDesc();
|
|
|
|
unsigned dstSpillRegSize = 0;
|
|
dstSpillRegSize = msgDesc->getDstLenRegs();
|
|
|
|
unsigned src0FillRegSize = 0;
|
|
src0FillRegSize = msgDesc->getSrc0LenRegs();
|
|
|
|
unsigned src1FillRegSize = 0;
|
|
if (curInst->isSplitSend()) {
|
|
src1FillRegSize = msgDesc->getSrc1LenRegs();
|
|
}
|
|
|
|
if (!kernel.fg.builder->useSends()) {
|
|
dstSpillRegSize++;
|
|
}
|
|
|
|
currentSpillRegSize =
|
|
dstSpillRegSize + src0FillRegSize + src1FillRegSize;
|
|
} else if (curInst->isDpas()) {
|
|
unsigned dstSpillRegSize = 0;
|
|
G4_DstRegRegion *dst = curInst->getDst();
|
|
if (dst && dst->getBase()->isRegVar()) {
|
|
dstSpillRegSize =
|
|
dst->getBase()->asRegVar()->getDeclare()->getNumRows();
|
|
}
|
|
|
|
unsigned srcFillRegSize = 0;
|
|
for (int i = 0, srcNum = curInst->getNumSrc(); i < srcNum; i++) {
|
|
G4_Operand *src = curInst->getSrc(i);
|
|
|
|
if (src && src->isSrcRegRegion() &&
|
|
src->asSrcRegRegion()->getBase()->isRegVar()) {
|
|
if (src->asSrcRegRegion()
|
|
->getBase()
|
|
->asRegVar()
|
|
->getDeclare()
|
|
->getRegFile() == G4_GRF) {
|
|
unsigned srcSize =
|
|
src->getBase()->asRegVar()->getDeclare()->getNumRows();
|
|
              // FIXME: currently we only use the max src size.
              // To save spill registers, it would be better to determine
              // the space by checking whether the variable is really
              // spilled or not.
|
|
srcFillRegSize += srcSize;
|
|
}
|
|
}
|
|
}
|
|
currentSpillRegSize = srcFillRegSize + dstSpillRegSize;
|
|
} else {
|
|
ORG_REGVAR_VECTOR indrVars;
|
|
|
|
unsigned dstSpillRegSize = 0;
|
|
unsigned indrDstSpillRegSize = 0;
|
|
if (G4_Inst_Table[curInst->opcode()].n_dst == 1) {
|
|
G4_DstRegRegion *dst = curInst->getDst();
|
|
|
|
if (dst && dst->getBase()->isRegVar()) {
|
|
if (dst->getBase()->asRegVar()->getDeclare()->getRegFile() ==
|
|
G4_GRF) {
|
|
if (dst->isCrossGRFDst(builder)) {
|
|
dstSpillRegSize = 2;
|
|
} else {
|
|
dstSpillRegSize = 1;
|
|
}
|
|
|
|
if (shouldPreloadDst(curInst, curBB)) {
|
|
dstSpillRegSize *= 3;
|
|
} else {
|
|
dstSpillRegSize *= 2;
|
|
}
|
|
|
|
if (!kernel.fg.builder->useSends()) {
|
|
dstSpillRegSize++;
|
|
}
|
|
} else if (dst->getRegAccess() == IndirGRF) {
|
|
auto pointsToSet =
|
|
pointsToAnalysis.getAllInPointsTo(dst->getBase()->asRegVar());
|
|
if (pointsToSet != nullptr) {
|
|
for (const auto& pt : *pointsToSet) {
|
|
if (pt.var->isRegAllocPartaker() ||
|
|
((useFastRA || useHybridRAwithSpill) &&
|
|
livenessCandidate(pt.var->getDeclare()))) {
|
|
indrVars.push_back(pt.var);
|
|
indrDstSpillRegSize += pt.var->getDeclare()->getNumRows();
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
unsigned srcFillRegSize = 0;
|
|
unsigned indirSrcFillRegSize = 0;
|
|
// Scan srcs
|
|
for (int i = 0, srcNum = curInst->getNumSrc(); i < srcNum; i++) {
|
|
G4_Operand *src = curInst->getSrc(i);
|
|
|
|
if (src && src->isSrcRegRegion() &&
|
|
src->asSrcRegRegion()->getBase()->isRegVar()) {
|
|
if (src->asSrcRegRegion()
|
|
->getBase()
|
|
->asRegVar()
|
|
->getDeclare()
|
|
->getRegFile() == G4_GRF) {
|
|
if (src->asSrcRegRegion()->crossGRF(builder)) {
|
|
srcFillRegSize += 2;
|
|
} else {
|
|
srcFillRegSize += 1;
|
|
}
|
|
} else if (src->asSrcRegRegion()->getRegAccess() == IndirGRF) {
|
|
auto pointsToSet = pointsToAnalysis.getAllInPointsTo(
|
|
src->asSrcRegRegion()->getBase()->asRegVar());
|
|
if (pointsToSet != nullptr) {
|
|
for (const auto& pt : *pointsToSet) {
|
|
if (pt.var->isRegAllocPartaker() ||
|
|
((useFastRA || useHybridRAwithSpill) &&
|
|
livenessCandidate(pt.var->getDeclare()))) {
|
|
if (std::find(indrVars.begin(), indrVars.end(), pt.var) ==
|
|
indrVars.end()) {
|
|
indrVars.push_back(pt.var);
|
|
indirSrcFillRegSize += pt.var->getDeclare()->getNumRows();
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if (builder.avoidDstSrcOverlap()) {
|
|
currentSpillRegSize = srcFillRegSize + dstSpillRegSize;
|
|
} else {
|
|
currentSpillRegSize = srcFillRegSize > dstSpillRegSize
|
|
? srcFillRegSize
|
|
: dstSpillRegSize;
|
|
}
|
|
currentIndrSpillRegSize = indrDstSpillRegSize + indirSrcFillRegSize;
|
|
}
|
|
|
|
spillRegSize = std::max(spillRegSize, currentSpillRegSize);
|
|
indrSpillRegSize = std::max(indrSpillRegSize, currentIndrSpillRegSize);
|
|
}
|
|
}
|
|
}
|
|
|
|
void GraphColor::gatherScatterForbiddenWA() {
|
|
if (!liveAnalysis.livenessClass(G4_GRF))
|
|
return;
|
|
|
|
// VISA spec supports gather.1 and scatter.1 instructions.
|
|
// But they're not natively supported across platforms. When
|
|
// lowering gather.1 (scatter.1) on unsupported platforms, we
|
|
// use rsp len (msg len) = 2 while actual dst (payload) may be
|
|
// smaller in size. This could cause a problem if dst (payload)
|
|
// gets assigned to r127 as rsp len (msg len) = 2 could make
|
|
// it access beyond last GRF. For eg,
|
|
//
|
|
// VISA:
|
|
//.decl Rsp v_type=G type=q num_elts=1
|
|
//.decl Addr v_type=G type=q num_elts=1
|
|
// svm_gather.8.1 (M1, 1) Addr Rsp
|
|
//
|
|
// asm:
|
|
// send.dc1 (1|M0) r127 r4 null:0 exMSD MSD // wr:2+0, rd:2; a64
|
|
// qword gathering read
|
|
//
|
|
// This asm instruction is illegal as Rsp (size = 8 bytes) was assigned r127
|
|
// but send response length = 2.
|
|
//
|
|
// We fix such cases by looking them up and marking upper GRFs
|
|
// as forbidden for allocation.
|
|
for (auto bb : kernel.fg.getBBList()) {
|
|
for (auto inst : *bb) {
|
|
if (!inst->isSend() || inst->getExecSize().value >= 8)
|
|
continue;
|
|
|
|
// dstLen is actual # of GRFs written based on rb, lb
|
|
// src0Len is actual # of GRFs read based on rb, lb
|
|
// src1Len is actual # of GRFs read based on rb, lb
|
|
unsigned int dstLen = 0, src0Len = 0, src1Len = 0;
|
|
auto sendDst = inst->getDst();
|
|
auto sendHdr = inst->getSrc(0);
|
|
auto sendPayload = inst->getSrc(1);
|
|
|
|
auto getLenInGRF = [&](G4_Operand *opnd) {
|
|
unsigned int sz = 0;
|
|
if (opnd && !opnd->isNullReg() && opnd->getTopDcl())
|
|
sz = (opnd->getRightBound() - opnd->getLeftBound() +
|
|
kernel.getGRFSize() - 1) /
|
|
kernel.getGRFSize();
|
|
return sz;
|
|
};
|
|
|
|
dstLen = getLenInGRF(sendDst);
|
|
src0Len = getLenInGRF(sendHdr);
|
|
src1Len = getLenInGRF(sendPayload);
|
|
|
|
auto sendRspLen = inst->asSendInst()->getMsgDesc()->getDstLenRegs();
|
|
auto headerLen = inst->asSendInst()->getMsgDesc()->getSrc0LenRegs();
|
|
auto payloadLen = inst->asSendInst()->getMsgDesc()->getSrc1LenRegs();
|
|
|
|
      // For gather.[1|2|4] (scatter.[1|2|4]), the difference between the
      // actual dst (src0/src1) size and rspLen (msg len/ext msg len) should
      // not exceed 1 GRF.
|
|
auto markForbiddenForDcl = [&](unsigned int opndLen, G4_Declare *dcl,
|
|
unsigned int lenInSend) {
|
|
if (opndLen > 0 && dcl && dcl->getRegVar() &&
|
|
dcl->getRegVar()->isRegAllocPartaker()) {
|
|
if (lenInSend == (opndLen + 1)) {
|
|
lrs[dcl->getRegVar()->getId()]->setForbidden(
|
|
forbiddenKind::FBD_LASTGRF);
|
|
} else if (lenInSend > opndLen) {
|
|
vISA_ASSERT(false,
|
|
"mismatch between len in send and that of operand");
|
|
}
|
|
}
|
|
};
|
|
|
|
markForbiddenForDcl(dstLen, sendDst->getTopDcl(), sendRspLen);
|
|
markForbiddenForDcl(src0Len, sendHdr->getTopDcl(), headerLen);
|
|
markForbiddenForDcl(src1Len, sendPayload->getTopDcl(), payloadLen);
|
|
}
|
|
}
|
|
}
|
|
|
|
bool GraphColor::regAlloc(bool doBankConflictReduction,
|
|
bool highInternalConflict, const RPE *rpe) {
|
|
bool useSplitLLRHeuristic = false;
|
|
// FIXME: This whole bundle thing is a mess, the flag is an int but we
|
|
// treat it as a bool when passing to assignColors, and it's not clear if it
|
|
// works for non-DPAS instructions.
|
|
unsigned doBundleConflictReduction = kernel.getuInt32Option(vISA_enableBundleCR);
|
|
|
|
RA_TRACE(std::cout << "\t--# variables: " << liveAnalysis.getNumSelectedVar()
|
|
<< "\n");
|
|
|
|
// Copy over alignment for vars inserted by RA
|
|
gra.copyMissingAlignment();
|
|
|
|
//
|
|
// create an array of live ranges.
|
|
//
|
|
if (!IncrementalRA::isEnabled(kernel) || lrs.size() == 0) {
|
|
// Create vector of live ranges if we're not using
|
|
// incremental RA or if this is 1st iteration.
|
|
// With incremental RA, live-ranges are created right when
|
|
// new temp var is created in RA.
|
|
createLiveRanges();
|
|
}
|
|
|
|
//
|
|
// set the pre-assigned registers
|
|
//
|
|
for (unsigned i = 0; i < numVar; i++) {
|
|
if (lrs[i]->getVar()->getPhyReg()) {
|
|
lrs[i]->setPhyReg(lrs[i]->getVar()->getPhyReg(),
|
|
lrs[i]->getVar()->getPhyRegOff());
|
|
}
|
|
|
|
G4_Declare *dcl = lrs[i]->getDcl();
|
|
if (!useSplitLLRHeuristic) {
|
|
auto dclLR = gra.getLocalLR(dcl);
|
|
|
|
if (dclLR != nullptr && dclLR->getSplit()) {
|
|
useSplitLLRHeuristic = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
//
|
|
// compute interference matrix
|
|
//
|
|
intf.init();
|
|
intf.computeInterference();
|
|
|
|
builder.getFreqInfoManager().initForRegAlloc(&liveAnalysis);
|
|
|
|
// If option is true, try to get extra interference info from file
|
|
if (liveAnalysis.livenessClass(G4_GRF) &&
|
|
kernel.getOption(vISA_AddExtraIntfInfo)) {
|
|
getExtraInterferenceInfo();
|
|
}
|
|
|
|
TIME_SCOPE(COLORING);
|
|
//
|
|
// compute degree and spill costs for each live range
|
|
//
|
|
if (liveAnalysis.livenessClass(G4_GRF)) {
|
|
if (gra.use4GRFAlign)
|
|
computeDegreeForGRF<true>();
|
|
else
|
|
computeDegreeForGRF<false>();
|
|
} else {
|
|
computeDegreeForARF();
|
|
}
|
|
|
|
computeSpillCosts(useSplitLLRHeuristic, rpe);
|
|
builder.getFreqInfoManager().computeFreqSpillCosts(gra, useSplitLLRHeuristic, rpe);
|
|
|
|
if (kernel.getOption(vISA_DumpRAIntfGraph))
|
|
intf.dumpInterference();
|
|
//
|
|
// determine coloring order
|
|
//
|
|
determineColorOrdering();
|
|
|
|
//
|
|
// Set up the sub-reg alignment from declare information
|
|
// FIXME: Why is this called after degrees are computed? Wouldn't the
|
|
// alignment affect degree computation?
|
|
//
|
|
for (unsigned i = 0; i < numVar; i++) {
|
|
G4_Declare *dcl = lrs[i]->getDcl();
|
|
|
|
if (gra.getSubRegAlign(dcl) == Any && !dcl->getIsPartialDcl()) {
|
|
//
|
|
// multi-row, subreg alignment = 16 words
|
|
//
|
|
if (dcl->getNumRows() > 1) {
|
|
gra.setSubRegAlign(lrs[i]->getVar()->getDeclare(),
|
|
kernel.getGRFAlign());
|
|
}
|
|
//
|
|
// single-row
|
|
//
|
|
else if (gra.getSubRegAlign(lrs[i]->getVar()->getDeclare()) == Any) {
|
|
//
|
|
// set up Odd word or Even word sub reg alignment
|
|
//
|
|
unsigned nbytes = dcl->getNumElems() * TypeSize(dcl->getElemType());
|
|
unsigned nwords = nbytes / G4_WSIZE + nbytes % G4_WSIZE;
|
|
if (nwords >= 2 && lrs[i]->getRegKind() == G4_GRF) {
|
|
gra.setSubRegAlign(lrs[i]->getVar()->getDeclare(), Even_Word);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
gatherScatterForbiddenWA();
|
|
|
|
//
|
|
// assign registers for GRFs, GRFs are first attempted to be assigned using
|
|
// round-robin and if it fails then we retry using a first-fit heuristic.
|
|
//
|
|
if (liveAnalysis.livenessClass(G4_GRF)) {
|
|
bool hasStackCall =
|
|
kernel.fg.getHasStackCalls() || kernel.fg.getIsStackCallFunc();
|
|
|
|
bool willSpill =
|
|
((gra.useFastRA || gra.useHybridRAwithSpill) &&
|
|
(!hasStackCall ||
|
|
builder.getOption(vISA_PartitionWithFastHybridRA))) ||
|
|
(kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D &&
|
|
rpe->getMaxRP() >= kernel.getNumRegTotal() + 24);
|
|
if (willSpill) {
|
|
      // go straight to first_fit to save compile time since we are definitely
      // spilling. We do this for 3D only since with indirect/subroutine the
      // RP pressure estimate can be very unreliable.
|
|
// FIXME: due to factors like local split and scalar variables that are
|
|
// not accurately modeled in RP estimate, RA may succeed even when RP is >
|
|
// total #GRF. We should investigate these cases and fix RPE
|
|
assignColors(FIRST_FIT);
|
|
// assert(requireSpillCode() && "inaccurate GRF pressure estimate");
|
|
return !requireSpillCode();
|
|
}
|
|
|
|
if (kernel.getOption(vISA_RoundRobin) && !hasStackCall) {
|
|
if (assignColors(ROUND_ROBIN, doBankConflictReduction,
|
|
highInternalConflict,
|
|
doBundleConflictReduction) == false) {
|
|
resetTemporaryRegisterAssignments();
|
|
bool success = assignColors(FIRST_FIT, doBankConflictReduction,
|
|
highInternalConflict, doBundleConflictReduction);
|
|
|
|
if (!success && doBankConflictReduction && isHybrid) {
|
|
return false;
|
|
}
|
|
|
|
if (!kernel.getOption(vISA_forceBCR)) {
|
|
if (!success && doBankConflictReduction) {
|
|
resetTemporaryRegisterAssignments();
|
|
assignColors(FIRST_FIT);
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
bool success = assignColors(FIRST_FIT, true, highInternalConflict,
|
|
doBundleConflictReduction);
|
|
if (!success) {
|
|
resetTemporaryRegisterAssignments();
|
|
assignColors(FIRST_FIT);
|
|
}
|
|
}
|
|
} else if (liveAnalysis.livenessClass(G4_FLAG)) {
|
|
if (kernel.getOption(vISA_RoundRobin)) {
|
|
if (assignColors(ROUND_ROBIN) == false) {
|
|
resetTemporaryRegisterAssignments();
|
|
assignColors(FIRST_FIT);
|
|
}
|
|
} else {
|
|
assignColors(FIRST_FIT);
|
|
}
|
|
} else {
|
|
// assign registers for ARFs using a first-fit heuristic
|
|
assignColors(FIRST_FIT, false, false);
|
|
}
|
|
|
|
return (requireSpillCode() == false);
|
|
}
|
|
|
|
void GraphColor::confirmRegisterAssignments() {
|
|
for (unsigned i = 0; i < numVar; i++) {
|
|
if (lrs[i]->getPhyReg()) {
|
|
if (lrs[i]->getVar()->getPhyReg()) {
|
|
vISA_ASSERT((lrs[i]->getVar()->getPhyReg() == lrs[i]->getPhyReg()),
|
|
ERROR_GRAPHCOLOR);
|
|
} else {
|
|
lrs[i]->getVar()->setPhyReg(lrs[i]->getPhyReg(),
|
|
lrs[i]->getPhyRegOff());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void GraphColor::resetTemporaryRegisterAssignments() {
|
|
for (unsigned i = 0; i < numVar; i++) {
|
|
if (lrs[i]->getVar()->getPhyReg() == NULL) {
|
|
lrs[i]->resetPhyReg();
|
|
lrs[i]->setSpilled(false);
|
|
}
|
|
}
|
|
spilledLRs.clear();
|
|
}
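// Remove redundant address (ARF) fill code. spAddrRegSig[i] tracks which
// spill location currently backs address subregister a0.i; a fill whose
// source spill location already matches the recorded signature is dead and
// is erased below. For example (ids illustrative): after filling a0.2 from
// spill loc 5, a second identical fill of a0.2 from loc 5 with no intervening
// write to a0.2 or to loc 5 is removed.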
|
|
|
|
void GraphColor::cleanupRedundantARFFillCode() {
|
|
for (G4_BB *bb : builder.kernel.fg) {
|
|
clearSpillAddrLocSignature();
|
|
|
|
for (std::list<G4_INST *>::iterator i = bb->begin(); i != bb->end();) {
|
|
G4_INST *inst = (*i);
|
|
|
|
//
|
|
// process writes to spill storage (GRF) of addr regs
|
|
//
|
|
G4_DstRegRegion *dst = inst->getDst();
|
|
|
|
if (dst && dst->getBase() && dst->getBase()->isRegVar() &&
|
|
(kernel.fg.isPseudoA0Dcl(dst->getBase()->asRegVar()->getDeclare()) ||
|
|
inst->isPseudoKill())) {
|
|
i++;
|
|
continue;
|
|
}
|
|
|
|
if (dst != NULL && dst->getRegAccess() == Direct) {
|
|
|
|
if (dst->getBase()->isRegVar() &&
|
|
dst->getBase()->asRegVar()->isRegVarAddrSpillLoc()) {
|
|
pruneActiveSpillAddrLocs(dst, inst->getExecSize(),
|
|
inst->getExecType());
|
|
}
|
|
//
|
|
// process writes to (allocated) addr regs
|
|
//
|
|
else if (dst->getBase()->isRegAllocPartaker()) {
|
|
G4_RegVar *addrReg = dst->getBase()->asRegVar();
|
|
|
|
if (gra.isAddrFlagSpillDcl(addrReg->getDeclare())) {
|
|
G4_SrcRegRegion *srcRgn = inst->getSrc(0)->asSrcRegRegion();
|
|
|
|
if (redundantAddrFill(dst, srcRgn, inst->getExecSize())) {
|
|
std::list<G4_INST *>::iterator j = i++;
|
|
bb->erase(j);
|
|
continue;
|
|
} else {
|
|
updateActiveSpillAddrLocs(dst, srcRgn, inst->getExecSize());
|
|
}
|
|
} else {
|
|
pruneActiveSpillAddrLocs(dst, inst->getExecSize(),
|
|
inst->getExecType());
|
|
}
|
|
}
|
|
}
|
|
|
|
i++;
|
|
}
|
|
}
|
|
}
|
|
|
|
void GraphColor::pruneActiveSpillAddrLocs(G4_DstRegRegion *dstRegion,
|
|
unsigned exec_size,
|
|
G4_Type exec_type) {
|
|
if (dstRegion->getBase()->asRegVar()->isRegVarAddrSpillLoc()) {
|
|
vISA_ASSERT(((exec_type == Type_UW || exec_type == Type_W) &&
|
|
exec_size <= builder.getNumAddrRegisters()) ||
|
|
(exec_size == 1),
|
|
"Unexpected ADDR spill loc update format!");
|
|
vISA_ASSERT(dstRegion->getRegAccess() == Direct,
|
|
"Unexpected ADDR spill loc");
|
|
|
|
G4_RegVarAddrSpillLoc *spillLocReg =
|
|
static_cast<G4_RegVarAddrSpillLoc *>(dstRegion->getBase());
|
|
unsigned startId = spillLocReg->getLocId() + dstRegion->getSubRegOff();
|
|
unsigned endId = startId + exec_size * dstRegion->getHorzStride();
|
|
|
|
for (unsigned i = 0, horzStride = dstRegion->getHorzStride();
|
|
i < builder.getNumAddrRegisters(); i += horzStride) {
|
|
if (spAddrRegSig[i] >= startId && spAddrRegSig[i] < endId) {
|
|
spAddrRegSig[i] = 0;
|
|
}
|
|
}
|
|
} else if (dstRegion->getBase()->asRegVar()->isPhyRegAssigned()) {
|
|
G4_RegVar *addrReg = dstRegion->getBase()->asRegVar();
|
|
vISA_ASSERT(addrReg->getPhyReg()->isA0(),
|
|
"Unknown error in ADDR reg spill code cleanup!");
|
|
unsigned startId = addrReg->getPhyRegOff();
|
|
unsigned endId = startId + exec_size * dstRegion->getHorzStride();
|
|
vISA_ASSERT(endId <= builder.getNumAddrRegisters(),
|
|
"Unknown error in ADDR reg spill code cleanup!");
|
|
|
|
for (unsigned i = startId; i < endId; i += dstRegion->getHorzStride()) {
|
|
spAddrRegSig[i] = 0;
|
|
}
|
|
} else {
|
|
vISA_ASSERT(false, "Unknown error in ADDR reg spill code cleanup!");
|
|
}
|
|
}
|
|
|
|
void GraphColor::updateActiveSpillAddrLocs(G4_DstRegRegion *tmpDstRegion,
|
|
G4_SrcRegRegion *srcRegion,
|
|
unsigned exec_size) {
|
|
vISA_ASSERT(
|
|
gra.isAddrFlagSpillDcl(tmpDstRegion->getBase()->asRegVar()->getDeclare()),
|
|
"Unknown error in ADDR reg spill code cleanup!");
|
|
G4_RegVar *addrReg = tmpDstRegion->getBase()->asRegVar();
|
|
vISA_ASSERT(addrReg->getPhyReg()->isA0(),
|
|
"Unknown error in ADDR reg spill code cleanup!");
|
|
unsigned startAddrId = addrReg->getPhyRegOff();
|
|
unsigned endAddrId = startAddrId + exec_size * tmpDstRegion->getHorzStride();
|
|
vISA_ASSERT(endAddrId <= builder.getNumAddrRegisters(),
|
|
"Unknown error in ADDR reg spill code cleanup!");
|
|
|
|
vISA_ASSERT(srcRegion->getBase()->asRegVar()->isRegVarAddrSpillLoc(),
|
|
"Unknown error in ADDR reg spill code cleanup!");
|
|
G4_RegVarAddrSpillLoc *spillLocReg =
|
|
static_cast<G4_RegVarAddrSpillLoc *>(srcRegion->getBase());
|
|
unsigned startLocId = spillLocReg->getLocId() + srcRegion->getSubRegOff();
|
|
|
|
for (unsigned i = startAddrId, j = startLocId; i < endAddrId;
|
|
i += tmpDstRegion->getHorzStride(),
|
|
j += srcRegion->getRegion()->horzStride) {
|
|
spAddrRegSig[i] = j;
|
|
}
|
|
}
|
|
|
|
bool GraphColor::redundantAddrFill(G4_DstRegRegion *tmpDstRegion,
|
|
G4_SrcRegRegion *srcRegion,
|
|
unsigned exec_size) {
|
|
bool match = true;
|
|
|
|
vISA_ASSERT(
|
|
gra.isAddrFlagSpillDcl(tmpDstRegion->getBase()->asRegVar()->getDeclare()),
|
|
"Unknown error in ADDR reg spill code cleanup!");
|
|
G4_RegVar *addrReg = tmpDstRegion->getBase()->asRegVar();
|
|
vISA_ASSERT(addrReg->getPhyReg()->isA0(),
|
|
"Unknown error in ADDR reg spill code cleanup!");
|
|
unsigned startAddrId = addrReg->getPhyRegOff();
|
|
unsigned endAddrId = startAddrId + exec_size * tmpDstRegion->getHorzStride();
|
|
vISA_ASSERT(endAddrId <= builder.getNumAddrRegisters(),
|
|
"Unknown error in ADDR reg spill code cleanup!");
|
|
|
|
vISA_ASSERT(srcRegion->getBase()->asRegVar()->isRegVarAddrSpillLoc(),
|
|
"Unknown error in ADDR reg spill code cleanup!");
|
|
G4_RegVarAddrSpillLoc *spillLocReg =
|
|
static_cast<G4_RegVarAddrSpillLoc *>(srcRegion->getBase());
|
|
unsigned startLocId = spillLocReg->getLocId() + srcRegion->getSubRegOff();
|
|
|
|
for (unsigned i = startAddrId, j = startLocId; i < endAddrId;
|
|
i += tmpDstRegion->getHorzStride(),
|
|
j += srcRegion->getRegion()->horzStride) {
|
|
if (spAddrRegSig[i] != j) {
|
|
match = false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return match;
|
|
}
|
|
|
|
unsigned GlobalRA::sendBlockSizeCode(unsigned owordSize) {
|
|
unsigned code;
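  // OWord block size encoding: 1 -> 0, 2 -> 2, 4 -> 3, 8 -> 4, 16 -> 5.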
|
|
|
|
switch (owordSize) {
|
|
case 1:
|
|
code = 0;
|
|
break;
|
|
case 2:
|
|
code = 2;
|
|
break;
|
|
case 4:
|
|
code = 3;
|
|
break;
|
|
case 8:
|
|
code = 4;
|
|
break;
|
|
case 16:
|
|
code = 5;
|
|
break;
|
|
default:
|
|
vISA_ASSERT_UNREACHABLE(ERROR_REGALLOC);
|
|
code = 0;
|
|
}
|
|
|
|
return code;
|
|
}
|
|
|
|
#define STATELESS_SURFACE_INDEX 0xFF
|
|
#define HEADER_PRESENT 0x80000
|
|
#define SEND_OWORD_READ_TYPE 0
|
|
#define SEND_OWORD_WRITE_TYPE 8
|
|
#define SEND_MSG_TYPE_BIT_OFFSET 14
|
|
#define SEND_RSP_LENGTH_BIT_OFFSET 20
|
|
#define SEND_MSG_LENGTH_BIT_OFFSET 25
|
|
#define SEND_DESC_DATA_SIZE_BIT_OFFSET 8
|
|
|
|
G4_Imm *GlobalRA::createMsgDesc(unsigned owordSize, bool writeType,
|
|
bool isSplitSend) {
|
|
// If isSplitSend = true then messageLength = 1 and extMesLength =
|
|
// (owordSize/2) GRFs
|
|
unsigned message = STATELESS_SURFACE_INDEX;
|
|
message |= HEADER_PRESENT;
|
|
if (writeType) {
|
|
unsigned messageType = SEND_OWORD_WRITE_TYPE;
|
|
message |= messageType << SEND_MSG_TYPE_BIT_OFFSET;
|
|
unsigned messageLength = 1;
|
|
if (!isSplitSend) {
|
|
messageLength += owordToGRFSize(
|
|
ROUND(owordSize, kernel.numEltPerGRF<Type_UB>() / OWORD_BYTE_SIZE),
|
|
builder);
|
|
}
|
|
message |= messageLength << SEND_MSG_LENGTH_BIT_OFFSET;
|
|
} else {
|
|
unsigned messageType = SEND_OWORD_READ_TYPE;
|
|
message |= messageType << SEND_MSG_TYPE_BIT_OFFSET;
|
|
unsigned responseLength = owordToGRFSize(
|
|
ROUND(owordSize, kernel.numEltPerGRF<Type_UB>() / OWORD_BYTE_SIZE),
|
|
builder);
|
|
message |= responseLength << SEND_RSP_LENGTH_BIT_OFFSET;
|
|
unsigned messageLength = 1;
|
|
message |= messageLength << SEND_MSG_LENGTH_BIT_OFFSET;
|
|
}
|
|
unsigned writeOwordSize = sendBlockSizeCode(owordSize);
|
|
message |= writeOwordSize << SEND_DESC_DATA_SIZE_BIT_OFFSET;
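  // Worked example (assuming 32-byte GRFs): an OWord block read of 8 owords
  // has responseLength = 4 GRFs, messageLength = 1, and block size code 4,
  // giving 0xFF | HEADER_PRESENT | (4 << 20) | (1 << 25) | (4 << 8)
  // = 0x24804FF.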
|
|
return builder.createImm(message, Type_UD);
|
|
}
|
|
|
|
void GlobalRA::stackCallProlog() {
|
|
G4_BB *entryBB = builder.kernel.fg.getEntryBB();
|
|
|
|
// Used for creating inst to initialize address for immediate offset usage.
|
|
auto AddrComputeInst = [this](G4_Declare *srcDcl) {
|
|
auto addSrc0 = builder.createSrc(srcDcl->getRegVar(), 0, 0,
|
|
builder.getRegionScalar(), Type_UD);
|
|
auto immSrc1 = builder.createImm(SPILL_FILL_IMMOFF_MAX, Type_UD);
|
|
auto addInst = builder.createBinOp(
|
|
G4_add, g4::SIMD1,
|
|
builder.createDstRegRegion(builder.kernel.fg.scratchRegDcl, 1), addSrc0,
|
|
immSrc1, InstOpt_WriteEnable, false);
|
|
return addInst;
|
|
};
|
|
|
|
// Initialize address for immediate offset usage for spill/fill messages
|
|
// except for frame descriptor save message.
|
|
  // This is for the common cases, which use %be_fp as the address.
|
|
{
|
|
    // Turn off immediate offset if the frame size is 0 or exceeds the threshold
|
|
if ((kernel.fg.frameSizeInOWord == 0) ||
|
|
(kernel.fg.frameSizeInOWord * 16 > SPILL_FILL_IMMOFF_MAX * 2))
|
|
canUseLscImmediateOffsetSpillFill = false;
|
|
|
|
if (canUseLscImmediateOffsetSpillFill) {
|
|
// copy (%be_fp + 0x10000) to r126.0 for immediate offset usage in
|
|
// stackcall spill/fill
|
|
// add(1) r126.0 %be_fp 0x10000
|
|
auto insertIt = std::find(entryBB->begin(), entryBB->end(),
|
|
builder.kernel.getBEFPSetupInst());
|
|
vISA_ASSERT(insertIt != entryBB->end(), "Can't find BE_FP setup inst");
|
|
entryBB->insertBefore(++insertIt, AddrComputeInst(builder.getBEFP()));
|
|
|
|
      // Each stack function has its own r126.0, so we need to restore r126.0
      // after a function call, since its value has been changed in the callee.
|
|
// See below example:
|
|
// Foo()
|
|
// mov r125.3 r125.2
|
|
// add r126.0 r125.3 0x10000
|
|
// add r125.2 r125.2 frameSizeFoo
|
|
// spill [r126.0 offset1-0x10000]
|
|
// Bar()
|
|
// mov r125.3 r125.2
|
|
// add r126.0 r125.3 0x10000
|
|
// add r125.2 r125.2 frameSizeBar
|
|
// spill [r126.0 offset2-0x10000]
|
|
// ...
|
|
// add r126.0 r125.3 0x10000
|
|
// spill [r126.0 offset3-0x10000]
|
|
// After Bar() return, we should re-compute r126.0
|
|
for (auto bb : kernel.fg) {
|
|
if (bb->isEndWithFCall()) {
|
|
G4_BB *succ = bb->Succs.front();
|
|
insertIt =
|
|
std::find_if(succ->begin(), succ->end(),
|
|
[](G4_INST *inst) { return inst->isLabel(); });
|
|
vISA_ASSERT(insertIt != succ->end(), "Can't find label");
|
|
succ->insertBefore(++insertIt, AddrComputeInst(builder.getBEFP()));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Emit frame descriptor
|
|
if (kernel.fg.getIsStackCallFunc()) {
|
|
if (canSkipFDE())
|
|
return;
|
|
|
|
auto payload = builder.createHardwiredDeclare(
|
|
8, Type_UD, kernel.stackCall.getFPSPGRF(), 0);
|
|
payload->setName(builder.getNameString(24, "FrameDescriptorGRF"));
|
|
auto payloadSrc =
|
|
builder.createSrcRegRegion(payload, builder.getRegionStride1());
|
|
const unsigned execSize = 8;
|
|
G4_DstRegRegion *postDst = builder.createNullDst(Type_UD);
|
|
G4_INST *store = nullptr;
|
|
if (builder.supportsLSC()) {
|
|
auto headerOpnd = getSpillFillHeader(*kernel.fg.builder, nullptr);
|
|
store = builder.createSpill(
|
|
postDst, headerOpnd, payloadSrc, G4_ExecSize(execSize), 1, 0,
|
|
builder.getBESP(), InstOpt_WriteEnable, false);
|
|
} else {
|
|
store =
|
|
builder.createSpill(postDst, payloadSrc, G4_ExecSize(execSize), 1, 0,
|
|
builder.getBESP(), InstOpt_WriteEnable, false);
|
|
}
|
|
builder.setFDSpillInst(store);
|
|
auto iter = std::find_if(entryBB->begin(), entryBB->end(),
|
|
[](G4_INST *inst) { return !inst->isLabel(); });
|
|
iter = entryBB->insertBefore(iter, store);
|
|
|
|
if (EUFusionCallWANeeded()) {
|
|
auto oldSaveInst = builder.getPartFDSaveInst();
|
|
builder.setPartFDSaveInst(store);
|
|
entryBB->remove(oldSaveInst);
|
|
}
|
|
addEUFusionCallWAInst(store);
|
|
|
|
// Initialize address for immediate offset usage for frame descriptor store
|
|
// message. This is a special case as it uses %be_sp as address.
|
|
{
|
|
if (canUseLscImmediateOffsetSpillFill) {
|
|
// copy (%be_sp + 0x10000) to r126.0 for immediate offset usage
|
|
// for frame descriptor save instruction
|
|
// add(1) r126.0<1>:ud %be_sp<1;0,1>:ud 0x10000:ud
|
|
entryBB->insertBefore(iter, AddrComputeInst(builder.getBESP()));
|
|
}
|
|
}
|
|
return;
|
|
}
|
|
|
|
// Make r126 a copy of r0 only up to VISA ABI v2
|
|
if (kernel.stackCall.getVersion() >= StackCallABI::StackCallABIVersion::VER_3)
|
|
return;
|
|
|
|
// mov (8) r126.0<1>:ud r0.0<1;1,1>:ud
|
|
auto dstRgn = builder.createDstRegRegion(builder.kernel.fg.scratchRegDcl, 1);
|
|
auto srcRgn = builder.createSrcRegRegion(builder.getBuiltinR0(),
|
|
builder.getRegionStride1());
|
|
|
|
G4_INST *mov = builder.createMov(G4_ExecSize(kernel.numEltPerGRF<Type_UD>()),
|
|
dstRgn, srcRgn, InstOpt_WriteEnable, false);
|
|
|
|
auto iter = std::find_if(entryBB->begin(), entryBB->end(),
|
|
[](G4_INST *inst) { return !inst->isLabel(); });
|
|
entryBB->insertBefore(iter, mov);
|
|
}
|
|
|
|
//
|
|
// Generate the save code for startReg to startReg+owordSize/2.
|
|
//
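// For example (assuming 32-byte GRFs, i.e. 2 owords per GRF): saving 7
// consecutive GRFs (14 owords) without LSC is split into block writes of
// 8 + 4 + 2 owords, i.e. three spill sends covering 4 + 2 + 1 GRFs.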
|
|
void GlobalRA::saveRegs(unsigned startReg, unsigned owordSize,
|
|
G4_Declare *scratchRegDcl, G4_Declare *framePtr,
|
|
unsigned frameOwordOffset, G4_BB *bb,
|
|
INST_LIST_ITER insertIt,
|
|
std::unordered_set<G4_INST *> &group) {
|
|
vISA_ASSERT(builder.getPlatform() >= GENX_SKL,
|
|
"stack call only supported on SKL+");
|
|
|
|
if ((useLscForSpillFill && owordSize == 16) || owordSize == 8 ||
|
|
owordSize == 4 || owordSize == 2) {
|
|
// add (1) r126.2<1>:ud r125.3<0;1,0>:ud 0x2:ud
|
|
// sends (8) null<1>:ud r126.0 r1.0 ...
|
|
G4_ExecSize execSize = (owordSize > 2) ? g4::SIMD16 : g4::SIMD8;
|
|
unsigned messageLength = GlobalRA::owordToGRFSize(owordSize, builder);
|
|
G4_Declare *msgDcl =
|
|
builder.createTempVar(messageLength * builder.getGenxDataportIOSize(),
|
|
Type_UD, builder.getGRFAlign(), StackCallStr);
|
|
msgDcl->getRegVar()->setPhyReg(regPool.getGreg(startReg), 0);
|
|
auto sendSrc2 = builder.createSrc(msgDcl->getRegVar(), 0, 0,
|
|
builder.getRegionStride1(), Type_UD);
|
|
G4_DstRegRegion *dst =
|
|
builder.createNullDst((execSize > 8) ? Type_UW : Type_UD);
|
|
G4_INST *spillIntrinsic = nullptr;
|
|
if (builder.supportsLSC()) {
|
|
auto headerOpnd = getSpillFillHeader(*kernel.fg.builder, nullptr);
|
|
spillIntrinsic = builder.createSpill(
|
|
dst, headerOpnd, sendSrc2, execSize, messageLength,
|
|
frameOwordOffset / 2, framePtr, InstOpt_WriteEnable, false);
|
|
} else
|
|
spillIntrinsic = builder.createSpill(
|
|
dst, sendSrc2, execSize, messageLength, frameOwordOffset / 2,
|
|
framePtr, InstOpt_WriteEnable, false);
|
|
spillIntrinsic->inheritDIFrom(*insertIt);
|
|
bb->insertBefore(insertIt, spillIntrinsic);
|
|
group.insert(spillIntrinsic);
|
|
} else if ((useLscForSpillFill && owordSize > 16)) {
|
|
saveRegs(startReg, 16, scratchRegDcl, framePtr, frameOwordOffset, bb,
|
|
insertIt, group);
|
|
saveRegs(startReg + GlobalRA::owordToGRFSize(16, builder), owordSize - 16,
|
|
scratchRegDcl, framePtr, frameOwordOffset + 16, bb, insertIt,
|
|
group);
|
|
} else if (owordSize > 8) {
|
|
saveRegs(startReg, 8, scratchRegDcl, framePtr, frameOwordOffset, bb,
|
|
insertIt, group);
|
|
saveRegs(startReg + GlobalRA::owordToGRFSize(8, builder), owordSize - 8,
|
|
scratchRegDcl, framePtr, frameOwordOffset + 8, bb, insertIt,
|
|
group);
|
|
}
|
|
//
|
|
// Split into chunks of sizes 4 and remaining owords.
|
|
//
|
|
else if (owordSize > 4) {
|
|
saveRegs(startReg, 4, scratchRegDcl, framePtr, frameOwordOffset, bb,
|
|
insertIt, group);
|
|
saveRegs(startReg + GlobalRA::owordToGRFSize(4, builder), owordSize - 4,
|
|
scratchRegDcl, framePtr, frameOwordOffset + 4, bb, insertIt,
|
|
group);
|
|
}
|
|
//
|
|
// Split into chunks of sizes 2 and remaining owords.
|
|
//
|
|
else if (owordSize > 2) {
|
|
saveRegs(startReg, 2, scratchRegDcl, framePtr, frameOwordOffset, bb,
|
|
insertIt, group);
|
|
saveRegs(startReg + GlobalRA::owordToGRFSize(2, builder), owordSize - 2,
|
|
scratchRegDcl, framePtr, frameOwordOffset + 2, bb, insertIt,
|
|
group);
|
|
} else {
|
|
vISA_ASSERT(false, ERROR_REGALLOC);
|
|
}
|
|
}
|
|
|
|
//
|
|
// Generate the save code for the i/p saveRegs.
|
|
//
|
|
void GlobalRA::saveActiveRegs(std::vector<bool> &saveRegs, unsigned startReg,
|
|
unsigned frameOffset, G4_BB *bb,
|
|
INST_LIST_ITER insertIt,
|
|
std::unordered_set<G4_INST *> &group) {
|
|
G4_Declare *scratchRegDcl = builder.kernel.fg.scratchRegDcl;
|
|
G4_Declare *framePtr = builder.kernel.fg.framePtrDcl;
|
|
|
|
unsigned frameOwordPos = frameOffset;
|
|
unsigned startPos = 0;
|
|
|
|
while (startPos < saveRegs.size()) {
|
|
for (; startPos < saveRegs.size() && saveRegs[startPos] == false;
|
|
startPos++)
|
|
;
|
|
if (startPos < saveRegs.size() && saveRegs[startPos]) {
|
|
unsigned endPos = startPos + 1;
|
|
for (; endPos < saveRegs.size() && saveRegs[endPos] == true; endPos++)
|
|
;
|
|
unsigned owordSize =
|
|
(endPos - startPos) * GlobalRA::GRFSizeToOwords(1, builder);
|
|
owordSize = std::max(owordSize, GlobalRA::GRFSizeToOwords(1, builder));
|
|
this->saveRegs(startPos + startReg, owordSize, scratchRegDcl, framePtr,
|
|
frameOwordPos, bb, insertIt, group);
|
|
frameOwordPos += owordSize;
|
|
startPos = endPos;
|
|
}
|
|
}
|
|
}
|
|
|
|
G4_SrcRegRegion *GraphColor::getScratchSurface() const {
|
|
if (builder.hasScratchSurface()) {
|
|
return builder.createSrcRegRegion(builder.getBuiltinScratchSurface(),
|
|
builder.getRegionScalar());
|
|
}
|
|
return nullptr; // use stateless access
|
|
}
|
|
|
|
//
|
|
// Generate the restore code for startReg to startReg+owordSize/2.
|
|
//
|
|
void GlobalRA::restoreRegs(unsigned startReg, unsigned owordSize,
|
|
G4_Declare *scratchRegDcl, G4_Declare *framePtr,
|
|
unsigned frameOwordOffset, G4_BB *bb,
|
|
INST_LIST_ITER insertIt,
|
|
std::unordered_set<G4_INST *> &group, bool caller) {
|
|
//
|
|
  // Process chunks of size 16 (LSC only), 8, 4, and 2 owords.
|
|
//
|
|
if ((useLscForSpillFill && owordSize == 16) || owordSize == 8 ||
|
|
owordSize == 4 || owordSize == 2) {
|
|
G4_ExecSize execSize = (owordSize > 2) ? g4::SIMD16 : g4::SIMD8;
|
|
unsigned responseLength = GlobalRA::owordToGRFSize(owordSize, builder);
|
|
G4_Declare *dstDcl =
|
|
builder.createTempVar(responseLength * builder.getGenxDataportIOSize(),
|
|
Type_UD, builder.getGRFAlign(), StackCallStr);
|
|
if (caller) {
|
|
kernel.callerRestoreDecls.push_back(dstDcl);
|
|
}
|
|
dstDcl->getRegVar()->setPhyReg(regPool.getGreg(startReg), 0);
|
|
G4_DstRegRegion *dstRgn = builder.createDst(
|
|
dstDcl->getRegVar(), 0, 0, 1, (execSize > 8) ? Type_UW : Type_UD);
|
|
G4_INST *fillIntrinsic = nullptr;
|
|
if (builder.supportsLSC()) {
|
|
auto headerOpnd = getSpillFillHeader(*kernel.fg.builder, nullptr);
|
|
fillIntrinsic = builder.createFill(headerOpnd, dstRgn, execSize,
|
|
responseLength, frameOwordOffset / 2,
|
|
framePtr, InstOpt_WriteEnable, false);
|
|
} else
|
|
fillIntrinsic = builder.createFill(dstRgn, execSize, responseLength,
|
|
frameOwordOffset / 2, framePtr,
|
|
InstOpt_WriteEnable, false);
|
|
fillIntrinsic->inheritDIFrom(*insertIt);
|
|
bb->insertBefore(insertIt, fillIntrinsic);
|
|
group.insert(fillIntrinsic);
|
|
}
|
|
//
|
|
  // Split into chunks of size 16 and the remaining owords.
|
|
//
|
|
else if ((useLscForSpillFill && owordSize > 16)) {
|
|
restoreRegs(startReg, 16, scratchRegDcl, framePtr, frameOwordOffset, bb,
|
|
insertIt, group, caller);
|
|
restoreRegs(startReg + GlobalRA::owordToGRFSize(16, builder),
|
|
owordSize - 16, scratchRegDcl, framePtr, frameOwordOffset + 16,
|
|
bb, insertIt, group, caller);
|
|
} else if (owordSize > 8) {
|
|
restoreRegs(startReg, 8, scratchRegDcl, framePtr, frameOwordOffset, bb,
|
|
insertIt, group, caller);
|
|
restoreRegs(startReg + GlobalRA::owordToGRFSize(8, builder), owordSize - 8,
|
|
scratchRegDcl, framePtr, frameOwordOffset + 8, bb, insertIt,
|
|
group, caller);
|
|
}
|
|
//
|
|
// Split into chunks of sizes 4 and remaining owords.
|
|
//
|
|
else if (owordSize > 4) {
|
|
restoreRegs(startReg, 4, scratchRegDcl, framePtr, frameOwordOffset, bb,
|
|
insertIt, group, caller);
|
|
restoreRegs(startReg + GlobalRA::owordToGRFSize(4, builder), owordSize - 4,
|
|
scratchRegDcl, framePtr, frameOwordOffset + 4, bb, insertIt,
|
|
group, caller);
|
|
}
|
|
//
|
|
// Split into chunks of sizes 2 and remaining owords.
|
|
//
|
|
else if (owordSize > 2) {
|
|
restoreRegs(startReg, 2, scratchRegDcl, framePtr, frameOwordOffset, bb,
|
|
insertIt, group, caller);
|
|
restoreRegs(startReg + GlobalRA::owordToGRFSize(2, builder), owordSize - 2,
|
|
scratchRegDcl, framePtr, frameOwordOffset + 2, bb, insertIt,
|
|
group, caller);
|
|
} else {
|
|
vISA_ASSERT(false, ERROR_REGALLOC);
|
|
}
|
|
}
|
|
|
|
//
|
|
// Generate the restore code for the i/p restoreRegs.
|
|
//
|
|
void GlobalRA::restoreActiveRegs(std::vector<bool> &restoreRegs,
|
|
unsigned startReg, unsigned frameOffset,
|
|
G4_BB *bb, INST_LIST_ITER insertIt,
|
|
std::unordered_set<G4_INST *> &group,
|
|
bool caller) {
|
|
G4_Declare *scratchRegDcl = builder.kernel.fg.scratchRegDcl;
|
|
G4_Declare *framePtr = builder.kernel.fg.framePtrDcl;
|
|
|
|
unsigned frameOwordPos = frameOffset;
|
|
unsigned startPos = 0;
|
|
|
|
while (startPos < restoreRegs.size()) {
|
|
for (; startPos < restoreRegs.size() && restoreRegs[startPos] == false;
|
|
startPos++)
|
|
;
|
|
if (startPos < restoreRegs.size() && restoreRegs[startPos]) {
|
|
unsigned endPos = startPos + 1;
|
|
for (; endPos < restoreRegs.size() && restoreRegs[endPos] == true;
|
|
endPos++)
|
|
;
|
|
unsigned owordSize =
|
|
(endPos - startPos) * GlobalRA::GRFSizeToOwords(1, builder);
|
|
owordSize = std::max(owordSize, GlobalRA::GRFSizeToOwords(1, builder));
|
|
this->restoreRegs(startPos + startReg, owordSize, scratchRegDcl, framePtr,
|
|
frameOwordPos, bb, insertIt, group, caller);
|
|
frameOwordPos += owordSize;
|
|
startPos = endPos;
|
|
}
|
|
}
|
|
}
|
|
|
|
//
|
|
// Optimize the reg footprint so as to reduce the number of "send" instructions
|
|
// required for save/restore, at the cost of a little additional save/restore
|
|
// memory (if any). Since we are using oword read/write for save/restore, we can
// only read/write in units of 1, 2 or 4 regs per "send" instruction.
|
|
//
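// For example (illustrative): a save mask of {r2, r4} within a 4-register
// window is widened to {r2, r3, r4, r5} so the range can be saved with one
// 4-GRF oword write instead of two 1-GRF writes.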
|
|
void GlobalRA::OptimizeActiveRegsFootprint(std::vector<bool> &saveRegs) {
|
|
unsigned startPos = 0;
|
|
while (startPos < saveRegs.size()) {
|
|
for (; startPos < saveRegs.size() && !saveRegs[startPos]; ++startPos)
|
|
;
|
|
if (startPos == saveRegs.size()) {
|
|
break;
|
|
}
|
|
if (startPos + 4 <= saveRegs.size()) {
|
|
if (saveRegs[startPos] & saveRegs[startPos + 2] &
|
|
!saveRegs[startPos + 3]) {
|
|
saveRegs[startPos + 1] = saveRegs[startPos + 3] = true;
|
|
} else if (saveRegs[startPos] & saveRegs[startPos + 3]) {
|
|
if (startPos + 4 < saveRegs.size()) {
|
|
if (!saveRegs[startPos + 4]) {
|
|
saveRegs[startPos + 1] = saveRegs[startPos + 2] = true;
|
|
}
|
|
} else {
|
|
saveRegs[startPos + 1] = saveRegs[startPos + 2] = true;
|
|
}
|
|
}
|
|
}
|
|
unsigned winBound =
|
|
std::min(static_cast<unsigned>(saveRegs.size()), startPos + 4);
|
|
for (; startPos < winBound && saveRegs[startPos]; ++startPos)
|
|
;
|
|
}
|
|
}
|
|
|
|
void GlobalRA::OptimizeActiveRegsFootprint(std::vector<bool> &saveRegs,
|
|
std::vector<bool> &retRegs) {
|
|
unsigned startPos = 0;
|
|
while (startPos < saveRegs.size()) {
|
|
for (; startPos < saveRegs.size() && !saveRegs[startPos]; ++startPos)
|
|
;
|
|
if (startPos == saveRegs.size()) {
|
|
break;
|
|
}
|
|
if (startPos + 4 <= saveRegs.size()) {
|
|
if (saveRegs[startPos] & saveRegs[startPos + 2]) {
|
|
if (!saveRegs[startPos + 1] & !retRegs[startPos + 1]) {
|
|
saveRegs[startPos + 1] = true;
|
|
}
|
|
if (!saveRegs[startPos + 3] & !retRegs[startPos + 3]) {
|
|
saveRegs[startPos + 3] = true;
|
|
}
|
|
} else if (saveRegs[startPos] & saveRegs[startPos + 3]) {
|
|
if (startPos + 4 < saveRegs.size()) {
|
|
if (!saveRegs[startPos + 4]) {
|
|
if (!saveRegs[startPos + 1] & !retRegs[startPos + 1]) {
|
|
saveRegs[startPos + 1] = true;
|
|
}
|
|
if (!saveRegs[startPos + 2] & !retRegs[startPos + 2]) {
|
|
saveRegs[startPos + 2] = true;
|
|
}
|
|
}
|
|
} else {
|
|
if (!saveRegs[startPos + 1] & !retRegs[startPos + 1]) {
|
|
saveRegs[startPos + 1] = true;
|
|
}
|
|
if (!saveRegs[startPos + 2] & !retRegs[startPos + 2]) {
|
|
saveRegs[startPos + 2] = true;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
unsigned winBound =
|
|
std::min(static_cast<unsigned>(saveRegs.size()), startPos + 4);
|
|
for (; startPos < winBound && saveRegs[startPos]; ++startPos)
|
|
;
|
|
}
|
|
}
|
|
|
|
void GraphColor::getCallerSaveRegisters() {
|
|
unsigned callerSaveNumGRF = kernel.stackCall.getCallerSaveLastGRF() + 1;
|
|
|
|
for (BB_LIST_ITER it = builder.kernel.fg.begin();
|
|
it != builder.kernel.fg.end(); ++it) {
|
|
if ((*it)->isEndWithFCall()) {
|
|
//
|
|
// Determine the caller-save registers per call site.
|
|
//
|
|
gra.callerSaveRegsMap[(*it)].resize(callerSaveNumGRF, false);
|
|
gra.retRegsMap[(*it)].resize(callerSaveNumGRF, false);
|
|
unsigned callerSaveRegCount = 0;
|
|
G4_INST *callInst = (*it)->back();
|
|
unsigned pseudoVCAId =
|
|
builder.kernel.fg.fcallToPseudoDclMap[callInst->asCFInst()]
|
|
.VCA->getRegVar()
|
|
->getId();
|
|
vISA_ASSERT((*it)->Succs.size() == 1,
|
|
"fcall basic block cannot have more than 1 successor");
|
|
|
|
for (unsigned i = 0; i < numVar; i++) {
|
|
if (i != pseudoVCAId &&
|
|
kernel.fg.isPseudoVCEDcl(lrs[i]->getDcl()) != true &&
|
|
intf.interfereBetween(pseudoVCAId, i) == true) {
|
|
if (!builder.isPreDefArg(lrs[i]->getDcl())) {
|
|
            // It is possible that we end up with an unallocated spill
            // variable when using the new fail safe RA.
|
|
if (lrs[i]->getDcl()->isSpilled() &&
|
|
kernel.getOption(vISA_NewFailSafeRA))
|
|
continue;
|
|
// NOTE: Spilled live ranges should not be caller-save.
|
|
vISA_ASSERT(lrs[i]->getPhyReg()->isGreg(), ERROR_REGALLOC);
|
|
unsigned startReg = lrs[i]->getPhyReg()->asGreg()->getRegNum();
|
|
unsigned endReg = startReg + lrs[i]->getDcl()->getNumRows();
|
|
startReg =
|
|
(startReg < callerSaveNumGRF) ? startReg : callerSaveNumGRF;
|
|
startReg = (startReg > 0) ? startReg : 1;
|
|
endReg = (endReg < callerSaveNumGRF) ? endReg : callerSaveNumGRF;
|
|
endReg = (endReg > 0) ? endReg : 1;
|
|
for (unsigned j = startReg; j < endReg; j++) {
|
|
if (builder.isPreDefRet(lrs[i]->getDcl())) {
|
|
if (gra.retRegsMap[(*it)][j] == false) {
|
|
gra.retRegsMap[(*it)][j] = true;
|
|
}
|
|
} else {
|
|
if (gra.callerSaveRegsMap[(*it)][j] == false) {
|
|
gra.callerSaveRegsMap[(*it)][j] = true;
|
|
callerSaveRegCount++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
gra.callerSaveRegCountMap[(*it)] = callerSaveRegCount;
|
|
VISA_DEBUG_VERBOSE({
|
|
std::cout << "Caller save size: "
|
|
<< callerSaveRegCount * builder.getGRFSize()
|
|
<< " bytes for fcall at cisa id "
|
|
<< (*it)->back()->getVISAId() << "\n";
|
|
});
|
|
}
|
|
}
|
|
}
|
|
|
|
//
|
|
// Add caller save/restore code before/after each stack call.
|
|
//
|
|
void GlobalRA::addCallerSaveRestoreCode() {
|
|
uint32_t maxCallerSaveSize = 0;
|
|
|
|
for (G4_BB *bb : builder.kernel.fg) {
|
|
if (bb->isEndWithFCall()) {
|
|
//
|
|
// Determine the caller-save registers per call site.
|
|
//
|
|
G4_INST *callInst = bb->back();
|
|
G4_BB *afterFCallBB = bb->Succs.front();
|
|
|
|
OptimizeActiveRegsFootprint(callerSaveRegsMap[bb], retRegsMap[bb]);
|
|
|
|
unsigned callerSaveRegsWritten = 0;
|
|
for (bool csr : callerSaveRegsMap[bb])
|
|
callerSaveRegsWritten += (csr ? 1 : 0);
|
|
|
|
INST_LIST_ITER insertSaveIt = bb->end();
|
|
--insertSaveIt, --insertSaveIt;
|
|
while ((*insertSaveIt)->isPseudoKill()) {
|
|
--insertSaveIt;
|
|
}
|
|
vISA_ASSERT((*insertSaveIt)->isCallerSave(), ERROR_REGALLOC);
|
|
INST_LIST_ITER rmIt = insertSaveIt;
|
|
if (insertSaveIt == bb->begin()) {
|
|
insertSaveIt = bb->end();
|
|
}
|
|
|
|
if (insertSaveIt != bb->end()) {
|
|
++insertSaveIt;
|
|
} else {
|
|
insertSaveIt = bb->begin();
|
|
}
|
|
if (callerSaveRegCountMap[bb] > 0) {
|
|
if (builder.kernel.getOption(vISA_GenerateDebugInfo)) {
|
|
builder.kernel.getKernelDebugInfo()->clearOldInstList();
|
|
builder.kernel.getKernelDebugInfo()->setOldInstList(bb);
|
|
}
|
|
|
|
saveActiveRegs(callerSaveRegsMap[bb], 0,
|
|
builder.kernel.fg.callerSaveAreaOffset, bb, insertSaveIt,
|
|
callerSaveInsts[callInst]);
|
|
|
|
// mark instructions for EU Fusion WA
|
|
for (auto save : callerSaveInsts[callInst])
|
|
addEUFusionCallWAInst(save);
|
|
|
|
if (builder.kernel.getOption(vISA_GenerateDebugInfo)) {
|
|
auto deltaInstList =
|
|
builder.kernel.getKernelDebugInfo()->getDeltaInstructions(bb);
|
|
for (auto jt : deltaInstList) {
|
|
builder.kernel.getKernelDebugInfo()->addCallerSaveInst(bb, jt);
|
|
}
|
|
}
|
|
}
|
|
bb->erase(rmIt);
|
|
INST_LIST_ITER insertRestIt = afterFCallBB->begin();
|
|
for (; !(*insertRestIt)->isCallerRestore(); ++insertRestIt)
|
|
;
|
|
if (callerSaveRegCountMap[bb] > 0) {
|
|
if (builder.kernel.getOption(vISA_GenerateDebugInfo)) {
|
|
builder.kernel.getKernelDebugInfo()->clearOldInstList();
|
|
builder.kernel.getKernelDebugInfo()->setOldInstList(afterFCallBB);
|
|
}
|
|
|
|
restoreActiveRegs(callerSaveRegsMap[bb], 0,
|
|
builder.kernel.fg.callerSaveAreaOffset, afterFCallBB,
|
|
insertRestIt, callerRestoreInsts[callInst], true);
|
|
|
|
// mark instructions for EU Fusion WA
|
|
for (auto restore : callerRestoreInsts[callInst])
|
|
addEUFusionCallWAInst(restore);
|
|
|
|
if (builder.kernel.getOption(vISA_GenerateDebugInfo)) {
|
|
auto deltaInsts =
|
|
builder.kernel.getKernelDebugInfo()->getDeltaInstructions(
|
|
afterFCallBB);
|
|
for (auto jt : deltaInsts) {
|
|
builder.kernel.getKernelDebugInfo()->addCallerRestoreInst(bb, jt);
|
|
}
|
|
}
|
|
}
|
|
afterFCallBB->erase(insertRestIt);
|
|
|
|
maxCallerSaveSize = std::max(maxCallerSaveSize, callerSaveRegsWritten *
|
|
builder.getGRFSize());
|
|
}
|
|
}
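  // Frame size = caller-save area offset (owords converted to bytes) plus the
  // largest caller-save block across all call sites, rounded up to 64 bytes
  // and expressed in owords.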
|
|
|
|
auto byteOffset =
|
|
builder.kernel.fg.callerSaveAreaOffset * 16 + maxCallerSaveSize;
|
|
builder.kernel.fg.frameSizeInOWord = ROUND(byteOffset, 64) / 16;
|
|
|
|
builder.instList.clear();
|
|
}
|
|
|
|
void GraphColor::getCalleeSaveRegisters() {
|
|
unsigned callerSaveNumGRF = kernel.stackCall.getCallerSaveLastGRF() + 1;
|
|
unsigned numCalleeSaveRegs = kernel.stackCall.getNumCalleeSaveRegs();
|
|
|
|
// Determine the callee-save registers.
|
|
|
|
gra.calleeSaveRegs.resize(numCalleeSaveRegs, false);
|
|
gra.calleeSaveRegCount = 0;
|
|
|
|
unsigned pseudoVCEId = builder.kernel.fg.pseudoVCEDcl->getRegVar()->getId();
|
|
unsigned stackCallStartReg = kernel.stackCall.getStackCallStartReg();
|
|
for (unsigned i = 0; i < numVar; i++) {
|
|
if (pseudoVCEId != i && intf.interfereBetween(pseudoVCEId, i)) {
|
|
if (lrs[i]->getPhyReg()) {
|
|
vISA_ASSERT(lrs[i]->getPhyReg()->isGreg(), ERROR_REGALLOC);
|
|
unsigned startReg = lrs[i]->getPhyReg()->asGreg()->getRegNum();
|
|
unsigned endReg = startReg + lrs[i]->getDcl()->getNumRows();
|
|
startReg = (startReg >= callerSaveNumGRF) ? startReg : callerSaveNumGRF;
|
|
startReg =
|
|
(startReg < stackCallStartReg) ? startReg : stackCallStartReg;
|
|
endReg = (endReg >= callerSaveNumGRF) ? endReg : callerSaveNumGRF;
|
|
endReg = (endReg < stackCallStartReg) ? endReg : stackCallStartReg;
|
|
for (unsigned j = startReg; j < endReg; j++) {
|
|
if (gra.calleeSaveRegs[j - callerSaveNumGRF] == false) {
|
|
gra.calleeSaveRegs[j - callerSaveNumGRF] = true;
|
|
gra.calleeSaveRegCount++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
//
|
|
// Add callee save/restore code at stack call function entry/exit.
|
|
//
|
|
void GlobalRA::addCalleeSaveRestoreCode() {
|
|
unsigned callerSaveNumGRF = kernel.stackCall.getCallerSaveLastGRF() + 1;
|
|
|
|
OptimizeActiveRegsFootprint(calleeSaveRegs);
|
|
unsigned calleeSaveRegsWritten = 0;
|
|
for (bool b : calleeSaveRegs)
|
|
calleeSaveRegsWritten += (b ? 1 : 0);
|
|
|
|
INST_LIST_ITER insertSaveIt = builder.kernel.fg.getEntryBB()->end();
|
|
for (--insertSaveIt; !(*insertSaveIt)->isCalleeSave(); --insertSaveIt)
|
|
;
|
|
if (calleeSaveRegCount > 0) {
|
|
if (builder.kernel.getOption(vISA_GenerateDebugInfo)) {
|
|
// Store old inst list so we can separate callee save
|
|
// instructions that get inserted.
|
|
builder.kernel.getKernelDebugInfo()->clearOldInstList();
|
|
builder.kernel.getKernelDebugInfo()->setOldInstList(
|
|
builder.kernel.fg.getEntryBB());
|
|
}
|
|
vISA_ASSERT(calleeSaveInsts.size() == 0,
|
|
"Unexpected size of callee save set");
|
|
saveActiveRegs(calleeSaveRegs, callerSaveNumGRF,
|
|
builder.kernel.fg.calleeSaveAreaOffset,
|
|
builder.kernel.fg.getEntryBB(), insertSaveIt,
|
|
calleeSaveInsts);
|
|
|
|
if (builder.kernel.getOption(vISA_GenerateDebugInfo)) {
|
|
// Delta of oldInstList and current instList are all
|
|
// callee save instructions.
|
|
auto instList = builder.kernel.getKernelDebugInfo()->getDeltaInstructions(
|
|
builder.kernel.fg.getEntryBB());
|
|
for (auto inst : instList) {
|
|
builder.kernel.getKernelDebugInfo()->addCalleeSaveInst(inst);
|
|
}
|
|
}
|
|
}
|
|
builder.kernel.fg.getEntryBB()->erase(insertSaveIt);
|
|
INST_LIST_ITER insertRestIt = builder.kernel.fg.getUniqueReturnBlock()->end();
|
|
for (--insertRestIt; !(*insertRestIt)->isCalleeRestore(); --insertRestIt)
|
|
;
|
|
INST_LIST_ITER eraseIt = insertRestIt++;
|
|
if (calleeSaveRegCount > 0) {
|
|
if (builder.kernel.getOption(vISA_GenerateDebugInfo)) {
|
|
// Store old inst list so we can separate callee save
|
|
// instructions that get inserted.
|
|
builder.kernel.getKernelDebugInfo()->clearOldInstList();
|
|
builder.kernel.getKernelDebugInfo()->setOldInstList(
|
|
builder.kernel.fg.getUniqueReturnBlock());
|
|
}
|
|
vISA_ASSERT(calleeRestoreInsts.size() == 0,
|
|
"Unexpected size of callee restore set");
|
|
restoreActiveRegs(calleeSaveRegs, callerSaveNumGRF,
|
|
builder.kernel.fg.calleeSaveAreaOffset,
|
|
builder.kernel.fg.getUniqueReturnBlock(), insertRestIt,
|
|
calleeRestoreInsts, false);
|
|
|
|
if (builder.kernel.getOption(vISA_GenerateDebugInfo)) {
|
|
auto instList = builder.kernel.getKernelDebugInfo()->getDeltaInstructions(
|
|
builder.kernel.fg.getUniqueReturnBlock());
|
|
for (auto inst : instList) {
|
|
builder.kernel.getKernelDebugInfo()->addCalleeRestoreInst(inst);
|
|
}
|
|
}
|
|
}
|
|
builder.kernel.fg.getUniqueReturnBlock()->erase(eraseIt);
|
|
|
|
builder.instList.clear();
|
|
|
|
// mark instructions for EU Fusion WA
|
|
for (auto save : calleeSaveInsts)
|
|
addEUFusionCallWAInst(save);
|
|
for (auto restore : calleeRestoreInsts)
|
|
addEUFusionCallWAInst(restore);
|
|
|
|
// caller-save starts after callee-save and is 64-byte aligned
|
|
auto byteOffset = builder.kernel.fg.calleeSaveAreaOffset * 16 +
|
|
calleeSaveRegsWritten * builder.getGRFSize();
|
|
builder.kernel.fg.callerSaveAreaOffset = ROUND(byteOffset, 64) / 16;
|
|
VISA_DEBUG({
|
|
std::cout << "Callee save size: "
|
|
<< calleeSaveRegCount * builder.getGRFSize() << " bytes"
|
|
<< "\n";
|
|
});
|
|
}
|
|
|
|
//
|
|
// Add code to set up the stack frame in the kernel (genx_main) entry.
|
|
//
|
|
void GlobalRA::addGenxMainStackSetupCode() {
|
|
uint32_t fpInitVal =
|
|
(uint32_t)kernel.getInt32KernelAttr(Attributes::ATTR_SpillMemOffset);
|
|
// FIXME: a potential failure here is that frameSizeInOWord is already the
// offset based on GlobalScratchOffset, which is the value of fpInitVal. So
|
|
// below we generate code to do SP = fpInitVal + frameSize, which does not
|
|
// make sense. It is correct now since when there's stack call, IGC will not
|
|
// use scratch, so fpInitVal will be 0.
|
|
unsigned frameSize = builder.kernel.fg.frameSizeInOWord;
|
|
uint16_t factor = 1;
|
|
if (useLscForSpillFill)
|
|
factor = 16;
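// With LSC spill/fill the frame size is converted from owords to bytes
// (factor of 16) before being added to SP.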
|
|
G4_Declare *framePtr = builder.kernel.fg.framePtrDcl;
|
|
G4_Declare *stackPtr = builder.kernel.fg.stackPtrDcl;
|
|
|
|
auto entryBB = builder.kernel.fg.getEntryBB();
|
|
auto insertIt = std::find_if(entryBB->begin(), entryBB->end(),
|
|
[](G4_INST *inst) { return !inst->isLabel(); });
|
|
//
|
|
// FP = spillMemOffset
|
|
//
|
|
{
|
|
G4_DstRegRegion *dst =
|
|
builder.createDst(framePtr->getRegVar(), 0, 0, 1, Type_UD);
|
|
G4_Imm *src = builder.createImm(fpInitVal, Type_UD);
|
|
G4_INST *fpInst =
|
|
builder.createMov(g4::SIMD1, dst, src, InstOpt_WriteEnable, false);
|
|
insertIt = entryBB->insertBefore(insertIt, fpInst);
|
|
|
|
builder.kernel.setBEFPSetupInst(fpInst);
|
|
|
|
if (builder.kernel.getOption(vISA_GenerateDebugInfo)) {
|
|
builder.kernel.getKernelDebugInfo()->setBEFPSetupInst(fpInst);
|
|
builder.kernel.getKernelDebugInfo()->setFrameSize(frameSize * 16);
|
|
}
|
|
}
|
|
//
|
|
// SP = FP + FrameSize (overflow-area offset + overflow-area size)
|
|
//
|
|
{
|
|
G4_DstRegRegion *dst =
|
|
builder.createDst(stackPtr->getRegVar(), 0, 0, 1, Type_UD);
|
|
G4_Imm *src = builder.createImm(fpInitVal + frameSize * factor, Type_UD);
|
|
G4_INST *spIncInst =
|
|
builder.createMov(g4::SIMD1, dst, src, InstOpt_WriteEnable, false);
|
|
|
|
builder.kernel.setBESPSetupInst(spIncInst);
|
|
entryBB->insertBefore(++insertIt, spIncInst);
|
|
}
|
|
|
|
VISA_DEBUG(std::cout << "Total frame size: " << frameSize * 16 << " bytes"
|
|
<< "\n");
|
|
}
|
|
|
|
//
|
|
// Add code to setup the stack frame in callee.
|
|
//
|
|
void GlobalRA::addCalleeStackSetupCode() {
|
|
int frameSize = (int)builder.kernel.fg.frameSizeInOWord;
|
|
uint16_t factor = 1;
|
|
// convert framesize to bytes from oword for LSC
|
|
if (useLscForSpillFill)
|
|
factor = 16;
|
|
G4_Declare *framePtr = builder.kernel.fg.framePtrDcl;
|
|
G4_Declare *stackPtr = builder.kernel.fg.stackPtrDcl;
|
|
|
|
vISA_ASSERT(frameSize > 0, "frame size cannot be 0");
|
|
|
|
//
|
|
// BE_FP = BE_SP
|
|
// BE_SP += FrameSize
|
|
//
|
|
{
|
|
G4_DstRegRegion *dst =
|
|
builder.createDst(stackPtr->getRegVar(), 0, 0, 1, Type_UD);
|
|
G4_DstRegRegion *fp_dst =
|
|
builder.createDst(framePtr->getRegVar(), 0, 0, 1, Type_UD);
|
|
const RegionDesc *rDesc = builder.getRegionScalar();
|
|
G4_Operand *src0 =
|
|
builder.createSrc(stackPtr->getRegVar(), 0, 0, rDesc, Type_UD);
|
|
G4_Operand *sp_src =
|
|
builder.createSrc(stackPtr->getRegVar(), 0, 0, rDesc, Type_UD);
|
|
G4_Imm *src1 = builder.createImm(frameSize * factor, Type_UD);
|
|
auto createBEFP = builder.createMov(g4::SIMD1, fp_dst, sp_src,
|
|
InstOpt_WriteEnable, false);
|
|
createBEFP->addComment("vISA_FP = vISA_SP");
|
|
auto addInst = builder.createBinOp(G4_add, g4::SIMD1, dst, src0, src1,
|
|
InstOpt_WriteEnable, false);
|
|
addInst->addComment("vISA_SP += vISA_frameSize");
|
|
G4_BB *entryBB = builder.kernel.fg.getEntryBB();
|
|
auto insertIt =
|
|
std::find(entryBB->begin(), entryBB->end(), getSaveBE_FPInst());
|
|
vISA_ASSERT(insertIt != entryBB->end(), "Can't find BE_FP store inst");
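// The FP/SP setup is placed right after the instruction that saves the
// caller's BE_FP/BE_SP (created in addStoreRestoreToReturn).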
|
|
|
|
builder.kernel.setBEFPSetupInst(createBEFP);
|
|
builder.kernel.setBESPSetupInst(addInst);
|
|
|
|
if (builder.kernel.getOption(vISA_GenerateDebugInfo)) {
|
|
builder.kernel.getKernelDebugInfo()->setBEFPSetupInst(createBEFP);
|
|
builder.kernel.getKernelDebugInfo()->setFrameSize(frameSize * 16);
|
|
}
|
|
|
|
addEUFusionCallWAInst(createBEFP);
|
|
addEUFusionCallWAInst(addInst);
|
|
|
|
if (EUFusionCallWANeeded()) {
|
|
builder.kernel.getKernelDebugInfo()->setCallerBEFPSaveInst(createBEFP);
|
|
}
|
|
|
|
insertIt++;
|
|
entryBB->insertBefore(insertIt, createBEFP);
|
|
entryBB->insertBefore(insertIt, addInst);
|
|
}
|
|
|
|
// Stack is destroyed in function addStoreRestoreToReturn() where part FDE is
|
|
// restored before fret. This is an optimization as 1 SIMD4 instruction
|
|
// restores ret %ip, ret EM, caller's BE_FP, BE_SP.
|
|
|
|
builder.instList.clear();
|
|
|
|
VISA_DEBUG(std::cout << "\nTotal frame size: " << frameSize * 16
|
|
<< " bytes\n");
|
|
}
|
|
|
|
//
|
|
// Add A0 save/restore code for stack calls.
|
|
//
|
|
void GraphColor::addA0SaveRestoreCode() {
|
|
uint8_t numA0Elements = (uint8_t)builder.getNumAddrRegisters();
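// For every fcall whose pseudo A0 node did not get a physical register, copy
// a0 to a temp GRF right before the call and restore it at the top of the
// return block.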
|
|
|
|
int count = 0;
|
|
for (auto bb : builder.kernel.fg) {
|
|
if (bb->isEndWithFCall()) {
|
|
G4_BB *succ = bb->Succs.front();
|
|
auto fcallInst = bb->back()->asCFInst();
|
|
G4_RegVar *assocPseudoA0 =
|
|
bb->getParent().fcallToPseudoDclMap[fcallInst].A0->getRegVar();
|
|
|
|
if (!assocPseudoA0->getPhyReg()) {
|
|
// Insert save/restore code because the pseudo node did not get an
|
|
// allocation
|
|
const char *name = builder.getNameString(20, "SA0_%d", count++);
|
|
G4_Declare *savedDcl =
|
|
builder.createDeclare(name, G4_GRF, numA0Elements, 1, Type_UW);
|
|
|
|
{
|
|
//
|
|
// (W) mov (16) TMP_GRF<1>:uw a0.0<16;16,1>:uw
|
|
//
|
|
G4_DstRegRegion *dst =
|
|
builder.createDst(savedDcl->getRegVar(), 0, 0, 1, Type_UW);
|
|
const RegionDesc *rDesc = builder.getRegionStride1();
|
|
G4_Operand *src =
|
|
builder.createSrc(regPool.getAddrReg(), 0, 0, rDesc, Type_UW);
|
|
G4_INST *saveInst = builder.createMov(
|
|
G4_ExecSize(numA0Elements), dst, src, InstOpt_WriteEnable, false);
|
|
INST_LIST_ITER insertIt = std::prev(bb->end());
|
|
bb->insertBefore(insertIt, saveInst);
|
|
|
|
gra.addEUFusionCallWAInst(saveInst);
|
|
}
|
|
|
|
{
|
|
//
|
|
// (W) mov (16) a0.0<1>:uw TMP_GRF<16;16,1>:uw
|
|
//
|
|
G4_DstRegRegion *dst =
|
|
builder.createDst(regPool.getAddrReg(), 0, 0, 1, Type_UW);
|
|
const RegionDesc *rDesc = builder.getRegionStride1();
|
|
G4_Operand *src =
|
|
builder.createSrc(savedDcl->getRegVar(), 0, 0, rDesc, Type_UW);
|
|
G4_INST *restoreInst = builder.createMov(
|
|
G4_ExecSize(numA0Elements), dst, src, InstOpt_WriteEnable, false);
|
|
auto insertIt =
|
|
std::find_if(succ->begin(), succ->end(),
|
|
[](G4_INST *inst) { return !inst->isLabel(); });
|
|
succ->insertBefore(insertIt, restoreInst);
|
|
|
|
gra.addEUFusionCallWAInst(restoreInst);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
builder.instList.clear();
|
|
}
|
|
|
|
//
|
|
// Add Flag save/restore code for stack calls.
|
|
//
|
|
void GraphColor::addFlagSaveRestoreCode() {
|
|
int count = 0;
|
|
int num32BitFlags = builder.getNumFlagRegisters() / 2;
|
|
|
|
// each 32-bit flag gets a declare
|
|
// ToDo: should we use flag ARF directly here?
|
|
std::vector<G4_Declare *> tmpFlags;
|
|
for (int i = 0; i < num32BitFlags; ++i) {
|
|
G4_Declare *tmpFlag = builder.createTempFlag(2);
|
|
tmpFlag->getRegVar()->setPhyReg(regPool.getFlagAreg(i), 0);
|
|
tmpFlags.push_back(tmpFlag);
|
|
}
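// For every fcall whose pseudo flag node did not get a physical register,
// save all flag registers to a temp GRF before the call and restore them at
// the start of the return block.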
|
|
|
|
for (auto bb : builder.kernel.fg) {
|
|
if (bb->isEndWithFCall()) {
|
|
G4_BB *succ = bb->Succs.front();
|
|
auto fcallInst = bb->back()->asCFInst();
|
|
G4_RegVar *assocPseudoFlag =
|
|
bb->getParent().fcallToPseudoDclMap[fcallInst].Flag->getRegVar();
|
|
|
|
if (!assocPseudoFlag->getPhyReg()) {
|
|
// Insert save/restore code because the pseudo node did not get an
|
|
// allocation
|
|
const char *name = builder.getNameString(32, "SFLAG_%d", count++);
|
|
G4_Declare *savedDcl1 =
|
|
builder.createDeclare(name, G4_GRF, num32BitFlags, 1, Type_UD);
|
|
{
|
|
//
|
|
// (W) mov (1) TMP_GRF.0<1>:ud f0.0:ud
|
|
// (W) mov (1) TMP_GRF.1<1>:ud f1.0:ud
|
|
//
|
|
auto createFlagSaveInst = [&](int index) {
|
|
auto flagDcl = tmpFlags[index];
|
|
G4_DstRegRegion *dst =
|
|
builder.createDst(savedDcl1->getRegVar(), 0, index, 1, Type_UD);
|
|
G4_Operand *src = builder.createSrc(
|
|
flagDcl->getRegVar(), 0, 0, builder.getRegionScalar(), Type_UD);
|
|
return builder.createMov(g4::SIMD1, dst, src, InstOpt_WriteEnable,
|
|
false);
|
|
};
|
|
|
|
auto iter = std::prev(bb->end());
|
|
for (int i = 0; i < num32BitFlags; ++i) {
|
|
auto saveInst = createFlagSaveInst(i);
|
|
bb->insertBefore(iter, saveInst);
|
|
|
|
gra.addEUFusionCallWAInst(saveInst);
|
|
}
|
|
}
|
|
|
|
{
|
|
//
|
|
// mov (1) f0.0:ud TMP_GRF.0<0;1,0>:ud
|
|
// mov (1) f1.0:ud TMP_GRF.1<0;1,0>:ud
|
|
//
|
|
auto createRestoreFlagInst = [&](int index) {
|
|
auto flagDcl = tmpFlags[index];
|
|
G4_DstRegRegion *dst =
|
|
builder.createDst(flagDcl->getRegVar(), 0, 0, 1, Type_UD);
|
|
const RegionDesc *rDesc = builder.getRegionScalar();
|
|
G4_Operand *src = builder.createSrc(savedDcl1->getRegVar(), 0,
|
|
index, rDesc, Type_UD);
|
|
return builder.createMov(g4::SIMD1, dst, src, InstOpt_WriteEnable,
|
|
false);
|
|
};
|
|
auto insertIt =
|
|
std::find_if(succ->begin(), succ->end(),
|
|
[](G4_INST *inst) { return !inst->isLabel(); });
|
|
for (int i = 0; i < num32BitFlags; ++i) {
|
|
auto restoreInst = createRestoreFlagInst(i);
|
|
succ->insertBefore(insertIt, restoreInst);
|
|
|
|
gra.addEUFusionCallWAInst(restoreInst);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
builder.instList.clear();
|
|
}
|
|
|
|
void GraphColor::getSaveRestoreRegister() {
|
|
if (!builder.getIsKernel()) {
|
|
getCalleeSaveRegisters();
|
|
}
|
|
getCallerSaveRegisters();
|
|
}
|
|
|
|
//
|
|
// Get the forbidden vector size
|
|
//
|
|
unsigned ForbiddenRegs::getForbiddenVectorSize(G4_RegFileKind regKind) const {
|
|
switch (regKind) {
|
|
case G4_GRF:
|
|
case G4_INPUT:
|
|
return builder.kernel.getNumRegTotal();
|
|
case G4_ADDRESS:
|
|
return builder.getNumAddrRegisters();
|
|
case G4_FLAG:
|
|
return builder.getNumFlagRegisters();
|
|
case G4_SCALAR:
|
|
return builder.kernel.getSRFInWords();
|
|
default:
|
|
vISA_ASSERT_UNREACHABLE("illegal reg file");
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
//
|
|
// Get the forbidden vectors of reserved GRFs
|
|
// May be reserved for user, stack call, and spill
|
|
// This is the default RC for all GRF live ranges.
|
|
//
|
|
void ForbiddenRegs::generateReservedGRFForbidden(
|
|
unsigned reserveSpillSize) {
|
|
bool hasStackCall = builder.kernel.fg.getHasStackCalls() ||
|
|
builder.kernel.fg.getIsStackCallFunc();
|
|
uint32_t reservedGRFNum = builder.getuint32Option(vISA_ReservedGRFNum);
|
|
uint32_t reservedFromFrontGRFNum =
|
|
builder.getuint32Option(vISA_ReservedFromFrontGRFNum);
|
|
unsigned int stackCallRegSize =
|
|
hasStackCall ? builder.kernel.stackCall.numReservedABIGRF() : 0;
|
|
|
|
// r0 - Forbidden when platform is not 3d
|
|
// The last 1-3 GRFs may be reserved for stack call ABIs.
|
|
int index = static_cast<int>(forbiddenKind::FBD_RESERVEDGRF);
|
|
unsigned totalGRFNum = builder.kernel.getNumRegTotal();
|
|
forbiddenVec[index].resize(getForbiddenVectorSize(G4_GRF));
|
|
forbiddenVec[index].clear();
|
|
|
|
if (builder.kernel.getKernelType() != VISA_3D || !builder.canWriteR0() ||
|
|
reserveSpillSize > 0 || builder.kernel.getOption(vISA_PreserveR0InR0)) {
|
|
forbiddenVec[index].set(0, true);
|
|
}
|
|
|
|
if (builder.mustReserveR1()) {
|
|
// r1 is reserved for SIP kernel
|
|
forbiddenVec[index].set(1, true);
|
|
}
|
|
|
|
unsigned reservedRegSize = stackCallRegSize + reserveSpillSize;
|
|
for (unsigned int i = 0; i < reservedRegSize; i++) {
|
|
forbiddenVec[index].set(totalGRFNum - 1 - i, true);
|
|
}
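// User-reserved GRFs (vISA_ReservedGRFNum) are carved out from the high end,
// starting below both the stack-call/spill reservation and the last 16 GRFs.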
|
|
|
|
unsigned largestNoneReservedReg = totalGRFNum - reservedRegSize - 1;
|
|
if (totalGRFNum - reservedRegSize >= totalGRFNum - 16) {
|
|
largestNoneReservedReg = totalGRFNum - 16 - 1;
|
|
}
|
|
|
|
if (totalGRFNum - reservedRegSize < reservedGRFNum) {
|
|
vISA_ASSERT(false, "After reservation, there are not enough registers!");
|
|
}
|
|
|
|
for (unsigned int i = 0; i < reservedGRFNum; i++) {
|
|
forbiddenVec[index].set(largestNoneReservedReg - i, true);
|
|
}
|
|
|
|
for (unsigned int i = 0; i < reservedFromFrontGRFNum; i++) {
|
|
forbiddenVec[index].set(i, true);
|
|
}
|
|
|
|
auto &fg = builder.kernel.fg;
|
|
if (fg.reserveSR) {
|
|
forbiddenVec[index].set(
|
|
fg.scratchRegDcl->getRegVar()->getPhyReg()->asGreg()->getRegNum(),
|
|
true);
|
|
}
|
|
}
|
|
|
|
// EOT can only use the last 16 registers
|
|
void ForbiddenRegs::generateEOTGRFForbidden() {
|
|
forbiddenVec[(size_t)forbiddenKind::FBD_EOT].resize(
|
|
getForbiddenVectorSize(G4_GRF));
|
|
forbiddenVec[(size_t)forbiddenKind::FBD_EOT].clear();
|
|
for (unsigned i = 0; i < builder.kernel.getNumRegTotal() - 16; i++) {
|
|
forbiddenVec[(size_t)forbiddenKind::FBD_EOT].set(i, true);
|
|
}
|
|
forbiddenVec[(size_t)forbiddenKind::FBD_EOT] |=
|
|
forbiddenVec[(size_t)forbiddenKind::FBD_RESERVEDGRF];
|
|
}
|
|
|
|
void ForbiddenRegs::generateLastGRFForbidden() {
|
|
forbiddenVec[(size_t)forbiddenKind::FBD_LASTGRF].resize(
|
|
getForbiddenVectorSize(G4_GRF));
|
|
forbiddenVec[(size_t)forbiddenKind::FBD_LASTGRF].clear();
|
|
forbiddenVec[(size_t)forbiddenKind::FBD_LASTGRF].set(
|
|
builder.kernel.getNumRegTotal() - 1, true);
|
|
forbiddenVec[(size_t)forbiddenKind::FBD_LASTGRF] |=
|
|
forbiddenVec[(size_t)forbiddenKind::FBD_RESERVEDGRF];
|
|
}
|
|
|
|
void ForbiddenRegs::generateEOTLastGRFForbidden() {
|
|
forbiddenVec[(size_t)forbiddenKind::FBD_EOTLASTGRF].resize(
|
|
getForbiddenVectorSize(G4_GRF));
|
|
forbiddenVec[(size_t)forbiddenKind::FBD_EOTLASTGRF].clear();
|
|
forbiddenVec[(size_t)forbiddenKind::FBD_EOTLASTGRF] |=
|
|
forbiddenVec[(size_t)forbiddenKind::FBD_EOT];
|
|
forbiddenVec[(size_t)forbiddenKind::FBD_EOTLASTGRF] |=
|
|
forbiddenVec[(size_t)forbiddenKind::FBD_LASTGRF];
|
|
}
|
|
|
|
|
|
//
|
|
// mark forbidden registers for caller-save pseudo var
|
|
//
|
|
void ForbiddenRegs::generateCallerSaveGRFForbidden() {
|
|
unsigned int startCalleeSave = builder.kernel.stackCall.calleeSaveStart();
|
|
unsigned int endCalleeSave =
|
|
startCalleeSave + builder.kernel.stackCall.getNumCalleeSaveRegs();
|
|
// r60-r124 are caller save regs for SKL
|
|
forbiddenVec[(size_t)forbiddenKind::FBD_CALLERSAVE].resize(
|
|
getForbiddenVectorSize(G4_GRF));
|
|
forbiddenVec[(size_t)forbiddenKind::FBD_CALLERSAVE].clear();
|
|
for (unsigned int i = startCalleeSave; i < endCalleeSave; i++) {
|
|
forbiddenVec[(size_t)forbiddenKind::FBD_CALLERSAVE].set(i, true);
|
|
}
|
|
forbiddenVec[(size_t)forbiddenKind::FBD_CALLERSAVE] |=
|
|
forbiddenVec[(size_t)forbiddenKind::FBD_RESERVEDGRF];
|
|
}
|
|
|
|
//
|
|
// mark forbidden registers for callee-save pseudo var
|
|
//
|
|
void ForbiddenRegs::generateCalleeSaveGRFForbidden() {
|
|
unsigned int numCallerSaveGRFs =
|
|
builder.kernel.stackCall.getCallerSaveLastGRF() + 1;
|
|
forbiddenVec[(size_t)forbiddenKind::FBD_CALLEESAVE].resize(
|
|
getForbiddenVectorSize(G4_GRF));
|
|
forbiddenVec[(size_t)forbiddenKind::FBD_CALLEESAVE].clear();
|
|
for (unsigned int i = 1; i < numCallerSaveGRFs; i++) {
|
|
forbiddenVec[(size_t)forbiddenKind::FBD_CALLEESAVE].set(i, true);
|
|
}
|
|
forbiddenVec[(size_t)forbiddenKind::FBD_CALLEESAVE] |=
|
|
forbiddenVec[(size_t)forbiddenKind::FBD_RESERVEDGRF];
|
|
}
|
|
|
|
//
|
|
// Add GRF caller/callee save/restore code for stack calls.
|
|
// localSpillAreaOwordSize specifies the starting offset of the
|
|
// caller/callee-save area in this frame. It is 64-byte aligned.
|
|
//
|
|
void GlobalRA::addSaveRestoreCode(unsigned localSpillAreaOwordSize) {
|
|
if (builder.getIsKernel()) {
|
|
builder.kernel.fg.callerSaveAreaOffset = localSpillAreaOwordSize;
|
|
} else {
|
|
builder.kernel.fg.calleeSaveAreaOffset = localSpillAreaOwordSize;
|
|
addCalleeSaveRestoreCode();
|
|
}
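// callerSaveAreaOffset is now known: taken directly from
// localSpillAreaOwordSize for a kernel, or computed by
// addCalleeSaveRestoreCode for a stack call function.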
|
|
addCallerSaveRestoreCode();
|
|
if (builder.getIsKernel()) {
|
|
addGenxMainStackSetupCode();
|
|
} else {
|
|
addCalleeStackSetupCode();
|
|
}
|
|
stackCallProlog();
|
|
builder.instList.clear();
|
|
}
|
|
|
|
//
|
|
// If the graph has stack calls, then add the caller-save pseudo code
|
|
// immediately before and after the stack call. The pseudo code is either
|
|
// converted to actual save/restore code or is eliminated at the end of
|
|
// coloringRegAlloc().
|
|
//
|
|
void GlobalRA::addCallerSavePseudoCode() {
|
|
unsigned retID = 0;
|
|
|
|
for (G4_BB *bb : builder.kernel.fg) {
|
|
if (bb->isEndWithFCall()) {
|
|
// GRF caller save/restore
|
|
auto fcallInst = bb->back()->asCFInst();
|
|
G4_Declare *pseudoVCADcl =
|
|
bb->getParent().fcallToPseudoDclMap[fcallInst].VCA;
|
|
G4_DstRegRegion *dst =
|
|
builder.createDst(pseudoVCADcl->getRegVar(), 0, 0, 1, Type_UD);
|
|
G4_INST *saveInst = builder.createInternalIntrinsicInst(
|
|
nullptr, Intrinsic::CallerSave, g4::SIMD1, dst, nullptr, nullptr,
|
|
nullptr, InstOpt_WriteEnable);
|
|
saveInst->inheritDIFrom(fcallInst);
|
|
INST_LIST_ITER callBBIt = bb->end();
|
|
bb->insertBefore(--callBBIt, saveInst);
|
|
|
|
auto fcall = builder.getFcallInfo(bb->back());
|
|
vISA_ASSERT(fcall != std::nullopt, "fcall info not found");
|
|
uint16_t retSize = fcall->getRetSize();
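// For calls that return a value, create a hardwired declare covering the
// return-value GRFs and record it in fcallRetMap.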
|
|
if (retSize > 0) {
|
|
const char *name =
|
|
builder.getNameString(32, "FCALL_RETVAL_%d", retID++);
|
|
auto retDcl = builder.createHardwiredDeclare(
|
|
kernel.numEltPerGRF<Type_UD>() * retSize, Type_UD,
|
|
kernel.stackCall.retReg, 0);
|
|
retDcl->setName(name);
|
|
addVarToRA(retDcl);
|
|
fcallRetMap.emplace(pseudoVCADcl, retDcl);
|
|
}
|
|
|
|
vISA_ASSERT(bb->Succs.size() == 1,
|
|
"fcall basic block cannot have more than 1 successor node");
|
|
|
|
G4_BB *retBB = bb->Succs.front();
|
|
const RegionDesc *rd = builder.getRegionScalar();
|
|
G4_Operand *src =
|
|
builder.createSrc(pseudoVCADcl->getRegVar(), 0, 0, rd, Type_UD);
|
|
INST_LIST_ITER retBBIt = retBB->begin();
|
|
for (; retBBIt != retBB->end() && (*retBBIt)->isLabel(); ++retBBIt)
|
|
;
|
|
G4_INST *restoreInst = builder.createInternalIntrinsicInst(
|
|
nullptr, Intrinsic::CallerRestore, g4::SIMD1, nullptr, src, nullptr,
|
|
nullptr, InstOpt_WriteEnable);
|
|
restoreInst->inheritDIFrom(fcallInst);
|
|
retBB->insertBefore(retBBIt, restoreInst);
|
|
}
|
|
}
|
|
builder.instList.clear();
|
|
}
|
|
|
|
//
|
|
// If the graph has stack calls, then add the callee-save pseudo code at the
|
|
// entry/exit blocks of the function. The pseudo code is either converted to
|
|
// actual save/restore code or is eliminated at the end of coloringRegAlloc().
|
|
//
|
|
void GlobalRA::addCalleeSavePseudoCode() {
|
|
G4_Declare *pseudoVCEDcl = builder.kernel.fg.pseudoVCEDcl;
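// The callee-save pseudo is inserted right after the entry block's label(s),
// and the callee-restore pseudo right before the fret in the unique return
// block.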
|
|
|
|
G4_DstRegRegion *dst =
|
|
builder.createDst(pseudoVCEDcl->getRegVar(), 0, 0, 1, Type_UD);
|
|
auto saveInst = builder.createInternalIntrinsicInst(
|
|
nullptr, Intrinsic::CalleeSave, g4::SIMD1, dst, nullptr, nullptr, nullptr,
|
|
InstOpt_WriteEnable);
|
|
INST_LIST_ITER insertIt = builder.kernel.fg.getEntryBB()->begin();
|
|
for (; insertIt != builder.kernel.fg.getEntryBB()->end() &&
|
|
(*insertIt)->isLabel();
|
|
++insertIt) { /* void */
|
|
};
|
|
builder.kernel.fg.getEntryBB()->insertBefore(insertIt, saveInst);
|
|
|
|
G4_BB *exitBB = builder.kernel.fg.getUniqueReturnBlock();
|
|
const RegionDesc *rDesc = builder.getRegionScalar();
|
|
G4_Operand *src =
|
|
builder.createSrc(pseudoVCEDcl->getRegVar(), 0, 0, rDesc, Type_UD);
|
|
G4_INST *restoreInst = builder.createInternalIntrinsicInst(
|
|
nullptr, Intrinsic::CalleeRestore, g4::SIMD1, nullptr, src, nullptr,
|
|
nullptr, InstOpt_WriteEnable);
|
|
INST_LIST_ITER exitBBIt = exitBB->end();
|
|
--exitBBIt;
|
|
vISA_ASSERT((*exitBBIt)->isFReturn(), ERROR_REGALLOC);
|
|
exitBB->insertBefore(exitBBIt, restoreInst);
|
|
builder.instList.clear();
|
|
}
|
|
|
|
void GlobalRA::storeCEInProlog() {
|
|
if (!kernel.getOption(vISA_storeCE))
|
|
return;
|
|
|
|
// If we have to store CE in the prolog, we emit:
|
|
// TmpReg (GRF_Aligned) = CE0.0
|
|
// Store TmpReg @ FP+Offset
|
|
//
|
|
// Where Offset = 1 GRF size in bytes
|
|
|
|
// Create new variable equal to GRF size so it's always GRF aligned.
|
|
// It's transitory so shouldn't impact register pressure. We want to
|
|
// write CE0.0 in 0th location of this variable so that it can be
|
|
// used as send payload.
|
|
auto TmpReg = builder.createDeclare(
|
|
"TmpCEReg", G4_GRF, builder.numEltPerGRF<Type_UD>(), 1, Type_UD);
|
|
auto *DstRgn = builder.createDstRegRegion(TmpReg, 1);
|
|
auto *CEReg = regPool.getMask0Reg();
|
|
auto *SrcOpnd = builder.createSrc(
|
|
CEReg, 0, 0, kernel.fg.builder->getRegionScalar(), Type_UD);
|
|
auto Mov = builder.createMov(g4::SIMD1, DstRgn, SrcOpnd,
|
|
G4_InstOption::InstOpt_WriteEnable, false);
|
|
auto nextPos = kernel.fg.getEntryBB()->insertBefore(
|
|
kernel.fg.getEntryBB()->getFirstInsertPos(), Mov);
|
|
|
|
auto payloadSrc =
|
|
builder.createSrcRegRegion(TmpReg, builder.getRegionStride1());
|
|
const unsigned execSize = 8;
|
|
G4_DstRegRegion *postDst = builder.createNullDst(Type_UD);
|
|
G4_INST *store = nullptr;
|
|
unsigned int HWOffset = builder.numEltPerGRF<Type_UB>() / getHWordByteSize();
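// The CE save slot is one GRF above BE_FP; the assert below ensures the
// frame descriptor fits within that first GRF so the store cannot overwrite
// it.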
|
|
vISA_ASSERT(kernel.stackCall.getFrameDescriptorByteSize() <=
|
|
builder.numEltPerGRF<Type_UB>(),
|
|
"ce0 overwrote FDE");
|
|
kernel.getKernelDebugInfo()->setCESaveOffset(HWOffset * getHWordByteSize());
|
|
|
|
if (builder.supportsLSC()) {
|
|
auto headerOpnd = getSpillFillHeader(*kernel.fg.builder, nullptr);
|
|
store = builder.createSpill(postDst, headerOpnd, payloadSrc,
|
|
G4_ExecSize(execSize), 1, HWOffset,
|
|
builder.getBEFP(), InstOpt_WriteEnable, false);
|
|
} else {
|
|
store = builder.createSpill(postDst, payloadSrc, G4_ExecSize(execSize), 1,
|
|
HWOffset, builder.getBEFP(),
|
|
InstOpt_WriteEnable, false);
|
|
}
|
|
kernel.fg.getEntryBB()->insertAfter(nextPos, store);
|
|
|
|
if (builder.kernel.getOption(vISA_GenerateDebugInfo)) {
|
|
builder.kernel.getKernelDebugInfo()->setSaveCEInst(store);
|
|
}
|
|
}
|
|
|
|
//
|
|
// Insert store r125.[0-4] at entry and restore before return.
|
|
// Dst of store will be a hardwired temp at upper end of caller save area.
|
|
// This method emits:
|
|
// (W) mov (4) SR_BEStack<1>:ud r125.0<4;4,1>:ud <-- in prolog
|
|
// (W) mov (4) r125.0<1>:ud SR_BEStack<4;4,1>:ud <-- in epilog
|
|
void GlobalRA::addStoreRestoreToReturn() {
|
|
unsigned int size = 4;
|
|
if (kernel.stackCall.getVersion() ==
|
|
StackCallABI::StackCallABIVersion::VER_3)
|
|
size = 8;
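// Stack call ABI version 3 saves/restores 8 dwords starting at BE_FP; older
// versions save 4 dwords starting at Ret_IP (see saveRestoreSubReg below).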
|
|
|
|
unsigned regNum = kernel.stackCall.getCallerSaveLastGRF();
|
|
unsigned subRegNum = kernel.numEltPerGRF<Type_UD>() - size;
|
|
oldFPDcl = builder.createHardwiredDeclare(size, Type_UD, regNum, subRegNum);
|
|
oldFPDcl->setName(builder.getNameString(24, "CallerSaveRetIp_BE_FP"));
|
|
|
|
G4_DstRegRegion *oldFPDst =
|
|
builder.createDst(oldFPDcl->getRegVar(), 0, 0, 1, Type_UD);
|
|
const RegionDesc *rd = builder.getRegionStride1();
|
|
G4_Operand *oldFPSrc =
|
|
builder.createSrc(oldFPDcl->getRegVar(), 0, 0, rd, Type_UD);
|
|
|
|
unsigned saveRestoreSubReg =
|
|
kernel.stackCall.getVersion() == StackCallABI::StackCallABIVersion::VER_3
|
|
? kernel.stackCall.subRegs.BE_FP
|
|
: kernel.stackCall.subRegs.Ret_IP;
|
|
auto saveRestoreDecl = builder.createHardwiredDeclare(
|
|
size, Type_UD, kernel.stackCall.getFPSPGRF(), saveRestoreSubReg);
|
|
addVarToRA(saveRestoreDecl);
|
|
saveRestoreDecl->setName(builder.getNameString(24, "SR_BEStack"));
|
|
G4_DstRegRegion *FPdst =
|
|
builder.createDst(saveRestoreDecl->getRegVar(), 0, 0, 1, Type_UD);
|
|
rd = builder.getRegionStride1();
|
|
G4_Operand *FPsrc =
|
|
builder.createSrc(saveRestoreDecl->getRegVar(), 0, 0, rd, Type_UD);
|
|
|
|
saveBE_FPInst = builder.createMov(size == 4 ? g4::SIMD4 : g4::SIMD8, oldFPDst,
|
|
FPsrc, InstOpt_WriteEnable, false);
|
|
saveBE_FPInst->addComment("save vISA SP/FP to temp");
|
|
builder.setPartFDSaveInst(saveBE_FPInst);
|
|
|
|
auto entryBB = builder.kernel.fg.getEntryBB();
|
|
auto insertIt = std::find_if(entryBB->begin(), entryBB->end(),
|
|
[](G4_INST *inst) { return !inst->isLabel(); });
|
|
entryBB->insertBefore(insertIt, saveBE_FPInst);
|
|
|
|
auto fretBB = builder.kernel.fg.getUniqueReturnBlock();
|
|
auto iter = std::prev(fretBB->end());
|
|
vISA_ASSERT((*iter)->isFReturn(), "fret BB must end with fret");
|
|
|
|
// The following 4 cases exist for the combinations of the EU fusion WA and -skipFDE:
|
|
// 1. No WA needed, no -skipFDE: restore r127 from r59
|
|
// 2. No WA needed, -skipFDE: restore r127 from r59 and skip FDE store in
|
|
// leaf function
|
|
// 3. WA needed, no -skipFDE: restore r127 using load reading FDE
|
|
// 4. WA needed, -skipFDE: restore r127 from r59 in leaf function. In
|
|
// non-leaf functions, use a load to read the stored FDE.
|
|
if (!EUFusionCallWANeeded() || canSkipFDE()) {
|
|
restoreBE_FPInst =
|
|
builder.createMov(size == 4 ? g4::SIMD4 : g4::SIMD8, FPdst, oldFPSrc,
|
|
InstOpt_WriteEnable, false);
|
|
fretBB->insertBefore(iter, restoreBE_FPInst);
|
|
} else {
|
|
// emit frame descriptor
|
|
auto dstDcl =
|
|
builder.createHardwiredDeclare(8, Type_UD, kernel.stackCall.getFPSPGRF(), 0);
|
|
dstDcl->setName(builder.getNameString(24, "FrameDescriptorGRF"));
|
|
auto dstData = builder.createDstRegRegion(dstDcl, 1);
|
|
const unsigned execSize = 8;
|
|
G4_INST *load = nullptr;
|
|
if (builder.supportsLSC()) {
|
|
auto headerOpnd = getSpillFillHeader(*kernel.fg.builder, nullptr);
|
|
load =
|
|
builder.createFill(headerOpnd, dstData, G4_ExecSize(execSize), 1, 0,
|
|
builder.getBEFP(), InstOpt_WriteEnable, false);
|
|
} else {
|
|
load = builder.createFill(dstData, G4_ExecSize(execSize), 1, 0,
|
|
builder.getBEFP(), InstOpt_WriteEnable, false);
|
|
}
|
|
fretBB->insertBefore(iter, load);
|
|
addEUFusionCallWAInst(load);
|
|
restoreBE_FPInst = load;
|
|
}
|
|
|
|
restoreBE_FPInst->addComment("restore vISA SP/FP from temp");
|
|
|
|
if (builder.kernel.getOption(vISA_GenerateDebugInfo)) {
|
|
builder.kernel.getKernelDebugInfo()->setCallerBEFPRestoreInst(
|
|
restoreBE_FPInst);
|
|
builder.kernel.getKernelDebugInfo()->setCallerSPRestoreInst(
|
|
restoreBE_FPInst);
|
|
if (!EUFusionCallWANeeded())
|
|
builder.kernel.getKernelDebugInfo()->setCallerBEFPSaveInst(saveBE_FPInst);
|
|
}
|
|
}
|
|
|
|
void GlobalRA::updateDefSet(std::set<G4_Declare *> &defs,
|
|
G4_Declare *referencedDcl) {
|
|
// Get topmost dcl
|
|
while (referencedDcl->getAliasDeclare() != NULL) {
|
|
referencedDcl = referencedDcl->getAliasDeclare();
|
|
}
|
|
|
|
defs.insert(referencedDcl);
|
|
}
|
|
|
|
void GlobalRA::detectUndefinedUses(LivenessAnalysis &liveAnalysis,
|
|
G4_Kernel &kernel) {
|
|
// This function iterates over each inst and checks whether there is
|
|
// a reaching def for each src operand.
|
|
VISA_DEBUG_VERBOSE({
|
|
std::cout << "\n";
|
|
if (liveAnalysis.livenessClass(G4_FLAG)) {
|
|
std::cout << "=== Uses with reaching def - Flags ===\n";
|
|
} else if (liveAnalysis.livenessClass(G4_ADDRESS)) {
|
|
std::cout << "=== Uses with reaching def - Address ===\n";
|
|
} else {
|
|
std::cout << "=== Uses with reaching def - GRF ===\n";
|
|
}
|
|
if (useLocalRA) {
|
|
std::cout
|
|
<< "(Use -nolocalra switch for accurate results of uses without "
|
|
"reaching defs)\n";
|
|
}
|
|
});
|
|
|
|
for (G4_BB *bb : kernel.fg) {
|
|
std::set<G4_Declare *> defs;
|
|
std::set<G4_Declare *>::iterator defs_it;
|
|
G4_Declare *referencedDcl = nullptr;
|
|
|
|
for (G4_INST *inst : *bb) {
|
|
// Src/predicate opnds are uses
|
|
if (inst->getPredicate() && inst->getPredicate()->getBase() &&
|
|
inst->getPredicate()->getBase()->isRegVar() &&
|
|
inst->getPredicate()->getBase()->isRegAllocPartaker()) {
|
|
referencedDcl = inst->getPredicate()
|
|
->asPredicate()
|
|
->getBase()
|
|
->asRegVar()
|
|
->getDeclare();
|
|
reportUndefinedUses(liveAnalysis, bb, inst, referencedDcl, defs,
|
|
Opnd_pred);
|
|
}
|
|
|
|
for (unsigned i = 0, numSrc = inst->getNumSrc(); i < numSrc; i++) {
|
|
G4_Operand *opnd = inst->getSrc(i);
|
|
|
|
if (opnd && opnd->isAddrExp() == false && opnd->getBase() &&
|
|
opnd->getBase()->isRegVar() &&
|
|
opnd->getBase()->isRegAllocPartaker()) {
|
|
referencedDcl = opnd->getBase()->asRegVar()->getDeclare();
|
|
reportUndefinedUses(liveAnalysis, bb, inst, referencedDcl, defs,
|
|
(Gen4_Operand_Number)(i + Opnd_src0));
|
|
}
|
|
}
|
|
|
|
// Dst/cond modifier opnds are defs
|
|
if (inst->getCondModBase() && inst->getCondMod()->getBase()->isRegVar() &&
|
|
inst->getCondMod()->getBase()->isRegAllocPartaker()) {
|
|
referencedDcl = inst->getCondMod()
|
|
->asCondMod()
|
|
->getBase()
|
|
->asRegVar()
|
|
->getDeclare();
|
|
updateDefSet(defs, referencedDcl);
|
|
}
|
|
|
|
if (inst->getDst() && inst->getDst()->getBase() &&
|
|
inst->getDst()->getBase()->isRegVar() &&
|
|
inst->getDst()->getBase()->isRegAllocPartaker()) {
|
|
referencedDcl = inst->getDst()->getBase()->asRegVar()->getDeclare();
|
|
updateDefSet(defs, referencedDcl);
|
|
}
|
|
}
|
|
}
|
|
|
|
VISA_DEBUG_VERBOSE(std::cout << "\n\n");
|
|
}
|
|
|
|
void GlobalRA::detectNeverDefinedUses() {
|
|
// Detect variables that are used but never defined in entire CFG.
|
|
// This does not use liveness information.
|
|
// Hold all decls from symbol table as key.
|
|
// Boolean mapped value determines whether the dcl is
|
|
// defined in kernel or not.
|
|
std::map<G4_Declare *, bool> vars;
|
|
std::map<G4_Declare *, bool>::iterator map_it;
|
|
|
|
for (auto bb : kernel.fg) {
|
|
for (G4_INST *inst : *bb) {
|
|
G4_Declare *referencedDcl = nullptr;
|
|
|
|
if (inst->getDst() && inst->getDst()->getBase() &&
|
|
inst->getDst()->getBase()->isRegVar()) {
|
|
referencedDcl = inst->getDst()->getBaseRegVarRootDeclare();
|
|
|
|
// Always insert top-most dcl
|
|
map_it = vars.find(referencedDcl);
|
|
if (map_it == vars.end()) {
|
|
vars.emplace(referencedDcl, true);
|
|
} else {
|
|
map_it->second = true;
|
|
}
|
|
}
|
|
|
|
if (inst->getCondModBase() && inst->getCondMod()->getBase()->isRegVar()) {
|
|
referencedDcl = inst->getCondMod()->getBaseRegVarRootDeclare();
|
|
|
|
map_it = vars.find(referencedDcl);
|
|
if (map_it == vars.end()) {
|
|
vars.emplace(referencedDcl, true);
|
|
} else {
|
|
map_it->second = true;
|
|
}
|
|
}
|
|
|
|
if (inst->getPredicate() && inst->getPredicate()->getBase() &&
|
|
inst->getPredicate()->getBase()->isRegVar()) {
|
|
referencedDcl = inst->getPredicate()->getBaseRegVarRootDeclare();
|
|
|
|
// Check whether dcl was already added to list.
|
|
// If not, add it with flag set to false to indicate
|
|
// that a use was found but a def hasn't been seen yet.
|
|
map_it = vars.find(referencedDcl);
|
|
if (map_it == vars.end()) {
|
|
vars.emplace(referencedDcl, false);
|
|
}
|
|
}
|
|
|
|
for (unsigned i = 0, numSrc = inst->getNumSrc(); i < numSrc; i++) {
|
|
G4_Operand *opnd = inst->getSrc(i);
|
|
|
|
if (opnd && opnd->getBase() && opnd->getBase()->isRegVar()) {
|
|
referencedDcl = opnd->getBaseRegVarRootDeclare();
|
|
|
|
map_it = vars.find(referencedDcl);
|
|
if (map_it == vars.end()) {
|
|
vars.emplace(referencedDcl, false);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
VISA_DEBUG_VERBOSE(std::cout
|
|
<< "\n=== Variables used but never defined ===\n\n");
|
|
|
|
for (auto dcl : kernel.Declares) {
|
|
while (dcl->getAliasDeclare())
|
|
dcl = dcl->getAliasDeclare();
|
|
|
|
map_it = vars.find(dcl);
|
|
if (map_it != vars.end()) {
|
|
if (map_it->second == false && dcl->getRegFile() != G4_INPUT &&
|
|
dcl->getAddressed() == false) {
|
|
// No def found for this non-input variable in
|
|
// entire CFG so report it.
|
|
VISA_DEBUG_VERBOSE({
|
|
std::cout << dcl->getName();
|
|
if (dcl->getRegFile() == G4_GRF) {
|
|
std::cout << " (General)";
|
|
} else if (dcl->getRegFile() == G4_ADDRESS) {
|
|
std::cout << " (Address)";
|
|
} else if (dcl->getRegFile() == G4_FLAG) {
|
|
std::cout << " (Flag)";
|
|
}
|
|
std::cout << "\n";
|
|
});
|
|
}
|
|
}
|
|
}
|
|
|
|
VISA_DEBUG_VERBOSE(std::cout << "\n\n");
|
|
}
|
|
|
|
//
|
|
// Check the overlap of two sources' ranges and do range splitting
|
|
// Such as, range1: 0~63, range2: 32~95 --> 0~31,32~63,64~95
|
|
// or, range1: 0~63, range2: 32~63 --> 0~31,32~63
|
|
//
|
|
VarRange *VarSplit::splitVarRange(VarRange *src1, VarRange *src2,
|
|
std::stack<VarRange *> *toDelete) {
|
|
VarRange *new_var_range = nullptr;
|
|
|
|
vISA_ASSERT(!(src1->leftBound == src2->leftBound &&
|
|
src1->rightBound == src2->rightBound),
|
|
"Same ranges can not be spiltted");
|
|
|
|
if (src1->leftBound > src2->rightBound ||
|
|
src1->rightBound < src2->leftBound) // No overlap
|
|
{
|
|
return NULL;
|
|
}
|
|
|
|
unsigned left1 = std::min(src1->leftBound, src2->leftBound); // left
|
|
unsigned right1 = std::max(src1->leftBound, src2->leftBound);
|
|
|
|
unsigned left2 = std::min(src1->rightBound, src2->rightBound); // right
|
|
unsigned right2 = std::max(src1->rightBound, src2->rightBound);
|
|
|
|
if (left1 == right1) // Same left
|
|
{
|
|
src1->leftBound = left1;
|
|
src1->rightBound = left2;
|
|
|
|
src2->leftBound = left2 + 1;
|
|
src2->rightBound = right2;
|
|
} else if (left2 == right2) // Same right
|
|
{
|
|
src1->leftBound = left1;
|
|
src1->rightBound = right1 - 1;
|
|
src2->leftBound = right1;
|
|
src2->rightBound = right2;
|
|
} else // No same boundary
|
|
{
|
|
src1->leftBound = left1; // Left one: in list already
|
|
src1->rightBound = right1 - 1;
|
|
|
|
src2->leftBound = left2 + 1; // Right one: keep in list
|
|
src2->rightBound = right2;
|
|
|
|
new_var_range = new VarRange;
|
|
new_var_range->leftBound = right1; // Middle one: need to add a new range object
|
|
new_var_range->rightBound = left2;
|
|
toDelete->push(new_var_range);
|
|
}
|
|
|
|
return new_var_range;
|
|
}
|
|
|
|
//
|
|
// Scan the range list and insert the new range into it.
|
|
// Range splitting is applied if required.
|
|
//
|
|
void VarSplit::rangeListSpliting(VAR_RANGE_LIST *rangeList, G4_Operand *opnd,
|
|
std::stack<VarRange *> *toDelete) {
|
|
VarRange *range = new VarRange;
|
|
range->leftBound = opnd->getLeftBound();
|
|
range->rightBound = opnd->getRightBound();
|
|
toDelete->push(range);
|
|
|
|
VAR_RANGE_LIST_ITER it = rangeList->begin();
|
|
|
|
// The ranges in the list are ordered from low to high
|
|
while (it != rangeList->end()) {
|
|
if ((*it)->leftBound == range->leftBound &&
|
|
((*it)->rightBound == range->rightBound)) {
|
|
// Same range exists in the list already
|
|
return;
|
|
}
|
|
|
|
if ((*it)->leftBound > range->rightBound) {
|
|
// The range item in the list is to the right of the current range; insert it
// before this position. Since the whole range is inserted first, all the
|
|
// ranges should be continuous.
|
|
vISA_ASSERT((*it)->leftBound - range->rightBound == 1,
|
|
"none continuous spliting happened\n");
|
|
rangeList->insert(it, range);
|
|
return;
|
|
}
|
|
|
|
// Overlap happened, do splitting.
|
|
// (*it) is updated to the left range
|
|
//"range" is updated to the right range
|
|
// If "newRange" is not NULL, it's the middle range.
|
|
VarRange *newRange = splitVarRange((*it), range, toDelete);
|
|
|
|
// Insert the middle one
|
|
it++;
|
|
if (newRange) {
|
|
it = rangeList->insert(it, newRange);
|
|
}
|
|
}
|
|
|
|
rangeList->push_back(range); // Insert the right one
|
|
|
|
return;
|
|
}
|
|
|
|
void VarSplit::getHeightWidth(G4_Type type, unsigned numberElements,
|
|
unsigned short &dclWidth,
|
|
unsigned short &dclHeight,
|
|
int &totalByteSize) const {
|
|
dclWidth = 1, dclHeight = 1;
|
|
totalByteSize = numberElements * TypeSize(type);
|
|
if (totalByteSize <= (int)kernel.numEltPerGRF<Type_UB>()) {
|
|
dclWidth = (uint16_t)numberElements;
|
|
} else {
|
|
// Here we assume that the variable starts at the beginning of a GRF,
// so its subregister offset must be 0.
|
|
dclWidth = kernel.numEltPerGRF<Type_UB>() / TypeSize(type);
|
|
dclHeight = totalByteSize / kernel.numEltPerGRF<Type_UB>();
|
|
if (totalByteSize % kernel.numEltPerGRF<Type_UB>() != 0) {
|
|
dclHeight++;
|
|
}
|
|
}
|
|
}
|
|
|
|
void VarSplit::createSubDcls(G4_Kernel &kernel, G4_Declare *oldDcl,
|
|
std::vector<G4_Declare *> &splitDclList) {
|
|
if (oldDcl->getByteSize() <= kernel.numEltPerGRF<Type_UB>() ||
|
|
oldDcl->getByteSize() % kernel.numEltPerGRF<Type_UB>()) {
|
|
return;
|
|
}
|
|
|
|
int splitVarSize = kernel.getSimdSize() == g4::SIMD8 ? 1 : 2;
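// Each sub-declare covers 1 GRF for SIMD8 kernels and 2 GRFs otherwise.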
|
|
for (unsigned i = 0, bSizePerGRFSize = (oldDcl->getByteSize() /
|
|
kernel.numEltPerGRF<Type_UB>());
|
|
i < bSizePerGRFSize; i += splitVarSize) {
|
|
G4_Declare *splitDcl = NULL;
|
|
unsigned leftBound = i * kernel.numEltPerGRF<Type_UB>();
|
|
unsigned rightBound =
|
|
(i + splitVarSize) * kernel.numEltPerGRF<Type_UB>() - 1;
|
|
unsigned short dclWidth = 0;
|
|
unsigned short dclHeight = 0;
|
|
int dclTotalSize = 0;
|
|
|
|
getHeightWidth(oldDcl->getElemType(),
|
|
(rightBound - leftBound + 1) / oldDcl->getElemSize(),
|
|
dclWidth, dclHeight, dclTotalSize);
|
|
const char *splitDclName = kernel.fg.builder->getNameString(
|
|
16, "split_%d_%s", i, oldDcl->getName());
|
|
splitDcl = kernel.fg.builder->createDeclare(
|
|
splitDclName, G4_GRF, dclWidth, dclHeight, oldDcl->getElemType());
|
|
gra.setSubOffset(splitDcl, leftBound);
|
|
splitDcl->copyAlign(oldDcl);
|
|
gra.copyAlignment(splitDcl, oldDcl);
|
|
unsigned nElementSize =
|
|
(rightBound - leftBound + 1) / oldDcl->getElemSize();
|
|
if ((rightBound - leftBound + 1) % oldDcl->getElemSize()) {
|
|
nElementSize++;
|
|
}
|
|
splitDcl->setTotalElems(nElementSize);
|
|
splitDclList.push_back(splitDcl);
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
void VarSplit::insertMovesToTemp(IR_Builder &builder, G4_Declare *oldDcl,
|
|
G4_Operand *dstOpnd, G4_BB *bb,
|
|
INST_LIST_ITER instIter,
|
|
std::vector<G4_Declare *> &splitDclList) {
|
|
G4_INST *inst = (*instIter);
|
|
INST_LIST_ITER iter = instIter;
|
|
iter++;
|
|
|
|
for (size_t i = 0, size = splitDclList.size(); i < size; i++) {
|
|
G4_Declare *subDcl = splitDclList[i];
|
|
unsigned leftBound = gra.getSubOffset(subDcl);
|
|
unsigned rightBound = leftBound + subDcl->getByteSize() - 1;
|
|
|
|
if (!(dstOpnd->getRightBound() < leftBound ||
|
|
rightBound < dstOpnd->getLeftBound())) {
|
|
unsigned maskFlag = (inst->getOption() & 0xFFF010C);
|
|
G4_DstRegRegion *dst = builder.createDstRegRegion(subDcl, 1);
|
|
auto src = builder.createSrc(
|
|
oldDcl->getRegVar(),
|
|
(gra.getSubOffset(subDcl)) / kernel.numEltPerGRF<Type_UB>(), 0,
|
|
builder.getRegionStride1(), oldDcl->getElemType());
|
|
G4_INST *splitInst = builder.createMov(
|
|
G4_ExecSize(subDcl->getTotalElems()), dst, src, maskFlag, false);
|
|
bb->insertBefore(iter, splitInst);
|
|
if (splitInst->isWriteEnableInst() && gra.EUFusionNoMaskWANeeded()) {
|
|
gra.addEUFusionNoMaskWAInst(bb, splitInst);
|
|
}
|
|
}
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
void VarSplit::insertMovesFromTemp(G4_Kernel &kernel, G4_Declare *oldDcl,
|
|
int index, G4_Operand *srcOpnd, int pos,
|
|
G4_BB *bb, INST_LIST_ITER instIter,
|
|
std::vector<G4_Declare *> &splitDclList) {
|
|
G4_INST *inst = (*instIter);
|
|
|
|
int sizeInGRF = (srcOpnd->getRightBound() - srcOpnd->getLeftBound() +
|
|
kernel.numEltPerGRF<Type_UB>() - 1) /
|
|
kernel.numEltPerGRF<Type_UB>();
|
|
int splitSize = kernel.getSimdSize() == g4::SIMD8 ? 1 : 2;
|
|
if (sizeInGRF != splitSize) {
|
|
unsigned short dclWidth = 0;
|
|
unsigned short dclHeight = 0;
|
|
int dclTotalSize = 0;
|
|
G4_SrcRegRegion *oldSrc = srcOpnd->asSrcRegRegion();
|
|
getHeightWidth(oldSrc->getType(),
|
|
(srcOpnd->getRightBound() - srcOpnd->getLeftBound() + 1) /
|
|
oldSrc->getElemSize(),
|
|
dclWidth, dclHeight, dclTotalSize);
|
|
const char *newDclName = kernel.fg.builder->getNameString(
|
|
16, "copy_%d_%s", index, oldDcl->getName());
|
|
G4_Declare *newDcl = kernel.fg.builder->createDeclare(
|
|
newDclName, G4_GRF, dclWidth, dclHeight, oldSrc->getType());
|
|
newDcl->copyAlign(oldDcl);
|
|
gra.copyAlignment(newDcl, oldDcl);
|
|
|
|
unsigned newLeftBound = 0;
|
|
|
|
for (size_t i = 0, size = splitDclList.size(); i < size; i++) {
|
|
G4_Declare *subDcl = splitDclList[i];
|
|
unsigned leftBound = gra.getSubOffset(subDcl);
|
|
unsigned rightBound = leftBound + subDcl->getByteSize() - 1;
|
|
|
|
if (!(srcOpnd->getRightBound() < leftBound ||
|
|
rightBound < srcOpnd->getLeftBound())) {
|
|
|
|
G4_DstRegRegion *dst = kernel.fg.builder->createDst(
|
|
newDcl->getRegVar(), newLeftBound / kernel.numEltPerGRF<Type_UB>(),
|
|
0, 1, oldSrc->getType());
|
|
newLeftBound += subDcl->getByteSize();
|
|
G4_SrcRegRegion *src = kernel.fg.builder->createSrc(
|
|
subDcl->getRegVar(), 0, 0, kernel.fg.builder->getRegionStride1(),
|
|
oldSrc->getType());
|
|
G4_INST *movInst =
|
|
kernel.fg.builder->createMov(G4_ExecSize(subDcl->getTotalElems()),
|
|
dst, src, InstOpt_WriteEnable, false);
|
|
bb->insertBefore(instIter, movInst);
|
|
if (gra.EUFusionNoMaskWANeeded()) {
|
|
gra.addEUFusionNoMaskWAInst(bb, movInst);
|
|
}
|
|
}
|
|
}
|
|
auto newSrc = kernel.fg.builder->createSrcRegRegion(
|
|
oldSrc->getModifier(), Direct, newDcl->getRegVar(), 0,
|
|
oldSrc->getSubRegOff(), oldSrc->getRegion(), newDcl->getElemType());
|
|
inst->setSrc(newSrc, pos);
|
|
} else {
|
|
for (size_t i = 0, size = splitDclList.size(); i < size; i++) {
|
|
G4_Declare *subDcl = splitDclList[i];
|
|
unsigned leftBound = gra.getSubOffset(subDcl);
|
|
unsigned rightBound = leftBound + subDcl->getByteSize() - 1;
|
|
|
|
if (!(srcOpnd->getRightBound() < leftBound ||
|
|
rightBound < srcOpnd->getLeftBound())) {
|
|
G4_SrcRegRegion *oldSrc = srcOpnd->asSrcRegRegion();
|
|
G4_SrcRegRegion *newSrc = kernel.fg.builder->createSrcRegRegion(
|
|
oldSrc->getModifier(), Direct, subDcl->getRegVar(), 0,
|
|
oldSrc->getSubRegOff(), oldSrc->getRegion(), oldSrc->getType());
|
|
inst->setSrc(newSrc, pos);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
bool VarSplit::canDoGlobalSplit(IR_Builder &builder, G4_Kernel &kernel,
|
|
uint32_t sendSpillRefCount) {
|
|
if (!builder.getOption(vISA_GlobalSendVarSplit)) {
|
|
return false;
|
|
}
|
|
|
|
if (!builder.getOption(vISA_Debug) && // Not work in debug mode
|
|
kernel.getInt32KernelAttr(Attributes::ATTR_Target) ==
|
|
VISA_3D && // Only works for 3D/OCL/OGL
|
|
sendSpillRefCount) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
void VarSplit::globalSplit(IR_Builder &builder, G4_Kernel &kernel) {
|
|
typedef std::list<
|
|
std::tuple<G4_BB *, G4_Operand *, int, unsigned, INST_LIST_ITER>>
|
|
SPLIT_OPERANDS;
|
|
typedef std::list<std::tuple<G4_BB *, G4_Operand *, int, unsigned,
|
|
INST_LIST_ITER>>::iterator SPLIT_OPERANDS_ITER;
|
|
typedef std::map<G4_RegVar *, SPLIT_OPERANDS> SPLIT_DECL_OPERANDS;
|
|
typedef std::map<G4_RegVar *, SPLIT_OPERANDS>::iterator
|
|
SPLIT_DECL_OPERANDS_ITER;
|
|
|
|
SPLIT_DECL_OPERANDS splitDcls;
|
|
unsigned instIndex = 0;
|
|
int splitSize = kernel.getSimdSize() == g4::SIMD8 ? 1 : 2;
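// First pass: collect send destinations that fully define a declare larger
// than one GRF and whose response length exceeds the split size.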
|
|
for (auto bb : kernel.fg) {
|
|
for (INST_LIST_ITER it = bb->begin(), iend = bb->end(); it != iend;
|
|
++it, ++instIndex) {
|
|
G4_INST *inst = (*it);
|
|
G4_DstRegRegion *dst = inst->getDst();
|
|
|
|
if (inst->isLifeTimeEnd() || inst->isPseudoKill()) {
|
|
continue;
|
|
}
|
|
|
|
//
|
|
// process send destination operand
|
|
//
|
|
if (inst->isSend() &&
|
|
inst->getMsgDesc()->getDstLenRegs() > (size_t)splitSize &&
|
|
inst->asSendInst()->isDirectSplittableSend()) {
|
|
G4_DstRegRegion *dstrgn = dst;
|
|
G4_Declare *topdcl = GetTopDclFromRegRegion(dstrgn);
|
|
|
|
if (topdcl && dstrgn->getRegAccess() == Direct &&
|
|
!topdcl->getAddressed() && topdcl->getRegFile() != G4_INPUT &&
|
|
(dstrgn->getRightBound() - dstrgn->getLeftBound() + 1) ==
|
|
topdcl->getByteSize() &&
|
|
(dstrgn->getRightBound() - dstrgn->getLeftBound()) >
|
|
kernel.numEltPerGRF<Type_UB>()) {
|
|
// The tuple<G4_BB*, G4_Operand*, int pos, unsigned instIndex,
// INST_LIST_ITER> records the information used for tuning and for
// generating the split operands/instructions.
|
|
splitDcls[topdcl->getRegVar()].push_front(
|
|
make_tuple(bb, dst, 0, instIndex, it));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
instIndex = 0;
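// Second pass: collect direct source operands that read only part of one of
// the candidate declares.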
|
|
for (auto bb : kernel.fg) {
|
|
for (INST_LIST_ITER it = bb->begin(), end = bb->end(); it != end;
|
|
++it, ++instIndex) {
|
|
|
|
G4_INST *inst = (*it);
|
|
|
|
if (inst->isLifeTimeEnd() || inst->isPseudoKill()) {
|
|
continue;
|
|
}
|
|
|
|
//
|
|
// process each source operand
|
|
//
|
|
for (unsigned j = 0, numSrc = inst->getNumSrc(); j < numSrc; j++) {
|
|
G4_Operand *src = inst->getSrc(j);
|
|
|
|
if (src == NULL) {
|
|
continue;
|
|
}
|
|
|
|
if (src->isSrcRegRegion()) {
|
|
G4_Declare *topdcl = GetTopDclFromRegRegion(src);
|
|
|
|
if (topdcl && topdcl->getRegFile() != G4_INPUT &&
|
|
!topdcl->getAddressed() &&
|
|
splitDcls.find(topdcl->getRegVar()) != splitDcls.end() &&
|
|
((src->asSrcRegRegion()->getRightBound() -
|
|
src->asSrcRegRegion()->getLeftBound() + 1) <
|
|
topdcl->getByteSize()) &&
|
|
src->asSrcRegRegion()->getRegAccess() ==
|
|
Direct) // We don't split the indirect access
|
|
{
|
|
splitDcls[topdcl->getRegVar()].push_back(
|
|
make_tuple(bb, src, j, instIndex, it));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
for (SPLIT_DECL_OPERANDS_ITER it = splitDcls.begin();
|
|
it != splitDcls.end();) {
|
|
unsigned srcIndex = 0xFFFFFFFF;
|
|
unsigned dstIndex = 0;
|
|
SPLIT_DECL_OPERANDS_ITER succIt = it;
|
|
succIt++;
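// Filter out candidates that are too small (<= 2 GRFs), have no partial
// source use, or whose def is too close to its uses.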
|
|
G4_Declare *topDcl = it->first->getDeclare();
|
|
if (topDcl->getByteSize() <= kernel.numEltPerGRF<Type_UB>() * 2u) {
|
|
splitDcls.erase(it);
|
|
it = succIt;
|
|
continue;
|
|
}
|
|
|
|
bool hasSrcOpearnd = false;
|
|
for (SPLIT_OPERANDS_ITER vt = it->second.begin(); vt != it->second.end();
|
|
vt++) {
|
|
G4_BB *bb = nullptr;
|
|
G4_Operand *opnd = nullptr;
|
|
INST_LIST_ITER instIter;
|
|
int pos = 0;
|
|
unsigned iIndex = 0;
|
|
|
|
std::tie(bb, opnd, pos, iIndex, instIter) = (*vt);
|
|
|
|
if (opnd == nullptr) {
|
|
continue;
|
|
}
|
|
|
|
if (opnd->isDstRegRegion()) {
|
|
dstIndex = std::max(dstIndex, iIndex);
|
|
}
|
|
|
|
if (opnd->isSrcRegRegion()) {
|
|
srcIndex = std::min(srcIndex, iIndex);
|
|
hasSrcOpearnd = true;
|
|
}
|
|
}
|
|
|
|
if (!hasSrcOpearnd ||
|
|
(dstIndex > srcIndex && dstIndex - srcIndex < it->second.size() + 1)) {
|
|
splitDcls.erase(it);
|
|
it = succIt;
|
|
continue;
|
|
}
|
|
|
|
it++;
|
|
}
|
|
|
|
for (SPLIT_DECL_OPERANDS_ITER it = splitDcls.begin(); it != splitDcls.end();
|
|
it++) {
|
|
G4_Declare *topDcl = it->first->getDeclare();
|
|
std::vector<G4_Declare *> splitDclList;
|
|
splitDclList.clear();
|
|
|
|
createSubDcls(kernel, topDcl, splitDclList);
|
|
int srcIndex = 0;
|
|
for (SPLIT_OPERANDS_ITER vt = it->second.begin(); vt != it->second.end();
|
|
vt++) {
|
|
G4_BB *bb = nullptr;
|
|
G4_Operand *opnd = nullptr;
|
|
INST_LIST_ITER instIter;
|
|
int pos = 0;
|
|
unsigned instIndex = 0;
|
|
std::tie(bb, opnd, pos, instIndex, instIter) = (*vt);
|
|
|
|
if (opnd == nullptr) {
|
|
continue;
|
|
}
|
|
|
|
if (opnd->isDstRegRegion()) {
|
|
insertMovesToTemp(builder, topDcl, opnd, bb, instIter, splitDclList);
|
|
}
|
|
|
|
if (opnd->isSrcRegRegion()) {
|
|
insertMovesFromTemp(kernel, topDcl, srcIndex, opnd, pos, bb, instIter,
|
|
splitDclList);
|
|
}
|
|
|
|
srcIndex++;
|
|
}
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
void VarSplit::localSplit(IR_Builder &builder, G4_BB *bb) {
|
|
class CmpRegVarId {
|
|
public:
|
|
bool operator()(G4_RegVar *first, G4_RegVar *second) const {
|
|
return first->getDeclare()->getDeclId() <
|
|
second->getDeclare()->getDeclId();
|
|
}
|
|
};
|
|
std::map<G4_RegVar *, std::vector<std::pair<G4_Operand *, INST_LIST_ITER>>,
|
|
CmpRegVarId>
|
|
localRanges;
|
|
std::map<G4_RegVar *, std::vector<std::pair<G4_Operand *, INST_LIST_ITER>>,
|
|
CmpRegVarId>::iterator localRangesIt;
|
|
std::map<G4_RegVar *, VarRangeListPackage, CmpRegVarId> varRanges;
|
|
std::map<G4_RegVar *, VarRangeListPackage, CmpRegVarId>::iterator varRangesIt;
|
|
std::stack<VarRange *> toDelete;
|
|
|
|
// Skip BB if there are no sends.
|
|
bool hasSends = std::any_of(bb->begin(), bb->end(),
|
|
[](G4_INST *inst) { return inst->isSend(); });
|
|
if (!hasSends)
|
|
return;
|
|
|
|
//
|
|
// Iterate instruction in BB from back to front
|
|
//
|
|
for (INST_LIST::reverse_iterator rit = bb->rbegin(), rend = bb->rend();
|
|
rit != rend; ++rit) {
|
|
G4_INST *i = (*rit);
|
|
G4_DstRegRegion *dst = i->getDst();
|
|
|
|
if (i->isLifeTimeEnd() || i->isPseudoKill()) {
|
|
continue;
|
|
}
|
|
|
|
//
|
|
// process destination operand
|
|
//
|
|
if (dst) {
|
|
// It's RA candidate
|
|
G4_Declare *topdcl = GetTopDclFromRegRegion(dst);
|
|
|
|
LocalLiveRange *topdclLR = nullptr;
|
|
// Local only
|
|
if ((topdcl && (topdclLR = gra.getLocalLR(topdcl)) &&
|
|
topdcl->getIsRefInSendDcl() && topdclLR->isLiveRangeLocal()) &&
|
|
topdcl->getRegFile() == G4_GRF) {
|
|
varRangesIt = varRanges.find(topdcl->getRegVar());
|
|
INST_LIST_ITER iterToInsert = rit.base();
|
|
iterToInsert--; // Point to the iterator of current instruction
|
|
if (varRangesIt == varRanges.end()) {
|
|
VarRange *new_range = new VarRange;
|
|
new_range->leftBound = 0;
|
|
new_range->rightBound = topdcl->getByteSize() - 1;
|
|
toDelete.push(new_range);
|
|
varRanges[topdcl->getRegVar()].list.push_back(new_range);
|
|
} else {
|
|
rangeListSpliting(&(varRanges[topdcl->getRegVar()].list), dst,
|
|
&toDelete);
|
|
}
|
|
|
|
localRanges[topdcl->getRegVar()].emplace_back(
|
|
dst, iterToInsert); // Ordered from back to front.
|
|
}
|
|
}
|
|
|
|
//
|
|
// process each source operand
|
|
//
|
|
for (unsigned j = 0, numSrc = i->getNumSrc(); j < numSrc; j++) {
|
|
G4_Operand *src = i->getSrc(j);
|
|
|
|
if (src == NULL) {
|
|
continue;
|
|
}
|
|
|
|
// Local only
|
|
if (src->isSrcRegRegion()) {
|
|
G4_Declare *topdcl = GetTopDclFromRegRegion(src);
|
|
LocalLiveRange *topdclLR = nullptr;
|
|
|
|
if (topdcl && (topdclLR = gra.getLocalLR(topdcl)) &&
|
|
topdcl->getIsRefInSendDcl() && topdclLR->isLiveRangeLocal() &&
|
|
topdcl->getRegFile() == G4_GRF) {
|
|
G4_VarBase *base = topdcl->getRegVar();
|
|
|
|
INST_LIST_ITER iterToInsert = rit.base();
|
|
iterToInsert--;
|
|
|
|
varRangesIt = varRanges.find(base->asRegVar());
|
|
if (varRangesIt == varRanges.end()) {
|
|
VarRange *new_range = new VarRange;
|
|
new_range->leftBound = 0;
|
|
new_range->rightBound = topdcl->getByteSize() - 1;
|
|
toDelete.push(new_range);
|
|
varRanges[topdcl->getRegVar()].list.push_back(new_range);
|
|
}
|
|
|
|
rangeListSpliting(&(varRanges[topdcl->getRegVar()].list), src,
|
|
&toDelete);
|
|
|
|
localRanges[topdcl->getRegVar()].emplace_back(
|
|
src, iterToInsert); // Ordered from back to front.
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Remove the variables that have no partial usage, or whose partial live
// ranges are too short
|
|
std::map<G4_RegVar *, VarRangeListPackage>::iterator it = varRanges.begin();
|
|
while (it != varRanges.end()) {
|
|
std::map<G4_RegVar *, VarRangeListPackage>::iterator succ_it = it;
|
|
succ_it++;
|
|
|
|
// No partial
|
|
if (it->second.list.size() <= 1) {
|
|
varRanges.erase(it);
|
|
it = succ_it;
|
|
continue;
|
|
}
|
|
|
|
// If the total range size divided by the number of partial ranges is less
// than 16 bytes (half a GRF), remove it
|
|
if (((*it->second.list.rbegin())->rightBound -
|
|
(*it->second.list.begin())->leftBound) /
|
|
it->second.list.size() <
|
|
kernel.numEltPerGRF<Type_UW>() * 2 / 2) {
|
|
varRanges.erase(it);
|
|
it = succ_it;
|
|
continue;
|
|
}
|
|
|
|
G4_Declare *topDcl = it->first->getDeclare();
|
|
bool aligned = true;
|
|
for (const VarRange *vr : it->second.list) {
|
|
unsigned leftBound = vr->leftBound;
|
|
unsigned rightBound = vr->rightBound;
|
|
int elementSize =
|
|
topDcl->getElemSize() > G4_WSIZE ? topDcl->getElemSize() : G4_WSIZE;
|
|
unsigned short elemsNum = (rightBound - leftBound + 1) / elementSize;
|
|
|
|
if (!elemsNum) {
|
|
aligned = false;
|
|
break;
|
|
}
|
|
|
|
// TODO: we can merge several unaligned sub declares into one aligned one.
|
|
// Such as [0-1], [2-63] --> [0-63]
|
|
if (leftBound % kernel.numEltPerGRF<Type_UW>() ||
|
|
(rightBound + 1) % kernel.numEltPerGRF<Type_UW>()) {
|
|
aligned = false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (!aligned) {
|
|
varRanges.erase(it);
|
|
it = succ_it;
|
|
continue;
|
|
}
|
|
|
|
it = succ_it;
|
|
}
|
|
|
|
int splitid = 0;
|
|
for (std::map<G4_RegVar *, VarRangeListPackage>::iterator it =
|
|
varRanges.begin();
|
|
it != varRanges.end(); it++) {
|
|
G4_Declare *topDcl = it->first->getDeclare();
|
|
const char *dclName = topDcl->getName();
|
|
|
|
topDcl->setIsSplittedDcl(true);
|
|
|
|
// Vertical split: variable split
|
|
unsigned splitVarNum = 0;
|
|
unsigned pre_rightBound = 0;
|
|
for (VAR_RANGE_LIST_ITER vt = it->second.list.begin();
|
|
vt != it->second.list.end(); vt++) {
|
|
unsigned leftBound = (*vt)->leftBound;
|
|
unsigned rightBound = (*vt)->rightBound;
|
|
int elementSize =
|
|
topDcl->getElemSize() > G4_WSIZE ? topDcl->getElemSize() : G4_WSIZE;
|
|
unsigned short elemsNum = (rightBound - leftBound + 1) / elementSize;
|
|
|
|
if (!elemsNum) {
|
|
vASSERT(false);
|
|
pre_rightBound = rightBound;
|
|
continue;
|
|
}
|
|
|
|
if (leftBound && pre_rightBound + 1 != leftBound) {
|
|
vASSERT(false);
|
|
}
|
|
pre_rightBound = rightBound;
|
|
|
|
std::stringstream nameStrm;
|
|
nameStrm << dclName << "_" << splitid << "_" << leftBound << "_"
|
|
<< rightBound << std::ends;
|
|
int nameLen = unsigned(nameStrm.str().length()) + 1;
|
|
const char *name = builder.getNameString(nameLen, "%s_%d_%d_%d", dclName,
|
|
splitid, leftBound, rightBound);
|
|
|
|
unsigned short dclWidth = 0;
|
|
unsigned short dclHeight = 0;
|
|
int dclTotalSize = 0;
|
|
|
|
getHeightWidth(topDcl->getElemType(),
|
|
(rightBound - leftBound + 1) / topDcl->getElemSize(),
|
|
dclWidth, dclHeight, dclTotalSize);
|
|
G4_Declare *partialDcl = builder.createDeclare(
|
|
name, G4_GRF, dclWidth, dclHeight, topDcl->getElemType());
|
|
gra.setSubOffset(partialDcl, leftBound);
|
|
partialDcl->setIsPartialDcl(true);
|
|
gra.setSplittedDeclare(partialDcl, topDcl);
|
|
unsigned nElementSize =
|
|
(rightBound - leftBound + 1) / topDcl->getElemSize();
|
|
if ((rightBound - leftBound + 1) % topDcl->getElemSize()) {
|
|
nElementSize++;
|
|
}
|
|
partialDcl->setTotalElems(nElementSize);
|
|
gra.addSubDcl(topDcl, partialDcl);
|
|
splitVarNum++;
|
|
VISA_DEBUG_VERBOSE(std::cout << "==> Sub Declare: " << splitid
|
|
<< "::" << name << "\n");
|
|
splitid++;
|
|
}
|
|
if (splitVarNum) {
|
|
gra.setSplitVarNum(topDcl, splitVarNum);
|
|
}
|
|
}
|
|
|
|
while (toDelete.size() > 0) {
|
|
delete toDelete.top();
|
|
toDelete.pop();
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
void GlobalRA::addrRegAlloc() {
|
|
uint32_t addrSpillId = 0;
|
|
unsigned maxRAIterations = builder.getuint32Option(vISA_MaxRAIterations);
|
|
unsigned iterationNo = 0;
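// Iterate address RA until coloring succeeds or spill insertion stops
// creating new temporaries.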
|
|
|
|
while (iterationNo < maxRAIterations) {
|
|
RA_TRACE(std::cout << "--address RA iteration " << iterationNo << "\n");
|
|
//
|
|
// choose reg vars whose reg file kind is ARF
|
|
//
|
|
LivenessAnalysis liveAnalysis(*this, G4_ADDRESS);
|
|
liveAnalysis.computeLiveness();
|
|
|
|
//
|
|
// if no reg var needs to be allocated, then skip reg allocation
|
|
//
|
|
if (liveAnalysis.getNumSelectedVar() > 0) {
|
|
GraphColor coloring(liveAnalysis, false, false);
|
|
if (!coloring.regAlloc(false, false, nullptr)) {
|
|
SpillManager spillARF(*this, coloring.getSpilledLiveRanges(),
|
|
addrSpillId);
|
|
spillARF.insertSpillCode();
|
|
addrSpillId = spillARF.getNextTempDclId();
|
|
|
|
//
|
|
// if new addr temps are created, we need to do RA again so that newly
|
|
// created temps can get registers. If there are no more newly created
|
|
// temps, we then commit reg assignments
|
|
//
|
|
if (spillARF.isAnyNewTempCreated() == false) {
|
|
coloring.confirmRegisterAssignments();
|
|
coloring.cleanupRedundantARFFillCode();
|
|
if ((builder.kernel.fg.getHasStackCalls() ||
|
|
builder.kernel.fg.getIsStackCallFunc())) {
|
|
coloring.addA0SaveRestoreCode();
|
|
}
|
|
break; // no more new addr temps; done with ARF allocation
|
|
}
|
|
} else // successfully allocate register without spilling
|
|
{
|
|
coloring.confirmRegisterAssignments();
|
|
coloring.cleanupRedundantARFFillCode();
|
|
if ((builder.kernel.fg.getHasStackCalls() ||
|
|
builder.kernel.fg.getIsStackCallFunc())) {
|
|
coloring.addA0SaveRestoreCode();
|
|
}
|
|
VISA_DEBUG_VERBOSE(detectUndefinedUses(liveAnalysis, kernel));
|
|
|
|
break; // done with ARF allocation
|
|
}
|
|
} else {
|
|
break; // no ARF allocation needed
|
|
}
|
|
kernel.dumpToFile("after.Address_RA." + std::to_string(iterationNo));
|
|
iterationNo++;
|
|
}
|
|
|
|
// Addr spill/fill
|
|
addVarToRA(kernel.Declares.back());
|
|
|
|
vISA_ASSERT(iterationNo < maxRAIterations, "Address RA has failed.");
|
|
}
|
|
|
|
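//
// Flag register allocation. Same iterative structure as addrRegAlloc: run
// liveness over G4_FLAG variables, color, and insert flag spill/fill code on
// failure. On success, an optional cleanup pass removes redundant flag
// spill/fill code when spilling occurred.
//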
void GlobalRA::flagRegAlloc() {
|
|
uint32_t flagSpillId = 0;
|
|
unsigned maxRAIterations = builder.getuint32Option(vISA_MaxRAIterations);
|
|
uint32_t iterationNo = 0;
|
|
bool spillingFlag = false;
|
|
|
|
while (iterationNo < maxRAIterations) {
|
|
RA_TRACE(std::cout << "--flag RA iteration " << iterationNo << "\n");
|
|
|
|
//
|
|
// choose reg vars whose reg file kind is FLAG
|
|
//
|
|
LivenessAnalysis liveAnalysis(*this, G4_FLAG);
|
|
liveAnalysis.computeLiveness();
|
|
|
|
//
|
|
// if no reg var needs to be reg allocated, then skip reg allocation
|
|
//
|
|
if (liveAnalysis.getNumSelectedVar() > 0) {
|
|
GraphColor coloring(liveAnalysis, false, false);
|
|
if (!coloring.regAlloc(false, false, nullptr)) {
|
|
SpillManager spillFlag(*this, coloring.getSpilledLiveRanges(),
|
|
flagSpillId);
|
|
spillFlag.insertSpillCode();
|
|
VISA_DEBUG_VERBOSE({
|
|
printf("FLAG Spill inst count: %d\n",
|
|
spillFlag.getNumFlagSpillStore());
|
|
printf("FLAG Fill inst count: %d\n", spillFlag.getNumFlagSpillLoad());
|
|
printf("*************************\n");
|
|
});
|
|
flagSpillId = spillFlag.getNextTempDclId();
|
|
|
|
spillingFlag = true;
|
|
if (spillFlag.isAnyNewTempCreated() == false) {
|
|
coloring.confirmRegisterAssignments();
|
|
|
|
if ((builder.kernel.fg.getHasStackCalls() ||
|
|
builder.kernel.fg.getIsStackCallFunc())) {
|
|
coloring.addFlagSaveRestoreCode();
|
|
}
|
|
break;
|
|
}
|
|
builder.getJitInfo()->stats.numFlagSpillStore +=
|
|
spillFlag.getNumFlagSpillStore();
|
|
builder.getJitInfo()->stats.numFlagSpillLoad +=
|
|
spillFlag.getNumFlagSpillLoad();
|
|
} else // successfully allocate register without spilling
|
|
{
|
|
coloring.confirmRegisterAssignments();
|
|
if ((builder.kernel.fg.getHasStackCalls() ||
|
|
builder.kernel.fg.getIsStackCallFunc())) {
|
|
coloring.addFlagSaveRestoreCode();
|
|
}
|
|
|
|
if (spillingFlag && builder.getOption(vISA_FlagSpillCodeCleanup)) {
|
|
CLEAN_NUM_PROFILE clean_num_profile;
|
|
|
|
FlagSpillCleanup f(*this);
|
|
f.spillFillCodeCleanFlag(builder, kernel, &clean_num_profile);
|
|
|
|
#ifdef DEBUG_VERBOSE_ON1
|
|
for (int i = 0; i < 3; i++) {
|
|
printf("Profiler %d Spill clean: %d\n", i,
|
|
clean_num_profile.spill_clean_num[i]);
|
|
printf("Profiler %d Fill clean: %d\n", i,
|
|
clean_num_profile.fill_clean_num[i]);
|
|
clean_num += clean_num_profile.spill_clean_num[i];
|
|
clean_num += clean_num_profile.fill_clean_num[i];
|
|
}
|
|
printf("**Flag clean num: %d\n", clean_num);
|
|
#endif
|
|
}
|
|
|
|
VISA_DEBUG_VERBOSE(detectUndefinedUses(liveAnalysis, kernel));
|
|
|
|
break; // done with FLAG allocation
|
|
}
|
|
} else {
|
|
break; // no FLAG allocation needed
|
|
}
|
|
kernel.dumpToFile("after.Flag_RA." + std::to_string(iterationNo));
|
|
iterationNo++;
|
|
}
|
|
|
|
// Flag spill/fill
|
|
addVarToRA(kernel.Declares.back());
|
|
|
|
vISA_ASSERT(iterationNo < maxRAIterations, "Flag RA has failed.");
|
|
}
|
|
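//
// Scalar (s0) register allocation. Colors G4_SCALAR variables iteratively,
// then rebinds every s0 assignment (except pre-assigned ones) to a fixed GRF
// window starting at ScalarRegisterGRFBase so the code can also be simulated
// on platforms without a scalar pipe.
//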
void GlobalRA::scalarRegAlloc() {
|
|
uint32_t scalarSpillId = 0;
|
|
unsigned maxRAIterations = builder.getuint32Option(vISA_MaxRAIterations);
|
|
unsigned iterationNo = 0;
|
|
|
|
std::set<G4_Declare *> PreAssigned;
|
|
for (auto dcl : kernel.Declares) {
|
|
if (dcl->getRegFile() == G4_SCALAR) {
|
|
auto regVar = dcl->getRegVar();
|
|
if (regVar->isS0())
|
|
PreAssigned.insert(dcl);
|
|
}
|
|
}
|
|
while (iterationNo < maxRAIterations) {
|
|
RA_TRACE(std::cout << "--scalar RA iteration " << iterationNo << "\n");
|
|
//
|
|
// choose reg vars whose reg file kind is ARF
|
|
//
|
|
LivenessAnalysis liveAnalysis(*this, G4_SCALAR);
|
|
liveAnalysis.computeLiveness();
|
|
|
|
//
|
|
// if no reg var needs to be reg allocated, then skip reg allocation
|
|
//
|
|
if (liveAnalysis.getNumSelectedVar() > 0) {
|
|
GraphColor coloring(liveAnalysis, false, false);
|
|
if (!coloring.regAlloc(false, false, nullptr)) {
|
|
SpillManager spillScalar(*this, coloring.getSpilledLiveRanges(),
|
|
scalarSpillId);
|
|
spillScalar.insertSpillCode();
|
|
scalarSpillId = spillScalar.getNextTempDclId();
|
|
|
|
//
|
|
// if new scalar temps are created, we need to do RA again so that newly
|
|
// created temps can get registers. If there are no more newly created
|
|
// temps, we then commit reg assignments
|
|
//
|
|
if (spillScalar.isAnyNewTempCreated() == false) {
|
|
coloring.confirmRegisterAssignments();
|
|
if ((builder.kernel.fg.getHasStackCalls() ||
|
|
builder.kernel.fg.getIsStackCallFunc())) {
|
|
vASSERT(false &&
|
|
"missing code"); // coloring.addA0SaveRestoreCode();
|
|
}
|
|
break; // no more new scalar temps; done with scalar allocation
|
|
}
|
|
} else // successfully allocate register without spilling
|
|
{
|
|
coloring.confirmRegisterAssignments();
|
|
if ((builder.kernel.fg.getHasStackCalls() ||
|
|
builder.kernel.fg.getIsStackCallFunc())) {
|
|
vASSERT(false && "missing code"); // coloring.addA0SaveRestoreCode();
|
|
}
|
|
VISA_DEBUG_VERBOSE(detectUndefinedUses(liveAnalysis, kernel));
|
|
|
|
break; // done with scalar allocation
|
|
}
|
|
} else {
|
|
break; // no scalar allocation needed
|
|
}
|
|
iterationNo++;
|
|
}
|
|
kernel.dumpToFile("after.Scalar_RA." + std::to_string(iterationNo));
|
|
constexpr unsigned ScalarRegisterGRFBase = 96;
|
|
// change scalar-register assignment back to fixed GRF location so that
|
|
// the code can be simulated on platforms without a scalar pipe
|
|
for (G4_Declare *dcl : kernel.Declares) {
|
|
if (dcl->getRegFile() == G4_SCALAR && !PreAssigned.count(dcl)) {
|
|
auto regVar = dcl->getRegVar();
|
|
if (regVar->isS0()) {
|
|
auto offset = regVar->getPhyRegOff() * dcl->getElemSize();
|
|
unsigned int regNum = offset / builder.getGRFSize();
|
|
unsigned int subRegNum =
|
|
(offset % builder.getGRFSize()) / dcl->getElemSize();
|
|
regVar->setPhyReg(regPool.getGreg(regNum + ScalarRegisterGRFBase),
|
|
subRegNum);
|
|
dcl->setRegFile(G4_GRF);
|
|
}
|
|
}
|
|
}
|
|
kernel.dumpToFile("after.Scalar_Rename." + std::to_string(iterationNo));
|
|
vISA_ASSERT(iterationNo < maxRAIterations, "Scalar RA has failed.");
|
|
}
|
|
|
|
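//
// Propagate physical register assignments from root declares to their
// aliases: the alias byte offset is added to the root's subregister offset
// and converted back to a reg/subreg pair for GRF, address, and scalar
// register files. Aliases whose root has no assignment are marked as spilled.
//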
void GlobalRA::assignRegForAliasDcl() {
|
|
//
|
|
// assign Reg for Alias DCL
|
|
//
|
|
for (G4_Declare *dcl : kernel.Declares) {
|
|
G4_RegVar *AliasRegVar;
|
|
G4_RegVar *CurrentRegVar;
|
|
unsigned tempoffset;
|
|
|
|
if (dcl->getAliasDeclare() != NULL) {
|
|
AliasRegVar = dcl->getAliasDeclare()->getRegVar();
|
|
CurrentRegVar = dcl->getRegVar();
|
|
tempoffset = AliasRegVar->getPhyRegOff() *
|
|
AliasRegVar->getDeclare()->getElemSize() +
|
|
dcl->getAliasOffset();
|
|
if (AliasRegVar->getPhyReg() != NULL) {
|
|
//
|
|
// alias register assignment for A0
|
|
//
|
|
if (CurrentRegVar->getDeclare()->useGRF()) {
|
|
// if the tempoffset is within one GRF
|
|
if (tempoffset < kernel.numEltPerGRF<Type_UW>() * 2u) {
|
|
CurrentRegVar->setPhyReg(
|
|
AliasRegVar->getPhyReg(),
|
|
tempoffset / CurrentRegVar->getDeclare()->getElemSize());
|
|
}
|
|
// tempoffset covers several GRFs
|
|
else {
|
|
unsigned addtionalrow =
|
|
tempoffset / (kernel.numEltPerGRF<Type_UW>() * 2);
|
|
unsigned actualoffset =
|
|
tempoffset % (kernel.numEltPerGRF<Type_UW>() * 2);
|
|
bool valid = false;
|
|
unsigned orignalrow = AliasRegVar->ExRegNum(valid);
|
|
vISA_ASSERT(valid == true, ERROR_REGALLOC);
|
|
CurrentRegVar->setPhyReg(
|
|
regPool.getGreg(orignalrow + addtionalrow),
|
|
actualoffset / CurrentRegVar->getDeclare()->getElemSize());
|
|
}
|
|
} else if (CurrentRegVar->getDeclare()->getRegFile() == G4_ADDRESS) {
|
|
vISA_ASSERT(tempoffset < builder.getNumAddrRegisters() * 2,
|
|
ERROR_REGALLOC); // Must hold tempoffset in one A0 reg
|
|
CurrentRegVar->setPhyReg(
|
|
AliasRegVar->getPhyReg(),
|
|
tempoffset / CurrentRegVar->getDeclare()->getElemSize());
|
|
} else if (CurrentRegVar->getDeclare()->getRegFile() == G4_SCALAR) {
|
|
if (builder.getuint32Option(vISA_ScalarPipe))
|
|
vISA_ASSERT(tempoffset < kernel.getSRFInWords() *2,
|
|
ERROR_REGALLOC);
|
|
else
|
|
vISA_ASSERT(tempoffset < builder.getScalarRegisterSizeInBytes(),
|
|
ERROR_REGALLOC);
|
|
CurrentRegVar->setPhyReg(
|
|
AliasRegVar->getPhyReg(),
|
|
tempoffset / CurrentRegVar->getDeclare()->getElemSize());
|
|
} else {
|
|
vISA_ASSERT(false, ERROR_REGALLOC);
|
|
}
|
|
} else {
|
|
if (dcl->isSpilled() == false)
|
|
dcl->setSpillFlag();
|
|
}
|
|
}
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
void GlobalRA::removeSplitDecl() {
for (auto dcl : kernel.Declares) {
if (!getSubDclList(dcl).empty()) {
clearSubDcl(dcl);
dcl->setIsSplittedDcl(false);
}
}

kernel.Declares.erase(
std::remove_if(kernel.Declares.begin(), kernel.Declares.end(),
[](G4_Declare *dcl) { return dcl->getIsPartialDcl(); }),
kernel.Declares.end());
}
|
|
|
|
|
|
void GlobalRA::fastRADecision()
{
if (builder.getOption(vISA_SelectiveFastRA)) {
unsigned instNum = 0;
for (auto bb : kernel.fg) {
instNum += (int)bb->size();
}

if (instNum > builder.getOptions()->getuInt32Option(vISA_SelectiveRAInstThreshold)) {
useFastRA = true;
useHybridRAwithSpill = true;
} else {
useFastRA = false;
useHybridRAwithSpill = false;
}
RA_TRACE(std::cout << "\t--SelectiveFastRA decision: " << useFastRA << "\n");
} else {
useFastRA = builder.getOption(vISA_FastCompileRA);
useHybridRAwithSpill = builder.getOption(vISA_HybridRAWithSpill);
}
}
|
|
|
|
bool GlobalRA::tryHybridRA() {
|
|
copyMissingAlignment();
|
|
BankConflictPass bc(*this, false);
|
|
|
|
|
|
LocalRA lra(bc, *this);
|
|
if (lra.localRA()) {
|
|
return true;
|
|
}
|
|
|
|
if (useHybridRAwithSpill) {
|
|
insertPhyRegDecls();
|
|
} else {
|
|
if (hybridRA(lra)) {
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
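//
// Hybrid RA: run graph coloring on top of the local RA results. If the RPE
// estimate shows register pressure close to the GRF limit, or coloring fails,
// the local RA assignments are rolled back (except under -debug) and the
// caller falls back to full global RA.
//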
bool GlobalRA::hybridRA(LocalRA &lra) {
|
|
RA_TRACE(std::cout << "--hybrid RA--\n");
|
|
uint32_t numOrigDcl = (uint32_t)kernel.Declares.size();
|
|
insertPhyRegDecls();
|
|
|
|
LivenessAnalysis liveAnalysis(*this, G4_GRF | G4_INPUT);
|
|
liveAnalysis.computeLiveness();
|
|
|
|
if (liveAnalysis.getNumSelectedVar() > 0) {
|
|
RPE rpe(*this, &liveAnalysis);
|
|
rpe.run();
|
|
|
|
bool spillLikely =
|
|
kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D &&
|
|
rpe.getMaxRP() >= kernel.getNumRegTotal() - 16;
|
|
if (spillLikely) {
|
|
RA_TRACE(std::cout << "\t--skip hybrid RA due to high pressure: "
|
|
<< rpe.getMaxRP() << "\n");
|
|
kernel.Declares.resize(numOrigDcl);
|
|
lra.undoLocalRAAssignments(false);
|
|
// We check the src/dst overlap WA here to keep the intf graph simple.
|
|
// When LRA is run, it sets augmentation alignment conservatively so
|
|
// that LRA assignments can co-exist with HRA assignments after
|
|
// augmentation is run. If we reset alignment here, it means that
|
|
// augmentation buckets are reset and alignment is copied over
|
|
// from original G4_Declare. This is correct behavior. However, when
|
|
// avoidSrcDstOverlap WA sees that src/dst of an instruction have no
|
|
// alignment, it forces an interference edge between them. This causes
|
|
// extra interferences in graph compared to case when we use conservative
|
|
// alignment computed in LRA. So when the WA is enabled, we avoid
|
|
// resetting alignment as it may produce better code.
|
|
if (!builder.avoidDstSrcOverlap() || use4GRFAlign)
|
|
copyAlignment();
|
|
return false;
|
|
}
|
|
|
|
GraphColor coloring(liveAnalysis, /*isHybrid*/ true, /*forceSpill*/ false);
|
|
generateForbiddenTemplates(0);
|
|
// FIXME: doBankConflictReduction and highInternalConflict are computed by
|
|
// local RA, they should be moved to some common code.
|
|
bool isColoringGood =
|
|
coloring.regAlloc(lra.doHybridBCR(), lra.hasHighInternalBC(), &rpe);
|
|
if (!isColoringGood) {
|
|
if (!kernel.getOption(vISA_Debug)) {
|
|
// Why?? Keep LRA results when -debug is passed
|
|
kernel.Declares.resize(numOrigDcl);
|
|
lra.undoLocalRAAssignments(false);
|
|
}
|
|
// Restore alignment in case LRA modified it
|
|
copyAlignment();
|
|
return false;
|
|
}
|
|
coloring.confirmRegisterAssignments();
|
|
|
|
if (kernel.fg.getHasStackCalls() || kernel.fg.getIsStackCallFunc()) {
|
|
coloring.getSaveRestoreRegister();
|
|
addSaveRestoreCode(0);
|
|
}
|
|
|
|
if (verifyAugmentation) {
|
|
assignRegForAliasDcl();
|
|
verifyAugmentation->verify();
|
|
}
|
|
}
|
|
|
|
kernel.setRAType(lra.doHybridBCR() ? RA_Type::HYBRID_BC_RA
|
|
: RA_Type::HYBRID_RA);
|
|
return true;
|
|
}
|
|
|
|
|
|
//
|
|
// change single-element dcl for G4_GRF to G4_SCALAR
|
|
//
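// The pass works in three steps: collect root declares that are only written
// by SIMD1 NoMask instructions, filter out candidates that cannot live in the
// scalar register file (e.g. send destinations/sources when only one SRF is
// available), and retag the survivors as G4_SCALAR, inserting a copy into a
// scalar temp for multi-use scalar inputs.
//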
|
|
void GlobalRA::selectScalarCandidates() {
|
|
// collect root-declares that may be changed to scalar
|
|
std::set<G4_Declare *> candidates;
|
|
for (auto bb : kernel.fg) {
|
|
for (auto inst : *bb) {
|
|
G4_DstRegRegion *dst = inst->getDst();
|
|
if (dst && dst->getTopDcl()) {
|
|
auto rootDcl = dst->getTopDcl();
|
|
if (rootDcl->getRegFile() == G4_GRF && !candidates.count(rootDcl) &&
|
|
rootDcl->getNumRows() <= 1) {
|
|
bool isNoMaskInst =
|
|
(inst->isWriteEnableInst() || bb->isAllLaneActive());
|
|
if (inst->getExecSize() == g4::SIMD1 && isNoMaskInst) {
|
|
candidates.insert(rootDcl);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
// filter candidates that we cannot handle
|
|
std::set<G4_Declare *> multiUseInputs;
|
|
std::set<G4_Declare *> visitedInputs;
|
|
for (auto bb : kernel.fg) {
|
|
for (auto inst : *bb) {
|
|
G4_DstRegRegion *dst = inst->getDst();
|
|
if (dst && dst->getTopDcl()) {
|
|
auto rootDcl = dst->getTopDcl();
|
|
if (candidates.count(rootDcl)) {
|
|
// when there is only one SRF, it is unrealistic to allow
|
|
// send to write to SRF
|
|
if (inst->isSend() && kernel.getSRFInWords()*2
|
|
<= builder.getScalarRegisterSizeInBytes()) {
|
|
candidates.erase(rootDcl);
|
|
}
|
|
bool isNoMaskInst =
|
|
(inst->isWriteEnableInst() || bb->isAllLaneActive());
|
|
// all writes to that top-dcl have to be simd1-nomask
|
|
if (!(inst->getExecSize() == g4::SIMD1 && isNoMaskInst)) {
|
|
candidates.erase(rootDcl);
|
|
}
|
|
}
|
|
}
|
|
// when there is only one SRF, it is also unrealistic to allow
|
|
// SRF as regular send source because it has to be 64-byte aligned
|
|
if (inst->isSend() && kernel.getSRFInWords() * 2 <=
|
|
builder.getScalarRegisterSizeInBytes()) {
|
|
for (int i = 0; i < inst->getNumSrc(); i++) {
|
|
auto src = inst->getSrc(i);
|
|
if (!src || !src->isSrcRegRegion())
|
|
continue;
|
|
auto srcDcl = src->getTopDcl();
|
|
if (srcDcl && candidates.count(srcDcl))
|
|
candidates.erase(srcDcl);
|
|
}
|
|
}
|
|
// Also find all the input dcls that are used inside a loop,
|
|
// or used more than once. Skip moves and sends
|
|
if (inst->isRawMov() || inst->isSend())
|
|
continue;
|
|
for (int i = 0; i < inst->getNumSrc(); i++) {
|
|
auto src = inst->getSrc(i);
|
|
if (!src || !src->isSrcRegRegion())
|
|
continue;
|
|
G4_SrcRegRegion *origSrc = src->asSrcRegRegion();
|
|
auto srcDcl = src->getTopDcl();
|
|
if (srcDcl && srcDcl->getRegFile() == G4_INPUT &&
|
|
srcDcl->getTotalElems() == 1 && origSrc && origSrc->isScalar()) {
|
|
// mark multi-use
|
|
if (bb->getNestLevel() > 0)
|
|
multiUseInputs.insert(srcDcl);
|
|
else if (visitedInputs.find(srcDcl) != visitedInputs.end())
|
|
multiUseInputs.insert(srcDcl);
|
|
// mark any use
|
|
visitedInputs.insert(srcDcl);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
// update declares
|
|
for (auto dcl : kernel.Declares) {
|
|
auto rootDcl = dcl->getRootDeclare();
|
|
if (candidates.count(rootDcl)) {
|
|
dcl->setRegFile(G4_SCALAR);
|
|
}
|
|
}
|
|
G4_BB *entryBB = kernel.fg.getEntryBB();
|
|
auto insertIt = entryBB->begin();
|
|
for (INST_LIST_ITER IB = entryBB->end(); insertIt != IB; ++insertIt) {
|
|
G4_INST *tI = (*insertIt);
|
|
if (tI->isFlowControl() || tI == entryBB->back())
|
|
break;
|
|
}
|
|
std::map<G4_Declare *, G4_Declare *> inputMap;
|
|
for (auto bb : kernel.fg) {
|
|
INST_LIST_ITER it = bb->begin(), iEnd = bb->end();
|
|
INST_LIST_ITER next_iter = it;
|
|
for (; it != iEnd; it = next_iter) {
|
|
++next_iter;
|
|
G4_INST *inst = *it;
|
|
// skip moves and sends
|
|
if (inst->isRawMov() || inst->isSend())
|
|
continue;
|
|
// Add a move for a scalar input with multiple uses to save power.
|
|
// The move should be inserted into the entry-block before the first use.
|
|
for (int i = 0; i < inst->getNumSrc(); i++) {
|
|
auto src = inst->getSrc(i);
|
|
if (!src || !src->isSrcRegRegion())
|
|
continue;
|
|
G4_SrcRegRegion *origSrc = src->asSrcRegRegion();
|
|
auto srcDcl = src->getTopDcl();
|
|
if (srcDcl && origSrc && origSrc->isScalar() &&
|
|
multiUseInputs.find(srcDcl) != multiUseInputs.end()) {
|
|
vISA_ASSERT(!candidates.count(srcDcl),
|
|
"input-variable cannot be a scalar candidate");
|
|
// insert a move to a scalar-register candidate
|
|
auto subAlign = Get_G4_SubRegAlign_From_Size(
|
|
(uint16_t)origSrc->getElemSize(), builder.getPlatform(),
|
|
builder.getGRFAlign());
|
|
G4_Declare *tmpDcl = nullptr;
|
|
G4_SrcModifier modifier = origSrc->getModifier();
|
|
if (inputMap.find(srcDcl) != inputMap.end())
|
|
tmpDcl = inputMap[srcDcl];
|
|
else {
|
|
// create dcl for scalar
|
|
tmpDcl =
|
|
builder.createTempVar(g4::SIMD1, origSrc->getType(), subAlign);
|
|
tmpDcl->setRegFile(G4_SCALAR);
|
|
candidates.insert(tmpDcl);
|
|
inputMap[srcDcl] = tmpDcl;
|
|
addVarToRA(tmpDcl);
|
|
// create mov
|
|
origSrc->setModifier(Mod_src_undef);
|
|
G4_DstRegRegion *dst = builder.createDstRegRegion(tmpDcl, 1);
|
|
G4_INST *movInst = builder.createMov(g4::SIMD1, dst, origSrc,
|
|
InstOpt_WriteEnable, false);
|
|
// insert mov
|
|
if (bb == entryBB)
|
|
bb->insertBefore(it, movInst);
|
|
else
|
|
entryBB->insertBefore(insertIt, movInst);
|
|
}
|
|
G4_SrcRegRegion *newSrc = builder.createSrcRegRegion(
|
|
modifier, Direct, tmpDcl->getRegVar(), 0, 0,
|
|
builder.getRegionScalar(), tmpDcl->getElemType());
|
|
inst->setSrc(newSrc, i);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
// flag and address spill location now should be scalar registers.
|
|
// scalar registers can be spilled into GRF
|
|
// need this? addrFlagSpillDcls.clear();
|
|
kernel.dumpToFile("after.select_scalar.");
|
|
}
|
|
|
|
|
|
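//
// Mark the current iteration as fail-safe and compute how many GRFs must be
// reserved for spill/fill code: a fixed count with NewFailSafeRA, otherwise
// derived from the kernel's spill and indirect-spill requirements.
//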
std::pair<unsigned, unsigned> GlobalRA::reserveGRFSpillReg(GraphColor &coloring) {
coloring.markFailSafeIter(true);
unsigned spillRegSize = 0;
unsigned indrSpillRegSize = 0;

if (kernel.getOption(vISA_NewFailSafeRA)) {
spillRegSize = getNumReservedGRFs();
} else {
determineSpillRegSize(spillRegSize, indrSpillRegSize);
}

if (builder.usesStack())
vISA_ASSERT(spillRegSize + indrSpillRegSize <
kernel.stackCall.getNumCalleeSaveRegs(),
"Invalid reserveSpillSize in fail-safe RA!");
coloring.setReserveSpillGRFCount(spillRegSize + indrSpillRegSize);
return std::make_pair(spillRegSize, indrSpillRegSize);
}
|
|
|
|
// Pre-allocate the bits for forbidden registers that will not be used in
// register assignment.
// Note that the order of the calls matters; for example, all RCs inherit from
// RESERVEDGRF.
void GlobalRA::generateForbiddenTemplates(unsigned reserveSpillSize) {
fbdRegs.generateReservedGRFForbidden(reserveSpillSize);
fbdRegs.generateCallerSaveGRFForbidden();
fbdRegs.generateCalleeSaveGRFForbidden();
fbdRegs.generateEOTGRFForbidden();
fbdRegs.generateLastGRFForbidden();
fbdRegs.generateEOTLastGRFForbidden();
}
|
|
|
|
//
|
|
// Create variables will be used in fail safe RA
|
|
//
|
|
void GlobalRA::createVariablesForHybridRAWithSpill() {
|
|
// To conduct fail safe RA in iteration 0, some variables need to be allocated
|
|
// first so that they can join RA and be used in the spill/fill directly.
|
|
addVarToRA(builder.getSpillFillHeader());
|
|
addVarToRA(builder.getOldA0Dot2Temp());
|
|
|
|
if (builder.hasScratchSurface() && !builder.getSpillSurfaceOffset()) {
|
|
vISA_ASSERT(builder.instList.empty(),
|
|
"Inst list should be empty at this point before creating "
|
|
"instruction that initializes SSO");
|
|
builder.initScratchSurfaceOffset();
|
|
addVarToRA(builder.getSpillSurfaceOffset());
|
|
if (!builder.instList.empty()) {
|
|
// If SSO is not yet initialized, insert the created
|
|
// instruction into the entry BB.
|
|
auto entryBB = builder.kernel.fg.getEntryBB();
|
|
auto iter = std::find_if(entryBB->begin(), entryBB->end(),
|
|
[](G4_INST *inst) { return !inst->isLabel(); });
|
|
entryBB->splice(iter, builder.instList);
|
|
}
|
|
}
|
|
// BuiltinR0 may be spilled which is not allowed.
|
|
// FIXME: BuiltinR0 spill cost has been set to MAX already,
|
|
// keep spilling means there is some issue in cost model
|
|
builder.getBuiltinR0()->setLiveOut();
|
|
builder.getBuiltinR0()->getRegVar()->setPhyReg(builder.phyregpool.getGreg(0),
|
|
0);
|
|
}
|
|
|
|
void GlobalRA::initSRAsScratch() const {
|
|
// Verify old scratch dcl assignment before changing it
|
|
vISA_ASSERT(kernel.fg.scratchRegDcl->getRegVar()
|
|
->getPhyReg()
|
|
->asGreg()
|
|
->getRegNum() == kernel.stackCall.getSpillHeaderGRF(),
|
|
"unexpected assignment");
|
|
vISA_ASSERT(kernel.stackCall.getSpillHeaderGRF() ==
|
|
kernel.stackCall.getFPSPGRF(),
|
|
"expecting same GRF");
|
|
// Use last caller save GRF for spill/fill addr computation. Since this
|
|
// address is used as LSC header, we must use 0th sub-reg of reserved
|
|
// GRF.
|
|
kernel.fg.scratchRegDcl->getRegVar()->setPhyReg(
|
|
regPool.getGreg(kernel.stackCall.getCallerSaveLastGRF()), 0);
|
|
|
|
// Mark SR assignment as reserved so other variables don't try to
|
|
// use it.
|
|
kernel.fg.reserveSR = true;
|
|
}
|
|
|
|
void GlobalRA::stackCallSaveRestore(bool hasStackCall) {
|
|
//
|
|
// If the graph has stack calls, then add the caller-save/callee-save pseudo
|
|
// declares and code. This currently must be done after flag/addr RA due to
|
|
// the assumption about the location of the pseudo save/restore instructions
|
|
//
|
|
if (hasStackCall) {
|
|
addCallerSavePseudoCode();
|
|
|
|
// Only GENX sub-graphs require callee-save code.
|
|
|
|
if (builder.getIsKernel() == false) {
|
|
storeCEInProlog();
|
|
addCalleeSavePseudoCode();
|
|
addStoreRestoreToReturn();
|
|
}
|
|
|
|
if (!kernel.getOption(vISA_PreserveR0InR0)) {
|
|
// bind builtinR0 to the reserved stack call ABI GRF so that caller and
|
|
// callee can agree on which GRF to use for r0
|
|
builder.getBuiltinR0()->getRegVar()->setPhyReg(
|
|
builder.phyregpool.getGreg(kernel.stackCall.getThreadHeaderGRF()), 0);
|
|
}
|
|
|
|
}
|
|
}
|
|
|
|
int GlobalRA::doGlobalLinearScanRA() {
|
|
copyMissingAlignment();
|
|
BankConflictPass bc(*this, false);
|
|
LivenessAnalysis liveAnalysis(*this, G4_GRF | G4_INPUT);
|
|
liveAnalysis.computeLiveness();
|
|
|
|
TIME_SCOPE(LINEARSCAN_RA);
|
|
LinearScanRA lra(bc, *this, liveAnalysis);
|
|
int ret = lra.doLinearScanRA();
|
|
if (ret == VISA_SUCCESS) {
|
|
expandSpillFillIntrinsics(nextSpillOffset);
|
|
assignRegForAliasDcl();
|
|
if (builder.getOption(vISA_verifyLinearScan)) {
|
|
resetGlobalRAStates();
|
|
markGraphBlockLocalVars();
|
|
LivenessAnalysis live(*this, G4_GRF | G4_INPUT, false, true);
|
|
live.computeLiveness();
|
|
GraphColor coloring(live, false, false);
|
|
coloring.createLiveRanges();
|
|
Interference intf(&live, *this);
|
|
intf.init();
|
|
intf.computeInterference();
|
|
|
|
if (kernel.getOption(vISA_DumpRAIntfGraph))
|
|
intf.dumpInterference();
|
|
intf.linearScanVerify();
|
|
}
|
|
return VISA_SUCCESS;
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
void GlobalRA::incRABookKeeping() {
|
|
// Reset state of incremental RA here as we move from hybrid RA
|
|
// to global RA. Note that when moving from flag->address or from
|
|
// address->GRF RA, we don't need to explicitly reset state because
|
|
// incremental RA can deduce we're moving to RA for different
|
|
// variable class. But it cannot deduce so when moving from hybrid
|
|
// to global RA.
|
|
incRA.moveFromHybridToGlobalGRF();
|
|
|
|
// This part makes incremental RA a non-NFC change. The reason we need
|
|
// to do this is because variables that spill intrinsics use may end up
|
|
// getting extended in each RA iteration. Given that those variables
|
|
// are either r0 or scalars, we mark them as Output here so they're
|
|
// live-out throughout. To make this an NFC change, we can enable this
|
|
// block even when incremental RA is not enabled.
|
|
if (incRA.isEnabled()) {
|
|
builder.getBuiltinR0()->getRootDeclare()->setLiveOut();
|
|
builder.getSpillFillHeader();
|
|
|
|
bool initSS = builder.hasScratchSurface();
|
|
if (initSS) {
|
|
builder.initScratchSurfaceOffset();
|
|
builder.getOldA0Dot2Temp();
|
|
}
|
|
}
|
|
}
|
|
|
|
std::pair<bool, bool> GlobalRA::remat(bool fastCompile, bool rematDone,
|
|
LivenessAnalysis &liveAnalysis,
|
|
GraphColor &coloring, RPE &rpe) {
|
|
bool runRemat = kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_CM
|
|
? true
|
|
: kernel.getSimdSize() < kernel.numEltPerGRF<Type_UB>();
|
|
// -noremat takes precedence over -forceremat
|
|
bool rematOn = !kernel.getOption(vISA_Debug) &&
|
|
!kernel.getOption(vISA_NoRemat) &&
|
|
!kernel.getOption(vISA_FastSpill) && !fastCompile &&
|
|
(kernel.getOption(vISA_ForceRemat) || runRemat);
|
|
|
|
if (!rematDone && rematOn) {
|
|
RA_TRACE(std::cout << "\t--rematerialize\n");
|
|
Rematerialization remat(kernel, liveAnalysis, coloring, rpe, *this);
|
|
remat.run();
|
|
|
|
// Re-run GRA loop only if remat caused changes to IR
|
|
return std::make_pair(remat.getChangesMade(), true);
|
|
}
|
|
return std::make_pair(false, rematDone);
|
|
}
|
|
|
|
std::tuple<bool, bool, bool>
|
|
GlobalRA ::alignedScalarSplit(bool fastCompile, bool alignedScalarSplitDone,
|
|
GraphColor &coloring) {
|
|
bool isEarlyExit = false;
|
|
if (kernel.getOption(vISA_SplitGRFAlignedScalar) && !fastCompile &&
|
|
!kernel.getOption(vISA_FastSpill) && !alignedScalarSplitDone) {
|
|
SplitAlignedScalars split(*this, coloring);
|
|
split.run();
|
|
|
|
// Re-run GRA loop if changes were made to IR
|
|
bool rerunGRA = split.getChangesMade();
|
|
kernel.dumpToFile("after.Split_Aligned_Scalar." +
|
|
std::to_string(getIterNo()));
|
|
#ifndef DLL_MODE
|
|
if (stopAfter("Split_Aligned_Scalar")) {
|
|
isEarlyExit = true;
|
|
}
|
|
#endif // DLL_MODE
|
|
return std::make_tuple(rerunGRA, true, isEarlyExit);
|
|
}
|
|
return std::make_tuple(false, alignedScalarSplitDone, false);
|
|
}
|
|
|
|
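//
// Decide whether to run the one-time global send-variable split: count the
// spill/fill references of spilled multi-row declares that are used in sends
// and, on the first GRF RA iteration only, let the split pass judge whether
// splitting them is worthwhile before another coloring attempt.
//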
bool GlobalRA::globalSplit(VarSplit& splitPass, GraphColor& coloring) {
|
|
unsigned int sendAssociatedGRFSpillFillCount = 0;
|
|
// Calculate the spill caused by send to decide if global splitting is
|
|
// required or not
|
|
for (auto spilled : coloring.getSpilledLiveRanges()) {
|
|
auto spillDcl = spilled->getDcl();
|
|
if (spillDcl->getIsRefInSendDcl() && spillDcl->getNumRows() > 1) {
|
|
sendAssociatedGRFSpillFillCount += spilled->getRefCount();
|
|
}
|
|
}
|
|
|
|
if (getIterNo() ==
|
|
0 && // Only works when first iteration of Global RA failed.
|
|
!splitPass.didGlobalSplit && // Do only one time.
|
|
splitPass.canDoGlobalSplit(builder, kernel,
|
|
sendAssociatedGRFSpillFillCount)) {
|
|
RA_TRACE(std::cout << "\t--global send split\n");
|
|
splitPass.globalSplit(builder, kernel);
|
|
splitPass.didGlobalSplit = true;
|
|
// TODO: Since global split is rarely enabled, for now we skip
|
|
// incremental RA whenever it is enabled.
|
|
incRA.skipIncrementalRANextIter();
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
void GlobalRA::localSplit(bool fastCompile, VarSplit& splitPass) {
// Do variable splitting in each iteration
// Don't do when fast compile is required
if (builder.getOption(vISA_LocalDeclareSplitInGlobalRA) && !fastCompile) {
RA_TRACE(std::cout << "\t--split local send--\n");
for (auto bb : kernel.fg) {
splitPass.localSplit(builder, bb);
}
}
}
|
|
|
|
std::pair<bool, bool> GlobalRA::bankConflict() {
bool doBankConflictReduction = false, highInternalConflict = false;
if (builder.getOption(vISA_LocalBankConflictReduction) &&
builder.hasBankCollision()) {
bool reduceBCInRR = false;
bool reduceBCInTAandFF = false;
BankConflictPass bc(*this, true);

reduceBCInRR = bc.setupBankConflictsForKernel(
true, reduceBCInTAandFF, SECOND_HALF_BANK_START_GRF * 2,
highInternalConflict);
doBankConflictReduction = reduceBCInRR && reduceBCInTAandFF;
}
return std::make_pair(doBankConflictReduction, highInternalConflict);
}
|
|
|
|
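//
// Enable fail-safe RA (reserving spill GRFs) when the option is set for 3D
// kernels without stack calls, either on the final RA iteration or, when
// address-taken variables do not prevent it, once the configured fail-safe
// iteration is reached.
//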
bool GlobalRA::setupFailSafeIfNeeded(bool fastCompile, bool hasStackCall,
|
|
unsigned int maxRAIterations,
|
|
unsigned int failSafeRAIteration) {
|
|
bool reserveSpillReg = false;
|
|
bool allowAddrTaken = builder.getOption(vISA_FastSpill) || fastCompile ||
|
|
!kernel.getHasAddrTaken();
|
|
if (builder.getOption(vISA_FailSafeRA) &&
|
|
kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D &&
|
|
kernel.getNumRegTotal() > 32 &&
|
|
!hasStackCall &&
|
|
((getIterNo() == maxRAIterations - 1) ||
|
|
(allowAddrTaken && getIterNo() == failSafeRAIteration))) {
|
|
RA_TRACE(std::cout << "\t--enable failSafe RA\n");
|
|
reserveSpillReg = true;
|
|
|
|
if (incRA.isEnabled()) {
|
|
incRA.skipIncrementalRANextIter();
|
|
}
|
|
|
|
if (builder.hasScratchSurface() && !hasStackCall) {
|
|
// Since this is fail safe RA iteration, we ensure the 2 special
|
|
// variables are created before coloring so spill code can use
|
|
// them, if needed.
|
|
auto a0Dot2Temp = kernel.fg.builder->getOldA0Dot2Temp();
|
|
addVarToRA(a0Dot2Temp);
|
|
if (builder.supportsLSC()) {
|
|
auto spillFillHdr = kernel.fg.builder->getSpillFillHeader();
|
|
addVarToRA(spillFillHdr);
|
|
}
|
|
}
|
|
}
|
|
return reserveSpillReg;
|
|
}
|
|
|
|
void GlobalRA::undefinedUses(bool rematDone, LivenessAnalysis& liveAnalysis) {
if (builder.getOption(vISA_DumpUndefUsesFromLiveness) && getIterNo() == 0 &&
!rematDone) {
liveAnalysis.reportUndefinedUses();
}
}
|
|
|
|
void GlobalRA::writeVerboseStatsNumVars(LivenessAnalysis &liveAnalysis,
FINALIZER_INFO *jitInfo) {
if (builder.getOption(vISA_DumpPerfStatsVerbose)) {
jitInfo->statsVerbose.varNum = liveAnalysis.getNumSelectedVar();
jitInfo->statsVerbose.globalVarNum = liveAnalysis.getNumSelectedGlobalVar();
}
}
|
|
|
|
void GlobalRA::writeVerboseRPEStats(RPE &rpe) {
if (builder.getOption(vISA_DumpPerfStatsVerbose) &&
builder.getJitInfo()->statsVerbose.RAIterNum == 1) {
builder.getJitInfo()->statsVerbose.maxRP = rpe.getMaxRP();
}
if (builder.getOption(vISA_DumpPerfStats)) {
builder.getJitInfo()->stats.maxGRFPressure = rpe.getMaxRP();
}
}
|
|
|
|
bool GlobalRA::VRTIncreasedGRF(GraphColor &coloring) {
if (kernel.useAutoGRFSelection()) {
bool infCostSpilled =
coloring.getSpilledLiveRanges().end() !=
std::find_if(coloring.getSpilledLiveRanges().begin(),
coloring.getSpilledLiveRanges().end(),
[](const LiveRange *spilledLR) {
return spilledLR->getSpillCost() == MAXSPILLCOST;
});
// Check if GRF can be increased to avoid large spills
if (canIncreaseGRF(computeSpillSize(coloring.getSpilledLiveRanges()),
infCostSpilled))
return true;
}
return false;
}
|
|
|
|
void GlobalRA::splitOnSpill(bool fastCompile, GraphColor &coloring,
LivenessAnalysis &liveAnalysis) {
if (!kernel.getOption(vISA_Debug) && getIterNo() == 0 && !fastCompile &&
kernel.getOption(vISA_DoSplitOnSpill)) {
RA_TRACE(std::cout << "\t--var split around loop\n");
LoopVarSplit loopSplit(kernel, &coloring, &liveAnalysis);
kernel.fg.getLoops().computePreheaders();
loopSplit.run();
}
}
|
|
|
|
bool GlobalRA::convertToFailSafe(bool reserveSpillReg, GraphColor &coloring,
|
|
LivenessAnalysis &liveAnalysis,
|
|
unsigned int nextSpillOffset) {
|
|
// Very few spills in this iter. Check if we can convert this to fail
|
|
// safe iter. By converting this iter to fail safe we can save (at
|
|
// least) 1 additional iter to allocate spilled temps. But converting to
|
|
// fail safe needs extra checks because no reserved GRF may exist at
|
|
// this point. So push/pop needs to succeed without additional GRF
|
|
// potentially.
|
|
if (!kernel.getOption(vISA_Debug) && getIterNo() >= 1 &&
|
|
kernel.getOption(vISA_NewFailSafeRA) && !reserveSpillReg &&
|
|
coloring.getSpilledLiveRanges().size() <= BoundedRA::MaxSpillNumVars &&
|
|
liveAnalysis.getNumSelectedVar() > BoundedRA::LargeProgramSize) {
|
|
// Stack call always has free GRF so it is safe to convert this iter
|
|
// to fail safe
|
|
if (builder.usesStack() ||
|
|
// If LSC has to be used for spill/fill then we need to ensure
|
|
// spillHeader is created
|
|
(useLscForNonStackCallSpillFill && builder.hasValidSpillFillHeader()) ||
|
|
// or if immediate can be folded in to LSC
|
|
canUseLscImmediateOffsetSpillFill ||
|
|
// If scratch is to be used then max spill offset must be within
|
|
// addressable range and r0 must be available as reserved. If r0
|
|
// is not reserved, we cannot convert the current iteration to fail
|
|
// safe because r0 may get assigned to other virtual variables.
|
|
((kernel.getOption(vISA_PreserveR0InR0) ||
|
|
builder.getBuiltinR0()->isOutput()) &&
|
|
(nextSpillOffset + BoundedRA::getNumPhyVarSlots(kernel)) <
|
|
SCRATCH_MSG_LIMIT)) {
|
|
// Few ranges are spilled but this was not executed as fail
|
|
// safe iteration. However, we have the capability of doing
|
|
// push/pop with new fail safe RA implementation. So for very
|
|
// few spills, we insert push/pop to free up some GRFs rather
|
|
// than executing a new RA iteration. When doing so, we mark
|
|
// this RA iteration as fail safe.
|
|
coloring.markFailSafeIter(true);
|
|
// No reserved GRFs
|
|
setNumReservedGRFsFailSafe(0);
|
|
RA_TRACE(std::cout << "\t--enabling new fail safe RA\n");
|
|
return true;
|
|
}
|
|
}
|
|
return reserveSpillReg;
|
|
}
|
|
|
|
std::pair<bool, unsigned int>
|
|
GlobalRA::abortOnSpill(unsigned int GRFSpillFillCount,
|
|
GraphColor &coloring) {
|
|
// Accumulate the weighted spill/fill reference count across all spilled
// live ranges; this feeds the abort-on-spill threshold check below.
|
|
for (auto spilled : coloring.getSpilledLiveRanges()) {
|
|
GRFSpillFillCount += spilled->getRefCount();
|
|
}
|
|
|
|
// vISA_AbortOnSpillThreshold is defined as [0..200]
|
|
// where 0 means abort on any spill and 200 means never abort
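// For example, with threshold T and unweighted instruction count A, spilling
// is tolerated while GRFSpillFillCount * 200 < T * A (or while the total
// spill size stays below the GRF mode's spill threshold); T == 100 therefore
// allows spill/fill references up to half of the instruction count.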
|
|
auto underSpillThreshold = [this](int numSpill, int asmCount,
|
|
GraphColor &coloring) {
|
|
int threshold = std::min(
|
|
builder.getOptions()->getuInt32Option(vISA_AbortOnSpillThreshold),
|
|
200u);
|
|
unsigned spillSize = computeSpillSize(coloring.getSpilledLiveRanges());
|
|
|
|
return (numSpill * 200) < (threshold * asmCount) ||
|
|
spillSize < kernel.grfMode.getSpillThreshold();
|
|
};
|
|
|
|
unsigned int instNum = instCount();
|
|
bool isUnderThreshold =
|
|
underSpillThreshold(GRFSpillFillCount, instNum, coloring);
|
|
isUnderThreshold = builder.getFreqInfoManager().underFreqSpillThreshold(
|
|
coloring.getSpilledLiveRanges(), instNum, GRFSpillFillCount,
|
|
isUnderThreshold);
|
|
|
|
if (isUnderThreshold) {
|
|
if (auto jitInfo = builder.getJitInfo()) {
|
|
jitInfo->avoidRetry = true;
|
|
}
|
|
}
|
|
|
|
if (builder.getOption(vISA_AbortOnSpill) && !isUnderThreshold) {
|
|
// update jit metadata information
|
|
if (auto jitInfo = builder.getJitInfo()) {
|
|
jitInfo->stats.spillMemUsed = 0;
|
|
jitInfo->stats.numAsmCountUnweighted = instNum;
|
|
jitInfo->stats.numGRFSpillFillWeighted = GRFSpillFillCount;
|
|
}
|
|
|
|
return std::make_pair(true, GRFSpillFillCount);
|
|
}
|
|
return std::make_pair(false, GRFSpillFillCount);
|
|
}
|
|
|
|
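// Total spill size in bytes, summed over the declares of the spilled live
// ranges (one overload for linear-scan live ranges, one for graph-coloring
// live ranges).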
unsigned GlobalRA::computeSpillSize(std::list<LSLiveRange *> &spilledLRs) {
unsigned spillSize = 0;
for (auto lr : spilledLRs) {
spillSize += lr->getTopDcl()->getByteSize();
}
return spillSize;
}

unsigned GlobalRA::computeSpillSize(const LIVERANGE_LIST &spilledLRs) {
unsigned spillSize = 0;
for (auto lr : spilledLRs) {
spillSize += lr->getDcl()->getByteSize();
}
return spillSize;
}
|
|
|
|
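//
// Decide whether spill space compression is needed: return true when the
// projected spill size (the current size scaled by 1.2 for later iterations)
// would no longer fit below the compression threshold after subtracting the
// global scratch offset, or when compression is forced via options.
//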
bool GlobalRA::spillSpaceCompression(int spillSize,
const int globalScratchOffset) {
if (builder.getOption(vISA_ForceSpillSpaceCompression) &&
(builder.getuint32Option(vISA_SpillSpaceCompressionThreshold) == 0))
return true;

int spillcompressionThreshold =
(int)builder.getuint32Option(vISA_SpillSpaceCompressionThreshold) * 1024;

// user disabled vISA_ForceSpillSpaceCompression and no threshold override.
if (spillcompressionThreshold == 0) {
spillcompressionThreshold = SCRATCH_COMPRESS_THRESHOLD;
}

// A factor of 1.2 is used to account for the space needed by the following
// iterations. Generally, most spills happen in the first iteration.
if ((spillSize * 1.2) <
(spillcompressionThreshold - globalScratchOffset)) {
return false;
}
return true;
}
|
|
|
|
void GlobalRA::verifyNoInfCostSpill(GraphColor& coloring, bool reserveSpillReg)
{
vISA_ASSERT(std::all_of(coloring.getSpilledLiveRanges().begin(),
coloring.getSpilledLiveRanges().end(),
[&](const LiveRange *spilledLR) {
// EOT spills even of infinite cost are
// specially handled in spill insertion when
// using old fail safe RA. So don't assert for
// such spills.
if (isEOTSpillWithFailSafeRA(builder, spilledLR,
reserveSpillReg) &&
!builder.getOption(vISA_NewFailSafeRA))
return true;
return spilledLR->getSpillCost() != MAXSPILLCOST;
}),
"Spilled inf spill cost range");
}
|
|
|
|
void GlobalRA::setupA0Dot2OnSpill(bool hasStackCall,
unsigned int nextSpillOffset,
int globalScratchOffset) {
if (builder.hasScratchSurface() && !hasStackCall &&
(nextSpillOffset + globalScratchOffset) >= SCRATCH_MSG_LIMIT) {
// create temp variable to store old a0.2 - this is marked as live-in
// and live-out because the variable is emitted only post-RA to
// preserve the old value of a0.2.
kernel.fg.builder->getOldA0Dot2Temp();
} else if (useLscForNonStackCallSpillFill || useLscForScatterSpill) {
// Xe2+ LSC-based spill/fill needs the same as above
{
kernel.fg.builder->getOldA0Dot2Temp();
}
}
}
|
|
|
|
bool GlobalRA::spillCleanup(bool fastCompile, bool useScratchMsgForSpill,
|
|
bool hasStackCall, bool reserveSpillReg, RPE &rpe,
|
|
GraphColor &coloring,
|
|
LivenessAnalysis &liveAnalysis,
|
|
SpillManagerGRF &spillGRF) {
|
|
bool disableSpillCoalecse = builder.getOption(vISA_DisableSpillCoalescing) ||
|
|
builder.getOption(vISA_FastSpill) ||
|
|
fastCompile || builder.getOption(vISA_Debug) ||
|
|
// spill cleanup is not supported when we use oword
|
|
// msg for spill/fill for non-stack calls.
|
|
(!useScratchMsgForSpill && !hasStackCall);
|
|
|
|
if (!reserveSpillReg && !disableSpillCoalecse && builder.useSends()) {
|
|
RA_TRACE(std::cout << "\t--spill/fill cleanup\n");
|
|
CoalesceSpillFills c(kernel, liveAnalysis, coloring, spillGRF, getIterNo(),
|
|
rpe, *this);
|
|
c.run();
|
|
#ifndef DLL_MODE
|
|
if (stopAfter("spillCleanup")) {
|
|
return true;
|
|
}
|
|
#endif // DLL_MODE
|
|
}
|
|
return false;
|
|
}
|
|
|
|
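//
// Insert spill/fill code for the live ranges that failed to color in this
// iteration. Returns {success, spill-space-compression flag, early-exit flag,
// updated scratch offset, updated next spill offset}; on success it may also
// run the spill/fill coalescing cleanup pass.
//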
std::tuple<bool, bool, bool, unsigned int, unsigned int>
|
|
GlobalRA::insertSpillCode(bool enableSpillSpaceCompression,
|
|
GraphColor &coloring, LivenessAnalysis &liveAnalysis,
|
|
RPE &rpe, unsigned int scratchOffset,
|
|
bool fastCompile, bool hasStackCall,
|
|
int globalScratchOffset, unsigned int nextSpillOffset,
|
|
bool reserveSpillReg, unsigned int spillRegSize,
|
|
unsigned int indrSpillRegSize,
|
|
bool useScratchMsgForSpill) {
|
|
if (getIterNo() == 0 && enableSpillSpaceCompression &&
|
|
kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D &&
|
|
!hasStackCall) {
|
|
enableSpillSpaceCompression = spillSpaceCompression(
|
|
computeSpillSize(coloring.getSpilledLiveRanges()), globalScratchOffset);
|
|
}
|
|
|
|
startTimer(TimerID::SPILL);
|
|
SpillManagerGRF spillGRF(*this, nextSpillOffset, &liveAnalysis,
|
|
coloring.getIntf(), &coloring.getSpilledLiveRanges(),
|
|
reserveSpillReg, spillRegSize, indrSpillRegSize,
|
|
enableSpillSpaceCompression, useScratchMsgForSpill);
|
|
|
|
if (kernel.getOption(vISA_SpillAnalysis)) {
|
|
spillAnalysis->Do(&liveAnalysis, &coloring, &spillGRF);
|
|
}
|
|
|
|
verifyNoInfCostSpill(coloring, reserveSpillReg);
|
|
|
|
bool success = spillGRF.insertSpillFillCode(&kernel, pointsToAnalysis);
|
|
nextSpillOffset = spillGRF.getNextOffset();
|
|
|
|
if (kernel.getOption(vISA_VerifyRA)) {
|
|
// To minimize false positives, turn off RMW opt and spill cleanup
|
|
verifySpillFill();
|
|
}
|
|
|
|
setupA0Dot2OnSpill(hasStackCall, nextSpillOffset, globalScratchOffset);
|
|
|
|
RA_TRACE({
|
|
auto &&spills = coloring.getSpilledLiveRanges();
|
|
std::cout << "\t--# variables spilled: " << spills.size() << "\n";
|
|
if (spills.size() < 100) {
|
|
std::cout << "\t--spilled variables: ";
|
|
for (auto &&lr : spills) {
|
|
std::cout << lr->getDcl()->getName() << " ";
|
|
}
|
|
std::cout << "\n";
|
|
}
|
|
std::cout << "\t--current spill size: " << nextSpillOffset << "\n";
|
|
});
|
|
|
|
if (!success) {
|
|
return std::make_tuple(false, enableSpillSpaceCompression, false,
|
|
scratchOffset, nextSpillOffset);
|
|
}
|
|
|
|
kernel.dumpToFile("after.Spill_GRF." + std::to_string(getIterNo() + 1));
|
|
#ifndef DLL_MODE
|
|
if (stopAfter("Spill_GRF")) {
|
|
return std::make_tuple(true, enableSpillSpaceCompression, true,
|
|
scratchOffset, nextSpillOffset);
|
|
}
|
|
#endif // DLL_MODE
|
|
|
|
scratchOffset = std::max(scratchOffset, spillGRF.getNextScratchOffset());
|
|
|
|
bool isEarlyExit =
|
|
spillCleanup(fastCompile, useScratchMsgForSpill, hasStackCall,
|
|
reserveSpillReg, rpe, coloring, liveAnalysis, spillGRF);
|
|
|
|
return std::make_tuple(true, enableSpillSpaceCompression, isEarlyExit,
|
|
scratchOffset, nextSpillOffset);
|
|
}
|
|
|
|
bool GlobalRA::rerunGRAIter(bool rerunGRA)
{
if (getIterNo() == 0 && (rerunGRA || kernel.getOption(vISA_forceBCR))) {
if (kernel.getOption(vISA_forceBCR)) {
// FIXME: We shouldn't modify options. Use local bool flag instead.
kernel.getOptions()->setOption(vISA_forceBCR, false);
}
return true;
}
return false;
}
|
|
|
|
//
|
|
// graph coloring entry point. returns nonzero if RA fails
|
|
//
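// The overall flow: address and flag RA (plus optional scalar RA), an
// optional global linear-scan or hybrid RA attempt, and finally the iterative
// graph-coloring loop that alternates coloring with spill insertion (and
// remat/splitting heuristics) until allocation succeeds or the iteration
// limit is hit.
//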
|
|
int GlobalRA::coloringRegAlloc() {
|
|
VISA_DEBUG_VERBOSE({
|
|
std::cout << "\n=== Register Allocation ===\n";
|
|
if (builder.getIsKernel() == false) {
|
|
std::cout << "Function: " << kernel.getName() << "\n";
|
|
} else {
|
|
std::cout << "Kernel: " << kernel.getName() << "\n";
|
|
}
|
|
|
|
detectNeverDefinedUses();
|
|
});
|
|
|
|
#ifndef DLL_MODE
|
|
// Points-to analysis is done in RegAlloc.cpp just before constructing
|
|
// GlobalRA instance.
|
|
if (stopAfter("p2a")) {
|
|
pointsToAnalysis.dump(std::cout);
|
|
return VISA_EARLY_EXIT;
|
|
}
|
|
#endif // DLL_MODE
|
|
|
|
bool hasStackCall =
|
|
kernel.fg.getHasStackCalls() || kernel.fg.getIsStackCallFunc();
|
|
|
|
fastRADecision();
|
|
|
|
bool hybridWithSpill = useHybridRAwithSpill &&
|
|
(!hasStackCall || builder.getOption(vISA_PartitionWithFastHybridRA));
|
|
useLocalRA = builder.getOption(vISA_LocalRA)
|
|
&& (kernel.fg.funcInfoTable.size() == 0
|
|
|| kernel.getInt32KernelAttr(Attributes::ATTR_Target) != VISA_3D
|
|
|| hybridWithSpill);
|
|
|
|
// this needs to be called before addr/flag RA since it changes their
|
|
// alignment as well
|
|
fixAlignment();
|
|
|
|
{
|
|
TIME_SCOPE(ADDR_FLAG_RA);
|
|
|
|
addVarToRA(kernel.Declares.back());
|
|
|
|
addrRegAlloc();
|
|
|
|
flagRegAlloc();
|
|
}
|
|
if (builder.getuint32Option(vISA_ScalarPipe)) {
|
|
selectScalarCandidates();
|
|
scalarRegAlloc();
|
|
}
|
|
// LSC messages are used when:
|
|
// a. Stack call is used on PVC+,
|
|
// b. Spill size exceeds what can be represented using hword msg on PVC+
|
|
// c. Xe2+ requires LSC stack (can force on DG2+ via -lscNonStackSpill)
|
|
if (builder.supportsLSC()) {
|
|
canUseLscImmediateOffsetSpillFill = LSCUsesImmOff(builder);
|
|
}
|
|
|
|
stackCallSaveRestore(hasStackCall);
|
|
|
|
if (kernel.getOption(vISA_SpillAnalysis)) {
|
|
spillAnalysis = std::make_unique<SpillAnalysis>();
|
|
}
|
|
|
|
if (kernel.fg.getIsStackCallFunc()) {
|
|
// Allocate space to store Frame Descriptor
|
|
nextSpillOffset += builder.numEltPerGRF<Type_UB>();
|
|
scratchOffset += builder.numEltPerGRF<Type_UB>();
|
|
|
|
if (kernel.getOption(vISA_storeCE)) {
|
|
nextSpillOffset += builder.numEltPerGRF<Type_UB>();
|
|
scratchOffset += builder.numEltPerGRF<Type_UB>();
|
|
}
|
|
}
|
|
|
|
// Global linear scan RA
|
|
if (builder.getOption(vISA_LinearScan) &&
|
|
builder.kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D) {
|
|
int success = doGlobalLinearScanRA();
|
|
if (success == VISA_SUCCESS)
|
|
return success;
|
|
else if (success == VISA_SPILL) {
|
|
return VISA_SPILL;
|
|
}
|
|
} else if (useLocalRA && !hasStackCall) {
|
|
if (tryHybridRA()) {
|
|
assignRegForAliasDcl();
|
|
return VISA_SUCCESS;
|
|
}
|
|
}
|
|
|
|
startTimer(TimerID::GRF_GLOBAL_RA);
|
|
unsigned maxRAIterations = builder.getuint32Option(vISA_MaxRAIterations);
|
|
unsigned iterationNo = 0;
|
|
|
|
int globalScratchOffset =
|
|
kernel.getInt32KernelAttr(Attributes::ATTR_SpillMemOffset);
|
|
bool useScratchMsgForSpill =
|
|
!hasStackCall &&
|
|
(globalScratchOffset < (int)(SCRATCH_MSG_LIMIT * 0.6)
|
|
// useScratchMsgForSpill is true for
|
|
// * scratch msg
|
|
// * LSC msg
|
|
// Spill insertion module decides whether to expand a fill/spill to
|
|
// scratch or LSC depending on spill offset. oword is supported for PVC
|
|
// but it is not emitted in favor of LSC.
|
|
|| builder.supportsLSC());
|
|
bool enableSpillSpaceCompression =
|
|
builder.getOption(vISA_SpillSpaceCompression);
|
|
|
|
uint32_t GRFSpillFillCount = 0;
|
|
if (builder.getFreqInfoManager().isFreqBasedSpillSelectionEnabled())
|
|
builder.getFreqInfoManager().initGRFSpillFillFreq();
|
|
|
|
unsigned fastCompileIter = 1;
|
|
bool fastCompile =
|
|
(useFastRA || useHybridRAwithSpill) &&
|
|
(!hasStackCall || builder.getOption(vISA_PartitionWithFastHybridRA));
|
|
|
|
if (fastCompile) {
|
|
fastCompileIter = 0;
|
|
}
|
|
|
|
if (kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_CM) {
|
|
maxRAIterations = 12;
|
|
}
|
|
|
|
unsigned failSafeRAIteration =
|
|
(builder.getOption(vISA_FastSpill) || fastCompile)
|
|
? fastCompileIter
|
|
: builder.getuint32Option(vISA_FailSafeRALimit);
|
|
|
|
if (failSafeRAIteration == 0) { // Fail safe RA directly in iteration 0, used
|
|
// for hybrid RA with spill
|
|
createVariablesForHybridRAWithSpill();
|
|
}
|
|
|
|
bool rematDone = false, alignedScalarSplitDone = false;
|
|
bool reserveSpillReg = false;
|
|
VarSplit splitPass(*this);
|
|
DynPerfModel perfModel(kernel);
|
|
FINALIZER_INFO *jitInfo = builder.getJitInfo();
|
|
|
|
incRABookKeeping();
|
|
while (iterationNo < maxRAIterations) {
|
|
jitInfo->statsVerbose.RAIterNum++;
|
|
if (builder.getOption(vISA_DynPerfModel)) {
|
|
perfModel.NumRAIters++;
|
|
}
|
|
RA_TRACE(std::cout << "--GRF RA iteration " << iterationNo << "--"
|
|
<< kernel.getName() << "\n");
|
|
setIterNo(iterationNo);
|
|
|
|
if (builder.getOption(vISA_clearScratchWritesBeforeEOT) &&
|
|
(globalScratchOffset + nextSpillOffset) > 0) {
|
|
// we need to set r0 be live out for this WA
|
|
builder.getBuiltinR0()->setLiveOut();
|
|
}
|
|
|
|
if (!useHybridRAwithSpill) {
|
|
resetGlobalRAStates();
|
|
// Identify the local variables to speed up the following analysis
|
|
markGraphBlockLocalVars();
|
|
}
|
|
|
|
if (kernel.getOption(vISA_SpillAnalysis)) {
|
|
spillAnalysis->Clear();
|
|
}
|
|
|
|
localSplit(fastCompile, splitPass);
|
|
|
|
const auto [doBankConflictReduction, highInternalConflict] = bankConflict();
|
|
|
|
reserveSpillReg = setupFailSafeIfNeeded(
|
|
fastCompile, hasStackCall, maxRAIterations, failSafeRAIteration);
|
|
|
|
LivenessAnalysis liveAnalysis(*this, G4_GRF | G4_INPUT);
|
|
liveAnalysis.computeLiveness();
|
|
|
|
#ifndef DLL_MODE
|
|
if (stopAfter("Global_RA_liveness")) {
|
|
return VISA_EARLY_EXIT;
|
|
}
|
|
#endif // DLL_MODE
|
|
if (builder.getOption(vISA_dumpLiveness)) {
|
|
liveAnalysis.dump();
|
|
}
|
|
if (jitInfo->statsVerbose.RAIterNum == 1) {
|
|
writeVerboseStatsNumVars(liveAnalysis, jitInfo);
|
|
RA_TRACE(std::cout << "\t--# global variable: "
|
|
<< jitInfo->statsVerbose.globalVarNum << "\n");
|
|
}
|
|
#ifdef DEBUG_VERBOSE_ON
|
|
emitFGWithLiveness(liveAnalysis);
|
|
#endif
|
|
// if no reg var needs to be reg allocated, then skip reg allocation
|
|
if (liveAnalysis.getNumSelectedVar() == 0)
|
|
break;
|
|
|
|
undefinedUses(rematDone, liveAnalysis);
|
|
|
|
// force spill should be done only for the 1st iteration
|
|
bool forceSpill =
|
|
iterationNo > 0 ? false : builder.getOption(vISA_ForceSpills);
|
|
RPE rpe(*this, &liveAnalysis);
|
|
if (!fastCompile) {
|
|
rpe.run();
|
|
writeVerboseRPEStats(rpe);
|
|
}
|
|
GraphColor coloring(liveAnalysis, false, forceSpill);
|
|
|
|
if (builder.getOption(vISA_dumpRPE) && iterationNo == 0 && !rematDone) {
|
|
coloring.dumpRPEToFile();
|
|
// dump pressure the first time we enter global RA
|
|
coloring.dumpRegisterPressure(std::cerr);
|
|
}
|
|
|
|
// Get the size of register which are reserved for spill
|
|
unsigned spillRegSize = 0;
|
|
unsigned indrSpillRegSize = 0;
|
|
|
|
if (reserveSpillReg) {
|
|
std::tie(spillRegSize, indrSpillRegSize) = reserveGRFSpillReg(coloring);
|
|
}
|
|
generateForbiddenTemplates(spillRegSize + indrSpillRegSize);
|
|
bool isColoringGood =
|
|
coloring.regAlloc(doBankConflictReduction, highInternalConflict, &rpe);
|
|
if (!isColoringGood) {
|
|
// When there are spills and -abortonspill is set, vISA will bump up the
|
|
// number of GRFs first and try to compile without spills under one of
|
|
// the following conditions:
|
|
// - Variable with inf spill cost, or
|
|
// - #GRFs selected and next larger one has same number of threads, or
|
|
// - Spill ratio is above threshold
|
|
// If none of the conditions is met, vISA will abort and return VISA_SPILL.
|
|
if (VRTIncreasedGRF(coloring))
|
|
continue;
|
|
|
|
bool rerunGRA1 = false, rerunGRA2 = false, rerunGRA3 = false,
|
|
isEarlyExit = false, abort = false, success = false;
|
|
std::tie(rerunGRA1, rematDone) = remat(fastCompile, rematDone, liveAnalysis, coloring, rpe);
|
|
std::tie(rerunGRA2, alignedScalarSplitDone, isEarlyExit) =
|
|
alignedScalarSplit(fastCompile, alignedScalarSplitDone, coloring);
|
|
#ifndef DLL_MODE
|
|
if (isEarlyExit) {
|
|
return VISA_EARLY_EXIT;
|
|
}
|
|
#endif // DLL_MODE
|
|
|
|
rerunGRA3 = globalSplit(splitPass, coloring);
|
|
|
|
if (rerunGRAIter(rerunGRA1 || rerunGRA2 || rerunGRA3))
|
|
continue;
|
|
|
|
splitOnSpill(fastCompile, coloring, liveAnalysis);
|
|
|
|
reserveSpillReg = convertToFailSafe(reserveSpillReg, coloring, liveAnalysis,
|
|
nextSpillOffset);
|
|
|
|
if (iterationNo == 0) {
|
|
// Dump out interference graph information of spill candidates
|
|
VISA_DEBUG_VERBOSE(reportSpillInfo(liveAnalysis, coloring));
|
|
}
|
|
|
|
std::tie(abort, GRFSpillFillCount) =
|
|
abortOnSpill(GRFSpillFillCount, coloring);
|
|
if (abort) {
|
|
// Early exit when -abortonspill is passed, instead of
|
|
// spending time inserting spill code and then aborting.
|
|
stopTimer(TimerID::GRF_GLOBAL_RA);
|
|
return VISA_SPILL;
|
|
}
|
|
|
|
std::tie(success, enableSpillSpaceCompression, isEarlyExit, scratchOffset,
|
|
nextSpillOffset) =
|
|
insertSpillCode(enableSpillSpaceCompression, coloring, liveAnalysis,
|
|
rpe, scratchOffset, fastCompile, hasStackCall,
|
|
globalScratchOffset, nextSpillOffset, reserveSpillReg,
|
|
spillRegSize, indrSpillRegSize,
|
|
useScratchMsgForSpill);
|
|
if (!success) {
|
|
iterationNo = maxRAIterations;
|
|
break;
|
|
}
|
|
#ifndef DLL_MODE
|
|
if (isEarlyExit)
|
|
return VISA_EARLY_EXIT;
|
|
#endif // DLL_MODE
|
|
|
|
++iterationNo;
|
|
|
|
if (iterationNo == builder.getuint32Option(vISA_FailSafeRALimit)) {
|
|
if (coloring.getSpilledLiveRanges().size() < 2) {
|
|
// give regular RA one more try as we are close to success
|
|
failSafeRAIteration++;
|
|
}
|
|
}
|
|
stopTimer(TimerID::SPILL);
|
|
}
|
|
// RA successfully allocates regs
|
|
if (isColoringGood == true || reserveSpillReg) {
|
|
coloring.confirmRegisterAssignments();
|
|
|
|
if (hasStackCall) {
|
|
// spill/fill intrinsics expect offset in HWord, so round up to 64
|
|
// bytes but maintain it in OWord units. TODO: we really need to change
// everything to bytes for everyone's sanity.
|
|
unsigned localSpillAreaOwordSize = ROUND(scratchOffset, 64) / 16;
|
|
coloring.getSaveRestoreRegister();
|
|
addSaveRestoreCode(localSpillAreaOwordSize);
|
|
}
|
|
|
|
if (kernel.getOption(vISA_DumpRegChart)) {
|
|
assignRegForAliasDcl();
|
|
// invoke before expanding spill/fill since
|
|
// it modifies IR
|
|
regChart->dumpRegChart(std::cerr, {}, 0);
|
|
}
|
|
|
|
if (builder.getOption(vISA_DynPerfModel)) {
|
|
perfModel.run();
|
|
}
|
|
expandSpillFillIntrinsics(nextSpillOffset);
|
|
|
|
VISA_DEBUG_VERBOSE(detectUndefinedUses(liveAnalysis, kernel));
|
|
|
|
if (nextSpillOffset) {
|
|
switch (kernel.getRAType()) {
|
|
case RA_Type::GRAPH_COLORING_RR_BC_RA:
|
|
kernel.setRAType(RA_Type::GRAPH_COLORING_SPILL_RR_BC_RA);
|
|
break;
|
|
case RA_Type::GRAPH_COLORING_FF_BC_RA:
|
|
kernel.setRAType(RA_Type::GRAPH_COLORING_SPILL_FF_BC_RA);
|
|
break;
|
|
case RA_Type::GRAPH_COLORING_RR_RA:
|
|
kernel.setRAType(RA_Type::GRAPH_COLORING_SPILL_RR_RA);
|
|
break;
|
|
case RA_Type::GRAPH_COLORING_FF_RA:
|
|
kernel.setRAType(RA_Type::GRAPH_COLORING_SPILL_FF_RA);
|
|
break;
|
|
default:
|
|
vISA_ASSERT_UNREACHABLE("invalid ra type");
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (verifyAugmentation) {
|
|
assignRegForAliasDcl();
|
|
verifyAugmentation->verify();
|
|
}
|
|
break; // done
|
|
}
|
|
}
|
|
assignRegForAliasDcl();
|
|
|
|
stopTimer(TimerID::GRF_GLOBAL_RA);
|
|
//
|
|
// Report failure to allocate due to excessive register pressure.
|
|
//
|
|
if (!reserveSpillReg && (iterationNo == maxRAIterations)) {
|
|
std::stringstream spilledVars;
|
|
for (auto dcl : kernel.Declares) {
|
|
if (dcl->isSpilled() && dcl->getRegFile() == G4_GRF) {
|
|
spilledVars << dcl->getName() << "\t";
|
|
}
|
|
}
|
|
|
|
vISA_ASSERT(false, "%d GRF registers are NOT enough to compile kernel %s \
|
|
The maximum register pressure in the kernel is higher than the available \
|
|
physical registers in hardware (even with spill code). Please consider \
|
|
rewriting the kernel. Compiling with the symbolic register option and \
|
|
inspecting the spilled registers may help in determining the region of high \
|
|
pressure. The spilling virtual registers are as follows %s.",
|
|
(kernel.getNumRegTotal() -
|
|
builder.getOptions()->getuInt32Option(vISA_ReservedGRFNum)),
|
|
kernel.getName(), spilledVars.str().c_str());
|
|
|
|
return VISA_SPILL;
|
|
}
|
|
|
|
// this includes vISA's scratch space use only and does not include whatever
|
|
// IGC may use for private memory
|
|
uint32_t spillMemUsed = builder.kernel.fg.frameSizeInOWord ?
|
|
(builder.kernel.fg.frameSizeInOWord * 16) : nextSpillOffset;
|
|
|
|
spillMemUsed = ROUND(spillMemUsed, kernel.numEltPerGRF<Type_UB>());
|
|
|
|
if (spillMemUsed &&
|
|
!(kernel.fg.getHasStackCalls() || kernel.fg.getIsStackCallFunc())) {
|
|
builder.criticalMsgStream()
|
|
<< "Spill memory used = " << spillMemUsed << " bytes for kernel "
|
|
<< kernel.getName()
|
|
<< "\n Compiling kernel with spill code may degrade performance."
|
|
<< " Please consider rewriting the kernel to use less registers.\n";
|
|
}
|
|
|
|
// update jit metadata information for spill
|
|
if (auto jitInfo = builder.getJitInfo()) {
|
|
// jitInfo->spillMemUsed is the entire visa stack size. Consider the
|
|
// caller/callee save size when caller/callee save areas are present
|
|
// globalScratchOffset in unit of byte, others in Oword
|
|
//
|
|
// FIXME: globalScratchOffset must be 0 when having stack call, or
|
|
// there is a problem at stack setup
|
|
// (see GlobalRA::addGenxMainStackSetupCode)
|
|
//
|
|
// vISA stack
|
|
// globalScratchOffset -> ---------------------
|
|
// | spill |
|
|
// calleeSaveAreaOffset -> ---------------------
|
|
// | callee save |
|
|
// callerSaveAreaOffset -> ---------------------
|
|
// | caller save |
|
|
// frameSizeInOWord -> ---------------------
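    //
    // Illustration with made-up numbers (not from any real frame): with
    // globalScratchOffset = 0, one GRF of spill (say 64 bytes) and no
    // caller/callee save, calleeSaveAreaOffset and callerSaveAreaOffset would
    // both be 64 bytes and frameSizeInOWord would be 64 / 16 = 4 owords.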

    jitInfo->hasStackcalls = kernel.fg.getHasStackCalls();

    // Each function reports its required stack size.
    // We will summarize the final stack size of the entire vISA module into
    // the main functions (ref: CISA_IR_Builder::summarizeFunctionInfo)
    jitInfo->stats.spillMemUsed = spillMemUsed;
    kernel.getGTPinData()->setScratchNextFree(spillMemUsed +
                                              globalScratchOffset);
    jitInfo->stats.numGRFSpillFillWeighted = GRFSpillFillCount;
  }

  if (builder.getOption(vISA_LocalDeclareSplitInGlobalRA)) {
    removeSplitDecl();
  }

  if (builder.getOption(vISA_DynPerfModel)) {
    perfModel.dump();
  }

  return VISA_SUCCESS;
}

// Insert declarations with pre-assigned registers in the kernel.
// This is needed for HRA, and the fake declares will be removed at the end of
// HRA.
void GlobalRA::insertPhyRegDecls() {
  int numGRF = kernel.getNumRegTotal();
  std::vector<bool> grfUsed(numGRF, false);
  GRFDclsForHRA.resize(numGRF);

  for (auto curBB : kernel.fg) {
    if (auto summary = getBBLRASummary(curBB)) {
      for (int i = 0; i < numGRF; i++) {
        if (summary->isGRFBusy(i)) {
          grfUsed[i] = true;
        }
      }
    }
  }

  // Insert declarations for each GRF that is used
  unsigned numGRFsUsed = 0;
  for (int i = 0; i < numGRF; i++) {
    if (grfUsed[i] == true) {
      const char *dclName = builder.getNameString(10, "r%d", i);
      G4_Declare *phyRegDcl =
          builder.createDeclare(dclName, G4_GRF, kernel.numEltPerGRF<Type_UD>(),
                                1, Type_D, Regular, NULL, NULL);
      G4_Greg *phyReg = builder.phyregpool.getGreg(i);
      phyRegDcl->getRegVar()->setPhyReg(phyReg, 0);
      GRFDclsForHRA[i] = phyRegDcl;
      addVarToRA(phyRegDcl);
      numGRFsUsed++;
    }
  }

  VISA_DEBUG(std::cout << "Local RA used " << numGRFsUsed << " GRFs\n");
}

void GraphColor::dumpRPEToFile() {
  // Dump RPE output to file if asmName is set
  auto *asmOutput = builder.getOptions()->getOptionCstr(VISA_AsmFileName);
  if (asmOutput) {
    std::string FN(asmOutput);
    FN += ".rpe";
    std::ofstream OF;
    OF.open(FN, std::ofstream::out);
    dumpRegisterPressure(OF);
    OF.close();
  }
}

void GraphColor::dumpRegisterPressure(std::ostream &OS) {
  RPE rpe(gra, &liveAnalysis);
  uint32_t max = 0;
  std::vector<G4_INST *> maxInst;
  rpe.run();

  for (auto bb : builder.kernel.fg) {
    OS << "BB " << bb->getId() << ": (Pred: ";
    for (auto pred : bb->Preds) {
      OS << pred->getId() << ",";
    }
    OS << " Succ: ";
    for (auto succ : bb->Succs) {
      OS << succ->getId() << ",";
    }
    OS << ")\n";
    for (auto instIt = bb->begin(); instIt != bb->end(); ++instIt) {
      auto *inst = *instIt;
      uint32_t pressure = rpe.getRegisterPressure(inst);
      if (pressure > max) {
        max = pressure;
        maxInst.clear();
        maxInst.push_back(inst);
      } else if (pressure == max) {
        maxInst.push_back(inst);
      }

      if (kernel.getOption(vISA_EmitSrcFileLineToRPE))
        bb->emitInstructionSourceLineMapping(OS, instIt);
      OS << "[" << pressure << "] ";
      inst->print(OS);
    }
  }
  OS << "max pressure: " << max << ", " << maxInst.size() << " inst(s)\n";
  for (auto inst : maxInst) {
    inst->print(OS);
  }
}

void GlobalRA::fixAlignment() {
  // Copy over alignment from G4_RegVar to the GlobalRA instance.
  // The rest of RA shouldn't have to read/modify alignment of G4_RegVar.
  copyAlignment();

  for (auto dcl : kernel.Declares) {
    if (dcl->getRegFile() & G4_FLAG) {
      if (dcl->getByteSize() > 2 ||
          (kernel.getSimdSize() == g4::SIMD32 &&
           kernel.getInt32KernelAttr(Attributes::ATTR_Target) != VISA_CM))
        setSubRegAlign(dcl, G4_SubReg_Align::Even_Word);
    }
  }

  if (builder.getPlatform() == GENX_BDW) {
    // BDW requires even_word alignment for scalar HF variables
    for (auto dcl : kernel.Declares) {
      if (dcl->getElemType() == Type_HF && dcl->getSubRegAlign() == Any) {
        setSubRegAlign(dcl, Even_Word);
      }
    }
  }

  // ToDo: remove these as they should be done by HWConformity
  for (auto BB : kernel.fg) {
    for (auto inst : *BB) {
      G4_DstRegRegion *dst = inst->getDst();
      if (dst && dst->getTopDcl()) {
        G4_RegVar *var = dst->getBase()->asRegVar();
        if (inst->isSend() && dst->getRegAccess() == Direct) {
          if (!var->isPhyRegAssigned()) {
            setSubRegAlign(dst->getTopDcl(), builder.getGRFAlign());
          }
        }

        if (!var->isPhyRegAssigned() && var->getDeclare()->getNumRows() <= 1 &&
            dst->getRegAccess() == Direct &&
            var->getDeclare()->getSubRegAlign() == Any) {
          if (inst->isAccSrcInst()) {
            setSubRegAlign(dst->getTopDcl(),
                           var->getDeclare()->getRegFile() != G4_ADDRESS
                               ? builder.getGRFAlign()
                               : Eight_Word);
          }
        }
      }
    }
  }
}

//
// DFS to check if there is any conflict in subroutine return location
//
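// A sketch of the traversal (illustrative, not tied to any specific kernel):
// usedLoc[0..stackTop-1] holds the return locations already claimed by the
// callers on the current call chain. When the DFS reaches another call whose
// subroutine was assigned one of those same locations, a nested call would
// overwrite a still-live return address, so the function reports a conflict.
//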
bool GlobalRA::isSubRetLocConflict(G4_BB *bb, std::vector<unsigned> &usedLoc,
                                   unsigned stackTop) {
  auto &fg = kernel.fg;
  if (bb->isAlreadyTraversed(fg.getTraversalNum()))
    return false;
  bb->markTraversed(fg.getTraversalNum());

  G4_INST *lastInst = bb->size() == 0 ? NULL : bb->back();
  if (lastInst && lastInst->isReturn()) {
    if (lastInst->getPredicate() == NULL)
      return false;
    else {
      return isSubRetLocConflict(bb->fallThroughBB(), usedLoc, stackTop);
    }
  } else if (lastInst && lastInst->isCall()) // need to traverse to next level
  {
    unsigned curSubRetLoc = getSubRetLoc(bb);
    //
    // check for a conflict first
    //
    for (unsigned i = 0; i < stackTop; i++)
      if (usedLoc[i] == curSubRetLoc)
        return true;
    //
    // then traverse all the subroutines and the return BB
    //
    usedLoc[stackTop] = curSubRetLoc;
    unsigned afterCallId = bb->BBAfterCall()->getId();

    // A call can have 1 or 2 successors.
    // If it has 1 then it is the sub-entry block; if it has 2
    // then the call has to be predicated. In case of predication,
    // the 1st successor is the physically following BB and the 2nd is the
    // sub-entry.
    if (lastInst->getPredicate()) {
      vISA_ASSERT(bb->Succs.size() == 2,
                  "Expecting 2 successor BBs for predicated call");
      if (isSubRetLocConflict(bb->Succs.back(), usedLoc, stackTop))
        return true;
    }

    if (bb->BBAfterCall()->getId() == afterCallId) {
      if (isSubRetLocConflict(bb->BBAfterCall(), usedLoc, stackTop))
        return true;
    }
  } else {
    for (G4_BB *succ : bb->Succs)
      if (isSubRetLocConflict(succ, usedLoc, stackTop))
        return true;
  }

  return false;
}

//
// The routine traverses all BBs that can be reached from the entry of a
// subroutine (not traversing into nested subroutine calls). Mark retLoc[bb] =
// entryId (to associate bb with the subroutine entry). When two subroutines
// share code, we return the location of the subroutine that was previously
// traversed so that the two routines can then use the same location to save
// their return addresses.
//
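// Illustrative scenario (hypothetical subroutines, for explanation only):
// suppose SubA is traversed first, so every BB reachable from it gets
// retLoc[bb] = A. If SubB is traversed later and reaches one of those shared
// BBs, the recursion below returns A instead of B; the caller then records
// retLoc[B] = A, i.e. B is linked to A and both subroutines end up saving
// their return address in the same location.
//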
unsigned GlobalRA::determineReturnAddrLoc(unsigned entryId,
                                          std::vector<unsigned> &retLoc,
                                          G4_BB *bb) {
  auto &fg = kernel.fg;
  if (bb->isAlreadyTraversed(fg.getTraversalNum()))
    return retLoc[bb->getId()];
  bb->markTraversed(fg.getTraversalNum());

  if (retLoc[bb->getId()] != UNDEFINED_VAL)
    return retLoc[bb->getId()];

  retLoc[bb->getId()] = entryId;
  G4_INST *lastInst = bb->size() == 0 ? NULL : bb->back();

  if (lastInst && lastInst->isReturn()) {
    if (!lastInst->getPredicate())
      return entryId;
    return determineReturnAddrLoc(entryId, retLoc, bb->fallThroughBB());
  } else if (lastInst && lastInst->isCall()) {
    // skip nested subroutine calls
    return determineReturnAddrLoc(entryId, retLoc, bb->BBAfterCall());
  }
  unsigned sharedId = entryId;
  for (G4_BB *succ : bb->Succs) {
    unsigned loc = determineReturnAddrLoc(entryId, retLoc, succ);
    if (loc != entryId) {
      while (retLoc[loc] != loc) // find the root of subroutine loc
        loc = retLoc[loc];       // follow the link to reach the root
      if (sharedId == entryId) {
        sharedId = loc;
      } else if (sharedId != loc) {
        //
        // The current subroutine shares code with two other subroutines; we
        // force all three of them to use the same location by linking them
        // together.
        //
        retLoc[loc] = sharedId;
      }
    }
  }
  return sharedId;
}

void GlobalRA::assignLocForReturnAddr() {
  auto &fg = kernel.fg;
  std::vector<unsigned> retLoc(fg.getNumBB(), UNDEFINED_VAL);
  // a data structure for doing a quick map[id] ---> block
  // FIXME: I have no idea why we need this vector, do we have to iterate the
  // blocks by their id for some reason?
  std::vector<G4_BB *> BBs(fg.getNumBB());
  for (G4_BB *bb : fg) {
    unsigned i = bb->getId();
    BBs[i] = bb; // BBs are sorted by ID
  }

  //
  // First, keep the original algorithm unchanged to mark the retLoc
  //
  std::vector<G4_BB *> caller; // just to accelerate the algorithm later

  for (unsigned i = 0, bbNum = fg.getNumBB(); i < bbNum; i++) {
    G4_BB *bb = BBs[i];
    if (bb->isEndWithCall() == false) {
      continue;
    }

#ifdef _DEBUG
    G4_INST *last = bb->empty() ? NULL : bb->back();
    vISA_ASSERT(last, ERROR_FLOWGRAPH);
#endif

    caller.push_back(
        bb); // record the callers, just to accelerate the algorithm

    G4_BB *subEntry = bb->getCalleeInfo()->getInitBB();
    if (retLoc[subEntry->getId()] !=
        UNDEFINED_VAL) // a loc has been assigned to the subroutine
    {
      // Need to setSubRetLoc if subEntry is part of another subroutine
      // because, in the final phase, we use SubRetLoc != UNDEFINED_VAL to
      // indicate a block is an entry of a subroutine.
      setSubRetLoc(subEntry, retLoc[subEntry->getId()]);
    } else {
      fg.prepareTraversal();
      unsigned loc =
          determineReturnAddrLoc(subEntry->getId(), retLoc, subEntry);
      if (loc != subEntry->getId()) {
        retLoc[subEntry->getId()] = loc;
      }
      setSubRetLoc(subEntry, loc);
      //
      // We do not merge indirect calls here, because it would create
      // additional (bb->getSubRetLoc() != bb->getId()) cases that kill the
      // shared-code detection
      //
    }

    // retBB is the exit basic block of the callee, i.e. the block with the
    // return statement at the end
    G4_BB *retBB = bb->getCalleeInfo()->getExitBB();

    if (retLoc[retBB->getId()] == UNDEFINED_VAL) {
      // retBB block was unreachable so the retLoc element corresponding to
      // that block was left undefined
      retLoc[retBB->getId()] = getSubRetLoc(subEntry);
    }
  }
  VISA_DEBUG_VERBOSE({
    std::cout << "\nBefore merge indirect call:\n";
    for (unsigned i = 0; i < fg.getNumBB(); i++)
      if (retLoc[i] == UNDEFINED_VAL) {
        std::cout << "BB" << i << ": X ";
      } else {
        std::cout << "BB" << i << ": " << retLoc[i] << " ";
      }
    std::cout << "\n";
  });

  //
  // This final phase is needed. Consider the following scenario. Sub2 shares
  // code with both Sub1 and Sub3. All three must use the same location to save
  // return addresses. If we traverse Sub1 then Sub3, retLoc[Sub1] and
  // retLoc[Sub3] each point to their own roots. As we traverse Sub2, code
  // sharing is detected, and we need this phase to make sure that Sub1 and
  // Sub3 use the same location.
  //
  for (unsigned i = 0, bbNum = fg.getNumBB(); i < bbNum; i++) {
    G4_BB *bb = BBs[i];
    if (getSubRetLoc(bb) != UNDEFINED_VAL) {
      if (getSubRetLoc(bb) != bb->getId()) {
        unsigned loc = bb->getId();
        while (retLoc[loc] != loc) // not root
          loc = retLoc[loc];       // follow the link to reach the root
      }
    }
  }

  //
  // Merge the retLoc in indirect call cases
  //
  for (G4_BB *bb : caller) {
    G4_INST *last = bb->empty() ? NULL : bb->back();
    vISA_ASSERT(last, ERROR_FLOWGRAPH);

    unsigned fallThroughId = bb->fallThroughBB() == NULL
                                 ? UNDEFINED_VAL
                                 : bb->fallThroughBB()->getId();
    if ((last && last->getPredicate() == NULL && bb->Succs.size() > 1) ||
        (last && last->getPredicate() != NULL && bb->Succs.size() > 2)) {
      //
      // merge all subroutines to the last one; it is a trick to conduct the
      // conditional call by using the last one instead of the first one
      //
      unsigned masterEntryId = bb->Succs.back()->getId();
      //
      // find the root of the master subroutine
      //
      unsigned masterRetLoc = masterEntryId;
      while (retLoc[masterRetLoc] != masterRetLoc)
        masterRetLoc = retLoc[masterRetLoc];
      //
      // check other subroutines in one vertex
      //
      for (G4_BB *subBB : bb->Succs) {
        if (subBB->getId() != masterEntryId &&
            subBB->getId() != fallThroughId) {
          //
          // find the root of the current subroutine
          //
          unsigned loc = subBB->getId();
          while (retLoc[loc] != loc)
            loc = retLoc[loc];
          //
          // Merge: let all the items in retLoc with value loc point to
          // masterRetLoc. Suppose indirect call X calls subroutines A and B,
          // indirect call Y calls B and C, and indirect call Z calls C and D.
          // Before the merge, A~D will be assigned different return locations.
          // Suppose we process the callers in order X-->Z-->Y in the merge; if
          // we just modified the return locations of one indirect call, we
          // would fail to merge the return locations of A~D.
          //
          if (loc != masterRetLoc) {
            for (unsigned i = 0; i < fg.getNumBB(); i++)
              if (retLoc[i] == loc)
                retLoc[i] = masterRetLoc;
          }
        }
      }
    }
  }

  VISA_DEBUG_VERBOSE({
    std::cout << "\nAfter merge indirect call:\n";
    for (unsigned i = 0; i < fg.getNumBB(); i++)
      if (retLoc[i] == UNDEFINED_VAL) {
        std::cout << "BB" << i << ": X ";
      } else {
        std::cout << "BB" << i << ": " << retLoc[i] << " ";
      }
    std::cout << "\n";
  });

  //
  // Assign the ret loc for subroutines first, and then check if it is wrong
  // (due to a cycle in the call graph).
  //
  for (unsigned i = 0, bbNum = fg.getNumBB(); i < bbNum; i++) {
    //
    // reset the return BB's retLoc
    //
    unsigned loc = i;
    if (retLoc[i] != UNDEFINED_VAL) {
      while (retLoc[loc] != loc)
        loc = retLoc[loc];
      retLoc[i] = loc;
      setSubRetLoc(BBs[i], retLoc[loc]);
    }
  }

  for (G4_BB *bb : caller) {
    //
    // set caller BB's retLoc
    //
#ifdef _DEBUG
    G4_INST *last = bb->empty() ? NULL : bb->back();
    vISA_ASSERT(last, ERROR_FLOWGRAPH);
#endif
    G4_BB *subBB = bb->getCalleeInfo()->getInitBB();
    //
    // 1: Must use retLoc here, because some subBB is also the caller of
    // another subroutine, so the entry loc in BB may be changed in this step.
    // 2: In some cases, the caller BB is also the entry BB. In that case, the
    // associated entry BB ID will be overwritten. However, it will not impact
    // the conflict detection and return location assignment, since we only
    // check the return BB and/or caller BB in these two modules.
    //
    setSubRetLoc(bb, retLoc[subBB->getId()]);
  }

  VISA_DEBUG_VERBOSE({
    for (unsigned i = 0; i < fg.getNumBB(); i++) {
      G4_BB *bb = BBs[i];
      if (getSubRetLoc(bb) != UNDEFINED_VAL) {
        if (!bb->empty() && bb->front()->isLabel()) {
          std::cout << ((G4_Label *)bb->front()->getSrc(0))->getLabel()
                    << " assigned location " << getSubRetLoc(bb) << "\n";
        }
      }
    }
  });

  //
  // detect the conflict (cycle) at last
  //
  std::vector<unsigned> usedLoc(fg.getNumBB());
  unsigned stackTop = 0;
  for (G4_BB *bb : caller) {
    //
    // Must re-start the traversal from each caller, otherwise we will lose
    // some cycle cases like TestRA_Call_1_1_3B, D, F, G, H
    //
    fg.prepareTraversal();

    usedLoc[stackTop] = getSubRetLoc(bb);

    G4_BB *subEntry = bb->Succs.back();

    if (isSubRetLocConflict(subEntry, usedLoc, stackTop + 1)) {
      vISA_ASSERT(false, "ERROR: Fail to assign call-return variables due to "
                         "cycle in call graph!");
    }
  }

  insertCallReturnVar();
}

void GlobalRA::insertCallReturnVar() {
  for (auto bb : kernel.fg) {
    G4_INST *last = bb->empty() ? NULL : bb->back();
    if (last) {
      if (last->isCall()) {
        insertSaveAddr(bb);
      } else {
        if (last->isReturn()) {
          // G4_BB_EXIT_TYPE is just a dummy BB, and the return will be the
          // last inst in each of its predecessors
          insertRestoreAddr(bb);
        }
      }
    }
  }
}
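
// Note on the two helpers below: insertSaveAddr() gives a call that has no
// destination the return-address declare from getRetDecl() as a two-dword
// destination (see the RET__loc12<1>:ud example in the code), and
// insertRestoreAddr() makes the matching return read the same two dwords back
// through a <0;2,1>:ud region. The SIMD2 exec size on both sides keeps the
// saved and restored payloads the same width; this is a summary of the code
// below rather than a statement of the hardware contract.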

void GlobalRA::insertSaveAddr(G4_BB *bb) {
  vISA_ASSERT(bb != NULL, ERROR_INTERNAL_ARGUMENT);
  vISA_ASSERT(getSubRetLoc(bb) != UNDEFINED_VAL,
              ERROR_FLOWGRAPH); // must have an assigned loc

  G4_INST *last = bb->back();
  vASSERT(last->isCall());
  if (last->getDst() == NULL) {
    unsigned loc = getSubRetLoc(bb);
    G4_Declare *dcl = getRetDecl(loc);

    last->setDest(builder.createDst(dcl->getRegVar(), 0, 0, 1,
                                    Type_UD)); // RET__loc12<1>:ud

    last->setExecSize(g4::SIMD2);
  }
}

void GlobalRA::insertRestoreAddr(G4_BB *bb) {
  vISA_ASSERT(bb != NULL, ERROR_INTERNAL_ARGUMENT);

  G4_INST *last = bb->back();
  vASSERT(last->isReturn());
  if (last->getSrc(0) == NULL) {
    unsigned loc = getSubRetLoc(bb);
    G4_Declare *dcl = getRetDecl(loc);

    G4_SrcRegRegion *new_src = builder.createSrc(
        dcl->getRegVar(), 0, 0, builder.createRegionDesc(0, 2, 1), Type_UD);

    last->setSrc(new_src, 0);
    last->setDest(builder.createNullDst(Type_UD));

    last->setExecSize(g4::SIMD2);
  }
}

// This function returns the weight of interference edge lr1--lr2,
// which is used for computing the degree of lr1.
//
// When there is no alignment restriction, we should use the normal weight,
// which is lr1_nreg + lr2_nreg - 1.
//
// Otherwise, we need to take into account additional space that may be
// required because of the alignment restriction. For example,
// if lr1 has even alignment and lr2 has no alignment restriction,
// we need to consider the following cases that would require the
// maximal available GRF space for successful allocation:
// 1) lr1's size is odd, lr2's size is odd and lr2's start position is even,
//    the total space required would be (lr1_nreg + lr2_nreg + 1)
// 2) lr1's size is odd, lr2's size is even and lr2's start position is even,
//    the total space required would be (lr1_nreg + lr2_nreg)
// 3) lr1's size is even, lr2's size is odd and lr2's start position is odd,
//    the total space required would be (lr1_nreg + lr2_nreg)
// 4) lr1's size is even, lr2's size is even and lr2's start position is odd,
//    the total space required would be (lr1_nreg + lr2_nreg + 1)
// The above logic can be simplified to the following formula:
//    lr1_nreg + lr2_nreg + 1 - ((lr1_nreg + lr2_nreg) % 2)
//
// If both lr1 and lr2 have an even alignment restriction,
// we need to consider the following cases that would require the
// maximal available GRF space for successful allocation:
// 1) lr1's size is odd, lr2's size is odd and lr2's start position is even,
//    the total space required would be (lr1_nreg + lr2_nreg + 1)
// 2) lr1's size is odd, lr2's size is even and lr2's start position is even,
//    the total space required would be (lr1_nreg + lr2_nreg)
// 3) lr1's size is even, lr2's size is odd and lr2's start position is even,
//    the total space required would be (lr1_nreg + lr2_nreg)
// 4) lr1's size is even, lr2's size is even and lr2's start position is even,
//    the total space required would be (lr1_nreg + lr2_nreg - 1)
// The above logic can be simplified to the following formula:
//    lr1_nreg + lr2_nreg - 1 + (lr1_nreg % 2) + (lr2_nreg % 2)
//
// Note: Edge weight between 2 nodes is asymmetric and depends on the ordering
// of nodes. Swapping lr1, lr2 and invoking edgeWeightGRF() may return a
// different result. So using the correct order of lr1, lr2 during edge
// weight computation and later during simplification is necessary for
// correctness.
//
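// Worked example for the formulas above (numbers chosen only for
// illustration): with lr1_nreg = 3 (even-aligned) and lr2_nreg = 3
// (unrestricted), the weight is 3 + 3 + 1 - ((3 + 3) % 2) = 7, matching
// case 1 of the first list (odd/odd, even start). If both ranges are
// even-aligned with lr1_nreg = 3 and lr2_nreg = 2, the weight is
// 3 + 2 - 1 + 1 + 0 = 5, matching case 2 of the second list.
//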
template <bool Support4GRFAlign>
unsigned GraphColor::edgeWeightGRF(const LiveRange *lr1, const LiveRange *lr2) {
  unsigned lr1_nreg = lr1->getNumRegNeeded();
  unsigned lr2_nreg = lr2->getNumRegNeeded();

  if constexpr (Support4GRFAlign) {
    auto lr1Align = gra.getAugAlign(lr1->getDcl());
    auto lr2Align = gra.getAugAlign(lr2->getDcl());

    return edgeWeightWith4GRF(lr1Align, lr2Align, lr1_nreg, lr2_nreg);
  } else {
    bool lr1EvenAlign = gra.isEvenAligned<false>(lr1->getDcl());
    bool lr2EvenAlign = gra.isEvenAligned<false>(lr2->getDcl());

    return edgeWeightGRF(lr1EvenAlign, lr2EvenAlign, lr1_nreg, lr2_nreg);
  }
}
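
// Illustration for the address-register cases below (sizes picked only to
// show the arithmetic, not taken from any real kernel): a Four_Word-aligned
// lr1 with lr1_nreg = 2 against an unrestricted lr2 with lr2_nreg = 3 yields
// 2 + 3 + 3 - ((2 + 3) % 4) = 7; an Eight_Word-aligned lr1 with lr1_nreg = 3
// against a Four_Word-aligned lr2 with lr2_nreg = 2 takes the first branch of
// the "8 vs 4" case (padding (8 - 3 % 8) % 8 = 5 >= 4) and yields
// 3 + 2 - 1 + 5 - 4 = 5.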

unsigned GraphColor::edgeWeightARF(const LiveRange *lr1, const LiveRange *lr2) {
  if (lr1->getRegKind() == G4_FLAG) {
    G4_SubReg_Align lr1_align = gra.getSubRegAlign(lr1->getVar()->getDeclare());
    G4_SubReg_Align lr2_align = gra.getSubRegAlign(lr2->getVar()->getDeclare());
    unsigned lr1_nreg = lr1->getNumRegNeeded();
    unsigned lr2_nreg = lr2->getNumRegNeeded();

    if (lr1_align == Any) {
      return lr1_nreg + lr2_nreg - 1;
    } else if (lr1_align == Even_Word && lr2_align == Any) {
      return lr1_nreg + lr2_nreg + 1 - ((lr1_nreg + lr2_nreg) % 2);
    } else if (lr1_align == Even_Word && lr2_align == Even_Word) {
      if (lr1_nreg % 2 == 0 && lr2_nreg % 2 == 0) {
        return lr1_nreg + lr2_nreg - 2;
      } else {
        return lr1_nreg + lr2_nreg - 1 + (lr1_nreg % 2) + (lr2_nreg % 2);
      }
    } else {
      vISA_ASSERT_UNREACHABLE(
          "Found unsupported subRegAlignment in flag register allocation!");
      return 0;
    }
  } else if (lr1->getRegKind() == G4_ADDRESS) {
    G4_SubReg_Align lr1_align = gra.getSubRegAlign(lr1->getVar()->getDeclare());
    G4_SubReg_Align lr2_align = gra.getSubRegAlign(lr2->getVar()->getDeclare());
    unsigned lr1_nreg = lr1->getNumRegNeeded();
    unsigned lr2_nreg = lr2->getNumRegNeeded();

    if (lr1_align < lr2_align) {
      G4_SubReg_Align tmp_align = lr1_align;
      unsigned tmp_nreg = lr1_nreg;
      lr1_align = lr2_align;
      lr2_align = tmp_align;
      lr1_nreg = lr2_nreg;
      lr2_nreg = tmp_nreg;
    }

    if (lr1_align == Any) {
      // Any vs Any
      return lr1_nreg + lr2_nreg - 1;
    } else if (lr1_align == Four_Word && lr2_align == Any) {
      // 4 vs Any
      return lr1_nreg + lr2_nreg + 3 - (lr1_nreg + lr2_nreg) % 4;
    } else if (lr1_align == Four_Word && lr2_align == Four_Word) {
      // 4 vs 4
      return lr1_nreg + lr2_nreg - 1 + (4 - lr1_nreg % 4) % 4 +
             (4 - lr2_nreg % 4) % 4;
    } else if (lr1_align == Eight_Word && lr2_align == Any) {
      // 8 vs Any
      return lr1_nreg + lr2_nreg + 7 - (lr1_nreg + lr2_nreg) % 8;
    } else if (lr1_align == Eight_Word && lr2_align == Four_Word) {
      // 8 vs 4
      if (((8 - lr1_nreg % 8) % 8) >= 4)
        return lr1_nreg + lr2_nreg - 1 + (8 - lr1_nreg % 8) % 8 - 4;
      return lr1_nreg + lr2_nreg - 1 + (8 - lr1_nreg % 8) % 8 +
             (4 - lr2_nreg % 4) % 4;
    } else if (lr1_align == Eight_Word && lr2_align == Eight_Word) {
      // 8 vs 8
      return lr1_nreg + lr2_nreg - 1 + (8 - lr1_nreg % 8) % 8 +
             (8 - lr2_nreg % 8) % 8;
    } else if (lr1_align == Sixteen_Word && lr2_align == Any) {
      // 16 vs Any
      return lr1_nreg + lr2_nreg + 15 - (lr1_nreg + lr2_nreg) % 16;
    } else if (lr1_align == Sixteen_Word && lr2_align == Four_Word) {
      // 16 vs 4
      if (((16 - lr1_nreg % 16) % 16) >= 4)
        return lr1_nreg + lr2_nreg - 1 + (16 - lr1_nreg % 16) % 16 - 4;
      return lr1_nreg + lr2_nreg - 1 + (16 - lr1_nreg % 16) % 16 +
             (4 - lr2_nreg % 4) % 4;
    } else if (lr1_align == Sixteen_Word && lr2_align == Eight_Word) {
      // 16 vs 8
      if (((16 - lr1_nreg % 16) % 16) >= 8)
        return lr1_nreg + lr2_nreg - 1 + (16 - lr1_nreg % 16) % 16 - 8;
      return lr1_nreg + lr2_nreg - 1 + (16 - lr1_nreg % 16) % 16 +
             (8 - lr2_nreg % 8) % 8;
    } else if (lr1_align == Sixteen_Word && lr2_align == Sixteen_Word) {
      // 16 vs 16
      return lr1_nreg + lr2_nreg - 1 + (16 - lr1_nreg % 16) % 16 +
             (16 - lr2_nreg % 16) % 16;
    } else {
      vISA_ASSERT_UNREACHABLE(
          "Found unsupported subRegAlignment in address register allocation!");
      return 0;
    }
  }
  else if (lr1->getRegKind() == G4_SCALAR) {
    return edgeWeightGRF<false>(lr1, lr2); // treat scalar just like GRF
  }
  vISA_ASSERT_UNREACHABLE(
      "Found unsupported ARF reg type in register allocation!");
  return 0;
}

void GlobalRA::fixSrc0IndirFcall() {
  // Indirect calls look like:
  //   mov (1|NM) V10 0x123456:ud
  //   fcall (1) dst V10 <-- V10, which is src0, contains the %ip to jump to
  //
  // In this function, we want to set V10 to r125.0, which is the same as the
  // dst of the fcall as per the ABI. This way, when inserting save/restore
  // code around the fcall, no special checks are needed to handle V10.
  //
  // But this works only if V10 is a local. If it is not a local, we create a
  // mov that copies V10 into a new temp variable, and then we map this temp
  // variable to r125.0. Hopefully V10 being global is a rare occurrence.
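  //
  // Note on the copy path below (illustrative reading of the code, not an ABI
  // statement): the temporary is hardwired to the GRF returned by
  // kernel.stackCall.getFPSPGRF() at the Ret_IP subregister; since Ret_IP is a
  // dword-based subregister index, it is rescaled by
  // TypeSize(Type_UD) / src0TypeSize, e.g. for a :ud src0 (4 bytes) the scale
  // is 4 / 4 = 1 and the offset is simply Ret_IP.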
  for (auto bb : kernel.fg) {
    if (bb->isEndWithFCall()) {
      auto fcall = bb->back()->asCFInst();
      if (!fcall->getSrc(0) || !fcall->getSrc(0)->isSrcRegRegion())
        continue;

      auto src0Rgn = fcall->getSrc(0)->asSrcRegRegion();
      auto src0TypeSize = src0Rgn->getTypeSize();
      auto src0Dcl = src0Rgn->getBase()->asRegVar()->getDeclare();
      auto src0TopDcl = src0Rgn->getTopDcl();

      if (src0Dcl != src0TopDcl || !isBlockLocal(src0TopDcl) ||
          src0TopDcl->getNumElems() > 1) {
        // create a copy
        auto tmpDcl = kernel.fg.builder->createHardwiredDeclare(
            1, src0Rgn->getType(), kernel.stackCall.getFPSPGRF(),
            kernel.stackCall.subRegs.Ret_IP * TypeSize(Type_UD) / src0TypeSize);
        auto dst = kernel.fg.builder->createDst(tmpDcl->getRegVar(),
                                                src0Rgn->getType());
        auto src = kernel.fg.builder->duplicateOperand(src0Rgn);
        auto copy = kernel.fg.builder->createMov(g4::SIMD1, dst, src,
                                                 InstOpt_WriteEnable, false);
        auto iter = std::find_if(bb->begin(), bb->end(),
                                 [](G4_INST *inst) { return inst->isFCall(); });
        bb->insertBefore(iter, copy);
        auto newSrc = kernel.fg.builder->createSrc(
            tmpDcl->getRegVar(), 0, 0, kernel.fg.builder->getRegionScalar(),
            Type_UD);
        fcall->setSrc(newSrc, 0);
      } else {
        auto fcallDstTypeSize = fcall->getDst()->getTypeSize();
        vISA_ASSERT(fcallDstTypeSize == 4, "expecting DW type dst");
        src0TopDcl->getRegVar()->setPhyReg(
            fcall->getDst()->getBase()->asRegVar()->getPhyReg(),
            fcall->getDst()->getBase()->asRegVar()->getPhyRegOff() *
                fcallDstTypeSize / src0TypeSize);
      }
    }
  }
}
|