Files
intel-graphics-compiler/visa/G4_Kernel.cpp
Joel Fuentes aafca7ed1b Improve spill threshold handling
Improve spill threshold handling in units of GRFs calculated from
byte input.
2025-08-12 23:08:27 +02:00

2267 lines
70 KiB
C++

/*========================== begin_copyright_notice ============================
Copyright (C) 2021 Intel Corporation
SPDX-License-Identifier: MIT
============================= end_copyright_notice ===========================*/
#include "G4_Kernel.hpp"
#include "BinaryEncodingIGA.h"
#include "BuildIR.h"
#include "Common_ISA_framework.h"
#include "DebugInfo.h"
#include "G4_BB.hpp"
#include "KernelCost.hpp"
#include "VISAKernel.h"
#include "VarSplit.h"
#include "iga/IGALibrary/Models/Models.hpp"
#include "iga/IGALibrary/api/kv.hpp"
#include "visa_wa.h"
#include <fstream>
#include <functional>
#include <iomanip>
#include <list>
#include <utility>
using namespace vISA;
/// Builds the malloc'ed buffer that reports the bytes listed in
/// globalFreeRegs to gtpin.
///
/// Adjacent entries of globalFreeRegs that form a consecutive byte run are
/// coalesced into a single {startByte, numConsecutiveBytes} item.
///
/// Buffer layout (fields are written back to back):
///   unsigned int magicStart = 0xDEADD00D   (requested by the gtpin team)
///   unsigned int numItems
///   freeBytes    data[numItems]
///   unsigned int magicEnd   = 0xDEADBEEF   (requested by the gtpin team)
///
/// \param size [out] total size of the returned buffer in bytes; set to 0 when
///             the allocation fails so callers never pair a null buffer with
///             a stale size.
/// \return the buffer, to be released with free(), or nullptr if malloc fails.
void *gtPinData::getFreeGRFInfo(unsigned &size) {
  struct freeBytes {
    unsigned short startByte;
    unsigned short numConsecutiveBytes;
  };
  struct freeGRFInfo {
    unsigned int magicStart;
    unsigned int numItems;
  };
  // The vector of pairs below is copied verbatim into the freeBytes array.
  static_assert(sizeof(freeBytes) ==
                    sizeof(std::pair<unsigned short, unsigned short>),
                "freeBytes must match the layout of the staging pair");

  // Compute free register information using a vector for efficiency,
  // then convert to POD for passing back to gtpin.
  std::vector<std::pair<unsigned short, unsigned short>> vecFreeBytes;
  for (auto byte : globalFreeRegs) {
    if (!vecFreeBytes.empty()) {
      auto &lastFree = vecFreeBytes.back();
      if (byte == (lastFree.first + lastFree.second)) {
        // Extends the current run by one byte.
        lastFree.second += 1;
        continue;
      }
    }
    // Starts a new run.
    vecFreeBytes.push_back(std::make_pair(byte, 1));
  }

  // Now convert the vector to POD.
  const unsigned int numItems = (unsigned int)vecFreeBytes.size();
  // magicStart + numItems
  const size_t headerBytes = sizeof(unsigned int) + sizeof(unsigned int);
  const size_t dataBytes = numItems * sizeof(freeBytes);
  // header + data + magicEnd
  const size_t totalBytes = headerBytes + dataBytes + sizeof(unsigned int);

  freeGRFInfo *buffer = (freeGRFInfo *)malloc(totalBytes);
  if (!buffer) {
    size = 0;
    return nullptr;
  }

  buffer->magicStart = 0xDEADD00D;
  buffer->numItems = numItems;

  unsigned char *payload = (unsigned char *)buffer + headerBytes;
  memcpy_s(payload, dataBytes, vecFreeBytes.data(), dataBytes);

  const unsigned int magicEnd = 0xDEADBEEF;
  memcpy_s(payload + dataBytes, sizeof(magicEnd), &magicEnd, sizeof(magicEnd));

  size = (unsigned)totalBytes;
  return buffer;
}
void gtPinData::setGTPinInit(void *buffer) {
vISA_ASSERT(sizeof(gtpin::igc::igc_init_t) <= 200,
"Check size of igc_init_t");
gtpin_init = (gtpin::igc::igc_init_t *)buffer;
// reRA pass is no longer supported.
// FIXME: should we assert here?
// if (gtpin_init->re_ra)
if (gtpin_init->grf_info)
kernel.getOptions()->setOption(vISA_GetFreeGRFInfo, true);
}
/// Copies the object representation of \p data into \p buffer at byte
/// position \p offset, then advances \p offset by sizeof(T).
template <class T>
void write(void *buffer, const T &data, unsigned int &offset) {
  char *const dest = static_cast<char *>(buffer) + offset;
  memcpy_s(dest, sizeof(T), &data, sizeof(T));
  offset += sizeof(T);
}
void *gtPinData::getIndirRefs(unsigned int &size) {
// Store indirect access per %ip
// %ip -> vector[start byte, size]
std::map<unsigned int, std::vector<std::pair<unsigned int, unsigned int>>>
indirRefMap;
// return %ip of first executable instruction in kernel
auto getIpOfFirstInst = [&]() {
unsigned int startIp = 0;
if (kernel.fg.getIsStackCallFunc()) {
for (auto bb : kernel.fg.getBBList()) {
if (startIp > 0)
break;
for (auto inst : bb->getInstList()) {
startIp = (unsigned int)inst->getGenOffset();
if (inst->isLabel())
continue;
// verify truncation is still legal
vISA_ASSERT(inst->getGenOffset() == (uint32_t)inst->getGenOffset(),
"%ip out of bounds");
if (startIp > 0)
break;
}
}
}
return startIp;
};
unsigned int startIp = getIpOfFirstInst();
auto getIndirRefData = [&](G4_Declare *addr) {
// for given addr, return std::vector<std::pair<start byte, size>>
std::vector<std::pair<unsigned int, unsigned int>> indirs;
auto it = indirRefs.find(addr);
if (it == indirRefs.end())
return indirs;
for (auto target : (*it).second) {
if (target->isSpilled())
continue;
auto start = target->getGRFOffsetFromR0();
auto size = target->getByteSize();
indirs.push_back(std::make_pair(start, size));
}
return indirs;
};
for (auto bb : kernel.fg.getBBList()) {
// Kernel's CFG may be stitched together
// with that of its callees. We want to
// iterate over only those BBs that belong
// to current CFG.
if (&bb->getParent() != &kernel.fg)
break;
for (auto inst : bb->getInstList()) {
auto dst = inst->getDst();
if (dst && dst->isIndirect()) {
// encode dst indirect reference
auto indirs = getIndirRefData(dst->getTopDcl());
auto &mapEntry = indirRefMap[(uint32_t)inst->getGenOffset() - startIp];
mapEntry.insert(mapEntry.end(), indirs.begin(), indirs.end());
}
for (unsigned int i = 0; i != inst->getNumSrc(); ++i) {
auto src = inst->getSrc(i);
if (src && src->isSrcRegRegion() &&
src->asSrcRegRegion()->isIndirect()) {
// encode src indirect reference
auto indirs = getIndirRefData(src->asSrcRegRegion()->getTopDcl());
auto &mapEntry =
indirRefMap[(uint32_t)inst->getGenOffset() - startIp];
mapEntry.insert(mapEntry.end(), indirs.begin(), indirs.end());
}
}
}
}
unsigned int numRanges = 0;
for (auto &item : indirRefMap) {
numRanges += item.second.size();
}
// see gtpin_IGC_interface.h for format of igc_token_indirect_access_info_t
size = sizeof(gtpin::igc::igc_token_indirect_access_info_t::num_ranges) +
numRanges * sizeof(gtpin::igc::ins_reg_range_t);
auto buffer = malloc(size);
unsigned int offset = 0;
write<uint32_t>(buffer, numRanges, offset);
for (auto &item : indirRefMap) {
for (const auto &arg : item.second) {
vISA_ASSERT(offset < size, "Out of bounds");
write<uint32_t>(buffer, item.first, offset);
vISA_ASSERT(offset < size, "Out of bounds");
write<uint16_t>(buffer, arg.first, offset);
vISA_ASSERT(offset < size, "Out of bounds");
write<uint16_t>(buffer, arg.second, offset);
}
}
vISA_ASSERT(offset == size, "Unexpected bounds");
return buffer;
}
/// Appends the first \p numBytes raw bytes of the object(s) at \p t to
/// \p buffer and adds \p numBytes to \p bufferSize.
///
/// \param buffer     destination byte vector (appended to).
/// \param bufferSize running byte count, incremented by \p numBytes.
/// \param t          source bytes; not dereferenced when \p numBytes is 0.
/// \param numBytes   number of bytes to append.
template <typename T>
static void writeBuffer(std::vector<unsigned char> &buffer,
                        unsigned &bufferSize, const T *t, unsigned numBytes) {
  if (numBytes != 0) {
    const unsigned char *data =
        static_cast<const unsigned char *>(static_cast<const void *>(t));
    // One range insert instead of a push_back per byte.
    buffer.insert(buffer.end(), data, data + numBytes);
  }
  bufferSize += numBytes;
}
// Builds the info buffer handed back to gtpin for this kernel.
//
// The buffer starts with an igc_init_t describing which features are
// reported, followed by a uint32_t token count and then the tokens
// themselves, in this order:
//   - GTPIN_IGC_TOKEN_GRF_INFO             (grf_info set, no stack ABI)
//   - GTPIN_IGC_TOKEN_INDIRECT_ACCESS_INFO (grf_info set)
//   - GTPIN_IGC_TOKEN_SCRATCH_AREA_INFO    (scratch_area_size != 0)
//   - GTPIN_IGC_TOKEN_KERNEL_START_INFO    (always)
//   - GTPIN_IGC_TOKEN_NUM_GRF_REGS         (always)
//
// bufferSize    [out] number of bytes in the returned buffer; 0 when gtpin
//               was initialized neither through setGTPinInit nor from L0.
// scratchOffset offset recorded in the scratch-area token; asserted to be at
//               least nextScratchFree.
// Returns the buffer obtained from allocCodeBlock(), or nullptr when there is
// nothing to report. When vISA_outputToFile is set and an asm file name is
// available, the init struct and the info buffer are also dumped to
// <asmName>.gtpin_igc_init and <asmName>.gtpin_igc_info.
void *gtPinData::getGTPinInfoBuffer(unsigned &bufferSize,
unsigned int scratchOffset) {
if (!gtpin_init && !gtpinInitFromL0) {
bufferSize = 0;
return nullptr;
}
gtpin::igc::igc_init_t t;
std::vector<unsigned char> buffer;
// numTokens is counted up front and must match the tokens emitted below.
unsigned numTokens = 0;
auto stackABI =
kernel.fg.getIsStackCallFunc() || kernel.fg.getHasStackCalls();
bufferSize = 0;
memset(&t, 0, sizeof(t));
t.version = gtpin::igc::GTPIN_IGC_INTERFACE_VERSION;
t.igc_init_size = sizeof(t);
if (gtpinInitFromL0) {
// Requests come from vISA options.
if (!stackABI) {
if (kernel.getOption(vISA_GetFreeGRFInfo)) {
t.grf_info = 1;
// free GRF info
numTokens++;
// indirect info
numTokens++;
}
if (kernel.getOption(vISA_GTPinReRA)) {
t.re_ra = 1;
}
} else {
// provide only indirect info for stack calls
if (kernel.getOption(vISA_GetFreeGRFInfo)) {
t.grf_info = 1;
numTokens++;
}
}
if (kernel.getOptions()->getOption(vISA_GenerateDebugInfo))
t.srcline_mapping = 1;
if (kernel.getOptions()->getuInt32Option(vISA_GTPinScratchAreaSize) > 0) {
t.scratch_area_size = getNumBytesScratchUse();
numTokens++;
}
if (!t.grf_info && kernel.getOptions()->getOption(vISA_GetFreeGRFInfo)) {
// this check is to report out indir references, irrespective of
// whether stack call is present.
// NOTE(review): both branches above already set grf_info when
// vISA_GetFreeGRFInfo is on, so this looks unreachable - confirm.
t.grf_info = 1;
numTokens++;
}
} else {
// Requests come from the igc_init_t passed to setGTPinInit(); gtpin_init
// is non-null here because of the early return above.
t.version =
std::min(gtpin_init->version, gtpin::igc::GTPIN_IGC_INTERFACE_VERSION);
if (!stackABI) {
if (gtpin_init->grf_info) {
t.grf_info = 1;
// free GRF info
numTokens++;
// indirect info
numTokens++;
}
if (gtpin_init->re_ra) {
t.re_ra = 1;
}
} else {
// provide only indirect info for stack calls
if (gtpin_init->grf_info) {
t.grf_info = 1;
numTokens++;
}
}
if (gtpin_init->srcline_mapping &&
kernel.getOptions()->getOption(vISA_GenerateDebugInfo))
t.srcline_mapping = 1;
if (gtpin_init->scratch_area_size > 0) {
t.scratch_area_size = gtpin_init->scratch_area_size;
numTokens++;
}
if (!t.grf_info && gtpin_init->grf_info) {
t.grf_info = 1;
numTokens++;
}
}
// For payload offsets
numTokens++;
// Report #GRFs
numTokens++;
// Header: init struct followed by the token count.
writeBuffer(buffer, bufferSize, &t, sizeof(t));
writeBuffer(buffer, bufferSize, &numTokens, sizeof(uint32_t));
if (t.grf_info) {
if (!stackABI) {
// create token
void *rerabuffer = nullptr;
unsigned rerasize = 0;
rerabuffer = getFreeGRFInfo(rerasize);
gtpin::igc::igc_token_header_t th;
th.token = gtpin::igc::GTPIN_IGC_TOKEN::GTPIN_IGC_TOKEN_GRF_INFO;
th.token_size = sizeof(gtpin::igc::igc_token_header_t) + rerasize;
// write token and data to buffer
writeBuffer(buffer, bufferSize, &th, sizeof(th));
writeBuffer(buffer, bufferSize, rerabuffer, rerasize);
// getFreeGRFInfo() hands over ownership of a malloc'ed buffer
free(rerabuffer);
}
// report indir refs
void *indirRefs = nullptr;
unsigned int indirRefsSize = 0;
indirRefs = getIndirRefs(indirRefsSize);
gtpin::igc::igc_token_header_t th;
th.token =
gtpin::igc::GTPIN_IGC_TOKEN::GTPIN_IGC_TOKEN_INDIRECT_ACCESS_INFO;
th.token_size = sizeof(gtpin::igc::igc_token_header_t) + indirRefsSize;
// write token and data to buffer
writeBuffer(buffer, bufferSize, &th, sizeof(th));
writeBuffer(buffer, bufferSize, indirRefs, indirRefsSize);
// getIndirRefs() hands over ownership of a malloc'ed buffer
free(indirRefs);
}
if (t.scratch_area_size) {
gtpin::igc::igc_token_scratch_area_info_t scratchSlotData;
scratchSlotData.scratch_area_size = t.scratch_area_size;
vISA_ASSERT(scratchOffset >= nextScratchFree, "scratch offset mismatch");
scratchSlotData.scratch_area_offset = scratchOffset;
// gtpin scratch slots are beyond spill memory
scratchSlotData.token = gtpin::igc::GTPIN_IGC_TOKEN_SCRATCH_AREA_INFO;
scratchSlotData.token_size = sizeof(scratchSlotData);
writeBuffer(buffer, bufferSize, &scratchSlotData, sizeof(scratchSlotData));
}
{
// Write payload offsets
gtpin::igc::igc_token_kernel_start_info_t offsets;
offsets.token = gtpin::igc::GTPIN_IGC_TOKEN_KERNEL_START_INFO;
offsets.per_thread_prolog_size = kernel.getPerThreadNextOff();
// cross-thread size is reported relative to the per-thread offset
offsets.cross_thread_prolog_size =
kernel.getCrossThreadNextOff() - offsets.per_thread_prolog_size;
offsets.token_size = sizeof(offsets);
writeBuffer(buffer, bufferSize, &offsets, sizeof(offsets));
}
{
// Report num GRFs
gtpin::igc::igc_token_num_grf_regs_t numGRFs;
numGRFs.token = gtpin::igc::GTPIN_IGC_TOKEN_NUM_GRF_REGS;
numGRFs.token_size = sizeof(numGRFs);
numGRFs.num_grf_regs = kernel.getNumRegTotal();
writeBuffer(buffer, bufferSize, &numGRFs, sizeof(numGRFs));
}
// Copy the staged bytes into the buffer that is returned to the caller.
void *gtpinBuffer = allocCodeBlock(bufferSize);
memcpy_s(gtpinBuffer, bufferSize, buffer.data(), bufferSize);
// Dump buffer with shader dumps
if (kernel.getOption(vISA_outputToFile)) {
std::string asmName = kernel.getOptions()->getOptionCstr(VISA_AsmFileName);
if (!asmName.empty()) {
const VISAKernelImpl *vKernel =
kernel.fg.builder->getParent()->getKernel(kernel.getName());
if (vKernel && vKernel->getIsFunction()) {
// functions get a "_f<id>" suffix on the dump file name
unsigned funcID = -1;
vKernel->GetFunctionId(funcID);
asmName += "_f" + std::to_string(funcID);
}
std::ofstream ofInit;
std::stringstream ssInit;
ssInit << asmName << ".gtpin_igc_init";
ofInit.open(ssInit.str(), std::ofstream::binary);
// the init file is created even when there is no init struct to write
if (gtpin_init) {
ofInit.write((const char *)gtpin_init, sizeof(*gtpin_init));
}
ofInit.close();
std::ofstream ofInfo;
std::stringstream ssInfo;
ssInfo << asmName << ".gtpin_igc_info";
ofInfo.open(ssInfo.str(), std::ofstream::binary);
if (gtpinBuffer) {
ofInfo.write((const char *)gtpinBuffer, bufferSize);
}
ofInfo.close();
}
}
return gtpinBuffer;
}
/// Records \p next, rounded up to a multiple of the GRF size in bytes, as the
/// next free scratch offset.
void gtPinData::setScratchNextFree(unsigned next) {
  const auto grfBytes = kernel.numEltPerGRF<Type_UB>();
  const auto grfCount = (next + grfBytes - 1) / grfBytes;
  nextScratchFree = grfCount * grfBytes;
}
/// Returns the value last stored by setScratchNextFree().
unsigned int gtPinData::getScratchNextFree() const {
  return nextScratchFree;
}
/// Returns the scratch area size requested for gtpin: the value from the init
/// struct when one was supplied, else the vISA_GTPinScratchAreaSize option
/// when gtpin was initialized from L0, else 0.
uint32_t gtPinData::getNumBytesScratchUse() const {
  if (gtpin_init) {
    return gtpin_init->scratch_area_size;
  }
  if (isGTPinInitFromL0()) {
    return kernel.getOptions()->getuInt32Option(vISA_GTPinScratchAreaSize);
  }
  return 0;
}
/// Constructs a kernel for the given platform and vISA version, allocates
/// gtpin data when a gtpin-related option is set, and initializes the GRF
/// related kernel parameters.
G4_Kernel::G4_Kernel(const PlatformInfo &pInfo, INST_LIST_NODE_ALLOCATOR &alloc,
                     Mem_Manager &m, Options *options, Attributes *anAttr,
                     uint32_t funcId, unsigned char major, unsigned char minor)
    : platformInfo(pInfo), m_options(options), m_kernelAttrs(anAttr),
      m_function_id(funcId), RAType(RA_Type::UNKNOWN_RA), asmInstCount(0),
      kernelID(0), fg(alloc, this, m), major_version(major),
      minor_version(minor), grfMode(pInfo.platform, pInfo.grfSize, options) {
  vISA_ASSERT(major < COMMON_ISA_MAJOR_VER || (major == COMMON_ISA_MAJOR_VER &&
                                               minor <= COMMON_ISA_MINOR_VER),
              "CISA version not supported by this JIT-compiler");

  name = nullptr;
  hasAddrTaken = false;
  kernelDbgInfo = nullptr;

  // gtpin data is only needed when free GRF info or a gtpin scratch area
  // was requested.
  const bool needsGTPinData =
      options->getOption(vISAOptions::vISA_GetFreeGRFInfo) ||
      options->getuInt32Option(vISAOptions::vISA_GTPinScratchAreaSize);
  if (!needsGTPinData) {
    gtPinInfo = nullptr;
  } else {
    allocGTPinData();
  }

  autoGRFSelection = m_options->getOption(vISA_AutoGRFSelection);

  // NoMask WA
  m_EUFusionNoMaskWAInfo = nullptr;

  setKernelParameters();
}
/// Releases the debug info and gtpin data (when present) and clears the
/// declare list.
G4_Kernel::~G4_Kernel() {
  if (kernelDbgInfo != nullptr) {
    kernelDbgInfo.reset();
  }
  if (gtPinInfo != nullptr) {
    gtPinInfo.reset();
  }
  Declares.clear();
}
void G4_Kernel::computeChannelSlicing() {
G4_ExecSize simdSize = getSimdSize();
channelSliced = true;
if (simdSize == g4::SIMD8 || simdSize == g4::SIMD16) {
// SIMD8/16 kernels are not sliced
channelSliced = false;
return;
}
if (simdSize == g4::SIMD32 && numEltPerGRF<Type_UB>() >= 64) {
// For 64 bytes GRF, simd32 kernel, there is no slicing
channelSliced = false;
return;
}
// .dcl V1 size = 128 bytes
// op (16|M0) V1(0,0) ..
// op (16|M16) V1(2,0) ..
// For above sequence, return 32. Instruction
// is broken in to 2 only due to hw restriction.
// Allocation of dcl is still as if it were a
// SIMD32 kernel.
// Store emask bits that are ever used to define a variable
std::unordered_map<G4_Declare *, std::bitset<32>> emaskRef;
for (auto bb : fg) {
for (auto inst : *bb) {
if (inst->isSend())
continue;
auto dst = inst->getDst();
if (!dst || !dst->getTopDcl() || dst->getHorzStride() != 1)
continue;
if (inst->isWriteEnableInst())
continue;
auto regFileKind = dst->getTopDcl()->getRegFile();
if (regFileKind != G4_RegFileKind::G4_GRF &&
regFileKind != G4_RegFileKind::G4_INPUT)
continue;
if (dst->getTopDcl()->getByteSize() <=
dst->getTypeSize() * (unsigned)simdSize)
continue;
auto emaskOffStart = inst->getMaskOffset();
// Reset all bits on first encounter of dcl
if (emaskRef.find(dst->getTopDcl()) == emaskRef.end())
emaskRef[dst->getTopDcl()].reset();
// Set bits based on which EM bits are used in the def
for (unsigned i = emaskOffStart;
i != (emaskOffStart + inst->getExecSize()); i++) {
emaskRef[dst->getTopDcl()].set(i);
}
}
}
// Check whether any variable's emask usage straddles across lower and upper
// 16 bits
for (auto &emRefs : emaskRef) {
auto &bits = emRefs.second;
auto num = bits.to_ulong();
// Check whether any lower 16 and upper 16 bits are set
if (((num & 0xffff) != 0) && ((num & 0xffff0000) != 0)) {
channelSliced = false;
return;
}
}
return;
}
/// Determines the kernel's default execution size (needed to pick the
/// alignment of GRF candidates) and, where generic augmentation alignment is
/// in use, the channel slicing.
///
/// Runs only once per kernel: later passes should not introduce instructions
/// with a larger SIMD size than the input had.
void G4_Kernel::calculateSimdSize() {
  if (simdSize.value != 0) {
    return;
  }

  // First, get simdsize from attribute (0 : not given).
  // If not 0|8|16|32, wrong value from attribute.
  simdSize = G4_ExecSize(
      (unsigned)m_kernelAttrs->getInt32KernelAttr(Attributes::ATTR_SimdSize));

  const bool attrIsUsable = simdSize == g4::SIMD8 || simdSize == g4::SIMD16 ||
                            simdSize == g4::SIMD32;
  if (!attrIsUsable) {
    vISA_ASSERT(simdSize.value == 0,
                "vISA: wrong value for SimdSize attribute");
    // No attribute: start from the native size (pvc+: simd16; simd8
    // otherwise) and widen based on the instructions actually present.
    simdSize = fg.builder->getNativeExecSize();
    for (auto bb : fg) {
      for (auto inst : *bb) {
        // Do not consider send since for certain messages we have to set its
        // execution size to 16 even in simd8 shaders.
        // Also skip labels and noMask inst.
        if (inst->isLabel() || inst->isSend() || inst->isWriteEnableInst())
          continue;

        const uint32_t channelEnd = inst->getMaskOffset() + inst->getExecSize();
        if (channelEnd > 16) {
          simdSize = g4::SIMD32;
          break;
        }
        if (channelEnd > 8) {
          simdSize = g4::SIMD16;
        }
      }
      // SIMD32 is the widest size; no need to look any further.
      if (simdSize == g4::SIMD32)
        break;
    }
  }

  if (GlobalRA::useGenericAugAlign(getPlatformGeneration()))
    computeChannelSlicing();
}
//
// Moves the kernel to the next larger GRF mode and updates the related
// structures. Returns false when the kernel already uses the maximum number
// of GRFs, true otherwise.
//
bool G4_Kernel::updateKernelToLargerGRF() {
  const bool alreadyAtMax = (numRegTotal == grfMode.getMaxGRF());
  if (alreadyAtMax) {
    return false;
  }

  // Scale number of GRFs, Acc, SWSB tokens.
  setKernelParameters(grfMode.moveToLargerGRF());
  fg.builder->rebuildPhyRegPool(getNumRegTotal());
  return true;
}
//
// Selects the GRF mode from the given register pressure and, if the number
// of GRFs changes, updates the kernel's related structures.
//
void G4_Kernel::updateKernelByRegPressure(unsigned regPressure,
                                          bool forceGRFModeUp) {
  // Largest register needed for inputs; the MaxRegThreadDispatch attribute,
  // when set, can only raise it.
  unsigned largestInputReg = getLargestInputRegister();
  if (m_kernelAttrs->isKernelAttrSet(Attributes::ATTR_MaxRegThreadDispatch)) {
    const unsigned dispatchLimit = m_kernelAttrs->getInt32KernelAttr(
        Attributes::ATTR_MaxRegThreadDispatch);
    if (dispatchLimit > largestInputReg) {
      largestInputReg = dispatchLimit;
    }
  }

  const unsigned newGRF = grfMode.setModeByRegPressure(
      regPressure, largestInputReg, forceGRFModeUp);
  if (newGRF != numRegTotal) {
    // Scale number of threads, Acc, SWSB tokens.
    setKernelParameters(newGRF);
    // Update physical register pool.
    fg.builder->rebuildPhyRegPool(getNumRegTotal());
  }
}
//
// Updates kernel's related structures based on the NumGRF attribute.
// Returns false when the attribute holds a non-zero value that is not a
// valid number of GRFs; true otherwise.
//
bool G4_Kernel::updateKernelFromNumGRFAttr() {
  const unsigned requestedGRFs =
      m_kernelAttrs->getInt32KernelAttr(Attributes::ATTR_NumGRF);
  const bool attrGiven = (requestedGRFs != 0);

  if (attrGiven && !grfMode.isValidNumGRFs(requestedGRFs)) {
    return false;
  }

  if (numRegTotal != requestedGRFs) {
    // An explicit GRF count disables automatic GRF selection.
    autoGRFSelection = !attrGiven;
    // Scale number of GRFs, Acc, SWSB tokens.
    setKernelParameters(requestedGRFs);
    fg.builder->rebuildPhyRegPool(getNumRegTotal());
  }
  return true;
}
//
// Evaluate AddrExp/AddrExpList to Imm
//
void G4_Kernel::evalAddrExp() {
for (std::list<G4_BB *>::iterator it = fg.begin(), itEnd = fg.end();
it != itEnd; ++it) {
G4_BB *bb = (*it);
for (INST_LIST_ITER i = bb->begin(), iEnd = bb->end(); i != iEnd; i++) {
G4_INST *inst = (*i);
//
// process each source operand
//
for (unsigned j = 0, numSrc = inst->getNumSrc(); j < numSrc; j++) {
G4_Operand *opnd = inst->getSrc(j);
if (!opnd)
continue;
if (opnd->isAddrExp()) {
int val = opnd->asAddrExp()->eval(*fg.builder);
G4_Type ty = opnd->asAddrExp()->getType();
G4_Imm *imm = fg.builder->createImm(val, ty);
inst->setSrc(imm, j);
}
}
}
}
}
/// Splits \p str at every character contained in \p delimiter and returns
/// the non-empty pieces in order of appearance.
[[maybe_unused]] static std::vector<std::string> split(const std::string &str,
                                                       const char *delimiter) {
  std::vector<std::string> tokens;
  std::string::size_type tokenBegin = 0;

  while (true) {
    const auto tokenEnd = str.find_first_of(delimiter, tokenBegin);
    if (tokenEnd == std::string::npos) {
      break;
    }
    // Consecutive delimiters produce no (empty) token.
    if (tokenEnd != tokenBegin) {
      tokens.emplace_back(str, tokenBegin, tokenEnd - tokenBegin);
    }
    tokenBegin = tokenEnd + 1;
  }

  // Whatever follows the last delimiter is the final token.
  if (tokenBegin < str.length()) {
    tokens.emplace_back(str, tokenBegin, str.length() - tokenBegin);
  }
  return tokens;
}
/// Maps a vISA target platform to the corresponding IGA platform, or
/// IGA_GEN_INVALID when there is no mapping.
static iga_gen_t getIGAPlatform(TARGET_PLATFORM genPlatform) {
  switch (genPlatform) {
  case GENX_BDW:
    return IGA_GEN8;
  case GENX_CHV:
    return IGA_GEN8lp;
  case GENX_SKL:
    return IGA_GEN9;
  case GENX_BXT:
    return IGA_GEN9lp;
  case GENX_ICLLP:
    return IGA_GEN11;
  case GENX_TGLLP:
    return IGA_GEN12p1;
  case Xe_XeHPSDV:
    return IGA_XE_HP;
  case Xe_DG2:
  case Xe_MTL:
  case Xe_ARL:
    return IGA_XE_HPG;
  case Xe_PVC:
  case Xe_PVCXT:
    return IGA_XE_HPC;
  case Xe2:
    return IGA_XE2;
  case Xe3:
    return IGA_XE3;
  default:
    return IGA_GEN_INVALID;
  }
}
/// Returns the kernel's debug info object, creating it on first use.
KernelDebugInfo *G4_Kernel::getKernelDebugInfo() {
  if (!kernelDbgInfo) {
    kernelDbgInfo = std::make_shared<KernelDebugInfo>();
  }
  return kernelDbgInfo.get();
}
/// Creates m_kernelCost and fills it from the kernel cost analysis \p KCA.
///
/// Data is copied from the analysis' FuncCost/LoopCost (source types) into
/// KernelCostInfo/LoopCostInfo (destination types); each loop's entry links
/// to the entries of its immediate child loops.
void G4_Kernel::createKernelCostInfo(KernelCost *KCA) {
  m_kernelCost = std::make_unique<KernelCostInfo>();
  KernelCostInfo &costInfo = *m_kernelCost;

  FuncCost &FC = KCA->getKernelCost();
  int sz = FC.m_allLoopsInProgramOrder.size();
  // All entries exist before any pointer to one of them is taken below.
  costInfo.allLoopCosts.resize(sz);
  costInfo.kernelCost.C = FC.m_funcCost.C.getCostMetrics();

  for (int i = 0; i < sz; ++i) {
    const Loop *L = FC.m_allLoopsInProgramOrder[i];
    LoopCost &LC = KCA->getLoopCost(L);
    LoopCostInfo &LCI = costInfo.allLoopCosts[i];

    vISA_ASSERT(i == LC.m_loopId, "Kernel Cost Analysis: incorrect loop id");
    LCI.loopId = i;
    LCI.backedge_visaId = LC.m_backedge_visaId;
    LCI.LCE = nullptr;
    LCI.nestingLevel = L->getNestingLevel();

    const CostMetrics &cm = LC.m_loopBodyCost.C.getCostMetrics();
    LCI.loopBodyCost.C = cm;

    LCI.numChildLoops = L->getNumImmChildLoops();
    vISA_ASSERT(LCI.numChildLoops == LC.m_loopBodyCost.LoopCosts.size(),
                "Kernel Cost Analysis: incorrect number of child loops!");

    // Link the immediate child loops by their loop id.
    for (LoopCost *immLC : LC.m_loopBodyCost.LoopCosts) {
      int loop_id = immLC->m_loopId;
      LoopCostInfo &immLCI = costInfo.allLoopCosts[loop_id];
      LCI.loopBodyCost.loopCosts.push_back(&immLCI);
    }
  }
}
/// Selects the stack call ABI version. VISA ABI version 1 is deprecated, so
/// version 2 is the default.
void StackCallABI::setVersion() {
  version = StackCallABIVersion::VER_2;
}
// Binds this ABI description to kernel k (may be called only once), selects
// the ABI version and loads the sub-register assignments and frame
// descriptor offsets that belong to that version.
void StackCallABI::init(G4_Kernel *k) {
vISA_ASSERT(!kernel, "init called multiple times");
kernel = k;
setVersion();
// ABI v3 has platform requirements of its own.
if (version == StackCallABIVersion::VER_3) {
vISA_ASSERT(kernel->getGRFSize() == 64, "require 64-byte GRF for ABI v3");
vISA_ASSERT(kernel->getPlatform() >= TARGET_PLATFORM::Xe3,
"ABI v3 supported only for Xe3+");
}
switch (version) {
// v1 and v2 share one set of sub-registers and frame descriptor offsets
case StackCallABIVersion::VER_1:
case StackCallABIVersion::VER_2:
subRegs.Ret_IP = SubRegs_Stackcall_v1_v2_Ret_IP;
subRegs.Ret_EM = SubRegs_Stackcall_v1_v2_Ret_EM;
subRegs.BE_SP = SubRegs_Stackcall_v1_v2_BE_SP;
subRegs.BE_FP = SubRegs_Stackcall_v1_v2_BE_FP;
subRegs.FE_FP = SubRegs_Stackcall_v1_v2_FE_FP;
subRegs.FE_SP = SubRegs_Stackcall_v1_v2_FE_SP;
offsets.Ret_IP = FrameDescriptorOfsets_v1_v2_Ret_IP;
offsets.Ret_EM = FrameDescriptorOfsets_v1_v2_Ret_EM;
offsets.BE_SP = FrameDescriptorOfsets_v1_v2_BE_SP;
offsets.BE_FP = FrameDescriptorOfsets_v1_v2_BE_FP;
offsets.FE_FP = FrameDescriptorOfsets_v1_v2_FE_FP;
offsets.FE_SP = FrameDescriptorOfsets_v1_v2_FE_SP;
break;
case StackCallABIVersion::VER_3:
subRegs.Ret_IP = SubRegs_Stackcall_v3_Ret_IP;
subRegs.Ret_EM = SubRegs_Stackcall_v3_Ret_EM;
subRegs.BE_SP = SubRegs_Stackcall_v3_BE_SP;
subRegs.BE_FP = SubRegs_Stackcall_v3_BE_FP;
subRegs.FE_FP = SubRegs_Stackcall_v3_FE_FP;
subRegs.FE_SP = SubRegs_Stackcall_v3_FE_SP;
offsets.Ret_IP = FrameDescriptorOfsets_v3_Ret_IP;
offsets.Ret_EM = FrameDescriptorOfsets_v3_Ret_EM;
offsets.BE_SP = FrameDescriptorOfsets_v3_BE_SP;
offsets.BE_FP = FrameDescriptorOfsets_v3_BE_FP;
offsets.FE_FP = FrameDescriptorOfsets_v3_FE_FP;
offsets.FE_SP = FrameDescriptorOfsets_v3_FE_SP;
break;
default:
vISA_ASSERT(false, "unknown ABI");
}
// Argument and return value registers are the same for every version.
argReg = ArgRet_Stackcall_Arg;
retReg = ArgRet_Stackcall_Ret;
}
/// Returns the first of the GRFs reserved for stack call purposes: the
/// reserved GRFs (numReservedABIGRF() of them) are the last ones of the
/// register file.
unsigned StackCallABI::getStackCallStartReg() const {
  return kernel->getNumRegTotal() - numReservedABIGRF();
}
/// Returns the first callee-save GRF, i.e. the GRF right after the last
/// caller-save GRF.
unsigned StackCallABI::calleeSaveStart() const {
  const unsigned lastCallerSave = getCallerSaveLastGRF();
  return lastCallerSave + 1;
}
/// Returns the number of callee-save GRFs: everything from calleeSaveStart()
/// up to, but excluding, the GRFs reserved for the ABI.
unsigned StackCallABI::getNumCalleeSaveRegs() const {
  return kernel->getNumRegTotal() - calleeSaveStart() - numReservedABIGRF();
}
/// Returns how many GRFs the stack call ABI reserves:
/// 3 for v1; 3 for v2, or 2 when r0 is preserved in r0
/// (vISA_PreserveR0InR0); 1 for later versions.
uint32_t StackCallABI::numReservedABIGRF() const {
  if (version == StackCallABIVersion::VER_1) {
    return 3;
  }
  if (version != StackCallABIVersion::VER_2) {
    // for ABI version > 2
    return 1;
  }
  return kernel->getOption(vISA_PreserveR0InR0) ? 2 : 3;
}
/// Returns the GRF holding the frame and stack pointers.
///
/// For ABI V1, return (numRegTotal - 3), i.e. 125.
/// For ABI V2 and V3, return (numRegTotal - 1), i.e. 127, 255.
uint32_t StackCallABI::getFPSPGRF() const {
  if (version == StackCallABIVersion::VER_1) {
    // V1 counts up from the first reserved GRF.
    return getStackCallStartReg() + FPSPGRF;
  }
  // V2 and later count down from the last GRF.
  return (kernel->getNumRegTotal() - 1) - FPSPGRF;
}
/// Returns the GRF used as spill header.
///
/// For ABI V1 and V2 return r126.
/// For ABI V3 return r127 (the FP/SP GRF).
uint32_t StackCallABI::getSpillHeaderGRF() const {
  if (version == StackCallABIVersion::VER_1) {
    return getStackCallStartReg() + SpillHeaderGRF;
  }
  if (version == StackCallABIVersion::VER_2) {
    return (kernel->getNumRegTotal() - 1) - SpillHeaderGRF;
  }
  return kernel->stackCall.getFPSPGRF();
}
/// Returns the GRF used as the stack call thread header. Must not be called
/// when vISA_PreserveR0InR0 is set, since no special header is needed then.
///
/// For ABI V1 return r127.
/// For ABI V2 return r125.
uint32_t StackCallABI::getThreadHeaderGRF() const {
  vISA_ASSERT(
      kernel->getOption(vISA_PreserveR0InR0) == false,
      "r0 is preserved in r0 itself. no special stack call header needed");

  if (version != StackCallABIVersion::VER_1) {
    return (kernel->getNumRegTotal() - 1) - ThreadHeaderGRF;
  }
  return getStackCallStartReg() + ThreadHeaderGRF;
}
//
// Applies every entry of the relocation table to the given binary.
//
void G4_Kernel::doRelocation(void *binary, uint32_t binarySize) {
  for (auto &&reloc : relocationTable) {
    reloc.doRelocation(*this, binary, binarySize);
  }
}
/// Walks the basic blocks in order and returns the first instruction
/// reported by G4_BB::getFirstInst(), or nullptr for an empty kernel.
G4_INST *G4_Kernel::getFirstNonLabelInst() const {
  auto bbIt = fg.cbegin();
  const auto bbEnd = fg.cend();
  while (bbIt != bbEnd) {
    if (G4_INST *candidate = (*bbIt)->getFirstInst()) {
      return candidate;
    }
    ++bbIt;
  }
  // empty kernel
  return nullptr;
}
/// Returns line \p srcLine (1-based) of source file \p fileName.
///
/// The lines of each file are read once and cached in debugSrcLineMap; a
/// file that cannot be opened is cached as unavailable. Returns "" when the
/// file is unavailable and "invalid line number" when \p srcLine is out of
/// range.
std::string G4_Kernel::getDebugSrcLine(const std::string &fileName,
                                       int srcLine) {
  auto cached = debugSrcLineMap.find(fileName);
  if (cached == debugSrcLineMap.end()) {
    std::ifstream srcFile(fileName);
    if (!srcFile) {
      // file doesn't exist; remember that so it is not retried
      debugSrcLineMap[fileName] =
          std::make_pair<bool, std::vector<std::string>>(false, {});
      return "";
    }

    std::vector<std::string> srcLines;
    for (std::string line; std::getline(srcFile, line);) {
      srcLines.push_back(line);
    }
    debugSrcLineMap[fileName] = std::make_pair(true, std::move(srcLines));
    cached = debugSrcLineMap.find(fileName);
  }

  if (cached == debugSrcLineMap.end() || !cached->second.first) {
    return "";
  }

  auto &lines = cached->second.second;
  const bool inRange = srcLine > 0 && srcLine <= (int)lines.size();
  if (!inRange) {
    return "invalid line number";
  }
  return lines[srcLine - 1];
}
unsigned G4_Kernel::getLargestInputRegister() {
const unsigned inputCount = fg.builder->getInputCount();
unsigned regNum = 0;
if (inputCount) {
const input_info_t *ii = fg.builder->getInputArg(inputCount - 1);
regNum = (ii->offset + ii->dcl->getByteSize()) /
fg.builder->numEltPerGRF<Type_UB>();
}
return regNum;
}
// Selects the GRF mode of the kernel and derives from it the number of GRFs,
// the last caller-save GRF of the stack call ABI, and the number of threads,
// SWSB tokens and accumulators.
//
// newGRF: when non-zero, the number of GRFs to switch to; it takes
// precedence over every vISA option override handled below.
void G4_Kernel::setKernelParameters(unsigned newGRF) {
unsigned overrideGRFNum = 0, overrideNumThreads = 0, overrideNumSWSB = 0,
overrideNumAcc = 0;
overrideGRFNum = m_options->getuInt32Option(vISA_TotalGRFNum);
overrideNumThreads = m_options->getuInt32Option(vISA_HWThreadNumberPerEU);
overrideNumSWSB = m_options->getuInt32Option(vISA_SWSBTokenNum);
overrideNumAcc = m_options->getuInt32Option(vISA_numGeneralAcc);
//
// Number of threads/GRF can currently be set by:
// 1.- Per kernel attribute
// 2.- IGC flag (reg key)
// 3.- Compiler option entered by user for
// 3.1 entire module
// 3.2 kernel function
// 4.- Compiler heuristics
//
// 1 is set via kernel attribute. 2 and 3 via vISA option.
// If none of them are set, compiler selects the best option (4).
//
// The branches below are checked in priority order: explicit newGRF, then
// the thread-count override, then the GRF-count override, then the default.
// overrideGRFNum is cleared in every branch except the GRF-count override so
// that only that branch can dictate numRegTotal further down.
if (newGRF > 0) {
// per kernel attribute or GRF change during compilation
grfMode.setModeByNumGRFs(newGRF);
overrideGRFNum = 0;
} else if (overrideNumThreads > 0) {
// Forcing a specific number of threads
grfMode.setModeByNumThreads(overrideNumThreads);
overrideGRFNum = 0;
autoGRFSelection = false;
} else if (overrideGRFNum > 0) {
// Forcing a specific number of GRFs
grfMode.setModeByNumGRFs(overrideGRFNum);
autoGRFSelection = false;
} else {
// Use default value
grfMode.setDefaultGRF();
overrideGRFNum = 0;
}
// Set number of GRFs
numRegTotal = overrideGRFNum ? overrideGRFNum : grfMode.getNumGRF();
auto lastCallerSavedGRF =
getOptions()->getuInt32Option(vISA_LastCallerSavedGRF);
// When vISA_LastCallerSavedGRF is set, it's an ABI breaking change.
// Kernel and entire callee nest must be compiled with same
// value of vISA_LastCallerSavedGRF for correctness.
if (lastCallerSavedGRF)
stackCall.setCallerSaveLastGRF(lastCallerSavedGRF);
else
// default: derived from the total number of GRFs
stackCall.setCallerSaveLastGRF(((numRegTotal - 8) / 2) - 1);
// Set number of threads
numThreads = grfMode.getNumThreads();
// Set the number of SWSB tokens
numSWSBTokens =
overrideNumSWSB ? overrideNumSWSB : grfMode.getNumSWSBTokens();
// Set the number of Acc
numAcc = overrideNumAcc ? overrideNumAcc : grfMode.getNumAcc();
// Special configurations go here
// vISA_hasDoubleAcc wins over both the override and the GRF mode value.
if (m_options->getOption(vISA_hasDoubleAcc)) {
numAcc = 16;
}
}
bool G4_Kernel::hasInlineData() const {
const IR_Builder &b = *fg.builder;
return
b.getOption(vISA_useInlineData);
}
/// Computes, for each non-pseudo kernel input, the GRF byte address it is
/// loaded to and where it comes from in memory: per-thread input (PTI),
/// cross-thread input (CTI) or inline data. The result is sorted by
/// destination GRF address.
///
/// For entry kernels (per the FC patch info) the scan stops at the first
/// general input whose declare is not live-in.
std::vector<ArgLayout> G4_Kernel::getArgumentLayout() {
  // Byte address of the first payload register.
  const uint32_t inputsStart =
      getOptions()->getuInt32Option(vISA_loadThreadPayloadStartReg) *
      getGRFSize();
  const uint32_t inputCount = fg.builder->getInputCount();
  // Per-thread input size, rounded up to whole GRFs.
  const int PTIS = AlignUp(
      getInt32KernelAttr(Attributes::ATTR_PerThreadInputSize), getGRFSize());
  const uint32_t inlineDataSize = fg.builder->getInlineDataSize();
  const bool useInlineData = hasInlineData();

  // Checks if input_info is cross-thread-input.
  auto isInCrossThreadData = [&](const input_info_t *input_info) {
    return (uint32_t)input_info->offset >= inputsStart + PTIS;
  };

  // Checks if input_info is a cross-thread input that fits in inlineData.
  auto isInInlineData = [&](const input_info_t *const input_info) {
    if (!useInlineData || !isInCrossThreadData(input_info)) {
      return false;
    }
    uint32_t inputEnd = input_info->offset + input_info->size;
    return inputEnd <= inputsStart + PTIS + inlineDataSize;
  };

  std::vector<ArgLayout> args;
  for (unsigned ix = 0; ix < inputCount; ix++) {
    const input_info_t *input = fg.builder->getInputArg(ix);
    if (input->isPseudoInput()) {
      continue;
    }
    if (fg.builder->getFCPatchInfo()->getIsEntryKernel() &&
        INPUT_GENERAL == input->getInputClass() && !input->dcl->isLiveIn()) {
      break;
    }

    int dstGrfAddr = input->offset;
    int memOff = input->offset - inputsStart; // subtract off r0
    auto memSrc = ArgLayout::MemSrc::INVALID;
    if (isInInlineData(input)) {
      memSrc = ArgLayout::MemSrc::INLINE;
      memOff %= getGRFSize();
      vISA_ASSERT(memOff < (int)inlineDataSize, "inline reg arg OOB");
      vISA_ASSERT(memOff + (int)input->size <= (int)inlineDataSize,
                  "inline reg arg overflows");
    } else if (isInCrossThreadData(input)) {
      memSrc = ArgLayout::MemSrc::CTI;
      // Offset within cross-thread data: skip per-thread and inline data.
      memOff -= PTIS + (useInlineData ? inlineDataSize : 0);
    } else {
      memSrc = ArgLayout::MemSrc::PTI;
    }
    args.emplace_back(input->dcl, dstGrfAddr, memSrc, memOff, input->size);
  }

  std::sort(args.begin(), args.end(),
            [](const ArgLayout &lhs, const ArgLayout &rhs) {
              return lhs.dstGrfAddr < rhs.dstGrfAddr;
            });
  return args;
}
/// Prints the kernel's flow graph to \p os.
void G4_Kernel::dump(std::ostream &os) const {
  fg.print(os);
}
/// Writes a .dot and/or .g4 dump of the kernel, depending on the dump
/// options (\p forceG4Dump forces the g4 dump).
///
/// Successive calls produce a numbered sequence of dumps:
///   [stem].000.[suffix].{dot,g4}
///   [stem].001.[suffix].{dot,g4}
///   ...
/// where the stem is the asm file name (plus "_f<id>" for callees) when one
/// is set, and otherwise the kernel name or an internal "k<id>"/"f<id>" name.
/// If vISA_DumpPassesSubset == 1 then we omit any files that don't change
/// the string representation of the kernel (i.e. skip passes that don't do
/// anything).
void G4_Kernel::dumpToFile(const std::string &suffixIn, bool forceG4Dump) {
  const bool dumpDot = m_options->getOption(vISA_DumpDot);
  const bool dumpG4 = forceG4Dump || m_options->getOption(vISA_DumpPasses) ||
                      m_options->getuInt32Option(vISA_DumpPassesSubset) >= 1;
  if (!(dumpDot || dumpG4)) {
    return;
  }

  // todo: remove the no-prefix branches as they are not reached at all.
  std::stringstream ss;
  const char *prefix = nullptr;
  getOptions()->getOption(VISA_AsmFileName, prefix);

  // First the stem of the file name ...
  if (prefix != nullptr) {
    // Use AsmFileName as prefix for g4/dot dumps; callees get "_f<id>".
    ss << prefix;
    if (!fg.builder->getIsKernel()) {
      ss << "_f" << getFunctionId();
    }
  } else if (m_options->getOption(vISA_DumpUseInternalName) ||
             name == nullptr) {
    if (fg.builder->getIsKernel()) {
      ss << "k" << getKernelID();
    } else {
      ss << "f" << getFunctionId();
    }
  } else {
    ss << name;
  }

  // ... then the zero-padded dump index and the suffix.
  ss << "." << std::setfill('0') << std::setw(3) << nextDumpIndex++ << "."
     << suffixIn;

  std::string baseName = sanitizePathString(ss.str());
  if (dumpDot) {
    dumpDotFileInternal(baseName);
  }
  if (dumpG4) {
    dumpG4Internal(baseName);
  }
}
// Writes the G4 dump of this kernel to standard output.
void G4_Kernel::dumpToConsole() {
  dumpG4InternalTo(std::cout);
}
// Emits the device assembly listing for this kernel to `os`.
//
//   binary / binarySize - encoded kernel binary; when either is missing, or
//                         vISA_dumpNewSyntax is off, the old G4-based syntax
//                         is emitted instead of the IGA-decoded one
//
// The header comment is emitted first unless vISA_StripComments is set. After
// an IGA listing, bank conflict statistics are appended.
void G4_Kernel::emitDeviceAsm(std::ostream &os, const void *binary,
                              uint32_t binarySize) {
  //
  // for GTGPU lib release, don't dump out asm
  //
#if defined(NDEBUG) && defined(GTGPU_LIB)
  return;
#endif
  const bool haveBinary = binary != nullptr && binarySize != 0;
  const bool newAsm = m_options->getOption(vISA_dumpNewSyntax) && haveBinary;

  if (!m_options->getOption(vISA_StripComments)) {
    emitDeviceAsmHeaderComment(os);
  }

  if (newAsm) {
    emitDeviceAsmInstructionsIga(os, binary, binarySize);
    if (getPlatformGeneration() >= PlatformGen::XE) {
      os << "\n\n";
      auto jitInfo = fg.builder->getJitInfo();
      os << "//.BankConflicts: " << jitInfo->statsVerbose.BCNum << "\n";
      os << "//.ByteRMWs: " << jitInfo->statsVerbose.numByteRMWs << "\n//\n";
    } else {
      os << "// Bank Conflict Statistics: \n";
      os << "// -- GOOD: " << fg.BCStats.NumOfGoodInsts << "\n";
      os << "// -- BAD: " << fg.BCStats.NumOfBadInsts << "\n";
      os << "// -- OK: " << fg.BCStats.NumOfOKInsts << "\n";
    }
    return;
  }

  emitDeviceAsmInstructionsOldAsm(os);
}
// Writes the register info dump to "<asm file name>.reginfo". When no asm
// file name option is set the file is simply named ".reginfo".
void G4_Kernel::emitRegInfo() {
  const char *asmName = nullptr;
  getOptions()->getOption(VISA_AsmFileName, asmName);

  std::string dumpFileName(asmName ? asmName : "");
  dumpFileName += ".reginfo";

  std::fstream regInfoFile(dumpFileName, std::ios::out);
  emitRegInfoKernel(regInfoFile);
  regInfoFile.close();
}
void G4_Kernel::emitRegInfoKernel(std::ostream &output) {
output << "//.platform " << getGenxPlatformString();
output << "\n"
<< "//.kernel ID 0x" << std::hex << getKernelID() << "\n";
output << std::dec << "\n";
int instOffset = 0;
for (BB_LIST_ITER itBB = fg.begin(); itBB != fg.end(); ++itBB) {
for (INST_LIST_ITER itInst = (*itBB)->begin(); itInst != (*itBB)->end();
++itInst) {
G4_INST *inst = (*itInst);
if (inst->isLabel()) {
continue;
}
if (inst->getLexicalId() == -1) {
continue;
}
(*itBB)->emitRegInfo(output, inst, instOffset);
instOffset += inst->isCompactedInst() ? 8 : 16;
}
}
return;
}
//
// This routine dumps out the dot file of the control flow graph along with
// instructions. dot is drawing graph tool from AT&T.
//
void G4_Kernel::dumpDotFileInternal(const std::string &baseName) {
std::fstream ofile(baseName + ".dot", std::ios::out);
vASSERT(!ofile.fail());
//
// write digraph KernelName {"
// size = "8, 10";
//
const char *asmFileName = NULL;
m_options->getOption(VISA_AsmFileName, asmFileName);
if (asmFileName == NULL)
ofile << "digraph UnknownKernel"
<< " {"
<< "\n";
else
ofile << "digraph " << asmFileName << " {"
<< "\n";
//
// keep the graph width 8, estimate a reasonable graph height
//
const unsigned itemPerPage = 64; // 60 instructions per Letter page
unsigned totalItem = (unsigned)Declares.size();
for (std::list<G4_BB *>::iterator it = fg.begin(); it != fg.end(); ++it)
totalItem += ((unsigned)(*it)->size());
totalItem += (unsigned)fg.size();
float graphHeight = (float)totalItem / itemPerPage;
graphHeight =
graphHeight < 100.0f ? 100.0f : graphHeight; // minimal size: Letter
ofile << "\n\t// Setup\n";
ofile << "\tsize = \"80.0, " << graphHeight << "\";\n";
ofile << "\tpage= \"80.5, 110\";\n";
ofile << "\tpagedir=\"TL\";\n";
// dump out flow graph
for (std::list<G4_BB *>::iterator it = fg.begin(); it != fg.end(); ++it) {
G4_BB *bb = (*it);
//
// write: BB0 [shape=plaintext, label=<
// <TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0">
// <TR><TD ALIGN="CENTER">BB0: TestRA_Dot</TD></TR>
// <TR><TD>
// <TABLE BORDER="0" CELLBORDER="0"
// CELLSPACING="0">
// <TR><TD
// ALIGN="LEFT">TestRA_Dot:</TD></TR>
// <TR><TD ALIGN="LEFT"><FONT
// color="red">add (8) Region(0,0)[1]
// Region(0,0)[8;8,1] PAYLOAD(0,0)[8;8,1]
// [NoMask]</FONT></TD></TR>
// </TABLE>
// </TD></TR>
// </TABLE>>];
// print out label if the first inst is a label inst
//
ofile << "\t";
bb->writeBBId(ofile);
ofile << " [shape=plaintext, label=<"
<< "\n";
ofile << "\t\t\t <TABLE BORDER=\"0\" CELLBORDER=\"1\" CELLSPACING=\"0\">"
<< "\n";
ofile << "\t\t\t\t<TR><TD ALIGN=\"CENTER\">";
bb->writeBBId(ofile);
ofile << ": ";
if (!bb->empty() && bb->front()->isLabel()) {
bb->front()->getSrc(0)->emit(ofile);
}
ofile << "</TD></TR>"
<< "\n";
// emit all instructions within basic block
ofile << "\t\t\t\t<TR><TD>"
<< "\n";
if (!bb->empty()) {
ofile << "\t\t\t\t\t <TABLE BORDER=\"0\" CELLBORDER=\"0\" "
"CELLSPACING=\"0\">"
<< "\n";
for (INST_LIST_ITER i = bb->begin(); i != bb->end(); i++) {
//
// detect if there is spill code first, set different color for it
//
std::string fontColor = "black";
//
// emit the instruction
//
ofile << "\t\t\t\t\t\t<TR><TD ALIGN=\"LEFT\"><FONT color=\""
<< fontColor << "\">";
std::ostringstream os;
(*i)->emit(os);
std::string dotStr(os.str());
// TODO: dot doesn't like '<', '>', '{', or '}' (and '&') this code
// below is a hack. need to replace with delimiters.
// std::replace_if(dotStr.begin(), dotStr.end(),
// bind2nd(equal_to<char>(), '<'), '[');
std::replace_if(
dotStr.begin(), dotStr.end(),
std::bind(std::equal_to<char>(), std::placeholders::_1, '<'), '[');
std::replace_if(
dotStr.begin(), dotStr.end(),
std::bind(std::equal_to<char>(), std::placeholders::_1, '>'), ']');
std::replace_if(
dotStr.begin(), dotStr.end(),
std::bind(std::equal_to<char>(), std::placeholders::_1, '{'), '[');
std::replace_if(
dotStr.begin(), dotStr.end(),
std::bind(std::equal_to<char>(), std::placeholders::_1, '}'), ']');
std::replace_if(
dotStr.begin(), dotStr.end(),
std::bind(std::equal_to<char>(), std::placeholders::_1, '&'), '$');
ofile << dotStr;
ofile << "</FONT></TD></TR>"
<< "\n";
// ofile << "\\l"; // left adjusted
}
ofile << "\t\t\t\t\t </TABLE>"
<< "\n";
}
ofile << "\t\t\t\t</TD></TR>"
<< "\n";
ofile << "\t\t\t </TABLE>>];"
<< "\n";
//
// dump out succ edges
// BB12 -> BB10
//
for (std::list<G4_BB *>::iterator sit = bb->Succs.begin();
sit != bb->Succs.end(); ++sit) {
bb->writeBBId(ofile);
ofile << " -> ";
(*sit)->writeBBId(ofile);
ofile << "\n";
}
}
//
// write "}" to end digraph
//
ofile << "\n"
<< " }"
<< "\n";
//
// close dot file
//
ofile.close();
}
// Dump the instructions into a .g4 file ("<file>.g4").
// The dump is first rendered to a string and remembered in lastG4Asm. When
// vISA_DumpPassesSubset == 1 and the text equals the previously remembered
// one, no file is written.
void G4_Kernel::dumpG4Internal(const std::string &file) {
  std::stringstream rendered;
  dumpG4InternalTo(rendered);
  std::string currentAsm = rendered.str();

  const bool onlyWhenChanged =
      m_options->getuInt32Option(vISA_DumpPassesSubset) == 1;
  if (onlyWhenChanged && currentAsm == lastG4Asm) {
    return;
  }
  lastG4Asm = std::move(currentAsm);

  std::fstream g4File(file + ".g4", std::ios::out);
  vASSERT(!g4File.fail());
  dumpG4InternalTo(g4File);
}
void G4_Kernel::dumpG4InternalTo(std::ostream &os) {
if (name)
os << ".kernel " << name << "\n";
else
os << ".kernel\n";
for (const G4_Declare *d : Declares) {
static const int MIN_DECL = 34; // skip the built-in decls
if (d->getDeclId() > MIN_DECL) {
// os << d->getDeclId() << "\n";
d->emit(os);
}
}
os << "\n";
// Additional dumps for lit testing
os << "// simdSize = " << (int)simdSize.value << "\n";
os << "\n";
for (std::list<G4_BB *>::iterator it = fg.begin(); it != fg.end(); ++it) {
// Emit BB number
G4_BB *bb = (*it);
bb->writeBBId(os);
// Emit BB type
if (bb->getBBType()) {
os << " [" << bb->getBBTypeStr() << "] ";
}
os << "\tPreds: ";
for (auto pred : bb->Preds) {
pred->writeBBId(os);
os << " ";
}
os << "\tSuccs: ";
for (auto succ : bb->Succs) {
succ->writeBBId(os);
os << " ";
}
os << "\n";
bb->emit(os);
os << "\n\n";
} // bbs
}
// Emits the leading comment block of the device assembly listing to `os`:
// kernel name, platform and thread configuration, option strings,
// instruction count, RA type, optional git hash, spill statistics (when a JIT
// info object is available), private memory size, the declares, and finally a
// table describing where each kernel input is loaded ("// .inputs").
void G4_Kernel::emitDeviceAsmHeaderComment(std::ostream &os) {
  os << "//.kernel ";
  if (name != nullptr) {
    // some 3D kernels do not have a name
    os << name;
  }
#if !Release
  os << "\n"
     << "//.platform " << getGenxPlatformString();
  os << "\n"
     << "//.thread_config "
     << "numGRF=" << numRegTotal << ", numAcc=" << numAcc;
  // ", numSWSB=" is a continuation of the //.thread_config line, so it is
  // only emitted together with that line. (Previously it sat outside this
  // guard and would have been appended to the kernel name line instead.)
  if (fg.builder->hasSWSB()) {
    os << ", numSWSB=" << numSWSBTokens;
  }
#endif
  os << "\n"
     << "//.options_string \"" << m_options->getUserArgString().str() << "\"";
  os << "\n"
     << "//.full_options \"" << m_options->getFullArgString() << "\"";
  os << "\n"
     << "//.instCount " << asmInstCount;
  static const char *const RATypeString[]{RA_TYPE(STRINGIFY)};
  os << "\n//.RA type\t" << RATypeString[RAType];
  if (!m_options->getOption(vISA_skipGitHash))
    os << "\n//.git-hash " << GIT_COMMIT_HASH;

  // Register allocation statistics; each line is only printed when the
  // corresponding counter is non-zero.
  if (auto jitInfo = fg.builder->getJitInfo()) {
    if (jitInfo->stats.numGRFUsed != 0) {
      os << "\n"
         << "//.GRF count " << jitInfo->stats.numGRFUsed;
    }
    if (jitInfo->stats.spillMemUsed > 0) {
      os << "\n"
         << "//.spill size " << jitInfo->stats.spillMemUsed;
    }
    if (jitInfo->stats.numGRFSpillFillWeighted > 0) {
      os << "\n"
         << "//.spill GRF est. ref count "
         << jitInfo->stats.numGRFSpillFillWeighted;
    }
    if (jitInfo->stats.numFlagSpillStore > 0) {
      os << "\n//.spill flag store " << jitInfo->stats.numFlagSpillStore;
      os << "\n//.spill flag load " << jitInfo->stats.numFlagSpillLoad;
    }
  }
  auto privateMemSize = getInt32KernelAttr(Attributes::ATTR_SpillMemOffset);
  if (privateMemSize != 0) {
    os << "\n//.private memory size " << privateMemSize;
  }
  os << "\n\n";

  // emit declares
  for (auto dcl : Declares) {
    dcl->emit(os);
  }
  os << "\n";

  // Formats an int as "0x" followed by uppercase hex digits.
  auto fmtHex = [](int i) {
    std::stringstream ss;
    ss << "0x" << std::hex << std::uppercase << i;
    return ss.str();
  };

  // Collect the display name of every argument up front so the identifier
  // column can be sized to the longest one (but never narrower than 8).
  auto args = getArgumentLayout();
  const unsigned inputCount = (unsigned)args.size();
  std::vector<std::string> argNames;
  argNames.reserve(inputCount);
  size_t maxNameLen = 8;
  for (unsigned ix = 0; ix < inputCount; ix++) {
    const ArgLayout &a = args[ix];
    std::stringstream ss;
    if (a.decl && a.decl->getName()) {
      ss << a.decl->getName();
    } else {
      ss << "__unnamed" << (ix + 1);
    }
    argNames.push_back(ss.str());
    maxNameLen = std::max(maxNameLen, argNames.back().size());
  }

  // emit input location and size
  os << "// .inputs\n";
  const size_t COLW_IDENT = maxNameLen;
  static const size_t COLW_TYPE = 8;
  static const size_t COLW_SIZE = 6;
  static const size_t COLW_AT = 8;    // e.g. "r16+0x20"
  static const size_t COLW_FROM = 16; // e.g. "inline+0x20"

  // Horizontal border: "+----+----+..." with each run two characters wider
  // than its column to account for the padding around " | ".
  std::stringstream bordss;
  bordss << "// ";
  bordss << '+';
  bordss << std::setfill('-') << std::setw(COLW_IDENT + 2) << "";
  bordss << '+';
  bordss << std::setfill('-') << std::setw(COLW_TYPE + 2) << "";
  bordss << '+';
  bordss << std::setfill('-') << std::setw(COLW_SIZE + 2) << "";
  bordss << '+';
  bordss << std::setfill('-') << std::setw(COLW_AT + 2) << "";
  bordss << '+';
  bordss << std::setfill('-') << std::setw(COLW_FROM + 2) << "";
  bordss << '+' << "\n";
  std::string border = bordss.str();

  os << border;
  os << "//"
     << " | " << std::left << std::setw(COLW_IDENT) << "id"
     << " | " << std::left << std::setw(COLW_TYPE) << "type"
     << " | " << std::right << std::setw(COLW_SIZE) << "bytes"
     << " | " << std::left << std::setw(COLW_AT) << "at"
     << " | " << std::left << std::setw(COLW_FROM) << "from"
     << " |"
     << "\n";
  os << border;

  const unsigned grfSize = getGRFSize();
  for (unsigned ix = 0; ix < inputCount; ix++) {
    const ArgLayout &a = args[ix];
    //
    os << "//";
    //
    // id
    os << " | " << std::left << std::setw(COLW_IDENT) << argNames[ix];
    //
    // type and length
    // e.g. :uq x 16
    const G4_Declare *dcl = a.decl;
    std::stringstream sstype;
    if (dcl) {
      switch (dcl->getElemType()) {
      case Type_B:
        sstype << ":b";
        break;
      case Type_W:
        sstype << ":w";
        break;
      case Type_D:
        sstype << ":d";
        break;
      case Type_Q:
        sstype << ":q";
        break;
      case Type_V:
        sstype << ":v";
        break;
      case Type_UB:
        sstype << ":ub";
        break;
      case Type_UW:
        sstype << ":uw";
        break;
      case Type_UD:
        sstype << ":ud";
        break;
      case Type_UQ:
        sstype << ":uq";
        break;
      case Type_UV:
        sstype << ":uv";
        break;
      //
      case Type_F:
        sstype << ":f";
        break;
      case Type_HF:
        sstype << ":hf";
        break;
      case Type_DF:
        sstype << ":df";
        break;
      case Type_NF:
        sstype << ":nf";
        break;
      case Type_BF:
        sstype << ":bf";
        break;
      default:
        // unknown element type: show its numeric value
        sstype << fmtHex((int)dcl->getElemType()) << "?";
        break;
      }
      if (dcl->getTotalElems() != 1)
        sstype << " x " << dcl->getTotalElems();
    } else {
      sstype << "?";
    }
    os << " | " << std::left << std::setw(COLW_TYPE) << sstype.str();
    //
    // size
    os << " | " << std::right << std::setw(COLW_SIZE) << fmtHex(a.size);
    // location: register number plus byte offset within the register
    unsigned reg = a.dstGrfAddr / grfSize, subRegBytes = a.dstGrfAddr % grfSize;
    std::stringstream ssloc;
    ssloc << "r" << reg;
    if (subRegBytes != 0)
      ssloc << "+" << fmtHex(subRegBytes);
    os << " | " << std::left << std::setw(COLW_AT) << ssloc.str();
    // from
    std::string from;
    switch (a.memSource) {
    case ArgLayout::MemSrc::CTI:
      from = "cti";
      break;
    case ArgLayout::MemSrc::PTI:
      from = "pti[tid]";
      break;
    case ArgLayout::MemSrc::INLINE:
      from = "inline";
      break;
    default:
      from = fmtHex(int(a.memSource)) + "?";
      break;
    }
    std::stringstream ssf;
    ssf << from;
    ssf << "+" << fmtHex(a.memOffset);
    os << " | " << std::left << std::setw(COLW_FROM) << ssf.str();
    //
    os << " |\n";
  }
  os << border << "\n";

  // NOTE(review): presumably resets the bank conflict counters so that the
  // statistics printed after instruction emission start from zero - confirm.
  if (getPlatformGeneration() < PlatformGen::XE) {
    fg.BCStats.clear();
  }
}
using BlockOffsets = std::map<int32_t, std::vector<std::string>>;
static BlockOffsets precomputeBlockOffsets(std::ostream &os, G4_Kernel &g4k,
const KernelView &kv) {
// pre-compute the PCs of each basic block
int32_t currPc = 0, lastInstSize = -1;
BlockOffsets blockOffsets;
for (BB_LIST_ITER itBB = g4k.fg.begin(); itBB != g4k.fg.end(); ++itBB) {
for (INST_LIST_ITER itInst = (*itBB)->begin(); itInst != (*itBB)->end();
++itInst) {
if ((*itInst)->isLabel()) {
// G4 treats labels as special instructions
const char *lbl = (*itInst)->getLabelStr();
if (lbl && *lbl) {
blockOffsets[currPc].emplace_back(lbl);
}
} else {
// we are looking at the next G4 instruction,
// but reached the end of the decode stream
if (lastInstSize == 0) {
os << "// ERROR: deducing G4 block PCs "
"(IGA decoded stream ends early); falling back to IGA labels\n";
blockOffsets.clear(); // fallback to IGA default labels
return blockOffsets;
}
lastInstSize = kv.getInstSize(currPc);
G4_INST *inst = (*itInst);
// For HW WA.
// In which, vISA may ask IGA to emit some additional instructions.
// For example, sync is used to make instruction aligned, and nop is
// used to support stepping in debugger.
// However, due to compaction, we might not know the exact location of
// the instruction, the sync instruction insertion has to happen during
// encoding, which is unknown for the instruction size of kernel in the
// decoding. That's the issue we have to make these changes.
if (inst->isCachelineAligned()) {
iga::Op opcode = kv.getOpcode(currPc);
// There could be multiple sync.nop instructions emitted by IGA to
// make the instruction aligned. Here we continue to advance PC when
// seeing sync.nop so that vISA inst and IGA inst could match again.
while (opcode == iga::Op::SYNC) {
currPc += lastInstSize;
opcode = kv.getOpcode(currPc);
lastInstSize = kv.getInstSize(currPc);
}
}
// When the inst requires an additional nop after it, again we need to
// advance PC to consume NOP to make vISA inst and IGA inst match later.
if (inst->requireNopAfter()) {
currPc += lastInstSize;
lastInstSize = kv.getInstSize(currPc);
vASSERT(kv.getOpcode(currPc) == iga::Op::NOP);
}
currPc += lastInstSize;
}
}
}
if (kv.getInstSize(currPc) != 0) {
// we are looking at the next G4 instruction,
// but reached the end of the decode stream
os << "// ERROR: deducing G4 block PCs "
"(G4_INST stream ends early); falling back to IGA labels\n";
blockOffsets.clear(); // fallback to IGA default labels
}
return blockOffsets;
}
// needs further cleanup (confirm label prefixes are gone, newAsm == true)
//
// Emits the program text to `os` by re-decoding the encoded `binary` with
// IGA and printing each decoded instruction next to the comment generated
// from the matching G4 instruction.
//
//   binary / binarySize - encoded kernel binary to decode
//
// A decode failure is reported on std::cerr and in `os`, after which emission
// still continues since partial output might be present.
void G4_Kernel::emitDeviceAsmInstructionsIga(std::ostream &os,
                                             const void *binary,
                                             uint32_t binarySize) {
  os << "\n";

  // Zero-filled buffer for the decoder's error text. Being a vector it is
  // released on every exit path, and printing it is well-defined even if the
  // decoder writes nothing into it.
  const size_t ERROR_STRING_MAX_LENGTH = 16 * 1024;
  std::vector<char> errBuf(ERROR_STRING_MAX_LENGTH, '\0');

  iga_gen_t igaPlatform = getIGAPlatform(getPlatform());
  const iga::Model *igaModel =
      iga::Model::LookupModel(iga::ToPlatform(igaPlatform));
  iga::SWSB_ENCODE_MODE swsbEncodeMode = igaModel->getSWSBEncodeMode();
  KernelView kv(igaPlatform, binary, binarySize, swsbEncodeMode, errBuf.data(),
                errBuf.size());
  if (!kv.decodeSucceeded()) {
    const char *MSG =
        "vISA asm emission: failed to re-decode binary for asm output\n";
    errBuf.back() = '\0'; // make sure the text is terminated before printing
    // trb: do we really need to clobber std::cerr from a driver?
    // Shader dump output will have the message.
    std::cerr << MSG;
    std::cerr << errBuf.data() << "\n";
    os << MSG;
    os << errBuf.data() << "\n";
    // still continue since partial output might be present
  }

  const auto blockOffsets = precomputeBlockOffsets(os, *this, kv);

  //
  // Generate a label with uniqueLabel as prefix (required by some tools).
  // We do so by using labeler callback. If uniqueLabels is not present, use
  // iga's default label. For example,
  // Without option -uniqueLabels:
  // generating default label, L1234
  // With option -uniqueLabels <sth>:
  // generating label with <sth> as prefix, <sth>_L1234
  //
  std::string labelPrefix;
  if (m_options->getOption(vISA_UniqueLabels)) {
    const char *labelPrefixC = nullptr;
    m_options->getOption(vISA_LabelStr, labelPrefixC);
    // Assigning a null pointer to std::string is undefined behaviour, so a
    // missing label string is treated as an empty prefix.
    if (labelPrefixC != nullptr)
      labelPrefix = labelPrefixC;
    if (!labelPrefix.empty())
      labelPrefix += '_';
  }

  // State handed to the IGA labeler callback through its void* argument.
  struct LabelerState {
    const KernelView *kv;
    const BlockOffsets &blockOffsets;
    const std::string labelPrefix;
    std::string labelStorage; // backs the pointer returned by the labeler
    LabelerState(const KernelView *_kv, const BlockOffsets &offs,
                 const std::string &lblPfx)
        : kv(_kv), blockOffsets(offs), labelPrefix(lblPfx) {}
  };
  LabelerState ls(&kv, blockOffsets, labelPrefix);

  // Returns the label for `pc`: the prefix followed by the first G4 label
  // recorded at that PC, or by IGA's default label name when there is none.
  // The returned pointer refers to ls.labelStorage and is only valid until
  // the next call.
  auto labeler = [](int32_t pc, void *data) -> const char * {
    LabelerState &ls = *(LabelerState *)data;
    ls.labelStorage = ls.labelPrefix;
    auto itr = ls.blockOffsets.find(pc);
    if (itr == ls.blockOffsets.end()) {
      // let IGA choose the label name, but we still have to prefix
      // our user provided prefix
      char igaDefaultLabel[128];
      ls.kv->getDefaultLabelName(pc, igaDefaultLabel, sizeof(igaDefaultLabel));
      ls.labelStorage += igaDefaultLabel;
      return ls.labelStorage.c_str();
    }
    std::string g4Label = itr->second.front().c_str();
    ls.labelStorage += g4Label;
    return ls.labelStorage.c_str();
  };

  // initialize register suppression info
  // (only the first three suppressRegs entries are set to -1; the remaining
  // two keep their zero initialization)
  int suppressRegs[5] = {};
  int lastRegs[3] = {};
  for (int i = 0; i < 3; i++) {
    suppressRegs[i] = -1;
    lastRegs[i] = -1;
  }

  ////////////////////////////////////////
  // emit the program text (instructions) iteratively
  // this is a little tricky because G4 treats labels as instructions
  // thus we need to do a little checking to keep the two streams in sync
  int32_t pc = 0;
  std::vector<char> igaStringBuffer;
  igaStringBuffer.resize(512); // TODO: expand default after testing

  // printedLabels - tracks the labels that have already been printed at a
  // given pc to avoid printing the same label twice at the same pc. This can
  // happen when there's an empty BB that contains only labels. The BB and the
  // following BB will both print those labels. The pair is the pc to label
  // name pair.
  std::set<std::pair<int32_t, std::string>> printedLabels;
  // tryPrintLabel - check if the given label is already printed with the
  // given pc. Print it if not, and skip it if yes.
  auto tryPrintLabel = [&os, &printedLabels](int32_t label_pc,
                                             const std::string &label_name) {
    // insert() reports whether the pair was new
    if (printedLabels.insert(std::make_pair(label_pc, label_name)).second)
      os << label_name << ":\n";
  };

  // Formatting options do not depend on the instruction, so compute them once.
  uint32_t fmtOpts =
      IGA_FORMATTING_OPTS_DEFAULT | IGA_FORMATTING_OPT_PRINT_BFNEXPRS;
  if (getOption(vISA_PrintHexFloatInAsm))
    fmtOpts |= IGA_FORMATTING_OPT_PRINT_HEX_FLOATS;
  if (!getOption(vISA_noLdStAsmSyntax))
    fmtOpts |= IGA_FORMATTING_OPT_PRINT_LDST;

  // Formats the decoded instruction at `instPc` and writes it to `out`,
  // padded to 100 columns so comments line up on most instructions.
  auto formatToInstToStream = [&](int32_t instPc, std::ostream &out) {
    // multiple calls to getInstSyntax since we may have to
    // dynamically resize buffer
    while (true) {
      size_t nw =
          kv.getInstSyntax(instPc, igaStringBuffer.data(),
                           igaStringBuffer.size(), fmtOpts, labeler, &ls);
      if (nw == 0) {
        // std::hex / std::uppercase are sticky; restore the stream's flags
        // so later numeric output is not silently switched to hex.
        const std::ios_base::fmtflags savedFlags = out.flags();
        out << "<<error formatting instruction at "
               "PC 0x"
            << std::uppercase << std::hex << instPc << ">>\n";
        out.flags(savedFlags);
        break;
      } else if (nw <= igaStringBuffer.size()) {
        // print it (pad it out so comments line up on most instructions)
        std::string line = igaStringBuffer.data();
        while (line.size() < 100)
          line += ' ';
        out << line;
        break;
      } else {
        igaStringBuffer.resize(igaStringBuffer.size() + 512);
        // try again
      }
    }
  };

  for (BB_LIST_ITER itBB = fg.begin(); itBB != fg.end(); ++itBB) {
    os << "// ";
    (*itBB)->emitBbInfo(os);
    os << "\n";
    for (INST_LIST_ITER itInst = (*itBB)->begin(); itInst != (*itBB)->end();
         ++itInst) {
      G4_INST *i = (*itInst);

      // walk to next non-label in this block;
      // return true if we find one, else fails if at end of block
      auto findNextNonLabel = [&](bool print) {
        while ((*itInst)->isLabel()) {
          if (print)
            os << "// " << (*itInst)->getLabelStr() << ":\n";
          itInst++;
          if (itInst == (*itBB)->end())
            break;
        }
        if (itInst == (*itBB)->end())
          return false;
        i = (*itInst);
        return true;
      };

      bool isInstTarget = kv.isInstTarget(pc);
      if (isInstTarget) {
        auto itr = ls.blockOffsets.find(pc);
        if (itr == ls.blockOffsets.end()) {
          std::string labelname(labeler(pc, &ls));
          tryPrintLabel(pc, labelname);
        } else {
          // there can be multiple labels per PC
          for (const std::string &lbl : itr->second) {
            std::string labelname(ls.labelPrefix + lbl);
            tryPrintLabel(pc, labelname);
          }
        }
        if (!findNextNonLabel(false)) {
          break; // at end of block
        }
      } else if (i->isLabel()) {
        // IGA doesn't consider this PC to be a label but G4 does
        //
        // move forward until we find the next non-label
        if (!findNextNonLabel(true)) {
          break; // at end of block
        }
      }

      ///////////////////////////////////////////////////////////////////
      // we are looking at a non-label G4_INST at the next valid IGA PC
      // (same instruction)
      if (!getOptions()->getOption(vISA_disableInstDebugInfo)) {
        (*itBB)->emitInstructionSourceLineMapping(os, itInst);
      }

      // Advance PC when the vISA instruction needs to be cacheline-aligned or
      // requires a Nop after. See comments in precomputeBlockOffsets for
      // details.
      if (i->isCachelineAligned()) {
        iga::Op opcode = kv.getOpcode(pc);
        while (opcode == iga::Op::SYNC) {
          formatToInstToStream(pc, os);
          os << "\n";
          pc += kv.getInstSize(pc);
          opcode = kv.getOpcode(pc);
        }
      }
      if (i->requireNopAfter()) {
        formatToInstToStream(pc, os);
        os << "\n";
        pc += kv.getInstSize(pc);
        vASSERT(kv.getOpcode(pc) == iga::Op::NOP);
      }

      formatToInstToStream(pc, os);
      (*itBB)->emitBasicInstructionComment(os, itInst, suppressRegs, lastRegs,
                                           pc);
      os << "\n";
      pc += kv.getInstSize(pc);
    } // for insts in block
  } // for blocks
} // emitDeviceAsmInstructionsIga
// Should be removed once we can confirm no one uses it
// the output comes from G4_INST::... and almost certainly won't be
// parsable by IGA
void G4_Kernel::emitDeviceAsmInstructionsOldAsm(std::ostream &os) {
os << "\n"
<< ".code";
for (BB_LIST_ITER it = fg.begin(); it != fg.end(); ++it) {
os << "\n";
(*it)->emit(os);
}
// Step4: emit clean-up.
os << "\n";
os << ".end_code"
<< "\n";
os << ".end_kernel"
<< "\n";
os << "\n";
}
// Returns the basic block that lexically follows `bb` in the flow graph's
// block list, or nullptr when `bb` is null, is the last block, or is not
// found in the list.
G4_BB *G4_Kernel::getNextBB(G4_BB *bb) const {
  if (bb == nullptr)
    return nullptr;

  for (auto it = fg.cbegin(), ie = fg.cend(); it != ie; ++it) {
    if (*it != bb)
      continue;
    // Found it: the answer is whatever comes right after, if anything.
    ++it;
    return it != ie ? *it : nullptr;
  }
  return nullptr;
}
// Returns the gen offset of the first instruction reported by
// getFirstInst() for `bb`. If `bb` has none, the lexically following blocks
// are searched in order. Returns 0 when `bb` is null or no such instruction
// is found.
unsigned G4_Kernel::getBinOffsetOfBB(G4_BB *bb) const {
  for (G4_BB *cur = bb; cur != nullptr; cur = getNextBB(cur)) {
    if (G4_INST *firstInst = cur->getFirstInst()) {
      return (unsigned)firstInst->getGenOffset();
    }
  }
  return 0;
}
// Returns the binary offset of the block following the per-thread payload
// block, or 0 when the kernel has no per-thread payload block.
unsigned G4_Kernel::getPerThreadNextOff() const {
  if (hasPerThreadPayloadBB()) {
    return getBinOffsetOfBB(getNextBB(perThreadPayloadBB));
  }
  return 0;
}
// Returns the binary offset of the block following the cross-thread payload
// block, or 0 when the kernel has no cross-thread payload block.
unsigned G4_Kernel::getCrossThreadNextOff() const {
  if (hasCrossThreadPayloadBB()) {
    return getBinOffsetOfBB(getNextBB(crossThreadPayloadBB));
  }
  return 0;
}
// Returns the binary offset of the second entry (GP1), i.e. of the
// computeFFIDGP1 block, or 0 when there is no compute FFID prolog.
unsigned G4_Kernel::getComputeFFIDGPNextOff() const {
  if (hasComputeFFIDProlog()) {
    // the first instruction in the second BB is the start of the second entry
    vISA_ASSERT(fg.getNumBB() > 1, "expect at least one prolog BB");
    vASSERT(!computeFFIDGP1->empty() && !computeFFIDGP1->front()->isLabel());
    return getBinOffsetOfBB(computeFFIDGP1);
  }
  return 0;
}
unsigned G4_Kernel::getComputeFFIDGP1NextOff() const {
if (!hasComputeFFIDProlog())
return 0;
// return the offset of the BB next to GP1
// the first instruction in the second BB is the start of the second entry
vISA_ASSERT(fg.getNumBB() > 1, "expect at least one prolog BB");
G4_BB *next = getNextBB(computeFFIDGP1);
return getBinOffsetOfBB(next);
}
// Returns the builder's scalar register count times the scalar register size
// in bytes, divided by two (i.e. expressed in 2-byte words).
unsigned G4_Kernel::getSRFInWords() {
  const auto totalBytes = fg.builder->getNumScalarRegisters() *
                          fg.builder->getScalarRegisterSizeInBytes();
  return totalBytes / 2;
}
// GRF modes supported by HW
// There must be at least one Config that is VRTEnable for each platform
//
// Builds the per-platform table of GRF configurations, selects the default
// mode, and reads the lower/upper GRF bounds and the "force GRF mode up"
// value from the options.
GRFMode::GRFMode(const TARGET_PLATFORM plat, unsigned regSize, Options *op)
    : platform(plat), grfSize(regSize), options(op) {
  // Every Config below is <numGRF, numThreads, SWSBTokens, numAcc>.
  switch (platform) {
  case Xe_XeHPSDV:
  case Xe_DG2:
  case Xe_MTL:
  case Xe_ARL:
    configs = {Config(128, 8, 16, 4), Config(256, 4, 16, 8)};
    defaultMode = 0;
    break;
  case Xe_PVC:
  case Xe_PVCXT:
  case Xe2:
    configs = {Config(128, 8, 16, 4), Config(256, 4, 32, 8)};
    defaultMode = 0;
    break;
  case Xe3:
    configs = {Config(32, 10, 32, 4),  Config(64, 10, 32, 4),
               Config(96, 10, 32, 4),  Config(128, 8, 32, 4),
               Config(160, 6, 32, 4),  Config(192, 5, 32, 4),
               Config(256, 4, 32, 8)};
    defaultMode = 3;
    break;
  default:
    // platforms <= TGL
    configs = {Config(128, 7, 16, 2)};
    defaultMode = 0;
    break;
  }
  currentMode = defaultMode;

  // Set lower bound GRF: the option value when given, else the smallest
  // config.
  const unsigned minGRF = op->getuInt32Option(vISA_MinGRFNum);
  if (minGRF > 0)
    lowerBoundGRF = minGRF;
  else
    lowerBoundGRF = configs.front().numGRF;
  vISA_ASSERT(isValidNumGRFs(lowerBoundGRF),
              "Invalid lower bound for GRF number");

  // Set upper bound GRF: the option value when given, else the largest
  // config.
  const unsigned maxGRF = op->getuInt32Option(vISA_MaxGRFNum);
  if (maxGRF > 0)
    upperBoundGRF = maxGRF;
  else
    upperBoundGRF = configs.back().numGRF;
  vISA_ASSERT(isValidNumGRFs(upperBoundGRF),
              "Invalid upper bound for GRF number");

  // Select higher GRF
  GRFModeUpValue = op->getuInt32Option(vISA_ForceGRFModeUp);
  vISA_ASSERT(GRFModeUpValue >= 0 && GRFModeUpValue <= configs.size(),
              "Invalid value for selecting a higher GRF mode");
}
unsigned GRFMode::setModeByRegPressure(unsigned maxRP, unsigned largestInputReg,
bool forceGRFModeUp) {
unsigned size = configs.size(), i = 0;
bool spillAllowed = getSpillThreshold() > 0;
unsigned spillThresholdInRegs = getSpillThreshold() / grfSize;
// find appropiate GRF based on reg pressure
for (; i < size; i++) {
if (configs[i].VRTEnable && configs[i].numGRF >= lowerBoundGRF &&
configs[i].numGRF <= upperBoundGRF) {
currentMode = i;
if (maxRP <= configs[i].numGRF &&
// Check that we've at least 8 GRFs over and above
// those blocked for kernel input. This helps cases
// where an 8 GRF variable shows up in entry BB.
(largestInputReg + 8) <= configs[i].numGRF) {
if (forceGRFModeUp && GRFModeUpValue > 0) {
// Check if user is force a higher GRF mode
unsigned newGRFMode = currentMode + GRFModeUpValue;
unsigned maxGRFMode = getMaxGRFMode();
currentMode = newGRFMode < maxGRFMode ? newGRFMode : maxGRFMode;
}
if (spillAllowed && !hasSmallerGRFSameThreads() && currentMode > 0) {
unsigned lowerGRFNum = getSmallerGRF();
// Select a lower GRF number in PreRA in case the register
// pressure computed is a bit higher (e.g. 4%) than the lower GRF
// config. If spills are detected, RA will still bump up the GRF
// number to avoid them.
// For example, if reg pressure is 165, we select 160GRF since
// we have spill threshold enabled and the diff between 165 and 160
// is less than 4%.
if ((lowerGRFNum * 1.04 >= maxRP ||
configs[currentMode].numGRF == getMaxGRF()) &&
lowerGRFNum >= (largestInputReg + 8) &&
lowerGRFNum >= lowerBoundGRF)
setModeByNumGRFs(lowerGRFNum);
}
return configs[currentMode].numGRF;
} else if (spillAllowed &&
maxRP <= configs[i].numGRF + spillThresholdInRegs &&
(largestInputReg + 8) <= configs[i].numGRF) {
return configs[currentMode].numGRF;
}
}
}
// RP is greater than the maximum GRF available, so set the largest GRF
// available
return configs[currentMode].numGRF;
}
// Check if next larger GRF has the same number of threads per EU.
// Returns false when the current mode is the last config or the next config
// is not VRT-enabled.
bool GRFMode::hasLargerGRFSameThreads() const {
  const unsigned nextIdx = currentMode + 1;
  const bool nextUsable =
      nextIdx != configs.size() && configs[nextIdx].VRTEnable;
  return nextUsable &&
         configs[currentMode].numThreads == configs[nextIdx].numThreads;
}
// Check if next smaller GRF has the same number of threads per EU.
// Returns false when the current mode is the first config or the previous
// config is not VRT-enabled.
bool GRFMode::hasSmallerGRFSameThreads() const {
  const int prevIdx = currentMode - 1;
  const bool prevUsable = prevIdx >= 0 && configs[prevIdx].VRTEnable;
  return prevUsable &&
         configs[currentMode].numThreads == configs[prevIdx].numThreads;
}
// Get spill threshold for current GRF mode.
// Returns 0 on platforms before Xe3 and for modes below 128 GRFs. For the
// 256 GRF mode, vISA_SpillAllowed256GRF takes precedence when it is non-zero;
// otherwise vISA_SpillAllowed is returned.
unsigned GRFMode::getSpillThreshold() const {
  // FIXME: currently spill thresholds for <128GRF are
  // causing some performance regressions. We need more
  // study to define proper thresholds for this range.
  if (platform < Xe3 || configs[currentMode].numGRF < 128)
    return 0;

  if (configs[currentMode].numGRF == 256) {
    const unsigned threshold256 =
        options->getuInt32Option(vISA_SpillAllowed256GRF);
    if (threshold256 > 0)
      return threshold256;
  }
  return options->getuInt32Option(vISA_SpillAllowed);
}