mirror of
https://github.com/intel/intel-graphics-compiler.git
synced 2025-11-04 08:21:06 +08:00
2267 lines
70 KiB
C++
2267 lines
70 KiB
C++
/*========================== begin_copyright_notice ============================
|
|
|
|
Copyright (C) 2021 Intel Corporation
|
|
|
|
SPDX-License-Identifier: MIT
|
|
|
|
============================= end_copyright_notice ===========================*/
|
|
|
|
#include "G4_Kernel.hpp"
|
|
#include "BinaryEncodingIGA.h"
|
|
#include "BuildIR.h"
|
|
#include "Common_ISA_framework.h"
|
|
#include "DebugInfo.h"
|
|
#include "G4_BB.hpp"
|
|
#include "KernelCost.hpp"
|
|
#include "VISAKernel.h"
|
|
#include "VarSplit.h"
|
|
#include "iga/IGALibrary/Models/Models.hpp"
|
|
#include "iga/IGALibrary/api/kv.hpp"
|
|
#include "visa_wa.h"
|
|
|
|
#include <fstream>
|
|
#include <functional>
|
|
#include <iomanip>
|
|
#include <list>
|
|
#include <utility>
|
|
|
|
using namespace vISA;
|
|
|
|
void *gtPinData::getFreeGRFInfo(unsigned &size) {
|
|
// Here is agreed upon format for reporting free GRFs:
|
|
// struct freeBytes
|
|
//{
|
|
// unsigned short startByte;
|
|
// unsigned short numConsecutiveBytes;
|
|
//};
|
|
|
|
// Added magic 0xDEADD00D at start and
|
|
// magic 0xDEADBEEF at the end of buffer
|
|
// on request of gtpin team.
|
|
//
|
|
// struct freeGRFInfo
|
|
//{
|
|
// unsigned short numItems;
|
|
//
|
|
// freeBytes data[numItems];
|
|
//};
|
|
struct freeBytes {
|
|
unsigned short startByte;
|
|
unsigned short numConsecutiveBytes;
|
|
};
|
|
|
|
struct freeGRFInfo {
|
|
unsigned int magicStart;
|
|
unsigned int numItems;
|
|
};
|
|
|
|
// Compute free register information using vector for efficiency,
|
|
// then convert to POS for passing back to gtpin.
|
|
std::vector<std::pair<unsigned short, unsigned short>> vecFreeBytes;
|
|
|
|
for (auto byte : globalFreeRegs) {
|
|
if (vecFreeBytes.size() > 0) {
|
|
auto &lastFree = vecFreeBytes.back();
|
|
if (byte == (lastFree.first + lastFree.second)) {
|
|
lastFree.second += 1;
|
|
} else {
|
|
vecFreeBytes.push_back(std::make_pair(byte, 1));
|
|
}
|
|
} else {
|
|
vecFreeBytes.push_back(std::make_pair(byte, 1));
|
|
}
|
|
}
|
|
|
|
// Now convert vector to POS
|
|
unsigned int numItems = (unsigned int)vecFreeBytes.size();
|
|
freeGRFInfo *buffer = (freeGRFInfo *)malloc(
|
|
numItems * sizeof(freeBytes) + sizeof(unsigned int) +
|
|
sizeof(unsigned int) + sizeof(unsigned int));
|
|
if (buffer) {
|
|
buffer->numItems = numItems;
|
|
buffer->magicStart = 0xDEADD00D;
|
|
memcpy_s((unsigned char *)buffer + sizeof(unsigned int) +
|
|
sizeof(unsigned int),
|
|
numItems * sizeof(freeBytes), vecFreeBytes.data(),
|
|
numItems * sizeof(freeBytes));
|
|
unsigned int magicEnd = 0xDEADBEEF;
|
|
memcpy_s((unsigned char *)buffer + sizeof(unsigned int) +
|
|
sizeof(unsigned int) + (numItems * sizeof(freeBytes)),
|
|
sizeof(magicEnd), &magicEnd, sizeof(magicEnd));
|
|
|
|
// numItems - unsigned int
|
|
// magicStart - unsigned int
|
|
// magicEnd - unsigned int
|
|
// data - numItems * sizeof(freeBytes)
|
|
size = sizeof(unsigned int) + sizeof(unsigned int) + sizeof(unsigned int) +
|
|
(numItems * sizeof(freeBytes));
|
|
}
|
|
|
|
return buffer;
|
|
}
|
|
|
|
void gtPinData::setGTPinInit(void *buffer) {
|
|
vISA_ASSERT(sizeof(gtpin::igc::igc_init_t) <= 200,
|
|
"Check size of igc_init_t");
|
|
gtpin_init = (gtpin::igc::igc_init_t *)buffer;
|
|
|
|
// reRA pass is no longer supported.
|
|
// FIXME: should we assert here?
|
|
// if (gtpin_init->re_ra)
|
|
if (gtpin_init->grf_info)
|
|
kernel.getOptions()->setOption(vISA_GetFreeGRFInfo, true);
|
|
}
|
|
|
|
template <class T>
|
|
void write(void *buffer, const T &data, unsigned int &offset) {
|
|
memcpy_s((char *)buffer + offset, sizeof(T), &data, sizeof(T));
|
|
offset += sizeof(T);
|
|
}
|
|
|
|
void *gtPinData::getIndirRefs(unsigned int &size) {
|
|
// Store indirect access per %ip
|
|
// %ip -> vector[start byte, size]
|
|
std::map<unsigned int, std::vector<std::pair<unsigned int, unsigned int>>>
|
|
indirRefMap;
|
|
|
|
// return %ip of first executable instruction in kernel
|
|
auto getIpOfFirstInst = [&]() {
|
|
unsigned int startIp = 0;
|
|
if (kernel.fg.getIsStackCallFunc()) {
|
|
for (auto bb : kernel.fg.getBBList()) {
|
|
if (startIp > 0)
|
|
break;
|
|
for (auto inst : bb->getInstList()) {
|
|
startIp = (unsigned int)inst->getGenOffset();
|
|
|
|
if (inst->isLabel())
|
|
continue;
|
|
|
|
// verify truncation is still legal
|
|
vISA_ASSERT(inst->getGenOffset() == (uint32_t)inst->getGenOffset(),
|
|
"%ip out of bounds");
|
|
|
|
if (startIp > 0)
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
return startIp;
|
|
};
|
|
|
|
unsigned int startIp = getIpOfFirstInst();
|
|
|
|
auto getIndirRefData = [&](G4_Declare *addr) {
|
|
// for given addr, return std::vector<std::pair<start byte, size>>
|
|
std::vector<std::pair<unsigned int, unsigned int>> indirs;
|
|
|
|
auto it = indirRefs.find(addr);
|
|
if (it == indirRefs.end())
|
|
return indirs;
|
|
|
|
for (auto target : (*it).second) {
|
|
if (target->isSpilled())
|
|
continue;
|
|
auto start = target->getGRFOffsetFromR0();
|
|
auto size = target->getByteSize();
|
|
indirs.push_back(std::make_pair(start, size));
|
|
}
|
|
return indirs;
|
|
};
|
|
|
|
for (auto bb : kernel.fg.getBBList()) {
|
|
// Kernel's CFG may be stitched together
|
|
// with that of its callees. We want to
|
|
// iterate over only those BBs that belong
|
|
// to current CFG.
|
|
if (&bb->getParent() != &kernel.fg)
|
|
break;
|
|
for (auto inst : bb->getInstList()) {
|
|
auto dst = inst->getDst();
|
|
if (dst && dst->isIndirect()) {
|
|
// encode dst indirect reference
|
|
auto indirs = getIndirRefData(dst->getTopDcl());
|
|
auto &mapEntry = indirRefMap[(uint32_t)inst->getGenOffset() - startIp];
|
|
mapEntry.insert(mapEntry.end(), indirs.begin(), indirs.end());
|
|
}
|
|
|
|
for (unsigned int i = 0; i != inst->getNumSrc(); ++i) {
|
|
auto src = inst->getSrc(i);
|
|
if (src && src->isSrcRegRegion() &&
|
|
src->asSrcRegRegion()->isIndirect()) {
|
|
// encode src indirect reference
|
|
auto indirs = getIndirRefData(src->asSrcRegRegion()->getTopDcl());
|
|
auto &mapEntry =
|
|
indirRefMap[(uint32_t)inst->getGenOffset() - startIp];
|
|
mapEntry.insert(mapEntry.end(), indirs.begin(), indirs.end());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
unsigned int numRanges = 0;
|
|
for (auto &item : indirRefMap) {
|
|
numRanges += item.second.size();
|
|
}
|
|
|
|
// see gtpin_IGC_interface.h for format of igc_token_indirect_access_info_t
|
|
size = sizeof(gtpin::igc::igc_token_indirect_access_info_t::num_ranges) +
|
|
numRanges * sizeof(gtpin::igc::ins_reg_range_t);
|
|
auto buffer = malloc(size);
|
|
unsigned int offset = 0;
|
|
write<uint32_t>(buffer, numRanges, offset);
|
|
for (auto &item : indirRefMap) {
|
|
for (const auto &arg : item.second) {
|
|
vISA_ASSERT(offset < size, "Out of bounds");
|
|
write<uint32_t>(buffer, item.first, offset);
|
|
vISA_ASSERT(offset < size, "Out of bounds");
|
|
write<uint16_t>(buffer, arg.first, offset);
|
|
vISA_ASSERT(offset < size, "Out of bounds");
|
|
write<uint16_t>(buffer, arg.second, offset);
|
|
}
|
|
}
|
|
|
|
vISA_ASSERT(offset == size, "Unexpected bounds");
|
|
|
|
return buffer;
|
|
}
|
|
|
|
template <typename T>
|
|
static void writeBuffer(std::vector<unsigned char> &buffer,
|
|
unsigned &bufferSize, const T *t, unsigned numBytes) {
|
|
const unsigned char *data = (const unsigned char *)t;
|
|
for (unsigned i = 0; i != numBytes; i++) {
|
|
buffer.push_back(data[i]);
|
|
}
|
|
bufferSize += numBytes;
|
|
}
|
|
|
|
void *gtPinData::getGTPinInfoBuffer(unsigned &bufferSize,
|
|
unsigned int scratchOffset) {
|
|
if (!gtpin_init && !gtpinInitFromL0) {
|
|
bufferSize = 0;
|
|
return nullptr;
|
|
}
|
|
gtpin::igc::igc_init_t t;
|
|
std::vector<unsigned char> buffer;
|
|
unsigned numTokens = 0;
|
|
auto stackABI =
|
|
kernel.fg.getIsStackCallFunc() || kernel.fg.getHasStackCalls();
|
|
bufferSize = 0;
|
|
|
|
memset(&t, 0, sizeof(t));
|
|
|
|
t.version = gtpin::igc::GTPIN_IGC_INTERFACE_VERSION;
|
|
t.igc_init_size = sizeof(t);
|
|
if (gtpinInitFromL0) {
|
|
if (!stackABI) {
|
|
if (kernel.getOption(vISA_GetFreeGRFInfo)) {
|
|
t.grf_info = 1;
|
|
numTokens++;
|
|
// indirect info
|
|
numTokens++;
|
|
}
|
|
|
|
if (kernel.getOption(vISA_GTPinReRA)) {
|
|
t.re_ra = 1;
|
|
}
|
|
} else {
|
|
// provide only indirect info for stack calls
|
|
if (kernel.getOption(vISA_GetFreeGRFInfo)) {
|
|
t.grf_info = 1;
|
|
numTokens++;
|
|
}
|
|
}
|
|
|
|
if (kernel.getOptions()->getOption(vISA_GenerateDebugInfo))
|
|
t.srcline_mapping = 1;
|
|
|
|
if (kernel.getOptions()->getuInt32Option(vISA_GTPinScratchAreaSize) > 0) {
|
|
t.scratch_area_size = getNumBytesScratchUse();
|
|
numTokens++;
|
|
}
|
|
|
|
if (!t.grf_info && kernel.getOptions()->getOption(vISA_GetFreeGRFInfo)) {
|
|
// this check is to report out indir references, irrespective of
|
|
// whether stack call is present.
|
|
t.grf_info = 1;
|
|
numTokens++;
|
|
}
|
|
} else {
|
|
t.version =
|
|
std::min(gtpin_init->version, gtpin::igc::GTPIN_IGC_INTERFACE_VERSION);
|
|
if (!stackABI) {
|
|
if (gtpin_init->grf_info) {
|
|
t.grf_info = 1;
|
|
numTokens++;
|
|
// indirect info
|
|
numTokens++;
|
|
}
|
|
|
|
if (gtpin_init->re_ra) {
|
|
t.re_ra = 1;
|
|
}
|
|
} else {
|
|
// provide only indirect info for stack calls
|
|
if (gtpin_init->grf_info) {
|
|
t.grf_info = 1;
|
|
numTokens++;
|
|
}
|
|
}
|
|
|
|
if (gtpin_init->srcline_mapping &&
|
|
kernel.getOptions()->getOption(vISA_GenerateDebugInfo))
|
|
t.srcline_mapping = 1;
|
|
|
|
if (gtpin_init->scratch_area_size > 0) {
|
|
t.scratch_area_size = gtpin_init->scratch_area_size;
|
|
numTokens++;
|
|
}
|
|
|
|
if (!t.grf_info && gtpin_init->grf_info) {
|
|
t.grf_info = 1;
|
|
numTokens++;
|
|
}
|
|
}
|
|
|
|
// For payload offsets
|
|
numTokens++;
|
|
|
|
// Report #GRFs
|
|
numTokens++;
|
|
|
|
writeBuffer(buffer, bufferSize, &t, sizeof(t));
|
|
writeBuffer(buffer, bufferSize, &numTokens, sizeof(uint32_t));
|
|
|
|
if (t.grf_info) {
|
|
if (!stackABI) {
|
|
// create token
|
|
void *rerabuffer = nullptr;
|
|
unsigned rerasize = 0;
|
|
|
|
rerabuffer = getFreeGRFInfo(rerasize);
|
|
|
|
gtpin::igc::igc_token_header_t th;
|
|
th.token = gtpin::igc::GTPIN_IGC_TOKEN::GTPIN_IGC_TOKEN_GRF_INFO;
|
|
th.token_size = sizeof(gtpin::igc::igc_token_header_t) + rerasize;
|
|
|
|
// write token and data to buffer
|
|
writeBuffer(buffer, bufferSize, &th, sizeof(th));
|
|
writeBuffer(buffer, bufferSize, rerabuffer, rerasize);
|
|
|
|
free(rerabuffer);
|
|
}
|
|
// report indir refs
|
|
void *indirRefs = nullptr;
|
|
unsigned int indirRefsSize = 0;
|
|
|
|
indirRefs = getIndirRefs(indirRefsSize);
|
|
|
|
gtpin::igc::igc_token_header_t th;
|
|
th.token =
|
|
gtpin::igc::GTPIN_IGC_TOKEN::GTPIN_IGC_TOKEN_INDIRECT_ACCESS_INFO;
|
|
th.token_size = sizeof(gtpin::igc::igc_token_header_t) + indirRefsSize;
|
|
|
|
// write token and data to buffer
|
|
writeBuffer(buffer, bufferSize, &th, sizeof(th));
|
|
writeBuffer(buffer, bufferSize, indirRefs, indirRefsSize);
|
|
|
|
free(indirRefs);
|
|
}
|
|
|
|
if (t.scratch_area_size) {
|
|
gtpin::igc::igc_token_scratch_area_info_t scratchSlotData;
|
|
scratchSlotData.scratch_area_size = t.scratch_area_size;
|
|
vISA_ASSERT(scratchOffset >= nextScratchFree, "scratch offset mismatch");
|
|
scratchSlotData.scratch_area_offset = scratchOffset;
|
|
|
|
// gtpin scratch slots are beyond spill memory
|
|
scratchSlotData.token = gtpin::igc::GTPIN_IGC_TOKEN_SCRATCH_AREA_INFO;
|
|
scratchSlotData.token_size = sizeof(scratchSlotData);
|
|
|
|
writeBuffer(buffer, bufferSize, &scratchSlotData, sizeof(scratchSlotData));
|
|
}
|
|
|
|
{
|
|
// Write payload offsets
|
|
gtpin::igc::igc_token_kernel_start_info_t offsets;
|
|
offsets.token = gtpin::igc::GTPIN_IGC_TOKEN_KERNEL_START_INFO;
|
|
offsets.per_thread_prolog_size = kernel.getPerThreadNextOff();
|
|
offsets.cross_thread_prolog_size =
|
|
kernel.getCrossThreadNextOff() - offsets.per_thread_prolog_size;
|
|
offsets.token_size = sizeof(offsets);
|
|
writeBuffer(buffer, bufferSize, &offsets, sizeof(offsets));
|
|
}
|
|
|
|
{
|
|
// Report num GRFs
|
|
gtpin::igc::igc_token_num_grf_regs_t numGRFs;
|
|
numGRFs.token = gtpin::igc::GTPIN_IGC_TOKEN_NUM_GRF_REGS;
|
|
numGRFs.token_size = sizeof(numGRFs);
|
|
numGRFs.num_grf_regs = kernel.getNumRegTotal();
|
|
writeBuffer(buffer, bufferSize, &numGRFs, sizeof(numGRFs));
|
|
}
|
|
|
|
void *gtpinBuffer = allocCodeBlock(bufferSize);
|
|
|
|
memcpy_s(gtpinBuffer, bufferSize, buffer.data(), bufferSize);
|
|
|
|
// Dump buffer with shader dumps
|
|
if (kernel.getOption(vISA_outputToFile)) {
|
|
std::string asmName = kernel.getOptions()->getOptionCstr(VISA_AsmFileName);
|
|
if (!asmName.empty()) {
|
|
const VISAKernelImpl *vKernel =
|
|
kernel.fg.builder->getParent()->getKernel(kernel.getName());
|
|
if (vKernel && vKernel->getIsFunction()) {
|
|
unsigned funcID = -1;
|
|
vKernel->GetFunctionId(funcID);
|
|
asmName += "_f" + std::to_string(funcID);
|
|
}
|
|
|
|
std::ofstream ofInit;
|
|
std::stringstream ssInit;
|
|
ssInit << asmName << ".gtpin_igc_init";
|
|
ofInit.open(ssInit.str(), std::ofstream::binary);
|
|
if (gtpin_init) {
|
|
ofInit.write((const char *)gtpin_init, sizeof(*gtpin_init));
|
|
}
|
|
ofInit.close();
|
|
|
|
std::ofstream ofInfo;
|
|
std::stringstream ssInfo;
|
|
ssInfo << asmName << ".gtpin_igc_info";
|
|
ofInfo.open(ssInfo.str(), std::ofstream::binary);
|
|
if (gtpinBuffer) {
|
|
ofInfo.write((const char *)gtpinBuffer, bufferSize);
|
|
}
|
|
ofInfo.close();
|
|
}
|
|
}
|
|
|
|
return gtpinBuffer;
|
|
}
|
|
|
|
void gtPinData::setScratchNextFree(unsigned next) {
|
|
nextScratchFree = ((next + kernel.numEltPerGRF<Type_UB>() - 1) /
|
|
kernel.numEltPerGRF<Type_UB>()) *
|
|
kernel.numEltPerGRF<Type_UB>();
|
|
}
|
|
|
|
unsigned int gtPinData::getScratchNextFree() const { return nextScratchFree; }
|
|
|
|
uint32_t gtPinData::getNumBytesScratchUse() const {
|
|
if (gtpin_init) {
|
|
return gtpin_init->scratch_area_size;
|
|
} else if (isGTPinInitFromL0()) {
|
|
return kernel.getOptions()->getuInt32Option(vISA_GTPinScratchAreaSize);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
G4_Kernel::G4_Kernel(const PlatformInfo &pInfo, INST_LIST_NODE_ALLOCATOR &alloc,
|
|
Mem_Manager &m, Options *options, Attributes *anAttr,
|
|
uint32_t funcId, unsigned char major, unsigned char minor)
|
|
: platformInfo(pInfo), m_options(options), m_kernelAttrs(anAttr),
|
|
m_function_id(funcId), RAType(RA_Type::UNKNOWN_RA), asmInstCount(0),
|
|
kernelID(0), fg(alloc, this, m), major_version(major),
|
|
minor_version(minor), grfMode(pInfo.platform, pInfo.grfSize, options) {
|
|
vISA_ASSERT(major < COMMON_ISA_MAJOR_VER || (major == COMMON_ISA_MAJOR_VER &&
|
|
minor <= COMMON_ISA_MINOR_VER),
|
|
"CISA version not supported by this JIT-compiler");
|
|
|
|
name = NULL;
|
|
hasAddrTaken = false;
|
|
kernelDbgInfo = nullptr;
|
|
if (options->getOption(vISAOptions::vISA_GetFreeGRFInfo) ||
|
|
options->getuInt32Option(vISAOptions::vISA_GTPinScratchAreaSize)) {
|
|
allocGTPinData();
|
|
} else {
|
|
gtPinInfo = nullptr;
|
|
}
|
|
autoGRFSelection = m_options->getOption(vISA_AutoGRFSelection);
|
|
// NoMask WA
|
|
m_EUFusionNoMaskWAInfo = nullptr;
|
|
|
|
setKernelParameters();
|
|
}
|
|
|
|
G4_Kernel::~G4_Kernel() {
|
|
if (kernelDbgInfo) {
|
|
kernelDbgInfo.reset();
|
|
}
|
|
|
|
if (gtPinInfo) {
|
|
gtPinInfo.reset();
|
|
}
|
|
|
|
Declares.clear();
|
|
}
|
|
|
|
void G4_Kernel::computeChannelSlicing() {
|
|
G4_ExecSize simdSize = getSimdSize();
|
|
channelSliced = true;
|
|
|
|
if (simdSize == g4::SIMD8 || simdSize == g4::SIMD16) {
|
|
// SIMD8/16 kernels are not sliced
|
|
channelSliced = false;
|
|
return;
|
|
}
|
|
|
|
if (simdSize == g4::SIMD32 && numEltPerGRF<Type_UB>() >= 64) {
|
|
// For 64 bytes GRF, simd32 kernel, there is no slicing
|
|
channelSliced = false;
|
|
return;
|
|
}
|
|
// .dcl V1 size = 128 bytes
|
|
// op (16|M0) V1(0,0) ..
|
|
// op (16|M16) V1(2,0) ..
|
|
// For above sequence, return 32. Instruction
|
|
// is broken in to 2 only due to hw restriction.
|
|
// Allocation of dcl is still as if it were a
|
|
// SIMD32 kernel.
|
|
|
|
// Store emask bits that are ever used to define a variable
|
|
std::unordered_map<G4_Declare *, std::bitset<32>> emaskRef;
|
|
for (auto bb : fg) {
|
|
for (auto inst : *bb) {
|
|
if (inst->isSend())
|
|
continue;
|
|
|
|
auto dst = inst->getDst();
|
|
if (!dst || !dst->getTopDcl() || dst->getHorzStride() != 1)
|
|
continue;
|
|
|
|
if (inst->isWriteEnableInst())
|
|
continue;
|
|
|
|
auto regFileKind = dst->getTopDcl()->getRegFile();
|
|
if (regFileKind != G4_RegFileKind::G4_GRF &&
|
|
regFileKind != G4_RegFileKind::G4_INPUT)
|
|
continue;
|
|
|
|
if (dst->getTopDcl()->getByteSize() <=
|
|
dst->getTypeSize() * (unsigned)simdSize)
|
|
continue;
|
|
|
|
auto emaskOffStart = inst->getMaskOffset();
|
|
|
|
// Reset all bits on first encounter of dcl
|
|
if (emaskRef.find(dst->getTopDcl()) == emaskRef.end())
|
|
emaskRef[dst->getTopDcl()].reset();
|
|
|
|
// Set bits based on which EM bits are used in the def
|
|
for (unsigned i = emaskOffStart;
|
|
i != (emaskOffStart + inst->getExecSize()); i++) {
|
|
emaskRef[dst->getTopDcl()].set(i);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Check whether any variable's emask usage straddles across lower and upper
|
|
// 16 bits
|
|
for (auto &emRefs : emaskRef) {
|
|
auto &bits = emRefs.second;
|
|
auto num = bits.to_ulong();
|
|
|
|
// Check whether any lower 16 and upper 16 bits are set
|
|
if (((num & 0xffff) != 0) && ((num & 0xffff0000) != 0)) {
|
|
channelSliced = false;
|
|
return;
|
|
}
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
void G4_Kernel::calculateSimdSize() {
|
|
// Iterate over all instructions in kernel to check
|
|
// whether default execution size of kernel is
|
|
// SIMD8/16. This is required for knowing alignment
|
|
// to use for GRF candidates.
|
|
|
|
// only do it once per kernel, as we should not introduce inst with larger
|
|
// simd size than in the input
|
|
if (simdSize.value != 0) {
|
|
return;
|
|
}
|
|
|
|
// First, get simdsize from attribute (0 : not given)
|
|
// If not 0|8|16|32, wrong value from attribute.
|
|
simdSize = G4_ExecSize(
|
|
(unsigned)m_kernelAttrs->getInt32KernelAttr(Attributes::ATTR_SimdSize));
|
|
if (simdSize != g4::SIMD8 && simdSize != g4::SIMD16 &&
|
|
simdSize != g4::SIMD32) {
|
|
vISA_ASSERT(simdSize.value == 0,
|
|
"vISA: wrong value for SimdSize attribute");
|
|
// pvc+: simd16; simd8 otherwise
|
|
simdSize = fg.builder->getNativeExecSize();
|
|
|
|
for (auto bb : fg) {
|
|
for (auto inst : *bb) {
|
|
// do not consider send since for certain messages we have to set its
|
|
// execution size to 16 even in simd8 shaders
|
|
// Also skip noMask inst
|
|
if (!inst->isLabel() && !inst->isSend() && !inst->isWriteEnableInst()) {
|
|
uint32_t size = inst->getMaskOffset() + inst->getExecSize();
|
|
if (size > 16) {
|
|
simdSize = g4::SIMD32;
|
|
break;
|
|
} else if (size > 8) {
|
|
simdSize = g4::SIMD16;
|
|
}
|
|
}
|
|
}
|
|
if (simdSize == g4::SIMD32)
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (GlobalRA::useGenericAugAlign(getPlatformGeneration()))
|
|
computeChannelSlicing();
|
|
}
|
|
|
|
//
|
|
// Updates kernel's related structures to large GRF
|
|
//
|
|
bool G4_Kernel::updateKernelToLargerGRF() {
|
|
if (numRegTotal == grfMode.getMaxGRF())
|
|
return false;
|
|
|
|
// Scale number of GRFs, Acc, SWSB tokens.
|
|
setKernelParameters(grfMode.moveToLargerGRF());
|
|
fg.builder->rebuildPhyRegPool(getNumRegTotal());
|
|
return true;
|
|
}
|
|
|
|
//
|
|
// Updates kernel's related structures based on register pressure
|
|
//
|
|
void G4_Kernel::updateKernelByRegPressure(unsigned regPressure,
|
|
bool forceGRFModeUp) {
|
|
unsigned largestInputReg = getLargestInputRegister();
|
|
if (m_kernelAttrs->isKernelAttrSet(Attributes::ATTR_MaxRegThreadDispatch)) {
|
|
unsigned maxRegPayloadDispatch = m_kernelAttrs->getInt32KernelAttr(
|
|
Attributes::ATTR_MaxRegThreadDispatch);
|
|
largestInputReg = std::max(largestInputReg, maxRegPayloadDispatch);
|
|
}
|
|
|
|
unsigned newGRF = grfMode.setModeByRegPressure(regPressure, largestInputReg,
|
|
forceGRFModeUp);
|
|
|
|
if (newGRF == numRegTotal)
|
|
return;
|
|
|
|
// Scale number of threads, Acc, SWSB tokens.
|
|
setKernelParameters(newGRF);
|
|
|
|
// Update physical register pool
|
|
fg.builder->rebuildPhyRegPool(getNumRegTotal());
|
|
}
|
|
|
|
//
|
|
// Updates kernel's related structures based on NumGRF attribute
|
|
//
|
|
bool G4_Kernel::updateKernelFromNumGRFAttr() {
|
|
unsigned attrNumGRF =
|
|
m_kernelAttrs->getInt32KernelAttr(Attributes::ATTR_NumGRF);
|
|
if (attrNumGRF != 0 && !grfMode.isValidNumGRFs(attrNumGRF))
|
|
return false;
|
|
if (numRegTotal == attrNumGRF)
|
|
return true;
|
|
|
|
autoGRFSelection = (attrNumGRF == 0);
|
|
// Scale number of GRFs, Acc, SWSB tokens.
|
|
setKernelParameters(attrNumGRF);
|
|
fg.builder->rebuildPhyRegPool(getNumRegTotal());
|
|
return true;
|
|
}
|
|
|
|
//
|
|
// Evaluate AddrExp/AddrExpList to Imm
|
|
//
|
|
void G4_Kernel::evalAddrExp() {
|
|
for (std::list<G4_BB *>::iterator it = fg.begin(), itEnd = fg.end();
|
|
it != itEnd; ++it) {
|
|
G4_BB *bb = (*it);
|
|
|
|
for (INST_LIST_ITER i = bb->begin(), iEnd = bb->end(); i != iEnd; i++) {
|
|
G4_INST *inst = (*i);
|
|
|
|
//
|
|
// process each source operand
|
|
//
|
|
for (unsigned j = 0, numSrc = inst->getNumSrc(); j < numSrc; j++) {
|
|
G4_Operand *opnd = inst->getSrc(j);
|
|
if (!opnd)
|
|
continue;
|
|
if (opnd->isAddrExp()) {
|
|
int val = opnd->asAddrExp()->eval(*fg.builder);
|
|
G4_Type ty = opnd->asAddrExp()->getType();
|
|
|
|
G4_Imm *imm = fg.builder->createImm(val, ty);
|
|
inst->setSrc(imm, j);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
[[maybe_unused]] static std::vector<std::string> split(const std::string &str,
|
|
const char *delimiter) {
|
|
std::vector<std::string> v;
|
|
std::string::size_type start = 0;
|
|
|
|
for (auto pos = str.find_first_of(delimiter, start); pos != std::string::npos;
|
|
start = pos + 1, pos = str.find_first_of(delimiter, start)) {
|
|
if (pos != start) {
|
|
v.emplace_back(str, start, pos - start);
|
|
}
|
|
}
|
|
|
|
if (start < str.length())
|
|
v.emplace_back(str, start, str.length() - start);
|
|
return v;
|
|
}
|
|
|
|
static iga_gen_t getIGAPlatform(TARGET_PLATFORM genPlatform) {
|
|
iga_gen_t platform = IGA_GEN_INVALID;
|
|
switch (genPlatform) {
|
|
case GENX_BDW:
|
|
platform = IGA_GEN8;
|
|
break;
|
|
case GENX_CHV:
|
|
platform = IGA_GEN8lp;
|
|
break;
|
|
case GENX_SKL:
|
|
platform = IGA_GEN9;
|
|
break;
|
|
case GENX_BXT:
|
|
platform = IGA_GEN9lp;
|
|
break;
|
|
case GENX_ICLLP:
|
|
platform = IGA_GEN11;
|
|
break;
|
|
case GENX_TGLLP:
|
|
platform = IGA_GEN12p1;
|
|
break;
|
|
case Xe_XeHPSDV:
|
|
platform = IGA_XE_HP;
|
|
break;
|
|
case Xe_DG2:
|
|
case Xe_MTL:
|
|
case Xe_ARL:
|
|
platform = IGA_XE_HPG;
|
|
break;
|
|
case Xe_PVC:
|
|
case Xe_PVCXT:
|
|
platform = IGA_XE_HPC;
|
|
break;
|
|
case Xe2:
|
|
platform = IGA_XE2;
|
|
break;
|
|
case Xe3:
|
|
platform = IGA_XE3;
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
return platform;
|
|
}
|
|
|
|
KernelDebugInfo *G4_Kernel::getKernelDebugInfo() {
|
|
if (kernelDbgInfo == nullptr) {
|
|
kernelDbgInfo = std::make_shared<KernelDebugInfo>();
|
|
}
|
|
|
|
return kernelDbgInfo.get();
|
|
}
|
|
|
|
void G4_Kernel::createKernelCostInfo(KernelCost *KCA) {
|
|
//
|
|
// copy data from FuncCost of KernelCostAnalysis to G4_Kernel's kernelCost
|
|
// (LoopCost is src type, LoopCostInfo is dst type)
|
|
//
|
|
m_kernelCost = std::make_unique<KernelCostInfo>();
|
|
FuncCost &FC = KCA->getKernelCost();
|
|
|
|
int sz = FC.m_allLoopsInProgramOrder.size();
|
|
m_kernelCost.get()->allLoopCosts.resize(sz);
|
|
m_kernelCost.get()->kernelCost.C = FC.m_funcCost.C.getCostMetrics();
|
|
|
|
for (int i = 0; i < sz; ++i) {
|
|
const Loop *L = FC.m_allLoopsInProgramOrder[i];
|
|
LoopCost &LC = KCA->getLoopCost(L);
|
|
LoopCostInfo &LCI = m_kernelCost.get()->allLoopCosts[i];
|
|
|
|
LCI.loopId = i;
|
|
vISA_ASSERT(i == LC.m_loopId, "Kernel Cost Analysis: incorrect loop id");
|
|
|
|
LCI.backedge_visaId = LC.m_backedge_visaId;
|
|
const CostMetrics &cm = LC.m_loopBodyCost.C.getCostMetrics();
|
|
LCI.loopBodyCost.C = cm;
|
|
LCI.LCE = nullptr;
|
|
LCI.numChildLoops = L->getNumImmChildLoops();
|
|
vISA_ASSERT(LCI.numChildLoops == LC.m_loopBodyCost.LoopCosts.size(),
|
|
"Kernel Cost Analysis: incorrect number of child loops!");
|
|
LCI.nestingLevel = L->getNestingLevel();
|
|
|
|
for (LoopCost *immLC : LC.m_loopBodyCost.LoopCosts) {
|
|
int loop_id = immLC->m_loopId;
|
|
LoopCostInfo &immLCI = m_kernelCost.get()->allLoopCosts[loop_id];
|
|
LCI.loopBodyCost.loopCosts.push_back(&immLCI);
|
|
}
|
|
}
|
|
}
|
|
|
|
void StackCallABI::setVersion() {
|
|
// VISA ABI version 1 is deprecated so default version to use is version 2
|
|
version = StackCallABIVersion::VER_2;
|
|
}
|
|
|
|
void StackCallABI::init(G4_Kernel *k) {
|
|
vISA_ASSERT(!kernel, "init called multiple times");
|
|
kernel = k;
|
|
setVersion();
|
|
if (version == StackCallABIVersion::VER_3) {
|
|
vISA_ASSERT(kernel->getGRFSize() == 64, "require 64-byte GRF for ABI v3");
|
|
vISA_ASSERT(kernel->getPlatform() >= TARGET_PLATFORM::Xe3,
|
|
"ABI v3 supported only for Xe3+");
|
|
}
|
|
|
|
switch (version) {
|
|
case StackCallABIVersion::VER_1:
|
|
case StackCallABIVersion::VER_2:
|
|
subRegs.Ret_IP = SubRegs_Stackcall_v1_v2_Ret_IP;
|
|
subRegs.Ret_EM = SubRegs_Stackcall_v1_v2_Ret_EM;
|
|
subRegs.BE_SP = SubRegs_Stackcall_v1_v2_BE_SP;
|
|
subRegs.BE_FP = SubRegs_Stackcall_v1_v2_BE_FP;
|
|
subRegs.FE_FP = SubRegs_Stackcall_v1_v2_FE_FP;
|
|
subRegs.FE_SP = SubRegs_Stackcall_v1_v2_FE_SP;
|
|
|
|
offsets.Ret_IP = FrameDescriptorOfsets_v1_v2_Ret_IP;
|
|
offsets.Ret_EM = FrameDescriptorOfsets_v1_v2_Ret_EM;
|
|
offsets.BE_SP = FrameDescriptorOfsets_v1_v2_BE_SP;
|
|
offsets.BE_FP = FrameDescriptorOfsets_v1_v2_BE_FP;
|
|
offsets.FE_FP = FrameDescriptorOfsets_v1_v2_FE_FP;
|
|
offsets.FE_SP = FrameDescriptorOfsets_v1_v2_FE_SP;
|
|
break;
|
|
case StackCallABIVersion::VER_3:
|
|
subRegs.Ret_IP = SubRegs_Stackcall_v3_Ret_IP;
|
|
subRegs.Ret_EM = SubRegs_Stackcall_v3_Ret_EM;
|
|
subRegs.BE_SP = SubRegs_Stackcall_v3_BE_SP;
|
|
subRegs.BE_FP = SubRegs_Stackcall_v3_BE_FP;
|
|
subRegs.FE_FP = SubRegs_Stackcall_v3_FE_FP;
|
|
subRegs.FE_SP = SubRegs_Stackcall_v3_FE_SP;
|
|
|
|
offsets.Ret_IP = FrameDescriptorOfsets_v3_Ret_IP;
|
|
offsets.Ret_EM = FrameDescriptorOfsets_v3_Ret_EM;
|
|
offsets.BE_SP = FrameDescriptorOfsets_v3_BE_SP;
|
|
offsets.BE_FP = FrameDescriptorOfsets_v3_BE_FP;
|
|
offsets.FE_FP = FrameDescriptorOfsets_v3_FE_FP;
|
|
offsets.FE_SP = FrameDescriptorOfsets_v3_FE_SP;
|
|
break;
|
|
default:
|
|
vISA_ASSERT(false, "unknown ABI");
|
|
}
|
|
argReg = ArgRet_Stackcall_Arg;
|
|
retReg = ArgRet_Stackcall_Ret;
|
|
}
|
|
|
|
unsigned StackCallABI::getStackCallStartReg() const {
|
|
// Last 3 (or 2) GRFs reserved for stack call purpose
|
|
unsigned totalGRFs = kernel->getNumRegTotal();
|
|
unsigned startReg = totalGRFs - numReservedABIGRF();
|
|
return startReg;
|
|
}
|
|
unsigned StackCallABI::calleeSaveStart() const {
|
|
return getCallerSaveLastGRF() + 1;
|
|
}
|
|
unsigned StackCallABI::getNumCalleeSaveRegs() const {
|
|
unsigned totalGRFs = kernel->getNumRegTotal();
|
|
return totalGRFs - calleeSaveStart() - numReservedABIGRF();
|
|
}
|
|
|
|
uint32_t StackCallABI::numReservedABIGRF() const {
|
|
if (version == StackCallABIVersion::VER_1)
|
|
return 3;
|
|
else if (version == StackCallABIVersion::VER_2) {
|
|
if (kernel->getOption(vISA_PreserveR0InR0))
|
|
return 2;
|
|
return 3;
|
|
} else {
|
|
// for ABI version > 2
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
uint32_t StackCallABI::getFPSPGRF() const {
|
|
// For ABI V1, return (numRegTotal - 3), i.e. 125.
|
|
// For ABI V2, return (numRegTotal - 1), i.e. 127, 255.
|
|
// For ABI V3, return (numRegTotal - 1), i.e. 127, 255.
|
|
|
|
if (version == StackCallABIVersion::VER_1) {
|
|
return getStackCallStartReg() + FPSPGRF;
|
|
} else if (version == StackCallABIVersion::VER_2) {
|
|
return (kernel->getNumRegTotal() - 1) - FPSPGRF;
|
|
} else {
|
|
|
|
return (kernel->getNumRegTotal() - 1) - FPSPGRF;
|
|
}
|
|
}
|
|
|
|
uint32_t StackCallABI::getSpillHeaderGRF() const {
|
|
// For ABI V1 return r126.
|
|
// For ABI V2 return r126.
|
|
// For ABI V3 return r127.
|
|
if (version == StackCallABIVersion::VER_1)
|
|
return getStackCallStartReg() + SpillHeaderGRF;
|
|
else if (version == StackCallABIVersion::VER_2)
|
|
return (kernel->getNumRegTotal() - 1) - SpillHeaderGRF;
|
|
else
|
|
return kernel->stackCall.getFPSPGRF();
|
|
}
|
|
|
|
uint32_t StackCallABI::getThreadHeaderGRF() const {
|
|
// For ABI V1 return r127.
|
|
// For ABI V2 return r125.
|
|
vISA_ASSERT(
|
|
kernel->getOption(vISA_PreserveR0InR0) == false,
|
|
"r0 is preserved in r0 itself. no special stack call header needed");
|
|
if (version == StackCallABIVersion::VER_1)
|
|
return getStackCallStartReg() + ThreadHeaderGRF;
|
|
else
|
|
return (kernel->getNumRegTotal() - 1) - ThreadHeaderGRF;
|
|
}
|
|
|
|
//
|
|
// perform relocation for every entry in the allocation table
|
|
//
|
|
void G4_Kernel::doRelocation(void *binary, uint32_t binarySize) {
|
|
for (auto &&entry : relocationTable) {
|
|
entry.doRelocation(*this, binary, binarySize);
|
|
}
|
|
}
|
|
|
|
G4_INST *G4_Kernel::getFirstNonLabelInst() const {
|
|
for (auto I = fg.cbegin(), E = fg.cend(); I != E; ++I) {
|
|
auto bb = *I;
|
|
G4_INST *firstInst = bb->getFirstInst();
|
|
if (firstInst) {
|
|
return firstInst;
|
|
}
|
|
}
|
|
// empty kernel
|
|
return nullptr;
|
|
}
|
|
|
|
std::string G4_Kernel::getDebugSrcLine(const std::string &fileName,
|
|
int srcLine) {
|
|
auto iter = debugSrcLineMap.find(fileName);
|
|
if (iter == debugSrcLineMap.end()) {
|
|
std::ifstream ifs(fileName);
|
|
if (!ifs) {
|
|
// file doesn't exist
|
|
debugSrcLineMap[fileName] =
|
|
std::make_pair<bool, std::vector<std::string>>(false, {});
|
|
return "";
|
|
}
|
|
std::string line;
|
|
std::vector<std::string> srcLines;
|
|
while (std::getline(ifs, line)) {
|
|
srcLines.push_back(line);
|
|
}
|
|
debugSrcLineMap[fileName] = std::make_pair(true, std::move(srcLines));
|
|
}
|
|
iter = debugSrcLineMap.find(fileName);
|
|
if (iter == debugSrcLineMap.end() || !iter->second.first) {
|
|
return "";
|
|
}
|
|
auto &lines = iter->second.second;
|
|
if (srcLine > (int)lines.size() || srcLine <= 0) {
|
|
return "invalid line number";
|
|
}
|
|
return lines[srcLine - 1];
|
|
}
|
|
|
|
unsigned G4_Kernel::getLargestInputRegister() {
|
|
const unsigned inputCount = fg.builder->getInputCount();
|
|
unsigned regNum = 0;
|
|
if (inputCount) {
|
|
const input_info_t *ii = fg.builder->getInputArg(inputCount - 1);
|
|
regNum = (ii->offset + ii->dcl->getByteSize()) /
|
|
fg.builder->numEltPerGRF<Type_UB>();
|
|
}
|
|
|
|
return regNum;
|
|
}
|
|
|
|
void G4_Kernel::setKernelParameters(unsigned newGRF) {
|
|
unsigned overrideGRFNum = 0, overrideNumThreads = 0, overrideNumSWSB = 0,
|
|
overrideNumAcc = 0;
|
|
|
|
overrideGRFNum = m_options->getuInt32Option(vISA_TotalGRFNum);
|
|
overrideNumThreads = m_options->getuInt32Option(vISA_HWThreadNumberPerEU);
|
|
overrideNumSWSB = m_options->getuInt32Option(vISA_SWSBTokenNum);
|
|
overrideNumAcc = m_options->getuInt32Option(vISA_numGeneralAcc);
|
|
|
|
//
|
|
// Number of threads/GRF can currently be set by:
|
|
// 1.- Per kernel attribute
|
|
// 2.- IGC flag (reg key)
|
|
// 3.- Compiler option entered by user for
|
|
// 2.1 entire module
|
|
// 2.2 kernel function
|
|
// 4.- Compiler heuristics
|
|
//
|
|
// 1 is set via kernel attribute. 2 and 3 via vISA option.
|
|
// If none of them are set, compiler selects the best option (4).
|
|
//
|
|
|
|
if (newGRF > 0) {
|
|
// per kernel attribute or GRF change during compilation
|
|
grfMode.setModeByNumGRFs(newGRF);
|
|
overrideGRFNum = 0;
|
|
} else if (overrideNumThreads > 0) {
|
|
// Forcing a specific number of threads
|
|
grfMode.setModeByNumThreads(overrideNumThreads);
|
|
overrideGRFNum = 0;
|
|
autoGRFSelection = false;
|
|
} else if (overrideGRFNum > 0) {
|
|
// Forcing a specific number of GRFs
|
|
grfMode.setModeByNumGRFs(overrideGRFNum);
|
|
autoGRFSelection = false;
|
|
} else {
|
|
// Use default value
|
|
grfMode.setDefaultGRF();
|
|
overrideGRFNum = 0;
|
|
}
|
|
|
|
// Set number of GRFs
|
|
numRegTotal = overrideGRFNum ? overrideGRFNum : grfMode.getNumGRF();
|
|
auto lastCallerSavedGRF =
|
|
getOptions()->getuInt32Option(vISA_LastCallerSavedGRF);
|
|
// When vISA_LastCallerSavedGRF is set, it's an ABI breaking change.
|
|
// Kernel and entire callee nest must be compiled with same
|
|
// value of vISA_LastCallerSavedGRF for correctness.
|
|
if (lastCallerSavedGRF)
|
|
stackCall.setCallerSaveLastGRF(lastCallerSavedGRF);
|
|
else
|
|
stackCall.setCallerSaveLastGRF(((numRegTotal - 8) / 2) - 1);
|
|
|
|
// Set number of threads
|
|
numThreads = grfMode.getNumThreads();
|
|
|
|
// Set the number of SWSB tokens
|
|
numSWSBTokens =
|
|
overrideNumSWSB ? overrideNumSWSB : grfMode.getNumSWSBTokens();
|
|
|
|
// Set the number of Acc
|
|
numAcc = overrideNumAcc ? overrideNumAcc : grfMode.getNumAcc();
|
|
|
|
// Special configurations go here
|
|
if (m_options->getOption(vISA_hasDoubleAcc)) {
|
|
numAcc = 16;
|
|
}
|
|
}
|
|
|
|
bool G4_Kernel::hasInlineData() const {
|
|
const IR_Builder &b = *fg.builder;
|
|
return
|
|
b.getOption(vISA_useInlineData);
|
|
}
|
|
|
|
std::vector<ArgLayout> G4_Kernel::getArgumentLayout() {
|
|
const uint32_t startGRF =
|
|
getOptions()->getuInt32Option(vISA_loadThreadPayloadStartReg);
|
|
const uint32_t inputsStart = startGRF * getGRFSize();
|
|
const uint32_t inputCount = fg.builder->getInputCount();
|
|
|
|
const int PTIS = AlignUp(
|
|
getInt32KernelAttr(Attributes::ATTR_PerThreadInputSize), getGRFSize());
|
|
|
|
// Checks if input_info is cross-thread-input
|
|
auto isInCrossThreadData = [&](const input_info_t *input_info) {
|
|
return (uint32_t)input_info->offset >= inputsStart + PTIS;
|
|
};
|
|
|
|
const uint32_t inlineDataSize = fg.builder->getInlineDataSize();
|
|
const bool useInlineData = hasInlineData();
|
|
// Checks if input_info fits in inlineData
|
|
auto isInInlineData = [&](const input_info_t *const input_info) {
|
|
if (!useInlineData) {
|
|
return false;
|
|
}
|
|
uint32_t inputEnd = input_info->offset + input_info->size;
|
|
bool fitsInInlineData = inputEnd <= inputsStart + PTIS + inlineDataSize;
|
|
return isInCrossThreadData(input_info) && fitsInInlineData;
|
|
};
|
|
|
|
const uint32_t startGrfAddr =
|
|
getOptions()->getuInt32Option(vISA_loadThreadPayloadStartReg) *
|
|
getGRFSize();
|
|
|
|
std::vector<ArgLayout> args;
|
|
for (unsigned ix = 0; ix < inputCount; ix++) {
|
|
const input_info_t *input = fg.builder->getInputArg(ix);
|
|
if (input->isPseudoInput()) {
|
|
continue;
|
|
} else if (fg.builder->getFCPatchInfo()->getIsEntryKernel()) {
|
|
const vISA::G4_Declare *dcl = input->dcl;
|
|
if (INPUT_GENERAL == input->getInputClass() && !dcl->isLiveIn()) {
|
|
break;
|
|
}
|
|
}
|
|
int dstGrfAddr = input->offset;
|
|
auto memSrc = ArgLayout::MemSrc::INVALID;
|
|
int memOff = input->offset - startGrfAddr; // subtract off r0
|
|
if (isInInlineData(input)) {
|
|
memSrc = ArgLayout::MemSrc::INLINE;
|
|
memOff %= getGRFSize();
|
|
vISA_ASSERT(memOff < (int)inlineDataSize, "inline reg arg OOB");
|
|
vISA_ASSERT(memOff + (int)input->size <= (int)inlineDataSize,
|
|
"inline reg arg overflows");
|
|
} else if (isInCrossThreadData(input)) {
|
|
memSrc = ArgLayout::MemSrc::CTI;
|
|
memOff -= PTIS + (useInlineData ? inlineDataSize : 0);
|
|
} else {
|
|
memSrc = ArgLayout::MemSrc::PTI;
|
|
}
|
|
args.emplace_back(input->dcl, dstGrfAddr, memSrc, memOff, input->size);
|
|
}
|
|
std::sort(args.begin(), args.end(),
|
|
[&](const ArgLayout &a1, const ArgLayout &a2) {
|
|
return a1.dstGrfAddr < a2.dstGrfAddr;
|
|
});
|
|
return args;
|
|
}
|
|
|
|
void G4_Kernel::dump(std::ostream &os) const { fg.print(os); }
|
|
|
|
void G4_Kernel::dumpToFile(const std::string &suffixIn, bool forceG4Dump) {
|
|
bool dumpDot = m_options->getOption(vISA_DumpDot);
|
|
bool dumpG4 = forceG4Dump || m_options->getOption(vISA_DumpPasses) ||
|
|
m_options->getuInt32Option(vISA_DumpPassesSubset) >= 1;
|
|
if (!dumpDot && !dumpG4)
|
|
return;
|
|
|
|
// todo: remove else branch as it is not reached at all.
|
|
std::stringstream ss;
|
|
const char *prefix = nullptr;
|
|
getOptions()->getOption(VISA_AsmFileName, prefix);
|
|
if (prefix != nullptr) {
|
|
// Use AsmFileName as prefix for g4/dot dumps
|
|
if (fg.builder->getIsKernel()) {
|
|
// entry
|
|
ss << prefix << "." << std::setfill('0') << std::setw(3)
|
|
<< nextDumpIndex++ << "." << suffixIn;
|
|
} else {
|
|
// callee
|
|
ss << prefix << "_f" << getFunctionId() << "." << std::setfill('0')
|
|
<< std::setw(3) << nextDumpIndex++ << "." << suffixIn;
|
|
}
|
|
} else {
|
|
// calls to this will produce a sequence of dumps
|
|
// [kernel-name].000.[suffix].{dot,g4}
|
|
// [kernel-name].001.[suffix].{dot,g4}
|
|
// ...
|
|
// If vISA_DumpPassesSubset == 1 then we omit any files that don't change
|
|
// the string representation of the kernel (i.e. skip passes that don't do
|
|
// anything).
|
|
if (m_options->getOption(vISA_DumpUseInternalName) || name == nullptr) {
|
|
if (fg.builder->getIsKernel()) {
|
|
ss << "k" << getKernelID();
|
|
} else {
|
|
ss << "f" << getFunctionId();
|
|
}
|
|
} else {
|
|
ss << name;
|
|
}
|
|
ss << "." << std::setfill('0') << std::setw(3) << nextDumpIndex++ << "."
|
|
<< suffixIn;
|
|
}
|
|
std::string baseName = sanitizePathString(ss.str());
|
|
|
|
if (dumpDot)
|
|
dumpDotFileInternal(baseName);
|
|
|
|
if (dumpG4)
|
|
dumpG4Internal(baseName);
|
|
}
|
|
|
|
void G4_Kernel::dumpToConsole() { dumpG4InternalTo(std::cout); }
|
|
|
|
void G4_Kernel::emitDeviceAsm(std::ostream &os, const void *binary,
|
|
uint32_t binarySize) {
|
|
//
|
|
// for GTGPU lib release, don't dump out asm
|
|
//
|
|
#ifdef NDEBUG
|
|
#ifdef GTGPU_LIB
|
|
return;
|
|
#endif
|
|
#endif
|
|
const bool newAsm = m_options->getOption(vISA_dumpNewSyntax) &&
|
|
!(binary == NULL || binarySize == 0);
|
|
|
|
if (!m_options->getOption(vISA_StripComments)) {
|
|
emitDeviceAsmHeaderComment(os);
|
|
}
|
|
|
|
if (!newAsm) {
|
|
emitDeviceAsmInstructionsOldAsm(os);
|
|
return;
|
|
}
|
|
|
|
emitDeviceAsmInstructionsIga(os, binary, binarySize);
|
|
|
|
if (getPlatformGeneration() >= PlatformGen::XE) {
|
|
os << "\n\n";
|
|
auto jitInfo = fg.builder->getJitInfo();
|
|
os << "//.BankConflicts: " << jitInfo->statsVerbose.BCNum << "\n";
|
|
os << "//.ByteRMWs: " << jitInfo->statsVerbose.numByteRMWs << "\n//\n";
|
|
} else {
|
|
os << "// Bank Conflict Statistics: \n";
|
|
os << "// -- GOOD: " << fg.BCStats.NumOfGoodInsts << "\n";
|
|
os << "// -- BAD: " << fg.BCStats.NumOfBadInsts << "\n";
|
|
os << "// -- OK: " << fg.BCStats.NumOfOKInsts << "\n";
|
|
}
|
|
}
|
|
|
|
void G4_Kernel::emitRegInfo() {
|
|
const char *asmName = nullptr;
|
|
getOptions()->getOption(VISA_AsmFileName, asmName);
|
|
const char *asmNameEmpty = "";
|
|
if (!asmName) {
|
|
asmName = asmNameEmpty;
|
|
}
|
|
|
|
std::string dumpFileName = std::string(asmName) + ".reginfo";
|
|
std::fstream ofile(dumpFileName, std::ios::out);
|
|
|
|
emitRegInfoKernel(ofile);
|
|
|
|
ofile.close();
|
|
}
|
|
|
|
void G4_Kernel::emitRegInfoKernel(std::ostream &output) {
|
|
output << "//.platform " << getGenxPlatformString();
|
|
output << "\n"
|
|
<< "//.kernel ID 0x" << std::hex << getKernelID() << "\n";
|
|
output << std::dec << "\n";
|
|
int instOffset = 0;
|
|
|
|
for (BB_LIST_ITER itBB = fg.begin(); itBB != fg.end(); ++itBB) {
|
|
for (INST_LIST_ITER itInst = (*itBB)->begin(); itInst != (*itBB)->end();
|
|
++itInst) {
|
|
G4_INST *inst = (*itInst);
|
|
if (inst->isLabel()) {
|
|
continue;
|
|
}
|
|
if (inst->getLexicalId() == -1) {
|
|
continue;
|
|
}
|
|
|
|
(*itBB)->emitRegInfo(output, inst, instOffset);
|
|
instOffset += inst->isCompactedInst() ? 8 : 16;
|
|
}
|
|
}
|
|
return;
|
|
}
|
|
|
|
//
|
|
// This routine dumps out the dot file of the control flow graph along with
|
|
// instructions. dot is drawing graph tool from AT&T.
|
|
//
|
|
void G4_Kernel::dumpDotFileInternal(const std::string &baseName) {
|
|
std::fstream ofile(baseName + ".dot", std::ios::out);
|
|
vASSERT(!ofile.fail());
|
|
//
|
|
// write digraph KernelName {"
|
|
// size = "8, 10";
|
|
//
|
|
const char *asmFileName = NULL;
|
|
m_options->getOption(VISA_AsmFileName, asmFileName);
|
|
if (asmFileName == NULL)
|
|
ofile << "digraph UnknownKernel"
|
|
<< " {"
|
|
<< "\n";
|
|
else
|
|
ofile << "digraph " << asmFileName << " {"
|
|
<< "\n";
|
|
//
|
|
// keep the graph width 8, estimate a reasonable graph height
|
|
//
|
|
const unsigned itemPerPage = 64; // 60 instructions per Letter page
|
|
unsigned totalItem = (unsigned)Declares.size();
|
|
for (std::list<G4_BB *>::iterator it = fg.begin(); it != fg.end(); ++it)
|
|
totalItem += ((unsigned)(*it)->size());
|
|
totalItem += (unsigned)fg.size();
|
|
float graphHeight = (float)totalItem / itemPerPage;
|
|
graphHeight =
|
|
graphHeight < 100.0f ? 100.0f : graphHeight; // minimal size: Letter
|
|
ofile << "\n\t// Setup\n";
|
|
ofile << "\tsize = \"80.0, " << graphHeight << "\";\n";
|
|
ofile << "\tpage= \"80.5, 110\";\n";
|
|
ofile << "\tpagedir=\"TL\";\n";
|
|
// dump out flow graph
|
|
for (std::list<G4_BB *>::iterator it = fg.begin(); it != fg.end(); ++it) {
|
|
G4_BB *bb = (*it);
|
|
//
|
|
// write: BB0 [shape=plaintext, label=<
|
|
// <TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0">
|
|
// <TR><TD ALIGN="CENTER">BB0: TestRA_Dot</TD></TR>
|
|
// <TR><TD>
|
|
// <TABLE BORDER="0" CELLBORDER="0"
|
|
// CELLSPACING="0">
|
|
// <TR><TD
|
|
// ALIGN="LEFT">TestRA_Dot:</TD></TR>
|
|
// <TR><TD ALIGN="LEFT"><FONT
|
|
// color="red">add (8) Region(0,0)[1]
|
|
// Region(0,0)[8;8,1] PAYLOAD(0,0)[8;8,1]
|
|
// [NoMask]</FONT></TD></TR>
|
|
// </TABLE>
|
|
// </TD></TR>
|
|
// </TABLE>>];
|
|
// print out label if the first inst is a label inst
|
|
//
|
|
ofile << "\t";
|
|
bb->writeBBId(ofile);
|
|
ofile << " [shape=plaintext, label=<"
|
|
<< "\n";
|
|
ofile << "\t\t\t <TABLE BORDER=\"0\" CELLBORDER=\"1\" CELLSPACING=\"0\">"
|
|
<< "\n";
|
|
ofile << "\t\t\t\t<TR><TD ALIGN=\"CENTER\">";
|
|
bb->writeBBId(ofile);
|
|
ofile << ": ";
|
|
|
|
if (!bb->empty() && bb->front()->isLabel()) {
|
|
bb->front()->getSrc(0)->emit(ofile);
|
|
}
|
|
ofile << "</TD></TR>"
|
|
<< "\n";
|
|
// emit all instructions within basic block
|
|
ofile << "\t\t\t\t<TR><TD>"
|
|
<< "\n";
|
|
|
|
if (!bb->empty()) {
|
|
ofile << "\t\t\t\t\t <TABLE BORDER=\"0\" CELLBORDER=\"0\" "
|
|
"CELLSPACING=\"0\">"
|
|
<< "\n";
|
|
for (INST_LIST_ITER i = bb->begin(); i != bb->end(); i++) {
|
|
//
|
|
// detect if there is spill code first, set different color for it
|
|
//
|
|
std::string fontColor = "black";
|
|
//
|
|
// emit the instruction
|
|
//
|
|
ofile << "\t\t\t\t\t\t<TR><TD ALIGN=\"LEFT\"><FONT color=\""
|
|
<< fontColor << "\">";
|
|
std::ostringstream os;
|
|
(*i)->emit(os);
|
|
std::string dotStr(os.str());
|
|
// TODO: dot doesn't like '<', '>', '{', or '}' (and '&') this code
|
|
// below is a hack. need to replace with delimiters.
|
|
// std::replace_if(dotStr.begin(), dotStr.end(),
|
|
// bind2nd(equal_to<char>(), '<'), '[');
|
|
std::replace_if(
|
|
dotStr.begin(), dotStr.end(),
|
|
std::bind(std::equal_to<char>(), std::placeholders::_1, '<'), '[');
|
|
std::replace_if(
|
|
dotStr.begin(), dotStr.end(),
|
|
std::bind(std::equal_to<char>(), std::placeholders::_1, '>'), ']');
|
|
std::replace_if(
|
|
dotStr.begin(), dotStr.end(),
|
|
std::bind(std::equal_to<char>(), std::placeholders::_1, '{'), '[');
|
|
std::replace_if(
|
|
dotStr.begin(), dotStr.end(),
|
|
std::bind(std::equal_to<char>(), std::placeholders::_1, '}'), ']');
|
|
std::replace_if(
|
|
dotStr.begin(), dotStr.end(),
|
|
std::bind(std::equal_to<char>(), std::placeholders::_1, '&'), '$');
|
|
ofile << dotStr;
|
|
|
|
ofile << "</FONT></TD></TR>"
|
|
<< "\n";
|
|
// ofile << "\\l"; // left adjusted
|
|
}
|
|
ofile << "\t\t\t\t\t </TABLE>"
|
|
<< "\n";
|
|
}
|
|
|
|
ofile << "\t\t\t\t</TD></TR>"
|
|
<< "\n";
|
|
ofile << "\t\t\t </TABLE>>];"
|
|
<< "\n";
|
|
//
|
|
// dump out succ edges
|
|
// BB12 -> BB10
|
|
//
|
|
for (std::list<G4_BB *>::iterator sit = bb->Succs.begin();
|
|
sit != bb->Succs.end(); ++sit) {
|
|
bb->writeBBId(ofile);
|
|
ofile << " -> ";
|
|
(*sit)->writeBBId(ofile);
|
|
ofile << "\n";
|
|
}
|
|
}
|
|
//
|
|
// write "}" to end digraph
|
|
//
|
|
ofile << "\n"
|
|
<< " }"
|
|
<< "\n";
|
|
//
|
|
// close dot file
|
|
//
|
|
ofile.close();
|
|
}
|
|
|
|
// Dump the instructions into a .g4 file
|
|
void G4_Kernel::dumpG4Internal(const std::string &file) {
|
|
std::stringstream g4asm;
|
|
dumpG4InternalTo(g4asm);
|
|
std::string g4asms = g4asm.str();
|
|
if (m_options->getuInt32Option(vISA_DumpPassesSubset) == 1 &&
|
|
g4asms == lastG4Asm) {
|
|
return;
|
|
}
|
|
lastG4Asm = std::move(g4asms);
|
|
|
|
std::fstream ofile(file + ".g4", std::ios::out);
|
|
vASSERT(!ofile.fail());
|
|
dumpG4InternalTo(ofile);
|
|
}
|
|
|
|
void G4_Kernel::dumpG4InternalTo(std::ostream &os) {
|
|
if (name)
|
|
os << ".kernel " << name << "\n";
|
|
else
|
|
os << ".kernel\n";
|
|
|
|
for (const G4_Declare *d : Declares) {
|
|
static const int MIN_DECL = 34; // skip the built-in decls
|
|
if (d->getDeclId() > MIN_DECL) {
|
|
// os << d->getDeclId() << "\n";
|
|
d->emit(os);
|
|
}
|
|
}
|
|
os << "\n";
|
|
|
|
// Additional dumps for lit testing
|
|
os << "// simdSize = " << (int)simdSize.value << "\n";
|
|
|
|
os << "\n";
|
|
for (std::list<G4_BB *>::iterator it = fg.begin(); it != fg.end(); ++it) {
|
|
// Emit BB number
|
|
G4_BB *bb = (*it);
|
|
bb->writeBBId(os);
|
|
|
|
// Emit BB type
|
|
if (bb->getBBType()) {
|
|
os << " [" << bb->getBBTypeStr() << "] ";
|
|
}
|
|
|
|
os << "\tPreds: ";
|
|
for (auto pred : bb->Preds) {
|
|
pred->writeBBId(os);
|
|
os << " ";
|
|
}
|
|
os << "\tSuccs: ";
|
|
for (auto succ : bb->Succs) {
|
|
succ->writeBBId(os);
|
|
os << " ";
|
|
}
|
|
os << "\n";
|
|
|
|
bb->emit(os);
|
|
os << "\n\n";
|
|
} // bbs
|
|
}
|
|
|
|
void G4_Kernel::emitDeviceAsmHeaderComment(std::ostream &os) {
|
|
os << "//.kernel ";
|
|
if (name != NULL) {
|
|
// some 3D kernels do not have a name
|
|
os << name;
|
|
}
|
|
|
|
#if !Release
|
|
os << "\n"
|
|
<< "//.platform " << getGenxPlatformString();
|
|
os << "\n"
|
|
<< "//.thread_config "
|
|
<< "numGRF=" << numRegTotal << ", numAcc=" << numAcc;
|
|
#endif
|
|
|
|
if (fg.builder->hasSWSB()) {
|
|
os << ", numSWSB=" << numSWSBTokens;
|
|
}
|
|
os << "\n"
|
|
<< "//.options_string \"" << m_options->getUserArgString().str() << "\"";
|
|
os << "\n"
|
|
<< "//.full_options \"" << m_options->getFullArgString() << "\"";
|
|
os << "\n"
|
|
<< "//.instCount " << asmInstCount;
|
|
static const char *const RATypeString[]{RA_TYPE(STRINGIFY)};
|
|
os << "\n//.RA type\t" << RATypeString[RAType];
|
|
if (!m_options->getOption(vISA_skipGitHash))
|
|
os << "\n//.git-hash " << GIT_COMMIT_HASH;
|
|
|
|
if (auto jitInfo = fg.builder->getJitInfo()) {
|
|
if (jitInfo->stats.numGRFUsed != 0) {
|
|
os << "\n"
|
|
<< "//.GRF count " << jitInfo->stats.numGRFUsed;
|
|
}
|
|
if (jitInfo->stats.spillMemUsed > 0) {
|
|
os << "\n"
|
|
<< "//.spill size " << jitInfo->stats.spillMemUsed;
|
|
}
|
|
if (jitInfo->stats.numGRFSpillFillWeighted > 0) {
|
|
os << "\n"
|
|
<< "//.spill GRF est. ref count "
|
|
<< jitInfo->stats.numGRFSpillFillWeighted;
|
|
}
|
|
if (jitInfo->stats.numFlagSpillStore > 0) {
|
|
os << "\n//.spill flag store " << jitInfo->stats.numFlagSpillStore;
|
|
os << "\n//.spill flag load " << jitInfo->stats.numFlagSpillLoad;
|
|
}
|
|
}
|
|
|
|
auto privateMemSize = getInt32KernelAttr(Attributes::ATTR_SpillMemOffset);
|
|
if (privateMemSize != 0) {
|
|
os << "\n//.private memory size " << privateMemSize;
|
|
}
|
|
os << "\n\n";
|
|
|
|
// Step2: emit declares (as needed)
|
|
for (auto dcl : Declares) {
|
|
dcl->emit(os);
|
|
}
|
|
os << "\n";
|
|
|
|
auto fmtHex = [](int i) {
|
|
std::stringstream ss;
|
|
ss << "0x" << std::hex << std::uppercase << i;
|
|
return ss.str();
|
|
};
|
|
|
|
auto args = getArgumentLayout();
|
|
const unsigned inputCount = (unsigned)args.size();
|
|
std::vector<std::string> argNames;
|
|
size_t maxNameLen = 8;
|
|
for (unsigned ix = 0; ix < inputCount; ix++) {
|
|
const ArgLayout &a = args[ix];
|
|
std::stringstream ss;
|
|
if (a.decl && a.decl->getName()) {
|
|
ss << a.decl->getName();
|
|
} else {
|
|
ss << "__unnamed" << (ix + 1);
|
|
}
|
|
argNames.push_back(ss.str());
|
|
maxNameLen = std::max(maxNameLen, argNames.back().size());
|
|
}
|
|
|
|
// emit input location and size
|
|
os << "// .inputs\n";
|
|
const size_t COLW_IDENT = maxNameLen;
|
|
static const size_t COLW_TYPE = 8;
|
|
static const size_t COLW_SIZE = 6;
|
|
static const size_t COLW_AT = 8; // e.g. "r16+0x20"
|
|
static const size_t COLW_FROM = 16; // e.g. "inline+0x20"
|
|
|
|
std::stringstream bordss;
|
|
bordss << "// ";
|
|
bordss << '+';
|
|
bordss << std::setfill('-') << std::setw(COLW_IDENT + 2) << "";
|
|
bordss << '+';
|
|
bordss << std::setfill('-') << std::setw(COLW_TYPE + 2) << "";
|
|
bordss << '+';
|
|
bordss << std::setfill('-') << std::setw(COLW_SIZE + 2) << "";
|
|
bordss << '+';
|
|
bordss << std::setfill('-') << std::setw(COLW_AT + 2) << "";
|
|
bordss << '+';
|
|
bordss << std::setfill('-') << std::setw(COLW_FROM + 2) << "";
|
|
bordss << '+' << "\n";
|
|
std::string border = bordss.str();
|
|
|
|
os << border;
|
|
os << "//"
|
|
<< " | " << std::left << std::setw(COLW_IDENT) << "id"
|
|
<< " | " << std::left << std::setw(COLW_TYPE) << "type"
|
|
<< " | " << std::right << std::setw(COLW_SIZE) << "bytes"
|
|
<< " | " << std::left << std::setw(COLW_AT) << "at"
|
|
<< " | " << std::left << std::setw(COLW_FROM) << "from"
|
|
<< " |"
|
|
<< "\n";
|
|
os << border;
|
|
|
|
const unsigned grfSize = getGRFSize();
|
|
for (unsigned ix = 0; ix < inputCount; ix++) {
|
|
const ArgLayout &a = args[ix];
|
|
//
|
|
os << "//";
|
|
//
|
|
// id
|
|
os << " | " << std::left << std::setw(COLW_IDENT) << argNames[ix];
|
|
//
|
|
// type and length
|
|
// e.g. :uq x 16
|
|
const G4_Declare *dcl = a.decl;
|
|
std::stringstream sstype;
|
|
if (dcl) {
|
|
switch (dcl->getElemType()) {
|
|
case Type_B:
|
|
sstype << ":b";
|
|
break;
|
|
case Type_W:
|
|
sstype << ":w";
|
|
break;
|
|
case Type_D:
|
|
sstype << ":d";
|
|
break;
|
|
case Type_Q:
|
|
sstype << ":q";
|
|
break;
|
|
case Type_V:
|
|
sstype << ":v";
|
|
break;
|
|
case Type_UB:
|
|
sstype << ":ub";
|
|
break;
|
|
case Type_UW:
|
|
sstype << ":uw";
|
|
break;
|
|
case Type_UD:
|
|
sstype << ":ud";
|
|
break;
|
|
case Type_UQ:
|
|
sstype << ":uq";
|
|
break;
|
|
case Type_UV:
|
|
sstype << ":uv";
|
|
break;
|
|
//
|
|
case Type_F:
|
|
sstype << ":f";
|
|
break;
|
|
case Type_HF:
|
|
sstype << ":hf";
|
|
break;
|
|
case Type_DF:
|
|
sstype << ":df";
|
|
break;
|
|
case Type_NF:
|
|
sstype << ":nf";
|
|
break;
|
|
case Type_BF:
|
|
sstype << ":bf";
|
|
break;
|
|
default:
|
|
sstype << fmtHex((int)dcl->getElemType()) << "?";
|
|
break;
|
|
}
|
|
if (dcl->getTotalElems() != 1)
|
|
sstype << " x " << dcl->getTotalElems();
|
|
} else {
|
|
sstype << "?";
|
|
}
|
|
os << " | " << std::left << std::setw(COLW_TYPE) << sstype.str();
|
|
//
|
|
// size
|
|
os << " | " << std::right << std::setw(COLW_SIZE) << fmtHex(a.size);
|
|
|
|
// location
|
|
unsigned reg = a.dstGrfAddr / grfSize, subRegBytes = a.dstGrfAddr % grfSize;
|
|
std::stringstream ssloc;
|
|
ssloc << "r" << reg;
|
|
if (subRegBytes != 0)
|
|
ssloc << "+" << fmtHex(subRegBytes);
|
|
os << " | " << std::left << std::setw(COLW_AT) << ssloc.str();
|
|
|
|
// from
|
|
std::string from;
|
|
switch (a.memSource) {
|
|
case ArgLayout::MemSrc::CTI:
|
|
from = "cti";
|
|
break;
|
|
case ArgLayout::MemSrc::PTI:
|
|
from = "pti[tid]";
|
|
break;
|
|
case ArgLayout::MemSrc::INLINE:
|
|
from = "inline";
|
|
break;
|
|
default:
|
|
from = fmtHex(int(a.memSource)) + "?";
|
|
break;
|
|
}
|
|
std::stringstream ssf;
|
|
ssf << from;
|
|
ssf << "+" << fmtHex(a.memOffset);
|
|
|
|
os << " | " << std::left << std::setw(COLW_FROM) << ssf.str();
|
|
//
|
|
os << " |\n";
|
|
}
|
|
os << border << "\n";
|
|
|
|
if (getPlatformGeneration() < PlatformGen::XE) {
|
|
fg.BCStats.clear();
|
|
}
|
|
}
|
|
|
|
using BlockOffsets = std::map<int32_t, std::vector<std::string>>;
|
|
|
|
static BlockOffsets precomputeBlockOffsets(std::ostream &os, G4_Kernel &g4k,
|
|
const KernelView &kv) {
|
|
// pre-compute the PCs of each basic block
|
|
int32_t currPc = 0, lastInstSize = -1;
|
|
BlockOffsets blockOffsets;
|
|
for (BB_LIST_ITER itBB = g4k.fg.begin(); itBB != g4k.fg.end(); ++itBB) {
|
|
for (INST_LIST_ITER itInst = (*itBB)->begin(); itInst != (*itBB)->end();
|
|
++itInst) {
|
|
if ((*itInst)->isLabel()) {
|
|
// G4 treats labels as special instructions
|
|
const char *lbl = (*itInst)->getLabelStr();
|
|
if (lbl && *lbl) {
|
|
blockOffsets[currPc].emplace_back(lbl);
|
|
}
|
|
} else {
|
|
// we are looking at the next G4 instruction,
|
|
// but reached the end of the decode stream
|
|
if (lastInstSize == 0) {
|
|
os << "// ERROR: deducing G4 block PCs "
|
|
"(IGA decoded stream ends early); falling back to IGA labels\n";
|
|
blockOffsets.clear(); // fallback to IGA default labels
|
|
return blockOffsets;
|
|
}
|
|
lastInstSize = kv.getInstSize(currPc);
|
|
|
|
G4_INST *inst = (*itInst);
|
|
|
|
// For HW WA.
|
|
// In which, vISA may ask IGA to emit some additional instructions.
|
|
// For example, sync is used to make instruction aligned, and nop is
|
|
// used to support stepping in debugger.
|
|
// However, due to compaction, we might not know the exact location of
|
|
// the instruction, the sync instruction insertion has to happen during
|
|
// encoding, which is unknown for the instruction size of kernel in the
|
|
// decoding. That's the issue we have to make these changes.
|
|
if (inst->isCachelineAligned()) {
|
|
iga::Op opcode = kv.getOpcode(currPc);
|
|
// There could be multiple sync.nop instructions emitted by IGA to
|
|
// make the instruction aligned. Here we continue to advance PC when
|
|
// seeing sync.nop so that vISA inst and IGA inst could match again.
|
|
while (opcode == iga::Op::SYNC) {
|
|
currPc += lastInstSize;
|
|
opcode = kv.getOpcode(currPc);
|
|
lastInstSize = kv.getInstSize(currPc);
|
|
}
|
|
}
|
|
|
|
// When the inst requires an additional nop after it, again we need to
|
|
// advance PC to consume NOP to make vISA inst and IGA inst match later.
|
|
if (inst->requireNopAfter()) {
|
|
currPc += lastInstSize;
|
|
lastInstSize = kv.getInstSize(currPc);
|
|
vASSERT(kv.getOpcode(currPc) == iga::Op::NOP);
|
|
}
|
|
|
|
currPc += lastInstSize;
|
|
}
|
|
}
|
|
}
|
|
if (kv.getInstSize(currPc) != 0) {
|
|
// we are looking at the next G4 instruction,
|
|
// but reached the end of the decode stream
|
|
os << "// ERROR: deducing G4 block PCs "
|
|
"(G4_INST stream ends early); falling back to IGA labels\n";
|
|
blockOffsets.clear(); // fallback to IGA default labels
|
|
}
|
|
return blockOffsets;
|
|
}
|
|
|
|
// needs further cleanup (confirm label prefixes are gone, newAsm == true)
|
|
void G4_Kernel::emitDeviceAsmInstructionsIga(std::ostream &os,
|
|
const void *binary,
|
|
uint32_t binarySize) {
|
|
os << "\n";
|
|
|
|
const size_t ERROR_STRING_MAX_LENGTH = 16 * 1024;
|
|
char *errBuf = new char[ERROR_STRING_MAX_LENGTH];
|
|
vASSERT(errBuf);
|
|
if (!errBuf)
|
|
return;
|
|
|
|
iga_gen_t igaPlatform = getIGAPlatform(getPlatform());
|
|
|
|
const iga::Model *igaModel =
|
|
iga::Model::LookupModel(iga::ToPlatform(igaPlatform));
|
|
iga::SWSB_ENCODE_MODE swsbEncodeMode = igaModel->getSWSBEncodeMode();
|
|
|
|
|
|
KernelView kv(igaPlatform, binary, binarySize, swsbEncodeMode, errBuf,
|
|
ERROR_STRING_MAX_LENGTH
|
|
);
|
|
|
|
if (!kv.decodeSucceeded()) {
|
|
const char *MSG =
|
|
"vISA asm emission: failed to re-decode binary for asm output\n";
|
|
// trb: do we really need to clobber std::cerr from a driver?
|
|
// Shader dump output will have the message.
|
|
std::cerr << MSG;
|
|
std::cerr << errBuf << "\n";
|
|
os << MSG;
|
|
os << errBuf << "\n";
|
|
// still continue since parital output might be present
|
|
}
|
|
delete[] errBuf;
|
|
|
|
const auto blockOffsets = precomputeBlockOffsets(os, *this, kv);
|
|
|
|
  //
  // Generate labels with uniqueLabel as prefix (required by some tools).
  // We do so by using the labeler callback. If uniqueLabels is not present,
  // use IGA's default label. For example,
  //   Without option -uniqueLabels:
  //     generate the default label, L1234
  //   With option -uniqueLabels <sth>:
  //     generate the label with <sth> as prefix, <sth>_L1234
  //
  std::string labelPrefix;
  if (m_options->getOption(vISA_UniqueLabels)) {
    const char *labelPrefixC = nullptr;
    m_options->getOption(vISA_LabelStr, labelPrefixC);
    labelPrefix = labelPrefixC;
    if (!labelPrefix.empty())
      labelPrefix += '_';
  }

  struct LabelerState {
    const KernelView *kv;
    const BlockOffsets &blockOffsets;
    const std::string labelPrefix;
    std::string labelStorage;
    LabelerState(const KernelView *_kv, const BlockOffsets &offs,
                 const std::string &lblPfx)
        : kv(_kv), blockOffsets(offs), labelPrefix(lblPfx) {}
  };
  LabelerState ls(&kv, blockOffsets, labelPrefix);

  // labeler callback for IGA; LabelerState provides the storage for the
  // returned label string
  auto labeler = [](int32_t pc, void *data) -> const char * {
    LabelerState &ls = *(LabelerState *)data;
    ls.labelStorage = ls.labelPrefix;
    auto itr = ls.blockOffsets.find(pc);
    if (itr == ls.blockOffsets.end()) {
      // let IGA choose the label name, but we still have to prepend
      // our user-provided prefix
      char igaDefaultLabel[128];
      ls.kv->getDefaultLabelName(pc, igaDefaultLabel, sizeof(igaDefaultLabel));
      ls.labelStorage += igaDefaultLabel;
      return ls.labelStorage.c_str();
    }
    std::string g4Label = itr->second.front().c_str();
    ls.labelStorage += g4Label;
    return ls.labelStorage.c_str();
  };

  // initialize register suppression info
  int suppressRegs[5] = {};
  int lastRegs[3] = {};
  for (int i = 0; i < 3; i++) {
    suppressRegs[i] = -1;
    lastRegs[i] = -1;
  }

  ////////////////////////////////////////
  // emit the program text (instructions) iteratively
  // this is a little tricky because G4 treats labels as instructions
  // thus we need to do a little checking to keep the two streams in sync
  int32_t pc = 0;
  std::vector<char> igaStringBuffer;
  igaStringBuffer.resize(512); // TODO: expand default after testing

  // printedLabels - tracks the labels that have already been printed at a
  // given pc, to avoid printing the same label twice at the same pc. This can
  // happen when there's an empty BB that contains only labels: the BB and the
  // following BB would both print those labels. Each entry is a (pc, label
  // name) pair.
  std::set<std::pair<int32_t, std::string>> printedLabels;
  // tryPrintLabel - check if the given label has already been printed at the
  // given pc. Print it if not; skip it otherwise.
  auto tryPrintLabel = [&os, &printedLabels](int32_t label_pc,
                                             const std::string &label_name) {
    auto label_pair = std::make_pair(label_pc, label_name);
    // skip if the same label is already in the set
    if (printedLabels.find(label_pair) != printedLabels.end())
      return;
    os << label_name << ":\n";
    printedLabels.insert(label_pair);
  };

  for (BB_LIST_ITER itBB = fg.begin(); itBB != fg.end(); ++itBB) {
    os << "// ";
    (*itBB)->emitBbInfo(os);
    os << "\n";
    for (INST_LIST_ITER itInst = (*itBB)->begin(); itInst != (*itBB)->end();
         ++itInst) {
      G4_INST *i = (*itInst);

      // walk to the next non-label in this block;
      // return true if we find one, false if we hit the end of the block
      auto findNextNonLabel = [&](bool print) {
        while ((*itInst)->isLabel()) {
          if (print)
            os << "// " << (*itInst)->getLabelStr() << ":\n";
          itInst++;
          if (itInst == (*itBB)->end())
            break;
        }
        if (itInst == (*itBB)->end())
          return false;
        i = (*itInst);
        return true;
      };

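      // Two cases to keep the G4 and IGA streams lined up at this PC:
      // 1) IGA considers this PC a branch target (isInstTarget): print the
      //    G4 labels recorded for this PC (or IGA's default label), then
      //    skip over any G4 label pseudo-instructions.
      // 2) Only G4 has a label here: print it as a comment and advance to
      //    the next non-label G4 instruction.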
      bool isInstTarget = kv.isInstTarget(pc);
      if (isInstTarget) {
        auto itr = ls.blockOffsets.find(pc);
        if (itr == ls.blockOffsets.end()) {
          std::string labelname(labeler(pc, &ls));
          tryPrintLabel(pc, labelname);
        } else {
          // there can be multiple labels per PC
          for (const std::string &lbl : itr->second) {
            std::string labelname(ls.labelPrefix + lbl);
            tryPrintLabel(pc, labelname);
          }
        }
        if (!findNextNonLabel(false)) {
          break; // at end of block
        }
      } else if (i->isLabel()) {
        // IGA doesn't consider this PC to be a label but G4 does
        //
        // move forward until we find the next non-label
        if (!findNextNonLabel(true)) {
          break; // at end of block
        }
      }

      ///////////////////////////////////////////////////////////////////
      // we are looking at a non-label G4_INST at the next valid IGA PC
      // (same instruction)
      if (!getOptions()->getOption(vISA_disableInstDebugInfo)) {
        (*itBB)->emitInstructionSourceLineMapping(os, itInst);
      }

      uint32_t fmtOpts =
          IGA_FORMATTING_OPTS_DEFAULT | IGA_FORMATTING_OPT_PRINT_BFNEXPRS;
      if (getOption(vISA_PrintHexFloatInAsm))
        fmtOpts |= IGA_FORMATTING_OPT_PRINT_HEX_FLOATS;
      if (!getOption(vISA_noLdStAsmSyntax))
        fmtOpts |= IGA_FORMATTING_OPT_PRINT_LDST;

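      // formatToInstToStream - format the IGA instruction at 'pc' into the
      // given stream, growing the shared igaStringBuffer when it is too small.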
      auto formatToInstToStream = [&](int32_t pc, std::ostream &os) {
        // multiple calls to getInstSyntax since we may have to
        // dynamically resize the buffer
        while (true) {
          size_t nw =
              kv.getInstSyntax(pc, igaStringBuffer.data(),
                               igaStringBuffer.size(), fmtOpts, labeler, &ls);
          if (nw == 0) {
            os << "<<error formatting instruction at "
                  "PC 0x"
               << std::uppercase << std::hex << pc << ">>\n";
            break;
          } else if (nw <= igaStringBuffer.size()) {
            // print it (pad it out so comments line up on most instructions)
            std::string line = igaStringBuffer.data();
            while (line.size() < 100)
              line += ' ';
            os << line;
            break;
          } else {
            igaStringBuffer.resize(igaStringBuffer.size() + 512);
            // try again with a larger buffer
          }
        }
      };

      // Advance PC when the vISA instruction needs to be cacheline-aligned or
      // requires a nop after it. See comments in precomputeBlockOffsets for
      // details.
      if (i->isCachelineAligned()) {
        iga::Op opcode = kv.getOpcode(pc);
        while (opcode == iga::Op::SYNC) {
          formatToInstToStream(pc, os);
          os << "\n";
          pc += kv.getInstSize(pc);
          opcode = kv.getOpcode(pc);
        }
      }
      if (i->requireNopAfter()) {
        formatToInstToStream(pc, os);
        os << "\n";
        pc += kv.getInstSize(pc);
        vASSERT(kv.getOpcode(pc) == iga::Op::NOP);
      }

      formatToInstToStream(pc, os);

      (*itBB)->emitBasicInstructionComment(os, itInst, suppressRegs, lastRegs,
                                           pc);
      os << "\n";

      pc += kv.getInstSize(pc);
    } // for insts in block
  } // for blocks
} // emitDeviceAsmInstructionsIga

// Should be removed once we can confirm no one uses it;
// the output comes from G4_INST::... and almost certainly won't be
// parsable by IGA
void G4_Kernel::emitDeviceAsmInstructionsOldAsm(std::ostream &os) {
  os << "\n"
     << ".code";
  for (BB_LIST_ITER it = fg.begin(); it != fg.end(); ++it) {
    os << "\n";
    (*it)->emit(os);
  }
  // Step4: emit clean-up.
  os << "\n";
  os << ".end_code"
     << "\n";
  os << ".end_kernel"
     << "\n";
  os << "\n";
}

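// Return the basic block that lexically follows 'bb' in the flow graph,
// or nullptr if 'bb' is null, not found, or the last block.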
G4_BB *G4_Kernel::getNextBB(G4_BB *bb) const {
  if (!bb)
    return nullptr;

  // Return the lexically following bb.
  G4_BB *nextBB = nullptr;
  for (auto it = fg.cbegin(), ie = fg.cend(); it != ie; it++) {
    auto curBB = (*it);
    if (curBB == bb) {
      it++;
      if (it != ie) {
        nextBB = (*it);
      }
      break;
    }
  }

  return nextBB;
}

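// Return the binary (gen) offset, in bytes, of the first instruction of 'bb'.
// If 'bb' has no instructions, walk forward to the first non-empty block in
// lexical order; return 0 if no instruction is found.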
unsigned G4_Kernel::getBinOffsetOfBB(G4_BB *bb) const {
  G4_INST *succInst = bb ? bb->getFirstInst() : nullptr;

  if (succInst != nullptr) {
    return (unsigned)succInst->getGenOffset();
  } else {
    G4_BB *succBB = bb ? getNextBB(bb) : nullptr;

    while ((succBB != nullptr) && (succInst == nullptr)) {
      succInst = succBB->getFirstInst();
      succBB = getNextBB(succBB);
    }

    if (succInst != nullptr) {
      return (unsigned)succInst->getGenOffset();
    } else {
      return 0;
    }
  }
}

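// The *NextOff helpers below return the binary offset of the code that
// follows the corresponding prolog section (per-thread payload, cross-thread
// payload, compute-FFID GP / GP1), or 0 when that prolog is not present.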
unsigned G4_Kernel::getPerThreadNextOff() const {
  if (!hasPerThreadPayloadBB())
    return 0;
  G4_BB *next = getNextBB(perThreadPayloadBB);
  return getBinOffsetOfBB(next);
}

unsigned G4_Kernel::getCrossThreadNextOff() const {
  if (!hasCrossThreadPayloadBB())
    return 0;
  G4_BB *next = getNextBB(crossThreadPayloadBB);
  return getBinOffsetOfBB(next);
}

unsigned G4_Kernel::getComputeFFIDGPNextOff() const {
  if (!hasComputeFFIDProlog())
    return 0;
  // return the offset of the second entry (GP1);
  // the first instruction in the second BB is the start of the second entry
  vISA_ASSERT(fg.getNumBB() > 1, "expect at least one prolog BB");
  vASSERT(!computeFFIDGP1->empty() && !computeFFIDGP1->front()->isLabel());
  return getBinOffsetOfBB(computeFFIDGP1);
}

unsigned G4_Kernel::getComputeFFIDGP1NextOff() const {
  if (!hasComputeFFIDProlog())
    return 0;
  // return the offset of the BB next to GP1;
  // the first instruction in the second BB is the start of the second entry
  vISA_ASSERT(fg.getNumBB() > 1, "expect at least one prolog BB");
  G4_BB *next = getNextBB(computeFFIDGP1);
  return getBinOffsetOfBB(next);
}

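// Size of the scalar register file (SRF) in words: the total byte size
// (scalar register count * bytes per register) divided by 2, a word being
// two bytes.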
unsigned G4_Kernel::getSRFInWords() {
  return (fg.builder->getNumScalarRegisters() *
          fg.builder->getScalarRegisterSizeInBytes()) /
         2;
}

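// Each Config entry is <numGRF, numThreads, SWSBTokens, numAcc>; e.g.
// Config(128, 8, 16, 4) describes a mode with 128 GRFs per HW thread,
// 8 HW threads per EU, 16 SWSB tokens, and 4 accumulators.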
// GRF modes supported by HW
// There must be at least one Config with VRTEnable set for each platform
GRFMode::GRFMode(const TARGET_PLATFORM plat, unsigned regSize, Options *op)
    : platform(plat), grfSize(regSize), options(op) {
  switch (platform) {
  case Xe_XeHPSDV:
  case Xe_DG2:
  case Xe_MTL:
  case Xe_ARL:
    configs.resize(2);
    // Configurations with <numGRF, numThreads, SWSBTokens, numAcc>
    configs[0] = Config(128, 8, 16, 4);
    configs[1] = Config(256, 4, 16, 8);
    defaultMode = 0;
    break;
  case Xe_PVC:
  case Xe_PVCXT:
  case Xe2:
    configs.resize(2);
    // Configurations with <numGRF, numThreads, SWSBTokens, numAcc>
    configs[0] = Config(128, 8, 16, 4);
    configs[1] = Config(256, 4, 32, 8);
    defaultMode = 0;
    break;
  case Xe3:
    configs.resize(7);
    // Configurations with <numGRF, numThreads, SWSBTokens, numAcc>
    configs[0] = Config(32, 10, 32, 4);
    configs[1] = Config(64, 10, 32, 4);
    configs[2] = Config(96, 10, 32, 4);
    configs[3] = Config(128, 8, 32, 4);
    configs[4] = Config(160, 6, 32, 4);
    configs[5] = Config(192, 5, 32, 4);
    configs[6] = Config(256, 4, 32, 8);
    defaultMode = 3;
    break;
  default:
    // platforms <= TGL
    configs.resize(1);
    // Configurations with <numGRF, numThreads, SWSBTokens, numAcc>
    configs[0] = {128, 7, 16, 2};
    defaultMode = 0;
  }
  currentMode = defaultMode;

  // Set lower bound GRF
  unsigned minGRF = op->getuInt32Option(vISA_MinGRFNum);
  lowerBoundGRF = minGRF > 0 ? minGRF : configs.front().numGRF;
  vISA_ASSERT(isValidNumGRFs(lowerBoundGRF),
              "Invalid lower bound for GRF number");

  // Set upper bound GRF
  unsigned maxGRF = op->getuInt32Option(vISA_MaxGRFNum);
  upperBoundGRF = maxGRF > 0 ? maxGRF : configs.back().numGRF;
  vISA_ASSERT(isValidNumGRFs(upperBoundGRF),
              "Invalid upper bound for GRF number");

  // Select a higher GRF mode if requested
  GRFModeUpValue = op->getuInt32Option(vISA_ForceGRFModeUp);
  vISA_ASSERT(GRFModeUpValue >= 0 && GRFModeUpValue <= configs.size(),
              "Invalid value for selecting a higher GRF mode");
}

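// Pick a GRF mode based on the pre-RA register pressure estimate (maxRP).
// The first enabled config within [lowerBoundGRF, upperBoundGRF] whose GRF
// count covers maxRP and leaves at least 8 GRFs above the kernel input is
// selected; the mode may then be bumped up (forceGRFModeUp) or relaxed to a
// slightly smaller config when spilling within the threshold is acceptable.
// Returns the GRF count of the selected mode.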
unsigned GRFMode::setModeByRegPressure(unsigned maxRP, unsigned largestInputReg,
                                       bool forceGRFModeUp) {
  unsigned size = configs.size(), i = 0;
  bool spillAllowed = getSpillThreshold() > 0;
  unsigned spillThresholdInRegs = getSpillThreshold() / grfSize;
  // find the appropriate GRF config based on reg pressure
  for (; i < size; i++) {
    if (configs[i].VRTEnable && configs[i].numGRF >= lowerBoundGRF &&
        configs[i].numGRF <= upperBoundGRF) {
      currentMode = i;
      if (maxRP <= configs[i].numGRF &&
          // Check that we've at least 8 GRFs over and above
          // those blocked for kernel input. This helps cases
          // where an 8 GRF variable shows up in entry BB.
          (largestInputReg + 8) <= configs[i].numGRF) {
        if (forceGRFModeUp && GRFModeUpValue > 0) {
          // Check if the user is forcing a higher GRF mode
          unsigned newGRFMode = currentMode + GRFModeUpValue;
          unsigned maxGRFMode = getMaxGRFMode();
          currentMode = newGRFMode < maxGRFMode ? newGRFMode : maxGRFMode;
        }

        if (spillAllowed && !hasSmallerGRFSameThreads() && currentMode > 0) {
          unsigned lowerGRFNum = getSmallerGRF();
          // Select a lower GRF number in PreRA in case the register
          // pressure computed is a bit higher (e.g. 4%) than the lower GRF
          // config. If spills are detected, RA will still bump up the GRF
          // number to avoid them.
          // For example, if reg pressure is 165, we select 160GRF since
          // we have the spill threshold enabled and the diff between 165
          // and 160 is less than 4%.
          if ((lowerGRFNum * 1.04 >= maxRP ||
               configs[currentMode].numGRF == getMaxGRF()) &&
              lowerGRFNum >= (largestInputReg + 8) &&
              lowerGRFNum >= lowerBoundGRF)
            setModeByNumGRFs(lowerGRFNum);
        }
        return configs[currentMode].numGRF;
      } else if (spillAllowed &&
                 maxRP <= configs[i].numGRF + spillThresholdInRegs &&
                 (largestInputReg + 8) <= configs[i].numGRF) {
        return configs[currentMode].numGRF;
      }
    }
  }
  // RP is greater than the maximum GRF available, so keep the largest GRF
  // config available
  return configs[currentMode].numGRF;
}

// Check if the next larger GRF config has the same number of threads per EU
bool GRFMode::hasLargerGRFSameThreads() const {
  unsigned largerGrfIdx = currentMode + 1;
  if (largerGrfIdx == configs.size() || !configs[largerGrfIdx].VRTEnable)
    return false;

  return configs[currentMode].numThreads == configs[largerGrfIdx].numThreads;
}

// Check if the next smaller GRF config has the same number of threads per EU
bool GRFMode::hasSmallerGRFSameThreads() const {
  int smallerGrfIdx = currentMode - 1;
  if (smallerGrfIdx < 0 || !configs[smallerGrfIdx].VRTEnable)
    return false;
  return configs[currentMode].numThreads == configs[smallerGrfIdx].numThreads;
}

// Get the spill threshold (in bytes) for the current GRF mode
unsigned GRFMode::getSpillThreshold() const {
  if (platform < Xe3)
    return 0;
  // FIXME: currently spill thresholds for <128GRF are
  // causing some performance regressions. We need more
  // study to define proper thresholds for this range.
  if (configs[currentMode].numGRF < 128)
    return 0;
  if (configs[currentMode].numGRF == 256 &&
      options->getuInt32Option(vISA_SpillAllowed256GRF) > 0)
    return options->getuInt32Option(vISA_SpillAllowed256GRF);

  return options->getuInt32Option(vISA_SpillAllowed);
}