// intel-graphics-compiler/visa/PrologEpilog.cpp
/*========================== begin_copyright_notice ============================
Copyright (C) 2023 Intel Corporation
SPDX-License-Identifier: MIT
============================= end_copyright_notice ===========================*/
#include "Assertions.h"
#include "FlowGraph.h"
#include "G4_Opcode.h"
#include "G4_Verifier.hpp"
#include "Optimizer.h"
#include "Timer.h"
#include <algorithm>
#include <array>
#include <fstream>
#include <map>
#include <sstream>
#include <vector>
using namespace vISA;
// A place for all kernel prolog/epilog related code.
// TODO: Currently prolog/epilog code is spread across multiple standalone
// functions with no clear ordering between them. It may be good to have a
// single PrologEpilog pass that clearly defines the order in which the
// different instructions are inserted.
// Prolog functions.
// Create a copy of R0 at the top of the kernel to support mid-thread
// preemption.
void Optimizer::createR0Copy() {
if (!builder.getIsKernel()) {
return;
}
// r0 copy is needed only if:
// a. the pre-emption VISA option is enabled, OR
// b. the current object is a kernel with stack calls, since the VISA ABI
// requires the r0 copy to be available in a pre-defined register
if (!R0CopyNeeded())
return;
// Skip copying of ``copy of R0'' if it's never assigned, a case where
// ``copy of R0'' is never used. As EOT always uses ``copy of R0'', that
// case only happens for synthetic tests where no practical code is
// generated.
if (!builder.getBuiltinR0()->getRegVar()->isPhyRegAssigned())
return;
G4_Declare *R0Dcl = builder.getRealR0();
G4_SrcRegRegion *R0Opnd =
builder.createSrcRegRegion(R0Dcl, builder.getRegionStride1());
G4_DstRegRegion *R0CopyOpnd =
builder.createDst(builder.getBuiltinR0()->getRegVar(), 0, 0, 1, Type_UD);
unsigned int options = InstOpt_WriteEnable;
unsigned numElt = kernel.getGRFSize() / TypeSize(Type_UD);
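// e.g. with a 32-byte GRF, numElt = 32 / 4 = 8, so a single (W) mov (8)
// below copies the whole of r0 (numbers illustrative)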
G4_INST *movInst = builder.createMov(G4_ExecSize(numElt),
R0CopyOpnd, R0Opnd, options, false);
for (G4_BB *bb : kernel.fg) {
INST_LIST_ITER ii = bb->begin();
INST_LIST_ITER iend = bb->end();
for (; ii != iend; ii++) {
G4_INST *inst = *ii;
if (inst->opcode() != G4_label) {
bb->insertBefore(ii, movInst);
return;
}
}
}
}
void Optimizer::initializePayload() {
if (!kernel.fg.builder->getIsKernel()) {
return;
}
const unsigned grfSize = kernel.getGRFSize();
unsigned inputEnd = grfSize;
unsigned inputCount = kernel.fg.builder->getInputCount();
for (unsigned id = 0; id < inputCount; id++) {
input_info_t *input_info = kernel.fg.builder->getInputArg(id);
unsigned argEnd = input_info->offset + input_info->size;
inputEnd = std::max(inputEnd, argEnd);
}
G4_BB *bb = kernel.fg.getEntryBB();
// iter points to the first non-label inst
auto iter = bb->begin(), bbEnd = bb->end();
while (iter != bbEnd) {
if (!(*iter)->isLabel()) {
break;
}
++iter;
}
const unsigned maxGRFNum = kernel.getNumRegTotal();
// First full GRF that needs to be initialized
unsigned regNum = (inputEnd + grfSize - 1) / grfSize;
// Initialize bulk of GRFs, two at a time
unsigned numElt = grfSize * 2 / TypeSize(Type_UD);
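// A worked example (numbers illustrative): with grfSize = 32,
// inputEnd = 200 and maxGRFNum = 128, regNum = (200 + 31) / 32 = 7 and
// numElt = 16, so the loop below emits SIMD16 movs zeroing r7..r126 in
// pairs; r127 is then handled by the single-GRF case that follows.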
while (regNum + 2 <= maxGRFNum) {
G4_Declare *tempDcl =
builder.createHardwiredDeclare(numElt, Type_UD, regNum, 0);
G4_DstRegRegion *dst =
builder.createDst(tempDcl->getRegVar(), 0, 0, 1, Type_UD);
G4_Imm *src0 = builder.createImm(0, Type_UD);
G4_INST *initInst = builder.createMov(G4_ExecSize(numElt), dst, src0,
InstOpt_WriteEnable, false);
bb->insertBefore(iter, initInst);
regNum += 2;
}
// Initialize the last GRF if the number of GRFs to clear was odd
if (regNum != maxGRFNum) {
vASSERT(regNum == maxGRFNum - 1);
numElt = grfSize / TypeSize(Type_UD);
G4_Declare *tempDcl =
builder.createHardwiredDeclare(numElt, Type_UD, regNum, 0);
G4_DstRegRegion *dst =
builder.createDst(tempDcl->getRegVar(), 0, 0, 1, Type_UD);
G4_Imm *src0 = builder.createImm(0, Type_UD);
G4_INST *initInst = builder.createMov(G4_ExecSize(numElt), dst, src0,
InstOpt_WriteEnable, false);
bb->insertBefore(iter, initInst);
}
// The GRF that needs to be partially initialized
regNum = inputEnd / grfSize;
// offset within the GRF at which to start initializing
unsigned subOffset = (inputEnd % grfSize);
// starting execution size for the byte-remainder initialization
unsigned execSize = grfSize / 2;
// use an already-initialized GRF as the src
unsigned grfSrc = maxGRFNum - 2;
// Initialize the remainder GRF, looping until all bytes within it are
// initialized; each outer iteration halves the execution size.
// Note: if the inputEnd offset is GRF-aligned, subOffset is 0 and there is
// no remainder; without that check the whole last payload register would
// be treated as the "remainder" and re-initialized.
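// A worked example (numbers illustrative): with grfSize = 32 and
// inputEnd = 70, regNum = 2, subOffset = 6 and execSize = 16:
//   execSize 16: init bytes [6..22), subOffset = 22
//   execSize 8: init bytes [22..30), subOffset = 30
//   execSize 4: no room left, skip
//   execSize 2: init bytes [30..32), subOffset = 32 -> done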
while (subOffset && (subOffset != grfSize)) {
while (subOffset + execSize <= grfSize) {
G4_Declare *tempDcl =
builder.createHardwiredDeclare(execSize, Type_UB, regNum, subOffset);
G4_DstRegRegion *dst =
builder.createDst(tempDcl->getRegVar(), 0, 0, 1, Type_UB);
vASSERT(grfSrc > regNum);
G4_Declare *tempDclSrc =
builder.createHardwiredDeclare(1, Type_UD, grfSrc, 0);
G4_SrcRegRegion *src0 = builder.createSrc(
tempDclSrc->getRegVar(), 0, 0, builder.getRegionScalar(), Type_UB);
G4_INST *initInst = builder.createMov(G4_ExecSize(execSize), dst, src0,
InstOpt_WriteEnable, false);
bb->insertBefore(iter, initInst);
subOffset += execSize;
}
// next lowest execution size
execSize = std::max(1U, execSize / 2);
}
// Initialize the flag registers
for (unsigned i = 0, e = builder.getNumFlagRegisters() / 2; i < e; ++i) {
G4_Declare *tmpFlagDcl = builder.createTempFlag(2);
tmpFlagDcl->getRegVar()->setPhyReg(builder.phyregpool.getFlagAreg(i), 0);
G4_DstRegRegion *tempPredVar =
builder.createDst(tmpFlagDcl->getRegVar(), 0, 0, 1, Type_UD);
G4_INST *predInst =
builder.createMov(g4::SIMD1, tempPredVar, builder.createImm(0, Type_UW),
InstOpt_WriteEnable, false);
bb->insertBefore(iter, predInst);
}
}
// create a prolog to set sr0 to the FFID (TGL WA).
// Do this only when there is a cr0 write inside the kernel
void Optimizer::addFFIDProlog() {
if (!builder.getIsKernel())
return;
FFID ffid =
static_cast<FFID>(builder.getOptions()->getuInt32Option(vISA_setFFID));
// return if FFID is not given
if (ffid == FFID_INVALID)
return;
// get r127.0 decl
G4_Declare *rtail = builder.createHardwiredDeclare(
8, Type_UD, kernel.getNumRegTotal() - 1, 0);
// (W) and (1|M0) r127.0 <1>:ud sr0.0 <0;1,0>:ud 0xF0FFFFFF:ud
auto createAnd = [this, &rtail]() {
auto src0 = builder.createSrc(builder.phyregpool.getSr0Reg(), 0, 0,
builder.getRegionScalar(), Type_UD);
auto src1 = builder.createImm(0xF0FFFFFF, Type_UD);
auto dst = builder.createDst(rtail->getRegVar(), 0, 0, 1, Type_UD);
return builder.createBinOp(G4_and, g4::SIMD1, dst, src0, src1,
InstOpt_WriteEnable, false);
};
// (W) or (1|M0) sr0.0<1>:ud r127.0<0;1,0>:ud imm:ud
auto createOr = [this, &rtail](uint32_t imm) {
auto src0 = builder.createSrc(rtail->getRegVar(), 0, 0,
builder.getRegionScalar(), Type_UD);
auto src1 = builder.createImm(imm, Type_UD);
auto dst =
builder.createDst(builder.phyregpool.getSr0Reg(), 0, 0, 1, Type_UD);
return builder.createBinOp(G4_or, g4::SIMD1, dst, src0, src1,
InstOpt_WriteEnable, false);
};
// (W) jmpi (1|M0) label
auto createJmpi = [this](G4_Label *label) {
return builder.createInternalInst(nullptr, G4_jmpi, nullptr, g4::NOSAT,
g4::SIMD1, nullptr, label, nullptr,
InstOpt_WriteEnable);
};
auto createLabelInst = [this](G4_Label *label) {
return kernel.fg.createNewLabelInst(label);
};
// for compute shaders, create two entries
if (ffid == FFID_GP || ffid == FFID_GP1) {
// Entry0: Set sr0 to FFID_GP (0x7)
// (W) and (1|M0) r127.0 <1>:ud sr0.0 <0;1,0>:ud 0xF0FFFFFF:ud
// (W) or (1|M0) sr0.0<1>:ud r127.0<0;1,0>:ud 0x07000000:ud
// jmpi ffid_prolog_end
// Entry1: Set sr0 to FFID_GP1 (0x8)
// (W) and (1|M0) r127.0 <1>:ud sr0.0 <0;1,0>:ud 0xF0FFFFFF:ud
// (W) or (1|M0) sr0.0<1>:ud r127.0<0;1,0>:ud 0x08000000:ud
// ffid_prolog_end:
// Put the entry0 block into a new BB, so that we can make it 64-bit
// aligned in BinaryEncodingIGA
G4_BB *entry_0_bb = kernel.fg.createNewBB();
entry_0_bb->push_back(createAnd());
entry_0_bb->push_back(createOr(0x07000000));
// get jmp target label. If the next bb has no label, create one and insert
// it at the beginning
G4_Label *jmp_label = nullptr;
vASSERT(kernel.fg.begin() != kernel.fg.end());
G4_BB *next_bb = *kernel.fg.begin();
if (next_bb->front()->isLabel()) {
jmp_label = next_bb->front()->getSrc(0)->asLabel();
} else {
jmp_label = builder.createLocalBlockLabel("ffid_prolog_end");
next_bb->insertBefore(next_bb->begin(), createLabelInst(jmp_label));
}
entry_0_bb->push_back(createJmpi(jmp_label));
// Put the rest in another BB
G4_BB *entry_1_bb = kernel.fg.createNewBB();
entry_1_bb->push_back(createAnd());
entry_1_bb->push_back(createOr(0x08000000));
// add these two BBs as the first two in the shader
kernel.fg.addPrologBB(entry_1_bb);
kernel.fg.addPrologBB(entry_0_bb);
kernel.setComputeFFIDGPBB(entry_0_bb);
kernel.setComputeFFIDGP1BB(entry_1_bb);
} else {
// for other shaders, set the FFID
// (W) and (1|M0) r127.0 <1>:ud sr0.0 <0;1,0>:ud 0xF0FFFFFF:ud
// (W) or (1|M0) sr0.0<1>:ud r127.0<0;1,0>:ud (FFID << 24):ud
G4_BB *bb = kernel.fg.createNewBB();
bb->push_back(createAnd());
bb->push_back(createOr(ffid << 24));
kernel.fg.addPrologBB(bb);
}
}
// clang-format off
///////////////////////////////////////////////////////////////////////////////
// Argument Loading for GPGPU
//
// Payload in Memory Payload in GRF
// (prepared by runtime) (for thread T[i])
//
// IndirectArgPtr = r0.0[31:6] + GeneralStateBase
//
// As an example, assume per thread data is 3 GRFs (numPerThreadGRF = 3)
//
// Memory: Register File:
//
// +---------------------+ <- [IndirectArgPtr e.g. r0.0[31:6]+GeneralStateBase]
// | implicit_args |
// | (if enabled) |
// +---------------------+ R1 +------------------------+ <-- perThreadLoadStartGRF
// | cross thread data | \ | |
// | | numCrossThreadDW | per thread data T[i] |
// | ... [ padding? * ] | / | |
// +---------------------+ <-- perThreadOffsetMem R4 +------------------------+ <- perThreadLoadStartGRF + numPerThreadGRF
// | | \ | inline data (optional) |
// | per thread data T0 | numPerThreadGRF R5 +------------------------+ <-- crossThreadLoadStartGRF
// | | / (GRFs) | cross thread data | |
// +---------------------+ | | numCrossThreadDW (Dwords)
// | | | | |
// | per thread data T1 | +------------------------+
// | | (NOTE: register numbers are examples)
// +---------------------+ vISA_loadThreadPayloadStartReg shifts payload in GRF
// ...
//
// * inline data comes from the compute walker command, not memory;
// "inline" (or immediate) with respect to the command streamer instructions
//
// * padding: vISA_crossThreadDataAlignment rounds the cross-thread memory
// section up so that per-thread blocks start aligned; successive
// per-thread blocks are GRF-aligned
//
// clang-format on
class PayloadLoader
{
IR_Builder &builder;
G4_Kernel &kernel;
FlowGraph &fg;
// if the inline data register is being used
const bool useInlineData;
// indirect data address is at r0.0[5:31]:d
// thread id in group is at r0.2[7:0]:d (same as r0.4[7:0]:w)
G4_Declare *r0;
// temp register to use for offset computation or load payload
G4_Declare *rtmp;
// see the image above
const uint32_t perThreadLoadStartGRF;
// final cross-thread size to be loaded as a number of DWs (including
// alignment); does not include the inline register argument
uint32_t numCrossThreadDW = 0;
// payload memory offset of where local id should be loaded from
// this is in bytes
uint32_t perThreadOffsetMem = 0;
// number of per-thread GRFs to be loaded (e.g. local ids)
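// (e.g. with ATTR_PerThreadInputSize = 96 and 64-byte GRFs, the
// constructor computes AlignUp(96, 64) / 64 = 2 GRFs of local IDs;
// numbers illustrative)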
const uint32_t numPerThreadGRF = 0;
// start GRF for the cross-thread load data
uint32_t crossThreadLoadStartGRF = 0;
std::vector<G4_INST *> instBuffer;
public:
PayloadLoader(IR_Builder &b, G4_Kernel &k, FlowGraph &_fg)
: builder(b), kernel(k), fg(_fg),
useInlineData(k.hasInlineData()),
r0(
b.createHardwiredDeclare(
k.numEltPerGRF<Type_UD>(), Type_UD, 0, 0)),
perThreadLoadStartGRF(
k.getOptions()->getuInt32Option(vISA_loadThreadPayloadStartReg)),
numPerThreadGRF(
AlignUp(k.getInt32KernelAttr(Attributes::ATTR_PerThreadInputSize),
k.numEltPerGRF<Type_UB>()) / k.numEltPerGRF<Type_UB>())
{
auto rtmpRegNum = k.getNumRegTotal() - 1;
rtmp = b.createHardwiredDeclare(k.numEltPerGRF<Type_UD>(), Type_UD, rtmpRegNum, 0);
r0->setName("r0");
rtmp->setName("rtmp");
// pre-compute various offsets into memory and GRF for later use
uint32_t crossThreadLoadStart = 0; // register file (grf) offset in byte
// cross thread size (not including inline data size and alignment)
int CTIS = kernel.getInt32KernelAttr(Attributes::ATTR_CrossThreadInputSize);
if (CTIS < 0) {
// per-thread payload vars
//
// N = inline data size.
// Cross-thread data size is aligned to 32 bytes; if inline data is used,
// the runtime puts the first N bytes of the payload in inline data and
// the rest of the payload is shifted in the buffer by N bytes, so payload
// args that started at offset N now start at offset 0. Because of this we
// need to adjust the local-ID offset:
const unsigned crossThreadDataAlignment =
builder.getuint32Option(vISA_crossThreadDataAlignment);
const uint32_t loadedCrossThreadInputSize =
findCrossThreadInputSize(crossThreadLoadStart);
const uint32_t inlineDataSize = builder.getInlineDataSize();
perThreadOffsetMem =
useInlineData ?
AlignUp(loadedCrossThreadInputSize + inlineDataSize,
crossThreadDataAlignment) - inlineDataSize :
AlignUp(loadedCrossThreadInputSize, crossThreadDataAlignment);
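// A worked example (numbers illustrative): with
// loadedCrossThreadInputSize = 100, inlineDataSize = 32 and
// crossThreadDataAlignment = 64, perThreadOffsetMem is
// AlignUp(132, 64) - 32 = 160 with inline data, or
// AlignUp(100, 64) = 128 without; numCrossThreadDW below then becomes
// 128 / 4 = 32.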
// cross-thread payload vars
numCrossThreadDW =
AlignUp(loadedCrossThreadInputSize, crossThreadDataAlignment) /
TypeSize(Type_UD);
crossThreadLoadStartGRF = crossThreadLoadStart / kernel.getGRFSize();
} else {
// per-thread payload vars
perThreadOffsetMem = CTIS;
if (useInlineData && builder.getInlineDataSize() >= perThreadOffsetMem)
{
perThreadOffsetMem = 0;
}
else if (useInlineData)
{
perThreadOffsetMem -= builder.getInlineDataSize();
}
// cross-thread payload vars
numCrossThreadDW = CTIS / TypeSize(Type_UD);
crossThreadLoadStartGRF = perThreadLoadStartGRF + numPerThreadGRF;
if (useInlineData) {
// first GRF of cross-thread data is already loaded
crossThreadLoadStartGRF++;
// FIXME: reduce "numCrossThreadDW" by the GRF size instead of the inline
// data size (builder.getInlineDataSize()) to work around OGL behavior
// where it sets ATTR_CrossThreadInputSize larger than the actual input
// size.
numCrossThreadDW =
numCrossThreadDW > kernel.numEltPerGRF<Type_UD>() ?
numCrossThreadDW - kernel.numEltPerGRF<Type_UD>() : 0;
}
}
} // PayloadLoader::PayloadLoader(...)
private:
// load <numTotalDW> DWs from the address "loadAddress" into GRFs starting
// at <startGRF> using hword/oword block loads
void loadFromMemoryHdcBti(G4_Declare *loadAddress,
uint32_t startGRF,
uint32_t numTotalDW)
{
auto getHWordBlockEncoding = [](uint32_t numHW) {
switch (numHW) {
case 1:
return 0x0;
case 2:
return 0x1;
case 4:
return 0x2;
case 8:
return 0x3;
default:
vISA_ASSERT_UNREACHABLE("unexpected number of HW");
return 0x0;
}
};
for (uint32_t numRemainingDW = numTotalDW, nextGRF = startGRF;
numRemainingDW > 0;
/* updated in body */)
{
// we can load 4, 2, or 1 GRF per send.
// Still load 1 GRF if numRemainingDW is less than 1 GRF; the additional
// bytes loaded won't be used.
uint32_t DWin4GRF = 4 * builder.numEltPerGRF<Type_UD>();
uint32_t DWin2GRF = DWin4GRF / 2;
uint32_t DWin1GRF = DWin2GRF / 2;
uint32_t numGRFToLoad = numRemainingDW >= DWin4GRF ? 4 : // 4 GRF
numRemainingDW >= DWin2GRF ? 2 : // 2 GRF
1; // 1 GRF or less than 1 GRF
bool useHword = builder.hasHWordBlockLoad();
uint32_t numElts =
(numGRFToLoad * kernel.getGRFSize()) / (useHword ? 32 : 16);
uint32_t dataBlocks = useHword
? getHWordBlockEncoding(numElts)
: (numElts == 2 ? 2 : (numElts == 4 ? 3 : 4));
// A32 unaligned hword/oword block read
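// A sketch of the fields encoded below; the meanings follow the usual
// HDC data-port descriptor layout and are informative rather than
// authoritative:
//   bits [28:25] message length (1, the address header)
//   bits [24:20] response length (numGRFToLoad)
//   bit 19       header present
//   bits [18:14] message type (DC_ALIGNED_OWORD_BLOCK_READ)
//   bit 13       HWord (vs OWord) block select
//   bits [10:8]  block count encoding (dataBlocks)
//   bits [7:0]   surface index (BTI 253)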
uint32_t msgDescVal = (1 << 25) | (numGRFToLoad << 20) | (1 << 19) |
(DC_ALIGNED_OWORD_BLOCK_READ << 14) |
((useHword ? 1 : 0) << 13) | (dataBlocks << 8) |
253;
auto desc = builder.createReadMsgDesc(SFID::DP_DC0, msgDescVal);
auto sendSrc =
builder.createSrcRegRegion(loadAddress, builder.getRegionStride1());
auto sendDstDcl =
builder.createHardwiredDeclare(numGRFToLoad * 8, Type_UD, nextGRF, 0);
auto sendDst = builder.createDstRegRegion(sendDstDcl, 1);
auto sendInst =
builder.createSendInst(nullptr, G4_send, g4::SIMD8, sendDst, sendSrc,
builder.createImm(msgDescVal, Type_UD),
InstOpt_WriteEnable | InstOpt_NoCompact, desc,
true);
instBuffer.push_back(sendInst);
if (numRemainingDW < DWin1GRF)
break;
numRemainingDW -= numGRFToLoad * builder.numEltPerGRF<Type_UD>();
nextGRF += numGRFToLoad;
if (numRemainingDW > 0) {
// advance the address offset
// (W) add (1) loadAddress.2 loadAddress.2 numGRFToLoad*sizeof(GRF)
auto addSrc0 = builder.createSrc(loadAddress->getRegVar(), 0, 2,
builder.getRegionScalar(), Type_UD);
auto addSrc1 = builder.createImm(
numGRFToLoad * kernel.numEltPerGRF<Type_UB>(), Type_UW);
auto addDst =
builder.createDst(loadAddress->getRegVar(), 0, 2, 1, Type_UD);
auto addInst =
builder.createBinOp(G4_add, g4::SIMD1,
addDst, addSrc0, addSrc1,
InstOpt_WriteEnable | InstOpt_NoCompact, false);
instBuffer.push_back(addInst);
}
}
} // loadFromMemoryHdcBti
// a helper for LSC loads that rounds a DW count to the max legal LSC
// element number;
// - this rounds down to a GRF multiple, or
// - up to a legal vector size (e.g. 5 -> 8)
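// e.g. (numbers illustrative, 16 DW per GRF): numDW = 40 is not a legal
// LSC vector size and exceeds one GRF, so this returns 32 (2 GRFs); the
// caller's next iteration then handles the remaining 8 DWs.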
uint32_t roundDwordsToLegalSize(uint32_t numDW) const {
if (builder.lscGetElementNum(numDW) != LSC_DATA_ELEMS_INVALID)
return numDW;
if (numDW > builder.numEltPerGRF<Type_UD>()) {
if (numDW > 64)
return (uint32_t)64;
else if (numDW > 32)
return (uint32_t)32;
else if (numDW > 16)
return (uint32_t)16;
else if (numDW > 8)
return (uint32_t)8;
vISA_ASSERT_UNREACHABLE("unreachable");
}
// when numDW is less than 1 GRF, we want to load it all within one send;
// the additional bytes being loaded won't be used, so this is fine
if (numDW < 2)
return (uint32_t)2;
else if (numDW < 4)
return (uint32_t)4;
else if (numDW < 8)
return (uint32_t)8;
else if (numDW < 16)
return (uint32_t)16;
vISA_ASSERT_UNREACHABLE("unreachable");
return (uint32_t)0;
}
// LSC allows transpose with V1, V2, V3, V4, V8, V16, V32, V64.
// We assume this is called in a descending sequence with register-sized
// chunks and then on down to sub-register size.
//
// Only the last load in a sequence may be smaller than a GRF and must
// round up.
// DWords:
// >=64 => d32x64t possible residue of next iteration
// 32-63 => d32x32t possible residue of next iteration
// 17-31 => d32x16t possible residue of next iteration
// Final Load Residues:
// 9-16 => d32x16t loads some padding
// 5-8 => d32x8t loads some padding
// 4,3,2,1 => d32x{4,3,2,1}t
//
// Thus, given V7 we need to load V8
//
uint32_t roundDwordsToLegalSizeLSC(uint32_t numDw) {
if (numDw >= 64) {
return 64; // 4GRF
} else if (numDw >= 32) {
return 32; // 2GRF
} else if (numDw > 8) {
return 16; // 1GRF (possibly padding)
} else if (numDw > 4) {
return 8; // half a GRF (possibly padding)
} else { // V1, V2, V3, V4
return numDw;
}
}
void loadFromMemoryLscBti(G4_Declare *baseLoadAddr,
uint32_t startGRF,
uint32_t numTotalDW)
{
G4_Declare *loadAddress = baseLoadAddr;
// Use immediate offsets to avoid the adds.
const uint32_t immOffOpts =
builder.getuint32Option(vISA_lscEnableImmOffsFor);
const bool useLscImmOff =
// HW supports it
builder.getPlatform() >= Xe2 &&
//
// BTI only gets 12b of range (signed+DW aligned) ~ 31 GRF
(numTotalDW * TypeSize(Type_UD)) <= ((1 << 11) - 4) &&
//
// enabled in options
((immOffOpts & (1 << VISA_LSC_IMMOFF_PAYLOAD_LOADING)) != 0) &&
//
// the payload address type is also enabled in options
(immOffOpts & (1 << getLscImmOffOpt(LSC_ADDR_TYPE_BTI))) != 0;
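// Note on the range check above: (1 << 11) - 4 = 2044 bytes is the
// largest positive, DW-aligned value of the signed 12-bit immediate
// offset; with 64-byte GRFs that covers roughly 31 GRFs of payload.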
for (uint32_t numRemainingDW = numTotalDW, nextGRF = startGRF;
numRemainingDW > 0;
/* updated in body */) {
// Generate an A32 transpose LSC load to BTI 255; size is d32x{16/32}t
LSC_OP op = LSC_LOAD;
LSC_SFID lscSfid = LSC_UGM;
LSC_CACHE_OPTS cacheOpts{LSC_CACHING_CACHED, LSC_CACHING_CACHED};
if (builder.getPlatformGeneration() >= PlatformGen::XE2) {
// use XE2+ L3 CC
cacheOpts = {LSC_CACHING_CACHED, LSC_CACHING_CONSTCACHED};
}
LSC_ADDR addrInfo{};
addrInfo.type = LSC_ADDR_TYPE_BTI;
addrInfo.size = LSC_ADDR_SIZE_32b;
addrInfo.immScale = 1;
addrInfo.immOffset = 0;
if (useLscImmOff) {
addrInfo.immOffset =
((int)nextGRF - startGRF) * (int)kernel.getGRFSize();
}
LSC_DATA_SHAPE dataShape{};
dataShape.size = LSC_DATA_SIZE_32b; // in the unit of 32b
dataShape.order = LSC_DATA_ORDER_TRANSPOSE;
uint32_t numDWToLoad = roundDwordsToLegalSize(numRemainingDW);
dataShape.elems = builder.lscGetElementNum(numDWToLoad);
G4_Imm *surfaceBTI = builder.createImm(255, Type_UW);
auto sendDstDcl =
builder.createHardwiredDeclare(numDWToLoad, Type_UD, nextGRF, 0);
auto dstRead = builder.createDstRegRegion(sendDstDcl, 1);
auto src0Addr = builder.createSrcRegRegion(
loadAddress, builder.getRegionStride1()); // address base
G4_InstSend *sendInst = nullptr;
G4_SendDescRaw *desc = builder.createLscMsgDesc(
op, lscSfid, EXEC_SIZE_1, cacheOpts, addrInfo, dataShape, surfaceBTI,
numDWToLoad < builder.numEltPerGRF<Type_UD>()
? 1
: numDWToLoad / builder.numEltPerGRF<Type_UD>(),
1, LdStAttrs::NONE);
sendInst =
builder.createLscSendInst(nullptr, dstRead, src0Addr, nullptr,
g4::SIMD1, desc,
InstOpt_WriteEnable | InstOpt_NoCompact,
LSC_ADDR_TYPE_BTI, 0x0, true);
instBuffer.push_back(sendInst);
// roundDwordsToLegalSize chooses to load all the data within one send
// when numRemainingDW is less than one GRF, so everything has been
// loaded at this point.
if (numRemainingDW < builder.numEltPerGRF<Type_UD>())
break;
numRemainingDW -= numDWToLoad;
nextGRF += numDWToLoad / builder.numEltPerGRF<Type_UD>();
bool advanceLoadAddress = numRemainingDW > 0;
advanceLoadAddress &= !useLscImmOff;
if (advanceLoadAddress) {
// advance the address offset
// (W) add (1) loadAddress.0 baseLoadAddr.0 numGRFLoadedInBytes
auto addSrc0 = builder.createSrcRegRegion(
baseLoadAddr, builder.getRegionScalar());
auto addSrc1 = builder.createImm(
(nextGRF - startGRF) * kernel.getGRFSize(), Type_UW);
vASSERT(loadAddress->getRegVar()->isPhyRegAssigned() &&
loadAddress->getRegVar()->getPhyReg()->isPhyGreg());
// Use a different GRF for the subsequent load-address computation to
// mitigate the WAR stall on the previous send's src. Use the current
// load's address GRF - 1 for the next one here, and fall back to the
// last GRF when it conflicts with the input.
// TODO: Consider moving prolog emission before local schedule or do
// hand schedule to hide the RAW dependence of send on the address GRF.
unsigned rTmpAddDst =
loadAddress->getRegVar()->getPhyReg()->asGreg()->getRegNum() - 1;
if (nextGRF * kernel.numEltPerGRF<Type_UD>() + numRemainingDW >
rTmpAddDst * kernel.numEltPerGRF<Type_UD>()) {
loadAddress = baseLoadAddr;
} else {
loadAddress =
builder.createHardwiredDeclare(1, Type_UD, rTmpAddDst, 0);
}
auto addDst = builder.createDstRegRegion(loadAddress, 1);
auto addInst =
builder.createBinOp(G4_add, g4::SIMD1,
addDst, addSrc0, addSrc1,
InstOpt_WriteEnable | InstOpt_NoCompact,
false);
instBuffer.push_back(addInst);
}
}
} // loadFromMemoryLscBti
void loadFromMemory(G4_Declare *loadAddress,
uint32_t startGRF,
uint32_t numTotalDW)
{
// We need to reserve at least 1 GRF for the offset computation or the
// load payload.
vISA_ASSERT(numTotalDW == 0 ||
(startGRF + (numTotalDW + kernel.numEltPerGRF<Type_UD>() - 1) /
kernel.numEltPerGRF<Type_UD>()) < (kernel.getNumRegTotal() - 1),
"The payload exceeds GRF capacity.");
if (builder.useLSCForPayloadLoad()) {
loadFromMemoryLscBti(loadAddress, startGRF, numTotalDW);
} else {
loadFromMemoryHdcBti(loadAddress, startGRF, numTotalDW);
}
}
// (W) add (1) rtmp.<subreg><1>:ud rtmp.<subreg><0;1,0>:ud <reloc imm>
void emitRelocAddInst(int subreg) {
auto dst = builder.createDst(rtmp->getRegVar(), 0, subreg, 1, Type_UD);
auto src0 = builder.createSrc(rtmp->getRegVar(), 0, subreg,
builder.getRegionScalar(), Type_UD);
auto src1 =
builder.createRelocImm(GenRelocType::R_SYM_ADDR_32,
CROSS_THREAD_OFF_R0_RELOCATION_NAME, 0, Type_UD);
auto addInst =
builder.createBinOp(G4_add, g4::SIMD1, dst, src0, src1,
InstOpt_WriteEnable | InstOpt_NoCompact, false);
RelocationEntry::createRelocation(builder.kernel, *addInst, 1,
CROSS_THREAD_OFF_R0_RELOCATION_NAME,
GenRelocType::R_SYM_ADDR_32);
instBuffer.push_back(addInst);
}
// helper function to find the size of the cross-thread data that needs to
// be loaded
// * loadStartOffset - output parameter receiving the offset of the first
//   cross-thread input that gets loaded
// * returns the size of the cross-thread section that must be loaded
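// A worked example (numbers illustrative): with startGRF = 1 and a
// 32-byte GRF, inputsStart = 32; with PTIS aligned to 96, an input at
// offset 160 of size 8 is cross-thread data (160 >= 32 + 96). If it is
// the first non-inlined cross-thread input and the last input overall,
// loadStartOffset = 160 and the function returns 168 - 160 = 8.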
uint32_t findCrossThreadInputSize(uint32_t &loadStartOffset) const {
const uint32_t startGRF =
kernel.getOptions()->getuInt32Option(vISA_loadThreadPayloadStartReg);
const uint32_t inputsStart = startGRF * kernel.getGRFSize();
const uint32_t inputCount = kernel.fg.builder->getInputCount();
const int PTIS =
AlignUp(kernel.getInt32KernelAttr(Attributes::ATTR_PerThreadInputSize),
kernel.getGRFSize());
const uint32_t inlineDataSize = builder.getInlineDataSize();
// Checks if input_info is cross-thread-input
auto isInCrossThreadData = [&](const input_info_t *const input_info) {
return (uint32_t)input_info->offset >= inputsStart + PTIS;
};
// Checks if input_info fits in inlineData
auto isInInlineData = [&](const input_info_t *const input_info) {
if (!useInlineData) {
return false;
}
uint32_t inputEnd = input_info->offset + input_info->size;
bool fitsInInlineData = inputEnd <= inputsStart + PTIS + inlineDataSize;
return isInCrossThreadData(input_info) && fitsInInlineData;
};
uint32_t firstNotInlinedCrossThreadInput =
std::numeric_limits<uint32_t>::max();
uint32_t inputEnd = 32;
// iterate over inputs and find:
// - where they end
// - where first not inlined cross thread input is
for (unsigned int id = 0; id < inputCount; id++) {
const input_info_t *input_info = kernel.fg.builder->getInputArg(id);
// skip pseudo input for register bindings.
if (input_info->isPseudoInput()) {
continue;
}
if (kernel.fg.builder->getFCPatchInfo()->getIsEntryKernel()) {
const vISA::G4_Declare *dcl = input_info->dcl;
if (INPUT_GENERAL == input_info->getInputClass() && !(dcl->isLiveIn())) {
break;
}
}
if (inputEnd < (unsigned)(input_info->size + input_info->offset)) {
inputEnd = input_info->size + input_info->offset;
}
// find the first cross-thread input position that is not delivered in
// inlineData
if (isInCrossThreadData(input_info) && !isInInlineData(input_info) &&
firstNotInlinedCrossThreadInput > (uint32_t)input_info->offset) {
firstNotInlinedCrossThreadInput = input_info->offset;
}
}
loadStartOffset = firstNotInlinedCrossThreadInput;
// check if we have anything to load
if (firstNotInlinedCrossThreadInput == std::numeric_limits<uint32_t>::max()) {
return 0;
}
return inputEnd - firstNotInlinedCrossThreadInput;
} // findCrossThreadInputSize
// (W) and (1) rtmp.<subreg><1>:ud r0.0<0;1,0>:ud 0xFFFFFFC0
void getStartAddrInst(int subreg) {
auto src0 = builder.createSrc(r0->getRegVar(), 0, 0,
builder.getRegionScalar(), Type_UD);
const uint32_t ArgOffsetMask = 0xFFFFFFC0;
auto src1 = builder.createImm(ArgOffsetMask, Type_UD);
auto dst = builder.createDst(rtmp->getRegVar(), 0, subreg, 1, Type_UD);
auto andInst = builder.createBinOp(G4_and, g4::SIMD1, dst, src0, src1,
InstOpt_WriteEnable | InstOpt_NoCompact,
false);
instBuffer.push_back(andInst);
}
// (W) mov (ExecSize) rtmp.0:ud 0x0
void clearTmpRegister() {
auto src0 = builder.createImm(0, Type_UD);
auto dst = builder.createDstRegRegion(rtmp, 1);
G4_ExecSize execSize(kernel.getGRFSize() / 4);
auto movInst =
builder.createMov(execSize, dst, src0,
InstOpt_WriteEnable | InstOpt_NoCompact, false);
instBuffer.push_back(movInst);
};
// (W) mov (NumDwords) dstGRF:ud srcGRF:ud
//
// Moves the inline argument GRF
void emitMovInlineData(int dstGRF, int srcGRF, uint32_t numDWord) {
if (dstGRF == srcGRF) {
return;
}
G4_Declare *srcDcl =
builder.createHardwiredDeclare(numDWord, Type_UD, srcGRF, 0);
srcDcl->setName("inlineRegFromTDL");
G4_Declare *dstDcl =
builder.createHardwiredDeclare(numDWord, Type_UD, dstGRF, 0);
dstDcl->setName("inlineRegExpectedLocation");
auto movInst =
builder.createMov(
G4_ExecSize(numDWord), builder.createDstRegRegion(dstDcl, 1),
builder.createSrcRegRegion(srcDcl, builder.getRegionStride1()),
InstOpt_WriteEnable | InstOpt_NoCompact, false);
instBuffer.push_back(movInst);
}
void appendLabel(const char *label) {
G4_INST *lbl =
kernel.fg.createNewLabelInst(builder.createLabel(label, LABEL_BLOCK));
instBuffer.push_back(lbl);
}
public:
// emit the sequence that loads the thread payload (per-thread and
// cross-thread) from memory
void emitLoadSequence()
{
// the subregister that the header takes the address from is
// addr.2:d for OWord block load and addr.0:d for LSC
const int addrSubreg = builder.useLSCForPayloadLoad() ? 0 : 2;
G4_BB *perThreadBB = nullptr;
// Load per-thread data, if any. Per-thread data always starts from r1.
// This is a fixed-size block of 8 instructions (nop-padded as necessary),
// which may be skipped by the runtime if the local IDs are auto-generated
// by HW.
//
// The size of this first block must be a multiple of 64B so that the
// forward start label is 64B aligned.
if (builder.needsToLoadLocalID()) {
appendLabel("per_thread_prolog");
// compute per-thread starting address into (rtmp.2)
// (W) mov (ExecSize) rtmp.0:ud 0x0
// (W) and (1) rtmp.2<1>:ud r0.0<0;1,0>:ud 0xFFFFFFC0 // start address
// (W) and (1) rtmp.0:uw r0.4:uw(tid) 0xFF // tid
// (W) add (1) rtmp.2 rtmp.2 cross_thread_size
// (W) mad (1) rtmp.2 rtmp.2 rtmp.0 per_thread_size
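// e.g. (numbers illustrative) for the thread with tid = 3,
// cross_thread_size = 192 and per_thread_size = 2 GRFs of 64 bytes:
// rtmp.2 = start_address + 192 + 3 * 128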
clearTmpRegister();
getStartAddrInst(2);
// (W) and (1) rtmp.0:uw r0.4:uw(tid) 0xFF // tid
auto andSrc0 = builder.createSrc(r0->getRegVar(), 0, 4,
builder.getRegionScalar(), Type_UW);
auto andSrc1 = builder.createImm(0xFF, Type_UW);
auto andDst = builder.createDst(rtmp->getRegVar(), 0, 0, 1, Type_UW);
auto andInst =
builder.createBinOp(G4_and, g4::SIMD1, andDst, andSrc0, andSrc1,
InstOpt_WriteEnable | InstOpt_NoCompact, false);
instBuffer.push_back(andInst);
// (W) add (1) rtmp.2 rtmp.2 cross_thread_size
auto addSrc0 = builder.createSrc(rtmp->getRegVar(), 0, 2,
builder.getRegionScalar(), Type_UD);
// create a relocation for cross_thread_size (per_thread_payload_offset).
// In case the cross_thread_size changes after compilation (e.g. a
// gtpin-inserted argument), the relocation needs to be resolved to the
// new cross_thread_size.
G4_Operand *addSrc1 =
builder.createRelocImm(GenRelocType::R_SYM_ADDR_32,
PER_THREAD_OFF_RELOCATION_NAME, perThreadOffsetMem, Type_UD);
auto addDst = builder.createDst(rtmp->getRegVar(), 0, 2, 1, Type_UD);
// an instruction that has a relocation must not be compacted
auto addInst =
builder.createBinOp(G4_add, g4::SIMD1, addDst, addSrc0, addSrc1,
InstOpt_WriteEnable | InstOpt_NoCompact, false);
#if 0
// the relocation entry is disabled because gtpin is able to recognize the
// instruction pattern and doesn't rely on this relocation. We still mark
// addSrc1 as RelocImm (so the relocation name is printed in the vISA
// dump), but the relocation entry won't be emitted to zebin
RelocationEntry::createRelocation(builder.kernel, *addInst, 1,
PER_THREAD_OFF_RELOCATION_NAME,
GenRelocType::R_SYM_ADDR_32);
#endif
instBuffer.push_back(addInst);
if (kernel.getOption(vISA_emitCrossThreadOffR0Reloc)) {
// per thread payload is stored after cross thread
// payload in memory. when implicit arg buffer
// pointer is present, we need to shift load address
// of per thread payload as well.
emitRelocAddInst(2);
}
// (W) mad (1) rtmp.2 rtmp.2 rtmp.0 per_thread_size
auto madSrc0 = builder.createSrc(rtmp->getRegVar(), 0, 2,
builder.getRegionScalar(), Type_UD);
auto madSrc1 = builder.createSrc(rtmp->getRegVar(), 0, 0,
builder.getRegionScalar(), Type_UW);
auto madSrc2 = builder.createImm(
numPerThreadGRF * kernel.numEltPerGRF<Type_UB>(), Type_UW);
auto madDst =
builder.createDst(rtmp->getRegVar(), 0, addrSubreg, 1, Type_UD);
auto madInst = builder.createInternalInst(
nullptr, G4_mad, nullptr, g4::NOSAT, g4::SIMD1, madDst, madSrc0,
madSrc1, madSrc2, InstOpt_WriteEnable | InstOpt_NoCompact);
instBuffer.push_back(madInst);
if (builder.getOption(vISA_useInlineData)) {
// copy inline data to the first GRF of cross-thread-data
// e.g. (W) mov (8) inlineDataReg.0:ud r1.0:ud
// Inline data size is 8 DWords.
emitMovInlineData(perThreadLoadStartGRF + numPerThreadGRF,
perThreadLoadStartGRF,
builder.getInlineDataSize()/TypeSize(Type_UD));
}
loadFromMemory(rtmp, perThreadLoadStartGRF,
numPerThreadGRF * builder.numEltPerGRF<Type_UD>());
perThreadBB = kernel.fg.createNewBB();
std::for_each(instBuffer.begin(), instBuffer.end(),
[](G4_INST *inst) { inst->invalidateVISAId(); });
perThreadBB->insert(perThreadBB->begin(), instBuffer.begin(),
instBuffer.end());
instBuffer.clear();
kernel.setPerThreadPayloadBB(perThreadBB);
} // builder.needsToLoadLocalID()
// code for loading the cross-thread data
if (builder.needsToLoadCrossThreadConstantData()) {
G4_BB *crossThreadBB = kernel.fg.createNewBB();
appendLabel("cross_thread_prolog");
if (!builder.useLSCForPayloadLoad()) {
// we must clear rtmp again as the per-thread loading code may not be
// executed
clearTmpRegister();
}
getStartAddrInst(addrSubreg);
if (kernel.getOption(vISA_emitCrossThreadOffR0Reloc)) {
// emit an add with a relocatable imm operand.
// When this is true, the runtime loads the global state buffer address
// into r0.0[5:31], while the kernel cross-thread data is written to some
// other memory location. The runtime is required to patch this
// relocatable immediate operand to allow correct loading of the
// cross-thread data.
emitRelocAddInst(addrSubreg);
}
// per discussions with the OCL runtime team, the first GRF of the
// cross-thread data will be loaded automatically as the inline data, and
// it will be either at R1 (if the local IDs are not auto-generated) or
// R1 + sizeof(local id) (if they are).
loadFromMemory(rtmp, crossThreadLoadStartGRF, numCrossThreadDW);
std::for_each(instBuffer.begin(), instBuffer.end(),
[](G4_INST *inst) { inst->invalidateVISAId(); });
// Create separate blocks instead of directly inserting into the old
// entryBB. This is for the situation where the entry BB is part of a
// loop, as we don't want the prolog to be executed multiple times.
crossThreadBB->insert(crossThreadBB->begin(), instBuffer.begin(),
instBuffer.end());
instBuffer.clear();
kernel.fg.addPrologBB(crossThreadBB);
kernel.setCrossThreadPayloadBB(crossThreadBB);
}
if (perThreadBB) {
kernel.fg.addPrologBB(perThreadBB);
}
} // emitLoadSequence
}; // class PayloadLoader
void Optimizer::loadThreadPayload() {
if (!builder.loadThreadPayload() || !builder.getIsKernel()) {
return;
}
PayloadLoader pl {builder, kernel, fg};
pl.emitLoadSequence();
}
// Some platforms require that the first instruction of any kernel have a
// non-zero emask, i.e. emask != 0, by setting the MaskCtrl bit to 1:
// WriteEnable (NoMask)
//
// This can be done by introducing a dummy instruction for example:
// (W) mov(1) null:ud 0x0:ud
void Optimizer::addEmaskSetupProlog() {
if (!builder.needEmaskSetupProlog())
return;
// Only apply the WA to the kernel which is the actual entry point.
if (!builder.getIsKernel())
return;
// When the kernel has no prolog and the first inst has zero emask, insert
// a dummy WA inst with WriteEnable.
G4_BB *entry = kernel.fg.getEntryBB();
if (!entry)
return;
G4_INST *first = entry->getFirstInst();
if (first && !first->isWriteEnableInst()) {
G4_BB *bb = kernel.fg.createNewBB();
G4_INST *mov = builder.createMov(g4::SIMD1, builder.createNullDst(Type_UD),
builder.createImm(0, Type_UD),
InstOpt_WriteEnable, false);
bb->push_back(mov);
kernel.fg.addPrologBB(bb);
}
}
// Some platforms/shaders require a memory fence at kernel entry.
// This needs to be called before RA since the fence may have a (dummy)
// destination.
void Optimizer::insertFenceAtEntry() {
// for the scalar path this option was used and is still used
bool injectEntryFences = builder.getOption(vISA_InjectEntryFences);
// for the vector path this option is the same as vISA_LSCBackupMode,
// which is, in turn, the same as the value in the WA table
if (kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_CM) {
injectEntryFences = injectEntryFences ||
builder.getOption(vISA_LSCBackupMode) ||
VISA_WA_CHECK(builder.getPWaTable(), Wa_14010198302);
const_cast<Options *>(builder.getOptions())
->setOption(vISA_LSCBackupMode, injectEntryFences);
}
if (injectEntryFences) {
auto entryBB = kernel.fg.getEntryBB();
auto iter = std::find_if(entryBB->begin(), entryBB->end(),
[](G4_INST *inst) { return !inst->isLabel(); });
builder.instList.clear();
builder.translateLscFence(nullptr, SFID::UGM, LSC_FENCE_OP_EVICT,
LSC_SCOPE_GPU);
// according to architects the invalidate fence should not use backup mode
const_cast<Options *>(builder.getOptions())
->setOption(vISA_LSCBackupMode, false);
builder.translateLscFence(nullptr, SFID::UGM, LSC_FENCE_OP_INVALIDATE,
LSC_SCOPE_GPU);
const_cast<Options *>(builder.getOptions())
->setOption(vISA_LSCBackupMode, true);
entryBB->insert(iter, builder.instList.begin(), builder.instList.end());
builder.instList.clear();
}
}
// Reset a0 to 0 at the beginning of the shader if the shader uses a VxH
// a0 source
void Optimizer::resetA0() {
// check all instructions to see if VxH a0 src is used
// only reset A0 when it's used
bool hasA0 = false;
for (auto bb : kernel.fg) {
for (auto inst : *bb) {
// VxH must be in src0
if (inst->getSrc(0) && inst->getSrc(0)->isSrcRegRegion() &&
inst->getSrc(0)->asSrcRegRegion()->isIndirect() &&
inst->getSrc(0)->asSrcRegRegion()->getRegion()->isRegionWH()) {
hasA0 = true;
break;
}
}
if (hasA0)
break;
}
if (!hasA0)
return;
// insert "mov (16) a0.0:uw 0x0:uw" at the beginning of the shader
if (kernel.fg.begin() != kernel.fg.end()) {
G4_BB *bb = *kernel.fg.begin();
auto insertIt = std::find_if(
bb->begin(), bb->end(), [](G4_INST *inst) { return !inst->isLabel(); });
if (builder.supportNativeSIMD32()) {
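// With native SIMD32 there are 32 UW address subregisters; clear them
// with two SIMD16 movs (a0.0..a0.15, then a0.16..a0.31), presumably
// because a single mov cannot cover all of them.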
bb->insertBefore(
insertIt,
builder.createMov(G4_ExecSize(16),
builder.createDst(builder.phyregpool.getAddrReg(),
0, 0, 1, Type_UW),
builder.createImm(0, Type_UW), InstOpt_WriteEnable,
false));
bb->insertBefore(
insertIt,
builder.createMov(G4_ExecSize(16),
builder.createDst(builder.phyregpool.getAddrReg(),
0, 16, 1, Type_UW),
builder.createImm(0, Type_UW), InstOpt_WriteEnable,
false));
} else {
bb->insertBefore(
insertIt,
builder.createMov(G4_ExecSize(builder.getNumAddrRegisters()),
builder.createDst(builder.phyregpool.getAddrReg(),
0, 0, 1, Type_UW),
builder.createImm(0, Type_UW), InstOpt_WriteEnable,
false));
}
}
}
// Epilog functions.
// Some platforms/shaders require a memory fence before the end of thread.
// ToDo: add the fence only when writes can reach EOT without a fence in
// between
void Optimizer::insertFenceBeforeEOT() {
// If vISA_removeFence is set, try to remove fence on UGM if there
// is no write to UGM in the entire kernel.
const bool toRemoveFence = builder.getOption(vISA_removeFence);
bool needLscUgmFence = false; // true if fence is needed.
// for the scalar path this option was used and is still used
bool clearHdcWritesLSCUGM =
builder.getOption(vISA_clearLSCUGMWritesBeforeEOT);
bool clearHDCWritesBeforeEOT =
builder.getOption(vISA_clearHDCWritesBeforeEOT);
bool clearWritesBeforeEOT = builder.needBarrierWA() && builder.supportsLSC();
// for the vector path we always need this WA, so just use the WA table
if (kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_CM) {
clearHDCWritesBeforeEOT =
clearHDCWritesBeforeEOT ||
VISA_WA_CHECK(builder.getPWaTable(), Wa_1807084924);
clearHdcWritesLSCUGM = clearHdcWritesLSCUGM ||
VISA_WA_CHECK(builder.getPWaTable(), Wa_22013689345);
}
if (!toRemoveFence && !clearHDCWritesBeforeEOT &&
!(builder.supportsLSC() && clearHdcWritesLSCUGM) &&
!clearWritesBeforeEOT) {
return;
}
if (!kernel.fg.builder->getIsKernel()) {
// we don't allow a function to exit
return;
}
bool hasUAVWrites = false;
bool hasSLMWrites = false;
bool hasTypedWrites = false;
bool hasWrites = false;
std::list<std::pair<G4_BB *, G4_INST *>> toBeRemoved;
for (auto bb : kernel.fg) {
if (bb->isEndWithFCall()) {
// conservatively assume we need a fence
// ToDo: we don't need an SLM fence if the kernel doesn't use SLM, since a
// function can't allocate SLM on its own. We could move this WA to IGC
// for more precise analysis
hasUAVWrites = true;
hasSLMWrites = true;
hasTypedWrites = true;
hasWrites = true;
break;
}
for (auto inst : *bb) {
if (inst->isSend() && !inst->isEOT()) {
auto msgDesc = inst->asSendInst()->getMsgDesc();
if (msgDesc->isLSC()) {
if (toRemoveFence && msgDesc->getSFID() == SFID::UGM &&
msgDesc->isFence()) {
toBeRemoved.push_back(std::make_pair(bb, inst));
}
}
// Skip fence (fence is both write/read)
if (msgDesc->isFence()) {
continue;
}
if (msgDesc->isWrite()) {
hasWrites = true;
if (msgDesc->isHDC()) {
if (msgDesc->isSLM()) {
hasSLMWrites = true;
} else if (msgDesc->isRaw() && ((const G4_SendDescRaw *)msgDesc)
->isHdcTypedSurfaceWrite()) {
hasTypedWrites = true;
} else {
hasUAVWrites = true;
if (builder.supportsLSC() && clearHdcWritesLSCUGM &&
!msgDesc->isScratch()) {
// These HDC messages will go through LSC, so they need the WA too.
needLscUgmFence = true;
}
}
}
if (msgDesc->isLSC()) {
switch (msgDesc->getSFID()) {
case SFID::UGM: {
hasUAVWrites = true;
if (clearHdcWritesLSCUGM) {
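// An interpretation of the two cases below (not authoritative):
// case 1: atomics that return no value, i.e. pure writes
// case 2: plain stores that are neither scratch writes nor
// L1 write-back/streaming, i.e. writes that may still be in
// flight at EOT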
if ((msgDesc->isAtomic() && !msgDesc->isRead()) || // case 1
(!msgDesc->isAtomic() && // case 2
!msgDesc->isScratchWrite() &&
!(msgDesc->getCachingL1() == Caching::WB ||
msgDesc->getCachingL1() == Caching::ST))) {
needLscUgmFence = true;
}
}
break;
}
case SFID::SLM:
hasSLMWrites = true;
break;
case SFID::TGM:
hasTypedWrites = true;
break;
default:
break; // ignore other SFID
}
}
}
}
}
}
if (toRemoveFence && !toBeRemoved.empty() && !hasUAVWrites) {
for (const auto &II : toBeRemoved) {
G4_BB *aBB = II.first;
G4_INST *aInst = II.second;
aBB->remove(aInst);
}
toBeRemoved.clear();
}
if ((!clearHDCWritesBeforeEOT &&
!(builder.supportsLSC() && clearHdcWritesLSCUGM) &&
!clearWritesBeforeEOT) ||
!(hasUAVWrites || hasSLMWrites || hasTypedWrites || hasWrites)) {
return;
}
for (auto bb : kernel.fg) {
if (bb->isLastInstEOT()) {
auto iter = std::prev(bb->end());
if (builder.supportsLSC() && clearHdcWritesLSCUGM) {
if (needLscUgmFence) {
G4_INST *fenceInst = nullptr;
if (builder.getPlatform() == Xe_PVCXT) {
fenceInst = builder.translateLscFence(
nullptr, SFID::UGM, LSC_FENCE_OP_NONE, LSC_SCOPE_TILE);
} else {
// use fence.ugm.6.tile. 6 is reserved and is the same as none.
fenceInst = builder.translateLscFence(
nullptr, SFID::UGM, LSC_FENCE_OP_TYPE6, LSC_SCOPE_TILE);
}
bb->insertBefore(iter, fenceInst);
}
}
if (clearHDCWritesBeforeEOT) {
if (builder.supportsLSC()) {
if (hasTypedWrites) {
auto fenceInst = builder.translateLscFence(
nullptr, SFID::TGM, LSC_FENCE_OP_NONE, LSC_SCOPE_LOCAL);
bb->insertBefore(iter, fenceInst);
}
// If needLscUgmFence is true, the fence has already been added; skip the
// following.
if (hasUAVWrites && !needLscUgmFence) {
auto fenceInst = builder.translateLscFence(
nullptr, SFID::UGM, LSC_FENCE_OP_NONE, LSC_SCOPE_LOCAL);
bb->insertBefore(iter, fenceInst);
}
if (hasSLMWrites && !hasUAVWrites) {
// a UGM fence takes care of SLM as well
auto fenceInst = builder.translateLscFence(
nullptr, SFID::SLM, LSC_FENCE_OP_NONE, LSC_SCOPE_LOCAL);
bb->insertBefore(iter, fenceInst);
}
} else {
if (builder.getPlatform() == GENX_ICLLP) {
hasTypedWrites =
false; // Workaround Under debug and being clarified
hasSLMWrites = false; // Workaround not needed for ICL SLM Writes
}
if (hasUAVWrites || hasTypedWrites) {
auto fenceInst = builder.createFenceInstructionPreLSC(
nullptr, 0, true, true, false);
bb->insertBefore(iter, fenceInst);
}
if (hasSLMWrites) {
auto fenceInst = builder.createFenceInstructionPreLSC(
nullptr, 0, true, false, false);
bb->insertBefore(iter, fenceInst);
}
}
}
if (clearWritesBeforeEOT && hasWrites) {
auto fenceInst = builder.translateLscFence(
nullptr, SFID::UGM, LSC_FENCE_OP_EVICT, LSC_SCOPE_TILE);
bb->insertBefore(iter, fenceInst);
}
builder.instList.clear();
}
}
}
// Some platforms require an extra instruction before an EOT to
// ensure that all outstanding scratch writes are globally observed
void Optimizer::insertScratchReadBeforeEOT() {
int globalScratchOffset =
kernel.getInt32KernelAttr(Attributes::ATTR_SpillMemOffset);
if (builder.needFenceBeforeEOT() ||
(globalScratchOffset == 0 &&
builder.getJitInfo()->stats.spillMemUsed == 0)) {
return;
}
struct ScratchReadDesc {
uint32_t addrOffset : 12;
uint32_t dataElements : 2;
uint32_t reserved : 3;
uint32_t opType : 2;
uint32_t header : 1;
uint32_t resLen : 5;
uint32_t msgLen : 4;
uint32_t reserved2 : 3;
};
union {
uint32_t value;
ScratchReadDesc layout;
} desc;
// msg desc for 1GRF scratch block read
desc.value = 0;
desc.layout.opType = 2;
desc.layout.header = 1;
desc.layout.resLen = 1;
desc.layout.msgLen = 1;
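// Assuming LSB-first bitfield packing (the common ABI behavior), this
// works out to desc.value = (2 << 17) | (1 << 19) | (1 << 20) |
// (1 << 25) = 0x21C0000.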
for (auto bb : kernel.fg) {
if (bb->isLastInstEOT()) {
auto iter = std::prev(bb->end());
if (builder.getPlatformGeneration() >= PlatformGen::GEN10) {
// an HDC fence is more efficient in this case
// fence with commit enable
int fenceDesc =
G4_SendDescRaw::createDesc((0x7 << 14) | (1 << 13), true, 1, 1);
auto msgDesc = builder.createSyncMsgDesc(SFID::DP_DC0, fenceDesc);
auto src = builder.createSrcRegRegion(builder.getBuiltinR0(),
builder.getRegionStride1());
auto dst = builder.createDstRegRegion(builder.getBuiltinR0(), 1);
G4_INST *inst =
builder.createSendInst(nullptr, G4_send, g4::SIMD8, dst, src,
builder.createImm(fenceDesc, Type_UD),
InstOpt_WriteEnable, msgDesc, true);
bb->insertBefore(iter, inst);
} else {
// insert a dummy scratch read
auto msgDesc = builder.createReadMsgDesc(SFID::DP_DC0, desc.value);
auto src = builder.createSrcRegRegion(builder.getBuiltinR0(),
builder.getRegionStride1());
// We can use any dst that does not conflict with the EOT src, which must
// be between r112-r127
auto dstDcl = builder.createHardwiredDeclare(8, Type_UD, 1, 0);
auto dst = builder.createDstRegRegion(dstDcl, 1);
G4_INST *sendInst =
builder.createSendInst(nullptr, G4_send, g4::SIMD8, dst, src,
builder.createImm(desc.value, Type_UD),
InstOpt_WriteEnable, msgDesc, true);
bb->insertBefore(iter, sendInst);
}
builder.instList.clear();
}
}
}