mirror of
https://github.com/intel/intel-graphics-compiler.git
synced 2025-11-04 08:21:06 +08:00
Clean-up gcc warnings, such as -Werror=misleading-indentation -Werror=catch-value -Werror=class-memaccess -Werror=unused-variable -Werror=unused-but-set-variable
4181 lines
160 KiB
C++
4181 lines
160 KiB
C++
/*========================== begin_copyright_notice ============================
|
|
|
|
Copyright (C) 2023 Intel Corporation
|
|
|
|
SPDX-License-Identifier: MIT
|
|
|
|
============================= end_copyright_notice ===========================*/
|
|
|
|
#include "Assertions.h"
|
|
#include "FlowGraph.h"
|
|
#include "G4_Opcode.h"
|
|
#include "G4_Verifier.hpp"
|
|
#include "Optimizer.h"
|
|
#include "PointsToAnalysis.h"
|
|
#include "Timer.h"
|
|
#include "visa_igc_common_header.h"
|
|
|
|
#include <algorithm>
|
|
#include <fstream>
|
|
#include <map>
|
|
#include <sstream>
|
|
#include <vector>
|
|
|
|
using namespace vISA;
|
|
|
|
// A place for all software workarounds for HW issues. Future work may be to
|
|
// move large SWWAs into their own pass instead of inside Optimizer.
|
|
|
|
// Various helper functions for creating dummy instructions that may assist in
|
|
// SW workarounds.
|
|
void Optimizer::insertDummyCompactInst() {
|
|
// Only for SKL+ and compaction is enabled.
|
|
if (builder.getPlatform() < GENX_SKL || !builder.getOption(vISA_Compaction))
|
|
return;
|
|
|
|
// Insert mov (1) r0 r0 at the beginning of this kernel.
|
|
G4_Declare *dcl = builder.getBuiltinR0();
|
|
auto src = builder.createSrc(dcl->getRegVar(), 0, 0,
|
|
builder.getRegionScalar(), Type_F);
|
|
auto dst = builder.createDst(dcl->getRegVar(), 0, 0, 1, Type_F);
|
|
G4_INST *movInst =
|
|
builder.createMov(g4::SIMD1, dst, src, InstOpt_WriteEnable, false);
|
|
|
|
auto bb = fg.getEntryBB();
|
|
for (auto it = bb->begin(), ie = bb->end(); it != ie; ++it) {
|
|
if ((*it)->opcode() != G4_label) {
|
|
bb->insertBefore(it, movInst);
|
|
return;
|
|
}
|
|
}
|
|
|
|
// The entry block is empty or only contains a label.
|
|
bb->push_back(movInst);
|
|
}
|
|
|
|
void Optimizer::swapSrc1Src2OfMadForCompaction() {
|
|
if (!builder.src1Src2SwapForCompaction())
|
|
return;
|
|
|
|
BB_LIST_ITER ib, bend(fg.end());
|
|
for (ib = fg.begin(); ib != bend; ++ib) {
|
|
G4_BB *bb = (*ib);
|
|
INST_LIST_ITER ii = bb->begin();
|
|
|
|
while (ii != bb->end()) {
|
|
G4_INST *inst = *ii;
|
|
if (inst->opcode() == G4_mad) {
|
|
G4_Operand *src1 = inst->getSrc(1);
|
|
G4_Operand *src2 = inst->getSrc(2);
|
|
if (src1 && src2 && src1->getType() == src2->getType() &&
|
|
src1->isSrcRegRegion() &&
|
|
src2->isSrcRegRegion() &&
|
|
src1->getBase()->isRegVar() && src2->getBase()->isRegVar() &&
|
|
src1->getTopDcl()->getRegFile() == G4_GRF &&
|
|
src2->getTopDcl()->getRegFile() == G4_GRF) {
|
|
if (src1->asSrcRegRegion()->getRegion()->isScalar() &&
|
|
src2->asSrcRegRegion()->getRegion()->isFlatRegion()) {
|
|
inst->setSrc(src2, 1);
|
|
inst->setSrc(src1, 2);
|
|
}
|
|
}
|
|
}
|
|
ii++;
|
|
}
|
|
}
|
|
}
|
|
|
|
// add (1|M0) null<1>:uw null<0;1,0>:uw 0x0:uw
|
|
void Optimizer::insertDummyAdd(G4_BB *bb, INST_LIST_ITER inst_it, int imm) {
|
|
// Dst
|
|
auto nullDst = builder.createNullDst(Type_UW);
|
|
auto nullSrc0 = builder.createNullSrc(Type_UW);
|
|
auto immSrc1 = builder.createImm(imm, Type_UW);
|
|
|
|
auto addInst = builder.createBinOp(G4_add, g4::SIMD1, nullDst, nullSrc0,
|
|
immSrc1, InstOpt_WriteEnable, false);
|
|
|
|
bb->insertBefore(inst_it, addInst);
|
|
}
|
|
|
|
// Float and DP share same GRF cache.
|
|
// Integer and Math shader same GRF cache.
|
|
void Optimizer::insertDummyMad(G4_BB *bb, INST_LIST_ITER inst_it) {
|
|
// Dst
|
|
auto nullDst1 = builder.createNullDst(Type_W);
|
|
auto nullDst2 = builder.createNullDst(Type_F);
|
|
|
|
const RegionDesc *region = builder.createRegionDesc(8, 8, 1);
|
|
|
|
// Src0
|
|
auto src0Dcl_0 = builder.createHardwiredDeclare(1, Type_W, 1, 0);
|
|
auto src0Dcl_1 = builder.createHardwiredDeclare(1, Type_F, 1, 0);
|
|
G4_SrcRegRegion *src0Opnd_0 =
|
|
kernel.fg.builder->createSrcRegRegion(src0Dcl_0, region);
|
|
G4_SrcRegRegion *src0Opnd_1 =
|
|
kernel.fg.builder->createSrcRegRegion(src0Dcl_1, region);
|
|
|
|
G4_SrcRegRegion *src1Opnd_0 =
|
|
kernel.fg.builder->createSrcRegRegion(src0Dcl_0, region);
|
|
G4_SrcRegRegion *src1Opnd_1 =
|
|
kernel.fg.builder->createSrcRegRegion(src0Dcl_1, region);
|
|
|
|
G4_SrcRegRegion *src2Opnd_0 =
|
|
kernel.fg.builder->createSrcRegRegion(src0Dcl_0, region);
|
|
G4_SrcRegRegion *src2Opnd_1 =
|
|
kernel.fg.builder->createSrcRegRegion(src0Dcl_1, region);
|
|
|
|
auto madInst1 = builder.createInternalInst(
|
|
nullptr, G4_mad, nullptr, g4::NOSAT, g4::SIMD8, nullDst1, src0Opnd_0,
|
|
src1Opnd_0, src2Opnd_0, InstOpt_NoOpt);
|
|
|
|
auto madInst2 = builder.createInternalInst(
|
|
nullptr, G4_mad, nullptr, g4::NOSAT, g4::SIMD8, nullDst2, src0Opnd_1,
|
|
src1Opnd_1, src2Opnd_1, InstOpt_NoOpt);
|
|
|
|
bb->insertBefore(inst_it, madInst1);
|
|
bb->insertBefore(inst_it, madInst2);
|
|
|
|
G4_SrcRegRegion *src =
|
|
kernel.fg.builder->createSrcRegRegion(src0Dcl_1, region);
|
|
G4_DstRegRegion *dst = kernel.fg.builder->createDstRegRegion(src0Dcl_1, 1);
|
|
G4_INST *movInst =
|
|
builder.createMov(g4::SIMD8, dst, src, InstOpt_NoOpt, false);
|
|
|
|
bb->insertBefore(inst_it, movInst);
|
|
}
|
|
|
|
// Emits a dummy SIMD4 (W) csel with an (eq) condition modifier on a temp flag
// pinned to flag register 0. The csel reads and writes a hardwired :w declare.
// When the target does not have a single ALU pipe, a second csel over a
// hardwired :f declare is emitted as well.
//
// bb       block receiving the instruction(s)
// inst_it  insertion point; only used when newBB is false
// newBB    true  -> instructions are appended to bb
//          false -> instructions are inserted before inst_it
void Optimizer::insertDummyCsel(G4_BB *bb, INST_LIST_ITER inst_it, bool newBB) {
  // Single placement policy shared by both csel instructions.
  auto place = [&](G4_INST *csel) {
    if (newBB) {
      bb->push_back(csel);
    } else {
      bb->insertBefore(inst_it, csel);
    }
  };

  const RegionDesc *rgn = builder.createRegionDesc(4, 4, 1);

  // Dummy flag, bound to physical flag register 0.
  G4_Declare *flagDcl = builder.createTempFlag(1, "dmflag");
  flagDcl->getRegVar()->setPhyReg(builder.phyregpool.getFlagAreg(0), 0);

  // --- csel over the :w declare ---
  auto wordCondMod = builder.createCondMod(Mod_e, flagDcl->getRegVar(), 0);
  auto wordDcl = builder.createHardwiredDeclare(4, Type_W, 1, 0);
  G4_SrcRegRegion *wordSrc0 =
      kernel.fg.builder->createSrcRegRegion(wordDcl, rgn);
  G4_SrcRegRegion *wordSrc1 =
      kernel.fg.builder->createSrcRegRegion(wordDcl, rgn);
  G4_SrcRegRegion *wordSrc2 =
      kernel.fg.builder->createSrcRegRegion(wordDcl, rgn);
  G4_DstRegRegion *wordDst = kernel.fg.builder->createDstRegRegion(wordDcl, 1);
  auto wordCsel = builder.createInternalInst(
      nullptr, G4_csel, wordCondMod, g4::NOSAT, g4::SIMD4, wordDst, wordSrc0,
      wordSrc1, wordSrc2, InstOpt_WriteEnable);
  place(wordCsel);

  if (builder.hasSingleALUPipe())
    return;

  // --- csel over the :f declare (multi-pipe targets only) ---
  auto floatDcl = builder.createHardwiredDeclare(4, Type_F, 1, 4);
  G4_SrcRegRegion *floatSrc0 =
      kernel.fg.builder->createSrcRegRegion(floatDcl, rgn);
  G4_SrcRegRegion *floatSrc1 =
      kernel.fg.builder->createSrcRegRegion(floatDcl, rgn);
  G4_SrcRegRegion *floatSrc2 =
      kernel.fg.builder->createSrcRegRegion(floatDcl, rgn);
  G4_DstRegRegion *floatDst =
      kernel.fg.builder->createDstRegRegion(floatDcl, 1);
  auto floatCondMod = builder.createCondMod(Mod_e, flagDcl->getRegVar(), 0);
  auto floatCsel = builder.createInternalInst(
      nullptr, G4_csel, floatCondMod, g4::NOSAT, g4::SIMD4, floatDst,
      floatSrc0, floatSrc1, floatSrc2, InstOpt_WriteEnable);
  place(floatCsel);
}
|
|
|
|
// Inserts, before inst_it in bb, a SIMD8 :ud mov that copies the register
// addressed by opnd (its base and register offset, sub-register 0) onto
// itself. opnd is accessed through asSrcRegRegion(), so callers pass a source
// register region.
void Optimizer::insertDummyMov(G4_BB *bb, INST_LIST_ITER inst_it,
                               G4_Operand *opnd) {
  auto *base = opnd->getBase();
  const auto regOff = opnd->asSrcRegRegion()->getRegOff();

  G4_SrcRegRegion *selfSrc = builder.createSrc(
      base, regOff, 0, builder.createRegionDesc(8, 8, 1), Type_UD);
  G4_DstRegRegion *selfDst = builder.createDst(
      opnd->getBase(), opnd->asSrcRegRegion()->getRegOff(), 0, 1, Type_UD);

  G4_INST *selfMov =
      builder.createMov(g4::SIMD8, selfDst, selfSrc, InstOpt_NoOpt, false);
  bb->insertBefore(inst_it, selfMov);
}
|
|
|
|
// Walks bb and places a dummy mov (see insertDummyMov) on src1 of a DPAS.
// Within a BB only the first DPAS needs the invalid-DPAS suppression, so the
// mov is emitted once; a predicated send with a non-null destination re-arms
// the insertion for the next DPAS that follows it.
void Optimizer::insertDummyMovForHWRSWADPAS(G4_BB *bb) {
  bool dpasHandled = false;

  for (INST_LIST_ITER cursor = bb->begin(); cursor != bb->end(); ++cursor) {
    G4_INST *current = *cursor;

    if (!dpasHandled && current->isDpas()) {
      insertDummyMov(bb, cursor, current->getSrc(1));
      dpasHandled = true;
    }

    const bool predicatedWithRealDst = current->getPredicate() &&
                                       current->getDst() &&
                                       !current->getDst()->isNullReg();
    if (predicatedWithRealDst && current->isSend()) {
      dpasHandled = false;
    }
  }
}
|
|
|
|
void Optimizer::insertDummyMovForHWRSWAonaAllpipelines() {
|
|
bool hasNonUniformBranch = false;
|
|
bool hasPredicatedSendOrIndirect = false;
|
|
BB_LIST dpasBBs;
|
|
|
|
for (BB_LIST_ITER bb_it = kernel.fg.begin(); bb_it != kernel.fg.end();
|
|
bb_it++) {
|
|
G4_BB *bb = (*bb_it);
|
|
|
|
if (bb->empty()) {
|
|
continue;
|
|
}
|
|
|
|
INST_LIST_ITER curr_iter = bb->begin();
|
|
INST_LIST_ITER pre_iter = curr_iter;
|
|
bool insertDPASBB = false;
|
|
while (curr_iter != bb->end()) {
|
|
G4_INST *inst = (*curr_iter);
|
|
|
|
if (inst->isDpas() && !insertDPASBB) {
|
|
dpasBBs.push_back(bb);
|
|
insertDPASBB = true;
|
|
}
|
|
|
|
if (inst->getPredicate() && inst->getDst() &&
|
|
!inst->getDst()->isNullReg()) {
|
|
if (inst->isSend()) {
|
|
insertDummyCsel(bb, curr_iter, false);
|
|
hasPredicatedSendOrIndirect = true;
|
|
}
|
|
}
|
|
|
|
if (builder.hasEOTReadSuppressionIssue() && inst->isEOT()) {
|
|
if (pre_iter != curr_iter) {
|
|
G4_INST *pre_inst = (*pre_iter);
|
|
if (pre_inst->isAtomicInst()) {
|
|
insertDummyCsel(bb, pre_iter, false);
|
|
} else {
|
|
insertDummyCsel(bb, curr_iter, false);
|
|
}
|
|
}
|
|
}
|
|
|
|
pre_iter = curr_iter;
|
|
++curr_iter;
|
|
}
|
|
|
|
bool newBB = false;
|
|
G4_INST *inst = (bb->getInstList().back());
|
|
if (inst->isRSWADivergentInst() && !inst->asCFInst()->isUniform()) {
|
|
bool previousElse = false;
|
|
|
|
G4_BB *preBB = bb->getPhysicalPred();
|
|
if (preBB && preBB->getInstList().size()) {
|
|
G4_INST *preBBLastInst = (preBB->getInstList().back());
|
|
previousElse = (preBBLastInst->opcode() == G4_else);
|
|
}
|
|
|
|
INST_LIST_ITER iter = bb->end();
|
|
iter--;
|
|
if (iter != bb->begin() && !previousElse) {
|
|
INST_LIST_ITER preIter = iter;
|
|
preIter--;
|
|
G4_INST *preInst = (*preIter);
|
|
if (preInst->isLabel()) {
|
|
bool hasJmpIPred = false;
|
|
|
|
for (G4_BB *predBB : bb->Preds) {
|
|
G4_INST *predBBLastInst = NULL;
|
|
if (!predBB->empty()) {
|
|
predBBLastInst = predBB->getInstList().back();
|
|
}
|
|
if (predBBLastInst && predBBLastInst->opcode() == G4_jmpi) {
|
|
hasJmpIPred = true;
|
|
}
|
|
}
|
|
G4_BB *wa_bb = hasJmpIPred ? kernel.fg.createNewBBWithLabel("RSWA")
|
|
: kernel.fg.createNewBB();
|
|
kernel.fg.insert(bb_it, wa_bb);
|
|
G4_Label *newLabel = hasJmpIPred ? wa_bb->getLabel() : NULL;
|
|
|
|
// replace bb with wa_bb in the pred BB of bb.
|
|
for (G4_BB *predBB : bb->Preds) {
|
|
G4_INST *predBBLastInst = NULL;
|
|
if (!predBB->empty()) {
|
|
predBBLastInst = predBB->getInstList().back();
|
|
}
|
|
if (predBBLastInst && predBBLastInst->opcode() == G4_jmpi) {
|
|
vASSERT(newLabel);
|
|
predBBLastInst->setSrc(newLabel, 0);
|
|
}
|
|
|
|
// C++17: std::replace(predBB->Succs.begin(), predBB->Succs.end(),
|
|
// bb, wa_bb);
|
|
for (G4_BB *&succ : predBB->Succs) {
|
|
if (succ == bb) {
|
|
succ = wa_bb;
|
|
}
|
|
}
|
|
wa_bb->Preds.push_back(predBB);
|
|
}
|
|
wa_bb->Succs.push_back(bb);
|
|
bb->Preds.clear();
|
|
bb->Preds.push_back(wa_bb);
|
|
newBB = true;
|
|
bb = wa_bb;
|
|
}
|
|
}
|
|
|
|
insertDummyCsel(bb, iter, newBB);
|
|
hasNonUniformBranch = true;
|
|
}
|
|
}
|
|
|
|
if (dpasBBs.size() &&
|
|
builder.getOptions()->getOption(vISA_InsertDummyMovForDPASRSWA) &&
|
|
(hasPredicatedSendOrIndirect || hasNonUniformBranch)) {
|
|
for (G4_BB *bb : kernel.fg) {
|
|
insertDummyMovForHWRSWADPAS(bb);
|
|
}
|
|
}
|
|
}
|
|
|
|
void Optimizer::insertDummyMovForHWRSWAonDPAS() {
|
|
bool hasNonUniformBranch = false;
|
|
bool hasPredicatedSendOrIndirect = false;
|
|
BB_LIST dpasBBs;
|
|
|
|
for (BB_LIST_ITER bb_it = kernel.fg.begin(); bb_it != kernel.fg.end();
|
|
bb_it++) {
|
|
G4_BB *bb = (*bb_it);
|
|
|
|
if (bb->empty()) {
|
|
continue;
|
|
}
|
|
|
|
INST_LIST_ITER curr_iter = bb->begin();
|
|
bool insertDPASBB = false;
|
|
while (curr_iter != bb->end()) {
|
|
G4_INST *inst = (*curr_iter);
|
|
|
|
if (inst->isDpas() && !insertDPASBB) {
|
|
dpasBBs.push_back(bb);
|
|
insertDPASBB = true;
|
|
}
|
|
|
|
if (inst->getPredicate() && inst->getDst() &&
|
|
!inst->getDst()->isNullReg()) {
|
|
if (inst->isSend()) {
|
|
hasPredicatedSendOrIndirect = true;
|
|
}
|
|
}
|
|
|
|
++curr_iter;
|
|
}
|
|
|
|
G4_INST *inst = (bb->getInstList().back());
|
|
if (inst->isRSWADivergentInst() && !inst->asCFInst()->isUniform()) {
|
|
hasNonUniformBranch = true;
|
|
}
|
|
}
|
|
|
|
if (dpasBBs.size() &&
|
|
builder.getOptions()->getOption(vISA_InsertDummyMovForDPASRSWA) &&
|
|
(hasPredicatedSendOrIndirect || hasNonUniformBranch)) {
|
|
for (G4_BB *bb : dpasBBs) {
|
|
insertDummyMovForHWRSWADPAS(bb);
|
|
}
|
|
}
|
|
}
|
|
|
|
void Optimizer::insertDummyMovForHWRSWA() {
|
|
if (!((VISA_WA_CHECK(builder.getPWaTable(), Wa_16012061344) ||
|
|
VISA_WA_CHECK(builder.getPWaTable(), Wa_22012856258) ||
|
|
VISA_WA_CHECK(builder.getPWaTable(), Wa_14017322320) ||
|
|
VISA_WA_CHECK(builder.getPWaTable(), Wa_16012292205)))) {
|
|
return;
|
|
}
|
|
|
|
if (builder.hasRSForSpecificPlatform()) {
|
|
insertDummyMovForHWRSWAonaAllpipelines();
|
|
} else {
|
|
insertDummyMovForHWRSWAonDPAS();
|
|
}
|
|
}
|
|
|
|
// 1. set DMask so that upper 16bits are ones.
|
|
// This may be done in applyFusedCallWA(). Doing so here has minimum impact
|
|
// to visa.
|
|
// 2. Perform IP WA if needed.
|
|
void Optimizer::finishFusedCallWA_preSWSB() {
|
|
if (builder.getIsKernel()) {
|
|
// If it is from scalar IGC, need to extend its dmask. For example, simd8 to
|
|
// simd16 or simd16 to simd32 by adding or instructions on the entry. Note
|
|
// that the first BB is not necessarily the kernel's entry when kernel needs
|
|
// to load its payload!
|
|
// (W) or (1|M0) dmask(sr0.2) dmasksr0.2 0xFFFF0000
|
|
if (true /*kernel.getInt32KernelAttr(Attributes::ATTR_Target) != VISA_CM */)
|
|
{
|
|
// Use M16 always.
|
|
vASSERT(kernel.getSimdSize() <= 16);
|
|
uint32_t orImm = kernel.getSimdSize() == 16 ? 0xFFFF0000 : 0xFFFFFF00;
|
|
|
|
G4_VarBase *V_sr0 = builder.phyregpool.getSr0Reg();
|
|
G4_SrcRegRegion *I0_Src0 =
|
|
builder.createSrc(V_sr0, 0, 2, builder.getRegionScalar(), Type_UD);
|
|
G4_Imm *newDMask = builder.createImm(orImm, Type_UD);
|
|
G4_DstRegRegion *I0_Dst = builder.createDst(V_sr0, 0, 2, 1, Type_UD);
|
|
G4_INST *I0 = builder.createInternalInst(
|
|
nullptr, G4_or, nullptr, g4::NOSAT, g4::SIMD1, I0_Dst, I0_Src0,
|
|
newDMask, InstOpt_WriteEnable);
|
|
|
|
G4_BB *entryBB = fg.getEntryBB();
|
|
// Make sure to skip prolog BBs to insert into the 1st BB of a kernel.
|
|
G4_BB *perThreadBB = kernel.getPerThreadPayloadBB();
|
|
G4_BB *crossThreadBB = kernel.getCrossThreadPayloadBB();
|
|
if (perThreadBB != nullptr || crossThreadBB != nullptr) {
|
|
while (entryBB != nullptr) {
|
|
if (entryBB == perThreadBB || entryBB == crossThreadBB) {
|
|
// perthread/crossThread BB has a single succ.
|
|
vASSERT(entryBB->Succs.size() == 1);
|
|
entryBB = entryBB->Succs.front();
|
|
continue;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
entryBB->insertBefore(entryBB->getFirstInsertPos(), I0);
|
|
}
|
|
}
|
|
|
|
if (kernel.m_indirectCallWAInfo.empty() && kernel.m_maskOffWAInsts.empty())
|
|
return;
|
|
|
|
#if defined(_DEBUG)
|
|
// Expect all BBs and insts related to call wa are present and the insts are
|
|
// still in their BBs (they could be reordered, but are required to be in the
|
|
// original BB).
|
|
//
|
|
// Don't expect any violation, but do the sanity check here to make sure.
|
|
for (auto &II : kernel.m_indirectCallWAInfo) {
|
|
G4_BB *BB = II.first;
|
|
IndirectCallWAInfo &callWAInfo = II.second;
|
|
G4_BB *BigBB = callWAInfo.Big_BB;
|
|
G4_BB *SmallBB = callWAInfo.Small_BB;
|
|
if (std::find(kernel.fg.begin(), kernel.fg.end(), BB) == kernel.fg.end() ||
|
|
std::find(kernel.fg.begin(), kernel.fg.end(), BigBB) ==
|
|
kernel.fg.end() ||
|
|
std::find(kernel.fg.begin(), kernel.fg.end(), SmallBB) ==
|
|
kernel.fg.end()) {
|
|
vISA_ASSERT(false, "ICE: BB not found in indirect call WA info!");
|
|
break;
|
|
}
|
|
|
|
G4_INST *ip_wa = callWAInfo.IP_WA_placeholder;
|
|
G4_INST *bigStart = callWAInfo.Big_start;
|
|
G4_INST *bigPatch = callWAInfo.Big_patch;
|
|
G4_INST *smallStart = callWAInfo.Small_start;
|
|
G4_INST *smallPatch = callWAInfo.Small_patch;
|
|
G4_INST *bigCall = callWAInfo.Big_call;
|
|
G4_INST *smallCall = callWAInfo.Small_call;
|
|
if ((ip_wa && std::find(BB->begin(), BB->end(), ip_wa) == BB->end()) ||
|
|
(bigStart &&
|
|
std::find(BB->begin(), BB->end(), bigStart) == BB->end()) ||
|
|
(bigPatch &&
|
|
std::find(BB->begin(), BB->end(), bigPatch) == BB->end()) ||
|
|
(smallStart &&
|
|
std::find(BB->begin(), BB->end(), smallStart) == BB->end()) ||
|
|
(smallPatch &&
|
|
std::find(BB->begin(), BB->end(), smallPatch) == BB->end()) ||
|
|
(bigCall &&
|
|
std::find(BigBB->begin(), BigBB->end(), bigCall) == BigBB->end()) ||
|
|
(smallCall && std::find(SmallBB->begin(), SmallBB->end(), smallCall) ==
|
|
SmallBB->end())) {
|
|
vISA_ASSERT(false, "ICE: inst not found in its original BB!");
|
|
break;
|
|
}
|
|
}
|
|
|
|
for (const auto& II : kernel.m_maskOffWAInsts) {
|
|
G4_INST *tInst = II.first;
|
|
G4_BB *tBB = II.second;
|
|
|
|
// make sure BB and inst are still valid
|
|
if (std::find(kernel.fg.begin(), kernel.fg.end(), tBB) == kernel.fg.end()) {
|
|
vISA_ASSERT(false, "ICE: BB not in m_maskOffWAInsts!");
|
|
continue;
|
|
}
|
|
if (std::find(tBB->begin(), tBB->end(), tInst) == tBB->end()) {
|
|
vISA_ASSERT(false, "ICE: inst not in m_maskOffWAInsts!");
|
|
continue;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
if (builder.needIPWA()) {
|
|
for (auto &II : kernel.m_indirectCallWAInfo) {
|
|
G4_BB *BB = II.first;
|
|
IndirectCallWAInfo &callWAInfo = II.second;
|
|
|
|
G4_INST *ip_wa = callWAInfo.IP_WA_placeholder;
|
|
if (ip_wa == nullptr) {
|
|
// calla, ip wa not needed.
|
|
continue;
|
|
}
|
|
|
|
G4_INST *ip_inst = nullptr;
|
|
if (ip_wa) {
|
|
// clang-format off
|
|
// Simplified example to show what it does:
|
|
// Given
|
|
// pseudo_fcall (16) r4.0:ud
|
|
//
|
|
// After applyFusedCallWA and RA:
|
|
// (W) mov (1) r2.0<1>:ud sr0.0<0;1,0>:ud
|
|
// (W) and (16) (eq)f1.0 null<1>:uw r2.0<0;1,0>:uw 0x80:uw
|
|
// (W&!f1.0) mov (1) cr0.2<1>:ud r4.0<0;1,0>:ud
|
|
// (W) mov (1) r3.2<1>:ud cr0.2<0;1,0>:ud
|
|
// (W) mov (1) r3.0<1>:d 0x89abcdef:d :ip_wa (placeholder)
|
|
// (W) add (1) r2.0<1>:d -r3.0<0;1,0>:d r3.2<0;1,0>:d :small_start
|
|
// (W) add (1) r70.0<1>:d r2.0<0;1,0>:d 0x33333333:d :small_patch
|
|
// (W) add (1) r2.0<1>:d -r3.0<0;1,0>:d r4.0<0;1,0>:d :big_start
|
|
// (W) add (1) r2.0<1>:d r2.0<0;1,0>:d 0x33333333:d :big_patch
|
|
// if (BigEU)
|
|
// (W) mov (1) r125.0<1>:f r2.0<0;1,0>:f
|
|
// pseudo_fcall (16) r125.0<1>:ud r125.0<0;1,0>:ud :big_call
|
|
// else
|
|
// (W) mov (1) r125.0<1>:f r70.0<0;1,0>:f
|
|
// pseudo_fcall (16) r125.0<1>:ud r125.0<0;1,0>:ud :small_call
|
|
//
|
|
//
|
|
// After finishFusedCallWA()
|
|
// (W) mov (1) r2.0<1>:ud sr0.0<0;1,0>:ud
|
|
// (W) and (16) (eq)f1.0 null<1>:uw r2.0<0;1,0>:uw 0x80:uw
|
|
// (W&!f1.0) mov (1) cr0.2<1>:ud r4.0<0;1,0>:ud
|
|
// (W) mov (1) r3.2<1>:ud cr0.2<0;1,0>:ud
|
|
//
|
|
// (W) call (1) r3.0<1>:d _label_ip_wa
|
|
// _label_ip_wa:
|
|
// (W) add (1|M16) r3.0<1>:d r3.0<0;1,0>:d 0x20:d {NoCompact}
|
|
// (W) return (1) r3.0<0;1,0>:d {NoCompact}
|
|
//
|
|
// (W) add (1) r2.0<1>:d -r3.0<0;1,0>:d r3.2<0;1,0>:d :IP
|
|
// (W) add (1) r70.0<1>:d r2.0<0;1,0>:d 144
|
|
// (W) add (1) r2.0<1>:d -r3.0<0;1,0>:d r4.0<0;1,0>:d
|
|
// (W) add (1) r2.0<1>:d r2.0<0;1,0>:d 96
|
|
// if (BigEU)
|
|
// (W) mov (1) r125.0<1>:f r2.0<0;1,0>:f
|
|
// pseudo_fcall (16) r125.0<1>:ud r125.0<0;1,0>:ud : IP+96
|
|
// else
|
|
// (W) mov (1) r125.0<1>:f r70.0<0;1,0>:f
|
|
// pseudo_fcall (16) r125.0<1>:ud r70.0<0;1,0>:f : IP+144
|
|
//
|
|
// clang-format on
|
|
BB->resetLocalIds();
|
|
G4_INST *sI = callWAInfo.Small_start;
|
|
G4_INST *bI = callWAInfo.Big_start;
|
|
ip_inst = (sI->getLocalId() < bI->getLocalId() ? sI : bI);
|
|
|
|
// Get IP to ip_inst.
|
|
// IP-WA's call sequence must be inserted right before ip_inst and
|
|
// IP must be stored in ip_wa's dst, not ip_inst's dst.
|
|
InstListType waInsts;
|
|
replaceIPWithCall(waInsts, ip_wa);
|
|
|
|
// find IP adjustment add and set mask offset to M16!
|
|
// (it is the 3rd inst!)
|
|
G4_INST *adjust_ip_add = nullptr;
|
|
for (auto tI : waInsts) {
|
|
if (tI->opcode() == G4_add) {
|
|
adjust_ip_add = tI;
|
|
break;
|
|
}
|
|
}
|
|
vASSERT(adjust_ip_add);
|
|
kernel.setMaskOffset(adjust_ip_add, InstOpt_M16);
|
|
|
|
auto ip_inst_ii = std::find(BB->begin(), BB->end(), ip_inst);
|
|
BB->insert(ip_inst_ii, waInsts.begin(), waInsts.end());
|
|
|
|
// Remove placeholder
|
|
BB->remove(ip_wa);
|
|
|
|
// finishFusedCallWA() will use this to calculate the offset.
|
|
callWAInfo.IP_WA_placeholder = ip_inst;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Need to be done after SWSB so we can set call relative IP correctly.
|
|
void Optimizer::finishFusedCallWA() {
|
|
// Regarding using M16 as maskOff to force running some instructions
|
|
//
|
|
// For each nested stack call like the following:
|
|
// (1) (W) mov (4|M0) r59.4<1>:ud r125.0<4;4,1>:ud // save code in prolog
|
|
// (2) call (16|M0) r125.0 inner
|
|
// (3) (W) mov (4|M0) r125.0<1>:ud r59.4<4;4,1>:ud // restore code in ret
|
|
// (4) ret (16|M0) r125.0
|
|
// If no active channels, call inst will always execute due to the hw bug,
|
|
// therefore r125 will be modified by this call inst at (2). As no active
|
|
// channels, r125 restore code at (3) is not going to be run. Therefore, r125
|
|
// returned at (4) is not the one that is saved into r59.4 at (1), which is
|
|
// wrong.
|
|
//
|
|
// The fix is to make save/restore mov instructions run always even though
|
|
// there are no active channels. They run if their quarter control is outside
|
|
// the current JEU size (16 in this case), but still active (dmask still show
|
|
// it is active). We will set dmask to simd32 in this case, quarter control to
|
|
// M16 instead M0:
|
|
// (1) (W) mov (4|M16) r59.4<1>:ud r125.0<4;4,1>:ud
|
|
// (2) call (16|M0) r125.0 inner
|
|
// (3) (W) mov (4|M16) r125.0<1>:ud r59.4<4;4,1>:ud
|
|
//
|
|
// Note:
|
|
// r59.4 needs to write on stack frame before call and read back after call
|
|
// and its address payload needs to be correct. For this purpose, all call
|
|
// stack-related WA is done in RA, not here.
|
|
//
|
|
|
|
if (kernel.m_indirectCallWAInfo.empty() && kernel.m_maskOffWAInsts.empty())
|
|
return;
|
|
|
|
auto update_ip_distance = [](G4_INST *inst, int32_t &ip_dist) {
|
|
G4_opcode op = inst->opcode();
|
|
if (op == G4_sync_nop) {
|
|
inst->setCompacted();
|
|
ip_dist += 8;
|
|
} else if (op != G4_label) {
|
|
inst->setNoCompacted();
|
|
ip_dist += 16;
|
|
}
|
|
return;
|
|
};
|
|
|
|
// 1. (W) mov (1|M0) r2.0<1>:ud sr0.0<0;1,0>:ud
|
|
// 2. (W) and (16|M0) (eq)f1.0 null<1>:uw r2.0<0;1,0>:uw 0x80:uw
|
|
// 3. (W & ~f1.0) mov (1|M0) cr0.2<1>:ud r3.0<0;1,0>:ud
|
|
// 4. (W)mov (1|M0) r64.0<1>:ud cr0.2<0;1,0>:ud
|
|
// WA requires the mov at 4 to be in M16, not M0 in case the BigEU is off.
|
|
// Here set quarter control of that mov to M16 (When stackcall is used,
|
|
// only simd8/simd16 is allowed. Thus, we will set M16 always no matter
|
|
// the kernel is simd8 or simd16).
|
|
for (const auto& II : kernel.m_maskOffWAInsts) {
|
|
G4_INST *tInst = II.first;
|
|
kernel.setMaskOffset(tInst, InstOpt_M16);
|
|
}
|
|
|
|
// indirect relative call
|
|
for (const auto &II : kernel.m_indirectCallWAInfo) {
|
|
G4_BB *BB = II.first;
|
|
const IndirectCallWAInfo &callWAInfo = II.second;
|
|
|
|
if (callWAInfo.Small_start == nullptr) { // calla, skip
|
|
continue;
|
|
}
|
|
|
|
// finishFusedCallWA_preSWSB() sets this placeholder.
|
|
G4_INST *ip_inst = callWAInfo.IP_WA_placeholder;
|
|
|
|
// IP WA is applied if ip_inst isn't null.
|
|
for (int i = 0; i < 2; ++i) {
|
|
G4_INST *patch_add =
|
|
(i == 0 ? callWAInfo.Small_patch : callWAInfo.Big_patch);
|
|
G4_INST *ip_start =
|
|
(i == 0 ? callWAInfo.Small_start : callWAInfo.Big_start);
|
|
if (ip_inst) {
|
|
// IP WA: ip is taken at ip_inst for both small and big targets.
|
|
ip_start = ip_inst;
|
|
}
|
|
G4_INST *ip_end = (i == 0 ? callWAInfo.Small_call : callWAInfo.Big_call);
|
|
G4_BB *start_bb = BB;
|
|
G4_BB *end_bb = (i == 0 ? callWAInfo.Small_BB : callWAInfo.Big_BB);
|
|
|
|
int32_t dist = 0;
|
|
G4_BB *b;
|
|
G4_BB *next_b = start_bb;
|
|
INST_LIST_ITER it_start =
|
|
std::find(start_bb->begin(), start_bb->end(), ip_start);
|
|
INST_LIST_ITER it_end = std::find(end_bb->begin(), end_bb->end(), ip_end);
|
|
do {
|
|
b = next_b;
|
|
INST_LIST_ITER iter = (b == start_bb ? it_start : b->begin());
|
|
INST_LIST_ITER iterEnd = (b == end_bb ? it_end : b->end());
|
|
for (; iter != iterEnd; ++iter) {
|
|
G4_INST *tI = *iter;
|
|
update_ip_distance(tI, dist);
|
|
}
|
|
next_b = b->getPhysicalSucc();
|
|
} while (b != end_bb && next_b != nullptr);
|
|
vASSERT(b == end_bb);
|
|
|
|
G4_Imm *distOprd = builder.createImm(-dist, Type_D);
|
|
patch_add->setSrc(distOprd, 1);
|
|
}
|
|
}
|
|
|
|
// RA does the following
|
|
// (W) mov(1|M0) r125.0<1>:f r60.0<0;1,0>:f
|
|
// (W) send.dc0(16|M0) null r126 r5 0x80 0x020A03FF // stack spill
|
|
// sync.nop null{ Compacted,$4.src }
|
|
// call (8|M0) r125.0 r125.0
|
|
//
|
|
// To make call WA work, call for SmallEU has to use r60, not r125, as below:
|
|
// call (8|M0) r125.0 r60.0
|
|
// Here propogate r60.0 down to call instruction
|
|
// (For call, can just copy patch's dst to call's target. Here the code works
|
|
// for both call and calla.)
|
|
for (const auto &II : kernel.m_indirectCallWAInfo) {
|
|
const IndirectCallWAInfo &callWAInfo = II.second;
|
|
|
|
G4_INST *iCallInst = callWAInfo.Small_call;
|
|
G4_BB *B = callWAInfo.Small_BB;
|
|
vASSERT(iCallInst->isFCall() && iCallInst->getSrc(0)->isGreg());
|
|
|
|
bool isValid;
|
|
G4_SrcRegRegion *T = iCallInst->getSrc(0)->asSrcRegRegion();
|
|
int regno = T->ExRegNum(isValid);
|
|
int subreg = T->ExSubRegNum(isValid);
|
|
|
|
// Search backward to find the the 1st mov that defined this reg
|
|
// This works for ifcall that has been put into a separate BB, in
|
|
// which only insts related to call sequence are present in the BB.
|
|
// If not found, do nothing.
|
|
INST_LIST_ITER it_end = std::find(B->begin(), B->end(), iCallInst);
|
|
vASSERT(it_end != B->end());
|
|
for (auto II = it_end, IB = B->begin(); II != IB; --II) {
|
|
auto prevII = std::prev(II);
|
|
G4_INST *tInst = *prevII;
|
|
if (tInst->opcode() == G4_mov && tInst->getExecSize() == g4::SIMD1 &&
|
|
tInst->isWriteEnableInst() && tInst->getDst()->isGreg() &&
|
|
tInst->getSrc(0)->isGreg() &&
|
|
T->getTypeSize() == tInst->getSrc(0)->getTypeSize()) {
|
|
G4_DstRegRegion *D = tInst->getDst();
|
|
int dst_regno = D->ExRegNum(isValid);
|
|
int dst_subreg = D->ExSubRegNum(isValid);
|
|
if (dst_regno == regno && subreg == dst_subreg) {
|
|
// found
|
|
G4_SrcRegRegion *Src0 = tInst->getSrc(0)->asSrcRegRegion();
|
|
G4_SrcRegRegion *newT = builder.createSrcRegRegion(*Src0);
|
|
iCallInst->setSrc(newT, 0);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
kernel.m_maskOffWAInsts.clear();
|
|
kernel.m_indirectCallWAInfo.clear();
|
|
}
|
|
|
|
void Optimizer::adjustIndirectCallOffsetAfterSWSBSet() {
|
|
// the call code sequence done at Optimizer::expandIndirectCallWithRegTarget
|
|
// is:
|
|
|
|
// if has IP WA, more instructions are added:
|
|
// call dst _label_ip_wa
|
|
// _label_ip_wa:
|
|
// add dst dst 32 // 3rd add, sync_off_2
|
|
// // 32 is hardcoded
|
|
// ret dst
|
|
// else it'll be :
|
|
// add r2.0 -IP call_target // 2nd add
|
|
// add r2.0 r2.0 -32 // 1st add, sync_off_1
|
|
// // -32 is hardcoded
|
|
// call r1.0 r2.0
|
|
// SWSB could've inserted sync instructions between offset-hardcoded
|
|
// instructions. We need to re-adjust the offset
|
|
|
|
// update the offset if the given inst is a sync
|
|
// return true if inst is sync
|
|
auto update_sync_off = [](G4_INST &inst, uint64_t &sync_offset) {
|
|
G4_opcode op = inst.opcode();
|
|
if (op == G4_sync_allrd || op == G4_sync_allwr) {
|
|
inst.setNoCompacted();
|
|
sync_offset += 16;
|
|
return true;
|
|
} else if (op == G4_sync_nop) {
|
|
inst.setCompacted();
|
|
sync_offset += 8;
|
|
return true;
|
|
}
|
|
return false;
|
|
};
|
|
|
|
for (auto bb : kernel.fg) {
|
|
if (bb->empty())
|
|
continue;
|
|
|
|
if (bb->back()->isFCall()) {
|
|
G4_InstCF *fcall = bb->back()->asCFInst();
|
|
if (fcall->isIndirectCall()) {
|
|
// for every indirect call, count # of instructions inserted
|
|
// between call and the first add
|
|
uint64_t sync_off_1 = 0;
|
|
G4_INST *first_add = nullptr;
|
|
INST_LIST::reverse_iterator it = bb->rbegin();
|
|
// skip call itself
|
|
++it;
|
|
// calculate sync_off_1
|
|
for (; it != bb->rend(); ++it) {
|
|
G4_INST &inst = **it;
|
|
if (update_sync_off(inst, sync_off_1))
|
|
continue;
|
|
else if (inst.opcode() == G4_add) {
|
|
if (first_add == nullptr) {
|
|
first_add = &inst;
|
|
continue;
|
|
} else {
|
|
// found 2nd add
|
|
break;
|
|
}
|
|
}
|
|
// instructions between pattern sequence could only be
|
|
// sync.nop, sync.allrd or sync.allwr
|
|
vASSERT(false);
|
|
}
|
|
vASSERT(first_add->getSrc(1)->isImm());
|
|
int64_t adjust_off =
|
|
first_add->getSrc(1)->asImm()->getInt() - sync_off_1;
|
|
first_add->setSrc(builder.createImm(adjust_off, Type_D), 1);
|
|
|
|
// calculate sync_off_2
|
|
if (builder.needIPWA()) {
|
|
// at this point, it should point to 2nd add, skip it
|
|
++it;
|
|
uint64_t sync_off_2 = 0;
|
|
G4_INST *third_add = nullptr;
|
|
for (; it != bb->rend(); ++it) {
|
|
G4_INST &inst = **it;
|
|
if (update_sync_off(inst, sync_off_2))
|
|
continue;
|
|
else if (inst.opcode() == G4_return)
|
|
continue;
|
|
else if (inst.opcode() == G4_add) {
|
|
vASSERT(third_add == nullptr);
|
|
third_add = &inst;
|
|
break;
|
|
}
|
|
// instructions between pattern sequence could only be
|
|
// sync.nop, sync.allrd or sync.allwr
|
|
vASSERT(false);
|
|
}
|
|
vASSERT(third_add->getSrc(1)->isImm());
|
|
int64_t adjust_off_2 =
|
|
third_add->getSrc(1)->asImm()->getInt() + sync_off_2;
|
|
third_add->setSrc(
|
|
builder.createImm(adjust_off_2, third_add->getSrc(1)->getType()),
|
|
1);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// [NoMask WA]
|
|
// EU Fusion introduced a new hardware : fused Mask (2 bits, one for each fused
|
|
// EUs to indicate whether EU is on or off) to control NoMask instructions from
|
|
// running on off EU. However, there is a hw bug that will not let the fused
|
|
// mask change from 01 to 00, causing off EU to run NoMask inst that should not
|
|
// run.
|
|
//
|
|
// A WA is to change any NoMask instruction by adding a predicate to it.
|
|
// And this predicate is equivalent to correct NoMask semantics. For example,
|
|
// the following instruction
|
|
//
|
|
// (W) add (8|M0) r10.0<1>:d r11.0<1;1,0>:d r12.0<1;1,0>:d
|
|
//
|
|
// will be changed to
|
|
//
|
|
// (W) mov (1|M0) f0.0<1>:w 0
|
|
// cmp (8|M0) (eq)f0.0 r0:uw r0:uw
|
|
// (W&f0.0.any8h) add (8|M0) r10.0<1>:d r11.0<1;1,0>:d r12.0<1;1,0>:d
|
|
//
|
|
// Note that f0.0 is called "WA flag".
|
|
//
|
|
// The HW still has the correct CE mask so that the above mov&cmp sequence
|
|
// still works, that is, f0.0 will be all zero if no active lanes and will not
|
|
// be zero if there is at least one active lane.
|
|
//
|
|
// Nested Divergence
|
|
// For a fused mask to be 01, the control-flow must be divergent
|
|
// at that point. Furthermore, changing 01 to 00 happens only if a further
|
|
// divergence happens within an already-divergent path. This further
|
|
// divergence is referred to as the nested divergence.
|
|
//
|
|
// As changing from 01 to 00 never happens with backward goto, backward
|
|
// goto is treated as divergent, but not nested divergent for the purpose
|
|
// of this WA.
|
|
//
|
|
// This function first finds out which BB are in nested divergent branch and
|
|
// then add predicates to those NoMask instructions.
|
|
//
|
|
// [Some details]
|
|
// --------------
|
|
// This WA could be understood in terms of physical registers. When a NoMask
|
|
// instruction runs when it should not, it will change physical registers. If
|
|
// the physical registers have valid values that will be used later, this NoMask
|
|
// instruction will result in incorrect values in those registers. Here is an
|
|
// example:
|
|
// clang-format off
|
|
// fusedMask
|
|
// (0) (f0.0.any16h) goto(16) BB1 [11]
|
|
// BB0 [01]
|
|
// (1) (W) mov (1|M0) f0.1<1>:uw 0x3:uw
|
|
// (2) goto BB3
|
|
//
|
|
// BB1: [01, should be 00]
|
|
// (3) join (16) [11, should be 10]
|
|
// (4) (W) mov (1|M0) f0.1<1>:uw 0x0:uw
|
|
// (5) cmp (16|M0) (eq)f0.1 null<1>:uw r0.0<0;1,0>:uw r0.0<0;1,0>:uw
|
|
// (6) (W&f0.1.any16h) mov (1|M0) f0.1<1>:uw 0x0:uw
|
|
// BB2: [11, should be 10]
|
|
// (7) or (8|M0) (ne)f0.1 null<1>:uw r1.4<8;8,1>:uw r3.0<8;8,1>:uw
|
|
//
|
|
// BB3:
|
|
// (8) join (16) [11, correct]
|
|
// (9) (f0.1) sel (8|M0) r1.4<1>:uw r1.3<0;1,0>:uw 0x0:uw
|
|
// clang-format on
|
|
//
|
|
// where (4) & (5) are WA instructions. (6) has WA applied. f0.1 at (9) takes
|
|
// value either defined at (1) or (7). Suppose BigEU takes BB0 and SmallEU
|
|
// takes BB1-BB2 and both BigEU and SmallEU will join at (8). Thus, (9) of
|
|
// BigEU will take its value defined at (1) in BB0. Due to this HW bug, BigEU
|
|
// will execute noMask instruction (4) in BB1, causing f0.1's value to be
|
|
// changed. As a result, (9) of BigEU will actually take the value defined at
|
|
// (4), which is wrong.
|
|
//
|
|
// To prevent this from happening, the workaround flag will have the following
|
|
// sequence:
|
|
// (W) mov (1|M0) r32.3:uw f0.1 // save f0.1
|
|
// (4) (W) mov (1|M0) f0.1<1>:uw 0x0:uw
|
|
// (5) cmp (16|M0) (eq)f0.1 null<1>:uw r0.0<0;1,0>:uw r0.0<0;1,0>:uw
// (6) (W&f0.1.any16h) mov (1|M0) f0.1<1>:uw 0x0:uw
// (W) mov (1|M0) f0.1 r32.3:uw // restore f0.1
|
|
// In doing so, f0.1 will be the original value, and the above issue is
|
|
// avoided.
|
|
//
|
|
// Since new mov (save/restore f0.1) instructions are noMask instructions,
|
|
// r32.3 is also needed to avoid clobbering any valid variables allocated to
|
|
// r32.3 too.
|
|
//
|
|
// We guarantee this by reserving GRFs as needed during applying WAs.
|
|
//
|
|
// [more on insts after register allocation]
|
|
// -----------------------------------------
|
|
// Assuming BB1 is on off EU.
|
|
//
|
|
// V77 (2GRF) spills at offset[4x32]. The following code reads V77 from spill
|
|
// location, and modifies it, and finally write the result back into
|
|
// offset[4x32]. If the code can keep the content at this location unchanged,
|
|
// no WA is needed; otherwise, we must have WA.
|
|
//
|
|
// But write at (3) will write whatever in r4 into offset[4x32], which is
|
|
// undefined, definitely not guaranteed to be the same as r1 just read from
|
|
// the same location. (Note that mul at (2) will not run because the channel
// enable is off.) Thus the write at (3) modifies the content at offset[4x32],
// which is wrong.
|
|
//
|
|
// Before RA:
|
|
// BB1:
|
|
// mul (M1, 16) V77(0,0)<1> V141(0,0)<0;1,0> V77(0,0)<1;1,0>
|
|
// BB2:
|
|
// svm_block_st (4) V154(0,0)<0;1,0> V77.0
|
|
//
|
|
// After RA
|
|
// BB1:
|
|
// (1) // wr:1h+0, rd:2; hword scratch block read x2
|
|
// // scratch space fill: FL_GRF_V77_6 from offset[4x32]
|
|
// (W) send.dc0 (16|M0) r1 r0 null 0x0 0x022C1004
|
|
// (2) mul (16|M0) r4.0<1>:f r3.0<0;1,0>:f r1.0<8;8,1>:f
|
|
// (3) // wr:1h+2, rd:0; hword scratch block write x2
|
|
// // scratch space spill: SP_GRF_V77_3 from offset[4x32];
|
|
// (W) send.dc0 (16|M0) null r0 r4 0x80 0x020F1004
|
|
//
|
|
// For flag spill:
|
|
// Need WA as well due to the following case:
|
|
//
|
|
// After RA:
|
|
// BB_19:
|
|
// (W) mov (1|M0) r34.8<1>:uw f0.1<0;1,0>:uw
|
|
// ...
|
|
// BB_21:
|
|
// (W) mov (1|M0) f1.1<1>:uw r34.8<0;1,0>:uw
|
|
//
|
|
// If BB_19 should be skipped but runs due to this HW bug, r34.8 will be
// updated with f0.1, which holds an undefined value. And at BB_21, reading
// from r34.8 will get a garbage value!
|
|
// ======================================================================================
|
|
// The NoMask WA has two parts:
|
|
// preRA part: prepare for applying WA in postRA
|
|
// postRA part: apply WAs
|
|
//
|
|
// prepareNoMaskWA is preRA part. It does:
|
|
// 1. Determines if NoMask WA needs to be applied for any BB
|
|
// This is done by using nested divergence to decide whether a BB needs
|
|
// WA.
|
|
// 2. If WA is needed, reserve dedicated GRFs
|
|
// Check all insts that need WA and decide how much GRF to be reserved.
|
|
// At most 2GRF + 2DW is needed.
|
|
// This info, reserved GRFs and whether there are insts that need WA, is
|
|
// passed into postRA. Note that even though there is no inst that need WA
|
|
// preRA, it is still possible that spill/fill needs WA. Thus, at least 2DW
|
|
// will be reserved.
|
|
//
|
|
// ApplyNoMaskWA() : postRA part.
|
|
void Optimizer::prepareNoMaskWA() {
|
|
std::unordered_map<G4_BB *, int> nestedDivergentBBs;
|
|
const G4_ExecSize simdsize = fg.getKernel()->getSimdSize();
|
|
|
|
// Identify BBs that need WA
|
|
fg.reassignBlockIDs();
|
|
fg.findNestedDivergentBBs(nestedDivergentBBs);
|
|
|
|
// Return true if a NoMask inst is either send or global
|
|
auto isCandidateInst = [&](G4_INST *Inst, FlowGraph &cfg) -> bool {
|
|
// pseudo should be gone at this time [skip all pseudo].
|
|
if (!Inst->isWriteEnableInst() || Inst->isCFInst() ||
|
|
Inst->isPseudoLogic() || Inst->isPseudoKill() ||
|
|
Inst->isWait() || // predicate not supported
|
|
Inst->opcode() == G4_nop) // predicate not supported
|
|
{
|
|
return false;
|
|
}
|
|
if (Inst->isSend() && Inst->getPredicate() &&
|
|
Inst->getExecSize() > simdsize) {
|
|
// fused send, already correctly predicated, skip
|
|
return false;
|
|
}
|
|
if (Inst->isEOT()) {
|
|
// Algo assumes no WA needed for entry and exit, skip EOT for now.
|
|
return false;
|
|
}
|
|
return true;
|
|
};
|
|
|
|
// If true, there exist NoMask insts that need WA.
|
|
bool hasWAInst = false;
|
|
bool reserveWAFlag = false;
|
|
uint32_t numTempInUD = 0; // size of temp in UD
|
|
G4_SubReg_Align tempAlign = Even_Word;
|
|
|
|
auto updateTempReserve = [&](uint32_t aNumElts, G4_Type aEltTy,
|
|
G4_SubReg_Align aAlign) {
|
|
uint32_t newBytes = aNumElts * TypeSize(aEltTy);
|
|
uint32_t newDWs = (newBytes + 3) / 4;
|
|
if (newDWs > numTempInUD) {
|
|
numTempInUD = newDWs;
|
|
}
|
|
if (tempAlign < aAlign) {
|
|
tempAlign = aAlign;
|
|
}
|
|
};
|
|
|
|
// Scan all insts and mark then if WAs are needed
|
|
for (auto BI : fg) {
|
|
G4_BB *BB = BI;
|
|
if ((BB->getBBType() & G4_BB_NM_WA_TYPE) == 0) {
|
|
continue;
|
|
}
|
|
|
|
// This BB might need WA, thus reserved GRF for WA flags.
|
|
// (Even though there is no NoMask inst in this BB now, later RA might
|
|
// generate
|
|
// spill/fill in this BB. Thus WAFlagReserve shoud be set here.)
|
|
reserveWAFlag = true;
|
|
for (auto II = BB->begin(), IE = BB->end(); II != IE; ++II) {
|
|
G4_INST *I = *II;
|
|
if (isCandidateInst(I, fg)) {
|
|
I->setNeedPostRA(true);
|
|
hasWAInst = true;
|
|
|
|
// Check if any temps are needed.
|
|
G4_CondMod *condmod = I->getCondMod();
|
|
G4_Predicate *pred = I->getPredicate();
|
|
if (I->opcode() == G4_sel || I->opcode() == G4_csel) {
|
|
// doFlagModifierSelInstWA : temp for saving dst (could be 2GRF)
|
|
// Note: sel's pred isn't used for calculating WrEn, and csel does
|
|
// not allow predicate.
|
|
G4_DstRegRegion* dst = I->getDst();
|
|
if (dst && !dst->isNullReg()) {
|
|
(void)updateTempReserve(I->getExecSize() * dst->getHorzStride(),
|
|
dst->getType(), dst->getTopDcl()->getSubRegAlign());
|
|
}
|
|
else
|
|
vISA_ASSERT(false, "ICE: expect dst to be non-null!");
|
|
} else if (pred && !condmod) {
|
|
// doPredicateInstWA(): need 1 DW
|
|
updateTempReserve(1, Type_UD, Even_Word);
|
|
} else if (!pred && condmod) {
|
|
// doFlagModifierInstWA : temp for saving condmod
|
|
updateTempReserve(1, Type_UD, Even_Word);
|
|
} else if (pred && condmod) {
|
|
// doPredicateAndFlagModifierInstWA : temp for saving predicate
|
|
updateTempReserve(1, Type_UD, Even_Word);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
G4_BB *entryBB = fg.getEntryBB();
|
|
vASSERT(entryBB);
|
|
G4_Declare *WATemp = nullptr;
|
|
if (numTempInUD > 0) {
|
|
// For temps other than WA flags. Its size will be the largest of all temps
|
|
// It is at most 2 GRF (dst that uses maximum 2 GRF).
|
|
WATemp = builder.createTempVar(numTempInUD, Type_UD, tempAlign, "WATemp");
|
|
WATemp->setLiveIn();
|
|
WATemp->setLiveOut();
|
|
WATemp->setDoNotSpill();
|
|
|
|
// Add a pseudo use inst so that RA will include this temp for reg
|
|
// allocation.
|
|
G4_ExecSize sz =
|
|
builder.toExecSize(Get_VISA_Exec_Size_From_Raw_Size(numTempInUD));
|
|
G4_SrcRegRegion *use =
|
|
builder.createSrc(WATemp->getRegVar(), 0, 0,
|
|
(sz == g4::SIMD1 ? builder.getRegionScalar()
|
|
: builder.getRegionStride1()),
|
|
Type_UD);
|
|
G4_INST *pseudoUseInst = builder.createIntrinsicInst(
|
|
nullptr, Intrinsic::FlagSpill, sz, nullptr, use, nullptr, nullptr,
|
|
InstOpt_NoOpt, false);
|
|
|
|
INST_LIST_ITER inst_it = entryBB->getFirstInsertPos();
|
|
entryBB->insertBefore(inst_it, pseudoUseInst);
|
|
}
|
|
|
|
// WA flag temp: 2 DW.
|
|
// The First for saving the existing flag so that WA flag can use it.
|
|
// The second one is a temp for saving WA flag to avoid recalculating it.
|
|
G4_Declare *WAFlagReserve = nullptr;
|
|
if (reserveWAFlag) {
|
|
WAFlagReserve = builder.createTempVar(2, Type_UD, Even_Word, "WAFlag");
|
|
WAFlagReserve->setLiveIn();
|
|
WAFlagReserve->setLiveOut();
|
|
WAFlagReserve->setDoNotSpill();
|
|
|
|
G4_SrcRegRegion *src = builder.createSrc(
|
|
WAFlagReserve->getRegVar(), 0, 0, builder.getRegionStride1(), Type_UD);
|
|
G4_INST *pseudoUseInst = builder.createIntrinsicInst(
|
|
nullptr, Intrinsic::FlagSpill, g4::SIMD2, nullptr, src, nullptr,
|
|
nullptr, InstOpt_NoOpt, false);
|
|
|
|
INST_LIST_ITER inst_it = entryBB->getFirstInsertPos();
|
|
entryBB->insertBefore(inst_it, pseudoUseInst);
|
|
};
|
|
|
|
// Save info for applyNoMaskWA() to use after RA.
|
|
// If reserveWAFlag is false, there is no need to apply WA at all (including
|
|
// postRA).
|
|
if (reserveWAFlag) {
|
|
kernel.createNoMaskWAInfo(WAFlagReserve, WATemp, hasWAInst);
|
|
}
|
|
}
|
|
|
|
void Optimizer::applyNoMaskWA() {
|
|
// Utility class to get flag def/use info for a BB
|
|
// Each of 16-bit flag has one bit to track whether it is used or defined.
|
|
// We have 4 flags, thus 4 bits for use and 4 bits for def.
|
|
//
|
|
// DefUse info is encoded as uint32_t, in which the first 4 bits of 1st
|
|
// half and the 2nd half are for use and def, respectively, that is,
|
|
// [3:0] : use (f1.1, f1.0, f0.1, f0.0)
|
|
// [19:16] : def (f1.1, f1.0, f0.1, f0.0)
|
|
//
|
|
// For example, 0xA0001 (1010b, 0001b) -> f1.1 & f0.1 are defined, f0.0 is
|
|
// used
|
|
//
|
|
// Convention:
|
|
// Inst iterator range is represented as [a, b], or [a, b), in which '['
|
|
// and ']' means inclusive, where '(' and ')' means exclusive. For
|
|
// example, [1, 10) means 1 to 9, where [1, 10] means 1 to 10.
|
|
class FlagDefUse {
|
|
G4_BB *m_BB;
|
|
// Keep track DefUse info for each inst.
|
|
std::unordered_map<G4_INST *, uint32_t> m_flagDefUse;
|
|
|
|
public:
|
|
FlagDefUse(G4_BB *aBB) : m_BB(aBB) {}
|
|
|
|
// return value:
|
|
// true: if "O" is flag and has assigned a physical flag. This physical
|
|
// reg
|
|
// is returned as (freg, fsreg):ty.
|
|
// false: otherwise
|
|
//
|
|
// Note this code mimics the logic of printRegVarOff() in G4_IR.cpp.
|
|
//
|
|
// For pred/condMod, "ty" is the actual size that this "O" accesses,
|
|
// not the decl size of "O". For example,
|
|
// cmp (16|M16) (eq)f0.0 ...
|
|
// this func returns with f(0,0):UW, but "O" is of UD!
|
|
static bool getFlagRegAndSubreg(G4_Operand *O, uint32_t &freg,
|
|
uint32_t &fsreg, G4_Type &ty) {
|
|
// flag:
|
|
// reg no = base's ExRegNum()
|
|
// subregoff = base's subregoff + Operand's subregoff (in UW)
|
|
//
|
|
// Type difference b/w base and operand is not considered here for flag as
|
|
// the base's type is always UW. Operand's type can be UW/UD. If operand's
|
|
// type is UD, its subregoff in UD must be 0, which is the same as one in
|
|
// UW. Therefore, simply treat operand's subRegOff as in UW.
|
|
uint32_t nSubFlag = (O->getRightBound() - O->getLeftBound() + 16) / 16;
|
|
uint32_t subregoff = 0;
|
|
if (O->isSrcRegRegion()) {
|
|
subregoff = O->asSrcRegRegion()->getSubRegOff();
|
|
} else if (O->isDstRegRegion()) {
|
|
subregoff = O->asDstRegRegion()->getSubRegOff();
|
|
} else if (O->isPredicate()) {
|
|
subregoff = O->asPredicate()->getSubRegOff();
|
|
} else if (O->isCondMod()) {
|
|
subregoff = O->asCondMod()->getSubRegOff();
|
|
}
|
|
|
|
G4_VarBase *BVar = O->getBase();
|
|
ty = (nSubFlag == 1 ? Type_UW : Type_UD);
|
|
bool isValid = false;
|
|
if (BVar) {
|
|
freg = BVar->ExRegNum(isValid);
|
|
fsreg = BVar->asRegVar()->getPhyRegOff() + subregoff;
|
|
}
|
|
return isValid;
|
|
}
|
|
|
|
private:
|
|
uint16_t getFlagBits(G4_Operand *O) {
|
|
uint32_t r, sr;
|
|
G4_Type t;
|
|
if (getFlagRegAndSubreg(O, r, sr, t)) {
|
|
// For the following cases, getFlagRegAndSubreg() returns with r=1,
|
|
// sr=0, ty=UW. But they really access f1.1. Thus, do adjustment to get
|
|
// the right flag bits!
|
|
// cmp (16|M16) (eq)f1.0 ...
|
|
// (f1.0) mov (16|M16) ....
|
|
if ((O->isPredicate() || O->isCondMod()) && t == Type_UW) {
|
|
// sanity check: subreg could be 1 only if rightBound < 16
|
|
vASSERT(sr == 0 || O->getRightBound() < 16);
|
|
|
|
if (O->getLeftBound() >= 16) {
|
|
// typical cases like ones in comments above
|
|
sr = 1;
|
|
} else if (O->getRightBound() >= 16) {
|
|
// cross two sub-flags (f1.0 and f1.1). Reset t to UD
|
|
t = Type_UD;
|
|
}
|
|
}
|
|
|
|
uint16_t bits = (t == Type_UD ? 0x3 : 0x1);
|
|
return (bits << (r * 2 + sr));
|
|
}
|
|
vISA_ASSERT_UNREACHABLE("Flag: not allocated to physical register!");
|
|
return 0;
|
|
};
|
|
|
|
uint32_t getFlagDefUseBits(G4_INST *aI) {
|
|
auto MI = m_flagDefUse.find(aI);
|
|
if (MI != m_flagDefUse.end()) {
|
|
return MI->second;
|
|
}
|
|
|
|
uint16_t flagUse = 0;
|
|
uint16_t flagDef = 0;
|
|
for (int i = 0, sz = (int)aI->getNumSrc(); i < sz; ++i) {
|
|
G4_Operand *S = aI->getOperand(aI->getSrcOperandNum(i));
|
|
if (S && S->isFlag()) {
|
|
vASSERT(S->asSrcRegRegion()->getBase()->getAreg());
|
|
flagUse |= getFlagBits(S);
|
|
}
|
|
}
|
|
// predicate
|
|
if (G4_Predicate *P = aI->getPredicate()) {
|
|
flagUse |= getFlagBits(P);
|
|
}
|
|
// defs
|
|
G4_Operand *D = aI->getDst();
|
|
if (D && !D->isNullReg() && D->isFlag()) {
|
|
vASSERT(D->asDstRegRegion()->getBase()->getAreg());
|
|
flagDef |= getFlagBits(D);
|
|
}
|
|
if (aI->opcode() != G4_sel &&
|
|
aI->opcode() != G4_csel) { // sel does not update condMod
|
|
if (G4_CondMod *Mod = aI->getCondMod()) {
|
|
flagDef |= getFlagBits(Mod);
|
|
}
|
|
}
|
|
uint32_t retBits = (flagDef << 16) | flagUse;
|
|
m_flagDefUse.insert(std::make_pair(aI, retBits));
|
|
return retBits;
|
|
}
|
|
|
|
// Return flag bits for instructions within [SI, EI).
|
|
uint32_t getInstsBits(INST_LIST_ITER SI, INST_LIST_ITER EI) {
|
|
uint32_t defuse = 0;
|
|
for (auto II = SI; II != EI; ++II) {
|
|
G4_INST *tI = *II;
|
|
defuse |= getFlagDefUseBits(tI);
|
|
}
|
|
return defuse;
|
|
}
|
|
|
|
// Return true: if there is a flag that is not referenced by this duBits.
|
|
// The returned flag (freg, fsreg) is a unreferenced one.
|
|
// false: otherwise.
|
|
bool getUnreferencedFlag(uint32_t duBits, G4_Type fty, uint32_t &freg,
|
|
uint32_t &fsreg) {
|
|
uint32_t fBits = (fty == Type_UD) ? 0x3 : 0x1;
|
|
uint32_t duBitsD = (duBits >> 16);
|
|
int i = 0;
|
|
for (; i < 4; i += (fty == Type_UD ? 2 : 1)) {
|
|
if ((fBits & duBits) == 0 // Use
|
|
&& (fBits & duBitsD) == 0) // Def
|
|
{
|
|
freg = i / 2;
|
|
fsreg = i % 2;
|
|
return true;
|
|
}
|
|
fBits = (fBits << (fty == Type_UD ? 2 : 1));
|
|
}
|
|
return false;
|
|
}
|
|
|
|
public:
|
|
// Let BI = aWaInsts[aStartIx], EI = ++(aWaInsts.back()).
|
|
// Note that aWaInsts's element is of INST_LIST_ITER.
|
|
//
|
|
// getBestFlagIfAvailable() searches [BI, EI), and it searches in order
|
|
// until no available flag can be used. (In doing so, we have the maximum
|
|
// number of WA insts that can use the same WA flag.) The argument 'aEndIx'
|
|
// is the index it stops when no flag can be used.
|
|
// Return value:
|
|
// false: If aEndIx == aStartIx, no flag can be used. This means that
|
|
// the inst at aStartIx takes
|
|
// all two flags.
|
|
// true: otherwise, (retFreg, retFsreg):FTy is not used in [
|
|
// aWaInsts[aStartIx], aWaInsts[aEndIx] ).
|
|
// If aEndIx = aWaInsts.size(), it means (retFreg, retFsreg):FTy
|
|
// can be used for all insts of aWaInsts, starting from
|
|
// aStartIx.
|
|
bool getBestFlagIfAvailable(const std::vector<INST_LIST_ITER> &aWaInsts,
|
|
const int32_t aStartIx, int32_t &aEndIx,
|
|
G4_Type FTy, uint32_t &retFreg,
|
|
uint32_t &retFsreg) {
|
|
// initialize flag to be invalid
|
|
retFreg = 0xff;
|
|
retFsreg = 0xff;
|
|
|
|
int SIx = aStartIx;
|
|
INST_LIST_ITER BI = aWaInsts[SIx];
|
|
uint32_t DUBits = 0;
|
|
for (const int EIx = (int)aWaInsts.size(); SIx < EIx; ++SIx) {
|
|
uint32_t r, s;
|
|
INST_LIST_ITER NI = std::next(aWaInsts[SIx]);
|
|
DUBits |= getInstsBits(BI, NI);
|
|
if (!getUnreferencedFlag(DUBits, FTy, r, s)) {
|
|
// no flag is available at ix
|
|
break;
|
|
}
|
|
retFreg = r;
|
|
retFsreg = s;
|
|
BI = NI; // set the next starting iterator
|
|
}
|
|
|
|
aEndIx = SIx;
|
|
return SIx != aStartIx;
|
|
}
|
|
};
|
|
|
|
// Only need to create at most 6 WAFlag temps.
|
|
G4_Declare *FlagUD[2] = {nullptr, nullptr};
|
|
G4_Declare *FlagUW[4] = {nullptr, nullptr, nullptr, nullptr};
|
|
auto getFlagDcl = [&](uint32_t aFreg, uint32_t aFsreg, G4_Type aFTy) {
|
|
G4_Declare *retDcl;
|
|
if (aFTy == Type_UD) {
|
|
int ix = aFreg;
|
|
vASSERT(ix < ARRAY_COUNT(FlagUD));
|
|
if (FlagUD[ix] == nullptr) {
|
|
FlagUD[ix] = builder.createTempFlag(2, "WAFlagUD");
|
|
}
|
|
retDcl = FlagUD[ix];
|
|
} else {
|
|
int ix = 2 * aFreg + aFsreg;
|
|
vASSERT(ix < ARRAY_COUNT(FlagUW));
|
|
if (FlagUW[ix] == nullptr) {
|
|
FlagUW[ix] = builder.createTempFlag(1, "WAFlagUW");
|
|
}
|
|
retDcl = FlagUW[ix];
|
|
}
|
|
return retDcl;
|
|
};
|
|
|
|
// Get those GRFs reserved in prepareNoMaskWA()
|
|
NoMaskWAInfo *WAInfo = kernel.getEUFusionNoMaskWAInfo();
|
|
|
|
// If no spill AND no inst that needs WA, just return.
|
|
// ' HasWAInsts = true' means that before RA, there are insts that need WA
|
|
const bool HasFlagSpill = (builder.getJitInfo()->stats.numFlagSpillStore > 0);
|
|
const bool HasGRFSpill = (builder.getJitInfo()->stats.spillMemUsed > 0);
|
|
if (!WAInfo || // No BB needs WA
|
|
(!(HasFlagSpill || HasGRFSpill) &&
|
|
!WAInfo->HasWAInsts)) // No Spill, no WA Insts
|
|
{
|
|
kernel.deleteEUFusionNoMaskWAInfo();
|
|
return;
|
|
}
|
|
|
|
const G4_ExecSize Simdsize = fg.getKernel()->getSimdSize();
|
|
const RegionDesc *ScalarReg = builder.getRegionScalar();
|
|
bool UseAnyh = true; // default, adjusted for each BB.
|
|
|
|
// WAFlagReserve is 2DW GRF.
|
|
// An example about how to use it.
|
|
// Assume WAFlag is f0.1:uw
|
|
//
|
|
// ===========================================
|
|
// | DW0 | DW1 |
|
|
// | uw0 | uw1 | uw0 | uw1 |
|
|
// ===========================================
|
|
// | orig f0.1 | | WA f0.1 | | <-- WAFlag = f0.1:uw
|
|
// ============================================
|
|
// | orig f0.0 | WA f0.0 | <-- WAFlag = f0.0:ud
|
|
// ===========================================
|
|
//
|
|
// If WAFlag cannot be used to all insts as it is clobbered somewhere in the
|
|
// middle, it must be saved in DW1.
|
|
//
|
|
G4_Declare *SaveDcl = WAInfo->WAFlagReserved; // 2DW
|
|
G4_RegVar *SaveVar = SaveDcl->getRegVar();
|
|
G4_Declare *WATempDcl = WAInfo->WATempReserved; // 0 - 2 GRF
|
|
G4_RegVar *WATempVar = (WATempDcl ? WATempDcl->getRegVar() : nullptr);
|
|
|
|
#if defined(_DEBUG) || defined(_INTERNAL)
|
|
// Check if linearStart has been done and SaveDcl/WATempDcl has been
|
|
// allocated. (computePReg() set GRFBaseOffset().
|
|
auto checkDclPReg = [&](G4_Declare *aDcl) {
|
|
// Set lineartStar for aDcl
|
|
G4_RegVar *RegVar = aDcl->getRegVar();
|
|
vASSERT(RegVar->isPhyRegAssigned() && RegVar->getPhyReg()->isGreg());
|
|
uint32_t regNum =
|
|
(static_cast<G4_Greg *>(RegVar->getPhyReg()))->getRegNum();
|
|
uint32_t subRegNum = RegVar->getPhyRegOff();
|
|
uint32_t dclEltBytes = aDcl->getElemSize();
|
|
uint32_t linearizedStart =
|
|
(regNum * builder.numEltPerGRF<Type_UB>()) + (subRegNum * dclEltBytes);
|
|
vASSERT(aDcl->getGRFOffsetFromR0() == linearizedStart);
|
|
};
|
|
|
|
checkDclPReg(SaveDcl);
|
|
if (WATempDcl != nullptr) {
|
|
checkDclPReg(WATempDcl);
|
|
}
|
|
#endif
|
|
|
|
auto verifyRegVarSize = [&](G4_RegVar *aRegVar, uint32_t aBytes) {
  // Debug/internal builds only: assert that aRegVar's declare provides at
  // least aBytes bytes. A null aRegVar counts as zero bytes.
#if defined(_DEBUG) || defined(_INTERNAL)
  uint32_t reservedBytes = 0;
  if (aRegVar != nullptr) {
    reservedBytes = aRegVar->getDeclare()->getByteSize();
  }
  if (reservedBytes < aBytes) {
    vISA_ASSERT(false, "WATemp does not reserve enough space!");
  }
#endif
};
|
|
|
|
// Sub-register offset, in units of aT, of the second DW of the reserved
// WAFlag storage: element 1 for UD, element 2 for any other type.
auto WAFlagSaveOff = [](G4_Type aT) {
  if (aT == Type_UD) {
    return 1;
  }
  return 2;
};
|
|
auto isNull = [](G4_Operand *aO) {
|
|
return (aO == nullptr || aO->isNullReg());
|
|
};
|
|
|
|
// Predicate control to put on a WA-flag predicate: the anyNh variant matching
// the kernel SIMD size when aUseAnyh is set, PRED_DEFAULT otherwise.
auto getPredCtrl = [&Simdsize](bool aUseAnyh) -> G4_Predicate_Control {
  if (!aUseAnyh) {
    return PRED_DEFAULT;
  }
  if (Simdsize == g4::SIMD8) {
    return PRED_ANY8H;
  }
  if (Simdsize == g4::SIMD16) {
    return PRED_ANY16H;
  }
  return PRED_ANY32H;
};
|
|
|
|
auto isCandidate = [](G4_INST *I) {
|
|
return (I->getNeedPostRA() && I->isWriteEnableInst());
|
|
};
|
|
|
|
// Create WAFlag using mov and cmp.
|
|
auto createFlagFromCmp = [&](G4_BB *aBB, INST_LIST_ITER &aInsertBeforePos,
|
|
G4_RegVar *aFlag, G4_Type aTy) {
|
|
// I0: (W) mov (1|M0) f0.0<1>:aTy, 0
|
|
// I1: cmp (Simdsize|M0) (eq)f0.0 r0<0;1,0>:uw r0<0;1,0>:uw
|
|
// I2 (W&f0.0.anyh) mov (1|M0) f0.0:aTy 0xffffffff:aTy [optional]
|
|
G4_DstRegRegion *D = builder.createDst(aFlag, 0, 0, 1, aTy);
|
|
G4_INST *I0 = builder.createMov(g4::SIMD1, D, builder.createImm(0, aTy),
|
|
InstOpt_WriteEnable, false);
|
|
aBB->insertBefore(aInsertBeforePos, I0);
|
|
|
|
G4_RegVar *cmpVar;
|
|
const bool USE_R0_FOR_EMASK_CMP = false;
|
|
if (USE_R0_FOR_EMASK_CMP) {
|
|
cmpVar = builder.getRealR0()->getRegVar();
|
|
} else {
|
|
// using r2.0:uw for cmp
|
|
G4_Declare *cmpDcl = builder.createHardwiredDeclare(1, Type_UW, 2, 0);
|
|
cmpVar = cmpDcl->getRegVar();
|
|
}
|
|
|
|
G4_SrcRegRegion *r_0 = builder.createSrc(cmpVar, 0, 0, ScalarReg, Type_UW);
|
|
G4_SrcRegRegion *r_1 = builder.createSrc(cmpVar, 0, 0, ScalarReg, Type_UW);
|
|
G4_CondMod *flagCM = builder.createCondMod(Mod_e, aFlag, 0);
|
|
G4_DstRegRegion *nullDst = builder.createNullDst(Type_UW);
|
|
G4_INST *I1 =
|
|
builder.createInternalInst(NULL, G4_cmp, flagCM, g4::NOSAT, Simdsize,
|
|
nullDst, r_0, r_1, InstOpt_M0);
|
|
aBB->insertBefore(aInsertBeforePos, I1);
|
|
|
|
if (!UseAnyh) {
|
|
G4_Imm *allone = builder.createImm(0xFFFFFFFF, aTy);
|
|
G4_DstRegRegion *tF = builder.createDst(aFlag, 0, 0, 1, aTy);
|
|
G4_INST *I2 =
|
|
builder.createMov(g4::SIMD1, tF, allone, InstOpt_WriteEnable, false);
|
|
G4_Predicate *I2_P = builder.createPredicate(
|
|
PredState_Plus, aFlag, 0,
|
|
(Simdsize == g4::SIMD8
|
|
? PRED_ANY8H
|
|
: (Simdsize == g4::SIMD16 ? PRED_ANY16H : PRED_ANY32H)));
|
|
I2->setPredicate(I2_P);
|
|
aBB->insertBefore(aInsertBeforePos, I2);
|
|
}
|
|
};
|
|
|
|
auto createSIMD1Mov = [&](G4_BB *aBB, INST_LIST_ITER &aInsertBeforePos,
|
|
G4_RegVar *Dst, unsigned Dst_soff, G4_RegVar *Src,
|
|
unsigned Src_soff, G4_Type Ty) {
|
|
G4_DstRegRegion *D = builder.createDst(Dst, 0, Dst_soff, 1, Ty);
|
|
G4_SrcRegRegion *S = builder.createSrc(Src, 0, Src_soff, ScalarReg, Ty);
|
|
G4_INST *tI =
|
|
builder.createMov(g4::SIMD1, D, S, InstOpt_WriteEnable, false);
|
|
aBB->insertBefore(aInsertBeforePos, tI);
|
|
return tI;
|
|
};
|
|
|
|
auto initWAFlag = [&](G4_BB *aBB, INST_LIST_ITER &aInsertBeforePos,
|
|
G4_RegVar *aFlag, G4_Type aTy, bool &aFlagCreated,
|
|
bool &aFlagSaved, const bool aSaveFlag) {
|
|
if (aFlagCreated) {
|
|
// Reload the already-saved WAFlag
|
|
vISA_ASSERT(aFlagSaved, "WAFlag should have been saved!");
|
|
(void)createSIMD1Mov(aBB, aInsertBeforePos, aFlag, 0, SaveVar,
|
|
WAFlagSaveOff(aTy), aTy);
|
|
} else {
|
|
// Create a WAFlag for this BB
|
|
createFlagFromCmp(aBB, aInsertBeforePos, aFlag, aTy);
|
|
aFlagCreated = true;
|
|
|
|
if (!aFlagSaved && aSaveFlag) {
|
|
// save WAFlag
|
|
(void)createSIMD1Mov(aBB, aInsertBeforePos, SaveVar, WAFlagSaveOff(aTy),
|
|
aFlag, 0, aTy);
|
|
aFlagSaved = true;
|
|
}
|
|
}
|
|
};
|
|
|
|
// doPredicateInstWA() : WA for a predicated inst without condMod
|
|
//
|
|
// flagVar : Var for WA flag for this BB:
|
|
// currII: iter to inst to which WA is applied.
|
|
// Given a predicated inst 'I'
|
|
// I : (W&[+-]P) <inst> (8|M0) ...
|
|
// to:
|
|
// I0: (W) mov (1|M0) waTemp<0;1,0> P
|
|
// I1: (W&-flagVar) mov (1|M0) P 0 [+] | 0xffff [-]
|
|
// I : (W&[+-]P) <inst> (8|M0) ... [unchanged]
|
|
// I2: (W&-flagVar) mov (1|M0) P waTemp<0;1,0>
|
|
//
|
|
// where the original predCtrl of P at 'I' shall remain unchanged.
|
|
//
|
|
auto doPredicateInstWA = [&](G4_BB *aBB, INST_LIST_ITER &aII,
|
|
G4_RegVar *aFlagVar) {
|
|
G4_INST *I = *aII;
|
|
G4_Predicate *P = I->getPredicate();
|
|
vISA_ASSERT((P && !I->getCondMod()),
|
|
"ICE: expect predicate and no flagModifier!");
|
|
|
|
uint32_t flagBits =
|
|
(P->getRightBound() - P->getLeftBound() + 1) + I->getMaskOffset();
|
|
vISA_ASSERT(
|
|
(16 * aFlagVar->getDeclare()->getRootDeclare()->getWordSize()) >=
|
|
flagBits,
|
|
"ICE[vISA]: WA's flagVar should not be smaller!");
|
|
|
|
G4_Type Ty = (flagBits > 16) ? Type_UD : Type_UW;
|
|
|
|
// I0: (W) mov (1|M0) waTemp P
|
|
verifyRegVarSize(WATempVar, 4);
|
|
(void)createSIMD1Mov(aBB, aII, WATempVar, 0, P->getTopDcl()->getRegVar(), 0,
|
|
Ty);
|
|
|
|
// I1: (W&-flagVar) mov (1|M0) P 0 [+] | 0xffff [-]
|
|
int64_t imm = (P->getState() == PredState_Plus ? 0 : 0xFFFFFFFF);
|
|
G4_Imm *I1_s0 = builder.createImm(imm, Ty);
|
|
G4_DstRegRegion *I1_d =
|
|
builder.createDst(P->getTopDcl()->getRegVar(), 0, 0, 1, Ty);
|
|
G4_Predicate *I1_flag = builder.createPredicate(PredState_Minus, aFlagVar,
|
|
0, getPredCtrl(UseAnyh));
|
|
G4_INST *I1 =
|
|
builder.createMov(g4::SIMD1, I1_d, I1_s0, InstOpt_WriteEnable, false);
|
|
I1->setPredicate(I1_flag);
|
|
aBB->insertBefore(aII, I1);
|
|
|
|
// I : unchanged
|
|
|
|
// I2: (W&-flagVar) mov (1|M0) P waTemp<0;1,0>
|
|
auto nextII = std::next(aII);
|
|
G4_INST *I2 = createSIMD1Mov(aBB, nextII, P->getTopDcl()->getRegVar(), 0,
|
|
WATempVar, 0, Ty);
|
|
G4_Predicate *I2_flag = builder.createPredicate(PredState_Minus, aFlagVar,
|
|
0, getPredCtrl(UseAnyh));
|
|
I2->setPredicate(I2_flag);
|
|
};
|
|
|
|
// doFlagModifierSelInstWA : WA for sel/csel inst
|
|
// sel: either predicate or condmod, not both
|
|
// csel: no predicate, must have condMod
|
|
// Both do not update flag.
|
|
//
|
|
// flagVar : WA flag for this BB
|
|
// Before:
|
|
// I: (W) sel.ge.f0.0 (1|M0) r10.0<1>:f r20.0<0;1,0>:f 0:f
|
|
// After
|
|
// I: (W) sel.ge.f0.0 (1|M0) WATemp:f r20.0<0;1,0>:f 0:f
|
|
// I0: (W&flagVar) mov (1|M0) r10.0<1>:f WATemp:f
|
|
//
|
|
auto doFlagModifierSelInstWA = [&](G4_BB *aBB, INST_LIST_ITER &aII,
|
|
G4_RegVar *aFlagVar) {
|
|
G4_INST *I = *aII;
|
|
G4_DstRegRegion *dst = I->getDst();
|
|
vISA_ASSERT(!isNull(dst), "ICE: expect dst to be non-null!");
|
|
|
|
// Make sure that a temp, created in preRA, is big enough to hold data and
|
|
// possible gap b/w data due to alignment/hw restriction.
|
|
const uint16_t HS = dst->getHorzStride();
|
|
uint32_t dst_bytes = I->getExecSize() * HS * dst->getTypeSize();
|
|
verifyRegVarSize(WATempVar, dst_bytes);
|
|
|
|
// I : (W) sel.ge.f0.0 (1|M0) WATemp:f r20.0<0;1,0>:f 0:f
|
|
G4_DstRegRegion *I_d =
|
|
builder.createDst(WATempVar, 0, 0, HS, dst->getType());
|
|
I->setDest(I_d);
|
|
|
|
// I0: (W&flagVar) mov (1|M0) r10.0<1>:f WATemp:f
|
|
const RegionDesc *regionSave =
|
|
builder.createRegionDesc(I->getExecSize(), HS, 1, 0);
|
|
auto nextII = std::next(aII);
|
|
G4_SrcRegRegion *I0_src0 =
|
|
builder.createSrc(WATempVar, 0, 0, regionSave, dst->getType());
|
|
G4_INST *I0 = builder.createMov(I->getExecSize(), dst, I0_src0,
|
|
InstOpt_WriteEnable, false);
|
|
G4_Predicate *I0_f = builder.createPredicate(PredState_Plus, aFlagVar, 0,
|
|
getPredCtrl(UseAnyh));
|
|
I0->setPredicate(I0_f);
|
|
aBB->insertBefore(nextII, I0);
|
|
};
|
|
|
|
// clang-format off
|
|
// doFlagModifierInstWA : WA for an inst with flagModifier but no predicate.
|
|
//
|
|
// flagVar : WA flag for this BB.
|
|
// Before:
|
|
// I: (W) cmp (16|M16) (ne)P D .... // 32-bit flag
|
|
// or
|
|
// (W) cmp (16|M0) (ne)P D .... // 16-bit flag
|
|
//
|
|
// After:
|
|
// (1) D = null (common)
|
|
// I0: (W) mov (1|M0) WATemp P
|
|
// I: (W) cmp (16|M16) (ne)P ....
|
|
// I1: (W&-flagVar) mov (1|M0) P WATemp
|
|
// (2) I's execMask is the same as flagVar's size
|
|
// (I's entire condMod is defined by I.)
|
|
// I0 (W) mov (1|M0) WATemp P
|
|
// I1: (W) mov (1|M0) P flagVar
|
|
// I: (W&P) cmp (16|M0) (ne)P ..... // add predicate
|
|
// I2: (W&~flagVar) mov (1|M0) P WATemp
|
|
// (3) otherwise(less common)
|
|
// Note that the sequence can only modify P that this cmp will
|
|
// change.
|
|
// I0: (W) mov (1|M0) WATemp P
|
|
// I1: (W) or (1|M0) P P <I's execMask> // enable all
|
|
// I2: (W&~flagVar) and (1|M0) P P ~<I's execMask> // disable all
|
|
// I: (W&P) cmp (16|M0) (ne)P ..... // add pred
|
|
// I3: (W&~flagVar) mov (1|M0) P WATemp
|
|
//
|
|
// clang-format on
|
|
auto doFlagModifierInstWA = [&](G4_BB *aBB, INST_LIST_ITER &aII,
|
|
G4_RegVar *aFlagVar) {
|
|
G4_INST *I = *aII;
|
|
G4_CondMod *P = I->getCondMod();
|
|
vISA_ASSERT((P && !I->getPredicate()),
|
|
"ICE: expect flagModifier and no predicate!");
|
|
|
|
// sel is specially handled in a different function.
|
|
vASSERT(!(I->opcode() == G4_sel || I->opcode() == G4_csel));
|
|
|
|
G4_Declare *modDcl = P->getTopDcl();
|
|
G4_RegVar *modVar = modDcl->getRegVar();
|
|
G4_Type Ty = (modDcl->getWordSize() > 1) ? Type_UD : Type_UW;
|
|
G4_Type flagVarTy =
|
|
(aFlagVar->getDeclare()->getWordSize() > 1 ? Type_UD : Type_UW);
|
|
if (isNull(I->getDst())) { // case 1
|
|
|
|
// I0: (W) mov (1|M0) WATemp P
|
|
verifyRegVarSize(WATempVar, 4);
|
|
(void)createSIMD1Mov(aBB, aII, WATempVar, 0, modVar, 0, Ty);
|
|
|
|
// I : unchanged
|
|
|
|
// I1: (W&-flagVar.anyh) mov (1|M0) P WATemp
|
|
auto nextII = std::next(aII);
|
|
G4_INST *I1 = createSIMD1Mov(aBB, nextII, modVar, 0, WATempVar, 0, Ty);
|
|
G4_Predicate *I1_f = builder.createPredicate(PredState_Minus, aFlagVar, 0,
|
|
getPredCtrl(UseAnyh));
|
|
I1->setPredicate(I1_f);
|
|
|
|
return;
|
|
}
|
|
|
|
const uint32_t execMask = I->getExecLaneMask();
|
|
vISA_ASSERT(
|
|
(Ty == Type_UD || (execMask & 0xFFFF0000) == 0),
|
|
"ICE: a flag used in an inst should not be smaller than the inst's "
|
|
"execMask!");
|
|
if (flagVarTy == Ty && ((execMask == 0xFFFF && Ty == Type_UW) ||
|
|
(execMask == 0xFFFFFFFF && Ty == Type_UD))) {
|
|
// case 2 : entire mod is defined by 'I' !
|
|
//
|
|
// I0: (W) mov (1|M0) WATemp P
|
|
verifyRegVarSize(WATempVar, 4);
|
|
(void)createSIMD1Mov(aBB, aII, WATempVar, 0, modVar, 0, Ty);
|
|
|
|
// I1: (W) mov (1|M0) P flagVar
|
|
(void)createSIMD1Mov(aBB, aII, modVar, 0, aFlagVar, 0, Ty);
|
|
|
|
// I: add the new predicate (must be the same as modDcl), for example:
|
|
// (W&P.anyh) cmp (16|M0) (ne)P ....
|
|
G4_Predicate *I_P = builder.createPredicate(PredState_Plus, modVar, 0,
|
|
getPredCtrl(UseAnyh));
|
|
I->setPredicate(I_P);
|
|
|
|
// I2: (W&~flagVar.anyh) mov (1|M0) P WATemp
|
|
auto nextII = std::next(aII);
|
|
G4_INST *I2 = createSIMD1Mov(aBB, nextII, modVar, 0, WATempVar, 0, Ty);
|
|
G4_Predicate *I2_f = builder.createPredicate(PredState_Minus, aFlagVar, 0,
|
|
getPredCtrl(UseAnyh));
|
|
I2->setPredicate(I2_f);
|
|
|
|
return;
|
|
}
|
|
|
|
// case 3 (less common)
|
|
//
|
|
// I0: (W) mov (1|M0) WATemp P<0;1,0>
|
|
verifyRegVarSize(WATempVar, 4);
|
|
(void)createSIMD1Mov(aBB, aII, WATempVar, 0, modVar, 0, Ty);
|
|
|
|
// I1: (W) or (1|M0) P P ExecMask
|
|
G4_SrcRegRegion *I1_s0 = builder.createSrc(modVar, 0, 0, ScalarReg, Ty);
|
|
G4_Imm *I1_s1 = builder.createImm(execMask, Ty);
|
|
G4_DstRegRegion *I1_d = builder.createDst(modVar, 0, 0, 1, Ty);
|
|
G4_INST *I1 = builder.createBinOp(G4_or, g4::SIMD1, I1_d, I1_s0, I1_s1,
|
|
InstOpt_WriteEnable, false);
|
|
aBB->insertBefore(aII, I1);
|
|
|
|
// I2: (W&~flagVar.anyh) and (1|M0) P P ~ExecMask
|
|
uint32_t negExecMask = (uint32_t)(~execMask);
|
|
G4_SrcRegRegion *I2_s0 = builder.createSrc(modVar, 0, 0, ScalarReg, Ty);
|
|
G4_Imm *I2_s1 = builder.createImm(negExecMask, Ty);
|
|
G4_DstRegRegion *I2_d = builder.createDst(modVar, 0, 0, 1, Ty);
|
|
G4_INST *I2 = builder.createBinOp(G4_and, g4::SIMD1, I2_d, I2_s0, I2_s1,
|
|
InstOpt_WriteEnable, false);
|
|
G4_Predicate *I2_f = builder.createPredicate(PredState_Minus, aFlagVar, 0,
|
|
getPredCtrl(UseAnyh));
|
|
I2->setPredicate(I2_f);
|
|
aBB->insertBefore(aII, I2);
|
|
|
|
// I: add a new predicate, for example:
|
|
// (W&P) cmp (16|M0) (ne)P .....
|
|
G4_Predicate *I_P =
|
|
builder.createPredicate(PredState_Plus, modVar, 0, PRED_DEFAULT);
|
|
I->setPredicate(I_P);
|
|
|
|
// I3: (W&~flagVar.anyh) mov (1|M0) P WATemp
|
|
auto nextII = std::next(aII);
|
|
G4_INST *I3 = createSIMD1Mov(aBB, nextII, modVar, 0, WATempVar, 0, Ty);
|
|
G4_Predicate *I3_f = builder.createPredicate(PredState_Minus, aFlagVar, 0,
|
|
getPredCtrl(UseAnyh));
|
|
I3->setPredicate(I3_f);
|
|
};
|
|
|
|
// clang-format off
|
|
// doPredicateAndFlagModifierInstWA : WA for inst with both predicate and
|
|
// condMod
|
|
//
|
|
// flagVar : emask for this BB:
|
|
//
|
|
// Before:
|
|
// I: (W&[-]P) and (16|M0) (ne)P ....
|
|
//
|
|
// After:
|
|
// I0: (W) mov (1|M0) WATemp P
|
|
// Three cases
|
|
// case 1: 'I' defines entire P
|
|
// I1: (W&-flagVar) mov (1|M0) P 0 (for +p)| ExecMask (for -P) // disable all lanes
|
|
// case 2: +P
|
|
// I1 (W&-flagVar) and (1|M0) P P ~execMask // disable all lanes
|
|
// case 3: -P
|
|
// I1 (W&-flagVar) or (1|M0) P P execMask // disable all lanes
|
|
//
|
|
// I: (W&[-]P) and (16|M0) (ne)P .... // unchanged
|
|
// I2: (W&-flagVar) mov (1|M0) P WATemp
|
|
//
|
|
// clang-format on
|
|
auto doPredicateAndFlagModifierInstWA = [&](G4_BB *aBB, INST_LIST_ITER &aII,
|
|
G4_RegVar *aFlagVar) {
|
|
G4_INST *I = *aII;
|
|
[[maybe_unused]] G4_Predicate *P = I->getPredicate();
|
|
[[maybe_unused]] G4_CondMod *M = I->getCondMod();
|
|
vISA_ASSERT((P && M), "ICE: expect both predicate and flagModifier!");
|
|
vISA_ASSERT(P->getTopDcl() == M->getTopDcl(),
|
|
"ICE: both predicate and flagMod must be the same flag!");
|
|
|
|
G4_Declare *modDcl = M->getTopDcl();
|
|
G4_RegVar *modVar = modDcl->getRegVar();
|
|
G4_Type Ty = (modDcl->getWordSize() > 1) ? Type_UD : Type_UW;
|
|
|
|
// I0: (W) mov (1|M0) WATemp P
|
|
verifyRegVarSize(WATempVar, 4);
|
|
(void)createSIMD1Mov(aBB, aII, WATempVar, 0, modVar, 0, Ty);
|
|
|
|
uint32_t execMask = I->getExecLaneMask();
|
|
uint32_t negExecMask = (uint32_t)(~execMask);
|
|
bool isPlusP = (P->getState() == PredState_Plus);
|
|
G4_INST *I1 = nullptr;
|
|
if ((Ty == Type_UD && execMask == 0xFFFFFFFF) ||
|
|
(Ty == Type_UW && execMask == 0xFFFF)) {
|
|
// case 1 : entire P are defined.
|
|
// I1: (W&-flagVar) mov (1|M0) P 0 (for +p)| ExecMask (for -P)
|
|
G4_DstRegRegion *I1_d = builder.createDst(modVar, 0, 0, 1, Ty);
|
|
G4_Imm *I1_imm = builder.createImm(isPlusP ? 0 : execMask, Ty);
|
|
I1 = builder.createMov(g4::SIMD1, I1_d, I1_imm, InstOpt_WriteEnable,
|
|
false);
|
|
G4_Predicate *I1_f = builder.createPredicate(PredState_Minus, aFlagVar, 0,
|
|
getPredCtrl(UseAnyh));
|
|
I1->setPredicate(I1_f);
|
|
aBB->insertBefore(aII, I1);
|
|
} else {
|
|
// case 2 & 3
|
|
//
|
|
// case 2: +P
|
|
// I1: (W&-flagVar) and (1|M0) P P ~execMask
|
|
// case 3: -P
|
|
// I1: (W&-flagVar) or (1|M0) P P execMask
|
|
G4_DstRegRegion *I1_d = builder.createDst(modVar, 0, 0, 1, Ty);
|
|
G4_SrcRegRegion *I1_s0 = builder.createSrc(modVar, 0, 0, ScalarReg, Ty);
|
|
G4_Imm *I1_imm =
|
|
builder.createImm((isPlusP ? negExecMask : execMask), Ty);
|
|
G4_opcode opc1 = (isPlusP ? G4_and : G4_or);
|
|
I1 = builder.createBinOp(opc1, g4::SIMD1, I1_d, I1_s0, I1_imm,
|
|
InstOpt_WriteEnable, false);
|
|
G4_Predicate *I1_f = builder.createPredicate(PredState_Minus, aFlagVar, 0,
|
|
getPredCtrl(UseAnyh));
|
|
I1->setPredicate(I1_f);
|
|
aBB->insertBefore(aII, I1);
|
|
}
|
|
|
|
// No change to I
|
|
|
|
// I2: (W&-flagVar) mov (1|M0) P WATemp
|
|
auto nextII = std::next(aII);
|
|
G4_INST *I2 = createSIMD1Mov(aBB, nextII, modVar, 0, WATempVar, 0, Ty);
|
|
G4_Predicate *I2_f = builder.createPredicate(PredState_Minus, aFlagVar, 0,
|
|
getPredCtrl(UseAnyh));
|
|
I2->setPredicate(I2_f);
|
|
};
|
|
|
|
auto doSimpleInstWA = [&](G4_BB *aBB, INST_LIST_ITER &aII,
|
|
G4_RegVar *aFlagVar) {
|
|
G4_INST *I = *aII;
|
|
[[maybe_unused]] G4_Predicate *P = I->getPredicate();
|
|
[[maybe_unused]] G4_CondMod *M = I->getCondMod();
|
|
vISA_ASSERT((P == nullptr && M == nullptr),
|
|
"ICE: expect neither pred nor condmod!");
|
|
|
|
G4_Predicate *newPred = builder.createPredicate(PredState_Plus, aFlagVar, 0,
|
|
getPredCtrl(UseAnyh));
|
|
I->setPredicate(newPred);
|
|
};
|
|
|
|
auto applyWAToInst = [&](G4_BB *aBB, INST_LIST_ITER &aII,
|
|
G4_RegVar *aFlagVar) {
|
|
G4_INST *I = *aII;
|
|
G4_Predicate *P = I->getPredicate();
|
|
G4_CondMod *M = I->getCondMod();
|
|
|
|
if ((I->opcode() == G4_sel || I->opcode() == G4_csel)) {
|
|
// Not expecting null dst, as it is no-op
|
|
if (!isNull(I->getDst())) {
|
|
doFlagModifierSelInstWA(aBB, aII, aFlagVar);
|
|
}
|
|
} else if (P == nullptr && M == nullptr) {
|
|
doSimpleInstWA(aBB, aII, aFlagVar);
|
|
} else if (P != nullptr && M == nullptr) {
|
|
doPredicateInstWA(aBB, aII, aFlagVar);
|
|
} else if (P == nullptr && M != nullptr) {
|
|
doFlagModifierInstWA(aBB, aII, aFlagVar);
|
|
} else {
|
|
doPredicateAndFlagModifierInstWA(aBB, aII, aFlagVar);
|
|
}
|
|
};
|
|
|
|
for (G4_BB *BB : kernel.fg) {
|
|
if ((BB->getBBType() & G4_BB_NM_WA_TYPE) == 0) {
|
|
continue;
|
|
}
|
|
|
|
std::vector<INST_LIST_ITER> waInsts;
|
|
// Set default for WAFlag's type, and it may be changed later.
|
|
G4_Type WATy = (Simdsize == g4::SIMD32 ? Type_UD : Type_UW);
|
|
// use anyh is preferred as it uses one instruction less.
|
|
UseAnyh = true;
|
|
|
|
// Collect all insts that need to apply WA. It also does:
|
|
// 1. Determine WAFlag is UD or UW (simdsize isn't enough); and
|
|
// 2. Check if WAFlag can use anyh or WAFlag must be all one's.
|
|
for (auto II = BB->begin(), IE = BB->end(); II != IE; ++II) {
|
|
G4_INST *I = *II;
|
|
if (isCandidate(I)) {
|
|
waInsts.push_back(II);
|
|
|
|
if ((I->getExecSize() + I->getMaskOffset()) > 16) {
|
|
WATy = Type_UD;
|
|
}
|
|
if (UseAnyh &&
|
|
(I->getExecSize() > Simdsize || I->getMaskOffset() != 0)) {
|
|
UseAnyh = false;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (waInsts.empty()) {
|
|
continue;
|
|
}
|
|
|
|
FlagDefUse FlagDUInfo(BB);
|
|
|
|
bool WAFlagCreated = false;
|
|
bool WAFlagSaved = false;
|
|
int ix = 0;
|
|
const int NumWAInsts = (int)waInsts.size();
|
|
while (ix < NumWAInsts) {
|
|
INST_LIST_ITER currII = waInsts[ix];
|
|
uint32_t WAFreg = 0xff; // init to invalid number
|
|
uint32_t WAFsreg = 0xff; // init to invalid number
|
|
|
|
int nextIx;
|
|
bool hasFreeFlag = FlagDUInfo.getBestFlagIfAvailable(
|
|
waInsts, ix, nextIx, WATy, WAFreg, WAFsreg);
|
|
if (hasFreeFlag) { // found available flag in [ix, nextIx).
|
|
vASSERT(nextIx > ix);
|
|
// Given
|
|
// (W) add (16|M0) r10 r20 r30
|
|
// Changed to
|
|
// 1) (W) mov (1|M0) saveVar f1.0
|
|
// 2) <init waflag f1.0>
|
|
// 3) apply WA to all inst in [ix, nextIx).
//    "(W) add (16|M0) r10 r20 r30" is at ix
// 4) (W) mov (1|M0) f1.0 saveVar
|
|
G4_RegVar *WAFlagVar = getFlagDcl(WAFreg, WAFsreg, WATy)->getRegVar();
|
|
WAFlagVar->setPhyReg(builder.phyregpool.getFlagAreg(WAFreg), WAFsreg);
|
|
|
|
// 1) save the original flag for WAFlag.
|
|
(void)createSIMD1Mov(BB, currII, SaveVar, 0, WAFlagVar, 0, WATy);
|
|
|
|
// 2) init or reload WAFlag
|
|
bool saveWAFlag = (nextIx < NumWAInsts);
|
|
initWAFlag(BB, currII, WAFlagVar, WATy, WAFlagCreated, WAFlagSaved,
|
|
saveWAFlag);
|
|
|
|
// 3) apply WA
|
|
INST_LIST_ITER lastII = waInsts[nextIx - 1];
|
|
INST_LIST_ITER nextII = std::next(lastII);
|
|
for (int j = ix; j < nextIx; ++j) {
|
|
currII = waInsts[j];
|
|
applyWAToInst(BB, currII, WAFlagVar);
|
|
}
|
|
|
|
// 4) restore the saved original flag before the next inst.
|
|
(void)createSIMD1Mov(BB, nextII, WAFlagVar, 0, SaveVar, 0, WATy);
|
|
|
|
// set ix for the next wa inst.
|
|
ix = nextIx;
|
|
} else {
|
|
uint32_t fr, fsr;
|
|
G4_Type ty;
|
|
|
|
// waInsts[ix] uses all flags. Need to save one to the reserved tmp.
|
|
// It is possible to have flag in src0, dst, and condMod/predicate.
|
|
// First, need to pick up one that is not used by condMod/predicate
|
|
// so that WAFlag can still work.
|
|
G4_INST *I = *currII;
|
|
G4_Predicate *P = I->getPredicate();
|
|
G4_CondMod *M = I->getCondMod();
|
|
G4_Operand *O_f = (P != nullptr ? (G4_Operand *)P : (G4_Operand *)M);
|
|
G4_Operand *src0 = I->getSrc(0);
|
|
G4_SrcRegRegion *sreg =
|
|
((!isNull(src0) && src0->isSrcRegRegion()) ? src0->asSrcRegRegion()
|
|
: nullptr);
|
|
G4_DstRegRegion *dreg = I->getDst();
|
|
if (O_f != nullptr) {
|
|
[[maybe_unused]] bool isValid =
|
|
FlagDefUse::getFlagRegAndSubreg(O_f, WAFreg, WAFsreg, ty);
|
|
vISA_ASSERT(isValid,
|
|
"Flag should've been assigned physical reg already!");
|
|
|
|
// WAFlag must use the other flag
|
|
WAFreg = (WAFreg == 0 ? 1 : 0);
|
|
} else {
|
|
G4_Operand *O =
|
|
(!isNull(sreg) && src0->isFlag())
|
|
? (G4_Operand *)sreg
|
|
: (G4_Operand *)((!isNull(dreg) && dreg->isFlag()) ? dreg
|
|
: nullptr);
|
|
vISA_ASSERT(
|
|
O != nullptr,
|
|
"ICE: inst must have flag operands if it uses all flags!");
|
|
|
|
[[maybe_unused]] bool isValid =
|
|
FlagDefUse::getFlagRegAndSubreg(O, WAFreg, WAFsreg, ty);
|
|
vISA_ASSERT(isValid,
|
|
"Flag should've been assigned physical reg already!");
|
|
}
|
|
|
|
// Save the entire flag, even though only the half is used.
|
|
G4_RegVar *tVar = getFlagDcl(WAFreg, 0, Type_UD)->getRegVar();
|
|
tVar->setPhyReg(builder.phyregpool.getFlagAreg(WAFreg), 0);
|
|
|
|
// WAFlag. It can be UW (no tVar:UD). Uses 0 as sreg always in this
|
|
// case.
|
|
WAFsreg = 0;
|
|
G4_RegVar *WAFlagVar = getFlagDcl(WAFreg, WAFsreg, WATy)->getRegVar();
|
|
WAFlagVar->setPhyReg(builder.phyregpool.getFlagAreg(WAFreg), WAFsreg);
|
|
|
|
// clang-format off
|
|
// Assume that simdsize = 32 and currII is
|
|
// (W&f0.1) or (1|M0) f1.0:uw f1.1 0x101:uw
|
|
// WA codes are:
|
|
// 1) (W) mov (1|M0) saveVar:ud f1.0:ud
|
|
// 2) <init waflag f1.0>
|
|
// 3) (W&f0.1) or (1|M0) saveVar:uw saveVar.1:uw 0x101:uw [WA will be applied]
|
|
// 4) (W) mov (1|M0) f1.0:ud saveVar:ud [needed for dst change]
|
|
// clang-format on
|
|
|
|
// 1) save the original flag for WAFlag.
|
|
(void)createSIMD1Mov(BB, currII, SaveVar, 0, tVar, 0, Type_UD);
|
|
|
|
// 2) create WAFlag if not yet, or reload the WAFlag
|
|
bool saveWAFlag = (ix != (NumWAInsts - 1));
|
|
initWAFlag(BB, currII, WAFlagVar, WATy, WAFlagCreated, WAFlagSaved,
|
|
saveWAFlag);
|
|
|
|
// 3) (1) Modify I; (2) apply WA
|
|
INST_LIST_ITER nextII = std::next(currII);
|
|
for (int i = 0; i < 2; ++i) {
|
|
G4_Operand *O = (i == 0 ? (G4_Operand *)dreg : (G4_Operand *)sreg);
|
|
if (!isNull(O) && O->isFlag()) {
|
|
[[maybe_unused]] bool isValid = FlagDefUse::getFlagRegAndSubreg(O, fr, fsr, ty);
|
|
vISA_ASSERT(isValid,
|
|
"Flag should've been assigned physical reg already!");
|
|
|
|
if (fr == WAFreg) {
|
|
// flag : either 2bytes at roff 0 or 1; or 4 bytes at roff 0
|
|
vASSERT(fsr == 0 || O->getTypeSize() == 2);
|
|
if (i == 0) {
|
|
// dst
|
|
G4_DstRegRegion *newDreg = builder.createDst(
|
|
SaveVar, 0, fsr, dreg->getHorzStride(), dreg->getType());
|
|
I->setDest(newDreg);
|
|
} else {
|
|
// src0
|
|
G4_SrcRegRegion *newSreg = builder.createSrc(
|
|
SaveVar, 0, fsr, sreg->getRegion(), sreg->getType());
|
|
if (O->asSrcRegRegion() &&
|
|
O->asSrcRegRegion()->getModifier() != Mod_src_undef) {
|
|
newSreg->setModifier(O->asSrcRegRegion()->getModifier());
|
|
}
|
|
I->setSrc(newSreg, 0);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
applyWAToInst(BB, currII, WAFlagVar);
|
|
|
|
// 4) Restore the original flag before the next inst
|
|
(void)createSIMD1Mov(BB, nextII, tVar, 0, SaveVar, 0, Type_UD);
|
|
|
|
// set ix for the next wa inst
|
|
++ix;
|
|
}
|
|
}
|
|
}
|
|
kernel.deleteEUFusionNoMaskWAInfo();
|
|
}
|
|
|
|
// Summary:
|
|
// vISA assumes the call's target would be uniform within a thread. This is
|
|
// consistent with hardware call instructions. Under EU fusion, a pair of
|
|
// fused thread 0 and 1 might diverge, meaning that an indirect call invokes A
|
|
// in thread 0 and invokes B in thread 1, which isn't supported by fused EU
|
|
// hardware.
|
|
//
|
|
// This function is used to make sure each fused call will have a single target.
|
|
// As there are HW bugs in fused calls, this function will WA HW bugs as well.
|
|
// The general idea is:
|
|
// Given:
|
|
// (p) call r5
|
|
// Changed it to:
|
|
// if (BigEU)
|
|
// (p) call r5
|
|
// else // SmallEU
|
|
// (p) call r5
|
|
//
|
|
// HW has a bug in which call always runs (even with no active channels) and
// always uses BigEU's target as the target for both EUs. This causes several
// issues, and the software WA is used to fix this hardware bug. There are
// several cases:
|
|
// 1. For platforms that have NO HW fix (fusedCallWA 1), apply the software WA
//    as described below in "Details of 1".
|
|
//
|
|
// 2. For platforms that have the PARTIAL HW fix (fusedCallWA 2)
|
|
// Any predicated call must be changed to unpredicated like the following:
|
|
// (p) call ...
|
|
// changed to
|
|
// if (p)
|
|
// call ...
|
|
//
|
|
// This is done in Flowgraph::convertPredCall(), right after control-flow
|
|
// is constructed.
|
|
//
|
|
// 2.1 for indirect call like the following
|
|
// (p) call r5
|
|
//
|
|
// if (p)
|
|
// if (BigEU) // BigEU
|
|
// call r5
|
|
// else // SmallEU
|
|
// call r5
|
|
// 3. For platforms that have a full fix (if any) (fusedCallWA 0),
|
|
// just do the following for indirect call.
|
|
// (p) call r5
|
|
// if (BigEU) // BigEU
|
|
// (p) call r5
|
|
// else // SmallEU
|
|
// (p) call r5
|
|
//
|
|
// This function handles 1) and duplicating call for BigEU and SmallEU.
|
|
//
|
|
// Details of 1
|
|
// ============
|
|
// Under EU fusion, assume that an indirect call invokes A in thread 0 and
|
|
// invokes B in thread 1. Assume that these two threads are fused and run on a
|
|
// pair of fused EUs {bigEU, smallEU}. The hardware will always invoke A: the
|
|
// callee from thread 0 in bigEU even in else branch (in general case), which
|
|
// is incorrect. To workaround this bug, we have to rely on the fact that cr0.2
|
|
// is shared among the pair of fused EUs and copy thread 1's callee B into
|
|
// thread 0 via cr0.2. In doing so, thread 1's callee can be invoked. The
|
|
// details are as follows:
|
|
//
|
|
// clang-format off
|
|
// before:
|
|
// -------
|
|
// BB:
|
|
// pseudo_fcall (16) V44(0,0)<0;1,0>:ud
|
|
// nextBB:
|
|
//
|
|
// Let Target = V44
|
|
//
|
|
// after WA // Var Names
|
|
// --------
|
|
// BB:
|
|
// (W) mov (1 |M0) tmp<1>:ud sr0.0<0;1,0>:ud // I0
|
|
// (W) and (16|M0) (eq)F null<1>:uw tmp<0;1,0>:uw 0x80:uw // I1
|
|
// (W&~F) mov (1 |M0) cr0.2<1>:ud Target<0;1,0>:ud // I2
|
|
// (W) mov (1 |M0) smallEUTarget:ud cr0.2<0;1,0>:ud // I3
|
|
// (W) add (1 |M0) I4_IP:d -ip:d smallEUTarget:d // I4_ip_start
|
|
// (W) add (1 |M0) I4Target:d I4_IP:d 0x33333333:d // I4_patch_add
|
|
// (W) add (1 |M0) I5_IP:d -ip:d Target:d // I5_ip_start
|
|
// (W) add (1 |M0) I5Target:d I5_IP:d 0x33333333:d // I5_patch_add
|
|
// (~F) goto smallB0
|
|
// // [gotoSmallB0]
|
|
// bigB0:
|
|
// pseudo_fcall (16) I5Target:ud // callI
|
|
// (orig call)
|
|
// bigB1:
|
|
// goto nextBB // gotoEnd
|
|
// smallB0:
|
|
// join nextBB // joinSmall
|
|
// pseudo_fcall (16) I4Target<0;1,0>:ud // nCallI
|
|
// smallB1:
|
|
//
|
|
// nextBB:
|
|
// join <nextJoin or null> // finalJoin
|
|
// clang-format on
|
|
//
|
|
// The BBs and those insts such as I4_patch_add/I5_patch_add, etc are added into
|
|
// m_indirectCallWAInfo so that finishFusedCallWA() can finish post-processing
|
|
// to patch the relative IP and others. If calla can be used, no IP patching is
|
|
// needed. See code for details.
|
|
//
|
|
// In order to make the following always run even though bigEU is off,
|
|
// "(W) mov (1 |M0) smallEUTarget:ud cr0.2<0;1,0>:ud"
|
|
// a special maskOff (M16) must be used to force NoMask to run no matter if the
|
|
// EU is off or on. This will be handled in finishFusedCallWA(). (See details in
|
|
// finishFusedCallWA(). To make it work, any kernel with indirect call is
|
|
// required to be simd16 or simd8, not simd32, so that M16 can be used to force
|
|
// running the inst always.)
|
|
//
|
|
void Optimizer::applyFusedCallWA() {
|
|
auto updateSubroutineTableIfNeeded = [&](G4_BB *aLeadBB, G4_BB *aB0,
|
|
G4_BB *aB1, G4_BB *aS0, G4_BB *aS1,
|
|
G4_BB *aEndB_or_null) {
|
|
if (int numFuncs = (int)fg.sortedFuncTable.size()) {
|
|
for (int i = 0; i < numFuncs; ++i) {
|
|
FuncInfo *pFInfo = fg.sortedFuncTable[i];
|
|
vASSERT(pFInfo);
|
|
auto &tBBs = pFInfo->getBBList();
|
|
auto tBI = std::find(tBBs.begin(), tBBs.end(), aLeadBB);
|
|
if (tBI != tBBs.end()) {
|
|
// This is FuncInfo for the current func (including kernel entry func)
|
|
// Make sure new BBs are in the FuncInfo's BBList.
|
|
std::list<G4_BB *> toBeInserted;
|
|
toBeInserted.push_back(aB0);
|
|
toBeInserted.push_back(aB1);
|
|
toBeInserted.push_back(aS0);
|
|
toBeInserted.push_back(aS1);
|
|
if (aEndB_or_null) {
|
|
toBeInserted.push_back(aEndB_or_null);
|
|
}
|
|
tBBs.insert(tBI, toBeInserted.begin(), toBeInserted.end());
|
|
|
|
// inc call count as a call is duplicated
|
|
pFInfo->incrementCallCount();
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
unsigned int fusedEUCallWA = builder.getuint32Option(vISA_fusedCallWA);
|
|
// Only process call wa (fusedCallWA = 1) or indirect call is non-uniform
|
|
if (!((fusedEUCallWA == 1) ||
|
|
!builder.getOption(vISA_fusedCallUniform))) {
|
|
return;
|
|
}
|
|
|
|
for (BB_LIST_ITER BI = fg.begin(), BE = fg.end(); BI != BE;) {
|
|
BB_LIST_ITER currBI = BI;
|
|
++BI;
|
|
|
|
G4_BB *BB = (*currBI);
|
|
if (!BB->isEndWithFCall()) {
|
|
continue;
|
|
}
|
|
G4_InstCF *callI = BB->back()->asCFInst();
|
|
if (!callI->isIndirectCall()) {
|
|
// direct call, no wa needed
|
|
continue;
|
|
}
|
|
|
|
if (fusedEUCallWA == 2) {
|
|
auto callInfo = builder.getFcallInfo(callI);
|
|
vISA_ASSERT(callInfo, "call info absent for ifcall");
|
|
if (callInfo->isUniform())
|
|
continue;
|
|
}
|
|
|
|
// Assume fcall always have a single/fall-thru succ
|
|
if (BI == BE || BB->Succs.size() != 1 || BB->Succs.back() != (*BI)) {
|
|
// Skip! (Could this happen ?)
|
|
continue;
|
|
}
|
|
|
|
BB_LIST_ITER nextBI = BI;
|
|
G4_BB *origNextBB = (*nextBI);
|
|
G4_BB *nextBB = origNextBB;
|
|
G4_BB *newNextBB = nullptr;
|
|
if (G4_INST *leadInst = nextBB->getFirstInst()) {
|
|
if (leadInst->opcode() == G4_while || leadInst->opcode() == G4_endif) {
|
|
// Cannot insert join, otherwise, label for while/endif would be wrong
|
|
// Here, create a new empty BB so that we can add join into it.
|
|
newNextBB = fg.createNewBBWithLabel("CallWA_EndBB");
|
|
nextBI = fg.insert(nextBI, newNextBB);
|
|
|
|
// Adjust control-flow
|
|
fg.removePredSuccEdges(BB, nextBB);
|
|
|
|
fg.addPredSuccEdges(BB, newNextBB, true);
|
|
fg.addPredSuccEdges(newNextBB, nextBB, false);
|
|
nextBB = newNextBB;
|
|
|
|
newNextBB->setDivergent(BB->isDivergent());
|
|
if (builder.hasFusedEUNoMaskWA()) {
|
|
newNextBB->setBBType(G4_BB_NM_WA_TYPE);
|
|
}
|
|
}
|
|
}
|
|
G4_ExecSize simdsz = fg.getKernel()->getSimdSize();
|
|
G4_SrcRegRegion *Target = callI->getSrc(0)->asSrcRegRegion();
|
|
|
|
// Create BBs, two for each then (BigEU) and else (SmallEU) branches.
|
|
G4_BB *bigB0 = fg.createNewBBWithLabel("CallWA_BigB0");
|
|
G4_BB *bigB1 = fg.createNewBBWithLabel("CallWA_BigB1");
|
|
G4_BB *smallB0 = fg.createNewBBWithLabel("CallWA_SmallB0");
|
|
G4_BB *smallB1 = fg.createNewBBWithLabel("CallWA_SmallB1");
|
|
// Note that nextBI points to the nextBB!
|
|
fg.insert(nextBI, bigB0);
|
|
fg.insert(nextBI, bigB1);
|
|
fg.insert(nextBI, smallB0);
|
|
fg.insert(nextBI, smallB1); // this is an empty BB. Might be needed for
|
|
// stack restore, etc.
|
|
|
|
G4_Label *endLabel = nextBB->front()->getLabel();
|
|
G4_INST *joinSmallB0 = builder.createCFInst(
|
|
nullptr, G4_join, simdsz, endLabel, nullptr, InstOpt_NoOpt, false);
|
|
smallB0->push_back(joinSmallB0);
|
|
// Let SWSB skip this join when building SIMD CF.
|
|
joinSmallB0->asCFInst()->setSWSBSkip(true);
|
|
|
|
G4_Label *smallB0Label = smallB0->front()->getLabel();
|
|
G4_INST *gotoEnd = builder.createCFInst(
|
|
nullptr, G4_goto, simdsz, smallB0Label, endLabel, InstOpt_NoOpt, false);
|
|
bigB1->push_back(gotoEnd);
|
|
|
|
// Need to insert a join in nextBB
|
|
// This join will never jump, thus set its JIP to nullptr.
|
|
G4_INST *tjoin = nextBB->getFirstInst();
|
|
if (tjoin == nullptr || tjoin->opcode() != G4_join) {
|
|
G4_INST *finalJoin = builder.createCFInst(
|
|
nullptr, G4_join, simdsz, nullptr, nullptr, InstOpt_NoOpt, false);
|
|
if (tjoin == nullptr) {
|
|
nextBB->insertBefore(nextBB->end(), finalJoin);
|
|
} else {
|
|
auto iter = std::find(nextBB->begin(), nextBB->end(), tjoin);
|
|
nextBB->insertBefore(iter, finalJoin);
|
|
}
|
|
}
|
|
|
|
fg.removePredSuccEdges(BB, nextBB);
|
|
|
|
fg.addPredSuccEdges(BB, bigB0, true);
|
|
fg.addPredSuccEdges(BB, smallB0, false);
|
|
fg.addPredSuccEdges(bigB0, bigB1);
|
|
fg.addPredSuccEdges(bigB1, nextBB);
|
|
fg.addPredSuccEdges(smallB0, smallB1);
|
|
fg.addPredSuccEdges(smallB1, nextBB, true);
|
|
|
|
// To make RA know that the real inst can flow from bigB1 to smallB0
|
|
// an edge is added from bigB1 to smallB0
|
|
fg.addPredSuccEdges(bigB1, smallB0);
|
|
|
|
// divergence property update
|
|
// new BBs's divergence is the same as BB's
|
|
bool isDivergent = BB->isDivergent();
|
|
bigB0->setDivergent(isDivergent);
|
|
bigB1->setDivergent(isDivergent);
|
|
smallB0->setDivergent(isDivergent);
|
|
smallB1->setDivergent(isDivergent);
|
|
|
|
// I0: mov tmp sr0.0
|
|
G4_VarBase *V_sr0 = builder.phyregpool.getSr0Reg();
|
|
G4_SrcRegRegion *I0_Src0 =
|
|
builder.createSrc(V_sr0, 0, 0, builder.getRegionScalar(), Type_UD);
|
|
G4_Declare *tmp = builder.createTempVar(1, Type_UD, Any, "tmpSr0");
|
|
G4_DstRegRegion *I0_Dst =
|
|
builder.createDst(tmp->getRegVar(), 0, 0, 1, Type_UD);
|
|
G4_INST *I0 = builder.createInternalInst(
|
|
nullptr, G4_mov, nullptr, g4::NOSAT, g4::SIMD1, I0_Dst, I0_Src0,
|
|
nullptr, InstOpt_WriteEnable);
|
|
|
|
// I1: and (e)F tmp 0x80
|
|
G4_Declare *F =
|
|
builder.createTempFlag(simdsz > g4::SIMD16 ? 2 : 1, "euid2");
|
|
G4_CondMod *F_cm = builder.createCondMod(Mod_e, F->getRegVar(), 0);
|
|
G4_SrcRegRegion *I1_Src0 = builder.createSrc(
|
|
tmp->getRegVar(), 0, 0, builder.getRegionScalar(), Type_UW);
|
|
G4_Imm *Bit7 = builder.createImm(0x80, Type_UW);
|
|
G4_INST *I1 = builder.createInternalInst(
|
|
nullptr, G4_and, F_cm, g4::NOSAT,
|
|
simdsz > g4::SIMD16 ? g4::SIMD32 : g4::SIMD16,
|
|
builder.createNullDst(Type_UW), I1_Src0, Bit7, InstOpt_WriteEnable);
|
|
|
|
if (builder.getuint32Option(vISA_fusedCallWA) != 1) {
|
|
vASSERT(!builder.getOption(vISA_fusedCallUniform));
|
|
// Just need to duplicate the call so that one is called under BigEU,
|
|
// and the other is under SmallEU.
|
|
|
|
BB->pop_back(); // unlink the call inst from BB
|
|
BB->push_back(I0);
|
|
BB->push_back(I1);
|
|
|
|
I0->addDefUse(I1, Opnd_src0);
|
|
|
|
G4_Predicate *pred_m1 =
|
|
builder.createPredicate(PredState_Minus, F->getRegVar(), 0);
|
|
G4_INST *gotoSmallB0 =
|
|
builder.createCFInst(pred_m1, G4_goto, simdsz, smallB0Label,
|
|
smallB0Label, InstOpt_NoOpt, false);
|
|
BB->push_back(gotoSmallB0);
|
|
I1->addDefUse(gotoSmallB0, Opnd_pred);
|
|
|
|
G4_Predicate *nPred(callI->getPredicate());
|
|
G4_SrcRegRegion *nSrc = builder.createSrc(
|
|
Target->getBase(), 0, 0, builder.getRegionScalar(), Type_UD);
|
|
G4_INST *nCallI = builder.createInternalInst(
|
|
nPred, callI->opcode(), nullptr, g4::NOSAT, callI->getExecSize(),
|
|
nullptr, nSrc, nullptr, callI->getOption());
|
|
(void)bigB0->push_back(callI);
|
|
(void)smallB0->push_back(nCallI);
|
|
|
|
// Need to create fcall info
|
|
auto orig_fcallinfo = builder.getFcallInfo(callI);
|
|
if (orig_fcallinfo) {
|
|
builder.addFcallInfo(nCallI, orig_fcallinfo->getArgSize(),
|
|
orig_fcallinfo->getRetSize(),
|
|
orig_fcallinfo->isUniform());
|
|
}
|
|
// Might need to update subroutine table
|
|
updateSubroutineTableIfNeeded(origNextBB, bigB0, bigB1, smallB0, smallB1,
|
|
newNextBB);
|
|
|
|
if (!fg.globalOpndHT.isOpndGlobal(Target)) {
|
|
callI->removeDefUse(Opnd_src0);
|
|
}
|
|
fg.globalOpndHT.addGlobalOpnd(Target);
|
|
fg.globalOpndHT.addGlobalOpnd(nSrc);
|
|
|
|
// done with this indirect call.
|
|
continue;
|
|
}
|
|
|
|
//
|
|
// main call WA under fusedCallWA = 1
|
|
//
|
|
|
|
// I2: (!flag) mov cr0.2 callee
|
|
G4_VarBase *V_cr0 = builder.phyregpool.getCr0Reg();
|
|
G4_DstRegRegion *I2_Dst = builder.createDst(V_cr0, 0, 2, 1, Type_UD);
|
|
G4_SrcRegRegion *I2_Src0 = builder.createSrc(
|
|
Target->getBase(), 0, 0, builder.getRegionScalar(), Type_UD);
|
|
G4_Predicate *pred_m =
|
|
builder.createPredicate(PredState_Minus, F->getRegVar(), 0);
|
|
G4_INST *I2 = builder.createMov(g4::SIMD1, I2_Dst, I2_Src0,
|
|
InstOpt_WriteEnable, false);
|
|
I2->setPredicate(pred_m);
|
|
|
|
// I3: mov smallEUTarget cr0.2
|
|
// Note that both operands of call need to be GRF aligned due to bug.
|
|
// With calla, we need to create grf-aligned sTargetDecl. With call, the
|
|
// relative ip temp, created later as I5Target, will be grf-aligned,
|
|
// thus, sTargetDecl here does not need to be grf-aligned.
|
|
G4_SubReg_Align calleeAlign =
|
|
builder.supportCallaRegSrc() ? builder.getGRFAlign() : Any;
|
|
G4_Declare *sTargetDecl =
|
|
builder.createTempVar(1, Type_UD, calleeAlign, "smallEUTarget");
|
|
G4_DstRegRegion *I3_Dst =
|
|
builder.createDst(sTargetDecl->getRegVar(), 0, 0, 1, Type_UD);
|
|
G4_SrcRegRegion *I3_Src0 =
|
|
builder.createSrc(V_cr0, 0, 2, builder.getRegionScalar(), Type_UD);
|
|
G4_INST *I3 = builder.createMov(g4::SIMD1, I3_Dst, I3_Src0,
|
|
InstOpt_WriteEnable, false);
|
|
|
|
// Insert WA instructions
|
|
BB->pop_back(); // unlink the call inst from BB
|
|
BB->push_back(I0);
|
|
BB->push_back(I1);
|
|
BB->push_back(I2);
|
|
BB->push_back(I3);
|
|
|
|
// update local dataflow
|
|
I0->addDefUse(I1, Opnd_src0);
|
|
I1->addDefUse(I2, Opnd_pred);
|
|
|
|
G4_INST *nCallI;
|
|
if (builder.supportCallaRegSrc()) {
|
|
(void)bigB0->push_back(callI);
|
|
|
|
G4_Predicate *nPred(callI->getPredicate());
|
|
G4_SrcRegRegion *nSrc = builder.createSrc(
|
|
sTargetDecl->getRegVar(), 0, 0, builder.getRegionScalar(), Type_UD);
|
|
nCallI = builder.createInternalInst(
|
|
nPred, callI->opcode(), nullptr, g4::NOSAT, callI->getExecSize(),
|
|
nullptr, nSrc, nullptr, callI->getOption());
|
|
smallB0->push_back(nCallI);
|
|
|
|
if (!fg.globalOpndHT.isOpndGlobal(Target)) {
|
|
callI->removeDefUse(Opnd_src0);
|
|
}
|
|
fg.globalOpndHT.addGlobalOpnd(Target);
|
|
fg.globalOpndHT.addGlobalOpnd(nSrc);
|
|
|
|
kernel.m_maskOffWAInsts.insert(std::make_pair(I3, BB));
|
|
kernel.m_indirectCallWAInfo.emplace(
|
|
BB, IndirectCallWAInfo(bigB0, smallB0, nullptr, nullptr, nullptr,
|
|
nullptr, nullptr, callI, nCallI));
|
|
// BB, bigB0, smallB0 should not be deleted and its instructions shall
|
|
// stay inside. Set BB type to G4_BB_KEEP_TYPE so the other optim passes
|
|
// will not delete them.
|
|
BB->setBBType(G4_BB_KEEP_TYPE);
|
|
bigB0->setBBType(G4_BB_KEEP_TYPE);
|
|
smallB0->setBBType(G4_BB_KEEP_TYPE);
|
|
} else {
|
|
// relative target: need to patch offset after SWSB in
|
|
// finishFusedCallWA()
|
|
|
|
//
|
|
// I4_ip_start: add rSmallIP (-ip) smallTarget
|
|
// I4_patch_add: add I4Target rSmallIP -0x33333333
|
|
// I5_ip_start: add rBigIP (-ip) + bigTarget
|
|
// I5_patch_add: add I5Target rBigIP -0x33333333
|
|
// where 0x33333333 should be the IP difference between I4_ip_start
|
|
// and nCallI (to I4Target), I5_ip_start and callI (I5Target),
|
|
// respectively. and it is patched later.
|
|
// If IP WA is needed, will add the following:
|
|
// ip_wa_mov: mov tIP 0x89ABCDEF : placeholder.
|
|
// I4_ip_start: add rSmallIP -tIP smallTarget
|
|
// I4_patch_add: add I4Target rSmallIP -0x33333333 : patch needed
|
|
// I5_ip_start: add rBigIP -tIP bigTarget
|
|
// I5_patch_add: add I5Target rBigIP -0x33333333 : patch needed
|
|
// where ip_wa_mov will be removed in finishFusedCallWA() with ip wa
|
|
// using in-place call.
|
|
//
|
|
G4_VarBase *V_ip = nullptr;
|
|
G4_INST *ip_wa_placeholder = nullptr;
|
|
if (builder.needIPWA()) {
|
|
// Need 2 DWs (grf-aligned) as using IP WA needs 2 DWs (return IP and
|
|
// call mask)
|
|
G4_Declare *tIP_dcl =
|
|
builder.createTempVar(2, Type_D, builder.getGRFAlign(), "tIP");
|
|
V_ip = (G4_VarBase *)tIP_dcl->getRegVar();
|
|
|
|
// placeholder mov makes sure tIP has a valid live range.
|
|
G4_DstRegRegion *IP_WA_Dst = builder.createDst(V_ip, 0, 0, 1, Type_D);
|
|
G4_Imm *IP_WA_Src0 = builder.createImm(0x89ABCDEF, Type_D);
|
|
ip_wa_placeholder = builder.createMov(g4::SIMD1, IP_WA_Dst, IP_WA_Src0,
|
|
InstOpt_WriteEnable, false);
|
|
BB->push_back(ip_wa_placeholder);
|
|
} else {
|
|
V_ip = (G4_VarBase *)builder.phyregpool.getIpReg();
|
|
}
|
|
|
|
// SmallEU
|
|
G4_Declare *I4_IP = builder.createTempVar(1, Type_D, Any, "rSmallIP");
|
|
G4_DstRegRegion *I4_Dst =
|
|
builder.createDst(I4_IP->getRegVar(), 0, 0, 1, Type_D);
|
|
G4_SrcRegRegion *I4_Src0 = builder.createSrcRegRegion(
|
|
Mod_Minus, Direct, V_ip, 0, 0, builder.getRegionScalar(), Type_D);
|
|
G4_SrcRegRegion *I4_Src1 = builder.createSrc(
|
|
sTargetDecl->getRegVar(), 0, 0, builder.getRegionScalar(), Type_D);
|
|
G4_INST *I4_ip_start =
|
|
builder.createBinOp(G4_add, g4::SIMD1, I4_Dst, I4_Src0, I4_Src1,
|
|
InstOpt_WriteEnable, false);
|
|
|
|
G4_Declare *I4Target = builder.createTempVar(
|
|
1, Type_D, builder.getGRFAlign(), "rSmallEUTarget");
|
|
G4_DstRegRegion *I4_pDst =
|
|
builder.createDst(I4Target->getRegVar(), 0, 0, 1, Type_D);
|
|
G4_SrcRegRegion *I4_pSrc0 = builder.createSrc(
|
|
I4_IP->getRegVar(), 0, 0, builder.getRegionScalar(), Type_D);
|
|
G4_Imm *I4_pSrc1 =
|
|
builder.createImm(0x33333333, Type_D); // to be patched later
|
|
G4_INST *I4_patch_add =
|
|
builder.createBinOp(G4_add, g4::SIMD1, I4_pDst, I4_pSrc0, I4_pSrc1,
|
|
InstOpt_WriteEnable, false);
|
|
|
|
// BigEU
|
|
G4_Declare *I5_IP = builder.createTempVar(1, Type_D, Any, "rBigIP");
|
|
G4_DstRegRegion *I5_Dst =
|
|
builder.createDst(I5_IP->getRegVar(), 0, 0, 1, Type_D);
|
|
G4_SrcRegRegion *I5_Src0 = builder.createSrcRegRegion(
|
|
Mod_Minus, Direct, V_ip, 0, 0, builder.getRegionScalar(), Type_D);
|
|
G4_SrcRegRegion *I5_Src1 = builder.createSrc(
|
|
Target->getBase(), 0, 0, builder.getRegionScalar(), Type_D);
|
|
G4_INST *I5_ip_start =
|
|
builder.createBinOp(G4_add, g4::SIMD1, I5_Dst, I5_Src0, I5_Src1,
|
|
InstOpt_WriteEnable, false);
|
|
|
|
G4_Declare *I5Target = builder.createTempVar(
|
|
1, Type_D, builder.getGRFAlign(), "rBigEUTarget");
|
|
G4_DstRegRegion *I5_pDst =
|
|
builder.createDst(I5Target->getRegVar(), 0, 0, 1, Type_D);
|
|
G4_SrcRegRegion *I5_pSrc0 = builder.createSrc(
|
|
I5_IP->getRegVar(), 0, 0, builder.getRegionScalar(), Type_D);
|
|
G4_Imm *I5_pSrc1 =
|
|
builder.createImm(0x33333333, Type_D); // to be patched later
|
|
G4_INST *I5_patch_add =
|
|
builder.createBinOp(G4_add, g4::SIMD1, I5_pDst, I5_pSrc0, I5_pSrc1,
|
|
InstOpt_WriteEnable, false);
|
|
|
|
BB->push_back(I4_ip_start);
|
|
BB->push_back(I4_patch_add);
|
|
BB->push_back(I5_ip_start);
|
|
BB->push_back(I5_patch_add);
|
|
|
|
callI->setSrc(builder.createSrc(I5Target->getRegVar(), 0, 0,
|
|
builder.getRegionScalar(), Type_UD),
|
|
0);
|
|
(void)bigB0->push_back(callI);
|
|
|
|
G4_Predicate *nPred(callI->getPredicate());
|
|
G4_SrcRegRegion *nSrc = builder.createSrc(
|
|
I4Target->getRegVar(), 0, 0, builder.getRegionScalar(), Type_UD);
|
|
nCallI = builder.createInternalInst(
|
|
nPred, callI->opcode(), nullptr, g4::NOSAT, callI->getExecSize(),
|
|
nullptr, nSrc, nullptr, callI->getOption());
|
|
smallB0->push_back(nCallI);
|
|
|
|
I3->addDefUse(I4_ip_start, Opnd_src1);
|
|
I4_ip_start->addDefUse(I4_patch_add, Opnd_src0);
|
|
I5_ip_start->addDefUse(I5_patch_add, Opnd_src0);
|
|
fg.globalOpndHT.addGlobalOpnd(I4_pDst);
|
|
fg.globalOpndHT.addGlobalOpnd(I5_pDst);
|
|
if (!fg.globalOpndHT.isOpndGlobal(Target)) {
|
|
callI->copyDef(I2, Opnd_src0, Opnd_src0);
|
|
callI->transferDef(I5_ip_start, Opnd_src0, Opnd_src1);
|
|
}
|
|
|
|
// add indirect call wa info
|
|
kernel.m_indirectCallWAInfo.emplace(
|
|
BB, IndirectCallWAInfo(bigB0, smallB0, ip_wa_placeholder, I4_ip_start,
|
|
I4_patch_add, I5_ip_start, I5_patch_add, callI,
|
|
nCallI));
|
|
|
|
kernel.m_maskOffWAInsts.insert(std::make_pair(I3, BB));
|
|
kernel.m_maskOffWAInsts.insert(std::make_pair(I4_ip_start, BB));
|
|
kernel.m_maskOffWAInsts.insert(std::make_pair(I4_patch_add, BB));
|
|
}
|
|
|
|
G4_Predicate *pred_m1 =
|
|
builder.createPredicate(PredState_Minus, F->getRegVar(), 0);
|
|
G4_INST *gotoSmallB0 =
|
|
builder.createCFInst(pred_m1, G4_goto, simdsz, smallB0Label,
|
|
smallB0Label, InstOpt_NoOpt, false);
|
|
BB->push_back(gotoSmallB0);
|
|
I1->addDefUse(gotoSmallB0, Opnd_pred);
|
|
|
|
// Need to create fcall info
|
|
auto orig_fcallinfo = builder.getFcallInfo(callI);
|
|
if (orig_fcallinfo) {
|
|
builder.addFcallInfo(nCallI, orig_fcallinfo->getArgSize(),
|
|
orig_fcallinfo->getRetSize(),
|
|
orig_fcallinfo->isUniform());
|
|
}
|
|
// Might need to update subroutine table
|
|
updateSubroutineTableIfNeeded(origNextBB, bigB0, bigB1, smallB0, smallB1,
|
|
newNextBB);
|
|
|
|
// nomask wa property
|
|
// if BB is marked with NM_WA_TYPE, set all new BBs with NM_WA_TYPE
|
|
// if BB is not marked with NM_WA_TYPE and is divergent, mark the
|
|
// smallB0/B1
|
|
// as NM_WA_TYPE
|
|
if (builder.hasFusedEUNoMaskWA()) {
|
|
if ((BB->getBBType() & G4_BB_NM_WA_TYPE) != 0) {
|
|
bigB0->setBBType(G4_BB_NM_WA_TYPE);
|
|
bigB1->setBBType(G4_BB_NM_WA_TYPE);
|
|
smallB0->setBBType(G4_BB_NM_WA_TYPE);
|
|
smallB1->setBBType(G4_BB_NM_WA_TYPE);
|
|
} else if (isDivergent) {
|
|
smallB0->setBBType(G4_BB_NM_WA_TYPE);
|
|
smallB1->setBBType(G4_BB_NM_WA_TYPE);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Convert vISA MULH dst:d src0:d src1:d into
|
|
// mul acc0.0<1>:d src0:d src1:w
|
|
// mach dst:d src0:d src1:d
|
|
// convert vISA mul dst:d src0:d src1:d into
|
|
// mul acc0.0<1>:d src0:d src1:w
|
|
// macl dst:d src0:d src1:d
|
|
void Optimizer::expandMulPostSchedule() {
|
|
if (!VISA_WA_CHECK(builder.getPWaTable(), Wa_14013677893)) {
|
|
return;
|
|
}
|
|
|
|
for (auto bb : kernel.fg) {
|
|
for (INST_LIST_ITER it = bb->begin(); it != bb->end(); it++) {
|
|
G4_INST *inst = *it;
|
|
if (inst->opcode() != G4_mul && inst->opcode() != G4_mulh) {
|
|
continue;
|
|
}
|
|
|
|
G4_Operand *src0 = inst->getSrc(0);
|
|
G4_Operand *src1 = inst->getSrc(1);
|
|
G4_DstRegRegion *dst = inst->getDst();
|
|
|
|
if (dst->isAccReg()) {
|
|
continue;
|
|
}
|
|
|
|
if (!IS_DTYPE(src0->getType()) || !IS_DTYPE(src1->getType()) ||
|
|
!IS_DTYPE(dst->getType())) {
|
|
continue;
|
|
}
|
|
|
|
vISA_ASSERT(inst->getSaturate() == g4::NOSAT,
|
|
"NOSAT is expected in mul/mulh expanding");
|
|
vISA_ASSERT(inst->getCondMod() == nullptr,
|
|
"DW multiply does not support conditional modifiers");
|
|
vISA_ASSERT(!src0->isSrcRegRegion() ||
|
|
src0->asSrcRegRegion()->getModifier() == Mod_src_undef,
|
|
"no src0 modifier is expected in mul/mulh expanding");
|
|
vISA_ASSERT(!src1->isSrcRegRegion() ||
|
|
src1->asSrcRegRegion()->getModifier() == Mod_src_undef,
|
|
"no src1 modifier is expected in mul/mulh expanding");
|
|
|
|
uint32_t origOptions = inst->getOption();
|
|
G4_Predicate *origPredicate = inst->getPredicate();
|
|
auto execSize = inst->getExecSize();
|
|
auto tmpType =
|
|
(IS_UNSIGNED_INT(src0->getType()) && IS_UNSIGNED_INT(src1->getType()))
|
|
? Type_UD
|
|
: Type_D;
|
|
|
|
// 1, create a new mul inst
|
|
G4_DstRegRegion *accDstOpnd =
|
|
builder.createDst(builder.phyregpool.getAcc0Reg(), 0, 0, 1, tmpType);
|
|
auto newMul = builder.createBinOp(
|
|
G4_mul, execSize, accDstOpnd, builder.duplicateOperand(src0),
|
|
builder.duplicateOperand(src1), origOptions, false);
|
|
bb->insertBefore(it, newMul);
|
|
inst->copyDefsTo(newMul, false);
|
|
// change the src1 of MUL from :d to :w
|
|
HWConformity hwConf(builder, kernel);
|
|
hwConf.fixMulSrc1(std::prev(it), bb);
|
|
|
|
// 2, create a mach/macl inst
|
|
G4_INST *maclOrMachInst = nullptr;
|
|
if (inst->opcode() == G4_mul) {
|
|
// create a macl inst
|
|
maclOrMachInst = builder.createMacl(
|
|
execSize, dst, builder.duplicateOperand(src0),
|
|
builder.duplicateOperand(src1), origOptions, tmpType);
|
|
} else if (inst->opcode() == G4_mulh) {
|
|
// create a mach inst
|
|
maclOrMachInst = builder.createMach(
|
|
execSize, dst, builder.duplicateOperand(src0),
|
|
builder.duplicateOperand(src1), origOptions, tmpType);
|
|
}
|
|
maclOrMachInst->setPredicate(origPredicate);
|
|
*it = maclOrMachInst;
|
|
inst->removeAllDefs();
|
|
newMul->addDefUse(maclOrMachInst, Opnd_implAccSrc);
|
|
|
|
// 3, always add a dummy mov after mach/macl for HW read suppresion W/A
|
|
auto dummyMovSrc = builder.createSrc(dst->getBase(), dst->getRegOff(), 0,
|
|
builder.getRegionScalar(), Type_D);
|
|
G4_INST *dummyMov =
|
|
builder.createMov(g4::SIMD1, builder.createNullDst(Type_D),
|
|
dummyMovSrc, InstOpt_WriteEnable, false);
|
|
bb->insertAfter(it, dummyMov);
|
|
}
|
|
}
|
|
}
|
|
|
|
// SOA layout of dst:(dst_hi32:d, dst_lo32:d)
|
|
// if src2 is not immediate value of zero, then expand MADW((dst_hi32, dst_lo32)
|
|
// = src0 * src1 + src2) to:
|
|
// mul (16) acc0.0<1>:d src0<1;1,0>:d src1<2;1,0>:uw
|
|
// mach (16) dst_hi32<1>:d src0<1;1,0>:d src1<1;1,0>:d
|
|
// addc (16) dst_lo32<1>:d acc0.0<1;1,0>:d src2<1;1,0>:d // Low 32
|
|
// bits add (16) dst_hi32<1>:d acc0.0<1;1,0>:d dst_hi32<1;1,0>:d // High
|
|
// 32 bits
|
|
// otherwise, expand to:
|
|
// mul (16) acc0.0<1>:d src0<1;1,0>:d src1<2;1,0>:uw
|
|
// mach (16) dst_hi32<1>:d src0<1;1,0>:d src1<1;1,0>:d // High 32 bits
|
|
// mov (16) dst_lo32<1>:d acc0.0<1;1,0>:d // Low 32 bits
|
|
void Optimizer::expandMadwPostSchedule() {
|
|
if (!VISA_WA_CHECK(builder.getPWaTable(), Wa_14013677893)) {
|
|
return;
|
|
}
|
|
|
|
for (auto bb : kernel.fg) {
|
|
for (INST_LIST_ITER it = bb->begin(); it != bb->end(); it++) {
|
|
G4_INST *inst = *it;
|
|
if (inst->opcode() != G4_madw) {
|
|
continue;
|
|
}
|
|
|
|
// Unset a AccWrCtrl first.
|
|
inst->setOptionOff(InstOpt_AccWrCtrl);
|
|
|
|
G4_Operand *src0 = inst->getSrc(0);
|
|
G4_Operand *src1 = inst->getSrc(1);
|
|
G4_Operand *src2 = inst->getSrc(2);
|
|
G4_DstRegRegion *dst = inst->getDst();
|
|
|
|
vISA_ASSERT(inst->getSaturate() == g4::NOSAT,
|
|
"NOSAT is expected in mul/mulh/madw expanding");
|
|
vISA_ASSERT(inst->getCondMod() == nullptr,
|
|
"DW multiply does not support conditional modifiers");
|
|
vISA_ASSERT(!src0->isSrcRegRegion() ||
|
|
src0->asSrcRegRegion()->getModifier() == Mod_src_undef,
|
|
"no src0 modifier is expected in mul/mulh/madw expanding");
|
|
vISA_ASSERT(!src1->isSrcRegRegion() ||
|
|
src1->asSrcRegRegion()->getModifier() == Mod_src_undef,
|
|
"no src1 modifier is expected in mul/mulh/madw expanding");
|
|
vISA_ASSERT(IS_DTYPE(src0->getType()) && IS_DTYPE(src1->getType()) &&
|
|
IS_DTYPE(src2->getType()),
|
|
"only DW-type sources are supported");
|
|
|
|
uint32_t origOptions = inst->getOption();
|
|
G4_Predicate *origPredicate = inst->getPredicate();
|
|
auto execSize = inst->getExecSize();
|
|
G4_Type tmpType =
|
|
(IS_UNSIGNED_INT(src0->getType()) &&
|
|
IS_UNSIGNED_INT(src1->getType()) && IS_UNSIGNED_INT(src2->getType()))
|
|
? Type_UD
|
|
: Type_D;
|
|
|
|
// 1, create a new mul inst
|
|
G4_DstRegRegion *accDstOpnd =
|
|
builder.createDst(builder.phyregpool.getAcc0Reg(), 0, 0, 1, tmpType);
|
|
auto newMul = builder.createBinOp(
|
|
G4_mul, execSize, accDstOpnd, builder.duplicateOperand(src0),
|
|
builder.duplicateOperand(src1), origOptions, false);
|
|
auto startIter = bb->insertBefore(it, newMul);
|
|
inst->copyDefsTo(newMul, false);
|
|
// change the src1 of MUL from :d to :w
|
|
HWConformity hwConf(builder, kernel);
|
|
hwConf.fixMulSrc1(startIter, bb);
|
|
|
|
// 2, create a mach/macl inst
|
|
int DstHiRegOffset = (int)std::ceil(
|
|
(float)(execSize * TypeSize(tmpType)) / kernel.getGRFSize());
|
|
G4_DstRegRegion *dstHi32 =
|
|
builder.createDst(dst->getBase(), dst->getRegOff() + DstHiRegOffset,
|
|
dst->getSubRegOff(), 1, tmpType);
|
|
G4_INST *machInst = builder.createMach(
|
|
execSize, dstHi32, builder.duplicateOperand(src0),
|
|
builder.duplicateOperand(src1), origOptions, tmpType);
|
|
|
|
machInst->setPredicate(origPredicate);
|
|
*it = machInst;
|
|
inst->removeAllDefs();
|
|
newMul->addDefUse(machInst, Opnd_implAccSrc);
|
|
|
|
auto endIter = it;
|
|
// always add a dummy mov after mach/macl for HW read suppresion W/A
|
|
auto dummyMovSrc =
|
|
builder.createSrc(dst->getBase(), dst->getRegOff() + DstHiRegOffset,
|
|
0, builder.getRegionScalar(), Type_D);
|
|
G4_INST *dummyMov =
|
|
builder.createMov(g4::SIMD1, builder.createNullDst(Type_D),
|
|
dummyMovSrc, InstOpt_WriteEnable, false);
|
|
endIter = bb->insertAfter(endIter, dummyMov);
|
|
|
|
// optimize: only do multiply if src2 is imme 0
|
|
if (src2->isImm() && src2->asImm()->getImm() == 0) {
|
|
// 3, create a mov inst
|
|
auto dstLo32 = builder.createDst(dst->getBase(), dst->getRegOff(),
|
|
dst->getSubRegOff(), 1, tmpType);
|
|
auto accSrcOpndMov = builder.createSrc(
|
|
builder.phyregpool.getAcc0Reg(), 0, 0,
|
|
execSize == g4::SIMD1 ? builder.getRegionScalar()
|
|
: builder.getRegionStride1(),
|
|
tmpType);
|
|
auto movInst = builder.createMov(execSize, dstLo32, accSrcOpndMov,
|
|
origOptions, false);
|
|
movInst->setPredicate(origPredicate);
|
|
endIter = bb->insertAfter(endIter, movInst);
|
|
} else {
|
|
// 3, create a addc inst
|
|
auto dstLo32 = builder.createDst(dst->getBase(), dst->getRegOff(),
|
|
dst->getSubRegOff(), 1, tmpType);
|
|
auto accSrcOpnd = builder.createSrc(
|
|
builder.phyregpool.getAcc0Reg(), 0, 0,
|
|
execSize == g4::SIMD1 ? builder.getRegionScalar()
|
|
: builder.getRegionStride1(),
|
|
tmpType);
|
|
auto addcInst = builder.createBinOp(
|
|
G4_addc, execSize, dstLo32, accSrcOpnd,
|
|
builder.duplicateOperand(src2), origOptions, false);
|
|
addcInst->setPredicate(origPredicate);
|
|
endIter = bb->insertAfter(endIter, addcInst);
|
|
|
|
// 4, create a add inst
|
|
auto src1Add = builder.createSrc(
|
|
dstHi32->getBase(), dstHi32->getRegOff(), dstHi32->getSubRegOff(),
|
|
execSize == g4::SIMD1 ? builder.getRegionScalar()
|
|
: builder.getRegionStride1(),
|
|
tmpType);
|
|
auto addInst = builder.createBinOp(
|
|
G4_add, execSize, builder.duplicateOperand(dstHi32),
|
|
builder.duplicateOperand(accSrcOpnd), src1Add, origOptions, false);
|
|
addInst->setPredicate(origPredicate);
|
|
endIter = bb->insertAfter(endIter, addInst);
|
|
}
|
|
|
|
// split inst if execSize is larger than native execSize
|
|
if (execSize > builder.getNativeExecSize()) {
|
|
hwConf.splitDWMULInst(startIter, endIter, bb);
|
|
it = startIter;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
void Optimizer::fixReadSuppressioninFPU0() {
|
|
auto isFloatPipe = [](G4_INST *inst) -> bool {
|
|
// There seems to be 2 implementations used to determine whether an
|
|
// instruction would go to float pipe:
|
|
// G4_INST::isFloatPipeInstructionXe() and HWConformity::isFloatOr64().
|
|
// Only check the types of dst and src0 now.
|
|
if (G4_DstRegRegion *dst = inst->getDst())
|
|
return IS_TYPE_FLOAT_ALL(dst->getType());
|
|
|
|
if (const G4_Operand *src = inst->getSrc(0))
|
|
return IS_TYPE_FLOAT_ALL(src->getType());
|
|
|
|
return false;
|
|
};
|
|
auto isRawMov = [](G4_INST *inst) -> bool {
|
|
if (!inst->isRawMov())
|
|
return false;
|
|
|
|
if (inst->hasACCOpnd())
|
|
return false;
|
|
|
|
G4_Type dstType = inst->getDst()->getType();
|
|
return IS_TYPE_FLOAT_ALL(dstType) && dstType != Type_DF;
|
|
};
|
|
|
|
auto isRawSel = [](G4_INST *inst) -> bool {
|
|
if (inst->opcode() != G4_sel)
|
|
return false;
|
|
|
|
if (const G4_CondMod *condMod = inst->getCondMod()) {
|
|
if (condMod->getMod() != Mod_ge && condMod->getMod() != Mod_l)
|
|
return false;
|
|
}
|
|
|
|
if (inst->getSaturate())
|
|
return false;
|
|
|
|
if (inst->getSrc(0)->isSrcRegRegion() &&
|
|
inst->getSrc(0)->asSrcRegRegion()->hasModifier())
|
|
return false;
|
|
|
|
if (inst->getSrc(1)->isSrcRegRegion() &&
|
|
inst->getSrc(1)->asSrcRegRegion()->hasModifier())
|
|
return false;
|
|
|
|
G4_Type dstType = inst->getDst()->getType();
|
|
G4_Type src0Type = inst->getSrc(0)->getType();
|
|
return ((src0Type == dstType && dstType == Type_F) ||
|
|
(src0Type == Type_HF && dstType == Type_HF));
|
|
};
|
|
|
|
auto isSPPath = [&](G4_INST *inst) -> bool {
|
|
return (isRawMov(inst) && inst->getSrc(0)->getType() == Type_HF) ||
|
|
(isRawSel(inst) && inst->getSrc(0)->getType() == Type_HF) ||
|
|
(inst->getSrc(0) && inst->getSrc(0)->getType() == Type_DF &&
|
|
inst->getDst() && inst->getDst()->getType() == Type_F);
|
|
};
|
|
|
|
G4_INST *prev = nullptr;
|
|
bool isPrevOnSPPath = false;
|
|
for (auto bb : fg) {
|
|
for (auto it = bb->begin(), ie = bb->end(); it != ie; ++it) {
|
|
G4_INST *cur = *it;
|
|
// Only check the instruction that goes to fp pipe.
|
|
if (!isFloatPipe(cur))
|
|
continue;
|
|
|
|
bool isCurOnSPPath = isSPPath(cur);
|
|
// insert a dummy csel to invalidate the read suppression buffer
|
|
// when the current instruction would switch buses while having
|
|
// same source register and data type.
|
|
if (prev && isPrevOnSPPath ^ isCurOnSPPath) {
|
|
G4_SrcRegRegion *srcToFix = nullptr;
|
|
int maxNumSrc = std::max(prev->getNumSrc(), cur->getNumSrc());
|
|
for (int i = 0; i < maxNumSrc; ++i) {
|
|
if (!prev || !prev->getSrc(i) || !prev->getSrc(i)->isSrcRegRegion())
|
|
continue;
|
|
if (!cur->getSrc(i) || !cur->getSrc(i)->isSrcRegRegion())
|
|
continue;
|
|
G4_SrcRegRegion *prevSrc = prev->getSrc(i)->asSrcRegRegion();
|
|
G4_SrcRegRegion *curSrc = cur->getSrc(i)->asSrcRegRegion();
|
|
if (*curSrc == *prevSrc) {
|
|
srcToFix = curSrc;
|
|
break;
|
|
}
|
|
}
|
|
if (srcToFix) {
|
|
const RegionDesc *region = builder.createRegionDesc(4, 4, 1);
|
|
G4_Declare *decl = builder.createHardwiredDeclare(4, Type_F, 1, 0);
|
|
G4_SrcRegRegion *src0 = fg.builder->createSrcRegRegion(decl, region);
|
|
G4_SrcRegRegion *src1 = fg.builder->createSrcRegRegion(decl, region);
|
|
G4_SrcRegRegion *src2 = fg.builder->createSrcRegRegion(decl, region);
|
|
G4_DstRegRegion *dst = fg.builder->createDstRegRegion(decl, 1);
|
|
G4_INST *cselInst = builder.createInternalInst(
|
|
nullptr, G4_csel, nullptr, g4::NOSAT, g4::SIMD4, dst, src0, src1,
|
|
src2, InstOpt_WriteEnable);
|
|
bb->insertBefore(it, cselInst);
|
|
}
|
|
}
|
|
prev = cur;
|
|
isPrevOnSPPath = isCurOnSPPath;
|
|
}
|
|
}
|
|
}
|
|
|
|
void Optimizer::prepareDPASFuseRSWA() {
|
|
vISA_ASSERT(builder.hasDPAS() && builder.hasDPASFuseRSWA(),
|
|
"Expected the function is called only when WA is specified in "
|
|
"WATable or options");
|
|
|
|
kernel.fg.resetLocalDataFlowData();
|
|
kernel.fg.localDataFlowAnalysis();
|
|
|
|
BitSet GRFwriteByALU(kernel.getNumRegTotal(), false);
|
|
builder.src1FirstGRFOfLastDpas.resize(kernel.getNumRegTotal());
|
|
builder.src1FirstGRFOfLastDpas.clear();
|
|
|
|
std::list<G4_INST *> dpasList;
|
|
|
|
for (auto BI : fg) {
|
|
G4_BB *BB = BI;
|
|
G4_INST *lastDpas = nullptr;
|
|
for (auto II = BB->begin(), IE = BB->end(); II != IE; ++II) {
|
|
G4_INST *I = *II;
|
|
|
|
if (!I->isSend()) {
|
|
G4_Operand *dst = I->getDst();
|
|
if (dst && !dst->isNullReg() && dst->isGreg()) {
|
|
unsigned int LB = 0;
|
|
unsigned int RB = 0;
|
|
|
|
LB = (unsigned int)(dst->getLinearizedStart() /
|
|
builder.numEltPerGRF<Type_UB>());
|
|
RB = (unsigned int)(dst->getLinearizedEnd() /
|
|
builder.numEltPerGRF<Type_UB>());
|
|
GRFwriteByALU.set(LB, RB);
|
|
}
|
|
}
|
|
|
|
if (I->isDpas()) {
|
|
dpasList.push_back(I);
|
|
lastDpas = I;
|
|
}
|
|
}
|
|
if (lastDpas != nullptr) {
|
|
G4_Operand *src1Opnd = lastDpas->asDpasInst()->getSrc(1);
|
|
unsigned int LB = (unsigned int)(src1Opnd->getLinearizedStart() /
|
|
builder.numEltPerGRF<Type_UB>());
|
|
builder.src1FirstGRFOfLastDpas.set(LB, true);
|
|
}
|
|
}
|
|
vISA_ASSERT(!builder.src1FirstGRFOfLastDpas.isAllset(),
|
|
"Do not expect the first GRF of src1 in last dpas inst of every "
|
|
"BB touches all GRFs");
|
|
|
|
for (auto I : dpasList) {
|
|
bool found_src1_def = false;
|
|
bool sendDefineOnly = true;
|
|
for (auto i = I->def_begin(), E = I->def_end(); i != E; ++i) {
|
|
if (i->second == Opnd_src1) {
|
|
found_src1_def = true;
|
|
auto defInst = i->first;
|
|
if (!defInst->isSend()) {
|
|
sendDefineOnly = false;
|
|
kernel.setNeedDPASWA(true);
|
|
I->asDpasInst()->setMayNeedWA(true);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (sendDefineOnly) {
|
|
G4_Operand *src1Opnd = I->asDpasInst()->getSrc(1);
|
|
unsigned int LB = (unsigned int)(src1Opnd->getLinearizedStart() /
|
|
builder.numEltPerGRF<Type_UB>());
|
|
unsigned int RB = (unsigned int)(src1Opnd->getLinearizedEnd() /
|
|
builder.numEltPerGRF<Type_UB>());
|
|
|
|
if (!GRFwriteByALU.isEmpty(LB, RB)) {
|
|
kernel.setNeedDPASWA(true);
|
|
I->asDpasInst()->setMayNeedWA(true);
|
|
}
|
|
}
|
|
|
|
if (!found_src1_def) {
|
|
kernel.setNeedDPASWA(true);
|
|
I->asDpasInst()->setMayNeedWA(true);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Expand Intrinsic::BarrierWA instruction
|
|
void Optimizer::applyBarrierWA(INST_LIST_ITER it, G4_BB *bb) {
|
|
G4_INST *inst = *it;
|
|
|
|
if (!inst->isBarrierWAIntrinsic())
|
|
return;
|
|
|
|
// The dst of Intrinsic::BarrierWA instruction has 1 DW for saving existing
|
|
// flag so WA can use it in the loop
|
|
auto dst = inst->getDst();
|
|
|
|
G4_RegVar *WAFlagVar = builder.createTempFlag(2, "WAFlagUD")->getRegVar();
|
|
WAFlagVar->setPhyReg(builder.phyregpool.getF0Reg(), 0);
|
|
|
|
// save f0.0:ud to dst.0:ud, then f0.0 can be used in the loop
|
|
// (W) mov(1) dst.0:ud f0.0:ud
|
|
G4_DstRegRegion *dstMovForSave = builder.createDst(
|
|
dst->getBase(), dst->getRegOff(), dst->getSubRegOff(), 1, Type_UD);
|
|
G4_SrcRegRegion *srcMovForSave =
|
|
builder.createSrc(WAFlagVar, 0, 0, builder.getRegionScalar(), Type_UD);
|
|
auto saveInst = builder.createMov(g4::SIMD1, dstMovForSave, srcMovForSave,
|
|
InstOpt_WriteEnable, false);
|
|
vASSERT(dstMovForSave->getLinearizedStart() >= dst->getLinearizedStart() &&
|
|
dstMovForSave->getLinearizedEnd() <= dst->getLinearizedEnd());
|
|
bb->insertBefore(it, saveInst);
|
|
|
|
// create label
|
|
G4_Label *label = builder.createLocalBlockLabel("barrier_WA_loop");
|
|
auto labelInst = builder.createLabelInst(label, false);
|
|
bb->insertBefore(it, labelInst);
|
|
|
|
// (W) and(1) (eq)f0.0 null:ud n0.0:ud 0x1:ud
|
|
G4_DstRegRegion *nullDst = builder.createNullDst(Type_UD);
|
|
G4_SrcRegRegion *src0And = builder.createSrc(
|
|
builder.phyregpool.getN0Reg(), 0, 0, builder.getRegionScalar(), Type_UD);
|
|
G4_CondMod *condMod = builder.createCondMod(Mod_e, WAFlagVar, 0);
|
|
auto andInst = builder.createInternalInst(
|
|
nullptr, G4_and, condMod, g4::NOSAT, g4::SIMD1, nullDst, src0And,
|
|
builder.createImm(0x1, Type_UD), InstOpt_WriteEnable);
|
|
bb->insertBefore(it, andInst);
|
|
|
|
// (W&f0.0) while(1) loop
|
|
G4_Predicate *pred = builder.createPredicate(PredState_Plus, WAFlagVar, 0);
|
|
auto whileInst = builder.createInternalCFInst(
|
|
pred, G4_while, g4::SIMD1, label, label, InstOpt_WriteEnable);
|
|
bb->insertBefore(it, whileInst);
|
|
|
|
// restore f0.0:ud from dst.0:ud
|
|
// mov(1) f0.0:ud dst.0:ud
|
|
G4_DstRegRegion *dstMovForRestore = builder.createDst(WAFlagVar, Type_UD);
|
|
G4_SrcRegRegion *srcMovForRestore =
|
|
builder.createSrc(dst->getBase(), dst->getRegOff(), dst->getSubRegOff(),
|
|
builder.getRegionScalar(), Type_UD);
|
|
auto restoreInst =
|
|
builder.createMov(g4::SIMD1, dstMovForRestore, srcMovForRestore,
|
|
InstOpt_WriteEnable, false);
|
|
*it = restoreInst;
|
|
}
|
|
|
|
// Expand Intrinsic::NamedBarrierWA instruction
|
|
void Optimizer::applyNamedBarrierWA(INST_LIST_ITER it, G4_BB *bb) {
|
|
G4_INST *inst = *it;
|
|
|
|
if (!inst->isNamedBarrierWAIntrinsic())
|
|
return;
|
|
|
|
// The dst of Intrinsic::NamedBarrierWA instruction has 3 DWs:
|
|
// dst.0:ud is for legalizing the barrier id which could be :b datatype
|
|
// or immediate.
|
|
// dst.1:ud is for generating the mask.
|
|
// dst.2:ud is for saving existing flag so WA can use it in the loop
|
|
// The src0 of Intrinsic::NamedBarrierWA instruction is the barrier id.
|
|
|
|
auto dst = inst->getDst();
|
|
auto src = inst->getSrc(0);
|
|
|
|
G4_RegVar *WAFlagVar = builder.createTempFlag(2, "WAFlagUD")->getRegVar();
|
|
WAFlagVar->setPhyReg(builder.phyregpool.getF0Reg(), 0);
|
|
|
|
// save f0.0:ud to dst.2:ud, then f0.0 can be used in the loop
|
|
// (W) mov(1) dst.2:ud f0.0:ud
|
|
G4_DstRegRegion *dstMovForSave = builder.createDst(
|
|
dst->getBase(), dst->getRegOff(), dst->getSubRegOff() + 2, 1, Type_UD);
|
|
G4_SrcRegRegion *srcMovForSave =
|
|
builder.createSrc(WAFlagVar, 0, 0, builder.getRegionScalar(), Type_UD);
|
|
auto saveInst = builder.createMov(g4::SIMD1, dstMovForSave, srcMovForSave,
|
|
InstOpt_WriteEnable, false);
|
|
vASSERT(dstMovForSave->getLinearizedStart() >= dst->getLinearizedStart() &&
|
|
dstMovForSave->getLinearizedEnd() <= dst->getLinearizedEnd());
|
|
bb->insertBefore(it, saveInst);
|
|
|
|
// (W) mov dst.1<1>:ud 0x1:ud
|
|
G4_DstRegRegion *dstMov = builder.createDst(
|
|
dst->getBase(), dst->getRegOff(), dst->getSubRegOff() + 1, 1, Type_UD);
|
|
auto movInst =
|
|
builder.createMov(g4::SIMD1, dstMov, builder.createImm(0x1, Type_UD),
|
|
InstOpt_WriteEnable, false);
|
|
vASSERT(dstMov->getLinearizedStart() >= dst->getLinearizedStart() &&
|
|
dstMov->getLinearizedEnd() <= dst->getLinearizedEnd());
|
|
bb->insertBefore(it, movInst);
|
|
|
|
// (W) mov dst.0<1>:ud src(barrierId):ud
|
|
G4_DstRegRegion *dstMov2 = builder.createDst(dst->getBase(), dst->getRegOff(),
|
|
dst->getSubRegOff(), 1, Type_UD);
|
|
auto movInst2 =
|
|
builder.createMov(g4::SIMD1, dstMov2, src, InstOpt_WriteEnable, false);
|
|
vASSERT(dstMov2->getLinearizedStart() >= dst->getLinearizedStart() &&
|
|
dstMov2->getLinearizedEnd() <= dst->getLinearizedEnd());
|
|
bb->insertBefore(it, movInst2);
|
|
|
|
// (W) shl(1) dst.1:ud dst.1:ud dst.0:ud
|
|
G4_SrcRegRegion *src0Shl = builder.createSrc(
|
|
dst->getBase(), dst->getRegOff(), dst->getSubRegOff() + 1,
|
|
builder.getRegionScalar(), Type_UD);
|
|
G4_SrcRegRegion *src1Shl =
|
|
builder.createSrc(dst->getBase(), dst->getRegOff(), dst->getSubRegOff(),
|
|
builder.getRegionScalar(), Type_UD);
|
|
auto shlInst =
|
|
builder.createBinOp(G4_shl, g4::SIMD1, builder.duplicateOperand(dstMov),
|
|
src0Shl, src1Shl, InstOpt_WriteEnable, false);
|
|
bb->insertBefore(it, shlInst);
|
|
|
|
// create label
|
|
G4_Label *label = builder.createLocalBlockLabel("barrier_WA_loop");
|
|
auto labelInst = builder.createLabelInst(label, false);
|
|
bb->insertBefore(it, labelInst);
|
|
|
|
// (W) and(1) (eq)f0.0 null:ud n0.0:ud dst1.1:ud
|
|
G4_DstRegRegion *nullDst = builder.createNullDst(Type_UD);
|
|
G4_SrcRegRegion *src0And = builder.createSrc(
|
|
builder.phyregpool.getN0Reg(), 0, 0, builder.getRegionScalar(), Type_UD);
|
|
G4_SrcRegRegion *src1And = builder.duplicateOperand(src0Shl);
|
|
G4_CondMod *condMod = builder.createCondMod(Mod_e, WAFlagVar, 0);
|
|
auto andInst = builder.createInternalInst(nullptr, G4_and, condMod, g4::NOSAT,
|
|
g4::SIMD1, nullDst, src0And,
|
|
src1And, InstOpt_WriteEnable);
|
|
bb->insertBefore(it, andInst);
|
|
|
|
// (W&f0.0) while(1) loop
|
|
G4_Predicate *pred = builder.createPredicate(PredState_Plus, WAFlagVar, 0);
|
|
auto whileInst = builder.createInternalCFInst(
|
|
pred, G4_while, g4::SIMD1, label, label, InstOpt_WriteEnable);
|
|
bb->insertBefore(it, whileInst);
|
|
|
|
// restore f0.0:ud from dst.2:ud
|
|
// mov(1) f0.0:ud dst.2:ud
|
|
G4_DstRegRegion *dstMovForRestore = builder.createDst(WAFlagVar, Type_UD);
|
|
G4_SrcRegRegion *srcMovForRestore = builder.createSrc(
|
|
dst->getBase(), dst->getRegOff(), dst->getSubRegOff() + 2,
|
|
builder.getRegionScalar(), Type_UD);
|
|
auto restoreInst =
|
|
builder.createMov(g4::SIMD1, dstMovForRestore, srcMovForRestore,
|
|
InstOpt_WriteEnable, false);
|
|
*it = restoreInst;
|
|
}
|
|
|
|
// Insert IEEEExceptionTrap before EOT.
|
|
void Optimizer::insertIEEEExceptionTrap() {
|
|
if (!fg.builder->getOption(vISA_AddIEEEExceptionTrap))
|
|
return;
|
|
|
|
for (auto bb : fg) {
|
|
for (auto it = bb->begin(), ie = bb->end(); it != ie; ++it) {
|
|
G4_INST *inst = *it;
|
|
if (!inst->isEOT())
|
|
continue;
|
|
// Reserve 2 UD: one for sr0.1, the other for flag
|
|
G4_Declare *tmp =
|
|
builder.createTempVar(2, Type_UD, Even_Word, "ExTrapTemp");
|
|
G4_INST *trap = builder.createIntrinsicInst(
|
|
nullptr, Intrinsic::IEEEExceptionTrap, g4::SIMD1,
|
|
builder.createDst(tmp->getRegVar(), 0, 0, 1, Type_UD), nullptr,
|
|
nullptr, nullptr, InstOpt_WriteEnable, false);
|
|
bb->insertBefore(it, trap);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Expand IEEEExceptionTrap intrinsic as an infinite loop to catch any IEEE
|
|
// exception. Note that the IEEE exception trap enable bit should be set
|
|
// separately in CR initialization.
|
|
// TODO: Check if we can expand the trap into other inst like sync.host or
|
|
// illegal instruction to support this debug feature.
|
|
void Optimizer::expandIEEEExceptionTrap(INST_LIST_ITER it, G4_BB *bb) {
|
|
G4_INST *inst = *it;
|
|
vASSERT(inst->isIEEEExceptionTrap());
|
|
|
|
auto dst = inst->getDst();
|
|
// Get IEEE exception bits of state register where bits 5:0 of sr0.1:ud are
|
|
// for IEEE exception.
|
|
// (W) mov (1) dst.0:ud sr0.1<0;1,0>:ud
|
|
G4_DstRegRegion *tmpSR0Dot1Dst = builder.createDst(
|
|
dst->getBase(), dst->getRegOff(), dst->getSubRegOff(), 1, Type_UD);
|
|
G4_SrcRegRegion *SR0Dot1 = builder.createSrc(
|
|
builder.phyregpool.getSr0Reg(), 0, 1, builder.getRegionScalar(), Type_UD);
|
|
auto saveInst = builder.createMov(g4::SIMD1, tmpSR0Dot1Dst, SR0Dot1,
|
|
InstOpt_WriteEnable, false);
|
|
vASSERT(tmpSR0Dot1Dst->getLinearizedStart() >= dst->getLinearizedStart() &&
|
|
tmpSR0Dot1Dst->getLinearizedEnd() <= dst->getLinearizedEnd());
|
|
bb->insertBefore(it, saveInst);
|
|
|
|
// Save f0.0:ud to dst.1:ud, then f0.0 can be used in the loop
|
|
// (W) mov(1) dst.1:ud f0.0:ud
|
|
G4_RegVar *flagVar = builder.createTempFlag(1, "ex_trap_flag")->getRegVar();
|
|
flagVar->setPhyReg(builder.phyregpool.getF0Reg(), 0);
|
|
G4_DstRegRegion *tmpFlagDst = builder.createDst(
|
|
dst->getBase(), dst->getRegOff(), dst->getSubRegOff() + 1, 1, Type_UD);
|
|
G4_SrcRegRegion *flagSrc =
|
|
builder.createSrc(flagVar, 0, 0, builder.getRegionScalar(), Type_UD);
|
|
auto saveFlag = builder.createMov(g4::SIMD1, tmpFlagDst, flagSrc,
|
|
InstOpt_WriteEnable, false);
|
|
vASSERT(tmpFlagDst->getLinearizedStart() >= dst->getLinearizedStart() &&
|
|
tmpFlagDst->getLinearizedEnd() <= dst->getLinearizedEnd());
|
|
bb->insertBefore(it, saveFlag);
|
|
|
|
// Check if any IEEE exception bit is set and update flag register.
|
|
// (W) and (1) (ne)f0.0 tmpSR0Dot1 tmpSR0Dot1 0x3f:uw
|
|
G4_SrcRegRegion *tmpSR0Dot1Src =
|
|
builder.createSrc(dst->getBase(), dst->getRegOff(), dst->getSubRegOff(),
|
|
builder.getRegionStride1(), Type_UD);
|
|
auto andInst = builder.createInternalInst(
|
|
nullptr, G4_and, builder.createCondMod(Mod_ne, flagVar, 0), g4::NOSAT,
|
|
g4::SIMD1, builder.duplicateOperand(tmpSR0Dot1Dst), tmpSR0Dot1Src,
|
|
builder.createImm(0x3f, Type_UW), InstOpt_WriteEnable);
|
|
bb->insertBefore(it, andInst);
|
|
|
|
// Create label
|
|
G4_Label *label = builder.createLocalBlockLabel("ex_trap_loop");
|
|
auto labelInst = builder.createLabelInst(label, false);
|
|
bb->insertBefore(it, labelInst);
|
|
|
|
// Create a trap as infinite loop if flag register is set.
|
|
// (W&f0.0) while (1) ex_trap_loop
|
|
auto whileInst = builder.createInternalCFInst(
|
|
builder.createPredicate(PredState_Plus, flagVar, 0), G4_while, g4::SIMD1,
|
|
label, label, InstOpt_WriteEnable);
|
|
bb->insertBefore(it, whileInst);
|
|
|
|
// Restore flag register.
|
|
// (W) mov(1) f0.0:ud dst.1:ud
|
|
G4_DstRegRegion *flagDst = builder.createDst(flagVar, Type_UD);
|
|
G4_SrcRegRegion *tmpFlagSrc = builder.createSrc(
|
|
dst->getBase(), dst->getRegOff(), dst->getSubRegOff() + 1,
|
|
builder.getRegionScalar(), Type_UD);
|
|
auto restoreFlag = builder.createMov(g4::SIMD1, flagDst, tmpFlagSrc,
|
|
InstOpt_WriteEnable, false);
|
|
*it = restoreFlag;
|
|
}
|
|
|
|
// For a subroutine, insert a dummy move with {Switch} option immediately
|
|
// before the first non-label instruction in BB. Otherwie, for a following
|
|
// basic block, insert a dummy move before *any* instruction to ensure that
|
|
// no instruction should be placed between the targe jip/uip label and its
|
|
// associated instruction.
|
|
void Optimizer::addSwitchOptionToBB(G4_BB *bb, bool isSubroutine) {
|
|
auto instIter = bb->begin();
|
|
if (isSubroutine) {
|
|
for (auto instEnd = bb->end(); instIter != instEnd; ++instIter) {
|
|
G4_INST *bbInst = *instIter;
|
|
if (!bbInst->isLabel()) {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (instIter != bb->end() && ((*instIter)->getOption() & InstOpt_Switch)) {
|
|
// this BB is already processed, skip
|
|
return;
|
|
}
|
|
|
|
// mov (1) null<1>:ud r0.0<0;1,0>:ud {Switch}
|
|
G4_DstRegRegion *movDst = builder.createNullDst(Type_UD);
|
|
G4_SrcRegRegion *movSrc = builder.createSrcRegRegion(
|
|
builder.getBuiltinR0(), builder.getRegionScalar());
|
|
G4_INST *movInst =
|
|
builder.createMov(g4::SIMD1, movDst, movSrc, InstOpt_WriteEnable, false);
|
|
movInst->setOptionOn(InstOpt_Switch);
|
|
bb->insertBefore(instIter, movInst);
|
|
}
|
|
|
|
// Rewrites src0 of a line/pln instruction to the scalar region <0;1,0>.
void Optimizer::linePlaneWA(G4_INST *inst) {
  // Putting it here instead of in HW conformity because we need original src0
  // region in scheduler to calculate RB correctly. Otherwise setup moves for
  // src0 get scheduled after instruction
  //
  // HW check #12: Check and correct the first operand for line instruction
  // Actually it must be a replicated stream of 4 contiguous elements.
  // That means <0;4,1> region. But in asm code it must be presented as
  // replicated scalar - <0;1,0>.
  if (inst->opcode() != G4_line && inst->opcode() != G4_pln) {
    return;
  }

  G4_Operand *src = inst->getSrc(0);
  const RegionDesc *rd = nullptr;
  if (src->isSrcRegRegion()) {
    rd = src->asSrcRegRegion()->getRegion();
  }
  vISA_ASSERT(rd != nullptr, " Src0 of line inst is not regregion. ");

  // Already in the required form.
  if (rd->isScalar()) {
    return;
  }
  vISA_ASSERT((rd->vertStride == 0 || rd->vertStride == 4) && rd->width == 4,
              "Unexpected region for the first line operand.");

  // create a new rd for src0
  const RegionDesc *scalarRd = builder.getRegionScalar();
  src->asSrcRegRegion()->setRegion(builder, scalarRd);
}
|
|
|
|
//
|
|
// This inserts two dummy moves to clear flag dependencies before EOT:
|
|
// mov(1) null:ud f0.0<0;1,0>:ud{ Align1, Q1, NoMask }
|
|
// mov(1) null:ud f1.0<0;1,0>:ud{ Align1, Q1, NoMask }
|
|
// This is done if f0/f1 is ever defined in a BB but not used in it, as we
|
|
// conservatively assume that the flag may be undefined when the EOT is reached.
|
|
// Note that USC only does this if EOT is inside control flow, i.e., EOT is an
|
|
// early exit
|
|
//
|
|
void Optimizer::clearARFDependencies() {
|
|
auto flagToInt = [](G4_Areg *areg) {
|
|
vISA_ASSERT(areg->isFlag(), "expect F0 or F1");
|
|
return areg->getArchRegType() == AREG_F0 ? 0 : 1;
|
|
};
|
|
// see if F0 and F1 are ever defined but not used in the same BB
|
|
bool unusedFlag[2]; // f0 and f1
|
|
unusedFlag[0] = unusedFlag[1] = false;
|
|
for (auto bb : fg) {
|
|
bool unusedFlagLocal[2]; // f0 and f1
|
|
unusedFlagLocal[0] = unusedFlagLocal[1] = false;
|
|
|
|
for (auto inst : *bb) {
|
|
if (inst->isEOT()) {
|
|
// EOT should be the last inst in BB.
|
|
continue;
|
|
}
|
|
|
|
// check predicate source
|
|
if (inst->getPredicate()) {
|
|
G4_VarBase *flag = inst->getPredicate()->getBase();
|
|
if (flag->isRegVar()) {
|
|
G4_Areg *areg = flag->asRegVar()->getPhyReg()->asAreg();
|
|
unusedFlagLocal[flagToInt(areg)] = false;
|
|
}
|
|
}
|
|
|
|
// check explicit source
|
|
for (int i = 0; i < inst->getNumSrc(); ++i) {
|
|
if (inst->getSrc(i) && inst->getSrc(i)->isSrcRegRegion() &&
|
|
inst->getSrc(i)->isFlag()) {
|
|
G4_SrcRegRegion *src = inst->getSrc(i)->asSrcRegRegion();
|
|
if (src->getBase()->isRegVar()) {
|
|
G4_Areg *flag = src->getBase()->asRegVar()->getPhyReg()->asAreg();
|
|
unusedFlagLocal[flagToInt(flag)] = false;
|
|
}
|
|
}
|
|
}
|
|
|
|
// check explicit dst
|
|
if (inst->getDst() && inst->getDst()->isFlag()) {
|
|
// flag is an explicit dst
|
|
G4_DstRegRegion *dst = inst->getDst();
|
|
if (dst->getBase()->isRegVar()) {
|
|
G4_Areg *flag = dst->getBase()->asRegVar()->getPhyReg()->asAreg();
|
|
unusedFlagLocal[flagToInt(flag)] = true;
|
|
}
|
|
}
|
|
// check cond mod
|
|
else if (G4_VarBase *flag = inst->getCondModBase()) {
|
|
if (flag->isRegVar()) {
|
|
G4_Areg *areg = flag->asRegVar()->getPhyReg()->asAreg();
|
|
unusedFlagLocal[flagToInt(areg)] = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (unusedFlagLocal[0] && unusedFlag[0] == false) {
|
|
unusedFlag[0] = true;
|
|
}
|
|
|
|
if (unusedFlagLocal[1] && unusedFlag[1] == false) {
|
|
unusedFlag[1] = true;
|
|
}
|
|
|
|
if (unusedFlag[0] && unusedFlag[1]) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (unusedFlag[0] || unusedFlag[1]) {
|
|
for (auto bb : fg) {
|
|
if (bb->size() == 0) {
|
|
return;
|
|
}
|
|
G4_INST *inst = bb->back();
|
|
if (inst->isEOT()) {
|
|
auto instIter = bb->end();
|
|
--instIter;
|
|
if (unusedFlag[0]) {
|
|
G4_SrcRegRegion *flagSrc =
|
|
builder.createSrc(builder.phyregpool.getF0Reg(), 0, 0,
|
|
builder.getRegionScalar(), Type_UD);
|
|
G4_DstRegRegion *nullDst = builder.createNullDst(Type_UD);
|
|
G4_INST *inst = builder.createMov(g4::SIMD1, nullDst, flagSrc,
|
|
InstOpt_WriteEnable, false);
|
|
bb->insertBefore(instIter, inst);
|
|
}
|
|
if (unusedFlag[1]) {
|
|
G4_SrcRegRegion *flagSrc =
|
|
builder.createSrc(builder.phyregpool.getF1Reg(), 0, 0,
|
|
builder.getRegionScalar(), Type_UD);
|
|
G4_DstRegRegion *nullDst = builder.createNullDst(Type_UD);
|
|
G4_INST *inst = builder.createMov(g4::SIMD1, nullDst, flagSrc,
|
|
InstOpt_WriteEnable, false);
|
|
bb->insertBefore(instIter, inst);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
void Optimizer::mulMacRSWA() {
|
|
auto hasGRFOverlap = [this](G4_Operand *A, G4_Operand *B) {
|
|
if (A->isNullReg() || !A->isGreg())
|
|
return false;
|
|
if (B->isNullReg() || !B->isGreg())
|
|
return false;
|
|
|
|
unsigned LB1 =
|
|
A->getLinearizedStart() / fg.builder->numEltPerGRF<Type_UB>();
|
|
unsigned RB1 = A->getLinearizedEnd() / fg.builder->numEltPerGRF<Type_UB>();
|
|
unsigned LB2 =
|
|
B->getLinearizedStart() / fg.builder->numEltPerGRF<Type_UB>();
|
|
unsigned RB2 = B->getLinearizedEnd() / fg.builder->numEltPerGRF<Type_UB>();
|
|
|
|
return (RB2 >= LB1 && RB1 >= LB2);
|
|
};
|
|
|
|
auto isBothMulClass = [](G4_INST *inst1, G4_INST *inst2) {
|
|
return (inst1->opcode() == G4_mul || inst1->opcode() == G4_mac) &&
|
|
(inst2->opcode() == G4_mul || inst2->opcode() == G4_mac);
|
|
};
|
|
|
|
auto isBothMaclClass = [](G4_INST *inst1, G4_INST *inst2) {
|
|
// In vISA, only G4_mach will be used. IGA will change it G4_macl according
|
|
// to certain conditions.
|
|
return (inst1->opcode() == G4_mach) &&
|
|
(inst2->opcode() == G4_mach);
|
|
};
|
|
|
|
auto checkFlatRegRegionFunc =
|
|
[](uint8_t dstStrideInBytes, uint8_t dstSubRegOffInBytes,
|
|
uint8_t srcStrideInBytes, uint8_t srcSubRegOffInBytes,
|
|
uint8_t exChannelWidth) -> bool {
|
|
return ((dstSubRegOffInBytes == srcSubRegOffInBytes) &&
|
|
(dstStrideInBytes == srcStrideInBytes) &&
|
|
(dstStrideInBytes % exChannelWidth == 0));
|
|
};
|
|
|
|
G4_INST *prevInst = nullptr;
|
|
for (auto bb : fg) {
|
|
INST_LIST_ITER ii = bb->begin();
|
|
|
|
while (ii != bb->end()) {
|
|
G4_INST *inst = *ii;
|
|
|
|
if (!inst->isIntegerPipeInstructionXe()) {
|
|
ii++;
|
|
continue;
|
|
}
|
|
|
|
if (!prevInst) {
|
|
prevInst = inst;
|
|
ii++;
|
|
continue;
|
|
}
|
|
|
|
uint8_t exChannelWidth = (uint8_t)TypeSize(inst->getExecType());
|
|
|
|
// Issue 1:
|
|
// MUL opcode class = {MUL, MAC}
|
|
// MACL opcode class = {MACL, MACH}
|
|
//
|
|
// Issue is present for MUL opcode class OR MACL opcode class (both
|
|
// prev/current instruction should belong to the same opcode class)
|
|
// 1. prev instructions src1 has REGIONING/SCALAR
|
|
// 2. current instruction src1 is FLAT and shares the same src1 as prev
|
|
//
|
|
// instruction Issue is not present for below cases.
|
|
// 1. prev instruction is FLAT and current instruction has
|
|
// REGIONING/SCALAR
|
|
// 2. prev/current both are FLAT
|
|
// 3. prev/current both has REGIONING/SCALAR
|
|
// 4. One instruction is in MUL opcode class and the other instruction
|
|
// is in MACL opcode class
|
|
if (isBothMulClass(prevInst, inst) || isBothMaclClass(prevInst, inst)) {
|
|
G4_Operand *prevSrc1 = prevInst->getSrc(1);
|
|
G4_Operand *curSrc1 = inst->getSrc(1);
|
|
|
|
if (prevSrc1 && prevSrc1->isGreg() && prevSrc1->isSrcRegRegion() &&
|
|
curSrc1 && curSrc1->isGreg() &&
|
|
curSrc1->isSrcRegRegion()) { // All regions
|
|
|
|
if (!prevSrc1->asSrcRegRegion()->isFlatRegRegion(
|
|
exChannelWidth, checkFlatRegRegionFunc) &&
|
|
curSrc1->asSrcRegRegion()->isFlatRegRegion(
|
|
exChannelWidth, checkFlatRegRegionFunc) &&
|
|
hasGRFOverlap(
|
|
prevSrc1,
|
|
curSrc1)) { // none flat vs flat regions, and overlap
|
|
// WorkAround: Insert dummy instruction that can break src1 RS
|
|
// chain between regioning MUL instruction and FLAT MULK
|
|
// instruction (IMMEDIATE operand can be used for src1 to break
|
|
// the RS chain)
|
|
insertDummyAdd(bb, ii);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Issue 2
|
|
// prev.instruction is non-MUL opcode class instruction AND non-MACL
|
|
// opcode class instruction has(FLAT or Regioning / Scalar) src1 and
|
|
// current Instruction is MACL opcode class
|
|
// instruction AND has FLAT regioning AND shares the same src1 has the
|
|
// prev.instruction,
|
|
if (inst->opcode() == G4_mach) {
|
|
G4_Operand *prevSrc1 = prevInst->getSrc(1);
|
|
G4_Operand *curSrc1 = inst->getSrc(1);
|
|
|
|
if (prevSrc1 && prevSrc1->isGreg() && prevSrc1->isSrcRegRegion() &&
|
|
curSrc1 && curSrc1->isGreg() && curSrc1->isSrcRegRegion()) {
|
|
if (prevInst->opcode() != G4_mach && prevInst->opcode() != G4_mul &&
|
|
prevInst->opcode() != G4_mac) {
|
|
if (curSrc1->asSrcRegRegion()->isFlatRegRegion(
|
|
exChannelWidth, checkFlatRegRegionFunc) &&
|
|
hasGRFOverlap(prevSrc1, curSrc1)) {
|
|
insertDummyAdd(bb, ii, 1);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
prevInst = inst;
|
|
ii++;
|
|
}
|
|
}
|
|
}
|
|
|
|
// change the send src0 region to be consistent with assembler expectation
|
|
// We do it here instead of HW conformity since they only affect binary encoding
|
|
// ToDo: this should not be necessary anymore, should see if we can remove
|
|
void Optimizer::fixSendSrcRegion(G4_INST *inst) {
  // Only sends that actually have a src0 are rewritten.
  if (!inst->isSend() || inst->getSrc(0) == nullptr) {
    return;
  }

  // Region selection by execution size:
  //   1      -> scalar region
  //   > 8    -> stride-1 region
  //   2..8   -> <execSize;execSize,1>
  const uint8_t execSize = inst->getExecSize();
  const RegionDesc *newDesc =
      execSize == 1  ? builder.getRegionScalar()
      : execSize > 8 ? builder.getRegionStride1()
                     : builder.createRegionDesc(execSize, execSize, 1);

  inst->getSrc(0)->asSrcRegRegion()->setRegion(builder, newDesc);
}
|
|
|
|
// some workaround for HW restrictions. We apply them here so as not to affect
|
|
// optimizations, RA, and scheduling
|
|
void Optimizer::HWWorkaround() {
|
|
// Ensure the first instruction of a stack function has switch option.
|
|
if (fg.getIsStackCallFunc() &&
|
|
VISA_WA_CHECK(builder.getPWaTable(), WaThreadSwitchAfterCall)) {
|
|
addSwitchOptionToBB(fg.getEntryBB(), true);
|
|
}
|
|
|
|
DPASSrc2RSCache src2GRFCache;
|
|
// set physical pred/succ as it's needed for the call WA
|
|
fg.setPhysicalPredSucc();
|
|
const bool scheduleFenceCommit =
|
|
builder.getOption(vISA_scheduleFenceCommit) &&
|
|
builder.getPlatform() >= GENX_TGLLP;
|
|
BB_LIST_ITER ib, bend(fg.end());
|
|
for (ib = fg.begin(); ib != bend; ++ib) {
|
|
G4_BB *bb = (*ib);
|
|
INST_LIST_ITER ii = bb->begin();
|
|
|
|
while (ii != bb->end()) {
|
|
G4_INST *inst = *ii;
|
|
|
|
G4_InstSend *sendInst = inst->asSendInst();
|
|
if (sendInst && sendInst->isFence() &&
|
|
!builder.getOption(vISA_skipFenceCommit)) {
|
|
addFenceCommit(ii, bb, scheduleFenceCommit);
|
|
}
|
|
|
|
// To solve truncation issue in compaction table implementation
|
|
if (VISA_WA_CHECK(builder.getPWaTable(), Wa_22010811838) &&
|
|
inst->isDpas()) {
|
|
G4_InstDpas *dpasInst = inst->asDpasInst();
|
|
GenPrecision p = dpasInst->getSrc1Precision();
|
|
if (p == GenPrecision::S8 || p == GenPrecision::S4 ||
|
|
p == GenPrecision::S2 || p == GenPrecision::BF16) {
|
|
dpasInst->setOptionOn(InstOpt_NoCompact);
|
|
}
|
|
}
|
|
if (inst->isCall() || inst->isFCall()) {
|
|
if (VISA_WA_CHECK(builder.getPWaTable(), WaThreadSwitchAfterCall)) {
|
|
// WA:
|
|
// A call instruction must be followed by an instruction that supports
|
|
// Switch. When call takes a jump, the first instruction must have a
|
|
// Switch.
|
|
BB_LIST_ITER nextBBIter = ib;
|
|
++nextBBIter;
|
|
if (nextBBIter != bend) {
|
|
addSwitchOptionToBB(*nextBBIter, false);
|
|
}
|
|
// also do this for call target
|
|
addSwitchOptionToBB(bb->Succs.front(), true);
|
|
}
|
|
}
|
|
|
|
// we must set {Switch} if the instruction updates ARF with no scoreboard
|
|
{
|
|
G4_DstRegRegion *dst = inst->getDst();
|
|
if (dst != nullptr && dst->getBase()->noScoreBoard()) {
|
|
inst->setOptionOn(InstOpt_Switch);
|
|
}
|
|
}
|
|
|
|
if (inst->isSend() && !inst->isNoPreemptInst() &&
|
|
builder.needsNoPreemptR2ForSend()) {
|
|
G4_Operand *Src0 = inst->getSrc(0);
|
|
if (Src0 && Src0->isGreg()) {
|
|
unsigned LB = Src0->getLinearizedStart();
|
|
if (LB == 2 * kernel.numEltPerGRF<Type_UB>()) {
|
|
inst->setOptionOn(InstOpt_NoPreempt);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (builder.hasFdivPowWA() && inst->isMath() &&
|
|
(inst->asMathInst()->getMathCtrl() == MATH_FDIV ||
|
|
inst->asMathInst()->getMathCtrl() == MATH_POW)) {
|
|
INST_LIST_ITER nextIter = ii;
|
|
nextIter++;
|
|
if (nextIter == bb->end()) {
|
|
break;
|
|
}
|
|
// check next inst
|
|
G4_INST *nextInst = *nextIter;
|
|
if (!nextInst->isSend() && nextInst->getDst() &&
|
|
!nextInst->hasNULLDst() && nextInst->getDst()->crossGRF(builder)) {
|
|
// insert a nop
|
|
G4_INST *nopInst = builder.createNop(inst->getOption());
|
|
bb->insertBefore(nextIter, nopInst);
|
|
}
|
|
}
|
|
|
|
if (inst->isCall() || inst->isReturn()) {
|
|
inst->setExecSize(kernel.getSimdSize());
|
|
}
|
|
|
|
// HW Workaround: for platforms without 64-bit regioning, change send
|
|
// src/dst type from QWord to DWord
|
|
if (builder.no64bitRegioning() && inst->isSend()) {
|
|
G4_DstRegRegion *dst = inst->getDst();
|
|
if (dst != nullptr && dst->getTypeSize() == 8) {
|
|
dst->setType(builder, Type_D);
|
|
}
|
|
|
|
G4_Operand *src0 = inst->getSrc(0);
|
|
if (src0 != nullptr && src0->getTypeSize() == 8) {
|
|
src0->asSrcRegRegion()->setType(builder, Type_D);
|
|
}
|
|
|
|
if (inst->isSplitSend()) {
|
|
G4_Operand *src1 = inst->getSrc(1);
|
|
if (src1 != nullptr && src1->getTypeSize() == 8) {
|
|
src1->asSrcRegRegion()->setType(builder, Type_D);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (inst->isEOT() && VISA_WA_CHECK(builder.getPWaTable(),
|
|
WaClearTDRRegBeforeEOTForNonPS)) {
|
|
// insert
|
|
// mov(8) tdr0:uw 0x0:uw {NoMask}
|
|
G4_DstRegRegion *tdrDst =
|
|
builder.createDst(builder.phyregpool.getTDRReg(), 0, 0, 1, Type_UW);
|
|
G4_Imm *src = builder.createImm(0, Type_UW);
|
|
G4_INST *movInst =
|
|
builder.createMov(g4::SIMD8, tdrDst, src,
|
|
InstOpt_WriteEnable | InstOpt_Switch, false);
|
|
bb->insertBefore(ii, movInst);
|
|
}
|
|
|
|
if (inst->isEOT() &&
|
|
VISA_WA_CHECK(builder.getPWaTable(), Wa_14010017096)) {
|
|
// insert "(W) mov(16) acc0.0:f 0x0:f" before EOT
|
|
G4_INST *movInst = builder.createMov(
|
|
g4::SIMD16,
|
|
builder.createDst(builder.phyregpool.getAcc0Reg(), 0, 0, 1, Type_F),
|
|
builder.createImm(0, Type_F), InstOpt_WriteEnable, false);
|
|
// insert mov before contiguous send, in case that there are instruction
|
|
// combined set on continuous two send
|
|
INST_LIST_ITER insert_point = ii;
|
|
for (; insert_point != bb->begin(); --insert_point)
|
|
if (!(*insert_point)->isSend())
|
|
break;
|
|
|
|
if (!(*insert_point)->isEOT())
|
|
++insert_point;
|
|
bb->insertBefore(insert_point, movInst);
|
|
}
|
|
|
|
if (inst->isEOT() &&
|
|
VISA_WA_CHECK(builder.getPWaTable(), Wa_16013338947)) {
|
|
bool hasLegalInstAfterEOT = false;
|
|
for (auto bnext = std::next(ib); bnext != bend; ++bnext) {
|
|
G4_BB *nextBB = *bnext;
|
|
bool found =
|
|
std::any_of(nextBB->begin(), nextBB->end(),
|
|
[](G4_INST *inst) { return !inst->isLabel(); });
|
|
if (found) {
|
|
hasLegalInstAfterEOT = true;
|
|
break;
|
|
}
|
|
}
|
|
if (!hasLegalInstAfterEOT) {
|
|
G4_INST *nopInst = builder.createNop(InstOpt_NoOpt);
|
|
bb->insertAfter(ii, nopInst);
|
|
}
|
|
}
|
|
|
|
if (VISA_WA_CHECK(builder.getPWaTable(), WaResetN0BeforeGatewayMessage) &&
|
|
inst->isSend() && inst->getMsgDesc()->isBarrier()) {
|
|
// mov (1) n0.0 0x0 {Switch}
|
|
G4_DstRegRegion *n0Dst =
|
|
builder.createDst(builder.phyregpool.getN0Reg(), 0, 0, 1, Type_UD);
|
|
auto movInst =
|
|
builder.createMov(g4::SIMD1, n0Dst, builder.createImm(0, Type_UD),
|
|
InstOpt_WriteEnable | InstOpt_Switch, false);
|
|
bb->insertBefore(ii, movInst);
|
|
}
|
|
|
|
linePlaneWA(inst);
|
|
fixSendSrcRegion(inst);
|
|
if (builder.hasMathDpasConflict() && inst->isMath()) {
|
|
INST_LIST_ITER nextIter = ii;
|
|
nextIter++;
|
|
|
|
for (int i = 0; i < 5; i++) {
|
|
G4_INST *newInst = inst->cloneInst();
|
|
bb->insertBefore(nextIter, newInst);
|
|
}
|
|
ii = nextIter;
|
|
continue;
|
|
}
|
|
|
|
if (VISA_WA_CHECK(builder.getPWaTable(), Wa_22013880840) &&
|
|
builder.getOption(vISA_ALTMode) == true && inst->opcode() == G4_sel &&
|
|
inst->getPredicate() != nullptr && inst->getCondMod() == nullptr &&
|
|
inst->getDst() && IS_TYPE_FLOAT_ALL(inst->getDst()->getType())) {
|
|
auto pred = inst->getPredicate();
|
|
auto movInst1 = builder.createInternalInst(
|
|
builder.duplicateOperand(pred), G4_mov, nullptr,
|
|
inst->getSaturate(), inst->getExecSize(),
|
|
builder.duplicateOperand(inst->getDst()),
|
|
builder.duplicateOperand(inst->getSrc(0)), nullptr,
|
|
inst->getOption());
|
|
bb->insertBefore(ii, movInst1);
|
|
|
|
G4_PredState reverse = pred->getState() == PredState_Minus
|
|
? PredState_Plus
|
|
: PredState_Minus;
|
|
auto newPred = builder.createPredicate(
|
|
reverse, pred->getBase(), pred->getSubRegOff(), pred->getControl());
|
|
auto movInst2 = builder.createInternalInst(
|
|
newPred, G4_mov, nullptr, inst->getSaturate(), inst->getExecSize(),
|
|
builder.duplicateOperand(inst->getDst()),
|
|
builder.duplicateOperand(inst->getSrc(1)), nullptr,
|
|
inst->getOption());
|
|
*ii = movInst2;
|
|
inst->removeAllDefs();
|
|
}
|
|
|
|
if (builder.kernel.getNumRegTotal() == 256 && inst->isEOT() &&
|
|
VISA_WA_CHECK(builder.getPWaTable(), Wa_14016880151)) {
|
|
INST_LIST_ITER preIter = std::prev(ii);
|
|
if (preIter != ii) {
|
|
G4_INST *preInst = (*preIter);
|
|
if (preInst->isAtomicInst()) {
|
|
insertDummyCsel(bb, preIter, false);
|
|
} else {
|
|
insertDummyCsel(bb, ii, false);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (builder.needBarrierWA() && inst->isBarrierWAIntrinsic()) {
|
|
applyBarrierWA(ii, bb);
|
|
}
|
|
|
|
if (builder.needBarrierWA() && inst->isNamedBarrierWAIntrinsic()) {
|
|
applyNamedBarrierWA(ii, bb);
|
|
}
|
|
|
|
if (inst->isIEEEExceptionTrap())
|
|
expandIEEEExceptionTrap(ii, bb);
|
|
|
|
// Double up every TGM fence instruction if fenceOp is not
|
|
// LSC_FENCE_OP_NONE
|
|
if (builder.needTGMDoubleFenceWA() && inst->isSend() &&
|
|
inst->asSendInst()->isFence() &&
|
|
inst->asSendInst()->getMsgDesc()->getSFID() == SFID::TGM &&
|
|
inst->asSendInst()->getMsgDescRaw()->getLscFenceOp() !=
|
|
LSC_FENCE_OP_NONE)
|
|
bb->insertBefore(ii, inst->cloneInst());
|
|
|
|
ii++;
|
|
}
|
|
}
|
|
|
|
if (VISA_WA_CHECK(builder.getPWaTable(), WaClearArfDependenciesBeforeEot)) {
|
|
clearARFDependencies();
|
|
}
|
|
if (VISA_WA_CHECK(builder.getPWaTable(), Wa_2201674230)) {
|
|
clearSendDependencies();
|
|
}
|
|
|
|
if (builder.hasMulMacRSIssue()) {
|
|
mulMacRSWA();
|
|
}
|
|
|
|
if (builder.needResetA0forVxHA0()) {
|
|
// reset a0 to 0 at the beginning of a shader.
|
|
// The goal of this initialization is to make sure that there is no
|
|
// garbage values in the address register for inactive simd lanes.
|
|
// With indirect addressing HW requires that there is no
|
|
// out-of-bounds access even on inactive simd lanes.
|
|
|
|
// Note: this initialization doesn't cover scenarios where the
|
|
// address register is used in a send descriptor and later used in
|
|
// indirect addressing.
|
|
resetA0();
|
|
}
|
|
|
|
if (builder.getOption(vISA_setA0toTdrForSendc)) {
|
|
// set A0 to tdr0 before sendc/sendsc. TGL WA
|
|
setA0toTdrForSendc();
|
|
}
|
|
|
|
if (builder.needReplaceIndirectCallWithJmpi() &&
|
|
kernel.getBoolKernelAttr(Attributes::ATTR_Extern)) {
|
|
// jmpi WA can't properly work on platforms with SWSB. We didn't re-caculate
|
|
// the jump offset after swsb insertion.
|
|
vASSERT(!builder.hasSWSB());
|
|
// replace ret in the external functions with jmpi. That we will
|
|
// also replace the call with jmpi in
|
|
// Optimizer::expandIndirectCallWithRegTarget
|
|
replaceRetWithJmpi();
|
|
}
|
|
|
|
if (!builder.supportCallaRegSrc() && kernel.hasIndirectCall()) {
|
|
// If the indirect call has regiser src0, the register must be a
|
|
// ip-based address of the call target. Insert instructions before call to
|
|
// calculate the relative offset from call to the target
|
|
expandIndirectCallWithRegTarget();
|
|
}
|
|
|
|
if (builder.hasFPU0ReadSuppressionIssue()) {
|
|
fixReadSuppressioninFPU0();
|
|
}
|
|
}
|
|
|
|
// When destination is an address register the following apply:
|
|
// Destination must not span across the lower to upper 8 dword
|
|
// boundary of the register.
|
|
// Fix this restriction after RA instead of HWConformity just because
|
|
// RA(spill/fill, A0 save/restore) would generate such instructions.
|
|
void Optimizer::fixDirectAddrBoundOnDst() {
|
|
HWConformity hwConf(builder, kernel);
|
|
for (auto bb : kernel.fg) {
|
|
for (auto it = bb->begin(), ie = bb->end(); it != ie; ++it) {
|
|
G4_INST *inst = *it;
|
|
G4_DstRegRegion *dst = inst->getDst();
|
|
if (dst && !dst->isNullReg() &&
|
|
dst->getRegAccess() == Direct && dst->getTopDcl() &&
|
|
dst->getTopDcl()->getRegVar()->isAddress()) {
|
|
G4_Declare *dcl = dst->getTopDcl();
|
|
if (dcl->getTotalElems() > Eight_Word) {
|
|
if (dcl->getSubRegAlign() < Sixteen_Word)
|
|
dcl->setSubRegAlign(Sixteen_Word);
|
|
} else if (dcl->getTotalElems() > Four_Word) {
|
|
if (dcl->getSubRegAlign() < Eight_Word)
|
|
dcl->setSubRegAlign(Eight_Word);
|
|
} else if (dcl->getTotalElems() > Any) {
|
|
if (dcl->getSubRegAlign() < Four_Word)
|
|
dcl->setSubRegAlign(Four_Word);
|
|
}
|
|
if (((dst->getSubRegOff() + inst->getExecSize() - 1) / 16 !=
|
|
(dst->getSubRegOff() / 16)) ||
|
|
inst->getExecSize() == g4::SIMD32) {
|
|
hwConf.evenlySplitInst(it, bb, /*checkOverlap*/ false);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Returns true when accessing the GRF operand Opnd has a register dependency
// on the send SI:
//  - Opnd overlaps the send's destination (RAW or WAW), or
//  - Opnd is not a source region and overlaps one of the send's sources
//    (WAR); src1 is only considered for split sends.
// Overlap is tested on whole GRFs.
static bool retires(G4_Operand *Opnd, G4_INST *SI) {
  vASSERT(SI);
  const IR_Builder &builder = SI->getBuilder();
  vASSERT(Opnd && Opnd->isGreg());

  const auto eltsPerGRF = builder.numEltPerGRF<Type_UB>();
  const unsigned firstGRF = Opnd->getLinearizedStart() / eltsPerGRF;
  const unsigned lastGRF = Opnd->getLinearizedEnd() / eltsPerGRF;

  // Does the GRF range of Other intersect [firstGRF, lastGRF]?
  auto touchesOpnd = [&](G4_Operand *Other) {
    if (Other == nullptr || Other->isNullReg() || !Other->isGreg())
      return false;
    const unsigned otherFirst = Other->getLinearizedStart() / eltsPerGRF;
    const unsigned otherLast = Other->getLinearizedEnd() / eltsPerGRF;
    return lastGRF >= otherFirst && otherLast >= firstGRF;
  };

  // RAW or WAW
  if (touchesOpnd(SI->getDst()))
    return true;

  // A read cannot conflict with the send's own reads.
  if (Opnd->isSrcRegRegion())
    return false;

  // WAR. Anything else does not retire this send.
  return touchesOpnd(SI->getSrc(0)) ||
         (SI->isSplitSend() && touchesOpnd(SI->getSrc(1)));
}
|
|
|
|
// Emit a self-move to retire this send.
|
|
static G4_INST *emitRetiringMov(IR_Builder &builder, G4_BB *BB, G4_INST *SI,
                                INST_LIST_ITER InsertBefore) {
  vASSERT(SI && SI->isSend());

  // GRF in which the send's src0 starts.
  const unsigned grfNum = SI->getSrc(0)->getLinearizedStart() /
                          builder.numEltPerGRF<Type_UB>();

  // Temporary float variable pinned to that GRF.
  G4_Declare *tmpDcl = builder.createTempVar(16, Type_F, Any);
  tmpDcl->getRegVar()->setPhyReg(builder.phyregpool.getGreg(grfNum), 0);

  // (W) mov (8) tmp:f tmp<stride1>:f - destination and source are the same
  // variable.
  G4_DstRegRegion *selfDst =
      builder.createDst(tmpDcl->getRegVar(), 0, 0, 1, Type_F);
  G4_SrcRegRegion *selfSrc = builder.createSrc(
      tmpDcl->getRegVar(), 0, 0, builder.getRegionStride1(), Type_F);
  G4_INST *selfMov = builder.createMov(
      g4::SIMD8, selfDst, selfSrc, InstOpt_M0 | InstOpt_WriteEnable, false);

  BB->insertBefore(InsertBefore, selfMov);
  return selfMov;
}
|
|
|
|
// Use this instruction to retire live sends.
|
|
static void retireSends(std::vector<G4_INST *> &LiveSends, G4_INST *Inst) {
  if (LiveSends.empty())
    return;

  // Predicated instructions may not retire a send (sel is the exception).
  if (Inst->getPredicate() && Inst->opcode() != G4_sel)
    return;

  // Gather the direct, non-null GRF operands of Inst: destination first,
  // then the sources in order.
  std::vector<G4_Operand *> Candidates;
  G4_DstRegRegion *Dst = Inst->getDst();
  if (Dst && !Dst->isNullReg() && !Dst->isIndirect() && Dst->isGreg())
    Candidates.push_back(Dst);

  for (int i = 0; i < Inst->getNumSrc(); ++i) {
    G4_Operand *Src = Inst->getSrc(i);
    if (Src && Src->isSrcRegRegion() && !Src->isNullReg()) {
      G4_SrcRegRegion *Region = Src->asSrcRegRegion();
      if (!Region->isIndirect() && Region->isGreg())
        Candidates.push_back(Src);
    }
  }

  // A WAR, RAW or WAW dependency retires a live send. Retired entries are
  // nulled out first and compacted afterwards.
  bool AnyRetired = false;
  for (G4_Operand *Opnd : Candidates) {
    for (G4_INST *&Send : LiveSends) {
      if (Send == nullptr || !retires(Opnd, Send))
        continue;
      Send = nullptr;
      AnyRetired = true;
    }
  }

  // Drop the nulled-out entries, keeping the order of the survivors.
  if (AnyRetired) {
    LiveSends.erase(std::remove(LiveSends.begin(), LiveSends.end(),
                                static_cast<G4_INST *>(nullptr)),
                    LiveSends.end());
  }
}
|
|
|
|
// Limit the number of live sends and clear all sends at the end of a block.
|
|
void Optimizer::clearSendDependencies() {
|
|
for (auto BB : fg) {
|
|
// Live send instructions. This vector will only have MAX_SENDS
|
|
// or less instructions.
|
|
const unsigned MAX_SENDS = 3;
|
|
std::vector<G4_INST *> LiveSends;
|
|
|
|
for (auto I = BB->begin(); I != BB->end(); /*empty*/) {
|
|
auto CurI = I++;
|
|
G4_INST *Inst = *CurI;
|
|
|
|
// Try to retire live sends.
|
|
retireSends(LiveSends, Inst);
|
|
if (!Inst->isSend())
|
|
continue;
|
|
|
|
// This is a send.
|
|
if (LiveSends.size() >= MAX_SENDS) {
|
|
// OK, too many live sends. Retire the earliest live send.
|
|
G4_INST *SI = LiveSends.front();
|
|
G4_INST *MovInst = emitRetiringMov(builder, BB, SI, CurI);
|
|
retireSends(LiveSends, MovInst);
|
|
vASSERT(LiveSends.size() < MAX_SENDS);
|
|
}
|
|
|
|
// If this is EOT and send queue is not full, then nothing to do.
|
|
// Otherwise a new send becomes live.
|
|
if (Inst->isEOT())
|
|
LiveSends.clear();
|
|
else
|
|
LiveSends.push_back(Inst);
|
|
}
|
|
|
|
// Retire remainig live sends in this block, if any.
|
|
for (auto SI : LiveSends) {
|
|
vASSERT(SI && SI->isSend());
|
|
auto InsertBefore = BB->end();
|
|
G4_INST *LastInst = BB->back();
|
|
if (LastInst->isFlowControl())
|
|
InsertBefore = std::prev(InsertBefore);
|
|
emitRetiringMov(builder, BB, SI, InsertBefore);
|
|
}
|
|
}
|
|
}
|