/*========================== begin_copyright_notice ============================

Copyright (C) 2017-2022 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

#include "Optimizer.h"
#include "Assertions.h"
#include "G4_Opcode.h"
#include "G4_Verifier.hpp"
#include "Timer.h"
#include "ifcvt.h"
#include "Common_BinaryEncoding.h"
#include "DebugInfo.h"
#include "FlowGraph.h"
#include "Passes/AccSubstitution.hpp"
#include "PointsToAnalysis.h"
#include "Passes/SRSubstitution.hpp"
#include "Passes/InstCombine.hpp"
#include "Passes/LVN.hpp"
#include "Passes/MergeScalars.hpp"
#include "Passes/SendFusion.hpp"
#include "Passes/StaticProfiling.hpp"

// clang-format off
#include "common/LLVMWarningsPush.hpp"
#include "llvm/Support/Allocator.h"
#include "common/LLVMWarningsPop.hpp"
// clang-format on

#include <optional>
#include <algorithm>
#include <chrono>
#include <fstream>
#include <iomanip>
#include <map>
#include <random>
#include <sstream>
#include <tuple>
#include <vector>

using namespace vISA;

void Optimizer::LVN() {
  // Run a simple LVN pass that replaces redundant
  // immediate loads in current BB. Also this pass
  // does not optimize operations like a
  // conventional VN pass because those require
  // more compile time, and are presumably already
  // done by FE generating VISA. This pass catches
  // redundancies that got introduced mainly by HW
  // conformity or due to VISA lowering.
  int numInstsRemoved = 0;
  PointsToAnalysis p(kernel.Declares, kernel.fg.getNumBB());
  p.doPointsToAnalysis(kernel.fg);
  for (auto bb : kernel.fg) {
    ::LVN lvn(fg, bb, *fg.builder, p);
    lvn.doLVN();
    numInstsRemoved += lvn.getNumInstsRemoved();
    numInstsRemoved += ::LVN::removeRedundantSamplerMovs(kernel, bb);
  }

  VISA_DEBUG({
    std::cout << "===== LVN ====="
              << "\n";
    std::cout << "Number of instructions removed: " << numInstsRemoved << "\n"
              << "\n";
  });
}

// helper functions

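// The two helpers below compute an operand's effective sub-register offset:
// when the base is not already a physical register, the region's declared
// sub-register offset is combined with the sub-register offset assigned to
// the underlying G4_RegVar.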
static int getDstSubReg(G4_DstRegRegion *dst) {
  int dstSubReg;
  if (dst->getBase()->isPhyReg()) {
    dstSubReg = dst->getSubRegOff();
  } else {
    dstSubReg = dst->getSubRegOff() +
                static_cast<G4_RegVar *>(dst->getBase())->getPhyRegOff();
  }

  return dstSubReg;
}

static int getSrcSubReg(G4_Operand *src) {
  vISA_ASSERT(src->isSrcRegRegion(), "expect Src Reg Region");
  int srcSubReg;
  if (src->asSrcRegRegion()->getBase()->isPhyReg()) {
    srcSubReg = src->asSrcRegRegion()->getSubRegOff();
  } else {
    srcSubReg = src->asSrcRegRegion()->getSubRegOff() +
                static_cast<G4_RegVar *>(src->asSrcRegRegion()->getBase())
                    ->getPhyRegOff();
  }
  return srcSubReg;
}

//
// determine if fall-through jump is needed
//
// also remove redundant jumps
// if there is no predicate applied to a jump and its target is its
// fall-through BB, remove the jump instruction.

void Optimizer::insertFallThroughJump() {

  fg.setPhysicalPredSucc();
  for (BB_LIST_ITER it = fg.begin(); it != fg.end();) {
    G4_BB *bb = *it;
    BB_LIST_ITER next = ++it;
    //
    // determine if the current bb needs a fall through jump
    // check if the fall-through bb follows the current bb
    //
    G4_BB *fb = bb->fallThroughBB();
    if (fb && (next == fg.end() || // bb is the last bb
               fb != (*next))) {
      // This is bogus in SIMD CF, as bad things happen when you randomly insert
      // jumps in the middle of SIMD CF
    } else if (next != fg.end()) {
      // do not remove a jmpi if it's the target of an indirect jmp
      // this makes the code more readable
      if (!(*next)->empty() && (*next)->front()->isLabel() && !bb->empty() &&
          bb->back()->opcode() == G4_jmpi &&
          bb->back()->getPredicate() == NULL &&
          !fg.isIndirectJmpTarget(bb->back())) {
        if ((*next)->front()->getSrc(0) == bb->back()->getSrc(0)) {
          std::list<G4_INST *>::iterator it = bb->end();
          it--;
          bb->erase(it);
        }
      }
    }
    it = next;
  }
}

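// forceAssignRegs consumes the vISA_ForceAssignRhysicalReg option string. As
// parsed below, it is a comma-separated list of entries of the form
// "<decl name or id>:<reg>[.<subreg>]"; e.g. (values here are only
// illustrative) "V33:10.2,42:5" would pin V33 to r10.2 and decl id 42 to r5.0.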
void Optimizer::forceAssignRegs() {
  const char *rawStr =
      builder.getOptions()->getOptionCstr(vISA_ForceAssignRhysicalReg);
  if (!rawStr)
    return;

  llvm::StringRef line(rawStr);
  llvm::SmallVector<llvm::StringRef, 4> assignments;
  line.split(assignments, ',');
  std::map<std::string /*decl name or id*/,
           std::pair<int /*reg*/, int /*subreg*/>> forceAssign;
  for (llvm::StringRef assignment : assignments) {
    llvm::StringRef decl, reg, subreg;
    std::tie(decl, reg) = assignment.split(':');
    std::tie(reg, subreg) = reg.split('.');
    int regNum = std::stoi(reg.str());
    int subregNum = subreg.empty() ? 0 : std::stoi(subreg.str());
    forceAssign[decl.str()] = std::make_pair(regNum, subregNum);
  }

  for (G4_Declare *dcl : kernel.Declares) {
    int reg, subreg;
    // skip forcing register assignment for the decl if neither its name nor
    // its id is specified in the option. Name will be used if both are given.
    auto it = forceAssign.find(dcl->getName());
    if (it == forceAssign.end()) {
      it = forceAssign.find(std::to_string(dcl->getDeclId()));
      if (it == forceAssign.end())
        continue;
    }
    std::tie(reg, subreg) = it->second;
    dcl->getRegVar()->setPhyReg(builder.phyregpool.getGreg(reg), subreg);
    VISA_DEBUG({
      std::cerr << "Force assigning Decl : " << it->first
                << " to r" << reg << "." << subreg << "\n";
      dcl->dump();
    });
  }
}

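// forceSpillVars reads vISA_ForceSpillVariables as a comma-separated list of
// decl ids and marks the matching declares as force-spilled.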
void Optimizer::forceSpillVars() {
  const char *rawStr =
      builder.getOptions()->getOptionCstr(vISA_ForceSpillVariables);
  if (!rawStr)
    return;

  llvm::StringRef line(rawStr);
  llvm::SmallVector<llvm::StringRef, 4> vars;
  line.split(vars, ',');
  std::vector<int> token;

  for (llvm::StringRef var : vars)
    token.push_back(std::stoi(var.str()));

  for (G4_Declare *dcl : kernel.Declares) {
    if (std::find(token.begin(), token.end(), dcl->getDeclId()) !=
        token.end()) {
      dcl->setForceSpilled();
    }
  }
}

void Optimizer::preRegAlloc() {
  forceAssignRegs();
  forceSpillVars();
}

void Optimizer::regAlloc() {

  fg.prepareTraversal();

  // realR0 and BuiltInR0 are 2 different dcls.
  // realR0 is always tied to physical r0.
  // if a copy of r0 isn't needed, then set the latter to r0 as well.
  // if a copy of r0 is required, then let RA decide the allocation of BuiltInR0.
  if (!R0CopyNeeded()) {
    // when no copy is needed, make BuiltInR0 an alias of realR0
    builder.getBuiltinR0()->setAliasDeclare(builder.getRealR0(), 0);
    builder.getBuiltinR0()->getRegVar()->setPhyReg(
        builder.getRealR0()->getRegVar()->getPhyReg(), 0);
  }

  //
  // assign registers
  //
  int status = ::regAlloc(builder, builder.phyregpool, kernel);
  if (status == VISA_EARLY_EXIT) {
    EarlyExited = true;
  } else if (status != VISA_SUCCESS) {
    RAFail = true;
  }
}

// HW debugging needs to zero certain ARF registers such as a0, acc, etc.
// Here, we zero a0, acc, and the flag registers on entry to a kernel.
void Optimizer::zeroSomeARF() {
  if (builder.getIsKernel()) {
    // The first BB is not necessarily the kernel's entry when kernel needs to
    // load its payload!
    G4_BB *mainBB = fg.getEntryBB();
    if (builder.loadThreadPayload()) {
      // Make sure to skip prolog BBs to insert into the 1st BB of a kernel.
      // [perThreadBB:]
      // crossThreadBB:
      // main:
      if (G4_BB *crossThreadBB = kernel.getCrossThreadPayloadBB()) {
        vASSERT(crossThreadBB->Succs.size() == 1);
        mainBB = crossThreadBB->Succs.front();
      } else if (G4_BB *perThreadBB = kernel.getPerThreadPayloadBB()) {
        vASSERT(perThreadBB->Succs.size() == 1);
        mainBB = perThreadBB->Succs.front();
      }
    }

    INST_LIST_ITER insertBeforePos = mainBB->getFirstInsertPos();

    // Zero all address ARF
    G4_DstRegRegion *A0Dst =
        builder.createDst(builder.phyregpool.getAddrReg(), 0, 0, 1, Type_UD);
    G4_INST *zeroA0 =
        builder.createMov(g4::SIMD8, A0Dst, builder.createImm(0, Type_UD),
                          InstOpt_WriteEnable, false);
    (void)mainBB->insertBefore(insertBeforePos, zeroA0);

    // Zero acc ARF (at least two, some platforms have more).
    G4_DstRegRegion *Acc0Dst =
        builder.createDst(builder.phyregpool.getAcc0Reg(), 0, 0, 1, Type_UD);
    G4_INST *zeroAcc0 = builder.createMov(builder.getNativeExecSize(), Acc0Dst,
                                          builder.createImm(0, Type_UD),
                                          InstOpt_WriteEnable, false);
    (void)mainBB->insertBefore(insertBeforePos, zeroAcc0);

    G4_DstRegRegion *Acc1Dst =
        builder.createDst(builder.phyregpool.getAcc1Reg(), 0, 0, 1, Type_UD);
    G4_INST *zeroAcc1 = builder.createMov(builder.getNativeExecSize(), Acc1Dst,
                                          builder.createImm(0, Type_UD),
                                          InstOpt_WriteEnable, false);
    (void)mainBB->insertBefore(insertBeforePos, zeroAcc1);

    // Zero flags
    int num32bitFlags = (int)(builder.getNumFlagRegisters() / 2);
    for (int i = 0; i < num32bitFlags; ++i) {
      G4_DstRegRegion *flagDst = builder.createDst(
          builder.phyregpool.getFlagAreg(i), 0, 0, 1, Type_UD);
      G4_INST *zeroFlag =
          builder.createMov(g4::SIMD1, flagDst, builder.createImm(0, Type_UD),
                            InstOpt_WriteEnable, false);
      (void)mainBB->insertBefore(insertBeforePos, zeroFlag);
    }
  }
}

void Optimizer::addSWSBInfo() {
  if (builder.hasDPAS() && builder.hasDPASFuseRSWA()) {
    // Currently the DPASFuseRSWA is tied to SWSB, so we make the
    // preparation work the first part of addSWSBInfo.
    prepareDPASFuseRSWA();
  }

  bool do_fcall_wa = builder.hasFusedEU() &&
                     builder.getuint32Option(vISA_fusedCallWA) == 1 &&
                     (kernel.fg.getHasStackCalls() || kernel.hasIndirectCall());

  if (do_fcall_wa) {
    // Need to be done before SWSB
    finishFusedCallWA_preSWSB();
  }

  if (!builder.hasSWSB()) {
    if (do_fcall_wa) {
      finishFusedCallWA();
    }
    return;
  }

  if (!builder.getOption(vISA_forceDebugSWSB)) {
    SWSB swsb(kernel);
    swsb.SWSBGenerator();
  } else {
    forceDebugSWSB(&kernel);
  }

  if (builder.getOptions()->getuInt32Option(vISA_SWSBTokenBarrier) != 0) {
    singleInstStallSWSB(
        &kernel, builder.getOptions()->getuInt32Option(vISA_SWSBTokenBarrier),
        0, true);
  }

  if (builder.getOptions()->getuInt32Option(vISA_SWSBInstStall) != 0) {
    singleInstStallSWSB(
        &kernel, builder.getOptions()->getuInt32Option(vISA_SWSBInstStall),
        builder.getOptions()->getuInt32Option(vISA_SWSBInstStallEnd), false);
  }

  if (do_fcall_wa) {
    // Need to be done when code is stable (no add, no delete).
    finishFusedCallWA();
  } else if (kernel.hasIndirectCall() && !builder.supportCallaRegSrc()) {
    adjustIndirectCallOffsetAfterSWSBSet();
  }
  return;
}

// Common pass for HW debug functions
void Optimizer::HWDebug() {
  if (builder.getOption(vISA_InsertHashMovs))
    insertHashMovs();
}

void Optimizer::insertHashMovs() {
  // As per request from IGC team, we want to conditionally insert
  // two mov instructions like the following:
  //
  //   send ... {EOT}
  //   mov (16) null<1>:d lo32 {NoMask}
  //   mov (16) null<1>:d hi32 {NoMask}
  //
  bool hashAtPrologue = kernel.getOption(vISA_HashMovsAtPrologue);
  for (G4_BB *bb : kernel.fg) {
    for (auto it = bb->begin(); it != bb->end(); ++it) {
      auto inst = (*it);
      if (inst->isEOT() || hashAtPrologue) {
        auto insertBefore = it;
        if (inst->isLabel())
          ++insertBefore;
        // We have to insert new instructions after EOT.
        // Lexically, EOT could even be in the middle
        // of the program.
        auto insertHashMovInsts = [&](uint64_t hashVal) {
          G4_INST *lo;
          G4_INST *hi;
          lo = kernel.fg.builder->createMov(
              g4::SIMD16, kernel.fg.builder->createNullDst(Type_UD),
              kernel.fg.builder->createImm((unsigned int)(hashVal & 0xffffffff),
                                           Type_UD),
              InstOpt_WriteEnable, false);

          hi = kernel.fg.builder->createMov(
              g4::SIMD16, kernel.fg.builder->createNullDst(Type_UD),
              kernel.fg.builder->createImm(
                  (unsigned int)((hashVal >> 32) & 0xffffffff), Type_UD),
              InstOpt_WriteEnable, false);
          // Option: -hashmovs hi lo
          // To be consistent, 'mov hi' goes before 'mov lo'
          if (hashAtPrologue) {
            bb->insertBefore(insertBefore, hi);
            bb->insertBefore(insertBefore, lo);
          } else {
            bb->push_back(hi);
            bb->push_back(lo);
          }
        };

        // This func is called when vISA_HashVal is set by user;
        // but vISA_HashVal1 is still optional.
        uint64_t hashVal = builder.getOptions()->getuInt64Option(vISA_HashVal);
        insertHashMovInsts(hashVal);
        // vISA_HashVal1 is an extra hash value used to distinguish each entry
        // in the module. That works for IGC as IGC invokes vISA for each
        // kernel. However, VC invokes vISA for the whole module, so here we use
        // a unique id for the purpose. Note that if vISA_HashVal1 is given,
        // the value would still be used to emit the extra hash.
        if (builder.getOptions()->isOptionSetByUser(vISA_HashVal1)) {
          uint64_t hashVal1 =
              builder.getOptions()->getuInt64Option(vISA_HashVal1);
          insertHashMovInsts(hashVal1);
        } else if (kernel.getKernelType() == VISA_CM) {
          insertHashMovInsts(kernel.getFunctionId());
        }
        return;
      }
    }
  }
}

//
// Break a sample instruction
//   send.smpl (16) dst src0 src1
// into
//   (P1) send.smpl (16) dst src0 src1
//   (~P1) send.smpl (16) dst src0 src1
// where P1 is 0x5555 (i.e., pixels with even x coordinates)
// Ideally this would only affect 3d textures, but at
// the moment it will affect 2d array textures as well.
//
// P1 is initialized per BB before the first sample inst; we could make it per
// shader, but I'm worried about flag spill. This works for SIMD8 and SIMD32
// shaders as well.
//
void Optimizer::cloneSampleInst() {
  bool cloneSample = builder.getOption(vISA_enableCloneSampleInst) &&
                     VISA_WA_CHECK(builder.getPWaTable(), Wa_14014414195);
  bool cloneEvaluateSample = builder.getOption(vISA_cloneEvaluateSampleInst);
  if (!cloneSample && !cloneEvaluateSample) {
    return;
  }

  bool isSIMD32 = kernel.getSimdSize() == 32;
  for (auto &&bb : kernel.fg) {
    auto tmpFlag = builder.createTempFlag(isSIMD32 ? 2 : 1);
    auto hasSample = false;
    for (auto I = bb->begin(), E = bb->end(); I != E;) {
      auto Next = std::next(I);
      auto inst = *I;
      if (inst->isSend() && inst->getMsgDesc()->getSFID() == SFID::SAMPLER &&
          inst->getMsgDescRaw() != nullptr &&
          inst->getExecSize() >= builder.getNativeExecSize()) {
        G4_InstSend *sendInst = inst->asSendInst();
        G4_Operand *src0 = sendInst->getSrc(0);

        unsigned int messageSizeInBytes =
            src0->getRightBound() - src0->getLeftBound() + 1;
        if (sendInst->isSplitSend()) {
          G4_Operand *src1 = sendInst->getSrc(1);
          messageSizeInBytes +=
              src1->getRightBound() - src1->getLeftBound() + 1;
        }
        if (sendInst->getMsgDescRaw()->isHeaderPresent()) {
          messageSizeInBytes -= kernel.getGRFSize();
        }
        unsigned int numParams = messageSizeInBytes / kernel.getGRFSize() *
                                 builder.getNativeExecSize() /
                                 inst->getExecSize();
        bool isEval = sendInst->getMsgDesc()->getDstLenRegs() == 0;
        uint32_t messageType =
            sendInst->getMsgDescRaw()->getSamplerMessageType();
        vISA_ASSERT(!inst->getPredicate(),
                    "do not handle predicated sampler inst for now");
        if (!isEval && cloneSample && messageType == 0 && numParams == 3) {
          if (!hasSample) {
            hasSample = true;
            auto flagInit = builder.createMov(
                g4::SIMD1,
                builder.createDst(tmpFlag->getRegVar(),
                                  isSIMD32 ? Type_UD : Type_UW),
                builder.createImm(isSIMD32 ? 0x55555555 : 0x5555,
                                  isSIMD32 ? Type_UD : Type_UW),
                InstOpt_WriteEnable, false);
            bb->insertBefore(I, flagInit);
          }
          auto newInst = inst->cloneInst();
          inst->setPredicate(
              builder.createPredicate(PredState_Plus, tmpFlag->getRegVar(), 0));
          newInst->setPredicate(builder.createPredicate(
              PredState_Minus, tmpFlag->getRegVar(), 0));
          auto newInstIt = bb->insertAfter(I, newInst);

          uint16_t rspLen =
              inst->asSendInst()->getMsgDescRaw()->ResponseLength();
          // If Pixel Null Mask feedback is requested sampler message
          // has header, all data channels enabled and an additional
          // GRF of writeback payload with Pixel Null Mask.
          // Possible message response lengths are:
          // - 5 GRFs for all simd8 messages and for simd16 messages
          //   with 16-bit return format
          // - 9 GRFs for simd16 message with 32-bit return format
          // It is enough to check send's response length to determine
          // if Pixel Null Mask feedback is enabled.
          vASSERT(inst->getExecSize() == g4::SIMD8 ||
                  inst->getExecSize() == g4::SIMD16);
          uint16_t pixelNullMaskRspLen =
              (inst->getExecSize() == g4::SIMD16 &&
               !sendInst->getMsgDescRaw()->is16BitReturn())
                  ? 9
                  : 5;

          if (sendInst->getMsgDescRaw()->isHeaderPresent() &&
              rspLen == pixelNullMaskRspLen) {
            // Pixel Null Mask is in the first word of the last GRF
            // of send's writeback message. This mask has bits set
            // to 0 for pixels in which a null page was source for
            // at least one texel. Otherwise bits are set to 1.

            // Create a copy of Pixel Null Mask from the first send
            // writeback message and AND it with the mask from the
            // second send.
            G4_Declare *maskCopy = builder.createTempVar(1, Type_UW, Any);
            G4_Declare *maskAlias = builder.createTempVar(1, Type_UW, Any);
            maskAlias->setAliasDeclare(
                inst->getDst()->getBase()->asRegVar()->getDeclare(),
                (inst->getDst()->getRegOff() + rspLen - 1) *
                    kernel.numEltPerGRF<Type_UB>());
            G4_SrcRegRegion *src = builder.createSrcRegRegion(
                maskAlias, builder.getRegionScalar());
            G4_DstRegRegion *dst =
                builder.createDst(maskCopy->getRegVar(), Type_UW);
            G4_INST *movInst = builder.createMov(g4::SIMD1, dst, src,
                                                 InstOpt_WriteEnable, false);
            bb->insertAfter(I, movInst);
            G4_SrcRegRegion *src0 = builder.createSrcRegRegion(*src);
            G4_SrcRegRegion *src1 =
                builder.createSrcRegRegion(maskCopy, builder.getRegionScalar());
            dst = builder.createDst(maskAlias->getRegVar(), Type_UW);
            G4_INST *andInst = builder.createBinOp(
                G4_and, g4::SIMD1, dst, src0, src1, InstOpt_WriteEnable, false);
            bb->insertAfter(newInstIt, andInst);
          }
        } else if (isEval && cloneEvaluateSample && messageType != 0x1F) {
          // 0x1F is the opcode for sampler cache flush
          uint32_t newExecSize =
              (messageType == VISA_3D_SAMPLE_L || messageType == VISA_3D_LD)
                  ? 8
                  : 1;
          uint32_t mask = (1 << newExecSize) - 1;
          auto evalTmpFlag = builder.createTempFlag(isSIMD32 ? 2 : 1);
          auto flagInit = builder.createMov(
              g4::SIMD1,
              builder.createDst(evalTmpFlag->getRegVar(),
                                isSIMD32 ? Type_UD : Type_UW),
              builder.createImm(mask, isSIMD32 ? Type_UD : Type_UW),
              InstOpt_WriteEnable, false);
          bb->insertBefore(I, flagInit);
          inst->setPredicate(builder.createPredicate(
              PredState_Plus, evalTmpFlag->getRegVar(), 0));
          unsigned numInsts = kernel.getSimdSize() / newExecSize;
          for (unsigned int i = 1; i < numInsts; i++) {
            auto newInst = inst->cloneInst();
            bb->insertAfter(I, newInst);
            evalTmpFlag = builder.createTempFlag(isSIMD32 ? 2 : 1);
            flagInit = builder.createMov(
                g4::SIMD1,
                builder.createDst(evalTmpFlag->getRegVar(),
                                  isSIMD32 ? Type_UD : Type_UW),
                builder.createImm(mask << (i * newExecSize),
                                  isSIMD32 ? Type_UD : Type_UW),
                InstOpt_WriteEnable, false);
            newInst->setPredicate(builder.createPredicate(
                PredState_Plus, evalTmpFlag->getRegVar(), 0));
            bb->insertAfter(I, flagInit);
          }
        }
      }
      I = Next;
    }
  }
}

void Optimizer::removeLifetimeOps() {
  // Call this function after RA only.

  // Remove all pseudo_kill and lifetime.end
  // instructions.
  // Also remove pseudo_use instructions.
  for (G4_BB *bb : kernel.fg) {
    bb->erase(std::remove_if(bb->begin(), bb->end(),
                             [](G4_INST *inst) {
                               return inst->isPseudoKill() ||
                                      inst->isLifeTimeEnd() ||
                                      inst->isPseudoUse();
                             }),
              bb->end());
  }
}

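// runPass drives a single entry of the Passes table: it skips the pass when
// its controlling option is off (or when we already exited early), honors the
// stop-before/stop-after pass names in offline builds, times the pass, dumps
// the kernel before and after it, and verifies the G4 IR in debug builds.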
void Optimizer::runPass(PassIndex Index) {
  const PassInfo &PI = Passes[Index];

  // Do not execute.
  if ((PI.Option != vISA_EnableAlways && !builder.getOption(PI.Option)) ||
      EarlyExited)
    return;

  std::string Name = PI.Name;

#ifndef DLL_MODE
  if (StopBeforePass == Name) {
    EarlyExited = true;
    kernel.dumpToConsole();
    return;
  }
#endif // DLL_MODE

  setCurrentDebugPass(PI.Name);

  if (PI.Timer != TimerID::NUM_TIMERS)
    startTimer(PI.Timer);

  kernel.dumpToFile("before." + Name);

  // Execute pass.
  (this->*(PI.Pass))();

  if (PI.Timer != TimerID::NUM_TIMERS)
    stopTimer(PI.Timer);

  kernel.dumpToFile("after." + Name);
#ifndef DLL_MODE
  // Only check for stop-after in offline build as it's intended for vISA
  // debugging only. Note that stop-after does not work if the pass is not
  // executed.
  if (StopAfterPass == Name || EarlyExited) {
    EarlyExited = true;
    kernel.dumpToConsole();
  }
#endif // DLL_MODE

#ifdef _DEBUG
  bool skipVerify = Index == PI_regAlloc && (RAFail || EarlyExited);
  if (!skipVerify) {
    verifyG4Kernel(kernel, Index, true, G4Verifier::VC_ASSERT);
  }
#endif
  setCurrentDebugPass(nullptr);
}

void Optimizer::initOptimizations() {
#define OPT_INITIALIZE_PASS(Name, Option, Timer)                               \
  Passes[PI_##Name] = PassInfo(&Optimizer::Name, "" #Name, Option, Timer)

  // To initialize a pass, the member function name is the first argument.
  // This member function must return void and take no argument.
  //
  // The second argument is the corresponding option to enable this pass.
  // If it always runs then use vISA_EnableAlways.
  //
  // The third argument is the intended timer for this pass. If no timing
  // is necessary, then TIMER_NUM_TIMERS can be used.
  //
  OPT_INITIALIZE_PASS(cleanMessageHeader, vISA_LocalCleanMessageHeader,
                      TimerID::OPTIMIZER);
  OPT_INITIALIZE_PASS(forceNoMaskOnM0, vISA_forceNoMaskOnM0, TimerID::OPTIMIZER);
  OPT_INITIALIZE_PASS(sendFusion, vISA_EnableSendFusion, TimerID::OPTIMIZER);
  OPT_INITIALIZE_PASS(renameRegister, vISA_LocalRenameRegister, TimerID::OPTIMIZER);
  OPT_INITIALIZE_PASS(localDefHoisting, vISA_LocalDefHoist, TimerID::OPTIMIZER);
  OPT_INITIALIZE_PASS(localCopyPropagation, vISA_LocalCopyProp, TimerID::OPTIMIZER);
  OPT_INITIALIZE_PASS(localInstCombine, vISA_LocalInstCombine, TimerID::OPTIMIZER);
  OPT_INITIALIZE_PASS(removePartialMovs, vISA_RemovePartialMovs,
                      TimerID::OPTIMIZER);
  OPT_INITIALIZE_PASS(cselPeepHoleOpt, vISA_enableCSEL, TimerID::OPTIMIZER);
  OPT_INITIALIZE_PASS(optimizeLogicOperation, vISA_EnableAlways,
                      TimerID::OPTIMIZER);
  OPT_INITIALIZE_PASS(EmulateInt64Add, vISA_EnableAlways, TimerID::HW_CONFORMITY);
  OPT_INITIALIZE_PASS(HWConformityChk, vISA_EnableAlways, TimerID::HW_CONFORMITY);
  OPT_INITIALIZE_PASS(preRA_Schedule, vISA_preRA_Schedule,
                      TimerID::PRERA_SCHEDULING);
  OPT_INITIALIZE_PASS(preRA_HWWorkaround, vISA_EnableAlways, TimerID::MISC_OPTS);
  OPT_INITIALIZE_PASS(preRegAlloc, vISA_EnableAlways, TimerID::MISC_OPTS);
  OPT_INITIALIZE_PASS(regAlloc, vISA_EnableAlways, TimerID::TOTAL_RA);
  OPT_INITIALIZE_PASS(removeLifetimeOps, vISA_EnableAlways, TimerID::MISC_OPTS);
  OPT_INITIALIZE_PASS(postRA_HWWorkaround, vISA_EnableAlways, TimerID::MISC_OPTS);
  OPT_INITIALIZE_PASS(removeRedundMov, vISA_removeRedundMov, TimerID::MISC_OPTS);
  OPT_INITIALIZE_PASS(removeEmptyBlocks, vISA_EnableAlways, TimerID::MISC_OPTS);
  OPT_INITIALIZE_PASS(insertFallThroughJump, vISA_EnableAlways, TimerID::MISC_OPTS);
  OPT_INITIALIZE_PASS(reassignBlockIDs, vISA_EnableAlways, TimerID::MISC_OPTS);
  OPT_INITIALIZE_PASS(evalAddrExp, vISA_EnableAlways, TimerID::MISC_OPTS);
  OPT_INITIALIZE_PASS(FoldAddrImmediate, vISA_FoldAddrImmed, TimerID::MISC_OPTS);
  OPT_INITIALIZE_PASS(localSchedule, vISA_LocalScheduling, TimerID::SCHEDULING);
  OPT_INITIALIZE_PASS(HWWorkaround, vISA_EnableAlways, TimerID::MISC_OPTS);
  OPT_INITIALIZE_PASS(fixEndIfWhileLabels, vISA_EnableAlways, TimerID::NUM_TIMERS);
  OPT_INITIALIZE_PASS(HWDebug, vISA_EnableAlways, TimerID::NUM_TIMERS);
  OPT_INITIALIZE_PASS(insertDummyMovForHWRSWA, vISA_InsertDummyMovForHWRSWA,
                      TimerID::NUM_TIMERS);
  OPT_INITIALIZE_PASS(insertDummyCompactInst, vISA_InsertDummyCompactInst,
                      TimerID::NUM_TIMERS);
  OPT_INITIALIZE_PASS(swapSrc1Src2OfMadForCompaction,
                      vISA_SwapSrc1Src2OfMadForCompaction,
                      TimerID::NUM_TIMERS);
  OPT_INITIALIZE_PASS(mergeScalarInst, vISA_MergeScalar, TimerID::OPTIMIZER);
  OPT_INITIALIZE_PASS(lowerMadSequence, vISA_EnableMACOpt, TimerID::OPTIMIZER);
  OPT_INITIALIZE_PASS(LVN, vISA_LVN, TimerID::OPTIMIZER);
  OPT_INITIALIZE_PASS(ifCvt, vISA_ifCvt, TimerID::OPTIMIZER);
  OPT_INITIALIZE_PASS(dumpPayload, vISA_dumpPayload, TimerID::MISC_OPTS);
  OPT_INITIALIZE_PASS(normalizeRegion, vISA_EnableAlways, TimerID::MISC_OPTS);
  OPT_INITIALIZE_PASS(collectStats, vISA_EnableAlways, TimerID::MISC_OPTS);
  OPT_INITIALIZE_PASS(createR0Copy, vISA_EnableAlways, TimerID::MISC_OPTS);
  OPT_INITIALIZE_PASS(initializePayload, vISA_InitPayload, TimerID::NUM_TIMERS);
  OPT_INITIALIZE_PASS(cleanupBindless, vISA_enableCleanupBindless,
                      TimerID::OPTIMIZER);
  OPT_INITIALIZE_PASS(cleanupA0Movs, vISA_enableCleanupA0Movs,
                      TimerID::OPTIMIZER);
  OPT_INITIALIZE_PASS(countGRFUsage, vISA_PrintRegUsage, TimerID::MISC_OPTS);
  OPT_INITIALIZE_PASS(changeMoveType, vISA_ChangeMoveType, TimerID::MISC_OPTS);
  OPT_INITIALIZE_PASS(accSubBeforeRA, vISA_accSubBeforeRA, TimerID::OPTIMIZER);
  OPT_INITIALIZE_PASS(accSubPostSchedule, vISA_accSubstitution, TimerID::OPTIMIZER);
  OPT_INITIALIZE_PASS(s0SubAfterRA, vISA_EnableAlways, TimerID::OPTIMIZER);
  OPT_INITIALIZE_PASS(removePseudoMov, vISA_EnableAlways,
                      TimerID::OPTIMIZER);
  OPT_INITIALIZE_PASS(dce, vISA_EnableDCE, TimerID::OPTIMIZER);
  OPT_INITIALIZE_PASS(reassociateConst, vISA_reassociate, TimerID::OPTIMIZER);
  OPT_INITIALIZE_PASS(split4GRFVars, vISA_split4GRFVar, TimerID::OPTIMIZER);
  OPT_INITIALIZE_PASS(loadThreadPayload, vISA_loadThreadPayload,
                      TimerID::MISC_OPTS);
  OPT_INITIALIZE_PASS(addFFIDProlog, vISA_addFFIDProlog, TimerID::MISC_OPTS);
  OPT_INITIALIZE_PASS(addEmaskSetupProlog, vISA_addEmaskSetupProlog,
                      TimerID::MISC_OPTS);
  OPT_INITIALIZE_PASS(insertFenceBeforeEOT, vISA_EnableAlways, TimerID::MISC_OPTS);
  OPT_INITIALIZE_PASS(insertScratchReadBeforeEOT, vISA_clearScratchWritesBeforeEOT,
                      TimerID::MISC_OPTS);
  OPT_INITIALIZE_PASS(mapOrphans, vISA_EnableAlways, TimerID::MISC_OPTS);
  OPT_INITIALIZE_PASS(legalizeType, vISA_EnableAlways, TimerID::MISC_OPTS);
  OPT_INITIALIZE_PASS(analyzeMove, vISA_analyzeMove, TimerID::MISC_OPTS);
  OPT_INITIALIZE_PASS(removeIntrinsics, vISA_EnableAlways, TimerID::MISC_OPTS);
  OPT_INITIALIZE_PASS(expandMulPostSchedule, vISA_expandMulPostSchedule,
                      TimerID::MISC_OPTS);
  OPT_INITIALIZE_PASS(zeroSomeARF, vISA_zeroSomeARF, TimerID::MISC_OPTS);
  OPT_INITIALIZE_PASS(addSWSBInfo, vISA_addSWSBInfo, TimerID::SWSB);
  OPT_INITIALIZE_PASS(expandMadwPostSchedule, vISA_expandMadwPostSchedule,
                      TimerID::MISC_OPTS);
  OPT_INITIALIZE_PASS(ACCSchedule, vISA_PreSchedForAcc, TimerID::PRERA_SCHEDULING);
  OPT_INITIALIZE_PASS(staticProfiling, vISA_staticProfiling, TimerID::MISC_OPTS);
  OPT_INITIALIZE_PASS(sinkBarrierWait, vISA_SinkBarrierWait,
                      TimerID::OPTIMIZER);

  // Verify all passes are initialized.
#ifdef _DEBUG
  for (unsigned i = 0; i < PI_NUM_PASSES; ++i) {
    vISA_ASSERT(Passes[i].Pass, "uninitialized pass");
  }
#endif
}

// simple heuristics to decide if it's profitable to do copy propagation for the
// move. Add more as necessary.
bool Optimizer::isCopyPropProfitable(G4_INST *movInst) const {
  vISA_ASSERT(movInst->opcode() == G4_mov, "expected a move instruction");

  // if inst is a simd16 W/HF packing, we don't want to optimize it if
  // there are >=2 simd16 mad uses, since it will slow down the mad.
  // for gen9 additionally check for simd8 mad as it doesn't support strided
  // regions
  auto dst = movInst->getDst();
  auto src0 = movInst->getSrc(0);
  auto hasStrideSource = dst->getHorzStride() == 1 && src0->isSrcRegRegion() &&
                         !(src0->asSrcRegRegion()->getRegion()->isContiguous(
                               movInst->getExecSize()) ||
                           src0->asSrcRegRegion()->getRegion()->isScalar());

  hasStrideSource &=
      movInst->getExecSize() == g4::SIMD16 ||
      (!builder.hasAlign1Ternary() && movInst->getExecSize() == g4::SIMD8);

  auto hasNSIMD16or8MadUse = [](G4_INST *movInst, int N, bool checkSIMD8) {
    int numMadUses = 0;
    for (auto iter = movInst->use_begin(), iterEnd = movInst->use_end();
         iter != iterEnd; ++iter) {
      auto use = *iter;
      auto inst = use.first;
      if (inst->opcode() == G4_pseudo_mad &&
          (inst->getExecSize() == g4::SIMD16 ||
           (checkSIMD8 && inst->getExecSize() == g4::SIMD8))) {
        ++numMadUses;
        if (numMadUses == N) {
          return true;
        }
      }
    }
    return false;
  };

  if (hasStrideSource) {
    if (hasNSIMD16or8MadUse(movInst, 2, true)) {
      return false;
    }
  }

  // another condition where copy prop may not be profitable:
  // mov is from HF to F and dst is used in simd16 mad.
  // copy propagating away the move results in a mixed-mode mad, which is bad
  // for bank conflicts
  if (dst->getType() == Type_F && src0->getType() == Type_HF) {
    if (hasNSIMD16or8MadUse(movInst, 4, false)) {
      return false;
    }
  }
  return true;
}

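// Accumulator substitution after scheduling: refresh local dataflow, optionally
// localize candidates via HWConformity::localizeForAcc, then run AccSubPass.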
void Optimizer::accSubPostSchedule() {
  if (!builder.doAccSub() || !builder.getOption(vISA_doAccSubAfterSchedule)) {
    return;
  }

  kernel.fg.resetLocalDataFlowData();
  kernel.fg.localDataFlowAnalysis();

  if (builder.getOption(vISA_localizationForAccSub)) {
    HWConformity hwConf(builder, kernel);
    for (auto bb : kernel.fg) {
      hwConf.localizeForAcc(bb);
    }

    kernel.fg.resetLocalDataFlowData();
    kernel.fg.localDataFlowAnalysis();
  }

  AccSubPass accSub(builder, kernel);
  accSub.run();
}

void Optimizer::s0SubAfterRA() {
  if (!builder.enableSendIndirect()) {
    return;
  }

  kernel.fg.resetLocalDataFlowData();
  kernel.fg.localDataFlowAnalysis();

  SRSubPassAfterRA s0Sub(builder, kernel);
  s0Sub.run();
}

void Optimizer::accSubBeforeRA() {
  if (!builder.doAccSub() || !builder.getOption(vISA_doAccSubAfterSchedule)) {
    return;
  }

  kernel.fg.resetLocalDataFlowData();
  kernel.fg.localDataFlowAnalysis();

  if (builder.getOption(vISA_localizationForAccSub)) {
    HWConformity hwConf(builder, kernel);
    for (auto bb : kernel.fg) {
      hwConf.localizeForAcc(bb);
    }

    kernel.fg.resetLocalDataFlowData();
    kernel.fg.localDataFlowAnalysis();
  }

  AccSubPass accSub(builder, kernel);
  accSub.run();
}

bool Optimizer::R0CopyNeeded() {
  if (!builder.canReadR0()) {
    // If r0 cannot be read then r0 has to be copied
    // and cannot be said to be preserved in r0. In
    // other words, these 2 are mutually exclusive
    // options.
    vISA_ASSERT(!kernel.getOption(vISA_PreserveR0InR0),
                "opposing options for r0 detected");
    return true;
  }

  if (kernel.getOption(vISA_PreserveR0InR0)) {
    return false;
  }

  if (builder.getIsKernel() && kernel.fg.getHasStackCalls()) {
    // As per VISA ABI, last register in GRF file should
    // contain copy of r0.
    return true;
  }

  return false;
}

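// Top-level optimization pipeline. Passes run in the order listed below; each
// runPass call is a no-op unless the pass's controlling option is enabled
// (see initOptimizations), so this sequence doubles as the canonical pass
// order.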
int Optimizer::optimization() {
#ifndef DLL_MODE
  if (StopAfterPass == "CFGConstruction") {
    EarlyExited = true;
    kernel.dumpToConsole();
  }
#endif // DLL_MODE

  // remove redundant message headers.
  runPass(PI_cleanMessageHeader);

  // Set NoMask inst's mask offset to 0 if possible
  runPass(PI_forceNoMaskOnM0);

  runPass(PI_sendFusion);

  // rename registers.
  runPass(PI_renameRegister);

  runPass(PI_localDefHoisting);

  runPass(PI_removePartialMovs);

  runPass(PI_cleanupA0Movs);

  // remove redundant movs and fold some other patterns
  runPass(PI_localCopyPropagation);

  // fold some binary operations
  runPass(PI_localInstCombine);

  runPass(PI_mergeScalarInst);

  runPass(PI_cselPeepHoleOpt);

  runPass(PI_reassociateConst);

  runPass(PI_lowerMadSequence);

  // optimize logic operations
  runPass(PI_optimizeLogicOperation);

  // Dead code elimination
  runPass(PI_dce);

  // Emulate 64-bit int add if needed
  runPass(PI_EmulateInt64Add);

  // HW conformity check
  runPass(PI_HWConformityChk);

  // Local Value Numbering
  runPass(PI_LVN);

  // this must be run after copy prop cleans up the moves
  runPass(PI_cleanupBindless);

  runPass(PI_split4GRFVars);

  runPass(PI_insertFenceBeforeEOT);

  // PreRA scheduling
  runPass(PI_preRA_Schedule);

  // HW workaround before RA
  runPass(PI_preRA_HWWorkaround);

  if (builder.enableACCBeforRA() && builder.enablePreSchedACC()) {
    runPass(PI_ACCSchedule);
  }

  if (builder.enableACCBeforRA() && !builder.enablePreSchedACC()) {
    runPass(PI_accSubBeforeRA);
  }

  runPass(PI_preRegAlloc);

  // perform register allocation
  runPass(PI_regAlloc);
  if (RAFail) {
    return VISA_SPILL;
  }

  runPass(PI_removeLifetimeOps);

  // HW workaround after RA
  runPass(PI_postRA_HWWorkaround);

  //
  // if a fall-through BB does not immediately follow its predecessor
  // in the code layout, then insert a jump-to-fall-through in the predecessor
  //
  runPass(PI_insertFallThroughJump);

  // Run if-conversion to convert short if-blocks.
  runPass(PI_ifCvt);

  //
  // re-assign block ID so that we can use id to determine the ordering of
  // two blocks in the code layout
  //
  runPass(PI_reassignBlockIDs);

  runPass(PI_FoldAddrImmediate);

  // FIXME houjenko: Disable local scheduling due to issues when
  // using an extra register that may corrupt unknown liveout
  if (!builder.getIsPayload()) {
    runPass(PI_localSchedule);
  }

  if (!builder.enableACCBeforRA() && !builder.enablePreSchedACC()) {
    runPass(PI_expandMulPostSchedule);

    runPass(PI_expandMadwPostSchedule);

    runPass(PI_accSubPostSchedule);
  }

  runPass(PI_legalizeType);

  runPass(PI_changeMoveType);
  runPass(PI_s0SubAfterRA);

  // No pass after this should expect def-use to be preserved as this pass
  // removes raw movs with identical src/dst physical GRFs.
  runPass(PI_removeRedundMov);

  // remove any placeholder blocks inserted to aid regalloc
  // run this pass after reRA pass otherwise CFG can become
  // invalid (funcInfo, calleeInfo may point to bad initBB).
  runPass(PI_removeEmptyBlocks);

  runPass(PI_insertScratchReadBeforeEOT);

  runPass(PI_sinkBarrierWait);
  // HW workaround
  runPass(PI_HWWorkaround);

  runPass(PI_normalizeRegion);

  runPass(PI_countGRFUsage);

  runPass(PI_dumpPayload);

  // this must be the last step of the optimization so as to not violate
  // the CFG assumption
  runPass(PI_fixEndIfWhileLabels);

  runPass(PI_HWDebug);

  runPass(PI_insertDummyMovForHWRSWA);

  runPass(PI_collectStats);

  // Create a copy of R0 at the top of kernel.
  // This must be done after all other optimizer
  // passes except for loadThreadPayload
  runPass(PI_createR0Copy);

  runPass(PI_initializePayload);

  runPass(PI_loadThreadPayload);

  runPass(PI_addFFIDProlog);

  runPass(PI_addEmaskSetupProlog);

  // Insert a dummy compact instruction if requested for SKL+
  runPass(PI_insertDummyCompactInst);

  runPass(PI_swapSrc1Src2OfMadForCompaction);

  runPass(PI_mapOrphans);

  runPass(PI_analyzeMove);

  runPass(PI_removeIntrinsics);

  runPass(PI_zeroSomeARF);

  //-----------------------------------------------------------------------------------------------------------------
  //------NOTE!!!! No instruction change (add/remove, or operand associated
  // change) is allowed after SWSB-------------
  //-----------------------------------------------------------------------------------------------------------------
  runPass(PI_addSWSBInfo);

  runPass(PI_removePseudoMov);

  runPass(PI_staticProfiling);

  if (EarlyExited) {
    return VISA_EARLY_EXIT;
  }
  return VISA_SUCCESS;
}

// When constructing CFG we have the assumption that a label must be the first
// instruction in a bb. During structure analysis, however, we may end up with
// a bb that starts with multiple endifs if the bb is the target of multiple
// gotos that have been converted to an if. Instead of creating a BB for each
// of the endifs, we associate each endif with a label and emit them only at
// the very end.
//
// For break and continue, UIP must be the label directly attached to the while
// op. If not, create such a label.
//
// DO
//   IF
//     P =
//     CONT L1
//   ENDIF L1
//   IF
//     BREAK L2
//   ENDIF L1
// L1
// (P) WHILE
// L2
//
// will be transformed into
//
// DO
//   IF
//     P =
//     Spill <- P
//     CONT L3   // UIP becomes L3
//   ENDIF L1
//   IF
//     BREAK L3   // UIP becomes L3
//   ENDIF L1
// L1   // existing label
// P <- fill
// L3   // new label
// (P) WHILE
// L2
//
void Optimizer::fixEndIfWhileLabels() {
  for (BB_LIST_CITER iter = fg.cbegin(), bend = fg.cend(); iter != bend;
       ++iter) {
    G4_BB *bb = *iter;
    INST_LIST_ITER iter2 = bb->begin();
    INST_LIST_ITER iend = bb->end();
    while (iter2 != iend) {
      INST_LIST_ITER currIter = iter2;
      ++iter2;

      G4_INST *inst = *currIter;
      G4_Label *endifLabel = fg.getLabelForEndif(inst);
      if (endifLabel) {
        G4_INST *labelInst = fg.createNewLabelInst(endifLabel);
        bb->insertBefore(currIter, labelInst);
      }
    }
  }

  // Patch labels if necessary.
  for (G4_BB *bb : fg) {
    if (bb->empty())
      continue;

    G4_INST *inst = bb->back();
    G4_opcode opc = inst->opcode();
    if (opc != G4_cont && opc != G4_break)
      continue;

    // The matching while BB.
    G4_BB *whileBB = nullptr;
    if (opc == G4_cont) {
      // The whileBB is the first successor bb, if this is continue.
      whileBB = bb->Succs.front();
    } else {
      // For break, the whileBB should be the physical predecessor of
      // break's first successor bb.
      for (G4_BB *succBB : bb->Succs) {
        if (succBB->getPhysicalPred() &&
            (!succBB->getPhysicalPred()->empty()) &&
            (succBB->getPhysicalPred()->back()->opcode() == G4_while)) {
          whileBB = succBB->getPhysicalPred();
          break;
        }
      }
    }

    if (whileBB == nullptr || whileBB->empty() ||
        whileBB->back()->opcode() != G4_while) {
      vISA_ASSERT(false, "can not find while BB");
      continue;
    }

    // If while instruction is following the label, then no need
    // to insert a new uip label, just use the existing one.
    G4_InstCF *instCF = inst->asCFInst();
    auto whileIter = std::prev(whileBB->end());
    G4_INST *prevInst = *std::prev(whileIter);
    if (prevInst->isLabel()) {
      instCF->setUip(prevInst->getLabel());
    } else {
      std::string NewUipName = instCF->getUipLabelStr();
      NewUipName += "_UIP";
      G4_Label *label = builder.createLabel(NewUipName, LABEL_BLOCK);
      instCF->setUip(label);

      G4_INST *newInst = fg.createNewLabelInst(label);

      whileBB->insertBefore(whileIter, newInst);
    }
  }
}

// Fold address register into address register offset, such that we can save
// one instruction that computes:
//   add a0.0 a0.0 immed; mul dst r[a0.0, 0] src2
// -->
//   mul dst r[a0.0, immed] src2

// The condition is that immed is in range [-512..511] and it is divisible
// by 32. This is a local OPT. For simplicity, only execsize 1 is considered.
// Since physical registers are already assigned, we use the info directly here
// without check.

void Optimizer::reverseOffsetProp(AddrSubReg_Node addrRegInfo[8], int subReg,
                                  unsigned int srcNum, INST_LIST_ITER lastIter,
                                  INST_LIST_ITER iend) {
  if (addrRegInfo[subReg].usedImmed && addrRegInfo[subReg].canUseImmed) {
    INST_LIST_ITER iter;
    G4_INST *inst;
    G4_Operand *inst_src;
    G4_DstRegRegion *inst_dst;
    for (iter = addrRegInfo[subReg].iter; iter != lastIter; ++iter) {
      if (iter == lastIter)
        break;
      inst = *iter;
      if (inst->isDead())
        continue;
      inst_dst = inst->getDst();
      if (inst_dst && inst_dst->getRegAccess() != Direct) {
        int subReg1 = getDstSubReg(inst_dst);

        short currOff = inst_dst->getAddrImm();
        if (subReg1 == subReg) {
          // create a new dst
          G4_DstRegRegion tmpRgn(*inst_dst);
          G4_DstRegRegion *newDst = &tmpRgn;
          newDst->setImmAddrOff(
              short(currOff - addrRegInfo[subReg].immAddrOff));
          inst->setDest(builder.createDstRegRegion(*newDst));
        }
      }
      for (int i = 0; i < inst->getNumSrc(); i++) {
        inst_src = inst->getSrc(i);
        if (inst_src && inst_src->isSrcRegRegion() &&
            inst_src->asSrcRegRegion()->getRegAccess() != Direct) {
          int subReg1 = getSrcSubReg(inst_src);

          short currOff = inst_src->asSrcRegRegion()->getAddrImm();
          if (subReg1 == subReg) {
            G4_SrcRegRegion tmpRgn(*inst_src->asSrcRegRegion());
            G4_SrcRegRegion *newSrc = &tmpRgn;
            newSrc->setImmAddrOff(
                short(currOff - addrRegInfo[subReg].immAddrOff));
            inst->setSrc(builder.createSrcRegRegion(*newSrc), i);
          }
        }
      }
    }
    // Immed has been propagated to srcs before this src in *ii, also reverse
    // this
    if (srcNum > 0) {
      inst = *lastIter;
      for (unsigned i = 0; i < srcNum; i++) {
        inst_src = inst->getSrc(i);
        if (inst_src && inst_src->isSrcRegRegion() &&
            inst_src->asSrcRegRegion()->getRegAccess() != Direct) {
          int subReg1 = getSrcSubReg(inst_src);

          short currOff = inst_src->asSrcRegRegion()->getAddrImm();
          if (subReg1 == subReg) {
            G4_SrcRegRegion tmpRgn(*inst_src->asSrcRegRegion());
            G4_SrcRegRegion *newSrc = &tmpRgn;
            newSrc->setImmAddrOff(
                short(currOff - addrRegInfo[subReg].immAddrOff));
            inst->setSrc(builder.createSrcRegRegion(*newSrc), i);
          }
        }
      }
    }
  }

  addrRegInfo[subReg].immAddrOff = 0;
  addrRegInfo[subReg].iter = iend;
  addrRegInfo[subReg].canRemoveInst = false;
  addrRegInfo[subReg].canUseImmed = false;
  addrRegInfo[subReg].usedImmed = false;
}

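// Lower pseudo address-mov intrinsics into a single scalar mov of an
// immediate: each non-null source's assigned GRF number (plus its GRF offset)
// is packed into the immediate, 16 bits per operand for the W variant and
// 8 bits otherwise, and the original SWSB token/distance info is carried over.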
void Optimizer::removePseudoMov() {
  if (!builder.enableSendIndirect()) {
    return;
  }

  for (G4_BB *bb : fg) {
    INST_LIST_ITER ii(bb->begin()), iend(bb->end());
    while (ii != iend) {
      G4_INST *inst = (*ii);

      if (inst->isPseudoAddrMovIntrinsic()) {
        uint64_t value = 0;

        for (int i = 0; i < inst->getNumSrc(); i++) {
          G4_Operand *src = inst->getSrc(i);

          if (!src || src->isNullReg()) {
            continue;
          }

          vASSERT(src->isAddrExp());
          G4_RegVar *regVar = src->asAddrExp()->getRegVar();
          vASSERT(regVar->getPhyReg()->isGreg());

          unsigned int regNum =
              (static_cast<G4_Greg *>(regVar->getPhyReg()))->getRegNum();
          regNum += src->asAddrExp()->getOffset() / kernel.getGRFSize();
          if (inst->isPseudoAddrMovWIntrinsic()) {
            value |= (uint64_t)regNum << (16 * i);
          } else {
            value |= (uint64_t)regNum << (8 * i);
          }
        }
        G4_Imm *src = builder.createImm(value, Type_UQ);
        G4_INST *movInst = builder.createMov(g4::SIMD1, inst->getDst(), src,
                                             InstOpt_WriteEnable, false);
        movInst->setToken(inst->getToken());
        movInst->setTokenType(inst->getTokenType());
        movInst->setDistance(inst->getDistance());
        movInst->setDistanceTypeXe(inst->getDistanceTypeXe());
        bb->insertBefore(ii, movInst);
        INST_LIST_ITER tmp = ii;
        ii++;
        bb->erase(tmp);
        continue;
      }
      ii++;
    }
  }
}

void Optimizer::FoldAddrImmediate() {
  AddrSubReg_Node *addrRegInfo =
      new AddrSubReg_Node[builder.getNumAddrRegisters()];
  int dst_subReg = 0, src0_subReg = 0;
  G4_DstRegRegion *dst;
  G4_Operand *src0, *src1;
  unsigned num_srcs;

  for (G4_BB *bb : fg) {
    INST_LIST_ITER ii, iend(bb->end());
    // reset address offset info
    for (unsigned i = 0; i < builder.getNumAddrRegisters(); i++) {
      addrRegInfo[i].subReg = 0;
      addrRegInfo[i].immAddrOff = 0;
      addrRegInfo[i].iter = iend;
      addrRegInfo[i].canRemoveInst = false;
      addrRegInfo[i].canUseImmed = false;
      addrRegInfo[i].usedImmed = false;
    }
    for (ii = bb->begin(); ii != iend; ii++) {
      G4_INST *inst = *ii;
      if (inst->isDead()) {
        continue;
      }
      num_srcs = inst->getNumSrc();
      dst = inst->getDst();
      if (dst) {
        dst_subReg = getDstSubReg(dst);
      }
      src0 = inst->getSrc(0);
      if (src0 && src0->isSrcRegRegion()) {
        src0_subReg = getSrcSubReg(src0);
      }
      src1 = inst->getSrc(1);

      if (dst && dst->isDirectA0() && src0 && src0->isSrcRegRegion() &&
          src0->asSrcRegRegion()->isDirectA0() && !src1) {
        continue;
      }

      if (inst->opcode() == G4_add && inst->getExecSize() == g4::SIMD1 &&
          !inst->getPredicate() && (src1->isImm() && !src1->isRelocImm()) &&
          dst && dst->isDirectA0() && src0 && src0->isSrcRegRegion() &&
          src0->asSrcRegRegion()->isDirectA0() && dst_subReg == src0_subReg) {
        // since there is a use of a0.x here, we can not remove the former def
        // of a0.x; reverse immed offset propagation
        reverseOffsetProp(addrRegInfo, dst_subReg, 0, ii, iend);

        int64_t offset = src1->asImm()->getImm();
        if (offset >= -512 && offset <= 511 && offset % 0x20 == 0) {
          // this kills the previous def on a0.x
          if (addrRegInfo[dst_subReg].canRemoveInst &&
              addrRegInfo[dst_subReg].iter != iend) {
            // mark dead
            (*(addrRegInfo[dst_subReg].iter))->markDead();
          }
          addrRegInfo[dst_subReg].subReg = dst_subReg;
          addrRegInfo[dst_subReg].immAddrOff = (short)offset;
          addrRegInfo[dst_subReg].iter = ii;
          addrRegInfo[dst_subReg].canRemoveInst = true;
          addrRegInfo[dst_subReg].canUseImmed = true;
          addrRegInfo[dst_subReg].usedImmed = false;
        }
      } else {
        G4_Operand *src;
        // if there is any direct use of addr reg, the ADD inst can not be
        // removed
        for (unsigned i = 0; i < num_srcs; i++) {
          src = inst->getSrc(i);
          if (src && src->isSrcRegRegion() &&
              src->asSrcRegRegion()->isDirectA0()) {
            // TODO: show if an inst is generated for spill code
            // if there is no regVar for this srcRegion, the physical register
            // is hard-wired in input or generated by spillCode. in this case,
            // the subregister info is in the subRegOff of G4_SrcRegRegion; this
            // also applies to dst register
            int subReg = getSrcSubReg(src);

            // it is possible that several elements are used
            int width, hstride, vstride, outerloop = 1;
            width = src->asSrcRegRegion()->getRegion()->width;
            hstride = src->asSrcRegRegion()->getRegion()->horzStride;
            vstride = src->asSrcRegRegion()->getRegion()->vertStride;
            if (vstride != 0) {
              outerloop = inst->getExecSize() / vstride;
            }

            for (int k = 0; k < outerloop; k++) {
              for (int j = 0; j < width; j++) {
                int currSubreg = subReg + k * vstride + j * hstride;
                // there may be an inst whose src or dst addr immediate offset
                // has already been changed; reverse the change
                reverseOffsetProp(addrRegInfo, currSubreg, i, ii, iend);
              }
            }
          }
        }
        // use of address register in index region
        for (unsigned i = 0; i < num_srcs; i++) {
          src = inst->getSrc(i);
          if (src && src->isSrcRegRegion() &&
              src->asSrcRegRegion()->getRegAccess() != Direct) {
            int subReg = getSrcSubReg(src);

            // if VxH is used and more than one sub registers are used in
            // addressing do not fold the immediate even though they have the
            // same immediate value
            unsigned short vertStride =
                src->asSrcRegRegion()->getRegion()->vertStride;
            if (vertStride == UNDEFINED_SHORT ||
                (vertStride > 0 &&
                 (unsigned short)inst->getExecSize() / vertStride > 1)) {
              int numSubReg = 0;
              if (vertStride == UNDEFINED_SHORT) {
                numSubReg = inst->getExecSize() /
                            src->asSrcRegRegion()->getRegion()->width;
              } else {
                numSubReg = 1; // inst->getExecSize()/vertStride;
              }
              for (int j = subReg; j < subReg + numSubReg; j++) {
                reverseOffsetProp(addrRegInfo, j, i, ii, iend);
              }
            } else {
              // we check the existing address reg imm offset.
              short currOff = src->asSrcRegRegion()->getAddrImm();
              if (addrRegInfo[subReg].canUseImmed) {
                if (currOff % 0x20 == 0 &&
                    (currOff + addrRegInfo[subReg].immAddrOff) <= 511 &&
                    (currOff + addrRegInfo[subReg].immAddrOff) >= -512) {
                  G4_SrcRegRegion tmpRgn(*src->asSrcRegRegion());
                  G4_SrcRegRegion *newSrc = &tmpRgn;
                  newSrc->setImmAddrOff(
                      short(currOff + addrRegInfo[subReg].immAddrOff));
                  inst->setSrc(builder.createSrcRegRegion(*newSrc), i);

                  addrRegInfo[subReg].usedImmed = true;
                } else {
                  // if the offset can not be folded into all uses of a0.0,
                  // reverse the former folding
                  reverseOffsetProp(addrRegInfo, subReg, i, ii, iend);
                }
              }
            }
          }
        }
        if (dst) {
          // make sure the addr reg is not redefined
          // direct access to a0.x
          if (dst->isDirectA0()) {
            int width, hstride;
            width = inst->getExecSize();
            hstride = dst->getHorzStride();

            for (int j = 0; j < width; j++) {
              int currSubreg = dst_subReg + j * hstride;
              // this kills the previous def on a0.x
              if (addrRegInfo[currSubreg].iter != iend &&
                  addrRegInfo[currSubreg].canRemoveInst) {
                // mark dead
                (*(addrRegInfo[currSubreg].iter))->markDead();
              }
              addrRegInfo[currSubreg].immAddrOff = 0;
              addrRegInfo[currSubreg].iter = iend;
              addrRegInfo[currSubreg].canRemoveInst = false;
              addrRegInfo[currSubreg].canUseImmed = false;
              addrRegInfo[currSubreg].usedImmed = false;
            }
          }
          // check if dst is indirectly addressed
          else if (dst->getRegAccess() != Direct) {
            short currOff = dst->getAddrImm();
            if (addrRegInfo[dst_subReg].canUseImmed) {
              if (currOff % 0x20 == 0 &&
                  (currOff + addrRegInfo[dst_subReg].immAddrOff) <= 511 &&
                  (currOff + addrRegInfo[dst_subReg].immAddrOff) >= -512) {
                // create a new dst
                G4_DstRegRegion tmpRgn(*dst);
                G4_DstRegRegion *newDst = &tmpRgn;
                newDst->setImmAddrOff(
                    short(currOff + addrRegInfo[dst_subReg].immAddrOff));
                inst->setDest(builder.createDstRegRegion(*newDst));
                addrRegInfo[dst_subReg].usedImmed = true;
              } else {
                // if the offset can not be folded into all uses of a0.0,
                // reverse the former folding
                reverseOffsetProp(addrRegInfo, dst_subReg, 0, ii, iend);
              }
            }
          }
        }
      }
    }

    // if a def lives out of this BB, we can not delete the defining inst
    for (unsigned i = 0; i < builder.getNumAddrRegisters(); i++) {
      // reverse immed offset propagation
      reverseOffsetProp(addrRegInfo, i, 0, iend, iend);
    }
    // remove the ADD instructions that are marked as dead
    for (ii = bb->begin(); ii != bb->end();) {
      G4_INST *inst = *ii;
      INST_LIST_ITER curr = ii++;
      if (inst->isDead()) {
        bb->erase(curr);
      }
    }
  }

  delete[] addrRegInfo;
}

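// mergeModifier combines the modifier on a propagated source with the modifier
// already present on the use operand: an abs-type modifier on the use wins
// outright, an undefined use modifier keeps the source's, and a negate on the
// use flips the sign of the source's modifier.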
G4_SrcModifier Optimizer::mergeModifier(G4_Operand *src, G4_Operand *use) {
  if ((src == NULL || !src->isSrcRegRegion()) && use && use->isSrcRegRegion()) {
    return use->asSrcRegRegion()->getModifier();
  } else if ((use == NULL || !use->isSrcRegRegion()) && src &&
             src->isSrcRegRegion()) {
    return src->asSrcRegRegion()->getModifier();
  } else if (src && src->isSrcRegRegion() && use && use->isSrcRegRegion()) {
    G4_SrcModifier mod1 = src->asSrcRegRegion()->getModifier(),
                   mod2 = use->asSrcRegRegion()->getModifier();
    if (mod2 == Mod_Abs || mod2 == Mod_Minus_Abs) {
      return mod2;
    } else if (mod2 == Mod_src_undef) {
      return mod1;
    } else {
      // mod2 == Minus
      if (mod1 == Mod_Minus) {
        return Mod_src_undef;
      } else if (mod1 == Mod_Abs) {
        return Mod_Minus_Abs;
      } else if (mod1 == Mod_Minus_Abs) {
        return Mod_Abs;
      } else {
        return mod2;
      }
    }
  } else {
    return Mod_src_undef;
  }
}

// Prevent sinking in presence of lifetime.end for any src op.
// For example,
// add V33, V32, 0x1
// ...
// lifetime.end V32  <-- This prevents sinking of add to mov
// ...
// pseudo_kill V34   <-- This prevents hoisting of V34 to add dst
// mov V34, V33
//
static bool checkLifetime(G4_INST *defInst, G4_INST *inst) {
  // Check whether current instruction ends any src opnd of op
  if (!inst->isLifeTimeEnd())
    return true;

  G4_RegVar *Var = GetTopDclFromRegRegion(inst->getSrc(0))->getRegVar();
  // Check whether lifetime op corresponds to any operand of current inst.
  if (defInst->getPredicate()) {
    G4_RegVar *opndVar =
        defInst->getPredicate()->asPredicate()->getBase()->asRegVar();
    if (opndVar == Var)
      return false;
  }
  if (defInst->getCondMod()) {
    G4_RegVar *opndVar =
        defInst->getCondMod()->asCondMod()->getBase()->asRegVar();
    if (opndVar == Var)
      return false;
  }
  if (defInst->getDst() && !defInst->getDst()->isNullReg()) {
    G4_RegVar *opndVar = GetTopDclFromRegRegion(defInst->getDst())->getRegVar();
    if (opndVar == Var)
      return false;
  }
  for (unsigned int srcOpnd = 0, numSrc = defInst->getNumSrc();
       srcOpnd < numSrc; srcOpnd++) {
    G4_Operand *src = defInst->getSrc(srcOpnd);
    if (src && src->isSrcRegRegion()) {
      G4_RegVar *opndVar = GetTopDclFromRegRegion(src)->getRegVar();
      if (opndVar == Var)
        return false;
    }
  }

  return true;
}

//
|
|
// Sink definition towards its use.
|
|
//
|
|
// For example, without sinking the def instruction once, use cannot
|
|
// be hoisted, since there is a data dependency between the middle
|
|
// instruction and the last move.
|
|
//
|
|
// def: shr (1) V39(0,0)<1>:ud V38(0,0)<0;1,0>:d 0x4:w {Align1, Q1}
|
|
// mov (8) V68(0,0)<1>:ud r0.0<8;8,1>:ud {Align1, NoMask}
|
|
// use: mov (1) V68(0,2)<1>:ud V39(0,0)<0;1,0>:ud {Align1, NoMask}
|
|
//
|
|
// after sinking, it becomes
|
|
//
|
|
// mov (8) V68(0,0)<1>:ud r0.0<8;8,1>:ud {Align1, NoMask}
|
|
// def: shr (1) V39(0,0)<1>:ud V38(0,0)<0;1,0>:d 0x4:w {Align1, Q1}
|
|
// use: mov (1) V68(0,2)<1>:ud V39(0,0)<0;1,0>:ud {Align1, NoMask}
|
|
//
|
|
// which makes local def hoisting possible.
|
|
//
|
|
// The third argument 'other' points to the first instruction (upwards) that
|
|
// has data-dependency with the use instruction.
|
|
//
|
|
static bool canSink(G4_BB *bb, INST_LIST_RITER revIter, INST_LIST_RITER other) {
|
|
// The use instruction.
|
|
G4_INST *inst = *revIter;
|
|
|
|
// Currently we do not handle multiple definition for this optimization.
|
|
if (inst->def_size() != 1)
|
|
return false;
|
|
|
|
// Find its def instruction.
|
|
G4_INST *defInst = inst->def_back().first;
|
|
|
|
vISA_ASSERT(*other != defInst, "iterator points to def already");
|
|
|
|
// Walk up to check if sinking is safe.
|
|
INST_LIST_RITER it = other;
|
|
|
|
while (*it != defInst) {
|
|
if ((*it)->isWAWdep(defInst) || (*it)->isRAWdep(defInst) ||
|
|
(*it)->isWARdep(defInst))
|
|
return false;
|
|
|
|
if (!checkLifetime(defInst, *it))
|
|
return false;
|
|
|
|
// move towards to defInst.
|
|
++it;
|
|
}
|
|
|
|
// At this point, there is no data dependency and sinking is safe.
|
|
//
|
|
// We do sinking right here.
|
|
//
|
|
vISA_ASSERT(*it == defInst, "iterator out of sync");
|
|
|
|
// Both 'other' and 'it' are reverse iterators, and sinking is through
|
|
// forward iterators. The first base should not be decremented by 1,
|
|
// otherwise, the instruction would be inserted before, not after.
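// A minimal sketch of the iterator relationship relied on here (assuming
// 'other' and 'it' remain valid): for a reverse_iterator r referring to
// instruction X in
//   ... defInst ... X ... inst
// r.base() is the forward iterator to the element *after* X. Hence
// insertBefore(other.base(), defInst) re-inserts defInst right after
// *other, and erase(--it.base()) removes it from its original position.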
|
|
bb->insertBefore(other.base(), defInst);
|
|
bb->erase(--it.base());
|
|
|
|
return true;
|
|
}
|
|
|
|
static bool canHoist(FlowGraph &fg, G4_BB *bb, INST_LIST_RITER revIter) {
|
|
G4_INST *inst = *revIter;
|
|
|
|
if (inst->isMixedMode() && fg.builder->getOption(vISA_DisableleHFOpt))
|
|
return false;
|
|
// Cannot hoist if this is not a move, or it is a global operand.
|
|
if (inst->opcode() != G4_mov ||
|
|
fg.globalOpndHT.isOpndGlobal(inst->getSrc(0)) ||
|
|
!inst->canHoist(!bb->isAllLaneActive(), fg.builder->getOptions())) {
|
|
return false;
|
|
}
|
|
|
|
if (auto Dst = inst->getDst()) {
|
|
G4_Declare *Dcl = Dst->getTopDcl();
|
|
// Do not do def-hoisting for setting flags which is likely to increase flag
|
|
// register pressure.
|
|
if (Dcl && Dcl->getRegFile() == G4_RegFileKind::G4_FLAG) {
|
|
return false;
|
|
}
|
|
|
|
// Do not do def-hoisting for s0 registers
|
|
if (Dcl && Dcl->getRegFile() == G4_RegFileKind::G4_SCALAR) {
|
|
return false;
|
|
}
|
|
|
|
if (Dcl && Dcl->getRegFile() == G4_RegFileKind::G4_ADDRESS &&
|
|
Dcl->getRegVar() && Dcl->getRegVar()->getPhyReg()) {
|
|
// Don't def-hoist if dst is hardwired to the address register.
|
|
// Doing so extends live-range of assigned register a0.
|
|
// Given that the machine has single addr register, a0,
|
|
// it may even cause address RA to fail due to uncolorable
|
|
// graph.
|
|
return false;
|
|
}
|
|
|
|
if (!fg.builder->hasByteALU() && Dst->getTypeSize() == 1) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Now check each definition of src(0)
|
|
for (auto I = inst->def_begin(), E = inst->def_end(); I != E; ++I) {
|
|
vISA_ASSERT(I->second == Opnd_src0, "invalid use-def chain");
|
|
if (!inst->canHoistTo(I->first, !bb->isAllLaneActive()))
|
|
return false;
|
|
|
|
auto defInst = I->first;
|
|
if (fg.globalOpndHT.isOpndGlobal(defInst->getDst())) {
|
|
return false;
|
|
}
|
|
|
|
auto defSrc0 = defInst->getSrc(0);
|
|
if (inst->getDst()->getType() == Type_BF &&
|
|
(defSrc0->getType() != Type_F ||
|
|
(defInst->isMov() &&
|
|
defSrc0->isSrcRegRegion() &&
|
|
defSrc0->asSrcRegRegion()->hasModifier()))) {
|
|
// we currently don't handle conversion to BF from other type than float
|
|
// As F->BF does not support srcMod, cannot hoist if definst has mod.
|
|
return false;
|
|
}
|
|
// don't hoist if defInst could become a movi (localCopyProp is a later pass)
|
|
if (fg.builder->canPromoteToMovi(defInst)) {
|
|
return false;
|
|
}
|
|
|
|
// Further check data-dependency, that is, no other instruction
|
|
// should have WAR or WAW dependency with this inst.
|
|
//
|
|
// defInst
|
|
//
|
|
// other inst
|
|
//
|
|
// inst <-- revIter
|
|
//
|
|
INST_LIST_RITER other = revIter;
|
|
++other;
|
|
|
|
// Measure the distance in between
|
|
unsigned distance = 0;
|
|
|
|
// Walkup until hits its defining instruction.
|
|
while (*other != I->first) {
|
|
// FIXME: remove duplicate computations for multiple definitions.
|
|
if (inst->isWAWdep(*other) || inst->isWARdep(*other)) {
|
|
break;
|
|
}
|
|
++other;
|
|
++distance;
|
|
}
|
|
|
|
// Check the distance first, if this is too far then the following
|
|
// sinking optimization is very expensive.
|
|
#define MAX_DEF_HOIST_DIST 160
|
|
if (distance > MAX_DEF_HOIST_DIST)
|
|
return false;
|
|
|
|
// There is a data dependency.
|
|
if (*other != I->first) {
|
|
// check if sinking is possible.
|
|
if (!canSink(bb, revIter, other))
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
static G4_DstRegRegion *buildNewDstOperand(FlowGraph &fg, G4_INST *inst,
|
|
G4_INST *defInst) {
|
|
G4_Operand *src = inst->getSrc(0);
|
|
G4_DstRegRegion *dst = inst->getDst();
|
|
|
|
G4_Type srcType = src->getType();
|
|
G4_Type dstType = dst->getType();
|
|
G4_DstRegRegion *dstRegion = dst;
|
|
bool indirectDst = (dstRegion->getRegAccess() != Direct);
|
|
unsigned char srcElSize = (unsigned char)TypeSize(srcType);
|
|
|
|
G4_DstRegRegion *defDstRegion = defInst->getDst();
|
|
G4_DstRegRegion *newDstOpnd = dst;
|
|
|
|
unsigned char defDstElSize = (unsigned char)defDstRegion->getTypeSize();
|
|
G4_CmpRelation rel = src->compareOperand(defDstRegion, *fg.builder);
|
|
G4_Type defDstType = defDstRegion->getType();
|
|
|
|
unsigned char dstElSize = (unsigned char)TypeSize(dstType);
|
|
unsigned short dstHS = dst->getHorzStride();
|
|
|
|
if (rel == Rel_gt || srcElSize != defDstElSize ||
|
|
(defInst->getSaturate() && srcType != defDstType) || inst->isRawMov() ||
|
|
(dstType != defDstType &&
|
|
(IS_FTYPE(defDstType) ||
|
|
(IS_FTYPE(dstType) && defDstType != srcType)))) {
|
|
unsigned short regOff = 0, subRegOff = 0;
|
|
if (rel == Rel_gt) {
|
|
// compute new dst for defInst
|
|
// get dst portion based on src region
|
|
unsigned defDstLB = defDstRegion->getLeftBound();
|
|
|
|
unsigned srcLB = src->getLeftBound();
|
|
const RegionDesc *srcRegionDesc = src->asSrcRegRegion()->getRegion();
|
|
bool contRegion = srcRegionDesc->isContiguous(inst->getExecSize());
|
|
|
|
uint32_t dist = defDstLB - srcLB, dstDist = 0, tempLen = 0;
|
|
if (src->asSrcRegRegion()->isScalar() || contRegion) {
|
|
// mov (1) V18(0,0)[1]:b 0x73:w [Align1]
|
|
// mov (1) V18(0,1)[1]:b 0x61:w [Align1]
|
|
// mov (1) V18(0,2)[1]:b 0x70:w [Align1]
|
|
// mov (1) V18(0,3)[1]:b 0:w [Align1]
|
|
// mov (1) V20(1,0)[1]:ud V18(0,0)[0;1,0]:ud [Align1]
|
|
// length of subregoff part
|
|
tempLen = dstRegion->getSubRegOff() * dstElSize + dist * dstHS;
|
|
|
|
if (tempLen >= fg.builder->numEltPerGRF<Type_UB>()) {
|
|
regOff = dst->getRegOff() + 1;
|
|
subRegOff =
|
|
(unsigned short)((tempLen - fg.builder->numEltPerGRF<Type_UB>()) /
|
|
defDstElSize);
|
|
} else {
|
|
regOff = dst->getRegOff();
|
|
subRegOff = (unsigned short)tempLen / defDstElSize;
|
|
}
|
|
} else {
|
|
// mov (16) V18(0,0)[1]:b 0x73:w [Align1]
|
|
// mov (16) V18(0,16)[1]:b 0x61:w [Align1]
|
|
// mov (16) V18(1,0)[1]:b 0x70:w [Align1]
|
|
// mov (16) V18(1,16)[1]:b 0:w [Align1]
|
|
// mov (32) V20(1,0)[1]:b V18(0,0)[32;16,1]:b [Align1]
|
|
// mov (32) V20(2,0)[1]:b V18(0,16)[32;16,1]:b [Align1]
|
|
|
|
// Compute the linear index of the first element from defInst's dst
|
|
// in useInst's src.
|
|
//
|
|
// mov <2> V50(0, 14)<1>:b 0xa:w <- defInst
|
|
// mov <16> V51(0, 9)<2>:b V50(0, 0)<0; 16, 2>:b <- useInst
|
|
//
|
|
// Starting from left bound difference, dist = 14.
|
|
//
|
|
// FirstEltIndex is 7 = 14 / 2. With this index, we can compute
|
|
// the register offset and sub-register offset in useInst's dst.
|
|
//
|
|
// In the above example, there is only a single row. In general
|
|
// there may be multiple rows in useInst's src region.
|
|
//
|
|
// (1) convert difference in number of elements.
|
|
vISA_ASSERT(dist % srcElSize == 0, "unexpected difference");
|
|
dist = dist / srcElSize;
|
|
|
|
// (2) compute row and column index, by default a single row.
|
|
unsigned rowIndex = 0, colIndex = dist;
|
|
if (srcRegionDesc->vertStride > 0) {
|
|
rowIndex = dist / srcRegionDesc->vertStride;
|
|
colIndex = dist % srcRegionDesc->vertStride;
|
|
}
|
|
|
|
// (3) compute the final linear index.
|
|
vISA_ASSERT(srcRegionDesc->horzStride == 0 ||
|
|
colIndex % srcRegionDesc->horzStride == 0,
|
|
"invalid region");
|
|
unsigned FirstEltIndex = rowIndex * srcRegionDesc->width +
|
|
(srcRegionDesc->horzStride == 0
|
|
? colIndex
|
|
: (colIndex / srcRegionDesc->horzStride));
|
|
|
|
// (4) compute the register and subregister offset in useInst's dst.
|
|
dstDist = FirstEltIndex * dstElSize * dstHS;
|
|
tempLen = dstDist + dst->getSubRegOff() * dstElSize;
|
|
regOff =
|
|
(unsigned short)(dst->getRegOff() +
|
|
tempLen / fg.builder->numEltPerGRF<Type_UB>());
|
|
|
|
subRegOff =
|
|
(unsigned short)(tempLen % fg.builder->numEltPerGRF<Type_UB>()) /
|
|
defDstElSize;
|
|
}
|
|
|
|
unsigned short defDstHS = defDstRegion->getHorzStride();
|
|
if (!indirectDst) {
|
|
newDstOpnd =
|
|
fg.builder->createDst(dst->getBase(), regOff, subRegOff,
|
|
dstHS * defDstHS, defDstRegion->getType());
|
|
} else {
|
|
newDstOpnd = fg.builder->createIndirectDst(
|
|
dst->getBase(), dst->getSubRegOff(), dstHS * defDstHS,
|
|
defDstRegion->getType(), (int16_t)dst->getAddrImm() + tempLen);
|
|
}
|
|
} else {
|
|
unsigned char scale = dstElSize / defDstElSize;
|
|
|
|
// If the instruction whose def is hoisted is just a re-interpretation of
// bits (no conversion), the defInst's original dst type should be used for
// the new dst. This preserves the original behavior.
|
|
//
|
|
// mov (8) V57(0,0)[1]:hf V52(0,0)[8;8,1]:f [Align1, Q1] %21
|
|
// mov (8) V58(0,0)[1]:w V59(0,0)[8;8,1]:w [Align1, Q1] %22
|
|
if (dst->getType() == src->getType()) {
|
|
if (!indirectDst) {
|
|
newDstOpnd = fg.builder->createDst(
|
|
dst->getBase(), dst->getRegOff(),
|
|
(scale == 0 ? dst->getSubRegOff() / (defDstElSize / dstElSize)
|
|
: dst->getSubRegOff() * scale),
|
|
dstHS, defDstRegion->getType());
|
|
} else {
|
|
newDstOpnd = fg.builder->createIndirectDst(
|
|
dst->getBase(), dst->getSubRegOff(), dstHS,
|
|
defDstRegion->getType(), dst->getAddrImm());
|
|
}
|
|
} else {
|
|
if (!indirectDst) {
|
|
newDstOpnd =
|
|
fg.builder->createDst(dst->getBase(), dst->getRegOff(),
|
|
dst->getSubRegOff(), dstHS, dst->getType());
|
|
} else {
|
|
newDstOpnd = fg.builder->createIndirectDst(
|
|
dst->getBase(), dst->getSubRegOff(), dstHS, dst->getType(),
|
|
dst->getAddrImm());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return newDstOpnd;
|
|
}
|
|
|
|
//
|
|
// inst0: op0 rx<2>(0), ry0, ry1
|
|
// inst1: op1 rx<2>(1), ry2, ry3
|
|
// inst2: mov rz<1>(0), rx<0, 0>(8; 8, 1)
|
|
//
|
|
// ==>
|
|
//
|
|
// inst0: op0, rz<2>(0), ry0, ry1
|
|
// inst1: op1, rz<2>(1), ry2, ry3
|
|
// inst2: mov rz<1>(0), rx<0, 0>(8; 8, 1) (to be deleted)
|
|
//
|
|
// Def-use/use-def chains will be updated as follows:
|
|
//
|
|
// (0) all use-defs remain the same for inst0 and inst1.
|
|
//
|
|
// (1) remove all use-defs of inst2. (They must be from inst0 and inst1,
|
|
// which is the pre-condition of doHoisting.)
|
|
//
|
|
// (2) remove all def-uses of inst0 and inst1 from dst.
|
|
//
|
|
// (3) remove all def-uses of inst2.
|
|
//
|
|
// (4) add new def-uses to inst0 and inst1.
|
|
//
|
|
static void doHoisting(FlowGraph &fg, G4_BB *bb, INST_LIST_RITER revIter) {
|
|
G4_INST *inst = *revIter;
|
|
|
|
for (auto I = inst->def_begin(), E = inst->def_end(); I != E; ++I) {
|
|
G4_INST *defInst = I->first;
|
|
|
|
// Build a new dst operand for each def instruction.
|
|
G4_DstRegRegion *newDst = buildNewDstOperand(fg, inst, defInst);
|
|
|
|
// Update the defInst with a new operand and set attributes properly.
|
|
if (inst->def_size() == 1) {
|
|
defInst->setDest(newDst);
|
|
} else {
|
|
defInst->setDest(fg.builder->duplicateOperand(newDst)->asDstRegRegion());
|
|
}
|
|
|
|
// (4) for each def-use of inst, add it to defInst, if it is
|
|
// an effective use.
|
|
for (auto UI = inst->use_begin(), UE = inst->use_end(); UI != UE; ++UI) {
|
|
G4_Operand *UseOpnd = UI->first->getOperand(UI->second);
|
|
// this comparison is necessary, since uses of inst's dst may be
|
|
// different from those from defInst's dst.
|
|
G4_CmpRelation rel =
|
|
defInst->getDst()->compareOperand(UseOpnd, *fg.builder);
|
|
if (rel != Rel_disjoint) {
|
|
defInst->addDefUse(UI->first, UI->second);
|
|
}
|
|
}
|
|
|
|
if (inst->getPredicate()) {
|
|
vISA_ASSERT(inst->def_size() == 1, "multiple defs not implemented");
|
|
// Remove existing definitions on defInst[opnd_pred].
|
|
defInst->removeDefUse(Opnd_pred);
|
|
|
|
defInst->setPredicate(inst->getPredicate());
|
|
|
|
// (4) Transfer definitions of inst[opnd_pred] to definitions of
|
|
// defInst[opnd_pred].
|
|
inst->transferDef(defInst, Opnd_pred, Opnd_pred);
|
|
}
|
|
if (inst->getSrc(0)->asSrcRegRegion()->isScalar() &&
|
|
inst->getExecSize() > g4::SIMD1) {
|
|
defInst->setExecSize(
|
|
G4_ExecSize(defInst->getExecSize() * inst->getExecSize()));
|
|
}
|
|
defInst->setSaturate(inst->getSaturate() || defInst->getSaturate());
|
|
if (!bb->isAllLaneActive()) {
|
|
// make defInst's mask options match those of inst
|
|
defInst->setOptions((defInst->getOption() & ~0xFFF000C) |
|
|
(inst->getMaskOption()));
|
|
}
|
|
}
|
|
|
|
// (1), (2), (3) Remove all defs/uses and it is ready to be deleted.
|
|
inst->removeAllDefs();
|
|
inst->removeAllUses();
|
|
}
|
|
|
|
void Optimizer::localDefHoisting() {
|
|
unsigned numDefHoisted = 0;
|
|
for (auto bb : fg) {
|
|
for (auto I = bb->rbegin(); I != bb->rend(); /* empty */) {
|
|
if (canHoist(fg, bb, I)) {
|
|
doHoisting(fg, bb, I);
|
|
++numDefHoisted;
|
|
|
|
// list::erase does not take a reverse_iterator.
|
|
//
|
|
// The base iterator is an iterator of the same type as the one
|
|
// used to construct the reverse_iterator, but pointing to the
|
|
// element next to the one that the reverse_iterator is currently
|
|
// pointing to (a reverse_iterator has always an offset of -1
|
|
// with respect to its base iterator).
|
|
I = INST_LIST::reverse_iterator(bb->erase(--I.base()));
|
|
} else {
|
|
++I;
|
|
}
|
|
}
|
|
}
|
|
|
|
VISA_DEBUG({
|
|
std::cout
|
|
<< " === Local Definition Hoisting Optimization ===\n";
|
|
std::cout << "Number of defs hoisted: " << numDefHoisted << "\n";
|
|
});
|
|
}
|
|
|
|
//
|
|
// Do very simple const only reassociation to fold const values
|
|
// e.g.,
|
|
// V2 = V1 + K1
|
|
// V3 = V2 + K2
|
|
// -->
|
|
// V3 = V1 + (K1 + K2)
|
|
// we only search one level (+ with +, * with *) for now, as more complex
// reassociation should already be handled by IGC. Also, we only do it for
// integer types for now.
|
|
//
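// A concrete sketch of the rewrite (hypothetical vISA-style operands):
//   add (8) V2<1>:d  V1<8;8,1>:d  0x10:w
//   add (8) V3<1>:d  V2<8;8,1>:d  0x20:w
// -->
//   add (8) V3<1>:d  V1<8;8,1>:d  0x30:w
// and the first add becomes dead if V2 has no other uses.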
|
|
void Optimizer::reassociateConst() {
|
|
for (auto BB : fg) {
|
|
for (auto iter = BB->begin(), iterEnd = BB->end(); iter != iterEnd;
|
|
++iter) {
|
|
G4_INST *inst = *iter;
|
|
if (inst->opcode() != G4_add && inst->opcode() != G4_mul) {
|
|
continue;
|
|
}
|
|
auto isSrc1Const = [](G4_INST *inst) {
|
|
if (!IS_INT(inst->getDst()->getType())) {
|
|
return false;
|
|
}
|
|
if (!inst->getSrc(0)->isImm() && inst->getSrc(1)->isImm()) {
|
|
return true;
|
|
} else if (inst->getSrc(0)->isImm() && !inst->getSrc(1)->isImm()) {
|
|
inst->swapSrc(0, 1);
|
|
inst->swapDefUse(); // swap def/use for src0 and src1
|
|
return true;
|
|
}
|
|
return false;
|
|
};
|
|
if (!isSrc1Const(inst)) {
|
|
continue;
|
|
}
|
|
auto src0Def = inst->getSingleDef(Opnd_src0);
|
|
if (!src0Def) {
|
|
continue;
|
|
}
|
|
|
|
auto isGoodSrc0Def = [isSrc1Const](G4_INST *def, G4_INST *use,
|
|
const IR_Builder &builder) {
|
|
vISA_ASSERT(use->getSrc(0)->isSrcRegRegion(),
|
|
"expect src0 to be src region");
|
|
if (def->opcode() != use->opcode()) {
|
|
return false;
|
|
}
|
|
if (def->getSaturate() || def->getPredicate() || def->getCondMod() ||
|
|
def->getMaskOffset() != use->getMaskOffset()) {
|
|
return false;
|
|
}
|
|
if (!isSrc1Const(def)) {
|
|
return false;
|
|
}
|
|
auto useSrc = use->getSrc(0)->asSrcRegRegion();
|
|
if (useSrc->hasModifier() ||
|
|
def->getDst()->getTypeSize() != useSrc->getTypeSize() ||
|
|
def->getDst()->compareOperand(useSrc, builder) != Rel_eq) {
|
|
// make sure def fully defines use and have the same integer type size
|
|
// (signed-ness should not matter)
|
|
return false;
|
|
}
|
|
if (def->getDst()->compareOperand(def->getSrc(0), builder) !=
|
|
Rel_disjoint) {
|
|
// can't sink source if def overwrites it
|
|
return false;
|
|
}
|
|
// additionally check for the use inst that dst type size is >= src type
|
|
// size otherwise the first add may truncate upper bits due to overflow,
|
|
// which makes reassociation unsafe
|
|
if (useSrc->getTypeSize() < use->getDst()->getTypeSize()) {
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
};
|
|
|
|
if (isGoodSrc0Def(src0Def, inst, builder) &&
|
|
!chkFwdOutputHazard(src0Def, iter)) {
|
|
// std::cout << "reassociate: \n";
|
|
// src0Def->dump();
|
|
// inst->dump();
|
|
G4_Imm *constOne = src0Def->getSrc(1)->asImm();
|
|
G4_Imm *constTwo = inst->getSrc(1)->asImm();
|
|
G4_Imm *resultImm =
|
|
builder.foldConstVal(constOne, constTwo, inst->opcode());
|
|
|
|
if (resultImm) {
|
|
inst->setSrc(builder.duplicateOperand(src0Def->getSrc(0)), 0);
|
|
inst->setSrc(resultImm, 1);
|
|
inst->removeDefUse(Opnd_src0);
|
|
src0Def->copyDef(inst, Opnd_src0, Opnd_src0);
|
|
// ToDo: remove this when DCE pass is enabled
|
|
if (src0Def->use_size() == 0 &&
|
|
!fg.globalOpndHT.isOpndGlobal(src0Def->getDst()) &&
|
|
!src0Def->getDst()->isIndirect()) {
|
|
src0Def->markDead();
|
|
src0Def->removeAllDefs();
|
|
}
|
|
// std::cout << "--> new inst:\t";
|
|
// inst->dump();
|
|
}
|
|
}
|
|
}
|
|
BB->erase(std::remove_if(BB->begin(), BB->end(),
|
|
[](G4_INST *inst) { return inst->isDead(); }),
|
|
BB->end());
|
|
}
|
|
}
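// hoistUseInst (below) handles the case where copy propagation of a mov is
// blocked only by an anti-dependence: an instruction between the mov and its
// single use writes the mov's source. If that use can be hoisted above the
// interfering instruction without violating any dependence, the propagation
// (and removal of the mov) can still proceed; otherwise canRemove is cleared.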
|
|
|
|
static void hoistUseInst(G4_BB *bb, G4_INST *inst, INST_LIST_ITER forwardIter,
|
|
bool &canRemove) {
|
|
// check if we can move the use inst up.
|
|
// currently we do not handle multiple uses for this optimization
|
|
G4_INST *useInst = inst->use_front().first;
|
|
if (inst->hasOneUse()) {
|
|
forwardIter--;
|
|
INST_LIST_ITER backwardIter = forwardIter;
|
|
INST_LIST_ITER instListEnd = bb->end();
|
|
while (backwardIter != instListEnd && *backwardIter != useInst) {
|
|
backwardIter++;
|
|
}
|
|
|
|
INST_LIST_ITER useInstIter = backwardIter;
|
|
backwardIter--;
|
|
while (backwardIter != forwardIter) {
|
|
if (useInst->isWAWdep(*backwardIter) ||
|
|
useInst->isRAWdep(*backwardIter) ||
|
|
useInst->isWARdep(*backwardIter)) {
|
|
break;
|
|
}
|
|
backwardIter--;
|
|
}
|
|
if (backwardIter != forwardIter) {
|
|
canRemove = false;
|
|
} else {
|
|
// hoisting
|
|
backwardIter++;
|
|
bb->insertBefore(backwardIter, useInst);
|
|
bb->erase(useInstIter);
|
|
}
|
|
} else {
|
|
canRemove = false;
|
|
}
|
|
}
|
|
|
|
// Apply a source modifier to an immediate value.
|
|
template <class T>
|
|
static typename std::enable_if<std::is_floating_point<T>::value, T>::type
|
|
getImmValue(T imm, G4_SrcModifier modifier) {
|
|
switch (modifier) {
|
|
case Mod_Minus:
|
|
return -imm;
|
|
case Mod_Abs:
|
|
return std::abs(imm);
|
|
case Mod_Minus_Abs:
|
|
return -(std::abs(imm));
|
|
case Mod_Not:
|
|
vISA_ASSERT_UNREACHABLE("unexpected not modifier for floating types");
|
|
return imm;
|
|
default:
|
|
return imm;
|
|
}
|
|
}
|
|
|
|
template <class T>
|
|
static typename std::enable_if<std::is_integral<T>::value, T>::type
|
|
getImmValue(T imm, G4_SrcModifier modifier) {
|
|
switch (modifier) {
|
|
case Mod_Minus:
|
|
return -imm;
|
|
case Mod_Abs:
|
|
return std::llabs(imm);
|
|
case Mod_Minus_Abs:
|
|
return -(std::llabs(imm));
|
|
case Mod_Not:
|
|
return ~imm;
|
|
default:
|
|
return imm;
|
|
}
|
|
}
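// Illustrative behavior of the two helpers above (values are examples only):
//   getImmValue(2.5f, Mod_Minus)            == -2.5f
//   getImmValue(-3.0f, Mod_Abs)             == 3.0f
//   getImmValue(int64_t{5}, Mod_Not)        == ~int64_t{5} (i.e. -6)
//   getImmValue(int64_t{-4}, Mod_Minus_Abs) == -4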
|
|
|
|
// The source operand of the MOV is already known to be propagatable into all
// of its uses, but a dependency issue prevents the propagation itself.
// Instead, try to propagate the type if a narrower type could be used.
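// A hedged sketch of the effect (variable names are hypothetical, and the
// rewrite only happens when every use accepts the narrower type):
//   mov (8) V10<1>:d   V11<8;8,1>:uw        // zero-extending copy
//   add (8) V12<1>:d   V10<8;8,1>:d   V13<8;8,1>:d
// becomes
//   mov (8) TMP<1>:uw  V11<8;8,1>:uw
//   add (8) V12<1>:d   TMP<8;8,1>:uw  V13<8;8,1>:d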
|
|
static bool propagateType(IR_Builder &Builder, G4_BB *BB, G4_INST *Mov,
|
|
G4_INST::MovType MT) {
|
|
// Only propagate type if a narrower type could be used.
|
|
if (MT != G4_INST::ZExt && MT != G4_INST::SExt)
|
|
return false;
|
|
|
|
G4_DstRegRegion *Dst = Mov->getDst();
|
|
if (Dst->isIndirect())
|
|
return false;
|
|
|
|
// Check all propagation types are the same.
|
|
G4_Type PT = Type_UNDEF;
|
|
for (auto UI = Mov->use_begin(), UE = Mov->use_end(); UI != UE; ++UI) {
|
|
auto Use = UI->first;
|
|
auto OpndNum = UI->second;
|
|
auto Opnd = Use->getOperand(OpndNum);
|
|
if (!Opnd->isSrcRegRegion())
|
|
return false;
|
|
if (Opnd->asSrcRegRegion()->isIndirect())
|
|
return false;
|
|
G4_Type PropType = Use->getPropType(OpndNum, MT, Mov);
|
|
if (PropType == Type_UNDEF)
|
|
return false;
|
|
if (PT != Type_UNDEF && PT != PropType)
|
|
return false;
|
|
PT = PropType;
|
|
}
|
|
if (PT == Type_UNDEF)
|
|
return false;
|
|
// Create a new destination of MOV of the propagation type.
|
|
// Consider both execution size and the dst horizontal stride to calculate
|
|
// the number of elements needed, so that we have the enough var size when
|
|
// creating the temp var.
|
|
unsigned NumElt = Mov->getExecSize() * Dst->getHorzStride();
|
|
auto NewDcl = Builder.createTempVar(NumElt, PT, Any);
|
|
auto NewDst = Builder.createDstRegRegion(NewDcl, Dst->getHorzStride());
|
|
Mov->setDest(NewDst);
|
|
// Propagate type
|
|
for (auto UI = Mov->use_begin(), UE = Mov->use_end(); UI != UE; ++UI) {
|
|
auto Use = UI->first;
|
|
auto OpndNum = UI->second;
|
|
auto Opnd = Use->getOperand(OpndNum)->asSrcRegRegion();
|
|
auto NewOpnd = Builder.createSrcRegRegion(NewDcl, Opnd->getRegion());
|
|
Use->setSrc(NewOpnd, OpndNum - 1);
|
|
}
|
|
return true;
|
|
}
|
|
|
|
static unsigned getMaskSize(G4_INST *Inst, Gen4_Operand_Number OpNum) {
|
|
G4_Operand *Opnd = Inst->getOperand(OpNum);
|
|
vISA_ASSERT(Opnd, "null opnd");
|
|
|
|
if (Opnd) {
|
|
G4_Declare *Dcl = Opnd->getTopDcl();
|
|
if (Dcl == nullptr) {
|
|
// There is no top declaration for this operand, so this is ARF.
|
|
return 32;
|
|
}
|
|
return Dcl->getRegVar()->isFlag() ? Dcl->getNumberFlagElements()
|
|
: Dcl->getByteSize();
|
|
}
|
|
|
|
return 0;
|
|
}
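// removePartialMovs (below) looks for two movs that copy disjoint pieces of
// one variable V0 into another variable V1, followed by a third mov that
// copies V1 into V2, whose single use is the only consumer. When all the
// legality checks below pass, that use is rewritten to read V0 directly and
// the three movs are removed. (V0/V1/V2 are hypothetical names; the exact
// region and footprint constraints are enforced by the checks below.)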
|
|
|
|
void Optimizer::removePartialMovs() {
|
|
auto IsValidCandidate = [](G4_Operand *dst, G4_Operand *src, int execSize) {
|
|
if (dst->isDstRegRegion() && src->isSrcRegRegion()) {
|
|
unsigned short dstSize, sourceSize;
|
|
dstSize =
|
|
dst->getTopDcl()->getTotalElems() * dst->getTopDcl()->getElemSize();
|
|
sourceSize =
|
|
src->getTopDcl()->getTotalElems() * src->getTopDcl()->getElemSize();
|
|
|
|
if (!src->asSrcRegRegion()->getRegion()->isSingleStride(execSize)) {
|
|
return false;
|
|
}
|
|
if (dst->asDstRegRegion()->getHorzStride() != 1 && execSize != 1) {
|
|
return false;
|
|
}
|
|
if (src->getRightBound() - src->getLeftBound() !=
|
|
dst->getRightBound() - dst->getLeftBound()) {
|
|
return false;
|
|
}
|
|
if (dstSize != sourceSize) {
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Common cases should be covered.
|
|
return false;
|
|
};
|
|
|
|
auto IsSameDstSrc = [](G4_Operand *dst, G4_Operand *src) {
|
|
if (dst->isDstRegRegion() && src->isSrcRegRegion()) {
|
|
if (dst->getTopDcl() != src->getTopDcl()) {
|
|
return false;
|
|
}
|
|
|
|
unsigned short dstSize, sourceSize;
|
|
dstSize =
|
|
dst->getTopDcl()->getTotalElems() * dst->getTopDcl()->getElemSize();
|
|
sourceSize =
|
|
src->getTopDcl()->getTotalElems() * src->getTopDcl()->getElemSize();
|
|
|
|
if (dst->asDstRegRegion()->getHorzStride() != 1) {
|
|
return false;
|
|
}
|
|
if (src->getRightBound() - src->getLeftBound() !=
|
|
dst->getRightBound() - dst->getLeftBound()) {
|
|
return false;
|
|
}
|
|
if (dstSize != sourceSize) {
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Common cases should be covered.
|
|
return false;
|
|
};
|
|
|
|
auto IsStatelessSend = [](G4_INST *inst) {
|
|
if (!inst->isSend()) {
|
|
return false;
|
|
}
|
|
auto msgDesc = inst->asSendInst()->getMsgDesc();
|
|
|
|
if (!msgDesc->isLSC() || msgDesc->isSLM() || !inst->getMsgDescRaw()) {
|
|
return false;
|
|
}
|
|
|
|
if (inst->getMsgDescRaw()) {
|
|
uint32_t desc = inst->getMsgDescRaw()->getDesc();
|
|
|
|
if ((desc >> 29) & 0x3) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
};
|
|
|
|
for (G4_BB *bb : fg) {
|
|
bb->resetLocalIds();
|
|
|
|
INST_LIST_ITER ii = bb->begin(), iend(bb->end());
|
|
while (ii != iend) {
|
|
INST_LIST_ITER firstIt = ii;
|
|
G4_INST *inst1 = *ii;
|
|
G4_Operand *dst1 = inst1->getDst();
|
|
G4_Operand *src1 = inst1->getSrc(0);
|
|
if (inst1->opcode() != G4_mov || !dst1) {
|
|
ii++;
|
|
continue;
|
|
}
|
|
ii++;
|
|
if (ii == iend) {
|
|
break;
|
|
}
|
|
INST_LIST_ITER secondIt = ii;
|
|
G4_INST *inst2 = *ii;
|
|
G4_Operand *dst2 = inst2->getDst();
|
|
G4_Operand *src2 = inst2->getSrc(0);
|
|
if (inst2->opcode() != G4_mov || !dst2) {
|
|
continue;
|
|
}
|
|
ii++;
|
|
if (ii == iend) {
|
|
break;
|
|
}
|
|
INST_LIST_ITER thirdIt = ii;
|
|
G4_INST *inst3 = *ii;
|
|
G4_Operand *dst3 = inst3->getDst();
|
|
G4_Operand *src3 = inst3->getSrc(0);
|
|
if (ii == iend) {
|
|
break;
|
|
}
|
|
if (inst3->opcode() != G4_mov || !dst3) {
|
|
continue;
|
|
}
|
|
|
|
if (inst1->getDst()->getTopDcl()->getRegFile() != G4_GRF ||
|
|
inst2->getDst()->getTopDcl()->getRegFile() != G4_GRF ||
|
|
inst3->getDst()->getTopDcl()->getRegFile() != G4_GRF) {
|
|
continue;
|
|
}
|
|
|
|
// All three instructions can be propagated
|
|
G4_INST::MovType MT1 = inst1->canPropagate();
|
|
G4_INST::MovType MT2 = inst2->canPropagate();
|
|
G4_INST::MovType MT3 = inst3->canPropagate();
|
|
if (MT1 == G4_INST::SuperMov || MT2 == G4_INST::SuperMov ||
|
|
MT3 == G4_INST::SuperMov) {
|
|
continue;
|
|
}
|
|
|
|
// Constraints for each instruction
|
|
if (!IsValidCandidate(dst1, src1, inst1->getExecSize()) ||
|
|
!IsValidCandidate(dst2, src2, inst2->getExecSize()) ||
|
|
!IsValidCandidate(dst3, src3, inst3->getExecSize())) {
|
|
continue;
|
|
}
|
|
|
|
// Profitable
|
|
if (!isCopyPropProfitable(inst1) || !isCopyPropProfitable(inst2) ||
|
|
!isCopyPropProfitable(inst3)) {
|
|
continue;
|
|
}
|
|
|
|
// Same declare in both dst and src for inst1 and inst2
|
|
if (src1->getTopDcl() != src2->getTopDcl() ||
|
|
dst1->getTopDcl() != dst2->getTopDcl()) {
|
|
continue;
|
|
}
|
|
|
|
// Used in same single instruction inst3
|
|
if (inst1->use_size() != 1 || inst2->use_size() != 1 ||
|
|
inst3->use_size() != 1 ||
|
|
inst1->use_begin()->first != inst2->use_begin()->first ||
|
|
inst1->def_begin()->first != inst2->def_begin()->first ||
|
|
inst1->use_begin()->first != inst3) {
|
|
continue;
|
|
}
|
|
|
|
// Require the same footprint (mask order) for src and dst, to avoid having
// to remap element offsets.
|
|
BitSet srcMask(getMaskSize(inst1, Opnd_src0), 0);
|
|
BitSet dstMask(getMaskSize(inst1, Opnd_src0), 0);
|
|
src1->updateFootPrint(srcMask, true, builder);
|
|
dst1->updateFootPrint(dstMask, true, builder);
|
|
if (dstMask != srcMask) {
|
|
continue;
|
|
}
|
|
src2->updateFootPrint(srcMask, true, builder);
|
|
dst2->updateFootPrint(dstMask, true, builder);
|
|
if (dstMask != srcMask) {
|
|
continue;
|
|
}
|
|
|
|
// Check if the use can be propagated.
|
|
G4_INST *useInst = inst3->use_begin()->first;
|
|
Gen4_Operand_Number opndNum = inst3->use_begin()->second;
|
|
if (!inst3->canPropagateTo(
|
|
useInst, opndNum, MT3, !bb->isAllLaneActive(),
|
|
IsStatelessSend(useInst) &&
|
|
IsSameDstSrc(inst3->getDst(),
|
|
useInst->getSrc(opndNum - 1)))) {
|
|
continue;
|
|
}
|
|
|
|
// Propagate the define to the use
|
|
G4_INST *defInst = inst1->def_begin()->first;
|
|
G4_Operand *useSrc = useInst->getSrc(opndNum - 1);
|
|
G4_Operand *new_src_opnd = builder.createSrcRegRegion(
|
|
Mod_src_undef, src1->asSrcRegRegion()->getRegAccess(),
|
|
src1->asSrcRegRegion()->getBase(),
|
|
src1->asSrcRegRegion()->getRegOff(),
|
|
src1->asSrcRegRegion()->getSubRegOff(),
|
|
useSrc->asSrcRegRegion()->getRegion(), useSrc->getType());
|
|
useInst->setSrc(new_src_opnd, opndNum - 1);
|
|
inst1->copyDefsTo(useInst, true);
|
|
inst3->copyUsesTo(defInst, true);
|
|
|
|
inst1->removeAllDefs();
|
|
inst1->removeAllUses();
|
|
inst2->removeAllDefs();
|
|
inst2->removeAllUses();
|
|
inst3->removeAllDefs();
|
|
inst3->removeAllUses();
|
|
|
|
ii++;
|
|
bb->erase(firstIt);
|
|
bb->erase(secondIt);
|
|
bb->erase(thirdIt);
|
|
}
|
|
}
|
|
}
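// Local copy propagation: within a basic block, try to forward the source of
// a mov into all of its uses and delete the mov. A minimal sketch of the
// intended rewrite (hypothetical operands; every use must pass the checks
// below):
//   mov (8) V20<1>:d  V21<8;8,1>:d
//   add (8) V22<1>:d  V20<8;8,1>:d  V23<8;8,1>:d
// becomes
//   add (8) V22<1>:d  V21<8;8,1>:d  V23<8;8,1>:d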
|
|
|
|
void Optimizer::localCopyPropagation() {
|
|
for (G4_BB *bb : fg) {
|
|
bb->resetLocalIds();
|
|
|
|
INST_LIST_ITER ii = bb->begin(), iend(bb->end());
|
|
while (ii != iend) {
|
|
G4_INST *inst = *ii;
|
|
G4_Operand *dst = inst->getDst();
|
|
if (!dst) {
|
|
ii++;
|
|
continue;
|
|
}
|
|
|
|
builder.doConsFolding(inst);
|
|
inst = builder.doMathConsFolding(ii);
|
|
builder.doSimplification(inst);
|
|
|
|
G4_INST::MovType MT = inst->canPropagate();
|
|
// Skip super mov.
|
|
if (MT == G4_INST::SuperMov) {
|
|
ii++;
|
|
continue;
|
|
}
|
|
|
|
if (!isCopyPropProfitable(inst)) {
|
|
++ii;
|
|
continue;
|
|
}
|
|
bool canRemove = true;
|
|
|
|
// check if each use may be copy propagated.
|
|
USE_EDGE_LIST_ITER iter, iend1(inst->use_end());
|
|
for (iter = inst->use_begin(); iter != iend1; iter++) {
|
|
G4_INST *useInst = iter->first;
|
|
Gen4_Operand_Number opndNum = iter->second;
|
|
|
|
if (inst->getDst()->isDirectA0() && useInst->isSplitSend() &&
|
|
VISA_WA_CHECK(builder.getPWaTable(),
|
|
WaSendSEnableIndirectMsgDesc)) {
|
|
canRemove = false;
|
|
break;
|
|
}
|
|
|
|
if (!inst->canPropagateTo(useInst, opndNum, MT,
|
|
!bb->isAllLaneActive())) {
|
|
canRemove = false;
|
|
break;
|
|
}
|
|
|
|
// Make sure there is no lifetime.end for src0 of the move inst
|
|
INST_LIST_ITER cpIter = ii;
|
|
cpIter++;
|
|
while (*cpIter != useInst) {
|
|
// Detect patterns like:
|
|
//
|
|
// mov A, B
|
|
// ...
|
|
// lifetime.end B
|
|
// op C, A, D
|
|
//
|
|
// Because of presence of lifetime.end B, copy propagation for inst
|
|
// mov A, B
|
|
// cannot be done
|
|
|
|
if ((*cpIter)->isLifeTimeEnd()) {
|
|
// Check whether lifetime end is for same opnd
|
|
G4_Declare *lifetimeEndTopDcl =
|
|
GetTopDclFromRegRegion((*cpIter)->getSrc(0));
|
|
G4_Declare *curInstDstTopDcl =
|
|
GetTopDclFromRegRegion((*ii)->getDst());
|
|
|
|
if (lifetimeEndTopDcl == curInstDstTopDcl) {
|
|
canRemove = false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
//
|
|
// The following instructions may use the acc0 register in HWConformity;
|
|
// if any of them appear in the propagation range, it may have
|
|
// correctness issue.
|
|
// Such as in following case:
|
|
// addc
|
|
// mov V1, acc0
|
|
// mulh
|
|
// add V2, V1, V3
|
|
// Since HW conformity will replace mulh with mul + acc0 dst,
|
|
// the propagation acc0 through V1 will introduce correctness
|
|
// issue.
|
|
//
|
|
if (inst->getSrc(0)->isAccReg() &&
|
|
((*cpIter)->opcode() == G4_mul ||
|
|
(*cpIter)->opcode() == G4_mulh ||
|
|
(*cpIter)->opcode() == G4_pln ||
|
|
(*cpIter)->opcode() == G4_pseudo_sada2 ||
|
|
(*cpIter)->opcode() == G4_madw ||
|
|
(*cpIter)->opcode() == G4_subb ||
|
|
(*cpIter)->opcode() == G4_addc)) {
|
|
canRemove = false;
|
|
break;
|
|
}
|
|
|
|
cpIter++;
|
|
}
|
|
} // for uses
|
|
|
|
if (canRemove && inst->getSrc(0)->isSrcRegRegion()) {
|
|
// check for anti-dependencies for src0 of the move instruction
|
|
bool def_use_in_between = false;
|
|
|
|
G4_INST *lastUse = inst->use_front().first;
|
|
for (USE_EDGE_LIST_ITER iter = inst->use_begin(),
|
|
uend = inst->use_end();
|
|
iter != uend; ++iter) {
|
|
G4_INST *useInst = iter->first;
|
|
if (useInst->getLocalId() > lastUse->getLocalId()) {
|
|
lastUse = useInst;
|
|
}
|
|
}
|
|
|
|
INST_LIST_ITER forwardIter = ii;
|
|
forwardIter++;
|
|
INST_LIST_ITER instListEnd = bb->end();
|
|
|
|
while (!def_use_in_between && forwardIter != instListEnd &&
|
|
*forwardIter != lastUse) {
|
|
if ((*forwardIter)->isWARdep(inst)) {
|
|
def_use_in_between = true;
|
|
break;
|
|
}
|
|
forwardIter++;
|
|
}
|
|
|
|
// check if hoisting is possible
|
|
if (def_use_in_between) {
|
|
hoistUseInst(bb, inst, forwardIter, canRemove);
|
|
}
|
|
|
|
if (!canRemove) {
|
|
// Check whether the type could be propagated instead to demote
|
|
// type if possible.
|
|
propagateType(builder, bb, inst, MT);
|
|
}
|
|
}
|
|
|
|
if (!canRemove) {
|
|
ii++;
|
|
continue;
|
|
}
|
|
|
|
G4_Operand *src = inst->getSrc(0);
|
|
// do propagation
|
|
for (iter = inst->use_begin(); iter != iend1; /* empty */) {
|
|
G4_INST *useInst = (*iter).first;
|
|
Gen4_Operand_Number opndNum = (*iter).second;
|
|
G4_Operand *use = useInst->getOperand(opndNum);
|
|
G4_Type propType = useInst->getPropType(opndNum, MT, inst);
|
|
|
|
// replace use with def
|
|
if (src->isImm()) {
|
|
auto newImmVal =
|
|
G4_Imm::typecastVals(src->asImm()->getImm(), propType);
|
|
G4_Imm *newImm = builder.createImm(newImmVal, propType);
|
|
G4_SrcModifier modifier = use->asSrcRegRegion()->getModifier();
|
|
if (modifier != Mod_src_undef) {
|
|
if (IS_TYPE_FLOAT_ALL(propType)) {
|
|
if (propType == Type_DF) {
|
|
double imm = getImmValue(newImm->getDouble(), modifier);
|
|
newImm = builder.createDFImm(imm);
|
|
} else {
|
|
float imm = getImmValue(newImm->getFloat(), modifier);
|
|
newImm = builder.createImm(imm);
|
|
}
|
|
} else {
|
|
int64_t imm = getImmValue(newImm->getImm(), modifier);
|
|
newImm = builder.createImm(imm, propType);
|
|
}
|
|
}
|
|
useInst->setSrc(newImm, opndNum - 1);
|
|
} else {
|
|
if (use == NULL) {
|
|
break;
|
|
}
|
|
G4_SrcModifier new_mod = mergeModifier(src, use);
|
|
|
|
unsigned use_elsize = use->getTypeSize();
|
|
unsigned dstElSize = inst->getDst()->getTypeSize();
|
|
const RegionDesc *rd = src->asSrcRegRegion()->getRegion();
|
|
G4_Operand *new_src_opnd = NULL;
|
|
bool new_src = false;
|
|
unsigned char scale = 1, newExecSize = useInst->getExecSize();
|
|
|
|
// Compute the composed region, if one exists.
|
|
auto getComposedRegion =
|
|
[this](unsigned dStride, unsigned ex1, const RegionDesc *rd1,
|
|
unsigned ex2, const RegionDesc *rd2) -> const RegionDesc * {
|
|
// Easy cases.
|
|
if (rd1->isScalar())
|
|
return rd1;
|
|
else if (rd2->isScalar())
|
|
return rd2;
|
|
else if (dStride == 1 && rd1->isContiguous(ex1))
|
|
return rd2;
|
|
else if (dStride == 1 && rd2->isContiguous(ex2))
|
|
return rd1;
|
|
|
|
// rd1 and rd2 must be single strided. Use a non-zero
|
|
// invalid stride value as the initial value, which
|
|
// simplifies and unifies the checking.
|
|
uint16_t stride1 = 64;
|
|
if (rd1->isContiguous(ex1))
|
|
stride1 = 1;
|
|
else
|
|
rd1->isSingleNonUnitStride(ex1, stride1);
|
|
|
|
uint16_t stride2 = 64;
|
|
if (rd2->isContiguous(ex2))
|
|
stride2 = 1;
|
|
else
|
|
rd2->isSingleNonUnitStride(ex2, stride2);
|
|
|
|
// All are single strided; the composition is the product of
|
|
// strides.
|
|
if (stride1 * stride2 * dStride <= 32)
|
|
return builder.createRegionDesc(
|
|
(uint16_t)ex2, stride1 * stride2 * dStride, 1, 0);
|
|
|
|
// Should be unreachable, since the legality check
|
|
// before should reject cases that are difficult to do
|
|
// composition. Assert?
|
|
return nullptr;
|
|
};
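// Worked example of the composition above (illustrative numbers): if this
// mov writes its dst with horizontal stride dStride = 2 from a
// single-strided source with stride1 = 1, and the use reads the mov's dst
// with stride2 = 2, the use can read the mov's source directly with stride
// 1 * 2 * 2 = 4, i.e. a <4;1,0> region.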
|
|
|
|
if (MT == G4_INST::Trunc) {
|
|
G4_DstRegRegion *dst = inst->getDst();
|
|
G4_SrcRegRegion *src0 = src->asSrcRegRegion();
|
|
unsigned typeSizeRatio = src0->getTypeSize() / dst->getTypeSize();
|
|
unsigned numElt =
|
|
src0->isScalar() ? 1 : inst->getExecSize() * typeSizeRatio;
|
|
// src0 region is guaranteed to be scalar/contiguous due to
|
|
// canPropagate() check earlier
|
|
const RegionDesc *region =
|
|
src0->isScalar()
|
|
? builder.getRegionScalar()
|
|
: builder.createRegionDesc(
|
|
useInst->getExecSize(),
|
|
(uint16_t)inst->getExecSize() * typeSizeRatio,
|
|
inst->getExecSize(), (uint16_t)typeSizeRatio);
|
|
if (src0->isIndirect()) {
|
|
new_src_opnd = builder.createIndirectSrc(
|
|
new_mod, src0->getBase(), src0->getRegOff(),
|
|
src0->getSubRegOff() * typeSizeRatio, region, propType,
|
|
src0->getAddrImm());
|
|
} else {
|
|
G4_Declare *newDcl =
|
|
builder.createTempVar(numElt, inst->getDst()->getType(), Any);
|
|
newDcl->setAliasDeclare(src0->getBase()->asRegVar()->getDeclare(),
|
|
0);
|
|
|
|
new_src_opnd = builder.createSrcRegRegion(
|
|
new_mod, Direct, newDcl->getRegVar(), src0->getRegOff(),
|
|
src0->getSubRegOff() * typeSizeRatio, region, propType);
|
|
}
|
|
new_src = true;
|
|
} else if (dstElSize < use_elsize) {
|
|
// FIXME: How could this happen? Revisit later if
|
|
// NoMask is guaranteed.
|
|
// TODO!!! src is aligned to use type. -- should check this.
|
|
new_src = true;
|
|
scale = use_elsize / dstElSize;
|
|
unsigned short vs = rd->vertStride, wd = rd->width;
|
|
// packed word/byte
|
|
if (use->asSrcRegRegion()->isScalar()) {
|
|
rd = builder.getRegionScalar();
|
|
} else if (inst->isComprInst() && vs == wd) {
|
|
rd = builder.getRegionStride1();
|
|
} else {
|
|
rd = builder.createRegionDesc(vs / scale, wd / scale, 1);
|
|
}
|
|
} else if (inst->getExecSize() < useInst->getExecSize() && rd &&
|
|
use->isSrcRegRegion()) {
|
|
unsigned dStride = inst->getDst()->getHorzStride();
|
|
const RegionDesc *rd2 = use->asSrcRegRegion()->getRegion();
|
|
if (auto compRd = getComposedRegion(dStride, inst->getExecSize(),
|
|
rd, newExecSize, rd2)) {
|
|
new_src = true;
|
|
rd = compRd;
|
|
}
|
|
}
|
|
|
|
if (new_mod != Mod_src_undef || new_src) {
|
|
// For truncation case, new src operand is already built.
|
|
if (MT != G4_INST::Trunc) {
|
|
new_src_opnd = builder.createSrcRegRegion(
|
|
new_mod, src->asSrcRegRegion()->getRegAccess(),
|
|
src->asSrcRegRegion()->getBase(),
|
|
src->asSrcRegRegion()->getRegOff(),
|
|
src->asSrcRegRegion()->getSubRegOff() / scale, rd, propType);
|
|
if (src->asSrcRegRegion()->getRegAccess() != Direct) {
|
|
new_src_opnd->asSrcRegRegion()->setImmAddrOff(
|
|
src->asSrcRegRegion()->getAddrImm());
|
|
}
|
|
}
|
|
} else {
|
|
new_src_opnd = builder.duplicateOperand(src);
|
|
new_src_opnd->asSrcRegRegion()->setModifier(new_mod);
|
|
new_src_opnd->asSrcRegRegion()->setType(builder, propType);
|
|
}
|
|
useInst->setSrc(new_src_opnd, opndNum - 1);
|
|
}
|
|
|
|
iter = inst->eraseUse(iter);
|
|
// due to truncation a (partial) def of the move may no longer be a def
|
|
// of the use
|
|
inst->copyDef(useInst, Opnd_src0, opndNum, true);
|
|
|
|
builder.doConsFolding(useInst);
|
|
}
|
|
// remove decl corresponding to this def
|
|
// TODO!!! what if there is some alias to this decl?
|
|
// remove MOV inst
|
|
|
|
// remove it from the use list of its deflists
|
|
inst->removeDefUse(Opnd_src0);
|
|
|
|
INST_LIST_ITER tmp = ii;
|
|
ii++;
|
|
bb->erase(tmp);
|
|
}
|
|
}
|
|
}
|
|
|
|
void Optimizer::localInstCombine() { InstCombine(builder, fg); }
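// Fold a float compare-against-zero into its dependent sel instructions to
// form csel. A minimal sketch of the rewrite performed below (hypothetical
// operands; all legality checks are done in the loop):
//   cmp (8) (lt)f0.0  null<1>:f  V10<8;8,1>:f  0x0:f
//   (f0.0) sel (8)    V13<1>:f   V11<8;8,1>:f  V12<8;8,1>:f
// becomes
//   csel (8) (lt)f0.0 V13<1>:f   V11<8;8,1>:f  V12<8;8,1>:f  V10<8;8,1>:f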
|
|
|
|
void Optimizer::cselPeepHoleOpt() {
|
|
if (!builder.hasCondModForTernary()) {
|
|
return;
|
|
}
|
|
G4_SrcRegRegion *cmpSrc0 = NULL;
|
|
G4_Operand *cmpSrc1 = NULL;
|
|
for (G4_BB *bb : fg) {
|
|
INST_LIST_ITER ii;
|
|
INST_LIST_ITER nextIter;
|
|
INST_LIST_ITER iiEnd;
|
|
if (bb->empty()) {
|
|
continue;
|
|
}
|
|
|
|
bb->resetLocalIds();
|
|
ii = bb->begin();
|
|
iiEnd = bb->end();
|
|
|
|
nextIter = ii;
|
|
|
|
do {
|
|
ii = nextIter;
|
|
++nextIter;
|
|
G4_INST *inst = *ii;
|
|
G4_opcode op = inst->opcode();
|
|
bool hasGRFDst = inst->getDst() && !inst->hasNULLDst();
|
|
/*
|
|
csel doesn't have the same semantics for destination
|
|
as cmp instruction
|
|
*/
|
|
if (op != G4_cmp || hasGRFDst || inst->getPredicate() || inst->isDead() ||
|
|
!inst->getSrc(0)->isSrcRegRegion()) {
|
|
continue;
|
|
}
|
|
|
|
cmpSrc0 = inst->getSrc(0)->asSrcRegRegion();
|
|
cmpSrc1 = inst->getSrc(1);
|
|
|
|
G4_CondMod *cModifier = inst->getCondMod();
|
|
|
|
// check if dst is global
|
|
if (fg.globalOpndHT.isOpndGlobal(cModifier)) {
|
|
continue;
|
|
}
|
|
|
|
/*
|
|
csel instruction implicitly compares src2 to 0
|
|
only supports floats
|
|
no predication
|
|
*/
|
|
|
|
if (!cmpSrc1->isImm() ||
|
|
(cmpSrc1->asImm()->getImm() != 0 &&
|
|
(cmpSrc1->asImm()->getType() != Type_F ||
|
|
cmpSrc1->asImm()->getFloat() != -0.0f)) ||
|
|
cmpSrc0->getType() != Type_F || cmpSrc0->isImm())
|
|
continue;
|
|
|
|
if (inst->getSrc(0)->isRelocImm() || inst->getSrc(1)->isRelocImm()) {
|
|
continue;
|
|
}
|
|
|
|
// Only allow single strided regions.
|
|
uint16_t src0Stride = 0;
|
|
if (!cmpSrc0->getRegion()->isSingleStride(inst->getExecSize(),
|
|
src0Stride))
|
|
continue;
|
|
|
|
/*
|
|
We could scan up to the use instruction to see if src0 is modified,
but this should suffice for the general case. Revisit if we find we
are missing opportunities.
|
|
*/
|
|
|
|
if (inst->useEmpty())
|
|
continue;
|
|
|
|
int execSize = inst->getExecSize();
|
|
if (execSize == 2)
|
|
continue;
|
|
|
|
USE_EDGE_LIST_ITER iter = inst->use_begin();
|
|
USE_EDGE_LIST_ITER endUseList = inst->use_end();
|
|
|
|
bool canOpt = true;
|
|
int maxInstID = 0;
|
|
|
|
for (; iter != endUseList; ++iter) {
|
|
G4_INST *useInst = (*iter).first;
|
|
|
|
if (useInst->getNumSrc() != 2) {
|
|
canOpt = false;
|
|
break;
|
|
}
|
|
|
|
maxInstID = std::max(useInst->getLocalId(), maxInstID);
|
|
G4_Operand *dstUse = useInst->getDst();
|
|
G4_Operand *selSrc0 = useInst->getSrc(0);
|
|
G4_Operand *selSrc1 = useInst->getSrc(1);
|
|
|
|
if (useInst->opcode() != G4_sel || selSrc0->isImm() ||
|
|
selSrc1->isImm() || selSrc0->getType() != Type_F ||
|
|
selSrc1->getType() != Type_F || dstUse->getType() != Type_F ||
|
|
// 3-src restriction
|
|
!builder.tryToAlignOperand(dstUse, 16) ||
|
|
!builder.tryToAlignOperand(selSrc0, 16) ||
|
|
!builder.tryToAlignOperand(selSrc1, 16)) {
|
|
canOpt = false;
|
|
break;
|
|
}
|
|
|
|
// if inst is NoMask use inst can be anything.
|
|
// if inst is not NoMask then useInst needs to be subset of inst.
|
|
if (!(inst->getMaskOption() & InstOpt_WriteEnable)) {
|
|
auto isInclusive = [](int lb1, int rb1, int lb2, int rb2) {
|
|
return lb1 <= lb2 && rb1 >= rb2;
|
|
};
|
|
if (!isInclusive(inst->getMaskOffset(),
|
|
inst->getMaskOffset() + inst->getExecSize(),
|
|
useInst->getMaskOffset(),
|
|
useInst->getMaskOffset() + useInst->getExecSize())) {
|
|
canOpt = false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
uint8_t numPredDefs = 0;
|
|
DEF_EDGE_LIST_ITER useIter = useInst->def_begin();
|
|
DEF_EDGE_LIST_ITER iterEnd = useInst->def_end();
|
|
|
|
// Just in case some weird code is generated with partial writes to
|
|
// predicate
|
|
for (; useIter != iterEnd; ++useIter) {
|
|
if ((*useIter).second == Opnd_pred)
|
|
++numPredDefs;
|
|
|
|
// Check whether pseudo_kill for dst exists between cmp and sel
|
|
// cmp.xx.fx.0 (8) ... src0 $0
|
|
// ...
|
|
// pseudo_kill dst
|
|
// (f0) sel (8) dst src1 src2
|
|
//
|
|
// These two cannot be merged because pseudo_kill is in between them
|
|
|
|
INST_LIST_ITER cselOptIt = ii;
|
|
cselOptIt++;
|
|
while ((*cselOptIt) != useInst) {
|
|
if ((*cselOptIt)->isLifeTimeEnd()) {
|
|
if (GetTopDclFromRegRegion((*cselOptIt)->getDst()) ==
|
|
GetTopDclFromRegRegion(useInst->getDst())) {
|
|
canOpt = false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
cselOptIt++;
|
|
}
|
|
}
|
|
|
|
if (numPredDefs > 1) {
|
|
canOpt = false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
INST_LIST_ITER tempInstIter = nextIter;
|
|
// Explicitly check that cmp src0 is not overwritten or partially written to
|
|
// between cmp and sel.
|
|
for (; tempInstIter != iiEnd; ++tempInstIter) {
|
|
G4_INST *tempInst = *tempInstIter;
|
|
|
|
if (tempInst->getLocalId() == maxInstID) {
|
|
break;
|
|
}
|
|
|
|
if (!tempInst->getDst())
|
|
continue;
|
|
|
|
// also checks for indirect; will return interference.
|
|
G4_CmpRelation rel =
|
|
tempInst->getDst()->compareOperand(cmpSrc0, builder);
|
|
if (rel != Rel_disjoint) {
|
|
canOpt = false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (canOpt) {
|
|
for (auto iter = inst->use_begin(); iter != inst->use_end();
|
|
/*empty*/) {
|
|
G4_INST *useInst = (*iter).first;
|
|
G4_CondMod *mod = inst->getCondMod();
|
|
useInst->setOpcode(G4_csel);
|
|
useInst->setSrc(builder.duplicateOperand(inst->getSrc(0)), 2);
|
|
useInst->setCondMod(builder.duplicateOperand(mod));
|
|
useInst->setPredicate(NULL);
|
|
|
|
G4_SrcRegRegion *opnd2 = useInst->getSrc(2)->asSrcRegRegion();
|
|
|
|
if (!opnd2->isScalar() &&
|
|
inst->getExecSize() > useInst->getExecSize()) {
|
|
// The earlier check establishes that useInst's mask is equivalent to, or a
// subset of, the cmp instruction's mask.
|
|
/*
|
|
case which considering:
|
|
cmp (16)
|
|
sel (8)
|
|
*/
|
|
if (useInst->getMaskOffset() != inst->getMaskOffset()) {
|
|
// check elsewhere guarantees this is float.
|
|
G4_Type type = opnd2->getType();
|
|
unsigned short typeSize = TypeSize(type);
|
|
unsigned offset =
|
|
opnd2->getRegOff() * kernel.numEltPerGRF<Type_UB>() +
|
|
opnd2->getSubRegOff() * typeSize;
|
|
offset += useInst->getExecSize() * src0Stride * typeSize;
|
|
|
|
auto newSrc2 = builder.createSrcRegRegion(
|
|
opnd2->getModifier(), Direct, opnd2->getBase(),
|
|
offset / kernel.numEltPerGRF<Type_UB>(),
|
|
(offset % kernel.numEltPerGRF<Type_UB>()) / typeSize,
|
|
opnd2->getRegion(), opnd2->getType());
|
|
useInst->setSrc(newSrc2, 2);
|
|
}
|
|
}
|
|
//
|
|
// Modifying useDef links
|
|
//
|
|
// cmp.xx.f0.0 (8) ... src2 $0 <- inst (to be deleted)
|
|
// (f0) sel (8) dst src0 src1 <- useInst
|
|
// =>
|
|
// csel.xx.f0.0 (8) dst src0 src1 src2
|
|
|
|
// useInst's predicate becomes NULL.
|
|
iter = inst->eraseUse(iter);
|
|
|
|
// inst's src0 becomes useInst's src2.
|
|
inst->copyDef(useInst, Opnd_src0, Opnd_src2);
|
|
}
|
|
vISA_ASSERT(inst->useEmpty(), "all predicate uses are removed.");
|
|
inst->removeAllDefs();
|
|
bb->erase(ii);
|
|
}
|
|
} while (nextIter != iiEnd);
|
|
}
|
|
}
|
|
|
|
// helper function to convert
|
|
// and/or p3 p1 p2
|
|
// ==>
|
|
// (p1) sel t1 1 0
|
|
// (p2) sel t2 1 0
|
|
// and/or.nz.p3 t1 t2
|
|
// if the original inst is NoMask and Q1/H1, we do
|
|
// and/or p3 p1 p2
|
|
// ==>
|
|
// and/or (1) p3 p1 p2
|
|
// p3's type is uw for simd8/16 and ud for simd32
|
|
static void expandPseudoLogic(IR_Builder &builder, G4_BB *bb,
|
|
INST_LIST_ITER &iter)
|
|
|
|
{
|
|
G4_INST *inst = *iter;
|
|
vISA_ASSERT(inst->isPseudoLogic(),
|
|
"inst must be either pseudo_and/or/xor/not");
|
|
INST_LIST_ITER newIter = iter;
|
|
|
|
bool isFirstInst = iter == bb->begin();
|
|
if (!isFirstInst) {
|
|
--iter;
|
|
}
|
|
|
|
auto canFoldOnSIMD1 = [=, &builder]() {
|
|
if (inst->isWriteEnableInst() &&
|
|
(inst->getMaskOffset() == 0 || inst->getMaskOffset() == 16) &&
|
|
// we can't do this for simd8 inst in simd16 kernels as it will
|
|
// overwrite upper flag bits
|
|
(inst->getExecSize() > g4::SIMD8 ||
|
|
inst->getExecSize() == builder.kernel.getSimdSize())) {
|
|
return true;
|
|
}
|
|
|
|
// inst writes the whole flag.
|
|
if (inst->isWriteEnableInst() && inst->getMaskOffset() == 0) {
|
|
auto Dcl = inst->getDst()->getTopDcl();
|
|
if (Dcl && Dcl->getNumberFlagElements() <= inst->getExecSize()) {
|
|
return true;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
};
|
|
|
|
if (canFoldOnSIMD1()) {
|
|
G4_opcode newOpcode = G4_illegal;
|
|
if (inst->getMaskOffset() == 16) {
|
|
vISA_ASSERT(inst->getExecSize() == g4::SIMD16,
|
|
"Only support simd16 pseudo-logic instructions");
|
|
// we have to use the upper flag bits (.1) instead
|
|
vISA_ASSERT(inst->getSrc(0)->isSrcRegRegion() &&
|
|
inst->getSrc(0)->isFlag(),
|
|
"expect src0 to be flag");
|
|
auto newSrc0 = builder.createSrcWithNewSubRegOff(
|
|
inst->getSrc(0)->asSrcRegRegion(), 1);
|
|
inst->setSrc(newSrc0, 0);
|
|
if (inst->getSrc(1) != nullptr) {
|
|
vISA_ASSERT(inst->getSrc(1)->isSrcRegRegion() &&
|
|
inst->getSrc(1)->isFlag(),
|
|
"expect src1 to be flag");
|
|
auto newSrc1 = builder.createSrcWithNewSubRegOff(
|
|
inst->getSrc(1)->asSrcRegRegion(), 1);
|
|
inst->setSrc(newSrc1, 1);
|
|
}
|
|
auto newDst = builder.createDstWithNewSubRegOff(inst->getDst(), 1);
|
|
inst->setDest(newDst);
|
|
}
|
|
|
|
switch (inst->opcode()) {
|
|
case G4_pseudo_and:
|
|
newOpcode = G4_and;
|
|
break;
|
|
case G4_pseudo_or:
|
|
newOpcode = G4_or;
|
|
break;
|
|
case G4_pseudo_xor:
|
|
newOpcode = G4_xor;
|
|
break;
|
|
case G4_pseudo_not:
|
|
newOpcode = G4_not;
|
|
break;
|
|
default:
|
|
vISA_ASSERT_UNREACHABLE(
|
|
"unexpected opcode for pseudo-logic instructions");
|
|
}
|
|
|
|
inst->setOpcode(newOpcode);
|
|
inst->setExecSize(g4::SIMD1);
|
|
} else {
|
|
G4_ExecSize tmpSize = inst->getExecSize();
|
|
auto LowerOpnd = [=, &builder](Gen4_Operand_Number opNum,
|
|
G4_INST *&SI) -> G4_Operand * {
|
|
G4_Operand *Opnd = inst->getOperand(opNum);
|
|
if (Opnd) {
|
|
auto src = Opnd->asSrcRegRegion();
|
|
auto newDcl = builder.createTempVar(tmpSize, Type_UW, Any);
|
|
auto newDst = builder.createDst(newDcl->getRegVar(), 0, 0, 1, Type_UW);
|
|
auto newPred = builder.createPredicate(PredState_Plus, src->getBase(),
|
|
src->getSubRegOff());
|
|
auto newSel = builder.createInternalInst(
|
|
newPred, G4_sel, nullptr, g4::NOSAT, tmpSize, newDst,
|
|
builder.createImm(1, Type_UW), builder.createImm(0, Type_UW),
|
|
inst->getOption());
|
|
inst->transferDef(newSel, opNum, Gen4_Operand_Number::Opnd_pred);
|
|
bb->insertBefore(newIter, newSel);
|
|
SI = newSel;
|
|
const RegionDesc *rd = tmpSize == g4::SIMD1
|
|
? builder.getRegionScalar()
|
|
: builder.getRegionStride1();
|
|
return builder.createSrcRegRegion(newDcl, rd);
|
|
}
|
|
return Opnd;
|
|
};
|
|
|
|
G4_INST *Sel0 = nullptr;
|
|
G4_Operand *logicSrc0 = LowerOpnd(Gen4_Operand_Number::Opnd_src0, Sel0);
|
|
|
|
G4_INST *Sel1 = nullptr;
|
|
G4_Operand *logicSrc1 = LowerOpnd(Gen4_Operand_Number::Opnd_src1, Sel1);
|
|
|
|
if (logicSrc1 == nullptr) {
|
|
vISA_ASSERT(inst->opcode() == G4_pseudo_not,
|
|
"Must be a pseudo-not instruction");
|
|
// for not P1 P0
|
|
// we generate
|
|
// (P0) sel V1 1 0
|
|
// xor.P1 null V1 1
|
|
// so that the upper bits would stay zero
|
|
logicSrc1 = builder.createImm(1, Type_UW);
|
|
}
|
|
|
|
auto nullDst = builder.createNullDst(Type_UW);
|
|
auto newCondMod =
|
|
builder.createCondMod(Mod_nz, inst->getDst()->getBase()->asRegVar(), 0);
|
|
G4_opcode newOpcode = G4_illegal;
|
|
switch (inst->opcode()) {
|
|
case G4_pseudo_and:
|
|
newOpcode = G4_and;
|
|
break;
|
|
case G4_pseudo_or:
|
|
newOpcode = G4_or;
|
|
break;
|
|
case G4_pseudo_xor:
|
|
newOpcode = G4_xor;
|
|
break;
|
|
case G4_pseudo_not:
|
|
// see comment above
|
|
newOpcode = G4_xor;
|
|
break;
|
|
default:
|
|
vISA_ASSERT_UNREACHABLE(
|
|
"unexpected opcode for pseudo-logic instructions");
|
|
}
|
|
|
|
G4_INST *newLogicOp = builder.createInternalInst(
|
|
NULL, newOpcode, newCondMod, g4::NOSAT, tmpSize, nullDst, logicSrc0,
|
|
logicSrc1,
|
|
inst->getOption() // keep the original instruction emask
|
|
);
|
|
|
|
// Fix def-use
|
|
if (Sel0 != nullptr) {
|
|
Sel0->addDefUse(newLogicOp, Gen4_Operand_Number::Opnd_src0);
|
|
}
|
|
if (Sel1 != nullptr) {
|
|
Sel1->addDefUse(newLogicOp, Gen4_Operand_Number::Opnd_src1);
|
|
}
|
|
inst->transferUse(newLogicOp);
|
|
bb->insertBefore(newIter, newLogicOp);
|
|
bb->erase(newIter);
|
|
}
|
|
|
|
// iter either points to the start or the first expanded instruction. Caller
|
|
// will advance it to the previous instruction
|
|
if (isFirstInst) {
|
|
iter = bb->begin();
|
|
} else {
|
|
++iter;
|
|
}
|
|
}
|
|
|
|
// mov(1) P0 Imm(NoMask)
|
|
// (P0) mov(esize) r[A0, 0] src0
// ==>
// smov(esize) r[A0, 0] src0 Imm
|
|
//
|
|
// esize is either 8 or 16
|
|
bool Optimizer::createSmov(G4_BB *bb, G4_INST *flagMove, G4_INST *next_inst) {
|
|
if ((next_inst->getExecSize() != g4::SIMD8 &&
|
|
next_inst->getExecSize() != g4::SIMD16) ||
|
|
next_inst->getPredicate() == NULL || next_inst->getCondMod() != NULL ||
|
|
next_inst->getSaturate() == true ||
|
|
next_inst->getDst()->getRegAccess() == Direct ||
|
|
next_inst->getDst()->getTypeSize() == 1 ||
|
|
next_inst->getSrc(0)->getTypeSize() == 1 ||
|
|
(builder.getPlatform() < GENX_SKL && builder.getPlatform() != GENX_BDW) ||
|
|
next_inst->getDst()->getTypeSize() <
|
|
next_inst->getSrc(0)->getTypeSize()) {
|
|
return false;
|
|
}
|
|
|
|
if (next_inst->getSrc(0)->isSrcRegRegion() &&
|
|
next_inst->getSrc(0)->asSrcRegRegion()->getModifier() != Mod_src_undef) {
|
|
return false;
|
|
}
|
|
|
|
if (flagMove->use_size() != 1 || flagMove->use_front().first != next_inst) {
|
|
return false;
|
|
}
|
|
|
|
G4_CmpRelation rel =
|
|
flagMove->getDst()->compareOperand(next_inst->getPredicate(), builder);
|
|
if (rel != Rel_eq && !(rel == Rel_gt && next_inst->getMaskOffset() == 0)) {
|
|
return false;
|
|
}
|
|
|
|
if (kernel.getKernelType() == VISA_3D || !bb->isAllLaneActive()) {
|
|
if (!flagMove->isWriteEnableInst()) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
next_inst->setOpcode(G4_smov);
|
|
next_inst->setSrc(flagMove->getSrc(0), 1);
|
|
next_inst->setPredicate(nullptr);
|
|
|
|
flagMove->removeUseOfInst();
|
|
return true;
|
|
}
|
|
|
|
// Returns true if *iter has a use that is a cmp and we can fold that cmp
// into *iter as a conditional modifier. The cmp instruction is deleted as
// part of folding. Note that iter may be modified to point to the next inst
// if we decide to sink *iter to where the cmp was to work around dependencies.
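// Illustrative example of the folding (hypothetical operands):
//   add (8)          V10<1>:d   V11<8;8,1>:d  V12<8;8,1>:d
//   cmp (8) (gt)f0.0 null<1>:d  V10<8;8,1>:d  0:d
// becomes
//   add (8) (gt)f0.0 V10<1>:d   V11<8;8,1>:d  V12<8;8,1>:d
// (and if V10 had no other use, the add's dst may further be replaced with
// null).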
|
|
bool Optimizer::foldCmpToCondMod(G4_BB *bb, INST_LIST_ITER &iter) {
|
|
// find a cmp that uses inst dst
|
|
G4_INST *inst = *iter;
|
|
G4_INST *cmpInst = nullptr;
|
|
bool canFold = false;
|
|
|
|
if (inst->getCondMod()) {
|
|
return false;
|
|
}
|
|
|
|
for (auto UI = inst->use_begin(), UE = inst->use_end(); UI != UE; ++UI) {
|
|
cmpInst = (*UI).first;
|
|
|
|
// cmp instruction must be of the form
|
|
// cmp [<op> P0] null src 0
|
|
// where src is singly defined by inst
|
|
if (cmpInst->opcode() == G4_cmp &&
|
|
cmpInst->getExecSize() == inst->getExecSize() &&
|
|
cmpInst->hasNULLDst() && cmpInst->getSrc(0)->isSrcRegRegion() &&
|
|
cmpInst->getSrc(0)->asSrcRegRegion()->getModifier() == Mod_src_undef &&
|
|
cmpInst->def_size() == 1 && !cmpInst->getPredicate() &&
|
|
cmpInst->getSrc(1)->isImm() && cmpInst->getSrc(1)->asImm()->isZero()) {
|
|
canFold = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (!canFold) {
|
|
return false;
|
|
}
|
|
|
|
// floating point cmp may flush denorms to zero, but mov may not.
|
|
//
|
|
// mov(1|M0) (lt)f0.0 r6.2<1>:f r123.2<0;1,0>:f
|
|
// may not be the same as
|
|
// mov(1|M0) r6.2<1>:f r123.2<0;1,0>:f
|
|
// cmp(1|M0) (lt)f0.0 null<1>:f 6.2<0;1,0>:f 0x0:f
|
|
// for denorm inputs.
|
|
if (inst->opcode() == G4_mov &&
|
|
IS_TYPE_FLOAT_ALL(cmpInst->getSrc(0)->getType())) {
|
|
return false;
|
|
}
|
|
|
|
// If dst is B and exectype is Q, dst is misaligned. This misaligned inst
|
|
// will be split in HWConformity. If a condMod is present, it would be
|
|
// harder to split and may result in wrong code. Here, simply disable folding.
|
|
//
|
|
// For example,
|
|
// (W) mov (1) conv_i(0,0)<1>:b V0040(0,0)<0;1,0>:q
|
|
// (W) cmp(1) (eq)P01.0 null<1>:w conv_i(0, 0)<0;1,0>:b 0:w
|
|
//
|
|
int extypesize = 0;
|
|
(void)inst->getOpExecType(extypesize);
|
|
if (inst->getDst()->getTypeSize() == 1 && extypesize == 8) {
|
|
return false;
|
|
}
|
|
|
|
auto cmpIter = std::find(iter, bb->end(), cmpInst);
|
|
|
|
// check if cmp instruction is close enough
|
|
constexpr int DISTANCE = 60;
|
|
if (std::distance(iter, cmpIter) > DISTANCE) {
|
|
return false;
|
|
}
|
|
|
|
auto isSupportedCondMod = [](G4_CondModifier mod) {
|
|
return mod == Mod_g || mod == Mod_ge || mod == Mod_l || mod == Mod_le ||
|
|
mod == Mod_e || mod == Mod_ne;
|
|
};
|
|
G4_CondModifier mod = cmpInst->getCondMod()->getMod();
|
|
if (!isSupportedCondMod(mod)) {
|
|
return false;
|
|
}
|
|
|
|
// and/or/xor/not opcodes support only the conditional modifiers
|
|
// .e/.z or .ne.nz
|
|
auto opcode = inst->opcode();
|
|
bool isLogicOp = (opcode == G4_xor || opcode == G4_or || opcode == G4_and ||
|
|
opcode == G4_not);
|
|
bool isSupportedCondModForLogicOp =
|
|
(mod == Mod_e || mod == Mod_z || mod == Mod_ne || mod == Mod_nz);
|
|
if (isLogicOp && !isSupportedCondModForLogicOp)
|
|
return false;
|
|
|
|
if (kernel.getKernelType() == VISA_3D || !bb->isAllLaneActive()) {
|
|
// Make sure masks of both instructions are same
|
|
if (inst->getMaskOption() != cmpInst->getMaskOption()) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
auto getUnsignedTy = [](G4_Type Ty) {
|
|
switch (Ty) {
|
|
case Type_D:
|
|
return Type_UD;
|
|
case Type_W:
|
|
return Type_UW;
|
|
case Type_B:
|
|
return Type_UB;
|
|
case Type_Q:
|
|
return Type_UQ;
|
|
case Type_V:
|
|
return Type_UV;
|
|
default:
|
|
break;
|
|
}
|
|
return Ty;
|
|
};
|
|
|
|
G4_Type T1 = inst->getDst()->getType();
|
|
G4_Type T2 = cmpInst->getSrc(0)->getType();
|
|
if (getUnsignedTy(T1) != getUnsignedTy(T2)) {
|
|
return false;
|
|
}
|
|
if (!isSupportedCondModForLogicOp && T1 != T2) {
|
|
// If dst signedness of inst is not same as cmp src0, then only z/nz
|
|
// conditions can be evaluated correctly.
|
|
//
|
|
// If inst is a type-conversion mov then it's incorrect to use cmp src0
|
|
// type as mov dst type.
|
|
//
|
|
// mov A:d B:f // f->d mov
|
|
// cmp.gt P1 A:ud 0x0
|
|
//
|
|
// When folding cmp in the mov, we must preserve mov's dst type :d.
|
|
// Otherwise type-conversion semantics change which can lead to wrong
|
|
// result if f->d yields negative result.
|
|
//
|
|
// But if cmp used cmp.z/nz then folding is legal.
|
|
bool isDstSigned = IS_SIGNED_INT(T1);
|
|
bool isDstUnsigned = IS_UNSIGNED_INT(T1);
|
|
bool isSrcSigned = IS_SIGNED_INT(T2);
|
|
bool isSrcUnsigned = IS_UNSIGNED_INT(T2);
|
|
if (!(isDstSigned && isSrcSigned) && !(isDstUnsigned && isSrcUnsigned))
|
|
return false;
|
|
}
|
|
|
|
// Skip if the dst needs saturating but it's used as different sign.
|
|
if (inst->getSaturate() && T1 != T2) {
|
|
return false;
|
|
}
|
|
|
|
if (chkBwdOutputHazard(iter, cmpIter)) {
|
|
return false;
|
|
}
|
|
|
|
G4_Declare *dstDcl = GetTopDclFromRegRegion(inst->getDst());
|
|
if (dstDcl->getAddressed() && chkBwdWAWdep(inst, cmpIter)) {
|
|
return false;
|
|
}
|
|
|
|
auto isSafeToSink = [this](INST_LIST_ITER defIter, INST_LIST_ITER beforeIter,
|
|
int maxDist) {
|
|
G4_INST *inst = *defIter;
|
|
int dist = 0;
|
|
for (auto it = std::next(defIter); it != beforeIter; ++it) {
|
|
if (dist++ >= maxDist)
|
|
return false;
|
|
if (inst->isWAWdep(*it) || inst->isRAWdep(*it) || inst->isWARdep(*it))
|
|
return false;
|
|
if (!checkLifetime(inst, *it))
|
|
return false;
|
|
if (inst->isAccSrcInst() && builder.hasMacMacl() &&
|
|
(*it)->opcode() == G4_mul && IS_DTYPE((*it)->getSrc(0)->getType()) &&
|
|
IS_DTYPE((*it)->getSrc(1)->getType())) {
|
|
// Do not sink instructions with explicit ACC src over mul
|
|
// instructions as mul can be changed to:
|
|
// mul (8) acc0.0<1>:d src0:d src1:w
|
|
// mach (8) dst:d src0:d src1:d
|
|
// see HWConformity::generateMacl()
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
};
|
|
|
|
// Merge two instructions
|
|
// If legal, use the cmp location as new insert position.
|
|
bool sinkInst = false;
|
|
|
|
if (inst->getDst()->compareOperand(cmpInst->getSrc(0), builder) == Rel_eq) {
|
|
if (inst->use_size() == 1) {
|
|
// see if we can replace dst with null
|
|
if (inst->supportsNullDst() &&
|
|
!fg.globalOpndHT.isOpndGlobal(inst->getDst())) {
|
|
inst->setDest(builder.createDst(builder.phyregpool.getNullReg(), 0, 0,
|
|
inst->getDst()->getHorzStride(),
|
|
inst->getDst()->getType()));
|
|
}
|
|
// Check if it is safe to sink inst right before cmp inst, which lowers
|
|
// flag pressure in general.
|
|
const int MAX_DISTANCE = 20;
|
|
sinkInst = isSafeToSink(iter, cmpIter, MAX_DISTANCE);
|
|
}
|
|
inst->setCondMod(cmpInst->getCondMod());
|
|
inst->setOptions((inst->getOption() & ~InstOpt_Masks) |
|
|
(cmpInst->getMaskOption()));
|
|
// The sign of dst should follow its use instead of its
|
|
// def. The latter is meaningless given how the hardware works.
|
|
auto honorSignedness = [](G4_CondModifier mod) {
|
|
switch (mod) {
|
|
case Mod_g:
|
|
case Mod_ge:
|
|
case Mod_l:
|
|
case Mod_le:
|
|
return true;
|
|
default:
|
|
break;
|
|
}
|
|
return false;
|
|
};
|
|
if (honorSignedness(inst->getCondMod()->getMod()))
|
|
inst->getDst()->setType(builder, T2);
|
|
|
|
// update def-use
|
|
// since cmp is deleted, we have to
|
|
// -- transfer cmp's use to inst
|
|
// -- remove cmp from its definition's use list
|
|
cmpInst->transferUse(inst, true);
|
|
cmpInst->removeUseOfInst();
|
|
if (!sinkInst) {
|
|
bb->erase(cmpIter);
|
|
} else {
|
|
// Before and <- ii
|
|
// cmp <- next_iter
|
|
// After cmp <- ii
|
|
// and <- next
|
|
std::iter_swap(iter, cmpIter);
|
|
auto nextii = std::next(iter);
|
|
bb->erase(iter);
|
|
iter = nextii;
|
|
}
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// Return true : folding cmp and sel is performed
|
|
// false : no folding is performed.
|
|
bool Optimizer::foldCmpSel(G4_BB *BB, G4_INST *selInst,
|
|
INST_LIST_ITER &selInst_II) {
|
|
vISA_ASSERT(
|
|
(selInst->opcode() == G4_sel && selInst->getPredicate() &&
|
|
selInst->getCondMod() == NULL),
|
|
"foldCmpSel: Inst should be a sel with predicate and without cmod.");
|
|
G4_Predicate *pred = selInst->getPredicate();
|
|
|
|
// global predicates are not eligible since folding the cmp removes the def
|
|
if (fg.globalOpndHT.isOpndGlobal(pred)) {
|
|
return false;
|
|
}
|
|
|
|
// Needs to find cmp instruction that defines predicate of sel. The cmp is
|
|
// not necessarily right before the sel. To be able to fold the cmp to the
|
|
// sel, we have to check if the cmp can be moved right before the sel. If not,
|
|
// no folding is performed.
|
|
G4_INST *cmpInst = nullptr;
|
|
for (auto DI = selInst->def_begin(), DE = selInst->def_end(); DI != DE;
|
|
++DI) {
|
|
if ((*DI).second == Opnd_pred) {
|
|
if (cmpInst) { // only handle single def.
|
|
cmpInst = nullptr;
|
|
break;
|
|
}
|
|
cmpInst = (*DI).first;
|
|
if (cmpInst && cmpInst->opcode() != G4_cmp) {
|
|
cmpInst = nullptr;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
if (!cmpInst) {
|
|
// G4_cmp is not found, skip optimization.
|
|
return false;
|
|
}
|
|
|
|
// Do a fast check first. Note that sel w/ cmod
|
|
// does not allow predication. So, we will give up if cmp has predicate.
|
|
bool isSubExecSize = (selInst->getExecSize() < cmpInst->getExecSize());
|
|
bool isSameExecSize = (selInst->getExecSize() == cmpInst->getExecSize());
|
|
if ((!isSameExecSize && !isSubExecSize) ||
|
|
(cmpInst->getDst() && !cmpInst->hasNULLDst()) ||
|
|
cmpInst->use_size() != 1 || cmpInst->getPredicate() != nullptr) {
|
|
return false;
|
|
}
|
|
|
|
// Check if two source operands have the same type and value.
|
|
auto IsEqual = [](G4_Operand *opnd1, G4_Operand *opnd2,
|
|
const IR_Builder &builder) {
|
|
if (opnd1->isImm() && opnd2->isImm())
|
|
return opnd1->asImm()->isEqualTo(opnd2->asImm());
|
|
if (opnd1->compareOperand(opnd2, builder) != Rel_eq)
|
|
return false;
|
|
// footprint does not imply equality.
|
|
// (1) region difference: r10.0<1;4,2>:f vs r10.0<8;8,1>
|
|
// (2) source modifier: r10.0<0;1,0>:f vs -r10.0<0;1,0>
|
|
//
|
|
if (opnd1->isSrcRegRegion() && opnd2->isSrcRegRegion())
|
|
return opnd1->asSrcRegRegion()->sameSrcRegRegion(
|
|
*opnd2->asSrcRegRegion());
|
|
|
|
// Common cases should be covered.
|
|
return false;
|
|
};
|
|
|
|
G4_Operand *sel_src0 = selInst->getSrc(0);
|
|
G4_Operand *sel_src1 = selInst->getSrc(1);
|
|
G4_Operand *cmp_src0 = cmpInst->getSrc(0);
|
|
G4_Operand *cmp_src1 = cmpInst->getSrc(1);
|
|
|
|
// Normalize SEL's predicate state to Plus.
|
|
if (G4_PredState::PredState_Minus == pred->getState()) {
|
|
selInst->swapSrc(0, 1);
|
|
selInst->swapDefUse();
|
|
pred->setState(G4_PredState::PredState_Plus);
|
|
std::swap(sel_src0, sel_src1);
|
|
}
|
|
|
|
// source operands of SEL and CMP are reversed or not.
|
|
bool reversed = false;
|
|
G4_CondMod *condMod = cmpInst->getCondMod();
|
|
|
|
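// Folding is legal only when the sel's predicate matches the cmp's condMod
// (same exec size, or a subset when the sel has a smaller exec size), the
// emasks are compatible, and the sel sources equal the cmp sources either
// directly or swapped; swapping is rejected for float compares unless
// finite-math-only is set, since NaN operands change a reversed condition.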
auto canFold = [=, &reversed](const IR_Builder &builder) {
|
|
G4_CmpRelation condRel =
|
|
pred->asPredicate()->compareOperand(condMod, builder);
|
|
if (!(condRel == Rel_eq && isSameExecSize) &&
|
|
!(condRel == Rel_lt && isSubExecSize))
|
|
return false;
|
|
|
|
if (!cmpInst->isWriteEnableInst() &&
|
|
cmpInst->getMaskOption() != selInst->getMaskOption())
|
|
return false;
|
|
|
|
// P = cmp.gt A, B
|
|
// C = (+P) sel A, B => C = sel.gt A, B
|
|
//
|
|
// P = cmp.ne A, B
|
|
// C = (+P) sel A, B => C = sel.ne A, B
|
|
//
|
|
if (IsEqual(sel_src0, cmp_src0, builder) &&
|
|
IsEqual(sel_src1, cmp_src1, builder))
|
|
return true;
|
|
|
|
// Sel operands are reversed.
|
|
// P = cmp.gt A, B
|
|
// C = (+P) sel B, A => C = sel.le B, A
|
|
//
|
|
// P = cmp.ne A, B
|
|
// C = (+P) sel B, A => C = sel.ne B, A
|
|
//
|
|
if (IsEqual(sel_src0, cmp_src1, builder) &&
|
|
IsEqual(sel_src1, cmp_src0, builder)) {
|
|
reversed = true;
|
|
// In case of float cmp with possible NaN operands, should not swap
|
|
// operands.
|
|
if (builder.getOption(vISA_finiteMathOnly) ||
|
|
!IS_TYPE_FLOAT_ALL(cmp_src0->getType())) {
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
};
|
|
|
|
if (!canFold(builder)) {
|
|
return false;
|
|
}
|
|
|
|
// check if cmpInst can be legally moved right before selInst;
|
|
// if it cannot, we cannot fold cmp to sel.
|
|
INST_LIST_ITER cmpInst_II = selInst_II;
|
|
while (cmpInst_II != BB->begin()) {
|
|
cmpInst_II--;
|
|
if (*cmpInst_II == cmpInst) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
// No local def (possible?)
|
|
if (cmpInst_II == BB->begin()) {
|
|
return false;
|
|
}
|
|
|
|
// If cmpInst has no WAR hazard b/w cmpInst and selInst, cmpInst
|
|
// can be moved right before selInst.
|
|
if (chkFwdOutputHazard(cmpInst_II, selInst_II)) {
|
|
return false;
|
|
}
|
|
|
|
G4_CondModifier mod = condMod->getMod();
|
|
if (reversed)
|
|
mod = G4_CondMod::getReverseCondMod(mod);
|
|
G4_CondMod *cmod =
|
|
builder.createCondMod(mod, condMod->getBase(), condMod->getSubRegOff());
|
|
selInst->setCondMod(cmod);
|
|
selInst->setPredicate(nullptr);
|
|
|
|
// update def-use
|
|
// since cmp is deleted, we have to
|
|
// -- remove def-use between cmp and sel
|
|
// -- transfer cmp's remaining use to sel
|
|
// -- remove cmp and its definitions' use
|
|
selInst->removeDefUse(Opnd_pred);
|
|
cmpInst->transferUse(selInst, true);
|
|
cmpInst->removeUseOfInst();
|
|
BB->erase(cmpInst_II);
|
|
return true;
|
|
}
|
|
|
|
// try to fold a pseudo not instruction into its use(s)
|
|
// return true if successful
|
|
bool Optimizer::foldPseudoNot(G4_BB *bb, INST_LIST_ITER &iter) {
|
|
G4_INST *notInst = *iter;
|
|
vISA_ASSERT(notInst->opcode() == G4_pseudo_not, "expect not instruction");
|
|
if (notInst->getPredicate() || notInst->getCondMod()) {
|
|
return false;
|
|
}
|
|
|
|
G4_DstRegRegion *dst = notInst->getDst();
|
|
if (fg.globalOpndHT.isOpndGlobal(dst)) {
|
|
return false;
|
|
}
|
|
if (!notInst->getSrc(0)->isSrcRegRegion()) {
|
|
return false;
|
|
}
|
|
|
|
// unfortunately, the def-use chain is not always properly maintained, so we
// have to skip the opt even when we can't find a use
|
|
bool canFold = notInst->use_size() > 0;
|
|
for (auto uses = notInst->use_begin(), end = notInst->use_end(); uses != end;
|
|
++uses) {
|
|
auto &&use = *uses;
|
|
G4_INST *useInst = use.first;
|
|
Gen4_Operand_Number opndPos = use.second;
|
|
if (!useInst->isLogic() || !G4_INST::isSrcNum(opndPos) ||
|
|
useInst->getSingleDef(opndPos) == nullptr /* not single def */) {
|
|
canFold = false;
|
|
break;
|
|
}
|
|
|
|
// sanity check
|
|
vASSERT(useInst->getSingleDef(opndPos) == notInst);
|
|
|
|
// check the case where flag is partially used
|
|
G4_SrcRegRegion *opnd =
|
|
useInst->getSrc(G4_INST::getSrcNum(opndPos))->asSrcRegRegion();
|
|
if (dst->compareOperand(opnd, builder) != Rel_eq) {
|
|
canFold = false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (canFold) {
|
|
G4_SrcRegRegion *origUse = notInst->getSrc(0)->asSrcRegRegion();
|
|
if (notInst->getMaskOffset() == 16) {
|
|
// Fold upper bits
|
|
vASSERT(notInst->getExecSize() == g4::SIMD16);
|
|
origUse = builder.createSrcWithNewSubRegOff(origUse, 1);
|
|
notInst->setSrc(origUse, 0);
|
|
}
|
|
for (auto uses = notInst->use_begin(), uend = notInst->use_end();
|
|
uses != uend; ++uses) {
|
|
auto use = *uses;
|
|
G4_INST *useInst = use.first;
|
|
Gen4_Operand_Number opndPos = use.second;
|
|
G4_SrcRegRegion *opnd =
|
|
useInst->getSrc(G4_INST::getSrcNum(opndPos))->asSrcRegRegion();
|
|
int numNot = 1 + (origUse->getModifier() == Mod_Not ? 1 : 0) +
|
|
(opnd->getModifier() == Mod_Not ? 1 : 0);
|
|
G4_SrcModifier newModifier = numNot & 0x1 ? Mod_Not : Mod_src_undef;
|
|
G4_SrcRegRegion *newSrc = builder.createSrcRegRegion(*origUse);
|
|
newSrc->setModifier(newModifier);
|
|
useInst->setSrc(newSrc, G4_INST::getSrcNum(opndPos));
|
|
}
|
|
|
|
for (auto defs = notInst->def_begin(); defs != notInst->def_end(); ++defs) {
|
|
auto def = *defs;
|
|
G4_INST *defInst = def.first;
|
|
notInst->copyUsesTo(defInst, false);
|
|
}
|
|
notInst->removeAllDefs();
|
|
notInst->removeAllUses();
|
|
bb->erase(iter);
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
/***
|
|
this function optimizes the following cases:
|
|
|
|
case 1:
|
|
cmp.gt.P0 s0 s1
|
|
(P0) sel d s0 s1
|
|
==>
|
|
sel.gt.P0 d s0 s1
|
|
|
|
case 2:
|
|
and dst src0 src1 -- other OPs are also optimized: or, xor...
|
|
cmp.nz.P0 NULL dst 0
|
|
(P0) ...
|
|
==>
|
|
and.nz.P0 dst src0 src1
|
|
(P0) ...
|
|
add/addc instructions are also handled in case 2. A few more cond
modifiers are supported for such arithmetic instructions.
|
|
|
|
case 3:
|
|
cmp.l.P0 NULL src0 src1
|
|
cmp.l.P1 NULL src2 src3
|
|
and P2 P0 P1 -- other OPs are also optimized: or, xor...
|
|
==>
|
|
cmp.l.P2 NULL src0 src1
|
|
(P2)cmp.l.P2 NULL src2 src3
|
|
|
|
case 4:
|
|
mov (1) P0 Imm (NoMask)
|
|
(P0) mov (8) r[A0, 0] src0 src1
|
|
==>
|
|
smov (8) r[A0, 0] src0 src1 Imm
|
|
|
|
case 5:
|
|
pseudo_not (1) P2 P1
|
|
and (1) P4 P3 P2
|
|
==>
|
|
and (1) P4 P3 ~P1
|
|
*/
|
|
|
|
void Optimizer::optimizeLogicOperation() {
|
|
G4_Operand *dst = NULL;
|
|
bool resetLocalIds = false;
|
|
bool doLogicOpt = builder.getOption(vISA_LocalFlagOpt);
|
|
|
|
if (!doLogicOpt) {
|
|
// we still need to expand the pseudo logic ops
|
|
for (auto bb : fg) {
|
|
for (auto I = bb->begin(), E = bb->end(); I != E; ++I) {
|
|
auto inst = *I;
|
|
if (inst->isPseudoLogic()) {
|
|
expandPseudoLogic(builder, bb, I);
|
|
}
|
|
}
|
|
}
|
|
return;
|
|
}
|
|
|
|
for (G4_BB *bb : fg) {
|
|
INST_LIST_ITER ii;
|
|
if ((bb->begin() == bb->end())) {
|
|
continue;
|
|
}
|
|
resetLocalIds = false;
|
|
ii = bb->end();
|
|
do {
|
|
ii--;
|
|
G4_INST *inst = *ii;
|
|
G4_opcode op = inst->opcode();
|
|
dst = inst->getDst();
|
|
bool nullDst = inst->hasNULLDst();
|
|
G4_Declare *dcl = nullptr;
|
|
if (dst) {
|
|
dcl = dst->getTopDcl();
|
|
}
|
|
|
|
if ((op != G4_sel && op != G4_cmp && !inst->canSupportCondMod() &&
|
|
!inst->isPseudoLogic()) ||
|
|
!dst || nullDst || (dcl && dcl->isOutput())) {
|
|
continue;
|
|
}
|
|
|
|
INST_LIST_ITER next_iter = std::next(ii);
|
|
|
|
if (!resetLocalIds) {
|
|
bb->resetLocalIds();
|
|
resetLocalIds = true;
|
|
}
|
|
|
|
// case 5
|
|
if (inst->opcode() == G4_pseudo_not) {
|
|
bool folded = foldPseudoNot(bb, ii);
|
|
if (folded) {
|
|
ii = next_iter;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// case 1
|
|
if (ii != bb->begin() && op == G4_sel && inst->getPredicate() &&
|
|
!inst->getCondMod()) {
|
|
foldCmpSel(bb, inst, ii);
|
|
} else if (builder.hasSmov() && inst->opcode() == G4_mov &&
|
|
inst->getPredicate() == NULL && inst->getCondMod() == NULL &&
|
|
inst->getExecSize() == g4::SIMD1 && inst->getSrc(0)->isImm() &&
|
|
inst->getDst()->isFlag() && next_iter != bb->end() &&
|
|
(*next_iter)->opcode() == G4_mov) {
|
|
// case 4
|
|
G4_INST *next_inst = *next_iter;
|
|
if (createSmov(bb, inst, next_inst)) {
|
|
bb->erase(ii);
|
|
ii = next_iter;
|
|
}
|
|
continue;
|
|
} else if (!inst->getPredicate() && inst->canSupportCondMod()) {
|
|
// FIXME: why this condition?
|
|
if (op == G4_pseudo_mad && inst->getExecSize() == g4::SIMD1) {
|
|
continue;
|
|
}
|
|
|
|
// case 2
|
|
foldCmpToCondMod(bb, ii);
|
|
} else if (inst->getPredicate() == NULL && inst->isPseudoLogic()) {
|
|
bool merged = false;
|
|
|
|
if (op == G4_pseudo_and || op == G4_pseudo_or) {
|
|
merged = foldPseudoAndOr(bb, ii);
|
|
}
|
|
|
|
// translate the pseudo op
|
|
if (!merged) {
|
|
expandPseudoLogic(builder, bb, ii);
|
|
}
|
|
}
|
|
} while (ii != bb->begin());
|
|
}
|
|
}
|
|
|
|
// see if we can fold a pseudo and/or instruction with previous cmp
|
|
// returns true if successful, and ii (inst-list-iter) is also updated
|
|
// to the next inst
|
|
bool Optimizer::foldPseudoAndOr(G4_BB *bb, INST_LIST_ITER &ii) {
|
|
// case 3
|
|
|
|
// optimization should apply even when the dst of the pseudo-and/pseudo-or is
|
|
// global, since we are just hoisting it up, and WAR/WAW checks should be
|
|
// performed as we search for the src0 and src1 inst. Also need to check if
|
|
// the mask option of the pseudo-and/pseudo-or matches with the options of
|
|
// the defining instructions when dst is global.
|
|
|
|
G4_INST *inst = *ii;
|
|
// look for def of srcs
|
|
G4_Operand *src0 = inst->getSrc(0);
|
|
G4_Operand *src1 = inst->getSrc(1);
|
|
|
|
/*
|
|
The old logic scanned upward from inst (the and/or) until it encountered
def instructions that did a full write.
It then scanned from each def instruction down to inst looking for WAW and
WAR conflicts between the def instruction and intermediate instructions,
basically ensuring that there were no partial writes into the flag registers
used by the inst.

The new code uses defInstList directly, and aborts if there are more than
two definitions, i.e. more than one instruction writes to a source. The
disadvantage is that this is less precise. For example, folding into the
closest definition used to be allowed but is now disallowed.
|
|
*/
|
|
int32_t maxSrc1 = 0;
|
|
int32_t maxSrc2 = 0;
|
|
G4_INST *defInstructions[2] = {nullptr, nullptr};
|
|
G4_INST *src0DefInst = nullptr;
|
|
G4_INST *src1DefInst = nullptr;
|
|
|
|
// Picks the latest two instructions to compare.
|
|
// local IDs are reset at the beginning of this function.
|
|
if (inst->def_size() < 2) {
|
|
return false;
|
|
}
|
|
// trying to find latest instructions that define src0 and src1
|
|
for (auto I = inst->def_begin(), E = inst->def_end(); I != E; ++I) {
|
|
G4_INST *srcInst = I->first;
|
|
if ((I->second == Opnd_src0) && (srcInst->getLocalId() >= maxSrc1)) {
|
|
maxSrc1 = srcInst->getLocalId();
|
|
defInstructions[0] = srcInst;
|
|
src0DefInst = srcInst;
|
|
} else if ((I->second == Opnd_src1) && (srcInst->getLocalId() >= maxSrc2)) {
|
|
maxSrc2 = srcInst->getLocalId();
|
|
defInstructions[1] = srcInst;
|
|
src1DefInst = srcInst;
|
|
}
|
|
}
|
|
|
|
// Making sure that dst of pseudo instruction is not used or defined between
|
|
// pseudo instruction and the first definition of the source
|
|
if (defInstructions[0] && defInstructions[1]) {
|
|
// make defInst[0] the closer def to the pseudo-and/or
|
|
if (maxSrc2 > maxSrc1) {
|
|
std::swap(defInstructions[0], defInstructions[1]);
|
|
std::swap(maxSrc1, maxSrc2);
|
|
}
|
|
// Doing backward scan until earliest src to make sure dst of and/or is not
|
|
// being written to or being read
|
|
/*
|
|
handling case like in spmv_csr
|
|
cmp.lt (M1, 1) P15 V40(0,0)<0;1,0> 0x10:w /// $191
|
|
cmp.lt (M1, 1) P16 V110(0,0)<0;1,0> V34(0,0)<0;1,0> /// $192
|
|
and (M1, 1) P16 P16 P15 /// $193
|
|
*/
|
|
if (chkBwdOutputHazard(defInstructions[1], ii, defInstructions[0])) {
|
|
return false;
|
|
}
|
|
} else {
|
|
return false;
|
|
}
|
|
|
|
// check if the defInst can be folded into the pseudo and/or for a given
|
|
// source folding is legal if
|
|
// -- src is the only use of defInst
|
|
// -- def completely defines the src
|
|
// -- def inst does not have predicate
|
|
// -- the defInst closer to the pseudo inst is a cmp without pred
|
|
// -- def inst is not global operand
|
|
// the last condition can be relaxed if defInst is the same as the inst's dst,
|
|
// as they will be the same flag
|
|
if (!(defInstructions[0]->opcode() == G4_cmp &&
|
|
defInstructions[0]->getPredicate() == nullptr)) {
|
|
return false;
|
|
}
|
|
|
|
auto checkSource = [this, inst](G4_INST *defInst, G4_Operand *src) {
|
|
if (defInst == nullptr) {
|
|
return false;
|
|
}
|
|
|
|
if (defInst->use_size() > 1) {
|
|
return false;
|
|
}
|
|
G4_Operand *curr_dst = defInst->getCondMod()
|
|
? defInst->getCondMod()
|
|
: (G4_Operand *)defInst->getDst();
|
|
|
|
G4_CmpRelation rel = curr_dst->compareOperand(src, builder);
|
|
if (rel != Rel_eq ||
|
|
(defInst->getPredicate() && defInst->opcode() != G4_sel)) {
|
|
return false;
|
|
}
|
|
|
|
if (fg.globalOpndHT.isOpndGlobal(curr_dst)) {
|
|
G4_DstRegRegion *dst = inst->getDst();
|
|
if (dst->compareOperand(curr_dst, builder) != Rel_eq) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
};
|
|
|
|
if (!checkSource(src0DefInst, src0) || !checkSource(src1DefInst, src1)) {
|
|
return false;
|
|
}
|
|
|
|
// Check if mask options are mismatched between the pseudo-and/pseudo-or and
|
|
// its defining instructions.
|
|
if ((inst->getMaskOption() != src0DefInst->getMaskOption() ||
|
|
inst->getMaskOption() != src1DefInst->getMaskOption()) &&
|
|
fg.globalOpndHT.isOpndGlobal(inst->getDst()) &&
|
|
!fg.builder->alwaysAllowGlobalFlagOpt())
|
|
return false;
|
|
|
|
// do the case 3 optimization
|
|
|
|
G4_PredState ps =
|
|
(inst->opcode() == G4_pseudo_or) ? PredState_Minus : PredState_Plus;
|
|
G4_INST *first_inst = defInstructions[1];
|
|
G4_INST *second_inst = defInstructions[0];
|
|
|
|
// unify subregister according to logic op dst
|
|
G4_VarBase *curr_base = inst->getDst()->getBase()->asRegVar();
|
|
unsigned short curr_subreg = 0;
|
|
|
|
G4_Operand *first_def, *second_def;
|
|
G4_VarBase *first_def_base, *second_def_base;
|
|
int first_def_subreg, second_def_subreg;
|
|
|
|
// change condmod and predicate of second inst
|
|
|
|
if (first_inst->getCondMod()) {
|
|
first_def = first_inst->getCondMod();
|
|
first_def_base = first_def->asCondMod()->getBase();
|
|
first_def_subreg = first_def->asCondMod()->getSubRegOff();
|
|
} else {
|
|
first_def = first_inst->getDst();
|
|
first_def_base = first_def->asDstRegRegion()->getBase()->asRegVar();
|
|
first_def_subreg = 0;
|
|
}
|
|
|
|
if (second_inst->getCondMod()) {
|
|
second_def = second_inst->getCondMod();
|
|
second_def_base = second_def->asCondMod()->getBase();
|
|
second_def_subreg = second_def->asCondMod()->getSubRegOff();
|
|
} else {
|
|
second_def = second_inst->getDst();
|
|
second_def_base = second_def->asDstRegRegion()->getBase()->asRegVar();
|
|
second_def_subreg = 0;
|
|
}
|
|
|
|
bool change_flag = false;
|
|
if (second_inst->getCondMod() &&
|
|
(second_def_base != curr_base || second_def_subreg != curr_subreg)) {
|
|
change_flag = true;
|
|
G4_CondMod *new_condMod = builder.createCondMod(
|
|
second_inst->getCondMod()->getMod(), curr_base, curr_subreg);
|
|
|
|
second_inst->setCondMod(new_condMod);
|
|
}
|
|
// create a predicate
|
|
|
|
G4_Predicate *new_pred = builder.createPredicate(ps, curr_base, curr_subreg);
|
|
|
|
second_inst->setPredicate(new_pred);
|
|
|
|
if (change_flag) {
|
|
for (USE_EDGE_LIST_ITER iter = second_inst->use_begin();
|
|
iter != second_inst->use_end(); ++iter) {
|
|
G4_INST *curr_inst = (*iter).first;
|
|
if (curr_inst == inst) {
|
|
continue;
|
|
}
|
|
if (curr_inst->getPredicate() &&
|
|
(curr_inst->getPredicate()->getBase() != curr_base ||
|
|
curr_inst->getPredicate()->getSubRegOff() != curr_subreg) &&
|
|
curr_inst->getPredicate()->compareOperand(second_def, builder) ==
|
|
Rel_eq) {
|
|
curr_inst->setPredicate(builder.duplicateOperand(new_pred));
|
|
}
|
|
|
|
for (int k = 0; k < curr_inst->getNumSrc(); k++) {
|
|
G4_Operand *curr_src = curr_inst->getSrc(k);
|
|
if (curr_src->isSrcRegRegion() &&
|
|
!(curr_inst->isMath() && k == 1 && curr_src->isNullReg())) {
|
|
if (curr_src->asSrcRegRegion()->compareOperand(second_def, builder) ==
|
|
Rel_eq) {
|
|
G4_SrcRegRegion *new_src_opnd = builder.createSrcRegRegion(
|
|
curr_src->asSrcRegRegion()->getModifier(),
|
|
curr_src->asSrcRegRegion()->getRegAccess(),
|
|
inst->getDst()->getBase(), 0, 0,
|
|
curr_src->asSrcRegRegion()->getRegion(),
|
|
curr_src->asSrcRegRegion()->getType());
|
|
|
|
curr_inst->setSrc(new_src_opnd, k);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if (first_def_base != curr_base || first_def_subreg != curr_subreg) {
|
|
if (first_inst->getCondMod() && first_def->isCondMod()) {
|
|
G4_CondMod *new_condMod = builder.createCondMod(
|
|
first_inst->getCondMod()->getMod(), curr_base, curr_subreg);
|
|
first_inst->setCondMod(new_condMod);
|
|
} else {
|
|
first_inst->setDest(builder.duplicateOperand(inst->getDst()));
|
|
}
|
|
for (USE_EDGE_LIST_ITER iter = first_inst->use_begin();
|
|
iter != first_inst->use_end(); ++iter) {
|
|
G4_INST *curr_inst = (*iter).first;
|
|
if (curr_inst == inst) {
|
|
continue;
|
|
}
|
|
if (curr_inst->getPredicate() &&
|
|
(curr_inst->getPredicate()->getBase() != curr_base ||
|
|
curr_inst->getPredicate()->getSubRegOff() != curr_subreg) &&
|
|
curr_inst->getPredicate()->compareOperand(first_def, builder) ==
|
|
Rel_eq) {
|
|
curr_inst->setPredicate(builder.duplicateOperand(new_pred));
|
|
}
|
|
|
|
for (int k = 0; k < curr_inst->getNumSrc(); k++) {
|
|
G4_Operand *curr_src = curr_inst->getSrc(k);
|
|
if (curr_src->isSrcRegRegion() &&
|
|
!(curr_inst->isMath() && k == 1 && curr_src->isNullReg())) {
|
|
if (curr_src->asSrcRegRegion()->compareOperand(first_def, builder) ==
|
|
Rel_eq) {
|
|
G4_SrcRegRegion *new_src_opnd = builder.createSrcRegRegion(
|
|
curr_src->asSrcRegRegion()->getModifier(),
|
|
curr_src->asSrcRegRegion()->getRegAccess(), curr_base, 0,
|
|
curr_subreg, curr_src->asSrcRegRegion()->getRegion(),
|
|
curr_src->asSrcRegRegion()->getType());
|
|
curr_inst->setSrc(new_src_opnd, k);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// update def-use
|
|
// since inst (the pseudo-and/or) is deleted and the same flag is used for
|
|
// first and second inst, we have to
|
|
// -- transfer inst's use to the second_inst
|
|
// -- add def-use between first_inst and second_inst
|
|
// -- remove inst from first_inst and second_inst's use
|
|
inst->transferUse(second_inst, true);
|
|
first_inst->addDefUse(second_inst, Opnd_pred);
|
|
inst->removeUseOfInst();
|
|
|
|
INST_LIST_ITER new_iter = ii;
|
|
++ii;
|
|
bb->erase(new_iter);
|
|
return true;
|
|
}
|
|
|
|
/*** The beginning of message header optimization ***/
|
|
|
|
/*
|
|
* reuse the previous header which can save the redundant definitions.
|
|
*/
|
|
void MSGTable::reusePreviousHeader(G4_INST *dest, G4_INST *source,
|
|
G4_INST *mDot2, IR_Builder &builder) {
|
|
if (dest == NULL)
|
|
return;
|
|
if (source != NULL) {
|
|
dest->setDest(builder.duplicateOperand(source->getDst()));
|
|
} else {
|
|
short subRegOff = dest->getDst()->getSubRegOff();
|
|
auto newDst = builder.createDstWithNewSubRegOff(mDot2->getDst(), subRegOff);
|
|
dest->setDest(newDst);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* insert a mov from the previous header to the current header.
* This is only required for instructions whose payload size > 1,
* where we can't directly reuse the previous header. Keep a copy
* of the previous header so that we only need to update the
* fields that need to be changed.
|
|
*/
|
|
void MSGTable::insertHeaderMovInst(G4_INST *source_send, IR_Builder &builder,
|
|
G4_BB *bb) {
|
|
INST_LIST_ITER pos;
|
|
|
|
switch (first) {
|
|
case HEADER_FULL_REGISTER:
|
|
pos = m_it;
|
|
break;
|
|
case HEADER_X:
|
|
pos = mDot0_it;
|
|
break;
|
|
case HEADER_Y:
|
|
pos = mDot1_it;
|
|
break;
|
|
case HEADER_SIZE:
|
|
pos = mDot2_it;
|
|
break;
|
|
default:
|
|
vISA_ASSERT_UNREACHABLE(
|
|
"did not catch the first def instruction correctly");
|
|
return;
|
|
}
|
|
|
|
// mov(8) m_new<1>, m_old<8;8:1> {Align1}
|
|
G4_Declare *srcDcl =
|
|
source_send->getSrc(0)->getBase()->asRegVar()->getDeclare();
|
|
G4_SrcRegRegion *newSrcOpnd =
|
|
builder.createSrcRegRegion(srcDcl, builder.getRegionStride1());
|
|
|
|
G4_INST *mov =
|
|
builder.createMov(m->getExecSize(), builder.duplicateOperand(m->getDst()),
|
|
newSrcOpnd, m->getOption(), false);
|
|
bb->insertBefore(pos, mov);
|
|
|
|
// maintain def-use.
|
|
//
|
|
// (1) Uses. m is ready to be deleted.
|
|
m->transferUse(mov);
|
|
|
|
// (2) Defs
|
|
// The defs should be from definitions for source->send's src(0).
|
|
//
|
|
// mov (8) V244(1,0)<1>:ud V88(0,0)<8;8,1>:ud {Align1, NoMask}
|
|
// mov (8) V244(0,0)<1>:ud r0.0<8;8,1>:ud {Align1, NoMask}
|
|
// mov (1) V244(0,2)<1>:ud 0x1f:ud {Align1, NoMask}
|
|
// mov (1) V244(0,0)<1>:ud 0:uw {Align1, NoMask}
|
|
// mov (1) V244(0,1)<1>:ud 0:uw {Align1, NoMask}
|
|
// add (1) a0.0<1>:ud r1.1<0;1,0>:ud 0x40a8000:ud {Align1, NoMask}
|
|
// source_send:
|
|
// send (8) null<1>:ud V244(0,0)<8;8,1>:ud a0.0<0;1,0>:ud {Align1, NoMask}
|
|
// mov (8) V89(0,0)<1>:d V34_histogram(1,0)<8;8,1>:d {Align1, Q1}
|
|
// mov (8) V246(1,0)<1>:ud V89(0,0)<8;8,1>:ud {Align1, NoMask}
|
|
// mov (8) V246(0,0)<1>:ud V244(0,0)<8;8,1>:ud {Align1, NoMask} <-- mov
|
|
// mov (8) V246(0,0)<1>:ud r0.0<8;8,1>:ud {Align1, NoMask} <-- m
|
|
//
|
|
// There are more than one defs here.
|
|
//
|
|
// Note that
|
|
//
|
|
// mov (8) V244(1,0)<1>:ud V88(0,0)<8;8,1>:ud {Align1, NoMask}
|
|
//
|
|
// is a definition of send but not of mov. We enable the 'checked' option
// while copying defs.
|
|
source_send->copyDef(mov, Opnd_src0, Opnd_src0, /*checked*/ true);
|
|
}
|
|
|
|
/*
|
|
* compare the two instructions
|
|
* they must have:
|
|
* the same instruction opcode, predicate, condmodifier, optionString
|
|
* the same dst, and number of src args
|
|
*/
|
|
bool Optimizer::isHeaderOptCandidate(G4_INST *dst, G4_INST *src) {
|
|
if (!dst || !src) {
|
|
return true;
|
|
}
|
|
|
|
// Compare instructions
|
|
if (dst->opcode() != src->opcode() || dst->getOption() != src->getOption() ||
|
|
dst->getExecSize() != src->getExecSize() ||
|
|
dst->getPredicate() != src->getPredicate() ||
|
|
dst->getCondMod() != src->getCondMod()) {
|
|
return false;
|
|
}
|
|
|
|
if (dst->getNumSrc() != src->getNumSrc()) {
|
|
return false;
|
|
}
|
|
|
|
// Compare destination args
|
|
G4_Operand *dst_dest = dst->getDst();
|
|
G4_Operand *src_dest = src->getDst();
|
|
if (!dst_dest || !src_dest) {
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* compare the two instructions to see if they are redundant
|
|
* they must have:
|
|
* the same src args
|
|
* the same def
|
|
*/
|
|
bool Optimizer::isHeaderOptReuse(G4_INST *dst, G4_INST *src) {
|
|
if (!dst && !src) {
|
|
return true;
|
|
}
|
|
if (!dst || !src) {
|
|
return false;
|
|
}
|
|
|
|
for (unsigned int i = 0, numSrc = dst->getNumSrc(); i < numSrc; i++) {
|
|
G4_Operand *opnd = dst->getSrc(i);
|
|
if (opnd && opnd->compareOperand(src->getSrc(i), builder) != Rel_eq) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// The same number of def instructions
|
|
if (dst->def_size() != src->def_size()) {
|
|
return false;
|
|
}
|
|
|
|
if (dst->def_size() == 0) {
|
|
return true; // both have no def at all
|
|
}
|
|
|
|
for (auto ii = dst->def_begin(); ii != dst->def_end(); ii++) {
|
|
bool sameDef = false;
|
|
for (auto jj = src->def_begin(); jj != src->def_end(); jj++) {
|
|
if ((*ii).first == (*jj).first && (*ii).second == (*jj).second) {
|
|
sameDef = true;
|
|
break; // break the inner jj loop
|
|
}
|
|
}
|
|
if (sameDef == false) {
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
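// Check that two cached headers are compatible for reuse: each corresponding
// pair of header defs must be a candidate, every header def of dest may only
// feed dest->send, and both sends must read payloads of the same size.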
bool Optimizer::headerOptValidityCheck(MSGTable *dest, MSGTable *source) {
|
|
if (!isHeaderOptCandidate(dest->a0Dot0, source->a0Dot0) ||
|
|
!isHeaderOptCandidate(dest->mDot0, source->mDot0) ||
|
|
!isHeaderOptCandidate(dest->m, source->m) ||
|
|
!isHeaderOptCandidate(dest->mDot1, source->mDot1) ||
|
|
!isHeaderOptCandidate(dest->mDot2, source->mDot2)) {
|
|
return false;
|
|
}
|
|
|
|
if (dest->m) {
|
|
if (!(dest->m->hasOneUse() && dest->m->use_front().first == dest->send)) {
|
|
return false;
|
|
}
|
|
}
|
|
if (dest->mDot0) {
|
|
if (!(dest->mDot0->hasOneUse() &&
|
|
dest->mDot0->use_front().first == dest->send)) {
|
|
return false;
|
|
}
|
|
}
|
|
if (dest->mDot1) {
|
|
if (!(dest->mDot1->hasOneUse() &&
|
|
dest->mDot1->use_front().first == dest->send)) {
|
|
return false;
|
|
}
|
|
}
|
|
if (dest->mDot2) {
|
|
if (!(dest->mDot2->hasOneUse() &&
|
|
dest->mDot2->use_front().first == dest->send)) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
if (dest->send && dest->send->getSrc(0) &&
|
|
dest->send->getSrc(0)->getTopDcl() && source->send &&
|
|
source->send->getSrc(0) && source->send->getSrc(0)->getTopDcl()) {
|
|
unsigned short dstSize, sourceSize;
|
|
dstSize = dest->send->getSrc(0)->getTopDcl()->getTotalElems() *
|
|
dest->send->getSrc(0)->getTopDcl()->getElemSize();
|
|
sourceSize = source->send->getSrc(0)->getTopDcl()->getTotalElems() *
|
|
source->send->getSrc(0)->getTopDcl()->getElemSize();
|
|
if (dstSize != sourceSize) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
// a class to store all presently valid values; each value is an instruction.
// If the number of values exceeds the max allowed, the oldest is removed.
|
|
class InstValues {
|
|
const int maxNumVal;
|
|
std::list<G4_INST *> values;
|
|
|
|
public:
|
|
InstValues(int maxCount) : maxNumVal(maxCount) {}
|
|
|
|
void addValue(G4_INST *inst) {
|
|
if (values.size() == maxNumVal) {
|
|
values.pop_front();
|
|
}
|
|
values.push_back(inst);
|
|
}
|
|
|
|
// delete all values that may be invalid after inst
|
|
void deleteValue(G4_INST *inst) {
|
|
if (inst->isOptBarrier()) {
|
|
values.clear();
|
|
return;
|
|
}
|
|
|
|
auto hasIndirectGather = [](G4_INST *inst) {
|
|
for (int i = 0, numSrc = inst->getNumSrc(); i < numSrc; ++i) {
|
|
auto src = inst->getSrc(i);
|
|
if (src && src->isSrcRegRegion() &&
|
|
src->asSrcRegRegion()->isIndirect() &&
|
|
src->asSrcRegRegion()->getRegion()->isRegionWH()) {
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
};
|
|
|
|
if (hasIndirectGather(inst)) {
|
|
// optimization is likely unprofitable due to high address register
|
|
// pressure in this case. more importantly, it may actually cause RA to
|
|
// fail since we don't spill physical a0.0
|
|
values.clear();
|
|
return;
|
|
}
|
|
|
|
G4_DstRegRegion *dst = inst->getDst();
|
|
auto interferes = [dst](G4_INST *valInst) {
|
|
const IR_Builder &builder = valInst->getBuilder();
|
|
G4_DstRegRegion *valDst = valInst->getDst();
|
|
if (dst->compareOperand(valDst, builder) != Rel_disjoint) {
|
|
return true;
|
|
}
|
|
for (int i = 0, numSrc = valInst->getNumSrc(); i < numSrc; ++i) {
|
|
G4_Operand *src = valInst->getSrc(i);
|
|
if (src != nullptr &&
|
|
dst->compareOperand(src, builder) != Rel_disjoint) {
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
};
|
|
if (dst != nullptr) {
|
|
values.remove_if(interferes);
|
|
}
|
|
}
|
|
|
|
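// Return a previously cached instruction that computes the same value as
// inst (same opcode, exec size and emask, no flag operands, and
// pairwise-equal sources), or nullptr if there is none.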
G4_INST *findValue(G4_INST *inst) {
|
|
for (auto valInst : values) {
|
|
if (inst->opcode() != valInst->opcode() ||
|
|
inst->getExecSize() != valInst->getExecSize()) {
|
|
continue;
|
|
}
|
|
// skip flags for now
|
|
if ((inst->getPredicate() || valInst->getPredicate()) ||
|
|
(inst->getCondMod() || valInst->getCondMod())) {
|
|
continue;
|
|
}
|
|
// emask checks
|
|
if (inst->getMaskOffset() != valInst->getMaskOffset() ||
|
|
inst->isWriteEnableInst() ^ valInst->isWriteEnableInst()) {
|
|
continue;
|
|
}
|
|
// all source should be isomorphic (same type/shape)
|
|
bool srcMatch = true;
|
|
for (int i = 0, numSrc = inst->getNumSrc(); i < numSrc; ++i) {
|
|
G4_Operand *src = inst->getSrc(i);
|
|
G4_Operand *valSrc = valInst->getSrc(i);
|
|
if (src == nullptr || valSrc == nullptr ||
|
|
src->compareOperand(valSrc, inst->getBuilder()) != Rel_eq) {
|
|
srcMatch = false;
|
|
break;
|
|
}
|
|
}
|
|
if (srcMatch) {
|
|
return valInst;
|
|
}
|
|
}
|
|
return nullptr;
|
|
}
|
|
|
|
void clear() { values.clear(); }
|
|
};
|
|
|
|
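// Search the cached header-def sequences for one matching iVector. On a
// (possibly partial) match, matching defs in iVector are marked dead,
// non-matching ones are redirected to write into the cached header variable,
// and the reused header's dst is returned; otherwise return nullptr.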
G4_Operand *Optimizer::updateSendsHeaderReuse(
|
|
std::vector<std::vector<G4_INST *>> &instLookUpTable,
|
|
std::vector<G4_INST *> &iVector, INST_LIST_ITER endIter) {
|
|
int bSize = (int)iVector.size();
|
|
for (auto &Cache : instLookUpTable) {
|
|
if (Cache.size() == bSize) {
|
|
bool match[8] = {false};
|
|
bool anyMatch = false;
|
|
for (int index = 0; index < bSize; ++index) {
|
|
G4_INST *cInst = Cache[index];
|
|
G4_INST *iInst = iVector[index];
|
|
|
|
// opcode check
|
|
if (cInst->opcode() != iInst->opcode() ||
|
|
cInst->getExecSize() != iInst->getExecSize()) {
|
|
continue;
|
|
}
|
|
// flag check
|
|
if (cInst->getPredicate() != iInst->getPredicate() ||
|
|
cInst->getCondMod() != iInst->getCondMod()) {
|
|
continue;
|
|
}
|
|
// emask check
|
|
if (cInst->getMaskOffset() != iInst->getMaskOffset() ||
|
|
cInst->isWriteEnableInst() ^ iInst->isWriteEnableInst()) {
|
|
continue;
|
|
}
|
|
// dst check
|
|
G4_DstRegRegion *cDstRgn = cInst->getDst();
|
|
G4_DstRegRegion *iDstRgn = iInst->getDst();
|
|
if (cDstRgn->getRegOff() != iDstRgn->getRegOff() ||
|
|
cDstRgn->getSubRegOff() != iDstRgn->getSubRegOff() ||
|
|
cDstRgn->getHorzStride() != iDstRgn->getHorzStride() ||
|
|
cDstRgn->getRegAccess() != iDstRgn->getRegAccess() ||
|
|
cDstRgn->getType() != iDstRgn->getType()) {
|
|
continue;
|
|
}
|
|
|
|
// all source should be isomorphic (same type/shape) and unaltered
|
|
// between declaration and reuse
|
|
bool srcMatch = true;
|
|
|
|
for (int iSrc = 0, numSrc = cInst->getNumSrc(); iSrc < numSrc; ++iSrc) {
|
|
G4_Operand *cOpnd = cInst->getSrc(iSrc);
|
|
G4_Operand *iOpnd = iInst->getSrc(iSrc);
|
|
if (cOpnd == nullptr || iOpnd == nullptr ||
|
|
cOpnd->compareOperand(iOpnd, builder) != Rel_eq) {
|
|
srcMatch = false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (chkBwdWARdep(cInst, endIter))
|
|
srcMatch = false;
|
|
|
|
match[index] = srcMatch;
|
|
anyMatch |= srcMatch;
|
|
}
|
|
|
|
if (anyMatch) {
|
|
// at least partial match
|
|
|
|
for (int index = 0; index < bSize; ++index) {
|
|
G4_INST *cInst = Cache[index];
|
|
G4_INST *iInst = iVector[index];
|
|
|
|
// mark it if there is match
|
|
if (match[index]) {
|
|
iInst->markDead();
|
|
continue;
|
|
}
|
|
|
|
// create new dst region to replace one in iVector[i]
|
|
G4_DstRegRegion *cDst = cInst->getDst();
|
|
G4_DstRegRegion *iDst = iInst->getDst();
|
|
G4_DstRegRegion *newDstRegion = builder.createDst(
|
|
cDst->getTopDcl()->getRegVar(), iDst->getRegOff(),
|
|
iDst->getSubRegOff(), iDst->getHorzStride(), iDst->getType());
|
|
iInst->setDest(newDstRegion);
|
|
|
|
// update the look-up table list
|
|
Cache[index] = iInst;
|
|
}
|
|
return Cache[0]->getDst();
|
|
}
|
|
}
|
|
}
|
|
return nullptr;
|
|
}
|
|
|
|
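// Per-BB value numbering of the movs that build the extended message
// descriptor (a0): when an instruction recomputes a value already produced
// by a cached def whose single use is a send, redirect that send to the
// cached value and erase the redundant instruction.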
void Optimizer::cleanupA0Movs() {
|
|
for (auto bb : fg) {
|
|
InstValues values(4);
|
|
for (auto iter = bb->begin(), iterEnd = bb->end(); iter != iterEnd;) {
|
|
G4_INST *inst = *iter;
|
|
|
|
auto isDstExtDesc = [](G4_INST *inst) {
|
|
G4_DstRegRegion *dst = inst->getDst();
|
|
if (dst && dst->getTopDcl() && dst->getTopDcl()->isMsgDesc()) {
|
|
// check that its single use is at src3 of split send
|
|
if (inst->use_size() != 1) {
|
|
return false;
|
|
}
|
|
auto use = inst->use_front();
|
|
G4_INST *useInst = use.first;
|
|
if (useInst->isSend()) {
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
};
|
|
|
|
if (isDstExtDesc(inst)) {
|
|
G4_INST *valInst = values.findValue(inst);
|
|
if (valInst != nullptr) {
|
|
VISA_DEBUG_VERBOSE({
|
|
std::cout << "can replace \n";
|
|
inst->emit(std::cout);
|
|
std::cout << "\n with \n";
|
|
valInst->emit(std::cout);
|
|
std::cout << "\n";
|
|
});
|
|
for (auto I = inst->use_begin(), E = inst->use_end(); I != E; ++I) {
|
|
// each use is in the form of A0(0,0)<0;1,0>:ud in a send
|
|
G4_INST *useInst = I->first;
|
|
Gen4_Operand_Number num = I->second;
|
|
vISA_ASSERT(useInst->isSend(), "use inst must be a send");
|
|
G4_SrcRegRegion *newExDesc =
|
|
builder.createSrc(valInst->getDst()->getBase(), 0, 0,
|
|
builder.getRegionScalar(), Type_UD);
|
|
useInst->setSrc(newExDesc, useInst->getSrcNum(num));
|
|
}
|
|
(*iter)->removeAllDefs();
|
|
(*iter)->transferUse(valInst);
|
|
iter = bb->erase(iter);
|
|
continue;
|
|
} else {
|
|
VISA_DEBUG_VERBOSE({
|
|
std::cout << "add new value:\n";
|
|
inst->emit(std::cout);
|
|
std::cout << "\n";
|
|
});
|
|
// this is necessary since for msg desc we always use the physical a0.0,
|
|
// so a new inst will invalidate the previous one
|
|
values.deleteValue(inst);
|
|
values.addValue(inst);
|
|
}
|
|
} else {
|
|
G4_DstRegRegion *dst = inst->getDst();
|
|
if (dst && dst->isDirectAddress()) {
|
|
// If the address register is used for something other than the extdesc
|
|
values.clear();
|
|
} else {
|
|
values.deleteValue(inst);
|
|
}
|
|
}
|
|
++iter;
|
|
}
|
|
}
|
|
}
|
|
|
|
//
|
|
// Perform value numbering on writes to the extended msg descriptor for bindless
|
|
// access of the form op (1) a0.2<1>:ud src0 src1 src2 {NoMask} and remove
|
|
// redundant instructions. This is limited to within BB
|
|
//
|
|
void Optimizer::cleanupBindless() {
|
|
kernel.fg.resetLocalDataFlowData();
|
|
kernel.fg.localDataFlowAnalysis();
|
|
|
|
// Perform send header cleanup for bindless sampler/surface
|
|
for (auto bb : fg) {
|
|
std::vector<std::vector<G4_INST *>> instLookUpTable;
|
|
std::vector<G4_INST *> instVector;
|
|
for (auto iter = bb->begin(), iterEnd = bb->end(); iter != iterEnd;
|
|
++iter) {
|
|
G4_INST *inst = *iter;
|
|
G4_DstRegRegion *dst = inst->getDst();
|
|
|
|
if (dst != nullptr && dst->getTopDcl() != nullptr &&
|
|
dst->getTopDcl()->getCapableOfReuse()) {
|
|
// it is header definition instruction
|
|
instVector.push_back(inst);
|
|
}
|
|
|
|
if (inst->isSplitSend()) {
|
|
G4_Operand *header = inst->getSrc(0);
|
|
G4_Operand *exDesc = inst->getSrc(3);
|
|
|
|
// When header has multiple uses other than send, be conservative and
|
|
// do not reuse the cached value. It could be introduced by
|
|
// optimizations like LVN.
|
|
if (header->getTopDcl() && header->getTopDcl()->getCapableOfReuse() &&
|
|
exDesc->isSrcRegRegion() && !instVector.empty() &&
|
|
std::all_of(instVector.begin(), instVector.end(), [&](G4_INST *i) {
|
|
return i->hasOneUse() && i->use_front().first == inst;
|
|
})) {
|
|
|
|
// check if we can reuse cached values.
|
|
G4_Operand *value =
|
|
updateSendsHeaderReuse(instLookUpTable, instVector, iter);
|
|
if (!value) {
|
|
// not found, cache the header
|
|
instLookUpTable.push_back(instVector);
|
|
} else {
|
|
// update sends header src
|
|
G4_SrcRegRegion *newHeaderRgn = builder.createSrc(
|
|
value->getBase(), 0, 0, builder.getRegionStride1(), Type_UD);
|
|
inst->setSrc(newHeaderRgn, 0);
|
|
}
|
|
}
|
|
// clear header def
|
|
instVector.clear();
|
|
} else if (inst->isSend()) {
|
|
instVector.clear();
|
|
}
|
|
}
|
|
bb->erase(std::remove_if(bb->begin(), bb->end(),
|
|
[](G4_INST *inst) { return inst->isDead(); }),
|
|
bb->end());
|
|
}
|
|
|
|
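// Value numbering of extended-descriptor writes (see the function comment):
// cache defs of message-descriptor dcls whose uses are all sends, and when a
// later instruction recomputes the same value, point its sends at the cached
// def and erase the duplicate.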
for (auto bb : fg) {
|
|
InstValues values(4);
|
|
for (auto iter = bb->begin(), iterEnd = bb->end(); iter != iterEnd;) {
|
|
G4_INST *inst = *iter;
|
|
|
|
auto isDstExtDesc = [](G4_INST *inst) {
|
|
G4_DstRegRegion *dst = inst->getDst();
|
|
if (dst && dst->getTopDcl() && dst->getTopDcl()->isMsgDesc()) {
|
|
// if a use is something other than a send, do not perform the
|
|
// optimization
|
|
for (auto use = inst->use_begin(); use != inst->use_end(); use++) {
|
|
G4_INST* useInst = use->first;
|
|
if (!useInst->isSend())
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
return false;
|
|
};
|
|
|
|
if (isDstExtDesc(inst)) {
|
|
G4_INST *valInst = values.findValue(inst);
|
|
if (valInst != nullptr) {
|
|
VISA_DEBUG_VERBOSE({
|
|
std::cout << "can replace \n";
|
|
inst->emit(std::cout);
|
|
std::cout << "\n with \n";
|
|
valInst->emit(std::cout);
|
|
std::cout << "\n";
|
|
});
|
|
for (auto I = inst->use_begin(), E = inst->use_end(); I != E; ++I) {
|
|
// each use is in the form of A0(0,0)<0;1,0>:ud in a send
|
|
G4_INST *useInst = I->first;
|
|
Gen4_Operand_Number num = I->second;
|
|
vISA_ASSERT(useInst->isSend(), "use inst must be a send");
|
|
G4_SrcRegRegion *newExDesc =
|
|
builder.createSrc(valInst->getDst()->getBase(), 0, 0,
|
|
builder.getRegionScalar(), Type_UD);
|
|
useInst->setSrc(newExDesc, useInst->getSrcNum(num));
|
|
}
|
|
iter = bb->erase(iter);
|
|
continue;
|
|
} else {
|
|
VISA_DEBUG_VERBOSE({
|
|
std::cout << "add new value:\n";
|
|
inst->emit(std::cout);
|
|
std::cout << "\n";
|
|
});
|
|
// this is necessary since for msg desc we always use the physical a0.0,
|
|
// so a new inst will invalidate the previous one
|
|
values.deleteValue(inst);
|
|
values.addValue(inst);
|
|
}
|
|
} else {
|
|
G4_DstRegRegion *dst = inst->getDst();
|
|
if (dst && dst->isDirectAddress()) {
|
|
// If the address register is used for something other than the extdesc
|
|
values.clear();
|
|
} else {
|
|
values.deleteValue(inst);
|
|
}
|
|
}
|
|
++iter;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* compare the two send and their defs
|
|
* determine whether to remove the redundant mov inst
|
|
* or reuse the previous header
|
|
*
|
|
* 1 mov (8) V152(0,0)<1>:ud r0.0<8;8,1>:ud {Align1, NoMask}
|
|
* 2 mov (1) V152(0,2)<1>:ud 0x7000f:ud {Align1, NoMask}
|
|
* 3 mov (1) V152(0,0)<1>:ud 0:uw {Align1, NoMask}
|
|
* 4 mov (1) V152(0,1)<1>:ud 0:uw {Align1, NoMask}
|
|
* 5 add (1) a0.0<1>:ud r1.0<0;1,0>:ud 0x2490000:ud {Align1, NoMask}
|
|
* 6 send (8) V32_in(0,0)<1>:ud V152(0,0)<8;8,1>:ud a0.0<0;1,0>:ud {Align1,
|
|
* NoMask}
|
|
*
|
|
* 7 mov (8) V154(0,0)<1>:ud r0.0<8;8,1>:ud {Align1, NoMask}
|
|
* 8 mov (1) V152(0,2)<1>:ud 0x1f:ud {Align1, NoMask}
|
|
* 9 mov (1) V154(0,0)<1>:ud 0:uw {Align1, NoMask}
|
|
* 10 mov (1) V154(0,1)<1>:ud 0:uw {Align1, NoMask}
|
|
* 11 add (1) a0.0<1>:ud r1.1<0;1,0>:ud 0x2190000:ud {Align1, NoMask}
|
|
* 12 send (8) V33(0,0)<1>:d V152(0,0)<8;8,1>:ud a0.0<0;1,0>:ud {Align1,
|
|
* NoMask}
|
|
*
|
|
* It is rather tricky to maintain def-use chains for this optimization.
|
|
* The send instruction (Line 12) can reuse Inst[1, 3, 4] and needs to
* keep Inst[8, 11]. The defs for the send at line 12 are Inst[1, 3, 4, 8, 11]
* and the defs for the send at line 6 are Inst[1, 2, 3, 4, 5].
|
|
*
|
|
* We take the following approach to maintain def-use.
|
|
*
|
|
* - Starting with initial valid defs, [7, 8, 9, 10, 11].
|
|
*
|
|
* - Each to be removed instruction transfers its use to a proper
|
|
* previous definition.
|
|
*
|
|
* - Each to-be-kept instruction remains, even though there may be changes
|
|
* in its definition. For example, dst of move instruction at Line 8
|
|
* is changed from V154 to V152, but no def-use modification should
|
|
* be made for this instruction.
|
|
*
|
|
* - No def-use modification should be made to the final send, since
|
|
* all have been properly set.
|
|
*/
|
|
void Optimizer::optMessageHeaders(MSGTableList &msgList, G4_BB *bb,
|
|
DEFA0 &myA0) {
|
|
unsigned char redundancyCount = 0;
|
|
bool isSameX, isSameY, isSameSize;
|
|
bool replaceOldHeader = false;
|
|
|
|
uint16_t payLoadSize;
|
|
|
|
MSGTable *dest, *source;
|
|
MSGTable_ITER iter = msgList.begin();
|
|
|
|
if (iter == msgList.end()) {
|
|
return;
|
|
}
|
|
dest = *iter; // dest is the front
|
|
iter++;
|
|
if (iter == msgList.end()) {
|
|
return;
|
|
}
|
|
source = *iter; // source is the cached one
|
|
|
|
if (!headerOptValidityCheck(dest, source)) {
|
|
return;
|
|
}
|
|
|
|
if (isHeaderOptReuse(dest->a0Dot0, myA0.pred) && !myA0.isA0Redef) {
|
|
// Transfer uses of dstDot0 to myA0.pred. This removes uses from
|
|
// dest->a0Dot0 and add to myA0.pred. dest->a0Dot0 to be deleted.
|
|
dest->a0Dot0->transferUse(myA0.pred, /*keepExisting*/ true);
|
|
dest->a0Dot0->markDead();
|
|
}
|
|
|
|
payLoadSize = dest->send->getMsgDesc()->getSrc0LenRegs();
|
|
|
|
isSameX = isHeaderOptReuse(dest->mDot0, source->mDot0) && !source->isXRedef;
|
|
|
|
isSameY = isHeaderOptReuse(dest->mDot1, source->mDot1) && !source->isYRedef;
|
|
|
|
isSameSize =
|
|
isHeaderOptReuse(dest->mDot2, source->mDot2) && !source->isSizeRedef;
|
|
|
|
if (isSameX && dest->mDot0) {
|
|
redundancyCount++;
|
|
}
|
|
if (isSameY && dest->mDot1) {
|
|
redundancyCount++;
|
|
}
|
|
if (isSameSize && dest->mDot2) {
|
|
redundancyCount++;
|
|
}
|
|
|
|
if (payLoadSize > 1 && redundancyCount < MESSAGE_HEADER_THRESHOLD) {
|
|
return; // don't delete unless redundant insts >= THRESHOLD
|
|
};
|
|
|
|
if (payLoadSize > 1 && !(redundancyCount == 3 &&
|
|
dest->send->getSrc(0)->compareOperand(
|
|
source->send->getSrc(0), builder) == Rel_eq)) {
|
|
dest->insertHeaderMovInst(source->send, builder, bb);
|
|
replaceOldHeader = true;
|
|
}
|
|
|
|
{ // always remove "mov(8) Mx<1>, r0.0<8;8,1>:ud{Align1}"
|
|
dest->m->markDead();
|
|
if (!replaceOldHeader) {
|
|
dest->m->transferUse(source->m, /*keepExisting*/ true);
|
|
dest->m = source->m;
|
|
}
|
|
}
|
|
|
|
if (isSameX && dest->mDot0) {
|
|
dest->mDot0->markDead();
|
|
if (!replaceOldHeader) {
|
|
dest->mDot0->transferUse(source->mDot0, /*keepExisting*/ true);
|
|
dest->mDot0 = source->mDot0;
|
|
}
|
|
} else if (payLoadSize == 1 && dest->mDot0) {
|
|
dest->reusePreviousHeader(dest->mDot0, source->mDot0, source->mDot2,
|
|
builder);
|
|
if (!replaceOldHeader) {
|
|
source->mDot0 = dest->mDot0;
|
|
}
|
|
}
|
|
|
|
if (isSameY && dest->mDot1) {
|
|
dest->mDot1->markDead();
|
|
if (!replaceOldHeader) {
|
|
dest->mDot1->transferUse(source->mDot1, /*keepExisting*/ true);
|
|
dest->mDot1 = source->mDot1;
|
|
}
|
|
} else if (payLoadSize == 1 && dest->mDot1) {
|
|
dest->reusePreviousHeader(dest->mDot1, source->mDot1, source->mDot2,
|
|
builder);
|
|
if (!replaceOldHeader) {
|
|
source->mDot1 = dest->mDot1;
|
|
}
|
|
}
|
|
|
|
if (isSameSize && dest->mDot2) {
|
|
dest->mDot2->markDead();
|
|
if (!replaceOldHeader) {
|
|
dest->mDot2->transferUse(source->mDot2, /*keepExisting*/ true);
|
|
dest->mDot2 = source->mDot2;
|
|
}
|
|
} else if (payLoadSize == 1 && dest->mDot2) {
|
|
dest->reusePreviousHeader(dest->mDot2, source->mDot2, source->mDot2,
|
|
builder);
|
|
if (!replaceOldHeader) {
|
|
source->mDot2 = dest->mDot2;
|
|
}
|
|
}
|
|
|
|
if (payLoadSize == 1) {
|
|
// Check this function's comments for why no def-use changes
|
|
// should be made on resetting src(0).
|
|
G4_Operand *src0 = source->send->getSrc(0);
|
|
dest->send->setSrc(builder.duplicateOperand(src0), 0);
|
|
}
|
|
|
|
dest->opt = true;
|
|
|
|
return;
|
|
}
|
|
|
|
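// An instruction is a header-caching candidate if it is a send, or if it
// feeds a send and defines either builtinA0.0 or the first GRF (the header)
// of that send's src0 payload.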
bool Optimizer::isHeaderCachingCandidate(G4_INST *inst) {
|
|
if (inst->isSend()) {
|
|
return true;
|
|
}
|
|
|
|
if (inst->useEmpty()) {
|
|
return false;
|
|
}
|
|
|
|
for (USE_EDGE_LIST_ITER iter = inst->use_begin(), iend = inst->use_end();
|
|
iter != iend; ++iter) {
|
|
if ((*iter).first->isSend()) {
|
|
G4_INST *send = (*iter).first;
|
|
G4_Operand *header = send->getSrc(0);
|
|
G4_DstRegRegion *dst = inst->getDst();
|
|
|
|
// def to BuiltInA0 is part of header opt
|
|
if (inst->getDst() && inst->getDst()->getBase() &&
|
|
inst->getDst()->getBase()->isRegVar() &&
|
|
inst->getDst()->getBase()->asRegVar() ==
|
|
builder.getBuiltinA0()->getRegVar() &&
|
|
inst->getDst()->getRegOff() == 0 &&
|
|
inst->getDst()->getSubRegOff() == 0) {
|
|
return true;
|
|
}
|
|
|
|
// make sure that dst of the current inst is header, not payload
|
|
// header is hard-coded to be 32 bytes
|
|
if (header->getTopDcl() == dst->getTopDcl() &&
|
|
dst->getLeftBound() >= header->getLeftBound() &&
|
|
dst->getRightBound() <=
|
|
header->getLeftBound() + kernel.numEltPerGRF<Type_UB>() - 1) {
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* mark the below "and" instruction as redundant;
|
|
* the "and" instruction is the header for a barrier:
|
|
* and (8) r1.0<1>:ud r0.2<0;1,0>:ud 0xf000000:ud {Align1, NoMask}
|
|
* send (1) null<1>:ud r1 0x3 0x2000004:ud{Align1}
|
|
* wait n0:ud {Align1}
|
|
*/
|
|
void Optimizer::removeRedundantBarrierHeaders(G4_INST *sendInst,
|
|
G4_SrcRegRegion *barrierSrc0,
|
|
bool first) {
|
|
bool barrier = false;
|
|
G4_SrcRegRegion *src0 = NULL;
|
|
if (!first) // skip the check as it was already done for the first barrier
|
|
{
|
|
barrier = isBarrierPattern(sendInst, src0);
|
|
}
|
|
if (barrier || first) {
|
|
auto item = sendInst->def_begin();
|
|
G4_INST *andInst = item->first;
|
|
// delete all the uses of andInst
|
|
// andInst and sendInst will have no def and no use
|
|
andInst->removeAllUses();
|
|
// sendInst.src0 (addInst.dst) will be replaced by barrierSend.src0
|
|
// create a new G4_SrcRegRegion, which is a copy of barrierSend.src0
|
|
G4_SrcRegRegion *src = builder.createSrc(
|
|
barrierSrc0->getTopDcl()->getRegVar(), 0, 0, barrierSrc0->getRegion(),
|
|
barrierSrc0->getTopDcl()->getElemType());
|
|
sendInst->setSrc(src, 0);
|
|
andInst->markDead();
|
|
}
|
|
}
|
|
|
|
/*
|
|
* pattern match of a code sequence for ISA_BARRIER:
|
|
* and (8) r1.0<1>:ud r0.2<0;1,0>:ud 0xf000000:ud {Align1, NoMask}
|
|
* send (1) null<1>:ud r1 0x3 0x2000004:ud{Align1}
|
|
* wait n0:ud {Align1}
|
|
*/
|
|
bool Optimizer::isBarrierPattern(G4_INST *sendInst,
|
|
G4_SrcRegRegion *&barrierSendSrc0) {
|
|
/*
|
|
* check G4_send
|
|
*/
|
|
G4_SendDescRaw *desc = sendInst->getMsgDescRaw();
|
|
if (!desc)
|
|
return false;
|
|
uint32_t descVal = desc->getDesc();
|
|
if ((desc->getFuncId() == SFID::GATEWAY) &&
|
|
(descVal == (0x1 << 25) + 0x4) && // 0x2000004
|
|
(sendInst->def_size() == 1)) {
|
|
auto item = sendInst->def_begin();
|
|
G4_INST *andInst = item->first; // getting "and" from send's def
|
|
|
|
/*
|
|
* check G4_and
|
|
*/
|
|
if ((andInst) && (andInst->opcode() == G4_and) &&
|
|
(item->second == Opnd_src0)) {
|
|
G4_Operand *src0 = andInst->getSrc(0);
|
|
G4_Operand *src1 = andInst->getSrc(1);
|
|
|
|
bool isSrc0 =
|
|
((src0->isSrcRegRegion()) && (src0->asSrcRegRegion()->getBase()) &&
|
|
(src0->asSrcRegRegion()->getBase()->isRegVar()) &&
|
|
(src0->asSrcRegRegion()->getBase()->asRegVar() ==
|
|
builder.getBuiltinR0()->getRegVar()) &&
|
|
(src0->asSrcRegRegion()->getRegOff() == 0) &&
|
|
(src0->asSrcRegRegion()->getSubRegOff() == 2)); // r0.2
|
|
|
|
bool isSrc1 =
|
|
src1->isImm() && !src1->isRelocImm() &&
|
|
src1->asImm()->getInt() ==
|
|
(builder.getPlatform() >= GENX_SKL ? 0x8F000000 : 0x0F000000);
|
|
|
|
if (isSrc0 && isSrc1 && sendInst->getSrc(0) &&
|
|
sendInst->getSrc(0)->isSrcRegRegion()) {
|
|
barrierSendSrc0 = sendInst->getSrc(0)->asSrcRegRegion();
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* add the barrier header as the top instruction
|
|
*/
|
|
void Optimizer::hoistBarrierHeaderToTop(G4_SrcRegRegion *barrierSendSrc0) {
|
|
G4_Declare *dcl = barrierSendSrc0->getTopDcl();
|
|
IR_Builder *mybuilder = &builder;
|
|
|
|
// below code is copied from translateVISASyncInst() for ISA_BARRIER
|
|
// all other dwords are ignored
|
|
// and (8) r32.0:ud r0.2:ud 0x0F000000
|
|
|
|
G4_SrcRegRegion *r0_src_opnd =
|
|
builder.createSrc(mybuilder->getBuiltinR0()->getRegVar(), 0, 2,
|
|
builder.getRegionScalar(), Type_UD);
|
|
G4_DstRegRegion *dst1_opnd =
|
|
builder.createDst(dcl->getRegVar(), 0, 0, 1, Type_UD);
|
|
|
|
G4_Imm *g4Imm = NULL;
|
|
|
|
// for SKL+ there are 5 bits for barrierID
|
|
// 5th bit is stored in bit 31 of second dword
|
|
if (builder.getPlatform() < GENX_SKL) {
|
|
g4Imm = builder.createImm(0x0F000000, Type_UD);
|
|
} else {
|
|
g4Imm = builder.createImm(0x8F000000, Type_UD);
|
|
}
|
|
|
|
G4_INST *andInst =
|
|
builder.createBinOp(G4_and, g4::SIMD8, dst1_opnd, r0_src_opnd, g4Imm,
|
|
InstOpt_WriteEnable, false);
|
|
for (auto bb : fg) {
|
|
auto iter = std::find_if(bb->begin(), bb->end(),
|
|
[](G4_INST *inst) { return !inst->isLabel(); });
|
|
if (iter != bb->end()) {
|
|
bb->insertBefore(iter, andInst);
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* check whether there are new definitions in order to determine redundancy
|
|
*/
|
|
bool Optimizer::chkNewDefBetweenSends(G4_INST *inst, MSGTableList &msgList,
|
|
DEFA0 &myA0) {
|
|
bool isDef = false;
|
|
msgList.unique();
|
|
|
|
// check SIMD8 VxH region every time
|
|
if (inst->getDst() &&
|
|
(inst->getDst()->isIndirect() || inst->getDst()->isDirectAddress())) {
|
|
isDef = myA0.isA0Redef = true;
|
|
} else {
|
|
for (int i = 0, numSrc = inst->getNumSrc(); i < numSrc; i++) {
|
|
if (!inst->getSrc(i) || !inst->getSrc(i)->isSrcRegRegion())
|
|
continue;
|
|
auto src = inst->getSrc(i)->asSrcRegRegion();
|
|
if (src->isIndirect() || src->isDirectAddress()) {
|
|
isDef = myA0.isA0Redef = true;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (msgList.size() < 2) {
|
|
return false;
|
|
}
|
|
MSGTable_ITER ii = msgList.begin();
|
|
if (ii == msgList.end()) {
|
|
return false;
|
|
}
|
|
|
|
ii++;
|
|
|
|
if (ii == msgList.end()) {
|
|
return false;
|
|
}
|
|
|
|
MSGTable *last = *(ii);
|
|
if (last == NULL || last->send == NULL) {
|
|
return false;
|
|
}
|
|
G4_Operand *def = inst->getDst();
|
|
if (def == NULL) {
|
|
return false;
|
|
}
|
|
if (last->mDot0 &&
|
|
(def->compareOperand(last->mDot0->getSrc(0), builder) == Rel_eq ||
|
|
(last->mDot0->getSrc(1) &&
|
|
def->compareOperand(last->mDot0->getSrc(1), builder) == Rel_eq) ||
|
|
(last->mDot0->getSrc(2) &&
|
|
def->compareOperand(last->mDot0->getSrc(2), builder) == Rel_eq))) {
|
|
isDef = last->isXRedef = true;
|
|
} else if (last->mDot1 &&
|
|
(def->compareOperand(last->mDot1->getSrc(0), builder) == Rel_eq ||
|
|
(last->mDot1->getSrc(1) &&
|
|
def->compareOperand(last->mDot1->getSrc(1), builder) ==
|
|
Rel_eq) ||
|
|
(last->mDot1->getSrc(2) &&
|
|
def->compareOperand(last->mDot1->getSrc(2), builder) ==
|
|
Rel_eq))) {
|
|
isDef = last->isYRedef = true;
|
|
} else if (last->mDot2 &&
|
|
(def->compareOperand(last->mDot2->getSrc(0), builder) == Rel_eq ||
|
|
(last->mDot2->getSrc(1) &&
|
|
def->compareOperand(last->mDot2->getSrc(1), builder) ==
|
|
Rel_eq) ||
|
|
(last->mDot2->getSrc(2) &&
|
|
def->compareOperand(last->mDot2->getSrc(2), builder) ==
|
|
Rel_eq))) {
|
|
isDef = last->isSizeRedef = true;
|
|
} else if (last->m &&
|
|
(def->compareOperand(last->m->getSrc(0), builder) == Rel_eq ||
|
|
(last->m->getSrc(1) &&
|
|
def->compareOperand(last->m->getSrc(1), builder) == Rel_eq) ||
|
|
(last->m->getSrc(2) &&
|
|
def->compareOperand(last->m->getSrc(2), builder) == Rel_eq))) {
|
|
isDef = last->isR0Dot0Redef = true;
|
|
}
|
|
return isDef;
|
|
}
|
|
|
|
/*
|
|
* Cache the send and its def into a table for optimization
|
|
*/
|
|
void Optimizer::addEntryToMessageTable(G4_INST *inst, MSGTableList &msgList,
|
|
G4_BB *bb, INST_LIST_ITER ii,
|
|
DEFA0 &myA0) {
|
|
MSGTable *item = msgList.front();
|
|
if (inst->isSend()) {
|
|
item->send = inst;
|
|
item->opt = false;
|
|
item->isR0Dot0Redef = false;
|
|
item->isXRedef = false;
|
|
item->isYRedef = false;
|
|
item->isSizeRedef = false;
|
|
|
|
if (item->invalid) {
|
|
msgList.pop_front();
|
|
} else if (item->a0Dot0 != NULL && // only def a0.0
|
|
(item->m == NULL || item->mDot2 == NULL)) {
|
|
if (isHeaderOptCandidate(item->a0Dot0, myA0.pred)) {
|
|
if (isHeaderOptReuse(item->a0Dot0, myA0.pred) && !myA0.isA0Redef) {
|
|
// Transfer uses of a0Dot0 to myA0.pred. This removes uses from
|
|
// a0Dot0 and add to myA0.pred. item->a0Dot0 to be deleted.
|
|
item->a0Dot0->transferUse(myA0.pred, /*keepExisting*/ true);
|
|
item->a0Dot0->markDead();
|
|
}
|
|
}
|
|
msgList.pop_front();
|
|
} else if (item->a0Dot0 && item->m && item->mDot2) // complete header def
|
|
{
|
|
|
|
msgList.unique();
|
|
if (msgList.size() >= 2) {
|
|
optMessageHeaders(msgList, bb, myA0);
|
|
if (msgList.front()->opt &&
|
|
msgList.front()->send->getMsgDesc()->getSrc0LenRegs() == 1) {
|
|
// keep the oldest send for subsequent read operations
|
|
// but the instruction to define a0.0 needs to be the latest
|
|
// msgList.back()->a0Dot0 = msgList.front()->a0Dot0;
|
|
msgList.pop_front(); // delete first element
|
|
} else if (msgList.front()->opt &&
|
|
msgList.front()->send->getMsgDesc()->getSrc0LenRegs() >= 1) {
|
|
// keep the latest send for subsequent write operations
|
|
msgList.pop_back();
|
|
} else {
|
|
msgList.pop_back();
|
|
}
|
|
myA0.isA0Redef = false;
|
|
}
|
|
} else {
|
|
// not an optimization candidate
|
|
msgList.pop_front();
|
|
}
|
|
} else if (inst->getDst() && inst->getDst()->getBase() &&
|
|
inst->getDst()->getBase()->isRegVar() &&
|
|
inst->getDst()->getBase()->asRegVar() ==
|
|
builder.getBuiltinA0()->getRegVar() &&
|
|
inst->getDst()->getRegOff() == 0 &&
|
|
inst->getDst()->getSubRegOff() == 0) {
|
|
// is builtInA0.0
|
|
item->a0Dot0 = inst;
|
|
item->a0Dot0_it = ii;
|
|
|
|
if (myA0.curr == NULL) {
|
|
myA0.pred = NULL;
|
|
myA0.isA0Redef = false;
|
|
} else if (!myA0.curr->isDead()) {
|
|
// only update the a0 def when we didn't remove it
|
|
myA0.pred = myA0.curr;
|
|
myA0.predIt = myA0.currIt;
|
|
}
|
|
myA0.currIt = ii;
|
|
myA0.curr = inst;
|
|
} else if (inst->getSrc(0)) {
|
|
G4_DstRegRegion *dst = inst->getDst();
|
|
if (dst) {
|
|
if (dst->getRegOff() == 0) {
|
|
// mov(8) m.0, builtInR0.0
|
|
G4_Operand *src = inst->getSrc(0);
|
|
if (dst->getSubRegOff() == 0 && inst->getExecSize() == g4::SIMD8 &&
|
|
src && src->isSrcRegRegion() && src->asSrcRegRegion()->getBase() &&
|
|
src->asSrcRegRegion()->getBase()->isRegVar() &&
|
|
src->asSrcRegRegion()->getBase()->asRegVar() ==
|
|
builder.getBuiltinR0()->getRegVar() &&
|
|
src->asSrcRegRegion()->getRegOff() == 0 &&
|
|
src->asSrcRegRegion()->getSubRegOff() == 0) {
|
|
if (item->first == HEADER_UNDEF)
|
|
item->first = HEADER_FULL_REGISTER;
|
|
item->m = inst;
|
|
item->m_it = ii;
|
|
}
|
|
// mov(1) m.0
|
|
else if (dst->getSubRegOff() == 0 && inst->getExecSize() == g4::SIMD1) {
|
|
if (item->first == HEADER_UNDEF)
|
|
item->first = HEADER_X;
|
|
item->mDot0 = inst;
|
|
item->mDot0_it = ii;
|
|
}
|
|
// mov(1) m.1
|
|
else if (dst->getSubRegOff() == 1 && inst->getExecSize() == g4::SIMD1) {
|
|
if (item->first == HEADER_UNDEF)
|
|
item->first = HEADER_Y;
|
|
item->mDot1 = inst;
|
|
item->mDot1_it = ii;
|
|
}
|
|
// mov(1) m.2
|
|
else if (dst->getSubRegOff() == 2 && inst->getExecSize() == g4::SIMD1) {
|
|
if (item->first == HEADER_UNDEF)
|
|
item->first = HEADER_SIZE;
|
|
item->mDot2 = inst;
|
|
item->mDot2_it = ii;
|
|
} else {
|
|
// unrecognized update to header
|
|
item->invalid = true;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void Optimizer::messageHeaderReport(size_t ic_before, size_t ic_after,
|
|
G4_Kernel &kernel) {
|
|
VISA_DEBUG({
|
|
std::cout << " === Message Header Optimization ===\n";
|
|
std::cout << std::fixed << "\n";
|
|
std::cout << kernel.getName() << " is reduced from " << ic_before << " to "
|
|
<< ic_after << " instructions.\n";
|
|
if (((float)(ic_before)) != 0.0) {
|
|
std::cout << std::setprecision(0)
|
|
<< (float)((ic_before - ic_after) * 100) / (float)(ic_before)
|
|
<< "% instructions of this kernel are removed.\n";
|
|
}
|
|
std::cout << "\n";
|
|
});
|
|
}
|
|
|
|
//
|
|
// optimizer for removal of redundant message header instructions
|
|
//
|
|
void Optimizer::cleanMessageHeader() {
|
|
MSGTableList msgList;
|
|
size_t ic_before = 0;
|
|
size_t ic_after = 0;
|
|
|
|
llvm::SpecificBumpPtrAllocator<MSGTable> MSGTableAlloc;
|
|
bool isRedundantBarrier = false;
|
|
G4_SrcRegRegion *barrierSendSrc0 = nullptr;
|
|
|
|
for (G4_BB *bb : fg) {
|
|
msgList.clear();
|
|
auto MSGTableMem = MSGTableAlloc.Allocate();
|
|
MSGTable *newItem = new (MSGTableMem) MSGTable();
|
|
newItem->first = HEADER_UNDEF;
|
|
|
|
msgList.push_front(newItem);
|
|
INST_LIST_ITER ii = bb->begin();
|
|
INST_LIST_ITER iend = bb->end();
|
|
ic_before += bb->size();
|
|
|
|
DEFA0 myA0;
|
|
myA0.curr = nullptr;
|
|
myA0.pred = nullptr;
|
|
myA0.isA0Redef = false;
|
|
|
|
for (; ii != iend; ii++) {
|
|
G4_INST *inst = *ii;
|
|
if (isHeaderCachingCandidate(inst)) {
|
|
if (inst->opcode() == G4_send && isRedundantBarrier) {
|
|
removeRedundantBarrierHeaders(inst, barrierSendSrc0, false);
|
|
} else if (inst->opcode() == G4_send && !isRedundantBarrier) {
|
|
isRedundantBarrier = isBarrierPattern(inst, barrierSendSrc0);
|
|
if (isRedundantBarrier) {
|
|
removeRedundantBarrierHeaders(inst, barrierSendSrc0, true);
|
|
}
|
|
}
|
|
|
|
addEntryToMessageTable(inst, msgList, bb, ii, myA0);
|
|
if (inst->isSend()) {
|
|
auto MSGTableMem = MSGTableAlloc.Allocate();
|
|
MSGTable *item = new (MSGTableMem) MSGTable();
|
|
item->first = HEADER_UNDEF;
|
|
msgList.push_front(item);
|
|
}
|
|
} else {
|
|
chkNewDefBetweenSends(inst, msgList, myA0);
|
|
}
|
|
}
|
|
|
|
// Dead code elimination
|
|
for (ii = bb->begin(); ii != bb->end();) {
|
|
G4_INST *inst = *ii;
|
|
INST_LIST_ITER curr = ii++;
|
|
if (inst->isDead()) {
|
|
inst->removeUseOfInst();
|
|
bb->erase(curr);
|
|
}
|
|
}
|
|
|
|
ic_after += bb->size();
|
|
}
|
|
|
|
messageHeaderReport(ic_before, ic_after, kernel);
|
|
|
|
if (isRedundantBarrier) {
|
|
hoistBarrierHeaderToTop(barrierSendSrc0);
|
|
}
|
|
msgList.clear();
|
|
}
|
|
// The end of message header optimization
|
|
|
|
// For NoMask inst with non-zero mask offset, set maskoffset = 0 if possible.
|
|
//
|
|
// For any NoMask inst with non-zero mask offset, if it does not access any
|
|
// ARF and it is not a CF instruction, set its mask offset to zero.
|
|
void Optimizer::forceNoMaskOnM0() {
|
|
for (G4_BB *currBB : fg) {
|
|
for (auto &I : *currBB) {
|
|
if (!I->isWriteEnableInst() || I->isCFInst() || I->getPredicate() ||
|
|
I->getCondMod() || I->getMaskOffset() == 0 ||
|
|
I->hasImplicitAccDst() || I->hasImplicitAccSrc())
|
|
continue;
|
|
|
|
// skip if I is logical on flag registers.
|
|
// For example:
|
|
// (W) pseudo_and (16|M16) P2:uw P2:uw P1:uw
|
|
// where P2 is the high 16 bits of a 32-bit flag
|
|
// and M16 cannot be changed to M0.
|
|
if (I->isLogic() || I->isPseudoLogic()) {
|
|
// Only checking dst is enough.
|
|
G4_DstRegRegion *dst = I->getDst();
|
|
if (dst && !dst->isNullReg() && dst->isFlag()) {
|
|
continue;
|
|
}
|
|
}
|
|
|
|
I->setMaskOption(InstOpt_M0);
|
|
}
|
|
}
|
|
}
|
|
|
|
void Optimizer::sendFusion() {
|
|
// Potential problem related to noMask WA
|
|
//
|
|
// Send fusion creates the following code:
|
|
// 1. (W) mov (1|M0) f1.0<1>:uw 0x0:uw
|
|
// 2. cmp (8|M0) (eq)f1.0 null<1>:uw r0.0<8;8,1>:uw r0.0<8;8,1>:uw
|
|
// 3. (W) mov (1|M0) r18.4<1>:uw f1.0<0;1,0>:uw
|
|
// 4. (W) mov (2|M0) r18.8<1>:ub r18.8<0;1,0>:ub
|
|
// 5. (W) mov (1|M0) f0.1<1>:uw r18.4<0;1,0>:uw
|
|
// fused send:
|
|
// (W&f0.1) send.dc1 (16|M0) r5 r27 r1 0x40 0x02205EFF
|
|
//
|
|
// This code also works if NoMask WA is needed. Actually, this f0.1 behaves
|
|
// the same as NoMask WA. And it is critical that all of them should be
|
|
// executed without applying NoMask WA. Here is the reason why:
|
|
// Assume we hit the HW bug: no channels are on, but the EU still runs through
// those instructions. f1.0 is then all 0 at the end of 2; as a result, f0.1
// will be all zero and the fused send will not run, as its predicate is
// false. But if the NoMask WA is applied to 3 in the postRA WA (because it
// looks like a flag spill), (3) becomes:
//      (3) (W& f0.0.any8) mov (1|M0) r18.4<1>:uw f1.0<0;1,0>:uw
// and this instruction will no longer run; as a result, f0.1 holds garbage
// and may cause the fused send to run, which is wrong.
|
|
//
|
|
// The solutions:
|
|
// 1) turn off send fusion (does it really help?);
|
|
// 2) don't apply WA on those instructions.
|
|
// As those 1-5 are all local definitions, postRA WA should skip them.
|
|
// For now, we will do 2 to minimize potential impacts.
|
|
if (builder.hasFusedEU()) {
|
|
// Turn off send fusion for EU Fusion platforms.
|
|
return;
|
|
}
|
|
(void)doSendFusion(&fg, &mem);
|
|
}
|
|
|
|
G4_SrcRegRegion *IR_Builder::createSubSrcOperand(G4_SrcRegRegion *src,
|
|
uint16_t start, uint8_t size,
|
|
uint16_t newVs,
|
|
uint16_t newWd) {
|
|
const RegionDesc *rd = NULL;
|
|
uint16_t vs = src->getRegion()->vertStride, hs = src->getRegion()->horzStride,
|
|
wd = src->getRegion()->width;
|
|
G4_Type srcType = src->getType();
|
|
// even if src has VxH region, it could have a width that is equal to the new
|
|
// exec_size, meaning that it's really just a 1x1 region.
|
|
auto isVxHRegion = src->getRegion()->isRegionWH() && wd < size;
|
|
if (!isVxHRegion) {
|
|
// r[a0.0,0]<4;2,1> and size is 4 or 1
|
|
if (size < newWd) {
|
|
newWd = size;
|
|
}
|
|
rd = size == 1
|
|
? getRegionScalar()
|
|
: createRegionDesc(size == newWd ? newWd * hs : newVs, newWd, hs);
|
|
rd = getNormalizedRegion(size, rd);
|
|
}
|
|
|
|
if (src->getRegAccess() != Direct) {
|
|
if (isVxHRegion) {
|
|
// just handle <1,0>
|
|
if (start > 0) {
|
|
// Change a0.N to a0.(N+start)
|
|
vISA_ASSERT((start % wd == 0),
|
|
"illegal starting offset and width combination");
|
|
uint16_t subRegOff = src->getSubRegOff() + start / wd;
|
|
return createIndirectSrc(src->getModifier(), src->getBase(),
|
|
src->getRegOff(), subRegOff, src->getRegion(),
|
|
src->getType(), src->getAddrImm());
|
|
} else {
|
|
return duplicateOperand(src);
|
|
}
|
|
}
|
|
|
|
if (start > 0) {
|
|
short numRows = start / wd;
|
|
short numCols = start % wd;
|
|
short newOff = (numRows * vs + numCols * hs) * TypeSize(srcType);
|
|
auto newSrc = createIndirectSrc(
|
|
src->getModifier(), src->getBase(), src->getRegOff(),
|
|
src->getSubRegOff(), rd, src->getType(), src->getAddrImm() + newOff);
|
|
return newSrc;
|
|
} else {
|
|
G4_SrcRegRegion *newSrc = duplicateOperand(src);
|
|
newSrc->setRegion(*this, rd);
|
|
return newSrc;
|
|
}
|
|
}
|
|
|
|
// direct access operand
|
|
if (src->isAccReg()) {
|
|
switch (srcType) {
|
|
case Type_F:
|
|
// must be acc1.0 as result of simd16 -> 8 split
|
|
vISA_ASSERT(size == 8, "only support simd16->simd8 for now");
|
|
return createSrcRegRegion(src->getModifier(), Direct,
|
|
phyregpool.getAcc1Reg(), 0, 0, src->getRegion(),
|
|
srcType);
|
|
case Type_HF: {
|
|
// can be one of acc0.8, acc1.0, acc1.8
|
|
if (src->getBase()->asAreg()->getArchRegType() == AREG_ACC1) {
|
|
start += 16;
|
|
}
|
|
G4_Areg *accReg =
|
|
start >= 16 ? phyregpool.getAcc1Reg() : phyregpool.getAcc0Reg();
|
|
return createSrcRegRegion(src->getModifier(), Direct, accReg, 0,
|
|
start % 16, src->getRegion(), srcType);
|
|
}
|
|
default:
|
|
// Keep using acc0 for other types.
|
|
return duplicateOperand(src);
|
|
}
|
|
}
|
|
|
|
// Since this function creates a new sub src operand based on a start offset,
|
|
// the reg and subreg offsets need to be re-computed.
|
|
uint16_t regOff, subRegOff, subRegOffByte, newSubRegOffByte, newEleOff,
|
|
newEleOffByte, crossGRF;
|
|
|
|
newEleOff =
|
|
start * hs +
|
|
(start >= wd && vs != wd * hs ? (start / wd * (vs - wd * hs)) : 0);
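// Worked example (hypothetical region): for a <8;4,1> source (vs=8, wd=4,
// hs=1) and start=4, newEleOff = 4*1 + (4/4)*(8 - 4*1) = 8, i.e. the split
// operand starts at the first element of the region's second row.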
|
|
|
|
// Linearize offsets into bytes to verify potential GRF crossing
|
|
newEleOffByte = newEleOff * src->getTypeSize();
|
|
subRegOffByte = src->getSubRegOff() * src->getTypeSize();
|
|
|
|
// If subreg crosses GRF size, update reg and subreg offset accordingly
|
|
newSubRegOffByte = subRegOffByte + newEleOffByte;
|
|
crossGRF = newSubRegOffByte / kernel.numEltPerGRF<Type_UB>();
|
|
|
|
newSubRegOffByte =
|
|
newSubRegOffByte - crossGRF * kernel.numEltPerGRF<Type_UB>();
|
|
|
|
// Compute final reg and subreg offsets
|
|
regOff = src->getRegOff() + crossGRF;
|
|
subRegOff = newSubRegOffByte / src->getTypeSize();
|
|
|
|
return createSrcRegRegion(src->getModifier(), Direct, src->getBase(), regOff,
|
|
subRegOff, rd, srcType, src->getAccRegSel());
|
|
}
|
|
|
|
G4_DstRegRegion *IR_Builder::createSubDstOperand(G4_DstRegRegion *dst,
|
|
uint16_t start, uint8_t size) {
|
|
if (dst->getRegAccess() != Direct) {
|
|
if (start > 0) {
|
|
// just change immediate offset
|
|
uint16_t newOff = start * dst->getTypeSize() * dst->getHorzStride();
|
|
G4_DstRegRegion *newDst = duplicateOperand(dst);
|
|
newDst->setImmAddrOff(dst->getAddrImm() + newOff);
|
|
return newDst;
|
|
} else {
|
|
return duplicateOperand(dst);
|
|
}
|
|
}
|
|
|
|
uint16_t regOff, subRegOff;
|
|
if (start > 0) {
|
|
G4_Type dstType = dst->getType();
|
|
uint16_t hs = dst->getHorzStride();
|
|
if (dst->isAccReg()) {
|
|
switch (dstType) {
|
|
case Type_F:
|
|
// must be acc1.0 as result of simd16 -> 8 split
|
|
vISA_ASSERT(size == 8, "only support simd16->simd8 for now");
|
|
return createDst(phyregpool.getAcc1Reg(), 0, 0, hs, dstType);
|
|
case Type_HF: {
|
|
// can be one of acc0.8, acc1.0, acc1.8
|
|
if (dst->getBase()->asAreg()->getArchRegType() == AREG_ACC1) {
|
|
start += 16;
|
|
}
|
|
G4_Areg *accReg =
|
|
start >= 16 ? phyregpool.getAcc1Reg() : phyregpool.getAcc0Reg();
|
|
return createDst(accReg, 0, start % 16, hs, dstType);
|
|
}
|
|
default:
|
|
|
|
// Other types do not support acc1; we have to continue to use acc0.
// Whoever is doing the split must fix the dependencies later by shuffling
// instructions so that acc0 does not get overwritten.
|
|
return createDstRegRegion(*dst);
|
|
}
|
|
}
|
|
|
|
// Linearize offsets into bytes to verify potential GRF crossing
|
|
uint16_t newSubRegOff, newSubRegOffByte, crossGRF;
|
|
|
|
newSubRegOff = dst->getSubRegOff() + start * hs;
|
|
newSubRegOffByte = newSubRegOff * TypeSize(dstType);
|
|
|
|
crossGRF = newSubRegOffByte / kernel.numEltPerGRF<Type_UB>();
|
|
newSubRegOffByte =
|
|
newSubRegOffByte - crossGRF * kernel.numEltPerGRF<Type_UB>();
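// Worked example (hypothetical dst, assuming a 64-byte GRF): for dst
// V10(2,12):d with hs=1 and start=8, newSubRegOff = 20, i.e. byte 80, so
// crossGRF = 1 and the remaining byte offset is 16; the new dst becomes
// V10(3,4):d.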
|
|
|
|
// Compute final reg and subreg offsets
|
|
regOff = dst->getRegOff() + crossGRF;
|
|
subRegOff = newSubRegOffByte / TypeSize(dstType);
|
|
|
|
// create a new one
|
|
return createDst(dst->getBase(), regOff, subRegOff, hs, dst->getType(),
|
|
dst->getAccRegSel());
|
|
} else {
|
|
G4_DstRegRegion *newDst = duplicateOperand(dst);
|
|
return newDst;
|
|
}
|
|
}
|
|
|
|
G4_INST *IR_Builder::makeSplittingInst(G4_INST *inst, G4_ExecSize ExSize) {
|
|
// Instruction's option is reused. Call sites should reset this field
|
|
// properly. FIXME: fix all call sites.
|
|
G4_INST *newInst = NULL;
|
|
G4_opcode op = inst->opcode();
|
|
if (inst->isMath()) {
|
|
newInst = createMathInst(NULL, inst->getSaturate(), ExSize, NULL, NULL,
|
|
NULL, inst->asMathInst()->getMathCtrl(),
|
|
inst->getOption(), true);
|
|
} else if (inst->getNumSrc() < 3) {
|
|
newInst = createInternalInst(NULL, op, NULL, inst->getSaturate(), ExSize,
|
|
NULL, NULL, NULL, inst->getOption());
|
|
} else {
|
|
newInst = createInternalInst(NULL, op, NULL, inst->getSaturate(), ExSize,
|
|
NULL, NULL, NULL, NULL, inst->getOption());
|
|
}
|
|
|
|
newInst->inheritDIFrom(inst);
|
|
|
|
return newInst;
|
|
}
|
|
|
|
// HW WAs that are done before RA.
|
|
void Optimizer::preRA_HWWorkaround() {
|
|
if (builder.hasFusedEUNoMaskWA()) {
|
|
prepareNoMaskWA();
|
|
}
|
|
|
|
// Call WA for fused EU
|
|
if (builder.hasFusedEU() && kernel.hasIndirectCall()) {
|
|
applyFusedCallWA();
|
|
// Reassign block IDs and recompute back edges after FusedCallWA, which may
// add new basic blocks.
|
|
kernel.fg.reassignBlockIDs();
|
|
kernel.fg.findBackEdges();
|
|
}
|
|
|
|
insertFenceAtEntry();
|
|
|
|
cloneSampleInst();
|
|
|
|
insertIEEEExceptionTrap();
|
|
|
|
if (builder.supportNativeSIMD32())
|
|
fixDirectAddrBoundOnDst();
|
|
}
|
|
|
|
//
|
|
// HW WAs that are done right after RA.
|
|
// Sometimes a WA needs both a preRA and a postRA part, and the postRA part
// needs info from preRA (NoMask WA). If the post-WA were done in
// HWWorkaround, some instructions, or even basic blocks (ifcvt), might
// already have been removed, which could interfere with information passing
// from preRA to postRA. The loss of such information can cause the postRA WA
// to fail. For this purpose, a postRA_HWWorkaround is added. This also means
// that BBs and insts between the preRA pass and the postRA pass remain
// undeleted (is it too strong?).
|
|
//
|
|
// Note that for those WAs that should be done after inst scheduling, they
|
|
// should go to HWWorkaround, not here.
|
|
//
|
|
void Optimizer::postRA_HWWorkaround() {
|
|
if (builder.hasFusedEUNoMaskWA()) {
|
|
applyNoMaskWA();
|
|
}
|
|
if (builder.supportNativeSIMD32())
|
|
fixDirectAddrBoundOnDst();
|
|
}
|
|
|
|
// Should only be called post-RA; returns true if this operand has an
// overlapping GRF range with the other operand.
// ToDo: extend to non-GRF operands?
|
|
static bool hasOverlappingGRF(G4_Operand *opnd, G4_Operand *other) {
|
|
if (!opnd || !other || !opnd->isGreg() || !other->isGreg())
|
|
return false;
|
|
auto LB = opnd->getLinearizedStart(), RB = opnd->getLinearizedEnd();
|
|
auto otherLB = other->getLinearizedStart(),
|
|
otherRB = other->getLinearizedEnd();
|
|
return !(RB < otherLB || LB > otherRB);
|
|
}
|
|
|
|
// returns for this fence instruction the iterator position where the commit
|
|
// move should be inserted. We conservatively assume a commit is needed before
|
|
// -- another send
|
|
// -- any optimization barrier
|
|
// -- any instruction that writes to fence's dst GRF
|
|
// If another instruction happens to read dst GRF, then it serves as the commit
|
|
// and we don't need the dummy move
|
|
std::optional<INST_LIST_ITER>
|
|
Optimizer::findFenceCommitPos(INST_LIST_ITER fence, G4_BB *bb) const {
|
|
auto fenceInst = *fence;
|
|
vASSERT(fenceInst->isSend() && fenceInst->asSendInst()->isFence());
|
|
auto dst = fenceInst->getDst();
|
|
auto I = std::next(fence);
|
|
for (auto E = bb->end(); I != E; ++I) {
|
|
G4_INST *inst = *I;
|
|
if (inst->isSend() || inst->isCFInst() || inst->isLabel() ||
|
|
inst->isOptBarrier()) {
|
|
break;
|
|
}
|
|
if (hasOverlappingGRF(dst, inst->getDst())) {
|
|
break;
|
|
}
|
|
for (auto SI = inst->src_begin(), SE = inst->src_end(); SI != SE; ++SI) {
|
|
auto src = *SI;
|
|
if (hasOverlappingGRF(dst, src)) {
|
|
return std::nullopt;
|
|
}
|
|
}
|
|
}
|
|
return I;
|
|
}
|
|
|
|
bool Optimizer::addFenceCommit(INST_LIST_ITER ii, G4_BB *bb,
|
|
bool scheduleFenceCommit) {
|
|
G4_INST *inst = *ii;
|
|
G4_InstSend *sendInst = inst->asSendInst();
|
|
vASSERT(sendInst);
|
|
if (sendInst && sendInst->getMsgDesc()->getDstLenRegs() > 0) {
|
|
// Commit is enabled for the fence; we need to generate a move afterwards to
// make sure the fence is complete, e.g.,
//   mov (8) r1.0<1>:ud r1.0<8;8,1>:ud {NoMask}
|
|
auto nextIter = std::next(ii);
|
|
if (scheduleFenceCommit) {
|
|
auto iter = findFenceCommitPos(ii, bb);
|
|
if (!iter) {
|
|
return false; // skip commit for this fence
|
|
}
|
|
nextIter = *iter;
|
|
}
|
|
auto dst = inst->getDst();
|
|
G4_Declare *fenceDcl = dst->getBase()->asRegVar()->getDeclare();
|
|
G4_DstRegRegion *movDst = builder.createDst(
|
|
builder.phyregpool.getNullReg(), 0, 0, 1, fenceDcl->getElemType());
|
|
G4_SrcRegRegion *movSrc =
|
|
builder.createSrcRegRegion(fenceDcl, builder.createRegionDesc(8, 8, 1));
|
|
G4_INST *movInst = builder.createMov(g4::SIMD8, movDst, movSrc,
|
|
InstOpt_WriteEnable, false);
|
|
movInst->addComment("memory fence commit");
|
|
bb->insertBefore(nextIter, movInst);
|
|
} else if (builder.hasFenceControl()) {
|
|
// null dst, use sync.fence instead
|
|
auto nextIter = std::next(ii);
|
|
G4_INST *syncInst = builder.createInternalInst(
|
|
nullptr, G4_sync_fence, nullptr, g4::NOSAT, g4::SIMD1, nullptr,
|
|
builder.createNullSrc(Type_UD), nullptr, InstOpt_NoOpt);
|
|
bb->insertBefore(nextIter, syncInst);
|
|
}
|
|
return true;
|
|
}
|
|
|
|
//
|
|
// Rewrite source regions to satisfy various HW requirements. This pass will
// not modify the instructions otherwise.
// -- rewrite <1;1,0> to <2;2,1> when possible (exec size > 1, width is not
//    used to cross GRF), as HW doesn't allow <1;1,0> to be co-issued
|
|
//
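// Illustrative sketch (the exact equivalent region is chosen by
// rewriteContiguousRegion, so the rewrite below is just one possibility):
//   add (8) V10<1>:d V20<1;1,0>:d V30<1;1,0>:d
//   ===>
//   add (8) V10<1>:d V20<2;2,1>:d V30<2;2,1>:d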
|
|
void Optimizer::normalizeRegion() {
|
|
for (auto bb : fg) {
|
|
for (auto inst : *bb) {
|
|
if (inst->isCall() || inst->isFCall() ||
|
|
inst->isReturn() || inst->isFReturn()) {
|
|
// Do not rewrite region for call or return,
|
|
// as the effective execution size is 2.
|
|
continue;
|
|
}
|
|
|
|
// Do not rewrite region for dpas as HW requires region <1;1,0>
|
|
if (inst->isDpas())
|
|
continue;
|
|
|
|
if (inst->getExecSize() == g4::SIMD1) {
|
|
// Replace: mov (1) r64.0<4>:df r3.0<0;1,0>:df
|
|
// with: mov (1) r64.0<1>:df r3.0<0;1,0>:df
|
|
// otherwise, will get incorrect results for HSW, HW mode
|
|
G4_Operand *dst = inst->getDst();
|
|
if (dst != NULL && dst->asDstRegRegion()->getHorzStride() > 1 &&
|
|
dst->getTypeSize() == 8) {
|
|
dst->asDstRegRegion()->setHorzStride(1);
|
|
}
|
|
} else {
|
|
for (int i = 0; i < inst->getNumSrc(); ++i) {
|
|
G4_Operand *src = inst->getSrc(i);
|
|
// Only rewrite direct regions.
|
|
if (src && src->isSrcRegRegion() &&
|
|
src->asSrcRegRegion()->getRegAccess() == Direct) {
|
|
G4_SrcRegRegion *srcRegion = src->asSrcRegRegion();
|
|
if (srcRegion->getRegion()->isContiguous(inst->getExecSize())) {
|
|
srcRegion->rewriteContiguousRegion(builder, i);
|
|
} else if (inst->isAlign1Ternary()) {
|
|
// special checks for 3src inst with single non-unit stride region
|
|
// rewrite it as <s*2;s>
|
|
uint16_t stride = 0;
|
|
if (srcRegion->getRegion()->isSingleNonUnitStride(
|
|
inst->getExecSize(), stride)) {
|
|
vISA_ASSERT(stride <= 4,
|
|
"illegal stride for align1 ternary region");
|
|
srcRegion->setRegion(
|
|
builder,
|
|
kernel.fg.builder->createRegionDesc(stride * 2, 2, stride));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void Optimizer::countGRFUsage() {
|
|
unsigned int maxGRFNum = kernel.getNumRegTotal();
|
|
int count = 0;
|
|
std::vector<bool> GRFUse(maxGRFNum, false);
|
|
for (auto dcl : kernel.Declares) {
|
|
if (!fg.getHasStackCalls() &&
|
|
(builder.isPreDefFEStackVar(dcl) || builder.isPreDefSpillHeader(dcl))) {
|
|
continue;
|
|
}
|
|
if (dcl->getRegVar()->isGreg()) {
|
|
int GRFStart = dcl->getRegVar()->getPhyReg()->asGreg()->getRegNum();
|
|
int numRows = dcl->getNumRows();
|
|
vISA_ASSERT(GRFStart >= 0 && (GRFStart + numRows) <= (int)maxGRFNum,
|
|
"illegal GRF assignment");
|
|
for (int i = GRFStart; i < GRFStart + numRows; ++i) {
|
|
GRFUse[i] = true;
|
|
}
|
|
}
|
|
}
|
|
for (unsigned int i = 0; i < maxGRFNum; ++i)
|
|
if (GRFUse[i])
|
|
count++;
|
|
fg.builder->getJitInfo()->stats.numGRFUsed = count;
|
|
fg.builder->criticalMsgStream()
|
|
<< "\tKernel " << kernel.getName() << " : " << count << " registers\n";
|
|
}
|
|
|
|
//
|
|
// Dump the input payload to the start of the scratch space.
// This is strictly for debugging; we do not care if this gets overwritten
// by other usage of the scratch space (private memory, spill, etc.).
|
|
//
|
|
void Optimizer::dumpPayload() {
|
|
int inputEnd = 0;
|
|
for (int i = 0, numInput = kernel.fg.builder->getInputCount(); i < numInput;
|
|
++i) {
|
|
input_info_t *input_info = kernel.fg.builder->getInputArg(i);
|
|
if (inputEnd < input_info->size + input_info->offset) {
|
|
inputEnd = input_info->size + input_info->offset;
|
|
}
|
|
}
|
|
|
|
G4_BB *bb = kernel.fg.getEntryBB();
|
|
// iter points to the first non-label inst
|
|
auto iter = bb->begin(), bbEnd = bb->end();
|
|
while (iter != bbEnd) {
|
|
if (!(*iter)->isLabel()) {
|
|
break;
|
|
}
|
|
++iter;
|
|
}
|
|
|
|
int regOffset = (inputEnd + kernel.numEltPerGRF<Type_UB>() - 1) /
|
|
kernel.numEltPerGRF<Type_UB>();
|
|
|
|
static const unsigned SCRATCH_MSG_DESC_CATEGORY = 18;
|
|
static const unsigned SCRATCH_MSG_DESC_OPERATION_MODE = 17;
|
|
static const unsigned SCRATCH_MSG_DESC_CHANNEL_MODE = 16;
|
|
static const unsigned SCRATCH_MSG_DESC_BLOCK_SIZE = 12;
|
|
|
|
// write 8 GRFs at a time
|
|
int msgSize = 8;
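// Each loop iteration below builds one scratch block-write message that
// stores msgSize GRFs starting at GRF i; i is also folded into the low bits
// of the descriptor (msgDescImm |= i) as the per-message scratch offset.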
|
|
for (int i = 1; i < regOffset; i += msgSize) {
|
|
uint16_t extFuncCtrl = 0;
|
|
// both scratch and block read use DC
|
|
SFID funcID = SFID::DP_DC0;
|
|
|
|
uint32_t headerPresent = 0x80000;
|
|
uint32_t msgDescImm = headerPresent;
|
|
uint32_t msgLength = 1;
|
|
uint32_t blocksizeEncoding = 0x3; // 8 GRF
|
|
msgDescImm |= (msgLength << getSendMsgLengthBitOffset());
|
|
msgDescImm |= (1 << SCRATCH_MSG_DESC_CATEGORY);
|
|
msgDescImm |= (1 << SCRATCH_MSG_DESC_CHANNEL_MODE);
|
|
msgDescImm |= (1 << SCRATCH_MSG_DESC_OPERATION_MODE);
|
|
|
|
msgDescImm |= (blocksizeEncoding << SCRATCH_MSG_DESC_BLOCK_SIZE);
|
|
msgDescImm |= i;
|
|
|
|
G4_SendDescRaw *desc = kernel.fg.builder->createSendMsgDesc(
|
|
msgDescImm, 0, 1, funcID, msgSize, extFuncCtrl, SendAccess::WRITE_ONLY);
|
|
const RegionDesc *region = kernel.fg.builder->getRegionStride1();
|
|
G4_SrcRegRegion *headerOpnd = kernel.fg.builder->createSrcRegRegion(
|
|
kernel.fg.builder->getBuiltinR0(), region);
|
|
G4_Declare *tempDcl =
|
|
builder.createHardwiredDeclare(msgSize * 8, Type_UD, i, 0);
|
|
G4_SrcRegRegion *srcOpnd =
|
|
kernel.fg.builder->createSrcRegRegion(tempDcl, region);
|
|
G4_DstRegRegion *dstOpnd = kernel.fg.builder->createNullDst(Type_UD);
|
|
|
|
G4_INST *sendInst = kernel.fg.builder->createSplitSendInst(
|
|
nullptr, G4_sends, g4::SIMD16, dstOpnd, headerOpnd, srcOpnd,
|
|
kernel.fg.builder->createImm(msgDescImm, Type_UD), InstOpt_WriteEnable,
|
|
desc, nullptr, true);
|
|
bb->insertBefore(iter, sendInst);
|
|
}
|
|
}
|
|
|
|
// perform simple stat collection (e.g., numSends)
|
|
// IR is not modified
|
|
void Optimizer::collectStats() {
|
|
uint32_t numSends = 0;
|
|
for (auto bb : fg) {
|
|
for (auto inst : *bb) {
|
|
if (inst->isSend()) {
|
|
numSends++;
|
|
}
|
|
if (!builder.hasDFInst() && inst->isDFInstruction()) {
|
|
builder.setHasDFInst(true);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void Optimizer::mapOrphans() {
|
|
auto catchAllCISAOff = builder.debugInfoPlaceholder;
|
|
if (catchAllCISAOff == UNMAPPABLE_VISA_INDEX)
|
|
return;
|
|
|
|
for (auto bb : kernel.fg) {
|
|
for (auto inst : *bb) {
|
|
if (inst->getVISAId() == UNMAPPABLE_VISA_INDEX) {
|
|
inst->setVISAId(catchAllCISAOff);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
G4_Declare *Optimizer::createInstsForCallTargetOffset(InstListType &insts,
|
|
G4_INST *fcall,
|
|
int64_t adjust_off) {
|
|
// create instruction sequence:
|
|
// add r2.0 -IP call_target
|
|
// add r2.0 r2.0 adjust_off
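// After these two adds, r2.0 holds (call_target - IP + adjust_off), i.e. the
// branch distance relative to the instruction that consumes it; adjust_off
// compensates for the instructions between the IP read and that branch.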
|
|
|
|
// call's dst must be r125.0, which is reserved at
|
|
// GlobalRA::setABIForStackCallFunctionCalls.
|
|
vASSERT(fcall->getDst()->isGreg());
|
|
// call dst must not be overlapped with r2 which is hardcoded as the new jump
|
|
// target
|
|
vASSERT((fcall->getDst()->getLinearizedStart() /
|
|
kernel.numEltPerGRF<Type_UB>()) != 2);
|
|
|
|
// Hardcode the add's dst to r2.
// The sub-reg offset must be the same as the call dst's sub-reg offset, and
// must be 0 (HW restriction).
|
|
uint32_t reg_off = fcall->getDst()->getLinearizedStart() %
|
|
kernel.numEltPerGRF<Type_UB>() /
|
|
fcall->getDst()->getTypeSize();
|
|
|
|
G4_Declare *add_dst_decl =
|
|
builder.createHardwiredDeclare(1, fcall->getDst()->getType(), 2, reg_off);
|
|
|
|
// create the first add instruction
|
|
// add r2.0 -IP call_target
|
|
G4_INST *add_inst = builder.createBinOp(
|
|
G4_add, g4::SIMD1, builder.createDstRegRegion(add_dst_decl, 1),
|
|
builder.createSrcRegRegion(Mod_Minus, Direct,
|
|
builder.phyregpool.getIpReg(), 0, 0,
|
|
builder.getRegionScalar(), Type_UD),
|
|
fcall->getSrc(0), InstOpt_WriteEnable | InstOpt_NoCompact, false);
|
|
|
|
if (builder.needIPWA())
|
|
replaceIPWithCall(insts, add_inst);
|
|
|
|
// Create the second add to add adjust_off to the (-ip + call_target) value.
// adjust_off depends on how many instructions lie between the first add and
// the jump instruction, and on whether it is post-increment (jmpi) or
// pre-increment (call).
// add r2.0 r2.0 adjust_off
|
|
G4_INST *add_inst2 = builder.createBinOp(
|
|
G4_add, g4::SIMD1, builder.createDstRegRegion(add_dst_decl, 1),
|
|
builder.createSrcRegRegion(add_dst_decl, builder.getRegionScalar()),
|
|
builder.createImm(adjust_off, Type_D),
|
|
InstOpt_WriteEnable | InstOpt_NoCompact, false);
|
|
|
|
insts.push_back(add_inst);
|
|
insts.push_back(add_inst2);
|
|
|
|
return add_dst_decl;
|
|
}
|
|
|
|
void Optimizer::replaceIPWithCall(InstListType &insts, G4_INST *inst_with_ip) {
|
|
// Expand
//   add dst -IP call_target
// To
//   call dst _label_ip_wa       // jump to the next instruction
// _label_ip_wa:
//   add dst dst 32              // adjust dst to the ip of the next 2 instructions
//   ret dst                     // jump to the next instruction
//   add dst -dst call_target    // at this instruction dst is the ip value
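// (32 bytes corresponds to two full-size, non-compacted instructions, which
// is why the sequence below is created with InstOpt_NoCompact.)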
|
|
|
|
uint32_t reg_num = inst_with_ip->getDst()->getLinearizedStart() /
|
|
kernel.numEltPerGRF<Type_UB>();
|
|
uint32_t reg_off = inst_with_ip->getDst()->getLinearizedStart() %
|
|
kernel.numEltPerGRF<Type_UB>() /
|
|
inst_with_ip->getDst()->getTypeSize();
|
|
// call's dst must have sub-reg num 0 (HW restriction)
|
|
vASSERT(reg_off == 0);
|
|
G4_Declare *dst_decl = builder.createHardwiredDeclare(
|
|
1, inst_with_ip->getDst()->getType(), reg_num, reg_off);
|
|
|
|
// call dst _label_ip_wa
|
|
// NOTE: create the call and label instructions directly without forming a BB
|
|
// to skip the BB-ends-with-call checking (e.g. in SWSB setting); this is
// just a fall-through call and a temporary WA
|
|
G4_Label *label = builder.createLocalBlockLabel("ip_wa");
|
|
insts.push_back(builder.createInternalInst(
|
|
nullptr, G4_call, nullptr, g4::NOSAT, g4::SIMD1,
|
|
builder.createDstRegRegion(dst_decl, 1), label, nullptr,
|
|
InstOpt_WriteEnable));
|
|
// _label_ip_wa:
|
|
insts.push_back(builder.createLabelInst(label, false));
|
|
|
|
// add dst dst 32
|
|
insts.push_back(builder.createBinOp(
|
|
G4_add, g4::SIMD1, builder.createDstRegRegion(dst_decl, 1),
|
|
builder.createSrcRegRegion(dst_decl, builder.getRegionScalar()),
|
|
builder.createImm(32, Type_D), InstOpt_WriteEnable | InstOpt_NoCompact,
|
|
false));
|
|
|
|
// ret dst
|
|
insts.push_back(builder.createInternalInst(
|
|
nullptr, G4_return, nullptr, g4::NOSAT, g4::SIMD1, nullptr,
|
|
builder.createSrcRegRegion(dst_decl, builder.getRegionScalar()), nullptr,
|
|
InstOpt_WriteEnable | InstOpt_NoCompact));
|
|
|
|
// update given add instruction's src0 if needed
|
|
if (inst_with_ip->opcode() == G4_add) {
|
|
G4_SrcRegRegion *new_src =
|
|
builder.createSrcRegRegion(dst_decl, builder.getRegionScalar());
|
|
new_src->setModifier(Mod_Minus);
|
|
inst_with_ip->setSrc(new_src, 0);
|
|
}
|
|
}
|
|
|
|
void Optimizer::createInstForJmpiSequence(InstListType &insts, G4_INST *fcall) {
|
|
// SKL workaround for indirect call
|
|
// r125.0 is the return IP (the instruction right after jmpi)
|
|
// r125.1 is the return mask. While we'll replace the ret in the callee with
// jmpi as well, we do not need to consider the return mask here.
|
|
|
|
// Do not allow predicate call on jmpi WA
|
|
vASSERT(fcall->getPredicate() == nullptr);
|
|
|
|
// calculate the reserved register's num and offset from fcall's dst register
|
|
// (should be r125.0)
|
|
vASSERT(fcall->getDst()->isGreg());
|
|
uint32_t reg_num =
|
|
fcall->getDst()->getLinearizedStart() / kernel.numEltPerGRF<Type_UB>();
|
|
uint32_t reg_off = fcall->getDst()->getLinearizedStart() %
|
|
kernel.numEltPerGRF<Type_UB>() /
|
|
fcall->getDst()->getTypeSize();
|
|
|
|
G4_Declare *new_target_decl =
|
|
createInstsForCallTargetOffset(insts, fcall, -64);
|
|
|
|
// add r125.0 IP 32
|
|
G4_Declare *ret_decl = builder.createHardwiredDeclare(
|
|
1, fcall->getDst()->getType(), reg_num, reg_off);
|
|
insts.push_back(builder.createBinOp(
|
|
G4_add, g4::SIMD1, builder.createDstRegRegion(ret_decl, 1),
|
|
builder.createSrc(builder.phyregpool.getIpReg(), 0, 0,
|
|
builder.getRegionScalar(), Type_UD),
|
|
builder.createImm(32, Type_UD), InstOpt_WriteEnable | InstOpt_NoCompact,
|
|
false));
|
|
|
|
// jmpi r2.0
|
|
// update jump target (src0) to add's dst
|
|
G4_SrcRegRegion *jump_target =
|
|
builder.createSrcRegRegion(new_target_decl, builder.getRegionScalar());
|
|
jump_target->setType(builder, Type_D);
|
|
insts.push_back(
|
|
builder.createJmp(nullptr, jump_target, InstOpt_NoCompact, false));
|
|
}
|
|
|
|
void Optimizer::expandIndirectCallWithRegTarget() {
|
|
if (builder.hasFusedEU() && builder.getuint32Option(vISA_fusedCallWA) == 1) {
|
|
vASSERT(!builder.needReplaceIndirectCallWithJmpi());
|
|
// Relative IP has been applied in fusedCallWA()
|
|
return;
|
|
}
|
|
|
|
// check every fcall
|
|
for (auto bb : kernel.fg) {
|
|
if (bb->back()->isFCall()) {
|
|
G4_InstCF *fcall = bb->back()->asCFInst();
|
|
if (fcall->isIndirectCall()) {
|
|
// at this point the call instruction's src0 has the target_address
|
|
// and the call dst is the reserved register (r125.0) for ret
|
|
// All the caller-save registers should already be saved. We use r2 directly
// here to calculate the new call's target.
|
|
//
|
|
// expand call
|
|
// From:
|
|
// call r125.0 call_target
|
|
// To:
|
|
// add r2.0 -IP call_target
|
|
// add r2.0 r2.0 -32
|
|
// call r125.0 r2.0
|
|
|
|
// For SKL workaround, expand call
|
|
// From:
|
|
// call r125.0 call_target
|
|
// To:
|
|
// add r2.0 -IP call_target
|
|
// add r2.0 r2.0 -64
|
|
// add r125.0 IP 32 // set the return IP
|
|
// jmpi r2.0
|
|
InstListType expanded_insts;
|
|
if (builder.needReplaceIndirectCallWithJmpi()) {
|
|
createInstForJmpiSequence(expanded_insts, fcall);
|
|
} else {
|
|
G4_Declare *jmp_target_decl =
|
|
createInstsForCallTargetOffset(expanded_insts, fcall, -32);
|
|
// Update the call's target to the new target
|
|
G4_SrcRegRegion *jump_target = builder.createSrcRegRegion(
|
|
jmp_target_decl, builder.getRegionScalar());
|
|
fcall->setSrc(jump_target, 0);
|
|
fcall->setNoCompacted();
|
|
}
|
|
// then insert the expanded instructions right before the call
|
|
INST_LIST_ITER insert_point = bb->end();
|
|
--insert_point;
|
|
for (auto inst_to_add : expanded_insts) {
|
|
bb->insertBefore(insert_point, inst_to_add);
|
|
}
|
|
|
|
// remove call from the instlist for Jmpi WA
|
|
if (builder.needReplaceIndirectCallWithJmpi())
|
|
bb->erase(--bb->end());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Replace ret with jmpi, must be single return
|
|
void Optimizer::replaceRetWithJmpi() {
|
|
size_t num_ret = 0;
|
|
|
|
for (G4_BB *bb : kernel.fg) {
|
|
if (bb->empty())
|
|
continue;
|
|
if (bb->isEndWithFRet()) {
|
|
++num_ret;
|
|
G4_INST *ret_inst = bb->back();
|
|
|
|
// ret dst's decl
|
|
G4_Declare *ret_reg = ret_inst->getSrc(0)->getTopDcl();
|
|
|
|
// calculate the jmpi target offset
|
|
// expand the original ret from:
|
|
// ret r125.0
|
|
// To:
|
|
// add r125.0 -ip r125.0
|
|
// add r125.0 r125.0 -48
|
|
// jmpi r125.0
|
|
|
|
// add r125.0 -ip r125.0
|
|
G4_INST *add0 = builder.createBinOp(
|
|
G4_add, g4::SIMD1, builder.createDstRegRegion(ret_reg, 1),
|
|
builder.createSrcRegRegion(Mod_Minus, Direct,
|
|
builder.phyregpool.getIpReg(), 0, 0,
|
|
builder.getRegionScalar(), Type_UD),
|
|
builder.createSrcRegRegion(ret_reg, builder.getRegionScalar()),
|
|
InstOpt_WriteEnable | InstOpt_NoCompact, false);
|
|
|
|
// add r125.0 r125.0 -48
|
|
G4_INST *add1 = builder.createBinOp(
|
|
G4_add, g4::SIMD1, builder.createDstRegRegion(ret_reg, 1),
|
|
builder.createSrcRegRegion(ret_reg, builder.getRegionScalar()),
|
|
builder.createImm(-48, Type_D),
|
|
InstOpt_WriteEnable | InstOpt_NoCompact, false);
|
|
|
|
// jmpi r125.0
|
|
G4_SrcRegRegion *jmpi_target =
|
|
builder.createSrcRegRegion(ret_reg, builder.getRegionScalar());
|
|
jmpi_target->setType(builder, Type_D);
|
|
G4_INST *jmpi =
|
|
builder.createJmp(nullptr, jmpi_target, InstOpt_NoCompact, false);
|
|
|
|
// remove the ret
|
|
bb->pop_back();
|
|
// add the jmpi
|
|
bb->push_back(add0);
|
|
bb->push_back(add1);
|
|
bb->push_back(jmpi);
|
|
}
|
|
}
|
|
|
|
// There should be exactly one ret in an external function. We did not try
// to restore the CallMask. We rely on the single return of a function to
// make sure the CallMask before and after calling this function is the same.
|
|
vASSERT(num_ret == 1);
|
|
}
|
|
|
|
// Set a0 to tdr0 before sendc/sendsc
|
|
void Optimizer::setA0toTdrForSendc() {
|
|
// check for the last inst of each BB, if it's sendc/sendsc, insert
|
|
// "(W) mov(8) a0.0:uw tdr0.0:uw" right before it
|
|
for (G4_BB *bb : kernel.fg) {
|
|
if (bb->empty())
|
|
continue;
|
|
if (bb->back()->isSendConditional()) {
|
|
// "(W) mov(8) a0.0:uw tdr0.0:uw"
|
|
bb->insertBefore(
|
|
--bb->end(),
|
|
builder.createMov(g4::SIMD8,
|
|
builder.createDst(builder.phyregpool.getAddrReg(),
|
|
0, 0, 1, Type_UW),
|
|
builder.createSrc(builder.phyregpool.getTDRReg(), 0,
|
|
0, builder.getRegionScalar(),
|
|
Type_UW),
|
|
InstOpt_WriteEnable, false));
|
|
}
|
|
}
|
|
}
|
|
|
|
// Check if any instruction between startIter and endIter has a WAR/WAW
// dependency on the start inst
|
|
bool Optimizer::chkFwdOutputHazard(INST_LIST_ITER &startIter,
|
|
INST_LIST_ITER &endIter) {
|
|
G4_INST *startInst = *startIter;
|
|
|
|
INST_LIST_ITER forwardIter = startIter;
|
|
forwardIter++;
|
|
while (forwardIter != endIter) {
|
|
if ((*forwardIter)->isWAWdep(startInst) ||
|
|
(*forwardIter)->isWARdep(startInst)) {
|
|
break;
|
|
}
|
|
forwardIter++;
|
|
}
|
|
|
|
if (forwardIter != endIter) {
|
|
return true;
|
|
} else {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Check if startInst has any WAR/WAW conflicts with subsequent insts up till
// endIter (excluding endIter).
// Precondition: startInst must be before endIter in the same BB.
// This is used to sink an inst (or its sources) to the endIter location.
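// Illustrative sketch (hypothetical operands): with startInst "add V1, V2, V3",
// a later "mov V2, ..." (WAR on V2) or "mov V1, ..." (WAW on V1) before
// endIter makes the sink unsafe, and this returns true.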
|
|
bool Optimizer::chkFwdOutputHazard(G4_INST *startInst, INST_LIST_ITER endIter) {
|
|
INST_LIST_ITER backIter = std::prev(endIter, 1);
|
|
while (*backIter != startInst) {
|
|
G4_INST *inst = *backIter;
|
|
if (inst->isWARdep(startInst) || inst->isWAWdep(startInst)) {
|
|
return true;
|
|
}
|
|
--backIter;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
bool Optimizer::chkBwdOutputHazard(INST_LIST_ITER &startIter,
|
|
INST_LIST_ITER &endIter) {
|
|
G4_INST *endInst = *endIter;
|
|
|
|
INST_LIST_ITER backwardIter = endIter;
|
|
backwardIter--;
|
|
while (backwardIter != startIter) {
|
|
if (endInst->isWAWdep(*backwardIter) || endInst->isWARdep(*backwardIter)) {
|
|
break;
|
|
}
|
|
backwardIter--;
|
|
}
|
|
|
|
if (backwardIter != startIter) {
|
|
return true;
|
|
} else {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
bool Optimizer::chkBwdOutputHazard(G4_INST *startInst,
|
|
INST_LIST_ITER &endIter) {
|
|
G4_INST *endInst = *endIter;
|
|
|
|
INST_LIST_ITER backwardIter = endIter;
|
|
backwardIter--;
|
|
while (*backwardIter != startInst) {
|
|
if (endInst->isWAWdep(*backwardIter) ||
|
|
// Make sure there is no WAR conflict between this instruction and the
// instructions preceding it:
|
|
// ... grf1(use preceding inst)
|
|
// grf1 <---- def this inst
|
|
endInst->isWARdep(*backwardIter)) {
|
|
break;
|
|
}
|
|
backwardIter--;
|
|
}
|
|
|
|
if (*backwardIter != startInst) {
|
|
return true;
|
|
} else {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
/*
|
|
Skips WAW check for the skipInst
|
|
*/
|
|
bool Optimizer::chkBwdOutputHazard(G4_INST *startInst, INST_LIST_ITER &endIter,
|
|
G4_INST *skipInst) {
|
|
G4_INST *endInst = *endIter;
|
|
|
|
INST_LIST_ITER backwardIter = endIter;
|
|
backwardIter--;
|
|
while (*backwardIter != startInst) {
|
|
if (skipInst == *backwardIter) {
|
|
--backwardIter;
|
|
continue;
|
|
}
|
|
|
|
// Make sure there is no WAR conflict between this instruction and the
// instructions preceding it:
|
|
// ... grf1(use preceding inst)
|
|
// grf1 <---- def this inst
|
|
if (endInst->isWARdep(*backwardIter) || endInst->isWAWdep(*backwardIter)) {
|
|
break;
|
|
}
|
|
--backwardIter;
|
|
}
|
|
|
|
if (*backwardIter != startInst) {
|
|
return true;
|
|
} else {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
bool Optimizer::chkBwdWARdep(G4_INST *startInst, INST_LIST_ITER endIter) {
|
|
while (*endIter != startInst) {
|
|
G4_INST *inst = *endIter;
|
|
if (inst->isWARdep(startInst)) {
|
|
return true;
|
|
}
|
|
--endIter;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// Check if there is WAW dependency between startInst and subsequent insts till
|
|
// endIter
|
|
bool Optimizer::chkBwdWAWdep(G4_INST *startInst, INST_LIST_ITER endIter) {
|
|
INST_LIST_ITER backIter = std::prev(endIter, 1);
|
|
while (*backIter != startInst) {
|
|
G4_INST *inst = *backIter;
|
|
if (inst->isWAWdep(startInst)) {
|
|
return true;
|
|
}
|
|
--backIter;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// This function performs the following renaming to enable further optimization
|
|
// opportunities:
|
|
//
|
|
// op v3, v1, v2
|
|
// mov v4, v3
|
|
// mov v5, v3
|
|
// mov v6, v3
|
|
// ======>
|
|
// op v3, v1, v2
|
|
// mov v4, v3
|
|
// mov v5, v4
|
|
// mov v6, v4
|
|
void Optimizer::renameRegister() {
|
|
const int MAX_REG_RENAME_DIST = 250;
|
|
const int MAX_REG_RENAME_SIZE = 2;
|
|
|
|
for (G4_BB *bb : fg) {
|
|
bb->resetLocalIds();
|
|
std::unordered_set<G4_INST *> Seen;
|
|
|
|
INST_LIST_ITER ii = bb->begin(), iend(bb->end());
|
|
while (ii != iend) {
|
|
G4_INST *inst = *ii;
|
|
|
|
if (!inst->isRawMov() || inst->getPredicate() || Seen.count(inst) > 0 ||
|
|
inst->def_size() != 1 ||
|
|
!inst->canHoist(!bb->isAllLaneActive(), fg.builder->getOptions())) {
|
|
ii++;
|
|
continue;
|
|
}
|
|
|
|
G4_Operand *src = inst->getSrc(0);
|
|
G4_DstRegRegion *dst = inst->getDst();
|
|
|
|
G4_Declare *srcDcl =
|
|
src->isRegRegion() ? GetTopDclFromRegRegion(src) : nullptr;
|
|
G4_Declare *dstDcl = GetTopDclFromRegRegion(dst);
|
|
|
|
if (!srcDcl || !dstDcl) {
|
|
++ii;
|
|
continue;
|
|
}
|
|
|
|
// If this move is between two different register files, then
|
|
// do not do register renaming.
|
|
if (srcDcl && dstDcl && srcDcl->getRegFile() != dstDcl->getRegFile()) {
|
|
++ii;
|
|
continue;
|
|
}
|
|
|
|
G4_INST *defInst = inst->def_front().first;
|
|
G4_Declare *defDstDcl = GetTopDclFromRegRegion(defInst->getDst());
|
|
if ((dstDcl && dstDcl->getAddressed()) ||
|
|
(defDstDcl && defDstDcl->getAddressed())) {
|
|
ii++;
|
|
continue;
|
|
}
|
|
|
|
G4_DstRegRegion *defDstRegion = defInst->getDst();
|
|
if (Seen.count(defInst) > 0 ||
|
|
src->compareOperand(defDstRegion, builder) != Rel_eq) {
|
|
ii++;
|
|
continue;
|
|
}
|
|
|
|
unsigned int instMaskOption = inst->getMaskOption();
|
|
bool canRename = true;
|
|
|
|
if (defInst->use_size() == 1) {
|
|
ii++;
|
|
continue;
|
|
}
|
|
|
|
int32_t sizeRatio = dstDcl->getByteSize() / srcDcl->getByteSize();
|
|
|
|
G4_INST *lastUse = defInst->use_front().first;
|
|
for (auto iter = defInst->use_begin(), E = defInst->use_end(); iter != E;
|
|
++iter) {
|
|
G4_INST *useInst = (*iter).first;
|
|
|
|
if (useInst == inst || ((useInst->getLocalId() - inst->getLocalId()) >
|
|
MAX_REG_RENAME_DIST &&
|
|
sizeRatio > MAX_REG_RENAME_SIZE)) {
|
|
continue;
|
|
}
|
|
|
|
/*
  It incorrectly renames in this case, because it doesn't consider the
  defInst mask.
  BEFORE:
    0:mov (1) V44(0,0)[1]:d V46(0,0)[0;1,0]:d [Align1, NoMask] %11
    0:mov (8) V48(0,0)[1]:d V44(0,0)[0;1,0]:d [Align1, Q1] %12
    0:mov (8) V56(0,0)[1]:d V44(0,0)[0;1,0]:d [Align1, Q1] %22

  AFTER:
    0:mov (1) V44(0,0)[1]:d V46(0,0)[0;1,0]:d [Align1, NoMask] %11
    0:mov (8) V48(0,0)[1]:d V44(0,0)[0;1,0]:d [Align1, Q1] %12
    0:mov (8) V56(0,0)[1]:d V48(0,0)[0;1,0]:d [Align1, Q1] %22

  Fix: BB in SIMD control flow && (inst not NoMask) &&
       !(instMask == defMask == useMask)

  Disallow replication?
*/
|
|
if (useInst->getLocalId() < inst->getLocalId() ||
|
|
!useInst->isRawMov() ||
|
|
inst->getExecSize() != useInst->getExecSize() ||
|
|
(useInst->getSrc(0))->compareOperand(defDstRegion, builder) !=
|
|
Rel_eq ||
|
|
useInst->def_size() > 1 ||
|
|
(!(inst->isWriteEnableInst()) &&
|
|
useInst->getMaskOption() != instMaskOption) ||
|
|
// fix described above
|
|
(!bb->isAllLaneActive() && !inst->isWriteEnableInst() &&
|
|
!(inst->getExecSize() == defInst->getExecSize() &&
|
|
inst->getExecSize() == useInst->getExecSize()))) {
|
|
canRename = false;
|
|
break;
|
|
}
|
|
|
|
if (useInst->getLocalId() > lastUse->getLocalId()) {
|
|
lastUse = useInst;
|
|
}
|
|
}
|
|
|
|
for (auto iter = defInst->use_begin(), E = defInst->use_end(); iter != E;
|
|
++iter) {
|
|
G4_INST *useInst = (*iter).first;
|
|
Seen.insert(useInst);
|
|
}
|
|
|
|
if (!canRename) {
|
|
ii++;
|
|
continue;
|
|
}
|
|
|
|
INST_LIST_ITER forwardIter = ii;
|
|
forwardIter++;
|
|
while (canRename && *forwardIter != lastUse &&
|
|
(((*forwardIter)->getLocalId() - inst->getLocalId()) <=
|
|
MAX_REG_RENAME_DIST ||
|
|
sizeRatio <= MAX_REG_RENAME_SIZE)) {
|
|
if ((*forwardIter)->isWAWdep(inst)) {
|
|
canRename = false;
|
|
break;
|
|
}
|
|
forwardIter++;
|
|
}
|
|
|
|
if (!canRename) {
|
|
ii++;
|
|
continue;
|
|
}
|
|
|
|
for (auto useIter = defInst->use_begin(); useIter != defInst->use_end();
|
|
/*empty*/) {
|
|
G4_INST *useInst = (*useIter).first;
|
|
|
|
if (useInst == inst || ((useInst->getLocalId() - inst->getLocalId()) >
|
|
MAX_REG_RENAME_DIST &&
|
|
sizeRatio > MAX_REG_RENAME_SIZE)) {
|
|
useIter++;
|
|
continue;
|
|
}
|
|
|
|
G4_Operand *useSrc = useInst->getSrc(0);
|
|
unsigned char execSize = useInst->getExecSize();
|
|
unsigned short dstHS = dst->getHorzStride();
|
|
const RegionDesc *newSrcRd;
|
|
|
|
if (useSrc->asSrcRegRegion()->isScalar()) {
|
|
newSrcRd = builder.getRegionScalar();
|
|
} else {
|
|
unsigned tExecSize = (execSize > 8) ? 8 : execSize;
|
|
if (RegionDesc::isLegal(tExecSize * dstHS, execSize, dstHS) &&
|
|
(execSize * dstHS <= 32)) { // VS at most 32
|
|
newSrcRd = builder.createRegionDesc((uint16_t)tExecSize * dstHS,
|
|
execSize, dstHS);
|
|
} else {
|
|
// Skip this use. TODO: normalize this region.
|
|
++useIter;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
G4_SrcRegRegion *newSrcOpnd = builder.createSrcRegRegion(
|
|
Mod_src_undef, dst->getRegAccess(), dst->getBase(),
|
|
dst->getRegOff(), dst->getSubRegOff(), newSrcRd, useSrc->getType());
|
|
if (dst->getRegAccess() != Direct) {
|
|
newSrcOpnd->asSrcRegRegion()->setImmAddrOff(dst->getAddrImm());
|
|
}
|
|
useInst->setSrc(newSrcOpnd, 0);
|
|
|
|
// Maintain def-use for this change:
|
|
// - remove this use from defInst
|
|
// - add a new use to inst
|
|
useIter = defInst->eraseUse(useIter);
|
|
inst->addDefUse(useInst, Opnd_src0);
|
|
}
|
|
|
|
ii++;
|
|
}
|
|
}
|
|
}
|
|
|
|
//
|
|
// recompute bounds for declares in the given unordered_set
|
|
// this only affects DstRegRegion and SrcRegRegion
|
|
// If the operand is global, we have to update the global operand table as well
|
|
// as the bounds may have changed
|
|
//
|
|
void Optimizer::recomputeBound(std::unordered_set<G4_Declare *> &declares) {
|
|
|
|
for (auto bb : fg) {
|
|
for (auto ii = bb->begin(), iiEnd = bb->end(); ii != iiEnd; ++ii) {
|
|
G4_INST *inst = *ii;
|
|
if (inst->getDst() != NULL) {
|
|
G4_DstRegRegion *dst = inst->getDst();
|
|
if (dst->getTopDcl() != NULL &&
|
|
declares.find(dst->getTopDcl()) != declares.end()) {
|
|
bool isGlobal = builder.kernel.fg.globalOpndHT.isOpndGlobal(dst);
|
|
dst->computeLeftBound(builder);
|
|
inst->computeRightBound(dst);
|
|
if (isGlobal) {
|
|
builder.kernel.fg.globalOpndHT.addGlobalOpnd(dst);
|
|
}
|
|
}
|
|
}
|
|
for (int i = 0, numSrc = inst->getNumSrc(); i < numSrc; ++i) {
|
|
if (inst->getSrc(i) != NULL && inst->getSrc(i)->isSrcRegRegion()) {
|
|
G4_SrcRegRegion *src = inst->getSrc(i)->asSrcRegRegion();
|
|
if (src->getTopDcl() != NULL &&
|
|
declares.find(src->getTopDcl()) != declares.end()) {
|
|
bool isGlobal = builder.kernel.fg.globalOpndHT.isOpndGlobal(src);
|
|
src->computeLeftBound(builder);
|
|
inst->computeRightBound(src);
|
|
if (isGlobal) {
|
|
builder.kernel.fg.globalOpndHT.addGlobalOpnd(src);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
//
|
|
// Given a sequence of simd1 instructions (max 4), try to merge them into a
// single instruction, e.g.,
//   mul (1) r0.4<1>:f r0.0<0;1,0>:f r6.5<0;1,0>:f {NoMask}
//   mul (1) r0.5<1>:f r0.1<0;1,0>:f r6.5<0;1,0>:f {NoMask}
//   mul (1) r0.6<1>:f r0.2<0;1,0>:f r6.5<0;1,0>:f {NoMask}
//   mul (1) r0.7<1>:f r0.3<0;1,0>:f r6.5<0;1,0>:f {NoMask}
// becomes
//   mul (4) r0.4<1>:f r0.0<1;1,0>:f r6.5<0;1,0>:f {NoMask}
// A bunch of conditions have to be satisfied; check BUNDLE_INFO for more
// details. This is only performed for 3D input as CM is very unlikely to
// benefit from this (put another way, if this succeeds for CM our FE is
// doing something wrong).
|
|
//
|
|
void Optimizer::mergeScalarInst() {
|
|
|
|
int bundleSizeLimit = BUNDLE_INFO::maxBundleSize;
|
|
if (kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D || builder.noInt64()) {
|
|
bundleSizeLimit = 4;
|
|
}
|
|
|
|
// set of declares that have been changed to alias to another declare
|
|
std::unordered_set<G4_Declare *> modifiedDcl;
|
|
std::vector<G4_Declare *> newInputs;
|
|
|
|
// stats
|
|
int numBundles = 0;
|
|
int numDeletedInst = 0;
|
|
|
|
for (G4_BB *bb : fg) {
|
|
std::vector<BUNDLE_INFO> bundles;
|
|
INST_LIST_ITER ii = bb->begin(), iiEnd = bb->end();
|
|
while (ii != iiEnd) {
|
|
G4_INST *inst = *ii;
|
|
auto nextIter = ii;
|
|
++nextIter;
|
|
if (nextIter != iiEnd && BUNDLE_INFO::isMergeCandidate(
|
|
inst, builder, !bb->isAllLaneActive())) {
|
|
BUNDLE_INFO bundle(bb, ii, bundleSizeLimit);
|
|
bundle.findInstructionToMerge(nextIter, builder);
|
|
if (bundle.size > 1)
|
|
bundles.emplace_back(bundle);
|
|
ii = nextIter;
|
|
} else {
|
|
++ii;
|
|
}
|
|
}
|
|
|
|
for (auto &bundle : bundles) {
|
|
bool success = bundle.doMerge(builder, modifiedDcl, newInputs);
|
|
if (success) {
|
|
numBundles++;
|
|
numDeletedInst += bundle.size - 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
// we have to reset the bound for all operands whose declares have been
|
|
// modified
|
|
recomputeBound(modifiedDcl);
|
|
|
|
VISA_DEBUG({
|
|
std::cout << " === Merge Scalar Optimization ===\n";
|
|
std::cout << "Number of optimized bundles:\t" << numBundles << "\n";
|
|
std::cout << "Number of instructions saved:\t" << numDeletedInst << "\n";
|
|
});
|
|
}
|
|
|
|
static bool isMad(G4_INST *I) {
|
|
// Disable int mac for PVC
|
|
auto dst = I->getDst();
|
|
return (I->opcode() == G4_pseudo_mad &&
|
|
!(I->getBuilder().waDisableIntMac() && dst &&
|
|
I->isIntegerPipeType(dst->getType())));
|
|
}
|
|
|
|
static inline bool isWBTypeAndNotNull(G4_Operand *opnd) {
|
|
// Requires opnd to be not null.
|
|
return opnd && (IS_BTYPE(opnd->getType()) || IS_WTYPE(opnd->getType()));
|
|
}
|
|
|
|
namespace {
|
|
|
|
/// Class to describe a mad sequence that can be turned into a sequence of MAC
|
|
/// instructions.
|
|
class MadSequenceInfo {
|
|
public:
|
|
// IR builder object.
|
|
IR_Builder &builder;
|
|
|
|
// The basic block being examined.
|
|
G4_BB *bb;
|
|
|
|
enum MadSeqKind { MK_unknown, MK_isSafe, MK_isNotSafe };
|
|
|
|
private:
|
|
// Flag indicates if ACC transformation is safe.
|
|
MadSeqKind kind;
|
|
|
|
// The single definition that defines the first mad's src2. If there are
|
|
// multiple defintions, it is nullptr.
|
|
G4_INST *src2Def;
|
|
|
|
// The sequence of mad instruction to be examined.
|
|
std::vector<G4_INST *> madSequence;
|
|
|
|
// The use chain of the last mad. This use chain ended with an instruction
|
|
// that has *B/*W type, and the chain length is limited by a predefined
|
|
// constant.
|
|
std::vector<G4_INST *> lastMadUserChain;
|
|
|
|
public:
|
|
MadSequenceInfo(IR_Builder &builder, G4_BB *bb)
|
|
: builder(builder), bb(bb), kind(MK_unknown), src2Def(nullptr) {}
|
|
|
|
bool isSafe() const { return kind == MK_isSafe; }
|
|
bool isNotSafe() const { return kind == MK_isNotSafe; }
|
|
void setNotSafe() { kind = MK_isNotSafe; }
|
|
void setSafe() { kind = MK_isSafe; }
|
|
|
|
G4_INST *getSrc2Def() { return src2Def; }
|
|
G4_INST *getFirstMad() const { return madSequence.front(); }
|
|
G4_INST *getLastMad() const { return madSequence.back(); }
|
|
void appendMad(INST_LIST_ITER begin, INST_LIST_ITER end) {
|
|
madSequence.insert(madSequence.end(), begin, end);
|
|
}
|
|
typedef std::vector<G4_INST *>::iterator mad_iter;
|
|
mad_iter mad_begin() { return madSequence.begin(); }
|
|
mad_iter mad_end() { return madSequence.end(); }
|
|
|
|
void appendUser(G4_INST *inst) { lastMadUserChain.push_back(inst); }
|
|
G4_INST *getLastUser() { return lastMadUserChain.back(); }
|
|
|
|
void reset() {
|
|
kind = MK_unknown;
|
|
src2Def = nullptr;
|
|
madSequence.clear();
|
|
lastMadUserChain.clear();
|
|
}
|
|
|
|
// Collect all candidate instructions and perform minimal checks.
|
|
INST_LIST_ITER populateCandidates(INST_LIST_ITER iter);
|
|
|
|
// Make changes to all the candidates collected. This is the only
|
|
// function that makes changes to the IR.
|
|
void processCandidates();
|
|
|
|
private:
|
|
void populateUserChain(G4_INST *defInst, int level);
|
|
void populateSrc2Def();
|
|
|
|
// Check whether this mad sequence can be turned into a MAC sequence.
|
|
bool checkMadSequence();
|
|
|
|
// Check whether the user chain blocks this transformation or not.
|
|
bool checkUserChain();
|
|
|
|
// Check if other instructions between defInst and useInst are also updating
|
|
// ACC registers, which may block this transformation.
|
|
bool checkACCDependency(G4_INST *defInst, G4_INST *useInst);
|
|
|
|
// The common type for accumulator operands.
|
|
G4_Type getCommonACCType() {
|
|
G4_Type T = Type_UNDEF;
|
|
|
|
// If there is no user chain to check, use the last mad's destination
// operand type as the common type. Otherwise, use the last user's
// destination type.
|
|
if (lastMadUserChain.empty())
|
|
T = getLastMad()->getDst()->getType();
|
|
else
|
|
T = getLastUser()->getDst()->getType();
|
|
|
|
vISA_ASSERT((IS_FTYPE(T) || IS_HFTYPE(T) || IS_INT(T)),
|
|
"Only F/HF/W/B types are expected here");
|
|
return (IS_FTYPE(T) || IS_HFTYPE(T))
|
|
? T
|
|
: (IS_SIGNED_INT(T) ? Type_W : Type_UW);
|
|
}
|
|
};
|
|
|
|
class AccRestriction {
|
|
unsigned encoding;
|
|
|
|
public:
|
|
// Accumulator Restriction Kind:
|
|
enum Accumulator_RK {
|
|
ARK_NoRestriction = 0x01, // No restrictions.
|
|
ARK_NoAccess = 0x02, // No accumulator access, implicit or explicit.
|
|
ARK_NoSourceOperand = 0x04, // Source operands cannot be accumulators.
|
|
ARK_NoModifier =
|
|
0x08, // Source modifier is not allowed if source is an accumulator.
|
|
ARK_NoExplicitSource = 0x010, // Accumulator is an implicit source and thus
|
|
// cannot be an explicit source operand.
|
|
ARK_NoDst =
|
|
0x20, // Accumulator cannot be destination, implicit or explicit.
|
|
ARK_AccWrEnRequired =
|
|
0x40, // AccWrEn is required. The accumulator is an implicit
|
|
// destination and thus cannot be an explicit destination operand.
|
|
ARK_NoIntegerSource =
|
|
0x80, // Integer source operands cannot be accumulators.
|
|
ARK_NoExplicitSrcAllowAccWrEn =
|
|
0x100, // No explicit accumulator access because this is a three-source
|
|
// instruction. AccWrEn is allowed for implicitly updating the
|
|
// accumulator.
|
|
ARK_NoBothSrcAndDst = 0x200 // An accumulator can be a source or destination
|
|
// operand but not both.
|
|
};
|
|
|
|
AccRestriction(unsigned val) : encoding(val) {}
|
|
|
|
bool useAccAsSrc(G4_SrcRegRegion *opnd, bool isExplicit = true,
|
|
bool isAlreadyDst = false) const {
|
|
if (!opnd)
|
|
return false;
|
|
|
|
if (encoding & ARK_NoAccess)
|
|
return false;
|
|
|
|
if (encoding & ARK_NoRestriction)
|
|
return true;
|
|
|
|
if (encoding & ARK_NoSourceOperand)
|
|
return false;
|
|
|
|
if (encoding & ARK_NoIntegerSource)
|
|
return !IS_TYPE_INT(opnd->getType());
|
|
|
|
if (encoding & ARK_NoExplicitSource)
|
|
return !isExplicit;
|
|
|
|
if (encoding & ARK_NoExplicitSrcAllowAccWrEn)
|
|
return false;
|
|
|
|
if (encoding & ARK_NoBothSrcAndDst)
|
|
return !isAlreadyDst;
|
|
|
|
if (encoding & ARK_NoModifier)
|
|
return opnd->getModifier() == Mod_src_undef;
|
|
|
|
return true;
|
|
}
|
|
|
|
bool useAccAsDst(G4_DstRegRegion *opnd, bool isExplicit = true,
|
|
bool isAlreadySrc = false) const {
|
|
if (!opnd)
|
|
return false;
|
|
|
|
if (encoding & ARK_NoAccess)
|
|
return false;
|
|
|
|
if (encoding & ARK_NoRestriction)
|
|
return true;
|
|
|
|
if (encoding & ARK_NoDst)
|
|
return false;
|
|
|
|
if (encoding & ARK_AccWrEnRequired)
|
|
return !isExplicit;
|
|
|
|
if (encoding & ARK_NoBothSrcAndDst)
|
|
return !isAlreadySrc;
|
|
|
|
return true;
|
|
}
|
|
|
|
static AccRestriction getRestrictionKind(G4_INST *inst);
|
|
};
|
|
|
|
} // namespace
|
|
|
|
AccRestriction AccRestriction::getRestrictionKind(G4_INST *inst) {
|
|
switch (inst->opcode()) {
|
|
default:
|
|
break;
|
|
case G4_add:
|
|
case G4_asr:
|
|
case G4_avg:
|
|
case G4_csel:
|
|
case G4_frc:
|
|
case G4_sel:
|
|
case G4_shr:
|
|
case G4_smov:
|
|
return ARK_NoRestriction;
|
|
case G4_addc:
|
|
case G4_subb:
|
|
return ARK_AccWrEnRequired;
|
|
case G4_and:
|
|
case G4_not:
|
|
case G4_or:
|
|
case G4_xor:
|
|
return ARK_NoModifier;
|
|
case G4_cmp:
|
|
case G4_cmpn:
|
|
case G4_lzd:
|
|
return ARK_NoRestriction;
|
|
case G4_dp2:
|
|
case G4_dp3:
|
|
case G4_dp4:
|
|
case G4_dph:
|
|
case G4_line:
|
|
case G4_movi:
|
|
case G4_pln:
|
|
case G4_sad2:
|
|
case G4_sada2:
|
|
return ARK_NoSourceOperand;
|
|
case G4_lrp:
|
|
case G4_mac:
|
|
return ARK_NoExplicitSource;
|
|
case G4_madm:
|
|
return ARK_NoExplicitSrcAllowAccWrEn;
|
|
case G4_mach:
|
|
return ARK_NoExplicitSource | ARK_AccWrEnRequired;
|
|
case G4_mov:
|
|
return ARK_NoBothSrcAndDst;
|
|
case G4_mul:
|
|
return ARK_NoIntegerSource;
|
|
case G4_rndd:
|
|
case G4_rnde:
|
|
case G4_rndu:
|
|
case G4_rndz:
|
|
return ARK_NoRestriction;
|
|
case G4_shl:
|
|
return ARK_NoDst;
|
|
}
|
|
|
|
return ARK_NoAccess;
|
|
}
|
|
|
|
/// Check this pseudo-mad's dst operand. Returns false if there is anything
|
|
/// blocking acc's usage.
|
|
static bool checkMadDst(G4_INST *inst, IR_Builder &builder) {
|
|
G4_DstRegRegion *dst = inst->getDst();
|
|
if (!dst || builder.kernel.fg.globalOpndHT.isOpndGlobal(dst))
|
|
return false;
|
|
|
|
if (dst->getRegAccess() != Direct)
|
|
return false;
|
|
|
|
// Only acc0 is available for w/uw destination.
|
|
// FIXME: This acc type size is only for simd 16.
|
|
unsigned Sz = TypeSize(Type_W);
|
|
Sz *= dst->getHorzStride() * inst->getExecSize();
|
|
return Sz <= builder.numEltPerGRF<Type_UB>();
|
|
}
|
|
|
|
// Check whether this mad sequence can be turned into a MAC sequence.
|
|
bool MadSequenceInfo::checkMadSequence() {
|
|
unsigned int maskOffset = getFirstMad()->getMaskOffset();
|
|
|
|
// First check each individual mad.
|
|
for (auto inst : madSequence) {
|
|
vISA_ASSERT(isMad(inst), "not a mad");
|
|
|
|
// Only for simd 16. TODO: support simd8.
|
|
if (inst->getExecSize() != g4::SIMD16)
|
|
return false;
|
|
|
|
if (inst->getMaskOffset() != maskOffset)
|
|
return false;
|
|
|
|
// Do not handle predicate yet.
|
|
if (inst->getPredicate() != nullptr)
|
|
return false;
|
|
|
|
// Do not handle cond modifier yet.
|
|
if (inst->getCondMod() != nullptr)
|
|
return false;
|
|
|
|
if (!checkMadDst(inst, builder))
|
|
return false;
|
|
|
|
G4_Operand *src0 = inst->getSrc(0);
|
|
G4_Operand *src1 = inst->getSrc(1);
|
|
G4_Operand *src2 = inst->getSrc(2);
|
|
|
|
if (!src0 || !src1 || !src2)
|
|
return false;
|
|
|
|
if (builder.noDFTypeMac()) {
|
|
if (IS_DFTYPE(src0->getType()) || IS_DFTYPE(src1->getType()) ||
|
|
IS_DFTYPE(src2->getType()) || IS_DFTYPE(inst->getDst()->getType()))
|
|
return false;
|
|
}
|
|
|
|
if (IS_FTYPE(src0->getType()) && IS_FTYPE(src1->getType()) &&
|
|
IS_FTYPE(src2->getType())) {
|
|
// ok
|
|
} else if ((!IS_BTYPE(src0->getType()) && !IS_WTYPE(src0->getType())) ||
|
|
(!IS_BTYPE(src1->getType()) && !IS_WTYPE(src1->getType()))) {
|
|
// Only when src0 and src1 are of Byte/Word types.
|
|
return false;
|
|
} else if (!IS_BTYPE(src2->getType()) && !IS_WTYPE(src2->getType()) &&
|
|
!IS_DTYPE(src2->getType())) {
|
|
// Only when src2 is of Byte/Word/DWord types.
|
|
return false;
|
|
}
|
|
|
|
if (!builder.hasByteALU()) {
|
|
if (IS_BTYPE(src0->getType()) || IS_BTYPE(src1->getType())) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
if (builder.avoidAccDstWithIndirectSource()) {
|
|
if (src0->isSrcRegRegion() && src0->asSrcRegRegion()->isIndirect()) {
|
|
return false;
|
|
}
|
|
if (src1->isSrcRegRegion() && src1->asSrcRegRegion()->isIndirect()) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// If there is a modifier for src2, or src2 is accessed somewhere
|
|
// indirectly then we will not generate a MAC.
|
|
if (!src2->isSrcRegRegion())
|
|
return false;
|
|
|
|
if (src2->asSrcRegRegion()->getModifier() != Mod_src_undef ||
|
|
src2->asSrcRegRegion()->getRegAccess() != Direct ||
|
|
(src2->getTopDcl() && src2->getTopDcl()->getAddressed()))
|
|
return false;
|
|
}
|
|
|
|
// Now check instructions in pairs.
|
|
G4_INST *defInst = getSrc2Def();
|
|
G4_INST *lastMad = getLastMad();
|
|
for (auto I = mad_begin(); defInst != lastMad; ++I) {
|
|
G4_INST *useInst = *I;
|
|
G4_Operand *dst = defInst->getDst();
|
|
G4_Operand *src2 = useInst->getSrc(2);
|
|
vISA_ASSERT(dst && dst->isDstRegRegion(), "invalid dst");
|
|
vISA_ASSERT(src2 && src2->isSrcRegRegion(), "invalid src2");
|
|
if (dst->compareOperand(src2, builder) != Rel_eq)
|
|
return false;
|
|
|
|
    // Move to the next pair.
|
|
defInst = useInst;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool MadSequenceInfo::checkACCDependency(G4_INST *defInst, G4_INST *useInst) {
|
|
auto iter = std::find(bb->begin(), bb->end(), defInst);
|
|
vISA_ASSERT(iter != bb->end(), "no instruction found?");
|
|
|
|
for (++iter; (*iter) != useInst; ++iter) {
|
|
if ((*iter)->defAcc() || (*iter)->useAcc() ||
|
|
(*iter)->mayExpandToAccMacro())
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Check whether this user chain is safe to use ACC.
|
|
bool MadSequenceInfo::checkUserChain() {
|
|
// skip if there is no user to be analyzed.
|
|
if (lastMadUserChain.empty())
|
|
return true;
|
|
|
|
G4_INST *defInst = getLastMad();
|
|
G4_INST *useInst = defInst->use_back().first;
|
|
|
|
while (true) {
|
|
// TODO: enable simd8.
|
|
if (useInst->getExecSize() != g4::SIMD16)
|
|
return false;
|
|
|
|
// Only when used as a source operand.
|
|
Gen4_Operand_Number opndNum = defInst->use_back().second;
|
|
if (!G4_INST::isSrcNum(opndNum))
|
|
return false;
|
|
|
|
G4_Operand *useOpnd = useInst->getSrc(G4_INST::getSrcNum(opndNum));
|
|
if (!useOpnd || !useOpnd->isSrcRegRegion())
|
|
return false;
|
|
|
|
if (useOpnd->asSrcRegRegion()->isIndirect())
|
|
return false;
|
|
|
|
// check other source type.
|
|
for (int i = 0, e = useInst->getNumSrc(); i < e; ++i) {
|
|
G4_Operand *otherSrc = useInst->getSrc(i);
|
|
if (otherSrc == useOpnd)
|
|
continue;
|
|
|
|
if (!isWBTypeAndNotNull(otherSrc))
|
|
return false;
|
|
}
|
|
|
|
bool isLastUser = (useInst == getLastUser());
|
|
|
|
// The last user does not use ACC as its dst.
|
|
AccRestriction AR = AccRestriction::getRestrictionKind(useInst);
|
|
if (!AR.useAccAsSrc(useOpnd->asSrcRegRegion(), true, !isLastUser))
|
|
return false;
|
|
|
|
    // Now check that no ACC is written by other instructions between defInst
    // and useInst.
|
|
if (!checkACCDependency(defInst, useInst))
|
|
return false;
|
|
|
|
if (isLastUser)
|
|
// No extra check for the last user.
|
|
break;
|
|
|
|
// This is not the last user. We check its dst too.
|
|
G4_Operand *useDst = useInst->getDst();
|
|
if (!useDst)
|
|
return false;
|
|
|
|
// check type, no support for *Q types yet.
|
|
if (!IS_DTYPE(useDst->getType()))
|
|
return false;
|
|
|
|
// For each inner user, need to use ACC as its explicit dst.
|
|
if (!AR.useAccAsDst(useDst->asDstRegRegion(), true, true))
|
|
return false;
|
|
|
|
if (defInst->getDst()->compareOperand(useOpnd, builder) != Rel_eq)
|
|
return false;
|
|
|
|
// move to next pair.
|
|
defInst = useInst;
|
|
useInst = defInst->use_back().first;
|
|
}
|
|
|
|
// nothing wrong.
|
|
return true;
|
|
}
|
|
|
|
void MadSequenceInfo::populateSrc2Def() {
|
|
G4_INST *firstMad = getFirstMad();
|
|
vISA_ASSERT(firstMad && isMad(firstMad), "invalid mad");
|
|
|
|
src2Def = firstMad->getSingleDef(Opnd_src2, true);
|
|
if (src2Def == nullptr)
|
|
// Cannot find a single definition.
|
|
return setNotSafe();
|
|
|
|
// Check it right here.
|
|
// Only support splats or simd16 initialization.
|
|
if (src2Def->getExecSize() != g4::SIMD16 &&
|
|
src2Def->getExecSize() != g4::SIMD1) {
|
|
return setNotSafe();
|
|
}
|
|
|
|
G4_Operand *Dst = src2Def->getDst();
|
|
if (!Dst || builder.kernel.fg.globalOpndHT.isOpndGlobal(Dst))
|
|
return setNotSafe();
|
|
|
|
if (Dst->asDstRegRegion()->getRegAccess() != Direct)
|
|
return setNotSafe();
|
|
|
|
if (src2Def->getPredicate() || src2Def->getSaturate() ||
|
|
!src2Def->hasOneUse())
|
|
return setNotSafe();
|
|
|
|
if (!src2Def->canDstBeAcc())
|
|
return setNotSafe();
|
|
|
|
if (IS_DTYPE(src2Def->getExecType())) {
|
|
// since we use <1>:w region for our acc temp, due to alignment requirements
|
|
// we can't allow dword source types
|
|
return setNotSafe();
|
|
}
|
|
|
|
if (!builder.hasByteALU()) {
|
|
    // do not allow acc if the src2Def inst has a byte source
|
|
for (int i = 0; i < src2Def->getNumSrc(); ++i) {
|
|
if (IS_BTYPE(src2Def->getSrc(i)->getType())) {
|
|
return setNotSafe();
|
|
}
|
|
}
|
|
}
|
|
|
|
if (builder.avoidAccDstWithIndirectSource()) {
|
|
for (int i = 0; i < src2Def->getNumSrc(); ++i) {
|
|
if (src2Def->getSrc(i)->isSrcRegRegion() &&
|
|
src2Def->getSrc(i)->asSrcRegRegion()->isIndirect()) {
|
|
return setNotSafe();
|
|
}
|
|
}
|
|
}
|
|
|
|
// Check if there is any ACC dependency.
|
|
if (!checkACCDependency(src2Def, firstMad))
|
|
return setNotSafe();
|
|
|
|
// Check restrictions on compression to ensure that changing the destination
|
|
// type will not change the source region meaning due to instruction
|
|
// compression.
|
|
//
|
|
// If both instructions are compressed or both non-compressed then it is
|
|
// safe. Otherwise, check whether source regions are compression invariants.
|
|
if (src2Def->isComprInst() ^ firstMad->isComprInst()) {
|
|
auto checkCompression = [](G4_INST *inst) {
|
|
for (int i = 0; i < inst->getNumSrc(); ++i) {
|
|
G4_Operand *opnd = inst->getSrc(i);
|
|
if (!opnd || !opnd->isSrcRegRegion())
|
|
continue;
|
|
if (!inst->isComprInvariantSrcRegion(opnd->asSrcRegRegion(), i))
|
|
return false;
|
|
if (IS_DTYPE(opnd->getType()))
|
|
return false;
|
|
}
|
|
return true;
|
|
};
|
|
|
|
if (src2Def->isComprInst()) {
|
|
if (!checkCompression(src2Def))
|
|
return setNotSafe();
|
|
} else {
|
|
if (!checkCompression(firstMad))
|
|
return setNotSafe();
|
|
}
|
|
}
|
|
|
|
AccRestriction AR = AccRestriction::getRestrictionKind(src2Def);
|
|
if (!AR.useAccAsDst(Dst->asDstRegRegion()))
|
|
return setNotSafe();
|
|
}
|
|
|
|
void MadSequenceInfo::populateUserChain(G4_INST *defInst, int level) {
|
|
// Failed.
|
|
if (level <= 0)
|
|
return setNotSafe();
|
|
|
|
// Only for a single use.
|
|
if (!defInst->hasOneUse())
|
|
return setNotSafe();
|
|
|
|
// Only when used as a source operand.
|
|
Gen4_Operand_Number opndNum = defInst->use_back().second;
|
|
if (!G4_INST::isSrcNum(opndNum))
|
|
return setNotSafe();
|
|
|
|
G4_INST *useInst = defInst->use_back().first;
|
|
auto useDst = useInst->getDst();
|
|
|
|
if (useDst == nullptr)
|
|
return setNotSafe();
|
|
|
|
appendUser(useInst);
|
|
|
|
  // If this user has *W/*B types then a candidate is found; stop the search.
|
|
if (IS_BTYPE(useDst->getType()) || IS_WTYPE(useDst->getType()))
|
|
return;
|
|
|
|
// Search to the next level.
|
|
return populateUserChain(useInst, level - 1);
|
|
}
|
|
|
|
// Currently, we assume that MAD instructions are back-to-back. This avoids
|
|
// dependency checking among mad and non-mad instructions.
|
|
//
|
|
// TODO: hoist or sink instructions.
|
|
//
|
|
INST_LIST_ITER MadSequenceInfo::populateCandidates(INST_LIST_ITER iter) {
|
|
// Find the first pseudo-mad instruction.
|
|
iter = std::find_if(iter, bb->end(), isMad);
|
|
|
|
// No mad found
|
|
if (iter == bb->end())
|
|
return iter;
|
|
|
|
// Find the first non-mad instruction following this sequence.
|
|
auto end = std::find_if_not(iter, bb->end(), isMad);
|
|
|
|
vISA_ASSERT(iter != end, "out of sync");
|
|
appendMad(iter, end);
|
|
|
|
// Checking the first mad's src2.
|
|
populateSrc2Def();
|
|
|
|
// If the mad sequence has *W/*B types then it is safe to use ACC regardless
|
|
  // of the destination operand type in its use.
|
|
// ..
|
|
// mac (16) acc0.0<1>:w r10.7<0;1,0>:w r62.16<16;16,1>:ub {Align1, H1}
|
|
// mac (16) acc0.0<1>:w r11.6<0;1,0>:w r63.0<16;16,1>:ub {Align1, H1}
|
|
// mac (16) r24.0<1>:w r11.7<0;1,0>:w r63.16<16;16,1>:ub {Align1, H1}
|
|
// add (16) r14.0<1>:d r14.0<8;8,1>:d r24.0<8;8,1>:w {Align1, H1}
|
|
//
|
|
// could be generated.
|
|
G4_Type MadDstType = getLastMad()->getDst()->getType();
|
|
if (IS_DTYPE(MadDstType)) {
|
|
// Populate the user chain up to some predetermined level.
|
|
const int level = 4;
|
|
populateUserChain(getLastMad(), level);
|
|
}
|
|
|
|
// We have gathered all candidates for this optimization. Now we make
|
|
// comprehensive checks on the mad sequence and the user chain.
|
|
if (isNotSafe())
|
|
return end;
|
|
|
|
if (!checkMadSequence())
|
|
return end;
|
|
|
|
if (!checkUserChain())
|
|
return end;
|
|
|
|
  // everything is OK, proceed to do the transformation.
|
|
setSafe();
|
|
return end;
|
|
}
|
|
|
|
// Makes changes to the mad sequence and its users.
|
|
void MadSequenceInfo::processCandidates() {
|
|
vISA_ASSERT(isSafe(), "not safe for ACC");
|
|
vISA_ASSERT(getSrc2Def(), "null src");
|
|
vISA_ASSERT(!madSequence.empty(), "no mad");
|
|
|
|
  // In this function we replace each src2 with an implicit ACC register and
  // update the operands in its use chain.
|
|
G4_Type AdjustedType = getCommonACCType();
|
|
|
|
// Fix src2Def
|
|
G4_INST *src2Def = getSrc2Def();
|
|
{
|
|
    // change the dst of src2Def to acc0
|
|
G4_DstRegRegion *accDstOpnd = builder.createDst(
|
|
builder.phyregpool.getAcc0Reg(), 0, 0, 1, AdjustedType);
|
|
src2Def->setDest(accDstOpnd);
|
|
|
|
// Convert splat.
|
|
if (src2Def->getExecSize() == g4::SIMD1) {
|
|
src2Def->setExecSize(getFirstMad()->getExecSize());
|
|
}
|
|
}
|
|
|
|
// update use-chain
|
|
if (!lastMadUserChain.empty()) {
|
|
G4_INST *defInst = getLastMad();
|
|
G4_INST *useInst = defInst->use_back().first;
|
|
Gen4_Operand_Number opndNum = defInst->use_back().second;
|
|
vISA_ASSERT(defInst->hasOneUse(), "bad candidate");
|
|
|
|
while (true) {
|
|
const RegionDesc *rd = builder.getRegionStride1();
|
|
auto mod = useInst->getOperand(opndNum)->asSrcRegRegion()->getModifier();
|
|
G4_SrcRegRegion *accSrcOpnd = builder.createSrcRegRegion(
|
|
mod, Direct, builder.phyregpool.getAcc0Reg(), 0, 0, rd, AdjustedType);
|
|
useInst->setSrc(accSrcOpnd, G4_INST::getSrcNum(opndNum));
|
|
|
|
// The last use, only update source, and exit.
|
|
if (useInst == getLastUser())
|
|
break;
|
|
|
|
// Also update the destination.
|
|
G4_DstRegRegion *accDstOpnd = builder.createDst(
|
|
builder.phyregpool.getAcc0Reg(), 0, 0, 1, AdjustedType);
|
|
useInst->setDest(accDstOpnd);
|
|
|
|
// move to the next pair.
|
|
defInst = useInst;
|
|
useInst = defInst->use_back().first;
|
|
opndNum = defInst->use_back().second;
|
|
vISA_ASSERT(defInst->hasOneUse(), "bad candidate");
|
|
}
|
|
}
|
|
|
|
// update mad sequence
|
|
for (auto I = mad_begin(), E = mad_end(); I != E; ++I) {
|
|
G4_INST *inst = *I;
|
|
vISA_ASSERT(isMad(inst), "not a mad");
|
|
|
|
const RegionDesc *rd = builder.getRegionStride1();
|
|
G4_SrcRegRegion *accSrcOpnd = builder.createSrc(
|
|
builder.phyregpool.getAcc0Reg(), 0, 0, rd, AdjustedType);
|
|
|
|
inst->setImplAccSrc(accSrcOpnd);
|
|
inst->setSrc(nullptr, 2);
|
|
    // For the last mad, if it has a *B/*W type then no user is modified and we
    // do not change its destination operand. Otherwise, use acc as the
    // destination.
|
|
if (getLastMad() != inst || !lastMadUserChain.empty()) {
|
|
G4_DstRegRegion *accDstOpnd = builder.createDst(
|
|
builder.phyregpool.getAcc0Reg(), 0, 0, 1, AdjustedType);
|
|
inst->setDest(accDstOpnd);
|
|
}
|
|
inst->setOpcode(G4_mac);
|
|
inst->fixMACSrc2DefUse();
|
|
}
|
|
}
|
|
|
|
// Do any kind of preprocessing in this basic block to help MAC transformation.
|
|
// Returns false if we can easily detect this optimization is not possible.
|
|
// Otherwise, returns true.
|
|
static bool preprocessMadInBlock(IR_Builder &builder, G4_BB *bb) {
|
|
bool hasMad = false;
|
|
for (auto inst : *bb) {
|
|
if (isMad(inst)) {
|
|
hasMad = true;
|
|
HWConformity::tryEliminateMadSrcModifier(builder, inst);
|
|
}
|
|
}
|
|
|
|
// nothing to do if there is no mad.
|
|
return hasMad;
|
|
}
|
|
|
|
// clang-format off
|
|
//
|
|
// mul (16) V48(0,0)<1>:d r0.1<0;1,0>:w V42_in(7,2)<16;16,1>:ub {Align1, H1}
|
|
// psuedo_mad (16) V51_tempConvolve(0,0)<1>:d r1.0<0;1,0>:w V42_in(7,1)<16;16,1>:ub V48(0,0)<16;16,1>:d {Align1, H1}
|
|
// psuedo_mad (16) V51_tempConvolve(0,0)<1>:d r0.2<0;1,0>:w V42_in(7,3)<16;16,1>:ub V51_tempConvolve(0,0)<16;16,1>:d {Align1, H1}
|
|
// psuedo_mad (16) V51_tempConvolve(0,0)<1>:d r0.3<0;1,0>:w V42_in(8,1)<16;16,1>:ub V51_tempConvolve(0,0)<16;16,1>:d {Align1, H1}
|
|
// psuedo_mad (16) V51_tempConvolve(0,0)<1>:d r0.4<0;1,0>:w V42_in(8,2)<16;16,1>:ub V51_tempConvolve(0,0)<16;16,1>:d {Align1, H1}
|
|
// psuedo_mad (16) V51_tempConvolve(0,0)<1>:d r0.5<0;1,0>:w V42_in(8,3)<16;16,1>:ub V51_tempConvolve(0,0)<16;16,1>:d {Align1, H1}
|
|
// psuedo_mad (16) V51_tempConvolve(0,0)<1>:d r0.6<0;1,0>:w V42_in(9,1)<16;16,1>:ub V51_tempConvolve(0,0)<16;16,1>:d {Align1, H1}
|
|
// psuedo_mad (16) V51_tempConvolve(0,0)<1>:d r0.7<0;1,0>:w V42_in(9,2)<16;16,1>:ub V51_tempConvolve(0,0)<16;16,1>:d {Align1, H1}
|
|
// psuedo_mad (16) V51_tempConvolve(0,0)<1>:d r0.8<0;1,0>:w V42_in(9,3)<16;16,1>:ub V51_tempConvolve(0,0)<16;16,1>:d {Align1, H1}
|
|
// add (16) V51_tempConvolve(0,0)<1>:d V51_tempConvolve(0,0)<16;16,1>:d 0x4000:w {Align1, H1}
|
|
// shr.sat (16) V52(0,0)<1>:ub V51_tempConvolve(0,0)<16;16,1>:d 0xf:w {Align1, H1}
|
|
//
|
|
// clang-format on
|
|
void Optimizer::lowerMadSequence() {
|
|
|
|
// Only enable CM for now.
|
|
if (kernel.getInt32KernelAttr(Attributes::ATTR_Target) != VISA_CM)
|
|
return;
|
|
|
|
if (!builder.hasMacMacl())
|
|
return;
|
|
|
|
for (G4_BB *bb : fg) {
|
|
// Preprocess this basic block. If no mad sequence found then skip to
|
|
// the next basic block right away.
|
|
if (!preprocessMadInBlock(builder, bb))
|
|
continue;
|
|
|
|
// Object to gather information for ACC optimization.
|
|
MadSequenceInfo madInfo(builder, bb);
|
|
|
|
auto iter = bb->begin();
|
|
while (iter != bb->end()) {
|
|
// Returns an iterator to the next non-mad instruction after the mad
|
|
// sequence. It is safe to insert/delete instructions before it.
|
|
iter = madInfo.populateCandidates(iter);
|
|
|
|
      // Perform the transformation. The resulting IR may still need to be
|
|
// fixed by HWConformity, e.g. the dst may still have *B type.
|
|
if (madInfo.isSafe())
|
|
madInfo.processCandidates();
|
|
|
|
      // Clean up intermediate results, whether a change has been made or not.
|
|
madInfo.reset();
|
|
}
|
|
}
|
|
}
|
|
|
|
void Optimizer::ifCvt() { runIfCvt(fg); }
|
|
|
|
|
|
namespace {
|
|
|
|
enum SplitBounds : unsigned {
|
|
LoLBound = 0,
|
|
LoRBound = 63,
|
|
HiLBound = 64,
|
|
HiRBound = 127
|
|
};
|
|
|
|
static bool isCandidateDecl(G4_Declare *Dcl, const IR_Builder &builder) {
|
|
G4_Declare *RootDcl = Dcl->getRootDeclare();
|
|
if (RootDcl->getRegFile() != G4_GRF)
|
|
return false;
|
|
|
|
// Only split 4GRF variables. We should be able to split > 4GRF variables,
|
|
// but this should have been done in FE.
|
|
if (RootDcl->getByteSize() != 4 * builder.numEltPerGRF<Type_UB>())
|
|
return false;
|
|
|
|
if (RootDcl->getAddressed())
|
|
return false;
|
|
|
|
if (builder.isPreDefArg(RootDcl) || builder.isPreDefRet(RootDcl)) {
|
|
return false;
|
|
}
|
|
|
|
if (Dcl->isOutput())
|
|
return false;
|
|
|
|
// ToDo: add more special declares to exclude list
|
|
|
|
return true;
|
|
}
|
|
|
|
// Associated declarations for splitting.
|
|
struct DclMapInfo {
|
|
  // The low part of the split variable.
|
|
G4_Declare *DclLow;
|
|
|
|
  // The high part of the split variable.
|
|
G4_Declare *DclHigh;
|
|
|
|
// Aliases of the low part. Created if needed for different types.
|
|
std::vector<G4_Declare *> AliasLow;
|
|
|
|
// Aliases of the high part. Created if needed for different types.
|
|
std::vector<G4_Declare *> AliasHigh;
|
|
|
|
DclMapInfo(G4_Declare *Lo, G4_Declare *Hi) : DclLow(Lo), DclHigh(Hi) {}
|
|
|
|
// Return an appropriate declaration/alias for low or high part.
|
|
G4_Declare *getDcl(IR_Builder &Builder, G4_Type Ty, bool IsLow) {
|
|
return IsLow ? getDcl(Builder, Ty, DclLow, AliasLow)
|
|
: getDcl(Builder, Ty, DclHigh, AliasHigh);
|
|
}
|
|
|
|
private:
|
|
G4_Declare *getDcl(IR_Builder &Builder, G4_Type Ty, G4_Declare *RootDcl,
|
|
std::vector<G4_Declare *> &Aliases) {
|
|
if (Ty == RootDcl->getElemType())
|
|
return RootDcl;
|
|
|
|
for (auto AL : Aliases) {
|
|
if (Ty == AL->getElemType())
|
|
return AL;
|
|
}
|
|
|
|
// Create such an alias if it does not exist yet.
|
|
unsigned NElts = RootDcl->getByteSize() / TypeSize(Ty);
|
|
auto Alias = Builder.createTempVar(
|
|
NElts, Ty, Any,
|
|
(std::string(RootDcl->getName()) + "_" + TypeSymbol(Ty)).c_str(),
|
|
false);
|
|
Alias->setAliasDeclare(RootDcl, 0);
|
|
Aliases.push_back(Alias);
|
|
return Alias;
|
|
}
|
|
};
|
|
|
|
} // namespace
|
|
|
|
//
|
|
// We split any 4GRF variables (they typically result from simd16 64-bit vars)
|
|
// into two halves if
|
|
// -- they are not address taken or used in send
|
|
// -- none of the operands cross from the 2nd to the 3rd GRF
|
|
// This is intended to give RA more freedom as the split variables do
|
|
// not have to be allocated contiguously.
|
|
// Note that this invalidates existing def-use chains
|
|
//
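// As a hypothetical illustration (the variable names and regions below are
// made up; only the Lo/Hi suffixes match what the code creates), the rewrite
// this pass aims for looks like:
//   mov (8) V(0,0)<1>:q ...      // touches GRFs 0-1 of the 4-GRF var V only
//   mov (8) V(2,0)<1>:q ...      // touches GRFs 2-3 of V only
// becomes
//   mov (8) VLo(0,0)<1>:q ...
//   mov (8) VHi(0,0)<1>:q ...
// where VLo/VHi are independent 2-GRF temps that RA may place in
// non-contiguous physical registers.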
|
|
void Optimizer::split4GRFVars() {
|
|
std::unordered_set<G4_Declare *> varToSplit;
|
|
std::vector<G4_Declare *> varToSplitOrdering;
|
|
// map each split candidate to their replacement split variables
|
|
std::unordered_map<const G4_Declare *, DclMapInfo *> DclMap;
|
|
|
|
if (kernel.getInt32KernelAttr(Attributes::ATTR_Target) != VISA_3D) {
|
|
return;
|
|
}
|
|
|
|
if (builder.getOption(vISA_Debug)) {
|
|
return;
|
|
}
|
|
|
|
// Only for simd16 and simd32.
|
|
if (kernel.getSimdSize() == g4::SIMD8) {
|
|
return;
|
|
}
|
|
|
|
// first scan the decl list
|
|
for (auto dcl : kernel.Declares) {
|
|
if (dcl->getAliasDeclare() == nullptr) {
|
|
if (isCandidateDecl(dcl, builder)) {
|
|
if (varToSplit.find(dcl) == varToSplit.end()) {
|
|
varToSplitOrdering.push_back(dcl);
|
|
}
|
|
|
|
varToSplit.emplace(dcl);
|
|
}
|
|
} else {
|
|
      // strictly speaking this condition is not necessary, but having
      // no aliases that could point into the middle of the split candidate
|
|
// makes replacing the split var much easier. By construction the root
|
|
// must appear before its alias decls
|
|
uint32_t offset = 0;
|
|
G4_Declare *rootDcl = dcl->getRootDeclare(offset);
|
|
if (offset != 0 && isCandidateDecl(rootDcl, builder)) {
|
|
varToSplit.erase(rootDcl);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (varToSplit.empty()) {
|
|
    // early exit if there are no split candidates
|
|
return;
|
|
}
|
|
|
|
// first pass is to make sure the validity of all split candidates
|
|
for (auto bb : kernel.fg) {
|
|
for (auto inst : *bb) {
|
|
auto removeCandidate = [&varToSplit](G4_Declare *dcl) {
|
|
if (dcl) {
|
|
dcl = dcl->getRootDeclare();
|
|
varToSplit.erase(dcl);
|
|
}
|
|
};
|
|
|
|
if (inst->isSend()) {
|
|
removeCandidate(inst->getDst()->getTopDcl());
|
|
removeCandidate(inst->getSrc(0)->getTopDcl());
|
|
if (inst->isSplitSend()) {
|
|
removeCandidate(inst->getSrc(1)->getTopDcl());
|
|
}
|
|
} else {
|
|
auto cross2GRF = [this](G4_Operand *opnd) {
|
|
uint32_t lb = opnd->getLeftBound();
|
|
uint32_t rb = opnd->getRightBound();
|
|
return (lb < 2u * kernel.numEltPerGRF<Type_UB>()) &&
|
|
(rb >= 2u * kernel.numEltPerGRF<Type_UB>());
|
|
};
|
|
// check and remove decls with operands that cross 2GRF boundary
|
|
if (inst->getDst()) {
|
|
G4_Declare *dstDcl = inst->getDst()->getTopDcl();
|
|
if (dstDcl && cross2GRF(inst->getDst())) {
|
|
removeCandidate(dstDcl);
|
|
}
|
|
}
|
|
for (int i = 0, numSrc = inst->getNumSrc(); i < numSrc; ++i) {
|
|
G4_Operand *src = inst->getSrc(i);
|
|
if (src && src->getTopDcl() && cross2GRF(src)) {
|
|
removeCandidate(src->getTopDcl());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if (varToSplit.empty()) {
|
|
    // early exit if there are no split candidates
|
|
return;
|
|
}
|
|
|
|
// create Lo/Hi for each variable being split
|
|
for (auto splitDcl : varToSplitOrdering) {
|
|
// varToSplitOrdering may have extra elements since we never delete any
|
|
// inserted dcl from it.
|
|
if (varToSplit.find(splitDcl) == varToSplit.end())
|
|
continue;
|
|
G4_Type Ty = splitDcl->getElemType();
|
|
unsigned NElts = splitDcl->getTotalElems();
|
|
std::string varName(splitDcl->getName());
|
|
auto DclLow = builder.createTempVar(NElts / 2, Ty, builder.getGRFAlign(),
|
|
(varName + "Lo").c_str(), false);
|
|
auto DclHi = builder.createTempVar(NElts / 2, Ty, builder.getGRFAlign(),
|
|
(varName + "Hi").c_str(), false);
|
|
DclMap[splitDcl] = new DclMapInfo(DclLow, DclHi);
|
|
// std::cerr << "split " << splitDcl->getName() << " into (" <<
|
|
// DclLow->getName() << ", " << DclHi->getName() << ")\n";
|
|
}
|
|
|
|
// second pass actually does the replacement
|
|
for (auto bb : kernel.fg) {
|
|
for (auto inst : *bb) {
|
|
auto dst = inst->getDst();
|
|
if (dst && dst->getTopDcl()) {
|
|
G4_Declare *dstRootDcl = dst->getTopDcl()->getRootDeclare();
|
|
if (DclMap.count(dstRootDcl)) {
|
|
bool isLow =
|
|
dst->getLeftBound() < 2u * kernel.numEltPerGRF<Type_UB>();
|
|
auto NewDcl =
|
|
DclMap[dstRootDcl]->getDcl(builder, dst->getType(), isLow);
|
|
auto NewDst = builder.createDst(
|
|
NewDcl->getRegVar(), dst->getRegOff() - (isLow ? 0 : 2),
|
|
dst->getSubRegOff(), dst->getHorzStride(), dst->getType(),
|
|
dst->getAccRegSel());
|
|
inst->setDest(NewDst);
|
|
}
|
|
}
|
|
|
|
for (int i = 0, numSrc = inst->getNumSrc(); i < numSrc; ++i) {
|
|
G4_Operand *src = inst->getSrc(i);
|
|
if (src && src->getTopDcl()) {
|
|
G4_SrcRegRegion *srcRegion = src->asSrcRegRegion();
|
|
G4_Declare *srcRootDcl = src->getTopDcl()->getRootDeclare();
|
|
if (DclMap.count(srcRootDcl)) {
|
|
bool isLow =
|
|
src->getLeftBound() < 2u * kernel.numEltPerGRF<Type_UB>();
|
|
auto NewSrcDcl =
|
|
DclMap[srcRootDcl]->getDcl(builder, src->getType(), isLow);
|
|
auto NewSrc = builder.createSrcRegRegion(
|
|
srcRegion->getModifier(), srcRegion->getRegAccess(),
|
|
NewSrcDcl->getRegVar(),
|
|
srcRegion->getRegOff() - (isLow ? 0 : 2),
|
|
srcRegion->getSubRegOff(), srcRegion->getRegion(),
|
|
src->getType(), src->getAccRegSel());
|
|
inst->setSrc(NewSrc, i);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
for (const auto &DI : DclMap) {
|
|
delete DI.second;
|
|
}
|
|
}
|
|
|
|
//
|
|
// A platform may not support 64b types (FP64, INT64, or both).
|
|
// While HW conformity should have legalized away uses of such types, they may
// get re-introduced later by copy moves inserted during spill code
// generation, rematerialization, etc. Instead of checking whether a 64b type
// is used at each createMov(), we add a catch-all pass here. Since this is
// called post-RA, the changes we can make are very limited; for now we just
// handle copy moves. We make this a separate pass instead of part of
// changeMoveType() as the latter is considered an optimization.
|
|
//
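// A minimal sketch of the retyping done below, assuming a platform without
// FP64 support (the register numbers are made up):
//   mov (8) r10.0<1>:df r20.0<1;1,0>:df
// is rewritten as
//   mov (8) r10.0<1>:uq r20.0<1;1,0>:uq
// which copies the same 64 bits without referencing the unsupported type.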
|
|
void Optimizer::legalizeType() {
|
|
if (builder.noFP64() || builder.noInt64()) {
|
|
for (auto bb : kernel.fg) {
|
|
for (auto inst : *bb) {
|
|
auto uses64bType = [](G4_INST *inst) {
|
|
bool useFP64 = false;
|
|
bool useInt64 = false;
|
|
{
|
|
auto dstTy =
|
|
inst->getDst() ? inst->getDst()->getType() : Type_UNDEF;
|
|
if (dstTy == Type_DF) {
|
|
useFP64 = true;
|
|
} else if (dstTy == Type_Q || dstTy == Type_UQ) {
|
|
useInt64 = true;
|
|
}
|
|
}
|
|
for (int i = 0, numSrc = inst->getNumSrc(); i < numSrc; ++i) {
|
|
auto srcTy =
|
|
inst->getSrc(i) ? inst->getSrc(i)->getType() : Type_UNDEF;
|
|
if (srcTy == Type_DF) {
|
|
useFP64 = true;
|
|
} else if (srcTy == Type_Q || srcTy == Type_UQ) {
|
|
useInt64 = true;
|
|
}
|
|
}
|
|
return std::make_tuple(useFP64, useInt64);
|
|
};
|
|
// ToDo: handle more cases (e.g., immSrc, use UD for copy moves)
|
|
if (inst->isRawMov() && inst->getSrc(0)->isSrcRegRegion()) {
|
|
bool hasFP64 = false, hasInt64 = false;
|
|
std::tie(hasFP64, hasInt64) = uses64bType(inst);
|
|
if (hasFP64 && hasInt64) {
|
|
vISA_ASSERT(
|
|
false,
|
|
"can't handle inst with both FP64 and INT64 at this point");
|
|
return;
|
|
}
|
|
if (hasFP64 && builder.noFP64()) {
|
|
vISA_ASSERT(!builder.noInt64(), "can't change DF to UQ");
|
|
inst->getDst()->setType(builder, Type_UQ);
|
|
inst->getSrc(0)->asSrcRegRegion()->setType(builder, Type_UQ);
|
|
}
|
|
if (hasInt64 && builder.noInt64() && !builder.noFP64()) {
|
|
inst->getDst()->setType(builder, Type_DF);
|
|
inst->getSrc(0)->asSrcRegRegion()->setType(builder, Type_DF);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
//
|
|
// Categorize move instructions to help with performance analysis
|
|
//
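// Example of the report printed by this pass (the counts are made up):
//   Move classification:
//       Total:   142
//       Imm32:    20
//       Extend:   11
//       Copy:     87
//       Misc:     24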
|
|
void Optimizer::analyzeMove() {
|
|
|
|
#define MOVE_TYPE(DO) \
|
|
DO(Total) \
|
|
DO(SatOrMod) \
|
|
DO(Imm32) \
|
|
DO(Imm64) \
|
|
DO(FPConvert) \
|
|
DO(Trunc) \
|
|
DO(Extend) \
|
|
DO(Broadcast) \
|
|
DO(UNPACK) \
|
|
DO(PACK) \
|
|
DO(Copy) \
|
|
DO(Misc) \
|
|
DO(LAST)
|
|
|
|
enum MovTypes { MOVE_TYPE(MAKE_ENUM) };
|
|
|
|
static const char *moveNames[] = {MOVE_TYPE(STRINGIFY)};
|
|
|
|
std::array<int, MovTypes::LAST> moveCount = {0};
|
|
|
|
for (auto bb : kernel.fg) {
|
|
for (auto inst : *bb) {
|
|
if (!inst->isMov()) {
|
|
continue;
|
|
}
|
|
moveCount[MovTypes::Total]++;
|
|
|
|
if (inst->getSaturate()) {
|
|
moveCount[MovTypes::SatOrMod]++;
|
|
continue;
|
|
}
|
|
auto dstTy = inst->getDst()->getType();
|
|
auto srcTy = inst->getSrc(0)->getType();
|
|
if (inst->getSrc(0)->isImm()) {
|
|
moveCount[TypeSize(srcTy) == 8 ? MovTypes::Imm64 : MovTypes::Imm32]++;
|
|
} else if (inst->getSrc(0)->isSrcRegRegion()) {
|
|
auto srcRR = inst->getSrc(0)->asSrcRegRegion();
|
|
if (srcRR->getModifier() != Mod_src_undef) {
|
|
moveCount[SatOrMod]++;
|
|
continue;
|
|
}
|
|
bool signChange = false;
|
|
if (dstTy != srcTy) {
|
|
if (IS_FTYPE(srcTy) || IS_FTYPE(dstTy)) {
|
|
// distinguish inttofp and fpconvert?
|
|
moveCount[MovTypes::FPConvert]++;
|
|
} else if (TypeSize(dstTy) > TypeSize(srcTy)) {
|
|
moveCount[MovTypes::Extend]++;
|
|
} else if (TypeSize(srcTy) > TypeSize(dstTy)) {
|
|
moveCount[MovTypes::Trunc]++;
|
|
} else {
|
|
signChange = true;
|
|
}
|
|
}
|
|
if (dstTy == srcTy || signChange) {
|
|
if (srcRR->isScalar()) {
|
|
moveCount[inst->getExecSize() > g4::SIMD1 ? MovTypes::Broadcast
|
|
: MovTypes::Copy]++;
|
|
} else if (srcRR->getRegion()->isContiguous(inst->getExecSize())) {
|
|
moveCount[inst->getDst()->getHorzStride() == 1
|
|
? MovTypes::Copy
|
|
: MovTypes::UNPACK]++;
|
|
} else {
|
|
bool singleStride =
|
|
srcRR->getRegion()->isSingleStride(inst->getExecSize());
|
|
if (singleStride && inst->getDst()->getHorzStride() == 1) {
|
|
moveCount[MovTypes::PACK]++;
|
|
} else {
|
|
// give up
|
|
moveCount[MovTypes::Misc]++;
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
moveCount[MovTypes::Misc]++;
|
|
}
|
|
}
|
|
}
|
|
|
|
std::cerr << "Move classification:\n";
|
|
for (int i = 0; i < MovTypes::LAST; ++i) {
|
|
if (moveCount[i] > 0) {
|
|
std::cerr << "\t" << moveNames[i] << ":\t" << moveCount[i] << "\n";
|
|
}
|
|
}
|
|
|
|
#undef MOVE_TYPE
|
|
}
|
|
|
|
void Optimizer::staticProfiling() {
|
|
  // NOTE: local data flow analysis cannot be called here because regVar info
  // is missing.
|
|
StaticProfiling s(builder, kernel);
|
|
s.run();
|
|
|
|
  // Do static cycle profiling only for platforms that have 3 or more ALU
  // pipelines, and only when shader dump is enabled.
|
|
if (builder.hasThreeALUPipes() &&
|
|
builder.getOptions()->getOption(vISA_outputToFile) &&
|
|
builder.getOptions()->getOption(vISA_staticBBProfiling)) {
|
|
StaticCycleProfiling sc(kernel);
|
|
sc.run();
|
|
}
|
|
}
|
|
|
|
static void markBreakpoint(G4_BB *bb, INST_LIST_ITER it, IR_Builder *builder) {
|
|
[[maybe_unused]] G4_INST *inst = *it;
|
|
vISA_ASSERT(inst->isIntrinsic() &&
|
|
inst->asIntrinsicInst()->getIntrinsicId() ==
|
|
Intrinsic::Breakpoint,
|
|
"expect breakpoint intrinsic");
|
|
auto nextIt = ++it;
|
|
// if we encounter a breakpoint, mark the instruction after the
|
|
// breakpoint intrinsic with breakpoint instruction option
|
|
if (nextIt != bb->end()) {
|
|
// intrinsic is not at the end of bb
|
|
G4_INST *nextInst = *(nextIt);
|
|
nextInst->setOptionOn(InstOpt_BreakPoint);
|
|
} else {
|
|
// intrinsic is at the end of bb
|
|
// create a dummy mov with breakpoint option set
|
|
auto nullDst = builder->createNullDst(Type_UD);
|
|
auto nullSrc = builder->createNullSrc(Type_UD);
|
|
|
|
G4_INST *dummyMov = builder->createMov(g4::SIMD1, nullDst, nullSrc,
|
|
InstOpt_BreakPoint, false);
|
|
bb->push_back(dummyMov);
|
|
}
|
|
}
|
|
|
|
//
|
|
// remove Intrinsics
|
|
//
|
|
void Optimizer::removeIntrinsics() {
|
|
for (auto bb : kernel.fg) {
|
|
for (auto I = bb->begin(); I != bb->end(); ++I) {
|
|
G4_INST *inst = *I;
|
|
if (!inst->isIntrinsic())
|
|
continue;
|
|
if (inst->asIntrinsicInst()->getIntrinsicId() == Intrinsic::Breakpoint) {
|
|
markBreakpoint(bb, I, fg.builder);
|
|
}
|
|
}
|
|
|
|
std::vector<Intrinsic> intrinIdVec = {
|
|
Intrinsic::MemFence,
|
|
Intrinsic::FlagSpill,
|
|
Intrinsic::Breakpoint
|
|
};
|
|
bb->removeIntrinsics(intrinIdVec);
|
|
}
|
|
}
|
|
|
|
//
|
|
// on some platforms int has half the throughput compared to float,
|
|
// so for copy moves we should change their type
|
|
// from D/UD to F or W/UW to HF when possible
|
|
//
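// For instance (illustrative only, register numbers made up), a raw copy move
//   mov (16) r10.0<1>:ud r20.0<1;1,0>:ud
// can be retyped as
//   mov (16) r10.0<1>:f  r20.0<1;1,0>:f
// since a copy transfers the bits unchanged regardless of the type used.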
|
|
void Optimizer::changeMoveType() {
|
|
if (!builder.favorFloatMov() && !builder.balanceIntFloatMoves()) {
|
|
return;
|
|
}
|
|
|
|
if (builder.avoidSrc1Src2Overlap()) {
|
|
return;
|
|
}
|
|
|
|
auto changeType = [this](G4_INST *movInst, G4_Type newTy) {
|
|
movInst->getDst()->setType(builder, newTy);
|
|
auto src0 = movInst->getSrc(0);
|
|
if (src0->isImm()) {
|
|
uint32_t mask = TypeSize(newTy) == 4 ? 0xFFFFFFFF : 0xFFFF;
|
|
movInst->setSrc(
|
|
fg.builder->createImm(src0->asImm()->getImm() & mask, newTy), 0);
|
|
if (newTy == Type_F) {
|
|
uint32_t value = src0->asImm()->getImm() & mask;
|
|
std::stringstream ss;
|
|
ss << "(";
|
|
ss << "0x" << std::setfill('0') << std::hex << std::setw(8) << value;
|
|
ss << ":f)";
|
|
movInst->addComment(ss.str());
|
|
}
|
|
} else {
|
|
movInst->getSrc(0)->asSrcRegRegion()->setType(builder, newTy);
|
|
}
|
|
};
|
|
|
|
auto isCandidateMov = [this](G4_INST *inst) {
|
|
if (inst->opcode() != G4_mov || inst->getSaturate() || inst->getCondMod()) {
|
|
return false;
|
|
}
|
|
auto src0 = inst->getSrc(0);
|
|
G4_Type dstTy = inst->getDst()->getType();
|
|
G4_Type src0Ty = src0->getType();
|
|
if (!inst->getDst()->isGreg()) {
|
|
// don't apply it on ARFs (both dst and src)
|
|
return false;
|
|
}
|
|
// Used only for splitting QW->2xUD
|
|
if (TypeSize(dstTy) == 8) {
|
|
if (dstTy != src0Ty) {
|
|
// allow UD->(Q|UQ) as we zext, but not D->Q
|
|
if (!(IS_TYPE_INT(dstTy) && src0Ty == Type_UD)) {
|
|
return false;
|
|
}
|
|
}
|
|
      // we can split: scalar, contiguous, or stride2 regions w/out SrcMod
|
|
if (src0->isSrcRegRegion() && src0->isGreg()) {
|
|
auto srcReg = src0->asSrcRegRegion();
|
|
bool modifier = srcReg->hasModifier();
|
|
uint16_t stride = 0;
|
|
bool singleStrideMax2 =
|
|
srcReg->getRegion()->isSingleStride(inst->getExecSize(), stride) &&
|
|
(stride <= 2);
|
|
|
|
return !modifier && singleStrideMax2;
|
|
} else // or immediates
|
|
return src0->isImm();
|
|
}
|
|
if (dstTy != src0Ty) {
|
|
// allow D <-> UD and W <-> UW moves
|
|
if (!(IS_TYPE_INT(dstTy) && IS_TYPE_INT(src0Ty) &&
|
|
TypeSize(dstTy) == TypeSize(src0Ty))) {
|
|
return false;
|
|
}
|
|
}
|
|
auto isLegalType = [](G4_Type ty) {
|
|
return TypeSize(ty) == 2 || TypeSize(ty) == 4;
|
|
};
|
|
if (!isLegalType(dstTy) || !isLegalType(src0Ty)) {
|
|
return false;
|
|
}
|
|
|
|
if (src0->isRelocImm()) {
|
|
return false;
|
|
}
|
|
|
|
if (src0->isSrcRegRegion() && src0->isGreg()) {
|
|
auto src0R = src0->asSrcRegRegion();
|
|
bool hasNoModifier = src0R->getModifier() == Mod_src_undef;
|
|
bool hasSimpleRegion =
|
|
src0R->isScalar() ||
|
|
(src0R->getRegion()->isContiguous(inst->getExecSize()) &&
|
|
inst->getDst()->getHorzStride() == 1);
|
|
bool dstSrcAligned =
|
|
src0R->getLinearizedStart() % kernel.numEltPerGRF<Type_UB>() ==
|
|
inst->getDst()->getLinearizedStart() % kernel.numEltPerGRF<Type_UB>();
|
|
return hasNoModifier && hasSimpleRegion && dstSrcAligned;
|
|
} else if (src0->isImm()) {
|
|
// allow sext and zext imm moves
|
|
// float imm can always be converted to int imm
|
|
int64_t immVal = src0->asImm()->getImm();
|
|
bool isIntImmMove = IS_TYPE_INT(dstTy) && IS_TYPE_INT(src0Ty) &&
|
|
G4_Imm::isInTypeRange(immVal, dstTy);
|
|
return isIntImmMove || IS_FTYPE(dstTy) || IS_HFTYPE(dstTy);
|
|
}
|
|
return false;
|
|
};
|
|
auto splitMov64Imm = [this](INST_LIST_ITER curInst, G4_BB *BB) {
|
|
auto firstMovInst = *curInst;
|
|
auto src0 = firstMovInst->getSrc(0);
|
|
auto dst = firstMovInst->getDst();
|
|
auto srcType = src0->getType();
|
|
auto dstType = dst->getType();
|
|
|
|
    // Saturate, CondMod, SrcMod, and regioning are covered when adding to the
    // input list, so there is no need to check them here
|
|
bool isSrcReg = src0->isSrcRegRegion();
|
|
|
|
bool isSrcImm = src0->isImm();
|
|
bool is64to64 = isSrcReg && (srcType == dstType) &&
|
|
(IS_QTYPE(dstType) || dstType == Type_DF);
|
|
bool isU32to64 = isSrcReg && (srcType == Type_UD) &&
|
|
IS_QTYPE(dstType); // can zero extend it
|
|
|
|
if (!(isSrcImm || isU32to64 || is64to64))
|
|
return;
|
|
|
|
// common for each variant
|
|
auto newTy = Type_UD;
|
|
unsigned char execSize = firstMovInst->getExecSize();
|
|
unsigned char secondMovExecSize = firstMovInst->getExecSize();
|
|
|
|
dst =
|
|
builder.createDst(dst->getBase(), dst->getRegOff(),
|
|
2 * dst->getSubRegOff(), dst->getHorzStride(), newTy);
|
|
|
|
G4_Operand *firstMovSrc0 = src0;
|
|
G4_Operand *secondMovSrc0 = nullptr;
|
|
|
|
bool canDoubleExecSize = false;
|
|
if (isSrcImm) {
|
|
uint64_t original = src0->asImm()->getImm();
|
|
uint64_t lopart = original & 0xFFFFFFFF;
|
|
uint64_t hipart = (original >> 32);
|
|
|
|
// original mov takes low part
|
|
firstMovSrc0 = fg.builder->createImm(lopart, newTy);
|
|
|
|
// second mov, with high part and offset
|
|
secondMovSrc0 = fg.builder->createImm(hipart, newTy);
|
|
|
|
/*
|
|
from :
|
|
(W) mov (8|M0) r2.0<1>:df 0x0:df
|
|
|
|
make:
|
|
(W) mov (16|M0) r2.0<1>:ud 0x0:ud
|
|
*/
|
|
if (lopart == hipart)
|
|
canDoubleExecSize = true;
|
|
} else if (isU32to64) {
|
|
// original move src0 stays the same (will have different dst)
|
|
// second mov zero extends type
|
|
// TODO(?): mov r1:uq 0:ud
|
|
secondMovSrc0 = fg.builder->createImm(0, newTy);
|
|
} else if (is64to64) {
|
|
auto src0ASR = src0->asSrcRegRegion();
|
|
auto prevReg = src0ASR->getRegion();
|
|
|
|
src0ASR = builder.createSrcRegRegion(
|
|
src0ASR->getModifier(), src0ASR->getRegAccess(), src0ASR->getBase(),
|
|
src0ASR->getRegOff(), src0ASR->getSubRegOff() * 2,
|
|
src0ASR->getRegion(), newTy);
|
|
|
|
if (prevReg->vertStride <= 1) {
|
|
// from:
|
|
// mov (4|M0) r14.0<1>:q r24.0<1;1,0>:q
|
|
// mov (1|M0) r94.2<1>:q r14.2<0;1,0>:q
|
|
// to:
|
|
// mov (8|M0) r14.0<1>:ud r24.0<1;1,0>:ud
|
|
// mov (2|M0) r94.4<1>:ud r14.4<1;1,0>:ud
|
|
canDoubleExecSize = true;
|
|
|
|
// convert both <0;1,0> and <1;1,0>
|
|
src0ASR->setRegion(builder, fg.builder->getRegionStride1());
|
|
|
|
// just create copy of src region to second mov
|
|
secondMovSrc0 = fg.builder->createSubSrcOperand(
|
|
src0ASR, 0, 2 * execSize, 1, prevReg->width);
|
|
} else {
|
|
/* some weird stuff like
|
|
mov (2|M0) r14.0<1>:q r24.1<2;1,0>:q
|
|
|
|
we should split into 2 (can't double exec).
|
|
mov (2|M0) r14.0<1>:ud r24.2<2;1,0>:ud
|
|
mov (2|M0) r14.1<1>:ud r24.3<2;1,0>:ud
|
|
*/
|
|
|
|
// calculate offset on original regioning at lower type
|
|
secondMovSrc0 = fg.builder->createSubSrcOperand(
|
|
src0ASR, 1, execSize, prevReg->vertStride, prevReg->width);
|
|
|
|
// change to stride2 now
|
|
auto newReg =
|
|
fg.builder->createRegionDesc(execSize, prevReg->vertStride * 2,
|
|
prevReg->width, prevReg->horzStride);
|
|
|
|
src0ASR->setRegion(builder, newReg);
|
|
secondMovSrc0->asSrcRegRegion()->setRegion(builder, newReg);
|
|
}
|
|
}
|
|
firstMovInst->setSrc(firstMovSrc0, 0);
|
|
|
|
// common offset for all paths
|
|
G4_DstRegRegion *secondMovDst;
|
|
|
|
if (canDoubleExecSize) {
|
|
secondMovExecSize *= 2;
|
|
secondMovDst = fg.builder->createSubDstOperand(dst, 0, secondMovExecSize);
|
|
} else {
|
|
secondMovDst = fg.builder->createSubDstOperand(dst, 1, secondMovExecSize);
|
|
|
|
// set HzStride for both dst if it matters
|
|
if (execSize > 1) {
|
|
dst->setHorzStride(2);
|
|
secondMovDst->setHorzStride(2);
|
|
}
|
|
}
|
|
|
|
G4_Predicate *pred =
|
|
firstMovInst->getPredicate()
|
|
? builder.duplicateOperand(firstMovInst->getPredicate())
|
|
: nullptr;
|
|
|
|
    // Create the second mov; only src/dst differ, the rest stays the same
|
|
G4_INST *secondMovInst = builder.createInternalInst(
|
|
pred, G4_mov, nullptr, g4::NOSAT, G4_ExecSize(secondMovExecSize),
|
|
secondMovDst, secondMovSrc0, nullptr, firstMovInst->getOption());
|
|
|
|
BB->insertBefore(curInst, secondMovInst);
|
|
|
|
    // we can't alter the execSize of the first mov, so the new mov takes its
    // place and the original is removed
|
|
    // TODO: we don't estimate the cost of this doubled execSize correctly; need to fix
|
|
if (canDoubleExecSize) {
|
|
BB->erase(curInst);
|
|
}
|
|
|
|
/*
|
|
TODO: currently we do this
|
|
(W) mov (1|M0) r66.0<1>:df 0x37F0000000000000:df
|
|
(W) mov (1|M0) r66.1<1>:df 0x47F0000000000000:df
|
|
(W) mov (1|M0) r66.2<1>:df 0x7FF0000000000000:df
|
|
->
|
|
(W) mov (1|M0) r66.1<1>:ud 0x37F00000:ud
|
|
(W) mov (1|M0) r66.0<1>:ud 0x0:ud
|
|
(W) mov (1|M0) r66.3<1>:ud 0x47F00000:ud
|
|
(W) mov (1|M0) r66.2<1>:ud 0x0:ud
|
|
(W) mov (1|M0) r66.5<1>:ud 0x7FF00000:ud
|
|
(W) mov (1|M0) r66.4<1>:ud 0x0:ud
|
|
|
|
but we could do this ?
|
|
->
|
|
(W) mov (1|M0) r66.1<1>:ud 0x37F00000:ud
|
|
(W) mov (1|M0) r66.3<1>:ud 0x47F00000:ud
|
|
(W) mov (1|M0) r66.5<1>:ud 0x7FF00000:ud
|
|
(W) mov (2|M0) r66.0<2>:ud 0x0:ud
|
|
(W) mov (1|M0) r66.4<1>:ud 0x0:ud
|
|
*/
|
|
};
|
|
|
|
/*
|
|
0 - don't convert.
|
|
1 - per BB balance. <default>
|
|
2 - all suitable 64bit mov (experimental)
|
|
*/
|
|
unsigned SplitMov64Mode =
|
|
fg.builder->getOptions()->getuInt32Option(vISA_SplitMov64);
|
|
|
|
if (builder.balanceIntFloatMoves()) {
|
|
auto dstOrAnySrcIs2GRF = [this](G4_INST *inst) {
|
|
auto dst = inst->getDst();
|
|
bool dstIs2GRF = dst && !dst->isNullReg() && dst->isCrossGRFDst(builder);
|
|
if (dstIs2GRF)
|
|
return true;
|
|
|
|
for (int i = 0, numSrc = inst->getNumSrc(); i < numSrc; ++i) {
|
|
auto curSrc = inst->getSrc(i);
|
|
if (inst->getSrc(i) == nullptr)
|
|
continue;
|
|
if (curSrc->isGreg() && curSrc->asSrcRegRegion()->crossGRF(builder))
|
|
return true;
|
|
}
|
|
return false;
|
|
};
|
|
|
|
// attempt to balance the number of float v. int instructions in each BB
|
|
// by changing the types of int or float copy moves
|
|
for (auto bb : fg) {
|
|
// candidate int and float moves
|
|
std::vector<G4_INST *> intMovs, floatMovs;
|
|
std::vector<INST_LIST_ITER> QWInstructions;
|
|
// int/math/send share one decoder, float and 64b share the other decoder
|
|
int numIntCost = 0, numFloatCost = 0;
|
|
for (auto I = bb->begin(); I != bb->end(); /*empty*/) {
|
|
auto CurI = I++;
|
|
G4_INST *inst = *CurI;
|
|
if (inst->getDst() && !inst->isDpas()) {
|
|
auto execSize = inst->getExecSize();
|
|
G4_Type dstTy = inst->getDst()->getType();
|
|
uint32_t dstTySize = TypeSize(dstTy);
|
|
|
|
uint32_t affectedGRFsCost = dstOrAnySrcIs2GRF(inst) ? 2 : 1;
|
|
|
|
// Assumption:
|
|
// FPU0 : FLT16/FLT32/FLT64/INT64
|
|
// FPU1 : INT16 / INT32 / EM
|
|
if (inst->isMath()) {
|
|
// native simd1 for :DF, simd2 for :F
|
|
numIntCost += (dstTySize == 8) ? execSize : execSize / 2;
|
|
} else if (inst->isSend()) {
|
|
numIntCost++;
|
|
} else if (dstTySize == 8) {
|
|
numFloatCost += affectedGRFsCost;
|
|
if (isCandidateMov(inst)) {
|
|
QWInstructions.push_back(CurI);
|
|
}
|
|
} else {
|
|
if (IS_TYPE_INT(dstTy)) {
|
|
numIntCost += affectedGRFsCost;
|
|
if (isCandidateMov(inst)) {
|
|
intMovs.push_back(inst);
|
|
}
|
|
} else if (IS_TYPE_FLOAT_ALL(dstTy)) {
|
|
numFloatCost += affectedGRFsCost;
|
|
if (isCandidateMov(inst)) {
|
|
floatMovs.push_back(inst);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
// std::cout << "num int cost/mov: " << numIntCost << "/" <<
|
|
// intMovs.size() << " "
|
|
// << "num float cost/mov: " << numFloatCost << "/" <<
|
|
// floatMovs.size() << " "
|
|
// << "QW movs: " << QWInstructions.size() << "\n";
|
|
int diff = std::abs(numIntCost - numFloatCost) / 2;
|
|
|
|
auto changeMovsFromVector = [&](std::vector<G4_INST *> &table,
|
|
G4_Type newType32, G4_Type newType16) {
|
|
for (int i = 0, numInt = table.size(); diff > 0 && i < numInt; ++i) {
|
|
auto inst = table[i];
|
|
auto typeSize = inst->getDst()->getTypeSize();
|
|
G4_Type floatTy = typeSize == 4 ? newType32 : newType16;
|
|
|
|
changeType(inst, floatTy);
|
|
|
|
auto estimatedClockCount = dstOrAnySrcIs2GRF(inst) ? 2 : 1;
|
|
diff -= estimatedClockCount;
|
|
}
|
|
};
|
|
|
|
bool forceSplitAllMov64 = (SplitMov64Mode == 2);
|
|
|
|
if (numIntCost > numFloatCost && !forceSplitAllMov64) {
|
|
// change int move to float move
|
|
changeMovsFromVector(intMovs, Type_F, Type_HF);
|
|
} else {
|
|
// change float move to int move
|
|
changeMovsFromVector(floatMovs, Type_UD, Type_UW);
|
|
|
|
        // if there's still an imbalance
|
|
// split `mov <imm64>` (or `mov 64to64` or mov `u32to64`) into 2x `mov
|
|
// <imm32>`
|
|
// TODO: or maybe split "and", "or" as well
|
|
|
|
        // The changeMovsFromVector() above always added and subtracted the same
        // amount, so it operated on half the diff. Here the amounts may differ,
        // so let's operate on the full diff, not half.
|
|
diff = diff * 2;
|
|
|
|
int rep = 0;
|
|
if (!SplitMov64Mode)
|
|
diff = 0;
|
|
|
|
for (int i = 0, numInt = QWInstructions.size();
|
|
((diff > 0) || forceSplitAllMov64) && i < numInt; ++i) {
|
|
auto inst = *QWInstructions[i];
|
|
auto execSize = inst->getExecSize();
|
|
auto estimatedSrcCost =
|
|
dstOrAnySrcIs2GRF(inst) ? 2 : 1; // cost of mov before change
|
|
|
|
auto dstTypeSize = TypeSize(Type_UD);
|
|
auto estimatedDstCost = (execSize * dstTypeSize * /*HzStride*/ 2) > 32
|
|
? 2
|
|
: 1; // cost of new mov
|
|
|
|
          // it might be that we remove a 1-cycle mov (1) :df and add 2x 1-cycle
          // mov (1) :ud, i.e. the balance shifts by 3 cycles.
|
|
auto new_diff = diff - (2 * estimatedDstCost + estimatedSrcCost);
|
|
|
|
if (abs(new_diff) >= abs(diff) && !forceSplitAllMov64) {
|
|
break;
|
|
}
|
|
|
|
splitMov64Imm(QWInstructions[i], bb);
|
|
|
|
diff = new_diff;
|
|
rep++;
|
|
}
|
|
// std::cout << "diff before " << diff_prev << " after " << diff <<"
|
|
// reps done " << rep << "\n";
|
|
}
|
|
}
|
|
return;
|
|
}
|
|
|
|
for (auto bb : fg) {
|
|
for (auto inst : *bb) {
|
|
if (inst->opcode() != G4_mov) {
|
|
continue;
|
|
}
|
|
      // a copy move means dst and src have identical bits (which implies the same
      // type width) and there is no saturation, conditional modifier, or source
      // modifier. ToDo: we should probably change isRawMov() to include mov UD D
|
|
auto src0 = inst->getSrc(0);
|
|
|
|
      // FIXME: This is a quick WA to bypass RelocImm so that we don't create a
      // new src0 and overwrite the original RelocImm, even though this
      // optimization should still be applicable to RelocImm. Once we turn on this
      // optimization for RelocImm, we should update the assert in
      // VISAKernelImpl::GetRelocations to allow float types
|
|
if (src0->isRelocImm())
|
|
continue;
|
|
|
|
G4_Type dstTy = inst->getDst()->getType();
|
|
G4_Type src0Ty = src0->getType();
|
|
bool hasNoModifier =
|
|
!inst->getSaturate() && !inst->getCondMod() &&
|
|
(src0->isImm() ||
|
|
(src0->isSrcRegRegion() &&
|
|
src0->asSrcRegRegion()->getModifier() == Mod_src_undef));
|
|
|
|
// it may be unsafe to change the move type for acc as it has higher
|
|
// precision
|
|
if (inst->getDst()->isGreg() && hasNoModifier) {
|
|
if (src0->isGreg()) {
|
|
bool isIntCopyMove = IS_TYPE_INT(dstTy) && IS_TYPE_INT(src0Ty) &&
|
|
TypeSize(dstTy) == TypeSize(src0Ty);
|
|
if (isIntCopyMove) {
|
|
if (dstTy == Type_D || dstTy == Type_UD) {
|
|
changeType(inst, Type_F);
|
|
} else if (dstTy == Type_W || dstTy == Type_UW) {
|
|
changeType(inst, Type_HF);
|
|
}
|
|
}
|
|
} else if (src0->isImm()) {
|
|
// allow sext and zext imm moves
|
|
int64_t immVal = src0->asImm()->getImm();
|
|
bool isIntImmMove = IS_TYPE_INT(dstTy) && IS_TYPE_INT(src0Ty) &&
|
|
G4_Imm::isInTypeRange(immVal, dstTy);
|
|
if (isIntImmMove) {
|
|
if (dstTy == Type_D || dstTy == Type_UD) {
|
|
changeType(inst, Type_F);
|
|
} else if (dstTy == Type_W || dstTy == Type_UW) {
|
|
changeType(inst, Type_HF);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
static bool isDeadInst(FlowGraph &fg, G4_INST *Inst) {
|
|
if ((Inst->isMov() && !Inst->isRelocationMov() &&
|
|
!Inst->getDst()->isNullReg()) ||
|
|
Inst->isLogic() || Inst->isCompare() || Inst->isArithmetic() ||
|
|
Inst->isVector()) {
|
|
|
|
// Check side-effects.
|
|
// - no global
|
|
// - no indirect
|
|
// - not physically assigned (including ARF)
|
|
auto checkOpnd = [&](G4_Operand *Opnd) {
|
|
if (Opnd == nullptr || Opnd->isNullReg())
|
|
return true;
|
|
if (fg.globalOpndHT.isOpndGlobal(Opnd))
|
|
return false;
|
|
if (Opnd->isDstRegRegion() && Opnd->asDstRegRegion()->isIndirect())
|
|
return false;
|
|
if (G4_VarBase *Base = Opnd->getBase()) {
|
|
if (!Base->isRegVar())
|
|
return false;
|
|
if (Base->asRegVar()->isPhyRegAssigned())
|
|
return false;
|
|
}
|
|
if (G4_Declare *Dcl = Opnd->getTopDcl()) {
|
|
if (Dcl->isPreDefinedVar()) {
|
|
// This can be improved by checking each preDefinedVar
|
|
return false;
|
|
}
|
|
if (Dcl->isOutput() || Dcl->isPayloadLiveOut())
|
|
return false;
|
|
}
|
|
return true;
|
|
};
|
|
|
|
// Should have no use.
|
|
if (Inst->use_size() > 0)
|
|
return false;
|
|
|
|
// Skip instructions with special attributes.
|
|
if (Inst->isYieldInst() || Inst->isBreakPointInst())
|
|
return false;
|
|
|
|
// Check defs. Assuming acc operands are all locally defined
|
|
// and def-use are correctly maintained.
|
|
if (!checkOpnd(Inst->getDst()) || !checkOpnd(Inst->getCondMod()))
|
|
return false;
|
|
|
|
// At this point, this instruction is dead.
|
|
return true;
|
|
}
|
|
|
|
// By default it is not dead.
|
|
return false;
|
|
}
|
|
|
|
// DCE() is disabled if the kernel has non-LSC messages because of the issue
// below: some cases have inaccurate kills, so it is unsafe to turn it on by
// default. For example,
// 1. mov (1) r10.2:ud r20.0:ud
// 2. send (1) r10:ud ... // a32 dword read
// 3. ... r10.2 ...
// In this case, send's footprint is the entire r10 (0-7 dw), so it kills 1).
// In fact, send only modifies r10.0:ud (a single dw), so it actually does not
// kill 1). If dce is on, it will use the false kill info to remove 1); as a
// result, the code would be wrong.
|
|
//
|
|
//
|
|
void Optimizer::dce() {
|
|
// Do not do DCE if there is any legacy message.
|
|
for (auto bb : fg) {
|
|
for (auto I = bb->rbegin(), E = bb->rend(); I != E; ++I) {
|
|
G4_INST *Inst = *I;
|
|
if (Inst->isSend() && Inst->getMsgDesc()->isHDC()) {
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
// make sure dataflow is up to date
|
|
kernel.fg.resetLocalDataFlowData();
|
|
kernel.fg.localDataFlowAnalysis();
|
|
|
|
for (auto bb : fg) {
|
|
for (auto I = bb->rbegin(), E = bb->rend(); I != E; ++I) {
|
|
G4_INST *Inst = *I;
|
|
if (isDeadInst(fg, Inst)) {
|
|
Inst->removeAllDefs();
|
|
Inst->markDead();
|
|
}
|
|
}
|
|
bb->erase(std::remove_if(bb->begin(), bb->end(),
|
|
[](G4_INST *Inst) { return Inst->isDead(); }),
|
|
bb->end());
|
|
}
|
|
}
|
|
|
|
// Barrier is translated into signal and wait instructions. Both are scheduling
// barriers, so no other instruction can be scheduled in between them.
// However, the signal takes a while to propagate among threads, so we can
// treat wait as a scheduling barrier for mayLoad/mayStore instructions only to
// improve performance. This pass tries to sink a barrier wait until a
// non-scratch send is found, or until there's a possible flag overlap for
// nbarrier cases.
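// A sketch of the intended motion (illustrative instruction stream, not
// actual dump output):
//   <barrier signal send>
//   wait n0.0
//   add ...                // ALU work, no memory access
//   send ...               // first non-scratch send
// becomes
//   <barrier signal send>
//   add ...
//   wait n0.0              // sunk to just before the send
//   send ...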
|
|
void Optimizer::sinkBarrierWait() {
|
|
// Skip the optimization when barrier WA is required.
|
|
if (builder.needBarrierWA())
|
|
return;
|
|
|
|
  // TODO: Check whether there's really a flag overlap between the two
  // instructions. Given that a named barrier with a flag src0 is probably
  // rarely used, we currently simply treat the case where wait's src0 is a
  // flag and the current inst writes a flag as an overlap.
|
|
auto hasFlagOverlap = [](const G4_INST *i1, const G4_INST *i2) -> bool {
|
|
vASSERT(i1 && i1->opcode() == G4_wait);
|
|
return i1->getSrc(0)->isFlag() && i2->writesFlag();
|
|
};
|
|
|
|
for (auto bb : fg) {
|
|
INST_LIST waits;
|
|
for (auto it = bb->begin(), ie = bb->end(); it != ie;) {
|
|
G4_INST *inst = *it;
|
|
// Move any barrier wait to the temporary list.
|
|
if (inst->opcode() == G4_wait) {
|
|
auto next = std::next(it);
|
|
waits.splice(waits.end(), bb->getInstList(), it);
|
|
it = next;
|
|
continue;
|
|
}
|
|
|
|
// Move all barrier waits from the temporary list back to inst list right
|
|
// before an interesting position like a non-scratch send or
|
|
// a control-flow instruction.
|
|
// TODO: Check if we can relax or need more restrictions. For example,
|
|
// private memory access probably could also be skipped.
|
|
if ((inst->isSend() && !inst->getMsgDesc()->isScratch()) ||
|
|
inst->isCFInst())
|
|
bb->splice(it, waits);
|
|
|
|
// When there's any wait that has a flag overlap with the current inst,
|
|
// move the range [waits.begin(), last overlapping wait iterator] back to
|
|
// the inst list so that the waits are not reordered.
|
|
auto rwit = std::find_if(waits.rbegin(), waits.rend(),
|
|
[=](const G4_INST *i) { return hasFlagOverlap(i, inst); });
|
|
if (rwit != waits.rend()) {
|
|
G4_INST *prev = nullptr, *last = *rwit;
|
|
auto wit = waits.begin();
|
|
while (prev != last) {
|
|
prev = *wit;
|
|
auto next = std::next(wit);
|
|
bb->splice(it, waits, wit);
|
|
wit = next;
|
|
};
|
|
}
|
|
|
|
++it;
|
|
}
|
|
    // Every BB should end with an EOT or a CF inst like goto/jmpi/ret.
|
|
vASSERT(waits.empty());
|
|
}
|
|
}
|