/*===================== begin_copyright_notice ==================================

Copyright (c) 2017 Intel Corporation

Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

======================= end_copyright_notice ==================================*/
#include "FlowGraph.h"
|
|
#include "GraphColor.h"
|
|
#include "SpillManagerGMRF.h"
|
|
#include <list>
|
|
#include "SpillCleanup.h"
|
|
|
|
uint32_t computeFillMsgDesc(unsigned int payloadSize, unsigned int offset);
|
|
uint32_t computeSpillMsgDesc(unsigned int payloadSize, unsigned int offset);
|
|
|
|
namespace vISA
|
|
{
|
|
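// Overview (summarizing run() at the bottom of this file): CoalesceSpillFills
// cleans up scratch spill/fill messages after spill code insertion. It first
// removes redundant split movs, then coalesces nearby fills and spills into
// wider scratch messages, replaces fills whose data was just written and is
// still live in registers with movs, removes scratch writes that are never
// read back, and finally fixes up any sends src overlap the rewriting
// introduced.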
G4_SrcRegRegion* CoalesceSpillFills::generateCoalescedSpill(unsigned int scratchOffset, unsigned int payloadSize,
    G4_SendMsgDescriptor* sample, bool useNoMask, G4_InstOption mask, int srcCISAOff, G4_Declare* spillDcl, unsigned int row)
{
    // Generate split send instruction with specified payload size and offset
    auto header = kernel.fg.builder->createSrcRegRegion(Mod_src_undef, Direct,
        kernel.fg.builder->getBuiltinR0()->getRegVar(), 0, 0,
        kernel.fg.builder->getRegionStride1(), Type_UD);
    auto spillSrcPayload = kernel.fg.builder->createSrcRegRegion(Mod_src_undef, Direct, spillDcl->getRegVar(),
        (short)row, 0, kernel.fg.builder->getRegionStride1(), Type_UD);

    uint32_t spillMsgDesc = computeSpillMsgDesc(payloadSize, scratchOffset);

    G4_SendMsgDescriptor* msgDesc = kernel.fg.builder->createSendMsgDesc(spillMsgDesc & 0x000FFFFFu,
        0, 1, SFID_DP_DC, false, payloadSize, 0, false, true,
        sample->getBti(), sample->getSti());

    G4_Imm* msgDescImm = kernel.fg.builder->createImm(msgDesc->getDesc(), Type_UD);
    G4_Imm* extDesc = kernel.fg.builder->createImm(msgDesc->getExtendedDesc(), Type_UD);

    // Create send instruction with payloadSize starting at scratch offset min
    unsigned int option = useNoMask ? InstOpt_WriteEnable : 0;
    auto spillInst = kernel.fg.builder->createSplitSendInst(nullptr, G4_sends, 16,
        kernel.fg.builder->createNullDst(Type_UW), header, spillSrcPayload, msgDescImm, option, msgDesc, extDesc);

    if (!useNoMask)
    {
        // Care about mask only for non-NoMask sends
        spillInst->setMaskOption(mask);
    }

    spillInst->setCISAOff(srcCISAOff);

#if 0
    spillInst->dump();
#endif

    return spillSrcPayload;
}
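// Note on the message shape used above and in generateCoalescedFill() below:
// the coalesced spill is a split send (G4_sends) whose src0 is the r0-based
// header and whose src1 carries the data GRFs, while the coalesced fill is a
// plain send that reads into the freshly created COAL_FILL_* dcl.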
G4_DstRegRegion* CoalesceSpillFills::generateCoalescedFill(unsigned int scratchOffset, unsigned int payloadSize,
    unsigned int dclSize, G4_SendMsgDescriptor* sample, int srcCISAOff, G4_Align alignment)
{
    // Generate split send instruction with specified payload size and offset
    // Construct fillDst
    char* dclName = kernel.fg.builder->getNameString(kernel.fg.mem, 32,
        "COAL_FILL_%d", kernel.Declares.size());
    auto fillDcl = kernel.fg.builder->createDeclareNoLookup(dclName, G4_GRF,
        8, (unsigned short)dclSize, Type_UD, DeclareType::CoalescedFill);

    fillDcl->setAlign(alignment);
    fillDcl->setDoNotSpill();

    auto fillDst = kernel.fg.builder->createDstRegRegion(Direct, fillDcl->getRegVar(), 0,
        0, 1, Type_UW);
    auto header = kernel.fg.builder->createSrcRegRegion(Mod_src_undef, Direct,
        kernel.fg.builder->getBuiltinR0()->getRegVar(), 0, 0,
        kernel.fg.builder->getRegionStride1(), Type_UD);

    uint32_t fillMsgDesc = computeFillMsgDesc(payloadSize, scratchOffset);

    G4_SendMsgDescriptor* msgDesc = kernel.fg.builder->createSendMsgDesc(fillMsgDesc,
        payloadSize, 1, sample->getFuncId(), false, 0, sample->getExtFuncCtrl(), true, false,
        sample->getBti(), sample->getSti());

    G4_Imm* msgDescImm = kernel.fg.builder->createImm(msgDesc->getDesc(), Type_UD);
    G4_Imm* extDesc = kernel.fg.builder->createImm(sample->getExtendedDesc(), Type_UD);

    // Create send instruction with payloadSize starting at scratch offset min
    auto fillInst = kernel.fg.builder->createSendInst(nullptr, G4_send, 16,
        fillDst, header, extDesc, msgDescImm, InstOpt_WriteEnable, true, false, nullptr);
    fillInst->setCISAOff(srcCISAOff);

#if 0
    fillInst->dump();
#endif

    return fillDst;
}
void CoalesceSpillFills::copyToOldFills(G4_DstRegRegion* coalescedFillDst, std::list<std::pair<G4_DstRegRegion*, std::pair<unsigned int, unsigned int>>> indFills,
    INST_LIST_ITER f, G4_BB* bb, int srcCISAOff)
{
    // Copy data from the coalesced fill in to the older fills.
    // This way we don't carry the entire coalesced payload
    // till the last fill.
    for (auto oldFill : indFills)
    {
        unsigned int numGRFs = (oldFill.first->getRightBound() - oldFill.first->getLeftBound()
            + G4_GRF_REG_NBYTES - 1) / G4_GRF_REG_NBYTES;
        unsigned int rowOff = 0;
        // TODO: Check for > 2 GRF dst
        while (numGRFs > 0)
        {
            unsigned int simdSize = 8;

            unsigned int off = oldFill.second.first;
            unsigned int size = oldFill.second.second;

            unsigned int scratchOff = coalescedFillDst->getInst()->getMsgDesc()->getScratchRWOffset();

            // Scratch msg offset is always equal or lower than individual fills
            unsigned int offToUse = off - scratchOff + rowOff;

            if (size > 8)
                simdSize = 16;

            G4_DstRegRegion* movDst = kernel.fg.builder->createDstRegRegion(Direct,
                oldFill.first->getBase(), (short)rowOff, 0, 1, Type_UD);

            G4_SrcRegRegion* src = kernel.fg.builder->createSrcRegRegion(Mod_src_undef, Direct,
                coalescedFillDst->getBase(), (short)offToUse, 0, kernel.fg.builder->getRegionStride1(), Type_UD);

            G4_INST* copy = kernel.fg.builder->createInternalInst(nullptr, G4_mov, nullptr, false, (unsigned char)simdSize,
                movDst, src, nullptr, InstOpt_WriteEnable);
            copy->setCISAOff(srcCISAOff);

            bb->instList.insert(f, copy);

            numGRFs -= simdSize == 8 ? 1 : 2;
            rowOff += simdSize == 8 ? 1 : 2;
        }
    }
}
G4_Declare* CoalesceSpillFills::createCoalescedSpillDcl(unsigned int payloadSize)
{
    // Construct spill src
    char* dclName = nullptr;
    G4_Declare* spillDcl = nullptr;

    dclName = kernel.fg.builder->getNameString(kernel.fg.mem, 32,
        "COAL_SPILL_%d", kernel.Declares.size());
    spillDcl = kernel.fg.builder->createDeclareNoLookup(dclName, G4_GRF,
        8, (unsigned short)payloadSize, Type_UD, DeclareType::CoalescedSpill);

    spillDcl->setDoNotSpill();

    return spillDcl;
}
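// Illustrative example for coalesceSpills() below (offsets hypothetical):
// two adjacent SIMD8 scratch writes,
//   sends (8) null r0 V10(0,0) ...  scratch write, offset = 2, 1 GRF
//   sends (8) null r0 V11(0,0) ...  scratch write, offset = 3, 1 GRF
// become a single 2-GRF write at offset 2. When the payloads come from
// different dcls, a COAL_SPILL_* dcl is created and each old dcl gets a
// (coalesced dcl, row offset) entry in replaceMap so later defs/uses are
// rewritten into the coalesced range.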
void CoalesceSpillFills::coalesceSpills(std::list<INST_LIST_ITER>& coalesceableSpills, unsigned int min,
    unsigned int max, bool useNoMask, G4_InstOption mask, G4_BB* bb, int srcCISAOff)
{
    // Generate a spill of size = (max - min) + 1 GRFs. This should be compatible with
    // the payload sizes supported by hardware.
    unsigned int payloadSize = (max - min) + 1;

    MUST_BE_TRUE(payloadSize == 1 || payloadSize == 2 || payloadSize == 4 || payloadSize == 8,
        "Unsupported payload size");

    std::set<G4_Declare*> declares;
    unsigned int minRow = UINT_MAX;
    for (auto d : coalesceableSpills)
    {
        auto src1Opnd = (*d)->getSrc(1)->asSrcRegRegion();
        auto curRow = src1Opnd->getLeftBound() / G4_GRF_REG_NBYTES;
        declares.insert(src1Opnd->getTopDcl());
        minRow = minRow > curRow ? curRow : minRow;
    }

    G4_Declare* dcl = nullptr;
    if (declares.size() == 1)
    {
        dcl = (*declares.begin());
    }
    else
    {
        dcl = createCoalescedSpillDcl(payloadSize);
        minRow = 0;
    }

    auto coalescedSpillSrc = generateCoalescedSpill(min,
        payloadSize, (*coalesceableSpills.front())->getMsgDesc(),
        useNoMask, mask, srcCISAOff, dcl, minRow);

    if (declares.size() != 1)
    {
        for (auto c : coalesceableSpills)
        {
            unsigned int scratchOffset, scratchSize;
            getScratchMsgInfo((*c), scratchOffset, scratchSize);
            MUST_BE_TRUE((*c)->getSrc(0)->asSrcRegRegion()->getTopDcl() == kernel.fg.builder->getBuiltinR0(),
                "Unexpected src0");

            unsigned int rowOff = scratchOffset - min;
            replaceMap.insert(std::make_pair((*c)->getSrc(1)->getTopDcl(),
                std::make_pair(coalescedSpillSrc->getTopDcl(), rowOff)));
        }
    }

    auto f = coalesceableSpills.back();
    f++;

    for (auto spill : coalesceableSpills)
    {
        bb->instList.erase(spill);
    }
    coalesceableSpills.clear();
    auto copyIt = bb->instList.insert(f, coalescedSpillSrc->getInst());
}
void CoalesceSpillFills::coalesceFills(std::list<INST_LIST_ITER>& coalesceableFills, unsigned int min,
    unsigned int max, G4_BB* bb, int srcCISAOff)
{
    // Generate a fill of size = (max - min) + 1 GRFs, rounded up to a
    // payload size supported by hardware.
    unsigned int payloadSize = (max - min) + 1;
    if (payloadSize == 3)
        payloadSize = 4;
    else if (payloadSize > 4)
        payloadSize = 8;
    else if (payloadSize == 0)
        payloadSize = 1;

    MUST_BE_TRUE(payloadSize == 1 || payloadSize == 2 || payloadSize == 4 || payloadSize == 8,
        "Unsupported payload size");

    // dclSize could be larger than payload size when
    // 2 variables across scratch writes are coalesced.
    unsigned int dclSize = payloadSize;
    for (auto c : coalesceableFills)
    {
        unsigned int scratchOffset, scratchSize;
        auto fill = (*c);
        getScratchMsgInfo((*c), scratchOffset, scratchSize);

        auto fillDst = fill->getDst();
        auto fillDstRegOff = fillDst->getRegOff();
        unsigned int dstDclRows = fillDst->getTopDcl()->getNumRows();
        unsigned int maxRow = dstDclRows + scratchOffset - fillDstRegOff - min;

        if (maxRow > dclSize)
            dclSize = maxRow;
    }

    auto leadInst = *coalesceableFills.front();

    auto coalescedFillDst = generateCoalescedFill(min, payloadSize, dclSize,
        leadInst->getMsgDesc(), srcCISAOff, leadInst->getDst()->getTopDcl()->getAlign());

    for (auto c : coalesceableFills)
    {
        unsigned int scratchOffset, scratchSize;
        getScratchMsgInfo((*c), scratchOffset, scratchSize);

        unsigned int rowOff = scratchOffset - min;
        replaceMap.insert(std::make_pair((*c)->getDst()->getTopDcl(),
            std::make_pair(coalescedFillDst->getTopDcl(), rowOff)));
    }

    auto f = coalesceableFills.front();
    f++;

    for (auto fill : coalesceableFills)
    {
        if (fill == f)
        {
            f++;
        }
        bb->instList.erase(fill);
    }

    coalesceableFills.clear();
    auto copyIt = bb->instList.insert(f, coalescedFillDst->getInst());

    // Insert pseudo kill for coalesced range
    auto pseudoKill = kernel.fg.builder->createInternalInst(nullptr, G4_pseudo_kill, nullptr,
        false, 1, kernel.fg.builder->createDstRegRegion(*coalescedFillDst), nullptr, nullptr, 0);
    bb->instList.insert(copyIt, pseudoKill);

    // copyToOldFills(coalescedFillDst, indFills, f, bb, srcCISAOff);
}
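// Example of the payload rounding above (offsets hypothetical): fills at
// scratch offsets 1 and 3, each 1 GRF, give min = 1, max = 3, and a raw
// size of 3 GRFs, which is rounded up to the 4-GRF message the hardware
// supports; row 2 is fetched but left unused.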
// Return true if the heuristic agrees to coalescing.
bool CoalesceSpillFills::fillHeuristic(std::list<INST_LIST_ITER>& coalesceableFills,
    std::list<INST_LIST_ITER>& instList, const std::list<INST_LIST_ITER>& origInstList,
    unsigned int& min, unsigned int& max)
{
#if 0
    std::bitset<8> bits(0);
    MUST_BE_TRUE(cMaxFillPayloadSize == 8, "Handle other max fill payload size");
#else
    std::bitset<4> bits(0);
    MUST_BE_TRUE(cMaxFillPayloadSize == 4, "Handle other max fill payload size");
#endif

    if (coalesceableFills.size() <= 1)
    {
        return false;
    }

    min = 0xffffffff, max = 0;
    for (auto f : coalesceableFills)
    {
        unsigned int scratchOffset, scratchSize;
        getScratchMsgInfo(*f, scratchOffset, scratchSize);

        if (scratchSize == 8)
        {
            return false;
        }

        if (addrTakenSpillFillDcl.find((*f)->getDst()->getTopDcl()) !=
            addrTakenSpillFillDcl.end())
        {
            return false;
        }

        for (auto i = scratchOffset; i < (scratchOffset + scratchSize); i++)
            bits.set(i - scratchOffset);

        if (min > scratchOffset)
            min = scratchOffset;

        if (max < (scratchOffset + scratchSize - 1))
            max = (scratchOffset + scratchSize - 1);
    }

    // Iterate over coalesceable fills and ensure all rows of a variable
    // are fill candidates. If not, then don't fill. This helps cases like,
    // #1 FILL_V10(0,0) <-- load 0x10 ... (4 GRFs)
    // #2 FILL_V10(4,0) <-- load 0x14 ... (1 GRF)
    // #3 send ... FILL_V10(0,0) ... (use 3 GRFs of FILL_V10)
    // #4 FILL_V11(0,0) <-- load 0x15 ... (1 GRF)
    //
    // Loads at #2 and #4 can be coalesced. But this requires a new coalesced
    // variable of size = 6 GRFs. This size can quickly increase for Cm where
    // payloads of 8 GRFs are also present. So instead of allowing these cases
    // at the risk of spilling more, we require that all rows of a fill range
    // are candidates in the coalesceableFills list. So when only #2 and #4 are
    // fill candidates, we will not coalesce them. This makes us miss some
    // cases where subset rows of fills have been converted to movs.
    const int maxDclSize = 128;

    std::map<G4_Declare*, std::bitset<maxDclSize>> allRows;
    for (auto c : coalesceableFills)
    {
        auto topdcl = (*c)->getDst()->getTopDcl();

        unsigned int scratchOffset, scratchSize;
        getScratchMsgInfo(*c, scratchOffset, scratchSize);

        auto it = allRows.find(topdcl);
        if (it == allRows.end())
        {
            allRows.insert(std::make_pair(topdcl, std::bitset<maxDclSize>()));
            it = allRows.find(topdcl);
        }

        // Now mark bits corresponding to rows
        unsigned int regOff = (*c)->getDst()->getRegOff();
        for (unsigned int r = regOff;
            r < (regOff + scratchSize); r++)
        {
            it->second.set(r);
        }
    }

    // Check whether all dcls in map have all rows filled
    for (auto&& r : allRows)
    {
        unsigned int numRows = r.first->getNumRows();

        for (unsigned int i = 0; i < numRows; i++)
        {
            if (r.second.test(i) == false)
            {
                // Found a row of the variable that isn't captured in
                // the list of candidate fills.
                return false;
            }
        }
    }

#if 0
    for (auto f : coalesceableFills)
    {
        unsigned int scratchOffset, scratchSize;
        getScratchMsgInfo(*f, scratchOffset, scratchSize);

        for (auto i = scratchOffset; i < (scratchOffset + scratchSize); i++)
            bits.set(i - min);
    }
#endif

    if (max - min <= 3)
    {
        // Will emit at most a 4-GRF read
        if (bits[0] != bits[1] &&
            bits[2] != bits[3])
        {
            // Don't coalesce patterns like
            // 1010, 0101
            return false;
        }

        if ((bits[0] & bits[3]) &&
            !(bits[1] | bits[2]))
        {
            // 1001
            return false;
        }
    }

    return true;
}
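// Note on fillHeuristic's bit patterns: bits[] is meant to mark which GRF
// slots of a candidate 4-slot window are actually read (see the disabled
// variant above that indexes by i - min), and windows that would fetch dead
// rows are rejected. For instance (hypothetical offsets), 1-GRF fills at
// offsets 4 and 7 give min = 4, max = 7 and the pattern 1001; coalescing
// them would load two unused middle rows.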
// instList contains all instructions (fills or spills) within the window size.
// At exit, instList will contain the instructions that will not be coalesced.
// The coalescable list will contain instructions within the min-max offset range.
// The first instruction's offset in instList is set to be min. max is
// min + maxPayloadSize - 1.
void CoalesceSpillFills::sendsInRange(std::list<INST_LIST_ITER>& instList,
    std::list<INST_LIST_ITER>& coalescable,
    unsigned int maxPayloadSize, unsigned int& min, unsigned int& max)
{
    min = 0xffffffff;
    max = 0;
    bool isFirstNoMask = false;
    unsigned int mask = 0;
    for (auto iter = instList.begin();
        iter != instList.end();
        )
    {
        unsigned scratchOffset, sizeInGrfUnit, lastScratchOffset;
        auto inst = *(*iter);
        scratchOffset = inst->getMsgDesc()->getScratchRWOffset();
        sizeInGrfUnit = inst->getMsgDesc()->getScratchRWSize();
        lastScratchOffset = scratchOffset + sizeInGrfUnit - 1;

        if (min == 0xffffffff && max == 0)
        {
            // First spill is definitely a candidate
            min = scratchOffset;
            max = lastScratchOffset;
            coalescable.push_back(*iter);
            iter = instList.erase(iter);
            isFirstNoMask = inst->isWriteEnableInst();
            mask = inst->getMaskOption();

            if (addrTakenSpillFillDcl.find(inst->getDst()->getTopDcl()) !=
                addrTakenSpillFillDcl.end())
            {
                return;
            }

            continue;
        }

        if (min != 0xffffffff || max != 0)
        {
            bool maskMatch = (isFirstNoMask && inst->isWriteEnableInst()) ||
                (mask == inst->getMaskOption());

            // Don't coalesce if a non-leading fill inst has alignment requirements,
            // as we may not be able to satisfy them
            bool fillDstisAligned = inst->getDst()->getTopDcl()->getAlign() != Either;

            if (!maskMatch || fillDstisAligned)
            {
                iter++;
                continue;
            }

            // Check whether min/max can be extended
            if (scratchOffset <= min &&
                (min - scratchOffset) <= (cMaxFillPayloadSize - 1) &&
                (max - scratchOffset) <= (cMaxFillPayloadSize - 1))
            {
                // This instruction can be coalesced
                min = scratchOffset;
                if (max < lastScratchOffset)
                    max = lastScratchOffset;

                //MUST_BE_TRUE(max - min <= (cMaxFillPayloadSize - 1), "Unexpected fills coalesced. (max - min) is out of bounds - 1");

                coalescable.push_back(*iter);
                iter = instList.erase(iter);
            }
            else if (scratchOffset >= max &&
                (lastScratchOffset - min) <= (cMaxFillPayloadSize - 1) &&
                (lastScratchOffset - max) <= (cMaxFillPayloadSize - 1))
            {
                max = lastScratchOffset;

                //MUST_BE_TRUE(max - min <= cMaxFillPayloadSize, "Unexpected spills coalesced. (max - min) is out of bounds - 2");

                coalescable.push_back(*iter);
                iter = instList.erase(iter);
            }
            else if (scratchOffset >= min &&
                lastScratchOffset <= max)
            {
                coalescable.push_back(*iter);
                iter = instList.erase(iter);
            }
            else
            {
                iter++;
            }
        }
    }
}
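// Example of the range extension above (hypothetical offsets, 4-GRF max
// payload): with a leading fill at offset 5 (min = max = 5), a later fill at
// offset 3 extends the window to [3, 5], one at offset 6 extends it to
// [3, 6], and one at offset 8 is left in instList because [3, 8] would
// exceed the payload limit.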
// instList contains all spills seen in the window.
// coalescable is empty and should contain consecutive spills.
// This function will prune spills so they write consecutive
// memory slots. The first spill is the first candidate to start the window.
void CoalesceSpillFills::keepConsecutiveSpills(std::list<INST_LIST_ITER>& instList,
    std::list<INST_LIST_ITER>& coalescable,
    unsigned int maxPayloadSize, unsigned int& minOffset, unsigned int& maxOffset,
    bool& useNoMask, G4_InstOption& mask)
{
    // The allowed list contains instructions to be coalesced in
    // ascending order of their spill slots.
    std::list<INST_LIST_ITER> allowed;
    auto origInstList = instList;
    allowed.push_back(instList.front());
    instList.pop_front();
    unsigned int maskOffset = (*allowed.front())->getMaskOption();
    mask = (G4_InstOption)(maskOffset & InstOpt_QuarterMasks);
    useNoMask = (maskOffset & InstOpt_WriteEnable) ? true : false;
    unsigned int size;
    getScratchMsgInfo(*allowed.front(), minOffset, size);
    maxOffset = minOffset + size - 1;

    bool firstSpillFromSend = false;
    G4_Declare* sendDstTopDcl = (*allowed.front())->getSrc(1)->getTopDcl();
    if (sendDstDcl.find(sendDstTopDcl) != sendDstDcl.end())
        firstSpillFromSend = true;

    for (auto instIt : instList)
    {
        auto inst = (*instIt);
        useNoMask &= inst->isWriteEnableInst();

        if (!useNoMask)
            break;
    }

    if (useNoMask)
    {
        // Spill coalescing doesn't work as expected without NoMask
        bool redo;
        do
        {
            redo = false;
            for (auto spillIt = instList.begin();
                spillIt != instList.end();
                spillIt++)
            {
                unsigned int scratchOffset, scratchSize;
                getScratchMsgInfo(*(*spillIt), scratchOffset, scratchSize);

                auto src1 = (*(*spillIt))->getSrc(1);
                if (src1 &&
                    addrTakenSpillFillDcl.find(src1->getTopDcl()) !=
                    addrTakenSpillFillDcl.end())
                {
                    // Address taken dcls should not be coalesced with others.
                    // This is dangerous because nothing ties the indirect opnd
                    // with the fill/spill instructions for it. Only after RA do
                    // we update the offset of the address register holding the
                    // indirect operand, based on the RA assignment to the
                    // spill/fill address taken variable.
                    continue;
                }

                if (// Consecutive scratch offsets
                    scratchOffset == maxOffset + 1 &&
                    // Scratch offset + size is within max payload size
                    (scratchOffset + scratchSize - 1) <= (minOffset + maxPayloadSize - 1) &&
                    // Either both masks are same or both are NoMask
                    (((*(*spillIt))->getMaskOption() == (maskOffset & InstOpt_QuarterMasks)) ||
                    (useNoMask && (*(*spillIt))->isWriteEnableInst())))
                {
                    auto curInstDstTopDcl = (*(*spillIt))->getSrc(1)->getTopDcl();
                    // Check whether the current inst's topdcl was spilled in a send.
                    // If it was and the first instruction in instList wasn't, then
                    // don't consider the current instruction as a coalescing candidate.
                    if (!firstSpillFromSend &&
                        sendDstDcl.find(curInstDstTopDcl) != sendDstDcl.end())
                    {
                        continue;
                    }

                    // This condition allows send coalescing iff
                    // a. either none of the vars is defined in a send, or
                    // b. all vars are defined in the same send
                    if (!firstSpillFromSend ||
                        curInstDstTopDcl == sendDstTopDcl)
                    {
                        if (curInstDstTopDcl == sendDstTopDcl)
                        {
                            // Make sure src1 operands are consecutive
                            auto curSrc1Row = (*(*spillIt))->getSrc(1)->asSrcRegRegion()->getRegOff();
                            bool success = true;
                            for (auto candidate : allowed)
                            {
                                unsigned int candOffset, candSize;
                                getScratchMsgInfo(*candidate, candOffset, candSize);
                                auto prevSrc1Row = (*candidate)->getSrc(1)->asSrcRegRegion()->getRegOff();

                                unsigned int scratchOffDelta = scratchOffset - candOffset;
                                if ((prevSrc1Row + scratchOffDelta) != curSrc1Row)
                                {
                                    // The following is disallowed
                                    // send (8)  V10(1,0) ...          <-- resLen = 4
                                    // sends (8) null r0 V10(1,0) 0x100 <-- extLen = 1
                                    // mov (8) T2 V10(2,0)
                                    // sends (8) null r0 r10(3,0) 0x101 <-- extLen = 1
                                    // mov (8) T4 V10(4,0)
                                    // The two scratch writes cannot be coalesced here
                                    // because their src1 regions aren't consecutive.
                                    success = false;
                                    break;
                                }
                            }
                            if (!success)
                                continue;
                        }

                        allowed.push_back(*spillIt);
                        instList.erase(spillIt);
                        redo = true;
                        maxOffset += scratchSize;
                        break;
                    }
                }
            }
        } while (redo);
    }

    while (allowed.size() > 1)
    {
        unsigned int slots = maxOffset - minOffset + 1;
        if (slots == 2 || slots == 4)
        {
            // Insert coalescable spills in order of appearance
            for (auto origInst : origInstList)
            {
                for (auto allowedSpills : allowed)
                {
                    if (*origInst == *allowedSpills)
                    {
                        coalescable.push_back(origInst);
                        break;
                    }
                }
            }

            MUST_BE_TRUE(coalescable.size() == allowed.size(),
                "Coalesced spills list missing entries");
            break;
        }
        else
        {
            allowed.pop_back();
            unsigned int scratchOffset, scratchSize;
            getScratchMsgInfo(*allowed.back(), scratchOffset, scratchSize);
            maxOffset = scratchOffset + scratchSize - 1;
        }
    }

    instList = origInstList;
    for (auto coalIt = coalescable.begin(),
        instIt = instList.begin();
        coalIt != coalescable.end();
        coalIt++)
    {
        if (*instIt == *coalIt)
            instIt = instList.erase(instIt);
        else
        {
            while (*instIt != *coalIt)
            {
                instIt++;
            }
            instIt = instList.erase(instIt);
        }
    }
}
INST_LIST_ITER CoalesceSpillFills::analyzeSpillCoalescing(std::list<INST_LIST_ITER>& instList,
    INST_LIST_ITER start, INST_LIST_ITER end, G4_BB* bb)
{
    // Check and perform coalescing, if possible, amongst spills in instList.
    // The returned inst iter points to last inst+1 in instList if all spills
    // were coalesced. Otherwise, it points to the first spill that wasn't coalesced.
    // Spill coalescing is possible only when all slots in the coalesced range
    // have a write.
    INST_LIST_ITER last = end;
    last++;
#if 0
    unsigned int startCISAOff = (*instList.front())->getCISAOff();
#endif
    if (instList.size() < 2)
    {
        return last;
    }

    std::list<INST_LIST_ITER> coalesceableSpills;
    auto origInstList = instList;
    unsigned int min, max;
    G4_InstOption mask;
    bool useNoMask;
    keepConsecutiveSpills(instList, coalesceableSpills, cMaxSpillPayloadSize, min, max, useNoMask, mask);

#if 0
    printf("Start -- \n");
    if (coalesceableSpills.size() > 0)
    {
        printf("Will coalesce following spill (offset, size) pairs:\n");
        for (auto k : coalesceableSpills)
        {
            printf("(%d, %d) @ $%d,\t", (*k)->getMsgDesc()->getScratchRWOffset(), (*k)->getMsgDesc()->getScratchRWSize(), (*k)->getCISAOff());
        }
        printf("\n\n");
    }

    if (instList.size() > 0)
    {
        printf("Will NOT coalesce following spill (offset, size) pairs:\n");
        for (auto k : instList)
        {
            printf("(%d, %d) @ $%d,\t", (*k)->getMsgDesc()->getScratchRWOffset(), (*k)->getMsgDesc()->getScratchRWSize(), (*k)->getCISAOff());
        }
        printf("\n\n");
    }

    printf("End --\n");
#endif

    if (coalesceableSpills.size() > 1)
    {
        coalesceSpills(coalesceableSpills, min, max, useNoMask, mask, bb, (*coalesceableSpills.front())->getCISAOff());
    }
    else
    {
        // When coalescing is not done, we want to
        // move to the second instruction in instList in
        // the next loop iteration.
        instList.pop_front();
    }

    if (instList.size() == 0)
    {
        return last;
    }
    else
    {
        return instList.front();
    }
}
INST_LIST_ITER CoalesceSpillFills::analyzeFillCoalescing(std::list<INST_LIST_ITER>& instList,
    INST_LIST_ITER start, INST_LIST_ITER end, G4_BB* bb)
{
    // Check and perform coalescing, if possible, amongst fills in instList.
    // The returned inst iter points to last inst+1 in instList if all fills
    // were coalesced. Otherwise, it points to the first fill that wasn't coalesced.
    INST_LIST_ITER last = end;
    last++;
#if 0
    G4_INST* lastInst = nullptr;
    if (last != bb->instList.end())
        lastInst = (*last);
#endif
    if (instList.size() < 2)
    {
        return last;
    }

    std::list<INST_LIST_ITER> coalesceableFills;
    auto origInstList = instList;
    unsigned int min, max;
    sendsInRange(instList, coalesceableFills, cMaxFillPayloadSize, min, max);

    bool heuristic = fillHeuristic(coalesceableFills, instList, origInstList, min, max);
    if (!heuristic)
    {
        coalesceableFills.clear();
        instList = origInstList;
        instList.pop_front();
#if 0
        printf("Fill heuristic didn't agree to coalescing\n");
#endif
    }

#if 0
    printf("Start -- \n");
    if (coalesceableFills.size() > 0)
    {
        printf("Will coalesce following fill (offset, size) pairs:\n");
        for (auto k : coalesceableFills)
        {
            printf("(%d, %d) @ $%d,\t", (*k)->getMsgDesc()->getScratchRWOffset(), (*k)->getMsgDesc()->getScratchRWSize(), (*k)->getCISAOff());
        }
        printf("\n\n");
    }

    if (instList.size() > 0)
    {
        printf("Will NOT coalesce following fill (offset, size) pairs:\n");
        for (auto k : instList)
        {
            printf("(%d, %d) @ $%d,\t", (*k)->getMsgDesc()->getScratchRWOffset(), (*k)->getMsgDesc()->getScratchRWSize(), (*k)->getCISAOff());
        }
        printf("\n\n");
    }

    printf("End --\n");
#endif

    if (coalesceableFills.size() > 1)
    {
        coalesceFills(coalesceableFills, min, max, bb, (*coalesceableFills.front())->getCISAOff());
    }

    if (instList.size() == 0)
    {
        return last;
    }
    else
    {
        return instList.front();
    }
}
bool CoalesceSpillFills::overlap(G4_INST* inst1, G4_INST* inst2, bool& isFullOverlap)
{
    unsigned int scratchOffset1, scratchSize1, scratchOffset2, scratchSize2;
    unsigned int scratchEnd1, scratchEnd2;
    getScratchMsgInfo(inst1, scratchOffset1, scratchSize1);
    getScratchMsgInfo(inst2, scratchOffset2, scratchSize2);

    // isFullOverlap is true only if inst1 fully covers inst2
    isFullOverlap = false;

    scratchEnd1 = scratchOffset1 + scratchSize1 - 1;
    scratchEnd2 = scratchOffset2 + scratchSize2 - 1;

    if (scratchOffset1 <= scratchOffset2)
    {
        // inst1 |---------|  or  |----------|
        // inst2    |------|          |---|
        if (scratchEnd1 >= scratchOffset2)
        {
            if (scratchOffset1 <= scratchOffset2 &&
                (scratchOffset1 + scratchSize1) >= (scratchOffset2 + scratchSize2))
            {
                isFullOverlap = true;
            }

            return true;
        }
    }
    else
    {
        // inst1    |------|  or     |-----|
        // inst2 |-----|          |-----------|
        if (scratchEnd2 >= scratchOffset1)
        {
            if (scratchOffset1 <= scratchOffset2 &&
                (scratchOffset1 + scratchSize1) >= (scratchOffset2 + scratchSize2))
            {
                isFullOverlap = true;
            }

            return true;
        }
    }

    return false;
}

bool CoalesceSpillFills::overlap(G4_INST* inst, std::list<INST_LIST_ITER>& allInsts)
{
    for (auto sp : allInsts)
    {
        bool t;
        auto spillInst = (*sp);
        if (overlap(inst, spillInst, t))
            return true;
    }

    return false;
}
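// Example: a 4-GRF write covering scratch rows [2, 5] fully overlaps a
// 2-GRF read of rows [3, 4] (overlap() returns true with isFullOverlap set),
// while a write covering [4, 7] only partially overlaps that read
// (overlap() returns true, isFullOverlap stays false).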
void CoalesceSpillFills::removeWARFills(std::list<INST_LIST_ITER>& fills, std::list<INST_LIST_ITER>& spills)
{
    for (auto flIt = fills.begin();
        flIt != fills.end();
        )
    {
        if (overlap((*(*flIt)), spills))
        {
            flIt = fills.erase(flIt);
            continue;
        }
        flIt++;
    }
}
void CoalesceSpillFills::replaceCoalescedOperands(G4_INST* inst)
{
    auto dst = inst->getDst();
    if (dst &&
        dst->getTopDcl())
    {
        auto dcl = dst->getTopDcl();
        auto it = replaceMap.find(dcl);

        if (it != replaceMap.end())
        {
            auto dstRgn = dst->asDstRegRegion();
            auto newDstRgn = kernel.fg.builder->createDstRegRegion(Direct, it->second.first->getRegVar(),
                it->second.second + dstRgn->getRegOff(), dstRgn->getSubRegOff(), dstRgn->getHorzStride(), dstRgn->getType());

            inst->setDest(newDstRgn);
        }
    }

    for (unsigned int i = 0; i < G4_MAX_SRCS; i++)
    {
        auto opnd = inst->getSrc(i);

        if (opnd &&
            opnd->getTopDcl())
        {
            auto dcl = opnd->getTopDcl();
            auto it = replaceMap.find(dcl);

            if (it == replaceMap.end())
                continue;

            if (opnd->isSrcRegRegion())
            {
                auto srcRgn = opnd->asSrcRegRegion();
                auto oldRgnDesc = srcRgn->getRegion();

                auto newSrcRgn = kernel.fg.builder->createSrcRegRegion(srcRgn->getModifier(), Direct,
                    it->second.first->getRegVar(), it->second.second + srcRgn->getRegOff(),
                    srcRgn->getSubRegOff(), oldRgnDesc,
                    opnd->getType());

                inst->setSrc(newSrcRgn, i);
            }
        }
    }
}
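// Rewrite example (hypothetical names): if replaceMap maps FILL_V10 to
// (COAL_FILL_373, rowOff = 2), an operand FILL_V10(1,0) becomes
// COAL_FILL_373(3,0); only the register row shifts, while the subregister
// offset, region/stride, and type are preserved.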
void CoalesceSpillFills::insertKill(G4_BB* bb, INST_LIST_ITER instIt, std::set<G4_Declare*>& coalescedRangeKills)
{
    // Check whether the current instruction is the first reference of a coalesced
    // spill range. If it is, then insert a pseudo_kill for the coalesced var
    // before instIt.
    auto inst = (*instIt);
    auto dst = inst->getDst();
    if (dst)
    {
        auto topdcl = dst->getTopDcl();
        if (topdcl)
        {
            auto entry = replaceMap.find(topdcl);
            if (entry != replaceMap.end() &&
                coalescedRangeKills.find(entry->second.first) == coalescedRangeKills.end())
            {
                // This means topdcl is the first reference of the to-be-replaced
                // dcl. Insert a pseudo kill for the coalesced dcl here.
                auto killDst = kernel.fg.builder->createDstRegRegion(Direct,
                    entry->second.first->getRegVar(), 0, 0, 1, Type_UD);
                auto kill = kernel.fg.builder->createInternalInst(nullptr,
                    G4_pseudo_kill, nullptr, false, 1, killDst, nullptr, nullptr, 0);
                bb->instList.insert(instIt, kill);
                coalescedRangeKills.insert(entry->second.first);
            }
        }
    }
}
bool CoalesceSpillFills::allSpillsSameVar(std::list<INST_LIST_ITER>& spills)
{
    // Return true if all vars in the spills list have the same dcl
    G4_Declare* dcl = nullptr;
    for (auto s : spills)
    {
        auto topdcl = (*s)->getSrc(1)->getTopDcl();

        if (!dcl)
            dcl = topdcl;

        if (topdcl != dcl)
        {
            return false;
        }
    }

    // Allow only if all dcls are defined by the same send
    if (sendDstDcl.find(dcl) != sendDstDcl.end())
        return true;

    return false;
}
void CoalesceSpillFills::fills()
{
    // Iterate over all BBs, find fills that are close by and coalesce
    // a bunch of them. Insert movs as required.
    for (auto bb : kernel.fg.BBs)
    {
        auto endIter = bb->instList.end();
        std::list<INST_LIST_ITER> fillsToCoalesce;
        std::list<INST_LIST_ITER> spills;
        INST_LIST_ITER startIter = bb->instList.begin();
        unsigned int w = 0;
        for (auto instIter = startIter;
            instIter != endIter;)
        {
            auto inst = (*instIter);

            if (inst->isPseudoKill() ||
                inst->isLabel())
            {
                instIter++;
                continue;
            }

            if (inst->isSend())
            {
                if (inst->getMsgDesc()->isScratchWrite())
                {
                    spills.push_back(instIter);
                }
                else if (inst->getMsgDesc()->isScratchRead())
                {
                    // Check if coalescing is possible
                    if (fillsToCoalesce.size() == 0)
                    {
                        w = 0;
                        startIter = instIter;
                        spills.clear();
                    }

                    if (!overlap(*instIter, spills))
                    {
                        fillsToCoalesce.push_back(instIter);
                    }
                }
            }

            if (fillsToCoalesce.size() > 0 &&
                rpe.getRegisterPressure(inst) > 180)
            {
                // High register pressure region so reduce the window size to 3
                w = (cWindowSize - w > 3) ? cWindowSize - 3 : w;
            }

            if (w == cWindowSize || inst == bb->instList.back())
            {
                if (fillsToCoalesce.size() > 1)
                {
                    instIter = analyzeFillCoalescing(fillsToCoalesce, startIter, instIter, bb);
                }
                else if (w == cWindowSize)
                {
                    startIter = instIter;
                }
                else if (inst == bb->instList.back())
                {
                    break;
                }

                w = 0;
                fillsToCoalesce.clear();
                spills.clear();

                continue;
            }

            if (fillsToCoalesce.size() > 0)
            {
                w++;
            }

            instIter++;
        }

        // One pass to replace old fills with the coalesced dcl
        for (auto instIt = bb->instList.begin();
            instIt != bb->instList.end();
            )
        {
            auto inst = (*instIt);

            if (inst->isPseudoKill() &&
                replaceMap.find(inst->getDst()->getTopDcl()) != replaceMap.end())
            {
                instIt = bb->instList.erase(instIt);
                continue;
            }

            replaceCoalescedOperands(inst);
            instIt++;
        }
    }
}
void CoalesceSpillFills::populateSendDstDcl()
{
    // Find and store all G4_Declares that are dests in sends
    // and are spilled. This is required when coalescing
    // scratch writes for such spills. We cannot mix coalescing
    // for G4_Declares defined by one send with those defined by other
    // instructions. Otherwise register pressure increases significantly.
    for (auto bb : kernel.fg.BBs)
    {
        for (auto inst : bb->instList)
        {
            if (inst->isSend() &&
                inst->getDst())
            {
                if (!inst->getDst()->isNullReg())
                {
                    if (!inst->getMsgDesc()->isScratchRW())
                    {
                        auto topdcl = inst->getDst()->getTopDcl();

                        sendDstDcl.insert(topdcl);
                    }
                }
                else if (inst->getMsgDesc()->isScratchWrite() &&
                    inst->getSrc(1)->getBase()->asRegVar()->isRegVarCoalesced())
                {
                    auto topdcl = inst->getSrc(1)->getTopDcl();

                    sendDstDcl.insert(topdcl);
                }
            }
        }
    }
}
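// Note on the full-overlap case in spills() below: when a later scratch
// write fully covers an earlier one within the window, the earlier write is
// dead and is deleted outright; a partial overlap only evicts the earlier
// write from the coalescing candidate list. A scratch read overlapping a
// pending candidate instead forces early coalescing of the window.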
void CoalesceSpillFills::spills()
{
    populateSendDstDcl();

    // Iterate over all BBs, find spills that are close by and coalesce
    // a bunch of them. Insert movs as required.
    for (auto bb : kernel.fg.BBs)
    {
        auto endIter = bb->instList.end();
        std::list<INST_LIST_ITER> spillsToCoalesce;
        INST_LIST_ITER startIter = bb->instList.begin();
        unsigned int w = 0;
        for (auto instIter = startIter;
            instIter != endIter;)
        {
            auto inst = (*instIter);

            if (inst->isPseudoKill() ||
                inst->isLabel())
            {
                instIter++;
                continue;
            }

            bool earlyCoalesce = false;
            if (inst->isSend())
            {
                if (inst->getMsgDesc()->isScratchWrite())
                {
                    // Check if coalescing is possible
                    if (spillsToCoalesce.size() == 0)
                    {
                        w = 0;
                        startIter = instIter;
                        spillsToCoalesce.clear();
                    }

                    for (auto coalIt = spillsToCoalesce.begin();
                        coalIt != spillsToCoalesce.end();
                        )
                    {
                        bool fullOverlap = false;
                        if (overlap(*instIter, *(*coalIt), fullOverlap))
                        {
                            if (fullOverlap)
                            {
#if 0
                                printf("Deleting spill at $%d due to %d\n", (*(*coalIt))->getCISAOff(), (*instIter)->getCISAOff());
#endif
                                // Delete the earlier spill since it is made
                                // redundant by the current spill.
                                bb->instList.erase(*coalIt);
                            }

                            coalIt = spillsToCoalesce.erase(coalIt);
                            continue;
                        }
                        coalIt++;
                    }
                    spillsToCoalesce.push_back(instIter);
                }
                else if (inst->getMsgDesc()->isScratchRead())
                {
                    for (auto coalIt = spillsToCoalesce.begin();
                        coalIt != spillsToCoalesce.end();
                        )
                    {
                        bool temp = false;
                        if (overlap(*instIter, *(*coalIt), temp))
                        {
#if 1
                            // Instead of deleting scratch writes, try coalescing
                            // at this point. This way maybe the fill can also
                            // be cleaned up in a later phase.
                            earlyCoalesce = true;
                            break;
#else
                            coalIt = spillsToCoalesce.erase(coalIt);
                            continue;
#endif
                        }
                        coalIt++;
                    }
                }
            }

            if (spillsToCoalesce.size() > 0 &&
                rpe.getRegisterPressure(inst) > 120)
            {
                if (!allSpillsSameVar(spillsToCoalesce))
                {
                    // High register pressure region so reduce the window size to 3
                    w = (cWindowSize - w > 3) ? cWindowSize - 3 : w;
                }
                else
                {
#if 0
                    printf("Found register pressure = %d at %d. Still coalescing spills because all spills are from same var.\n",
                        rpe.getRegisterPressure(inst), inst->getCISAOff());
#endif
                }
            }

            if (w == cWindowSize || inst == bb->instList.back() ||
                earlyCoalesce)
            {
                if (spillsToCoalesce.size() > 1)
                {
                    instIter = analyzeSpillCoalescing(spillsToCoalesce, startIter, instIter, bb);
                }
                else if (w == cWindowSize)
                {
                    startIter = instIter;
                }
                else if (inst == bb->instList.back())
                {
                    break;
                }

                w = 0;
                spillsToCoalesce.clear();
                continue;
            }

            if (spillsToCoalesce.size() > 0)
            {
                w++;
            }

            instIter++;
        }

        std::set<G4_Declare*> coalescedRangeKills;
        // One pass to replace old spills with the coalesced dcl
        for (auto instIt = bb->instList.begin();
            instIt != bb->instList.end();
            )
        {
            auto inst = (*instIt);

            if (inst->isPseudoKill() &&
                replaceMap.find(inst->getDst()->getTopDcl()) != replaceMap.end())
            {
                instIt = bb->instList.erase(instIt);
                continue;
            }

            insertKill(bb, instIt, coalescedRangeKills);

            replaceCoalescedOperands(inst);
            instIt++;
        }
    }
}
void CoalesceSpillFills::fixSendsSrcOverlap()
{
    // Overlap between sends src operands is not allowed.
    //
    // Fix for the following code pattern after spill/fill coalescing:
    // send (16) COAL_FILL_373(0,0)<1>:ud r0 0xa 0x24c2001:ud{Align1, NoMask} // #??:$365:%657:&-1 // scratch read, resLen=4, msgLen=1
    // sends(1) null:ud COAL_FILL_373(0, 0) COAL_FILL_373(1, 0) 0x4c : ud 0x40680ff : ud{ Align1, Q1, NoMask } // #??:$365:%365:&-1 // a64 scattered write, resLen = 0, msgLen = 2, extMsgLen = 1
    //
    // for CISA:
    // svm_scatter.1.1 (M1_NM, 1) V441.0 V449.0 /// $365
    //
    // where V441 and V449 are both scalars of type :uq and :ud respectively
    //
    for (auto bb : kernel.fg.BBs)
    {
        for (auto instIt = bb->instList.begin();
            instIt != bb->instList.end();
            instIt++)
        {
            auto inst = (*instIt);

            if (!inst->isSplitSend())
            {
                continue;
            }

            auto src0 = inst->getSrc(0);
            auto src1 = inst->getSrc(1);

            if (src0->getTopDcl() == src1->getTopDcl())
            {
                auto lb0 = src0->getLeftBound();
                auto rb0 = src0->getRightBound();
                auto lb1 = src1->getLeftBound();
                auto rb1 = src1->getRightBound();

                if ((lb0 < lb1 && rb0 > lb1) ||
                    (lb1 < lb0 && rb1 > lb0))
                {
                    // Ideally we should create a copy of the
                    // operand with the smaller number of GRFs,
                    // but this is a real corner case
                    // and probably shows up only for
                    // force spills. So we simply choose
                    // src1 of sends.
                    char* dclName = kernel.fg.builder->getNameString(kernel.fg.mem, 32,
                        "COPY_%d", kernel.Declares.size());
                    G4_Declare* copyDcl = kernel.fg.builder->createDeclareNoLookup(dclName, G4_GRF,
                        8, src1->getTopDcl()->getNumRows(),
                        Type_UD);

                    unsigned int elems = copyDcl->getNumElems();
                    short row = 0;
                    while (elems > 0)
                    {
                        G4_SrcRegRegion* srcRgn = kernel.fg.builder->createSrcRegRegion(
                            Mod_src_undef, Direct, src1->getTopDcl()->getRegVar(), row, 0,
                            kernel.fg.builder->getRegionStride1(), Type_UD);
                        G4_DstRegRegion* dstRgn = kernel.fg.builder->createDstRegRegion(
                            Direct, copyDcl->getRegVar(), row, 0, 1, Type_UD);
                        G4_INST* copyInst = kernel.fg.builder->createInternalInst(nullptr,
                            G4_mov, nullptr, false, 8, dstRgn, srcRgn, nullptr, InstOpt_WriteEnable);
                        copyInst->setCISAOff(inst->getCISAOff());
                        bb->instList.insert(instIt, copyInst);
                        elems -= 8;
                        row++;
                    }

                    G4_SrcRegRegion* sendSrc1 = kernel.fg.builder->createSrcRegRegion(Mod_src_undef,
                        Direct, copyDcl->getRegVar(), 0, 0, kernel.fg.builder->getRegionStride1(),
                        Type_UD);
                    inst->setSrc(sendSrc1, 1);
                }
            }
        }
    }
}
void CoalesceSpillFills::removeRedundantSplitMovs()
{
    // send (8) V_SAMPLER       -- resLen = 3
    // COAL_0(0,0) = V_SAMPLE(0,0)
    // COAL_0(1,0) = V_SAMPLE(1,0)
    // send (8) <null> COAL_0(0,0) <-- len = 2
    // TV0(0,0) = V_SAMPLE(2,0)
    // ===>
    // send (8) V_SAMPLER       -- resLen = 3
    // send (8) <null> V_SAMPLE(0,0) <-- len = 2
    // TV0(0,0) = V_SAMPLE(2,0)

    // Look for scratch writes. Src1 is the data to write to memory.
    // Iterate in bottom-up order to check whether raw movs exist
    // that define src1 of the scratch write and whether their
    // source operands are consecutive.

    // Store numUses for the dcls replaced and the location of their defs.
    // This structure is used to eliminate redundant movs
    // later.
    typedef std::pair<G4_BB*, INST_LIST_ITER> MovLoc;
    typedef unsigned int NumRefs;
    std::map<G4_Declare*, std::pair<NumRefs, std::list<MovLoc>>> movs;

    for (auto bb : kernel.fg.BBs)
    {
        // Store all dcls defined by non-scratch sends
        // as only they are candidates for this pass.
        // Without this, we might end up identifying
        // other raw movs coming from a partial write like:
        // add (8) r8.0<1>:q r20.0<4;4,1>:q r4.0<0;1,0>:ud {Align1, Q1}
        // send(16) r27.0<1>:uw r26 0xa 0x22c1000 : ud{ Align1, NoMask } // scratch read, fill, offset = 0, resLen=2, msgLen=1
        // mov(8) r27.0<1> : q r8.0<4; 4, 1> : q{ Align1, Q1 }
        // sends(16) null : uw r26 r27 0x8a : ud 0x20f1000 : ud{ Align1, NoMask } // scratch write, spill, offset = 0, resLen=0, msgLen=1, extMsgLen=2
        //
        // Although there is a raw mov before the scratch write,
        // it has to be preserved for correctness.
        std::set<G4_Declare*> sendDst;
        for (auto inst : bb->instList)
        {
            if (inst->isSend() &&
                inst->getDst() &&
                !inst->getDst()->isNullReg() &&
                !inst->getMsgDesc()->isScratchRead() &&
                !inst->getMsgDesc()->isScratchWrite())
            {
                sendDst.insert(inst->getDst()->getTopDcl());
            }
        }

        for (auto instIt = bb->instList.begin(), endIt = bb->instList.end();
            instIt != endIt;
            instIt++)
        {
            auto inst = (*instIt);

            if (inst->isSplitSend() &&
                inst->getMsgDesc()->isScratchWrite())
            {
                // Spill sends
                auto src1Dcl = inst->getSrc(1)->getTopDcl();
                unsigned int lb = inst->getSrc(1)->getLeftBound();
                unsigned int rb = inst->getSrc(1)->getRightBound();
                std::set<unsigned int> rows;
                for (unsigned int k = lb / G4_GRF_REG_NBYTES; k != (rb + G4_GRF_REG_NBYTES - 1) / G4_GRF_REG_NBYTES; k++)
                {
                    rows.insert(k);
                }
                auto tmpIt = instIt;
                tmpIt--;
                G4_Declare* srcDcl = nullptr;
                std::map<unsigned int, unsigned int> dstSrcRowMapping;
                std::list<MovLoc> copies;
                while (tmpIt != bb->instList.begin())
                {
                    auto pInst = (*tmpIt);

                    // Each copy should be a raw mov
                    if (!pInst->isRawMov())
                        break;

                    // Ensure src0 topdcl comes from a send dst in this BB
                    if (sendDst.find(pInst->getSrc(0)->getTopDcl()) ==
                        sendDst.end())
                        break;

                    // Check whether dcls match
                    auto pDstDcl = pInst->getDst()->getTopDcl();
                    if (pDstDcl != src1Dcl)
                        break;

                    unsigned int plb = pInst->getDst()->getLeftBound();
                    unsigned int prb = pInst->getDst()->getRightBound();

                    // Check whether complete row(s) are defined
                    if ((prb - plb + 1) % G4_GRF_REG_NBYTES != 0)
                        break;

                    unsigned int rowStart = plb / G4_GRF_REG_NBYTES;
                    unsigned int numRows = (prb - plb + 1) / G4_GRF_REG_NBYTES;
                    bool punt = false;
                    for (unsigned int k = rowStart; k != (rowStart + numRows); k++)
                    {
                        if (rows.find(k) == rows.end())
                        {
                            punt = true;
                            break;
                        }
                        dstSrcRowMapping.insert(std::make_pair(k, INT_MAX));
                    }

                    if (punt)
                        break;

                    auto pSrc0 = pInst->getSrc(0);
                    if (!pSrc0->isSrcRegRegion())
                        break;

                    auto pSrcDcl = pSrc0->getTopDcl();
                    if (!srcDcl)
                        srcDcl = pSrcDcl;
                    else if (srcDcl != pSrcDcl)
                        break;

                    // mov src should be GRF aligned
                    if (pSrc0->getLeftBound() % G4_GRF_REG_NBYTES != 0)
                        break;

                    unsigned int src0lb = pSrc0->getLeftBound();
                    unsigned int src0rb = pSrc0->getRightBound();

                    // (rb - lb) should match dst (rb - lb)
                    if ((src0rb - src0lb) != (prb - plb))
                        break;

                    unsigned int pStartRow = pSrc0->getLeftBound() / G4_GRF_REG_NBYTES;
                    for (unsigned int k = rowStart; k != (rowStart + numRows); k++)
                    {
                        auto it = dstSrcRowMapping.find(k);
                        if (it == dstSrcRowMapping.end())
                        {
                            punt = true;
                            break;
                        }

                        it->second = pStartRow + (k - rowStart);
                    }

                    if (punt)
                        break;

                    copies.push_back(std::make_pair(bb, tmpIt));
                    tmpIt--;
                }

                if (dstSrcRowMapping.size() > 0)
                {
                    // Now check whether each entry of src1 has a corresponding src offset
                    unsigned int dstRowStart = lb / G4_GRF_REG_NBYTES;
                    bool success = true;
                    auto baseIt = dstSrcRowMapping.find(dstRowStart);
                    if (baseIt != dstSrcRowMapping.end())
                    {
                        auto base = dstSrcRowMapping.find(dstRowStart)->second;
                        for (auto m : dstSrcRowMapping)
                        {
                            unsigned int curRow = m.first - dstRowStart;
                            if (m.second == INT_MAX)
                            {
                                success = false;
                                break;
                            }

                            if (m.second != (base + curRow))
                            {
                                success = false;
                                break;
                            }
                        }

                        if (success && srcDcl)
                        {
                            // Replace src1 of the send with srcDcl
                            G4_SrcRegRegion* sendSrc1 = kernel.fg.builder->createSrcRegRegion(Mod_src_undef, Direct, srcDcl->getRegVar(),
                                (short)base, 0, kernel.fg.builder->getRegionStride1(), inst->getSrc(1)->getType());
                            inst->setSrc(sendSrc1, 1);

                            for (auto c : copies)
                            {
                                auto defDcl = (*c.second)->getDst()->getTopDcl();
                                auto it = movs.find(defDcl);
                                if (it == movs.end())
                                {
                                    std::list<MovLoc> t;
                                    t.push_back(c);
                                    movs.insert(std::make_pair(defDcl, std::make_pair(0, t)));
                                }
                                else
                                {
                                    it->second.second.push_back(c);
                                }
                            }
                        }
                    }
                }
            }
        }
    }

    // Update the number of uses of each dcl
    for (auto bb : kernel.fg.BBs)
    {
        for (auto instIt = bb->instList.begin(), endIt = bb->instList.end();
            instIt != endIt; instIt++)
        {
            auto inst = (*instIt);

            if (inst->isPseudoKill())
            {
                auto dcl = inst->getDst()->getTopDcl();
                auto it = movs.find(dcl);
                if (it != movs.end())
                {
                    it->second.second.push_back(std::make_pair(bb, instIt));
                }
            }

            for (unsigned int i = 0; i < G4_MAX_SRCS; i++)
            {
                G4_Operand* opnd = inst->getSrc(i);

                if (opnd &&
                    opnd->getTopDcl())
                {
                    auto it = movs.find(opnd->getTopDcl());
                    if (it != movs.end())
                        it->second.first++;
                }
            }
        }
    }

    for (auto mov : movs)
    {
        auto dcl = mov.first;
        auto numRefs = mov.second.first;
        auto& allMovs = mov.second.second;

        if (numRefs == 0 && !dcl->getAddressed())
        {
#if 0
            printf("Removing movs/pseudoKill for dcl %s\n", dcl->getName());
#endif
            for (auto m : allMovs)
            {
                auto bb = m.first;
                auto iter = m.second;
#if 0
                printf("\tFound %s occurrence at $%d\n", (*iter)->opcode() == G4_mov ? "mov" : "pseudokill", (*iter)->getCISAOff());
#endif
                bb->instList.erase(iter);
            }
        }
    }
}
void CoalesceSpillFills::spillFillCleanup()
{
    // Eliminate redundant fills when a write
    // is close by:
    //
    // spill TV1 at offset = 1
    // ..
    // ..
    // ..
    // fill FP1 from offset = 1
    // = FP1
    // ===>
    // Remove the fill and replace each occurrence of FP1 with TV1
    //

    for (auto bb : kernel.fg.BBs)
    {
        auto startIt = bb->instList.begin();
        auto endIt = bb->instList.end();
        for (auto instIt = startIt;
            instIt != endIt;
            instIt++)
        {
            auto inst = (*instIt);

            std::map<unsigned int, G4_INST*> writesPerOffset;
            std::set<G4_Declare*> defs;
            if (inst->isSend() &&
                inst->getMsgDesc()->isScratchRead())
            {
                // Store (offset, spill inst) pairs
                unsigned int rowStart, numRows;
                getScratchMsgInfo(inst, rowStart, numRows);
                unsigned int lastRow = rowStart + numRows - 1;

                // Scan a window of instructions above the current inst
                // to check whether all rows read by the current inst
                // have been written.
                auto pInstIt = instIt;
                pInstIt--;
                unsigned int w = cSpillFillCleanupWindowSize;
                while (pInstIt != startIt &&
                    w > 0)
                {
                    auto pInst = (*pInstIt);

                    if (pInst->isSplitSend() &&
                        pInst->getMsgDesc()->isScratchWrite())
                    {
                        unsigned int pRowStart, pNumRows;
                        getScratchMsgInfo(pInst, pRowStart, pNumRows);

                        // If any def of the src1 dcl is found then don't
                        // consider this write for the optimization. Its
                        // value in memory could be different than the
                        // one held in the variable.
                        auto pSrc1Dcl = pInst->getSrc(1)->getTopDcl();
                        if (defs.find(pSrc1Dcl) != defs.end())
                        {
                            pInstIt--;
                            continue;
                        }

                        for (unsigned int pRow = pRowStart;
                            pRow != (pRowStart + pNumRows);
                            pRow++)
                        {
                            auto writeIt = writesPerOffset.find(pRow);

                            // Check whether a more recent write was found for this row
                            if (writeIt != writesPerOffset.end())
                                continue;

                            writesPerOffset.insert(std::make_pair(pRow, pInst));
                        }
                    }

                    if (pInst->getDst() &&
                        pInst->getDst()->getTopDcl())
                    {
                        // Store any defs seen to handle WAR
                        defs.insert(pInst->getDst()->getTopDcl());
                    }

                    w--;
                    pInstIt--;
                }

                // Check whether writes for all rows were found
                bool found = true;
                for (auto row = rowStart; row <= lastRow; row++)
                {
                    if (writesPerOffset.find(row) == writesPerOffset.end())
                    {
                        found = false;
                        break;
                    }
                }

                if (!found)
                {
                    continue;
                }

                // Writes for all rows found
                unsigned int execSize;
                execSize = kernel.getSimdSize() > 16 ? 16 : kernel.getSimdSize();

                for (auto row = rowStart; row <= lastRow;)
                {
                    if (execSize == 16 &&
                        row == lastRow)
                    {
                        // In case of odd rows in SIMD16
                        execSize = 8;
                    }
                    else if (execSize == 16)
                    {
                        // In a SIMD16 kernel 2 consecutive rows should come from the same spill
                        if (writesPerOffset.find(row)->second != writesPerOffset.find(row + 1)->second)
                        {
                            execSize = 8;
                        }
                    }

                    // Insert a SIMD8 mov per row
                    G4_DstRegRegion* nDst = kernel.fg.builder->createDstRegRegion(Direct,
                        inst->getDst()->getBase(), row + inst->getDst()->asDstRegRegion()->getRegOff() - rowStart,
                        0, 1, Type_UD);

                    auto write = writesPerOffset.find(row)->second;
                    G4_SrcRegRegion* src1Write = write->getSrc(1)->asSrcRegRegion();
                    unsigned int writeRowStart = write->getMsgDesc()->getScratchRWOffset();
                    unsigned int diff = row - writeRowStart;
                    G4_SrcRegRegion* nSrc = kernel.fg.builder->createSrcRegRegion(Mod_src_undef, Direct,
                        src1Write->getBase(), diff + src1Write->getRegOff(), 0,
                        kernel.fg.builder->getRegionStride1(), Type_UD);

                    G4_INST* mov = kernel.fg.builder->createInternalInst(nullptr, G4_mov, nullptr, false, (unsigned char)execSize,
                        nDst, nSrc, nullptr, InstOpt_WriteEnable);
                    bb->instList.insert(instIt, mov);
                    mov->setCISAOff(inst->getCISAOff());

                    row += execSize / 8;
                }

                auto tempIt = instIt;
                tempIt--;
                bb->instList.erase(instIt);
                instIt = tempIt;
            }
        }
    }
}
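// Example for removeRedundantWrites() below (hypothetical offsets): a
// scratch write to offset 4 followed later by another write to offset 4
// with no intervening read makes the earlier write dead (case 1); a write
// to a slot that is never read anywhere in the program is dead as well
// (case 2). Both are deleted without replacement.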
void CoalesceSpillFills::removeRedundantWrites()
{
    typedef std::list<std::pair<G4_BB*, INST_LIST_ITER>> SPILLS;
    typedef std::list<std::pair<G4_BB*, INST_LIST_ITER>> FILLS;
    std::map<unsigned int, std::pair<SPILLS, FILLS>> scratchOffsetAccess;
    // Traverse bottom-up to detect and remove redundant writes.
    // Redundant writes include:
    // 1. Successive writes to the same offset without a fill in between,
    // 2. Writes in the program without any fill from that slot throughout
    for (auto bb : kernel.fg.BBs)
    {
        auto endIt = bb->instList.end();
        endIt--;
        // Store spill slots that are written to, along with the emask used
        std::map<unsigned int, unsigned int> scratchOffToMask;
        for (auto instIt = endIt;
            instIt != bb->instList.begin();
            instIt--)
        {
            auto inst = (*instIt);

            if (inst->isSend())
            {
                unsigned int offset = 0, size = 0;
                if (inst->getMsgDesc()->isScratchRead())
                {
                    getScratchMsgInfo(inst, offset, size);
                    for (unsigned int k = offset; k != (offset + size); k++)
                    {
                        auto it = scratchOffToMask.find(k);
                        if (it != scratchOffToMask.end())
                        {
                            scratchOffToMask.erase(it);
                        }
                    }
                }
                else if (inst->getMsgDesc()->isScratchWrite())
                {
                    getScratchMsgInfo(inst, offset, size);
                    bool allRowsFound = true;
                    unsigned int emask = inst->getMaskOption();
                    for (unsigned int k = offset; k != (offset + size); k++)
                    {
                        auto it = scratchOffToMask.find(k);
                        if (it != scratchOffToMask.end())
                        {
                            if (emask != it->second &&
                                (it->second & InstOpt_WriteEnable) == 0)
                            {
                                allRowsFound = false;
                                break;
                            }
                        }
                        else
                        {
                            allRowsFound = false;
                            break;
                        }
                    }

                    if (allRowsFound)
                    {
#if 0
                        printf("Removing redundant successive write at $%d\n", inst->getCISAOff());
#endif
                        instIt = bb->instList.erase(instIt);
                    }
                    else
                    {
                        unsigned int emask = inst->getOption();
                        for (unsigned int k = offset; k != (offset + size); k++)
                        {
                            scratchOffToMask.insert(std::make_pair(k, emask));
                        }
                    }
                }
            }
        }
    }

    for (auto bb : kernel.fg.BBs)
    {
        auto endIt = bb->instList.end();
        for (auto instIt = bb->instList.begin();
            instIt != endIt;
            instIt++)
        {
            auto inst = (*instIt);

            if (!inst->isSend())
            {
                continue;
            }

            if (inst->getMsgDesc()->isScratchRead() ||
                inst->getMsgDesc()->isScratchWrite())
            {
                unsigned int offset, size;
                getScratchMsgInfo(inst, offset, size);
                bool isRead = inst->getMsgDesc()->isScratchRead();
                for (unsigned int i = offset; i != (offset + size); i++)
                {
                    auto it = scratchOffsetAccess.find(i);
                    if (it != scratchOffsetAccess.end())
                    {
                        if (isRead)
                        {
                            auto& fill = it->second.second;
                            fill.push_back(std::make_pair(bb, instIt));
                        }
                        else
                        {
                            auto& spill = it->second.first;
                            spill.push_back(std::make_pair(bb, instIt));
                        }
                    }
                    else
                    {
                        SPILLS s;
                        FILLS f;
                        if (isRead)
                            f.push_back(std::make_pair(bb, instIt));
                        else
                            s.push_back(std::make_pair(bb, instIt));
                        scratchOffsetAccess.insert(std::make_pair(i, std::make_pair(s, f)));
                    }
                }
            }
        }
    }

    std::map<G4_INST*, std::pair<INST_LIST_ITER, G4_BB*>> spillToRemove;
    for (auto scratchAccess : scratchOffsetAccess)
    {
        if (scratchAccess.second.second.size() == 0 &&
            scratchAccess.second.first.size() > 0)
        {
            // 0 fills for the scratch slot.
            // Check whether all spill slots have 0 fills
            // in case spills are coalesced.
            for (auto spill : scratchAccess.second.first)
            {
                bool spillRequired = false;
                unsigned int offset, size;
                getScratchMsgInfo(*spill.second, offset, size);

                // Verify that all slots from offset->(offset+size) have 0 fills
                for (unsigned int slot = offset; slot != (offset + size); slot++)
                {
                    auto it = scratchOffsetAccess.find(slot);
                    if (it->second.second.size() != 0)
                    {
                        spillRequired = true;
                        break;
                    }
                }

                if (!spillRequired)
                {
                    spillToRemove.insert(std::make_pair(*spill.second, std::make_pair(spill.second, spill.first)));
                }
            }
        }
        else if (scratchAccess.second.first.size() == 0 &&
            scratchAccess.second.second.size() > 0)
        {
            // 0 spills for the scratch slot, non-zero fills.
            // Check whether all fill slots have 0 spills
            // in case fills are coalesced.
            for (auto fill : scratchAccess.second.second)
            {
                bool fillRequired = false;
                unsigned int offset, size;
                getScratchMsgInfo(*fill.second, offset, size);

                // Verify that all slots from offset->(offset+size) have 0 spills
                for (unsigned int slot = offset; slot != (offset + size); slot++)
                {
                    auto it = scratchOffsetAccess.find(slot);
                    if (it->second.first.size() != 0)
                    {
                        fillRequired = true;
                        break;
                    }
                }

                if (!fillRequired)
                {
                    spillToRemove.insert(std::make_pair(*fill.second, std::make_pair(fill.second, fill.first)));
                }
            }
        }
    }

    for (auto removeSp : spillToRemove)
    {
        G4_BB* bb = removeSp.second.second;
#if 0
        printf("Removing redundant scratch access at CISA $%d\n", removeSp.first->getCISAOff());
#endif
        bb->instList.erase(removeSp.second.first);
    }
}
void CoalesceSpillFills::run()
{
    removeRedundantSplitMovs();

    fills();
    replaceMap.clear();
    spills();
    replaceMap.clear();
    spillFillCleanup();

    removeRedundantWrites();

    fixSendsSrcOverlap();
}

void CoalesceSpillFills::dumpKernel()
{
    for (auto bb : kernel.fg.BBs)
    {
        for (auto inst : bb->instList)
        {
            inst->emit(std::cerr);
            std::cerr << "\t$" << inst->getCISAOff() << ", #" << rpe.getRegisterPressure(inst) << "\n";
        }
    }
}

void CoalesceSpillFills::dumpKernel(unsigned int v1, unsigned int v2)
{
    bool start = false, end = false, canEnd = false;
    for (auto bb : kernel.fg.BBs)
    {
        if (end)
            break;

        for (auto inst : bb->instList)
        {
            if (canEnd &&
                inst->getCISAOff() > (int)v2)
            {
                end = true;
                break;
            }

            if (inst->getCISAOff() == v2)
            {
                // This ensures invalid offsets
                // are dumped till v2 is hit.
                canEnd = true;
            }

            if (inst->getCISAOff() == v1)
                start = true;

            if (start && !end)
            {
                inst->dump();
                printf(" // $%d, #%d\n", inst->getCISAOff(), rpe.getRegisterPressure(inst));
            }
        }
    }
}

void CoalesceSpillFills::computeAddressTakenDcls()
{
    for (auto dcl : kernel.Declares)
    {
        auto addrSpillFill = dcl->getAddrTakenSpillFill();
        if (addrSpillFill)
            addrTakenSpillFillDcl.insert(addrSpillFill);
    }
}
}