/*========================== begin_copyright_notice ============================
Copyright (C) 2017-2021 Intel Corporation
SPDX-License-Identifier: MIT
============================= end_copyright_notice ===========================*/
//===-------- CoalescingEngine.cpp - Pass and analysis for coalescing-------===//
//
// Intel Extension to LLVM core
//===----------------------------------------------------------------------===//
//
// This pass is an implementation of the coalescing idea that originated in USC,
// re-designed to work on (and take advantage of) SSA form.
//
//===----------------------------------------------------------------------===//
#include "Compiler/CISACodeGen/CoalescingEngine.hpp"
#include "Compiler/CISACodeGen/DeSSA.hpp"
#include "Compiler/CISACodeGen/ShaderCodeGen.hpp"
#include "Compiler/CISACodeGen/PatternMatchPass.hpp"
#include "Compiler/MetaDataApi/MetaDataApi.h"
#include "Compiler/CISACodeGen/helper.h"
#include "PayloadMapping.hpp"
#include "common/debug/Debug.hpp"
#include "common/debug/Dump.hpp"
#include "common/igc_regkeys.hpp"
#include "common/LLVMWarningsPush.hpp"
#include <llvm/IR/InstVisitor.h>
#include <llvm/ADT/SmallVector.h>
#include "common/LLVMWarningsPop.hpp"
#include "Compiler/IGCPassSupport.h"
#include "Probe/Assertion.h"
using namespace llvm;
using namespace IGC;
using namespace IGC::Debug;
using namespace IGC::IGCMD;
char CoalescingEngine::ID = 0;
#define PASS_FLAG "CoalescingEngine"
#define PASS_DESCRIPTION "coalesce moves coming payloads, insert and extract element"
#define PASS_CFG_ONLY true
#define PASS_ANALYSIS true
IGC_INITIALIZE_PASS_BEGIN(CoalescingEngine, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
IGC_INITIALIZE_PASS_DEPENDENCY(WIAnalysis)
IGC_INITIALIZE_PASS_DEPENDENCY(LiveVarsAnalysis)
IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenPatternMatch)
IGC_INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
IGC_INITIALIZE_PASS_DEPENDENCY(DeSSA)
IGC_INITIALIZE_PASS_DEPENDENCY(MetaDataUtilsWrapper)
IGC_INITIALIZE_PASS_DEPENDENCY(ResourceLoopAnalysis)
IGC_INITIALIZE_PASS_END(CoalescingEngine, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
namespace IGC {
CoalescingEngine::CoalescingEngine() : FunctionPass(ID) {
initializeCoalescingEnginePass(*PassRegistry::getPassRegistry());
}
void CoalescingEngine::getAnalysisUsage(llvm::AnalysisUsage &AU) const {
AU.setPreservesAll();
AU.addRequired<llvm::DominatorTreeWrapperPass>();
AU.addRequired<WIAnalysis>();
AU.addRequired<LiveVarsAnalysis>();
AU.addRequired<CodeGenPatternMatch>();
AU.addRequired<DeSSA>();
AU.addRequired<MetaDataUtilsWrapper>();
AU.addRequired<CodeGenContextWrapper>();
AU.addRequired<ResourceLoopAnalysis>();
}
void CoalescingEngine::CCTuple::print(raw_ostream &OS, const Module *) const {
OS << "CC Tuple ";
auto IT = OffsetToCCMap.begin();
while (IT != OffsetToCCMap.end()) {
OS << IT->first << " : ";
IT++;
}
}
void CoalescingEngine::CCTuple::dump() const { print(ods()); }
void CoalescingEngine::print(raw_ostream &OS, const Module *) const {
OS << "CC Tuple list: \n";
for (auto itr = m_CCTupleList.begin(), iend = m_CCTupleList.end(); itr != iend; ++itr) {
CCTuple *ccTuple = *itr;
ccTuple->print(OS);
}
OS << "CC Tuple list end. \n";
}
void CoalescingEngine::dump() const { print(ods()); }
/// \brief Entry point.
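/// The pass runs in two phases: ProcessBlock() visits every basic block and records,
/// per block, the payload-carrying intrinsics and the definitions of their operands;
/// IncrementalCoalesce() then walks the dominator tree depth-first and tries to coalesce
/// the payload of each recorded tuple-generating instruction. The pass only gathers
/// coalescing information (it is an analysis), so it always returns false.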
bool CoalescingEngine::runOnFunction(Function &MF) {
MetaDataUtils *pMdUtils = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
m_ModuleMetadata = getAnalysis<MetaDataUtilsWrapper>().getModuleMetaData();
m_CodeGenContext = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
m_Platform = m_CodeGenContext->platform;
m_PayloadMapping = PayloadMapping(m_CodeGenContext);
if (IGC_IS_FLAG_ENABLED(DisablePayloadCoalescing) || m_ModuleMetadata->compOpt.WaDisablePayloadCoalescing) {
return false;
}
if (!isEntryFunc(pMdUtils, &MF)) {
return false;
}
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
WIA = &getAnalysis<WIAnalysis>();
CG = &getAnalysis<CodeGenPatternMatch>();
LV = &getAnalysis<LiveVarsAnalysis>().getLiveVars();
RLA = &getAnalysis<ResourceLoopAnalysis>();
if (IGC_IS_FLAG_ENABLED(EnableDeSSA)) {
m_DeSSA = &getAnalysis<DeSSA>();
} else {
m_DeSSA = nullptr;
}
for (Function::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
ProcessBlock(&(*I));
}
for (df_iterator<DomTreeNode *> DI = df_begin(DT->getRootNode()), DE = df_end(DT->getRootNode()); DI != DE; ++DI) {
IncrementalCoalesce(DI->getBlock());
}
if (IGC_IS_FLAG_ENABLED(PrintToConsole))
dump();
return false;
}
/// Coalesce instructions within a single BB.
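/// Defs collected by ProcessBlock are first sorted via MIIndexCompare over the LiveVars
/// numbering, so tuple-generating instructions are handled in program order within the block.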
void CoalescingEngine::IncrementalCoalesce(BasicBlock *MBB) {
std::vector<Instruction *> &DefInstrs = BBProcessingDefs[MBB];
std::sort(DefInstrs.begin(), DefInstrs.end(), MIIndexCompare(LV));
for (std::vector<Instruction *>::const_iterator BBI = DefInstrs.begin(), BBE = DefInstrs.end(); BBI != BBE; ++BBI) {
Instruction *DefMI = *BBI;
auto RI = ValueNodeMap.find((Value *)DefMI);
if (RI == ValueNodeMap.end()) {
// Intrinsics themselves are not mapped to a node; they are
// tuple generators.
// NOTE: Please leave this commented code snippet! It is useful when debugging coalescing regressions:
// (used to isolate coalescing to particular shader to check failing/passing using binary search)
// DWORD hash = (DWORD) static_cast<IGC::CodeGenContext&>(MBB->getParent()->getContext()).hash.getAsmHash();
////if (hash >= 0xA27Affff && hash <= 0xA3CFFF00)
// if (hash >= 0xA3C00000 && hash <= 0xA3CFFFFF)
//{
// continue;
// }
if (GenIntrinsicInst *intrinsic = llvm::dyn_cast<llvm::GenIntrinsicInst>(DefMI)) {
if ((isURBWriteIntrinsic(intrinsic) && !IGC_IS_FLAG_ENABLED(DisablePayloadCoalescing_URB)) ||
(llvm::isa<llvm::RTWriteIntrinsic>(intrinsic) && !IGC_IS_FLAG_ENABLED(DisablePayloadCoalescing_RT)) ||
(llvm::isa<llvm::AtomicTypedIntrinsic>(intrinsic) &&
!IGC_IS_FLAG_ENABLED(DisablePayloadCoalescing_AtomicTyped)) ||
(llvm::isa<llvm::RTDualBlendSourceIntrinsic>(intrinsic) &&
!IGC_IS_FLAG_ENABLED(DisablePayloadCoalescing_RT) && m_Platform.hasDualKSPPS())) {
ProcessTuple(DefMI);
}
}
if (isSampleInstruction(DefMI) && !IGC_IS_FLAG_ENABLED(DisablePayloadCoalescing_Sample)) {
ProcessTuple(DefMI);
} else if (isLdInstruction(DefMI)) {
ProcessTuple(DefMI);
}
}
}
}
static bool IsBindlessSampler(Instruction *inst) {
if (SampleIntrinsic *sample = dyn_cast<SampleIntrinsic>(inst)) {
Value *sampler = sample->getSamplerValue();
uint bufferIndex = 0;
bool directIndexing = false;
if (sampler->getType()->isPointerTy()) {
BufferType bufType =
DecodeAS4GFXResource(sampler->getType()->getPointerAddressSpace(), directIndexing, bufferIndex);
if (bufType == BINDLESS_SAMPLER) {
return true;
}
}
}
return false;
}
/// Core algorithm.
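/// For each split part of the payload, the elements are either attached to an existing
/// CCTuple (when at least one element is already 'anchored' to one and the interference
/// checks pass) or gathered into a freshly created CCTuple. The detailed case analysis
/// is described inside the function body.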
void CoalescingEngine::ProcessTuple(Instruction *tupleGeneratingInstruction) {
const uint numOperands = m_PayloadMapping.GetNumPayloadElements(tupleGeneratingInstruction);
// a bindless sampler always has a header
bool needsHeader = IsBindlessSampler(tupleGeneratingInstruction);
// There should be an early exit in the case where split-send is available
// and this instruction is sure to generate a two-element payload.
if (m_Platform.supportSplitSend() && !needsHeader) {
if (numOperands < 3) {
return;
}
} else {
if (numOperands < 2) {
return;
}
}
#if 0
// turn off coalescing when inst is inside a fused resource-loop
// This approach may not be strong enough, so we opted for the solution of
// disabling coalescing in the whole BB instead.
auto loopMarker =
RLA->GetResourceLoopMarker(tupleGeneratingInstruction);
if ((loopMarker & ResourceLoopAnalysis::MarkResourceLoopInside) ||
((loopMarker & ResourceLoopAnalysis::MarkResourceLoopStart) &&
!(loopMarker & ResourceLoopAnalysis::MarkResourceLoopEnd)))
{
return;
}
#endif
IGC_ASSERT(numOperands >= 2);
// No result, but it has the side effect of updating the split mapping.
DecideSplit(tupleGeneratingInstruction);
uint numSplits = GetNumSplitParts(tupleGeneratingInstruction);
for (uint numPart = 0; numPart < numSplits; numPart++) {
SetCurrentPart(tupleGeneratingInstruction, numPart);
const uint numOperands = GetNumPayloadElements(tupleGeneratingInstruction);
bool isAnyNodeAnchored = false, isAnyNodeCoalescable = false;
SmallPtrSet<CCTuple *, MaxTouchedTuples> touchedTuplesSet;
SmallVector<CCTuple *, MaxTouchedTuples> touchedTuples;
// Step: Prepare.
PrepareTuple(numOperands, tupleGeneratingInstruction,
// out:
touchedTuplesSet, touchedTuples, isAnyNodeAnchored, isAnyNodeCoalescable);
if (!isAnyNodeCoalescable) {
continue;
}
// So at this point, there is at least one node that is a candidate for coalescing.
// It might already be a part of a CCtuple, or it might be yet unconstrained.
// A Tuple Generating Instruction enforces sequential order of registers allocated to values.
// Each value might belong to (up to) one CCtuple.
// If it belongs to a CCTuple, it has a unique offset in that CCTuple.
// Three cases are considered:
// 1) No value has a CCtuple assigned -> create a new CCTuple and assign offsets.
// a) if the platform supports split-send, then some (arbitrary) split point should be determined
// 2) At least one value belongs to a CCtuple (we say it is 'anchored' (constrained) to that tuple)
// and all values belong to the same CCtuple
// 3) More than one value belongs to a tuple, and they belong to different (>1) tuples: NOT implemented
// Case 1) : create new CCTuple
if (!isAnyNodeAnchored) {
CreateTuple(numOperands, tupleGeneratingInstruction);
} else {
// Case 2) : at least one value is 'anchored'(constrained) to a CCtuple
// Sub-case: All arguments belong to the same tuple.
if (touchedTuples.size() == 1) {
// Step I: find an anchoring element.
// If arguments of this tuple-instruction are already constrained by CCtuple, it means
// that they also constrain the tuple that would be generated by this payload.
// They 'anchor' the new tuple, so pick the element that anchors the tuple and
// determine its offset.
// E.g. :
// .. v1 (0) v2 (1) CCtuple
// v0 (0) v1 (1) v2 (2) (this payload)
// v1 will have: thisTupleStartOffset = 1, rootTupleStartOffset = 0
// CCtuple will be 'grown' with element v0, that is at relative offset = -1
// v0(-1) v1 (0) v2 (1) CCtuple
CCTuple *ccTuple = NULL;
int rootTupleStartOffset = 0;
int thisTupleStartOffset = 0;
DetermineAnchor(numOperands, tupleGeneratingInstruction,
// out:
ccTuple, rootTupleStartOffset, thisTupleStartOffset);
IGC_ASSERT(ccTuple);
/* Step II: Interference checking */
int offsetDiff = rootTupleStartOffset - thisTupleStartOffset;
bool interferes = false;
// Heuristic: we do not want to grow the ccTuple size too much
// [c0 c1 c2 c3 c4 c5 c6 c7]
//(offsetDiff=4)[e0' e1' e2' e3' e4' e5' e6' e7']
// here, tuple numElements = 8 and offsetDiff = 4, so joining them would result in
// a 12-element CC tuple. We want to prevent growing
// beyond MaxTupleSize elements.
if (abs(offsetDiff) > 0 && ccTuple->GetNumElements() >= MaxTupleSize) {
interferes = true;
}
ProcessInterferencesElementFunctor interferencesFunctor(false /*forceEviction */, tupleGeneratingInstruction,
offsetDiff, ccTuple, this);
if (!interferes) {
interferes = InterferenceCheck(numOperands, tupleGeneratingInstruction, offsetDiff, ccTuple,
// out:
&interferencesFunctor);
if (!interferes && (ccTuple->HasNonHomogeneousElements() ||
m_PayloadMapping.HasNonHomogeneousPayloadElements(tupleGeneratingInstruction))) {
interferes =
CheckIntersectionForNonHomogeneous(numOperands, tupleGeneratingInstruction, offsetDiff, ccTuple);
}
}
// If 'interferes' is false, it means that the whole transaction could be committed,
// provided that elements in dominatorsForDisplacement are displaced, and other nodes are attached.
// If 'interferes' is true, then no element will be attached to the ccTuple.
if (!interferes) {
SmallPtrSet<Value *, MaxTouchedTuples> touchedValuesSet;
for (uint i = 0; i < numOperands; i++) {
Value *val = GetPayloadElementToValueMapping(tupleGeneratingInstruction, i);
ccTuple->CheckIndexForNonInitializedSlot(i + offsetDiff);
if (touchedValuesSet.count(val)) {
continue;
} else {
touchedValuesSet.insert(val);
}
if (IsValConstOrIsolated(val)) {
// do nothing
} else {
Value *RootV = getRegRoot(val);
if (!RootV) // bail out if isolated
continue;
// We do not expect to get an Argument here; it must have been isolated.
IGC_ASSERT(dyn_cast<llvm::Instruction>(val));
IGC_ASSERT(ValueNodeMap.count(val));
// auto RI = ValueNodeMap.find(val);
// ElementNode *Node = RI->second;
ElementNode *Node = ValueNodeMap[val];
if (!interferencesFunctor.GetComputedValuesForIsolation().count(val)) {
ccTuple->AttachNodeAtIndex(i + offsetDiff, Node, *this);
IGC_ASSERT(getRegRoot(val));
}
}
} // for
} else // (interferes) is true
{
CreateTuple(numOperands, tupleGeneratingInstruction);
}
} else {
// Case 3)
// More than one value belongs to a tuple, and they belong to different (>1) tuples.
// FIXME: not implemented yet
// Need to implement code for joining two non-overlapping tuples.
// This probably does not bring much benefit, but is nevertheless complex to implement.
// Having two congruence class tuples, we need to find whether piecewise congruent classes
// of the tuples interfere. Here a 'two-set interference check' due to Boissinot could be
// used (first sorting elements)
}
}
}
}
/// Checks whether coalescing the payload of tupleGeneratingInstruction into ccTuple is safe
/// when either side has non-homogeneous (header/trailer) payload parts.
/// Returns true when they interfere, i.e. when coalescing must be abandoned.
bool CoalescingEngine::CheckIntersectionForNonHomogeneous(const uint numOperands,
Instruction *tupleGeneratingInstruction, const int offsetDiff,
CCTuple *ccTuple) {
// Picturesque description:
// Hi - homogeneous slots
// <left_part>[H_0 H_1 .. H_n]<right_part>
// ccTuple:
// <left_part> [H_0 H_1 .. H_n]<right_part>
//
// 1) this tuple instruction contains non-homogeneous part:
// <left_part'>[H_0' H_1' .. H_n']<right_part'>
// 2) or this tuple instruction does not contain non-homogeneous part
// [H_0' H_1' .. H_n']
bool interferes = false;
if (ccTuple->HasNonHomogeneousElements()) {
// Finding a supremum instruction for a homogeneous part is implemented
// only for render target write instructions (RTWriteIntrinsic).
// The only other possible combination of non-homogeneous instructions comprises
// RTWriteIntrinsic and RTDualBlendSourceIntrinsic.
// In some cases there is an opportunity to coalesce their payloads, but there is
// a danger that they are compiled in different SIMD modes, so the safe assumption
// is that they cannot be coalesced.
// A comparison of the payload of RTWriteIntrinsic and RTDualBlendSourceIntrinsic:
// +----------------------------+----------------------------+
// | RTWriteIntrinsic | RTDualBlendSourceIntrinsic |
// +----------------------------+----------------------------+
// | src0 Alpha (optional) | src0 Alpha (unavailable)* |
// +----------------------------+----------------------------+
// | output mask (optional) | output mask (optional) |
// +----------------------------+----------------------------+
// | - | R0 channel |
// +----------------------------+----------------------------+
// | - | G0 channel |
// +----------------------------+----------------------------+
// | - | B0 channel |
// +----------------------------+----------------------------+
// | - | A0 channel |
// +----------------------------+----------------------------+
// | R0 channel | R1 channel |
// +----------------------------+----------------------------+
// | G0 channel | G1 channel |
// +----------------------------+----------------------------+
// | B0 channel | B1 channel |
// +----------------------------+----------------------------+
// | A0 channel | A1 channel |
// +----------------------------+----------------------------+
// | Depth (optional) | Depth (optional) |
// +----------------------------+----------------------------+
// | Stencil (optional) | Stencil (optional) |
// +----------------------------+----------------------------+
// All optional fields except for src0 Alpha are consistent for both instructions when called in a fragment shader.
// * RTDualBlendSourceIntrinsic doesn't have such an argument, but it is defined in its payload.
if (offsetDiff == 0 && ccTuple->GetNumElements() == numOperands &&
llvm::isa<llvm::RTWriteIntrinsic>(ccTuple->GetRoot()) &&
llvm::isa<llvm::RTWriteIntrinsic>(tupleGeneratingInstruction)) {
if (m_PayloadMapping.HasNonHomogeneousPayloadElements(tupleGeneratingInstruction)) {
// TODO: test whether a 'supremum' of the TGI and the ccTuple root element can be obtained
const Instruction *supremum =
m_PayloadMapping.GetSupremumOfNonHomogeneousPart(tupleGeneratingInstruction, ccTuple->GetRoot());
if (supremum == nullptr) {
interferes = true;
}
}
} else {
// Since ccTuple contains non-homogeneous elements, and this homogeneous
// part is not perfectly aligned with ccTuple, we cannot reason about
// interferences safely, thus assume it interferes.
interferes = true;
}
} else {
// ccTuple:
// [H_0 H_1 .. H_n]
//
// 1) this tuple instruction contains non-homogeneous part:
// <left_part'>[H_0' H_1' .. H_n']<right_part'>
if (m_PayloadMapping.HasNonHomogeneousPayloadElements(tupleGeneratingInstruction)) {
if (offsetDiff == 0 && ccTuple->GetNumElements() == numOperands) {
ccTuple->SetHasNonHomogeneousElements(tupleGeneratingInstruction);
} else {
interferes = true;
}
}
}
return interferes;
}
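/// Decides whether the payload of this instruction should be split into two parts and
/// records the split point in splitPoint_. A split is recorded when the payload mapping
/// asks to peel the first element, or when the first or the last payload element is a
/// constant/isolated value and the mapping allows a split.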
void CoalescingEngine::DecideSplit(Instruction *tupleGeneratingInstruction) {
if (m_PayloadMapping.DoPeelFirstElement(tupleGeneratingInstruction)) {
splitPoint_[tupleGeneratingInstruction] = 1;
return;
}
uint splitPoint = 0;
const uint numOperands = m_PayloadMapping.GetNumPayloadElements(tupleGeneratingInstruction);
// No sense entering here if we have fewer than 2 operands.
IGC_ASSERT(numOperands >= 2);
Value *val = m_PayloadMapping.GetPayloadElementToValueMapping(tupleGeneratingInstruction, 0);
if (IsValConstOrIsolated(val) || !getRegRoot(val)) {
splitPoint = 1;
}
val = m_PayloadMapping.GetPayloadElementToValueMapping(tupleGeneratingInstruction, numOperands - 1);
if (IsValConstOrIsolated(val) || !getRegRoot(val)) {
splitPoint = numOperands - 1;
}
if (splitPoint > 0 && m_PayloadMapping.DoesAllowSplit(tupleGeneratingInstruction)) {
splitPoint_[tupleGeneratingInstruction] = splitPoint;
}
}
/// Determines if any value that is an argument of (possible) tuple generating instruction
/// can participate in payload coalescing. If so, it also determines if there are any
/// 'anchored'(constrained) values and the number of constraining tuples.
void CoalescingEngine::PrepareTuple(const uint numOperands, Instruction *tupleGeneratingInstruction,
SmallPtrSet<CCTuple *, MaxTouchedTuples> &touchedTuplesSet,
SmallVector<CCTuple *, MaxTouchedTuples> &touchedTuples, bool &isAnyNodeAnchored,
bool &isAnyNodeCoalescable) {
uint numConstants = 0;
// Step: Prepare.
for (uint i = 0; i < numOperands; i++) {
Value *val = GetPayloadElementToValueMapping(tupleGeneratingInstruction, i);
if (IsValConstOrIsolated(val) || !getRegRoot(val)) {
numConstants++;
continue;
}
Value *RootV = getRegRoot(val);
IGC_ASSERT(RootV);
{
isAnyNodeCoalescable = true;
IGC_ASSERT(ValueNodeMap.count(RootV));
auto RI = ValueNodeMap.find(RootV);
ElementNode *Node = RI->second;
auto CCI = NodeCCTupleMap.find(Node);
if (CCI != NodeCCTupleMap.end()) {
CCTuple *ccTuple = NodeCCTupleMap[Node];
if (!touchedTuplesSet.count(ccTuple)) {
touchedTuplesSet.insert(ccTuple);
touchedTuples.push_back(ccTuple);
}
isAnyNodeAnchored = true;
}
// Do not coalesce values with an excessive number of uses.
// This often introduces many anti-dependencies.
const unsigned MAX_USE_COUNT = 20;
if (val->hasNUsesOrMore(MAX_USE_COUNT) && !llvm::isa<llvm::AtomicTypedIntrinsic>(tupleGeneratingInstruction)) {
isAnyNodeAnchored = true;
}
}
}
// heuristic:
// we do not want to create 'wide' root variables where most of the slots
// will not be coalesced anyway (due to immediate constants/isolated variables).
// vISA will not benefit, while this increases register pressure and makes coloring
// with the round-robin heuristic harder.
if (numOperands > 2 && numConstants > (numOperands / 2)) {
if (!m_PayloadMapping.HasNonHomogeneousPayloadElements(tupleGeneratingInstruction)) {
// force non-coalescing:
isAnyNodeCoalescable = false;
}
}
}
/// Creates a fresh tuple.
/// A fresh tuple consists of nodes which do not have any tuple assigned yet.
/// Precondition: at least one value is coalescable (otherwise this does not make sense).
void CoalescingEngine::CreateTuple(const uint numOperands, Instruction *tupleGeneratingInstruction) {
CCTuple *ccTuple = new CCTuple();
m_CCTupleList.push_back(ccTuple);
for (uint i = 0; i < numOperands; i++) {
Value *val = GetPayloadElementToValueMapping(tupleGeneratingInstruction, i);
if (IsValConstOrIsolated(val)) {
// It is important to initialize the CC with empty slots in order to get the proper
// size of the vISA variable that will back the CCtuple corresponding to the payload.
ccTuple->ResizeBounds(i);
continue;
}
Value *RootV = getRegRoot(val);
if (!RootV) { // isolated
ccTuple->ResizeBounds(i);
continue;
}
IGC_ASSERT(ValueNodeMap.count(RootV));
ElementNode *RootNode = ValueNodeMap[RootV];
auto CCTI = NodeCCTupleMap.find(RootNode);
if (CCTI == NodeCCTupleMap.end()) {
NodeCCTupleMap[RootNode] = ccTuple;
NodeOffsetMap[RootNode] = i;
ccTuple->InitializeIndexWithCCRoot(i, RootNode);
CurrentDominatingParent[RootV] = RootV;
ImmediateDominatingParent[RootV] = NULL;
} else {
// This must have been a value that is copied to the payload multiple times.
// OR: the new CCtuple has been isolated, and this is the element that
// belongs to another tuple.
// Since this value belongs to another tuple, we should not increase the
// current tuple's size, no?
// ccTuple->ResizeBounds(i);
}
} // loop over arguments
if (m_PayloadMapping.HasNonHomogeneousPayloadElements(tupleGeneratingInstruction)) {
ccTuple->SetHasNonHomogeneousElements(tupleGeneratingInstruction);
}
}
/// Given a tuple generating instruction, picks out an 'anchored'(constrained)
/// value and determines the relative offset
/// output:
/// ccTuple - a tuple that is constraining a picked value
/// thisTupleStartOffset - the offset of the value in the current tuple generating
/// instruction
/// rootTupleStartOffset - the offset of the value in constraining tuple
/// Assumption: it has already been checked that all values belong to exactly one tuple!
/// (otherwise the result might pick an arbitrary tuple)
void CoalescingEngine::DetermineAnchor(const uint numOperands, const Instruction *tupleGeneratingInstruction,
CCTuple *&ccTuple, int &rootTupleStartOffset, int &thisTupleStartOffset) {
for (uint i = 0; i < numOperands; i++) {
Value *val = GetPayloadElementToValueMapping(tupleGeneratingInstruction, i);
if (IsValConstOrIsolated(val)) {
continue;
}
Value *RootV = getRegRoot(val);
if (!RootV)
continue;
IGC_ASSERT(ValueNodeMap.count(RootV));
ElementNode *RootNode = ValueNodeMap[RootV];
auto CCTI = NodeCCTupleMap.find(RootNode);
if (CCTI != NodeCCTupleMap.end()) {
// Element is constrained by ccTuple.
IGC_ASSERT(NodeOffsetMap.count(RootNode));
rootTupleStartOffset = NodeOffsetMap[RootNode];
thisTupleStartOffset = i;
ccTuple = NodeCCTupleMap[RootNode];
// Just a sanity check:
IGC_ASSERT(RootNode == ccTuple->GetCCNodeWithIndex(NodeOffsetMap[RootNode]));
break;
}
} // for
}
/// Prepares the slot for moves (e.g. induced by const, isolated, uniform to vector)
/// into payload by evicting any value that is currently occupying this slot.
/// The assumption is that the heuristic has determined it is profitable to do so.
/// input: bool evictFullCongruenceClass - tells whether the full congruence class
/// should be evicted, or only the element that occupies the tuple instruction
/// for the lifetime of a 'mov' (this is sufficient for issuing a mov to the payload slot).
void CoalescingEngine::PrepareInsertionSlot(CCTuple *ccTuple, const int index, Instruction *inst,
const bool evictFullCongruenceClass) {
ElementNode *RootNode = ccTuple->OffsetToCCMap[index];
IGC_ASSERT(RootNode);
if (evictFullCongruenceClass) {
Value *dominatingParent = GetActualDominatingParent(RootNode->value, inst);
// It might turn out that the root node does not dominate the
// 'inst' since it is in another branch of the DT. In such a case,
// isolate all nodes in the CC.
llvm::Value *CCVal = dominatingParent ? dominatingParent : CurrentDominatingParent[RootNode->value];
while (CCVal) {
if (getRegRoot(CCVal)) {
isolateReg(CCVal);
}
CCVal = ImmediateDominatingParent[CCVal];
}
} else {
// Evict dominating parent from CC.
Value *NewParent = GetActualDominatingParent(RootNode->value, inst);
if (NewParent)
isolateReg(NewParent);
}
}
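/// Returns true if the slot at 'index' of ccTuple can host a value copied in at
/// 'tupleGeneratingInstruction': either no congruence class occupies that slot, or
/// the occupying class has no member that dominates the instruction and is still
/// live at it. With canExtend == false, indices outside the current tuple bounds
/// are rejected.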
bool CoalescingEngine::IsInsertionSlotAvailable(CCTuple *ccTuple, const int index,
Instruction *tupleGeneratingInstruction, const bool canExtend) {
bool available = true;
// ccTuple->OffsetToCCMap.count(index) == 0
if (!canExtend) {
const int tupleNumElements = ccTuple->GetNumElements();
if (!(0 <= index && index < tupleNumElements)) {
return false;
}
}
ElementNode *nodeToCheck = ccTuple->GetCCNodeWithIndex(index);
if (nodeToCheck == NULL) {
// tuple : X CC1 CC2
// payload : <slot> v1 v2
// Means: Tuple has no live interval (X) that is live at the point the payload is
// defined. It means, that a given 'register slot' can be used to issue a 'mov'
// with an immediate (or isolated) value.
} else {
// tuple : CC0 CC1 CC2
// .... | <- lifetime
// payload : <slot> v1 v2
// ..... | <- lifetime
// Here we need to check whether the value in CC is live
// and intersects with the desired <slot> .
// Sanity check: tuple contains the root element.
// IGC_ASSERT(getRegRoot(nodeToCheck->value) == nodeToCheck->value);
Value *RootV = nodeToCheck->value;
Value *NewParent = GetActualDominatingParent(RootV, tupleGeneratingInstruction);
// FIXME: what if we do not have pure ssa form? Look at de-ssa comment.
if (NewParent && LV->isLiveAt(NewParent, tupleGeneratingInstruction)) {
available = false;
// interferes = true;
// break;
}
}
return available;
}
/// Processes the tuple elements in sequential order.
/// A function object is passed as an argument, which responds to
/// specific events (whether a given payload element is const/isolated/copied, etc.).
/// It is meant to be used as a template method in various contexts where different
/// functors can be passed, but the walking mechanism is the same.
void CoalescingEngine::ProcessElements(const uint numOperands, Instruction *tupleInst, const int offsetDiff,
CCTuple *ccTuple, ElementFunctor *functor) {
SmallPtrSet<Value *, MaxTouchedTuples> touchedValuesSet;
for (uint i = 0; i < numOperands; i++) {
functor->SetIndex(i);
Value *val = GetPayloadElementToValueMapping(tupleInst, i);
if (touchedValuesSet.count(val)) {
if (functor->visitCopy())
continue;
else
break;
} else {
touchedValuesSet.insert(val);
}
// Case 1: Constant
if (llvm::isa<llvm::Constant>(val)) {
if (functor->visitConstant())
continue;
else
break;
} else // Case 2: Variable
{
if (IsValIsolated(val) || !getRegRoot(val)) {
// If value is isolated, a copy <slot> will be necessary.
if (functor->visitIsolated())
continue;
else
break;
}
IGC_ASSERT(ValueNodeMap.count(val));
ElementNode *Node = ValueNodeMap[val];
ElementNode *ccRootNode = ccTuple->GetCCNodeWithIndex(i + offsetDiff);
// Interferes if and only if: live at this definition point AND has different value
if (ccRootNode == Node) {
// No interference, since it is the same value. We have got a reuse of the same value in a
// different tuple.
if (functor->visitAnchored())
continue;
else
break;
} else {
IGC_ASSERT(llvm::isa<llvm::Instruction>(val));
// Here comes the meat, and the most interesting case:
if (ccRootNode != NULL) {
Value *CCRootV = ccRootNode->value;
Value *dominating = nullptr;
Value *dominated = nullptr;
SymmetricInterferenceTest(CCRootV, dyn_cast<llvm::Instruction>(val), dominating, dominated);
// Case 1):
// (dominating=null)
//
//! (value)
//
// CC dominance tree:
//! v67 <- (dominated)
// ^
// |
//! v190
//
//! (tupleInst)
// Value has no dominator inside CC class, but dominates elements in CC class
// and its lifetime spans until (but not including) tupleInst.
if (dominating == nullptr && dominated != nullptr) {
// IGC_ASSERT(LV->isLiveAt(val, tupleInst));
// Interference check: value is live at dominated
if (functor->visitInterfering(val, false))
continue;
else
break;
} else if (dominating != nullptr && dominated != nullptr) {
// Case 2):
// CC dominance tree
//! v67
// ^
// |
//! v121 (dominating)
// ^
//! | <-----(value)
// |
//! v190 <- (dominated)
// TODO: it would be possible to 'fit' v181 between v121 and v190 in some
// cases, but this would require a redesign of the 'dominance forest' logic
// (regarding current dominating and immediate dominator handling).
// For now, just assume there is an interference.
if (functor->visitInterfering(val, false))
continue;
else
break;
} else if (dominating != nullptr && dominated == nullptr) {
// Case 3):
// CC dominance tree
//! v121
// ^
// |
//! v190 <- (dominating)
// |
//! | ----- (value)
// |
// | (dominated == null)
if (LV->isLiveAt(dominating, dyn_cast<llvm::Instruction>(val))) {
if (functor->visitInterfering(val, false))
continue;
else
break;
}
} else {
// It is possible that the ccRootNode is not empty, but all
// elements that belong to it have been isolated in previously
// dominating tuples.
// IGC_ASSERT(0);
}
} else {
// ccRootNode == NULL -> congruence class not occupied yet
if (functor->visitPackedNonInterfering(val))
continue;
else
break;
}
} // if( ccRootNode == Node )
} // if( llvm::isa<llvm::Constant>( val ) )
} // for loop
}
/// Here we have two major differences from the implementation in de-ssa:
/// 1) Transactional character: even if one 'element' of CC-tuple does not interfere,
/// some other element in CC-tuple might be interfering, rendering the whole group
/// non-'coalescable'.
/// 2) Values in payload coalescing are by default isolated, so if there is an interference,
/// do nothing, no need for explicit isolation.
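/// The check is done in two walks over the payload elements: a GatherWeightElementFunctor
/// pass first gathers weights (insertion slots required, values needing displacement,
/// aligned anchors) that decide whether eviction should be forced, and a second walk with
/// the interference functor then performs the actual per-element interference processing.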
bool CoalescingEngine::InterferenceCheck(const uint numOperands, Instruction *tupleInst, const int offsetDiff,
CCTuple *ccTuple, ProcessInterferencesElementFunctor *interferencesFunctor) {
SmallPtrSet<Value *, MaxTouchedTuples> touchedValuesSet;
GatherWeightElementFunctor gatherFunctor;
ProcessElements(numOperands, tupleInst, offsetDiff, ccTuple, &gatherFunctor);
bool forceEviction = (gatherFunctor.GetNumInsertionSlotsRequired() + gatherFunctor.GetNumNeedsDisplacement() <=
gatherFunctor.GetNumAlignedAnchors());
interferencesFunctor->SetForceEviction(forceEviction);
ProcessElements(numOperands, tupleInst, offsetDiff, ccTuple, interferencesFunctor);
return interferencesFunctor->IsInterfering();
}
/// Given an instruction and numOperands (corresponding to the number of coalesce-partaking
/// payload elements), return whether any 'value' that is an argument of an intrinsic
/// takes part in the coalescing CC tuple. If yes, returns a non-null pointer to
/// the representative tuple for this coalescing.
/// output: zeroBasedPayloadElementOffset - an offset of this instruction's payload
/// relative to the containing ccTuple - this is necessary to correctly slice the
/// relevant part of the payload from ccTuple (e.g. for preparing a payload for URB writes)
/// offset [payload]
/// [ccTuple ...... ]
/// note: obviously, if the return value is null, then the meaning of zeroBasedPayloadElementOffset
/// is undefined
CoalescingEngine::CCTuple *CoalescingEngine::IsAnyValueCoalescedInCCTuple(llvm::Instruction *inst,
const uint numOperands,
int &zeroBasedPayloadElementOffset,
Value *&outVal) {
outVal = nullptr;
zeroBasedPayloadElementOffset = 0; // provide some meaningful default
CoalescingEngine::CCTuple *ccTuple = nullptr;
uint index = 0;
while (index < numOperands) {
Value *val = GetPayloadElementToValueMapping(inst, index);
CoalescingEngine::CCTuple *ccTupleCheck = GetValueCCTupleMapping(val);
if (!isa<Constant>(val) && ccTupleCheck) {
ccTuple = ccTupleCheck;
Value *valRoot = getRegRoot(val);
IGC_ASSERT(valRoot);
ElementNode *Node = ValueNodeMap[valRoot];
int offset = NodeOffsetMap[Node];
zeroBasedPayloadElementOffset = (offset - ccTuple->GetLeftBound()) - index;
outVal = val;
return ccTuple;
}
index++;
}
return ccTuple;
}
/// Return true if the payload tuple might be completed (either by directly
/// aliasing or by inserting an appropriate copy instruction).
bool CoalescingEngine::IsPayloadCovered(llvm::Instruction *inst, CCTuple *ccTuple, const uint numOperands,
const int payloadToCCTupleRelativeOffset) {
uint relativeOffset = 0;
bool payloadCovered = false;
if (ccTuple) {
SmallPtrSet<Value *, MaxTouchedTuples> touchedValuesSet;
// index = 0;
payloadCovered = true;
for (uint index = 0; index < numOperands; index++) {
Value *val = GetPayloadElementToValueMapping(inst, index);
const int ccIndex = index + payloadToCCTupleRelativeOffset;
if (touchedValuesSet.count(val)) {
if (!IsInsertionSlotAvailable(ccTuple, ccIndex, inst, false)) {
payloadCovered = false;
break;
}
continue;
} else {
touchedValuesSet.insert(val);
}
if (IsValConstOrIsolated(val)) {
if (!IsInsertionSlotAvailable(ccTuple, ccIndex, inst, false)) {
payloadCovered = false;
break;
}
} else {
if (CoalescingEngine::CCTuple *thisCCTuple = GetValueCCTupleMapping(val)) {
if (thisCCTuple == ccTuple) {
relativeOffset = GetValueOffsetInCCTuple(val);
if (relativeOffset != (ccIndex)) {
payloadCovered = false;
break;
}
} else {
// different cc tuple
payloadCovered = false;
break;
}
} else {
if (!IsInsertionSlotAvailable(ccTuple, ccIndex, inst, false)) {
payloadCovered = false;
break;
}
}
} // if constant
} // for
} // if cctuple
return payloadCovered;
}
/// Uses the tuple information provided by the coalescing engine to generate an
/// optimized sequence of 'movs' for preparing a payload.
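/// If the whole payload is covered by an existing CCTuple, the tuple's backing variable
/// is reused and movs are emitted only for elements not already coalesced into the tuple
/// (constants, isolated values, repeated uses); otherwise a fresh payload variable is
/// created and every element is copied into it. payloadOffsetInBytes receives the byte
/// offset of this payload within the CCTuple variable (0 when a fresh variable is built).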
CVariable *CoalescingEngine::PrepareExplicitPayload(CShader *outProgram, CEncoder *encoder, SIMDMode simdMode,
const DataLayout *pDL, llvm::Instruction *inst,
int &payloadOffsetInBytes) {
SetCurrentPart(inst, 0);
const uint numOperands = m_PayloadMapping.GetNumPayloadElements(inst);
const uint grfSize = outProgram->GetContext()->platform.getGRFSize();
const uint numSubVarsPerOperand = numLanes(simdMode) / (grfSize / 4);
IGC_ASSERT(numSubVarsPerOperand == 1 || numSubVarsPerOperand == 2);
CoalescingEngine::CCTuple *ccTuple = nullptr;
int payloadToCCTupleRelativeOffset = 0;
Value *dummyValPtr = nullptr;
ccTuple = IsAnyValueCoalescedInCCTuple(inst, numOperands, payloadToCCTupleRelativeOffset, dummyValPtr);
bool payloadCovered = IsPayloadCovered(inst, ccTuple, numOperands, payloadToCCTupleRelativeOffset);
bool payloadOffsetComputed = false;
payloadOffsetInBytes = 0;
CVariable *payload = nullptr;
if (payloadToCCTupleRelativeOffset < 0) {
payloadCovered = false;
}
// Check for EOT payload.
if (payloadCovered) {
if (IGCLLVM::TerminatorInst *terminator = inst->getParent()->getTerminator()) {
if (terminator != &(*terminator->getParent()->begin())) {
IGC_ASSERT(dyn_cast<GenIntrinsicInst>(inst));
IGC_ASSERT(terminator->getPrevNode());
if (terminator->getPrevNode() == inst) {
// heuristic:
// this inst is an EOT node. Be very careful about the payload size:
// for coalescing this is a hard constraint.
if (ccTuple->GetNumElements() > MaxTupleSize) {
payloadCovered = false;
}
}
}
}
}
if (payloadCovered) {
SmallPtrSet<Value *, MaxTouchedTuples> touchedValuesSet;
for (uint index = 0; index < numOperands; index++) {
Value *val = m_PayloadMapping.GetPayloadElementToValueMapping(inst, index);
CVariable *dataElement = outProgram->GetSymbol(val);
// FIXME: need to do additional checks for size
uint subVar = (payloadToCCTupleRelativeOffset + index) * numSubVarsPerOperand;
encoder->SetDstSubVar(subVar);
payload = outProgram->GetCCTupleToVariableMapping(ccTuple);
if (touchedValuesSet.count(val)) {
encoder->Copy(payload, dataElement);
encoder->Push();
continue;
} else {
touchedValuesSet.insert(val);
}
if (IsValConstOrIsolated(val)) {
encoder->Copy(payload, dataElement);
encoder->Push();
} else {
if (GetValueCCTupleMapping(val)) {
if (!payloadOffsetComputed) {
payloadOffsetComputed = true;
payloadOffsetInBytes = payloadToCCTupleRelativeOffset * GetSingleElementWidth(simdMode, pDL, val);
}
} else {
// this one actually encompasses the case for !getRegRoot(val)
encoder->Copy(payload, dataElement);
encoder->Push();
}
} // if constant
} // for
} else {
payload = outProgram->GetNewVariable(
numOperands * numLanes(simdMode), ISA_TYPE_F,
outProgram->GetContext()->platform.getGRFSize() == 64 ? EALIGN_32WORD : EALIGN_HWORD, "CEExplicitPayload");
// Insert an explicit lifetime start in case some of the operands are undefs;
// otherwise VISA will see the variable as not fully initialized and will extend the
// lifetime all the way to the beginning of the kernel.
encoder->Lifetime(VISAVarLifetime::LIFETIME_START, payload);
for (uint i = 0; i < numOperands; i++) {
Value *val = m_PayloadMapping.GetPayloadElementToValueMapping(inst, i);
CVariable *data = outProgram->GetSymbol(val);
encoder->SetDstSubVar(i * numSubVarsPerOperand);
encoder->Copy(payload, data);
encoder->Push();
}
}
return payload;
}
// Prepares payload for the uniform URB Write messages that are used in
// mesh and task shader stages.
// On the DG2 platform a uniform URB write is a simd1 message that writes data
// from the first DWORD of each GRF of the payload, e.g.:
// (W) mov (1|M0) r16.0<1>:ud 0x0:ud
// (W) mov (1|M0) r17.0<1>:f -1.0:f
// (W) mov (1|M0) r18.0<1>:f 0.0:f
// (W) mov (1|M0) r19.0<1>:f 1.0:f
// (W) send.urb (1|M0) null r2 r16:4 0x0 0x020827F7
// On Xe2+ URB write messages are transposed LSC store messages that write
// consecutive DWORDs of payload, e.g.:
// (W) mov (1|M0) r16.0<1>:ud 0x0:ud
// (W) mov (1|M0) r16.1<1>:f -1.0:f
// (W) mov (1|M0) r16.2<1>:f 0.0:f
// (W) mov (1|M0) r16.3<1>:f 1.0:f
// (W) store.urb.d32x4t.a32 (1|M0) [r2:1+0x27F0] r16:1
CVariable *CoalescingEngine::PrepareUniformUrbWritePayload(CShader *shader, CEncoder *encoder,
llvm::GenIntrinsicInst *inst) {
IGC_ASSERT(inst->getIntrinsicID() == GenISAIntrinsic::GenISA_URBWrite);
CVariable *payload = nullptr;
const uint numOperands = m_PayloadMapping.GetNumPayloadElements(inst);
const uint grfSize = shader->GetContext()->platform.getGRFSize();
if (shader->GetContext()->platform.hasLSCUrbMessage()) {
// Payload for transposed LSC URB Write - all data chunks stored in subsequent DWORDs
payload = shader->GetNewVariable(numOperands, ISA_TYPE_F, (grfSize == 64 ? EALIGN_32WORD : EALIGN_HWORD),
true /*uniform*/, "CEExplicitPayload_Uniform");
} else {
// The payload spans multiple GRFs; only the first DWORD of each GRF
// will be populated with (uniform) data.
const uint elemsPerGrf = grfSize / CVariable::GetCISADataTypeSize(ISA_TYPE_F);
payload = shader->GetNewVariable(numOperands * elemsPerGrf, ISA_TYPE_F,
(grfSize == 64 ? EALIGN_32WORD : EALIGN_HWORD), "CEExplicitPayload");
}
for (uint i = 0; i < numOperands; i++) {
Value *val = m_PayloadMapping.GetPayloadElementToValueMapping(inst, i);
CVariable *data = shader->GetSymbol(val);
IGC_ASSERT(isUniform(val) && data->IsUniform());
encoder->SetNoMask();
encoder->SetSimdSize(SIMDMode::SIMD1);
if (shader->GetContext()->platform.hasLSCUrbMessage()) {
encoder->SetDstSubReg(i);
} else {
encoder->SetDstSubVar(i);
}
encoder->Copy(payload, data);
}
return payload;
}
// Returns the URB write payload for a split simd8 message in a simd16 mesh/task
// shader.
// TODO: This method does not currently use the CoalescingEngine and merely
// copies the payload to a newly created variable.
CVariable *CoalescingEngine::PrepareSplitUrbWritePayload(
CShader *outProgram, CEncoder *encoder,
SIMDMode simdMode, ///< must be SIMD16
uint32_t splitPartNo, ///< can be either 0 - lower quarter, or 1 upper quarter
llvm::Instruction *inst) {
SetCurrentPart(inst, 0);
IGC_ASSERT(outProgram->m_SIMDSize == SIMDMode::SIMD16);
IGC_ASSERT(splitPartNo < 2);
const GenIntrinsicInst *const intrinsicInst = dyn_cast<GenIntrinsicInst>(inst);
IGC_ASSERT(nullptr != intrinsicInst);
IGC_ASSERT(intrinsicInst->getIntrinsicID() == GenISAIntrinsic::GenISA_URBWrite);
const uint numOperands = m_PayloadMapping.GetNumPayloadElements(inst);
CVariable *payload = outProgram->GetNewVariable(
numOperands * numLanes(SIMDMode::SIMD8), ISA_TYPE_F,
outProgram->GetContext()->platform.getGRFSize() == 64 ? EALIGN_32WORD : EALIGN_HWORD, CName::NONE);
for (uint i = 0; i < numOperands; i++) {
Value *val = m_PayloadMapping.GetPayloadElementToValueMapping(inst, i);
CVariable *data = outProgram->GetSymbol(val);
encoder->SetSimdSize(SIMDMode::SIMD8);
encoder->SetMask((splitPartNo == 0) ? EMASK_Q1 : EMASK_Q2);
encoder->SetSrcSubVar(0, data->IsUniform() ? 0 : splitPartNo);
encoder->SetDstSubVar(i);
encoder->Copy(payload, data);
encoder->Push();
}
return payload;
}
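// Congruence classes are maintained as a union-find forest: each ElementNode stores a
// parent pointer plus a rank, and getLeader() walks up to the set representative while
// shortening the traversed path (path compression).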
CoalescingEngine::ElementNode *CoalescingEngine::ElementNode::getLeader() {
ElementNode *N = this;
ElementNode *Parent = parent.getPointer();
ElementNode *Grandparent = Parent->parent.getPointer();
while (Parent != Grandparent) {
N->parent.setPointer(Grandparent);
N = Grandparent;
Parent = Parent->parent.getPointer();
Grandparent = Parent->parent.getPointer();
}
return Parent;
}
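// Returns the representative (leader) value of Val's congruence class, or nullptr when
// Val is unknown to the engine or has been register-isolated.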
Value *CoalescingEngine::getRegRoot(Value *Val) const {
auto RI = ValueNodeMap.find(Val);
if (RI == ValueNodeMap.end())
return nullptr;
ElementNode *Node = RI->second;
if (Node->parent.getInt() & ElementNode::kRegisterIsolatedFlag)
return 0x0;
return Node->getLeader()->value;
}
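// For a PHI node, returns the register root of the first incoming value that is an
// instruction and has one, or nullptr when the PHI itself is PHI-isolated or no such
// source exists.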
Value *CoalescingEngine::getPHIRoot(Instruction *PHI) const {
IGC_ASSERT(dyn_cast<PHINode>(PHI));
auto RI = ValueNodeMap.find(PHI);
IGC_ASSERT(RI != ValueNodeMap.end());
ElementNode *DestNode = RI->second;
if (DestNode->parent.getInt() & ElementNode::kPHIIsolatedFlag)
return 0x0;
for (unsigned i = 0; i < PHI->getNumOperands(); i++) {
Value *SrcVal = PHI->getOperand(i);
if (isa<Instruction>(SrcVal)) { // skip constant source
Value *SrcRoot = getRegRoot(SrcVal);
if (SrcRoot)
return SrcRoot;
}
}
return 0x0;
}
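// Merges the congruence classes of Val1 and Val2 using union by rank: the leader of the
// lower-rank class is attached under the leader of the higher-rank class, and the rank is
// bumped only when both ranks are equal.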
void CoalescingEngine::unionRegs(Value *Val1, Value *Val2) {
ElementNode *Node1 = ValueNodeMap[Val1]->getLeader();
ElementNode *Node2 = ValueNodeMap[Val2]->getLeader();
if (Node1->rank > Node2->rank) {
Node2->parent.setPointer(Node1->getLeader());
} else if (Node1->rank < Node2->rank) {
Node1->parent.setPointer(Node2->getLeader());
} else if (Node1 != Node2) {
Node2->parent.setPointer(Node1->getLeader());
Node1->rank++;
}
}
// For now, return true if V is dessa-aliased/InsEltMap-ed/phi-coalesced.
bool CoalescingEngine::isCoalescedByDeSSA(Value *V) const {
if (m_DeSSA && m_DeSSA->getRootValue(V))
return true;
return false;
}
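// Visits every instruction of 'bb' through the InstVisitor to collect tuple-generating
// intrinsics and their operand definitions. If the block contains a fused resource loop,
// its worklist is cleared and the whole block is skipped for coalescing.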
void CoalescingEngine::ProcessBlock(llvm::BasicBlock *bb) {
llvm::BasicBlock::InstListType::iterator I, E;
// Loop through instructions top to bottom
for (I = bb->begin(), E = bb->end(); I != E; ++I) {
llvm::Instruction &inst = (*I);
auto loopMarker = RLA->GetResourceLoopMarker(&inst);
// turn off coalescing in this block when there is a fused resource-loop
if ((loopMarker & ResourceLoopAnalysis::MarkResourceLoopInside) ||
((loopMarker & ResourceLoopAnalysis::MarkResourceLoopStart) &&
!(loopMarker & ResourceLoopAnalysis::MarkResourceLoopEnd))) {
BBProcessingDefs[bb].clear();
return;
}
visit(inst);
}
}
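// For payload-carrying intrinsics (sample, ld, URB write, typed atomic, RT write and,
// when dual-KSP PS is supported, dual-source RT write), creates ElementNodes for the
// non-constant operands and records both the operand definitions and the intrinsic
// itself in the per-BB worklist consumed by IncrementalCoalesce.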
bool CoalescingEngine::MatchSingleInstruction(llvm::GenIntrinsicInst *inst) {
if (isSampleInstruction(inst) || isLdInstruction(inst) || isURBWriteIntrinsic(inst) ||
llvm::isa<llvm::AtomicTypedIntrinsic>(inst) || llvm::isa<llvm::RTWriteIntrinsic>(inst) ||
(llvm::isa<llvm::RTDualBlendSourceIntrinsic>(inst) && m_Platform.hasDualKSPPS())) {
uint numOperands = inst->getNumOperands();
for (uint i = 0; i < numOperands; i++) {
Value *val = inst->getOperand(i);
if (llvm::isa<llvm::Constant>(val)) {
continue;
}
if (!ValueNodeMap.count(val)) {
Instruction *DefMI = dyn_cast<Instruction>(val);
if (DefMI) {
BBProcessingDefs[DefMI->getParent()].push_back(DefMI);
}
ValueNodeMap[val] = new (Allocator) ElementNode(val);
}
}
// Add the intrinsic itself to the processing list as well.
BBProcessingDefs[inst->getParent()].push_back(inst);
}
return true;
}
void CoalescingEngine::visitCallInst(llvm::CallInst &I) {
// we do not care if we do not match
// bool match = true;
if (GenIntrinsicInst *CI = llvm::dyn_cast<GenIntrinsicInst>(&I)) {
switch (CI->getIntrinsicID()) {
case GenISAIntrinsic::GenISA_ROUNDNE:
case GenISAIntrinsic::GenISA_imulH:
case GenISAIntrinsic::GenISA_umulH:
case GenISAIntrinsic::GenISA_uaddc:
case GenISAIntrinsic::GenISA_usubb:
case GenISAIntrinsic::GenISA_bfrev:
break;
case GenISAIntrinsic::GenISA_intatomicraw:
case GenISAIntrinsic::GenISA_floatatomicraw:
case GenISAIntrinsic::GenISA_icmpxchgatomicraw:
case GenISAIntrinsic::GenISA_fcmpxchgatomicraw:
case GenISAIntrinsic::GenISA_dwordatomicstructured:
case GenISAIntrinsic::GenISA_floatatomicstructured:
case GenISAIntrinsic::GenISA_cmpxchgatomicstructured:
case GenISAIntrinsic::GenISA_fcmpxchgatomicstructured:
case GenISAIntrinsic::GenISA_icmpxchgatomictyped:
case GenISAIntrinsic::GenISA_floatatomictyped:
case GenISAIntrinsic::GenISA_fcmpxchgatomictyped:
case GenISAIntrinsic::GenISA_ldstructured:
case GenISAIntrinsic::GenISA_atomiccounterinc:
case GenISAIntrinsic::GenISA_atomiccounterpredec:
break;
case GenISAIntrinsic::GenISA_GradientX:
case GenISAIntrinsic::GenISA_GradientY:
case GenISAIntrinsic::GenISA_GradientXfine:
case GenISAIntrinsic::GenISA_GradientYfine:
break;
default:
MatchSingleInstruction(CI);
// no pattern for the rest of the intrinsics
break;
}
} else if (IntrinsicInst *CI = llvm::dyn_cast<IntrinsicInst>(&I)) {
switch (CI->getIntrinsicID()) {
case Intrinsic::sqrt:
case Intrinsic::log2:
case Intrinsic::cos:
case Intrinsic::sin:
case Intrinsic::pow:
case Intrinsic::floor:
case Intrinsic::ceil:
case Intrinsic::trunc:
case Intrinsic::maxnum:
case Intrinsic::minnum:
break;
case Intrinsic::exp2:
break;
default:
break;
}
}
}
void CoalescingEngine::visitCastInst(llvm::CastInst &I) {}
void CoalescingEngine::visitBinaryOperator(llvm::BinaryOperator &I) {}
void CoalescingEngine::visitCmpInst(llvm::CmpInst &I) {}
void CoalescingEngine::visitPHINode(llvm::PHINode &I) {}
void CoalescingEngine::visitUnaryInstruction(llvm::UnaryInstruction &I) {}
void CoalescingEngine::visitSelectInst(llvm::SelectInst &I) {}
void CoalescingEngine::visitBitCastInst(llvm::BitCastInst &I) {}
void CoalescingEngine::visitInstruction(llvm::Instruction &I) {}
void CoalescingEngine::visitLoadInst(LoadInst &I) {}
void CoalescingEngine::visitStoreInst(StoreInst &I) {}
} // namespace IGC