// intel-graphics-compiler/IGC/Compiler/CISACodeGen/EmitVISAPass.hpp
/*========================== begin_copyright_notice ============================
Copyright (C) 2017-2023 Intel Corporation
SPDX-License-Identifier: MIT
============================= end_copyright_notice ===========================*/
#pragma once
#include "LoopCountAnalysis.hpp"
#include "BlockCoalescing.hpp"
#include "PatternMatchPass.hpp"
#include "ShaderCodeGen.hpp"
#include "CoalescingEngine.hpp"
#include "Simd32Profitability.hpp"
#include "GenCodeGenModule.h"
#include "VariableReuseAnalysis.hpp"
#include "CastToGASAnalysis.h"
#include "ResourceLoopAnalysis.h"
#include "Compiler/MetaDataUtilsWrapper.h"
#include "Probe/Assertion.h"
#include "common/LLVMWarningsPush.hpp"
#include <llvm/IR/DataLayout.h>
#include <llvm/IR/InlineAsm.h>
#include <llvm/IR/GetElementPtrTypeIterator.h>
#include <llvm/Analysis/CallGraph.h>
#include "common/LLVMWarningsPop.hpp"
#include <functional>
#include <optional>
#include <type_traits>
namespace llvm {
class GenIntrinsicInst;
}
namespace IGC {
// Forward declaration
class IDebugEmitter;
struct PSSignature;
void initializeEmitPassPass(llvm::PassRegistry &);
class EmitPass : public llvm::FunctionPass {
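// Compile-time guard for getAnalysisUsage below: only function or immutable
// passes may be required here (see the note there on why module passes are avoided).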
template <typename P> void addRequired(llvm::AnalysisUsage &AU) const {
static_assert(std::is_base_of_v<llvm::FunctionPass, P> || std::is_base_of_v<llvm::ImmutablePass, P>);
AU.addRequired<P>();
}
public:
EmitPass();
EmitPass(CShaderProgram::KernelShaderMap &shaders, SIMDMode mode, bool canAbortOnSpill, ShaderDispatchMode shaderMode,
PSSignature *pSignature = nullptr);
virtual ~EmitPass();
EmitPass(const EmitPass &) = delete;
EmitPass &operator=(const EmitPass &) = delete;
// Note: all analysis passes should be function passes. If a module analysis pass
// is used, it would invalidate the function analysis passes and therefore cause
// those analysis passes to be invoked twice, which increases compile time.
virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {
AU.setPreservesAll();
addRequired<llvm::DominatorTreeWrapperPass>(AU);
addRequired<WIAnalysis>(AU);
addRequired<LiveVarsAnalysis>(AU);
addRequired<CodeGenPatternMatch>(AU);
addRequired<DeSSA>(AU);
addRequired<BlockCoalescing>(AU);
addRequired<CoalescingEngine>(AU);
addRequired<MetaDataUtilsWrapper>(AU);
addRequired<Simd32ProfitabilityAnalysis>(AU);
addRequired<CodeGenContextWrapper>(AU);
addRequired<VariableReuseAnalysis>(AU);
addRequired<CastToGASInfo>(AU);
addRequired<ResourceLoopAnalysis>(AU);
}
virtual bool runOnFunction(llvm::Function &F) override;
virtual llvm::StringRef getPassName() const override { return "EmitPass"; }
void CreateKernelShaderMap(CodeGenContext *ctx, IGC::IGCMD::MetaDataUtils *pMdUtils, llvm::Function &F);
void Frc(const SSource &source, const DstModifier &modifier);
void Floor(const SSource &source, const DstModifier &modifier);
void Mad(const SSource sources[3], const DstModifier &modifier);
void Lrp(const SSource sources[3], const DstModifier &modifier);
void Cmp(llvm::CmpInst::Predicate pred, const SSource sources[2], const DstModifier &modifier,
uint8_t clearTagMask = 0);
void Sub(const SSource[2], const DstModifier &modifier);
void Xor(const SSource[2], const DstModifier &modifier);
void FDiv(const SSource[2], const DstModifier &modifier);
void VectorMad(const SSource sources[3], const DstModifier &modifier);
void Pow(const SSource sources[2], const DstModifier &modifier);
void Avg(const SSource sources[2], const DstModifier &modifier);
void Rsqrt(const SSource &source, const DstModifier &modifier);
void Select(const SSource sources[3], const DstModifier &modifier);
void PredAdd(const SSource &pred, bool invert, const SSource sources[2], const DstModifier &modifier);
void Mul(const SSource[2], const DstModifier &modifier);
void Div(const SSource[2], const DstModifier &modifier);
void Inv(const SSource[2], const DstModifier &modifier);
void Exp2(const SSource[2], const DstModifier &modifier);
void MaxNum(const SSource sources[2], const DstModifier &modifier);
void Add(const SSource[2], const DstModifier &modifier);
void FPTrunc(const SSource[2], const DstModifier &modifier);
void Powi(const SSource[2], const DstModifier &modifier);
void Mov(const SSource &source, const DstModifier &modifier);
void Unary(e_opcode opCode, const SSource sources[1], const DstModifier &modifier);
void Binary(e_opcode opCode, const SSource sources[2], const DstModifier &modifier);
void Tenary(e_opcode opCode, const SSource sources[3], const DstModifier &modifier);
void Bfn(uint8_t booleanFuncCtrl, const SSource sources[3], const DstModifier &modifier);
void CmpBfn(llvm::CmpInst::Predicate predicate, const SSource cmpSources[2], uint8_t booleanFuncCtrl,
const SSource bfnSources[3], const DstModifier &modifier);
void Mul64(CVariable *dst, CVariable *src[2], SIMDMode simdMode, bool noMask = false) const;
template <int N> void Alu(e_opcode opCode, const SSource sources[N], const DstModifier &modifier);
void BinaryUnary(llvm::Instruction *inst, const SSource source[2], const DstModifier &modifier);
void CmpBoolOp(Pattern *cmpPattern, llvm::BinaryOperator *inst, const SSource &bitSource,
const DstModifier &modifier);
void emitAluConditionMod(Pattern *aluPattern, llvm::Instruction *alu, llvm::CmpInst *cmp, int aluOprdNum);
void EmitGenericPointersCmp(llvm::Instruction *inst, const SSource source[2], const DstModifier &modifier,
uint8_t clearTagMask);
void EmitAluIntrinsic(llvm::CallInst *I, const SSource source[2], const DstModifier &modifier);
void EmitSimpleAlu(llvm::Instruction *inst, const SSource source[2], const DstModifier &modifier);
void EmitSimpleAlu(llvm::Instruction *inst, CVariable *dst, CVariable *src0, CVariable *src1);
void EmitSimpleAlu(EOPCODE opCode, const SSource source[2], const DstModifier &modifier, bool isUnsigned = false);
void EmitSimpleAlu(EOPCODE opCode, CVariable *dst, CVariable *src0, CVariable *src1);
void EmitMinMax(bool isMin, bool isUnsigned, const SSource source[2], const DstModifier &modifier);
void EmitUAdd(llvm::BinaryOperator *inst, const DstModifier &modifier);
void EmitFullMul32(bool isUnsigned, const SSource srcs[2], const DstModifier &dstMod);
void EmitFPToIntWithSat(bool isUnsigned, bool needBitCast, VISA_Type type, const SSource &source,
const DstModifier &modifier);
void EmitNoModifier(llvm::Instruction *inst);
void EmitIntrinsicMessage(llvm::IntrinsicInst *inst);
void EmitGenIntrinsicMessage(llvm::GenIntrinsicInst *inst);
void EmitSIToFPZExt(const SSource &source, const DstModifier &dstMod);
void EmitIntegerTruncWithSat(bool isSignedDst, bool isSignedSrc, const SSource &source, const DstModifier &dstMod);
void EmitPack4i8(const std::array<EOPCODE, 4> &opcodes, const std::array<SSource, 4> &sources0,
const std::array<SSource, 4> &sources1, const std::array<bool, 4> isSat, const DstModifier &dstMod);
void EmitUnpack4i8(const SSource &source, uint32_t index, bool isUnsigned, const DstModifier &dstMod);
void EmitRepack4i8(const std::array<SSource, 4> &sources, const std::array<uint32_t, 4> &mappings,
const DstModifier &dstMod);
void EmitAddPair(llvm::GenIntrinsicInst *GII, const SSource Sources[4], const DstModifier &DstMod);
void EmitSubPair(llvm::GenIntrinsicInst *GII, const SSource Sources[4], const DstModifier &DstMod);
void EmitMulPair(llvm::GenIntrinsicInst *GII, const SSource Sources[4], const DstModifier &DstMod);
void EmitPtrToPair(llvm::GenIntrinsicInst *GII, const SSource Sources[1], const DstModifier &DstMod);
void EmitInlineAsm(llvm::CallInst *inst);
void EmitInitializePHI(llvm::PHINode *phi);
void emitPairToPtr(llvm::GenIntrinsicInst *GII);
void emitMulAdd16(llvm::Instruction *I, const SSource source[2], const DstModifier &dstMod);
void emitCall(llvm::CallInst *inst);
void emitReturn(llvm::ReturnInst *inst);
void EmitInsertValueToStruct(llvm::InsertValueInst *II);
void EmitExtractValueFromStruct(llvm::ExtractValueInst *EI);
void EmitInsertValueToLayoutStruct(llvm::InsertValueInst *IVI);
void EmitExtractValueFromLayoutStruct(llvm::ExtractValueInst *EVI);
void EmitSelectStruct(llvm::SelectInst *SI);
void emitVectorCopyToAOS(uint32_t AOSBytes, CVariable *Dst, CVariable *Src, uint32_t nElts,
uint32_t DstSubRegOffset = 0, uint32_t SrcSubRegOffset = 0) {
emitVectorCopyToOrFromAOS(AOSBytes, Dst, Src, nElts, DstSubRegOffset, SrcSubRegOffset, true);
}
void emitVectorCopyFromAOS(uint32_t AOSBytes, CVariable *Dst, CVariable *Src, uint32_t nElts,
uint32_t DstSubRegOffset = 0, uint32_t SrcSubRegOffset = 0) {
emitVectorCopyToOrFromAOS(AOSBytes, Dst, Src, nElts, DstSubRegOffset, SrcSubRegOffset, false);
}
void emitVectorCopyToOrFromAOS(uint32_t AOSBytes, CVariable *Dst, CVariable *Src, uint32_t nElts,
uint32_t DstSubRegOffset, uint32_t SrcSubRegOffset, bool IsToAOS);
void emitCopyToOrFromLayoutStruct(llvm::Value *D, llvm::Value *S);
/// stack-call code-gen functions
void emitStackCall(llvm::CallInst *inst);
void emitStackFuncEntry(llvm::Function *F);
void emitStackFuncExit(llvm::ReturnInst *inst);
void InitializeKernelStack(llvm::Function *pKernel, CVariable *stackBufferBase = nullptr);
/// stack-call functions for reading and writing argument/retval data to stack
typedef SmallVector<std::tuple<CVariable *, uint32_t, uint32_t, uint32_t, bool>, 8> StackDataBlocks;
uint CalculateStackDataBlocks(StackDataBlocks &blkData, std::vector<CVariable *> &Args);
void ReadStackDataBlocks(StackDataBlocks &blkData, uint offsetS);
void WriteStackDataBlocks(StackDataBlocks &blkData, uint offsetS);
void emitCopyGRFBlock(CVariable *Dst, CVariable *Src, Type *type, uint32_t BlkOffset, unsigned numInstance,
bool isWriteToBlk);
// emits the visa relocation instructions for function/global symbols
void emitSymbolRelocation(llvm::Function &F);
void emitOutput(llvm::GenIntrinsicInst *inst);
// TODO: unify the functions below and clean up
void emitStore(llvm::StoreInst *inst, llvm::Value *varOffset, llvm::ConstantInt *immOffset,
ConstantInt *immScale = nullptr);
void emitPredicatedStore(llvm::Instruction *inst);
void emitStore3DInner(llvm::Value *pllValToStore, llvm::Value *pllDstPtr, llvm::Value *pllElmIdx);
void emitLoad(llvm::LoadInst *inst, llvm::Value *varOffset, llvm::ConstantInt *immOffset,
ConstantInt *immScale = nullptr); // single load, no pattern
void emitPredicatedLoad(llvm::Instruction *inst);
void emitLoad3DInner(llvm::LdRawIntrinsic *inst, ResourceDescriptor &resource, llvm::Value *elemIdxV);
// when resource is dynamically indexed, load/store must use special intrinsics
void emitLoadRawIndexed(llvm::LdRawIntrinsic *inst, llvm::Value *varOffset, llvm::ConstantInt *immScale,
llvm::ConstantInt *immOffset);
void emitStoreRawIndexed(llvm::StoreRawIntrinsic *inst, llvm::Value *varOffset, llvm::ConstantInt *immScale,
llvm::ConstantInt *immOffset);
void emitGetBufferPtr(llvm::GenIntrinsicInst *inst);
// \todo, remove this function after we lower all GEP to IntToPtr before CodeGen.
// Only remaining GEPs are for scratch in GFX path
void emitGEP(llvm::Instruction *inst);
// Emit lifetime start right before instruction I. If ForAllInstance is true, emit lifetime start
// for both instances; otherwise, just for the current instance set in the calling context.
void emitLifetimeStart(CVariable *Var, llvm::BasicBlock *BB, llvm::Instruction *I, bool ForAllInstance);
bool waveShuffleCase(CVariable *Var, BasicBlock *BB, Instruction *I, bool ForAllInstance);
// Helper methods for message emit functions.
template <typename T>
void prepareRenderTargetWritePayload(T *inst, llvm::DenseMap<llvm::Value *, CVariable **> &valueToVariableMap,
llvm::Value *color[], uint8_t colorCnt,
// output:
CVariable **src, bool *isUndefined, CVariable *&source0Alpha,
CVariable *&oMaskOpnd, CVariable *&outputDepthOpnd, CVariable *&vStencilOpnd);
ResourceDescriptor GetSampleResourceHelper(llvm::SampleIntrinsic *inst);
void interceptSamplePayloadCoalescing(llvm::SampleIntrinsic *inst, uint numPart,
llvm::SmallVector<CVariable *, 4> &payload, bool &payloadCovered);
// message emit functions
void emitSimdLaneId(llvm::Instruction *inst);
void emitSimdLaneIdReplicate(llvm::Instruction *inst);
void emitSimdSize(llvm::Instruction *inst);
void emitSimdShuffle(llvm::Instruction *inst);
void emitSimdClusteredBroadcast(llvm::Instruction *inst);
void emitCrossInstanceMov(const SSource &source, const DstModifier &modifier);
void emitSimdShuffleDown(llvm::Instruction *inst);
void emitSimdShuffleXor(llvm::Instruction *inst);
void emitSimdBlockRead(llvm::Instruction *inst, llvm::Value *ptrVal = nullptr);
void emitSimdBlockWrite(llvm::Instruction *inst, llvm::Value *ptrVal = nullptr);
void emitLegacySimdBlockWrite(llvm::Instruction *inst, llvm::Value *ptrVal = nullptr);
void emitLegacySimdBlockRead(llvm::Instruction *inst, llvm::Value *ptrVal = nullptr);
void emitLSCSimdBlockWrite(llvm::Instruction *inst, llvm::Value *ptrVal = nullptr);
void emitLSCSimdBlockRead(llvm::Instruction *inst, llvm::Value *ptrVal = nullptr);
void emitSimdMediaBlockRead(llvm::Instruction *inst);
void emitSimdMediaBlockWrite(llvm::Instruction *inst);
void emitMediaBlockIO(const llvm::GenIntrinsicInst *inst, bool isRead);
void emitMediaBlockRectangleRead(llvm::Instruction *inst);
void emitSampleInstruction(llvm::SampleIntrinsic *inst);
void emitLdInstruction(llvm::Instruction *inst);
void emitInfoInstruction(llvm::InfoIntrinsic *inst);
void emitGather4Instruction(llvm::SamplerGatherIntrinsic *inst);
void emitLdmsInstruction(llvm::Instruction *inst);
void emitTypedRead(llvm::Instruction *inst);
void emitTypedWrite(llvm::Instruction *inst);
void emitThreadGroupBarrier(llvm::Instruction *inst);
void emitThreadGroupNamedBarriersSignal(llvm::Instruction *inst);
void emitThreadGroupNamedBarriersWait(llvm::Instruction *inst);
void emitLSCTypedRead(llvm::Instruction *inst);
void emitLSCTypedWrite(llvm::Instruction *inst);
void emitLSCAtomicTyped(llvm::GenIntrinsicInst *inst);
void emitLscUniformAtomicCounter(llvm::GenIntrinsicInst *pInst);
void emitCastSelect(CVariable *flag, CVariable *dst, CVariable *src0, CVariable *src1);
void emitMemoryFence(llvm::Instruction *inst);
void emitMemoryFence(void);
void emitTypedMemoryFence(llvm::Instruction *inst);
void emitFlushSamplerCache();
void emitSurfaceInfo(llvm::GenIntrinsicInst *intrinsic);
void emitStackAlloca(llvm::GenIntrinsicInst *intrinsic);
void emitVLAStackAlloca(llvm::GenIntrinsicInst *intrinsic);
void emitUAVSerialize();
void emitScalarAtomics(llvm::Instruction *pInst, ResourceDescriptor &resource, AtomicOp atomic_op,
CVariable *pDstAddr, CVariable *pU, CVariable *pV, CVariable *pR, CVariable *pSrc, bool isA64,
int bitSize, int immOffset, int immScale, LSC_ADDR_SIZE addrSize);
void emitScalarAtomicLoad(llvm::Instruction *pInst, ResourceDescriptor &resource,
CVariable *pDstAddr, CVariable *pU, CVariable *pV, CVariable *pR, CVariable *pSrc,
bool isA64, int bitWidth, int immOffset, int immScale, LSC_ADDR_SIZE addrSize);
/// wave/subgroup support
/// reduction and prefix/postfix facilities
static bool ScanReduceIs64BitType(VISA_Type type);
static bool ScanReduceIsInt64Mul(e_opcode op, VISA_Type type);
bool ScanReduceIsInt64EmulationNeeded(e_opcode op, VISA_Type type);
CVariable *ScanReducePrepareSrc(VISA_Type type, uint64_t identityValue, bool negate, bool secondHalf, CVariable *src,
CVariable *dst, CVariable *flag = nullptr);
CVariable *ReductionReduceHelper(e_opcode op, VISA_Type type, SIMDMode simd, CVariable *src,
CVariable *srcSecondHalf = nullptr);
void ReductionExpandHelper(e_opcode op, VISA_Type type, CVariable *src, CVariable *dst);
void ReductionClusteredSrcHelper(CVariable *(&pSrc)[2], CVariable *src, uint16_t numLanes, VISA_Type type,
uint numInst, bool secondHalf);
CVariable *ReductionClusteredReduceHelper(e_opcode op, VISA_Type type, SIMDMode simd, bool secondHalf, CVariable *src,
CVariable *dst);
void ReductionClusteredExpandHelper(e_opcode op, VISA_Type type, SIMDMode simd, const uint clusterSize,
bool secondHalf, CVariable *src, CVariable *dst);
/// reduction and prefix/postfix emitters
void emitReductionAll(e_opcode op, uint64_t identityValue, VISA_Type type, bool negate, CVariable *src,
CVariable *dst);
void emitReductionTree(e_opcode op, VISA_Type type, CVariable *src, CVariable *dst);
void emitReductionTrees(e_opcode op, VISA_Type type, SIMDMode simdMode, CVariable *src, CVariable *dst,
unsigned int startIdx, unsigned int endIdx);
void emitReductionClustered(const e_opcode op, const uint64_t identityValue, const VISA_Type type, const bool negate,
const unsigned int clusterSize, CVariable *const src, CVariable *const dst);
void emitReductionInterleave(const e_opcode op, const uint64_t identityValue, const VISA_Type type, const bool negate,
const unsigned int step, CVariable *const src, CVariable *const dst);
void emitReductionInterleave(const e_opcode op, const VISA_Type type, const SIMDMode simd, const unsigned int step,
const bool noMaskBroadcast, CVariable *const src1, CVariable *const src2,
CVariable *const dst);
void emitReductionClusteredInterleave(const e_opcode op, const uint64_t identityValue, const VISA_Type type,
const bool negate, const unsigned int clusterSize,
const unsigned int interleaveStep, CVariable *const src, CVariable *const dst);
void emitPreOrPostFixOp(e_opcode op, uint64_t identityValue, VISA_Type type, bool negateSrc, CVariable *src,
CVariable *result[2], CVariable *Flag = nullptr, bool isPrefix = false, bool isQuad = false,
int clusterSize = 0);
void emitPreOrPostFixOpScalar(e_opcode op, uint64_t identityValue, VISA_Type type, bool negateSrc, CVariable *src,
CVariable *result[2], CVariable *Flag, bool isPrefix, int clusterSize = 0);
bool IsUniformAtomic(llvm::Instruction *pInst);
void emitAtomicRaw(llvm::GenIntrinsicInst *pInst, Value *varOffset, ConstantInt *immOffset = nullptr,
ConstantInt *immScale = nullptr);
void emitAtomicTyped(llvm::GenIntrinsicInst *pInst);
void emitAtomicCounter(llvm::GenIntrinsicInst *pInst);
void emitFastClear(llvm::LoadInst *inst);
void emitFastClearSend(llvm::Instruction *pInst);
void setRovCacheCtrl(llvm::GenIntrinsicInst *inst);
std::optional<LSC_CACHE_OPTS> cacheOptionsForConstantBufferLoads(Instruction *inst, LSC_L1_L3_CC Ctrl) const;
std::optional<LSC_CACHE_OPTS> cacheOptionsForConstantBufferLoads(Instruction *inst) const;
bool useRasterizerOrderedByteAddressBuffer(llvm::GenIntrinsicInst *inst);
void emitUniformAtomicCounter(llvm::GenIntrinsicInst *pInst);
void emitDiscard(llvm::Instruction *inst);
void emitcycleCounter(llvm::Instruction *inst);
void emitSetDebugReg(llvm::Instruction *inst);
void emitInsert(llvm::Instruction *inst);
void emitExtract(llvm::Instruction *inst);
void emitBitCast(llvm::BitCastInst *btCst);
void emitBitcastfromstruct(llvm::GenIntrinsicInst *BCFromStruct);
void emitBitcasttostruct(llvm::GenIntrinsicInst *BCToStruct);
void emitPtrToInt(llvm::PtrToIntInst *p2iCst);
void emitIntToPtr(llvm::IntToPtrInst *i2pCst);
void emitAddrSpaceCast(llvm::AddrSpaceCastInst *addrSpaceCast);
void emitBranch(llvm::BranchInst *br, const SSource &cond, e_predMode predMode);
void emitDiscardBranch(llvm::BranchInst *br, const SSource &cond);
void emitAluNoModifier(llvm::GenIntrinsicInst *inst);
CVariable *GetVMaskPred(CVariable *&predicate);
void createVMaskPred(CVariable *&predicate);
void UseVMaskPred();
CVariable *GetCombinedVMaskPred(CVariable *basePredicate = nullptr);
CVariable *m_vMaskPredForSubplane = nullptr;
void emitGradientX(const SSource &source, const DstModifier &modifier);
void emitGradientY(const SSource &source, const DstModifier &modifier);
void emitGradientXFine(const SSource &source, const DstModifier &modifier);
void emitGradientYFine(const SSource &source, const DstModifier &modifier);
void emitf32tof16_rtz(llvm::GenIntrinsicInst *inst);
void emitfitof(llvm::GenIntrinsicInst *inst);
void emitFPOWithNonDefaultRoundingMode(llvm::GenIntrinsicInst *inst);
void emitftoi(llvm::GenIntrinsicInst *inst);
void emitCtlz(const SSource &source);
void emitBfn(llvm::GenIntrinsicInst *inst);
// VME
void emitVMESendIME(llvm::GenIntrinsicInst *inst);
void emitVMESendFBR(llvm::GenIntrinsicInst *inst);
void emitVMESendSIC(llvm::GenIntrinsicInst *inst);
void emitVMESendIME2(llvm::GenIntrinsicInst *inst);
void emitVMESendFBR2(llvm::GenIntrinsicInst *inst);
void emitVMESendSIC2(llvm::GenIntrinsicInst *inst);
void emitCreateMessagePhases(llvm::GenIntrinsicInst *inst);
void emitSetMessagePhaseX_legacy(llvm::GenIntrinsicInst *inst);
void emitSetMessagePhase_legacy(llvm::GenIntrinsicInst *inst);
void emitGetMessagePhaseX(llvm::GenIntrinsicInst *inst);
void emitSetMessagePhaseX(llvm::GenIntrinsicInst *inst);
void emitGetMessagePhase(llvm::GenIntrinsicInst *inst);
void emitSetMessagePhase(llvm::GenIntrinsicInst *inst);
void emitSimdGetMessagePhase(llvm::GenIntrinsicInst *inst);
void emitBroadcastMessagePhase(llvm::GenIntrinsicInst *inst);
void emitSimdSetMessagePhase(llvm::GenIntrinsicInst *inst);
void emitSimdMediaRegionCopy(llvm::GenIntrinsicInst *inst);
void emitExtractMVAndSAD(llvm::GenIntrinsicInst *inst);
void emitCmpSADs(llvm::GenIntrinsicInst *inst);
// VA
void emitVideoAnalyticSLM(llvm::GenIntrinsicInst *inst, const DWORD responseLen);
// New VA without using SLM and barriers (result is returned in GRF).
void emitVideoAnalyticGRF(llvm::GenIntrinsicInst *inst, const DWORD responseLen);
// CrossLane Instructions
void emitWaveBallot(llvm::GenIntrinsicInst *inst);
void emitWaveClusteredBallot(llvm::GenIntrinsicInst *inst);
void emitBallotUniform(llvm::GenIntrinsicInst *inst, CVariable **destination, bool disableHelperLanes);
void emitWaveInverseBallot(llvm::GenIntrinsicInst *inst);
void emitWaveShuffleIndex(llvm::GenIntrinsicInst *inst);
void emitWavePrefix(llvm::WavePrefixIntrinsic *I);
void emitQuadPrefix(llvm::QuadPrefixIntrinsic *I);
void emitWaveClusteredPrefix(llvm::GenIntrinsicInst *I);
void emitWaveAll(llvm::GenIntrinsicInst *inst);
void emitWaveClustered(llvm::GenIntrinsicInst *inst);
void emitWaveInterleave(llvm::GenIntrinsicInst *inst);
void emitWaveClusteredInterleave(llvm::GenIntrinsicInst *inst);
// Those three "vector" version shall be combined with
// non-vector version.
bool isUniformStoreOCL(llvm::StoreInst *SI);
bool isUniformStoreOCL(llvm::Value *ptr, llvm::Value *storeVal);
void emitVectorBitCast(llvm::BitCastInst *BCI);
void emitVectorLoad(llvm::LoadInst *LI, llvm::Value *offset, llvm::ConstantInt *immOffset);
void emitVectorStore(llvm::StoreInst *SI, llvm::Value *offset, llvm::ConstantInt *immOffset);
void emitLSCVectorLoad(llvm::Instruction *Inst, llvm::Value *Ptr, llvm::Value *offset,
llvm::ConstantInt *immOffset, ConstantInt *immScale, LSC_CACHE_OPTS cacheOpts,
LSC_DOC_ADDR_SPACE addrSpace);
void emitLSCVectorStore(llvm::Value *Ptr, llvm::Value *offset, llvm::ConstantInt *immOffset,
llvm::ConstantInt *immScale, llvm::Value *storedVal, llvm::BasicBlock *BB, LSC_CACHE_OPTS cacheOpts,
alignment_t align, bool dontForceDMask, LSC_DOC_ADDR_SPACE addrSpace, llvm::Value *predicate = nullptr);
void emitUniformVectorCopy(CVariable *Dst, CVariable *Src, uint32_t nElts, uint32_t DstSubRegOffset = 0,
uint32_t SrcSubRegOffset = 0, bool allowLargerSIMDSize = false,
CVariable *predicate = nullptr);
void emitVectorCopy(CVariable *Dst, CVariable *Src, uint32_t nElts, uint32_t DstSubRegOffset = 0,
uint32_t SrcSubRegOffset = 0, bool allowLargerSIMDSize = false, CVariable *predicate = nullptr);
void emitConstantVector(CVariable *Dst, uint64_t value = 0);
void emitCopyAll(CVariable *Dst, CVariable *Src, llvm::Type *Ty);
void emitCopyAllInstances(CVariable *Dst, CVariable *Src, llvm::Type *Ty);
void emitPredicatedVectorCopy(CVariable *Dst, CVariable *Src, CVariable *pred);
void emitPredicatedVectorSelect(CVariable *Dst, CVariable *Src0, CVariable *Src1, CVariable *pred);
void emitPushFrameToStack(Function *ParentFunction, unsigned &pushSize);
// emitMul64_UDxUD - emulate 64-bit multiply by 32-bit operations.
// Dst must be a 64-bit type variable.
// Src0 and Src1 must be 32-bit type variables/immediates.
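// A minimal sketch of the UD x UD case: with 32-bit unsigned sources the full
// 64-bit product decomposes as
//   Dst.lo = low 32 bits of Src0*Src1  (mul)
//   Dst.hi = high 32 bits of Src0*Src1 (mulh)
// so no cross terms are needed, unlike a full 64x64 emulation.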
void emitMul64_UDxUD(CVariable *Dst, CVariable *Src0, CVariable *Src1);
void emitAddPointer(CVariable *Dst, CVariable *Src, CVariable *offset);
// emitAddPair - emulate 64-bit addition by 32-bit operations.
// Dst and Src0 must be 64-bit type variables.
// Src1 must be a 32/64-bit type variable/immediate.
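// A minimal sketch of the emulation, treating each 64-bit value as
// {hi:32, lo:32} unsigned halves:
//   Dst.lo = Src0.lo + Src1.lo
//   carry  = (Dst.lo < Src0.lo) ? 1 : 0
//   Dst.hi = Src0.hi + Src1.hi + carry
// (hardware add-with-carry variants can fold the carry into the high add).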
void emitAddPair(CVariable *Dst, CVariable *Src0, CVariable *Src1);
void emitSqrt(llvm::Instruction *inst);
void emitUmin(llvm::IntrinsicInst *inst);
void emitSmin(llvm::IntrinsicInst *inst);
void emitUmax(llvm::IntrinsicInst *inst);
void emitSmax(llvm::IntrinsicInst *inst);
void emitCanonicalize(llvm::Instruction *inst, const DstModifier &modifier);
void emitRsq(llvm::Instruction *inst);
void emitFrc(llvm::GenIntrinsicInst *inst);
void emitLLVMbswap(llvm::IntrinsicInst *inst);
void emitDP4A(llvm::GenIntrinsicInst *GII, const SSource *source = nullptr,
const DstModifier &modifier = DstModifier(), bool isAccSigned = true);
void emitLLVMStackSave(llvm::IntrinsicInst *inst);
void emitLLVMStackRestore(llvm::IntrinsicInst *inst);
void emitUnmaskedRegionBoundary(bool start);
LSC_CACHE_OPTS getDefaultRaytracingCachePolicy(bool isLoad) const;
void emitAsyncStackID(llvm::GenIntrinsicInst *I);
void emitTraceRay(llvm::TraceRayIntrinsic *I, bool RayQueryEnable);
void emitReadTraceRaySync(llvm::GenIntrinsicInst *I);
void emitRayQueryCheckRelease(llvm::GenIntrinsicInst *I, bool RayQueryCheckEnable = false,
bool RayQueryReleaseEnable = false);
void emitPreemptionDisable(llvm::PreemptionDisableIntrinsic *PDI);
void emitPreemptionEnable(llvm::PreemptionEnableIntrinsic *PEI);
enum PreemptionEncoding { PreemptionDisabled = (0 << 14), PreemptionEnabled = (1 << 14) };
static PreemptionEncoding getEncoderPreemptionMode(EPreemptionMode preemptionMode);
void emitBTD(CVariable *GlobalBufferPtr, CVariable *StackID, CVariable *ShaderRecord, CVariable *Flag,
bool releaseStackID);
void emitBindlessThreadDispatch(llvm::BTDIntrinsic *I);
void emitStackIDRelease(llvm::StackIDReleaseIntrinsic *I);
void emitGetShaderRecordPtr(llvm::GetShaderRecordPtrIntrinsic *I);
void emitGlobalBufferPtr(llvm::GenIntrinsicInst *I);
void emitLocalBufferPtr(llvm::GenIntrinsicInst *I);
void emitInlinedDataValue(llvm::GenIntrinsicInst *I);
void emitDpas(llvm::GenIntrinsicInst *GII, const SSource *source, const DstModifier &modifier);
void emitfcvt(llvm::GenIntrinsicInst *GII);
void emitSystemMemoryFence(llvm::GenIntrinsicInst *I);
void emitUrbFence();
void emitHDCuncompressedwrite(llvm::GenIntrinsicInst *I);
////////////////////////////////////////////////////////////////////
// LSC related functions
bool tryOverrideCacheOpts(LSC_CACHE_OPTS &cacheOpts, bool isLoad, bool isTGM, const llvm::Value *warningContextValue,
CacheControlOverride m_CacheControlOption) const;
LSC_CACHE_OPTS translateLSCCacheControlsEnum(LSC_L1_L3_CC l1l3cc, bool isLoad,
const llvm::Value *warningContextValue) const;
LSC_CACHE_OPTS translateLSCCacheControlsFromValue(llvm::Value *value, bool isLoad) const;
LSC_CACHE_OPTS translateLSCCacheControlsFromMetadata(llvm::Instruction *inst, bool isLoad, bool isTGM = false) const;
struct LscMessageFragmentInfo {
LSC_DATA_ELEMS fragElem;
int fragElemCount;
int addrOffsetDelta;
int grfOffsetDelta;
bool lastIsV1; // e.g. splitting a V3 up is a V2 + V1
};
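// Example: a V3 request that must be fragmented is emitted as a V2 fragment
// followed by a V1 fragment (lastIsV1 = true), with addrOffsetDelta and
// grfOffsetDelta giving the address and GRF offset steps between fragments.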
LscMessageFragmentInfo checkForLscMessageFragmentation(LSC_DATA_SIZE size, LSC_DATA_ELEMS elems) const;
// (CVariable* gatherDst, int fragIx, LSC_DATA_ELEMS fragElems, int fragImmOffset)
using LscIntrinsicFragmentEmitter = std::function<void(CVariable *, int, LSC_DATA_ELEMS, int)>;
void emitLscIntrinsicFragments(CVariable *gatherDst, LSC_DATA_SIZE dataSize, LSC_DATA_ELEMS dataElems,
int immOffsetBytes, const LscIntrinsicFragmentEmitter &emitter);
void emitLscIntrinsicLoad(llvm::GenIntrinsicInst *GII);
void emitLscIntrinsicPrefetch(llvm::GenIntrinsicInst *GII);
void emitLscSimdBlockPrefetch(llvm::GenIntrinsicInst *GII);
void emitLscIntrinsicStore(llvm::GenIntrinsicInst *GII);
void emitLscIntrinsicLoadCmask(llvm::GenIntrinsicInst *inst);
void emitLscIntrinsicStoreCmask(llvm::GenIntrinsicInst *GII);
void emitLSCFence(llvm::GenIntrinsicInst *inst);
void emitLSC2DBlockOperation(llvm::GenIntrinsicInst *inst);
void emitLSC2DBlockAddrPayload(llvm::GenIntrinsicInst *GII);
void emitLSC2DBlockReadWriteWithAddrPayload(llvm::GenIntrinsicInst *GII);
void emitLSC2DBlockSetAddrPayloadField(llvm::GenIntrinsicInst *GII);
void emitLSCAtomic(llvm::GenIntrinsicInst *inst);
void emitLSCIntrinsic(llvm::GenIntrinsicInst *GII);
void emitLSCLoad(llvm::Instruction *inst, CVariable *dst, CVariable *offset, unsigned elemSize, unsigned numElems,
unsigned blockOffset, ResourceDescriptor *resource, LSC_ADDR_SIZE addr_size,
LSC_DATA_ORDER data_order, int immOffset, int immScale);
void emitLSCLoad(LSC_CACHE_OPTS cacheOpts, CVariable *dst, CVariable *offset, unsigned elemSize, unsigned numElems,
unsigned blockOffset, ResourceDescriptor *resource, LSC_ADDR_SIZE addr_size,
LSC_DATA_ORDER data_order, int immOffset, int immScale, LSC_DOC_ADDR_SPACE addrSpace);
void emitLSCStore(llvm::Instruction *inst, CVariable *src, CVariable *offset, unsigned elemSize, unsigned numElems,
unsigned blockOffset, ResourceDescriptor *resource, LSC_ADDR_SIZE addr_size,
LSC_DATA_ORDER data_order, int immOffset, int immScale);
void emitLSCStore(LSC_CACHE_OPTS cacheOpts, CVariable *src, CVariable *offset, unsigned elemSize, unsigned numElems,
unsigned blockOffset, ResourceDescriptor *resource, LSC_ADDR_SIZE addr_size,
LSC_DATA_ORDER data_order, int immOffset, int immScale, LSC_DOC_ADDR_SPACE addrSpace);
////////////////////////////////////////////////////////////////////
// NOTE: for vector load/store instructions, the optional
// instruction argument is used to check additional constraints
static Tristate shouldGenerateLSCQuery(const CodeGenContext &Ctx, llvm::Instruction *vectorLdStInst = nullptr,
SIMDMode Mode = SIMDMode::UNKNOWN);
bool shouldGenerateLSC(llvm::Instruction *vectorLdStInst = nullptr, bool isTGM = false);
bool forceCacheCtrl(llvm::Instruction *vectorLdStInst = nullptr);
uint32_t totalBytesToStoreOrLoad(llvm::Instruction *vectorLdStInst);
void emitSrnd(llvm::GenIntrinsicInst *GII);
void emitStaticConstantPatchValue(llvm::StaticConstantPatchIntrinsic *staticConstantPatch32);
// Debug Built-Ins
void emitStateRegID(uint32_t BitStart, uint32_t BitEnd);
void emitThreadPause(llvm::GenIntrinsicInst *inst);
void MovPhiSources(llvm::BasicBlock *bb);
void InitConstant(llvm::BasicBlock *BB);
void emitLifetimeStartResourceLoopUnroll(llvm::BasicBlock *BB);
void emitLifetimeStartAtEndOfBB(llvm::BasicBlock *BB);
void emitDebugPlaceholder(llvm::GenIntrinsicInst *I);
void emitDummyInst(llvm::GenIntrinsicInst *GII);
void emitLaunder(llvm::GenIntrinsicInst *GII);
void emitImplicitArgIntrinsic(llvm::GenIntrinsicInst *I);
void emitStoreImplBufferPtr(llvm::GenIntrinsicInst *I);
void emitSetStackCallsBaseAddress(llvm::GenIntrinsicInst *I);
void emitSaveInReservedArgSpace(llvm::SaveInReservedArgSpaceIntrinsic *I);
void emitReadFromReservedArgSpace(llvm::ReadFromReservedArgSpaceIntrinsic *I);
void emitStoreLocalIdBufferPtr(llvm::GenIntrinsicInst *I);
void emitStoreGlobalBufferArg(llvm::GenIntrinsicInst *I);
void emitLoadImplBufferPtr(llvm::GenIntrinsicInst *I);
void emitLoadLocalIdBufferPtr(llvm::GenIntrinsicInst *I);
void emitLoadGlobalBufferArg(llvm::GenIntrinsicInst *I);
void emitMayUnalignedVectorCopy(CVariable *D, uint32_t D_off, CVariable *S, uint32_t S_off, llvm::Type *Ty);
Function *findStackOverflowDetectionFunction(Function *ParentFunction, bool FindInitFunction);
void emitStackOverflowDetectionCall(llvm::Function *ParentFunction, bool EmitInitFunction);
std::pair<llvm::Value *, llvm::Value *> getPairOutput(llvm::Value *) const;
// helper function
void SplitSIMD(llvm::Instruction *inst, uint numSources, uint headerSize, CVariable *payload, SIMDMode mode,
uint half);
template <size_t N> void JoinSIMD(CVariable *(&tempdst)[N], uint responseLength, SIMDMode mode);
CVariable *BroadcastIfUniform(CVariable *pVar, bool nomask = false);
bool IsNoMaskAllowed(llvm::Instruction *inst);
bool IsSubspanDestination(llvm::Instruction *inst);
uint DecideInstanceAndSlice(const llvm::BasicBlock &blk, SDAG &sdag, bool &slicing);
bool IsUndefOrZeroImmediate(const llvm::Value *value);
inline bool isUndefOrConstInt0(const llvm::Value *val) {
if (val == nullptr || llvm::isa<llvm::UndefValue>(val) ||
(llvm::isa<llvm::ConstantInt>(val) && llvm::cast<llvm::ConstantInt>(val)->getZExtValue() == 0)) {
return true;
}
return false;
}
inline llvm::Value *getOperandIfExist(llvm::Instruction *pInst, unsigned op) {
if (llvm::CallInst *pCall = llvm::dyn_cast<llvm::CallInst>(pInst)) {
if (op < IGCLLVM::getNumArgOperands(pCall)) {
return pInst->getOperand(op);
}
}
return nullptr;
}
bool IsGRFAligned(CVariable *pVar, e_alignment requiredAlign) const {
e_alignment align = pVar->GetAlign();
if (requiredAlign == EALIGN_BYTE) {
// trivial
return true;
}
if (requiredAlign == EALIGN_AUTO || align == EALIGN_AUTO) {
// Can only assume that AUTO only matches AUTO (?)
// (keep the previous behavior unchanged.)
return align == requiredAlign;
}
return align >= requiredAlign;
}
CVariable *ExtendVariable(CVariable *pVar, e_alignment uniformAlign);
CVariable *BroadcastAndExtend(CVariable *pVar);
CVariable *TruncatePointer(CVariable *pVar, bool TruncBothHalves = false);
CVariable *ReAlignUniformVariable(CVariable *pVar, e_alignment align);
CVariable *BroadcastAndTruncPointer(CVariable *pVar);
CVariable *IndexableResourceIndex(CVariable *indexVar, uint btiIndex);
ResourceDescriptor GetResourceVariable(llvm::Value *resourcePtr, bool Check = false);
SamplerDescriptor GetSamplerVariable(llvm::Value *samplerPtr);
CVariable *ComputeSampleIntOffset(llvm::Instruction *sample, uint sourceIndex);
void emitPlnInterpolation(CVariable *bary, CVariable *inputvar);
// the number of lanes of the entire dispatch. Read-only, as the value is cached for reuse.
CVariable *GetNumActiveLanes();
CVariable *CastFlagToVariable(CVariable *flag);
CVariable *GetExecutionMask();
CVariable *GetExecutionMask(CVariable *&vecMaskVar);
CVariable *GetHalfExecutionMask();
CVariable *UniformCopy(CVariable *var, bool doSub = false);
CVariable *UniformCopy(CVariable *var, CVariable *&LaneOffset, CVariable *eMask = nullptr, bool doSub = false,
bool safeGuard = false, CVariable *predicate = nullptr);
// generate a loop header to process sample instructions with varying (non-uniform) resource/sampler
bool ResourceLoopHeader(const CVariable *destination, ResourceDescriptor &resource, SamplerDescriptor &sampler,
CVariable *&flag, uint &label, uint ResourceLoopMarker = 0, int *subIteration = nullptr);
bool ResourceLoopHeader(const CVariable *destination, ResourceDescriptor &resource, CVariable *&flag, uint &label,
uint ResourceLoopMarker = 0, int *subIteration = nullptr);
bool ResourceLoopSubIteration(ResourceDescriptor &resource, SamplerDescriptor &sampler, CVariable *&flag, uint &label,
uint ResourceLoopMarker = 0, int iteration = 0, CVariable *prevFlag = nullptr);
bool ResourceLoopSubIteration(ResourceDescriptor &resource, CVariable *&flag, uint &label,
uint ResourceLoopMarker = 0, int iteration = 0, CVariable *prevFlag = nullptr);
void ResourceLoopBackEdge(bool needLoop, CVariable *flag, uint label, uint ResourceLoopMarker = 0);
bool ResourceLoopNeedsLoop(ResourceDescriptor &resource, SamplerDescriptor &sampler, CVariable *&flag,
uint ResourceLoopMarker);
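// ResourceLoop: invokes Fn inside a resource loop when the resource/sampler
// descriptors are non-uniform. On platforms with slow same-SBID loads the body
// is additionally unrolled ResourceLoopUnrollIteration times; each unrolled
// iteration handles the lanes sharing one descriptor and masks them out of the
// remaining execution mask (see the safeguard logic below).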
template <typename Func>
void ResourceLoop(ResourceDescriptor &resource, SamplerDescriptor &sampler, const Func &Fn,
uint ResourceLoopMarker = 0) {
uint label = 0;
CVariable *flag = nullptr;
// 0 - default (loop header is set up)
// 1 - first unroll (no safeguard)
// 2 - second unroll, and so on.
int subIteration = 0;
int iterations = m_pCtx->platform.hasSlowSameSBIDLoad() ? IGC_GET_FLAG_VALUE(ResourceLoopUnrollIteration) : 1;
CVariable *currentDestination = m_destination;
std::vector<std::pair<CVariable *, CVariable *>> fills;
// check if need loop
bool needLoop = ResourceLoopNeedsLoop(resource, sampler, flag, ResourceLoopMarker);
std::vector<CVariable *> cumulativeFlags;
// nested unroll won't need a loop as the resources are uniform
if (needLoop) {
// init these before the loop label
for (int iter = 0; iter < iterations - 1; iter++) {
cumulativeFlags.push_back(m_currShader->ImmToVariable(0x0, ISA_TYPE_BOOL));
}
// label resource loop
ResourceLoopHeader(currentDestination, resource, sampler, flag, label, ResourceLoopMarker, &subIteration);
}
// subIteration == 0 could mean no resource loop tag was indicated
// iterations == 1 could mean no sub-iteration unroll
if (subIteration == 0 || iterations == 1) {
// get exclusive load info from the nested loop unroll metadata
if (m_encoder->GetUniqueExclusiveLoad() && m_destination &&
IGC_IS_FLAG_DISABLED(DisableResourceLoopUnrollExclusiveLoad)) {
m_encoder->MarkAsExclusiveLoad(m_destination);
}
ResourceLoopSubIteration(resource, sampler, flag, label, ResourceLoopMarker);
Fn(flag, m_destination, resource, needLoop);
} else {
// This will be the sum of lanes that did something, so we can exit the loop
CVariable *flagSumMask = m_currShader->ImmToVariable(0x0, ISA_TYPE_BOOL);
// This will be used as remaining exec mask
CVariable *flagExecMask = nullptr;
// it's also the remaining exec mask but in dword (for fbl)
CVariable *dwordPrevFlag = GetExecutionMask(flagExecMask);
// save the original input resource, as resource will be used in Fn()
ResourceDescriptor resourceOrig = resource;
if ((iterations > 1) && IGC_IS_FLAG_DISABLED(DisableResourceLoopUnrollExclusiveLoad)) {
m_encoder->MarkAsExclusiveLoad(currentDestination);
}
for (int iter = 0; iter < iterations; iter++, subIteration++) {
CVariable *flagSameLaneFlag = nullptr;
// Use the original resource, as ResourceLoopHeader needs it non-uniform
resource = resourceOrig;
ResourceLoopSubIteration(resource, sampler, flagSameLaneFlag, label, ResourceLoopMarker, subIteration,
dwordPrevFlag);
// The first iteration does not need a safeguard.
if (iter > 0 && flagSameLaneFlag) {
// Safeguard against the case when all lanes had the same first address,
// e.g. <10 10 10 10>: everything was handled in the first iteration,
// so zero out later iterations to avoid loading the same data repeatedly
m_encoder->SetNoMask();
m_encoder->And(flagSameLaneFlag, flagSameLaneFlag, flagExecMask);
m_encoder->Push();
}
// need a temp (iter > 0) to save the unroll dst result to avoid shared SBID
Fn(flagSameLaneFlag, currentDestination, resource, needLoop);
if (flagSameLaneFlag) {
m_encoder->SetNoMask();
// Sum lanes that did something (for correct goto at the end)
m_encoder->Or(flagSumMask, flagSumMask, flagSameLaneFlag);
m_encoder->Push();
// Last iteration does not need this
if ((iter < (iterations - 1)) && flagExecMask) {
m_encoder->SetNoMask();
// mask handled lanes out of the remaining ExecMask
m_encoder->Xor(flagExecMask, flagExecMask, flagSameLaneFlag);
m_encoder->Cast(dwordPrevFlag, flagExecMask);
m_encoder->Push();
}
}
}
flag = flagSumMask;
}
ResourceLoopBackEdge(needLoop, flag, label, ResourceLoopMarker);
}
void ForceDMask(bool createJmpForDiscard = true);
void ResetVMask(bool createJmpForDiscard = true);
void setPredicateForDiscard(CVariable *pPredicate = nullptr);
void PackSIMD8HFRet(CVariable *dst);
unsigned int GetPrimitiveTypeSizeInRegisterInBits(const llvm::Type *Ty) const;
unsigned int GetPrimitiveTypeSizeInRegister(const llvm::Type *Ty) const;
unsigned int GetScalarTypeSizeInRegisterInBits(const llvm::Type *Ty) const;
unsigned int GetScalarTypeSizeInRegister(const llvm::Type *Ty) const;
/// return true if it succeeds, false otherwise.
bool setCurrentShader(llvm::Function *F);
/// checks FunctionInfo metadata and returns highest register pressure in the func group
/// metadata is stored so subroutines have combined pressure (theirs & calling context)
unsigned int getMaxRegPressureInFunctionGroup(llvm::Function *F, const IGCMD::MetaDataUtils *pM);
/// check if symbol table is needed
bool isSymbolTableRequired(llvm::Function *F);
// Arithmetic operations with constant folding
// Src0 and Src1 are the input operands
// DstPrototype is a prototype of the result of the operation and may be used for cloning to a new variable
// Returns a variable with the result of the computation, which may be one of the sources, an immediate, or a variable
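// Illustrative examples (the exact folds are implementation details): two
// immediate sources can fold to a single immediate result, and a no-op such
// as adding 0 or multiplying by 1 may simply return the other source.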
CVariable *Mul(CVariable *Src0, CVariable *Src1, const CVariable *DstPrototype);
CVariable *Add(CVariable *Src0, CVariable *Src1, const CVariable *DstPrototype);
// temporary helper function
CVariable *GetSymbol(llvm::Value *v) const;
// Check if stateless indirect access is available
// If yes, increase the counter, otherwise do nothing
void CountStatelessIndirectAccess(llvm::Value *pointer, ResourceDescriptor resource);
// An indirect access happens when the GPU loads from an address that was not directly given as one of the kernel
// arguments. It's usually a pointer loaded from memory pointed to by a kernel argument. Otherwise the access is direct.
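// E.g. for a kernel argument int **pp: "p = *pp" loads through the argument
// itself (direct), while a subsequent "*p" loads through a pointer that was
// itself loaded from memory (indirect).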
bool IsIndirectAccess(llvm::Value *value);
CVariable *GetSrcVariable(const SSource &source, bool fromConstPool = false);
void SetSourceModifiers(unsigned int sourceIndex, const SSource &source) const;
SBasicBlock *getCurrentBlock() const {
// if m_currentBlock is not set to initial value then return current block
return m_currentBlock == UINT32_MAX ? nullptr : &(m_pattern->m_blocks[m_currentBlock]);
}
CodeGenContext *m_pCtx = nullptr;
CVariable *m_destination = nullptr;
GenXFunctionGroupAnalysis *m_FGA = nullptr;
CodeGenPatternMatch *m_pattern = nullptr;
DeSSA *m_deSSA = nullptr;
BlockCoalescing *m_blockCoalescing = nullptr;
const SIMDMode m_SimdMode;
const ShaderDispatchMode m_ShaderDispatchMode;
CShaderProgram::KernelShaderMap &m_shaders;
CShader *m_currShader;
CEncoder *m_encoder;
const llvm::DataLayout *m_DL = nullptr;
CoalescingEngine *m_CE = nullptr;
VariableReuseAnalysis *m_VRA = nullptr;
ResourceLoopAnalysis *m_RLA = nullptr;
CollectLoopCount *m_CLC = nullptr;
ModuleMetaData *m_moduleMD = nullptr;
bool m_canAbortOnSpill;
PSSignature *const m_pSignature;
llvm::DenseSet<llvm::Value *> m_alreadyInitializedPHI;
// Debug info emitter
IDebugEmitter *m_pDebugEmitter = nullptr;
llvm::DominatorTree *m_pDT = nullptr;
static char ID;
inline void ContextSwitchPayloadSection(bool first = true);
inline void ContextSwitchShaderBody(bool last = true);
private:
uint m_labelForDMaskJmp = 0;
llvm::DenseMap<llvm::Instruction *, bool> instrMap;
// caching the number of instances for the current inst.
int16_t m_currInstNumInstances = -1;
inline void resetCurrInstNumInstances() { m_currInstNumInstances = -1; }
inline void setCurrInstNumInstances(int16_t aV) { m_currInstNumInstances = aV; }
inline int16_t getCurrInstNumInstances() const { return m_currInstNumInstances; }
// Current rounding Mode
// As RM of FPCvtInt and FP could be different, there
// are two fields to keep track of their current values.
//
// Default rounding modes:
// the rounding modes that are pre-defined by each API or
// shaders/kernels.
//
// Not all combinations of FP's RM and FPCvtInt's RM can be
// used as default. Currently, the default RMs have the
// following restrictions:
// 1. If FPCvtInt's RM = ROUND_TO_ZERO, FP's RM can be any;
// 2. Otherwise, FPCvtInt's RM must be the same as FP's RM
//
// The default remains unchanged throughout the entire
// shader/kernel. Dynamically setting a different default
// rounding mode in the middle of a shader/kernel is not
// supported for now.
//
// However, each instruction's RM can be set dynamically,
// such as via intrinsics. If an instruction needs setting RMs,
// its RMs must follow the above restrictions. So far, an
// instruction either relies on FP's RM or FPCvtInt's RM, but
// not both, thus setting an instruction's RM dynamically
// cannot violate the above restrictions.
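// For example, FPCvtInt = ROUND_TO_ZERO with FP = round-to-nearest-even is a
// valid default (restriction 1); FPCvtInt = FP = round-to-negative is valid
// (restriction 2); FPCvtInt = round-to-nearest-even with FP = ROUND_TO_ZERO
// is not a valid default.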
//
ERoundingMode m_roundingMode_FP;
ERoundingMode m_roundingMode_FPCvtInt;
uint m_currentBlock = UINT32_MAX;
bool m_currFuncHasSubroutine = false;
bool m_canGenericPointToPrivate = false;
bool m_canGenericPointToLocal = false;
typedef struct {
CVariable *var;
CVariable *broadcastedVar[2];
llvm::BasicBlock *BB;
} ConstVectorStoreData;
llvm::DenseMap<llvm::Constant *, ConstVectorStoreData> m_constantVectorStores;
// Used to relocate phi-mov to different BB. phiMovToBB is the map from "fromBB"
// to "toBB" (meaning to move phi-mov from "fromBB" to "toBB"). See MovPhiSources.
llvm::DenseMap<llvm::BasicBlock *, llvm::BasicBlock *> phiMovToBB;
// Used to check the constraint types against the actual LLVM IR params of inline ASM instructions
bool validateInlineAsmConstraints(llvm::CallInst *inst, llvm::SmallVector<llvm::StringRef, 8> &constraints);
void emitGetMessagePhaseType(llvm::GenIntrinsicInst *inst, VISA_Type type, uint32_t width);
void emitSetMessagePhaseType(llvm::GenIntrinsicInst *inst, VISA_Type type);
void emitSetMessagePhaseType_legacy(llvm::GenIntrinsicInst *inst, VISA_Type type);
void emitScan(llvm::Value *Src, IGC::WaveOps Op, bool isInclusiveScan, llvm::Value *Mask, bool isQuad,
bool noMask = false);
// Cached per lane offset variables. This is a per basic block data
// structure. For each entry, the first item is the scalar type size in
// bytes, the second item is the corresponding symbol.
llvm::SmallVector<std::pair<unsigned, CVariable *>, 4> PerLaneOffsetVars;
// Helper function to reduce common code for emitting indirect address
// computation.
CVariable *getOrCreatePerLaneOffsetVariable(unsigned TypeSizeInBytes) {
for (const auto &Item : PerLaneOffsetVars) {
if (Item.first == TypeSizeInBytes) {
IGC_ASSERT_MESSAGE(Item.second, "null variable");
return Item.second;
}
}
CVariable *Var = m_currShader->GetPerLaneOffsetsReg(TypeSizeInBytes);
PerLaneOffsetVars.push_back(std::make_pair(TypeSizeInBytes, Var));
return Var;
}
// If a constant vector is stored and there is already a var instance for it,
// try reusing it (if it was defined in the same basic block),
// or create a new var instance and make it available for reuse in further stores
CVariable *tryReusingConstVectorStoreData(llvm::Value *storedVal, llvm::BasicBlock *BB, bool isBroadcast);
CVariable *tryReusingXYZWPayload(llvm::Value *storedVal, llvm::BasicBlock *BB, unsigned numElems, VISA_Type type,
CVariable *pSrc_X, CVariable *pSrc_Y, CVariable *pSrc_Z, CVariable *pSrc_W,
const unsigned int numEltGRF);
// Emit code in slice starting from (reverse) iterator I. Return the
// iterator to the next pattern to emit.
SBasicBlock::reverse_iterator emitInSlice(SBasicBlock &block, SBasicBlock::reverse_iterator I);
/**
* Reuse SamplerDescriptor for sampleID, so that we can pass it to
* ResourceLoop to generate a loop for non-uniform values.
*/
inline SamplerDescriptor getSampleIDVariable(llvm::Value *sampleIdVar) {
SamplerDescriptor sampler;
sampler.m_sampler = GetSymbol(sampleIdVar);
return sampler;
}
CVariable *UnpackOrBroadcastIfUniform(CVariable *pVar);
int getGRFSize() const { return m_currShader->getGRFSize(); }
void initDefaultRoundingMode();
void SetRoundingMode_FP(ERoundingMode RM_FP);
void SetRoundingMode_FPCvtInt(ERoundingMode RM_FPCvtInt);
void ResetRoundingMode(llvm::Instruction *inst);
// A64 load/store with a HW workaround that makes sure the high part of the offset is the same per LS call
// addrUniform: if the load/store address is uniform, we can skip the A64 WA
void emitGatherA64(llvm::Value *loadInst, CVariable *dst, CVariable *offset, unsigned elemSize, unsigned numElems,
bool addrUniform);
void emitGather4A64(llvm::Value *loadInst, CVariable *dst, CVariable *offset, bool addrUniform);
void emitScatterA64(CVariable *val, CVariable *offset, unsigned elementSize, unsigned numElems, bool addrUniform);
void emitScatter4A64(CVariable *src, CVariable *offset, bool addrUniform);
// Helper functions that create the loop for the above WA
void A64LSLoopHead(CVariable *addr, CVariable *&curMask, CVariable *&lsPred, uint &label);
void A64LSLoopTail(CVariable *curMask, CVariable *lsPred, uint label);
// Helper function to check if A64 WA is required
bool hasA64WAEnable() const;
bool shouldForceEarlyRecompile(IGCMD::MetaDataUtils *pMdUtils, llvm::Function *F);
bool isHalfGRFReturn(CVariable *dst, SIMDMode simdMode);
void emitFeedbackEnable();
void emitAddrSpaceToGenericCast(llvm::AddrSpaceCastInst *addrSpaceCast, CVariable *srcV, unsigned tag);
// used for loading/storing a uniform value using scatter/gather messages.
CVariable *prepareAddressForUniform(CVariable *AddrVar, uint32_t EltBytes, uint32_t NElts, uint32_t ExecSz,
e_alignment Align);
CVariable *prepareDataForUniform(CVariable *DataVar, uint32_t ExecSz, e_alignment Align);
// sub-function of vector load/store
void emitLSCVectorLoad_subDW(LSC_CACHE_OPTS CacheOpts, bool UseA32, ResourceDescriptor &Resource, CVariable *Dest,
CVariable *Offset, int ImmOffset, int ImmScale, uint32_t NumElts, uint32_t EltBytes,
LSC_DOC_ADDR_SPACE AddrSpace, LSC_ADDR_SIZE AddrSize,
CVariable *inputPredicate = nullptr, CVariable *mergeVal = nullptr);
void emitLSCVectorLoad_uniform(LSC_CACHE_OPTS CacheOpts, bool UseA32, ResourceDescriptor &Resource, CVariable *Dest,
CVariable *Offset, int ImmOffset, int ImmScale, uint32_t NumElts, uint32_t EltBytes,
uint64_t Align, uint32_t Addrspace, LSC_DOC_ADDR_SPACE UserAddrSpace,
LSC_ADDR_SIZE AddrSize, CVariable *inputPredicate = nullptr,
CVariable *mergeVal = nullptr);
void emitLSCVectorStore_subDW(LSC_CACHE_OPTS CacheOpts, bool UseA32, ResourceDescriptor &Resource,
CVariable *StoreVar, CVariable *Offset, int ImmOffset, int ImmScale, uint32_t NumElts,
uint32_t EltBytes, alignment_t Align, LSC_DOC_ADDR_SPACE AddrSpace,
LSC_ADDR_SIZE AddrSize, llvm::Value *predicate = nullptr);
void emitLSCVectorStore_uniform(LSC_CACHE_OPTS CacheOpts, bool UseA32, ResourceDescriptor &Resource,
CVariable *StoreVar, CVariable *Offset, int ImmOffset, int ImmScale, uint32_t NumElts,
uint32_t EltBytes, alignment_t Align, LSC_DOC_ADDR_SPACE AddrSpace,
LSC_ADDR_SIZE AddrSize, llvm::Value *predicate = nullptr);
LSC_FENCE_OP getLSCMemoryFenceOp(bool IsGlobalMemFence, bool InvalidateL1, bool EvictL1) const;
CVariable *getStackSizePerThread(llvm::Function *parentFunc);
uint32_t getReqBlkBitsForBlockStLd(llvm::CallInst *call);
bool m_isDuplicate;
CVariable *m_tmpDest = nullptr;
std::set<CoalescingEngine::CCTuple *> lifetimeStartAdded;
std::tuple<CVariable *, CVariable *, CVariable *> addToCachedPayloadUVR(CVariable *pU, CVariable *pV, CVariable *pR);
std::map<std::tuple<Value *, Value *, Value *>, std::tuple<CVariable *, CVariable *, CVariable *>> atomic_shared_pUVR;
};
} // namespace IGC