Files
intel-graphics-compiler/IGC/Compiler/CISACodeGen/OpenCLKernelCodeGen.hpp
Krause, Michal d19cdc5a52 Refactor ZEBinary flags and documentation
Refactored all conditions based on enableZEBinary() and supportsZEBin(), as if they were always true. Removed said conditions.
2025-08-13 09:05:48 +02:00

226 lines
9.5 KiB
C++

/*========================== begin_copyright_notice ============================
Copyright (C) 2017-2023 Intel Corporation
SPDX-License-Identifier: MIT
============================= end_copyright_notice ===========================*/
#pragma once
#include "Compiler/CISACodeGen/ComputeShaderBase.hpp"
#include "Compiler/CISACodeGen/OpenCLOptions.hpp"
// Forward declaration: this header only refers to KernelArg through pointers
// and references, so the full definition is not required here.
namespace IGC {
class KernelArg;
} // namespace IGC
namespace IGC {
// Code generation context specialised for OpenCL programs. It extends
// CodeGenContext with the program-level output object, the parsed option sets
// and a number of OpenCL-specific queries that are implemented out of line.
class OpenCLProgramContext : public CodeGenContext {
public:
  // Output: shader information.
  iOpenCL::CGen8OpenCLProgram m_programOutput;
  SOpenCLProgramInfo m_programInfo;
  const InternalOptions m_InternalOptions;
  const Options m_Options;
  bool isSpirV;
  float m_ProfilingTimerResolution = 0.0f;
  bool m_ShouldUseNonCoherentStatelessBTI;
  uint32_t m_numUAVs = 0;

private:
  // Private memory may be placed in the same address space as global memory so
  // that private accesses can be handled like global ones; this avoids the cost
  // of resolving generic address spaces dynamically. That shortcut is not valid
  // when a module calls the `to_global` or `to_private` OpenCL builtins, because
  // conformity then requires telling private and global pointers apart.
  // The flag below becomes true once IGC finds such a builtin call in the module
  // that it could not resolve statically at compile time.
  bool m_mustDistinguishBetweenPrivateAndGlobalPtr = false;

public:
  // Extra textual visaasm inputs that have to be linked in.
  std::vector<const char *> m_VISAAsmToLink;
  // Names of functions that must be emitted as direct calls.
  std::unordered_set<std::string> m_DirectCallFunctions;
  SComputeShaderWalkOrder m_walkOrderStruct;

  // Builds the context and, when pInputArgs is non-null, records the optional
  // visaasm-to-link pointers (stored as given) and the direct-call function
  // names (copied into std::string) that it carries.
  OpenCLProgramContext(const COCLBTILayout &btiLayout, const CPlatform &platform,
                       const TC::STB_TranslateInputArgs *pInputArgs, const CDriverInfo &driverInfo,
                       LLVMContextWrapper *llvmContext = nullptr, bool shouldUseNonCoherentStatelessBTI = false,
                       const bool createResourceDimTypes = true)
      : CodeGenContext(ShaderType::OPENCL_SHADER, btiLayout, platform, driverInfo, createResourceDimTypes, llvmContext),
        m_programOutput(platform.getPlatformInfo(), *this), m_InternalOptions(pInputArgs), m_Options(pInputArgs),
        isSpirV(false), m_ShouldUseNonCoherentStatelessBTI(shouldUseNonCoherentStatelessBTI) {
    // Nothing to collect without translation input arguments.
    if (pInputArgs == nullptr) {
      return;
    }

    if (pInputArgs->pVISAAsmToLinkArray) {
      const auto asmCount = pInputArgs->NumVISAAsmsToLink;
      for (uint32_t asmIdx = 0; asmIdx < asmCount; ++asmIdx) {
        m_VISAAsmToLink.push_back(pInputArgs->pVISAAsmToLinkArray[asmIdx]);
      }
    }

    if (pInputArgs->pDirectCallFunctions) {
      const auto funcCount = pInputArgs->NumDirectCallFunctions;
      for (uint32_t funcIdx = 0; funcIdx < funcCount; ++funcIdx) {
        m_DirectCallFunctions.insert(pInputArgs->pDirectCallFunctions[funcIdx]);
      }
    }
  }

  // Queries and setters declared here and defined out of line; the ones marked
  // `override` replace virtuals inherited from the base context.
  bool isSPIRV() const;
  void setAsSPIRV();
  float getProfilingTimerResolution();
  uint32_t getNumGRFPerThread(bool returnDefault = true) override;
  int32_t getNumThreadsPerEU() const override;
  uint32_t getExpGRFSize() const override;
  bool forceGlobalMemoryAllocation() const override;
  bool allocatePrivateAsGlobalBuffer() const override;
  bool noLocalToGenericOptionEnabled() const override;
  bool mustDistinguishBetweenPrivateAndGlobalPtr() const override;
  void setDistinguishBetweenPrivateAndGlobalPtr(bool);
  bool enableTakeGlobalAddress() const override;
  int16_t getVectorCoalescingControl() const override;
  uint32_t getPrivateMemoryMinimalSizePerThread() const override;
  uint32_t getIntelScratchSpacePrivateMemoryMinimalSizePerThread() const override;
  bool isBufferBoundsChecking() const override;
  void failOnSpills();
  bool needsDivergentBarrierHandling() const;
  unsigned GetSlmSizePerSubslice();
  float GetSpillThreshold(SIMDMode dispatchSize);
  bool isAutoGRFSelectionEnabled() const override;
  uint64_t getMinimumValidAddress() const override;

  // Forwards the retry reset to the program output object.
  void clearBeforeRetry() { m_programOutput.clearBeforeRetry(); }

private:
  llvm::DenseMap<llvm::Function *, std::string> m_hashes_per_kernel;
};
class COpenCLKernel : public CComputeShaderBase {
public:
friend class CShaderProgram;
COpenCLKernel(OpenCLProgramContext *ctx, llvm::Function *, CShaderProgram *pProgram);
~COpenCLKernel();
COpenCLKernel(const COpenCLKernel &) = delete;
COpenCLKernel &operator=(const COpenCLKernel &) = delete;
void PreCompile() override;
void AllocatePayload() override;
void ParseShaderSpecificOpcode(llvm::Instruction *inst) override;
void ExtractGlobalVariables() override {}
bool hasReadWriteImage(llvm::Function &F) override;
bool CompileSIMDSize(SIMDMode simdMode, EmitPass &EP, llvm::Function &F) override;
SIMDStatus checkSIMDCompileConds(SIMDMode simdMode, EmitPass &EP, llvm::Function &F, bool hasSyncRTCalls);
SIMDStatus checkSIMDCompileCondsForMin16(SIMDMode simdMode, EmitPass &EP, llvm::Function &F, bool hasSyncRTCalls);
bool IsRegularGRFRequested() override;
bool IsLargeGRFRequested() override;
int getAnnotatedNumThreads() override;
void FillKernel(SIMDMode simdMode);
// Recomputes the binding table layout according to the present kernel args
void RecomputeBTLayout();
bool HasFullDispatchMask() override;
// Returns the immediate value mapped to GlobalVariable c.
// (GlobalVariables represent the pointer to the global,
// which is a compile-time constant)
unsigned int GetSLMMappingValue(llvm::Value *c) override;
CVariable *GetSLMMapping(llvm::Value *c) override;
const SOpenCLKernelInfo &getKernelInfo() const { return m_kernelInfo; }
static bool IsValidShader(COpenCLKernel *shader);
static bool IsVisaCompiledSuccessfullyForShader(COpenCLKernel *shader);
static bool IsVisaCompileStatusFailureForShader(COpenCLKernel *shader);
GenericShaderState m_State;
public:
SOpenCLProgramInfo *m_programInfo;
SOpenCLKernelInfo m_kernelInfo;
SOpenCLKernelCostExpInfo m_kernelCostexpInfo;
unsigned int m_perWIStatelessPrivateMemSize;
bool GetDisableMidThreadPreemption() const { return m_disableMidThreadPreemption; }
void SetDisableMidthreadPreemption() { m_disableMidThreadPreemption = true; }
bool passNOSInlineData() override;
bool loadThreadPayload() override;
protected:
// keep track of the pointer arguments' addrspace and access_type for
// setting the correct attributes to their corresponding bindless offset arguments
typedef std::tuple<zebin::PreDefinedAttrGetter::ArgAddrSpace, zebin::PreDefinedAttrGetter::ArgAccessType,
zebin::PreDefinedAttrGetter::ArgType>
PtrArgAttrType;
typedef std::map<uint32_t, PtrArgAttrType> PtrArgsAttrMapType;
protected:
// Fill SOpenCLKernelInfo::m_zePayloadArgs
// Return true: if the argument is supported in ZEBinary and it's created successfully
// Return false: if the argument cannot be supported by ZEBinary
bool CreateZEPayloadArguments(IGC::KernelArg *kernelArg, uint payloadPosition, PtrArgsAttrMapType &ptrArgsAttrMap);
// Fill SOpenCLKernelInfo::m_zeUserAttribute for ZEBinary
void FillZEUserAttributes(IGC::IGCMD::FunctionInfoMetaDataHandle &funcInfoMD);
// Fill SOpenCLKernelInfo::m_zeKernelArgInfo for ZEBinary
void FillZEKernelArgInfo();
// a helper function to get image type from kernelArg
iOpenCL::IMAGE_MEMORY_OBJECT_TYPE getImageTypeFromKernelArg(const KernelArg &kernelArg);
// a helper function to get sampler type from kernelArg
iOpenCL::SAMPLER_OBJECT_TYPE getSamplerTypeFromKernelArg(const KernelArg &kernelArg);
// Creates annotations for inline sampler_t objects
void CreateZEInlineSamplerAnnotations();
// A helper function to get vector type hint string for filling user attributes
std::string getVecTypeHintTypeString(const IGC::IGCMD::VectorTypeHintMetaDataHandle &vecTypeHintInfo) const;
// Load from MD and return the resource information for argument number argNo
SOpenCLKernelInfo::SResourceInfo getResourceInfo(int argNo);
// Load from MD and return the resource extension information for argument number argNo
ResourceExtensionTypeEnum getExtensionInfo(int argNo);
// Resolve the binding table index for resource resInfo (using the BTL)
unsigned int getBTI(SOpenCLKernelInfo::SResourceInfo &resInfo);
// Find the sum of inline local sizes used by this kernel
unsigned int getSumFixedTGSMSizes(llvm::Function *F);
bool m_HasTID;
bool m_HasGlobalSize;
bool m_disableMidThreadPreemption;
bool m_largeGRFRequested;
bool m_regularGRFRequested;
int m_annotatedNumThreads;
// Maps GlobalVariables representing local address-space pointers
// to their offsets in SLM.
std::map<llvm::Value *, unsigned int> m_localOffsetsMap;
OpenCLProgramContext *m_Context;
private:
WorkGroupWalkOrderMD getWorkGroupWalkOrder();
void tryHWGenerateLocalIDs();
// helper functions for collecting kernel argument info
// Format the strings the way the OpenCL runtime expects them
std::string getKernelArgTypeName(const FunctionMetaData &funcMD, uint argIndex) const;
std::string getKernelArgTypeQualifier(const FunctionMetaData &funcMD, uint argIndex) const;
std::string getKernelArgAddressQualifier(const FunctionMetaData &funcMD, uint argIndex) const;
std::string getKernelArgAccessQualifier(const FunctionMetaData &funcMD, uint argIndex) const;
// Helper function to get SIMD size specified in intel_reqd_sub_group_size attribute
uint32_t getReqdSubGroupSize(llvm::Function &F, IGC::IGCMD::MetaDataUtils *MDUtils) const;
uint32_t getMaxPressure(llvm::Function &F, IGC::IGCMD::MetaDataUtils *MDUtils) const;
bool isUnusedArg(KernelArg &arg) const;
};
// Free function declared here and defined elsewhere; operates on the given
// OpenCL program context.
// NOTE(review): presumably drives code generation for the program held by
// ctx — confirm against the definition.
void CodeGen(OpenCLProgramContext *ctx);
} // namespace IGC