/*
 * Copyright (C) 2018-2024 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#pragma once
#include "shared/source/command_stream/csr_properties_flags.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/device/device.h"
#include "shared/source/helpers/aux_translation.h"
#include "shared/source/helpers/vec.h"
#include "shared/source/kernel/implicit_args_helper.h"
#include "shared/source/kernel/kernel_execution_type.h"
#include "shared/source/program/kernel_info.h"
#include "shared/source/unified_memory/unified_memory.h"
#include "shared/source/utilities/logger.h"

#include "opencl/extensions/public/cl_ext_private.h"
#include "opencl/source/cl_device/cl_device.h"
#include "opencl/source/kernel/kernel_objects_for_aux_translation.h"

#include <map>
#include <vector>

namespace NEO {
class MemObj;
class TimestampPacketContainer;
class Context;
class Program;
struct ImplicitArgs;
enum class AllocationType;
struct PatchInfoData;
struct CompletionStamp;
class Buffer;
class CommandQueue;
class CommandStreamReceiver;
class GraphicsAllocation;
class Surface;
class PrintfHandler;
class MultiDeviceKernel;
class LocalIdsCache;

class Kernel : public ReferenceTrackedObject<Kernel> {
  public:
    static const uint32_t kernelBinaryAlignment = 64;

    enum KernelArgType {
        NONE_OBJ,
        IMAGE_OBJ,
        BUFFER_OBJ,
        PIPE_OBJ,
        SVM_OBJ,
        SVM_ALLOC_OBJ,
        SAMPLER_OBJ,
        ACCELERATOR_OBJ,
        DEVICE_QUEUE_OBJ,
        SLM_OBJ
    };

    struct SimpleKernelArgInfo {
        cl_mem_flags svmFlags;
        void *object;
        const void *value;
        size_t size;
        GraphicsAllocation *svmAllocation;
        KernelArgType type;
        uint32_t allocId;
        uint32_t allocIdMemoryManagerCounter;
        bool isPatched = false;
        bool isStatelessUncacheable = false;
        bool isSetToNullptr = false;
    };

    enum class TunningStatus {
        standardTunningInProgress,
        subdeviceTunningInProgress,
        tunningDone
    };

    enum class TunningType {
        disabled,
        simple,
        full
    };

    typedef int32_t (Kernel::*KernelArgHandler)(uint32_t argIndex, size_t argSize, const void *argVal);

    template <typename KernelType, typename ProgramType>
    static KernelType *create(ProgramType *program, const KernelInfo &kernelInfo, ClDevice &clDevice, cl_int &errcodeRet) {
        cl_int retVal;
        KernelType *pKernel = nullptr;

        pKernel = new KernelType(program, kernelInfo, clDevice);
        retVal = pKernel->initialize();

        if (retVal != CL_SUCCESS) {
            delete pKernel;
            pKernel = nullptr;
        }
        errcodeRet = retVal;

        if (fileLoggerInstance().enabled()) {
            std::string source;
            program->getSource(source);
            fileLoggerInstance().dumpKernel(kernelInfo.kernelDescriptor.kernelMetadata.kernelName, source);
        }

        return pKernel;
    }

    Kernel &operator=(const Kernel &) = delete;
    Kernel(const Kernel &) = delete;

    ~Kernel() override;

    static bool isMemObj(KernelArgType kernelArg) {
        return kernelArg == BUFFER_OBJ || kernelArg == IMAGE_OBJ || kernelArg == PIPE_OBJ;
    }

    bool isAuxTranslationRequired() const { return auxTranslationRequired; }
    void setAuxTranslationRequired(bool onOff) { auxTranslationRequired = onOff; }
    void updateAuxTranslationRequired();

    ArrayRef<uint8_t> getCrossThreadDataRef() {
        return ArrayRef<uint8_t>(reinterpret_cast<uint8_t *>(crossThreadData), crossThreadDataSize);
    }
    char *getCrossThreadData() const { return crossThreadData; }
    uint32_t getCrossThreadDataSize() const { return crossThreadDataSize; }

    cl_int initialize();
    MOCKABLE_VIRTUAL cl_int cloneKernel(Kernel *pSourceKernel);
    MOCKABLE_VIRTUAL bool isPatched() const;

    // API entry points
    cl_int setArgument(uint32_t argIndex, size_t argSize, const void *argVal) { return setArg(argIndex, argSize, argVal); }
    cl_int setArgSvm(uint32_t argIndex, size_t svmAllocSize, void *svmPtr, GraphicsAllocation *svmAlloc, cl_mem_flags svmFlags);
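    // Construction is funneled through the create() factory above: initialize()
    // runs immediately after allocation, and a kernel that fails to initialize is
    // deleted before the factory returns, so callers only ever observe a fully
    // initialized kernel or nullptr plus the error code. A minimal caller-side
    // sketch, assuming a concrete KernelType with an accessible constructor
    // (illustrative only; the real call sites live in the API/program layer):
    //
    //   cl_int errcode = CL_SUCCESS;
    //   auto *pKernel = Kernel::create<KernelType, Program>(pProgram, kernelInfo, clDevice, errcode);
    //   if (errcode != CL_SUCCESS) {
    //       return errcode; // pKernel is nullptr; create() already freed the partial object
    //   }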
    MOCKABLE_VIRTUAL cl_int setArgSvmAlloc(uint32_t argIndex, void *svmPtr, GraphicsAllocation *svmAlloc, uint32_t allocId);

    void setSvmKernelExecInfo(GraphicsAllocation *argValue);
    void clearSvmKernelExecInfo();

    cl_int getInfo(cl_kernel_info paramName, size_t paramValueSize,
                   void *paramValue, size_t *paramValueSizeRet) const;

    cl_int getArgInfo(cl_uint argIndx, cl_kernel_arg_info paramName,
                      size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet) const;

    cl_int getWorkGroupInfo(cl_kernel_work_group_info paramName,
                            size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet) const;

    cl_int getSubGroupInfo(cl_kernel_sub_group_info paramName,
                           size_t inputValueSize, const void *inputValue,
                           size_t paramValueSize, void *paramValue,
                           size_t *paramValueSizeRet) const;

    const void *getKernelHeap() const;
    void *getSurfaceStateHeap() const;
    const void *getDynamicStateHeap() const;

    size_t getKernelHeapSize() const;
    size_t getSurfaceStateHeapSize() const;
    size_t getDynamicStateHeapSize() const;
    size_t getNumberOfBindingTableStates() const;
    size_t getBindingTableOffset() const {
        return localBindingTableOffset;
    }

    void resizeSurfaceStateHeap(void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset);

    void substituteKernelHeap(void *newKernelHeap, size_t newKernelHeapSize);
    bool isKernelHeapSubstituted() const;
    uint64_t getKernelId() const;
    void setKernelId(uint64_t newKernelId);
    uint32_t getStartOffset() const;
    void setStartOffset(uint32_t offset);

    const std::vector<SimpleKernelArgInfo> &getKernelArguments() const {
        return kernelArguments;
    }

    size_t getKernelArgsNumber() const {
        return kernelArguments.size();
    }

    bool usesBindfulAddressingForBuffers() const {
        return KernelDescriptor::BindfulAndStateless == kernelInfo.kernelDescriptor.kernelAttributes.bufferAddressingMode;
    }

    inline const KernelDescriptor &getDescriptor() const {
        return kernelInfo.kernelDescriptor;
    }
    inline const KernelInfo &getKernelInfo() const {
        return kernelInfo;
    }

    Context &getContext() const;

    Program *getProgram() const { return program; }

    uint32_t getScratchSize(uint32_t slotId) {
        return kernelInfo.kernelDescriptor.kernelAttributes.perThreadScratchSize[slotId];
    }

    bool usesSyncBuffer() const;
    void patchSyncBuffer(GraphicsAllocation *gfxAllocation, size_t bufferOffset);
    void *patchBindlessSurfaceState(NEO::GraphicsAllocation *alloc, uint32_t bindless);
    uint32_t getSurfaceStateIndexForBindlessOffset(NEO::CrossThreadDataOffset bindlessOffset) const;
    void patchBindlessOffsetsForImplicitArgs(uint64_t bindlessSurfaceStateBaseOffset) const;
    void patchBindlessOffsetsInCrossThreadData(uint64_t bindlessSurfaceStateBaseOffset) const;

    // Helpers
    cl_int setArg(uint32_t argIndex, uint32_t argValue);
    cl_int setArg(uint32_t argIndex, uint64_t argValue);
    cl_int setArg(uint32_t argIndex, cl_mem argValue);
    cl_int setArg(uint32_t argIndex, cl_mem argValue, uint32_t mipLevel);
    cl_int setArg(uint32_t argIndex, size_t argSize, const void *argVal);

    // Handlers
    void setKernelArgHandler(uint32_t argIndex, KernelArgHandler handler);

    void unsetArg(uint32_t argIndex);

    cl_int setArgImmediate(uint32_t argIndex, size_t argSize, const void *argVal);
    cl_int setArgBuffer(uint32_t argIndex, size_t argSize, const void *argVal);
    cl_int setArgPipe(uint32_t argIndex, size_t argSize, const void *argVal);
    cl_int setArgImage(uint32_t argIndex, size_t argSize, const void *argVal);
    cl_int setArgImageWithMipLevel(uint32_t argIndex, size_t argSize, const void *argVal, uint32_t mipLevel);
    cl_int setArgLocal(uint32_t argIndex, size_t argSize, const void *argVal);
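    // Each argument slot resolves to one of the typed setters above through the
    // KernelArgHandler member-function-pointer table (kernelArgHandlers), which is
    // filled during initialize() from the argument metadata in the kernel
    // descriptor. A minimal sketch of that dispatch, assuming a handler was
    // registered for argIndex (illustrative only; the real routing lives in
    // kernel.cpp):
    //
    //   cl_int Kernel::setArg(uint32_t argIndex, size_t argSize, const void *argVal) {
    //       auto handler = kernelArgHandlers[argIndex]; // e.g. &Kernel::setArgBuffer
    //       return (this->*handler)(argIndex, argSize, argVal);
    //   }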
    cl_int setArgSampler(uint32_t argIndex, size_t argSize, const void *argVal);
    cl_int setArgAccelerator(uint32_t argIndex, size_t argSize, const void *argVal);

    void storeKernelArg(uint32_t argIndex, KernelArgType argType, void *argObject,
                        const void *argValue, size_t argSize,
                        GraphicsAllocation *argSvmAlloc = nullptr, cl_mem_flags argSvmFlags = 0);
    void storeKernelArgAllocIdMemoryManagerCounter(uint32_t argIndex, uint32_t allocIdMemoryManagerCounter);
    const void *getKernelArg(uint32_t argIndex) const;
    const SimpleKernelArgInfo &getKernelArgInfo(uint32_t argIndex) const;

    bool getAllowNonUniform() const;
    bool isVmeKernel() const { return kernelInfo.kernelDescriptor.kernelAttributes.flags.usesVme; }
    bool requiresSystolicPipelineSelectMode() const { return systolicPipelineSelectMode; }

    void performKernelTuning(CommandStreamReceiver &commandStreamReceiver, const Vec3<size_t> &lws, const Vec3<size_t> &gws, const Vec3<size_t> &offsets, TimestampPacketContainer *timestampContainer);
    MOCKABLE_VIRTUAL bool isSingleSubdevicePreferred() const;

    void setInlineSamplers();

    // residency for kernel surfaces
    MOCKABLE_VIRTUAL void makeResident(CommandStreamReceiver &commandStreamReceiver);
    MOCKABLE_VIRTUAL void getResidency(std::vector<Surface *> &dst);
    void resetSharedObjectsPatchAddresses();
    bool isUsingSharedObjArgs() const { return usingSharedObjArgs; }
    bool hasUncacheableStatelessArgs() const { return statelessUncacheableArgsCount > 0; }

    bool hasPrintfOutput() const;

    cl_int checkCorrectImageAccessQualifier(cl_uint argIndex,
                                            size_t argSize,
                                            const void *argValue) const;

    static uint32_t dummyPatchLocation;

    uint32_t allBufferArgsStateful = CL_TRUE;

    bool isBuiltIn = false;

    KernelExecutionType getExecutionType() const {
        return executionType;
    }

    bool is32Bit() const {
        return kernelInfo.kernelDescriptor.kernelAttributes.gpuPointerSize == 4;
    }

    size_t getPerThreadSystemThreadSurfaceSize() const {
        return kernelInfo.kernelDescriptor.kernelAttributes.perThreadSystemThreadSurfaceSize;
    }

    std::vector<PatchInfoData> &getPatchInfoDataList() { return patchInfoDataList; }
    bool usesImages() const {
        return usingImages;
    }
    bool usesOnlyImages() const {
        return usingImagesOnly;
    }

    std::unique_ptr<KernelObjsForAuxTranslation> fillWithKernelObjsForAuxTranslation();

    void setAuxTranslationDirection(AuxTranslationDirection auxTranslationDirection) {
        this->auxTranslationDirection = auxTranslationDirection;
    }
    void setUnifiedMemorySyncRequirement(bool isUnifiedMemorySyncRequired) {
        this->isUnifiedMemorySyncRequired = isUnifiedMemorySyncRequired;
    }
    void setUnifiedMemoryProperty(cl_kernel_exec_info infoType, bool infoValue);
    void setUnifiedMemoryExecInfo(GraphicsAllocation *argValue);
    void clearUnifiedMemoryExecInfo();

    bool areStatelessWritesUsed() { return containsStatelessWrites; }
    int setKernelThreadArbitrationPolicy(uint32_t propertyValue);
    cl_int setKernelExecutionType(cl_execution_info_kernel_type_intel executionType);
    void getSuggestedLocalWorkSize(const cl_uint workDim, const size_t *globalWorkSize, const size_t *globalWorkOffset,
                                   size_t *localWorkSize);
    uint32_t getMaxWorkGroupCount(const cl_uint workDim, const size_t *localWorkSize, const CommandQueue *commandQueue, bool forceSingleTileQuery) const;

    uint64_t getKernelStartAddress(const bool localIdsGenerationByRuntime, const bool kernelUsesLocalIds, const bool isCssUsed, const bool returnFullAddress) const;

    void setAdditionalKernelExecInfo(uint32_t additionalKernelExecInfo);
    uint32_t getAdditionalKernelExecInfo() const;
    MOCKABLE_VIRTUAL bool requiresWaDisableRccRhwoOptimization() const;

    // dispatch traits
    void setGlobalWorkOffsetValues(uint32_t globalWorkOffsetX, uint32_t globalWorkOffsetY, uint32_t globalWorkOffsetZ);
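    // The dispatch-trait setters in this group patch per-enqueue values (work
    // dim, GWS, LWS, offsets, group counts) directly into the cross-thread data
    // heap at the offsets published in payloadMappings.dispatchTraits; an invalid
    // offset means the kernel does not consume that trait. A minimal sketch of
    // one such patch, assuming a valid offset (illustrative; mirrors the pattern
    // of getDispatchTrait() below):
    //
    //   void Kernel::setWorkDim(uint32_t workDim) {
    //       auto offset = getDescriptor().payloadMappings.dispatchTraits.workDim;
    //       if (isValidOffset(offset)) {
    //           *reinterpret_cast<uint32_t *>(getCrossThreadData() + offset) = workDim;
    //       }
    //   }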
    void setGlobalWorkSizeValues(uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ);
    void setLocalWorkSizeValues(uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ);
    void setLocalWorkSize2Values(uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ);
    void setEnqueuedLocalWorkSizeValues(uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ);
    void setNumWorkGroupsValues(uint32_t numWorkGroupsX, uint32_t numWorkGroupsY, uint32_t numWorkGroupsZ);
    void setWorkDim(uint32_t workDim);

    const uint32_t *getDispatchTrait(const CrossThreadDataOffset offset) const {
        return isValidOffset(offset) ? reinterpret_cast<uint32_t *>(getCrossThreadData() + offset)
                                     : &Kernel::dummyPatchLocation;
    }
    const uint32_t *getWorkDim() const { return getDispatchTrait(getDescriptor().payloadMappings.dispatchTraits.workDim); }
    std::array<const uint32_t *, 3> getDispatchTraitArray(const CrossThreadDataOffset dispatchTrait[3]) const {
        return {getDispatchTrait(dispatchTrait[0]), getDispatchTrait(dispatchTrait[1]), getDispatchTrait(dispatchTrait[2])};
    }
    std::array<const uint32_t *, 3> getGlobalWorkOffsetValues() const { return getDispatchTraitArray(getDescriptor().payloadMappings.dispatchTraits.globalWorkOffset); }
    std::array<const uint32_t *, 3> getLocalWorkSizeValues() const { return getDispatchTraitArray(getDescriptor().payloadMappings.dispatchTraits.localWorkSize); }
    std::array<const uint32_t *, 3> getLocalWorkSize2Values() const { return getDispatchTraitArray(getDescriptor().payloadMappings.dispatchTraits.localWorkSize2); }
    std::array<const uint32_t *, 3> getEnqueuedLocalWorkSizeValues() const { return getDispatchTraitArray(getDescriptor().payloadMappings.dispatchTraits.enqueuedLocalWorkSize); }
    std::array<const uint32_t *, 3> getNumWorkGroupsValues() const { return getDispatchTraitArray(getDescriptor().payloadMappings.dispatchTraits.numWorkGroups); }

    bool isLocalWorkSize2Patchable();

    uint32_t getMaxKernelWorkGroupSize() const;
    uint32_t getSlmTotalSize() const;
    bool getHasIndirectAccess() const {
        return this->kernelHasIndirectAccess;
    }

    MultiDeviceKernel *getMultiDeviceKernel() const { return pMultiDeviceKernel; }
    void setMultiDeviceKernel(MultiDeviceKernel *pMultiDeviceKernelToSet) { pMultiDeviceKernel = pMultiDeviceKernelToSet; }

    bool areMultipleSubDevicesInContext() const;
    bool requiresMemoryMigration() const { return migratableArgsMap.size() > 0; }
    const std::map<uint32_t, MemObj *> &getMemObjectsToMigrate() const { return migratableArgsMap; }
    ImplicitArgs *getImplicitArgs() const { return pImplicitArgs.get(); }
    const HardwareInfo &getHardwareInfo() const;
    bool isAnyKernelArgumentUsingSystemMemory() const {
        return anyKernelArgumentUsingSystemMemory;
    }

    static bool graphicsAllocationTypeUseSystemMemory(AllocationType type);
    void setDestinationAllocationInSystemMemory(bool value) {
        isDestinationAllocationInSystemMemory = value;
    }
    bool getDestinationAllocationInSystemMemory() const {
        return isDestinationAllocationInSystemMemory;
    }

    void setLocalIdsForGroup(const Vec3<uint16_t> &groupSize, void *destination) const;
    size_t getLocalIdsSizeForGroup(const Vec3<uint16_t> &groupSize) const;
    size_t getLocalIdsSizePerThread() const;

    const GfxCoreHelper &getGfxCoreHelper() const { return getDevice().getGfxCoreHelper(); }

  protected:
    struct KernelConfig {
        Vec3<size_t> gws;
        Vec3<size_t> lws;
        Vec3<size_t> offsets;
        bool operator==(const KernelConfig &other) const { return this->gws == other.gws && this->lws == other.lws && this->offsets == other.offsets; }
    };
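    // KernelConfig is the key of kernelSubmissionMap below: each distinct
    // (gws, lws, offsets) triple gets its own tuning record, so the unordered_map
    // needs both operator== (above) and the KernelConfigHash functor (below).
    // A minimal sketch of the lookup pattern (illustrative only):
    //
    //   KernelConfig config{gws, lws, offsets};
    //   auto &submissionData = kernelSubmissionMap[config]; // fresh record on first enqueue
    //   if (submissionData.status == TunningStatus::tunningDone) {
    //       // reuse the previously computed single-subdevice verdict
    //   }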
    struct KernelConfigHash {
        size_t operator()(KernelConfig const &config) const {
            auto hash = std::hash<size_t>{};
            size_t gwsHashX = hash(config.gws.x);
            size_t gwsHashY = hash(config.gws.y);
            size_t gwsHashZ = hash(config.gws.z);
            size_t gwsHash = hashCombine(gwsHashX, gwsHashY, gwsHashZ);
            size_t lwsHashX = hash(config.lws.x);
            size_t lwsHashY = hash(config.lws.y);
            size_t lwsHashZ = hash(config.lws.z);
            size_t lwsHash = hashCombine(lwsHashX, lwsHashY, lwsHashZ);
            size_t offsetsHashX = hash(config.offsets.x);
            size_t offsetsHashY = hash(config.offsets.y);
            size_t offsetsHashZ = hash(config.offsets.z);
            size_t offsetsHash = hashCombine(offsetsHashX, offsetsHashY, offsetsHashZ);
            return hashCombine(gwsHash, lwsHash, offsetsHash);
        }

        size_t hashCombine(size_t hash1, size_t hash2, size_t hash3) const {
            return (hash1 ^ (hash2 << 1u)) ^ (hash3 << 2u);
        }
    };

    struct KernelSubmissionData {
        std::unique_ptr<TimestampPacketContainer> kernelStandardTimestamps;
        std::unique_ptr<TimestampPacketContainer> kernelSubdeviceTimestamps;
        TunningStatus status;
        bool singleSubdevicePreferred = false;
    };

    Kernel(Program *programArg, const KernelInfo &kernelInfo, ClDevice &clDevice);

    void makeArgsResident(CommandStreamReceiver &commandStreamReceiver);

    void *patchBufferOffset(const ArgDescPointer &argAsPtr, void *svmPtr, GraphicsAllocation *svmAlloc);

    void patchWithImplicitSurface(uint64_t ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const ArgDescPointer &arg);

    void provideInitializationHints();

    void markArgPatchedAndResolveArgs(uint32_t argIndex);
    void reconfigureKernel();
    bool hasDirectStatelessAccessToSharedBuffer() const;
    bool hasDirectStatelessAccessToHostMemory() const;
    bool hasIndirectStatelessAccessToHostMemory() const;

    const ClDevice &getDevice() const {
        return clDevice;
    }
    cl_int patchPrivateSurface();

    bool hasTunningFinished(KernelSubmissionData &submissionData);
    bool hasRunFinished(TimestampPacketContainer *timestampContainer);

    void initializeLocalIdsCache();
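    // Tuning lifecycle glue: performKernelTuning() keeps one KernelSubmissionData
    // record per KernelConfig and moves it from standardTunningInProgress through
    // subdeviceTunningInProgress to tunningDone once hasTunningFinished()/
    // hasRunFinished() confirm both timestamp sets, at which point the two runs
    // are compared to decide singleSubdevicePreferred. A hedged sketch of the
    // final transition (illustrative only; the real logic lives in kernel.cpp):
    //
    //   if (submissionData.status == TunningStatus::subdeviceTunningInProgress &&
    //       hasTunningFinished(submissionData)) {
    //       submissionData.singleSubdevicePreferred = /* compare standard vs subdevice timings */;
    //       submissionData.kernelStandardTimestamps.reset();
    //       submissionData.kernelSubdeviceTimestamps.reset();
    //       submissionData.status = TunningStatus::tunningDone;
    //   }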
    std::unique_ptr<LocalIdsCache> localIdsCache;

    UnifiedMemoryControls unifiedMemoryControls{};
    std::map<uint32_t, MemObj *> migratableArgsMap{};
    std::unordered_map<KernelConfig, KernelSubmissionData, KernelConfigHash> kernelSubmissionMap;

    std::vector<SimpleKernelArgInfo> kernelArguments;
    std::vector<KernelArgHandler> kernelArgHandlers;
    std::vector<GraphicsAllocation *> kernelSvmGfxAllocations;
    std::vector<GraphicsAllocation *> kernelUnifiedMemoryGfxAllocations;
    std::vector<PatchInfoData> patchInfoDataList;
    std::vector<uint32_t> slmSizes;

    std::unique_ptr<char[]> pSshLocal;
    std::unique_ptr<ImplicitArgs> pImplicitArgs = nullptr;

    uint64_t privateSurfaceSize = 0u;
    size_t numberOfBindingTableStates = 0u;
    size_t localBindingTableOffset = 0u;

    const ExecutionEnvironment &executionEnvironment;
    Program *program;
    ClDevice &clDevice;
    const KernelInfo &kernelInfo;

    GraphicsAllocation *privateSurface = nullptr;
    MultiDeviceKernel *pMultiDeviceKernel = nullptr;

    uint32_t *maxWorkGroupSizeForCrossThreadData = &Kernel::dummyPatchLocation;
    uint32_t *dataParameterSimdSize = &Kernel::dummyPatchLocation;
    uint32_t *parentEventOffset = &Kernel::dummyPatchLocation;
    uint32_t *preferredWkgMultipleOffset = &Kernel::dummyPatchLocation;

    char *crossThreadData = nullptr;

    AuxTranslationDirection auxTranslationDirection = AuxTranslationDirection::none;
    KernelExecutionType executionType = KernelExecutionType::defaultType;

    uint32_t patchedArgumentsNum = 0;
    uint32_t startOffset = 0;
    uint32_t statelessUncacheableArgsCount = 0;
    uint32_t additionalKernelExecInfo = AdditionalKernelExecInfo::disableOverdispatch;
    uint32_t maxKernelWorkGroupSize = 0;
    uint32_t slmTotalSize = 0u;
    uint32_t sshLocalSize = 0u;
    uint32_t crossThreadDataSize = 0u;

    bool containsStatelessWrites = true;
    bool usingSharedObjArgs = false;
    bool usingImages = false;
    bool usingImagesOnly = false;
    bool auxTranslationRequired = false;
    bool systolicPipelineSelectMode = false;
    bool isUnifiedMemorySyncRequired = true;
    bool singleSubdevicePreferredInCurrentEnqueue = false;
    bool kernelHasIndirectAccess = true;
    bool anyKernelArgumentUsingSystemMemory = false;
    bool isDestinationAllocationInSystemMemory = false;
};

} // namespace NEO