/* * Copyright (C) 2020-2024 Intel Corporation * * SPDX-License-Identifier: MIT * */ #pragma once #include "shared/source/command_stream/thread_arbitration_policy.h" #include "shared/source/device_binary_format/device_binary_formats.h" #include "shared/source/kernel/debug_data.h" #include "shared/source/kernel/grf_config.h" #include "shared/source/kernel/kernel_arg_descriptor.h" #include "shared/source/kernel/kernel_arg_metadata.h" #include #include #include #include #include #include namespace NEO { using StringMap = std::unordered_map; using BindlessToSurfaceStateMap = std::unordered_map; using InstructionsSegmentOffset = uint16_t; struct KernelDescriptor { static bool isBindlessAddressingKernel(const KernelDescriptor &desc); enum AddressingMode : uint8_t { AddrNone, Stateless, Bindful, Bindless, BindfulAndStateless, BindlessAndStateless }; KernelDescriptor() = default; virtual ~KernelDescriptor() = default; void updateCrossThreadDataSize(); void initBindlessOffsetToSurfaceState(); const BindlessToSurfaceStateMap &getBindlessOffsetToSurfaceState() const { return bindlessArgsMap; } struct KernelAttributes { uint32_t slmInlineSize = 0U; uint32_t perThreadScratchSize[2] = {0U, 0U}; uint32_t perHwThreadPrivateMemorySize = 0U; uint32_t perThreadSystemThreadSurfaceSize = 0U; uint32_t numThreadsRequired = 0u; uint32_t spillFillScratchMemorySize = 0u; uint32_t privateScratchMemorySize = 0u; ThreadArbitrationPolicy threadArbitrationPolicy = NotPresent; uint16_t requiredWorkgroupSize[3] = {0U, 0U, 0U}; uint16_t crossThreadDataSize = 0U; uint16_t inlineDataPayloadSize = 0U; uint16_t perThreadDataSize = 0U; uint16_t numArgsToPatch = 0U; uint16_t numGrfRequired = GrfConfig::defaultGrfNumber; uint16_t numArgsStateful = 0U; uint8_t barrierCount = 0u; bool hasNonKernelArgLoad = false; bool hasNonKernelArgStore = false; bool hasNonKernelArgAtomic = false; bool hasIndirectStatelessAccess = false; bool hasIndirectAccessInImplicitArg = false; AddressingMode bufferAddressingMode = BindfulAndStateless; AddressingMode imageAddressingMode = Bindful; AddressingMode samplerAddressingMode = Bindful; DeviceBinaryFormat binaryFormat = DeviceBinaryFormat::unknown; uint8_t workgroupWalkOrder[3] = {0, 1, 2}; uint8_t workgroupDimensionsOrder[3] = {0, 1, 2}; uint8_t gpuPointerSize = 0; uint8_t simdSize = 8; uint8_t numLocalIdChannels = 0; uint8_t localId[3] = {0U, 0U, 0U}; bool supportsBuffersBiggerThan4Gb() const { return Stateless == bufferAddressingMode; } bool usesBarriers() const { return 0 != barrierCount; } union { struct { // 0 bool usesSystolicPipelineSelectMode : 1; bool usesStringMapForPrintf : 1; bool usesPrintf : 1; bool usesFencesForReadWriteImages : 1; bool usesFlattenedLocalIds : 1; bool usesPrivateMemory : 1; bool usesVme : 1; bool usesImages : 1; // 1 bool usesSamplers : 1; bool usesSyncBuffer : 1; bool deprecatedDoNotUse : 1; bool usesStatelessWrites : 1; bool passInlineData : 1; bool perThreadDataHeaderIsPresent : 1; bool perThreadDataUnusedGrfIsPresent : 1; bool requiresDisabledEUFusion : 1; // 2 bool requiresDisabledMidThreadPreemption : 1; bool requiresSubgroupIndependentForwardProgress : 1; bool requiresWorkgroupWalkOrder : 1; bool requiresImplicitArgs : 1; bool useStackCalls : 1; bool hasRTCalls : 1; bool isInvalid : 1; bool hasSample : 1; // 3 bool usesAssert : 1; bool usesRegionGroupBarrier : 1; bool reserved : 6; }; std::array packed; } flags = {}; static_assert(sizeof(KernelAttributes::flags) == sizeof(KernelAttributes::flags.packed), ""); bool usesStringMap() const { if (binaryFormat == DeviceBinaryFormat::patchtokens) { return flags.usesStringMapForPrintf || flags.requiresImplicitArgs; } return false; } } kernelAttributes; struct { InstructionsSegmentOffset skipPerThreadDataLoad = 0U; InstructionsSegmentOffset skipSetFFIDGP = 0U; InstructionsSegmentOffset systemKernel = 0U; } entryPoints; struct PayloadMappings { struct { CrossThreadDataOffset globalWorkOffset[3] = {undefined, undefined, undefined}; CrossThreadDataOffset globalWorkSize[3] = {undefined, undefined, undefined}; CrossThreadDataOffset localWorkSize[3] = {undefined, undefined, undefined}; CrossThreadDataOffset localWorkSize2[3] = {undefined, undefined, undefined}; CrossThreadDataOffset enqueuedLocalWorkSize[3] = {undefined, undefined, undefined}; CrossThreadDataOffset numWorkGroups[3] = {undefined, undefined, undefined}; CrossThreadDataOffset workDim = undefined; CrossThreadDataOffset regionGroupSize[3] = {undefined, undefined, undefined}; CrossThreadDataOffset regionGroupDimension = undefined; CrossThreadDataOffset regionGroupWgCount = undefined; } dispatchTraits; struct { SurfaceStateHeapOffset tableOffset = undefined; uint8_t numEntries = 0; } bindingTable; struct { DynamicStateHeapOffset tableOffset = undefined; DynamicStateHeapOffset borderColor = undefined; uint8_t numSamplers = 0; } samplerTable; StackVec explicitArgs; struct { ArgDescPointer printfSurfaceAddress; ArgDescPointer globalVariablesSurfaceAddress; ArgDescPointer globalConstantsSurfaceAddress; ArgDescPointer privateMemoryAddress; ArgDescPointer deviceSideEnqueueDefaultQueueSurfaceAddress; ArgDescPointer systemThreadSurfaceAddress; ArgDescPointer syncBufferAddress; ArgDescPointer rtDispatchGlobals; ArgDescPointer assertBufferAddress; ArgDescPointer regionGroupBarrierBuffer; CrossThreadDataOffset privateMemorySize = undefined; CrossThreadDataOffset maxWorkGroupSize = undefined; CrossThreadDataOffset simdSize = undefined; CrossThreadDataOffset deviceSideEnqueueParentEvent = undefined; CrossThreadDataOffset preferredWkgMultiple = undefined; CrossThreadDataOffset localMemoryStatelessWindowSize = undefined; CrossThreadDataOffset localMemoryStatelessWindowStartAddres = undefined; CrossThreadDataOffset implicitArgsBuffer = undefined; ArgDescInlineDataPointer indirectDataPointerAddress; ArgDescInlineDataPointer scratchPointerAddress; } implicitArgs; std::vector> explicitArgsExtendedDescriptors; } payloadMappings; StackVec getImplicitArgBindlessCandidatesVec() const { StackVec implicitArgsVec({&payloadMappings.implicitArgs.printfSurfaceAddress, &payloadMappings.implicitArgs.globalVariablesSurfaceAddress, &payloadMappings.implicitArgs.globalConstantsSurfaceAddress, &payloadMappings.implicitArgs.privateMemoryAddress, &payloadMappings.implicitArgs.systemThreadSurfaceAddress, &payloadMappings.implicitArgs.syncBufferAddress, &payloadMappings.implicitArgs.rtDispatchGlobals, &payloadMappings.implicitArgs.assertBufferAddress}); return implicitArgsVec; } std::vector explicitArgsExtendedMetadata; struct InlineSampler { enum class AddrMode : uint8_t { none, repeat, clampEdge, clampBorder, mirror }; enum class FilterMode : uint8_t { nearest, linear }; static constexpr size_t borderColorStateSize = 64U; static constexpr size_t samplerStateSize = 16U; uint32_t samplerIndex; bool isNormalized; AddrMode addrMode; FilterMode filterMode; CrossThreadDataOffset bindless = undefined; uint8_t size = undefined; constexpr uint32_t getSamplerBindfulOffset() const { return borderColorStateSize + samplerStateSize * samplerIndex; } }; std::vector inlineSamplers; struct { std::string kernelName; std::string kernelLanguageAttributes; StringMap printfStringsMap; uint16_t compiledSubGroupsNumber = 0U; uint8_t requiredSubGroupSize = 0U; bool isGeneratedByIgc = true; } kernelMetadata; struct { std::unique_ptr debugData; std::unique_ptr relocatedDebugData; const void *igcInfoForGtpin = nullptr; } external; std::vector generatedSsh; std::vector generatedDsh; BindlessToSurfaceStateMap bindlessArgsMap; std::once_flag initBindlessArgsMapOnce; }; } // namespace NEO