From 2c25777f3cf4fdbeb62fb9331073426d49aa61ca Mon Sep 17 00:00:00 2001 From: Jaroslaw Chodor Date: Tue, 7 Apr 2020 14:07:31 +0200 Subject: [PATCH] DispatchKernelEncoder refactor Replacing parts of DispatchKernelEncoder with KernelDescriptor Change-Id: I1c780b04a2d3d1de0fb75d5413a0dde8b41bbe07 --- level_zero/core/source/cmdlist/cmdlist_hw.h | 2 +- level_zero/core/source/cmdlist/cmdlist_hw.inl | 7 +- .../core/source/cmdlist/cmdlist_hw_base.inl | 6 +- level_zero/core/source/kernel/kernel.h | 18 --- level_zero/core/source/kernel/kernel_hw.h | 2 +- level_zero/core/source/kernel/kernel_imp.cpp | 125 ++---------------- level_zero/core/source/kernel/kernel_imp.h | 52 ++------ .../unit_tests/sources/kernel/test_kernel.cpp | 21 --- .../source/helpers/hardware_commands_helper.h | 4 +- .../enqueue_copy_buffer_to_image_tests.cpp | 4 +- .../enqueue_copy_image_tests.cpp | 4 +- .../enqueue_copy_image_to_buffer_tests.cpp | 4 +- .../enqueue_fill_image_tests.cpp | 4 +- .../enqueue_read_image_tests.cpp | 4 +- .../enqueue_write_image_tests.cpp | 4 +- .../command_container/command_encoder.h | 6 +- .../command_container/command_encoder.inl | 22 +-- .../command_encoder_base.inl | 53 ++++---- shared/source/helpers/kernel_helpers.cpp | 4 +- shared/source/helpers/register_offsets.h | 2 + .../dispatch_kernel_encoder_interface.h | 52 +++----- .../kernel_descriptor_from_patchtokens.cpp | 3 +- .../encoders/test_encode_dispatch_kernel.cpp | 57 ++++---- .../unit_test/encoders/test_encode_math.cpp | 6 +- ...rnel_descriptor_from_patchtokens_tests.cpp | 3 + ...mock_dispatch_kernel_encoder_interface.cpp | 46 +++---- .../mock_dispatch_kernel_encoder_interface.h | 48 +++---- 27 files changed, 180 insertions(+), 383 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h index e3419c0f1c..2b13e47f16 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw.h @@ -155,7 +155,7 @@ struct CommandListCoreFamily : CommandListImp { void applyMemoryRangesBarrier(uint32_t numRanges, const size_t *pRangeSizes, const void **pRanges); - ze_result_t setGroupSizeIndirect(uint32_t offsets[3], void *crossThreadAddress, uint32_t lws[3]); + ze_result_t setGlobalWorkSizeIndirect(NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress, uint32_t lws[3]); void appendEventForProfiling(ze_event_handle_t hEvent, bool beforeWalker); void appendSignalEventPostWalker(ze_event_handle_t hEvent); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index c41647588b..c179ebb65c 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -1176,13 +1176,10 @@ ze_result_t CommandListCoreFamily::prepareIndirectParams(const ze } template -ze_result_t CommandListCoreFamily::setGroupSizeIndirect(uint32_t offsets[3], - void *crossThreadAddress, - uint32_t lws[3]) { - +ze_result_t CommandListCoreFamily::setGlobalWorkSizeIndirect(NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress, uint32_t lws[3]) { using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; - NEO::EncodeIndirectParams::setGroupSizeIndirect(commandContainer, offsets, crossThreadAddress, lws); + NEO::EncodeIndirectParams::setGlobalWorkSizeIndirect(commandContainer, offsets, crossThreadAddress, lws); return ZE_RESULT_SUCCESS; } diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_base.inl b/level_zero/core/source/cmdlist/cmdlist_hw_base.inl index 358cd394bf..9dc03fd58e 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_base.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_base.inl @@ -29,10 +29,10 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(z const auto kernel = Kernel::fromHandle(hKernel); UNRECOVERABLE_IF(kernel == nullptr); const auto functionImmutableData = kernel->getImmutableData(); - commandListPerThreadScratchSize = std::max(commandListPerThreadScratchSize, kernel->getPerThreadScratchSize()); + commandListPerThreadScratchSize = std::max(commandListPerThreadScratchSize, kernel->getImmutableData()->getDescriptor().kernelAttributes.perThreadScratchSize[0]); - auto functionPreemptionMode = obtainFunctionPreemptionMode(kernel); - commandListPreemptionMode = std::min(commandListPreemptionMode, functionPreemptionMode); + auto kernelPreemptionMode = obtainFunctionPreemptionMode(kernel); + commandListPreemptionMode = std::min(commandListPreemptionMode, kernelPreemptionMode); if (!isIndirect) { kernel->setGroupCount(pThreadGroupDimensions->groupCountX, diff --git a/level_zero/core/source/kernel/kernel.h b/level_zero/core/source/kernel/kernel.h index bbc968638d..174b54e5cf 100644 --- a/level_zero/core/source/kernel/kernel.h +++ b/level_zero/core/source/kernel/kernel.h @@ -98,8 +98,6 @@ struct Kernel : _ze_kernel_handle_t, virtual NEO::DispatchKernelEncoderI { virtual ze_result_t setArgBufferWithAlloc(uint32_t argIndex, const void *argVal, NEO::GraphicsAllocation *allocation) = 0; virtual ze_result_t setArgRedescribedImage(uint32_t argIndex, ze_image_handle_t argVal) = 0; - virtual bool getGroupCountOffsets(uint32_t *locations) = 0; - virtual bool getGroupSizeOffsets(uint32_t *locations) = 0; virtual ze_result_t setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY, uint32_t groupSizeZ) = 0; virtual ze_result_t suggestGroupSize(uint32_t globalSizeX, uint32_t globalSizeY, @@ -113,22 +111,6 @@ struct Kernel : _ze_kernel_handle_t, virtual NEO::DispatchKernelEncoderI { virtual const std::vector &getResidencyContainer() const = 0; - virtual void getGroupSize(uint32_t &outGroupSizeX, uint32_t &outGroupSizeY, uint32_t &outGroupSizeZ) const = 0; - virtual uint32_t getThreadsPerThreadGroup() const = 0; - virtual uint32_t getThreadExecutionMask() const = 0; - - virtual const uint8_t *getCrossThreadData() const = 0; - virtual uint32_t getCrossThreadDataSize() const = 0; - - virtual const uint8_t *getPerThreadData() const = 0; - virtual uint32_t getPerThreadDataSizeForWholeThreadGroup() const = 0; - virtual uint32_t getPerThreadDataSize() const = 0; - virtual const uint8_t *getSurfaceStateHeapData() const = 0; - virtual uint32_t getSurfaceStateHeapDataSize() const = 0; - - virtual const uint8_t *getDynamicStateHeapData() const = 0; - virtual size_t getDynamicStateHeapDataSize() const = 0; - virtual UnifiedMemoryControls getUnifiedMemoryControls() const = 0; virtual bool hasIndirectAllocationsAllowed() const = 0; diff --git a/level_zero/core/source/kernel/kernel_hw.h b/level_zero/core/source/kernel/kernel_hw.h index a54b698e0b..3e6f872b16 100644 --- a/level_zero/core/source/kernel/kernel_hw.h +++ b/level_zero/core/source/kernel/kernel_hw.h @@ -77,7 +77,7 @@ struct KernelHw : public KernelImp { } std::copy(this->groupSize, this->groupSize + 3, cloned->groupSize); - cloned->threadsPerThreadGroup = this->threadsPerThreadGroup; + cloned->numThreadsPerThreadGroup = this->numThreadsPerThreadGroup; cloned->threadExecutionMask = this->threadExecutionMask; if (this->surfaceStateHeapDataSize > 0) { diff --git a/level_zero/core/source/kernel/kernel_imp.cpp b/level_zero/core/source/kernel/kernel_imp.cpp index 7e797c5020..484305f779 100644 --- a/level_zero/core/source/kernel/kernel_imp.cpp +++ b/level_zero/core/source/kernel/kernel_imp.cpp @@ -185,14 +185,9 @@ ze_result_t KernelImp::setArgumentValue(uint32_t argIndex, size_t argSize, } void KernelImp::setGroupCount(uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ) { - uint32_t groupSizeX; - uint32_t groupSizeY; - uint32_t groupSizeZ; - getGroupSize(groupSizeX, groupSizeY, groupSizeZ); - const NEO::KernelDescriptor &desc = kernelImmData->getDescriptor(); - uint32_t globalWorkSize[3] = {groupCountX * groupSizeX, groupCountY * groupSizeY, - groupCountZ * groupSizeZ}; + uint32_t globalWorkSize[3] = {groupCountX * groupSize[0], groupCountY * groupSize[1], + groupCountZ * groupSize[2]}; auto dst = ArrayRef(crossThreadData.get(), crossThreadDataSize); NEO::patchVecNonPointer(dst, desc.payloadMappings.dispatchTraits.globalWorkSize, globalWorkSize); @@ -200,30 +195,6 @@ void KernelImp::setGroupCount(uint32_t groupCountX, uint32_t groupCountY, uint32 NEO::patchVecNonPointer(dst, desc.payloadMappings.dispatchTraits.numWorkGroups, groupCount); } -bool KernelImp::getGroupCountOffsets(uint32_t *locations) { - const NEO::KernelDescriptor &desc = kernelImmData->getDescriptor(); - for (int i = 0; i < 3; i++) { - if (NEO::isValidOffset(desc.payloadMappings.dispatchTraits.numWorkGroups[i])) { - locations[i] = desc.payloadMappings.dispatchTraits.numWorkGroups[i]; - } else { - return false; - } - } - return true; -} - -bool KernelImp::getGroupSizeOffsets(uint32_t *locations) { - const NEO::KernelDescriptor &desc = kernelImmData->getDescriptor(); - for (int i = 0; i < 3; i++) { - if (NEO::isValidOffset(desc.payloadMappings.dispatchTraits.globalWorkSize[i])) { - locations[i] = desc.payloadMappings.dispatchTraits.globalWorkSize[i]; - } else { - return false; - } - } - return true; -} - ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY, uint32_t groupSizeZ) { if ((0 == groupSizeX) || (0 == groupSizeY) || (0 == groupSizeZ)) { @@ -267,8 +238,8 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY, this->groupSize[2] = groupSizeZ; auto simdSize = kernelImmData->getDescriptor().kernelAttributes.simdSize; - this->threadsPerThreadGroup = static_cast((itemsInGroup + simdSize - 1u) / simdSize); - this->perThreadDataSize = perThreadDataSizeForWholeThreadGroup / threadsPerThreadGroup; + this->numThreadsPerThreadGroup = static_cast((itemsInGroup + simdSize - 1u) / simdSize); + this->perThreadDataSize = perThreadDataSizeForWholeThreadGroup / numThreadsPerThreadGroup; patchWorkgroupSizeInCrossThreadData(groupSizeX, groupSizeY, groupSizeZ); auto remainderSimdLanes = itemsInGroup & (simdSize - 1u); @@ -297,7 +268,7 @@ ze_result_t KernelImp::suggestGroupSize(uint32_t globalSizeX, uint32_t globalSiz uint32_t numThreadsPerSubSlice = (uint32_t)deviceInfo.maxNumEUsPerSubSlice * deviceInfo.numThreadsPerEU; uint32_t localMemSize = (uint32_t)deviceInfo.localMemSize; - NEO::WorkSizeInfo wsInfo(maxWorkGroupSize, this->hasBarriers(), simd, this->getSlmTotalSize(), + NEO::WorkSizeInfo wsInfo(maxWorkGroupSize, kernelImmData->getDescriptor().kernelAttributes.flags.usesBarriers, simd, this->getSlmTotalSize(), coreFamily, numThreadsPerSubSlice, localMemSize, usesImages, false); NEO::computeWorkgroupSizeND(wsInfo, retGroupSize, workItems, dim); @@ -672,92 +643,12 @@ bool KernelImp::hasIndirectAllocationsAllowed() const { unifiedMemoryControls.indirectSharedAllocationsAllowed); } -bool KernelImp::hasBarriers() { - return getImmutableData()->getDescriptor().kernelAttributes.flags.usesBarriers; -} -uint32_t KernelImp::getSlmTotalSize() { +uint32_t KernelImp::getSlmTotalSize() const { return slmArgsTotalSize + getImmutableData()->getDescriptor().kernelAttributes.slmInlineSize; } -uint32_t KernelImp::getBindingTableOffset() { - return getImmutableData()->getDescriptor().payloadMappings.bindingTable.tableOffset; -} -uint32_t KernelImp::getBorderColor() { - return getImmutableData()->getDescriptor().payloadMappings.samplerTable.borderColor; -} -uint32_t KernelImp::getSamplerTableOffset() { - return getImmutableData()->getDescriptor().payloadMappings.samplerTable.tableOffset; -} -uint32_t KernelImp::getNumSurfaceStates() { - return getImmutableData()->getDescriptor().payloadMappings.bindingTable.numEntries; -} -uint32_t KernelImp::getNumSamplers() { - return getImmutableData()->getDescriptor().payloadMappings.samplerTable.numSamplers; -} -uint32_t KernelImp::getSimdSize() { - return getImmutableData()->getDescriptor().kernelAttributes.simdSize; -} -uint32_t KernelImp::getSizeCrossThreadData() { - return getCrossThreadDataSize(); -} -uint32_t KernelImp::getPerThreadScratchSize() { - return getImmutableData()->getDescriptor().kernelAttributes.perThreadScratchSize[0]; -} -uint32_t KernelImp::getThreadsPerThreadGroupCount() { - return getThreadsPerThreadGroup(); -} -uint32_t KernelImp::getSizePerThreadData() { - return getPerThreadDataSize(); -} -uint32_t KernelImp::getSizePerThreadDataForWholeGroup() { - return getPerThreadDataSizeForWholeThreadGroup(); -} -uint32_t KernelImp::getSizeSurfaceStateHeapData() { - return getSurfaceStateHeapDataSize(); -} -uint32_t KernelImp::getPerThreadExecutionMask() { - return getThreadExecutionMask(); -} -uint32_t *KernelImp::getCountOffsets() { - return groupCountOffsets; -} -uint32_t *KernelImp::getSizeOffsets() { - return groupSizeOffsets; -} -uint32_t *KernelImp::getLocalWorkSize() { - if (hasGroupSize()) { - getGroupSize(localWorkSize[0], localWorkSize[1], localWorkSize[2]); - } - return localWorkSize; -} -uint32_t KernelImp::getNumGrfRequired() { - return getImmutableData()->getDescriptor().kernelAttributes.numGrfRequired; -} -NEO::GraphicsAllocation *KernelImp::getIsaAllocation() { + +NEO::GraphicsAllocation *KernelImp::getIsaAllocation() const { return getImmutableData()->getIsaGraphicsAllocation(); } -bool KernelImp::hasGroupCounts() { - return getGroupCountOffsets(groupCountOffsets); -} -bool KernelImp::hasGroupSize() { - return getGroupSizeOffsets(groupSizeOffsets); -} -const void *KernelImp::getSurfaceStateHeap() { - return getSurfaceStateHeapData(); -} -const void *KernelImp::getDynamicStateHeap() { - return getDynamicStateHeapData(); -} -const void *KernelImp::getCrossThread() { - return getCrossThreadData(); -} -const void *KernelImp::getPerThread() { - return getPerThreadData(); -} -bool KernelImp::isInlineDataRequired() { - return getImmutableData()->getDescriptor().kernelAttributes.flags.passInlineData; -} -uint8_t KernelImp::getNumLocalIdChannels() { - return getImmutableData()->getDescriptor().kernelAttributes.numLocalIdChannels; -} } // namespace L0 diff --git a/level_zero/core/source/kernel/kernel_imp.h b/level_zero/core/source/kernel/kernel_imp.h index 359b25ba1a..48684e3006 100644 --- a/level_zero/core/source/kernel/kernel_imp.h +++ b/level_zero/core/source/kernel/kernel_imp.h @@ -42,10 +42,6 @@ struct KernelImp : Kernel { void setGroupCount(uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ) override; - bool getGroupCountOffsets(uint32_t *locations) override; - - bool getGroupSizeOffsets(uint32_t *locations) override; - ze_result_t setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY, uint32_t groupSizeZ) override; @@ -62,13 +58,6 @@ struct KernelImp : Kernel { return residencyContainer; } - void getGroupSize(uint32_t &outGroupSizeX, uint32_t &outGroupSizeY, - uint32_t &outGroupSizeZ) const override { - outGroupSizeX = this->groupSize[0]; - outGroupSizeY = this->groupSize[1]; - outGroupSizeZ = this->groupSize[2]; - } - ze_result_t setArgImmediate(uint32_t argIndex, size_t argSize, const void *argVal); ze_result_t setArgBuffer(uint32_t argIndex, size_t argSize, const void *argVal); @@ -89,7 +78,7 @@ struct KernelImp : Kernel { uint32_t getPerThreadDataSizeForWholeThreadGroup() const override { return perThreadDataSizeForWholeThreadGroup; } uint32_t getPerThreadDataSize() const override { return perThreadDataSize; } - uint32_t getThreadsPerThreadGroup() const override { return threadsPerThreadGroup; } + uint32_t getNumThreadsPerThreadGroup() const override { return numThreadsPerThreadGroup; } uint32_t getThreadExecutionMask() const override { return threadExecutionMask; } NEO::GraphicsAllocation *getPrintfBufferAllocation() override { return this->printfBuffer; } @@ -99,41 +88,20 @@ struct KernelImp : Kernel { uint32_t getSurfaceStateHeapDataSize() const override { return surfaceStateHeapDataSize; } const uint8_t *getDynamicStateHeapData() const override { return dynamicStateHeapData.get(); } - size_t getDynamicStateHeapDataSize() const override { return dynamicStateHeapDataSize; } const KernelImmutableData *getImmutableData() const override { return kernelImmData; } UnifiedMemoryControls getUnifiedMemoryControls() const override { return unifiedMemoryControls; } bool hasIndirectAllocationsAllowed() const override; - bool hasBarriers() override; - uint32_t getSlmTotalSize() override; - uint32_t getBindingTableOffset() override; - uint32_t getBorderColor() override; - uint32_t getSamplerTableOffset() override; - uint32_t getNumSurfaceStates() override; - uint32_t getNumSamplers() override; - uint32_t getSimdSize() override; - uint32_t getSizeCrossThreadData() override; - uint32_t getPerThreadScratchSize() override; - uint32_t getThreadsPerThreadGroupCount() override; - uint32_t getSizePerThreadData() override; - uint32_t getSizePerThreadDataForWholeGroup() override; - uint32_t getSizeSurfaceStateHeapData() override; - uint32_t getPerThreadExecutionMask() override; - uint32_t *getCountOffsets() override; - uint32_t *getSizeOffsets() override; - uint32_t *getLocalWorkSize() override; - uint32_t getNumGrfRequired() override; - NEO::GraphicsAllocation *getIsaAllocation() override; - bool hasGroupCounts() override; - bool hasGroupSize() override; - const void *getSurfaceStateHeap() override; - const void *getDynamicStateHeap() override; - const void *getCrossThread() override; - const void *getPerThread() override; - bool isInlineDataRequired() override; - uint8_t getNumLocalIdChannels() override; + const NEO::KernelDescriptor &getKernelDescriptor() const override { + return kernelImmData->getDescriptor(); + } + const uint32_t *getGroupSize() const override { + return groupSize; + } + uint32_t getSlmTotalSize() const override; + NEO::GraphicsAllocation *getIsaAllocation() const override; protected: KernelImp() = default; @@ -153,7 +121,7 @@ struct KernelImp : Kernel { NEO::GraphicsAllocation *printfBuffer = nullptr; uint32_t groupSize[3] = {0u, 0u, 0u}; - uint32_t threadsPerThreadGroup = 0u; + uint32_t numThreadsPerThreadGroup = 0u; uint32_t threadExecutionMask = 0u; std::unique_ptr crossThreadData = 0; diff --git a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp index 85826fe857..8f5e9cc48d 100644 --- a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp +++ b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp @@ -12,26 +12,5 @@ namespace L0 { namespace ult { -TEST(Kernel, givenPassInlineDataTrueWhenCallingIsInlineDataRequiredThenTrueIsReturned) { - Mock kernel; - - kernel.descriptor.kernelAttributes.flags.passInlineData = true; - EXPECT_TRUE(kernel.isInlineDataRequired()); -} - -TEST(Kernel, givenPassInlineDataFalseWhenCallingIsInlineDataRequiredThenFalseIsReturned) { - Mock kernel; - - kernel.descriptor.kernelAttributes.flags.passInlineData = false; - EXPECT_FALSE(kernel.isInlineDataRequired()); -} - -TEST(Kernel, whenGettingLocalIdsChannelNumberThenCorrectValueIsReturned) { - Mock kernel; - - kernel.descriptor.kernelAttributes.numLocalIdChannels = 3; - EXPECT_EQ(3u, kernel.getNumLocalIdChannels()); -} - } // namespace ult } // namespace L0 diff --git a/opencl/source/helpers/hardware_commands_helper.h b/opencl/source/helpers/hardware_commands_helper.h index d4db090559..83741d6c9f 100644 --- a/opencl/source/helpers/hardware_commands_helper.h +++ b/opencl/source/helpers/hardware_commands_helper.h @@ -51,7 +51,7 @@ struct HardwareCommandsHelper : public PerThreadDataHelper { static void setAdditionalInfo( INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const Kernel &kernel, - const uint32_t threadsPerThreadGroup); + const uint32_t numThreadsPerThreadGroup); inline static uint32_t additionalSizeRequiredDsh(); @@ -64,7 +64,7 @@ struct HardwareCommandsHelper : public PerThreadDataHelper { size_t bindingTablePointer, size_t offsetSamplerState, uint32_t numSamplers, - uint32_t threadsPerThreadGroup, + uint32_t numThreadsPerThreadGroup, const Kernel &kernel, uint32_t bindingTablePrefetchSize, PreemptionMode preemptionMode, diff --git a/opencl/test/unit_test/command_queue/enqueue_copy_buffer_to_image_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_copy_buffer_to_image_tests.cpp index 5a5cbdce8d..974c7b0823 100644 --- a/opencl/test/unit_test/command_queue/enqueue_copy_buffer_to_image_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_copy_buffer_to_image_tests.cpp @@ -144,8 +144,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, EnqueueCopyBufferToImageTest, WhenCopyingBufferToIma auto localWorkSize = std::min( maxLocalSize, Image2dDefaults::imageDesc.image_width * Image2dDefaults::imageDesc.image_height); auto simd = 32u; - auto threadsPerThreadGroup = Math::divideAndRoundUp(localWorkSize, simd); - EXPECT_EQ(threadsPerThreadGroup, interfaceDescriptorData.getNumberOfThreadsInGpgpuThreadGroup()); + auto numThreadsPerThreadGroup = Math::divideAndRoundUp(localWorkSize, simd); + EXPECT_EQ(numThreadsPerThreadGroup, interfaceDescriptorData.getNumberOfThreadsInGpgpuThreadGroup()); EXPECT_NE(0u, interfaceDescriptorData.getCrossThreadConstantDataReadLength()); EXPECT_NE(0u, interfaceDescriptorData.getConstantIndirectUrbEntryReadLength()); diff --git a/opencl/test/unit_test/command_queue/enqueue_copy_image_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_copy_image_tests.cpp index 400983d926..080a3f46c3 100644 --- a/opencl/test/unit_test/command_queue/enqueue_copy_image_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_copy_image_tests.cpp @@ -146,8 +146,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, EnqueueCopyImageTest, WhenCopyingImageThenInterfaceD auto localWorkSize = std::min(maxLocalSize, Image2dDefaults::imageDesc.image_width * Image2dDefaults::imageDesc.image_height); auto simd = 32u; - auto threadsPerThreadGroup = Math::divideAndRoundUp(localWorkSize, simd); - EXPECT_EQ(threadsPerThreadGroup, interfaceDescriptorData.getNumberOfThreadsInGpgpuThreadGroup()); + auto numThreadsPerThreadGroup = Math::divideAndRoundUp(localWorkSize, simd); + EXPECT_EQ(numThreadsPerThreadGroup, interfaceDescriptorData.getNumberOfThreadsInGpgpuThreadGroup()); EXPECT_NE(0u, interfaceDescriptorData.getCrossThreadConstantDataReadLength()); EXPECT_NE(0u, interfaceDescriptorData.getConstantIndirectUrbEntryReadLength()); diff --git a/opencl/test/unit_test/command_queue/enqueue_copy_image_to_buffer_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_copy_image_to_buffer_tests.cpp index 0499782c05..a65105850e 100644 --- a/opencl/test/unit_test/command_queue/enqueue_copy_image_to_buffer_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_copy_image_to_buffer_tests.cpp @@ -145,8 +145,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, EnqueueCopyImageToBufferTest, WhenCopyingImageToBuff auto localWorkSize = std::min( maxLocalSize, Image2dDefaults::imageDesc.image_width * Image2dDefaults::imageDesc.image_height); auto simd = 32u; - auto threadsPerThreadGroup = Math::divideAndRoundUp(localWorkSize, simd); - EXPECT_EQ(threadsPerThreadGroup, interfaceDescriptorData.getNumberOfThreadsInGpgpuThreadGroup()); + auto numThreadsPerThreadGroup = Math::divideAndRoundUp(localWorkSize, simd); + EXPECT_EQ(numThreadsPerThreadGroup, interfaceDescriptorData.getNumberOfThreadsInGpgpuThreadGroup()); EXPECT_NE(0u, interfaceDescriptorData.getCrossThreadConstantDataReadLength()); EXPECT_NE(0u, interfaceDescriptorData.getConstantIndirectUrbEntryReadLength()); diff --git a/opencl/test/unit_test/command_queue/enqueue_fill_image_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_fill_image_tests.cpp index e8fea8b766..bdf5a30d9c 100644 --- a/opencl/test/unit_test/command_queue/enqueue_fill_image_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_fill_image_tests.cpp @@ -153,8 +153,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, EnqueueFillImageTest, WhenFillingImageThenInterfaceD auto localWorkSize = std::min(maxLocalSize, Image2dDefaults::imageDesc.image_width * Image2dDefaults::imageDesc.image_height); auto simd = 32u; - auto threadsPerThreadGroup = Math::divideAndRoundUp(localWorkSize, simd); - EXPECT_EQ(threadsPerThreadGroup, interfaceDescriptorData.getNumberOfThreadsInGpgpuThreadGroup()); + auto numThreadsPerThreadGroup = Math::divideAndRoundUp(localWorkSize, simd); + EXPECT_EQ(numThreadsPerThreadGroup, interfaceDescriptorData.getNumberOfThreadsInGpgpuThreadGroup()); EXPECT_NE(0u, interfaceDescriptorData.getCrossThreadConstantDataReadLength()); EXPECT_NE(0u, interfaceDescriptorData.getConstantIndirectUrbEntryReadLength()); diff --git a/opencl/test/unit_test/command_queue/enqueue_read_image_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_read_image_tests.cpp index 7c0ee5dd02..123225a329 100644 --- a/opencl/test/unit_test/command_queue/enqueue_read_image_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_read_image_tests.cpp @@ -154,8 +154,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, EnqueueReadImageTest, WhenReadingImageThenInterfaceD auto localWorkSize = 4u; auto simd = 32u; - auto threadsPerThreadGroup = Math::divideAndRoundUp(localWorkSize, simd); - EXPECT_EQ(threadsPerThreadGroup, interfaceDescriptorData.getNumberOfThreadsInGpgpuThreadGroup()); + auto numThreadsPerThreadGroup = Math::divideAndRoundUp(localWorkSize, simd); + EXPECT_EQ(numThreadsPerThreadGroup, interfaceDescriptorData.getNumberOfThreadsInGpgpuThreadGroup()); EXPECT_NE(0u, interfaceDescriptorData.getCrossThreadConstantDataReadLength()); EXPECT_NE(0u, interfaceDescriptorData.getConstantIndirectUrbEntryReadLength()); diff --git a/opencl/test/unit_test/command_queue/enqueue_write_image_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_write_image_tests.cpp index 2d45dd5035..ddb628786b 100644 --- a/opencl/test/unit_test/command_queue/enqueue_write_image_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_write_image_tests.cpp @@ -155,8 +155,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, EnqueueWriteImageTest, WhenWritingImageThenInterface // EnqueueWriteImage uses a byte copy. Need to convert to bytes. auto localWorkSize = 2 * 2 * sizeof(float); auto simd = 32; - auto threadsPerThreadGroup = Math::divideAndRoundUp(localWorkSize, simd); - EXPECT_EQ(threadsPerThreadGroup, interfaceDescriptorData.getNumberOfThreadsInGpgpuThreadGroup()); + auto numThreadsPerThreadGroup = Math::divideAndRoundUp(localWorkSize, simd); + EXPECT_EQ(numThreadsPerThreadGroup, interfaceDescriptorData.getNumberOfThreadsInGpgpuThreadGroup()); EXPECT_NE(0u, interfaceDescriptorData.getCrossThreadConstantDataReadLength()); EXPECT_NE(0u, interfaceDescriptorData.getConstantIndirectUrbEntryReadLength()); diff --git a/shared/source/command_container/command_encoder.h b/shared/source/command_container/command_encoder.h index f7be3c6f69..4f6446d932 100644 --- a/shared/source/command_container/command_encoder.h +++ b/shared/source/command_container/command_encoder.h @@ -12,6 +12,7 @@ #include "shared/source/helpers/register_offsets.h" #include "shared/source/helpers/simd_helper.h" #include "shared/source/kernel/dispatch_kernel_encoder_interface.h" +#include "shared/source/kernel/kernel_arg_descriptor.h" #include @@ -96,9 +97,8 @@ struct EncodeIndirectParams { using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM; using MI_MATH = typename GfxFamily::MI_MATH; using MI_MATH_ALU_INST_INLINE = typename GfxFamily::MI_MATH_ALU_INST_INLINE; - static void setGroupCountIndirect(CommandContainer &container, uint32_t offsets[3], void *crossThreadAddress); - - static void setGroupSizeIndirect(CommandContainer &container, uint32_t offsets[3], void *crossThreadAddress, uint32_t lws[3]); + static void setGroupCountIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress); + static void setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress, const uint32_t *lws); static size_t getCmdsSizeForIndirectParams(); static size_t getCmdsSizeForSetGroupSizeIndirect(); diff --git a/shared/source/command_container/command_encoder.inl b/shared/source/command_container/command_encoder.inl index 399ed6c227..7b7542fcd2 100644 --- a/shared/source/command_container/command_encoder.inl +++ b/shared/source/command_container/command_encoder.inl @@ -199,17 +199,23 @@ void EncodeMathMMIO::encodeAluAdd(MI_MATH_ALU_INST_INLINE *pAluParam, } template -void EncodeIndirectParams::setGroupCountIndirect(CommandContainer &container, uint32_t offsets[3], void *crossThreadAddress) { - EncodeStoreMMIO::encode(container, GPUGPU_DISPATCHDIMX, ptrOffset(reinterpret_cast(crossThreadAddress), offsets[0])); - EncodeStoreMMIO::encode(container, GPUGPU_DISPATCHDIMY, ptrOffset(reinterpret_cast(crossThreadAddress), offsets[1])); - EncodeStoreMMIO::encode(container, GPUGPU_DISPATCHDIMZ, ptrOffset(reinterpret_cast(crossThreadAddress), offsets[2])); +void EncodeIndirectParams::setGroupCountIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress) { + for (int i = 0; i < 3; ++i) { + if (NEO::isUndefinedOffset(offsets[i])) { + continue; + } + EncodeStoreMMIO::encode(container, GPUGPU_DISPATCHDIM[i], ptrOffset(reinterpret_cast(crossThreadAddress), offsets[i])); + } } template -void EncodeIndirectParams::setGroupSizeIndirect(CommandContainer &container, uint32_t offsets[3], void *crossThreadAddress, uint32_t lws[3]) { - EncodeMathMMIO::encodeMulRegVal(container, GPUGPU_DISPATCHDIMX, lws[0], ptrOffset(reinterpret_cast(crossThreadAddress), offsets[0])); - EncodeMathMMIO::encodeMulRegVal(container, GPUGPU_DISPATCHDIMY, lws[1], ptrOffset(reinterpret_cast(crossThreadAddress), offsets[1])); - EncodeMathMMIO::encodeMulRegVal(container, GPUGPU_DISPATCHDIMZ, lws[2], ptrOffset(reinterpret_cast(crossThreadAddress), offsets[2])); +void EncodeIndirectParams::setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress, const uint32_t *lws) { + for (int i = 0; i < 3; ++i) { + if (NEO::isUndefinedOffset(offsets[i])) { + continue; + } + EncodeMathMMIO::encodeMulRegVal(container, GPUGPU_DISPATCHDIM[i], lws[i], ptrOffset(reinterpret_cast(crossThreadAddress), offsets[i])); + } } template diff --git a/shared/source/command_container/command_encoder_base.inl b/shared/source/command_container/command_encoder_base.inl index 99f4bc3f11..8cf6f9f700 100644 --- a/shared/source/command_container/command_encoder_base.inl +++ b/shared/source/command_container/command_encoder_base.inl @@ -29,9 +29,10 @@ void EncodeDispatchKernel::encode(CommandContainer &container, using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename Family::MEDIA_INTERFACE_DESCRIPTOR_LOAD; using MI_BATCH_BUFFER_END = typename Family::MI_BATCH_BUFFER_END; - auto sizeCrossThreadData = dispatchInterface->getSizeCrossThreadData(); - auto sizePerThreadData = dispatchInterface->getSizePerThreadData(); - auto sizePerThreadDataForWholeGroup = dispatchInterface->getSizePerThreadDataForWholeGroup(); + auto &kernelDescriptor = dispatchInterface->getKernelDescriptor(); + auto sizeCrossThreadData = dispatchInterface->getCrossThreadDataSize(); + auto sizePerThreadData = dispatchInterface->getPerThreadDataSize(); + auto sizePerThreadDataForWholeGroup = dispatchInterface->getPerThreadDataSizeForWholeThreadGroup(); LinearStream *listCmdBufferStream = container.getCommandStream(); @@ -58,26 +59,26 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeStates::adjustStateComputeMode(*container.getCommandStream(), container.lastSentNumGrfRequired, nullptr, false, false); EncodeWA::encodeAdditionalPipelineSelect(*container.getDevice(), *container.getCommandStream(), false); - auto threadsPerThreadGroup = dispatchInterface->getThreadsPerThreadGroupCount(); - idd.setNumberOfThreadsInGpgpuThreadGroup(threadsPerThreadGroup); + auto numThreadsPerThreadGroup = dispatchInterface->getNumThreadsPerThreadGroup(); + idd.setNumberOfThreadsInGpgpuThreadGroup(numThreadsPerThreadGroup); - idd.setBarrierEnable(dispatchInterface->hasBarriers()); + idd.setBarrierEnable(kernelDescriptor.kernelAttributes.flags.usesBarriers); idd.setSharedLocalMemorySize( dispatchInterface->getSlmTotalSize() > 0 ? static_cast(HardwareCommandsHelper::computeSlmValues(dispatchInterface->getSlmTotalSize())) : INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE_ENCODES_0K); { - auto bindingTableStateCount = dispatchInterface->getNumSurfaceStates(); + uint32_t bindingTableStateCount = kernelDescriptor.payloadMappings.bindingTable.numEntries; uint32_t bindingTablePointer = 0u; if (bindingTableStateCount > 0u) { - auto ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::SURFACE_STATE, dispatchInterface->getSizeSurfaceStateHeapData(), BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE); + auto ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::SURFACE_STATE, dispatchInterface->getSurfaceStateHeapDataSize(), BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE); bindingTablePointer = static_cast(HardwareCommandsHelper::pushBindingTableAndSurfaceStates( *ssh, bindingTableStateCount, - dispatchInterface->getSurfaceStateHeap(), - dispatchInterface->getSizeSurfaceStateHeapData(), bindingTableStateCount, - dispatchInterface->getBindingTableOffset())); + dispatchInterface->getSurfaceStateHeapData(), + dispatchInterface->getSurfaceStateHeapDataSize(), bindingTableStateCount, + kernelDescriptor.payloadMappings.bindingTable.tableOffset)); } idd.setBindingTablePointer(bindingTablePointer); @@ -96,12 +97,12 @@ void EncodeDispatchKernel::encode(CommandContainer &container, uint32_t samplerStateOffset = 0; uint32_t samplerCount = 0; - if (dispatchInterface->getNumSamplers() > 0) { - samplerCount = dispatchInterface->getNumSamplers(); - samplerStateOffset = EncodeStates::copySamplerState(heap, dispatchInterface->getSamplerTableOffset(), - dispatchInterface->getNumSamplers(), - dispatchInterface->getBorderColor(), - dispatchInterface->getDynamicStateHeap()); + if (kernelDescriptor.payloadMappings.samplerTable.numSamplers > 0) { + samplerCount = kernelDescriptor.payloadMappings.samplerTable.numSamplers; + samplerStateOffset = EncodeStates::copySamplerState(heap, kernelDescriptor.payloadMappings.samplerTable.tableOffset, + kernelDescriptor.payloadMappings.samplerTable.numSamplers, + kernelDescriptor.payloadMappings.samplerTable.borderColor, + dispatchInterface->getDynamicStateHeapData()); } idd.setSamplerStatePointer(samplerStateOffset); @@ -129,21 +130,17 @@ void EncodeDispatchKernel::encode(CommandContainer &container, offsetThreadData = heapIndirect->getHeapGpuStartOffset() + static_cast(heapIndirect->getUsed() - sizeThreadData); memcpy_s(ptr, sizeCrossThreadData, - dispatchInterface->getCrossThread(), sizeCrossThreadData); + dispatchInterface->getCrossThreadData(), sizeCrossThreadData); if (isIndirect) { void *gpuPtr = reinterpret_cast(heapIndirect->getHeapGpuBase() + heapIndirect->getUsed() - sizeThreadData); - if (dispatchInterface->hasGroupCounts()) { - EncodeIndirectParams::setGroupCountIndirect(container, dispatchInterface->getCountOffsets(), gpuPtr); - } - if (dispatchInterface->hasGroupSize()) { - EncodeIndirectParams::setGroupSizeIndirect(container, dispatchInterface->getSizeOffsets(), gpuPtr, dispatchInterface->getLocalWorkSize()); - } + EncodeIndirectParams::setGroupCountIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups, gpuPtr); + EncodeIndirectParams::setGlobalWorkSizeIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize, gpuPtr, dispatchInterface->getGroupSize()); } ptr = ptrOffset(ptr, sizeCrossThreadData); memcpy_s(ptr, sizePerThreadDataForWholeGroup, - dispatchInterface->getPerThread(), sizePerThreadDataForWholeGroup); + dispatchInterface->getPerThreadData(), sizePerThreadDataForWholeGroup); } auto slmSizeNew = dispatchInterface->getSlmTotalSize(); @@ -185,14 +182,14 @@ void EncodeDispatchKernel::encode(CommandContainer &container, cmd.setThreadGroupIdZDimension(threadDims[2]); } - auto simdSize = dispatchInterface->getSimdSize(); + auto simdSize = kernelDescriptor.kernelAttributes.simdSize; auto simdSizeOp = getSimdConfig(simdSize); cmd.setSimdSize(simdSizeOp); - cmd.setRightExecutionMask(dispatchInterface->getPerThreadExecutionMask()); + cmd.setRightExecutionMask(dispatchInterface->getThreadExecutionMask()); cmd.setBottomExecutionMask(0xffffffff); - cmd.setThreadWidthCounterMaximum(threadsPerThreadGroup); + cmd.setThreadWidthCounterMaximum(numThreadsPerThreadGroup); cmd.setPredicateEnable(isPredicate); diff --git a/shared/source/helpers/kernel_helpers.cpp b/shared/source/helpers/kernel_helpers.cpp index ec90a8b551..84a42a3a50 100644 --- a/shared/source/helpers/kernel_helpers.cpp +++ b/shared/source/helpers/kernel_helpers.cpp @@ -25,8 +25,8 @@ uint32_t KernelHelper::getMaxWorkGroupCount(uint32_t simd, uint32_t availableThr workGroupSize *= localWorkSize[i]; } - auto threadsPerThreadGroup = static_cast(Math::divideAndRoundUp(workGroupSize, simd)); - auto maxWorkGroupsCount = availableThreadCount / threadsPerThreadGroup; + auto numThreadsPerThreadGroup = static_cast(Math::divideAndRoundUp(workGroupSize, simd)); + auto maxWorkGroupsCount = availableThreadCount / numThreadsPerThreadGroup; if (numberOfBarriers > 0) { auto maxWorkGroupsCountDueToBarrierUsage = dssCount * (maxBarrierCount / numberOfBarriers); diff --git a/shared/source/helpers/register_offsets.h b/shared/source/helpers/register_offsets.h index 8d24e47680..dfef250587 100644 --- a/shared/source/helpers/register_offsets.h +++ b/shared/source/helpers/register_offsets.h @@ -19,6 +19,8 @@ constexpr uint32_t GPUGPU_DISPATCHDIMX = 0x2500; constexpr uint32_t GPUGPU_DISPATCHDIMY = 0x2504; constexpr uint32_t GPUGPU_DISPATCHDIMZ = 0x2508; +constexpr uint32_t GPUGPU_DISPATCHDIM[3] = {GPUGPU_DISPATCHDIMX, GPUGPU_DISPATCHDIMY, GPUGPU_DISPATCHDIMZ}; + constexpr uint32_t CS_GPR_R0 = 0x2600; constexpr uint32_t CS_GPR_R1 = 0x2608; constexpr uint32_t CS_GPR_R2 = 0x2610; diff --git a/shared/source/kernel/dispatch_kernel_encoder_interface.h b/shared/source/kernel/dispatch_kernel_encoder_interface.h index ee085aae48..29160fa55e 100644 --- a/shared/source/kernel/dispatch_kernel_encoder_interface.h +++ b/shared/source/kernel/dispatch_kernel_encoder_interface.h @@ -10,42 +10,28 @@ namespace NEO { class GraphicsAllocation; +struct KernelDescriptor; struct DispatchKernelEncoderI { - public: - virtual bool hasBarriers() = 0; - virtual uint32_t getSlmTotalSize() = 0; - virtual uint32_t getBindingTableOffset() = 0; - virtual uint32_t getBorderColor() = 0; - virtual uint32_t getSamplerTableOffset() = 0; - virtual uint32_t getNumSurfaceStates() = 0; - virtual uint32_t getNumSamplers() = 0; - virtual uint32_t getSimdSize() = 0; - virtual uint32_t getSizeCrossThreadData() = 0; - virtual uint32_t getPerThreadScratchSize() = 0; - virtual uint32_t getPerThreadExecutionMask() = 0; - virtual uint32_t getSizePerThreadData() = 0; - virtual uint32_t getSizePerThreadDataForWholeGroup() = 0; - virtual uint32_t getSizeSurfaceStateHeapData() = 0; - virtual uint32_t *getCountOffsets() = 0; - virtual uint32_t *getSizeOffsets() = 0; - virtual uint32_t *getLocalWorkSize() = 0; - virtual uint32_t getNumGrfRequired() = 0; - virtual uint32_t getThreadsPerThreadGroupCount() = 0; - virtual GraphicsAllocation *getIsaAllocation() = 0; - virtual bool hasGroupCounts() = 0; - virtual bool hasGroupSize() = 0; - virtual const void *getSurfaceStateHeap() = 0; - virtual const void *getDynamicStateHeap() = 0; - virtual const void *getCrossThread() = 0; - virtual const void *getPerThread() = 0; - virtual bool isInlineDataRequired() = 0; - virtual uint8_t getNumLocalIdChannels() = 0; virtual ~DispatchKernelEncoderI() = default; - protected: - uint32_t groupCountOffsets[3] = {}; - uint32_t groupSizeOffsets[3] = {}; - uint32_t localWorkSize[3] = {}; + virtual const KernelDescriptor &getKernelDescriptor() const = 0; + virtual const uint32_t *getGroupSize() const = 0; + virtual uint32_t getSlmTotalSize() const = 0; + + virtual const uint8_t *getCrossThreadData() const = 0; + virtual uint32_t getCrossThreadDataSize() const = 0; + + virtual uint32_t getThreadExecutionMask() const = 0; + virtual uint32_t getNumThreadsPerThreadGroup() const = 0; + virtual const uint8_t *getPerThreadData() const = 0; + virtual uint32_t getPerThreadDataSize() const = 0; + virtual uint32_t getPerThreadDataSizeForWholeThreadGroup() const = 0; + + virtual const uint8_t *getSurfaceStateHeapData() const = 0; + virtual uint32_t getSurfaceStateHeapDataSize() const = 0; + + virtual GraphicsAllocation *getIsaAllocation() const = 0; + virtual const uint8_t *getDynamicStateHeapData() const = 0; }; } // namespace NEO \ No newline at end of file diff --git a/shared/source/kernel/kernel_descriptor_from_patchtokens.cpp b/shared/source/kernel/kernel_descriptor_from_patchtokens.cpp index f83279482c..f1690fd3ce 100644 --- a/shared/source/kernel/kernel_descriptor_from_patchtokens.cpp +++ b/shared/source/kernel/kernel_descriptor_from_patchtokens.cpp @@ -55,7 +55,7 @@ void populateKernelDescriptor(KernelDescriptor &dst, const SPatchExecutionEnviro dst.kernelAttributes.flags.requiresSubgroupIndependentForwardProgress = (0 != execEnv.SubgroupIndependentForwardProgressRequired); dst.kernelAttributes.numGrfRequired = execEnv.NumGRFRequired; dst.kernelAttributes.flags.useGlobalAtomics = execEnv.HasGlobalAtomics; - dst.kernelAttributes.flags.usesStatelessWrites = 0U; + dst.kernelAttributes.flags.usesStatelessWrites = (execEnv.StatelessWritesCount > 0U); } void populateKernelDescriptor(KernelDescriptor &dst, const SPatchSamplerStateArray &token) { @@ -85,7 +85,6 @@ void populateKernelDescriptor(KernelDescriptor &dst, const SPatchInterfaceDescri void populateKernelDescriptor(KernelDescriptor &dst, const SPatchThreadPayload &token) { dst.kernelAttributes.flags.perThreadDataHeaderIsPresent = (0U != token.HeaderPresent); dst.kernelAttributes.numLocalIdChannels = token.LocalIDXPresent + token.LocalIDYPresent + token.LocalIDZPresent; - ; dst.kernelAttributes.flags.usesFlattenedLocalIds = (0U != token.LocalIDFlattenedPresent); dst.kernelAttributes.flags.perThreadDataUnusedGrfIsPresent = (0U != token.UnusedPerThreadConstantPresent); dst.kernelAttributes.flags.passInlineData = (0 != token.PassInlineData); diff --git a/shared/test/unit_test/encoders/test_encode_dispatch_kernel.cpp b/shared/test/unit_test/encoders/test_encode_dispatch_kernel.cpp index cce8d4b07f..0df97356b3 100644 --- a/shared/test/unit_test/encoders/test_encode_dispatch_kernel.cpp +++ b/shared/test/unit_test/encoders/test_encode_dispatch_kernel.cpp @@ -92,7 +92,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, CommandEncodeStatesTest, givenSlmTotalSizeEqualZeroW EXPECT_EQ(expectedValue, interfaceDescriptorData->getSharedLocalMemorySize()); } -HWCMDTEST_F(IGFX_GEN8_CORE, CommandEncodeStatesTest, givennumBindingTableOneWhenDispatchingKernelThenBindingTableOffsetIsCorrect) { +HWCMDTEST_F(IGFX_GEN8_CORE, CommandEncodeStatesTest, givenOneBindingTableEntryWhenDispatchingKernelThenBindingTableOffsetIsCorrect) { using BINDING_TABLE_STATE = typename FamilyType::BINDING_TABLE_STATE; using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA; uint32_t numBindingTable = 1; @@ -107,10 +107,11 @@ HWCMDTEST_F(IGFX_GEN8_CORE, CommandEncodeStatesTest, givennumBindingTableOneWhen uint32_t dims[] = {2, 1, 1}; std::unique_ptr dispatchInterface(new MockDispatchKernelEncoder()); - EXPECT_CALL(*dispatchInterface.get(), getNumSurfaceStates()).WillRepeatedly(::testing::Return(numBindingTable)); - EXPECT_CALL(*dispatchInterface.get(), getSurfaceStateHeap()).WillRepeatedly(::testing::Return(&bindingTableState)); - EXPECT_CALL(*dispatchInterface.get(), getSizeSurfaceStateHeapData()).WillRepeatedly(::testing::Return(static_cast(sizeof(BINDING_TABLE_STATE)))); - EXPECT_CALL(*dispatchInterface.get(), getBindingTableOffset()).WillRepeatedly(::testing::Return(0)); + dispatchInterface->kernelDescriptor.payloadMappings.bindingTable.numEntries = numBindingTable; + dispatchInterface->kernelDescriptor.payloadMappings.bindingTable.tableOffset = 0U; + const uint8_t *sshData = reinterpret_cast(&bindingTableState); + EXPECT_CALL(*dispatchInterface.get(), getSurfaceStateHeapData()).WillRepeatedly(::testing::Return(sshData)); + EXPECT_CALL(*dispatchInterface.get(), getSurfaceStateHeapDataSize()).WillRepeatedly(::testing::Return(static_cast(sizeof(BINDING_TABLE_STATE)))); EncodeDispatchKernel::encode(*cmdContainer.get(), dims, false, false, dispatchInterface.get(), 0, pDevice, NEO::PreemptionMode::Disabled); auto interfaceDescriptorData = static_cast(cmdContainer->getIddBlock()); @@ -132,10 +133,11 @@ HWCMDTEST_F(IGFX_GEN8_CORE, CommandEncodeStatesTest, giveNumBindingTableZeroWhen uint32_t dims[] = {2, 1, 1}; std::unique_ptr dispatchInterface(new MockDispatchKernelEncoder()); - EXPECT_CALL(*dispatchInterface.get(), getNumSurfaceStates()).WillRepeatedly(::testing::Return(numBindingTable)); - EXPECT_CALL(*dispatchInterface.get(), getSurfaceStateHeap()).WillRepeatedly(::testing::Return(&bindingTableState)); - EXPECT_CALL(*dispatchInterface.get(), getSizeSurfaceStateHeapData()).WillRepeatedly(::testing::Return(static_cast(sizeof(BINDING_TABLE_STATE)))); - EXPECT_CALL(*dispatchInterface.get(), getBindingTableOffset()).WillRepeatedly(::testing::Return(0)); + dispatchInterface->kernelDescriptor.payloadMappings.bindingTable.numEntries = numBindingTable; + dispatchInterface->kernelDescriptor.payloadMappings.bindingTable.tableOffset = 0U; + const uint8_t *sshData = reinterpret_cast(&bindingTableState); + EXPECT_CALL(*dispatchInterface.get(), getSurfaceStateHeapData()).WillRepeatedly(::testing::Return(sshData)); + EXPECT_CALL(*dispatchInterface.get(), getSurfaceStateHeapDataSize()).WillRepeatedly(::testing::Return(static_cast(sizeof(BINDING_TABLE_STATE)))); EncodeDispatchKernel::encode(*cmdContainer.get(), dims, false, false, dispatchInterface.get(), 0, pDevice, NEO::PreemptionMode::Disabled); auto interfaceDescriptorData = static_cast(cmdContainer->getIddBlock()); @@ -156,10 +158,11 @@ HWCMDTEST_F(IGFX_GEN8_CORE, CommandEncodeStatesTest, giveNumSamplersOneWhenDispa uint32_t dims[] = {2, 1, 1}; std::unique_ptr dispatchInterface(new MockDispatchKernelEncoder()); - EXPECT_CALL(*dispatchInterface.get(), getNumSamplers()).WillRepeatedly(::testing::Return(numSamplers)); - EXPECT_CALL(*dispatchInterface.get(), getSamplerTableOffset()).WillRepeatedly(::testing::Return(0)); - EXPECT_CALL(*dispatchInterface.get(), getBorderColor()).WillRepeatedly(::testing::Return(0)); - EXPECT_CALL(*dispatchInterface.get(), getDynamicStateHeap()).WillRepeatedly(::testing::Return(&samplerState)); + dispatchInterface->kernelDescriptor.payloadMappings.samplerTable.numSamplers = numSamplers; + dispatchInterface->kernelDescriptor.payloadMappings.samplerTable.tableOffset = 0U; + dispatchInterface->kernelDescriptor.payloadMappings.samplerTable.borderColor = 0U; + const uint8_t *dshData = reinterpret_cast(&samplerState); + EXPECT_CALL(*dispatchInterface.get(), getDynamicStateHeapData()).WillRepeatedly(::testing::Return(dshData)); EncodeDispatchKernel::encode(*cmdContainer.get(), dims, false, false, dispatchInterface.get(), 0, pDevice, NEO::PreemptionMode::Disabled); auto interfaceDescriptorData = static_cast(cmdContainer->getIddBlock()); @@ -186,10 +189,11 @@ HWCMDTEST_F(IGFX_GEN8_CORE, CommandEncodeStatesTest, giveNumSamplersZeroWhenDisp uint32_t dims[] = {2, 1, 1}; std::unique_ptr dispatchInterface(new MockDispatchKernelEncoder()); - EXPECT_CALL(*dispatchInterface.get(), getNumSamplers()).WillRepeatedly(::testing::Return(numSamplers)); - EXPECT_CALL(*dispatchInterface.get(), getSamplerTableOffset()).WillRepeatedly(::testing::Return(0)); - EXPECT_CALL(*dispatchInterface.get(), getBorderColor()).WillRepeatedly(::testing::Return(0)); - EXPECT_CALL(*dispatchInterface.get(), getDynamicStateHeap()).WillRepeatedly(::testing::Return(&samplerState)); + dispatchInterface->kernelDescriptor.payloadMappings.samplerTable.numSamplers = numSamplers; + dispatchInterface->kernelDescriptor.payloadMappings.samplerTable.tableOffset = 0U; + dispatchInterface->kernelDescriptor.payloadMappings.samplerTable.borderColor = 0U; + const uint8_t *dshData = reinterpret_cast(&samplerState); + EXPECT_CALL(*dispatchInterface.get(), getDynamicStateHeapData()).WillRepeatedly(::testing::Return(dshData)); EncodeDispatchKernel::encode(*cmdContainer.get(), dims, false, false, dispatchInterface.get(), 0, pDevice, NEO::PreemptionMode::Disabled); auto interfaceDescriptorData = static_cast(cmdContainer->getIddBlock()); @@ -203,16 +207,14 @@ HWCMDTEST_F(IGFX_GEN8_CORE, CommandEncodeStatesTest, giveNumSamplersZeroWhenDisp EXPECT_NE(memcmp(pSmplr, &samplerState, sizeof(SAMPLER_STATE)), 0); } -HWTEST_F(CommandEncodeStatesTest, givenIndarectOffsetsCountsWhenDispatchingKernelThenCorrestMIStoreOffsetsSet) { +HWTEST_F(CommandEncodeStatesTest, givenIndirectOffsetsCountsWhenDispatchingKernelThenCorrestMIStoreOffsetsSet) { using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM; uint32_t dims[] = {2, 1, 1}; uint32_t offsets[] = {0x10, 0x20, 0x30}; std::unique_ptr dispatchInterface(new MockDispatchKernelEncoder()); - - EXPECT_CALL(*dispatchInterface.get(), hasGroupCounts()).WillRepeatedly(::testing::Return(true)); - EXPECT_CALL(*dispatchInterface.get(), getCountOffsets()).WillRepeatedly(::testing::Return(offsets)); - EXPECT_CALL(*dispatchInterface.get(), hasGroupSize()).WillRepeatedly(::testing::Return(false)); - + dispatchInterface->kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups[0] = offsets[0]; + dispatchInterface->kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups[1] = offsets[1]; + dispatchInterface->kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups[2] = offsets[2]; EncodeDispatchKernel::encode(*cmdContainer.get(), dims, true, false, dispatchInterface.get(), 0, pDevice, NEO::PreemptionMode::Disabled); GenCmdList commands; @@ -233,11 +235,10 @@ HWTEST_F(CommandEncodeStatesTest, givenIndarectOffsetsSizeWhenDispatchingKernelT uint32_t lws[] = {1, 1, 1}; std::unique_ptr dispatchInterface(new MockDispatchKernelEncoder()); - EXPECT_CALL(*dispatchInterface.get(), hasGroupCounts()).WillRepeatedly(::testing::Return(false)); - EXPECT_CALL(*dispatchInterface.get(), getSizeOffsets()).WillRepeatedly(::testing::Return(offsets)); - EXPECT_CALL(*dispatchInterface.get(), hasGroupSize()).WillRepeatedly(::testing::Return(true)); - EXPECT_CALL(*dispatchInterface.get(), getLocalWorkSize()).WillRepeatedly(::testing::Return(lws)); - + EXPECT_CALL(*dispatchInterface.get(), getGroupSize()).WillRepeatedly(::testing::Return(lws)); + dispatchInterface->kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize[0] = offsets[0]; + dispatchInterface->kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize[1] = offsets[1]; + dispatchInterface->kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize[2] = offsets[2]; EncodeDispatchKernel::encode(*cmdContainer.get(), dims, true, false, dispatchInterface.get(), 0, pDevice, NEO::PreemptionMode::Disabled); GenCmdList commands; diff --git a/shared/test/unit_test/encoders/test_encode_math.cpp b/shared/test/unit_test/encoders/test_encode_math.cpp index d676f55635..be9078ff35 100644 --- a/shared/test/unit_test/encoders/test_encode_math.cpp +++ b/shared/test/unit_test/encoders/test_encode_math.cpp @@ -185,11 +185,11 @@ HWTEST_F(CommandEncoderMathTest, setGroupSizeIndirect) { CommandContainer cmdContainer; cmdContainer.initialize(pDevice); - uint32_t offsets[3] = {0, sizeof(uint32_t), 2 * sizeof(uint32_t)}; + CrossThreadDataOffset offsets[3] = {0, sizeof(uint32_t), 2 * sizeof(uint32_t)}; uint32_t crossThreadAdress[3] = {}; uint32_t lws[3] = {2, 1, 1}; - EncodeIndirectParams::setGroupSizeIndirect(cmdContainer, offsets, crossThreadAdress, lws); + EncodeIndirectParams::setGlobalWorkSizeIndirect(cmdContainer, offsets, crossThreadAdress, lws); GenCmdList commands; CmdParse::parseCommandBuffer(commands, ptrOffset(cmdContainer.getCommandStream()->getCpuBase(), 0), cmdContainer.getCommandStream()->getUsed()); @@ -211,7 +211,7 @@ HWTEST_F(CommandEncoderMathTest, setGroupCountIndirect) { CommandContainer cmdContainer; cmdContainer.initialize(pDevice); - uint32_t offsets[3] = {0, sizeof(uint32_t), 2 * sizeof(uint32_t)}; + CrossThreadDataOffset offsets[3] = {0, sizeof(uint32_t), 2 * sizeof(uint32_t)}; uint32_t crossThreadAdress[3] = {}; EncodeIndirectParams::setGroupCountIndirect(cmdContainer, offsets, crossThreadAdress); diff --git a/shared/test/unit_test/kernel/kernel_descriptor_from_patchtokens_tests.cpp b/shared/test/unit_test/kernel/kernel_descriptor_from_patchtokens_tests.cpp index 065c295977..9bc7e47f66 100644 --- a/shared/test/unit_test/kernel/kernel_descriptor_from_patchtokens_tests.cpp +++ b/shared/test/unit_test/kernel/kernel_descriptor_from_patchtokens_tests.cpp @@ -129,6 +129,9 @@ TEST(KernelDescriptorFromPatchtokens, GivenExecutionEnvironmentThenSetsProperPar EXPECT_TRUE(kernelDescriptor.kernelAttributes.flags.useGlobalAtomics); EXPECT_FALSE(kernelDescriptor.kernelAttributes.flags.usesStatelessWrites); + execEnv.StatelessWritesCount = 1U; + NEO::populateKernelDescriptor(kernelDescriptor, kernelTokens, 4); + EXPECT_TRUE(kernelDescriptor.kernelAttributes.flags.usesStatelessWrites); } TEST(KernelDescriptorFromPatchtokens, GivenThreadPayloadThenSetsProperPartsOfDescriptor) { diff --git a/shared/test/unit_test/mocks/mock_dispatch_kernel_encoder_interface.cpp b/shared/test/unit_test/mocks/mock_dispatch_kernel_encoder_interface.cpp index 5f32a745f7..86817b9b30 100644 --- a/shared/test/unit_test/mocks/mock_dispatch_kernel_encoder_interface.cpp +++ b/shared/test/unit_test/mocks/mock_dispatch_kernel_encoder_interface.cpp @@ -12,34 +12,28 @@ using namespace NEO; using ::testing::Return; MockDispatchKernelEncoder::MockDispatchKernelEncoder() { - EXPECT_CALL(*this, getIsaAllocation).WillRepeatedly(Return(&mockAllocation)); - EXPECT_CALL(*this, getSizeCrossThreadData).WillRepeatedly(Return(crossThreadSize)); - EXPECT_CALL(*this, getSizePerThreadData).WillRepeatedly(Return(perThreadSize)); + EXPECT_CALL(*this, getKernelDescriptor).WillRepeatedly(::testing::ReturnRef(kernelDescriptor)); + + EXPECT_CALL(*this, getIsaAllocation).WillRepeatedly(Return(&mockAllocation)); + EXPECT_CALL(*this, getCrossThreadDataSize).WillRepeatedly(Return(crossThreadSize)); + EXPECT_CALL(*this, getPerThreadDataSize).WillRepeatedly(Return(perThreadSize)); + + EXPECT_CALL(*this, getCrossThreadData).WillRepeatedly(Return(dataCrossThread)); + EXPECT_CALL(*this, getPerThreadData).WillRepeatedly(Return(dataPerThread)); - EXPECT_CALL(*this, getCrossThread).WillRepeatedly(Return(&dataCrossThread)); - EXPECT_CALL(*this, getPerThread).WillRepeatedly(Return(&dataPerThread)); expectAnyMockFunctionCall(); } + void MockDispatchKernelEncoder::expectAnyMockFunctionCall() { - EXPECT_CALL(*this, hasBarriers()).Times(::testing::AnyNumber()); + EXPECT_CALL(*this, getGroupSize()).Times(::testing::AnyNumber()); EXPECT_CALL(*this, getSlmTotalSize()).Times(::testing::AnyNumber()); - EXPECT_CALL(*this, getBindingTableOffset()).Times(::testing::AnyNumber()); - EXPECT_CALL(*this, getBorderColor()).Times(::testing::AnyNumber()); - EXPECT_CALL(*this, getSamplerTableOffset()).Times(::testing::AnyNumber()); - EXPECT_CALL(*this, getNumSurfaceStates()).Times(::testing::AnyNumber()); - EXPECT_CALL(*this, getNumSamplers()).Times(::testing::AnyNumber()); - EXPECT_CALL(*this, getSimdSize()).Times(::testing::AnyNumber()); - EXPECT_CALL(*this, getPerThreadScratchSize()).Times(::testing::AnyNumber()); - EXPECT_CALL(*this, getPerThreadExecutionMask()).Times(::testing::AnyNumber()); - EXPECT_CALL(*this, getSizePerThreadDataForWholeGroup()).Times(::testing::AnyNumber()); - EXPECT_CALL(*this, getSizeSurfaceStateHeapData()).Times(::testing::AnyNumber()); - EXPECT_CALL(*this, getCountOffsets()).Times(::testing::AnyNumber()); - EXPECT_CALL(*this, getSizeOffsets()).Times(::testing::AnyNumber()); - EXPECT_CALL(*this, getLocalWorkSize()).Times(::testing::AnyNumber()); - EXPECT_CALL(*this, getNumGrfRequired()).Times(::testing::AnyNumber()); - EXPECT_CALL(*this, getThreadsPerThreadGroupCount()).Times(::testing::AnyNumber()); - EXPECT_CALL(*this, hasGroupCounts()).Times(::testing::AnyNumber()); - EXPECT_CALL(*this, getSurfaceStateHeap()).Times(::testing::AnyNumber()); - EXPECT_CALL(*this, getDynamicStateHeap()).Times(::testing::AnyNumber()); - EXPECT_CALL(*this, isInlineDataRequired()).Times(::testing::AnyNumber()); -} \ No newline at end of file + + EXPECT_CALL(*this, getThreadExecutionMask()).Times(::testing::AnyNumber()); + EXPECT_CALL(*this, getNumThreadsPerThreadGroup()).Times(::testing::AnyNumber()); + EXPECT_CALL(*this, getPerThreadDataSizeForWholeThreadGroup()).Times(::testing::AnyNumber()); + + EXPECT_CALL(*this, getSurfaceStateHeapData()).Times(::testing::AnyNumber()); + EXPECT_CALL(*this, getSurfaceStateHeapDataSize()).Times(::testing::AnyNumber()); + + EXPECT_CALL(*this, getDynamicStateHeapData()).Times(::testing::AnyNumber()); +} diff --git a/shared/test/unit_test/mocks/mock_dispatch_kernel_encoder_interface.h b/shared/test/unit_test/mocks/mock_dispatch_kernel_encoder_interface.h index dc869329f2..37e175b0ab 100644 --- a/shared/test/unit_test/mocks/mock_dispatch_kernel_encoder_interface.h +++ b/shared/test/unit_test/mocks/mock_dispatch_kernel_encoder_interface.h @@ -7,6 +7,7 @@ #pragma once #include "shared/source/kernel/dispatch_kernel_encoder_interface.h" +#include "shared/source/kernel/kernel_descriptor.h" #include "opencl/test/unit_test/mocks/mock_graphics_allocation.h" @@ -20,34 +21,24 @@ class GraphicsAllocation; struct MockDispatchKernelEncoder : public DispatchKernelEncoderI { public: MockDispatchKernelEncoder(); - MOCK_METHOD0(hasBarriers, bool()); - MOCK_METHOD0(getSlmTotalSize, uint32_t()); - MOCK_METHOD0(getBindingTableOffset, uint32_t()); - MOCK_METHOD0(getBorderColor, uint32_t()); - MOCK_METHOD0(getSamplerTableOffset, uint32_t()); - MOCK_METHOD0(getNumSurfaceStates, uint32_t()); - MOCK_METHOD0(getNumSamplers, uint32_t()); - MOCK_METHOD0(getSimdSize, uint32_t()); - MOCK_METHOD0(getSizeCrossThreadData, uint32_t()); - MOCK_METHOD0(getPerThreadScratchSize, uint32_t()); - MOCK_METHOD0(getPerThreadExecutionMask, uint32_t()); - MOCK_METHOD0(getSizePerThreadData, uint32_t()); - MOCK_METHOD0(getSizePerThreadDataForWholeGroup, uint32_t()); - MOCK_METHOD0(getSizeSurfaceStateHeapData, uint32_t()); - MOCK_METHOD0(getCountOffsets, uint32_t *()); - MOCK_METHOD0(getSizeOffsets, uint32_t *()); - MOCK_METHOD0(getLocalWorkSize, uint32_t *()); - MOCK_METHOD0(getNumGrfRequired, uint32_t()); - MOCK_METHOD0(getThreadsPerThreadGroupCount, uint32_t()); - MOCK_METHOD0(getIsaAllocation, GraphicsAllocation *()); - MOCK_METHOD0(hasGroupCounts, bool()); - MOCK_METHOD0(hasGroupSize, bool()); - MOCK_METHOD0(getSurfaceStateHeap, const void *()); - MOCK_METHOD0(getDynamicStateHeap, const void *()); - MOCK_METHOD0(getCrossThread, const void *()); - MOCK_METHOD0(getPerThread, const void *()); - MOCK_METHOD0(isInlineDataRequired, bool()); - MOCK_METHOD0(getNumLocalIdChannels, uint8_t()); + MOCK_CONST_METHOD0(getKernelDescriptor, const KernelDescriptor &()); + MOCK_CONST_METHOD0(getGroupSize, const uint32_t *()); + MOCK_CONST_METHOD0(getSlmTotalSize, uint32_t()); + + MOCK_CONST_METHOD0(getCrossThreadData, const uint8_t *()); + MOCK_CONST_METHOD0(getCrossThreadDataSize, uint32_t()); + + MOCK_CONST_METHOD0(getThreadExecutionMask, uint32_t()); + MOCK_CONST_METHOD0(getNumThreadsPerThreadGroup, uint32_t()); + MOCK_CONST_METHOD0(getPerThreadData, const uint8_t *()); + MOCK_CONST_METHOD0(getPerThreadDataSize, uint32_t()); + MOCK_CONST_METHOD0(getPerThreadDataSizeForWholeThreadGroup, uint32_t()); + + MOCK_CONST_METHOD0(getSurfaceStateHeapData, const uint8_t *()); + MOCK_CONST_METHOD0(getSurfaceStateHeapDataSize, uint32_t()); + + MOCK_CONST_METHOD0(getIsaAllocation, GraphicsAllocation *()); + MOCK_CONST_METHOD0(getDynamicStateHeapData, const uint8_t *()); void expectAnyMockFunctionCall(); @@ -56,5 +47,6 @@ struct MockDispatchKernelEncoder : public DispatchKernelEncoderI { static constexpr uint32_t perThreadSize = 0x20; uint8_t dataCrossThread[crossThreadSize]; uint8_t dataPerThread[perThreadSize]; + KernelDescriptor kernelDescriptor; }; } // namespace NEO \ No newline at end of file