From 61f701aba5b728c9b6a5d78cf3ee7de3046efd48 Mon Sep 17 00:00:00 2001 From: "Cencelewska, Katarzyna" Date: Fri, 30 Jun 2023 14:17:13 +0000 Subject: [PATCH] fix: Unify logic calculating threads per work group part 3 Related-To: NEO-8087 Signed-off-by: Cencelewska, Katarzyna --- level_zero/core/source/kernel/kernel_imp.cpp | 2 +- .../test_cmdlist_append_launch_kernel_2.cpp | 7 +++--- .../test_cmdlist_append_launch_kernel_3.cpp | 3 ++- .../source/helpers/hardware_commands_helper.h | 2 +- .../helpers/hardware_commands_helper_base.inl | 21 +++++++++++++--- ...hardware_commands_helper_bdw_and_later.inl | 6 +++-- ...ardware_commands_helper_xehp_and_later.inl | 14 ++++++----- .../command_queue/dispatch_walker_tests.cpp | 15 ++++++----- .../get_size_required_buffer_tests.cpp | 6 +++-- .../get_size_required_image_tests.cpp | 18 ++++++++----- .../hardware_commands_helper_tests.cpp | 14 +++++------ .../command_encoder_bdw_and_later.inl | 3 ++- .../command_encoder_xehp_and_later.inl | 2 +- shared/source/helpers/per_thread_data.h | 7 ++++-- shared/source/kernel/implicit_args.h | 2 +- shared/source/kernel/implicit_args_helper.cpp | 8 +++--- .../kernel/implicit_args_helper_tests.cpp | 25 ++++++++++--------- 17 files changed, 93 insertions(+), 62 deletions(-) diff --git a/level_zero/core/source/kernel/kernel_imp.cpp b/level_zero/core/source/kernel/kernel_imp.cpp index c7f369fd50..84fb371322 100644 --- a/level_zero/core/source/kernel/kernel_imp.cpp +++ b/level_zero/core/source/kernel/kernel_imp.cpp @@ -345,7 +345,7 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY, auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize; uint32_t perThreadDataSizeForWholeThreadGroupNeeded = static_cast(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal( - simdSize, grfSize, numChannels, itemsInGroup)); + simdSize, grfSize, numChannels, itemsInGroup, !kernelRequiresGenerationOfLocalIdsByRuntime, gfxCoreHelper)); if (perThreadDataSizeForWholeThreadGroupNeeded > perThreadDataSizeForWholeThreadGroupAllocated) { alignedFree(perThreadDataForWholeThreadGroup); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp index 35f10367a1..4886d73b6d 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp @@ -991,7 +991,8 @@ struct CmdlistAppendLaunchKernelWithImplicitArgsTests : CmdlistAppendLaunchKerne result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); EXPECT_EQ(ZE_RESULT_SUCCESS, result); - implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&expectedImplicitArgs, *kernelDescriptor); + const auto &gfxCoreHelper = device->getGfxCoreHelper(); + implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&expectedImplicitArgs, *kernelDescriptor, !kernelRequiresGenerationOfLocalIdsByRuntime, gfxCoreHelper); auto sizeCrossThreadData = kernel->getCrossThreadDataSize(); auto sizePerThreadDataForWholeGroup = kernel->getPerThreadDataSizeForWholeThreadGroup(); EXPECT_EQ(indirectHeap->getUsed(), sizeCrossThreadData + sizePerThreadDataForWholeGroup + implicitArgsProgrammingSize); @@ -1027,7 +1028,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, giv generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, gfxCoreHelper); auto localIdsProgrammingSize = implicitArgsProgrammingSize - sizeof(ImplicitArgs); - size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize); + size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, gfxCoreHelper); EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds)); alignedFree(expectedLocalIds); @@ -1073,7 +1074,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, giv generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, gfxCoreHelper); auto localIdsProgrammingSize = implicitArgsProgrammingSize - sizeof(ImplicitArgs); - size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize); + size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, gfxCoreHelper); EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds)); alignedFree(expectedLocalIds); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp index 8ac69a45b9..251dda7ce8 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp @@ -2062,7 +2062,8 @@ struct CommandListAppendLaunchKernelWithImplicitArgs : CommandListAppendLaunchKe template uint64_t getIndirectHeapOffsetForImplicitArgsBuffer(const Mock<::L0::KernelImp> &kernel) { if (FamilyType::supportsCmdSet(IGFX_XE_HP_CORE)) { - auto implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(kernel.pImplicitArgs.get(), kernel.getKernelDescriptor()); + const auto &gfxCoreHelper = device->getGfxCoreHelper(); + auto implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(kernel.pImplicitArgs.get(), kernel.getKernelDescriptor(), !kernel.kernelRequiresGenerationOfLocalIdsByRuntime, gfxCoreHelper); return implicitArgsProgrammingSize - sizeof(ImplicitArgs); } else { return 0u; diff --git a/opencl/source/helpers/hardware_commands_helper.h b/opencl/source/helpers/hardware_commands_helper.h index 6a364ae988..05f919ccae 100644 --- a/opencl/source/helpers/hardware_commands_helper.h +++ b/opencl/source/helpers/hardware_commands_helper.h @@ -104,7 +104,7 @@ struct HardwareCommandsHelper : public PerThreadDataHelper { const Kernel &kernel); static size_t getSizeRequiredIOH( const Kernel &kernel, - size_t localWorkSize = 256); + const size_t localWorkSizes[3]); static size_t getSizeRequiredSSH( const Kernel &kernel); diff --git a/opencl/source/helpers/hardware_commands_helper_base.inl b/opencl/source/helpers/hardware_commands_helper_base.inl index 582c6b4d6d..9b8653433c 100644 --- a/opencl/source/helpers/hardware_commands_helper_base.inl +++ b/opencl/source/helpers/hardware_commands_helper_base.inl @@ -45,20 +45,33 @@ size_t HardwareCommandsHelper::getSizeRequiredDSH(const Kernel &kerne template size_t HardwareCommandsHelper::getSizeRequiredIOH(const Kernel &kernel, - size_t localWorkSize) { + const size_t localWorkSizes[3]) { + auto localWorkSize = Math::computeTotalElementsCount(localWorkSizes); typedef typename GfxFamily::WALKER_TYPE WALKER_TYPE; const auto &kernelDescriptor = kernel.getDescriptor(); const auto &hwInfo = kernel.getHardwareInfo(); + const auto &gfxCoreHelper = kernel.getGfxCoreHelper(); auto numChannels = kernelDescriptor.kernelAttributes.numLocalIdChannels; uint32_t grfSize = hwInfo.capabilityTable.grfSize; auto simdSize = kernelDescriptor.kernelAttributes.simdSize; + uint32_t requiredWalkOrder = 0u; + auto isHwLocalIdGeneration = !NEO::EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired( + numChannels, + localWorkSizes, + std::array{ + {kernelDescriptor.kernelAttributes.workgroupWalkOrder[0], + kernelDescriptor.kernelAttributes.workgroupWalkOrder[1], + kernelDescriptor.kernelAttributes.workgroupWalkOrder[2]}}, + kernelDescriptor.kernelAttributes.flags.requiresWorkgroupWalkOrder, + requiredWalkOrder, + simdSize); auto size = kernel.getCrossThreadDataSize() + - getPerThreadDataSizeTotal(simdSize, grfSize, numChannels, localWorkSize); + getPerThreadDataSizeTotal(simdSize, grfSize, numChannels, localWorkSize, isHwLocalIdGeneration, gfxCoreHelper); auto pImplicitArgs = kernel.getImplicitArgs(); if (pImplicitArgs) { - size += ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor); + size += ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, isHwLocalIdGeneration, gfxCoreHelper); } return alignUp(size, WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE); } @@ -94,7 +107,7 @@ size_t HardwareCommandsHelper::getTotalSizeRequiredIOH( const MultiDispatchInfo &multiDispatchInfo) { return getSizeRequired(multiDispatchInfo, [](const DispatchInfo &dispatchInfo) { return getSizeRequiredIOH( *dispatchInfo.getKernel(), - Math::computeTotalElementsCount(dispatchInfo.getLocalWorkgroupSize())); }); + dispatchInfo.getLocalWorkgroupSize().values); }); } template diff --git a/opencl/source/helpers/hardware_commands_helper_bdw_and_later.inl b/opencl/source/helpers/hardware_commands_helper_bdw_and_later.inl index 44199f2dd0..b16fa64a8a 100644 --- a/opencl/source/helpers/hardware_commands_helper_bdw_and_later.inl +++ b/opencl/source/helpers/hardware_commands_helper_bdw_and_later.inl @@ -87,11 +87,13 @@ size_t HardwareCommandsHelper::sendCrossThreadData( auto pImplicitArgs = kernel.getImplicitArgs(); if (pImplicitArgs) { const auto &kernelDescriptor = kernel.getDescriptor(); - auto sizeForImplicitArgsProgramming = ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor); + const auto &gfxCoreHelper = kernel.getGfxCoreHelper(); + auto isHwLocalIdGeneration = false; + auto sizeForImplicitArgsProgramming = ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, isHwLocalIdGeneration, gfxCoreHelper); auto implicitArgsGpuVA = indirectHeap.getGraphicsAllocation()->getGpuAddress() + indirectHeap.getUsed(); auto ptrToPatchImplicitArgs = indirectHeap.getSpace(sizeForImplicitArgsProgramming); - const auto &gfxCoreHelper = kernel.getGfxCoreHelper(); + ImplicitArgsHelper::patchImplicitArgs(ptrToPatchImplicitArgs, *pImplicitArgs, kernelDescriptor, {}, gfxCoreHelper); auto implicitArgsCrossThreadPtr = ptrOffset(reinterpret_cast(kernel.getCrossThreadData()), kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer); diff --git a/opencl/source/helpers/hardware_commands_helper_xehp_and_later.inl b/opencl/source/helpers/hardware_commands_helper_xehp_and_later.inl index 7dd4e52ca4..1b1d5b0700 100644 --- a/opencl/source/helpers/hardware_commands_helper_xehp_and_later.inl +++ b/opencl/source/helpers/hardware_commands_helper_xehp_and_later.inl @@ -76,12 +76,6 @@ size_t HardwareCommandsHelper::sendCrossThreadData( pImplicitArgs->localIdTablePtr = indirectHeap.getGraphicsAllocation()->getGpuAddress() + offsetCrossThreadData; const auto &kernelDescriptor = kernel.getDescriptor(); - auto sizeForImplicitArgsProgramming = ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor); - - auto sizeForLocalIdsProgramming = sizeForImplicitArgsProgramming - sizeof(ImplicitArgs); - offsetCrossThreadData += sizeForLocalIdsProgramming; - - auto ptrToPatchImplicitArgs = indirectHeap.getSpace(sizeForImplicitArgsProgramming); const auto &kernelAttributes = kernelDescriptor.kernelAttributes; uint32_t requiredWalkOrder = 0u; @@ -96,7 +90,15 @@ size_t HardwareCommandsHelper::sendCrossThreadData( kernelAttributes.flags.requiresWorkgroupWalkOrder, requiredWalkOrder, kernelDescriptor.kernelAttributes.simdSize); + const auto &gfxCoreHelper = kernel.getGfxCoreHelper(); + auto sizeForImplicitArgsProgramming = ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, !generationOfLocalIdsByRuntime, gfxCoreHelper); + + auto sizeForLocalIdsProgramming = sizeForImplicitArgsProgramming - sizeof(ImplicitArgs); + offsetCrossThreadData += sizeForLocalIdsProgramming; + + auto ptrToPatchImplicitArgs = indirectHeap.getSpace(sizeForImplicitArgsProgramming); + ImplicitArgsHelper::patchImplicitArgs(ptrToPatchImplicitArgs, *pImplicitArgs, kernelDescriptor, std::make_pair(generationOfLocalIdsByRuntime, requiredWalkOrder), gfxCoreHelper); } diff --git a/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp b/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp index 44bd6a252c..10f2dd5af9 100644 --- a/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp +++ b/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp @@ -741,10 +741,8 @@ HWTEST_F(DispatchWalkerTest, GivenBlockedQueueWhenDispatchingWalkerThenRequiredH CsrDependencies(), walkerArgs); - Vec3 localWorkgroupSize(workGroupSize); - auto expectedSizeDSH = HardwareCommandsHelper::getSizeRequiredDSH(kernel); - auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(kernel, Math::computeTotalElementsCount(localWorkgroupSize)); + auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(kernel, workGroupSize); auto expectedSizeSSH = HardwareCommandsHelper::getSizeRequiredSSH(kernel); EXPECT_LE(expectedSizeDSH, blockedCommandsData->dsh->getMaxAvailableSpace()); @@ -1433,7 +1431,7 @@ HWTEST_F(DispatchWalkerTest, WhenKernelRequiresImplicitArgsThenIohRequiresMoreSp size_t workItems[3] = {1, 1, 1}; size_t workGroupSize[3] = {2, 5, 10}; cl_uint dimensions = 1; - Vec3 localWorkgroupSize(workGroupSize); + auto blockedCommandsData = createBlockedCommandsData(*pCmdQ); kernelInfo.kernelDescriptor.kernelAttributes.simdSize = 1u; @@ -1458,7 +1456,7 @@ HWTEST_F(DispatchWalkerTest, WhenKernelRequiresImplicitArgsThenIohRequiresMoreSp CsrDependencies(), walkerArgsWithoutImplicitArgs); - auto iohSizeWithoutImplicitArgs = HardwareCommandsHelper::getSizeRequiredIOH(kernelWithoutImplicitArgs, Math::computeTotalElementsCount(localWorkgroupSize)); + auto iohSizeWithoutImplicitArgs = HardwareCommandsHelper::getSizeRequiredIOH(kernelWithoutImplicitArgs, workGroupSize); DispatchInfo dispatchInfoWithImplicitArgs(pClDevice, const_cast(&kernelWithImplicitArgs), dimensions, workItems, workGroupSize, globalOffsets); dispatchInfoWithImplicitArgs.setNumberOfWorkgroups({1, 1, 1}); @@ -1473,7 +1471,7 @@ HWTEST_F(DispatchWalkerTest, WhenKernelRequiresImplicitArgsThenIohRequiresMoreSp CsrDependencies(), walkerArgsWithImplicitArgs); - auto iohSizeWithImplicitArgs = HardwareCommandsHelper::getSizeRequiredIOH(kernelWithImplicitArgs, Math::computeTotalElementsCount(localWorkgroupSize)); + auto iohSizeWithImplicitArgs = HardwareCommandsHelper::getSizeRequiredIOH(kernelWithImplicitArgs, workGroupSize); EXPECT_LE(iohSizeWithoutImplicitArgs, iohSizeWithImplicitArgs); @@ -1481,9 +1479,10 @@ HWTEST_F(DispatchWalkerTest, WhenKernelRequiresImplicitArgsThenIohRequiresMoreSp auto numChannels = kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels; auto simdSize = kernelInfo.getMaxSimdSize(); uint32_t grfSize = sizeof(typename FamilyType::GRF); + const auto &gfxCoreHelper = getHelper(); auto size = kernelWithImplicitArgs.getCrossThreadDataSize() + - HardwareCommandsHelper::getPerThreadDataSizeTotal(simdSize, grfSize, numChannels, Math::computeTotalElementsCount(localWorkgroupSize)) + - ImplicitArgsHelper::getSizeForImplicitArgsPatching(kernelWithImplicitArgs.getImplicitArgs(), kernelWithImplicitArgs.getDescriptor()); + HardwareCommandsHelper::getPerThreadDataSizeTotal(simdSize, grfSize, numChannels, Math::computeTotalElementsCount(workGroupSize), false, gfxCoreHelper) + + ImplicitArgsHelper::getSizeForImplicitArgsPatching(kernelWithImplicitArgs.getImplicitArgs(), kernelWithImplicitArgs.getDescriptor(), false, gfxCoreHelper); size = alignUp(size, MemoryConstants::cacheLineSize); EXPECT_EQ(size, iohSizeWithImplicitArgs); diff --git a/opencl/test/unit_test/command_queue/get_size_required_buffer_tests.cpp b/opencl/test/unit_test/command_queue/get_size_required_buffer_tests.cpp index 4b76794821..5d8ff4ac24 100644 --- a/opencl/test/unit_test/command_queue/get_size_required_buffer_tests.cpp +++ b/opencl/test/unit_test/command_queue/get_size_required_buffer_tests.cpp @@ -499,7 +499,8 @@ HWTEST_F(GetSizeRequiredBufferTest, GivenHelloWorldKernelWhenEnqueingKernelThenH auto expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *pCmdQ, KernelFixture::pKernel, {}); auto expectedSizeDSH = HardwareCommandsHelper::getSizeRequiredDSH(*KernelFixture::pKernel); - auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(*KernelFixture::pKernel, workSize[0]); + size_t localWorkSizes[] = {64, 1, 1}; + auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(*KernelFixture::pKernel, localWorkSizes); auto expectedSizeSSH = HardwareCommandsHelper::getSizeRequiredSSH(*KernelFixture::pKernel); // Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended. @@ -538,7 +539,8 @@ HWTEST_F(GetSizeRequiredBufferTest, GivenKernelWithSimpleArgWhenEnqueingKernelTh auto expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *pCmdQ, KernelFixture::pKernel, {}); auto expectedSizeDSH = HardwareCommandsHelper::getSizeRequiredDSH(*KernelFixture::pKernel); - auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(*KernelFixture::pKernel, workSize[0]); + size_t localWorkSizes[] = {64, 1, 1}; + auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(*KernelFixture::pKernel, localWorkSizes); auto expectedSizeSSH = HardwareCommandsHelper::getSizeRequiredSSH(*KernelFixture::pKernel); EXPECT_EQ(0u, expectedSizeIOH % GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE); diff --git a/opencl/test/unit_test/command_queue/get_size_required_image_tests.cpp b/opencl/test/unit_test/command_queue/get_size_required_image_tests.cpp index 2594fd93a9..76850b4135 100644 --- a/opencl/test/unit_test/command_queue/get_size_required_image_tests.cpp +++ b/opencl/test/unit_test/command_queue/get_size_required_image_tests.cpp @@ -96,7 +96,8 @@ HWTEST_F(GetSizeRequiredImageTest, WhenCopyingImageThenHeapsAndCommandBufferCons auto expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_COPY_IMAGE, false, false, *pCmdQ, kernel, {}); auto expectedSizeDSH = HardwareCommandsHelper::getSizeRequiredDSH(*kernel); - auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(*kernel); + size_t localWorkSizes[] = {256, 1, 1}; + auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(*kernel, localWorkSizes); auto expectedSizeSSH = HardwareCommandsHelper::getSizeRequiredSSH(*kernel); // Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended. @@ -143,7 +144,8 @@ HWTEST_F(GetSizeRequiredImageTest, WhenCopyingReadWriteImageThenHeapsAndCommandB auto expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_COPY_IMAGE, false, false, *pCmdQ, kernel.get(), {}); auto expectedSizeDSH = HardwareCommandsHelper::getSizeRequiredDSH(*kernel.get()); - auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(*kernel.get()); + size_t localWorkSizes[] = {256, 1, 1}; + auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(*kernel.get(), localWorkSizes); auto expectedSizeSSH = HardwareCommandsHelper::getSizeRequiredSSH(*kernel.get()); // Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended. @@ -200,7 +202,8 @@ HWTEST_F(GetSizeRequiredImageTest, WhenReadingImageNonBlockingThenHeapsAndComman auto expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_READ_IMAGE, false, false, *pCmdQ, kernel, {}); auto expectedSizeDSH = HardwareCommandsHelper::getSizeRequiredDSH(*kernel); - auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(*kernel); + size_t localWorkSizes[] = {256, 1, 1}; + auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(*kernel, localWorkSizes); auto expectedSizeSSH = HardwareCommandsHelper::getSizeRequiredSSH(*kernel); // Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended. @@ -255,7 +258,8 @@ HWTEST_F(GetSizeRequiredImageTest, WhenReadingImageBlockingThenHeapsAndCommandBu auto expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_READ_IMAGE, false, false, *pCmdQ, kernel, {}); auto expectedSizeDSH = HardwareCommandsHelper::getSizeRequiredDSH(*kernel); - auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(*kernel); + size_t localWorkSizes[] = {256, 1, 1}; + auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(*kernel, localWorkSizes); auto expectedSizeSSH = HardwareCommandsHelper::getSizeRequiredSSH(*kernel); // Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended. @@ -310,7 +314,8 @@ HWTEST_F(GetSizeRequiredImageTest, WhenWritingImageNonBlockingThenHeapsAndComman auto expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_WRITE_IMAGE, false, false, *pCmdQ, kernel, {}); auto expectedSizeDSH = HardwareCommandsHelper::getSizeRequiredDSH(*kernel); - auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(*kernel); + size_t localWorkSizes[] = {256, 1, 1}; + auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(*kernel, localWorkSizes); auto expectedSizeSSH = HardwareCommandsHelper::getSizeRequiredSSH(*kernel); // Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended. @@ -365,7 +370,8 @@ HWTEST_F(GetSizeRequiredImageTest, WhenWritingImageBlockingThenHeapsAndCommandBu auto expectedSizeCS = EnqueueOperation::getSizeRequiredCS(CL_COMMAND_WRITE_IMAGE, false, false, *pCmdQ, kernel, {}); auto expectedSizeDSH = HardwareCommandsHelper::getSizeRequiredDSH(*kernel); - auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(*kernel); + size_t localWorkSizes[] = {256, 1, 1}; + auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(*kernel, localWorkSizes); auto expectedSizeSSH = HardwareCommandsHelper::getSizeRequiredSSH(*kernel); // Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended. diff --git a/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp b/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp index 76dbfdd63c..b5cd004dad 100644 --- a/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp +++ b/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp @@ -365,7 +365,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenAllocatingIndirectStateRes auto usedAfterIOH = ioh.getUsed(); auto usedAfterSSH = ssh.getUsed(); auto sizeRequiredDSH = HardwareCommandsHelper::getSizeRequiredDSH(*kernel); - auto sizeRequiredIOH = HardwareCommandsHelper::getSizeRequiredIOH(*kernel, localWorkSize); + auto sizeRequiredIOH = HardwareCommandsHelper::getSizeRequiredIOH(*kernel, localWorkSizes); auto sizeRequiredSSH = HardwareCommandsHelper::getSizeRequiredSSH(*kernel); EXPECT_GE(sizeRequiredDSH, usedAfterDSH - usedBeforeDSH); @@ -548,11 +548,11 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, whenSendingIndirectStateThenKe constexpr uint32_t grfSize = sizeof(typename FamilyType::GRF); size_t localWorkSize = localWorkSizeX * localWorkSizeY * localWorkSizeZ; auto numChannels = modifiedKernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels; - size_t expectedIohSize = PerThreadDataHelper::getPerThreadDataSizeTotal(modifiedKernelInfo.getMaxSimdSize(), grfSize, numChannels, localWorkSize); + const auto &gfxCoreHelper = pDevice->getGfxCoreHelper(); + size_t expectedIohSize = PerThreadDataHelper::getPerThreadDataSizeTotal(modifiedKernelInfo.getMaxSimdSize(), grfSize, numChannels, localWorkSize, !kernelUsesLocalIds, gfxCoreHelper); ASSERT_LE(expectedIohSize, ioh.getUsed()); auto expectedLocalIds = alignedMalloc(expectedIohSize, 64); - const auto &gfxCoreHelper = pDevice->getGfxCoreHelper(); generateLocalIDs(expectedLocalIds, modifiedKernelInfo.getMaxSimdSize(), std::array{{localWorkSizeX, localWorkSizeY, localWorkSizeZ}}, std::array{{modifiedKernelInfo.kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[0], @@ -1315,8 +1315,8 @@ struct HardwareCommandsImplicitArgsTests : Test { kernel.setGlobalWorkSizeValues(static_cast(expectedImplicitArgs.globalSizeX), static_cast(expectedImplicitArgs.globalSizeY), static_cast(expectedImplicitArgs.globalSizeZ)); kernel.setGlobalWorkOffsetValues(static_cast(expectedImplicitArgs.globalOffsetX), static_cast(expectedImplicitArgs.globalOffsetY), static_cast(expectedImplicitArgs.globalOffsetZ)); kernel.setNumWorkGroupsValues(expectedImplicitArgs.groupCountX, expectedImplicitArgs.groupCountY, expectedImplicitArgs.groupCountZ); - - implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernel.getDescriptor()); + const auto &gfxCoreHelper = pDevice->getGfxCoreHelper(); + implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernel.getDescriptor(), false, gfxCoreHelper); auto sizeCrossThreadData = kernel.getCrossThreadDataSize(); HardwareCommandsHelper::sendCrossThreadData( @@ -1382,7 +1382,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithI generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, gfxCoreHelper); auto localIdsProgrammingSize = implicitArgsProgrammingSize - sizeof(ImplicitArgs); - size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize); + size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, false, gfxCoreHelper); EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds)); alignedFree(expectedLocalIds); @@ -1416,7 +1416,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithI generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, gfxCoreHelper); auto localIdsProgrammingSize = implicitArgsProgrammingSize - sizeof(ImplicitArgs); - size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize); + size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, false, gfxCoreHelper); EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds)); alignedFree(expectedLocalIds); diff --git a/shared/source/command_container/command_encoder_bdw_and_later.inl b/shared/source/command_container/command_encoder_bdw_and_later.inl index 19c7e9eef6..edb50ec51e 100644 --- a/shared/source/command_container/command_encoder_bdw_and_later.inl +++ b/shared/source/command_container/command_encoder_bdw_and_later.inl @@ -161,7 +161,8 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis sizePerThreadData, hwInfo); uint32_t sizeThreadData = sizePerThreadDataForWholeGroup + sizeCrossThreadData; - uint32_t sizeForImplicitArgsPatching = NEO::ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor); + bool isHwLocalIdGeneration = false; + uint32_t sizeForImplicitArgsPatching = NEO::ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, isHwLocalIdGeneration, gfxCoreHelper); uint32_t iohRequiredSize = sizeThreadData + sizeForImplicitArgsPatching; uint64_t offsetThreadData = 0u; { diff --git a/shared/source/command_container/command_encoder_xehp_and_later.inl b/shared/source/command_container/command_encoder_xehp_and_later.inl index a14be7c67f..18459e887f 100644 --- a/shared/source/command_container/command_encoder_xehp_and_later.inl +++ b/shared/source/command_container/command_encoder_xehp_and_later.inl @@ -200,7 +200,7 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis } uint32_t sizeThreadData = sizePerThreadDataForWholeGroup + sizeCrossThreadData; - uint32_t sizeForImplicitArgsPatching = NEO::ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor); + uint32_t sizeForImplicitArgsPatching = NEO::ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, !localIdsGenerationByRuntime, gfxCoreHelper); uint32_t iohRequiredSize = sizeThreadData + sizeForImplicitArgsPatching; { auto heap = container.getIndirectHeap(HeapType::INDIRECT_OBJECT); diff --git a/shared/source/helpers/per_thread_data.h b/shared/source/helpers/per_thread_data.h index 09de735093..e8f6fe7766 100644 --- a/shared/source/helpers/per_thread_data.h +++ b/shared/source/helpers/per_thread_data.h @@ -6,6 +6,7 @@ */ #pragma once +#include "shared/source/helpers/gfx_core_helper.h" #include "shared/source/helpers/local_id_gen.h" #include @@ -19,8 +20,10 @@ struct PerThreadDataHelper { uint32_t simd, uint32_t grfSize, uint32_t numChannels, - size_t localWorkSize) { - return getThreadsPerWG(simd, static_cast(localWorkSize)) * getPerThreadSizeLocalIDs(simd, grfSize, numChannels); + size_t localWorkSize, + bool isHwLocalIdGeneration, + const GfxCoreHelper &gfxCoreHelper) { + return gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast(localWorkSize), grfSize, isHwLocalIdGeneration) * getPerThreadSizeLocalIDs(simd, grfSize, numChannels); } }; // namespace PerThreadDataHelper } // namespace NEO diff --git a/shared/source/kernel/implicit_args.h b/shared/source/kernel/implicit_args.h index 9046809586..5e72622d13 100644 --- a/shared/source/kernel/implicit_args.h +++ b/shared/source/kernel/implicit_args.h @@ -51,7 +51,7 @@ inline constexpr const char *implicitArgsRelocationSymbolName = "__INTEL_PATCH_C namespace ImplicitArgsHelper { std::array getDimensionOrderForLocalIds(const uint8_t *workgroupDimensionsOrder, std::optional> hwGenerationOfLocalIdsParams); uint32_t getGrfSize(uint32_t simd); -uint32_t getSizeForImplicitArgsPatching(const ImplicitArgs *pImplicitArgs, const KernelDescriptor &kernelDescriptor); +uint32_t getSizeForImplicitArgsPatching(const ImplicitArgs *pImplicitArgs, const KernelDescriptor &kernelDescriptor, bool localIdsGeneratedByRuntime, const GfxCoreHelper &gfxCoreHelper); void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, const KernelDescriptor &kernelDescriptor, std::optional> hwGenerationOfLocalIdsParams, const GfxCoreHelper &gfxCoreHelper); } // namespace ImplicitArgsHelper } // namespace NEO diff --git a/shared/source/kernel/implicit_args_helper.cpp b/shared/source/kernel/implicit_args_helper.cpp index 7b0b7ecdc8..39070ad831 100644 --- a/shared/source/kernel/implicit_args_helper.cpp +++ b/shared/source/kernel/implicit_args_helper.cpp @@ -41,7 +41,7 @@ uint32_t getGrfSize(uint32_t simd) { return 32u; } -uint32_t getSizeForImplicitArgsPatching(const ImplicitArgs *pImplicitArgs, const KernelDescriptor &kernelDescriptor) { +uint32_t getSizeForImplicitArgsPatching(const ImplicitArgs *pImplicitArgs, const KernelDescriptor &kernelDescriptor, bool isHwLocalIdGeneration, const GfxCoreHelper &gfxCoreHelper) { if (!pImplicitArgs) { return 0; } @@ -56,15 +56,15 @@ uint32_t getSizeForImplicitArgsPatching(const ImplicitArgs *pImplicitArgs, const auto itemsInGroup = Math::computeTotalElementsCount(localWorkSize); uint32_t localIdsSizeNeeded = alignUp(static_cast(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal( - simdSize, grfSize, 3u, itemsInGroup)), + simdSize, grfSize, 3u, itemsInGroup, isHwLocalIdGeneration, gfxCoreHelper)), MemoryConstants::cacheLineSize); return implicitArgsSize + localIdsSizeNeeded; } } void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, const KernelDescriptor &kernelDescriptor, std::optional> hwGenerationOfLocalIdsParams, const GfxCoreHelper &gfxCoreHelper) { - - auto totalSizeToProgram = getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor); + auto localIdsGeneratedByHw = hwGenerationOfLocalIdsParams.has_value() ? hwGenerationOfLocalIdsParams.value().first : false; + auto totalSizeToProgram = getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, localIdsGeneratedByHw, gfxCoreHelper); auto retVal = ptrOffset(ptrToPatch, totalSizeToProgram); auto patchImplicitArgsBufferInCrossThread = NEO::isValidOffset<>(kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer); diff --git a/shared/test/unit_test/kernel/implicit_args_helper_tests.cpp b/shared/test/unit_test/kernel/implicit_args_helper_tests.cpp index e5401b893d..12d8886716 100644 --- a/shared/test/unit_test/kernel/implicit_args_helper_tests.cpp +++ b/shared/test/unit_test/kernel/implicit_args_helper_tests.cpp @@ -57,8 +57,8 @@ TEST(ImplicitArgsHelperTest, givenSimdGreaterThanOneWhenGettingGrfSizeThenGrfSiz TEST(ImplicitArgsHelperTest, givenNoImplicitArgsWhenGettingSizeForImplicitArgsProgrammingThenZeroIsReturned) { KernelDescriptor kernelDescriptor{}; - - EXPECT_EQ(0u, ImplicitArgsHelper::getSizeForImplicitArgsPatching(nullptr, kernelDescriptor)); + auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily); + EXPECT_EQ(0u, ImplicitArgsHelper::getSizeForImplicitArgsPatching(nullptr, kernelDescriptor, false, *gfxCoreHelper.get())); } TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInPayloadMappingWhenGettingSizeForImplicitArgsProgrammingThenCorrectSizeIsReturned) { @@ -75,8 +75,9 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInP auto totalWorkgroupSize = implicitArgs.localSizeX * implicitArgs.localSizeY * implicitArgs.localSizeZ; - auto localIdsSize = alignUp(PerThreadDataHelper::getPerThreadDataSizeTotal(implicitArgs.simdWidth, 32u /* grfSize */, 3u /* num channels */, totalWorkgroupSize), MemoryConstants::cacheLineSize); - EXPECT_EQ(localIdsSize + implicitArgs.structSize, ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor)); + auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily); + auto localIdsSize = alignUp(PerThreadDataHelper::getPerThreadDataSizeTotal(implicitArgs.simdWidth, 32u /* grfSize */, 3u /* num channels */, totalWorkgroupSize, false, *gfxCoreHelper.get()), MemoryConstants::cacheLineSize); + EXPECT_EQ(localIdsSize + implicitArgs.structSize, ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, *gfxCoreHelper.get())); } TEST(ImplicitArgsHelperTest, givenImplicitArgsWithImplicitArgsBufferOffsetInPayloadMappingWhenGettingSizeForImplicitArgsProgrammingThenCorrectSizeIsReturned) { @@ -90,8 +91,8 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithImplicitArgsBufferOffsetInPayl implicitArgs.localSizeX = 2; implicitArgs.localSizeY = 3; implicitArgs.localSizeZ = 4; - - EXPECT_EQ(alignUp(implicitArgs.structSize, MemoryConstants::cacheLineSize), ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor)); + auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily); + EXPECT_EQ(alignUp(implicitArgs.structSize, MemoryConstants::cacheLineSize), ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, *gfxCoreHelper.get())); } TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInPayloadMappingWhenPatchingImplicitArgsThenOnlyProperRegionIsPatched) { @@ -108,8 +109,8 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInP implicitArgs.localSizeX = 2; implicitArgs.localSizeY = 3; implicitArgs.localSizeZ = 4; - - auto totalSizeForPatching = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor); + auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily); + auto totalSizeForPatching = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, *gfxCoreHelper.get()); auto totalWorkgroupSize = implicitArgs.localSizeX * implicitArgs.localSizeY * implicitArgs.localSizeZ; auto localIdsPatchingSize = totalWorkgroupSize * 3 * sizeof(uint16_t); @@ -119,7 +120,7 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInP uint8_t pattern = 0xcd; memset(memoryToPatch.get(), pattern, totalSizeForPatching); - auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily); + auto retVal = ImplicitArgsHelper::patchImplicitArgs(memoryToPatch.get(), implicitArgs, kernelDescriptor, {}, *gfxCoreHelper.get()); EXPECT_EQ(retVal, ptrOffset(memoryToPatch.get(), totalSizeForPatching)); @@ -150,8 +151,8 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithImplicitArgsBufferOffsetInPayl implicitArgs.localSizeX = 2; implicitArgs.localSizeY = 3; implicitArgs.localSizeZ = 4; - - auto totalSizeForPatching = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor); + auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily); + auto totalSizeForPatching = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, *gfxCoreHelper.get()); EXPECT_EQ(0x80u, totalSizeForPatching); @@ -160,7 +161,7 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithImplicitArgsBufferOffsetInPayl uint8_t pattern = 0xcd; memset(memoryToPatch.get(), pattern, totalSizeForPatching); - auto gfxCoreHelper = GfxCoreHelper::create(defaultHwInfo->platform.eRenderCoreFamily); + auto retVal = ImplicitArgsHelper::patchImplicitArgs(memoryToPatch.get(), implicitArgs, kernelDescriptor, {}, *gfxCoreHelper.get()); EXPECT_EQ(retVal, ptrOffset(memoryToPatch.get(), totalSizeForPatching));