From 4f71aaf59510584b8cedcc124ac9e4dfc68d4fcf Mon Sep 17 00:00:00 2001 From: Mateusz Jablonski Date: Thu, 17 Feb 2022 12:41:06 +0000 Subject: [PATCH] Handle SIMD-1 scenario when programming local ids for implicit args according to implicit args design for SIMD-1 local ids are one-by-one Resolves: NEO-6692 Signed-off-by: Mateusz Jablonski --- level_zero/core/source/kernel/kernel_imp.cpp | 10 +-- .../test_cmdlist_append_launch_kernel_2.cpp | 72 +++++++++++++++++++ .../helpers/hardware_commands_helper_base.inl | 8 ++- .../command_queue/dispatch_walker_tests.cpp | 7 +- .../hardware_commands_helper_tests.cpp | 72 +++++++++++++++++++ shared/source/kernel/implicit_args.h | 3 +- shared/source/kernel/implicit_args_helper.cpp | 12 +++- .../kernel/implicit_args_helper_tests.cpp | 12 ++++ 8 files changed, 184 insertions(+), 12 deletions(-) diff --git a/level_zero/core/source/kernel/kernel_imp.cpp b/level_zero/core/source/kernel/kernel_imp.cpp index 5ac19adf80..fc1e156a86 100644 --- a/level_zero/core/source/kernel/kernel_imp.cpp +++ b/level_zero/core/source/kernel/kernel_imp.cpp @@ -1020,12 +1020,13 @@ uint32_t KernelImp::getSizeForImplicitArgsPatching() const { } auto implicitArgsSize = static_cast(sizeof(NEO::ImplicitArgs)); const NEO::KernelDescriptor &kernelDescriptor = kernelImmData->getDescriptor(); - auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize; + auto simdSize = kernelDescriptor.kernelAttributes.simdSize; + auto grfSize = NEO::ImplicitArgsHelper::getGrfSize(simdSize, this->module->getDevice()->getHwInfo().capabilityTable.grfSize); Vec3 groupSize{this->groupSize[0], this->groupSize[1], this->groupSize[2]}; auto itemsInGroup = Math::computeTotalElementsCount(groupSize); uint32_t localIdsSizeNeeded = alignUp(static_cast(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal( - kernelDescriptor.kernelAttributes.simdSize, grfSize, 3u, itemsInGroup)), + simdSize, grfSize, 3u, itemsInGroup)), MemoryConstants::cacheLineSize); return implicitArgsSize + localIdsSizeNeeded; } @@ -1035,12 +1036,13 @@ void KernelImp::patchImplicitArgs(void *&pOut) const { return; } const auto &kernelAttributes = kernelImmData->getDescriptor().kernelAttributes; - auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize; + auto simdSize = kernelAttributes.simdSize; + auto grfSize = NEO::ImplicitArgsHelper::getGrfSize(simdSize, this->module->getDevice()->getHwInfo().capabilityTable.grfSize); auto dimensionOrder = NEO::ImplicitArgsHelper::getDimensionOrderForLocalIds(kernelAttributes.workgroupDimensionsOrder, kernelRequiresGenerationOfLocalIdsByRuntime, requiredWorkgroupOrder); NEO::generateLocalIDs( pOut, - static_cast(kernelAttributes.simdSize), + simdSize, std::array{{static_cast(groupSize[0]), static_cast(groupSize[1]), static_cast(groupSize[2])}}, diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp index cf69f01c36..3617f3104b 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp @@ -1077,6 +1077,78 @@ HWTEST_F(CmdlistAppendLaunchKernelTests, givenKernelWithImplicitArgsAndHwGenerat alignedFree(expectedLocalIds); } + +HWTEST_F(CmdlistAppendLaunchKernelTests, givenKernelWithImplicitArgsWhenAppendLaunchKernelWithSimd1ThenLocalIdsAreGeneratedCorrectly) { + std::unique_ptr mockKernelImmData = std::make_unique(0u); + auto kernelDescriptor = mockKernelImmData->kernelDescriptor; + kernelDescriptor->kernelAttributes.flags.requiresImplicitArgs = true; + kernelDescriptor->kernelAttributes.simdSize = 1u; + kernelDescriptor->kernelAttributes.workgroupDimensionsOrder[0] = 2; + kernelDescriptor->kernelAttributes.workgroupDimensionsOrder[1] = 1; + kernelDescriptor->kernelAttributes.workgroupDimensionsOrder[2] = 0; + createModuleFromBinary(0u, false, mockKernelImmData.get()); + + auto kernel = std::make_unique(module.get()); + + ze_kernel_desc_t kernelDesc{ZE_STRUCTURE_TYPE_KERNEL_DESC}; + kernel->initialize(&kernelDesc); + + EXPECT_TRUE(kernel->getKernelDescriptor().kernelAttributes.flags.requiresImplicitArgs); + ASSERT_NE(nullptr, kernel->getImplicitArgs()); + + kernel->setGroupSize(2, 2, 1); + + ze_result_t result{}; + std::unique_ptr commandList(CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, result)); + + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + auto indirectHeap = commandList->commandContainer.getIndirectHeap(NEO::HeapType::INDIRECT_OBJECT); + memset(indirectHeap->getSpace(0), 0, kernel->getSizeForImplicitArgsPatching()); + + ze_group_count_t groupCount{1, 1, 1}; + result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + auto sizeCrossThreadData = kernel->getCrossThreadDataSize(); + auto sizePerThreadDataForWholeGroup = kernel->getPerThreadDataSizeForWholeThreadGroup(); + EXPECT_EQ(indirectHeap->getUsed(), sizeCrossThreadData + sizePerThreadDataForWholeGroup + kernel->getSizeForImplicitArgsPatching()); + + ImplicitArgs expectedImplicitArgs{sizeof(ImplicitArgs)}; + expectedImplicitArgs.numWorkDim = 2; + expectedImplicitArgs.simdWidth = 1; + expectedImplicitArgs.localSizeX = 2; + expectedImplicitArgs.localSizeY = 2; + expectedImplicitArgs.localSizeZ = 1; + expectedImplicitArgs.globalSizeX = 2; + expectedImplicitArgs.globalSizeY = 2; + expectedImplicitArgs.globalSizeZ = 1; + expectedImplicitArgs.groupCountX = 1; + expectedImplicitArgs.groupCountY = 1; + expectedImplicitArgs.groupCountZ = 1; + expectedImplicitArgs.localIdTablePtr = indirectHeap->getGraphicsAllocation()->getGpuAddress(); + expectedImplicitArgs.printfBufferPtr = kernel->getPrintfBufferAllocation()->getGpuAddress(); + + auto sizeForImplicitArgPatching = kernel->getSizeForImplicitArgsPatching(); + + EXPECT_LT(0u, sizeForImplicitArgPatching); + + auto localIdsProgrammingSize = sizeForImplicitArgPatching - sizeof(ImplicitArgs); + + uint16_t expectedLocalIds[][3] = {{0, 0, 0}, + {0, 1, 0}, + {0, 0, 1}, + {0, 1, 1}}; + + uint8_t zeros[MemoryConstants::cacheLineSize]{}; + EXPECT_EQ(localIdsProgrammingSize, alignUp(sizeof(expectedLocalIds), MemoryConstants::cacheLineSize)); + + EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeap->getCpuBase(), sizeof(expectedLocalIds))); + EXPECT_EQ(0, memcmp(zeros, ptrOffset(indirectHeap->getCpuBase(), sizeof(expectedLocalIds)), localIdsProgrammingSize - sizeof(expectedLocalIds))); + auto pImplicitArgs = reinterpret_cast(ptrOffset(indirectHeap->getCpuBase(), localIdsProgrammingSize)); + EXPECT_EQ(0, memcmp(&expectedImplicitArgs, pImplicitArgs, sizeof(ImplicitArgs))); +} + HWTEST_F(CmdlistAppendLaunchKernelTests, givenKernelWithoutImplicitArgsWhenAppendLaunchKernelThenImplicitArgsAreNotSentToIndirectHeap) { std::unique_ptr mockKernelImmData = std::make_unique(0u); auto kernelDescriptor = mockKernelImmData->kernelDescriptor; diff --git a/opencl/source/helpers/hardware_commands_helper_base.inl b/opencl/source/helpers/hardware_commands_helper_base.inl index 9688ef6271..63336c4fb8 100644 --- a/opencl/source/helpers/hardware_commands_helper_base.inl +++ b/opencl/source/helpers/hardware_commands_helper_base.inl @@ -51,11 +51,13 @@ size_t HardwareCommandsHelper::getSizeRequiredIOH(const Kernel &kerne auto numChannels = kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels; uint32_t grfSize = sizeof(typename GfxFamily::GRF); + auto simdSize = kernelInfo.getMaxSimdSize(); auto size = kernel.getCrossThreadDataSize() + - getPerThreadDataSizeTotal(kernelInfo.getMaxSimdSize(), grfSize, numChannels, localWorkSize); + getPerThreadDataSizeTotal(simdSize, grfSize, numChannels, localWorkSize); if (kernel.getImplicitArgs()) { - size += sizeof(ImplicitArgs) + alignUp(getPerThreadDataSizeTotal(kernelInfo.getMaxSimdSize(), grfSize, 3u, localWorkSize), MemoryConstants::cacheLineSize); + auto grfSizeForImplicitArgs = ImplicitArgsHelper::getGrfSize(simdSize, grfSize); + size += sizeof(ImplicitArgs) + alignUp(getPerThreadDataSizeTotal(simdSize, grfSizeForImplicitArgs, 3u, localWorkSize), MemoryConstants::cacheLineSize); } return alignUp(size, WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE); } @@ -218,7 +220,7 @@ size_t HardwareCommandsHelper::sendIndirectState( auto pImplicitArgs = kernel.getImplicitArgs(); if (pImplicitArgs) { - constexpr uint32_t grfSize = sizeof(typename GfxFamily::GRF); + auto grfSize = ImplicitArgsHelper::getGrfSize(simd, sizeof(typename GfxFamily::GRF)); const auto &kernelAttributes = kernelInfo.kernelDescriptor.kernelAttributes; uint32_t requiredWalkOrder = 0u; auto generationOfLocalIdsByRuntime = EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired( diff --git a/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp b/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp index 990f571c94..e72dbea1e7 100644 --- a/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp +++ b/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp @@ -1561,6 +1561,7 @@ HWTEST_F(DispatchWalkerTest, WhenKernelRequiresImplicitArgsThenIohRequiresMoreSp Vec3 localWorkgroupSize(workGroupSize); auto blockedCommandsData = createBlockedCommandsData(*pCmdQ); + kernelInfo.kernelDescriptor.kernelAttributes.simdSize = 1u; kernelInfo.kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs = false; MockKernel kernelWithoutImplicitArgs(program.get(), kernelInfo, *pClDevice); ASSERT_EQ(CL_SUCCESS, kernelWithoutImplicitArgs.initialize()); @@ -1609,11 +1610,13 @@ HWTEST_F(DispatchWalkerTest, WhenKernelRequiresImplicitArgsThenIohRequiresMoreSp { auto numChannels = kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels; + auto simdSize = kernelInfo.getMaxSimdSize(); uint32_t grfSize = sizeof(typename FamilyType::GRF); + auto grfSizeForImplicitArgs = ImplicitArgsHelper::getGrfSize(simdSize, grfSize); auto size = kernelWithImplicitArgs.getCrossThreadDataSize() + - HardwareCommandsHelper::getPerThreadDataSizeTotal(kernelInfo.getMaxSimdSize(), grfSize, numChannels, Math::computeTotalElementsCount(localWorkgroupSize)) + + HardwareCommandsHelper::getPerThreadDataSizeTotal(simdSize, grfSize, numChannels, Math::computeTotalElementsCount(localWorkgroupSize)) + sizeof(ImplicitArgs) + - alignUp(HardwareCommandsHelper::getPerThreadDataSizeTotal(kernelInfo.getMaxSimdSize(), grfSize, 3u, Math::computeTotalElementsCount(localWorkgroupSize)), MemoryConstants::cacheLineSize); + alignUp(HardwareCommandsHelper::getPerThreadDataSizeTotal(simdSize, grfSizeForImplicitArgs, 3u, Math::computeTotalElementsCount(localWorkgroupSize)), MemoryConstants::cacheLineSize); size = alignUp(size, MemoryConstants::cacheLineSize); EXPECT_EQ(size, iohSizeWithImplicitArgs); diff --git a/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp b/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp index c54290e0ee..b3f0a1ab9e 100644 --- a/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp +++ b/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp @@ -1400,6 +1400,78 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithI EXPECT_EQ(ioh.getGraphicsAllocation()->getGpuAddress(), pImplicitArgs->localIdTablePtr); } +HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithImplicitArgsWhenSendingIndirectStateWithSimd1ThenLocalIdsAreGeneratedCorrectly) { + DebugManagerStateRestore restorer; + DebugManager.flags.EnableHwGenerationLocalIds.set(0); + auto pKernelInfo = std::make_unique(); + uint32_t simd = 1; + pKernelInfo->kernelDescriptor.kernelAttributes.simdSize = simd; + pKernelInfo->kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs = true; + pKernelInfo->kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[0] = 2; + pKernelInfo->kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[1] = 1; + pKernelInfo->kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[2] = 0; + + MockContext context(pClDevice); + CommandQueueHw cmdQ(&context, pClDevice, 0, false); + MockProgram program(&context, false, toClDeviceVector(*pClDevice)); + + MockKernel kernel(&program, *pKernelInfo, *pClDevice); + ASSERT_EQ(CL_SUCCESS, kernel.initialize()); + + const size_t localWorkSizeX = 2; + const size_t localWorkSizeY = 2; + const size_t localWorkSizeZ = 1; + const size_t localWorkSizes[3]{localWorkSizeX, localWorkSizeY, localWorkSizeZ}; + + auto &commandStream = cmdQ.getCS(1024); + auto pWalkerCmd = reinterpret_cast(commandStream.getSpace(0)); + + auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::Type::DYNAMIC_STATE, 8192); + auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::Type::INDIRECT_OBJECT, 8192); + auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::Type::SURFACE_STATE, 8192); + + dsh.align(EncodeStates::alignInterfaceDescriptorData); + auto interfaceDescriptor = reinterpret_cast(dsh.getSpace(0)); + uint32_t interfaceDescriptorIndex = 0u; + + HardwareCommandsHelper::sendIndirectState( + commandStream, + dsh, + ioh, + ssh, + kernel, + 0u, + simd, + localWorkSizes, + 0u, + interfaceDescriptorIndex, + pDevice->getPreemptionMode(), + pWalkerCmd, + interfaceDescriptor, + false, + *pDevice); + + uint32_t grfSize = ImplicitArgsHelper::getGrfSize(simd, sizeof(typename FamilyType::GRF)); + + EXPECT_EQ(3 * sizeof(uint16_t), grfSize); + size_t localWorkSize = localWorkSizeX * localWorkSizeY * localWorkSizeZ; + size_t expectedLocalIdsSize = PerThreadDataHelper::getPerThreadDataSizeTotal(simd, grfSize, 3u, localWorkSize); + ASSERT_LE(expectedLocalIdsSize, ioh.getUsed()); + + uint16_t expectedLocalIds[][3] = {{0, 0, 0}, + {0, 1, 0}, + {0, 0, 1}, + {0, 1, 1}}; + EXPECT_EQ(expectedLocalIdsSize, sizeof(expectedLocalIds)); + + EXPECT_EQ(0, memcmp(expectedLocalIds, ioh.getCpuBase(), sizeof(expectedLocalIds))); + + auto localIdsProgrammingSize = alignUp(sizeof(expectedLocalIds), MemoryConstants::cacheLineSize); + ASSERT_LE(localIdsProgrammingSize + sizeof(ImplicitArgs), ioh.getUsed()); + auto pImplicitArgs = reinterpret_cast(ptrOffset(ioh.getCpuBase(), localIdsProgrammingSize)); + EXPECT_EQ(ioh.getGraphicsAllocation()->getGpuAddress(), pImplicitArgs->localIdTablePtr); +} + using HardwareCommandsTestXeHpAndLater = HardwareCommandsTest; HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsTestXeHpAndLater, givenIndirectHeapNotAllocatedFromInternalPoolWhenSendCrossThreadDataIsCalledThenOffsetZeroIsReturned) { diff --git a/shared/source/kernel/implicit_args.h b/shared/source/kernel/implicit_args.h index b5775f5e80..499c86ea65 100644 --- a/shared/source/kernel/implicit_args.h +++ b/shared/source/kernel/implicit_args.h @@ -40,5 +40,6 @@ constexpr const char *implicitArgsRelocationSymbolName = "INTEL_PATCH_CROSS_THRE namespace ImplicitArgsHelper { std::array getDimensionOrderForLocalIds(const uint8_t *workgroupDimensionsOrder, bool generationOfLocalIdsByRuntime, uint32_t walkOrderForHwGenerationOfLocalIds); -} +uint32_t getGrfSize(uint32_t simd, uint32_t grfSize); +} // namespace ImplicitArgsHelper } // namespace NEO diff --git a/shared/source/kernel/implicit_args_helper.cpp b/shared/source/kernel/implicit_args_helper.cpp index 18ca65d9a6..a75b74a399 100644 --- a/shared/source/kernel/implicit_args_helper.cpp +++ b/shared/source/kernel/implicit_args_helper.cpp @@ -10,8 +10,8 @@ #include "shared/source/kernel/kernel_descriptor.h" namespace NEO { - -std::array ImplicitArgsHelper::getDimensionOrderForLocalIds(const uint8_t *workgroupDimensionsOrder, bool generationOfLocalIdsByRuntime, uint32_t walkOrderForHwGenerationOfLocalIds) { +namespace ImplicitArgsHelper { +std::array getDimensionOrderForLocalIds(const uint8_t *workgroupDimensionsOrder, bool generationOfLocalIdsByRuntime, uint32_t walkOrderForHwGenerationOfLocalIds) { if (generationOfLocalIdsByRuntime) { UNRECOVERABLE_IF(!workgroupDimensionsOrder); return {{ @@ -24,4 +24,12 @@ std::array ImplicitArgsHelper::getDimensionOrderForLocalIds(const ui UNRECOVERABLE_IF(walkOrderForHwGenerationOfLocalIds >= HwWalkOrderHelper::walkOrderPossibilties); return HwWalkOrderHelper::compatibleDimensionOrders[walkOrderForHwGenerationOfLocalIds]; } + +uint32_t getGrfSize(uint32_t simd, uint32_t grfSize) { + if (simd == 1u) { + return 3 * sizeof(uint16_t); + } + return grfSize; +} +} // namespace ImplicitArgsHelper } // namespace NEO diff --git a/shared/test/unit_test/kernel/implicit_args_helper_tests.cpp b/shared/test/unit_test/kernel/implicit_args_helper_tests.cpp index cf28af59d2..9bd8255427 100644 --- a/shared/test/unit_test/kernel/implicit_args_helper_tests.cpp +++ b/shared/test/unit_test/kernel/implicit_args_helper_tests.cpp @@ -32,3 +32,15 @@ TEST(ImplicitArgsHelperTest, whenLocalIdsAreGeneratedByHwThenProperDimensionOrde EXPECT_EQ(HwWalkOrderHelper::compatibleDimensionOrders[i], dimOrderForImplicitArgs); } } + +TEST(ImplicitArgsHelperTest, whenGettingGrfSizeForSimd1ThenSizeOfSingleLocalIdIsReturned) { + auto regularGrfsize = 32u; + EXPECT_EQ(3 * sizeof(uint16_t), ImplicitArgsHelper::getGrfSize(1u, regularGrfsize)); +} + +TEST(ImplicitArgsHelperTest, givenSimdGreaterThanOneWhenGettingGrfSizeThenInputGrfSizeIsReturned) { + auto regularGrfsize = 32u; + EXPECT_EQ(regularGrfsize, ImplicitArgsHelper::getGrfSize(8u, regularGrfsize)); + EXPECT_EQ(regularGrfsize, ImplicitArgsHelper::getGrfSize(16u, regularGrfsize)); + EXPECT_EQ(regularGrfsize, ImplicitArgsHelper::getGrfSize(32u, regularGrfsize)); +} \ No newline at end of file