From 5a59a6ad2e7f35bda455e1c785bd18015df419e4 Mon Sep 17 00:00:00 2001 From: Mateusz Jablonski Date: Mon, 13 Sep 2021 16:49:13 +0000 Subject: [PATCH] Add support for implicit args in L0 Related-To: NEO-5081 Signed-off-by: Mateusz Jablonski --- level_zero/core/source/kernel/kernel_hw.h | 1 + level_zero/core/source/kernel/kernel_imp.cpp | 95 ++++++++++++-- level_zero/core/source/kernel/kernel_imp.h | 5 + .../unit_tests/fixtures/device_fixture.cpp | 6 +- .../test/unit_tests/fixtures/device_fixture.h | 1 + .../test/unit_tests/fixtures/module_fixture.h | 14 ++- .../test_cmdlist_append_launch_kernel_2.cpp | 118 ++++++++++++++++++ .../unit_tests/sources/kernel/test_kernel.cpp | 103 +++++++++++++-- .../command_encoder_bdw_and_later.inl | 12 +- .../command_encoder_xehp_and_later.inl | 12 +- .../dispatch_kernel_encoder_interface.h | 5 + shared/test/common/mocks/mock_device.h | 9 +- .../mock_dispatch_kernel_encoder_interface.h | 4 + 13 files changed, 352 insertions(+), 33 deletions(-) diff --git a/level_zero/core/source/kernel/kernel_hw.h b/level_zero/core/source/kernel/kernel_hw.h index 4fa78059ba..14214079ca 100644 --- a/level_zero/core/source/kernel/kernel_hw.h +++ b/level_zero/core/source/kernel/kernel_hw.h @@ -13,6 +13,7 @@ #include "shared/source/helpers/bindless_heaps_helper.h" #include "shared/source/helpers/hw_helper.h" #include "shared/source/helpers/string.h" +#include "shared/source/kernel/implicit_args.h" #include "level_zero/core/source/kernel/kernel_imp.h" #include "level_zero/core/source/module/module.h" diff --git a/level_zero/core/source/kernel/kernel_imp.cpp b/level_zero/core/source/kernel/kernel_imp.cpp index c17005e65d..f70fa20206 100644 --- a/level_zero/core/source/kernel/kernel_imp.cpp +++ b/level_zero/core/source/kernel/kernel_imp.cpp @@ -292,6 +292,18 @@ void KernelImp::setGroupCount(uint32_t groupCountX, uint32_t groupCountY, uint32 auto destinationBuffer = ArrayRef(crossThreadData.get(), crossThreadDataSize); NEO::patchNonPointer(destinationBuffer, desc.payloadMappings.dispatchTraits.workDim, workDim); } + + if (pImplicitArgs) { + pImplicitArgs->numWorkDim = workDim; + + pImplicitArgs->globalSizeX = globalWorkSize[0]; + pImplicitArgs->globalSizeY = globalWorkSize[1]; + pImplicitArgs->globalSizeZ = globalWorkSize[2]; + + pImplicitArgs->groupCountX = groupCount[0]; + pImplicitArgs->groupCountY = groupCount[1]; + pImplicitArgs->groupCountZ = groupCount[2]; + } } ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY, @@ -756,6 +768,7 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) { auto neoDevice = module->getDevice()->getNEODevice(); auto &hwInfo = neoDevice->getHardwareInfo(); auto &hwHelper = NEO::HwHelper::get(hwInfo.platform.eRenderCoreFamily); + auto &kernelDescriptor = kernelImmData->getDescriptor(); this->schedulingHintExpFlag = hwHelper.getDefaultThreadArbitrationPolicy(); @@ -769,7 +782,7 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) { static_cast(this->kernelImmData->getKernelInfo()->heapInfo.KernelHeapSize)); } - for (const auto &argT : kernelImmData->getDescriptor().payloadMappings.explicitArgs) { + for (const auto &argT : kernelDescriptor.payloadMappings.explicitArgs) { switch (argT.type) { default: this->kernelArgHandlers.push_back(&KernelImp::setArgUnknown); @@ -802,13 +815,13 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) { this->surfaceStateHeapDataSize = kernelImmData->getSurfaceStateHeapSize(); } - if (kernelImmData->getDescriptor().kernelAttributes.crossThreadDataSize != 0) { - this->crossThreadData.reset(new uint8_t[kernelImmData->getDescriptor().kernelAttributes.crossThreadDataSize]); + if (kernelDescriptor.kernelAttributes.crossThreadDataSize != 0) { + this->crossThreadData.reset(new uint8_t[kernelDescriptor.kernelAttributes.crossThreadDataSize]); memcpy_s(this->crossThreadData.get(), - kernelImmData->getDescriptor().kernelAttributes.crossThreadDataSize, + kernelDescriptor.kernelAttributes.crossThreadDataSize, kernelImmData->getCrossThreadDataTemplate(), - kernelImmData->getDescriptor().kernelAttributes.crossThreadDataSize); - this->crossThreadDataSize = kernelImmData->getDescriptor().kernelAttributes.crossThreadDataSize; + kernelDescriptor.kernelAttributes.crossThreadDataSize); + this->crossThreadDataSize = kernelDescriptor.kernelAttributes.crossThreadDataSize; } if (kernelImmData->getDynamicStateHeapDataSize() != 0) { @@ -820,8 +833,8 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) { this->dynamicStateHeapDataSize = kernelImmData->getDynamicStateHeapDataSize(); } - if (kernelImmData->getDescriptor().kernelAttributes.requiredWorkgroupSize[0] > 0) { - auto *reqdSize = kernelImmData->getDescriptor().kernelAttributes.requiredWorkgroupSize; + if (kernelDescriptor.kernelAttributes.requiredWorkgroupSize[0] > 0) { + auto *reqdSize = kernelDescriptor.kernelAttributes.requiredWorkgroupSize; UNRECOVERABLE_IF(reqdSize[1] == 0); UNRECOVERABLE_IF(reqdSize[2] == 0); auto result = setGroupSize(reqdSize[0], reqdSize[1], reqdSize[2]); @@ -829,7 +842,7 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) { return result; } } else { - auto result = setGroupSize(kernelImmData->getDescriptor().kernelAttributes.simdSize, 1, 1); + auto result = setGroupSize(kernelDescriptor.kernelAttributes.simdSize, 1, 1); if (result != ZE_RESULT_SUCCESS) { return result; } @@ -837,7 +850,7 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) { residencyContainer.resize(this->kernelArgHandlers.size(), nullptr); - auto &kernelAttributes = kernelImmData->getDescriptor().kernelAttributes; + auto &kernelAttributes = kernelDescriptor.kernelAttributes; if ((kernelAttributes.perHwThreadPrivateMemorySize != 0U) && (false == module->shouldAllocatePrivateMemoryPerDispatch())) { this->privateMemoryGraphicsAllocation = allocatePrivateMemoryGraphicsAllocation(); this->patchCrossthreadDataWithPrivateAllocation(this->privateMemoryGraphicsAllocation); @@ -851,14 +864,21 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) { residencyContainer.insert(residencyContainer.end(), kernelImmData->getResidencyContainer().begin(), kernelImmData->getResidencyContainer().end()); - kernelHasIndirectAccess = kernelImmData->getDescriptor().kernelAttributes.hasNonKernelArgLoad || - kernelImmData->getDescriptor().kernelAttributes.hasNonKernelArgStore || - kernelImmData->getDescriptor().kernelAttributes.hasNonKernelArgAtomic; + kernelHasIndirectAccess = kernelDescriptor.kernelAttributes.hasNonKernelArgLoad || + kernelDescriptor.kernelAttributes.hasNonKernelArgStore || + kernelDescriptor.kernelAttributes.hasNonKernelArgAtomic; if (this->usesRayTracing()) { neoDevice->initializeRayTracing(); this->residencyContainer.push_back(neoDevice->getRTMemoryBackedBuffer()); } + if (kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs) { + pImplicitArgs = std::make_unique(); + *pImplicitArgs = {}; + pImplicitArgs->structSize = sizeof(NEO::ImplicitArgs); + pImplicitArgs->structVersion = 0; + pImplicitArgs->simdWidth = kernelDescriptor.kernelAttributes.simdSize; + } return ZE_RESULT_SUCCESS; } @@ -918,6 +938,11 @@ void KernelImp::patchWorkgroupSizeInCrossThreadData(uint32_t x, uint32_t y, uint NEO::patchVecNonPointer(dst, desc.payloadMappings.dispatchTraits.localWorkSize, workgroupSize); NEO::patchVecNonPointer(dst, desc.payloadMappings.dispatchTraits.localWorkSize2, workgroupSize); NEO::patchVecNonPointer(dst, desc.payloadMappings.dispatchTraits.enqueuedLocalWorkSize, workgroupSize); + if (pImplicitArgs) { + pImplicitArgs->localSizeX = x; + pImplicitArgs->localSizeY = y; + pImplicitArgs->localSizeZ = z; + } } ze_result_t KernelImp::setGlobalOffsetExp(uint32_t offsetX, @@ -934,6 +959,11 @@ void KernelImp::patchGlobalOffset() { const NEO::KernelDescriptor &desc = kernelImmData->getDescriptor(); auto dst = ArrayRef(crossThreadData.get(), crossThreadDataSize); NEO::patchVecNonPointer(dst, desc.payloadMappings.dispatchTraits.globalWorkOffset, this->globalOffsets); + if (pImplicitArgs) { + pImplicitArgs->globalOffsetX = globalOffsets[0]; + pImplicitArgs->globalOffsetY = globalOffsets[1]; + pImplicitArgs->globalOffsetZ = globalOffsets[2]; + } } Kernel *Kernel::create(uint32_t productFamily, Module *module, @@ -980,4 +1010,43 @@ uint32_t KernelImp::getSchedulingHintExp() { return this->schedulingHintExpFlag; } +uint32_t KernelImp::getSizeForImplicitArgsPatching() const { + if (!pImplicitArgs) { + return 0; + } + auto implicitArgsSize = static_cast(sizeof(NEO::ImplicitArgs)); + const NEO::KernelDescriptor &kernelDescriptor = kernelImmData->getDescriptor(); + auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize; + Vec3 groupSize{this->groupSize[0], this->groupSize[1], this->groupSize[2]}; + auto itemsInGroup = Math::computeTotalElementsCount(groupSize); + uint32_t localIdsSizeNeeded = + alignUp(static_cast(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal( + kernelDescriptor.kernelAttributes.simdSize, grfSize, 3u, itemsInGroup)), + MemoryConstants::cacheLineSize); + return implicitArgsSize + localIdsSizeNeeded; +} + +void KernelImp::patchImplicitArgs(void *&pOut) const { + if (!pImplicitArgs) { + return; + } + const NEO::KernelDescriptor &kernelDescriptor = kernelImmData->getDescriptor(); + auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize; + NEO::generateLocalIDs( + pOut, + static_cast(kernelDescriptor.kernelAttributes.simdSize), + std::array{{static_cast(groupSize[0]), + static_cast(groupSize[1]), + static_cast(groupSize[2])}}, + std::array{{ + kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[0], + kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[1], + kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[2], + }}, + false, grfSize); + auto sizeForLocalIdsProgramming = getSizeForImplicitArgsPatching() - sizeof(NEO::ImplicitArgs); + pOut = ptrOffset(pOut, sizeForLocalIdsProgramming); + memcpy_s(pOut, sizeof(NEO::ImplicitArgs), pImplicitArgs.get(), sizeof(NEO::ImplicitArgs)); + pOut = ptrOffset(pOut, sizeof(NEO::ImplicitArgs)); +} } // namespace L0 diff --git a/level_zero/core/source/kernel/kernel_imp.h b/level_zero/core/source/kernel/kernel_imp.h index c3beaf2b02..8a97196794 100644 --- a/level_zero/core/source/kernel/kernel_imp.h +++ b/level_zero/core/source/kernel/kernel_imp.h @@ -152,6 +152,10 @@ struct KernelImp : Kernel { ze_result_t setSchedulingHintExp(ze_scheduling_hint_exp_desc_t *pHint) override; uint32_t getSchedulingHintExp(); + NEO::ImplicitArgs *getImplicitArgs() const override { return pImplicitArgs.get(); } + uint32_t getSizeForImplicitArgsPatching() const override; + void patchImplicitArgs(void *&pOut) const override; + protected: KernelImp() = default; @@ -207,6 +211,7 @@ struct KernelImp : Kernel { bool kernelHasIndirectAccess = true; uint32_t schedulingHintExpFlag = 0u; + std::unique_ptr pImplicitArgs; }; } // namespace L0 diff --git a/level_zero/core/test/unit_tests/fixtures/device_fixture.cpp b/level_zero/core/test/unit_tests/fixtures/device_fixture.cpp index ebab93d8e5..579761156a 100644 --- a/level_zero/core/test/unit_tests/fixtures/device_fixture.cpp +++ b/level_zero/core/test/unit_tests/fixtures/device_fixture.cpp @@ -17,7 +17,11 @@ namespace L0 { namespace ult { void DeviceFixture::SetUp() { // NOLINT(readability-identifier-naming) - neoDevice = NEO::MockDevice::createWithNewExecutionEnvironment(NEO::defaultHwInfo.get()); + auto executionEnvironment = MockDevice::prepareExecutionEnvironment(NEO::defaultHwInfo.get(), 0u); + setupWithExecutionEnvironment(*executionEnvironment); +} +void DeviceFixture::setupWithExecutionEnvironment(NEO::ExecutionEnvironment &executionEnvironment) { + neoDevice = NEO::MockDevice::createWithExecutionEnvironment(NEO::defaultHwInfo.get(), &executionEnvironment, 0u); mockBuiltIns = new MockBuiltins(); neoDevice->executionEnvironment->rootDeviceEnvironments[0]->builtins.reset(mockBuiltIns); NEO::DeviceVector devices; diff --git a/level_zero/core/test/unit_tests/fixtures/device_fixture.h b/level_zero/core/test/unit_tests/fixtures/device_fixture.h index 3add70f554..6a231ade4d 100644 --- a/level_zero/core/test/unit_tests/fixtures/device_fixture.h +++ b/level_zero/core/test/unit_tests/fixtures/device_fixture.h @@ -34,6 +34,7 @@ struct DeviceFixture { NEO::MockCompilerEnableGuard compilerMock = NEO::MockCompilerEnableGuard(true); virtual void SetUp(); // NOLINT(readability-identifier-naming) virtual void TearDown(); // NOLINT(readability-identifier-naming) + void setupWithExecutionEnvironment(NEO::ExecutionEnvironment &executionEnvironment); std::unique_ptr> driverHandle; NEO::MockDevice *neoDevice = nullptr; diff --git a/level_zero/core/test/unit_tests/fixtures/module_fixture.h b/level_zero/core/test/unit_tests/fixtures/module_fixture.h index 631a658b7e..e587dc5641 100644 --- a/level_zero/core/test/unit_tests/fixtures/module_fixture.h +++ b/level_zero/core/test/unit_tests/fixtures/module_fixture.h @@ -115,7 +115,7 @@ struct ModuleImmutableDataFixture : public DeviceFixture { public: using KernelImp::kernelArgHandlers; using KernelImp::kernelHasIndirectAccess; - using L0::KernelImp::privateMemoryGraphicsAllocation; + using KernelImp::privateMemoryGraphicsAllocation; MockKernel(MockModule *mockModule) : WhiteBox(mockModule) { } @@ -125,14 +125,20 @@ struct ModuleImmutableDataFixture : public DeviceFixture { void evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor) override { return; } + void setCrossThreadData(uint32_t dataSize) { + crossThreadData.reset(new uint8_t[dataSize]); + crossThreadDataSize = dataSize; + memset(crossThreadData.get(), 0x00, crossThreadDataSize); + } ~MockKernel() override { } }; void SetUp() override { - DeviceFixture::SetUp(); - memoryManager = new MockImmutableMemoryManager(*neoDevice->executionEnvironment); - neoDevice->executionEnvironment->memoryManager.reset(memoryManager); + auto executionEnvironment = MockDevice::prepareExecutionEnvironment(NEO::defaultHwInfo.get(), 0u); + memoryManager = new MockImmutableMemoryManager(*executionEnvironment); + executionEnvironment->memoryManager.reset(memoryManager); + DeviceFixture::setupWithExecutionEnvironment(*executionEnvironment); } void createModuleFromBinary(uint32_t perHwThreadPrivateMemorySize, bool isInternal, MockImmutableData *mockKernelImmData) { diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp index c1c60f1d0e..7e4ef1f61a 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp @@ -6,6 +6,7 @@ */ #include "shared/source/gen9/reg_configs.h" +#include "shared/source/helpers/local_id_gen.h" #include "shared/source/utilities/software_tags_manager.h" #include "shared/test/common/cmd_parse/gen_cmd_parse.h" #include "shared/test/common/mocks/mock_compilers.h" @@ -668,5 +669,122 @@ HWTEST_F(CommandListArbitrationPolicyTest, whenCommandListIsResetThenOriginalThr } } +using CmdlistAppendLaunchKernelTests = Test; +HWTEST_F(CmdlistAppendLaunchKernelTests, givenKernelWithImplicitArgsWhenAppendLaunchKernelThenImplicitArgsAreSentToIndirectHeap) { + std::unique_ptr mockKernelImmData = std::make_unique(0u); + auto kernelDescriptor = mockKernelImmData->kernelDescriptor; + kernelDescriptor->kernelAttributes.flags.requiresImplicitArgs = true; + auto simd = kernelDescriptor->kernelAttributes.simdSize; + kernelDescriptor->kernelAttributes.workgroupDimensionsOrder[0] = 2; + kernelDescriptor->kernelAttributes.workgroupDimensionsOrder[1] = 1; + kernelDescriptor->kernelAttributes.workgroupDimensionsOrder[2] = 0; + createModuleFromBinary(0u, false, mockKernelImmData.get()); + + auto kernel = std::make_unique(module.get()); + + ze_kernel_desc_t kernelDesc{ZE_STRUCTURE_TYPE_KERNEL_DESC}; + kernel->initialize(&kernelDesc); + + EXPECT_TRUE(kernel->getKernelDescriptor().kernelAttributes.flags.requiresImplicitArgs); + ASSERT_NE(nullptr, kernel->getImplicitArgs()); + + kernel->setGroupSize(4, 5, 6); + kernel->setGroupCount(3, 2, 1); + kernel->setGlobalOffsetExp(1, 2, 3); + kernel->patchGlobalOffset(); + + ze_result_t result{}; + std::unique_ptr commandList(CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, result)); + + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + auto indirectHeap = commandList->commandContainer.getIndirectHeap(NEO::HeapType::INDIRECT_OBJECT); + memset(indirectHeap->getSpace(0), 0, kernel->getSizeForImplicitArgsPatching()); + + ze_group_count_t groupCount{3, 2, 1}; + result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + auto sizeCrossThreadData = kernel->getCrossThreadDataSize(); + auto sizePerThreadDataForWholeGroup = kernel->getPerThreadDataSizeForWholeThreadGroup(); + EXPECT_EQ(indirectHeap->getUsed(), sizeCrossThreadData + sizePerThreadDataForWholeGroup + kernel->getSizeForImplicitArgsPatching()); + + ImplicitArgs expectedImplicitArgs{sizeof(ImplicitArgs)}; + expectedImplicitArgs.numWorkDim = 3; + expectedImplicitArgs.simdWidth = simd; + expectedImplicitArgs.localSizeX = 4; + expectedImplicitArgs.localSizeY = 5; + expectedImplicitArgs.localSizeZ = 6; + expectedImplicitArgs.globalSizeX = 12; + expectedImplicitArgs.globalSizeY = 10; + expectedImplicitArgs.globalSizeZ = 6; + expectedImplicitArgs.globalOffsetX = 1; + expectedImplicitArgs.globalOffsetY = 2; + expectedImplicitArgs.globalOffsetZ = 3; + expectedImplicitArgs.groupCountX = 3; + expectedImplicitArgs.groupCountY = 2; + expectedImplicitArgs.groupCountZ = 1; + expectedImplicitArgs.localIdTablePtr = indirectHeap->getGraphicsAllocation()->getGpuAddress(); + + auto sizeForImplicitArgPatching = kernel->getSizeForImplicitArgsPatching(); + + EXPECT_LT(0u, sizeForImplicitArgPatching); + + auto localIdsProgrammingSize = sizeForImplicitArgPatching - sizeof(ImplicitArgs); + + auto expectedLocalIds = alignedMalloc(localIdsProgrammingSize, 64); + memset(expectedLocalIds, 0, localIdsProgrammingSize); + constexpr uint32_t grfSize = sizeof(typename FamilyType::GRF); + NEO::generateLocalIDs(expectedLocalIds, simd, + std::array{{4, 5, 6}}, + std::array{{kernelDescriptor->kernelAttributes.workgroupDimensionsOrder[0], + kernelDescriptor->kernelAttributes.workgroupDimensionsOrder[1], + kernelDescriptor->kernelAttributes.workgroupDimensionsOrder[2]}}, + false, grfSize); + + EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeap->getCpuBase(), localIdsProgrammingSize)); + auto pImplicitArgs = reinterpret_cast(ptrOffset(indirectHeap->getCpuBase(), localIdsProgrammingSize)); + EXPECT_EQ(0, memcmp(&expectedImplicitArgs, pImplicitArgs, sizeof(ImplicitArgs))); + + alignedFree(expectedLocalIds); +} +HWTEST_F(CmdlistAppendLaunchKernelTests, givenKernelWithoutImplicitArgsWhenAppendLaunchKernelThenImplicitArgsAreNotSentToIndirectHeap) { + std::unique_ptr mockKernelImmData = std::make_unique(0u); + auto kernelDescriptor = mockKernelImmData->kernelDescriptor; + kernelDescriptor->kernelAttributes.flags.requiresImplicitArgs = false; + createModuleFromBinary(0u, false, mockKernelImmData.get()); + + auto kernel = std::make_unique(module.get()); + + ze_kernel_desc_t kernelDesc{ZE_STRUCTURE_TYPE_KERNEL_DESC}; + kernel->initialize(&kernelDesc); + + EXPECT_FALSE(kernel->getKernelDescriptor().kernelAttributes.flags.requiresImplicitArgs); + EXPECT_EQ(nullptr, kernel->getImplicitArgs()); + + kernel->setGroupSize(4, 5, 6); + kernel->setGroupCount(3, 2, 1); + kernel->setGlobalOffsetExp(1, 2, 3); + kernel->patchGlobalOffset(); + + ze_result_t result{}; + std::unique_ptr commandList(CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, result)); + + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + ze_group_count_t groupCount = {3, 2, 1}; + result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + auto indirectHeap = commandList->commandContainer.getIndirectHeap(NEO::HeapType::INDIRECT_OBJECT); + + auto sizeCrossThreadData = kernel->getCrossThreadDataSize(); + auto sizePerThreadDataForWholeGroup = kernel->getPerThreadDataSizeForWholeThreadGroup(); + EXPECT_EQ(indirectHeap->getUsed(), sizeCrossThreadData + sizePerThreadDataForWholeGroup); + + auto sizeForImplicitArgPatching = kernel->getSizeForImplicitArgsPatching(); + + EXPECT_EQ(0u, sizeForImplicitArgPatching); +} } // namespace ult } // namespace L0 diff --git a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp index 8077fd6f76..07aa0b69e9 100644 --- a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp +++ b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp @@ -1909,23 +1909,15 @@ HWTEST_F(KernelGlobalWorkOffsetTests, whenSettingGlobalOffsetThenCrossThreadData using KernelWorkDimTests = Test; -HWTEST_F(KernelWorkDimTests, givenGroupCountsWhenPatchingWorkDimThenCrossThreadDataIsPatched) { - struct MockKernelWithMockCrossThreadData : public MockKernel { - public: - MockKernelWithMockCrossThreadData(MockModule *mockModule) : MockKernel(mockModule) {} - void setCrossThreadData(uint32_t dataSize) { - crossThreadData.reset(new uint8_t[dataSize]); - crossThreadDataSize = dataSize; - memset(crossThreadData.get(), 0x00, crossThreadDataSize); - } - }; +TEST_F(KernelWorkDimTests, givenGroupCountsWhenPatchingWorkDimThenCrossThreadDataIsPatched) { + uint32_t perHwThreadPrivateMemorySizeRequested = 32u; std::unique_ptr mockKernelImmData = std::make_unique(perHwThreadPrivateMemorySizeRequested); createModuleFromBinary(perHwThreadPrivateMemorySizeRequested, false, mockKernelImmData.get()); - auto kernel = std::make_unique(module.get()); + auto kernel = std::make_unique(module.get()); createKernel(kernel.get()); kernel->setCrossThreadData(sizeof(uint32_t)); @@ -2047,5 +2039,94 @@ TEST_F(PrintfTest, WhenCreatingPrintfBufferThenCrossThreadDataIsPatched) { mockKernel.crossThreadData.release(); } +using KernelImplicitArgTests = Test; + +TEST_F(KernelImplicitArgTests, givenImplicitArgsRequiredWhenCreatingKernelThenImplicitArgsAreCreated) { + std::unique_ptr mockKernelImmData = std::make_unique(0u); + + mockKernelImmData->kernelDescriptor->kernelAttributes.flags.requiresImplicitArgs = true; + auto simd = mockKernelImmData->kernelDescriptor->kernelAttributes.simdSize; + + createModuleFromBinary(0u, false, mockKernelImmData.get()); + + auto kernel = std::make_unique(module.get()); + + ze_kernel_desc_t kernelDesc{ZE_STRUCTURE_TYPE_KERNEL_DESC}; + kernel->initialize(&kernelDesc); + + EXPECT_TRUE(kernel->getKernelDescriptor().kernelAttributes.flags.requiresImplicitArgs); + auto pImplicitArgs = kernel->getImplicitArgs(); + ASSERT_NE(nullptr, pImplicitArgs); + + ImplicitArgs expectedImplicitArgs{sizeof(ImplicitArgs)}; + expectedImplicitArgs.simdWidth = simd; + EXPECT_EQ(0, memcmp(pImplicitArgs, &expectedImplicitArgs, sizeof(ImplicitArgs))); +} + +TEST_F(KernelImplicitArgTests, givenKernelWithImplicitArgsWhenSettingKernelParamsThenImplicitArgsAreUpdated) { + std::unique_ptr mockKernelImmData = std::make_unique(0u); + mockKernelImmData->kernelDescriptor->kernelAttributes.flags.requiresImplicitArgs = true; + auto simd = mockKernelImmData->kernelDescriptor->kernelAttributes.simdSize; + + createModuleFromBinary(0u, false, mockKernelImmData.get()); + + auto kernel = std::make_unique(module.get()); + + ze_kernel_desc_t kernelDesc{ZE_STRUCTURE_TYPE_KERNEL_DESC}; + kernel->initialize(&kernelDesc); + + EXPECT_TRUE(kernel->getKernelDescriptor().kernelAttributes.flags.requiresImplicitArgs); + auto pImplicitArgs = kernel->getImplicitArgs(); + ASSERT_NE(nullptr, pImplicitArgs); + + ImplicitArgs expectedImplicitArgs{sizeof(ImplicitArgs)}; + expectedImplicitArgs.numWorkDim = 3; + expectedImplicitArgs.simdWidth = simd; + expectedImplicitArgs.localSizeX = 4; + expectedImplicitArgs.localSizeY = 5; + expectedImplicitArgs.localSizeZ = 6; + expectedImplicitArgs.globalSizeX = 12; + expectedImplicitArgs.globalSizeY = 10; + expectedImplicitArgs.globalSizeZ = 6; + expectedImplicitArgs.globalOffsetX = 1; + expectedImplicitArgs.globalOffsetY = 2; + expectedImplicitArgs.globalOffsetZ = 3; + expectedImplicitArgs.groupCountX = 3; + expectedImplicitArgs.groupCountY = 2; + expectedImplicitArgs.groupCountZ = 1; + + kernel->setGroupSize(4, 5, 6); + kernel->setGroupCount(3, 2, 1); + kernel->setGlobalOffsetExp(1, 2, 3); + kernel->patchGlobalOffset(); + EXPECT_EQ(0, memcmp(pImplicitArgs, &expectedImplicitArgs, sizeof(ImplicitArgs))); +} + +TEST_F(KernelImplicitArgTests, givenKernelWithoutImplicitArgsWhenPatchingImplicitArgsThenNothingHappens) { + std::unique_ptr mockKernelImmData = std::make_unique(0u); + mockKernelImmData->kernelDescriptor->kernelAttributes.flags.requiresImplicitArgs = false; + + createModuleFromBinary(0u, false, mockKernelImmData.get()); + + auto kernel = std::make_unique(module.get()); + + ze_kernel_desc_t kernelDesc{ZE_STRUCTURE_TYPE_KERNEL_DESC}; + kernel->initialize(&kernelDesc); + EXPECT_EQ(nullptr, kernel->getImplicitArgs()); + + uint8_t initData[64]{}; + uint8_t data[64]{}; + int pattern = 0xcd; + memset(data, pattern, 64); + memset(initData, pattern, 64); + + EXPECT_EQ(0u, kernel->getSizeForImplicitArgsPatching()); + void *dataPtr = data; + kernel->patchImplicitArgs(dataPtr); + + EXPECT_EQ(dataPtr, data); + + EXPECT_EQ(0, memcmp(data, initData, 64)); +} } // namespace ult } // namespace L0 diff --git a/shared/source/command_container/command_encoder_bdw_and_later.inl b/shared/source/command_container/command_encoder_bdw_and_later.inl index 8fe145fafb..e1512cb7eb 100644 --- a/shared/source/command_container/command_encoder_bdw_and_later.inl +++ b/shared/source/command_container/command_encoder_bdw_and_later.inl @@ -16,6 +16,7 @@ #include "shared/source/helpers/simd_helper.h" #include "shared/source/helpers/state_base_address.h" #include "shared/source/kernel/dispatch_kernel_encoder_interface.h" +#include "shared/source/kernel/implicit_args.h" #include "pipe_control_args.h" @@ -129,16 +130,25 @@ void EncodeDispatchKernel::encode(CommandContainer &container, idd.setConstantIndirectUrbEntryReadLength(numGrfPerThreadData); uint32_t sizeThreadData = sizePerThreadDataForWholeGroup + sizeCrossThreadData; + uint32_t sizeForImplicitArgsPatching = dispatchInterface->getSizeForImplicitArgsPatching(); + uint32_t iohRequiredSize = sizeThreadData + sizeForImplicitArgsPatching; uint64_t offsetThreadData = 0u; { auto heapIndirect = container.getIndirectHeap(HeapType::INDIRECT_OBJECT); UNRECOVERABLE_IF(!(heapIndirect)); heapIndirect->align(WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE); - auto ptr = container.getHeapSpaceAllowGrow(HeapType::INDIRECT_OBJECT, sizeThreadData); + auto ptr = container.getHeapSpaceAllowGrow(HeapType::INDIRECT_OBJECT, iohRequiredSize); UNRECOVERABLE_IF(!(ptr)); offsetThreadData = heapIndirect->getHeapGpuStartOffset() + static_cast(heapIndirect->getUsed() - sizeThreadData); + auto pImplicitArgs = dispatchInterface->getImplicitArgs(); + if (pImplicitArgs) { + offsetThreadData -= sizeof(ImplicitArgs); + pImplicitArgs->localIdTablePtr = heapIndirect->getGraphicsAllocation()->getGpuAddress() + heapIndirect->getUsed() - iohRequiredSize; + dispatchInterface->patchImplicitArgs(ptr); + } + memcpy_s(ptr, sizeCrossThreadData, dispatchInterface->getCrossThreadData(), sizeCrossThreadData); diff --git a/shared/source/command_container/command_encoder_xehp_and_later.inl b/shared/source/command_container/command_encoder_xehp_and_later.inl index 0a4337c6a9..20bbc09832 100644 --- a/shared/source/command_container/command_encoder_xehp_and_later.inl +++ b/shared/source/command_container/command_encoder_xehp_and_later.inl @@ -24,6 +24,7 @@ #include "shared/source/helpers/simd_helper.h" #include "shared/source/helpers/state_base_address.h" #include "shared/source/kernel/dispatch_kernel_encoder_interface.h" +#include "shared/source/kernel/implicit_args.h" #include "shared/source/kernel/kernel_descriptor.h" #include "shared/source/os_interface/hw_info_config.h" @@ -162,15 +163,24 @@ void EncodeDispatchKernel::encode(CommandContainer &container, } uint32_t sizeThreadData = sizePerThreadDataForWholeGroup + sizeCrossThreadData; + uint32_t sizeForImplicitArgsPatching = dispatchInterface->getSizeForImplicitArgsPatching(); + uint32_t iohRequiredSize = sizeThreadData + sizeForImplicitArgsPatching; { auto heap = container.getIndirectHeap(HeapType::INDIRECT_OBJECT); UNRECOVERABLE_IF(!heap); heap->align(WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE); - auto ptr = container.getHeapSpaceAllowGrow(HeapType::INDIRECT_OBJECT, sizeThreadData); + auto ptr = container.getHeapSpaceAllowGrow(HeapType::INDIRECT_OBJECT, iohRequiredSize); UNRECOVERABLE_IF(!ptr); offsetThreadData = (is64bit ? heap->getHeapGpuStartOffset() : heap->getHeapGpuBase()) + static_cast(heap->getUsed() - sizeThreadData); + auto pImplicitArgs = dispatchInterface->getImplicitArgs(); + if (pImplicitArgs) { + offsetThreadData -= sizeof(ImplicitArgs); + pImplicitArgs->localIdTablePtr = heap->getGraphicsAllocation()->getGpuAddress() + heap->getUsed() - iohRequiredSize; + dispatchInterface->patchImplicitArgs(ptr); + } + if (sizeCrossThreadData > 0) { memcpy_s(ptr, sizeCrossThreadData, crossThreadData, sizeCrossThreadData); diff --git a/shared/source/kernel/dispatch_kernel_encoder_interface.h b/shared/source/kernel/dispatch_kernel_encoder_interface.h index d988217b5f..d411e9be01 100644 --- a/shared/source/kernel/dispatch_kernel_encoder_interface.h +++ b/shared/source/kernel/dispatch_kernel_encoder_interface.h @@ -11,6 +11,7 @@ namespace NEO { class GraphicsAllocation; +struct ImplicitArgs; struct KernelDescriptor; enum class SlmPolicy { @@ -44,5 +45,9 @@ struct DispatchKernelEncoderI { virtual uint32_t getRequiredWorkgroupOrder() const = 0; virtual bool requiresGenerationOfLocalIdsByRuntime() const = 0; + + virtual ImplicitArgs *getImplicitArgs() const = 0; + virtual uint32_t getSizeForImplicitArgsPatching() const = 0; + virtual void patchImplicitArgs(void *&pOut) const = 0; }; } // namespace NEO diff --git a/shared/test/common/mocks/mock_device.h b/shared/test/common/mocks/mock_device.h index b818a40abf..506fb97234 100644 --- a/shared/test/common/mocks/mock_device.h +++ b/shared/test/common/mocks/mock_device.h @@ -118,8 +118,7 @@ class MockDevice : public RootDevice { return createDeviceInternals(device); } - template - static T *createWithNewExecutionEnvironment(const HardwareInfo *pHwInfo, uint32_t rootDeviceIndex = 0) { + static ExecutionEnvironment *prepareExecutionEnvironment(const HardwareInfo *pHwInfo, uint32_t rootDeviceIndex) { ExecutionEnvironment *executionEnvironment = new ExecutionEnvironment(); auto numRootDevices = DebugManager.flags.CreateMultipleRootDevices.get() ? DebugManager.flags.CreateMultipleRootDevices.get() : rootDeviceIndex + 1; executionEnvironment->prepareRootDeviceEnvironments(numRootDevices); @@ -127,6 +126,12 @@ class MockDevice : public RootDevice { for (auto i = 0u; i < executionEnvironment->rootDeviceEnvironments.size(); i++) { executionEnvironment->rootDeviceEnvironments[i]->setHwInfo(pHwInfo); } + return executionEnvironment; + } + + template + static T *createWithNewExecutionEnvironment(const HardwareInfo *pHwInfo, uint32_t rootDeviceIndex = 0) { + auto executionEnvironment = prepareExecutionEnvironment(pHwInfo, rootDeviceIndex); return createWithExecutionEnvironment(pHwInfo, executionEnvironment, rootDeviceIndex); } diff --git a/shared/test/common/mocks/mock_dispatch_kernel_encoder_interface.h b/shared/test/common/mocks/mock_dispatch_kernel_encoder_interface.h index 76beeee3a0..698fc60d63 100644 --- a/shared/test/common/mocks/mock_dispatch_kernel_encoder_interface.h +++ b/shared/test/common/mocks/mock_dispatch_kernel_encoder_interface.h @@ -54,6 +54,10 @@ struct MockDispatchKernelEncoder : public DispatchKernelEncoderI { } void expectAnyMockFunctionCall(); + NEO::ImplicitArgs *getImplicitArgs() const override { return nullptr; } + uint32_t getSizeForImplicitArgsPatching() const override { return 0; } + void patchImplicitArgs(void *&pOut) const override {} + ::testing::NiceMock mockAllocation; static constexpr uint32_t crossThreadSize = 0x40; static constexpr uint32_t perThreadSize = 0x20;