From 9dfed7cd54395f37c030179954a793bc656b8c70 Mon Sep 17 00:00:00 2001 From: Fabian Zwolinski Date: Wed, 11 Jan 2023 11:50:28 +0000 Subject: [PATCH] Use cached group sizes in zeKernelSetGroupSize Optimize zeKernelSetGroupSize by early returning success if group size values have not changed since last function call. Moved ImplicitArgs construction above setGroupSize call in kernel initialization to prevent pImplicitArgs being nullptr in calls in which we use cached group sizes and early return. Related-To: NEO-7394 Signed-off-by: Fabian Zwolinski --- level_zero/core/source/kernel/kernel_imp.cpp | 21 +++++--- .../unit_tests/sources/kernel/test_kernel.cpp | 50 +++++++++++++++++++ 2 files changed, 64 insertions(+), 7 deletions(-) diff --git a/level_zero/core/source/kernel/kernel_imp.cpp b/level_zero/core/source/kernel/kernel_imp.cpp index 6222568de9..59af0dbace 100644 --- a/level_zero/core/source/kernel/kernel_imp.cpp +++ b/level_zero/core/source/kernel/kernel_imp.cpp @@ -271,6 +271,12 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY, return ZE_RESULT_ERROR_INVALID_ARGUMENT; } + if (this->groupSize[0] == groupSizeX && + this->groupSize[1] == groupSizeY && + this->groupSize[2] == groupSizeZ) { + return ZE_RESULT_SUCCESS; + } + auto numChannels = kernelImmData->getDescriptor().kernelAttributes.numLocalIdChannels; Vec3 groupSize{groupSizeX, groupSizeY, groupSizeZ}; auto itemsInGroup = Math::computeTotalElementsCount(groupSize); @@ -873,6 +879,14 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) { this->dynamicStateHeapDataSize = kernelImmData->getDynamicStateHeapDataSize(); } + if (kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs) { + pImplicitArgs = std::make_unique(); + *pImplicitArgs = {}; + pImplicitArgs->structSize = sizeof(NEO::ImplicitArgs); + pImplicitArgs->structVersion = 0; + pImplicitArgs->simdWidth = kernelDescriptor.kernelAttributes.simdSize; + } + if (kernelDescriptor.kernelAttributes.requiredWorkgroupSize[0] > 0) { auto *reqdSize = kernelDescriptor.kernelAttributes.requiredWorkgroupSize; UNRECOVERABLE_IF(reqdSize[1] == 0); @@ -896,13 +910,6 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) { this->patchCrossthreadDataWithPrivateAllocation(this->privateMemoryGraphicsAllocation); this->residencyContainer.push_back(this->privateMemoryGraphicsAllocation); } - if (kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs) { - pImplicitArgs = std::make_unique(); - *pImplicitArgs = {}; - pImplicitArgs->structSize = sizeof(NEO::ImplicitArgs); - pImplicitArgs->structVersion = 0; - pImplicitArgs->simdWidth = kernelDescriptor.kernelAttributes.simdSize; - } this->createPrintfBuffer(); diff --git a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp index 333d679456..479cdf20f0 100644 --- a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp +++ b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp @@ -138,6 +138,18 @@ struct MockKernelWithCallTracking : Mock<::L0::Kernel> { return KernelImp::setArgBufferWithAlloc(argIndex, argVal, allocation); } size_t setArgBufferWithAllocCalled = 0u; + + ze_result_t setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY, uint32_t groupSizeZ) override { + if (this->groupSize[0] == groupSizeX && + this->groupSize[1] == groupSizeY && + this->groupSize[2] == groupSizeZ) { + setGroupSizeSkipCount++; + } else { + setGroupSizeSkipCount = 0u; + } + return KernelImp::setGroupSize(groupSizeX, groupSizeY, groupSizeZ); + } + size_t setGroupSizeSkipCount = 0u; }; using SetKernelArgCacheTest = Test; @@ -314,6 +326,44 @@ TEST_F(KernelImpSetGroupSizeTest, givenZeroGroupSizeWhenSettingGroupSizeThenInva EXPECT_EQ(ZE_RESULT_ERROR_INVALID_ARGUMENT, ret); } +TEST_F(KernelImpSetGroupSizeTest, givenValidGroupSizeWhenSetMultipleTimesThenSetGroupSizeIsOnlyExecutedIfNeeded) { + MockKernelWithCallTracking mockKernel; + Mock mockModule(this->device, nullptr); + mockKernel.module = &mockModule; + + // First call with {2u, 3u, 5u} group size - don't skip setGroupSize execution + auto ret = mockKernel.setGroupSize(2u, 3u, 5u); + EXPECT_EQ(2u, mockKernel.groupSize[0]); + EXPECT_EQ(3u, mockKernel.groupSize[1]); + EXPECT_EQ(5u, mockKernel.groupSize[2]); + EXPECT_EQ(0u, mockKernel.setGroupSizeSkipCount); + EXPECT_EQ(ZE_RESULT_SUCCESS, ret); + + // Second call with {2u, 3u, 5u} group size - skip setGroupSize execution + ret = mockKernel.setGroupSize(2u, 3u, 5u); + EXPECT_EQ(2u, mockKernel.groupSize[0]); + EXPECT_EQ(3u, mockKernel.groupSize[1]); + EXPECT_EQ(5u, mockKernel.groupSize[2]); + EXPECT_EQ(1u, mockKernel.setGroupSizeSkipCount); + EXPECT_EQ(ZE_RESULT_SUCCESS, ret); + + // First call with {1u, 2u, 3u} group size - don't skip setGroupSize execution + ret = mockKernel.setGroupSize(1u, 2u, 3u); + EXPECT_EQ(1u, mockKernel.groupSize[0]); + EXPECT_EQ(2u, mockKernel.groupSize[1]); + EXPECT_EQ(3u, mockKernel.groupSize[2]); + EXPECT_EQ(0u, mockKernel.setGroupSizeSkipCount); + EXPECT_EQ(ZE_RESULT_SUCCESS, ret); + + // Second call with {1u, 2u, 3u} group size - skip setGroupSize execution + ret = mockKernel.setGroupSize(1u, 2u, 3u); + EXPECT_EQ(1u, mockKernel.groupSize[0]); + EXPECT_EQ(2u, mockKernel.groupSize[1]); + EXPECT_EQ(3u, mockKernel.groupSize[2]); + EXPECT_EQ(1u, mockKernel.setGroupSizeSkipCount); + EXPECT_EQ(ZE_RESULT_SUCCESS, ret); +} + using SetKernelArg = Test; using ImageSupport = IsWithinProducts;