Use cached group sizes in zeKernelSetGroupSize

Optimize zeKernelSetGroupSize by returning success early when the requested group size values have not changed since the last call. Move the ImplicitArgs construction above the setGroupSize call in kernel initialization so that pImplicitArgs is not nullptr in calls that use the cached group sizes and return early.

Related-To: NEO-7394
Signed-off-by: Fabian Zwolinski <fabian.zwolinski@intel.com>
2023-01-11 11:50:28 +00:00 · 2023-01-11 11:50:28 +00:00 · 9dfed7cd54
parent 3a5a418488
commit 9dfed7cd54
2 changed files with 64 additions and 7 deletions
--- a/level_zero/core/source/kernel/kernel_imp.cpp
+++ b/level_zero/core/source/kernel/kernel_imp.cpp
@ -271,6 +271,12 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
        return ZE_RESULT_ERROR_INVALID_ARGUMENT;
    }

+    if (this->groupSize[0] == groupSizeX &&
+        this->groupSize[1] == groupSizeY &&
+        this->groupSize[2] == groupSizeZ) {
+        return ZE_RESULT_SUCCESS;
+    }
+
    auto numChannels = kernelImmData->getDescriptor().kernelAttributes.numLocalIdChannels;
    Vec3<size_t> groupSize{groupSizeX, groupSizeY, groupSizeZ};
    auto itemsInGroup = Math::computeTotalElementsCount(groupSize);
@ -873,6 +879,14 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
        this->dynamicStateHeapDataSize = kernelImmData->getDynamicStateHeapDataSize();
    }

+    if (kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs) {
+        pImplicitArgs = std::make_unique<NEO::ImplicitArgs>();
+        *pImplicitArgs = {};
+        pImplicitArgs->structSize = sizeof(NEO::ImplicitArgs);
+        pImplicitArgs->structVersion = 0;
+        pImplicitArgs->simdWidth = kernelDescriptor.kernelAttributes.simdSize;
+    }
+
    if (kernelDescriptor.kernelAttributes.requiredWorkgroupSize[0] > 0) {
        auto *reqdSize = kernelDescriptor.kernelAttributes.requiredWorkgroupSize;
        UNRECOVERABLE_IF(reqdSize[1] == 0);
@ -896,13 +910,6 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
        this->patchCrossthreadDataWithPrivateAllocation(this->privateMemoryGraphicsAllocation);
        this->residencyContainer.push_back(this->privateMemoryGraphicsAllocation);
    }
-    if (kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs) {
-        pImplicitArgs = std::make_unique<NEO::ImplicitArgs>();
-        *pImplicitArgs = {};
-        pImplicitArgs->structSize = sizeof(NEO::ImplicitArgs);
-        pImplicitArgs->structVersion = 0;
-        pImplicitArgs->simdWidth = kernelDescriptor.kernelAttributes.simdSize;
-    }

    this->createPrintfBuffer();

--- a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp
+++ b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp
@ -138,6 +138,18 @@ struct MockKernelWithCallTracking : Mock<::L0::Kernel> {
        return KernelImp::setArgBufferWithAlloc(argIndex, argVal, allocation);
    }
    size_t setArgBufferWithAllocCalled = 0u;
+
+    ze_result_t setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY, uint32_t groupSizeZ) override { // Tracking wrapper: updates setGroupSizeSkipCount, then forwards to KernelImp::setGroupSize and returns its result.
+        if (this->groupSize[0] == groupSizeX &&
+            this->groupSize[1] == groupSizeY &&
+            this->groupSize[2] == groupSizeZ) { // Compared against the group size stored before the base call runs.
+            setGroupSizeSkipCount++;
+        } else {
+            setGroupSizeSkipCount = 0u; // Any differing dimension resets the run of repeated requests.
+        }
+        return KernelImp::setGroupSize(groupSizeX, groupSizeY, groupSizeZ); // NOTE(review): the counter duplicates the cache condition here; it does not observe which path the base implementation actually took.
+    }
+    size_t setGroupSizeSkipCount = 0u; // Consecutive setGroupSize calls whose arguments equalled the stored groupSize; reset to 0 by a call with a different size.
 };

 using SetKernelArgCacheTest = Test<ModuleFixture>;
@ -314,6 +326,44 @@ TEST_F(KernelImpSetGroupSizeTest, givenZeroGroupSizeWhenSettingGroupSizeThenInva
    EXPECT_EQ(ZE_RESULT_ERROR_INVALID_ARGUMENT, ret);
 }

+TEST_F(KernelImpSetGroupSizeTest, givenValidGroupSizeWhenSetMultipleTimesThenSetGroupSizeIsOnlyExecutedIfNeeded) {
+    MockKernelWithCallTracking mockKernel; // Mock whose setGroupSize override counts consecutive repeated requests in setGroupSizeSkipCount.
+    Mock<Module> mockModule(this->device, nullptr);
+    mockKernel.module = &mockModule;
+
+    // First call with {2u, 3u, 5u}: expected not to match the kernel's current group size, so the skip counter stays 0 and the new sizes are stored
+    auto ret = mockKernel.setGroupSize(2u, 3u, 5u);
+    EXPECT_EQ(2u, mockKernel.groupSize[0]);
+    EXPECT_EQ(3u, mockKernel.groupSize[1]);
+    EXPECT_EQ(5u, mockKernel.groupSize[2]);
+    EXPECT_EQ(0u, mockKernel.setGroupSizeSkipCount);
+    EXPECT_EQ(ZE_RESULT_SUCCESS, ret);
+
+    // Second call with {2u, 3u, 5u}: identical to the stored sizes, so the mock records one skip and the sizes stay unchanged
+    ret = mockKernel.setGroupSize(2u, 3u, 5u);
+    EXPECT_EQ(2u, mockKernel.groupSize[0]);
+    EXPECT_EQ(3u, mockKernel.groupSize[1]);
+    EXPECT_EQ(5u, mockKernel.groupSize[2]);
+    EXPECT_EQ(1u, mockKernel.setGroupSizeSkipCount);
+    EXPECT_EQ(ZE_RESULT_SUCCESS, ret);
+
+    // First call with {1u, 2u, 3u}: a different size resets the skip counter to 0 and replaces the stored sizes
+    ret = mockKernel.setGroupSize(1u, 2u, 3u);
+    EXPECT_EQ(1u, mockKernel.groupSize[0]);
+    EXPECT_EQ(2u, mockKernel.groupSize[1]);
+    EXPECT_EQ(3u, mockKernel.groupSize[2]);
+    EXPECT_EQ(0u, mockKernel.setGroupSizeSkipCount);
+    EXPECT_EQ(ZE_RESULT_SUCCESS, ret);
+
+    // Second call with {1u, 2u, 3u}: identical again, so the skip counter goes back to 1
+    ret = mockKernel.setGroupSize(1u, 2u, 3u);
+    EXPECT_EQ(1u, mockKernel.groupSize[0]);
+    EXPECT_EQ(2u, mockKernel.groupSize[1]);
+    EXPECT_EQ(3u, mockKernel.groupSize[2]);
+    EXPECT_EQ(1u, mockKernel.setGroupSizeSkipCount);
+    EXPECT_EQ(ZE_RESULT_SUCCESS, ret);
+}
+
 using SetKernelArg = Test<ModuleFixture>;
 using ImageSupport = IsWithinProducts<IGFX_SKYLAKE, IGFX_TIGERLAKE_LP>;