mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-03 14:55:24 +08:00
Modify function dispatching cross-thread and per-thread data

Related-To: NEO-4585
Change-Id: Ia6b54b8d0c868cab5403332411655dc8c9ef4c8d
Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
Committed by: sys_ocldev
Parent commit: 97aa485048
This commit: bac5506b62
@@ -119,6 +119,24 @@ struct KernelHw : public KernelImp {
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/// Decides whether local work-item IDs must be generated by the runtime
/// (software path) or can be produced by hardware, based on the kernel's
/// descriptor and the group size currently set on this kernel.
/// Stores the verdict in kernelRequiresGenerationOfLocalIdsByRuntime.
void evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor) override {
    const auto &attributes = kernelDescriptor.kernelAttributes;

    // Mirror the currently configured group size into the shape the
    // encode helper expects (size_t[3]).
    size_t localWorkSizes[3] = {this->groupSize[0],
                                this->groupSize[1],
                                this->groupSize[2]};

    std::array<uint8_t, 3> walkOrder{{attributes.workgroupWalkOrder[0],
                                      attributes.workgroupWalkOrder[1],
                                      attributes.workgroupWalkOrder[2]}};

    kernelRequiresGenerationOfLocalIdsByRuntime =
        NEO::EncodeDispatchKernel<GfxFamily>::isRuntimeLocalIdsGenerationRequired(
            attributes.numLocalIdChannels,
            localWorkSizes,
            walkOrder,
            attributes.flags.requiresWorkgroupWalkOrder,
            requiredWorkgroupOrder,
            attributes.simdSize);
}
|
||||
};
|
||||
|
||||
} // namespace L0
|
||||
|
||||
@@ -237,37 +237,14 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
|
||||
DEBUG_BREAK_IF(true);
|
||||
return ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION;
|
||||
}
|
||||
auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;
|
||||
uint32_t perThreadDataSizeForWholeThreadGroupNeeded =
|
||||
static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
|
||||
kernelImmData->getDescriptor().kernelAttributes.simdSize, grfSize, numChannels, itemsInGroup));
|
||||
if (perThreadDataSizeForWholeThreadGroupNeeded >
|
||||
perThreadDataSizeForWholeThreadGroupAllocated) {
|
||||
alignedFree(perThreadDataForWholeThreadGroup);
|
||||
perThreadDataForWholeThreadGroup = static_cast<uint8_t *>(alignedMalloc(perThreadDataSizeForWholeThreadGroupNeeded, 32));
|
||||
perThreadDataSizeForWholeThreadGroupAllocated = perThreadDataSizeForWholeThreadGroupNeeded;
|
||||
}
|
||||
perThreadDataSizeForWholeThreadGroup = perThreadDataSizeForWholeThreadGroupNeeded;
|
||||
|
||||
if (numChannels > 0) {
|
||||
UNRECOVERABLE_IF(3 != numChannels);
|
||||
NEO::generateLocalIDs(
|
||||
perThreadDataForWholeThreadGroup,
|
||||
static_cast<uint16_t>(kernelImmData->getDescriptor().kernelAttributes.simdSize),
|
||||
std::array<uint16_t, 3>{{static_cast<uint16_t>(groupSizeX),
|
||||
static_cast<uint16_t>(groupSizeY),
|
||||
static_cast<uint16_t>(groupSizeZ)}},
|
||||
std::array<uint8_t, 3>{{0, 1, 2}},
|
||||
false, grfSize);
|
||||
}
|
||||
|
||||
this->groupSize[0] = groupSizeX;
|
||||
this->groupSize[1] = groupSizeY;
|
||||
this->groupSize[2] = groupSizeZ;
|
||||
const NEO::KernelDescriptor &kernelDescriptor = kernelImmData->getDescriptor();
|
||||
|
||||
auto simdSize = kernelImmData->getDescriptor().kernelAttributes.simdSize;
|
||||
auto simdSize = kernelDescriptor.kernelAttributes.simdSize;
|
||||
this->numThreadsPerThreadGroup = static_cast<uint32_t>((itemsInGroup + simdSize - 1u) / simdSize);
|
||||
this->perThreadDataSize = perThreadDataSizeForWholeThreadGroup / numThreadsPerThreadGroup;
|
||||
patchWorkgroupSizeInCrossThreadData(groupSizeX, groupSizeY, groupSizeZ);
|
||||
|
||||
auto remainderSimdLanes = itemsInGroup & (simdSize - 1u);
|
||||
@@ -275,7 +252,35 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
|
||||
if (!threadExecutionMask) {
|
||||
threadExecutionMask = static_cast<uint32_t>(maxNBitValue((simdSize == 1) ? 32 : simdSize));
|
||||
}
|
||||
evaluateIfRequiresGenerationOfLocalIdsByRuntime(kernelDescriptor);
|
||||
|
||||
if (kernelRequiresGenerationOfLocalIdsByRuntime) {
|
||||
auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;
|
||||
uint32_t perThreadDataSizeForWholeThreadGroupNeeded =
|
||||
static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
|
||||
simdSize, grfSize, numChannels, itemsInGroup));
|
||||
if (perThreadDataSizeForWholeThreadGroupNeeded >
|
||||
perThreadDataSizeForWholeThreadGroupAllocated) {
|
||||
alignedFree(perThreadDataForWholeThreadGroup);
|
||||
perThreadDataForWholeThreadGroup = static_cast<uint8_t *>(alignedMalloc(perThreadDataSizeForWholeThreadGroupNeeded, 32));
|
||||
perThreadDataSizeForWholeThreadGroupAllocated = perThreadDataSizeForWholeThreadGroupNeeded;
|
||||
}
|
||||
perThreadDataSizeForWholeThreadGroup = perThreadDataSizeForWholeThreadGroupNeeded;
|
||||
|
||||
if (numChannels > 0) {
|
||||
UNRECOVERABLE_IF(3 != numChannels);
|
||||
NEO::generateLocalIDs(
|
||||
perThreadDataForWholeThreadGroup,
|
||||
static_cast<uint16_t>(simdSize),
|
||||
std::array<uint16_t, 3>{{static_cast<uint16_t>(groupSizeX),
|
||||
static_cast<uint16_t>(groupSizeY),
|
||||
static_cast<uint16_t>(groupSizeZ)}},
|
||||
std::array<uint8_t, 3>{{0, 1, 2}},
|
||||
false, grfSize);
|
||||
}
|
||||
|
||||
this->perThreadDataSize = perThreadDataSizeForWholeThreadGroup / numThreadsPerThreadGroup;
|
||||
}
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
@@ -105,6 +105,9 @@ struct KernelImp : Kernel {
|
||||
uint32_t getSlmTotalSize() const override;
|
||||
NEO::GraphicsAllocation *getIsaAllocation() const override;
|
||||
|
||||
uint32_t getRequiredWorkgroupOrder() const override { return requiredWorkgroupOrder; }
|
||||
bool requiresGenerationOfLocalIdsByRuntime() const override { return kernelRequiresGenerationOfLocalIdsByRuntime; }
|
||||
|
||||
protected:
|
||||
KernelImp() = default;
|
||||
|
||||
@@ -112,6 +115,7 @@ struct KernelImp : Kernel {
|
||||
|
||||
void createPrintfBuffer();
|
||||
void setDebugSurface();
|
||||
virtual void evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor) = 0;
|
||||
|
||||
const KernelImmutableData *kernelImmData = nullptr;
|
||||
Module *module = nullptr;
|
||||
@@ -143,6 +147,9 @@ struct KernelImp : Kernel {
|
||||
UnifiedMemoryControls unifiedMemoryControls;
|
||||
std::vector<uint32_t> slmArgSizes;
|
||||
uint32_t slmArgsTotalSize = 0U;
|
||||
uint32_t requiredWorkgroupOrder = 0u;
|
||||
|
||||
bool kernelRequiresGenerationOfLocalIdsByRuntime = true;
|
||||
};
|
||||
|
||||
} // namespace L0
|
||||
|
||||
@@ -40,18 +40,22 @@ struct WhiteBox<::L0::Kernel> : public ::L0::KernelImp {
|
||||
using ::L0::KernelImp::crossThreadDataSize;
|
||||
using ::L0::KernelImp::groupSize;
|
||||
using ::L0::KernelImp::kernelImmData;
|
||||
using ::L0::KernelImp::kernelRequiresGenerationOfLocalIdsByRuntime;
|
||||
using ::L0::KernelImp::module;
|
||||
using ::L0::KernelImp::numThreadsPerThreadGroup;
|
||||
using ::L0::KernelImp::perThreadDataForWholeThreadGroup;
|
||||
using ::L0::KernelImp::perThreadDataSize;
|
||||
using ::L0::KernelImp::perThreadDataSizeForWholeThreadGroup;
|
||||
using ::L0::KernelImp::printfBuffer;
|
||||
using ::L0::KernelImp::requiredWorkgroupOrder;
|
||||
using ::L0::KernelImp::residencyContainer;
|
||||
using ::L0::KernelImp::unifiedMemoryControls;
|
||||
|
||||
void setBufferSurfaceState(uint32_t argIndex, void *address,
|
||||
NEO::GraphicsAllocation *alloc) override {}
|
||||
|
||||
void evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor) override {}
|
||||
|
||||
std::unique_ptr<Kernel> clone() const override { return nullptr; }
|
||||
|
||||
WhiteBox() : ::L0::KernelImp(nullptr) {}
|
||||
@@ -85,6 +89,7 @@ struct Mock<::L0::Kernel> : public WhiteBox<::L0::Kernel> {
|
||||
}
|
||||
|
||||
void setBufferSurfaceState(uint32_t argIndex, void *address, NEO::GraphicsAllocation *alloc) override {}
|
||||
void evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor) override {}
|
||||
std::unique_ptr<Kernel> clone() const override {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
@@ -51,6 +51,22 @@ HWTEST_F(KernelImpSetGroupSizeTest, WhenCalculatingLocalIdsThenGrfSizeIsTakenFro
|
||||
}
|
||||
}
|
||||
|
||||
HWTEST_F(KernelImpSetGroupSizeTest, givenLocalIdGenerationByRuntimeDisabledWhenSettingGroupSizeThenLocalIdsAreNotGenerated) {
    // Arrange a kernel whose local IDs are produced by hardware: the
    // runtime path that sizes and allocates per-thread data must stay idle.
    Mock<Kernel> mockKernel;
    Mock<Module> mockModule(this->device, nullptr);
    mockKernel.descriptor.kernelAttributes.simdSize = 1;
    mockKernel.module = &mockModule;
    mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime = false;

    const uint32_t sizeX = 2u;
    const uint32_t sizeY = 3u;
    const uint32_t sizeZ = 5u;
    const auto result = mockKernel.setGroupSize(sizeX, sizeY, sizeZ);

    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    // With simd == 1, every work item maps onto its own hardware thread.
    EXPECT_EQ(sizeX * sizeY * sizeZ, mockKernel.numThreadsPerThreadGroup);
    // No per-thread payload may have been sized or allocated.
    EXPECT_EQ(0u, mockKernel.perThreadDataSizeForWholeThreadGroup);
    EXPECT_EQ(0u, mockKernel.perThreadDataSize);
    EXPECT_EQ(nullptr, mockKernel.perThreadDataForWholeThreadGroup);
}
|
||||
|
||||
using SetKernelArg = Test<ModuleFixture>;
|
||||
using ImageSupport = IsWithinProducts<IGFX_SKYLAKE, IGFX_TIGERLAKE_LP>;
|
||||
|
||||
@@ -234,5 +250,11 @@ HWTEST_F(KernelPropertiesTest, givenKernelThenPropertiesAreRetrieved) {
|
||||
Kernel::fromHandle(kernelHandle)->destroy();
|
||||
}
|
||||
|
||||
HWTEST_F(KernelPropertiesTest, WhenKernelIsCreatedThenDefaultLocalIdGenerationbyRuntimeIsTrue) {
|
||||
// A freshly created kernel must default to runtime-generated local IDs
// (kernelRequiresGenerationOfLocalIdsByRuntime is initialized to true)
// until evaluateIfRequiresGenerationOfLocalIdsByRuntime decides otherwise.
createKernel();
|
||||
|
||||
EXPECT_TRUE(kernel->requiresGenerationOfLocalIdsByRuntime());
|
||||
}
|
||||
|
||||
} // namespace ult
|
||||
} // namespace L0
|
||||
|
||||
Reference in New Issue
Block a user