refactor: add kernel properties and move implementations to imp class

Related-To: NEO-15374
Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
2025-12-30 01:35:20 +08:00 · 2025-07-15 13:12:06 +00:00
parent 377b99e741
commit ef5efeac0c
9 changed files with 98 additions and 38 deletions
--- a/level_zero/core/source/kernel/kernel_hw.h
+++ b/level_zero/core/source/kernel/kernel_hw.h
@@ -32,10 +32,9 @@ struct KernelHw : public KernelImp {

    void setBufferSurfaceState(uint32_t argIndex, void *address, NEO::GraphicsAllocation *alloc) override {
        uint64_t baseAddress = alloc->getGpuAddressToPatch();
-        auto sshAlignmentMask = NEO::EncodeSurfaceState<GfxFamily>::getSurfaceBaseAddressAlignmentMask();

        // Remove misaligned bytes, accounted for in bufferOffset patch token
-        baseAddress &= sshAlignmentMask;
+        baseAddress &= this->surfaceStateAlignmentMask;
        auto misalignedSize = ptrDiff(alloc->getGpuAddressToPatch(), baseAddress);
        auto offset = ptrDiff(address, reinterpret_cast<void *>(baseAddress));
        size_t bufferSizeForSsh = alloc->getUnderlyingBufferSize();
@@ -52,7 +51,7 @@ struct KernelHw : public KernelImp {
            offsetedAddress = baseAddress != reinterpret_cast<uintptr_t>(address);
            baseAddress = reinterpret_cast<uintptr_t>(address);
            bufferSizeForSsh -= offset;
-            DEBUG_BREAK_IF(baseAddress != (baseAddress & sshAlignmentMask));
+            DEBUG_BREAK_IF(baseAddress != (baseAddress & this->surfaceStateAlignmentMask));

            offset = 0;
        }
@@ -76,9 +75,8 @@ struct KernelHw : public KernelImp {
        }

        uint64_t bufferAddressForSsh = baseAddress;
-        auto alignment = NEO::EncodeSurfaceState<GfxFamily>::getSurfaceBaseAddressAlignment();
        bufferSizeForSsh += misalignedSize;
-        bufferSizeForSsh = alignUp(bufferSizeForSsh, alignment);
+        bufferSizeForSsh = alignUp(bufferSizeForSsh, this->surfaceStateAlignment);

        bool l3Enabled = true;
        // Allocation MUST be cacheline (64 byte) aligned in order to enable L3 caching otherwise Heap corruption will occur coming from the KMD.
@@ -114,38 +112,6 @@ struct KernelHw : public KernelImp {
        UNRECOVERABLE_IF(surfaceStateAddress == nullptr);
        *reinterpret_cast<typename GfxFamily::RENDER_SURFACE_STATE *>(surfaceStateAddress) = surfaceState;
    }
-
-    void evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor) override {
-        size_t localWorkSizes[3];
-        localWorkSizes[0] = this->groupSize[0];
-        localWorkSizes[1] = this->groupSize[1];
-        localWorkSizes[2] = this->groupSize[2];
-
-        kernelRequiresGenerationOfLocalIdsByRuntime = NEO::EncodeDispatchKernel<GfxFamily>::isRuntimeLocalIdsGenerationRequired(
-            kernelDescriptor.kernelAttributes.numLocalIdChannels,
-            localWorkSizes,
-            std::array<uint8_t, 3>{
-                {kernelDescriptor.kernelAttributes.workgroupWalkOrder[0],
-                 kernelDescriptor.kernelAttributes.workgroupWalkOrder[1],
-                 kernelDescriptor.kernelAttributes.workgroupWalkOrder[2]}},
-            kernelDescriptor.kernelAttributes.flags.requiresWorkgroupWalkOrder,
-            requiredWorkgroupOrder,
-            kernelDescriptor.kernelAttributes.simdSize);
-    }
-
-    uint32_t getIndirectSize() const override {
-        uint32_t totalPayloadSize = getCrossThreadDataSize() + getPerThreadDataSizeForWholeThreadGroup();
-
-        if (getKernelDescriptor().kernelAttributes.flags.passInlineData) {
-            if (totalPayloadSize > GfxFamily::DefaultWalkerType::getInlineDataSize()) {
-                totalPayloadSize -= GfxFamily::DefaultWalkerType::getInlineDataSize();
-            } else {
-                totalPayloadSize = 0;
-            }
-        }
-
-        return totalPayloadSize;
-    }
 };

 } // namespace L0
--- a/level_zero/core/source/kernel/kernel_imp.cpp
+++ b/level_zero/core/source/kernel/kernel_imp.cpp
@@ -1078,6 +1078,9 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {

    this->rcsAvailable = gfxHelper.isRcsAvailable(hwInfo);
    this->cooperativeSupport = productHelper.isCooperativeEngineSupported(hwInfo);
+    this->walkerInlineDataSize = gfxHelper.getDefaultWalkerInlineDataSize();
+    this->surfaceStateAlignmentMask = gfxHelper.getSurfaceBaseAddressAlignmentMask();
+    this->surfaceStateAlignment = gfxHelper.getSurfaceBaseAddressAlignment();

    if (isaAllocation->getAllocationType() == NEO::AllocationType::kernelIsaInternal && this->kernelImmData->getIsaParentAllocation() == nullptr) {
        isaAllocation->setTbxWritable(true, std::numeric_limits<uint32_t>::max());
@@ -1621,4 +1624,39 @@ KernelExt *KernelImp::getExtension(uint32_t extensionType) {
    return nullptr;
 }

+uint32_t KernelImp::getIndirectSize() const { // Returns the kernel payload size, reduced by walkerInlineDataSize when the kernel passes inline data.
+    uint32_t totalPayloadSize = getCrossThreadDataSize() + getPerThreadDataSizeForWholeThreadGroup(); // cross-thread data plus per-thread data for the whole thread group
+    // Only kernels whose descriptor sets passInlineData have the inline portion deducted.
+    if (getKernelDescriptor().kernelAttributes.flags.passInlineData) {
+        if (totalPayloadSize > this->walkerInlineDataSize) { // walkerInlineDataSize is cached in initialize() from gfxHelper.getDefaultWalkerInlineDataSize()
+            totalPayloadSize -= this->walkerInlineDataSize;
+        } else {
+            totalPayloadSize = 0; // payload does not exceed the inline size; set to zero instead of letting the unsigned subtraction wrap
+        }
+    }
+    // Result is the full payload when passInlineData is not set, otherwise the remainder computed above.
+    return totalPayloadSize;
+}
+
+void KernelImp::evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor) { // Stores in kernelRequiresGenerationOfLocalIdsByRuntime the result of GfxCoreHelper::isRuntimeLocalIdsGenerationRequired for the current group size.
+    auto &gfxHelper = module->getDevice()->getNEODevice()->getRootDeviceEnvironment().getHelper<NEO::GfxCoreHelper>(); // helper is looked up on every call via the module's device
+    // Copy the three groupSize entries into a size_t array, the form handed to the helper below.
+    size_t localWorkSizes[3];
+    localWorkSizes[0] = this->groupSize[0];
+    localWorkSizes[1] = this->groupSize[1];
+    localWorkSizes[2] = this->groupSize[2];
+    // Pack the first three workgroupWalkOrder entries from the kernel descriptor into a std::array<uint8_t, 3>.
+    std::array<uint8_t, 3> kernelWalkOrder{
+        kernelDescriptor.kernelAttributes.workgroupWalkOrder[0],
+        kernelDescriptor.kernelAttributes.workgroupWalkOrder[1],
+        kernelDescriptor.kernelAttributes.workgroupWalkOrder[2]};
+    // requiredWorkgroupOrder is passed by name and may be written by the helper — NOTE(review): confirm against the GfxCoreHelper signature.
+    kernelRequiresGenerationOfLocalIdsByRuntime = gfxHelper.isRuntimeLocalIdsGenerationRequired(kernelDescriptor.kernelAttributes.numLocalIdChannels,
+                                                                                                localWorkSizes,
+                                                                                                kernelWalkOrder,
+                                                                                                kernelDescriptor.kernelAttributes.flags.requiresWorkgroupWalkOrder,
+                                                                                                requiredWorkgroupOrder,
+                                                                                                kernelDescriptor.kernelAttributes.simdSize);
+}
+
 } // namespace L0
--- a/level_zero/core/source/kernel/kernel_imp.h
+++ b/level_zero/core/source/kernel/kernel_imp.h
@@ -239,6 +239,8 @@ struct KernelImp : Kernel {
        return kernelArgInfos;
    }

+    uint32_t getIndirectSize() const override;
+
  protected:
    KernelImp() = default;

@@ -248,7 +250,7 @@ struct KernelImp : Kernel {

    void createPrintfBuffer();
    void setAssertBuffer();
-    virtual void evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor) = 0;
+    MOCKABLE_VIRTUAL void evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor);
    void *patchBindlessSurfaceState(NEO::GraphicsAllocation *alloc, uint32_t bindless);
    uint32_t getSurfaceStateIndexForBindlessOffset(NEO::CrossThreadDataOffset bindlessOffset) const;
    ze_result_t validateWorkgroupSize() const;
@@ -268,6 +270,8 @@ struct KernelImp : Kernel {
    NEO::GraphicsAllocation *printfBuffer = nullptr;
    size_t syncBufferIndex = std::numeric_limits<size_t>::max();
    size_t regionGroupBarrierIndex = std::numeric_limits<size_t>::max();
+    uintptr_t surfaceStateAlignmentMask = 0;
+    uintptr_t surfaceStateAlignment = 0;

    uint32_t groupSize[3] = {0u, 0u, 0u};
    uint32_t numThreadsPerThreadGroup = 1u;
@@ -286,6 +290,7 @@ struct KernelImp : Kernel {
    uint32_t perThreadDataSizeForWholeThreadGroupAllocated = 0;
    uint32_t perThreadDataSizeForWholeThreadGroup = 0u;
    uint32_t perThreadDataSize = 0u;
+    uint32_t walkerInlineDataSize = 0;

    UnifiedMemoryControls unifiedMemoryControls;
    std::vector<uint32_t> slmArgSizes;