Refactor implicit scaling parameters for surface state

Related-To: NEO-6589

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
commit c36c083812 (parent 79c8605ed2)
Author: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
Committed: Compute-Runtime-Automation, 2022-01-14 17:50:42 +00:00

9 changed files with 105 additions and 18 deletions


@@ -266,6 +266,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
     args.gmmHelper = neoDevice->getGmmHelper();
     args.useGlobalAtomics = kernelImp->getKernelDescriptor().kernelAttributes.flags.useGlobalAtomics;
     args.areMultipleSubDevicesInContext = args.numAvailableDevices > 1;
+    args.implicitScaling = this->partitionCount > 1;
     NEO::EncodeSurfaceState<GfxFamily>::encodeBuffer(args);
     *reinterpret_cast<typename GfxFamily::RENDER_SURFACE_STATE *>(surfaceStateSpace) = surfaceState;
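For orientation, here is a minimal sketch of what the new flag is for. This is an illustration only, not the actual NEO::EncodeSurfaceState<GfxFamily>::encodeBuffer implementation; the struct and function names are hypothetical, and the setter names mirror the getters exercised by the test at the end of this diff. The intent: when a dispatch is implicitly scaled across tiles, the buffer's RENDER_SURFACE_STATE must keep multi-GPU atomics and partial writes enabled, i.e. leave the inverted "disable" bits cleared.

    // Hypothetical sketch only -- not the real NEO encoder.
    struct SurfaceStateArgsSketch {
        bool useGlobalAtomics = false;
        bool areMultipleSubDevicesInContext = false;
        bool implicitScaling = false; // the field this commit introduces
    };

    template <typename SurfaceStateT>
    void programMultiGpuBitsSketch(SurfaceStateT &state, const SurfaceStateArgsSketch &args) {
        // Multi-GPU support must stay on when the dispatch spans tiles, or when
        // global atomics are used with multiple sub-devices in the context.
        const bool multiGpuNeeded = args.implicitScaling ||
                                    (args.areMultipleSubDevicesInContext && args.useGlobalAtomics);
        // The hardware bits are inverted: writing false keeps the support enabled.
        state.setDisableSupportForMultiGpuAtomics(!multiGpuNeeded);
        state.setDisableSupportForMultiGpuPartialWrites(!multiGpuNeeded);
    }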


@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2020-2021 Intel Corporation
+ * Copyright (C) 2020-2022 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -63,12 +63,14 @@ struct KernelHw : public KernelImp {
         bufferSizeForSsh = alignUp(bufferSizeForSsh, alignment);
         bool l3Enabled = true;
         // Allocation MUST be cacheline (64 byte) aligned in order to enable L3 caching otherwise Heap corruption will occur coming from the KMD.
         // Most commonly this issue will occur with Host Point Allocations from customers.
         l3Enabled = isL3Capable(*alloc);
-        auto allocData = this->module->getDevice()->getDriverHandle()->getSvmAllocsManager()->getSVMAlloc(reinterpret_cast<void *>(alloc->getGpuAddress()));
+        Device *device = module->getDevice();
+        NEO::Device *neoDevice = device->getNEODevice();
+        auto allocData = device->getDriverHandle()->getSvmAllocsManager()->getSVMAlloc(reinterpret_cast<void *>(alloc->getGpuAddress()));
         if (allocData && allocData->allocationFlagsProperty.flags.locallyUncachedResource) {
             l3Enabled = false;
         }
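The two hoisted locals, device and neoDevice, exist for the next hunk: the MOCS, sub-device, GMM-helper, and new implicit-scaling lookups below all reuse them instead of re-deriving the device through this->module->getDevice() on every line.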
@@ -77,18 +79,17 @@ struct KernelHw : public KernelImp {
             this->kernelRequiresQueueUncachedMocsCount++;
         }
-        NEO::Device *neoDevice = module->getDevice()->getNEODevice();
         NEO::EncodeSurfaceStateArgs args;
         args.outMemory = &surfaceState;
         args.graphicsAddress = bufferAddressForSsh;
         args.size = bufferSizeForSsh;
-        args.mocs = this->module->getDevice()->getMOCS(l3Enabled, false);
+        args.mocs = device->getMOCS(l3Enabled, false);
         args.numAvailableDevices = neoDevice->getNumGenericSubDevices();
         args.allocation = alloc;
         args.gmmHelper = neoDevice->getGmmHelper();
         args.useGlobalAtomics = kernelImmData->getDescriptor().kernelAttributes.flags.useGlobalAtomics;
         args.areMultipleSubDevicesInContext = args.numAvailableDevices > 1;
+        args.implicitScaling = device->isImplicitScalingCapable();
         NEO::EncodeSurfaceState<GfxFamily>::encodeBuffer(args);
         *reinterpret_cast<typename GfxFamily::RENDER_SURFACE_STATE *>(surfaceStateAddress) = surfaceState;
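Note that the two producers of args.implicitScaling seen so far use different predicates: the command-list path in the first hunk keys off this->partitionCount > 1 (whether this particular dispatch is actually partitioned across tiles), while the kernel surface-state path here asks device->isImplicitScalingCapable() (whether the device can be partitioned at all). Both feed the same EncodeSurfaceStateArgs field.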


@@ -84,7 +84,8 @@ KernelImmutableData::~KernelImmutableData() {
 inline void patchWithImplicitSurface(ArrayRef<uint8_t> crossThreadData, ArrayRef<uint8_t> surfaceStateHeap,
                                      uintptr_t ptrToPatchInCrossThreadData, NEO::GraphicsAllocation &allocation,
-                                     const NEO::ArgDescPointer &ptr, const NEO::Device &device, bool useGlobalAtomics) {
+                                     const NEO::ArgDescPointer &ptr, const NEO::Device &device, bool useGlobalAtomics,
+                                     bool implicitScaling) {
     if (false == crossThreadData.empty()) {
         NEO::patchPointer(crossThreadData, ptr, ptrToPatchInCrossThreadData);
     }
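Pieced together from the split -/+ lines above, the helper's signature after this change reads:

    inline void patchWithImplicitSurface(ArrayRef<uint8_t> crossThreadData, ArrayRef<uint8_t> surfaceStateHeap,
                                         uintptr_t ptrToPatchInCrossThreadData, NEO::GraphicsAllocation &allocation,
                                         const NEO::ArgDescPointer &ptr, const NEO::Device &device,
                                         bool useGlobalAtomics, bool implicitScaling);

Every caller below is updated to supply the extra argument.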
@@ -107,6 +108,7 @@ inline void patchWithImplicitSurface(ArrayRef<uint8_t> crossThreadData, ArrayRef
     args.numAvailableDevices = device.getNumGenericSubDevices();
     args.areMultipleSubDevicesInContext = args.numAvailableDevices > 1;
     args.mocs = hwHelper.getMocsIndex(*args.gmmHelper, true, false) << 1;
+    args.implicitScaling = implicitScaling;
     hwHelper.encodeBufferSurfaceState(args);
 }
@@ -179,7 +181,7 @@ void KernelImmutableData::initialize(NEO::KernelInfo *kernelInfo, Device *device
         patchWithImplicitSurface(crossThredDataArrayRef, surfaceStateHeapArrayRef,
                                  static_cast<uintptr_t>(globalConstBuffer->getGpuAddressToPatch()),
                                  *globalConstBuffer, kernelDescriptor->payloadMappings.implicitArgs.globalConstantsSurfaceAddress,
-                                 *neoDevice, kernelDescriptor->kernelAttributes.flags.useGlobalAtomics);
+                                 *neoDevice, kernelDescriptor->kernelAttributes.flags.useGlobalAtomics, deviceImp->isImplicitScalingCapable());
         this->residencyContainer.push_back(globalConstBuffer);
     } else if (nullptr != globalConstBuffer) {
         this->residencyContainer.push_back(globalConstBuffer);
@@ -191,7 +193,7 @@ void KernelImmutableData::initialize(NEO::KernelInfo *kernelInfo, Device *device
         patchWithImplicitSurface(crossThredDataArrayRef, surfaceStateHeapArrayRef,
                                  static_cast<uintptr_t>(globalVarBuffer->getGpuAddressToPatch()),
                                  *globalVarBuffer, kernelDescriptor->payloadMappings.implicitArgs.globalVariablesSurfaceAddress,
-                                 *neoDevice, kernelDescriptor->kernelAttributes.flags.useGlobalAtomics);
+                                 *neoDevice, kernelDescriptor->kernelAttributes.flags.useGlobalAtomics, deviceImp->isImplicitScalingCapable());
         this->residencyContainer.push_back(globalVarBuffer);
     } else if (nullptr != globalVarBuffer) {
         this->residencyContainer.push_back(globalVarBuffer);
@@ -758,7 +760,7 @@ void KernelImp::patchCrossthreadDataWithPrivateAllocation(NEO::GraphicsAllocatio
     patchWithImplicitSurface(crossThredDataArrayRef, surfaceStateHeapArrayRef,
                              static_cast<uintptr_t>(privateAllocation->getGpuAddressToPatch()),
                              *privateAllocation, kernelImmData->getDescriptor().payloadMappings.implicitArgs.privateMemoryAddress,
-                             *device->getNEODevice(), kernelAttributes.flags.useGlobalAtomics);
+                             *device->getNEODevice(), kernelAttributes.flags.useGlobalAtomics, device->isImplicitScalingCapable());
 }
 
 ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
@@ -941,7 +943,7 @@ void KernelImp::setDebugSurface() {
         patchWithImplicitSurface(ArrayRef<uint8_t>(), surfaceStateHeapRef,
                                  0,
                                  *device->getDebugSurface(), this->getImmutableData()->getDescriptor().payloadMappings.implicitArgs.systemThreadSurfaceAddress,
-                                 *device->getNEODevice(), getKernelDescriptor().kernelAttributes.flags.useGlobalAtomics);
+                                 *device->getNEODevice(), getKernelDescriptor().kernelAttributes.flags.useGlobalAtomics, device->isImplicitScalingCapable());
     }
 }
 void *KernelImp::patchBindlessSurfaceState(NEO::GraphicsAllocation *alloc, uint32_t bindless) {


@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2020-2021 Intel Corporation
+ * Copyright (C) 2020-2022 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -7,6 +7,7 @@
 #pragma once
 
+#include "shared/source/command_container/implicit_scaling.h"
 #include "shared/source/helpers/file_io.h"
 #include "shared/source/memory_manager/allocation_properties.h"
 #include "shared/source/program/kernel_info.h"
@@ -392,5 +393,26 @@ struct ImportHostPointerModuleFixture : public ModuleFixture {
     void *hostPointer = nullptr;
 };
+
+struct MultiTileModuleFixture : public MultiDeviceModuleFixture {
+    void SetUp() {
+        DebugManager.flags.EnableImplicitScaling.set(1);
+        MultiDeviceFixture::numRootDevices = 1u;
+        MultiDeviceFixture::numSubDevices = 2u;
+
+        MultiDeviceModuleFixture::SetUp();
+        createModuleFromBinary(0);
+
+        device = driverHandle->devices[0];
+    }
+
+    void TearDown() {
+        MultiDeviceModuleFixture::TearDown();
+    }
+
+    DebugManagerStateRestore debugRestore;
+    VariableBackup<bool> backup{&NEO::ImplicitScaling::apiSupport, true};
+    L0::Device *device = nullptr;
+};
 
 } // namespace ult
 } // namespace L0
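The fixture forces implicit scaling on regardless of the hardware under test: EnableImplicitScaling is a debug override (rolled back by DebugManagerStateRestore), and VariableBackup temporarily flips NEO::ImplicitScaling::apiSupport to true for the lifetime of the test. A rough sketch of that scoped-backup idiom (simplified and hypothetical, not the real shared-test helper):

    template <typename T>
    class ScopedBackupSketch {
      public:
        // Save the current value and install the override.
        ScopedBackupSketch(T *target, T newValue) : target(target), saved(*target) {
            *target = newValue;
        }
        // Restore the original value on scope exit (fixture teardown).
        ~ScopedBackupSketch() { *target = saved; }

      private:
        T *target;
        T saved;
    };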


@@ -2328,5 +2328,34 @@ TEST_F(KernelImplicitArgTests, givenKernelWithoutImplicitArgsWhenPatchingImplici
     EXPECT_EQ(0, memcmp(data, initData, 64));
 }
+
+using MultiTileModuleTest = Test<MultiTileModuleFixture>;
+HWTEST2_F(MultiTileModuleTest, GivenMultiTileDeviceWhenSettingKernelArgAndSurfaceStateThenMultiTileFlagsAreSetCorrectly, IsXEHP) {
+    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
+
+    ze_kernel_desc_t desc = {};
+    desc.pKernelName = kernelName.c_str();
+    WhiteBoxKernelHw<gfxCoreFamily> mockKernel;
+    mockKernel.module = modules[0].get();
+    mockKernel.initialize(&desc);
+
+    auto &arg = const_cast<NEO::ArgDescPointer &>(mockKernel.kernelImmData->getDescriptor().payloadMappings.explicitArgs[0].template as<NEO::ArgDescPointer>());
+    arg.bindless = undefined<CrossThreadDataOffset>;
+    arg.bindful = 0x40;
+
+    constexpr size_t size = 128;
+    uint64_t gpuAddress = 0x2000;
+    char bufferArray[size] = {};
+    void *buffer = reinterpret_cast<void *>(bufferArray);
+
+    NEO::MockGraphicsAllocation mockAllocation(buffer, gpuAddress, size);
+    mockKernel.setBufferSurfaceState(0, buffer, &mockAllocation);
+
+    void *surfaceStateAddress = ptrOffset(mockKernel.surfaceStateHeapData.get(), arg.bindful);
+    RENDER_SURFACE_STATE *surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(surfaceStateAddress);
+
+    EXPECT_FALSE(surfaceState->getDisableSupportForMultiGpuAtomics());
+    EXPECT_FALSE(surfaceState->getDisableSupportForMultiGpuPartialWrites());
+}
 
 } // namespace ult
 } // namespace L0
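Taken together with the surface-state hunks above, this test pins down the observable contract of the commit: on a single-root-device, two-tile XeHP configuration with implicit scaling forced on, a buffer argument's RENDER_SURFACE_STATE must leave multi-GPU atomics and partial writes enabled, so both "disable" bits read back false.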