Enable implicit scaling via platform config

Related-To: NEO-6819 Signed-off-by: Daniel Chabrowski <daniel.chabrowski@intel.com>
2025-12-18 13:54:58 +08:00 · 2022-05-23 17:03:53 +00:00
parent 630ecfdd09
commit b5495169ca
13 changed files with 71 additions and 12 deletions
--- a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_1.cpp
+++ b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_1.cpp
@@ -1314,7 +1314,14 @@ TEST_F(DeviceCreateCommandQueueTest, givenLowPriorityDescAndWithoutLowPriorityCs
    EXPECT_THROW(device->createCommandQueue(&desc, &commandQueueHandle), std::exception);
 }

-using MultiDeviceCreateCommandQueueTest = Test<MultiDeviceFixture>;
+struct MultiDeviceCreateCommandQueueFixture : MultiDeviceFixture { // NOTE(review): no DebugManagerStateRestore is declared here - confirm MultiDeviceFixture restores debug flags
+    void SetUp() { // disables implicit scaling first, then runs the regular multi-device set-up
+        DebugManager.flags.EnableImplicitScaling.set(0); // go through the flag setter, as the other flag writes in this change do
+        MultiDeviceFixture::SetUp();
+    }
+};
+
+using MultiDeviceCreateCommandQueueTest = Test<MultiDeviceCreateCommandQueueFixture>;

 TEST_F(MultiDeviceCreateCommandQueueTest, givenLowPriorityDescWhenCreateCommandQueueIsCalledThenLowPriorityCsrIsAssigned) {
    auto device = driverHandle->devices[0];
--- a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_enqueue_cmdlist.cpp
+++ b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_enqueue_cmdlist.cpp
@@ -576,7 +576,15 @@ HWTEST2_F(CommandQueueExecuteCommandLists, givenCommandListsWithCooperativeAndNo
    pCommandQueue->destroy();
 }

-HWTEST2_F(CommandQueueExecuteCommandLists, givenCommandListWithCooperativeKernelsWhenExecuteCommandListsIsCalledThenCorrectBatchBufferIsSubmitted, IsAtLeastXeHpCore) {
+struct CommandQueueExecuteCommandListsImplicitScalingDisabled : CommandQueueExecuteCommandLists {
+    DebugManagerStateRestore restorer{}; // NOTE(review): presumably puts the debug flags back when the fixture is destroyed - confirm
+    void SetUp() override { // variant of the base fixture that runs with implicit scaling switched off
+        DebugManager.flags.EnableImplicitScaling.set(0); // must happen before the base set-up below
+        CommandQueueExecuteCommandLists::SetUp();
+    }
+};
+
+HWTEST2_F(CommandQueueExecuteCommandListsImplicitScalingDisabled, givenCommandListWithCooperativeKernelsWhenExecuteCommandListsIsCalledThenCorrectBatchBufferIsSubmitted, IsAtLeastXeHpCore) {
    struct MockCsr : NEO::CommandStreamReceiverHw<FamilyType> {
        using NEO::CommandStreamReceiverHw<FamilyType>::CommandStreamReceiverHw;
        NEO::SubmissionStatus submitBatchBuffer(BatchBuffer &batchBuffer, ResidencyContainer &allocationsForResidency) override {
--- a/level_zero/core/test/unit_tests/sources/device/test_device.cpp
+++ b/level_zero/core/test/unit_tests/sources/device/test_device.cpp
@@ -2236,7 +2236,7 @@ TEST_F(MultipleDevicesTest, givenTwoSubDevicesFromTheSameRootDeviceThenCanAccess
    EXPECT_TRUE(canAccess);
 }

-TEST_F(MultipleDevicesTest, givenTopologyForTwoSubdevicesWhenGettingApiSliceIdWithRootDeviceThenCorrectMappingIsUsedAndApiSliceIdsForSubdeviceReturned) {
+TEST_F(MultipleDevicesDisabledImplicitScalingTest, givenTopologyForTwoSubdevicesWhenGettingApiSliceIdWithRootDeviceThenCorrectMappingIsUsedAndApiSliceIdsForSubdeviceReturned) {
    L0::Device *device0 = driverHandle->devices[0];
    auto deviceImp0 = static_cast<DeviceImp *>(device0);
    auto hwInfo = device0->getHwInfo();
@@ -2279,7 +2279,7 @@ TEST_F(MultipleDevicesTest, givenTopologyForTwoSubdevicesWhenGettingApiSliceIdWi
    EXPECT_EQ(hwInfo.gtSystemInfo.SliceCount + 0u, sliceId);
 }

-TEST_F(MultipleDevicesTest, givenTopologyForSingleSubdeviceWhenGettingApiSliceIdWithRootDeviceThenCorrectApiSliceIdsForFirstSubDeviceIsReturned) {
+TEST_F(MultipleDevicesDisabledImplicitScalingTest, givenTopologyForSingleSubdeviceWhenGettingApiSliceIdWithRootDeviceThenCorrectApiSliceIdsForFirstSubDeviceIsReturned) {
    L0::Device *device0 = driverHandle->devices[0];
    auto deviceImp0 = static_cast<DeviceImp *>(device0);
    auto hwInfo = device0->getHwInfo();
--- a/level_zero/core/test/unit_tests/sources/memory/test_memory.cpp
+++ b/level_zero/core/test/unit_tests/sources/memory/test_memory.cpp
@@ -1345,6 +1345,7 @@ struct ContextMemoryTests : public MemoryRelaxedSizeTests {

 TEST_F(ContextMemoryTests, givenMultipleSubDevicesWhenAllocatingThenUseCorrectGlobalMemorySize) {
    size_t allocationSize = neoDevice->getDeviceInfo().globalMemSize;
+    const size_t unsupportedAllocationSize = allocationSize + 1;
    size_t alignment = 1u;
    void *ptr = nullptr;

@@ -1352,11 +1353,11 @@ TEST_F(ContextMemoryTests, givenMultipleSubDevicesWhenAllocatingThenUseCorrectGl
    ze_device_mem_alloc_desc_t deviceDesc = {};
    deviceDesc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC;

-    ze_result_t result = context->allocSharedMem(device->toHandle(), &deviceDesc, &hostDesc, allocationSize, alignment, &ptr);
+    ze_result_t result = context->allocSharedMem(device->toHandle(), &deviceDesc, &hostDesc, unsupportedAllocationSize, alignment, &ptr);
    EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_SIZE, result);
    EXPECT_EQ(nullptr, ptr);

-    result = context->allocDeviceMem(device->toHandle(), &deviceDesc, allocationSize, alignment, &ptr);
+    result = context->allocDeviceMem(device->toHandle(), &deviceDesc, unsupportedAllocationSize, alignment, &ptr);
    EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_SIZE, result);
    EXPECT_EQ(nullptr, ptr);

--- a/programmers-guide/IMPLICIT_SCALING.md
+++ b/programmers-guide/IMPLICIT_SCALING.md
@@ -32,7 +32,7 @@ To manage the resources on those sub-devices, the UMD introduces two main develo
 * *Implicit scaling* model, on which application allocates and submits to the root device and driver is responsible for distribution of work and memory across tiles.
 * *Explicit scaling* model, on which application is responsible for distributing work and memory across tiles using sub-device handles.

-When doing allocations in implicit scaling mode, driver *colors* an allocation among the available tiles. Default coloring divides an allocation size evenly by the number of avaialable tiles. Other policies include dividing the allocation in chunks of a given size, which are then interleaved on each tile.
+When doing allocations in implicit scaling mode, driver *colors* an allocation among the available tiles. Default coloring divides an allocation size evenly by the number of available tiles. Other policies include dividing the allocation in chunks of a given size, which are then interleaved on each tile.

 When scheduling a kernel for execution, driver distributes the kernel workgroups among the available tiles. Default mechanism is called *Static Partitioning*, where the workgroups are evenly distributed among tiles. For instance, in a 2-tile system, half of the workgroups go to tile 0, and the other half to tile 1.

@@ -40,7 +40,7 @@ The number of CCSs, or compute engines, currently available with implicit scalin

 No implicit scaling support is available for BCSs. Considering that, two models are followed in terms of discovery of copy engines:

-* In Level Zero, the copy engines from sub-device 0 are exposed also in the root device. This to align the engine model on both the implicit and the non-implicit-scaling scenarios.
+* In Level Zero, the copy engines from sub-device 0 are exposed also in the root device. This is to align the engine model on both the implicit and the non-implicit-scaling scenarios.
 * In OpenCL, copy engines are not exposed in the root device.

 Since implicit scaling is only done for EUs, which are associated only with kernels submitted to CCS, BCSs are currently not being exposed and access to them is done through sub-device handles.
@@ -76,4 +76,4 @@ For workloads with no coherent L3 caches among tiles, such as XeHP_SDV, the foll
    * `ForceMultiGpuAtomics`: Set to `0` to have global atomics (slow mode for multi-tile) and `1` to have atomics on L3 cache (fast mode for on tile).

 * Caches are flushed after every kernel. This can be disabled with `DoNotFlushCaches=1`.
-* Kernels are serialized to maintain functional correctness of split execution.
+* Kernels are serialized to maintain functional correctness of split execution.
--- a/shared/source/os_interface/hw_info_config.h
+++ b/shared/source/os_interface/hw_info_config.h
@@ -110,6 +110,7 @@ class HwInfoConfig {
    virtual bool allowMemoryPrefetch(const HardwareInfo &hwInfo) const = 0;
    virtual bool isBcsReportWaRequired(const HardwareInfo &hwInfo) const = 0;
    virtual bool isBlitCopyRequiredForLocalMemory(const HardwareInfo &hwInfo, const GraphicsAllocation &allocation) const = 0;
+    virtual bool isImplicitScalingSupported(const HardwareInfo &hwInfo) const = 0;

    MOCKABLE_VIRTUAL ~HwInfoConfig() = default;

@@ -202,6 +203,7 @@ class HwInfoConfigHw : public HwInfoConfig {
    bool allowMemoryPrefetch(const HardwareInfo &hwInfo) const override;
    bool isBcsReportWaRequired(const HardwareInfo &hwInfo) const override;
    bool isBlitCopyRequiredForLocalMemory(const HardwareInfo &hwInfo, const GraphicsAllocation &allocation) const override;
+    bool isImplicitScalingSupported(const HardwareInfo &hwInfo) const override;

  protected:
    HwInfoConfigHw() = default;
--- a/shared/source/os_interface/hw_info_config.inl
+++ b/shared/source/os_interface/hw_info_config.inl
@@ -399,4 +399,9 @@ bool HwInfoConfigHw<gfxProduct>::isBlitCopyRequiredForLocalMemory(const Hardware
           (HwInfoConfig::get(hwInfo.platform.eProductFamily)->getLocalMemoryAccessMode(hwInfo) == LocalMemoryAccessMode::CpuAccessDisallowed ||
            !allocation.isAllocationLockable());
 }
+
+template <PRODUCT_FAMILY gfxProduct>
+bool HwInfoConfigHw<gfxProduct>::isImplicitScalingSupported(const HardwareInfo &hwInfo) const {
+    return false; // generic definition: reports no implicit scaling support and does not consult hwInfo
+}
 } // namespace NEO
--- a/shared/source/xe_hp_core/os_agnostic_hw_info_config_xe_hp_core.inl
+++ b/shared/source/xe_hp_core/os_agnostic_hw_info_config_xe_hp_core.inl
@@ -134,3 +134,8 @@ template <>
 bool HwInfoConfigHw<gfxProduct>::isBlitterForImagesSupported() const {
    return true;
 }
+
+template <>
+bool HwInfoConfigHw<gfxProduct>::isImplicitScalingSupported(const HardwareInfo &hwInfo) const {
+    return true; // specialization for this file's gfxProduct: always reports support, hwInfo is not consulted
+}
--- a/shared/source/xe_hpc_core/implicit_scaling_xe_hpc_core.cpp
+++ b/shared/source/xe_hpc_core/implicit_scaling_xe_hpc_core.cpp
@@ -22,7 +22,7 @@ bool ImplicitScalingDispatch<Family>::platformSupportsImplicitScaling(const Hard
    if (ApiSpecificConfig::getApiType() == ApiSpecificConfig::ApiType::OCL) {
        return true;
    } else {
-        return HwInfoConfig::get(hwInfo.platform.eProductFamily)->getSteppingFromHwRevId(hwInfo) >= REVISION_B;
+        return HwInfoConfig::get(hwInfo.platform.eProductFamily)->isImplicitScalingSupported(hwInfo);
    }
 }

--- a/shared/source/xe_hpc_core/pvc/os_agnostic_hw_info_config_pvc.inl
+++ b/shared/source/xe_hpc_core/pvc/os_agnostic_hw_info_config_pvc.inl
@@ -109,7 +109,7 @@ bool HwInfoConfigHw<gfxProduct>::isAdjustProgrammableIdPreferredSlmSizeRequired(

 template <>
 bool HwInfoConfigHw<gfxProduct>::isCooperativeEngineSupported(const HardwareInfo &hwInfo) const {
-    return (HwInfoConfig::get(hwInfo.platform.eProductFamily)->getSteppingFromHwRevId(hwInfo) >= REVISION_B);
+    return getSteppingFromHwRevId(hwInfo) >= REVISION_B; // true when the stepping decoded from hwInfo is REVISION_B or later; called on this object directly
 }

 bool isBaseDieA0(const HardwareInfo &hwInfo) {
@@ -174,3 +174,8 @@ bool HwInfoConfigHw<gfxProduct>::isBlitCopyRequiredForLocalMemory(const Hardware

    return false;
 }
+
+template <>
+bool HwInfoConfigHw<gfxProduct>::isImplicitScalingSupported(const HardwareInfo &hwInfo) const {
+    return getSteppingFromHwRevId(hwInfo) >= REVISION_B; // supported only when the stepping decoded from hwInfo is REVISION_B or later
+}
--- a/shared/test/common/xe_hp_core/test_hw_info_config_xe_hp_core.cpp
+++ b/shared/test/common/xe_hp_core/test_hw_info_config_xe_hp_core.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2021 Intel Corporation
+ * Copyright (C) 2021-2022 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@@ -136,3 +136,8 @@ XEHPTEST_F(TestXeHPHwInfoConfig, givenXeHpCoreWhenIsBlitterForImagesSupportedIsC

    EXPECT_TRUE(hwInfoConfig.isBlitterForImagesSupported());
 }
+
+XEHPTEST_F(TestXeHPHwInfoConfig, givenHwInfoConfigWhenIsImplicitScalingSupportedThenExpectTrue) {
+    const auto *productConfig = HwInfoConfig::get(defaultHwInfo->platform.eProductFamily); // config object for the default test platform
+    EXPECT_TRUE(productConfig->isImplicitScalingSupported(*defaultHwInfo));
+}
--- a/shared/test/unit_test/helpers/test_hw_info_config.cpp
+++ b/shared/test/unit_test/helpers/test_hw_info_config.cpp
@@ -69,3 +69,8 @@ HWTEST_F(HwInfoConfigTest, givenForceGrfNumProgrammingWithScmFlagSetWhenIsGrfNum
    DebugManager.flags.ForceGrfNumProgrammingWithScm.set(1);
    EXPECT_TRUE(hwInfoConfig.isGrfNumReportedWithScm());
 }
+
+HWTEST2_F(HwInfoConfigTest, givenHwInfoConfigWhenIsImplicitScalingSupportedThenExpectFalse, isNotXeHpOrXeHpcCore) {
+    const auto *productConfig = HwInfoConfig::get(defaultHwInfo->platform.eProductFamily); // config object for the default test platform
+    EXPECT_FALSE(productConfig->isImplicitScalingSupported(*defaultHwInfo));
+}
--- a/shared/test/unit_test/xe_hpc_core/pvc/test_hw_info_config_pvc.cpp
+++ b/shared/test/unit_test/xe_hpc_core/pvc/test_hw_info_config_pvc.cpp
@@ -5,6 +5,7 @@
 *
 */

+#include "shared/source/helpers/constants.h"
 #include "shared/source/os_interface/hw_info_config.h"
 #include "shared/test/common/helpers/default_hw_info.h"
 #include "shared/test/common/test_macros/test.h"
@@ -29,3 +30,18 @@ PVCTEST_F(PVCHwInfoConfig, givenPVCRevId0WhenGettingThreadEuRatioForScratchThen8
    hwInfo.platform.usRevId = 0;
    EXPECT_EQ(8u, hwInfoConfig.getThreadEuRatioForScratch(hwInfo));
 }
+
+PVCTEST_F(PVCHwInfoConfig, givenPVCWithDifferentSteppingsThenImplicitScalingIsEnabledForBAndHigher) {
+    const auto &hwInfoConfig = *HwInfoConfig::get(defaultHwInfo->platform.eProductFamily);
+
+    auto hwInfo = *defaultHwInfo; // local copy so the revision id can be modified per iteration
+
+    for (uint32_t stepping = 0; stepping < 0x10; stepping++) { // probe candidate stepping values 0x0..0xF
+        auto hwRevIdFromStepping = hwInfoConfig.getHwRevIdFromStepping(stepping, hwInfo);
+        if (hwRevIdFromStepping != CommonConstants::invalidStepping) { // skip steppings that have no valid revision id
+            hwInfo.platform.usRevId = hwRevIdFromStepping;
+            const bool shouldSupportImplicitScaling = stepping >= REVISION_B; // compare the stepping, not the raw revision id, against the stepping constant
+            EXPECT_EQ(shouldSupportImplicitScaling, hwInfoConfig.isImplicitScalingSupported(hwInfo)) << "hwRevId: " << hwRevIdFromStepping;
+        }
+    }
+}