Limit space in tile for concurrent kernels on pvc

Related-To: NEO-7658, HSD-16016919338

Signed-off-by: Maciej Plewka <maciej.plewka@intel.com>
This commit is contained in:
Maciej Plewka
2023-02-16 11:10:34 +00:00
committed by Compute-Runtime-Automation
parent aa661c1878
commit 829c93ca68
14 changed files with 233 additions and 56 deletions

View File

@@ -17,6 +17,7 @@ set(L0_FIXTURES_SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/event_fixture.cpp
${CMAKE_CURRENT_SOURCE_DIR}/event_fixture.h
${CMAKE_CURRENT_SOURCE_DIR}/host_pointer_manager_fixture.h
${CMAKE_CURRENT_SOURCE_DIR}/kernel_max_cooperative_groups_count_fixture.h
${CMAKE_CURRENT_SOURCE_DIR}/module_fixture.cpp
${CMAKE_CURRENT_SOURCE_DIR}/module_fixture.h
${CMAKE_CURRENT_SOURCE_DIR}/memory_ipc_fixture.h

View File

@@ -0,0 +1,69 @@
/*
* Copyright (C) 2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#pragma once
#include "shared/source/helpers/gfx_core_helper.h"
#include "level_zero/core/test/unit_tests/fixtures/device_fixture.h"
#include "level_zero/core/test/unit_tests/mocks/mock_kernel.h"
#include "level_zero/core/test/unit_tests/mocks/mock_module.h"
namespace L0 {
namespace ult {
class KernelImpSuggestMaxCooperativeGroupCountTests : public Test<DeviceFixture> {
public:
const uint32_t numGrf = 128;
const uint32_t simd = 8;
const uint32_t lws[3] = {1, 1, 1};
uint32_t usedSlm = 0;
uint32_t usesBarriers = 0;
uint32_t availableThreadCount;
uint32_t dssCount;
uint32_t availableSlm;
uint32_t maxBarrierCount;
WhiteBox<::L0::KernelImmutableData> kernelInfo;
NEO::KernelDescriptor kernelDescriptor;
void SetUp() override {
Test<DeviceFixture>::SetUp();
kernelInfo.kernelDescriptor = &kernelDescriptor;
auto &hardwareInfo = device->getHwInfo();
auto &helper = device->getNEODevice()->getRootDeviceEnvironment().getHelper<GfxCoreHelper>();
availableThreadCount = helper.calculateAvailableThreadCount(hardwareInfo, numGrf);
dssCount = hardwareInfo.gtSystemInfo.DualSubSliceCount;
if (dssCount == 0) {
dssCount = hardwareInfo.gtSystemInfo.SubSliceCount;
}
availableSlm = dssCount * KB * hardwareInfo.capabilityTable.slmSize;
maxBarrierCount = static_cast<uint32_t>(helper.getMaxBarrierRegisterPerSlice());
kernelInfo.kernelDescriptor->kernelAttributes.simdSize = simd;
kernelInfo.kernelDescriptor->kernelAttributes.numGrfRequired = numGrf;
}
uint32_t getMaxWorkGroupCount() {
kernelInfo.kernelDescriptor->kernelAttributes.slmInlineSize = usedSlm;
kernelInfo.kernelDescriptor->kernelAttributes.barrierCount = usesBarriers;
Mock<Kernel> kernel;
kernel.kernelImmData = &kernelInfo;
auto module = std::make_unique<ModuleImp>(device, nullptr, ModuleType::User);
kernel.module = module.get();
kernel.groupSize[0] = lws[0];
kernel.groupSize[1] = lws[1];
kernel.groupSize[2] = lws[2];
uint32_t totalGroupCount = 0;
kernel.KernelImp::suggestMaxCooperativeGroupCount(&totalGroupCount, NEO::EngineGroupType::CooperativeCompute, true);
return totalGroupCount;
}
};
} // namespace ult
} // namespace L0

View File

@@ -15,6 +15,7 @@
#include "shared/test/common/test_macros/test.h"
#include "level_zero/core/test/unit_tests/fixtures/device_fixture.h"
#include "level_zero/core/test/unit_tests/fixtures/kernel_max_cooperative_groups_count_fixture.h"
#include "level_zero/core/test/unit_tests/mocks/mock_kernel.h"
#include "level_zero/core/test/unit_tests/mocks/mock_module.h"
@@ -351,70 +352,19 @@ TEST(zeKernelGetProperties, WhenGettingKernelPropertiesThenSuccessIsReturned) {
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
}
class KernelImpSuggestMaxCooperativeGroupCountTests : public KernelImp {
public:
const uint32_t numGrf = 128;
const uint32_t simd = 8;
const uint32_t lws[3] = {1, 1, 1};
uint32_t usedSlm = 0;
uint32_t usesBarriers = 0;
uint32_t availableThreadCount;
uint32_t dssCount;
uint32_t availableSlm;
uint32_t maxBarrierCount;
WhiteBox<::L0::KernelImmutableData> kernelInfo;
NEO::KernelDescriptor kernelDescriptor;
void SetUp() override {
KernelImp::SetUp();
kernelInfo.kernelDescriptor = &kernelDescriptor;
auto &hardwareInfo = device->getHwInfo();
auto &helper = device->getNEODevice()->getRootDeviceEnvironment().getHelper<GfxCoreHelper>();
availableThreadCount = helper.calculateAvailableThreadCount(hardwareInfo, numGrf);
dssCount = hardwareInfo.gtSystemInfo.DualSubSliceCount;
if (dssCount == 0) {
dssCount = hardwareInfo.gtSystemInfo.SubSliceCount;
}
availableSlm = dssCount * KB * hardwareInfo.capabilityTable.slmSize;
maxBarrierCount = static_cast<uint32_t>(helper.getMaxBarrierRegisterPerSlice());
kernelInfo.kernelDescriptor->kernelAttributes.simdSize = simd;
kernelInfo.kernelDescriptor->kernelAttributes.numGrfRequired = numGrf;
}
uint32_t getMaxWorkGroupCount() {
kernelInfo.kernelDescriptor->kernelAttributes.slmInlineSize = usedSlm;
kernelInfo.kernelDescriptor->kernelAttributes.barrierCount = usesBarriers;
Mock<Kernel> kernel;
kernel.kernelImmData = &kernelInfo;
auto module = std::make_unique<ModuleImp>(device, nullptr, ModuleType::User);
kernel.module = module.get();
kernel.groupSize[0] = lws[0];
kernel.groupSize[1] = lws[1];
kernel.groupSize[2] = lws[2];
uint32_t totalGroupCount = 0;
kernel.KernelImp::suggestMaxCooperativeGroupCount(&totalGroupCount, NEO::EngineGroupType::CooperativeCompute, true);
return totalGroupCount;
}
};
TEST_F(KernelImpSuggestMaxCooperativeGroupCountTests, GivenNoBarriersOrSlmUsedWhenCalculatingMaxCooperativeGroupCountThenResultIsCalculatedWithSimd) {
HWTEST_F(KernelImpSuggestMaxCooperativeGroupCountTests, GivenNoBarriersOrSlmUsedWhenCalculatingMaxCooperativeGroupCountThenResultIsCalculatedWithSimd) {
auto workGroupSize = lws[0] * lws[1] * lws[2];
auto expected = availableThreadCount / Math::divideAndRoundUp(workGroupSize, simd);
EXPECT_EQ(expected, getMaxWorkGroupCount());
}
TEST_F(KernelImpSuggestMaxCooperativeGroupCountTests, GivenBarriersWhenCalculatingMaxCooperativeGroupCountThenResultIsCalculatedWithRegardToBarriersCount) {
HWTEST_F(KernelImpSuggestMaxCooperativeGroupCountTests, GivenBarriersWhenCalculatingMaxCooperativeGroupCountThenResultIsCalculatedWithRegardToBarriersCount) {
usesBarriers = 1;
auto expected = dssCount * (maxBarrierCount / usesBarriers);
EXPECT_EQ(expected, getMaxWorkGroupCount());
}
TEST_F(KernelImpSuggestMaxCooperativeGroupCountTests, GivenUsedSlmSizeWhenCalculatingMaxCooperativeGroupCountThenResultIsCalculatedWithRegardToUsedSlmSize) {
HWTEST_F(KernelImpSuggestMaxCooperativeGroupCountTests, GivenUsedSlmSizeWhenCalculatingMaxCooperativeGroupCountThenResultIsCalculatedWithRegardToUsedSlmSize) {
usedSlm = 64 * KB;
auto expected = availableSlm / usedSlm;
EXPECT_EQ(expected, getMaxWorkGroupCount());

View File

@@ -9,6 +9,7 @@ if(TESTS_PVC)
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
${CMAKE_CURRENT_SOURCE_DIR}/test_device_pvc.cpp
${CMAKE_CURRENT_SOURCE_DIR}/test_cmdlist_pvc.cpp
${CMAKE_CURRENT_SOURCE_DIR}/test_kernel_pvc.cpp
${CMAKE_CURRENT_SOURCE_DIR}/test_sampler_pvc.cpp
)
endif()

View File

@@ -0,0 +1,42 @@
/*
* Copyright (C) 2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/helpers/basic_math.h"
#include "shared/source/xe_hpc_core/hw_cmds_pvc.h"
#include "shared/test/common/test_macros/header/per_product_test_definitions.h"
#include "shared/test/common/test_macros/hw_test_base.h"
#include "level_zero/core/test/unit_tests/fixtures/kernel_max_cooperative_groups_count_fixture.h"
namespace L0 {
namespace ult {
HWTEST_EXCLUDE_PRODUCT(KernelImpSuggestMaxCooperativeGroupCountTests, GivenUsedSlmSizeWhenCalculatingMaxCooperativeGroupCountThenResultIsCalculatedWithRegardToUsedSlmSize, IGFX_PVC);
HWTEST_EXCLUDE_PRODUCT(KernelImpSuggestMaxCooperativeGroupCountTests, GivenBarriersWhenCalculatingMaxCooperativeGroupCountThenResultIsCalculatedWithRegardToBarriersCount, IGFX_PVC);
HWTEST_EXCLUDE_PRODUCT(KernelImpSuggestMaxCooperativeGroupCountTests, GivenNoBarriersOrSlmUsedWhenCalculatingMaxCooperativeGroupCountThenResultIsCalculatedWithSimd, IGFX_PVC);
using KernelImpSuggestMaxCooperativeGroupCountTestsPvc = KernelImpSuggestMaxCooperativeGroupCountTests;
PVCTEST_F(KernelImpSuggestMaxCooperativeGroupCountTestsPvc, GivenNoBarriersOrSlmUsedWhenCalculatingMaxCooperativeGroupCountThenResultIsCalculatedWithSimd) {
auto workGroupSize = lws[0] * lws[1] * lws[2];
auto expected = (availableThreadCount / Math::divideAndRoundUp(workGroupSize, simd)) / PVC::numberOfpartsInTileForConcurrentKernels;
EXPECT_EQ(expected, getMaxWorkGroupCount());
}
PVCTEST_F(KernelImpSuggestMaxCooperativeGroupCountTestsPvc, GivenBarriersWhenCalculatingMaxCooperativeGroupCountThenResultIsCalculatedWithRegardToBarriersCount) {
usesBarriers = 1;
auto expected = (dssCount * (maxBarrierCount / usesBarriers)) / PVC::numberOfpartsInTileForConcurrentKernels;
EXPECT_EQ(expected, getMaxWorkGroupCount());
}
PVCTEST_F(KernelImpSuggestMaxCooperativeGroupCountTestsPvc, GivenUsedSlmSizeWhenCalculatingMaxCooperativeGroupCountThenResultIsCalculatedWithRegardToUsedSlmSize) {
usedSlm = 64 * KB;
auto expected = (availableSlm / usedSlm) / PVC::numberOfpartsInTileForConcurrentKernels;
EXPECT_EQ(expected, getMaxWorkGroupCount());
}
} // namespace ult
} // namespace L0

View File

@@ -63,13 +63,14 @@ uint32_t GfxCoreHelperHw<Family>::adjustMaxWorkGroupCount(uint32_t maxWorkGroupC
bool requiresLimitation = productHelper.isCooperativeEngineSupported(hwInfo) &&
(engineGroupType != EngineGroupType::CooperativeCompute) &&
(!isEngineInstanced);
auto numberOfpartsInTileForConcurrentKernels = productHelper.getNumberOfPartsInTileForConcurrentKernel();
if (requiresLimitation) {
auto ccsCount = hwInfo.gtSystemInfo.CCSInfo.NumberOfCCSEnabled;
UNRECOVERABLE_IF(ccsCount == 0);
return maxWorkGroupCount / ccsCount;
numberOfpartsInTileForConcurrentKernels = std::max(numberOfpartsInTileForConcurrentKernels, ccsCount);
}
return maxWorkGroupCount;
return maxWorkGroupCount / numberOfpartsInTileForConcurrentKernels;
}
template <typename Family>

View File

@@ -198,6 +198,7 @@ class ProductHelper {
virtual bool isMultiContextResourceDeferDeletionSupported() const = 0;
virtual bool isFusedEuDisabledForDpas(bool kernelHasDpasInstructions, const uint32_t *lws, const uint32_t *groupCount) const = 0;
virtual bool isCalculationForDisablingEuFusionWithDpasNeeded() const = 0;
virtual uint32_t getNumberOfPartsInTileForConcurrentKernel() const = 0;
virtual ~ProductHelper() = default;

View File

@@ -526,6 +526,10 @@ template <PRODUCT_FAMILY gfxProduct>
bool ProductHelperHw<gfxProduct>::isStatefulAddressingModeSupported() const {
return true;
}
template <PRODUCT_FAMILY gfxProduct>
uint32_t ProductHelperHw<gfxProduct>::getNumberOfPartsInTileForConcurrentKernel() const {
return 1u;
}
template <PRODUCT_FAMILY gfxProduct>
bool ProductHelperHw<gfxProduct>::isPlatformQuerySupported() const {

View File

@@ -117,6 +117,7 @@ class ProductHelperHw : public ProductHelper {
void adjustNumberOfCcs(HardwareInfo &hwInfo) const override;
bool isPrefetcherDisablingInDirectSubmissionRequired() const override;
bool isStatefulAddressingModeSupported() const override;
uint32_t getNumberOfPartsInTileForConcurrentKernel() const override;
bool isPlatformQuerySupported() const override;
bool isNonBlockingGpuSubmissionSupported() const override;
bool isResolveDependenciesByPipeControlsSupported(const HardwareInfo &hwInfo, bool isOOQ) const override;

View File

@@ -25,6 +25,7 @@ struct PVC : public XeHpcCoreFamily {
static const uint32_t maxSubslicesSupported = 64;
static const uint32_t maxDualSubslicesSupported = 64;
static const RuntimeCapabilityTable capabilityTable;
static constexpr uint32_t numberOfpartsInTileForConcurrentKernels = 8u;
struct FrontEndStateSupport {
static constexpr bool scratchSize = true;

View File

@@ -226,4 +226,8 @@ std::optional<aub_stream::ProductFamily> ProductHelperHw<gfxProduct>::getAubStre
return aub_stream::ProductFamily::Pvc;
};
template <>
uint32_t ProductHelperHw<gfxProduct>::getNumberOfPartsInTileForConcurrentKernel() const {
return PVC::numberOfpartsInTileForConcurrentKernels;
}
} // namespace NEO

View File

@@ -1474,4 +1474,55 @@ HWTEST_F(ProductHelperCommonTest, givenHwHelperWhenIsFusedEuDisabledForDpasCalle
HWTEST_F(ProductHelperCommonTest, givenProductHelperWhenCallingIsCalculationForDisablingEuFusionWithDpasNeededThenFalseReturned) {
auto &gfxCoreHelper = getHelper<ProductHelper>();
EXPECT_FALSE(gfxCoreHelper.isCalculationForDisablingEuFusionWithDpasNeeded());
}
HWTEST_F(GfxCoreHelperTest, GivenCooperativeEngineSupportedAndNotUsedWhenAdjustMaxWorkGroupCountIsCalledThenSmallerValueIsReturned) {
MockExecutionEnvironment mockExecutionEnvironment{};
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
const auto &productHelper = rootDeviceEnvironment.getHelper<ProductHelper>();
auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<GfxCoreHelper>();
auto &hwInfo = *rootDeviceEnvironment.getMutableHardwareInfo();
hwInfo.capabilityTable.defaultEngineType = aub_stream::EngineType::ENGINE_CCS;
hwInfo.featureTable.flags.ftrRcsNode = false;
uint32_t revisions[] = {REVISION_A0, REVISION_B};
for (auto &revision : revisions) {
auto hwRevId = productHelper.getHwRevIdFromStepping(revision, hwInfo);
if (hwRevId == CommonConstants::invalidStepping) {
continue;
}
hwInfo.platform.usRevId = hwRevId;
for (auto isEngineInstanced : ::testing::Bool()) {
for (auto isRcsEnabled : ::testing::Bool()) {
hwInfo.featureTable.flags.ftrRcsNode = isRcsEnabled;
for (auto engineGroupType : {EngineGroupType::RenderCompute, EngineGroupType::Compute, EngineGroupType::CooperativeCompute}) {
if (productHelper.isCooperativeEngineSupported(hwInfo)) {
bool disallowDispatch = (engineGroupType == EngineGroupType::RenderCompute) ||
((engineGroupType == EngineGroupType::Compute) && isRcsEnabled);
bool applyLimitation = !isEngineInstanced &&
(engineGroupType != EngineGroupType::CooperativeCompute);
if (disallowDispatch) {
EXPECT_EQ(1u, gfxCoreHelper.adjustMaxWorkGroupCount(4u, engineGroupType, rootDeviceEnvironment, isEngineInstanced));
EXPECT_EQ(1u, gfxCoreHelper.adjustMaxWorkGroupCount(1024u, engineGroupType, rootDeviceEnvironment, isEngineInstanced));
} else if (applyLimitation) {
hwInfo.gtSystemInfo.CCSInfo.NumberOfCCSEnabled = 4;
EXPECT_EQ(1u, gfxCoreHelper.adjustMaxWorkGroupCount(4u, engineGroupType, rootDeviceEnvironment, isEngineInstanced));
EXPECT_EQ(256u, gfxCoreHelper.adjustMaxWorkGroupCount(1024u, engineGroupType, rootDeviceEnvironment, isEngineInstanced));
hwInfo.gtSystemInfo.CCSInfo.NumberOfCCSEnabled = 2;
EXPECT_EQ(2u, gfxCoreHelper.adjustMaxWorkGroupCount(4u, engineGroupType, rootDeviceEnvironment, isEngineInstanced));
EXPECT_EQ(512u, gfxCoreHelper.adjustMaxWorkGroupCount(1024u, engineGroupType, rootDeviceEnvironment, isEngineInstanced));
} else {
EXPECT_EQ(4u, gfxCoreHelper.adjustMaxWorkGroupCount(4u, engineGroupType, rootDeviceEnvironment, isEngineInstanced));
EXPECT_EQ(1024u, gfxCoreHelper.adjustMaxWorkGroupCount(1024u, engineGroupType, rootDeviceEnvironment, isEngineInstanced));
}
} else {
EXPECT_EQ(4u, gfxCoreHelper.adjustMaxWorkGroupCount(4u, engineGroupType, rootDeviceEnvironment, isEngineInstanced));
EXPECT_EQ(1024u, gfxCoreHelper.adjustMaxWorkGroupCount(1024u, engineGroupType, rootDeviceEnvironment, isEngineInstanced));
}
}
}
}
}
}

View File

@@ -9,3 +9,4 @@
HWTEST_EXCLUDE_PRODUCT(ProductHelperTest, givenProductHelperWhenAskedIfIsBlitSplitEnqueueWARequiredThenReturnFalse, IGFX_PVC);
HWTEST_EXCLUDE_PRODUCT(BlitTests, GivenCpuAccessToLocalMemoryWhenGettingMaxBlitSizeThenValuesAreOverriden_BlitPlatforms, IGFX_PVC);
HWTEST_EXCLUDE_PRODUCT(GfxCoreHelperTest, GivenCooperativeEngineSupportedAndNotUsedWhenAdjustMaxWorkGroupCountIsCalledThenSmallerValueIsReturned, IGFX_PVC);

View File

@@ -157,4 +157,54 @@ PVCTEST_F(GfxCoreHelperTestsPvc, givenMemorySynchronizationCommandsWhenAddingSyn
}
}
}
PVCTEST_F(GfxCoreHelperTestsPvc, GivenCooperativeEngineSupportedAndNotUsedWhenAdjustMaxWorkGroupCountIsCalledThenSmallerValueIsReturned) {
MockExecutionEnvironment mockExecutionEnvironment{};
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
const auto &productHelper = rootDeviceEnvironment.getHelper<ProductHelper>();
auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<GfxCoreHelper>();
auto &hwInfo = *rootDeviceEnvironment.getMutableHardwareInfo();
hwInfo.capabilityTable.defaultEngineType = aub_stream::EngineType::ENGINE_CCS;
hwInfo.featureTable.flags.ftrRcsNode = false;
auto tilePartsForConcurrentKernels = PVC::numberOfpartsInTileForConcurrentKernels;
auto passedMaxWorkGroupCount = 1024;
uint32_t revisions[] = {REVISION_A0, REVISION_B};
for (auto &revision : revisions) {
auto hwRevId = productHelper.getHwRevIdFromStepping(revision, hwInfo);
if (hwRevId == CommonConstants::invalidStepping) {
continue;
}
hwInfo.platform.usRevId = hwRevId;
for (auto isEngineInstanced : ::testing::Bool()) {
for (auto isRcsEnabled : ::testing::Bool()) {
hwInfo.featureTable.flags.ftrRcsNode = isRcsEnabled;
for (auto engineGroupType : {EngineGroupType::RenderCompute, EngineGroupType::Compute, EngineGroupType::CooperativeCompute}) {
if (productHelper.isCooperativeEngineSupported(hwInfo)) {
bool disallowDispatch = (engineGroupType == EngineGroupType::RenderCompute) ||
((engineGroupType == EngineGroupType::Compute) && isRcsEnabled);
bool applyLimitation = !isEngineInstanced &&
(engineGroupType != EngineGroupType::CooperativeCompute);
if (disallowDispatch) {
EXPECT_EQ(1u, gfxCoreHelper.adjustMaxWorkGroupCount(passedMaxWorkGroupCount, engineGroupType, rootDeviceEnvironment, isEngineInstanced));
} else if (applyLimitation) {
hwInfo.gtSystemInfo.CCSInfo.NumberOfCCSEnabled = 4;
EXPECT_EQ(passedMaxWorkGroupCount / tilePartsForConcurrentKernels, gfxCoreHelper.adjustMaxWorkGroupCount(passedMaxWorkGroupCount, engineGroupType, rootDeviceEnvironment, isEngineInstanced));
hwInfo.gtSystemInfo.CCSInfo.NumberOfCCSEnabled = 16;
EXPECT_EQ(passedMaxWorkGroupCount / hwInfo.gtSystemInfo.CCSInfo.NumberOfCCSEnabled, gfxCoreHelper.adjustMaxWorkGroupCount(passedMaxWorkGroupCount, engineGroupType, rootDeviceEnvironment, isEngineInstanced));
} else {
EXPECT_EQ(passedMaxWorkGroupCount / tilePartsForConcurrentKernels, gfxCoreHelper.adjustMaxWorkGroupCount(passedMaxWorkGroupCount, engineGroupType, rootDeviceEnvironment, isEngineInstanced));
}
} else {
EXPECT_EQ(passedMaxWorkGroupCount / tilePartsForConcurrentKernels, gfxCoreHelper.adjustMaxWorkGroupCount(passedMaxWorkGroupCount, engineGroupType, rootDeviceEnvironment, isEngineInstanced));
}
}
}
}
}
}
} // namespace NEO