Limit space in tile for concurrent kernels on pvc

Related-To: NEO-7658, HSD-16016919338

Signed-off-by: Maciej Plewka <maciej.plewka@intel.com>
This commit is contained in:
Maciej Plewka
2023-02-16 11:10:34 +00:00
committed by Compute-Runtime-Automation
parent 2d9f1caa8c
commit 83eb52591d
14 changed files with 235 additions and 58 deletions

View File

@@ -17,6 +17,7 @@ set(L0_FIXTURES_SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/event_fixture.cpp
${CMAKE_CURRENT_SOURCE_DIR}/event_fixture.h
${CMAKE_CURRENT_SOURCE_DIR}/host_pointer_manager_fixture.h
${CMAKE_CURRENT_SOURCE_DIR}/kernel_max_cooperative_groups_count_fixture.h
${CMAKE_CURRENT_SOURCE_DIR}/module_fixture.cpp
${CMAKE_CURRENT_SOURCE_DIR}/module_fixture.h
${CMAKE_CURRENT_SOURCE_DIR}/memory_ipc_fixture.h

View File

@@ -0,0 +1,69 @@
/*
* Copyright (C) 2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#pragma once
#include "shared/source/helpers/gfx_core_helper.h"
#include "level_zero/core/test/unit_tests/fixtures/device_fixture.h"
#include "level_zero/core/test/unit_tests/mocks/mock_kernel.h"
#include "level_zero/core/test/unit_tests/mocks/mock_module.h"
namespace L0 {
namespace ult {
class KernelImpSuggestMaxCooperativeGroupCountTests : public Test<DeviceFixture> {
public:
const uint32_t numGrf = 128;
const uint32_t simd = 8;
const uint32_t lws[3] = {1, 1, 1};
uint32_t usedSlm = 0;
uint32_t usesBarriers = 0;
uint32_t availableThreadCount;
uint32_t dssCount;
uint32_t availableSlm;
uint32_t maxBarrierCount;
WhiteBox<::L0::KernelImmutableData> kernelInfo;
NEO::KernelDescriptor kernelDescriptor;
void SetUp() override {
Test<DeviceFixture>::SetUp();
kernelInfo.kernelDescriptor = &kernelDescriptor;
auto &hardwareInfo = device->getHwInfo();
auto &helper = device->getNEODevice()->getRootDeviceEnvironment().getHelper<GfxCoreHelper>();
availableThreadCount = helper.calculateAvailableThreadCount(hardwareInfo, numGrf);
dssCount = hardwareInfo.gtSystemInfo.DualSubSliceCount;
if (dssCount == 0) {
dssCount = hardwareInfo.gtSystemInfo.SubSliceCount;
}
availableSlm = dssCount * KB * hardwareInfo.capabilityTable.slmSize;
maxBarrierCount = static_cast<uint32_t>(helper.getMaxBarrierRegisterPerSlice());
kernelInfo.kernelDescriptor->kernelAttributes.simdSize = simd;
kernelInfo.kernelDescriptor->kernelAttributes.numGrfRequired = numGrf;
}
uint32_t getMaxWorkGroupCount() {
kernelInfo.kernelDescriptor->kernelAttributes.slmInlineSize = usedSlm;
kernelInfo.kernelDescriptor->kernelAttributes.barrierCount = usesBarriers;
Mock<Kernel> kernel;
kernel.kernelImmData = &kernelInfo;
auto module = std::make_unique<ModuleImp>(device, nullptr, ModuleType::User);
kernel.module = module.get();
kernel.groupSize[0] = lws[0];
kernel.groupSize[1] = lws[1];
kernel.groupSize[2] = lws[2];
uint32_t totalGroupCount = 0;
kernel.KernelImp::suggestMaxCooperativeGroupCount(&totalGroupCount, NEO::EngineGroupType::CooperativeCompute, true);
return totalGroupCount;
}
};
} // namespace ult
} // namespace L0

View File

@@ -15,6 +15,7 @@
#include "shared/test/common/test_macros/test.h"
#include "level_zero/core/test/unit_tests/fixtures/device_fixture.h"
#include "level_zero/core/test/unit_tests/fixtures/kernel_max_cooperative_groups_count_fixture.h"
#include "level_zero/core/test/unit_tests/mocks/mock_kernel.h"
#include "level_zero/core/test/unit_tests/mocks/mock_module.h"
@@ -351,70 +352,19 @@ TEST(zeKernelGetProperties, WhenGettingKernelPropertiesThenSuccessIsReturned) {
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
}
class KernelImpSuggestMaxCooperativeGroupCountTests : public KernelImp {
public:
const uint32_t numGrf = 128;
const uint32_t simd = 8;
const uint32_t lws[3] = {1, 1, 1};
uint32_t usedSlm = 0;
uint32_t usesBarriers = 0;
uint32_t availableThreadCount;
uint32_t dssCount;
uint32_t availableSlm;
uint32_t maxBarrierCount;
WhiteBox<::L0::KernelImmutableData> kernelInfo;
NEO::KernelDescriptor kernelDescriptor;
void SetUp() override {
KernelImp::SetUp();
kernelInfo.kernelDescriptor = &kernelDescriptor;
auto &hardwareInfo = device->getHwInfo();
auto &helper = device->getNEODevice()->getRootDeviceEnvironment().getHelper<GfxCoreHelper>();
availableThreadCount = helper.calculateAvailableThreadCount(hardwareInfo, numGrf);
dssCount = hardwareInfo.gtSystemInfo.DualSubSliceCount;
if (dssCount == 0) {
dssCount = hardwareInfo.gtSystemInfo.SubSliceCount;
}
availableSlm = dssCount * KB * hardwareInfo.capabilityTable.slmSize;
maxBarrierCount = static_cast<uint32_t>(helper.getMaxBarrierRegisterPerSlice());
kernelInfo.kernelDescriptor->kernelAttributes.simdSize = simd;
kernelInfo.kernelDescriptor->kernelAttributes.numGrfRequired = numGrf;
}
uint32_t getMaxWorkGroupCount() {
kernelInfo.kernelDescriptor->kernelAttributes.slmInlineSize = usedSlm;
kernelInfo.kernelDescriptor->kernelAttributes.barrierCount = usesBarriers;
Mock<Kernel> kernel;
kernel.kernelImmData = &kernelInfo;
auto module = std::make_unique<ModuleImp>(device, nullptr, ModuleType::User);
kernel.module = module.get();
kernel.groupSize[0] = lws[0];
kernel.groupSize[1] = lws[1];
kernel.groupSize[2] = lws[2];
uint32_t totalGroupCount = 0;
kernel.KernelImp::suggestMaxCooperativeGroupCount(&totalGroupCount, NEO::EngineGroupType::CooperativeCompute, true);
return totalGroupCount;
}
};
TEST_F(KernelImpSuggestMaxCooperativeGroupCountTests, GivenNoBarriersOrSlmUsedWhenCalculatingMaxCooperativeGroupCountThenResultIsCalculatedWithSimd) {
HWTEST_F(KernelImpSuggestMaxCooperativeGroupCountTests, GivenNoBarriersOrSlmUsedWhenCalculatingMaxCooperativeGroupCountThenResultIsCalculatedWithSimd) {
auto workGroupSize = lws[0] * lws[1] * lws[2];
auto expected = availableThreadCount / Math::divideAndRoundUp(workGroupSize, simd);
EXPECT_EQ(expected, getMaxWorkGroupCount());
}
TEST_F(KernelImpSuggestMaxCooperativeGroupCountTests, GivenBarriersWhenCalculatingMaxCooperativeGroupCountThenResultIsCalculatedWithRegardToBarriersCount) {
HWTEST_F(KernelImpSuggestMaxCooperativeGroupCountTests, GivenBarriersWhenCalculatingMaxCooperativeGroupCountThenResultIsCalculatedWithRegardToBarriersCount) {
usesBarriers = 1;
auto expected = dssCount * (maxBarrierCount / usesBarriers);
EXPECT_EQ(expected, getMaxWorkGroupCount());
}
TEST_F(KernelImpSuggestMaxCooperativeGroupCountTests, GivenUsedSlmSizeWhenCalculatingMaxCooperativeGroupCountThenResultIsCalculatedWithRegardToUsedSlmSize) {
HWTEST_F(KernelImpSuggestMaxCooperativeGroupCountTests, GivenUsedSlmSizeWhenCalculatingMaxCooperativeGroupCountThenResultIsCalculatedWithRegardToUsedSlmSize) {
usedSlm = 64 * KB;
auto expected = availableSlm / usedSlm;
EXPECT_EQ(expected, getMaxWorkGroupCount());

View File

@@ -1,5 +1,5 @@
#
# Copyright (C) 2022 Intel Corporation
# Copyright (C) 2022-2023 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
@@ -9,6 +9,7 @@ if(TESTS_PVC)
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
${CMAKE_CURRENT_SOURCE_DIR}/test_device_pvc.cpp
${CMAKE_CURRENT_SOURCE_DIR}/test_cmdlist_pvc.cpp
${CMAKE_CURRENT_SOURCE_DIR}/test_kernel_pvc.cpp
${CMAKE_CURRENT_SOURCE_DIR}/test_sampler_pvc.cpp
)
endif()

View File

@@ -0,0 +1,42 @@
/*
* Copyright (C) 2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/helpers/basic_math.h"
#include "shared/source/xe_hpc_core/hw_cmds_pvc.h"
#include "shared/test/common/test_macros/header/per_product_test_definitions.h"
#include "shared/test/common/test_macros/hw_test_base.h"
#include "level_zero/core/test/unit_tests/fixtures/kernel_max_cooperative_groups_count_fixture.h"
namespace L0 {
namespace ult {
HWTEST_EXCLUDE_PRODUCT(KernelImpSuggestMaxCooperativeGroupCountTests, GivenUsedSlmSizeWhenCalculatingMaxCooperativeGroupCountThenResultIsCalculatedWithRegardToUsedSlmSize, IGFX_PVC);
HWTEST_EXCLUDE_PRODUCT(KernelImpSuggestMaxCooperativeGroupCountTests, GivenBarriersWhenCalculatingMaxCooperativeGroupCountThenResultIsCalculatedWithRegardToBarriersCount, IGFX_PVC);
HWTEST_EXCLUDE_PRODUCT(KernelImpSuggestMaxCooperativeGroupCountTests, GivenNoBarriersOrSlmUsedWhenCalculatingMaxCooperativeGroupCountThenResultIsCalculatedWithSimd, IGFX_PVC);
using KernelImpSuggestMaxCooperativeGroupCountTestsPvc = KernelImpSuggestMaxCooperativeGroupCountTests;
PVCTEST_F(KernelImpSuggestMaxCooperativeGroupCountTestsPvc, GivenNoBarriersOrSlmUsedWhenCalculatingMaxCooperativeGroupCountThenResultIsCalculatedWithSimd) {
auto workGroupSize = lws[0] * lws[1] * lws[2];
auto expected = (availableThreadCount / Math::divideAndRoundUp(workGroupSize, simd)) / PVC::numberOfpartsInTileForConcurrentKernels;
EXPECT_EQ(expected, getMaxWorkGroupCount());
}
PVCTEST_F(KernelImpSuggestMaxCooperativeGroupCountTestsPvc, GivenBarriersWhenCalculatingMaxCooperativeGroupCountThenResultIsCalculatedWithRegardToBarriersCount) {
usesBarriers = 1;
auto expected = (dssCount * (maxBarrierCount / usesBarriers)) / PVC::numberOfpartsInTileForConcurrentKernels;
EXPECT_EQ(expected, getMaxWorkGroupCount());
}
PVCTEST_F(KernelImpSuggestMaxCooperativeGroupCountTestsPvc, GivenUsedSlmSizeWhenCalculatingMaxCooperativeGroupCountThenResultIsCalculatedWithRegardToUsedSlmSize) {
usedSlm = 64 * KB;
auto expected = (availableSlm / usedSlm) / PVC::numberOfpartsInTileForConcurrentKernels;
EXPECT_EQ(expected, getMaxWorkGroupCount());
}
} // namespace ult
} // namespace L0