feature: improve reporting max cooperative group count

Related-To: NEO-8210

Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
Dunajski, Bartosz 2024-01-22 11:57:53 +00:00 committed by Compute-Runtime-Automation
parent 8d56f8fb6b
commit b77e1a6a71
7 changed files with 97 additions and 5 deletions

View File

@ -8,6 +8,7 @@
#include "level_zero/core/source/kernel/kernel_imp.h"
#include "shared/source/assert_handler/assert_handler.h"
#include "shared/source/command_container/implicit_scaling.h"
#include "shared/source/debugger/debugger_l0.h"
#include "shared/source/execution_environment/root_device_environment.h"
#include "shared/source/gmm_helper/gmm_helper.h"
@ -482,8 +483,18 @@ ze_result_t KernelImp::suggestMaxCooperativeGroupCount(uint32_t *totalGroupCount
const uint32_t workDim = 3;
const size_t localWorkSize[] = {groupSize[0], groupSize[1], groupSize[2]};
uint32_t numSubDevicesForExecution = 1;
bool platformImplicitScaling = helper.platformSupportsImplicitScaling(rootDeviceEnvironment);
auto deviceBitfield = module->getDevice()->getNEODevice()->getDeviceBitfield();
if (NEO::ImplicitScalingHelper::isImplicitScalingEnabled(deviceBitfield, platformImplicitScaling)) {
numSubDevicesForExecution = static_cast<uint32_t>(deviceBitfield.count());
}
*totalGroupCount = NEO::KernelHelper::getMaxWorkGroupCount(rootDeviceEnvironment,
descriptor,
numSubDevicesForExecution,
usedSlmSize,
workDim,
localWorkSize,

View File

@ -447,6 +447,26 @@ HWTEST_F(KernelImpSuggestMaxCooperativeGroupCountTests, GivenNoBarriersOrSlmUsed
EXPECT_EQ(expected, getMaxWorkGroupCount());
}
HWTEST_F(KernelImpSuggestMaxCooperativeGroupCountTests, GivenMultiTileWhenCalculatingMaxCooperativeGroupCountThenResultIsCalculatedWithSimd) {
    // Presumably restores the debug flags modified below on scope exit - confirm against DebugManagerStateRestore.
    DebugManagerStateRestore restore;

    // Reference value: query with a single bit set in the device bitfield.
    neoDevice->deviceBitfield = 0b1;
    const auto singleTileCount = getMaxWorkGroupCount();

    // Repeat the query with implicit scaling forced on and two bits set in the bitfield.
    debugManager.flags.EnableImplicitScaling.set(1);
    neoDevice->deviceBitfield = 0b11;
    const auto dualTileCount = getMaxWorkGroupCount();

    // The count is expected to double unless the core helper reports that
    // single-tile execution is required for this (cooperative) case.
    auto &gfxCoreHelper = neoDevice->getGfxCoreHelper();
    const bool scaledBySubDevices = !gfxCoreHelper.singleTileExecImplicitScalingRequired(true);
    if (scaledBySubDevices) {
        EXPECT_EQ(singleTileCount * 2, dualTileCount);
    } else {
        EXPECT_EQ(singleTileCount, dualTileCount);
    }
}
HWTEST_F(KernelImpSuggestMaxCooperativeGroupCountTests, GivenBarriersWhenCalculatingMaxCooperativeGroupCountThenResultIsCalculatedWithRegardToBarriersCount) {
usesBarriers = 1;
auto expected = dssCount * (maxBarrierCount / usesBarriers);

View File

@ -8,6 +8,7 @@
#include "opencl/source/kernel/kernel.h"
#include "shared/source/built_ins/built_ins.h"
#include "shared/source/command_container/implicit_scaling.h"
#include "shared/source/command_stream/command_stream_receiver.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/execution_environment/execution_environment.h"
@ -1132,8 +1133,18 @@ uint32_t Kernel::getMaxWorkGroupCount(const cl_uint workDim, const size_t *local
auto usedSlmSize = helper.alignSlmSize(slmTotalSize);
uint32_t numSubDevicesForExecution = 1;
bool platformImplicitScaling = helper.platformSupportsImplicitScaling(rootDeviceEnvironment);
auto deviceBitfield = commandQueue->getClDevice().getDeviceBitfield();
if (NEO::ImplicitScalingHelper::isImplicitScalingEnabled(deviceBitfield, platformImplicitScaling)) {
numSubDevicesForExecution = static_cast<uint32_t>(deviceBitfield.count());
}
auto maxWorkGroupCount = KernelHelper::getMaxWorkGroupCount(rootDeviceEnvironment,
kernelInfo.kernelDescriptor,
numSubDevicesForExecution,
usedSlmSize,
workDim,
localWorkSize,

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2020-2023 Intel Corporation
* Copyright (C) 2020-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@ -83,4 +83,31 @@ TEST_F(clGetKernelMaxConcurrentWorkGroupCountTests, GivenVariousInputWhenGetting
EXPECT_EQ(expectedMaxConcurrentWorkGroupCount, maxConcurrentWorkGroupCount);
}
TEST_F(clGetKernelMaxConcurrentWorkGroupCountTests, GivenMultiTileWhenGettingMaxConcurrentWorkGroupCountThenCorrectValuesAreReturned) {
    // Presumably restores the debug flags modified below on scope exit - confirm against DebugManagerStateRestore.
    DebugManagerStateRestore restore;

    const cl_uint workDim = 3;
    const size_t localWorkSize[] = {8, 8, 8};

    // Pin the GRF requirement to the default value so both queries use the same kernel setup.
    auto &kernelDescriptor = const_cast<KernelInfo &>(pKernel->getKernelInfo()).kernelDescriptor;
    kernelDescriptor.kernelAttributes.numGrfRequired = GrfConfig::defaultGrfNumber;

    auto &mockDevice = static_cast<MockDevice &>(pDevice->getDevice());

    // Reference value: query with a single bit set in the device bitfield.
    mockDevice.deviceBitfield = 0b1;
    const auto singleTileCount = pKernel->getMaxWorkGroupCount(workDim, localWorkSize, pCommandQueue);

    // Repeat the query with implicit scaling forced on and two bits set in the bitfield.
    debugManager.flags.EnableImplicitScaling.set(1);
    mockDevice.deviceBitfield = 0b11;
    const auto dualTileCount = pKernel->getMaxWorkGroupCount(workDim, localWorkSize, pCommandQueue);

    // The count is expected to double unless the core helper reports that
    // single-tile execution is required for this (cooperative) case.
    auto &gfxCoreHelper = pDevice->getGfxCoreHelper();
    const bool scaledBySubDevices = !gfxCoreHelper.singleTileExecImplicitScalingRequired(true);
    if (scaledBySubDevices) {
        EXPECT_EQ(singleTileCount * 2, dualTileCount);
    } else {
        EXPECT_EQ(singleTileCount, dualTileCount);
    }
}
} // namespace ULT

View File

@ -20,7 +20,7 @@
namespace NEO {
uint32_t KernelHelper::getMaxWorkGroupCount(const RootDeviceEnvironment &rootDeviceEnvironment, const KernelDescriptor &kernelDescriptor,
uint32_t KernelHelper::getMaxWorkGroupCount(const RootDeviceEnvironment &rootDeviceEnvironment, const KernelDescriptor &kernelDescriptor, uint32_t numSubDevices,
uint32_t usedSlmSize, uint32_t workDim, const size_t *localWorkSize, EngineGroupType engineGroupType, bool isEngineInstanced) {
if (debugManager.flags.OverrideMaxWorkGroupCount.get() != -1) {
return static_cast<uint32_t>(debugManager.flags.OverrideMaxWorkGroupCount.get());
@ -59,7 +59,13 @@ uint32_t KernelHelper::getMaxWorkGroupCount(const RootDeviceEnvironment &rootDev
maxWorkGroupsCount = std::min(maxWorkGroupsCount, maxWorkGroupsCountDueToSlm);
}
return helper.adjustMaxWorkGroupCount(maxWorkGroupsCount, engineGroupType, rootDeviceEnvironment, isEngineInstanced);
maxWorkGroupsCount = helper.adjustMaxWorkGroupCount(maxWorkGroupsCount, engineGroupType, rootDeviceEnvironment, isEngineInstanced);
if (!helper.singleTileExecImplicitScalingRequired(true)) {
maxWorkGroupsCount *= numSubDevices;
}
return maxWorkGroupsCount;
}
KernelHelper::ErrorCode KernelHelper::checkIfThereIsSpaceForScratchOrPrivate(KernelDescriptor::KernelAttributes attributes, Device *device) {

View File

@ -24,7 +24,7 @@ struct KernelHelper {
invalidKernel = 2
};
static uint32_t getMaxWorkGroupCount(const RootDeviceEnvironment &rootDeviceEnvironment, const KernelDescriptor &kernelDescriptor,
uint32_t usedSlmSize, uint32_t workDim, const size_t *localWorkSize, EngineGroupType engineGroupType, bool isEngineInstanced);
uint32_t numSubDevices, uint32_t usedSlmSize, uint32_t workDim, const size_t *localWorkSize, EngineGroupType engineGroupType, bool isEngineInstanced);
static inline uint64_t getPrivateSurfaceSize(uint64_t perHwThreadPrivateMemorySize, uint32_t computeUnitsUsedForScratch) {
return perHwThreadPrivateMemorySize * computeUnitsUsedForScratch;
}

View File

@ -28,6 +28,7 @@ struct KernelHelperMaxWorkGroupsTests : ::testing::Test {
uint32_t numberOfBarriers = 0;
uint32_t workDim = 3;
uint32_t grf = 128;
uint32_t numSubdevices = 1;
size_t lws[3] = {10, 10, 10};
void SetUp() override {
@ -45,7 +46,7 @@ struct KernelHelperMaxWorkGroupsTests : ::testing::Test {
hwInfo->gtSystemInfo.DualSubSliceCount = dssCount;
hwInfo->capabilityTable.slmSize = (availableSlm / MemoryConstants::kiloByte) / dssCount;
return KernelHelper::getMaxWorkGroupCount(*rootDeviceEnvironment, descriptor, usedSlm, workDim, lws, engineType, false);
return KernelHelper::getMaxWorkGroupCount(*rootDeviceEnvironment, descriptor, numSubdevices, usedSlm, workDim, lws, engineType, false);
}
std::unique_ptr<MockExecutionEnvironment> executionEnvironment;
@ -69,6 +70,22 @@ TEST_F(KernelHelperMaxWorkGroupsTests, GivenDebugFlagSetWhenGetMaxWorkGroupCount
EXPECT_EQ(123u, getMaxWorkGroupCount());
}
// NOTE(review): the test name contains typos ("Wen" -> "When", "Ten" -> "Then");
// left unchanged here because the name is the test's public identifier.
TEST_F(KernelHelperMaxWorkGroupsTests, givenMultipleSubdevicesWenCalculatingMaxWorkGroupsCountTenMultiply) {
    auto &gfxCoreHelper = rootDeviceEnvironment->getHelper<NEO::GfxCoreHelper>();

    // Reference value computed with the fixture's current numSubdevices.
    const auto referenceCount = getMaxWorkGroupCount();

    // Same query with four sub-devices passed to KernelHelper::getMaxWorkGroupCount.
    numSubdevices = 4;
    const auto multiSubdeviceCount = getMaxWorkGroupCount();

    // The result is multiplied by the sub-device count unless the core helper
    // reports that single-tile execution is required.
    const bool scaledBySubdevices = !gfxCoreHelper.singleTileExecImplicitScalingRequired(true);
    if (scaledBySubdevices) {
        EXPECT_EQ(referenceCount * numSubdevices, multiSubdeviceCount);
    } else {
        EXPECT_EQ(referenceCount, multiSubdeviceCount);
    }
}
TEST_F(KernelHelperMaxWorkGroupsTests, GivenBarriersWhenCalculatingMaxWorkGroupsCountThenResultIsCalculatedWithRegardToBarriersCount) {
numberOfBarriers = 0;
auto baseCount = getMaxWorkGroupCount();