feature: improve reporting max cooperative group count
Related-To: NEO-8210
Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
parent
8d56f8fb6b
commit
b77e1a6a71
|
@ -8,6 +8,7 @@
|
|||
#include "level_zero/core/source/kernel/kernel_imp.h"
|
||||
|
||||
#include "shared/source/assert_handler/assert_handler.h"
|
||||
#include "shared/source/command_container/implicit_scaling.h"
|
||||
#include "shared/source/debugger/debugger_l0.h"
|
||||
#include "shared/source/execution_environment/root_device_environment.h"
|
||||
#include "shared/source/gmm_helper/gmm_helper.h"
|
||||
|
@ -482,8 +483,18 @@ ze_result_t KernelImp::suggestMaxCooperativeGroupCount(uint32_t *totalGroupCount
|
|||
const uint32_t workDim = 3;
|
||||
const size_t localWorkSize[] = {groupSize[0], groupSize[1], groupSize[2]};
|
||||
|
||||
uint32_t numSubDevicesForExecution = 1;
|
||||
|
||||
bool platformImplicitScaling = helper.platformSupportsImplicitScaling(rootDeviceEnvironment);
|
||||
auto deviceBitfield = module->getDevice()->getNEODevice()->getDeviceBitfield();
|
||||
|
||||
if (NEO::ImplicitScalingHelper::isImplicitScalingEnabled(deviceBitfield, platformImplicitScaling)) {
|
||||
numSubDevicesForExecution = static_cast<uint32_t>(deviceBitfield.count());
|
||||
}
|
||||
|
||||
*totalGroupCount = NEO::KernelHelper::getMaxWorkGroupCount(rootDeviceEnvironment,
|
||||
descriptor,
|
||||
numSubDevicesForExecution,
|
||||
usedSlmSize,
|
||||
workDim,
|
||||
localWorkSize,
|
||||
|
|
|
@ -447,6 +447,26 @@ HWTEST_F(KernelImpSuggestMaxCooperativeGroupCountTests, GivenNoBarriersOrSlmUsed
|
|||
EXPECT_EQ(expected, getMaxWorkGroupCount());
|
||||
}
|
||||
|
||||
// Checks how the max cooperative group count reacts to a two-bit device bitfield with
// the EnableImplicitScaling debug flag set: the count is expected to double, unless
// the GfxCoreHelper reports singleTileExecImplicitScalingRequired(true), in which case
// it must stay equal to the single-bit baseline.
HWTEST_F(KernelImpSuggestMaxCooperativeGroupCountTests, GivenMultiTileWhenCalculatingMaxCooperativeGroupCountThenResultIsCalculatedWithSimd) {
    DebugManagerStateRestore restore;

    // Baseline: only one bit set in the device bitfield.
    neoDevice->deviceBitfield = 0b1;
    const auto singleTileCount = getMaxWorkGroupCount();

    // Same query with two bits set and implicit scaling requested via the debug flag.
    debugManager.flags.EnableImplicitScaling.set(1);
    neoDevice->deviceBitfield = 0b11;
    const auto multiTileCount = getMaxWorkGroupCount();

    auto &gfxCoreHelper = neoDevice->getGfxCoreHelper();

    const auto expectedCount = gfxCoreHelper.singleTileExecImplicitScalingRequired(true)
                                   ? singleTileCount
                                   : singleTileCount * 2;
    EXPECT_EQ(expectedCount, multiTileCount);
}
|
||||
|
||||
HWTEST_F(KernelImpSuggestMaxCooperativeGroupCountTests, GivenBarriersWhenCalculatingMaxCooperativeGroupCountThenResultIsCalculatedWithRegardToBarriersCount) {
|
||||
usesBarriers = 1;
|
||||
auto expected = dssCount * (maxBarrierCount / usesBarriers);
|
||||
|
|
|
@ -8,6 +8,7 @@
|
|||
#include "opencl/source/kernel/kernel.h"
|
||||
|
||||
#include "shared/source/built_ins/built_ins.h"
|
||||
#include "shared/source/command_container/implicit_scaling.h"
|
||||
#include "shared/source/command_stream/command_stream_receiver.h"
|
||||
#include "shared/source/debug_settings/debug_settings_manager.h"
|
||||
#include "shared/source/execution_environment/execution_environment.h"
|
||||
|
@ -1132,8 +1133,18 @@ uint32_t Kernel::getMaxWorkGroupCount(const cl_uint workDim, const size_t *local
|
|||
|
||||
auto usedSlmSize = helper.alignSlmSize(slmTotalSize);
|
||||
|
||||
uint32_t numSubDevicesForExecution = 1;
|
||||
|
||||
bool platformImplicitScaling = helper.platformSupportsImplicitScaling(rootDeviceEnvironment);
|
||||
auto deviceBitfield = commandQueue->getClDevice().getDeviceBitfield();
|
||||
|
||||
if (NEO::ImplicitScalingHelper::isImplicitScalingEnabled(deviceBitfield, platformImplicitScaling)) {
|
||||
numSubDevicesForExecution = static_cast<uint32_t>(deviceBitfield.count());
|
||||
}
|
||||
|
||||
auto maxWorkGroupCount = KernelHelper::getMaxWorkGroupCount(rootDeviceEnvironment,
|
||||
kernelInfo.kernelDescriptor,
|
||||
numSubDevicesForExecution,
|
||||
usedSlmSize,
|
||||
workDim,
|
||||
localWorkSize,
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (C) 2020-2023 Intel Corporation
|
||||
* Copyright (C) 2020-2024 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
|
@ -83,4 +83,31 @@ TEST_F(clGetKernelMaxConcurrentWorkGroupCountTests, GivenVariousInputWhenGetting
|
|||
EXPECT_EQ(expectedMaxConcurrentWorkGroupCount, maxConcurrentWorkGroupCount);
|
||||
}
|
||||
|
||||
// Queries Kernel::getMaxWorkGroupCount twice - once with a single bit set in the mock
// device bitfield and once with two bits set plus the EnableImplicitScaling debug flag -
// and expects the second result to be twice the first, unless the GfxCoreHelper reports
// singleTileExecImplicitScalingRequired(true), in which case both must match.
TEST_F(clGetKernelMaxConcurrentWorkGroupCountTests, GivenMultiTileWhenGettingMaxConcurrentWorkGroupCountThenCorrectValuesAreReturned) {
    DebugManagerStateRestore restore;
    auto &mockDevice = static_cast<MockDevice &>(pDevice->getDevice());

    cl_uint dimensions = 3;
    size_t lws[] = {8, 8, 8};

    // The kernel info is exposed as const; the test overrides the GRF count in place.
    auto &mutableKernelInfo = const_cast<KernelInfo &>(pKernel->getKernelInfo());
    mutableKernelInfo.kernelDescriptor.kernelAttributes.numGrfRequired = GrfConfig::defaultGrfNumber;

    auto queryMaxWorkGroupCount = [&]() {
        return pKernel->getMaxWorkGroupCount(dimensions, lws, pCommandQueue);
    };

    // Baseline: only one bit set in the device bitfield.
    mockDevice.deviceBitfield = 0b1;
    const auto singleTileCount = queryMaxWorkGroupCount();

    // Same query with two bits set and implicit scaling requested via the debug flag.
    debugManager.flags.EnableImplicitScaling.set(1);
    mockDevice.deviceBitfield = 0b11;
    const auto multiTileCount = queryMaxWorkGroupCount();

    auto &gfxCoreHelper = pDevice->getGfxCoreHelper();

    const auto expectedCount = gfxCoreHelper.singleTileExecImplicitScalingRequired(true)
                                   ? singleTileCount
                                   : singleTileCount * 2;
    EXPECT_EQ(expectedCount, multiTileCount);
}
|
||||
|
||||
} // namespace ULT
|
||||
|
|
|
@ -20,7 +20,7 @@
|
|||
|
||||
namespace NEO {
|
||||
|
||||
uint32_t KernelHelper::getMaxWorkGroupCount(const RootDeviceEnvironment &rootDeviceEnvironment, const KernelDescriptor &kernelDescriptor,
|
||||
uint32_t KernelHelper::getMaxWorkGroupCount(const RootDeviceEnvironment &rootDeviceEnvironment, const KernelDescriptor &kernelDescriptor, uint32_t numSubDevices,
|
||||
uint32_t usedSlmSize, uint32_t workDim, const size_t *localWorkSize, EngineGroupType engineGroupType, bool isEngineInstanced) {
|
||||
if (debugManager.flags.OverrideMaxWorkGroupCount.get() != -1) {
|
||||
return static_cast<uint32_t>(debugManager.flags.OverrideMaxWorkGroupCount.get());
|
||||
|
@ -59,7 +59,13 @@ uint32_t KernelHelper::getMaxWorkGroupCount(const RootDeviceEnvironment &rootDev
|
|||
maxWorkGroupsCount = std::min(maxWorkGroupsCount, maxWorkGroupsCountDueToSlm);
|
||||
}
|
||||
|
||||
return helper.adjustMaxWorkGroupCount(maxWorkGroupsCount, engineGroupType, rootDeviceEnvironment, isEngineInstanced);
|
||||
maxWorkGroupsCount = helper.adjustMaxWorkGroupCount(maxWorkGroupsCount, engineGroupType, rootDeviceEnvironment, isEngineInstanced);
|
||||
|
||||
if (!helper.singleTileExecImplicitScalingRequired(true)) {
|
||||
maxWorkGroupsCount *= numSubDevices;
|
||||
}
|
||||
|
||||
return maxWorkGroupsCount;
|
||||
}
|
||||
|
||||
KernelHelper::ErrorCode KernelHelper::checkIfThereIsSpaceForScratchOrPrivate(KernelDescriptor::KernelAttributes attributes, Device *device) {
|
||||
|
|
|
@ -24,7 +24,7 @@ struct KernelHelper {
|
|||
invalidKernel = 2
|
||||
};
|
||||
static uint32_t getMaxWorkGroupCount(const RootDeviceEnvironment &rootDeviceEnvironment, const KernelDescriptor &kernelDescriptor,
|
||||
uint32_t usedSlmSize, uint32_t workDim, const size_t *localWorkSize, EngineGroupType engineGroupType, bool isEngineInstanced);
|
||||
uint32_t numSubDevices, uint32_t usedSlmSize, uint32_t workDim, const size_t *localWorkSize, EngineGroupType engineGroupType, bool isEngineInstanced);
|
||||
// Returns the product of the per-hardware-thread private memory size and the number of
// compute units used for scratch, computed in 64-bit arithmetic.
static inline uint64_t getPrivateSurfaceSize(uint64_t perHwThreadPrivateMemorySize, uint32_t computeUnitsUsedForScratch) {
    const uint64_t totalPrivateSurfaceSize = perHwThreadPrivateMemorySize * computeUnitsUsedForScratch;
    return totalPrivateSurfaceSize;
}
|
||||
|
|
|
@ -28,6 +28,7 @@ struct KernelHelperMaxWorkGroupsTests : ::testing::Test {
|
|||
uint32_t numberOfBarriers = 0;
|
||||
uint32_t workDim = 3;
|
||||
uint32_t grf = 128;
|
||||
uint32_t numSubdevices = 1;
|
||||
size_t lws[3] = {10, 10, 10};
|
||||
|
||||
void SetUp() override {
|
||||
|
@ -45,7 +46,7 @@ struct KernelHelperMaxWorkGroupsTests : ::testing::Test {
|
|||
hwInfo->gtSystemInfo.DualSubSliceCount = dssCount;
|
||||
hwInfo->capabilityTable.slmSize = (availableSlm / MemoryConstants::kiloByte) / dssCount;
|
||||
|
||||
return KernelHelper::getMaxWorkGroupCount(*rootDeviceEnvironment, descriptor, usedSlm, workDim, lws, engineType, false);
|
||||
return KernelHelper::getMaxWorkGroupCount(*rootDeviceEnvironment, descriptor, numSubdevices, usedSlm, workDim, lws, engineType, false);
|
||||
}
|
||||
|
||||
std::unique_ptr<MockExecutionEnvironment> executionEnvironment;
|
||||
|
@ -69,6 +70,22 @@ TEST_F(KernelHelperMaxWorkGroupsTests, GivenDebugFlagSetWhenGetMaxWorkGroupCount
|
|||
EXPECT_EQ(123u, getMaxWorkGroupCount());
|
||||
}
|
||||
|
||||
// Compares the max work group count obtained with the fixture's current numSubdevices
// against the count obtained after setting numSubdevices to 4. The second value is
// expected to be the first multiplied by numSubdevices, unless the GfxCoreHelper reports
// singleTileExecImplicitScalingRequired(true), in which case both must be equal.
// NOTE(review): the test name contains typos ("Wen"/"Ten" for "When"/"Then"); it is kept
// unchanged here so existing test filters keep matching.
TEST_F(KernelHelperMaxWorkGroupsTests, givenMultipleSubdevicesWenCalculatingMaxWorkGroupsCountTenMultiply) {
    auto &gfxCoreHelper = rootDeviceEnvironment->getHelper<NEO::GfxCoreHelper>();

    const auto referenceCount = getMaxWorkGroupCount();

    numSubdevices = 4;
    const auto scaledCount = getMaxWorkGroupCount();

    const auto expectedCount = gfxCoreHelper.singleTileExecImplicitScalingRequired(true)
                                   ? referenceCount
                                   : referenceCount * numSubdevices;
    EXPECT_EQ(expectedCount, scaledCount);
}
|
||||
|
||||
TEST_F(KernelHelperMaxWorkGroupsTests, GivenBarriersWhenCalculatingMaxWorkGroupsCountThenResultIsCalculatedWithRegardToBarriersCount) {
|
||||
numberOfBarriers = 0;
|
||||
auto baseCount = getMaxWorkGroupCount();
|
||||
|
|
Loading…
Reference in New Issue