refactor: unify getMaxWorkGroupCount logic

Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
Dunajski, Bartosz
2024-01-19 16:00:58 +00:00
committed by Compute-Runtime-Automation
parent 1002cb9f34
commit 6f4ed10919
6 changed files with 93 additions and 69 deletions

View File

@@ -474,35 +474,22 @@ ze_result_t KernelImp::suggestMaxCooperativeGroupCount(uint32_t *totalGroupCount
UNRECOVERABLE_IF(0 == groupSize[1]);
UNRECOVERABLE_IF(0 == groupSize[2]);
auto &hardwareInfo = module->getDevice()->getHwInfo();
auto dssCount = hardwareInfo.gtSystemInfo.DualSubSliceCount;
if (dssCount == 0) {
dssCount = hardwareInfo.gtSystemInfo.SubSliceCount;
}
auto &rootDeviceEnvironment = module->getDevice()->getNEODevice()->getRootDeviceEnvironment();
auto &helper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
auto &descriptor = kernelImmData->getDescriptor();
auto availableThreadCount = helper.calculateAvailableThreadCount(hardwareInfo, descriptor.kernelAttributes.numGrfRequired);
auto availableSlmSize = static_cast<uint32_t>(dssCount * MemoryConstants::kiloByte * hardwareInfo.capabilityTable.slmSize);
auto usedSlmSize = helper.alignSlmSize(slmArgsTotalSize + descriptor.kernelAttributes.slmInlineSize);
auto maxBarrierCount = static_cast<uint32_t>(helper.getMaxBarrierRegisterPerSlice());
auto barrierCount = descriptor.kernelAttributes.barrierCount;
const uint32_t workDim = 3;
const size_t localWorkSize[] = {groupSize[0], groupSize[1], groupSize[2]};
*totalGroupCount = NEO::KernelHelper::getMaxWorkGroupCount(descriptor.kernelAttributes.simdSize,
availableThreadCount,
dssCount,
availableSlmSize,
*totalGroupCount = NEO::KernelHelper::getMaxWorkGroupCount(rootDeviceEnvironment,
descriptor,
usedSlmSize,
maxBarrierCount,
barrierCount,
workDim,
localWorkSize);
*totalGroupCount = helper.adjustMaxWorkGroupCount(*totalGroupCount, engineGroupType, rootDeviceEnvironment, isEngineInstanced);
localWorkSize,
engineGroupType,
isEngineInstanced);
return ZE_RESULT_SUCCESS;
}

View File

@@ -1128,29 +1128,18 @@ uint32_t Kernel::getMaxWorkGroupCount(const cl_uint workDim, const size_t *local
auto engineGroupType = helper.getEngineGroupType(commandQueue->getGpgpuEngine().getEngineType(),
commandQueue->getGpgpuEngine().getEngineUsage(), hardwareInfo);
const auto &kernelDescriptor = kernelInfo.kernelDescriptor;
auto dssCount = hardwareInfo.gtSystemInfo.DualSubSliceCount;
if (dssCount == 0) {
dssCount = hardwareInfo.gtSystemInfo.SubSliceCount;
}
auto availableThreadCount = helper.calculateAvailableThreadCount(hardwareInfo, kernelDescriptor.kernelAttributes.numGrfRequired);
auto availableSlmSize = static_cast<uint32_t>(dssCount * MemoryConstants::kiloByte * hardwareInfo.capabilityTable.slmSize);
auto usedSlmSize = helper.alignSlmSize(slmTotalSize);
auto maxBarrierCount = static_cast<uint32_t>(helper.getMaxBarrierRegisterPerSlice());
auto barrierCount = kernelDescriptor.kernelAttributes.barrierCount;
auto maxWorkGroupCount = KernelHelper::getMaxWorkGroupCount(kernelInfo.getMaxSimdSize(),
availableThreadCount,
dssCount,
availableSlmSize,
usedSlmSize,
maxBarrierCount,
barrierCount,
workDim,
localWorkSize);
auto isEngineInstanced = commandQueue->getGpgpuCommandStreamReceiver().getOsContext().isEngineInstanced();
maxWorkGroupCount = helper.adjustMaxWorkGroupCount(maxWorkGroupCount, engineGroupType, rootDeviceEnvironment, isEngineInstanced);
auto usedSlmSize = helper.alignSlmSize(slmTotalSize);
auto maxWorkGroupCount = KernelHelper::getMaxWorkGroupCount(rootDeviceEnvironment,
kernelInfo.kernelDescriptor,
usedSlmSize,
workDim,
localWorkSize,
engineGroupType,
isEngineInstanced);
return maxWorkGroupCount;
}

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2021-2023 Intel Corporation
* Copyright (C) 2021-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -70,7 +70,7 @@ uint32_t GfxCoreHelperHw<Family>::adjustMaxWorkGroupCount(uint32_t maxWorkGroupC
UNRECOVERABLE_IF(ccsCount == 0);
numberOfpartsInTileForConcurrentKernels = std::max(numberOfpartsInTileForConcurrentKernels, ccsCount);
}
return maxWorkGroupCount / numberOfpartsInTileForConcurrentKernels;
return std::max(maxWorkGroupCount / numberOfpartsInTileForConcurrentKernels, 1u);
}
template <typename Family>

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2019-2023 Intel Corporation
* Copyright (C) 2019-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -11,20 +11,33 @@
#include "shared/source/device/device.h"
#include "shared/source/execution_environment/root_device_environment.h"
#include "shared/source/helpers/basic_math.h"
#include "shared/source/helpers/constants.h"
#include "shared/source/helpers/debug_helpers.h"
#include "shared/source/helpers/gfx_core_helper.h"
#include "shared/source/helpers/hw_info.h"
#include <algorithm>
namespace NEO {
uint32_t KernelHelper::getMaxWorkGroupCount(uint32_t simd, uint32_t availableThreadCount, uint32_t dssCount, uint32_t availableSlmSize,
uint32_t usedSlmSize, uint32_t maxBarrierCount, uint32_t numberOfBarriers, uint32_t workDim,
const size_t *localWorkSize) {
uint32_t KernelHelper::getMaxWorkGroupCount(const RootDeviceEnvironment &rootDeviceEnvironment, const KernelDescriptor &kernelDescriptor,
uint32_t usedSlmSize, uint32_t workDim, const size_t *localWorkSize, EngineGroupType engineGroupType, bool isEngineInstanced) {
if (debugManager.flags.OverrideMaxWorkGroupCount.get() != -1) {
return static_cast<uint32_t>(debugManager.flags.OverrideMaxWorkGroupCount.get());
}
auto &helper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo();
auto dssCount = hwInfo.gtSystemInfo.DualSubSliceCount;
if (dssCount == 0) {
dssCount = hwInfo.gtSystemInfo.SubSliceCount;
}
auto availableThreadCount = helper.calculateAvailableThreadCount(hwInfo, kernelDescriptor.kernelAttributes.numGrfRequired);
auto availableSlmSize = static_cast<uint32_t>(dssCount * MemoryConstants::kiloByte * hwInfo.capabilityTable.slmSize);
auto maxBarrierCount = static_cast<uint32_t>(helper.getMaxBarrierRegisterPerSlice());
UNRECOVERABLE_IF((workDim == 0) || (workDim > 3));
UNRECOVERABLE_IF(localWorkSize == nullptr);
@@ -33,11 +46,11 @@ uint32_t KernelHelper::getMaxWorkGroupCount(uint32_t simd, uint32_t availableThr
workGroupSize *= localWorkSize[i];
}
auto numThreadsPerThreadGroup = static_cast<uint32_t>(Math::divideAndRoundUp(workGroupSize, simd));
auto numThreadsPerThreadGroup = static_cast<uint32_t>(Math::divideAndRoundUp(workGroupSize, kernelDescriptor.kernelAttributes.simdSize));
auto maxWorkGroupsCount = availableThreadCount / numThreadsPerThreadGroup;
if (numberOfBarriers > 0) {
auto maxWorkGroupsCountDueToBarrierUsage = dssCount * (maxBarrierCount / numberOfBarriers);
if (kernelDescriptor.kernelAttributes.barrierCount > 0) {
auto maxWorkGroupsCountDueToBarrierUsage = dssCount * (maxBarrierCount / kernelDescriptor.kernelAttributes.barrierCount);
maxWorkGroupsCount = std::min(maxWorkGroupsCount, maxWorkGroupsCountDueToBarrierUsage);
}
@@ -46,7 +59,7 @@ uint32_t KernelHelper::getMaxWorkGroupCount(uint32_t simd, uint32_t availableThr
maxWorkGroupsCount = std::min(maxWorkGroupsCount, maxWorkGroupsCountDueToSlm);
}
return maxWorkGroupsCount;
return helper.adjustMaxWorkGroupCount(maxWorkGroupsCount, engineGroupType, rootDeviceEnvironment, isEngineInstanced);
}
KernelHelper::ErrorCode KernelHelper::checkIfThereIsSpaceForScratchOrPrivate(KernelDescriptor::KernelAttributes attributes, Device *device) {

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2019-2023 Intel Corporation
* Copyright (C) 2019-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -7,6 +7,7 @@
#pragma once
#include "shared/source/helpers/definitions/engine_group_types.h"
#include "shared/source/kernel/kernel_descriptor.h"
#include <cstddef>
@@ -14,6 +15,7 @@
namespace NEO {
class Device;
struct RootDeviceEnvironment;
struct KernelHelper {
enum class ErrorCode {
@@ -21,9 +23,8 @@ struct KernelHelper {
outOfDeviceMemory = 1,
invalidKernel = 2
};
static uint32_t getMaxWorkGroupCount(uint32_t simd, uint32_t availableThreadCount, uint32_t dssCount, uint32_t availableSlmSize,
uint32_t usedSlmSize, uint32_t maxBarrierCount, uint32_t numberOfBarriers, uint32_t workDim,
const size_t *localWorkSize);
static uint32_t getMaxWorkGroupCount(const RootDeviceEnvironment &rootDeviceEnvironment, const KernelDescriptor &kernelDescriptor,
uint32_t usedSlmSize, uint32_t workDim, const size_t *localWorkSize, EngineGroupType engineGroupType, bool isEngineInstanced);
static inline uint64_t getPrivateSurfaceSize(uint64_t perHwThreadPrivateMemorySize, uint32_t computeUnitsUsedForScratch) {
return perHwThreadPrivateMemorySize * computeUnitsUsedForScratch;
}

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2019-2023 Intel Corporation
* Copyright (C) 2019-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -12,30 +12,53 @@
#include "shared/test/common/fixtures/device_fixture.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/mocks/mock_device.h"
#include "shared/test/common/mocks/mock_execution_environment.h"
#include "shared/test/common/test_macros/test.h"
#include <algorithm>
using namespace NEO;
// Test fixture for KernelHelper::getMaxWorkGroupCount. The members below are the
// tunable inputs; individual tests modify them and then call getMaxWorkGroupCount().
// NOTE(review): this listing shows two getMaxWorkGroupCount() bodies (one passing
// scalar arguments, one passing a root device environment and kernel descriptor);
// it appears to interleave both sides of a change — confirm against the real file.
struct KernelHelperMaxWorkGroupsTests : ::testing::Test {
EngineGroupType engineType = EngineGroupType::compute;
uint32_t simd = 8;
uint32_t threadCount = 8 * 1024;
uint32_t dssCount = 16;
uint32_t availableSlm = 64 * MemoryConstants::kiloByte;
uint32_t usedSlm = 0;
uint32_t maxBarrierCount = 32;
uint32_t numberOfBarriers = 0;
uint32_t workDim = 3;
uint32_t grf = 128;
size_t lws[3] = {10, 10, 10};
uint32_t getMaxWorkGroupCount() {
return KernelHelper::getMaxWorkGroupCount(simd, threadCount, dssCount, availableSlm, usedSlm,
maxBarrierCount, numberOfBarriers, workDim, lws);
// Creates a mock execution environment from defaultHwInfo and caches a pointer to
// its first root device environment for use by the tests.
void SetUp() override {
executionEnvironment = std::make_unique<MockExecutionEnvironment>(defaultHwInfo.get(), false, 1u);
rootDeviceEnvironment = executionEnvironment->rootDeviceEnvironments[0].get();
}
// Builds a KernelDescriptor from the fixture fields (simd, numberOfBarriers, grf),
// writes dssCount and an SLM size into the mutable hardware info, and returns the
// helper's result for the current usedSlm / workDim / lws / engineType, always with
// isEngineInstanced == false.
uint32_t getMaxWorkGroupCount() {
KernelDescriptor descriptor = {};
descriptor.kernelAttributes.simdSize = simd;
descriptor.kernelAttributes.barrierCount = numberOfBarriers;
descriptor.kernelAttributes.numGrfRequired = grf;
auto hwInfo = rootDeviceEnvironment->getMutableHardwareInfo();
hwInfo->gtSystemInfo.DualSubSliceCount = dssCount;
// availableSlm is a total in bytes; store it as a per-DSS value in kilobyte units.
// Presumably the helper scales slmSize back up by dssCount * kiloByte — TODO confirm.
hwInfo->capabilityTable.slmSize = (availableSlm / MemoryConstants::kiloByte) / dssCount;
return KernelHelper::getMaxWorkGroupCount(*rootDeviceEnvironment, descriptor, usedSlm, workDim, lws, engineType, false);
}
std::unique_ptr<MockExecutionEnvironment> executionEnvironment;
// Non-owning; points into executionEnvironment and is set in SetUp().
RootDeviceEnvironment *rootDeviceEnvironment = nullptr;
};
// With the fixture defaults (no barriers, no SLM used) the expected count is the
// available thread count divided by the threads needed per work group
// (work-group size / simd, rounded up), passed through adjustMaxWorkGroupCount
// for the compute engine group.
// NOTE(review): workGroupSize and expected are each declared twice below; this
// listing appears to interleave both sides of a change — confirm against the real file.
TEST_F(KernelHelperMaxWorkGroupsTests, GivenNoBarriersOrSlmUsedWhenCalculatingMaxWorkGroupsCountThenResultIsCalculatedWithSimd) {
auto workGroupSize = lws[0] * lws[1] * lws[2];
auto expected = threadCount / Math::divideAndRoundUp(workGroupSize, simd);
auto &helper = rootDeviceEnvironment->getHelper<NEO::GfxCoreHelper>();
uint32_t workGroupSize = static_cast<uint32_t>(lws[0] * lws[1] * lws[2]);
// Thread count comes from the helper for the current hardware info and GRF count.
uint32_t expected = helper.calculateAvailableThreadCount(*rootDeviceEnvironment->getHardwareInfo(), grf) / static_cast<uint32_t>(Math::divideAndRoundUp(workGroupSize, simd));
// Apply the same engine-dependent adjustment to the expectation.
expected = helper.adjustMaxWorkGroupCount(expected, EngineGroupType::compute, *rootDeviceEnvironment, false);
EXPECT_EQ(expected, getMaxWorkGroupCount());
}
@@ -47,33 +70,44 @@ TEST_F(KernelHelperMaxWorkGroupsTests, GivenDebugFlagSetWhenGetMaxWorkGroupCount
}
// Checks the barrier limit: the count obtained with 16 barriers must equal the
// smaller of the barrier-free count and dssCount * (maxBarrierCount / numberOfBarriers).
// NOTE(review): expected is declared twice below; this listing appears to
// interleave both sides of a change — confirm against the real file.
TEST_F(KernelHelperMaxWorkGroupsTests, GivenBarriersWhenCalculatingMaxWorkGroupsCountThenResultIsCalculatedWithRegardToBarriersCount) {
// Baseline: result with no barriers in use.
numberOfBarriers = 0;
auto baseCount = getMaxWorkGroupCount();
numberOfBarriers = 16;
auto expected = dssCount * (maxBarrierCount / numberOfBarriers);
auto &helper = rootDeviceEnvironment->getHelper<NEO::GfxCoreHelper>();
// Local maxBarrierCount is queried from the helper rather than hard-coded.
auto maxBarrierCount = helper.getMaxBarrierRegisterPerSlice();
auto expected = std::min(baseCount, static_cast<uint32_t>(dssCount * (maxBarrierCount / numberOfBarriers)));
EXPECT_EQ(expected, getMaxWorkGroupCount());
}
// Checks the SLM limit: with 4 KB of SLM used the count must equal the smaller of
// the SLM-free count and availableSlm / usedSlm.
// NOTE(review): expected is declared twice below; this listing appears to
// interleave both sides of a change — confirm against the real file.
TEST_F(KernelHelperMaxWorkGroupsTests, GivenUsedSlmSizeWhenCalculatingMaxWorkGroupsCountThenResultIsCalculatedWithRegardToUsedSlmSize) {
// Baseline: result with no SLM in use.
usedSlm = 0;
auto baseCount = getMaxWorkGroupCount();
usedSlm = 4 * MemoryConstants::kiloByte;
auto expected = availableSlm / usedSlm;
auto expected = std::min(baseCount, availableSlm / usedSlm);
EXPECT_EQ(expected, getMaxWorkGroupCount());
}
TEST_F(KernelHelperMaxWorkGroupsTests, GivenVariousValuesWhenCalculatingMaxWorkGroupsCountThenLowestResultIsAlwaysReturned) {
auto &helper = rootDeviceEnvironment->getHelper<NEO::GfxCoreHelper>();
engineType = EngineGroupType::cooperativeCompute;
usedSlm = 1 * MemoryConstants::kiloByte;
numberOfBarriers = 1;
dssCount = 1;
workDim = 1;
lws[0] = simd;
threadCount = 1;
EXPECT_EQ(1u, getMaxWorkGroupCount());
auto hwInfo = rootDeviceEnvironment->getMutableHardwareInfo();
threadCount = 1024;
hwInfo->gtSystemInfo.ThreadCount = 1024;
EXPECT_NE(1u, getMaxWorkGroupCount());
numberOfBarriers = 32;
numberOfBarriers = static_cast<uint32_t>(helper.getMaxBarrierRegisterPerSlice());
EXPECT_EQ(1u, getMaxWorkGroupCount());
numberOfBarriers = 1;