mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-08 22:12:59 +08:00
refactor: unify getMaxWorkGroupCount logic
Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
1002cb9f34
commit
6f4ed10919
@@ -474,35 +474,22 @@ ze_result_t KernelImp::suggestMaxCooperativeGroupCount(uint32_t *totalGroupCount
|
||||
UNRECOVERABLE_IF(0 == groupSize[1]);
|
||||
UNRECOVERABLE_IF(0 == groupSize[2]);
|
||||
|
||||
auto &hardwareInfo = module->getDevice()->getHwInfo();
|
||||
|
||||
auto dssCount = hardwareInfo.gtSystemInfo.DualSubSliceCount;
|
||||
if (dssCount == 0) {
|
||||
dssCount = hardwareInfo.gtSystemInfo.SubSliceCount;
|
||||
}
|
||||
|
||||
auto &rootDeviceEnvironment = module->getDevice()->getNEODevice()->getRootDeviceEnvironment();
|
||||
auto &helper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
|
||||
auto &descriptor = kernelImmData->getDescriptor();
|
||||
auto availableThreadCount = helper.calculateAvailableThreadCount(hardwareInfo, descriptor.kernelAttributes.numGrfRequired);
|
||||
|
||||
auto availableSlmSize = static_cast<uint32_t>(dssCount * MemoryConstants::kiloByte * hardwareInfo.capabilityTable.slmSize);
|
||||
auto usedSlmSize = helper.alignSlmSize(slmArgsTotalSize + descriptor.kernelAttributes.slmInlineSize);
|
||||
auto maxBarrierCount = static_cast<uint32_t>(helper.getMaxBarrierRegisterPerSlice());
|
||||
auto barrierCount = descriptor.kernelAttributes.barrierCount;
|
||||
const uint32_t workDim = 3;
|
||||
const size_t localWorkSize[] = {groupSize[0], groupSize[1], groupSize[2]};
|
||||
|
||||
*totalGroupCount = NEO::KernelHelper::getMaxWorkGroupCount(descriptor.kernelAttributes.simdSize,
|
||||
availableThreadCount,
|
||||
dssCount,
|
||||
availableSlmSize,
|
||||
*totalGroupCount = NEO::KernelHelper::getMaxWorkGroupCount(rootDeviceEnvironment,
|
||||
descriptor,
|
||||
usedSlmSize,
|
||||
maxBarrierCount,
|
||||
barrierCount,
|
||||
workDim,
|
||||
localWorkSize);
|
||||
*totalGroupCount = helper.adjustMaxWorkGroupCount(*totalGroupCount, engineGroupType, rootDeviceEnvironment, isEngineInstanced);
|
||||
localWorkSize,
|
||||
engineGroupType,
|
||||
isEngineInstanced);
|
||||
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
@@ -1128,29 +1128,18 @@ uint32_t Kernel::getMaxWorkGroupCount(const cl_uint workDim, const size_t *local
|
||||
|
||||
auto engineGroupType = helper.getEngineGroupType(commandQueue->getGpgpuEngine().getEngineType(),
|
||||
commandQueue->getGpgpuEngine().getEngineUsage(), hardwareInfo);
|
||||
|
||||
const auto &kernelDescriptor = kernelInfo.kernelDescriptor;
|
||||
auto dssCount = hardwareInfo.gtSystemInfo.DualSubSliceCount;
|
||||
if (dssCount == 0) {
|
||||
dssCount = hardwareInfo.gtSystemInfo.SubSliceCount;
|
||||
}
|
||||
auto availableThreadCount = helper.calculateAvailableThreadCount(hardwareInfo, kernelDescriptor.kernelAttributes.numGrfRequired);
|
||||
auto availableSlmSize = static_cast<uint32_t>(dssCount * MemoryConstants::kiloByte * hardwareInfo.capabilityTable.slmSize);
|
||||
auto usedSlmSize = helper.alignSlmSize(slmTotalSize);
|
||||
auto maxBarrierCount = static_cast<uint32_t>(helper.getMaxBarrierRegisterPerSlice());
|
||||
auto barrierCount = kernelDescriptor.kernelAttributes.barrierCount;
|
||||
|
||||
auto maxWorkGroupCount = KernelHelper::getMaxWorkGroupCount(kernelInfo.getMaxSimdSize(),
|
||||
availableThreadCount,
|
||||
dssCount,
|
||||
availableSlmSize,
|
||||
usedSlmSize,
|
||||
maxBarrierCount,
|
||||
barrierCount,
|
||||
workDim,
|
||||
localWorkSize);
|
||||
auto isEngineInstanced = commandQueue->getGpgpuCommandStreamReceiver().getOsContext().isEngineInstanced();
|
||||
maxWorkGroupCount = helper.adjustMaxWorkGroupCount(maxWorkGroupCount, engineGroupType, rootDeviceEnvironment, isEngineInstanced);
|
||||
|
||||
auto usedSlmSize = helper.alignSlmSize(slmTotalSize);
|
||||
|
||||
auto maxWorkGroupCount = KernelHelper::getMaxWorkGroupCount(rootDeviceEnvironment,
|
||||
kernelInfo.kernelDescriptor,
|
||||
usedSlmSize,
|
||||
workDim,
|
||||
localWorkSize,
|
||||
engineGroupType,
|
||||
isEngineInstanced);
|
||||
|
||||
return maxWorkGroupCount;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2021-2023 Intel Corporation
|
||||
* Copyright (C) 2021-2024 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -70,7 +70,7 @@ uint32_t GfxCoreHelperHw<Family>::adjustMaxWorkGroupCount(uint32_t maxWorkGroupC
|
||||
UNRECOVERABLE_IF(ccsCount == 0);
|
||||
numberOfpartsInTileForConcurrentKernels = std::max(numberOfpartsInTileForConcurrentKernels, ccsCount);
|
||||
}
|
||||
return maxWorkGroupCount / numberOfpartsInTileForConcurrentKernels;
|
||||
return std::max(maxWorkGroupCount / numberOfpartsInTileForConcurrentKernels, 1u);
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2019-2023 Intel Corporation
|
||||
* Copyright (C) 2019-2024 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -11,20 +11,33 @@
|
||||
#include "shared/source/device/device.h"
|
||||
#include "shared/source/execution_environment/root_device_environment.h"
|
||||
#include "shared/source/helpers/basic_math.h"
|
||||
#include "shared/source/helpers/constants.h"
|
||||
#include "shared/source/helpers/debug_helpers.h"
|
||||
#include "shared/source/helpers/gfx_core_helper.h"
|
||||
#include "shared/source/helpers/hw_info.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
namespace NEO {
|
||||
|
||||
uint32_t KernelHelper::getMaxWorkGroupCount(uint32_t simd, uint32_t availableThreadCount, uint32_t dssCount, uint32_t availableSlmSize,
|
||||
uint32_t usedSlmSize, uint32_t maxBarrierCount, uint32_t numberOfBarriers, uint32_t workDim,
|
||||
const size_t *localWorkSize) {
|
||||
uint32_t KernelHelper::getMaxWorkGroupCount(const RootDeviceEnvironment &rootDeviceEnvironment, const KernelDescriptor &kernelDescriptor,
|
||||
uint32_t usedSlmSize, uint32_t workDim, const size_t *localWorkSize, EngineGroupType engineGroupType, bool isEngineInstanced) {
|
||||
if (debugManager.flags.OverrideMaxWorkGroupCount.get() != -1) {
|
||||
return static_cast<uint32_t>(debugManager.flags.OverrideMaxWorkGroupCount.get());
|
||||
}
|
||||
|
||||
auto &helper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
|
||||
auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo();
|
||||
|
||||
auto dssCount = hwInfo.gtSystemInfo.DualSubSliceCount;
|
||||
if (dssCount == 0) {
|
||||
dssCount = hwInfo.gtSystemInfo.SubSliceCount;
|
||||
}
|
||||
|
||||
auto availableThreadCount = helper.calculateAvailableThreadCount(hwInfo, kernelDescriptor.kernelAttributes.numGrfRequired);
|
||||
auto availableSlmSize = static_cast<uint32_t>(dssCount * MemoryConstants::kiloByte * hwInfo.capabilityTable.slmSize);
|
||||
auto maxBarrierCount = static_cast<uint32_t>(helper.getMaxBarrierRegisterPerSlice());
|
||||
|
||||
UNRECOVERABLE_IF((workDim == 0) || (workDim > 3));
|
||||
UNRECOVERABLE_IF(localWorkSize == nullptr);
|
||||
|
||||
@@ -33,11 +46,11 @@ uint32_t KernelHelper::getMaxWorkGroupCount(uint32_t simd, uint32_t availableThr
|
||||
workGroupSize *= localWorkSize[i];
|
||||
}
|
||||
|
||||
auto numThreadsPerThreadGroup = static_cast<uint32_t>(Math::divideAndRoundUp(workGroupSize, simd));
|
||||
auto numThreadsPerThreadGroup = static_cast<uint32_t>(Math::divideAndRoundUp(workGroupSize, kernelDescriptor.kernelAttributes.simdSize));
|
||||
auto maxWorkGroupsCount = availableThreadCount / numThreadsPerThreadGroup;
|
||||
|
||||
if (numberOfBarriers > 0) {
|
||||
auto maxWorkGroupsCountDueToBarrierUsage = dssCount * (maxBarrierCount / numberOfBarriers);
|
||||
if (kernelDescriptor.kernelAttributes.barrierCount > 0) {
|
||||
auto maxWorkGroupsCountDueToBarrierUsage = dssCount * (maxBarrierCount / kernelDescriptor.kernelAttributes.barrierCount);
|
||||
maxWorkGroupsCount = std::min(maxWorkGroupsCount, maxWorkGroupsCountDueToBarrierUsage);
|
||||
}
|
||||
|
||||
@@ -46,7 +59,7 @@ uint32_t KernelHelper::getMaxWorkGroupCount(uint32_t simd, uint32_t availableThr
|
||||
maxWorkGroupsCount = std::min(maxWorkGroupsCount, maxWorkGroupsCountDueToSlm);
|
||||
}
|
||||
|
||||
return maxWorkGroupsCount;
|
||||
return helper.adjustMaxWorkGroupCount(maxWorkGroupsCount, engineGroupType, rootDeviceEnvironment, isEngineInstanced);
|
||||
}
|
||||
|
||||
KernelHelper::ErrorCode KernelHelper::checkIfThereIsSpaceForScratchOrPrivate(KernelDescriptor::KernelAttributes attributes, Device *device) {
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2019-2023 Intel Corporation
|
||||
* Copyright (C) 2019-2024 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -7,6 +7,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "shared/source/helpers/definitions/engine_group_types.h"
|
||||
#include "shared/source/kernel/kernel_descriptor.h"
|
||||
|
||||
#include <cstddef>
|
||||
@@ -14,6 +15,7 @@
|
||||
|
||||
namespace NEO {
|
||||
class Device;
|
||||
struct RootDeviceEnvironment;
|
||||
|
||||
struct KernelHelper {
|
||||
enum class ErrorCode {
|
||||
@@ -21,9 +23,8 @@ struct KernelHelper {
|
||||
outOfDeviceMemory = 1,
|
||||
invalidKernel = 2
|
||||
};
|
||||
static uint32_t getMaxWorkGroupCount(uint32_t simd, uint32_t availableThreadCount, uint32_t dssCount, uint32_t availableSlmSize,
|
||||
uint32_t usedSlmSize, uint32_t maxBarrierCount, uint32_t numberOfBarriers, uint32_t workDim,
|
||||
const size_t *localWorkSize);
|
||||
static uint32_t getMaxWorkGroupCount(const RootDeviceEnvironment &rootDeviceEnvironment, const KernelDescriptor &kernelDescriptor,
|
||||
uint32_t usedSlmSize, uint32_t workDim, const size_t *localWorkSize, EngineGroupType engineGroupType, bool isEngineInstanced);
|
||||
static inline uint64_t getPrivateSurfaceSize(uint64_t perHwThreadPrivateMemorySize, uint32_t computeUnitsUsedForScratch) {
|
||||
return perHwThreadPrivateMemorySize * computeUnitsUsedForScratch;
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2019-2023 Intel Corporation
|
||||
* Copyright (C) 2019-2024 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -12,30 +12,53 @@
|
||||
#include "shared/test/common/fixtures/device_fixture.h"
|
||||
#include "shared/test/common/helpers/debug_manager_state_restore.h"
|
||||
#include "shared/test/common/mocks/mock_device.h"
|
||||
#include "shared/test/common/mocks/mock_execution_environment.h"
|
||||
#include "shared/test/common/test_macros/test.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
using namespace NEO;
|
||||
|
||||
struct KernelHelperMaxWorkGroupsTests : ::testing::Test {
|
||||
EngineGroupType engineType = EngineGroupType::compute;
|
||||
uint32_t simd = 8;
|
||||
uint32_t threadCount = 8 * 1024;
|
||||
uint32_t dssCount = 16;
|
||||
uint32_t availableSlm = 64 * MemoryConstants::kiloByte;
|
||||
uint32_t usedSlm = 0;
|
||||
uint32_t maxBarrierCount = 32;
|
||||
uint32_t numberOfBarriers = 0;
|
||||
uint32_t workDim = 3;
|
||||
uint32_t grf = 128;
|
||||
size_t lws[3] = {10, 10, 10};
|
||||
|
||||
uint32_t getMaxWorkGroupCount() {
|
||||
return KernelHelper::getMaxWorkGroupCount(simd, threadCount, dssCount, availableSlm, usedSlm,
|
||||
maxBarrierCount, numberOfBarriers, workDim, lws);
|
||||
void SetUp() override {
|
||||
executionEnvironment = std::make_unique<MockExecutionEnvironment>(defaultHwInfo.get(), false, 1u);
|
||||
rootDeviceEnvironment = executionEnvironment->rootDeviceEnvironments[0].get();
|
||||
}
|
||||
|
||||
uint32_t getMaxWorkGroupCount() {
|
||||
KernelDescriptor descriptor = {};
|
||||
descriptor.kernelAttributes.simdSize = simd;
|
||||
descriptor.kernelAttributes.barrierCount = numberOfBarriers;
|
||||
descriptor.kernelAttributes.numGrfRequired = grf;
|
||||
|
||||
auto hwInfo = rootDeviceEnvironment->getMutableHardwareInfo();
|
||||
hwInfo->gtSystemInfo.DualSubSliceCount = dssCount;
|
||||
hwInfo->capabilityTable.slmSize = (availableSlm / MemoryConstants::kiloByte) / dssCount;
|
||||
|
||||
return KernelHelper::getMaxWorkGroupCount(*rootDeviceEnvironment, descriptor, usedSlm, workDim, lws, engineType, false);
|
||||
}
|
||||
|
||||
std::unique_ptr<MockExecutionEnvironment> executionEnvironment;
|
||||
RootDeviceEnvironment *rootDeviceEnvironment = nullptr;
|
||||
};
|
||||
|
||||
TEST_F(KernelHelperMaxWorkGroupsTests, GivenNoBarriersOrSlmUsedWhenCalculatingMaxWorkGroupsCountThenResultIsCalculatedWithSimd) {
|
||||
auto workGroupSize = lws[0] * lws[1] * lws[2];
|
||||
auto expected = threadCount / Math::divideAndRoundUp(workGroupSize, simd);
|
||||
auto &helper = rootDeviceEnvironment->getHelper<NEO::GfxCoreHelper>();
|
||||
|
||||
uint32_t workGroupSize = static_cast<uint32_t>(lws[0] * lws[1] * lws[2]);
|
||||
uint32_t expected = helper.calculateAvailableThreadCount(*rootDeviceEnvironment->getHardwareInfo(), grf) / static_cast<uint32_t>(Math::divideAndRoundUp(workGroupSize, simd));
|
||||
|
||||
expected = helper.adjustMaxWorkGroupCount(expected, EngineGroupType::compute, *rootDeviceEnvironment, false);
|
||||
EXPECT_EQ(expected, getMaxWorkGroupCount());
|
||||
}
|
||||
|
||||
@@ -47,33 +70,44 @@ TEST_F(KernelHelperMaxWorkGroupsTests, GivenDebugFlagSetWhenGetMaxWorkGroupCount
|
||||
}
|
||||
|
||||
TEST_F(KernelHelperMaxWorkGroupsTests, GivenBarriersWhenCalculatingMaxWorkGroupsCountThenResultIsCalculatedWithRegardToBarriersCount) {
|
||||
numberOfBarriers = 0;
|
||||
auto baseCount = getMaxWorkGroupCount();
|
||||
|
||||
numberOfBarriers = 16;
|
||||
|
||||
auto expected = dssCount * (maxBarrierCount / numberOfBarriers);
|
||||
auto &helper = rootDeviceEnvironment->getHelper<NEO::GfxCoreHelper>();
|
||||
auto maxBarrierCount = helper.getMaxBarrierRegisterPerSlice();
|
||||
|
||||
auto expected = std::min(baseCount, static_cast<uint32_t>(dssCount * (maxBarrierCount / numberOfBarriers)));
|
||||
EXPECT_EQ(expected, getMaxWorkGroupCount());
|
||||
}
|
||||
|
||||
TEST_F(KernelHelperMaxWorkGroupsTests, GivenUsedSlmSizeWhenCalculatingMaxWorkGroupsCountThenResultIsCalculatedWithRegardToUsedSlmSize) {
|
||||
usedSlm = 0;
|
||||
auto baseCount = getMaxWorkGroupCount();
|
||||
|
||||
usedSlm = 4 * MemoryConstants::kiloByte;
|
||||
|
||||
auto expected = availableSlm / usedSlm;
|
||||
auto expected = std::min(baseCount, availableSlm / usedSlm);
|
||||
EXPECT_EQ(expected, getMaxWorkGroupCount());
|
||||
}
|
||||
|
||||
TEST_F(KernelHelperMaxWorkGroupsTests, GivenVariousValuesWhenCalculatingMaxWorkGroupsCountThenLowestResultIsAlwaysReturned) {
|
||||
auto &helper = rootDeviceEnvironment->getHelper<NEO::GfxCoreHelper>();
|
||||
|
||||
engineType = EngineGroupType::cooperativeCompute;
|
||||
usedSlm = 1 * MemoryConstants::kiloByte;
|
||||
numberOfBarriers = 1;
|
||||
dssCount = 1;
|
||||
|
||||
workDim = 1;
|
||||
lws[0] = simd;
|
||||
threadCount = 1;
|
||||
EXPECT_EQ(1u, getMaxWorkGroupCount());
|
||||
auto hwInfo = rootDeviceEnvironment->getMutableHardwareInfo();
|
||||
|
||||
threadCount = 1024;
|
||||
hwInfo->gtSystemInfo.ThreadCount = 1024;
|
||||
EXPECT_NE(1u, getMaxWorkGroupCount());
|
||||
|
||||
numberOfBarriers = 32;
|
||||
numberOfBarriers = static_cast<uint32_t>(helper.getMaxBarrierRegisterPerSlice());
|
||||
EXPECT_EQ(1u, getMaxWorkGroupCount());
|
||||
|
||||
numberOfBarriers = 1;
|
||||
|
||||
Reference in New Issue
Block a user