refactor: unify getMaxWorkGroupCount logic

Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
Dunajski, Bartosz
2024-01-19 16:00:58 +00:00
committed by Compute-Runtime-Automation
parent 1002cb9f34
commit 6f4ed10919
6 changed files with 93 additions and 69 deletions

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2021-2023 Intel Corporation
* Copyright (C) 2021-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -70,7 +70,7 @@ uint32_t GfxCoreHelperHw<Family>::adjustMaxWorkGroupCount(uint32_t maxWorkGroupC
UNRECOVERABLE_IF(ccsCount == 0);
numberOfpartsInTileForConcurrentKernels = std::max(numberOfpartsInTileForConcurrentKernels, ccsCount);
}
return maxWorkGroupCount / numberOfpartsInTileForConcurrentKernels;
return std::max(maxWorkGroupCount / numberOfpartsInTileForConcurrentKernels, 1u);
}
template <typename Family>

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2019-2023 Intel Corporation
* Copyright (C) 2019-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -11,20 +11,33 @@
#include "shared/source/device/device.h"
#include "shared/source/execution_environment/root_device_environment.h"
#include "shared/source/helpers/basic_math.h"
#include "shared/source/helpers/constants.h"
#include "shared/source/helpers/debug_helpers.h"
#include "shared/source/helpers/gfx_core_helper.h"
#include "shared/source/helpers/hw_info.h"
#include <algorithm>
namespace NEO {
uint32_t KernelHelper::getMaxWorkGroupCount(uint32_t simd, uint32_t availableThreadCount, uint32_t dssCount, uint32_t availableSlmSize,
uint32_t usedSlmSize, uint32_t maxBarrierCount, uint32_t numberOfBarriers, uint32_t workDim,
const size_t *localWorkSize) {
uint32_t KernelHelper::getMaxWorkGroupCount(const RootDeviceEnvironment &rootDeviceEnvironment, const KernelDescriptor &kernelDescriptor,
uint32_t usedSlmSize, uint32_t workDim, const size_t *localWorkSize, EngineGroupType engineGroupType, bool isEngineInstanced) {
if (debugManager.flags.OverrideMaxWorkGroupCount.get() != -1) {
return static_cast<uint32_t>(debugManager.flags.OverrideMaxWorkGroupCount.get());
}
auto &helper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo();
auto dssCount = hwInfo.gtSystemInfo.DualSubSliceCount;
if (dssCount == 0) {
dssCount = hwInfo.gtSystemInfo.SubSliceCount;
}
auto availableThreadCount = helper.calculateAvailableThreadCount(hwInfo, kernelDescriptor.kernelAttributes.numGrfRequired);
auto availableSlmSize = static_cast<uint32_t>(dssCount * MemoryConstants::kiloByte * hwInfo.capabilityTable.slmSize);
auto maxBarrierCount = static_cast<uint32_t>(helper.getMaxBarrierRegisterPerSlice());
UNRECOVERABLE_IF((workDim == 0) || (workDim > 3));
UNRECOVERABLE_IF(localWorkSize == nullptr);
@@ -33,11 +46,11 @@ uint32_t KernelHelper::getMaxWorkGroupCount(uint32_t simd, uint32_t availableThr
workGroupSize *= localWorkSize[i];
}
auto numThreadsPerThreadGroup = static_cast<uint32_t>(Math::divideAndRoundUp(workGroupSize, simd));
auto numThreadsPerThreadGroup = static_cast<uint32_t>(Math::divideAndRoundUp(workGroupSize, kernelDescriptor.kernelAttributes.simdSize));
auto maxWorkGroupsCount = availableThreadCount / numThreadsPerThreadGroup;
if (numberOfBarriers > 0) {
auto maxWorkGroupsCountDueToBarrierUsage = dssCount * (maxBarrierCount / numberOfBarriers);
if (kernelDescriptor.kernelAttributes.barrierCount > 0) {
auto maxWorkGroupsCountDueToBarrierUsage = dssCount * (maxBarrierCount / kernelDescriptor.kernelAttributes.barrierCount);
maxWorkGroupsCount = std::min(maxWorkGroupsCount, maxWorkGroupsCountDueToBarrierUsage);
}
@@ -46,7 +59,7 @@ uint32_t KernelHelper::getMaxWorkGroupCount(uint32_t simd, uint32_t availableThr
maxWorkGroupsCount = std::min(maxWorkGroupsCount, maxWorkGroupsCountDueToSlm);
}
return maxWorkGroupsCount;
return helper.adjustMaxWorkGroupCount(maxWorkGroupsCount, engineGroupType, rootDeviceEnvironment, isEngineInstanced);
}
KernelHelper::ErrorCode KernelHelper::checkIfThereIsSpaceForScratchOrPrivate(KernelDescriptor::KernelAttributes attributes, Device *device) {

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2019-2023 Intel Corporation
* Copyright (C) 2019-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -7,6 +7,7 @@
#pragma once
#include "shared/source/helpers/definitions/engine_group_types.h"
#include "shared/source/kernel/kernel_descriptor.h"
#include <cstddef>
@@ -14,6 +15,7 @@
namespace NEO {
class Device;
struct RootDeviceEnvironment;
struct KernelHelper {
enum class ErrorCode {
@@ -21,9 +23,8 @@ struct KernelHelper {
outOfDeviceMemory = 1,
invalidKernel = 2
};
static uint32_t getMaxWorkGroupCount(uint32_t simd, uint32_t availableThreadCount, uint32_t dssCount, uint32_t availableSlmSize,
uint32_t usedSlmSize, uint32_t maxBarrierCount, uint32_t numberOfBarriers, uint32_t workDim,
const size_t *localWorkSize);
static uint32_t getMaxWorkGroupCount(const RootDeviceEnvironment &rootDeviceEnvironment, const KernelDescriptor &kernelDescriptor,
uint32_t usedSlmSize, uint32_t workDim, const size_t *localWorkSize, EngineGroupType engineGroupType, bool isEngineInstanced);
static inline uint64_t getPrivateSurfaceSize(uint64_t perHwThreadPrivateMemorySize, uint32_t computeUnitsUsedForScratch) {
return perHwThreadPrivateMemorySize * computeUnitsUsedForScratch;
}