fix: correct limitation for num threads per thread group

taking into account the max work group limit

Resolves: NEO-14922
Related-To: NEO-11881
Signed-off-by: Katarzyna Cencelewska <katarzyna.cencelewska@intel.com>
This commit is contained in:
Katarzyna Cencelewska
2025-05-20 14:41:49 +00:00
committed by Compute-Runtime-Automation
parent 5f80490385
commit 6ad4ad41b1
26 changed files with 171 additions and 228 deletions

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2018-2024 Intel Corporation
* Copyright (C) 2018-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -44,9 +44,8 @@ LocalIDHelper::LocalIDHelper() {
LocalIDHelper LocalIDHelper::initializer;
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, uint32_t grfCount, const RootDeviceEnvironment &rootDeviceEnvironment) {
bool localIdsGeneratedByHw = false;
auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
auto threadsPerWorkGroup = static_cast<uint16_t>(gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]), grfCount, localIdsGeneratedByHw, rootDeviceEnvironment));
auto threadsPerWorkGroup = static_cast<uint16_t>(gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]), grfCount, rootDeviceEnvironment));
bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd);
if (useLayoutForImages) {
generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd);

View File

@@ -120,7 +120,7 @@ class GfxCoreHelper {
virtual bool isCooperativeDispatchSupported(const EngineGroupType engineGroupType, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
virtual uint32_t adjustMaxWorkGroupCount(uint32_t maxWorkGroupCount, const EngineGroupType engineGroupType,
const RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
virtual uint32_t adjustMaxWorkGroupSize(const uint32_t grfCount, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
virtual uint32_t adjustMaxWorkGroupSize(const uint32_t grfCount, const uint32_t simd, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
virtual size_t getMaxFillPaternSizeForCopyEngine() const = 0;
virtual size_t getSipKernelMaxDbgSurfaceSize(const HardwareInfo &hwInfo) const = 0;
virtual bool isCpuImageTransferPreferred(const HardwareInfo &hwInfo) const = 0;
@@ -166,7 +166,7 @@ class GfxCoreHelper {
virtual bool isChipsetUniqueUUIDSupported() const = 0;
virtual bool isTimestampShiftRequired() const = 0;
virtual bool isRelaxedOrderingSupported() const = 0;
virtual uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
virtual uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
virtual uint32_t overrideMaxWorkGroupSize(uint32_t maxWG) const = 0;
virtual DeviceHierarchyMode getDefaultDeviceHierarchy() const = 0;
static bool isWorkaroundRequired(uint32_t lowestSteppingWithBug, uint32_t steppingWithFix, const HardwareInfo &hwInfo, const ProductHelper &productHelper);
@@ -361,7 +361,7 @@ class GfxCoreHelperHw : public GfxCoreHelper {
uint32_t adjustMaxWorkGroupCount(uint32_t maxWorkGroupCount, const EngineGroupType engineGroupType,
const RootDeviceEnvironment &rootDeviceEnvironment) const override;
uint32_t adjustMaxWorkGroupSize(const uint32_t grfCount, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const override;
uint32_t adjustMaxWorkGroupSize(const uint32_t grfCount, const uint32_t simd, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const override;
size_t getMaxFillPaternSizeForCopyEngine() const override;
size_t getSipKernelMaxDbgSurfaceSize(const HardwareInfo &hwInfo) const override;
@@ -415,7 +415,7 @@ class GfxCoreHelperHw : public GfxCoreHelper {
bool isChipsetUniqueUUIDSupported() const override;
bool isTimestampShiftRequired() const override;
bool isRelaxedOrderingSupported() const override;
uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) const override;
uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, const RootDeviceEnvironment &rootDeviceEnvironment) const override;
uint32_t overrideMaxWorkGroupSize(uint32_t maxWG) const override;
DeviceHierarchyMode getDefaultDeviceHierarchy() const override;

View File

@@ -661,7 +661,7 @@ uint32_t GfxCoreHelperHw<GfxFamily>::overrideMaxWorkGroupSize(uint32_t maxWG) co
}
template <typename GfxFamily>
uint32_t GfxCoreHelperHw<GfxFamily>::adjustMaxWorkGroupSize(const uint32_t grfCount, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const {
uint32_t GfxCoreHelperHw<GfxFamily>::adjustMaxWorkGroupSize(const uint32_t grfCount, const uint32_t simd, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const {
return defaultMaxGroupSize;
}
@@ -671,7 +671,7 @@ uint32_t GfxCoreHelperHw<GfxFamily>::getMinimalGrfSize() const {
}
template <typename GfxFamily>
uint32_t GfxCoreHelperHw<GfxFamily>::calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) const {
uint32_t GfxCoreHelperHw<GfxFamily>::calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, const RootDeviceEnvironment &rootDeviceEnvironment) const {
return getThreadsPerWG(simd, totalWorkItems);
}

View File

@@ -25,4 +25,37 @@ uint32_t GfxCoreHelperHw<Family>::calculateAvailableThreadCount(const HardwareIn
}
return std::min(hwInfo.gtSystemInfo.ThreadCount, maxThreadsPerEuCount * hwInfo.gtSystemInfo.EUCount);
}
template <>
uint32_t GfxCoreHelperHw<Family>::calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, const RootDeviceEnvironment &rootDeviceEnvironment) const {
uint32_t numThreadsPerThreadGroup = getThreadsPerWG(simd, totalWorkItems);
if (debugManager.flags.RemoveRestrictionsOnNumberOfThreadsInGpgpuThreadGroup.get() == 1) {
return numThreadsPerThreadGroup;
}
auto simt = isSimd1(simd) ? 32u : simd;
const auto &compilerProductHelper = rootDeviceEnvironment.getHelper<CompilerProductHelper>();
const auto &productHelper = rootDeviceEnvironment.getProductHelper();
const auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo();
auto isHeaplessMode = compilerProductHelper.isHeaplessModeEnabled(hwInfo);
uint32_t maxThreadsPerThreadGroup = 32u;
if (grfCount == 512) {
maxThreadsPerThreadGroup = 16u;
} else if ((grfCount == 256) || (simt == 32u)) {
// driver limit maxWorkgroupSize to 1024 (NEO-11881) so for simt 32 the max threads per thread group is 32
maxThreadsPerThreadGroup = 32u;
} else if (grfCount == 192) {
maxThreadsPerThreadGroup = 40u;
} else if (grfCount == 160) {
maxThreadsPerThreadGroup = 48u;
} else if (grfCount <= 128) {
maxThreadsPerThreadGroup = 64u;
}
maxThreadsPerThreadGroup = productHelper.adjustMaxThreadsPerThreadGroup(maxThreadsPerThreadGroup, simt, grfCount, isHeaplessMode);
DEBUG_BREAK_IF(maxThreadsPerThreadGroup * simt > CommonConstants::maxWorkgroupSize);
return std::min(numThreadsPerThreadGroup, maxThreadsPerThreadGroup);
}
} // namespace NEO

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2018-2024 Intel Corporation
* Copyright (C) 2018-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -24,14 +24,13 @@ struct PerThreadDataHelper {
uint32_t grfCount,
uint32_t numChannels,
size_t localWorkSize,
bool isHwLocalIdGeneration,
const RootDeviceEnvironment &rootDeviceEnvironment) {
auto perThreadSizeLocalIDs = static_cast<size_t>(getPerThreadSizeLocalIDs(simd, grfSize, numChannels));
if (isSimd1(simd)) {
return perThreadSizeLocalIDs * localWorkSize;
}
auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
return perThreadSizeLocalIDs * gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkSize), grfCount, isHwLocalIdGeneration, rootDeviceEnvironment);
return perThreadSizeLocalIDs * gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkSize), grfCount, rootDeviceEnvironment);
}
}; // namespace PerThreadDataHelper
} // namespace NEO

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2018-2024 Intel Corporation
* Copyright (C) 2018-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -47,9 +47,8 @@ LocalIDHelper LocalIDHelper::initializer;
// traditional function to generate local IDs
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, uint32_t grfCount, const RootDeviceEnvironment &rootDeviceEnvironment) {
bool localIdsGeneratedByHw = false;
auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
auto threadsPerWorkGroup = static_cast<uint16_t>(gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]), grfCount, localIdsGeneratedByHw, rootDeviceEnvironment));
auto threadsPerWorkGroup = static_cast<uint16_t>(gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]), grfCount, rootDeviceEnvironment));
bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd);
if (useLayoutForImages) {
generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd);