mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-03 14:55:24 +08:00
fix: correct limitation for num threads per thread group
taking into account the max work group limit Resolves: NEO-14922 Related-To: NEO-11881 Signed-off-by: Katarzyna Cencelewska <katarzyna.cencelewska@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
5f80490385
commit
6ad4ad41b1
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2018-2024 Intel Corporation
|
||||
* Copyright (C) 2018-2025 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -44,9 +44,8 @@ LocalIDHelper::LocalIDHelper() {
|
||||
LocalIDHelper LocalIDHelper::initializer;
|
||||
|
||||
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, uint32_t grfCount, const RootDeviceEnvironment &rootDeviceEnvironment) {
|
||||
bool localIdsGeneratedByHw = false;
|
||||
auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
|
||||
auto threadsPerWorkGroup = static_cast<uint16_t>(gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]), grfCount, localIdsGeneratedByHw, rootDeviceEnvironment));
|
||||
auto threadsPerWorkGroup = static_cast<uint16_t>(gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]), grfCount, rootDeviceEnvironment));
|
||||
bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd);
|
||||
if (useLayoutForImages) {
|
||||
generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd);
|
||||
|
||||
@@ -120,7 +120,7 @@ class GfxCoreHelper {
|
||||
virtual bool isCooperativeDispatchSupported(const EngineGroupType engineGroupType, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
|
||||
virtual uint32_t adjustMaxWorkGroupCount(uint32_t maxWorkGroupCount, const EngineGroupType engineGroupType,
|
||||
const RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
|
||||
virtual uint32_t adjustMaxWorkGroupSize(const uint32_t grfCount, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
|
||||
virtual uint32_t adjustMaxWorkGroupSize(const uint32_t grfCount, const uint32_t simd, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
|
||||
virtual size_t getMaxFillPaternSizeForCopyEngine() const = 0;
|
||||
virtual size_t getSipKernelMaxDbgSurfaceSize(const HardwareInfo &hwInfo) const = 0;
|
||||
virtual bool isCpuImageTransferPreferred(const HardwareInfo &hwInfo) const = 0;
|
||||
@@ -166,7 +166,7 @@ class GfxCoreHelper {
|
||||
virtual bool isChipsetUniqueUUIDSupported() const = 0;
|
||||
virtual bool isTimestampShiftRequired() const = 0;
|
||||
virtual bool isRelaxedOrderingSupported() const = 0;
|
||||
virtual uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
|
||||
virtual uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
|
||||
virtual uint32_t overrideMaxWorkGroupSize(uint32_t maxWG) const = 0;
|
||||
virtual DeviceHierarchyMode getDefaultDeviceHierarchy() const = 0;
|
||||
static bool isWorkaroundRequired(uint32_t lowestSteppingWithBug, uint32_t steppingWithFix, const HardwareInfo &hwInfo, const ProductHelper &productHelper);
|
||||
@@ -361,7 +361,7 @@ class GfxCoreHelperHw : public GfxCoreHelper {
|
||||
uint32_t adjustMaxWorkGroupCount(uint32_t maxWorkGroupCount, const EngineGroupType engineGroupType,
|
||||
const RootDeviceEnvironment &rootDeviceEnvironment) const override;
|
||||
|
||||
uint32_t adjustMaxWorkGroupSize(const uint32_t grfCount, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const override;
|
||||
uint32_t adjustMaxWorkGroupSize(const uint32_t grfCount, const uint32_t simd, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const override;
|
||||
size_t getMaxFillPaternSizeForCopyEngine() const override;
|
||||
|
||||
size_t getSipKernelMaxDbgSurfaceSize(const HardwareInfo &hwInfo) const override;
|
||||
@@ -415,7 +415,7 @@ class GfxCoreHelperHw : public GfxCoreHelper {
|
||||
bool isChipsetUniqueUUIDSupported() const override;
|
||||
bool isTimestampShiftRequired() const override;
|
||||
bool isRelaxedOrderingSupported() const override;
|
||||
uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) const override;
|
||||
uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, const RootDeviceEnvironment &rootDeviceEnvironment) const override;
|
||||
uint32_t overrideMaxWorkGroupSize(uint32_t maxWG) const override;
|
||||
DeviceHierarchyMode getDefaultDeviceHierarchy() const override;
|
||||
|
||||
|
||||
@@ -661,7 +661,7 @@ uint32_t GfxCoreHelperHw<GfxFamily>::overrideMaxWorkGroupSize(uint32_t maxWG) co
|
||||
}
|
||||
|
||||
// Generic (per-GfxFamily template) implementation of adjustMaxWorkGroupSize.
// It performs no adjustment: the caller-supplied defaultMaxGroupSize is
// returned as-is and the remaining parameters are not read in this body.
// NOTE(review): this text is a rendered diff without +/- markers. The two
// signature lines below appear to be the pre-change form (taking
// `bool isHwLocalGeneration`) followed by the post-change form (parameter
// removed) of the same function, not two overloads - confirm against the
// actual source file.
template <typename GfxFamily>
|
||||
uint32_t GfxCoreHelperHw<GfxFamily>::adjustMaxWorkGroupSize(const uint32_t grfCount, const uint32_t simd, bool isHwLocalGeneration, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const {
|
||||
uint32_t GfxCoreHelperHw<GfxFamily>::adjustMaxWorkGroupSize(const uint32_t grfCount, const uint32_t simd, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const {
|
||||
return defaultMaxGroupSize;
|
||||
}
|
||||
|
||||
@@ -671,7 +671,7 @@ uint32_t GfxCoreHelperHw<GfxFamily>::getMinimalGrfSize() const {
|
||||
}
|
||||
|
||||
// Generic (per-GfxFamily template) implementation of
// calculateNumThreadsPerThreadGroup. It forwards simd and totalWorkItems to
// getThreadsPerWG and returns that value unchanged; grfCount and
// rootDeviceEnvironment are not read in this body, so no upper limit is
// applied here.
// NOTE(review): this text is a rendered diff without +/- markers. The two
// signature lines below appear to be the pre-change form (taking
// `bool isHwLocalIdGeneration`) followed by the post-change form (parameter
// removed) of the same function, not two overloads - confirm against the
// actual source file.
template <typename GfxFamily>
|
||||
uint32_t GfxCoreHelperHw<GfxFamily>::calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) const {
|
||||
uint32_t GfxCoreHelperHw<GfxFamily>::calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, const RootDeviceEnvironment &rootDeviceEnvironment) const {
|
||||
return getThreadsPerWG(simd, totalWorkItems);
|
||||
}
|
||||
|
||||
|
||||
@@ -25,4 +25,37 @@ uint32_t GfxCoreHelperHw<Family>::calculateAvailableThreadCount(const HardwareIn
|
||||
}
|
||||
return std::min(hwInfo.gtSystemInfo.ThreadCount, maxThreadsPerEuCount * hwInfo.gtSystemInfo.EUCount);
|
||||
}
|
||||
|
||||
// Explicit specialization of calculateNumThreadsPerThreadGroup for `Family`.
//
// Computes the number of threads a work-group of `totalWorkItems` items needs
// at the given `simd` width (via getThreadsPerWG) and clamps it to an upper
// limit chosen from `grfCount` and the effective SIMT width, after a
// product-specific adjustment.
//
// Parameters:
//   simd                  - SIMD width of the kernel; SIMD1 is handled specially below.
//   totalWorkItems        - number of work items in the work-group.
//   grfCount              - GRF count used to select the thread limit.
//   rootDeviceEnvironment - source of the compiler/product helpers and HardwareInfo.
// Returns: min(threads required by the work-group, selected maximum), or the
// unclamped required count when the debug flag below is set to 1.
template <>
|
||||
uint32_t GfxCoreHelperHw<Family>::calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfCount, const RootDeviceEnvironment &rootDeviceEnvironment) const {
|
||||
uint32_t numThreadsPerThreadGroup = getThreadsPerWG(simd, totalWorkItems);
|
||||
// Debug override: when the flag equals 1, skip every limit below and return
// the raw per-work-group thread count.
if (debugManager.flags.RemoveRestrictionsOnNumberOfThreadsInGpgpuThreadGroup.get() == 1) {
|
||||
return numThreadsPerThreadGroup;
|
||||
}
|
||||
// For the purpose of selecting the limit, SIMD1 is treated as a width of 32.
auto simt = isSimd1(simd) ? 32u : simd;
|
||||
const auto &compilerProductHelper = rootDeviceEnvironment.getHelper<CompilerProductHelper>();
|
||||
const auto &productHelper = rootDeviceEnvironment.getProductHelper();
|
||||
const auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo();
|
||||
auto isHeaplessMode = compilerProductHelper.isHeaplessModeEnabled(hwInfo);
|
||||
|
||||
// Default limit; it is kept for any grfCount that matches none of the
// branches below (a value above 128 other than 160/192/256/512 with simt != 32).
uint32_t maxThreadsPerThreadGroup = 32u;
|
||||
|
||||
// Branch order matters: grfCount == 512 is tested first, so it yields 16 even
// when simt is 32; only afterwards does simt == 32 force the limit to 32.
if (grfCount == 512) {
|
||||
maxThreadsPerThreadGroup = 16u;
|
||||
} else if ((grfCount == 256) || (simt == 32u)) {
|
||||
// The driver limits maxWorkgroupSize to 1024 (NEO-11881), so for SIMT 32 the maximum number of threads per thread group is 32.
|
||||
maxThreadsPerThreadGroup = 32u;
|
||||
} else if (grfCount == 192) {
|
||||
maxThreadsPerThreadGroup = 40u;
|
||||
} else if (grfCount == 160) {
|
||||
maxThreadsPerThreadGroup = 48u;
|
||||
} else if (grfCount <= 128) {
|
||||
maxThreadsPerThreadGroup = 64u;
|
||||
}
|
||||
|
||||
// Product-specific hook that may replace the limit selected above; its rules
// are defined by the product helper and are not visible here.
maxThreadsPerThreadGroup = productHelper.adjustMaxThreadsPerThreadGroup(maxThreadsPerThreadGroup, simt, grfCount, isHeaplessMode);
|
||||
// Sanity check: the final limit times the SIMT width should not exceed
// CommonConstants::maxWorkgroupSize.
// NOTE(review): DEBUG_BREAK_IF is presumably active only in debug builds - confirm.
DEBUG_BREAK_IF(maxThreadsPerThreadGroup * simt > CommonConstants::maxWorkgroupSize);
|
||||
return std::min(numThreadsPerThreadGroup, maxThreadsPerThreadGroup);
|
||||
}
|
||||
|
||||
} // namespace NEO
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2018-2024 Intel Corporation
|
||||
* Copyright (C) 2018-2025 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -24,14 +24,13 @@ struct PerThreadDataHelper {
|
||||
uint32_t grfCount,
|
||||
uint32_t numChannels,
|
||||
size_t localWorkSize,
|
||||
bool isHwLocalIdGeneration,
|
||||
const RootDeviceEnvironment &rootDeviceEnvironment) {
|
||||
auto perThreadSizeLocalIDs = static_cast<size_t>(getPerThreadSizeLocalIDs(simd, grfSize, numChannels));
|
||||
if (isSimd1(simd)) {
|
||||
return perThreadSizeLocalIDs * localWorkSize;
|
||||
}
|
||||
auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
|
||||
return perThreadSizeLocalIDs * gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkSize), grfCount, isHwLocalIdGeneration, rootDeviceEnvironment);
|
||||
return perThreadSizeLocalIDs * gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkSize), grfCount, rootDeviceEnvironment);
|
||||
}
|
||||
}; // namespace PerThreadDataHelper
|
||||
} // namespace NEO
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2018-2024 Intel Corporation
|
||||
* Copyright (C) 2018-2025 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -47,9 +47,8 @@ LocalIDHelper LocalIDHelper::initializer;
|
||||
|
||||
// traditional function to generate local IDs
|
||||
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, uint32_t grfCount, const RootDeviceEnvironment &rootDeviceEnvironment) {
|
||||
bool localIdsGeneratedByHw = false;
|
||||
auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
|
||||
auto threadsPerWorkGroup = static_cast<uint16_t>(gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]), grfCount, localIdsGeneratedByHw, rootDeviceEnvironment));
|
||||
auto threadsPerWorkGroup = static_cast<uint16_t>(gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]), grfCount, rootDeviceEnvironment));
|
||||
bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd);
|
||||
if (useLayoutForImages) {
|
||||
generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd);
|
||||
|
||||
Reference in New Issue
Block a user