fix: align thread group count to DSS size if kernel uses SLM

Related-To: NEO-12133
Signed-off-by: Maciej Plewka <maciej.plewka@intel.com>
This commit is contained in:
Maciej Plewka
2024-10-17 10:58:39 +00:00
committed by Compute-Runtime-Automation
parent b17fabb120
commit 9d6d6e85f1
9 changed files with 59 additions and 39 deletions

View File

@@ -54,16 +54,16 @@ uint32_t KernelHelper::getMaxWorkGroupCount(const RootDeviceEnvironment &rootDev
UNRECOVERABLE_IF(workGroupSize == 0);
auto numThreadsPerThreadGroup = static_cast<uint32_t>(Math::divideAndRoundUp(workGroupSize, simdSize));
auto maxWorkGroupsCount = availableThreadCount / numThreadsPerThreadGroup;
if (barrierCount > 0) {
auto maxWorkGroupsCountDueToBarrierUsage = dssCount * (maxBarrierCount / barrierCount);
if (barrierCount > 0 || usedSlmSize > 0) {
helper.alignThreadGroupCountToDssSize(maxWorkGroupsCount, dssCount, availableThreadCount / dssCount, numThreadsPerThreadGroup);
maxWorkGroupsCount = std::min(maxWorkGroupsCount, maxWorkGroupsCountDueToBarrierUsage);
}
if (usedSlmSize > 0) {
auto maxWorkGroupsCountDueToSlm = availableSlmSize / usedSlmSize;
maxWorkGroupsCount = std::min(maxWorkGroupsCount, maxWorkGroupsCountDueToSlm);
if (barrierCount > 0) {
auto maxWorkGroupsCountDueToBarrierUsage = dssCount * (maxBarrierCount / barrierCount);
maxWorkGroupsCount = std::min(maxWorkGroupsCount, maxWorkGroupsCountDueToBarrierUsage);
}
if (usedSlmSize > 0) {
auto maxWorkGroupsCountDueToSlm = availableSlmSize / usedSlmSize;
maxWorkGroupsCount = std::min(maxWorkGroupsCount, maxWorkGroupsCountDueToSlm);
}
}
maxWorkGroupsCount = helper.adjustMaxWorkGroupCount(maxWorkGroupsCount, engineGroupType, rootDeviceEnvironment);