Revert "fix: correct limitation for num threads per thread group"

This reverts commit 6ad4ad41b1.

Signed-off-by: Compute-Runtime-Validation <compute-runtime-validation@intel.com>
This commit is contained in:
Compute-Runtime-Validation
2025-05-23 02:11:46 +02:00
committed by Compute-Runtime-Automation
parent d6849a5605
commit 593c9e76f2
26 changed files with 223 additions and 166 deletions

View File

@@ -69,7 +69,7 @@ size_t HardwareCommandsHelper<GfxFamily>::getSizeRequiredIOH(const Kernel &kerne
requiredWalkOrder,
simdSize);
auto size = kernel.getCrossThreadDataSize() +
-HardwareCommandsHelper::getPerThreadDataSizeTotal(simdSize, grfSize, grfCount, numChannels, localWorkSize, rootDeviceEnvironment);
+HardwareCommandsHelper::getPerThreadDataSizeTotal(simdSize, grfSize, grfCount, numChannels, localWorkSize, isHwLocalIdGeneration, rootDeviceEnvironment);
auto pImplicitArgs = kernel.getImplicitArgs();
if (pImplicitArgs) {
@@ -297,7 +297,7 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
auto &gfxCoreHelper = device.getGfxCoreHelper();
auto grfCount = kernel.getDescriptor().kernelAttributes.numGrfRequired;
auto localWorkItems = localWorkSize[0] * localWorkSize[1] * localWorkSize[2];
-auto threadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkItems), grfCount, device.getRootDeviceEnvironment());
+auto threadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkItems), grfCount, !localIdsGenerationByRuntime, device.getRootDeviceEnvironment());
uint32_t sizeCrossThreadData = kernel.getCrossThreadDataSize();

View File

@@ -2235,9 +2235,10 @@ void Kernel::reconfigureKernel() {
const auto &kernelDescriptor = kernelInfo.kernelDescriptor;
const auto &gfxCoreHelper = this->getGfxCoreHelper();
auto maxWorkGroupSize = gfxCoreHelper.calculateMaxWorkGroupSize(kernelDescriptor, this->maxKernelWorkGroupSize);
+bool isLocalIdsGeneratedByHw = false; // if local ids generated by runtime then more work groups available
maxWorkGroupSize = static_cast<uint32_t>(kernelInfo.getMaxRequiredWorkGroupSize(maxWorkGroupSize));
-this->maxKernelWorkGroupSize = gfxCoreHelper.adjustMaxWorkGroupSize(kernelDescriptor.kernelAttributes.numGrfRequired, kernelDescriptor.kernelAttributes.simdSize, maxWorkGroupSize, getDevice().getRootDeviceEnvironment());
+this->maxKernelWorkGroupSize = gfxCoreHelper.adjustMaxWorkGroupSize(kernelDescriptor.kernelAttributes.numGrfRequired, kernelDescriptor.kernelAttributes.simdSize, isLocalIdsGeneratedByHw, maxWorkGroupSize, getDevice().getRootDeviceEnvironment());
this->containsStatelessWrites = kernelDescriptor.kernelAttributes.flags.usesStatelessWrites;
this->systolicPipelineSelectMode = kernelDescriptor.kernelAttributes.flags.usesSystolicPipelineSelectMode;