fix: always use the GRF count in calculateNumThreadsPerThreadGroup

GRF size != GRF count: the two are distinct values and must not be used interchangeably.

Related-To: GSD-8437
Signed-off-by: Katarzyna Cencelewska <katarzyna.cencelewska@intel.com>
This commit is contained in:
Katarzyna Cencelewska
2024-03-22 09:39:15 +00:00
committed by Compute-Runtime-Automation
parent df54d67f40
commit da7b03dd15
27 changed files with 86 additions and 68 deletions

View File

@@ -1029,12 +1029,13 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, giv
dispatchKernelWithImplicitArgs<FamilyType>();
auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
auto numGrf = GrfConfig::defaultGrfNumber;
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgs::getSize(), MemoryConstants::cacheLineSize);
const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, rootDeviceEnvironment);
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, numGrf, rootDeviceEnvironment);
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize();
size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, numGrf, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
alignedFree(expectedLocalIds);
@@ -1075,12 +1076,13 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, giv
dispatchKernelWithImplicitArgs<FamilyType>();
auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
auto numGrf = GrfConfig::defaultGrfNumber;
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgs::getSize(), MemoryConstants::cacheLineSize);
const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, rootDeviceEnvironment);
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, numGrf, rootDeviceEnvironment);
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize();
size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, numGrf, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
alignedFree(expectedLocalIds);

View File

@@ -306,6 +306,7 @@ TEST_F(KernelImpSetGroupSizeTest, givenLocalIdGenerationByRuntimeEnabledWhenSett
mockKernel.module = &mockModule;
const auto &device = mockModule.getDevice();
auto grfSize = device->getHwInfo().capabilityTable.grfSize;
auto numGrf = GrfConfig::defaultGrfNumber;
const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
uint32_t groupSize[3] = {2, 3, 5};
auto ret = mockKernel.setGroupSize(groupSize[0], groupSize[1], groupSize[2]);
@@ -315,13 +316,14 @@ TEST_F(KernelImpSetGroupSizeTest, givenLocalIdGenerationByRuntimeEnabledWhenSett
auto numThreadsPerTG = gfxHelper.calculateNumThreadsPerThreadGroup(
mockKernel.descriptor.kernelAttributes.simdSize,
groupSize[0] * groupSize[1] * groupSize[2],
grfSize,
numGrf,
mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime,
rootDeviceEnvironment);
auto perThreadDataSizeForWholeTGNeeded =
static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
mockKernel.descriptor.kernelAttributes.simdSize,
grfSize,
numGrf,
mockKernel.descriptor.kernelAttributes.numLocalIdChannels,
groupSize[0] * groupSize[1] * groupSize[2],
!mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime,