refactor: add param rootDeviceEnvironment to calculateNumThreadsPerThreadGroup

Signed-off-by: Katarzyna Cencelewska <katarzyna.cencelewska@intel.com>
This commit is contained in:
Katarzyna Cencelewska
2024-03-21 18:54:41 +00:00
committed by Compute-Runtime-Automation
parent ec009cf9e3
commit dd1d52259e
30 changed files with 231 additions and 180 deletions

View File

@@ -381,13 +381,13 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
auto &rootDeviceEnvironment = module->getDevice()->getNEODevice()->getRootDeviceEnvironment();
auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
this->numThreadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(
simdSize, static_cast<uint32_t>(itemsInGroup), grfSize, !kernelRequiresGenerationOfLocalIdsByRuntime);
simdSize, static_cast<uint32_t>(itemsInGroup), grfSize, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
if (kernelRequiresGenerationOfLocalIdsByRuntime) {
auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;
uint32_t perThreadDataSizeForWholeThreadGroupNeeded =
static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
simdSize, grfSize, numChannels, itemsInGroup, !kernelRequiresGenerationOfLocalIdsByRuntime, gfxCoreHelper));
simdSize, grfSize, numChannels, itemsInGroup, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment));
if (perThreadDataSizeForWholeThreadGroupNeeded >
perThreadDataSizeForWholeThreadGroupAllocated) {
alignedFree(perThreadDataForWholeThreadGroup);
@@ -405,7 +405,7 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
static_cast<uint16_t>(groupSizeY),
static_cast<uint16_t>(groupSizeZ)}},
std::array<uint8_t, 3>{{0, 1, 2}},
false, grfSize, gfxCoreHelper);
false, grfSize, rootDeviceEnvironment);
}
this->perThreadDataSize = perThreadDataSizeForWholeThreadGroup / numThreadsPerThreadGroup;
@@ -902,8 +902,8 @@ ze_result_t KernelImp::getProperties(ze_kernel_properties_t *pKernelProperties)
memset(pKernelProperties->uuid.mid, 0, ZE_MAX_MODULE_UUID_SIZE);
uint32_t maxKernelWorkGroupSize = static_cast<uint32_t>(this->module->getMaxGroupSize(kernelDescriptor));
maxKernelWorkGroupSize = gfxCoreHelper.adjustMaxWorkGroupSize(kernelDescriptor.kernelAttributes.numGrfRequired, kernelDescriptor.kernelAttributes.simdSize, !kernelRequiresGenerationOfLocalIdsByRuntime, maxKernelWorkGroupSize);
const auto &rootDeviceEnvironment = this->module->getDevice()->getNEODevice()->getRootDeviceEnvironment();
maxKernelWorkGroupSize = gfxCoreHelper.adjustMaxWorkGroupSize(kernelDescriptor.kernelAttributes.numGrfRequired, kernelDescriptor.kernelAttributes.simdSize, !kernelRequiresGenerationOfLocalIdsByRuntime, maxKernelWorkGroupSize, rootDeviceEnvironment);
pKernelProperties->maxNumSubgroups = maxKernelWorkGroupSize / kernelDescriptor.kernelAttributes.simdSize;
void *pNext = pKernelProperties->pNext;

View File

@@ -996,8 +996,9 @@ struct CmdlistAppendLaunchKernelWithImplicitArgsTests : CmdlistAppendLaunchKerne
result = commandList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
const auto &gfxCoreHelper = device->getGfxCoreHelper();
implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&expectedImplicitArgs, *kernelDescriptor, !kernelRequiresGenerationOfLocalIdsByRuntime, gfxCoreHelper);
const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
const auto &gfxCoreHelper = device->getNEODevice()->getGfxCoreHelper();
implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&expectedImplicitArgs, *kernelDescriptor, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
auto sizeCrossThreadData = kernel->getCrossThreadDataSize();
auto sizePerThreadDataForWholeGroup = kernel->getPerThreadDataSizeForWholeThreadGroup();
EXPECT_EQ(indirectHeap->getUsed(), alignUp(sizeCrossThreadData + sizePerThreadDataForWholeGroup + implicitArgsProgrammingSize, gfxCoreHelper.getIOHAlignment()));
@@ -1029,11 +1030,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, giv
auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgs::getSize(), MemoryConstants::cacheLineSize);
const auto &gfxCoreHelper = device->getNEODevice()->getGfxCoreHelper();
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, gfxCoreHelper);
const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, rootDeviceEnvironment);
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize();
size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, gfxCoreHelper);
size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
alignedFree(expectedLocalIds);
@@ -1075,11 +1076,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, giv
auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgs::getSize(), MemoryConstants::cacheLineSize);
const auto &gfxCoreHelper = device->getNEODevice()->getGfxCoreHelper();
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, gfxCoreHelper);
const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, rootDeviceEnvironment);
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize();
size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, gfxCoreHelper);
size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
alignedFree(expectedLocalIds);

View File

@@ -732,8 +732,8 @@ struct CommandListAppendLaunchKernelWithImplicitArgs : CommandListAppendLaunchKe
template <typename FamilyType>
uint64_t getIndirectHeapOffsetForImplicitArgsBuffer(const Mock<::L0::KernelImp> &kernel) {
if (FamilyType::supportsCmdSet(IGFX_XE_HP_CORE)) {
const auto &gfxCoreHelper = device->getGfxCoreHelper();
auto implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(kernel.pImplicitArgs.get(), kernel.getKernelDescriptor(), !kernel.kernelRequiresGenerationOfLocalIdsByRuntime, gfxCoreHelper);
const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
auto implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(kernel.pImplicitArgs.get(), kernel.getKernelDescriptor(), !kernel.kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
return implicitArgsProgrammingSize - ImplicitArgs::getSize();
} else {
return 0u;

View File

@@ -304,7 +304,9 @@ TEST_F(KernelImpSetGroupSizeTest, givenLocalIdGenerationByRuntimeEnabledWhenSett
mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime = true; // although it is enabled for SIMD 1, make sure it is enforced
mockKernel.descriptor.kernelAttributes.numLocalIdChannels = 3;
mockKernel.module = &mockModule;
auto grfSize = mockModule.getDevice()->getHwInfo().capabilityTable.grfSize;
const auto &device = mockModule.getDevice();
auto grfSize = device->getHwInfo().capabilityTable.grfSize;
const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
uint32_t groupSize[3] = {2, 3, 5};
auto ret = mockKernel.setGroupSize(groupSize[0], groupSize[1], groupSize[2]);
EXPECT_EQ(ZE_RESULT_SUCCESS, ret);
@@ -314,7 +316,8 @@ TEST_F(KernelImpSetGroupSizeTest, givenLocalIdGenerationByRuntimeEnabledWhenSett
mockKernel.descriptor.kernelAttributes.simdSize,
groupSize[0] * groupSize[1] * groupSize[2],
grfSize,
mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime);
mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime,
rootDeviceEnvironment);
auto perThreadDataSizeForWholeTGNeeded =
static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
mockKernel.descriptor.kernelAttributes.simdSize,
@@ -322,7 +325,7 @@ TEST_F(KernelImpSetGroupSizeTest, givenLocalIdGenerationByRuntimeEnabledWhenSett
mockKernel.descriptor.kernelAttributes.numLocalIdChannels,
groupSize[0] * groupSize[1] * groupSize[2],
!mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime,
gfxHelper));
rootDeviceEnvironment));
EXPECT_EQ(numThreadsPerTG, mockKernel.getNumThreadsPerThreadGroup());
EXPECT_EQ((perThreadDataSizeForWholeTGNeeded / numThreadsPerTG), mockKernel.perThreadDataSize);
@@ -1692,9 +1695,9 @@ TEST_F(KernelPropertiesTests, whenPassingKernelMaxGroupSizePropertiesStructToGet
ze_result_t res = kernel->getProperties(&kernelProperties);
EXPECT_EQ(ZE_RESULT_SUCCESS, res);
auto &gfxCoreHelper = module->getDevice()->getGfxCoreHelper();
uint32_t maxKernelWorkGroupSize = gfxCoreHelper.adjustMaxWorkGroupSize(kernelDescriptor.kernelAttributes.numGrfRequired, kernelDescriptor.kernelAttributes.simdSize, false, static_cast<uint32_t>(this->module->getMaxGroupSize(kernelDescriptor)));
auto &device = *module->getDevice();
auto &gfxCoreHelper = device.getGfxCoreHelper();
uint32_t maxKernelWorkGroupSize = gfxCoreHelper.adjustMaxWorkGroupSize(kernelDescriptor.kernelAttributes.numGrfRequired, kernelDescriptor.kernelAttributes.simdSize, false, static_cast<uint32_t>(this->module->getMaxGroupSize(kernelDescriptor)), device.getNEODevice()->getRootDeviceEnvironment());
EXPECT_EQ(maxKernelWorkGroupSize, maxGroupSizeProperties.maxGroupSize);
}