mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-04 15:53:45 +08:00
refactor: add param rootDeviceEnvironment to calculateNumThreadsPerThreadGroup
Signed-off-by: Katarzyna Cencelewska <katarzyna.cencelewska@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
ec009cf9e3
commit
dd1d52259e
@@ -381,13 +381,13 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
|
||||
auto &rootDeviceEnvironment = module->getDevice()->getNEODevice()->getRootDeviceEnvironment();
|
||||
auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
|
||||
this->numThreadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(
|
||||
simdSize, static_cast<uint32_t>(itemsInGroup), grfSize, !kernelRequiresGenerationOfLocalIdsByRuntime);
|
||||
simdSize, static_cast<uint32_t>(itemsInGroup), grfSize, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
|
||||
|
||||
if (kernelRequiresGenerationOfLocalIdsByRuntime) {
|
||||
auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;
|
||||
uint32_t perThreadDataSizeForWholeThreadGroupNeeded =
|
||||
static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
|
||||
simdSize, grfSize, numChannels, itemsInGroup, !kernelRequiresGenerationOfLocalIdsByRuntime, gfxCoreHelper));
|
||||
simdSize, grfSize, numChannels, itemsInGroup, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment));
|
||||
if (perThreadDataSizeForWholeThreadGroupNeeded >
|
||||
perThreadDataSizeForWholeThreadGroupAllocated) {
|
||||
alignedFree(perThreadDataForWholeThreadGroup);
|
||||
@@ -405,7 +405,7 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
|
||||
static_cast<uint16_t>(groupSizeY),
|
||||
static_cast<uint16_t>(groupSizeZ)}},
|
||||
std::array<uint8_t, 3>{{0, 1, 2}},
|
||||
false, grfSize, gfxCoreHelper);
|
||||
false, grfSize, rootDeviceEnvironment);
|
||||
}
|
||||
|
||||
this->perThreadDataSize = perThreadDataSizeForWholeThreadGroup / numThreadsPerThreadGroup;
|
||||
@@ -902,8 +902,8 @@ ze_result_t KernelImp::getProperties(ze_kernel_properties_t *pKernelProperties)
|
||||
memset(pKernelProperties->uuid.mid, 0, ZE_MAX_MODULE_UUID_SIZE);
|
||||
|
||||
uint32_t maxKernelWorkGroupSize = static_cast<uint32_t>(this->module->getMaxGroupSize(kernelDescriptor));
|
||||
|
||||
maxKernelWorkGroupSize = gfxCoreHelper.adjustMaxWorkGroupSize(kernelDescriptor.kernelAttributes.numGrfRequired, kernelDescriptor.kernelAttributes.simdSize, !kernelRequiresGenerationOfLocalIdsByRuntime, maxKernelWorkGroupSize);
|
||||
const auto &rootDeviceEnvironment = this->module->getDevice()->getNEODevice()->getRootDeviceEnvironment();
|
||||
maxKernelWorkGroupSize = gfxCoreHelper.adjustMaxWorkGroupSize(kernelDescriptor.kernelAttributes.numGrfRequired, kernelDescriptor.kernelAttributes.simdSize, !kernelRequiresGenerationOfLocalIdsByRuntime, maxKernelWorkGroupSize, rootDeviceEnvironment);
|
||||
pKernelProperties->maxNumSubgroups = maxKernelWorkGroupSize / kernelDescriptor.kernelAttributes.simdSize;
|
||||
|
||||
void *pNext = pKernelProperties->pNext;
|
||||
|
||||
@@ -996,8 +996,9 @@ struct CmdlistAppendLaunchKernelWithImplicitArgsTests : CmdlistAppendLaunchKerne
|
||||
result = commandList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false);
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
|
||||
|
||||
const auto &gfxCoreHelper = device->getGfxCoreHelper();
|
||||
implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&expectedImplicitArgs, *kernelDescriptor, !kernelRequiresGenerationOfLocalIdsByRuntime, gfxCoreHelper);
|
||||
const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
|
||||
const auto &gfxCoreHelper = device->getNEODevice()->getGfxCoreHelper();
|
||||
implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&expectedImplicitArgs, *kernelDescriptor, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
|
||||
auto sizeCrossThreadData = kernel->getCrossThreadDataSize();
|
||||
auto sizePerThreadDataForWholeGroup = kernel->getPerThreadDataSizeForWholeThreadGroup();
|
||||
EXPECT_EQ(indirectHeap->getUsed(), alignUp(sizeCrossThreadData + sizePerThreadDataForWholeGroup + implicitArgsProgrammingSize, gfxCoreHelper.getIOHAlignment()));
|
||||
@@ -1029,11 +1030,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, giv
|
||||
|
||||
auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
|
||||
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgs::getSize(), MemoryConstants::cacheLineSize);
|
||||
const auto &gfxCoreHelper = device->getNEODevice()->getGfxCoreHelper();
|
||||
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, gfxCoreHelper);
|
||||
const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
|
||||
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, rootDeviceEnvironment);
|
||||
|
||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize();
|
||||
size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, gfxCoreHelper);
|
||||
size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
|
||||
|
||||
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
|
||||
alignedFree(expectedLocalIds);
|
||||
@@ -1075,11 +1076,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, giv
|
||||
|
||||
auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
|
||||
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgs::getSize(), MemoryConstants::cacheLineSize);
|
||||
const auto &gfxCoreHelper = device->getNEODevice()->getGfxCoreHelper();
|
||||
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, gfxCoreHelper);
|
||||
const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
|
||||
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, rootDeviceEnvironment);
|
||||
|
||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize();
|
||||
size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, gfxCoreHelper);
|
||||
size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
|
||||
|
||||
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
|
||||
alignedFree(expectedLocalIds);
|
||||
|
||||
@@ -732,8 +732,8 @@ struct CommandListAppendLaunchKernelWithImplicitArgs : CommandListAppendLaunchKe
|
||||
template <typename FamilyType>
|
||||
uint64_t getIndirectHeapOffsetForImplicitArgsBuffer(const Mock<::L0::KernelImp> &kernel) {
|
||||
if (FamilyType::supportsCmdSet(IGFX_XE_HP_CORE)) {
|
||||
const auto &gfxCoreHelper = device->getGfxCoreHelper();
|
||||
auto implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(kernel.pImplicitArgs.get(), kernel.getKernelDescriptor(), !kernel.kernelRequiresGenerationOfLocalIdsByRuntime, gfxCoreHelper);
|
||||
const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
|
||||
auto implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(kernel.pImplicitArgs.get(), kernel.getKernelDescriptor(), !kernel.kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
|
||||
return implicitArgsProgrammingSize - ImplicitArgs::getSize();
|
||||
} else {
|
||||
return 0u;
|
||||
|
||||
@@ -304,7 +304,9 @@ TEST_F(KernelImpSetGroupSizeTest, givenLocalIdGenerationByRuntimeEnabledWhenSett
|
||||
mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime = true; // although it is enabled for SIMD 1, make sure it is enforced
|
||||
mockKernel.descriptor.kernelAttributes.numLocalIdChannels = 3;
|
||||
mockKernel.module = &mockModule;
|
||||
auto grfSize = mockModule.getDevice()->getHwInfo().capabilityTable.grfSize;
|
||||
const auto &device = mockModule.getDevice();
|
||||
auto grfSize = device->getHwInfo().capabilityTable.grfSize;
|
||||
const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
|
||||
uint32_t groupSize[3] = {2, 3, 5};
|
||||
auto ret = mockKernel.setGroupSize(groupSize[0], groupSize[1], groupSize[2]);
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, ret);
|
||||
@@ -314,7 +316,8 @@ TEST_F(KernelImpSetGroupSizeTest, givenLocalIdGenerationByRuntimeEnabledWhenSett
|
||||
mockKernel.descriptor.kernelAttributes.simdSize,
|
||||
groupSize[0] * groupSize[1] * groupSize[2],
|
||||
grfSize,
|
||||
mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime);
|
||||
mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime,
|
||||
rootDeviceEnvironment);
|
||||
auto perThreadDataSizeForWholeTGNeeded =
|
||||
static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
|
||||
mockKernel.descriptor.kernelAttributes.simdSize,
|
||||
@@ -322,7 +325,7 @@ TEST_F(KernelImpSetGroupSizeTest, givenLocalIdGenerationByRuntimeEnabledWhenSett
|
||||
mockKernel.descriptor.kernelAttributes.numLocalIdChannels,
|
||||
groupSize[0] * groupSize[1] * groupSize[2],
|
||||
!mockKernel.kernelRequiresGenerationOfLocalIdsByRuntime,
|
||||
gfxHelper));
|
||||
rootDeviceEnvironment));
|
||||
|
||||
EXPECT_EQ(numThreadsPerTG, mockKernel.getNumThreadsPerThreadGroup());
|
||||
EXPECT_EQ((perThreadDataSizeForWholeTGNeeded / numThreadsPerTG), mockKernel.perThreadDataSize);
|
||||
@@ -1692,9 +1695,9 @@ TEST_F(KernelPropertiesTests, whenPassingKernelMaxGroupSizePropertiesStructToGet
|
||||
|
||||
ze_result_t res = kernel->getProperties(&kernelProperties);
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, res);
|
||||
|
||||
auto &gfxCoreHelper = module->getDevice()->getGfxCoreHelper();
|
||||
uint32_t maxKernelWorkGroupSize = gfxCoreHelper.adjustMaxWorkGroupSize(kernelDescriptor.kernelAttributes.numGrfRequired, kernelDescriptor.kernelAttributes.simdSize, false, static_cast<uint32_t>(this->module->getMaxGroupSize(kernelDescriptor)));
|
||||
auto &device = *module->getDevice();
|
||||
auto &gfxCoreHelper = device.getGfxCoreHelper();
|
||||
uint32_t maxKernelWorkGroupSize = gfxCoreHelper.adjustMaxWorkGroupSize(kernelDescriptor.kernelAttributes.numGrfRequired, kernelDescriptor.kernelAttributes.simdSize, false, static_cast<uint32_t>(this->module->getMaxGroupSize(kernelDescriptor)), device.getNEODevice()->getRootDeviceEnvironment());
|
||||
EXPECT_EQ(maxKernelWorkGroupSize, maxGroupSizeProperties.maxGroupSize);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user