Handle SIMD-1 scenario when programming local ids for implicit args

according to implicit args design for SIMD-1 local ids are one-by-one

Resolves: NEO-6692
Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
This commit is contained in:
Mateusz Jablonski
2022-02-17 12:41:06 +00:00
committed by Compute-Runtime-Automation
parent 28e89b2c30
commit 4f71aaf595
8 changed files with 184 additions and 12 deletions

View File

@@ -51,11 +51,13 @@ size_t HardwareCommandsHelper<GfxFamily>::getSizeRequiredIOH(const Kernel &kerne
auto numChannels = kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels;
uint32_t grfSize = sizeof(typename GfxFamily::GRF);
auto simdSize = kernelInfo.getMaxSimdSize();
auto size = kernel.getCrossThreadDataSize() +
getPerThreadDataSizeTotal(kernelInfo.getMaxSimdSize(), grfSize, numChannels, localWorkSize);
getPerThreadDataSizeTotal(simdSize, grfSize, numChannels, localWorkSize);
if (kernel.getImplicitArgs()) {
size += sizeof(ImplicitArgs) + alignUp(getPerThreadDataSizeTotal(kernelInfo.getMaxSimdSize(), grfSize, 3u, localWorkSize), MemoryConstants::cacheLineSize);
auto grfSizeForImplicitArgs = ImplicitArgsHelper::getGrfSize(simdSize, grfSize);
size += sizeof(ImplicitArgs) + alignUp(getPerThreadDataSizeTotal(simdSize, grfSizeForImplicitArgs, 3u, localWorkSize), MemoryConstants::cacheLineSize);
}
return alignUp(size, WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
}
@@ -218,7 +220,7 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
auto pImplicitArgs = kernel.getImplicitArgs();
if (pImplicitArgs) {
constexpr uint32_t grfSize = sizeof(typename GfxFamily::GRF);
auto grfSize = ImplicitArgsHelper::getGrfSize(simd, sizeof(typename GfxFamily::GRF));
const auto &kernelAttributes = kernelInfo.kernelDescriptor.kernelAttributes;
uint32_t requiredWalkOrder = 0u;
auto generationOfLocalIdsByRuntime = EncodeDispatchKernel<GfxFamily>::isRuntimeLocalIdsGenerationRequired(

View File

@@ -1561,6 +1561,7 @@ HWTEST_F(DispatchWalkerTest, WhenKernelRequiresImplicitArgsThenIohRequiresMoreSp
Vec3<size_t> localWorkgroupSize(workGroupSize);
auto blockedCommandsData = createBlockedCommandsData(*pCmdQ);
kernelInfo.kernelDescriptor.kernelAttributes.simdSize = 1u;
kernelInfo.kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs = false;
MockKernel kernelWithoutImplicitArgs(program.get(), kernelInfo, *pClDevice);
ASSERT_EQ(CL_SUCCESS, kernelWithoutImplicitArgs.initialize());
@@ -1609,11 +1610,13 @@ HWTEST_F(DispatchWalkerTest, WhenKernelRequiresImplicitArgsThenIohRequiresMoreSp
{
auto numChannels = kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels;
auto simdSize = kernelInfo.getMaxSimdSize();
uint32_t grfSize = sizeof(typename FamilyType::GRF);
auto grfSizeForImplicitArgs = ImplicitArgsHelper::getGrfSize(simdSize, grfSize);
auto size = kernelWithImplicitArgs.getCrossThreadDataSize() +
HardwareCommandsHelper<FamilyType>::getPerThreadDataSizeTotal(kernelInfo.getMaxSimdSize(), grfSize, numChannels, Math::computeTotalElementsCount(localWorkgroupSize)) +
HardwareCommandsHelper<FamilyType>::getPerThreadDataSizeTotal(simdSize, grfSize, numChannels, Math::computeTotalElementsCount(localWorkgroupSize)) +
sizeof(ImplicitArgs) +
alignUp(HardwareCommandsHelper<FamilyType>::getPerThreadDataSizeTotal(kernelInfo.getMaxSimdSize(), grfSize, 3u, Math::computeTotalElementsCount(localWorkgroupSize)), MemoryConstants::cacheLineSize);
alignUp(HardwareCommandsHelper<FamilyType>::getPerThreadDataSizeTotal(simdSize, grfSizeForImplicitArgs, 3u, Math::computeTotalElementsCount(localWorkgroupSize)), MemoryConstants::cacheLineSize);
size = alignUp(size, MemoryConstants::cacheLineSize);
EXPECT_EQ(size, iohSizeWithImplicitArgs);

View File

@@ -1400,6 +1400,78 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithI
EXPECT_EQ(ioh.getGraphicsAllocation()->getGpuAddress(), pImplicitArgs->localIdTablePtr);
}
HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithImplicitArgsWhenSendingIndirectStateWithSimd1ThenLocalIdsAreGeneratedCorrectly) {
DebugManagerStateRestore restorer;
DebugManager.flags.EnableHwGenerationLocalIds.set(0);
auto pKernelInfo = std::make_unique<MockKernelInfo>();
uint32_t simd = 1;
pKernelInfo->kernelDescriptor.kernelAttributes.simdSize = simd;
pKernelInfo->kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs = true;
pKernelInfo->kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[0] = 2;
pKernelInfo->kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[1] = 1;
pKernelInfo->kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[2] = 0;
MockContext context(pClDevice);
CommandQueueHw<FamilyType> cmdQ(&context, pClDevice, 0, false);
MockProgram program(&context, false, toClDeviceVector(*pClDevice));
MockKernel kernel(&program, *pKernelInfo, *pClDevice);
ASSERT_EQ(CL_SUCCESS, kernel.initialize());
const size_t localWorkSizeX = 2;
const size_t localWorkSizeY = 2;
const size_t localWorkSizeZ = 1;
const size_t localWorkSizes[3]{localWorkSizeX, localWorkSizeY, localWorkSizeZ};
auto &commandStream = cmdQ.getCS(1024);
auto pWalkerCmd = reinterpret_cast<typename FamilyType::WALKER_TYPE *>(commandStream.getSpace(0));
auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::Type::DYNAMIC_STATE, 8192);
auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::Type::INDIRECT_OBJECT, 8192);
auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::Type::SURFACE_STATE, 8192);
dsh.align(EncodeStates<FamilyType>::alignInterfaceDescriptorData);
auto interfaceDescriptor = reinterpret_cast<typename FamilyType::INTERFACE_DESCRIPTOR_DATA *>(dsh.getSpace(0));
uint32_t interfaceDescriptorIndex = 0u;
HardwareCommandsHelper<FamilyType>::sendIndirectState(
commandStream,
dsh,
ioh,
ssh,
kernel,
0u,
simd,
localWorkSizes,
0u,
interfaceDescriptorIndex,
pDevice->getPreemptionMode(),
pWalkerCmd,
interfaceDescriptor,
false,
*pDevice);
uint32_t grfSize = ImplicitArgsHelper::getGrfSize(simd, sizeof(typename FamilyType::GRF));
EXPECT_EQ(3 * sizeof(uint16_t), grfSize);
size_t localWorkSize = localWorkSizeX * localWorkSizeY * localWorkSizeZ;
size_t expectedLocalIdsSize = PerThreadDataHelper::getPerThreadDataSizeTotal(simd, grfSize, 3u, localWorkSize);
ASSERT_LE(expectedLocalIdsSize, ioh.getUsed());
uint16_t expectedLocalIds[][3] = {{0, 0, 0},
{0, 1, 0},
{0, 0, 1},
{0, 1, 1}};
EXPECT_EQ(expectedLocalIdsSize, sizeof(expectedLocalIds));
EXPECT_EQ(0, memcmp(expectedLocalIds, ioh.getCpuBase(), sizeof(expectedLocalIds)));
auto localIdsProgrammingSize = alignUp(sizeof(expectedLocalIds), MemoryConstants::cacheLineSize);
ASSERT_LE(localIdsProgrammingSize + sizeof(ImplicitArgs), ioh.getUsed());
auto pImplicitArgs = reinterpret_cast<ImplicitArgs *>(ptrOffset(ioh.getCpuBase(), localIdsProgrammingSize));
EXPECT_EQ(ioh.getGraphicsAllocation()->getGpuAddress(), pImplicitArgs->localIdTablePtr);
}
using HardwareCommandsTestXeHpAndLater = HardwareCommandsTest;
HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsTestXeHpAndLater, givenIndirectHeapNotAllocatedFromInternalPoolWhenSendCrossThreadDataIsCalledThenOffsetZeroIsReturned) {