mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-03 14:55:24 +08:00
Handle SIMD-1 scenario when programming local ids for implicit args
According to the implicit args design, local ids for SIMD-1 are programmed one-by-one. Resolves: NEO-6692 Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
28e89b2c30
commit
4f71aaf595
@@ -51,11 +51,13 @@ size_t HardwareCommandsHelper<GfxFamily>::getSizeRequiredIOH(const Kernel &kerne
|
||||
|
||||
auto numChannels = kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels;
|
||||
uint32_t grfSize = sizeof(typename GfxFamily::GRF);
|
||||
auto simdSize = kernelInfo.getMaxSimdSize();
|
||||
auto size = kernel.getCrossThreadDataSize() +
|
||||
getPerThreadDataSizeTotal(kernelInfo.getMaxSimdSize(), grfSize, numChannels, localWorkSize);
|
||||
getPerThreadDataSizeTotal(simdSize, grfSize, numChannels, localWorkSize);
|
||||
|
||||
if (kernel.getImplicitArgs()) {
|
||||
size += sizeof(ImplicitArgs) + alignUp(getPerThreadDataSizeTotal(kernelInfo.getMaxSimdSize(), grfSize, 3u, localWorkSize), MemoryConstants::cacheLineSize);
|
||||
auto grfSizeForImplicitArgs = ImplicitArgsHelper::getGrfSize(simdSize, grfSize);
|
||||
size += sizeof(ImplicitArgs) + alignUp(getPerThreadDataSizeTotal(simdSize, grfSizeForImplicitArgs, 3u, localWorkSize), MemoryConstants::cacheLineSize);
|
||||
}
|
||||
return alignUp(size, WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
|
||||
}
|
||||
@@ -218,7 +220,7 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
|
||||
|
||||
auto pImplicitArgs = kernel.getImplicitArgs();
|
||||
if (pImplicitArgs) {
|
||||
constexpr uint32_t grfSize = sizeof(typename GfxFamily::GRF);
|
||||
auto grfSize = ImplicitArgsHelper::getGrfSize(simd, sizeof(typename GfxFamily::GRF));
|
||||
const auto &kernelAttributes = kernelInfo.kernelDescriptor.kernelAttributes;
|
||||
uint32_t requiredWalkOrder = 0u;
|
||||
auto generationOfLocalIdsByRuntime = EncodeDispatchKernel<GfxFamily>::isRuntimeLocalIdsGenerationRequired(
|
||||
|
||||
@@ -1561,6 +1561,7 @@ HWTEST_F(DispatchWalkerTest, WhenKernelRequiresImplicitArgsThenIohRequiresMoreSp
|
||||
Vec3<size_t> localWorkgroupSize(workGroupSize);
|
||||
auto blockedCommandsData = createBlockedCommandsData(*pCmdQ);
|
||||
|
||||
kernelInfo.kernelDescriptor.kernelAttributes.simdSize = 1u;
|
||||
kernelInfo.kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs = false;
|
||||
MockKernel kernelWithoutImplicitArgs(program.get(), kernelInfo, *pClDevice);
|
||||
ASSERT_EQ(CL_SUCCESS, kernelWithoutImplicitArgs.initialize());
|
||||
@@ -1609,11 +1610,13 @@ HWTEST_F(DispatchWalkerTest, WhenKernelRequiresImplicitArgsThenIohRequiresMoreSp
|
||||
|
||||
{
|
||||
auto numChannels = kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels;
|
||||
auto simdSize = kernelInfo.getMaxSimdSize();
|
||||
uint32_t grfSize = sizeof(typename FamilyType::GRF);
|
||||
auto grfSizeForImplicitArgs = ImplicitArgsHelper::getGrfSize(simdSize, grfSize);
|
||||
auto size = kernelWithImplicitArgs.getCrossThreadDataSize() +
|
||||
HardwareCommandsHelper<FamilyType>::getPerThreadDataSizeTotal(kernelInfo.getMaxSimdSize(), grfSize, numChannels, Math::computeTotalElementsCount(localWorkgroupSize)) +
|
||||
HardwareCommandsHelper<FamilyType>::getPerThreadDataSizeTotal(simdSize, grfSize, numChannels, Math::computeTotalElementsCount(localWorkgroupSize)) +
|
||||
sizeof(ImplicitArgs) +
|
||||
alignUp(HardwareCommandsHelper<FamilyType>::getPerThreadDataSizeTotal(kernelInfo.getMaxSimdSize(), grfSize, 3u, Math::computeTotalElementsCount(localWorkgroupSize)), MemoryConstants::cacheLineSize);
|
||||
alignUp(HardwareCommandsHelper<FamilyType>::getPerThreadDataSizeTotal(simdSize, grfSizeForImplicitArgs, 3u, Math::computeTotalElementsCount(localWorkgroupSize)), MemoryConstants::cacheLineSize);
|
||||
|
||||
size = alignUp(size, MemoryConstants::cacheLineSize);
|
||||
EXPECT_EQ(size, iohSizeWithImplicitArgs);
|
||||
|
||||
@@ -1400,6 +1400,78 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithI
|
||||
EXPECT_EQ(ioh.getGraphicsAllocation()->getGpuAddress(), pImplicitArgs->localIdTablePtr);
|
||||
}
|
||||
|
||||
HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithImplicitArgsWhenSendingIndirectStateWithSimd1ThenLocalIdsAreGeneratedCorrectly) {
|
||||
DebugManagerStateRestore restorer;
|
||||
DebugManager.flags.EnableHwGenerationLocalIds.set(0);
|
||||
auto pKernelInfo = std::make_unique<MockKernelInfo>();
|
||||
uint32_t simd = 1;
|
||||
pKernelInfo->kernelDescriptor.kernelAttributes.simdSize = simd;
|
||||
pKernelInfo->kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs = true;
|
||||
pKernelInfo->kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[0] = 2;
|
||||
pKernelInfo->kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[1] = 1;
|
||||
pKernelInfo->kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[2] = 0;
|
||||
|
||||
MockContext context(pClDevice);
|
||||
CommandQueueHw<FamilyType> cmdQ(&context, pClDevice, 0, false);
|
||||
MockProgram program(&context, false, toClDeviceVector(*pClDevice));
|
||||
|
||||
MockKernel kernel(&program, *pKernelInfo, *pClDevice);
|
||||
ASSERT_EQ(CL_SUCCESS, kernel.initialize());
|
||||
|
||||
const size_t localWorkSizeX = 2;
|
||||
const size_t localWorkSizeY = 2;
|
||||
const size_t localWorkSizeZ = 1;
|
||||
const size_t localWorkSizes[3]{localWorkSizeX, localWorkSizeY, localWorkSizeZ};
|
||||
|
||||
auto &commandStream = cmdQ.getCS(1024);
|
||||
auto pWalkerCmd = reinterpret_cast<typename FamilyType::WALKER_TYPE *>(commandStream.getSpace(0));
|
||||
|
||||
auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::Type::DYNAMIC_STATE, 8192);
|
||||
auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::Type::INDIRECT_OBJECT, 8192);
|
||||
auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::Type::SURFACE_STATE, 8192);
|
||||
|
||||
dsh.align(EncodeStates<FamilyType>::alignInterfaceDescriptorData);
|
||||
auto interfaceDescriptor = reinterpret_cast<typename FamilyType::INTERFACE_DESCRIPTOR_DATA *>(dsh.getSpace(0));
|
||||
uint32_t interfaceDescriptorIndex = 0u;
|
||||
|
||||
HardwareCommandsHelper<FamilyType>::sendIndirectState(
|
||||
commandStream,
|
||||
dsh,
|
||||
ioh,
|
||||
ssh,
|
||||
kernel,
|
||||
0u,
|
||||
simd,
|
||||
localWorkSizes,
|
||||
0u,
|
||||
interfaceDescriptorIndex,
|
||||
pDevice->getPreemptionMode(),
|
||||
pWalkerCmd,
|
||||
interfaceDescriptor,
|
||||
false,
|
||||
*pDevice);
|
||||
|
||||
uint32_t grfSize = ImplicitArgsHelper::getGrfSize(simd, sizeof(typename FamilyType::GRF));
|
||||
|
||||
EXPECT_EQ(3 * sizeof(uint16_t), grfSize);
|
||||
size_t localWorkSize = localWorkSizeX * localWorkSizeY * localWorkSizeZ;
|
||||
size_t expectedLocalIdsSize = PerThreadDataHelper::getPerThreadDataSizeTotal(simd, grfSize, 3u, localWorkSize);
|
||||
ASSERT_LE(expectedLocalIdsSize, ioh.getUsed());
|
||||
|
||||
uint16_t expectedLocalIds[][3] = {{0, 0, 0},
|
||||
{0, 1, 0},
|
||||
{0, 0, 1},
|
||||
{0, 1, 1}};
|
||||
EXPECT_EQ(expectedLocalIdsSize, sizeof(expectedLocalIds));
|
||||
|
||||
EXPECT_EQ(0, memcmp(expectedLocalIds, ioh.getCpuBase(), sizeof(expectedLocalIds)));
|
||||
|
||||
auto localIdsProgrammingSize = alignUp(sizeof(expectedLocalIds), MemoryConstants::cacheLineSize);
|
||||
ASSERT_LE(localIdsProgrammingSize + sizeof(ImplicitArgs), ioh.getUsed());
|
||||
auto pImplicitArgs = reinterpret_cast<ImplicitArgs *>(ptrOffset(ioh.getCpuBase(), localIdsProgrammingSize));
|
||||
EXPECT_EQ(ioh.getGraphicsAllocation()->getGpuAddress(), pImplicitArgs->localIdTablePtr);
|
||||
}
|
||||
|
||||
// Alias of HardwareCommandsTest, used as the fixture name for the tests that follow.
using HardwareCommandsTestXeHpAndLater = HardwareCommandsTest;
|
||||
|
||||
HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsTestXeHpAndLater, givenIndirectHeapNotAllocatedFromInternalPoolWhenSendCrossThreadDataIsCalledThenOffsetZeroIsReturned) {
|
||||
|
||||
Reference in New Issue
Block a user