Handle SIMD-1 scenario when programming local ids for implicit args

According to the implicit args design, for SIMD-1 the local ids are stored one by one (one entry per work-item).

Resolves: NEO-6692
Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
This commit is contained in:
Mateusz Jablonski
2022-02-17 12:41:06 +00:00
committed by Compute-Runtime-Automation
parent 28e89b2c30
commit 4f71aaf595
8 changed files with 184 additions and 12 deletions

View File

@@ -1020,12 +1020,13 @@ uint32_t KernelImp::getSizeForImplicitArgsPatching() const {
}
auto implicitArgsSize = static_cast<uint32_t>(sizeof(NEO::ImplicitArgs));
const NEO::KernelDescriptor &kernelDescriptor = kernelImmData->getDescriptor();
auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;
auto simdSize = kernelDescriptor.kernelAttributes.simdSize;
auto grfSize = NEO::ImplicitArgsHelper::getGrfSize(simdSize, this->module->getDevice()->getHwInfo().capabilityTable.grfSize);
Vec3<size_t> groupSize{this->groupSize[0], this->groupSize[1], this->groupSize[2]};
auto itemsInGroup = Math::computeTotalElementsCount(groupSize);
uint32_t localIdsSizeNeeded =
alignUp(static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
kernelDescriptor.kernelAttributes.simdSize, grfSize, 3u, itemsInGroup)),
simdSize, grfSize, 3u, itemsInGroup)),
MemoryConstants::cacheLineSize);
return implicitArgsSize + localIdsSizeNeeded;
}
@@ -1035,12 +1036,13 @@ void KernelImp::patchImplicitArgs(void *&pOut) const {
return;
}
const auto &kernelAttributes = kernelImmData->getDescriptor().kernelAttributes;
auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;
auto simdSize = kernelAttributes.simdSize;
auto grfSize = NEO::ImplicitArgsHelper::getGrfSize(simdSize, this->module->getDevice()->getHwInfo().capabilityTable.grfSize);
auto dimensionOrder = NEO::ImplicitArgsHelper::getDimensionOrderForLocalIds(kernelAttributes.workgroupDimensionsOrder, kernelRequiresGenerationOfLocalIdsByRuntime, requiredWorkgroupOrder);
NEO::generateLocalIDs(
pOut,
static_cast<uint16_t>(kernelAttributes.simdSize),
simdSize,
std::array<uint16_t, 3>{{static_cast<uint16_t>(groupSize[0]),
static_cast<uint16_t>(groupSize[1]),
static_cast<uint16_t>(groupSize[2])}},

View File

@@ -1077,6 +1077,78 @@ HWTEST_F(CmdlistAppendLaunchKernelTests, givenKernelWithImplicitArgsAndHwGenerat
alignedFree(expectedLocalIds);
}
// Launches a SIMD-1 kernel that requires implicit args with a 2x2x1 work-group and checks
// the indirect heap contents: local ids first (padded with zeros up to a cache line),
// immediately followed by the ImplicitArgs structure.
HWTEST_F(CmdlistAppendLaunchKernelTests, givenKernelWithImplicitArgsWhenAppendLaunchKernelWithSimd1ThenLocalIdsAreGeneratedCorrectly) {
    // Arrange: kernel descriptor with implicit args enabled, SIMD-1 and dimension order {2, 1, 0}.
    auto immData = std::make_unique<MockImmutableData>(0u);
    auto &attributes = immData->kernelDescriptor->kernelAttributes;
    attributes.flags.requiresImplicitArgs = true;
    attributes.simdSize = 1u;
    attributes.workgroupDimensionsOrder[0] = 2;
    attributes.workgroupDimensionsOrder[1] = 1;
    attributes.workgroupDimensionsOrder[2] = 0;
    createModuleFromBinary(0u, false, immData.get());

    auto kernel = std::make_unique<MockKernel>(module.get());
    ze_kernel_desc_t kernelDesc{ZE_STRUCTURE_TYPE_KERNEL_DESC};
    kernel->initialize(&kernelDesc);
    EXPECT_TRUE(kernel->getKernelDescriptor().kernelAttributes.flags.requiresImplicitArgs);
    ASSERT_NE(nullptr, kernel->getImplicitArgs());
    kernel->setGroupSize(2, 2, 1);

    ze_result_t result{};
    std::unique_ptr<L0::CommandList> commandList(CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, result));
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);

    // Zero the region that will receive the local ids and implicit args so the padding can be verified afterwards.
    auto heap = commandList->commandContainer.getIndirectHeap(NEO::HeapType::INDIRECT_OBJECT);
    memset(heap->getSpace(0), 0, kernel->getSizeForImplicitArgsPatching());

    // Act: launch a single work-group.
    ze_group_count_t groupCount{1, 1, 1};
    result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);

    // Assert: the heap consumption covers cross-thread data, per-thread data and the implicit args patching area.
    const auto implicitArgsPatchingSize = kernel->getSizeForImplicitArgsPatching();
    const auto crossThreadSize = kernel->getCrossThreadDataSize();
    const auto perThreadSize = kernel->getPerThreadDataSizeForWholeThreadGroup();
    EXPECT_EQ(heap->getUsed(), crossThreadSize + perThreadSize + implicitArgsPatchingSize);

    ImplicitArgs expectedImplicitArgs{sizeof(ImplicitArgs)};
    expectedImplicitArgs.numWorkDim = 2;
    expectedImplicitArgs.simdWidth = 1;
    expectedImplicitArgs.localSizeX = 2;
    expectedImplicitArgs.localSizeY = 2;
    expectedImplicitArgs.localSizeZ = 1;
    expectedImplicitArgs.globalSizeX = 2;
    expectedImplicitArgs.globalSizeY = 2;
    expectedImplicitArgs.globalSizeZ = 1;
    expectedImplicitArgs.groupCountX = 1;
    expectedImplicitArgs.groupCountY = 1;
    expectedImplicitArgs.groupCountZ = 1;
    expectedImplicitArgs.localIdTablePtr = heap->getGraphicsAllocation()->getGpuAddress();
    expectedImplicitArgs.printfBufferPtr = kernel->getPrintfBufferAllocation()->getGpuAddress();

    EXPECT_LT(0u, implicitArgsPatchingSize);
    auto localIdsProgrammingSize = implicitArgsPatchingSize - sizeof(ImplicitArgs);

    // One 3-component uint16_t entry per work-item (4 work-items in the 2x2x1 group).
    uint16_t expectedLocalIds[][3] = {{0, 0, 0},
                                      {0, 1, 0},
                                      {0, 0, 1},
                                      {0, 1, 1}};
    uint8_t zeros[MemoryConstants::cacheLineSize]{};
    EXPECT_EQ(localIdsProgrammingSize, alignUp(sizeof(expectedLocalIds), MemoryConstants::cacheLineSize));

    // Local ids sit at the heap base; the rest of the local ids area must remain zeroed.
    auto heapBase = heap->getCpuBase();
    const auto paddingSize = localIdsProgrammingSize - sizeof(expectedLocalIds);
    EXPECT_EQ(0, memcmp(expectedLocalIds, heapBase, sizeof(expectedLocalIds)));
    EXPECT_EQ(0, memcmp(zeros, ptrOffset(heapBase, sizeof(expectedLocalIds)), paddingSize));

    // The ImplicitArgs structure follows directly after the local ids area.
    auto programmedImplicitArgs = reinterpret_cast<ImplicitArgs *>(ptrOffset(heapBase, localIdsProgrammingSize));
    EXPECT_EQ(0, memcmp(&expectedImplicitArgs, programmedImplicitArgs, sizeof(ImplicitArgs)));
}
HWTEST_F(CmdlistAppendLaunchKernelTests, givenKernelWithoutImplicitArgsWhenAppendLaunchKernelThenImplicitArgsAreNotSentToIndirectHeap) {
std::unique_ptr<MockImmutableData> mockKernelImmData = std::make_unique<MockImmutableData>(0u);
auto kernelDescriptor = mockKernelImmData->kernelDescriptor;