mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-03 14:55:24 +08:00
Handle SIMD-1 scenario when programming local ids for implicit args
According to the implicit args design, for SIMD-1 the local ids are laid out one-by-one. Resolves: NEO-6692 Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
28e89b2c30
commit
4f71aaf595
@@ -1020,12 +1020,13 @@ uint32_t KernelImp::getSizeForImplicitArgsPatching() const {
|
||||
}
|
||||
auto implicitArgsSize = static_cast<uint32_t>(sizeof(NEO::ImplicitArgs));
|
||||
const NEO::KernelDescriptor &kernelDescriptor = kernelImmData->getDescriptor();
|
||||
auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;
|
||||
auto simdSize = kernelDescriptor.kernelAttributes.simdSize;
|
||||
auto grfSize = NEO::ImplicitArgsHelper::getGrfSize(simdSize, this->module->getDevice()->getHwInfo().capabilityTable.grfSize);
|
||||
Vec3<size_t> groupSize{this->groupSize[0], this->groupSize[1], this->groupSize[2]};
|
||||
auto itemsInGroup = Math::computeTotalElementsCount(groupSize);
|
||||
uint32_t localIdsSizeNeeded =
|
||||
alignUp(static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
|
||||
kernelDescriptor.kernelAttributes.simdSize, grfSize, 3u, itemsInGroup)),
|
||||
simdSize, grfSize, 3u, itemsInGroup)),
|
||||
MemoryConstants::cacheLineSize);
|
||||
return implicitArgsSize + localIdsSizeNeeded;
|
||||
}
|
||||
@@ -1035,12 +1036,13 @@ void KernelImp::patchImplicitArgs(void *&pOut) const {
|
||||
return;
|
||||
}
|
||||
const auto &kernelAttributes = kernelImmData->getDescriptor().kernelAttributes;
|
||||
auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;
|
||||
auto simdSize = kernelAttributes.simdSize;
|
||||
auto grfSize = NEO::ImplicitArgsHelper::getGrfSize(simdSize, this->module->getDevice()->getHwInfo().capabilityTable.grfSize);
|
||||
auto dimensionOrder = NEO::ImplicitArgsHelper::getDimensionOrderForLocalIds(kernelAttributes.workgroupDimensionsOrder, kernelRequiresGenerationOfLocalIdsByRuntime, requiredWorkgroupOrder);
|
||||
|
||||
NEO::generateLocalIDs(
|
||||
pOut,
|
||||
static_cast<uint16_t>(kernelAttributes.simdSize),
|
||||
simdSize,
|
||||
std::array<uint16_t, 3>{{static_cast<uint16_t>(groupSize[0]),
|
||||
static_cast<uint16_t>(groupSize[1]),
|
||||
static_cast<uint16_t>(groupSize[2])}},
|
||||
|
||||
@@ -1077,6 +1077,78 @@ HWTEST_F(CmdlistAppendLaunchKernelTests, givenKernelWithImplicitArgsAndHwGenerat
|
||||
|
||||
alignedFree(expectedLocalIds);
|
||||
}
|
||||
|
||||
// Checks the indirect-heap layout produced by appendLaunchKernel for a kernel that
// requires implicit args and is compiled with simdSize == 1:
//   1. the heap begins with one uint16_t triple per work item (4 triples for a 2x2x1 group),
//   2. the local-id region is zero-padded up to a cache-line boundary,
//   3. the ImplicitArgs structure follows immediately after that region.
// Preconditions whose failure would make later pointer or size arithmetic invalid are
// checked with fatal ASSERT_* so the test stops instead of reading out of bounds.
HWTEST_F(CmdlistAppendLaunchKernelTests, givenKernelWithImplicitArgsWhenAppendLaunchKernelWithSimd1ThenLocalIdsAreGeneratedCorrectly) {
    // Describe a SIMD-1 kernel that needs implicit args, with walk order {2, 1, 0}.
    std::unique_ptr<MockImmutableData> mockKernelImmData = std::make_unique<MockImmutableData>(0u);
    auto kernelDescriptor = mockKernelImmData->kernelDescriptor;
    kernelDescriptor->kernelAttributes.flags.requiresImplicitArgs = true;
    kernelDescriptor->kernelAttributes.simdSize = 1u;
    kernelDescriptor->kernelAttributes.workgroupDimensionsOrder[0] = 2;
    kernelDescriptor->kernelAttributes.workgroupDimensionsOrder[1] = 1;
    kernelDescriptor->kernelAttributes.workgroupDimensionsOrder[2] = 0;
    createModuleFromBinary(0u, false, mockKernelImmData.get());

    auto kernel = std::make_unique<MockKernel>(module.get());

    ze_kernel_desc_t kernelDesc{ZE_STRUCTURE_TYPE_KERNEL_DESC};
    kernel->initialize(&kernelDesc);

    EXPECT_TRUE(kernel->getKernelDescriptor().kernelAttributes.flags.requiresImplicitArgs);
    ASSERT_NE(nullptr, kernel->getImplicitArgs());

    kernel->setGroupSize(2, 2, 1);

    ze_result_t result{};
    std::unique_ptr<L0::CommandList> commandList(CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, result));

    // Fatal: commandList is dereferenced right below.
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);
    ASSERT_NE(nullptr, commandList);

    // Zero the region that will receive local ids + implicit args so the padding check is meaningful.
    auto indirectHeap = commandList->commandContainer.getIndirectHeap(NEO::HeapType::INDIRECT_OBJECT);
    memset(indirectHeap->getSpace(0), 0, kernel->getSizeForImplicitArgsPatching());

    ze_group_count_t groupCount{1, 1, 1};
    result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);

    auto sizeCrossThreadData = kernel->getCrossThreadDataSize();
    auto sizePerThreadDataForWholeGroup = kernel->getPerThreadDataSizeForWholeThreadGroup();
    EXPECT_EQ(indirectHeap->getUsed(), sizeCrossThreadData + sizePerThreadDataForWholeGroup + kernel->getSizeForImplicitArgsPatching());

    ImplicitArgs expectedImplicitArgs{sizeof(ImplicitArgs)};
    expectedImplicitArgs.numWorkDim = 2;
    expectedImplicitArgs.simdWidth = 1;
    expectedImplicitArgs.localSizeX = 2;
    expectedImplicitArgs.localSizeY = 2;
    expectedImplicitArgs.localSizeZ = 1;
    expectedImplicitArgs.globalSizeX = 2;
    expectedImplicitArgs.globalSizeY = 2;
    expectedImplicitArgs.globalSizeZ = 1;
    expectedImplicitArgs.groupCountX = 1;
    expectedImplicitArgs.groupCountY = 1;
    expectedImplicitArgs.groupCountZ = 1;
    expectedImplicitArgs.localIdTablePtr = indirectHeap->getGraphicsAllocation()->getGpuAddress();
    expectedImplicitArgs.printfBufferPtr = kernel->getPrintfBufferAllocation()->getGpuAddress();

    auto sizeForImplicitArgPatching = kernel->getSizeForImplicitArgsPatching();

    EXPECT_LT(0u, sizeForImplicitArgPatching);

    auto localIdsProgrammingSize = sizeForImplicitArgPatching - sizeof(ImplicitArgs);

    // One triple per work item, stored back to back.
    uint16_t expectedLocalIds[][3] = {{0, 0, 0},
                                      {0, 1, 0},
                                      {0, 0, 1},
                                      {0, 1, 1}};

    uint8_t zeros[MemoryConstants::cacheLineSize]{};
    // Fatal: the padding length and the ImplicitArgs offset below are derived from this size;
    // if it is wrong (or underflowed), the memcmp calls would read past `zeros` / the heap data.
    ASSERT_EQ(localIdsProgrammingSize, alignUp(sizeof(expectedLocalIds), MemoryConstants::cacheLineSize));

    EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeap->getCpuBase(), sizeof(expectedLocalIds)));
    EXPECT_EQ(0, memcmp(zeros, ptrOffset(indirectHeap->getCpuBase(), sizeof(expectedLocalIds)), localIdsProgrammingSize - sizeof(expectedLocalIds)));
    auto pImplicitArgs = reinterpret_cast<ImplicitArgs *>(ptrOffset(indirectHeap->getCpuBase(), localIdsProgrammingSize));
    EXPECT_EQ(0, memcmp(&expectedImplicitArgs, pImplicitArgs, sizeof(ImplicitArgs)));
}
|
||||
|
||||
HWTEST_F(CmdlistAppendLaunchKernelTests, givenKernelWithoutImplicitArgsWhenAppendLaunchKernelThenImplicitArgsAreNotSentToIndirectHeap) {
|
||||
std::unique_ptr<MockImmutableData> mockKernelImmData = std::make_unique<MockImmutableData>(0u);
|
||||
auto kernelDescriptor = mockKernelImmData->kernelDescriptor;
|
||||
|
||||
Reference in New Issue
Block a user