Unify implicit args programming across APIs

Related-To: NEO-5081
Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
This commit is contained in:
Mateusz Jablonski
2022-02-22 12:24:30 +00:00
committed by Compute-Runtime-Automation
parent d24c6cedfb
commit ea6f089e17
17 changed files with 394 additions and 541 deletions

View File

@@ -1013,44 +1013,4 @@ uint32_t KernelImp::getSchedulingHintExp() {
} }
return this->schedulingHintExpFlag; return this->schedulingHintExpFlag;
} }
uint32_t KernelImp::getSizeForImplicitArgsPatching() const {
if (!pImplicitArgs) {
return 0;
}
auto implicitArgsSize = static_cast<uint32_t>(sizeof(NEO::ImplicitArgs));
const NEO::KernelDescriptor &kernelDescriptor = kernelImmData->getDescriptor();
auto simdSize = kernelDescriptor.kernelAttributes.simdSize;
auto grfSize = NEO::ImplicitArgsHelper::getGrfSize(simdSize, this->module->getDevice()->getHwInfo().capabilityTable.grfSize);
Vec3<size_t> groupSize{this->groupSize[0], this->groupSize[1], this->groupSize[2]};
auto itemsInGroup = Math::computeTotalElementsCount(groupSize);
uint32_t localIdsSizeNeeded =
alignUp(static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
simdSize, grfSize, 3u, itemsInGroup)),
MemoryConstants::cacheLineSize);
return implicitArgsSize + localIdsSizeNeeded;
}
// Writes the implicit-args payload at pOut and advances pOut past it.
// Layout written: generated local ids first, then a copy of the kernel's
// NEO::ImplicitArgs structure. The gap between the two is whatever
// getSizeForImplicitArgsPatching() reports minus sizeof(NEO::ImplicitArgs).
// Does nothing (pOut untouched) when the kernel has no implicit args.
void KernelImp::patchImplicitArgs(void *&pOut) const {
    if (!pImplicitArgs) {
        return;
    }

    const auto &attributes = kernelImmData->getDescriptor().kernelAttributes;
    auto simd = attributes.simdSize;
    auto grf = NEO::ImplicitArgsHelper::getGrfSize(simd, this->module->getDevice()->getHwInfo().capabilityTable.grfSize);

    // Dimension order depends on whether local ids are generated by the runtime
    // and on the required workgroup order stored on the kernel.
    auto localIdOrder = NEO::ImplicitArgsHelper::getDimensionOrderForLocalIds(
        attributes.workgroupDimensionsOrder,
        kernelRequiresGenerationOfLocalIdsByRuntime,
        requiredWorkgroupOrder);

    const std::array<uint16_t, 3> localSizes{{static_cast<uint16_t>(groupSize[0]),
                                              static_cast<uint16_t>(groupSize[1]),
                                              static_cast<uint16_t>(groupSize[2])}};

    NEO::generateLocalIDs(pOut, simd, localSizes, localIdOrder, false, grf);

    // Skip over the whole local-id region before placing the struct.
    const auto implicitArgsBytes = sizeof(NEO::ImplicitArgs);
    const auto localIdsBytes = getSizeForImplicitArgsPatching() - implicitArgsBytes;
    pOut = ptrOffset(pOut, localIdsBytes);

    memcpy_s(pOut, implicitArgsBytes, pImplicitArgs.get(), implicitArgsBytes);
    pOut = ptrOffset(pOut, implicitArgsBytes);
}
} // namespace L0 } // namespace L0

View File

@@ -154,8 +154,6 @@ struct KernelImp : Kernel {
uint32_t getSchedulingHintExp(); uint32_t getSchedulingHintExp();
NEO::ImplicitArgs *getImplicitArgs() const override { return pImplicitArgs.get(); } NEO::ImplicitArgs *getImplicitArgs() const override { return pImplicitArgs.get(); }
uint32_t getSizeForImplicitArgsPatching() const override;
void patchImplicitArgs(void *&pOut) const override;
protected: protected:
KernelImp() = default; KernelImp() = default;

View File

@@ -648,6 +648,9 @@ HWTEST_F(CommandListAppendLaunchKernel, givenIndirectDispatchWithImplicitArgsWhe
kernel.pImplicitArgs.reset(new ImplicitArgs()); kernel.pImplicitArgs.reset(new ImplicitArgs());
kernel.setGroupSize(1, 1, 1); kernel.setGroupSize(1, 1, 1);
auto implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(kernel.pImplicitArgs.get(), kernel.getKernelDescriptor(), neoDevice->getHardwareInfo());
ze_result_t returnValue; ze_result_t returnValue;
std::unique_ptr<L0::CommandList> commandList(L0::CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, returnValue)); std::unique_ptr<L0::CommandList> commandList(L0::CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, returnValue));
@@ -661,7 +664,7 @@ HWTEST_F(CommandListAppendLaunchKernel, givenIndirectDispatchWithImplicitArgsWhe
nullptr, 0, nullptr); nullptr, 0, nullptr);
EXPECT_EQ(result, ZE_RESULT_SUCCESS); EXPECT_EQ(result, ZE_RESULT_SUCCESS);
auto heap = commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT); auto heap = commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT);
uint64_t pImplicitArgsGPUVA = heap->getGraphicsAllocation()->getGpuAddress() + kernel.getSizeForImplicitArgsPatching() - sizeof(ImplicitArgs); uint64_t pImplicitArgsGPUVA = heap->getGraphicsAllocation()->getGpuAddress() + implicitArgsProgrammingSize - sizeof(ImplicitArgs);
auto workDimStoreRegisterMemCmd = FamilyType::cmdInitStoreRegisterMem; auto workDimStoreRegisterMemCmd = FamilyType::cmdInitStoreRegisterMem;
workDimStoreRegisterMemCmd.setRegisterAddress(CS_GPR_R0); workDimStoreRegisterMemCmd.setRegisterAddress(CS_GPR_R0);
workDimStoreRegisterMemCmd.setMemoryAddress(pImplicitArgsGPUVA); workDimStoreRegisterMemCmd.setMemoryAddress(pImplicitArgsGPUVA);

View File

@@ -8,6 +8,7 @@
#include "shared/source/command_container/implicit_scaling.h" #include "shared/source/command_container/implicit_scaling.h"
#include "shared/source/gen9/reg_configs.h" #include "shared/source/gen9/reg_configs.h"
#include "shared/source/helpers/local_id_gen.h" #include "shared/source/helpers/local_id_gen.h"
#include "shared/source/helpers/per_thread_data.h"
#include "shared/source/utilities/software_tags_manager.h" #include "shared/source/utilities/software_tags_manager.h"
#include "shared/test/common/cmd_parse/gen_cmd_parse.h" #include "shared/test/common/cmd_parse/gen_cmd_parse.h"
#include "shared/test/common/mocks/mock_compilers.h" #include "shared/test/common/mocks/mock_compilers.h"
@@ -919,234 +920,161 @@ HWTEST_F(CommandListArbitrationPolicyTest, whenCommandListIsResetThenOriginalThr
} }
using CmdlistAppendLaunchKernelTests = Test<ModuleImmutableDataFixture>; using CmdlistAppendLaunchKernelTests = Test<ModuleImmutableDataFixture>;
HWTEST_F(CmdlistAppendLaunchKernelTests, givenKernelWithImplicitArgsWhenAppendLaunchKernelThenImplicitArgsAreSentToIndirectHeap) { struct CmdlistAppendLaunchKernelWithImplicitArgsTests : CmdlistAppendLaunchKernelTests {
std::unique_ptr<MockImmutableData> mockKernelImmData = std::make_unique<MockImmutableData>(0u);
auto kernelDescriptor = mockKernelImmData->kernelDescriptor;
kernelDescriptor->kernelAttributes.flags.requiresImplicitArgs = true;
auto simd = kernelDescriptor->kernelAttributes.simdSize;
kernelDescriptor->kernelAttributes.workgroupDimensionsOrder[0] = 2;
kernelDescriptor->kernelAttributes.workgroupDimensionsOrder[1] = 1;
kernelDescriptor->kernelAttributes.workgroupDimensionsOrder[2] = 0;
createModuleFromBinary(0u, false, mockKernelImmData.get());
auto kernel = std::make_unique<MockKernel>(module.get()); void SetUp() override {
CmdlistAppendLaunchKernelTests::SetUp();
expectedImplicitArgs.numWorkDim = 3;
expectedImplicitArgs.simdWidth = 32;
expectedImplicitArgs.localSizeX = 2;
expectedImplicitArgs.localSizeY = 3;
expectedImplicitArgs.localSizeZ = 4;
expectedImplicitArgs.globalOffsetX = 1;
expectedImplicitArgs.globalOffsetY = 2;
expectedImplicitArgs.globalOffsetZ = 3;
expectedImplicitArgs.groupCountX = 2;
expectedImplicitArgs.groupCountY = 1;
expectedImplicitArgs.groupCountZ = 3;
}
ze_kernel_desc_t kernelDesc{ZE_STRUCTURE_TYPE_KERNEL_DESC}; void dispatchKernelWithImplicitArgs() {
kernel->initialize(&kernelDesc); expectedImplicitArgs.globalSizeX = expectedImplicitArgs.localSizeX * expectedImplicitArgs.groupCountX;
expectedImplicitArgs.globalSizeY = expectedImplicitArgs.localSizeY * expectedImplicitArgs.groupCountY;
expectedImplicitArgs.globalSizeZ = expectedImplicitArgs.localSizeZ * expectedImplicitArgs.groupCountZ;
EXPECT_TRUE(kernel->getKernelDescriptor().kernelAttributes.flags.requiresImplicitArgs); std::unique_ptr<MockImmutableData> mockKernelImmData = std::make_unique<MockImmutableData>(0u);
ASSERT_NE(nullptr, kernel->getImplicitArgs()); auto kernelDescriptor = mockKernelImmData->kernelDescriptor;
kernelDescriptor->kernelAttributes.flags.requiresImplicitArgs = true;
kernelDescriptor->kernelAttributes.simdSize = expectedImplicitArgs.simdWidth;
kernelDescriptor->kernelAttributes.workgroupDimensionsOrder[0] = workgroupDimOrder[0];
kernelDescriptor->kernelAttributes.workgroupDimensionsOrder[1] = workgroupDimOrder[1];
kernelDescriptor->kernelAttributes.workgroupDimensionsOrder[2] = workgroupDimOrder[2];
createModuleFromBinary(0u, false, mockKernelImmData.get());
kernel->setGroupSize(4, 5, 6); auto kernel = std::make_unique<MockKernel>(module.get());
kernel->setGroupCount(3, 2, 1);
kernel->setGlobalOffsetExp(1, 2, 3);
kernel->patchGlobalOffset();
ze_result_t result{}; ze_kernel_desc_t kernelDesc{ZE_STRUCTURE_TYPE_KERNEL_DESC};
std::unique_ptr<L0::CommandList> commandList(CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, result)); kernel->initialize(&kernelDesc);
kernel->kernelRequiresGenerationOfLocalIdsByRuntime = kernelRequiresGenerationOfLocalIdsByRuntime;
kernel->requiredWorkgroupOrder = requiredWorkgroupOrder;
EXPECT_EQ(ZE_RESULT_SUCCESS, result); EXPECT_TRUE(kernel->getKernelDescriptor().kernelAttributes.flags.requiresImplicitArgs);
ASSERT_NE(nullptr, kernel->getImplicitArgs());
auto indirectHeap = commandList->commandContainer.getIndirectHeap(NEO::HeapType::INDIRECT_OBJECT); kernel->setGroupSize(expectedImplicitArgs.localSizeX, expectedImplicitArgs.localSizeY, expectedImplicitArgs.localSizeZ);
memset(indirectHeap->getSpace(0), 0, kernel->getSizeForImplicitArgsPatching()); kernel->setGlobalOffsetExp(static_cast<uint32_t>(expectedImplicitArgs.globalOffsetX), static_cast<uint32_t>(expectedImplicitArgs.globalOffsetY), static_cast<uint32_t>(expectedImplicitArgs.globalOffsetZ));
kernel->patchGlobalOffset();
ze_group_count_t groupCount{3, 2, 1}; ze_result_t result{};
result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr); commandList.reset(CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, result));
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto sizeCrossThreadData = kernel->getCrossThreadDataSize(); EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto sizePerThreadDataForWholeGroup = kernel->getPerThreadDataSizeForWholeThreadGroup();
EXPECT_EQ(indirectHeap->getUsed(), sizeCrossThreadData + sizePerThreadDataForWholeGroup + kernel->getSizeForImplicitArgsPatching());
ImplicitArgs expectedImplicitArgs{sizeof(ImplicitArgs)}; auto indirectHeap = commandList->commandContainer.getIndirectHeap(NEO::HeapType::INDIRECT_OBJECT);
expectedImplicitArgs.numWorkDim = 3; indirectHeapAllocation = indirectHeap->getGraphicsAllocation();
expectedImplicitArgs.simdWidth = simd;
expectedImplicitArgs.localSizeX = 4;
expectedImplicitArgs.localSizeY = 5;
expectedImplicitArgs.localSizeZ = 6;
expectedImplicitArgs.globalSizeX = 12;
expectedImplicitArgs.globalSizeY = 10;
expectedImplicitArgs.globalSizeZ = 6;
expectedImplicitArgs.globalOffsetX = 1;
expectedImplicitArgs.globalOffsetY = 2;
expectedImplicitArgs.globalOffsetZ = 3;
expectedImplicitArgs.groupCountX = 3;
expectedImplicitArgs.groupCountY = 2;
expectedImplicitArgs.groupCountZ = 1;
expectedImplicitArgs.localIdTablePtr = indirectHeap->getGraphicsAllocation()->getGpuAddress();
expectedImplicitArgs.printfBufferPtr = kernel->getPrintfBufferAllocation()->getGpuAddress();
auto sizeForImplicitArgPatching = kernel->getSizeForImplicitArgsPatching(); ze_group_count_t groupCount{expectedImplicitArgs.groupCountX, expectedImplicitArgs.groupCountY, expectedImplicitArgs.groupCountZ};
result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
EXPECT_LT(0u, sizeForImplicitArgPatching); implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&expectedImplicitArgs, *kernelDescriptor, neoDevice->getHardwareInfo());
auto sizeCrossThreadData = kernel->getCrossThreadDataSize();
auto sizePerThreadDataForWholeGroup = kernel->getPerThreadDataSizeForWholeThreadGroup();
EXPECT_EQ(indirectHeap->getUsed(), sizeCrossThreadData + sizePerThreadDataForWholeGroup + implicitArgsProgrammingSize);
auto localIdsProgrammingSize = sizeForImplicitArgPatching - sizeof(ImplicitArgs); expectedImplicitArgs.localIdTablePtr = indirectHeapAllocation->getGpuAddress();
expectedImplicitArgs.printfBufferPtr = kernel->getPrintfBufferAllocation()->getGpuAddress();
}
std::unique_ptr<L0::CommandList> commandList;
GraphicsAllocation *indirectHeapAllocation = nullptr;
ImplicitArgs expectedImplicitArgs = {sizeof(ImplicitArgs)};
std::array<uint8_t, 3> workgroupDimOrder{0, 1, 2};
uint32_t implicitArgsProgrammingSize = 0u;
auto expectedLocalIds = alignedMalloc(localIdsProgrammingSize, 64); bool kernelRequiresGenerationOfLocalIdsByRuntime = true;
memset(expectedLocalIds, 0, localIdsProgrammingSize); uint32_t requiredWorkgroupOrder = 0;
constexpr uint32_t grfSize = sizeof(typename FamilyType::GRF); };
NEO::generateLocalIDs(expectedLocalIds, simd, HWTEST_F(CmdlistAppendLaunchKernelWithImplicitArgsTests, givenKernelWithImplicitArgsWhenAppendLaunchKernelThenImplicitArgsAreSentToIndirectHeap) {
std::array<uint16_t, 3>{{4, 5, 6}}, std::array<uint16_t, 3> localSize{2, 3, 4};
std::array<uint8_t, 3>{{kernelDescriptor->kernelAttributes.workgroupDimensionsOrder[0], size_t totalLocalSize = localSize[0] * localSize[1] * localSize[2];
kernelDescriptor->kernelAttributes.workgroupDimensionsOrder[1],
kernelDescriptor->kernelAttributes.workgroupDimensionsOrder[2]}},
false, grfSize);
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeap->getCpuBase(), localIdsProgrammingSize)); expectedImplicitArgs.localSizeX = localSize[0];
auto pImplicitArgs = reinterpret_cast<ImplicitArgs *>(ptrOffset(indirectHeap->getCpuBase(), localIdsProgrammingSize)); expectedImplicitArgs.localSizeY = localSize[1];
EXPECT_EQ(0, memcmp(&expectedImplicitArgs, pImplicitArgs, sizeof(ImplicitArgs))); expectedImplicitArgs.localSizeZ = localSize[2];
dispatchKernelWithImplicitArgs();
auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth, sizeof(typename FamilyType::GRF));
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - sizeof(ImplicitArgs), MemoryConstants::cacheLineSize);
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize);
auto localIdsProgrammingSize = implicitArgsProgrammingSize - sizeof(ImplicitArgs);
size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize);
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
alignedFree(expectedLocalIds); alignedFree(expectedLocalIds);
auto implicitArgsInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), localIdsProgrammingSize);
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, sizeof(ImplicitArgs)));
} }
HWTEST_F(CmdlistAppendLaunchKernelTests, givenKernelWithImplicitArgsAndHwGeneratedLocalIdsWhenAppendLaunchKernelThenImplicitArgsLocalIdsRespectWalkOrder) { HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, givenKernelWithImplicitArgsAndHwGeneratedLocalIdsWhenAppendLaunchKernelThenImplicitArgsLocalIdsRespectWalkOrder) {
std::unique_ptr<MockImmutableData> mockKernelImmData = std::make_unique<MockImmutableData>(0u); workgroupDimOrder[0] = 2;
auto kernelDescriptor = mockKernelImmData->kernelDescriptor; workgroupDimOrder[1] = 1;
kernelDescriptor->kernelAttributes.flags.requiresImplicitArgs = true; workgroupDimOrder[2] = 0;
auto simd = kernelDescriptor->kernelAttributes.simdSize;
kernelDescriptor->kernelAttributes.workgroupDimensionsOrder[0] = 2;
kernelDescriptor->kernelAttributes.workgroupDimensionsOrder[1] = 1;
kernelDescriptor->kernelAttributes.workgroupDimensionsOrder[2] = 0;
createModuleFromBinary(0u, false, mockKernelImmData.get());
auto kernel = std::make_unique<MockKernel>(module.get()); kernelRequiresGenerationOfLocalIdsByRuntime = false;
requiredWorkgroupOrder = 2; // walk order 1 0 2
ze_kernel_desc_t kernelDesc{ZE_STRUCTURE_TYPE_KERNEL_DESC}; std::array<uint8_t, 3> expectedDimOrder = {1, 0, 2};
kernel->initialize(&kernelDesc);
kernel->kernelRequiresGenerationOfLocalIdsByRuntime = false;
kernel->requiredWorkgroupOrder = 2; // walk order 1 0 2
EXPECT_TRUE(kernel->getKernelDescriptor().kernelAttributes.flags.requiresImplicitArgs); std::array<uint16_t, 3> localSize{2, 3, 4};
ASSERT_NE(nullptr, kernel->getImplicitArgs()); size_t totalLocalSize = localSize[0] * localSize[1] * localSize[2];
kernel->setGroupSize(4, 5, 6); expectedImplicitArgs.localSizeX = localSize[0];
kernel->setGroupCount(3, 2, 1); expectedImplicitArgs.localSizeY = localSize[1];
kernel->setGlobalOffsetExp(1, 2, 3); expectedImplicitArgs.localSizeZ = localSize[2];
kernel->patchGlobalOffset();
ze_result_t result{}; dispatchKernelWithImplicitArgs();
std::unique_ptr<L0::CommandList> commandList(CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, result));
EXPECT_EQ(ZE_RESULT_SUCCESS, result); auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth, sizeof(typename FamilyType::GRF));
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - sizeof(ImplicitArgs), MemoryConstants::cacheLineSize);
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize);
auto indirectHeap = commandList->commandContainer.getIndirectHeap(NEO::HeapType::INDIRECT_OBJECT); auto localIdsProgrammingSize = implicitArgsProgrammingSize - sizeof(ImplicitArgs);
memset(indirectHeap->getSpace(0), 0, kernel->getSizeForImplicitArgsPatching()); size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize);
ze_group_count_t groupCount{3, 2, 1};
result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto sizeCrossThreadData = kernel->getCrossThreadDataSize();
auto sizePerThreadDataForWholeGroup = kernel->getPerThreadDataSizeForWholeThreadGroup();
EXPECT_EQ(indirectHeap->getUsed(), sizeCrossThreadData + sizePerThreadDataForWholeGroup + kernel->getSizeForImplicitArgsPatching());
ImplicitArgs expectedImplicitArgs{sizeof(ImplicitArgs)};
expectedImplicitArgs.numWorkDim = 3;
expectedImplicitArgs.simdWidth = simd;
expectedImplicitArgs.localSizeX = 4;
expectedImplicitArgs.localSizeY = 5;
expectedImplicitArgs.localSizeZ = 6;
expectedImplicitArgs.globalSizeX = 12;
expectedImplicitArgs.globalSizeY = 10;
expectedImplicitArgs.globalSizeZ = 6;
expectedImplicitArgs.globalOffsetX = 1;
expectedImplicitArgs.globalOffsetY = 2;
expectedImplicitArgs.globalOffsetZ = 3;
expectedImplicitArgs.groupCountX = 3;
expectedImplicitArgs.groupCountY = 2;
expectedImplicitArgs.groupCountZ = 1;
expectedImplicitArgs.localIdTablePtr = indirectHeap->getGraphicsAllocation()->getGpuAddress();
expectedImplicitArgs.printfBufferPtr = kernel->getPrintfBufferAllocation()->getGpuAddress();
auto sizeForImplicitArgPatching = kernel->getSizeForImplicitArgsPatching();
EXPECT_LT(0u, sizeForImplicitArgPatching);
auto localIdsProgrammingSize = sizeForImplicitArgPatching - sizeof(ImplicitArgs);
auto expectedLocalIds = alignedMalloc(localIdsProgrammingSize, 64);
memset(expectedLocalIds, 0, localIdsProgrammingSize);
constexpr uint32_t grfSize = sizeof(typename FamilyType::GRF);
NEO::generateLocalIDs(expectedLocalIds, simd,
std::array<uint16_t, 3>{{4, 5, 6}},
std::array<uint8_t, 3>{{1, 0, 2}},
false, grfSize);
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeap->getCpuBase(), localIdsProgrammingSize));
auto pImplicitArgs = reinterpret_cast<ImplicitArgs *>(ptrOffset(indirectHeap->getCpuBase(), localIdsProgrammingSize));
EXPECT_EQ(0, memcmp(&expectedImplicitArgs, pImplicitArgs, sizeof(ImplicitArgs)));
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
alignedFree(expectedLocalIds); alignedFree(expectedLocalIds);
auto implicitArgsInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), localIdsProgrammingSize);
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, sizeof(ImplicitArgs)));
} }
HWTEST_F(CmdlistAppendLaunchKernelTests, givenKernelWithImplicitArgsWhenAppendLaunchKernelWithSimd1ThenLocalIdsAreGeneratedCorrectly) { HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, givenKernelWithImplicitArgsWhenAppendLaunchKernelWithSimd1ThenLocalIdsAreGeneratedCorrectly) {
std::unique_ptr<MockImmutableData> mockKernelImmData = std::make_unique<MockImmutableData>(0u); workgroupDimOrder[0] = 2;
auto kernelDescriptor = mockKernelImmData->kernelDescriptor; workgroupDimOrder[1] = 1;
kernelDescriptor->kernelAttributes.flags.requiresImplicitArgs = true; workgroupDimOrder[2] = 0;
kernelDescriptor->kernelAttributes.simdSize = 1u;
kernelDescriptor->kernelAttributes.workgroupDimensionsOrder[0] = 2;
kernelDescriptor->kernelAttributes.workgroupDimensionsOrder[1] = 1;
kernelDescriptor->kernelAttributes.workgroupDimensionsOrder[2] = 0;
createModuleFromBinary(0u, false, mockKernelImmData.get());
auto kernel = std::make_unique<MockKernel>(module.get());
ze_kernel_desc_t kernelDesc{ZE_STRUCTURE_TYPE_KERNEL_DESC};
kernel->initialize(&kernelDesc);
EXPECT_TRUE(kernel->getKernelDescriptor().kernelAttributes.flags.requiresImplicitArgs);
ASSERT_NE(nullptr, kernel->getImplicitArgs());
kernel->setGroupSize(2, 2, 1);
ze_result_t result{};
std::unique_ptr<L0::CommandList> commandList(CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, result));
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto indirectHeap = commandList->commandContainer.getIndirectHeap(NEO::HeapType::INDIRECT_OBJECT);
memset(indirectHeap->getSpace(0), 0, kernel->getSizeForImplicitArgsPatching());
ze_group_count_t groupCount{1, 1, 1};
result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto sizeCrossThreadData = kernel->getCrossThreadDataSize();
auto sizePerThreadDataForWholeGroup = kernel->getPerThreadDataSizeForWholeThreadGroup();
EXPECT_EQ(indirectHeap->getUsed(), sizeCrossThreadData + sizePerThreadDataForWholeGroup + kernel->getSizeForImplicitArgsPatching());
ImplicitArgs expectedImplicitArgs{sizeof(ImplicitArgs)};
expectedImplicitArgs.numWorkDim = 2;
expectedImplicitArgs.simdWidth = 1; expectedImplicitArgs.simdWidth = 1;
expectedImplicitArgs.localSizeX = 2; expectedImplicitArgs.localSizeX = 2;
expectedImplicitArgs.localSizeY = 2; expectedImplicitArgs.localSizeY = 2;
expectedImplicitArgs.localSizeZ = 1; expectedImplicitArgs.localSizeZ = 1;
expectedImplicitArgs.globalSizeX = 2;
expectedImplicitArgs.globalSizeY = 2;
expectedImplicitArgs.globalSizeZ = 1;
expectedImplicitArgs.groupCountX = 1;
expectedImplicitArgs.groupCountY = 1;
expectedImplicitArgs.groupCountZ = 1;
expectedImplicitArgs.localIdTablePtr = indirectHeap->getGraphicsAllocation()->getGpuAddress();
expectedImplicitArgs.printfBufferPtr = kernel->getPrintfBufferAllocation()->getGpuAddress();
auto sizeForImplicitArgPatching = kernel->getSizeForImplicitArgsPatching(); dispatchKernelWithImplicitArgs();
EXPECT_LT(0u, sizeForImplicitArgPatching);
auto localIdsProgrammingSize = sizeForImplicitArgPatching - sizeof(ImplicitArgs);
uint16_t expectedLocalIds[][3] = {{0, 0, 0}, uint16_t expectedLocalIds[][3] = {{0, 0, 0},
{0, 1, 0}, {0, 1, 0},
{0, 0, 1}, {0, 0, 1},
{0, 1, 1}}; {0, 1, 1}};
uint8_t zeros[MemoryConstants::cacheLineSize]{}; EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeof(expectedLocalIds)));
EXPECT_EQ(localIdsProgrammingSize, alignUp(sizeof(expectedLocalIds), MemoryConstants::cacheLineSize));
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeap->getCpuBase(), sizeof(expectedLocalIds))); auto localIdsProgrammingSize = implicitArgsProgrammingSize - sizeof(ImplicitArgs);
EXPECT_EQ(0, memcmp(zeros, ptrOffset(indirectHeap->getCpuBase(), sizeof(expectedLocalIds)), localIdsProgrammingSize - sizeof(expectedLocalIds)));
auto pImplicitArgs = reinterpret_cast<ImplicitArgs *>(ptrOffset(indirectHeap->getCpuBase(), localIdsProgrammingSize)); EXPECT_EQ(alignUp(sizeof(expectedLocalIds), MemoryConstants::cacheLineSize), localIdsProgrammingSize);
EXPECT_EQ(0, memcmp(&expectedImplicitArgs, pImplicitArgs, sizeof(ImplicitArgs)));
auto implicitArgsInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), localIdsProgrammingSize);
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, sizeof(ImplicitArgs)));
} }
HWTEST_F(CmdlistAppendLaunchKernelTests, givenKernelWithoutImplicitArgsWhenAppendLaunchKernelThenImplicitArgsAreNotSentToIndirectHeap) { HWTEST_F(CmdlistAppendLaunchKernelTests, givenKernelWithoutImplicitArgsWhenAppendLaunchKernelThenImplicitArgsAreNotSentToIndirectHeap) {
@@ -1182,10 +1110,6 @@ HWTEST_F(CmdlistAppendLaunchKernelTests, givenKernelWithoutImplicitArgsWhenAppen
auto sizeCrossThreadData = kernel->getCrossThreadDataSize(); auto sizeCrossThreadData = kernel->getCrossThreadDataSize();
auto sizePerThreadDataForWholeGroup = kernel->getPerThreadDataSizeForWholeThreadGroup(); auto sizePerThreadDataForWholeGroup = kernel->getPerThreadDataSizeForWholeThreadGroup();
EXPECT_EQ(indirectHeap->getUsed(), sizeCrossThreadData + sizePerThreadDataForWholeGroup); EXPECT_EQ(indirectHeap->getUsed(), sizeCrossThreadData + sizePerThreadDataForWholeGroup);
auto sizeForImplicitArgPatching = kernel->getSizeForImplicitArgsPatching();
EXPECT_EQ(0u, sizeForImplicitArgPatching);
} }
HWTEST2_F(CmdlistAppendLaunchKernelTests, givenKernelWitchScratchAndPrivateWhenAppendLaunchKernelThenCmdListHasCorrectPrivateAndScratchSizesSet, IsAtLeastXeHpCore) { HWTEST2_F(CmdlistAppendLaunchKernelTests, givenKernelWitchScratchAndPrivateWhenAppendLaunchKernelThenCmdListHasCorrectPrivateAndScratchSizesSet, IsAtLeastXeHpCore) {

View File

@@ -2357,33 +2357,6 @@ TEST_F(KernelImplicitArgTests, givenKernelWithImplicitArgsWhenSettingKernelParam
EXPECT_EQ(0, memcmp(pImplicitArgs, &expectedImplicitArgs, sizeof(ImplicitArgs))); EXPECT_EQ(0, memcmp(pImplicitArgs, &expectedImplicitArgs, sizeof(ImplicitArgs)));
} }
// Verifies that a kernel whose descriptor does not require implicit args
// reports a zero patching size and that patchImplicitArgs leaves both the
// output pointer and the destination bytes unchanged.
TEST_F(KernelImplicitArgTests, givenKernelWithoutImplicitArgsWhenPatchingImplicitArgsThenNothingHappens) {
    auto immData = std::make_unique<MockImmutableData>(0u);
    immData->kernelDescriptor->kernelAttributes.flags.requiresImplicitArgs = false;
    createModuleFromBinary(0u, false, immData.get());

    auto kernel = std::make_unique<MockKernel>(module.get());
    ze_kernel_desc_t kernelDesc{ZE_STRUCTURE_TYPE_KERNEL_DESC};
    kernel->initialize(&kernelDesc);
    EXPECT_EQ(nullptr, kernel->getImplicitArgs());

    // Two identically filled buffers: one is handed to the kernel, the other
    // stays as the reference to compare against afterwards.
    uint8_t initData[64]{};
    uint8_t data[64]{};
    int pattern = 0xcd;
    memset(initData, pattern, 64);
    memset(data, pattern, 64);

    EXPECT_EQ(0u, kernel->getSizeForImplicitArgsPatching());

    void *dataPtr = data;
    kernel->patchImplicitArgs(dataPtr);

    // Pointer was not advanced and no byte was overwritten.
    EXPECT_EQ(dataPtr, data);
    EXPECT_EQ(0, memcmp(data, initData, 64));
}
using MultiTileModuleTest = Test<MultiTileModuleFixture>; using MultiTileModuleTest = Test<MultiTileModuleFixture>;
HWTEST2_F(MultiTileModuleTest, GivenMultiTileDeviceWhenSettingKernelArgAndSurfaceStateThenMultiTileFlagsAreSetCorrectly, IsXeHpCore) { HWTEST2_F(MultiTileModuleTest, GivenMultiTileDeviceWhenSettingKernelArgAndSurfaceStateThenMultiTileFlagsAreSetCorrectly, IsXeHpCore) {

View File

@@ -47,17 +47,18 @@ template <typename GfxFamily>
size_t HardwareCommandsHelper<GfxFamily>::getSizeRequiredIOH(const Kernel &kernel, size_t HardwareCommandsHelper<GfxFamily>::getSizeRequiredIOH(const Kernel &kernel,
size_t localWorkSize) { size_t localWorkSize) {
typedef typename GfxFamily::WALKER_TYPE WALKER_TYPE; typedef typename GfxFamily::WALKER_TYPE WALKER_TYPE;
const auto &kernelInfo = kernel.getKernelInfo(); const auto &kernelDescriptor = kernel.getDescriptor();
const auto &hwInfo = kernel.getHardwareInfo();
auto numChannels = kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels; auto numChannels = kernelDescriptor.kernelAttributes.numLocalIdChannels;
uint32_t grfSize = sizeof(typename GfxFamily::GRF); uint32_t grfSize = hwInfo.capabilityTable.grfSize;
auto simdSize = kernelInfo.getMaxSimdSize(); auto simdSize = kernelDescriptor.kernelAttributes.simdSize;
auto size = kernel.getCrossThreadDataSize() + auto size = kernel.getCrossThreadDataSize() +
getPerThreadDataSizeTotal(simdSize, grfSize, numChannels, localWorkSize); getPerThreadDataSizeTotal(simdSize, grfSize, numChannels, localWorkSize);
if (kernel.getImplicitArgs()) { auto pImplicitArgs = kernel.getImplicitArgs();
auto grfSizeForImplicitArgs = ImplicitArgsHelper::getGrfSize(simdSize, grfSize); if (pImplicitArgs) {
size += sizeof(ImplicitArgs) + alignUp(getPerThreadDataSizeTotal(simdSize, grfSizeForImplicitArgs, 3u, localWorkSize), MemoryConstants::cacheLineSize); size += ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, hwInfo);
} }
return alignUp(size, WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE); return alignUp(size, WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
} }
@@ -218,36 +219,6 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
auto threadsPerThreadGroup = static_cast<uint32_t>(getThreadsPerWG(simd, localWorkItems)); auto threadsPerThreadGroup = static_cast<uint32_t>(getThreadsPerWG(simd, localWorkItems));
auto numChannels = static_cast<uint32_t>(kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels); auto numChannels = static_cast<uint32_t>(kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels);
auto pImplicitArgs = kernel.getImplicitArgs();
if (pImplicitArgs) {
auto grfSize = ImplicitArgsHelper::getGrfSize(simd, sizeof(typename GfxFamily::GRF));
const auto &kernelAttributes = kernelInfo.kernelDescriptor.kernelAttributes;
uint32_t requiredWalkOrder = 0u;
auto generationOfLocalIdsByRuntime = EncodeDispatchKernel<GfxFamily>::isRuntimeLocalIdsGenerationRequired(
3,
localWorkSize,
std::array<uint8_t, 3>{
{kernelAttributes.workgroupWalkOrder[0],
kernelAttributes.workgroupWalkOrder[1],
kernelAttributes.workgroupWalkOrder[2]}},
kernelAttributes.flags.requiresWorkgroupWalkOrder,
requiredWalkOrder,
simd);
auto dimensionOrder = ImplicitArgsHelper::getDimensionOrderForLocalIds(kernelAttributes.workgroupDimensionsOrder, generationOfLocalIdsByRuntime, requiredWalkOrder);
auto offsetLocalIds = sendPerThreadData(
ioh,
simd,
grfSize,
3u, // all channels for implicit args
std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSize[0]), static_cast<uint16_t>(localWorkSize[1]), static_cast<uint16_t>(localWorkSize[2])}},
dimensionOrder,
kernel.usesOnlyImages());
pImplicitArgs->localIdTablePtr = offsetLocalIds + ioh.getGraphicsAllocation()->getGpuAddress();
}
uint32_t sizeCrossThreadData = kernel.getCrossThreadDataSize(); uint32_t sizeCrossThreadData = kernel.getCrossThreadDataSize();
size_t offsetCrossThreadData = HardwareCommandsHelper<GfxFamily>::sendCrossThreadData( size_t offsetCrossThreadData = HardwareCommandsHelper<GfxFamily>::sendCrossThreadData(

View File

@@ -117,9 +117,18 @@ size_t HardwareCommandsHelper<GfxFamily>::sendCrossThreadData(
auto pImplicitArgs = kernel.getImplicitArgs(); auto pImplicitArgs = kernel.getImplicitArgs();
if (pImplicitArgs) { if (pImplicitArgs) {
auto implicitArgsSize = static_cast<uint32_t>(sizeof(ImplicitArgs)); pImplicitArgs->localIdTablePtr = indirectHeap.getGraphicsAllocation()->getGpuAddress() + offsetCrossThreadData;
pDest = static_cast<char *>(indirectHeap.getSpace(implicitArgsSize));
memcpy_s(pDest, implicitArgsSize, pImplicitArgs, implicitArgsSize); const auto &kernelDescriptor = kernel.getDescriptor();
const auto &hwInfo = kernel.getHardwareInfo();
auto sizeForImplicitArgsProgramming = ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, hwInfo);
auto sizeForLocalIdsProgramming = sizeForImplicitArgsProgramming - sizeof(ImplicitArgs);
offsetCrossThreadData += sizeForLocalIdsProgramming;
auto ptrToPatchImplicitArgs = indirectHeap.getSpace(sizeForImplicitArgsProgramming);
ImplicitArgsHelper::patchImplicitArgs(ptrToPatchImplicitArgs, *pImplicitArgs, kernelDescriptor, hwInfo, {});
} }
pDest = static_cast<char *>(indirectHeap.getSpace(sizeCrossThreadData)); pDest = static_cast<char *>(indirectHeap.getSpace(sizeCrossThreadData));

View File

@@ -101,9 +101,32 @@ size_t HardwareCommandsHelper<GfxFamily>::sendCrossThreadData(
auto pImplicitArgs = kernel.getImplicitArgs(); auto pImplicitArgs = kernel.getImplicitArgs();
if (pImplicitArgs) { if (pImplicitArgs) {
auto implicitArgsSize = static_cast<uint32_t>(sizeof(ImplicitArgs)); pImplicitArgs->localIdTablePtr = indirectHeap.getGraphicsAllocation()->getGpuAddress() + offsetCrossThreadData;
dest = static_cast<char *>(indirectHeap.getSpace(implicitArgsSize));
memcpy_s(dest, implicitArgsSize, pImplicitArgs, implicitArgsSize); const auto &kernelDescriptor = kernel.getDescriptor();
const auto &hwInfo = kernel.getHardwareInfo();
auto sizeForImplicitArgsProgramming = ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, hwInfo);
auto sizeForLocalIdsProgramming = sizeForImplicitArgsProgramming - sizeof(ImplicitArgs);
offsetCrossThreadData += sizeForLocalIdsProgramming;
auto ptrToPatchImplicitArgs = indirectHeap.getSpace(sizeForImplicitArgsProgramming);
const auto &kernelAttributes = kernelDescriptor.kernelAttributes;
uint32_t requiredWalkOrder = 0u;
size_t localWorkSize[3] = {pImplicitArgs->localSizeX, pImplicitArgs->localSizeY, pImplicitArgs->localSizeZ};
auto generationOfLocalIdsByRuntime = EncodeDispatchKernel<GfxFamily>::isRuntimeLocalIdsGenerationRequired(
3,
localWorkSize,
std::array<uint8_t, 3>{
{kernelAttributes.workgroupWalkOrder[0],
kernelAttributes.workgroupWalkOrder[1],
kernelAttributes.workgroupWalkOrder[2]}},
kernelAttributes.flags.requiresWorkgroupWalkOrder,
requiredWalkOrder,
kernelDescriptor.kernelAttributes.simdSize);
ImplicitArgsHelper::patchImplicitArgs(ptrToPatchImplicitArgs, *pImplicitArgs, kernelDescriptor, hwInfo, std::make_pair(generationOfLocalIdsByRuntime, requiredWalkOrder));
} }
using InlineData = typename GfxFamily::INLINE_DATA; using InlineData = typename GfxFamily::INLINE_DATA;

View File

@@ -403,6 +403,7 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
bool requiresMemoryMigration() const { return migratableArgsMap.size() > 0; } bool requiresMemoryMigration() const { return migratableArgsMap.size() > 0; }
const std::map<uint32_t, MemObj *> &getMemObjectsToMigrate() const { return migratableArgsMap; } const std::map<uint32_t, MemObj *> &getMemObjectsToMigrate() const { return migratableArgsMap; }
ImplicitArgs *getImplicitArgs() const { return pImplicitArgs.get(); } ImplicitArgs *getImplicitArgs() const { return pImplicitArgs.get(); }
const HardwareInfo &getHardwareInfo() const;
protected: protected:
void void
@@ -426,8 +427,6 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
void addAllocationToCacheFlushVector(uint32_t argIndex, GraphicsAllocation *argAllocation); void addAllocationToCacheFlushVector(uint32_t argIndex, GraphicsAllocation *argAllocation);
bool allocationForCacheFlush(GraphicsAllocation *argAllocation) const; bool allocationForCacheFlush(GraphicsAllocation *argAllocation) const;
const HardwareInfo &getHardwareInfo() const;
const ClDevice &getDevice() const { const ClDevice &getDevice() const {
return clDevice; return clDevice;
} }

View File

@@ -1206,270 +1206,178 @@ HWTEST_F(KernelCacheFlushTests, givenLocallyUncachedBufferWhenGettingAllocations
clReleaseMemObject(bufferRegular); clReleaseMemObject(bufferRegular);
} }
using HardwareCommandsImplicitArgsTests = Test<ClDeviceFixture>; struct HardwareCommandsImplicitArgsTests : Test<ClDeviceFixture> {
HWTEST_F(HardwareCommandsImplicitArgsTests, givenKernelWithImplicitArgsWhenSendingCrossThreadDataThenImplicitArgsAreSetAtTheBeginningOfIndirectData) { void SetUp() override {
auto indirectHeapAllocation = pDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties(MockAllocationProperties{pDevice->getRootDeviceIndex(), MemoryConstants::pageSize}); ClDeviceFixture::SetUp();
IndirectHeap indirectHeap(indirectHeapAllocation, false); indirectHeapAllocation = pDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties(MockAllocationProperties{pDevice->getRootDeviceIndex(), MemoryConstants::pageSize});
auto pKernelInfo = std::make_unique<MockKernelInfo>(); expectedImplicitArgs.numWorkDim = 3;
pKernelInfo->kernelDescriptor.kernelAttributes.simdSize = 32; expectedImplicitArgs.simdWidth = 32;
pKernelInfo->kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs = true; expectedImplicitArgs.localSizeX = 2;
expectedImplicitArgs.localSizeY = 3;
expectedImplicitArgs.localSizeZ = 4;
expectedImplicitArgs.globalOffsetX = 1;
expectedImplicitArgs.globalOffsetY = 2;
expectedImplicitArgs.globalOffsetZ = 3;
expectedImplicitArgs.groupCountX = 2;
expectedImplicitArgs.groupCountY = 1;
expectedImplicitArgs.groupCountZ = 3;
}
MockContext context(pClDevice); void TearDown() override {
MockProgram program(&context, false, toClDeviceVector(*pClDevice)); pDevice->getMemoryManager()->freeGraphicsMemory(indirectHeapAllocation);
ClDeviceFixture::TearDown();
}
MockKernel kernel(&program, *pKernelInfo, *pClDevice); template <typename FamilyType>
ASSERT_EQ(CL_SUCCESS, kernel.initialize()); void dispatchKernelWithImplicitArgs() {
auto pImplicitArgs = kernel.getImplicitArgs(); expectedImplicitArgs.globalSizeX = expectedImplicitArgs.localSizeX * expectedImplicitArgs.groupCountX;
expectedImplicitArgs.globalSizeY = expectedImplicitArgs.localSizeY * expectedImplicitArgs.groupCountY;
expectedImplicitArgs.globalSizeZ = expectedImplicitArgs.localSizeZ * expectedImplicitArgs.groupCountZ;
ASSERT_NE(nullptr, pImplicitArgs); IndirectHeap indirectHeap(indirectHeapAllocation, false);
auto pKernelInfo = std::make_unique<MockKernelInfo>();
pKernelInfo->kernelDescriptor.kernelAttributes.simdSize = expectedImplicitArgs.simdWidth;
pKernelInfo->kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs = true;
pKernelInfo->kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[0] = workgroupDimOrder[0];
pKernelInfo->kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[1] = workgroupDimOrder[1];
pKernelInfo->kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[2] = workgroupDimOrder[2];
MockContext context(pClDevice);
MockProgram program(&context, false, toClDeviceVector(*pClDevice));
MockKernel kernel(&program, *pKernelInfo, *pClDevice);
ASSERT_EQ(CL_SUCCESS, kernel.initialize());
auto pImplicitArgs = kernel.getImplicitArgs();
ASSERT_NE(nullptr, pImplicitArgs);
kernel.setWorkDim(expectedImplicitArgs.numWorkDim);
kernel.setLocalWorkSizeValues(expectedImplicitArgs.localSizeX, expectedImplicitArgs.localSizeY, expectedImplicitArgs.localSizeZ);
kernel.setGlobalWorkSizeValues(static_cast<uint32_t>(expectedImplicitArgs.globalSizeX), static_cast<uint32_t>(expectedImplicitArgs.globalSizeY), static_cast<uint32_t>(expectedImplicitArgs.globalSizeZ));
kernel.setGlobalWorkOffsetValues(static_cast<uint32_t>(expectedImplicitArgs.globalOffsetX), static_cast<uint32_t>(expectedImplicitArgs.globalOffsetY), static_cast<uint32_t>(expectedImplicitArgs.globalOffsetZ));
kernel.setNumWorkGroupsValues(expectedImplicitArgs.groupCountX, expectedImplicitArgs.groupCountY, expectedImplicitArgs.groupCountZ);
implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernel.getDescriptor(), pDevice->getHardwareInfo());
auto sizeCrossThreadData = kernel.getCrossThreadDataSize();
HardwareCommandsHelper<FamilyType>::sendCrossThreadData(
indirectHeap,
kernel,
false,
nullptr,
sizeCrossThreadData);
EXPECT_LE(implicitArgsProgrammingSize, indirectHeap.getUsed());
expectedImplicitArgs.localIdTablePtr = indirectHeapAllocation->getGpuAddress();
}
ImplicitArgs expectedImplicitArgs = {sizeof(ImplicitArgs)}; ImplicitArgs expectedImplicitArgs = {sizeof(ImplicitArgs)};
expectedImplicitArgs.numWorkDim = 3; GraphicsAllocation *indirectHeapAllocation = nullptr;
expectedImplicitArgs.simdWidth = 32; std::array<uint8_t, 3> workgroupDimOrder{0, 1, 2};
expectedImplicitArgs.localSizeX = 4; uint32_t implicitArgsProgrammingSize = 0u;
expectedImplicitArgs.localSizeY = 5; };
expectedImplicitArgs.localSizeZ = 6;
expectedImplicitArgs.globalSizeX = 7;
expectedImplicitArgs.globalSizeY = 8;
expectedImplicitArgs.globalSizeZ = 9;
expectedImplicitArgs.globalOffsetX = 1;
expectedImplicitArgs.globalOffsetY = 2;
expectedImplicitArgs.globalOffsetZ = 3;
expectedImplicitArgs.groupCountX = 3;
expectedImplicitArgs.groupCountY = 2;
expectedImplicitArgs.groupCountZ = 1;
kernel.setWorkDim(3); HWTEST_F(HardwareCommandsImplicitArgsTests, givenKernelWithImplicitArgsWhenSendingCrossThreadDataThenImplicitArgsAreSetAtTheBeginningOfIndirectData) {
kernel.setLocalWorkSizeValues(4, 5, 6); dispatchKernelWithImplicitArgs<FamilyType>();
kernel.setGlobalWorkSizeValues(7, 8, 9);
kernel.setGlobalWorkOffsetValues(1, 2, 3);
kernel.setNumWorkGroupsValues(3, 2, 1);
auto indirectData = indirectHeapAllocation->getUnderlyingBuffer(); auto localIdsProgrammingSize = implicitArgsProgrammingSize - sizeof(ImplicitArgs);
auto implicitArgsInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), localIdsProgrammingSize);
auto sizeCrossThreadData = kernel.getCrossThreadDataSize(); EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, sizeof(ImplicitArgs)));
HardwareCommandsHelper<FamilyType>::sendCrossThreadData(
indirectHeap,
kernel,
false,
nullptr,
sizeCrossThreadData);
EXPECT_EQ(0, memcmp(indirectData, &expectedImplicitArgs, sizeof(ImplicitArgs)));
pDevice->getMemoryManager()->freeGraphicsMemory(indirectHeapAllocation);
} }
HWTEST_F(HardwareCommandsImplicitArgsTests, givenKernelWithImplicitArgsAndRuntimeLocalIdsGenerationWhenSendingIndirectStateThenLocalIdsAreGeneratedAndCorrectlyProgrammedInCrossThreadData) { HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithImplicitArgsAndRuntimeLocalIdsGenerationWhenSendingIndirectStateThenLocalIdsAreGeneratedAndCorrectlyProgrammedInCrossThreadData) {
DebugManagerStateRestore restorer; DebugManagerStateRestore restorer;
DebugManager.flags.EnableHwGenerationLocalIds.set(0); DebugManager.flags.EnableHwGenerationLocalIds.set(0);
auto pKernelInfo = std::make_unique<MockKernelInfo>();
uint32_t simd = 32;
pKernelInfo->kernelDescriptor.kernelAttributes.simdSize = simd;
pKernelInfo->kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs = true;
pKernelInfo->kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[0] = 2;
pKernelInfo->kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[1] = 1;
pKernelInfo->kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[2] = 0;
MockContext context(pClDevice); workgroupDimOrder[0] = 2;
CommandQueueHw<FamilyType> cmdQ(&context, pClDevice, 0, false); workgroupDimOrder[1] = 1;
MockProgram program(&context, false, toClDeviceVector(*pClDevice)); workgroupDimOrder[2] = 0;
MockKernel kernel(&program, *pKernelInfo, *pClDevice); std::array<uint16_t, 3> localSize{2, 3, 4};
ASSERT_EQ(CL_SUCCESS, kernel.initialize()); size_t totalLocalSize = localSize[0] * localSize[1] * localSize[2];
const size_t localWorkSizeX = 2; expectedImplicitArgs.localSizeX = localSize[0];
const size_t localWorkSizeY = 3; expectedImplicitArgs.localSizeY = localSize[1];
const size_t localWorkSizeZ = 4; expectedImplicitArgs.localSizeZ = localSize[2];
const size_t localWorkSizes[3]{localWorkSizeX, localWorkSizeY, localWorkSizeZ};
auto &commandStream = cmdQ.getCS(1024); dispatchKernelWithImplicitArgs<FamilyType>();
auto pWalkerCmd = reinterpret_cast<typename FamilyType::WALKER_TYPE *>(commandStream.getSpace(0));
auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::Type::DYNAMIC_STATE, 8192); auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth, sizeof(typename FamilyType::GRF));
auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::Type::INDIRECT_OBJECT, 8192); auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - sizeof(ImplicitArgs), MemoryConstants::cacheLineSize);
auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::Type::SURFACE_STATE, 8192); generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize);
dsh.align(EncodeStates<FamilyType>::alignInterfaceDescriptorData); auto localIdsProgrammingSize = implicitArgsProgrammingSize - sizeof(ImplicitArgs);
auto interfaceDescriptor = reinterpret_cast<typename FamilyType::INTERFACE_DESCRIPTOR_DATA *>(dsh.getSpace(0)); size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize);
uint32_t interfaceDescriptorIndex = 0u;
HardwareCommandsHelper<FamilyType>::sendIndirectState( EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
commandStream,
dsh,
ioh,
ssh,
kernel,
0u,
simd,
localWorkSizes,
0u,
interfaceDescriptorIndex,
pDevice->getPreemptionMode(),
pWalkerCmd,
interfaceDescriptor,
false,
*pDevice);
constexpr uint32_t grfSize = sizeof(typename FamilyType::GRF);
size_t localWorkSize = localWorkSizeX * localWorkSizeY * localWorkSizeZ;
size_t expectedIohSize = PerThreadDataHelper::getPerThreadDataSizeTotal(simd, grfSize, 3u, localWorkSize);
ASSERT_LE(expectedIohSize, ioh.getUsed());
auto expectedLocalIds = alignedMalloc(expectedIohSize, 64);
generateLocalIDs(expectedLocalIds, simd,
std::array<uint16_t, 3>{{localWorkSizeX, localWorkSizeY, localWorkSizeZ}},
std::array<uint8_t, 3>{{pKernelInfo->kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[0],
pKernelInfo->kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[1],
pKernelInfo->kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[2]}},
false, grfSize);
EXPECT_EQ(0, memcmp(expectedLocalIds, ioh.getCpuBase(), expectedIohSize));
alignedFree(expectedLocalIds); alignedFree(expectedLocalIds);
auto pImplicitArgs = reinterpret_cast<ImplicitArgs *>(ptrOffset(ioh.getCpuBase(), alignUp(expectedIohSize, MemoryConstants::cacheLineSize))); auto implicitArgsInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), localIdsProgrammingSize);
EXPECT_EQ(ioh.getGraphicsAllocation()->getGpuAddress(), pImplicitArgs->localIdTablePtr); EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, sizeof(ImplicitArgs)));
} }
HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithImplicitArgsAndHwLocalIdsGenerationWhenSendingIndirectStateThenLocalIdsAreGeneratedAndCorrectlyProgrammedInCrossThreadData) { HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithImplicitArgsAndHwLocalIdsGenerationWhenSendingIndirectStateThenLocalIdsAreGeneratedAndCorrectlyProgrammedInCrossThreadData) {
auto pKernelInfo = std::make_unique<MockKernelInfo>(); DebugManagerStateRestore restorer;
uint32_t simd = 32; DebugManager.flags.EnableHwGenerationLocalIds.set(1);
pKernelInfo->kernelDescriptor.kernelAttributes.simdSize = simd;
pKernelInfo->kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs = true;
pKernelInfo->kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[0] = 2;
pKernelInfo->kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[1] = 1;
pKernelInfo->kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[2] = 0;
MockContext context(pClDevice); workgroupDimOrder[0] = 2;
CommandQueueHw<FamilyType> cmdQ(&context, pClDevice, 0, false); workgroupDimOrder[1] = 1;
MockProgram program(&context, false, toClDeviceVector(*pClDevice)); workgroupDimOrder[2] = 0;
MockKernel kernel(&program, *pKernelInfo, *pClDevice);
ASSERT_EQ(CL_SUCCESS, kernel.initialize());
const size_t localWorkSizeX = 2;
const size_t localWorkSizeY = 3;
const size_t localWorkSizeZ = 4;
const size_t localWorkSizes[3]{localWorkSizeX, localWorkSizeY, localWorkSizeZ};
std::array<uint8_t, 3> expectedDimOrder = {0, 2, 1}; std::array<uint8_t, 3> expectedDimOrder = {0, 2, 1};
auto &commandStream = cmdQ.getCS(1024); std::array<uint16_t, 3> localSize{2, 3, 4};
auto pWalkerCmd = reinterpret_cast<typename FamilyType::WALKER_TYPE *>(commandStream.getSpace(0)); size_t totalLocalSize = localSize[0] * localSize[1] * localSize[2];
auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::Type::DYNAMIC_STATE, 8192); expectedImplicitArgs.localSizeX = localSize[0];
auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::Type::INDIRECT_OBJECT, 8192); expectedImplicitArgs.localSizeY = localSize[1];
auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::Type::SURFACE_STATE, 8192); expectedImplicitArgs.localSizeZ = localSize[2];
dsh.align(EncodeStates<FamilyType>::alignInterfaceDescriptorData); dispatchKernelWithImplicitArgs<FamilyType>();
auto interfaceDescriptor = reinterpret_cast<typename FamilyType::INTERFACE_DESCRIPTOR_DATA *>(dsh.getSpace(0));
uint32_t interfaceDescriptorIndex = 0u;
HardwareCommandsHelper<FamilyType>::sendIndirectState( auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth, sizeof(typename FamilyType::GRF));
commandStream, auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - sizeof(ImplicitArgs), MemoryConstants::cacheLineSize);
dsh, generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize);
ioh,
ssh,
kernel,
0u,
simd,
localWorkSizes,
0u,
interfaceDescriptorIndex,
pDevice->getPreemptionMode(),
pWalkerCmd,
interfaceDescriptor,
false,
*pDevice);
constexpr uint32_t grfSize = sizeof(typename FamilyType::GRF); auto localIdsProgrammingSize = implicitArgsProgrammingSize - sizeof(ImplicitArgs);
size_t localWorkSize = localWorkSizeX * localWorkSizeY * localWorkSizeZ; size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, 3u, totalLocalSize);
size_t expectedIohSize = PerThreadDataHelper::getPerThreadDataSizeTotal(simd, grfSize, 3u, localWorkSize);
ASSERT_LE(expectedIohSize, ioh.getUsed());
auto expectedLocalIds = alignedMalloc(expectedIohSize, 64); EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
generateLocalIDs(expectedLocalIds, simd,
std::array<uint16_t, 3>{{localWorkSizeX, localWorkSizeY, localWorkSizeZ}},
expectedDimOrder,
false, grfSize);
EXPECT_EQ(0, memcmp(expectedLocalIds, ioh.getCpuBase(), expectedIohSize));
alignedFree(expectedLocalIds); alignedFree(expectedLocalIds);
auto pImplicitArgs = reinterpret_cast<ImplicitArgs *>(ptrOffset(ioh.getCpuBase(), alignUp(expectedIohSize, MemoryConstants::cacheLineSize))); auto implicitArgsInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), localIdsProgrammingSize);
EXPECT_EQ(ioh.getGraphicsAllocation()->getGpuAddress(), pImplicitArgs->localIdTablePtr); EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, sizeof(ImplicitArgs)));
} }
HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithImplicitArgsWhenSendingIndirectStateWithSimd1ThenLocalIdsAreGeneratedCorrectly) { HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithImplicitArgsWhenSendingIndirectStateWithSimd1ThenLocalIdsAreGeneratedCorrectly) {
DebugManagerStateRestore restorer; workgroupDimOrder[0] = 2;
DebugManager.flags.EnableHwGenerationLocalIds.set(0); workgroupDimOrder[1] = 1;
auto pKernelInfo = std::make_unique<MockKernelInfo>(); workgroupDimOrder[2] = 0;
uint32_t simd = 1;
pKernelInfo->kernelDescriptor.kernelAttributes.simdSize = simd;
pKernelInfo->kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs = true;
pKernelInfo->kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[0] = 2;
pKernelInfo->kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[1] = 1;
pKernelInfo->kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[2] = 0;
MockContext context(pClDevice); expectedImplicitArgs.simdWidth = 1;
CommandQueueHw<FamilyType> cmdQ(&context, pClDevice, 0, false); expectedImplicitArgs.localSizeX = 2;
MockProgram program(&context, false, toClDeviceVector(*pClDevice)); expectedImplicitArgs.localSizeY = 2;
expectedImplicitArgs.localSizeZ = 1;
MockKernel kernel(&program, *pKernelInfo, *pClDevice); dispatchKernelWithImplicitArgs<FamilyType>();
ASSERT_EQ(CL_SUCCESS, kernel.initialize());
const size_t localWorkSizeX = 2;
const size_t localWorkSizeY = 2;
const size_t localWorkSizeZ = 1;
const size_t localWorkSizes[3]{localWorkSizeX, localWorkSizeY, localWorkSizeZ};
auto &commandStream = cmdQ.getCS(1024);
auto pWalkerCmd = reinterpret_cast<typename FamilyType::WALKER_TYPE *>(commandStream.getSpace(0));
auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::Type::DYNAMIC_STATE, 8192);
auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::Type::INDIRECT_OBJECT, 8192);
auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::Type::SURFACE_STATE, 8192);
dsh.align(EncodeStates<FamilyType>::alignInterfaceDescriptorData);
auto interfaceDescriptor = reinterpret_cast<typename FamilyType::INTERFACE_DESCRIPTOR_DATA *>(dsh.getSpace(0));
uint32_t interfaceDescriptorIndex = 0u;
HardwareCommandsHelper<FamilyType>::sendIndirectState(
commandStream,
dsh,
ioh,
ssh,
kernel,
0u,
simd,
localWorkSizes,
0u,
interfaceDescriptorIndex,
pDevice->getPreemptionMode(),
pWalkerCmd,
interfaceDescriptor,
false,
*pDevice);
uint32_t grfSize = ImplicitArgsHelper::getGrfSize(simd, sizeof(typename FamilyType::GRF));
EXPECT_EQ(3 * sizeof(uint16_t), grfSize);
size_t localWorkSize = localWorkSizeX * localWorkSizeY * localWorkSizeZ;
size_t expectedLocalIdsSize = PerThreadDataHelper::getPerThreadDataSizeTotal(simd, grfSize, 3u, localWorkSize);
ASSERT_LE(expectedLocalIdsSize, ioh.getUsed());
uint16_t expectedLocalIds[][3] = {{0, 0, 0}, uint16_t expectedLocalIds[][3] = {{0, 0, 0},
{0, 1, 0}, {0, 1, 0},
{0, 0, 1}, {0, 0, 1},
{0, 1, 1}}; {0, 1, 1}};
EXPECT_EQ(expectedLocalIdsSize, sizeof(expectedLocalIds));
EXPECT_EQ(0, memcmp(expectedLocalIds, ioh.getCpuBase(), sizeof(expectedLocalIds))); EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeof(expectedLocalIds)));
auto localIdsProgrammingSize = alignUp(sizeof(expectedLocalIds), MemoryConstants::cacheLineSize); auto localIdsProgrammingSize = implicitArgsProgrammingSize - sizeof(ImplicitArgs);
ASSERT_LE(localIdsProgrammingSize + sizeof(ImplicitArgs), ioh.getUsed());
auto pImplicitArgs = reinterpret_cast<ImplicitArgs *>(ptrOffset(ioh.getCpuBase(), localIdsProgrammingSize)); EXPECT_EQ(alignUp(sizeof(expectedLocalIds), MemoryConstants::cacheLineSize), localIdsProgrammingSize);
EXPECT_EQ(ioh.getGraphicsAllocation()->getGpuAddress(), pImplicitArgs->localIdTablePtr);
auto implicitArgsInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), localIdsProgrammingSize);
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, sizeof(ImplicitArgs)));
} }
using HardwareCommandsTestXeHpAndLater = HardwareCommandsTest; using HardwareCommandsTestXeHpAndLater = HardwareCommandsTest;

View File

@@ -129,7 +129,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
EncodeDispatchKernel<Family>::setGrfInfo(&idd, kernelDescriptor.kernelAttributes.numGrfRequired, sizeCrossThreadData, sizePerThreadData); EncodeDispatchKernel<Family>::setGrfInfo(&idd, kernelDescriptor.kernelAttributes.numGrfRequired, sizeCrossThreadData, sizePerThreadData);
uint32_t sizeThreadData = sizePerThreadDataForWholeGroup + sizeCrossThreadData; uint32_t sizeThreadData = sizePerThreadDataForWholeGroup + sizeCrossThreadData;
uint32_t sizeForImplicitArgsPatching = args.dispatchInterface->getSizeForImplicitArgsPatching(); uint32_t sizeForImplicitArgsPatching = NEO::ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, hwInfo);
uint32_t iohRequiredSize = sizeThreadData + sizeForImplicitArgsPatching; uint32_t iohRequiredSize = sizeThreadData + sizeForImplicitArgsPatching;
uint64_t offsetThreadData = 0u; uint64_t offsetThreadData = 0u;
{ {
@@ -144,7 +144,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
if (pImplicitArgs) { if (pImplicitArgs) {
offsetThreadData -= sizeof(ImplicitArgs); offsetThreadData -= sizeof(ImplicitArgs);
pImplicitArgs->localIdTablePtr = heapIndirect->getGraphicsAllocation()->getGpuAddress() + heapIndirect->getUsed() - iohRequiredSize; pImplicitArgs->localIdTablePtr = heapIndirect->getGraphicsAllocation()->getGpuAddress() + heapIndirect->getUsed() - iohRequiredSize;
args.dispatchInterface->patchImplicitArgs(ptr); ptr = NEO::ImplicitArgsHelper::patchImplicitArgs(ptr, *pImplicitArgs, kernelDescriptor, hwInfo, {});
} }
memcpy_s(ptr, sizeCrossThreadData, memcpy_s(ptr, sizeCrossThreadData,

View File

@@ -76,6 +76,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
EncodeDispatchKernel<Family>::setGrfInfo(&idd, kernelDescriptor.kernelAttributes.numGrfRequired, sizeCrossThreadData, sizePerThreadData); EncodeDispatchKernel<Family>::setGrfInfo(&idd, kernelDescriptor.kernelAttributes.numGrfRequired, sizeCrossThreadData, sizePerThreadData);
bool localIdsGenerationByRuntime = args.dispatchInterface->requiresGenerationOfLocalIdsByRuntime(); bool localIdsGenerationByRuntime = args.dispatchInterface->requiresGenerationOfLocalIdsByRuntime();
auto requiredWorkgroupOrder = args.dispatchInterface->getRequiredWorkgroupOrder();
bool inlineDataProgramming = EncodeDispatchKernel<Family>::inlineDataProgrammingRequired(kernelDescriptor); bool inlineDataProgramming = EncodeDispatchKernel<Family>::inlineDataProgrammingRequired(kernelDescriptor);
{ {
auto alloc = args.dispatchInterface->getIsaAllocation(); auto alloc = args.dispatchInterface->getIsaAllocation();
@@ -160,7 +161,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
} }
uint32_t sizeThreadData = sizePerThreadDataForWholeGroup + sizeCrossThreadData; uint32_t sizeThreadData = sizePerThreadDataForWholeGroup + sizeCrossThreadData;
uint32_t sizeForImplicitArgsPatching = args.dispatchInterface->getSizeForImplicitArgsPatching(); uint32_t sizeForImplicitArgsPatching = NEO::ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, hwInfo);
uint32_t iohRequiredSize = sizeThreadData + sizeForImplicitArgsPatching; uint32_t iohRequiredSize = sizeThreadData + sizeForImplicitArgsPatching;
{ {
auto heap = container.getIndirectHeap(HeapType::INDIRECT_OBJECT); auto heap = container.getIndirectHeap(HeapType::INDIRECT_OBJECT);
@@ -174,7 +175,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
if (pImplicitArgs) { if (pImplicitArgs) {
offsetThreadData -= sizeof(ImplicitArgs); offsetThreadData -= sizeof(ImplicitArgs);
pImplicitArgs->localIdTablePtr = heap->getGraphicsAllocation()->getGpuAddress() + heap->getUsed() - iohRequiredSize; pImplicitArgs->localIdTablePtr = heap->getGraphicsAllocation()->getGpuAddress() + heap->getUsed() - iohRequiredSize;
args.dispatchInterface->patchImplicitArgs(ptr); ptr = NEO::ImplicitArgsHelper::patchImplicitArgs(ptr, *pImplicitArgs, kernelDescriptor, hwInfo, std::make_pair(localIdsGenerationByRuntime, requiredWorkgroupOrder));
} }
if (sizeCrossThreadData > 0) { if (sizeCrossThreadData > 0) {
@@ -231,7 +232,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
localIdsGenerationByRuntime, localIdsGenerationByRuntime,
inlineDataProgramming, inlineDataProgramming,
args.isIndirect, args.isIndirect,
args.dispatchInterface->getRequiredWorkgroupOrder()); requiredWorkgroupOrder);
using POSTSYNC_DATA = typename Family::POSTSYNC_DATA; using POSTSYNC_DATA = typename Family::POSTSYNC_DATA;
auto &postSync = walkerCmd.getPostSync(); auto &postSync = walkerCmd.getPostSync();

View File

@@ -1,5 +1,5 @@
/* /*
* Copyright (C) 2020-2021 Intel Corporation * Copyright (C) 2020-2022 Intel Corporation
* *
* SPDX-License-Identifier: MIT * SPDX-License-Identifier: MIT
* *
@@ -47,7 +47,5 @@ struct DispatchKernelEncoderI {
virtual bool requiresGenerationOfLocalIdsByRuntime() const = 0; virtual bool requiresGenerationOfLocalIdsByRuntime() const = 0;
virtual ImplicitArgs *getImplicitArgs() const = 0; virtual ImplicitArgs *getImplicitArgs() const = 0;
virtual uint32_t getSizeForImplicitArgsPatching() const = 0;
virtual void patchImplicitArgs(void *&pOut) const = 0;
}; };
} // namespace NEO } // namespace NEO

View File

@@ -9,10 +9,14 @@
#include <array> #include <array>
#include <cstdint> #include <cstdint>
#include <optional>
#include <type_traits> #include <type_traits>
namespace NEO { namespace NEO {
struct KernelDescriptor;
struct HardwareInfo;
struct ImplicitArgs { struct ImplicitArgs {
uint8_t structSize; uint8_t structSize;
uint8_t structVersion; uint8_t structVersion;
@@ -32,6 +36,7 @@ struct ImplicitArgs {
uint32_t groupCountX; uint32_t groupCountX;
uint32_t groupCountY; uint32_t groupCountY;
uint32_t groupCountZ; uint32_t groupCountZ;
uint32_t reserved;
}; };
static_assert((sizeof(ImplicitArgs) & 31) == 0, "Implicit args size need to be aligned to 32"); static_assert((sizeof(ImplicitArgs) & 31) == 0, "Implicit args size need to be aligned to 32");
static_assert(std::is_pod<ImplicitArgs>::value); static_assert(std::is_pod<ImplicitArgs>::value);
@@ -39,7 +44,9 @@ static_assert(std::is_pod<ImplicitArgs>::value);
constexpr const char *implicitArgsRelocationSymbolName = "INTEL_PATCH_CROSS_THREAD_OFFSET_OFF_R0"; constexpr const char *implicitArgsRelocationSymbolName = "INTEL_PATCH_CROSS_THREAD_OFFSET_OFF_R0";
// Helpers shared by the APIs for sizing and programming the implicit-args payload
// (generated local ids followed by the ImplicitArgs structure).
namespace ImplicitArgsHelper { namespace ImplicitArgsHelper {
// Selects the dimension order used when generating local ids.
// With the optional-based signature: when the optional is empty, or its first member
// (localIdsGeneratedByRuntime) is true, the three entries of workgroupDimensionsOrder are
// returned (the pointer must be non-null in that case). Otherwise the second member
// (walkOrderForHwGenerationOfLocalIds) indexes HwWalkOrderHelper::compatibleDimensionOrders.
std::array<uint8_t, 3> getDimensionOrderForLocalIds(const uint8_t *workgroupDimensionsOrder, bool generationOfLocalIdsByRuntime, uint32_t walkOrderForHwGenerationOfLocalIds); std::array<uint8_t, 3> getDimensionOrderForLocalIds(const uint8_t *workgroupDimensionsOrder, std::optional<std::pair<bool /* localIdsGeneratedByRuntime */, uint32_t /* walkOrderForHwGenerationOfLocalIds */>> hwGenerationOfLocalIdsParams);
// Returns the GRF size to use for local-id generation for the given SIMD width.
// NOTE(review): the definition is only partly visible here; it falls back to returning
// grfSize unchanged, with at least one special case before that - confirm which.
uint32_t getGrfSize(uint32_t simd, uint32_t grfSize); uint32_t getGrfSize(uint32_t simd, uint32_t grfSize);
// Returns 0 when pImplicitArgs is null; otherwise sizeof(ImplicitArgs) plus the
// cache-line-aligned size of the local ids for the work-group described by pImplicitArgs.
uint32_t getSizeForImplicitArgsPatching(const ImplicitArgs *pImplicitArgs, const KernelDescriptor &kernelDescriptor, const HardwareInfo &hardwareInfo);
// Programs the implicit-args payload starting at ptrToPatch.
// hwGenerationOfLocalIdsParams is forwarded to getDimensionOrderForLocalIds.
// NOTE(review): presumably returns the address just past the programmed region
// (ptrToPatch + getSizeForImplicitArgsPatching(...)) - confirm against the definition.
void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, const KernelDescriptor &kernelDescriptor, const HardwareInfo &hardwareInfo, std::optional<std::pair<bool /* localIdsGeneratedByRuntime */, uint32_t /* walkOrderForHwGenerationOfLocalIds */>> hwGenerationOfLocalIdsParams);
} // namespace ImplicitArgsHelper } // namespace ImplicitArgsHelper
} // namespace NEO } // namespace NEO

View File

@@ -5,14 +5,21 @@
* *
*/ */
#include "shared/source/helpers/basic_math.h"
#include "shared/source/helpers/hw_info.h"
#include "shared/source/helpers/hw_walk_order.h" #include "shared/source/helpers/hw_walk_order.h"
#include "shared/source/helpers/per_thread_data.h"
#include "shared/source/helpers/string.h"
#include "shared/source/helpers/vec.h"
#include "shared/source/kernel/implicit_args.h" #include "shared/source/kernel/implicit_args.h"
#include "shared/source/kernel/kernel_descriptor.h" #include "shared/source/kernel/kernel_descriptor.h"
namespace NEO { namespace NEO {
namespace ImplicitArgsHelper { namespace ImplicitArgsHelper {
std::array<uint8_t, 3> getDimensionOrderForLocalIds(const uint8_t *workgroupDimensionsOrder, bool generationOfLocalIdsByRuntime, uint32_t walkOrderForHwGenerationOfLocalIds) { std::array<uint8_t, 3> getDimensionOrderForLocalIds(const uint8_t *workgroupDimensionsOrder, std::optional<std::pair<bool, uint32_t>> hwGenerationOfLocalIdsParams) {
if (generationOfLocalIdsByRuntime) { auto localIdsGeneratedByRuntime = !hwGenerationOfLocalIdsParams.has_value() || hwGenerationOfLocalIdsParams.value().first;
if (localIdsGeneratedByRuntime) {
UNRECOVERABLE_IF(!workgroupDimensionsOrder); UNRECOVERABLE_IF(!workgroupDimensionsOrder);
return {{ return {{
workgroupDimensionsOrder[0], workgroupDimensionsOrder[0],
@@ -21,6 +28,7 @@ std::array<uint8_t, 3> getDimensionOrderForLocalIds(const uint8_t *workgroupDime
}}; }};
} }
auto walkOrderForHwGenerationOfLocalIds = hwGenerationOfLocalIdsParams.value().second;
UNRECOVERABLE_IF(walkOrderForHwGenerationOfLocalIds >= HwWalkOrderHelper::walkOrderPossibilties); UNRECOVERABLE_IF(walkOrderForHwGenerationOfLocalIds >= HwWalkOrderHelper::walkOrderPossibilties);
return HwWalkOrderHelper::compatibleDimensionOrders[walkOrderForHwGenerationOfLocalIds]; return HwWalkOrderHelper::compatibleDimensionOrders[walkOrderForHwGenerationOfLocalIds];
} }
@@ -31,5 +39,46 @@ uint32_t getGrfSize(uint32_t simd, uint32_t grfSize) {
} }
return grfSize; return grfSize;
} }
uint32_t getSizeForImplicitArgsPatching(const ImplicitArgs *pImplicitArgs, const KernelDescriptor &kernelDescriptor, const HardwareInfo &hardwareInfo) {
if (!pImplicitArgs) {
return 0;
}
auto implicitArgsSize = static_cast<uint32_t>(sizeof(NEO::ImplicitArgs));
auto simdSize = pImplicitArgs->simdWidth;
auto grfSize = NEO::ImplicitArgsHelper::getGrfSize(simdSize, hardwareInfo.capabilityTable.grfSize);
Vec3<size_t> localWorkSize = {pImplicitArgs->localSizeX, pImplicitArgs->localSizeY, pImplicitArgs->localSizeZ};
auto itemsInGroup = Math::computeTotalElementsCount(localWorkSize);
uint32_t localIdsSizeNeeded =
alignUp(static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
simdSize, grfSize, 3u, itemsInGroup)),
MemoryConstants::cacheLineSize);
return implicitArgsSize + localIdsSizeNeeded;
}
void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, const KernelDescriptor &kernelDescriptor, const HardwareInfo &hardwareInfo, std::optional<std::pair<bool, uint32_t>> hwGenerationOfLocalIdsParams) {
auto totalSizeToProgram = getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, hardwareInfo);
auto retVal = ptrOffset(ptrToPatch, totalSizeToProgram);
const auto &kernelAttributes = kernelDescriptor.kernelAttributes;
auto simdSize = kernelAttributes.simdSize;
auto grfSize = getGrfSize(simdSize, hardwareInfo.capabilityTable.grfSize);
auto dimensionOrder = getDimensionOrderForLocalIds(kernelAttributes.workgroupDimensionsOrder, hwGenerationOfLocalIdsParams);
NEO::generateLocalIDs(
ptrToPatch,
simdSize,
std::array<uint16_t, 3>{{static_cast<uint16_t>(implicitArgs.localSizeX),
static_cast<uint16_t>(implicitArgs.localSizeY),
static_cast<uint16_t>(implicitArgs.localSizeZ)}},
dimensionOrder,
false, grfSize);
auto sizeForLocalIdsProgramming = totalSizeToProgram - sizeof(NEO::ImplicitArgs);
ptrToPatch = ptrOffset(ptrToPatch, sizeForLocalIdsProgramming);
memcpy_s(ptrToPatch, sizeof(NEO::ImplicitArgs), &implicitArgs, sizeof(NEO::ImplicitArgs));
return retVal;
}
} // namespace ImplicitArgsHelper } // namespace ImplicitArgsHelper
} // namespace NEO } // namespace NEO

View File

@@ -1,5 +1,5 @@
/* /*
* Copyright (C) 2020-2021 Intel Corporation * Copyright (C) 2020-2022 Intel Corporation
* *
* SPDX-License-Identifier: MIT * SPDX-License-Identifier: MIT
* *
@@ -28,8 +28,6 @@ struct MockDispatchKernelEncoder : public DispatchKernelEncoderI {
} }
NEO::ImplicitArgs *getImplicitArgs() const override { return nullptr; } NEO::ImplicitArgs *getImplicitArgs() const override { return nullptr; }
uint32_t getSizeForImplicitArgsPatching() const override { return 0; }
void patchImplicitArgs(void *&pOut) const override {}
MockGraphicsAllocation mockAllocation{}; MockGraphicsAllocation mockAllocation{};
static constexpr uint32_t crossThreadSize = 0x40; static constexpr uint32_t crossThreadSize = 0x40;

View File

@@ -6,29 +6,36 @@
*/ */
#include "shared/source/helpers/hw_walk_order.h" #include "shared/source/helpers/hw_walk_order.h"
#include "shared/source/helpers/per_thread_data.h"
#include "shared/source/kernel/implicit_args.h" #include "shared/source/kernel/implicit_args.h"
#include "shared/source/kernel/kernel_descriptor.h"
#include "shared/test/common/helpers/default_hw_info.h"
#include "shared/test/common/test_macros/test.h" #include "shared/test/common/test_macros/test.h"
using namespace NEO; using namespace NEO;
TEST(ImplicitArgsHelperTest, whenLocalIdsAreGeneratedByRuntimeThenDimensionOrderIsTakedFromInput) { TEST(ImplicitArgsHelperTest, whenLocalIdsAreGeneratedByRuntimeThenDimensionOrderIsTakedFromInput) {
uint8_t inputDimensionOrder[3] = {2, 0, 1};
for (auto i = 0u; i < HwWalkOrderHelper::walkOrderPossibilties; i++) { for (auto i = 0u; i < HwWalkOrderHelper::walkOrderPossibilties; i++) {
uint8_t inputDimensionOrder[3] = {2, 0, 1}; auto dimOrderForImplicitArgs = ImplicitArgsHelper::getDimensionOrderForLocalIds(inputDimensionOrder, std::make_pair(true, i));
auto dimOrderForImplicitArgs = ImplicitArgsHelper::getDimensionOrderForLocalIds(inputDimensionOrder, true, i);
EXPECT_EQ(inputDimensionOrder[0], dimOrderForImplicitArgs[0]); EXPECT_EQ(inputDimensionOrder[0], dimOrderForImplicitArgs[0]);
EXPECT_EQ(inputDimensionOrder[1], dimOrderForImplicitArgs[1]); EXPECT_EQ(inputDimensionOrder[1], dimOrderForImplicitArgs[1]);
EXPECT_EQ(inputDimensionOrder[2], dimOrderForImplicitArgs[2]); EXPECT_EQ(inputDimensionOrder[2], dimOrderForImplicitArgs[2]);
} }
auto dimOrderForImplicitArgs = ImplicitArgsHelper::getDimensionOrderForLocalIds(inputDimensionOrder, {});
EXPECT_EQ(inputDimensionOrder[0], dimOrderForImplicitArgs[0]);
EXPECT_EQ(inputDimensionOrder[1], dimOrderForImplicitArgs[1]);
EXPECT_EQ(inputDimensionOrder[2], dimOrderForImplicitArgs[2]);
} }
TEST(ImplicitArgsHelperTest, givenIncorrectcInputWhenGettingDimensionOrderThenAbortIsCalled) { TEST(ImplicitArgsHelperTest, givenIncorrectcInputWhenGettingDimensionOrderThenAbortIsCalled) {
EXPECT_THROW(ImplicitArgsHelper::getDimensionOrderForLocalIds(nullptr, true, 0), std::runtime_error); EXPECT_THROW(ImplicitArgsHelper::getDimensionOrderForLocalIds(nullptr, std::make_pair(true, 0u)), std::runtime_error);
EXPECT_THROW(ImplicitArgsHelper::getDimensionOrderForLocalIds(nullptr, false, HwWalkOrderHelper::walkOrderPossibilties), std::runtime_error); EXPECT_THROW(ImplicitArgsHelper::getDimensionOrderForLocalIds(nullptr, std::make_pair(false, HwWalkOrderHelper::walkOrderPossibilties)), std::runtime_error);
} }
TEST(ImplicitArgsHelperTest, whenLocalIdsAreGeneratedByHwThenProperDimensionOrderIsReturned) { TEST(ImplicitArgsHelperTest, whenLocalIdsAreGeneratedByHwThenProperDimensionOrderIsReturned) {
for (auto i = 0u; i < HwWalkOrderHelper::walkOrderPossibilties; i++) { for (auto i = 0u; i < HwWalkOrderHelper::walkOrderPossibilties; i++) {
auto dimOrderForImplicitArgs = ImplicitArgsHelper::getDimensionOrderForLocalIds(nullptr, false, i); auto dimOrderForImplicitArgs = ImplicitArgsHelper::getDimensionOrderForLocalIds(nullptr, std::make_pair(false, i));
EXPECT_EQ(HwWalkOrderHelper::compatibleDimensionOrders[i], dimOrderForImplicitArgs); EXPECT_EQ(HwWalkOrderHelper::compatibleDimensionOrders[i], dimOrderForImplicitArgs);
} }
} }
@@ -43,4 +50,29 @@ TEST(ImplicitArgsHelperTest, givenSimdGreaterThanOneWhenGettingGrfSizeThenInputG
EXPECT_EQ(regularGrfsize, ImplicitArgsHelper::getGrfSize(8u, regularGrfsize)); EXPECT_EQ(regularGrfsize, ImplicitArgsHelper::getGrfSize(8u, regularGrfsize));
EXPECT_EQ(regularGrfsize, ImplicitArgsHelper::getGrfSize(16u, regularGrfsize)); EXPECT_EQ(regularGrfsize, ImplicitArgsHelper::getGrfSize(16u, regularGrfsize));
EXPECT_EQ(regularGrfsize, ImplicitArgsHelper::getGrfSize(32u, regularGrfsize)); EXPECT_EQ(regularGrfsize, ImplicitArgsHelper::getGrfSize(32u, regularGrfsize));
} }
TEST(ImplicitArgsHelperTest, givenNoImplicitArgsWhenGettingSizeForImplicitArgsProgrammingThenZeroIsReturned) {
    // A null ImplicitArgs pointer means there is nothing to program, so the size must be zero.
    const ImplicitArgs *noImplicitArgs = nullptr;
    KernelDescriptor kernelDescriptor{};
    const auto &hwInfo = *defaultHwInfo;

    auto sizeForPatching = ImplicitArgsHelper::getSizeForImplicitArgsPatching(noImplicitArgs, kernelDescriptor, hwInfo);

    EXPECT_EQ(0u, sizeForPatching);
}
TEST(ImplicitArgsHelperTest, givenImplicitArgsWhenGettingSizeForImplicitArgsProgrammingThenCorrectSizeIsReturned) {
    const auto &hwInfo = *defaultHwInfo;
    KernelDescriptor kernelDescriptor{};

    // SIMD32 kernel with a 2x3x4 work group.
    ImplicitArgs implicitArgs{sizeof(ImplicitArgs)};
    implicitArgs.simdWidth = 32;
    implicitArgs.localSizeX = 2;
    implicitArgs.localSizeY = 3;
    implicitArgs.localSizeZ = 4;

    // Expected size: cache-line-aligned local ids for three channels plus the struct size.
    auto workItemsInGroup = implicitArgs.localSizeX * implicitArgs.localSizeY * implicitArgs.localSizeZ;
    auto perThreadDataSize = PerThreadDataHelper::getPerThreadDataSizeTotal(implicitArgs.simdWidth, hwInfo.capabilityTable.grfSize, 3u, workItemsInGroup);
    auto expectedLocalIdsSize = alignUp(perThreadDataSize, MemoryConstants::cacheLineSize);
    auto expectedTotalSize = expectedLocalIdsSize + implicitArgs.structSize;

    EXPECT_EQ(expectedTotalSize, ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, hwInfo));
}