mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-03 06:49:52 +08:00
Correct cross thread data GPU address in indirect dispatch programming
Related-To: NEO-5081
Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
5e201f40be
commit
b891ec2588
@@ -48,8 +48,18 @@ HWTEST_F(CommandListDualStorage, givenIndirectDispatchWithSharedDualStorageMemor
|
||||
using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;
|
||||
|
||||
Mock<::L0::Kernel> kernel;
|
||||
kernel.descriptor.payloadMappings.dispatchTraits.numWorkGroups[0] = 2;
|
||||
kernel.descriptor.payloadMappings.dispatchTraits.globalWorkSize[0] = 2;
|
||||
uint32_t globalWorkSizeXOffset = 0x20u;
|
||||
uint32_t globalWorkSizeYOffset = 0x24u;
|
||||
uint32_t globalWorkSizeZOffset = 0x28u;
|
||||
uint32_t numWorkGroupXOffset = 0x30u;
|
||||
uint32_t numWorkGroupYOffset = 0x34u;
|
||||
uint32_t numWorkGroupZOffset = 0x38u;
|
||||
kernel.descriptor.payloadMappings.dispatchTraits.globalWorkSize[0] = globalWorkSizeXOffset;
|
||||
kernel.descriptor.payloadMappings.dispatchTraits.globalWorkSize[1] = globalWorkSizeYOffset;
|
||||
kernel.descriptor.payloadMappings.dispatchTraits.globalWorkSize[2] = globalWorkSizeZOffset;
|
||||
kernel.descriptor.payloadMappings.dispatchTraits.numWorkGroups[0] = numWorkGroupXOffset;
|
||||
kernel.descriptor.payloadMappings.dispatchTraits.numWorkGroups[1] = numWorkGroupYOffset;
|
||||
kernel.descriptor.payloadMappings.dispatchTraits.numWorkGroups[2] = numWorkGroupZOffset;
|
||||
ze_result_t returnValue;
|
||||
std::unique_ptr<L0::CommandList> commandList(L0::CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, returnValue));
|
||||
|
||||
@@ -123,6 +133,36 @@ HWTEST_F(CommandListDualStorage, givenIndirectDispatchWithSharedDualStorageMemor
|
||||
} while (itor != cmdList.end() && cmd2->getRegisterAddress() != GPUGPU_DISPATCHDIMX);
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
|
||||
auto groupCountStoreRegisterMemCmd = FamilyType::cmdInitStoreRegisterMem;
|
||||
groupCountStoreRegisterMemCmd.setRegisterAddress(GPUGPU_DISPATCHDIMX);
|
||||
groupCountStoreRegisterMemCmd.setMemoryAddress(commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress() + numWorkGroupXOffset);
|
||||
|
||||
EXPECT_EQ(cmd2->getRegisterAddress(), groupCountStoreRegisterMemCmd.getRegisterAddress());
|
||||
EXPECT_EQ(cmd2->getMemoryAddress(), groupCountStoreRegisterMemCmd.getMemoryAddress());
|
||||
|
||||
itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
|
||||
|
||||
groupCountStoreRegisterMemCmd.setRegisterAddress(GPUGPU_DISPATCHDIMY);
|
||||
groupCountStoreRegisterMemCmd.setMemoryAddress(commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress() + numWorkGroupYOffset);
|
||||
|
||||
EXPECT_EQ(cmd2->getRegisterAddress(), groupCountStoreRegisterMemCmd.getRegisterAddress());
|
||||
EXPECT_EQ(cmd2->getMemoryAddress(), groupCountStoreRegisterMemCmd.getMemoryAddress());
|
||||
|
||||
itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
|
||||
|
||||
groupCountStoreRegisterMemCmd.setRegisterAddress(GPUGPU_DISPATCHDIMZ);
|
||||
groupCountStoreRegisterMemCmd.setMemoryAddress(commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress() + numWorkGroupZOffset);
|
||||
|
||||
EXPECT_EQ(cmd2->getRegisterAddress(), groupCountStoreRegisterMemCmd.getRegisterAddress());
|
||||
EXPECT_EQ(cmd2->getMemoryAddress(), groupCountStoreRegisterMemCmd.getMemoryAddress());
|
||||
|
||||
auto workSizeStoreRegisterMemCmd = FamilyType::cmdInitStoreRegisterMem;
|
||||
workSizeStoreRegisterMemCmd.setRegisterAddress(CS_GPR_R1);
|
||||
|
||||
// Find workgroup size cmds
|
||||
itor = find<MI_LOAD_REGISTER_REG *>(++itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
@@ -131,7 +171,212 @@ HWTEST_F(CommandListDualStorage, givenIndirectDispatchWithSharedDualStorageMemor
|
||||
itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
|
||||
EXPECT_EQ(CS_GPR_R1, cmd2->getRegisterAddress());
|
||||
|
||||
workSizeStoreRegisterMemCmd.setMemoryAddress(commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress() + globalWorkSizeXOffset);
|
||||
|
||||
EXPECT_EQ(cmd2->getRegisterAddress(), workSizeStoreRegisterMemCmd.getRegisterAddress());
|
||||
EXPECT_EQ(cmd2->getMemoryAddress(), workSizeStoreRegisterMemCmd.getMemoryAddress());
|
||||
|
||||
itor = find<MI_LOAD_REGISTER_REG *>(++itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
|
||||
|
||||
workSizeStoreRegisterMemCmd.setMemoryAddress(commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress() + globalWorkSizeYOffset);
|
||||
|
||||
EXPECT_EQ(cmd2->getRegisterAddress(), workSizeStoreRegisterMemCmd.getRegisterAddress());
|
||||
EXPECT_EQ(cmd2->getMemoryAddress(), workSizeStoreRegisterMemCmd.getMemoryAddress());
|
||||
|
||||
EXPECT_EQ(cmd2->getRegisterAddress(), workSizeStoreRegisterMemCmd.getRegisterAddress());
|
||||
EXPECT_EQ(cmd2->getMemoryAddress(), workSizeStoreRegisterMemCmd.getMemoryAddress());
|
||||
|
||||
itor = find<MI_LOAD_REGISTER_REG *>(++itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
|
||||
|
||||
workSizeStoreRegisterMemCmd.setMemoryAddress(commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress() + globalWorkSizeZOffset);
|
||||
|
||||
EXPECT_EQ(cmd2->getRegisterAddress(), workSizeStoreRegisterMemCmd.getRegisterAddress());
|
||||
EXPECT_EQ(cmd2->getMemoryAddress(), workSizeStoreRegisterMemCmd.getMemoryAddress());
|
||||
|
||||
context->freeMem(alloc);
|
||||
}
|
||||
|
||||
HWCMDTEST_F(IGFX_XE_HP_CORE, CommandListDualStorage, givenIndirectDispatchWithSharedDualStorageMemoryAndInlineDataWhenAppendingThenWorkGroupCountAndGlobalWorkSizeIsSetInCrossThreadData) {
|
||||
using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM;
|
||||
using MI_LOAD_REGISTER_MEM = typename FamilyType::MI_LOAD_REGISTER_MEM;
|
||||
using MI_LOAD_REGISTER_REG = typename FamilyType::MI_LOAD_REGISTER_REG;
|
||||
using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;
|
||||
using INLINE_DATA = typename FamilyType::INLINE_DATA;
|
||||
|
||||
Mock<::L0::Kernel> kernel;
|
||||
kernel.crossThreadDataSize = 0x60u;
|
||||
kernel.descriptor.kernelAttributes.flags.passInlineData = true;
|
||||
|
||||
uint32_t globalWorkSizeXOffset = 0x40u;
|
||||
uint32_t globalWorkSizeYOffset = 0x44u;
|
||||
uint32_t globalWorkSizeZOffset = 0x48u;
|
||||
uint32_t numWorkGroupXOffset = 0x30u;
|
||||
uint32_t numWorkGroupYOffset = 0x34u;
|
||||
uint32_t numWorkGroupZOffset = 0x38u;
|
||||
kernel.descriptor.payloadMappings.dispatchTraits.globalWorkSize[0] = globalWorkSizeXOffset;
|
||||
kernel.descriptor.payloadMappings.dispatchTraits.globalWorkSize[1] = globalWorkSizeYOffset;
|
||||
kernel.descriptor.payloadMappings.dispatchTraits.globalWorkSize[2] = globalWorkSizeZOffset;
|
||||
kernel.descriptor.payloadMappings.dispatchTraits.numWorkGroups[0] = numWorkGroupXOffset;
|
||||
kernel.descriptor.payloadMappings.dispatchTraits.numWorkGroups[1] = numWorkGroupYOffset;
|
||||
kernel.descriptor.payloadMappings.dispatchTraits.numWorkGroups[2] = numWorkGroupZOffset;
|
||||
ze_result_t returnValue;
|
||||
std::unique_ptr<L0::CommandList> commandList(L0::CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, returnValue));
|
||||
|
||||
void *alloc = nullptr;
|
||||
ze_device_mem_alloc_desc_t deviceDesc = {};
|
||||
ze_host_mem_alloc_desc_t hostDesc = {};
|
||||
auto result = context->allocSharedMem(device->toHandle(), &deviceDesc, &hostDesc, 16384u, 4096u, &alloc);
|
||||
ASSERT_EQ(ZE_RESULT_SUCCESS, result);
|
||||
|
||||
ze_group_count_t *pThreadGroupDimensions = static_cast<ze_group_count_t *>(ptrOffset(alloc, sizeof(ze_group_count_t)));
|
||||
|
||||
pThreadGroupDimensions->groupCountX = 3;
|
||||
pThreadGroupDimensions->groupCountY = 4;
|
||||
pThreadGroupDimensions->groupCountZ = 5;
|
||||
|
||||
result = commandList->appendLaunchKernelIndirect(kernel.toHandle(),
|
||||
pThreadGroupDimensions,
|
||||
nullptr, 0, nullptr);
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
|
||||
|
||||
auto allocData = device->getDriverHandle()->getSvmAllocsManager()->getSVMAlloc(pThreadGroupDimensions);
|
||||
ASSERT_NE(nullptr, allocData->cpuAllocation);
|
||||
auto gpuAllocation = allocData->gpuAllocations.getGraphicsAllocation(device->getNEODevice()->getRootDeviceIndex());
|
||||
ASSERT_NE(nullptr, gpuAllocation);
|
||||
|
||||
GenCmdList cmdList;
|
||||
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
|
||||
cmdList, ptrOffset(commandList->commandContainer.getCommandStream()->getCpuBase(), 0), commandList->commandContainer.getCommandStream()->getUsed()));
|
||||
|
||||
uint32_t regAddress = 0;
|
||||
uint64_t gpuAddress = 0;
|
||||
auto expectedXAddress = reinterpret_cast<uint64_t>(ptrOffset(pThreadGroupDimensions, offsetof(ze_group_count_t, groupCountX)));
|
||||
auto expectedYAddress = reinterpret_cast<uint64_t>(ptrOffset(pThreadGroupDimensions, offsetof(ze_group_count_t, groupCountY)));
|
||||
auto expectedZAddress = reinterpret_cast<uint64_t>(ptrOffset(pThreadGroupDimensions, offsetof(ze_group_count_t, groupCountZ)));
|
||||
|
||||
auto itor = find<MI_LOAD_REGISTER_MEM *>(cmdList.begin(), cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
|
||||
auto cmd = genCmdCast<MI_LOAD_REGISTER_MEM *>(*itor);
|
||||
regAddress = cmd->getRegisterAddress();
|
||||
gpuAddress = cmd->getMemoryAddress();
|
||||
|
||||
EXPECT_EQ(GPUGPU_DISPATCHDIMX, regAddress);
|
||||
EXPECT_EQ(expectedXAddress, gpuAddress);
|
||||
|
||||
itor = find<MI_LOAD_REGISTER_MEM *>(++itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
|
||||
cmd = genCmdCast<MI_LOAD_REGISTER_MEM *>(*itor);
|
||||
regAddress = cmd->getRegisterAddress();
|
||||
gpuAddress = cmd->getMemoryAddress();
|
||||
|
||||
EXPECT_EQ(GPUGPU_DISPATCHDIMY, regAddress);
|
||||
EXPECT_EQ(expectedYAddress, gpuAddress);
|
||||
|
||||
itor = find<MI_LOAD_REGISTER_MEM *>(++itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
|
||||
cmd = genCmdCast<MI_LOAD_REGISTER_MEM *>(*itor);
|
||||
regAddress = cmd->getRegisterAddress();
|
||||
gpuAddress = cmd->getMemoryAddress();
|
||||
|
||||
EXPECT_EQ(GPUGPU_DISPATCHDIMZ, regAddress);
|
||||
EXPECT_EQ(expectedZAddress, gpuAddress);
|
||||
|
||||
MI_STORE_REGISTER_MEM *cmd2 = nullptr;
|
||||
// Find group count cmds
|
||||
do {
|
||||
itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
|
||||
cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
|
||||
} while (itor != cmdList.end() && cmd2->getRegisterAddress() != GPUGPU_DISPATCHDIMX);
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
|
||||
auto groupCountStoreRegisterMemCmd = FamilyType::cmdInitStoreRegisterMem;
|
||||
groupCountStoreRegisterMemCmd.setRegisterAddress(GPUGPU_DISPATCHDIMX);
|
||||
groupCountStoreRegisterMemCmd.setMemoryAddress(commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress() + numWorkGroupXOffset - sizeof(INLINE_DATA));
|
||||
|
||||
EXPECT_EQ(cmd2->getRegisterAddress(), groupCountStoreRegisterMemCmd.getRegisterAddress());
|
||||
EXPECT_EQ(cmd2->getMemoryAddress(), groupCountStoreRegisterMemCmd.getMemoryAddress());
|
||||
|
||||
itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
|
||||
|
||||
groupCountStoreRegisterMemCmd.setRegisterAddress(GPUGPU_DISPATCHDIMY);
|
||||
groupCountStoreRegisterMemCmd.setMemoryAddress(commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress() + numWorkGroupYOffset - sizeof(INLINE_DATA));
|
||||
|
||||
EXPECT_EQ(cmd2->getRegisterAddress(), groupCountStoreRegisterMemCmd.getRegisterAddress());
|
||||
EXPECT_EQ(cmd2->getMemoryAddress(), groupCountStoreRegisterMemCmd.getMemoryAddress());
|
||||
|
||||
itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
|
||||
|
||||
groupCountStoreRegisterMemCmd.setRegisterAddress(GPUGPU_DISPATCHDIMZ);
|
||||
groupCountStoreRegisterMemCmd.setMemoryAddress(commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress() + numWorkGroupZOffset - sizeof(INLINE_DATA));
|
||||
|
||||
EXPECT_EQ(cmd2->getRegisterAddress(), groupCountStoreRegisterMemCmd.getRegisterAddress());
|
||||
EXPECT_EQ(cmd2->getMemoryAddress(), groupCountStoreRegisterMemCmd.getMemoryAddress());
|
||||
|
||||
auto workSizeStoreRegisterMemCmd = FamilyType::cmdInitStoreRegisterMem;
|
||||
workSizeStoreRegisterMemCmd.setRegisterAddress(CS_GPR_R1);
|
||||
|
||||
// Find workgroup size cmds
|
||||
itor = find<MI_LOAD_REGISTER_REG *>(++itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
|
||||
|
||||
workSizeStoreRegisterMemCmd.setMemoryAddress(commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress() + globalWorkSizeXOffset - sizeof(INLINE_DATA));
|
||||
|
||||
EXPECT_EQ(cmd2->getRegisterAddress(), workSizeStoreRegisterMemCmd.getRegisterAddress());
|
||||
EXPECT_EQ(cmd2->getMemoryAddress(), workSizeStoreRegisterMemCmd.getMemoryAddress());
|
||||
|
||||
itor = find<MI_LOAD_REGISTER_REG *>(++itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
|
||||
|
||||
workSizeStoreRegisterMemCmd.setMemoryAddress(commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress() + globalWorkSizeYOffset - sizeof(INLINE_DATA));
|
||||
|
||||
EXPECT_EQ(cmd2->getRegisterAddress(), workSizeStoreRegisterMemCmd.getRegisterAddress());
|
||||
EXPECT_EQ(cmd2->getMemoryAddress(), workSizeStoreRegisterMemCmd.getMemoryAddress());
|
||||
|
||||
EXPECT_EQ(cmd2->getRegisterAddress(), workSizeStoreRegisterMemCmd.getRegisterAddress());
|
||||
EXPECT_EQ(cmd2->getMemoryAddress(), workSizeStoreRegisterMemCmd.getMemoryAddress());
|
||||
|
||||
itor = find<MI_LOAD_REGISTER_REG *>(++itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
|
||||
|
||||
workSizeStoreRegisterMemCmd.setMemoryAddress(commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress() + globalWorkSizeZOffset - sizeof(INLINE_DATA));
|
||||
|
||||
EXPECT_EQ(cmd2->getRegisterAddress(), workSizeStoreRegisterMemCmd.getRegisterAddress());
|
||||
EXPECT_EQ(cmd2->getMemoryAddress(), workSizeStoreRegisterMemCmd.getMemoryAddress());
|
||||
|
||||
context->freeMem(alloc);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user