mirror of
https://github.com/intel/compute-runtime.git
synced 2025-09-15 13:01:45 +08:00
Correct cross thread data GPU address in indirect dispatch programming
Related-To: NEO-5081
Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
This commit is contained in:

committed by
Compute-Runtime-Automation

parent
5e201f40be
commit
b891ec2588
@ -48,8 +48,18 @@ HWTEST_F(CommandListDualStorage, givenIndirectDispatchWithSharedDualStorageMemor
|
||||
using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;
|
||||
|
||||
Mock<::L0::Kernel> kernel;
|
||||
kernel.descriptor.payloadMappings.dispatchTraits.numWorkGroups[0] = 2;
|
||||
kernel.descriptor.payloadMappings.dispatchTraits.globalWorkSize[0] = 2;
|
||||
uint32_t globalWorkSizeXOffset = 0x20u;
|
||||
uint32_t globalWorkSizeYOffset = 0x24u;
|
||||
uint32_t globalWorkSizeZOffset = 0x28u;
|
||||
uint32_t numWorkGroupXOffset = 0x30u;
|
||||
uint32_t numWorkGroupYOffset = 0x34u;
|
||||
uint32_t numWorkGroupZOffset = 0x38u;
|
||||
kernel.descriptor.payloadMappings.dispatchTraits.globalWorkSize[0] = globalWorkSizeXOffset;
|
||||
kernel.descriptor.payloadMappings.dispatchTraits.globalWorkSize[1] = globalWorkSizeYOffset;
|
||||
kernel.descriptor.payloadMappings.dispatchTraits.globalWorkSize[2] = globalWorkSizeZOffset;
|
||||
kernel.descriptor.payloadMappings.dispatchTraits.numWorkGroups[0] = numWorkGroupXOffset;
|
||||
kernel.descriptor.payloadMappings.dispatchTraits.numWorkGroups[1] = numWorkGroupYOffset;
|
||||
kernel.descriptor.payloadMappings.dispatchTraits.numWorkGroups[2] = numWorkGroupZOffset;
|
||||
ze_result_t returnValue;
|
||||
std::unique_ptr<L0::CommandList> commandList(L0::CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, returnValue));
|
||||
|
||||
@ -123,6 +133,36 @@ HWTEST_F(CommandListDualStorage, givenIndirectDispatchWithSharedDualStorageMemor
|
||||
} while (itor != cmdList.end() && cmd2->getRegisterAddress() != GPUGPU_DISPATCHDIMX);
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
|
||||
auto groupCountStoreRegisterMemCmd = FamilyType::cmdInitStoreRegisterMem;
|
||||
groupCountStoreRegisterMemCmd.setRegisterAddress(GPUGPU_DISPATCHDIMX);
|
||||
groupCountStoreRegisterMemCmd.setMemoryAddress(commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress() + numWorkGroupXOffset);
|
||||
|
||||
EXPECT_EQ(cmd2->getRegisterAddress(), groupCountStoreRegisterMemCmd.getRegisterAddress());
|
||||
EXPECT_EQ(cmd2->getMemoryAddress(), groupCountStoreRegisterMemCmd.getMemoryAddress());
|
||||
|
||||
itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
|
||||
|
||||
groupCountStoreRegisterMemCmd.setRegisterAddress(GPUGPU_DISPATCHDIMY);
|
||||
groupCountStoreRegisterMemCmd.setMemoryAddress(commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress() + numWorkGroupYOffset);
|
||||
|
||||
EXPECT_EQ(cmd2->getRegisterAddress(), groupCountStoreRegisterMemCmd.getRegisterAddress());
|
||||
EXPECT_EQ(cmd2->getMemoryAddress(), groupCountStoreRegisterMemCmd.getMemoryAddress());
|
||||
|
||||
itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
|
||||
|
||||
groupCountStoreRegisterMemCmd.setRegisterAddress(GPUGPU_DISPATCHDIMZ);
|
||||
groupCountStoreRegisterMemCmd.setMemoryAddress(commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress() + numWorkGroupZOffset);
|
||||
|
||||
EXPECT_EQ(cmd2->getRegisterAddress(), groupCountStoreRegisterMemCmd.getRegisterAddress());
|
||||
EXPECT_EQ(cmd2->getMemoryAddress(), groupCountStoreRegisterMemCmd.getMemoryAddress());
|
||||
|
||||
auto workSizeStoreRegisterMemCmd = FamilyType::cmdInitStoreRegisterMem;
|
||||
workSizeStoreRegisterMemCmd.setRegisterAddress(CS_GPR_R1);
|
||||
|
||||
// Find workgroup size cmds
|
||||
itor = find<MI_LOAD_REGISTER_REG *>(++itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
@ -131,7 +171,212 @@ HWTEST_F(CommandListDualStorage, givenIndirectDispatchWithSharedDualStorageMemor
|
||||
itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
|
||||
EXPECT_EQ(CS_GPR_R1, cmd2->getRegisterAddress());
|
||||
|
||||
workSizeStoreRegisterMemCmd.setMemoryAddress(commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress() + globalWorkSizeXOffset);
|
||||
|
||||
EXPECT_EQ(cmd2->getRegisterAddress(), workSizeStoreRegisterMemCmd.getRegisterAddress());
|
||||
EXPECT_EQ(cmd2->getMemoryAddress(), workSizeStoreRegisterMemCmd.getMemoryAddress());
|
||||
|
||||
itor = find<MI_LOAD_REGISTER_REG *>(++itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
|
||||
|
||||
workSizeStoreRegisterMemCmd.setMemoryAddress(commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress() + globalWorkSizeYOffset);
|
||||
|
||||
EXPECT_EQ(cmd2->getRegisterAddress(), workSizeStoreRegisterMemCmd.getRegisterAddress());
|
||||
EXPECT_EQ(cmd2->getMemoryAddress(), workSizeStoreRegisterMemCmd.getMemoryAddress());
|
||||
|
||||
EXPECT_EQ(cmd2->getRegisterAddress(), workSizeStoreRegisterMemCmd.getRegisterAddress());
|
||||
EXPECT_EQ(cmd2->getMemoryAddress(), workSizeStoreRegisterMemCmd.getMemoryAddress());
|
||||
|
||||
itor = find<MI_LOAD_REGISTER_REG *>(++itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
|
||||
|
||||
workSizeStoreRegisterMemCmd.setMemoryAddress(commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress() + globalWorkSizeZOffset);
|
||||
|
||||
EXPECT_EQ(cmd2->getRegisterAddress(), workSizeStoreRegisterMemCmd.getRegisterAddress());
|
||||
EXPECT_EQ(cmd2->getMemoryAddress(), workSizeStoreRegisterMemCmd.getMemoryAddress());
|
||||
|
||||
context->freeMem(alloc);
|
||||
}
|
||||
|
||||
// Indirect dispatch with a kernel that passes inline data.
//
// The test appends an indirect kernel launch whose group counts live in shared memory and
// then walks the generated command stream, checking that:
//  * three MI_LOAD_REGISTER_MEM commands load the X/Y/Z group counts from the user buffer
//    into the GPUGPU_DISPATCHDIM{X,Y,Z} registers,
//  * three MI_STORE_REGISTER_MEM commands store those registers at
//    <indirect heap GPU address> + <numWorkGroups offset> - sizeof(INLINE_DATA),
//  * three MI_LOAD_REGISTER_REG / MI_LOAD_REGISTER_IMM / MI_STORE_REGISTER_MEM sequences
//    store CS_GPR_R1 at
//    <indirect heap GPU address> + <globalWorkSize offset> - sizeof(INLINE_DATA).
//
// Every "command was found" check is an ASSERT because the iterator is dereferenced right
// afterwards; continuing after a failed lookup would dereference cmdList.end().
HWCMDTEST_F(IGFX_XE_HP_CORE, CommandListDualStorage, givenIndirectDispatchWithSharedDualStorageMemoryAndInlineDataWhenAppendingThenWorkGroupCountAndGlobalWorkSizeIsSetInCrossThreadData) {
    using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM;
    using MI_LOAD_REGISTER_MEM = typename FamilyType::MI_LOAD_REGISTER_MEM;
    using MI_LOAD_REGISTER_REG = typename FamilyType::MI_LOAD_REGISTER_REG;
    using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;
    using INLINE_DATA = typename FamilyType::INLINE_DATA;

    Mock<::L0::Kernel> kernel;
    kernel.crossThreadDataSize = 0x60u;
    kernel.descriptor.kernelAttributes.flags.passInlineData = true;

    // Cross-thread-data offsets the kernel descriptor reports for the dispatch traits.
    uint32_t globalWorkSizeXOffset = 0x40u;
    uint32_t globalWorkSizeYOffset = 0x44u;
    uint32_t globalWorkSizeZOffset = 0x48u;
    uint32_t numWorkGroupXOffset = 0x30u;
    uint32_t numWorkGroupYOffset = 0x34u;
    uint32_t numWorkGroupZOffset = 0x38u;
    kernel.descriptor.payloadMappings.dispatchTraits.globalWorkSize[0] = globalWorkSizeXOffset;
    kernel.descriptor.payloadMappings.dispatchTraits.globalWorkSize[1] = globalWorkSizeYOffset;
    kernel.descriptor.payloadMappings.dispatchTraits.globalWorkSize[2] = globalWorkSizeZOffset;
    kernel.descriptor.payloadMappings.dispatchTraits.numWorkGroups[0] = numWorkGroupXOffset;
    kernel.descriptor.payloadMappings.dispatchTraits.numWorkGroups[1] = numWorkGroupYOffset;
    kernel.descriptor.payloadMappings.dispatchTraits.numWorkGroups[2] = numWorkGroupZOffset;

    ze_result_t returnValue;
    std::unique_ptr<L0::CommandList> commandList(L0::CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, returnValue));

    void *alloc = nullptr;
    ze_device_mem_alloc_desc_t deviceDesc = {};
    ze_host_mem_alloc_desc_t hostDesc = {};
    auto result = context->allocSharedMem(device->toHandle(), &deviceDesc, &hostDesc, 16384u, 4096u, &alloc);
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);

    // Place the group-count structure at a non-zero offset inside the shared allocation.
    ze_group_count_t *pThreadGroupDimensions = static_cast<ze_group_count_t *>(ptrOffset(alloc, sizeof(ze_group_count_t)));
    pThreadGroupDimensions->groupCountX = 3;
    pThreadGroupDimensions->groupCountY = 4;
    pThreadGroupDimensions->groupCountZ = 5;

    result = commandList->appendLaunchKernelIndirect(kernel.toHandle(),
                                                     pThreadGroupDimensions,
                                                     nullptr, 0, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);

    auto allocData = device->getDriverHandle()->getSvmAllocsManager()->getSVMAlloc(pThreadGroupDimensions);
    ASSERT_NE(nullptr, allocData); // guard the dereferences below
    ASSERT_NE(nullptr, allocData->cpuAllocation);
    auto gpuAllocation = allocData->gpuAllocations.getGraphicsAllocation(device->getNEODevice()->getRootDeviceIndex());
    ASSERT_NE(nullptr, gpuAllocation);

    GenCmdList cmdList;
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
        cmdList, ptrOffset(commandList->commandContainer.getCommandStream()->getCpuBase(), 0), commandList->commandContainer.getCommandStream()->getUsed()));

    // GPU address of the indirect heap allocation; all expected store addresses are relative to it.
    const uint64_t indirectHeapGpuAddress = commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress();

    auto expectedXAddress = reinterpret_cast<uint64_t>(ptrOffset(pThreadGroupDimensions, offsetof(ze_group_count_t, groupCountX)));
    auto expectedYAddress = reinterpret_cast<uint64_t>(ptrOffset(pThreadGroupDimensions, offsetof(ze_group_count_t, groupCountY)));
    auto expectedZAddress = reinterpret_cast<uint64_t>(ptrOffset(pThreadGroupDimensions, offsetof(ze_group_count_t, groupCountZ)));

    // Group counts are loaded from the user buffer into the dispatch dimension registers.
    auto itor = find<MI_LOAD_REGISTER_MEM *>(cmdList.begin(), cmdList.end());
    ASSERT_NE(cmdList.end(), itor);
    auto cmd = genCmdCast<MI_LOAD_REGISTER_MEM *>(*itor);
    EXPECT_EQ(GPUGPU_DISPATCHDIMX, cmd->getRegisterAddress());
    EXPECT_EQ(expectedXAddress, cmd->getMemoryAddress());

    itor = find<MI_LOAD_REGISTER_MEM *>(++itor, cmdList.end());
    ASSERT_NE(cmdList.end(), itor);
    cmd = genCmdCast<MI_LOAD_REGISTER_MEM *>(*itor);
    EXPECT_EQ(GPUGPU_DISPATCHDIMY, cmd->getRegisterAddress());
    EXPECT_EQ(expectedYAddress, cmd->getMemoryAddress());

    itor = find<MI_LOAD_REGISTER_MEM *>(++itor, cmdList.end());
    ASSERT_NE(cmdList.end(), itor);
    cmd = genCmdCast<MI_LOAD_REGISTER_MEM *>(*itor);
    EXPECT_EQ(GPUGPU_DISPATCHDIMZ, cmd->getRegisterAddress());
    EXPECT_EQ(expectedZAddress, cmd->getMemoryAddress());

    // Find group count cmds: skip stores until the one that sources GPUGPU_DISPATCHDIMX.
    // The end check happens before the cast so that cmdList.end() is never dereferenced.
    MI_STORE_REGISTER_MEM *cmd2 = nullptr;
    do {
        itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
        cmd2 = (itor != cmdList.end()) ? genCmdCast<MI_STORE_REGISTER_MEM *>(*itor) : nullptr;
    } while (cmd2 != nullptr && cmd2->getRegisterAddress() != GPUGPU_DISPATCHDIMX);
    ASSERT_NE(cmdList.end(), itor);

    auto groupCountStoreRegisterMemCmd = FamilyType::cmdInitStoreRegisterMem;
    groupCountStoreRegisterMemCmd.setRegisterAddress(GPUGPU_DISPATCHDIMX);
    groupCountStoreRegisterMemCmd.setMemoryAddress(indirectHeapGpuAddress + numWorkGroupXOffset - sizeof(INLINE_DATA));
    EXPECT_EQ(cmd2->getRegisterAddress(), groupCountStoreRegisterMemCmd.getRegisterAddress());
    EXPECT_EQ(cmd2->getMemoryAddress(), groupCountStoreRegisterMemCmd.getMemoryAddress());

    itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
    ASSERT_NE(cmdList.end(), itor);
    cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
    groupCountStoreRegisterMemCmd.setRegisterAddress(GPUGPU_DISPATCHDIMY);
    groupCountStoreRegisterMemCmd.setMemoryAddress(indirectHeapGpuAddress + numWorkGroupYOffset - sizeof(INLINE_DATA));
    EXPECT_EQ(cmd2->getRegisterAddress(), groupCountStoreRegisterMemCmd.getRegisterAddress());
    EXPECT_EQ(cmd2->getMemoryAddress(), groupCountStoreRegisterMemCmd.getMemoryAddress());

    itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
    ASSERT_NE(cmdList.end(), itor);
    cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
    groupCountStoreRegisterMemCmd.setRegisterAddress(GPUGPU_DISPATCHDIMZ);
    groupCountStoreRegisterMemCmd.setMemoryAddress(indirectHeapGpuAddress + numWorkGroupZOffset - sizeof(INLINE_DATA));
    EXPECT_EQ(cmd2->getRegisterAddress(), groupCountStoreRegisterMemCmd.getRegisterAddress());
    EXPECT_EQ(cmd2->getMemoryAddress(), groupCountStoreRegisterMemCmd.getMemoryAddress());

    auto workSizeStoreRegisterMemCmd = FamilyType::cmdInitStoreRegisterMem;
    workSizeStoreRegisterMemCmd.setRegisterAddress(CS_GPR_R1);

    // Find workgroup size cmds: each dimension is a LOAD_REGISTER_REG, LOAD_REGISTER_IMM,
    // STORE_REGISTER_MEM sequence; only the final store is inspected.
    itor = find<MI_LOAD_REGISTER_REG *>(++itor, cmdList.end());
    ASSERT_NE(cmdList.end(), itor);
    itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
    ASSERT_NE(cmdList.end(), itor);
    itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
    ASSERT_NE(cmdList.end(), itor);
    cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
    workSizeStoreRegisterMemCmd.setMemoryAddress(indirectHeapGpuAddress + globalWorkSizeXOffset - sizeof(INLINE_DATA));
    EXPECT_EQ(cmd2->getRegisterAddress(), workSizeStoreRegisterMemCmd.getRegisterAddress());
    EXPECT_EQ(cmd2->getMemoryAddress(), workSizeStoreRegisterMemCmd.getMemoryAddress());

    itor = find<MI_LOAD_REGISTER_REG *>(++itor, cmdList.end());
    ASSERT_NE(cmdList.end(), itor);
    itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
    ASSERT_NE(cmdList.end(), itor);
    itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
    ASSERT_NE(cmdList.end(), itor);
    cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
    workSizeStoreRegisterMemCmd.setMemoryAddress(indirectHeapGpuAddress + globalWorkSizeYOffset - sizeof(INLINE_DATA));
    EXPECT_EQ(cmd2->getRegisterAddress(), workSizeStoreRegisterMemCmd.getRegisterAddress());
    EXPECT_EQ(cmd2->getMemoryAddress(), workSizeStoreRegisterMemCmd.getMemoryAddress());

    itor = find<MI_LOAD_REGISTER_REG *>(++itor, cmdList.end());
    ASSERT_NE(cmdList.end(), itor);
    itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
    ASSERT_NE(cmdList.end(), itor);
    itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
    ASSERT_NE(cmdList.end(), itor);
    cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
    workSizeStoreRegisterMemCmd.setMemoryAddress(indirectHeapGpuAddress + globalWorkSizeZOffset - sizeof(INLINE_DATA));
    EXPECT_EQ(cmd2->getRegisterAddress(), workSizeStoreRegisterMemCmd.getRegisterAddress());
    EXPECT_EQ(cmd2->getMemoryAddress(), workSizeStoreRegisterMemCmd.getMemoryAddress());

    context->freeMem(alloc);
}
|
||||
|
@ -173,6 +173,8 @@ struct EncodeIndirectParams {
|
||||
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
|
||||
using MI_MATH = typename GfxFamily::MI_MATH;
|
||||
using MI_MATH_ALU_INST_INLINE = typename GfxFamily::MI_MATH_ALU_INST_INLINE;
|
||||
|
||||
static void encode(CommandContainer &container, void *crossThreadDataGpuVa, DispatchKernelEncoderI *dispatchInterface);
|
||||
static void setGroupCountIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress);
|
||||
static void setWorkDimIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offset, void *crossThreadAddress, const uint32_t *groupSize);
|
||||
static void setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress, const uint32_t *lws);
|
||||
|
@ -526,6 +526,14 @@ bool EncodeDispatchKernel<Family>::inlineDataProgrammingRequired(const KernelDes
|
||||
// No-op in this implementation: the body is empty, so neither walkerCmd nor hwInfo is used
// and the walker command is left unmodified.
template <typename Family>
|
||||
void EncodeDispatchKernel<Family>::adjustTimestampPacket(WALKER_TYPE &walkerCmd, const HardwareInfo &hwInfo) {}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeIndirectParams<Family>::encode(CommandContainer &container, void *crossThreadDataGpuVa, DispatchKernelEncoderI *dispatchInterface) {
|
||||
const auto &kernelDescriptor = dispatchInterface->getKernelDescriptor();
|
||||
setGroupCountIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups, crossThreadDataGpuVa);
|
||||
setGlobalWorkSizeIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize, crossThreadDataGpuVa, dispatchInterface->getGroupSize());
|
||||
setWorkDimIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.workDim, crossThreadDataGpuVa, dispatchInterface->getGroupSize());
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeIndirectParams<Family>::setGroupCountIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress) {
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
|
@ -153,10 +153,8 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
|
||||
dispatchInterface->getCrossThreadData(), sizeCrossThreadData);
|
||||
|
||||
if (isIndirect) {
|
||||
void *gpuPtr = reinterpret_cast<void *>(heapIndirect->getHeapGpuBase() + heapIndirect->getUsed() - sizeThreadData);
|
||||
EncodeIndirectParams<Family>::setGroupCountIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups, gpuPtr);
|
||||
EncodeIndirectParams<Family>::setGlobalWorkSizeIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize, gpuPtr, dispatchInterface->getGroupSize());
|
||||
EncodeIndirectParams<Family>::setWorkDimIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.workDim, gpuPtr, dispatchInterface->getGroupSize());
|
||||
void *gpuPtr = reinterpret_cast<void *>(heapIndirect->getGraphicsAllocation()->getGpuAddress() + heapIndirect->getUsed() - sizeThreadData);
|
||||
EncodeIndirectParams<Family>::encode(container, gpuPtr, dispatchInterface);
|
||||
}
|
||||
|
||||
ptr = ptrOffset(ptr, sizeCrossThreadData);
|
||||
|
@ -152,14 +152,15 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
|
||||
const uint32_t inlineDataSize = sizeof(INLINE_DATA);
|
||||
auto crossThreadData = dispatchInterface->getCrossThreadData();
|
||||
|
||||
uint32_t inlineDataProgrammingOffset = 0u;
|
||||
|
||||
if (inlineDataProgramming) {
|
||||
auto copySize = std::min(inlineDataSize, sizeCrossThreadData);
|
||||
inlineDataProgrammingOffset = std::min(inlineDataSize, sizeCrossThreadData);
|
||||
auto dest = reinterpret_cast<char *>(walkerCmd.getInlineDataPointer());
|
||||
memcpy_s(dest, copySize, crossThreadData, copySize);
|
||||
auto offset = std::min(inlineDataSize, sizeCrossThreadData);
|
||||
sizeCrossThreadData -= copySize;
|
||||
crossThreadData = ptrOffset(crossThreadData, offset);
|
||||
inlineDataProgramming = copySize != 0;
|
||||
memcpy_s(dest, inlineDataProgrammingOffset, crossThreadData, inlineDataProgrammingOffset);
|
||||
sizeCrossThreadData -= inlineDataProgrammingOffset;
|
||||
crossThreadData = ptrOffset(crossThreadData, inlineDataProgrammingOffset);
|
||||
inlineDataProgramming = inlineDataProgrammingOffset != 0;
|
||||
}
|
||||
|
||||
uint32_t sizeThreadData = sizePerThreadDataForWholeGroup + sizeCrossThreadData;
|
||||
@ -186,10 +187,8 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
|
||||
crossThreadData, sizeCrossThreadData);
|
||||
}
|
||||
if (isIndirect) {
|
||||
void *gpuPtr = reinterpret_cast<void *>(heap->getHeapGpuBase() + heap->getUsed() - sizeThreadData);
|
||||
EncodeIndirectParams<Family>::setGroupCountIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups, gpuPtr);
|
||||
EncodeIndirectParams<Family>::setGlobalWorkSizeIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize, gpuPtr, dispatchInterface->getGroupSize());
|
||||
EncodeIndirectParams<Family>::setWorkDimIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.workDim, gpuPtr, dispatchInterface->getGroupSize());
|
||||
void *gpuPtr = reinterpret_cast<void *>(heap->getGraphicsAllocation()->getGpuAddress() + static_cast<uint64_t>(heap->getUsed() - sizeThreadData - inlineDataProgrammingOffset));
|
||||
EncodeIndirectParams<Family>::encode(container, gpuPtr, dispatchInterface);
|
||||
}
|
||||
|
||||
auto perThreadDataPtr = dispatchInterface->getPerThreadData();
|
||||
|
Reference in New Issue
Block a user