Correct cross thread data GPU address in indirect dispatch programming

Related-To: NEO-5081
Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
This commit is contained in:
Mateusz Jablonski
2021-09-24 12:20:21 +00:00
committed by Compute-Runtime-Automation
parent 5e201f40be
commit b891ec2588
5 changed files with 269 additions and 17 deletions

View File

@ -48,8 +48,18 @@ HWTEST_F(CommandListDualStorage, givenIndirectDispatchWithSharedDualStorageMemor
using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;
Mock<::L0::Kernel> kernel;
kernel.descriptor.payloadMappings.dispatchTraits.numWorkGroups[0] = 2;
kernel.descriptor.payloadMappings.dispatchTraits.globalWorkSize[0] = 2;
uint32_t globalWorkSizeXOffset = 0x20u;
uint32_t globalWorkSizeYOffset = 0x24u;
uint32_t globalWorkSizeZOffset = 0x28u;
uint32_t numWorkGroupXOffset = 0x30u;
uint32_t numWorkGroupYOffset = 0x34u;
uint32_t numWorkGroupZOffset = 0x38u;
kernel.descriptor.payloadMappings.dispatchTraits.globalWorkSize[0] = globalWorkSizeXOffset;
kernel.descriptor.payloadMappings.dispatchTraits.globalWorkSize[1] = globalWorkSizeYOffset;
kernel.descriptor.payloadMappings.dispatchTraits.globalWorkSize[2] = globalWorkSizeZOffset;
kernel.descriptor.payloadMappings.dispatchTraits.numWorkGroups[0] = numWorkGroupXOffset;
kernel.descriptor.payloadMappings.dispatchTraits.numWorkGroups[1] = numWorkGroupYOffset;
kernel.descriptor.payloadMappings.dispatchTraits.numWorkGroups[2] = numWorkGroupZOffset;
ze_result_t returnValue;
std::unique_ptr<L0::CommandList> commandList(L0::CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, returnValue));
@ -123,6 +133,36 @@ HWTEST_F(CommandListDualStorage, givenIndirectDispatchWithSharedDualStorageMemor
} while (itor != cmdList.end() && cmd2->getRegisterAddress() != GPUGPU_DISPATCHDIMX);
EXPECT_NE(cmdList.end(), itor);
auto groupCountStoreRegisterMemCmd = FamilyType::cmdInitStoreRegisterMem;
groupCountStoreRegisterMemCmd.setRegisterAddress(GPUGPU_DISPATCHDIMX);
groupCountStoreRegisterMemCmd.setMemoryAddress(commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress() + numWorkGroupXOffset);
EXPECT_EQ(cmd2->getRegisterAddress(), groupCountStoreRegisterMemCmd.getRegisterAddress());
EXPECT_EQ(cmd2->getMemoryAddress(), groupCountStoreRegisterMemCmd.getMemoryAddress());
itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
EXPECT_NE(cmdList.end(), itor);
cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
groupCountStoreRegisterMemCmd.setRegisterAddress(GPUGPU_DISPATCHDIMY);
groupCountStoreRegisterMemCmd.setMemoryAddress(commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress() + numWorkGroupYOffset);
EXPECT_EQ(cmd2->getRegisterAddress(), groupCountStoreRegisterMemCmd.getRegisterAddress());
EXPECT_EQ(cmd2->getMemoryAddress(), groupCountStoreRegisterMemCmd.getMemoryAddress());
itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
EXPECT_NE(cmdList.end(), itor);
cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
groupCountStoreRegisterMemCmd.setRegisterAddress(GPUGPU_DISPATCHDIMZ);
groupCountStoreRegisterMemCmd.setMemoryAddress(commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress() + numWorkGroupZOffset);
EXPECT_EQ(cmd2->getRegisterAddress(), groupCountStoreRegisterMemCmd.getRegisterAddress());
EXPECT_EQ(cmd2->getMemoryAddress(), groupCountStoreRegisterMemCmd.getMemoryAddress());
auto workSizeStoreRegisterMemCmd = FamilyType::cmdInitStoreRegisterMem;
workSizeStoreRegisterMemCmd.setRegisterAddress(CS_GPR_R1);
// Find workgroup size cmds
itor = find<MI_LOAD_REGISTER_REG *>(++itor, cmdList.end());
EXPECT_NE(cmdList.end(), itor);
@ -131,7 +171,212 @@ HWTEST_F(CommandListDualStorage, givenIndirectDispatchWithSharedDualStorageMemor
itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
EXPECT_NE(cmdList.end(), itor);
cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
EXPECT_EQ(CS_GPR_R1, cmd2->getRegisterAddress());
workSizeStoreRegisterMemCmd.setMemoryAddress(commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress() + globalWorkSizeXOffset);
EXPECT_EQ(cmd2->getRegisterAddress(), workSizeStoreRegisterMemCmd.getRegisterAddress());
EXPECT_EQ(cmd2->getMemoryAddress(), workSizeStoreRegisterMemCmd.getMemoryAddress());
itor = find<MI_LOAD_REGISTER_REG *>(++itor, cmdList.end());
EXPECT_NE(cmdList.end(), itor);
itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
EXPECT_NE(cmdList.end(), itor);
itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
EXPECT_NE(cmdList.end(), itor);
cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
workSizeStoreRegisterMemCmd.setMemoryAddress(commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress() + globalWorkSizeYOffset);
EXPECT_EQ(cmd2->getRegisterAddress(), workSizeStoreRegisterMemCmd.getRegisterAddress());
EXPECT_EQ(cmd2->getMemoryAddress(), workSizeStoreRegisterMemCmd.getMemoryAddress());
EXPECT_EQ(cmd2->getRegisterAddress(), workSizeStoreRegisterMemCmd.getRegisterAddress());
EXPECT_EQ(cmd2->getMemoryAddress(), workSizeStoreRegisterMemCmd.getMemoryAddress());
itor = find<MI_LOAD_REGISTER_REG *>(++itor, cmdList.end());
EXPECT_NE(cmdList.end(), itor);
itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
EXPECT_NE(cmdList.end(), itor);
itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
EXPECT_NE(cmdList.end(), itor);
cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
workSizeStoreRegisterMemCmd.setMemoryAddress(commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress() + globalWorkSizeZOffset);
EXPECT_EQ(cmd2->getRegisterAddress(), workSizeStoreRegisterMemCmd.getRegisterAddress());
EXPECT_EQ(cmd2->getMemoryAddress(), workSizeStoreRegisterMemCmd.getMemoryAddress());
context->freeMem(alloc);
}
HWCMDTEST_F(IGFX_XE_HP_CORE, CommandListDualStorage, givenIndirectDispatchWithSharedDualStorageMemoryAndInlineDataWhenAppendingThenWorkGroupCountAndGlobalWorkSizeIsSetInCrossThreadData) {
    using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM;
    using MI_LOAD_REGISTER_MEM = typename FamilyType::MI_LOAD_REGISTER_MEM;
    using MI_LOAD_REGISTER_REG = typename FamilyType::MI_LOAD_REGISTER_REG;
    using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;
    using INLINE_DATA = typename FamilyType::INLINE_DATA;

    Mock<::L0::Kernel> kernel;
    kernel.crossThreadDataSize = 0x60u;
    kernel.descriptor.kernelAttributes.flags.passInlineData = true;

    // Cross thread data offsets of the indirect dispatch parameters. With
    // inline data enabled, the first sizeof(INLINE_DATA) bytes of cross thread
    // data travel in the walker itself, so every expected patch address below
    // is shifted back by sizeof(INLINE_DATA).
    uint32_t globalWorkSizeXOffset = 0x40u;
    uint32_t globalWorkSizeYOffset = 0x44u;
    uint32_t globalWorkSizeZOffset = 0x48u;
    uint32_t numWorkGroupXOffset = 0x30u;
    uint32_t numWorkGroupYOffset = 0x34u;
    uint32_t numWorkGroupZOffset = 0x38u;
    kernel.descriptor.payloadMappings.dispatchTraits.globalWorkSize[0] = globalWorkSizeXOffset;
    kernel.descriptor.payloadMappings.dispatchTraits.globalWorkSize[1] = globalWorkSizeYOffset;
    kernel.descriptor.payloadMappings.dispatchTraits.globalWorkSize[2] = globalWorkSizeZOffset;
    kernel.descriptor.payloadMappings.dispatchTraits.numWorkGroups[0] = numWorkGroupXOffset;
    kernel.descriptor.payloadMappings.dispatchTraits.numWorkGroups[1] = numWorkGroupYOffset;
    kernel.descriptor.payloadMappings.dispatchTraits.numWorkGroups[2] = numWorkGroupZOffset;

    ze_result_t returnValue;
    std::unique_ptr<L0::CommandList> commandList(L0::CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, returnValue));

    // Shared (dual-storage) allocation that holds the indirect group counts.
    void *alloc = nullptr;
    ze_device_mem_alloc_desc_t deviceDesc = {};
    ze_host_mem_alloc_desc_t hostDesc = {};
    auto result = context->allocSharedMem(device->toHandle(), &deviceDesc, &hostDesc, 16384u, 4096u, &alloc);
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);

    ze_group_count_t *pThreadGroupDimensions = static_cast<ze_group_count_t *>(ptrOffset(alloc, sizeof(ze_group_count_t)));
    pThreadGroupDimensions->groupCountX = 3;
    pThreadGroupDimensions->groupCountY = 4;
    pThreadGroupDimensions->groupCountZ = 5;

    result = commandList->appendLaunchKernelIndirect(kernel.toHandle(),
                                                     pThreadGroupDimensions,
                                                     nullptr, 0, nullptr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);

    auto allocData = device->getDriverHandle()->getSvmAllocsManager()->getSVMAlloc(pThreadGroupDimensions);
    // Fix: guard against a failed SVM lookup before dereferencing the result.
    ASSERT_NE(nullptr, allocData);
    ASSERT_NE(nullptr, allocData->cpuAllocation);
    auto gpuAllocation = allocData->gpuAllocations.getGraphicsAllocation(device->getNEODevice()->getRootDeviceIndex());
    ASSERT_NE(nullptr, gpuAllocation);

    GenCmdList cmdList;
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
        cmdList, ptrOffset(commandList->commandContainer.getCommandStream()->getCpuBase(), 0), commandList->commandContainer.getCommandStream()->getUsed()));

    uint32_t regAddress = 0;
    uint64_t gpuAddress = 0;
    auto expectedXAddress = reinterpret_cast<uint64_t>(ptrOffset(pThreadGroupDimensions, offsetof(ze_group_count_t, groupCountX)));
    auto expectedYAddress = reinterpret_cast<uint64_t>(ptrOffset(pThreadGroupDimensions, offsetof(ze_group_count_t, groupCountY)));
    auto expectedZAddress = reinterpret_cast<uint64_t>(ptrOffset(pThreadGroupDimensions, offsetof(ze_group_count_t, groupCountZ)));

    // The group counts must be loaded from the user's indirect buffer into the
    // GPGPU dispatch dimension registers. ASSERT (not EXPECT) before every
    // iterator dereference so a miss fails cleanly instead of crashing.
    auto itor = find<MI_LOAD_REGISTER_MEM *>(cmdList.begin(), cmdList.end());
    ASSERT_NE(cmdList.end(), itor);
    auto cmd = genCmdCast<MI_LOAD_REGISTER_MEM *>(*itor);
    regAddress = cmd->getRegisterAddress();
    gpuAddress = cmd->getMemoryAddress();
    EXPECT_EQ(GPUGPU_DISPATCHDIMX, regAddress);
    EXPECT_EQ(expectedXAddress, gpuAddress);

    itor = find<MI_LOAD_REGISTER_MEM *>(++itor, cmdList.end());
    ASSERT_NE(cmdList.end(), itor);
    cmd = genCmdCast<MI_LOAD_REGISTER_MEM *>(*itor);
    regAddress = cmd->getRegisterAddress();
    gpuAddress = cmd->getMemoryAddress();
    EXPECT_EQ(GPUGPU_DISPATCHDIMY, regAddress);
    EXPECT_EQ(expectedYAddress, gpuAddress);

    itor = find<MI_LOAD_REGISTER_MEM *>(++itor, cmdList.end());
    ASSERT_NE(cmdList.end(), itor);
    cmd = genCmdCast<MI_LOAD_REGISTER_MEM *>(*itor);
    regAddress = cmd->getRegisterAddress();
    gpuAddress = cmd->getMemoryAddress();
    EXPECT_EQ(GPUGPU_DISPATCHDIMZ, regAddress);
    EXPECT_EQ(expectedZAddress, gpuAddress);

    // Cross thread data is placed in the indirect object heap; compute its GPU
    // base once instead of re-deriving it for every expectation.
    auto heapGpuBase = commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress();

    // Find the store of GPUGPU_DISPATCHDIMX back into cross thread data.
    // Fix: break before casting when the iterator hits the end, so we never
    // dereference the end iterator.
    MI_STORE_REGISTER_MEM *cmd2 = nullptr;
    do {
        itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
        if (itor == cmdList.end()) {
            break;
        }
        cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
    } while (cmd2->getRegisterAddress() != GPUGPU_DISPATCHDIMX);
    ASSERT_NE(cmdList.end(), itor);

    auto groupCountStoreRegisterMemCmd = FamilyType::cmdInitStoreRegisterMem;
    groupCountStoreRegisterMemCmd.setRegisterAddress(GPUGPU_DISPATCHDIMX);
    groupCountStoreRegisterMemCmd.setMemoryAddress(heapGpuBase + numWorkGroupXOffset - sizeof(INLINE_DATA));
    EXPECT_EQ(cmd2->getRegisterAddress(), groupCountStoreRegisterMemCmd.getRegisterAddress());
    EXPECT_EQ(cmd2->getMemoryAddress(), groupCountStoreRegisterMemCmd.getMemoryAddress());

    itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
    ASSERT_NE(cmdList.end(), itor);
    cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
    groupCountStoreRegisterMemCmd.setRegisterAddress(GPUGPU_DISPATCHDIMY);
    groupCountStoreRegisterMemCmd.setMemoryAddress(heapGpuBase + numWorkGroupYOffset - sizeof(INLINE_DATA));
    EXPECT_EQ(cmd2->getRegisterAddress(), groupCountStoreRegisterMemCmd.getRegisterAddress());
    EXPECT_EQ(cmd2->getMemoryAddress(), groupCountStoreRegisterMemCmd.getMemoryAddress());

    itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
    ASSERT_NE(cmdList.end(), itor);
    cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
    groupCountStoreRegisterMemCmd.setRegisterAddress(GPUGPU_DISPATCHDIMZ);
    groupCountStoreRegisterMemCmd.setMemoryAddress(heapGpuBase + numWorkGroupZOffset - sizeof(INLINE_DATA));
    EXPECT_EQ(cmd2->getRegisterAddress(), groupCountStoreRegisterMemCmd.getRegisterAddress());
    EXPECT_EQ(cmd2->getMemoryAddress(), groupCountStoreRegisterMemCmd.getMemoryAddress());

    // Each global work size store is emitted as a LOAD_REGISTER_REG +
    // LOAD_REGISTER_IMM + STORE_REGISTER_MEM sequence writing CS_GPR_R1.
    auto workSizeStoreRegisterMemCmd = FamilyType::cmdInitStoreRegisterMem;
    workSizeStoreRegisterMemCmd.setRegisterAddress(CS_GPR_R1);

    itor = find<MI_LOAD_REGISTER_REG *>(++itor, cmdList.end());
    ASSERT_NE(cmdList.end(), itor);
    itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
    ASSERT_NE(cmdList.end(), itor);
    itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
    ASSERT_NE(cmdList.end(), itor);
    cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
    workSizeStoreRegisterMemCmd.setMemoryAddress(heapGpuBase + globalWorkSizeXOffset - sizeof(INLINE_DATA));
    EXPECT_EQ(cmd2->getRegisterAddress(), workSizeStoreRegisterMemCmd.getRegisterAddress());
    EXPECT_EQ(cmd2->getMemoryAddress(), workSizeStoreRegisterMemCmd.getMemoryAddress());

    itor = find<MI_LOAD_REGISTER_REG *>(++itor, cmdList.end());
    ASSERT_NE(cmdList.end(), itor);
    itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
    ASSERT_NE(cmdList.end(), itor);
    itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
    ASSERT_NE(cmdList.end(), itor);
    cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
    workSizeStoreRegisterMemCmd.setMemoryAddress(heapGpuBase + globalWorkSizeYOffset - sizeof(INLINE_DATA));
    // Fix: the original asserted this pair twice (copy-paste); assert once.
    EXPECT_EQ(cmd2->getRegisterAddress(), workSizeStoreRegisterMemCmd.getRegisterAddress());
    EXPECT_EQ(cmd2->getMemoryAddress(), workSizeStoreRegisterMemCmd.getMemoryAddress());

    itor = find<MI_LOAD_REGISTER_REG *>(++itor, cmdList.end());
    ASSERT_NE(cmdList.end(), itor);
    itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
    ASSERT_NE(cmdList.end(), itor);
    itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
    ASSERT_NE(cmdList.end(), itor);
    cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
    workSizeStoreRegisterMemCmd.setMemoryAddress(heapGpuBase + globalWorkSizeZOffset - sizeof(INLINE_DATA));
    EXPECT_EQ(cmd2->getRegisterAddress(), workSizeStoreRegisterMemCmd.getRegisterAddress());
    EXPECT_EQ(cmd2->getMemoryAddress(), workSizeStoreRegisterMemCmd.getMemoryAddress());

    context->freeMem(alloc);
}

View File

@ -173,6 +173,8 @@ struct EncodeIndirectParams {
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
using MI_MATH = typename GfxFamily::MI_MATH;
using MI_MATH_ALU_INST_INLINE = typename GfxFamily::MI_MATH_ALU_INST_INLINE;
static void encode(CommandContainer &container, void *crossThreadDataGpuVa, DispatchKernelEncoderI *dispatchInterface);
static void setGroupCountIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress);
static void setWorkDimIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offset, void *crossThreadAddress, const uint32_t *groupSize);
static void setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress, const uint32_t *lws);

View File

@ -526,6 +526,14 @@ bool EncodeDispatchKernel<Family>::inlineDataProgrammingRequired(const KernelDes
// Intentionally a no-op for this family: the generic walker command needs no
// timestamp-packet adjustment here. NOTE(review): presumably other families
// override/specialize this with real adjustments — confirm before relying on it.
template <typename Family>
void EncodeDispatchKernel<Family>::adjustTimestampPacket(WALKER_TYPE &walkerCmd, const HardwareInfo &hwInfo) {}
template <typename Family>
void EncodeIndirectParams<Family>::encode(CommandContainer &container, void *crossThreadDataGpuVa, DispatchKernelEncoderI *dispatchInterface) {
    // Program every indirect dispatch parameter (group count, global work
    // size, work dimension) against the cross thread data GPU address, using
    // the payload mappings published by the kernel descriptor.
    const auto &dispatchTraits = dispatchInterface->getKernelDescriptor().payloadMappings.dispatchTraits;
    const auto groupSize = dispatchInterface->getGroupSize();

    setGroupCountIndirect(container, dispatchTraits.numWorkGroups, crossThreadDataGpuVa);
    setGlobalWorkSizeIndirect(container, dispatchTraits.globalWorkSize, crossThreadDataGpuVa, groupSize);
    setWorkDimIndirect(container, dispatchTraits.workDim, crossThreadDataGpuVa, groupSize);
}
template <typename Family>
void EncodeIndirectParams<Family>::setGroupCountIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress) {
for (int i = 0; i < 3; ++i) {

View File

@ -153,10 +153,8 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
dispatchInterface->getCrossThreadData(), sizeCrossThreadData);
if (isIndirect) {
void *gpuPtr = reinterpret_cast<void *>(heapIndirect->getHeapGpuBase() + heapIndirect->getUsed() - sizeThreadData);
EncodeIndirectParams<Family>::setGroupCountIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups, gpuPtr);
EncodeIndirectParams<Family>::setGlobalWorkSizeIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize, gpuPtr, dispatchInterface->getGroupSize());
EncodeIndirectParams<Family>::setWorkDimIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.workDim, gpuPtr, dispatchInterface->getGroupSize());
void *gpuPtr = reinterpret_cast<void *>(heapIndirect->getGraphicsAllocation()->getGpuAddress() + heapIndirect->getUsed() - sizeThreadData);
EncodeIndirectParams<Family>::encode(container, gpuPtr, dispatchInterface);
}
ptr = ptrOffset(ptr, sizeCrossThreadData);

View File

@ -152,14 +152,15 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
const uint32_t inlineDataSize = sizeof(INLINE_DATA);
auto crossThreadData = dispatchInterface->getCrossThreadData();
uint32_t inlineDataProgrammingOffset = 0u;
if (inlineDataProgramming) {
auto copySize = std::min(inlineDataSize, sizeCrossThreadData);
inlineDataProgrammingOffset = std::min(inlineDataSize, sizeCrossThreadData);
auto dest = reinterpret_cast<char *>(walkerCmd.getInlineDataPointer());
memcpy_s(dest, copySize, crossThreadData, copySize);
auto offset = std::min(inlineDataSize, sizeCrossThreadData);
sizeCrossThreadData -= copySize;
crossThreadData = ptrOffset(crossThreadData, offset);
inlineDataProgramming = copySize != 0;
memcpy_s(dest, inlineDataProgrammingOffset, crossThreadData, inlineDataProgrammingOffset);
sizeCrossThreadData -= inlineDataProgrammingOffset;
crossThreadData = ptrOffset(crossThreadData, inlineDataProgrammingOffset);
inlineDataProgramming = inlineDataProgrammingOffset != 0;
}
uint32_t sizeThreadData = sizePerThreadDataForWholeGroup + sizeCrossThreadData;
@ -186,10 +187,8 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
crossThreadData, sizeCrossThreadData);
}
if (isIndirect) {
void *gpuPtr = reinterpret_cast<void *>(heap->getHeapGpuBase() + heap->getUsed() - sizeThreadData);
EncodeIndirectParams<Family>::setGroupCountIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups, gpuPtr);
EncodeIndirectParams<Family>::setGlobalWorkSizeIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize, gpuPtr, dispatchInterface->getGroupSize());
EncodeIndirectParams<Family>::setWorkDimIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.workDim, gpuPtr, dispatchInterface->getGroupSize());
void *gpuPtr = reinterpret_cast<void *>(heap->getGraphicsAllocation()->getGpuAddress() + static_cast<uint64_t>(heap->getUsed() - sizeThreadData - inlineDataProgrammingOffset));
EncodeIndirectParams<Family>::encode(container, gpuPtr, dispatchInterface);
}
auto perThreadDataPtr = dispatchInterface->getPerThreadData();