From b891ec258881b204cf522dc9002e156a98f3d692 Mon Sep 17 00:00:00 2001
From: Mateusz Jablonski
Date: Fri, 24 Sep 2021 12:20:21 +0000
Subject: [PATCH] Correct cross thread data GPU address in indirect dispatch
 programming

Related-To: NEO-5081
Signed-off-by: Mateusz Jablonski
---
 .../test_cmdlist_append_launch_kernel_2.cpp   | 251 +++++++++++++++++-
 .../command_container/command_encoder.h       |   2 +
 .../command_container/command_encoder.inl     |   8 +
 .../command_encoder_bdw_and_later.inl         |   6 +-
 .../command_encoder_xehp_and_later.inl        |  19 +-
 5 files changed, 269 insertions(+), 17 deletions(-)

diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp
index 1f80d4457f..1c5ebc7599 100644
--- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp
+++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp
@@ -48,8 +48,18 @@ HWTEST_F(CommandListDualStorage, givenIndirectDispatchWithSharedDualStorageMemor
     using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;
 
     Mock<::L0::Kernel> kernel;
-    kernel.descriptor.payloadMappings.dispatchTraits.numWorkGroups[0] = 2;
-    kernel.descriptor.payloadMappings.dispatchTraits.globalWorkSize[0] = 2;
+    uint32_t globalWorkSizeXOffset = 0x20u;
+    uint32_t globalWorkSizeYOffset = 0x24u;
+    uint32_t globalWorkSizeZOffset = 0x28u;
+    uint32_t numWorkGroupXOffset = 0x30u;
+    uint32_t numWorkGroupYOffset = 0x34u;
+    uint32_t numWorkGroupZOffset = 0x38u;
+    kernel.descriptor.payloadMappings.dispatchTraits.globalWorkSize[0] = globalWorkSizeXOffset;
+    kernel.descriptor.payloadMappings.dispatchTraits.globalWorkSize[1] = globalWorkSizeYOffset;
+    kernel.descriptor.payloadMappings.dispatchTraits.globalWorkSize[2] = globalWorkSizeZOffset;
+    kernel.descriptor.payloadMappings.dispatchTraits.numWorkGroups[0] = numWorkGroupXOffset;
+    kernel.descriptor.payloadMappings.dispatchTraits.numWorkGroups[1] = numWorkGroupYOffset;
+    kernel.descriptor.payloadMappings.dispatchTraits.numWorkGroups[2] = numWorkGroupZOffset;
     ze_result_t returnValue;
     std::unique_ptr<L0::CommandList> commandList(L0::CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, returnValue));
 
@@ -123,6 +133,36 @@ HWTEST_F(CommandListDualStorage, givenIndirectDispatchWithSharedDualStorageMemor
     } while (itor != cmdList.end() && cmd2->getRegisterAddress() != GPUGPU_DISPATCHDIMX);
     EXPECT_NE(cmdList.end(), itor);
 
+    auto groupCountStoreRegisterMemCmd = FamilyType::cmdInitStoreRegisterMem;
+    groupCountStoreRegisterMemCmd.setRegisterAddress(GPUGPU_DISPATCHDIMX);
+    groupCountStoreRegisterMemCmd.setMemoryAddress(commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress() + numWorkGroupXOffset);
+
+    EXPECT_EQ(cmd2->getRegisterAddress(), groupCountStoreRegisterMemCmd.getRegisterAddress());
+    EXPECT_EQ(cmd2->getMemoryAddress(), groupCountStoreRegisterMemCmd.getMemoryAddress());
+
+    itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
+    EXPECT_NE(cmdList.end(), itor);
+    cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
+
+    groupCountStoreRegisterMemCmd.setRegisterAddress(GPUGPU_DISPATCHDIMY);
+    groupCountStoreRegisterMemCmd.setMemoryAddress(commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress() + numWorkGroupYOffset);
+
+    EXPECT_EQ(cmd2->getRegisterAddress(), groupCountStoreRegisterMemCmd.getRegisterAddress());
+    EXPECT_EQ(cmd2->getMemoryAddress(), groupCountStoreRegisterMemCmd.getMemoryAddress());
+
+    itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
+    EXPECT_NE(cmdList.end(), itor);
+    cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
+
+    groupCountStoreRegisterMemCmd.setRegisterAddress(GPUGPU_DISPATCHDIMZ);
+    groupCountStoreRegisterMemCmd.setMemoryAddress(commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress() + numWorkGroupZOffset);
+
+    EXPECT_EQ(cmd2->getRegisterAddress(), groupCountStoreRegisterMemCmd.getRegisterAddress());
+    EXPECT_EQ(cmd2->getMemoryAddress(), groupCountStoreRegisterMemCmd.getMemoryAddress());
+
+    auto workSizeStoreRegisterMemCmd = FamilyType::cmdInitStoreRegisterMem;
+    workSizeStoreRegisterMemCmd.setRegisterAddress(CS_GPR_R1);
+
     // Find workgroup size cmds
     itor = find<MI_LOAD_REGISTER_REG *>(++itor, cmdList.end());
     EXPECT_NE(cmdList.end(), itor);
@@ -131,7 +171,212 @@ HWTEST_F(CommandListDualStorage, givenIndirectDispatchWithSharedDualStorageMemor
     itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
     EXPECT_NE(cmdList.end(), itor);
     cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
-    EXPECT_EQ(CS_GPR_R1, cmd2->getRegisterAddress());
+
+    workSizeStoreRegisterMemCmd.setMemoryAddress(commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress() + globalWorkSizeXOffset);
+
+    EXPECT_EQ(cmd2->getRegisterAddress(), workSizeStoreRegisterMemCmd.getRegisterAddress());
+    EXPECT_EQ(cmd2->getMemoryAddress(), workSizeStoreRegisterMemCmd.getMemoryAddress());
+
+    itor = find<MI_LOAD_REGISTER_REG *>(++itor, cmdList.end());
+    EXPECT_NE(cmdList.end(), itor);
+    itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
+    EXPECT_NE(cmdList.end(), itor);
+    itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
+    EXPECT_NE(cmdList.end(), itor);
+    cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
+
+    workSizeStoreRegisterMemCmd.setMemoryAddress(commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress() + globalWorkSizeYOffset);
+
+    EXPECT_EQ(cmd2->getRegisterAddress(), workSizeStoreRegisterMemCmd.getRegisterAddress());
+    EXPECT_EQ(cmd2->getMemoryAddress(), workSizeStoreRegisterMemCmd.getMemoryAddress());
+
+    EXPECT_EQ(cmd2->getRegisterAddress(), workSizeStoreRegisterMemCmd.getRegisterAddress());
+    EXPECT_EQ(cmd2->getMemoryAddress(), workSizeStoreRegisterMemCmd.getMemoryAddress());
+
+    itor = find<MI_LOAD_REGISTER_REG *>(++itor, cmdList.end());
+    EXPECT_NE(cmdList.end(), itor);
+    itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
+    EXPECT_NE(cmdList.end(), itor);
+    itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
+    EXPECT_NE(cmdList.end(), itor);
+    cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
+
+    workSizeStoreRegisterMemCmd.setMemoryAddress(commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress() + globalWorkSizeZOffset);
+
+    EXPECT_EQ(cmd2->getRegisterAddress(), workSizeStoreRegisterMemCmd.getRegisterAddress());
+    EXPECT_EQ(cmd2->getMemoryAddress(), workSizeStoreRegisterMemCmd.getMemoryAddress());
+
+    context->freeMem(alloc);
+}
+
+HWCMDTEST_F(IGFX_XE_HP_CORE, CommandListDualStorage, givenIndirectDispatchWithSharedDualStorageMemoryAndInlineDataWhenAppendingThenWorkGroupCountAndGlobalWorkSizeIsSetInCrossThreadData) {
+    using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM;
+    using MI_LOAD_REGISTER_MEM = typename FamilyType::MI_LOAD_REGISTER_MEM;
+    using MI_LOAD_REGISTER_REG = typename FamilyType::MI_LOAD_REGISTER_REG;
+    using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;
+    using INLINE_DATA = typename FamilyType::INLINE_DATA;
+
+    Mock<::L0::Kernel> kernel;
+    kernel.crossThreadDataSize = 0x60u;
+    kernel.descriptor.kernelAttributes.flags.passInlineData = true;
+
+    uint32_t globalWorkSizeXOffset = 0x40u;
+    uint32_t globalWorkSizeYOffset = 0x44u;
+    uint32_t globalWorkSizeZOffset = 0x48u;
+    uint32_t numWorkGroupXOffset = 0x30u;
+    uint32_t numWorkGroupYOffset = 0x34u;
+    uint32_t numWorkGroupZOffset = 0x38u;
+    kernel.descriptor.payloadMappings.dispatchTraits.globalWorkSize[0] = globalWorkSizeXOffset;
+    kernel.descriptor.payloadMappings.dispatchTraits.globalWorkSize[1] = globalWorkSizeYOffset;
+    kernel.descriptor.payloadMappings.dispatchTraits.globalWorkSize[2] = globalWorkSizeZOffset;
+    kernel.descriptor.payloadMappings.dispatchTraits.numWorkGroups[0] = numWorkGroupXOffset;
+    kernel.descriptor.payloadMappings.dispatchTraits.numWorkGroups[1] = numWorkGroupYOffset;
+    kernel.descriptor.payloadMappings.dispatchTraits.numWorkGroups[2] = numWorkGroupZOffset;
+    ze_result_t returnValue;
+    std::unique_ptr<L0::CommandList> commandList(L0::CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, returnValue));
+
+    void *alloc = nullptr;
+    ze_device_mem_alloc_desc_t deviceDesc = {};
+    ze_host_mem_alloc_desc_t hostDesc = {};
+    auto result = context->allocSharedMem(device->toHandle(), &deviceDesc, &hostDesc, 16384u, 4096u, &alloc);
+    ASSERT_EQ(ZE_RESULT_SUCCESS, result);
+
+    ze_group_count_t *pThreadGroupDimensions = static_cast<ze_group_count_t *>(ptrOffset(alloc, sizeof(ze_group_count_t)));
+
+    pThreadGroupDimensions->groupCountX = 3;
+    pThreadGroupDimensions->groupCountY = 4;
+    pThreadGroupDimensions->groupCountZ = 5;
+
+    result = commandList->appendLaunchKernelIndirect(kernel.toHandle(),
+                                                     pThreadGroupDimensions,
+                                                     nullptr, 0, nullptr);
+    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
+
+    auto allocData = device->getDriverHandle()->getSvmAllocsManager()->getSVMAlloc(pThreadGroupDimensions);
+    ASSERT_NE(nullptr, allocData->cpuAllocation);
+    auto gpuAllocation = allocData->gpuAllocations.getGraphicsAllocation(device->getNEODevice()->getRootDeviceIndex());
+    ASSERT_NE(nullptr, gpuAllocation);
+
+    GenCmdList cmdList;
+    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
+        cmdList, ptrOffset(commandList->commandContainer.getCommandStream()->getCpuBase(), 0), commandList->commandContainer.getCommandStream()->getUsed()));
+
+    uint32_t regAddress = 0;
+    uint64_t gpuAddress = 0;
+    auto expectedXAddress = reinterpret_cast<uint64_t>(ptrOffset(pThreadGroupDimensions, offsetof(ze_group_count_t, groupCountX)));
+    auto expectedYAddress = reinterpret_cast<uint64_t>(ptrOffset(pThreadGroupDimensions, offsetof(ze_group_count_t, groupCountY)));
+    auto expectedZAddress = reinterpret_cast<uint64_t>(ptrOffset(pThreadGroupDimensions, offsetof(ze_group_count_t, groupCountZ)));
+
+    auto itor = find<MI_LOAD_REGISTER_MEM *>(cmdList.begin(), cmdList.end());
+    EXPECT_NE(cmdList.end(), itor);
+
+    auto cmd = genCmdCast<MI_LOAD_REGISTER_MEM *>(*itor);
+    regAddress = cmd->getRegisterAddress();
+    gpuAddress = cmd->getMemoryAddress();
+
+    EXPECT_EQ(GPUGPU_DISPATCHDIMX, regAddress);
+    EXPECT_EQ(expectedXAddress, gpuAddress);
+
+    itor = find<MI_LOAD_REGISTER_MEM *>(++itor, cmdList.end());
+    EXPECT_NE(cmdList.end(), itor);
+
+    cmd = genCmdCast<MI_LOAD_REGISTER_MEM *>(*itor);
+    regAddress = cmd->getRegisterAddress();
+    gpuAddress = cmd->getMemoryAddress();
+
+    EXPECT_EQ(GPUGPU_DISPATCHDIMY, regAddress);
+    EXPECT_EQ(expectedYAddress, gpuAddress);
+
+    itor = find<MI_LOAD_REGISTER_MEM *>(++itor, cmdList.end());
+    EXPECT_NE(cmdList.end(), itor);
+
+    cmd = genCmdCast<MI_LOAD_REGISTER_MEM *>(*itor);
+    regAddress = cmd->getRegisterAddress();
+    gpuAddress = cmd->getMemoryAddress();
+
+    EXPECT_EQ(GPUGPU_DISPATCHDIMZ, regAddress);
+    EXPECT_EQ(expectedZAddress, gpuAddress);
+
+    MI_STORE_REGISTER_MEM *cmd2 = nullptr;
+    // Find group count cmds
+    do {
+        itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
+        cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
+    } while (itor != cmdList.end() && cmd2->getRegisterAddress() != GPUGPU_DISPATCHDIMX);
+    EXPECT_NE(cmdList.end(), itor);
+
+    auto groupCountStoreRegisterMemCmd = FamilyType::cmdInitStoreRegisterMem;
+    groupCountStoreRegisterMemCmd.setRegisterAddress(GPUGPU_DISPATCHDIMX);
+    groupCountStoreRegisterMemCmd.setMemoryAddress(commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress() + numWorkGroupXOffset - sizeof(INLINE_DATA));
+
+    EXPECT_EQ(cmd2->getRegisterAddress(), groupCountStoreRegisterMemCmd.getRegisterAddress());
+    EXPECT_EQ(cmd2->getMemoryAddress(), groupCountStoreRegisterMemCmd.getMemoryAddress());
+
+    itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
+    EXPECT_NE(cmdList.end(), itor);
+    cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
+
+    groupCountStoreRegisterMemCmd.setRegisterAddress(GPUGPU_DISPATCHDIMY);
+    groupCountStoreRegisterMemCmd.setMemoryAddress(commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress() + numWorkGroupYOffset - sizeof(INLINE_DATA));
+
+    EXPECT_EQ(cmd2->getRegisterAddress(), groupCountStoreRegisterMemCmd.getRegisterAddress());
+    EXPECT_EQ(cmd2->getMemoryAddress(), groupCountStoreRegisterMemCmd.getMemoryAddress());
+
+    itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
+    EXPECT_NE(cmdList.end(), itor);
+    cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
+
+    groupCountStoreRegisterMemCmd.setRegisterAddress(GPUGPU_DISPATCHDIMZ);
+    groupCountStoreRegisterMemCmd.setMemoryAddress(commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress() + numWorkGroupZOffset - sizeof(INLINE_DATA));
+
+    EXPECT_EQ(cmd2->getRegisterAddress(), groupCountStoreRegisterMemCmd.getRegisterAddress());
+    EXPECT_EQ(cmd2->getMemoryAddress(), groupCountStoreRegisterMemCmd.getMemoryAddress());
+
+    auto workSizeStoreRegisterMemCmd = FamilyType::cmdInitStoreRegisterMem;
+    workSizeStoreRegisterMemCmd.setRegisterAddress(CS_GPR_R1);
+
+    // Find workgroup size cmds
+    itor = find<MI_LOAD_REGISTER_REG *>(++itor, cmdList.end());
+    EXPECT_NE(cmdList.end(), itor);
+    itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
+    EXPECT_NE(cmdList.end(), itor);
+    itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
+    EXPECT_NE(cmdList.end(), itor);
+    cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
+
+    workSizeStoreRegisterMemCmd.setMemoryAddress(commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress() + globalWorkSizeXOffset - sizeof(INLINE_DATA));
+
+    EXPECT_EQ(cmd2->getRegisterAddress(), workSizeStoreRegisterMemCmd.getRegisterAddress());
+    EXPECT_EQ(cmd2->getMemoryAddress(), workSizeStoreRegisterMemCmd.getMemoryAddress());
+
+    itor = find<MI_LOAD_REGISTER_REG *>(++itor, cmdList.end());
+    EXPECT_NE(cmdList.end(), itor);
+    itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
+    EXPECT_NE(cmdList.end(), itor);
+    itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
+    EXPECT_NE(cmdList.end(), itor);
+    cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
+
+    workSizeStoreRegisterMemCmd.setMemoryAddress(commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress() + globalWorkSizeYOffset - sizeof(INLINE_DATA));
+
+    EXPECT_EQ(cmd2->getRegisterAddress(), workSizeStoreRegisterMemCmd.getRegisterAddress());
+    EXPECT_EQ(cmd2->getMemoryAddress(), workSizeStoreRegisterMemCmd.getMemoryAddress());
+
+    EXPECT_EQ(cmd2->getRegisterAddress(), workSizeStoreRegisterMemCmd.getRegisterAddress());
+    EXPECT_EQ(cmd2->getMemoryAddress(), workSizeStoreRegisterMemCmd.getMemoryAddress());
+
+    itor = find<MI_LOAD_REGISTER_REG *>(++itor, cmdList.end());
+    EXPECT_NE(cmdList.end(), itor);
+    itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
+    EXPECT_NE(cmdList.end(), itor);
+    itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
+    EXPECT_NE(cmdList.end(), itor);
+    cmd2 = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
+
+    workSizeStoreRegisterMemCmd.setMemoryAddress(commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT)->getGraphicsAllocation()->getGpuAddress() + globalWorkSizeZOffset - sizeof(INLINE_DATA));
+
+    EXPECT_EQ(cmd2->getRegisterAddress(), workSizeStoreRegisterMemCmd.getRegisterAddress());
+    EXPECT_EQ(cmd2->getMemoryAddress(), workSizeStoreRegisterMemCmd.getMemoryAddress());
 
     context->freeMem(alloc);
 }
diff --git a/shared/source/command_container/command_encoder.h b/shared/source/command_container/command_encoder.h
index 6fadbfc99b..865d51ec67 100644
--- a/shared/source/command_container/command_encoder.h
+++ b/shared/source/command_container/command_encoder.h
@@ -173,6 +173,8 @@ struct EncodeIndirectParams {
     using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
     using MI_MATH = typename GfxFamily::MI_MATH;
     using MI_MATH_ALU_INST_INLINE = typename GfxFamily::MI_MATH_ALU_INST_INLINE;
+
+    static void encode(CommandContainer &container, void *crossThreadDataGpuVa, DispatchKernelEncoderI *dispatchInterface);
     static void setGroupCountIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress);
     static void setWorkDimIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offset, void *crossThreadAddress, const uint32_t *groupSize);
     static void setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress, const uint32_t *lws);
diff --git a/shared/source/command_container/command_encoder.inl b/shared/source/command_container/command_encoder.inl
index ba9252eb91..5f75543ac3 100644
--- a/shared/source/command_container/command_encoder.inl
+++ b/shared/source/command_container/command_encoder.inl
@@ -526,6 +526,14 @@ bool EncodeDispatchKernel<Family>::inlineDataProgrammingRequired(const KernelDes
 template <typename Family>
 void EncodeDispatchKernel<Family>::adjustTimestampPacket(WALKER_TYPE &walkerCmd, const HardwareInfo &hwInfo) {}
 
+template <typename Family>
+void EncodeIndirectParams<Family>::encode(CommandContainer &container, void *crossThreadDataGpuVa, DispatchKernelEncoderI *dispatchInterface) {
+    const auto &kernelDescriptor = dispatchInterface->getKernelDescriptor();
+    setGroupCountIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups, crossThreadDataGpuVa);
+    setGlobalWorkSizeIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize, crossThreadDataGpuVa, dispatchInterface->getGroupSize());
+    setWorkDimIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.workDim, crossThreadDataGpuVa, dispatchInterface->getGroupSize());
+}
+
 template <typename Family>
 void EncodeIndirectParams<Family>::setGroupCountIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress) {
     for (int i = 0; i < 3; ++i) {
diff --git a/shared/source/command_container/command_encoder_bdw_and_later.inl b/shared/source/command_container/command_encoder_bdw_and_later.inl
index 7dec24063a..e65fb8805f 100644
--- a/shared/source/command_container/command_encoder_bdw_and_later.inl
+++ b/shared/source/command_container/command_encoder_bdw_and_later.inl
@@ -153,10 +153,8 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
              dispatchInterface->getCrossThreadData(), sizeCrossThreadData);
 
     if (isIndirect) {
-        void *gpuPtr = reinterpret_cast<void *>(heapIndirect->getHeapGpuBase() + heapIndirect->getUsed() - sizeThreadData);
-        EncodeIndirectParams<Family>::setGroupCountIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups, gpuPtr);
-        EncodeIndirectParams<Family>::setGlobalWorkSizeIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize, gpuPtr, dispatchInterface->getGroupSize());
-        EncodeIndirectParams<Family>::setWorkDimIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.workDim, gpuPtr, dispatchInterface->getGroupSize());
+        void *gpuPtr = reinterpret_cast<void *>(heapIndirect->getGraphicsAllocation()->getGpuAddress() + heapIndirect->getUsed() - sizeThreadData);
+        EncodeIndirectParams<Family>::encode(container, gpuPtr, dispatchInterface);
     }
 
     ptr = ptrOffset(ptr, sizeCrossThreadData);
diff --git a/shared/source/command_container/command_encoder_xehp_and_later.inl b/shared/source/command_container/command_encoder_xehp_and_later.inl
index 46cc3e8250..89860ec9ef 100644
--- a/shared/source/command_container/command_encoder_xehp_and_later.inl
+++ b/shared/source/command_container/command_encoder_xehp_and_later.inl
@@ -152,14 +152,15 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
     const uint32_t inlineDataSize = sizeof(INLINE_DATA);
     auto crossThreadData = dispatchInterface->getCrossThreadData();
 
+    uint32_t inlineDataProgrammingOffset = 0u;
+
     if (inlineDataProgramming) {
-        auto copySize = std::min(inlineDataSize, sizeCrossThreadData);
+        inlineDataProgrammingOffset = std::min(inlineDataSize, sizeCrossThreadData);
         auto dest = reinterpret_cast<char *>(walkerCmd.getInlineDataPointer());
-        memcpy_s(dest, copySize, crossThreadData, copySize);
-        auto offset = std::min(inlineDataSize, sizeCrossThreadData);
-        sizeCrossThreadData -= copySize;
-        crossThreadData = ptrOffset(crossThreadData, offset);
-        inlineDataProgramming = copySize != 0;
+        memcpy_s(dest, inlineDataProgrammingOffset, crossThreadData, inlineDataProgrammingOffset);
+        sizeCrossThreadData -= inlineDataProgrammingOffset;
+        crossThreadData = ptrOffset(crossThreadData, inlineDataProgrammingOffset);
+        inlineDataProgramming = inlineDataProgrammingOffset != 0;
     }
 
     uint32_t sizeThreadData = sizePerThreadDataForWholeGroup + sizeCrossThreadData;
@@ -186,10 +187,8 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
                 crossThreadData, sizeCrossThreadData);
     }
     if (isIndirect) {
-        void *gpuPtr = reinterpret_cast<void *>(heap->getHeapGpuBase() + heap->getUsed() - sizeThreadData);
-        EncodeIndirectParams<Family>::setGroupCountIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups, gpuPtr);
-        EncodeIndirectParams<Family>::setGlobalWorkSizeIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize, gpuPtr, dispatchInterface->getGroupSize());
-        EncodeIndirectParams<Family>::setWorkDimIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.workDim, gpuPtr, dispatchInterface->getGroupSize());
+        void *gpuPtr = reinterpret_cast<void *>(heap->getGraphicsAllocation()->getGpuAddress() + static_cast<uint64_t>(heap->getUsed() - sizeThreadData - inlineDataProgrammingOffset));
+        EncodeIndirectParams<Family>::encode(container, gpuPtr, dispatchInterface);
     }
 
     auto perThreadDataPtr = dispatchInterface->getPerThreadData();
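
Illustrative sketch (not part of the patch): the address arithmetic this change corrects, written
with simplified stand-in types rather than the real NEO IndirectHeap/CommandContainer interfaces.
The cross thread data occupies the last sizeThreadData bytes written into the indirect object heap,
and EncodeIndirectParams later patches it via MI_STORE_REGISTER_MEM, so the commands need its GPU VA.

    #include <cstdint>

    // Simplified model of the indirect object heap (hypothetical type, for illustration only).
    struct IndirectHeapModel {
        uint64_t heapGpuBase;     // base of the heap's GPU address range
        uint64_t allocationGpuVa; // GPU VA of the heap's graphics allocation
        uint64_t used;            // bytes consumed in the heap so far
    };

    // Before the patch: the offset was applied to the heap base, which is not
    // necessarily where the written cross thread data lives in the GPU VA space.
    uint64_t crossThreadDataGpuVaOld(const IndirectHeapModel &heap, uint32_t sizeThreadData) {
        return heap.heapGpuBase + heap.used - sizeThreadData;
    }

    // After the patch: the offset is applied to the graphics allocation's GPU VA,
    // and on XE_HP and later the portion of cross thread data that was copied into
    // the COMPUTE_WALKER inline data (and therefore never written to the heap) is
    // subtracted as well, mirroring inlineDataProgrammingOffset in the encoder.
    uint64_t crossThreadDataGpuVaNew(const IndirectHeapModel &heap, uint32_t sizeThreadData,
                                     uint32_t inlineDataProgrammingOffset /* 0 when inline data is not used */) {
        return heap.allocationGpuVa + heap.used - sizeThreadData - inlineDataProgrammingOffset;
    }

The MI_STORE_REGISTER_MEM commands emitted for the indirect group count and global work size then
target this address plus the dispatch-traits offsets, which is what the updated tests check against
getGraphicsAllocation()->getGpuAddress(), with the extra sizeof(INLINE_DATA) adjustment in the
inline-data case.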