From 5d2d81b2d1f92ff4536c4b5554e61848c045675a Mon Sep 17 00:00:00 2001 From: Mateusz Jablonski Date: Wed, 6 Oct 2021 17:32:04 +0000 Subject: [PATCH] Use uint64_t instead of void * in indirect dispatch programming Signed-off-by: Mateusz Jablonski --- level_zero/core/source/cmdlist/cmdlist_hw.h | 2 +- level_zero/core/source/cmdlist/cmdlist_hw.inl | 2 +- .../test_cmdlist_append_launch_kernel_2.cpp | 8 ++++---- shared/source/command_container/command_encoder.h | 8 ++++---- .../source/command_container/command_encoder.inl | 14 +++++++------- .../command_encoder_bdw_and_later.inl | 6 +++--- .../command_encoder_xehp_and_later.inl | 6 +++--- .../test/unit_test/encoders/test_encode_math.cpp | 4 ++-- 8 files changed, 25 insertions(+), 25 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h index 0500f57d0e..52553e9419 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw.h @@ -230,7 +230,7 @@ struct CommandListCoreFamily : CommandListImp { void applyMemoryRangesBarrier(uint32_t numRanges, const size_t *pRangeSizes, const void **pRanges); - ze_result_t setGlobalWorkSizeIndirect(NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress, uint32_t lws[3]); + ze_result_t setGlobalWorkSizeIndirect(NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, uint32_t lws[3]); ze_result_t programSyncBuffer(Kernel &kernel, NEO::Device &device, const ze_group_count_t *pThreadGroupDimensions); void appendWriteKernelTimestamp(ze_event_handle_t hEvent, bool beforeWalker, bool maskLsb); void adjustWriteKernelTimestamp(uint64_t globalAddress, uint64_t contextAddress, bool maskLsb, uint32_t mask); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index 2dd6eed28d..d622905bd8 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -2246,7 +2246,7 @@ void CommandListCoreFamily::clearCommandsToPatch() { } template -ze_result_t CommandListCoreFamily::setGlobalWorkSizeIndirect(NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress, uint32_t lws[3]) { +ze_result_t CommandListCoreFamily::setGlobalWorkSizeIndirect(NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, uint32_t lws[3]) { using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; NEO::EncodeIndirectParams::setGlobalWorkSizeIndirect(commandContainer, offsets, crossThreadAddress, lws); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp index c1dbcbe3de..6fa18a9982 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp @@ -1043,7 +1043,7 @@ HWTEST_F(CmdlistAppendLaunchKernelTests, whenEncodingWorkDimForIndirectDispatchT uint32_t groupSize[] = {1, 1, 1}; auto estimate = EncodeIndirectParams::getCmdsSizeForSetWorkDimIndirect(groupSize, false); auto sizeBefore = commandList->commandContainer.getCommandStream()->getUsed(); - EncodeIndirectParams::setWorkDimIndirect(commandList->commandContainer, 0x4, nullptr, groupSize); + EncodeIndirectParams::setWorkDimIndirect(commandList->commandContainer, 0x4, 0u, groupSize); auto sizeAfter = commandList->commandContainer.getCommandStream()->getUsed(); EXPECT_LE(sizeAfter - sizeBefore, estimate); } @@ -1051,7 +1051,7 @@ HWTEST_F(CmdlistAppendLaunchKernelTests, whenEncodingWorkDimForIndirectDispatchT uint32_t groupSize[] = {1, 1, 2}; auto estimate = EncodeIndirectParams::getCmdsSizeForSetWorkDimIndirect(groupSize, false); auto sizeBefore = commandList->commandContainer.getCommandStream()->getUsed(); - EncodeIndirectParams::setWorkDimIndirect(commandList->commandContainer, 0x4, nullptr, groupSize); + EncodeIndirectParams::setWorkDimIndirect(commandList->commandContainer, 0x4, 0u, groupSize); auto sizeAfter = commandList->commandContainer.getCommandStream()->getUsed(); EXPECT_LE(sizeAfter - sizeBefore, estimate); } @@ -1059,7 +1059,7 @@ HWTEST_F(CmdlistAppendLaunchKernelTests, whenEncodingWorkDimForIndirectDispatchT uint32_t groupSize[] = {1, 1, 1}; auto estimate = EncodeIndirectParams::getCmdsSizeForSetWorkDimIndirect(groupSize, true); auto sizeBefore = commandList->commandContainer.getCommandStream()->getUsed(); - EncodeIndirectParams::setWorkDimIndirect(commandList->commandContainer, 0x2, nullptr, groupSize); + EncodeIndirectParams::setWorkDimIndirect(commandList->commandContainer, 0x2, 0u, groupSize); auto sizeAfter = commandList->commandContainer.getCommandStream()->getUsed(); EXPECT_LE(sizeAfter - sizeBefore, estimate); } @@ -1067,7 +1067,7 @@ HWTEST_F(CmdlistAppendLaunchKernelTests, whenEncodingWorkDimForIndirectDispatchT uint32_t groupSize[] = {1, 1, 2}; auto estimate = EncodeIndirectParams::getCmdsSizeForSetWorkDimIndirect(groupSize, true); auto sizeBefore = commandList->commandContainer.getCommandStream()->getUsed(); - EncodeIndirectParams::setWorkDimIndirect(commandList->commandContainer, 0x2, nullptr, groupSize); + EncodeIndirectParams::setWorkDimIndirect(commandList->commandContainer, 0x2, 0u, groupSize); auto sizeAfter = commandList->commandContainer.getCommandStream()->getUsed(); EXPECT_LE(sizeAfter - sizeBefore, estimate); } diff --git a/shared/source/command_container/command_encoder.h b/shared/source/command_container/command_encoder.h index 1fd90ce6b9..a08364c7cd 100644 --- a/shared/source/command_container/command_encoder.h +++ b/shared/source/command_container/command_encoder.h @@ -170,10 +170,10 @@ struct EncodeIndirectParams { using MI_MATH = typename GfxFamily::MI_MATH; using MI_MATH_ALU_INST_INLINE = typename GfxFamily::MI_MATH_ALU_INST_INLINE; - static void encode(CommandContainer &container, void *crossThreadDataGpuVa, DispatchKernelEncoderI *dispatchInterface, void *implicitArgsGpuPtr); - static void setGroupCountIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress); - static void setWorkDimIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offset, void *crossThreadAddress, const uint32_t *groupSize); - static void setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress, const uint32_t *lws); + static void encode(CommandContainer &container, uint64_t crossThreadDataGpuVa, DispatchKernelEncoderI *dispatchInterface, uint64_t implicitArgsGpuPtr); + static void setGroupCountIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress); + static void setWorkDimIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offset, uint64_t crossThreadAddress, const uint32_t *groupSize); + static void setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, const uint32_t *lws); static size_t getCmdsSizeForIndirectParams(); static size_t getCmdsSizeForSetGroupSizeIndirect(); diff --git a/shared/source/command_container/command_encoder.inl b/shared/source/command_container/command_encoder.inl index e3b060a8fa..8877484ad9 100644 --- a/shared/source/command_container/command_encoder.inl +++ b/shared/source/command_container/command_encoder.inl @@ -514,7 +514,7 @@ template void EncodeDispatchKernel::adjustTimestampPacket(WALKER_TYPE &walkerCmd, const HardwareInfo &hwInfo) {} template -void EncodeIndirectParams::encode(CommandContainer &container, void *crossThreadDataGpuVa, DispatchKernelEncoderI *dispatchInterface, void *implicitArgsGpuPtr) { +void EncodeIndirectParams::encode(CommandContainer &container, uint64_t crossThreadDataGpuVa, DispatchKernelEncoderI *dispatchInterface, uint64_t implicitArgsGpuPtr) { const auto &kernelDescriptor = dispatchInterface->getKernelDescriptor(); setGroupCountIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups, crossThreadDataGpuVa); setGlobalWorkSizeIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize, crossThreadDataGpuVa, dispatchInterface->getGroupSize()); @@ -530,19 +530,19 @@ void EncodeIndirectParams::encode(CommandContainer &container, void *cro } template -void EncodeIndirectParams::setGroupCountIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress) { +void EncodeIndirectParams::setGroupCountIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress) { for (int i = 0; i < 3; ++i) { if (NEO::isUndefinedOffset(offsets[i])) { continue; } - EncodeStoreMMIO::encode(*container.getCommandStream(), GPUGPU_DISPATCHDIM[i], ptrOffset(reinterpret_cast(crossThreadAddress), offsets[i])); + EncodeStoreMMIO::encode(*container.getCommandStream(), GPUGPU_DISPATCHDIM[i], ptrOffset(crossThreadAddress, offsets[i])); } } template -void EncodeIndirectParams::setWorkDimIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset workDimOffset, void *crossThreadAddress, const uint32_t *groupSize) { +void EncodeIndirectParams::setWorkDimIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset workDimOffset, uint64_t crossThreadAddress, const uint32_t *groupSize) { if (NEO::isValidOffset(workDimOffset)) { - auto dstPtr = ptrOffset(reinterpret_cast(crossThreadAddress), workDimOffset); + auto dstPtr = ptrOffset(crossThreadAddress, workDimOffset); constexpr uint32_t RESULT_REGISTER = CS_GPR_R0; constexpr AluRegisters RESULT_ALU_REGISTER = AluRegisters::R_0; const uint32_t offset = static_cast((1ull << 8 * (dstPtr & 0b11)) - 1); @@ -650,12 +650,12 @@ template void EncodeDispatchKernel::adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const HardwareInfo &hwInfo) {} template -void EncodeIndirectParams::setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress, const uint32_t *lws) { +void EncodeIndirectParams::setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, const uint32_t *lws) { for (int i = 0; i < 3; ++i) { if (NEO::isUndefinedOffset(offsets[i])) { continue; } - EncodeMathMMIO::encodeMulRegVal(container, GPUGPU_DISPATCHDIM[i], lws[i], ptrOffset(reinterpret_cast(crossThreadAddress), offsets[i])); + EncodeMathMMIO::encodeMulRegVal(container, GPUGPU_DISPATCHDIM[i], lws[i], ptrOffset(crossThreadAddress, offsets[i])); } } diff --git a/shared/source/command_container/command_encoder_bdw_and_later.inl b/shared/source/command_container/command_encoder_bdw_and_later.inl index 443689b23a..50af000f5a 100644 --- a/shared/source/command_container/command_encoder_bdw_and_later.inl +++ b/shared/source/command_container/command_encoder_bdw_and_later.inl @@ -153,10 +153,10 @@ void EncodeDispatchKernel::encode(CommandContainer &container, dispatchInterface->getCrossThreadData(), sizeCrossThreadData); if (isIndirect) { - void *gpuPtr = reinterpret_cast(heapIndirect->getGraphicsAllocation()->getGpuAddress() + heapIndirect->getUsed() - sizeThreadData); - void *implicitArgsGpuPtr = nullptr; + auto gpuPtr = heapIndirect->getGraphicsAllocation()->getGpuAddress() + heapIndirect->getUsed() - sizeThreadData; + uint64_t implicitArgsGpuPtr = 0u; if (pImplicitArgs) { - implicitArgsGpuPtr = reinterpret_cast(reinterpret_cast(gpuPtr) - sizeof(ImplicitArgs)); + implicitArgsGpuPtr = gpuPtr - sizeof(ImplicitArgs); } EncodeIndirectParams::encode(container, gpuPtr, dispatchInterface, implicitArgsGpuPtr); } diff --git a/shared/source/command_container/command_encoder_xehp_and_later.inl b/shared/source/command_container/command_encoder_xehp_and_later.inl index 2b8d074286..837c3bea9b 100644 --- a/shared/source/command_container/command_encoder_xehp_and_later.inl +++ b/shared/source/command_container/command_encoder_xehp_and_later.inl @@ -187,10 +187,10 @@ void EncodeDispatchKernel::encode(CommandContainer &container, crossThreadData, sizeCrossThreadData); } if (isIndirect) { - void *gpuPtr = reinterpret_cast(heap->getGraphicsAllocation()->getGpuAddress() + static_cast(heap->getUsed() - sizeThreadData - inlineDataProgrammingOffset)); - void *implicitArgsGpuPtr = nullptr; + auto gpuPtr = heap->getGraphicsAllocation()->getGpuAddress() + static_cast(heap->getUsed() - sizeThreadData - inlineDataProgrammingOffset); + uint64_t implicitArgsGpuPtr = 0u; if (pImplicitArgs) { - implicitArgsGpuPtr = reinterpret_cast(reinterpret_cast(gpuPtr) + inlineDataProgrammingOffset - sizeof(ImplicitArgs)); + implicitArgsGpuPtr = gpuPtr + inlineDataProgrammingOffset - sizeof(ImplicitArgs); } EncodeIndirectParams::encode(container, gpuPtr, dispatchInterface, implicitArgsGpuPtr); } diff --git a/shared/test/unit_test/encoders/test_encode_math.cpp b/shared/test/unit_test/encoders/test_encode_math.cpp index 97d4a03ffa..e903707da4 100644 --- a/shared/test/unit_test/encoders/test_encode_math.cpp +++ b/shared/test/unit_test/encoders/test_encode_math.cpp @@ -229,7 +229,7 @@ HWTEST_F(CommandEncoderMathTest, WhenSettingGroupSizeIndirectThenCommandsAreCorr uint32_t crossThreadAdress[3] = {}; uint32_t lws[3] = {2, 1, 1}; - EncodeIndirectParams::setGlobalWorkSizeIndirect(cmdContainer, offsets, crossThreadAdress, lws); + EncodeIndirectParams::setGlobalWorkSizeIndirect(cmdContainer, offsets, reinterpret_cast(crossThreadAdress), lws); GenCmdList commands; CmdParse::parseCommandBuffer(commands, ptrOffset(cmdContainer.getCommandStream()->getCpuBase(), 0), cmdContainer.getCommandStream()->getUsed()); @@ -254,7 +254,7 @@ HWTEST_F(CommandEncoderMathTest, WhenSettingGroupCountIndirectThenCommandsAreCor CrossThreadDataOffset offsets[3] = {0, sizeof(uint32_t), 2 * sizeof(uint32_t)}; uint32_t crossThreadAdress[3] = {}; - EncodeIndirectParams::setGroupCountIndirect(cmdContainer, offsets, crossThreadAdress); + EncodeIndirectParams::setGroupCountIndirect(cmdContainer, offsets, reinterpret_cast(crossThreadAdress)); GenCmdList commands; CmdParse::parseCommandBuffer(commands, ptrOffset(cmdContainer.getCommandStream()->getCpuBase(), 0), cmdContainer.getCommandStream()->getUsed());