From bb518adf347ad69712692f818f5f52ffd3cc6e4e Mon Sep 17 00:00:00 2001 From: Mateusz Jablonski Date: Thu, 3 Apr 2025 16:02:01 +0000 Subject: [PATCH] fix: patching payload arguments in inline data in case of indirect kernel Related-To: NEO-14532 Signed-off-by: Mateusz Jablonski --- level_zero/core/source/cmdlist/cmdlist_hw.h | 1 - level_zero/core/source/cmdlist/cmdlist_hw.inl | 7 - .../test_cmdlist_append_launch_kernel_2.cpp | 8 +- .../command_container/command_encoder.h | 26 ++- .../command_container/command_encoder.inl | 93 ++++++-- .../command_encoder_xehp_and_later.inl | 8 +- .../gen12lp/command_encoder_gen12lp.cpp | 2 +- .../unit_test/encoders/test_encode_math.cpp | 221 +++++++++++++++++- 8 files changed, 326 insertions(+), 40 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h index fba698ab88..69c89ea3b8 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw.h @@ -312,7 +312,6 @@ struct CommandListCoreFamily : public CommandListImp { void applyMemoryRangesBarrier(uint32_t numRanges, const size_t *pRangeSizes, const void **pRanges); - ze_result_t setGlobalWorkSizeIndirect(NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, uint32_t lws[3]); ze_result_t programSyncBuffer(Kernel &kernel, NEO::Device &device, const ze_group_count_t &threadGroupDimensions, size_t &patchIndex); void programRegionGroupBarrier(Kernel &kernel, const ze_group_count_t &threadGroupDimensions, size_t localRegionSize, size_t &patchIndex); void appendWriteKernelTimestamp(Event *event, CommandToPatchContainer *outTimeStampSyncCmds, bool beforeWalker, bool maskLsb, bool workloadPartition, bool copyOperation); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index 765519831d..a2eedf42cd 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -3637,13 +3637,6 @@ inline bool CommandListCoreFamily::isAppendSplitNeeded(NEO::Memor directionOut != NEO::TransferDirection::localToLocal; } -template -ze_result_t CommandListCoreFamily::setGlobalWorkSizeIndirect(NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, uint32_t lws[3]) { - NEO::EncodeIndirectParams::setGlobalWorkSizeIndirect(commandContainer, offsets, crossThreadAddress, lws); - - return ZE_RESULT_SUCCESS; -} - template void CommandListCoreFamily::programStateBaseAddress(NEO::CommandContainer &container, bool useSbaProperties) { using STATE_BASE_ADDRESS = typename GfxFamily::STATE_BASE_ADDRESS; diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp index 71e4b5cad9..9b727573fd 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp @@ -1183,7 +1183,7 @@ HWTEST_F(CmdlistAppendLaunchKernelTests, whenEncodingWorkDimForIndirectDispatchT uint32_t groupSize[] = {1, 1, 1}; auto estimate = EncodeIndirectParams::getCmdsSizeForSetWorkDimIndirect(groupSize, false); auto sizeBefore = commandList->getCmdContainer().getCommandStream()->getUsed(); - EncodeIndirectParams::setWorkDimIndirect(commandList->getCmdContainer(), 0x4, 0u, groupSize); + EncodeIndirectParams::setWorkDimIndirect(commandList->getCmdContainer(), 0x4, 0u, groupSize, nullptr); auto sizeAfter = commandList->getCmdContainer().getCommandStream()->getUsed(); EXPECT_LE(sizeAfter - sizeBefore, estimate); } @@ -1191,7 +1191,7 @@ HWTEST_F(CmdlistAppendLaunchKernelTests, whenEncodingWorkDimForIndirectDispatchT uint32_t groupSize[] = {1, 1, 2}; auto estimate = EncodeIndirectParams::getCmdsSizeForSetWorkDimIndirect(groupSize, false); auto sizeBefore = commandList->getCmdContainer().getCommandStream()->getUsed(); - EncodeIndirectParams::setWorkDimIndirect(commandList->getCmdContainer(), 0x4, 0u, groupSize); + EncodeIndirectParams::setWorkDimIndirect(commandList->getCmdContainer(), 0x4, 0u, groupSize, nullptr); auto sizeAfter = commandList->getCmdContainer().getCommandStream()->getUsed(); EXPECT_LE(sizeAfter - sizeBefore, estimate); } @@ -1199,7 +1199,7 @@ HWTEST_F(CmdlistAppendLaunchKernelTests, whenEncodingWorkDimForIndirectDispatchT uint32_t groupSize[] = {1, 1, 1}; auto estimate = EncodeIndirectParams::getCmdsSizeForSetWorkDimIndirect(groupSize, true); auto sizeBefore = commandList->getCmdContainer().getCommandStream()->getUsed(); - EncodeIndirectParams::setWorkDimIndirect(commandList->getCmdContainer(), 0x2, 0u, groupSize); + EncodeIndirectParams::setWorkDimIndirect(commandList->getCmdContainer(), 0x2, 0u, groupSize, nullptr); auto sizeAfter = commandList->getCmdContainer().getCommandStream()->getUsed(); EXPECT_LE(sizeAfter - sizeBefore, estimate); } @@ -1207,7 +1207,7 @@ HWTEST_F(CmdlistAppendLaunchKernelTests, whenEncodingWorkDimForIndirectDispatchT uint32_t groupSize[] = {1, 1, 2}; auto estimate = EncodeIndirectParams::getCmdsSizeForSetWorkDimIndirect(groupSize, true); auto sizeBefore = commandList->getCmdContainer().getCommandStream()->getUsed(); - EncodeIndirectParams::setWorkDimIndirect(commandList->getCmdContainer(), 0x2, 0u, groupSize); + EncodeIndirectParams::setWorkDimIndirect(commandList->getCmdContainer(), 0x2, 0u, groupSize, nullptr); auto sizeAfter = commandList->getCmdContainer().getCommandStream()->getUsed(); EXPECT_LE(sizeAfter - sizeBefore, estimate); } diff --git a/shared/source/command_container/command_encoder.h b/shared/source/command_container/command_encoder.h index e34b2fee76..8c66143e34 100644 --- a/shared/source/command_container/command_encoder.h +++ b/shared/source/command_container/command_encoder.h @@ -97,6 +97,14 @@ struct EncodeDispatchKernelArgs { } }; +struct EncodeStoreMMIOParams { + uint64_t address; + void *command; + uint32_t offset; + bool workloadPartition; + bool isBcs; +}; + enum class MiPredicateType : uint32_t { disable = 0, noopOnResult2Clear = 1, @@ -346,7 +354,7 @@ struct EncodeMathMMIO { static const size_t size = sizeof(MI_STORE_REGISTER_MEM); - static void encodeMulRegVal(CommandContainer &container, uint32_t offset, uint32_t val, uint64_t dstAddress, bool isBcs); + static void encodeMulRegVal(CommandContainer &container, uint32_t offset, uint32_t val, uint64_t dstAddress, bool isBcs, EncodeStoreMMIOParams *outStoreMMIOParams); static void encodeGreaterThanPredicate(CommandContainer &container, uint64_t lhsVal, uint32_t rhsVal, bool isBcs); @@ -387,6 +395,13 @@ struct EncodeMathMMIO { static void encodeIncrementOrDecrement(LinearStream &cmdStream, AluRegisters operandRegister, IncrementOrDecrementOperation operationType, bool isBcs); }; +struct IndirectParamsInInlineDataArgs { + std::vector commandsToPatch; + bool storeGroupCountInInlineData[3]; + bool storeGlobalWorkSizeInInlineData[3]; + bool storeWorkDimInInlineData; +}; + template struct EncodeIndirectParams { using MI_LOAD_REGISTER_IMM = typename GfxFamily::MI_LOAD_REGISTER_IMM; @@ -396,10 +411,11 @@ struct EncodeIndirectParams { using MI_MATH = typename GfxFamily::MI_MATH; using MI_MATH_ALU_INST_INLINE = typename GfxFamily::MI_MATH_ALU_INST_INLINE; - static void encode(CommandContainer &container, uint64_t crossThreadDataGpuVa, DispatchKernelEncoderI *dispatchInterface, uint64_t implicitArgsGpuPtr); - static void setGroupCountIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress); - static void setWorkDimIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offset, uint64_t crossThreadAddress, const uint32_t *groupSize); - static void setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, const uint32_t *lws); + static void encode(CommandContainer &container, uint64_t crossThreadDataGpuVa, DispatchKernelEncoderI *dispatchInterface, uint64_t implicitArgsGpuPtr, IndirectParamsInInlineDataArgs *outArgs); + static void setGroupCountIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, IndirectParamsInInlineDataArgs *outArgs); + static void setWorkDimIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offset, uint64_t crossThreadAddress, const uint32_t *groupSize, IndirectParamsInInlineDataArgs *outArgs); + static void setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, const uint32_t *lws, IndirectParamsInInlineDataArgs *outArgs); + static void applyInlineDataGpuVA(IndirectParamsInInlineDataArgs &args, uint64_t inlineDataGpuVa); static size_t getCmdsSizeForSetWorkDimIndirect(const uint32_t *groupSize, bool misalignedPtr); }; diff --git a/shared/source/command_container/command_encoder.inl b/shared/source/command_container/command_encoder.inl index 9cabdc9ec5..23a4c23695 100644 --- a/shared/source/command_container/command_encoder.inl +++ b/shared/source/command_container/command_encoder.inl @@ -117,7 +117,7 @@ uint32_t EncodeStates::copySamplerState(IndirectHeap *dsh, } template -void EncodeMathMMIO::encodeMulRegVal(CommandContainer &container, uint32_t offset, uint32_t val, uint64_t dstAddress, bool isBcs) { +void EncodeMathMMIO::encodeMulRegVal(CommandContainer &container, uint32_t offset, uint32_t val, uint64_t dstAddress, bool isBcs, EncodeStoreMMIOParams *outStoreMMIOParams) { int logLws = 0; int i = val; while (val >> logLws) { @@ -139,7 +139,15 @@ void EncodeMathMMIO::encodeMulRegVal(CommandContainer &container, uint32 EncodeSetMMIO::encodeREG(container, RegisterOffsets::csGprR0, RegisterOffsets::csGprR2, isBcs); i++; } - EncodeStoreMMIO::encode(*container.getCommandStream(), RegisterOffsets::csGprR1, dstAddress, false, nullptr, isBcs); + void **outStoreMMIOCmd = nullptr; + if (outStoreMMIOParams) { + outStoreMMIOParams->address = dstAddress; + outStoreMMIOParams->offset = RegisterOffsets::csGprR1; + outStoreMMIOParams->workloadPartition = false; + outStoreMMIOParams->isBcs = isBcs; + outStoreMMIOCmd = &outStoreMMIOParams->command; + } + EncodeStoreMMIO::encode(*container.getCommandStream(), RegisterOffsets::csGprR1, dstAddress, false, outStoreMMIOCmd, isBcs); } /* @@ -586,44 +594,75 @@ bool EncodeDispatchKernel::inlineDataProgrammingRequired(const KernelDes } template -void EncodeIndirectParams::encode(CommandContainer &container, uint64_t crossThreadDataGpuVa, DispatchKernelEncoderI *dispatchInterface, uint64_t implicitArgsGpuPtr) { +void EncodeIndirectParams::encode(CommandContainer &container, uint64_t crossThreadDataGpuVa, DispatchKernelEncoderI *dispatchInterface, uint64_t implicitArgsGpuPtr, IndirectParamsInInlineDataArgs *outArgs) { const auto &kernelDescriptor = dispatchInterface->getKernelDescriptor(); - setGroupCountIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups, crossThreadDataGpuVa); - setGlobalWorkSizeIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize, crossThreadDataGpuVa, dispatchInterface->getGroupSize()); + if (outArgs) { + for (int i = 0; i < 3; i++) { + if (!NEO::isUndefinedOffset(kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups[i]) && kernelDescriptor.kernelAttributes.inlineDataPayloadSize > kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups[i]) { + outArgs->storeGroupCountInInlineData[i] = true; + } + if (!NEO::isUndefinedOffset(kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize[i]) && kernelDescriptor.kernelAttributes.inlineDataPayloadSize > kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize[i]) { + outArgs->storeGlobalWorkSizeInInlineData[i] = true; + } + } + if (!NEO::isUndefinedOffset(kernelDescriptor.payloadMappings.dispatchTraits.workDim) && kernelDescriptor.kernelAttributes.inlineDataPayloadSize > kernelDescriptor.payloadMappings.dispatchTraits.workDim) { + outArgs->storeWorkDimInInlineData = true; + } + } + setGroupCountIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups, crossThreadDataGpuVa, outArgs); + setGlobalWorkSizeIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize, crossThreadDataGpuVa, dispatchInterface->getGroupSize(), outArgs); UNRECOVERABLE_IF(NEO::isValidOffset(kernelDescriptor.payloadMappings.dispatchTraits.workDim) && (kernelDescriptor.payloadMappings.dispatchTraits.workDim & 0b11) != 0u); - setWorkDimIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.workDim, crossThreadDataGpuVa, dispatchInterface->getGroupSize()); + setWorkDimIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.workDim, crossThreadDataGpuVa, dispatchInterface->getGroupSize(), outArgs); if (implicitArgsGpuPtr) { const auto version = container.getDevice()->getGfxCoreHelper().getImplicitArgsVersion(); if (version == 0) { constexpr CrossThreadDataOffset groupCountOffset[] = {offsetof(ImplicitArgsV0, groupCountX), offsetof(ImplicitArgsV0, groupCountY), offsetof(ImplicitArgsV0, groupCountZ)}; constexpr CrossThreadDataOffset globalSizeOffset[] = {offsetof(ImplicitArgsV0, globalSizeX), offsetof(ImplicitArgsV0, globalSizeY), offsetof(ImplicitArgsV0, globalSizeZ)}; constexpr auto numWorkDimOffset = offsetof(ImplicitArgsV0, numWorkDim); - setGroupCountIndirect(container, groupCountOffset, implicitArgsGpuPtr); - setGlobalWorkSizeIndirect(container, globalSizeOffset, implicitArgsGpuPtr, dispatchInterface->getGroupSize()); - setWorkDimIndirect(container, numWorkDimOffset, implicitArgsGpuPtr, dispatchInterface->getGroupSize()); + setGroupCountIndirect(container, groupCountOffset, implicitArgsGpuPtr, nullptr); + setGlobalWorkSizeIndirect(container, globalSizeOffset, implicitArgsGpuPtr, dispatchInterface->getGroupSize(), nullptr); + setWorkDimIndirect(container, numWorkDimOffset, implicitArgsGpuPtr, dispatchInterface->getGroupSize(), nullptr); } else if (version == 1) { constexpr CrossThreadDataOffset groupCountOffsetV1[] = {offsetof(ImplicitArgsV1, groupCountX), offsetof(ImplicitArgsV1, groupCountY), offsetof(ImplicitArgsV1, groupCountZ)}; constexpr CrossThreadDataOffset globalSizeOffsetV1[] = {offsetof(ImplicitArgsV1, globalSizeX), offsetof(ImplicitArgsV1, globalSizeY), offsetof(ImplicitArgsV1, globalSizeZ)}; constexpr auto numWorkDimOffsetV1 = offsetof(ImplicitArgsV1, numWorkDim); - setGroupCountIndirect(container, groupCountOffsetV1, implicitArgsGpuPtr); - setGlobalWorkSizeIndirect(container, globalSizeOffsetV1, implicitArgsGpuPtr, dispatchInterface->getGroupSize()); - setWorkDimIndirect(container, numWorkDimOffsetV1, implicitArgsGpuPtr, dispatchInterface->getGroupSize()); + setGroupCountIndirect(container, groupCountOffsetV1, implicitArgsGpuPtr, nullptr); + setGlobalWorkSizeIndirect(container, globalSizeOffsetV1, implicitArgsGpuPtr, dispatchInterface->getGroupSize(), nullptr); + setWorkDimIndirect(container, numWorkDimOffsetV1, implicitArgsGpuPtr, dispatchInterface->getGroupSize(), nullptr); } } } template -void EncodeIndirectParams::setGroupCountIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress) { +void EncodeIndirectParams::setGroupCountIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, IndirectParamsInInlineDataArgs *outArgs) { for (int i = 0; i < 3; ++i) { if (NEO::isUndefinedOffset(offsets[i])) { continue; } - EncodeStoreMMIO::encode(*container.getCommandStream(), RegisterOffsets::gpgpuDispatchDim[i], ptrOffset(crossThreadAddress, offsets[i]), false, nullptr, false); + void **storeCmd = nullptr; + if (outArgs && outArgs->storeGroupCountInInlineData[i]) { + outArgs->commandsToPatch.push_back({}); + auto &commandArgs = outArgs->commandsToPatch.back(); + storeCmd = &commandArgs.command; + commandArgs.address = offsets[i]; + commandArgs.offset = RegisterOffsets::gpgpuDispatchDim[i]; + commandArgs.isBcs = false; + commandArgs.workloadPartition = false; + } + EncodeStoreMMIO::encode(*container.getCommandStream(), RegisterOffsets::gpgpuDispatchDim[i], ptrOffset(crossThreadAddress, offsets[i]), false, storeCmd, false); } } template -void EncodeIndirectParams::setWorkDimIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset workDimOffset, uint64_t crossThreadAddress, const uint32_t *groupSize) { +void EncodeIndirectParams::applyInlineDataGpuVA(IndirectParamsInInlineDataArgs &args, uint64_t inlineDataGpuVa) { + for (auto &commandArgs : args.commandsToPatch) { + auto commandToPatch = reinterpret_cast(commandArgs.command); + EncodeStoreMMIO::encode(commandToPatch, commandArgs.offset, commandArgs.address + inlineDataGpuVa, commandArgs.workloadPartition, commandArgs.isBcs); + } +} + +template +void EncodeIndirectParams::setWorkDimIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset workDimOffset, uint64_t crossThreadAddress, const uint32_t *groupSize, IndirectParamsInInlineDataArgs *outArgs) { if (NEO::isValidOffset(workDimOffset)) { auto dstPtr = ptrOffset(crossThreadAddress, workDimOffset); constexpr uint32_t resultRegister = RegisterOffsets::csGprR0; @@ -709,7 +748,17 @@ void EncodeIndirectParams::setWorkDimIndirect(CommandContainer &containe EncodeMath::addition(container, resultAluRegister, backupAluRegister, resultAluRegister); } } - EncodeStoreMMIO::encode(*container.getCommandStream(), resultRegister, dstPtr, false, nullptr, false); + void **storeCmd = nullptr; + if (outArgs && outArgs->storeWorkDimInInlineData) { + outArgs->commandsToPatch.push_back({}); + auto &commandArgs = outArgs->commandsToPatch.back(); + storeCmd = &commandArgs.command; + commandArgs.address = workDimOffset; + commandArgs.offset = resultRegister; + commandArgs.isBcs = false; + commandArgs.workloadPartition = false; + } + EncodeStoreMMIO::encode(*container.getCommandStream(), resultRegister, dstPtr, false, storeCmd, false); } } @@ -777,12 +826,20 @@ size_t EncodeDispatchKernel::getDefaultDshAlignment() { } template -void EncodeIndirectParams::setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, const uint32_t *lws) { +void EncodeIndirectParams::setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, const uint32_t *lws, IndirectParamsInInlineDataArgs *outArgs) { for (int i = 0; i < 3; ++i) { if (NEO::isUndefinedOffset(offsets[i])) { continue; } - EncodeMathMMIO::encodeMulRegVal(container, RegisterOffsets::gpgpuDispatchDim[i], lws[i], ptrOffset(crossThreadAddress, offsets[i]), false); + EncodeStoreMMIOParams *storeParams = nullptr; + + auto patchLocation = ptrOffset(crossThreadAddress, offsets[i]); + if (outArgs && outArgs->storeGlobalWorkSizeInInlineData[i]) { + outArgs->commandsToPatch.push_back({}); + storeParams = &outArgs->commandsToPatch.back(); + patchLocation = offsets[i]; + } + EncodeMathMMIO::encodeMulRegVal(container, RegisterOffsets::gpgpuDispatchDim[i], lws[i], patchLocation, false, storeParams); } } diff --git a/shared/source/command_container/command_encoder_xehp_and_later.inl b/shared/source/command_container/command_encoder_xehp_and_later.inl index 50793322fe..20d4361458 100644 --- a/shared/source/command_container/command_encoder_xehp_and_later.inl +++ b/shared/source/command_container/command_encoder_xehp_and_later.inl @@ -251,6 +251,7 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis uint32_t sizeForImplicitArgsPatching = NEO::ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, !localIdsGenerationByRuntime, rootDeviceEnvironment); uint32_t sizeForImplicitArgsStruct = NEO::ImplicitArgsHelper::getSizeForImplicitArgsStruct(pImplicitArgs, kernelDescriptor, true, rootDeviceEnvironment); uint32_t iohRequiredSize = sizeThreadData + sizeForImplicitArgsPatching + args.reserveExtraPayloadSpace; + IndirectParamsInInlineDataArgs encodeIndirectParamsArgs{}; { void *ptr = nullptr; if (!args.makeCommandView) { @@ -279,7 +280,7 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis if (pImplicitArgs) { implicitArgsGpuPtr = gpuPtr + inlineDataProgrammingOffset - sizeForImplicitArgsStruct; } - EncodeIndirectParams::encode(container, gpuPtr, args.dispatchInterface, implicitArgsGpuPtr); + EncodeIndirectParams::encode(container, gpuPtr, args.dispatchInterface, implicitArgsGpuPtr, &encodeIndirectParamsArgs); } } else { ptr = args.cpuPayloadBuffer; @@ -460,6 +461,11 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis } } + if (args.isIndirect) { + auto walkerGpuVa = listCmdBufferStream->getGpuBase() + ptrDiff(args.outWalkerPtr, listCmdBufferStream->getCpuBase()); + EncodeIndirectParams::applyInlineDataGpuVA(encodeIndirectParamsArgs, walkerGpuVa + ptrDiff(walkerCmd.getInlineDataPointer(), &walkerCmd)); + } + if (args.cpuWalkerBuffer) { *reinterpret_cast(args.cpuWalkerBuffer) = walkerCmd; } diff --git a/shared/source/gen12lp/command_encoder_gen12lp.cpp b/shared/source/gen12lp/command_encoder_gen12lp.cpp index 4506bbf9e4..a5e96ff174 100644 --- a/shared/source/gen12lp/command_encoder_gen12lp.cpp +++ b/shared/source/gen12lp/command_encoder_gen12lp.cpp @@ -213,7 +213,7 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis if (args.isIndirect) { auto crossThreadDataGpuVA = heapIndirect->getGraphicsAllocation()->getGpuAddress() + heapIndirect->getUsed() - sizeThreadData; - EncodeIndirectParams::encode(container, crossThreadDataGpuVA, args.dispatchInterface, implicitArgsGpuVA); + EncodeIndirectParams::encode(container, crossThreadDataGpuVA, args.dispatchInterface, implicitArgsGpuVA, nullptr); } ptr = ptrOffset(ptr, sizeCrossThreadData); diff --git a/shared/test/unit_test/encoders/test_encode_math.cpp b/shared/test/unit_test/encoders/test_encode_math.cpp index 4e9ee65f8d..fde1616e36 100644 --- a/shared/test/unit_test/encoders/test_encode_math.cpp +++ b/shared/test/unit_test/encoders/test_encode_math.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2024 Intel Corporation + * Copyright (C) 2020-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -12,6 +12,7 @@ #include "shared/test/common/fixtures/device_fixture.h" #include "shared/test/common/mocks/mock_device.h" #include "shared/test/common/test_macros/hw_test.h" +#include "shared/test/unit_test/mocks/mock_dispatch_kernel_encoder_interface.h" using namespace NEO; @@ -231,7 +232,7 @@ HWTEST_F(CommandEncoderMathTest, WhenSettingGroupSizeIndirectThenCommandsAreCorr uint32_t crossThreadAddress[3] = {}; uint32_t lws[3] = {2, 1, 1}; - EncodeIndirectParams::setGlobalWorkSizeIndirect(cmdContainer, offsets, reinterpret_cast(crossThreadAddress), lws); + EncodeIndirectParams::setGlobalWorkSizeIndirect(cmdContainer, offsets, reinterpret_cast(crossThreadAddress), lws, nullptr); GenCmdList commands; CmdParse::parseCommandBuffer(commands, ptrOffset(cmdContainer.getCommandStream()->getCpuBase(), 0), cmdContainer.getCommandStream()->getUsed()); @@ -254,7 +255,7 @@ HWTEST_F(CommandEncoderMathTest, WhenSettingGroupCountIndirectThenCommandsAreCor CrossThreadDataOffset offsets[3] = {0, sizeof(uint32_t), 2 * sizeof(uint32_t)}; uint32_t crossThreadAddress[3] = {}; - EncodeIndirectParams::setGroupCountIndirect(cmdContainer, offsets, reinterpret_cast(crossThreadAddress)); + EncodeIndirectParams::setGroupCountIndirect(cmdContainer, offsets, reinterpret_cast(crossThreadAddress), nullptr); GenCmdList commands; CmdParse::parseCommandBuffer(commands, ptrOffset(cmdContainer.getCommandStream()->getCpuBase(), 0), cmdContainer.getCommandStream()->getUsed()); @@ -274,6 +275,220 @@ HWTEST_F(CommandEncoderMathTest, WhenSettingGroupCountIndirectThenCommandsAreCor ASSERT_EQ(itor, commands.end()); } +HWTEST_F(CommandEncoderMathTest, givenPayloadArgumentStoredInInlineDataWhenSettingGroupCountIndirectThenInlineDataRelatedCommandIsStoredInCommandsToPatch) { + using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM; + + CommandContainer cmdContainer; + cmdContainer.initialize(pDevice, nullptr, HeapSize::defaultHeapSize, true, false); + + CrossThreadDataOffset offsets[3] = {0, sizeof(uint32_t), 2 * sizeof(uint32_t)}; + uint64_t crossThreadGpuVa = 0xBADF000; + + IndirectParamsInInlineDataArgs args{}; + args.storeGroupCountInInlineData[1] = true; + + EncodeIndirectParams::setGroupCountIndirect(cmdContainer, offsets, crossThreadGpuVa, &args); + + EXPECT_EQ(1u, args.commandsToPatch.size()); + + GenCmdList commands; + CmdParse::parseCommandBuffer(commands, ptrOffset(cmdContainer.getCommandStream()->getCpuBase(), 0), cmdContainer.getCommandStream()->getUsed()); + + auto itor = commands.begin(); + + itor = find(itor, commands.end()); + ASSERT_NE(itor, commands.end()); + auto storeRegMem = reinterpret_cast(*itor); + EXPECT_EQ(crossThreadGpuVa + offsets[0], storeRegMem->getMemoryAddress()); + + itor = find(++itor, commands.end()); + ASSERT_NE(itor, commands.end()); + EXPECT_EQ(*itor, args.commandsToPatch[0].command); + storeRegMem = reinterpret_cast(*itor); + EXPECT_EQ(crossThreadGpuVa + offsets[1], storeRegMem->getMemoryAddress()); + EXPECT_EQ(args.commandsToPatch[0].address, offsets[1]); + ; + EXPECT_EQ(args.commandsToPatch[0].offset, storeRegMem->getRegisterAddress()); + + itor = find(++itor, commands.end()); + ASSERT_NE(itor, commands.end()); + storeRegMem = reinterpret_cast(*itor); + EXPECT_EQ(crossThreadGpuVa + offsets[2], storeRegMem->getMemoryAddress()); + + itor = find(++itor, commands.end()); + ASSERT_EQ(itor, commands.end()); +} + +HWTEST_F(CommandEncoderMathTest, givenPayloadArgumentStoredInInlineDataWhenSettingGlobalGroupSizeIndirectThenInlineDataRelatedCommandIsStoredInCommandsToPatch) { + using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM; + + CommandContainer cmdContainer; + cmdContainer.initialize(pDevice, nullptr, HeapSize::defaultHeapSize, true, false); + + CrossThreadDataOffset offsets[3] = {0, sizeof(uint32_t), 2 * sizeof(uint32_t)}; + uint64_t crossThreadGpuVa = 0xBADF000; + + IndirectParamsInInlineDataArgs args{}; + args.storeGlobalWorkSizeInInlineData[1] = true; + + uint32_t lws[3] = {1, 2, 3}; + + EncodeIndirectParams::setGlobalWorkSizeIndirect(cmdContainer, offsets, crossThreadGpuVa, lws, &args); + + EXPECT_EQ(1u, args.commandsToPatch.size()); + + GenCmdList commands; + CmdParse::parseCommandBuffer(commands, ptrOffset(cmdContainer.getCommandStream()->getCpuBase(), 0), cmdContainer.getCommandStream()->getUsed()); + + auto itor = commands.begin(); + + itor = find(itor, commands.end()); + ASSERT_NE(itor, commands.end()); + auto storeRegMem = reinterpret_cast(*itor); + EXPECT_EQ(crossThreadGpuVa + offsets[0], storeRegMem->getMemoryAddress()); + + itor = find(++itor, commands.end()); + ASSERT_NE(itor, commands.end()); + EXPECT_EQ(*itor, args.commandsToPatch[0].command); + storeRegMem = reinterpret_cast(*itor); + EXPECT_EQ(offsets[1], storeRegMem->getMemoryAddress()); + EXPECT_EQ(args.commandsToPatch[0].address, offsets[1]); + ; + EXPECT_EQ(args.commandsToPatch[0].offset, storeRegMem->getRegisterAddress()); + + itor = find(++itor, commands.end()); + ASSERT_NE(itor, commands.end()); + storeRegMem = reinterpret_cast(*itor); + EXPECT_EQ(crossThreadGpuVa + offsets[2], storeRegMem->getMemoryAddress()); + + itor = find(++itor, commands.end()); + ASSERT_EQ(itor, commands.end()); +} + +HWTEST_F(CommandEncoderMathTest, givenPayloadArgumentStoredInInlineDataWhenSettingWorkDimIndirectThenInlineDataRelatedCommandIsStoredInCommandsToPatch) { + using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM; + + CommandContainer cmdContainer; + cmdContainer.initialize(pDevice, nullptr, HeapSize::defaultHeapSize, true, false); + + CrossThreadDataOffset offset = sizeof(uint32_t); + uint64_t crossThreadGpuVa = 0xBADF000; + + IndirectParamsInInlineDataArgs args{}; + args.storeWorkDimInInlineData = true; + + uint32_t groupSizes[3] = {1, 2, 3}; + + EncodeIndirectParams::setWorkDimIndirect(cmdContainer, offset, crossThreadGpuVa, groupSizes, &args); + + EXPECT_EQ(1u, args.commandsToPatch.size()); + + GenCmdList commands; + CmdParse::parseCommandBuffer(commands, ptrOffset(cmdContainer.getCommandStream()->getCpuBase(), 0), cmdContainer.getCommandStream()->getUsed()); + + auto itor = commands.begin(); + + itor = find(++itor, commands.end()); + ASSERT_NE(itor, commands.end()); + EXPECT_EQ(*itor, args.commandsToPatch[0].command); + auto storeRegMem = reinterpret_cast(*itor); + EXPECT_EQ(args.commandsToPatch[0].address, offset); + ; + EXPECT_EQ(args.commandsToPatch[0].offset, storeRegMem->getRegisterAddress()); + + itor = find(++itor, commands.end()); + ASSERT_EQ(itor, commands.end()); +} + +HWTEST_F(CommandEncoderMathTest, givenPayloadArgumentStoredInInlineDataWhenEncodeIndirectParamsAndApplyingInlineGpuVaThenCorrectCommandsAreProgrammed) { + using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM; + + for (auto workDimInInlineData : ::testing::Bool()) { + CommandContainer cmdContainer; + cmdContainer.initialize(pDevice, nullptr, HeapSize::defaultHeapSize, true, false); + + uint64_t crossThreadGpuVa = 0xBADF000; + + IndirectParamsInInlineDataArgs args{}; + + MockDispatchKernelEncoder dispatchInterface; + + auto &kernelDescriptor = dispatchInterface.kernelDescriptor; + uint32_t groupSizes[3] = {1, 2, 3}; + dispatchInterface.getGroupSizeResult = groupSizes; + + kernelDescriptor.kernelAttributes.inlineDataPayloadSize = 0x100; + kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups[0] = 0x8; + kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups[1] = 0x120; + kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups[2] = undefined; + + kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize[0] = undefined; + kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize[1] = 0x20; + kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize[2] = 0x100; + + kernelDescriptor.payloadMappings.dispatchTraits.workDim = workDimInInlineData ? 0x60 : 0x110; + + EncodeIndirectParams::encode(cmdContainer, crossThreadGpuVa, &dispatchInterface, 0u, &args); + + if (workDimInInlineData) { + EXPECT_EQ(3u, args.commandsToPatch.size()); + } else { + EXPECT_EQ(2u, args.commandsToPatch.size()); + } + EXPECT_TRUE(args.storeGroupCountInInlineData[0]); + EXPECT_FALSE(args.storeGroupCountInInlineData[1]); + EXPECT_FALSE(args.storeGroupCountInInlineData[2]); + + EXPECT_FALSE(args.storeGlobalWorkSizeInInlineData[0]); + EXPECT_TRUE(args.storeGlobalWorkSizeInInlineData[1]); + EXPECT_FALSE(args.storeGlobalWorkSizeInInlineData[2]); + + EXPECT_EQ(workDimInInlineData, args.storeWorkDimInInlineData); + + uint64_t inlineDataGpuVa = 0x12340000; + EncodeIndirectParams::applyInlineDataGpuVA(args, inlineDataGpuVa); + + GenCmdList commands; + CmdParse::parseCommandBuffer(commands, ptrOffset(cmdContainer.getCommandStream()->getCpuBase(), 0), cmdContainer.getCommandStream()->getUsed()); + + auto itor = commands.begin(); + itor = find(itor, commands.end()); + ASSERT_NE(itor, commands.end()); + auto storeRegMem = reinterpret_cast(*itor); + EXPECT_EQ(storeRegMem, args.commandsToPatch[0].command); + EXPECT_EQ(inlineDataGpuVa + kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups[0], storeRegMem->getMemoryAddress()); + + itor = find(++itor, commands.end()); + ASSERT_NE(itor, commands.end()); + storeRegMem = reinterpret_cast(*itor); + EXPECT_EQ(crossThreadGpuVa + kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups[1], storeRegMem->getMemoryAddress()); + + itor = find(++itor, commands.end()); + ASSERT_NE(itor, commands.end()); + storeRegMem = reinterpret_cast(*itor); + EXPECT_EQ(storeRegMem, args.commandsToPatch[1].command); + EXPECT_EQ(inlineDataGpuVa + kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize[1], storeRegMem->getMemoryAddress()); + + itor = find(++itor, commands.end()); + ASSERT_NE(itor, commands.end()); + storeRegMem = reinterpret_cast(*itor); + EXPECT_EQ(crossThreadGpuVa + kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize[2], storeRegMem->getMemoryAddress()); + + itor = find(++itor, commands.end()); + ASSERT_NE(itor, commands.end()); + storeRegMem = reinterpret_cast(*itor); + if (workDimInInlineData) { + EXPECT_EQ(storeRegMem, args.commandsToPatch[2].command); + EXPECT_EQ(inlineDataGpuVa + kernelDescriptor.payloadMappings.dispatchTraits.workDim, storeRegMem->getMemoryAddress()); + } else { + EXPECT_EQ(crossThreadGpuVa + kernelDescriptor.payloadMappings.dispatchTraits.workDim, storeRegMem->getMemoryAddress()); + } + + itor = find(++itor, commands.end()); + ASSERT_EQ(itor, commands.end()); + } +} + using CommandEncodeAluTests = ::testing::Test; HWTEST_F(CommandEncodeAluTests, whenAskingForIncrementOrDecrementCmdsSizeThenReturnCorrectValue) {