diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h index 6a4766c0e3..e3bdce8313 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw.h @@ -20,6 +20,9 @@ namespace NEO { enum class MemoryPool; enum class ImageType; +struct EncodeDispatchKernelArgs; +struct KernelDescriptor; + } // namespace NEO namespace L0 { @@ -359,7 +362,7 @@ struct CommandListCoreFamily : public CommandListImp { bool isInOrderNonWalkerSignalingRequired(const Event *event) const; bool hasInOrderDependencies() const; void appendFullSynchronizedDispatchInit(); - + void addPatchScratchAddressInImplicitArgs(CommandsToPatch &commandsToPatch, NEO::EncodeDispatchKernelArgs &args, const NEO::KernelDescriptor &kernelDescriptor, bool kernelNeedsImplicitArgs); size_t addCmdForPatching(std::shared_ptr *externalInOrderExecInfo, void *cmd1, void *cmd2, uint64_t counterValue, NEO::InOrderPatchCommandHelpers::PatchCmdType patchCmdType); uint64_t getInOrderIncrementValue() const; bool isSkippingInOrderBarrierAllowed(ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) const; diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index 009e9d8ee4..35d7210807 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -4097,6 +4097,10 @@ void CommandListCoreFamily::appendFullSynchronizedDispatchInit() NEO::EncodeMiPredicate::encode(*cmdStream, NEO::MiPredicateType::disable); } +template +void CommandListCoreFamily::addPatchScratchAddressInImplicitArgs(CommandsToPatch &commandsToPatch, NEO::EncodeDispatchKernelArgs &args, const NEO::KernelDescriptor &kernelDescriptor, bool kernelNeedsImplicitArgs) { +} + template void CommandListCoreFamily::appendSynchronizedDispatchCleanupSection() { if (getSynchronizedDispatchMode() != NEO::SynchronizedDispatchMode::full) { diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl index 582d589953..8358e1b86d 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl @@ -126,6 +126,7 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(K ", Group count: ", threadGroupDimensions.groupCountX, ", ", threadGroupDimensions.groupCountY, ", ", threadGroupDimensions.groupCountZ, ", SIMD: ", kernelInfo->getMaxSimdSize()); + bool kernelNeedsImplicitArgs = kernel->getImplicitArgs() != nullptr; bool needScratchSpace = false; bool kernelNeedsScratchSpace = false; for (uint32_t slotId = 0u; slotId < 2; slotId++) { @@ -373,6 +374,8 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(K scratchInlineData.baseAddress = ssh->getGpuBase(); } commandsToPatch.push_back(scratchInlineData); + + addPatchScratchAddressInImplicitArgs(commandsToPatch, dispatchKernelArgs, kernelDescriptor, kernelNeedsImplicitArgs); } if (!this->isFlushTaskSubmissionEnabled) { diff --git a/level_zero/core/source/cmdlist/cmdlist_launch_params.h b/level_zero/core/source/cmdlist/cmdlist_launch_params.h index d4781d451d..bc788db947 100644 --- a/level_zero/core/source/cmdlist/cmdlist_launch_params.h +++ b/level_zero/core/source/cmdlist/cmdlist_launch_params.h @@ -31,6 +31,7 @@ struct CommandToPatch { CbWaitEventSemaphoreWait, CbWaitEventLoadRegisterImm, ComputeWalkerInlineDataScratch, + ComputeWalkerImplicitArgsScratch, Invalid }; void *pDestination = nullptr; diff --git a/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl b/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl index 48b9a685b3..0c9c60c6bc 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl @@ -173,11 +173,11 @@ void CommandQueueHw::patchCommands(CommandList &commandList, uint using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT; using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION; - bool patchNewInlineScratchAddress = false; + bool patchNewScratchAddress = false; if (this->heaplessModeEnabled && (commandList.getCommandListPatchedPerThreadScratchSize(0) < perThreadScratchSpaceSlot0Size || commandList.getCommandListPatchedPerThreadScratchSize(1) < perThreadScratchSpaceSlot1Size)) { - patchNewInlineScratchAddress = true; + patchNewScratchAddress = true; } auto &commandsToPatch = commandList.getCommandsToPatch(); @@ -241,7 +241,16 @@ void CommandQueueHw::patchCommands(CommandList &commandList, uint break; } case CommandToPatch::ComputeWalkerInlineDataScratch: { - if (!patchNewInlineScratchAddress) { + if (!patchNewScratchAddress) { + continue; + } + uint64_t fullScratchAddress = scratchAddress + commandToPatch.baseAddress; + void *scratchAddressPatch = ptrOffset(commandToPatch.pDestination, commandToPatch.offset); + std::memcpy(scratchAddressPatch, &fullScratchAddress, commandToPatch.patchSize); + break; + } + case CommandToPatch::ComputeWalkerImplicitArgsScratch: { + if (!patchNewScratchAddress) { continue; } uint64_t fullScratchAddress = scratchAddress + commandToPatch.baseAddress; @@ -254,7 +263,7 @@ void CommandQueueHw::patchCommands(CommandList &commandList, uint } } - if (patchNewInlineScratchAddress) { + if (patchNewScratchAddress) { commandList.setCommandListPatchedPerThreadScratchSize(0, perThreadScratchSpaceSlot0Size); commandList.setCommandListPatchedPerThreadScratchSize(1, perThreadScratchSpaceSlot1Size); } diff --git a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h index 6407db14f1..194a6ff131 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h +++ b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h @@ -32,6 +32,7 @@ struct WhiteBox<::L0::CommandListCoreFamily> using BaseClass = ::L0::CommandListCoreFamily; using BaseClass::addCmdForPatching; using BaseClass::addFlushRequiredCommand; + using BaseClass::addPatchScratchAddressInImplicitArgs; using BaseClass::allocateOrReuseKernelPrivateMemoryIfNeeded; using BaseClass::allowCbWaitEventsNoopDispatch; using BaseClass::appendBlitFill; diff --git a/shared/source/command_container/command_encoder.h b/shared/source/command_container/command_encoder.h index 4c580e9641..3a0082faa2 100644 --- a/shared/source/command_container/command_encoder.h +++ b/shared/source/command_container/command_encoder.h @@ -208,6 +208,8 @@ struct EncodeDispatchKernel { static void patchScratchAddressInImplicitArgs(ImplicitArgs &implicitArgs, uint64_t scratchAddress, bool scratchPtrPatchingRequired); static size_t getInlineDataOffset(EncodeDispatchKernelArgs &args); + static void *getImplicitArgsAddress(EncodeDispatchKernelArgs &args, const KernelDescriptor &kernelDescriptor); + static size_t getScratchPtrOffsetOfImplicitArgs(); }; template diff --git a/shared/source/command_container/command_encoder.inl b/shared/source/command_container/command_encoder.inl index ad10f00d85..e8f523f0b7 100644 --- a/shared/source/command_container/command_encoder.inl +++ b/shared/source/command_container/command_encoder.inl @@ -906,6 +906,16 @@ size_t EncodeDispatchKernel::getDefaultDshAlignment() { return EncodeStates::alignIndirectStatePointer; } +template +void *EncodeDispatchKernel::getImplicitArgsAddress(EncodeDispatchKernelArgs &args, const KernelDescriptor &kernelDescriptor) { + return nullptr; +} + +template +size_t EncodeDispatchKernel::getScratchPtrOffsetOfImplicitArgs() { + return 0; +} + template template void EncodeDispatchKernel::setScratchAddress(uint64_t &scratchAddress, uint32_t requiredScratchSlot0Size, uint32_t requiredScratchSlot1Size, IndirectHeap *ssh, CommandStreamReceiver &csr) { diff --git a/shared/test/unit_test/command_container/command_encoder_tests.cpp b/shared/test/unit_test/command_container/command_encoder_tests.cpp index 2ce925bbcf..139c46cd5a 100644 --- a/shared/test/unit_test/command_container/command_encoder_tests.cpp +++ b/shared/test/unit_test/command_container/command_encoder_tests.cpp @@ -698,3 +698,17 @@ HWTEST_F(CommandEncoderTests, givenInterfaceDescriptorWhenEncodeEuSchedulingPoli EXPECT_EQ(expectedIdd.getRawData(i), idd.getRawData(i)); } } + +HWTEST_F(CommandEncoderTests, whenGetImplicitArgsAddressIsCalledThenNullptrIsReturned) { + + KernelDescriptor kernelDescriptor{}; + EncodeDispatchKernelArgs args{}; + auto implicitArgsPtr = EncodeDispatchKernel::getImplicitArgsAddress(args, kernelDescriptor); + EXPECT_EQ(nullptr, implicitArgsPtr); +} + +HWTEST_F(CommandEncoderTests, whenGetScratchPtrOffsetOfImplicitArgsIsCalledThenZeroIsReturned) { + + auto scratchOffset = EncodeDispatchKernel::getScratchPtrOffsetOfImplicitArgs(); + EXPECT_EQ(0u, scratchOffset); +}