fix: add scratch ptr in implicit args patching for L0 regular cmdlists

Related-To: NEO-11874
Signed-off-by: Kamil Kopryk <kamil.kopryk@intel.com>
2026-01-04 15:53:45 +08:00 · 2024-07-18 11:03:36 +00:00
parent 153cda9a9f
commit 4008ccea05
9 changed files with 52 additions and 5 deletions
--- a/level_zero/core/source/cmdlist/cmdlist_hw.h
+++ b/level_zero/core/source/cmdlist/cmdlist_hw.h
@@ -20,6 +20,9 @@
 namespace NEO {
 enum class MemoryPool;
 enum class ImageType;
+struct EncodeDispatchKernelArgs;
+struct KernelDescriptor;
+
 } // namespace NEO

 namespace L0 {
@@ -359,7 +362,7 @@ struct CommandListCoreFamily : public CommandListImp {
    bool isInOrderNonWalkerSignalingRequired(const Event *event) const;
    bool hasInOrderDependencies() const;
    void appendFullSynchronizedDispatchInit();
-
+    void addPatchScratchAddressInImplicitArgs(CommandsToPatch &commandsToPatch, NEO::EncodeDispatchKernelArgs &args, const NEO::KernelDescriptor &kernelDescriptor, bool kernelNeedsImplicitArgs);
    size_t addCmdForPatching(std::shared_ptr<NEO::InOrderExecInfo> *externalInOrderExecInfo, void *cmd1, void *cmd2, uint64_t counterValue, NEO::InOrderPatchCommandHelpers::PatchCmdType patchCmdType);
    uint64_t getInOrderIncrementValue() const;
    bool isSkippingInOrderBarrierAllowed(ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) const;
--- a/level_zero/core/source/cmdlist/cmdlist_hw.inl
+++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl
@@ -4097,6 +4097,10 @@ void CommandListCoreFamily<gfxCoreFamily>::appendFullSynchronizedDispatchInit()
    NEO::EncodeMiPredicate<GfxFamily>::encode(*cmdStream, NEO::MiPredicateType::disable);
 }

+template <GFXCORE_FAMILY gfxCoreFamily>
+void CommandListCoreFamily<gfxCoreFamily>::addPatchScratchAddressInImplicitArgs(CommandsToPatch &commandsToPatch, NEO::EncodeDispatchKernelArgs &args, const NEO::KernelDescriptor &kernelDescriptor, bool kernelNeedsImplicitArgs) {
+} // Empty body: this definition adds nothing to commandsToPatch. NOTE(review): presumably specialized elsewhere for families that need ComputeWalkerImplicitArgsScratch entries - confirm.
+
 template <GFXCORE_FAMILY gfxCoreFamily>
 void CommandListCoreFamily<gfxCoreFamily>::appendSynchronizedDispatchCleanupSection() {
    if (getSynchronizedDispatchMode() != NEO::SynchronizedDispatchMode::full) {
--- a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl
+++ b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl
@@ -126,6 +126,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
            ", Group count: ", threadGroupDimensions.groupCountX, ", ", threadGroupDimensions.groupCountY, ", ", threadGroupDimensions.groupCountZ,
            ", SIMD: ", kernelInfo->getMaxSimdSize());

+    bool kernelNeedsImplicitArgs = kernel->getImplicitArgs() != nullptr;
    bool needScratchSpace = false;
    bool kernelNeedsScratchSpace = false;
    for (uint32_t slotId = 0u; slotId < 2; slotId++) {
@@ -373,6 +374,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
            scratchInlineData.baseAddress = ssh->getGpuBase();
        }
        commandsToPatch.push_back(scratchInlineData);
+
+        addPatchScratchAddressInImplicitArgs(commandsToPatch, dispatchKernelArgs, kernelDescriptor, kernelNeedsImplicitArgs);
    }

    if (!this->isFlushTaskSubmissionEnabled) {
--- a/level_zero/core/source/cmdlist/cmdlist_launch_params.h
+++ b/level_zero/core/source/cmdlist/cmdlist_launch_params.h
@@ -31,6 +31,7 @@ struct CommandToPatch {
        CbWaitEventSemaphoreWait,
        CbWaitEventLoadRegisterImm,
        ComputeWalkerInlineDataScratch,
+        ComputeWalkerImplicitArgsScratch,
        Invalid
    };
    void *pDestination = nullptr;
--- a/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl
+++ b/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl
@@ -173,11 +173,11 @@ void CommandQueueHw<gfxCoreFamily>::patchCommands(CommandList &commandList, uint
    using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;
    using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;

-    bool patchNewInlineScratchAddress = false;
+    bool patchNewScratchAddress = false;
    if (this->heaplessModeEnabled &&
        (commandList.getCommandListPatchedPerThreadScratchSize(0) < perThreadScratchSpaceSlot0Size ||
         commandList.getCommandListPatchedPerThreadScratchSize(1) < perThreadScratchSpaceSlot1Size)) {
-        patchNewInlineScratchAddress = true;
+        patchNewScratchAddress = true;
    }

    auto &commandsToPatch = commandList.getCommandsToPatch();
@@ -241,7 +241,16 @@ void CommandQueueHw<gfxCoreFamily>::patchCommands(CommandList &commandList, uint
            break;
        }
        case CommandToPatch::ComputeWalkerInlineDataScratch: {
-            if (!patchNewInlineScratchAddress) {
+            if (!patchNewScratchAddress) {
+                continue;
+            }
+            uint64_t fullScratchAddress = scratchAddress + commandToPatch.baseAddress;
+            void *scratchAddressPatch = ptrOffset(commandToPatch.pDestination, commandToPatch.offset);
+            std::memcpy(scratchAddressPatch, &fullScratchAddress, commandToPatch.patchSize);
+            break;
+        }
+        case CommandToPatch::ComputeWalkerImplicitArgsScratch: {
+            if (!patchNewScratchAddress) {
                continue;
            }
            uint64_t fullScratchAddress = scratchAddress + commandToPatch.baseAddress;
@@ -254,7 +263,7 @@ void CommandQueueHw<gfxCoreFamily>::patchCommands(CommandList &commandList, uint
        }
    }

-    if (patchNewInlineScratchAddress) {
+    if (patchNewScratchAddress) {
        commandList.setCommandListPatchedPerThreadScratchSize(0, perThreadScratchSpaceSlot0Size);
        commandList.setCommandListPatchedPerThreadScratchSize(1, perThreadScratchSpaceSlot1Size);
    }
--- a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h
+++ b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h
@@ -32,6 +32,7 @@ struct WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>
    using BaseClass = ::L0::CommandListCoreFamily<gfxCoreFamily>;
    using BaseClass::addCmdForPatching;
    using BaseClass::addFlushRequiredCommand;
+    using BaseClass::addPatchScratchAddressInImplicitArgs;
    using BaseClass::allocateOrReuseKernelPrivateMemoryIfNeeded;
    using BaseClass::allowCbWaitEventsNoopDispatch;
    using BaseClass::appendBlitFill;