Improve obtaining LinearStream during enqueue call

- Move logic to enqueueHandler to cover all scenarios
- Create BlockedCommandsData not only for Kernel enqueue
- KernelOperation cleanup

Change-Id: Ie4a673cbbc986c685996a38ab296444d38e7bbd5
Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
2025-12-29 00:58:39 +08:00 · 2019-07-18 21:15:50 +02:00
parent 1460713d69
commit 95c2dcd8b0
18 changed files with 264 additions and 220 deletions
--- a/runtime/command_queue/command_queue_hw.h
+++ b/runtime/command_queue/command_queue_hw.h
@@ -337,7 +337,7 @@ class CommandQueueHw : public CommandQueue {
                        bool &blocking,
                        const MultiDispatchInfo &multiDispatchInfo,
                        TimestampPacketContainer *previousTimestampPacketNodes,
-                        KernelOperation *blockedCommandsData,
+                        std::unique_ptr<KernelOperation> &blockedCommandsData,
                        EventsRequest &eventsRequest,
                        bool slmUsed,
                        EventBuilder &externalEventBuilder,
@@ -385,6 +385,29 @@ class CommandQueueHw : public CommandQueue {
    MOCKABLE_VIRTUAL void dispatchAuxTranslation(MultiDispatchInfo &multiDispatchInfo, MemObjsForAuxTranslation &memObjsForAuxTranslation,
                                                 AuxTranslationDirection auxTranslationDirection);

+    template <uint32_t commandType> // Picks the LinearStream that an enqueue of commandType records into.
+    LinearStream *obtainCommandStream(const CsrDependencies &csrDependencies, bool profilingRequired,
+                                      bool perfCountersRequired, bool blitEnqueue, bool blockedQueue,
+                                      const MultiDispatchInfo &multiDispatchInfo,
+                                      std::unique_ptr<KernelOperation> &blockedCommandsData, // out: assigned only on the blocked path below
+                                      Surface **surfaces, size_t numSurfaces) {
+        LinearStream *commandStream = nullptr;
+        if (blockedQueue && !multiDispatchInfo.empty()) { // blocked enqueue that has dispatches: record into a separate stream
+            constexpr size_t additionalAllocationSize = CSRequirements::csOverfetchSize;
+            constexpr size_t allocationSize = MemoryConstants::pageSize64k - CSRequirements::csOverfetchSize; // the two sizes sum to pageSize64k
+            commandStream = new LinearStream(); // raw pointer; handed to KernelOperation below
+
+            auto &gpgpuCsr = getGpgpuCommandStreamReceiver();
+            gpgpuCsr.ensureCommandBufferAllocation(*commandStream, allocationSize, additionalAllocationSize);
+
+            blockedCommandsData = std::make_unique<KernelOperation>(commandStream, *gpgpuCsr.getInternalAllocationStorage()); // NOTE(review): presumably takes ownership of commandStream - confirm against KernelOperation
+        } else { // otherwise hand back the stream returned by getCommandStream
+            commandStream = &getCommandStream<GfxFamily, commandType>(*this, csrDependencies, profilingRequired, perfCountersRequired,
+                                                                      blitEnqueue, multiDispatchInfo, surfaces, numSurfaces);
+        }
+        return commandStream;
+    }
+
  private:
    bool isTaskLevelUpdateRequired(const uint32_t &taskLevel, const cl_event *eventWaitList, const cl_uint &numEventsInWaitList, unsigned int commandType);
    void obtainTaskLevelAndBlockedStatus(unsigned int &taskLevel, cl_uint &numEventsInWaitList, const cl_event *&eventWaitList, bool &blockQueueStatus, unsigned int commandType) override;
@@ -414,7 +437,7 @@ class CommandQueueHw : public CommandQueue {
                                   bool blockQueue,
                                   DeviceQueueHw<GfxFamily> *devQueueHw,
                                   CsrDependencies &csrDeps,
-                                   KernelOperation *&blockedCommandsData,
+                                   KernelOperation *blockedCommandsData,
                                   TimestampPacketContainer &previousTimestampPacketNodes,
                                   PreemptionMode preemption);
 };
--- a/runtime/command_queue/enqueue_common.h
+++ b/runtime/command_queue/enqueue_common.h
@@ -165,7 +165,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,

    bool profilingRequired = (this->isProfilingEnabled() && event != nullptr);
    bool perfCountersRequired = (this->isPerfCountersEnabled() && event != nullptr);
-    KernelOperation *blockedCommandsData = nullptr;
+    std::unique_ptr<KernelOperation> blockedCommandsData;
    std::unique_ptr<PrintfHandler> printfHandler;
    bool slmUsed = multiDispatchInfo.usesSlm() || parentKernel;
    auto preemption = PreemptionHelper::taskPreemptionMode(*device, multiDispatchInfo);
@@ -227,8 +227,8 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
        }
    }

-    auto &commandStream = getCommandStream<GfxFamily, commandType>(*this, csrDeps, profilingRequired, perfCountersRequired,
-                                                                   blitEnqueue, multiDispatchInfo, surfacesForResidency, numSurfaceForResidency);
+    auto &commandStream = *obtainCommandStream<commandType>(csrDeps, profilingRequired, perfCountersRequired, blitEnqueue, blockQueue,
+                                                            multiDispatchInfo, blockedCommandsData, surfacesForResidency, numSurfaceForResidency);
    auto commandStreamStart = commandStream.getUsed();

    if (eventBuilder.getEvent() && getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
@@ -241,7 +241,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
        processDispatchForBlitEnqueue(multiDispatchInfo, previousTimestampPacketNodes, eventsRequest, commandStream, commandType);
    } else if (multiDispatchInfo.empty() == false) {
        processDispatchForKernels<commandType>(multiDispatchInfo, printfHandler, eventBuilder.getEvent(),
-                                               hwTimeStamps, parentKernel, blockQueue, devQueueHw, csrDeps, blockedCommandsData,
+                                               hwTimeStamps, parentKernel, blockQueue, devQueueHw, csrDeps, blockedCommandsData.get(),
                                               previousTimestampPacketNodes, preemption);
    } else if (isCacheFlushCommand(commandType)) {
        processDispatchForCacheFlush(surfacesForResidency, numSurfaceForResidency, &commandStream, csrDeps);
@@ -396,7 +396,7 @@ void CommandQueueHw<GfxFamily>::processDispatchForKernels(const MultiDispatchInf
                                                          bool blockQueue,
                                                          DeviceQueueHw<GfxFamily> *devQueueHw,
                                                          CsrDependencies &csrDeps,
-                                                          KernelOperation *&blockedCommandsData,
+                                                          KernelOperation *blockedCommandsData,
                                                          TimestampPacketContainer &previousTimestampPacketNodes,
                                                          PreemptionMode preemption) {
    TagNode<HwPerfCounter> *hwPerfCounter = nullptr;
@@ -437,13 +437,12 @@ void CommandQueueHw<GfxFamily>::processDispatchForKernels(const MultiDispatchInf
        *this,
        multiDispatchInfo,
        csrDeps,
-        &blockedCommandsData,
+        blockedCommandsData,
        hwTimeStamps,
        hwPerfCounter,
        &previousTimestampPacketNodes,
        timestampPacketContainer.get(),
        preemption,
-        blockQueue,
        commandType);

    if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
@@ -738,7 +737,7 @@ void CommandQueueHw<GfxFamily>::enqueueBlocked(
    bool &blocking,
    const MultiDispatchInfo &multiDispatchInfo,
    TimestampPacketContainer *previousTimestampPacketNodes,
-    KernelOperation *blockedCommandsData,
+    std::unique_ptr<KernelOperation> &blockedCommandsData,
    EventsRequest &eventsRequest,
    bool slmUsed,
    EventBuilder &externalEventBuilder,
@@ -795,10 +794,9 @@ void CommandQueueHw<GfxFamily>::enqueueBlocked(
            allSurfaces.push_back(surface->duplicate());
        }
        PreemptionMode preemptionMode = PreemptionHelper::taskPreemptionMode(*device, multiDispatchInfo);
-        auto kernelOperation = std::unique_ptr<KernelOperation>(blockedCommandsData); // marking ownership
        auto cmd = std::make_unique<CommandComputeKernel>(
            *this,
-            std::move(kernelOperation),
+            std::move(blockedCommandsData),
            allSurfaces,
            shouldFlushDC(commandType, printfHandler.get()),
            slmUsed,
--- a/runtime/command_queue/hardware_interface.h
+++ b/runtime/command_queue/hardware_interface.h
@@ -39,14 +39,13 @@ class HardwareInterface {
        CommandQueue &commandQueue,
        const MultiDispatchInfo &multiDispatchInfo,
        const CsrDependencies &csrDependencies,
-        KernelOperation **blockedCommandsData,
+        KernelOperation *blockedCommandsData,
        TagNode<HwTimeStamps> *hwTimeStamps,
        TagNode<HwPerfCounter> *hwPerfCounter,
        TimestampPacketContainer *previousTimestampPacketNodes,
        TimestampPacketContainer *currentTimestampPacketNodes,
        PreemptionMode preemptionMode,
-        bool blockQueue,
-        uint32_t commandType = 0);
+        uint32_t commandType);

    static void getDefaultDshSpace(
        const size_t &offsetInterfaceDescriptorTable,
--- a/runtime/command_queue/hardware_interface_base.inl
+++ b/runtime/command_queue/hardware_interface_base.inl
@@ -26,13 +26,12 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
    CommandQueue &commandQueue,
    const MultiDispatchInfo &multiDispatchInfo,
    const CsrDependencies &csrDependencies,
-    KernelOperation **blockedCommandsData,
+    KernelOperation *blockedCommandsData,
    TagNode<HwTimeStamps> *hwTimeStamps,
    TagNode<HwPerfCounter> *hwPerfCounter,
    TimestampPacketContainer *previousTimestampPacketNodes,
    TimestampPacketContainer *currentTimestampPacketNodes,
    PreemptionMode preemptionMode,
-    bool blockQueue,
    uint32_t commandType) {

    LinearStream *commandStream = nullptr;
@@ -49,19 +48,11 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
    }

    // Allocate command stream and indirect heaps
-    obtainIndirectHeaps(commandQueue, multiDispatchInfo, blockQueue, dsh, ioh, ssh);
-    if (blockQueue) {
-        constexpr static auto additionalAllocationSize = CSRequirements::csOverfetchSize;
-        constexpr static auto allocationSize = MemoryConstants::pageSize64k - additionalAllocationSize;
-        commandStream = new LinearStream();
-        commandQueue.getGpgpuCommandStreamReceiver().ensureCommandBufferAllocation(*commandStream, allocationSize, additionalAllocationSize);
-
-        using UniqueIH = std::unique_ptr<IndirectHeap>;
-        *blockedCommandsData = new KernelOperation(std::unique_ptr<LinearStream>(commandStream), UniqueIH(dsh), UniqueIH(ioh),
-                                                   UniqueIH(ssh), *commandQueue.getGpgpuCommandStreamReceiver().getInternalAllocationStorage());
-        if (parentKernel) {
-            (*blockedCommandsData)->doNotFreeISH = true;
-        }
+    bool blockedQueue = (blockedCommandsData != nullptr);
+    obtainIndirectHeaps(commandQueue, multiDispatchInfo, blockedQueue, dsh, ioh, ssh);
+    if (blockedQueue) {
+        blockedCommandsData->setHeaps(dsh, ioh, ssh);
+        commandStream = blockedCommandsData->commandStream.get();
    } else {
        commandStream = &commandQueue.getCS(0);
    }