Add method to submit kernel on single subdevice

Signed-off-by: Jobczyk, Lukasz <lukasz.jobczyk@intel.com>
2025-12-21 09:14:47 +08:00 · 2020-11-18 13:56:18 +00:00
parent 00c92c8c14
commit 343fd602fa
23 changed files with 191 additions and 152 deletions
--- a/shared/source/command_stream/command_stream_receiver_hw.h
+++ b/shared/source/command_stream/command_stream_receiver_hw.h
@@ -146,6 +146,8 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {

    CsrSizeRequestFlags csrSizeRequestFlags = {};

+    bool wasSubmittedToSingleSubdevice = false;
+
    std::unique_ptr<DirectSubmissionHw<GfxFamily, RenderDispatcher<GfxFamily>>> directSubmission;
    std::unique_ptr<DirectSubmissionHw<GfxFamily, BlitterDispatcher<GfxFamily>>> blitterDirectSubmission;
 };
--- a/shared/source/command_stream/command_stream_receiver_hw_base.inl
+++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl
@@ -197,7 +197,11 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
    void *currentPipeControlForNooping = nullptr;
    void *epiloguePipeControlLocation = nullptr;

-    if (DebugManager.flags.ForceCsrFlushing.get()) {
+    bool csrFlush = this->wasSubmittedToSingleSubdevice != dispatchFlags.useSingleSubdevice;
+
+    csrFlush |= DebugManager.flags.ForceCsrFlushing.get();
+
+    if (csrFlush) {
        flushBatchedSubmissions();
    }

@@ -544,7 +548,7 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
    auto &streamToSubmit = submitCommandStreamFromCsr ? commandStreamCSR : commandStreamTask;
    BatchBuffer batchBuffer{streamToSubmit.getGraphicsAllocation(), startOffset, chainedBatchBufferStartOffset, chainedBatchBuffer,
                            dispatchFlags.requiresCoherency, dispatchFlags.lowPriority, dispatchFlags.throttle, dispatchFlags.sliceCount,
-                            streamToSubmit.getUsed(), &streamToSubmit, bbEndLocation};
+                            streamToSubmit.getUsed(), &streamToSubmit, bbEndLocation, dispatchFlags.useSingleSubdevice};

    if (submitCSR | submitTask) {
        if (this->dispatchMode == DispatchMode::ImmediateDispatch) {
@@ -566,6 +570,8 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
        this->makeSurfacePackNonResident(this->getResidencyAllocations());
    }

+    this->wasSubmittedToSingleSubdevice = dispatchFlags.useSingleSubdevice;
+
    //check if we are not over the budget, if we are do implicit flush
    if (getMemoryManager()->isMemoryBudgetExhausted()) {
        if (this->totalMemoryUsed >= device.getDeviceInfo().globalMemSize / 4) {
@@ -1027,7 +1033,7 @@ uint32_t CommandStreamReceiverHw<GfxFamily>::blitBuffer(const BlitPropertiesCont
    }

    BatchBuffer batchBuffer{commandStream.getGraphicsAllocation(), commandStreamStart, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount,
-                            commandStream.getUsed(), &commandStream, endingCmdPtr};
+                            commandStream.getUsed(), &commandStream, endingCmdPtr, false};

    flush(batchBuffer, getResidencyAllocations());
    makeSurfacePackNonResident(getResidencyAllocations());
--- a/shared/source/command_stream/csr_definitions.h
+++ b/shared/source/command_stream/csr_definitions.h
@@ -55,29 +55,31 @@ struct DispatchFlags {
                  uint32_t l3CacheSettings, uint32_t threadArbitrationPolicy, uint32_t additionalKernelExecInfo, KernelExecutionType kernelExecutionType, uint64_t sliceCount, bool blocking, bool dcFlush,
                  bool useSLM, bool guardCommandBufferWithPipeControl, bool gsba32BitRequired,
                  bool requiresCoherency, bool lowPriority, bool implicitFlush,
-                  bool outOfOrderExecutionAllowed, bool epilogueRequired, bool usePerDSSbackedBuffer) : csrDependencies(csrDependencies),
-                                                                                                        barrierTimestampPacketNodes(barrierTimestampPacketNodes),
-                                                                                                        pipelineSelectArgs(pipelineSelectArgs),
-                                                                                                        flushStampReference(flushStampReference),
-                                                                                                        throttle(throttle),
-                                                                                                        preemptionMode(preemptionMode),
-                                                                                                        numGrfRequired(numGrfRequired),
-                                                                                                        l3CacheSettings(l3CacheSettings),
-                                                                                                        threadArbitrationPolicy(threadArbitrationPolicy),
-                                                                                                        additionalKernelExecInfo(additionalKernelExecInfo),
-                                                                                                        kernelExecutionType(kernelExecutionType),
-                                                                                                        sliceCount(sliceCount),
-                                                                                                        blocking(blocking),
-                                                                                                        dcFlush(dcFlush),
-                                                                                                        useSLM(useSLM),
-                                                                                                        guardCommandBufferWithPipeControl(guardCommandBufferWithPipeControl),
-                                                                                                        gsba32BitRequired(gsba32BitRequired),
-                                                                                                        requiresCoherency(requiresCoherency),
-                                                                                                        lowPriority(lowPriority),
-                                                                                                        implicitFlush(implicitFlush),
-                                                                                                        outOfOrderExecutionAllowed(outOfOrderExecutionAllowed),
-                                                                                                        epilogueRequired(epilogueRequired),
-                                                                                                        usePerDssBackedBuffer(usePerDSSbackedBuffer){};
+                  bool outOfOrderExecutionAllowed, bool epilogueRequired, bool usePerDSSbackedBuffer, bool useSingleSubdevice) : csrDependencies(csrDependencies),
+                                                                                                                                 barrierTimestampPacketNodes(barrierTimestampPacketNodes),
+                                                                                                                                 pipelineSelectArgs(pipelineSelectArgs),
+                                                                                                                                 flushStampReference(flushStampReference),
+                                                                                                                                 throttle(throttle),
+                                                                                                                                 preemptionMode(preemptionMode),
+                                                                                                                                 numGrfRequired(numGrfRequired),
+                                                                                                                                 l3CacheSettings(l3CacheSettings),
+                                                                                                                                 threadArbitrationPolicy(threadArbitrationPolicy),
+                                                                                                                                 additionalKernelExecInfo(additionalKernelExecInfo),
+                                                                                                                                 kernelExecutionType(kernelExecutionType),
+                                                                                                                                 sliceCount(sliceCount),
+                                                                                                                                 blocking(blocking),
+                                                                                                                                 dcFlush(dcFlush),
+                                                                                                                                 useSLM(useSLM),
+                                                                                                                                 guardCommandBufferWithPipeControl(guardCommandBufferWithPipeControl),
+                                                                                                                                 gsba32BitRequired(gsba32BitRequired),
+                                                                                                                                 requiresCoherency(requiresCoherency),
+                                                                                                                                 lowPriority(lowPriority),
+                                                                                                                                 implicitFlush(implicitFlush),
+                                                                                                                                 outOfOrderExecutionAllowed(outOfOrderExecutionAllowed),
+                                                                                                                                 epilogueRequired(epilogueRequired),
+                                                                                                                                 usePerDssBackedBuffer(usePerDSSbackedBuffer),
+                                                                                                                                 useSingleSubdevice(useSingleSubdevice){};
+
    CsrDependencies csrDependencies;
    TimestampPacketContainer *barrierTimestampPacketNodes = nullptr;
    PipelineSelectArgs pipelineSelectArgs;
@@ -102,6 +104,7 @@ struct DispatchFlags {
    bool outOfOrderExecutionAllowed = false;
    bool epilogueRequired = false;
    bool usePerDssBackedBuffer = false;
+    bool useSingleSubdevice = false;
 };

 struct CsrSizeRequestFlags {
--- a/shared/source/command_stream/submissions_aggregator.cpp
+++ b/shared/source/command_stream/submissions_aggregator.cpp
@@ -102,12 +102,12 @@ NEO::BatchBuffer::BatchBuffer(GraphicsAllocation *commandBufferAllocation, size_
                              size_t chainedBatchBufferStartOffset, GraphicsAllocation *chainedBatchBuffer,
                              bool requiresCoherency, bool lowPriority,
                              QueueThrottle throttle, uint64_t sliceCount,
-                              size_t usedSize, LinearStream *stream, void *endCmdPtr)
+                              size_t usedSize, LinearStream *stream, void *endCmdPtr, bool useSingleSubdevice)
    : commandBufferAllocation(commandBufferAllocation), startOffset(startOffset),
      chainedBatchBufferStartOffset(chainedBatchBufferStartOffset), chainedBatchBuffer(chainedBatchBuffer),
      requiresCoherency(requiresCoherency), low_priority(lowPriority),
      throttle(throttle), sliceCount(sliceCount),
-      usedSize(usedSize), stream(stream), endCmdPtr(endCmdPtr) {}
+      usedSize(usedSize), stream(stream), endCmdPtr(endCmdPtr), useSingleSubdevice(useSingleSubdevice) {}

 NEO::CommandBuffer::CommandBuffer(Device &device) : device(device) {
    flushStamp.reset(new FlushStampTracker(false));
--- a/shared/source/command_stream/submissions_aggregator.h
+++ b/shared/source/command_stream/submissions_aggregator.h
@@ -30,7 +30,8 @@ struct BatchBuffer {
                uint64_t sliceCount,
                size_t usedSize,
                LinearStream *stream,
-                void *endCmdPtr);
+                void *endCmdPtr,
+                bool useSingleSubdevice);
    BatchBuffer() {}
    GraphicsAllocation *commandBufferAllocation = nullptr;
    size_t startOffset = 0u;
@@ -45,6 +46,8 @@ struct BatchBuffer {
    //only used in drm csr in gem close worker active mode
    LinearStream *stream = nullptr;
    void *endCmdPtr = nullptr;
+
+    bool useSingleSubdevice = false;
 };

 struct CommandBuffer : public IDNode<CommandBuffer> {