Add multi tile barrier to marker command on multi tile device

Related-To: NEO-6262 Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
2025-12-31 20:13:04 +08:00 · 2021-12-10 21:31:34 +00:00
parent d8ea439dab
commit 01348451db
12 changed files with 414 additions and 313 deletions
--- a/opencl/source/command_queue/enqueue_common.h
+++ b/opencl/source/command_queue/enqueue_common.h
@@ -168,8 +168,8 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
    DeviceQueueHw<GfxFamily> *devQueueHw = castToObject<DeviceQueueHw<GfxFamily>>(devQueue);

    TagNodeBase *hwTimeStamps = nullptr;
-
-    auto commandStreamReceiverOwnership = getGpgpuCommandStreamReceiver().obtainUniqueOwnership();
+    CommandStreamReceiver &computeCommandStreamReceiver = getGpgpuCommandStreamReceiver();
+    auto commandStreamReceiverOwnership = computeCommandStreamReceiver.obtainUniqueOwnership();

    EventBuilder eventBuilder;
    setupEvent(eventBuilder, event, commandType);
@@ -206,17 +206,17 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
    BlitPropertiesContainer blitPropertiesContainer;

    if (this->context->getRootDeviceIndices().size() > 1) {
-        eventsRequest.fillCsrDependenciesForTaskCountContainer(csrDeps, getGpgpuCommandStreamReceiver());
+        eventsRequest.fillCsrDependenciesForTaskCountContainer(csrDeps, computeCommandStreamReceiver);
    }

    bool enqueueWithBlitAuxTranslation = isBlitAuxTranslationRequired(multiDispatchInfo);

-    if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
+    if (computeCommandStreamReceiver.peekTimestampPacketWriteEnabled()) {
        if (!clearDependenciesForSubCapture) {
-            eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr);
+            eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, computeCommandStreamReceiver, CsrDependencies::DependenciesType::OnCsr);
        }

-        auto allocator = getGpgpuCommandStreamReceiver().getTimestampPacketAllocator();
+        auto allocator = computeCommandStreamReceiver.getTimestampPacketAllocator();

        size_t nodesCount = 0u;
        if (isCacheFlushCommand(commandType) || isMarkerWithProfiling) {
@@ -231,7 +231,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
        }

        if (nodesCount > 0) {
-            obtainNewTimestampPacketNodes(nodesCount, timestampPacketDependencies.previousEnqueueNodes, clearAllDependencies, getGpgpuCommandStreamReceiver());
+            obtainNewTimestampPacketNodes(nodesCount, timestampPacketDependencies.previousEnqueueNodes, clearAllDependencies, computeCommandStreamReceiver);
            csrDeps.timestampPacketContainer.push_back(&timestampPacketDependencies.previousEnqueueNodes);
        }
    }
@@ -250,10 +250,10 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
    }

    if (!blockQueue && isOOQEnabled()) {
-        setupBarrierTimestampForBcsEngines(getGpgpuCommandStreamReceiver().getOsContext().getEngineType(), timestampPacketDependencies);
+        setupBarrierTimestampForBcsEngines(computeCommandStreamReceiver.getOsContext().getEngineType(), timestampPacketDependencies);
    }

-    if (eventBuilder.getEvent() && getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
+    if (eventBuilder.getEvent() && computeCommandStreamReceiver.peekTimestampPacketWriteEnabled()) {
        eventBuilder.getEvent()->addTimestampPacketNodes(*timestampPacketContainer);
        eventBuilder.getEvent()->addTimestampPacketNodes(timestampPacketDependencies.nonAuxToAuxNodes);
        eventBuilder.getEvent()->addTimestampPacketNodes(timestampPacketDependencies.auxToNonAuxNodes);
@@ -267,9 +267,9 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
                                               timestampPacketDependencies);
    } else if (isCacheFlushCommand(commandType)) {
        processDispatchForCacheFlush(surfacesForResidency, numSurfaceForResidency, &commandStream, csrDeps);
-    } else if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
+    } else if (computeCommandStreamReceiver.peekTimestampPacketWriteEnabled()) {
        if (CL_COMMAND_BARRIER == commandType) {
-            getGpgpuCommandStreamReceiver().requestStallingCommandsOnNextFlush();
+            computeCommandStreamReceiver.requestStallingCommandsOnNextFlush();
        }

        for (size_t i = 0; i < eventsRequest.numEventsInWaitList; i++) {
@@ -288,8 +288,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,

        if (isMarkerWithProfiling) {
            if (numEventsInWaitList == 0) {
-                PipeControlArgs args(false);
-                MemorySynchronizationCommands<GfxFamily>::addPipeControl(commandStream, args);
+                computeCommandStreamReceiver.programComputeBarrierCommand(commandStream);
            }
            processDispatchForMarkerWithTimestampPacket(*this, &commandStream, eventsRequest, csrDeps);
        }
@@ -305,7 +304,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,

    if (!blockQueue && multiDispatchInfo.peekMainKernel() && multiDispatchInfo.peekMainKernel()->requiresMemoryMigration()) {
        for (auto &arg : multiDispatchInfo.peekMainKernel()->getMemObjectsToMigrate()) {
-            MigrationController::handleMigration(*this->context, getGpgpuCommandStreamReceiver(), arg.second);
+            MigrationController::handleMigration(*this->context, computeCommandStreamReceiver, arg.second);
            migratedMemory = true;
        }
    }
@@ -315,7 +314,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
        }

        if (enqueueProperties.operation == EnqueueProperties::Operation::GpuKernel) {
-            csrDeps.makeResident(getGpgpuCommandStreamReceiver());
+            csrDeps.makeResident(computeCommandStreamReceiver);

            completionStamp = enqueueNonBlocked<commandType>(
                surfacesForResidency,
@@ -334,7 +333,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
                getBcsForAuxTranslation());

            if (parentKernel) {
-                getGpgpuCommandStreamReceiver().setMediaVFEStateDirty(true);
+                computeCommandStreamReceiver.setMediaVFEStateDirty(true);

                if (devQueueHw->getSchedulerReturnInstance() > 0) {
                    waitUntilComplete(completionStamp.taskCount, {}, completionStamp.flushStamp, false);
@@ -427,7 +426,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
        }
    }
    if (migratedMemory) {
-        getGpgpuCommandStreamReceiver().flushBatchedSubmissions();
+        computeCommandStreamReceiver.flushBatchedSubmissions();
    }
 }

--- a/opencl/source/command_queue/gpgpu_walker_base.inl
+++ b/opencl/source/command_queue/gpgpu_walker_base.inl
@@ -208,7 +208,7 @@ size_t EnqueueOperation<GfxFamily>::getTotalSizeRequiredCS(uint32_t eventType, c
        expectedSizeCS += EnqueueOperation<GfxFamily>::getSizeRequiredForTimestampPacketWrite();
        if (isMarkerWithProfiling) {
            if (!eventsInWaitlist) {
-                expectedSizeCS += MemorySynchronizationCommands<GfxFamily>::getSizeForSinglePipeControl();
+                expectedSizeCS += commandQueue.getGpgpuCommandStreamReceiver().getCmdsSizeForComputeBarrierCommand();
            }
            expectedSizeCS += 4 * EncodeStoreMMIO<GfxFamily>::size;
        }