Feature(OCL) Use tag nodes for root device synchronization

With this commit, events created on multi-root-device contexts
synchronize using signaled TagNodes instead of taskCounts.
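
In taskCount-based synchronization, a waiting CSR compares the dependent CSR's task count against a target; with tag nodes, the producer signals a dedicated GPU-visible dword that waiters check directly. As a rough host-side illustration only (hypothetical stand-in types, not NEO code):

#include <atomic>
#include <cstdint>
#include <thread>

// Hypothetical stand-in for a tag node: one GPU-visible dword that the
// producing device writes when its work completes.
struct TagNode {
    static constexpr uint32_t signaledValue = 1u;
    std::atomic<uint32_t> value{0u};

    void signal() { value.store(signaledValue, std::memory_order_release); }
    bool isSignaled() const { return value.load(std::memory_order_acquire) == signaledValue; }
};

int main() {
    TagNode node;
    // Producer: stands in for the dependent root device finishing its work.
    std::thread producer([&node] { node.signal(); });
    // Consumer: waits on the node itself rather than polling the producer's taskCount.
    while (!node.isSignaled()) {
        std::this_thread::yield();
    }
    producer.join();
    return 0;
}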

Signed-off-by: Maciej Plewka <maciej.plewka@intel.com>

Related-To: NEO-7105
Author: Maciej Plewka
Date: 2022-12-07 11:07:43 +00:00
Committed by: Compute-Runtime-Automation
Parent: 6fac234655
Commit: 547d1c37b3
62 changed files with 995 additions and 210 deletions


@@ -20,7 +20,6 @@ namespace NEO {
 void flushDependentCsr(CommandStreamReceiver &dependentCsr, CsrDependencies &csrDeps) {
     auto csrOwnership = dependentCsr.obtainUniqueOwnership();
     dependentCsr.updateTagFromWait();
-    csrDeps.taskCountContainer.push_back({dependentCsr.peekTaskCount(), reinterpret_cast<uint64_t>(dependentCsr.getTagAddress())});
 }
 
 void EventsRequest::fillCsrDependenciesForTimestampPacketContainer(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr, CsrDependencies::DependenciesType depsType) const {
@@ -54,6 +53,7 @@ void EventsRequest::fillCsrDependenciesForTimestampPacketContainer(CsrDependenci
         if (productHelper.isDcFlushAllowed()) {
             if (!dependentCsr.isLatestTaskCountFlushed()) {
                 flushDependentCsr(dependentCsr, csrDeps);
+                csrDeps.taskCountContainer.push_back({dependentCsr.peekTaskCount(), reinterpret_cast<uint64_t>(dependentCsr.getTagAddress())});
                 currentCsr.makeResident(*dependentCsr.getTagAllocation());
             }
         }
@@ -62,23 +62,22 @@ void EventsRequest::fillCsrDependenciesForTimestampPacketContainer(CsrDependenci
     }
 }
 
-void EventsRequest::fillCsrDependenciesForTaskCountContainer(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr) const {
+void EventsRequest::fillCsrDependenciesForRootDevices(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr) const {
     for (cl_uint i = 0; i < this->numEventsInWaitList; i++) {
         auto event = castToObjectOrAbort<Event>(this->eventWaitList[i]);
         if (event->isUserEvent() || CompletionStamp::notReady == event->peekTaskCount()) {
             continue;
         }
 
         if (event->getCommandQueue() && event->getCommandQueue()->getDevice().getRootDeviceIndex() != currentCsr.getRootDeviceIndex()) {
+            auto timestampPacketContainer = event->getMultiRootDeviceTimestampPacketNodes();
+            if (!timestampPacketContainer || timestampPacketContainer->peekNodes().empty()) {
+                continue;
+            }
             auto &dependentCsr = event->getCommandQueue()->getGpgpuCommandStreamReceiver();
             if (!dependentCsr.isLatestTaskCountFlushed()) {
                 flushDependentCsr(dependentCsr, csrDeps);
-            } else {
-                csrDeps.taskCountContainer.push_back({event->peekTaskCount(), reinterpret_cast<uint64_t>(dependentCsr.getTagAddress())});
             }
             auto graphicsAllocation = event->getCommandQueue()->getGpgpuCommandStreamReceiver().getTagsMultiAllocation()->getGraphicsAllocation(currentCsr.getRootDeviceIndex());
             currentCsr.getResidencyAllocations().push_back(graphicsAllocation);
+            csrDeps.multiRootTimeStampSyncContainer.push_back(timestampPacketContainer);
         }
     }
 }
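
In short, the renamed function now records, per cross-root-device event, the event's multi-root timestamp packet container instead of a (taskCount, tagAddress) pair. A compilable toy model of that filtering step, using hypothetical stand-in types rather than NEO's actual classes:

#include <cstdio>
#include <vector>

// Hypothetical stand-ins for NEO's TimestampPacketContainer / CsrDependencies.
struct TimestampPacketContainer {
    std::vector<int> nodes; // stands in for peekNodes()
};

struct CsrDependencies {
    std::vector<const TimestampPacketContainer *> multiRootTimeStampSyncContainer;
};

// Mirrors the loop's filtering: only events carrying non-empty
// multi-root-device tag node containers contribute a dependency.
void fillDependencies(CsrDependencies &deps, const std::vector<const TimestampPacketContainer *> &waitList) {
    for (const auto *container : waitList) {
        if (container == nullptr || container->nodes.empty()) {
            continue; // nothing to synchronize on for this event
        }
        deps.multiRootTimeStampSyncContainer.push_back(container);
    }
}

int main() {
    TimestampPacketContainer signaled{{1}};
    TimestampPacketContainer empty{};
    CsrDependencies deps;
    fillDependencies(deps, {&signaled, &empty, nullptr});
    std::printf("%zu dependency container(s)\n", deps.multiRootTimeStampSyncContainer.size()); // prints 1
    return 0;
}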


@@ -25,7 +25,7 @@ struct EventsRequest {
         : numEventsInWaitList(numEventsInWaitList), eventWaitList(eventWaitList), outEvent(outEvent) {}
 
     void fillCsrDependenciesForTimestampPacketContainer(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr, CsrDependencies::DependenciesType depsType) const;
-    void fillCsrDependenciesForTaskCountContainer(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr) const;
+    void fillCsrDependenciesForRootDevices(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr) const;
     void setupBcsCsrForOutputEvent(CommandStreamReceiver &bcsCsr) const;
 
     cl_uint numEventsInWaitList;


@@ -117,10 +117,11 @@ CompletionStamp &CommandMapUnmap::submit(TaskCountType taskLevel, bool terminate
 CommandComputeKernel::CommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> &kernelOperation, std::vector<Surface *> surfaces,
                                            bool flushDC, bool usesSLM, uint32_t commandType, std::unique_ptr<PrintfHandler> &&printfHandler,
-                                           PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount)
+                                           PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount,
+                                           TagNodeBase *multiRootDeviceSyncNode)
     : Command(commandQueue, kernelOperation), surfaces(std::move(surfaces)), flushDC(flushDC), slmUsed(usesSLM),
       commandType(commandType), printfHandler(std::move(printfHandler)), kernel(kernel),
-      kernelCount(kernelCount), preemptionMode(preemptionMode) {
+      kernelCount(kernelCount), preemptionMode(preemptionMode), multiRootDeviceSyncNode(multiRootDeviceSyncNode) {
     UNRECOVERABLE_IF(nullptr == this->kernel);
     kernel->incRefInternal();
 }
@@ -162,6 +163,9 @@ CompletionStamp &CommandComputeKernel::submit(TaskCountType taskLevel, bool term
         printfHandler->makeResident(commandStreamReceiver);
     }
     makeTimestampPacketsResident(commandStreamReceiver);
+    if (multiRootDeviceSyncNode != nullptr) {
+        commandStreamReceiver.makeResident(*multiRootDeviceSyncNode->getBaseGraphicsAllocation());
+    }
 
     if (kernelOperation->blitPropertiesContainer.size() > 0) {
         CsrDependencies csrDeps;
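
The new residency step follows a simple optional-dependency pattern: the sync node pointer may be null (no cross-device synchronization needed), and its allocation is made resident only when present. A self-contained sketch of that pattern with hypothetical types (the real CommandStreamReceiver::makeResident API differs):

#include <cstdio>

struct TagNodeBase { int baseGraphicsAllocation = 0; }; // hypothetical stand-in

struct Command {
    TagNodeBase *multiRootDeviceSyncNode; // nullptr when no cross-device sync is needed
    explicit Command(TagNodeBase *node) : multiRootDeviceSyncNode(node) {}

    void submit() {
        if (multiRootDeviceSyncNode != nullptr) {
            // stands in for commandStreamReceiver.makeResident(*node->getBaseGraphicsAllocation())
            std::printf("sync node allocation made resident\n");
        }
    }
};

int main() {
    TagNodeBase node;
    Command{&node}.submit();   // prints: sync node allocation made resident
    Command{nullptr}.submit(); // no-op
    return 0;
}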
@@ -213,7 +217,7 @@ CompletionStamp &CommandComputeKernel::submit(TaskCountType taskLevel, bool term
                                           false); // hasRelaxedOrderingDependencies
 
     if (commandQueue.getContext().getRootDeviceIndices().size() > 1) {
-        eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver);
+        eventsRequest.fillCsrDependenciesForRootDevices(dispatchFlags.csrDependencies, commandStreamReceiver);
     }
 
     const bool isHandlingBarrier = commandQueue.getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired();
@@ -306,7 +310,7 @@ TaskCountType CommandWithoutKernel::dispatchBlitOperation() {
     blitProperties.outputTimestampPacket = currentTimestampPacketNodes->peekNodes()[0];
 
     if (commandQueue.getContext().getRootDeviceIndices().size() > 1) {
-        eventsRequest.fillCsrDependenciesForTaskCountContainer(blitProperties.csrDependencies, *bcsCsr);
+        eventsRequest.fillCsrDependenciesForRootDevices(blitProperties.csrDependencies, *bcsCsr);
     }
 
     const auto newTaskCount = bcsCsr->flushBcsTask(kernelOperation->blitPropertiesContainer, false, commandQueue.isProfilingEnabled(), commandQueue.getDevice());
@@ -388,7 +392,7 @@ CompletionStamp &CommandWithoutKernel::submit(TaskCountType taskLevel, bool term
                                           false); // hasRelaxedOrderingDependencies
 
     if (commandQueue.getContext().getRootDeviceIndices().size() > 1) {
-        eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver);
+        eventsRequest.fillCsrDependenciesForRootDevices(dispatchFlags.csrDependencies, commandStreamReceiver);
     }
 
     const bool isHandlingBarrier = commandQueue.getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired();


@@ -131,7 +131,7 @@ class CommandComputeKernel : public Command {
   public:
     CommandComputeKernel(CommandQueue &commandQueue, std::unique_ptr<KernelOperation> &kernelOperation, std::vector<Surface *> surfaces,
                          bool flushDC, bool usesSLM, uint32_t commandType, std::unique_ptr<PrintfHandler> &&printfHandler,
-                         PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount);
+                         PreemptionMode preemptionMode, Kernel *kernel, uint32_t kernelCount, TagNodeBase *multiRootDeviceSyncNode);
 
     ~CommandComputeKernel() override;
@@ -150,6 +150,7 @@ class CommandComputeKernel : public Command {
     Kernel *kernel;
     uint32_t kernelCount;
     PreemptionMode preemptionMode;
+    TagNodeBase *multiRootDeviceSyncNode;
 };
 
 class CommandWithoutKernel : public Command {