From 4266f861ac91dae967483694b027a9b63983720d Mon Sep 17 00:00:00 2001
From: Szymon Morek
Date: Wed, 27 Apr 2022 16:34:20 +0000
Subject: [PATCH] Make implicit flush for cross-device dependency

Related-To: NEO-6418

If there's a cross-device dependency, flush batched submissions
to avoid deadlock.

Signed-off-by: Szymon Morek
---
 opencl/source/helpers/properties_helper.cpp    | 21 ++++++----
 ...and_stream_receiver_flush_task_4_tests.cpp  | 38 +++++++++++++++++++
 .../command_stream_receiver_hw_base.inl        |  1 +
 .../command_stream/submissions_aggregator.cpp  |  2 +-
 .../deferrable_allocation_deletion.cpp         |  5 +--
 5 files changed, 54 insertions(+), 13 deletions(-)

diff --git a/opencl/source/helpers/properties_helper.cpp b/opencl/source/helpers/properties_helper.cpp
index 806857b715..1af6b72f8b 100644
--- a/opencl/source/helpers/properties_helper.cpp
+++ b/opencl/source/helpers/properties_helper.cpp
@@ -17,6 +17,12 @@
 
 namespace NEO {
 
+void flushDependentCsr(CommandStreamReceiver &dependentCsr, CsrDependencies &csrDeps) {
+    auto csrOwnership = dependentCsr.obtainUniqueOwnership();
+    dependentCsr.updateTagFromWait();
+    csrDeps.taskCountContainer.push_back({dependentCsr.peekTaskCount(), reinterpret_cast<uint64_t>(dependentCsr.getTagAddress())});
+}
+
 void EventsRequest::fillCsrDependenciesForTimestampPacketContainer(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr, CsrDependencies::DependenciesType depsType) const {
     for (cl_uint i = 0; i < this->numEventsInWaitList; i++) {
         auto event = castToObjectOrAbort<Event>(this->eventWaitList[i]);
@@ -47,10 +53,7 @@ void EventsRequest::fillCsrDependenciesForTimestampPacketContainer(CsrDependenci
                 const auto &hwInfoConfig = *NEO::HwInfoConfig::get(event->getCommandQueue()->getDevice().getHardwareInfo().platform.eProductFamily);
                 if (hwInfoConfig.isDcFlushAllowed()) {
                     if (!dependentCsr.isLatestTaskCountFlushed()) {
-                        auto csrOwnership = dependentCsr.obtainUniqueOwnership();
-                        dependentCsr.flushBatchedSubmissions();
-                        dependentCsr.updateTagFromWait();
-                        csrDeps.taskCountContainer.push_back({dependentCsr.peekTaskCount(), reinterpret_cast<uint64_t>(dependentCsr.getTagAddress())});
+                        flushDependentCsr(dependentCsr, csrDeps);
                         currentCsr.makeResident(*dependentCsr.getTagAllocation());
                     }
                 }
@@ -67,10 +70,12 @@ void EventsRequest::fillCsrDependenciesForTaskCountContainer(CsrDependencies &cs
         }
 
         if (event->getCommandQueue() && event->getCommandQueue()->getDevice().getRootDeviceIndex() != currentCsr.getRootDeviceIndex()) {
-            auto taskCountPreviousRootDevice = event->peekTaskCount();
-            auto tagAddressPreviousRootDevice = event->getCommandQueue()->getGpgpuCommandStreamReceiver().getTagAddress();
-
-            csrDeps.taskCountContainer.push_back({taskCountPreviousRootDevice, reinterpret_cast<uint64_t>(tagAddressPreviousRootDevice)});
+            auto &dependentCsr = event->getCommandQueue()->getGpgpuCommandStreamReceiver();
+            if (!dependentCsr.isLatestTaskCountFlushed()) {
+                flushDependentCsr(dependentCsr, csrDeps);
+            } else {
+                csrDeps.taskCountContainer.push_back({event->peekTaskCount(), reinterpret_cast<uint64_t>(dependentCsr.getTagAddress())});
+            }
 
             auto graphicsAllocation = event->getCommandQueue()->getGpgpuCommandStreamReceiver().getTagsMultiAllocation()->getGraphicsAllocation(currentCsr.getRootDeviceIndex());
             currentCsr.getResidencyAllocations().push_back(graphicsAllocation);
diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_4_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_4_tests.cpp
index caec18ddc2..7e9a662b39 100644
--- a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_4_tests.cpp
+++ b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_4_tests.cpp
@@ -260,6 +260,7 @@ struct CrossDeviceDependenciesTests : public ::testing::Test {
         defaultHwInfo->capabilityTable.blitterOperationsSupported = true;
         deviceFactory = std::make_unique<UltClDeviceFactory>(3, 0);
         auto device1 = deviceFactory->rootDevices[1];
+
         auto device2 = deviceFactory->rootDevices[2];
 
         cl_device_id devices[] = {device1, device2};
@@ -633,6 +634,43 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW
     pCmdQ2->release();
 }
 
+HWTEST_F(MultiRootDeviceCommandStreamReceiverTests, givenUnflushedQueueAndEventInMultiRootDeviceEnvironmentWhenTheyArePassedToSecondQueueThenFlushSubmissions) {
+    auto deviceFactory = std::make_unique<UltClDeviceFactory>(3, 0);
+    deviceFactory->rootDevices[1]->getUltCommandStreamReceiver<FamilyType>().timestampPacketWriteEnabled = true;
+    deviceFactory->rootDevices[1]->getUltCommandStreamReceiver<FamilyType>().useNewResourceImplicitFlush = false;
+
+    cl_device_id devices[] = {deviceFactory->rootDevices[1], deviceFactory->rootDevices[2]};
+
+    auto context = std::make_unique<MockContext>(ClDeviceVector(devices, 2), false);
+    auto pCmdQ1 = context.get()->getSpecialQueue(1u);
+    auto pCmdQ2 = context.get()->getSpecialQueue(2u);
+
+    pCmdQ1->getGpgpuCommandStreamReceiver().overrideDispatchPolicy(DispatchMode::BatchedDispatch);
+    cl_event outputEvent{};
+    cl_event inputEvent;
+
+    pCmdQ1->enqueueMarkerWithWaitList(
+        0,
+        nullptr,
+        &inputEvent);
+    pCmdQ1->enqueueMarkerWithWaitList(
+        1,
+        &inputEvent,
+        &outputEvent);
+
+    EXPECT_FALSE(pCmdQ1->getGpgpuCommandStreamReceiver().isLatestTaskCountFlushed());
+
+    pCmdQ2->enqueueMarkerWithWaitList(
+        1,
+        &outputEvent,
+        nullptr);
+    EXPECT_TRUE(pCmdQ1->getGpgpuCommandStreamReceiver().isLatestTaskCountFlushed());
+    castToObject<Event>(inputEvent)->release();
+    castToObject<Event>(outputEvent)->release();
+    pCmdQ1->finish();
+    pCmdQ2->finish();
+}
+
 HWTEST_F(CommandStreamReceiverFlushTaskTests, givenStaticPartitioningEnabledWhenFlushingTaskThenWorkPartitionAllocationIsMadeResident) {
     DebugManagerStateRestore restore{};
     DebugManager.flags.EnableStaticPartitioning.set(1);
diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl
index 5f38127c85..943d7e8422 100644
--- a/shared/source/command_stream/command_stream_receiver_hw_base.inl
+++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl
@@ -1261,6 +1261,7 @@ inline bool CommandStreamReceiverHw<GfxFamily>::isUpdateTagFromWaitEnabled() {
 
 template <typename GfxFamily>
 inline void CommandStreamReceiverHw<GfxFamily>::updateTagFromWait() {
+    flushBatchedSubmissions();
     if (isUpdateTagFromWaitEnabled()) {
         flushTagUpdate();
     }
diff --git a/shared/source/command_stream/submissions_aggregator.cpp b/shared/source/command_stream/submissions_aggregator.cpp
index 92afdb6258..95fcc162cd 100644
--- a/shared/source/command_stream/submissions_aggregator.cpp
+++ b/shared/source/command_stream/submissions_aggregator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2021 Intel Corporation
+ * Copyright (C) 2018-2022 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/shared/source/memory_manager/deferrable_allocation_deletion.cpp b/shared/source/memory_manager/deferrable_allocation_deletion.cpp
index 5569f6debe..b9ebd4bcba 100644
--- a/shared/source/memory_manager/deferrable_allocation_deletion.cpp
+++ b/shared/source/memory_manager/deferrable_allocation_deletion.cpp
@@ -26,10 +26,7 @@ bool DeferrableAllocationDeletion::apply() {
                     graphicsAllocation.releaseUsageInOsContext(contextId);
                 } else {
                     isStillUsed = true;
-                    engine.commandStreamReceiver->flushBatchedSubmissions();
-                    if (engine.commandStreamReceiver->peekLatestFlushedTaskCount() < graphicsAllocation.getTaskCount(contextId)) {
-                        engine.commandStreamReceiver->updateTagFromWait();
-                    }
+                    engine.commandStreamReceiver->updateTagFromWait();
                 }
             }
         }