From 4266f861ac91dae967483694b027a9b63983720d Mon Sep 17 00:00:00 2001
From: Szymon Morek
Date: Wed, 27 Apr 2022 16:34:20 +0000
Subject: [PATCH] Make implicit flush for cross-device dependency

Related-To: NEO-6418

If there's a cross-device dependency, flush batched submissions
to avoid deadlock.

Signed-off-by: Szymon Morek
---
 opencl/source/helpers/properties_helper.cpp    | 21 ++++++----
 ...and_stream_receiver_flush_task_4_tests.cpp  | 38 +++++++++++++++++++
 .../command_stream_receiver_hw_base.inl        |  1 +
 .../command_stream/submissions_aggregator.cpp  |  2 +-
 .../deferrable_allocation_deletion.cpp         |  5 +--
 5 files changed, 54 insertions(+), 13 deletions(-)

diff --git a/opencl/source/helpers/properties_helper.cpp b/opencl/source/helpers/properties_helper.cpp
index 806857b715..1af6b72f8b 100644
--- a/opencl/source/helpers/properties_helper.cpp
+++ b/opencl/source/helpers/properties_helper.cpp
@@ -17,6 +17,12 @@
 
 namespace NEO {
 
+void flushDependentCsr(CommandStreamReceiver &dependentCsr, CsrDependencies &csrDeps) {
+    auto csrOwnership = dependentCsr.obtainUniqueOwnership();
+    dependentCsr.updateTagFromWait();
+    csrDeps.taskCountContainer.push_back({dependentCsr.peekTaskCount(), reinterpret_cast<uint64_t>(dependentCsr.getTagAddress())});
+}
+
 void EventsRequest::fillCsrDependenciesForTimestampPacketContainer(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr, CsrDependencies::DependenciesType depsType) const {
     for (cl_uint i = 0; i < this->numEventsInWaitList; i++) {
         auto event = castToObjectOrAbort<Event>(this->eventWaitList[i]);
@@ -47,10 +53,7 @@ void EventsRequest::fillCsrDependenciesForTimestampPacketContainer(CsrDependenci
                 const auto &hwInfoConfig = *NEO::HwInfoConfig::get(event->getCommandQueue()->getDevice().getHardwareInfo().platform.eProductFamily);
                 if (hwInfoConfig.isDcFlushAllowed()) {
                     if (!dependentCsr.isLatestTaskCountFlushed()) {
-                        auto csrOwnership = dependentCsr.obtainUniqueOwnership();
-                        dependentCsr.flushBatchedSubmissions();
-                        dependentCsr.updateTagFromWait();
-                        csrDeps.taskCountContainer.push_back({dependentCsr.peekTaskCount(), reinterpret_cast<uint64_t>(dependentCsr.getTagAddress())});
+                        flushDependentCsr(dependentCsr, csrDeps);
                         currentCsr.makeResident(*dependentCsr.getTagAllocation());
                     }
                 }
@@ -67,10 +70,12 @@ void EventsRequest::fillCsrDependenciesForTaskCountContainer(CsrDependencies &cs
         }
 
         if (event->getCommandQueue() && event->getCommandQueue()->getDevice().getRootDeviceIndex() != currentCsr.getRootDeviceIndex()) {
-            auto taskCountPreviousRootDevice = event->peekTaskCount();
-            auto tagAddressPreviousRootDevice = event->getCommandQueue()->getGpgpuCommandStreamReceiver().getTagAddress();
-
-            csrDeps.taskCountContainer.push_back({taskCountPreviousRootDevice, reinterpret_cast<uint64_t>(tagAddressPreviousRootDevice)});
+            auto &dependentCsr = event->getCommandQueue()->getGpgpuCommandStreamReceiver();
+            if (!dependentCsr.isLatestTaskCountFlushed()) {
+                flushDependentCsr(dependentCsr, csrDeps);
+            } else {
+                csrDeps.taskCountContainer.push_back({event->peekTaskCount(), reinterpret_cast<uint64_t>(dependentCsr.getTagAddress())});
+            }
 
             auto graphicsAllocation = event->getCommandQueue()->getGpgpuCommandStreamReceiver().getTagsMultiAllocation()->getGraphicsAllocation(currentCsr.getRootDeviceIndex());
             currentCsr.getResidencyAllocations().push_back(graphicsAllocation);
diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_4_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_4_tests.cpp
index caec18ddc2..7e9a662b39 100644
--- a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_4_tests.cpp
+++ b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_4_tests.cpp
@@ -260,6 +260,7 @@ struct CrossDeviceDependenciesTests : public ::testing::Test {
         defaultHwInfo->capabilityTable.blitterOperationsSupported = true;
         deviceFactory = std::make_unique<UltClDeviceFactory>(3, 0);
         auto device1 = deviceFactory->rootDevices[1];
+
         auto device2 = deviceFactory->rootDevices[2];
 
         cl_device_id devices[] = {device1, device2};
@@ -633,6 +634,43 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW
     pCmdQ2->release();
 }
 
+HWTEST_F(MultiRootDeviceCommandStreamReceiverTests, givenUnflushedQueueAndEventInMultiRootDeviceEnvironmentWhenTheyArePassedToSecondQueueThenFlushSubmissions) {
+    auto deviceFactory = std::make_unique<UltClDeviceFactory>(3, 0);
+    deviceFactory->rootDevices[1]->getUltCommandStreamReceiver<FamilyType>().timestampPacketWriteEnabled = true;
+    deviceFactory->rootDevices[1]->getUltCommandStreamReceiver<FamilyType>().useNewResourceImplicitFlush = false;
+
+    cl_device_id devices[] = {deviceFactory->rootDevices[1], deviceFactory->rootDevices[2]};
+
+    auto context = std::make_unique<MockContext>(ClDeviceVector(devices, 2), false);
+    auto pCmdQ1 = context.get()->getSpecialQueue(1u);
+    auto pCmdQ2 = context.get()->getSpecialQueue(2u);
+
+    pCmdQ1->getGpgpuCommandStreamReceiver().overrideDispatchPolicy(DispatchMode::BatchedDispatch);
+    cl_event outputEvent{};
+    cl_event inputEvent;
+
+    pCmdQ1->enqueueMarkerWithWaitList(
+        0,
+        nullptr,
+        &inputEvent);
+    pCmdQ1->enqueueMarkerWithWaitList(
+        1,
+        &inputEvent,
+        &outputEvent);
+
+    EXPECT_FALSE(pCmdQ1->getGpgpuCommandStreamReceiver().isLatestTaskCountFlushed());
+
+    pCmdQ2->enqueueMarkerWithWaitList(
+        1,
+        &outputEvent,
+        nullptr);
+    EXPECT_TRUE(pCmdQ1->getGpgpuCommandStreamReceiver().isLatestTaskCountFlushed());
+    castToObject<Event>(inputEvent)->release();
+    castToObject<Event>(outputEvent)->release();
+    pCmdQ1->finish();
+    pCmdQ2->finish();
+}
+
 HWTEST_F(CommandStreamReceiverFlushTaskTests, givenStaticPartitioningEnabledWhenFlushingTaskThenWorkPartitionAllocationIsMadeResident) {
     DebugManagerStateRestore restore{};
     DebugManager.flags.EnableStaticPartitioning.set(1);
diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl
index 5f38127c85..943d7e8422 100644
--- a/shared/source/command_stream/command_stream_receiver_hw_base.inl
+++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl
@@ -1261,6 +1261,7 @@ inline bool CommandStreamReceiverHw<GfxFamily>::isUpdateTagFromWaitEnabled() {
 
 template <typename GfxFamily>
 inline void CommandStreamReceiverHw<GfxFamily>::updateTagFromWait() {
+    flushBatchedSubmissions();
     if (isUpdateTagFromWaitEnabled()) {
         flushTagUpdate();
     }
diff --git a/shared/source/command_stream/submissions_aggregator.cpp b/shared/source/command_stream/submissions_aggregator.cpp
index 92afdb6258..95fcc162cd 100644
--- a/shared/source/command_stream/submissions_aggregator.cpp
+++ b/shared/source/command_stream/submissions_aggregator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2021 Intel Corporation
+ * Copyright (C) 2018-2022 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/shared/source/memory_manager/deferrable_allocation_deletion.cpp b/shared/source/memory_manager/deferrable_allocation_deletion.cpp
index 5569f6debe..b9ebd4bcba 100644
--- a/shared/source/memory_manager/deferrable_allocation_deletion.cpp
+++ b/shared/source/memory_manager/deferrable_allocation_deletion.cpp
@@ -26,10 +26,7 @@ bool DeferrableAllocationDeletion::apply() {
                     graphicsAllocation.releaseUsageInOsContext(contextId);
                 } else {
                     isStillUsed = true;
-                    engine.commandStreamReceiver->flushBatchedSubmissions();
-                    if (engine.commandStreamReceiver->peekLatestFlushedTaskCount() < graphicsAllocation.getTaskCount(contextId)) {
-                        engine.commandStreamReceiver->updateTagFromWait();
-                    }
+                    engine.commandStreamReceiver->updateTagFromWait();
                 }
             }
         }