Add implicit flush for cross-device dependencies

Related-To: NEO-6418

If there is a cross-device dependency, flush batched
submissions on the dependent CSR to avoid a deadlock.

Signed-off-by: Szymon Morek <szymon.morek@intel.com>
Authored by Szymon Morek on 2022-04-27 16:34:20 +00:00, committed by Compute-Runtime-Automation
parent 9d31d36491
commit 4266f861ac
5 changed files with 54 additions and 13 deletions
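Why the flush matters, as a minimal standalone model (all names below, such as ToyCsr and recordCrossDeviceDependency, are hypothetical illustrations, not the NEO API): under batched dispatch, work accepted by a command stream receiver can sit in a batch without ever reaching the device, so a second device that records a wait on the first device's tag would poll a value that can never arrive.

// Minimal sketch of the hazard this commit addresses; hypothetical names,
// not the NEO API.
#include <cstdint>
#include <iostream>
#include <vector>

struct ToyCsr {
    uint32_t taskCount = 0;          // tasks accepted into the current batch
    uint32_t latestFlushedTask = 0;  // tasks actually submitted to the device

    void submitBatched() { ++taskCount; }  // batched dispatch: nothing reaches the device yet
    bool isLatestTaskCountFlushed() const { return latestFlushedTask == taskCount; }
    void flushBatchedSubmissions() { latestFlushedTask = taskCount; }
};

// Before another device waits on `dependent`'s tag, make sure its batched
// work is actually submitted; otherwise the tag can never reach the waited
// value and the waiter spins forever -- the deadlock the commit message names.
void recordCrossDeviceDependency(ToyCsr &dependent, std::vector<uint32_t> &waitList) {
    if (!dependent.isLatestTaskCountFlushed()) {
        dependent.flushBatchedSubmissions();  // the implicit flush
    }
    waitList.push_back(dependent.taskCount);  // now safe to wait on
}

int main() {
    ToyCsr csrDevice0;
    csrDevice0.submitBatched();  // work parked in a batch on device 0

    std::vector<uint32_t> device1WaitList;
    recordCrossDeviceDependency(csrDevice0, device1WaitList);

    std::cout << "device 0 flushed: " << std::boolalpha
              << csrDevice0.isLatestTaskCountFlushed() << '\n';  // prints: true
    return 0;
}

The diffs below apply the same idea in the driver itself: a shared flushDependentCsr() helper covers the event-dependency paths, and updateTagFromWait() now calls flushBatchedSubmissions() unconditionally, which also lets DeferrableAllocationDeletion drop its hand-rolled flush-then-check sequence.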

View File

@@ -17,6 +17,12 @@
 namespace NEO {
+void flushDependentCsr(CommandStreamReceiver &dependentCsr, CsrDependencies &csrDeps) {
+    auto csrOwnership = dependentCsr.obtainUniqueOwnership();
+    dependentCsr.updateTagFromWait();
+    csrDeps.taskCountContainer.push_back({dependentCsr.peekTaskCount(), reinterpret_cast<uint64_t>(dependentCsr.getTagAddress())});
+}
+
 void EventsRequest::fillCsrDependenciesForTimestampPacketContainer(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr, CsrDependencies::DependenciesType depsType) const {
     for (cl_uint i = 0; i < this->numEventsInWaitList; i++) {
         auto event = castToObjectOrAbort<Event>(this->eventWaitList[i]);
@@ -47,10 +53,7 @@ void EventsRequest::fillCsrDependenciesForTimestampPacketContainer(CsrDependenci
             const auto &hwInfoConfig = *NEO::HwInfoConfig::get(event->getCommandQueue()->getDevice().getHardwareInfo().platform.eProductFamily);
             if (hwInfoConfig.isDcFlushAllowed()) {
                 if (!dependentCsr.isLatestTaskCountFlushed()) {
-                    auto csrOwnership = dependentCsr.obtainUniqueOwnership();
-                    dependentCsr.flushBatchedSubmissions();
-                    dependentCsr.updateTagFromWait();
-                    csrDeps.taskCountContainer.push_back({dependentCsr.peekTaskCount(), reinterpret_cast<uint64_t>(dependentCsr.getTagAddress())});
+                    flushDependentCsr(dependentCsr, csrDeps);
                     currentCsr.makeResident(*dependentCsr.getTagAllocation());
                 }
             }
@@ -67,10 +70,12 @@ void EventsRequest::fillCsrDependenciesForTaskCountContainer(CsrDependencies &cs
         }
         if (event->getCommandQueue() && event->getCommandQueue()->getDevice().getRootDeviceIndex() != currentCsr.getRootDeviceIndex()) {
-            auto taskCountPreviousRootDevice = event->peekTaskCount();
-            auto tagAddressPreviousRootDevice = event->getCommandQueue()->getGpgpuCommandStreamReceiver().getTagAddress();
-
-            csrDeps.taskCountContainer.push_back({taskCountPreviousRootDevice, reinterpret_cast<uint64_t>(tagAddressPreviousRootDevice)});
+            auto &dependentCsr = event->getCommandQueue()->getGpgpuCommandStreamReceiver();
+            if (!dependentCsr.isLatestTaskCountFlushed()) {
+                flushDependentCsr(dependentCsr, csrDeps);
+            } else {
+                csrDeps.taskCountContainer.push_back({event->peekTaskCount(), reinterpret_cast<uint64_t>(dependentCsr.getTagAddress())});
+            }

             auto graphicsAllocation = event->getCommandQueue()->getGpgpuCommandStreamReceiver().getTagsMultiAllocation()->getGraphicsAllocation(currentCsr.getRootDeviceIndex());
             currentCsr.getResidencyAllocations().push_back(graphicsAllocation);

View File

@@ -260,6 +260,7 @@ struct CrossDeviceDependenciesTests : public ::testing::Test {
         defaultHwInfo->capabilityTable.blitterOperationsSupported = true;

         deviceFactory = std::make_unique<UltClDeviceFactory>(3, 0);
         auto device1 = deviceFactory->rootDevices[1];
         auto device2 = deviceFactory->rootDevices[2];
         cl_device_id devices[] = {device1, device2};
@@ -633,6 +634,43 @@ HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventW
     pCmdQ2->release();
 }

+HWTEST_F(MultiRootDeviceCommandStreamReceiverTests, givenUnflushedQueueAndEventInMultiRootDeviceEnvironmentWhenTheyArePassedToSecondQueueThenFlushSubmissions) {
+    auto deviceFactory = std::make_unique<UltClDeviceFactory>(3, 0);
+    deviceFactory->rootDevices[1]->getUltCommandStreamReceiver<FamilyType>().timestampPacketWriteEnabled = true;
+    deviceFactory->rootDevices[1]->getUltCommandStreamReceiver<FamilyType>().useNewResourceImplicitFlush = false;
+    cl_device_id devices[] = {deviceFactory->rootDevices[1], deviceFactory->rootDevices[2]};
+
+    auto context = std::make_unique<MockContext>(ClDeviceVector(devices, 2), false);
+    auto pCmdQ1 = context.get()->getSpecialQueue(1u);
+    auto pCmdQ2 = context.get()->getSpecialQueue(2u);
+
+    pCmdQ1->getGpgpuCommandStreamReceiver().overrideDispatchPolicy(DispatchMode::BatchedDispatch);
+    cl_event outputEvent{};
+    cl_event inputEvent;
+
+    pCmdQ1->enqueueMarkerWithWaitList(
+        0,
+        nullptr,
+        &inputEvent);
+    pCmdQ1->enqueueMarkerWithWaitList(
+        1,
+        &inputEvent,
+        &outputEvent);
+
+    EXPECT_FALSE(pCmdQ1->getGpgpuCommandStreamReceiver().isLatestTaskCountFlushed());
+
+    pCmdQ2->enqueueMarkerWithWaitList(
+        1,
+        &outputEvent,
+        nullptr);
+
+    EXPECT_TRUE(pCmdQ1->getGpgpuCommandStreamReceiver().isLatestTaskCountFlushed());
+
+    castToObject<Event>(inputEvent)->release();
+    castToObject<Event>(outputEvent)->release();
+    pCmdQ1->finish();
+    pCmdQ2->finish();
+}
+
 HWTEST_F(CommandStreamReceiverFlushTaskTests, givenStaticPartitioningEnabledWhenFlushingTaskThenWorkPartitionAllocationIsMadeResident) {
     DebugManagerStateRestore restore{};
     DebugManager.flags.EnableStaticPartitioning.set(1);

View File

@@ -1261,6 +1261,7 @@ inline bool CommandStreamReceiverHw<GfxFamily>::isUpdateTagFromWaitEnabled() {

 template <typename GfxFamily>
 inline void CommandStreamReceiverHw<GfxFamily>::updateTagFromWait() {
+    flushBatchedSubmissions();
     if (isUpdateTagFromWaitEnabled()) {
         flushTagUpdate();
     }

View File

@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2021 Intel Corporation
+ * Copyright (C) 2018-2022 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *

View File

@@ -26,10 +26,7 @@ bool DeferrableAllocationDeletion::apply() {
                 graphicsAllocation.releaseUsageInOsContext(contextId);
             } else {
                 isStillUsed = true;
-                engine.commandStreamReceiver->flushBatchedSubmissions();
-                if (engine.commandStreamReceiver->peekLatestFlushedTaskCount() < graphicsAllocation.getTaskCount(contextId)) {
-                    engine.commandStreamReceiver->updateTagFromWait();
-                }
+                engine.commandStreamReceiver->updateTagFromWait();
             }
         }
     }