fix: add cache flush as dependency for bcs ccs synchronization

Related-to: NEO-9872
Signed-off-by: Maciej Plewka <maciej.plewka@intel.com>
This commit is contained in:
Maciej Plewka
2024-05-09 11:17:06 +00:00
committed by Compute-Runtime-Automation
parent 10ed479b16
commit e39893485c
13 changed files with 112 additions and 5 deletions

View File

@@ -580,6 +580,7 @@ BlitProperties CommandQueueHw<GfxFamily>::processDispatchForBlitEnqueue(CommandS
blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies.cacheFlushNodes);
blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies.previousEnqueueNodes);
blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies.barrierNodes);
blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies.multiCsrDependencies);
}
blitProperties.multiRootDeviceEventSync = multiRootDeviceEventSync;
auto currentTimestampPacketNode = timestampPacketContainer->peekNodes().at(0);
@@ -1437,7 +1438,14 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDisp
if (isCacheFlushForBcsRequired() && gpgpuSubmission) {
timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag());
}
for (auto &dependentCsr : csrDeps.csrWithMultiEngineDependencies) {
auto tag = allocator->getTag();
timestampPacketDependencies.multiCsrDependencies.add(tag);
bool submitStatus = dependentCsr->submitDependencyUpdate(tag);
if (!submitStatus) {
return CL_OUT_OF_RESOURCES;
}
}
obtainNewTimestampPacketNodes(1, timestampPacketDependencies.previousEnqueueNodes, clearAllDependencies, bcsCsr);
csrDeps.timestampPacketContainer.push_back(&timestampPacketDependencies.previousEnqueueNodes);

View File

@@ -65,6 +65,7 @@ void EventsRequest::fillCsrDependenciesForTimestampPacketContainer(CsrDependenci
flushDependentCsr(*dependentCsr, csrDeps);
currentCsr.makeResident(*dependentCsr->getTagAllocation());
}
csrDeps.csrWithMultiEngineDependencies.insert(dependentCsr);
}
}
}

View File

@@ -1373,11 +1373,11 @@ HWTEST_TEMPLATED_F(BlitEnqueueTaskCountTests, givenBlockedEventWhenWaitingForCom
userEvent.setStatus(CL_COMPLETE);
clWaitForEvents(1, &outEvent2);
EXPECT_EQ(2u, ultGpgpuCsr->latestWaitForCompletionWithTimeoutTaskCount.load());
EXPECT_EQ(3u, ultGpgpuCsr->latestWaitForCompletionWithTimeoutTaskCount.load());
EXPECT_EQ(2u, ultBcsCsr->latestWaitForCompletionWithTimeoutTaskCount.load());
clWaitForEvents(1, &outEvent1);
EXPECT_EQ(1u, ultGpgpuCsr->latestWaitForCompletionWithTimeoutTaskCount.load());
EXPECT_EQ(2u, ultGpgpuCsr->latestWaitForCompletionWithTimeoutTaskCount.load());
EXPECT_EQ(1u, ultBcsCsr->latestWaitForCompletionWithTimeoutTaskCount.load());
clReleaseEvent(outEvent1);

View File

@@ -130,7 +130,46 @@ HWTEST_F(TimestampPacketTests, givenCrossCsrDependenciesWhenFillCsrDepsThenFlush
} else {
EXPECT_FALSE(mockCmdQ2->getUltCommandStreamReceiver().flushBatchedSubmissionsCalled);
}
EXPECT_FALSE(mockCmdQHw->getUltCommandStreamReceiver().flushBatchedSubmissionsCalled);
mockCmdQHw->getUltCommandStreamReceiver().latestFlushedTaskCount = 1;
*mockCmdQHw->getUltCommandStreamReceiver().tagAddress = 1;
mockCmdQ2->getUltCommandStreamReceiver().latestFlushedTaskCount = 1;
*mockCmdQ2->getUltCommandStreamReceiver().tagAddress = 1;
}
HWTEST_F(TimestampPacketTests, givenCrossCsrDependenciesWhenFillCsrDepsThendependentCsrIsStoredInSet) {
auto mockCmdQHw = std::make_unique<MockCommandQueueHw<FamilyType>>(context, device.get(), nullptr);
mockCmdQHw->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true;
mockCmdQHw->getUltCommandStreamReceiver().taskCount = 1;
mockCmdQHw->getUltCommandStreamReceiver().latestFlushedTaskCount = 0;
cl_queue_properties props[] = {CL_QUEUE_PRIORITY_KHR, CL_QUEUE_PRIORITY_LOW_KHR, 0};
auto mockCmdQ2 = std::make_unique<MockCommandQueueHw<FamilyType>>(context, device.get(), props);
mockCmdQ2->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true;
mockCmdQ2->getUltCommandStreamReceiver().taskCount = 1;
mockCmdQ2->getUltCommandStreamReceiver().latestFlushedTaskCount = 0;
const cl_uint eventsOnWaitlist = 2;
MockTimestampPacketContainer timestamp(*device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 1);
MockTimestampPacketContainer timestamp2(*device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 1);
Event event(mockCmdQ, 0, 0, 0);
event.addTimestampPacketNodes(timestamp);
Event event2(mockCmdQ2.get(), 0, 0, 0);
event2.addTimestampPacketNodes(timestamp2);
cl_event waitlist[] = {&event, &event2};
EventsRequest eventsRequest(eventsOnWaitlist, waitlist, nullptr);
CsrDependencies csrDeps;
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, mockCmdQ->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::all);
const auto &productHelper = device->getProductHelper();
if (productHelper.isDcFlushAllowed()) {
EXPECT_NE(csrDeps.csrWithMultiEngineDependencies.size(), 0u);
} else {
EXPECT_EQ(csrDeps.csrWithMultiEngineDependencies.size(), 0u);
}
mockCmdQHw->getUltCommandStreamReceiver().latestFlushedTaskCount = 1;
*mockCmdQHw->getUltCommandStreamReceiver().tagAddress = 1;

View File

@@ -642,6 +642,7 @@ class CommandStreamReceiverMock : public CommandStreamReceiver {
bool isUpdateTagFromWaitEnabled() override { return false; };
bool isMultiOsContextCapable() const override { return false; }
// Mock override: performs no submission and unconditionally reports success.
bool submitDependencyUpdate(TagNodeBase *tag) override {
    return true;
}
MemoryCompressionState getMemoryCompressionState(bool auxTranslationRequired) const override {
return MemoryCompressionState::notApplicable;

View File

@@ -67,6 +67,7 @@ class TimestampPackets;
template <typename T1>
class TagAllocator;
class TagNodeBase;
enum class DispatchMode {
deviceDefault = 0, // default for given device
@@ -515,6 +516,7 @@ class CommandStreamReceiver {
uint32_t getRequiredScratchSlot0Size() { return requiredScratchSlot0Size; }
uint32_t getRequiredScratchSlot1Size() { return requiredScratchSlot1Size; }
// Requests that this CSR signals `tag` so that another engine can depend on it.
// Returns true when the update was submitted successfully.
// The CommandStreamReceiverHw implementation programs a barrier with a post-sync
// write to the tag's context-end address and returns false for a null tag or a
// failed submission.
virtual bool submitDependencyUpdate(TagNodeBase *tag) = 0;
protected:
void cleanupResources();

View File

@@ -199,6 +199,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
void programStateBaseAddressHeapless(Device &device, LinearStream &commandStream);
void programComputeModeHeapless(Device &device, LinearStream &commandStream);
void handleAllocationsResidencyForflushTaskStateless(const IndirectHeap *dsh, const IndirectHeap *ioh, const IndirectHeap *ssh);
bool submitDependencyUpdate(TagNodeBase *tag) override;
protected:
void programPreemption(LinearStream &csr, DispatchFlags &dispatchFlags);

View File

@@ -2337,4 +2337,28 @@ inline void CommandStreamReceiverHw<GfxFamily>::chainCsrWorkToTask(LinearStream
this->makeResident(*chainedBatchBuffer);
EncodeNoop<GfxFamily>::alignToCacheLine(commandStreamCSR);
}
// Programs a barrier with a post-sync immediate-data write that targets the
// context-end address of `tag`, then flushes it as a small task.
//
// tag:     timestamp packet node to signal; a null pointer is rejected.
// returns: true only when flushSmallTask() reports SubmissionStatus::success.
template <typename GfxFamily>
bool CommandStreamReceiverHw<GfxFamily>::submitDependencyUpdate(TagNodeBase *tag) {
    if (!tag) {
        return false;
    }

    // Ownership is held until the function returns, covering both the
    // command programming and the flush below.
    auto csrLock = obtainUniqueOwnership();

    PipeControlArgs barrierArgs;
    const auto barrierSize = MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(peekRootDeviceEnvironment(), barrierArgs.tlbInvalidation);
    const auto requiredSize = barrierSize + this->getCmdSizeForPrologue();

    auto &stream = getCS(requiredSize);
    const auto streamOffsetBeforeProgramming = stream.getUsed();
    const auto postSyncGpuAddress = TimestampPacketHelper::getContextEndGpuAddress(*tag);

    this->programEnginePrologue(stream);

    barrierArgs.dcFlushEnable = MemorySynchronizationCommands<GfxFamily>::getDcFlushEnable(true, this->peekRootDeviceEnvironment());
    MemorySynchronizationCommands<GfxFamily>::addBarrierWithPostSyncOperation(
        stream,
        PostSyncMode::immediateData,
        postSyncGpuAddress,
        0,
        this->peekRootDeviceEnvironment(),
        barrierArgs);

    // NOTE(review): presumably the node's backing allocation has to be resident
    // for the post-sync write to land - confirm against the residency model.
    makeResident(*(tag->getBaseGraphicsAllocation()->getDefaultGraphicsAllocation()));

    return this->flushSmallTask(stream, streamOffsetBeforeProgramming) == SubmissionStatus::success;
}
} // namespace NEO

View File

@@ -8,6 +8,8 @@
#pragma once
#include "shared/source/utilities/stackvec.h"
#include <set>
namespace NEO {
class TimestampPacketContainer;
@@ -27,5 +29,7 @@ class CsrDependencies {
void makeResident(CommandStreamReceiver &commandStreamReceiver) const;
void copyNodesToNewContainer(TimestampPacketContainer &newTimestampPacketContainer);
void copyRootDeviceSyncNodesToNewContainer(TimestampPacketContainer &newTimestampPacketContainer);
std::set<CommandStreamReceiver *> csrWithMultiEngineDependencies;
};
} // namespace NEO

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2018-2023 Intel Corporation
* Copyright (C) 2018-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -65,4 +65,5 @@ void TimestampPacketDependencies::moveNodesToNewContainer(TimestampPacketContain
barrierNodes.moveNodesToNewContainer(timestampPacketContainer);
auxToNonAuxNodes.moveNodesToNewContainer(timestampPacketContainer);
nonAuxToAuxNodes.moveNodesToNewContainer(timestampPacketContainer);
multiCsrDependencies.moveNodesToNewContainer(timestampPacketContainer);
}

View File

@@ -39,6 +39,7 @@ struct TimestampPacketDependencies : public NonCopyableClass {
TimestampPacketContainer barrierNodes;
TimestampPacketContainer auxToNonAuxNodes;
TimestampPacketContainer nonAuxToAuxNodes;
TimestampPacketContainer multiCsrDependencies;
void moveNodesToNewContainer(TimestampPacketContainer &timestampPacketContainer);
};

View File

@@ -78,6 +78,7 @@ class MockCommandStreamReceiver : public CommandStreamReceiver {
SubmissionStatus flushTagUpdate() override { return SubmissionStatus::success; };
void updateTagFromWait() override{};
// Mock override: nothing is submitted; the call always reports success.
bool submitDependencyUpdate(TagNodeBase *tag) override {
    return true;
}
bool isUpdateTagFromWaitEnabled() override { return false; };
void writeMemoryAub(aub_stream::AllocationParams &allocationParams) override {

View File

@@ -96,6 +96,10 @@ TEST_F(CommandStreamReceiverTest, givenOsAgnosticCsrWhenGettingCompletionValueTh
EXPECT_EQ(expectedValue, commandStreamReceiver->getCompletionValue(allocation));
}
// Submitting a dependency update without a tag is expected to report failure.
TEST_F(CommandStreamReceiverTest, givenOsAgnosticCsrWhenSubmitingCsrDependencyWithNoTagFlushThenFalseRturned) {
    const bool submitted = commandStreamReceiver->submitDependencyUpdate(nullptr);
    EXPECT_FALSE(submitted);
}
TEST_F(CommandStreamReceiverTest, givenCsrWhenGettingCompletionAddressThenProperAddressIsReturned) {
auto expectedAddress = castToUint64(const_cast<TagAddressType *>(commandStreamReceiver->getTagAddress()));
EXPECT_EQ(expectedAddress + TagAllocationLayout::completionFenceOffset, commandStreamReceiver->getCompletionAddress());
@@ -3408,6 +3412,26 @@ HWTEST_F(CommandStreamReceiverHwTest, givenFlushPipeControlWhenFlushWithoutState
EXPECT_FALSE(UnitTestHelper<FamilyType>::findStateCacheFlushPipeControl(commandStreamReceiver, commandStreamReceiver.commandStream));
}
// Checks that submitDependencyUpdate() dispatches a PIPE_CONTROL whose post-sync
// address equals the tag's context-end GPU address and whose DC flush setting
// matches MemorySynchronizationCommands::getDcFlushEnable(true, ...).
HWTEST_F(CommandStreamReceiverHwTest, givenFCommandStreamWhenSubmitingDependencyUpdateThenPCWithTagAddresIsDispatched) {
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
    auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver<FamilyType>();
    auto mockTagAllocator = std::make_unique<MockTagAllocator<>>(pDevice->getRootDeviceIndex(), pDevice->getExecutionEnvironment()->memoryManager.get(), 10u);
    auto tag = mockTagAllocator->getTag();
    auto usedSizeBeforeSubmit = commandStreamReceiver.commandStream.getUsed();
    // With the barrier workaround, parsing starts one PIPE_CONTROL further into the stream.
    // NOTE(review): presumably this skips the workaround PIPE_CONTROL emitted ahead of the
    // post-sync one - confirm against addBarrierWithPostSyncOperation.
    if (MemorySynchronizationCommands<FamilyType>::isBarrierWaRequired(commandStreamReceiver.peekRootDeviceEnvironment())) {
        usedSizeBeforeSubmit += sizeof(PIPE_CONTROL);
    }
    commandStreamReceiver.submitDependencyUpdate(tag);

    HardwareParse hwParser;
    hwParser.parseCommands<FamilyType>(commandStreamReceiver.commandStream, usedSizeBeforeSubmit);
    const auto pipeControlItor = find<PIPE_CONTROL *>(hwParser.cmdList.begin(), hwParser.cmdList.end());
    // Fatal assertions: dereferencing end() or a null command below would crash the
    // test binary instead of reporting a failure.
    ASSERT_NE(hwParser.cmdList.end(), pipeControlItor);
    const auto pipeControl = genCmdCast<PIPE_CONTROL *>(*pipeControlItor);
    ASSERT_NE(nullptr, pipeControl);

    auto cacheFlushTimestampPacketGpuAddress = TimestampPacketHelper::getContextEndGpuAddress(*tag);
    EXPECT_EQ(UnitTestHelper<FamilyType>::getPipeControlPostSyncAddress(*pipeControl), cacheFlushTimestampPacketGpuAddress);
    EXPECT_EQ(pipeControl->getDcFlushEnable(), MemorySynchronizationCommands<FamilyType>::getDcFlushEnable(true, commandStreamReceiver.peekRootDeviceEnvironment()));
}
HWTEST_F(CommandStreamReceiverHwTest, givenFlushPipeControlWhenFlushWithStateCacheFlushThenExpectStateCacheFlushFlagsSet) {
auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver<FamilyType>();