mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-19 06:16:51 +08:00
fix: add cache flush as dependency for bcs ccs synchronization
Related-to: NEO-9872 Signed-off-by: Maciej Plewka <maciej.plewka@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
10ed479b16
commit
e39893485c
@@ -580,6 +580,7 @@ BlitProperties CommandQueueHw<GfxFamily>::processDispatchForBlitEnqueue(CommandS
|
||||
blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies.cacheFlushNodes);
|
||||
blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies.previousEnqueueNodes);
|
||||
blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies.barrierNodes);
|
||||
blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies.multiCsrDependencies);
|
||||
}
|
||||
blitProperties.multiRootDeviceEventSync = multiRootDeviceEventSync;
|
||||
auto currentTimestampPacketNode = timestampPacketContainer->peekNodes().at(0);
|
||||
@@ -1437,7 +1438,14 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDisp
|
||||
if (isCacheFlushForBcsRequired() && gpgpuSubmission) {
|
||||
timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag());
|
||||
}
|
||||
|
||||
for (auto &dependentCsr : csrDeps.csrWithMultiEngineDependencies) {
|
||||
auto tag = allocator->getTag();
|
||||
timestampPacketDependencies.multiCsrDependencies.add(tag);
|
||||
bool submitStatus = dependentCsr->submitDependencyUpdate(tag);
|
||||
if (!submitStatus) {
|
||||
return CL_OUT_OF_RESOURCES;
|
||||
}
|
||||
}
|
||||
obtainNewTimestampPacketNodes(1, timestampPacketDependencies.previousEnqueueNodes, clearAllDependencies, bcsCsr);
|
||||
csrDeps.timestampPacketContainer.push_back(&timestampPacketDependencies.previousEnqueueNodes);
|
||||
|
||||
|
||||
@@ -65,6 +65,7 @@ void EventsRequest::fillCsrDependenciesForTimestampPacketContainer(CsrDependenci
|
||||
flushDependentCsr(*dependentCsr, csrDeps);
|
||||
currentCsr.makeResident(*dependentCsr->getTagAllocation());
|
||||
}
|
||||
csrDeps.csrWithMultiEngineDependencies.insert(dependentCsr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1373,11 +1373,11 @@ HWTEST_TEMPLATED_F(BlitEnqueueTaskCountTests, givenBlockedEventWhenWaitingForCom
|
||||
userEvent.setStatus(CL_COMPLETE);
|
||||
|
||||
clWaitForEvents(1, &outEvent2);
|
||||
EXPECT_EQ(2u, ultGpgpuCsr->latestWaitForCompletionWithTimeoutTaskCount.load());
|
||||
EXPECT_EQ(3u, ultGpgpuCsr->latestWaitForCompletionWithTimeoutTaskCount.load());
|
||||
EXPECT_EQ(2u, ultBcsCsr->latestWaitForCompletionWithTimeoutTaskCount.load());
|
||||
|
||||
clWaitForEvents(1, &outEvent1);
|
||||
EXPECT_EQ(1u, ultGpgpuCsr->latestWaitForCompletionWithTimeoutTaskCount.load());
|
||||
EXPECT_EQ(2u, ultGpgpuCsr->latestWaitForCompletionWithTimeoutTaskCount.load());
|
||||
EXPECT_EQ(1u, ultBcsCsr->latestWaitForCompletionWithTimeoutTaskCount.load());
|
||||
|
||||
clReleaseEvent(outEvent1);
|
||||
|
||||
@@ -130,7 +130,46 @@ HWTEST_F(TimestampPacketTests, givenCrossCsrDependenciesWhenFillCsrDepsThenFlush
|
||||
} else {
|
||||
EXPECT_FALSE(mockCmdQ2->getUltCommandStreamReceiver().flushBatchedSubmissionsCalled);
|
||||
}
|
||||
EXPECT_FALSE(mockCmdQHw->getUltCommandStreamReceiver().flushBatchedSubmissionsCalled);
|
||||
|
||||
mockCmdQHw->getUltCommandStreamReceiver().latestFlushedTaskCount = 1;
|
||||
*mockCmdQHw->getUltCommandStreamReceiver().tagAddress = 1;
|
||||
mockCmdQ2->getUltCommandStreamReceiver().latestFlushedTaskCount = 1;
|
||||
*mockCmdQ2->getUltCommandStreamReceiver().tagAddress = 1;
|
||||
}
|
||||
|
||||
HWTEST_F(TimestampPacketTests, givenCrossCsrDependenciesWhenFillCsrDepsThendependentCsrIsStoredInSet) {
|
||||
auto mockCmdQHw = std::make_unique<MockCommandQueueHw<FamilyType>>(context, device.get(), nullptr);
|
||||
mockCmdQHw->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true;
|
||||
mockCmdQHw->getUltCommandStreamReceiver().taskCount = 1;
|
||||
mockCmdQHw->getUltCommandStreamReceiver().latestFlushedTaskCount = 0;
|
||||
|
||||
cl_queue_properties props[] = {CL_QUEUE_PRIORITY_KHR, CL_QUEUE_PRIORITY_LOW_KHR, 0};
|
||||
auto mockCmdQ2 = std::make_unique<MockCommandQueueHw<FamilyType>>(context, device.get(), props);
|
||||
mockCmdQ2->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true;
|
||||
mockCmdQ2->getUltCommandStreamReceiver().taskCount = 1;
|
||||
mockCmdQ2->getUltCommandStreamReceiver().latestFlushedTaskCount = 0;
|
||||
|
||||
const cl_uint eventsOnWaitlist = 2;
|
||||
MockTimestampPacketContainer timestamp(*device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 1);
|
||||
MockTimestampPacketContainer timestamp2(*device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 1);
|
||||
|
||||
Event event(mockCmdQ, 0, 0, 0);
|
||||
event.addTimestampPacketNodes(timestamp);
|
||||
Event event2(mockCmdQ2.get(), 0, 0, 0);
|
||||
event2.addTimestampPacketNodes(timestamp2);
|
||||
|
||||
cl_event waitlist[] = {&event, &event2};
|
||||
EventsRequest eventsRequest(eventsOnWaitlist, waitlist, nullptr);
|
||||
CsrDependencies csrDeps;
|
||||
|
||||
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, mockCmdQ->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::all);
|
||||
|
||||
const auto &productHelper = device->getProductHelper();
|
||||
if (productHelper.isDcFlushAllowed()) {
|
||||
EXPECT_NE(csrDeps.csrWithMultiEngineDependencies.size(), 0u);
|
||||
} else {
|
||||
EXPECT_EQ(csrDeps.csrWithMultiEngineDependencies.size(), 0u);
|
||||
}
|
||||
|
||||
mockCmdQHw->getUltCommandStreamReceiver().latestFlushedTaskCount = 1;
|
||||
*mockCmdQHw->getUltCommandStreamReceiver().tagAddress = 1;
|
||||
|
||||
@@ -642,6 +642,7 @@ class CommandStreamReceiverMock : public CommandStreamReceiver {
|
||||
bool isUpdateTagFromWaitEnabled() override { return false; };
|
||||
|
||||
bool isMultiOsContextCapable() const override { return false; }
|
||||
// Mock override: ignores the tag and always reports the dependency update as submitted.
bool submitDependencyUpdate(TagNodeBase *tag) override {
    return true;
}
|
||||
|
||||
MemoryCompressionState getMemoryCompressionState(bool auxTranslationRequired) const override {
|
||||
return MemoryCompressionState::notApplicable;
|
||||
|
||||
@@ -67,6 +67,7 @@ class TimestampPackets;
|
||||
|
||||
template <typename T1>
|
||||
class TagAllocator;
|
||||
class TagNodeBase;
|
||||
|
||||
enum class DispatchMode {
|
||||
deviceDefault = 0, // default for given device
|
||||
@@ -515,6 +516,7 @@ class CommandStreamReceiver {
|
||||
|
||||
uint32_t getRequiredScratchSlot0Size() { return requiredScratchSlot0Size; }
|
||||
uint32_t getRequiredScratchSlot1Size() { return requiredScratchSlot1Size; }
|
||||
// Requests a dependency update for `tag` on this command stream receiver.
// Returns true when the update was submitted, false otherwise.
virtual bool submitDependencyUpdate(TagNodeBase *tag) = 0;
|
||||
|
||||
protected:
|
||||
void cleanupResources();
|
||||
|
||||
@@ -199,6 +199,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
|
||||
void programStateBaseAddressHeapless(Device &device, LinearStream &commandStream);
|
||||
void programComputeModeHeapless(Device &device, LinearStream &commandStream);
|
||||
void handleAllocationsResidencyForflushTaskStateless(const IndirectHeap *dsh, const IndirectHeap *ioh, const IndirectHeap *ssh);
|
||||
// Programs a barrier with a post-sync write to the tag's context-end GPU address and
// flushes it as a small task. Returns false for a null tag or when the flush does not
// report SubmissionStatus::success.
bool submitDependencyUpdate(TagNodeBase *tag) override;
|
||||
|
||||
protected:
|
||||
void programPreemption(LinearStream &csr, DispatchFlags &dispatchFlags);
|
||||
|
||||
@@ -2337,4 +2337,28 @@ inline void CommandStreamReceiverHw<GfxFamily>::chainCsrWorkToTask(LinearStream
|
||||
this->makeResident(*chainedBatchBuffer);
|
||||
EncodeNoop<GfxFamily>::alignToCacheLine(commandStreamCSR);
|
||||
}
|
||||
template <typename GfxFamily>
|
||||
bool CommandStreamReceiverHw<GfxFamily>::submitDependencyUpdate(TagNodeBase *tag) {
|
||||
if (tag == nullptr) {
|
||||
return false;
|
||||
}
|
||||
auto ownership = obtainUniqueOwnership();
|
||||
PipeControlArgs args;
|
||||
auto expectedSize = MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(peekRootDeviceEnvironment(), args.tlbInvalidation) + this->getCmdSizeForPrologue();
|
||||
auto &commandStream = getCS(expectedSize);
|
||||
auto commandStreamStart = commandStream.getUsed();
|
||||
auto cacheFlushTimestampPacketGpuAddress = TimestampPacketHelper::getContextEndGpuAddress(*tag);
|
||||
this->programEnginePrologue(commandStream);
|
||||
args.dcFlushEnable = MemorySynchronizationCommands<GfxFamily>::getDcFlushEnable(true, this->peekRootDeviceEnvironment());
|
||||
MemorySynchronizationCommands<GfxFamily>::addBarrierWithPostSyncOperation(
|
||||
commandStream,
|
||||
PostSyncMode::immediateData,
|
||||
cacheFlushTimestampPacketGpuAddress,
|
||||
0,
|
||||
this->peekRootDeviceEnvironment(),
|
||||
args);
|
||||
makeResident(*(tag->getBaseGraphicsAllocation()->getDefaultGraphicsAllocation()));
|
||||
auto submissionStatus = this->flushSmallTask(commandStream, commandStreamStart);
|
||||
return submissionStatus == SubmissionStatus::success;
|
||||
}
|
||||
} // namespace NEO
|
||||
|
||||
@@ -8,6 +8,8 @@
|
||||
#pragma once
|
||||
#include "shared/source/utilities/stackvec.h"
|
||||
|
||||
#include <set>
|
||||
|
||||
namespace NEO {
|
||||
|
||||
class TimestampPacketContainer;
|
||||
@@ -27,5 +29,7 @@ class CsrDependencies {
|
||||
void makeResident(CommandStreamReceiver &commandStreamReceiver) const;
|
||||
void copyNodesToNewContainer(TimestampPacketContainer &newTimestampPacketContainer);
|
||||
void copyRootDeviceSyncNodesToNewContainer(TimestampPacketContainer &newTimestampPacketContainer);
|
||||
|
||||
std::set<CommandStreamReceiver *> csrWithMultiEngineDependencies;
|
||||
};
|
||||
} // namespace NEO
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2018-2023 Intel Corporation
|
||||
* Copyright (C) 2018-2024 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -65,4 +65,5 @@ void TimestampPacketDependencies::moveNodesToNewContainer(TimestampPacketContain
|
||||
barrierNodes.moveNodesToNewContainer(timestampPacketContainer);
|
||||
auxToNonAuxNodes.moveNodesToNewContainer(timestampPacketContainer);
|
||||
nonAuxToAuxNodes.moveNodesToNewContainer(timestampPacketContainer);
|
||||
multiCsrDependencies.moveNodesToNewContainer(timestampPacketContainer);
|
||||
}
|
||||
|
||||
@@ -39,6 +39,7 @@ struct TimestampPacketDependencies : public NonCopyableClass {
|
||||
TimestampPacketContainer barrierNodes;
|
||||
TimestampPacketContainer auxToNonAuxNodes;
|
||||
TimestampPacketContainer nonAuxToAuxNodes;
|
||||
TimestampPacketContainer multiCsrDependencies;
|
||||
|
||||
void moveNodesToNewContainer(TimestampPacketContainer &timestampPacketContainer);
|
||||
};
|
||||
|
||||
@@ -78,6 +78,7 @@ class MockCommandStreamReceiver : public CommandStreamReceiver {
|
||||
|
||||
SubmissionStatus flushTagUpdate() override { return SubmissionStatus::success; };
|
||||
void updateTagFromWait() override{};
|
||||
// Mock override: ignores the tag and always reports the dependency update as submitted.
bool submitDependencyUpdate(TagNodeBase *tag) override {
    return true;
};
|
||||
bool isUpdateTagFromWaitEnabled() override { return false; };
|
||||
|
||||
void writeMemoryAub(aub_stream::AllocationParams &allocationParams) override {
|
||||
|
||||
@@ -96,6 +96,10 @@ TEST_F(CommandStreamReceiverTest, givenOsAgnosticCsrWhenGettingCompletionValueTh
|
||||
EXPECT_EQ(expectedValue, commandStreamReceiver->getCompletionValue(allocation));
|
||||
}
|
||||
|
||||
// A null tag must be rejected: submitDependencyUpdate is expected to return false.
TEST_F(CommandStreamReceiverTest, givenOsAgnosticCsrWhenSubmitingCsrDependencyWithNoTagFlushThenFalseRturned) {
    const bool submitted = commandStreamReceiver->submitDependencyUpdate(nullptr);
    EXPECT_FALSE(submitted);
}
|
||||
|
||||
TEST_F(CommandStreamReceiverTest, givenCsrWhenGettingCompletionAddressThenProperAddressIsReturned) {
|
||||
auto expectedAddress = castToUint64(const_cast<TagAddressType *>(commandStreamReceiver->getTagAddress()));
|
||||
EXPECT_EQ(expectedAddress + TagAllocationLayout::completionFenceOffset, commandStreamReceiver->getCompletionAddress());
|
||||
@@ -3408,6 +3412,26 @@ HWTEST_F(CommandStreamReceiverHwTest, givenFlushPipeControlWhenFlushWithoutState
|
||||
EXPECT_FALSE(UnitTestHelper<FamilyType>::findStateCacheFlushPipeControl(commandStreamReceiver, commandStreamReceiver.commandStream));
|
||||
}
|
||||
|
||||
// Verifies that submitDependencyUpdate programs a PIPE_CONTROL whose post-sync address is the
// tag's context-end GPU address and whose DC flush bit matches getDcFlushEnable(true, ...).
HWTEST_F(CommandStreamReceiverHwTest, givenFCommandStreamWhenSubmitingDependencyUpdateThenPCWithTagAddresIsDispatched) {
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
    auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver<FamilyType>();
    auto mockTagAllocator = std::make_unique<MockTagAllocator<>>(pDevice->getRootDeviceIndex(), pDevice->getExecutionEnvironment()->memoryManager.get(), 10u);
    auto tag = mockTagAllocator->getTag();

    // Parsing starts at the stream offset recorded before the submission; when the barrier
    // workaround is required the offset is advanced by one PIPE_CONTROL
    // (presumably to skip the workaround command - confirm).
    auto usedSizeBeforeSubmit = commandStreamReceiver.commandStream.getUsed();
    if (MemorySynchronizationCommands<FamilyType>::isBarrierWaRequired(commandStreamReceiver.peekRootDeviceEnvironment())) {
        usedSizeBeforeSubmit += sizeof(PIPE_CONTROL);
    }

    commandStreamReceiver.submitDependencyUpdate(tag);

    HardwareParse hwParser;
    hwParser.parseCommands<FamilyType>(commandStreamReceiver.commandStream, usedSizeBeforeSubmit);
    const auto pipeControlItor = find<PIPE_CONTROL *>(hwParser.cmdList.begin(), hwParser.cmdList.end());

    // Fatal assertions: dereferencing end() or a null command pointer below would be
    // undefined behavior, so stop the test here instead of crashing the test binary.
    ASSERT_NE(hwParser.cmdList.end(), pipeControlItor);
    const auto pipeControl = genCmdCast<PIPE_CONTROL *>(*pipeControlItor);
    ASSERT_NE(nullptr, pipeControl);

    auto cacheFlushTimestampPacketGpuAddress = TimestampPacketHelper::getContextEndGpuAddress(*tag);
    EXPECT_EQ(UnitTestHelper<FamilyType>::getPipeControlPostSyncAddress(*pipeControl), cacheFlushTimestampPacketGpuAddress);
    EXPECT_EQ(pipeControl->getDcFlushEnable(), MemorySynchronizationCommands<FamilyType>::getDcFlushEnable(true, commandStreamReceiver.peekRootDeviceEnvironment()));
}
|
||||
|
||||
HWTEST_F(CommandStreamReceiverHwTest, givenFlushPipeControlWhenFlushWithStateCacheFlushThenExpectStateCacheFlushFlagsSet) {
|
||||
auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver<FamilyType>();
|
||||
|
||||
|
||||
Reference in New Issue
Block a user