fix: add missing aub polls on sync points

Related-To: HSD-14023925176

Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
This commit is contained in:
Bartosz Dunajski
2024-11-20 12:36:45 +00:00
committed by Compute-Runtime-Automation
parent 0f360e96f6
commit dab4166837
20 changed files with 140 additions and 20 deletions

View File

@@ -1465,6 +1465,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::synchronizeInOrderExe
}
if (signaled) {
csr->pollForAubCompletion();
status = ZE_RESULT_SUCCESS;
break;
}

View File

@@ -282,6 +282,8 @@ void EventImp<TagSizeT>::downloadAllTbxAllocations() {
template <typename TagSizeT>
void EventImp<TagSizeT>::handleSuccessfulHostSynchronization() {
csrs[0]->pollForAubCompletion();
if (this->tbxMode) {
downloadAllTbxAllocations();
}
@@ -633,6 +635,7 @@ ze_result_t EventImp<TagSizeT>::hostSynchronize(uint64_t timeout) {
ze_result_t ret = ZE_RESULT_NOT_READY;
if (this->csrs[0]->getType() == NEO::CommandStreamReceiverType::aub) {
this->csrs[0]->pollForAubCompletion();
return ZE_RESULT_SUCCESS;
}

View File

@@ -4616,6 +4616,39 @@ HWTEST2_F(InOrderCmdListTests, givenImmediateCmdListWhenDoingCpuCopyThenPassInfo
context->freeMem(deviceAlloc);
}
HWTEST2_F(InOrderCmdListTests, givenAubModeWhenSyncCalledAlwaysPollForCompletion, IsAtLeastXeHpCore) {
    // Each host-side synchronization below is expected to add exactly one
    // pollForAubCompletion() call on the command stream receiver.
    auto immCmdList = createImmCmdList<gfxCoreFamily>();

    auto csr = static_cast<UltCommandStreamReceiver<FamilyType> *>(immCmdList->getCsr(false));
    csr->commandStreamReceiverType = CommandStreamReceiverType::aub;

    auto eventPool = createEvents<FamilyType>(1, false);

    // Pick the CPU-visible in-order counter location, then pre-set it together
    // with the CSR tag (presumably so the waits below see the work as complete).
    uint64_t *counterAddress = nullptr;
    if (immCmdList->inOrderExecInfo->isHostStorageDuplicated()) {
        counterAddress = immCmdList->inOrderExecInfo->getBaseHostAddress();
    } else {
        counterAddress = static_cast<uint64_t *>(immCmdList->inOrderExecInfo->getDeviceCounterAllocation()->getUnderlyingBuffer());
    }
    *counterAddress = 3;
    *csr->getTagAddress() = 3;

    immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, events[0]->toHandle(), 0, nullptr, launchParams, false);

    // Command list synchronization in AUB mode.
    immCmdList->hostSynchronize(0, false);
    EXPECT_EQ(1u, csr->pollForAubCompletionCalled);

    // Event synchronization in AUB mode.
    const auto noTimeout = std::numeric_limits<uint64_t>::max();
    events[0]->hostSynchronize(noTimeout);
    EXPECT_EQ(2u, csr->pollForAubCompletionCalled);

    // Event synchronization once the CSR reports hardware-with-AUB-dump mode.
    csr->commandStreamReceiverType = CommandStreamReceiverType::hardwareWithAub;
    events[0]->hostSynchronize(noTimeout);
    EXPECT_EQ(3u, csr->pollForAubCompletionCalled);
}
HWTEST2_F(InOrderCmdListTests, givenProfilingEventWhenDoingCpuCopyThenSetProfilingData, IsAtLeastXeHpCore) {
auto immCmdList = createImmCmdList<gfxCoreFamily>();
immCmdList->copyThroughLockedPtrEnabled = true;

View File

@@ -23,6 +23,7 @@ class AUBCommandStreamReceiverHw : public CommandStreamReceiverSimulatedHw<GfxFa
using ExternalAllocationsContainer = std::vector<AllocationView>;
using BaseClass::getParametersForMemory;
using BaseClass::osContext;
using BaseClass::pollForCompletion;
public:
using BaseClass::peekExecutionEnvironment;
@@ -56,8 +57,9 @@ class AUBCommandStreamReceiverHw : public CommandStreamReceiverSimulatedHw<GfxFa
// Family specific version
MOCKABLE_VIRTUAL void submitBatchBufferAub(uint64_t batchBufferGpuAddress, const void *batchBuffer, size_t batchBufferSize, uint32_t memoryBank, uint64_t entryBits);
void pollForCompletion() override;
void pollForCompletion(bool skipTaskCountCheck) override;
void pollForCompletionImpl() override;
// Polls the AUB stream with skipTaskCountCheck == true, i.e. the poll is requested regardless of the task count comparison.
void pollForAubCompletion() override { pollForCompletion(true); };
WaitStatus waitForTaskCountWithKmdNotifyFallback(TaskCountType taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, QueueThrottle throttle) override;
uint32_t getDumpHandle();

View File

@@ -585,9 +585,9 @@ void AUBCommandStreamReceiverHw<GfxFamily>::submitBatchBufferAub(uint64_t batchB
}
template <typename GfxFamily>
void AUBCommandStreamReceiverHw<GfxFamily>::pollForCompletion() {
void AUBCommandStreamReceiverHw<GfxFamily>::pollForCompletion(bool skipTaskCountCheck) {
const auto lock = std::unique_lock<decltype(pollForCompletionLock)>{pollForCompletionLock};
if (this->pollForCompletionTaskCount == this->latestSentTaskCount) {
if (!skipTaskCountCheck && (this->pollForCompletionTaskCount == this->latestSentTaskCount)) {
return;
}
pollForCompletionImpl();

View File

@@ -118,7 +118,9 @@ class CommandStreamReceiver {
virtual bool flushBatchedSubmissions() = 0;
MOCKABLE_VIRTUAL SubmissionStatus submitBatchBuffer(BatchBuffer &batchBuffer, ResidencyContainer &allocationsForResidency);
virtual void pollForCompletion() {}
// Convenience overload: forwards to the virtual pollForCompletion(bool) with skipTaskCountCheck == false.
void pollForCompletion() { pollForCompletion(false); }
// Hook invoked on host synchronization points; no-op by default, overridden by receivers that capture AUB.
virtual void pollForAubCompletion(){};
// Base implementation does nothing; simulated receivers override it to perform the actual poll.
virtual void pollForCompletion(bool skipTaskCountCheck) {}
virtual void programHardwareContext(LinearStream &cmdStream) = 0;
virtual size_t getCmdsSizeForHardwareContext() const = 0;

View File

@@ -35,6 +35,7 @@ class CommandStreamReceiverSimulatedCommonHw : public CommandStreamReceiverHw<Gf
public:
using CommandStreamReceiverHw<GfxFamily>::peekExecutionEnvironment;
using CommandStreamReceiverHw<GfxFamily>::writeMemory;
using CommandStreamReceiverHw<GfxFamily>::pollForCompletion;
CommandStreamReceiverSimulatedCommonHw(ExecutionEnvironment &executionEnvironment,
uint32_t rootDeviceIndex,

View File

@@ -19,6 +19,7 @@ class CommandStreamReceiverWithAUBDump : public BaseCSR {
using BaseCSR::osContext;
public:
using BaseCSR::pollForCompletion;
using BaseCSR::writeMemory;
CommandStreamReceiverWithAUBDump(const std::string &baseName,
@@ -49,7 +50,8 @@ class CommandStreamReceiverWithAUBDump : public BaseCSR {
void addAubComment(const char *comment) override;
void pollForCompletion() override;
void pollForCompletion(bool skipTaskCountCheck) override;
void pollForAubCompletion() override;
bool expectMemory(const void *gfxAddress, const void *srcAddress,
size_t length, uint32_t compareOperation) override;

View File

@@ -99,11 +99,18 @@ void CommandStreamReceiverWithAUBDump<BaseCSR>::addAubComment(const char *commen
}
template <typename BaseCSR>
void CommandStreamReceiverWithAUBDump<BaseCSR>::pollForCompletion() {
void CommandStreamReceiverWithAUBDump<BaseCSR>::pollForCompletion(bool skipTaskCountCheck) {
if (aubCSR) {
aubCSR->pollForCompletion();
aubCSR->pollForCompletion(skipTaskCountCheck);
}
BaseCSR::pollForCompletion(skipTaskCountCheck);
}
template <typename BaseCSR>
void CommandStreamReceiverWithAUBDump<BaseCSR>::pollForAubCompletion() {
if (aubCSR) {
aubCSR->pollForCompletion(true);
}
BaseCSR::pollForCompletion();
}
template <typename BaseCSR>

View File

@@ -27,6 +27,7 @@ class TbxCommandStreamReceiverHw : public CommandStreamReceiverSimulatedHw<GfxFa
using BaseClass::forceSkipResourceCleanupRequired;
using BaseClass::getParametersForMemory;
using BaseClass::osContext;
using BaseClass::pollForCompletion;
uint32_t getMaskAndValueForPollForCompletion() const;
bool getpollNotEqualValueForPollForCompletion() const;
@@ -63,7 +64,7 @@ class TbxCommandStreamReceiverHw : public CommandStreamReceiverSimulatedHw<GfxFa
// Family specific version
MOCKABLE_VIRTUAL void submitBatchBufferTbx(uint64_t batchBufferGpuAddress, const void *batchBuffer, size_t batchBufferSize, uint32_t memoryBank, uint64_t entryBits, bool overrideRingHead);
void pollForCompletion() override;
void pollForCompletion(bool skipTaskCountCheck) override;
void dumpAllocation(GraphicsAllocation &gfxAllocation) override;

View File

@@ -395,7 +395,7 @@ void TbxCommandStreamReceiverHw<GfxFamily>::submitBatchBufferTbx(uint64_t batchB
}
template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::pollForCompletion() {
void TbxCommandStreamReceiverHw<GfxFamily>::pollForCompletion(bool skipTaskCountCheck) {
if (hardwareContextController) {
hardwareContextController->pollForCompletion();
return;

View File

@@ -53,9 +53,9 @@ class UltAubCommandStreamReceiver : public AUBCommandStreamReceiverHw<GfxFamily>
return BaseClass::flushBcsTask(blitPropertiesContainer, blocking, profilingEnabled, device);
}
void pollForCompletion() override {
void pollForCompletion(bool skipTaskCountCheck) override {
pollForCompletionCalled++;
BaseClass::pollForCompletion();
BaseClass::pollForCompletion(skipTaskCountCheck);
}
uint32_t blitBufferCalled = 0;

View File

@@ -78,6 +78,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
using BaseClass::makeResident;
using BaseClass::pageTableManagerInitialized;
using BaseClass::perDssBackedBuffer;
using BaseClass::pollForCompletion;
using BaseClass::postInitFlagsSetup;
using BaseClass::primaryCsr;
using BaseClass::programActivePartitionConfig;
@@ -479,10 +480,14 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
return commandStreamReceiverType;
}
void pollForCompletion() override {
void pollForCompletion(bool skipTaskCountCheck) override {
pollForCompletionCalled++;
}
// Test double: records the invocation only, no polling is performed.
void pollForAubCompletion() override {
    ++pollForAubCompletionCalled;
}
bool checkGpuHangDetected(CommandStreamReceiver::TimeType currentTime, CommandStreamReceiver::TimeType &lastHangCheckTime) const override {
checkGpuHangDetectedCalled++;
if (forceReturnGpuHang) {
@@ -566,6 +571,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
uint32_t initDirectSubmissionCalled = 0;
uint32_t fillReusableAllocationsListCalled = 0;
uint32_t pollForCompletionCalled = 0;
uint32_t pollForAubCompletionCalled = 0;
uint32_t initializeDeviceWithFirstSubmissionCalled = 0;
uint32_t drainPagingFenceQueueCalled = 0;
uint32_t flushHandlerCalled = 0;

View File

@@ -60,6 +60,7 @@ struct MockAubCsr : public AUBCommandStreamReceiverHw<GfxFamily> {
using AUBCommandStreamReceiverHw<GfxFamily>::pollForCompletionTaskCount;
using AUBCommandStreamReceiverHw<GfxFamily>::getParametersForMemory;
using AUBCommandStreamReceiverHw<GfxFamily>::writeMemory;
using AUBCommandStreamReceiverHw<GfxFamily>::pollForCompletion;
using AUBCommandStreamReceiverHw<GfxFamily>::AUBCommandStreamReceiverHw;
CompletionStamp flushTask(LinearStream &commandStream, size_t commandStreamStart,
@@ -120,10 +121,12 @@ struct MockAubCsr : public AUBCommandStreamReceiverHw<GfxFamily> {
writeMemoryWithAubManagerCalled = true;
}
void pollForCompletion() override {
AUBCommandStreamReceiverHw<GfxFamily>::pollForCompletion();
void pollForCompletion(bool skipTaskCountCheck) override {
AUBCommandStreamReceiverHw<GfxFamily>::pollForCompletion(skipTaskCountCheck);
pollForCompletionCalled = true;
skipTaskCountCheckForCompletionPoll = skipTaskCountCheck;
}
bool expectMemoryEqual(void *gfxAddress, const void *srcAddress, size_t length) override {
expectMemoryEqualCalled = true;
return AUBCommandStreamReceiverHw<GfxFamily>::expectMemoryEqual(gfxAddress, srcAddress, length);
@@ -168,6 +171,7 @@ struct MockAubCsr : public AUBCommandStreamReceiverHw<GfxFamily> {
bool expectMemoryCompressedCalled = false;
bool addAubCommentCalled = false;
bool dumpAllocationCalled = false;
bool skipTaskCountCheckForCompletionPoll = false;
void initFile(const std::string &fileName) override {
fileIsOpen = true;

View File

@@ -52,8 +52,8 @@ class MockTbxCsr : public TbxCommandStreamReceiverHw<GfxFamily> {
overrideRingHeadPassed = overrideRingHead;
submitBatchBufferCalled = true;
}
void pollForCompletion() override {
TbxCommandStreamReceiverHw<GfxFamily>::pollForCompletion();
void pollForCompletion(bool skipTaskCountCheck) override {
TbxCommandStreamReceiverHw<GfxFamily>::pollForCompletion(skipTaskCountCheck);
pollForCompletionCalled = true;
}
void downloadAllocationTbxMock(GraphicsAllocation &gfxAllocation) {

View File

@@ -611,6 +611,29 @@ HWTEST_F(AubCommandStreamReceiverTests, givenAubCommandStreamReceiverInSubCaptur
EXPECT_TRUE(aubCsr->pollForCompletionCalled);
}
HWTEST_F(AubCommandStreamReceiverTests, whenPollForAubCompletionCalledThenInsertPoll) {
    // Checks that every pollForAubCompletion() call reaches the hardware context,
    // including a repeated call with no new submission in between.
    MockOsContext osContext(0, EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_CCS, EngineUsage::regular}));

    std::string aubFileName = "file_name.aub";

    // Install a mock AUB center/manager so the created CSR uses mock hardware contexts.
    auto aubCenterMock = new MockAubCenter(pDevice->getRootDeviceEnvironment(), false, aubFileName, CommandStreamReceiverType::aub);
    aubCenterMock->aubManager = std::unique_ptr<MockAubManager>(new MockAubManager());
    pDevice->executionEnvironment->rootDeviceEnvironments[0]->aubCenter = std::unique_ptr<MockAubCenter>(aubCenterMock);

    auto createdCsr = AUBCommandStreamReceiver::create(aubFileName, true, *pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield());
    std::unique_ptr<AUBCommandStreamReceiverHw<FamilyType>> aubCsr(static_cast<AUBCommandStreamReceiverHw<FamilyType> *>(createdCsr));
    aubCsr->setupContext(osContext);

    auto hwContextMock = static_cast<MockHardwareContext *>(aubCsr->hardwareContextController->hardwareContexts[0].get());

    // First call.
    aubCsr->pollForAubCompletion();
    EXPECT_TRUE(hwContextMock->pollForCompletionCalled);

    // Second call with the flag cleared: the poll must be issued again.
    hwContextMock->pollForCompletionCalled = false;
    aubCsr->pollForAubCompletion();
    EXPECT_TRUE(hwContextMock->pollForCompletionCalled);
}
HWTEST_F(AubCommandStreamReceiverTests, givenAubCommandStreamReceiverInStandaloneModeWhenFlushIsCalledThenItShouldCallMakeResidentOnCommandBufferAllocation) {
auto aubExecutionEnvironment = getEnvironment<MockAubCsr<FamilyType>>(true, true, true);
auto aubCsr = aubExecutionEnvironment->template getCsr<MockAubCsr<FamilyType>>();

View File

@@ -2371,7 +2371,7 @@ template <typename FamilyType>
struct MockSimulatedCsrHw : public CommandStreamReceiverSimulatedHw<FamilyType> {
using CommandStreamReceiverSimulatedHw<FamilyType>::CommandStreamReceiverSimulatedHw;
using CommandStreamReceiverSimulatedHw<FamilyType>::getDeviceIndex;
void pollForCompletion() override {}
void pollForCompletion(bool skipTaskCountCheck) override {}
void initializeEngine() override {}
bool writeMemory(GraphicsAllocation &gfxAllocation) override { return true; }
void writeMemory(uint64_t gpuAddress, void *cpuAddress, size_t size, uint32_t memoryBank, uint64_t entryBits) override {}

View File

@@ -253,6 +253,30 @@ HWTEST_F(CommandStreamReceiverWithAubDumpSimpleTest, givenCsrWithAubDumpWhenWait
csrWithAubDump.waitForTaskCountWithKmdNotifyFallback(1, 0, false, QueueThrottle::MEDIUM);
}
HWTEST_F(CommandStreamReceiverWithAubDumpSimpleTest, whenPollForAubCompletionCalledThenDontInsertPoll) {
    // pollForAubCompletion() on the AUB-dump wrapper must poll only the attached
    // AUB CSR; the wrapped base CSR's pollForCompletion must stay untouched.
    MockOsContext osContext(0, EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_CCS, EngineUsage::regular}));

    auto executionEnvironment = pDevice->getExecutionEnvironment();
    executionEnvironment->initializeMemoryManager();

    const std::string aubFileName = "file_name.aub";

    auto aubCenterMock = new MockAubCenter(pDevice->getRootDeviceEnvironment(), false, aubFileName, CommandStreamReceiverType::hardwareWithAub);
    aubCenterMock->aubManager = std::unique_ptr<MockAubManager>(new MockAubManager());
    executionEnvironment->rootDeviceEnvironments[0]->aubCenter = std::unique_ptr<MockAubCenter>(aubCenterMock);

    DeviceBitfield deviceBitfield(1);
    CommandStreamReceiverWithAUBDump<UltCommandStreamReceiver<FamilyType>> csrWithAubDump(aubFileName, *executionEnvironment, 0, deviceBitfield);
    csrWithAubDump.initializeTagAllocation();

    // Swap in a mock AUB CSR so its poll can be observed.
    auto aubCsrMock = new MockAubCsr<FamilyType>(aubFileName, false, *executionEnvironment, 0, deviceBitfield);
    aubCsrMock->initializeTagAllocation();
    csrWithAubDump.aubCSR.reset(aubCsrMock);

    csrWithAubDump.setupContext(osContext);

    csrWithAubDump.pollForAubCompletion();

    EXPECT_FALSE(csrWithAubDump.pollForCompletionCalled);
    EXPECT_TRUE(aubCsrMock->pollForCompletionCalled);
}
HWTEST_F(CommandStreamReceiverWithAubDumpSimpleTest, givenCsrWithAubDumpWhenPollForCompletionCalledThenAubCsrPollForCompletionCalled) {
auto executionEnvironment = pDevice->getExecutionEnvironment();
executionEnvironment->initializeMemoryManager();

View File

@@ -633,6 +633,17 @@ HWTEST_F(TbxCommandStreamTests, givenTbxCommandStreamReceiverWhenFlushIsCalledTh
pDevice->executionEnvironment->memoryManager->freeGraphicsMemory(commandBuffer);
}
HWTEST_F(TbxCommandStreamTests, whenPollForAubCompletionCalledThenDontInsertPoll) {
    // For a TBX receiver pollForAubCompletion() is expected to be a no-op:
    // neither the CSR-level poll nor the hardware context poll may be triggered.
    MockTbxCsr<FamilyType> tbxCsr(*pDevice->executionEnvironment, pDevice->getDeviceBitfield());
    MockOsContext osContext(0, EngineDescriptorHelper::getDefaultDescriptor(pDevice->getDeviceBitfield()));
    tbxCsr.setupContext(osContext);

    auto hwContextMock = static_cast<MockHardwareContext *>(tbxCsr.hardwareContextController->hardwareContexts[0].get());

    tbxCsr.pollForAubCompletion();

    EXPECT_FALSE(tbxCsr.pollForCompletionCalled);
    EXPECT_FALSE(hwContextMock->pollForCompletionCalled);
}
HWTEST_F(TbxCommandStreamTests, givenTbxCommandStreamReceiverInBatchedModeWhenFlushIsCalledThenItShouldMakeCommandBufferResident) {
DebugManagerStateRestore dbgRestore;
debugManager.flags.CsrDispatchMode.set(static_cast<uint32_t>(DispatchMode::batchedDispatch));

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2021-2023 Intel Corporation
* Copyright (C) 2021-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -21,7 +21,7 @@ class MockSimulatedCsrHw : public CommandStreamReceiverSimulatedHw<GfxFamily> {
using CommandStreamReceiverSimulatedHw<GfxFamily>::writeMemory;
void writeMemory(uint64_t gpuAddress, void *cpuAddress, size_t size, uint32_t memoryBank, uint64_t entryBits) override {
}
void pollForCompletion() override {
void pollForCompletion(bool skipTaskCountCheck) override {
}
void initializeEngine() override {
}