From e70f441f528215e56ae6152419c2c79c71c80765 Mon Sep 17 00:00:00 2001 From: Lukasz Jobczyk Date: Wed, 5 Jul 2023 08:55:12 +0000 Subject: [PATCH] fix: Idle gpu before invalidate aux table Related-To: NEO-8067 Signed-off-by: Lukasz Jobczyk --- .../windows/wddm_memory_manager_tests.cpp | 10 +++++++++- .../command_stream/command_stream_receiver.h | 2 +- .../command_stream/command_stream_receiver_hw.h | 2 +- .../command_stream_receiver_hw_base.inl | 6 +++--- shared/source/device/device.cpp | 3 ++- .../direct_submission_controller.cpp | 2 +- .../direct_submission/direct_submission_hw.h | 3 ++- .../direct_submission/direct_submission_hw.inl | 8 ++++++-- .../linux/drm_direct_submission.h | 2 ++ .../linux/drm_direct_submission.inl | 8 ++++++-- .../windows/wddm_direct_submission.h | 3 ++- .../windows/wddm_direct_submission.inl | 8 ++++++-- .../os_interface/windows/wddm_memory_manager.cpp | 5 +++++ .../common/libult/ult_command_stream_receiver.h | 4 ++-- .../test/common/mocks/mock_direct_submission_hw.h | 2 +- .../direct_submission_tests_1.cpp | 8 ++++---- .../direct_submission_tests_2.cpp | 14 +++++++------- .../linux/drm_command_stream_tests_1.cpp | 7 +++---- 18 files changed, 63 insertions(+), 34 deletions(-) diff --git a/opencl/test/unit_test/os_interface/windows/wddm_memory_manager_tests.cpp b/opencl/test/unit_test/os_interface/windows/wddm_memory_manager_tests.cpp index 55c7bd8a9c..137025b6bc 100644 --- a/opencl/test/unit_test/os_interface/windows/wddm_memory_manager_tests.cpp +++ b/opencl/test/unit_test/os_interface/windows/wddm_memory_manager_tests.cpp @@ -26,6 +26,7 @@ #include "shared/test/common/helpers/gtest_helpers.h" #include "shared/test/common/helpers/ult_hw_config.h" #include "shared/test/common/helpers/unit_test_helper.h" +#include "shared/test/common/libult/ult_command_stream_receiver.h" #include "shared/test/common/mocks/mock_deferred_deleter.h" #include "shared/test/common/mocks/mock_device.h" #include "shared/test/common/mocks/mock_gmm_client_context.h" @@ -2300,7 +2301,7 @@ TEST_F(MockWddmMemoryManagerTest, givenCompressedFlagSetWhenInternalIsUnsetThenD EXPECT_EQ(0u, mockMngr->updateAuxTableCalled); } -TEST_F(MockWddmMemoryManagerTest, givenCompressedFlagSetWhenInternalIsSetThenUpdateAuxTable) { +HWTEST_F(MockWddmMemoryManagerTest, givenCompressedFlagSetWhenInternalIsSetThenUpdateAuxTable) { auto &productHelper = executionEnvironment->rootDeviceEnvironments[0]->getHelper(); if (!productHelper.isPageTableManagerSupported(*defaultHwInfo)) { GTEST_SKIP(); @@ -2336,7 +2337,14 @@ TEST_F(MockWddmMemoryManagerTest, givenCompressedFlagSetWhenInternalIsSetThenUpd auto result = wddm->mapGpuVirtualAddress(myGmm, ALLOCATION_HANDLE, wddm->getGfxPartition().Standard.Base, wddm->getGfxPartition().Standard.Limit, 0u, gpuVa); EXPECT_TRUE(result); + + auto ultCsr = reinterpret_cast *>(csr.get()); + ultCsr->directSubmissionAvailable = true; + EXPECT_FALSE(ultCsr->stopDirectSubmissionCalled); + memoryManager.freeGraphicsMemory(wddmAlloc); + + EXPECT_TRUE(ultCsr->stopDirectSubmissionCalled); EXPECT_EQ(expectedCallCount, mockMngr->updateAuxTableCalled); } diff --git a/shared/source/command_stream/command_stream_receiver.h b/shared/source/command_stream/command_stream_receiver.h index df5a22c79f..4013205850 100644 --- a/shared/source/command_stream/command_stream_receiver.h +++ b/shared/source/command_stream/command_stream_receiver.h @@ -301,7 +301,7 @@ class CommandStreamReceiver { return false; } - virtual void stopDirectSubmission() {} + virtual void stopDirectSubmission(bool blocking) {} bool isStaticWorkPartitioningEnabled() const { return staticWorkPartitioningEnabled; diff --git a/shared/source/command_stream/command_stream_receiver_hw.h b/shared/source/command_stream/command_stream_receiver_hw.h index f3fabec3aa..17096df2bc 100644 --- a/shared/source/command_stream/command_stream_receiver_hw.h +++ b/shared/source/command_stream/command_stream_receiver_hw.h @@ -143,7 +143,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver { bool directSubmissionRelaxedOrderingEnabled() const override; - void stopDirectSubmission() override; + void stopDirectSubmission(bool blocking) override; virtual bool isKmdWaitModeActive() { return true; } diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index 5b0df90a67..aa6d6fb4f0 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -1472,11 +1472,11 @@ inline size_t CommandStreamReceiverHw::getCmdSizeForPrologue() const } template -inline void CommandStreamReceiverHw::stopDirectSubmission() { +inline void CommandStreamReceiverHw::stopDirectSubmission(bool blocking) { if (EngineHelpers::isBcs(this->osContext->getEngineType())) { - this->blitterDirectSubmission->stopRingBuffer(); + this->blitterDirectSubmission->stopRingBuffer(blocking); } else { - this->directSubmission->stopRingBuffer(); + this->directSubmission->stopRingBuffer(blocking); } } diff --git a/shared/source/device/device.cpp b/shared/source/device/device.cpp index 7054759d8d..df273b71a6 100644 --- a/shared/source/device/device.cpp +++ b/shared/source/device/device.cpp @@ -869,7 +869,8 @@ void Device::stopDirectSubmission() { for (auto &engine : allEngines) { auto csr = engine.commandStreamReceiver; if (csr->isAnyDirectSubmissionEnabled()) { - csr->stopDirectSubmission(); + auto lock = csr->obtainUniqueOwnership(); + csr->stopDirectSubmission(false); } } } diff --git a/shared/source/direct_submission/direct_submission_controller.cpp b/shared/source/direct_submission/direct_submission_controller.cpp index 6b6cae687d..aafc20dbf0 100644 --- a/shared/source/direct_submission/direct_submission_controller.cpp +++ b/shared/source/direct_submission/direct_submission_controller.cpp @@ -88,7 +88,7 @@ void DirectSubmissionController::checkNewSubmissions() { continue; } else { auto lock = csr->obtainUniqueOwnership(); - csr->stopDirectSubmission(); + csr->stopDirectSubmission(false); state.isStopped = true; shouldRecalculateTimeout = true; } diff --git a/shared/source/direct_submission/direct_submission_hw.h b/shared/source/direct_submission/direct_submission_hw.h index d26a0f70fe..0134b2967b 100644 --- a/shared/source/direct_submission/direct_submission_hw.h +++ b/shared/source/direct_submission/direct_submission_hw.h @@ -81,7 +81,7 @@ class DirectSubmissionHw { bool initialize(bool submitOnInit, bool useNotify); - MOCKABLE_VIRTUAL bool stopRingBuffer(); + MOCKABLE_VIRTUAL bool stopRingBuffer(bool blocking); bool startRingBuffer(); @@ -109,6 +109,7 @@ class DirectSubmissionHw { bool isNewResourceHandleNeeded(); size_t getSizeNewResourceHandler(); virtual void handleStopRingBuffer(){}; + virtual void ensureRingCompletion(){}; virtual uint64_t switchRingBuffers(); virtual void handleSwitchRingBuffers() = 0; GraphicsAllocation *switchRingBuffersAllocations(); diff --git a/shared/source/direct_submission/direct_submission_hw.inl b/shared/source/direct_submission/direct_submission_hw.inl index e663001456..e29deaf9d9 100644 --- a/shared/source/direct_submission/direct_submission_hw.inl +++ b/shared/source/direct_submission/direct_submission_hw.inl @@ -541,7 +541,7 @@ bool DirectSubmissionHw::startRingBuffer() { } template -bool DirectSubmissionHw::stopRingBuffer() { +bool DirectSubmissionHw::stopRingBuffer(bool blocking) { if (!ringStart) { return true; } @@ -572,6 +572,10 @@ bool DirectSubmissionHw::stopRingBuffer() { this->handleStopRingBuffer(); this->ringStart = false; + if (blocking) { + this->ensureRingCompletion(); + } + return true; } @@ -931,7 +935,7 @@ bool DirectSubmissionHw::dispatchCommandBuffer(BatchBuffe UNRECOVERABLE_IF(batchBuffer.requiresCoherency); if (batchBuffer.ringBufferRestartRequest) { - this->stopRingBuffer(); + this->stopRingBuffer(false); } this->startRingBuffer(); diff --git a/shared/source/direct_submission/linux/drm_direct_submission.h b/shared/source/direct_submission/linux/drm_direct_submission.h index aaf8b90334..30b548a159 100644 --- a/shared/source/direct_submission/linux/drm_direct_submission.h +++ b/shared/source/direct_submission/linux/drm_direct_submission.h @@ -28,6 +28,8 @@ class DrmDirectSubmission : public DirectSubmissionHw { bool handleResidency() override; void handleStopRingBuffer() override; + + void ensureRingCompletion() override; void handleSwitchRingBuffers() override; uint64_t updateTagValue() override; void getTagAddressValue(TagData &tagData) override; diff --git a/shared/source/direct_submission/linux/drm_direct_submission.inl b/shared/source/direct_submission/linux/drm_direct_submission.inl index 8d5bde6bc7..68c94ba8ad 100644 --- a/shared/source/direct_submission/linux/drm_direct_submission.inl +++ b/shared/source/direct_submission/linux/drm_direct_submission.inl @@ -93,8 +93,7 @@ DrmDirectSubmission::DrmDirectSubmission(const DirectSubm template inline DrmDirectSubmission::~DrmDirectSubmission() { if (this->ringStart) { - this->stopRingBuffer(); - this->wait(static_cast(this->currentTagData.tagValue)); + this->stopRingBuffer(true); } if (this->isCompletionFenceSupported()) { auto osContextLinux = static_cast(&this->osContext); @@ -116,6 +115,11 @@ TaskCountType *DrmDirectSubmission::getCompletionValuePoi return DirectSubmissionHw::getCompletionValuePointer(); } +template +void DrmDirectSubmission::ensureRingCompletion() { + this->wait(static_cast(this->currentTagData.tagValue)); +} + template bool DrmDirectSubmission::allocateOsResources() { this->currentTagData.tagAddress = this->semaphoreGpuVa + offsetof(RingSemaphoreData, tagAllocation); diff --git a/shared/source/direct_submission/windows/wddm_direct_submission.h b/shared/source/direct_submission/windows/wddm_direct_submission.h index e09451958d..8f8edc9e8a 100644 --- a/shared/source/direct_submission/windows/wddm_direct_submission.h +++ b/shared/source/direct_submission/windows/wddm_direct_submission.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2022 Intel Corporation + * Copyright (C) 2020-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -29,6 +29,7 @@ class WddmDirectSubmission : public DirectSubmissionHw { bool handleResidency() override; void handleCompletionFence(uint64_t completionValue, MonitoredFence &fence); + void ensureRingCompletion() override; void handleSwitchRingBuffers() override; uint64_t updateTagValue() override; void getTagAddressValue(TagData &tagData) override; diff --git a/shared/source/direct_submission/windows/wddm_direct_submission.inl b/shared/source/direct_submission/windows/wddm_direct_submission.inl index 6d449378ae..4a993b0c3b 100644 --- a/shared/source/direct_submission/windows/wddm_direct_submission.inl +++ b/shared/source/direct_submission/windows/wddm_direct_submission.inl @@ -41,13 +41,17 @@ template WddmDirectSubmission::~WddmDirectSubmission() { perfLogResidencyVariadicLog(wddm->getResidencyLogger(), "Stopping Wddm ULLS\n"); if (this->ringStart) { - this->stopRingBuffer(); - WddmDirectSubmission::handleCompletionFence(ringFence.lastSubmittedFence, ringFence); + this->stopRingBuffer(true); } this->deallocateResources(); wddm->getWddmInterface()->destroyMonitorFence(ringFence); } +template +void WddmDirectSubmission::ensureRingCompletion() { + WddmDirectSubmission::handleCompletionFence(ringFence.lastSubmittedFence, ringFence); +} + template bool WddmDirectSubmission::allocateOsResources() { // for now only WDDM2.0 diff --git a/shared/source/os_interface/windows/wddm_memory_manager.cpp b/shared/source/os_interface/windows/wddm_memory_manager.cpp index 4b0649189e..2569775a64 100644 --- a/shared/source/os_interface/windows/wddm_memory_manager.cpp +++ b/shared/source/os_interface/windows/wddm_memory_manager.cpp @@ -635,6 +635,11 @@ void WddmMemoryManager::freeGraphicsMemoryImpl(GraphicsAllocation *gfxAllocation if (gfxAllocation->isCompressionEnabled() && productHelper.isPageTableManagerSupported(*hwInfo)) { for (auto &engine : registeredEngines) { if (engine.commandStreamReceiver->pageTableManager.get()) { + std::unique_lock lock; + if (engine.commandStreamReceiver->isAnyDirectSubmissionEnabled()) { + lock = engine.commandStreamReceiver->obtainUniqueOwnership(); + engine.commandStreamReceiver->stopDirectSubmission(true); + } [[maybe_unused]] auto status = engine.commandStreamReceiver->pageTableManager->updateAuxTable(input->getGpuAddress(), defaultGmm, false); DEBUG_BREAK_IF(!status); } diff --git a/shared/test/common/libult/ult_command_stream_receiver.h b/shared/test/common/libult/ult_command_stream_receiver.h index 3f825009ea..db36778ead 100644 --- a/shared/test/common/libult/ult_command_stream_receiver.h +++ b/shared/test/common/libult/ult_command_stream_receiver.h @@ -413,9 +413,9 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw, publ return *flushReturnValue; } - void stopDirectSubmission() override { + void stopDirectSubmission(bool blocking) override { stopDirectSubmissionCalled = true; - BaseClass::stopDirectSubmission(); + BaseClass::stopDirectSubmission(blocking); } std::vector aubCommentMessages; diff --git a/shared/test/common/mocks/mock_direct_submission_hw.h b/shared/test/common/mocks/mock_direct_submission_hw.h index 6634b8ecc8..b6160f1ddd 100644 --- a/shared/test/common/mocks/mock_direct_submission_hw.h +++ b/shared/test/common/mocks/mock_direct_submission_hw.h @@ -90,7 +90,7 @@ struct MockDirectSubmissionHw : public DirectSubmissionHw ~MockDirectSubmissionHw() override { if (ringStart) { - stopRingBuffer(); + stopRingBuffer(false); } deallocateResources(); } diff --git a/shared/test/unit_test/direct_submission/direct_submission_tests_1.cpp b/shared/test/unit_test/direct_submission/direct_submission_tests_1.cpp index 6d095dadc9..46cdf0abbf 100644 --- a/shared/test/unit_test/direct_submission/direct_submission_tests_1.cpp +++ b/shared/test/unit_test/direct_submission/direct_submission_tests_1.cpp @@ -72,7 +72,7 @@ HWTEST_F(DirectSubmissionTest, givenDirectSubmissionWhenStopThenRingIsNotStarted EXPECT_TRUE(ret); EXPECT_TRUE(directSubmission.ringStart); - csr.stopDirectSubmission(); + csr.stopDirectSubmission(false); EXPECT_FALSE(directSubmission.ringStart); csr.directSubmission.release(); @@ -91,7 +91,7 @@ HWTEST_F(DirectSubmissionTest, givenBlitterDirectSubmissionWhenStopThenRingIsNot EXPECT_TRUE(ret); EXPECT_TRUE(directSubmission.ringStart); - csr.stopDirectSubmission(); + csr.stopDirectSubmission(false); EXPECT_FALSE(directSubmission.ringStart); csr.blitterDirectSubmission.release(); @@ -437,7 +437,7 @@ HWTEST_F(DirectSubmissionTest, givenDirectSubmissionStopWhenStopRingIsCalledThen size_t alreadyDispatchedSize = directSubmission.ringCommandStream.getUsed(); uint32_t oldQueueCount = directSubmission.semaphoreData->queueWorkCount; - directSubmission.stopRingBuffer(); + directSubmission.stopRingBuffer(false); size_t expectedDispatchSize = alreadyDispatchedSize + directSubmission.getSizeEnd(false); EXPECT_LE(directSubmission.ringCommandStream.getUsed(), expectedDispatchSize); @@ -469,7 +469,7 @@ HWTEST_F(DirectSubmissionTest, directSubmission.tagValueSetValue = 0x4343123ull; directSubmission.tagAddressSetValue = 0xBEEF00000ull; - directSubmission.stopRingBuffer(); + directSubmission.stopRingBuffer(false); size_t expectedDispatchSize = disabledSizeEnd; EXPECT_LE(directSubmission.ringCommandStream.getUsed(), expectedDispatchSize); EXPECT_GE(directSubmission.ringCommandStream.getUsed() + MemoryConstants::cacheLineSize, expectedDispatchSize); diff --git a/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp b/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp index 9f87303dd1..5a123fe087 100644 --- a/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp +++ b/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp @@ -1030,7 +1030,7 @@ HWTEST_F(DirectSubmissionDispatchBufferTest, givenDebugFlagSetWhenStoppingRingbu auto initialCounterValue = CpuIntrinsicsTests::sfenceCounter.load(); - EXPECT_TRUE(directSubmission.stopRingBuffer()); + EXPECT_TRUE(directSubmission.stopRingBuffer(false)); uint32_t expectedCount = (debugFlag == -1) ? (pDevice->getHardwareInfo().capabilityTable.isIntegratedDevice ? 0 : 2) : static_cast(debugFlag); @@ -1993,7 +1993,7 @@ HWTEST2_F(DirectSubmissionRelaxedOrderingTests, whenStoppingRingThenProgramSched offset = directSubmission.ringCommandStream.getUsed(); - directSubmission.stopRingBuffer(); + directSubmission.stopRingBuffer(false); auto startAddress = ptrOffset(directSubmission.ringCommandStream.getCpuBase(), offset); auto jumpOffset = directSubmission.getSizeSemaphoreSection(true) + sizeof(typename FamilyType::MI_LOAD_REGISTER_IMM) + @@ -2045,7 +2045,7 @@ HWTEST2_F(DirectSubmissionRelaxedOrderingTests, WhenStoppingRingWithoutSubmissio EXPECT_FALSE(verifyDynamicSchedulerProgramming(directSubmission.ringCommandStream, staticSchedulerGpuAddress, semaphoreGpuVa, directSubmission.currentQueueWorkCount, 0, endOffset)); - directSubmission.stopRingBuffer(); + directSubmission.stopRingBuffer(false); HardwareParse hwParse; hwParse.parseCommands(directSubmission.ringCommandStream, offset); @@ -2376,7 +2376,7 @@ HWTEST2_F(DirectSubmissionRelaxedOrderingTests, givenBbWithStallingCmdsAndDepend batchBuffer.hasRelaxedOrderingDependencies = true; directSubmission.dispatchCommandBuffer(batchBuffer, flushStamp); - directSubmission.stopRingBuffer(); + directSubmission.stopRingBuffer(false); EXPECT_EQ(2u, directSubmission.dispatchRelaxedOrderingSchedulerSectionCalled); EXPECT_EQ(1u, directSubmission.dispatchRelaxedOrderingQueueStallCalled); EXPECT_EQ(1u, directSubmission.dispatchTaskStoreSectionCalled); @@ -2463,7 +2463,7 @@ HWTEST2_F(DirectSubmissionRelaxedOrderingTests, givenBbWithNonStallingCmdsAndDep batchBuffer.hasRelaxedOrderingDependencies = true; directSubmission.dispatchCommandBuffer(batchBuffer, flushStamp); - directSubmission.stopRingBuffer(); + directSubmission.stopRingBuffer(false); EXPECT_EQ(2u, directSubmission.dispatchRelaxedOrderingSchedulerSectionCalled); EXPECT_EQ(1u, directSubmission.dispatchRelaxedOrderingQueueStallCalled); EXPECT_EQ(1u, directSubmission.dispatchTaskStoreSectionCalled); @@ -2550,7 +2550,7 @@ HWTEST2_F(DirectSubmissionRelaxedOrderingTests, givenBbWithStallingCmdsAndWithou batchBuffer.hasRelaxedOrderingDependencies = false; directSubmission.dispatchCommandBuffer(batchBuffer, flushStamp); - directSubmission.stopRingBuffer(); + directSubmission.stopRingBuffer(false); EXPECT_EQ(0u, directSubmission.dispatchRelaxedOrderingSchedulerSectionCalled); EXPECT_EQ(0u, directSubmission.dispatchRelaxedOrderingQueueStallCalled); EXPECT_EQ(0u, directSubmission.dispatchTaskStoreSectionCalled); @@ -2637,7 +2637,7 @@ HWTEST2_F(DirectSubmissionRelaxedOrderingTests, givenBbWithNonStallingCmdsAndWit batchBuffer.hasRelaxedOrderingDependencies = false; directSubmission.dispatchCommandBuffer(batchBuffer, flushStamp); - directSubmission.stopRingBuffer(); + directSubmission.stopRingBuffer(false); EXPECT_EQ(0u, directSubmission.dispatchRelaxedOrderingSchedulerSectionCalled); EXPECT_EQ(0u, directSubmission.dispatchRelaxedOrderingQueueStallCalled); EXPECT_EQ(0u, directSubmission.dispatchTaskStoreSectionCalled); diff --git a/shared/test/unit_test/os_interface/linux/drm_command_stream_tests_1.cpp b/shared/test/unit_test/os_interface/linux/drm_command_stream_tests_1.cpp index d3940f6359..b4cc0051bd 100644 --- a/shared/test/unit_test/os_interface/linux/drm_command_stream_tests_1.cpp +++ b/shared/test/unit_test/os_interface/linux/drm_command_stream_tests_1.cpp @@ -806,13 +806,12 @@ struct MockDrmDirectSubmissionToTestDtor : public DrmDirectSubmission(this->currentTagData.tagValue)); // NOLINT(clang-analyzer-optin.cplusplus.VirtualCall) + stopRingBuffer(true); // NOLINT(clang-analyzer-optin.cplusplus.VirtualCall) } deallocateResources(); // NOLINT(clang-analyzer-optin.cplusplus.VirtualCall) } using DrmDirectSubmission>::ringStart; - bool stopRingBuffer() override { + bool stopRingBuffer(bool blocking) override { functionsCalled.stopRingBuffer = true; return true; } @@ -856,7 +855,7 @@ HWTEST_TEMPLATED_F(DrmCommandStreamDirectSubmissionTest, givenEnabledDirectSubmi auto directSubmission = std::make_unique>(*device->getDefaultEngine().commandStreamReceiver); ASSERT_NE(nullptr, directSubmission); - directSubmission->stopRingBuffer(); + directSubmission->stopRingBuffer(false); EXPECT_FALSE(directSubmission->ringStart); }