From 82728ff3946c34bfed4a3d538e141ff311d43e71 Mon Sep 17 00:00:00 2001 From: Young Jin Yoon Date: Mon, 26 Feb 2024 09:53:24 +0000 Subject: [PATCH] feature: add logic to iterate over all contexts to check for GPU page faults Implemented iteration over all contexts in the process, querying the reset status of each one to detect unexpected GPU page faults. Added a new debug variable, GpuFaultCheckThreshold, to change how often this check runs relative to hang checks, for performance analysis. Related-To: GSD-5673 Signed-off-by: Young Jin Yoon --- .../debug_settings/debug_variables_base.inl | 1 + .../os_interface/linux/drm_memory_manager.cpp | 10 +++ .../os_interface/linux/drm_memory_manager.h | 2 + shared/source/os_interface/linux/drm_neo.cpp | 33 ++++++-- shared/source/os_interface/linux/drm_neo.h | 5 ++ .../mocks/linux/mock_drm_memory_manager.h | 2 + .../linux/device_command_stream_fixture.h | 9 ++ shared/test/common/test_files/igdrcl.config | 1 + .../linux/drm_memory_manager_tests.cpp | 12 ++- .../os_interface/linux/drm_tests.cpp | 83 +++++++++++++++++++ 10 files changed, 149 insertions(+), 9 deletions(-) diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index 0e58fdabb9..bd221548e6 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -248,6 +248,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, EnableMultipleRegularContextForBcs, -1, "-1: def DECLARE_DEBUG_VARIABLE(int32_t, AppendAubStreamContextFlags, -1, "-1: default, >0: Append flags passed during HardwareContext creation.") DECLARE_DEBUG_VARIABLE(int32_t, ContextGroupSize, -1, "-1: default, 0-1: context group disabled, >1: number of contexts in group.") DECLARE_DEBUG_VARIABLE(int32_t, DisableScratchPages, -1, "-1: default, 0: do not disable scratch pages during VM creations, 1: disable scratch pages during VM creations") +DECLARE_DEBUG_VARIABLE(int32_t, GpuFaultCheckThreshold, -1, "-1: default, 0: 
disable, >0: value for detecting the gpu pagefault for all contexts with scratch page disabled. When the number of hang check reaches to the threshold, gpu pagefault check will happen.") DECLARE_DEBUG_VARIABLE(int32_t, OptimizeIoqBarriersHandling, -1, "-1: default, 0: disable, 1: enable. If enabled, dont dispatch stalling commands for IOQ. Instead, inherit TimestampPackets from previous enqueue.") DECLARE_DEBUG_VARIABLE(int32_t, ExitOnSubmissionNumber, -1, "Call exit(0) on X submission. >=0: submission count (start from 0)") DECLARE_DEBUG_VARIABLE(int32_t, ExitOnSubmissionMode, 0, "Exit on X submission mode. 0: Any context type, 1: Compute context only, 2: Copy context only ") diff --git a/shared/source/os_interface/linux/drm_memory_manager.cpp b/shared/source/os_interface/linux/drm_memory_manager.cpp index f7c15908f4..3266d913cf 100644 --- a/shared/source/os_interface/linux/drm_memory_manager.cpp +++ b/shared/source/os_interface/linux/drm_memory_manager.cpp @@ -1990,6 +1990,16 @@ bool DrmMemoryManager::checkAllocationForChunking(size_t allocSize, size_t minSi (((allocSize / MemoryConstants::chunkThreshold) % 2) == 0) && subDeviceEnabled && debugDisabled && modeEnabled && bufferEnabled); } +void DrmMemoryManager::checkUnexpectedGpuPageFault() { + for (auto &engineContainer : allRegisteredEngines) { + for (auto &engine : engineContainer) { + CommandStreamReceiver *csr = engine.commandStreamReceiver; + Drm &drm = getDrm(csr->getRootDeviceIndex()); + drm.checkResetStatus(*engine.osContext); + } + } +} + bool DrmMemoryManager::createDrmChunkedAllocation(Drm *drm, DrmAllocation *allocation, uint64_t boAddress, size_t boSize, size_t maxOsContextCount) { auto &storageInfo = allocation->storageInfo; auto memoryInfo = drm->getMemoryInfo(); diff --git a/shared/source/os_interface/linux/drm_memory_manager.h b/shared/source/os_interface/linux/drm_memory_manager.h index 130c79055d..730fc5ce8f 100644 --- a/shared/source/os_interface/linux/drm_memory_manager.h +++ 
b/shared/source/os_interface/linux/drm_memory_manager.h @@ -98,6 +98,8 @@ class DrmMemoryManager : public MemoryManager { size_t getSizeOfChunk(size_t allocSize); bool checkAllocationForChunking(size_t allocSize, size_t minSize, bool subDeviceEnabled, bool debugDisabled, bool modeEnabled, bool bufferEnabled); + MOCKABLE_VIRTUAL void checkUnexpectedGpuPageFault(); + protected: void registerSharedBoHandleAllocation(DrmAllocation *drmAllocation); BufferObjectHandleWrapper tryToGetBoHandleWrapperWithSharedOwnership(int boHandle); diff --git a/shared/source/os_interface/linux/drm_neo.cpp b/shared/source/os_interface/linux/drm_neo.cpp index cafc2e6849..e0c2594d4f 100644 --- a/shared/source/os_interface/linux/drm_neo.cpp +++ b/shared/source/os_interface/linux/drm_neo.cpp @@ -59,6 +59,17 @@ Drm::Drm(std::unique_ptr &&hwDeviceIdIn, RootDeviceEnvironment &r hwDeviceId(std::move(hwDeviceIdIn)), rootDeviceEnvironment(rootDeviceEnvironment) { pagingFence.fill(0u); fenceVal.fill(0u); + + if (rootDeviceEnvironment.executionEnvironment.isDebuggingEnabled()) { + disableScratch = false; + } + if (debugManager.flags.DisableScratchPages.get() != -1) { + disableScratch = debugManager.flags.DisableScratchPages.get(); + } + auto threshold = debugManager.flags.GpuFaultCheckThreshold.get(); + if (disableScratch && threshold != -1) { + gpuFaultCheckThreshold = threshold; + } } SubmissionStatus Drm::getSubmissionStatusFromReturnCode(int32_t retCode) { @@ -238,6 +249,20 @@ int Drm::queryGttSize(uint64_t &gttSizeOutput) { } bool Drm::isGpuHangDetected(OsContext &osContext) { + bool ret = checkResetStatus(osContext); + if (gpuFaultCheckThreshold != 0) { + if (gpuFaultCheckCounter == gpuFaultCheckThreshold) { + auto memoryManager = static_cast(this->rootDeviceEnvironment.executionEnvironment.memoryManager.get()); + memoryManager->checkUnexpectedGpuPageFault(); + gpuFaultCheckCounter = 0; + return false; + } + gpuFaultCheckCounter++; + } + return ret; +} + +bool Drm::checkResetStatus(OsContext 
&osContext) { const auto osContextLinux = static_cast(&osContext); const auto &drmContextIds = osContextLinux->getDrmContextIds(); @@ -1419,14 +1444,6 @@ int Drm::createDrmVirtualMemory(uint32_t &drmVmId) { ctl.extensions = castToUint64(vmControlExtRegion.get()); } - bool disableScratch = false; - if (rootDeviceEnvironment.executionEnvironment.isDebuggingEnabled()) { - disableScratch = false; - } - if (debugManager.flags.DisableScratchPages.get() != -1) { - disableScratch = debugManager.flags.DisableScratchPages.get(); - } - bool useVmBind = isVmBindAvailable(); bool enablePageFault = hasPageFaultSupport() && useVmBind; diff --git a/shared/source/os_interface/linux/drm_neo.h b/shared/source/os_interface/linux/drm_neo.h index 83bbed37b5..dc83089928 100644 --- a/shared/source/os_interface/linux/drm_neo.h +++ b/shared/source/os_interface/linux/drm_neo.h @@ -130,6 +130,7 @@ class Drm : public DriverModel { PhysicalDevicePciBusInfo getPciBusInfo() const override; bool isGpuHangDetected(OsContext &osContext) override; + MOCKABLE_VIRTUAL bool checkResetStatus(OsContext &osContext); bool areNonPersistentContextsSupported() const { return nonPersistentContextsSupported; } void checkNonPersistentContextsSupport(); @@ -342,6 +343,10 @@ class Drm : public DriverModel { bool pageFaultSupported = false; bool completionFenceSupported = false; bool vmBindPatIndexProgrammingSupported = false; + bool disableScratch = false; + + uint32_t gpuFaultCheckThreshold = 0u; + uint32_t gpuFaultCheckCounter = 0u; private: int getParamIoctl(DrmParam param, int *dstValue); diff --git a/shared/test/common/mocks/linux/mock_drm_memory_manager.h b/shared/test/common/mocks/linux/mock_drm_memory_manager.h index 0b47ee01d7..b56a90757b 100644 --- a/shared/test/common/mocks/linux/mock_drm_memory_manager.h +++ b/shared/test/common/mocks/linux/mock_drm_memory_manager.h @@ -45,6 +45,7 @@ class TestedDrmMemoryManager : public MemoryManagerCreate { using DrmMemoryManager::allocatePhysicalLocalDeviceMemory; 
using DrmMemoryManager::allocationTypeForCompletionFence; using DrmMemoryManager::allocUserptr; + using DrmMemoryManager::checkUnexpectedGpuPageFault; using DrmMemoryManager::createAllocWithAlignment; using DrmMemoryManager::createAllocWithAlignmentFromUserptr; using DrmMemoryManager::createGraphicsAllocation; @@ -72,6 +73,7 @@ class TestedDrmMemoryManager : public MemoryManagerCreate { using DrmMemoryManager::registerSharedBoHandleAllocation; using DrmMemoryManager::releaseGpuRange; using DrmMemoryManager::retrieveMmapOffsetForBufferObject; + using DrmMemoryManager::secondaryEngines; using DrmMemoryManager::selectAlignmentAndHeap; using DrmMemoryManager::setDomainCpu; using DrmMemoryManager::sharedBoHandles; diff --git a/shared/test/common/os_interface/linux/device_command_stream_fixture.h b/shared/test/common/os_interface/linux/device_command_stream_fixture.h index 1b162d6d9c..71af90069e 100644 --- a/shared/test/common/os_interface/linux/device_command_stream_fixture.h +++ b/shared/test/common/os_interface/linux/device_command_stream_fixture.h @@ -9,6 +9,7 @@ #include "shared/source/helpers/hw_info.h" #include "shared/source/os_interface/linux/drm_memory_manager.h" #include "shared/source/os_interface/linux/drm_neo.h" +#include "shared/source/os_interface/os_context.h" #include "shared/test/common/helpers/default_hw_info.h" #include "shared/test/common/mocks/linux/mock_drm_wrappers.h" @@ -18,6 +19,7 @@ using NEO::Drm; using NEO::DrmIoctl; using NEO::HwDeviceIdDrm; +using NEO::OsContext; using NEO::RootDeviceEnvironment; extern const int mockFd; @@ -190,6 +192,11 @@ class DrmMockCustom : public Drm { return 0u; } + bool checkResetStatus(OsContext &osContext) override { + checkResetStatusCalled++; + return Drm::checkResetStatus(osContext); + } + Ioctls ioctlCnt{}; Ioctls ioctlExpected{}; @@ -203,6 +210,8 @@ class DrmMockCustom : public Drm { ChunkingModeCall getChunkingModeCall{}; IsChunkingAvailableCall isChunkingAvailableCall{}; + size_t checkResetStatusCalled = 
0u; + std::atomic ioctlRes; std::atomic ioctlResExt; diff --git a/shared/test/common/test_files/igdrcl.config b/shared/test/common/test_files/igdrcl.config index a3fd12d622..d0e6fe4d60 100644 --- a/shared/test/common/test_files/igdrcl.config +++ b/shared/test/common/test_files/igdrcl.config @@ -456,6 +456,7 @@ AccessCountersGranularity = -1 OverridePatIndex = -1 UseTileMemoryBankInVirtualMemoryCreation = -1 DisableScratchPages = -1 +GpuFaultCheckThreshold = -1 ForceAllResourcesUncached = 0 ForcePreParserEnabledForMiArbCheck = -1 UseDynamicEventPacketsCount = -1 diff --git a/shared/test/unit_test/os_interface/linux/drm_memory_manager_tests.cpp b/shared/test/unit_test/os_interface/linux/drm_memory_manager_tests.cpp index 5789942f05..6b0308de87 100644 --- a/shared/test/unit_test/os_interface/linux/drm_memory_manager_tests.cpp +++ b/shared/test/unit_test/os_interface/linux/drm_memory_manager_tests.cpp @@ -327,6 +327,16 @@ TEST_F(DrmMemoryManagerTest, GivenAllocatePhysicalDeviceMemoryThenSuccessReturne memoryManager->freeGraphicsMemory(allocation); } +TEST_F(DrmMemoryManagerTest, whenCallingChekcUnexpectedGpuPagedfaultThenAllEnginesWereChecked) { + memoryManager->checkUnexpectedGpuPageFault(); + size_t allEnginesSize = 0u; + for (auto &engineContainer : memoryManager->allRegisteredEngines) { + allEnginesSize += engineContainer.size(); + } + ASSERT_NE(0u, allEnginesSize); + EXPECT_EQ(allEnginesSize, mock->checkResetStatusCalled); +} + TEST_F(DrmMemoryManagerWithExplicitExpectationsTest, givenDrmMemoryManagerWhenGpuAddressReservationIsAttemptedAtIndex1ThenAddressFromGfxPartitionIsUsed) { auto memoryManager = std::make_unique(false, true, false, *executionEnvironment); RootDeviceIndicesContainer rootDeviceIndices; @@ -7813,4 +7823,4 @@ TEST_F(DrmMemoryManagerTest, givenDebugVariableToToggleGpuVaBitsWhenAllocatingRe memoryManager->freeGraphicsMemory(allocation); } -} \ No newline at end of file +} diff --git a/shared/test/unit_test/os_interface/linux/drm_tests.cpp 
b/shared/test/unit_test/os_interface/linux/drm_tests.cpp index 14e71caa51..a029a495cc 100644 --- a/shared/test/unit_test/os_interface/linux/drm_tests.cpp +++ b/shared/test/unit_test/os_interface/linux/drm_tests.cpp @@ -23,6 +23,7 @@ #include "shared/test/common/helpers/test_files.h" #include "shared/test/common/helpers/variable_backup.h" #include "shared/test/common/libult/linux/drm_mock.h" +#include "shared/test/common/mocks/linux/mock_drm_memory_manager.h" #include "shared/test/common/mocks/linux/mock_ioctl_helper.h" #include "shared/test/common/mocks/mock_execution_environment.h" #include "shared/test/common/mocks/mock_memory_manager.h" @@ -1428,6 +1429,88 @@ TEST(DrmDeathTest, GivenResetStatsWithValidFaultWhenIsGpuHangIsCalledThenProcess EXPECT_THROW(drm.isGpuHangDetected(mockOsContextLinux), std::runtime_error); } +struct DrmMockCheckPageFault : public DrmMock { + public: + using DrmMock::DrmMock; + using DrmMock::gpuFaultCheckThreshold; +}; + +TEST(DrmTest, givenDisableScratchPagesWhenSettingGpuFaultCheckThresholdThenThesholdValueIsSet) { + constexpr unsigned int iteration = 3u; + constexpr unsigned int threshold = 3u; + ASSERT_NE(0u, iteration); + ASSERT_NE(0u, threshold); + DebugManagerStateRestore restore; + + auto executionEnvironment = std::make_unique(); + + debugManager.flags.DisableScratchPages.set(false); + debugManager.flags.GpuFaultCheckThreshold.set(threshold); + DrmMockCheckPageFault drm1{*executionEnvironment->rootDeviceEnvironments[0]}; + EXPECT_EQ(0u, drm1.gpuFaultCheckThreshold); + + debugManager.flags.DisableScratchPages.set(true); + debugManager.flags.GpuFaultCheckThreshold.set(-1); + DrmMockCheckPageFault drm2{*executionEnvironment->rootDeviceEnvironments[0]}; + EXPECT_EQ(0u, drm2.gpuFaultCheckThreshold); + + debugManager.flags.DisableScratchPages.set(true); + debugManager.flags.GpuFaultCheckThreshold.set(threshold); + DrmMockCheckPageFault drm3{*executionEnvironment->rootDeviceEnvironments[0]}; + EXPECT_EQ(threshold, 
drm3.gpuFaultCheckThreshold); +} + +struct MockDrmMemoryManagerCheckPageFault : public MockDrmMemoryManager { + using MockDrmMemoryManager::MockDrmMemoryManager; + void checkUnexpectedGpuPageFault() override { + checkUnexpectedGpuPageFaultCalled++; + } + size_t checkUnexpectedGpuPageFaultCalled = 0; +}; + +TEST(DrmTest, givenDisableScratchPagesSetWhenSettingGpuFaultCheckThresholdThenFaultCheckingIsHappeningAfterThreshold) { + constexpr unsigned int iteration = 3u; + constexpr unsigned int threshold = 3u; + ASSERT_NE(0u, iteration); + ASSERT_NE(0u, threshold); + DebugManagerStateRestore restore; + debugManager.flags.DisableScratchPages.set(true); + debugManager.flags.GpuFaultCheckThreshold.set(threshold); + + auto executionEnvironment = std::make_unique(); + auto rootDeviceEnvironment = executionEnvironment->rootDeviceEnvironments[0].get(); + rootDeviceEnvironment->setHwInfoAndInitHelpers(defaultHwInfo.get()); + rootDeviceEnvironment->osInterface = std::make_unique(); + rootDeviceEnvironment->osInterface->setDriverModel(std::unique_ptr(new DrmMock(*rootDeviceEnvironment))); + + auto memoryManager = new MockDrmMemoryManagerCheckPageFault(GemCloseWorkerMode::gemCloseWorkerInactive, false, false, *executionEnvironment); + executionEnvironment->memoryManager.reset(memoryManager); + auto &drm = *executionEnvironment->rootDeviceEnvironments[0]->osInterface->getDriverModel()->as(); + + uint32_t contextId{0}; + EngineDescriptor engineDescriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::regular})}; + + MockOsContextLinux mockOsContextLinux{drm, 0, contextId, engineDescriptor}; + mockOsContextLinux.drmContextIds.push_back(0); + + ResetStats resetStats{}; + resetStats.contextId = 0; + drm.resetStatsToReturn.push_back(resetStats); + + bool isGpuHangDetected{}; + for (auto i = 0u; i < iteration; i++) { + memoryManager->checkUnexpectedGpuPageFaultCalled = 0u; + for (auto j = 0u; j < threshold; j++) { + EXPECT_NO_THROW(isGpuHangDetected = 
drm.isGpuHangDetected(mockOsContextLinux)); + EXPECT_FALSE(isGpuHangDetected); + EXPECT_EQ(0u, memoryManager->checkUnexpectedGpuPageFaultCalled); + } + EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(mockOsContextLinux)); + EXPECT_FALSE(isGpuHangDetected); + EXPECT_EQ(1u, memoryManager->checkUnexpectedGpuPageFaultCalled); + } +} + TEST(DrmTest, givenSetupIoctlHelperWhenCalledTwiceThenIoctlHelperIsSetOnlyOnce) { auto executionEnvironment = std::make_unique(); DrmMock drm{*executionEnvironment->rootDeviceEnvironments[0]};