diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index 0e58fdabb9..bd221548e6 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -248,6 +248,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, EnableMultipleRegularContextForBcs, -1, "-1: def DECLARE_DEBUG_VARIABLE(int32_t, AppendAubStreamContextFlags, -1, "-1: default, >0: Append flags passed during HardwareContext creation.") DECLARE_DEBUG_VARIABLE(int32_t, ContextGroupSize, -1, "-1: default, 0-1: context group disabled, >1: number of contexts in group.") DECLARE_DEBUG_VARIABLE(int32_t, DisableScratchPages, -1, "-1: default, 0: do not disable scratch pages during VM creations, 1: disable scratch pages during VM creations") +DECLARE_DEBUG_VARIABLE(int32_t, GpuFaultCheckThreshold, -1, "-1: default, 0: disable, >0: value for detecting the gpu pagefault for all contexts with scratch page disabled. When the number of hang check reaches to the threshold, gpu pagefault check will happen.") DECLARE_DEBUG_VARIABLE(int32_t, OptimizeIoqBarriersHandling, -1, "-1: default, 0: disable, 1: enable. If enabled, dont dispatch stalling commands for IOQ. Instead, inherit TimestampPackets from previous enqueue.") DECLARE_DEBUG_VARIABLE(int32_t, ExitOnSubmissionNumber, -1, "Call exit(0) on X submission. >=0: submission count (start from 0)") DECLARE_DEBUG_VARIABLE(int32_t, ExitOnSubmissionMode, 0, "Exit on X submission mode. 0: Any context type, 1: Compute context only, 2: Copy context only ") diff --git a/shared/source/os_interface/linux/drm_memory_manager.cpp b/shared/source/os_interface/linux/drm_memory_manager.cpp index f7c15908f4..3266d913cf 100644 --- a/shared/source/os_interface/linux/drm_memory_manager.cpp +++ b/shared/source/os_interface/linux/drm_memory_manager.cpp @@ -1990,6 +1990,16 @@ bool DrmMemoryManager::checkAllocationForChunking(size_t allocSize, size_t minSi (((allocSize / MemoryConstants::chunkThreshold) % 2) == 0) && subDeviceEnabled && debugDisabled && modeEnabled && bufferEnabled); } +void DrmMemoryManager::checkUnexpectedGpuPageFault() { + for (auto &engineContainer : allRegisteredEngines) { + for (auto &engine : engineContainer) { + CommandStreamReceiver *csr = engine.commandStreamReceiver; + Drm &drm = getDrm(csr->getRootDeviceIndex()); + drm.checkResetStatus(*engine.osContext); + } + } +} + bool DrmMemoryManager::createDrmChunkedAllocation(Drm *drm, DrmAllocation *allocation, uint64_t boAddress, size_t boSize, size_t maxOsContextCount) { auto &storageInfo = allocation->storageInfo; auto memoryInfo = drm->getMemoryInfo(); diff --git a/shared/source/os_interface/linux/drm_memory_manager.h b/shared/source/os_interface/linux/drm_memory_manager.h index 130c79055d..730fc5ce8f 100644 --- a/shared/source/os_interface/linux/drm_memory_manager.h +++ b/shared/source/os_interface/linux/drm_memory_manager.h @@ -98,6 +98,8 @@ class DrmMemoryManager : public MemoryManager { size_t getSizeOfChunk(size_t allocSize); bool checkAllocationForChunking(size_t allocSize, size_t minSize, bool subDeviceEnabled, bool debugDisabled, bool modeEnabled, bool bufferEnabled); + MOCKABLE_VIRTUAL void checkUnexpectedGpuPageFault(); + protected: void registerSharedBoHandleAllocation(DrmAllocation *drmAllocation); BufferObjectHandleWrapper tryToGetBoHandleWrapperWithSharedOwnership(int boHandle); diff --git a/shared/source/os_interface/linux/drm_neo.cpp b/shared/source/os_interface/linux/drm_neo.cpp index cafc2e6849..e0c2594d4f 100644 --- a/shared/source/os_interface/linux/drm_neo.cpp +++ b/shared/source/os_interface/linux/drm_neo.cpp @@ -59,6 +59,17 @@ Drm::Drm(std::unique_ptr &&hwDeviceIdIn, RootDeviceEnvironment &r hwDeviceId(std::move(hwDeviceIdIn)), rootDeviceEnvironment(rootDeviceEnvironment) { pagingFence.fill(0u); fenceVal.fill(0u); + + if (rootDeviceEnvironment.executionEnvironment.isDebuggingEnabled()) { + disableScratch = false; + } + if (debugManager.flags.DisableScratchPages.get() != -1) { + disableScratch = debugManager.flags.DisableScratchPages.get(); + } + auto threshold = debugManager.flags.GpuFaultCheckThreshold.get(); + if (disableScratch && threshold != -1) { + gpuFaultCheckThreshold = threshold; + } } SubmissionStatus Drm::getSubmissionStatusFromReturnCode(int32_t retCode) { @@ -238,6 +249,20 @@ int Drm::queryGttSize(uint64_t >tSizeOutput) { } bool Drm::isGpuHangDetected(OsContext &osContext) { + bool ret = checkResetStatus(osContext); + if (gpuFaultCheckThreshold != 0) { + if (gpuFaultCheckCounter == gpuFaultCheckThreshold) { + auto memoryManager = static_cast(this->rootDeviceEnvironment.executionEnvironment.memoryManager.get()); + memoryManager->checkUnexpectedGpuPageFault(); + gpuFaultCheckCounter = 0; + return false; + } + gpuFaultCheckCounter++; + } + return ret; +} + +bool Drm::checkResetStatus(OsContext &osContext) { const auto osContextLinux = static_cast(&osContext); const auto &drmContextIds = osContextLinux->getDrmContextIds(); @@ -1419,14 +1444,6 @@ int Drm::createDrmVirtualMemory(uint32_t &drmVmId) { ctl.extensions = castToUint64(vmControlExtRegion.get()); } - bool disableScratch = false; - if (rootDeviceEnvironment.executionEnvironment.isDebuggingEnabled()) { - disableScratch = false; - } - if (debugManager.flags.DisableScratchPages.get() != -1) { - disableScratch = debugManager.flags.DisableScratchPages.get(); - } - bool useVmBind = isVmBindAvailable(); bool enablePageFault = hasPageFaultSupport() && useVmBind; diff --git a/shared/source/os_interface/linux/drm_neo.h b/shared/source/os_interface/linux/drm_neo.h index 83bbed37b5..dc83089928 100644 --- a/shared/source/os_interface/linux/drm_neo.h +++ b/shared/source/os_interface/linux/drm_neo.h @@ -130,6 +130,7 @@ class Drm : public DriverModel { PhysicalDevicePciBusInfo getPciBusInfo() const override; bool isGpuHangDetected(OsContext &osContext) override; + MOCKABLE_VIRTUAL bool checkResetStatus(OsContext &osContext); bool areNonPersistentContextsSupported() const { return nonPersistentContextsSupported; } void checkNonPersistentContextsSupport(); @@ -342,6 +343,10 @@ class Drm : public DriverModel { bool pageFaultSupported = false; bool completionFenceSupported = false; bool vmBindPatIndexProgrammingSupported = false; + bool disableScratch = false; + + uint32_t gpuFaultCheckThreshold = 0u; + uint32_t gpuFaultCheckCounter = 0u; private: int getParamIoctl(DrmParam param, int *dstValue); diff --git a/shared/test/common/mocks/linux/mock_drm_memory_manager.h b/shared/test/common/mocks/linux/mock_drm_memory_manager.h index 0b47ee01d7..b56a90757b 100644 --- a/shared/test/common/mocks/linux/mock_drm_memory_manager.h +++ b/shared/test/common/mocks/linux/mock_drm_memory_manager.h @@ -45,6 +45,7 @@ class TestedDrmMemoryManager : public MemoryManagerCreate { using DrmMemoryManager::allocatePhysicalLocalDeviceMemory; using DrmMemoryManager::allocationTypeForCompletionFence; using DrmMemoryManager::allocUserptr; + using DrmMemoryManager::checkUnexpectedGpuPageFault; using DrmMemoryManager::createAllocWithAlignment; using DrmMemoryManager::createAllocWithAlignmentFromUserptr; using DrmMemoryManager::createGraphicsAllocation; @@ -72,6 +73,7 @@ class TestedDrmMemoryManager : public MemoryManagerCreate { using DrmMemoryManager::registerSharedBoHandleAllocation; using DrmMemoryManager::releaseGpuRange; using DrmMemoryManager::retrieveMmapOffsetForBufferObject; + using DrmMemoryManager::secondaryEngines; using DrmMemoryManager::selectAlignmentAndHeap; using DrmMemoryManager::setDomainCpu; using DrmMemoryManager::sharedBoHandles; diff --git a/shared/test/common/os_interface/linux/device_command_stream_fixture.h b/shared/test/common/os_interface/linux/device_command_stream_fixture.h index 1b162d6d9c..71af90069e 100644 --- a/shared/test/common/os_interface/linux/device_command_stream_fixture.h +++ b/shared/test/common/os_interface/linux/device_command_stream_fixture.h @@ -9,6 +9,7 @@ #include "shared/source/helpers/hw_info.h" #include "shared/source/os_interface/linux/drm_memory_manager.h" #include "shared/source/os_interface/linux/drm_neo.h" +#include "shared/source/os_interface/os_context.h" #include "shared/test/common/helpers/default_hw_info.h" #include "shared/test/common/mocks/linux/mock_drm_wrappers.h" @@ -18,6 +19,7 @@ using NEO::Drm; using NEO::DrmIoctl; using NEO::HwDeviceIdDrm; +using NEO::OsContext; using NEO::RootDeviceEnvironment; extern const int mockFd; @@ -190,6 +192,11 @@ class DrmMockCustom : public Drm { return 0u; } + bool checkResetStatus(OsContext &osContext) override { + checkResetStatusCalled++; + return Drm::checkResetStatus(osContext); + } + Ioctls ioctlCnt{}; Ioctls ioctlExpected{}; @@ -203,6 +210,8 @@ class DrmMockCustom : public Drm { ChunkingModeCall getChunkingModeCall{}; IsChunkingAvailableCall isChunkingAvailableCall{}; + size_t checkResetStatusCalled = 0u; + std::atomic ioctlRes; std::atomic ioctlResExt; diff --git a/shared/test/common/test_files/igdrcl.config b/shared/test/common/test_files/igdrcl.config index a3fd12d622..d0e6fe4d60 100644 --- a/shared/test/common/test_files/igdrcl.config +++ b/shared/test/common/test_files/igdrcl.config @@ -456,6 +456,7 @@ AccessCountersGranularity = -1 OverridePatIndex = -1 UseTileMemoryBankInVirtualMemoryCreation = -1 DisableScratchPages = -1 +GpuFaultCheckThreshold = -1 ForceAllResourcesUncached = 0 ForcePreParserEnabledForMiArbCheck = -1 UseDynamicEventPacketsCount = -1 diff --git a/shared/test/unit_test/os_interface/linux/drm_memory_manager_tests.cpp b/shared/test/unit_test/os_interface/linux/drm_memory_manager_tests.cpp index 5789942f05..6b0308de87 100644 --- a/shared/test/unit_test/os_interface/linux/drm_memory_manager_tests.cpp +++ b/shared/test/unit_test/os_interface/linux/drm_memory_manager_tests.cpp @@ -327,6 +327,16 @@ TEST_F(DrmMemoryManagerTest, GivenAllocatePhysicalDeviceMemoryThenSuccessReturne memoryManager->freeGraphicsMemory(allocation); } +TEST_F(DrmMemoryManagerTest, whenCallingChekcUnexpectedGpuPagedfaultThenAllEnginesWereChecked) { + memoryManager->checkUnexpectedGpuPageFault(); + size_t allEnginesSize = 0u; + for (auto &engineContainer : memoryManager->allRegisteredEngines) { + allEnginesSize += engineContainer.size(); + } + ASSERT_NE(0u, allEnginesSize); + EXPECT_EQ(allEnginesSize, mock->checkResetStatusCalled); +} + TEST_F(DrmMemoryManagerWithExplicitExpectationsTest, givenDrmMemoryManagerWhenGpuAddressReservationIsAttemptedAtIndex1ThenAddressFromGfxPartitionIsUsed) { auto memoryManager = std::make_unique(false, true, false, *executionEnvironment); RootDeviceIndicesContainer rootDeviceIndices; @@ -7813,4 +7823,4 @@ TEST_F(DrmMemoryManagerTest, givenDebugVariableToToggleGpuVaBitsWhenAllocatingRe memoryManager->freeGraphicsMemory(allocation); } -} \ No newline at end of file +} diff --git a/shared/test/unit_test/os_interface/linux/drm_tests.cpp b/shared/test/unit_test/os_interface/linux/drm_tests.cpp index 14e71caa51..a029a495cc 100644 --- a/shared/test/unit_test/os_interface/linux/drm_tests.cpp +++ b/shared/test/unit_test/os_interface/linux/drm_tests.cpp @@ -23,6 +23,7 @@ #include "shared/test/common/helpers/test_files.h" #include "shared/test/common/helpers/variable_backup.h" #include "shared/test/common/libult/linux/drm_mock.h" +#include "shared/test/common/mocks/linux/mock_drm_memory_manager.h" #include "shared/test/common/mocks/linux/mock_ioctl_helper.h" #include "shared/test/common/mocks/mock_execution_environment.h" #include "shared/test/common/mocks/mock_memory_manager.h" @@ -1428,6 +1429,88 @@ TEST(DrmDeathTest, GivenResetStatsWithValidFaultWhenIsGpuHangIsCalledThenProcess EXPECT_THROW(drm.isGpuHangDetected(mockOsContextLinux), std::runtime_error); } +struct DrmMockCheckPageFault : public DrmMock { + public: + using DrmMock::DrmMock; + using DrmMock::gpuFaultCheckThreshold; +}; + +TEST(DrmTest, givenDisableScratchPagesWhenSettingGpuFaultCheckThresholdThenThesholdValueIsSet) { + constexpr unsigned int iteration = 3u; + constexpr unsigned int threshold = 3u; + ASSERT_NE(0u, iteration); + ASSERT_NE(0u, threshold); + DebugManagerStateRestore restore; + + auto executionEnvironment = std::make_unique(); + + debugManager.flags.DisableScratchPages.set(false); + debugManager.flags.GpuFaultCheckThreshold.set(threshold); + DrmMockCheckPageFault drm1{*executionEnvironment->rootDeviceEnvironments[0]}; + EXPECT_EQ(0u, drm1.gpuFaultCheckThreshold); + + debugManager.flags.DisableScratchPages.set(true); + debugManager.flags.GpuFaultCheckThreshold.set(-1); + DrmMockCheckPageFault drm2{*executionEnvironment->rootDeviceEnvironments[0]}; + EXPECT_EQ(0u, drm2.gpuFaultCheckThreshold); + + debugManager.flags.DisableScratchPages.set(true); + debugManager.flags.GpuFaultCheckThreshold.set(threshold); + DrmMockCheckPageFault drm3{*executionEnvironment->rootDeviceEnvironments[0]}; + EXPECT_EQ(threshold, drm3.gpuFaultCheckThreshold); +} + +struct MockDrmMemoryManagerCheckPageFault : public MockDrmMemoryManager { + using MockDrmMemoryManager::MockDrmMemoryManager; + void checkUnexpectedGpuPageFault() override { + checkUnexpectedGpuPageFaultCalled++; + } + size_t checkUnexpectedGpuPageFaultCalled = 0; +}; + +TEST(DrmTest, givenDisableScratchPagesSetWhenSettingGpuFaultCheckThresholdThenFaultCheckingIsHappeningAfterThreshold) { + constexpr unsigned int iteration = 3u; + constexpr unsigned int threshold = 3u; + ASSERT_NE(0u, iteration); + ASSERT_NE(0u, threshold); + DebugManagerStateRestore restore; + debugManager.flags.DisableScratchPages.set(true); + debugManager.flags.GpuFaultCheckThreshold.set(threshold); + + auto executionEnvironment = std::make_unique(); + auto rootDeviceEnvironment = executionEnvironment->rootDeviceEnvironments[0].get(); + rootDeviceEnvironment->setHwInfoAndInitHelpers(defaultHwInfo.get()); + rootDeviceEnvironment->osInterface = std::make_unique(); + rootDeviceEnvironment->osInterface->setDriverModel(std::unique_ptr(new DrmMock(*rootDeviceEnvironment))); + + auto memoryManager = new MockDrmMemoryManagerCheckPageFault(GemCloseWorkerMode::gemCloseWorkerInactive, false, false, *executionEnvironment); + executionEnvironment->memoryManager.reset(memoryManager); + auto &drm = *executionEnvironment->rootDeviceEnvironments[0]->osInterface->getDriverModel()->as(); + + uint32_t contextId{0}; + EngineDescriptor engineDescriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::regular})}; + + MockOsContextLinux mockOsContextLinux{drm, 0, contextId, engineDescriptor}; + mockOsContextLinux.drmContextIds.push_back(0); + + ResetStats resetStats{}; + resetStats.contextId = 0; + drm.resetStatsToReturn.push_back(resetStats); + + bool isGpuHangDetected{}; + for (auto i = 0u; i < iteration; i++) { + memoryManager->checkUnexpectedGpuPageFaultCalled = 0u; + for (auto j = 0u; j < threshold; j++) { + EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(mockOsContextLinux)); + EXPECT_FALSE(isGpuHangDetected); + EXPECT_EQ(0u, memoryManager->checkUnexpectedGpuPageFaultCalled); + } + EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(mockOsContextLinux)); + EXPECT_FALSE(isGpuHangDetected); + EXPECT_EQ(1u, memoryManager->checkUnexpectedGpuPageFaultCalled); + } +} + TEST(DrmTest, givenSetupIoctlHelperWhenCalledTwiceThenIoctlHelperIsSetOnlyOnce) { auto executionEnvironment = std::make_unique(); DrmMock drm{*executionEnvironment->rootDeviceEnvironments[0]};