feature: add logic to iterate for all contexts to check GPU pagefault

Iterate over all contexts in the process and query the reset status of
each one to detect an unexpected GPU page fault.

Added a new debug variable, GpuFaultCheckThreshold, that controls how often
the page fault check runs relative to hang checks, for performance analysis.

Related-To: GSD-5673
Signed-off-by: Young Jin Yoon <young.jin.yoon@intel.com>
This commit is contained in:
Young Jin Yoon
2024-02-26 09:53:24 +00:00
committed by Compute-Runtime-Automation
parent 5111f30116
commit 82728ff394
10 changed files with 149 additions and 9 deletions

View File

@@ -45,6 +45,7 @@ class TestedDrmMemoryManager : public MemoryManagerCreate<DrmMemoryManager> {
using DrmMemoryManager::allocatePhysicalLocalDeviceMemory;
using DrmMemoryManager::allocationTypeForCompletionFence;
using DrmMemoryManager::allocUserptr;
using DrmMemoryManager::checkUnexpectedGpuPageFault;
using DrmMemoryManager::createAllocWithAlignment;
using DrmMemoryManager::createAllocWithAlignmentFromUserptr;
using DrmMemoryManager::createGraphicsAllocation;
@@ -72,6 +73,7 @@ class TestedDrmMemoryManager : public MemoryManagerCreate<DrmMemoryManager> {
using DrmMemoryManager::registerSharedBoHandleAllocation;
using DrmMemoryManager::releaseGpuRange;
using DrmMemoryManager::retrieveMmapOffsetForBufferObject;
using DrmMemoryManager::secondaryEngines;
using DrmMemoryManager::selectAlignmentAndHeap;
using DrmMemoryManager::setDomainCpu;
using DrmMemoryManager::sharedBoHandles;

View File

@@ -9,6 +9,7 @@
#include "shared/source/helpers/hw_info.h"
#include "shared/source/os_interface/linux/drm_memory_manager.h"
#include "shared/source/os_interface/linux/drm_neo.h"
#include "shared/source/os_interface/os_context.h"
#include "shared/test/common/helpers/default_hw_info.h"
#include "shared/test/common/mocks/linux/mock_drm_wrappers.h"
@@ -18,6 +19,7 @@
using NEO::Drm;
using NEO::DrmIoctl;
using NEO::HwDeviceIdDrm;
using NEO::OsContext;
using NEO::RootDeviceEnvironment;
extern const int mockFd;
@@ -190,6 +192,11 @@ class DrmMockCustom : public Drm {
return 0u;
}
// Records that a reset-status query happened (so tests can count the calls)
// and then forwards the query to the base Drm implementation.
bool checkResetStatus(OsContext &osContext) override {
    ++checkResetStatusCalled;
    const bool resetStatus = Drm::checkResetStatus(osContext);
    return resetStatus;
}
Ioctls ioctlCnt{};
Ioctls ioctlExpected{};
@@ -203,6 +210,8 @@ class DrmMockCustom : public Drm {
ChunkingModeCall getChunkingModeCall{};
IsChunkingAvailableCall isChunkingAvailableCall{};
size_t checkResetStatusCalled = 0u;
std::atomic<int> ioctlRes;
std::atomic<IoctlResExt *> ioctlResExt;

View File

@@ -456,6 +456,7 @@ AccessCountersGranularity = -1
OverridePatIndex = -1
UseTileMemoryBankInVirtualMemoryCreation = -1
DisableScratchPages = -1
GpuFaultCheckThreshold = -1
ForceAllResourcesUncached = 0
ForcePreParserEnabledForMiArbCheck = -1
UseDynamicEventPacketsCount = -1

View File

@@ -327,6 +327,16 @@ TEST_F(DrmMemoryManagerTest, GivenAllocatePhysicalDeviceMemoryThenSuccessReturne
memoryManager->freeGraphicsMemory(allocation);
}
// Expects one reset-status query on the mock per entry in
// allRegisteredEngines after a single checkUnexpectedGpuPageFault() call.
TEST_F(DrmMemoryManagerTest, whenCallingChekcUnexpectedGpuPagedfaultThenAllEnginesWereChecked) {
    memoryManager->checkUnexpectedGpuPageFault();

    // Sum the sizes of all engine containers to get the expected call count.
    size_t registeredEngineCount = 0u;
    for (auto &engines : memoryManager->allRegisteredEngines) {
        registeredEngineCount += engines.size();
    }

    // Guard against a vacuous pass when no engine is registered at all.
    ASSERT_NE(0u, registeredEngineCount);
    EXPECT_EQ(registeredEngineCount, mock->checkResetStatusCalled);
}
TEST_F(DrmMemoryManagerWithExplicitExpectationsTest, givenDrmMemoryManagerWhenGpuAddressReservationIsAttemptedAtIndex1ThenAddressFromGfxPartitionIsUsed) {
auto memoryManager = std::make_unique<TestedDrmMemoryManager>(false, true, false, *executionEnvironment);
RootDeviceIndicesContainer rootDeviceIndices;
@@ -7813,4 +7823,4 @@ TEST_F(DrmMemoryManagerTest, givenDebugVariableToToggleGpuVaBitsWhenAllocatingRe
memoryManager->freeGraphicsMemory(allocation);
}
}
}

View File

@@ -23,6 +23,7 @@
#include "shared/test/common/helpers/test_files.h"
#include "shared/test/common/helpers/variable_backup.h"
#include "shared/test/common/libult/linux/drm_mock.h"
#include "shared/test/common/mocks/linux/mock_drm_memory_manager.h"
#include "shared/test/common/mocks/linux/mock_ioctl_helper.h"
#include "shared/test/common/mocks/mock_execution_environment.h"
#include "shared/test/common/mocks/mock_memory_manager.h"
@@ -1428,6 +1429,88 @@ TEST(DrmDeathTest, GivenResetStatsWithValidFaultWhenIsGpuHangIsCalledThenProcess
EXPECT_THROW(drm.isGpuHangDetected(mockOsContextLinux), std::runtime_error);
}
// Test helper derived from DrmMock: inherits its constructors and makes
// gpuFaultCheckThreshold publicly accessible so tests can read it.
struct DrmMockCheckPageFault : DrmMock {
    using DrmMock::DrmMock;
    using DrmMock::gpuFaultCheckThreshold;
};
// Checks which combinations of the DisableScratchPages and
// GpuFaultCheckThreshold debug flags lead a freshly constructed Drm to store
// a non-zero gpuFaultCheckThreshold. Each scenario constructs a new mock
// after the flags have been set.
TEST(DrmTest, givenDisableScratchPagesWhenSettingGpuFaultCheckThresholdThenThesholdValueIsSet) {
    constexpr unsigned int threshold = 3u;
    // A zero threshold would make the final expectation indistinguishable
    // from the "threshold not applied" scenarios below.
    ASSERT_NE(0u, threshold);

    DebugManagerStateRestore restore;
    auto executionEnvironment = std::make_unique<MockExecutionEnvironment>();

    // Scenario 1: DisableScratchPages is off -> the requested threshold is
    // expected to be ignored.
    debugManager.flags.DisableScratchPages.set(false);
    debugManager.flags.GpuFaultCheckThreshold.set(threshold);
    DrmMockCheckPageFault drm1{*executionEnvironment->rootDeviceEnvironments[0]};
    EXPECT_EQ(0u, drm1.gpuFaultCheckThreshold);

    // Scenario 2: DisableScratchPages is on but the threshold flag is left at
    // -1 -> the stored threshold is expected to stay 0.
    debugManager.flags.DisableScratchPages.set(true);
    debugManager.flags.GpuFaultCheckThreshold.set(-1);
    DrmMockCheckPageFault drm2{*executionEnvironment->rootDeviceEnvironments[0]};
    EXPECT_EQ(0u, drm2.gpuFaultCheckThreshold);

    // Scenario 3: both flags are set -> the requested threshold is expected
    // to be stored.
    debugManager.flags.DisableScratchPages.set(true);
    debugManager.flags.GpuFaultCheckThreshold.set(threshold);
    DrmMockCheckPageFault drm3{*executionEnvironment->rootDeviceEnvironments[0]};
    EXPECT_EQ(threshold, drm3.gpuFaultCheckThreshold);
}
// Mock memory manager whose checkUnexpectedGpuPageFault() override does
// nothing except count how many times it was invoked.
struct MockDrmMemoryManagerCheckPageFault : MockDrmMemoryManager {
    using MockDrmMemoryManager::MockDrmMemoryManager;

    // Number of checkUnexpectedGpuPageFault() invocations; tests reset and
    // read this directly.
    size_t checkUnexpectedGpuPageFaultCalled{0};

    void checkUnexpectedGpuPageFault() override {
        ++checkUnexpectedGpuPageFaultCalled;
    }
};
// With DisableScratchPages and GpuFaultCheckThreshold set, repeatedly calls
// isGpuHangDetected() and expects the memory manager's page fault check to be
// skipped for `threshold` consecutive calls and to run on the following call.
// The pattern is verified for several consecutive rounds.
TEST(DrmTest, givenDisableScratchPagesSetWhenSettingGpuFaultCheckThresholdThenFaultCheckingIsHappeningAfterThreshold) {
    constexpr unsigned int iteration = 3u;
    constexpr unsigned int threshold = 3u;
    ASSERT_NE(0u, iteration);
    ASSERT_NE(0u, threshold);

    DebugManagerStateRestore restore;
    debugManager.flags.DisableScratchPages.set(true);
    debugManager.flags.GpuFaultCheckThreshold.set(threshold);

    // Build an execution environment whose driver model is a DrmMock and whose
    // memory manager counts checkUnexpectedGpuPageFault() invocations.
    auto executionEnvironment = std::make_unique<MockExecutionEnvironment>();
    auto rootDeviceEnvironment = executionEnvironment->rootDeviceEnvironments[0].get();
    rootDeviceEnvironment->setHwInfoAndInitHelpers(defaultHwInfo.get());
    rootDeviceEnvironment->osInterface = std::make_unique<OSInterface>();
    rootDeviceEnvironment->osInterface->setDriverModel(std::unique_ptr<DriverModel>(new DrmMock(*rootDeviceEnvironment)));

    // Ownership of the raw pointer passes to the execution environment right
    // away; the pointer is kept only to inspect the counter.
    auto memoryManager = new MockDrmMemoryManagerCheckPageFault(GemCloseWorkerMode::gemCloseWorkerInactive, false, false, *executionEnvironment);
    executionEnvironment->memoryManager.reset(memoryManager);

    auto &drm = *executionEnvironment->rootDeviceEnvironments[0]->osInterface->getDriverModel()->as<DrmMock>();

    uint32_t contextId{0};
    EngineDescriptor engineDescriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::regular})};
    MockOsContextLinux mockOsContextLinux{drm, 0, contextId, engineDescriptor};
    mockOsContextLinux.drmContextIds.push_back(0);

    // Default-initialized reset stats for context 0.
    ResetStats resetStats{};
    resetStats.contextId = 0;
    drm.resetStatsToReturn.push_back(resetStats);

    // Performs one hang check and verifies that it neither throws nor reports
    // a hang, and that the page fault check counter has the expected value.
    bool isGpuHangDetected{};
    auto hangCheckAndExpectFaultChecks = [&](size_t expectedFaultChecks) {
        EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(mockOsContextLinux));
        EXPECT_FALSE(isGpuHangDetected);
        EXPECT_EQ(expectedFaultChecks, memoryManager->checkUnexpectedGpuPageFaultCalled);
    };

    for (unsigned int round = 0u; round < iteration; ++round) {
        memoryManager->checkUnexpectedGpuPageFaultCalled = 0u;

        // The first `threshold` hang checks must not trigger the fault check.
        for (unsigned int call = 0u; call < threshold; ++call) {
            hangCheckAndExpectFaultChecks(0u);
        }

        // The next hang check is the one that triggers it.
        hangCheckAndExpectFaultChecks(1u);
    }
}
TEST(DrmTest, givenSetupIoctlHelperWhenCalledTwiceThenIoctlHelperIsSetOnlyOnce) {
auto executionEnvironment = std::make_unique<MockExecutionEnvironment>();
DrmMock drm{*executionEnvironment->rootDeviceEnvironments[0]};