fix: Don't abort application due to gpu fault when debugging is enabled

Signed-off-by: Brandon Yates <brandon.yates@intel.com>
This commit is contained in:
Brandon Yates
2025-01-27 23:44:23 +00:00
committed by Compute-Runtime-Automation
parent c306c457db
commit 106e8be9a9
2 changed files with 35 additions and 1 deletions

View File

@@ -253,7 +253,8 @@ bool Drm::checkResetStatus(OsContext &osContext) {
uint32_t status = 0;
const auto retVal{ioctlHelper->getResetStats(resetStats, &status, &fault)};
UNRECOVERABLE_IF(retVal != 0);
if (checkToDisableScratchPage() && ioctlHelper->validPageFault(fault.flags)) {
auto debuggingEnabled = rootDeviceEnvironment.executionEnvironment.isDebuggingEnabled();
if (!debuggingEnabled && checkToDisableScratchPage() && ioctlHelper->validPageFault(fault.flags)) {
bool banned = ((status & ioctlHelper->getStatusForResetStats(true)) != 0);
IoFunctions::fprintf(stderr, "Segmentation fault from GPU at 0x%llx, ctx_id: %u (%s) type: %d (%s), level: %d (%s), access: %d (%s), banned: %d, aborting.\n",
fault.addr,

View File

@@ -1469,6 +1469,39 @@ class MockIoctlHelperResetStats : public MockIoctlHelper {
ResetStatsFault resetStatsFaultReturnValue{};
};
// Checks that isGpuHangDetected() returns false (instead of terminating the
// process) when reset stats carry a fault while the DisableScratchPages flag
// is set and the execution environment is in online debugging mode.
TEST(DrmTest, GivenResetStatsWithValidFaultAndDebuggingEnabledWhenIsGpuHangIsCalledThenProcessNotTerminated) {
    // Restores debug flags when the test scope ends.
    DebugManagerStateRestore restore;
    debugManager.flags.DisableScratchPages.set(true);

    MockExecutionEnvironment executionEnvironment{};
    DrmMock drm{*executionEnvironment.rootDeviceEnvironments[0]};

    // Debugging mode is switched on before the scratch-page policy and the
    // fault-check threshold are configured, same as in the original sequence.
    executionEnvironment.setDebuggingMode(NEO::DebuggingMode::online);
    drm.configureScratchPagePolicy();
    drm.configureGpuFaultCheckThreshold();

    uint32_t osContextId{0};
    const EngineDescriptor bcsDescriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::regular})};
    auto resetStatsHelper = std::make_unique<MockIoctlHelperResetStats>(drm);

    MockOsContextLinux osContext{drm, 0, osContextId, bcsDescriptor};
    osContext.drmContextIds.push_back(0);

    // Reset stats entry for DRM context id 0, queued on the mock.
    ResetStats queuedResetStats{};
    queuedResetStats.contextId = 0;
    drm.resetStatsToReturn.push_back(queuedResetStats);

    // Fault record handed back by the mock ioctl helper.
    // NOTE(review): flags = 1 presumably marks the fault as valid for
    // validPageFault() (per the test name) - confirm against the mock helper.
    ResetStatsFault reportedFault{};
    reportedFault.addr = 0x1234;
    reportedFault.type = 2;
    reportedFault.level = 3;
    reportedFault.flags = 1;

    resetStatsHelper->statusReturnValue = 2u;
    resetStatsHelper->resetStatsFaultReturnValue = reportedFault;
    drm.ioctlHelper = std::move(resetStatsHelper);

    const bool hangDetected = drm.isGpuHangDetected(osContext);
    EXPECT_FALSE(hangDetected);
}
TEST(DrmDeathTest, GivenResetStatsWithValidFaultWhenIsGpuHangIsCalledThenProcessTerminated) {
DebugManagerStateRestore restore;
debugManager.flags.DisableScratchPages.set(true);