fix: print to stdout for disable scratch page

Print the GPU page fault error message to stdout as well as stderr when
the disable scratch page option is used.

Related-To: GSD-7611
Signed-off-by: Young Jin Yoon <young.jin.yoon@intel.com>
This commit is contained in:
Young Jin Yoon
2024-05-11 07:43:42 +00:00
committed by Compute-Runtime-Automation
parent 4b7e2d5064
commit e204d27190
7 changed files with 214 additions and 13 deletions

View File

@@ -93,6 +93,8 @@ set(NEO_CORE_HELPERS
${CMAKE_CURRENT_SOURCE_DIR}/gfx_core_helper_bdw_to_icllp.inl
${CMAKE_CURRENT_SOURCE_DIR}/gfx_core_helper_pvc_and_later.inl
${CMAKE_CURRENT_SOURCE_DIR}/gfx_core_helper_tgllp_and_later.inl
${CMAKE_CURRENT_SOURCE_DIR}/gpu_page_fault_helper.cpp
${CMAKE_CURRENT_SOURCE_DIR}/gpu_page_fault_helper.h
${CMAKE_CURRENT_SOURCE_DIR}/hw_info.cpp
${CMAKE_CURRENT_SOURCE_DIR}/hw_info.h
${CMAKE_CURRENT_SOURCE_DIR}/hw_info_helper.cpp

View File

@@ -0,0 +1,55 @@
/*
* Copyright (C) 2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/helpers/gpu_page_fault_helper.h"
namespace NEO::GpuPageFaultHelpers {
// Returns the display name for a FaultType value.
// Any value that matches none of the enumerators handled here yields "Unknown".
std::string faultTypeToString(FaultType type) {
    if (type == FaultType::notPresent) {
        return "NotPresent";
    }
    if (type == FaultType::writeAccessViolation) {
        return "WriteAccessViolation";
    }
    if (type == FaultType::atomicAccessViolation) {
        return "AtomicAccessViolation";
    }
    return "Unknown";
}
// Returns the display name for a FaultAccess value.
// Any value that matches none of the enumerators handled here yields "Unknown".
std::string faultAccessToString(FaultAccess access) {
    const char *accessName = "Unknown";
    switch (access) {
    case FaultAccess::read:
        accessName = "Read";
        break;
    case FaultAccess::write:
        accessName = "Write";
        break;
    case FaultAccess::atomic:
        accessName = "Atomic";
        break;
    default:
        break;
    }
    return accessName;
}
// Returns the display name for a FaultLevel value.
// Any value that is not listed in the table below yields "Unknown".
std::string faultLevelToString(FaultLevel level) {
    struct LevelName {
        FaultLevel level;
        const char *name;
    };
    static constexpr LevelName knownLevels[] = {
        {FaultLevel::pte, "PTE"},
        {FaultLevel::pde, "PDE"},
        {FaultLevel::pdp, "PDP"},
        {FaultLevel::pml4, "PML4"},
        {FaultLevel::pml5, "PML5"},
    };
    for (const auto &candidate : knownLevels) {
        if (candidate.level == level) {
            return candidate.name;
        }
    }
    return "Unknown";
}
} // namespace NEO::GpuPageFaultHelpers

View File

@@ -0,0 +1,41 @@
/*
* Copyright (C) 2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#pragma once
#include <cstdint>
#include <string>
namespace NEO {
// Page fault type codes, stored in a 16-bit unsigned underlying type.
enum class FaultType : uint16_t {
    notPresent = 0,           // 0b00
    writeAccessViolation = 1, // 0b01
    atomicAccessViolation = 2 // 0b10
};
// Page fault access codes, stored in a 16-bit unsigned underlying type.
enum class FaultAccess : uint16_t {
    read = 0,  // 0b00
    write = 1, // 0b01
    atomic = 2 // 0b10
};
// Page fault level codes, stored in a 16-bit unsigned underlying type.
enum class FaultLevel : uint16_t {
    pte = 0,  // 0b000
    pde = 1,  // 0b001
    pdp = 2,  // 0b010
    pml4 = 3, // 0b011
    pml5 = 4  // 0b100
};
// Helpers that convert the fault enums declared in this header to std::string.
namespace GpuPageFaultHelpers {
// Returns the string form of a FaultType value.
std::string faultTypeToString(FaultType type);
// Returns the string form of a FaultAccess value.
std::string faultAccessToString(FaultAccess access);
// Returns the string form of a FaultLevel value.
std::string faultLevelToString(FaultLevel level);
} // namespace GpuPageFaultHelpers
} // namespace NEO

View File

@@ -20,6 +20,7 @@
#include "shared/source/helpers/constants.h"
#include "shared/source/helpers/debug_helpers.h"
#include "shared/source/helpers/gfx_core_helper.h"
#include "shared/source/helpers/gpu_page_fault_helper.h"
#include "shared/source/helpers/hw_info.h"
#include "shared/source/helpers/ptr_math.h"
#include "shared/source/os_interface/driver_info.h"
@@ -265,9 +266,23 @@ bool Drm::checkResetStatus(OsContext &osContext) {
const auto retVal{ioctlHelper->getResetStats(resetStats, &status, &fault)};
UNRECOVERABLE_IF(retVal != 0);
if (checkToDisableScratchPage() && ioctlHelper->validPageFault(fault.flags)) {
bool banned = ((status & ioctlHelper->getStatusForResetStats(true)) == 0);
PRINT_DEBUG_STRING(debugManager.flags.PrintDebugMessages.get(), stderr, "ERROR: Unexpected page fault from GPU at 0x%llx, type: %d, level: %d, access: %d, banned: %d, aborting.\n",
fault.addr, fault.type, fault.level, fault.access, banned);
bool banned = ((status & ioctlHelper->getStatusForResetStats(true)) != 0);
fprintf(stderr, "FATAL: Unexpected page fault from GPU at 0x%lx, ctx_id: %u (%s) type: %d (%s), level: %d (%s), access: %d (%s), banned: %d, aborting.\n",
fault.addr,
resetStats.contextId,
EngineHelpers::engineTypeToString(osContext.getEngineType()).c_str(),
fault.type, GpuPageFaultHelpers::faultTypeToString(static_cast<FaultType>(fault.type)).c_str(),
fault.level, GpuPageFaultHelpers::faultLevelToString(static_cast<FaultLevel>(fault.level)).c_str(),
fault.access, GpuPageFaultHelpers::faultAccessToString(static_cast<FaultAccess>(fault.access)).c_str(),
banned);
fprintf(stdout, "FATAL: Unexpected page fault from GPU at 0x%lx, ctx_id: %u (%s) type: %d (%s), level: %d (%s), access: %d (%s), banned: %d, aborting.\n",
fault.addr,
resetStats.contextId,
EngineHelpers::engineTypeToString(osContext.getEngineType()).c_str(),
fault.type, GpuPageFaultHelpers::faultTypeToString(static_cast<FaultType>(fault.type)).c_str(),
fault.level, GpuPageFaultHelpers::faultLevelToString(static_cast<FaultLevel>(fault.level)).c_str(),
fault.access, GpuPageFaultHelpers::faultAccessToString(static_cast<FaultAccess>(fault.access)).c_str(),
banned);
UNRECOVERABLE_IF(true);
}
if (resetStats.batchActive > 0 || resetStats.batchPending > 0) {

View File

@@ -1,5 +1,5 @@
#
# Copyright (C) 2018-2023 Intel Corporation
# Copyright (C) 2018-2024 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
@@ -32,6 +32,7 @@ target_sources(neo_shared_tests PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/hw_aot_config_tests.cpp
${CMAKE_CURRENT_SOURCE_DIR}/gfx_core_helper_default_tests.cpp
${CMAKE_CURRENT_SOURCE_DIR}/gfx_core_helper_tests.cpp
${CMAKE_CURRENT_SOURCE_DIR}/gpu_page_fault_helper_tests.cpp
${CMAKE_CURRENT_SOURCE_DIR}/local_id_tests.cpp
${CMAKE_CURRENT_SOURCE_DIR}/l3_range_tests.cpp
${CMAKE_CURRENT_SOURCE_DIR}/kernel_helpers_tests.cpp

View File

@@ -0,0 +1,34 @@
/*
* Copyright (C) 2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/helpers/gpu_page_fault_helper.h"
#include "shared/test/common/test_macros/test.h"
using namespace NEO;
// Verifies the string returned for each FaultType enumerator and for a value (0xcc)
// that matches none of them.
TEST(GpuPageFaultHelperTest, givenValidAndInvalidFaultTypesWhenGettingStringRepresentationThenItIsCorrect) {
    const auto unlistedFaultType = static_cast<FaultType>(0xcc);

    const std::string notPresentName = GpuPageFaultHelpers::faultTypeToString(FaultType::notPresent);
    EXPECT_EQ(std::string{"NotPresent"}, notPresentName);

    const std::string writeViolationName = GpuPageFaultHelpers::faultTypeToString(FaultType::writeAccessViolation);
    EXPECT_EQ(std::string{"WriteAccessViolation"}, writeViolationName);

    const std::string atomicViolationName = GpuPageFaultHelpers::faultTypeToString(FaultType::atomicAccessViolation);
    EXPECT_EQ(std::string{"AtomicAccessViolation"}, atomicViolationName);

    const std::string unlistedName = GpuPageFaultHelpers::faultTypeToString(unlistedFaultType);
    EXPECT_EQ(std::string{"Unknown"}, unlistedName);
}
// Verifies the string returned for each FaultAccess enumerator and for a value (0xcc)
// that matches none of them.
TEST(GpuPageFaultHelperTest, givenValidAndInvalidFaultAccessesWhenGettingStringRepresentationThenItIsCorrect) {
    const std::string expectedRead{"Read"};
    const std::string expectedWrite{"Write"};
    const std::string expectedAtomic{"Atomic"};
    const std::string expectedUnknown{"Unknown"};

    EXPECT_EQ(expectedRead, GpuPageFaultHelpers::faultAccessToString(FaultAccess::read));
    EXPECT_EQ(expectedWrite, GpuPageFaultHelpers::faultAccessToString(FaultAccess::write));
    EXPECT_EQ(expectedAtomic, GpuPageFaultHelpers::faultAccessToString(FaultAccess::atomic));
    EXPECT_EQ(expectedUnknown, GpuPageFaultHelpers::faultAccessToString(static_cast<FaultAccess>(0xcc)));
}
// Verifies the string returned for each FaultLevel enumerator and for a value (0xcc)
// that matches none of them.
TEST(GpuPageFaultHelperTest, givenValidAndInvalidFaultLevelWhenGettingStringRepresentationThenItIsCorrect) {
    struct LevelExpectation {
        FaultLevel level;
        const char *text;
    };
    const LevelExpectation expectations[] = {
        {FaultLevel::pte, "PTE"},
        {FaultLevel::pde, "PDE"},
        {FaultLevel::pdp, "PDP"},
        {FaultLevel::pml4, "PML4"},
        {FaultLevel::pml5, "PML5"},
        {static_cast<FaultLevel>(0xcc), "Unknown"},
    };
    for (const auto &expectation : expectations) {
        EXPECT_EQ(std::string{expectation.text}, GpuPageFaultHelpers::faultLevelToString(expectation.level));
    }
}

View File

@@ -7,6 +7,7 @@
#include "shared/source/command_stream/submission_status.h"
#include "shared/source/helpers/file_io.h"
#include "shared/source/helpers/gpu_page_fault_helper.h"
#include "shared/source/helpers/hw_info.h"
#include "shared/source/os_interface/device_factory.h"
#include "shared/source/os_interface/driver_info.h"
@@ -1414,14 +1415,31 @@ TEST(DrmTest, GivenBatchPendingGreaterThanZeroResetStatsWhenIsGpuHangIsCalledThe
// Test double derived from MockIoctlHelper; the values its overrides return are taken
// from the public member fields declared at the bottom of the class.
// NOTE(review): this span appears to come from a rendered diff in which removed and added
// lines are shown together without +/- markers (see the back-to-back return statements
// below) — confirm against the real file before relying on the exact statement order.
class MockIoctlHelperResetStats : public MockIoctlHelper {
public:
using MockIoctlHelper::MockIoctlHelper;
// Calls the base MockIoctlHelper::getResetStats, then, for each non-null output pointer,
// overwrites *status with statusReturnValue and *resetStatsFault with
// resetStatsFaultReturnValue. Returns the base call's result.
int getResetStats(ResetStats &resetStats, uint32_t *status, ResetStatsFault *resetStatsFault) override {
int ret = MockIoctlHelper::getResetStats(resetStats, status, resetStatsFault);
if (status) {
*status = statusReturnValue;
}
if (resetStatsFault) {
*resetStatsFault = resetStatsFaultReturnValue;
}
return ret;
}
// The flags argument is not inspected.
// NOTE(review): two return statements follow each other here; presumably one is the
// removed line and the other the added line of the diff — verify which one is current.
bool validPageFault(uint16_t flags) override {
return validPageFaultReturnValue;
return true;
}
// NOTE(review): as above, the leading return looks like the removed diff line; the
// if/else after it returns statusReturnValue when banned is true and 0u otherwise.
uint32_t getStatusForResetStats(bool banned) override {
return getStatusForResetStatsReturnValue;
if (banned) {
return statusReturnValue;
} else {
return 0u;
}
}
// Values returned by the overrides above; tests assign them before use.
bool validPageFaultReturnValue = 0;
uint32_t getStatusForResetStatsReturnValue = 0;
uint32_t statusReturnValue = 0;
ResetStatsFault resetStatsFaultReturnValue{};
};
TEST(DrmDeathTest, GivenResetStatsWithValidFaultWhenIsGpuHangIsCalledThenProcessTerminated) {
@@ -1439,15 +1457,50 @@ TEST(DrmDeathTest, GivenResetStatsWithValidFaultWhenIsGpuHangIsCalledThenProcess
MockOsContextLinux mockOsContextLinux{drm, 0, contextId, engineDescriptor};
mockOsContextLinux.drmContextIds.push_back(0);
ResetStats resetStats{};
resetStats.contextId = 0;
drm.resetStatsToReturn.push_back(resetStats);
ResetStats resetStatsExpected{};
ResetStatsFault resetStatsFaultExpected{};
resetStatsExpected.contextId = 0;
drm.resetStatsToReturn.push_back(resetStatsExpected);
resetStatsFaultExpected.flags = 1;
resetStatsFaultExpected.addr = 0x1234;
resetStatsFaultExpected.type = 2;
resetStatsFaultExpected.level = 3;
ioctlHelper->statusReturnValue = 2u;
ioctlHelper->resetStatsFaultReturnValue = resetStatsFaultExpected;
ioctlHelper->getStatusForResetStatsReturnValue = 1;
ioctlHelper->validPageFaultReturnValue = true;
drm.ioctlHelper = std::move(ioctlHelper);
int strSize = std::snprintf(nullptr, 0, "FATAL: Unexpected page fault from GPU at 0x%lx, ctx_id: %u (%s) type: %d (%s), level: %d (%s), access: %d (%s), banned: %d, aborting.\n",
resetStatsFaultExpected.addr,
resetStatsExpected.contextId,
EngineHelpers::engineTypeToString(aub_stream::ENGINE_BCS).c_str(),
resetStatsFaultExpected.type, GpuPageFaultHelpers::faultTypeToString(static_cast<FaultType>(resetStatsFaultExpected.type)).c_str(),
resetStatsFaultExpected.level, GpuPageFaultHelpers::faultLevelToString(static_cast<FaultLevel>(resetStatsFaultExpected.level)).c_str(),
resetStatsFaultExpected.access, GpuPageFaultHelpers::faultAccessToString(static_cast<FaultAccess>(resetStatsFaultExpected.access)).c_str(),
true) +
1;
std::unique_ptr<char[]> buf(new char[strSize]);
std::snprintf(buf.get(), strSize, "FATAL: Unexpected page fault from GPU at 0x%lx, ctx_id: %u (%s) type: %d (%s), level: %d (%s), access: %d (%s), banned: %d, aborting.\n",
resetStatsFaultExpected.addr,
resetStatsExpected.contextId,
EngineHelpers::engineTypeToString(aub_stream::ENGINE_BCS).c_str(),
resetStatsFaultExpected.type, GpuPageFaultHelpers::faultTypeToString(static_cast<FaultType>(resetStatsFaultExpected.type)).c_str(),
resetStatsFaultExpected.level, GpuPageFaultHelpers::faultLevelToString(static_cast<FaultLevel>(resetStatsFaultExpected.level)).c_str(),
resetStatsFaultExpected.access, GpuPageFaultHelpers::faultAccessToString(static_cast<FaultAccess>(resetStatsFaultExpected.access)).c_str(),
true);
std::string expectedString = std::string(buf.get());
::testing::internal::CaptureStderr();
::testing::internal::CaptureStdout();
EXPECT_THROW(drm.isGpuHangDetected(mockOsContextLinux), std::runtime_error);
auto stderrString = ::testing::internal::GetCapturedStderr();
auto stdoutString = ::testing::internal::GetCapturedStdout();
EXPECT_EQ(expectedString, stderrString);
EXPECT_EQ(expectedString, stdoutString);
}
struct DrmMockCheckPageFault : public DrmMock {