mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-20 17:00:59 +08:00
fix: print to stdout for disable scratch page
Changed the error message to be printed to stdout when the disable-scratch-page option is in use. Related-To: GSD-7611 Signed-off-by: Young Jin Yoon <young.jin.yoon@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
4b7e2d5064
commit
e204d27190
@@ -93,6 +93,8 @@ set(NEO_CORE_HELPERS
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/gfx_core_helper_bdw_to_icllp.inl
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/gfx_core_helper_pvc_and_later.inl
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/gfx_core_helper_tgllp_and_later.inl
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/gpu_page_fault_helper.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/gpu_page_fault_helper.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/hw_info.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/hw_info.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/hw_info_helper.cpp
|
||||
|
||||
55
shared/source/helpers/gpu_page_fault_helper.cpp
Normal file
55
shared/source/helpers/gpu_page_fault_helper.cpp
Normal file
@@ -0,0 +1,55 @@
|
||||
/*
|
||||
* Copyright (C) 2024 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#include "shared/source/helpers/gpu_page_fault_helper.h"
|
||||
|
||||
namespace NEO::GpuPageFaultHelpers {
|
||||
|
||||
std::string faultTypeToString(FaultType type) {
|
||||
switch (type) {
|
||||
case FaultType::notPresent:
|
||||
return "NotPresent";
|
||||
case FaultType::writeAccessViolation:
|
||||
return "WriteAccessViolation";
|
||||
case FaultType::atomicAccessViolation:
|
||||
return "AtomicAccessViolation";
|
||||
default:
|
||||
return "Unknown";
|
||||
}
|
||||
}
|
||||
|
||||
std::string faultAccessToString(FaultAccess access) {
|
||||
switch (access) {
|
||||
case FaultAccess::read:
|
||||
return "Read";
|
||||
case FaultAccess::write:
|
||||
return "Write";
|
||||
case FaultAccess::atomic:
|
||||
return "Atomic";
|
||||
default:
|
||||
return "Unknown";
|
||||
}
|
||||
}
|
||||
|
||||
std::string faultLevelToString(FaultLevel level) {
|
||||
switch (level) {
|
||||
case FaultLevel::pte:
|
||||
return "PTE";
|
||||
case FaultLevel::pde:
|
||||
return "PDE";
|
||||
case FaultLevel::pdp:
|
||||
return "PDP";
|
||||
case FaultLevel::pml4:
|
||||
return "PML4";
|
||||
case FaultLevel::pml5:
|
||||
return "PML5";
|
||||
default:
|
||||
return "Unknown";
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace NEO::GpuPageFaultHelpers
|
||||
41
shared/source/helpers/gpu_page_fault_helper.h
Normal file
41
shared/source/helpers/gpu_page_fault_helper.h
Normal file
@@ -0,0 +1,41 @@
|
||||
/*
|
||||
* Copyright (C) 2024 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
|
||||
namespace NEO {

// Numeric codes for the type of a GPU page fault.
// Translated to text by GpuPageFaultHelpers::faultTypeToString.
enum class FaultType : uint16_t {
    notPresent = 0,
    writeAccessViolation = 1,
    atomicAccessViolation = 2
};

// Numeric codes for the access kind associated with a GPU page fault.
// Translated to text by GpuPageFaultHelpers::faultAccessToString.
enum class FaultAccess : uint16_t {
    read = 0,
    write = 1,
    atomic = 2
};

// Numeric codes for the page-table level associated with a GPU page fault.
// Translated to text by GpuPageFaultHelpers::faultLevelToString.
enum class FaultLevel : uint16_t {
    pte = 0,
    pde = 1,
    pdp = 2,
    pml4 = 3,
    pml5 = 4
};

namespace GpuPageFaultHelpers {

// Each helper maps an enumerator to its display name; the definitions live in
// gpu_page_fault_helper.cpp and fall back to "Unknown" for unnamed values.
std::string faultTypeToString(FaultType type);
std::string faultAccessToString(FaultAccess access);
std::string faultLevelToString(FaultLevel level);

} // namespace GpuPageFaultHelpers

} // namespace NEO
|
||||
@@ -20,6 +20,7 @@
|
||||
#include "shared/source/helpers/constants.h"
|
||||
#include "shared/source/helpers/debug_helpers.h"
|
||||
#include "shared/source/helpers/gfx_core_helper.h"
|
||||
#include "shared/source/helpers/gpu_page_fault_helper.h"
|
||||
#include "shared/source/helpers/hw_info.h"
|
||||
#include "shared/source/helpers/ptr_math.h"
|
||||
#include "shared/source/os_interface/driver_info.h"
|
||||
@@ -265,9 +266,23 @@ bool Drm::checkResetStatus(OsContext &osContext) {
|
||||
const auto retVal{ioctlHelper->getResetStats(resetStats, &status, &fault)};
|
||||
UNRECOVERABLE_IF(retVal != 0);
|
||||
if (checkToDisableScratchPage() && ioctlHelper->validPageFault(fault.flags)) {
|
||||
bool banned = ((status & ioctlHelper->getStatusForResetStats(true)) == 0);
|
||||
PRINT_DEBUG_STRING(debugManager.flags.PrintDebugMessages.get(), stderr, "ERROR: Unexpected page fault from GPU at 0x%llx, type: %d, level: %d, access: %d, banned: %d, aborting.\n",
|
||||
fault.addr, fault.type, fault.level, fault.access, banned);
|
||||
bool banned = ((status & ioctlHelper->getStatusForResetStats(true)) != 0);
|
||||
fprintf(stderr, "FATAL: Unexpected page fault from GPU at 0x%lx, ctx_id: %u (%s) type: %d (%s), level: %d (%s), access: %d (%s), banned: %d, aborting.\n",
|
||||
fault.addr,
|
||||
resetStats.contextId,
|
||||
EngineHelpers::engineTypeToString(osContext.getEngineType()).c_str(),
|
||||
fault.type, GpuPageFaultHelpers::faultTypeToString(static_cast<FaultType>(fault.type)).c_str(),
|
||||
fault.level, GpuPageFaultHelpers::faultLevelToString(static_cast<FaultLevel>(fault.level)).c_str(),
|
||||
fault.access, GpuPageFaultHelpers::faultAccessToString(static_cast<FaultAccess>(fault.access)).c_str(),
|
||||
banned);
|
||||
fprintf(stdout, "FATAL: Unexpected page fault from GPU at 0x%lx, ctx_id: %u (%s) type: %d (%s), level: %d (%s), access: %d (%s), banned: %d, aborting.\n",
|
||||
fault.addr,
|
||||
resetStats.contextId,
|
||||
EngineHelpers::engineTypeToString(osContext.getEngineType()).c_str(),
|
||||
fault.type, GpuPageFaultHelpers::faultTypeToString(static_cast<FaultType>(fault.type)).c_str(),
|
||||
fault.level, GpuPageFaultHelpers::faultLevelToString(static_cast<FaultLevel>(fault.level)).c_str(),
|
||||
fault.access, GpuPageFaultHelpers::faultAccessToString(static_cast<FaultAccess>(fault.access)).c_str(),
|
||||
banned);
|
||||
UNRECOVERABLE_IF(true);
|
||||
}
|
||||
if (resetStats.batchActive > 0 || resetStats.batchPending > 0) {
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (C) 2018-2023 Intel Corporation
|
||||
# Copyright (C) 2018-2024 Intel Corporation
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
#
|
||||
@@ -32,6 +32,7 @@ target_sources(neo_shared_tests PRIVATE
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/hw_aot_config_tests.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/gfx_core_helper_default_tests.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/gfx_core_helper_tests.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/gpu_page_fault_helper_tests.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/local_id_tests.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/l3_range_tests.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/kernel_helpers_tests.cpp
|
||||
|
||||
@@ -0,0 +1,34 @@
|
||||
/*
|
||||
* Copyright (C) 2024 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#include "shared/source/helpers/gpu_page_fault_helper.h"
|
||||
#include "shared/test/common/test_macros/test.h"
|
||||
|
||||
using namespace NEO;
|
||||
|
||||
// Checks faultTypeToString for every FaultType enumerator and for one value
// (0xcc) that matches no enumerator.
TEST(GpuPageFaultHelperTest, givenValidAndInvalidFaultTypesWhenGettingStringRepresentationThenItIsCorrect) {
    using NEO::GpuPageFaultHelpers::faultTypeToString;
    const auto unnamedType = static_cast<FaultType>(0xcc);

    EXPECT_EQ(std::string{"NotPresent"}, faultTypeToString(FaultType::notPresent));
    EXPECT_EQ(std::string{"WriteAccessViolation"}, faultTypeToString(FaultType::writeAccessViolation));
    EXPECT_EQ(std::string{"AtomicAccessViolation"}, faultTypeToString(FaultType::atomicAccessViolation));
    EXPECT_EQ(std::string{"Unknown"}, faultTypeToString(unnamedType));
}
|
||||
|
||||
// Checks faultAccessToString for every FaultAccess enumerator and for one
// value (0xcc) that matches no enumerator.
TEST(GpuPageFaultHelperTest, givenValidAndInvalidFaultAccessesWhenGettingStringRepresentationThenItIsCorrect) {
    using NEO::GpuPageFaultHelpers::faultAccessToString;
    const auto unnamedAccess = static_cast<FaultAccess>(0xcc);

    EXPECT_EQ(std::string{"Read"}, faultAccessToString(FaultAccess::read));
    EXPECT_EQ(std::string{"Write"}, faultAccessToString(FaultAccess::write));
    EXPECT_EQ(std::string{"Atomic"}, faultAccessToString(FaultAccess::atomic));
    EXPECT_EQ(std::string{"Unknown"}, faultAccessToString(unnamedAccess));
}
|
||||
|
||||
// Checks faultLevelToString for every FaultLevel enumerator and for one value
// (0xcc) that matches no enumerator.
TEST(GpuPageFaultHelperTest, givenValidAndInvalidFaultLevelWhenGettingStringRepresentationThenItIsCorrect) {
    using NEO::GpuPageFaultHelpers::faultLevelToString;
    const auto unnamedLevel = static_cast<FaultLevel>(0xcc);

    EXPECT_EQ(std::string{"PTE"}, faultLevelToString(FaultLevel::pte));
    EXPECT_EQ(std::string{"PDE"}, faultLevelToString(FaultLevel::pde));
    EXPECT_EQ(std::string{"PDP"}, faultLevelToString(FaultLevel::pdp));
    EXPECT_EQ(std::string{"PML4"}, faultLevelToString(FaultLevel::pml4));
    EXPECT_EQ(std::string{"PML5"}, faultLevelToString(FaultLevel::pml5));
    EXPECT_EQ(std::string{"Unknown"}, faultLevelToString(unnamedLevel));
}
|
||||
@@ -7,6 +7,7 @@
|
||||
|
||||
#include "shared/source/command_stream/submission_status.h"
|
||||
#include "shared/source/helpers/file_io.h"
|
||||
#include "shared/source/helpers/gpu_page_fault_helper.h"
|
||||
#include "shared/source/helpers/hw_info.h"
|
||||
#include "shared/source/os_interface/device_factory.h"
|
||||
#include "shared/source/os_interface/driver_info.h"
|
||||
@@ -1414,14 +1415,31 @@ TEST(DrmTest, GivenBatchPendingGreaterThanZeroResetStatsWhenIsGpuHangIsCalledThe
|
||||
class MockIoctlHelperResetStats : public MockIoctlHelper {
|
||||
public:
|
||||
using MockIoctlHelper::MockIoctlHelper;
|
||||
int getResetStats(ResetStats &resetStats, uint32_t *status, ResetStatsFault *resetStatsFault) override {
|
||||
int ret = MockIoctlHelper::getResetStats(resetStats, status, resetStatsFault);
|
||||
if (status) {
|
||||
*status = statusReturnValue;
|
||||
}
|
||||
if (resetStatsFault) {
|
||||
*resetStatsFault = resetStatsFaultReturnValue;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool validPageFault(uint16_t flags) override {
|
||||
return validPageFaultReturnValue;
|
||||
return true;
|
||||
}
|
||||
|
||||
uint32_t getStatusForResetStats(bool banned) override {
|
||||
return getStatusForResetStatsReturnValue;
|
||||
if (banned) {
|
||||
return statusReturnValue;
|
||||
} else {
|
||||
return 0u;
|
||||
}
|
||||
}
|
||||
bool validPageFaultReturnValue = 0;
|
||||
uint32_t getStatusForResetStatsReturnValue = 0;
|
||||
|
||||
uint32_t statusReturnValue = 0;
|
||||
ResetStatsFault resetStatsFaultReturnValue{};
|
||||
};
|
||||
|
||||
TEST(DrmDeathTest, GivenResetStatsWithValidFaultWhenIsGpuHangIsCalledThenProcessTerminated) {
|
||||
@@ -1439,15 +1457,50 @@ TEST(DrmDeathTest, GivenResetStatsWithValidFaultWhenIsGpuHangIsCalledThenProcess
|
||||
MockOsContextLinux mockOsContextLinux{drm, 0, contextId, engineDescriptor};
|
||||
mockOsContextLinux.drmContextIds.push_back(0);
|
||||
|
||||
ResetStats resetStats{};
|
||||
resetStats.contextId = 0;
|
||||
drm.resetStatsToReturn.push_back(resetStats);
|
||||
ResetStats resetStatsExpected{};
|
||||
ResetStatsFault resetStatsFaultExpected{};
|
||||
resetStatsExpected.contextId = 0;
|
||||
drm.resetStatsToReturn.push_back(resetStatsExpected);
|
||||
|
||||
resetStatsFaultExpected.flags = 1;
|
||||
resetStatsFaultExpected.addr = 0x1234;
|
||||
resetStatsFaultExpected.type = 2;
|
||||
resetStatsFaultExpected.level = 3;
|
||||
|
||||
ioctlHelper->statusReturnValue = 2u;
|
||||
ioctlHelper->resetStatsFaultReturnValue = resetStatsFaultExpected;
|
||||
|
||||
ioctlHelper->getStatusForResetStatsReturnValue = 1;
|
||||
ioctlHelper->validPageFaultReturnValue = true;
|
||||
drm.ioctlHelper = std::move(ioctlHelper);
|
||||
|
||||
int strSize = std::snprintf(nullptr, 0, "FATAL: Unexpected page fault from GPU at 0x%lx, ctx_id: %u (%s) type: %d (%s), level: %d (%s), access: %d (%s), banned: %d, aborting.\n",
|
||||
resetStatsFaultExpected.addr,
|
||||
resetStatsExpected.contextId,
|
||||
EngineHelpers::engineTypeToString(aub_stream::ENGINE_BCS).c_str(),
|
||||
resetStatsFaultExpected.type, GpuPageFaultHelpers::faultTypeToString(static_cast<FaultType>(resetStatsFaultExpected.type)).c_str(),
|
||||
resetStatsFaultExpected.level, GpuPageFaultHelpers::faultLevelToString(static_cast<FaultLevel>(resetStatsFaultExpected.level)).c_str(),
|
||||
resetStatsFaultExpected.access, GpuPageFaultHelpers::faultAccessToString(static_cast<FaultAccess>(resetStatsFaultExpected.access)).c_str(),
|
||||
true) +
|
||||
1;
|
||||
|
||||
std::unique_ptr<char[]> buf(new char[strSize]);
|
||||
std::snprintf(buf.get(), strSize, "FATAL: Unexpected page fault from GPU at 0x%lx, ctx_id: %u (%s) type: %d (%s), level: %d (%s), access: %d (%s), banned: %d, aborting.\n",
|
||||
resetStatsFaultExpected.addr,
|
||||
resetStatsExpected.contextId,
|
||||
EngineHelpers::engineTypeToString(aub_stream::ENGINE_BCS).c_str(),
|
||||
resetStatsFaultExpected.type, GpuPageFaultHelpers::faultTypeToString(static_cast<FaultType>(resetStatsFaultExpected.type)).c_str(),
|
||||
resetStatsFaultExpected.level, GpuPageFaultHelpers::faultLevelToString(static_cast<FaultLevel>(resetStatsFaultExpected.level)).c_str(),
|
||||
resetStatsFaultExpected.access, GpuPageFaultHelpers::faultAccessToString(static_cast<FaultAccess>(resetStatsFaultExpected.access)).c_str(),
|
||||
true);
|
||||
|
||||
std::string expectedString = std::string(buf.get());
|
||||
|
||||
::testing::internal::CaptureStderr();
|
||||
::testing::internal::CaptureStdout();
|
||||
EXPECT_THROW(drm.isGpuHangDetected(mockOsContextLinux), std::runtime_error);
|
||||
auto stderrString = ::testing::internal::GetCapturedStderr();
|
||||
auto stdoutString = ::testing::internal::GetCapturedStdout();
|
||||
EXPECT_EQ(expectedString, stderrString);
|
||||
EXPECT_EQ(expectedString, stdoutString);
|
||||
}
|
||||
|
||||
struct DrmMockCheckPageFault : public DrmMock {
|
||||
|
||||
Reference in New Issue
Block a user