mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-03 06:49:52 +08:00
fix: update new RAS errors for sysman
Related-To: NEO-12603
Signed-off-by: Shreyas Kunder <shreyas.kunder@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
bb1a125f0c
commit
ee3ef684ad
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2022-2024 Intel Corporation
|
||||
* Copyright (C) 2022-2025 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -29,6 +29,7 @@ constexpr uint64_t correctableEuErrorCount = 75u;
|
||||
constexpr uint64_t fatalEuErrorCount = 50u;
|
||||
constexpr uint64_t fatalTlb = 3u;
|
||||
constexpr uint64_t socFatalPsfCsc0Count = 5u;
|
||||
constexpr uint64_t socFatalIosfPciaer = 3u;
|
||||
constexpr uint64_t fatalEngineResetCount = 45u;
|
||||
constexpr uint64_t correctableGrfErrorCountTile0 = 90u;
|
||||
constexpr uint64_t correctableEuErrorCountTile0 = 70u;
|
||||
@@ -60,7 +61,7 @@ constexpr uint64_t driverEngineOther = 3u;
|
||||
constexpr uint64_t initialUncorrectableCacheErrors = 2u;
|
||||
constexpr uint64_t initialEngineReset = 2u;
|
||||
constexpr uint64_t initialProgrammingErrors = 7u;
|
||||
constexpr uint64_t initialUncorrectableNonComputeErrors = 3u;
|
||||
constexpr uint64_t initialUncorrectableNonComputeErrors = 4u;
|
||||
constexpr uint64_t initialUncorrectableFabricErrors = 8u;
|
||||
constexpr uint64_t initialUncorrectableComputeErrors = 7u;
|
||||
constexpr uint64_t initialCorrectableComputeErrors = 6u;
|
||||
@@ -70,7 +71,7 @@ constexpr uint64_t initialUncorrectableCacheErrorsTile0 = 2u;
|
||||
constexpr uint64_t initialCorrectableCacheErrorTile0 = 2u;
|
||||
constexpr uint64_t initialEngineResetTile0 = 2u;
|
||||
constexpr uint64_t initialProgrammingErrorsTile0 = 7u;
|
||||
constexpr uint64_t initialUncorrectableNonComputeErrorsTile0 = 10u;
|
||||
constexpr uint64_t initialUncorrectableNonComputeErrorsTile0 = 11u;
|
||||
constexpr uint64_t initialCorrectableNonComputeErrorsTile0 = 2u;
|
||||
constexpr uint64_t initialUncorrectableComputeErrorsTile0 = 8u;
|
||||
constexpr uint64_t initialUncorrectableFabricErrorsTile0 = 8u;
|
||||
@@ -139,9 +140,10 @@ struct MockRasPmuInterfaceImp : public PmuInterfaceImp {
|
||||
data[7] = 0;
|
||||
data[8] = fatalEuErrorCount;
|
||||
data[9] = socFatalPsfCsc0Count;
|
||||
data[10] = fatalTlb;
|
||||
data[11] = 0;
|
||||
data[12] = socFatalMdfiEastCount;
|
||||
data[10] = socFatalIosfPciaer;
|
||||
data[11] = fatalTlb;
|
||||
data[12] = 0;
|
||||
data[13] = socFatalMdfiEastCount;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -167,11 +169,12 @@ struct MockRasPmuInterfaceImp : public PmuInterfaceImp {
|
||||
data[8] = fatalSubslice;
|
||||
data[9] = fatalEuErrorCount;
|
||||
data[10] = socFatalPsfCsc0Count;
|
||||
data[11] = nonFatalGscAonParity;
|
||||
data[12] = nonFataGscSelfmBist;
|
||||
data[13] = fatalTlb;
|
||||
data[14] = 0;
|
||||
data[15] = socFatalMdfiEastCount;
|
||||
data[11] = socFatalIosfPciaer;
|
||||
data[12] = nonFatalGscAonParity;
|
||||
data[13] = nonFataGscSelfmBist;
|
||||
data[14] = fatalTlb;
|
||||
data[15] = 0;
|
||||
data[16] = socFatalMdfiEastCount;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -382,6 +385,9 @@ struct MockRasSysfsAccess : public SysfsAccess {
|
||||
} else if (file.compare("gt/gt1/error_counter/correctable_subslice") == 0) {
|
||||
val = 2u;
|
||||
return ZE_RESULT_SUCCESS;
|
||||
} else if (file.compare("gt/gt0/error_counter/soc_fatal_iosf_pciaer") == 0) {
|
||||
val = 1u;
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
return ZE_RESULT_ERROR_NOT_AVAILABLE;
|
||||
}
|
||||
@@ -453,6 +459,9 @@ struct MockRasSysfsAccess : public SysfsAccess {
|
||||
} else if (file.compare("gt/gt1/error_counter/driver_engine_other") == 0) {
|
||||
val = 3u;
|
||||
return ZE_RESULT_SUCCESS;
|
||||
} else if (file.compare("gt/gt0/error_counter/soc_fatal_iosf_pciaer") == 0) {
|
||||
val = 1u;
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
return ZE_RESULT_ERROR_NOT_AVAILABLE;
|
||||
}
|
||||
@@ -513,6 +522,7 @@ struct MockRasFsAccess : public FsAccess {
|
||||
events.push_back("error--fatal-fpu");
|
||||
events.push_back("error--fatal-l3-fabric");
|
||||
events.push_back("ccs0-busy");
|
||||
events.push_back("error--soc-fatal-iosf-pciaer");
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
return ZE_RESULT_ERROR_NOT_AVAILABLE;
|
||||
@@ -560,6 +570,7 @@ struct MockRasFsAccess : public FsAccess {
|
||||
events.push_back("error-gt0--fatal-subslice");
|
||||
events.push_back("error-gt1--fatal-l3bank");
|
||||
events.push_back("error-gt1--correctable-subslice");
|
||||
events.push_back("error-gt0--soc-fatal-iosf-pciaer");
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
return ZE_RESULT_ERROR_NOT_AVAILABLE;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2022-2024 Intel Corporation
|
||||
* Copyright (C) 2022-2025 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -244,7 +244,7 @@ TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtThenSu
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineReset);
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrors);
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalEuErrorCount + initialUncorrectableComputeErrors);
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors);
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + socFatalIosfPciaer + initialUncorrectableNonComputeErrors);
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrors);
|
||||
}
|
||||
@@ -281,7 +281,7 @@ TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForGtAfterCl
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineReset);
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrors);
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalEuErrorCount + initialUncorrectableComputeErrors);
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors);
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + socFatalIosfPciaer + initialUncorrectableNonComputeErrors);
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrors);
|
||||
}
|
||||
@@ -720,7 +720,7 @@ TEST_F(SysmanRasMultiDeviceFixture, GivenValidRasHandleWhenCallingzesRasGeStateF
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineResetTile0);
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrorsTile0);
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalSubslice + fatalEuErrorCount + initialUncorrectableComputeErrorsTile0);
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + nonFatalGscAonParity + nonFataGscSelfmBist + initialUncorrectableNonComputeErrorsTile0);
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + socFatalIosfPciaer + nonFatalGscAonParity + nonFataGscSelfmBist + initialUncorrectableNonComputeErrorsTile0);
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrorsTile0);
|
||||
} else if (handleIndex == 2u) {
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2023-2024 Intel Corporation
|
||||
* Copyright (C) 2023-2025 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -125,7 +125,7 @@ TEST_F(SysmanRasExpFixture, GivenValidRasHandleWhenCallingZesRasGetStateExpForGt
|
||||
expectedErrCount = fatalEuErrorCount + initialUncorrectableComputeErrors;
|
||||
EXPECT_EQ(rasStates[i].errorCounter, expectedErrCount);
|
||||
} else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS) {
|
||||
expectedErrCount = socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors;
|
||||
expectedErrCount = socFatalPsfCsc0Count + socFatalIosfPciaer + initialUncorrectableNonComputeErrors;
|
||||
EXPECT_EQ(rasStates[i].errorCounter, expectedErrCount);
|
||||
} else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS) {
|
||||
expectedErrCount = euAttention + initialProgrammingErrors;
|
||||
@@ -197,7 +197,7 @@ TEST_F(SysmanRasExpFixture, GivenValidRasHandleWhenCallingZesRasGetStateExpForGt
|
||||
expectedErrCount = fatalEuErrorCount + initialUncorrectableComputeErrors;
|
||||
EXPECT_EQ(rasStates[i].errorCounter, expectedErrCount);
|
||||
} else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS) {
|
||||
expectedErrCount = socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors;
|
||||
expectedErrCount = socFatalPsfCsc0Count + socFatalIosfPciaer + initialUncorrectableNonComputeErrors;
|
||||
EXPECT_EQ(rasStates[i].errorCounter, expectedErrCount);
|
||||
} else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS) {
|
||||
expectedErrCount = euAttention + initialProgrammingErrors;
|
||||
@@ -507,7 +507,7 @@ TEST_F(SysmanRasExpFixture, GivenValidRasHandleWhenCallingzesRasClearStateExpAnd
|
||||
expectedErrCount = fatalEuErrorCount + initialUncorrectableComputeErrors;
|
||||
EXPECT_EQ(rasStates[i].errorCounter, expectedErrCount);
|
||||
} else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS) {
|
||||
expectedErrCount = socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors;
|
||||
expectedErrCount = socFatalPsfCsc0Count + socFatalIosfPciaer + initialUncorrectableNonComputeErrors;
|
||||
EXPECT_EQ(rasStates[i].errorCounter, expectedErrCount);
|
||||
} else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS) {
|
||||
expectedErrCount = euAttention + initialProgrammingErrors;
|
||||
@@ -806,7 +806,7 @@ TEST_F(SysmanRasExpMultiDeviceFixture, GivenValidRasHandleWhenCallingZesRasGetSt
|
||||
expectedErrCount = fatalSubslice + fatalEuErrorCount + initialUncorrectableComputeErrorsTile0;
|
||||
EXPECT_EQ(rasStates[i].errorCounter, expectedErrCount);
|
||||
} else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS) {
|
||||
expectedErrCount = socFatalPsfCsc0Count + nonFatalGscAonParity + nonFataGscSelfmBist + initialUncorrectableNonComputeErrorsTile0;
|
||||
expectedErrCount = socFatalPsfCsc0Count + socFatalIosfPciaer + nonFatalGscAonParity + nonFataGscSelfmBist + initialUncorrectableNonComputeErrorsTile0;
|
||||
EXPECT_EQ(rasStates[i].errorCounter, expectedErrCount);
|
||||
} else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS) {
|
||||
expectedErrCount = euAttention + initialProgrammingErrors;
|
||||
@@ -975,7 +975,7 @@ TEST_F(SysmanRasExpMultiDeviceFixture, GivenValidRasHandleWhenCallingzesRasClear
|
||||
expectedErrCount = fatalSubslice + fatalEuErrorCount + initialUncorrectableComputeErrorsTile0;
|
||||
EXPECT_EQ(rasStates[i].errorCounter, expectedErrCount);
|
||||
} else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS) {
|
||||
expectedErrCount = socFatalPsfCsc0Count + nonFatalGscAonParity + nonFataGscSelfmBist + initialUncorrectableNonComputeErrorsTile0;
|
||||
expectedErrCount = socFatalPsfCsc0Count + socFatalIosfPciaer + nonFatalGscAonParity + nonFataGscSelfmBist + initialUncorrectableNonComputeErrorsTile0;
|
||||
EXPECT_EQ(rasStates[i].errorCounter, expectedErrCount);
|
||||
} else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS) {
|
||||
expectedErrCount = euAttention + initialProgrammingErrors;
|
||||
|
||||
Reference in New Issue
Block a user