From ee3ef684ad6275e0a73a69df5fd3f03aa8aefe55 Mon Sep 17 00:00:00 2001 From: Shreyas Kunder Date: Mon, 6 Jan 2025 10:49:36 +0000 Subject: [PATCH] fix: update new RAS errors for sysman Related-To: NEO-12603 Signed-off-by: Shreyas Kunder --- .../linux/ras_util/sysman_ras_util_pmu.cpp | 6 ++-- .../sources/ras/linux/mock_sysman_ras.h | 33 ++++++++++++------- .../sources/ras/linux/test_zes_ras.cpp | 8 ++--- .../sources/ras/linux/test_zes_ras_exp.cpp | 12 +++---- .../source/sysman/ras/linux/os_ras_imp_gt.cpp | 6 ++-- .../sysman/ras/linux/mock_sysman_ras.h | 33 ++++++++++++------- .../sources/sysman/ras/linux/test_zes_ras.cpp | 8 ++--- .../sysman/ras/linux/test_zes_ras_exp.cpp | 12 +++---- 8 files changed, 72 insertions(+), 46 deletions(-) diff --git a/level_zero/sysman/source/api/ras/linux/ras_util/sysman_ras_util_pmu.cpp b/level_zero/sysman/source/api/ras/linux/ras_util/sysman_ras_util_pmu.cpp index 4d5d704cd1..288800b3a5 100644 --- a/level_zero/sysman/source/api/ras/linux/ras_util/sysman_ras_util_pmu.cpp +++ b/level_zero/sysman/source/api/ras/linux/ras_util/sysman_ras_util_pmu.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2023-2024 Intel Corporation + * Copyright (C) 2023-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -35,7 +35,9 @@ static const std::map> ca "sgunit-fatal", "soc-nonfatal-punit", "sgunit-fatal", "sgunit-nonfatal", "gsc-nonfatal-mia-shutdown", "gsc-nonfatal-aon-parity", "gsc-nonfatal-rom-parity", "gsc-nonfatal-fuse-crc-check", "gsc-nonfatal-selfmbist", "gsc-nonfatal-fuse-pull", "gsc-nonfatal-sram-ecc", "gsc-nonfatal-glitch-det", - "gsc-nonfatal-ucode-parity", "gsc-nonfatal-mia-int", "gsc-nonfatal-wdg-timeout"}}, + "gsc-nonfatal-ucode-parity", "gsc-nonfatal-mia-int", "gsc-nonfatal-wdg-timeout", "soc-fatal-iosf-pciaer", + "soc-fatal-iosf-pcierr", "soc-fatal-pciaer", "soc-fatal-pcierr", "soc-fatal-serr-spi", "soc-fatal-serr-srcs", + "soc-fatal-ur-response", "soc-fatal-ur", "soc-fatal-hbm-mca", "soc-fatal-hbm-punit-mca"}}, {ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS, {"fatal-fpu", "fatal-eu-grf", "fatal-sampler", "fatal-slm", "fatal-guc", "fatal-eu-ic", "fatal-subslice"}}, diff --git a/level_zero/sysman/test/unit_tests/sources/ras/linux/mock_sysman_ras.h b/level_zero/sysman/test/unit_tests/sources/ras/linux/mock_sysman_ras.h index 84efcfeb08..2161923e76 100644 --- a/level_zero/sysman/test/unit_tests/sources/ras/linux/mock_sysman_ras.h +++ b/level_zero/sysman/test/unit_tests/sources/ras/linux/mock_sysman_ras.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2023-2024 Intel Corporation + * Copyright (C) 2023-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -36,6 +36,7 @@ constexpr uint64_t correctableEuErrorCount = 75u; constexpr uint64_t fatalEuErrorCount = 50u; constexpr uint64_t fatalTlb = 3u; constexpr uint64_t socFatalPsfCsc0Count = 5u; +constexpr uint64_t socFatalIosfPciaer = 3u; constexpr uint64_t fatalEngineResetCount = 45u; constexpr uint64_t correctableGrfErrorCountTile0 = 90u; constexpr uint64_t correctableEuErrorCountTile0 = 70u; @@ -67,7 +68,7 @@ constexpr uint64_t driverEngineOther = 3u; constexpr uint64_t initialUncorrectableCacheErrors = 2u; constexpr uint64_t initialEngineReset = 2u; constexpr uint64_t initialProgrammingErrors = 7u; -constexpr uint64_t initialUncorrectableNonComputeErrors = 3u; +constexpr uint64_t initialUncorrectableNonComputeErrors = 4u; constexpr uint64_t initialUncorrectableFabricErrors = 8u; constexpr uint64_t initialUncorrectableComputeErrors = 7u; constexpr uint64_t initialCorrectableComputeErrors = 6u; @@ -77,7 +78,7 @@ constexpr uint64_t initialUncorrectableCacheErrorsTile0 = 2u; constexpr uint64_t initialCorrectableCacheErrorTile0 = 2u; constexpr uint64_t initialEngineResetTile0 = 2u; constexpr uint64_t initialProgrammingErrorsTile0 = 7u; -constexpr uint64_t initialUncorrectableNonComputeErrorsTile0 = 10u; +constexpr uint64_t initialUncorrectableNonComputeErrorsTile0 = 11u; constexpr uint64_t initialCorrectableNonComputeErrorsTile0 = 2u; constexpr uint64_t initialUncorrectableComputeErrorsTile0 = 8u; constexpr uint64_t initialUncorrectableFabricErrorsTile0 = 8u; @@ -142,9 +143,10 @@ struct MockRasPmuInterfaceImp : public L0::Sysman::PmuInterfaceImp { data[7] = 0; data[8] = fatalEuErrorCount; data[9] = socFatalPsfCsc0Count; - data[10] = fatalTlb; - data[11] = 0; - data[12] = socFatalMdfiEastCount; + data[10] = socFatalIosfPciaer; + data[11] = fatalTlb; + data[12] = 0; + data[13] = socFatalMdfiEastCount; return 0; } @@ -170,11 +172,12 @@ struct MockRasPmuInterfaceImp : public L0::Sysman::PmuInterfaceImp { data[8] = fatalSubslice; data[9] = fatalEuErrorCount; data[10] = socFatalPsfCsc0Count; - data[11] = nonFatalGscAonParity; - data[12] = nonFataGscSelfmBist; - data[13] = fatalTlb; - data[14] = 0; - data[15] = socFatalMdfiEastCount; + data[11] = socFatalIosfPciaer; + data[12] = nonFatalGscAonParity; + data[13] = nonFataGscSelfmBist; + data[14] = fatalTlb; + data[15] = 0; + data[16] = socFatalMdfiEastCount; return 0; } @@ -391,6 +394,9 @@ struct MockRasSysfsAccess : public L0::Sysman::SysFsAccessInterface { } else if (file.compare("gt/gt1/error_counter/correctable_subslice") == 0) { val = 2u; return ZE_RESULT_SUCCESS; + } else if (file.compare("gt/gt0/error_counter/soc_fatal_iosf_pciaer") == 0) { + val = 1u; + return ZE_RESULT_SUCCESS; } return ZE_RESULT_ERROR_NOT_AVAILABLE; } @@ -462,6 +468,9 @@ struct MockRasSysfsAccess : public L0::Sysman::SysFsAccessInterface { } else if (file.compare("gt/gt1/error_counter/driver_engine_other") == 0) { val = 3u; return ZE_RESULT_SUCCESS; + } else if (file.compare("gt/gt0/error_counter/soc_fatal_iosf_pciaer") == 0) { + val = 1u; + return ZE_RESULT_SUCCESS; } return ZE_RESULT_ERROR_NOT_AVAILABLE; } @@ -522,6 +531,7 @@ struct MockRasFsAccess : public L0::Sysman::FsAccessInterface { events.push_back("error--fatal-fpu"); events.push_back("error--fatal-l3-fabric"); events.push_back("ccs0-busy"); + events.push_back("error--soc-fatal-iosf-pciaer"); return ZE_RESULT_SUCCESS; } return ZE_RESULT_ERROR_NOT_AVAILABLE; @@ -569,6 +579,7 @@ struct MockRasFsAccess : public L0::Sysman::FsAccessInterface { events.push_back("error-gt0--fatal-subslice"); events.push_back("error-gt1--fatal-l3bank"); events.push_back("error-gt1--correctable-subslice"); + events.push_back("error-gt0--soc-fatal-iosf-pciaer"); return ZE_RESULT_SUCCESS; } return ZE_RESULT_ERROR_NOT_AVAILABLE; diff --git a/level_zero/sysman/test/unit_tests/sources/ras/linux/test_zes_ras.cpp b/level_zero/sysman/test/unit_tests/sources/ras/linux/test_zes_ras.cpp index 6a154b1bbd..743a5cc47e 100644 --- a/level_zero/sysman/test/unit_tests/sources/ras/linux/test_zes_ras.cpp +++ b/level_zero/sysman/test/unit_tests/sources/ras/linux/test_zes_ras.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2023-2024 Intel Corporation + * Copyright (C) 2023-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -228,7 +228,7 @@ HWTEST2_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtThe EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineReset); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrors); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalEuErrorCount + initialUncorrectableComputeErrors); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + socFatalIosfPciaer + initialUncorrectableNonComputeErrors); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrors); } @@ -274,7 +274,7 @@ HWTEST2_F(SysmanRasFixture, GivenValidRasHandleWhenCallingRasGetStateForGtAfterC EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrors); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrors); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalEuErrorCount + initialUncorrectableComputeErrors); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + socFatalIosfPciaer + initialUncorrectableNonComputeErrors); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], fatalTlb + initialUncorrectableCacheErrors); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); } @@ -698,7 +698,7 @@ HWTEST2_F(SysmanRasMultiDeviceFixture, GivenValidRasHandleWhenCallingzesRasGetSt EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineResetTile0); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrorsTile0); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalSubslice + fatalEuErrorCount + initialUncorrectableComputeErrorsTile0); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + nonFatalGscAonParity + nonFataGscSelfmBist + initialUncorrectableNonComputeErrorsTile0); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + socFatalIosfPciaer + nonFatalGscAonParity + nonFataGscSelfmBist + initialUncorrectableNonComputeErrorsTile0); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrorsTile0); } else if (handleIndex == 2u) { diff --git a/level_zero/sysman/test/unit_tests/sources/ras/linux/test_zes_ras_exp.cpp b/level_zero/sysman/test/unit_tests/sources/ras/linux/test_zes_ras_exp.cpp index 994bf561bc..0a4bd3bf54 100644 --- a/level_zero/sysman/test/unit_tests/sources/ras/linux/test_zes_ras_exp.cpp +++ b/level_zero/sysman/test/unit_tests/sources/ras/linux/test_zes_ras_exp.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2023-2024 Intel Corporation + * Copyright (C) 2023-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -117,7 +117,7 @@ HWTEST2_F(SysmanRasExpFixture, GivenValidRasHandleWhenCallingZesRasGetStateExpTh expectedErrCount = fatalEuErrorCount + initialUncorrectableComputeErrors; EXPECT_EQ(rasStates[i].errorCounter, expectedErrCount); } else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS) { - expectedErrCount = socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors; + expectedErrCount = socFatalPsfCsc0Count + socFatalIosfPciaer + initialUncorrectableNonComputeErrors; EXPECT_EQ(rasStates[i].errorCounter, expectedErrCount); } else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS) { expectedErrCount = euAttention + initialProgrammingErrors; @@ -194,7 +194,7 @@ HWTEST2_F(SysmanRasExpFixture, GivenValidRasHandleWhenCallingZesRasGetStateExpFo expectedErrCount = fatalEuErrorCount + initialUncorrectableComputeErrors; EXPECT_EQ(rasStates[i].errorCounter, expectedErrCount); } else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS) { - expectedErrCount = socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors; + expectedErrCount = socFatalPsfCsc0Count + socFatalIosfPciaer + initialUncorrectableNonComputeErrors; EXPECT_EQ(rasStates[i].errorCounter, expectedErrCount); } else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS) { expectedErrCount = euAttention + initialProgrammingErrors; @@ -481,7 +481,7 @@ HWTEST2_F(SysmanRasExpFixture, GivenValidRasHandleWhenCallingzesRasClearStateExp expectedErrCount = fatalEuErrorCount + initialUncorrectableComputeErrors; EXPECT_EQ(rasStates[i].errorCounter, expectedErrCount); } else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS) { - expectedErrCount = socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors; + expectedErrCount = socFatalPsfCsc0Count + socFatalIosfPciaer + initialUncorrectableNonComputeErrors; EXPECT_EQ(rasStates[i].errorCounter, expectedErrCount); } else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS) { expectedErrCount = euAttention + initialProgrammingErrors; @@ -858,7 +858,7 @@ HWTEST2_F(SysmanRasExpMultiDeviceFixture, GivenValidRasHandleWhenCallingZesRasGe expectedErrCount = fatalSubslice + fatalEuErrorCount + initialUncorrectableComputeErrorsTile0; EXPECT_EQ(rasStates[i].errorCounter, expectedErrCount); } else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS) { - expectedErrCount = socFatalPsfCsc0Count + nonFatalGscAonParity + nonFataGscSelfmBist + initialUncorrectableNonComputeErrorsTile0; + expectedErrCount = socFatalPsfCsc0Count + socFatalIosfPciaer + nonFatalGscAonParity + nonFataGscSelfmBist + initialUncorrectableNonComputeErrorsTile0; EXPECT_EQ(rasStates[i].errorCounter, expectedErrCount); } else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS) { expectedErrCount = euAttention + initialProgrammingErrors; @@ -1040,7 +1040,7 @@ HWTEST2_F(SysmanRasExpMultiDeviceFixture, GivenValidRasHandleWhenCallingzesRasCl expectedErrCount = fatalSubslice + fatalEuErrorCount + initialUncorrectableComputeErrorsTile0; EXPECT_EQ(rasStates[i].errorCounter, expectedErrCount); } else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS) { - expectedErrCount = socFatalPsfCsc0Count + nonFatalGscAonParity + nonFataGscSelfmBist + initialUncorrectableNonComputeErrorsTile0; + expectedErrCount = socFatalPsfCsc0Count + socFatalIosfPciaer + nonFatalGscAonParity + nonFataGscSelfmBist + initialUncorrectableNonComputeErrorsTile0; EXPECT_EQ(rasStates[i].errorCounter, expectedErrCount); } else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS) { expectedErrCount = euAttention + initialProgrammingErrors; diff --git a/level_zero/tools/source/sysman/ras/linux/os_ras_imp_gt.cpp b/level_zero/tools/source/sysman/ras/linux/os_ras_imp_gt.cpp index 3d77c60f49..b6afae0aa0 100644 --- a/level_zero/tools/source/sysman/ras/linux/os_ras_imp_gt.cpp +++ b/level_zero/tools/source/sysman/ras/linux/os_ras_imp_gt.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021-2024 Intel Corporation + * Copyright (C) 2021-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -30,7 +30,9 @@ static const std::map> ca "gsc-nonfatal-selfmbist", "gsc-nonfatal-fuse-pull", "gsc-nonfatal-sram-ecc", "gsc-nonfatal-glitch-det", "gsc-nonfatal-ucode-parity", "gsc-nonfatal-mia-int", "gsc-nonfatal-wdg-timeout", "soc-nonfatal-mdfi-east", "soc-nonfatal-mdfi-south", - "soc-nonfatal-cd0-mdfi"}}, + "soc-nonfatal-cd0-mdfi", "soc-fatal-iosf-pciaer", "soc-fatal-iosf-pcierr", "soc-fatal-pciaer", + "soc-fatal-pcierr", "soc-fatal-serr-spi", "soc-fatal-serr-srcs", "soc-fatal-ur-response", "soc-fatal-ur", + "soc-fatal-hbm-mca", "soc-fatal-hbm-punit-mca"}}, {ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS, {"fatal-fpu", "fatal-eu-grf", "fatal-sampler", "fatal-slm", "fatal-guc", "fatal-eu-ic", "fatal-subslice"}}, diff --git a/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/mock_sysman_ras.h b/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/mock_sysman_ras.h index 1dab65b6f8..b66078ac52 100644 --- a/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/mock_sysman_ras.h +++ b/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/mock_sysman_ras.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022-2024 Intel Corporation + * Copyright (C) 2022-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -29,6 +29,7 @@ constexpr uint64_t correctableEuErrorCount = 75u; constexpr uint64_t fatalEuErrorCount = 50u; constexpr uint64_t fatalTlb = 3u; constexpr uint64_t socFatalPsfCsc0Count = 5u; +constexpr uint64_t socFatalIosfPciaer = 3u; constexpr uint64_t fatalEngineResetCount = 45u; constexpr uint64_t correctableGrfErrorCountTile0 = 90u; constexpr uint64_t correctableEuErrorCountTile0 = 70u; @@ -60,7 +61,7 @@ constexpr uint64_t driverEngineOther = 3u; constexpr uint64_t initialUncorrectableCacheErrors = 2u; constexpr uint64_t initialEngineReset = 2u; constexpr uint64_t initialProgrammingErrors = 7u; -constexpr uint64_t initialUncorrectableNonComputeErrors = 3u; +constexpr uint64_t initialUncorrectableNonComputeErrors = 4u; constexpr uint64_t initialUncorrectableFabricErrors = 8u; constexpr uint64_t initialUncorrectableComputeErrors = 7u; constexpr uint64_t initialCorrectableComputeErrors = 6u; @@ -70,7 +71,7 @@ constexpr uint64_t initialUncorrectableCacheErrorsTile0 = 2u; constexpr uint64_t initialCorrectableCacheErrorTile0 = 2u; constexpr uint64_t initialEngineResetTile0 = 2u; constexpr uint64_t initialProgrammingErrorsTile0 = 7u; -constexpr uint64_t initialUncorrectableNonComputeErrorsTile0 = 10u; +constexpr uint64_t initialUncorrectableNonComputeErrorsTile0 = 11u; constexpr uint64_t initialCorrectableNonComputeErrorsTile0 = 2u; constexpr uint64_t initialUncorrectableComputeErrorsTile0 = 8u; constexpr uint64_t initialUncorrectableFabricErrorsTile0 = 8u; @@ -139,9 +140,10 @@ struct MockRasPmuInterfaceImp : public PmuInterfaceImp { data[7] = 0; data[8] = fatalEuErrorCount; data[9] = socFatalPsfCsc0Count; - data[10] = fatalTlb; - data[11] = 0; - data[12] = socFatalMdfiEastCount; + data[10] = socFatalIosfPciaer; + data[11] = fatalTlb; + data[12] = 0; + data[13] = socFatalMdfiEastCount; return 0; } @@ -167,11 +169,12 @@ struct MockRasPmuInterfaceImp : public PmuInterfaceImp { data[8] = fatalSubslice; data[9] = fatalEuErrorCount; data[10] = socFatalPsfCsc0Count; - data[11] = nonFatalGscAonParity; - data[12] = nonFataGscSelfmBist; - data[13] = fatalTlb; - data[14] = 0; - data[15] = socFatalMdfiEastCount; + data[11] = socFatalIosfPciaer; + data[12] = nonFatalGscAonParity; + data[13] = nonFataGscSelfmBist; + data[14] = fatalTlb; + data[15] = 0; + data[16] = socFatalMdfiEastCount; return 0; } @@ -382,6 +385,9 @@ struct MockRasSysfsAccess : public SysfsAccess { } else if (file.compare("gt/gt1/error_counter/correctable_subslice") == 0) { val = 2u; return ZE_RESULT_SUCCESS; + } else if (file.compare("gt/gt0/error_counter/soc_fatal_iosf_pciaer") == 0) { + val = 1u; + return ZE_RESULT_SUCCESS; } return ZE_RESULT_ERROR_NOT_AVAILABLE; } @@ -453,6 +459,9 @@ struct MockRasSysfsAccess : public SysfsAccess { } else if (file.compare("gt/gt1/error_counter/driver_engine_other") == 0) { val = 3u; return ZE_RESULT_SUCCESS; + } else if (file.compare("gt/gt0/error_counter/soc_fatal_iosf_pciaer") == 0) { + val = 1u; + return ZE_RESULT_SUCCESS; } return ZE_RESULT_ERROR_NOT_AVAILABLE; } @@ -513,6 +522,7 @@ struct MockRasFsAccess : public FsAccess { events.push_back("error--fatal-fpu"); events.push_back("error--fatal-l3-fabric"); events.push_back("ccs0-busy"); + events.push_back("error--soc-fatal-iosf-pciaer"); return ZE_RESULT_SUCCESS; } return ZE_RESULT_ERROR_NOT_AVAILABLE; @@ -560,6 +570,7 @@ struct MockRasFsAccess : public FsAccess { events.push_back("error-gt0--fatal-subslice"); events.push_back("error-gt1--fatal-l3bank"); events.push_back("error-gt1--correctable-subslice"); + events.push_back("error-gt0--soc-fatal-iosf-pciaer"); return ZE_RESULT_SUCCESS; } return ZE_RESULT_ERROR_NOT_AVAILABLE; diff --git a/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/test_zes_ras.cpp b/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/test_zes_ras.cpp index 311f04a86e..9a672bb884 100644 --- a/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/test_zes_ras.cpp +++ b/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/test_zes_ras.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022-2024 Intel Corporation + * Copyright (C) 2022-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -244,7 +244,7 @@ TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtThenSu EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineReset); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrors); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalEuErrorCount + initialUncorrectableComputeErrors); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + socFatalIosfPciaer + initialUncorrectableNonComputeErrors); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrors); } @@ -281,7 +281,7 @@ TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForGtAfterCl EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineReset); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrors); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalEuErrorCount + initialUncorrectableComputeErrors); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + socFatalIosfPciaer + initialUncorrectableNonComputeErrors); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrors); } @@ -720,7 +720,7 @@ TEST_F(SysmanRasMultiDeviceFixture, GivenValidRasHandleWhenCallingzesRasGeStateF EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineResetTile0); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrorsTile0); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalSubslice + fatalEuErrorCount + initialUncorrectableComputeErrorsTile0); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + nonFatalGscAonParity + nonFataGscSelfmBist + initialUncorrectableNonComputeErrorsTile0); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + socFatalIosfPciaer + nonFatalGscAonParity + nonFataGscSelfmBist + initialUncorrectableNonComputeErrorsTile0); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrorsTile0); } else if (handleIndex == 2u) { diff --git a/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/test_zes_ras_exp.cpp b/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/test_zes_ras_exp.cpp index 4251e21d4c..373911b6d4 100644 --- a/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/test_zes_ras_exp.cpp +++ b/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/test_zes_ras_exp.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2023-2024 Intel Corporation + * Copyright (C) 2023-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -125,7 +125,7 @@ TEST_F(SysmanRasExpFixture, GivenValidRasHandleWhenCallingZesRasGetStateExpForGt expectedErrCount = fatalEuErrorCount + initialUncorrectableComputeErrors; EXPECT_EQ(rasStates[i].errorCounter, expectedErrCount); } else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS) { - expectedErrCount = socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors; + expectedErrCount = socFatalPsfCsc0Count + socFatalIosfPciaer + initialUncorrectableNonComputeErrors; EXPECT_EQ(rasStates[i].errorCounter, expectedErrCount); } else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS) { expectedErrCount = euAttention + initialProgrammingErrors; @@ -197,7 +197,7 @@ TEST_F(SysmanRasExpFixture, GivenValidRasHandleWhenCallingZesRasGetStateExpForGt expectedErrCount = fatalEuErrorCount + initialUncorrectableComputeErrors; EXPECT_EQ(rasStates[i].errorCounter, expectedErrCount); } else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS) { - expectedErrCount = socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors; + expectedErrCount = socFatalPsfCsc0Count + socFatalIosfPciaer + initialUncorrectableNonComputeErrors; EXPECT_EQ(rasStates[i].errorCounter, expectedErrCount); } else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS) { expectedErrCount = euAttention + initialProgrammingErrors; @@ -507,7 +507,7 @@ TEST_F(SysmanRasExpFixture, GivenValidRasHandleWhenCallingzesRasClearStateExpAnd expectedErrCount = fatalEuErrorCount + initialUncorrectableComputeErrors; EXPECT_EQ(rasStates[i].errorCounter, expectedErrCount); } else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS) { - expectedErrCount = socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors; + expectedErrCount = socFatalPsfCsc0Count + socFatalIosfPciaer + initialUncorrectableNonComputeErrors; EXPECT_EQ(rasStates[i].errorCounter, expectedErrCount); } else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS) { expectedErrCount = euAttention + initialProgrammingErrors; @@ -806,7 +806,7 @@ TEST_F(SysmanRasExpMultiDeviceFixture, GivenValidRasHandleWhenCallingZesRasGetSt expectedErrCount = fatalSubslice + fatalEuErrorCount + initialUncorrectableComputeErrorsTile0; EXPECT_EQ(rasStates[i].errorCounter, expectedErrCount); } else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS) { - expectedErrCount = socFatalPsfCsc0Count + nonFatalGscAonParity + nonFataGscSelfmBist + initialUncorrectableNonComputeErrorsTile0; + expectedErrCount = socFatalPsfCsc0Count + socFatalIosfPciaer + nonFatalGscAonParity + nonFataGscSelfmBist + initialUncorrectableNonComputeErrorsTile0; EXPECT_EQ(rasStates[i].errorCounter, expectedErrCount); } else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS) { expectedErrCount = euAttention + initialProgrammingErrors; @@ -975,7 +975,7 @@ TEST_F(SysmanRasExpMultiDeviceFixture, GivenValidRasHandleWhenCallingzesRasClear expectedErrCount = fatalSubslice + fatalEuErrorCount + initialUncorrectableComputeErrorsTile0; EXPECT_EQ(rasStates[i].errorCounter, expectedErrCount); } else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS) { - expectedErrCount = socFatalPsfCsc0Count + nonFatalGscAonParity + nonFataGscSelfmBist + initialUncorrectableNonComputeErrorsTile0; + expectedErrCount = socFatalPsfCsc0Count + socFatalIosfPciaer + nonFatalGscAonParity + nonFataGscSelfmBist + initialUncorrectableNonComputeErrorsTile0; EXPECT_EQ(rasStates[i].errorCounter, expectedErrCount); } else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS) { expectedErrCount = euAttention + initialProgrammingErrors;