From 065232eac7c8a89018fbf049a30b467f7d7dcecb Mon Sep 17 00:00:00 2001 From: Mayank Raghuwanshi Date: Sat, 18 Mar 2023 11:42:33 +0000 Subject: [PATCH] Add support for ras l3 fabric errors Related-To: LOCI-3966 Signed-off-by: Mayank Raghuwanshi --- .../source/sysman/ras/linux/os_ras_imp_gt.cpp | 8 ++-- .../sysman/ras/linux/mock_fs_ras_prelim.h | 47 ++++++++++--------- .../sysman/ras/linux/test_zes_ras_prelim.cpp | 10 ++-- 3 files changed, 36 insertions(+), 29 deletions(-) diff --git a/level_zero/tools/source/sysman/ras/linux/os_ras_imp_gt.cpp b/level_zero/tools/source/sysman/ras/linux/os_ras_imp_gt.cpp index 262c3d4161..9cdc6ca518 100644 --- a/level_zero/tools/source/sysman/ras/linux/os_ras_imp_gt.cpp +++ b/level_zero/tools/source/sysman/ras/linux/os_ras_imp_gt.cpp @@ -20,20 +20,20 @@ static const std::map> categoryToL {ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS, {"eu-attention"}}, {ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS, - {"soc-fatal-mdfi-east", "soc-fatal-mdfi-south", "soc-fatal-mdfi-west", - "soc-fatal-psf-0", "soc-fatal-psf-1", "soc-fatal-psf-2", "soc-fatal-psf-csc-0", + {"soc-fatal-psf-0", "soc-fatal-psf-1", "soc-fatal-psf-2", "soc-fatal-psf-csc-0", "soc-fatal-psf-csc-1", "soc-fatal-psf-csc-2", "soc-fatal-punit", "sgunit-fatal", "soc-nonfatal-punit", "sgunit-fatal", "sgunit-nonfatal", "gsc-nonfatal-mia-shutdown", "gsc-nonfatal-aon-parity", "gsc-nonfatal-rom-parity", "gsc-nonfatal-fuse-crc-check", "gsc-nonfatal-selfmbist", "gsc-nonfatal-fuse-pull", "gsc-nonfatal-sram-ecc", "gsc-nonfatal-glitch-det", "gsc-nonfatal-ucode-parity", "gsc-nonfatal-mia-int", "gsc-nonfatal-wdg-timeout"}}, {ZES_RAS_ERROR_CAT_COMPUTE_ERRORS, - {"fatal-fpu", "fatal-l3-fabric", "fatal-eu-grf", "fatal-sampler", "fatal-slm", + {"fatal-fpu", "fatal-eu-grf", "fatal-sampler", "fatal-slm", "fatal-guc", "fatal-eu-ic", "fatal-subslice"}}, {ZES_RAS_ERROR_CAT_DRIVER_ERRORS, {"driver-object-migration", "driver-engine-other", "driver-ggtt", "driver-gt-interrupt", "driver-gt-other", "driver-guc-communication", 
- "driver-rps"}}}; + "driver-rps"}}, + {ZES_RAS_ERROR_CAT_L3FABRIC_ERRORS, {"soc-fatal-mdfi-east", "soc-fatal-mdfi-south", "soc-fatal-mdfi-west", "fatal-l3-fabric", "soc-fatal-cd0-mdfi"}}}; static const std::map<zes_ras_error_cat_t, std::vector<std::string>> categoryToListOfEventsCorrectable = { {ZES_RAS_ERROR_CAT_CACHE_ERRORS, diff --git a/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/mock_fs_ras_prelim.h b/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/mock_fs_ras_prelim.h index f95de7808c..d0253a6517 100644 --- a/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/mock_fs_ras_prelim.h +++ b/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/mock_fs_ras_prelim.h @@ -58,8 +58,9 @@ constexpr uint64_t driverEngineOther = 3u; constexpr uint64_t initialUncorrectableCacheErrors = 2u; constexpr uint64_t initialEngineReset = 2u; constexpr uint64_t initialProgrammingErrors = 7u; -constexpr uint64_t initialUncorrectableNonComputeErrors = 8u; -constexpr uint64_t initialUncorrectableComputeErrors = 10u; +constexpr uint64_t initialUncorrectableNonComputeErrors = 3u; +constexpr uint64_t initialUncorrectableFabricErrors = 8u; +constexpr uint64_t initialUncorrectableComputeErrors = 7u; constexpr uint64_t initialCorrectableComputeErrors = 6u; constexpr uint64_t initialUncorrectableDriverErrors = 5u; @@ -67,16 +68,18 @@ constexpr uint64_t initialUncorrectableCacheErrorsTile0 = 2u; constexpr uint64_t initialCorrectableCacheErrorTile0 = 2u; constexpr uint64_t initialEngineResetTile0 = 2u; constexpr uint64_t initialProgrammingErrorsTile0 = 7u; -constexpr uint64_t initialUncorrectableNonComputeErrorsTile0 = 15u; +constexpr uint64_t initialUncorrectableNonComputeErrorsTile0 = 10u; constexpr uint64_t initialCorrectableNonComputeErrorsTile0 = 2u; -constexpr uint64_t initialUncorrectableComputeErrorsTile0 = 11u; +constexpr uint64_t initialUncorrectableComputeErrorsTile0 = 8u; +constexpr uint64_t initialUncorrectableFabricErrorsTile0 = 8u; constexpr uint64_t initialCorrectableComputeErrorsTile0 =
6u; constexpr uint64_t initialUncorrectableDriverErrorsTile0 = 5u; constexpr uint64_t initialUncorrectableCacheErrorsTile1 = 1u; constexpr uint64_t initialEngineResetTile1 = 4u; constexpr uint64_t initialProgrammingErrorsTile1 = 5u; constexpr uint64_t initialCorrectableComputeErrorsTile1 = 7u; -constexpr uint64_t initialUncorrectableNonComputeErrorsTile1 = 5u; +constexpr uint64_t initialUncorrectableNonComputeErrorsTile1 = 3u; +constexpr uint64_t initialUncorrectableFabricErrorsTile1 = 2u; constexpr uint64_t initialUncorrectableComputeErrorsTile1 = 6u; constexpr uint64_t initialUncorrectableDriverErrorsTile1 = 4u; constexpr uint64_t timeStamp = 1000u; @@ -132,11 +135,11 @@ struct MockRasPmuInterfaceImp : public PmuInterfaceImp { data[5] = driverGgtt; data[6] = driverRps; data[7] = 0; - data[8] = 0; - data[9] = fatalEuErrorCount; - data[10] = socFatalMdfiEastCount; - data[11] = socFatalPsfCsc0Count; - data[12] = fatalTlb; + data[8] = fatalEuErrorCount; + data[9] = socFatalPsfCsc0Count; + data[10] = fatalTlb; + data[11] = 0; + data[12] = socFatalMdfiEastCount; return 0; } @@ -159,14 +162,14 @@ struct MockRasPmuInterfaceImp : public PmuInterfaceImp { data[5] = driverGgtt; data[6] = driverRps; data[7] = 0; - data[8] = 0; - data[9] = fatalSubslice; - data[10] = fatalEuErrorCount; - data[11] = socFatalMdfiEastCount; - data[12] = socFatalPsfCsc0Count; - data[13] = nonFatalGscAonParity; - data[14] = nonFataGscSelfmBist; - data[15] = fatalTlb; + data[8] = fatalSubslice; + data[9] = fatalEuErrorCount; + data[10] = socFatalPsfCsc0Count; + data[11] = nonFatalGscAonParity; + data[12] = nonFataGscSelfmBist; + data[13] = fatalTlb; + data[14] = 0; + data[15] = socFatalMdfiEastCount; return 0; } @@ -187,10 +190,10 @@ struct MockRasPmuInterfaceImp : public PmuInterfaceImp { data[4] = driverMigration; data[5] = driverEngineOther; data[6] = fatalGucErrorCountTile1; - data[7] = socFatalMdfiWestCountTile1; - data[8] = socFatalPunitTile1; - data[9] = fatalIdiParityErrorCountTile1; - 
data[10] = fatalL3BankTile1; + data[7] = socFatalPunitTile1; + data[8] = fatalIdiParityErrorCountTile1; + data[9] = fatalL3BankTile1; + data[10] = socFatalMdfiWestCountTile1; return 0; } diff --git a/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/test_zes_ras_prelim.cpp b/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/test_zes_ras_prelim.cpp index c0b426e885..ac80ef920b 100644 --- a/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/test_zes_ras_prelim.cpp +++ b/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/test_zes_ras_prelim.cpp @@ -192,9 +192,10 @@ TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForGtThenSuc EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineReset); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrors); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalEuErrorCount + initialUncorrectableComputeErrors); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalMdfiEastCount + socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrors); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_L3FABRIC_ERRORS], socFatalMdfiEastCount + initialUncorrectableFabricErrors); } } } @@ -229,9 +230,10 @@ TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForGtAfterCl EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineReset); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrors); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalEuErrorCount + 
initialUncorrectableComputeErrors); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalMdfiEastCount + socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrors); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_L3FABRIC_ERRORS], socFatalMdfiEastCount + initialUncorrectableFabricErrors); } } correctable = true; @@ -655,9 +657,10 @@ TEST_F(SysmanRasMultiDeviceFixture, GivenValidRasHandleWhenCallingzesRasGeStateF EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineResetTile0); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrorsTile0); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalSubslice + fatalEuErrorCount + initialUncorrectableComputeErrorsTile0); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalMdfiEastCount + socFatalPsfCsc0Count + nonFatalGscAonParity + nonFataGscSelfmBist + initialUncorrectableNonComputeErrorsTile0); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + nonFatalGscAonParity + nonFataGscSelfmBist + initialUncorrectableNonComputeErrorsTile0); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrorsTile0); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_L3FABRIC_ERRORS], socFatalMdfiEastCount + initialUncorrectableFabricErrorsTile0); } else if (handleIndex == 2u) { EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u); // No. 
of correctable error type for subdevice 1 EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u); @@ -674,6 +677,7 @@ TEST_F(SysmanRasMultiDeviceFixture, GivenValidRasHandleWhenCallingzesRasGeStateF EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalMdfiWestCountTile1 + socFatalPunitTile1 + initialUncorrectableNonComputeErrorsTile1); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverEngineOther + initialUncorrectableDriverErrorsTile1); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_L3FABRIC_ERRORS], socFatalMdfiWestCountTile1 + initialUncorrectableFabricErrorsTile1); } handleIndex++; }