diff --git a/level_zero/tools/source/sysman/ras/linux/os_ras_imp_gt.cpp b/level_zero/tools/source/sysman/ras/linux/os_ras_imp_gt.cpp index 41ca626040..65ade21d06 100644 --- a/level_zero/tools/source/sysman/ras/linux/os_ras_imp_gt.cpp +++ b/level_zero/tools/source/sysman/ras/linux/os_ras_imp_gt.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021-2022 Intel Corporation + * Copyright (C) 2021-2023 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -24,7 +24,10 @@ static const std::map> categoryToL {"soc-fatal-mdfi-east", "soc-fatal-mdfi-south", "soc-fatal-mdfi-west", "soc-fatal-psf-0", "soc-fatal-psf-1", "soc-fatal-psf-2", "soc-fatal-psf-csc-0", "soc-fatal-psf-csc-1", "soc-fatal-psf-csc-2", "soc-fatal-punit", - "sgunit-fatal", "soc-nonfatal-punit", "sgunit-fatal", "sgunit-nonfatal"}}, + "sgunit-fatal", "soc-nonfatal-punit", "sgunit-fatal", "sgunit-nonfatal", "gsc-nonfatal-mia-shutdown", + "gsc-nonfatal-aon-parity", "gsc-nonfatal-rom-parity", "gsc-nonfatal-fuse-crc-check", + "gsc-nonfatal-selfmbist", "gsc-nonfatal-fuse-pull", "gsc-nonfatal-sram-ecc", "gsc-nonfatal-glitch-det", + "gsc-nonfatal-ucode-parity", "gsc-nonfatal-mia-int", "gsc-nonfatal-wdg-timeout"}}, {ZES_RAS_ERROR_CAT_COMPUTE_ERRORS, {"fatal-fpu", "fatal-l3-fabric", "fatal-eu-grf", "fatal-sampler", "fatal-slm", "fatal-guc", "fatal-eu-ic"}}, @@ -37,7 +40,7 @@ static const std::map> categoryToL {ZES_RAS_ERROR_CAT_CACHE_ERRORS, {"correctable-l3-sng"}}, {ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS, - {"sgunit-correctable"}}, + {"sgunit-correctable", "gsc-correctable-sram-ecc"}}, {ZES_RAS_ERROR_CAT_COMPUTE_ERRORS, {"correctable-eu-grf", "correctable-eu-ic", "correctable-guc", "correctable-sampler", "correctable-slm"}}}; diff --git a/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/mock_fs_ras_prelim.h b/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/mock_fs_ras_prelim.h index de1c0b3166..c371b39307 100644 --- a/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/mock_fs_ras_prelim.h +++ b/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/mock_fs_ras_prelim.h @@ -33,6 +33,9 @@ constexpr uint64_t correctableEuErrorCountTile0 = 70u; constexpr uint64_t fatalEuErrorCountTile0 = 55u; constexpr uint64_t fatalEngineResetCountTile0 = 72u; constexpr uint64_t correctableSamplerErrorCountTile1 = 30u; +constexpr uint64_t correctableGscSramEcc = 2u; +constexpr uint64_t nonFatalGscAonParity = 2u; +constexpr uint64_t nonFataGscSelfmBist = 3u; constexpr uint64_t fatalGucErrorCountTile1 = 40u; constexpr uint64_t fatalIdiParityErrorCountTile1 = 60u; constexpr uint64_t correctableGucErrorCountTile1 = 25u; @@ -56,6 +59,15 @@ constexpr uint64_t initialUncorrectableNonComputeErrors = 8u; constexpr uint64_t initialUncorrectableComputeErrors = 10u; constexpr uint64_t initialCorrectableComputeErrors = 6u; constexpr uint64_t initialUncorrectableDriverErrors = 5u; + +constexpr uint64_t initialUncorrectableCacheErrorsTile0 = 2u; +constexpr uint64_t initialEngineResetTile0 = 2u; +constexpr uint64_t initialProgrammingErrorsTile0 = 7u; +constexpr uint64_t initialUncorrectableNonComputeErrorsTile0 = 15u; +constexpr uint64_t initialCorrectableNonComputeErrorsTile0 = 2u; +constexpr uint64_t initialUncorrectableComputeErrorsTile0 = 10u; +constexpr uint64_t initialCorrectableComputeErrorsTile0 = 6u; +constexpr uint64_t initialUncorrectableDriverErrorsTile0 = 5u; constexpr uint64_t initialUncorrectableCacheErrorsTile1 = 1u; constexpr uint64_t initialEngineResetTile1 = 4u; constexpr uint64_t initialProgrammingErrorsTile1 = 5u; @@ -129,6 +141,7 @@ struct MockRasPmuInterfaceImp : public PmuInterfaceImp { data[1] = timeStamp; data[2] = correctableGrfErrorCount; data[3] = correctableEuErrorCount; + data[4] = correctableGscSramEcc; return 0; } @@ -145,7 +158,9 @@ struct MockRasPmuInterfaceImp : public PmuInterfaceImp { data[9] = fatalEuErrorCount; data[10] = socFatalMdfiEastCount; data[11] = socFatalPsfCsc0Count; - data[12] = fatalTlb; + data[12] = nonFatalGscAonParity; + data[13] = nonFataGscSelfmBist; + data[14] = fatalTlb; return 0; } @@ -244,6 +259,7 @@ struct MockRasSysfsAccess : public SysfsAccess { ze_result_t mockReadSymLinkStatus = ZE_RESULT_SUCCESS; bool mockReadSymLinkResult = false; + bool isMultiTileArch = false; ze_result_t readSymLink(const std::string file, std::string &val) override { @@ -266,7 +282,87 @@ struct MockRasSysfsAccess : public SysfsAccess { return ZE_RESULT_ERROR_NOT_AVAILABLE; } - ze_result_t read(const std::string file, uint64_t &val) override { + ze_result_t multiTileSysfsRead(const std::string file, uint64_t &val) { + if (file.compare("gt/gt0/error_counter/correctable_eu_grf") == 0) { + val = 5u; + return ZE_RESULT_SUCCESS; + } else if (file.compare("gt/gt0/error_counter/correctable_eu_ic") == 0) { + val = 1u; + return ZE_RESULT_SUCCESS; + } else if (file.compare("gt/gt0/error_counter/fatal_eu_ic") == 0) { + val = 5u; + return ZE_RESULT_SUCCESS; + } else if (file.compare("gt/gt0/error_counter/fatal_tlb") == 0) { + val = 2u; + return ZE_RESULT_SUCCESS; + } else if (file.compare("gt/gt0/error_counter/engine_reset") == 0) { + val = 2u; + return ZE_RESULT_SUCCESS; + } else if (file.compare("gt/gt1/error_counter/correctable_sampler") == 0) { + val = 2u; + return ZE_RESULT_SUCCESS; + } else if (file.compare("gt/gt1/error_counter/fatal_guc") == 0) { + val = 6u; + return ZE_RESULT_SUCCESS; + } else if (file.compare("gt/gt1/error_counter/fatal_idi_parity") == 0) { + val = 1u; + return ZE_RESULT_SUCCESS; + } else if (file.compare("gt/gt1/error_counter/correctable_guc") == 0) { + val = 3u; + return ZE_RESULT_SUCCESS; + } else if (file.compare("gt/gt1/error_counter/engine_reset") == 0) { + val = 4u; + return ZE_RESULT_SUCCESS; + } else if (file.compare("gt/gt0/error_counter/eu_attention") == 0) { + val = 7u; + return ZE_RESULT_SUCCESS; + } else if (file.compare("gt/gt1/error_counter/eu_attention") == 0) { + val = 5u; + return ZE_RESULT_SUCCESS; + } else if (file.compare("gt/gt0/error_counter/soc_fatal_mdfi_east") == 0) { + val = 5u; + return ZE_RESULT_SUCCESS; + } else if (file.compare("gt/gt0/error_counter/soc_fatal_psf_csc_0") == 0) { + val = 3u; + return ZE_RESULT_SUCCESS; + } else if (file.compare("gt/gt1/error_counter/soc_fatal_punit") == 0) { + val = 3u; + return ZE_RESULT_SUCCESS; + } else if (file.compare("gt/gt1/error_counter/soc_fatal_mdfi_west") == 0) { + val = 2u; + return ZE_RESULT_SUCCESS; + } else if (file.compare("gt/gt0/error_counter/fatal_fpu") == 0) { + val = 2u; + return ZE_RESULT_SUCCESS; + } else if (file.compare("gt/gt0/error_counter/fatal_l3_fabric") == 0) { + val = 3u; + return ZE_RESULT_SUCCESS; + } else if (file.compare("gt/gt0/error_counter/driver_ggtt") == 0) { + val = 2u; + return ZE_RESULT_SUCCESS; + } else if (file.compare("gt/gt0/error_counter/driver_rps") == 0) { + val = 2u; + return ZE_RESULT_SUCCESS; + } else if (file.compare("error_counter/driver_object_migration") == 0) { + val = 1u; + return ZE_RESULT_SUCCESS; + } else if (file.compare("gt/gt1/error_counter/driver_engine_other") == 0) { + val = 3u; + return ZE_RESULT_SUCCESS; + } else if (file.compare("gt/gt0/error_counter/gsc_correctable_sram_ecc") == 0) { + val = 2u; + return ZE_RESULT_SUCCESS; + } else if (file.compare("gt/gt0/error_counter/gsc_nonfatal_aon_parity") == 0) { + val = 3u; + return ZE_RESULT_SUCCESS; + } else if (file.compare("gt/gt0/error_counter/gsc_nonfatal_selfmbist") == 0) { + val = 4u; + return ZE_RESULT_SUCCESS; + } + return ZE_RESULT_ERROR_NOT_AVAILABLE; + } + + ze_result_t sysfsRead(const std::string file, uint64_t &val) { if (file.compare("gt/gt0/error_counter/correctable_eu_grf") == 0) { val = 5u; return ZE_RESULT_SUCCESS; @@ -336,6 +432,12 @@ struct MockRasSysfsAccess : public SysfsAccess { } return ZE_RESULT_ERROR_NOT_AVAILABLE; } + ze_result_t read(const std::string file, uint64_t &val) override { + if (isMultiTileArch == true) { + return multiTileSysfsRead(file, val); + } + return sysfsRead(file, val); + } }; struct MockRasFsAccess : public FsAccess { @@ -427,6 +529,9 @@ struct MockRasFsAccess : public FsAccess { events.push_back("error-gt1--engine-reset"); events.push_back("error-gt1--eu-attention"); events.push_back("error-gt1--driver-engine-other"); + events.push_back("error-gt0--gsc-correctable-sram-ecc"); + events.push_back("error-gt0--gsc-nonfatal-aon-parity"); + events.push_back("error-gt0--gsc-nonfatal-selfmbist"); return ZE_RESULT_SUCCESS; } return ZE_RESULT_ERROR_NOT_AVAILABLE; diff --git a/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/test_zes_ras_prelim.cpp b/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/test_zes_ras_prelim.cpp index dcd7bc6d4a..c09881ce8e 100644 --- a/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/test_zes_ras_prelim.cpp +++ b/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/test_zes_ras_prelim.cpp @@ -630,6 +630,7 @@ TEST_F(SysmanRasMultiDeviceFixture, GivenValidHandleWhenGettingRasPropertiesThen TEST_F(SysmanRasMultiDeviceFixture, GivenValidRasHandleWhenCallingzesRasGeStateForGtThenSuccessIsReturned) { pPmuInterface->mockPmuReadTile = true; + pSysfsAccess->isMultiTileArch = true; for (const auto &handle : pSysmanDeviceImp->pRasHandleContext->handleList) { delete handle; @@ -645,18 +646,18 @@ TEST_F(SysmanRasMultiDeviceFixture, GivenValidRasHandleWhenCallingzesRasGeStateF EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u); // No. of correctable error type for subdevice 0 EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], correctableGrfErrorCount + correctableEuErrorCount + initialCorrectableComputeErrors); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], 0u); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], correctableGrfErrorCount + correctableEuErrorCount + initialCorrectableComputeErrorsTile0); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], correctableGscSramEcc + initialCorrectableNonComputeErrorsTile0); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], 0u); } else if (handleIndex == 1u) { - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], fatalTlb + initialUncorrectableCacheErrors); // No. of uncorrectable error type for subdevice 0 - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineReset); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrors); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalEuErrorCount + initialUncorrectableComputeErrors); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalMdfiEastCount + socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], fatalTlb + initialUncorrectableCacheErrorsTile0); // No. of uncorrectable error type for subdevice 0 + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineResetTile0); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrorsTile0); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalEuErrorCount + initialUncorrectableComputeErrorsTile0); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalMdfiEastCount + socFatalPsfCsc0Count + nonFatalGscAonParity + nonFataGscSelfmBist + initialUncorrectableNonComputeErrorsTile0); EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u); - EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrors); + EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrorsTile0); } else if (handleIndex == 2u) { EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_CACHE_ERRORS], 0u); // No. of correctable error type for subdevice 1 EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], 0u);