mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-03 06:49:52 +08:00
feature: Add support for RAS mdfi errors
Related-To: LOCI-4479 Signed-off-by: Mayank Raghuwanshi <mayank.raghuwanshi@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
c11809e002
commit
a69110a7ec
@@ -27,10 +27,12 @@ static const std::map<zes_ras_error_cat_t, std::vector<std::string>> categoryToL
|
||||
"sgunit-fatal", "soc-nonfatal-punit", "sgunit-fatal", "sgunit-nonfatal", "gsc-nonfatal-mia-shutdown",
|
||||
"gsc-nonfatal-aon-parity", "gsc-nonfatal-rom-parity", "gsc-nonfatal-fuse-crc-check",
|
||||
"gsc-nonfatal-selfmbist", "gsc-nonfatal-fuse-pull", "gsc-nonfatal-sram-ecc", "gsc-nonfatal-glitch-det",
|
||||
"gsc-nonfatal-ucode-parity", "gsc-nonfatal-mia-int", "gsc-nonfatal-wdg-timeout"}},
|
||||
"gsc-nonfatal-ucode-parity", "gsc-nonfatal-mia-int", "gsc-nonfatal-wdg-timeout", "soc-fatal-mdfi-east",
|
||||
"soc-fatal-mdfi-south", "soc-nonfatal-mdfi-east", "soc-nonfatal-mdfi-south", "soc-fatal-mdfi-west",
|
||||
"soc-fatal-cd0-mdfi", "soc-nonfatal-cd0-mdfi"}},
|
||||
{ZES_RAS_ERROR_CAT_COMPUTE_ERRORS,
|
||||
{"fatal-fpu", "fatal-eu-grf", "fatal-sampler", "fatal-slm",
|
||||
"fatal-guc", "fatal-eu-ic", "fatal-subslice"}},
|
||||
"fatal-guc", "fatal-eu-ic", "fatal-subslice", "fatal-l3-fabric"}},
|
||||
{ZES_RAS_ERROR_CAT_DRIVER_ERRORS,
|
||||
{"driver-object-migration", "driver-engine-other", "driver-ggtt",
|
||||
"driver-gt-interrupt", "driver-gt-other", "driver-guc-communication",
|
||||
|
||||
@@ -63,9 +63,9 @@ constexpr uint64_t driverEngineOther = 3u;
|
||||
constexpr uint64_t initialUncorrectableCacheErrors = 2u;
|
||||
constexpr uint64_t initialEngineReset = 2u;
|
||||
constexpr uint64_t initialProgrammingErrors = 7u;
|
||||
constexpr uint64_t initialUncorrectableNonComputeErrors = 3u;
|
||||
constexpr uint64_t initialUncorrectableNonComputeErrors = 8u;
|
||||
constexpr uint64_t initialUncorrectableFabricErrors = 8u;
|
||||
constexpr uint64_t initialUncorrectableComputeErrors = 7u;
|
||||
constexpr uint64_t initialUncorrectableComputeErrors = 10u;
|
||||
constexpr uint64_t initialCorrectableComputeErrors = 6u;
|
||||
constexpr uint64_t initialUncorrectableDriverErrors = 5u;
|
||||
|
||||
@@ -73,9 +73,9 @@ constexpr uint64_t initialUncorrectableCacheErrorsTile0 = 2u;
|
||||
constexpr uint64_t initialCorrectableCacheErrorTile0 = 2u;
|
||||
constexpr uint64_t initialEngineResetTile0 = 2u;
|
||||
constexpr uint64_t initialProgrammingErrorsTile0 = 7u;
|
||||
constexpr uint64_t initialUncorrectableNonComputeErrorsTile0 = 10u;
|
||||
constexpr uint64_t initialUncorrectableNonComputeErrorsTile0 = 15u;
|
||||
constexpr uint64_t initialCorrectableNonComputeErrorsTile0 = 2u;
|
||||
constexpr uint64_t initialUncorrectableComputeErrorsTile0 = 8u;
|
||||
constexpr uint64_t initialUncorrectableComputeErrorsTile0 = 11u;
|
||||
constexpr uint64_t initialUncorrectableFabricErrorsTile0 = 8u;
|
||||
constexpr uint64_t initialCorrectableComputeErrorsTile0 = 6u;
|
||||
constexpr uint64_t initialUncorrectableDriverErrorsTile0 = 5u;
|
||||
@@ -83,7 +83,7 @@ constexpr uint64_t initialUncorrectableCacheErrorsTile1 = 1u;
|
||||
constexpr uint64_t initialEngineResetTile1 = 4u;
|
||||
constexpr uint64_t initialProgrammingErrorsTile1 = 5u;
|
||||
constexpr uint64_t initialCorrectableComputeErrorsTile1 = 7u;
|
||||
constexpr uint64_t initialUncorrectableNonComputeErrorsTile1 = 3u;
|
||||
constexpr uint64_t initialUncorrectableNonComputeErrorsTile1 = 5u;
|
||||
constexpr uint64_t initialUncorrectableFabricErrorsTile1 = 2u;
|
||||
constexpr uint64_t initialUncorrectableComputeErrorsTile1 = 6u;
|
||||
constexpr uint64_t initialUncorrectableDriverErrorsTile1 = 4u;
|
||||
@@ -140,9 +140,11 @@ struct MockRasPmuInterfaceImp : public PmuInterfaceImp {
|
||||
data[5] = driverGgtt;
|
||||
data[6] = driverRps;
|
||||
data[7] = 0;
|
||||
data[8] = fatalEuErrorCount;
|
||||
data[9] = socFatalPsfCsc0Count;
|
||||
data[10] = fatalTlb;
|
||||
data[8] = 0;
|
||||
data[9] = fatalEuErrorCount;
|
||||
data[10] = socFatalPsfCsc0Count;
|
||||
data[11] = socFatalMdfiEastCount;
|
||||
data[12] = fatalTlb;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -165,12 +167,14 @@ struct MockRasPmuInterfaceImp : public PmuInterfaceImp {
|
||||
data[5] = driverGgtt;
|
||||
data[6] = driverRps;
|
||||
data[7] = 0;
|
||||
data[8] = fatalSubslice;
|
||||
data[9] = fatalEuErrorCount;
|
||||
data[10] = socFatalPsfCsc0Count;
|
||||
data[11] = nonFatalGscAonParity;
|
||||
data[12] = nonFataGscSelfmBist;
|
||||
data[13] = fatalTlb;
|
||||
data[8] = 0;
|
||||
data[9] = fatalSubslice;
|
||||
data[10] = fatalEuErrorCount;
|
||||
data[11] = socFatalPsfCsc0Count;
|
||||
data[12] = socFatalMdfiEastCount;
|
||||
data[13] = nonFatalGscAonParity;
|
||||
data[14] = nonFataGscSelfmBist;
|
||||
data[15] = fatalTlb;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -191,9 +195,10 @@ struct MockRasPmuInterfaceImp : public PmuInterfaceImp {
|
||||
data[4] = driverMigration;
|
||||
data[5] = driverEngineOther;
|
||||
data[6] = fatalGucErrorCountTile1;
|
||||
data[7] = socFatalPunitTile1;
|
||||
data[8] = fatalIdiParityErrorCountTile1;
|
||||
data[9] = fatalL3BankTile1;
|
||||
data[7] = socFatalMdfiWestCountTile1;
|
||||
data[8] = socFatalPunitTile1;
|
||||
data[9] = fatalIdiParityErrorCountTile1;
|
||||
data[10] = fatalL3BankTile1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -242,7 +242,7 @@ TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForGtThenSuc
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineReset);
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrors);
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalEuErrorCount + initialUncorrectableComputeErrors);
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors);
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + socFatalMdfiEastCount + initialUncorrectableNonComputeErrors);
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrors);
|
||||
}
|
||||
@@ -278,7 +278,7 @@ TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForGtAfterCl
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineReset);
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrors);
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalEuErrorCount + initialUncorrectableComputeErrors);
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors);
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalMdfiEastCount + socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors);
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrors);
|
||||
}
|
||||
@@ -702,7 +702,7 @@ TEST_F(SysmanRasMultiDeviceFixture, GivenValidRasHandleWhenCallingzesRasGeStateF
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_RESET], fatalEngineResetCount + initialEngineResetTile0);
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS], euAttention + initialProgrammingErrorsTile0);
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS], fatalSubslice + fatalEuErrorCount + initialUncorrectableComputeErrorsTile0);
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalPsfCsc0Count + nonFatalGscAonParity + nonFataGscSelfmBist + initialUncorrectableNonComputeErrorsTile0);
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], socFatalMdfiEastCount + socFatalPsfCsc0Count + nonFatalGscAonParity + nonFataGscSelfmBist + initialUncorrectableNonComputeErrorsTile0);
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DISPLAY_ERRORS], 0u);
|
||||
EXPECT_EQ(state.category[ZES_RAS_ERROR_CAT_DRIVER_ERRORS], driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrorsTile0);
|
||||
} else if (handleIndex == 2u) {
|
||||
|
||||
Reference in New Issue
Block a user