mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-04 15:53:45 +08:00
feature: Add support for ras clear state exp
Related-To: NEO-8873 Signed-off-by: Bellekallu Rajkiran <bellekallu.rajkiran@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
dcf74e8d29
commit
b5a09f8eb4
@@ -222,7 +222,7 @@ struct MockRasPmuInterfaceImp : public L0::Sysman::PmuInterfaceImp {
|
||||
}
|
||||
}
|
||||
|
||||
if (mockPmuReadAfterClear == true) {
|
||||
if ((mockPmuReadAfterClear == true) && (mockPmuReadTile == false)) {
|
||||
if (mockPmuReadCountAfterClear == 0) {
|
||||
mockPmuReadCountAfterClear++;
|
||||
return mockedPmuReadForCorrectableAndSuccessReturn(fd, data, sizeOfdata);
|
||||
@@ -240,24 +240,29 @@ struct MockRasPmuInterfaceImp : public L0::Sysman::PmuInterfaceImp {
|
||||
}
|
||||
|
||||
if (mockPmuReadTile == true) {
|
||||
if (mockPmuReadTileCount == 0) {
|
||||
if (mockPmuReadAfterClear == true) {
|
||||
mockPmuReadCountAfterClear++;
|
||||
}
|
||||
if ((mockPmuReadTileCount == 0) && (mockPmuReadCountAfterClear < 4)) {
|
||||
mockPmuReadTileCount++;
|
||||
return mockedPmuReadForCorrectableTile0AndSuccessReturn(fd, data, sizeOfdata);
|
||||
}
|
||||
|
||||
else if (mockPmuReadTileCount == 1) {
|
||||
else if ((mockPmuReadTileCount == 1) && (mockPmuReadCountAfterClear < 4)) {
|
||||
mockPmuReadTileCount++;
|
||||
return mockedPmuReadForUncorrectableTile0AndSuccessReturn(fd, data, sizeOfdata);
|
||||
}
|
||||
|
||||
else if (mockPmuReadTileCount == 2) {
|
||||
else if ((mockPmuReadTileCount == 2) && (mockPmuReadCountAfterClear < 4)) {
|
||||
mockPmuReadTileCount++;
|
||||
return mockedPmuReadForCorrectableTile1AndSuccessReturn(fd, data, sizeOfdata);
|
||||
}
|
||||
|
||||
else if (mockPmuReadTileCount == 3) {
|
||||
else if ((mockPmuReadTileCount == 3) && (mockPmuReadCountAfterClear < 4)) {
|
||||
mockPmuReadTileCount++;
|
||||
return mockedPmuReadForUncorrectableTile1AndSuccessReturn(fd, data, sizeOfdata);
|
||||
} else {
|
||||
return mockedPmuReadAfterClearAndSuccessReturn(fd, data, sizeOfdata);
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
|
||||
@@ -161,7 +161,7 @@ TEST_F(SysmanRasExpFixture, GivenValidRasHandleWhenCallingZesRasGetStateExpForGt
|
||||
for (uint32_t i = 0; i < count; i++) {
|
||||
rasStates[i].errorCounter = 0u;
|
||||
}
|
||||
uint32_t requestedCount = correctable ? count : (count - 1);
|
||||
uint32_t requestedCount = correctable ? count : (count - 2);
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetStateExp(handle, &requestedCount, rasStates.data()));
|
||||
if (correctable == true) {
|
||||
for (uint32_t i = 0; i < count; i++) {
|
||||
@@ -443,6 +443,261 @@ TEST_F(SysmanRasExpFixture, GivenValidRasHandleWhenCallingZesRasGetStateExpForGt
|
||||
}
|
||||
}
|
||||
|
||||
// Clearing each GT-related RAS category must succeed even though no RAS state
// was queried through zesRasGetStateExp beforehand.
TEST_F(SysmanRasExpFixture, GivenValidRasHandleWhenCallingzesRasClearStateExpForGtAndRasErrorsAreNotretrievedBeforeThenSuccessIsReturned) {

    VariableBackup<L0::Sysman::FsAccessInterface *> fsBackup(&pLinuxSysmanImp->pFsAccess);
    pLinuxSysmanImp->pFsAccess = pFsAccess.get();

    auto rasHandles = getRasHandles(mockHandleCount);
    for (const auto &rasHandle : rasHandles) {
        ASSERT_NE(nullptr, rasHandle);
        // Same categories, in the same order, as the original explicit call sequence.
        for (const auto category : {ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS,
                                    ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS,
                                    ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS,
                                    ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS,
                                    ZES_RAS_ERROR_CATEGORY_EXP_DRIVER_ERRORS,
                                    ZES_RAS_ERROR_CATEGORY_EXP_SCALE_ERRORS}) {
            EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(rasHandle, category));
        }
    }
}
|
||||
|
||||
// Verifies that GT RAS error counters reported by zesRasGetStateExp read back as
// zero once the corresponding categories were cleared with zesRasClearStateExp.
//
// Pass 1: read the state of every handle, compare the counters against the values
//         derived from the mock constants, then clear the categories.
// Pass 2: read the state again and expect every checked category to be zero.
//
// The test treats the first handle as the "correctable" one and every following
// handle as "uncorrectable" (tracked by the `correctable` flag).
TEST_F(SysmanRasExpFixture, GivenValidRasHandleWhenCallingzesRasClearStateExpAndGetStateExpForGtThenVerifyErrorCountersAreCleared) {

    // Readlink mock: always writes the same fixed PCI device path into buf.
    VariableBackup<decltype(NEO::SysCalls::sysCallsReadlink)> mockReadLink(&NEO::SysCalls::sysCallsReadlink, [](const char *path, char *buf, size_t bufsize) -> int {
        constexpr size_t sizeofPath = sizeof("/sys/devices/pci0000:00/0000:00:01.0/0000:01:00.0/0000:02:01.0/0000:03:00.0");
        strcpy_s(buf, sizeofPath, "/sys/devices/pci0000:00/0000:00:01.0/0000:01:00.0/0000:02:01.0/0000:03:00.0");
        return sizeofPath;
    });

    // Pread mock: hands back the textual form of pmuDriverType.
    VariableBackup<decltype(NEO::SysCalls::sysCallsPread)> mockPread(&NEO::SysCalls::sysCallsPread, [](int fd, void *buf, size_t count, off_t offset) -> ssize_t {
        std::ostringstream oStream;
        oStream << pmuDriverType;
        std::string value = oStream.str();
        // Never read past the end of `value`: copy at most the characters plus the
        // terminating NUL, and zero the rest of the caller's buffer so its content
        // is deterministic. The reported byte count is left unchanged.
        const size_t available = value.size() + 1;
        const size_t bytesToCopy = (count < available) ? count : available;
        memset(buf, 0, count);
        memcpy(buf, value.c_str(), bytesToCopy);
        return count;
    });

    VariableBackup<L0::Sysman::FsAccessInterface *> fsBackup(&pLinuxSysmanImp->pFsAccess);
    pLinuxSysmanImp->pFsAccess = pFsAccess.get();

    pPmuInterface->mockPmuReadAfterClear = true;
    VariableBackup<L0::Sysman::PmuInterface *> pmuBackup(&pLinuxSysmanImp->pPmuInterface);
    pLinuxSysmanImp->pPmuInterface = pPmuInterface.get();

    VariableBackup<L0::Sysman::SysFsAccessInterface *> sysfsBackup(&pLinuxSysmanImp->pSysfsAccess);
    pLinuxSysmanImp->pSysfsAccess = pSysfsAccess.get();

    auto handles = getRasHandles(mockHandleCount);

    // Pass 1: validate the counters, then clear them.
    bool correctable = true;
    for (const auto &handle : handles) {
        ASSERT_NE(nullptr, handle);
        uint32_t count = 0;
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetStateExp(handle, &count, nullptr));
        EXPECT_NE(0u, count);
        std::vector<zes_ras_state_exp_t> rasStates(count);
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetStateExp(handle, &count, rasStates.data()));

        if (correctable) {
            // Only the first compute-errors entry is checked and cleared.
            for (uint32_t idx = 0; idx < count; ++idx) {
                if (rasStates[idx].category != ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS) {
                    continue;
                }
                const uint32_t expectedErrCount = correctableGrfErrorCount + correctableEuErrorCount + initialCorrectableComputeErrors;
                EXPECT_EQ(rasStates[idx].errorCounter, expectedErrCount);
                EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS));
                break;
            }
            correctable = false;
        } else {
            for (uint32_t idx = 0; idx < count; ++idx) {
                uint32_t expectedErrCount = 0u;
                switch (rasStates[idx].category) {
                case ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS:
                    expectedErrCount = fatalTlb + initialUncorrectableCacheErrors;
                    break;
                case ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS:
                    expectedErrCount = fatalEuErrorCount + initialUncorrectableComputeErrors;
                    break;
                case ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS:
                    expectedErrCount = socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors;
                    break;
                case ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS:
                    expectedErrCount = euAttention + initialProgrammingErrors;
                    break;
                case ZES_RAS_ERROR_CATEGORY_EXP_DRIVER_ERRORS:
                    expectedErrCount = driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrors;
                    break;
                default:
                    // Categories outside the list above are not verified here.
                    continue;
                }
                EXPECT_EQ(rasStates[idx].errorCounter, expectedErrCount);
            }
            EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS));
            EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS));
            EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_DRIVER_ERRORS));
            EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_SCALE_ERRORS));
            EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS));
            EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS));
        }
    }

    // Pass 2: every category checked above must now read zero.
    correctable = true;
    for (const auto &handle : handles) {
        ASSERT_NE(nullptr, handle);
        uint32_t count = 0;
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetStateExp(handle, &count, nullptr));
        EXPECT_NE(0u, count);
        std::vector<zes_ras_state_exp_t> rasStates(count);
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetStateExp(handle, &count, rasStates.data()));

        if (correctable) {
            for (uint32_t idx = 0; idx < count; ++idx) {
                if (rasStates[idx].category == ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS) {
                    EXPECT_EQ(rasStates[idx].errorCounter, 0u);
                    break;
                }
            }
            correctable = false;
        } else {
            for (uint32_t idx = 0; idx < count; ++idx) {
                switch (rasStates[idx].category) {
                case ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS:
                case ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS:
                case ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS:
                case ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS:
                case ZES_RAS_ERROR_CATEGORY_EXP_DRIVER_ERRORS:
                    EXPECT_EQ(rasStates[idx].errorCounter, 0u);
                    break;
                default:
                    break;
                }
            }
        }
    }
}
|
||||
|
||||
// With the firmware utility interface replaced by nullptr, clearing memory
// errors is expected to report ZE_RESULT_ERROR_UNSUPPORTED_FEATURE.
TEST_F(SysmanRasExpFixture, GivenValidRasHandleWhenCallingzesGetClearStateExpAndFirmwareInterfaceIsAbsentOtherInterfacesAreAlsoAbsentThenCallFails) {
    pFsAccess->mockReadVal = true;
    VariableBackup<L0::Sysman::FsAccessInterface *> fsBackup(&pLinuxSysmanImp->pFsAccess);
    pLinuxSysmanImp->pFsAccess = pFsAccess.get();

    // No firmware interface is available for the duration of this test.
    VariableBackup<L0::Sysman::FirmwareUtil *> fwBackup(&pLinuxSysmanImp->pFwUtilInterface);
    pLinuxSysmanImp->pFwUtilInterface = nullptr;

    VariableBackup<L0::Sysman::SysFsAccessInterface *> sysfsBackup(&pLinuxSysmanImp->pSysfsAccess);
    pLinuxSysmanImp->pSysfsAccess = pSysfsAccess.get();

    auto rasHandles = getRasHandles(mockHandleCount);
    for (const auto &rasHandle : rasHandles) {
        ASSERT_NE(nullptr, rasHandle);
        const auto clearResult = zesRasClearStateExp(rasHandle, ZES_RAS_ERROR_CATEGORY_EXP_MEMORY_ERRORS);
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, clearResult);
    }
}
|
||||
|
||||
// The mock firmware interface is installed without mockMemorySuccess being set
// in this test; clearing memory errors is expected to report
// ZE_RESULT_ERROR_UNSUPPORTED_FEATURE.
TEST_F(SysmanRasExpFixture, GivenValidRasHandleWhenCallingzesGetClearStateExpAndGetMemoryErrorFailsAndOtherInterfacesAreAlsoAbsentThenCallFails) {
    VariableBackup<L0::Sysman::FirmwareUtil *> fwBackup(&pLinuxSysmanImp->pFwUtilInterface);
    pLinuxSysmanImp->pFwUtilInterface = pRasFwUtilInterface.get();

    VariableBackup<L0::Sysman::FsAccessInterface *> fsBackup(&pLinuxSysmanImp->pFsAccess);
    pLinuxSysmanImp->pFsAccess = pFsAccess.get();

    auto rasHandles = getRasHandles(mockHandleCount);
    for (const auto &rasHandle : rasHandles) {
        ASSERT_NE(nullptr, rasHandle);
        const auto clearResult = zesRasClearStateExp(rasHandle, ZES_RAS_ERROR_CATEGORY_EXP_MEMORY_ERRORS);
        EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, clearResult);
    }
}
|
||||
|
||||
// With pFsAccess->mockRootUser set, clearing memory errors is expected to
// report ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS.
TEST_F(SysmanRasExpFixture, GivenValidRasHandleWhenCallingzesGetClearStateExpWithoutWritePermissionsThenCallFails) {
    pFsAccess->mockRootUser = true;

    VariableBackup<L0::Sysman::FirmwareUtil *> fwBackup(&pLinuxSysmanImp->pFwUtilInterface);
    pLinuxSysmanImp->pFwUtilInterface = pRasFwUtilInterface.get();

    VariableBackup<L0::Sysman::FsAccessInterface *> fsBackup(&pLinuxSysmanImp->pFsAccess);
    pLinuxSysmanImp->pFsAccess = pFsAccess.get();

    auto rasHandles = getRasHandles(mockHandleCount);
    for (const auto &rasHandle : rasHandles) {
        ASSERT_NE(nullptr, rasHandle);
        const auto clearResult = zesRasClearStateExp(rasHandle, ZES_RAS_ERROR_CATEGORY_EXP_MEMORY_ERRORS);
        EXPECT_EQ(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS, clearResult);
    }
}
|
||||
|
||||
// Passing ZES_RAS_ERROR_CATEGORY_EXP_FORCE_UINT32 as the category is expected
// to be rejected with ZE_RESULT_ERROR_INVALID_ENUMERATION.
TEST_F(SysmanRasExpFixture, GivenValidRasHandleWhenCallingzesGetClearStateExpWithInvalidCategoryThenCallFails) {
    VariableBackup<L0::Sysman::FirmwareUtil *> fwBackup(&pLinuxSysmanImp->pFwUtilInterface);
    pLinuxSysmanImp->pFwUtilInterface = pRasFwUtilInterface.get();

    VariableBackup<L0::Sysman::FsAccessInterface *> fsBackup(&pLinuxSysmanImp->pFsAccess);
    pLinuxSysmanImp->pFsAccess = pFsAccess.get();

    auto rasHandles = getRasHandles(mockHandleCount);
    for (const auto &rasHandle : rasHandles) {
        ASSERT_NE(nullptr, rasHandle);
        const auto clearResult = zesRasClearStateExp(rasHandle, ZES_RAS_ERROR_CATEGORY_EXP_FORCE_UINT32);
        EXPECT_EQ(ZE_RESULT_ERROR_INVALID_ENUMERATION, clearResult);
    }
}
|
||||
|
||||
// Verifies that the memory-errors counter reported by zesRasGetStateExp reads
// back as zero after zesRasClearStateExp(MEMORY_ERRORS).
//
// Pass 1 compares the first memory-errors entry of each handle against
// hbmCorrectableErrorCount (first handle) or hbmUncorrectableErrorCount
// (subsequent handles) and then clears the category.
// Pass 2 expects the first memory-errors entry of each handle to be zero.
TEST_F(SysmanRasExpFixture, GivenValidRasHandleWhenCallingzesRasClearStateExpAndGetStateExpForHbmThenVerifyErrorCountersAreCleared) {

    pPmuInterface->mockPmuReadResult = true;
    VariableBackup<L0::Sysman::PmuInterface *> pmuBackup(&pLinuxSysmanImp->pPmuInterface);
    pLinuxSysmanImp->pPmuInterface = pPmuInterface.get();

    pRasFwUtilInterface->mockMemorySuccess = true;
    VariableBackup<L0::Sysman::FirmwareUtil *> fwBackup(&pLinuxSysmanImp->pFwUtilInterface);
    pLinuxSysmanImp->pFwUtilInterface = pRasFwUtilInterface.get();

    VariableBackup<L0::Sysman::FsAccessInterface *> fsBackup(&pLinuxSysmanImp->pFsAccess);
    pLinuxSysmanImp->pFsAccess = pFsAccess.get();

    VariableBackup<L0::Sysman::SysFsAccessInterface *> sysfsBackup(&pLinuxSysmanImp->pSysfsAccess);
    pLinuxSysmanImp->pSysfsAccess = pSysfsAccess.get();

    auto rasHandles = getRasHandles(mockHandleCount);

    // Pass 1: check the mocked counters, then clear the memory category.
    bool correctable = true;
    for (const auto &rasHandle : rasHandles) {
        ASSERT_NE(nullptr, rasHandle);
        uint32_t stateCount = 0;
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetStateExp(rasHandle, &stateCount, nullptr));
        EXPECT_NE(0u, stateCount);
        std::vector<zes_ras_state_exp_t> states(stateCount);
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetStateExp(rasHandle, &stateCount, states.data()));

        // Compares the first memory-errors entry (if any) with the given value.
        const auto expectFirstMemoryCounter = [&states, &stateCount](auto expectedValue) {
            for (uint32_t idx = 0; idx < stateCount; ++idx) {
                if (states[idx].category == ZES_RAS_ERROR_CATEGORY_EXP_MEMORY_ERRORS) {
                    EXPECT_EQ(states[idx].errorCounter, expectedValue);
                    break;
                }
            }
        };

        if (correctable) {
            expectFirstMemoryCounter(hbmCorrectableErrorCount);
            correctable = false;
        } else {
            expectFirstMemoryCounter(hbmUncorrectableErrorCount);
        }
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(rasHandle, ZES_RAS_ERROR_CATEGORY_EXP_MEMORY_ERRORS));
    }

    // Pass 2: the original checked the same thing (first memory entry == 0) for
    // both the correctable and the uncorrectable handle, so one loop suffices.
    for (const auto &rasHandle : rasHandles) {
        ASSERT_NE(nullptr, rasHandle);
        uint32_t stateCount = 0;
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetStateExp(rasHandle, &stateCount, nullptr));
        EXPECT_NE(0u, stateCount);
        std::vector<zes_ras_state_exp_t> states(stateCount);
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetStateExp(rasHandle, &stateCount, states.data()));

        for (uint32_t idx = 0; idx < stateCount; ++idx) {
            if (states[idx].category == ZES_RAS_ERROR_CATEGORY_EXP_MEMORY_ERRORS) {
                EXPECT_EQ(states[idx].errorCounter, 0u);
                break;
            }
        }
    }
}
|
||||
|
||||
struct SysmanRasExpMultiDeviceFixture : public SysmanMultiDeviceFixture {
|
||||
protected:
|
||||
std::unique_ptr<MockRasFsAccess> pFsAccess;
|
||||
@@ -654,6 +909,266 @@ TEST_F(SysmanRasExpMultiDeviceFixture, GivenValidRasHandleWhenCallingZesRasGetSt
|
||||
}
|
||||
}
|
||||
|
||||
// Multi-tile variant: verifies that GT RAS error counters reported by
// zesRasGetStateExp read back as zero once cleared with zesRasClearStateExp.
//
// Handles are interpreted by position:
//   0 - correctable errors, tile 0      1 - uncorrectable errors, tile 0
//   2 - correctable errors, tile 1      3 - uncorrectable errors, tile 1
//
// Pass 1 compares the counters against values derived from the mock constants
// and clears the categories; pass 2 expects the checked categories to be zero.
TEST_F(SysmanRasExpMultiDeviceFixture, GivenValidRasHandleWhenCallingzesRasClearStateExpAndGetStateExpForGtThenVerifyErrorCountersAreCleared) {

    // Readlink mock: always writes the same fixed PCI device path into buf.
    VariableBackup<decltype(NEO::SysCalls::sysCallsReadlink)> mockReadLink(&NEO::SysCalls::sysCallsReadlink, [](const char *path, char *buf, size_t bufsize) -> int {
        constexpr size_t sizeofPath = sizeof("/sys/devices/pci0000:00/0000:00:01.0/0000:01:00.0/0000:02:01.0/0000:03:00.0");
        strcpy_s(buf, sizeofPath, "/sys/devices/pci0000:00/0000:00:01.0/0000:01:00.0/0000:02:01.0/0000:03:00.0");
        return sizeofPath;
    });

    // Pread mock: hands back the textual form of pmuDriverType.
    VariableBackup<decltype(NEO::SysCalls::sysCallsPread)> mockPread(&NEO::SysCalls::sysCallsPread, [](int fd, void *buf, size_t count, off_t offset) -> ssize_t {
        std::ostringstream oStream;
        oStream << pmuDriverType;
        std::string value = oStream.str();
        // Never read past the end of `value`: copy at most the characters plus the
        // terminating NUL, and zero the rest of the caller's buffer so its content
        // is deterministic. The reported byte count is left unchanged.
        const size_t available = value.size() + 1;
        const size_t bytesToCopy = (count < available) ? count : available;
        memset(buf, 0, count);
        memcpy(buf, value.c_str(), bytesToCopy);
        return count;
    });

    pPmuInterface->mockPmuReadTile = true;
    VariableBackup<L0::Sysman::PmuInterface *> pmuBackup(&pLinuxSysmanImp->pPmuInterface);
    pLinuxSysmanImp->pPmuInterface = pPmuInterface.get();

    VariableBackup<L0::Sysman::FsAccessInterface *> fsBackup(&pLinuxSysmanImp->pFsAccess);
    pLinuxSysmanImp->pFsAccess = pFsAccess.get();

    pSysfsAccess->isMultiTileArch = true;
    VariableBackup<L0::Sysman::SysFsAccessInterface *> sysfsBackup(&pLinuxSysmanImp->pSysfsAccess);
    pLinuxSysmanImp->pSysfsAccess = pSysfsAccess.get();

    VariableBackup<L0::Sysman::FirmwareUtil *> fwBackup(&pLinuxSysmanImp->pFwUtilInterface);
    pLinuxSysmanImp->pFwUtilInterface = pRasFwUtilInterface.get();

    auto handles = getRasHandles(mockHandleCountForSubDevice);

    // Pass 1: validate the counters, then clear them.
    uint32_t handleIndex = 0u;
    for (const auto &handle : handles) {
        ASSERT_NE(nullptr, handle);
        uint32_t count = 0;
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetStateExp(handle, &count, nullptr));
        EXPECT_NE(0u, count);
        std::vector<zes_ras_state_exp_t> rasStates(count);
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetStateExp(handle, &count, rasStates.data()));

        if (handleIndex == 0u) {
            // Correctable errors for Tile 0
            for (uint32_t idx = 0; idx < count; ++idx) {
                uint32_t expectedErrCount = 0u;
                switch (rasStates[idx].category) {
                case ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS:
                    expectedErrCount = correctablel3Bank + initialCorrectableCacheErrorTile0;
                    break;
                case ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS:
                    expectedErrCount = correctableGrfErrorCount + correctableEuErrorCount + initialCorrectableComputeErrorsTile0;
                    break;
                case ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS:
                    expectedErrCount = correctableGscSramEcc + initialCorrectableNonComputeErrorsTile0;
                    break;
                default:
                    // Other categories are not verified for this handle.
                    continue;
                }
                EXPECT_EQ(rasStates[idx].errorCounter, expectedErrCount);
            }
            EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS));
            EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS));
            EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS));
        } else if (handleIndex == 1u) {
            // Uncorrectable errors for Tile 0
            for (uint32_t idx = 0; idx < count; ++idx) {
                uint32_t expectedErrCount = 0u;
                switch (rasStates[idx].category) {
                case ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS:
                    expectedErrCount = fatalTlb + initialUncorrectableCacheErrors;
                    break;
                case ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS:
                    expectedErrCount = fatalSubslice + fatalEuErrorCount + initialUncorrectableComputeErrorsTile0;
                    break;
                case ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS:
                    expectedErrCount = socFatalPsfCsc0Count + nonFatalGscAonParity + nonFataGscSelfmBist + initialUncorrectableNonComputeErrorsTile0;
                    break;
                case ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS:
                    expectedErrCount = euAttention + initialProgrammingErrors;
                    break;
                case ZES_RAS_ERROR_CATEGORY_EXP_DRIVER_ERRORS:
                    expectedErrCount = driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrors;
                    break;
                default:
                    continue;
                }
                EXPECT_EQ(rasStates[idx].errorCounter, expectedErrCount);
            }
            EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS));
            EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS));
            EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_DRIVER_ERRORS));
            EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_SCALE_ERRORS));
            EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS));
            EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS));
        } else if (handleIndex == 2u) {
            // Correctable errors for Tile 1
            // Every compute-errors entry is checked and cleared (no early exit).
            for (uint32_t idx = 0; idx < count; ++idx) {
                if (rasStates[idx].category != ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS) {
                    continue;
                }
                const uint32_t expectedErrCount = correctableSubsliceTile1 + correctableGucErrorCountTile1 + correctableSamplerErrorCountTile1 + initialCorrectableComputeErrorsTile1;
                EXPECT_EQ(rasStates[idx].errorCounter, expectedErrCount);
                EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS));
            }
        } else if (handleIndex == 3u) {
            // Uncorrectable errors for Tile 1
            for (uint32_t idx = 0; idx < count; ++idx) {
                uint32_t expectedErrCount = 0u;
                switch (rasStates[idx].category) {
                case ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS:
                    // No. of uncorrectable error type for subdevice 1
                    expectedErrCount = fatalL3BankTile1 + fatalIdiParityErrorCountTile1 + initialUncorrectableCacheErrorsTile1;
                    break;
                case ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS:
                    expectedErrCount = fatalGucErrorCountTile1 + initialUncorrectableComputeErrorsTile1;
                    break;
                case ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS:
                    expectedErrCount = socFatalMdfiWestCountTile1 + socFatalPunitTile1 + initialUncorrectableNonComputeErrorsTile1;
                    break;
                case ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS:
                    expectedErrCount = euAttentionTile1 + initialProgrammingErrorsTile1;
                    break;
                case ZES_RAS_ERROR_CATEGORY_EXP_DRIVER_ERRORS:
                    expectedErrCount = driverMigration + driverEngineOther + initialUncorrectableDriverErrorsTile1;
                    break;
                default:
                    continue;
                }
                EXPECT_EQ(rasStates[idx].errorCounter, expectedErrCount);
            }
            EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS));
            EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS));
            EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_DRIVER_ERRORS));
            EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_SCALE_ERRORS));
            EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS));
            EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS));
        }
        handleIndex++;
    }

    // Pass 2: every category checked above must now read zero.
    handleIndex = 0u;
    for (const auto &handle : handles) {
        ASSERT_NE(nullptr, handle);
        uint32_t count = 0;
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetStateExp(handle, &count, nullptr));
        EXPECT_NE(0u, count);
        std::vector<zes_ras_state_exp_t> rasStates(count);
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetStateExp(handle, &count, rasStates.data()));

        if (handleIndex == 0u) {
            // Correctable errors for Tile 0
            for (uint32_t idx = 0; idx < count; ++idx) {
                switch (rasStates[idx].category) {
                case ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS:
                case ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS:
                case ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS:
                    EXPECT_EQ(rasStates[idx].errorCounter, 0u);
                    break;
                default:
                    break;
                }
            }
        } else if (handleIndex == 1u || handleIndex == 3u) {
            // Uncorrectable handles (tile 0 and tile 1) check the same five categories.
            for (uint32_t idx = 0; idx < count; ++idx) {
                switch (rasStates[idx].category) {
                case ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS:
                case ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS:
                case ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS:
                case ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS:
                case ZES_RAS_ERROR_CATEGORY_EXP_DRIVER_ERRORS:
                    EXPECT_EQ(rasStates[idx].errorCounter, 0u);
                    break;
                default:
                    break;
                }
            }
        } else if (handleIndex == 2u) {
            // Only the first compute-errors entry is checked for this handle.
            for (uint32_t idx = 0; idx < count; ++idx) {
                if (rasStates[idx].category == ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS) {
                    EXPECT_EQ(rasStates[idx].errorCounter, 0u);
                    break;
                }
            }
        }
        handleIndex++;
    }
}
|
||||
|
||||
// Multi-tile variant: verifies that the memory-errors counter reported by
// zesRasGetStateExp reads back as zero after zesRasClearStateExp(MEMORY_ERRORS).
//
// Handles are interpreted by position: 0 and 2 are compared against
// hbmCorrectableErrorCount, 1 and 3 against hbmUncorrectableErrorCount.
TEST_F(SysmanRasExpMultiDeviceFixture, GivenValidRasHandleWhenCallingzesRasClearStateExpAndGetStateExpForHbmThenVerifyErrorCountersAreCleared) {

    pPmuInterface->mockPmuReadResult = true;
    VariableBackup<L0::Sysman::PmuInterface *> pmuBackup(&pLinuxSysmanImp->pPmuInterface);
    pLinuxSysmanImp->pPmuInterface = pPmuInterface.get();

    pRasFwUtilInterface->mockMemorySuccess = true;
    VariableBackup<L0::Sysman::FirmwareUtil *> fwBackup(&pLinuxSysmanImp->pFwUtilInterface);
    pLinuxSysmanImp->pFwUtilInterface = pRasFwUtilInterface.get();

    VariableBackup<L0::Sysman::FsAccessInterface *> fsBackup(&pLinuxSysmanImp->pFsAccess);
    pLinuxSysmanImp->pFsAccess = pFsAccess.get();

    pSysfsAccess->isMultiTileArch = true;
    VariableBackup<L0::Sysman::SysFsAccessInterface *> sysfsBackup(&pLinuxSysmanImp->pSysfsAccess);
    pLinuxSysmanImp->pSysfsAccess = pSysfsAccess.get();

    auto rasHandles = getRasHandles(mockHandleCountForSubDevice);

    // Pass 1: check the mocked counters, then clear the memory category.
    uint32_t handleIndex = 0u;
    for (const auto &rasHandle : rasHandles) {
        ASSERT_NE(nullptr, rasHandle);
        uint32_t stateCount = 0;
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetStateExp(rasHandle, &stateCount, nullptr));
        EXPECT_NE(0u, stateCount);
        std::vector<zes_ras_state_exp_t> states(stateCount);
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetStateExp(rasHandle, &stateCount, states.data()));

        // Compares the first memory-errors entry (if any) with the given value.
        const auto expectFirstMemoryCounter = [&states, &stateCount](auto expectedValue) {
            for (uint32_t idx = 0; idx < stateCount; ++idx) {
                if (states[idx].category == ZES_RAS_ERROR_CATEGORY_EXP_MEMORY_ERRORS) {
                    EXPECT_EQ(states[idx].errorCounter, expectedValue);
                    break;
                }
            }
        };

        switch (handleIndex) {
        case 0u: // Correctable errors for Tile 0
        case 2u: // Correctable errors for Tile 1
            expectFirstMemoryCounter(hbmCorrectableErrorCount);
            break;
        case 1u: // Uncorrectable errors for Tile 0
        case 3u: // Uncorrectable errors for Tile 1
            expectFirstMemoryCounter(hbmUncorrectableErrorCount);
            break;
        default:
            // Handles beyond the first four are cleared without a value check.
            break;
        }

        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(rasHandle, ZES_RAS_ERROR_CATEGORY_EXP_MEMORY_ERRORS));
        handleIndex++;
    }

    // Pass 2: the first memory-errors entry of every handle must now be zero.
    for (const auto &rasHandle : rasHandles) {
        ASSERT_NE(nullptr, rasHandle);
        uint32_t stateCount = 0;
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetStateExp(rasHandle, &stateCount, nullptr));
        EXPECT_NE(0u, stateCount);
        std::vector<zes_ras_state_exp_t> states(stateCount);
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetStateExp(rasHandle, &stateCount, states.data()));

        for (uint32_t idx = 0; idx < stateCount; ++idx) {
            if (states[idx].category != ZES_RAS_ERROR_CATEGORY_EXP_MEMORY_ERRORS) {
                continue;
            }
            EXPECT_EQ(states[idx].errorCounter, 0u);
            break;
        }
    }
}
|
||||
|
||||
} // namespace ult
|
||||
} // namespace Sysman
|
||||
} // namespace L0
|
||||
} // namespace L0
|
||||
@@ -50,6 +50,17 @@ TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingRasGetStateThenFailureIsR
|
||||
EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, pRasImp->rasGetState(&state, false));
|
||||
}
|
||||
|
||||
// rasGetStateExp on a RasImp built from this fixture's pOsSysman is expected to
// report ZE_RESULT_ERROR_UNSUPPORTED_FEATURE.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingRasGetStateExpThenFailureIsReturned) {
    uint32_t stateCount = 0;
    auto rasImp = std::make_unique<L0::Sysman::RasImp>(pOsSysman, ZES_RAS_ERROR_TYPE_CORRECTABLE, false, 0);
    const auto queryResult = rasImp->rasGetStateExp(&stateCount, nullptr);
    EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, queryResult);
}
|
||||
|
||||
// rasClearStateExp on a RasImp built from this fixture's pOsSysman is expected
// to report ZE_RESULT_ERROR_UNSUPPORTED_FEATURE.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingRasClearStateExpThenFailureIsReturned) {
    auto rasImp = std::make_unique<L0::Sysman::RasImp>(pOsSysman, ZES_RAS_ERROR_TYPE_CORRECTABLE, false, 0);
    const auto clearResult = rasImp->rasClearStateExp(ZES_RAS_ERROR_CATEGORY_EXP_RESET);
    EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, clearResult);
}
|
||||
|
||||
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingRasGetConfigThenFailureIsReturned) {
|
||||
auto pRasImp = std::make_unique<L0::Sysman::RasImp>(pOsSysman, ZES_RAS_ERROR_TYPE_CORRECTABLE, false, 0);
|
||||
zes_ras_config_t config = {};
|
||||
|
||||
Reference in New Issue
Block a user