feature: Add support for ras clear state exp

Related-To: NEO-8873

Signed-off-by: Bellekallu Rajkiran <bellekallu.rajkiran@intel.com>
This commit is contained in:
Bellekallu Rajkiran
2023-11-17 09:42:29 +00:00
committed by Compute-Runtime-Automation
parent dcf74e8d29
commit b5a09f8eb4
13 changed files with 636 additions and 46 deletions

View File

@@ -1156,15 +1156,19 @@ ze_result_t zesRasGetStateExp(
if (L0::Sysman::sysmanOnlyInit) {
return L0::Sysman::Ras::fromHandle(hRas)->rasGetStateExp(pCount, pState);
} else {
return ZE_RESULT_ERROR_UNINITIALIZED;
return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
}
// Experimental Sysman entry point that clears the RAS error state of one category.
// hRas     - RAS handle; converted to its Ras object through Ras::fromHandle.
// category - error category whose state should be cleared.
// Returns the result of Ras::rasClearStateExp when L0::Sysman::sysmanOnlyInit is set,
// otherwise ZE_RESULT_ERROR_UNSUPPORTED_FEATURE.
ze_result_t zesRasClearStateExp(
    zes_ras_handle_t hRas,
    zes_ras_error_category_exp_t category) {
    if (!L0::Sysman::sysmanOnlyInit) {
        return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    }
    return L0::Sysman::Ras::fromHandle(hRas)->rasClearStateExp(category);
}
ze_result_t zesDeviceEventRegister(
zes_device_handle_t hDevice,

View File

@@ -90,7 +90,6 @@ ze_result_t LinuxRasImp::osRasGetState(zes_ras_state_t &state, ze_bool_t clear)
ze_result_t LinuxRasImp::osRasGetStateExp(uint32_t *pCount, zes_ras_state_exp_t *pState) {
ze_result_t result = ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE;
uint32_t totalCategoryCount = 0;
std::vector<uint32_t> numCategoriesBySources = {};
for (auto &rasSource : rasSources) {
@@ -122,6 +121,29 @@ ze_result_t LinuxRasImp::osRasGetStateExp(uint32_t *pCount, zes_ras_state_exp_t
return result;
}
// Clears the error state of `category` on every registered RAS source.
// Returns:
//   ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS - pFsAccess reports a non-root user.
//   ZE_RESULT_ERROR_INVALID_ENUMERATION      - category is above ZES_RAS_ERROR_CATEGORY_EXP_L3FABRIC_ERRORS.
//   otherwise                                - the first source failure other than
//                                              ZE_RESULT_ERROR_NOT_AVAILABLE, or the result of the
//                                              last source queried (ZE_RESULT_ERROR_NOT_AVAILABLE
//                                              when rasSources is empty).
ze_result_t LinuxRasImp::osRasClearStateExp(zes_ras_error_category_exp_t category) {
    // The permission check deliberately runs before the category validation.
    if (!pFsAccess->isRootUser()) {
        NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Insufficient permissions and returning error:0x%x \n", __FUNCTION__, ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS);
        return ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS;
    }

    if (category > ZES_RAS_ERROR_CATEGORY_EXP_L3FABRIC_ERRORS) {
        return ZE_RESULT_ERROR_INVALID_ENUMERATION;
    }

    ze_result_t status = ZE_RESULT_ERROR_NOT_AVAILABLE;
    for (auto &source : rasSources) {
        status = source->osRasClearStateExp(category);
        // A source that does not handle this category (NOT_AVAILABLE) is skipped;
        // any other failure stops the iteration immediately.
        const bool keepGoing = (status == ZE_RESULT_SUCCESS) || (status == ZE_RESULT_ERROR_NOT_AVAILABLE);
        if (!keepGoing) {
            return status;
        }
    }
    return status;
}
void LinuxRasImp::initSources() {
rasSources.push_back(std::make_unique<L0::Sysman::LinuxRasSourceGt>(pLinuxSysmanImp, osRasErrorType, isSubdevice, subdeviceId));
if (isMemoryTypeHbm(pLinuxSysmanImp) == true) {

View File

@@ -31,6 +31,7 @@ class LinuxRasSources : NEO::NonCopyableOrMovableClass {
virtual ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) = 0;
virtual ze_result_t osRasGetStateExp(uint32_t numCategoriesRequested, zes_ras_state_exp_t *pState) = 0;
virtual uint32_t osRasGetCategoryCount() = 0;
virtual ze_result_t osRasClearStateExp(zes_ras_error_category_exp_t category) = 0;
virtual ~LinuxRasSources() = default;
};
@@ -41,6 +42,7 @@ class LinuxRasImp : public OsRas, NEO::NonCopyableOrMovableClass {
ze_result_t osRasGetStateExp(uint32_t *pCount, zes_ras_state_exp_t *pState) override;
ze_result_t osRasGetConfig(zes_ras_config_t *config) override;
ze_result_t osRasSetConfig(const zes_ras_config_t *config) override;
ze_result_t osRasClearStateExp(zes_ras_error_category_exp_t category) override;
LinuxRasImp(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId);
LinuxRasImp() = default;
~LinuxRasImp() override = default;
@@ -63,6 +65,7 @@ class LinuxRasSourceGt : public LinuxRasSources {
public:
ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
ze_result_t osRasGetStateExp(uint32_t numCategoriesRequested, zes_ras_state_exp_t *pState) override;
ze_result_t osRasClearStateExp(zes_ras_error_category_exp_t category) override;
static void getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_bool_t isSubDevice, uint32_t subDeviceId);
uint32_t osRasGetCategoryCount() override;
LinuxRasSourceGt(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId);
@@ -87,12 +90,15 @@ class LinuxRasSourceGt : public LinuxRasSources {
std::string nameOfError,
const std::string &errorCounterDir,
uint64_t &errorVal);
// Returns true while the bit for `category` in clearStatus is not set.
// initRasErrors uses this to decide whether the boot-up error count is read from sysfs.
inline bool getAbsoluteCount(zes_ras_error_category_exp_t category) {
    const auto categoryBit = (1 << category);
    return (clearStatus & categoryBit) == 0;
}
void closeFds();
int64_t groupFd = -1;
std::vector<int64_t> memberFds = {};
uint64_t initialErrorCount[maxRasErrorCategoryCount] = {0};
std::map<zes_ras_error_cat_t, uint64_t> errorCategoryToEventCount;
uint64_t totalEventCount = 0;
uint32_t clearStatus = 0;
std::map<zes_ras_error_category_exp_t, uint64_t> errorCategoryToEventCount;
bool isSubdevice = false;
uint32_t subdeviceId = 0;
};
@@ -101,6 +107,7 @@ class LinuxRasSourceHbm : public LinuxRasSources {
public:
ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
ze_result_t osRasGetStateExp(uint32_t numCategoriesRequested, zes_ras_state_exp_t *pState) override;
ze_result_t osRasClearStateExp(zes_ras_error_category_exp_t category) override;
static void getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_bool_t isSubDevice, uint32_t subDeviceId);
uint32_t osRasGetCategoryCount() override;
LinuxRasSourceHbm(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, uint32_t subdeviceId);

View File

@@ -17,46 +17,38 @@
namespace L0 {
namespace Sysman {
static const std::map<zes_ras_error_cat_t, std::vector<std::string>> categoryToListOfEventsUncorrectable = {
{ZES_RAS_ERROR_CAT_CACHE_ERRORS,
static const std::map<zes_ras_error_category_exp_t, std::vector<std::string>> categoryToListOfEventsUncorrectable = {
{ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS,
{"fatal-array-bist", "fatal-idi-parity", "fatal-l3-double",
"fatal-l3-ecc-checker",
"fatal-sqidi", "fatal-tlb", "fatal-l3bank"}},
{ZES_RAS_ERROR_CAT_RESET,
{ZES_RAS_ERROR_CATEGORY_EXP_RESET,
{"engine-reset"}},
{ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS,
{ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS,
{"eu-attention"}},
{ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS,
{ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS,
{"soc-fatal-psf-0", "soc-fatal-psf-1", "soc-fatal-psf-2", "soc-fatal-psf-csc-0",
"soc-fatal-psf-csc-1", "soc-fatal-psf-csc-2", "soc-fatal-punit",
"sgunit-fatal", "soc-nonfatal-punit", "sgunit-fatal", "sgunit-nonfatal", "gsc-nonfatal-mia-shutdown",
"gsc-nonfatal-aon-parity", "gsc-nonfatal-rom-parity", "gsc-nonfatal-fuse-crc-check",
"gsc-nonfatal-selfmbist", "gsc-nonfatal-fuse-pull", "gsc-nonfatal-sram-ecc", "gsc-nonfatal-glitch-det",
"gsc-nonfatal-ucode-parity", "gsc-nonfatal-mia-int", "gsc-nonfatal-wdg-timeout"}},
{ZES_RAS_ERROR_CAT_COMPUTE_ERRORS,
{ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS,
{"fatal-fpu", "fatal-eu-grf", "fatal-sampler", "fatal-slm",
"fatal-guc", "fatal-eu-ic", "fatal-subslice"}},
{ZES_RAS_ERROR_CAT_DRIVER_ERRORS,
{ZES_RAS_ERROR_CATEGORY_EXP_DRIVER_ERRORS,
{"driver-object-migration", "driver-engine-other", "driver-ggtt",
"driver-gt-interrupt", "driver-gt-other", "driver-guc-communication",
"driver-rps"}}};
static const std::map<zes_ras_error_cat_t, std::vector<std::string>> categoryToListOfEventsCorrectable = {
{ZES_RAS_ERROR_CAT_CACHE_ERRORS,
static const std::map<zes_ras_error_category_exp_t, std::vector<std::string>> categoryToListOfEventsCorrectable = {
{ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS,
{"correctable-l3-sng", "correctable-l3bank"}},
{ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS,
{ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS,
{"sgunit-correctable", "gsc-correctable-sram-ecc"}},
{ZES_RAS_ERROR_CAT_COMPUTE_ERRORS,
{ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS,
{"correctable-eu-grf", "correctable-eu-ic", "correctable-guc", "correctable-sampler", "correctable-slm", "correctable-subslice"}}};
static std::map<zes_ras_error_cat_t, zes_ras_error_category_exp_t> categoryStandardToExpMap = {
{ZES_RAS_ERROR_CAT_RESET, ZES_RAS_ERROR_CATEGORY_EXP_RESET},
{ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS, ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS},
{ZES_RAS_ERROR_CAT_DRIVER_ERRORS, ZES_RAS_ERROR_CATEGORY_EXP_DRIVER_ERRORS},
{ZES_RAS_ERROR_CAT_COMPUTE_ERRORS, ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS},
{ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS, ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS},
{ZES_RAS_ERROR_CAT_CACHE_ERRORS, ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS}};
static void closeFd(int64_t &fd) {
if (fd != -1) {
close(static_cast<int>(fd));
@@ -104,7 +96,7 @@ static uint64_t convertHexToUint64(std::string strVal) {
return config;
}
static bool getErrorType(std::map<zes_ras_error_cat_t, std::vector<std::string>> categoryToListOfEvents, std::vector<std::string> &eventList, ze_bool_t isSubDevice, uint32_t subDeviceId) {
static bool getErrorType(std::map<zes_ras_error_category_exp_t, std::vector<std::string>> categoryToListOfEvents, std::vector<std::string> &eventList, ze_bool_t isSubDevice, uint32_t subDeviceId) {
// Naming convention of files containing config values for errors
// error--<Name of error> Ex:- error--engine-reset (config file with no subdevice)
// error-gt<N>--<Name of error> Ex:- error-gt0--engine-reset (config file with subdevices)
@@ -157,7 +149,6 @@ void LinuxRasSourceGt::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t>
ze_result_t LinuxRasSourceGt::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) {
if (clear == true) {
closeFds();
totalEventCount = 0;
memset(state.category, 0, maxRasErrorCategoryCount * sizeof(uint64_t));
memset(initialErrorCount, 0, maxRasErrorCategoryCount * sizeof(uint64_t));
}
@@ -168,7 +159,8 @@ ze_result_t LinuxRasSourceGt::osRasGetState(zes_ras_state_t &state, ze_bool_t cl
return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
std::vector<std::uint64_t> data(2 + totalEventCount, 0); // In data[], event count starts from second index, first value gives number of events and second value is for timestamp
auto numEvents = memberFds.size() + 1; // Add 1 for group Fd
std::vector<std::uint64_t> data(2 + numEvents, 0); // In data[], event count starts from second index, first value gives number of events and second value is for timestamp
if (pPmuInterface->pmuRead(static_cast<int>(groupFd), data.data(), sizeof(uint64_t) * data.size()) < 0) {
return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
@@ -195,7 +187,8 @@ ze_result_t LinuxRasSourceGt::osRasGetStateExp(uint32_t numCategoriesRequested,
return ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE;
}
std::vector<std::uint64_t> data(2 + totalEventCount, 0); // In data[], event count starts from second index, first value gives number of events and second value is for timestamp
auto numEvents = memberFds.size() + 1; // Add 1 for group Fd
std::vector<std::uint64_t> data(2 + numEvents, 0); // In data[], event count starts from second index, first value gives number of events and second value is for timestamp
if (pPmuInterface->pmuRead(static_cast<int>(groupFd), data.data(), sizeof(uint64_t) * data.size()) < 0) {
return ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE;
}
@@ -209,7 +202,7 @@ ze_result_t LinuxRasSourceGt::osRasGetStateExp(uint32_t numCategoriesRequested,
for (; j < errorCat->second; j++) {
errorCount += data[initialIndex + j];
}
pState[categoryIdx].category = categoryStandardToExpMap[errorCat->first];
pState[categoryIdx].category = errorCat->first;
pState[categoryIdx].errorCounter = errorCount + initialErrorCount[errorCat->first];
initialIndex += j;
categoryIdx++;
@@ -225,6 +218,18 @@ uint32_t LinuxRasSourceGt::osRasGetCategoryCount() {
return static_cast<uint32_t>(categoryToListOfEventsCorrectable.size());
}
// Clears the GT error state for one category.
// Returns ZE_RESULT_ERROR_NOT_AVAILABLE when the category has no entry in
// errorCategoryToEventCount, otherwise ZE_RESULT_SUCCESS.
ze_result_t LinuxRasSourceGt::osRasClearStateExp(zes_ras_error_category_exp_t category) {
    // Only a category that has already been initialized can be cleared.
    if (errorCategoryToEventCount.count(category) == 0) {
        return ZE_RESULT_ERROR_NOT_AVAILABLE;
    }

    closeFds();
    // Record the clear so that getAbsoluteCount() reports false for this category.
    clearStatus |= (1 << category);
    initialErrorCount[category] = 0;
    return ZE_RESULT_SUCCESS;
}
ze_result_t LinuxRasSourceGt::getPmuConfig(
const std::string &eventDirectory,
const std::vector<std::string> &listOfEvents,
@@ -259,7 +264,7 @@ void LinuxRasSourceGt::initRasErrors(ze_bool_t clear) {
if (result != ZE_RESULT_SUCCESS) {
return;
}
std::map<zes_ras_error_cat_t, std::vector<std::string>> categoryToListOfEvents;
std::map<zes_ras_error_category_exp_t, std::vector<std::string>> categoryToListOfEvents;
if (osRasErrorType == ZES_RAS_ERROR_TYPE_CORRECTABLE) {
categoryToListOfEvents = categoryToListOfEventsCorrectable;
}
@@ -290,7 +295,7 @@ void LinuxRasSourceGt::initRasErrors(ze_bool_t clear) {
errorPrefixLocal = "error--";
}
uint64_t initialErrorVal = 0;
if (clear == false) {
if ((clear == false) && (getAbsoluteCount(rasErrorCatToListOfEvents.first) == true)) {
result = getBootUpErrorCountFromSysfs(nameOfError, errorCounterDirLocal, initialErrorVal);
if (result != ZE_RESULT_SUCCESS) {
continue;
@@ -314,9 +319,10 @@ void LinuxRasSourceGt::initRasErrors(ze_bool_t clear) {
eventCount++;
errorCount += initialErrorVal;
}
clearStatus &= ~(1 << rasErrorCatToListOfEvents.first);
initialErrorCount[rasErrorCatToListOfEvents.first] = errorCount;
errorCategoryToEventCount[rasErrorCatToListOfEvents.first] = eventCount;
totalEventCount += eventCount;
}
}

View File

@@ -24,13 +24,13 @@ void LinuxRasSourceHbm::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t>
}
// Queries the firmware interface for the memory error count.
// rasErrorType   - error type forwarded to fwGetMemoryErrorCount.
// subDeviceCount - sub-device count forwarded to fwGetMemoryErrorCount.
// errorCount     - output argument passed by reference to fwGetMemoryErrorCount.
// Returns ZE_RESULT_ERROR_UNSUPPORTED_FEATURE when no firmware interface is present,
// otherwise whatever fwGetMemoryErrorCount returns.
ze_result_t LinuxRasSourceHbm::getMemoryErrorCountFromFw(zes_ras_error_type_t rasErrorType, uint32_t subDeviceCount, uint64_t &errorCount) {
    return (pFwInterface != nullptr)
               ? pFwInterface->fwGetMemoryErrorCount(rasErrorType, subDeviceCount, subdeviceId, errorCount)
               : ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
ze_result_t LinuxRasSourceHbm::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) {
if (pFwInterface == nullptr) {
return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
if (clear == true) {
uint64_t errorCount = 0;
ze_result_t result = getMemoryErrorCountFromFw(osRasErrorType, this->subDeviceCount, errorCount);
@@ -51,10 +51,6 @@ ze_result_t LinuxRasSourceHbm::osRasGetState(zes_ras_state_t &state, ze_bool_t c
}
ze_result_t LinuxRasSourceHbm::osRasGetStateExp(uint32_t numCategoriesRequested, zes_ras_state_exp_t *pState) {
if (pFwInterface == nullptr) {
return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
uint64_t errorCount = 0;
ze_result_t result = getMemoryErrorCountFromFw(osRasErrorType, this->subDeviceCount, errorCount);
if (result != ZE_RESULT_SUCCESS) {
@@ -72,6 +68,18 @@ uint32_t LinuxRasSourceHbm::osRasGetCategoryCount() {
return 1u;
}
// Clears the HBM memory error state by storing the current firmware count in errorBaseline.
// Any category other than ZES_RAS_ERROR_CATEGORY_EXP_MEMORY_ERRORS is a no-op that returns
// ZE_RESULT_SUCCESS. A failure from getMemoryErrorCountFromFw is returned unchanged and
// leaves errorBaseline untouched.
ze_result_t LinuxRasSourceHbm::osRasClearStateExp(zes_ras_error_category_exp_t category) {
    if (category != ZES_RAS_ERROR_CATEGORY_EXP_MEMORY_ERRORS) {
        return ZE_RESULT_SUCCESS;
    }

    uint64_t currentCount = 0;
    const ze_result_t status = getMemoryErrorCountFromFw(osRasErrorType, this->subDeviceCount, currentCount);
    if (status == ZE_RESULT_SUCCESS) {
        errorBaseline = currentCount;
    }
    return status;
}
LinuxRasSourceHbm::LinuxRasSourceHbm(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, uint32_t subdeviceId) : pLinuxSysmanImp(pLinuxSysmanImp), osRasErrorType(type), subdeviceId(subdeviceId) {
pFwInterface = pLinuxSysmanImp->getFwUtilInterface();
pDevice = pLinuxSysmanImp->getSysmanDeviceImp();

View File

@@ -22,6 +22,7 @@ class OsRas {
virtual ze_result_t osRasGetStateExp(uint32_t *pCount, zes_ras_state_exp_t *pState) = 0;
virtual ze_result_t osRasGetConfig(zes_ras_config_t *config) = 0;
virtual ze_result_t osRasSetConfig(const zes_ras_config_t *config) = 0;
virtual ze_result_t osRasClearStateExp(zes_ras_error_category_exp_t category) = 0;
static OsRas *create(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId);
static void getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_bool_t isSubDevice, uint32_t subDeviceId);
virtual ~OsRas() = default;

View File

@@ -25,6 +25,7 @@ class Ras : _zes_ras_handle_t {
virtual ze_result_t rasSetConfig(const zes_ras_config_t *pConfig) = 0;
virtual ze_result_t rasGetState(zes_ras_state_t *pState, ze_bool_t clear) = 0;
virtual ze_result_t rasGetStateExp(uint32_t *pCount, zes_ras_state_exp_t *pState) = 0;
virtual ze_result_t rasClearStateExp(zes_ras_error_category_exp_t category) = 0;
static Ras *fromHandle(zes_ras_handle_t handle) {
return static_cast<Ras *>(handle);

View File

@@ -38,6 +38,10 @@ ze_result_t RasImp::rasGetStateExp(uint32_t *pCount, zes_ras_state_exp_t *pState
return pOsRas->osRasGetStateExp(pCount, pState);
}
// Forwards the clear request for `category` to the OS-specific RAS implementation
// (pOsRas) and returns its result unchanged.
ze_result_t RasImp::rasClearStateExp(zes_ras_error_category_exp_t category) {
return pOsRas->osRasClearStateExp(category);
}
// Asks the OS-specific RAS implementation to fill in rasProperties.
void RasImp::init() {
pOsRas->osRasGetProperties(rasProperties);
}

View File

@@ -22,6 +22,7 @@ class RasImp : public Ras, NEO::NonCopyableOrMovableClass {
ze_result_t rasSetConfig(const zes_ras_config_t *pConfig) override;
ze_result_t rasGetState(zes_ras_state_t *pConfig, ze_bool_t clear) override;
ze_result_t rasGetStateExp(uint32_t *pCount, zes_ras_state_exp_t *pState) override;
ze_result_t rasClearStateExp(zes_ras_error_category_exp_t category) override;
RasImp() = default;
RasImp(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t isSubDevice, uint32_t subDeviceId);

View File

@@ -16,6 +16,7 @@ class WddmRasImp : public OsRas {
ze_result_t osRasGetStateExp(uint32_t *pCount, zes_ras_state_exp_t *pState) override;
ze_result_t osRasGetConfig(zes_ras_config_t *config) override;
ze_result_t osRasSetConfig(const zes_ras_config_t *config) override;
ze_result_t osRasClearStateExp(zes_ras_error_category_exp_t category) override;
};
// Stub implementation: the body is empty, so errorType is left exactly as the caller passed it.
void OsRas::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_bool_t isSubDevice, uint32_t subDeviceId) {}
@@ -40,6 +41,10 @@ ze_result_t WddmRasImp::osRasGetStateExp(uint32_t *pCount, zes_ras_state_exp_t *
return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
// WDDM implementation of the clear-state entry point: always reports
// ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; `category` is not inspected.
ze_result_t WddmRasImp::osRasClearStateExp(zes_ras_error_category_exp_t category) {
return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
OsRas *OsRas::create(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId) {
WddmRasImp *pWddmRasImp = new WddmRasImp();
return static_cast<OsRas *>(pWddmRasImp);

View File

@@ -222,7 +222,7 @@ struct MockRasPmuInterfaceImp : public L0::Sysman::PmuInterfaceImp {
}
}
if (mockPmuReadAfterClear == true) {
if ((mockPmuReadAfterClear == true) && (mockPmuReadTile == false)) {
if (mockPmuReadCountAfterClear == 0) {
mockPmuReadCountAfterClear++;
return mockedPmuReadForCorrectableAndSuccessReturn(fd, data, sizeOfdata);
@@ -240,24 +240,29 @@ struct MockRasPmuInterfaceImp : public L0::Sysman::PmuInterfaceImp {
}
if (mockPmuReadTile == true) {
if (mockPmuReadTileCount == 0) {
if (mockPmuReadAfterClear == true) {
mockPmuReadCountAfterClear++;
}
if ((mockPmuReadTileCount == 0) && (mockPmuReadCountAfterClear < 4)) {
mockPmuReadTileCount++;
return mockedPmuReadForCorrectableTile0AndSuccessReturn(fd, data, sizeOfdata);
}
else if (mockPmuReadTileCount == 1) {
else if ((mockPmuReadTileCount == 1) && (mockPmuReadCountAfterClear < 4)) {
mockPmuReadTileCount++;
return mockedPmuReadForUncorrectableTile0AndSuccessReturn(fd, data, sizeOfdata);
}
else if (mockPmuReadTileCount == 2) {
else if ((mockPmuReadTileCount == 2) && (mockPmuReadCountAfterClear < 4)) {
mockPmuReadTileCount++;
return mockedPmuReadForCorrectableTile1AndSuccessReturn(fd, data, sizeOfdata);
}
else if (mockPmuReadTileCount == 3) {
else if ((mockPmuReadTileCount == 3) && (mockPmuReadCountAfterClear < 4)) {
mockPmuReadTileCount++;
return mockedPmuReadForUncorrectableTile1AndSuccessReturn(fd, data, sizeOfdata);
} else {
return mockedPmuReadAfterClearAndSuccessReturn(fd, data, sizeOfdata);
}
}
return 0;

View File

@@ -161,7 +161,7 @@ TEST_F(SysmanRasExpFixture, GivenValidRasHandleWhenCallingZesRasGetStateExpForGt
for (uint32_t i = 0; i < count; i++) {
rasStates[i].errorCounter = 0u;
}
uint32_t requestedCount = correctable ? count : (count - 1);
uint32_t requestedCount = correctable ? count : (count - 2);
EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetStateExp(handle, &requestedCount, rasStates.data()));
if (correctable == true) {
for (uint32_t i = 0; i < count; i++) {
@@ -443,6 +443,261 @@ TEST_F(SysmanRasExpFixture, GivenValidRasHandleWhenCallingZesRasGetStateExpForGt
}
}
// Clearing categories on handles whose RAS state was never queried is expected to succeed.
TEST_F(SysmanRasExpFixture, GivenValidRasHandleWhenCallingzesRasClearStateExpForGtAndRasErrorsAreNotretrievedBeforeThenSuccessIsReturned) {
    VariableBackup<L0::Sysman::FsAccessInterface *> fsBackup(&pLinuxSysmanImp->pFsAccess);
    pLinuxSysmanImp->pFsAccess = pFsAccess.get();

    // Categories are cleared in exactly this order for every handle.
    const zes_ras_error_category_exp_t categoriesToClear[] = {
        ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS,
        ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS,
        ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS,
        ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS,
        ZES_RAS_ERROR_CATEGORY_EXP_DRIVER_ERRORS,
        ZES_RAS_ERROR_CATEGORY_EXP_SCALE_ERRORS};

    auto rasHandles = getRasHandles(mockHandleCount);
    for (const auto &rasHandle : rasHandles) {
        ASSERT_NE(nullptr, rasHandle);
        for (const auto categoryToClear : categoriesToClear) {
            EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(rasHandle, categoryToClear));
        }
    }
}
// Two-pass GT test: the first pass reads the state of every handle, checks the counters
// against the fixture's expected values and clears the categories; the second pass reads the
// state again and expects the counters of the checked categories to be zero.
// The `correctable` flag treats the first handle as the correctable one and every following
// handle as uncorrectable.
TEST_F(SysmanRasExpFixture, GivenValidRasHandleWhenCallingzesRasClearStateExpAndGetStateExpForGtThenVerifyErrorCountersAreCleared) {
// Replace readlink/pread so that device path and PMU driver type lookups return canned values.
VariableBackup<decltype(NEO::SysCalls::sysCallsReadlink)> mockReadLink(&NEO::SysCalls::sysCallsReadlink, [](const char *path, char *buf, size_t bufsize) -> int {
constexpr size_t sizeofPath = sizeof("/sys/devices/pci0000:00/0000:00:01.0/0000:01:00.0/0000:02:01.0/0000:03:00.0");
strcpy_s(buf, sizeofPath, "/sys/devices/pci0000:00/0000:00:01.0/0000:01:00.0/0000:02:01.0/0000:03:00.0");
return sizeofPath;
});
VariableBackup<decltype(NEO::SysCalls::sysCallsPread)> mockPread(&NEO::SysCalls::sysCallsPread, [](int fd, void *buf, size_t count, off_t offset) -> ssize_t {
std::ostringstream oStream;
oStream << pmuDriverType;
std::string value = oStream.str();
memcpy(buf, value.data(), count);
return count;
});
// Install the fixture's mocks; each VariableBackup restores the original pointer on scope exit.
VariableBackup<L0::Sysman::FsAccessInterface *> fsBackup(&pLinuxSysmanImp->pFsAccess);
pLinuxSysmanImp->pFsAccess = pFsAccess.get();
// NOTE(review): mockPmuReadAfterClear makes the mock PMU pick its reply based on how many
// reads have happened, so the number and order of reads below matter - confirm against the mock.
pPmuInterface->mockPmuReadAfterClear = true;
VariableBackup<L0::Sysman::PmuInterface *> pmuBackup(&pLinuxSysmanImp->pPmuInterface);
pLinuxSysmanImp->pPmuInterface = pPmuInterface.get();
VariableBackup<L0::Sysman::SysFsAccessInterface *> sysfsBackup(&pLinuxSysmanImp->pSysfsAccess);
pLinuxSysmanImp->pSysfsAccess = pSysfsAccess.get();
auto handles = getRasHandles(mockHandleCount);
bool correctable = true;
// Pass 1: verify the counters before clearing, then clear.
for (const auto &handle : handles) {
ASSERT_NE(nullptr, handle);
uint32_t count = 0;
EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetStateExp(handle, &count, nullptr));
EXPECT_NE(0u, count);
std::vector<zes_ras_state_exp_t> rasStates(count);
EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetStateExp(handle, &count, rasStates.data()));
uint32_t expectedErrCount = 0u;
if (correctable == true) {
// Correctable handle: only the compute category is checked, and it is cleared only if found.
for (uint32_t i = 0; i < count; i++) {
if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS) {
expectedErrCount = correctableGrfErrorCount + correctableEuErrorCount + initialCorrectableComputeErrors;
EXPECT_EQ(rasStates[i].errorCounter, expectedErrCount);
EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS));
break;
}
}
correctable = false;
} else {
// Uncorrectable handle: check each reported category, then clear all of them.
for (uint32_t i = 0; i < count; i++) {
if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS) {
expectedErrCount = fatalTlb + initialUncorrectableCacheErrors;
EXPECT_EQ(rasStates[i].errorCounter, expectedErrCount);
} else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS) {
expectedErrCount = fatalEuErrorCount + initialUncorrectableComputeErrors;
EXPECT_EQ(rasStates[i].errorCounter, expectedErrCount);
} else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS) {
expectedErrCount = socFatalPsfCsc0Count + initialUncorrectableNonComputeErrors;
EXPECT_EQ(rasStates[i].errorCounter, expectedErrCount);
} else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS) {
expectedErrCount = euAttention + initialProgrammingErrors;
EXPECT_EQ(rasStates[i].errorCounter, expectedErrCount);
} else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_DRIVER_ERRORS) {
expectedErrCount = driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrors;
EXPECT_EQ(rasStates[i].errorCounter, expectedErrCount);
}
}
EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS));
EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS));
EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_DRIVER_ERRORS));
EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_SCALE_ERRORS));
EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS));
EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS));
}
}
// Pass 2: after the clears, the same categories are expected to report zero.
correctable = true;
for (const auto &handle : handles) {
ASSERT_NE(nullptr, handle);
uint32_t count = 0;
EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetStateExp(handle, &count, nullptr));
EXPECT_NE(0u, count);
std::vector<zes_ras_state_exp_t> rasStates(count);
EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetStateExp(handle, &count, rasStates.data()));
if (correctable == true) {
for (uint32_t i = 0; i < count; i++) {
if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS) {
EXPECT_EQ(rasStates[i].errorCounter, 0u);
break;
}
}
correctable = false;
} else {
for (uint32_t i = 0; i < count; i++) {
if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS) {
EXPECT_EQ(rasStates[i].errorCounter, 0u);
} else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS) {
EXPECT_EQ(rasStates[i].errorCounter, 0u);
} else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS) {
EXPECT_EQ(rasStates[i].errorCounter, 0u);
} else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS) {
EXPECT_EQ(rasStates[i].errorCounter, 0u);
} else if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_DRIVER_ERRORS) {
EXPECT_EQ(rasStates[i].errorCounter, 0u);
}
}
}
}
}
// With the firmware utility interface set to nullptr, clearing the memory error category is
// expected to fail with ZE_RESULT_ERROR_UNSUPPORTED_FEATURE on every handle.
TEST_F(SysmanRasExpFixture, GivenValidRasHandleWhenCallingzesGetClearStateExpAndFirmwareInterfaceIsAbsentOtherInterfacesAreAlsoAbsentThenCallFails) {
// NOTE(review): the effect of mockReadVal is defined by the fs-access mock, which is not visible here - confirm.
pFsAccess->mockReadVal = true;
VariableBackup<L0::Sysman::FsAccessInterface *> fsBackup(&pLinuxSysmanImp->pFsAccess);
pLinuxSysmanImp->pFsAccess = pFsAccess.get();
VariableBackup<L0::Sysman::FirmwareUtil *> fwBackup(&pLinuxSysmanImp->pFwUtilInterface);
// Simulate a missing firmware interface.
pLinuxSysmanImp->pFwUtilInterface = nullptr;
VariableBackup<L0::Sysman::SysFsAccessInterface *> sysfsBackup(&pLinuxSysmanImp->pSysfsAccess);
pLinuxSysmanImp->pSysfsAccess = pSysfsAccess.get();
auto handles = getRasHandles(mockHandleCount);
for (const auto &handle : handles) {
ASSERT_NE(nullptr, handle);
EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_MEMORY_ERRORS));
}
}
// With the mock firmware interface installed in its default configuration, clearing the
// memory error category is expected to fail with ZE_RESULT_ERROR_UNSUPPORTED_FEATURE.
// NOTE(review): presumably the mock's memory error query fails unless mockMemorySuccess is
// set - confirm against the firmware mock.
TEST_F(SysmanRasExpFixture, GivenValidRasHandleWhenCallingzesGetClearStateExpAndGetMemoryErrorFailsAndOtherInterfacesAreAlsoAbsentThenCallFails) {
VariableBackup<L0::Sysman::FirmwareUtil *> fwBackup(&pLinuxSysmanImp->pFwUtilInterface);
pLinuxSysmanImp->pFwUtilInterface = pRasFwUtilInterface.get();
VariableBackup<L0::Sysman::FsAccessInterface *> fsBackup(&pLinuxSysmanImp->pFsAccess);
pLinuxSysmanImp->pFsAccess = pFsAccess.get();
auto handles = getRasHandles(mockHandleCount);
for (const auto &handle : handles) {
ASSERT_NE(nullptr, handle);
EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_MEMORY_ERRORS));
}
}
// Clearing the memory error category is expected to be rejected with
// ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS on every handle.
TEST_F(SysmanRasExpFixture, GivenValidRasHandleWhenCallingzesGetClearStateExpWithoutWritePermissionsThenCallFails) {
// NOTE(review): presumably mockRootUser = true makes the mock's isRootUser() report a
// non-root user - confirm against the fs-access mock.
pFsAccess->mockRootUser = true;
VariableBackup<L0::Sysman::FirmwareUtil *> fwBackup(&pLinuxSysmanImp->pFwUtilInterface);
pLinuxSysmanImp->pFwUtilInterface = pRasFwUtilInterface.get();
VariableBackup<L0::Sysman::FsAccessInterface *> fsBackup(&pLinuxSysmanImp->pFsAccess);
pLinuxSysmanImp->pFsAccess = pFsAccess.get();
auto handles = getRasHandles(mockHandleCount);
for (const auto &handle : handles) {
ASSERT_NE(nullptr, handle);
EXPECT_EQ(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_MEMORY_ERRORS));
}
}
// Passing ZES_RAS_ERROR_CATEGORY_EXP_FORCE_UINT32 as the category is expected to be
// rejected with ZE_RESULT_ERROR_INVALID_ENUMERATION on every handle.
TEST_F(SysmanRasExpFixture, GivenValidRasHandleWhenCallingzesGetClearStateExpWithInvalidCategoryThenCallFails) {
VariableBackup<L0::Sysman::FirmwareUtil *> fwBackup(&pLinuxSysmanImp->pFwUtilInterface);
pLinuxSysmanImp->pFwUtilInterface = pRasFwUtilInterface.get();
VariableBackup<L0::Sysman::FsAccessInterface *> fsBackup(&pLinuxSysmanImp->pFsAccess);
pLinuxSysmanImp->pFsAccess = pFsAccess.get();
auto handles = getRasHandles(mockHandleCount);
for (const auto &handle : handles) {
ASSERT_NE(nullptr, handle);
EXPECT_EQ(ZE_RESULT_ERROR_INVALID_ENUMERATION, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_FORCE_UINT32));
}
}
// Two-pass HBM test: the first pass reads the memory error counter of every handle, checks it
// against the fixture's expected HBM count and clears the memory category; the second pass
// reads the state again and expects the memory error counter to be zero.
// The `correctable` flag treats the first handle as the correctable one and every following
// handle as uncorrectable.
TEST_F(SysmanRasExpFixture, GivenValidRasHandleWhenCallingzesRasClearStateExpAndGetStateExpForHbmThenVerifyErrorCountersAreCleared) {
// Install the fixture's mocks; each VariableBackup restores the original pointer on scope exit.
pPmuInterface->mockPmuReadResult = true;
VariableBackup<L0::Sysman::PmuInterface *> pmuBackup(&pLinuxSysmanImp->pPmuInterface);
pLinuxSysmanImp->pPmuInterface = pPmuInterface.get();
// NOTE(review): presumably mockMemorySuccess makes the mock firmware memory error query succeed - confirm.
pRasFwUtilInterface->mockMemorySuccess = true;
VariableBackup<L0::Sysman::FirmwareUtil *> fwBackup(&pLinuxSysmanImp->pFwUtilInterface);
pLinuxSysmanImp->pFwUtilInterface = pRasFwUtilInterface.get();
VariableBackup<L0::Sysman::FsAccessInterface *> fsBackup(&pLinuxSysmanImp->pFsAccess);
pLinuxSysmanImp->pFsAccess = pFsAccess.get();
VariableBackup<L0::Sysman::SysFsAccessInterface *> sysfsBackup(&pLinuxSysmanImp->pSysfsAccess);
pLinuxSysmanImp->pSysfsAccess = pSysfsAccess.get();
auto handles = getRasHandles(mockHandleCount);
bool correctable = true;
// Pass 1: verify the memory error counter, then clear the memory category.
for (const auto &handle : handles) {
ASSERT_NE(nullptr, handle);
uint32_t count = 0;
EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetStateExp(handle, &count, nullptr));
EXPECT_NE(0u, count);
std::vector<zes_ras_state_exp_t> rasStates(count);
EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetStateExp(handle, &count, rasStates.data()));
if (correctable == true) {
for (uint32_t i = 0; i < count; i++) {
if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_MEMORY_ERRORS) {
EXPECT_EQ(rasStates[i].errorCounter, hbmCorrectableErrorCount);
break;
}
}
correctable = false;
} else {
for (uint32_t i = 0; i < count; i++) {
if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_MEMORY_ERRORS) {
EXPECT_EQ(rasStates[i].errorCounter, hbmUncorrectableErrorCount);
break;
}
}
}
EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_MEMORY_ERRORS));
}
// Pass 2: after the clear, the memory error counter is expected to be zero.
correctable = true;
for (const auto &handle : handles) {
ASSERT_NE(nullptr, handle);
uint32_t count = 0;
EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetStateExp(handle, &count, nullptr));
EXPECT_NE(0u, count);
std::vector<zes_ras_state_exp_t> rasStates(count);
EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetStateExp(handle, &count, rasStates.data()));
if (correctable == true) {
for (uint32_t i = 0; i < count; i++) {
if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_MEMORY_ERRORS) {
EXPECT_EQ(rasStates[i].errorCounter, 0u);
break;
}
}
correctable = false;
} else {
for (uint32_t i = 0; i < count; i++) {
if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_MEMORY_ERRORS) {
EXPECT_EQ(rasStates[i].errorCounter, 0u);
break;
}
}
}
}
}
struct SysmanRasExpMultiDeviceFixture : public SysmanMultiDeviceFixture {
protected:
std::unique_ptr<MockRasFsAccess> pFsAccess;
@@ -654,6 +909,266 @@ TEST_F(SysmanRasExpMultiDeviceFixture, GivenValidRasHandleWhenCallingZesRasGetSt
}
}
// For each sub-device RAS handle: reads the state via zesRasGetStateExp and checks the
// per-category counters against the mocked values, clears the categories through
// zesRasClearStateExp, then re-reads the state and expects the checked counters to be zero.
TEST_F(SysmanRasExpMultiDeviceFixture, GivenValidRasHandleWhenCallingzesRasClearStateExpAndGetStateExpForGtThenVerifyErrorCountersAreCleared) {
    // readlink mock: always reports the same fixed device path.
    VariableBackup<decltype(NEO::SysCalls::sysCallsReadlink)> mockReadLink(&NEO::SysCalls::sysCallsReadlink, [](const char *path, char *buf, size_t bufsize) -> int {
        constexpr char devicePath[] = "/sys/devices/pci0000:00/0000:00:01.0/0000:01:00.0/0000:02:01.0/0000:03:00.0";
        constexpr size_t devicePathSize = sizeof(devicePath);
        strcpy_s(buf, devicePathSize, devicePath);
        return devicePathSize;
    });
    // pread mock: hands back the textual form of pmuDriverType.
    VariableBackup<decltype(NEO::SysCalls::sysCallsPread)> mockPread(&NEO::SysCalls::sysCallsPread, [](int fd, void *buf, size_t count, off_t offset) -> ssize_t {
        std::ostringstream typeStream;
        typeStream << pmuDriverType;
        std::string typeText = typeStream.str();
        memcpy(buf, typeText.data(), count);
        return count;
    });

    // Swap the mocks into the sysman implementation; each backup restores the original on scope exit.
    pPmuInterface->mockPmuReadTile = true;
    VariableBackup<L0::Sysman::PmuInterface *> pmuBackup(&pLinuxSysmanImp->pPmuInterface, pPmuInterface.get());
    VariableBackup<L0::Sysman::FsAccessInterface *> fsBackup(&pLinuxSysmanImp->pFsAccess, pFsAccess.get());
    pSysfsAccess->isMultiTileArch = true;
    VariableBackup<L0::Sysman::SysFsAccessInterface *> sysfsBackup(&pLinuxSysmanImp->pSysfsAccess, pSysfsAccess.get());
    VariableBackup<L0::Sysman::FirmwareUtil *> fwBackup(&pLinuxSysmanImp->pFwUtilInterface, pRasFwUtilInterface.get());

    // Categories cleared (in this order) for the Tile 0 correctable handle.
    const zes_ras_error_category_exp_t tile0CorrectableClearList[] = {
        ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS,
        ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS,
        ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS};
    // Categories cleared (in this order) for both uncorrectable handles.
    const zes_ras_error_category_exp_t uncorrectableClearList[] = {
        ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS,
        ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS,
        ZES_RAS_ERROR_CATEGORY_EXP_DRIVER_ERRORS,
        ZES_RAS_ERROR_CATEGORY_EXP_SCALE_ERRORS,
        ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS,
        ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS};

    auto handles = getRasHandles(mockHandleCountForSubDevice);

    // Pass 1: validate the reported counters, then clear them.
    for (uint32_t handleIndex = 0u; handleIndex < handles.size(); handleIndex++) {
        const auto &handle = handles[handleIndex];
        ASSERT_NE(nullptr, handle);
        uint32_t count = 0;
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetStateExp(handle, &count, nullptr));
        EXPECT_NE(0u, count);
        std::vector<zes_ras_state_exp_t> rasStates(count);
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetStateExp(handle, &count, rasStates.data()));

        switch (handleIndex) {
        case 0u: {
            // Correctable errors for Tile 0
            for (uint32_t i = 0; i < count; i++) {
                const auto &state = rasStates[i];
                uint32_t expectedErrCount = 0u;
                bool isTracked = true;
                if (state.category == ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS) {
                    expectedErrCount = correctablel3Bank + initialCorrectableCacheErrorTile0;
                } else if (state.category == ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS) {
                    expectedErrCount = correctableGrfErrorCount + correctableEuErrorCount + initialCorrectableComputeErrorsTile0;
                } else if (state.category == ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS) {
                    expectedErrCount = correctableGscSramEcc + initialCorrectableNonComputeErrorsTile0;
                } else {
                    isTracked = false;
                }
                if (isTracked) {
                    EXPECT_EQ(state.errorCounter, expectedErrCount);
                }
            }
            for (const auto category : tile0CorrectableClearList) {
                EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(handle, category));
            }
            break;
        }
        case 1u: {
            // Uncorrectable errors for Tile 0
            for (uint32_t i = 0; i < count; i++) {
                const auto &state = rasStates[i];
                uint32_t expectedErrCount = 0u;
                bool isTracked = true;
                if (state.category == ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS) {
                    expectedErrCount = fatalTlb + initialUncorrectableCacheErrors;
                } else if (state.category == ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS) {
                    expectedErrCount = fatalSubslice + fatalEuErrorCount + initialUncorrectableComputeErrorsTile0;
                } else if (state.category == ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS) {
                    expectedErrCount = socFatalPsfCsc0Count + nonFatalGscAonParity + nonFataGscSelfmBist + initialUncorrectableNonComputeErrorsTile0;
                } else if (state.category == ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS) {
                    expectedErrCount = euAttention + initialProgrammingErrors;
                } else if (state.category == ZES_RAS_ERROR_CATEGORY_EXP_DRIVER_ERRORS) {
                    expectedErrCount = driverMigration + driverGgtt + driverRps + initialUncorrectableDriverErrors;
                } else {
                    isTracked = false;
                }
                if (isTracked) {
                    EXPECT_EQ(state.errorCounter, expectedErrCount);
                }
            }
            for (const auto category : uncorrectableClearList) {
                EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(handle, category));
            }
            break;
        }
        case 2u: {
            // Correctable errors for Tile 1; the clear is issued only when a compute entry is reported.
            for (uint32_t i = 0; i < count; i++) {
                const auto &state = rasStates[i];
                if (state.category != ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS) {
                    continue;
                }
                uint32_t expectedErrCount = correctableSubsliceTile1 + correctableGucErrorCountTile1 + correctableSamplerErrorCountTile1 + initialCorrectableComputeErrorsTile1;
                EXPECT_EQ(state.errorCounter, expectedErrCount);
                EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS));
            }
            break;
        }
        case 3u: {
            // Uncorrectable errors for Tile 1
            for (uint32_t i = 0; i < count; i++) {
                const auto &state = rasStates[i];
                uint32_t expectedErrCount = 0u;
                bool isTracked = true;
                if (state.category == ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS) {
                    expectedErrCount = fatalL3BankTile1 + fatalIdiParityErrorCountTile1 + initialUncorrectableCacheErrorsTile1; // No. of uncorrectable error type for subdevice 1
                } else if (state.category == ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS) {
                    expectedErrCount = fatalGucErrorCountTile1 + initialUncorrectableComputeErrorsTile1;
                } else if (state.category == ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS) {
                    expectedErrCount = socFatalMdfiWestCountTile1 + socFatalPunitTile1 + initialUncorrectableNonComputeErrorsTile1;
                } else if (state.category == ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS) {
                    expectedErrCount = euAttentionTile1 + initialProgrammingErrorsTile1;
                } else if (state.category == ZES_RAS_ERROR_CATEGORY_EXP_DRIVER_ERRORS) {
                    expectedErrCount = driverMigration + driverEngineOther + initialUncorrectableDriverErrorsTile1;
                } else {
                    isTracked = false;
                }
                if (isTracked) {
                    EXPECT_EQ(state.errorCounter, expectedErrCount);
                }
            }
            for (const auto category : uncorrectableClearList) {
                EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(handle, category));
            }
            break;
        }
        default:
            break;
        }
    }

    // Pass 2: re-read every handle and expect the previously checked categories to report zero.
    for (uint32_t handleIndex = 0u; handleIndex < handles.size(); handleIndex++) {
        const auto &handle = handles[handleIndex];
        ASSERT_NE(nullptr, handle);
        uint32_t count = 0;
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetStateExp(handle, &count, nullptr));
        EXPECT_NE(0u, count);
        std::vector<zes_ras_state_exp_t> rasStates(count);
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetStateExp(handle, &count, rasStates.data()));

        switch (handleIndex) {
        case 0u: {
            // Correctable errors for Tile 0
            for (uint32_t i = 0; i < count; i++) {
                const auto category = rasStates[i].category;
                if (category == ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS ||
                    category == ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS ||
                    category == ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS) {
                    EXPECT_EQ(rasStates[i].errorCounter, 0u);
                }
            }
            break;
        }
        case 1u:
        case 3u: {
            // Uncorrectable handles share the same set of verified categories.
            for (uint32_t i = 0; i < count; i++) {
                const auto category = rasStates[i].category;
                if (category == ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS ||
                    category == ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS ||
                    category == ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS ||
                    category == ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS ||
                    category == ZES_RAS_ERROR_CATEGORY_EXP_DRIVER_ERRORS) {
                    EXPECT_EQ(rasStates[i].errorCounter, 0u);
                }
            }
            break;
        }
        case 2u: {
            // Only the first compute entry is inspected for this handle.
            bool computeChecked = false;
            for (uint32_t i = 0; i < count && !computeChecked; i++) {
                if (rasStates[i].category == ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS) {
                    EXPECT_EQ(rasStates[i].errorCounter, 0u);
                    computeChecked = true;
                }
            }
            break;
        }
        default:
            break;
        }
    }
}
// For each sub-device RAS handle: checks the memory-error counter reported by
// zesRasGetStateExp, clears the memory category via zesRasClearStateExp, then re-reads
// the state and expects the memory-error counter to be zero.
TEST_F(SysmanRasExpMultiDeviceFixture, GivenValidRasHandleWhenCallingzesRasClearStateExpAndGetStateExpForHbmThenVerifyErrorCountersAreCleared) {
    // Swap the mocks into the sysman implementation; each backup restores the original on scope exit.
    pPmuInterface->mockPmuReadResult = true;
    VariableBackup<L0::Sysman::PmuInterface *> pmuBackup(&pLinuxSysmanImp->pPmuInterface, pPmuInterface.get());
    pRasFwUtilInterface->mockMemorySuccess = true;
    VariableBackup<L0::Sysman::FirmwareUtil *> fwBackup(&pLinuxSysmanImp->pFwUtilInterface, pRasFwUtilInterface.get());
    VariableBackup<L0::Sysman::FsAccessInterface *> fsBackup(&pLinuxSysmanImp->pFsAccess, pFsAccess.get());
    pSysfsAccess->isMultiTileArch = true;
    VariableBackup<L0::Sysman::SysFsAccessInterface *> sysfsBackup(&pLinuxSysmanImp->pSysfsAccess, pSysfsAccess.get());

    // Returns the first of the leading stateCount entries whose category is MEMORY_ERRORS, or nullptr.
    auto findMemoryState = [](const std::vector<zes_ras_state_exp_t> &states, uint32_t stateCount) -> const zes_ras_state_exp_t * {
        for (uint32_t i = 0; i < stateCount; i++) {
            if (states[i].category == ZES_RAS_ERROR_CATEGORY_EXP_MEMORY_ERRORS) {
                return &states[i];
            }
        }
        return nullptr;
    };

    auto handles = getRasHandles(mockHandleCountForSubDevice);

    // Pass 1: validate the reported memory counter, then clear the memory category.
    for (uint32_t handleIndex = 0u; handleIndex < handles.size(); handleIndex++) {
        const auto &handle = handles[handleIndex];
        ASSERT_NE(nullptr, handle);
        uint32_t count = 0;
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetStateExp(handle, &count, nullptr));
        EXPECT_NE(0u, count);
        std::vector<zes_ras_state_exp_t> rasStates(count);
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetStateExp(handle, &count, rasStates.data()));

        switch (handleIndex) {
        case 0u:   // Correctable errors for Tile 0
        case 2u: { // Correctable errors for Tile 1
            const auto *pMemoryState = findMemoryState(rasStates, count);
            if (pMemoryState != nullptr) {
                EXPECT_EQ(pMemoryState->errorCounter, hbmCorrectableErrorCount);
            }
            break;
        }
        case 1u:   // Uncorrectable errors for Tile 0
        case 3u: { // Uncorrectable errors for Tile 1
            const auto *pMemoryState = findMemoryState(rasStates, count);
            if (pMemoryState != nullptr) {
                EXPECT_EQ(pMemoryState->errorCounter, hbmUncorrectableErrorCount);
            }
            break;
        }
        default:
            break;
        }
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasClearStateExp(handle, ZES_RAS_ERROR_CATEGORY_EXP_MEMORY_ERRORS));
    }

    // Pass 2: every handle must now report a zero memory-error counter.
    for (const auto &handle : handles) {
        ASSERT_NE(nullptr, handle);
        uint32_t count = 0;
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetStateExp(handle, &count, nullptr));
        EXPECT_NE(0u, count);
        std::vector<zes_ras_state_exp_t> rasStates(count);
        EXPECT_EQ(ZE_RESULT_SUCCESS, zesRasGetStateExp(handle, &count, rasStates.data()));
        const auto *pMemoryState = findMemoryState(rasStates, count);
        if (pMemoryState != nullptr) {
            EXPECT_EQ(pMemoryState->errorCounter, 0u);
        }
    }
}
} // namespace ult
} // namespace Sysman
} // namespace L0

View File

@@ -50,6 +50,17 @@ TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingRasGetStateThenFailureIsR
EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, pRasImp->rasGetState(&state, false));
}
// Expects rasGetStateExp on a directly constructed RasImp to report UNSUPPORTED_FEATURE
// when asked only for the category count.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingRasGetStateExpThenFailureIsReturned) {
    uint32_t categoryCount = 0;
    auto pRasImp = std::make_unique<L0::Sysman::RasImp>(pOsSysman, ZES_RAS_ERROR_TYPE_CORRECTABLE, false, 0);
    const ze_result_t result = pRasImp->rasGetStateExp(&categoryCount, nullptr);
    EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, result);
}
// Expects rasClearStateExp on a directly constructed RasImp to report UNSUPPORTED_FEATURE.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingRasClearStateExpThenFailureIsReturned) {
    auto pRasImp = std::make_unique<L0::Sysman::RasImp>(pOsSysman, ZES_RAS_ERROR_TYPE_CORRECTABLE, false, 0);
    const ze_result_t result = pRasImp->rasClearStateExp(ZES_RAS_ERROR_CATEGORY_EXP_RESET);
    EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, result);
}
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingRasGetConfigThenFailureIsReturned) {
auto pRasImp = std::make_unique<L0::Sysman::RasImp>(pOsSysman, ZES_RAS_ERROR_TYPE_CORRECTABLE, false, 0);
zes_ras_config_t config = {};