feature(sysman): Add support for RAS exp APIs

Related-To: NEO-8839, NEO-8873

Signed-off-by: Bellekallu Rajkiran <bellekallu.rajkiran@intel.com>
This commit is contained in:
Bellekallu Rajkiran
2023-12-21 05:59:23 +00:00
committed by Compute-Runtime-Automation
parent c339e57041
commit b39aafec26
16 changed files with 1389 additions and 40 deletions

View File

@@ -15,6 +15,7 @@
#include "drm/intel_hwconfig_types.h"
#include <algorithm>
namespace L0 {
static bool isMemoryTypeHbm(LinuxSysmanImp *pLinuxSysmanImp) {
@@ -84,6 +85,62 @@ ze_result_t LinuxRasImp::osRasGetState(zes_ras_state_t &state, ze_bool_t clear)
return result;
}
/**
 * Gathers the per-category RAS error state from every registered RAS source.
 *
 * @param pCount In/out entry count. When *pCount is 0 it is set to the total number of
 *               categories reported by all sources and the function returns immediately.
 *               Otherwise at most *pCount entries are written; a value larger than the
 *               total is lowered to the total.
 * @param pState Output array with room for *pCount entries. Not accessed when *pCount is 0.
 * @return ZE_RESULT_SUCCESS when only the count was queried or when at least one source
 *         reported its state, ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE when no source succeeded.
 */
ze_result_t LinuxRasImp::osRasGetStateExp(uint32_t *pCount, zes_ras_state_exp_t *pState) {
    ze_result_t result = ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE;
    uint32_t totalCategoryCount = 0;
    std::vector<uint32_t> numCategoriesBySources = {};
    numCategoriesBySources.reserve(rasSources.size());
    for (auto &rasSource : rasSources) {
        // Store each source's own category count, not the running total, so that a source is
        // never asked for more entries than it owns.
        const uint32_t sourceCategoryCount = rasSource->osRasGetCategoryCount();
        numCategoriesBySources.push_back(sourceCategoryCount);
        totalCategoryCount += sourceCategoryCount;
    }

    if (*pCount == 0) {
        *pCount = totalCategoryCount;
        return ZE_RESULT_SUCCESS;
    }

    // A request larger than what is available is reduced to the available number of categories.
    if (*pCount > totalCategoryCount) {
        *pCount = totalCategoryCount;
    }

    uint32_t remainingCategories = *pCount;
    uint32_t numCategoriesAssigned = 0u;
    for (uint32_t rasSourceIdx = 0u; (rasSourceIdx < rasSources.size()) && (remainingCategories != 0u); rasSourceIdx++) {
        auto &rasSource = rasSources[rasSourceIdx];
        const uint32_t numCategoriesRequested = std::min(remainingCategories, numCategoriesBySources[rasSourceIdx]);
        if (numCategoriesRequested == 0u) {
            // Nothing to request from a source that reports no categories.
            continue;
        }

        const ze_result_t localResult = rasSource->osRasGetStateExp(numCategoriesRequested, &pState[numCategoriesAssigned]);
        if (localResult != ZE_RESULT_SUCCESS) {
            // A failing source is skipped; its slots stay available for the next source.
            continue;
        }

        // Advance by the number of entries actually handed to this source so the next
        // source writes directly behind them.
        remainingCategories -= numCategoriesRequested;
        numCategoriesAssigned += numCategoriesRequested;
        result = localResult;
    }
    return result;
}
/**
 * Clears the error counter of one RAS category on every source that tracks it.
 *
 * @param category Category whose counter is to be cleared.
 * @return ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS when the caller is not root,
 *         ZE_RESULT_ERROR_INVALID_ENUMERATION when category is above
 *         ZES_RAS_ERROR_CATEGORY_EXP_L3FABRIC_ERRORS,
 *         ZE_RESULT_ERROR_NOT_AVAILABLE when no source handles the category,
 *         the first other error reported by a source, otherwise ZE_RESULT_SUCCESS.
 */
ze_result_t LinuxRasImp::osRasClearStateExp(zes_ras_error_category_exp_t category) {
    if (pFsAccess->isRootUser() == false) {
        NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Insufficient permissions and returning error:0x%x \n", __FUNCTION__, ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS);
        return ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS;
    }
    if (ZES_RAS_ERROR_CATEGORY_EXP_L3FABRIC_ERRORS < category) {
        return ZE_RESULT_ERROR_INVALID_ENUMERATION;
    }

    ze_result_t result = ZE_RESULT_ERROR_NOT_AVAILABLE;
    for (auto &rasSource : rasSources) {
        const ze_result_t localResult = rasSource->osRasClearStateExp(category);
        if (localResult == ZE_RESULT_ERROR_NOT_AVAILABLE) {
            // This source does not track the category; keep the outcome of earlier sources
            // instead of overwriting a previous success.
            continue;
        }
        if (localResult != ZE_RESULT_SUCCESS) {
            return localResult;
        }
        result = ZE_RESULT_SUCCESS;
    }
    return result;
}
void LinuxRasImp::initSources() {
rasSources.push_back(std::make_unique<L0::LinuxRasSourceGt>(pLinuxSysmanImp, osRasErrorType, isSubdevice, subdeviceId));
if (isMemoryTypeHbm(pLinuxSysmanImp) == true) {

View File

@@ -31,6 +31,8 @@ class LinuxRasImp : public OsRas, NEO::NonCopyableOrMovableClass {
ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
ze_result_t osRasGetConfig(zes_ras_config_t *config) override;
ze_result_t osRasSetConfig(const zes_ras_config_t *config) override;
ze_result_t osRasGetStateExp(uint32_t *pCount, zes_ras_state_exp_t *pState) override;
ze_result_t osRasClearStateExp(zes_ras_error_category_exp_t category) override;
LinuxRasImp(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId);
LinuxRasImp() = default;
~LinuxRasImp() override = default;
@@ -52,13 +54,19 @@ class LinuxRasImp : public OsRas, NEO::NonCopyableOrMovableClass {
// Abstract interface for a single source of RAS error counters.
// LinuxRasSourceGt and LinuxRasSourceHbm in this file derive from it.
class LinuxRasSources : NEO::NonCopyableOrMovableClass {
public:
// Reports the error state of this source into `state`; `clear` asks the source to reset its counters.
virtual ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) = 0;
// Writes per-category entries into `pState`; `numCategoriesRequested` is the number of entries the caller asks for.
virtual ze_result_t osRasGetStateExp(uint32_t numCategoriesRequested, zes_ras_state_exp_t *pState) = 0;
// Returns how many error categories this source reports.
virtual uint32_t osRasGetCategoryCount() = 0;
// Clears the counter of the given category on this source.
virtual ze_result_t osRasClearStateExp(zes_ras_error_category_exp_t category) = 0;
virtual ~LinuxRasSources() = default;
};
class LinuxRasSourceGt : public LinuxRasSources {
public:
ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
ze_result_t osRasGetStateExp(uint32_t numCategoriesRequested, zes_ras_state_exp_t *pState) override;
ze_result_t osRasClearStateExp(zes_ras_error_category_exp_t category) override;
static void getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle);
uint32_t osRasGetCategoryCount() override;
LinuxRasSourceGt(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId);
LinuxRasSourceGt() = default;
~LinuxRasSourceGt() override;
@@ -82,11 +90,14 @@ class LinuxRasSourceGt : public LinuxRasSources {
const std::string &errorCounterDir,
uint64_t &errorVal);
void closeFds();
// Returns true while the bit of `category` in clearStatus is not set,
// false once that bit has been raised.
bool getAbsoluteCount(zes_ras_error_category_exp_t category) {
    const bool clearBitSet = (clearStatus & (1 << category)) != 0;
    return !clearBitSet;
}
int64_t groupFd = -1;
std::vector<int64_t> memberFds = {};
uint64_t initialErrorCount[maxRasErrorCategoryCount] = {0};
std::map<zes_ras_error_cat_t, uint64_t> errorCategoryToEventCount;
uint64_t totalEventCount = 0;
uint32_t clearStatus = 0;
std::map<zes_ras_error_category_exp_t, uint64_t> errorCategoryToEventCount;
bool isSubdevice = false;
uint32_t subdeviceId = 0;
};
@@ -94,12 +105,16 @@ class LinuxRasSourceGt : public LinuxRasSources {
class LinuxRasSourceHbm : public LinuxRasSources {
public:
ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
ze_result_t osRasGetStateExp(uint32_t numCategoriesRequested, zes_ras_state_exp_t *pState) override;
ze_result_t osRasClearStateExp(zes_ras_error_category_exp_t category) override;
static void getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle);
uint32_t osRasGetCategoryCount() override;
LinuxRasSourceHbm(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, uint32_t subdeviceId);
LinuxRasSourceHbm() = default;
~LinuxRasSourceHbm() override{};
protected:
ze_result_t getMemoryErrorCountFromFw(zes_ras_error_type_t rasErrorType, uint32_t subDeviceCount, uint64_t &errorCount);
LinuxSysmanImp *pLinuxSysmanImp = nullptr;
zes_ras_error_type_t osRasErrorType = {};
FirmwareUtil *pFwInterface = nullptr;
@@ -108,6 +123,7 @@ class LinuxRasSourceHbm : public LinuxRasSources {
private:
uint64_t errorBaseline = 0;
uint32_t subdeviceId = 0;
uint32_t subDeviceCount = 0;
};
} // namespace L0

View File

@@ -12,16 +12,16 @@
#include "level_zero/tools/source/sysman/sysman_imp.h"
namespace L0 {
static const std::map<zes_ras_error_cat_t, std::vector<std::string>> categoryToListOfEventsUncorrectable = {
{ZES_RAS_ERROR_CAT_CACHE_ERRORS,
static const std::map<zes_ras_error_category_exp_t, std::vector<std::string>> categoryToListOfEventsUncorrectable = {
{ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS,
{"fatal-array-bist", "fatal-idi-parity", "fatal-l3-double",
"fatal-l3-ecc-checker",
"fatal-sqidi", "fatal-tlb", "fatal-l3bank"}},
{ZES_RAS_ERROR_CAT_RESET,
{ZES_RAS_ERROR_CATEGORY_EXP_RESET,
{"engine-reset"}},
{ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS,
{ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS,
{"eu-attention"}},
{ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS,
{ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS,
{"soc-fatal-psf-0", "soc-fatal-psf-1", "soc-fatal-psf-2", "soc-fatal-psf-csc-0",
"soc-fatal-psf-csc-1", "soc-fatal-psf-csc-2", "soc-fatal-punit",
"sgunit-fatal", "soc-nonfatal-punit", "sgunit-fatal", "sgunit-nonfatal", "gsc-nonfatal-mia-shutdown",
@@ -30,20 +30,20 @@ static const std::map<zes_ras_error_cat_t, std::vector<std::string>> categoryToL
"gsc-nonfatal-ucode-parity", "gsc-nonfatal-mia-int", "gsc-nonfatal-wdg-timeout", "soc-fatal-mdfi-east",
"soc-fatal-mdfi-south", "soc-nonfatal-mdfi-east", "soc-nonfatal-mdfi-south", "soc-fatal-mdfi-west",
"soc-fatal-cd0-mdfi", "soc-nonfatal-cd0-mdfi"}},
{ZES_RAS_ERROR_CAT_COMPUTE_ERRORS,
{ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS,
{"fatal-fpu", "fatal-eu-grf", "fatal-sampler", "fatal-slm",
"fatal-guc", "fatal-eu-ic", "fatal-subslice", "fatal-l3-fabric"}},
{ZES_RAS_ERROR_CAT_DRIVER_ERRORS,
{ZES_RAS_ERROR_CATEGORY_EXP_DRIVER_ERRORS,
{"driver-object-migration", "driver-engine-other", "driver-ggtt",
"driver-gt-interrupt", "driver-gt-other", "driver-guc-communication",
"driver-rps"}}};
static const std::map<zes_ras_error_cat_t, std::vector<std::string>> categoryToListOfEventsCorrectable = {
{ZES_RAS_ERROR_CAT_CACHE_ERRORS,
static const std::map<zes_ras_error_category_exp_t, std::vector<std::string>> categoryToListOfEventsCorrectable = {
{ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS,
{"correctable-l3-sng", "correctable-l3bank"}},
{ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS,
{ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS,
{"sgunit-correctable", "gsc-correctable-sram-ecc"}},
{ZES_RAS_ERROR_CAT_COMPUTE_ERRORS,
{ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS,
{"correctable-eu-grf", "correctable-eu-ic", "correctable-guc", "correctable-sampler", "correctable-slm", "correctable-subslice"}}};
static void closeFd(int64_t &fd) {
@@ -93,7 +93,7 @@ static uint64_t convertHexToUint64(std::string strVal) {
return config;
}
static bool getErrorType(std::map<zes_ras_error_cat_t, std::vector<std::string>> categoryToListOfEvents, std::vector<std::string> &eventList, ze_device_handle_t deviceHandle) {
static bool getErrorType(std::map<zes_ras_error_category_exp_t, std::vector<std::string>> categoryToListOfEvents, std::vector<std::string> &eventList, ze_device_handle_t deviceHandle) {
ze_bool_t onSubDevice = false;
uint32_t subDeviceId = 0;
SysmanDeviceImp::getSysmanDeviceInfo(deviceHandle, subDeviceId, onSubDevice, true);
@@ -149,7 +149,6 @@ void LinuxRasSourceGt::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t>
ze_result_t LinuxRasSourceGt::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) {
if (clear == true) {
closeFds();
totalEventCount = 0;
memset(state.category, 0, maxRasErrorCategoryCount * sizeof(uint64_t));
memset(initialErrorCount, 0, maxRasErrorCategoryCount * sizeof(uint64_t));
}
@@ -160,14 +159,8 @@ ze_result_t LinuxRasSourceGt::osRasGetState(zes_ras_state_t &state, ze_bool_t cl
return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
std::map<zes_ras_error_cat_t, std::vector<std::string>> categoryToEvent;
if (osRasErrorType == ZES_RAS_ERROR_TYPE_CORRECTABLE) {
categoryToEvent = categoryToListOfEventsCorrectable;
}
if (osRasErrorType == ZES_RAS_ERROR_TYPE_UNCORRECTABLE) {
categoryToEvent = categoryToListOfEventsUncorrectable;
}
std::vector<std::uint64_t> data(2 + totalEventCount, 0); // In data[], event count starts from second index, first value gives number of events and second value is for timestamp
auto numEvents = memberFds.size() + 1; // Add 1 for group Fd
std::vector<std::uint64_t> data(2 + numEvents, 0); // In data[], event count starts from second index, first value gives number of events and second value is for timestamp
if (pPmuInterface->pmuRead(static_cast<int>(groupFd), data.data(), sizeof(uint64_t) * data.size()) < 0) {
return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
@@ -186,6 +179,57 @@ ze_result_t LinuxRasSourceGt::osRasGetState(zes_ras_state_t &state, ze_bool_t cl
return ZE_RESULT_SUCCESS;
}
/**
 * Reads the grouped PMU counters and reports one entry per tracked error category.
 *
 * @param numCategoriesRequested Maximum number of entries to write into pState.
 * @param pState                 Output array receiving category and error counter pairs.
 * @return ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE when no group fd is open or the PMU read
 *         fails, ZE_RESULT_SUCCESS otherwise.
 */
ze_result_t LinuxRasSourceGt::osRasGetStateExp(uint32_t numCategoriesRequested, zes_ras_state_exp_t *pState) {
    initRasErrors(false);
    if (groupFd < 0) {
        return ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE;
    }

    // Buffer layout: data[0] holds the number of events, data[1] the timestamp, and the
    // event counts follow from index 2. One slot per member fd plus one for the group fd.
    const auto numEvents = memberFds.size() + 1;
    std::vector<std::uint64_t> data(2 + numEvents, 0);
    const auto bytesToRead = sizeof(uint64_t) * data.size();
    if (pPmuInterface->pmuRead(static_cast<int>(groupFd), data.data(), bytesToRead) < 0) {
        return ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE;
    }

    // Walk the categories in map order; each category owns a consecutive run of event
    // counts in the buffer whose length is the mapped event count.
    uint64_t readIndex = 2;
    uint32_t categoryIdx = 0u;
    for (const auto &categoryEntry : errorCategoryToEventCount) {
        if (categoryIdx >= numCategoriesRequested) {
            break;
        }
        const uint64_t eventsInCategory = categoryEntry.second;
        uint64_t errorCount = 0;
        for (uint64_t eventIdx = 0; eventIdx < eventsInCategory; eventIdx++) {
            errorCount += data[readIndex + eventIdx];
        }
        pState[categoryIdx].category = categoryEntry.first;
        pState[categoryIdx].errorCounter = errorCount + initialErrorCount[categoryEntry.first];
        readIndex += eventsInCategory;
        categoryIdx++;
    }
    return ZE_RESULT_SUCCESS;
}
/**
 * Clears the counter of `category` if this source has an entry for it.
 *
 * @return ZE_RESULT_ERROR_NOT_AVAILABLE when the category is not present in
 *         errorCategoryToEventCount, ZE_RESULT_SUCCESS otherwise.
 */
ze_result_t LinuxRasSourceGt::osRasClearStateExp(zes_ras_error_category_exp_t category) {
    // Only categories that were already initialized can be cleared.
    if (errorCategoryToEventCount.count(category) == 0) {
        return ZE_RESULT_ERROR_NOT_AVAILABLE;
    }

    closeFds();
    clearStatus |= (1 << category); // remember that this category was cleared
    initialErrorCount[category] = 0;
    return ZE_RESULT_SUCCESS;
}
// Returns the number of categories in the event table matching this source's error type:
// the uncorrectable table for ZES_RAS_ERROR_TYPE_UNCORRECTABLE, the correctable table otherwise.
uint32_t LinuxRasSourceGt::osRasGetCategoryCount() {
    const auto &categoryToEvents = (osRasErrorType == ZES_RAS_ERROR_TYPE_UNCORRECTABLE)
                                       ? categoryToListOfEventsUncorrectable
                                       : categoryToListOfEventsCorrectable;
    return static_cast<uint32_t>(categoryToEvents.size());
}
ze_result_t LinuxRasSourceGt::getPmuConfig(
const std::string &eventDirectory,
const std::vector<std::string> &listOfEvents,
@@ -220,7 +264,7 @@ void LinuxRasSourceGt::initRasErrors(ze_bool_t clear) {
if (result != ZE_RESULT_SUCCESS) {
return;
}
std::map<zes_ras_error_cat_t, std::vector<std::string>> categoryToListOfEvents;
std::map<zes_ras_error_category_exp_t, std::vector<std::string>> categoryToListOfEvents;
if (osRasErrorType == ZES_RAS_ERROR_TYPE_CORRECTABLE) {
categoryToListOfEvents = categoryToListOfEventsCorrectable;
}
@@ -251,7 +295,7 @@ void LinuxRasSourceGt::initRasErrors(ze_bool_t clear) {
errorPrefixLocal = "error--";
}
uint64_t initialErrorVal = 0;
if (clear == false) {
if ((clear == false) && (getAbsoluteCount(rasErrorCatToListOfEvents.first) == true)) {
result = getBootUpErrorCountFromSysfs(nameOfError, errorCounterDirLocal, initialErrorVal);
if (result != ZE_RESULT_SUCCESS) {
continue;
@@ -275,9 +319,9 @@ void LinuxRasSourceGt::initRasErrors(ze_bool_t clear) {
eventCount++;
errorCount += initialErrorVal;
}
clearStatus &= ~(1 << rasErrorCatToListOfEvents.first);
initialErrorCount[rasErrorCatToListOfEvents.first] = errorCount;
errorCategoryToEventCount[rasErrorCatToListOfEvents.first] = eventCount;
totalEventCount += eventCount;
}
}

View File

@@ -13,6 +13,13 @@
namespace L0 {
/**
 * Queries the firmware interface for the memory error count of this source's sub-device.
 *
 * @param rasErrorType   Error type forwarded to the firmware interface.
 * @param subDeviceCount Sub-device count forwarded to the firmware interface.
 * @param errorCount     Receives the count reported by the firmware interface.
 * @return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE when no firmware interface is available,
 *         otherwise the result of fwGetMemoryErrorCount().
 */
ze_result_t LinuxRasSourceHbm::getMemoryErrorCountFromFw(zes_ras_error_type_t rasErrorType, uint32_t subDeviceCount, uint64_t &errorCount) {
    if (pFwInterface != nullptr) {
        return pFwInterface->fwGetMemoryErrorCount(rasErrorType, subDeviceCount, subdeviceId, errorCount);
    }
    return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
void LinuxRasSourceHbm::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle) {
LinuxSysmanImp *pLinuxSysmanImp = static_cast<LinuxSysmanImp *>(pOsSysman);
FirmwareUtil *pFwInterface = pLinuxSysmanImp->getFwUtilInterface();
@@ -23,14 +30,9 @@ void LinuxRasSourceHbm::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t>
}
ze_result_t LinuxRasSourceHbm::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) {
if (pFwInterface == nullptr) {
return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
uint32_t subDeviceCount = 0;
pDevice->getSubDevices(&subDeviceCount, nullptr);
if (clear == true) {
uint64_t errorCount = 0;
ze_result_t result = pFwInterface->fwGetMemoryErrorCount(osRasErrorType, subDeviceCount, subdeviceId, errorCount);
ze_result_t result = getMemoryErrorCountFromFw(osRasErrorType, this->subDeviceCount, errorCount);
if (result != ZE_RESULT_SUCCESS) {
NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Failed while getting fwGetMemoryErrorCount() for RasErrorType:%d, SubDeviceCount:%d, SubdeviceId:%d, errorBaseline update:%d and returning error:0x%x \n", __FUNCTION__, osRasErrorType, subDeviceCount, subdeviceId, clear, result);
return result;
@@ -38,7 +40,7 @@ ze_result_t LinuxRasSourceHbm::osRasGetState(zes_ras_state_t &state, ze_bool_t c
errorBaseline = errorCount; // during clear update the error baseline value
}
uint64_t errorCount = 0;
ze_result_t result = pFwInterface->fwGetMemoryErrorCount(osRasErrorType, subDeviceCount, subdeviceId, errorCount);
ze_result_t result = getMemoryErrorCountFromFw(osRasErrorType, this->subDeviceCount, errorCount);
if (result != ZE_RESULT_SUCCESS) {
NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Failed while getting fwGetMemoryErrorCount() for RasErrorType:%d, SubDeviceCount:%d, SubdeviceId:%d, errorBaseline update:%d and returning error:0x%x \n", __FUNCTION__, osRasErrorType, subDeviceCount, subdeviceId, clear, result);
return result;
@@ -47,9 +49,40 @@ ze_result_t LinuxRasSourceHbm::osRasGetState(zes_ras_state_t &state, ze_bool_t c
return ZE_RESULT_SUCCESS;
}
/**
 * Reports the memory error counter of this HBM source.
 *
 * @param numCategoriesRequested Number of entries the caller provides in pState. This source
 *                               produces a single entry, so nothing is written when it is 0.
 * @param pState                 Output array; entry 0 receives the MEMORY_ERRORS category and
 *                               the firmware count minus the stored baseline.
 * @return The error from the firmware query when it fails, ZE_RESULT_SUCCESS otherwise.
 */
ze_result_t LinuxRasSourceHbm::osRasGetStateExp(uint32_t numCategoriesRequested, zes_ras_state_exp_t *pState) {
    uint64_t errorCount = 0;
    ze_result_t result = getMemoryErrorCountFromFw(osRasErrorType, this->subDeviceCount, errorCount);
    if (result != ZE_RESULT_SUCCESS) {
        return result;
    }
    // Honor the requested entry count the same way the GT source does: with zero entries
    // requested there is no slot to write to, so pState must not be touched.
    if (numCategoriesRequested == 0u) {
        return ZE_RESULT_SUCCESS;
    }
    pState[0].category = ZES_RAS_ERROR_CATEGORY_EXP_MEMORY_ERRORS;
    pState[0].errorCounter = errorCount - errorBaseline;
    return ZE_RESULT_SUCCESS;
}
/**
 * Clears the memory error counter by moving the baseline to the current firmware count.
 *
 * @param category Only ZES_RAS_ERROR_CATEGORY_EXP_MEMORY_ERRORS triggers any work; every
 *                 other category returns ZE_RESULT_SUCCESS without touching the baseline.
 * @return The error from the firmware query when it fails, ZE_RESULT_SUCCESS otherwise.
 */
ze_result_t LinuxRasSourceHbm::osRasClearStateExp(zes_ras_error_category_exp_t category) {
    if (category != ZES_RAS_ERROR_CATEGORY_EXP_MEMORY_ERRORS) {
        return ZE_RESULT_SUCCESS;
    }

    uint64_t currentErrorCount = 0;
    const ze_result_t queryResult = getMemoryErrorCountFromFw(osRasErrorType, this->subDeviceCount, currentErrorCount);
    if (queryResult == ZE_RESULT_SUCCESS) {
        // Later reads report counts relative to this new baseline.
        errorBaseline = currentErrorCount;
    }
    return queryResult;
}
// The HBM source reports exactly one category: "MEMORY".
uint32_t LinuxRasSourceHbm::osRasGetCategoryCount() {
    const uint32_t memoryCategoryCount = 1u;
    return memoryCategoryCount;
}
// Builds an HBM RAS source for the given error type and sub-device.
// Stores the sysman implementation, error type and sub-device id, then looks up the
// firmware utility interface and the device handle from pLinuxSysmanImp.
LinuxRasSourceHbm::LinuxRasSourceHbm(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, uint32_t subdeviceId) : pLinuxSysmanImp(pLinuxSysmanImp), osRasErrorType(type), subdeviceId(subdeviceId) {
pFwInterface = pLinuxSysmanImp->getFwUtilInterface();
pDevice = pLinuxSysmanImp->getDeviceHandle();
// Query the sub-device count once here; passing nullptr asks only for the count.
pDevice->getSubDevices(&subDeviceCount, nullptr);
}
} // namespace L0

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2020-2021 Intel Corporation
* Copyright (C) 2020-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -20,6 +20,8 @@ class OsRas {
virtual ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) = 0;
virtual ze_result_t osRasGetConfig(zes_ras_config_t *config) = 0;
virtual ze_result_t osRasSetConfig(const zes_ras_config_t *config) = 0;
virtual ze_result_t osRasGetStateExp(uint32_t *pCount, zes_ras_state_exp_t *pState) = 0;
virtual ze_result_t osRasClearStateExp(zes_ras_error_category_exp_t category) = 0;
static OsRas *create(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId);
static void getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle);
virtual ~OsRas() = default;

View File

@@ -23,6 +23,8 @@ class Ras : _zes_ras_handle_t {
virtual ze_result_t rasGetConfig(zes_ras_config_t *pConfig) = 0;
virtual ze_result_t rasSetConfig(const zes_ras_config_t *pConfig) = 0;
virtual ze_result_t rasGetState(zes_ras_state_t *pState, ze_bool_t clear) = 0;
virtual ze_result_t rasGetStateExp(uint32_t *pCount, zes_ras_state_exp_t *pState) = 0;
virtual ze_result_t rasClearStateExp(zes_ras_error_category_exp_t category) = 0;
static Ras *fromHandle(zes_ras_handle_t handle) {
return static_cast<Ras *>(handle);

View File

@@ -34,6 +34,14 @@ ze_result_t RasImp::rasGetState(zes_ras_state_t *pState, ze_bool_t clear) {
return pOsRas->osRasGetState(*pState, clear);
}
// Forwards the state query to the OS-specific RAS implementation and returns its result.
ze_result_t RasImp::rasGetStateExp(uint32_t *pCount, zes_ras_state_exp_t *pState) {
    const ze_result_t status = pOsRas->osRasGetStateExp(pCount, pState);
    return status;
}
// Forwards the clear request to the OS-specific RAS implementation and returns its result.
ze_result_t RasImp::rasClearStateExp(zes_ras_error_category_exp_t category) {
    const ze_result_t status = pOsRas->osRasClearStateExp(category);
    return status;
}
void RasImp::init() {
pOsRas->osRasGetProperties(rasProperties);
}

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2020-2021 Intel Corporation
* Copyright (C) 2020-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -20,6 +20,8 @@ class RasImp : public Ras, NEO::NonCopyableOrMovableClass {
ze_result_t rasGetConfig(zes_ras_config_t *pConfig) override;
ze_result_t rasSetConfig(const zes_ras_config_t *pConfig) override;
ze_result_t rasGetState(zes_ras_state_t *pConfig, ze_bool_t clear) override;
ze_result_t rasGetStateExp(uint32_t *pCount, zes_ras_state_exp_t *pState) override;
ze_result_t rasClearStateExp(zes_ras_error_category_exp_t category) override;
RasImp() = default;
RasImp(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_device_handle_t deviceHandle);

View File

@@ -14,6 +14,8 @@ class WddmRasImp : public OsRas {
ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
ze_result_t osRasGetConfig(zes_ras_config_t *config) override;
ze_result_t osRasSetConfig(const zes_ras_config_t *config) override;
ze_result_t osRasGetStateExp(uint32_t *pCount, zes_ras_state_exp_t *pState) override;
ze_result_t osRasClearStateExp(zes_ras_error_category_exp_t category) override;
};
void OsRas::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle) {}
@@ -34,6 +36,14 @@ ze_result_t WddmRasImp::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) {
return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
// This WDDM implementation does not provide RAS state; the arguments are left untouched.
ze_result_t WddmRasImp::osRasGetStateExp(uint32_t *pCount, zes_ras_state_exp_t *pState) {
    const ze_result_t unsupported = ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    return unsupported;
}
// This WDDM implementation does not support clearing RAS state.
ze_result_t WddmRasImp::osRasClearStateExp(zes_ras_error_category_exp_t category) {
    const ze_result_t unsupported = ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    return unsupported;
}
OsRas *OsRas::create(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId) {
WddmRasImp *pWddmRasImp = new WddmRasImp();
return static_cast<OsRas *>(pWddmRasImp);

View File

@@ -8,6 +8,7 @@ set(L0_TESTS_TOOLS_SYSMAN_RAS_LINUX
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
${CMAKE_CURRENT_SOURCE_DIR}/test_zes_ras.cpp
${CMAKE_CURRENT_SOURCE_DIR}/mock_sysman_ras.h
${CMAKE_CURRENT_SOURCE_DIR}/test_zes_ras_exp.cpp
)
if(UNIX)

View File

@@ -277,7 +277,6 @@ struct MockRasSysfsAccess : public SysfsAccess {
bool isMultiTileArch = false;
ze_result_t readSymLink(const std::string file, std::string &val) override {
if (mockReadSymLinkStatus != ZE_RESULT_SUCCESS) {
return mockReadSymLinkStatus;
}

View File

@@ -216,7 +216,7 @@ TEST_F(SysmanRasFixture, GivenValidSysmanHandleWhenRetrievingRasHandlesIfRasEven
EXPECT_EQ(count, 0u);
}
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGeStateForGtThenSuccessIsReturned) {
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingzesRasGetStateForGtThenSuccessIsReturned) {
pPmuInterface->mockPmuReadCorrectable = true;

File diff suppressed because it is too large Load Diff

View File

@@ -68,5 +68,16 @@ TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingRasSetConfigThenFailureIs
EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, pRasImp->rasSetConfig(&config));
}
// rasGetStateExp must report ZE_RESULT_ERROR_UNSUPPORTED_FEATURE for this fixture.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingRasGetStateExpThenFailureIsReturned) {
    uint32_t count = 0;
    auto pRasImp = std::make_unique<L0::RasImp>(pOsSysman, ZES_RAS_ERROR_TYPE_CORRECTABLE, device->toHandle());
    const auto result = pRasImp->rasGetStateExp(&count, nullptr);
    EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, result);
}
// rasClearStateExp must report ZE_RESULT_ERROR_UNSUPPORTED_FEATURE for this fixture.
TEST_F(SysmanRasFixture, GivenValidRasHandleWhenCallingRasClearStateExpThenFailureIsReturned) {
    auto pRasImp = std::make_unique<L0::RasImp>(pOsSysman, ZES_RAS_ERROR_TYPE_CORRECTABLE, device->toHandle());
    const auto result = pRasImp->rasClearStateExp(ZES_RAS_ERROR_CATEGORY_EXP_RESET);
    EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, result);
}
} // namespace ult
} // namespace L0