feature(sysman): Add support for RAS module

- Port RAS module to new sysman design
- Add RAS ULTs for new sysman interface

Related-To: LOCI-4246

Signed-off-by: Aravind Gopalakrishnan <aravind.gopalakrishnan@intel.com>
This commit is contained in:
Aravind Gopalakrishnan
2023-05-05 22:19:35 +00:00
committed by Compute-Runtime-Automation
parent cfacbbd811
commit 826abf338a
25 changed files with 2517 additions and 8 deletions

View File

@@ -0,0 +1,28 @@
#
# Copyright (C) 2023 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
# Register the Linux RAS sources with the L0 static library; nothing is added on non-UNIX builds.
if(UNIX)
target_sources(${L0_STATIC_LIB_NAME}
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
)
# With i915 prelim detection enabled, build the prelim implementation together with its GT and HBM sources.
if(NEO_ENABLE_i915_PRELIM_DETECTION)
target_sources(${L0_STATIC_LIB_NAME}
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_prelim.cpp
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_prelim.h
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_gt.cpp
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_hbm.cpp
)
# Otherwise build the non-prelim implementation only.
else()
target_sources(${L0_STATIC_LIB_NAME}
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp.cpp
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp.h
)
endif()
endif()

View File

@@ -0,0 +1,57 @@
/*
* Copyright (C) 2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "level_zero/sysman/source/ras/linux/os_ras_imp.h"
#include "level_zero/sysman/source/linux/os_sysman_imp.h"
#include <cstring>
namespace L0 {
namespace Sysman {
// Stores the error type and sub-device identity for this handle and caches the
// Linux sysman object together with its FsAccess helper.
LinuxRasImp::LinuxRasImp(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId) : osRasErrorType(type), isSubdevice(onSubdevice), subdeviceId(subdeviceId) {
    auto *linuxSysman = static_cast<LinuxSysmanImp *>(pOsSysman);
    pLinuxSysmanImp = linuxSysman;
    pFsAccess = &linuxSysman->getFsAccess();
}
// Intentionally empty in this implementation: errorType is left untouched, so no
// RAS error type is reported as supported.
void OsRas::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_bool_t isSubDevice, uint32_t subDeviceId) {}
// State queries are not implemented here: both arguments are ignored and the
// call always reports ZE_RESULT_ERROR_UNSUPPORTED_FEATURE.
ze_result_t LinuxRasImp::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) {
    constexpr ze_result_t stateQueryResult = ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    return stateQueryResult;
}
// Copies the thresholds cached in this object (total and per category) into *config.
// Always returns ZE_RESULT_SUCCESS.
ze_result_t LinuxRasImp::osRasGetConfig(zes_ras_config_t *config) {
    for (uint32_t category = 0; category < maxRasErrorCategoryCount; category++) {
        config->detailedThresholds.category[category] = categoryThreshold[category];
    }
    config->totalThreshold = totalThreshold;
    return ZE_RESULT_SUCCESS;
}
// Caches the thresholds from *config in this object.
// Only permitted when FsAccess reports a root user; otherwise nothing is stored and
// ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS is returned.
ze_result_t LinuxRasImp::osRasSetConfig(const zes_ras_config_t *config) {
    if (pFsAccess->isRootUser() != true) {
        return ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS;
    }
    for (uint32_t category = 0; category < maxRasErrorCategoryCount; category++) {
        categoryThreshold[category] = config->detailedThresholds.category[category];
    }
    totalThreshold = config->totalThreshold;
    return ZE_RESULT_SUCCESS;
}
// Fills properties with the error type and sub-device identity given at construction
// and clears pNext. Always returns ZE_RESULT_SUCCESS.
ze_result_t LinuxRasImp::osRasGetProperties(zes_ras_properties_t &properties) {
    properties.type = osRasErrorType;
    properties.subdeviceId = subdeviceId;
    properties.onSubdevice = isSubdevice;
    properties.pNext = nullptr;
    return ZE_RESULT_SUCCESS;
}
// Factory: heap-allocates a LinuxRasImp for the given error type / sub-device and
// returns it through the OsRas interface. The object is created with new, so the
// caller receives a raw pointer to it.
OsRas *OsRas::create(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId) {
    return new LinuxRasImp(pOsSysman, type, onSubdevice, subdeviceId);
}
} // namespace Sysman
} // namespace L0

View File

@@ -0,0 +1,41 @@
/*
* Copyright (C) 2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#pragma once
#include "shared/source/helpers/non_copyable_or_moveable.h"
#include "level_zero/sysman/source/linux/fs_access.h"
#include "level_zero/sysman/source/ras/os_ras.h"
#include "level_zero/sysman/source/sysman_const.h"
namespace L0 {
namespace Sysman {
class LinuxSysmanImp;
// Linux implementation of the OsRas interface for one RAS error handle.
// Keeps the configured thresholds in memory alongside the handle's error type and
// sub-device identity.
class LinuxRasImp : public OsRas, NEO::NonCopyableOrMovableClass {
  public:
    LinuxRasImp() = default;
    LinuxRasImp(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId);
    ~LinuxRasImp() override = default;

    // OsRas interface
    ze_result_t osRasGetConfig(zes_ras_config_t *config) override;
    ze_result_t osRasSetConfig(const zes_ras_config_t *config) override;
    ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
    ze_result_t osRasGetProperties(zes_ras_properties_t &properties) override;

  protected:
    zes_ras_error_type_t osRasErrorType = {}; // error type served by this handle
    FsAccess *pFsAccess = nullptr;            // non-owning, obtained from pLinuxSysmanImp
    LinuxSysmanImp *pLinuxSysmanImp = nullptr;

  private:
    bool isSubdevice = false;
    uint32_t subdeviceId = 0;
    uint64_t totalThreshold = 0;                                // value returned by osRasGetConfig
    uint64_t categoryThreshold[maxRasErrorCategoryCount] = {0}; // per-category thresholds
};
} // namespace Sysman
} // namespace L0

View File

@@ -0,0 +1,284 @@
/*
* Copyright (C) 2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "level_zero/sysman/source/linux/os_sysman_imp.h"
#include "level_zero/sysman/source/ras/linux/os_ras_imp_prelim.h"
#include <cstring>
namespace L0 {
namespace Sysman {
// Uncorrectable RAS errors: maps each RAS error category to the names of the i915
// events that contribute to it.
// Every name in a list is opened as its own PMU counter by initRasErrors(), so each
// name must appear exactly once per category - a repeated entry would be counted twice.
// ("sgunit-fatal" was previously listed twice under NON_COMPUTE_ERRORS.)
static const std::map<zes_ras_error_cat_t, std::vector<std::string>> categoryToListOfEventsUncorrectable = {
    {ZES_RAS_ERROR_CAT_CACHE_ERRORS,
     {"fatal-array-bist", "fatal-idi-parity", "fatal-l3-double",
      "fatal-l3-ecc-checker",
      "fatal-sqidi", "fatal-tlb", "fatal-l3bank"}},
    {ZES_RAS_ERROR_CAT_RESET,
     {"engine-reset"}},
    {ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS,
     {"eu-attention"}},
    {ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS,
     {"soc-fatal-psf-0", "soc-fatal-psf-1", "soc-fatal-psf-2", "soc-fatal-psf-csc-0",
      "soc-fatal-psf-csc-1", "soc-fatal-psf-csc-2", "soc-fatal-punit",
      "sgunit-fatal", "soc-nonfatal-punit", "sgunit-nonfatal", "gsc-nonfatal-mia-shutdown",
      "gsc-nonfatal-aon-parity", "gsc-nonfatal-rom-parity", "gsc-nonfatal-fuse-crc-check",
      "gsc-nonfatal-selfmbist", "gsc-nonfatal-fuse-pull", "gsc-nonfatal-sram-ecc", "gsc-nonfatal-glitch-det",
      "gsc-nonfatal-ucode-parity", "gsc-nonfatal-mia-int", "gsc-nonfatal-wdg-timeout"}},
    {ZES_RAS_ERROR_CAT_COMPUTE_ERRORS,
     {"fatal-fpu", "fatal-eu-grf", "fatal-sampler", "fatal-slm",
      "fatal-guc", "fatal-eu-ic", "fatal-subslice"}},
    {ZES_RAS_ERROR_CAT_DRIVER_ERRORS,
     {"driver-object-migration", "driver-engine-other", "driver-ggtt",
      "driver-gt-interrupt", "driver-gt-other", "driver-guc-communication",
      "driver-rps"}}};
// Correctable RAS errors: maps each RAS error category to the names of the i915
// events that contribute to it.
static const std::map<zes_ras_error_cat_t, std::vector<std::string>> categoryToListOfEventsCorrectable = {
    {ZES_RAS_ERROR_CAT_CACHE_ERRORS,
     {"correctable-l3-sng",
      "correctable-l3bank"}},
    {ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS,
     {"sgunit-correctable",
      "gsc-correctable-sram-ecc"}},
    {ZES_RAS_ERROR_CAT_COMPUTE_ERRORS,
     {"correctable-eu-grf",
      "correctable-eu-ic",
      "correctable-guc",
      "correctable-sampler",
      "correctable-slm",
      "correctable-subslice"}}};
// Closes the descriptor held in fd and resets fd to -1.
// A value of -1 means "nothing open" and is left alone.
static void closeFd(int64_t &fd) {
    if (fd == -1) {
        return;
    }
    close(static_cast<int>(fd));
    fd = -1;
}
// Lists the i915 event nodes exposed for this device.
//
// The events live under /sys/devices/i915_<bdf>/events, where <bdf> is the last path
// component of the "device" symlink with ':' replaced by '_'
// (e.g. PCI slot 0000:01:00.0 -> /sys/devices/i915_0000_01_00.0/events).
//
// listOfEvents   receives the directory entries.
// eventDirectory optional; when non-null it receives the events directory path
//                (set before the directory is listed).
// Returns ZE_RESULT_SUCCESS, or ZE_RESULT_ERROR_UNSUPPORTED_FEATURE when either the
// symlink cannot be read or the directory cannot be listed.
static ze_result_t readI915EventsDirectory(LinuxSysmanImp *pLinuxSysmanImp, std::vector<std::string> &listOfEvents, std::string *eventDirectory) {
    SysfsAccess *sysfs = &pLinuxSysmanImp->getSysfsAccess();
    const std::string deviceLinkName("device");
    std::string deviceLinkTarget;
    if (sysfs->readSymLink(deviceLinkName, deviceLinkTarget) != ZE_RESULT_SUCCESS) {
        return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    }

    // Keep only the trailing component of the link target and turn it into the i915 PMU directory suffix.
    std::string bdf = deviceLinkTarget.substr(deviceLinkTarget.find_last_of('/') + 1);
    for (auto &ch : bdf) {
        if (ch == ':') {
            ch = '_';
        }
    }

    const std::string eventsPath = std::string("/sys/devices/") + "i915_" + bdf + "/" + "events";
    if (eventDirectory != nullptr) {
        *eventDirectory = eventsPath;
    }

    FsAccess *fs = &pLinuxSysmanImp->getFsAccess();
    if (fs->listDirectory(eventsPath, listOfEvents) != ZE_RESULT_SUCCESS) {
        return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    }
    return ZE_RESULT_SUCCESS;
}
// Parses the hexadecimal number that follows the first '=' in strVal
// (the whole string is parsed when there is no '=').
// Yields 0 when no number can be extracted.
static uint64_t convertHexToUint64(std::string strVal) {
    const auto separator = strVal.find('=');
    // npos + 1 wraps to 0, so a missing '=' selects the entire string.
    std::istringstream hexText(strVal.substr(separator + 1));
    uint64_t parsed = 0;
    hexText >> std::hex >> parsed;
    return parsed;
}
// Reports whether at least one event of the given category table is present in eventList.
//
// Naming convention of the files containing config values for errors:
//   error--<Name of error>       Ex:- error--engine-reset            (no subdevice)
//   error-gt<N>--<Name of error> Ex:- error-gt0--engine-reset        (with subdevices)
//   error--<Name of error>       Ex:- error--driver-object-migration (device level errors)
//
// categoryToListOfEvents category -> event-name table to probe; taken by const reference
//                        so the table is not deep-copied on every call.
// eventList              event file names to search (read-only).
// isSubDevice/subDeviceId select the "error-gt<N>--" prefix when isSubDevice is true.
// Returns true as soon as one matching event file name is found, false otherwise.
static bool getErrorType(const std::map<zes_ras_error_cat_t, std::vector<std::string>> &categoryToListOfEvents, const std::vector<std::string> &eventList, ze_bool_t isSubDevice, uint32_t subDeviceId) {
    const std::string deviceLevelPrefix = "error--"; // prefix string of the file containing config value for pmu counters
    const std::string errorPrefix = (isSubDevice == true)
                                        ? "error-gt" + std::to_string(subDeviceId) + "--"
                                        : deviceLevelPrefix;
    for (const auto &rasErrorCatToListOfEvents : categoryToListOfEvents) {
        for (const auto &nameOfError : rasErrorCatToListOfEvents.second) {
            // "driver-object-migration" occurs at device level and never carries a gt<N> prefix.
            const std::string &errorPrefixLocal = (nameOfError == "driver-object-migration") ? deviceLevelPrefix : errorPrefix;
            if (std::find(eventList.begin(), eventList.end(), errorPrefixLocal + nameOfError) != eventList.end()) {
                return true;
            }
        }
    }
    return false;
}
void LinuxRasSourceGt::closeFds() {
for (auto &memberFd : memberFds) {
closeFd(memberFd);
}
memberFds.clear();
closeFd(groupFd);
}
// Releases all descriptors still held by this source.
LinuxRasSourceGt::~LinuxRasSourceGt() {
    this->closeFds();
}
void LinuxRasSourceGt::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_bool_t isSubDevice, uint32_t subDeviceId) {
LinuxSysmanImp *pLinuxSysmanImp = static_cast<LinuxSysmanImp *>(pOsSysman);
std::vector<std::string> listOfEvents = {};
ze_result_t result = readI915EventsDirectory(pLinuxSysmanImp, listOfEvents, nullptr);
if (result != ZE_RESULT_SUCCESS) {
return;
}
if (getErrorType(categoryToListOfEventsCorrectable, listOfEvents, isSubDevice, subDeviceId) == true) {
errorType.insert(ZES_RAS_ERROR_TYPE_CORRECTABLE);
}
if (getErrorType(categoryToListOfEventsUncorrectable, listOfEvents, isSubDevice, subDeviceId) == true) {
errorType.insert(ZES_RAS_ERROR_TYPE_UNCORRECTABLE);
}
}
// Reads the per-category error counts for this source into state.category.
//
// state receives, for every category tracked in errorCategoryToEventCount, the sum of
//       its PMU counters plus the initial count recorded by initRasErrors().
// clear when true, the open PMU descriptors are closed, state.category and the
//       recorded initial counts are zeroed, and the counters are opened again.
// Returns ZE_RESULT_SUCCESS, or ZE_RESULT_ERROR_UNSUPPORTED_FEATURE when no PMU group
// could be opened or the PMU read fails.
ze_result_t LinuxRasSourceGt::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) {
    if (clear == true) {
        closeFds();
        totalEventCount = 0;
        memset(state.category, 0, maxRasErrorCategoryCount * sizeof(uint64_t));
        memset(initialErrorCount, 0, maxRasErrorCategoryCount * sizeof(uint64_t));
    }
    initRasErrors(clear); // no-op when the PMU group is already open
    if (groupFd < 0) {
        return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    }

    // In data[], event count starts from second index, first value gives number of events and second value is for timestamp
    std::vector<std::uint64_t> data(2 + totalEventCount, 0);
    if (pPmuInterface->pmuRead(static_cast<int>(groupFd), data.data(), sizeof(uint64_t) * data.size()) < 0) {
        return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    }

    // Walk the buffer category by category: each category owns as many consecutive
    // counter values as events were opened for it, in map iteration order.
    uint64_t initialIndex = 2; // Initial index in the buffer from which the data be parsed begins
    for (const auto &errorCat : errorCategoryToEventCount) {
        uint64_t errorCount = 0;
        for (uint64_t j = 0; j < errorCat.second; j++) {
            errorCount += data[initialIndex + j];
        }
        state.category[errorCat.first] = errorCount + initialErrorCount[errorCat.first];
        initialIndex += errorCat.second;
    }
    return ZE_RESULT_SUCCESS;
}
// Reads the PMU config string of one event file.
//
// eventDirectory       directory holding the event files.
// listOfEvents         names known to exist in that directory.
// errorFileToGetConfig name of the event file to read.
// pmuConfig            receives the file content.
// Returns ZE_RESULT_ERROR_UNKNOWN when the name is not in listOfEvents; otherwise the
// result of reading the file through FsAccess.
ze_result_t LinuxRasSourceGt::getPmuConfig(
    const std::string &eventDirectory,
    const std::vector<std::string> &listOfEvents,
    const std::string &errorFileToGetConfig,
    std::string &pmuConfig) {
    const bool isListed = std::find(listOfEvents.begin(), listOfEvents.end(), errorFileToGetConfig) != listOfEvents.end();
    if (!isListed) {
        return ZE_RESULT_ERROR_UNKNOWN;
    }
    const std::string configPath = eventDirectory + "/" + errorFileToGetConfig;
    return pFsAccess->read(configPath, pmuConfig);
}
// Reads the error count stored in the sysfs node that corresponds to a PMU event name.
//
// nameOfError     PMU event name; '-' is replaced by '_' to obtain the sysfs node name.
// errorCounterDir sysfs directory that contains the node.
// errorVal        receives the value read.
// Returns the result of the sysfs read.
ze_result_t LinuxRasSourceGt::getBootUpErrorCountFromSysfs(
    std::string nameOfError,
    const std::string &errorCounterDir,
    uint64_t &errorVal) {
    for (auto &ch : nameOfError) {
        if (ch == '-') {
            ch = '_';
        }
    }
    const std::string nodePath = errorCounterDir + "/" + nameOfError;
    return pSysfsAccess->read(nodePath, errorVal);
}
// Opens one PMU counter per supported RAS event of this source's error type and records,
// per category, how many counters were opened and the initial error count read from sysfs.
//
// clear when true the sysfs initial counts are not read (they stay 0).
//
// Does nothing when the PMU group is already open, when the events directory cannot be
// read, or when osRasErrorType is neither correctable nor uncorrectable.
// Events whose sysfs node or PMU config cannot be read, or whose PMU counter cannot be
// opened, are skipped so that the bookkeeping only reflects counters that are really
// part of the group read by osRasGetState().
void LinuxRasSourceGt::initRasErrors(ze_bool_t clear) {
    // if already initialized
    if (groupFd >= 0) {
        return;
    }

    std::string eventDirectory;
    std::vector<std::string> listOfEvents = {};
    ze_result_t result = readI915EventsDirectory(pLinuxSysmanImp, listOfEvents, &eventDirectory);
    if (result != ZE_RESULT_SUCCESS) {
        return;
    }

    // Point at the static table instead of copying it.
    const std::map<zes_ras_error_cat_t, std::vector<std::string>> *pCategoryToListOfEvents = nullptr;
    if (osRasErrorType == ZES_RAS_ERROR_TYPE_CORRECTABLE) {
        pCategoryToListOfEvents = &categoryToListOfEventsCorrectable;
    }
    if (osRasErrorType == ZES_RAS_ERROR_TYPE_UNCORRECTABLE) {
        pCategoryToListOfEvents = &categoryToListOfEventsUncorrectable;
    }
    if (pCategoryToListOfEvents == nullptr) {
        return;
    }

    std::string errorPrefix = "error--";                  // prefix string of the file containing config value for pmu counters
    std::string errorCounterDir = "gt/gt0/error_counter"; // Directory containing the sysfs nodes which in turn contains initial value of error count
    if (isSubdevice == true) {
        errorPrefix = "error-gt" + std::to_string(subdeviceId) + "--";
        errorCounterDir = "gt/gt" + std::to_string(subdeviceId) + "/error_counter";
    }

    // Following loop retrieves initial count of errors from sysfs and pmu config values for each ras error
    // PMU: error--<Name of error> Ex:- error--engine-reset (config with no subdevice)
    // PMU: error-gt<N>--<Name of error> Ex:- error-gt0--engine-reset (config with subdevices)
    // PMU: error--<Name of error> Ex:- error--driver-object-migration (config for device level errors)
    // Sysfs: card0/gt/gt0/error_counter/<Name of error> Ex:- gt/gt0/error_counter/engine_reset (sysfs with no subdevice)
    // Sysfs: card0/gt/gt<N>/error_counter/<Name of error> Ex:- gt/gt1/error_counter/engine_reset (sysfs with subdevices)
    // Sysfs: error_counter/<Name of error> Ex:- error_counter/driver_object_migration (sysfs for error which occur at device level)
    for (auto const &rasErrorCatToListOfEvents : *pCategoryToListOfEvents) {
        uint64_t eventCount = 0;
        uint64_t errorCount = 0;
        for (auto const &nameOfError : rasErrorCatToListOfEvents.second) {
            std::string errorPrefixLocal = errorPrefix;
            std::string errorCounterDirLocal = errorCounterDir;
            if (nameOfError == "driver-object-migration") { // check for errors which occur at device level
                errorCounterDirLocal = "error_counter";
                errorPrefixLocal = "error--";
            }

            uint64_t initialErrorVal = 0;
            if (clear == false) {
                result = getBootUpErrorCountFromSysfs(nameOfError, errorCounterDirLocal, initialErrorVal);
                if (result != ZE_RESULT_SUCCESS) {
                    continue;
                }
            }

            std::string pmuConfig;
            result = getPmuConfig(eventDirectory, listOfEvents, errorPrefixLocal + nameOfError, pmuConfig);
            if (result != ZE_RESULT_SUCCESS) {
                continue;
            }
            uint64_t config = convertHexToUint64(pmuConfig);

            if (groupFd == -1) {
                groupFd = pPmuInterface->pmuInterfaceOpen(config, -1, PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_GROUP); // To get file descriptor of the group leader
                if (groupFd < 0) {
                    groupFd = -1; // keep the "not open" sentinel canonical for later calls
                    return;
                }
            } else {
                // The rest of the group members are created with subsequent calls with groupFd being set to the file descriptor of the group leader
                const int64_t memberFd = pPmuInterface->pmuInterfaceOpen(config, static_cast<int>(groupFd), PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_GROUP);
                if (memberFd < 0) {
                    // A member that failed to open is not part of the group; counting it
                    // would shift the per-category indexing used when the group is read.
                    continue;
                }
                memberFds.push_back(memberFd);
            }
            eventCount++;
            errorCount += initialErrorVal;
        }
        initialErrorCount[rasErrorCatToListOfEvents.first] = errorCount;
        errorCategoryToEventCount[rasErrorCatToListOfEvents.first] = eventCount;
        totalEventCount += eventCount;
    }
}
// Stores the error type and sub-device identity and caches the PMU, FsAccess and
// SysfsAccess helpers obtained from the Linux sysman object.
LinuxRasSourceGt::LinuxRasSourceGt(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId)
    : pLinuxSysmanImp(pLinuxSysmanImp),
      osRasErrorType(type),
      pPmuInterface(pLinuxSysmanImp->getPmuInterface()),
      pFsAccess(&pLinuxSysmanImp->getFsAccess()),
      pSysfsAccess(&pLinuxSysmanImp->getSysfsAccess()),
      isSubdevice(onSubdevice),
      subdeviceId(subdeviceId) {
}
} // namespace Sysman
} // namespace L0

View File

@@ -0,0 +1,55 @@
/*
* Copyright (C) 2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/helpers/gfx_core_helper.h"
#include "level_zero/sysman/source/firmware_util/firmware_util.h"
#include "level_zero/sysman/source/linux/os_sysman_imp.h"
#include "level_zero/sysman/source/ras/linux/os_ras_imp_prelim.h"
namespace L0 {
namespace Sysman {
// Adds both the correctable and the uncorrectable error type to errorType when a
// firmware utility interface is available; otherwise leaves errorType unchanged.
// isSubDevice and subDeviceId are not used.
void LinuxRasSourceHbm::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_bool_t isSubDevice, uint32_t subDeviceId) {
    auto *linuxSysman = static_cast<LinuxSysmanImp *>(pOsSysman);
    if (linuxSysman->getFwUtilInterface() == nullptr) {
        return;
    }
    errorType.insert(ZES_RAS_ERROR_TYPE_CORRECTABLE);
    errorType.insert(ZES_RAS_ERROR_TYPE_UNCORRECTABLE);
}
// Reports the memory error count obtained from the firmware interface, relative to the
// stored baseline, in state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS].
//
// clear when true, the current firmware count is first stored as the new baseline.
// Returns ZE_RESULT_ERROR_UNSUPPORTED_FEATURE when no firmware interface is available,
// the firmware error code when a query fails, ZE_RESULT_SUCCESS otherwise.
ze_result_t LinuxRasSourceHbm::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) {
    if (pFwInterface == nullptr) {
        return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    }

    const uint32_t subDeviceCount = NEO::GfxCoreHelper::getSubDevicesCount(&pDevice->getHardwareInfo());

    if (clear == true) {
        uint64_t baselineCount = 0;
        const ze_result_t baselineResult = pFwInterface->fwGetMemoryErrorCount(osRasErrorType, subDeviceCount, subdeviceId, baselineCount);
        if (baselineResult != ZE_RESULT_SUCCESS) {
            return baselineResult;
        }
        errorBaseline = baselineCount; // during clear update the error baseline value
    }

    uint64_t currentCount = 0;
    const ze_result_t readResult = pFwInterface->fwGetMemoryErrorCount(osRasErrorType, subDeviceCount, subdeviceId, currentCount);
    if (readResult != ZE_RESULT_SUCCESS) {
        return readResult;
    }

    state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS] = currentCount - errorBaseline;
    return ZE_RESULT_SUCCESS;
}
// Stores the error type and sub-device id and caches the firmware utility interface
// and the sysman device obtained from the Linux sysman object.
LinuxRasSourceHbm::LinuxRasSourceHbm(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, uint32_t subdeviceId)
    : pLinuxSysmanImp(pLinuxSysmanImp),
      osRasErrorType(type),
      pFwInterface(pLinuxSysmanImp->getFwUtilInterface()),
      pDevice(pLinuxSysmanImp->getSysmanDeviceImp()),
      subdeviceId(subdeviceId) {
}
} // namespace Sysman
} // namespace L0

View File

@@ -0,0 +1,104 @@
/*
* Copyright (C) 2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "level_zero/sysman/source/ras/linux/os_ras_imp_prelim.h"
#include "shared/source/helpers/string.h"
#include "shared/source/os_interface/linux/system_info.h"
#include "level_zero/sysman/source/linux/os_sysman_imp.h"
#include "drm/intel_hwconfig_types.h"
namespace L0 {
namespace Sysman {
// True when the memory type reported by the sysman object is HBM2 or HBM2e.
static bool isMemoryTypeHbm(LinuxSysmanImp *pLinuxSysmanImp) {
    const uint32_t reportedType = pLinuxSysmanImp->getMemoryType();
    return reportedType == INTEL_HWCONFIG_MEMORY_TYPE_HBM2e || reportedType == INTEL_HWCONFIG_MEMORY_TYPE_HBM2;
}
// Collects the supported RAS error types: first from the GT source, then - only if
// fewer than two types were found and the memory type is HBM - from the HBM source.
void OsRas::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_bool_t isSubDevice, uint32_t subDeviceId) {
    constexpr auto maxErrorTypes = 2;
    LinuxRasSourceGt::getSupportedRasErrorTypes(errorType, pOsSysman, isSubDevice, subDeviceId);
    if (errorType.size() >= maxErrorTypes) {
        return; // the GT source already reported every error type
    }
    auto *linuxSysman = static_cast<LinuxSysmanImp *>(pOsSysman);
    if (isMemoryTypeHbm(linuxSysman) != true) {
        return;
    }
    LinuxRasSourceHbm::getSupportedRasErrorTypes(errorType, pOsSysman, isSubDevice, subDeviceId);
}
// Copies the thresholds cached in this object (total and per category) into *config.
// Always returns ZE_RESULT_SUCCESS.
ze_result_t LinuxRasImp::osRasGetConfig(zes_ras_config_t *config) {
    constexpr auto thresholdBytes = maxRasErrorCategoryCount * sizeof(uint64_t);
    config->totalThreshold = totalThreshold;
    memcpy_s(config->detailedThresholds.category, thresholdBytes, categoryThreshold, thresholdBytes);
    return ZE_RESULT_SUCCESS;
}
// Caches the thresholds from *config in this object.
// Only permitted when FsAccess reports a root user; otherwise nothing is stored and
// ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS is returned.
ze_result_t LinuxRasImp::osRasSetConfig(const zes_ras_config_t *config) {
    if (pFsAccess->isRootUser() != true) {
        return ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS;
    }
    constexpr auto thresholdBytes = maxRasErrorCategoryCount * sizeof(uint64_t);
    totalThreshold = config->totalThreshold;
    memcpy_s(categoryThreshold, thresholdBytes, config->detailedThresholds.category, thresholdBytes);
    return ZE_RESULT_SUCCESS;
}
// Fills properties with the error type and sub-device identity given at construction
// and clears pNext. Always returns ZE_RESULT_SUCCESS.
ze_result_t LinuxRasImp::osRasGetProperties(zes_ras_properties_t &properties) {
    properties.type = osRasErrorType;
    properties.subdeviceId = subdeviceId;
    properties.onSubdevice = isSubdevice;
    properties.pNext = nullptr;
    return ZE_RESULT_SUCCESS;
}
// Accumulates the per-category error counts of every RAS source into state.category.
//
// clear forwarded to each source; requires a root user, otherwise
//       ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS is returned before any source is queried.
// Sources that fail are skipped. Returns ZE_RESULT_SUCCESS when at least one source
// succeeded and ZE_RESULT_ERROR_UNSUPPORTED_FEATURE when none did.
// Counts are added to whatever state.category already holds.
ze_result_t LinuxRasImp::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) {
    if (clear == true && pFsAccess->isRootUser() == false) {
        return ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS;
    }

    bool anySourceSucceeded = false;
    for (auto &source : rasSources) {
        zes_ras_state_t sourceState = {};
        if (source->osRasGetState(sourceState, clear) == ZE_RESULT_SUCCESS) {
            for (uint32_t category = 0; category < maxRasErrorCategoryCount; category++) {
                state.category[category] += sourceState.category[category];
            }
            anySourceSucceeded = true;
        }
    }
    return anySourceSucceeded ? ZE_RESULT_SUCCESS : ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
// Creates the RAS sources for this handle: always a GT source, plus an HBM source
// when the memory type is HBM.
void LinuxRasImp::initSources() {
    rasSources.push_back(std::make_unique<L0::Sysman::LinuxRasSourceGt>(pLinuxSysmanImp, osRasErrorType, isSubdevice, subdeviceId));
    if (isMemoryTypeHbm(pLinuxSysmanImp) != true) {
        return;
    }
    rasSources.push_back(std::make_unique<L0::Sysman::LinuxRasSourceHbm>(pLinuxSysmanImp, osRasErrorType, subdeviceId));
}
// Stores the error type and sub-device identity, caches the Linux sysman object and
// its FsAccess helper, then creates the RAS sources for this handle.
LinuxRasImp::LinuxRasImp(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId) : osRasErrorType(type), isSubdevice(onSubdevice), subdeviceId(subdeviceId) {
    auto *linuxSysman = static_cast<LinuxSysmanImp *>(pOsSysman);
    pLinuxSysmanImp = linuxSysman;
    pFsAccess = &linuxSysman->getFsAccess();
    initSources();
}
// Factory: heap-allocates a LinuxRasImp for the given error type / sub-device and
// returns it through the OsRas interface. The object is created with new, so the
// caller receives a raw pointer to it.
OsRas *OsRas::create(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId) {
    return new LinuxRasImp(pOsSysman, type, onSubdevice, subdeviceId);
}
} // namespace Sysman
} // namespace L0

View File

@@ -0,0 +1,114 @@
/*
* Copyright (C) 2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#pragma once
#include "shared/source/helpers/non_copyable_or_moveable.h"
#include "level_zero/sysman/source/linux/fs_access.h"
#include "level_zero/sysman/source/linux/pmu/pmu_imp.h"
#include "level_zero/sysman/source/ras/os_ras.h"
#include "level_zero/sysman/source/sysman_const.h"
#include "level_zero/sysman/source/sysman_device_imp.h"
#include <map>
#include <memory>
#include <string>
#include <vector>
namespace L0 {
namespace Sysman {
class LinuxSysmanImp;
class FirmwareUtil;
class LinuxRasSources : NEO::NonCopyableOrMovableClass {
public:
virtual ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) = 0;
virtual ~LinuxRasSources() = default;
};
class LinuxRasImp : public OsRas, NEO::NonCopyableOrMovableClass {
public:
ze_result_t osRasGetProperties(zes_ras_properties_t &properties) override;
ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
ze_result_t osRasGetConfig(zes_ras_config_t *config) override;
ze_result_t osRasSetConfig(const zes_ras_config_t *config) override;
LinuxRasImp(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId);
LinuxRasImp() = default;
~LinuxRasImp() override = default;
protected:
zes_ras_error_type_t osRasErrorType = {};
FsAccess *pFsAccess = nullptr;
LinuxSysmanImp *pLinuxSysmanImp = nullptr;
std::vector<std::unique_ptr<L0::Sysman::LinuxRasSources>> rasSources = {};
private:
void initSources();
bool isSubdevice = false;
uint32_t subdeviceId = 0;
uint64_t totalThreshold = 0;
uint64_t categoryThreshold[maxRasErrorCategoryCount] = {0};
};
// RAS source that obtains error counts from i915 PMU event counters, combined with
// initial counts read from sysfs error_counter nodes.
class LinuxRasSourceGt : public LinuxRasSources {
  public:
    LinuxRasSourceGt() = default;
    LinuxRasSourceGt(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId);
    ~LinuxRasSourceGt() override;

    ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
    static void getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_bool_t isSubDevice, uint32_t subDeviceId);

  protected:
    LinuxSysmanImp *pLinuxSysmanImp = nullptr;
    zes_ras_error_type_t osRasErrorType = {};
    PmuInterface *pPmuInterface = nullptr;
    FsAccess *pFsAccess = nullptr;
    SysfsAccess *pSysfsAccess = nullptr;

  private:
    void closeFds();
    void initRasErrors(ze_bool_t clear);
    ze_result_t getBootUpErrorCountFromSysfs(
        std::string nameOfError,
        const std::string &errorCounterDir,
        uint64_t &errorVal);
    ze_result_t getPmuConfig(
        const std::string &eventDirectory,
        const std::vector<std::string> &listOfEvents,
        const std::string &errorFileToGetConfig,
        std::string &pmuConfig);

    int64_t groupFd = -1;                                              // PMU group leader descriptor, -1 when not open
    std::vector<int64_t> memberFds = {};                               // descriptors of the other group members
    uint64_t initialErrorCount[maxRasErrorCategoryCount] = {0};        // per-category count read from sysfs
    std::map<zes_ras_error_cat_t, uint64_t> errorCategoryToEventCount; // number of PMU events opened per category
    uint64_t totalEventCount = 0;                                      // sum of all per-category event counts
    bool isSubdevice = false;
    uint32_t subdeviceId = 0;
};
// RAS source that obtains memory error counts through the firmware utility interface.
class LinuxRasSourceHbm : public LinuxRasSources {
  public:
    LinuxRasSourceHbm() = default;
    LinuxRasSourceHbm(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, uint32_t subdeviceId);
    ~LinuxRasSourceHbm() override = default;

    ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
    static void getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_bool_t isSubDevice, uint32_t subDeviceId);

  protected:
    LinuxSysmanImp *pLinuxSysmanImp = nullptr;
    zes_ras_error_type_t osRasErrorType = {};
    FirmwareUtil *pFwInterface = nullptr; // null when no firmware interface is available
    SysmanDeviceImp *pDevice = nullptr;

  private:
    uint64_t errorBaseline = 0; // firmware count captured at the last clear
    uint32_t subdeviceId = 0;
};
} // namespace Sysman
} // namespace L0