Add prelim support for ras diagnostics and firmware

Related-To: LOCI-2864

Signed-off-by: Bellekallu Rajkiran <bellekallu.rajkiran@intel.com>
This commit is contained in:
Bellekallu Rajkiran
2022-02-21 05:28:33 +00:00
committed by Compute-Runtime-Automation
parent 922a224cc9
commit 5a2145ad8d
13 changed files with 1093 additions and 20 deletions

21
level_zero/tools/source/sysman/ras/linux/CMakeLists.txt Executable file → Normal file
View File

@@ -1,14 +1,28 @@
#
# Copyright (C) 2020-2021 Intel Corporation
# Copyright (C) 2020-2022 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
# Source list for the Linux sysman RAS backend. It is published to the parent
# scope through the L0_SRCS_TOOLS_SYSMAN_RAS_LINUX global property further below.
set(L0_SRCS_TOOLS_SYSMAN_RAS_LINUX
${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}os_ras_imp.cpp
${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}os_ras_imp.h
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
)
# Select the implementation: with NEO_ENABLE_i915_PRELIM_DETECTION the prelim
# sources are appended (shared header plus the gt, fabric and hbm error sources),
# otherwise the default os_ras_imp sources are appended.
if(NEO_ENABLE_i915_PRELIM_DETECTION)
list(APPEND L0_SRCS_TOOLS_SYSMAN_RAS_LINUX
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_prelim.cpp
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_prelim.h
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_gt.cpp
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_fabric.cpp
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_hbm.cpp
)
else()
list(APPEND L0_SRCS_TOOLS_SYSMAN_RAS_LINUX
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp.cpp
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp.h
)
endif()
if(UNIX)
target_sources(${L0_STATIC_LIB_NAME}
PRIVATE
@@ -18,5 +32,4 @@ endif()
# Make our source files visible to parent
set_property(GLOBAL PROPERTY L0_SRCS_TOOLS_SYSMAN_RAS_LINUX ${L0_SRCS_TOOLS_SYSMAN_RAS_LINUX})
add_subdirectories()

View File

@@ -0,0 +1,105 @@
/*
* Copyright (C) 2021-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h"
#include "sysman/linux/fs_access.h"
#include "sysman/linux/os_sysman_imp.h"
#include <regex>
namespace L0 {
// Collects the readable IAF (fabric) error-counter nodes of one sub-device.
//
// nodes       - output; cleared first, then filled with every candidate node that canRead() accepts
// subdeviceId - index used to build the "sd.<id>" path component
// fsAccess    - filesystem accessor used for the existence and readability checks
// type        - selects the correctable or the uncorrectable set of counter files
//
// Every board strap number in [0, 31] whose iaf.<N> directory exists is probed.
void LinuxRasSourceFabric::getNodes(std::vector<std::string> &nodes, uint32_t subdeviceId, FsAccess *fsAccess, const zes_ras_error_type_t &type) {
    constexpr uint32_t firstBoardStrap = 0;
    constexpr uint32_t lastBoardStrap = 31;
    constexpr uint32_t firstPort = 1;
    constexpr uint32_t lastPort = 8;

    const std::string iafDevicePrefix = "/sys/module/iaf/drivers/platform:iaf/iaf.";
    const bool wantCorrectable = (type == ZES_RAS_ERROR_TYPE_CORRECTABLE);
    // Per-port counter file name; it depends only on the requested error type.
    const char *perPortCounter = wantCorrectable ? "/link_degrades" : "/link_failures";

    nodes.clear();
    for (uint32_t strap = firstBoardStrap; strap <= lastBoardStrap; ++strap) {
        const std::string boardDir = iafDevicePrefix + std::to_string(strap);
        if (fsAccess->directoryExists(boardDir)) {
            const std::string subDeviceDir = boardDir + "/sd." + std::to_string(subdeviceId);

            // Sub-device level counters first, then one counter per port.
            std::vector<std::string> candidates;
            if (wantCorrectable) {
                candidates.push_back(subDeviceDir + "/fw_comm_errors");
            } else {
                candidates.push_back(subDeviceDir + "/sd_failure");
                candidates.push_back(subDeviceDir + "/fw_error");
            }
            for (uint32_t port = firstPort; port <= lastPort; ++port) {
                candidates.push_back(subDeviceDir + "/port." + std::to_string(port) + perPortCounter);
            }

            // Keep only the nodes that can actually be read.
            for (const auto &candidate : candidates) {
                if (fsAccess->canRead(candidate) == ZE_RESULT_SUCCESS) {
                    nodes.push_back(candidate);
                }
            }
        }
    }
}
// Adds to errorType every RAS error type for which at least one readable fabric
// error node exists on the (sub)device behind deviceHandle.
// The sub-device index is 0 when the device is not a sub-device.
// Always returns ZE_RESULT_SUCCESS.
ze_result_t LinuxRasSourceFabric::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType,
                                                            OsSysman *pOsSysman, ze_device_handle_t deviceHandle) {
    auto *pLinuxSysmanImp = static_cast<LinuxSysmanImp *>(pOsSysman);
    NEO::Device *neoDevice = static_cast<Device *>(deviceHandle)->getNEODevice();

    uint32_t subDeviceIndex = 0;
    if (neoDevice->isSubDevice()) {
        subDeviceIndex = static_cast<NEO::SubDevice *>(neoDevice)->getSubDeviceIndex();
    }

    // Probe uncorrectable first, then correctable; getNodes() clears the list on every call.
    const zes_ras_error_type_t probedTypes[] = {ZES_RAS_ERROR_TYPE_UNCORRECTABLE, ZES_RAS_ERROR_TYPE_CORRECTABLE};
    std::vector<std::string> nodes;
    for (const auto probedType : probedTypes) {
        getNodes(nodes, subDeviceIndex, &pLinuxSysmanImp->getFsAccess(), probedType);
        if (!nodes.empty()) {
            errorType.insert(probedType);
        }
    }
    return ZE_RESULT_SUCCESS;
}
// Stores the filesystem accessor of the given sysman instance and caches the list of
// readable fabric error nodes for the requested error type and sub-device.
LinuxRasSourceFabric::LinuxRasSourceFabric(OsSysman *pOsSysman, zes_ras_error_type_t type, uint32_t subDeviceId)
    : fsAccess(&static_cast<LinuxSysmanImp *>(pOsSysman)->getFsAccess()) {
    getNodes(errorNodes, subDeviceId, fsAccess, type);
}
// Returns the sum of the values currently read from all cached error nodes.
// The result of each read is not checked; a node whose read leaves the value
// untouched contributes 0.
uint64_t LinuxRasSourceFabric::getComputeErrorCount() {
    uint64_t total = 0;
    for (const auto &nodePath : errorNodes) {
        uint64_t nodeValue = 0;
        fsAccess->read(nodePath, nodeValue);
        total += nodeValue;
    }
    return total;
}
// Reports the fabric error count accumulated since the last baseline in the
// ZES_RAS_ERROR_CAT_COMPUTE_ERRORS slot of state; all other slots are zeroed.
// When clear is set the baseline is moved to the current count before reporting.
// Returns ZE_RESULT_ERROR_UNSUPPORTED_FEATURE when no error node is available.
ze_result_t LinuxRasSourceFabric::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) {
    if (errorNodes.empty()) {
        return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    }

    std::memset(state.category, 0, sizeof(zes_ras_state_t::category));

    uint64_t latestCount = getComputeErrorCount();
    if (clear) {
        // Re-base, then sample again so the reported value is relative to the new baseline.
        baseComputeErrorCount = latestCount;
        latestCount = getComputeErrorCount();
    }

    const uint64_t sinceBaseline = latestCount - baseComputeErrorCount;
    state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS] = sinceBaseline;
    return ZE_RESULT_SUCCESS;
}
} // namespace L0

View File

@@ -0,0 +1,345 @@
/*
* Copyright (C) 2021-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h"
#include "sysman/linux/os_sysman_imp.h"
#include <regex>
namespace L0 {
// Lookup table: RAS error category -> names of the events accounted to that category
// when the source handles ZES_RAS_ERROR_TYPE_UNCORRECTABLE.
// LinuxRasSourceGt::initRasErrors() prepends "error--" or "error-gt<N>--" to each name to
// find the PMU config file, and replaces '-' with '_' to find the matching sysfs counter.
static const std::map<zes_ras_error_cat_t, std::vector<std::string>> categoryToListOfEventsUncorrectable = {
{ZES_RAS_ERROR_CAT_CACHE_ERRORS,
{"fatal-array-bist", "fatal-eu-grf", "fatal-eu-ic",
"fatal-guc", "fatal-idi-parity", "fatal-l3-double",
"fatal-l3-ecc-checker", "fatal-sampler", "fatal-slm",
"fatal-sqidi", "fatal-tlb"}},
{ZES_RAS_ERROR_CAT_RESET,
{"engine-reset"}},
{ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS,
{"eu-attention"}},
{ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS,
{"soc-fatal-fabric-ss0-0", "soc-fatal-fabric-ss0-1", "soc-fatal-fabric-ss0-2",
"soc-fatal-fabric-ss0-3", "soc-fatal-fabric-ss1-0", "soc-fatal-fabric-ss1-1",
"soc-fatal-fabric-ss1-2", "soc-fatal-fabric-ss1-3", "soc-fatal-fabric-ss1-4",
"soc-fatal-hbm-ss0-0", "soc-fatal-hbm-ss0-1", "soc-fatal-hbm-ss0-2",
"soc-fatal-hbm-ss0-3", "soc-fatal-hbm-ss0-4", "soc-fatal-hbm-ss0-5",
"soc-fatal-hbm-ss0-6", "soc-fatal-hbm-ss0-7", "soc-fatal-hbm-ss0-8",
"soc-fatal-hbm-ss0-9", "soc-fatal-hbm-ss0-10", "soc-fatal-hbm-ss0-11",
"soc-fatal-hbm-ss0-12", "soc-fatal-hbm-ss0-13", "soc-fatal-hbm-ss0-14",
"soc-fatal-hbm-ss0-15", "soc-fatal-hbm-ss1-0", "soc-fatal-hbm-ss1-1",
"soc-fatal-hbm-ss1-2", "soc-fatal-hbm-ss1-3", "soc-fatal-hbm-ss1-4",
"soc-fatal-hbm-ss1-5", "soc-fatal-hbm-ss1-6", "soc-fatal-hbm-ss1-7",
"soc-fatal-hbm-ss1-8", "soc-fatal-hbm-ss1-9", "soc-fatal-hbm-ss1-10",
"soc-fatal-hbm-ss1-11", "soc-fatal-hbm-ss1-12", "soc-fatal-hbm-ss1-13",
"soc-fatal-hbm-ss1-14", "soc-fatal-hbm-ss1-15", "soc-fatal-mdfi-east",
"soc-fatal-mdfi-south", "soc-fatal-mdfi-west", "soc-fatal-psf-csc-0",
"soc-fatal-psf-csc-1", "soc-fatal-psf-csc-2", "soc-fatal-punit",
"sgunit-fatal", "soc-nonfatal-fabric-ss0-0", "soc-nonfatal-fabric-ss0-1",
"soc-nonfatal-fabric-ss0-2", "soc-nonfatal-fabric-ss0-3", "soc-nonfatal-fabric-ss1-0",
"soc-nonfatal-fabric-ss1-1", "soc-nonfatal-fabric-ss1-2", "soc-nonfatal-fabric-ss1-3",
"soc-nonfatal-fabric-ss1-4", "soc-nonfatal-hbm-ss0-0", "soc-nonfatal-hbm-ss0-1",
"soc-nonfatal-hbm-ss0-2", "soc-nonfatal-hbm-ss0-3", "soc-nonfatal-hbm-ss0-4",
"soc-nonfatal-hbm-ss0-5", "soc-nonfatal-hbm-ss0-6", "soc-nonfatal-hbm-ss0-7",
"soc-nonfatal-hbm-ss0-8", "soc-nonfatal-hbm-ss0-9", "soc-nonfatal-hbm-ss0-10",
"soc-nonfatal-hbm-ss0-11", "soc-nonfatal-hbm-ss0-12", "soc-nonfatal-hbm-ss0-13",
"soc-nonfatal-hbm-ss0-14", "soc-nonfatal-hbm-ss0-15", "soc-nonfatal-hbm-ss1-0",
"soc-nonfatal-hbm-ss1-1", "soc-nonfatal-hbm-ss1-2", "soc-nonfatal-hbm-ss1-3",
"soc-nonfatal-hbm-ss1-4", "soc-nonfatal-hbm-ss1-5", "soc-nonfatal-hbm-ss1-6",
"soc-nonfatal-hbm-ss1-7", "soc-nonfatal-hbm-ss1-8", "soc-nonfatal-hbm-ss1-9",
"soc-nonfatal-hbm-ss1-10", "soc-nonfatal-hbm-ss1-11", "soc-nonfatal-hbm-ss1-12",
"soc-nonfatal-hbm-ss1-13", "soc-nonfatal-hbm-ss1-14", "soc-nonfatal-hbm-ss1-15",
"soc-nonfatal-mdfi-east", "soc-nonfatal-mdfi-south", "soc-nonfatal-mdfi-west",
"soc-nonfatal-psf-csc-0", "soc-nonfatal-psf-csc-1", "soc-nonfatal-psf-csc-2",
"soc-nonfatal-punit", "sgunit-nonfatal"}},
{ZES_RAS_ERROR_CAT_COMPUTE_ERRORS,
{"fatal-fpu", "fatal-l3-fabric"}},
{ZES_RAS_ERROR_CAT_DRIVER_ERRORS,
{"driver-object-migration", "driver-engine-other", "driver-ggtt",
"driver-gt-interrupt", "driver-gt-other", "driver-guc-communication",
"driver-rps"}}};
// Lookup table: RAS error category -> names of the events accounted to that category
// when the source handles ZES_RAS_ERROR_TYPE_CORRECTABLE.
// Only the cache and non-compute categories have correctable events listed here.
static const std::map<zes_ras_error_cat_t, std::vector<std::string>> categoryToListOfEventsCorrectable = {
{ZES_RAS_ERROR_CAT_CACHE_ERRORS,
{"correctable-eu-grf", "correctable-eu-ic", "correctable-guc",
"correctable-l3-sng", "correctable-sampler", "correctable-slm"}},
{ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS,
{"soc-correctable-fabric-ss0-0", "soc-correctable-fabric-ss0-1", "soc-correctable-fabric-ss0-2",
"soc-correctable-fabric-ss0-3", "soc-correctable-fabric-ss1-0", "soc-correctable-fabric-ss1-1",
"soc-correctable-fabric-ss1-2", "soc-correctable-fabric-ss1-3", "soc-correctable-fabric-ss1-4",
"soc-correctable-hbm-ss0-0", "soc-correctable-hbm-ss0-1", "soc-correctable-hbm-ss0-2",
"soc-correctable-hbm-ss0-3", "soc-correctable-hbm-ss0-4", "soc-correctable-hbm-ss0-5",
"soc-correctable-hbm-ss0-6", "soc-correctable-hbm-ss0-7", "soc-correctable-hbm-ss0-8",
"soc-correctable-hbm-ss0-9", "soc-correctable-hbm-ss0-10", "soc-correctable-hbm-ss0-11",
"soc-correctable-hbm-ss0-12", "soc-correctable-hbm-ss0-13", "soc-correctable-hbm-ss0-14",
"soc-correctable-hbm-ss0-15", "soc-correctable-hbm-ss1-0", "soc-correctable-hbm-ss1-1",
"soc-correctable-hbm-ss1-2", "soc-correctable-hbm-ss1-3", "soc-correctable-hbm-ss1-4",
"soc-correctable-hbm-ss1-5", "soc-correctable-hbm-ss1-6", "soc-correctable-hbm-ss1-7",
"soc-correctable-hbm-ss1-8", "soc-correctable-hbm-ss1-9", "soc-correctable-hbm-ss1-10",
"soc-correctable-hbm-ss1-11", "soc-correctable-hbm-ss1-12", "soc-correctable-hbm-ss1-13",
"soc-correctable-hbm-ss1-14", "soc-correctable-hbm-ss1-15", "soc-correctable-mdfi-east",
"soc-correctable-mdfi-south", "soc-correctable-mdfi-west", "soc-correctable-psf-csc-0",
"soc-correctable-psf-csc-1", "soc-correctable-punit", "sgunit-correctable"}}};
// Closes the descriptor held in fd and marks it as closed by storing -1.
// A value of -1 means "nothing to close" and is left untouched.
static void closeFd(int64_t &fd) {
    if (fd == -1) {
        return;
    }
    close(static_cast<int>(fd));
    fd = -1;
}
// Lists the i915 events directory that belongs to this device.
//
// The directory is /sys/devices/i915_<bdf>/events, where <bdf> is the last path component
// of the "device" symlink with ':' replaced by '_'. For a device in PCI slot 0000:01:00.0
// that is /sys/devices/i915_0000_01_00.0/events.
//
// listOfEvents   - receives the directory entries
// eventDirectory - optional; when non-null it receives the directory path, even if
//                  listing the directory fails afterwards
// Returns ZE_RESULT_SUCCESS, or ZE_RESULT_ERROR_UNSUPPORTED_FEATURE when the symlink
// cannot be read or the directory cannot be listed.
static ze_result_t readI915EventsDirectory(LinuxSysmanImp *pLinuxSysmanImp, std::vector<std::string> &listOfEvents, std::string *eventDirectory) {
    const std::string deviceLinkName("device");
    const std::string sysDevicesRoot("/sys/devices/");

    SysfsAccess *pSysfsAccess = &pLinuxSysmanImp->getSysfsAccess();
    std::string deviceLinkTarget;
    if (pSysfsAccess->readSymLink(deviceLinkName, deviceLinkTarget) != ZE_RESULT_SUCCESS) {
        return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    }

    // Keep only the component after the last '/', then turn "0000:01:00.0" into "0000_01_00.0".
    std::string pciAddress = deviceLinkTarget.substr(deviceLinkTarget.find_last_of('/') + 1);
    std::replace(pciAddress.begin(), pciAddress.end(), ':', '_');

    const std::string eventsPath = sysDevicesRoot + ("i915_" + pciAddress) + "/" + "events";
    if (eventDirectory != nullptr) {
        *eventDirectory = eventsPath;
    }

    FsAccess *pFsAccess = &pLinuxSysmanImp->getFsAccess();
    const ze_result_t listResult = pFsAccess->listDirectory(eventsPath, listOfEvents);
    return (listResult == ZE_RESULT_SUCCESS) ? ZE_RESULT_SUCCESS : ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
// Parses the hexadecimal number that follows the first '=' in strVal
// (or the whole string when there is no '='). Returns 0 when nothing can be parsed.
static uint64_t convertHexToUint64(std::string strVal) {
    const auto separator = strVal.find('=');
    // npos + 1 wraps around to 0, so a string without '=' is parsed from its start.
    std::stringstream hexStream(strVal.substr(separator + 1));
    uint64_t parsed = 0;
    hexStream >> std::hex >> parsed;
    return parsed;
}
// Returns true when at least one entry of eventList fully matches the regular
// expression given in pattern.
static bool isErrorTypeSupported(std::string pattern, std::vector<std::string> &eventList) {
    const std::regex matcher(pattern);
    return std::any_of(eventList.cbegin(), eventList.cend(),
                       [&matcher](const std::string &eventName) { return std::regex_match(eventName, matcher); });
}
// Returns true when any pattern in errorPattern matches at least one entry of eventList.
static bool getErrorType(std::vector<std::string> errorPattern, std::vector<std::string> &eventList) {
    return std::any_of(errorPattern.cbegin(), errorPattern.cend(),
                       [&eventList](const std::string &pattern) { return isErrorTypeSupported(pattern, eventList); });
}
// Closes every group-member descriptor, forgets them, and finally closes the
// group-leader descriptor, which is left at -1.
void LinuxRasSourceGt::closeFds() {
    std::for_each(memberFds.begin(), memberFds.end(), closeFd);
    memberFds.clear();
    closeFd(groupFd);
}

// Releases all PMU descriptors still held by this source.
LinuxRasSourceGt::~LinuxRasSourceGt() {
    closeFds();
}
void LinuxRasSourceGt::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle) {
LinuxSysmanImp *pLinuxSysmanImp = static_cast<LinuxSysmanImp *>(pOsSysman);
std::vector<std::string> listOfEvents = {};
ze_result_t result = readI915EventsDirectory(pLinuxSysmanImp, listOfEvents, nullptr);
if (result != ZE_RESULT_SUCCESS) {
return;
}
ze_device_properties_t deviceProperties = {ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES};
Device::fromHandle(deviceHandle)->getProperties(&deviceProperties);
bool onSubDevice = deviceProperties.flags & ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE;
uint32_t subDeviceId = deviceProperties.subdeviceId;
std::vector<std::string> uncorrectablePattern;
std::vector<std::string> correctablePattern;
// For device with no subDevice error entries are of form error--<Name of error type>
// and for device having subDevice error entries are of form error-gt<N>--<Name of error type>
uncorrectablePattern.push_back("^error--driver.*");
if (onSubDevice == false) {
correctablePattern.push_back("^error--correctable.*");
correctablePattern.push_back("^error--soc-correctable.*");
uncorrectablePattern.push_back("^error--engine-reset.*");
uncorrectablePattern.push_back("^error--eu-attention.*");
uncorrectablePattern.push_back("^error--fatal.*");
uncorrectablePattern.push_back("^error--soc-fatal.*");
uncorrectablePattern.push_back("^error--soc-nonfatal.*");
} else {
correctablePattern.push_back("^error-gt" + std::to_string(subDeviceId) + "--correctable.*");
correctablePattern.push_back("^error-gt" + std::to_string(subDeviceId) + "--soc-correctable.*");
uncorrectablePattern.push_back("^error-gt" + std::to_string(subDeviceId) + "--driver.*");
uncorrectablePattern.push_back("^error-gt" + std::to_string(subDeviceId) + "--fatal.*");
uncorrectablePattern.push_back("^error-gt" + std::to_string(subDeviceId) + "--soc-fatal.*");
uncorrectablePattern.push_back("^error-gt" + std::to_string(subDeviceId) + "--soc-nonfatal.*");
uncorrectablePattern.push_back("^error-gt" + std::to_string(subDeviceId) + "--eu-attention.*");
uncorrectablePattern.push_back("^error-gt" + std::to_string(subDeviceId) + "--engine-reset.*");
}
if (getErrorType(correctablePattern, listOfEvents) == true) {
errorType.insert(ZES_RAS_ERROR_TYPE_CORRECTABLE);
}
if (getErrorType(uncorrectablePattern, listOfEvents) == true) {
errorType.insert(ZES_RAS_ERROR_TYPE_UNCORRECTABLE);
}
}
// Reads the PMU counter group of this source and reports one value per error category.
//
// state - receives, for every category tracked in errorCategoryToEventCount, the sum of
//         that category's PMU counters plus its initial error count. It is zeroed only
//         when clear is set; otherwise untracked categories keep the caller's values.
// clear - when set, all PMU descriptors are closed, the bookkeeping is reset and the
//         counters are re-opened, so counting restarts.
// Returns ZE_RESULT_ERROR_UNSUPPORTED_FEATURE when no counter group could be opened or
// the group cannot be read, ZE_RESULT_SUCCESS otherwise.
ze_result_t LinuxRasSourceGt::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) {
    if (clear == true) {
        closeFds();
        totalEventCount = 0;
        memset(state.category, 0, sizeof(state.category));
        memset(initialErrorCount, 0, sizeof(initialErrorCount));
    }
    // No-op when the counter group is already open.
    initRasErrors(clear);
    if (groupFd < 0) {
        return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    }

    // Layout of the buffer: data[0] is the number of events, data[1] is the timestamp,
    // and the event counters start at data[2].
    std::vector<std::uint64_t> data(2 + totalEventCount, 0);
    if (pPmuInterface->pmuRead(static_cast<int>(groupFd), data.data(), sizeof(uint64_t) * data.size()) < 0) {
        return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    }

    // The counters are laid out category by category, in the iteration order of
    // errorCategoryToEventCount; each category owns as many consecutive slots as it has events.
    uint64_t initialIndex = 2;
    for (const auto &categoryAndEventCount : errorCategoryToEventCount) {
        const auto category = categoryAndEventCount.first;
        const uint64_t eventsInCategory = categoryAndEventCount.second;
        uint64_t errorCount = 0;
        for (uint64_t j = 0; j < eventsInCategory; j++) {
            errorCount += data[initialIndex + j];
        }
        state.category[category] = errorCount + initialErrorCount[category];
        initialIndex += eventsInCategory;
    }
    return ZE_RESULT_SUCCESS;
}
// Reads the PMU config string of one event file.
//
// eventDirectory       - directory that holds the event files
// listOfEvents         - entries known to exist in eventDirectory
// errorFileToGetConfig - name of the event file to read
// pmuConfig            - receives the file content
// Returns ZE_RESULT_ERROR_UNKNOWN when the file is not in listOfEvents, otherwise the
// result of the read.
ze_result_t LinuxRasSourceGt::getPmuConfig(
    const std::string &eventDirectory,
    const std::vector<std::string> &listOfEvents,
    const std::string &errorFileToGetConfig,
    std::string &pmuConfig) {
    const bool isListed = std::find(listOfEvents.begin(), listOfEvents.end(), errorFileToGetConfig) != listOfEvents.end();
    if (!isListed) {
        return ZE_RESULT_ERROR_UNKNOWN;
    }
    const std::string configPath = eventDirectory + "/" + errorFileToGetConfig;
    return pFsAccess->read(configPath, pmuConfig);
}
// Reads the initial value of one error counter from sysfs.
//
// nameOfError     - event name as used for the PMU config node; every '-' is replaced
//                   with '_' to obtain the sysfs node name
// errorCounterDir - directory that holds the counter nodes
// errorVal        - receives the counter value
// Returns the result of the sysfs read.
ze_result_t LinuxRasSourceGt::getBootUpErrorCountFromSysfs(
    std::string nameOfError,
    const std::string &errorCounterDir,
    uint64_t &errorVal) {
    for (char &character : nameOfError) {
        if (character == '-') {
            character = '_';
        }
    }
    const std::string counterNode = errorCounterDir + "/" + nameOfError;
    return pSysfsAccess->read(counterNode, errorVal);
}
// Opens one PMU counter per supported error event of this source's error type and
// records, per error category, how many events were opened and their initial counts.
//
// clear - when false, the initial value of every event is read from sysfs and added to
//         initialErrorCount; an event whose sysfs read fails is skipped entirely.
//         When true, the sysfs read is skipped and the initial values stay 0.
//
// The first event opened becomes the group leader (groupFd); all later events are opened
// as members of that group. The function returns early, leaving the bookkeeping of the
// current and remaining categories untouched, if the group leader cannot be opened.
// NOTE(review): member descriptors are stored and counted without checking whether
// pmuInterfaceOpen() failed - confirm that a failed member open cannot desynchronise the
// counter layout consumed by osRasGetState().
void LinuxRasSourceGt::initRasErrors(ze_bool_t clear) {
// if already initialized
if (groupFd >= 0) {
return;
}
std::string eventDirectory;
std::vector<std::string> listOfEvents = {};
ze_result_t result = readI915EventsDirectory(pLinuxSysmanImp, listOfEvents, &eventDirectory);
if (result != ZE_RESULT_SUCCESS) {
return;
}
// Pick the category -> event-name table that matches this source's error type
std::map<zes_ras_error_cat_t, std::vector<std::string>> categoryToListOfEvents;
if (osRasErrorType == ZES_RAS_ERROR_TYPE_CORRECTABLE) {
categoryToListOfEvents = categoryToListOfEventsCorrectable;
}
if (osRasErrorType == ZES_RAS_ERROR_TYPE_UNCORRECTABLE) {
categoryToListOfEvents = categoryToListOfEventsUncorrectable;
}
std::string errorPrefix = "error--"; // prefix string of the file containing config value for pmu counters
std::string errorCounterDir = "gt/gt0/error_counter"; // Directory containing the sysfs nodes which in turn contains initial value of error count
if (isSubdevice == true) {
errorPrefix = "error-gt" + std::to_string(subdeviceId) + "--";
errorCounterDir = "gt/gt" + std::to_string(subdeviceId) + "/error_counter";
}
// Following loop retrieves initial count of errors from sysfs and pmu config values for each ras error
// PMU: error--<Name of error> Ex:- error--engine-reset (config with no subdevice)
// PMU: error-gt<N>--<Name of error> Ex:- error-gt0--engine-reset (config with subdevices)
// PMU: error--<Name of error> Ex:- error--driver-object-migration (config for device level errors)
// Sysfs: card0/gt/gt0/error_counter/<Name of error> Ex:- gt/gt0/error_counter/engine_reset (sysfs with no subdevice)
// Sysfs: card0/gt/gt<N>/error_counter/<Name of error> Ex:- gt/gt1/error_counter/engine_reset (sysfs with subdevices)
// Sysfs: error_counter/<Name of error> Ex:- error_counter/driver_object_migration (sysfs for error which occur at device level)
for (auto const &rasErrorCatToListOfEvents : categoryToListOfEvents) {
uint64_t eventCount = 0;
uint64_t errorCount = 0;
for (auto const &nameOfError : rasErrorCatToListOfEvents.second) {
std::string errorPrefixLocal = errorPrefix;
std::string errorCounterDirLocal = errorCounterDir;
if (nameOfError == "driver-object-migration") { // check for errors which occur at device level
errorCounterDirLocal = "error_counter";
errorPrefixLocal = "error--";
}
uint64_t initialErrorVal = 0;
if (clear == false) {
result = getBootUpErrorCountFromSysfs(nameOfError, errorCounterDirLocal, initialErrorVal);
if (result != ZE_RESULT_SUCCESS) {
continue;
}
}
std::string pmuConfig;
result = getPmuConfig(eventDirectory, listOfEvents, errorPrefixLocal + nameOfError, pmuConfig);
if (result != ZE_RESULT_SUCCESS) {
continue;
}
uint64_t config = convertHexToUint64(pmuConfig);
if (groupFd == -1) {
groupFd = pPmuInterface->pmuInterfaceOpen(config, -1, PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_GROUP); // To get file descriptor of the group leader
if (groupFd < 0) {
return;
}
} else {
// The rest of the group members are created with subsequent calls with groupFd being set to the file descriptor of the group leader
memberFds.push_back(pPmuInterface->pmuInterfaceOpen(config, static_cast<int>(groupFd), PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_GROUP));
}
eventCount++;
errorCount += initialErrorVal;
}
// Per-category bookkeeping consumed by osRasGetState()
initialErrorCount[rasErrorCatToListOfEvents.first] = errorCount;
errorCategoryToEventCount[rasErrorCatToListOfEvents.first] = eventCount;
totalEventCount += eventCount;
}
}
// Binds the source to a sysman instance and caches its PMU, filesystem and sysfs
// accessors. No PMU counter is opened here.
LinuxRasSourceGt::LinuxRasSourceGt(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId)
    : pLinuxSysmanImp(pLinuxSysmanImp),
      osRasErrorType(type),
      pPmuInterface(pLinuxSysmanImp->getPmuInterface()),
      pFsAccess(&pLinuxSysmanImp->getFsAccess()),
      pSysfsAccess(&pLinuxSysmanImp->getSysfsAccess()),
      isSubdevice(onSubdevice),
      subdeviceId(subdeviceId) {
}
} // namespace L0

View File

@@ -0,0 +1,51 @@
/*
* Copyright (C) 2021-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h"
#include "sysman/linux/os_sysman_imp.h"
namespace L0 {
void LinuxRasSourceHbm::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle) {
LinuxSysmanImp *pLinuxSysmanImp = static_cast<LinuxSysmanImp *>(pOsSysman);
FirmwareUtil *pFwInterface = pLinuxSysmanImp->getFwUtilInterface();
if (pFwInterface != nullptr) {
errorType.insert(ZES_RAS_ERROR_TYPE_CORRECTABLE);
errorType.insert(ZES_RAS_ERROR_TYPE_UNCORRECTABLE);
}
}
// Reports the memory error count obtained from the firmware interface, relative to the
// stored baseline, in the ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS slot of state.
// When clear is set the baseline is first moved to the current count.
// Returns ZE_RESULT_ERROR_UNSUPPORTED_FEATURE without a firmware interface, otherwise
// the first failing firmware query result or ZE_RESULT_SUCCESS.
ze_result_t LinuxRasSourceHbm::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) {
    if (pFwInterface == nullptr) {
        return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    }

    uint32_t subDeviceCount = 0;
    pDevice->getSubDevices(&subDeviceCount, nullptr);

    uint64_t latestCount = 0;
    ze_result_t status = ZE_RESULT_SUCCESS;
    if (clear == true) {
        status = pFwInterface->fwGetMemoryErrorCount(osRasErrorType, subDeviceCount, subdeviceId, latestCount);
        if (status != ZE_RESULT_SUCCESS) {
            return status;
        }
        errorBaseline = latestCount; // during clear update the error baseline value
        latestCount = 0;
    }

    status = pFwInterface->fwGetMemoryErrorCount(osRasErrorType, subDeviceCount, subdeviceId, latestCount);
    if (status != ZE_RESULT_SUCCESS) {
        return status;
    }
    state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS] = latestCount - errorBaseline;
    return ZE_RESULT_SUCCESS;
}
// Binds the source to a sysman instance and caches its firmware utility interface and
// device handle.
LinuxRasSourceHbm::LinuxRasSourceHbm(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, uint32_t subdeviceId)
    : pLinuxSysmanImp(pLinuxSysmanImp),
      osRasErrorType(type),
      pFwInterface(pLinuxSysmanImp->getFwUtilInterface()),
      pDevice(pLinuxSysmanImp->getDeviceHandle()),
      subdeviceId(subdeviceId) {
}
} // namespace L0

View File

@@ -0,0 +1,88 @@
/*
* Copyright (C) 2020-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h"
#include "sysman/linux/os_sysman_imp.h"
namespace L0 {
// Collects the supported RAS error types by asking the GT, fabric and HBM sources in
// that order. Later sources are skipped once both error types have been reported.
void OsRas::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle) {
    constexpr size_t allErrorTypes = 2;

    LinuxRasSourceGt::getSupportedRasErrorTypes(errorType, pOsSysman, deviceHandle);
    if (errorType.size() >= allErrorTypes) {
        return;
    }

    LinuxRasSourceFabric::getSupportedRasErrorTypes(errorType, pOsSysman, deviceHandle);
    if (errorType.size() >= allErrorTypes) {
        return;
    }

    LinuxRasSourceHbm::getSupportedRasErrorTypes(errorType, pOsSysman, deviceHandle);
}
// Copies the stored total threshold and per-category thresholds into config.
// Always returns ZE_RESULT_SUCCESS.
ze_result_t LinuxRasImp::osRasGetConfig(zes_ras_config_t *config) {
    auto &destination = config->detailedThresholds.category;
    memcpy(destination, categoryThreshold, sizeof(destination));
    config->totalThreshold = totalThreshold;
    return ZE_RESULT_SUCCESS;
}
// Stores the total threshold and per-category thresholds from config.
// Only the root user may change them; everyone else gets
// ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS and nothing is modified.
ze_result_t LinuxRasImp::osRasSetConfig(const zes_ras_config_t *config) {
    if (pFsAccess->isRootUser() != true) {
        return ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS;
    }
    const auto &source = config->detailedThresholds.category;
    totalThreshold = config->totalThreshold;
    memcpy(categoryThreshold, source, sizeof(source));
    return ZE_RESULT_SUCCESS;
}
// Fills properties with the error type and sub-device placement of this RAS handle and
// resets its extension chain pointer. Always returns ZE_RESULT_SUCCESS.
ze_result_t LinuxRasImp::osRasGetProperties(zes_ras_properties_t &properties) {
    properties.type = osRasErrorType;
    properties.onSubdevice = isSubdevice;
    properties.subdeviceId = subdeviceId;
    properties.pNext = nullptr;
    return ZE_RESULT_SUCCESS;
}
// Aggregates the error counts of all RAS sources.
//
// state - the per-category counts of every source that answers successfully are added
//         to the values already present in state
// clear - forwarded to every source; requires the root user
// Returns ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS when clear is requested by a non-root
// user, ZE_RESULT_SUCCESS when at least one source answered, and
// ZE_RESULT_ERROR_UNSUPPORTED_FEATURE otherwise.
ze_result_t LinuxRasImp::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) {
    if (clear == true && pFsAccess->isRootUser() == false) {
        return ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS;
    }

    bool anySourceAnswered = false;
    for (auto &source : rasSources) {
        zes_ras_state_t sourceState = {};
        if (source->osRasGetState(sourceState, clear) == ZE_RESULT_SUCCESS) {
            for (int category = 0; category < ZES_MAX_RAS_ERROR_CATEGORY_COUNT; category++) {
                state.category[category] += sourceState.category[category];
            }
            anySourceAnswered = true;
        }
    }
    return anySourceAnswered ? ZE_RESULT_SUCCESS : ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
void LinuxRasImp::initSources() {
rasSources.push_back(std::make_unique<L0::LinuxRasSourceGt>(pLinuxSysmanImp, osRasErrorType, isSubdevice, subdeviceId));
rasSources.push_back(std::make_unique<L0::LinuxRasSourceFabric>(pLinuxSysmanImp, osRasErrorType, subdeviceId));
rasSources.push_back(std::make_unique<L0::LinuxRasSourceHbm>(pLinuxSysmanImp, osRasErrorType, subdeviceId));
}
// Binds the RAS handle to a sysman instance, caches its filesystem accessor and creates
// the error sources for the given error type and sub-device.
LinuxRasImp::LinuxRasImp(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId)
    : osRasErrorType(type),
      pFsAccess(&static_cast<LinuxSysmanImp *>(pOsSysman)->getFsAccess()),
      pLinuxSysmanImp(static_cast<LinuxSysmanImp *>(pOsSysman)),
      isSubdevice(onSubdevice),
      subdeviceId(subdeviceId) {
    initSources();
}
// Factory for the Linux implementation; the returned object is allocated with new and
// handed to the caller as an OsRas pointer.
OsRas *OsRas::create(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId) {
    return new LinuxRasImp(pOsSysman, type, onSubdevice, subdeviceId);
}
} // namespace L0

View File

@@ -0,0 +1,128 @@
/*
* Copyright (C) 2020-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#pragma once
#include "shared/source/helpers/non_copyable_or_moveable.h"
#include "level_zero/tools/source/sysman/ras/os_ras.h"
#include <map>
#include <memory>
#include <string>
#include <vector>
namespace L0 {
class FsAccess;
class SysfsAccess;
class PmuInterface;
class LinuxSysmanImp;
class LinuxRasSources;
class FirmwareUtil;
struct Device;
class LinuxRasImp : public OsRas, NEO::NonCopyableOrMovableClass {
public:
ze_result_t osRasGetProperties(zes_ras_properties_t &properties) override;
ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
ze_result_t osRasGetConfig(zes_ras_config_t *config) override;
ze_result_t osRasSetConfig(const zes_ras_config_t *config) override;
LinuxRasImp(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId);
LinuxRasImp() = default;
~LinuxRasImp() override = default;
protected:
zes_ras_error_type_t osRasErrorType = {};
FsAccess *pFsAccess = nullptr;
LinuxSysmanImp *pLinuxSysmanImp = nullptr;
std::vector<std::unique_ptr<L0::LinuxRasSources>> rasSources = {};
private:
void initSources();
bool isSubdevice = false;
uint32_t subdeviceId = 0;
uint64_t totalThreshold = 0;
uint64_t categoryThreshold[ZES_MAX_RAS_ERROR_CATEGORY_COUNT] = {0};
};
// Interface of a single error source that LinuxRasImp aggregates.
class LinuxRasSources : NEO::NonCopyableOrMovableClass {
  public:
    // Writes this source's per-category error counts into state; clear asks the source
    // to restart counting.
    virtual ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) = 0;
    virtual ~LinuxRasSources() = default;
};
class LinuxRasSourceGt : public LinuxRasSources {
public:
virtual ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
static void getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle);
LinuxRasSourceGt(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId);
LinuxRasSourceGt() = default;
virtual ~LinuxRasSourceGt();
protected:
LinuxSysmanImp *pLinuxSysmanImp = nullptr;
zes_ras_error_type_t osRasErrorType = {};
PmuInterface *pPmuInterface = nullptr;
FsAccess *pFsAccess = nullptr;
SysfsAccess *pSysfsAccess = nullptr;
private:
void initRasErrors(ze_bool_t clear);
ze_result_t getPmuConfig(
const std::string &eventDirectory,
const std::vector<std::string> &listOfEvents,
const std::string &errorFileToGetConfig,
std::string &pmuConfig);
ze_result_t getBootUpErrorCountFromSysfs(
std::string nameOfError,
const std::string &errorCounterDir,
uint64_t &errorVal);
void closeFds();
int64_t groupFd = -1;
std::vector<int64_t> memberFds = {};
uint64_t initialErrorCount[ZES_MAX_RAS_ERROR_CATEGORY_COUNT] = {0};
std::map<zes_ras_error_cat_t, uint64_t> errorCategoryToEventCount;
uint64_t totalEventCount = 0;
bool isSubdevice = false;
uint32_t subdeviceId = 0;
};
class LinuxRasSourceFabric : public LinuxRasSources {
public:
static ze_result_t getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle);
LinuxRasSourceFabric(OsSysman *pOsSysman, zes_ras_error_type_t type, uint32_t subDeviceId);
~LinuxRasSourceFabric() = default;
ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
private:
FsAccess *fsAccess = nullptr;
std::vector<std::string> errorNodes = {};
uint64_t baseComputeErrorCount = 0;
uint64_t getComputeErrorCount();
static void getNodes(std::vector<std::string> &nodes, uint32_t subdeviceId, FsAccess *fsAccess, const zes_ras_error_type_t &type);
};
class LinuxRasSourceHbm : public LinuxRasSources {
public:
virtual ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
static void getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle);
LinuxRasSourceHbm(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, uint32_t subdeviceId);
LinuxRasSourceHbm() = default;
virtual ~LinuxRasSourceHbm() override{};
protected:
LinuxSysmanImp *pLinuxSysmanImp = nullptr;
zes_ras_error_type_t osRasErrorType = {};
FirmwareUtil *pFwInterface = nullptr;
Device *pDevice = nullptr;
private:
uint64_t errorBaseline = 0;
uint32_t subdeviceId = 0;
};
} // namespace L0