mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-05 09:09:04 +08:00
feature(sysman): Add support for RAS module
- Port RAS module to new sysman design - Add RAS ULTs for new sysman interface Related-To: LOCI-4246 Signed-off-by: Aravind Gopalakrishnan <aravind.gopalakrishnan@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
cfacbbd811
commit
826abf338a
28
level_zero/sysman/source/ras/linux/CMakeLists.txt
Normal file
28
level_zero/sysman/source/ras/linux/CMakeLists.txt
Normal file
@@ -0,0 +1,28 @@
|
||||
#
|
||||
# Copyright (C) 2023 Intel Corporation
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
#
|
||||
|
||||
# Linux sources of the Sysman RAS module.
# Builds with NEO_ENABLE_i915_PRELIM_DETECTION compile the prelim implementation
# (os_ras_imp_prelim plus its GT and HBM sources); all other builds compile the
# plain os_ras_imp implementation.
if(UNIX)
  if(NEO_ENABLE_i915_PRELIM_DETECTION)
    target_sources(${L0_STATIC_LIB_NAME}
                   PRIVATE
                   ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
                   ${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_prelim.cpp
                   ${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_prelim.h
                   ${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_gt.cpp
                   ${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_hbm.cpp
    )
  else()
    target_sources(${L0_STATIC_LIB_NAME}
                   PRIVATE
                   ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
                   ${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp.cpp
                   ${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp.h
    )
  endif()
endif()
|
||||
57
level_zero/sysman/source/ras/linux/os_ras_imp.cpp
Normal file
57
level_zero/sysman/source/ras/linux/os_ras_imp.cpp
Normal file
@@ -0,0 +1,57 @@
|
||||
/*
|
||||
* Copyright (C) 2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#include "level_zero/sysman/source/ras/linux/os_ras_imp.h"
|
||||
|
||||
#include "level_zero/sysman/source/linux/os_sysman_imp.h"
|
||||
|
||||
#include <cstring>
|
||||
|
||||
namespace L0 {
|
||||
namespace Sysman {
|
||||
|
||||
// Records the error type and sub-device placement of this RAS handle and caches
// the Linux sysman object together with its filesystem accessor.
LinuxRasImp::LinuxRasImp(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId)
    : osRasErrorType(type),
      isSubdevice(onSubdevice),
      subdeviceId(subdeviceId) {
    auto *pSysman = static_cast<LinuxSysmanImp *>(pOsSysman);
    pLinuxSysmanImp = pSysman;
    pFsAccess = &pSysman->getFsAccess();
}
|
||||
|
||||
// This implementation reports no RAS error types: errorType is left exactly as
// the caller passed it in and the remaining arguments are unused.
void OsRas::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_bool_t isSubDevice, uint32_t subDeviceId) {
}
|
||||
|
||||
// Reading RAS counters is not implemented here: state is never written and the
// call always reports ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, regardless of clear.
ze_result_t LinuxRasImp::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) {
    const ze_result_t unsupported = ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    return unsupported;
}
|
||||
|
||||
// Copies the thresholds held by this object (total and per-category) into
// *config. Always succeeds; config is dereferenced without a null check.
ze_result_t LinuxRasImp::osRasGetConfig(zes_ras_config_t *config) {
    const auto thresholdBytes = maxRasErrorCategoryCount * sizeof(uint64_t);
    memcpy(config->detailedThresholds.category, categoryThreshold, thresholdBytes);
    config->totalThreshold = totalThreshold;
    return ZE_RESULT_SUCCESS;
}
|
||||
|
||||
// Stores the thresholds from *config in this object.
// Only the root user may change them; any other caller gets
// ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS and nothing is modified.
ze_result_t LinuxRasImp::osRasSetConfig(const zes_ras_config_t *config) {
    const bool callerIsRoot = (pFsAccess->isRootUser() == true);
    if (!callerIsRoot) {
        return ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS;
    }

    const auto thresholdBytes = maxRasErrorCategoryCount * sizeof(uint64_t);
    totalThreshold = config->totalThreshold;
    memcpy(categoryThreshold, config->detailedThresholds.category, thresholdBytes);
    return ZE_RESULT_SUCCESS;
}
|
||||
|
||||
// Fills properties with the error type and sub-device placement given at
// construction. pNext is reset to nullptr. Always succeeds.
ze_result_t LinuxRasImp::osRasGetProperties(zes_ras_properties_t &properties) {
    properties.type = osRasErrorType;
    properties.subdeviceId = subdeviceId;
    properties.onSubdevice = isSubdevice;
    properties.pNext = nullptr;
    return ZE_RESULT_SUCCESS;
}
|
||||
|
||||
// Factory for the Linux RAS handle. The object is allocated with new and
// returned through the OsRas interface pointer.
OsRas *OsRas::create(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId) {
    OsRas *pOsRas = new LinuxRasImp(pOsSysman, type, onSubdevice, subdeviceId);
    return pOsRas;
}
|
||||
|
||||
} // namespace Sysman
|
||||
} // namespace L0
|
||||
41
level_zero/sysman/source/ras/linux/os_ras_imp.h
Normal file
41
level_zero/sysman/source/ras/linux/os_ras_imp.h
Normal file
@@ -0,0 +1,41 @@
|
||||
/*
|
||||
* Copyright (C) 2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "shared/source/helpers/non_copyable_or_moveable.h"
|
||||
|
||||
#include "level_zero/sysman/source/linux/fs_access.h"
|
||||
#include "level_zero/sysman/source/ras/os_ras.h"
|
||||
#include "level_zero/sysman/source/sysman_const.h"
|
||||
|
||||
namespace L0 {
|
||||
namespace Sysman {
|
||||
class LinuxSysmanImp;
|
||||
class LinuxRasImp : public OsRas, NEO::NonCopyableOrMovableClass {
|
||||
public:
|
||||
ze_result_t osRasGetProperties(zes_ras_properties_t &properties) override;
|
||||
ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
|
||||
ze_result_t osRasGetConfig(zes_ras_config_t *config) override;
|
||||
ze_result_t osRasSetConfig(const zes_ras_config_t *config) override;
|
||||
LinuxRasImp(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId);
|
||||
LinuxRasImp() = default;
|
||||
~LinuxRasImp() override = default;
|
||||
|
||||
protected:
|
||||
zes_ras_error_type_t osRasErrorType = {};
|
||||
FsAccess *pFsAccess = nullptr;
|
||||
LinuxSysmanImp *pLinuxSysmanImp = nullptr;
|
||||
|
||||
private:
|
||||
bool isSubdevice = false;
|
||||
uint32_t subdeviceId = 0;
|
||||
uint64_t totalThreshold = 0;
|
||||
uint64_t categoryThreshold[maxRasErrorCategoryCount] = {0};
|
||||
};
|
||||
|
||||
} // namespace Sysman
|
||||
} // namespace L0
|
||||
284
level_zero/sysman/source/ras/linux/os_ras_imp_gt.cpp
Normal file
284
level_zero/sysman/source/ras/linux/os_ras_imp_gt.cpp
Normal file
@@ -0,0 +1,284 @@
|
||||
/*
|
||||
* Copyright (C) 2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#include "level_zero/sysman/source/linux/os_sysman_imp.h"
|
||||
#include "level_zero/sysman/source/ras/linux/os_ras_imp_prelim.h"
|
||||
|
||||
#include <cstring>
|
||||
|
||||
namespace L0 {
|
||||
namespace Sysman {
|
||||
|
||||
static const std::map<zes_ras_error_cat_t, std::vector<std::string>> categoryToListOfEventsUncorrectable = {
|
||||
{ZES_RAS_ERROR_CAT_CACHE_ERRORS,
|
||||
{"fatal-array-bist", "fatal-idi-parity", "fatal-l3-double",
|
||||
"fatal-l3-ecc-checker",
|
||||
"fatal-sqidi", "fatal-tlb", "fatal-l3bank"}},
|
||||
{ZES_RAS_ERROR_CAT_RESET,
|
||||
{"engine-reset"}},
|
||||
{ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS,
|
||||
{"eu-attention"}},
|
||||
{ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS,
|
||||
{"soc-fatal-psf-0", "soc-fatal-psf-1", "soc-fatal-psf-2", "soc-fatal-psf-csc-0",
|
||||
"soc-fatal-psf-csc-1", "soc-fatal-psf-csc-2", "soc-fatal-punit",
|
||||
"sgunit-fatal", "soc-nonfatal-punit", "sgunit-fatal", "sgunit-nonfatal", "gsc-nonfatal-mia-shutdown",
|
||||
"gsc-nonfatal-aon-parity", "gsc-nonfatal-rom-parity", "gsc-nonfatal-fuse-crc-check",
|
||||
"gsc-nonfatal-selfmbist", "gsc-nonfatal-fuse-pull", "gsc-nonfatal-sram-ecc", "gsc-nonfatal-glitch-det",
|
||||
"gsc-nonfatal-ucode-parity", "gsc-nonfatal-mia-int", "gsc-nonfatal-wdg-timeout"}},
|
||||
{ZES_RAS_ERROR_CAT_COMPUTE_ERRORS,
|
||||
{"fatal-fpu", "fatal-eu-grf", "fatal-sampler", "fatal-slm",
|
||||
"fatal-guc", "fatal-eu-ic", "fatal-subslice"}},
|
||||
{ZES_RAS_ERROR_CAT_DRIVER_ERRORS,
|
||||
{"driver-object-migration", "driver-engine-other", "driver-ggtt",
|
||||
"driver-gt-interrupt", "driver-gt-other", "driver-guc-communication",
|
||||
"driver-rps"}}};
|
||||
|
||||
static const std::map<zes_ras_error_cat_t, std::vector<std::string>> categoryToListOfEventsCorrectable = {
|
||||
{ZES_RAS_ERROR_CAT_CACHE_ERRORS,
|
||||
{"correctable-l3-sng", "correctable-l3bank"}},
|
||||
{ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS,
|
||||
{"sgunit-correctable", "gsc-correctable-sram-ecc"}},
|
||||
{ZES_RAS_ERROR_CAT_COMPUTE_ERRORS,
|
||||
{"correctable-eu-grf", "correctable-eu-ic", "correctable-guc", "correctable-sampler", "correctable-slm", "correctable-subslice"}}};
|
||||
|
||||
// Releases the descriptor stored in fd and resets the slot to -1 ("no descriptor").
// Only non-negative values are handed to close(): besides the -1 sentinel, the
// slot may hold another negative value (initRasErrors() stores the raw result of
// pmuInterfaceOpen() for group members), and such values are not open descriptors.
static void closeFd(int64_t &fd) {
    if (fd >= 0) {
        close(static_cast<int>(fd));
    }
    fd = -1;
}
|
||||
|
||||
// Lists the PMU event files the i915 driver exposes for this device.
//
// The events live in /sys/devices/i915_<bdf>/events, where <bdf> is the PCI
// address of the device with ':' replaced by '_' (for a device in slot
// 0000:01:00.0 the directory is /sys/devices/i915_0000_01_00.0/events).
// The PCI address is taken from the last component of the "device" symlink.
//
// listOfEvents   receives the names found in the events directory.
// eventDirectory optional; when non-null it receives the events directory path,
//                even if listing the directory subsequently fails.
// Returns ZE_RESULT_SUCCESS, or ZE_RESULT_ERROR_UNSUPPORTED_FEATURE when the
// symlink cannot be read or the directory cannot be listed.
static ze_result_t readI915EventsDirectory(LinuxSysmanImp *pLinuxSysmanImp, std::vector<std::string> &listOfEvents, std::string *eventDirectory) {
    const std::string deviceLinkName("device");
    std::string deviceLinkTarget;
    if (pLinuxSysmanImp->getSysfsAccess().readSymLink(deviceLinkName, deviceLinkTarget) != ZE_RESULT_SUCCESS) {
        return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    }

    // Without any '/', find_last_of() yields npos and npos + 1 wraps to 0,
    // so the whole link target is used.
    std::string pciAddress = deviceLinkTarget.substr(deviceLinkTarget.find_last_of('/') + 1);
    std::replace(pciAddress.begin(), pciAddress.end(), ':', '_');

    const std::string eventsPath = std::string("/sys/devices/") + "i915_" + pciAddress + "/" + "events";
    if (eventDirectory != nullptr) {
        *eventDirectory = eventsPath;
    }

    if (pLinuxSysmanImp->getFsAccess().listDirectory(eventsPath, listOfEvents) != ZE_RESULT_SUCCESS) {
        return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    }
    return ZE_RESULT_SUCCESS;
}
|
||||
|
||||
// Parses the hexadecimal number that follows the first '=' in strVal
// (e.g. "config=0x10" yields 16). When there is no '=', the whole string is
// parsed. An optional "0x" prefix is accepted and trailing text is ignored.
// Returns 0 when no number can be extracted.
static uint64_t convertHexToUint64(const std::string &strVal) {
    const auto separator = strVal.find('=');
    const std::string digits = (separator == std::string::npos) ? strVal : strVal.substr(separator + 1);

    std::stringstream ss;
    ss << digits;
    uint64_t config = 0;
    ss >> std::hex >> config;
    return config;
}
|
||||
|
||||
static bool getErrorType(std::map<zes_ras_error_cat_t, std::vector<std::string>> categoryToListOfEvents, std::vector<std::string> &eventList, ze_bool_t isSubDevice, uint32_t subDeviceId) {
|
||||
// Naming convention of files containing config values for errors
|
||||
// error--<Name of error> Ex:- error--engine-reset (config file with no subdevice)
|
||||
// error-gt<N>--<Name of error> Ex:- error-gt0--engine-reset (config file with subdevices)
|
||||
// error--<Name of error> Ex:- error--driver-object-migration (config file for device level errors)
|
||||
std::string errorPrefix = "error--"; // prefix string of the file containing config value for pmu counters
|
||||
if (isSubDevice == true) {
|
||||
errorPrefix = "error-gt" + std::to_string(subDeviceId) + "--";
|
||||
}
|
||||
for (auto const &rasErrorCatToListOfEvents : categoryToListOfEvents) {
|
||||
for (auto const &nameOfError : rasErrorCatToListOfEvents.second) {
|
||||
std::string errorPrefixLocal = errorPrefix;
|
||||
if (nameOfError == "driver-object-migration") { // check for errors which occur at device level
|
||||
errorPrefixLocal = "error--";
|
||||
}
|
||||
if (std::find(eventList.begin(), eventList.end(), errorPrefixLocal + nameOfError) != eventList.end()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Closes every PMU descriptor owned by this source: first the group members,
// whose list is then emptied, and finally the group leader.
void LinuxRasSourceGt::closeFds() {
    for (auto it = memberFds.begin(); it != memberFds.end(); ++it) {
        closeFd(*it);
    }
    memberFds.clear();

    closeFd(groupFd);
}
|
||||
|
||||
// Releases the PMU descriptors that are still open when the source goes away.
LinuxRasSourceGt::~LinuxRasSourceGt() {
    this->closeFds();
}
|
||||
|
||||
void LinuxRasSourceGt::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_bool_t isSubDevice, uint32_t subDeviceId) {
|
||||
LinuxSysmanImp *pLinuxSysmanImp = static_cast<LinuxSysmanImp *>(pOsSysman);
|
||||
std::vector<std::string> listOfEvents = {};
|
||||
ze_result_t result = readI915EventsDirectory(pLinuxSysmanImp, listOfEvents, nullptr);
|
||||
if (result != ZE_RESULT_SUCCESS) {
|
||||
return;
|
||||
}
|
||||
if (getErrorType(categoryToListOfEventsCorrectable, listOfEvents, isSubDevice, subDeviceId) == true) {
|
||||
errorType.insert(ZES_RAS_ERROR_TYPE_CORRECTABLE);
|
||||
}
|
||||
if (getErrorType(categoryToListOfEventsUncorrectable, listOfEvents, isSubDevice, subDeviceId) == true) {
|
||||
errorType.insert(ZES_RAS_ERROR_TYPE_UNCORRECTABLE);
|
||||
}
|
||||
}
|
||||
|
||||
// Reads the GT error counters of this source into state.category.
//
// clear == true closes the open PMU descriptors and zeroes the cached totals and
// state.category before the counters are (re)opened by initRasErrors(); in that
// case the boot-up counts from sysfs are not read again.
//
// For every category recorded in errorCategoryToEventCount the result is the sum
// of its PMU counters plus the initial count captured by initRasErrors().
// Categories without a record are not written.
//
// Returns ZE_RESULT_ERROR_UNSUPPORTED_FEATURE when no PMU group could be opened
// or the group read fails, ZE_RESULT_SUCCESS otherwise.
ze_result_t LinuxRasSourceGt::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) {
    if (clear == true) {
        closeFds();
        totalEventCount = 0;
        memset(state.category, 0, maxRasErrorCategoryCount * sizeof(uint64_t));
        memset(initialErrorCount, 0, maxRasErrorCategoryCount * sizeof(uint64_t));
    }
    initRasErrors(clear);
    if (groupFd < 0) {
        return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    }

    // In data[], event count starts from second index, first value gives number of events and second value is for timestamp
    std::vector<std::uint64_t> data(2 + totalEventCount, 0);
    if (pPmuInterface->pmuRead(static_cast<int>(groupFd), data.data(), sizeof(uint64_t) * data.size()) < 0) {
        return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    }

    // The counters of each category occupy consecutive slots of the buffer, in the
    // iteration order of errorCategoryToEventCount; sum every slice into its category.
    uint64_t initialIndex = 2; // Initial index in the buffer from which the data be parsed begins
    for (const auto &categoryAndEventCount : errorCategoryToEventCount) {
        const auto category = categoryAndEventCount.first;
        const uint64_t eventsInCategory = categoryAndEventCount.second;
        uint64_t errorCount = 0;
        for (uint64_t j = 0; j < eventsInCategory; j++) {
            errorCount += data[initialIndex + j];
        }
        state.category[category] = errorCount + initialErrorCount[category];
        initialIndex += eventsInCategory;
    }

    return ZE_RESULT_SUCCESS;
}
|
||||
|
||||
// Reads the PMU config string of one event file.
//
// eventDirectory       directory holding the event files
// listOfEvents         names known to exist in that directory
// errorFileToGetConfig name of the event file to read
// pmuConfig            receives the file content
//
// Returns ZE_RESULT_ERROR_UNKNOWN when the file is not part of listOfEvents,
// otherwise the result of reading <eventDirectory>/<errorFileToGetConfig>.
ze_result_t LinuxRasSourceGt::getPmuConfig(
    const std::string &eventDirectory,
    const std::vector<std::string> &listOfEvents,
    const std::string &errorFileToGetConfig,
    std::string &pmuConfig) {
    const bool eventIsListed = std::find(listOfEvents.begin(), listOfEvents.end(), errorFileToGetConfig) != listOfEvents.end();
    if (!eventIsListed) {
        return ZE_RESULT_ERROR_UNKNOWN;
    }

    const std::string configFile = eventDirectory + "/" + errorFileToGetConfig;
    return pFsAccess->read(configFile, pmuConfig);
}
|
||||
|
||||
// Reads the current value of one error counter from sysfs.
//
// nameOfError     PMU event name; the sysfs node uses the same name with every
//                 '-' replaced by '_'
// errorCounterDir sysfs directory that contains the counter nodes
// errorVal        receives the counter value
//
// Returns the result of the sysfs read.
ze_result_t LinuxRasSourceGt::getBootUpErrorCountFromSysfs(
    std::string nameOfError,
    const std::string &errorCounterDir,
    uint64_t &errorVal) {
    // nameOfError is a by-value copy, so it can be converted in place
    std::replace(nameOfError.begin(), nameOfError.end(), '-', '_');
    const std::string counterNode = errorCounterDir + "/" + nameOfError;
    return pSysfsAccess->read(counterNode, errorVal);
}
|
||||
|
||||
// Opens one PMU counter per supported error event and records, per category,
// how many counters were opened and the error count already present in sysfs.
//
// Does nothing when the PMU group is already open (groupFd >= 0). The first
// event that can be opened becomes the group leader; the others join its group.
// Events are skipped when their sysfs counter (read only if clear == false) or
// their PMU config file is unavailable, or when they cannot be added to the
// group. The function gives up entirely when the events directory cannot be read
// or the group leader cannot be opened.
//
// Skipping members that fail to open matters for osRasGetState(): it slices the
// group read buffer by the per-category counts stored here, so only events that
// really are part of the group may be counted.
void LinuxRasSourceGt::initRasErrors(ze_bool_t clear) {

    // if already initialized
    if (groupFd >= 0) {
        return;
    }

    std::string eventDirectory;
    std::vector<std::string> listOfEvents = {};
    ze_result_t result = readI915EventsDirectory(pLinuxSysmanImp, listOfEvents, &eventDirectory);
    if (result != ZE_RESULT_SUCCESS) {
        return;
    }

    // Point at the static table instead of copying it.
    const std::map<zes_ras_error_cat_t, std::vector<std::string>> *pCategoryToListOfEvents = nullptr;
    if (osRasErrorType == ZES_RAS_ERROR_TYPE_CORRECTABLE) {
        pCategoryToListOfEvents = &categoryToListOfEventsCorrectable;
    }
    if (osRasErrorType == ZES_RAS_ERROR_TYPE_UNCORRECTABLE) {
        pCategoryToListOfEvents = &categoryToListOfEventsUncorrectable;
    }
    if (pCategoryToListOfEvents == nullptr) {
        return; // no table for this error type, nothing to open
    }

    std::string errorPrefix = "error--";                  // prefix string of the file containing config value for pmu counters
    std::string errorCounterDir = "gt/gt0/error_counter"; // Directory containing the sysfs nodes which in turn contains initial value of error count
    if (isSubdevice == true) {
        errorPrefix = "error-gt" + std::to_string(subdeviceId) + "--";
        errorCounterDir = "gt/gt" + std::to_string(subdeviceId) + "/error_counter";
    }

    // Following loop retrieves initial count of errors from sysfs and pmu config values for each ras error
    // PMU: error--<Name of error> Ex:- error--engine-reset (config with no subdevice)
    // PMU: error-gt<N>--<Name of error> Ex:- error-gt0--engine-reset (config with subdevices)
    // PMU: error--<Name of error> Ex:- error--driver-object-migration (config for device level errors)
    // Sysfs: card0/gt/gt0/error_counter/<Name of error> Ex:- gt/gt0/error_counter/engine_reset (sysfs with no subdevice)
    // Sysfs: card0/gt/gt<N>/error_counter/<Name of error> Ex:- gt/gt1/error_counter/engine_reset (sysfs with subdevices)
    // Sysfs: error_counter/<Name of error> Ex:- error_counter/driver_object_migration (sysfs for error which occur at device level)
    for (auto const &rasErrorCatToListOfEvents : *pCategoryToListOfEvents) {
        uint64_t eventCount = 0;
        uint64_t errorCount = 0;
        for (auto const &nameOfError : rasErrorCatToListOfEvents.second) {
            std::string errorPrefixLocal = errorPrefix;
            std::string errorCounterDirLocal = errorCounterDir;
            if (nameOfError == "driver-object-migration") { // check for errors which occur at device level
                errorCounterDirLocal = "error_counter";
                errorPrefixLocal = "error--";
            }

            uint64_t initialErrorVal = 0;
            if (clear == false) {
                result = getBootUpErrorCountFromSysfs(nameOfError, errorCounterDirLocal, initialErrorVal);
                if (result != ZE_RESULT_SUCCESS) {
                    continue;
                }
            }

            std::string pmuConfig;
            result = getPmuConfig(eventDirectory, listOfEvents, errorPrefixLocal + nameOfError, pmuConfig);
            if (result != ZE_RESULT_SUCCESS) {
                continue;
            }

            uint64_t config = convertHexToUint64(pmuConfig);
            if (groupFd == -1) {
                groupFd = pPmuInterface->pmuInterfaceOpen(config, -1, PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_GROUP); // To get file descriptor of the group leader
                if (groupFd < 0) {
                    return;
                }
            } else {
                // The rest of the group members are created with subsequent calls with groupFd being set to the file descriptor of the group leader
                const auto memberFd = pPmuInterface->pmuInterfaceOpen(config, static_cast<int>(groupFd), PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_GROUP);
                if (memberFd < 0) {
                    // Not part of the group: it contributes no value to the group read, so it must not be counted
                    continue;
                }
                memberFds.push_back(memberFd);
            }
            eventCount++;
            errorCount += initialErrorVal;
        }
        initialErrorCount[rasErrorCatToListOfEvents.first] = errorCount;
        errorCategoryToEventCount[rasErrorCatToListOfEvents.first] = eventCount;
        totalEventCount += eventCount;
    }
}
|
||||
|
||||
// Stores the error type and sub-device placement and caches the PMU, filesystem
// and sysfs accessors of the given sysman object. No PMU counter is opened here.
LinuxRasSourceGt::LinuxRasSourceGt(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId)
    : pLinuxSysmanImp(pLinuxSysmanImp),
      osRasErrorType(type),
      isSubdevice(onSubdevice),
      subdeviceId(subdeviceId) {
    this->pPmuInterface = pLinuxSysmanImp->getPmuInterface();
    this->pFsAccess = &pLinuxSysmanImp->getFsAccess();
    this->pSysfsAccess = &pLinuxSysmanImp->getSysfsAccess();
}
|
||||
|
||||
} // namespace Sysman
|
||||
} // namespace L0
|
||||
55
level_zero/sysman/source/ras/linux/os_ras_imp_hbm.cpp
Normal file
55
level_zero/sysman/source/ras/linux/os_ras_imp_hbm.cpp
Normal file
@@ -0,0 +1,55 @@
|
||||
/*
|
||||
* Copyright (C) 2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#include "shared/source/helpers/gfx_core_helper.h"
|
||||
|
||||
#include "level_zero/sysman/source/firmware_util/firmware_util.h"
|
||||
#include "level_zero/sysman/source/linux/os_sysman_imp.h"
|
||||
#include "level_zero/sysman/source/ras/linux/os_ras_imp_prelim.h"
|
||||
|
||||
namespace L0 {
|
||||
namespace Sysman {
|
||||
|
||||
void LinuxRasSourceHbm::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_bool_t isSubDevice, uint32_t subDeviceId) {
|
||||
LinuxSysmanImp *pLinuxSysmanImp = static_cast<LinuxSysmanImp *>(pOsSysman);
|
||||
FirmwareUtil *pFwInterface = pLinuxSysmanImp->getFwUtilInterface();
|
||||
if (pFwInterface != nullptr) {
|
||||
errorType.insert(ZES_RAS_ERROR_TYPE_CORRECTABLE);
|
||||
errorType.insert(ZES_RAS_ERROR_TYPE_UNCORRECTABLE);
|
||||
}
|
||||
}
|
||||
|
||||
// Reports the memory error count obtained from the firmware interface in
// state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS], relative to the baseline
// stored in this object.
//
// clear == true first queries the firmware and stores that count as the new
// baseline; the count is then queried again for the reported value.
//
// Returns ZE_RESULT_ERROR_UNSUPPORTED_FEATURE without a firmware interface, the
// failing result of a firmware query, or ZE_RESULT_SUCCESS.
ze_result_t LinuxRasSourceHbm::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) {
    if (pFwInterface == nullptr) {
        return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    }

    const uint32_t subDeviceCount = NEO::GfxCoreHelper::getSubDevicesCount(&pDevice->getHardwareInfo());

    if (clear == true) {
        uint64_t countAtClear = 0;
        const ze_result_t baselineResult = pFwInterface->fwGetMemoryErrorCount(osRasErrorType, subDeviceCount, subdeviceId, countAtClear);
        if (baselineResult != ZE_RESULT_SUCCESS) {
            return baselineResult;
        }
        errorBaseline = countAtClear; // during clear update the error baseline value
    }

    uint64_t currentCount = 0;
    const ze_result_t queryResult = pFwInterface->fwGetMemoryErrorCount(osRasErrorType, subDeviceCount, subdeviceId, currentCount);
    if (queryResult != ZE_RESULT_SUCCESS) {
        return queryResult;
    }

    state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS] = currentCount - errorBaseline;
    return ZE_RESULT_SUCCESS;
}
|
||||
|
||||
// Stores the error type and sub-device id and caches the firmware utility
// interface and the sysman device of the given sysman object.
LinuxRasSourceHbm::LinuxRasSourceHbm(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, uint32_t subdeviceId)
    : pLinuxSysmanImp(pLinuxSysmanImp),
      osRasErrorType(type),
      subdeviceId(subdeviceId) {
    this->pFwInterface = pLinuxSysmanImp->getFwUtilInterface();
    this->pDevice = pLinuxSysmanImp->getSysmanDeviceImp();
}
|
||||
|
||||
} // namespace Sysman
|
||||
} // namespace L0
|
||||
104
level_zero/sysman/source/ras/linux/os_ras_imp_prelim.cpp
Normal file
104
level_zero/sysman/source/ras/linux/os_ras_imp_prelim.cpp
Normal file
@@ -0,0 +1,104 @@
|
||||
/*
|
||||
* Copyright (C) 2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#include "level_zero/sysman/source/ras/linux/os_ras_imp_prelim.h"
|
||||
|
||||
#include "shared/source/helpers/string.h"
|
||||
#include "shared/source/os_interface/linux/system_info.h"
|
||||
|
||||
#include "level_zero/sysman/source/linux/os_sysman_imp.h"
|
||||
|
||||
#include "drm/intel_hwconfig_types.h"
|
||||
|
||||
namespace L0 {
|
||||
namespace Sysman {
|
||||
|
||||
// True when the memory type reported by the sysman object is HBM2e or HBM2.
static bool isMemoryTypeHbm(LinuxSysmanImp *pLinuxSysmanImp) {
    const uint32_t memType = pLinuxSysmanImp->getMemoryType();
    const bool isHbm2e = (memType == INTEL_HWCONFIG_MEMORY_TYPE_HBM2e);
    const bool isHbm2 = (memType == INTEL_HWCONFIG_MEMORY_TYPE_HBM2);
    return isHbm2e || isHbm2;
}
|
||||
|
||||
void OsRas::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_bool_t isSubDevice, uint32_t subDeviceId) {
|
||||
|
||||
constexpr auto maxErrorTypes = 2;
|
||||
LinuxRasSourceGt::getSupportedRasErrorTypes(errorType, pOsSysman, isSubDevice, subDeviceId);
|
||||
if (errorType.size() < maxErrorTypes) {
|
||||
auto pLinuxSysmanImp = static_cast<LinuxSysmanImp *>(pOsSysman);
|
||||
if (isMemoryTypeHbm(pLinuxSysmanImp) == true) {
|
||||
LinuxRasSourceHbm::getSupportedRasErrorTypes(errorType, pOsSysman, isSubDevice, subDeviceId);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Copies the thresholds held by this object (total and per-category) into
// *config. Always succeeds; config is dereferenced without a null check.
ze_result_t LinuxRasImp::osRasGetConfig(zes_ras_config_t *config) {
    const auto thresholdBytes = maxRasErrorCategoryCount * sizeof(uint64_t);
    config->totalThreshold = totalThreshold;
    memcpy_s(config->detailedThresholds.category, thresholdBytes, categoryThreshold, thresholdBytes);
    return ZE_RESULT_SUCCESS;
}
|
||||
|
||||
// Stores the thresholds from *config in this object.
// Only the root user may change them; any other caller gets
// ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS and nothing is modified.
ze_result_t LinuxRasImp::osRasSetConfig(const zes_ras_config_t *config) {
    const bool callerIsRoot = (pFsAccess->isRootUser() == true);
    if (!callerIsRoot) {
        return ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS;
    }

    const auto thresholdBytes = maxRasErrorCategoryCount * sizeof(uint64_t);
    totalThreshold = config->totalThreshold;
    memcpy_s(categoryThreshold, thresholdBytes, config->detailedThresholds.category, thresholdBytes);
    return ZE_RESULT_SUCCESS;
}
|
||||
|
||||
// Fills properties with the error type and sub-device placement given at
// construction. pNext is reset to nullptr. Always succeeds.
ze_result_t LinuxRasImp::osRasGetProperties(zes_ras_properties_t &properties) {
    properties.type = osRasErrorType;
    properties.subdeviceId = subdeviceId;
    properties.onSubdevice = isSubdevice;
    properties.pNext = nullptr;
    return ZE_RESULT_SUCCESS;
}
|
||||
|
||||
// Reports the error counters of this handle as the per-category sum over all of
// its RAS sources.
//
// clear == true is only allowed for the root user; other callers get
// ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS before any source is touched.
//
// Sources that fail are skipped. When at least one source succeeds,
// state.category is overwritten with the sum and ZE_RESULT_SUCCESS is returned.
// When none succeeds, state is left untouched and
// ZE_RESULT_ERROR_UNSUPPORTED_FEATURE is returned.
ze_result_t LinuxRasImp::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) {
    if (clear == true) {
        if (pFsAccess->isRootUser() == false) {
            return ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS;
        }
    }

    // Sum into a local buffer so that the result does not depend on whatever the
    // caller's state held on entry.
    uint64_t categoryTotals[maxRasErrorCategoryCount] = {};
    ze_result_t result = ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    for (auto &rasSource : rasSources) {
        zes_ras_state_t localState = {};
        ze_result_t localResult = rasSource->osRasGetState(localState, clear);
        if (localResult != ZE_RESULT_SUCCESS) {
            continue;
        }
        for (uint32_t i = 0; i < maxRasErrorCategoryCount; i++) {
            categoryTotals[i] += localState.category[i];
        }
        result = ZE_RESULT_SUCCESS;
    }

    if (result == ZE_RESULT_SUCCESS) {
        for (uint32_t i = 0; i < maxRasErrorCategoryCount; i++) {
            state.category[i] = categoryTotals[i];
        }
    }
    return result;
}
|
||||
|
||||
void LinuxRasImp::initSources() {
|
||||
rasSources.push_back(std::make_unique<L0::Sysman::LinuxRasSourceGt>(pLinuxSysmanImp, osRasErrorType, isSubdevice, subdeviceId));
|
||||
if (isMemoryTypeHbm(pLinuxSysmanImp) == true) {
|
||||
rasSources.push_back(std::make_unique<L0::Sysman::LinuxRasSourceHbm>(pLinuxSysmanImp, osRasErrorType, subdeviceId));
|
||||
}
|
||||
}
|
||||
|
||||
// Records the error type and sub-device placement of this RAS handle, caches the
// Linux sysman object with its filesystem accessor and creates the RAS sources.
LinuxRasImp::LinuxRasImp(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId)
    : osRasErrorType(type),
      isSubdevice(onSubdevice),
      subdeviceId(subdeviceId) {
    auto *pSysman = static_cast<LinuxSysmanImp *>(pOsSysman);
    pLinuxSysmanImp = pSysman;
    pFsAccess = &pSysman->getFsAccess();
    initSources();
}
|
||||
|
||||
// Factory for the Linux RAS handle. The object is allocated with new and
// returned through the OsRas interface pointer.
OsRas *OsRas::create(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId) {
    OsRas *pOsRas = new LinuxRasImp(pOsSysman, type, onSubdevice, subdeviceId);
    return pOsRas;
}
|
||||
|
||||
} // namespace Sysman
|
||||
} // namespace L0
|
||||
114
level_zero/sysman/source/ras/linux/os_ras_imp_prelim.h
Normal file
114
level_zero/sysman/source/ras/linux/os_ras_imp_prelim.h
Normal file
@@ -0,0 +1,114 @@
|
||||
/*
|
||||
* Copyright (C) 2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "shared/source/helpers/non_copyable_or_moveable.h"
|
||||
|
||||
#include "level_zero/sysman/source/linux/fs_access.h"
|
||||
#include "level_zero/sysman/source/linux/pmu/pmu_imp.h"
|
||||
#include "level_zero/sysman/source/ras/os_ras.h"
|
||||
#include "level_zero/sysman/source/sysman_const.h"
|
||||
#include "level_zero/sysman/source/sysman_device_imp.h"
|
||||
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace L0 {
|
||||
namespace Sysman {
|
||||
|
||||
class LinuxSysmanImp;
|
||||
class FirmwareUtil;
|
||||
|
||||
class LinuxRasSources : NEO::NonCopyableOrMovableClass {
|
||||
public:
|
||||
virtual ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) = 0;
|
||||
virtual ~LinuxRasSources() = default;
|
||||
};
|
||||
|
||||
class LinuxRasImp : public OsRas, NEO::NonCopyableOrMovableClass {
|
||||
public:
|
||||
ze_result_t osRasGetProperties(zes_ras_properties_t &properties) override;
|
||||
ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
|
||||
ze_result_t osRasGetConfig(zes_ras_config_t *config) override;
|
||||
ze_result_t osRasSetConfig(const zes_ras_config_t *config) override;
|
||||
LinuxRasImp(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId);
|
||||
LinuxRasImp() = default;
|
||||
~LinuxRasImp() override = default;
|
||||
|
||||
protected:
|
||||
zes_ras_error_type_t osRasErrorType = {};
|
||||
FsAccess *pFsAccess = nullptr;
|
||||
LinuxSysmanImp *pLinuxSysmanImp = nullptr;
|
||||
std::vector<std::unique_ptr<L0::Sysman::LinuxRasSources>> rasSources = {};
|
||||
|
||||
private:
|
||||
void initSources();
|
||||
bool isSubdevice = false;
|
||||
uint32_t subdeviceId = 0;
|
||||
uint64_t totalThreshold = 0;
|
||||
uint64_t categoryThreshold[maxRasErrorCategoryCount] = {0};
|
||||
};
|
||||
|
||||
class LinuxRasSourceGt : public LinuxRasSources {
|
||||
public:
|
||||
ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
|
||||
static void getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_bool_t isSubDevice, uint32_t subDeviceId);
|
||||
LinuxRasSourceGt(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId);
|
||||
LinuxRasSourceGt() = default;
|
||||
~LinuxRasSourceGt() override;
|
||||
|
||||
protected:
|
||||
LinuxSysmanImp *pLinuxSysmanImp = nullptr;
|
||||
zes_ras_error_type_t osRasErrorType = {};
|
||||
PmuInterface *pPmuInterface = nullptr;
|
||||
FsAccess *pFsAccess = nullptr;
|
||||
SysfsAccess *pSysfsAccess = nullptr;
|
||||
|
||||
private:
|
||||
void initRasErrors(ze_bool_t clear);
|
||||
ze_result_t getPmuConfig(
|
||||
const std::string &eventDirectory,
|
||||
const std::vector<std::string> &listOfEvents,
|
||||
const std::string &errorFileToGetConfig,
|
||||
std::string &pmuConfig);
|
||||
ze_result_t getBootUpErrorCountFromSysfs(
|
||||
std::string nameOfError,
|
||||
const std::string &errorCounterDir,
|
||||
uint64_t &errorVal);
|
||||
void closeFds();
|
||||
int64_t groupFd = -1;
|
||||
std::vector<int64_t> memberFds = {};
|
||||
uint64_t initialErrorCount[maxRasErrorCategoryCount] = {0};
|
||||
std::map<zes_ras_error_cat_t, uint64_t> errorCategoryToEventCount;
|
||||
uint64_t totalEventCount = 0;
|
||||
bool isSubdevice = false;
|
||||
uint32_t subdeviceId = 0;
|
||||
};
|
||||
|
||||
class LinuxRasSourceHbm : public LinuxRasSources {
|
||||
public:
|
||||
ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
|
||||
static void getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_bool_t isSubDevice, uint32_t subDeviceId);
|
||||
LinuxRasSourceHbm(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, uint32_t subdeviceId);
|
||||
LinuxRasSourceHbm() = default;
|
||||
~LinuxRasSourceHbm() override{};
|
||||
|
||||
protected:
|
||||
LinuxSysmanImp *pLinuxSysmanImp = nullptr;
|
||||
zes_ras_error_type_t osRasErrorType = {};
|
||||
FirmwareUtil *pFwInterface = nullptr;
|
||||
SysmanDeviceImp *pDevice = nullptr;
|
||||
|
||||
private:
|
||||
uint64_t errorBaseline = 0;
|
||||
uint32_t subdeviceId = 0;
|
||||
};
|
||||
|
||||
} // namespace Sysman
|
||||
} // namespace L0
|
||||
Reference in New Issue
Block a user