Revert "Add prelim support for ras diagnostics and firmware"

This reverts commit 5a2145ad8d.

Signed-off-by: Compute-Runtime-Validation <compute-runtime-validation@intel.com>
This commit is contained in:
Compute-Runtime-Validation
2022-03-06 11:04:41 +01:00
committed by Compute-Runtime-Automation
parent a010fb3634
commit 1a823356a3
13 changed files with 14 additions and 1087 deletions

View File

@@ -8,18 +8,9 @@ set(L0_SRCS_TOOLS_SYSMAN_DIAGNOSTICS_LINUX
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
${CMAKE_CURRENT_SOURCE_DIR}/os_diagnostics_imp.cpp
${CMAKE_CURRENT_SOURCE_DIR}/os_diagnostics_imp.h
${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}os_diagnostics_helper.cpp
)
if(NEO_ENABLE_i915_PRELIM_DETECTION)
list(APPEND L0_SRCS_TOOLS_SYSMAN_DIAGNOSTICS_LINUX
${CMAKE_CURRENT_SOURCE_DIR}/os_diagnostics_helper_prelim.cpp
)
else()
list(APPEND L0_SRCS_TOOLS_SYSMAN_DIAGNOSTICS_LINUX
${CMAKE_CURRENT_SOURCE_DIR}/os_diagnostics_helper.cpp
)
endif()
if(UNIX)
target_sources(${L0_STATIC_LIB_NAME}
PRIVATE

View File

@@ -1,237 +0,0 @@
/*
* Copyright (C) 2021-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/helpers/string.h"
#include "shared/source/os_interface/device_factory.h"
#include "level_zero/core/source/device/device_imp.h"
#include "level_zero/tools/source/sysman/diagnostics/linux/os_diagnostics_imp.h"
#include <linux/pci_regs.h>
namespace L0 {
// Name of the sysfs node used to invalidate LMEM memory mappings.
// All memory mappings where LMEMBAR is being referenced are invalidated;
// this also prevents new ones from being created.
// LMEM memory mappings are invalidated only when the sysfs entry quiesce_gpu is set.
// The sysfs node will be at /sys/class/drm/card<n>/invalidate_lmem_mmaps
const std::string LinuxDiagnosticsImp::invalidateLmemFile("invalidate_lmem_mmaps");
// Name of the sysfs node written before running firmware diagnostics.
// The sysfs node will be at /sys/class/drm/card<n>/quiesce_gpu
const std::string LinuxDiagnosticsImp::quiescentGpuFile("quiesce_gpu");
// Fills supportedDiagTests with the diagnostic tests reported by the firmware utility.
// Nothing is queried unless the product family is IGFX_PVC, a firmware utility
// interface is available, and fwDeviceInit() succeeds.
void OsDiagnostics::getSupportedDiagTestsFromFW(void *pOsSysman, std::vector<std::string> &supportedDiagTests) {
    auto *pLinuxSysmanImp = static_cast<LinuxSysmanImp *>(pOsSysman);
    if (pLinuxSysmanImp->getProductFamily() != IGFX_PVC) {
        return;
    }

    FirmwareUtil *pFwInterface = pLinuxSysmanImp->getFwUtilInterface();
    if (pFwInterface == nullptr) {
        return;
    }

    if (pFwInterface->fwDeviceInit() == ZE_RESULT_SUCCESS) {
        pFwInterface->fwSupportedDiagTests(supportedDiagTests);
    }
}
// Releases the sysman-level state tied to the device: engine handles, RAS handles,
// the PMT object and the local DRM handle, in that order.
void LinuxDiagnosticsImp::releaseSysmanDeviceResources() {
    auto *pSysmanDeviceImp = pLinuxSysmanImp->getSysmanDeviceImp();
    pSysmanDeviceImp->pEngineHandleContext->releaseEngines();
    pSysmanDeviceImp->pRasHandleContext->releaseRasHandles();

    pLinuxSysmanImp->releasePmtObject();
    pLinuxSysmanImp->releaseLocalDrmHandle();
}
// Tears down everything attached to the device: first the sysman resources,
// then the L0 device resources, then the memory-manager and root-device-environment
// resources for rootDeviceIndex. The root device environment slot is reset last.
void LinuxDiagnosticsImp::releaseDeviceResources() {
    releaseSysmanDeviceResources();

    static_cast<DeviceImp *>(pLinuxSysmanImp->getDeviceHandle())->releaseResources();

    executionEnvironment->memoryManager->releaseDeviceSpecificMemResources(rootDeviceIndex);

    auto &rootDeviceEnvironment = executionEnvironment->rootDeviceEnvironments[rootDeviceIndex];
    executionEnvironment->releaseRootDeviceEnvironmentResources(rootDeviceEnvironment.get());
    rootDeviceEnvironment.reset();
}
// Rebuilds the sysman-level state: sub-device handles are refreshed first,
// then PMT handles are created, and finally the RAS and engine handle
// contexts are re-initialized.
void LinuxDiagnosticsImp::reInitSysmanDeviceResources() {
    auto *pSysmanDeviceImp = pLinuxSysmanImp->getSysmanDeviceImp();

    pSysmanDeviceImp->updateSubDeviceHandlesLocally();
    pLinuxSysmanImp->createPmtHandles();
    pSysmanDeviceImp->pRasHandleContext->init(pSysmanDeviceImp->deviceHandles);
    pSysmanDeviceImp->pEngineHandleContext->init();
}
// Re-creates the NEO device for devicePciBdf / rootDeviceIndex and re-attaches it to the
// existing L0 device, then re-initializes the sysman resources.
// Returns ZE_RESULT_ERROR_DEVICE_LOST when the NEO device cannot be created; otherwise
// returns the status that Device::deviceReinit() stored in its result out-parameter
// (previously this status was discarded and ZE_RESULT_SUCCESS was always returned).
ze_result_t LinuxDiagnosticsImp::initDevice() {
    ze_result_t result = ZE_RESULT_SUCCESS;
    auto device = static_cast<DeviceImp *>(pLinuxSysmanImp->getDeviceHandle());

    auto neoDevice = NEO::DeviceFactory::createDevice(*executionEnvironment, devicePciBdf, rootDeviceIndex);
    if (neoDevice == nullptr) {
        return ZE_RESULT_ERROR_DEVICE_LOST;
    }

    auto driverHandle = static_cast<L0::DriverHandleImp *>(device->getDriverHandle());
    driverHandle->updateRootDeviceBitFields(neoDevice);
    driverHandle->enableRootDeviceDebugger(neoDevice);

    Device::deviceReinit(device->getDriverHandle(), device, neoDevice, &result);
    // Sysman resources are re-initialized regardless of the reinit status, as before.
    reInitSysmanDeviceResources();

    return result;
}
// Collects into deviceFds every file descriptor of process `pid` that refers to this device.
// deviceFds is always cleared first. A process that has exited, or a descriptor that was
// closed while scanning, is silently skipped.
static void getPidFdsForOpenDevice(ProcfsAccess *pProcfsAccess, SysfsAccess *pSysfsAccess, const ::pid_t pid, std::vector<int> &deviceFds) {
    deviceFds.clear();

    std::vector<int> openFds;
    if (pProcfsAccess->getFileDescriptors(pid, openFds) != ZE_RESULT_SUCCESS) {
        // Process exited. Not an error.
        return;
    }

    for (int fd : openFds) {
        std::string fileName;
        const bool resolved = (ZE_RESULT_SUCCESS == pProcfsAccess->getFileName(pid, fd, fileName));
        // An unresolved name means the process closed this file in the meantime. Not an error.
        if (resolved && pSysfsAccess->isMyDeviceFile(fileName)) {
            deviceFds.push_back(fd);
        }
    }
}
// A 'warm reset' is a conventional reset that is triggered across a PCI express link.
// A warm reset is triggered either when a link is forced into electrical idle or
// by sending TS1 and TS2 ordered sets with the hot reset bit set.
// Software can initiate a warm reset by setting and then clearing the secondary bus reset bit
// in the bridge control register in the PCI configuration space of the bridge port upstream of the device.
//
// Sequence performed here: release all device resources, write "1" to the device's sysfs
// "remove" node, toggle PCI_BRIDGE_CTL_BUS_RESET in the "config" file of the parent
// directory, write "1" to the parent's "rescan" node and finally re-initialize the device.
// Returns the first failing sysfs status, ZE_RESULT_ERROR_UNKNOWN when the config file
// cannot be opened or closed, otherwise the result of initDevice().
ze_result_t LinuxDiagnosticsImp::osWarmReset() {
// NOTE(review): rootPortPath is declared but never used in this function.
std::string rootPortPath;
std::string realRootPath;
ze_result_t result = pSysfsAccess->getRealPath(deviceDir, realRootPath);
if (ZE_RESULT_SUCCESS != result) {
return result;
}
auto device = static_cast<DeviceImp *>(pDevice);
executionEnvironment = device->getNEODevice()->getExecutionEnvironment();
ExecutionEnvironmentRefCountRestore restorer(executionEnvironment);
// From here on the device resources are gone; early error returns below do not re-create them.
releaseDeviceResources();
// write 1 to remove
result = pFsAccess->write(realRootPath + '/' + "remove", "1");
if (ZE_RESULT_SUCCESS != result) {
return result;
}
// Drop the last path component so realRootPath names the parent directory of the device.
size_t loc;
loc = realRootPath.find_last_of('/');
realRootPath = realRootPath.substr(0, loc);
int fd, ret = 0;
unsigned int offset = PCI_BRIDGE_CONTROL; // Bridge control offset in Header of PCI config space
unsigned int value = 0x00;
unsigned int resetValue = 0x00;
std::string configFilePath = realRootPath + '/' + "config";
fd = this->openFunction(configFilePath.c_str(), O_RDWR);
if (fd < 0) {
return ZE_RESULT_ERROR_UNKNOWN;
}
// Only one byte (0x01) is read and written at the bridge control offset.
// NOTE(review): the return values of preadFunction/pwriteFunction are not checked; if the
// read fails, value stays 0x00 and that value is what gets written back after the reset.
this->preadFunction(fd, &value, 0x01, offset);
resetValue = value | PCI_BRIDGE_CTL_BUS_RESET;
this->pwriteFunction(fd, &resetValue, 0x01, offset);
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // Sleep for 100 milliseconds just to make sure the change is propagated.
// Restore the originally read value, which clears the reset bit again.
this->pwriteFunction(fd, &value, 0x01, offset);
std::this_thread::sleep_for(std::chrono::milliseconds(500)); // Sleep for 500 milliseconds
ret = this->closeFunction(fd);
if (ret < 0) {
return ZE_RESULT_ERROR_UNKNOWN;
}
// write 1 to rescan in the parent directory
result = pFsAccess->write(realRootPath + '/' + "rescan", "1");
if (ZE_RESULT_SUCCESS != result) {
return result;
}
return initDevice();
}
// Returns the last path component of rootPortPath, i.e. everything after the final '/'.
// For a root port directory path this is the PCI address of the root port.
// When the path contains no '/', the whole string is returned.
std::string getRootPortaddress(std::string &rootPortPath) {
    // rfind() yields npos when there is no separator; npos + 1 wraps to 0.
    const auto lastSeparator = rootPortPath.rfind('/');
    return rootPortPath.substr(lastSeparator + 1);
}
// Performs a cold reset by power-cycling the PCI slot whose address matches the
// device's PCIe root port: device resources are released, the matching entry under
// /sys/bus/pci/slots/ has "0" and then "1" written to its power node, and the device is
// re-initialized.
// Returns the first failing filesystem status, the result of initDevice() after a
// successful power cycle, or ZE_RESULT_ERROR_DEVICE_LOST when no slot matches.
// Note that every error return after releaseDeviceResources() leaves the device released.
ze_result_t LinuxDiagnosticsImp::osColdReset() {
const std::string slotPath("/sys/bus/pci/slots/"); // holds the directories matching to the number of slots in the PC
std::string rootPortPath; // will hold the PCIe Root port directory path (the address of the PCIe slot).
std::string realRootPath; // will hold the absolute real path (not symlink) to the selected Device
ze_result_t result = pSysfsAccess->getRealPath(deviceDir, realRootPath); // e.g realRootPath=/sys/devices/pci0000:89/0000:89:02.0/0000:8a:00.0/0000:8b:01.0/0000:8c:00.0
if (ZE_RESULT_SUCCESS != result) {
return result;
}
auto device = static_cast<DeviceImp *>(pDevice);
executionEnvironment = device->getNEODevice()->getExecutionEnvironment();
ExecutionEnvironmentRefCountRestore restorer(executionEnvironment);
releaseDeviceResources();
rootPortPath = pLinuxSysmanImp->getPciRootPortDirectoryPath(realRootPath); // e.g rootPortPath=/sys/devices/pci0000:89/0000:89:02.0/0000:8a:00.0
std::string rootAddress = getRootPortaddress(rootPortPath); // e.g rootAddress = 0000:8a:00.0
std::vector<std::string> dir;
result = pFsAccess->listDirectory(slotPath, dir); // get list of slot directories from /sys/bus/pci/slots/
if (ZE_RESULT_SUCCESS != result) {
return result;
}
for (auto &slot : dir) {
std::string slotAddress;
result = pFsAccess->read((slotPath + slot + "/address"), slotAddress); // extract slot address from the slot directory /sys/bus/pci/slots/<slot num>/address
if (ZE_RESULT_SUCCESS != result) {
return result;
}
if (slotAddress.compare(rootAddress) == 0) { // compare slot address to root port address
result = pFsAccess->write((slotPath + slot + "/power"), "0"); // turn off power
if (ZE_RESULT_SUCCESS != result) {
return result;
}
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // Sleep for 100 milliseconds just to make sure, 1 ms is defined as part of spec
result = pFsAccess->write((slotPath + slot + "/power"), "1"); // turn on power
if (ZE_RESULT_SUCCESS != result) {
return result;
}
return initDevice();
}
}
return ZE_RESULT_ERROR_DEVICE_LOST; // in case no matching slot was found (the reset did not happen), inform upper layers.
}
// Runs the firmware diagnostic test suite selected by osDiagType and stores its outcome
// in *pResult.
// Before the tests run, every other process that holds this device open is killed, then 1
// is written to the quiesce_gpu and invalidate_lmem_mmaps sysfs nodes. Writing 1 to
// /sys/class/drm/card<n>/quiesce_gpu signals the KMD; the GPU (every gt in the card) will
// be wedged and is only unwedged after a warm/cold reset.
// Returns the first failing procfs/sysfs status, otherwise the result of the reset that
// follows the diagnostics (cold reset when the firmware asks for REBOOT_FOR_REPAIR,
// warm reset in all other cases).
ze_result_t LinuxDiagnosticsImp::osRunDiagTestsinFW(zes_diag_result_t *pResult) {
    const int enable = 1;

    const ::pid_t ownPid = pProcfsAccess->myProcessId();
    std::vector<::pid_t> runningPids;
    ze_result_t status = pProcfsAccess->listProcesses(runningPids);
    if (status != ZE_RESULT_SUCCESS) {
        return status;
    }

    for (auto &&pid : runningPids) {
        std::vector<int> deviceFds;
        getPidFdsForOpenDevice(pProcfsAccess, pSysfsAccess, pid, deviceFds);
        // L0 itself is expected to have the device open, so the own process is never killed.
        if (pid != ownPid && !deviceFds.empty()) {
            pProcfsAccess->kill(pid);
        }
    }

    status = pSysfsAccess->write(quiescentGpuFile, enable);
    if (status != ZE_RESULT_SUCCESS) {
        return status;
    }

    status = pSysfsAccess->write(invalidateLmemFile, enable);
    if (status != ZE_RESULT_SUCCESS) {
        return status;
    }

    pFwInterface->fwRunDiagTests(osDiagType, pResult);

    // At least a warm reset is needed to bring the machine out of the wedged state.
    return (*pResult == ZES_DIAG_RESULT_REBOOT_FOR_REPAIR) ? osColdReset() : osWarmReset();
}
} // namespace L0

View File

@@ -8,18 +8,9 @@ set(L0_SRCS_TOOLS_SYSMAN_FIRMWARE_LINUX
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
${CMAKE_CURRENT_SOURCE_DIR}/os_firmware_imp.cpp
${CMAKE_CURRENT_SOURCE_DIR}/os_firmware_imp.h
${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}os_firmware_imp_helper.cpp
)
if(NEO_ENABLE_i915_PRELIM_DETECTION)
list(APPEND L0_SRCS_TOOLS_SYSMAN_FIRMWARE_LINUX
${CMAKE_CURRENT_SOURCE_DIR}/os_firmware_imp_helper_prelim.cpp
)
else()
list(APPEND L0_SRCS_TOOLS_SYSMAN_FIRMWARE_LINUX
${CMAKE_CURRENT_SOURCE_DIR}/os_firmware_imp_helper.cpp
)
endif()
if(UNIX)
target_sources(${L0_STATIC_LIB_NAME}
PRIVATE

View File

@@ -1,56 +0,0 @@
/*
* Copyright (C) 2021-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "level_zero/tools/source/sysman/firmware/linux/os_firmware_imp.h"
// Sysfs sub-directory (relative to /sys/class/drm/cardX/) that is scanned for iaf.<X> entries.
const std::string iafPath = "device/";
// Prefix of the directory entries that belong to the iaf driver (iaf.<X>).
const std::string iafDirectory = "iaf.";
// Node below device/iaf.<X> that holds the PSC version string.
const std::string pscbin_version = "/pscbin_version";
namespace L0 {
// Writes the version string of firmware `fwType` into pProperties->version.
//
// For fwType "PSC" the version is read from sysfs at device/iaf.<X>/pscbin_version, where
// <X> is the hardware slot number; if several iaf.<X> entries exist, the last one listed
// is used. Every other firmware type is queried through the firmware utility interface.
//
// Returns ZE_RESULT_ERROR_NOT_AVAILABLE when no iaf.<X> directory exists for "PSC",
// otherwise the status of the underlying sysfs / firmware-utility call.
// pProperties->version is only written on success and is always NUL-terminated:
// at most ZES_STRING_PROPERTY_SIZE - 1 characters are copied, so a longer version string
// is truncated instead of filling the buffer without a terminator.
ze_result_t LinuxFirmwareImp::getFirmwareVersion(std::string fwType, zes_firmware_properties_t *pProperties) {
    if (fwType == "PSC") {
        std::vector<std::string> list;
        // scans the directories present in /sys/class/drm/cardX/device/
        ze_result_t result = pSysfsAccess->scanDirEntries(iafPath, list);
        if (ZE_RESULT_SUCCESS != result) {
            // There should be a device directory
            return result;
        }

        std::string path;
        for (const auto &entry : list) {
            // Prefix match against "iaf." without building a temporary substring.
            if (entry.compare(0, iafDirectory.length(), iafDirectory) == 0) {
                // device/iaf.X/pscbin_version, where X is the hardware slot number
                path = iafPath + entry + pscbin_version;
            }
        }
        if (path.empty()) {
            // This device does not have a PSC Version
            return ZE_RESULT_ERROR_NOT_AVAILABLE;
        }

        std::string pscVersion;
        result = pSysfsAccess->read(path, pscVersion);
        if (ZE_RESULT_SUCCESS != result) {
            // not able to read PSC version from iaf.x
            return result;
        }
        strncpy_s(static_cast<char *>(pProperties->version), ZES_STRING_PROPERTY_SIZE, pscVersion.c_str(), ZES_STRING_PROPERTY_SIZE - 1);
        return result;
    }

    std::string fwVersion;
    ze_result_t result = pFwInterface->getFwVersion(fwType, fwVersion);
    if (result == ZE_RESULT_SUCCESS) {
        strncpy_s(static_cast<char *>(pProperties->version), ZES_STRING_PROPERTY_SIZE, fwVersion.c_str(), ZES_STRING_PROPERTY_SIZE - 1);
    }
    return result;
}
} // namespace L0

19
level_zero/tools/source/sysman/ras/linux/CMakeLists.txt Normal file → Executable file
View File

@@ -5,24 +5,10 @@
#
set(L0_SRCS_TOOLS_SYSMAN_RAS_LINUX
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}os_ras_imp.cpp
${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}os_ras_imp.h
)
if(NEO_ENABLE_i915_PRELIM_DETECTION)
list(APPEND L0_SRCS_TOOLS_SYSMAN_RAS_LINUX
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_prelim.cpp
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_prelim.h
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_gt.cpp
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_fabric.cpp
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_hbm.cpp
)
else()
list(APPEND L0_SRCS_TOOLS_SYSMAN_RAS_LINUX
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp.cpp
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp.h
)
endif()
if(UNIX)
target_sources(${L0_STATIC_LIB_NAME}
PRIVATE
@@ -32,4 +18,5 @@ endif()
# Make our source files visible to parent
set_property(GLOBAL PROPERTY L0_SRCS_TOOLS_SYSMAN_RAS_LINUX ${L0_SRCS_TOOLS_SYSMAN_RAS_LINUX})
add_subdirectories()

View File

@@ -1,105 +0,0 @@
/*
* Copyright (C) 2021-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h"
#include "sysman/linux/fs_access.h"
#include "sysman/linux/os_sysman_imp.h"
#include <regex>
namespace L0 {
// Collects into `nodes` the readable fabric error counter files for sub-device `subdeviceId`.
// Board strapped numbers 0..31 are probed under /sys/module/iaf/drivers/platform:iaf/iaf.<N>;
// boards whose directory does not exist are skipped. For each existing board the candidate
// files are:
//   correctable:  sd.<id>/fw_comm_errors and sd.<id>/port.<1..8>/link_degrades
//   otherwise:    sd.<id>/sd_failure, sd.<id>/fw_error and sd.<id>/port.<1..8>/link_failures
// Only candidates for which fsAccess->canRead() succeeds are kept. `nodes` is cleared first.
void LinuxRasSourceFabric::getNodes(std::vector<std::string> &nodes, uint32_t subdeviceId, FsAccess *fsAccess, const zes_ras_error_type_t &type) {
    const uint32_t firstBoard = 0;
    const uint32_t lastBoard = 31;
    const uint32_t firstPort = 1;
    const uint32_t lastPort = 8;
    const std::string iafDriverDir = "/sys/module/iaf/drivers/platform:iaf/iaf.";

    const bool correctable = (type == ZES_RAS_ERROR_TYPE_CORRECTABLE);
    const char *perPortCounter = correctable ? "/link_degrades" : "/link_failures";

    nodes.clear();
    for (uint32_t board = firstBoard; board <= lastBoard; ++board) {
        const std::string boardDir = iafDriverDir + std::to_string(board);
        if (!fsAccess->directoryExists(boardDir)) {
            continue;
        }

        const std::string subDeviceDir = boardDir + "/sd." + std::to_string(subdeviceId);

        std::vector<std::string> candidates;
        if (correctable) {
            candidates.push_back(subDeviceDir + "/fw_comm_errors");
        } else {
            candidates.push_back(subDeviceDir + "/sd_failure");
            candidates.push_back(subDeviceDir + "/fw_error");
        }
        for (uint32_t port = firstPort; port <= lastPort; ++port) {
            candidates.push_back(subDeviceDir + "/port." + std::to_string(port) + perPortCounter);
        }

        for (const auto &candidate : candidates) {
            if (fsAccess->canRead(candidate) == ZE_RESULT_SUCCESS) {
                nodes.push_back(candidate);
            }
        }
    }
}
// Adds to errorType every RAS error type (uncorrectable, correctable) for which at least
// one readable fabric error node exists on the given device. For a sub-device handle its
// sub-device index is used; otherwise index 0. Always returns ZE_RESULT_SUCCESS.
ze_result_t LinuxRasSourceFabric::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType,
                                                            OsSysman *pOsSysman, ze_device_handle_t deviceHandle) {
    auto *pLinuxSysmanImp = static_cast<LinuxSysmanImp *>(pOsSysman);
    NEO::Device *neoDevice = static_cast<Device *>(deviceHandle)->getNEODevice();

    uint32_t subDeviceIndex = 0;
    if (neoDevice->isSubDevice()) {
        subDeviceIndex = static_cast<NEO::SubDevice *>(neoDevice)->getSubDeviceIndex();
    }

    std::vector<std::string> nodes;
    for (const auto type : {ZES_RAS_ERROR_TYPE_UNCORRECTABLE, ZES_RAS_ERROR_TYPE_CORRECTABLE}) {
        getNodes(nodes, subDeviceIndex, &pLinuxSysmanImp->getFsAccess(), type);
        if (!nodes.empty()) {
            errorType.insert(type);
        }
    }
    return ZE_RESULT_SUCCESS;
}
// Binds this RAS source to the filesystem accessor of the given sysman instance and
// caches the readable fabric error nodes for (subDeviceId, type) in errorNodes.
LinuxRasSourceFabric::LinuxRasSourceFabric(OsSysman *pOsSysman, zes_ras_error_type_t type, uint32_t subDeviceId) {
    auto *pLinuxSysmanImp = static_cast<LinuxSysmanImp *>(pOsSysman);
    fsAccess = &pLinuxSysmanImp->getFsAccess();
    getNodes(errorNodes, subDeviceId, fsAccess, type);
}
// Returns the sum of the counters currently stored in all cached error nodes.
// The status of each read is not inspected; every per-node value starts at 0 before the read.
uint64_t LinuxRasSourceFabric::getComputeErrorCount() {
    uint64_t total = 0;
    for (auto node = errorNodes.begin(); node != errorNodes.end(); ++node) {
        uint64_t nodeValue = 0;
        fsAccess->read(*node, nodeValue);
        total += nodeValue;
    }
    return total;
}
// Reports the fabric error count in state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS],
// relative to the baseline stored in baseComputeErrorCount; all categories are zeroed
// first. When `clear` is set, the baseline is moved to the current count and the
// counters are then read once more before the difference is reported.
// Returns ZE_RESULT_ERROR_UNSUPPORTED_FEATURE when no error nodes are available.
ze_result_t LinuxRasSourceFabric::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) {
    if (errorNodes.empty()) {
        return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    }

    std::memset(state.category, 0, sizeof(zes_ras_state_t::category));

    uint64_t observedCount = getComputeErrorCount();
    if (clear) {
        baseComputeErrorCount = observedCount;
        observedCount = getComputeErrorCount();
    }

    state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS] = observedCount - baseComputeErrorCount;
    return ZE_RESULT_SUCCESS;
}
} // namespace L0

View File

@@ -1,345 +0,0 @@
/*
* Copyright (C) 2021-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h"
#include "sysman/linux/os_sysman_imp.h"
#include <regex>
namespace L0 {
// Maps each RAS error category to the names of the uncorrectable error events that are
// accumulated into it. The names are used (with an "error--" / "error-gt<N>--" prefix) to
// look up PMU config files, and (with '-' replaced by '_') to read sysfs error counters.
static const std::map<zes_ras_error_cat_t, std::vector<std::string>> categoryToListOfEventsUncorrectable = {
{ZES_RAS_ERROR_CAT_CACHE_ERRORS,
{"fatal-array-bist", "fatal-eu-grf", "fatal-eu-ic",
"fatal-guc", "fatal-idi-parity", "fatal-l3-double",
"fatal-l3-ecc-checker", "fatal-sampler", "fatal-slm",
"fatal-sqidi", "fatal-tlb"}},
{ZES_RAS_ERROR_CAT_RESET,
{"engine-reset"}},
{ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS,
{"eu-attention"}},
{ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS,
{"soc-fatal-fabric-ss0-0", "soc-fatal-fabric-ss0-1", "soc-fatal-fabric-ss0-2",
"soc-fatal-fabric-ss0-3", "soc-fatal-fabric-ss1-0", "soc-fatal-fabric-ss1-1",
"soc-fatal-fabric-ss1-2", "soc-fatal-fabric-ss1-3", "soc-fatal-fabric-ss1-4",
"soc-fatal-hbm-ss0-0", "soc-fatal-hbm-ss0-1", "soc-fatal-hbm-ss0-2",
"soc-fatal-hbm-ss0-3", "soc-fatal-hbm-ss0-4", "soc-fatal-hbm-ss0-5",
"soc-fatal-hbm-ss0-6", "soc-fatal-hbm-ss0-7", "soc-fatal-hbm-ss0-8",
"soc-fatal-hbm-ss0-9", "soc-fatal-hbm-ss0-10", "soc-fatal-hbm-ss0-11",
"soc-fatal-hbm-ss0-12", "soc-fatal-hbm-ss0-13", "soc-fatal-hbm-ss0-14",
"soc-fatal-hbm-ss0-15", "soc-fatal-hbm-ss1-0", "soc-fatal-hbm-ss1-1",
"soc-fatal-hbm-ss1-2", "soc-fatal-hbm-ss1-3", "soc-fatal-hbm-ss1-4",
"soc-fatal-hbm-ss1-5", "soc-fatal-hbm-ss1-6", "soc-fatal-hbm-ss1-7",
"soc-fatal-hbm-ss1-8", "soc-fatal-hbm-ss1-9", "soc-fatal-hbm-ss1-10",
"soc-fatal-hbm-ss1-11", "soc-fatal-hbm-ss1-12", "soc-fatal-hbm-ss1-13",
"soc-fatal-hbm-ss1-14", "soc-fatal-hbm-ss1-15", "soc-fatal-mdfi-east",
"soc-fatal-mdfi-south", "soc-fatal-mdfi-west", "soc-fatal-psf-csc-0",
"soc-fatal-psf-csc-1", "soc-fatal-psf-csc-2", "soc-fatal-punit",
"sgunit-fatal", "soc-nonfatal-fabric-ss0-0", "soc-nonfatal-fabric-ss0-1",
"soc-nonfatal-fabric-ss0-2", "soc-nonfatal-fabric-ss0-3", "soc-nonfatal-fabric-ss1-0",
"soc-nonfatal-fabric-ss1-1", "soc-nonfatal-fabric-ss1-2", "soc-nonfatal-fabric-ss1-3",
"soc-nonfatal-fabric-ss1-4", "soc-nonfatal-hbm-ss0-0", "soc-nonfatal-hbm-ss0-1",
"soc-nonfatal-hbm-ss0-2", "soc-nonfatal-hbm-ss0-3", "soc-nonfatal-hbm-ss0-4",
"soc-nonfatal-hbm-ss0-5", "soc-nonfatal-hbm-ss0-6", "soc-nonfatal-hbm-ss0-7",
"soc-nonfatal-hbm-ss0-8", "soc-nonfatal-hbm-ss0-9", "soc-nonfatal-hbm-ss0-10",
"soc-nonfatal-hbm-ss0-11", "soc-nonfatal-hbm-ss0-12", "soc-nonfatal-hbm-ss0-13",
"soc-nonfatal-hbm-ss0-14", "soc-nonfatal-hbm-ss0-15", "soc-nonfatal-hbm-ss1-0",
"soc-nonfatal-hbm-ss1-1", "soc-nonfatal-hbm-ss1-2", "soc-nonfatal-hbm-ss1-3",
"soc-nonfatal-hbm-ss1-4", "soc-nonfatal-hbm-ss1-5", "soc-nonfatal-hbm-ss1-6",
"soc-nonfatal-hbm-ss1-7", "soc-nonfatal-hbm-ss1-8", "soc-nonfatal-hbm-ss1-9",
"soc-nonfatal-hbm-ss1-10", "soc-nonfatal-hbm-ss1-11", "soc-nonfatal-hbm-ss1-12",
"soc-nonfatal-hbm-ss1-13", "soc-nonfatal-hbm-ss1-14", "soc-nonfatal-hbm-ss1-15",
"soc-nonfatal-mdfi-east", "soc-nonfatal-mdfi-south", "soc-nonfatal-mdfi-west",
"soc-nonfatal-psf-csc-0", "soc-nonfatal-psf-csc-1", "soc-nonfatal-psf-csc-2",
"soc-nonfatal-punit", "sgunit-nonfatal"}},
{ZES_RAS_ERROR_CAT_COMPUTE_ERRORS,
{"fatal-fpu", "fatal-l3-fabric"}},
{ZES_RAS_ERROR_CAT_DRIVER_ERRORS,
{"driver-object-migration", "driver-engine-other", "driver-ggtt",
"driver-gt-interrupt", "driver-gt-other", "driver-guc-communication",
"driver-rps"}}};
// Maps each RAS error category to the names of the correctable error events that are
// accumulated into it. Only cache and non-compute categories have correctable events.
// NOTE(review): unlike the uncorrectable table, this list has no "soc-correctable-psf-csc-2"
// entry (only csc-0 and csc-1) — confirm this is intentional.
static const std::map<zes_ras_error_cat_t, std::vector<std::string>> categoryToListOfEventsCorrectable = {
{ZES_RAS_ERROR_CAT_CACHE_ERRORS,
{"correctable-eu-grf", "correctable-eu-ic", "correctable-guc",
"correctable-l3-sng", "correctable-sampler", "correctable-slm"}},
{ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS,
{"soc-correctable-fabric-ss0-0", "soc-correctable-fabric-ss0-1", "soc-correctable-fabric-ss0-2",
"soc-correctable-fabric-ss0-3", "soc-correctable-fabric-ss1-0", "soc-correctable-fabric-ss1-1",
"soc-correctable-fabric-ss1-2", "soc-correctable-fabric-ss1-3", "soc-correctable-fabric-ss1-4",
"soc-correctable-hbm-ss0-0", "soc-correctable-hbm-ss0-1", "soc-correctable-hbm-ss0-2",
"soc-correctable-hbm-ss0-3", "soc-correctable-hbm-ss0-4", "soc-correctable-hbm-ss0-5",
"soc-correctable-hbm-ss0-6", "soc-correctable-hbm-ss0-7", "soc-correctable-hbm-ss0-8",
"soc-correctable-hbm-ss0-9", "soc-correctable-hbm-ss0-10", "soc-correctable-hbm-ss0-11",
"soc-correctable-hbm-ss0-12", "soc-correctable-hbm-ss0-13", "soc-correctable-hbm-ss0-14",
"soc-correctable-hbm-ss0-15", "soc-correctable-hbm-ss1-0", "soc-correctable-hbm-ss1-1",
"soc-correctable-hbm-ss1-2", "soc-correctable-hbm-ss1-3", "soc-correctable-hbm-ss1-4",
"soc-correctable-hbm-ss1-5", "soc-correctable-hbm-ss1-6", "soc-correctable-hbm-ss1-7",
"soc-correctable-hbm-ss1-8", "soc-correctable-hbm-ss1-9", "soc-correctable-hbm-ss1-10",
"soc-correctable-hbm-ss1-11", "soc-correctable-hbm-ss1-12", "soc-correctable-hbm-ss1-13",
"soc-correctable-hbm-ss1-14", "soc-correctable-hbm-ss1-15", "soc-correctable-mdfi-east",
"soc-correctable-mdfi-south", "soc-correctable-mdfi-west", "soc-correctable-psf-csc-0",
"soc-correctable-psf-csc-1", "soc-correctable-punit", "sgunit-correctable"}}};
// Closes the descriptor held in `fd` and marks it as closed by storing -1.
// A value of -1 means "not open" and is left untouched.
static void closeFd(int64_t &fd) {
    if (fd == -1) {
        return;
    }
    close(static_cast<int>(fd));
    fd = -1;
}
// Lists the PMU events exposed for this device.
// All events are enumerated in sysfs at /sys/devices/i915_<bdf>/events/, where <bdf> is the
// last component of the "device" symlink target with ':' replaced by '_'
// (e.g. /sys/devices/i915_0000_01_00.0/events/ for a device in PCI slot 0000:01:00.0).
// listOfEvents receives the directory entries; when eventDirectory is not null it receives
// the events directory path (it is set even if listing the directory then fails).
// Returns ZE_RESULT_ERROR_UNSUPPORTED_FEATURE when the symlink cannot be read or the
// directory cannot be listed, otherwise ZE_RESULT_SUCCESS.
static ze_result_t readI915EventsDirectory(LinuxSysmanImp *pLinuxSysmanImp, std::vector<std::string> &listOfEvents, std::string *eventDirectory) {
    const std::string deviceDir("device");
    const std::string sysDevicesDir("/sys/devices/");

    SysfsAccess *pSysfsAccess = &pLinuxSysmanImp->getSysfsAccess();
    std::string bdfDir;
    if (pSysfsAccess->readSymLink(deviceDir, bdfDir) != ZE_RESULT_SUCCESS) {
        return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    }

    // Keep only the last path component (the BDF) and turn "0000:01:00.0" into "0000_01_00.0".
    std::string bdf = bdfDir.substr(bdfDir.find_last_of('/') + 1);
    std::replace(bdf.begin(), bdf.end(), ':', '_');

    const std::string eventsDir = sysDevicesDir + ("i915_" + bdf) + "/" + "events";
    if (eventDirectory != nullptr) {
        *eventDirectory = eventsDir;
    }

    FsAccess *pFsAccess = &pLinuxSysmanImp->getFsAccess();
    if (pFsAccess->listDirectory(eventsDir, listOfEvents) != ZE_RESULT_SUCCESS) {
        return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    }
    return ZE_RESULT_SUCCESS;
}
// Parses the hexadecimal number that follows the first '=' in strVal
// (e.g. "config=0x10" -> 16). When there is no '=', the whole string is parsed.
// Yields 0 when no hexadecimal number can be extracted.
static uint64_t convertHexToUint64(std::string strVal) {
    // find() returns npos when '=' is absent; npos + 1 wraps to 0, i.e. the string start.
    const auto valueStart = strVal.find('=') + 1;
    std::istringstream parser(strVal.substr(valueStart));
    uint64_t config = 0;
    parser >> std::hex >> config;
    return config;
}
// Returns true when at least one entry of eventList matches the regular expression
// `pattern` in full (std::regex_match semantics, not a substring search).
static bool isErrorTypeSupported(std::string pattern, std::vector<std::string> &eventList) {
    const std::regex compiledPattern(pattern);
    return std::any_of(eventList.begin(), eventList.end(),
                       [&compiledPattern](const std::string &eventName) {
                           return std::regex_match(eventName, compiledPattern);
                       });
}
// Returns true when any of the regular expressions in errorPattern fully matches
// at least one entry of eventList.
static bool getErrorType(std::vector<std::string> errorPattern, std::vector<std::string> &eventList) {
    return std::any_of(errorPattern.begin(), errorPattern.end(),
                       [&eventList](const std::string &pattern) {
                           return isErrorTypeSupported(pattern, eventList);
                       });
}
void LinuxRasSourceGt::closeFds() {
for (auto &memberFd : memberFds) {
closeFd(memberFd);
}
memberFds.clear();
closeFd(groupFd);
}
// Closes all PMU file descriptors (group leader and members) still held by this object.
LinuxRasSourceGt::~LinuxRasSourceGt() {
closeFds();
}
void LinuxRasSourceGt::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle) {
LinuxSysmanImp *pLinuxSysmanImp = static_cast<LinuxSysmanImp *>(pOsSysman);
std::vector<std::string> listOfEvents = {};
ze_result_t result = readI915EventsDirectory(pLinuxSysmanImp, listOfEvents, nullptr);
if (result != ZE_RESULT_SUCCESS) {
return;
}
ze_device_properties_t deviceProperties = {ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES};
Device::fromHandle(deviceHandle)->getProperties(&deviceProperties);
bool onSubDevice = deviceProperties.flags & ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE;
uint32_t subDeviceId = deviceProperties.subdeviceId;
std::vector<std::string> uncorrectablePattern;
std::vector<std::string> correctablePattern;
// For device with no subDevice error entries are of form error--<Name of error type>
// and for device having subDevice error entries are of form error-gt<N>--<Name of error type>
uncorrectablePattern.push_back("^error--driver.*");
if (onSubDevice == false) {
correctablePattern.push_back("^error--correctable.*");
correctablePattern.push_back("^error--soc-correctable.*");
uncorrectablePattern.push_back("^error--engine-reset.*");
uncorrectablePattern.push_back("^error--eu-attention.*");
uncorrectablePattern.push_back("^error--fatal.*");
uncorrectablePattern.push_back("^error--soc-fatal.*");
uncorrectablePattern.push_back("^error--soc-nonfatal.*");
} else {
correctablePattern.push_back("^error-gt" + std::to_string(subDeviceId) + "--correctable.*");
correctablePattern.push_back("^error-gt" + std::to_string(subDeviceId) + "--soc-correctable.*");
uncorrectablePattern.push_back("^error-gt" + std::to_string(subDeviceId) + "--driver.*");
uncorrectablePattern.push_back("^error-gt" + std::to_string(subDeviceId) + "--fatal.*");
uncorrectablePattern.push_back("^error-gt" + std::to_string(subDeviceId) + "--soc-fatal.*");
uncorrectablePattern.push_back("^error-gt" + std::to_string(subDeviceId) + "--soc-nonfatal.*");
uncorrectablePattern.push_back("^error-gt" + std::to_string(subDeviceId) + "--eu-attention.*");
uncorrectablePattern.push_back("^error-gt" + std::to_string(subDeviceId) + "--engine-reset.*");
}
if (getErrorType(correctablePattern, listOfEvents) == true) {
errorType.insert(ZES_RAS_ERROR_TYPE_CORRECTABLE);
}
if (getErrorType(uncorrectablePattern, listOfEvents) == true) {
errorType.insert(ZES_RAS_ERROR_TYPE_UNCORRECTABLE);
}
}
// Fills state.category with the per-category error counts for this RAS source.
//
// When `clear` is set, all PMU descriptors are closed and the cached totals, the caller's
// state.category and initialErrorCount are zeroed; initRasErrors() then re-opens the PMU
// events. Otherwise initRasErrors() only opens them on first use.
// The PMU group is read in one call; for every category the counters of its events are
// summed and added to the initial count recorded for that category.
//
// Returns ZE_RESULT_ERROR_UNSUPPORTED_FEATURE when no PMU group descriptor is available or
// the PMU read fails, otherwise ZE_RESULT_SUCCESS.
ze_result_t LinuxRasSourceGt::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) {
    if (clear == true) {
        closeFds();
        totalEventCount = 0;
        memset(state.category, 0, sizeof(state.category));
        memset(initialErrorCount, 0, sizeof(initialErrorCount));
    }
    initRasErrors(clear);

    // Iterate over all the file descriptor values present in vector which is mapped to given ras error category
    // Use the file descriptors to read pmu counters and add all the errors corresponding to the ras error category
    if (groupFd < 0) {
        return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    }

    // In data[], event counts start at index 2: the first value gives the number of events
    // and the second value is for the timestamp.
    std::vector<std::uint64_t> data(2 + totalEventCount, 0);
    if (pPmuInterface->pmuRead(static_cast<int>(groupFd), data.data(), sizeof(uint64_t) * data.size()) < 0) {
        return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    }

    // The data buffer retrieved after reading pmu counters is parsed to get the error count
    // for each suberror category.
    uint64_t initialIndex = 2; // Initial index in the buffer from which the data to be parsed begins
    for (auto errorCat = errorCategoryToEventCount.begin(); errorCat != errorCategoryToEventCount.end(); errorCat++) {
        uint64_t errorCount = 0;
        uint64_t j = 0;
        for (; j < errorCat->second; j++) {
            errorCount += data[initialIndex + j];
        }
        state.category[errorCat->first] = errorCount + initialErrorCount[errorCat->first];
        initialIndex += j;
    }
    return ZE_RESULT_SUCCESS;
}
// Reads the PMU config string of event file `errorFileToGetConfig` from `eventDirectory`
// into pmuConfig. The file is only read when its name appears in listOfEvents;
// otherwise ZE_RESULT_ERROR_UNKNOWN is returned. On a listed file the status of the
// read is returned.
ze_result_t LinuxRasSourceGt::getPmuConfig(
    const std::string &eventDirectory,
    const std::vector<std::string> &listOfEvents,
    const std::string &errorFileToGetConfig,
    std::string &pmuConfig) {
    const bool eventListed = std::find(listOfEvents.begin(), listOfEvents.end(), errorFileToGetConfig) != listOfEvents.end();
    if (!eventListed) {
        return ZE_RESULT_ERROR_UNKNOWN;
    }
    return pFsAccess->read(eventDirectory + "/" + errorFileToGetConfig, pmuConfig);
}
// Reads into errorVal the sysfs error counter that corresponds to PMU event `nameOfError`.
// The sysfs node name is the event name with every '-' replaced by '_', looked up inside
// errorCounterDir. Returns the status of the sysfs read.
ze_result_t LinuxRasSourceGt::getBootUpErrorCountFromSysfs(
    std::string nameOfError,
    const std::string &errorCounterDir,
    uint64_t &errorVal) {
    // replace - with _ to convert name of pmu config node to name of sysfs node
    std::replace(nameOfError.begin(), nameOfError.end(), '-', '_');
    return pSysfsAccess->read(errorCounterDir + "/" + nameOfError, errorVal);
}
// Opens one PMU event per supported RAS error of this source's error type and records, per
// category, how many events were opened and the initial error count read from sysfs.
// Does nothing when a PMU group is already open (groupFd >= 0) or when the i915 events
// directory cannot be read. When `clear` is set, the sysfs boot-up counts are not read and
// the initial counts stay 0.
void LinuxRasSourceGt::initRasErrors(ze_bool_t clear) {
// if already initialized
if (groupFd >= 0) {
return;
}
std::string eventDirectory;
std::vector<std::string> listOfEvents = {};
ze_result_t result = readI915EventsDirectory(pLinuxSysmanImp, listOfEvents, &eventDirectory);
if (result != ZE_RESULT_SUCCESS) {
return;
}
// Select the event table that matches this source's error type (this copies the table).
std::map<zes_ras_error_cat_t, std::vector<std::string>> categoryToListOfEvents;
if (osRasErrorType == ZES_RAS_ERROR_TYPE_CORRECTABLE) {
categoryToListOfEvents = categoryToListOfEventsCorrectable;
}
if (osRasErrorType == ZES_RAS_ERROR_TYPE_UNCORRECTABLE) {
categoryToListOfEvents = categoryToListOfEventsUncorrectable;
}
std::string errorPrefix = "error--"; // prefix string of the file containing config value for pmu counters
std::string errorCounterDir = "gt/gt0/error_counter"; // Directory containing the sysfs nodes which in turn contains initial value of error count
if (isSubdevice == true) {
errorPrefix = "error-gt" + std::to_string(subdeviceId) + "--";
errorCounterDir = "gt/gt" + std::to_string(subdeviceId) + "/error_counter";
}
// Following loop retrieves initial count of errors from sysfs and pmu config values for each ras error
// PMU: error--<Name of error> Ex:- error--engine-reset (config with no subdevice)
// PMU: error-gt<N>--<Name of error> Ex:- error-gt0--engine-reset (config with subdevices)
// PMU: error--<Name of error> Ex:- error--driver-object-migration (config for device level errors)
// Sysfs: card0/gt/gt0/error_counter/<Name of error> Ex:- gt/gt0/error_counter/engine_reset (sysfs with no subdevice)
// Sysfs: card0/gt/gt<N>/error_counter/<Name of error> Ex:- gt/gt1/error_counter/engine_reset (sysfs with subdevices)
// Sysfs: error_counter/<Name of error> Ex:- error_counter/driver_object_migration (sysfs for error which occur at device level)
for (auto const &rasErrorCatToListOfEvents : categoryToListOfEvents) {
uint64_t eventCount = 0;
uint64_t errorCount = 0;
for (auto const &nameOfError : rasErrorCatToListOfEvents.second) {
std::string errorPrefixLocal = errorPrefix;
std::string errorCounterDirLocal = errorCounterDir;
if (nameOfError == "driver-object-migration") { // check for errors which occur at device level
errorCounterDirLocal = "error_counter";
errorPrefixLocal = "error--";
}
uint64_t initialErrorVal = 0;
if (clear == false) {
result = getBootUpErrorCountFromSysfs(nameOfError, errorCounterDirLocal, initialErrorVal);
if (result != ZE_RESULT_SUCCESS) {
// Events without a readable sysfs counter are skipped entirely.
continue;
}
}
std::string pmuConfig;
result = getPmuConfig(eventDirectory, listOfEvents, errorPrefixLocal + nameOfError, pmuConfig);
if (result != ZE_RESULT_SUCCESS) {
// Events without a PMU config file are skipped entirely.
continue;
}
uint64_t config = convertHexToUint64(pmuConfig);
if (groupFd == -1) {
groupFd = pPmuInterface->pmuInterfaceOpen(config, -1, PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_GROUP); // To get file descriptor of the group leader
if (groupFd < 0) {
return;
}
} else {
// The rest of the group members are created with subsequent calls with groupFd being set to the file descriptor of the group leader
// NOTE(review): the descriptor returned here is stored and counted as an event without checking it for failure.
memberFds.push_back(pPmuInterface->pmuInterfaceOpen(config, static_cast<int>(groupFd), PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_GROUP));
}
eventCount++;
errorCount += initialErrorVal;
}
// Per-category bookkeeping used later when the PMU read buffer is parsed.
initialErrorCount[rasErrorCatToListOfEvents.first] = errorCount;
errorCategoryToEventCount[rasErrorCatToListOfEvents.first] = eventCount;
totalEventCount += eventCount;
}
}
// Stores the identity of this source (error type, subdevice) and caches the PMU, filesystem
// and sysfs accessors obtained from the Linux sysman implementation.
LinuxRasSourceGt::LinuxRasSourceGt(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId)
    : pLinuxSysmanImp(pLinuxSysmanImp),
      osRasErrorType(type),
      pPmuInterface(pLinuxSysmanImp->getPmuInterface()),
      pFsAccess(&pLinuxSysmanImp->getFsAccess()),
      pSysfsAccess(&pLinuxSysmanImp->getSysfsAccess()),
      isSubdevice(onSubdevice),
      subdeviceId(subdeviceId) {
}
} // namespace L0

View File

@@ -1,51 +0,0 @@
/*
* Copyright (C) 2021-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h"
#include "sysman/linux/os_sysman_imp.h"
namespace L0 {
// Adds both the correctable and the uncorrectable error type to errorType when the sysman
// instance exposes a firmware utility interface; leaves errorType untouched otherwise.
// deviceHandle is not used by this source.
void LinuxRasSourceHbm::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle) {
    auto *pSysmanImp = static_cast<LinuxSysmanImp *>(pOsSysman);
    if (pSysmanImp->getFwUtilInterface() == nullptr) {
        return;
    }
    errorType.insert(ZES_RAS_ERROR_TYPE_CORRECTABLE);
    errorType.insert(ZES_RAS_ERROR_TYPE_UNCORRECTABLE);
}
// Reports the memory error count obtained from the firmware interface, relative to the
// stored baseline, in the ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS slot of state.
//
// state  receives the count; only the non-compute category is written
// clear  when true, the baseline is first moved to the current firmware count
//
// Returns ZE_RESULT_ERROR_UNSUPPORTED_FEATURE without a firmware interface, the firmware
// query's error if one of the queries fails, ZE_RESULT_SUCCESS otherwise.
ze_result_t LinuxRasSourceHbm::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) {
    if (nullptr == pFwInterface) {
        return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    }

    uint32_t numSubDevices = 0;
    pDevice->getSubDevices(&numSubDevices, nullptr);

    if (clear == true) {
        uint64_t countAtClear = 0;
        const ze_result_t clearStatus = pFwInterface->fwGetMemoryErrorCount(osRasErrorType, numSubDevices, subdeviceId, countAtClear);
        if (clearStatus != ZE_RESULT_SUCCESS) {
            return clearStatus;
        }
        // during clear update the error baseline value
        errorBaseline = countAtClear;
    }

    uint64_t currentCount = 0;
    const ze_result_t status = pFwInterface->fwGetMemoryErrorCount(osRasErrorType, numSubDevices, subdeviceId, currentCount);
    if (status != ZE_RESULT_SUCCESS) {
        return status;
    }

    state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS] = currentCount - errorBaseline;
    return ZE_RESULT_SUCCESS;
}
// Stores the error type and subdevice id and caches the firmware utility interface and the
// device handle obtained from the Linux sysman implementation.
LinuxRasSourceHbm::LinuxRasSourceHbm(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, uint32_t subdeviceId)
    : pLinuxSysmanImp(pLinuxSysmanImp),
      osRasErrorType(type),
      pFwInterface(pLinuxSysmanImp->getFwUtilInterface()),
      pDevice(pLinuxSysmanImp->getDeviceHandle()),
      subdeviceId(subdeviceId) {
}
} // namespace L0

View File

@@ -1,88 +0,0 @@
/*
* Copyright (C) 2020-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h"
#include "sysman/linux/os_sysman_imp.h"
namespace L0 {
// Collects the supported RAS error types by asking the GT, fabric and HBM sources in that
// order, and stops asking as soon as errorType already holds maxErrorTypes entries.
void OsRas::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle) {
    constexpr auto maxErrorTypes = 2;

    LinuxRasSourceGt::getSupportedRasErrorTypes(errorType, pOsSysman, deviceHandle);
    if (errorType.size() >= maxErrorTypes) {
        return;
    }

    LinuxRasSourceFabric::getSupportedRasErrorTypes(errorType, pOsSysman, deviceHandle);
    if (errorType.size() >= maxErrorTypes) {
        return;
    }

    LinuxRasSourceHbm::getSupportedRasErrorTypes(errorType, pOsSysman, deviceHandle);
}
// Copies the stored total threshold and per-category thresholds into config.
// Always returns ZE_RESULT_SUCCESS.
ze_result_t LinuxRasImp::osRasGetConfig(zes_ras_config_t *config) {
    auto &detailed = config->detailedThresholds;
    memcpy(detailed.category, categoryThreshold, sizeof(detailed.category));
    config->totalThreshold = totalThreshold;
    return ZE_RESULT_SUCCESS;
}
// Stores the total threshold and per-category thresholds from config.
// Only the root user may change them; any other caller gets
// ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS and nothing is modified.
ze_result_t LinuxRasImp::osRasSetConfig(const zes_ras_config_t *config) {
    if (pFsAccess->isRootUser() != true) {
        return ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS;
    }

    const auto &detailed = config->detailedThresholds;
    totalThreshold = config->totalThreshold;
    memcpy(categoryThreshold, detailed.category, sizeof(detailed.category));
    return ZE_RESULT_SUCCESS;
}
// Fills properties with the error type and subdevice placement of this RAS handle and
// resets the extension chain pointer. Always returns ZE_RESULT_SUCCESS.
ze_result_t LinuxRasImp::osRasGetProperties(zes_ras_properties_t &properties) {
    // subdevice placement
    properties.onSubdevice = isSubdevice;
    properties.subdeviceId = subdeviceId;
    // kind of errors reported through this handle
    properties.type = osRasErrorType;
    // no extension structures are chained
    properties.pNext = nullptr;
    return ZE_RESULT_SUCCESS;
}
// Accumulates the error counts of all RAS sources into state.
//
// state  per-category counts of every source that answered successfully are added to it
// clear  forwarded to every source; requires the root user
//
// Returns ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS when clear is requested by a non-root
// user, ZE_RESULT_SUCCESS when at least one source reported its state, and
// ZE_RESULT_ERROR_UNSUPPORTED_FEATURE when none did.
ze_result_t LinuxRasImp::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) {
    if ((clear == true) && (pFsAccess->isRootUser() == false)) {
        return ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS;
    }

    ze_result_t aggregate = ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    for (auto &source : rasSources) {
        zes_ras_state_t sourceState = {};
        if (source->osRasGetState(sourceState, clear) == ZE_RESULT_SUCCESS) {
            // fold this source's counters into the caller's totals
            for (int cat = 0; cat < ZES_MAX_RAS_ERROR_CATEGORY_COUNT; ++cat) {
                state.category[cat] += sourceState.category[cat];
            }
            aggregate = ZE_RESULT_SUCCESS;
        }
    }
    return aggregate;
}
// Creates the GT, fabric and HBM RAS sources, in that order, for this handle's error type
// and subdevice, and appends them to rasSources.
void LinuxRasImp::initSources() {
    auto gtSource = std::make_unique<L0::LinuxRasSourceGt>(pLinuxSysmanImp, osRasErrorType, isSubdevice, subdeviceId);
    rasSources.push_back(std::move(gtSource));

    auto fabricSource = std::make_unique<L0::LinuxRasSourceFabric>(pLinuxSysmanImp, osRasErrorType, subdeviceId);
    rasSources.push_back(std::move(fabricSource));

    auto hbmSource = std::make_unique<L0::LinuxRasSourceHbm>(pLinuxSysmanImp, osRasErrorType, subdeviceId);
    rasSources.push_back(std::move(hbmSource));
}
// Binds the handle to the Linux sysman implementation, caches its filesystem accessor and
// builds the list of RAS sources for the given error type and subdevice.
LinuxRasImp::LinuxRasImp(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId)
    : osRasErrorType(type),
      isSubdevice(onSubdevice),
      subdeviceId(subdeviceId) {
    auto *pSysmanImp = static_cast<LinuxSysmanImp *>(pOsSysman);
    pLinuxSysmanImp = pSysmanImp;
    pFsAccess = &pSysmanImp->getFsAccess();
    initSources();
}
// Factory for the Linux RAS implementation; returns a newly allocated LinuxRasImp through
// its OsRas base pointer.
OsRas *OsRas::create(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId) {
    auto *pRasImp = new LinuxRasImp(pOsSysman, type, onSubdevice, subdeviceId);
    OsRas *pOsRas = pRasImp;
    return pOsRas;
}
} // namespace L0

View File

@@ -1,128 +0,0 @@
/*
* Copyright (C) 2020-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#pragma once
#include "shared/source/helpers/non_copyable_or_moveable.h"
#include "level_zero/tools/source/sysman/ras/os_ras.h"
#include <map>
#include <memory>
#include <string>
#include <vector>
namespace L0 {
class FsAccess;
class SysfsAccess;
class PmuInterface;
class LinuxSysmanImp;
class LinuxRasSources;
class FirmwareUtil;
struct Device;
// Linux implementation of the OsRas interface for one RAS handle (one error type on the
// device or on one subdevice). State queries are answered from the list of LinuxRasSources
// held in rasSources; thresholds set through osRasSetConfig are kept in this object.
class LinuxRasImp : public OsRas, NEO::NonCopyableOrMovableClass {
public:
ze_result_t osRasGetProperties(zes_ras_properties_t &properties) override;
ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
ze_result_t osRasGetConfig(zes_ras_config_t *config) override;
ze_result_t osRasSetConfig(const zes_ras_config_t *config) override;
LinuxRasImp(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId);
LinuxRasImp() = default;
~LinuxRasImp() override = default;
protected:
// Error type (correctable / uncorrectable) served by this handle.
zes_ras_error_type_t osRasErrorType = {};
// Used for the root-user checks in osRasSetConfig and osRasGetState.
FsAccess *pFsAccess = nullptr;
LinuxSysmanImp *pLinuxSysmanImp = nullptr;
// Sources whose per-category counts are summed by osRasGetState.
std::vector<std::unique_ptr<L0::LinuxRasSources>> rasSources = {};
private:
// Populates rasSources; called from the parameterized constructor.
void initSources();
bool isSubdevice = false;
uint32_t subdeviceId = 0;
// Values stored by osRasSetConfig and returned by osRasGetConfig.
uint64_t totalThreshold = 0;
uint64_t categoryThreshold[ZES_MAX_RAS_ERROR_CATEGORY_COUNT] = {0};
};
// Abstract interface implemented by every provider of RAS error counts on Linux.
class LinuxRasSources : NEO::NonCopyableOrMovableClass {
public:
// Writes this source's error counts into state; clear is forwarded from the caller of
// LinuxRasImp::osRasGetState.
virtual ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) = 0;
virtual ~LinuxRasSources() = default;
};
class LinuxRasSourceGt : public LinuxRasSources {
public:
virtual ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
static void getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle);
LinuxRasSourceGt(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId);
LinuxRasSourceGt() = default;
virtual ~LinuxRasSourceGt();
protected:
LinuxSysmanImp *pLinuxSysmanImp = nullptr;
zes_ras_error_type_t osRasErrorType = {};
PmuInterface *pPmuInterface = nullptr;
FsAccess *pFsAccess = nullptr;
SysfsAccess *pSysfsAccess = nullptr;
private:
void initRasErrors(ze_bool_t clear);
ze_result_t getPmuConfig(
const std::string &eventDirectory,
const std::vector<std::string> &listOfEvents,
const std::string &errorFileToGetConfig,
std::string &pmuConfig);
ze_result_t getBootUpErrorCountFromSysfs(
std::string nameOfError,
const std::string &errorCounterDir,
uint64_t &errorVal);
void closeFds();
int64_t groupFd = -1;
std::vector<int64_t> memberFds = {};
uint64_t initialErrorCount[ZES_MAX_RAS_ERROR_CATEGORY_COUNT] = {0};
std::map<zes_ras_error_cat_t, uint64_t> errorCategoryToEventCount;
uint64_t totalEventCount = 0;
bool isSubdevice = false;
uint32_t subdeviceId = 0;
};
class LinuxRasSourceFabric : public LinuxRasSources {
public:
static ze_result_t getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle);
LinuxRasSourceFabric(OsSysman *pOsSysman, zes_ras_error_type_t type, uint32_t subDeviceId);
~LinuxRasSourceFabric() = default;
ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
private:
FsAccess *fsAccess = nullptr;
std::vector<std::string> errorNodes = {};
uint64_t baseComputeErrorCount = 0;
uint64_t getComputeErrorCount();
static void getNodes(std::vector<std::string> &nodes, uint32_t subdeviceId, FsAccess *fsAccess, const zes_ras_error_type_t &type);
};
class LinuxRasSourceHbm : public LinuxRasSources {
public:
virtual ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
static void getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle);
LinuxRasSourceHbm(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, uint32_t subdeviceId);
LinuxRasSourceHbm() = default;
virtual ~LinuxRasSourceHbm() override{};
protected:
LinuxSysmanImp *pLinuxSysmanImp = nullptr;
zes_ras_error_type_t osRasErrorType = {};
FirmwareUtil *pFwInterface = nullptr;
Device *pDevice = nullptr;
private:
uint64_t errorBaseline = 0;
uint32_t subdeviceId = 0;
};
} // namespace L0

View File

@@ -4,21 +4,11 @@
# SPDX-License-Identifier: MIT
#
# Files that make up the Linux sysman diagnostics tests.
set(L0_TESTS_TOOLS_SYSMAN_DIAGNOSTICS_LINUX
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}test_zes_sysman_diagnostics.cpp
${CMAKE_CURRENT_SOURCE_DIR}/mock_zes_sysman_diagnostics.h
)
# With i915 prelim detection enabled and an empty BRANCH_TYPE, the default test file is
# taken out of the list.
if((NEO_ENABLE_i915_PRELIM_DETECTION) AND ("${BRANCH_TYPE}" STREQUAL ""))
list(REMOVE_ITEM L0_TESTS_TOOLS_SYSMAN_DIAGNOSTICS_LINUX
${CMAKE_CURRENT_SOURCE_DIR}/test_zes_sysman_diagnostics.cpp
)
endif()
# The files are attached to the test target on UNIX only.
if(UNIX)
target_sources(${TARGET_NAME}
PRIVATE
${L0_TESTS_TOOLS_SYSMAN_DIAGNOSTICS_LINUX}
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
${CMAKE_CURRENT_SOURCE_DIR}/mock_zes_sysman_diagnostics.h
${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}test_zes_sysman_diagnostics.cpp
)
endif()

View File

@@ -4,22 +4,11 @@
# SPDX-License-Identifier: MIT
#
# Files that make up the Linux sysman firmware tests.
set(L0_TESTS_TOOLS_SYSMAN_FIRMWARE_LINUX
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}mock_zes_sysman_firmware.h
${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}test_zes_sysman_firmware.cpp
)
# With i915 prelim detection enabled and an empty BRANCH_TYPE, the default test and mock
# files are taken out of the list.
if((NEO_ENABLE_i915_PRELIM_DETECTION) AND ("${BRANCH_TYPE}" STREQUAL ""))
list(REMOVE_ITEM L0_TESTS_TOOLS_SYSMAN_FIRMWARE_LINUX
${CMAKE_CURRENT_SOURCE_DIR}/test_zes_sysman_firmware.cpp
${CMAKE_CURRENT_SOURCE_DIR}/mock_zes_sysman_firmware.h
)
endif()
# The files are attached to the test target on UNIX only.
if(UNIX)
target_sources(${TARGET_NAME}
PRIVATE
${L0_TESTS_TOOLS_SYSMAN_FIRMWARE_LINUX}
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}mock_zes_sysman_firmware.h
${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}test_zes_sysman_firmware.cpp
)
endif()

View File

@@ -4,23 +4,12 @@
# SPDX-License-Identifier: MIT
#
# Files that make up the Linux sysman RAS tests.
set(L0_TESTS_TOOLS_SYSMAN_RAS_LINUX
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}test_zes_ras.cpp
${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}mock_fs_ras.h
)
# With i915 prelim detection enabled and an empty BRANCH_TYPE, the default test and mock
# files are taken out of the list.
if((NEO_ENABLE_i915_PRELIM_DETECTION) AND ("${BRANCH_TYPE}" STREQUAL ""))
list(REMOVE_ITEM L0_TESTS_TOOLS_SYSMAN_RAS_LINUX
${CMAKE_CURRENT_SOURCE_DIR}/test_zes_ras.cpp
${CMAKE_CURRENT_SOURCE_DIR}/mock_fs_ras.h
)
endif()
# The files are attached to the test target on UNIX only.
if(UNIX)
target_sources(${TARGET_NAME}
PRIVATE
${L0_TESTS_TOOLS_SYSMAN_RAS_LINUX}
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}test_zes_ras.cpp
${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}mock_fs_ras.h
)
endif()
# Descend into the subdirectories of this directory.
add_subdirectories()