mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-29 17:13:29 +08:00
Add support for sysman zesFabricPortGetFabricErrorCounters API
Related-To: LOCI-3398 Signed-off-by: Mayank Raghuwanshi <mayank.raghuwanshi@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
48ed9f9c92
commit
07d3353b1f
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2020-2022 Intel Corporation
|
||||
* Copyright (C) 2020-2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -38,6 +38,7 @@ class FabricPort : _zes_fabric_port_handle_t {
|
||||
virtual ze_result_t fabricPortSetConfig(const zes_fabric_port_config_t *pConfig) = 0;
|
||||
virtual ze_result_t fabricPortGetState(zes_fabric_port_state_t *pState) = 0;
|
||||
virtual ze_result_t fabricPortGetThroughput(zes_fabric_port_throughput_t *pThroughput) = 0;
|
||||
virtual ze_result_t fabricPortGetErrorCounters(zes_fabric_port_error_counters_t *pErrors) = 0;
|
||||
|
||||
inline zes_fabric_port_handle_t toZesHandle() { return this; }
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2020 Intel Corporation
|
||||
* Copyright (C) 2020-2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -53,6 +53,10 @@ ze_result_t FabricPortImp::fabricPortGetState(zes_fabric_port_state_t *pState) {
|
||||
return pOsFabricPort->getState(pState);
|
||||
}
|
||||
|
||||
// Fetches the error counters of this fabric port.
// The request is forwarded unchanged to the OS-specific port object
// (pOsFabricPort) and its result code is handed back to the caller.
ze_result_t FabricPortImp::fabricPortGetErrorCounters(zes_fabric_port_error_counters_t *pErrors) {
    const ze_result_t osStatus = pOsFabricPort->getErrorCounters(pErrors);
    return osStatus;
}
|
||||
|
||||
ze_result_t FabricPortImp::fabricPortGetThroughput(zes_fabric_port_throughput_t *pThroughput) {
|
||||
fabricPortGetTimestamp(pThroughput->timestamp);
|
||||
return pOsFabricPort->getThroughput(pThroughput);
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2020 Intel Corporation
|
||||
* Copyright (C) 2020-2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -35,6 +35,7 @@ class FabricPortImp : public FabricPort, NEO::NonCopyableOrMovableClass {
|
||||
ze_result_t fabricPortSetConfig(const zes_fabric_port_config_t *pConfig) override;
|
||||
ze_result_t fabricPortGetState(zes_fabric_port_state_t *pState) override;
|
||||
ze_result_t fabricPortGetThroughput(zes_fabric_port_throughput_t *pThroughput) override;
|
||||
ze_result_t fabricPortGetErrorCounters(zes_fabric_port_error_counters_t *pErrors) override;
|
||||
|
||||
FabricPortImp() = delete;
|
||||
FabricPortImp(FabricDevice *pFabricDevice, uint32_t portNum);
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2020 Intel Corporation
|
||||
* Copyright (C) 2020-2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -56,6 +56,10 @@ ze_result_t LinuxFabricPortImp::getThroughput(zes_fabric_port_throughput_t *pThr
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
|
||||
// Error counters are not provided by this implementation: pErrors is never
// written and the call always reports ZE_RESULT_ERROR_UNSUPPORTED_FEATURE.
ze_result_t LinuxFabricPortImp::getErrorCounters(zes_fabric_port_error_counters_t *pErrors) {
    const ze_result_t unsupported = ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    return unsupported;
}
|
||||
|
||||
ze_result_t LinuxFabricPortImp::getProperties(zes_fabric_port_properties_t *pProperties) {
|
||||
::snprintf(pProperties->model, ZES_MAX_FABRIC_PORT_MODEL_SIZE, "%s", this->model.c_str());
|
||||
pProperties->onSubdevice = false;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2020 Intel Corporation
|
||||
* Copyright (C) 2020-2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -34,6 +34,7 @@ class LinuxFabricPortImp : public OsFabricPort, NEO::NonCopyableOrMovableClass {
|
||||
ze_result_t setConfig(const zes_fabric_port_config_t *pConfig) override;
|
||||
ze_result_t getState(zes_fabric_port_state_t *pState) override;
|
||||
ze_result_t getThroughput(zes_fabric_port_throughput_t *pThroughput) override;
|
||||
ze_result_t getErrorCounters(zes_fabric_port_error_counters_t *pErrors) override;
|
||||
|
||||
LinuxFabricPortImp() = delete;
|
||||
LinuxFabricPortImp(OsFabricDevice *pOsFabricDevice, uint32_t portNum);
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2022 Intel Corporation
|
||||
* Copyright (C) 2022-2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -7,8 +7,11 @@
|
||||
|
||||
#include "os_fabric_port_imp_prelim.h"
|
||||
|
||||
#include "shared/source/debug_settings/debug_settings_manager.h"
|
||||
#include "shared/source/helpers/debug_helpers.h"
|
||||
|
||||
#include "sysman/linux/os_sysman_imp.h"
|
||||
|
||||
#include <cstdio>
|
||||
|
||||
namespace L0 {
|
||||
@@ -36,6 +39,78 @@ ze_result_t LinuxFabricDeviceImp::getThroughput(const zes_fabric_port_id_t portI
|
||||
return pFabricDeviceAccess->getThroughput(portId, *pThroughput);
|
||||
}
|
||||
|
||||
// Reads the fabric error counters of one port from the filesystem.
//
// The fabric directory is the first entry under the real path of "device/"
// whose name contains "iaf." (this also covers "i915.iaf." entries, since
// "iaf." is a substring of that name). Counters are then read from:
//   <fabric dir>/sd.<attachId>/fw_error
//   <fabric dir>/sd.<attachId>/fw_comm_errors
//   <fabric dir>/sd.<attachId>/port.<portNumber>/link_failures
//   <fabric dir>/sd.<attachId>/port.<portNumber>/link_degrades
//
// @param portId  identifies the port; attachId and portNumber select the directories above.
// @param pErrors receives the four counters; only written when ZE_RESULT_SUCCESS is returned.
// @return the failing result of getRealPath()/listDirectory() when the device
//         directory cannot be resolved or listed,
//         ZE_RESULT_ERROR_NOT_AVAILABLE when no fabric directory is found,
//         ZE_RESULT_SUCCESS otherwise.
//
// Reading the individual counters is best effort: a counter whose file cannot
// be read is reported as 0 and the failure is only emitted as a debug message.
ze_result_t LinuxFabricDeviceImp::getErrorCounters(const zes_fabric_port_id_t portId, zes_fabric_port_error_counters_t *pErrors) {
    FsAccess *pFsAccess = &pLinuxSysmanImp->getFsAccess();
    SysfsAccess *pSysfsAccess = &pLinuxSysmanImp->getSysfsAccess();

    std::string devicePciPath("");
    ze_result_t result = pSysfsAccess->getRealPath("device/", devicePciPath);
    if (result != ZE_RESULT_SUCCESS) {
        NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr,
                              "error@<%s> <failed to get device path> <result: 0x%x>\n", __func__, result);
        return result;
    }

    std::vector<std::string> list;
    result = pFsAccess->listDirectory(devicePciPath, list);
    if (result != ZE_RESULT_SUCCESS) {
        NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr,
                              "error@<%s> <failed to get list of files in device directory> <result: 0x%x>\n", __func__, result);
        return result;
    }

    std::string path("");
    // Iterate by const reference: the entries are only inspected, never modified.
    for (const auto &entry : list) {
        // A single substring test is enough; "i915.iaf." entries contain "iaf." as well.
        if (entry.find("iaf.") != std::string::npos) {
            path = devicePciPath + "/" + entry;
            break;
        }
    }
    if (path.empty()) {
        // This device does not have a fabric
        NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr,
                              "error@<%s> <Device does not have fabric>\n", __func__);
        return ZE_RESULT_ERROR_NOT_AVAILABLE;
    }

    const std::string fabricFwErrorPath = path + "/sd." + std::to_string(portId.attachId);
    const std::string fabricLinkErrorPath = fabricFwErrorPath + "/port." + std::to_string(portId.portNumber);

    // Inside a lambda __func__ names the call operator, so capture the name of
    // this function to keep the debug messages unchanged.
    const char *const callerName = __func__;

    // Reads one counter file; a failed read is logged and reported as 0.
    const auto readCounterOrZero = [pFsAccess, callerName](const std::string &counterFile) -> uint64_t {
        uint64_t count = 0;
        const ze_result_t readResult = pFsAccess->read(counterFile, count);
        if (readResult != ZE_RESULT_SUCCESS) {
            NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr,
                                  "error@<%s> <failed to read file %s> <result: 0x%x>\n", callerName, counterFile.c_str(), readResult);
            count = 0;
        }
        return count;
    };

    // Keep the original read order: link failures, link degrades, fw errors, fw comm errors.
    const uint64_t linkErrorCount = readCounterOrZero(fabricLinkErrorPath + "/link_failures");
    const uint64_t linkDegradeCount = readCounterOrZero(fabricLinkErrorPath + "/link_degrades");
    const uint64_t fwErrorCount = readCounterOrZero(fabricFwErrorPath + "/fw_error");
    const uint64_t fwCommErrorCount = readCounterOrZero(fabricFwErrorPath + "/fw_comm_errors");

    pErrors->linkFailureCount = linkErrorCount;
    pErrors->linkDegradeCount = linkDegradeCount;
    pErrors->fwErrorCount = fwErrorCount;
    pErrors->fwCommErrorCount = fwCommErrorCount;
    return ZE_RESULT_SUCCESS;
}
|
||||
ze_result_t LinuxFabricDeviceImp::performSweep() {
|
||||
uint32_t start = 0U;
|
||||
uint32_t end = 0U;
|
||||
@@ -143,6 +218,7 @@ ze_result_t LinuxFabricDeviceImp::routingQuery(uint32_t &start, uint32_t &end) {
|
||||
// Binds this fabric device to its OS sysman object: keeps a LinuxSysmanImp
// view of pOsSysman and creates the fabric device access object.
// A null access object is treated as unrecoverable.
LinuxFabricDeviceImp::LinuxFabricDeviceImp(OsSysman *pOsSysman) {
    pLinuxSysmanImp = static_cast<LinuxSysmanImp *>(pOsSysman);

    pFabricDeviceAccess = FabricDeviceAccess::create(pOsSysman);
    UNRECOVERABLE_IF(pFabricDeviceAccess == nullptr);
}
|
||||
|
||||
LinuxFabricDeviceImp::~LinuxFabricDeviceImp() {
|
||||
@@ -206,6 +282,10 @@ ze_result_t LinuxFabricPortImp::getThroughput(zes_fabric_port_throughput_t *pThr
|
||||
return pLinuxFabricDeviceImp->getThroughput(portId, pThroughput);
|
||||
}
|
||||
|
||||
// Obtains the error counters for this port by asking the owning fabric
// device object, passing along this port's id.
ze_result_t LinuxFabricPortImp::getErrorCounters(zes_fabric_port_error_counters_t *pErrors) {
    const ze_result_t deviceStatus = pLinuxFabricDeviceImp->getErrorCounters(portId, pErrors);
    return deviceStatus;
}
|
||||
|
||||
ze_result_t LinuxFabricPortImp::getProperties(zes_fabric_port_properties_t *pProperties) {
|
||||
::snprintf(pProperties->model, ZES_MAX_FABRIC_PORT_MODEL_SIZE, "%s", this->model.c_str());
|
||||
pProperties->onSubdevice = this->onSubdevice;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2022 Intel Corporation
|
||||
* Copyright (C) 2022-2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -15,7 +15,7 @@
|
||||
#include <vector>
|
||||
|
||||
namespace L0 {
|
||||
|
||||
class LinuxSysmanImp;
|
||||
class LinuxFabricDeviceImp : public OsFabricDevice, NEO::NonCopyableOrMovableClass {
|
||||
public:
|
||||
uint32_t getNumPorts() override;
|
||||
@@ -29,6 +29,7 @@ class LinuxFabricDeviceImp : public OsFabricDevice, NEO::NonCopyableOrMovableCla
|
||||
ze_result_t disablePortBeaconing(const zes_fabric_port_id_t portId);
|
||||
ze_result_t getState(const zes_fabric_port_id_t portId, zes_fabric_port_state_t *pState);
|
||||
ze_result_t getThroughput(const zes_fabric_port_id_t portId, zes_fabric_port_throughput_t *pThroughput);
|
||||
ze_result_t getErrorCounters(const zes_fabric_port_id_t portId, zes_fabric_port_error_counters_t *pErrors);
|
||||
|
||||
void getPortId(const uint32_t portNumber, zes_fabric_port_id_t &portId);
|
||||
void getProperties(const zes_fabric_port_id_t portId, std::string &model, bool &onSubdevice,
|
||||
@@ -49,6 +50,7 @@ class LinuxFabricDeviceImp : public OsFabricDevice, NEO::NonCopyableOrMovableCla
|
||||
ze_result_t disableUsage(const zes_fabric_port_id_t portId);
|
||||
|
||||
protected:
|
||||
LinuxSysmanImp *pLinuxSysmanImp = nullptr;
|
||||
FabricDeviceAccess *pFabricDeviceAccess = nullptr;
|
||||
};
|
||||
|
||||
@@ -60,6 +62,7 @@ class LinuxFabricPortImp : public OsFabricPort, NEO::NonCopyableOrMovableClass {
|
||||
ze_result_t setConfig(const zes_fabric_port_config_t *pConfig) override;
|
||||
ze_result_t getState(zes_fabric_port_state_t *pState) override;
|
||||
ze_result_t getThroughput(zes_fabric_port_throughput_t *pThroughput) override;
|
||||
ze_result_t getErrorCounters(zes_fabric_port_error_counters_t *pErrors) override;
|
||||
|
||||
LinuxFabricPortImp() = delete;
|
||||
LinuxFabricPortImp(OsFabricDevice *pOsFabricDevice, uint32_t portNum);
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2020 Intel Corporation
|
||||
* Copyright (C) 2020-2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -29,6 +29,7 @@ class OsFabricPort {
|
||||
virtual ze_result_t setConfig(const zes_fabric_port_config_t *pConfig) = 0;
|
||||
virtual ze_result_t getState(zes_fabric_port_state_t *pState) = 0;
|
||||
virtual ze_result_t getThroughput(zes_fabric_port_throughput_t *pThroughput) = 0;
|
||||
virtual ze_result_t getErrorCounters(zes_fabric_port_error_counters_t *pErrors) = 0;
|
||||
|
||||
static OsFabricPort *create(OsFabricDevice *pOsFabricDevice, uint32_t portNum);
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2020 Intel Corporation
|
||||
* Copyright (C) 2020-2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -41,6 +41,10 @@ ze_result_t WddmFabricPortImp::getThroughput(zes_fabric_port_throughput_t *pThro
|
||||
return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
|
||||
}
|
||||
|
||||
// Error counters are not provided by this implementation: pErrors is never
// written and the call always reports ZE_RESULT_ERROR_UNSUPPORTED_FEATURE.
ze_result_t WddmFabricPortImp::getErrorCounters(zes_fabric_port_error_counters_t *pErrors) {
    const ze_result_t unsupported = ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    return unsupported;
}
|
||||
|
||||
ze_result_t WddmFabricPortImp::getProperties(zes_fabric_port_properties_t *pProperties) {
|
||||
::memset(pProperties->model, '\0', ZES_MAX_FABRIC_PORT_MODEL_SIZE);
|
||||
pProperties->onSubdevice = false;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2020 Intel Corporation
|
||||
* Copyright (C) 2020-2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -35,6 +35,7 @@ class WddmFabricPortImp : public OsFabricPort, NEO::NonCopyableOrMovableClass {
|
||||
ze_result_t setConfig(const zes_fabric_port_config_t *pConfig) override;
|
||||
ze_result_t getState(zes_fabric_port_state_t *pState) override;
|
||||
ze_result_t getThroughput(zes_fabric_port_throughput_t *pThroughput) override;
|
||||
ze_result_t getErrorCounters(zes_fabric_port_error_counters_t *pErrors) override;
|
||||
|
||||
WddmFabricPortImp() = delete;
|
||||
WddmFabricPortImp(OsFabricDevice *pOsFabricDevice, uint32_t portNum);
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (C) 2020-2022 Intel Corporation
|
||||
# Copyright (C) 2020-2023 Intel Corporation
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
#
|
||||
@@ -13,7 +13,6 @@ if(NEO_ENABLE_i915_PRELIM_DETECTION)
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_prelim.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_prelim.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_gt.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_fabric.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_hbm.cpp
|
||||
)
|
||||
else()
|
||||
|
||||
@@ -1,127 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2021-2022 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#include "shared/source/device/sub_device.h"
|
||||
|
||||
#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h"
|
||||
#include "level_zero/tools/source/sysman/sysman_imp.h"
|
||||
|
||||
#include "sysman/linux/fs_access.h"
|
||||
#include "sysman/linux/os_sysman_imp.h"
|
||||
|
||||
#include <cstring>
|
||||
#include <regex>
|
||||
namespace L0 {
|
||||
|
||||
void LinuxRasSourceFabric::getNodes(std::vector<std::string> &nodes, uint32_t subdeviceId, LinuxSysmanImp *pSysmanImp, const zes_ras_error_type_t &type) {
|
||||
const uint32_t minBoardStrappedNumber = 0;
|
||||
const uint32_t maxBoardStrappedNumber = 31;
|
||||
const uint32_t minPortId = 1;
|
||||
const uint32_t maxPortId = 8;
|
||||
nodes.clear();
|
||||
|
||||
const std::string iafPathStringMfd("/sys/module/iaf/drivers/platform:iaf/");
|
||||
const std::string iafPathStringAuxillary("/sys/module/iaf/drivers/auxiliary:iaf/");
|
||||
std::string iafPathString("");
|
||||
|
||||
if (pSysmanImp->getSysfsAccess().getRealPath("device/", iafPathString) != ZE_RESULT_SUCCESS) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto &fsAccess = pSysmanImp->getFsAccess();
|
||||
if (fsAccess.directoryExists(iafPathStringMfd)) {
|
||||
iafPathString = iafPathString + "/iaf.";
|
||||
} else if (fsAccess.directoryExists(iafPathStringAuxillary)) {
|
||||
iafPathString = iafPathString + "/i915.iaf.";
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
|
||||
for (auto boardStrappedNumber = minBoardStrappedNumber; boardStrappedNumber <= maxBoardStrappedNumber; boardStrappedNumber++) {
|
||||
|
||||
const auto boardStrappedString(iafPathString + std::to_string(boardStrappedNumber));
|
||||
if (!fsAccess.directoryExists(boardStrappedString)) {
|
||||
continue;
|
||||
}
|
||||
const auto subDeviceString(boardStrappedString + "/sd." + std::to_string(subdeviceId));
|
||||
std::vector<std::string> subDeviceErrorNodes;
|
||||
|
||||
if (type == ZES_RAS_ERROR_TYPE_CORRECTABLE) {
|
||||
subDeviceErrorNodes.push_back(subDeviceString + "/fw_comm_errors");
|
||||
for (auto portId = minPortId; portId <= maxPortId; portId++) {
|
||||
subDeviceErrorNodes.push_back(subDeviceString + "/port." + std::to_string(portId) + "/link_degrades");
|
||||
}
|
||||
} else {
|
||||
subDeviceErrorNodes.push_back(subDeviceString + "/sd_failure");
|
||||
subDeviceErrorNodes.push_back(subDeviceString + "/fw_error");
|
||||
for (auto portId = minPortId; portId <= maxPortId; portId++) {
|
||||
subDeviceErrorNodes.push_back(subDeviceString + "/port." + std::to_string(portId) + "/link_failures");
|
||||
}
|
||||
}
|
||||
|
||||
for (auto &subDeviceErrorNode : subDeviceErrorNodes) {
|
||||
if (ZE_RESULT_SUCCESS == fsAccess.canRead(subDeviceErrorNode)) {
|
||||
nodes.push_back(subDeviceErrorNode);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Adds to `errorType` every RAS error type for which getNodes() finds at
// least one node on the sub-device index reported for `deviceHandle`.
// Uncorrectable is probed first, then correctable. Always returns ZE_RESULT_SUCCESS.
ze_result_t LinuxRasSourceFabric::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType,
                                                            OsSysman *pOsSysman, ze_device_handle_t deviceHandle) {
    uint32_t subDeviceIndex = 0;
    ze_bool_t onSubDevice = false;
    SysmanDeviceImp::getSysmanDeviceInfo(deviceHandle, subDeviceIndex, onSubDevice, true);

    LinuxSysmanImp *pLinuxSysmanImp = static_cast<LinuxSysmanImp *>(pOsSysman);
    std::vector<std::string> nodes;
    for (const zes_ras_error_type_t candidate : {ZES_RAS_ERROR_TYPE_UNCORRECTABLE, ZES_RAS_ERROR_TYPE_CORRECTABLE}) {
        getNodes(nodes, subDeviceIndex, pLinuxSysmanImp, candidate);
        if (!nodes.empty()) {
            errorType.insert(candidate);
        }
    }

    return ZE_RESULT_SUCCESS;
}
|
||||
|
||||
// Stores the Linux sysman object and gathers, once at construction, the error
// nodes for the requested error type on the given sub-device.
LinuxRasSourceFabric::LinuxRasSourceFabric(OsSysman *pOsSysman, zes_ras_error_type_t type, uint32_t subDeviceId)
    : pLinuxSysmanImp(static_cast<LinuxSysmanImp *>(pOsSysman)) {
    getNodes(errorNodes, subDeviceId, pLinuxSysmanImp, type);
}
|
||||
|
||||
uint64_t LinuxRasSourceFabric::getComputeErrorCount() {
|
||||
uint64_t currentErrorCount = 0;
|
||||
auto &fsAccess = pLinuxSysmanImp->getFsAccess();
|
||||
for (const auto &node : errorNodes) {
|
||||
uint64_t errorCount = 0;
|
||||
fsAccess.read(node, errorCount);
|
||||
currentErrorCount += errorCount;
|
||||
}
|
||||
return currentErrorCount;
|
||||
}
|
||||
|
||||
// Reports the fabric error count in the compute-errors category of `state`.
//
// Returns ZE_RESULT_ERROR_UNSUPPORTED_FEATURE (leaving `state` untouched) when
// no error nodes were collected. Otherwise all categories are zeroed and the
// compute-errors category is set to the current total minus the stored
// baseline. With `clear` set, the baseline is first moved to the current total
// and the total is read again before the difference is taken.
ze_result_t LinuxRasSourceFabric::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) {
    if (errorNodes.empty()) {
        return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    }

    std::memset(state.category, 0, sizeof(state.category));

    uint64_t latestCount = getComputeErrorCount();
    if (clear) {
        baseComputeErrorCount = latestCount;
        latestCount = getComputeErrorCount();
    }

    state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS] = latestCount - baseComputeErrorCount;
    return ZE_RESULT_SUCCESS;
}
|
||||
|
||||
} // namespace L0
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2020-2022 Intel Corporation
|
||||
* Copyright (C) 2020-2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -18,10 +18,7 @@ void OsRas::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType,
|
||||
constexpr auto maxErrorTypes = 2;
|
||||
LinuxRasSourceGt::getSupportedRasErrorTypes(errorType, pOsSysman, deviceHandle);
|
||||
if (errorType.size() < maxErrorTypes) {
|
||||
LinuxRasSourceFabric::getSupportedRasErrorTypes(errorType, pOsSysman, deviceHandle);
|
||||
if (errorType.size() < maxErrorTypes) {
|
||||
LinuxRasSourceHbm::getSupportedRasErrorTypes(errorType, pOsSysman, deviceHandle);
|
||||
}
|
||||
LinuxRasSourceHbm::getSupportedRasErrorTypes(errorType, pOsSysman, deviceHandle);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -72,7 +69,6 @@ ze_result_t LinuxRasImp::osRasGetState(zes_ras_state_t &state, ze_bool_t clear)
|
||||
|
||||
void LinuxRasImp::initSources() {
|
||||
rasSources.push_back(std::make_unique<L0::LinuxRasSourceGt>(pLinuxSysmanImp, osRasErrorType, isSubdevice, subdeviceId));
|
||||
rasSources.push_back(std::make_unique<L0::LinuxRasSourceFabric>(pLinuxSysmanImp, osRasErrorType, subdeviceId));
|
||||
rasSources.push_back(std::make_unique<L0::LinuxRasSourceHbm>(pLinuxSysmanImp, osRasErrorType, subdeviceId));
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2020-2022 Intel Corporation
|
||||
* Copyright (C) 2020-2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -90,22 +90,6 @@ class LinuxRasSourceGt : public LinuxRasSources {
|
||||
uint32_t subdeviceId = 0;
|
||||
};
|
||||
|
||||
class LinuxRasSourceFabric : public LinuxRasSources {
|
||||
public:
|
||||
static ze_result_t getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle);
|
||||
LinuxRasSourceFabric(OsSysman *pOsSysman, zes_ras_error_type_t type, uint32_t subDeviceId);
|
||||
~LinuxRasSourceFabric() override = default;
|
||||
|
||||
ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
|
||||
|
||||
private:
|
||||
LinuxSysmanImp *pLinuxSysmanImp = nullptr;
|
||||
std::vector<std::string> errorNodes = {};
|
||||
uint64_t baseComputeErrorCount = 0;
|
||||
uint64_t getComputeErrorCount();
|
||||
static void getNodes(std::vector<std::string> &nodes, uint32_t subdeviceId, LinuxSysmanImp *pSysmanImp, const zes_ras_error_type_t &type);
|
||||
};
|
||||
|
||||
class LinuxRasSourceHbm : public LinuxRasSources {
|
||||
public:
|
||||
ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
|
||||
|
||||
Reference in New Issue
Block a user