Add support for sysman zesFabricPortGetFabricErrorCounters API

Related-To: LOCI-3398

Signed-off-by: Mayank Raghuwanshi <mayank.raghuwanshi@intel.com>
This commit is contained in:
Mayank Raghuwanshi
2023-01-30 16:07:51 +00:00
committed by Compute-Runtime-Automation
parent 48ed9f9c92
commit 07d3353b1f
21 changed files with 571 additions and 784 deletions

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2020-2022 Intel Corporation
* Copyright (C) 2020-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -38,6 +38,7 @@ class FabricPort : _zes_fabric_port_handle_t {
virtual ze_result_t fabricPortSetConfig(const zes_fabric_port_config_t *pConfig) = 0;
virtual ze_result_t fabricPortGetState(zes_fabric_port_state_t *pState) = 0;
virtual ze_result_t fabricPortGetThroughput(zes_fabric_port_throughput_t *pThroughput) = 0;
virtual ze_result_t fabricPortGetErrorCounters(zes_fabric_port_error_counters_t *pErrors) = 0;
inline zes_fabric_port_handle_t toZesHandle() { return this; }

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2020 Intel Corporation
* Copyright (C) 2020-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -53,6 +53,10 @@ ze_result_t FabricPortImp::fabricPortGetState(zes_fabric_port_state_t *pState) {
return pOsFabricPort->getState(pState);
}
// Hands the error-counter query over to the OS-specific fabric port object
// and returns whatever status that object reports.
ze_result_t FabricPortImp::fabricPortGetErrorCounters(zes_fabric_port_error_counters_t *pErrors) {
    const auto status = pOsFabricPort->getErrorCounters(pErrors);
    return status;
}
ze_result_t FabricPortImp::fabricPortGetThroughput(zes_fabric_port_throughput_t *pThroughput) {
fabricPortGetTimestamp(pThroughput->timestamp);
return pOsFabricPort->getThroughput(pThroughput);

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2020 Intel Corporation
* Copyright (C) 2020-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -35,6 +35,7 @@ class FabricPortImp : public FabricPort, NEO::NonCopyableOrMovableClass {
ze_result_t fabricPortSetConfig(const zes_fabric_port_config_t *pConfig) override;
ze_result_t fabricPortGetState(zes_fabric_port_state_t *pState) override;
ze_result_t fabricPortGetThroughput(zes_fabric_port_throughput_t *pThroughput) override;
ze_result_t fabricPortGetErrorCounters(zes_fabric_port_error_counters_t *pErrors) override;
FabricPortImp() = delete;
FabricPortImp(FabricDevice *pFabricDevice, uint32_t portNum);

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2020 Intel Corporation
* Copyright (C) 2020-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -56,6 +56,10 @@ ze_result_t LinuxFabricPortImp::getThroughput(zes_fabric_port_throughput_t *pThr
return ZE_RESULT_SUCCESS;
}
// This implementation does not retrieve error counters: pErrors is left untouched
// and the call always reports ZE_RESULT_ERROR_UNSUPPORTED_FEATURE.
ze_result_t LinuxFabricPortImp::getErrorCounters(zes_fabric_port_error_counters_t *pErrors) {
    constexpr ze_result_t unsupported = ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    return unsupported;
}
ze_result_t LinuxFabricPortImp::getProperties(zes_fabric_port_properties_t *pProperties) {
::snprintf(pProperties->model, ZES_MAX_FABRIC_PORT_MODEL_SIZE, "%s", this->model.c_str());
pProperties->onSubdevice = false;

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2020 Intel Corporation
* Copyright (C) 2020-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -34,6 +34,7 @@ class LinuxFabricPortImp : public OsFabricPort, NEO::NonCopyableOrMovableClass {
ze_result_t setConfig(const zes_fabric_port_config_t *pConfig) override;
ze_result_t getState(zes_fabric_port_state_t *pState) override;
ze_result_t getThroughput(zes_fabric_port_throughput_t *pThroughput) override;
ze_result_t getErrorCounters(zes_fabric_port_error_counters_t *pErrors) override;
LinuxFabricPortImp() = delete;
LinuxFabricPortImp(OsFabricDevice *pOsFabricDevice, uint32_t portNum);

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2022 Intel Corporation
* Copyright (C) 2022-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -7,8 +7,11 @@
#include "os_fabric_port_imp_prelim.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/helpers/debug_helpers.h"
#include "sysman/linux/os_sysman_imp.h"
#include <cstdio>
namespace L0 {
@@ -36,6 +39,78 @@ ze_result_t LinuxFabricDeviceImp::getThroughput(const zes_fabric_port_id_t portI
return pFabricDeviceAccess->getThroughput(portId, *pThroughput);
}
// Fills pErrors with the fabric error counters of the port identified by portId.
//
// The counters are read from files below the first entry of the device directory
// whose name contains "iaf.":
//   <entry>/sd.<attachId>/port.<portNumber>/link_failures  -> linkFailureCount
//   <entry>/sd.<attachId>/port.<portNumber>/link_degrades  -> linkDegradeCount
//   <entry>/sd.<attachId>/fw_error                         -> fwErrorCount
//   <entry>/sd.<attachId>/fw_comm_errors                   -> fwCommErrorCount
//
// Returns:
//   - the failing status if the device path cannot be resolved or its directory cannot be listed,
//   - ZE_RESULT_ERROR_NOT_AVAILABLE if no "iaf." entry exists in the device directory,
//   - ZE_RESULT_SUCCESS otherwise. A counter file that cannot be read does not fail the call;
//     the failure is logged and the corresponding counter is reported as 0.
ze_result_t LinuxFabricDeviceImp::getErrorCounters(const zes_fabric_port_id_t portId, zes_fabric_port_error_counters_t *pErrors) {
    FsAccess *pFsAccess = &pLinuxSysmanImp->getFsAccess();
    SysfsAccess *pSysfsAccess = &pLinuxSysmanImp->getSysfsAccess();

    std::string devicePciPath;
    ze_result_t result = pSysfsAccess->getRealPath("device/", devicePciPath);
    if (result != ZE_RESULT_SUCCESS) {
        NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr,
                              "error@<%s> <failed to get device path> <result: 0x%x>\n", __func__, result);
        return result;
    }

    std::vector<std::string> list;
    result = pFsAccess->listDirectory(devicePciPath, list);
    if (result != ZE_RESULT_SUCCESS) {
        NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr,
                              "error@<%s> <failed to get list of files in device directory> <result: 0x%x>\n", __func__, result);
        return result;
    }

    std::string path;
    for (const auto &entry : list) {
        // A single substring test is enough: any name containing "i915.iaf." also contains "iaf.".
        if (entry.find("iaf.") != std::string::npos) {
            path = devicePciPath + "/" + entry;
            break;
        }
    }
    if (path.empty()) {
        // This device does not have a fabric
        NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr,
                              "error@<%s> <Device does not have fabric>\n", __func__);
        return ZE_RESULT_ERROR_NOT_AVAILABLE;
    }

    const std::string fabricFwErrorPath = path + "/sd." + std::to_string(portId.attachId);
    const std::string fabricLinkErrorPath = fabricFwErrorPath + "/port." + std::to_string(portId.portNumber);

    // __func__ evaluated inside the lambda would name the lambda's call operator, so the
    // enclosing function name is captured here to keep the log text unchanged.
    const char *callerName = __func__;

    // Reads one counter file; on failure logs the error and yields 0 so that a single
    // missing or unreadable counter does not fail the whole query.
    auto readCounterOrZero = [pFsAccess, callerName](const std::string &counterFile) -> uint64_t {
        uint64_t count = 0;
        const ze_result_t readResult = pFsAccess->read(counterFile, count);
        if (readResult != ZE_RESULT_SUCCESS) {
            NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr,
                                  "error@<%s> <failed to read file %s> <result: 0x%x>\n", callerName, counterFile.c_str(), readResult);
            count = 0;
        }
        return count;
    };

    pErrors->linkFailureCount = readCounterOrZero(fabricLinkErrorPath + "/link_failures");
    pErrors->linkDegradeCount = readCounterOrZero(fabricLinkErrorPath + "/link_degrades");
    pErrors->fwErrorCount = readCounterOrZero(fabricFwErrorPath + "/fw_error");
    pErrors->fwCommErrorCount = readCounterOrZero(fabricFwErrorPath + "/fw_comm_errors");
    return ZE_RESULT_SUCCESS;
}
ze_result_t LinuxFabricDeviceImp::performSweep() {
uint32_t start = 0U;
uint32_t end = 0U;
@@ -143,6 +218,7 @@ ze_result_t LinuxFabricDeviceImp::routingQuery(uint32_t &start, uint32_t &end) {
// Keeps the OS sysman object as a LinuxSysmanImp and creates the fabric device access
// object for it; a null access object is treated as unrecoverable.
LinuxFabricDeviceImp::LinuxFabricDeviceImp(OsSysman *pOsSysman)
    : pLinuxSysmanImp(static_cast<LinuxSysmanImp *>(pOsSysman)),
      pFabricDeviceAccess(FabricDeviceAccess::create(pOsSysman)) {
    UNRECOVERABLE_IF(pFabricDeviceAccess == nullptr);
}
LinuxFabricDeviceImp::~LinuxFabricDeviceImp() {
@@ -206,6 +282,10 @@ ze_result_t LinuxFabricPortImp::getThroughput(zes_fabric_port_throughput_t *pThr
return pLinuxFabricDeviceImp->getThroughput(portId, pThroughput);
}
// Asks the owning fabric device object for the error counters of this port (portId)
// and returns the status it reports.
ze_result_t LinuxFabricPortImp::getErrorCounters(zes_fabric_port_error_counters_t *pErrors) {
    const auto status = pLinuxFabricDeviceImp->getErrorCounters(portId, pErrors);
    return status;
}
ze_result_t LinuxFabricPortImp::getProperties(zes_fabric_port_properties_t *pProperties) {
::snprintf(pProperties->model, ZES_MAX_FABRIC_PORT_MODEL_SIZE, "%s", this->model.c_str());
pProperties->onSubdevice = this->onSubdevice;

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2022 Intel Corporation
* Copyright (C) 2022-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -15,7 +15,7 @@
#include <vector>
namespace L0 {
class LinuxSysmanImp;
class LinuxFabricDeviceImp : public OsFabricDevice, NEO::NonCopyableOrMovableClass {
public:
uint32_t getNumPorts() override;
@@ -29,6 +29,7 @@ class LinuxFabricDeviceImp : public OsFabricDevice, NEO::NonCopyableOrMovableCla
ze_result_t disablePortBeaconing(const zes_fabric_port_id_t portId);
ze_result_t getState(const zes_fabric_port_id_t portId, zes_fabric_port_state_t *pState);
ze_result_t getThroughput(const zes_fabric_port_id_t portId, zes_fabric_port_throughput_t *pThroughput);
ze_result_t getErrorCounters(const zes_fabric_port_id_t portId, zes_fabric_port_error_counters_t *pErrors);
void getPortId(const uint32_t portNumber, zes_fabric_port_id_t &portId);
void getProperties(const zes_fabric_port_id_t portId, std::string &model, bool &onSubdevice,
@@ -49,6 +50,7 @@ class LinuxFabricDeviceImp : public OsFabricDevice, NEO::NonCopyableOrMovableCla
ze_result_t disableUsage(const zes_fabric_port_id_t portId);
protected:
LinuxSysmanImp *pLinuxSysmanImp = nullptr;
FabricDeviceAccess *pFabricDeviceAccess = nullptr;
};
@@ -60,6 +62,7 @@ class LinuxFabricPortImp : public OsFabricPort, NEO::NonCopyableOrMovableClass {
ze_result_t setConfig(const zes_fabric_port_config_t *pConfig) override;
ze_result_t getState(zes_fabric_port_state_t *pState) override;
ze_result_t getThroughput(zes_fabric_port_throughput_t *pThroughput) override;
ze_result_t getErrorCounters(zes_fabric_port_error_counters_t *pErrors) override;
LinuxFabricPortImp() = delete;
LinuxFabricPortImp(OsFabricDevice *pOsFabricDevice, uint32_t portNum);

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2020 Intel Corporation
* Copyright (C) 2020-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -29,6 +29,7 @@ class OsFabricPort {
virtual ze_result_t setConfig(const zes_fabric_port_config_t *pConfig) = 0;
virtual ze_result_t getState(zes_fabric_port_state_t *pState) = 0;
virtual ze_result_t getThroughput(zes_fabric_port_throughput_t *pThroughput) = 0;
virtual ze_result_t getErrorCounters(zes_fabric_port_error_counters_t *pErrors) = 0;
static OsFabricPort *create(OsFabricDevice *pOsFabricDevice, uint32_t portNum);

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2020 Intel Corporation
* Copyright (C) 2020-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -41,6 +41,10 @@ ze_result_t WddmFabricPortImp::getThroughput(zes_fabric_port_throughput_t *pThro
return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
// This implementation does not retrieve error counters: pErrors is left untouched
// and the call always reports ZE_RESULT_ERROR_UNSUPPORTED_FEATURE.
ze_result_t WddmFabricPortImp::getErrorCounters(zes_fabric_port_error_counters_t *pErrors) {
    constexpr ze_result_t unsupported = ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    return unsupported;
}
ze_result_t WddmFabricPortImp::getProperties(zes_fabric_port_properties_t *pProperties) {
::memset(pProperties->model, '\0', ZES_MAX_FABRIC_PORT_MODEL_SIZE);
pProperties->onSubdevice = false;

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2020 Intel Corporation
* Copyright (C) 2020-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -35,6 +35,7 @@ class WddmFabricPortImp : public OsFabricPort, NEO::NonCopyableOrMovableClass {
ze_result_t setConfig(const zes_fabric_port_config_t *pConfig) override;
ze_result_t getState(zes_fabric_port_state_t *pState) override;
ze_result_t getThroughput(zes_fabric_port_throughput_t *pThroughput) override;
ze_result_t getErrorCounters(zes_fabric_port_error_counters_t *pErrors) override;
WddmFabricPortImp() = delete;
WddmFabricPortImp(OsFabricDevice *pOsFabricDevice, uint32_t portNum);

View File

@@ -1,5 +1,5 @@
#
# Copyright (C) 2020-2022 Intel Corporation
# Copyright (C) 2020-2023 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
@@ -13,7 +13,6 @@ if(NEO_ENABLE_i915_PRELIM_DETECTION)
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_prelim.cpp
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_prelim.h
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_gt.cpp
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_fabric.cpp
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_hbm.cpp
)
else()

View File

@@ -1,127 +0,0 @@
/*
* Copyright (C) 2021-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/device/sub_device.h"
#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h"
#include "level_zero/tools/source/sysman/sysman_imp.h"
#include "sysman/linux/fs_access.h"
#include "sysman/linux/os_sysman_imp.h"
#include <cstring>
#include <regex>
namespace L0 {
void LinuxRasSourceFabric::getNodes(std::vector<std::string> &nodes, uint32_t subdeviceId, LinuxSysmanImp *pSysmanImp, const zes_ras_error_type_t &type) {
const uint32_t minBoardStrappedNumber = 0;
const uint32_t maxBoardStrappedNumber = 31;
const uint32_t minPortId = 1;
const uint32_t maxPortId = 8;
nodes.clear();
const std::string iafPathStringMfd("/sys/module/iaf/drivers/platform:iaf/");
const std::string iafPathStringAuxillary("/sys/module/iaf/drivers/auxiliary:iaf/");
std::string iafPathString("");
if (pSysmanImp->getSysfsAccess().getRealPath("device/", iafPathString) != ZE_RESULT_SUCCESS) {
return;
}
auto &fsAccess = pSysmanImp->getFsAccess();
if (fsAccess.directoryExists(iafPathStringMfd)) {
iafPathString = iafPathString + "/iaf.";
} else if (fsAccess.directoryExists(iafPathStringAuxillary)) {
iafPathString = iafPathString + "/i915.iaf.";
} else {
return;
}
for (auto boardStrappedNumber = minBoardStrappedNumber; boardStrappedNumber <= maxBoardStrappedNumber; boardStrappedNumber++) {
const auto boardStrappedString(iafPathString + std::to_string(boardStrappedNumber));
if (!fsAccess.directoryExists(boardStrappedString)) {
continue;
}
const auto subDeviceString(boardStrappedString + "/sd." + std::to_string(subdeviceId));
std::vector<std::string> subDeviceErrorNodes;
if (type == ZES_RAS_ERROR_TYPE_CORRECTABLE) {
subDeviceErrorNodes.push_back(subDeviceString + "/fw_comm_errors");
for (auto portId = minPortId; portId <= maxPortId; portId++) {
subDeviceErrorNodes.push_back(subDeviceString + "/port." + std::to_string(portId) + "/link_degrades");
}
} else {
subDeviceErrorNodes.push_back(subDeviceString + "/sd_failure");
subDeviceErrorNodes.push_back(subDeviceString + "/fw_error");
for (auto portId = minPortId; portId <= maxPortId; portId++) {
subDeviceErrorNodes.push_back(subDeviceString + "/port." + std::to_string(portId) + "/link_failures");
}
}
for (auto &subDeviceErrorNode : subDeviceErrorNodes) {
if (ZE_RESULT_SUCCESS == fsAccess.canRead(subDeviceErrorNode)) {
nodes.push_back(subDeviceErrorNode);
}
}
}
}
// Adds to errorType every RAS error type for which getNodes() finds at least one
// readable fabric counter node on the subdevice index resolved from deviceHandle.
// Uncorrectable is probed first, then correctable. Always returns ZE_RESULT_SUCCESS.
ze_result_t LinuxRasSourceFabric::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType,
                                                            OsSysman *pOsSysman, ze_device_handle_t deviceHandle) {
    auto *pSysmanImp = static_cast<LinuxSysmanImp *>(pOsSysman);

    ze_bool_t onSubDevice = false;
    uint32_t subDeviceIndex = 0;
    SysmanDeviceImp::getSysmanDeviceInfo(deviceHandle, subDeviceIndex, onSubDevice, true);

    const zes_ras_error_type_t probeOrder[] = {ZES_RAS_ERROR_TYPE_UNCORRECTABLE, ZES_RAS_ERROR_TYPE_CORRECTABLE};
    std::vector<std::string> readableNodes;
    for (const auto &rasType : probeOrder) {
        // getNodes() clears the vector itself, so it can be reused between probes.
        getNodes(readableNodes, subDeviceIndex, pSysmanImp, rasType);
        if (!readableNodes.empty()) {
            errorType.insert(rasType);
        }
    }
    return ZE_RESULT_SUCCESS;
}
// Stores the OS sysman object as a LinuxSysmanImp and caches, in errorNodes, the readable
// counter nodes for the given error type and subdevice.
LinuxRasSourceFabric::LinuxRasSourceFabric(OsSysman *pOsSysman, zes_ras_error_type_t type, uint32_t subDeviceId)
    : pLinuxSysmanImp(static_cast<LinuxSysmanImp *>(pOsSysman)) {
    getNodes(errorNodes, subDeviceId, pLinuxSysmanImp, type);
}
// Returns the sum of the values currently read from all cached error nodes.
// The status returned by read() is not checked; each node's value starts at 0 before the read.
uint64_t LinuxRasSourceFabric::getComputeErrorCount() {
    auto &fs = pLinuxSysmanImp->getFsAccess();
    uint64_t total = 0;
    for (auto it = errorNodes.cbegin(); it != errorNodes.cend(); ++it) {
        uint64_t nodeValue = 0;
        fs.read(*it, nodeValue);
        total += nodeValue;
    }
    return total;
}
// Reports the fabric error count in the compute-errors category of `state`, relative to
// the stored baseline. All categories are zeroed first.
//
// With no cached error nodes the call returns ZE_RESULT_ERROR_UNSUPPORTED_FEATURE and
// leaves `state` untouched. When `clear` is set, the count just read becomes the new
// baseline and the counters are read a second time before the difference is reported.
ze_result_t LinuxRasSourceFabric::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) {
    if (errorNodes.empty()) {
        return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    }

    std::memset(state.category, 0, sizeof(state.category));

    uint64_t latestCount = getComputeErrorCount();
    if (clear) {
        baseComputeErrorCount = latestCount;
        latestCount = getComputeErrorCount();
    }

    const uint64_t sinceBaseline = latestCount - baseComputeErrorCount;
    state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS] = sinceBaseline;
    return ZE_RESULT_SUCCESS;
}
} // namespace L0

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2020-2022 Intel Corporation
* Copyright (C) 2020-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -18,10 +18,7 @@ void OsRas::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType,
constexpr auto maxErrorTypes = 2;
LinuxRasSourceGt::getSupportedRasErrorTypes(errorType, pOsSysman, deviceHandle);
if (errorType.size() < maxErrorTypes) {
LinuxRasSourceFabric::getSupportedRasErrorTypes(errorType, pOsSysman, deviceHandle);
if (errorType.size() < maxErrorTypes) {
LinuxRasSourceHbm::getSupportedRasErrorTypes(errorType, pOsSysman, deviceHandle);
}
LinuxRasSourceHbm::getSupportedRasErrorTypes(errorType, pOsSysman, deviceHandle);
}
}
@@ -72,7 +69,6 @@ ze_result_t LinuxRasImp::osRasGetState(zes_ras_state_t &state, ze_bool_t clear)
// Populates rasSources: each source object is constructed for this handle's error type
// (osRasErrorType) and subdevice, and appended to the list.
void LinuxRasImp::initSources() {
rasSources.push_back(std::make_unique<L0::LinuxRasSourceGt>(pLinuxSysmanImp, osRasErrorType, isSubdevice, subdeviceId));
rasSources.push_back(std::make_unique<L0::LinuxRasSourceFabric>(pLinuxSysmanImp, osRasErrorType, subdeviceId));
rasSources.push_back(std::make_unique<L0::LinuxRasSourceHbm>(pLinuxSysmanImp, osRasErrorType, subdeviceId));
}

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2020-2022 Intel Corporation
* Copyright (C) 2020-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -90,22 +90,6 @@ class LinuxRasSourceGt : public LinuxRasSources {
uint32_t subdeviceId = 0;
};
// RAS source whose error counts come from fabric (iaf) counter files.
class LinuxRasSourceFabric : public LinuxRasSources {
public:
// Adds to errorType each RAS error type for which a readable fabric counter node is found.
static ze_result_t getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle);
// Caches the readable counter nodes for the given error type and subdevice.
LinuxRasSourceFabric(OsSysman *pOsSysman, zes_ras_error_type_t type, uint32_t subDeviceId);
~LinuxRasSourceFabric() override = default;
// Reports the error count relative to the stored baseline; `clear` resets the baseline.
ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
private:
LinuxSysmanImp *pLinuxSysmanImp = nullptr;
// Counter files summed by getComputeErrorCount().
std::vector<std::string> errorNodes = {};
// Count subtracted from the current sum when reporting state.
uint64_t baseComputeErrorCount = 0;
// Sums the values read from every entry of errorNodes.
uint64_t getComputeErrorCount();
// Fills `nodes` with the readable counter files for the given subdevice and error type.
static void getNodes(std::vector<std::string> &nodes, uint32_t subdeviceId, LinuxSysmanImp *pSysmanImp, const zes_ras_error_type_t &type);
};
class LinuxRasSourceHbm : public LinuxRasSources {
public:
ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;