From 7c050291bf7c3992892dc3e912ca762d52e11ed7 Mon Sep 17 00:00:00 2001 From: Joshua Santosh Ranjan Date: Wed, 9 Nov 2022 11:51:03 +0000 Subject: [PATCH] Fix fabric ras errors accumulated to all devices This patch fixes the issue where fabric RAS errors from all devices are reported for every device. Related-To: LOCI-3548 Signed-off-by: Joshua Santosh Ranjan --- .../sysman/ras/linux/os_ras_imp_fabric.cpp | 30 +++++++++++-------- .../sysman/ras/linux/os_ras_imp_prelim.h | 4 +-- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/level_zero/tools/source/sysman/ras/linux/os_ras_imp_fabric.cpp b/level_zero/tools/source/sysman/ras/linux/os_ras_imp_fabric.cpp index 796dd08824..5ad2dab895 100644 --- a/level_zero/tools/source/sysman/ras/linux/os_ras_imp_fabric.cpp +++ b/level_zero/tools/source/sysman/ras/linux/os_ras_imp_fabric.cpp @@ -17,7 +17,7 @@ #include namespace L0 { -void LinuxRasSourceFabric::getNodes(std::vector &nodes, uint32_t subdeviceId, FsAccess *fsAccess, const zes_ras_error_type_t &type) { +void LinuxRasSourceFabric::getNodes(std::vector &nodes, uint32_t subdeviceId, LinuxSysmanImp *pSysmanImp, const zes_ras_error_type_t &type) { const uint32_t minBoardStrappedNumber = 0; const uint32_t maxBoardStrappedNumber = 31; const uint32_t minPortId = 1; @@ -28,10 +28,15 @@ void LinuxRasSourceFabric::getNodes(std::vector &nodes, uint32_t su const std::string iafPathStringAuxillary("/sys/module/iaf/drivers/auxiliary:iaf/"); std::string iafPathString(""); - if (fsAccess->directoryExists(iafPathStringMfd)) { - iafPathString = iafPathStringMfd + "iaf."; - } else if (fsAccess->directoryExists(iafPathStringAuxillary)) { - iafPathString = iafPathStringAuxillary + "i915.iaf."; + if (pSysmanImp->getSysfsAccess().getRealPath("device/", iafPathString) != ZE_RESULT_SUCCESS) { + return; + } + + auto &fsAccess = pSysmanImp->getFsAccess(); + if (fsAccess.directoryExists(iafPathStringMfd)) { + iafPathString = iafPathString + "/iaf."; + } else if 
(fsAccess.directoryExists(iafPathStringAuxillary)) { + iafPathString = iafPathString + "/i915.iaf."; } else { return; } @@ -39,7 +44,7 @@ void LinuxRasSourceFabric::getNodes(std::vector &nodes, uint32_t su for (auto boardStrappedNumber = minBoardStrappedNumber; boardStrappedNumber <= maxBoardStrappedNumber; boardStrappedNumber++) { const auto boardStrappedString(iafPathString + std::to_string(boardStrappedNumber)); - if (!fsAccess->directoryExists(boardStrappedString)) { + if (!fsAccess.directoryExists(boardStrappedString)) { continue; } const auto subDeviceString(boardStrappedString + "/sd." + std::to_string(subdeviceId)); @@ -59,7 +64,7 @@ void LinuxRasSourceFabric::getNodes(std::vector &nodes, uint32_t su } for (auto &subDeviceErrorNode : subDeviceErrorNodes) { - if (ZE_RESULT_SUCCESS == fsAccess->canRead(subDeviceErrorNode)) { + if (ZE_RESULT_SUCCESS == fsAccess.canRead(subDeviceErrorNode)) { nodes.push_back(subDeviceErrorNode); } } @@ -73,11 +78,11 @@ ze_result_t LinuxRasSourceFabric::getSupportedRasErrorTypes(std::set nodes; - getNodes(nodes, subDeviceIndex, &pLinuxSysmanImp->getFsAccess(), ZES_RAS_ERROR_TYPE_UNCORRECTABLE); + getNodes(nodes, subDeviceIndex, pLinuxSysmanImp, ZES_RAS_ERROR_TYPE_UNCORRECTABLE); if (nodes.size()) { errorType.insert(ZES_RAS_ERROR_TYPE_UNCORRECTABLE); } - getNodes(nodes, subDeviceIndex, &pLinuxSysmanImp->getFsAccess(), ZES_RAS_ERROR_TYPE_CORRECTABLE); + getNodes(nodes, subDeviceIndex, pLinuxSysmanImp, ZES_RAS_ERROR_TYPE_CORRECTABLE); if (nodes.size()) { errorType.insert(ZES_RAS_ERROR_TYPE_CORRECTABLE); } @@ -87,15 +92,16 @@ ze_result_t LinuxRasSourceFabric::getSupportedRasErrorTypes(std::set(pOsSysman)->getFsAccess(); - getNodes(errorNodes, subDeviceId, fsAccess, type); + pLinuxSysmanImp = static_cast(pOsSysman); + getNodes(errorNodes, subDeviceId, pLinuxSysmanImp, type); } uint64_t LinuxRasSourceFabric::getComputeErrorCount() { uint64_t currentErrorCount = 0; + auto &fsAccess = pLinuxSysmanImp->getFsAccess(); for (const auto 
&node : errorNodes) { uint64_t errorCount = 0; - fsAccess->read(node, errorCount); + fsAccess.read(node, errorCount); currentErrorCount += errorCount; } return currentErrorCount; diff --git a/level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h b/level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h index 26790c178b..82ddfd356f 100644 --- a/level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h +++ b/level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h @@ -99,11 +99,11 @@ class LinuxRasSourceFabric : public LinuxRasSources { ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override; private: - FsAccess *fsAccess = nullptr; + LinuxSysmanImp *pLinuxSysmanImp = nullptr; std::vector errorNodes = {}; uint64_t baseComputeErrorCount = 0; uint64_t getComputeErrorCount(); - static void getNodes(std::vector &nodes, uint32_t subdeviceId, FsAccess *fsAccess, const zes_ras_error_type_t &type); + static void getNodes(std::vector &nodes, uint32_t subdeviceId, LinuxSysmanImp *pSysmanImp, const zes_ras_error_type_t &type); }; class LinuxRasSourceHbm : public LinuxRasSources {