diff --git a/level_zero/tools/source/sysman/diagnostics/linux/CMakeLists.txt b/level_zero/tools/source/sysman/diagnostics/linux/CMakeLists.txt index 2652a0f611..13fb25a634 100644 --- a/level_zero/tools/source/sysman/diagnostics/linux/CMakeLists.txt +++ b/level_zero/tools/source/sysman/diagnostics/linux/CMakeLists.txt @@ -8,9 +8,18 @@ set(L0_SRCS_TOOLS_SYSMAN_DIAGNOSTICS_LINUX ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt ${CMAKE_CURRENT_SOURCE_DIR}/os_diagnostics_imp.cpp ${CMAKE_CURRENT_SOURCE_DIR}/os_diagnostics_imp.h - ${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}os_diagnostics_helper.cpp ) +if(NEO_ENABLE_i915_PRELIM_DETECTION) + list(APPEND L0_SRCS_TOOLS_SYSMAN_DIAGNOSTICS_LINUX + ${CMAKE_CURRENT_SOURCE_DIR}/os_diagnostics_helper_prelim.cpp + ) +else() + list(APPEND L0_SRCS_TOOLS_SYSMAN_DIAGNOSTICS_LINUX + ${CMAKE_CURRENT_SOURCE_DIR}/os_diagnostics_helper.cpp + ) +endif() + if(UNIX) target_sources(${L0_STATIC_LIB_NAME} PRIVATE diff --git a/level_zero/tools/source/sysman/diagnostics/linux/os_diagnostics_helper_prelim.cpp b/level_zero/tools/source/sysman/diagnostics/linux/os_diagnostics_helper_prelim.cpp new file mode 100644 index 0000000000..be1db25159 --- /dev/null +++ b/level_zero/tools/source/sysman/diagnostics/linux/os_diagnostics_helper_prelim.cpp @@ -0,0 +1,237 @@ +/* + * Copyright (C) 2021-2022 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/source/helpers/string.h" +#include "shared/source/os_interface/device_factory.h" + +#include "level_zero/core/source/device/device_imp.h" +#include "level_zero/tools/source/sysman/diagnostics/linux/os_diagnostics_imp.h" + +#include +namespace L0 { +//All memory mappings where LMEMBAR is being referenced are invalidated. +//Also prevents new ones from being created. +//It will invalidate LMEM memory mappings only when sysfs entry quiesce_gpu is set. 
+ +//the sysfs node will be at /sys/class/drm/card/invalidate_lmem_mmaps +const std::string LinuxDiagnosticsImp::invalidateLmemFile("invalidate_lmem_mmaps"); +// the sysfs node will be at /sys/class/drm/card/quiesce_gpu +const std::string LinuxDiagnosticsImp::quiescentGpuFile("quiesce_gpu"); +void OsDiagnostics::getSupportedDiagTestsFromFW(void *pOsSysman, std::vector &supportedDiagTests) { + LinuxSysmanImp *pLinuxSysmanImp = static_cast(pOsSysman); + if (IGFX_PVC == pLinuxSysmanImp->getProductFamily()) { + FirmwareUtil *pFwInterface = pLinuxSysmanImp->getFwUtilInterface(); + if (pFwInterface != nullptr) { + if (ZE_RESULT_SUCCESS == static_cast(pFwInterface)->fwDeviceInit()) { + static_cast(pFwInterface)->fwSupportedDiagTests(supportedDiagTests); + } + } + } +} + +void LinuxDiagnosticsImp::releaseSysmanDeviceResources() { + pLinuxSysmanImp->getSysmanDeviceImp()->pEngineHandleContext->releaseEngines(); + pLinuxSysmanImp->getSysmanDeviceImp()->pRasHandleContext->releaseRasHandles(); + pLinuxSysmanImp->releasePmtObject(); + pLinuxSysmanImp->releaseLocalDrmHandle(); +} + +void LinuxDiagnosticsImp::releaseDeviceResources() { + releaseSysmanDeviceResources(); + auto device = static_cast(pLinuxSysmanImp->getDeviceHandle()); + device->releaseResources(); + executionEnvironment->memoryManager->releaseDeviceSpecificMemResources(rootDeviceIndex); + executionEnvironment->releaseRootDeviceEnvironmentResources(executionEnvironment->rootDeviceEnvironments[rootDeviceIndex].get()); + executionEnvironment->rootDeviceEnvironments[rootDeviceIndex].reset(); +} + +void LinuxDiagnosticsImp::reInitSysmanDeviceResources() { + pLinuxSysmanImp->getSysmanDeviceImp()->updateSubDeviceHandlesLocally(); + pLinuxSysmanImp->createPmtHandles(); + pLinuxSysmanImp->getSysmanDeviceImp()->pRasHandleContext->init(pLinuxSysmanImp->getSysmanDeviceImp()->deviceHandles); + pLinuxSysmanImp->getSysmanDeviceImp()->pEngineHandleContext->init(); +} + +ze_result_t LinuxDiagnosticsImp::initDevice() { + ze_result_t 
result = ZE_RESULT_SUCCESS; + auto device = static_cast(pLinuxSysmanImp->getDeviceHandle()); + + auto neoDevice = NEO::DeviceFactory::createDevice(*executionEnvironment, devicePciBdf, rootDeviceIndex); + if (neoDevice == nullptr) { + return ZE_RESULT_ERROR_DEVICE_LOST; + } + static_cast(device->getDriverHandle())->updateRootDeviceBitFields(neoDevice); + static_cast(device->getDriverHandle())->enableRootDeviceDebugger(neoDevice); + Device::deviceReinit(device->getDriverHandle(), device, neoDevice, &result); + reInitSysmanDeviceResources(); + return ZE_RESULT_SUCCESS; +} + +static void getPidFdsForOpenDevice(ProcfsAccess *pProcfsAccess, SysfsAccess *pSysfsAccess, const ::pid_t pid, std::vector &deviceFds) { + // Return a list of all the file descriptors of this process that point to this device + std::vector fds; + deviceFds.clear(); + if (ZE_RESULT_SUCCESS != pProcfsAccess->getFileDescriptors(pid, fds)) { + // Process exited. Not an error. Just ignore. + return; + } + for (auto &&fd : fds) { + std::string file; + if (pProcfsAccess->getFileName(pid, fd, file) != ZE_RESULT_SUCCESS) { + // Process closed this file. Not an error. Just ignore. + continue; + } + if (pSysfsAccess->isMyDeviceFile(file)) { + deviceFds.push_back(fd); + } + } +} +// A 'warm reset' is a conventional reset that is triggered across a PCI express link. +// A warm reset is triggered either when a link is forced into electrical idle or +// by sending TS1 and TS2 ordered sets with the hot reset bit set. +// Software can initiate a warm reset by setting and then clearing the secondary bus reset bit +// in the bridge control register in the PCI configuration space of the bridge port upstream of the device. 
+ze_result_t LinuxDiagnosticsImp::osWarmReset() { + std::string rootPortPath; + std::string realRootPath; + ze_result_t result = pSysfsAccess->getRealPath(deviceDir, realRootPath); + if (ZE_RESULT_SUCCESS != result) { + return result; + } + auto device = static_cast(pDevice); + executionEnvironment = device->getNEODevice()->getExecutionEnvironment(); + + ExecutionEnvironmentRefCountRestore restorer(executionEnvironment); + releaseDeviceResources(); + // write 1 to remove + result = pFsAccess->write(realRootPath + '/' + "remove", "1"); + if (ZE_RESULT_SUCCESS != result) { + return result; + } + size_t loc; + + loc = realRootPath.find_last_of('/'); + realRootPath = realRootPath.substr(0, loc); + + int fd, ret = 0; + unsigned int offset = PCI_BRIDGE_CONTROL; // Bridge control offset in Header of PCI config space + unsigned int value = 0x00; + unsigned int resetValue = 0x00; + std::string configFilePath = realRootPath + '/' + "config"; + fd = this->openFunction(configFilePath.c_str(), O_RDWR); + if (fd < 0) { + return ZE_RESULT_ERROR_UNKNOWN; + } + this->preadFunction(fd, &value, 0x01, offset); + resetValue = value | PCI_BRIDGE_CTL_BUS_RESET; + this->pwriteFunction(fd, &resetValue, 0x01, offset); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // Sleep for 100 milliseconds just to make sure the change is propagated. 
+ this->pwriteFunction(fd, &value, 0x01, offset); + std::this_thread::sleep_for(std::chrono::milliseconds(500)); // Sleep for 500 milliseconds + ret = this->closeFunction(fd); + if (ret < 0) { + return ZE_RESULT_ERROR_UNKNOWN; + } + + result = pFsAccess->write(realRootPath + '/' + "rescan", "1"); + if (ZE_RESULT_SUCCESS != result) { + return result; + } + + return initDevice(); +} + +std::string getRootPortaddress(std::string &rootPortPath) { + size_t loc; + loc = rootPortPath.find_last_of('/'); // we get the pci address of the root port from rootPortPath + return rootPortPath.substr(loc + 1, std::string::npos); +} + +ze_result_t LinuxDiagnosticsImp::osColdReset() { + const std::string slotPath("/sys/bus/pci/slots/"); // holds the directories matching to the number of slots in the PC + std::string rootPortPath; // will hold the PCIe Root port directory path (the address of the PCIe slot). + std::string realRootPath; // will hold the absolute real path (not symlink) to the selected Device + ze_result_t result = pSysfsAccess->getRealPath(deviceDir, realRootPath); // e.g realRootPath=/sys/devices/pci0000:89/0000:89:02.0/0000:8a:00.0/0000:8b:01.0/0000:8c:00.0 + if (ZE_RESULT_SUCCESS != result) { + return result; + } + auto device = static_cast(pDevice); + executionEnvironment = device->getNEODevice()->getExecutionEnvironment(); + + ExecutionEnvironmentRefCountRestore restorer(executionEnvironment); + releaseDeviceResources(); + + rootPortPath = pLinuxSysmanImp->getPciRootPortDirectoryPath(realRootPath); // e.g rootPortPath=/sys/devices/pci0000:89/0000:89:02.0/0000:8a:00.0 + std::string rootAddress = getRootPortaddress(rootPortPath); // e.g rootAddress = 0000:8a:00.0 + + std::vector dir; + result = pFsAccess->listDirectory(slotPath, dir); // get list of slot directories from /sys/bus/pci/slots/ + if (ZE_RESULT_SUCCESS != result) { + return result; + } + for (auto &slot : dir) { + std::string slotAddress; + result = pFsAccess->read((slotPath + slot + "/address"), 
slotAddress); // extract slot address from the slot directory /sys/bus/pci/slots//address + if (ZE_RESULT_SUCCESS != result) { + return result; + } + if (slotAddress.compare(rootAddress) == 0) { // compare slot address to root port address + result = pFsAccess->write((slotPath + slot + "/power"), "0"); // turn off power + if (ZE_RESULT_SUCCESS != result) { + return result; + } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); // Sleep for 100 milliseconds just to make sure, 1 ms is defined as part of spec + result = pFsAccess->write((slotPath + slot + "/power"), "1"); // turn on power + if (ZE_RESULT_SUCCESS != result) { + return result; + } + return initDevice(); + } + } + return ZE_RESULT_ERROR_DEVICE_LOST; // incase the reset fails inform upper layers. +} + +ze_result_t LinuxDiagnosticsImp::osRunDiagTestsinFW(zes_diag_result_t *pResult) { + const int intVal = 1; + // before running diagnostics need to close all active workloads + // writing 1 to /sys/class/drm/card/quiesce_gpu will signal KMD + //GPU (every gt in the card) will be wedged. + // GPU will only be unwedged after warm/cold reset + ::pid_t myPid = pProcfsAccess->myProcessId(); + std::vector<::pid_t> processes; + ze_result_t result = pProcfsAccess->listProcesses(processes); + if (ZE_RESULT_SUCCESS != result) { + return result; + } + for (auto &&pid : processes) { + std::vector fds; + getPidFdsForOpenDevice(pProcfsAccess, pSysfsAccess, pid, fds); + if (pid == myPid) { + // L0 is expected to have this file open. + // Keep list of fds. Close before unbind. 
+ continue; + } + if (!fds.empty()) { + pProcfsAccess->kill(pid); + } + } + result = pSysfsAccess->write(quiescentGpuFile, intVal); + if (ZE_RESULT_SUCCESS != result) { + return result; + } + result = pSysfsAccess->write(invalidateLmemFile, intVal); + if (ZE_RESULT_SUCCESS != result) { + return result; + } + pFwInterface->fwRunDiagTests(osDiagType, pResult); + if (*pResult == ZES_DIAG_RESULT_REBOOT_FOR_REPAIR) { + return osColdReset(); + } + return osWarmReset(); // we need to at least do a Warm reset to bring the machine out of wedged state +} + +} // namespace L0 diff --git a/level_zero/tools/source/sysman/firmware/linux/CMakeLists.txt b/level_zero/tools/source/sysman/firmware/linux/CMakeLists.txt index b6c51b9f97..30b1a65536 100644 --- a/level_zero/tools/source/sysman/firmware/linux/CMakeLists.txt +++ b/level_zero/tools/source/sysman/firmware/linux/CMakeLists.txt @@ -8,9 +8,18 @@ set(L0_SRCS_TOOLS_SYSMAN_FIRMWARE_LINUX ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt ${CMAKE_CURRENT_SOURCE_DIR}/os_firmware_imp.cpp ${CMAKE_CURRENT_SOURCE_DIR}/os_firmware_imp.h - ${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}os_firmware_imp_helper.cpp ) +if(NEO_ENABLE_i915_PRELIM_DETECTION) + list(APPEND L0_SRCS_TOOLS_SYSMAN_FIRMWARE_LINUX + ${CMAKE_CURRENT_SOURCE_DIR}/os_firmware_imp_helper_prelim.cpp + ) +else() + list(APPEND L0_SRCS_TOOLS_SYSMAN_FIRMWARE_LINUX + ${CMAKE_CURRENT_SOURCE_DIR}/os_firmware_imp_helper.cpp + ) +endif() + if(UNIX) target_sources(${L0_STATIC_LIB_NAME} PRIVATE diff --git a/level_zero/tools/source/sysman/firmware/linux/os_firmware_imp_helper_prelim.cpp b/level_zero/tools/source/sysman/firmware/linux/os_firmware_imp_helper_prelim.cpp new file mode 100644 index 0000000000..3eacf107d1 --- /dev/null +++ b/level_zero/tools/source/sysman/firmware/linux/os_firmware_imp_helper_prelim.cpp @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2021-2022 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include 
"level_zero/tools/source/sysman/firmware/linux/os_firmware_imp.h" + +const std::string iafPath = "device/"; +const std::string iafDirectory = "iaf."; +const std::string pscbin_version = "/pscbin_version"; + +namespace L0 { + +ze_result_t LinuxFirmwareImp::getFirmwareVersion(std::string fwType, zes_firmware_properties_t *pProperties) { + std::string fwVersion; + if (fwType == "PSC") { + std::string path; + path.clear(); + std::vector list; + // scans the directories present in /sys/class/drm/cardX/device/ + ze_result_t result = pSysfsAccess->scanDirEntries(iafPath, list); + if (ZE_RESULT_SUCCESS != result) { + // There should be a device directory + return result; + } + for (const auto &entry : list) { + if (!iafDirectory.compare(entry.substr(0, iafDirectory.length()))) { + // device/iaf.X/pscbin_version, where X is the hardware slot number + path = iafPath + entry + pscbin_version; + } + } + if (path.empty()) { + // This device does not have a PSC Version + return ZE_RESULT_ERROR_NOT_AVAILABLE; + } + std::string pscVersion; + pscVersion.clear(); + result = pSysfsAccess->read(path, pscVersion); + if (ZE_RESULT_SUCCESS != result) { + // not able to read PSC version from iaf.x + return result; + } + strncpy_s(static_cast(pProperties->version), ZES_STRING_PROPERTY_SIZE, pscVersion.c_str(), ZES_STRING_PROPERTY_SIZE); + return result; + } + ze_result_t result = pFwInterface->getFwVersion(fwType, fwVersion); + if (result == ZE_RESULT_SUCCESS) { + strncpy_s(static_cast(pProperties->version), ZES_STRING_PROPERTY_SIZE, fwVersion.c_str(), ZES_STRING_PROPERTY_SIZE); + } + + return result; +} + +} // namespace L0 \ No newline at end of file diff --git a/level_zero/tools/source/sysman/ras/linux/CMakeLists.txt b/level_zero/tools/source/sysman/ras/linux/CMakeLists.txt old mode 100755 new mode 100644 index 2d0b8a5b08..35784a9734 --- a/level_zero/tools/source/sysman/ras/linux/CMakeLists.txt +++ b/level_zero/tools/source/sysman/ras/linux/CMakeLists.txt @@ -5,10 +5,24 @@ # 
set(L0_SRCS_TOOLS_SYSMAN_RAS_LINUX - ${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}os_ras_imp.cpp - ${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}os_ras_imp.h + ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt ) +if(NEO_ENABLE_i915_PRELIM_DETECTION) + list(APPEND L0_SRCS_TOOLS_SYSMAN_RAS_LINUX + ${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_prelim.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_prelim.h + ${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_gt.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_fabric.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp_hbm.cpp + ) +else() + list(APPEND L0_SRCS_TOOLS_SYSMAN_RAS_LINUX + ${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp.h + ) +endif() + if(UNIX) target_sources(${L0_STATIC_LIB_NAME} PRIVATE @@ -18,5 +32,4 @@ endif() # Make our source files visible to parent set_property(GLOBAL PROPERTY L0_SRCS_TOOLS_SYSMAN_RAS_LINUX ${L0_SRCS_TOOLS_SYSMAN_RAS_LINUX}) -add_subdirectories() diff --git a/level_zero/tools/source/sysman/ras/linux/os_ras_imp_fabric.cpp b/level_zero/tools/source/sysman/ras/linux/os_ras_imp_fabric.cpp new file mode 100644 index 0000000000..6876d3bf7a --- /dev/null +++ b/level_zero/tools/source/sysman/ras/linux/os_ras_imp_fabric.cpp @@ -0,0 +1,105 @@ +/* + * Copyright (C) 2021-2022 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h" + +#include "sysman/linux/fs_access.h" +#include "sysman/linux/os_sysman_imp.h" + +#include
namespace L0 {

// Rebuilds `nodes` with the readable fabric error-counter files of
// sub-device `subdeviceId` for the requested error type.
// Every existing directory /sys/module/iaf/drivers/platform:iaf/iaf.<0..31>
// is probed; below its sd.<subdeviceId> directory the candidates are
//   correctable  : fw_comm_errors, port.<1..8>/link_degrades
//   otherwise    : sd_failure, fw_error, port.<1..8>/link_failures
// and only candidates for which fsAccess->canRead() succeeds are kept.
void LinuxRasSourceFabric::getNodes(std::vector<std::string> &nodes, uint32_t subdeviceId, FsAccess *fsAccess, const zes_ras_error_type_t &type) {
    const uint32_t minBoardStrappedNumber = 0;
    const uint32_t maxBoardStrappedNumber = 31;
    const uint32_t minPortId = 1;
    const uint32_t maxPortId = 8;
    const std::string iafPathPrefix("/sys/module/iaf/drivers/platform:iaf/iaf.");
    const bool wantCorrectable = (type == ZES_RAS_ERROR_TYPE_CORRECTABLE);

    nodes.clear();

    for (uint32_t board = minBoardStrappedNumber; board <= maxBoardStrappedNumber; board++) {
        const std::string boardDir = iafPathPrefix + std::to_string(board);
        if (!fsAccess->directoryExists(boardDir)) {
            continue;
        }
        const std::string subDeviceDir = boardDir + "/sd." + std::to_string(subdeviceId);

        // Sub-device level counters first, then one counter per port.
        std::vector<std::string> candidates;
        if (wantCorrectable) {
            candidates.push_back(subDeviceDir + "/fw_comm_errors");
        } else {
            candidates.push_back(subDeviceDir + "/sd_failure");
            candidates.push_back(subDeviceDir + "/fw_error");
        }
        const char *portCounter = wantCorrectable ? "/link_degrades" : "/link_failures";
        for (uint32_t port = minPortId; port <= maxPortId; port++) {
            candidates.push_back(subDeviceDir + "/port." + std::to_string(port) + portCounter);
        }

        for (const auto &candidate : candidates) {
            if (fsAccess->canRead(candidate) == ZE_RESULT_SUCCESS) {
                nodes.push_back(candidate);
            }
        }
    }
}

// Inserts into errorType each RAS error type (uncorrectable, correctable) for
// which getNodes() finds at least one readable node on the (sub-)device
// behind deviceHandle. Sub-device index 0 is used for a root device.
// Always returns ZE_RESULT_SUCCESS.
ze_result_t LinuxRasSourceFabric::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType,
                                                            OsSysman *pOsSysman, ze_device_handle_t deviceHandle) {
    LinuxSysmanImp *pLinuxSysmanImp = static_cast<LinuxSysmanImp *>(pOsSysman);
    NEO::Device *neoDevice = static_cast<Device *>(deviceHandle)->getNEODevice();

    uint32_t subDeviceIndex = 0;
    if (neoDevice->isSubDevice()) {
        subDeviceIndex = static_cast<NEO::SubDevice *>(neoDevice)->getSubDeviceIndex();
    }

    std::vector<std::string> nodes;
    getNodes(nodes, subDeviceIndex, &pLinuxSysmanImp->getFsAccess(), ZES_RAS_ERROR_TYPE_UNCORRECTABLE);
    if (!nodes.empty()) {
        errorType.insert(ZES_RAS_ERROR_TYPE_UNCORRECTABLE);
    }
    getNodes(nodes, subDeviceIndex, &pLinuxSysmanImp->getFsAccess(), ZES_RAS_ERROR_TYPE_CORRECTABLE);
    if (!nodes.empty()) {
        errorType.insert(ZES_RAS_ERROR_TYPE_CORRECTABLE);
    }

    return ZE_RESULT_SUCCESS;
}

// Caches the filesystem accessor and resolves the error nodes for the given
// error type and sub-device once, at construction time.
LinuxRasSourceFabric::LinuxRasSourceFabric(OsSysman *pOsSysman, zes_ras_error_type_t type, uint32_t subDeviceId) {
    fsAccess = &static_cast<LinuxSysmanImp *>(pOsSysman)->getFsAccess();
    getNodes(errorNodes, subDeviceId, fsAccess, type);
}

// Returns the sum of the values read from all cached error nodes.
// The status of each read is not checked; a node whose read leaves the
// temporary untouched contributes 0.
uint64_t LinuxRasSourceFabric::getComputeErrorCount() {
    uint64_t total = 0;
    for (const auto &node : errorNodes) {
        uint64_t nodeValue = 0;
        fsAccess->read(node, nodeValue);
        total += nodeValue;
    }
    return total;
}

// Zeroes state.category and reports, under ZES_RAS_ERROR_CAT_COMPUTE_ERRORS,
// the current node sum minus the stored baseline. With `clear` set the
// baseline is first moved to the current sum and the sum is read again.
// Returns ZE_RESULT_ERROR_UNSUPPORTED_FEATURE when no error node is available.
ze_result_t LinuxRasSourceFabric::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) {
    if (errorNodes.empty()) {
        return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    }

    std::memset(state.category, 0, sizeof(zes_ras_state_t::category));

    uint64_t currentCount = getComputeErrorCount();
    if (clear) {
        baseComputeErrorCount = currentCount;
        currentCount = getComputeErrorCount();
    }
    state.category[ZES_RAS_ERROR_CAT_COMPUTE_ERRORS] = currentCount - baseComputeErrorCount;
    return ZE_RESULT_SUCCESS;
}

} // namespace L0
\ No newline at end of file diff --git a/level_zero/tools/source/sysman/ras/linux/os_ras_imp_gt.cpp b/level_zero/tools/source/sysman/ras/linux/os_ras_imp_gt.cpp new file mode 100644 index 0000000000..9c0d69b4b1 --- /dev/null +++ b/level_zero/tools/source/sysman/ras/linux/os_ras_imp_gt.cpp @@ -0,0 +1,345 @@ +/* + * Copyright (C) 2021-2022 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include
"level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h" + +#include "sysman/linux/os_sysman_imp.h" + +#include +namespace L0 { +static const std::map> categoryToListOfEventsUncorrectable = { + {ZES_RAS_ERROR_CAT_CACHE_ERRORS, + {"fatal-array-bist", "fatal-eu-grf", "fatal-eu-ic", + "fatal-guc", "fatal-idi-parity", "fatal-l3-double", + "fatal-l3-ecc-checker", "fatal-sampler", "fatal-slm", + "fatal-sqidi", "fatal-tlb"}}, + {ZES_RAS_ERROR_CAT_RESET, + {"engine-reset"}}, + {ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS, + {"eu-attention"}}, + {ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS, + {"soc-fatal-fabric-ss0-0", "soc-fatal-fabric-ss0-1", "soc-fatal-fabric-ss0-2", + "soc-fatal-fabric-ss0-3", "soc-fatal-fabric-ss1-0", "soc-fatal-fabric-ss1-1", + "soc-fatal-fabric-ss1-2", "soc-fatal-fabric-ss1-3", "soc-fatal-fabric-ss1-4", + "soc-fatal-hbm-ss0-0", "soc-fatal-hbm-ss0-1", "soc-fatal-hbm-ss0-2", + "soc-fatal-hbm-ss0-3", "soc-fatal-hbm-ss0-4", "soc-fatal-hbm-ss0-5", + "soc-fatal-hbm-ss0-6", "soc-fatal-hbm-ss0-7", "soc-fatal-hbm-ss0-8", + "soc-fatal-hbm-ss0-9", "soc-fatal-hbm-ss0-10", "soc-fatal-hbm-ss0-11", + "soc-fatal-hbm-ss0-12", "soc-fatal-hbm-ss0-13", "soc-fatal-hbm-ss0-14", + "soc-fatal-hbm-ss0-15", "soc-fatal-hbm-ss1-0", "soc-fatal-hbm-ss1-1", + "soc-fatal-hbm-ss1-2", "soc-fatal-hbm-ss1-3", "soc-fatal-hbm-ss1-4", + "soc-fatal-hbm-ss1-5", "soc-fatal-hbm-ss1-6", "soc-fatal-hbm-ss1-7", + "soc-fatal-hbm-ss1-8", "soc-fatal-hbm-ss1-9", "soc-fatal-hbm-ss1-10", + "soc-fatal-hbm-ss1-11", "soc-fatal-hbm-ss1-12", "soc-fatal-hbm-ss1-13", + "soc-fatal-hbm-ss1-14", "soc-fatal-hbm-ss1-15", "soc-fatal-mdfi-east", + "soc-fatal-mdfi-south", "soc-fatal-mdfi-west", "soc-fatal-psf-csc-0", + "soc-fatal-psf-csc-1", "soc-fatal-psf-csc-2", "soc-fatal-punit", + "sgunit-fatal", "soc-nonfatal-fabric-ss0-0", "soc-nonfatal-fabric-ss0-1", + "soc-nonfatal-fabric-ss0-2", "soc-nonfatal-fabric-ss0-3", "soc-nonfatal-fabric-ss1-0", + "soc-nonfatal-fabric-ss1-1", "soc-nonfatal-fabric-ss1-2", 
"soc-nonfatal-fabric-ss1-3", + "soc-nonfatal-fabric-ss1-4", "soc-nonfatal-hbm-ss0-0", "soc-nonfatal-hbm-ss0-1", + "soc-nonfatal-hbm-ss0-2", "soc-nonfatal-hbm-ss0-3", "soc-nonfatal-hbm-ss0-4", + "soc-nonfatal-hbm-ss0-5", "soc-nonfatal-hbm-ss0-6", "soc-nonfatal-hbm-ss0-7", + "soc-nonfatal-hbm-ss0-8", "soc-nonfatal-hbm-ss0-9", "soc-nonfatal-hbm-ss0-10", + "soc-nonfatal-hbm-ss0-11", "soc-nonfatal-hbm-ss0-12", "soc-nonfatal-hbm-ss0-13", + "soc-nonfatal-hbm-ss0-14", "soc-nonfatal-hbm-ss0-15", "soc-nonfatal-hbm-ss1-0", + "soc-nonfatal-hbm-ss1-1", "soc-nonfatal-hbm-ss1-2", "soc-nonfatal-hbm-ss1-3", + "soc-nonfatal-hbm-ss1-4", "soc-nonfatal-hbm-ss1-5", "soc-nonfatal-hbm-ss1-6", + "soc-nonfatal-hbm-ss1-7", "soc-nonfatal-hbm-ss1-8", "soc-nonfatal-hbm-ss1-9", + "soc-nonfatal-hbm-ss1-10", "soc-nonfatal-hbm-ss1-11", "soc-nonfatal-hbm-ss1-12", + "soc-nonfatal-hbm-ss1-13", "soc-nonfatal-hbm-ss1-14", "soc-nonfatal-hbm-ss1-15", + "soc-nonfatal-mdfi-east", "soc-nonfatal-mdfi-south", "soc-nonfatal-mdfi-west", + "soc-nonfatal-psf-csc-0", "soc-nonfatal-psf-csc-1", "soc-nonfatal-psf-csc-2", + "soc-nonfatal-punit", "sgunit-nonfatal"}}, + {ZES_RAS_ERROR_CAT_COMPUTE_ERRORS, + {"fatal-fpu", "fatal-l3-fabric"}}, + {ZES_RAS_ERROR_CAT_DRIVER_ERRORS, + {"driver-object-migration", "driver-engine-other", "driver-ggtt", + "driver-gt-interrupt", "driver-gt-other", "driver-guc-communication", + "driver-rps"}}}; + +static const std::map> categoryToListOfEventsCorrectable = { + {ZES_RAS_ERROR_CAT_CACHE_ERRORS, + {"correctable-eu-grf", "correctable-eu-ic", "correctable-guc", + "correctable-l3-sng", "correctable-sampler", "correctable-slm"}}, + {ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS, + {"soc-correctable-fabric-ss0-0", "soc-correctable-fabric-ss0-1", "soc-correctable-fabric-ss0-2", + "soc-correctable-fabric-ss0-3", "soc-correctable-fabric-ss1-0", "soc-correctable-fabric-ss1-1", + "soc-correctable-fabric-ss1-2", "soc-correctable-fabric-ss1-3", "soc-correctable-fabric-ss1-4", + "soc-correctable-hbm-ss0-0", 
"soc-correctable-hbm-ss0-1", "soc-correctable-hbm-ss0-2", + "soc-correctable-hbm-ss0-3", "soc-correctable-hbm-ss0-4", "soc-correctable-hbm-ss0-5", + "soc-correctable-hbm-ss0-6", "soc-correctable-hbm-ss0-7", "soc-correctable-hbm-ss0-8", + "soc-correctable-hbm-ss0-9", "soc-correctable-hbm-ss0-10", "soc-correctable-hbm-ss0-11", + "soc-correctable-hbm-ss0-12", "soc-correctable-hbm-ss0-13", "soc-correctable-hbm-ss0-14", + "soc-correctable-hbm-ss0-15", "soc-correctable-hbm-ss1-0", "soc-correctable-hbm-ss1-1", + "soc-correctable-hbm-ss1-2", "soc-correctable-hbm-ss1-3", "soc-correctable-hbm-ss1-4", + "soc-correctable-hbm-ss1-5", "soc-correctable-hbm-ss1-6", "soc-correctable-hbm-ss1-7", + "soc-correctable-hbm-ss1-8", "soc-correctable-hbm-ss1-9", "soc-correctable-hbm-ss1-10", + "soc-correctable-hbm-ss1-11", "soc-correctable-hbm-ss1-12", "soc-correctable-hbm-ss1-13", + "soc-correctable-hbm-ss1-14", "soc-correctable-hbm-ss1-15", "soc-correctable-mdfi-east", + "soc-correctable-mdfi-south", "soc-correctable-mdfi-west", "soc-correctable-psf-csc-0", + "soc-correctable-psf-csc-1", "soc-correctable-punit", "sgunit-correctable"}}}; + +static void closeFd(int64_t &fd) { + if (fd != -1) { + close(static_cast(fd)); + fd = -1; + } +} + +static ze_result_t readI915EventsDirectory(LinuxSysmanImp *pLinuxSysmanImp, std::vector &listOfEvents, std::string *eventDirectory) { + // To know how many errors are supported on a platform scan + // /sys/devices/i915_0000_01_00.0/events/ + // all events are enumerated in sysfs at /sys/devices/i915_0000_01_00.0/events/ + // For above example device is in PCI slot 0000:01:00.0: + SysfsAccess *pSysfsAccess = &pLinuxSysmanImp->getSysfsAccess(); + const std::string deviceDir("device"); + const std::string sysDevicesDir("/sys/devices/"); + std::string bdfDir; + ze_result_t result = pSysfsAccess->readSymLink(deviceDir, bdfDir); + if (ZE_RESULT_SUCCESS != result) { + return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + const auto loc = bdfDir.find_last_of('/'); + 
auto bdf = bdfDir.substr(loc + 1); + std::replace(bdf.begin(), bdf.end(), ':', '_'); + std::string i915DirName = "i915_" + bdf; + std::string sysfsNode = sysDevicesDir + i915DirName + "/" + "events"; + if (eventDirectory != nullptr) { + *eventDirectory = sysfsNode; + } + FsAccess *pFsAccess = &pLinuxSysmanImp->getFsAccess(); + result = pFsAccess->listDirectory(sysfsNode, listOfEvents); + if (ZE_RESULT_SUCCESS != result) { + return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + return ZE_RESULT_SUCCESS; +} + +static uint64_t convertHexToUint64(std::string strVal) { + auto loc = strVal.find('='); + std::stringstream ss; + ss << std::hex << strVal.substr(loc + 1); + uint64_t config = 0; + ss >> config; + return config; +} + +static bool isErrorTypeSupported(std::string pattern, std::vector &eventList) { + std::regex pPattern(pattern); + for (const auto &entry : eventList) { + if (regex_match(entry, pPattern) == true) { + return true; + } + } + return false; +} + +static bool getErrorType(std::vector errorPattern, std::vector &eventList) { + for (auto &pattern : errorPattern) { + if (isErrorTypeSupported(pattern, eventList) == true) { + return true; + } + } + return false; +} + +void LinuxRasSourceGt::closeFds() { + for (auto &memberFd : memberFds) { + closeFd(memberFd); + } + memberFds.clear(); + closeFd(groupFd); +} + +LinuxRasSourceGt::~LinuxRasSourceGt() { + closeFds(); +} + +void LinuxRasSourceGt::getSupportedRasErrorTypes(std::set &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle) { + LinuxSysmanImp *pLinuxSysmanImp = static_cast(pOsSysman); + std::vector listOfEvents = {}; + ze_result_t result = readI915EventsDirectory(pLinuxSysmanImp, listOfEvents, nullptr); + if (result != ZE_RESULT_SUCCESS) { + return; + } + ze_device_properties_t deviceProperties = {ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES}; + Device::fromHandle(deviceHandle)->getProperties(&deviceProperties); + bool onSubDevice = deviceProperties.flags & ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE; + uint32_t 
subDeviceId = deviceProperties.subdeviceId; + std::vector uncorrectablePattern; + std::vector correctablePattern; + // For device with no subDevice error entries are of form error-- + // and for device having subDevice error entries are of form error-gt-- + uncorrectablePattern.push_back("^error--driver.*"); + if (onSubDevice == false) { + correctablePattern.push_back("^error--correctable.*"); + correctablePattern.push_back("^error--soc-correctable.*"); + uncorrectablePattern.push_back("^error--engine-reset.*"); + uncorrectablePattern.push_back("^error--eu-attention.*"); + uncorrectablePattern.push_back("^error--fatal.*"); + uncorrectablePattern.push_back("^error--soc-fatal.*"); + uncorrectablePattern.push_back("^error--soc-nonfatal.*"); + } else { + correctablePattern.push_back("^error-gt" + std::to_string(subDeviceId) + "--correctable.*"); + correctablePattern.push_back("^error-gt" + std::to_string(subDeviceId) + "--soc-correctable.*"); + uncorrectablePattern.push_back("^error-gt" + std::to_string(subDeviceId) + "--driver.*"); + uncorrectablePattern.push_back("^error-gt" + std::to_string(subDeviceId) + "--fatal.*"); + uncorrectablePattern.push_back("^error-gt" + std::to_string(subDeviceId) + "--soc-fatal.*"); + uncorrectablePattern.push_back("^error-gt" + std::to_string(subDeviceId) + "--soc-nonfatal.*"); + uncorrectablePattern.push_back("^error-gt" + std::to_string(subDeviceId) + "--eu-attention.*"); + uncorrectablePattern.push_back("^error-gt" + std::to_string(subDeviceId) + "--engine-reset.*"); + } + if (getErrorType(correctablePattern, listOfEvents) == true) { + errorType.insert(ZES_RAS_ERROR_TYPE_CORRECTABLE); + } + if (getErrorType(uncorrectablePattern, listOfEvents) == true) { + errorType.insert(ZES_RAS_ERROR_TYPE_UNCORRECTABLE); + } +} + +ze_result_t LinuxRasSourceGt::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) { + if (clear == true) { + closeFds(); + totalEventCount = 0; + memset(state.category, 0, sizeof(state.category)); + 
memset(initialErrorCount, 0, sizeof(initialErrorCount)); + } + initRasErrors(clear); + // Iterate over all the file descriptor values present in vector which is mapped to given ras error category + // Use the file descriptors to read pmu counters and add all the errors corresponding to the ras error category + if (groupFd < 0) { + return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + std::map> categoryToEvent; + if (osRasErrorType == ZES_RAS_ERROR_TYPE_CORRECTABLE) { + categoryToEvent = categoryToListOfEventsCorrectable; + } + if (osRasErrorType == ZES_RAS_ERROR_TYPE_UNCORRECTABLE) { + categoryToEvent = categoryToListOfEventsUncorrectable; + } + std::vector data(2 + totalEventCount, 0); // In data[], event count starts from second index, first value gives number of events and second value is for timestamp + if (pPmuInterface->pmuRead(static_cast(groupFd), data.data(), sizeof(uint64_t) * data.size()) < 0) { + return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + /* The data buffer retrieved after reading pmu counters is parsed to get the error count for each suberror category */ + uint64_t initialIndex = 2; // Initial index in the buffer from which the data be parsed begins + for (auto errorCat = errorCategoryToEventCount.begin(); errorCat != errorCategoryToEventCount.end(); errorCat++) { + uint64_t errorCount = 0; + uint64_t j = 0; + for (; j < errorCat->second; j++) { + errorCount += data[initialIndex + j]; + } + state.category[errorCat->first] = errorCount + initialErrorCount[errorCat->first]; + initialIndex += j; + } + + return ZE_RESULT_SUCCESS; +} + +ze_result_t LinuxRasSourceGt::getPmuConfig( + const std::string &eventDirectory, + const std::vector &listOfEvents, + const std::string &errorFileToGetConfig, + std::string &pmuConfig) { + auto findErrorInList = std::find(listOfEvents.begin(), listOfEvents.end(), errorFileToGetConfig); + if (findErrorInList == listOfEvents.end()) { + return ZE_RESULT_ERROR_UNKNOWN; + } + return pFsAccess->read(eventDirectory + "/" + 
errorFileToGetConfig, pmuConfig); +} + +ze_result_t LinuxRasSourceGt::getBootUpErrorCountFromSysfs( + std::string nameOfError, + const std::string &errorCounterDir, + uint64_t &errorVal) { + std::replace(nameOfError.begin(), nameOfError.end(), '-', '_'); // replace - with _ to convert name of pmu config node to name of sysfs node + return pSysfsAccess->read(errorCounterDir + "/" + nameOfError, errorVal); +} + +void LinuxRasSourceGt::initRasErrors(ze_bool_t clear) { + + // if already initialized + if (groupFd >= 0) { + return; + } + + std::string eventDirectory; + std::vector listOfEvents = {}; + ze_result_t result = readI915EventsDirectory(pLinuxSysmanImp, listOfEvents, &eventDirectory); + if (result != ZE_RESULT_SUCCESS) { + return; + } + std::map> categoryToListOfEvents; + if (osRasErrorType == ZES_RAS_ERROR_TYPE_CORRECTABLE) { + categoryToListOfEvents = categoryToListOfEventsCorrectable; + } + if (osRasErrorType == ZES_RAS_ERROR_TYPE_UNCORRECTABLE) { + categoryToListOfEvents = categoryToListOfEventsUncorrectable; + } + std::string errorPrefix = "error--"; // prefix string of the file containing config value for pmu counters + std::string errorCounterDir = "gt/gt0/error_counter"; // Directory containing the sysfs nodes which in turn contains initial value of error count + if (isSubdevice == true) { + errorPrefix = "error-gt" + std::to_string(subdeviceId) + "--"; + errorCounterDir = "gt/gt" + std::to_string(subdeviceId) + "/error_counter"; + } + // Following loop retrieves initial count of errors from sysfs and pmu config values for each ras error + // PMU: error-- Ex:- error--engine-reset (config with no subdevice) + // PMU: error-gt-- Ex:- error-gt0--engine-reset (config with subdevices) + // PMU: error-- Ex:- error--driver-object-migration (config for device level errors) + // Sysfs: card0/gt/gt0/error_counter/ Ex:- gt/gt0/error_counter/engine_reset (sysfs with no subdevice) + // Sysfs: card0/gt/gt/error_counter/ Ex:- gt/gt1/error_counter/engine_reset (sysfs 
with dubdevices) + // Sysfs: error_counter/ Ex:- error_counter/driver_object_migration (sysfs for error which occur at device level) + for (auto const &rasErrorCatToListOfEvents : categoryToListOfEvents) { + uint64_t eventCount = 0; + uint64_t errorCount = 0; + for (auto const &nameOfError : rasErrorCatToListOfEvents.second) { + std::string errorPrefixLocal = errorPrefix; + std::string errorCounterDirLocal = errorCounterDir; + if (nameOfError == "driver-object-migration") { // check for errors which occur at device level + errorCounterDirLocal = "error_counter"; + errorPrefixLocal = "error--"; + } + uint64_t initialErrorVal = 0; + if (clear == false) { + result = getBootUpErrorCountFromSysfs(nameOfError, errorCounterDirLocal, initialErrorVal); + if (result != ZE_RESULT_SUCCESS) { + continue; + } + } + std::string pmuConfig; + result = getPmuConfig(eventDirectory, listOfEvents, errorPrefixLocal + nameOfError, pmuConfig); + if (result != ZE_RESULT_SUCCESS) { + continue; + } + uint64_t config = convertHexToUint64(pmuConfig); + if (groupFd == -1) { + groupFd = pPmuInterface->pmuInterfaceOpen(config, -1, PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_GROUP); // To get file descriptor of the group leader + if (groupFd < 0) { + return; + } + } else { + // The rest of the group members are created with subsequent calls with groupFd being set to the file descriptor of the group leader + memberFds.push_back(pPmuInterface->pmuInterfaceOpen(config, static_cast(groupFd), PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_GROUP)); + } + eventCount++; + errorCount += initialErrorVal; + } + initialErrorCount[rasErrorCatToListOfEvents.first] = errorCount; + errorCategoryToEventCount[rasErrorCatToListOfEvents.first] = eventCount; + totalEventCount += eventCount; + } +} + +LinuxRasSourceGt::LinuxRasSourceGt(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId) : pLinuxSysmanImp(pLinuxSysmanImp), osRasErrorType(type), 
isSubdevice(onSubdevice), subdeviceId(subdeviceId) { + pPmuInterface = pLinuxSysmanImp->getPmuInterface(); + pFsAccess = &pLinuxSysmanImp->getFsAccess(); + pSysfsAccess = &pLinuxSysmanImp->getSysfsAccess(); +} + +} // namespace L0 \ No newline at end of file diff --git a/level_zero/tools/source/sysman/ras/linux/os_ras_imp_hbm.cpp b/level_zero/tools/source/sysman/ras/linux/os_ras_imp_hbm.cpp new file mode 100644 index 0000000000..6851435dfb --- /dev/null +++ b/level_zero/tools/source/sysman/ras/linux/os_ras_imp_hbm.cpp @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2021-2022 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h" + +#include "sysman/linux/os_sysman_imp.h" + +namespace L0 { + +void LinuxRasSourceHbm::getSupportedRasErrorTypes(std::set &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle) { + LinuxSysmanImp *pLinuxSysmanImp = static_cast(pOsSysman); + FirmwareUtil *pFwInterface = pLinuxSysmanImp->getFwUtilInterface(); + if (pFwInterface != nullptr) { + errorType.insert(ZES_RAS_ERROR_TYPE_CORRECTABLE); + errorType.insert(ZES_RAS_ERROR_TYPE_UNCORRECTABLE); + } +} + +ze_result_t LinuxRasSourceHbm::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) { + if (pFwInterface == nullptr) { + return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + uint32_t subDeviceCount = 0; + pDevice->getSubDevices(&subDeviceCount, nullptr); + if (clear == true) { + uint64_t errorCount = 0; + ze_result_t result = pFwInterface->fwGetMemoryErrorCount(osRasErrorType, subDeviceCount, subdeviceId, errorCount); + if (result != ZE_RESULT_SUCCESS) { + return result; + } + errorBaseline = errorCount; // during clear update the error baseline value + } + uint64_t errorCount = 0; + ze_result_t result = pFwInterface->fwGetMemoryErrorCount(osRasErrorType, subDeviceCount, subdeviceId, errorCount); + if (result != ZE_RESULT_SUCCESS) { + return result; + } + state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS] 
= errorCount - errorBaseline; + return ZE_RESULT_SUCCESS; +} + +LinuxRasSourceHbm::LinuxRasSourceHbm(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, uint32_t subdeviceId) : pLinuxSysmanImp(pLinuxSysmanImp), osRasErrorType(type), subdeviceId(subdeviceId) { + pFwInterface = pLinuxSysmanImp->getFwUtilInterface(); + pDevice = pLinuxSysmanImp->getDeviceHandle(); +} + +} // namespace L0 diff --git a/level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.cpp b/level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.cpp new file mode 100644 index 0000000000..b2fe4d60c9 --- /dev/null +++ b/level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.cpp @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2020-2022 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h" + +#include "sysman/linux/os_sysman_imp.h" + +namespace L0 { + +void OsRas::getSupportedRasErrorTypes(std::set &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle) { + + constexpr auto maxErrorTypes = 2; + LinuxRasSourceGt::getSupportedRasErrorTypes(errorType, pOsSysman, deviceHandle); + if (errorType.size() < maxErrorTypes) { + LinuxRasSourceFabric::getSupportedRasErrorTypes(errorType, pOsSysman, deviceHandle); + if (errorType.size() < maxErrorTypes) { + LinuxRasSourceHbm::getSupportedRasErrorTypes(errorType, pOsSysman, deviceHandle); + } + } +} + +ze_result_t LinuxRasImp::osRasGetConfig(zes_ras_config_t *config) { + config->totalThreshold = totalThreshold; + memcpy(config->detailedThresholds.category, categoryThreshold, sizeof(config->detailedThresholds.category)); + return ZE_RESULT_SUCCESS; +} + +ze_result_t LinuxRasImp::osRasSetConfig(const zes_ras_config_t *config) { + if (pFsAccess->isRootUser() == true) { + totalThreshold = config->totalThreshold; + memcpy(categoryThreshold, config->detailedThresholds.category, sizeof(config->detailedThresholds.category)); + return ZE_RESULT_SUCCESS; + } + return 
ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS; +} + +ze_result_t LinuxRasImp::osRasGetProperties(zes_ras_properties_t &properties) { + properties.pNext = nullptr; + properties.type = osRasErrorType; + properties.onSubdevice = isSubdevice; + properties.subdeviceId = subdeviceId; + return ZE_RESULT_SUCCESS; +} + +ze_result_t LinuxRasImp::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) { + if (clear == true) { + if (pFsAccess->isRootUser() == false) { + return ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS; + } + } + + ze_result_t result = ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; + for (auto &rasSource : rasSources) { + zes_ras_state_t localState = {}; + ze_result_t localResult = rasSource->osRasGetState(localState, clear); + if (localResult != ZE_RESULT_SUCCESS) { + continue; + } + for (int i = 0; i < ZES_MAX_RAS_ERROR_CATEGORY_COUNT; i++) { + state.category[i] += localState.category[i]; + } + result = ZE_RESULT_SUCCESS; + } + return result; +} + +void LinuxRasImp::initSources() { + rasSources.push_back(std::make_unique(pLinuxSysmanImp, osRasErrorType, isSubdevice, subdeviceId)); + rasSources.push_back(std::make_unique(pLinuxSysmanImp, osRasErrorType, subdeviceId)); + rasSources.push_back(std::make_unique(pLinuxSysmanImp, osRasErrorType, subdeviceId)); +} + +LinuxRasImp::LinuxRasImp(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId) : osRasErrorType(type), isSubdevice(onSubdevice), subdeviceId(subdeviceId) { + pLinuxSysmanImp = static_cast(pOsSysman); + pFsAccess = &pLinuxSysmanImp->getFsAccess(); + initSources(); +} + +OsRas *OsRas::create(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId) { + LinuxRasImp *pLinuxRasImp = new LinuxRasImp(pOsSysman, type, onSubdevice, subdeviceId); + return static_cast(pLinuxRasImp); +} + +} // namespace L0 diff --git a/level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h b/level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h new file mode 
100644 index 0000000000..ba064ae1cc --- /dev/null +++ b/level_zero/tools/source/sysman/ras/linux/os_ras_imp_prelim.h @@ -0,0 +1,128 @@ +/* + * Copyright (C) 2020-2022 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#pragma once +#include "shared/source/helpers/non_copyable_or_moveable.h" + +#include "level_zero/tools/source/sysman/ras/os_ras.h" + +#include +#include +#include +#include + +namespace L0 { +class FsAccess; +class SysfsAccess; +class PmuInterface; +class LinuxSysmanImp; +class LinuxRasSources; +class FirmwareUtil; +struct Device; + +class LinuxRasImp : public OsRas, NEO::NonCopyableOrMovableClass { + public: + ze_result_t osRasGetProperties(zes_ras_properties_t &properties) override; + ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override; + ze_result_t osRasGetConfig(zes_ras_config_t *config) override; + ze_result_t osRasSetConfig(const zes_ras_config_t *config) override; + LinuxRasImp(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId); + LinuxRasImp() = default; + ~LinuxRasImp() override = default; + + protected: + zes_ras_error_type_t osRasErrorType = {}; + FsAccess *pFsAccess = nullptr; + LinuxSysmanImp *pLinuxSysmanImp = nullptr; + std::vector> rasSources = {}; + + private: + void initSources(); + bool isSubdevice = false; + uint32_t subdeviceId = 0; + uint64_t totalThreshold = 0; + uint64_t categoryThreshold[ZES_MAX_RAS_ERROR_CATEGORY_COUNT] = {0}; +}; + +class LinuxRasSources : NEO::NonCopyableOrMovableClass { + public: + virtual ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) = 0; + virtual ~LinuxRasSources() = default; +}; + +class LinuxRasSourceGt : public LinuxRasSources { + public: + virtual ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override; + static void getSupportedRasErrorTypes(std::set &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle); + LinuxRasSourceGt(LinuxSysmanImp *pLinuxSysmanImp, 
zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId); + LinuxRasSourceGt() = default; + virtual ~LinuxRasSourceGt(); + + protected: + LinuxSysmanImp *pLinuxSysmanImp = nullptr; + zes_ras_error_type_t osRasErrorType = {}; + PmuInterface *pPmuInterface = nullptr; + FsAccess *pFsAccess = nullptr; + SysfsAccess *pSysfsAccess = nullptr; + + private: + void initRasErrors(ze_bool_t clear); + ze_result_t getPmuConfig( + const std::string &eventDirectory, + const std::vector &listOfEvents, + const std::string &errorFileToGetConfig, + std::string &pmuConfig); + ze_result_t getBootUpErrorCountFromSysfs( + std::string nameOfError, + const std::string &errorCounterDir, + uint64_t &errorVal); + void closeFds(); + int64_t groupFd = -1; + std::vector memberFds = {}; + uint64_t initialErrorCount[ZES_MAX_RAS_ERROR_CATEGORY_COUNT] = {0}; + std::map errorCategoryToEventCount; + uint64_t totalEventCount = 0; + bool isSubdevice = false; + uint32_t subdeviceId = 0; +}; + +class LinuxRasSourceFabric : public LinuxRasSources { + public: + static ze_result_t getSupportedRasErrorTypes(std::set &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle); + LinuxRasSourceFabric(OsSysman *pOsSysman, zes_ras_error_type_t type, uint32_t subDeviceId); + ~LinuxRasSourceFabric() = default; + + ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override; + + private: + FsAccess *fsAccess = nullptr; + std::vector errorNodes = {}; + uint64_t baseComputeErrorCount = 0; + uint64_t getComputeErrorCount(); + static void getNodes(std::vector &nodes, uint32_t subdeviceId, FsAccess *fsAccess, const zes_ras_error_type_t &type); +}; + +class LinuxRasSourceHbm : public LinuxRasSources { + public: + virtual ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override; + static void getSupportedRasErrorTypes(std::set &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle); + LinuxRasSourceHbm(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t 
type, uint32_t subdeviceId); + LinuxRasSourceHbm() = default; + virtual ~LinuxRasSourceHbm() override{}; + + protected: + LinuxSysmanImp *pLinuxSysmanImp = nullptr; + zes_ras_error_type_t osRasErrorType = {}; + FirmwareUtil *pFwInterface = nullptr; + Device *pDevice = nullptr; + + private: + uint64_t errorBaseline = 0; + uint32_t subdeviceId = 0; +}; + +} // namespace L0 diff --git a/level_zero/tools/test/unit_tests/sources/sysman/diagnostics/linux/CMakeLists.txt b/level_zero/tools/test/unit_tests/sources/sysman/diagnostics/linux/CMakeLists.txt index af9e3fb10a..179c3f7cba 100644 --- a/level_zero/tools/test/unit_tests/sources/sysman/diagnostics/linux/CMakeLists.txt +++ b/level_zero/tools/test/unit_tests/sources/sysman/diagnostics/linux/CMakeLists.txt @@ -4,11 +4,21 @@ # SPDX-License-Identifier: MIT # +set(L0_TESTS_TOOLS_SYSMAN_DIAGNOSTICS_LINUX + ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt + ${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}test_zes_sysman_diagnostics.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/mock_zes_sysman_diagnostics.h +) + +if((NEO_ENABLE_i915_PRELIM_DETECTION) AND ("${BRANCH_TYPE}" STREQUAL "")) + list(REMOVE_ITEM L0_TESTS_TOOLS_SYSMAN_DIAGNOSTICS_LINUX + ${CMAKE_CURRENT_SOURCE_DIR}/test_zes_sysman_diagnostics.cpp + ) +endif() + if(UNIX) target_sources(${TARGET_NAME} PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt - ${CMAKE_CURRENT_SOURCE_DIR}/mock_zes_sysman_diagnostics.h - ${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}test_zes_sysman_diagnostics.cpp + ${L0_TESTS_TOOLS_SYSMAN_DIAGNOSTICS_LINUX} ) endif() diff --git a/level_zero/tools/test/unit_tests/sources/sysman/firmware/linux/CMakeLists.txt b/level_zero/tools/test/unit_tests/sources/sysman/firmware/linux/CMakeLists.txt index e51342d8ca..6fbc1bbcbc 100644 --- a/level_zero/tools/test/unit_tests/sources/sysman/firmware/linux/CMakeLists.txt +++ b/level_zero/tools/test/unit_tests/sources/sysman/firmware/linux/CMakeLists.txt @@ -4,11 +4,22 @@ # SPDX-License-Identifier: MIT # 
+set(L0_TESTS_TOOLS_SYSMAN_FIRMWARE_LINUX + ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt + ${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}mock_zes_sysman_firmware.h + ${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}test_zes_sysman_firmware.cpp +) + +if((NEO_ENABLE_i915_PRELIM_DETECTION) AND ("${BRANCH_TYPE}" STREQUAL "")) + list(REMOVE_ITEM L0_TESTS_TOOLS_SYSMAN_FIRMWARE_LINUX + ${CMAKE_CURRENT_SOURCE_DIR}/test_zes_sysman_firmware.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/mock_zes_sysman_firmware.h + ) +endif() + if(UNIX) target_sources(${TARGET_NAME} PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt - ${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}mock_zes_sysman_firmware.h - ${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}test_zes_sysman_firmware.cpp + ${L0_TESTS_TOOLS_SYSMAN_FIRMWARE_LINUX} ) endif() diff --git a/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/CMakeLists.txt b/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/CMakeLists.txt index 16e69d4e65..01789a347c 100644 --- a/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/CMakeLists.txt +++ b/level_zero/tools/test/unit_tests/sources/sysman/ras/linux/CMakeLists.txt @@ -4,12 +4,23 @@ # SPDX-License-Identifier: MIT # +set(L0_TESTS_TOOLS_SYSMAN_RAS_LINUX + ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt + ${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}test_zes_ras.cpp + ${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}mock_fs_ras.h +) + +if((NEO_ENABLE_i915_PRELIM_DETECTION) AND ("${BRANCH_TYPE}" STREQUAL "")) + list(REMOVE_ITEM L0_TESTS_TOOLS_SYSMAN_RAS_LINUX + ${CMAKE_CURRENT_SOURCE_DIR}/test_zes_ras.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/mock_fs_ras.h + ) +endif() + if(UNIX) target_sources(${TARGET_NAME} PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt - ${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}test_zes_ras.cpp - ${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}mock_fs_ras.h + ${L0_TESTS_TOOLS_SYSMAN_RAS_LINUX} ) endif() add_subdirectories()