mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-06 02:18:05 +08:00
add counter support for RAS.
- Added dual handle support for RAS correctable and uncorrectable errors.
- Added reset counter for RAS.
- Added OS-specific ULT for RAS.

Change-Id: Ia10115bf6720ab211f549571e810ec0d6c0801ec
This commit is contained in:
committed by
sys_ocldev
parent
fa3cb35fde
commit
0c9c55cd17
@@ -7,7 +7,7 @@
|
||||
# OS-independent sources of the Sysman RAS module.
# ras_imp.cpp is listed exactly once, from this directory; the extra entry that
# reached it through ${BRANCH_DIR_SUFFIX} named the same translation unit a
# second time and has been dropped.
set(L0_SRCS_TOOLS_SYSMAN_RAS
    ${CMAKE_CURRENT_SOURCE_DIR}/ras.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/ras.h
    ${CMAKE_CURRENT_SOURCE_DIR}/ras_imp.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/ras_imp.h
    ${CMAKE_CURRENT_SOURCE_DIR}/os_ras.h
)
|
||||
|
||||
@@ -5,7 +5,8 @@
|
||||
#
|
||||
|
||||
# Linux-specific sources of the Sysman RAS module.
# os_ras_imp.cpp is listed exactly once, from this directory; the extra entry
# that reached it through ${BRANCH_DIR_SUFFIX} named the same translation unit
# a second time and has been dropped.
set(L0_SRCS_TOOLS_SYSMAN_RAS_LINUX
    ${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp.h
)
|
||||
|
||||
if(UNIX)
|
||||
|
||||
@@ -5,26 +5,39 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#include "shared/source/helpers/non_copyable_or_moveable.h"
|
||||
|
||||
#include "level_zero/tools/source/sysman/ras/os_ras.h"
|
||||
#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp.h"
|
||||
|
||||
#include "sysman/linux/os_sysman_imp.h"
|
||||
|
||||
namespace L0 {
|
||||
|
||||
// Linux backend of the OsRas interface; non-copyable via NEO::NonCopyableClass.
// NOTE(review): a class with this same name is also declared in
// level_zero/tools/source/sysman/ras/linux/os_ras_imp.h, which is included
// above, and that declaration carries the members used by the definitions
// below (rasCounterDir, osRasErrorType, ...). Presumably this in-file copy is
// the superseded one -- confirm before relying on it.
class LinuxRasImp : public OsRas, public NEO::NonCopyableClass {
|
||||
public:
|
||||
// Builds the backend from the OS-level sysman object.
LinuxRasImp(OsSysman *pOsSysman);
|
||||
~LinuxRasImp() override = default;
|
||||
// OsRas override: fills pDetails with counter values and returns the status.
ze_result_t getCounterValues(zet_ras_details_t *pDetails) override;
|
||||
|
||||
private:
|
||||
// Filesystem accessor the member functions call (fileExists / read); starts
// out null.
FsAccess *pFsAccess = nullptr;
|
||||
};
|
||||
// Directory that holds the RAS counter files.
const std::string LinuxRasImp::rasCounterDir{"/var/lib/libze_intel_gpu/"};
// File name of the reset counter inside rasCounterDir.
const std::string LinuxRasImp::resetCounter{"ras_reset_count"};
// Full path of the reset counter. It is built from the two strings above, so
// it has to stay after them in this translation unit.
const std::string LinuxRasImp::resetCounterFile{rasCounterDir + resetCounter};
|
||||
|
||||
// Records which RAS error type this backend instance answers for;
// isRasSupported() consults the stored value.
void LinuxRasImp::setRasErrorType(zet_ras_error_type_t type) {
    this->osRasErrorType = type;
}
|
||||
bool LinuxRasImp::isRasSupported(void) {
|
||||
if (false == pFsAccess->fileExists(rasCounterDir)) {
|
||||
return false;
|
||||
}
|
||||
if (osRasErrorType == ZET_RAS_ERROR_TYPE_CORRECTABLE) {
|
||||
return false;
|
||||
} else {
|
||||
// i915 support for UNCORRECTABLE errors is assumed true
|
||||
// since support for reset event is already available.
|
||||
return true;
|
||||
}
|
||||
}
|
||||
// Reads the reset counter from resetCounterFile and stores it in
// pDetails->numResets. No other field of *pDetails is written here.
//
// Returns the status of the file read. On failure *pDetails is left untouched.
//
// The leading unconditional `return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;` was
// removed: it made the counter read below unreachable.
ze_result_t LinuxRasImp::getCounterValues(zet_ras_details_t *pDetails) {
    uint64_t counterValue = 0;
    const ze_result_t result = pFsAccess->read(resetCounterFile, counterValue);
    if (ZE_RESULT_SUCCESS != result) {
        return result;
    }
    pDetails->numResets = counterValue;
    return result;
}
|
||||
|
||||
LinuxRasImp::LinuxRasImp(OsSysman *pOsSysman) {
|
||||
|
||||
39
level_zero/tools/source/sysman/ras/linux/os_ras_imp.h
Normal file
39
level_zero/tools/source/sysman/ras/linux/os_ras_imp.h
Normal file
@@ -0,0 +1,39 @@
|
||||
/*
|
||||
* Copyright (C) 2020 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "shared/source/helpers/non_copyable_or_moveable.h"
|
||||
|
||||
#include "level_zero/tools/source/sysman/ras/os_ras.h"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace L0 {
|
||||
|
||||
class FsAccess;
|
||||
class LinuxRasImp : public OsRas, public NEO::NonCopyableClass {
|
||||
public:
|
||||
LinuxRasImp(OsSysman *pOsSysman);
|
||||
LinuxRasImp() = default;
|
||||
~LinuxRasImp() override = default;
|
||||
ze_result_t getCounterValues(zet_ras_details_t *pDetails) override;
|
||||
bool isRasSupported(void) override;
|
||||
void setRasErrorType(zet_ras_error_type_t rasErrorType) override;
|
||||
|
||||
protected:
|
||||
FsAccess *pFsAccess = nullptr;
|
||||
zet_ras_error_type_t osRasErrorType;
|
||||
|
||||
private:
|
||||
static const std::string rasCounterDir;
|
||||
static const std::string resetCounter;
|
||||
static const std::string resetCounterFile;
|
||||
std::vector<std::string> rasCounterDirFileList = {};
|
||||
};
|
||||
|
||||
} // namespace L0
|
||||
@@ -15,6 +15,8 @@ struct OsSysman;
|
||||
class OsRas {
|
||||
public:
|
||||
virtual ze_result_t getCounterValues(zet_ras_details_t *pDetails) = 0;
|
||||
virtual bool isRasSupported(void) = 0;
|
||||
virtual void setRasErrorType(zet_ras_error_type_t type) = 0;
|
||||
static OsRas *create(OsSysman *pOsSysman);
|
||||
virtual ~OsRas() = default;
|
||||
};
|
||||
|
||||
@@ -14,13 +14,20 @@ RasHandleContext::~RasHandleContext() {
|
||||
delete pRas;
|
||||
}
|
||||
}
|
||||
|
||||
void RasHandleContext::init() {
|
||||
Ras *pRas = new RasImp(pOsSysman);
|
||||
handleList.push_back(pRas);
|
||||
// Creates a RAS object for the given error type and appends it to handleList
// only if the object reports the type as supported; otherwise the object is
// destroyed again.
void RasHandleContext::createHandle(zet_ras_error_type_t type) {
    Ras *pCandidate = new RasImp(pOsSysman, type);
    if (!pCandidate->isRasErrorSupported) {
        delete pCandidate;
        return;
    }
    handleList.push_back(pCandidate);
}
|
||||
|
||||
ze_result_t RasHandleContext::rasGet(uint32_t *pCount, zet_sysman_ras_handle_t *phRas) {
|
||||
void RasHandleContext::init() {
|
||||
createHandle(ZET_RAS_ERROR_TYPE_UNCORRECTABLE);
|
||||
createHandle(ZET_RAS_ERROR_TYPE_CORRECTABLE);
|
||||
}
|
||||
ze_result_t RasHandleContext::rasGet(uint32_t *pCount,
|
||||
zet_sysman_ras_handle_t *phRas) {
|
||||
if (nullptr == phRas) {
|
||||
*pCount = static_cast<uint32_t>(handleList.size());
|
||||
return ZE_RESULT_SUCCESS;
|
||||
|
||||
@@ -29,6 +29,8 @@ class Ras : _zet_sysman_ras_handle_t {
|
||||
return static_cast<Ras *>(handle);
|
||||
}
|
||||
inline zet_sysman_ras_handle_t toHandle() { return this; }
|
||||
bool isRasErrorSupported = false;
|
||||
zet_ras_error_type_t rasErrorType;
|
||||
};
|
||||
|
||||
struct RasHandleContext {
|
||||
@@ -41,6 +43,9 @@ struct RasHandleContext {
|
||||
|
||||
OsSysman *pOsSysman = nullptr;
|
||||
std::vector<Ras *> handleList;
|
||||
|
||||
private:
|
||||
void createHandle(zet_ras_error_type_t type);
|
||||
};
|
||||
|
||||
} // namespace L0
|
||||
|
||||
@@ -7,10 +7,29 @@
|
||||
|
||||
#include "level_zero/tools/source/sysman/ras/ras_imp.h"
|
||||
|
||||
#include "shared/source/helpers/debug_helpers.h"
|
||||
#include "shared/source/helpers/string.h"
|
||||
|
||||
#include <cstring>
|
||||
|
||||
namespace L0 {
|
||||
|
||||
// Returns the sum of all seven per-category counters of a RAS details record.
uint64_t getTotalErrors(zet_ras_details_t pDetails) {
    uint64_t total = pDetails.numResets;
    total += pDetails.numProgrammingErrors;
    total += pDetails.numNonComputeErrors;
    total += pDetails.numComputeErrors;
    total += pDetails.numDriverErrors;
    total += pDetails.numDisplayErrors;
    total += pDetails.numCacheErrors;
    return total;
}
|
||||
|
||||
// Reports the properties of this RAS handle: its error type, with
// onSubdevice = false and subdeviceId = 0. The values are written into
// rasProperties and then copied to *pProperties.
// NOTE(review): rasProperties is not declared in the visible part of the
// change; presumably it is a RasImp member -- confirm.
//
// Always returns ZE_RESULT_SUCCESS.
//
// The leading unconditional `return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;` was
// removed: it made everything below it unreachable.
ze_result_t RasImp::rasGetProperties(zet_ras_properties_t *pProperties) {
    rasProperties.type = this->rasErrorType;
    rasProperties.onSubdevice = false;
    rasProperties.subdeviceId = 0;
    *pProperties = rasProperties;
    return ZE_RESULT_SUCCESS;
}
|
||||
|
||||
ze_result_t RasImp::rasGetConfig(zet_ras_config_t *pConfig) {
|
||||
@@ -22,11 +41,28 @@ ze_result_t RasImp::rasSetConfig(const zet_ras_config_t *pConfig) {
|
||||
}
|
||||
|
||||
// Queries the OS backend for the current counter values.
//
// clear        - not read by this implementation.
// pTotalErrors - receives the sum of all counters; written only on success.
// pDetails     - optional; when non-null receives the per-category counters.
//
// Returns the backend status. On failure neither output is written.
//
// The leading unconditional `return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;` was
// removed: it made the query below unreachable.
ze_result_t RasImp::rasGetState(ze_bool_t clear, uint64_t *pTotalErrors, zet_ras_details_t *pDetails) {
    // Start from all-zero counters so categories the backend does not fill in
    // contribute nothing to the total.
    zet_ras_details_t details = {};
    const ze_result_t result = pOsRas->getCounterValues(&details);
    if (result != ZE_RESULT_SUCCESS) {
        return result;
    }
    *pTotalErrors = getTotalErrors(details);
    if (pDetails != nullptr) {
        *pDetails = details;
    }
    return result;
}
|
||||
|
||||
RasImp::RasImp(OsSysman *pOsSysman) {
|
||||
void RasImp::init() {
|
||||
pOsRas->setRasErrorType(this->rasErrorType);
|
||||
isRasErrorSupported = pOsRas->isRasSupported();
|
||||
}
|
||||
|
||||
// Creates the OS backend, records the requested error type and runs init().
// The type is stored before init() because init() forwards it to the backend.
RasImp::RasImp(OsSysman *pOsSysman, zet_ras_error_type_t type)
    : pOsRas(OsRas::create(pOsSysman)) {
    rasErrorType = type;
    init();
}
|
||||
|
||||
RasImp::~RasImp() {
|
||||
|
||||
@@ -22,7 +22,7 @@ class RasImp : public NEO::NonCopyableClass, public Ras {
|
||||
ze_result_t rasGetState(ze_bool_t clear, uint64_t *pTotalErrors, zet_ras_details_t *pDetails) override;
|
||||
|
||||
RasImp() = default;
|
||||
RasImp(OsSysman *pOsSysman);
|
||||
RasImp(OsSysman *pOsSysman, zet_ras_error_type_t type);
|
||||
~RasImp() override;
|
||||
|
||||
OsRas *pOsRas = nullptr;
|
||||
|
||||
@@ -11,12 +11,20 @@ namespace L0 {
|
||||
|
||||
class WddmRasImp : public OsRas {
|
||||
ze_result_t getCounterValues(zet_ras_details_t *pDetails) override;
|
||||
bool isRasSupported(void) override;
|
||||
void setRasErrorType(zet_ras_error_type_t type) override;
|
||||
};
|
||||
|
||||
// Always reports the feature as unsupported; pDetails is never written.
ze_result_t WddmRasImp::getCounterValues(zet_ras_details_t *pDetails) {
    const ze_result_t status = ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    return status;
}
|
||||
|
||||
// Always answers false, whatever error type was selected.
bool WddmRasImp::isRasSupported(void) {
    const bool supported = false;
    return supported;
}
|
||||
|
||||
// Does nothing: the error type is ignored by this backend.
void WddmRasImp::setRasErrorType(zet_ras_error_type_t /*type*/) {
}
|
||||
|
||||
OsRas *OsRas::create(OsSysman *pOsSysman) {
|
||||
WddmRasImp *pWddmRasImp = new WddmRasImp();
|
||||
return static_cast<OsRas *>(pWddmRasImp);
|
||||
|
||||
Reference in New Issue
Block a user