Add counter support for RAS.

- Added dual handle support for RAS correctable and uncorrectable errors.
- Added a reset counter for RAS.
- Added OS-specific ULT for RAS.

Change-Id: Ia10115bf6720ab211f549571e810ec0d6c0801ec
This commit is contained in:
Vilvaraj, T J Vivek
2020-06-15 16:33:11 +05:30
committed by sys_ocldev
parent fa3cb35fde
commit 0c9c55cd17
16 changed files with 423 additions and 31 deletions

View File

@@ -7,7 +7,7 @@
# File list stored in L0_SRCS_TOOLS_SYSMAN_RAS; every entry is a path under the current source directory.
# NOTE(review): ras_imp.cpp is listed twice below, once under ${BRANCH_DIR_SUFFIX} and once
# directly in this directory - confirm that both entries are intended.
set(L0_SRCS_TOOLS_SYSMAN_RAS
${CMAKE_CURRENT_SOURCE_DIR}/ras.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ras.h
${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}/ras_imp.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ras_imp.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ras_imp.h
${CMAKE_CURRENT_SOURCE_DIR}/os_ras.h
)

View File

@@ -5,7 +5,8 @@
#
# File list stored in L0_SRCS_TOOLS_SYSMAN_RAS_LINUX; every entry is a path under the current source directory.
# NOTE(review): os_ras_imp.cpp is listed twice below, once under ${BRANCH_DIR_SUFFIX} and once
# directly in this directory - confirm that both entries are intended.
set(L0_SRCS_TOOLS_SYSMAN_RAS_LINUX
${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}/os_ras_imp.cpp
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp.cpp
${CMAKE_CURRENT_SOURCE_DIR}/os_ras_imp.h
)
if(UNIX)

View File

@@ -5,26 +5,39 @@
*
*/
#include "shared/source/helpers/non_copyable_or_moveable.h"
#include "level_zero/tools/source/sysman/ras/os_ras.h"
#include "level_zero/tools/source/sysman/ras/linux/os_ras_imp.h"
#include "sysman/linux/os_sysman_imp.h"
namespace L0 {
// Implementation of the OsRas interface declared directly in this source file.
// It also inherits NEO::NonCopyableClass (presumably to forbid copying - confirm).
// NOTE(review): a class with the same name is declared in linux/os_ras_imp.h, which this
// file includes; this in-file declaration looks like the pre-change copy - confirm it is gone
// from the final source.
class LinuxRasImp : public OsRas, public NEO::NonCopyableClass {
public:
// Builds the object from the OS-level sysman object.
LinuxRasImp(OsSysman *pOsSysman);
~LinuxRasImp() override = default;
// Writes RAS counter data into *pDetails and returns a ze_result_t status.
ze_result_t getCounterValues(zet_ras_details_t *pDetails) override;
private:
// File-system accessor; nullptr until assigned (the assignment is not visible in this excerpt).
FsAccess *pFsAccess = nullptr;
};
// Directory whose existence isRasSupported() checks before reporting any support.
const std::string LinuxRasImp::rasCounterDir("/var/lib/libze_intel_gpu/");
// Name of the file, inside rasCounterDir, that getCounterValues() reads the reset count from.
const std::string LinuxRasImp::resetCounter("ras_reset_count");
// Full path of the reset-count file. It is built from the two constants above, so it must stay
// defined after them in this translation unit to be initialised from already-constructed strings.
const std::string LinuxRasImp::resetCounterFile = rasCounterDir + resetCounter;
// Stores the RAS error type this object answers for; isRasSupported() reads it later.
void LinuxRasImp::setRasErrorType(zet_ras_error_type_t type) {
    this->osRasErrorType = type;
}
// Reports whether the error type set through setRasErrorType() is supported.
// Returns false when the counter directory is missing or when the type is
// ZET_RAS_ERROR_TYPE_CORRECTABLE; returns true for any other type.
bool LinuxRasImp::isRasSupported(void) {
    // Nothing can be reported without the counter directory.
    if (!pFsAccess->fileExists(rasCounterDir)) {
        return false;
    }
    // i915 support for UNCORRECTABLE errors is assumed true
    // since support for reset event is already available.
    return osRasErrorType != ZET_RAS_ERROR_TYPE_CORRECTABLE;
}
// Reads the reset count from resetCounterFile and stores it in pDetails->numResets.
//
// pDetails: destination structure; only numResets is written, every other field is left
//           untouched. It is dereferenced without a null check.
// Returns the status of the file read; on failure *pDetails is not modified.
//
// The previous body began with an unconditional
// `return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;`, which made the read below unreachable.
ze_result_t LinuxRasImp::getCounterValues(zet_ras_details_t *pDetails) {
    uint64_t counterValue = 0;
    const ze_result_t result = pFsAccess->read(resetCounterFile, counterValue);
    if (ZE_RESULT_SUCCESS != result) {
        return result;
    }
    pDetails->numResets = counterValue;
    return result;
}
LinuxRasImp::LinuxRasImp(OsSysman *pOsSysman) {

View File

@@ -0,0 +1,39 @@
/*
* Copyright (C) 2020 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#pragma once
#include "shared/source/helpers/non_copyable_or_moveable.h"
#include "level_zero/tools/source/sysman/ras/os_ras.h"
#include <string>
#include <vector>
namespace L0 {
class FsAccess;
// Linux-side implementation of the OsRas interface.
// It also inherits NEO::NonCopyableClass (presumably to forbid copying - confirm).
class LinuxRasImp : public OsRas, public NEO::NonCopyableClass {
  public:
    // Builds the object from the OS-level sysman object.
    LinuxRasImp(OsSysman *pOsSysman);
    // Leaves pFsAccess null and osRasErrorType value-initialised.
    LinuxRasImp() = default;
    ~LinuxRasImp() override = default;

    // Writes RAS counter data into *pDetails and returns a ze_result_t status.
    ze_result_t getCounterValues(zet_ras_details_t *pDetails) override;
    // Tells whether the error type set via setRasErrorType() is supported.
    bool isRasSupported(void) override;
    // Records the error type that later isRasSupported() calls refer to.
    void setRasErrorType(zet_ras_error_type_t rasErrorType) override;

  protected:
    // File-system accessor; stays nullptr when the default constructor is used.
    FsAccess *pFsAccess = nullptr;
    // Value-initialised so that isRasSupported() never reads an indeterminate value when
    // setRasErrorType() has not been called (for example after default construction).
    zet_ras_error_type_t osRasErrorType = {};

  private:
    // Path constants; they are defined out of line in the implementation file.
    static const std::string rasCounterDir;
    static const std::string resetCounter;
    static const std::string resetCounterFile;
    // NOTE(review): nothing in the code shown here reads or fills this list.
    std::vector<std::string> rasCounterDirFileList = {};
};
} // namespace L0

View File

@@ -15,6 +15,8 @@ struct OsSysman;
// Abstract OS-facing interface for RAS; each platform supplies a concrete subclass and the
// matching definition of create().
class OsRas {
public:
// Fills *pDetails with RAS counter data and returns a ze_result_t status.
virtual ze_result_t getCounterValues(zet_ras_details_t *pDetails) = 0;
// Returns whether RAS reporting is supported for the error type given to setRasErrorType().
virtual bool isRasSupported(void) = 0;
// Selects the error type that this object answers for.
virtual void setRasErrorType(zet_ras_error_type_t type) = 0;
// Factory returning a heap-allocated platform implementation.
// NOTE(review): ownership is presumably taken by the caller - confirm against the RasImp destructor.
static OsRas *create(OsSysman *pOsSysman);
virtual ~OsRas() = default;
};

View File

@@ -14,13 +14,20 @@ RasHandleContext::~RasHandleContext() {
delete pRas;
}
}
void RasHandleContext::init() {
Ras *pRas = new RasImp(pOsSysman);
handleList.push_back(pRas);
// Creates a RAS handle for the given error type and keeps it in handleList only when the
// new object reports the type as supported; otherwise the object is destroyed immediately.
void RasHandleContext::createHandle(zet_ras_error_type_t type) {
    Ras *pCandidate = new RasImp(pOsSysman, type);
    if (!pCandidate->isRasErrorSupported) {
        // Unsupported error type: do not expose a handle for it.
        delete pCandidate;
        return;
    }
    handleList.push_back(pCandidate);
}
ze_result_t RasHandleContext::rasGet(uint32_t *pCount, zet_sysman_ras_handle_t *phRas) {
void RasHandleContext::init() {
createHandle(ZET_RAS_ERROR_TYPE_UNCORRECTABLE);
createHandle(ZET_RAS_ERROR_TYPE_CORRECTABLE);
}
ze_result_t RasHandleContext::rasGet(uint32_t *pCount,
zet_sysman_ras_handle_t *phRas) {
if (nullptr == phRas) {
*pCount = static_cast<uint32_t>(handleList.size());
return ZE_RESULT_SUCCESS;

View File

@@ -29,6 +29,8 @@ class Ras : _zet_sysman_ras_handle_t {
return static_cast<Ras *>(handle);
}
inline zet_sysman_ras_handle_t toHandle() { return this; }
bool isRasErrorSupported = false;
zet_ras_error_type_t rasErrorType;
};
struct RasHandleContext {
@@ -41,6 +43,9 @@ struct RasHandleContext {
OsSysman *pOsSysman = nullptr;
std::vector<Ras *> handleList;
private:
void createHandle(zet_ras_error_type_t type);
};
} // namespace L0

View File

@@ -7,10 +7,29 @@
#include "level_zero/tools/source/sysman/ras/ras_imp.h"
#include "shared/source/helpers/debug_helpers.h"
#include "shared/source/helpers/string.h"
#include <cstring>
namespace L0 {
// Returns the sum of the seven per-category counters held in pDetails
// (resets, programming, non-compute, compute, driver, display and cache errors).
uint64_t getTotalErrors(zet_ras_details_t pDetails) {
    const zet_ras_details_t &counters = pDetails;
    return counters.numResets + counters.numProgrammingErrors + counters.numNonComputeErrors +
           counters.numComputeErrors + counters.numDriverErrors + counters.numDisplayErrors +
           counters.numCacheErrors;
}
// Fills *pProperties with this handle's RAS properties.
//
// The reported type is this handle's rasErrorType; onSubdevice is false and subdeviceId is 0.
// pProperties is dereferenced without a null check.
// Always returns ZE_RESULT_SUCCESS.
//
// The previous body began with an unconditional
// `return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;`, so the properties were never written.
ze_result_t RasImp::rasGetProperties(zet_ras_properties_t *pProperties) {
    rasProperties.type = this->rasErrorType;
    rasProperties.onSubdevice = false;
    rasProperties.subdeviceId = 0;
    *pProperties = rasProperties;
    return ZE_RESULT_SUCCESS;
}
ze_result_t RasImp::rasGetConfig(zet_ras_config_t *pConfig) {
@@ -22,11 +41,28 @@ ze_result_t RasImp::rasSetConfig(const zet_ras_config_t *pConfig) {
}
// Queries the OS layer for RAS counters and reports them to the caller.
//
// clear:        NOTE(review): not acted on in this function - no counter is reset here.
// pTotalErrors: receives the sum of all counters; dereferenced without a null check.
// pDetails:     optional; when non-null it receives the per-category counters.
// Returns the status from pOsRas->getCounterValues(); on failure neither output is written.
//
// The previous body began with an unconditional
// `return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;`, which made everything below unreachable.
ze_result_t RasImp::rasGetState(ze_bool_t clear, uint64_t *pTotalErrors, zet_ras_details_t *pDetails) {
    // Start from all-zero counters so categories the OS layer does not fill count as 0.
    zet_ras_details_t details = {};
    const ze_result_t result = pOsRas->getCounterValues(&details);
    if (result != ZE_RESULT_SUCCESS) {
        return result;
    }
    *pTotalErrors = getTotalErrors(details);
    if (pDetails != nullptr) {
        *pDetails = details;
    }
    return result;
}
RasImp::RasImp(OsSysman *pOsSysman) {
void RasImp::init() {
pOsRas->setRasErrorType(this->rasErrorType);
isRasErrorSupported = pOsRas->isRasSupported();
}
// Creates the OS-specific backend, records the error type this handle represents and runs
// init() to determine whether that type is supported.
RasImp::RasImp(OsSysman *pOsSysman, zet_ras_error_type_t type)
    : pOsRas(OsRas::create(pOsSysman)) {
    rasErrorType = type;
    init();
}
RasImp::~RasImp() {

View File

@@ -22,7 +22,7 @@ class RasImp : public NEO::NonCopyableClass, public Ras {
ze_result_t rasGetState(ze_bool_t clear, uint64_t *pTotalErrors, zet_ras_details_t *pDetails) override;
RasImp() = default;
RasImp(OsSysman *pOsSysman);
RasImp(OsSysman *pOsSysman, zet_ras_error_type_t type);
~RasImp() override;
OsRas *pOsRas = nullptr;

View File

@@ -11,12 +11,20 @@ namespace L0 {
// OsRas implementation returned by the OsRas::create() defined in this file.
// The overrides carry no access specifier, so they are private here and are reached
// through the OsRas interface.
class WddmRasImp : public OsRas {
// Always reports ZE_RESULT_ERROR_UNSUPPORTED_FEATURE.
ze_result_t getCounterValues(zet_ras_details_t *pDetails) override;
// Always reports false.
bool isRasSupported(void) override;
// Ignores its argument.
void setRasErrorType(zet_ras_error_type_t type) override;
};
// Stub: no counters are provided; pDetails is left untouched and
// ZE_RESULT_ERROR_UNSUPPORTED_FEATURE is always returned.
ze_result_t WddmRasImp::getCounterValues(zet_ras_details_t *pDetails) {
return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
// Stub: RAS is reported as unsupported regardless of the error type.
bool WddmRasImp::isRasSupported(void) {
return false;
}
// Stub: the error type is ignored because isRasSupported() always returns false here.
void WddmRasImp::setRasErrorType(zet_ras_error_type_t type) {}
OsRas *OsRas::create(OsSysman *pOsSysman) {
WddmRasImp *pWddmRasImp = new WddmRasImp();
return static_cast<OsRas *>(pWddmRasImp);