mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-06 02:18:05 +08:00
feature(sysman): Add ras get state exp implementation
Related-To: NEO-8839 Signed-off-by: Aravind Gopalakrishnan <aravind.gopalakrishnan@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
c3d3a4db1f
commit
ec3d4d0956
@@ -16,6 +16,8 @@
|
||||
|
||||
#include "drm/intel_hwconfig_types.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
namespace L0 {
|
||||
namespace Sysman {
|
||||
|
||||
@@ -86,6 +88,40 @@ ze_result_t LinuxRasImp::osRasGetState(zes_ras_state_t &state, ze_bool_t clear)
|
||||
return result;
|
||||
}
|
||||
|
||||
// Reports per-category RAS error counters gathered from all registered RAS sources.
//
// pCount: in/out. When *pCount is 0, it is overwritten with the total number of
//         categories across all sources and the function returns ZE_RESULT_SUCCESS
//         without touching pState.
// pState: output array; entries are filled source after source, starting at index 0.
//         At most min(*pCount, total category count) entries are requested in total.
//
// Returns ZE_RESULT_SUCCESS if at least one source reported successfully, otherwise
// ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE. A failing source is skipped and the next
// source writes at the same output position.
ze_result_t LinuxRasImp::osRasGetStateExp(uint32_t *pCount, zes_ras_state_exp_t *pState) {
    // Keep the category count of each source individually. Storing the running total
    // here would let later sources be asked for more categories than they own and
    // would advance the output offset by the wrong amount.
    std::vector<uint32_t> numCategoriesBySources = {};
    numCategoriesBySources.reserve(rasSources.size());
    uint32_t totalCategoryCount = 0;
    for (auto &rasSource : rasSources) {
        const uint32_t sourceCategoryCount = rasSource->osRasGetCategoryCount();
        numCategoriesBySources.push_back(sourceCategoryCount);
        totalCategoryCount += sourceCategoryCount;
    }

    if (*pCount == 0) {
        *pCount = totalCategoryCount;
        return ZE_RESULT_SUCCESS;
    }

    ze_result_t result = ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE;
    uint32_t remainingCategories = std::min(totalCategoryCount, *pCount);
    uint32_t numCategoriesAssigned = 0u;
    for (uint32_t rasSourceIdx = 0u; rasSourceIdx < rasSources.size(); rasSourceIdx++) {
        auto &rasSource = rasSources[rasSourceIdx];
        // Never ask a source for more than it can provide or than the caller has room for.
        const uint32_t numCategoriesRequested = std::min(remainingCategories, numCategoriesBySources[rasSourceIdx]);
        ze_result_t localResult = rasSource->osRasGetStateExp(numCategoriesRequested, &pState[numCategoriesAssigned]);
        if (localResult != ZE_RESULT_SUCCESS) {
            continue;
        }
        remainingCategories -= numCategoriesRequested;
        // Advance by what was actually requested from this source so the next source
        // writes directly behind it.
        numCategoriesAssigned += numCategoriesRequested;
        result = localResult;
        if (remainingCategories == 0u) {
            break;
        }
    }
    return result;
}
|
||||
|
||||
void LinuxRasImp::initSources() {
|
||||
rasSources.push_back(std::make_unique<L0::Sysman::LinuxRasSourceGt>(pLinuxSysmanImp, osRasErrorType, isSubdevice, subdeviceId));
|
||||
if (isMemoryTypeHbm(pLinuxSysmanImp) == true) {
|
||||
|
||||
@@ -29,6 +29,8 @@ class SysFsAccessInterface;
|
||||
class LinuxRasSources : NEO::NonCopyableOrMovableClass {
|
||||
public:
|
||||
virtual ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) = 0;
|
||||
virtual ze_result_t osRasGetStateExp(uint32_t numCategoriesRequested, zes_ras_state_exp_t *pState) = 0;
|
||||
virtual uint32_t osRasGetCategoryCount() = 0;
|
||||
virtual ~LinuxRasSources() = default;
|
||||
};
|
||||
|
||||
@@ -36,6 +38,7 @@ class LinuxRasImp : public OsRas, NEO::NonCopyableOrMovableClass {
|
||||
public:
|
||||
ze_result_t osRasGetProperties(zes_ras_properties_t &properties) override;
|
||||
ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
|
||||
ze_result_t osRasGetStateExp(uint32_t *pCount, zes_ras_state_exp_t *pState) override;
|
||||
ze_result_t osRasGetConfig(zes_ras_config_t *config) override;
|
||||
ze_result_t osRasSetConfig(const zes_ras_config_t *config) override;
|
||||
LinuxRasImp(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId);
|
||||
@@ -59,7 +62,9 @@ class LinuxRasImp : public OsRas, NEO::NonCopyableOrMovableClass {
|
||||
class LinuxRasSourceGt : public LinuxRasSources {
|
||||
public:
|
||||
ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
|
||||
ze_result_t osRasGetStateExp(uint32_t numCategoriesRequested, zes_ras_state_exp_t *pState) override;
|
||||
static void getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_bool_t isSubDevice, uint32_t subDeviceId);
|
||||
uint32_t osRasGetCategoryCount() override;
|
||||
LinuxRasSourceGt(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId);
|
||||
LinuxRasSourceGt() = default;
|
||||
~LinuxRasSourceGt() override;
|
||||
@@ -95,12 +100,15 @@ class LinuxRasSourceGt : public LinuxRasSources {
|
||||
class LinuxRasSourceHbm : public LinuxRasSources {
|
||||
public:
|
||||
ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
|
||||
ze_result_t osRasGetStateExp(uint32_t numCategoriesRequested, zes_ras_state_exp_t *pState) override;
|
||||
static void getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_bool_t isSubDevice, uint32_t subDeviceId);
|
||||
uint32_t osRasGetCategoryCount() override;
|
||||
LinuxRasSourceHbm(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, uint32_t subdeviceId);
|
||||
LinuxRasSourceHbm() = default;
|
||||
~LinuxRasSourceHbm() override{};
|
||||
|
||||
protected:
|
||||
ze_result_t getMemoryErrorCountFromFw(zes_ras_error_type_t rasErrorType, uint32_t subDeviceCount, uint64_t &errorCount);
|
||||
LinuxSysmanImp *pLinuxSysmanImp = nullptr;
|
||||
zes_ras_error_type_t osRasErrorType = {};
|
||||
FirmwareUtil *pFwInterface = nullptr;
|
||||
@@ -109,6 +117,7 @@ class LinuxRasSourceHbm : public LinuxRasSources {
|
||||
private:
|
||||
uint64_t errorBaseline = 0;
|
||||
uint32_t subdeviceId = 0;
|
||||
uint32_t subDeviceCount = 0;
|
||||
};
|
||||
|
||||
} // namespace Sysman
|
||||
|
||||
@@ -49,6 +49,14 @@ static const std::map<zes_ras_error_cat_t, std::vector<std::string>> categoryToL
|
||||
{ZES_RAS_ERROR_CAT_COMPUTE_ERRORS,
|
||||
{"correctable-eu-grf", "correctable-eu-ic", "correctable-guc", "correctable-sampler", "correctable-slm", "correctable-subslice"}}};
|
||||
|
||||
static std::map<zes_ras_error_cat_t, zes_ras_error_category_exp_t> categoryStandardToExpMap = {
|
||||
{ZES_RAS_ERROR_CAT_RESET, ZES_RAS_ERROR_CATEGORY_EXP_RESET},
|
||||
{ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS, ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS},
|
||||
{ZES_RAS_ERROR_CAT_DRIVER_ERRORS, ZES_RAS_ERROR_CATEGORY_EXP_DRIVER_ERRORS},
|
||||
{ZES_RAS_ERROR_CAT_COMPUTE_ERRORS, ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS},
|
||||
{ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS, ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS},
|
||||
{ZES_RAS_ERROR_CAT_CACHE_ERRORS, ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS}};
|
||||
|
||||
static void closeFd(int64_t &fd) {
|
||||
if (fd != -1) {
|
||||
close(static_cast<int>(fd));
|
||||
@@ -160,13 +168,6 @@ ze_result_t LinuxRasSourceGt::osRasGetState(zes_ras_state_t &state, ze_bool_t cl
|
||||
return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
|
||||
}
|
||||
|
||||
std::map<zes_ras_error_cat_t, std::vector<std::string>> categoryToEvent;
|
||||
if (osRasErrorType == ZES_RAS_ERROR_TYPE_CORRECTABLE) {
|
||||
categoryToEvent = categoryToListOfEventsCorrectable;
|
||||
}
|
||||
if (osRasErrorType == ZES_RAS_ERROR_TYPE_UNCORRECTABLE) {
|
||||
categoryToEvent = categoryToListOfEventsUncorrectable;
|
||||
}
|
||||
std::vector<std::uint64_t> data(2 + totalEventCount, 0); // In data[], event count starts from second index, first value gives number of events and second value is for timestamp
|
||||
if (pPmuInterface->pmuRead(static_cast<int>(groupFd), data.data(), sizeof(uint64_t) * data.size()) < 0) {
|
||||
return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
|
||||
@@ -186,6 +187,44 @@ ze_result_t LinuxRasSourceGt::osRasGetState(zes_ras_state_t &state, ze_bool_t cl
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
|
||||
// Reads the grouped PMU counters once and reports one state entry per error category.
//
// numCategoriesRequested: upper bound on the number of entries written to pState.
// pState: output array; entry i receives the i-th category of errorCategoryToEventCount.
//
// Returns ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE when no group fd is open or the PMU
// read fails, ZE_RESULT_SUCCESS otherwise.
ze_result_t LinuxRasSourceGt::osRasGetStateExp(uint32_t numCategoriesRequested, zes_ras_state_exp_t *pState) {
    initRasErrors(false);
    if (groupFd < 0) {
        return ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE;
    }

    // Buffer layout: [0] number of events, [1] timestamp, [2...] one counter per event.
    std::vector<std::uint64_t> data(2 + totalEventCount, 0);
    const auto readSizeInBytes = sizeof(uint64_t) * data.size();
    if (pPmuInterface->pmuRead(static_cast<int>(groupFd), data.data(), readSizeInBytes) < 0) {
        return ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE;
    }

    // Walk the categories in map order; each one owns a contiguous run of event counters.
    uint64_t readOffset = 2; // first event counter in the buffer
    uint32_t outputSlot = 0u;
    for (auto &categoryEntry : errorCategoryToEventCount) {
        if (outputSlot >= numCategoriesRequested) {
            break;
        }

        uint64_t categoryTotal = 0;
        uint64_t eventsConsumed = 0;
        while (eventsConsumed < categoryEntry.second) {
            categoryTotal += data[readOffset + eventsConsumed];
            ++eventsConsumed;
        }

        pState[outputSlot].category = categoryStandardToExpMap[categoryEntry.first];
        // The counters read from the PMU are added on top of the stored initial count.
        pState[outputSlot].errorCounter = categoryTotal + initialErrorCount[categoryEntry.first];

        readOffset += eventsConsumed;
        ++outputSlot;
    }

    return ZE_RESULT_SUCCESS;
}
|
||||
|
||||
uint32_t LinuxRasSourceGt::osRasGetCategoryCount() {
|
||||
if (osRasErrorType == ZES_RAS_ERROR_TYPE_UNCORRECTABLE) {
|
||||
return static_cast<uint32_t>(categoryToListOfEventsUncorrectable.size());
|
||||
}
|
||||
return static_cast<uint32_t>(categoryToListOfEventsCorrectable.size());
|
||||
}
|
||||
|
||||
ze_result_t LinuxRasSourceGt::getPmuConfig(
|
||||
const std::string &eventDirectory,
|
||||
const std::vector<std::string> &listOfEvents,
|
||||
|
||||
@@ -6,7 +6,6 @@
|
||||
*/
|
||||
|
||||
#include "shared/source/debug_settings/debug_settings_manager.h"
|
||||
#include "shared/source/helpers/gfx_core_helper.h"
|
||||
|
||||
#include "level_zero/sysman/source/api/ras/linux/sysman_os_ras_imp.h"
|
||||
#include "level_zero/sysman/source/shared/firmware_util/sysman_firmware_util.h"
|
||||
@@ -24,34 +23,59 @@ void LinuxRasSourceHbm::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t>
|
||||
}
|
||||
}
|
||||
|
||||
// Thin forwarding wrapper around the firmware interface: passes the error type, the
// sub-device count and this source's sub-device id, and returns the firmware status.
// errorCount is an output parameter handed straight to the firmware call.
ze_result_t LinuxRasSourceHbm::getMemoryErrorCountFromFw(zes_ras_error_type_t rasErrorType, uint32_t subDeviceCount, uint64_t &errorCount) {
    const ze_result_t fwStatus = pFwInterface->fwGetMemoryErrorCount(rasErrorType, subDeviceCount, subdeviceId, errorCount);
    return fwStatus;
}
|
||||
|
||||
// Reports the HBM memory error count relative to the stored baseline.
//
// state: the ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS slot receives
//        (current firmware error count - errorBaseline).
// clear: when true, the baseline is first reset to the current firmware error count.
//
// Returns ZE_RESULT_ERROR_UNSUPPORTED_FEATURE when no firmware interface is available,
// the firmware error code when a query fails, ZE_RESULT_SUCCESS otherwise.
ze_result_t LinuxRasSourceHbm::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) {
    if (pFwInterface == nullptr) {
        return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    }

    // The sub-device count comes from the member initialised in the constructor; the
    // former local copy and the direct fwGetMemoryErrorCount() calls are gone, which
    // also removes the duplicate declarations of `result`.
    if (clear == true) {
        uint64_t errorCount = 0;
        ze_result_t result = getMemoryErrorCountFromFw(osRasErrorType, this->subDeviceCount, errorCount);
        if (result != ZE_RESULT_SUCCESS) {
            NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Failed while getting fwGetMemoryErrorCount() for RasErrorType:%d, SubDeviceCount:%d, SubdeviceId:%d and returning error:0x%x \n", __FUNCTION__, osRasErrorType, subDeviceCount, subdeviceId, result);
            return result;
        }
        errorBaseline = errorCount; // during clear update the error baseline value
    }

    uint64_t errorCount = 0;
    ze_result_t result = getMemoryErrorCountFromFw(osRasErrorType, this->subDeviceCount, errorCount);
    if (result != ZE_RESULT_SUCCESS) {
        NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Failed while getting fwGetMemoryErrorCount() for RasErrorType:%d, SubDeviceCount:%d, SubdeviceId:%d and returning error:0x%x \n", __FUNCTION__, osRasErrorType, subDeviceCount, subdeviceId, result);
        return result;
    }

    state.category[ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS] = errorCount - errorBaseline;
    return ZE_RESULT_SUCCESS;
}
|
||||
|
||||
// Reports the single memory-error category of the HBM source into pState[0].
//
// numCategoriesRequested is not consulted here; exactly one entry is written on success.
// Returns ZE_RESULT_ERROR_UNSUPPORTED_FEATURE when no firmware interface is available,
// the firmware error code when the query fails, ZE_RESULT_SUCCESS otherwise.
ze_result_t LinuxRasSourceHbm::osRasGetStateExp(uint32_t numCategoriesRequested, zes_ras_state_exp_t *pState) {
    if (nullptr == pFwInterface) {
        return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    }

    uint64_t fwErrorCount = 0;
    const ze_result_t fwStatus = getMemoryErrorCountFromFw(osRasErrorType, this->subDeviceCount, fwErrorCount);
    if (fwStatus == ZE_RESULT_SUCCESS) {
        zes_ras_state_exp_t &memoryState = pState[0];
        memoryState.category = ZES_RAS_ERROR_CATEGORY_EXP_MEMORY_ERRORS;
        // Counter is reported relative to the stored baseline.
        memoryState.errorCounter = fwErrorCount - errorBaseline;
    }
    return fwStatus;
}
|
||||
|
||||
// The HBM source exposes exactly one category ("MEMORY").
uint32_t LinuxRasSourceHbm::osRasGetCategoryCount() {
    constexpr uint32_t memoryCategoryCount = 1u;
    return memoryCategoryCount;
}
|
||||
|
||||
// Stores the Sysman backend pointer, error type and sub-device id, then caches the
// firmware interface, device and sub-device count obtained from the backend.
LinuxRasSourceHbm::LinuxRasSourceHbm(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, uint32_t subdeviceId)
    : pLinuxSysmanImp(pLinuxSysmanImp),
      osRasErrorType(type),
      subdeviceId(subdeviceId) {
    // Inside the body `pLinuxSysmanImp` names the constructor parameter, which holds
    // the same pointer as the member initialised above.
    this->pFwInterface = pLinuxSysmanImp->getFwUtilInterface();
    this->pDevice = pLinuxSysmanImp->getSysmanDeviceImp();
    this->subDeviceCount = pLinuxSysmanImp->getSubDeviceCount();
}
|
||||
|
||||
} // namespace Sysman
|
||||
|
||||
@@ -19,6 +19,7 @@ class OsRas {
|
||||
public:
|
||||
virtual ze_result_t osRasGetProperties(zes_ras_properties_t &properties) = 0;
|
||||
virtual ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) = 0;
|
||||
virtual ze_result_t osRasGetStateExp(uint32_t *pCount, zes_ras_state_exp_t *pState) = 0;
|
||||
virtual ze_result_t osRasGetConfig(zes_ras_config_t *config) = 0;
|
||||
virtual ze_result_t osRasSetConfig(const zes_ras_config_t *config) = 0;
|
||||
static OsRas *create(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId);
|
||||
|
||||
@@ -24,6 +24,7 @@ class Ras : _zes_ras_handle_t {
|
||||
virtual ze_result_t rasGetConfig(zes_ras_config_t *pConfig) = 0;
|
||||
virtual ze_result_t rasSetConfig(const zes_ras_config_t *pConfig) = 0;
|
||||
virtual ze_result_t rasGetState(zes_ras_state_t *pState, ze_bool_t clear) = 0;
|
||||
virtual ze_result_t rasGetStateExp(uint32_t *pCount, zes_ras_state_exp_t *pState) = 0;
|
||||
|
||||
static Ras *fromHandle(zes_ras_handle_t handle) {
|
||||
return static_cast<Ras *>(handle);
|
||||
|
||||
@@ -34,6 +34,10 @@ ze_result_t RasImp::rasGetState(zes_ras_state_t *pState, ze_bool_t clear) {
|
||||
return pOsRas->osRasGetState(*pState, clear);
|
||||
}
|
||||
|
||||
// Forwards the request unchanged to the OS-specific RAS implementation and returns
// its status.
ze_result_t RasImp::rasGetStateExp(uint32_t *pCount, zes_ras_state_exp_t *pState) {
    const ze_result_t osStatus = pOsRas->osRasGetStateExp(pCount, pState);
    return osStatus;
}
|
||||
|
||||
// Populates rasProperties from the OS-specific implementation. The returned status
// is intentionally not inspected here.
void RasImp::init() {
    static_cast<void>(pOsRas->osRasGetProperties(rasProperties));
}
|
||||
|
||||
@@ -21,6 +21,7 @@ class RasImp : public Ras, NEO::NonCopyableOrMovableClass {
|
||||
ze_result_t rasGetConfig(zes_ras_config_t *pConfig) override;
|
||||
ze_result_t rasSetConfig(const zes_ras_config_t *pConfig) override;
|
||||
ze_result_t rasGetState(zes_ras_state_t *pConfig, ze_bool_t clear) override;
|
||||
ze_result_t rasGetStateExp(uint32_t *pCount, zes_ras_state_exp_t *pState) override;
|
||||
|
||||
RasImp() = default;
|
||||
RasImp(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t isSubDevice, uint32_t subDeviceId);
|
||||
|
||||
@@ -13,6 +13,7 @@ namespace Sysman {
|
||||
class WddmRasImp : public OsRas {
|
||||
ze_result_t osRasGetProperties(zes_ras_properties_t &properties) override;
|
||||
ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
|
||||
ze_result_t osRasGetStateExp(uint32_t *pCount, zes_ras_state_exp_t *pState) override;
|
||||
ze_result_t osRasGetConfig(zes_ras_config_t *config) override;
|
||||
ze_result_t osRasSetConfig(const zes_ras_config_t *config) override;
|
||||
};
|
||||
@@ -35,6 +36,10 @@ ze_result_t WddmRasImp::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) {
|
||||
return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
|
||||
}
|
||||
|
||||
// Not implemented for the Wddm backend: neither argument is read or written and the
// call always reports the feature as unsupported.
ze_result_t WddmRasImp::osRasGetStateExp(uint32_t *pCount, zes_ras_state_exp_t *pState) {
    constexpr ze_result_t unsupportedStatus = ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
    return unsupportedStatus;
}
|
||||
|
||||
OsRas *OsRas::create(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId) {
|
||||
WddmRasImp *pWddmRasImp = new WddmRasImp();
|
||||
return static_cast<OsRas *>(pWddmRasImp);
|
||||
|
||||
Reference in New Issue
Block a user