diff --git a/level_zero/tools/source/sysman/firmware_util/firmware_util_imp.h b/level_zero/tools/source/sysman/firmware_util/firmware_util_imp.h index 715ae02625..7234e74046 100644 --- a/level_zero/tools/source/sysman/firmware_util/firmware_util_imp.h +++ b/level_zero/tools/source/sysman/firmware_util/firmware_util_imp.h @@ -62,6 +62,9 @@ typedef int (*pIgscIafPscUpdate)(struct igsc_device_handle *handle, typedef int (*pIgscGfspMemoryErrors)(struct igsc_device_handle *handle, struct igsc_gfsp_mem_err *tiles); +typedef int (*pIgscGfspCountTiles)(struct igsc_device_handle *handle, + uint32_t *numOfTiles); + typedef int (*pIgscIfrRunArrayScanTest)(struct igsc_device_handle *handle, uint32_t *status, uint32_t *extendedStatus, @@ -97,6 +100,7 @@ extern pIgscDeviceClose deviceClose; extern pIgscIfrGetStatusExt deviceIfrGetStatusExt; extern pIgscIafPscUpdate iafPscUpdate; extern pIgscGfspMemoryErrors gfspMemoryErrors; +extern pIgscGfspCountTiles gfspCountTiles; extern pIgscIfrRunArrayScanTest deviceIfrRunArrayScanTest; extern pIgscIfrRunMemPPRTest deviceIfrRunMemPPRTest; extern pIgscGetEccConfig getEccConfig; diff --git a/level_zero/tools/source/sysman/firmware_util/firmware_util_imp_helper.cpp b/level_zero/tools/source/sysman/firmware_util/firmware_util_imp_helper.cpp index 97c8258dd4..139efb09e3 100644 --- a/level_zero/tools/source/sysman/firmware_util/firmware_util_imp_helper.cpp +++ b/level_zero/tools/source/sysman/firmware_util/firmware_util_imp_helper.cpp @@ -16,6 +16,7 @@ namespace L0 { const std::string fwDeviceIfrGetStatusExt = "igsc_ifr_get_status_ext"; const std::string fwIafPscUpdate = "igsc_iaf_psc_update"; const std::string fwGfspMemoryErrors = "igsc_gfsp_memory_errors"; +const std::string fwGfspCountTiles = "igsc_gfsp_count_tiles"; const std::string fwDeviceIfrRunArrayScanTest = "igsc_ifr_run_array_scan_test"; const std::string fwDeviceIfrRunMemPPRTest = "igsc_ifr_run_mem_ppr_test"; const std::string fwEccConfigGet = "igsc_ecc_config_get"; @@ -24,6 +25,7 @@ const std::string fwEccConfigSet = "igsc_ecc_config_set"; pIgscIfrGetStatusExt deviceIfrGetStatusExt; pIgscIafPscUpdate iafPscUpdate; pIgscGfspMemoryErrors gfspMemoryErrors; +pIgscGfspCountTiles gfspCountTiles; pIgscIfrRunArrayScanTest deviceIfrRunArrayScanTest; pIgscIfrRunMemPPRTest deviceIfrRunMemPPRTest; pIgscGetEccConfig getEccConfig; @@ -61,18 +63,39 @@ ze_result_t FirmwareUtilImp::fwCallGetstatusExt(uint32_t &supportedTests, uint32 ze_result_t FirmwareUtilImp::fwGetMemoryErrorCount(zes_ras_error_type_t type, uint32_t subDeviceCount, uint32_t subDeviceId, uint64_t &count) { const std::lock_guard lock(this->fwLock); + uint32_t numOfTiles = 0; + int ret = -1; + gfspCountTiles = reinterpret_cast(libraryHandle->getProcAddress(fwGfspCountTiles)); + if (gfspCountTiles != nullptr) { + ret = gfspCountTiles(&fwDeviceHandle, &numOfTiles); + } + + if (ret != IGSC_SUCCESS) { + NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, + "Error@ %s(): Could not retrieve tile count from igsc\n", __FUNCTION__); + // igsc_gfsp_count_tiles returns max tile info rather than actual count, igsc behaves in such a way that + // it expects buffer (igsc_gfsp_mem_err) to be allocated for max tile count and not actual tile count. + // This is fallback path when igsc_gfsp_count_tiles fails, where buffer for actual tile count is used to + // get memory error count. + numOfTiles = (subDeviceCount == 0) ? 1 : subDeviceCount; + } + gfspMemoryErrors = reinterpret_cast(libraryHandle->getProcAddress(fwGfspMemoryErrors)); if (gfspMemoryErrors != nullptr) { - uint32_t numOfTiles = (subDeviceCount == 0) ? 1 : subDeviceCount; - auto size = sizeof(igsc_gfsp_mem_err) + subDeviceCount * sizeof(igsc_gfsp_tile_mem_err); + auto size = sizeof(igsc_gfsp_mem_err) + numOfTiles * sizeof(igsc_gfsp_tile_mem_err); std::vector buf(size); igsc_gfsp_mem_err *tiles = reinterpret_cast(buf.data()); tiles->num_of_tiles = numOfTiles; // set the number of tiles in the structure that will be passed as a buffer - int ret = gfspMemoryErrors(&fwDeviceHandle, tiles); + ret = gfspMemoryErrors(&fwDeviceHandle, tiles); if (ret != IGSC_SUCCESS) { + NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, + "Error@ %s(): Could not retrieve memory errors from igsc (error:0x%x) \n", __FUNCTION__, ret); return ZE_RESULT_ERROR_UNINITIALIZED; } - if (tiles->num_of_tiles != subDeviceCount) { + + if (tiles->num_of_tiles < subDeviceCount) { + NEO::printDebugString(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, + "Error@ %s(): Inappropriate tile count \n", __FUNCTION__); return ZE_RESULT_ERROR_UNKNOWN; } if (type == ZES_RAS_ERROR_TYPE_CORRECTABLE) { diff --git a/level_zero/tools/test/unit_tests/sources/sysman/firmware_util/test_fw_util_helper.cpp b/level_zero/tools/test/unit_tests/sources/sysman/firmware_util/test_fw_util_helper.cpp index cf8d18f57d..794ddd527b 100644 --- a/level_zero/tools/test/unit_tests/sources/sysman/firmware_util/test_fw_util_helper.cpp +++ b/level_zero/tools/test/unit_tests/sources/sysman/firmware_util/test_fw_util_helper.cpp @@ -23,6 +23,7 @@ extern pIgscSetEccConfig setEccConfig; namespace ult { std::map IFRfuncMap; +constexpr static uint32_t mockMaxTileCount = 2; int mockDeviceIfrGetStatusExt(struct igsc_device_handle *handle, uint32_t *supportedTests, uint32_t *hwCapabilities, uint32_t *ifrApplied, uint32_t *prevErrors, uint32_t *pendingReset) { return 0; @@ -52,6 +53,15 @@ static inline int mockEccConfigSetFailure(struct igsc_device_handle *handle, uin return -1; } +static inline int mockCountTiles(struct igsc_device_handle *handle, uint32_t *numOfTiles) { + *numOfTiles = mockMaxTileCount; + return 0; +} + +static inline int mockMemoryErrors(struct igsc_device_handle *handle, struct igsc_gfsp_mem_err *tiles) { + return 0; +} + TEST(FwStatusExtTest, GivenIFRWasSetWhenFirmwareUtilChecksIFRThenIFRStatusIsUpdated) { if (!sysmanUltsEnable) { @@ -276,5 +286,66 @@ TEST(LinuxFwEccTest, GivenGetProcAddrCallFailsWhenFirmwareUtilChecksEccGetAndSet delete pFwUtilImp; } +TEST(FwGetMemErrorCountTest, GivenGetProcAddrCallFailsWhenMemoryErrorCountIsRequestedThenFailureIsReturned) { + + if (!sysmanUltsEnable) { + GTEST_SKIP(); + } + + L0::ult::MockFwUtilOsLibrary::getNonNullProcAddr = false; + + FirmwareUtilImp *pFwUtilImp = new FirmwareUtilImp(0, 0, 0, 0); + pFwUtilImp->libraryHandle = static_cast(new MockFwUtilOsLibrary()); + zes_ras_error_type_t errorType = ZES_RAS_ERROR_TYPE_CORRECTABLE; + uint32_t subDeviceCount = 1; + uint32_t subDeviceId = 0; + uint64_t errorCount = 0; + auto ret = pFwUtilImp->fwGetMemoryErrorCount(errorType, subDeviceCount, subDeviceId, errorCount); + EXPECT_EQ(ZE_RESULT_ERROR_UNINITIALIZED, ret); + + delete pFwUtilImp->libraryHandle; + pFwUtilImp->libraryHandle = nullptr; + delete pFwUtilImp; +} + +TEST(FwGetMemErrorCountTest, GivenValidFwUtilMethodWhenMemoryErrorCountIsRequestedThenCorrespondingCallSucceeds) { + + if (!sysmanUltsEnable) { + GTEST_SKIP(); + } + + struct IgscMemErrMockOsLibrary : public OsLibraryUtil { + public: + ~IgscMemErrMockOsLibrary() override = default; + void *getProcAddress(const std::string &procName) override { + memErrFuncMap["igsc_gfsp_count_tiles"] = reinterpret_cast(&mockCountTiles); + memErrFuncMap["igsc_gfsp_memory_errors"] = reinterpret_cast(&mockMemoryErrors); + auto it = memErrFuncMap.find(procName); + if (memErrFuncMap.end() == it) { + return nullptr; + } else { + return it->second; + } + return nullptr; + } + bool isLoaded() override { + return false; + } + std::map memErrFuncMap; + }; + FirmwareUtilImp *pFwUtilImp = new FirmwareUtilImp(0, 0, 0, 0); + pFwUtilImp->libraryHandle = static_cast(new IgscMemErrMockOsLibrary()); + zes_ras_error_type_t errorType = ZES_RAS_ERROR_TYPE_CORRECTABLE; + uint32_t subDeviceCount = 1; + uint32_t subDeviceId = 0; + uint64_t errorCount = 0; + auto ret = pFwUtilImp->fwGetMemoryErrorCount(errorType, subDeviceCount, subDeviceId, errorCount); + EXPECT_EQ(ZE_RESULT_SUCCESS, ret); + + delete pFwUtilImp->libraryHandle; + pFwUtilImp->libraryHandle = nullptr; + delete pFwUtilImp; +} + } // namespace ult } // namespace L0