fix(sysman): Fix the Warm Device Reset

The workaround added earlier to resize the VF BAR after a warm reset has been removed.

Related-To: NEO-13775

Signed-off-by: Pratik Bari <pratik.bari@intel.com>
This commit is contained in:
Pratik Bari
2025-02-18 06:54:58 +00:00
committed by Compute-Runtime-Automation
parent b33b4233ad
commit 568d90902d
4 changed files with 39 additions and 170 deletions

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2023-2024 Intel Corporation
* Copyright (C) 2023-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -341,50 +341,6 @@ void LinuxSysmanImp::clearHPIE(int fd) {
NEO::sleep(std::chrono::seconds(10)); // Sleep for 10seconds just to make sure the change is propagated.
}
// Resizes the VF BAR by writing the VF BAR Control register in the PCI config space
// of the GT device (gtDevicePath + "/config").
// The size param is an encoded value; the examples below follow 2^(20 + size) bytes:
//   0         - 1 MB (2^20 bytes)
//   1         - 2 MB (2^21 bytes)
//   2         - 4 MB (2^22 bytes)
//   3         - 8 MB (2^23 bytes)
//   ...
//   0xb (11)  - 2 GB (2^31 bytes)
//   0x2b (43) - 8 EB (2^63 bytes)
// Returns ZE_RESULT_ERROR_UNKNOWN when the config node cannot be opened or read, when the
// VF BAR capability is not found, or when the write fails; ZE_RESULT_SUCCESS otherwise.
ze_result_t LinuxSysmanImp::resizeVfBar(uint8_t size) {
    const std::string configNodePath = gtDevicePath + "/config";
    auto configFd = NEO::FileDescriptor(configNodePath.c_str(), O_RDWR);
    if (configFd < 0) {
        NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stdout,
                              "Config node open failed\n");
        return ZE_RESULT_ERROR_UNKNOWN;
    }

    // std::make_unique<T[]> value-initializes the array, so the buffer starts out zeroed.
    auto configSpace = std::make_unique<uint8_t[]>(PCI_CFG_SPACE_EXP_SIZE);
    const bool readFailed = this->preadFunction(configFd, configSpace.get(), PCI_CFG_SPACE_EXP_SIZE, 0) < 0;
    if (readFailed) {
        NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stdout,
                              "Read to get config space failed\n");
        return ZE_RESULT_ERROR_UNKNOWN;
    }

    auto vfRebarCapPos = L0::Sysman::LinuxPciImp::getRebarCapabilityPos(configSpace.get(), true);
    if (!vfRebarCapPos) {
        NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stdout,
                              "VF BAR capability not found\n");
        return ZE_RESULT_ERROR_UNKNOWN;
    }

    // The single byte written is the one right after the start of the control register:
    // position of VF(0) BAR SIZE.
    const bool writeFailed = this->pwriteFunction(configFd, &size, 0x01, vfRebarCapPos + PCI_REBAR_CTRL + 1) < 0;
    if (writeFailed) {
        NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stdout,
                              "Write to change VF bar size failed\n");
        return ZE_RESULT_ERROR_UNKNOWN;
    }

    return ZE_RESULT_SUCCESS;
}
// A 'warm reset' is a conventional reset that is triggered across a PCI express link.
// A warm reset is triggered either when a link is forced into electrical idle or
// by sending TS1 and TS2 ordered sets with the hot reset bit set.
@@ -443,33 +399,6 @@ ze_result_t LinuxSysmanImp::osWarmReset() {
}
NEO::sleep(std::chrono::seconds(10)); // Sleep for 10seconds, allows the rescan to complete on all devices attached to the root port.
// PCIe port driver uses the BIOS allocated VF bars on bootup. A known bug exists in pcie port driver
// and is causing VF bar allocation failure in PCIe port driver after an SBR - https://bugzilla.kernel.org/show_bug.cgi?id=216795
// WA to adjust VF bar size to 2GB. The default VF bar size is 8GB and for 63VFs, 504GB need to be allocated which is failing on SBR.
// When configured VF bar size to 2GB, an allocation of 126GB is successful. This WA resizes VF0 bar to 2GB. Once pcie port driver
// issue is resolved, this WA may not be necessary. The meaning of 0xb is explained at the function definition of resizeVfBar.
if (NEO::debugManager.flags.VfBarResourceAllocationWa.get()) {
if (ZE_RESULT_SUCCESS != (result = resizeVfBar(0xb))) {
return result;
}
result = pFsAccess->write(devicePath + '/' + "remove", "1");
if (ZE_RESULT_SUCCESS != result) {
NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stdout,
"Card Bus remove after resizing VF bar failed\n");
return result;
}
NEO::sleep(std::chrono::seconds(10)); // Sleep for 10seconds to make sure that the config spaces of all devices are saved correctly.
result = pFsAccess->write(rootPortPath + '/' + "rescan", "1");
if (ZE_RESULT_SUCCESS != result) {
NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stdout,
"Rescanning root port failed after resizing VF bar failed\n");
return result;
}
NEO::sleep(std::chrono::seconds(10)); // Sleep for 10seconds, allows the rescan to complete on all devices attached to the root port.
}
return result;
}

View File

@@ -25,22 +25,11 @@ inline static int openMockDiag(const char *pathname, int flags) {
}
return -1;
}
// Mock sleep taking a duration in seconds: ignores the argument and returns immediately.
void mockSleepFunctionSecs(int64_t secs) {
}
// Mock open() replacement that fails for every path: always returns -1.
inline static int openMockDiagFail(const char *pathname, int flags) {
    constexpr int openFailed = -1;
    return openFailed;
}
// Mock open() replacement: returns mockFileDescriptor only when pathname equals
// mockRealPathConfig; every other path fails with -1.
inline static int gtPciConfigOpenFail(const char *pathname, int flags) {
    if (strcmp(pathname, mockRealPathConfig.c_str()) != 0) {
        return -1;
    }
    return mockFileDescriptor;
}
ssize_t preadMockDiag(int fd, void *buf, size_t count, off_t offset) {
uint8_t *mockBuf = static_cast<uint8_t *>(buf);
if (fd == mockGtPciConfigFd) {
@@ -78,24 +67,6 @@ ssize_t preadMockDiag(int fd, void *buf, size_t count, off_t offset) {
return count;
}
// Mock pread() that reports the full requested byte count as read but never writes to buf.
ssize_t mockGtConfigPreadInvalid(int fd, void *buf, size_t count, off_t offset) {
    return static_cast<ssize_t>(count);
}
// Mock pread() that fails (-1) only for mockGtPciConfigFd; for any other descriptor it
// reports the full requested byte count without touching buf.
ssize_t mockGtConfigPreadFail(int fd, void *buf, size_t count, off_t offset) {
    if (fd != mockGtPciConfigFd) {
        return count;
    }
    return -1;
}
// Mock pwrite() that fails (-1) only for mockGtPciConfigFd; for any other descriptor it
// reports the full requested byte count as written.
ssize_t mockGtConfigPwriteFail(int fd, const void *buf, size_t count, off_t offset) {
    if (fd != mockGtPciConfigFd) {
        return count;
    }
    return -1;
}
// Mock pwrite() that always succeeds: reports the full requested byte count as written.
ssize_t pwriteMockDiag(int fd, const void *buf, size_t count, off_t offset) {
    return static_cast<ssize_t>(count);
}
@@ -529,66 +500,6 @@ TEST_F(ZesDiagnosticsFixture, GivenValidSysmanImpPointerWhenCallingWarmResetThen
EXPECT_EQ(ZE_RESULT_SUCCESS, pLinuxSysmanImp->osWarmReset());
}
TEST_F(ZesDiagnosticsFixture, GivenValidSysmanImpPointerAndVfBarIsResizedWhenCallingWarmResetAndGtPciConfigOpenFailsThenCallReturnsFailure) {
    // sysCallsOpen is redirected to gtPciConfigOpenFail for the lifetime of this backup object.
    VariableBackup<decltype(NEO::SysCalls::sysCallsOpen)> sysCallsOpenBackup(&NEO::SysCalls::sysCallsOpen, gtPciConfigOpenFail);
    pLinuxSysmanImp->pwriteFunction = pwriteMockDiag;
    pLinuxSysmanImp->preadFunction = preadMockDiag;
    pLinuxSysmanImp->gtDevicePath = "/sys/devices/pci0000:89/0000:89:02.0/0000:8a:00.0/0000:8b:01.0/0000:8c:00.0";

    const auto warmResetResult = pLinuxSysmanImp->osWarmReset();
    EXPECT_EQ(ZE_RESULT_ERROR_UNKNOWN, warmResetResult);
}
TEST_F(ZesDiagnosticsFixture, GivenValidSysmanImpPointerAndVfBarIsResizedWhenCallingWarmResetAndConfigHeaderIsInvalidThenCallReturnsFailure) {
    VariableBackup<decltype(NEO::SysCalls::sysCallsOpen)> sysCallsOpenBackup(&NEO::SysCalls::sysCallsOpen, openMockDiag);
    pLinuxSysmanImp->pwriteFunction = pwriteMockDiag;
    // mockGtConfigPreadInvalid reports success without filling the buffer.
    pLinuxSysmanImp->preadFunction = mockGtConfigPreadInvalid;
    pLinuxSysmanImp->gtDevicePath = "/sys/devices/pci0000:89/0000:89:02.0/0000:8a:00.0/0000:8b:01.0/0000:8c:00.0";

    const auto warmResetResult = pLinuxSysmanImp->osWarmReset();
    EXPECT_EQ(ZE_RESULT_ERROR_UNKNOWN, warmResetResult);
}
TEST_F(ZesDiagnosticsFixture, GivenValidSysmanImpPointerAndVfBarIsResizedWhenCallingWarmResetAndGtConfigPreadFailsThenCallReturnsFailure) {
    VariableBackup<decltype(NEO::SysCalls::sysCallsOpen)> sysCallsOpenBackup(&NEO::SysCalls::sysCallsOpen, openMockDiag);
    pLinuxSysmanImp->pwriteFunction = pwriteMockDiag;
    // mockGtConfigPreadFail returns -1 for mockGtPciConfigFd.
    pLinuxSysmanImp->preadFunction = mockGtConfigPreadFail;
    pLinuxSysmanImp->gtDevicePath = "/sys/devices/pci0000:89/0000:89:02.0/0000:8a:00.0/0000:8b:01.0/0000:8c:00.0";

    const auto warmResetResult = pLinuxSysmanImp->osWarmReset();
    EXPECT_EQ(ZE_RESULT_ERROR_UNKNOWN, warmResetResult);
}
TEST_F(ZesDiagnosticsFixture, GivenValidSysmanImpPointerAndVfBarIsResizedWhenCallingWarmResetAndGtConfigPwriteFailsThenCallReturnsFailure) {
    VariableBackup<decltype(NEO::SysCalls::sysCallsOpen)> sysCallsOpenBackup(&NEO::SysCalls::sysCallsOpen, openMockDiag);
    // mockGtConfigPwriteFail returns -1 for mockGtPciConfigFd.
    pLinuxSysmanImp->pwriteFunction = mockGtConfigPwriteFail;
    pLinuxSysmanImp->preadFunction = preadMockDiag;
    pLinuxSysmanImp->gtDevicePath = "/sys/devices/pci0000:89/0000:89:02.0/0000:8a:00.0/0000:8b:01.0/0000:8c:00.0";

    const auto warmResetResult = pLinuxSysmanImp->osWarmReset();
    EXPECT_EQ(ZE_RESULT_ERROR_UNKNOWN, warmResetResult);
}
TEST_F(ZesDiagnosticsFixture, GivenValidSysmanImpPointerAndVfBarIsResizedWhenCallingWarmResetAndCardBusRemoveFailsThenCallReturnsFailure) {
    VariableBackup<decltype(NEO::SysCalls::sysCallsOpen)> sysCallsOpenBackup(&NEO::SysCalls::sysCallsOpen, openMockDiag);
    pLinuxSysmanImp->pwriteFunction = pwriteMockDiag;
    pLinuxSysmanImp->preadFunction = preadMockDiag;
    pLinuxSysmanImp->gtDevicePath = "/sys/devices/pci0000:89/0000:89:02.0/0000:8a:00.0/0000:8b:01.0/0000:8c:00.0";

    // NOTE(review): presumably the mock fs access starts returning mockWriteError once
    // checkErrorAfterCount writes have gone through - confirm against the mock.
    pMockFsAccess->mockWriteError = ZE_RESULT_ERROR_NOT_AVAILABLE;
    pMockFsAccess->checkErrorAfterCount = 2;

    const auto warmResetResult = pLinuxSysmanImp->osWarmReset();
    EXPECT_EQ(ZE_RESULT_ERROR_NOT_AVAILABLE, warmResetResult);
}
TEST_F(ZesDiagnosticsFixture, GivenValidSysmanImpPointerAndVfBarIsResizedWhenCallingWarmResetAndRootPortRescanFailsThenCallReturnsFailure) {
    VariableBackup<decltype(NEO::SysCalls::sysCallsOpen)> sysCallsOpenBackup(&NEO::SysCalls::sysCallsOpen, openMockDiag);
    pLinuxSysmanImp->pwriteFunction = pwriteMockDiag;
    pLinuxSysmanImp->preadFunction = preadMockDiag;
    pLinuxSysmanImp->gtDevicePath = "/sys/devices/pci0000:89/0000:89:02.0/0000:8a:00.0/0000:8b:01.0/0000:8c:00.0";

    // NOTE(review): presumably the mock fs access starts returning mockWriteError once
    // checkErrorAfterCount writes have gone through - confirm against the mock.
    pMockFsAccess->mockWriteError = ZE_RESULT_ERROR_NOT_AVAILABLE;
    pMockFsAccess->checkErrorAfterCount = 3;

    const auto warmResetResult = pLinuxSysmanImp->osWarmReset();
    EXPECT_EQ(ZE_RESULT_ERROR_NOT_AVAILABLE, warmResetResult);
}
TEST_F(ZesDiagnosticsFixture, GivenValidSysmanImpPointerAndVfBarIsResizedWhenCallingWarmResetThenCallSucceeds) {
pLinuxSysmanImp->gtDevicePath = "/sys/devices/pci0000:89/0000:89:02.0/0000:8a:00.0/0000:8b:01.0/0000:8c:00.0";
VariableBackup<decltype(NEO::SysCalls::sysCallsOpen)> openBackup(&NEO::SysCalls::sysCallsOpen, openMockDiag);

View File

@@ -845,8 +845,6 @@ TEST_F(SysmanGlobalOperationsFixture, GivenDeviceInUseWhenCallingzesDeviceResetE
std::unique_ptr<SysmanProductHelper> pSysmanProductHelper = std::make_unique<MockSysmanProductHelper>();
std::swap(pLinuxSysmanImp->pSysmanProductHelper, pSysmanProductHelper);
DebugManagerStateRestore dbgRestore;
debugManager.flags.VfBarResourceAllocationWa.set(false);
zes_reset_properties_t pProperties = {.stype = ZES_STRUCTURE_TYPE_RESET_PROPERTIES, .pNext = nullptr, .force = true, .resetType = ZES_RESET_TYPE_WARM};
ze_result_t result = zesDeviceResetExt(pSysmanDevice->toHandle(), &pProperties);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
@@ -894,8 +892,6 @@ TEST_F(SysmanGlobalOperationsFixture, GivenDeviceInUseWhenCallingZesDeviceResetE
TEST_F(SysmanGlobalOperationsFixture, GivenDeviceInUseWhenCallingResetExtWithInvalidTypeThenFailureIsReturned) {
init(true);
DebugManagerStateRestore dbgRestore;
debugManager.flags.VfBarResourceAllocationWa.set(false);
zes_reset_properties_t pProperties = {.stype = ZES_STRUCTURE_TYPE_RESET_PROPERTIES, .pNext = nullptr, .force = true, .resetType = ZES_RESET_TYPE_FORCE_UINT32};
ze_result_t result = zesDeviceResetExt(pSysmanDevice->toHandle(), &pProperties);
EXPECT_EQ(ZE_RESULT_ERROR_INVALID_ARGUMENT, result);
@@ -903,8 +899,6 @@ TEST_F(SysmanGlobalOperationsFixture, GivenDeviceInUseWhenCallingResetExtWithInv
TEST_F(SysmanGlobalOperationsFixture, GivenGettingSysfsPathFailsWhenCallingResetExtThenFailureIsReturned) {
init(true);
DebugManagerStateRestore dbgRestore;
debugManager.flags.VfBarResourceAllocationWa.set(false);
pSysfsAccess->mockDeviceUnbound = true;
zes_reset_properties_t pProperties = {.stype = ZES_STRUCTURE_TYPE_RESET_PROPERTIES, .pNext = nullptr, .force = true, .resetType = ZES_RESET_TYPE_FORCE_UINT32};
ze_result_t result = zesDeviceResetExt(pSysmanDevice->toHandle(), &pProperties);
@@ -912,8 +906,6 @@ TEST_F(SysmanGlobalOperationsFixture, GivenGettingSysfsPathFailsWhenCallingReset
}
// Calls zesDeviceReset on the fixture device with force = true and expects ZE_RESULT_SUCCESS.
TEST_F(SysmanGlobalOperationsFixture, GivenForceTrueWhenCallingResetThenSuccessIsReturned) {
// NOTE(review): presumably restores the debug flags when it goes out of scope - confirm.
DebugManagerStateRestore dbgRestore;
// Turn the VF BAR resize workaround flag off for this reset.
debugManager.flags.VfBarResourceAllocationWa.set(false);
ze_result_t result = zesDeviceReset(device, true);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
}

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2023-2024 Intel Corporation
* Copyright (C) 2023-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -668,6 +668,43 @@ TEST_F(ZesPciFixture, WhenConvertingLinkSpeedFromGigaTransfersPerSecondToBytesPe
EXPECT_EQ(0, L0::Sysman::convertPcieSpeedFromGTsToBs(0.0));
}
TEST_F(ZesPciFixture, GivenValidConfigMemoryDataWhenCallingGetRebarCapabilityPosThenTrueValueIsReturned) {
    // Sparse image of the mocked PCI config space: every byte not listed here stays zero.
    // NOTE(review): the bytes from 0x100 onwards look like a chain of extended capability
    // headers (0x100 -> 0x420 -> 0x220 -> 0x320 -> 0x400) - confirm IDs against the PCIe spec.
    struct ConfigByte {
        uint32_t offset;
        uint8_t value;
    };
    const ConfigByte populatedBytes[] = {
        {0x006, 0x10},
        {0x034, 0x40},
        {0x040, 0x0d},
        {0x041, 0x50},
        {0x050, 0x10},
        {0x051, 0x70},
        {0x052, 0x90},
        {0x070, 0x10},
        {0x071, 0xac},
        {0x072, 0xa0},
        {0x0ac, 0x10},
        {0x0b8, 0x11},
        {0x100, 0x0e},
        {0x102, 0x01},
        {0x103, 0x42},
        {0x420, 0x15},
        {0x422, 0x01},
        {0x423, 0x22},
        {0x425, 0xf0},
        {0x426, 0x3f},
        {0x428, 0x22},
        {0x429, 0x11},
        {0x220, 0x24},
        {0x222, 0x01},
        {0x223, 0x32},
        {0x320, 0x10},
        {0x322, 0x01},
        {0x323, 0x40},
        {0x400, 0x18},
        {0x402, 0x01},
    };

    auto pLinuxPciImp = std::make_unique<PublicLinuxPciImp>(pOsSysman);
    std::vector<uint8_t> configMemory(PCI_CFG_SPACE_EXP_SIZE);
    for (const auto &entry : populatedBytes) {
        configMemory[entry.offset] = entry.value;
    }

    // The second argument (true) is the same flag resizeVfBar passes when looking up the VF BAR capability.
    EXPECT_TRUE(pLinuxPciImp->getRebarCapabilityPos(configMemory.data(), true));
}
} // namespace ult
} // namespace Sysman
} // namespace L0