mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-20 00:24:58 +08:00
fix(sysman): Fix the Warm Device Reset
The Workaround added earlier to resize the VF Bar has been removed. Related-To: NEO-13775 Signed-off-by: Pratik Bari <pratik.bari@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
b33b4233ad
commit
568d90902d
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2023-2024 Intel Corporation
|
||||
* Copyright (C) 2023-2025 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -341,50 +341,6 @@ void LinuxSysmanImp::clearHPIE(int fd) {
|
||||
NEO::sleep(std::chrono::seconds(10)); // Sleep for 10seconds just to make sure the change is propagated.
|
||||
}
|
||||
|
||||
// Function to adjust VF BAR size i.e Modify VF BAR Control register.
|
||||
// size param is an encoded value described as follows:
|
||||
// 0 - 1 MB (2^20 bytes)
|
||||
// 1 - 2 MB (2^21 bytes)
|
||||
// 2 - 4 MB (2^22 bytes)
|
||||
// 3 - 8 MB (2^23 bytes)
|
||||
// .
|
||||
// .
|
||||
// .
|
||||
// b - 2 GB (2^31 bytes)
|
||||
// 43 - 8 EB (2^63 bytes)
|
||||
ze_result_t LinuxSysmanImp::resizeVfBar(uint8_t size) {
|
||||
std::string pciConfigNode;
|
||||
pciConfigNode = gtDevicePath + "/config";
|
||||
|
||||
auto fdConfig = NEO::FileDescriptor(pciConfigNode.c_str(), O_RDWR);
|
||||
if (fdConfig < 0) {
|
||||
NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stdout,
|
||||
"Config node open failed\n");
|
||||
return ZE_RESULT_ERROR_UNKNOWN;
|
||||
}
|
||||
std::unique_ptr<uint8_t[]> configMemory = std::make_unique<uint8_t[]>(PCI_CFG_SPACE_EXP_SIZE);
|
||||
memset(configMemory.get(), 0, PCI_CFG_SPACE_EXP_SIZE);
|
||||
if (this->preadFunction(fdConfig, configMemory.get(), PCI_CFG_SPACE_EXP_SIZE, 0) < 0) {
|
||||
NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stdout,
|
||||
"Read to get config space failed\n");
|
||||
return ZE_RESULT_ERROR_UNKNOWN;
|
||||
}
|
||||
auto reBarCapPos = L0::Sysman::LinuxPciImp::getRebarCapabilityPos(configMemory.get(), true);
|
||||
if (!reBarCapPos) {
|
||||
NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stdout,
|
||||
"VF BAR capability not found\n");
|
||||
return ZE_RESULT_ERROR_UNKNOWN;
|
||||
}
|
||||
|
||||
auto barSizePos = reBarCapPos + PCI_REBAR_CTRL + 1; // position of VF(0) BAR SIZE.
|
||||
if (this->pwriteFunction(fdConfig, &size, 0x01, barSizePos) < 0) {
|
||||
NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stdout,
|
||||
"Write to change VF bar size failed\n");
|
||||
return ZE_RESULT_ERROR_UNKNOWN;
|
||||
}
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
|
||||
// A 'warm reset' is a conventional reset that is triggered across a PCI express link.
|
||||
// A warm reset is triggered either when a link is forced into electrical idle or
|
||||
// by sending TS1 and TS2 ordered sets with the hot reset bit set.
|
||||
@@ -443,33 +399,6 @@ ze_result_t LinuxSysmanImp::osWarmReset() {
|
||||
}
|
||||
NEO::sleep(std::chrono::seconds(10)); // Sleep for 10seconds, allows the rescan to complete on all devices attached to the root port.
|
||||
|
||||
// PCIe port driver uses the BIOS allocated VF bars on bootup. A known bug exists in pcie port driver
|
||||
// and is causing VF bar allocation failure in PCIe port driver after an SBR - https://bugzilla.kernel.org/show_bug.cgi?id=216795
|
||||
|
||||
// WA to adjust VF bar size to 2GB. The default VF bar size is 8GB and for 63VFs, 504GB need to be allocated which is failing on SBR.
|
||||
// When configured VF bar size to 2GB, an allocation of 126GB is successful. This WA resizes VF0 bar to 2GB. Once pcie port driver
|
||||
// issue is resolved, this WA may not be necessary. Description for 0xb is explained at function definition - resizeVfBar.
|
||||
if (NEO::debugManager.flags.VfBarResourceAllocationWa.get()) {
|
||||
if (ZE_RESULT_SUCCESS != (result = resizeVfBar(0xb))) {
|
||||
return result;
|
||||
}
|
||||
|
||||
result = pFsAccess->write(devicePath + '/' + "remove", "1");
|
||||
if (ZE_RESULT_SUCCESS != result) {
|
||||
NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stdout,
|
||||
"Card Bus remove after resizing VF bar failed\n");
|
||||
return result;
|
||||
}
|
||||
NEO::sleep(std::chrono::seconds(10)); // Sleep for 10seconds to make sure that the config spaces of all devices are saved correctly.
|
||||
|
||||
result = pFsAccess->write(rootPortPath + '/' + "rescan", "1");
|
||||
if (ZE_RESULT_SUCCESS != result) {
|
||||
NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stdout,
|
||||
"Rescanning root port failed after resizing VF bar failed\n");
|
||||
return result;
|
||||
}
|
||||
NEO::sleep(std::chrono::seconds(10)); // Sleep for 10seconds, allows the rescan to complete on all devices attached to the root port.
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
@@ -25,22 +25,11 @@ inline static int openMockDiag(const char *pathname, int flags) {
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
// Test stand-in for a seconds-granularity sleep: it ignores the requested
// duration and returns immediately.
void mockSleepFunctionSecs(int64_t secs) {
}
|
||||
|
||||
// Mock for open() that fails unconditionally: every path and flag combination
// yields -1.
inline static int openMockDiagFail(const char *pathname, int flags) {
    constexpr int openFailed = -1;
    return openFailed;
}
|
||||
|
||||
inline static int gtPciConfigOpenFail(const char *pathname, int flags) {
|
||||
if (strcmp(pathname, mockRealPathConfig.c_str()) == 0) {
|
||||
return mockFileDescriptor;
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
ssize_t preadMockDiag(int fd, void *buf, size_t count, off_t offset) {
|
||||
uint8_t *mockBuf = static_cast<uint8_t *>(buf);
|
||||
if (fd == mockGtPciConfigFd) {
|
||||
@@ -78,24 +67,6 @@ ssize_t preadMockDiag(int fd, void *buf, size_t count, off_t offset) {
|
||||
return count;
|
||||
}
|
||||
|
||||
// Mock for pread() that reports the full requested byte count as read while
// leaving the caller's buffer untouched.
ssize_t mockGtConfigPreadInvalid(int fd, void *buf, size_t count, off_t offset) {
    const ssize_t reportedBytes = static_cast<ssize_t>(count);
    return reportedBytes;
}
|
||||
|
||||
ssize_t mockGtConfigPreadFail(int fd, void *buf, size_t count, off_t offset) {
|
||||
if (fd == mockGtPciConfigFd) {
|
||||
return -1;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
ssize_t mockGtConfigPwriteFail(int fd, const void *buf, size_t count, off_t offset) {
|
||||
if (fd == mockGtPciConfigFd) {
|
||||
return -1;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
// Mock for pwrite() that always succeeds: it reports the full requested byte
// count as written and performs no I/O.
ssize_t pwriteMockDiag(int fd, const void *buf, size_t count, off_t offset) {
    const ssize_t reportedBytes = static_cast<ssize_t>(count);
    return reportedBytes;
}
|
||||
@@ -529,66 +500,6 @@ TEST_F(ZesDiagnosticsFixture, GivenValidSysmanImpPointerWhenCallingWarmResetThen
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, pLinuxSysmanImp->osWarmReset());
|
||||
}
|
||||
|
||||
// With sysCallsOpen replaced by gtPciConfigOpenFail, osWarmReset is expected
// to return ZE_RESULT_ERROR_UNKNOWN.
TEST_F(ZesDiagnosticsFixture, GivenValidSysmanImpPointerAndVfBarIsResizedWhenCallingWarmResetAndGtPciConfigOpenFailsThenCallReturnsFailure) {
    VariableBackup<decltype(NEO::SysCalls::sysCallsOpen)> sysCallsOpenGuard(&NEO::SysCalls::sysCallsOpen, gtPciConfigOpenFail);
    pLinuxSysmanImp->pwriteFunction = pwriteMockDiag;
    pLinuxSysmanImp->preadFunction = preadMockDiag;
    pLinuxSysmanImp->gtDevicePath = "/sys/devices/pci0000:89/0000:89:02.0/0000:8a:00.0/0000:8b:01.0/0000:8c:00.0";

    const auto resetStatus = pLinuxSysmanImp->osWarmReset();
    EXPECT_EQ(ZE_RESULT_ERROR_UNKNOWN, resetStatus);
}
|
||||
|
||||
// With preadFunction set to mockGtConfigPreadInvalid (which fills nothing into
// the buffer), osWarmReset is expected to return ZE_RESULT_ERROR_UNKNOWN.
TEST_F(ZesDiagnosticsFixture, GivenValidSysmanImpPointerAndVfBarIsResizedWhenCallingWarmResetAndConfigHeaderIsInvalidThenCallReturnsFailure) {
    VariableBackup<decltype(NEO::SysCalls::sysCallsOpen)> sysCallsOpenGuard(&NEO::SysCalls::sysCallsOpen, openMockDiag);
    pLinuxSysmanImp->pwriteFunction = pwriteMockDiag;
    pLinuxSysmanImp->preadFunction = mockGtConfigPreadInvalid;
    pLinuxSysmanImp->gtDevicePath = "/sys/devices/pci0000:89/0000:89:02.0/0000:8a:00.0/0000:8b:01.0/0000:8c:00.0";

    const auto resetStatus = pLinuxSysmanImp->osWarmReset();
    EXPECT_EQ(ZE_RESULT_ERROR_UNKNOWN, resetStatus);
}
|
||||
|
||||
// With preadFunction set to mockGtConfigPreadFail, osWarmReset is expected to
// return ZE_RESULT_ERROR_UNKNOWN.
TEST_F(ZesDiagnosticsFixture, GivenValidSysmanImpPointerAndVfBarIsResizedWhenCallingWarmResetAndGtConfigPreadFailsThenCallReturnsFailure) {
    VariableBackup<decltype(NEO::SysCalls::sysCallsOpen)> sysCallsOpenGuard(&NEO::SysCalls::sysCallsOpen, openMockDiag);
    pLinuxSysmanImp->pwriteFunction = pwriteMockDiag;
    pLinuxSysmanImp->preadFunction = mockGtConfigPreadFail;
    pLinuxSysmanImp->gtDevicePath = "/sys/devices/pci0000:89/0000:89:02.0/0000:8a:00.0/0000:8b:01.0/0000:8c:00.0";

    const auto resetStatus = pLinuxSysmanImp->osWarmReset();
    EXPECT_EQ(ZE_RESULT_ERROR_UNKNOWN, resetStatus);
}
|
||||
|
||||
// With pwriteFunction set to mockGtConfigPwriteFail, osWarmReset is expected
// to return ZE_RESULT_ERROR_UNKNOWN.
TEST_F(ZesDiagnosticsFixture, GivenValidSysmanImpPointerAndVfBarIsResizedWhenCallingWarmResetAndGtConfigPwriteFailsThenCallReturnsFailure) {
    VariableBackup<decltype(NEO::SysCalls::sysCallsOpen)> sysCallsOpenGuard(&NEO::SysCalls::sysCallsOpen, openMockDiag);
    pLinuxSysmanImp->pwriteFunction = mockGtConfigPwriteFail;
    pLinuxSysmanImp->preadFunction = preadMockDiag;
    pLinuxSysmanImp->gtDevicePath = "/sys/devices/pci0000:89/0000:89:02.0/0000:8a:00.0/0000:8b:01.0/0000:8c:00.0";

    const auto resetStatus = pLinuxSysmanImp->osWarmReset();
    EXPECT_EQ(ZE_RESULT_ERROR_UNKNOWN, resetStatus);
}
|
||||
|
||||
// The mock filesystem accessor is armed with ZE_RESULT_ERROR_NOT_AVAILABLE and
// checkErrorAfterCount = 2; osWarmReset is expected to return that error.
// NOTE(review): the counter presumably selects which write fails (the card bus
// remove, going by the test name) - confirm against the mock implementation.
TEST_F(ZesDiagnosticsFixture, GivenValidSysmanImpPointerAndVfBarIsResizedWhenCallingWarmResetAndCardBusRemoveFailsThenCallReturnsFailure) {
    VariableBackup<decltype(NEO::SysCalls::sysCallsOpen)> sysCallsOpenGuard(&NEO::SysCalls::sysCallsOpen, openMockDiag);
    pLinuxSysmanImp->pwriteFunction = pwriteMockDiag;
    pLinuxSysmanImp->preadFunction = preadMockDiag;
    pLinuxSysmanImp->gtDevicePath = "/sys/devices/pci0000:89/0000:89:02.0/0000:8a:00.0/0000:8b:01.0/0000:8c:00.0";

    pMockFsAccess->mockWriteError = ZE_RESULT_ERROR_NOT_AVAILABLE;
    pMockFsAccess->checkErrorAfterCount = 2;

    const auto resetStatus = pLinuxSysmanImp->osWarmReset();
    EXPECT_EQ(ZE_RESULT_ERROR_NOT_AVAILABLE, resetStatus);
}
|
||||
|
||||
// The mock filesystem accessor is armed with ZE_RESULT_ERROR_NOT_AVAILABLE and
// checkErrorAfterCount = 3; osWarmReset is expected to return that error.
// NOTE(review): the counter presumably selects which write fails (the root port
// rescan, going by the test name) - confirm against the mock implementation.
TEST_F(ZesDiagnosticsFixture, GivenValidSysmanImpPointerAndVfBarIsResizedWhenCallingWarmResetAndRootPortRescanFailsThenCallReturnsFailure) {
    VariableBackup<decltype(NEO::SysCalls::sysCallsOpen)> sysCallsOpenGuard(&NEO::SysCalls::sysCallsOpen, openMockDiag);
    pLinuxSysmanImp->pwriteFunction = pwriteMockDiag;
    pLinuxSysmanImp->preadFunction = preadMockDiag;
    pLinuxSysmanImp->gtDevicePath = "/sys/devices/pci0000:89/0000:89:02.0/0000:8a:00.0/0000:8b:01.0/0000:8c:00.0";

    pMockFsAccess->mockWriteError = ZE_RESULT_ERROR_NOT_AVAILABLE;
    pMockFsAccess->checkErrorAfterCount = 3;

    const auto resetStatus = pLinuxSysmanImp->osWarmReset();
    EXPECT_EQ(ZE_RESULT_ERROR_NOT_AVAILABLE, resetStatus);
}
|
||||
|
||||
TEST_F(ZesDiagnosticsFixture, GivenValidSysmanImpPointerAndVfBarIsResizedWhenCallingWarmResetThenCallSucceeds) {
|
||||
pLinuxSysmanImp->gtDevicePath = "/sys/devices/pci0000:89/0000:89:02.0/0000:8a:00.0/0000:8b:01.0/0000:8c:00.0";
|
||||
VariableBackup<decltype(NEO::SysCalls::sysCallsOpen)> openBackup(&NEO::SysCalls::sysCallsOpen, openMockDiag);
|
||||
|
||||
@@ -845,8 +845,6 @@ TEST_F(SysmanGlobalOperationsFixture, GivenDeviceInUseWhenCallingzesDeviceResetE
|
||||
std::unique_ptr<SysmanProductHelper> pSysmanProductHelper = std::make_unique<MockSysmanProductHelper>();
|
||||
std::swap(pLinuxSysmanImp->pSysmanProductHelper, pSysmanProductHelper);
|
||||
|
||||
DebugManagerStateRestore dbgRestore;
|
||||
debugManager.flags.VfBarResourceAllocationWa.set(false);
|
||||
zes_reset_properties_t pProperties = {.stype = ZES_STRUCTURE_TYPE_RESET_PROPERTIES, .pNext = nullptr, .force = true, .resetType = ZES_RESET_TYPE_WARM};
|
||||
ze_result_t result = zesDeviceResetExt(pSysmanDevice->toHandle(), &pProperties);
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
|
||||
@@ -894,8 +892,6 @@ TEST_F(SysmanGlobalOperationsFixture, GivenDeviceInUseWhenCallingZesDeviceResetE
|
||||
|
||||
TEST_F(SysmanGlobalOperationsFixture, GivenDeviceInUseWhenCallingResetExtWithInvalidTypeThenFailureIsReturned) {
|
||||
init(true);
|
||||
DebugManagerStateRestore dbgRestore;
|
||||
debugManager.flags.VfBarResourceAllocationWa.set(false);
|
||||
zes_reset_properties_t pProperties = {.stype = ZES_STRUCTURE_TYPE_RESET_PROPERTIES, .pNext = nullptr, .force = true, .resetType = ZES_RESET_TYPE_FORCE_UINT32};
|
||||
ze_result_t result = zesDeviceResetExt(pSysmanDevice->toHandle(), &pProperties);
|
||||
EXPECT_EQ(ZE_RESULT_ERROR_INVALID_ARGUMENT, result);
|
||||
@@ -903,8 +899,6 @@ TEST_F(SysmanGlobalOperationsFixture, GivenDeviceInUseWhenCallingResetExtWithInv
|
||||
|
||||
TEST_F(SysmanGlobalOperationsFixture, GivenGettingSysfsPathFailsWhenCallingResetExtThenFailureIsReturned) {
|
||||
init(true);
|
||||
DebugManagerStateRestore dbgRestore;
|
||||
debugManager.flags.VfBarResourceAllocationWa.set(false);
|
||||
pSysfsAccess->mockDeviceUnbound = true;
|
||||
zes_reset_properties_t pProperties = {.stype = ZES_STRUCTURE_TYPE_RESET_PROPERTIES, .pNext = nullptr, .force = true, .resetType = ZES_RESET_TYPE_FORCE_UINT32};
|
||||
ze_result_t result = zesDeviceResetExt(pSysmanDevice->toHandle(), &pProperties);
|
||||
@@ -912,8 +906,6 @@ TEST_F(SysmanGlobalOperationsFixture, GivenGettingSysfsPathFailsWhenCallingReset
|
||||
}
|
||||
|
||||
// With the VfBarResourceAllocationWa debug flag set to false, a forced
// zesDeviceReset is expected to succeed.
TEST_F(SysmanGlobalOperationsFixture, GivenForceTrueWhenCallingResetThenSuccessIsReturned) {
    DebugManagerStateRestore debugFlagsRestorer;
    debugManager.flags.VfBarResourceAllocationWa.set(false);

    const bool forceReset = true;
    EXPECT_EQ(ZE_RESULT_SUCCESS, zesDeviceReset(device, forceReset));
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2023-2024 Intel Corporation
|
||||
* Copyright (C) 2023-2025 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -668,6 +668,43 @@ TEST_F(ZesPciFixture, WhenConvertingLinkSpeedFromGigaTransfersPerSecondToBytesPe
|
||||
EXPECT_EQ(0, L0::Sysman::convertPcieSpeedFromGTsToBs(0.0));
|
||||
}
|
||||
|
||||
// Fills a zero-initialized config-space image with a fixed set of bytes and
// expects getRebarCapabilityPos(buffer, true) to return a non-zero position.
TEST_F(ZesPciFixture, GivenValidConfigMemoryDataWhenCallingGetRebarCapabilityPosThenTrueValueIsReturned) {
    std::unique_ptr<PublicLinuxPciImp> pLinuxPciImp = std::make_unique<PublicLinuxPciImp>(pOsSysman);

    // Non-zero bytes of the mocked configuration space, as {offset, value}.
    struct ConfigByte {
        uint32_t offset;
        uint8_t value;
    };
    const ConfigByte configBytes[] = {
        {0x006, 0x10},
        {0x034, 0x40},
        {0x040, 0x0d},
        {0x041, 0x50},
        {0x050, 0x10},
        {0x051, 0x70},
        {0x052, 0x90},
        {0x070, 0x10},
        {0x071, 0xac},
        {0x072, 0xa0},
        {0x0ac, 0x10},
        {0x0b8, 0x11},
        {0x100, 0x0e},
        {0x102, 0x01},
        {0x103, 0x42},
        {0x420, 0x15},
        {0x422, 0x01},
        {0x423, 0x22},
        {0x425, 0xf0},
        {0x426, 0x3f},
        {0x428, 0x22},
        {0x429, 0x11},
        {0x220, 0x24},
        {0x222, 0x01},
        {0x223, 0x32},
        {0x320, 0x10},
        {0x322, 0x01},
        {0x323, 0x40},
        {0x400, 0x18},
        {0x402, 0x01},
    };

    std::vector<uint8_t> configMemory(PCI_CFG_SPACE_EXP_SIZE);
    uint8_t *mockBuf = configMemory.data();
    for (const auto &entry : configBytes) {
        mockBuf[entry.offset] = entry.value;
    }

    EXPECT_TRUE(pLinuxPciImp->getRebarCapabilityPos(mockBuf, true));
}
|
||||
|
||||
} // namespace ult
|
||||
} // namespace Sysman
|
||||
} // namespace L0
|
||||
|
||||
Reference in New Issue
Block a user