mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-03 06:49:52 +08:00
feature: add logic to iterate over all contexts to check for GPU page faults
Iterate over all contexts in the process and query the reset status of each one to detect unexpected GPU page faults. Add a new debug variable, GpuFaultCheckThreshold, that changes how frequently this check runs relative to hang checks, for performance analysis. Related-To: GSD-5673 Signed-off-by: Young Jin Yoon <young.jin.yoon@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
5111f30116
commit
82728ff394
@@ -45,6 +45,7 @@ class TestedDrmMemoryManager : public MemoryManagerCreate<DrmMemoryManager> {
|
||||
using DrmMemoryManager::allocatePhysicalLocalDeviceMemory;
|
||||
using DrmMemoryManager::allocationTypeForCompletionFence;
|
||||
using DrmMemoryManager::allocUserptr;
|
||||
using DrmMemoryManager::checkUnexpectedGpuPageFault;
|
||||
using DrmMemoryManager::createAllocWithAlignment;
|
||||
using DrmMemoryManager::createAllocWithAlignmentFromUserptr;
|
||||
using DrmMemoryManager::createGraphicsAllocation;
|
||||
@@ -72,6 +73,7 @@ class TestedDrmMemoryManager : public MemoryManagerCreate<DrmMemoryManager> {
|
||||
using DrmMemoryManager::registerSharedBoHandleAllocation;
|
||||
using DrmMemoryManager::releaseGpuRange;
|
||||
using DrmMemoryManager::retrieveMmapOffsetForBufferObject;
|
||||
using DrmMemoryManager::secondaryEngines;
|
||||
using DrmMemoryManager::selectAlignmentAndHeap;
|
||||
using DrmMemoryManager::setDomainCpu;
|
||||
using DrmMemoryManager::sharedBoHandles;
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
#include "shared/source/helpers/hw_info.h"
|
||||
#include "shared/source/os_interface/linux/drm_memory_manager.h"
|
||||
#include "shared/source/os_interface/linux/drm_neo.h"
|
||||
#include "shared/source/os_interface/os_context.h"
|
||||
#include "shared/test/common/helpers/default_hw_info.h"
|
||||
#include "shared/test/common/mocks/linux/mock_drm_wrappers.h"
|
||||
|
||||
@@ -18,6 +19,7 @@
|
||||
using NEO::Drm;
|
||||
using NEO::DrmIoctl;
|
||||
using NEO::HwDeviceIdDrm;
|
||||
using NEO::OsContext;
|
||||
using NEO::RootDeviceEnvironment;
|
||||
|
||||
extern const int mockFd;
|
||||
@@ -190,6 +192,11 @@ class DrmMockCustom : public Drm {
|
||||
return 0u;
|
||||
}
|
||||
|
||||
// Records each invocation in checkResetStatusCalled so tests can verify how
// many times the reset status was queried, then forwards to the base Drm
// implementation and returns its result unchanged.
bool checkResetStatus(OsContext &osContext) override {
    ++checkResetStatusCalled;
    return Drm::checkResetStatus(osContext);
}
|
||||
|
||||
Ioctls ioctlCnt{};
|
||||
Ioctls ioctlExpected{};
|
||||
|
||||
@@ -203,6 +210,8 @@ class DrmMockCustom : public Drm {
|
||||
ChunkingModeCall getChunkingModeCall{};
|
||||
IsChunkingAvailableCall isChunkingAvailableCall{};
|
||||
|
||||
size_t checkResetStatusCalled = 0u;
|
||||
|
||||
std::atomic<int> ioctlRes;
|
||||
std::atomic<IoctlResExt *> ioctlResExt;
|
||||
|
||||
|
||||
@@ -456,6 +456,7 @@ AccessCountersGranularity = -1
|
||||
OverridePatIndex = -1
|
||||
UseTileMemoryBankInVirtualMemoryCreation = -1
|
||||
DisableScratchPages = -1
|
||||
GpuFaultCheckThreshold = -1
|
||||
ForceAllResourcesUncached = 0
|
||||
ForcePreParserEnabledForMiArbCheck = -1
|
||||
UseDynamicEventPacketsCount = -1
|
||||
|
||||
@@ -327,6 +327,16 @@ TEST_F(DrmMemoryManagerTest, GivenAllocatePhysicalDeviceMemoryThenSuccessReturne
|
||||
memoryManager->freeGraphicsMemory(allocation);
|
||||
}
|
||||
|
||||
// Expects checkUnexpectedGpuPageFault() to query the reset status exactly once
// per engine registered in the memory manager.
// NOTE(review): the test name contains typos ("Chekc", "Pagedfault"); it is
// kept as-is so existing test filters keep matching.
TEST_F(DrmMemoryManagerTest, whenCallingChekcUnexpectedGpuPagedfaultThenAllEnginesWereChecked) {
    memoryManager->checkUnexpectedGpuPageFault();

    // Sum the engine counts of every container in allRegisteredEngines.
    size_t registeredEngineCount = 0u;
    for (auto &engines : memoryManager->allRegisteredEngines) {
        registeredEngineCount += engines.size();
    }

    // Guard against a vacuous pass when no engine is registered at all.
    ASSERT_NE(0u, registeredEngineCount);
    EXPECT_EQ(registeredEngineCount, mock->checkResetStatusCalled);
}
|
||||
|
||||
TEST_F(DrmMemoryManagerWithExplicitExpectationsTest, givenDrmMemoryManagerWhenGpuAddressReservationIsAttemptedAtIndex1ThenAddressFromGfxPartitionIsUsed) {
|
||||
auto memoryManager = std::make_unique<TestedDrmMemoryManager>(false, true, false, *executionEnvironment);
|
||||
RootDeviceIndicesContainer rootDeviceIndices;
|
||||
@@ -7813,4 +7823,4 @@ TEST_F(DrmMemoryManagerTest, givenDebugVariableToToggleGpuVaBitsWhenAllocatingRe
|
||||
|
||||
memoryManager->freeGraphicsMemory(allocation);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -23,6 +23,7 @@
|
||||
#include "shared/test/common/helpers/test_files.h"
|
||||
#include "shared/test/common/helpers/variable_backup.h"
|
||||
#include "shared/test/common/libult/linux/drm_mock.h"
|
||||
#include "shared/test/common/mocks/linux/mock_drm_memory_manager.h"
|
||||
#include "shared/test/common/mocks/linux/mock_ioctl_helper.h"
|
||||
#include "shared/test/common/mocks/mock_execution_environment.h"
|
||||
#include "shared/test/common/mocks/mock_memory_manager.h"
|
||||
@@ -1428,6 +1429,88 @@ TEST(DrmDeathTest, GivenResetStatsWithValidFaultWhenIsGpuHangIsCalledThenProcess
|
||||
EXPECT_THROW(drm.isGpuHangDetected(mockOsContextLinux), std::runtime_error);
|
||||
}
|
||||
|
||||
struct DrmMockCheckPageFault : public DrmMock {
|
||||
public:
|
||||
using DrmMock::DrmMock;
|
||||
using DrmMock::gpuFaultCheckThreshold;
|
||||
};
|
||||
|
||||
// Checks which combinations of the DisableScratchPages and
// GpuFaultCheckThreshold debug flags end up in a newly constructed Drm's
// gpuFaultCheckThreshold:
//   - scratch pages enabled,  threshold set     -> expected 0
//   - scratch pages disabled, threshold default -> expected 0
//   - scratch pages disabled, threshold set     -> expected the configured value
TEST(DrmTest, givenDisableScratchPagesWhenSettingGpuFaultCheckThresholdThenThesholdValueIsSet) {
    constexpr unsigned int threshold = 3u;
    // The value is a compile-time constant, so validate it at compile time:
    // it must differ from 0 for the last expectation to be meaningful.
    static_assert(threshold != 0u, "threshold must be non-zero to be distinguishable from the unset value");

    // Restores the debug flags modified below when the test finishes.
    DebugManagerStateRestore restore;

    auto executionEnvironment = std::make_unique<MockExecutionEnvironment>();

    // Scratch pages left enabled: the configured threshold is expected to be ignored.
    debugManager.flags.DisableScratchPages.set(false);
    debugManager.flags.GpuFaultCheckThreshold.set(threshold);
    DrmMockCheckPageFault drm1{*executionEnvironment->rootDeviceEnvironments[0]};
    EXPECT_EQ(0u, drm1.gpuFaultCheckThreshold);

    // Scratch pages disabled but the flag left at -1: expected to stay at 0.
    debugManager.flags.DisableScratchPages.set(true);
    debugManager.flags.GpuFaultCheckThreshold.set(-1);
    DrmMockCheckPageFault drm2{*executionEnvironment->rootDeviceEnvironments[0]};
    EXPECT_EQ(0u, drm2.gpuFaultCheckThreshold);

    // Scratch pages disabled and an explicit threshold: the value is expected to be taken over.
    debugManager.flags.DisableScratchPages.set(true);
    debugManager.flags.GpuFaultCheckThreshold.set(threshold);
    DrmMockCheckPageFault drm3{*executionEnvironment->rootDeviceEnvironments[0]};
    EXPECT_EQ(threshold, drm3.gpuFaultCheckThreshold);
}
|
||||
|
||||
struct MockDrmMemoryManagerCheckPageFault : public MockDrmMemoryManager {
|
||||
using MockDrmMemoryManager::MockDrmMemoryManager;
|
||||
void checkUnexpectedGpuPageFault() override {
|
||||
checkUnexpectedGpuPageFaultCalled++;
|
||||
}
|
||||
size_t checkUnexpectedGpuPageFaultCalled = 0;
|
||||
};
|
||||
|
||||
// With DisableScratchPages set and GpuFaultCheckThreshold configured, expects
// Drm::isGpuHangDetected() to leave the memory manager's page-fault check
// untouched for `threshold` consecutive calls and to invoke it exactly once on
// the following call. The pattern is repeated `iteration` times.
TEST(DrmTest, givenDisableScratchPagesSetWhenSettingGpuFaultCheckThresholdThenFaultCheckingIsHappeningAfterThreshold) {
    constexpr unsigned int iteration = 3u;
    constexpr unsigned int threshold = 3u;
    ASSERT_NE(0u, iteration);
    ASSERT_NE(0u, threshold);

    // Restores the debug flags modified below when the test finishes.
    DebugManagerStateRestore restore;
    debugManager.flags.DisableScratchPages.set(true);
    debugManager.flags.GpuFaultCheckThreshold.set(threshold);

    // Root device environment with a DrmMock installed as the driver model.
    auto executionEnvironment = std::make_unique<MockExecutionEnvironment>();
    auto rootDeviceEnvironment = executionEnvironment->rootDeviceEnvironments[0].get();
    rootDeviceEnvironment->setHwInfoAndInitHelpers(defaultHwInfo.get());
    rootDeviceEnvironment->osInterface = std::make_unique<OSInterface>();
    rootDeviceEnvironment->osInterface->setDriverModel(std::make_unique<DrmMock>(*rootDeviceEnvironment));

    // The execution environment takes ownership; the raw pointer is kept to read the call counter.
    auto memoryManager = new MockDrmMemoryManagerCheckPageFault(GemCloseWorkerMode::gemCloseWorkerInactive, false, false, *executionEnvironment);
    executionEnvironment->memoryManager.reset(memoryManager);
    auto &drm = *executionEnvironment->rootDeviceEnvironments[0]->osInterface->getDriverModel()->as<DrmMock>();

    uint32_t contextId{0};
    EngineDescriptor engineDescriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::regular})};

    MockOsContextLinux mockOsContextLinux{drm, 0, contextId, engineDescriptor};
    mockOsContextLinux.drmContextIds.push_back(0);

    // Default-initialized reset stats for context 0, handed to the mock to return.
    ResetStats resetStats{};
    resetStats.contextId = 0;
    drm.resetStatsToReturn.push_back(resetStats);

    // One hang query: it must not throw and must not report a hang.
    const auto queryHangAndExpectNone = [&]() {
        bool hangDetected{};
        EXPECT_NO_THROW(hangDetected = drm.isGpuHangDetected(mockOsContextLinux));
        EXPECT_FALSE(hangDetected);
    };

    for (auto round = 0u; round < iteration; ++round) {
        memoryManager->checkUnexpectedGpuPageFaultCalled = 0u;

        // The first `threshold` queries are expected not to reach the page-fault check.
        for (auto query = 0u; query < threshold; ++query) {
            queryHangAndExpectNone();
            EXPECT_EQ(0u, memoryManager->checkUnexpectedGpuPageFaultCalled);
        }

        // The next query is expected to trigger the page-fault check exactly once.
        queryHangAndExpectNone();
        EXPECT_EQ(1u, memoryManager->checkUnexpectedGpuPageFaultCalled);
    }
}
|
||||
|
||||
TEST(DrmTest, givenSetupIoctlHelperWhenCalledTwiceThenIoctlHelperIsSetOnlyOnce) {
|
||||
auto executionEnvironment = std::make_unique<MockExecutionEnvironment>();
|
||||
DrmMock drm{*executionEnvironment->rootDeviceEnvironments[0]};
|
||||
|
||||
Reference in New Issue
Block a user