feature: add logic to iterate over all contexts to check for GPU page faults

Iterate over all contexts in the process and query the reset status of each
one to detect unexpected GPU page faults.

Added a new debug variable, GpuFaultCheckThreshold, which sets how many hang
checks must occur before the all-context page fault check runs, so that the
checking frequency can be tuned for performance analysis.

Related-To: GSD-5673
Signed-off-by: Young Jin Yoon <young.jin.yoon@intel.com>
This commit is contained in:
Young Jin Yoon 2024-02-26 09:53:24 +00:00 committed by Compute-Runtime-Automation
parent 5111f30116
commit 82728ff394
10 changed files with 149 additions and 9 deletions

View File

@ -248,6 +248,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, EnableMultipleRegularContextForBcs, -1, "-1: def
DECLARE_DEBUG_VARIABLE(int32_t, AppendAubStreamContextFlags, -1, "-1: default, >0: Append flags passed during HardwareContext creation.")
DECLARE_DEBUG_VARIABLE(int32_t, ContextGroupSize, -1, "-1: default, 0-1: context group disabled, >1: number of contexts in group.")
DECLARE_DEBUG_VARIABLE(int32_t, DisableScratchPages, -1, "-1: default, 0: do not disable scratch pages during VM creations, 1: disable scratch pages during VM creations")
// Read once in the Drm constructor: the value is copied into Drm::gpuFaultCheckThreshold only when scratch pages are disabled and the flag is not -1; otherwise the threshold stays 0 and the all-context page fault check is never triggered.
DECLARE_DEBUG_VARIABLE(int32_t, GpuFaultCheckThreshold, -1, "-1: default, 0: disable, >0: value for detecting the gpu pagefault for all contexts with scratch page disabled. When the number of hang check reaches to the threshold, gpu pagefault check will happen.")
DECLARE_DEBUG_VARIABLE(int32_t, OptimizeIoqBarriersHandling, -1, "-1: default, 0: disable, 1: enable. If enabled, dont dispatch stalling commands for IOQ. Instead, inherit TimestampPackets from previous enqueue.")
DECLARE_DEBUG_VARIABLE(int32_t, ExitOnSubmissionNumber, -1, "Call exit(0) on X submission. >=0: submission count (start from 0)")
DECLARE_DEBUG_VARIABLE(int32_t, ExitOnSubmissionMode, 0, "Exit on X submission mode. 0: Any context type, 1: Compute context only, 2: Copy context only ")

View File

@ -1990,6 +1990,16 @@ bool DrmMemoryManager::checkAllocationForChunking(size_t allocSize, size_t minSi
(((allocSize / MemoryConstants::chunkThreshold) % 2) == 0) && subDeviceEnabled && debugDisabled && modeEnabled && bufferEnabled);
}
// Queries the DRM reset status of every OS context found in allRegisteredEngines.
// Each engine is checked through the Drm instance that belongs to the root device
// of its command stream receiver. The values returned by checkResetStatus() are
// not inspected here.
void DrmMemoryManager::checkUnexpectedGpuPageFault() {
    for (auto &engineGroup : allRegisteredEngines) {
        for (auto &engineControl : engineGroup) {
            const auto rootDeviceIndex = engineControl.commandStreamReceiver->getRootDeviceIndex();
            getDrm(rootDeviceIndex).checkResetStatus(*engineControl.osContext);
        }
    }
}
bool DrmMemoryManager::createDrmChunkedAllocation(Drm *drm, DrmAllocation *allocation, uint64_t boAddress, size_t boSize, size_t maxOsContextCount) {
auto &storageInfo = allocation->storageInfo;
auto memoryInfo = drm->getMemoryInfo();

View File

@ -98,6 +98,8 @@ class DrmMemoryManager : public MemoryManager {
size_t getSizeOfChunk(size_t allocSize);
bool checkAllocationForChunking(size_t allocSize, size_t minSize, bool subDeviceEnabled, bool debugDisabled, bool modeEnabled, bool bufferEnabled);
MOCKABLE_VIRTUAL void checkUnexpectedGpuPageFault();
protected:
void registerSharedBoHandleAllocation(DrmAllocation *drmAllocation);
BufferObjectHandleWrapper tryToGetBoHandleWrapperWithSharedOwnership(int boHandle);

View File

@ -59,6 +59,17 @@ Drm::Drm(std::unique_ptr<HwDeviceIdDrm> &&hwDeviceIdIn, RootDeviceEnvironment &r
hwDeviceId(std::move(hwDeviceIdIn)), rootDeviceEnvironment(rootDeviceEnvironment) {
pagingFence.fill(0u);
fenceVal.fill(0u);
if (rootDeviceEnvironment.executionEnvironment.isDebuggingEnabled()) {
disableScratch = false;
}
if (debugManager.flags.DisableScratchPages.get() != -1) {
disableScratch = debugManager.flags.DisableScratchPages.get();
}
auto threshold = debugManager.flags.GpuFaultCheckThreshold.get();
if (disableScratch && threshold != -1) {
gpuFaultCheckThreshold = threshold;
}
}
SubmissionStatus Drm::getSubmissionStatusFromReturnCode(int32_t retCode) {
@ -238,6 +249,20 @@ int Drm::queryGttSize(uint64_t &gttSizeOutput) {
}
// Reports whether a GPU hang was detected for the given OS context.
//
// The reset status of osContext is always queried through checkResetStatus(). In addition,
// when gpuFaultCheckThreshold is non-zero, a call that finds gpuFaultCheckCounter equal to
// the threshold asks the DRM memory manager to run checkUnexpectedGpuPageFault() and
// restarts the counter at zero; every other call only advances the counter.
//
// The result obtained for osContext is returned on every path, so a hang seen on a call
// that also runs the all-context check is still reported to the caller.
//
// NOTE(review): gpuFaultCheckCounter is a plain integer with no synchronization visible
// here - confirm that callers serialize access if this can run on several threads.
bool Drm::isGpuHangDetected(OsContext &osContext) {
    const bool hangDetected = checkResetStatus(osContext);
    if (gpuFaultCheckThreshold != 0) {
        if (gpuFaultCheckCounter == gpuFaultCheckThreshold) {
            // The cast is unchecked: the execution environment's memory manager is assumed
            // to be a DrmMemoryManager whenever this driver model is in use.
            auto memoryManager = static_cast<DrmMemoryManager *>(this->rootDeviceEnvironment.executionEnvironment.memoryManager.get());
            memoryManager->checkUnexpectedGpuPageFault();
            // Restart the count; the call that performs the check does not count itself.
            gpuFaultCheckCounter = 0;
        } else {
            gpuFaultCheckCounter++;
        }
    }
    return hangDetected;
}
bool Drm::checkResetStatus(OsContext &osContext) {
const auto osContextLinux = static_cast<OsContextLinux *>(&osContext);
const auto &drmContextIds = osContextLinux->getDrmContextIds();
@ -1419,14 +1444,6 @@ int Drm::createDrmVirtualMemory(uint32_t &drmVmId) {
ctl.extensions = castToUint64(vmControlExtRegion.get());
}
bool disableScratch = false;
if (rootDeviceEnvironment.executionEnvironment.isDebuggingEnabled()) {
disableScratch = false;
}
if (debugManager.flags.DisableScratchPages.get() != -1) {
disableScratch = debugManager.flags.DisableScratchPages.get();
}
bool useVmBind = isVmBindAvailable();
bool enablePageFault = hasPageFaultSupport() && useVmBind;

View File

@ -130,6 +130,7 @@ class Drm : public DriverModel {
PhysicalDevicePciBusInfo getPciBusInfo() const override;
bool isGpuHangDetected(OsContext &osContext) override;
MOCKABLE_VIRTUAL bool checkResetStatus(OsContext &osContext);
bool areNonPersistentContextsSupported() const { return nonPersistentContextsSupported; }
void checkNonPersistentContextsSupport();
@ -342,6 +343,10 @@ class Drm : public DriverModel {
bool pageFaultSupported = false;
bool completionFenceSupported = false;
bool vmBindPatIndexProgrammingSupported = false;
bool disableScratch = false;
uint32_t gpuFaultCheckThreshold = 0u;
uint32_t gpuFaultCheckCounter = 0u;
private:
int getParamIoctl(DrmParam param, int *dstValue);

View File

@ -45,6 +45,7 @@ class TestedDrmMemoryManager : public MemoryManagerCreate<DrmMemoryManager> {
using DrmMemoryManager::allocatePhysicalLocalDeviceMemory;
using DrmMemoryManager::allocationTypeForCompletionFence;
using DrmMemoryManager::allocUserptr;
using DrmMemoryManager::checkUnexpectedGpuPageFault;
using DrmMemoryManager::createAllocWithAlignment;
using DrmMemoryManager::createAllocWithAlignmentFromUserptr;
using DrmMemoryManager::createGraphicsAllocation;
@ -72,6 +73,7 @@ class TestedDrmMemoryManager : public MemoryManagerCreate<DrmMemoryManager> {
using DrmMemoryManager::registerSharedBoHandleAllocation;
using DrmMemoryManager::releaseGpuRange;
using DrmMemoryManager::retrieveMmapOffsetForBufferObject;
using DrmMemoryManager::secondaryEngines;
using DrmMemoryManager::selectAlignmentAndHeap;
using DrmMemoryManager::setDomainCpu;
using DrmMemoryManager::sharedBoHandles;

View File

@ -9,6 +9,7 @@
#include "shared/source/helpers/hw_info.h"
#include "shared/source/os_interface/linux/drm_memory_manager.h"
#include "shared/source/os_interface/linux/drm_neo.h"
#include "shared/source/os_interface/os_context.h"
#include "shared/test/common/helpers/default_hw_info.h"
#include "shared/test/common/mocks/linux/mock_drm_wrappers.h"
@ -18,6 +19,7 @@
using NEO::Drm;
using NEO::DrmIoctl;
using NEO::HwDeviceIdDrm;
using NEO::OsContext;
using NEO::RootDeviceEnvironment;
extern const int mockFd;
@ -190,6 +192,11 @@ class DrmMockCustom : public Drm {
return 0u;
}
// Records the invocation in checkResetStatusCalled, then delegates to the base
// Drm implementation and hands back its result unchanged.
bool checkResetStatus(OsContext &osContext) override {
    checkResetStatusCalled += 1u;
    const bool baseResult = Drm::checkResetStatus(osContext);
    return baseResult;
}
Ioctls ioctlCnt{};
Ioctls ioctlExpected{};
@ -203,6 +210,8 @@ class DrmMockCustom : public Drm {
ChunkingModeCall getChunkingModeCall{};
IsChunkingAvailableCall isChunkingAvailableCall{};
size_t checkResetStatusCalled = 0u;
std::atomic<int> ioctlRes;
std::atomic<IoctlResExt *> ioctlResExt;

View File

@ -456,6 +456,7 @@ AccessCountersGranularity = -1
OverridePatIndex = -1
UseTileMemoryBankInVirtualMemoryCreation = -1
DisableScratchPages = -1
GpuFaultCheckThreshold = -1
ForceAllResourcesUncached = 0
ForcePreParserEnabledForMiArbCheck = -1
UseDynamicEventPacketsCount = -1

View File

@ -327,6 +327,16 @@ TEST_F(DrmMemoryManagerTest, GivenAllocatePhysicalDeviceMemoryThenSuccessReturne
memoryManager->freeGraphicsMemory(allocation);
}
// Verifies that checkUnexpectedGpuPageFault() issues exactly one reset-status query
// per engine registered in the memory manager.
TEST_F(DrmMemoryManagerTest, whenCallingChekcUnexpectedGpuPagedfaultThenAllEnginesWereChecked) {
    memoryManager->checkUnexpectedGpuPageFault();

    // Total number of engines across all containers in allRegisteredEngines.
    size_t registeredEngineCount = 0u;
    for (const auto &engineGroup : memoryManager->allRegisteredEngines) {
        registeredEngineCount += engineGroup.size();
    }

    // Guard against a vacuous pass when nothing is registered.
    ASSERT_NE(0u, registeredEngineCount);
    EXPECT_EQ(registeredEngineCount, mock->checkResetStatusCalled);
}
TEST_F(DrmMemoryManagerWithExplicitExpectationsTest, givenDrmMemoryManagerWhenGpuAddressReservationIsAttemptedAtIndex1ThenAddressFromGfxPartitionIsUsed) {
auto memoryManager = std::make_unique<TestedDrmMemoryManager>(false, true, false, *executionEnvironment);
RootDeviceIndicesContainer rootDeviceIndices;

View File

@ -23,6 +23,7 @@
#include "shared/test/common/helpers/test_files.h"
#include "shared/test/common/helpers/variable_backup.h"
#include "shared/test/common/libult/linux/drm_mock.h"
#include "shared/test/common/mocks/linux/mock_drm_memory_manager.h"
#include "shared/test/common/mocks/linux/mock_ioctl_helper.h"
#include "shared/test/common/mocks/mock_execution_environment.h"
#include "shared/test/common/mocks/mock_memory_manager.h"
@ -1428,6 +1429,88 @@ TEST(DrmDeathTest, GivenResetStatsWithValidFaultWhenIsGpuHangIsCalledThenProcess
EXPECT_THROW(drm.isGpuHangDetected(mockOsContextLinux), std::runtime_error);
}
// DrmMock variant that inherits DrmMock's constructors and re-declares
// gpuFaultCheckThreshold with public access so tests can read it directly.
struct DrmMockCheckPageFault : DrmMock {
    using DrmMock::DrmMock;
    using DrmMock::gpuFaultCheckThreshold;
};
// Verifies how the Drm constructor derives gpuFaultCheckThreshold from the
// DisableScratchPages and GpuFaultCheckThreshold debug flags: the threshold is taken
// from the flag only when scratch pages are disabled AND the flag is not -1.
TEST(DrmTest, givenDisableScratchPagesWhenSettingGpuFaultCheckThresholdThenThesholdValueIsSet) {
    constexpr unsigned int threshold = 3u;
    // A zero threshold could not be told apart from the "not applied" value (0) expected below.
    static_assert(threshold != 0u, "threshold must be non-zero for this test to be meaningful");

    DebugManagerStateRestore restore;
    auto executionEnvironment = std::make_unique<MockExecutionEnvironment>();

    // Scratch pages not disabled: the flag value must be ignored.
    debugManager.flags.DisableScratchPages.set(false);
    debugManager.flags.GpuFaultCheckThreshold.set(threshold);
    DrmMockCheckPageFault drm1{*executionEnvironment->rootDeviceEnvironments[0]};
    EXPECT_EQ(0u, drm1.gpuFaultCheckThreshold);

    // Scratch pages disabled but flag left at its default (-1): threshold stays 0.
    debugManager.flags.DisableScratchPages.set(true);
    debugManager.flags.GpuFaultCheckThreshold.set(-1);
    DrmMockCheckPageFault drm2{*executionEnvironment->rootDeviceEnvironments[0]};
    EXPECT_EQ(0u, drm2.gpuFaultCheckThreshold);

    // Scratch pages disabled and flag set: threshold is taken from the flag.
    debugManager.flags.DisableScratchPages.set(true);
    debugManager.flags.GpuFaultCheckThreshold.set(threshold);
    DrmMockCheckPageFault drm3{*executionEnvironment->rootDeviceEnvironments[0]};
    EXPECT_EQ(threshold, drm3.gpuFaultCheckThreshold);
}
// MockDrmMemoryManager whose checkUnexpectedGpuPageFault() override does no checking;
// it only counts how many times it has been invoked.
struct MockDrmMemoryManagerCheckPageFault : MockDrmMemoryManager {
    using MockDrmMemoryManager::MockDrmMemoryManager;

    // Number of checkUnexpectedGpuPageFault() calls observed so far.
    size_t checkUnexpectedGpuPageFaultCalled = 0;

    void checkUnexpectedGpuPageFault() override {
        ++checkUnexpectedGpuPageFaultCalled;
    }
};
// Verifies the cadence of the all-context page fault check: with scratch pages disabled
// and a threshold of N, N consecutive isGpuHangDetected() calls do not trigger
// checkUnexpectedGpuPageFault(), the following call triggers it exactly once, and the
// same pattern repeats on every round.
TEST(DrmTest, givenDisableScratchPagesSetWhenSettingGpuFaultCheckThresholdThenFaultCheckingIsHappeningAfterThreshold) {
    constexpr unsigned int rounds = 3u;
    constexpr unsigned int threshold = 3u;
    ASSERT_NE(0u, rounds);
    ASSERT_NE(0u, threshold);

    // Flags must be set before DrmMock is constructed, since the constructor reads them.
    DebugManagerStateRestore restore;
    debugManager.flags.DisableScratchPages.set(true);
    debugManager.flags.GpuFaultCheckThreshold.set(threshold);

    auto executionEnvironment = std::make_unique<MockExecutionEnvironment>();
    auto *rootDeviceEnvironment = executionEnvironment->rootDeviceEnvironments[0].get();
    rootDeviceEnvironment->setHwInfoAndInitHelpers(defaultHwInfo.get());
    rootDeviceEnvironment->osInterface = std::make_unique<OSInterface>();
    rootDeviceEnvironment->osInterface->setDriverModel(std::unique_ptr<DriverModel>(new DrmMock(*rootDeviceEnvironment)));

    // The execution environment takes ownership; the raw pointer is kept to read the call counter.
    auto *countingMemoryManager = new MockDrmMemoryManagerCheckPageFault(GemCloseWorkerMode::gemCloseWorkerInactive, false, false, *executionEnvironment);
    executionEnvironment->memoryManager.reset(countingMemoryManager);

    auto &drm = *rootDeviceEnvironment->osInterface->getDriverModel()->as<DrmMock>();

    uint32_t contextId{0};
    EngineDescriptor engineDescriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::regular})};
    MockOsContextLinux mockOsContextLinux{drm, 0, contextId, engineDescriptor};
    mockOsContextLinux.drmContextIds.push_back(0);

    // Default-initialized reset stats for context 0, handed to the mock to return.
    ResetStats resetStats{};
    resetStats.contextId = 0;
    drm.resetStatsToReturn.push_back(resetStats);

    bool isGpuHangDetected{};
    for (auto round = 0u; round < rounds; ++round) {
        countingMemoryManager->checkUnexpectedGpuPageFaultCalled = 0u;

        // The first `threshold` calls of a round only advance the counter.
        for (auto call = 0u; call < threshold; ++call) {
            EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(mockOsContextLinux));
            EXPECT_FALSE(isGpuHangDetected);
            EXPECT_EQ(0u, countingMemoryManager->checkUnexpectedGpuPageFaultCalled);
        }

        // The next call finds the counter at the threshold and runs the all-context check.
        EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(mockOsContextLinux));
        EXPECT_FALSE(isGpuHangDetected);
        EXPECT_EQ(1u, countingMemoryManager->checkUnexpectedGpuPageFaultCalled);
    }
}
TEST(DrmTest, givenSetupIoctlHelperWhenCalledTwiceThenIoctlHelperIsSetOnlyOnce) {
auto executionEnvironment = std::make_unique<MockExecutionEnvironment>();
DrmMock drm{*executionEnvironment->rootDeviceEnvironments[0]};