feature: add logic to iterate over all contexts to check for GPU page faults
Iterate over every context in the process and query its reset status to detect unexpected GPU page faults. Add a new debug variable, GpuFaultCheckThreshold, that sets how many hang checks elapse between these sweeps, so the checking frequency can be tuned for performance analysis. Related-To: GSD-5673 Signed-off-by: Young Jin Yoon <young.jin.yoon@intel.com>
This commit is contained in:
parent
5111f30116
commit
82728ff394
|
@ -248,6 +248,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, EnableMultipleRegularContextForBcs, -1, "-1: def
|
|||
DECLARE_DEBUG_VARIABLE(int32_t, AppendAubStreamContextFlags, -1, "-1: default, >0: Append flags passed during HardwareContext creation.")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, ContextGroupSize, -1, "-1: default, 0-1: context group disabled, >1: number of contexts in group.")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, DisableScratchPages, -1, "-1: default, 0: do not disable scratch pages during VM creations, 1: disable scratch pages during VM creations")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, GpuFaultCheckThreshold, -1, "-1: default, 0: disable, >0: value for detecting the gpu pagefault for all contexts with scratch page disabled. When the number of hang check reaches to the threshold, gpu pagefault check will happen.")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, OptimizeIoqBarriersHandling, -1, "-1: default, 0: disable, 1: enable. If enabled, dont dispatch stalling commands for IOQ. Instead, inherit TimestampPackets from previous enqueue.")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, ExitOnSubmissionNumber, -1, "Call exit(0) on X submission. >=0: submission count (start from 0)")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, ExitOnSubmissionMode, 0, "Exit on X submission mode. 0: Any context type, 1: Compute context only, 2: Copy context only ")
|
||||
|
|
|
@ -1990,6 +1990,16 @@ bool DrmMemoryManager::checkAllocationForChunking(size_t allocSize, size_t minSi
|
|||
(((allocSize / MemoryConstants::chunkThreshold) % 2) == 0) && subDeviceEnabled && debugDisabled && modeEnabled && bufferEnabled);
|
||||
}
|
||||
|
||||
void DrmMemoryManager::checkUnexpectedGpuPageFault() {
|
||||
for (auto &engineContainer : allRegisteredEngines) {
|
||||
for (auto &engine : engineContainer) {
|
||||
CommandStreamReceiver *csr = engine.commandStreamReceiver;
|
||||
Drm &drm = getDrm(csr->getRootDeviceIndex());
|
||||
drm.checkResetStatus(*engine.osContext);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool DrmMemoryManager::createDrmChunkedAllocation(Drm *drm, DrmAllocation *allocation, uint64_t boAddress, size_t boSize, size_t maxOsContextCount) {
|
||||
auto &storageInfo = allocation->storageInfo;
|
||||
auto memoryInfo = drm->getMemoryInfo();
|
||||
|
|
|
@ -98,6 +98,8 @@ class DrmMemoryManager : public MemoryManager {
|
|||
size_t getSizeOfChunk(size_t allocSize);
|
||||
bool checkAllocationForChunking(size_t allocSize, size_t minSize, bool subDeviceEnabled, bool debugDisabled, bool modeEnabled, bool bufferEnabled);
|
||||
|
||||
MOCKABLE_VIRTUAL void checkUnexpectedGpuPageFault();
|
||||
|
||||
protected:
|
||||
void registerSharedBoHandleAllocation(DrmAllocation *drmAllocation);
|
||||
BufferObjectHandleWrapper tryToGetBoHandleWrapperWithSharedOwnership(int boHandle);
|
||||
|
|
|
@ -59,6 +59,17 @@ Drm::Drm(std::unique_ptr<HwDeviceIdDrm> &&hwDeviceIdIn, RootDeviceEnvironment &r
|
|||
hwDeviceId(std::move(hwDeviceIdIn)), rootDeviceEnvironment(rootDeviceEnvironment) {
|
||||
pagingFence.fill(0u);
|
||||
fenceVal.fill(0u);
|
||||
|
||||
if (rootDeviceEnvironment.executionEnvironment.isDebuggingEnabled()) {
|
||||
disableScratch = false;
|
||||
}
|
||||
if (debugManager.flags.DisableScratchPages.get() != -1) {
|
||||
disableScratch = debugManager.flags.DisableScratchPages.get();
|
||||
}
|
||||
auto threshold = debugManager.flags.GpuFaultCheckThreshold.get();
|
||||
if (disableScratch && threshold != -1) {
|
||||
gpuFaultCheckThreshold = threshold;
|
||||
}
|
||||
}
|
||||
|
||||
SubmissionStatus Drm::getSubmissionStatusFromReturnCode(int32_t retCode) {
|
||||
|
@ -238,6 +249,20 @@ int Drm::queryGttSize(uint64_t &gttSizeOutput) {
|
|||
}
|
||||
|
||||
// Reports whether a GPU hang was detected for osContext.
//
// The reset status of osContext is always queried first. When
// gpuFaultCheckThreshold is non-zero, this call also acts as a tick for the
// periodic page-fault sweep: after gpuFaultCheckThreshold calls have been
// counted, the next call asks the DrmMemoryManager to check the reset status
// of every registered context and restarts the counter.
//
// @param osContext context whose reset status is queried.
// @return the result of checkResetStatus(osContext); the sweep never masks it.
bool Drm::isGpuHangDetected(OsContext &osContext) {
    const bool hangDetected = checkResetStatus(osContext);

    // A zero threshold means the periodic sweep is disabled.
    if (gpuFaultCheckThreshold == 0u) {
        return hangDetected;
    }

    // '>=' rather than '==' so the sweep still fires if the counter ever
    // ends up past the threshold.
    if (gpuFaultCheckCounter >= gpuFaultCheckThreshold) {
        auto memoryManager = static_cast<DrmMemoryManager *>(this->rootDeviceEnvironment.executionEnvironment.memoryManager.get());
        // The execution environment may not own a memory manager yet; skip the
        // sweep instead of dereferencing a null pointer.
        if (memoryManager != nullptr) {
            memoryManager->checkUnexpectedGpuPageFault();
        }
        gpuFaultCheckCounter = 0u;
    } else {
        gpuFaultCheckCounter++;
    }

    // Propagate the status of the caller's own context even on the sweep call;
    // returning a hard-coded false here would hide a hang that was just detected.
    return hangDetected;
}
|
||||
|
||||
bool Drm::checkResetStatus(OsContext &osContext) {
|
||||
const auto osContextLinux = static_cast<OsContextLinux *>(&osContext);
|
||||
const auto &drmContextIds = osContextLinux->getDrmContextIds();
|
||||
|
||||
|
@ -1419,14 +1444,6 @@ int Drm::createDrmVirtualMemory(uint32_t &drmVmId) {
|
|||
ctl.extensions = castToUint64(vmControlExtRegion.get());
|
||||
}
|
||||
|
||||
bool disableScratch = false;
|
||||
if (rootDeviceEnvironment.executionEnvironment.isDebuggingEnabled()) {
|
||||
disableScratch = false;
|
||||
}
|
||||
if (debugManager.flags.DisableScratchPages.get() != -1) {
|
||||
disableScratch = debugManager.flags.DisableScratchPages.get();
|
||||
}
|
||||
|
||||
bool useVmBind = isVmBindAvailable();
|
||||
bool enablePageFault = hasPageFaultSupport() && useVmBind;
|
||||
|
||||
|
|
|
@ -130,6 +130,7 @@ class Drm : public DriverModel {
|
|||
|
||||
PhysicalDevicePciBusInfo getPciBusInfo() const override;
|
||||
bool isGpuHangDetected(OsContext &osContext) override;
|
||||
MOCKABLE_VIRTUAL bool checkResetStatus(OsContext &osContext);
|
||||
|
||||
bool areNonPersistentContextsSupported() const { return nonPersistentContextsSupported; }
|
||||
void checkNonPersistentContextsSupport();
|
||||
|
@ -342,6 +343,10 @@ class Drm : public DriverModel {
|
|||
bool pageFaultSupported = false;
|
||||
bool completionFenceSupported = false;
|
||||
bool vmBindPatIndexProgrammingSupported = false;
|
||||
bool disableScratch = false;
|
||||
|
||||
uint32_t gpuFaultCheckThreshold = 0u;
|
||||
uint32_t gpuFaultCheckCounter = 0u;
|
||||
|
||||
private:
|
||||
int getParamIoctl(DrmParam param, int *dstValue);
|
||||
|
|
|
@ -45,6 +45,7 @@ class TestedDrmMemoryManager : public MemoryManagerCreate<DrmMemoryManager> {
|
|||
using DrmMemoryManager::allocatePhysicalLocalDeviceMemory;
|
||||
using DrmMemoryManager::allocationTypeForCompletionFence;
|
||||
using DrmMemoryManager::allocUserptr;
|
||||
using DrmMemoryManager::checkUnexpectedGpuPageFault;
|
||||
using DrmMemoryManager::createAllocWithAlignment;
|
||||
using DrmMemoryManager::createAllocWithAlignmentFromUserptr;
|
||||
using DrmMemoryManager::createGraphicsAllocation;
|
||||
|
@ -72,6 +73,7 @@ class TestedDrmMemoryManager : public MemoryManagerCreate<DrmMemoryManager> {
|
|||
using DrmMemoryManager::registerSharedBoHandleAllocation;
|
||||
using DrmMemoryManager::releaseGpuRange;
|
||||
using DrmMemoryManager::retrieveMmapOffsetForBufferObject;
|
||||
using DrmMemoryManager::secondaryEngines;
|
||||
using DrmMemoryManager::selectAlignmentAndHeap;
|
||||
using DrmMemoryManager::setDomainCpu;
|
||||
using DrmMemoryManager::sharedBoHandles;
|
||||
|
|
|
@ -9,6 +9,7 @@
|
|||
#include "shared/source/helpers/hw_info.h"
|
||||
#include "shared/source/os_interface/linux/drm_memory_manager.h"
|
||||
#include "shared/source/os_interface/linux/drm_neo.h"
|
||||
#include "shared/source/os_interface/os_context.h"
|
||||
#include "shared/test/common/helpers/default_hw_info.h"
|
||||
#include "shared/test/common/mocks/linux/mock_drm_wrappers.h"
|
||||
|
||||
|
@ -18,6 +19,7 @@
|
|||
using NEO::Drm;
|
||||
using NEO::DrmIoctl;
|
||||
using NEO::HwDeviceIdDrm;
|
||||
using NEO::OsContext;
|
||||
using NEO::RootDeviceEnvironment;
|
||||
|
||||
extern const int mockFd;
|
||||
|
@ -190,6 +192,11 @@ class DrmMockCustom : public Drm {
|
|||
return 0u;
|
||||
}
|
||||
|
||||
// Counts every invocation in checkResetStatusCalled and then forwards to the
// real Drm::checkResetStatus so the production behaviour stays observable.
bool checkResetStatus(OsContext &osContext) override {
    ++checkResetStatusCalled;
    const bool baseResult = Drm::checkResetStatus(osContext);
    return baseResult;
}
|
||||
|
||||
Ioctls ioctlCnt{};
|
||||
Ioctls ioctlExpected{};
|
||||
|
||||
|
@ -203,6 +210,8 @@ class DrmMockCustom : public Drm {
|
|||
ChunkingModeCall getChunkingModeCall{};
|
||||
IsChunkingAvailableCall isChunkingAvailableCall{};
|
||||
|
||||
size_t checkResetStatusCalled = 0u;
|
||||
|
||||
std::atomic<int> ioctlRes;
|
||||
std::atomic<IoctlResExt *> ioctlResExt;
|
||||
|
||||
|
|
|
@ -456,6 +456,7 @@ AccessCountersGranularity = -1
|
|||
OverridePatIndex = -1
|
||||
UseTileMemoryBankInVirtualMemoryCreation = -1
|
||||
DisableScratchPages = -1
|
||||
GpuFaultCheckThreshold = -1
|
||||
ForceAllResourcesUncached = 0
|
||||
ForcePreParserEnabledForMiArbCheck = -1
|
||||
UseDynamicEventPacketsCount = -1
|
||||
|
|
|
@ -327,6 +327,16 @@ TEST_F(DrmMemoryManagerTest, GivenAllocatePhysicalDeviceMemoryThenSuccessReturne
|
|||
memoryManager->freeGraphicsMemory(allocation);
|
||||
}
|
||||
|
||||
// checkUnexpectedGpuPageFault must call Drm::checkResetStatus exactly once per
// engine held in allRegisteredEngines.
TEST_F(DrmMemoryManagerTest, whenCallingChekcUnexpectedGpuPagedfaultThenAllEnginesWereChecked) {
    // Arrange: count the registered engines; the test is meaningless with none.
    size_t registeredEngineCount = 0u;
    for (const auto &engineGroup : memoryManager->allRegisteredEngines) {
        registeredEngineCount += engineGroup.size();
    }
    ASSERT_NE(0u, registeredEngineCount);

    // Act
    memoryManager->checkUnexpectedGpuPageFault();

    // Assert: one reset-status query per engine.
    EXPECT_EQ(registeredEngineCount, mock->checkResetStatusCalled);
}
|
||||
|
||||
TEST_F(DrmMemoryManagerWithExplicitExpectationsTest, givenDrmMemoryManagerWhenGpuAddressReservationIsAttemptedAtIndex1ThenAddressFromGfxPartitionIsUsed) {
|
||||
auto memoryManager = std::make_unique<TestedDrmMemoryManager>(false, true, false, *executionEnvironment);
|
||||
RootDeviceIndicesContainer rootDeviceIndices;
|
||||
|
@ -7813,4 +7823,4 @@ TEST_F(DrmMemoryManagerTest, givenDebugVariableToToggleGpuVaBitsWhenAllocatingRe
|
|||
|
||||
memoryManager->freeGraphicsMemory(allocation);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -23,6 +23,7 @@
|
|||
#include "shared/test/common/helpers/test_files.h"
|
||||
#include "shared/test/common/helpers/variable_backup.h"
|
||||
#include "shared/test/common/libult/linux/drm_mock.h"
|
||||
#include "shared/test/common/mocks/linux/mock_drm_memory_manager.h"
|
||||
#include "shared/test/common/mocks/linux/mock_ioctl_helper.h"
|
||||
#include "shared/test/common/mocks/mock_execution_environment.h"
|
||||
#include "shared/test/common/mocks/mock_memory_manager.h"
|
||||
|
@ -1428,6 +1429,88 @@ TEST(DrmDeathTest, GivenResetStatsWithValidFaultWhenIsGpuHangIsCalledThenProcess
|
|||
EXPECT_THROW(drm.isGpuHangDetected(mockOsContextLinux), std::runtime_error);
|
||||
}
|
||||
|
||||
struct DrmMockCheckPageFault : public DrmMock {
|
||||
public:
|
||||
using DrmMock::DrmMock;
|
||||
using DrmMock::gpuFaultCheckThreshold;
|
||||
};
|
||||
|
||||
// Verifies that the Drm constructor latches GpuFaultCheckThreshold only when
// scratch pages are disabled and the flag is not left at its -1 default;
// otherwise gpuFaultCheckThreshold must stay at 0.
TEST(DrmTest, givenDisableScratchPagesWhenSettingGpuFaultCheckThresholdThenThesholdValueIsSet) {
    constexpr unsigned int threshold = 3u;
    // A zero threshold could not be told apart from the "not set" expectation below.
    static_assert(threshold != 0u, "threshold used by this test must be non-zero");
    DebugManagerStateRestore restore;

    auto executionEnvironment = std::make_unique<MockExecutionEnvironment>();

    // Scratch pages not disabled: the threshold flag is ignored.
    debugManager.flags.DisableScratchPages.set(false);
    debugManager.flags.GpuFaultCheckThreshold.set(threshold);
    DrmMockCheckPageFault drm1{*executionEnvironment->rootDeviceEnvironments[0]};
    EXPECT_EQ(0u, drm1.gpuFaultCheckThreshold);

    // Scratch pages disabled but the threshold flag left at default (-1).
    debugManager.flags.DisableScratchPages.set(true);
    debugManager.flags.GpuFaultCheckThreshold.set(-1);
    DrmMockCheckPageFault drm2{*executionEnvironment->rootDeviceEnvironments[0]};
    EXPECT_EQ(0u, drm2.gpuFaultCheckThreshold);

    // Scratch pages disabled and an explicit threshold: the value is latched.
    debugManager.flags.DisableScratchPages.set(true);
    debugManager.flags.GpuFaultCheckThreshold.set(threshold);
    DrmMockCheckPageFault drm3{*executionEnvironment->rootDeviceEnvironments[0]};
    EXPECT_EQ(threshold, drm3.gpuFaultCheckThreshold);
}
|
||||
|
||||
struct MockDrmMemoryManagerCheckPageFault : public MockDrmMemoryManager {
|
||||
using MockDrmMemoryManager::MockDrmMemoryManager;
|
||||
void checkUnexpectedGpuPageFault() override {
|
||||
checkUnexpectedGpuPageFaultCalled++;
|
||||
}
|
||||
size_t checkUnexpectedGpuPageFaultCalled = 0;
|
||||
};
|
||||
|
||||
// With scratch pages disabled and a threshold configured, isGpuHangDetected
// must trigger the memory manager's page-fault sweep only on the call that
// follows `threshold` counted calls, and the cycle must repeat.
TEST(DrmTest, givenDisableScratchPagesSetWhenSettingGpuFaultCheckThresholdThenFaultCheckingIsHappeningAfterThreshold) {
    constexpr unsigned int iteration = 3u;
    constexpr unsigned int threshold = 3u;
    ASSERT_NE(0u, iteration);
    ASSERT_NE(0u, threshold);

    DebugManagerStateRestore restore;
    debugManager.flags.DisableScratchPages.set(true);
    debugManager.flags.GpuFaultCheckThreshold.set(threshold);

    // Environment with a DrmMock driver model; the flags above are read by the Drm constructor.
    auto executionEnvironment = std::make_unique<MockExecutionEnvironment>();
    auto rootEnv = executionEnvironment->rootDeviceEnvironments[0].get();
    rootEnv->setHwInfoAndInitHelpers(defaultHwInfo.get());
    rootEnv->osInterface = std::make_unique<OSInterface>();
    rootEnv->osInterface->setDriverModel(std::unique_ptr<DriverModel>(new DrmMock(*rootEnv)));

    // The execution environment takes ownership of the counting memory manager.
    auto memoryManager = new MockDrmMemoryManagerCheckPageFault(GemCloseWorkerMode::gemCloseWorkerInactive, false, false, *executionEnvironment);
    executionEnvironment->memoryManager.reset(memoryManager);
    auto &drm = *executionEnvironment->rootDeviceEnvironments[0]->osInterface->getDriverModel()->as<DrmMock>();

    uint32_t contextId{0};
    EngineDescriptor engineDescriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::regular})};

    MockOsContextLinux mockOsContextLinux{drm, 0, contextId, engineDescriptor};
    mockOsContextLinux.drmContextIds.push_back(0);

    // Value-initialized reset stats for context 0.
    ResetStats resetStats{};
    resetStats.contextId = 0;
    drm.resetStatsToReturn.push_back(resetStats);

    bool isGpuHangDetected{};
    for (auto round = 0u; round < iteration; ++round) {
        memoryManager->checkUnexpectedGpuPageFaultCalled = 0u;
        // Calls 0..threshold-1 only advance the counter; call number `threshold` runs the sweep.
        for (auto call = 0u; call <= threshold; ++call) {
            EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(mockOsContextLinux));
            EXPECT_FALSE(isGpuHangDetected);
            if (call < threshold) {
                EXPECT_EQ(0u, memoryManager->checkUnexpectedGpuPageFaultCalled);
            } else {
                EXPECT_EQ(1u, memoryManager->checkUnexpectedGpuPageFaultCalled);
            }
        }
    }
}
|
||||
|
||||
TEST(DrmTest, givenSetupIoctlHelperWhenCalledTwiceThenIoctlHelperIsSetOnlyOnce) {
|
||||
auto executionEnvironment = std::make_unique<MockExecutionEnvironment>();
|
||||
DrmMock drm{*executionEnvironment->rootDeviceEnvironments[0]};
|
||||
|
|
Loading…
Reference in New Issue