Implement GPU hang detection on Windows

This change uses the value of cpuAddress from the monitored fence
to detect a GPU hang.

Related-To: NEO-5313
Signed-off-by: Patryk Wrobel <patryk.wrobel@intel.com>
This commit is contained in:
Patryk Wrobel
2022-02-04 16:03:36 +00:00
committed by Compute-Runtime-Automation
parent 61ca84e94b
commit 18cafd3a52
12 changed files with 69 additions and 86 deletions

View File

@ -38,6 +38,10 @@ struct MockDriverModel : NEO::DriverModel {
size_t getMaxMemAllocSize() const override {
return maxAllocSize;
}
// Mock override: ignores osContext and always reports that no GPU hang was detected.
bool isGpuHangDetected(NEO::OsContext &osContext) override { return false; }
};
struct MockDriverModelWDDM : NEO::DriverModel {
@ -51,6 +55,10 @@ struct MockDriverModelWDDM : NEO::DriverModel {
size_t getMaxMemAllocSize() const override {
return maxAllocSize;
}
// Mock override: ignores osContext and always reports that no GPU hang was detected.
bool isGpuHangDetected(NEO::OsContext &osContext) override { return false; }
};
struct MockDriverModelDRM : NEO::DriverModel {
@ -64,6 +72,10 @@ struct MockDriverModelDRM : NEO::DriverModel {
size_t getMaxMemAllocSize() const override {
return maxAllocSize;
}
// Mock override: ignores osContext and always reports that no GPU hang was detected.
bool isGpuHangDetected(NEO::OsContext &osContext) override { return false; }
};
struct ContextShareableMock : public L0::ContextImp {

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2020-2021 Intel Corporation
* Copyright (C) 2020-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@ -366,6 +366,10 @@ class UnknownDriverModel : public DriverModel {
PhysicalDevicePciBusInfo pciBusInfo(PhysicalDevicePciBusInfo::InvalidValue, PhysicalDevicePciBusInfo::InvalidValue, PhysicalDevicePciBusInfo::InvalidValue, PhysicalDevicePciBusInfo::InvalidValue);
return pciBusInfo;
}
// Test driver model override: ignores osContext and always reports that no GPU hang was detected.
bool isGpuHangDetected(OsContext &osContext) override { return false; }
};
using SysmanUnknownDriverModelTest = Test<DeviceFixture>;

View File

@ -1038,30 +1038,6 @@ TEST(DrmTest, GivenCompletionFenceDebugFlagWhenCreatingDrmObjectThenExpectCorrec
EXPECT_FALSE(drmDisabled.completionFenceSupport());
}
// Checks that Drm::isGpuHangDetected(contextId) throws std::runtime_error when the
// context id passed in (1) is out of range for the single engine registered below.
TEST(DrmTest, GivenInvalidContextIdWhenIsGpuHangIsCalledThenErrorIsThrown) {
ExecutionEnvironment executionEnvironment{};
executionEnvironment.prepareRootDeviceEnvironments(1);
DrmMock drm{*executionEnvironment.rootDeviceEnvironments[0]};
uint32_t contextId{0};
EngineDescriptor engineDescriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::Regular})};
// No command stream receiver is needed; only the OS context of the engine entry is populated.
CommandStreamReceiver *csr{nullptr};
MockOsContextLinux mockOsContextLinux{drm, contextId, engineDescriptor};
// Exactly one registered engine, so index 0 is the only valid context id.
EngineControlContainer engines{EngineControl{csr, &mockOsContextLinux}};
auto memoryManager = std::make_unique<MockMemoryManager>();
// Keep a raw pointer so the test can still reach the manager after ownership moves to the environment.
auto memoryManagerRaw = memoryManager.get();
memoryManagerRaw->registeredEngines = std::move(engines);
executionEnvironment.memoryManager = std::move(memoryManager);
const auto invalidContextId = 1;
EXPECT_THROW(drm.isGpuHangDetected(invalidContextId), std::runtime_error);
// The engine entry points at the stack-allocated mockOsContextLinux, so drop it before the locals die.
// NOTE(review): presumably this keeps the memory manager from touching the context during teardown - confirm.
memoryManagerRaw->registeredEngines.clear();
}
TEST(DrmTest, GivenIoctlErrorWhenIsGpuHangIsCalledThenErrorIsThrown) {
ExecutionEnvironment executionEnvironment{};
executionEnvironment.prepareRootDeviceEnvironments(1);
@ -1070,22 +1046,11 @@ TEST(DrmTest, GivenIoctlErrorWhenIsGpuHangIsCalledThenErrorIsThrown) {
uint32_t contextId{0};
EngineDescriptor engineDescriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::Regular})};
CommandStreamReceiver *csr{nullptr};
MockOsContextLinux mockOsContextLinux{drm, contextId, engineDescriptor};
EngineControlContainer engines{EngineControl{csr, &mockOsContextLinux}};
auto memoryManager = std::make_unique<MockMemoryManager>();
auto memoryManagerRaw = memoryManager.get();
memoryManagerRaw->registeredEngines = std::move(engines);
executionEnvironment.memoryManager = std::move(memoryManager);
mockOsContextLinux.drmContextIds.push_back(0);
mockOsContextLinux.drmContextIds.push_back(3);
EXPECT_THROW(drm.isGpuHangDetected(0), std::runtime_error);
memoryManagerRaw->registeredEngines.clear();
EXPECT_THROW(drm.isGpuHangDetected(mockOsContextLinux), std::runtime_error);
}
TEST(DrmTest, GivenZeroBatchActiveAndZeroBatchPendingResetStatsWhenIsGpuHangIsCalledThenNoHangIsReported) {
@ -1096,30 +1061,20 @@ TEST(DrmTest, GivenZeroBatchActiveAndZeroBatchPendingResetStatsWhenIsGpuHangIsCa
uint32_t contextId{0};
EngineDescriptor engineDescriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::Regular})};
CommandStreamReceiver *csr{nullptr};
MockOsContextLinux mockOsContextLinux{drm, contextId, engineDescriptor};
EngineControlContainer engines{EngineControl{csr, &mockOsContextLinux}};
auto memoryManager = std::make_unique<MockMemoryManager>();
auto memoryManagerRaw = memoryManager.get();
memoryManagerRaw->registeredEngines = std::move(engines);
executionEnvironment.memoryManager = std::move(memoryManager);
mockOsContextLinux.drmContextIds.push_back(0);
mockOsContextLinux.drmContextIds.push_back(3);
drm_i915_reset_stats resetStats{};
resetStats.ctx_id = 0;
mockOsContextLinux.drmContextIds.push_back(0);
drm.resetStatsToReturn.push_back(resetStats);
resetStats.ctx_id = 3;
mockOsContextLinux.drmContextIds.push_back(3);
drm.resetStatsToReturn.push_back(resetStats);
bool isGpuHangDetected{};
EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(0));
EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(mockOsContextLinux));
EXPECT_FALSE(isGpuHangDetected);
memoryManagerRaw->registeredEngines.clear();
}
TEST(DrmTest, GivenBatchActiveGreaterThanZeroResetStatsWhenIsGpuHangIsCalledThenHangIsReported) {
@ -1130,31 +1085,21 @@ TEST(DrmTest, GivenBatchActiveGreaterThanZeroResetStatsWhenIsGpuHangIsCalledThen
uint32_t contextId{0};
EngineDescriptor engineDescriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::Regular})};
CommandStreamReceiver *csr{nullptr};
MockOsContextLinux mockOsContextLinux{drm, contextId, engineDescriptor};
EngineControlContainer engines{EngineControl{csr, &mockOsContextLinux}};
auto memoryManager = std::make_unique<MockMemoryManager>();
auto memoryManagerRaw = memoryManager.get();
memoryManagerRaw->registeredEngines = std::move(engines);
executionEnvironment.memoryManager = std::move(memoryManager);
mockOsContextLinux.drmContextIds.push_back(0);
mockOsContextLinux.drmContextIds.push_back(3);
drm_i915_reset_stats resetStats{};
resetStats.ctx_id = 0;
mockOsContextLinux.drmContextIds.push_back(0);
drm.resetStatsToReturn.push_back(resetStats);
resetStats.ctx_id = 3;
resetStats.batch_active = 2;
mockOsContextLinux.drmContextIds.push_back(3);
drm.resetStatsToReturn.push_back(resetStats);
bool isGpuHangDetected{};
EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(0));
EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(mockOsContextLinux));
EXPECT_TRUE(isGpuHangDetected);
memoryManagerRaw->registeredEngines.clear();
}
TEST(DrmTest, GivenBatchPendingGreaterThanZeroResetStatsWhenIsGpuHangIsCalledThenHangIsReported) {
@ -1165,27 +1110,17 @@ TEST(DrmTest, GivenBatchPendingGreaterThanZeroResetStatsWhenIsGpuHangIsCalledThe
uint32_t contextId{0};
EngineDescriptor engineDescriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::Regular})};
CommandStreamReceiver *csr{nullptr};
MockOsContextLinux mockOsContextLinux{drm, contextId, engineDescriptor};
EngineControlContainer engines{EngineControl{csr, &mockOsContextLinux}};
auto memoryManager = std::make_unique<MockMemoryManager>();
auto memoryManagerRaw = memoryManager.get();
memoryManagerRaw->registeredEngines = std::move(engines);
executionEnvironment.memoryManager = std::move(memoryManager);
mockOsContextLinux.drmContextIds.push_back(8);
drm_i915_reset_stats resetStats{};
resetStats.ctx_id = 8;
resetStats.batch_pending = 7;
mockOsContextLinux.drmContextIds.push_back(8);
drm.resetStatsToReturn.push_back(resetStats);
bool isGpuHangDetected{};
EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(0));
EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(mockOsContextLinux));
EXPECT_TRUE(isGpuHangDetected);
memoryManagerRaw->registeredEngines.clear();
}
TEST(DrmTest, givenSetupIoctlHelperThenIoctlHelperNotNull) {

View File

@ -38,7 +38,9 @@
#include "gtest/gtest.h"
#include "mock_gmm_memory.h"
#include <cstdint>
#include <functional>
#include <limits>
#include <memory>
namespace NEO {
@ -344,6 +346,24 @@ TEST_F(Wddm20Tests, givenGraphicsAllocationWhenItIsMappedInHeap0ThenItHasGpuAddr
EXPECT_LE(gpuAddress, cannonizedHeapEnd);
}
// With no CPU mapping of the monitored fence (cpuAddress == nullptr) there is no
// value to inspect, so hang detection must answer false.
TEST_F(Wddm20WithMockGdiDllTests, GivenInvalidCpuAddressWhenCheckingForGpuHangThenFalseIsReturned) {
    auto &monitoredFence = osContext->getResidencyController().getMonitoredFence();
    monitoredFence.cpuAddress = nullptr;

    const bool hangDetected = wddm->isGpuHangDetected(*osContext);
    EXPECT_FALSE(hangDetected);
}
// A monitored-fence CPU value other than Wddm::gpuHangIndication must not be
// reported as a GPU hang.
TEST_F(Wddm20WithMockGdiDllTests, GivenCpuValueDifferentThanGpuHangIndicationWhenCheckingForGpuHangThenFalseIsReturned) {
    constexpr auto cpuValue{777u};
    // Guard the premise of the test: the chosen value really differs from the hang marker.
    ASSERT_NE(NEO::Wddm::gpuHangIndication, cpuValue);

    auto &monitoredFence = osContext->getResidencyController().getMonitoredFence();
    // Fail the test cleanly, instead of crashing on the write below, if the fixture
    // did not provide a CPU mapping for the fence.
    ASSERT_TRUE(monitoredFence.cpuAddress != nullptr);

    *monitoredFence.cpuAddress = cpuValue;
    EXPECT_FALSE(wddm->isGpuHangDetected(*osContext));
}
// When the monitored-fence CPU value equals Wddm::gpuHangIndication, hang
// detection must answer true.
TEST_F(Wddm20WithMockGdiDllTests, GivenGpuHangIndicationWhenCheckingForGpuHangThenTrueIsReturned) {
    auto &monitoredFence = osContext->getResidencyController().getMonitoredFence();
    // Fail the test cleanly, instead of crashing on the write below, if the fixture
    // did not provide a CPU mapping for the fence.
    ASSERT_TRUE(monitoredFence.cpuAddress != nullptr);

    *monitoredFence.cpuAddress = NEO::Wddm::gpuHangIndication;
    EXPECT_TRUE(wddm->isGpuHangDetected(*osContext));
}
TEST_F(Wddm20WithMockGdiDllTests, GivenThreeOsHandlesWhenAskedForDestroyAllocationsThenAllMarkedAllocationsAreDestroyed) {
OsHandleStorage storage;
OsHandleWin osHandle1;

View File

@ -243,7 +243,7 @@ bool CommandStreamReceiver::skipResourceCleanup() const {
}
// Asks the driver model whether a GPU hang was detected for this receiver's OS context.
// Returns false when the OS context, the OS interface or the driver model is missing;
// each pointer is checked before it is used, so nothing is dereferenced while null.
bool CommandStreamReceiver::isGpuHangDetected() const {
    return this->osContext && this->getOSInterface() && this->getOSInterface()->getDriverModel() && this->getOSInterface()->getDriverModel()->isGpuHangDetected(*osContext);
}
void CommandStreamReceiver::cleanupResources() {

View File

@ -318,11 +318,8 @@ int Drm::queryGttSize(uint64_t &gttSizeOutput) {
return ret;
}
bool Drm::isGpuHangDetected(uint32_t contextId) {
const auto &engines = this->rootDeviceEnvironment.executionEnvironment.memoryManager->getRegisteredEngines();
UNRECOVERABLE_IF(engines.size() <= contextId);
const auto osContextLinux = static_cast<OsContextLinux *>(engines[contextId].osContext);
bool Drm::isGpuHangDetected(OsContext &osContext) {
const auto osContextLinux = static_cast<OsContextLinux *>(&osContext);
const auto &drmContextIds = osContextLinux->getDrmContextIds();
for (const auto drmContextId : drmContextIds) {

View File

@ -148,7 +148,7 @@ class Drm : public DriverModel {
MOCKABLE_VIRTUAL void getPrelimVersion(std::string &prelimVersion);
PhysicalDevicePciBusInfo getPciBusInfo() const override;
bool isGpuHangDetected(uint32_t contextId) override;
bool isGpuHangDetected(OsContext &osContext) override;
bool areNonPersistentContextsSupported() const { return nonPersistentContextsSupported; }
void checkNonPersistentContextsSupport();

View File

@ -19,6 +19,7 @@
namespace NEO {
class ExecutionEnvironment;
class MemoryManager;
class OsContext;
class HwDeviceId : public NonCopyableClass {
public:
@ -85,9 +86,7 @@ class DriverModel : public NonCopyableClass {
return false;
}
// Overload keyed by a context id; this default implementation reports no GPU hang.
virtual bool isGpuHangDetected(uint32_t contextId) {
return false;
}
// Overload keyed by the OS context itself. Pure virtual: every concrete driver model
// has to provide its own GPU hang check for the given context.
virtual bool isGpuHangDetected(OsContext &osContext) = 0;
protected:
DriverModelType driverModelType;

View File

@ -918,6 +918,13 @@ bool Wddm::waitFromCpu(uint64_t lastFenceValue, const MonitoredFence &monitoredF
return status == STATUS_SUCCESS;
}
// Reports a GPU hang for the given context by inspecting the CPU-visible value of its
// monitored fence: a hang is signalled when that value equals gpuHangIndication.
// A fence without a CPU mapping (null cpuAddress) is never reported as hung.
// The context is downcast to OsContextWin without a runtime type check.
bool Wddm::isGpuHangDetected(OsContext &osContext) {
    auto &osContextWin = static_cast<OsContextWin &>(osContext);
    const auto *fenceCpuAddress = osContextWin.getResidencyController().getMonitoredFence().cpuAddress;

    if (!fenceCpuAddress) {
        return false;
    }
    return *fenceCpuAddress == gpuHangIndication;
}
void Wddm::initGfxPartition(GfxPartition &outGfxPartition, uint32_t rootDeviceIndex, size_t numRootDevices, bool useExternalFrontWindowPool) const {
if (gfxPartition.SVM.Limit != 0) {
outGfxPartition.heapInit(HeapIndex::HEAP_SVM, gfxPartition.SVM.Base, gfxPartition.SVM.Limit - gfxPartition.SVM.Base + 1);

View File

@ -23,6 +23,8 @@
#include "sku_info.h"
#include <cstdint>
#include <limits>
#include <memory>
#include <mutex>
@ -57,6 +59,7 @@ CREATECONTEXT_PVTDATA initPrivateData(OsContextWin &osContext);
class Wddm : public DriverModel {
public:
static constexpr DriverModelType driverModelType = DriverModelType::WDDM;
// Monitored-fence CPU value that isGpuHangDetected() treats as a GPU hang:
// the largest value representable in std::uint64_t.
static constexpr std::uint64_t gpuHangIndication{std::numeric_limits<std::uint64_t>::max()};
typedef HRESULT(WINAPI *CreateDXGIFactoryFcn)(REFIID riid, void **ppFactory);
typedef HRESULT(WINAPI *DXCoreCreateAdapterFactoryFcn)(REFIID riid, void **ppFactory);
@ -109,6 +112,8 @@ class Wddm : public DriverModel {
MOCKABLE_VIRTUAL bool isShutdownInProgress();
// Returns true when the monitored fence of osContext has a CPU address and the value
// stored there equals gpuHangIndication; returns false otherwise.
bool isGpuHangDetected(OsContext &osContext) override;
bool configureDeviceAddressSpace();
const FeatureTable &getFeatureTable() const {
return *featureTable;

View File

@ -27,7 +27,7 @@ class MockDriverModel : public NEO::DriverModel {
return 0;
}
bool isGpuHangDetected(uint32_t contextId) override {
bool isGpuHangDetected(NEO::OsContext &osContext) override {
if (isGpuHangDetectedSideEffect) {
std::invoke(isGpuHangDetectedSideEffect);
}

View File

@ -192,6 +192,10 @@ TEST_F(DeviceGetCapsTest, whenDriverModelHasLimitationForMaxMemoryAllocationSize
void setGmmInputArgs(void *args) override {}
uint32_t getDeviceHandle() const override { return {}; }
PhysicalDevicePciBusInfo getPciBusInfo() const override { return {}; }
// Mock override: ignores osContext and always reports that no GPU hang was detected.
bool isGpuHangDetected(NEO::OsContext &osContext) override { return false; }
size_t getMaxMemAllocSize() const override {
return maxAllocSize;
}