mirror of
https://github.com/intel/compute-runtime.git
synced 2025-09-15 13:01:45 +08:00
Implement GPU hang detection on Windows
This change detects a GPU hang by reading the value stored at the monitored fence's cpuAddress and comparing it against a dedicated hang-indication value. Related-To: NEO-5313 Signed-off-by: Patryk Wrobel <patryk.wrobel@intel.com>
This commit is contained in:

committed by
Compute-Runtime-Automation

parent
61ca84e94b
commit
18cafd3a52
@ -38,6 +38,10 @@ struct MockDriverModel : NEO::DriverModel {
|
||||
// Mock override: returns whatever is currently stored in maxAllocSize
// (the member itself is declared outside this hunk).
size_t getMaxMemAllocSize() const override {
|
||||
return maxAllocSize;
|
||||
}
|
||||
|
||||
// Mock override of the DriverModel hang query: always reports "no hang".
// The osContext argument is not inspected.
bool isGpuHangDetected(NEO::OsContext &osContext) override {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
struct MockDriverModelWDDM : NEO::DriverModel {
|
||||
@ -51,6 +55,10 @@ struct MockDriverModelWDDM : NEO::DriverModel {
|
||||
// Mock override: returns whatever is currently stored in maxAllocSize
// (the member itself is declared outside this hunk).
size_t getMaxMemAllocSize() const override {
|
||||
return maxAllocSize;
|
||||
}
|
||||
|
||||
// Mock override of the DriverModel hang query: always reports "no hang".
// The osContext argument is not inspected.
bool isGpuHangDetected(NEO::OsContext &osContext) override {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
struct MockDriverModelDRM : NEO::DriverModel {
|
||||
@ -64,6 +72,10 @@ struct MockDriverModelDRM : NEO::DriverModel {
|
||||
// Mock override: returns whatever is currently stored in maxAllocSize
// (the member itself is declared outside this hunk).
size_t getMaxMemAllocSize() const override {
|
||||
return maxAllocSize;
|
||||
}
|
||||
|
||||
// Mock override of the DriverModel hang query: always reports "no hang".
// The osContext argument is not inspected.
bool isGpuHangDetected(NEO::OsContext &osContext) override {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
struct ContextShareableMock : public L0::ContextImp {
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2020-2021 Intel Corporation
|
||||
* Copyright (C) 2020-2022 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@ -366,6 +366,10 @@ class UnknownDriverModel : public DriverModel {
|
||||
PhysicalDevicePciBusInfo pciBusInfo(PhysicalDevicePciBusInfo::InvalidValue, PhysicalDevicePciBusInfo::InvalidValue, PhysicalDevicePciBusInfo::InvalidValue, PhysicalDevicePciBusInfo::InvalidValue);
|
||||
return pciBusInfo;
|
||||
}
|
||||
|
||||
// Override for the test-only UnknownDriverModel: always reports "no hang".
// The osContext argument is not inspected.
bool isGpuHangDetected(OsContext &osContext) override {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
using SysmanUnknownDriverModelTest = Test<DeviceFixture>;
|
||||
|
@ -1038,30 +1038,6 @@ TEST(DrmTest, GivenCompletionFenceDebugFlagWhenCreatingDrmObjectThenExpectCorrec
|
||||
EXPECT_FALSE(drmDisabled.completionFenceSupport());
|
||||
}
|
||||
|
||||
// Registers a single EngineControl with the memory manager and expects
// Drm::isGpuHangDetected, called with context id 1, to throw std::runtime_error.
// NOTE(review): this exercises the overload taking a uint32_t context id; elsewhere
// on this page that overload is shown being replaced by one taking OsContext&, so
// this test is presumably the removed side of the diff - confirm before relying on it.
TEST(DrmTest, GivenInvalidContextIdWhenIsGpuHangIsCalledThenErrorIsThrown) {
|
||||
ExecutionEnvironment executionEnvironment{};
|
||||
executionEnvironment.prepareRootDeviceEnvironments(1);
|
||||
|
||||
DrmMock drm{*executionEnvironment.rootDeviceEnvironments[0]};
|
||||
uint32_t contextId{0};
|
||||
EngineDescriptor engineDescriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::Regular})};
|
||||
|
||||
// No command stream receiver is needed; only the OS context of the engine matters here.
CommandStreamReceiver *csr{nullptr};
|
||||
MockOsContextLinux mockOsContextLinux{drm, contextId, engineDescriptor};
|
||||
EngineControlContainer engines{EngineControl{csr, &mockOsContextLinux}};
|
||||
|
||||
auto memoryManager = std::make_unique<MockMemoryManager>();
|
||||
// Keep a raw pointer so the manager stays reachable after ownership moves below.
auto memoryManagerRaw = memoryManager.get();
|
||||
|
||||
memoryManagerRaw->registeredEngines = std::move(engines);
|
||||
executionEnvironment.memoryManager = std::move(memoryManager);
|
||||
|
||||
// Only one engine was registered above, so id 1 is out of range.
const auto invalidContextId = 1;
|
||||
EXPECT_THROW(drm.isGpuHangDetected(invalidContextId), std::runtime_error);
|
||||
|
||||
// The registered entry points at the stack-allocated mockOsContextLinux; it is cleared
// here, presumably so teardown does not touch that local - TODO confirm.
memoryManagerRaw->registeredEngines.clear();
|
||||
}
|
||||
|
||||
TEST(DrmTest, GivenIoctlErrorWhenIsGpuHangIsCalledThenErrorIsThrown) {
|
||||
ExecutionEnvironment executionEnvironment{};
|
||||
executionEnvironment.prepareRootDeviceEnvironments(1);
|
||||
@ -1070,22 +1046,11 @@ TEST(DrmTest, GivenIoctlErrorWhenIsGpuHangIsCalledThenErrorIsThrown) {
|
||||
uint32_t contextId{0};
|
||||
EngineDescriptor engineDescriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::Regular})};
|
||||
|
||||
CommandStreamReceiver *csr{nullptr};
|
||||
MockOsContextLinux mockOsContextLinux{drm, contextId, engineDescriptor};
|
||||
EngineControlContainer engines{EngineControl{csr, &mockOsContextLinux}};
|
||||
|
||||
auto memoryManager = std::make_unique<MockMemoryManager>();
|
||||
auto memoryManagerRaw = memoryManager.get();
|
||||
|
||||
memoryManagerRaw->registeredEngines = std::move(engines);
|
||||
executionEnvironment.memoryManager = std::move(memoryManager);
|
||||
|
||||
mockOsContextLinux.drmContextIds.push_back(0);
|
||||
mockOsContextLinux.drmContextIds.push_back(3);
|
||||
|
||||
EXPECT_THROW(drm.isGpuHangDetected(0), std::runtime_error);
|
||||
|
||||
memoryManagerRaw->registeredEngines.clear();
|
||||
EXPECT_THROW(drm.isGpuHangDetected(mockOsContextLinux), std::runtime_error);
|
||||
}
|
||||
|
||||
TEST(DrmTest, GivenZeroBatchActiveAndZeroBatchPendingResetStatsWhenIsGpuHangIsCalledThenNoHangIsReported) {
|
||||
@ -1096,30 +1061,20 @@ TEST(DrmTest, GivenZeroBatchActiveAndZeroBatchPendingResetStatsWhenIsGpuHangIsCa
|
||||
uint32_t contextId{0};
|
||||
EngineDescriptor engineDescriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::Regular})};
|
||||
|
||||
CommandStreamReceiver *csr{nullptr};
|
||||
MockOsContextLinux mockOsContextLinux{drm, contextId, engineDescriptor};
|
||||
EngineControlContainer engines{EngineControl{csr, &mockOsContextLinux}};
|
||||
|
||||
auto memoryManager = std::make_unique<MockMemoryManager>();
|
||||
auto memoryManagerRaw = memoryManager.get();
|
||||
|
||||
memoryManagerRaw->registeredEngines = std::move(engines);
|
||||
executionEnvironment.memoryManager = std::move(memoryManager);
|
||||
mockOsContextLinux.drmContextIds.push_back(0);
|
||||
mockOsContextLinux.drmContextIds.push_back(3);
|
||||
|
||||
drm_i915_reset_stats resetStats{};
|
||||
resetStats.ctx_id = 0;
|
||||
mockOsContextLinux.drmContextIds.push_back(0);
|
||||
drm.resetStatsToReturn.push_back(resetStats);
|
||||
|
||||
resetStats.ctx_id = 3;
|
||||
mockOsContextLinux.drmContextIds.push_back(3);
|
||||
drm.resetStatsToReturn.push_back(resetStats);
|
||||
|
||||
bool isGpuHangDetected{};
|
||||
EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(0));
|
||||
EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(mockOsContextLinux));
|
||||
EXPECT_FALSE(isGpuHangDetected);
|
||||
|
||||
memoryManagerRaw->registeredEngines.clear();
|
||||
}
|
||||
|
||||
TEST(DrmTest, GivenBatchActiveGreaterThanZeroResetStatsWhenIsGpuHangIsCalledThenHangIsReported) {
|
||||
@ -1130,31 +1085,21 @@ TEST(DrmTest, GivenBatchActiveGreaterThanZeroResetStatsWhenIsGpuHangIsCalledThen
|
||||
uint32_t contextId{0};
|
||||
EngineDescriptor engineDescriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::Regular})};
|
||||
|
||||
CommandStreamReceiver *csr{nullptr};
|
||||
MockOsContextLinux mockOsContextLinux{drm, contextId, engineDescriptor};
|
||||
EngineControlContainer engines{EngineControl{csr, &mockOsContextLinux}};
|
||||
|
||||
auto memoryManager = std::make_unique<MockMemoryManager>();
|
||||
auto memoryManagerRaw = memoryManager.get();
|
||||
|
||||
memoryManagerRaw->registeredEngines = std::move(engines);
|
||||
executionEnvironment.memoryManager = std::move(memoryManager);
|
||||
mockOsContextLinux.drmContextIds.push_back(0);
|
||||
mockOsContextLinux.drmContextIds.push_back(3);
|
||||
|
||||
drm_i915_reset_stats resetStats{};
|
||||
resetStats.ctx_id = 0;
|
||||
mockOsContextLinux.drmContextIds.push_back(0);
|
||||
drm.resetStatsToReturn.push_back(resetStats);
|
||||
|
||||
resetStats.ctx_id = 3;
|
||||
resetStats.batch_active = 2;
|
||||
mockOsContextLinux.drmContextIds.push_back(3);
|
||||
drm.resetStatsToReturn.push_back(resetStats);
|
||||
|
||||
bool isGpuHangDetected{};
|
||||
EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(0));
|
||||
EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(mockOsContextLinux));
|
||||
EXPECT_TRUE(isGpuHangDetected);
|
||||
|
||||
memoryManagerRaw->registeredEngines.clear();
|
||||
}
|
||||
|
||||
TEST(DrmTest, GivenBatchPendingGreaterThanZeroResetStatsWhenIsGpuHangIsCalledThenHangIsReported) {
|
||||
@ -1165,27 +1110,17 @@ TEST(DrmTest, GivenBatchPendingGreaterThanZeroResetStatsWhenIsGpuHangIsCalledThe
|
||||
uint32_t contextId{0};
|
||||
EngineDescriptor engineDescriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::Regular})};
|
||||
|
||||
CommandStreamReceiver *csr{nullptr};
|
||||
MockOsContextLinux mockOsContextLinux{drm, contextId, engineDescriptor};
|
||||
EngineControlContainer engines{EngineControl{csr, &mockOsContextLinux}};
|
||||
|
||||
auto memoryManager = std::make_unique<MockMemoryManager>();
|
||||
auto memoryManagerRaw = memoryManager.get();
|
||||
|
||||
memoryManagerRaw->registeredEngines = std::move(engines);
|
||||
executionEnvironment.memoryManager = std::move(memoryManager);
|
||||
mockOsContextLinux.drmContextIds.push_back(8);
|
||||
|
||||
drm_i915_reset_stats resetStats{};
|
||||
resetStats.ctx_id = 8;
|
||||
resetStats.batch_pending = 7;
|
||||
mockOsContextLinux.drmContextIds.push_back(8);
|
||||
drm.resetStatsToReturn.push_back(resetStats);
|
||||
|
||||
bool isGpuHangDetected{};
|
||||
EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(0));
|
||||
EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(mockOsContextLinux));
|
||||
EXPECT_TRUE(isGpuHangDetected);
|
||||
|
||||
memoryManagerRaw->registeredEngines.clear();
|
||||
}
|
||||
|
||||
TEST(DrmTest, givenSetupIoctlHelperThenIoctlHelperNotNull) {
|
||||
|
@ -38,7 +38,9 @@
|
||||
#include "gtest/gtest.h"
|
||||
#include "mock_gmm_memory.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
|
||||
namespace NEO {
|
||||
@ -344,6 +346,24 @@ TEST_F(Wddm20Tests, givenGraphicsAllocationWhenItIsMappedInHeap0ThenItHasGpuAddr
|
||||
EXPECT_LE(gpuAddress, cannonizedHeapEnd);
|
||||
}
|
||||
|
||||
// With the monitored fence's cpuAddress set to nullptr, the hang query must
// return false (there is no value to read).
TEST_F(Wddm20WithMockGdiDllTests, GivenInvalidCpuAddressWhenCheckingForGpuHangThenFalseIsReturned) {
|
||||
osContext->getResidencyController().getMonitoredFence().cpuAddress = nullptr;
|
||||
EXPECT_FALSE(wddm->isGpuHangDetected(*osContext));
|
||||
}
|
||||
|
||||
// Writes a value other than Wddm::gpuHangIndication through the monitored fence's
// cpuAddress and expects the hang query to return false.
TEST_F(Wddm20WithMockGdiDllTests, GivenCpuValueDifferentThanGpuHangIndicationWhenCheckingForGpuHangThenFalseIsReturned) {
|
||||
constexpr auto cpuValue{777u};
|
||||
// Guard: the test is only meaningful if the chosen value is not the hang marker itself.
ASSERT_NE(NEO::Wddm::gpuHangIndication, cpuValue);
|
||||
|
||||
*osContext->getResidencyController().getMonitoredFence().cpuAddress = cpuValue;
|
||||
EXPECT_FALSE(wddm->isGpuHangDetected(*osContext));
|
||||
}
|
||||
|
||||
// Writes Wddm::gpuHangIndication through the monitored fence's cpuAddress and
// expects the hang query to return true.
TEST_F(Wddm20WithMockGdiDllTests, GivenGpuHangIndicationWhenCheckingForGpuHangThenTrueIsReturned) {
|
||||
*osContext->getResidencyController().getMonitoredFence().cpuAddress = NEO::Wddm::gpuHangIndication;
|
||||
EXPECT_TRUE(wddm->isGpuHangDetected(*osContext));
|
||||
}
|
||||
|
||||
TEST_F(Wddm20WithMockGdiDllTests, GivenThreeOsHandlesWhenAskedForDestroyAllocationsThenAllMarkedAllocationsAreDestroyed) {
|
||||
OsHandleStorage storage;
|
||||
OsHandleWin osHandle1;
|
||||
|
@ -243,7 +243,7 @@ bool CommandStreamReceiver::skipResourceCleanup() const {
|
||||
}
|
||||
|
||||
// Forwards the hang query to the driver model reached through getOSInterface().
// Returns false when the OS interface or the driver model is missing.
// NOTE(review): two return statements appear back to back. This text looks like a
// rendered diff, so the first is presumably the removed line and the second its
// replacement - confirm. Read literally, only the first return would ever execute.
bool CommandStreamReceiver::isGpuHangDetected() const {
|
||||
// Variant passing the context id; osContext is dereferenced without a null check.
return this->getOSInterface() && this->getOSInterface()->getDriverModel() && this->getOSInterface()->getDriverModel()->isGpuHangDetected(osContext->getContextId());
|
||||
// Variant passing the context object itself; osContext is checked for null first.
return this->osContext && this->getOSInterface() && this->getOSInterface()->getDriverModel() && this->getOSInterface()->getDriverModel()->isGpuHangDetected(*osContext);
|
||||
}
|
||||
|
||||
void CommandStreamReceiver::cleanupResources() {
|
||||
|
@ -318,11 +318,8 @@ int Drm::queryGttSize(uint64_t >tSizeOutput) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool Drm::isGpuHangDetected(uint32_t contextId) {
|
||||
const auto &engines = this->rootDeviceEnvironment.executionEnvironment.memoryManager->getRegisteredEngines();
|
||||
UNRECOVERABLE_IF(engines.size() <= contextId);
|
||||
|
||||
const auto osContextLinux = static_cast<OsContextLinux *>(engines[contextId].osContext);
|
||||
bool Drm::isGpuHangDetected(OsContext &osContext) {
|
||||
const auto osContextLinux = static_cast<OsContextLinux *>(&osContext);
|
||||
const auto &drmContextIds = osContextLinux->getDrmContextIds();
|
||||
|
||||
for (const auto drmContextId : drmContextIds) {
|
||||
|
@ -148,7 +148,7 @@ class Drm : public DriverModel {
|
||||
MOCKABLE_VIRTUAL void getPrelimVersion(std::string &prelimVersion);
|
||||
|
||||
PhysicalDevicePciBusInfo getPciBusInfo() const override;
|
||||
bool isGpuHangDetected(uint32_t contextId) override;
|
||||
bool isGpuHangDetected(OsContext &osContext) override;
|
||||
|
||||
bool areNonPersistentContextsSupported() const { return nonPersistentContextsSupported; }
|
||||
void checkNonPersistentContextsSupport();
|
||||
|
@ -19,6 +19,7 @@
|
||||
namespace NEO {
|
||||
class ExecutionEnvironment;
|
||||
class MemoryManager;
|
||||
class OsContext;
|
||||
|
||||
class HwDeviceId : public NonCopyableClass {
|
||||
public:
|
||||
@ -85,9 +86,7 @@ class DriverModel : public NonCopyableClass {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Id-based hang query with a default implementation that always reports "no hang".
// NOTE(review): shown next to the OsContext-based declaration below; presumably this is
// the removed side of the diff - confirm.
virtual bool isGpuHangDetected(uint32_t contextId) {
|
||||
return false;
|
||||
}
|
||||
// Context-based hang query. Pure virtual: every concrete driver model must implement it.
virtual bool isGpuHangDetected(OsContext &osContext) = 0;
|
||||
|
||||
protected:
|
||||
DriverModelType driverModelType;
|
||||
|
@ -918,6 +918,13 @@ bool Wddm::waitFromCpu(uint64_t lastFenceValue, const MonitoredFence &monitoredF
|
||||
return status == STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
// Reports whether the given OS context's monitored fence currently holds the
// GPU-hang marker.
//
// osContext: context whose residency controller owns the monitored fence to inspect.
// Returns true only when the fence's cpuAddress is non-null and the value read through
// it equals gpuHangIndication; returns false otherwise.
bool Wddm::isGpuHangDetected(OsContext &osContext) {
|
||||
// Unchecked downcast - assumes callers always pass an OsContextWin (TODO confirm).
const auto osContextWin = static_cast<OsContextWin *>(&osContext);
|
||||
const auto &monitoredFence = osContextWin->getResidencyController().getMonitoredFence();
|
||||
|
||||
// The null check comes first so a missing cpuAddress is never dereferenced.
return monitoredFence.cpuAddress && *monitoredFence.cpuAddress == gpuHangIndication;
|
||||
}
|
||||
|
||||
void Wddm::initGfxPartition(GfxPartition &outGfxPartition, uint32_t rootDeviceIndex, size_t numRootDevices, bool useExternalFrontWindowPool) const {
|
||||
if (gfxPartition.SVM.Limit != 0) {
|
||||
outGfxPartition.heapInit(HeapIndex::HEAP_SVM, gfxPartition.SVM.Base, gfxPartition.SVM.Limit - gfxPartition.SVM.Base + 1);
|
||||
|
@ -23,6 +23,8 @@
|
||||
|
||||
#include "sku_info.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
|
||||
@ -57,6 +59,7 @@ CREATECONTEXT_PVTDATA initPrivateData(OsContextWin &osContext);
|
||||
class Wddm : public DriverModel {
|
||||
public:
|
||||
static constexpr DriverModelType driverModelType = DriverModelType::WDDM;
|
||||
// Marker value (maximum std::uint64_t) that Wddm::isGpuHangDetected compares against
// the value read through a monitored fence's cpuAddress.
static constexpr std::uint64_t gpuHangIndication{std::numeric_limits<std::uint64_t>::max()};
|
||||
|
||||
typedef HRESULT(WINAPI *CreateDXGIFactoryFcn)(REFIID riid, void **ppFactory);
|
||||
typedef HRESULT(WINAPI *DXCoreCreateAdapterFactoryFcn)(REFIID riid, void **ppFactory);
|
||||
@ -109,6 +112,8 @@ class Wddm : public DriverModel {
|
||||
|
||||
MOCKABLE_VIRTUAL bool isShutdownInProgress();
|
||||
|
||||
bool isGpuHangDetected(OsContext &osContext) override;
|
||||
|
||||
bool configureDeviceAddressSpace();
|
||||
const FeatureTable &getFeatureTable() const {
|
||||
return *featureTable;
|
||||
|
@ -27,7 +27,7 @@ class MockDriverModel : public NEO::DriverModel {
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool isGpuHangDetected(uint32_t contextId) override {
|
||||
bool isGpuHangDetected(NEO::OsContext &osContext) override {
|
||||
if (isGpuHangDetectedSideEffect) {
|
||||
std::invoke(isGpuHangDetectedSideEffect);
|
||||
}
|
||||
|
@ -192,6 +192,10 @@ TEST_F(DeviceGetCapsTest, whenDriverModelHasLimitationForMaxMemoryAllocationSize
|
||||
// Stub override: intentionally does nothing with args.
void setGmmInputArgs(void *args) override {}
|
||||
// Stub override: returns a value-initialized handle (0).
uint32_t getDeviceHandle() const override { return {}; }
|
||||
// Stub override: returns a value-initialized PhysicalDevicePciBusInfo.
PhysicalDevicePciBusInfo getPciBusInfo() const override { return {}; }
|
||||
// Stub override of the hang query: always reports "no hang".
// The osContext argument is not inspected.
bool isGpuHangDetected(NEO::OsContext &osContext) override {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Stub override: returns whatever is currently stored in maxAllocSize
// (declared outside this hunk).
size_t getMaxMemAllocSize() const override {
|
||||
return maxAllocSize;
|
||||
}
|
||||
|
Reference in New Issue
Block a user