mirror of
https://github.com/intel/compute-runtime.git
synced 2025-09-15 13:01:45 +08:00
Implement GPU hang detection on Windows
This change uses the value of cpuAddress from the monitored fence to detect a GPU hang. Related-To: NEO-5313 Signed-off-by: Patryk Wrobel <patryk.wrobel@intel.com>
This commit is contained in:

committed by
Compute-Runtime-Automation

parent
61ca84e94b
commit
18cafd3a52
@ -1038,30 +1038,6 @@ TEST(DrmTest, GivenCompletionFenceDebugFlagWhenCreatingDrmObjectThenExpectCorrec
|
||||
EXPECT_FALSE(drmDisabled.completionFenceSupport());
|
||||
}
|
||||
|
||||
// Checks that DrmMock::isGpuHangDetected() throws std::runtime_error when it is
// called with a context id (1) that differs from the id (0) the only registered
// OS context was constructed with.
TEST(DrmTest, GivenInvalidContextIdWhenIsGpuHangIsCalledThenErrorIsThrown) {
|
||||
ExecutionEnvironment executionEnvironment{};
|
||||
executionEnvironment.prepareRootDeviceEnvironments(1);
|
||||
|
||||
DrmMock drm{*executionEnvironment.rootDeviceEnvironments[0]};
|
||||
uint32_t contextId{0};
|
||||
EngineDescriptor engineDescriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::Regular})};
|
||||
|
||||
// The engine entry pairs a null CSR with the mock OS context.
CommandStreamReceiver *csr{nullptr};
|
||||
MockOsContextLinux mockOsContextLinux{drm, contextId, engineDescriptor};
|
||||
EngineControlContainer engines{EngineControl{csr, &mockOsContextLinux}};
|
||||
|
||||
// Keep a raw pointer: ownership of the memory manager moves into the
// execution environment below, but the test still needs to reach it.
auto memoryManager = std::make_unique<MockMemoryManager>();
|
||||
auto memoryManagerRaw = memoryManager.get();
|
||||
|
||||
memoryManagerRaw->registeredEngines = std::move(engines);
|
||||
executionEnvironment.memoryManager = std::move(memoryManager);
|
||||
|
||||
const auto invalidContextId = 1;
|
||||
EXPECT_THROW(drm.isGpuHangDetected(invalidContextId), std::runtime_error);
|
||||
|
||||
// NOTE(review): presumably cleared so the memory manager does not touch the
// stack-allocated mockOsContextLinux during teardown - confirm.
memoryManagerRaw->registeredEngines.clear();
|
||||
}
|
||||
|
||||
TEST(DrmTest, GivenIoctlErrorWhenIsGpuHangIsCalledThenErrorIsThrown) {
|
||||
ExecutionEnvironment executionEnvironment{};
|
||||
executionEnvironment.prepareRootDeviceEnvironments(1);
|
||||
@ -1070,22 +1046,11 @@ TEST(DrmTest, GivenIoctlErrorWhenIsGpuHangIsCalledThenErrorIsThrown) {
|
||||
uint32_t contextId{0};
|
||||
EngineDescriptor engineDescriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::Regular})};
|
||||
|
||||
CommandStreamReceiver *csr{nullptr};
|
||||
MockOsContextLinux mockOsContextLinux{drm, contextId, engineDescriptor};
|
||||
EngineControlContainer engines{EngineControl{csr, &mockOsContextLinux}};
|
||||
|
||||
auto memoryManager = std::make_unique<MockMemoryManager>();
|
||||
auto memoryManagerRaw = memoryManager.get();
|
||||
|
||||
memoryManagerRaw->registeredEngines = std::move(engines);
|
||||
executionEnvironment.memoryManager = std::move(memoryManager);
|
||||
|
||||
mockOsContextLinux.drmContextIds.push_back(0);
|
||||
mockOsContextLinux.drmContextIds.push_back(3);
|
||||
|
||||
EXPECT_THROW(drm.isGpuHangDetected(0), std::runtime_error);
|
||||
|
||||
memoryManagerRaw->registeredEngines.clear();
|
||||
EXPECT_THROW(drm.isGpuHangDetected(mockOsContextLinux), std::runtime_error);
|
||||
}
|
||||
|
||||
TEST(DrmTest, GivenZeroBatchActiveAndZeroBatchPendingResetStatsWhenIsGpuHangIsCalledThenNoHangIsReported) {
|
||||
@ -1096,30 +1061,20 @@ TEST(DrmTest, GivenZeroBatchActiveAndZeroBatchPendingResetStatsWhenIsGpuHangIsCa
|
||||
uint32_t contextId{0};
|
||||
EngineDescriptor engineDescriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::Regular})};
|
||||
|
||||
CommandStreamReceiver *csr{nullptr};
|
||||
MockOsContextLinux mockOsContextLinux{drm, contextId, engineDescriptor};
|
||||
EngineControlContainer engines{EngineControl{csr, &mockOsContextLinux}};
|
||||
|
||||
auto memoryManager = std::make_unique<MockMemoryManager>();
|
||||
auto memoryManagerRaw = memoryManager.get();
|
||||
|
||||
memoryManagerRaw->registeredEngines = std::move(engines);
|
||||
executionEnvironment.memoryManager = std::move(memoryManager);
|
||||
mockOsContextLinux.drmContextIds.push_back(0);
|
||||
mockOsContextLinux.drmContextIds.push_back(3);
|
||||
|
||||
drm_i915_reset_stats resetStats{};
|
||||
resetStats.ctx_id = 0;
|
||||
mockOsContextLinux.drmContextIds.push_back(0);
|
||||
drm.resetStatsToReturn.push_back(resetStats);
|
||||
|
||||
resetStats.ctx_id = 3;
|
||||
mockOsContextLinux.drmContextIds.push_back(3);
|
||||
drm.resetStatsToReturn.push_back(resetStats);
|
||||
|
||||
bool isGpuHangDetected{};
|
||||
EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(0));
|
||||
EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(mockOsContextLinux));
|
||||
EXPECT_FALSE(isGpuHangDetected);
|
||||
|
||||
memoryManagerRaw->registeredEngines.clear();
|
||||
}
|
||||
|
||||
TEST(DrmTest, GivenBatchActiveGreaterThanZeroResetStatsWhenIsGpuHangIsCalledThenHangIsReported) {
|
||||
@ -1130,31 +1085,21 @@ TEST(DrmTest, GivenBatchActiveGreaterThanZeroResetStatsWhenIsGpuHangIsCalledThen
|
||||
uint32_t contextId{0};
|
||||
EngineDescriptor engineDescriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::Regular})};
|
||||
|
||||
CommandStreamReceiver *csr{nullptr};
|
||||
MockOsContextLinux mockOsContextLinux{drm, contextId, engineDescriptor};
|
||||
EngineControlContainer engines{EngineControl{csr, &mockOsContextLinux}};
|
||||
|
||||
auto memoryManager = std::make_unique<MockMemoryManager>();
|
||||
auto memoryManagerRaw = memoryManager.get();
|
||||
|
||||
memoryManagerRaw->registeredEngines = std::move(engines);
|
||||
executionEnvironment.memoryManager = std::move(memoryManager);
|
||||
mockOsContextLinux.drmContextIds.push_back(0);
|
||||
mockOsContextLinux.drmContextIds.push_back(3);
|
||||
|
||||
drm_i915_reset_stats resetStats{};
|
||||
resetStats.ctx_id = 0;
|
||||
mockOsContextLinux.drmContextIds.push_back(0);
|
||||
drm.resetStatsToReturn.push_back(resetStats);
|
||||
|
||||
resetStats.ctx_id = 3;
|
||||
resetStats.batch_active = 2;
|
||||
mockOsContextLinux.drmContextIds.push_back(3);
|
||||
drm.resetStatsToReturn.push_back(resetStats);
|
||||
|
||||
bool isGpuHangDetected{};
|
||||
EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(0));
|
||||
EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(mockOsContextLinux));
|
||||
EXPECT_TRUE(isGpuHangDetected);
|
||||
|
||||
memoryManagerRaw->registeredEngines.clear();
|
||||
}
|
||||
|
||||
TEST(DrmTest, GivenBatchPendingGreaterThanZeroResetStatsWhenIsGpuHangIsCalledThenHangIsReported) {
|
||||
@ -1165,27 +1110,17 @@ TEST(DrmTest, GivenBatchPendingGreaterThanZeroResetStatsWhenIsGpuHangIsCalledThe
|
||||
uint32_t contextId{0};
|
||||
EngineDescriptor engineDescriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::Regular})};
|
||||
|
||||
CommandStreamReceiver *csr{nullptr};
|
||||
MockOsContextLinux mockOsContextLinux{drm, contextId, engineDescriptor};
|
||||
EngineControlContainer engines{EngineControl{csr, &mockOsContextLinux}};
|
||||
|
||||
auto memoryManager = std::make_unique<MockMemoryManager>();
|
||||
auto memoryManagerRaw = memoryManager.get();
|
||||
|
||||
memoryManagerRaw->registeredEngines = std::move(engines);
|
||||
executionEnvironment.memoryManager = std::move(memoryManager);
|
||||
mockOsContextLinux.drmContextIds.push_back(8);
|
||||
|
||||
drm_i915_reset_stats resetStats{};
|
||||
resetStats.ctx_id = 8;
|
||||
resetStats.batch_pending = 7;
|
||||
mockOsContextLinux.drmContextIds.push_back(8);
|
||||
drm.resetStatsToReturn.push_back(resetStats);
|
||||
|
||||
bool isGpuHangDetected{};
|
||||
EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(0));
|
||||
EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(mockOsContextLinux));
|
||||
EXPECT_TRUE(isGpuHangDetected);
|
||||
|
||||
memoryManagerRaw->registeredEngines.clear();
|
||||
}
|
||||
|
||||
TEST(DrmTest, givenSetupIoctlHelperThenIoctlHelperNotNull) {
|
||||
|
@ -38,7 +38,9 @@
|
||||
#include "gtest/gtest.h"
|
||||
#include "mock_gmm_memory.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
|
||||
namespace NEO {
|
||||
@ -344,6 +346,24 @@ TEST_F(Wddm20Tests, givenGraphicsAllocationWhenItIsMappedInHeap0ThenItHasGpuAddr
|
||||
EXPECT_LE(gpuAddress, cannonizedHeapEnd);
|
||||
}
|
||||
|
||||
// With the monitored fence's cpuAddress set to nullptr, isGpuHangDetected()
// is expected to return false for this OS context.
TEST_F(Wddm20WithMockGdiDllTests, GivenInvalidCpuAddressWhenCheckingForGpuHangThenFalseIsReturned) {
|
||||
osContext->getResidencyController().getMonitoredFence().cpuAddress = nullptr;
|
||||
EXPECT_FALSE(wddm->isGpuHangDetected(*osContext));
|
||||
}
|
||||
|
||||
// Stores a value other than NEO::Wddm::gpuHangIndication at the monitored
// fence's cpuAddress and expects isGpuHangDetected() to return false.
TEST_F(Wddm20WithMockGdiDllTests, GivenCpuValueDifferentThanGpuHangIndicationWhenCheckingForGpuHangThenFalseIsReturned) {
|
||||
constexpr auto cpuValue{777u};
|
||||
// Guard the test's premise: 777 must not coincide with the hang marker.
ASSERT_NE(NEO::Wddm::gpuHangIndication, cpuValue);
|
||||
|
||||
*osContext->getResidencyController().getMonitoredFence().cpuAddress = cpuValue;
|
||||
EXPECT_FALSE(wddm->isGpuHangDetected(*osContext));
|
||||
}
|
||||
|
||||
// Stores NEO::Wddm::gpuHangIndication at the monitored fence's cpuAddress and
// expects isGpuHangDetected() to return true for this OS context.
TEST_F(Wddm20WithMockGdiDllTests, GivenGpuHangIndicationWhenCheckingForGpuHangThenTrueIsReturned) {
|
||||
*osContext->getResidencyController().getMonitoredFence().cpuAddress = NEO::Wddm::gpuHangIndication;
|
||||
EXPECT_TRUE(wddm->isGpuHangDetected(*osContext));
|
||||
}
|
||||
|
||||
TEST_F(Wddm20WithMockGdiDllTests, GivenThreeOsHandlesWhenAskedForDestroyAllocationsThenAllMarkedAllocationsAreDestroyed) {
|
||||
OsHandleStorage storage;
|
||||
OsHandleWin osHandle1;
|
||||
|
Reference in New Issue
Block a user