Implement GPU hang detection on Windows

This change uses the value of cpuAddress from the monitored fence
to detect a GPU hang.

Related-To: NEO-5313
Signed-off-by: Patryk Wrobel <patryk.wrobel@intel.com>
This commit is contained in:
Patryk Wrobel
2022-02-04 16:03:36 +00:00
committed by Compute-Runtime-Automation
parent 61ca84e94b
commit 18cafd3a52
12 changed files with 69 additions and 86 deletions

View File

@ -1038,30 +1038,6 @@ TEST(DrmTest, GivenCompletionFenceDebugFlagWhenCreatingDrmObjectThenExpectCorrec
EXPECT_FALSE(drmDisabled.completionFenceSupport());
}
// Registers a single engine whose OS context was created with id 0, then
// queries isGpuHangDetected() with id 1 and expects std::runtime_error.
TEST(DrmTest, GivenInvalidContextIdWhenIsGpuHangIsCalledThenErrorIsThrown) {
    ExecutionEnvironment executionEnvironment{};
    executionEnvironment.prepareRootDeviceEnvironments(1);

    DrmMock drm{*executionEnvironment.rootDeviceEnvironments[0]};

    // Build the one engine that the memory manager will know about.
    uint32_t registeredContextId = 0;
    EngineDescriptor descriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::Regular})};
    CommandStreamReceiver *commandStreamReceiver = nullptr;
    MockOsContextLinux osContextLinux{drm, registeredContextId, descriptor};
    EngineControlContainer engineList{EngineControl{commandStreamReceiver, &osContextLinux}};

    // Keep a raw handle: ownership moves into the execution environment below,
    // but the test still needs to reach registeredEngines afterwards.
    auto ownedMemoryManager = std::make_unique<MockMemoryManager>();
    auto *rawMemoryManager = ownedMemoryManager.get();
    rawMemoryManager->registeredEngines = std::move(engineList);
    executionEnvironment.memoryManager = std::move(ownedMemoryManager);

    constexpr int unknownContextId = 1;
    EXPECT_THROW(drm.isGpuHangDetected(unknownContextId), std::runtime_error);

    // Drop the engine entries, which point at the stack-allocated OS context,
    // before the locals go out of scope.
    rawMemoryManager->registeredEngines.clear();
}
TEST(DrmTest, GivenIoctlErrorWhenIsGpuHangIsCalledThenErrorIsThrown) {
ExecutionEnvironment executionEnvironment{};
executionEnvironment.prepareRootDeviceEnvironments(1);
@ -1070,22 +1046,11 @@ TEST(DrmTest, GivenIoctlErrorWhenIsGpuHangIsCalledThenErrorIsThrown) {
uint32_t contextId{0};
EngineDescriptor engineDescriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::Regular})};
CommandStreamReceiver *csr{nullptr};
MockOsContextLinux mockOsContextLinux{drm, contextId, engineDescriptor};
EngineControlContainer engines{EngineControl{csr, &mockOsContextLinux}};
auto memoryManager = std::make_unique<MockMemoryManager>();
auto memoryManagerRaw = memoryManager.get();
memoryManagerRaw->registeredEngines = std::move(engines);
executionEnvironment.memoryManager = std::move(memoryManager);
mockOsContextLinux.drmContextIds.push_back(0);
mockOsContextLinux.drmContextIds.push_back(3);
EXPECT_THROW(drm.isGpuHangDetected(0), std::runtime_error);
memoryManagerRaw->registeredEngines.clear();
EXPECT_THROW(drm.isGpuHangDetected(mockOsContextLinux), std::runtime_error);
}
TEST(DrmTest, GivenZeroBatchActiveAndZeroBatchPendingResetStatsWhenIsGpuHangIsCalledThenNoHangIsReported) {
@ -1096,30 +1061,20 @@ TEST(DrmTest, GivenZeroBatchActiveAndZeroBatchPendingResetStatsWhenIsGpuHangIsCa
uint32_t contextId{0};
EngineDescriptor engineDescriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::Regular})};
CommandStreamReceiver *csr{nullptr};
MockOsContextLinux mockOsContextLinux{drm, contextId, engineDescriptor};
EngineControlContainer engines{EngineControl{csr, &mockOsContextLinux}};
auto memoryManager = std::make_unique<MockMemoryManager>();
auto memoryManagerRaw = memoryManager.get();
memoryManagerRaw->registeredEngines = std::move(engines);
executionEnvironment.memoryManager = std::move(memoryManager);
mockOsContextLinux.drmContextIds.push_back(0);
mockOsContextLinux.drmContextIds.push_back(3);
drm_i915_reset_stats resetStats{};
resetStats.ctx_id = 0;
mockOsContextLinux.drmContextIds.push_back(0);
drm.resetStatsToReturn.push_back(resetStats);
resetStats.ctx_id = 3;
mockOsContextLinux.drmContextIds.push_back(3);
drm.resetStatsToReturn.push_back(resetStats);
bool isGpuHangDetected{};
EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(0));
EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(mockOsContextLinux));
EXPECT_FALSE(isGpuHangDetected);
memoryManagerRaw->registeredEngines.clear();
}
TEST(DrmTest, GivenBatchActiveGreaterThanZeroResetStatsWhenIsGpuHangIsCalledThenHangIsReported) {
@ -1130,31 +1085,21 @@ TEST(DrmTest, GivenBatchActiveGreaterThanZeroResetStatsWhenIsGpuHangIsCalledThen
uint32_t contextId{0};
EngineDescriptor engineDescriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::Regular})};
CommandStreamReceiver *csr{nullptr};
MockOsContextLinux mockOsContextLinux{drm, contextId, engineDescriptor};
EngineControlContainer engines{EngineControl{csr, &mockOsContextLinux}};
auto memoryManager = std::make_unique<MockMemoryManager>();
auto memoryManagerRaw = memoryManager.get();
memoryManagerRaw->registeredEngines = std::move(engines);
executionEnvironment.memoryManager = std::move(memoryManager);
mockOsContextLinux.drmContextIds.push_back(0);
mockOsContextLinux.drmContextIds.push_back(3);
drm_i915_reset_stats resetStats{};
resetStats.ctx_id = 0;
mockOsContextLinux.drmContextIds.push_back(0);
drm.resetStatsToReturn.push_back(resetStats);
resetStats.ctx_id = 3;
resetStats.batch_active = 2;
mockOsContextLinux.drmContextIds.push_back(3);
drm.resetStatsToReturn.push_back(resetStats);
bool isGpuHangDetected{};
EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(0));
EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(mockOsContextLinux));
EXPECT_TRUE(isGpuHangDetected);
memoryManagerRaw->registeredEngines.clear();
}
TEST(DrmTest, GivenBatchPendingGreaterThanZeroResetStatsWhenIsGpuHangIsCalledThenHangIsReported) {
@ -1165,27 +1110,17 @@ TEST(DrmTest, GivenBatchPendingGreaterThanZeroResetStatsWhenIsGpuHangIsCalledThe
uint32_t contextId{0};
EngineDescriptor engineDescriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::Regular})};
CommandStreamReceiver *csr{nullptr};
MockOsContextLinux mockOsContextLinux{drm, contextId, engineDescriptor};
EngineControlContainer engines{EngineControl{csr, &mockOsContextLinux}};
auto memoryManager = std::make_unique<MockMemoryManager>();
auto memoryManagerRaw = memoryManager.get();
memoryManagerRaw->registeredEngines = std::move(engines);
executionEnvironment.memoryManager = std::move(memoryManager);
mockOsContextLinux.drmContextIds.push_back(8);
drm_i915_reset_stats resetStats{};
resetStats.ctx_id = 8;
resetStats.batch_pending = 7;
mockOsContextLinux.drmContextIds.push_back(8);
drm.resetStatsToReturn.push_back(resetStats);
bool isGpuHangDetected{};
EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(0));
EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(mockOsContextLinux));
EXPECT_TRUE(isGpuHangDetected);
memoryManagerRaw->registeredEngines.clear();
}
TEST(DrmTest, givenSetupIoctlHelperThenIoctlHelperNotNull) {

View File

@ -38,7 +38,9 @@
#include "gtest/gtest.h"
#include "mock_gmm_memory.h"
#include <cstdint>
#include <functional>
#include <limits>
#include <memory>
namespace NEO {
@ -344,6 +346,24 @@ TEST_F(Wddm20Tests, givenGraphicsAllocationWhenItIsMappedInHeap0ThenItHasGpuAddr
EXPECT_LE(gpuAddress, cannonizedHeapEnd);
}
// With the monitored fence's cpuAddress set to nullptr, isGpuHangDetected()
// is expected to return false.
TEST_F(Wddm20WithMockGdiDllTests, GivenInvalidCpuAddressWhenCheckingForGpuHangThenFalseIsReturned) {
    auto &monitoredFence = osContext->getResidencyController().getMonitoredFence();
    monitoredFence.cpuAddress = nullptr;

    const auto hangDetected = wddm->isGpuHangDetected(*osContext);
    EXPECT_FALSE(hangDetected);
}
// Writes a value other than Wddm::gpuHangIndication through the monitored
// fence's cpuAddress and expects isGpuHangDetected() to return false.
TEST_F(Wddm20WithMockGdiDllTests, GivenCpuValueDifferentThanGpuHangIndicationWhenCheckingForGpuHangThenFalseIsReturned) {
    constexpr auto ordinaryFenceValue = 777u;
    // Guard the premise of the test: the chosen value must not be the hang marker.
    ASSERT_NE(NEO::Wddm::gpuHangIndication, ordinaryFenceValue);

    auto &monitoredFence = osContext->getResidencyController().getMonitoredFence();
    *monitoredFence.cpuAddress = ordinaryFenceValue;

    const auto hangDetected = wddm->isGpuHangDetected(*osContext);
    EXPECT_FALSE(hangDetected);
}
// Writes Wddm::gpuHangIndication through the monitored fence's cpuAddress and
// expects isGpuHangDetected() to return true.
TEST_F(Wddm20WithMockGdiDllTests, GivenGpuHangIndicationWhenCheckingForGpuHangThenTrueIsReturned) {
    auto &monitoredFence = osContext->getResidencyController().getMonitoredFence();
    *monitoredFence.cpuAddress = NEO::Wddm::gpuHangIndication;

    const auto hangDetected = wddm->isGpuHangDetected(*osContext);
    EXPECT_TRUE(hangDetected);
}
TEST_F(Wddm20WithMockGdiDllTests, GivenThreeOsHandlesWhenAskedForDestroyAllocationsThenAllMarkedAllocationsAreDestroyed) {
OsHandleStorage storage;
OsHandleWin osHandle1;