From 1275c4e200161f9036afa40d82157e659ecbbdaf Mon Sep 17 00:00:00 2001 From: Patryk Wrobel Date: Wed, 16 Feb 2022 10:22:03 +0000 Subject: [PATCH] Detect GPU hang in remaining blocking calls of L0 This change introduces detection of GPU hangs in zeEventHostSynchronize and zeFenceHostSynchronize. Furthermore, if CommandQueueHw::executeCommandLists uses ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS and a hang occurs, the information about it is propagated to the caller. Related-To: NEO-6681 Signed-off-by: Patryk Wrobel --- .../core/source/cmdqueue/cmdqueue_hw.inl | 5 +- level_zero/core/source/event/event.h | 4 +- level_zero/core/source/event/event_impl.inl | 19 +++- level_zero/core/source/fence/fence.cpp | 22 +++-- level_zero/core/source/fence/fence.h | 2 + .../test/unit_tests/mocks/mock_cmdqueue.h | 3 +- .../core/test/unit_tests/mocks/mock_fence.h | 3 + .../unit_tests/sources/event/test_event.cpp | 43 +++++++++ .../unit_tests/sources/fence/test_fence.cpp | 88 +++++++++++++++++++ .../command_stream/command_stream_receiver.h | 5 +- .../mocks/mock_command_stream_receiver.h | 19 ++++ 11 files changed, 199 insertions(+), 14 deletions(-) diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl index fbdfcebf38..0fdc101c31 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl @@ -462,7 +462,10 @@ ze_result_t CommandQueueHw::executeCommandLists( csr->makeSurfacePackNonResident(csr->getResidencyAllocations()); if (getSynchronousMode() == ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS) { - this->synchronize(std::numeric_limits::max()); + const auto synchronizeResult = this->synchronize(std::numeric_limits::max()); + if (synchronizeResult == ZE_RESULT_ERROR_DEVICE_LOST) { + return ZE_RESULT_ERROR_DEVICE_LOST; + } } this->heapContainer.clear(); diff --git a/level_zero/core/source/event/event.h b/level_zero/core/source/event/event.h index 1114cd8e09..489d83224a 100644 --- 
a/level_zero/core/source/event/event.h +++ b/level_zero/core/source/event/event.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2021 Intel Corporation + * Copyright (C) 2020-2022 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -15,6 +15,7 @@ #include "level_zero/core/source/driver/driver_handle.h" #include +#include #include struct _ze_event_handle_t {}; @@ -102,6 +103,7 @@ struct Event : _ze_event_handle_t { ze_event_scope_flags_t waitScope = 0u; uint32_t kernelCount = 1u; + std::chrono::microseconds gpuHangCheckPeriod{500'000}; protected: size_t contextStartOffset = 0u; diff --git a/level_zero/core/source/event/event_impl.inl b/level_zero/core/source/event/event_impl.inl index a82c89795d..a288bf4b16 100644 --- a/level_zero/core/source/event/event_impl.inl +++ b/level_zero/core/source/event/event_impl.inl @@ -199,7 +199,8 @@ ze_result_t EventImp::hostSignal() { template ze_result_t EventImp::hostSynchronize(uint64_t timeout) { - std::chrono::high_resolution_clock::time_point time1, time2; + std::chrono::microseconds elapsedTimeSinceGpuHangCheck{0}; + std::chrono::high_resolution_clock::time_point waitStartTime, lastHangCheckTime, currentTime; uint64_t timeDiff = 0; ze_result_t ret = ZE_RESULT_NOT_READY; @@ -212,7 +213,8 @@ ze_result_t EventImp::hostSynchronize(uint64_t timeout) { return queryStatus(); } - time1 = std::chrono::high_resolution_clock::now(); + waitStartTime = std::chrono::high_resolution_clock::now(); + lastHangCheckTime = waitStartTime; while (true) { ret = queryStatus(); if (ret == ZE_RESULT_SUCCESS) { @@ -221,12 +223,21 @@ ze_result_t EventImp::hostSynchronize(uint64_t timeout) { NEO::WaitUtils::waitFunction(nullptr, 0u); + currentTime = std::chrono::high_resolution_clock::now(); + elapsedTimeSinceGpuHangCheck = std::chrono::duration_cast(currentTime - lastHangCheckTime); + + if (elapsedTimeSinceGpuHangCheck.count() >= this->gpuHangCheckPeriod.count()) { + lastHangCheckTime = currentTime; + if (this->csr->isGpuHangDetected()) { + return 
ZE_RESULT_ERROR_DEVICE_LOST; + } + } + if (timeout == std::numeric_limits::max()) { continue; } - time2 = std::chrono::high_resolution_clock::now(); - timeDiff = std::chrono::duration_cast(time2 - time1).count(); + timeDiff = std::chrono::duration_cast(currentTime - waitStartTime).count(); if (timeDiff >= timeout) { break; diff --git a/level_zero/core/source/fence/fence.cpp b/level_zero/core/source/fence/fence.cpp index a026bc2909..b28463507c 100644 --- a/level_zero/core/source/fence/fence.cpp +++ b/level_zero/core/source/fence/fence.cpp @@ -43,11 +43,13 @@ ze_result_t FenceImp::reset() { } ze_result_t FenceImp::hostSynchronize(uint64_t timeout) { - std::chrono::high_resolution_clock::time_point time1, time2; + std::chrono::microseconds elapsedTimeSinceGpuHangCheck{0}; + std::chrono::high_resolution_clock::time_point waitStartTime, lastHangCheckTime, currentTime; uint64_t timeDiff = 0; ze_result_t ret = ZE_RESULT_NOT_READY; + const auto csr = cmdQueue->getCsr(); - if (cmdQueue->getCsr()->getType() == NEO::CommandStreamReceiverType::CSR_AUB) { + if (csr->getType() == NEO::CommandStreamReceiverType::CSR_AUB) { return ZE_RESULT_SUCCESS; } @@ -59,7 +61,8 @@ ze_result_t FenceImp::hostSynchronize(uint64_t timeout) { return queryStatus(); } - time1 = std::chrono::high_resolution_clock::now(); + waitStartTime = std::chrono::high_resolution_clock::now(); + lastHangCheckTime = waitStartTime; while (timeDiff < timeout) { ret = queryStatus(); if (ret == ZE_RESULT_SUCCESS) { @@ -68,12 +71,21 @@ ze_result_t FenceImp::hostSynchronize(uint64_t timeout) { NEO::WaitUtils::waitFunction(nullptr, 0u); + currentTime = std::chrono::high_resolution_clock::now(); + elapsedTimeSinceGpuHangCheck = std::chrono::duration_cast(currentTime - lastHangCheckTime); + + if (elapsedTimeSinceGpuHangCheck.count() >= gpuHangCheckPeriod.count()) { + lastHangCheckTime = currentTime; + if (csr->isGpuHangDetected()) { + return ZE_RESULT_ERROR_DEVICE_LOST; + } + } + if (timeout == std::numeric_limits::max()) 
{ continue; } - time2 = std::chrono::high_resolution_clock::now(); - timeDiff = std::chrono::duration_cast(time2 - time1).count(); + timeDiff = std::chrono::duration_cast(currentTime - waitStartTime).count(); } return ret; diff --git a/level_zero/core/source/fence/fence.h b/level_zero/core/source/fence/fence.h index 581911ece1..099e7875d0 100644 --- a/level_zero/core/source/fence/fence.h +++ b/level_zero/core/source/fence/fence.h @@ -13,6 +13,7 @@ #include "level_zero/core/source/cmdqueue/cmdqueue_imp.h" #include +#include #include struct _ze_fence_handle_t {}; @@ -39,6 +40,7 @@ struct Fence : _ze_fence_handle_t { protected: uint32_t partitionCount = 1; uint32_t taskCount = 0; + std::chrono::microseconds gpuHangCheckPeriod{500'000}; }; struct FenceImp : public Fence { diff --git a/level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h b/level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h index 15fc5fee24..453de9b5f3 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h +++ b/level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h @@ -69,7 +69,7 @@ struct MockCommandQueueHw : public L0::CommandQueueHw { } ze_result_t synchronize(uint64_t timeout) override { synchronizedCalled++; - return ZE_RESULT_SUCCESS; + return synchronizeReturnValue; } NEO::SubmissionStatus submitBatchBuffer(size_t offset, NEO::ResidencyContainer &residencyContainer, void *endingCmdPtr, bool isCooperative) override { @@ -79,6 +79,7 @@ struct MockCommandQueueHw : public L0::CommandQueueHw { uint32_t synchronizedCalled = 0; NEO::ResidencyContainer residencyContainerSnapshot; + ze_result_t synchronizeReturnValue{ZE_RESULT_SUCCESS}; }; struct Deleter { diff --git a/level_zero/core/test/unit_tests/mocks/mock_fence.h b/level_zero/core/test/unit_tests/mocks/mock_fence.h index 8419b44570..941565055d 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_fence.h +++ b/level_zero/core/test/unit_tests/mocks/mock_fence.h @@ -19,6 +19,9 @@ namespace ult { template <> struct 
WhiteBox<::L0::Fence> : public ::L0::Fence { + ~WhiteBox() override = default; + + using ::L0::Fence::gpuHangCheckPeriod; using ::L0::Fence::partitionCount; using ::L0::Fence::taskCount; }; diff --git a/level_zero/core/test/unit_tests/sources/event/test_event.cpp b/level_zero/core/test/unit_tests/sources/event/test_event.cpp index 15e3c73ef4..1f5014dccd 100644 --- a/level_zero/core/test/unit_tests/sources/event/test_event.cpp +++ b/level_zero/core/test/unit_tests/sources/event/test_event.cpp @@ -21,6 +21,12 @@ #include "level_zero/core/test/unit_tests/mocks/mock_event.h" #include +#include +#include +#include +#include + +using namespace std::chrono_literals; namespace CpuIntrinsicsTests { extern std::atomic lastClFlushedPtr; @@ -597,6 +603,43 @@ class EventSynchronizeTest : public Test { std::unique_ptr event; }; +TEST_F(EventSynchronizeTest, GivenGpuHangWhenHostSynchronizeIsCalledThenDeviceLostIsReturned) { + const auto csr = std::make_unique(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield()); + csr->isGpuHangDetectedReturnValue = true; + + event->csr = csr.get(); + event->gpuHangCheckPeriod = 0ms; + + const auto timeout = std::numeric_limits::max(); + const auto result = event->hostSynchronize(timeout); + + EXPECT_EQ(ZE_RESULT_ERROR_DEVICE_LOST, result); +} + +TEST_F(EventSynchronizeTest, GivenNoGpuHangAndOneNanosecondTimeoutWhenHostSynchronizeIsCalledThenResultNotReadyIsReturnedDueToTimeout) { + const auto csr = std::make_unique(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield()); + csr->isGpuHangDetectedReturnValue = false; + + event->csr = csr.get(); + event->gpuHangCheckPeriod = 0ms; + + const auto timeoutNanoseconds = 1; + const auto result = event->hostSynchronize(timeoutNanoseconds); + + EXPECT_EQ(ZE_RESULT_NOT_READY, result); +} + +TEST_F(EventSynchronizeTest, GivenLongPeriodOfGpuCheckAndOneNanosecondTimeoutWhenHostSynchronizeIsCalledThenResultNotReadyIsReturnedDueToTimeout) { + const auto csr = 
std::make_unique(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield()); + event->csr = csr.get(); + event->gpuHangCheckPeriod = 50000000ms; + + const auto timeoutNanoseconds = 1; + const auto result = event->hostSynchronize(timeoutNanoseconds); + + EXPECT_EQ(ZE_RESULT_NOT_READY, result); +} + TEST_F(EventSynchronizeTest, givenCallToEventHostSynchronizeWithTimeoutZeroAndStateInitialHostSynchronizeReturnsNotReady) { ze_result_t result = event->hostSynchronize(0); EXPECT_EQ(ZE_RESULT_NOT_READY, result); diff --git a/level_zero/core/test/unit_tests/sources/fence/test_fence.cpp b/level_zero/core/test/unit_tests/sources/fence/test_fence.cpp index 052c51e19c..5e543d146f 100644 --- a/level_zero/core/test/unit_tests/sources/fence/test_fence.cpp +++ b/level_zero/core/test/unit_tests/sources/fence/test_fence.cpp @@ -15,6 +15,13 @@ #include "level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h" #include "level_zero/core/test/unit_tests/mocks/mock_fence.h" +#include +#include +#include +#include + +using namespace std::chrono_literals; + namespace L0 { namespace ult { @@ -49,6 +56,87 @@ TEST_F(FenceTest, whenQueryingStatusWithoutCsrAndFenceUnsignaledThenReturnsNotRe fence->destroy(); } +TEST_F(FenceTest, GivenGpuHangWhenHostSynchronizeIsCalledThenDeviceLostIsReturned) { + const auto csr = std::make_unique(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield()); + csr->isGpuHangDetectedReturnValue = true; + csr->testTaskCountReadyReturnValue = false; + + Mock cmdqueue(device, csr.get()); + ze_fence_desc_t desc; + + std::unique_ptr> fence; + fence.reset(whitebox_cast(Fence::create(&cmdqueue, &desc))); + ASSERT_NE(nullptr, fence); + + fence->taskCount = 1; + fence->gpuHangCheckPeriod = 0ms; + + const auto timeout = std::numeric_limits::max(); + const auto result = fence->hostSynchronize(timeout); + + EXPECT_EQ(ZE_RESULT_ERROR_DEVICE_LOST, result); +} + +TEST_F(FenceTest, 
GivenNoGpuHangAndOneNanosecondTimeoutWhenHostSynchronizeIsCalledThenResultNotReadyIsReturnedDueToTimeout) { + const auto csr = std::make_unique(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield()); + csr->isGpuHangDetectedReturnValue = false; + csr->testTaskCountReadyReturnValue = false; + + Mock cmdqueue(device, csr.get()); + ze_fence_desc_t desc; + + std::unique_ptr> fence; + fence.reset(whitebox_cast(Fence::create(&cmdqueue, &desc))); + ASSERT_NE(nullptr, fence); + + fence->taskCount = 1; + fence->gpuHangCheckPeriod = 0ms; + + const auto timeoutNanoseconds = 1; + const auto result = fence->hostSynchronize(timeoutNanoseconds); + + EXPECT_EQ(ZE_RESULT_NOT_READY, result); +} + +TEST_F(FenceTest, GivenLongPeriodOfGpuCheckAndOneNanosecondTimeoutWhenHostSynchronizeIsCalledThenResultNotReadyIsReturnedDueToTimeout) { + const auto csr = std::make_unique(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield()); + csr->testTaskCountReadyReturnValue = false; + + Mock cmdqueue(device, csr.get()); + ze_fence_desc_t desc; + + std::unique_ptr> fence; + fence.reset(whitebox_cast(Fence::create(&cmdqueue, &desc))); + ASSERT_NE(nullptr, fence); + + fence->taskCount = 1; + fence->gpuHangCheckPeriod = 50000000ms; + + const auto timeoutNanoseconds = 1; + const auto result = fence->hostSynchronize(timeoutNanoseconds); + + EXPECT_EQ(ZE_RESULT_NOT_READY, result); +} + +TEST_F(FenceTest, GivenSuccessfulQueryResultAndNoTimeoutWhenHostSynchronizeIsCalledThenResultSuccessIsReturned) { + const auto csr = std::make_unique(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield()); + csr->testTaskCountReadyReturnValue = true; + + Mock cmdqueue(device, csr.get()); + ze_fence_desc_t desc; + + std::unique_ptr> fence; + fence.reset(whitebox_cast(Fence::create(&cmdqueue, &desc))); + ASSERT_NE(nullptr, fence); + + fence->taskCount = 1; + + const auto timeout = std::numeric_limits::max(); + const auto result = fence->hostSynchronize(timeout); + + 
EXPECT_EQ(ZE_RESULT_SUCCESS, result); +} + using FenceSynchronizeTest = Test; TEST_F(FenceSynchronizeTest, givenCallToFenceHostSynchronizeWithTimeoutZeroAndStateInitialThenHostSynchronizeReturnsNotReady) { diff --git a/shared/source/command_stream/command_stream_receiver.h b/shared/source/command_stream/command_stream_receiver.h index d568c3bc67..8e20110b0a 100644 --- a/shared/source/command_stream/command_stream_receiver.h +++ b/shared/source/command_stream/command_stream_receiver.h @@ -168,7 +168,7 @@ class CommandStreamReceiver { virtual WaitStatus waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode) = 0; virtual WaitStatus waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait); WaitStatus baseWaitFunction(volatile uint32_t *pollAddress, bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait); - bool testTaskCountReady(volatile uint32_t *pollAddress, uint32_t taskCountToWait); + MOCKABLE_VIRTUAL bool testTaskCountReady(volatile uint32_t *pollAddress, uint32_t taskCountToWait); virtual void downloadAllocations(){}; void setSamplerCacheFlushRequired(SamplerCacheFlushState value) { this->samplerCacheFlushRequired = value; } @@ -318,12 +318,13 @@ class CommandStreamReceiver { const HardwareInfo &peekHwInfo() const; + MOCKABLE_VIRTUAL bool isGpuHangDetected() const; + protected: void cleanupResources(); void printDeviceIndex(); void checkForNewResources(uint32_t submittedTaskCount, uint32_t allocationTaskCount, GraphicsAllocation &gfxAllocation); bool checkImplicitFlushForGpuIdle(); - bool isGpuHangDetected() const; MOCKABLE_VIRTUAL std::unique_lock obtainHostPtrSurfaceCreationLock(); std::unique_ptr flushStamp; diff --git a/shared/test/common/mocks/mock_command_stream_receiver.h b/shared/test/common/mocks/mock_command_stream_receiver.h index ba7e7ffd7e..0b0a85bb3c 100644 --- 
a/shared/test/common/mocks/mock_command_stream_receiver.h +++ b/shared/test/common/mocks/mock_command_stream_receiver.h @@ -19,6 +19,7 @@ #include "gmock/gmock.h" +#include #include using namespace NEO; @@ -63,6 +64,22 @@ class MockCommandStreamReceiver : public CommandStreamReceiver { bool isMultiOsContextCapable() const override { return multiOsContextCapable; } + bool isGpuHangDetected() const override { + if (isGpuHangDetectedReturnValue.has_value()) { + return *isGpuHangDetectedReturnValue; + } else { + return CommandStreamReceiver::isGpuHangDetected(); + } + } + + bool testTaskCountReady(volatile uint32_t *pollAddress, uint32_t taskCountToWait) override { + if (testTaskCountReadyReturnValue.has_value()) { + return *testTaskCountReadyReturnValue; + } else { + return CommandStreamReceiver::testTaskCountReady(pollAddress, taskCountToWait); + } + } + MemoryCompressionState getMemoryCompressionState(bool auxTranslationRequired, const HardwareInfo &hwInfo) const override { return MemoryCompressionState::NotApplicable; }; @@ -147,6 +164,8 @@ class MockCommandStreamReceiver : public CommandStreamReceiver { bool createPreemptionAllocationReturn = true; bool createPreemptionAllocationParentCall = false; bool programComputeBarrierCommandCalled = false; + std::optional isGpuHangDetectedReturnValue{}; + std::optional testTaskCountReadyReturnValue{}; }; class MockCommandStreamReceiverWithFailingSubmitBatch : public MockCommandStreamReceiver {