From 0192e8038f2e54f4a0e76c23c5eba565fc5ddd62 Mon Sep 17 00:00:00 2001 From: "Milczarek, Slawomir" Date: Mon, 19 Sep 2022 10:20:14 +0000 Subject: [PATCH] Check for GPU hang in path with wait for timestamps Related-To: NEO-6868 Signed-off-by: Milczarek, Slawomir --- level_zero/core/source/fence/fence.cpp | 10 ++----- opencl/source/command_queue/command_queue.cpp | 8 ++++-- opencl/source/command_queue/command_queue.h | 2 +- .../source/command_queue/command_queue_hw.h | 2 +- .../command_queue/command_queue_hw_base.inl | 15 ++++++++--- .../command_queue_hw_1_tests.cpp | 27 ++++++++++++++++++- .../test/unit_test/mocks/mock_command_queue.h | 2 +- .../command_stream_receiver.cpp | 22 +++++++++------ .../command_stream/command_stream_receiver.h | 2 ++ .../mocks/mock_command_stream_receiver.h | 1 + 10 files changed, 65 insertions(+), 26 deletions(-) diff --git a/level_zero/core/source/fence/fence.cpp b/level_zero/core/source/fence/fence.cpp index 46c793aa96..da830fe52a 100644 --- a/level_zero/core/source/fence/fence.cpp +++ b/level_zero/core/source/fence/fence.cpp @@ -45,7 +45,6 @@ ze_result_t Fence::reset(bool signaled) { } ze_result_t Fence::hostSynchronize(uint64_t timeout) { - std::chrono::microseconds elapsedTimeSinceGpuHangCheck{0}; std::chrono::high_resolution_clock::time_point waitStartTime, lastHangCheckTime, currentTime; uint64_t timeDiff = 0; ze_result_t ret = ZE_RESULT_NOT_READY; @@ -72,13 +71,8 @@ ze_result_t Fence::hostSynchronize(uint64_t timeout) { } currentTime = std::chrono::high_resolution_clock::now(); - elapsedTimeSinceGpuHangCheck = std::chrono::duration_cast(currentTime - lastHangCheckTime); - - if (elapsedTimeSinceGpuHangCheck.count() >= gpuHangCheckPeriod.count()) { - lastHangCheckTime = currentTime; - if (csr->isGpuHangDetected()) { - return ZE_RESULT_ERROR_DEVICE_LOST; - } + if (csr->checkGpuHangDetected(currentTime, lastHangCheckTime)) { + return ZE_RESULT_ERROR_DEVICE_LOST; } if (timeout == std::numeric_limits::max()) { diff --git 
a/opencl/source/command_queue/command_queue.cpp b/opencl/source/command_queue/command_queue.cpp index 2f8fa5363c..3ff6a3c82a 100644 --- a/opencl/source/command_queue/command_queue.cpp +++ b/opencl/source/command_queue/command_queue.cpp @@ -1206,14 +1206,18 @@ WaitStatus CommandQueue::waitForAllEngines(bool blockedQueue, PrintfHandler *pri } } - auto waitedOnTimestamps = waitForTimestamps(activeBcsStates, taskCount); + auto waitStatus = WaitStatus::NotReady; + auto waitedOnTimestamps = waitForTimestamps(activeBcsStates, taskCount, waitStatus); + if (waitStatus == WaitStatus::GpuHang) { + return WaitStatus::GpuHang; + } TimestampPacketContainer nodesToRelease; if (deferredTimestampPackets) { deferredTimestampPackets->swapNodes(nodesToRelease); } - const auto waitStatus = waitUntilComplete(taskCount, activeBcsStates, flushStamp->peekStamp(), false, cleanTemporaryAllocationsList, waitedOnTimestamps); + waitStatus = waitUntilComplete(taskCount, activeBcsStates, flushStamp->peekStamp(), false, cleanTemporaryAllocationsList, waitedOnTimestamps); if (printfHandler) { if (!printfHandler->printEnqueueOutput()) { diff --git a/opencl/source/command_queue/command_queue.h b/opencl/source/command_queue/command_queue.h index 08c081c5ee..4251ac82da 100644 --- a/opencl/source/command_queue/command_queue.h +++ b/opencl/source/command_queue/command_queue.h @@ -204,7 +204,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> { bool isCompleted(uint32_t gpgpuTaskCount, CopyEngineState bcsState); bool isWaitForTimestampsEnabled() const; - virtual bool waitForTimestamps(Range copyEnginesToWait, uint32_t taskCount) = 0; + virtual bool waitForTimestamps(Range copyEnginesToWait, uint32_t taskCount, WaitStatus &status) = 0; MOCKABLE_VIRTUAL bool isQueueBlocked(); diff --git a/opencl/source/command_queue/command_queue_hw.h b/opencl/source/command_queue/command_queue_hw.h index 4046115199..3d3eeeccc1 100644 --- a/opencl/source/command_queue/command_queue_hw.h +++ 
b/opencl/source/command_queue/command_queue_hw.h @@ -427,7 +427,7 @@ class CommandQueueHw : public CommandQueue { bool isCacheFlushCommand(uint32_t commandType) const override; - bool waitForTimestamps(Range copyEnginesToWait, uint32_t taskCount) override; + bool waitForTimestamps(Range copyEnginesToWait, uint32_t taskCount, WaitStatus &status) override; MOCKABLE_VIRTUAL bool isCacheFlushForBcsRequired() const; diff --git a/opencl/source/command_queue/command_queue_hw_base.inl b/opencl/source/command_queue/command_queue_hw_base.inl index 16d5082fd4..5b7007be52 100644 --- a/opencl/source/command_queue/command_queue_hw_base.inl +++ b/opencl/source/command_queue/command_queue_hw_base.inl @@ -141,16 +141,23 @@ bool CommandQueueHw::isCacheFlushForBcsRequired() const { } template -inline bool waitForTimestampsWithinContainer(TimestampPacketContainer *container, CommandStreamReceiver &csr) { +inline bool waitForTimestampsWithinContainer(TimestampPacketContainer *container, CommandStreamReceiver &csr, WaitStatus &status) { bool waited = false; + status = WaitStatus::NotReady; if (container) { + auto lastHangCheckTime = std::chrono::high_resolution_clock::now(); for (const auto &timestamp : container->peekNodes()) { for (uint32_t i = 0; i < timestamp->getPacketsUsed(); i++) { while (timestamp->getContextEndValue(i) == 1) { csr.downloadAllocation(*timestamp->getBaseGraphicsAllocation()->getGraphicsAllocation(csr.getRootDeviceIndex())); WaitUtils::waitFunctionWithPredicate(static_cast(timestamp->getContextEndAddress(i)), 1u, std::not_equal_to()); + if (csr.checkGpuHangDetected(std::chrono::high_resolution_clock::now(), lastHangCheckTime)) { + status = WaitStatus::GpuHang; + return false; + } } + status = WaitStatus::Ready; waited = true; } } @@ -160,14 +167,14 @@ inline bool waitForTimestampsWithinContainer(TimestampPacketContainer *container } template -bool 
CommandQueueHw::waitForTimestamps(Range copyEnginesToWait, uint32_t taskCount, WaitStatus &status) { using TSPacketType = typename Family::TimestampPacketType; bool waited = false; if (isWaitForTimestampsEnabled()) { - waited = waitForTimestampsWithinContainer(timestampPacketContainer.get(), getGpgpuCommandStreamReceiver()); + waited = waitForTimestampsWithinContainer(timestampPacketContainer.get(), getGpgpuCommandStreamReceiver(), status); if (isOOQEnabled()) { - waitForTimestampsWithinContainer(deferredTimestampPackets.get(), getGpgpuCommandStreamReceiver()); + waitForTimestampsWithinContainer(deferredTimestampPackets.get(), getGpgpuCommandStreamReceiver(), status); } if (waited) { diff --git a/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp index a9330568f7..ebf405623f 100644 --- a/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp +++ b/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp @@ -13,6 +13,7 @@ #include "shared/test/common/mocks/mock_csr.h" #include "shared/test/common/mocks/mock_os_library.h" #include "shared/test/common/mocks/mock_source_level_debugger.h" +#include "shared/test/common/mocks/mock_timestamp_container.h" #include "shared/test/common/utilities/base_object_utils.h" #include "opencl/test/unit_test/command_queue/command_queue_fixture.h" @@ -49,12 +50,36 @@ HWTEST_F(CommandQueueHwTest, givenNoTimestampPacketsWhenWaitForTimestampsThenNoW device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = false; MockCommandQueueHw cmdQ(context, device.get(), nullptr); auto taskCount = device->getUltCommandStreamReceiver().peekLatestFlushedTaskCount(); + auto status = WaitStatus::NotReady; - cmdQ.waitForTimestamps({}, 101u); + cmdQ.waitForTimestamps({}, 101u, status); EXPECT_EQ(device->getUltCommandStreamReceiver().peekLatestFlushedTaskCount(), taskCount); } +HWTEST_F(CommandQueueHwTest, 
givenEnableTimestampWaitForQueuesWhenGpuHangDetectedWhileWaitingForAllEnginesThenReturnCorrectStatus) { + DebugManagerStateRestore restorer; + DebugManager.flags.EnableTimestampWaitForQueues.set(4); + + ExecutionEnvironment *executionEnvironment = platform()->peekExecutionEnvironment(); + auto device = std::make_unique(MockDevice::create(executionEnvironment, 0u)); + MockCommandQueueHw cmdQ(context, device.get(), nullptr); + auto status = WaitStatus::NotReady; + + auto mockCSR = new MockCommandStreamReceiver(*executionEnvironment, 0, device->getDeviceBitfield()); + mockCSR->isGpuHangDetectedReturnValue = true; + device->resetCommandStreamReceiver(mockCSR); + + auto mockTagAllocator = new MockTagAllocator<>(0, device->getMemoryManager()); + mockCSR->timestampPacketAllocator.reset(mockTagAllocator); + cmdQ.timestampPacketContainer = std::make_unique(); + cmdQ.timestampPacketContainer->add(mockTagAllocator->getTag()); + + status = cmdQ.waitForAllEngines(false, nullptr, false); + + EXPECT_EQ(WaitStatus::GpuHang, status); +} + HWTEST_F(CommandQueueHwTest, WhenDebugSurfaceIsAllocatedThenBufferIsZeroed) { ExecutionEnvironment *executionEnvironment = platform()->peekExecutionEnvironment(); executionEnvironment->rootDeviceEnvironments[0]->debugger.reset(new MockActiveSourceLevelDebugger(new MockOsLibrary)); diff --git a/opencl/test/unit_test/mocks/mock_command_queue.h b/opencl/test/unit_test/mocks/mock_command_queue.h index 57270962b2..c123735dcd 100644 --- a/opencl/test/unit_test/mocks/mock_command_queue.h +++ b/opencl/test/unit_test/mocks/mock_command_queue.h @@ -212,7 +212,7 @@ class MockCommandQueue : public CommandQueue { bool obtainTimestampPacketForCacheFlush(bool isCacheFlushRequired) const override { return isCacheFlushRequired; } - bool waitForTimestamps(Range copyEnginesToWait, uint32_t taskCount) override { return false; }; + bool waitForTimestamps(Range copyEnginesToWait, uint32_t taskCount, WaitStatus &status) override { return false; }; bool 
releaseIndirectHeapCalled = false; diff --git a/shared/source/command_stream/command_stream_receiver.cpp b/shared/source/command_stream/command_stream_receiver.cpp index 99fd8ecd42..7d8e7445ea 100644 --- a/shared/source/command_stream/command_stream_receiver.cpp +++ b/shared/source/command_stream/command_stream_receiver.cpp @@ -354,8 +354,19 @@ WaitStatus CommandStreamReceiver::waitForCompletionWithTimeout(const WaitParams return retCode; } +bool CommandStreamReceiver::checkGpuHangDetected(TimeType currentTime, TimeType &lastHangCheckTime) const { + std::chrono::microseconds elapsedTimeSinceGpuHangCheck = std::chrono::duration_cast(currentTime - lastHangCheckTime); + + if (elapsedTimeSinceGpuHangCheck.count() >= gpuHangCheckPeriod.count()) { + lastHangCheckTime = currentTime; + if (isGpuHangDetected()) { + return true; + } + } + return false; +} + WaitStatus CommandStreamReceiver::baseWaitFunction(volatile uint32_t *pollAddress, const WaitParams &params, uint32_t taskCountToWait) { - std::chrono::microseconds elapsedTimeSinceGpuHangCheck{0}; std::chrono::high_resolution_clock::time_point waitStartTime, lastHangCheckTime, currentTime; int64_t timeDiff = 0; @@ -376,13 +387,8 @@ WaitStatus CommandStreamReceiver::baseWaitFunction(volatile uint32_t *pollAddres } currentTime = std::chrono::high_resolution_clock::now(); - elapsedTimeSinceGpuHangCheck = std::chrono::duration_cast(currentTime - lastHangCheckTime); - - if (elapsedTimeSinceGpuHangCheck.count() >= gpuHangCheckPeriod.count()) { - lastHangCheckTime = currentTime; - if (isGpuHangDetected()) { - return WaitStatus::GpuHang; - } + if (checkGpuHangDetected(currentTime, lastHangCheckTime)) { + return WaitStatus::GpuHang; } if (params.enableTimeout) { diff --git a/shared/source/command_stream/command_stream_receiver.h b/shared/source/command_stream/command_stream_receiver.h index 57f46fd456..dc622ef49b 100644 --- a/shared/source/command_stream/command_stream_receiver.h +++ 
b/shared/source/command_stream/command_stream_receiver.h @@ -77,6 +77,7 @@ class CommandStreamReceiver { }; using MutexType = std::recursive_mutex; + using TimeType = std::chrono::high_resolution_clock::time_point; CommandStreamReceiver(ExecutionEnvironment &executionEnvironment, uint32_t rootDeviceIndex, const DeviceBitfield deviceBitfield); @@ -325,6 +326,7 @@ class CommandStreamReceiver { const RootDeviceEnvironment &peekRootDeviceEnvironment() const; MOCKABLE_VIRTUAL bool isGpuHangDetected() const; + MOCKABLE_VIRTUAL bool checkGpuHangDetected(TimeType currentTime, TimeType &lastHangCheckTime) const; uint64_t getCompletionAddress() const { uint64_t completionFenceAddress = castToUint64(const_cast(getTagAddress())); diff --git a/shared/test/common/mocks/mock_command_stream_receiver.h b/shared/test/common/mocks/mock_command_stream_receiver.h index 5545f55be2..d2fa7461e4 100644 --- a/shared/test/common/mocks/mock_command_stream_receiver.h +++ b/shared/test/common/mocks/mock_command_stream_receiver.h @@ -43,6 +43,7 @@ class MockCommandStreamReceiver : public CommandStreamReceiver { using CommandStreamReceiver::tagAddress; using CommandStreamReceiver::tagsMultiAllocation; using CommandStreamReceiver::taskCount; + using CommandStreamReceiver::timestampPacketAllocator; using CommandStreamReceiver::useGpuIdleImplicitFlush; using CommandStreamReceiver::useNewResourceImplicitFlush;