Detect GPU hang in remaining blocking calls of L0

This change introduces detection of GPU hangs in
zeEventHostSynchronize and zeFenceHostSynchronize.
Furthermore, if CommandQueueHw::executeCommandLists
uses ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS and a GPU hang occurs,
ZE_RESULT_ERROR_DEVICE_LOST is propagated to the caller.

Related-To: NEO-6681
Signed-off-by: Patryk Wrobel <patryk.wrobel@intel.com>
This commit is contained in:
Patryk Wrobel
2022-02-16 10:22:03 +00:00
committed by Compute-Runtime-Automation
parent 64b8de3c1d
commit 1275c4e200
11 changed files with 199 additions and 14 deletions

View File

@ -462,7 +462,10 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
csr->makeSurfacePackNonResident(csr->getResidencyAllocations());
if (getSynchronousMode() == ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS) {
this->synchronize(std::numeric_limits<uint64_t>::max());
const auto synchronizeResult = this->synchronize(std::numeric_limits<uint64_t>::max());
if (synchronizeResult == ZE_RESULT_ERROR_DEVICE_LOST) {
return ZE_RESULT_ERROR_DEVICE_LOST;
}
}
this->heapContainer.clear();

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2020-2021 Intel Corporation
* Copyright (C) 2020-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@ -15,6 +15,7 @@
#include "level_zero/core/source/driver/driver_handle.h"
#include <level_zero/ze_api.h>
#include <chrono>
#include <limits>
struct _ze_event_handle_t {};
@ -102,6 +103,7 @@ struct Event : _ze_event_handle_t {
ze_event_scope_flags_t waitScope = 0u;
uint32_t kernelCount = 1u;
std::chrono::microseconds gpuHangCheckPeriod{500'000};
protected:
size_t contextStartOffset = 0u;

View File

@ -199,7 +199,8 @@ ze_result_t EventImp<TagSizeT>::hostSignal() {
template <typename TagSizeT>
ze_result_t EventImp<TagSizeT>::hostSynchronize(uint64_t timeout) {
std::chrono::high_resolution_clock::time_point time1, time2;
std::chrono::microseconds elapsedTimeSinceGpuHangCheck{0};
std::chrono::high_resolution_clock::time_point waitStartTime, lastHangCheckTime, currentTime;
uint64_t timeDiff = 0;
ze_result_t ret = ZE_RESULT_NOT_READY;
@ -212,7 +213,8 @@ ze_result_t EventImp<TagSizeT>::hostSynchronize(uint64_t timeout) {
return queryStatus();
}
time1 = std::chrono::high_resolution_clock::now();
waitStartTime = std::chrono::high_resolution_clock::now();
lastHangCheckTime = waitStartTime;
while (true) {
ret = queryStatus();
if (ret == ZE_RESULT_SUCCESS) {
@ -221,12 +223,21 @@ ze_result_t EventImp<TagSizeT>::hostSynchronize(uint64_t timeout) {
NEO::WaitUtils::waitFunction(nullptr, 0u);
currentTime = std::chrono::high_resolution_clock::now();
elapsedTimeSinceGpuHangCheck = std::chrono::duration_cast<std::chrono::microseconds>(currentTime - lastHangCheckTime);
if (elapsedTimeSinceGpuHangCheck.count() >= this->gpuHangCheckPeriod.count()) {
lastHangCheckTime = currentTime;
if (this->csr->isGpuHangDetected()) {
return ZE_RESULT_ERROR_DEVICE_LOST;
}
}
if (timeout == std::numeric_limits<uint32_t>::max()) {
continue;
}
time2 = std::chrono::high_resolution_clock::now();
timeDiff = std::chrono::duration_cast<std::chrono::nanoseconds>(time2 - time1).count();
timeDiff = std::chrono::duration_cast<std::chrono::nanoseconds>(currentTime - waitStartTime).count();
if (timeDiff >= timeout) {
break;

View File

@ -43,11 +43,13 @@ ze_result_t FenceImp::reset() {
}
ze_result_t FenceImp::hostSynchronize(uint64_t timeout) {
std::chrono::high_resolution_clock::time_point time1, time2;
std::chrono::microseconds elapsedTimeSinceGpuHangCheck{0};
std::chrono::high_resolution_clock::time_point waitStartTime, lastHangCheckTime, currentTime;
uint64_t timeDiff = 0;
ze_result_t ret = ZE_RESULT_NOT_READY;
const auto csr = cmdQueue->getCsr();
if (cmdQueue->getCsr()->getType() == NEO::CommandStreamReceiverType::CSR_AUB) {
if (csr->getType() == NEO::CommandStreamReceiverType::CSR_AUB) {
return ZE_RESULT_SUCCESS;
}
@ -59,7 +61,8 @@ ze_result_t FenceImp::hostSynchronize(uint64_t timeout) {
return queryStatus();
}
time1 = std::chrono::high_resolution_clock::now();
waitStartTime = std::chrono::high_resolution_clock::now();
lastHangCheckTime = waitStartTime;
while (timeDiff < timeout) {
ret = queryStatus();
if (ret == ZE_RESULT_SUCCESS) {
@ -68,12 +71,21 @@ ze_result_t FenceImp::hostSynchronize(uint64_t timeout) {
NEO::WaitUtils::waitFunction(nullptr, 0u);
currentTime = std::chrono::high_resolution_clock::now();
elapsedTimeSinceGpuHangCheck = std::chrono::duration_cast<std::chrono::microseconds>(currentTime - lastHangCheckTime);
if (elapsedTimeSinceGpuHangCheck.count() >= gpuHangCheckPeriod.count()) {
lastHangCheckTime = currentTime;
if (csr->isGpuHangDetected()) {
return ZE_RESULT_ERROR_DEVICE_LOST;
}
}
if (timeout == std::numeric_limits<uint64_t>::max()) {
continue;
}
time2 = std::chrono::high_resolution_clock::now();
timeDiff = std::chrono::duration_cast<std::chrono::nanoseconds>(time2 - time1).count();
timeDiff = std::chrono::duration_cast<std::chrono::nanoseconds>(currentTime - waitStartTime).count();
}
return ret;

View File

@ -13,6 +13,7 @@
#include "level_zero/core/source/cmdqueue/cmdqueue_imp.h"
#include <level_zero/ze_api.h>
#include <chrono>
#include <limits>
struct _ze_fence_handle_t {};
@ -39,6 +40,7 @@ struct Fence : _ze_fence_handle_t {
protected:
uint32_t partitionCount = 1;
uint32_t taskCount = 0;
std::chrono::microseconds gpuHangCheckPeriod{500'000};
};
struct FenceImp : public Fence {

View File

@ -69,7 +69,7 @@ struct MockCommandQueueHw : public L0::CommandQueueHw<gfxCoreFamily> {
}
ze_result_t synchronize(uint64_t timeout) override {
synchronizedCalled++;
return ZE_RESULT_SUCCESS;
return synchronizeReturnValue;
}
NEO::SubmissionStatus submitBatchBuffer(size_t offset, NEO::ResidencyContainer &residencyContainer, void *endingCmdPtr, bool isCooperative) override {
@ -79,6 +79,7 @@ struct MockCommandQueueHw : public L0::CommandQueueHw<gfxCoreFamily> {
uint32_t synchronizedCalled = 0;
NEO::ResidencyContainer residencyContainerSnapshot;
ze_result_t synchronizeReturnValue{ZE_RESULT_SUCCESS};
};
struct Deleter {

View File

@ -19,6 +19,9 @@ namespace ult {
// Test-only specialization of WhiteBox for L0::Fence: re-exports the fence's
// protected members so unit tests can read and set them directly.
template <>
struct WhiteBox<::L0::Fence> : public ::L0::Fence {
~WhiteBox() override = default;
// Members declared protected in L0::Fence, made accessible to tests.
using ::L0::Fence::gpuHangCheckPeriod;
using ::L0::Fence::partitionCount;
using ::L0::Fence::taskCount;
};

View File

@ -21,6 +21,12 @@
#include "level_zero/core/test/unit_tests/mocks/mock_event.h"
#include <atomic>
#include <chrono>
#include <cstdint>
#include <limits>
#include <memory>
using namespace std::chrono_literals;
namespace CpuIntrinsicsTests {
extern std::atomic<uintptr_t> lastClFlushedPtr;
@ -597,6 +603,43 @@ class EventSynchronizeTest : public Test<DeviceFixture> {
std::unique_ptr<L0::Event> event;
};
TEST_F(EventSynchronizeTest, GivenGpuHangWhenHostSynchronizeIsCalledThenDeviceLostIsReturned) {
    // Mock CSR configured to always report a GPU hang.
    const auto hangingCsr = std::make_unique<MockCommandStreamReceiver>(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield());
    hangingCsr->isGpuHangDetectedReturnValue = true;

    event->csr = hangingCsr.get();
    // A zero check period makes the hang check fire on the first polling iteration.
    event->gpuHangCheckPeriod = 0ms;

    const auto maxTimeout = std::numeric_limits<std::uint32_t>::max();
    EXPECT_EQ(ZE_RESULT_ERROR_DEVICE_LOST, event->hostSynchronize(maxTimeout));
}
TEST_F(EventSynchronizeTest, GivenNoGpuHangAndOneNanosecondTimeoutWhenHostSynchronizeIsCalledThenResultNotReadyIsReturnedDueToTimeout) {
    // Mock CSR that never reports a hang, so only the timeout can end the wait.
    const auto healthyCsr = std::make_unique<MockCommandStreamReceiver>(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield());
    healthyCsr->isGpuHangDetectedReturnValue = false;

    event->csr = healthyCsr.get();
    // Hang check is performed on every polling iteration.
    event->gpuHangCheckPeriod = 0ms;

    const std::uint64_t oneNanosecond = 1;
    EXPECT_EQ(ZE_RESULT_NOT_READY, event->hostSynchronize(oneNanosecond));
}
TEST_F(EventSynchronizeTest, GivenLongPeriodOfGpuCheckAndOneNanosecondTimeoutWhenHostSynchronizeIsCalledThenResultNotReadyIsReturnedDueToTimeout) {
    // The hang-detection result is left unset here; the check period below is far longer than the timeout.
    const auto mockCsr = std::make_unique<MockCommandStreamReceiver>(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield());

    event->csr = mockCsr.get();
    event->gpuHangCheckPeriod = 50'000'000ms;

    const std::uint64_t oneNanosecond = 1;
    EXPECT_EQ(ZE_RESULT_NOT_READY, event->hostSynchronize(oneNanosecond));
}
TEST_F(EventSynchronizeTest, givenCallToEventHostSynchronizeWithTimeoutZeroAndStateInitialHostSynchronizeReturnsNotReady) {
ze_result_t result = event->hostSynchronize(0);
EXPECT_EQ(ZE_RESULT_NOT_READY, result);

View File

@ -15,6 +15,13 @@
#include "level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h"
#include "level_zero/core/test/unit_tests/mocks/mock_fence.h"
#include <chrono>
#include <cstdint>
#include <limits>
#include <memory>
using namespace std::chrono_literals;
namespace L0 {
namespace ult {
@ -49,6 +56,87 @@ TEST_F(FenceTest, whenQueryingStatusWithoutCsrAndFenceUnsignaledThenReturnsNotRe
fence->destroy();
}
TEST_F(FenceTest, GivenGpuHangWhenHostSynchronizeIsCalledThenDeviceLostIsReturned) {
    // Mock CSR: the fence never becomes ready and a GPU hang is always reported.
    const auto csr = std::make_unique<MockCommandStreamReceiver>(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield());
    csr->isGpuHangDetectedReturnValue = true;
    csr->testTaskCountReadyReturnValue = false;

    Mock<CommandQueue> cmdqueue(device, csr.get());
    // Fully initialize the descriptor so Fence::create never sees indeterminate fields.
    ze_fence_desc_t desc = {ZE_STRUCTURE_TYPE_FENCE_DESC, nullptr, 0};

    std::unique_ptr<WhiteBox<L0::Fence>> fence;
    fence.reset(whitebox_cast(Fence::create(&cmdqueue, &desc)));
    ASSERT_NE(nullptr, fence);

    fence->taskCount = 1;
    // A zero check period makes the hang check fire on the first polling iteration.
    fence->gpuHangCheckPeriod = 0ms;

    // NOTE(review): FenceImp only waits indefinitely for the uint64_t maximum, so this
    // uint32_t maximum looks like a finite timeout in nanoseconds - confirm it is intended.
    const auto timeout = std::numeric_limits<std::uint32_t>::max();
    const auto result = fence->hostSynchronize(timeout);

    EXPECT_EQ(ZE_RESULT_ERROR_DEVICE_LOST, result);
}
TEST_F(FenceTest, GivenNoGpuHangAndOneNanosecondTimeoutWhenHostSynchronizeIsCalledThenResultNotReadyIsReturnedDueToTimeout) {
    // Mock CSR: the fence never becomes ready and no hang is reported, so only the timeout ends the wait.
    const auto csr = std::make_unique<MockCommandStreamReceiver>(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield());
    csr->isGpuHangDetectedReturnValue = false;
    csr->testTaskCountReadyReturnValue = false;

    Mock<CommandQueue> cmdqueue(device, csr.get());
    // Fully initialize the descriptor so Fence::create never sees indeterminate fields.
    ze_fence_desc_t desc = {ZE_STRUCTURE_TYPE_FENCE_DESC, nullptr, 0};

    std::unique_ptr<WhiteBox<L0::Fence>> fence;
    fence.reset(whitebox_cast(Fence::create(&cmdqueue, &desc)));
    ASSERT_NE(nullptr, fence);

    fence->taskCount = 1;
    // Hang check is performed on every polling iteration.
    fence->gpuHangCheckPeriod = 0ms;

    const auto timeoutNanoseconds = 1;
    const auto result = fence->hostSynchronize(timeoutNanoseconds);

    EXPECT_EQ(ZE_RESULT_NOT_READY, result);
}
TEST_F(FenceTest, GivenLongPeriodOfGpuCheckAndOneNanosecondTimeoutWhenHostSynchronizeIsCalledThenResultNotReadyIsReturnedDueToTimeout) {
    // Mock CSR: the fence never becomes ready; the hang-detection result is left unset.
    const auto csr = std::make_unique<MockCommandStreamReceiver>(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield());
    csr->testTaskCountReadyReturnValue = false;

    Mock<CommandQueue> cmdqueue(device, csr.get());
    // Fully initialize the descriptor so Fence::create never sees indeterminate fields.
    ze_fence_desc_t desc = {ZE_STRUCTURE_TYPE_FENCE_DESC, nullptr, 0};

    std::unique_ptr<WhiteBox<L0::Fence>> fence;
    fence.reset(whitebox_cast(Fence::create(&cmdqueue, &desc)));
    ASSERT_NE(nullptr, fence);

    fence->taskCount = 1;
    // Check period far longer than the timeout below.
    fence->gpuHangCheckPeriod = 50000000ms;

    const auto timeoutNanoseconds = 1;
    const auto result = fence->hostSynchronize(timeoutNanoseconds);

    EXPECT_EQ(ZE_RESULT_NOT_READY, result);
}
TEST_F(FenceTest, GivenSuccessfulQueryResultAndNoTimeoutWhenHostSynchronizeIsCalledThenResultSuccessIsReturned) {
    // Mock CSR that reports the fence's task count as already completed.
    const auto csr = std::make_unique<MockCommandStreamReceiver>(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield());
    csr->testTaskCountReadyReturnValue = true;

    Mock<CommandQueue> cmdqueue(device, csr.get());
    // Fully initialize the descriptor so Fence::create never sees indeterminate fields.
    ze_fence_desc_t desc = {ZE_STRUCTURE_TYPE_FENCE_DESC, nullptr, 0};

    std::unique_ptr<WhiteBox<L0::Fence>> fence;
    fence.reset(whitebox_cast(Fence::create(&cmdqueue, &desc)));
    ASSERT_NE(nullptr, fence);

    fence->taskCount = 1;

    // FenceImp::hostSynchronize treats only the uint64_t maximum as "no timeout";
    // the uint32_t maximum would be a finite timeout in nanoseconds.
    const auto timeout = std::numeric_limits<std::uint64_t>::max();
    const auto result = fence->hostSynchronize(timeout);

    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
}
using FenceSynchronizeTest = Test<DeviceFixture>;
TEST_F(FenceSynchronizeTest, givenCallToFenceHostSynchronizeWithTimeoutZeroAndStateInitialThenHostSynchronizeReturnsNotReady) {

View File

@ -168,7 +168,7 @@ class CommandStreamReceiver {
virtual WaitStatus waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode) = 0;
virtual WaitStatus waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait);
WaitStatus baseWaitFunction(volatile uint32_t *pollAddress, bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait);
bool testTaskCountReady(volatile uint32_t *pollAddress, uint32_t taskCountToWait);
MOCKABLE_VIRTUAL bool testTaskCountReady(volatile uint32_t *pollAddress, uint32_t taskCountToWait);
virtual void downloadAllocations(){};
void setSamplerCacheFlushRequired(SamplerCacheFlushState value) { this->samplerCacheFlushRequired = value; }
@ -318,12 +318,13 @@ class CommandStreamReceiver {
const HardwareInfo &peekHwInfo() const;
MOCKABLE_VIRTUAL bool isGpuHangDetected() const;
protected:
void cleanupResources();
void printDeviceIndex();
void checkForNewResources(uint32_t submittedTaskCount, uint32_t allocationTaskCount, GraphicsAllocation &gfxAllocation);
bool checkImplicitFlushForGpuIdle();
bool isGpuHangDetected() const;
MOCKABLE_VIRTUAL std::unique_lock<MutexType> obtainHostPtrSurfaceCreationLock();
std::unique_ptr<FlushStampTracker> flushStamp;

View File

@ -19,6 +19,7 @@
#include "gmock/gmock.h"
#include <optional>
#include <vector>
using namespace NEO;
@ -63,6 +64,22 @@ class MockCommandStreamReceiver : public CommandStreamReceiver {
bool isMultiOsContextCapable() const override { return multiOsContextCapable; }
// Returns the value injected through isGpuHangDetectedReturnValue when one is set;
// otherwise defers to the base CommandStreamReceiver implementation.
bool isGpuHangDetected() const override {
    return isGpuHangDetectedReturnValue.has_value()
               ? *isGpuHangDetectedReturnValue
               : CommandStreamReceiver::isGpuHangDetected();
}
// Returns the value injected through testTaskCountReadyReturnValue when one is set;
// otherwise forwards both arguments to the base CommandStreamReceiver implementation.
bool testTaskCountReady(volatile uint32_t *pollAddress, uint32_t taskCountToWait) override {
    if (!testTaskCountReadyReturnValue.has_value()) {
        return CommandStreamReceiver::testTaskCountReady(pollAddress, taskCountToWait);
    }
    return *testTaskCountReadyReturnValue;
}
// Mock override: reports MemoryCompressionState::NotApplicable regardless of the arguments.
MemoryCompressionState getMemoryCompressionState(bool auxTranslationRequired, const HardwareInfo &hwInfo) const override {
return MemoryCompressionState::NotApplicable;
};
@ -147,6 +164,8 @@ class MockCommandStreamReceiver : public CommandStreamReceiver {
bool createPreemptionAllocationReturn = true;
bool createPreemptionAllocationParentCall = false;
bool programComputeBarrierCommandCalled = false;
std::optional<bool> isGpuHangDetectedReturnValue{};
std::optional<bool> testTaskCountReadyReturnValue{};
};
class MockCommandStreamReceiverWithFailingSubmitBatch : public MockCommandStreamReceiver {