Mirror of https://github.com/intel/compute-runtime.git (synced 2025-09-15 13:01:45 +08:00)
Detect GPU hang in remaining blocking calls of L0
This change introduces detection of GPU hangs in zeEventHostSynchronize and zeFenceHostSynchronize. Furthermore, if CommandQueueHw::executeCommandLists uses ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS and a hang occurs, the information about it is propagated to the caller.

Related-To: NEO-6681
Signed-off-by: Patryk Wrobel <patryk.wrobel@intel.com>
committed by: Compute-Runtime-Automation
parent: 64b8de3c1d
commit: 1275c4e200
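For API users, this means zeEventHostSynchronize and zeFenceHostSynchronize can now return ZE_RESULT_ERROR_DEVICE_LOST instead of spinning forever on a hung GPU. A minimal caller-side sketch, using only the public Level Zero API (the helper name and handle are hypothetical):

#include <level_zero/ze_api.h>
#include <cstdint>
#include <cstdio>

// Hypothetical helper: waits on an event that a previously submitted command list signals.
ze_result_t waitForEvent(ze_event_handle_t hEvent) {
    const uint64_t timeoutNs = UINT64_MAX; // wait indefinitely; hang checks still run
    const ze_result_t result = zeEventHostSynchronize(hEvent, timeoutNs);
    if (result == ZE_RESULT_ERROR_DEVICE_LOST) {
        std::fprintf(stderr, "GPU hang detected while waiting on event\n");
    }
    return result;
}

The same handling applies to zeFenceHostSynchronize, whose implementation is changed in the fence hunks below.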
@@ -462,7 +462,10 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
     csr->makeSurfacePackNonResident(csr->getResidencyAllocations());

     if (getSynchronousMode() == ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS) {
-        this->synchronize(std::numeric_limits<uint64_t>::max());
+        const auto synchronizeResult = this->synchronize(std::numeric_limits<uint64_t>::max());
+        if (synchronizeResult == ZE_RESULT_ERROR_DEVICE_LOST) {
+            return ZE_RESULT_ERROR_DEVICE_LOST;
+        }
     }

     this->heapContainer.clear();
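The hunk above covers the synchronous-queue case from the commit message: a queue created with ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS now forwards the hang status from its implicit synchronize. A minimal caller sketch (hypothetical handles; assumes the queue was created in synchronous mode):

#include <level_zero/ze_api.h>
#include <cstdio>

ze_result_t executeAndReport(ze_command_queue_handle_t hQueue, ze_command_list_handle_t hCmdList) {
    // On a synchronous queue this call blocks until execution finishes;
    // with this change a detected hang surfaces here as DEVICE_LOST.
    const ze_result_t result = zeCommandQueueExecuteCommandLists(hQueue, 1, &hCmdList, nullptr);
    if (result == ZE_RESULT_ERROR_DEVICE_LOST) {
        std::fprintf(stderr, "GPU hang detected during synchronous execution\n");
    }
    return result;
}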
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2020-2021 Intel Corporation
+ * Copyright (C) 2020-2022 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -15,6 +15,7 @@
 #include "level_zero/core/source/driver/driver_handle.h"
 #include <level_zero/ze_api.h>

+#include <chrono>
 #include <limits>

 struct _ze_event_handle_t {};
@@ -102,6 +103,7 @@ struct Event : _ze_event_handle_t {
     ze_event_scope_flags_t waitScope = 0u;

     uint32_t kernelCount = 1u;
+    std::chrono::microseconds gpuHangCheckPeriod{500'000};

  protected:
     size_t contextStartOffset = 0u;
@@ -199,7 +199,8 @@ ze_result_t EventImp<TagSizeT>::hostSignal() {

 template <typename TagSizeT>
 ze_result_t EventImp<TagSizeT>::hostSynchronize(uint64_t timeout) {
-    std::chrono::high_resolution_clock::time_point time1, time2;
+    std::chrono::microseconds elapsedTimeSinceGpuHangCheck{0};
+    std::chrono::high_resolution_clock::time_point waitStartTime, lastHangCheckTime, currentTime;
     uint64_t timeDiff = 0;

     ze_result_t ret = ZE_RESULT_NOT_READY;
@@ -212,7 +213,8 @@ ze_result_t EventImp<TagSizeT>::hostSynchronize(uint64_t timeout) {
         return queryStatus();
     }

-    time1 = std::chrono::high_resolution_clock::now();
+    waitStartTime = std::chrono::high_resolution_clock::now();
+    lastHangCheckTime = waitStartTime;
     while (true) {
         ret = queryStatus();
         if (ret == ZE_RESULT_SUCCESS) {
@@ -221,12 +223,21 @@ ze_result_t EventImp<TagSizeT>::hostSynchronize(uint64_t timeout) {

         NEO::WaitUtils::waitFunction(nullptr, 0u);

+        currentTime = std::chrono::high_resolution_clock::now();
+        elapsedTimeSinceGpuHangCheck = std::chrono::duration_cast<std::chrono::microseconds>(currentTime - lastHangCheckTime);
+
+        if (elapsedTimeSinceGpuHangCheck.count() >= this->gpuHangCheckPeriod.count()) {
+            lastHangCheckTime = currentTime;
+            if (this->csr->isGpuHangDetected()) {
+                return ZE_RESULT_ERROR_DEVICE_LOST;
+            }
+        }
+
         if (timeout == std::numeric_limits<uint32_t>::max()) {
             continue;
         }

-        time2 = std::chrono::high_resolution_clock::now();
-        timeDiff = std::chrono::duration_cast<std::chrono::nanoseconds>(time2 - time1).count();
+        timeDiff = std::chrono::duration_cast<std::chrono::nanoseconds>(currentTime - waitStartTime).count();

         if (timeDiff >= timeout) {
             break;
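The loop above throttles the hang query: queryStatus() and the CPU-relax waitFunction run on every iteration, while csr->isGpuHangDetected() is consulted at most once per gpuHangCheckPeriod (500 ms by default, shortened to 0 ms in the unit tests below). A generic, self-contained sketch of the same throttling pattern (illustrative names, not the actual NEO implementation):

#include <chrono>

// Poll a cheap completion predicate continuously, but invoke the expensive
// hang detector at most once per checkPeriod. Returns false on a detected hang.
template <typename IsDoneFn, typename IsHungFn>
bool waitWithHangCheck(IsDoneFn isDone, IsHungFn isHung, std::chrono::microseconds checkPeriod) {
    auto lastHangCheck = std::chrono::high_resolution_clock::now();
    while (!isDone()) {
        const auto now = std::chrono::high_resolution_clock::now();
        if (now - lastHangCheck >= checkPeriod) {
            lastHangCheck = now;
            if (isHung()) {
                return false; // caller maps this to ZE_RESULT_ERROR_DEVICE_LOST
            }
        }
    }
    return true;
}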
@@ -43,11 +43,13 @@ ze_result_t FenceImp::reset() {
 }

 ze_result_t FenceImp::hostSynchronize(uint64_t timeout) {
-    std::chrono::high_resolution_clock::time_point time1, time2;
+    std::chrono::microseconds elapsedTimeSinceGpuHangCheck{0};
+    std::chrono::high_resolution_clock::time_point waitStartTime, lastHangCheckTime, currentTime;
     uint64_t timeDiff = 0;
     ze_result_t ret = ZE_RESULT_NOT_READY;
+    const auto csr = cmdQueue->getCsr();

-    if (cmdQueue->getCsr()->getType() == NEO::CommandStreamReceiverType::CSR_AUB) {
+    if (csr->getType() == NEO::CommandStreamReceiverType::CSR_AUB) {
         return ZE_RESULT_SUCCESS;
     }

@@ -59,7 +61,8 @@ ze_result_t FenceImp::hostSynchronize(uint64_t timeout) {
         return queryStatus();
     }

-    time1 = std::chrono::high_resolution_clock::now();
+    waitStartTime = std::chrono::high_resolution_clock::now();
+    lastHangCheckTime = waitStartTime;
     while (timeDiff < timeout) {
         ret = queryStatus();
         if (ret == ZE_RESULT_SUCCESS) {
@@ -68,12 +71,21 @@ ze_result_t FenceImp::hostSynchronize(uint64_t timeout) {

         NEO::WaitUtils::waitFunction(nullptr, 0u);

+        currentTime = std::chrono::high_resolution_clock::now();
+        elapsedTimeSinceGpuHangCheck = std::chrono::duration_cast<std::chrono::microseconds>(currentTime - lastHangCheckTime);
+
+        if (elapsedTimeSinceGpuHangCheck.count() >= gpuHangCheckPeriod.count()) {
+            lastHangCheckTime = currentTime;
+            if (csr->isGpuHangDetected()) {
+                return ZE_RESULT_ERROR_DEVICE_LOST;
+            }
+        }
+
         if (timeout == std::numeric_limits<uint64_t>::max()) {
             continue;
         }

-        time2 = std::chrono::high_resolution_clock::now();
-        timeDiff = std::chrono::duration_cast<std::chrono::nanoseconds>(time2 - time1).count();
+        timeDiff = std::chrono::duration_cast<std::chrono::nanoseconds>(currentTime - waitStartTime).count();
     }

     return ret;
@@ -13,6 +13,7 @@
 #include "level_zero/core/source/cmdqueue/cmdqueue_imp.h"
 #include <level_zero/ze_api.h>

+#include <chrono>
 #include <limits>

 struct _ze_fence_handle_t {};
@@ -39,6 +40,7 @@ struct Fence : _ze_fence_handle_t {
  protected:
     uint32_t partitionCount = 1;
     uint32_t taskCount = 0;
+    std::chrono::microseconds gpuHangCheckPeriod{500'000};
 };

 struct FenceImp : public Fence {
@@ -69,7 +69,7 @@ struct MockCommandQueueHw : public L0::CommandQueueHw<gfxCoreFamily> {
     }
     ze_result_t synchronize(uint64_t timeout) override {
         synchronizedCalled++;
-        return ZE_RESULT_SUCCESS;
+        return synchronizeReturnValue;
     }

     NEO::SubmissionStatus submitBatchBuffer(size_t offset, NEO::ResidencyContainer &residencyContainer, void *endingCmdPtr, bool isCooperative) override {
@@ -79,6 +79,7 @@ struct MockCommandQueueHw : public L0::CommandQueueHw<gfxCoreFamily> {

     uint32_t synchronizedCalled = 0;
     NEO::ResidencyContainer residencyContainerSnapshot;
+    ze_result_t synchronizeReturnValue{ZE_RESULT_SUCCESS};
 };

 struct Deleter {
@@ -19,6 +19,9 @@ namespace ult {

 template <>
 struct WhiteBox<::L0::Fence> : public ::L0::Fence {
+    ~WhiteBox() override = default;
+
+    using ::L0::Fence::gpuHangCheckPeriod;
     using ::L0::Fence::partitionCount;
     using ::L0::Fence::taskCount;
 };
@@ -21,6 +21,12 @@
 #include "level_zero/core/test/unit_tests/mocks/mock_event.h"

 #include <atomic>
+#include <chrono>
+#include <cstdint>
+#include <limits>
+#include <memory>
+
+using namespace std::chrono_literals;

 namespace CpuIntrinsicsTests {
 extern std::atomic<uintptr_t> lastClFlushedPtr;
@@ -597,6 +603,43 @@ class EventSynchronizeTest : public Test<DeviceFixture> {
     std::unique_ptr<L0::Event> event;
 };

+TEST_F(EventSynchronizeTest, GivenGpuHangWhenHostSynchronizeIsCalledThenDeviceLostIsReturned) {
+    const auto csr = std::make_unique<MockCommandStreamReceiver>(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield());
+    csr->isGpuHangDetectedReturnValue = true;
+
+    event->csr = csr.get();
+    event->gpuHangCheckPeriod = 0ms;
+
+    const auto timeout = std::numeric_limits<std::uint32_t>::max();
+    const auto result = event->hostSynchronize(timeout);
+
+    EXPECT_EQ(ZE_RESULT_ERROR_DEVICE_LOST, result);
+}
+
+TEST_F(EventSynchronizeTest, GivenNoGpuHangAndOneNanosecondTimeoutWhenHostSynchronizeIsCalledThenResultNotReadyIsReturnedDueToTimeout) {
+    const auto csr = std::make_unique<MockCommandStreamReceiver>(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield());
+    csr->isGpuHangDetectedReturnValue = false;
+
+    event->csr = csr.get();
+    event->gpuHangCheckPeriod = 0ms;
+
+    const auto timeoutNanoseconds = 1;
+    const auto result = event->hostSynchronize(timeoutNanoseconds);
+
+    EXPECT_EQ(ZE_RESULT_NOT_READY, result);
+}
+
+TEST_F(EventSynchronizeTest, GivenLongPeriodOfGpuCheckAndOneNanosecondTimeoutWhenHostSynchronizeIsCalledThenResultNotReadyIsReturnedDueToTimeout) {
+    const auto csr = std::make_unique<MockCommandStreamReceiver>(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield());
+    event->csr = csr.get();
+    event->gpuHangCheckPeriod = 50000000ms;
+
+    const auto timeoutNanoseconds = 1;
+    const auto result = event->hostSynchronize(timeoutNanoseconds);
+
+    EXPECT_EQ(ZE_RESULT_NOT_READY, result);
+}
+
 TEST_F(EventSynchronizeTest, givenCallToEventHostSynchronizeWithTimeoutZeroAndStateInitialHostSynchronizeReturnsNotReady) {
     ze_result_t result = event->hostSynchronize(0);
     EXPECT_EQ(ZE_RESULT_NOT_READY, result);
@@ -15,6 +15,13 @@
 #include "level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h"
 #include "level_zero/core/test/unit_tests/mocks/mock_fence.h"

+#include <chrono>
+#include <cstdint>
+#include <limits>
+#include <memory>
+
+using namespace std::chrono_literals;
+
 namespace L0 {
 namespace ult {

@@ -49,6 +56,87 @@ TEST_F(FenceTest, whenQueryingStatusWithoutCsrAndFenceUnsignaledThenReturnsNotRe
     fence->destroy();
 }

+TEST_F(FenceTest, GivenGpuHangWhenHostSynchronizeIsCalledThenDeviceLostIsReturned) {
+    const auto csr = std::make_unique<MockCommandStreamReceiver>(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield());
+    csr->isGpuHangDetectedReturnValue = true;
+    csr->testTaskCountReadyReturnValue = false;
+
+    Mock<CommandQueue> cmdqueue(device, csr.get());
+    ze_fence_desc_t desc;
+
+    std::unique_ptr<WhiteBox<L0::Fence>> fence;
+    fence.reset(whitebox_cast(Fence::create(&cmdqueue, &desc)));
+    ASSERT_NE(nullptr, fence);
+
+    fence->taskCount = 1;
+    fence->gpuHangCheckPeriod = 0ms;
+
+    const auto timeout = std::numeric_limits<std::uint32_t>::max();
+    const auto result = fence->hostSynchronize(timeout);
+
+    EXPECT_EQ(ZE_RESULT_ERROR_DEVICE_LOST, result);
+}
+
+TEST_F(FenceTest, GivenNoGpuHangAndOneNanosecondTimeoutWhenHostSynchronizeIsCalledThenResultNotReadyIsReturnedDueToTimeout) {
+    const auto csr = std::make_unique<MockCommandStreamReceiver>(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield());
+    csr->isGpuHangDetectedReturnValue = false;
+    csr->testTaskCountReadyReturnValue = false;
+
+    Mock<CommandQueue> cmdqueue(device, csr.get());
+    ze_fence_desc_t desc;
+
+    std::unique_ptr<WhiteBox<L0::Fence>> fence;
+    fence.reset(whitebox_cast(Fence::create(&cmdqueue, &desc)));
+    ASSERT_NE(nullptr, fence);
+
+    fence->taskCount = 1;
+    fence->gpuHangCheckPeriod = 0ms;
+
+    const auto timeoutNanoseconds = 1;
+    const auto result = fence->hostSynchronize(timeoutNanoseconds);
+
+    EXPECT_EQ(ZE_RESULT_NOT_READY, result);
+}
+
+TEST_F(FenceTest, GivenLongPeriodOfGpuCheckAndOneNanosecondTimeoutWhenHostSynchronizeIsCalledThenResultNotReadyIsReturnedDueToTimeout) {
+    const auto csr = std::make_unique<MockCommandStreamReceiver>(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield());
+    csr->testTaskCountReadyReturnValue = false;
+
+    Mock<CommandQueue> cmdqueue(device, csr.get());
+    ze_fence_desc_t desc;
+
+    std::unique_ptr<WhiteBox<L0::Fence>> fence;
+    fence.reset(whitebox_cast(Fence::create(&cmdqueue, &desc)));
+    ASSERT_NE(nullptr, fence);
+
+    fence->taskCount = 1;
+    fence->gpuHangCheckPeriod = 50000000ms;
+
+    const auto timeoutNanoseconds = 1;
+    const auto result = fence->hostSynchronize(timeoutNanoseconds);
+
+    EXPECT_EQ(ZE_RESULT_NOT_READY, result);
+}
+
+TEST_F(FenceTest, GivenSuccessfulQueryResultAndNoTimeoutWhenHostSynchronizeIsCalledThenResultSuccessIsReturned) {
+    const auto csr = std::make_unique<MockCommandStreamReceiver>(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield());
+    csr->testTaskCountReadyReturnValue = true;
+
+    Mock<CommandQueue> cmdqueue(device, csr.get());
+    ze_fence_desc_t desc;
+
+    std::unique_ptr<WhiteBox<L0::Fence>> fence;
+    fence.reset(whitebox_cast(Fence::create(&cmdqueue, &desc)));
+    ASSERT_NE(nullptr, fence);
+
+    fence->taskCount = 1;
+
+    const auto timeout = std::numeric_limits<std::uint32_t>::max();
+    const auto result = fence->hostSynchronize(timeout);
+
+    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
+}
+
 using FenceSynchronizeTest = Test<DeviceFixture>;

 TEST_F(FenceSynchronizeTest, givenCallToFenceHostSynchronizeWithTimeoutZeroAndStateInitialThenHostSynchronizeReturnsNotReady) {
@@ -168,7 +168,7 @@ class CommandStreamReceiver {
     virtual WaitStatus waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode) = 0;
     virtual WaitStatus waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait);
     WaitStatus baseWaitFunction(volatile uint32_t *pollAddress, bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait);
-    bool testTaskCountReady(volatile uint32_t *pollAddress, uint32_t taskCountToWait);
+    MOCKABLE_VIRTUAL bool testTaskCountReady(volatile uint32_t *pollAddress, uint32_t taskCountToWait);
     virtual void downloadAllocations(){};

     void setSamplerCacheFlushRequired(SamplerCacheFlushState value) { this->samplerCacheFlushRequired = value; }
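testTaskCountReady becomes MOCKABLE_VIRTUAL so MockCommandStreamReceiver can override it in the unit tests below without the method being virtual in production code. As a rough illustration of the idea (an assumption for illustration, not the actual NEO macro definition):

// Hypothetical definition: test builds get a virtual method that mocks can
// override; release builds keep it non-virtual and skip the vtable dispatch.
#if defined(TESTS_ENABLED)
#define MOCKABLE_VIRTUAL virtual
#else
#define MOCKABLE_VIRTUAL
#endif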
@@ -318,12 +318,13 @@ class CommandStreamReceiver {

     const HardwareInfo &peekHwInfo() const;

+    MOCKABLE_VIRTUAL bool isGpuHangDetected() const;
+
  protected:
     void cleanupResources();
     void printDeviceIndex();
     void checkForNewResources(uint32_t submittedTaskCount, uint32_t allocationTaskCount, GraphicsAllocation &gfxAllocation);
     bool checkImplicitFlushForGpuIdle();
-    bool isGpuHangDetected() const;
     MOCKABLE_VIRTUAL std::unique_lock<MutexType> obtainHostPtrSurfaceCreationLock();

     std::unique_ptr<FlushStampTracker> flushStamp;
@@ -19,6 +19,7 @@

 #include "gmock/gmock.h"

+#include <optional>
 #include <vector>

 using namespace NEO;
@@ -63,6 +64,22 @@ class MockCommandStreamReceiver : public CommandStreamReceiver {

     bool isMultiOsContextCapable() const override { return multiOsContextCapable; }

+    bool isGpuHangDetected() const override {
+        if (isGpuHangDetectedReturnValue.has_value()) {
+            return *isGpuHangDetectedReturnValue;
+        } else {
+            return CommandStreamReceiver::isGpuHangDetected();
+        }
+    }
+
+    bool testTaskCountReady(volatile uint32_t *pollAddress, uint32_t taskCountToWait) override {
+        if (testTaskCountReadyReturnValue.has_value()) {
+            return *testTaskCountReadyReturnValue;
+        } else {
+            return CommandStreamReceiver::testTaskCountReady(pollAddress, taskCountToWait);
+        }
+    }
+
     MemoryCompressionState getMemoryCompressionState(bool auxTranslationRequired, const HardwareInfo &hwInfo) const override {
         return MemoryCompressionState::NotApplicable;
     };
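The two new std::optional members give the mock an opt-in canned result: when left unset, the call falls through to the real CommandStreamReceiver logic. A small self-contained sketch of that pattern with illustrative names (not the NEO classes):

#include <optional>

struct Detector {
    bool isHangDetected() const { return false; } // stand-in for the real check
};

struct MockDetector : Detector {
    std::optional<bool> isHangDetectedReturnValue{};

    bool isHangDetected() const {
        // Canned value when configured by the test, real behavior otherwise.
        if (isHangDetectedReturnValue.has_value()) {
            return *isHangDetectedReturnValue;
        }
        return Detector::isHangDetected();
    }
};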
@@ -147,6 +164,8 @@ class MockCommandStreamReceiver : public CommandStreamReceiver {
     bool createPreemptionAllocationReturn = true;
     bool createPreemptionAllocationParentCall = false;
     bool programComputeBarrierCommandCalled = false;
+    std::optional<bool> isGpuHangDetectedReturnValue{};
+    std::optional<bool> testTaskCountReadyReturnValue{};
 };

 class MockCommandStreamReceiverWithFailingSubmitBatch : public MockCommandStreamReceiver {