mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-04 23:56:39 +08:00
Detect GPU hang in clWaitForEvents
This change:
- moves NEO::WaitStatus to a separate file
- enables detection of GPU hang in clWaitForEvents
- adjusts most of the blocking calls in CommandStreamReceiver to return WaitStatus
- adds ULTs to cover the new code

Related-To: NEO-6681
Signed-off-by: Patryk Wrobel <patryk.wrobel@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
f2e1361541
commit
7f729b7f89
@@ -27,6 +27,8 @@
|
||||
#include "opencl/source/helpers/hardware_commands_helper.h"
|
||||
#include "opencl/source/mem_obj/mem_obj.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
namespace NEO {
|
||||
|
||||
Event::Event(
|
||||
@@ -417,15 +419,18 @@ void Event::getBoundaryTimestampValues(TimestampPacketContainer *timestampContai
|
||||
}
|
||||
}
|
||||
|
||||
inline bool Event::wait(bool blocking, bool useQuickKmdSleep) {
|
||||
inline WaitStatus Event::wait(bool blocking, bool useQuickKmdSleep) {
|
||||
while (this->taskCount == CompletionStamp::notReady) {
|
||||
if (blocking == false) {
|
||||
return false;
|
||||
return WaitStatus::NotReady;
|
||||
}
|
||||
}
|
||||
|
||||
Range<CopyEngineState> states{&bcsState, bcsState.isValid() ? 1u : 0u};
|
||||
cmdQueue->waitUntilComplete(taskCount.load(), states, flushStamp->peekStamp(), useQuickKmdSleep);
|
||||
const auto waitStatus = cmdQueue->waitUntilComplete(taskCount.load(), states, flushStamp->peekStamp(), useQuickKmdSleep);
|
||||
if (waitStatus == WaitStatus::GpuHang) {
|
||||
return WaitStatus::GpuHang;
|
||||
}
|
||||
updateExecutionStatus();
|
||||
|
||||
DEBUG_BREAK_IF(this->taskLevel == CompletionStamp::notReady && this->executionStatus >= 0);
|
||||
@@ -433,7 +438,7 @@ inline bool Event::wait(bool blocking, bool useQuickKmdSleep) {
|
||||
auto *allocationStorage = cmdQueue->getGpgpuCommandStreamReceiver().getInternalAllocationStorage();
|
||||
allocationStorage->cleanAllocationList(this->taskCount, TEMPORARY_ALLOCATION);
|
||||
|
||||
return true;
|
||||
return WaitStatus::Ready;
|
||||
}
|
||||
|
||||
void Event::updateExecutionStatus() {
|
||||
@@ -630,16 +635,23 @@ cl_int Event::waitForEvents(cl_uint numEvents,
|
||||
// pointers to workerLists - for fast swap operations
|
||||
WorkerListT *currentlyPendingEvents = &workerList1;
|
||||
WorkerListT *pendingEventsLeft = &workerList2;
|
||||
WaitStatus eventWaitStatus = WaitStatus::NotReady;
|
||||
|
||||
while (currentlyPendingEvents->size() > 0) {
|
||||
for (auto &e : *currentlyPendingEvents) {
|
||||
Event *event = castToObjectOrAbort<Event>(e);
|
||||
for (auto current = currentlyPendingEvents->begin(), end = currentlyPendingEvents->end(); current != end; ++current) {
|
||||
Event *event = castToObjectOrAbort<Event>(*current);
|
||||
if (event->peekExecutionStatus() < CL_COMPLETE) {
|
||||
return CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
|
||||
}
|
||||
|
||||
if (event->wait(false, false) == false) {
|
||||
eventWaitStatus = event->wait(false, false);
|
||||
if (eventWaitStatus == WaitStatus::NotReady) {
|
||||
pendingEventsLeft->push_back(event);
|
||||
} else if (eventWaitStatus == WaitStatus::GpuHang) {
|
||||
setExecutionStatusToAbortedDueToGpuHang(pendingEventsLeft->begin(), pendingEventsLeft->end());
|
||||
setExecutionStatusToAbortedDueToGpuHang(current, end);
|
||||
|
||||
return CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -650,6 +662,13 @@ cl_int Event::waitForEvents(cl_uint numEvents,
|
||||
return CL_SUCCESS;
|
||||
}
|
||||
|
||||
// Walks the half-open range [first, last) of cl_event handles, converts each
// handle to an Event via castToObjectOrAbort, and calls
// transitionExecutionStatus(executionAbortedDueToGpuHang) on it.
// Used by Event::waitForEvents when a wait reports WaitStatus::GpuHang.
inline void Event::setExecutionStatusToAbortedDueToGpuHang(cl_event *first, cl_event *last) {
|
||||
std::for_each(first, last, [](cl_event &e) {
|
||||
Event *event = castToObjectOrAbort<Event>(e);
|
||||
event->transitionExecutionStatus(executionAbortedDueToGpuHang);
|
||||
});
|
||||
}
|
||||
|
||||
// Returns the current value of this event's taskLevel member.
uint32_t Event::getTaskLevel() {
|
||||
return taskLevel;
|
||||
}
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
/*
|
||||
* Copyright (C) 2018-2021 Intel Corporation
|
||||
* Copyright (C) 2018-2022 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "shared/source/command_stream/wait_status.h"
|
||||
#include "shared/source/helpers/flush_stamp.h"
|
||||
#include "shared/source/os_interface/os_time.h"
|
||||
#include "shared/source/os_interface/performance_counters.h"
|
||||
@@ -80,6 +81,7 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
|
||||
};
|
||||
|
||||
static const cl_ulong objectMagic = 0x80134213A43C981ALL;
|
||||
static constexpr cl_int executionAbortedDueToGpuHang = -777;
|
||||
|
||||
Event(CommandQueue *cmdQueue, cl_command_type cmdType,
|
||||
uint32_t taskLevel, uint32_t taskCount);
|
||||
@@ -206,9 +208,8 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
|
||||
// adds a callback (execution state change listener) to this event's list of callbacks
|
||||
void addCallback(Callback::ClbFuncT fn, cl_int type, void *data);
|
||||
|
||||
//returns true on success
|
||||
//if(blocking==false), will return with false instead of blocking while waiting for completion
|
||||
virtual bool wait(bool blocking, bool useQuickKmdSleep);
|
||||
//if(blocking==false), will return with WaitStatus::NotReady instead of blocking while waiting for completion
|
||||
virtual WaitStatus wait(bool blocking, bool useQuickKmdSleep);
|
||||
|
||||
bool isUserEvent() const {
|
||||
return (CL_COMMAND_USER == cmdType);
|
||||
@@ -347,6 +348,8 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
|
||||
void unblockEventsBlockedByThis(int32_t transitionStatus);
|
||||
void submitCommand(bool abortBlockedTasks);
|
||||
|
||||
static void setExecutionStatusToAbortedDueToGpuHang(cl_event *first, cl_event *last);
|
||||
|
||||
bool currentCmdQVirtualEvent;
|
||||
std::atomic<Command *> cmdToSubmit;
|
||||
std::atomic<Command *> submittedCmd;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2018-2021 Intel Corporation
|
||||
* Copyright (C) 2018-2022 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -24,13 +24,13 @@ void UserEvent::updateExecutionStatus() {
|
||||
return;
|
||||
}
|
||||
|
||||
bool UserEvent::wait(bool blocking, bool useQuickKmdSleep) {
|
||||
WaitStatus UserEvent::wait(bool blocking, bool useQuickKmdSleep) {
|
||||
while (updateStatusAndCheckCompletion() == false) {
|
||||
if (blocking == false) {
|
||||
return false;
|
||||
return WaitStatus::NotReady;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
return WaitStatus::Ready;
|
||||
}
|
||||
|
||||
uint32_t UserEvent::getTaskLevel() {
|
||||
@@ -53,16 +53,15 @@ VirtualEvent::VirtualEvent(CommandQueue *cmdQ, Context *ctx)
|
||||
}
|
||||
|
||||
void VirtualEvent::updateExecutionStatus() {
|
||||
;
|
||||
}
|
||||
|
||||
bool VirtualEvent::wait(bool blocking, bool useQuickKmdSleep) {
|
||||
WaitStatus VirtualEvent::wait(bool blocking, bool useQuickKmdSleep) {
|
||||
while (updateStatusAndCheckCompletion() == false) {
|
||||
if (blocking == false) {
|
||||
return false;
|
||||
return WaitStatus::NotReady;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
return WaitStatus::Ready;
|
||||
}
|
||||
|
||||
uint32_t VirtualEvent::getTaskLevel() {
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2018-2021 Intel Corporation
|
||||
* Copyright (C) 2018-2022 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -18,7 +18,7 @@ class UserEvent : public Event {
|
||||
|
||||
~UserEvent() override = default;
|
||||
|
||||
bool wait(bool blocking, bool useQuickKmdSleep) override;
|
||||
WaitStatus wait(bool blocking, bool useQuickKmdSleep) override;
|
||||
|
||||
void updateExecutionStatus() override;
|
||||
|
||||
@@ -33,7 +33,7 @@ class VirtualEvent : public Event {
|
||||
|
||||
~VirtualEvent() override = default;
|
||||
|
||||
bool wait(bool blocking, bool useQuickKmdSleep) override;
|
||||
WaitStatus wait(bool blocking, bool useQuickKmdSleep) override;
|
||||
|
||||
bool setStatus(cl_int status) override;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user