mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-29 09:03:14 +08:00
Detect GPU hang in remaining calls of command queue and list
This change introduces checking of the wait status in the CommandQueue and CommandList classes.

Related-To: NEO-6681
Signed-off-by: Patryk Wrobel <patryk.wrobel@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
9e7703a578
commit
19dded25ef
@@ -66,15 +66,20 @@ ze_result_t CommandQueueImp::initialize(bool copyOnly, bool isInternal) {
|
||||
return returnValue;
|
||||
}
|
||||
|
||||
void CommandQueueImp::reserveLinearStreamSize(size_t size) {
|
||||
NEO::WaitStatus CommandQueueImp::reserveLinearStreamSize(size_t size) {
|
||||
auto waitStatus{NEO::WaitStatus::Ready};
|
||||
|
||||
UNRECOVERABLE_IF(commandStream == nullptr);
|
||||
if (commandStream->getAvailableSpace() < size) {
|
||||
buffers.switchBuffers(csr);
|
||||
waitStatus = buffers.switchBuffers(csr);
|
||||
|
||||
NEO::GraphicsAllocation *nextBufferAllocation = buffers.getCurrentBufferAllocation();
|
||||
commandStream->replaceBuffer(nextBufferAllocation->getUnderlyingBuffer(),
|
||||
defaultQueueCmdBufferSize);
|
||||
commandStream->replaceGraphicsAllocation(nextBufferAllocation);
|
||||
}
|
||||
|
||||
return waitStatus;
|
||||
}
|
||||
|
||||
NEO::SubmissionStatus CommandQueueImp::submitBatchBuffer(size_t offset, NEO::ResidencyContainer &residencyContainer, void *endingCmdPtr,
|
||||
@@ -230,18 +235,21 @@ void CommandQueueImp::CommandBufferManager::destroy(Device *device) {
|
||||
}
|
||||
}
|
||||
|
||||
void CommandQueueImp::CommandBufferManager::switchBuffers(NEO::CommandStreamReceiver *csr) {
|
||||
NEO::WaitStatus CommandQueueImp::CommandBufferManager::switchBuffers(NEO::CommandStreamReceiver *csr) {
|
||||
if (bufferUse == BUFFER_ALLOCATION::FIRST) {
|
||||
bufferUse = BUFFER_ALLOCATION::SECOND;
|
||||
} else {
|
||||
bufferUse = BUFFER_ALLOCATION::FIRST;
|
||||
}
|
||||
|
||||
auto waitStatus{NEO::WaitStatus::Ready};
|
||||
auto completionId = flushId[bufferUse];
|
||||
if (completionId.second != 0u) {
|
||||
UNRECOVERABLE_IF(csr == nullptr);
|
||||
csr->waitForTaskCountWithKmdNotifyFallback(completionId.first, completionId.second, false, NEO::QueueThrottle::MEDIUM);
|
||||
waitStatus = csr->waitForTaskCountWithKmdNotifyFallback(completionId.first, completionId.second, false, NEO::QueueThrottle::MEDIUM);
|
||||
}
|
||||
|
||||
return waitStatus;
|
||||
}
|
||||
|
||||
} // namespace L0
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#include "shared/source/command_stream/preemption.h"
|
||||
#include "shared/source/command_stream/submission_status.h"
|
||||
#include "shared/source/command_stream/thread_arbitration_policy.h"
|
||||
#include "shared/source/command_stream/wait_status.h"
|
||||
#include "shared/source/device/device.h"
|
||||
#include "shared/source/helpers/hw_helper.h"
|
||||
#include "shared/source/helpers/hw_info.h"
|
||||
@@ -270,7 +271,12 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
|
||||
|
||||
size_t alignedSize = alignUp<size_t>(linearStreamSizeEstimate, minCmdBufferPtrAlign);
|
||||
size_t padding = alignedSize - linearStreamSizeEstimate;
|
||||
reserveLinearStreamSize(alignedSize);
|
||||
|
||||
const auto waitStatus = reserveLinearStreamSize(alignedSize);
|
||||
if (waitStatus == NEO::WaitStatus::GpuHang) {
|
||||
return ZE_RESULT_ERROR_DEVICE_LOST;
|
||||
}
|
||||
|
||||
NEO::LinearStream child(commandStream->getSpace(alignedSize), alignedSize);
|
||||
child.setGpuBase(ptrOffset(commandStream->getGpuBase(), commandStream->getUsed() - alignedSize));
|
||||
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
#include "shared/source/command_stream/csr_definitions.h"
|
||||
#include "shared/source/command_stream/submission_status.h"
|
||||
#include "shared/source/command_stream/submissions_aggregator.h"
|
||||
#include "shared/source/command_stream/wait_status.h"
|
||||
#include "shared/source/helpers/constants.h"
|
||||
#include "shared/source/indirect_heap/indirect_heap.h"
|
||||
|
||||
@@ -38,7 +39,7 @@ struct CommandQueueImp : public CommandQueue {
|
||||
|
||||
ze_result_t initialize(Device *device, size_t sizeRequested);
|
||||
void destroy(Device *device);
|
||||
void switchBuffers(NEO::CommandStreamReceiver *csr);
|
||||
NEO::WaitStatus switchBuffers(NEO::CommandStreamReceiver *csr);
|
||||
|
||||
NEO::GraphicsAllocation *getCurrentBufferAllocation() {
|
||||
return buffers[bufferUse];
|
||||
@@ -78,7 +79,7 @@ struct CommandQueueImp : public CommandQueue {
|
||||
|
||||
NEO::CommandStreamReceiver *getCsr() { return csr; }
|
||||
|
||||
void reserveLinearStreamSize(size_t size);
|
||||
MOCKABLE_VIRTUAL NEO::WaitStatus reserveLinearStreamSize(size_t size);
|
||||
ze_command_queue_mode_t getSynchronousMode() const;
|
||||
virtual bool getPreemptionCmdProgramming() = 0;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user