Detect GPU hang in remaining calls of command queue and list

This change introduces checking of the wait status in the
CommandQueue and CommandList classes.

Related-To: NEO-6681
Signed-off-by: Patryk Wrobel <patryk.wrobel@intel.com>
This commit is contained in:
Patryk Wrobel
2022-04-08 14:38:10 +00:00
committed by Compute-Runtime-Automation
parent 9e7703a578
commit 19dded25ef
10 changed files with 281 additions and 16 deletions

View File

@@ -66,15 +66,20 @@ ze_result_t CommandQueueImp::initialize(bool copyOnly, bool isInternal) {
return returnValue;
}
// NOTE(review): this is a diff hunk rendered without +/- markers. Where two
// near-identical lines appear back to back (the signature, the switchBuffers
// call), the first is presumably the removed line and the second its
// replacement - confirm against the real diff.
//
// Makes sure the command stream can hold `size` more bytes. When the available
// space is smaller than `size`, the buffer manager is switched to its other
// buffer and the command stream is re-pointed at that buffer's allocation.
//
// Returns NEO::WaitStatus::Ready when no switch was needed; otherwise returns
// whatever buffers.switchBuffers(csr) reported.
void CommandQueueImp::reserveLinearStreamSize(size_t size) {
NEO::WaitStatus CommandQueueImp::reserveLinearStreamSize(size_t size) {
// Default result for the path where the current buffer already has enough room.
auto waitStatus{NEO::WaitStatus::Ready};
UNRECOVERABLE_IF(commandStream == nullptr);
if (commandStream->getAvailableSpace() < size) {
buffers.switchBuffers(csr);
// Keep the status of the switch so the caller can react to it.
waitStatus = buffers.switchBuffers(csr);
NEO::GraphicsAllocation *nextBufferAllocation = buffers.getCurrentBufferAllocation();
// The stream is rebound even if the switch reported a non-Ready status;
// the decision about what to do with that status is left to the caller.
commandStream->replaceBuffer(nextBufferAllocation->getUnderlyingBuffer(),
defaultQueueCmdBufferSize);
commandStream->replaceGraphicsAllocation(nextBufferAllocation);
}
return waitStatus;
}
NEO::SubmissionStatus CommandQueueImp::submitBatchBuffer(size_t offset, NEO::ResidencyContainer &residencyContainer, void *endingCmdPtr,
@@ -230,18 +235,21 @@ void CommandQueueImp::CommandBufferManager::destroy(Device *device) {
}
}
// NOTE(review): this is a diff hunk rendered without +/- markers. Where two
// near-identical lines appear back to back (the signature, the wait call),
// the first is presumably the removed line and the second its replacement -
// confirm against the real diff.
//
// Toggles the active buffer between FIRST and SECOND and, if the newly
// selected buffer has a non-zero recorded completion entry, waits on `csr`
// for that entry before returning.
//
// Returns NEO::WaitStatus::Ready when no wait was required; otherwise returns
// the result of csr->waitForTaskCountWithKmdNotifyFallback(...).
void CommandQueueImp::CommandBufferManager::switchBuffers(NEO::CommandStreamReceiver *csr) {
NEO::WaitStatus CommandQueueImp::CommandBufferManager::switchBuffers(NEO::CommandStreamReceiver *csr) {
// Flip to the other of the two buffers.
if (bufferUse == BUFFER_ALLOCATION::FIRST) {
bufferUse = BUFFER_ALLOCATION::SECOND;
} else {
bufferUse = BUFFER_ALLOCATION::FIRST;
}
auto waitStatus{NEO::WaitStatus::Ready};
// Entry recorded for the buffer that was just selected.
// NOTE(review): .first / .second look like a task count and a flush stamp - confirm.
auto completionId = flushId[bufferUse];
// A zero .second skips the wait entirely, and csr is only required to be
// non-null when a wait is actually needed.
if (completionId.second != 0u) {
UNRECOVERABLE_IF(csr == nullptr);
csr->waitForTaskCountWithKmdNotifyFallback(completionId.first, completionId.second, false, NEO::QueueThrottle::MEDIUM);
waitStatus = csr->waitForTaskCountWithKmdNotifyFallback(completionId.first, completionId.second, false, NEO::QueueThrottle::MEDIUM);
}
return waitStatus;
}
} // namespace L0

View File

@@ -16,6 +16,7 @@
#include "shared/source/command_stream/preemption.h"
#include "shared/source/command_stream/submission_status.h"
#include "shared/source/command_stream/thread_arbitration_policy.h"
#include "shared/source/command_stream/wait_status.h"
#include "shared/source/device/device.h"
#include "shared/source/helpers/hw_helper.h"
#include "shared/source/helpers/hw_info.h"
@@ -270,7 +271,12 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
size_t alignedSize = alignUp<size_t>(linearStreamSizeEstimate, minCmdBufferPtrAlign);
size_t padding = alignedSize - linearStreamSizeEstimate;
reserveLinearStreamSize(alignedSize);
const auto waitStatus = reserveLinearStreamSize(alignedSize);
if (waitStatus == NEO::WaitStatus::GpuHang) {
return ZE_RESULT_ERROR_DEVICE_LOST;
}
NEO::LinearStream child(commandStream->getSpace(alignedSize), alignedSize);
child.setGpuBase(ptrOffset(commandStream->getGpuBase(), commandStream->getUsed() - alignedSize));

View File

@@ -10,6 +10,7 @@
#include "shared/source/command_stream/csr_definitions.h"
#include "shared/source/command_stream/submission_status.h"
#include "shared/source/command_stream/submissions_aggregator.h"
#include "shared/source/command_stream/wait_status.h"
#include "shared/source/helpers/constants.h"
#include "shared/source/indirect_heap/indirect_heap.h"
@@ -38,7 +39,7 @@ struct CommandQueueImp : public CommandQueue {
ze_result_t initialize(Device *device, size_t sizeRequested);
void destroy(Device *device);
void switchBuffers(NEO::CommandStreamReceiver *csr);
NEO::WaitStatus switchBuffers(NEO::CommandStreamReceiver *csr);
NEO::GraphicsAllocation *getCurrentBufferAllocation() {
return buffers[bufferUse];
@@ -78,7 +79,7 @@ struct CommandQueueImp : public CommandQueue {
NEO::CommandStreamReceiver *getCsr() { return csr; }
void reserveLinearStreamSize(size_t size);
MOCKABLE_VIRTUAL NEO::WaitStatus reserveLinearStreamSize(size_t size);
ze_command_queue_mode_t getSynchronousMode() const;
virtual bool getPreemptionCmdProgramming() = 0;