Detect GPU hang in command list immediate

This change introduces checking of values returned
by blocking calls used in cmdlist_hw_immediate.inl.

Signed-off-by: Patryk Wrobel <patryk.wrobel@intel.com>
Related-To: NEO-6681
This commit is contained in:
Patryk Wrobel
2022-03-23 16:20:37 +00:00
committed by Compute-Runtime-Automation
parent f3bf5498a4
commit 14954acd12
3 changed files with 209 additions and 5 deletions

View File

@@ -7,6 +7,7 @@
#pragma once
#include "shared/source/command_stream/wait_status.h"
#include "shared/source/helpers/hw_helper.h"
#include "shared/source/helpers/hw_info.h"
#include "shared/source/memory_manager/internal_allocation_storage.h"
@@ -113,7 +114,10 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::executeCommandListImm
if (this->isSyncModeQueue) {
auto timeoutMicroseconds = NEO::TimeoutControls::maxTimeout;
this->csr->waitForCompletionWithTimeout(false, timeoutMicroseconds, completionStamp.taskCount);
const auto waitStatus = this->csr->waitForCompletionWithTimeout(false, timeoutMicroseconds, completionStamp.taskCount);
if (waitStatus == NEO::WaitStatus::GpuHang) {
return ZE_RESULT_ERROR_DEVICE_LOST;
}
this->csr->getInternalAllocationStorage()->cleanAllocationList(completionStamp.taskCount, NEO::AllocationUsage::TEMPORARY_ALLOCATION);
}
@@ -180,7 +184,10 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendBarrier(
this->csr->flushNonKernelTask(nullptr, 0, 0, args, false, false, false);
if (this->isSyncModeQueue) {
auto timeoutMicroseconds = NEO::TimeoutControls::maxTimeout;
this->csr->waitForCompletionWithTimeout(false, timeoutMicroseconds, this->csr->peekTaskCount());
const auto waitStatus = this->csr->waitForCompletionWithTimeout(false, timeoutMicroseconds, this->csr->peekTaskCount());
if (waitStatus == NEO::WaitStatus::GpuHang) {
return ZE_RESULT_ERROR_DEVICE_LOST;
}
}
} else {
ret = CommandListCoreFamilyImmediate<gfxCoreFamily>::appendSignalEvent(hSignalEvent);
@@ -263,7 +270,10 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendSignalEvent(ze_
this->csr->flushNonKernelTask(&event->getAllocation(this->device), event->getGpuAddress(this->device), Event::STATE_SIGNALED, args, false, false, false);
if (this->isSyncModeQueue) {
auto timeoutMicroseconds = NEO::TimeoutControls::maxTimeout;
this->csr->waitForCompletionWithTimeout(false, timeoutMicroseconds, this->csr->peekTaskCount());
const auto waitStatus = this->csr->waitForCompletionWithTimeout(false, timeoutMicroseconds, this->csr->peekTaskCount());
if (waitStatus == NEO::WaitStatus::GpuHang) {
return ZE_RESULT_ERROR_DEVICE_LOST;
}
}
}
return ret;
@@ -289,7 +299,10 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendEventReset(ze_e
this->csr->flushNonKernelTask(&event->getAllocation(this->device), event->getGpuAddress(this->device), Event::STATE_CLEARED, args, false, false, false);
if (this->isSyncModeQueue) {
auto timeoutMicroseconds = NEO::TimeoutControls::maxTimeout;
this->csr->waitForCompletionWithTimeout(false, timeoutMicroseconds, this->csr->peekTaskCount());
const auto waitStatus = this->csr->waitForCompletionWithTimeout(false, timeoutMicroseconds, this->csr->peekTaskCount());
if (waitStatus == NEO::WaitStatus::GpuHang) {
return ZE_RESULT_ERROR_DEVICE_LOST;
}
}
}
return ret;

View File

@@ -5,9 +5,11 @@
*
*/
#include "shared/source/command_stream/wait_status.h"
#include "shared/source/gmm_helper/gmm_helper.h"
#include "shared/test/common/cmd_parse/gen_cmd_parse.h"
#include "shared/test/common/helpers/unit_test_helper.h"
#include "shared/test/common/mocks/mock_command_stream_receiver.h"
#include "shared/test/common/mocks/mock_memory_manager.h"
#include "shared/test/common/test_macros/test.h"
#include "shared/test/unit_test/page_fault_manager/mock_cpu_page_fault_manager.h"
@@ -926,6 +928,194 @@ TEST_F(CommandListCreate, whenCreatingImmCmdListWithSyncModeAndAppendBarrierThen
commandList->appendBarrier(nullptr, 0, nullptr);
}
TEST_F(CommandListCreate, GivenGpuHangWhenCreatingImmCmdListWithSyncModeAndAppendBarrierThenAppendBarrierReturnsDeviceLost) {
ze_command_queue_desc_t desc = {};
desc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS;
ze_result_t returnValue;
std::unique_ptr<L0::CommandList> commandList(CommandList::createImmediate(productFamily, device, &desc, false, NEO::EngineGroupType::RenderCompute, returnValue));
ASSERT_EQ(ZE_RESULT_SUCCESS, returnValue);
ASSERT_NE(nullptr, commandList);
EXPECT_EQ(device, commandList->device);
EXPECT_EQ(CommandList::CommandListType::TYPE_IMMEDIATE, commandList->cmdListType);
EXPECT_NE(nullptr, commandList->cmdQImmediate);
MockCommandStreamReceiver mockCommandStreamReceiver(*neoDevice->executionEnvironment, neoDevice->getRootDeviceIndex(), neoDevice->getDeviceBitfield());
mockCommandStreamReceiver.waitForCompletionWithTimeoutReturnValue = WaitStatus::GpuHang;
const auto oldCsr = commandList->csr;
commandList->csr = &mockCommandStreamReceiver;
const auto appendBarrierResult = commandList->appendBarrier(nullptr, 0, nullptr);
EXPECT_EQ(ZE_RESULT_ERROR_DEVICE_LOST, appendBarrierResult);
commandList->csr = oldCsr;
}
HWTEST_F(CommandListCreate, GivenGpuHangWhenCreatingImmediateCommandListAndAppendingSignalEventsThenDeviceLostIsReturned) {
ze_command_queue_desc_t desc = {};
desc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS;
ze_result_t returnValue;
std::unique_ptr<L0::CommandList> commandList(CommandList::createImmediate(productFamily, device, &desc, false, NEO::EngineGroupType::RenderCompute, returnValue));
ASSERT_EQ(ZE_RESULT_SUCCESS, returnValue);
ASSERT_NE(nullptr, commandList);
EXPECT_EQ(device, commandList->device);
EXPECT_EQ(CommandList::CommandListType::TYPE_IMMEDIATE, commandList->cmdListType);
EXPECT_NE(nullptr, commandList->cmdQImmediate);
ze_event_pool_desc_t eventPoolDesc = {};
eventPoolDesc.count = 1;
eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
ze_event_desc_t eventDesc = {};
eventDesc.index = 0;
eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
eventDesc.wait = ZE_EVENT_SCOPE_FLAG_HOST;
ze_event_handle_t event = nullptr;
std::unique_ptr<L0::EventPool> eventPool(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue));
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
ASSERT_NE(nullptr, eventPool);
eventPool->createEvent(&eventDesc, &event);
std::unique_ptr<L0::Event> event_object(L0::Event::fromHandle(event));
ASSERT_NE(nullptr, event_object->csr);
ASSERT_EQ(static_cast<DeviceImp *>(device)->getNEODevice()->getDefaultEngine().commandStreamReceiver, event_object->csr);
returnValue = commandList->appendWaitOnEvents(1, &event);
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
returnValue = commandList->appendBarrier(nullptr, 1, &event);
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
MockCommandStreamReceiver mockCommandStreamReceiver(*neoDevice->executionEnvironment, neoDevice->getRootDeviceIndex(), neoDevice->getDeviceBitfield());
mockCommandStreamReceiver.waitForCompletionWithTimeoutReturnValue = WaitStatus::GpuHang;
const auto oldCsr = commandList->csr;
commandList->csr = &mockCommandStreamReceiver;
returnValue = commandList->appendSignalEvent(event);
EXPECT_EQ(ZE_RESULT_ERROR_DEVICE_LOST, returnValue);
commandList->csr = oldCsr;
}
HWTEST_F(CommandListCreate, GivenGpuHangWhenCreatingImmediateCommandListAndAppendingEventResetThenDeviceLostIsReturned) {
ze_command_queue_desc_t desc = {};
desc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS;
ze_result_t returnValue;
std::unique_ptr<L0::CommandList> commandList(CommandList::createImmediate(productFamily, device, &desc, false, NEO::EngineGroupType::RenderCompute, returnValue));
ASSERT_EQ(ZE_RESULT_SUCCESS, returnValue);
ASSERT_NE(nullptr, commandList);
EXPECT_EQ(device, commandList->device);
EXPECT_EQ(CommandList::CommandListType::TYPE_IMMEDIATE, commandList->cmdListType);
EXPECT_NE(nullptr, commandList->cmdQImmediate);
ze_event_pool_desc_t eventPoolDesc = {};
eventPoolDesc.count = 1;
eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
ze_event_desc_t eventDesc = {};
eventDesc.index = 0;
eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
eventDesc.wait = ZE_EVENT_SCOPE_FLAG_HOST;
ze_event_handle_t event = nullptr;
std::unique_ptr<L0::EventPool> eventPool(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue));
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
ASSERT_NE(nullptr, eventPool);
eventPool->createEvent(&eventDesc, &event);
std::unique_ptr<L0::Event> event_object(L0::Event::fromHandle(event));
ASSERT_NE(nullptr, event_object->csr);
ASSERT_EQ(static_cast<DeviceImp *>(device)->getNEODevice()->getDefaultEngine().commandStreamReceiver, event_object->csr);
returnValue = commandList->appendWaitOnEvents(1, &event);
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
returnValue = commandList->appendBarrier(nullptr, 1, &event);
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
returnValue = commandList->appendSignalEvent(event);
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
returnValue = event_object->hostSignal();
ASSERT_EQ(ZE_RESULT_SUCCESS, returnValue);
EXPECT_EQ(ZE_RESULT_SUCCESS, event_object->queryStatus());
MockCommandStreamReceiver mockCommandStreamReceiver(*neoDevice->executionEnvironment, neoDevice->getRootDeviceIndex(), neoDevice->getDeviceBitfield());
mockCommandStreamReceiver.waitForCompletionWithTimeoutReturnValue = WaitStatus::GpuHang;
const auto oldCsr = commandList->csr;
commandList->csr = &mockCommandStreamReceiver;
returnValue = commandList->appendEventReset(event);
EXPECT_EQ(ZE_RESULT_ERROR_DEVICE_LOST, returnValue);
commandList->csr = oldCsr;
}
HWTEST_F(CommandListCreate, GivenGpuHangAndEnabledFlushTaskSubmissionFlagWhenCreatingImmediateCommandListAndAppendingWaitOnEventsThenDeviceLostIsReturned) {
DebugManagerStateRestore restorer;
NEO::DebugManager.flags.EnableFlushTaskSubmission.set(true);
ze_command_queue_desc_t desc = {};
desc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS;
ze_result_t returnValue;
std::unique_ptr<L0::CommandList> commandList(CommandList::createImmediate(productFamily, device, &desc, false, NEO::EngineGroupType::RenderCompute, returnValue));
ASSERT_NE(nullptr, commandList);
ASSERT_EQ(ZE_RESULT_SUCCESS, returnValue);
EXPECT_EQ(device, commandList->device);
EXPECT_EQ(1u, commandList->cmdListType);
EXPECT_NE(nullptr, commandList->cmdQImmediate);
ze_event_pool_desc_t eventPoolDesc = {};
eventPoolDesc.count = 1;
eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE | ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
ze_event_desc_t eventDesc = {};
eventDesc.index = 0;
eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
eventDesc.wait = ZE_EVENT_SCOPE_FLAG_HOST;
ze_event_handle_t event = nullptr;
std::unique_ptr<L0::EventPool> eventPool(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue));
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
ASSERT_NE(nullptr, eventPool);
eventPool->createEvent(&eventDesc, &event);
std::unique_ptr<L0::Event> event_object(L0::Event::fromHandle(event));
ASSERT_NE(nullptr, event_object->csr);
ASSERT_EQ(static_cast<DeviceImp *>(device)->getNEODevice()->getDefaultEngine().commandStreamReceiver, event_object->csr);
MockCommandStreamReceiver mockCommandStreamReceiver(*neoDevice->executionEnvironment, neoDevice->getRootDeviceIndex(), neoDevice->getDeviceBitfield());
mockCommandStreamReceiver.waitForCompletionWithTimeoutReturnValue = WaitStatus::GpuHang;
const auto oldCsr = commandList->csr;
commandList->csr = &mockCommandStreamReceiver;
returnValue = commandList->appendWaitOnEvents(1, &event);
EXPECT_EQ(ZE_RESULT_ERROR_DEVICE_LOST, returnValue);
commandList->csr = oldCsr;
}
TEST_F(CommandListCreate, whenCreatingImmCmdListWithSyncModeAndAppendResetEventThenUpdateTaskCountNeededFlagIsDisabled) {
ze_command_queue_desc_t desc = {};
desc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS;

View File

@@ -54,7 +54,7 @@ class MockCommandStreamReceiver : public CommandStreamReceiver {
WaitStatus waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait) override {
waitForCompletionWithTimeoutCalled++;
return NEO::WaitStatus::Ready;
return waitForCompletionWithTimeoutReturnValue;
}
SubmissionStatus flush(BatchBuffer &batchBuffer, ResidencyContainer &allocationsForResidency) override;
@@ -167,6 +167,7 @@ class MockCommandStreamReceiver : public CommandStreamReceiver {
bool programComputeBarrierCommandCalled = false;
std::optional<bool> isGpuHangDetectedReturnValue{};
std::optional<bool> testTaskCountReadyReturnValue{};
WaitStatus waitForCompletionWithTimeoutReturnValue{WaitStatus::Ready};
};
class MockCommandStreamReceiverWithFailingSubmitBatch : public MockCommandStreamReceiver {