mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-20 08:53:55 +08:00
Detect GPU hang in command list immediate
This change introduces checking of values returned by blocking calls used in cmdlist_hw_immediate.inl. Signed-off-by: Patryk Wrobel <patryk.wrobel@intel.com> Related-To: NEO-6681
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
f3bf5498a4
commit
14954acd12
@@ -7,6 +7,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "shared/source/command_stream/wait_status.h"
|
||||
#include "shared/source/helpers/hw_helper.h"
|
||||
#include "shared/source/helpers/hw_info.h"
|
||||
#include "shared/source/memory_manager/internal_allocation_storage.h"
|
||||
@@ -113,7 +114,10 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::executeCommandListImm
|
||||
|
||||
if (this->isSyncModeQueue) {
|
||||
auto timeoutMicroseconds = NEO::TimeoutControls::maxTimeout;
|
||||
this->csr->waitForCompletionWithTimeout(false, timeoutMicroseconds, completionStamp.taskCount);
|
||||
const auto waitStatus = this->csr->waitForCompletionWithTimeout(false, timeoutMicroseconds, completionStamp.taskCount);
|
||||
if (waitStatus == NEO::WaitStatus::GpuHang) {
|
||||
return ZE_RESULT_ERROR_DEVICE_LOST;
|
||||
}
|
||||
this->csr->getInternalAllocationStorage()->cleanAllocationList(completionStamp.taskCount, NEO::AllocationUsage::TEMPORARY_ALLOCATION);
|
||||
}
|
||||
|
||||
@@ -180,7 +184,10 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendBarrier(
|
||||
this->csr->flushNonKernelTask(nullptr, 0, 0, args, false, false, false);
|
||||
if (this->isSyncModeQueue) {
|
||||
auto timeoutMicroseconds = NEO::TimeoutControls::maxTimeout;
|
||||
this->csr->waitForCompletionWithTimeout(false, timeoutMicroseconds, this->csr->peekTaskCount());
|
||||
const auto waitStatus = this->csr->waitForCompletionWithTimeout(false, timeoutMicroseconds, this->csr->peekTaskCount());
|
||||
if (waitStatus == NEO::WaitStatus::GpuHang) {
|
||||
return ZE_RESULT_ERROR_DEVICE_LOST;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
ret = CommandListCoreFamilyImmediate<gfxCoreFamily>::appendSignalEvent(hSignalEvent);
|
||||
@@ -263,7 +270,10 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendSignalEvent(ze_
|
||||
this->csr->flushNonKernelTask(&event->getAllocation(this->device), event->getGpuAddress(this->device), Event::STATE_SIGNALED, args, false, false, false);
|
||||
if (this->isSyncModeQueue) {
|
||||
auto timeoutMicroseconds = NEO::TimeoutControls::maxTimeout;
|
||||
this->csr->waitForCompletionWithTimeout(false, timeoutMicroseconds, this->csr->peekTaskCount());
|
||||
const auto waitStatus = this->csr->waitForCompletionWithTimeout(false, timeoutMicroseconds, this->csr->peekTaskCount());
|
||||
if (waitStatus == NEO::WaitStatus::GpuHang) {
|
||||
return ZE_RESULT_ERROR_DEVICE_LOST;
|
||||
}
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
@@ -289,7 +299,10 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendEventReset(ze_e
|
||||
this->csr->flushNonKernelTask(&event->getAllocation(this->device), event->getGpuAddress(this->device), Event::STATE_CLEARED, args, false, false, false);
|
||||
if (this->isSyncModeQueue) {
|
||||
auto timeoutMicroseconds = NEO::TimeoutControls::maxTimeout;
|
||||
this->csr->waitForCompletionWithTimeout(false, timeoutMicroseconds, this->csr->peekTaskCount());
|
||||
const auto waitStatus = this->csr->waitForCompletionWithTimeout(false, timeoutMicroseconds, this->csr->peekTaskCount());
|
||||
if (waitStatus == NEO::WaitStatus::GpuHang) {
|
||||
return ZE_RESULT_ERROR_DEVICE_LOST;
|
||||
}
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
|
||||
@@ -5,9 +5,11 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#include "shared/source/command_stream/wait_status.h"
|
||||
#include "shared/source/gmm_helper/gmm_helper.h"
|
||||
#include "shared/test/common/cmd_parse/gen_cmd_parse.h"
|
||||
#include "shared/test/common/helpers/unit_test_helper.h"
|
||||
#include "shared/test/common/mocks/mock_command_stream_receiver.h"
|
||||
#include "shared/test/common/mocks/mock_memory_manager.h"
|
||||
#include "shared/test/common/test_macros/test.h"
|
||||
#include "shared/test/unit_test/page_fault_manager/mock_cpu_page_fault_manager.h"
|
||||
@@ -926,6 +928,194 @@ TEST_F(CommandListCreate, whenCreatingImmCmdListWithSyncModeAndAppendBarrierThen
|
||||
commandList->appendBarrier(nullptr, 0, nullptr);
|
||||
}
|
||||
|
||||
TEST_F(CommandListCreate, GivenGpuHangWhenCreatingImmCmdListWithSyncModeAndAppendBarrierThenAppendBarrierReturnsDeviceLost) {
|
||||
ze_command_queue_desc_t desc = {};
|
||||
desc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS;
|
||||
|
||||
ze_result_t returnValue;
|
||||
std::unique_ptr<L0::CommandList> commandList(CommandList::createImmediate(productFamily, device, &desc, false, NEO::EngineGroupType::RenderCompute, returnValue));
|
||||
|
||||
ASSERT_EQ(ZE_RESULT_SUCCESS, returnValue);
|
||||
ASSERT_NE(nullptr, commandList);
|
||||
|
||||
EXPECT_EQ(device, commandList->device);
|
||||
EXPECT_EQ(CommandList::CommandListType::TYPE_IMMEDIATE, commandList->cmdListType);
|
||||
EXPECT_NE(nullptr, commandList->cmdQImmediate);
|
||||
|
||||
MockCommandStreamReceiver mockCommandStreamReceiver(*neoDevice->executionEnvironment, neoDevice->getRootDeviceIndex(), neoDevice->getDeviceBitfield());
|
||||
mockCommandStreamReceiver.waitForCompletionWithTimeoutReturnValue = WaitStatus::GpuHang;
|
||||
|
||||
const auto oldCsr = commandList->csr;
|
||||
commandList->csr = &mockCommandStreamReceiver;
|
||||
|
||||
const auto appendBarrierResult = commandList->appendBarrier(nullptr, 0, nullptr);
|
||||
EXPECT_EQ(ZE_RESULT_ERROR_DEVICE_LOST, appendBarrierResult);
|
||||
|
||||
commandList->csr = oldCsr;
|
||||
}
|
||||
|
||||
HWTEST_F(CommandListCreate, GivenGpuHangWhenCreatingImmediateCommandListAndAppendingSignalEventsThenDeviceLostIsReturned) {
|
||||
ze_command_queue_desc_t desc = {};
|
||||
desc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS;
|
||||
|
||||
ze_result_t returnValue;
|
||||
std::unique_ptr<L0::CommandList> commandList(CommandList::createImmediate(productFamily, device, &desc, false, NEO::EngineGroupType::RenderCompute, returnValue));
|
||||
|
||||
ASSERT_EQ(ZE_RESULT_SUCCESS, returnValue);
|
||||
ASSERT_NE(nullptr, commandList);
|
||||
|
||||
EXPECT_EQ(device, commandList->device);
|
||||
EXPECT_EQ(CommandList::CommandListType::TYPE_IMMEDIATE, commandList->cmdListType);
|
||||
EXPECT_NE(nullptr, commandList->cmdQImmediate);
|
||||
|
||||
ze_event_pool_desc_t eventPoolDesc = {};
|
||||
eventPoolDesc.count = 1;
|
||||
eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
|
||||
|
||||
ze_event_desc_t eventDesc = {};
|
||||
eventDesc.index = 0;
|
||||
eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
|
||||
eventDesc.wait = ZE_EVENT_SCOPE_FLAG_HOST;
|
||||
|
||||
ze_event_handle_t event = nullptr;
|
||||
|
||||
std::unique_ptr<L0::EventPool> eventPool(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue));
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
|
||||
ASSERT_NE(nullptr, eventPool);
|
||||
|
||||
eventPool->createEvent(&eventDesc, &event);
|
||||
|
||||
std::unique_ptr<L0::Event> event_object(L0::Event::fromHandle(event));
|
||||
ASSERT_NE(nullptr, event_object->csr);
|
||||
ASSERT_EQ(static_cast<DeviceImp *>(device)->getNEODevice()->getDefaultEngine().commandStreamReceiver, event_object->csr);
|
||||
|
||||
returnValue = commandList->appendWaitOnEvents(1, &event);
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
|
||||
|
||||
returnValue = commandList->appendBarrier(nullptr, 1, &event);
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
|
||||
|
||||
MockCommandStreamReceiver mockCommandStreamReceiver(*neoDevice->executionEnvironment, neoDevice->getRootDeviceIndex(), neoDevice->getDeviceBitfield());
|
||||
mockCommandStreamReceiver.waitForCompletionWithTimeoutReturnValue = WaitStatus::GpuHang;
|
||||
|
||||
const auto oldCsr = commandList->csr;
|
||||
commandList->csr = &mockCommandStreamReceiver;
|
||||
|
||||
returnValue = commandList->appendSignalEvent(event);
|
||||
EXPECT_EQ(ZE_RESULT_ERROR_DEVICE_LOST, returnValue);
|
||||
|
||||
commandList->csr = oldCsr;
|
||||
}
|
||||
|
||||
HWTEST_F(CommandListCreate, GivenGpuHangWhenCreatingImmediateCommandListAndAppendingEventResetThenDeviceLostIsReturned) {
|
||||
ze_command_queue_desc_t desc = {};
|
||||
desc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS;
|
||||
|
||||
ze_result_t returnValue;
|
||||
std::unique_ptr<L0::CommandList> commandList(CommandList::createImmediate(productFamily, device, &desc, false, NEO::EngineGroupType::RenderCompute, returnValue));
|
||||
|
||||
ASSERT_EQ(ZE_RESULT_SUCCESS, returnValue);
|
||||
ASSERT_NE(nullptr, commandList);
|
||||
|
||||
EXPECT_EQ(device, commandList->device);
|
||||
EXPECT_EQ(CommandList::CommandListType::TYPE_IMMEDIATE, commandList->cmdListType);
|
||||
EXPECT_NE(nullptr, commandList->cmdQImmediate);
|
||||
|
||||
ze_event_pool_desc_t eventPoolDesc = {};
|
||||
eventPoolDesc.count = 1;
|
||||
eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
|
||||
|
||||
ze_event_desc_t eventDesc = {};
|
||||
eventDesc.index = 0;
|
||||
eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
|
||||
eventDesc.wait = ZE_EVENT_SCOPE_FLAG_HOST;
|
||||
|
||||
ze_event_handle_t event = nullptr;
|
||||
|
||||
std::unique_ptr<L0::EventPool> eventPool(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue));
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
|
||||
ASSERT_NE(nullptr, eventPool);
|
||||
|
||||
eventPool->createEvent(&eventDesc, &event);
|
||||
|
||||
std::unique_ptr<L0::Event> event_object(L0::Event::fromHandle(event));
|
||||
ASSERT_NE(nullptr, event_object->csr);
|
||||
ASSERT_EQ(static_cast<DeviceImp *>(device)->getNEODevice()->getDefaultEngine().commandStreamReceiver, event_object->csr);
|
||||
|
||||
returnValue = commandList->appendWaitOnEvents(1, &event);
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
|
||||
|
||||
returnValue = commandList->appendBarrier(nullptr, 1, &event);
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
|
||||
|
||||
returnValue = commandList->appendSignalEvent(event);
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
|
||||
|
||||
returnValue = event_object->hostSignal();
|
||||
ASSERT_EQ(ZE_RESULT_SUCCESS, returnValue);
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, event_object->queryStatus());
|
||||
|
||||
MockCommandStreamReceiver mockCommandStreamReceiver(*neoDevice->executionEnvironment, neoDevice->getRootDeviceIndex(), neoDevice->getDeviceBitfield());
|
||||
mockCommandStreamReceiver.waitForCompletionWithTimeoutReturnValue = WaitStatus::GpuHang;
|
||||
|
||||
const auto oldCsr = commandList->csr;
|
||||
commandList->csr = &mockCommandStreamReceiver;
|
||||
|
||||
returnValue = commandList->appendEventReset(event);
|
||||
EXPECT_EQ(ZE_RESULT_ERROR_DEVICE_LOST, returnValue);
|
||||
|
||||
commandList->csr = oldCsr;
|
||||
}
|
||||
|
||||
HWTEST_F(CommandListCreate, GivenGpuHangAndEnabledFlushTaskSubmissionFlagWhenCreatingImmediateCommandListAndAppendingWaitOnEventsThenDeviceLostIsReturned) {
|
||||
DebugManagerStateRestore restorer;
|
||||
NEO::DebugManager.flags.EnableFlushTaskSubmission.set(true);
|
||||
|
||||
ze_command_queue_desc_t desc = {};
|
||||
desc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS;
|
||||
|
||||
ze_result_t returnValue;
|
||||
std::unique_ptr<L0::CommandList> commandList(CommandList::createImmediate(productFamily, device, &desc, false, NEO::EngineGroupType::RenderCompute, returnValue));
|
||||
ASSERT_NE(nullptr, commandList);
|
||||
ASSERT_EQ(ZE_RESULT_SUCCESS, returnValue);
|
||||
|
||||
EXPECT_EQ(device, commandList->device);
|
||||
EXPECT_EQ(1u, commandList->cmdListType);
|
||||
EXPECT_NE(nullptr, commandList->cmdQImmediate);
|
||||
|
||||
ze_event_pool_desc_t eventPoolDesc = {};
|
||||
eventPoolDesc.count = 1;
|
||||
eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE | ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
|
||||
|
||||
ze_event_desc_t eventDesc = {};
|
||||
eventDesc.index = 0;
|
||||
eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
|
||||
eventDesc.wait = ZE_EVENT_SCOPE_FLAG_HOST;
|
||||
|
||||
ze_event_handle_t event = nullptr;
|
||||
|
||||
std::unique_ptr<L0::EventPool> eventPool(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue));
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
|
||||
ASSERT_NE(nullptr, eventPool);
|
||||
|
||||
eventPool->createEvent(&eventDesc, &event);
|
||||
|
||||
std::unique_ptr<L0::Event> event_object(L0::Event::fromHandle(event));
|
||||
ASSERT_NE(nullptr, event_object->csr);
|
||||
ASSERT_EQ(static_cast<DeviceImp *>(device)->getNEODevice()->getDefaultEngine().commandStreamReceiver, event_object->csr);
|
||||
|
||||
MockCommandStreamReceiver mockCommandStreamReceiver(*neoDevice->executionEnvironment, neoDevice->getRootDeviceIndex(), neoDevice->getDeviceBitfield());
|
||||
mockCommandStreamReceiver.waitForCompletionWithTimeoutReturnValue = WaitStatus::GpuHang;
|
||||
|
||||
const auto oldCsr = commandList->csr;
|
||||
commandList->csr = &mockCommandStreamReceiver;
|
||||
|
||||
returnValue = commandList->appendWaitOnEvents(1, &event);
|
||||
EXPECT_EQ(ZE_RESULT_ERROR_DEVICE_LOST, returnValue);
|
||||
|
||||
commandList->csr = oldCsr;
|
||||
}
|
||||
|
||||
TEST_F(CommandListCreate, whenCreatingImmCmdListWithSyncModeAndAppendResetEventThenUpdateTaskCountNeededFlagIsDisabled) {
|
||||
ze_command_queue_desc_t desc = {};
|
||||
desc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS;
|
||||
|
||||
@@ -54,7 +54,7 @@ class MockCommandStreamReceiver : public CommandStreamReceiver {
|
||||
|
||||
WaitStatus waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait) override {
|
||||
waitForCompletionWithTimeoutCalled++;
|
||||
return NEO::WaitStatus::Ready;
|
||||
return waitForCompletionWithTimeoutReturnValue;
|
||||
}
|
||||
SubmissionStatus flush(BatchBuffer &batchBuffer, ResidencyContainer &allocationsForResidency) override;
|
||||
|
||||
@@ -167,6 +167,7 @@ class MockCommandStreamReceiver : public CommandStreamReceiver {
|
||||
bool programComputeBarrierCommandCalled = false;
|
||||
std::optional<bool> isGpuHangDetectedReturnValue{};
|
||||
std::optional<bool> testTaskCountReadyReturnValue{};
|
||||
WaitStatus waitForCompletionWithTimeoutReturnValue{WaitStatus::Ready};
|
||||
};
|
||||
|
||||
class MockCommandStreamReceiverWithFailingSubmitBatch : public MockCommandStreamReceiver {
|
||||
|
||||
Reference in New Issue
Block a user