mirror of
https://github.com/intel/compute-runtime.git
synced 2025-09-15 13:01:45 +08:00
Detect GPU hang in remaining calls of command queue and list
This change introduces checking of the wait status in the CommandQueue and CommandList classes, so that a detected GPU hang is reported to the caller. Related-To: NEO-6681 Signed-off-by: Patryk Wrobel <patryk.wrobel@intel.com>
This commit is contained in:

committed by
Compute-Runtime-Automation

parent
9e7703a578
commit
19dded25ef
@ -148,8 +148,17 @@ template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
ze_result_t CommandListCoreFamily<gfxCoreFamily>::executeCommandListImmediate(bool performMigration) {
|
||||
this->close();
|
||||
ze_command_list_handle_t immediateHandle = this->toHandle();
|
||||
this->cmdQImmediate->executeCommandLists(1, &immediateHandle, nullptr, performMigration);
|
||||
this->cmdQImmediate->synchronize(std::numeric_limits<uint64_t>::max());
|
||||
|
||||
const auto commandListExecutionResult = this->cmdQImmediate->executeCommandLists(1, &immediateHandle, nullptr, performMigration);
|
||||
if (commandListExecutionResult == ZE_RESULT_ERROR_DEVICE_LOST) {
|
||||
return commandListExecutionResult;
|
||||
}
|
||||
|
||||
const auto synchronizationResult = this->cmdQImmediate->synchronize(std::numeric_limits<uint64_t>::max());
|
||||
if (synchronizationResult == ZE_RESULT_ERROR_DEVICE_LOST) {
|
||||
return synchronizationResult;
|
||||
}
|
||||
|
||||
this->reset();
|
||||
|
||||
return ZE_RESULT_SUCCESS;
|
||||
|
@ -66,15 +66,20 @@ ze_result_t CommandQueueImp::initialize(bool copyOnly, bool isInternal) {
|
||||
return returnValue;
|
||||
}
|
||||
|
||||
void CommandQueueImp::reserveLinearStreamSize(size_t size) {
|
||||
NEO::WaitStatus CommandQueueImp::reserveLinearStreamSize(size_t size) {
|
||||
auto waitStatus{NEO::WaitStatus::Ready};
|
||||
|
||||
UNRECOVERABLE_IF(commandStream == nullptr);
|
||||
if (commandStream->getAvailableSpace() < size) {
|
||||
buffers.switchBuffers(csr);
|
||||
waitStatus = buffers.switchBuffers(csr);
|
||||
|
||||
NEO::GraphicsAllocation *nextBufferAllocation = buffers.getCurrentBufferAllocation();
|
||||
commandStream->replaceBuffer(nextBufferAllocation->getUnderlyingBuffer(),
|
||||
defaultQueueCmdBufferSize);
|
||||
commandStream->replaceGraphicsAllocation(nextBufferAllocation);
|
||||
}
|
||||
|
||||
return waitStatus;
|
||||
}
|
||||
|
||||
NEO::SubmissionStatus CommandQueueImp::submitBatchBuffer(size_t offset, NEO::ResidencyContainer &residencyContainer, void *endingCmdPtr,
|
||||
@ -230,18 +235,21 @@ void CommandQueueImp::CommandBufferManager::destroy(Device *device) {
|
||||
}
|
||||
}
|
||||
|
||||
void CommandQueueImp::CommandBufferManager::switchBuffers(NEO::CommandStreamReceiver *csr) {
|
||||
NEO::WaitStatus CommandQueueImp::CommandBufferManager::switchBuffers(NEO::CommandStreamReceiver *csr) {
|
||||
if (bufferUse == BUFFER_ALLOCATION::FIRST) {
|
||||
bufferUse = BUFFER_ALLOCATION::SECOND;
|
||||
} else {
|
||||
bufferUse = BUFFER_ALLOCATION::FIRST;
|
||||
}
|
||||
|
||||
auto waitStatus{NEO::WaitStatus::Ready};
|
||||
auto completionId = flushId[bufferUse];
|
||||
if (completionId.second != 0u) {
|
||||
UNRECOVERABLE_IF(csr == nullptr);
|
||||
csr->waitForTaskCountWithKmdNotifyFallback(completionId.first, completionId.second, false, NEO::QueueThrottle::MEDIUM);
|
||||
waitStatus = csr->waitForTaskCountWithKmdNotifyFallback(completionId.first, completionId.second, false, NEO::QueueThrottle::MEDIUM);
|
||||
}
|
||||
|
||||
return waitStatus;
|
||||
}
|
||||
|
||||
} // namespace L0
|
||||
|
@ -16,6 +16,7 @@
|
||||
#include "shared/source/command_stream/preemption.h"
|
||||
#include "shared/source/command_stream/submission_status.h"
|
||||
#include "shared/source/command_stream/thread_arbitration_policy.h"
|
||||
#include "shared/source/command_stream/wait_status.h"
|
||||
#include "shared/source/device/device.h"
|
||||
#include "shared/source/helpers/hw_helper.h"
|
||||
#include "shared/source/helpers/hw_info.h"
|
||||
@ -270,7 +271,12 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
|
||||
|
||||
size_t alignedSize = alignUp<size_t>(linearStreamSizeEstimate, minCmdBufferPtrAlign);
|
||||
size_t padding = alignedSize - linearStreamSizeEstimate;
|
||||
reserveLinearStreamSize(alignedSize);
|
||||
|
||||
const auto waitStatus = reserveLinearStreamSize(alignedSize);
|
||||
if (waitStatus == NEO::WaitStatus::GpuHang) {
|
||||
return ZE_RESULT_ERROR_DEVICE_LOST;
|
||||
}
|
||||
|
||||
NEO::LinearStream child(commandStream->getSpace(alignedSize), alignedSize);
|
||||
child.setGpuBase(ptrOffset(commandStream->getGpuBase(), commandStream->getUsed() - alignedSize));
|
||||
|
||||
|
@ -10,6 +10,7 @@
|
||||
#include "shared/source/command_stream/csr_definitions.h"
|
||||
#include "shared/source/command_stream/submission_status.h"
|
||||
#include "shared/source/command_stream/submissions_aggregator.h"
|
||||
#include "shared/source/command_stream/wait_status.h"
|
||||
#include "shared/source/helpers/constants.h"
|
||||
#include "shared/source/indirect_heap/indirect_heap.h"
|
||||
|
||||
@ -38,7 +39,7 @@ struct CommandQueueImp : public CommandQueue {
|
||||
|
||||
ze_result_t initialize(Device *device, size_t sizeRequested);
|
||||
void destroy(Device *device);
|
||||
void switchBuffers(NEO::CommandStreamReceiver *csr);
|
||||
NEO::WaitStatus switchBuffers(NEO::CommandStreamReceiver *csr);
|
||||
|
||||
NEO::GraphicsAllocation *getCurrentBufferAllocation() {
|
||||
return buffers[bufferUse];
|
||||
@ -78,7 +79,7 @@ struct CommandQueueImp : public CommandQueue {
|
||||
|
||||
NEO::CommandStreamReceiver *getCsr() { return csr; }
|
||||
|
||||
void reserveLinearStreamSize(size_t size);
|
||||
MOCKABLE_VIRTUAL NEO::WaitStatus reserveLinearStreamSize(size_t size);
|
||||
ze_command_queue_mode_t getSynchronousMode() const;
|
||||
virtual bool getPreemptionCmdProgramming() = 0;
|
||||
|
||||
|
@ -126,8 +126,11 @@ ze_result_t DeviceImp::canAccessPeer(ze_device_handle_t hPeerDevice, ze_bool_t *
|
||||
if (ret == ZE_RESULT_SUCCESS) {
|
||||
this->crossAccessEnabledDevices[peerRootDeviceIndex] = true;
|
||||
pPeerDevice->crossAccessEnabledDevices[this->getNEODevice()->getRootDeviceIndex()] = true;
|
||||
L0::CommandQueue::fromHandle(commandQueue)->synchronize(std::numeric_limits<uint64_t>::max());
|
||||
*value = true;
|
||||
|
||||
ret = L0::CommandQueue::fromHandle(commandQueue)->synchronize(std::numeric_limits<uint64_t>::max());
|
||||
if (ret == ZE_RESULT_SUCCESS) {
|
||||
*value = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -137,6 +140,10 @@ ze_result_t DeviceImp::canAccessPeer(ze_device_handle_t hPeerDevice, ze_bool_t *
|
||||
L0::Context::fromHandle(context)->destroy();
|
||||
L0::CommandQueue::fromHandle(commandQueue)->destroy();
|
||||
L0::CommandList::fromHandle(commandList)->destroy();
|
||||
|
||||
if (ret == ZE_RESULT_ERROR_DEVICE_LOST) {
|
||||
return ZE_RESULT_ERROR_DEVICE_LOST;
|
||||
}
|
||||
}
|
||||
|
||||
return ZE_RESULT_SUCCESS;
|
||||
|
@ -6,6 +6,7 @@
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "shared/test/common/test_macros/mock_method_macros.h"
|
||||
|
||||
#include "level_zero/core/source/cmdqueue/cmdqueue_hw.h"
|
||||
@ -13,6 +14,9 @@
|
||||
#include "level_zero/core/test/unit_tests/mock.h"
|
||||
#include "level_zero/core/test/unit_tests/white_box.h"
|
||||
|
||||
#include <cstddef>
|
||||
#include <optional>
|
||||
|
||||
namespace L0 {
|
||||
namespace ult {
|
||||
|
||||
@ -71,6 +75,14 @@ struct MockCommandQueueHw : public L0::CommandQueueHw<gfxCoreFamily> {
|
||||
return synchronizeReturnValue;
|
||||
}
|
||||
|
||||
// Test hook for reserveLinearStreamSize(): when reserveLinearStreamSizeReturnValue
// holds a value, that value is returned and the base implementation is not called;
// otherwise the call is forwarded to BaseClass unchanged.
NEO::WaitStatus reserveLinearStreamSize(size_t size) override {
    // Nothing injected by the test -> run the real implementation.
    if (!reserveLinearStreamSizeReturnValue) {
        return BaseClass::reserveLinearStreamSize(size);
    }

    return *reserveLinearStreamSizeReturnValue;
}
|
||||
|
||||
NEO::SubmissionStatus submitBatchBuffer(size_t offset, NEO::ResidencyContainer &residencyContainer, void *endingCmdPtr, bool isCooperative) override {
|
||||
residencyContainerSnapshot = residencyContainer;
|
||||
return BaseClass::submitBatchBuffer(offset, residencyContainer, endingCmdPtr, isCooperative);
|
||||
@ -79,6 +91,7 @@ struct MockCommandQueueHw : public L0::CommandQueueHw<gfxCoreFamily> {
|
||||
uint32_t synchronizedCalled = 0;
|
||||
NEO::ResidencyContainer residencyContainerSnapshot;
|
||||
ze_result_t synchronizeReturnValue{ZE_RESULT_SUCCESS};
|
||||
std::optional<NEO::WaitStatus> reserveLinearStreamSizeReturnValue{};
|
||||
};
|
||||
|
||||
struct Deleter {
|
||||
@ -86,5 +99,6 @@ struct Deleter {
|
||||
cmdQ->destroy();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace ult
|
||||
} // namespace L0
|
@ -17,6 +17,7 @@
|
||||
#include "level_zero/core/source/cmdqueue/cmdqueue_imp.h"
|
||||
#include "level_zero/core/test/unit_tests/fixtures/device_fixture.h"
|
||||
#include "level_zero/core/test/unit_tests/mocks/mock_cmdlist.h"
|
||||
#include "level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h"
|
||||
|
||||
namespace L0 {
|
||||
namespace ult {
|
||||
@ -1018,6 +1019,102 @@ HWTEST_F(CommandListCreate, GivenGpuHangWhenCreatingImmediateCommandListAndAppen
|
||||
commandList->csr = oldCsr;
|
||||
}
|
||||
|
||||
// Expects appendWaitOnEvents() on an immediate command list to return
// ZE_RESULT_ERROR_DEVICE_LOST when the queue behind the list is a mock whose
// executeCommandListsResult is set to ZE_RESULT_ERROR_DEVICE_LOST.
HWTEST2_F(CommandListCreate, GivenGpuHangOnExecutingCommandListsWhenCreatingImmediateCommandListAndWaitingOnEventsThenDeviceLostIsReturned, IsSKL) {
    ze_command_queue_desc_t queueDesc = {};
    queueDesc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS;

    ze_result_t status;
    std::unique_ptr<L0::CommandList> immediateList(
        CommandList::createImmediate(productFamily, device, &queueDesc, false, NEO::EngineGroupType::RenderCompute, status));

    ASSERT_EQ(ZE_RESULT_SUCCESS, status);
    ASSERT_NE(nullptr, immediateList);

    EXPECT_EQ(device, immediateList->device);
    EXPECT_EQ(CommandList::CommandListType::TYPE_IMMEDIATE, immediateList->cmdListType);
    EXPECT_NE(nullptr, immediateList->cmdQImmediate);

    // Single host-visible event used as the wait target.
    ze_event_pool_desc_t poolDesc = {};
    poolDesc.count = 1;
    poolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;

    ze_event_desc_t hostEventDesc = {};
    hostEventDesc.index = 0;
    hostEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
    hostEventDesc.wait = ZE_EVENT_SCOPE_FLAG_HOST;

    ze_event_handle_t eventHandle = nullptr;

    std::unique_ptr<L0::EventPool> pool(EventPool::create(driverHandle.get(), context, 0, nullptr, &poolDesc, status));
    EXPECT_EQ(ZE_RESULT_SUCCESS, status);
    ASSERT_NE(nullptr, pool);

    pool->createEvent(&hostEventDesc, &eventHandle);

    std::unique_ptr<L0::Event> eventObject(L0::Event::fromHandle(eventHandle));
    ASSERT_NE(nullptr, eventObject->csr);
    ASSERT_EQ(static_cast<DeviceImp *>(device)->getNEODevice()->getDefaultEngine().commandStreamReceiver, eventObject->csr);

    // Mock queue configured to report a lost device from executeCommandLists().
    MockCommandStreamReceiver hangingCsr(*neoDevice->executionEnvironment, neoDevice->getRootDeviceIndex(), neoDevice->getDeviceBitfield());
    Mock<CommandQueue> hangingQueue(device, &hangingCsr, &queueDesc);
    hangingQueue.executeCommandListsResult = ZE_RESULT_ERROR_DEVICE_LOST;

    // Swap the mock in for the duration of the call under test only.
    auto originalQueue = immediateList->cmdQImmediate;
    immediateList->cmdQImmediate = &hangingQueue;

    status = immediateList->appendWaitOnEvents(1, &eventHandle);
    EXPECT_EQ(ZE_RESULT_ERROR_DEVICE_LOST, status);

    // Put the original queue back before the command list is destroyed.
    immediateList->cmdQImmediate = originalQueue;
}
|
||||
|
||||
// Expects appendWaitOnEvents() on an immediate command list to return
// ZE_RESULT_ERROR_DEVICE_LOST when the queue behind the list is a mock whose
// synchronizeResult is set to ZE_RESULT_ERROR_DEVICE_LOST.
HWTEST2_F(CommandListCreate, GivenGpuHangOnSynchronizingWhenCreatingImmediateCommandListAndWaitingOnEventsThenDeviceLostIsReturned, IsSKL) {
    ze_command_queue_desc_t queueDesc = {};
    queueDesc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS;

    ze_result_t status;
    std::unique_ptr<L0::CommandList> immediateList(
        CommandList::createImmediate(productFamily, device, &queueDesc, false, NEO::EngineGroupType::RenderCompute, status));

    ASSERT_EQ(ZE_RESULT_SUCCESS, status);
    ASSERT_NE(nullptr, immediateList);

    EXPECT_EQ(device, immediateList->device);
    EXPECT_EQ(CommandList::CommandListType::TYPE_IMMEDIATE, immediateList->cmdListType);
    EXPECT_NE(nullptr, immediateList->cmdQImmediate);

    // Single host-visible event used as the wait target.
    ze_event_pool_desc_t poolDesc = {};
    poolDesc.count = 1;
    poolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;

    ze_event_desc_t hostEventDesc = {};
    hostEventDesc.index = 0;
    hostEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
    hostEventDesc.wait = ZE_EVENT_SCOPE_FLAG_HOST;

    ze_event_handle_t eventHandle = nullptr;

    std::unique_ptr<L0::EventPool> pool(EventPool::create(driverHandle.get(), context, 0, nullptr, &poolDesc, status));
    EXPECT_EQ(ZE_RESULT_SUCCESS, status);
    ASSERT_NE(nullptr, pool);

    pool->createEvent(&hostEventDesc, &eventHandle);

    std::unique_ptr<L0::Event> eventObject(L0::Event::fromHandle(eventHandle));
    ASSERT_NE(nullptr, eventObject->csr);
    ASSERT_EQ(static_cast<DeviceImp *>(device)->getNEODevice()->getDefaultEngine().commandStreamReceiver, eventObject->csr);

    // Mock queue configured to report a lost device from synchronize().
    MockCommandStreamReceiver hangingCsr(*neoDevice->executionEnvironment, neoDevice->getRootDeviceIndex(), neoDevice->getDeviceBitfield());
    Mock<CommandQueue> hangingQueue(device, &hangingCsr, &queueDesc);
    hangingQueue.synchronizeResult = ZE_RESULT_ERROR_DEVICE_LOST;

    // Swap the mock in for the duration of the call under test only.
    auto originalQueue = immediateList->cmdQImmediate;
    immediateList->cmdQImmediate = &hangingQueue;

    status = immediateList->appendWaitOnEvents(1, &eventHandle);
    EXPECT_EQ(ZE_RESULT_ERROR_DEVICE_LOST, status);

    // Put the original queue back before the command list is destroyed.
    immediateList->cmdQImmediate = originalQueue;
}
|
||||
|
||||
HWTEST_F(CommandListCreate, GivenGpuHangWhenCreatingImmediateCommandListAndAppendingEventResetThenDeviceLostIsReturned) {
|
||||
DebugManagerStateRestore restorer;
|
||||
DebugManager.flags.EnableFlushTaskSubmission.set(1);
|
||||
|
@ -86,9 +86,51 @@ TEST_F(CommandQueueCreate, whenSynchronizeByPollingTaskCountThenCallsPrintOutput
|
||||
commandQueue->destroy();
|
||||
}
|
||||
|
||||
// Checks that reserveLinearStreamSize() returns the wait status reported by the
// command stream receiver: the first reservation is made while the CSR override
// is WaitStatus::Ready and must yield Ready, the second is made while the
// override is WaitStatus::GpuHang and must yield GpuHang.
HWTEST_F(CommandQueueCreate, givenGpuHangOnSecondReserveWhenReservingLinearStreamThenReturnGpuHang) {
    const ze_command_queue_desc_t desc{};
    ze_result_t returnValue;
    NEO::WaitStatus waitStatus{NEO::WaitStatus::NotReady};

    auto commandQueue = whitebox_cast(CommandQueue::create(productFamily,
                                                           device,
                                                           neoDevice->getDefaultEngine().commandStreamReceiver,
                                                           &desc,
                                                           false,
                                                           false,
                                                           returnValue));
    // The queue is dereferenced right below; stop here instead of crashing if creation failed.
    ASSERT_NE(nullptr, commandQueue);

    size_t maxSize = commandQueue->commandStream->getMaxAvailableSpace();

    auto firstAllocation = commandQueue->commandStream->getGraphicsAllocation();
    EXPECT_EQ(firstAllocation, commandQueue->buffers.getCurrentBufferAllocation());

    uint32_t currentTaskCount = 33u;
    auto &csr = neoDevice->getUltCommandStreamReceiver<FamilyType>();
    csr.latestWaitForCompletionWithTimeoutTaskCount = currentTaskCount;
    csr.waitForTaskCountWithKmdNotifyFallbackReturnValue = NEO::WaitStatus::Ready;

    // Leave only 16 bytes free so that asking for 32 forces a buffer switch.
    commandQueue->commandStream->getSpace(maxSize - 16u);
    commandQueue->buffers.setCurrentFlushStamp(121u, 121u);
    size_t nextSize = 32u;

    waitStatus = commandQueue->reserveLinearStreamSize(nextSize);
    EXPECT_EQ(NEO::WaitStatus::Ready, waitStatus);

    // From now on every wait performed through the CSR override reports a hang.
    csr.waitForTaskCountWithKmdNotifyFallbackReturnValue = NEO::WaitStatus::GpuHang;
    // Leave 32 bytes free so that asking for 64 forces another buffer switch.
    commandQueue->commandStream->getSpace(maxSize - 32u);
    commandQueue->buffers.setCurrentFlushStamp(128u, 128u);
    nextSize = 64u;

    waitStatus = commandQueue->reserveLinearStreamSize(nextSize);
    EXPECT_EQ(NEO::WaitStatus::GpuHang, waitStatus);

    commandQueue->destroy();
}
|
||||
|
||||
HWTEST_F(CommandQueueCreate, whenReserveLinearStreamThenBufferAllocationSwitched) {
|
||||
const ze_command_queue_desc_t desc{};
|
||||
ze_result_t returnValue;
|
||||
NEO::WaitStatus waitStatus{NEO::WaitStatus::NotReady};
|
||||
|
||||
auto commandQueue = whitebox_cast(CommandQueue::create(productFamily,
|
||||
device,
|
||||
@ -110,7 +152,9 @@ HWTEST_F(CommandQueueCreate, whenReserveLinearStreamThenBufferAllocationSwitched
|
||||
commandQueue->commandStream->getSpace(maxSize - 16u);
|
||||
commandQueue->buffers.setCurrentFlushStamp(121u, 121u);
|
||||
size_t nextSize = 16u + 16u;
|
||||
commandQueue->reserveLinearStreamSize(nextSize);
|
||||
|
||||
waitStatus = commandQueue->reserveLinearStreamSize(nextSize);
|
||||
EXPECT_EQ(NEO::WaitStatus::Ready, waitStatus);
|
||||
|
||||
auto secondAllocation = commandQueue->commandStream->getGraphicsAllocation();
|
||||
EXPECT_EQ(secondAllocation, commandQueue->buffers.getCurrentBufferAllocation());
|
||||
@ -119,7 +163,9 @@ HWTEST_F(CommandQueueCreate, whenReserveLinearStreamThenBufferAllocationSwitched
|
||||
|
||||
commandQueue->commandStream->getSpace(maxSize - 16u);
|
||||
commandQueue->buffers.setCurrentFlushStamp(244u, 244u);
|
||||
commandQueue->reserveLinearStreamSize(nextSize);
|
||||
|
||||
waitStatus = commandQueue->reserveLinearStreamSize(nextSize);
|
||||
EXPECT_EQ(NEO::WaitStatus::Ready, waitStatus);
|
||||
|
||||
auto thirdAllocation = commandQueue->commandStream->getGraphicsAllocation();
|
||||
EXPECT_EQ(thirdAllocation, commandQueue->buffers.getCurrentBufferAllocation());
|
||||
@ -128,7 +174,9 @@ HWTEST_F(CommandQueueCreate, whenReserveLinearStreamThenBufferAllocationSwitched
|
||||
EXPECT_EQ(csr.latestWaitForCompletionWithTimeoutTaskCount, 121u);
|
||||
|
||||
commandQueue->commandStream->getSpace(maxSize - 16u);
|
||||
commandQueue->reserveLinearStreamSize(nextSize);
|
||||
|
||||
waitStatus = commandQueue->reserveLinearStreamSize(nextSize);
|
||||
EXPECT_EQ(NEO::WaitStatus::Ready, waitStatus);
|
||||
|
||||
auto fourthAllocation = commandQueue->commandStream->getGraphicsAllocation();
|
||||
EXPECT_EQ(fourthAllocation, commandQueue->buffers.getCurrentBufferAllocation());
|
||||
@ -173,7 +221,9 @@ TEST_F(CommandQueueCreate, whenCmdBuffersAllocationsAreCreatedThenSizeIsNotLessT
|
||||
|
||||
commandQueue->commandStream->getSpace(maxSize - 16u);
|
||||
size_t nextSize = 16u + 16u;
|
||||
commandQueue->reserveLinearStreamSize(nextSize);
|
||||
|
||||
const auto waitStatus = commandQueue->reserveLinearStreamSize(nextSize);
|
||||
EXPECT_EQ(NEO::WaitStatus::Ready, waitStatus);
|
||||
|
||||
auto sizeSecondBuffer = commandQueue->buffers.getCurrentBufferAllocation()->getUnderlyingBufferSize();
|
||||
EXPECT_LE(maxSize, sizeSecondBuffer);
|
||||
@ -229,6 +279,27 @@ HWTEST_F(CommandQueueCreate, given100CmdListsWhenExecutingThenCommandStreamIsNot
|
||||
commandQueue->destroy();
|
||||
}
|
||||
|
||||
// Expects executeCommandLists() to return ZE_RESULT_ERROR_DEVICE_LOST when the
// mocked reserveLinearStreamSize() is forced to report WaitStatus::GpuHang.
HWTEST2_F(CommandQueueCreate, givenGpuHangInReservingLinearStreamWhenExecutingCommandListsThenDeviceLostIsReturned, IsSKL) {
    const ze_command_queue_desc_t queueDesc = {};
    MockCommandQueueHw<gfxCoreFamily> hangingQueue(device, neoDevice->getDefaultEngine().commandStreamReceiver, &queueDesc);
    hangingQueue.reserveLinearStreamSizeReturnValue = NEO::WaitStatus::GpuHang;

    Mock<Kernel> mockKernel;
    mockKernel.immutableData.device = device;

    ze_result_t creationStatus;
    std::unique_ptr<CommandList> regularList(
        whitebox_cast(CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, creationStatus)));
    ASSERT_NE(nullptr, regularList);

    // Record one kernel launch so the list has content to execute.
    ze_group_count_t groupCount{1, 1, 1};
    regularList->appendLaunchKernel(mockKernel.toHandle(), &groupCount, nullptr, 0, nullptr);

    ze_command_list_handle_t listHandles[1] = {regularList->toHandle()};

    const auto executionStatus = hangingQueue.executeCommandLists(1, listHandles, nullptr, false);
    EXPECT_EQ(ZE_RESULT_ERROR_DEVICE_LOST, executionStatus);
}
|
||||
|
||||
HWTEST_F(CommandQueueCreate, givenUpdateTaskCountFromWaitWhenDispatchTaskCountWriteThenNoPipeControlFlushed) {
|
||||
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
|
||||
using POST_SYNC_OPERATION = typename FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION;
|
||||
|
@ -6,6 +6,7 @@
|
||||
*/
|
||||
|
||||
#include "shared/source/command_container/implicit_scaling.h"
|
||||
#include "shared/source/command_stream/wait_status.h"
|
||||
#include "shared/source/device/root_device.h"
|
||||
#include "shared/source/helpers/bindless_heaps_helper.h"
|
||||
#include "shared/source/helpers/hw_helper.h"
|
||||
@ -14,6 +15,8 @@
|
||||
#include "shared/source/os_interface/os_inc_base.h"
|
||||
#include "shared/source/os_interface/os_time.h"
|
||||
#include "shared/test/common/helpers/debug_manager_state_restore.h"
|
||||
#include "shared/test/common/helpers/engine_descriptor_helper.h"
|
||||
#include "shared/test/common/libult/ult_command_stream_receiver.h"
|
||||
#include "shared/test/common/mocks/mock_compilers.h"
|
||||
#include "shared/test/common/mocks/mock_device.h"
|
||||
#include "shared/test/common/mocks/mock_driver_info.h"
|
||||
@ -35,6 +38,7 @@
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
#include <list>
|
||||
#include <memory>
|
||||
|
||||
namespace NEO {
|
||||
@ -1776,6 +1780,43 @@ TEST_F(MultipleDevicesTest, givenTwoRootDevicesFromSameFamilyThenCanAccessPeerSu
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, res);
|
||||
}
|
||||
|
||||
// Expects canAccessPeer() to return ZE_RESULT_ERROR_DEVICE_LOST when every
// command stream receiver of both root devices (and of their sub-devices) is
// configured to return WaitStatus::GpuHang from waitForCompletionWithTimeout.
HWTEST_F(MultipleDevicesTest, givenTwoRootDevicesFromSameFamilyAndDeviceLostSynchronizeThenCanAccessPeerReturnsDeviceLost) {
    constexpr size_t devicesCount{2};
    ASSERT_LE(devicesCount, driverHandle->devices.size());

    L0::Device *peers[devicesCount] = {driverHandle->devices[0], driverHandle->devices[1]};

    // Collect each root device followed by its sub-devices.
    std::vector<NEO::Device *> rootsAndSubDevices{};
    for (const auto peer : peers) {
        const auto rootDevice = peer->getNEODevice();
        const auto subDevices = rootDevice->getSubDevices();

        rootsAndSubDevices.push_back(rootDevice);
        rootsAndSubDevices.insert(rootsAndSubDevices.end(), subDevices.begin(), subDevices.end());
    }

    // Make the single regular engine of every collected device report a GPU hang.
    for (const auto current : rootsAndSubDevices) {
        auto &engineGroups = current->getRegularEngineGroups();
        ASSERT_EQ(1u, engineGroups.size());
        ASSERT_EQ(1u, engineGroups[0].engines.size());

        auto &engine = engineGroups[0].engines[0];
        auto familyCsr = static_cast<CommandStreamReceiverHw<FamilyType> *>(engine.commandStreamReceiver);
        auto testCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(familyCsr);

        testCsr->callBaseWaitForCompletionWithTimeout = false;
        testCsr->returnWaitForCompletionWithTimeout = WaitStatus::GpuHang;
    }

    GFXCORE_FAMILY firstFamily = peers[0]->getNEODevice()->getHardwareInfo().platform.eRenderCoreFamily;
    GFXCORE_FAMILY secondFamily = peers[1]->getNEODevice()->getHardwareInfo().platform.eRenderCoreFamily;
    EXPECT_EQ(firstFamily, secondFamily);

    ze_bool_t accessible = true;
    ze_result_t status = peers[0]->canAccessPeer(peers[1]->toHandle(), &accessible);
    EXPECT_EQ(ZE_RESULT_ERROR_DEVICE_LOST, status);
}
|
||||
|
||||
using DeviceTests = Test<DeviceFixture>;
|
||||
|
||||
TEST_F(DeviceTests, WhenGettingMemoryAccessPropertiesThenSuccessIsReturned) {
|
||||
|
@ -189,10 +189,19 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
|
||||
}
|
||||
return returnWaitForCompletionWithTimeout;
|
||||
}
|
||||
|
||||
// Convenience overload: packs the arguments into WaitParams{false, enableTimeout,
// timeoutMicroseconds} and forwards to the WaitParams-based overload.
WaitStatus waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait) {
    return this->waitForCompletionWithTimeout(WaitParams{false, enableTimeout, timeoutMicroseconds},
                                              taskCountToWait);
}
|
||||
|
||||
// Test hook: when waitForTaskCountWithKmdNotifyFallbackReturnValue holds a value,
// it is returned and the base implementation is not called; otherwise the call is
// forwarded to BaseClass with the same arguments.
WaitStatus waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, QueueThrottle throttle) override {
    // Nothing injected by the test -> run the real implementation.
    if (!waitForTaskCountWithKmdNotifyFallbackReturnValue) {
        return BaseClass::waitForTaskCountWithKmdNotifyFallback(taskCountToWait, flushStampToWait, useQuickKmdSleep, throttle);
    }

    return *waitForTaskCountWithKmdNotifyFallbackReturnValue;
}
|
||||
|
||||
void overrideCsrSizeReqFlags(CsrSizeRequestFlags &flags) { this->csrSizeRequestFlags = flags; }
|
||||
GraphicsAllocation *getPreemptionAllocation() const { return this->preemptionAllocation; }
|
||||
|
||||
@ -357,5 +366,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
|
||||
bool shouldFailFlushBatchedSubmissions = false;
|
||||
bool shouldFlushBatchedSubmissionsReturnSuccess = false;
|
||||
WaitStatus returnWaitForCompletionWithTimeout = WaitStatus::Ready;
|
||||
std::optional<WaitStatus> waitForTaskCountWithKmdNotifyFallbackReturnValue{};
|
||||
};
|
||||
|
||||
} // namespace NEO
|
||||
|
Reference in New Issue
Block a user