From a05cc69a5a386ef289f3ae0534547c8985c11dfe Mon Sep 17 00:00:00 2001 From: Lukasz Jobczyk Date: Thu, 15 Feb 2024 09:12:10 +0000 Subject: [PATCH] fix: Handle OOM in BCS split Signed-off-by: Lukasz Jobczyk --- level_zero/core/source/device/bcs_split.cpp | 33 ++++++--- level_zero/core/source/device/bcs_split.h | 14 ++-- .../xe_hpc_core/test_cmdqueue_xe_hpc_core.cpp | 68 ++++++++++++++++++- 3 files changed, 101 insertions(+), 14 deletions(-) diff --git a/level_zero/core/source/device/bcs_split.cpp b/level_zero/core/source/device/bcs_split.cpp index dbe2fc46f3..ba05026b18 100644 --- a/level_zero/core/source/device/bcs_split.cpp +++ b/level_zero/core/source/device/bcs_split.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022-2023 Intel Corporation + * Copyright (C) 2022-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -114,24 +114,27 @@ std::vector &BcsSplit::getCmdQsForSplit(NEO::TransferDirection d return this->cmdQs; } -size_t BcsSplit::Events::obtainForSplit(Context *context, size_t maxEventCountInPool) { +std::optional BcsSplit::Events::obtainForSplit(Context *context, size_t maxEventCountInPool) { std::lock_guard lock(this->mtx); for (size_t i = 0; i < this->marker.size(); i++) { auto ret = this->marker[i]->queryStatus(); if (ret == ZE_RESULT_SUCCESS) { - this->marker[i]->reset(); - this->barrier[i]->reset(); - for (size_t j = 0; j < this->bcsSplit.cmdQs.size(); j++) { - this->subcopy[i * this->bcsSplit.cmdQs.size() + j]->reset(); - } + this->resetEventPackage(i); return i; } } - return this->allocateNew(context, maxEventCountInPool); + auto newEventIndex = this->allocateNew(context, maxEventCountInPool); + if (newEventIndex.has_value() || this->marker.empty()) { + return newEventIndex; + } + + this->marker[0]->hostSynchronize(std::numeric_limits::max()); + this->resetEventPackage(0); + return 0; } -size_t BcsSplit::Events::allocateNew(Context *context, size_t maxEventCountInPool) { +std::optional BcsSplit::Events::allocateNew(Context *context, size_t maxEventCountInPool) { /* Internal events needed for split: * - event per subcopy to signal completion of given subcopy (vector of subcopy events), * - 1 event to signal completion of entire split (vector of marker events), @@ -147,6 +150,9 @@ size_t BcsSplit::Events::allocateNew(Context *context, size_t maxEventCountInPoo desc.count = static_cast(maxEventCountInPool); auto hDevice = this->bcsSplit.device.toHandle(); auto pool = EventPool::create(this->bcsSplit.device.getDriverHandle(), context, 1, &hDevice, &desc, result); + if (!pool) { + return std::nullopt; + } this->pools.push_back(pool); this->createdFromLatestPool = 0u; } @@ -181,6 +187,15 @@ size_t BcsSplit::Events::allocateNew(Context *context, size_t maxEventCountInPoo return this->marker.size() - 1; } + +void BcsSplit::Events::resetEventPackage(size_t index) { + this->marker[index]->reset(); + this->barrier[index]->reset(); + for (size_t j = 0; j < this->bcsSplit.cmdQs.size(); j++) { + this->subcopy[index * this->bcsSplit.cmdQs.size() + j]->reset(); + } +} + void BcsSplit::Events::releaseResources() { for (auto &markerEvent : this->marker) { markerEvent->destroy(); diff --git a/level_zero/core/source/device/bcs_split.h b/level_zero/core/source/device/bcs_split.h index f4d2914e3e..f8809369b7 100644 --- a/level_zero/core/source/device/bcs_split.h +++ b/level_zero/core/source/device/bcs_split.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022-2023 Intel Corporation + * Copyright (C) 2022-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -44,8 +44,9 @@ struct BcsSplit { std::vector marker; size_t createdFromLatestPool = 0u; - size_t obtainForSplit(Context *context, size_t maxEventCountInPool); - size_t allocateNew(Context *context, size_t maxEventCountInPool); + std::optional obtainForSplit(Context *context, size_t maxEventCountInPool); + std::optional allocateNew(Context *context, size_t maxEventCountInPool); + void resetEventPackage(size_t index); void releaseResources(); @@ -74,7 +75,12 @@ struct BcsSplit { std::function appendCall) { ze_result_t result = ZE_RESULT_SUCCESS; - auto markerEventIndex = this->events.obtainForSplit(Context::fromHandle(cmdList->getCmdListContext()), MemoryConstants::pageSize64k / sizeof(typename CommandListCoreFamilyImmediate::GfxFamily::TimestampPacketType)); + auto markerEventIndexRet = this->events.obtainForSplit(Context::fromHandle(cmdList->getCmdListContext()), MemoryConstants::pageSize64k / sizeof(typename CommandListCoreFamilyImmediate::GfxFamily::TimestampPacketType)); + if (!markerEventIndexRet.has_value()) { + return ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY; + } + + auto markerEventIndex = *markerEventIndexRet; auto barrierRequired = !cmdList->isInOrderExecutionEnabled() && cmdList->isBarrierRequired(); if (barrierRequired) { diff --git a/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdqueue_xe_hpc_core.cpp b/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdqueue_xe_hpc_core.cpp index 792acb8b0d..686ad0a380 100644 --- a/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdqueue_xe_hpc_core.cpp +++ b/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdqueue_xe_hpc_core.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021-2023 Intel Corporation + * Copyright (C) 2021-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -12,6 +12,7 @@ #include "shared/test/common/mocks/mock_command_stream_receiver.h" #include "shared/test/common/mocks/mock_device.h" #include "shared/test/common/mocks/mock_direct_submission_hw.h" +#include "shared/test/common/mocks/mock_memory_manager.h" #include "shared/test/common/test_macros/hw_test.h" #include "level_zero/core/source/driver/driver_handle_imp.h" @@ -1547,6 +1548,71 @@ HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhe EXPECT_EQ(static_cast(testL0Device.get())->bcsSplit.events.subcopy.size(), 8u); EXPECT_EQ(static_cast(testL0Device.get())->bcsSplit.events.barrier.size(), 2u); EXPECT_EQ(static_cast(testL0Device.get())->bcsSplit.events.createdFromLatestPool, 12u); + + NEO::debugManager.flags.OverrideEventSynchronizeTimeout.set(0); + auto memoryManager = reinterpret_cast(static_cast(testL0Device.get())->bcsSplit.device.getDriverHandle()->getMemoryManager()); + memoryManager->isMockHostMemoryManager = true; + memoryManager->forceFailureInPrimaryAllocation = true; + + ret = static_cast(testL0Device.get())->bcsSplit.events.obtainForSplit(Context::fromHandle(commandList0->getCmdListContext()), 12); + + EXPECT_EQ(ret, 0u); + EXPECT_EQ(static_cast(testL0Device.get())->bcsSplit.events.pools.size(), 1u); + EXPECT_EQ(static_cast(testL0Device.get())->bcsSplit.events.marker.size(), 2u); + EXPECT_EQ(static_cast(testL0Device.get())->bcsSplit.events.subcopy.size(), 8u); + EXPECT_EQ(static_cast(testL0Device.get())->bcsSplit.events.barrier.size(), 2u); + EXPECT_EQ(static_cast(testL0Device.get())->bcsSplit.events.createdFromLatestPool, 12u); +} + +HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhenOOMAndObtainEventsForSplitThenNulloptIsReturned, IsXeHpcCore) { + using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; + + DebugManagerStateRestore restorer; + debugManager.flags.SplitBcsCopy.set(1); + debugManager.flags.EnableFlushTaskSubmission.set(0); + + ze_result_t returnValue; + auto hwInfo = *NEO::defaultHwInfo; + hwInfo.featureTable.ftrBcsInfo = 0b111111111; + hwInfo.capabilityTable.blitterOperationsSupported = true; + auto testNeoDevice = NEO::MockDevice::createWithNewExecutionEnvironment(&hwInfo); + auto testL0Device = std::unique_ptr(L0::Device::create(driverHandle.get(), testNeoDevice, false, &returnValue)); + + ze_command_queue_desc_t desc = {}; + desc.ordinal = static_cast(testNeoDevice->getEngineGroupIndexFromEngineGroupType(NEO::EngineGroupType::copy)); + + std::unique_ptr commandList0(CommandList::createImmediate(productFamily, + testL0Device.get(), + &desc, + false, + NEO::EngineGroupType::copy, + returnValue)); + ASSERT_NE(nullptr, commandList0); + EXPECT_EQ(static_cast(testL0Device.get())->bcsSplit.cmdQs.size(), 4u); + EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[0])->getTaskCount(), 0u); + EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[1])->getTaskCount(), 0u); + EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[2])->getTaskCount(), 0u); + EXPECT_EQ(static_cast(static_cast(testL0Device.get())->bcsSplit.cmdQs[3])->getTaskCount(), 0u); + + EXPECT_EQ(static_cast(testL0Device.get())->bcsSplit.events.pools.size(), 0u); + EXPECT_EQ(static_cast(testL0Device.get())->bcsSplit.events.marker.size(), 0u); + EXPECT_EQ(static_cast(testL0Device.get())->bcsSplit.events.subcopy.size(), 0u); + EXPECT_EQ(static_cast(testL0Device.get())->bcsSplit.events.barrier.size(), 0u); + EXPECT_EQ(static_cast(testL0Device.get())->bcsSplit.events.createdFromLatestPool, 0u); + + NEO::debugManager.flags.OverrideEventSynchronizeTimeout.set(0); + auto memoryManager = reinterpret_cast(static_cast(testL0Device.get())->bcsSplit.device.getDriverHandle()->getMemoryManager()); + memoryManager->isMockHostMemoryManager = true; + memoryManager->forceFailureInPrimaryAllocation = true; + + auto ret = static_cast(testL0Device.get())->bcsSplit.events.obtainForSplit(Context::fromHandle(commandList0->getCmdListContext()), 12); + + EXPECT_FALSE(ret.has_value()); + EXPECT_EQ(static_cast(testL0Device.get())->bcsSplit.events.pools.size(), 0u); + EXPECT_EQ(static_cast(testL0Device.get())->bcsSplit.events.marker.size(), 0u); + EXPECT_EQ(static_cast(testL0Device.get())->bcsSplit.events.subcopy.size(), 0u); + EXPECT_EQ(static_cast(testL0Device.get())->bcsSplit.events.barrier.size(), 0u); + EXPECT_EQ(static_cast(testL0Device.get())->bcsSplit.events.createdFromLatestPool, 0u); } HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhenAppendingPageFaultCopyThenSuccessIsReturned, IsXeHpcCore) {