From 8c9bff4f4e92a1ec19a442d615549c21566d1d83 Mon Sep 17 00:00:00 2001 From: Bartosz Dunajski Date: Fri, 10 Oct 2025 15:54:58 +0000 Subject: [PATCH] refactor: new logic to limit per queue split size Signed-off-by: Bartosz Dunajski --- level_zero/core/source/device/bcs_split.cpp | 20 ++++-- level_zero/core/source/device/bcs_split.h | 9 ++- level_zero/core/source/device/bcs_split.inl | 2 +- .../core/test/unit_tests/mocks/mock_cmdlist.h | 1 + .../sources/cmdlist/test_cmdlist_blit.cpp | 65 +++++++++++++++++++ .../debug_settings/debug_variables_base.inl | 1 + shared/source/helpers/common_types.h | 1 + .../pvc/os_agnostic_product_helper_pvc.inl | 1 + shared/test/common/test_files/igdrcl.config | 1 + .../pvc/linux/product_helper_tests_pvc.cpp | 1 + 10 files changed, 95 insertions(+), 7 deletions(-) diff --git a/level_zero/core/source/device/bcs_split.cpp b/level_zero/core/source/device/bcs_split.cpp index 311f7ab4be..bbf3a25349 100644 --- a/level_zero/core/source/device/bcs_split.cpp +++ b/level_zero/core/source/device/bcs_split.cpp @@ -38,6 +38,9 @@ bool BcsSplit::setupDevice(NEO::CommandStreamReceiver *csr, bool copyOffloadEnab std::lock_guard lock(this->mtx); + NEO::debugManager.flags.SplitBcsPerEngineMaxSize.assignIfNotDefault(splitSettings.perEngineMaxSize); + UNRECOVERABLE_IF(splitSettings.perEngineMaxSize == 0); + this->clientCount++; if (!this->cmdLists.empty()) { @@ -125,14 +128,23 @@ void BcsSplit::releaseResources() { } } -std::vector &BcsSplit::getCmdListsForSplit(NEO::TransferDirection direction) { +std::vector &BcsSplit::selectCmdLists(NEO::TransferDirection direction) { if (direction == NEO::TransferDirection::hostToLocal) { - return this->h2dCmdLists; + return h2dCmdLists; } else if (direction == NEO::TransferDirection::localToHost) { - return this->d2hCmdLists; + return d2hCmdLists; } - return this->cmdLists; + return cmdLists; +} + +BcsSplit::CmdListsForSplitContainer BcsSplit::getCmdListsForSplit(NEO::TransferDirection direction, size_t totalTransferSize) { + auto &selectedCmdListType = selectCmdLists(direction); + + size_t maxEnginesToUse = std::max(totalTransferSize / splitSettings.perEngineMaxSize, size_t(1)); + auto engineCount = std::min(selectedCmdListType.size(), maxEnginesToUse); + + return {selectedCmdListType.begin(), selectedCmdListType.begin() + engineCount}; } size_t BcsSplitEvents::obtainAggregatedEventsForSplit(Context *context) { diff --git a/level_zero/core/source/device/bcs_split.h b/level_zero/core/source/device/bcs_split.h index 8da3988c8c..89633a8ab0 100644 --- a/level_zero/core/source/device/bcs_split.h +++ b/level_zero/core/source/device/bcs_split.h @@ -78,7 +78,10 @@ class BcsSplit { template static constexpr size_t maxEventCountInPool = MemoryConstants::pageSize64k / sizeof(typename GfxFamily::TimestampPacketType); - using CsrContainer = StackVec; + static constexpr size_t csrContainerSize = 12; + + using CsrContainer = StackVec; + using CmdListsForSplitContainer = StackVec; BcsSplitEvents events; @@ -103,10 +106,12 @@ class BcsSplit { void releaseResources(); DeviceImp &getDevice() const { return device; } + CmdListsForSplitContainer getCmdListsForSplit(NEO::TransferDirection direction, size_t totalTransferSize); + BcsSplit(DeviceImp &device) : events(*this), device(device){}; protected: - std::vector &getCmdListsForSplit(NEO::TransferDirection direction); + std::vector &selectCmdLists(NEO::TransferDirection direction); void setupEnginesMask(); bool setupQueues(); diff --git a/level_zero/core/source/device/bcs_split.inl b/level_zero/core/source/device/bcs_split.inl index e26e028787..f1e1821777 100644 --- a/level_zero/core/source/device/bcs_split.inl +++ b/level_zero/core/source/device/bcs_split.inl @@ -26,7 +26,7 @@ ze_result_t BcsSplit::appendSplitCall(CommandListCoreFamilyImmediategetCmdListsForSplit(direction); + auto cmdListsForSplit = this->getCmdListsForSplit(direction, size); auto engineCount = cmdListsForSplit.size(); size_t markerEventIndex = 0; uint64_t aggregatedEventIncrementVal = 1; diff --git a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h index d991c9267f..c55919f723 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h +++ b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h @@ -247,6 +247,7 @@ struct WhiteBox> using BaseClass::latestOperationHasOptimizedCbEvent; using BaseClass::latestOperationRequiredNonWalkerInOrderCmdsChaining; using BaseClass::maxFillPatternSizeForCopyEngine; + using BaseClass::minimalSizeForBcsSplit; using BaseClass::partitionCount; using BaseClass::pipeControlMultiKernelEventSync; using BaseClass::pipelineSelectStateTracking; diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_blit.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_blit.cpp index 157a88e7b8..6fecb7daf9 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_blit.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_blit.cpp @@ -25,6 +25,8 @@ #include "level_zero/core/test/unit_tests/sources/helper/ze_object_utils.h" #include "level_zero/driver_experimental/zex_event.h" +#include + namespace L0 { namespace ult { @@ -1016,6 +1018,69 @@ HWTEST2_F(AggregatedBcsSplitTests, givenLimitedEnginesCountWhenCreatingBcsSplitT bcsSplit.releaseResources(); } +HWTEST2_F(AggregatedBcsSplitTests, givenMaxCopySizePerEngineSetToOneWhenSelectingQueueThenReturnAllQueues, IsAtLeastXeHpcCore) { + debugManager.flags.SplitBcsPerEngineMaxSize.set(1); + + BcsSplit bcsSplit(static_cast(*device)); + bcsSplit.setupDevice(cmdList->getCsr(false), false); + + constexpr TransferDirection direction = TransferDirection::hostToLocal; + const auto numQueues = bcsSplit.h2dCmdLists.size(); + const auto minimalSize = static_cast> *>(cmdList.get())->minimalSizeForBcsSplit; + + const std::array sizes = {minimalSize, minimalSize * 2, minimalSize * numQueues, minimalSize * numQueues * 5}; + + for (const auto size : sizes) { + auto queues = bcsSplit.getCmdListsForSplit(direction, size); + EXPECT_EQ(numQueues, queues.size()); + } + + bcsSplit.releaseResources(); +} + +HWTEST2_F(AggregatedBcsSplitTests, givenMaxCopySizePerEngineGreaterThanOneWhenSelectingQueueThenReturnAllQueues, IsAtLeastXeHpcCore) { + debugManager.flags.SplitBcsPerEngineMaxSize.set(static_cast(MemoryConstants::megaByte)); + debugManager.flags.SplitBcsMaskH2D.set(static_cast(0b11110)); + debugManager.flags.SplitBcsMask.set(static_cast(0b11110)); + + BcsSplit bcsSplit(static_cast(*device)); + bcsSplit.setupDevice(cmdList->getCsr(false), false); + + constexpr TransferDirection direction = TransferDirection::hostToLocal; + const auto numQueues = bcsSplit.h2dCmdLists.size(); + + auto &minimalSize = static_cast> *>(cmdList.get())->minimalSizeForBcsSplit; + + minimalSize = static_cast(MemoryConstants::megaByte * 2); + + { + auto queues = bcsSplit.getCmdListsForSplit(direction, minimalSize); + EXPECT_EQ(2u, queues.size()); + } + + { + auto queues = bcsSplit.getCmdListsForSplit(direction, 2 * minimalSize); + EXPECT_EQ(4u, queues.size()); + } + + { + auto queues = bcsSplit.getCmdListsForSplit(direction, (2 * minimalSize) + 2); + EXPECT_EQ(4u, queues.size()); + } + + { + auto queues = bcsSplit.getCmdListsForSplit(direction, minimalSize * numQueues); + EXPECT_EQ(numQueues, queues.size()); + } + + { + auto queues = bcsSplit.getCmdListsForSplit(direction, minimalSize * numQueues * 5); + EXPECT_EQ(numQueues, queues.size()); + } + + bcsSplit.releaseResources(); +} + HWTEST2_F(AggregatedBcsSplitTests, givenUninitializedBcsSplitCallingZexDeviceGetAggregatedCopyOffloadIncrementValueThenInitialize, IsAtLeastXeHpcCore) { uint32_t incValue = 0; bcsSplit->releaseResources(); diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index 4accddd491..692260cd45 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -423,6 +423,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, SplitBcsRequiredEnginesCount, -1, "-1: default, DECLARE_DEBUG_VARIABLE(int32_t, SplitBcsAggregatedEventsMode, -1, "-1: default, 0: disabled, 1: enabled. If enabled, use Aggregated CB Events for all Split operations") DECLARE_DEBUG_VARIABLE(int32_t, SplitBcsTransferDirectionMask, -1, "-1: default, >0: TransferDirection enum mask, indicating supported directions") DECLARE_DEBUG_VARIABLE(int32_t, SplitBcsForCopyOffload, -1, "-1: default, 0: disabled, 1: enable BCS split for immediate copy offload cmd lists") +DECLARE_DEBUG_VARIABLE(int32_t, SplitBcsPerEngineMaxSize, -1, "-1: default, >0: maximum transfer size in bytes for single split engine. If (transfer_size * engineCount) < total_size, then split evenly to all engines") DECLARE_DEBUG_VARIABLE(int32_t, ReuseKernelBinaries, -1, "-1: default, 0:disabled, 1: enabled. If enabled, driver reuses kernel binaries.") DECLARE_DEBUG_VARIABLE(int32_t, SetAmountOfReusableAllocations, -1, "-1: default, 0:disabled, > 1: enabled. If enabled, driver will fill reusable allocation lists with given amount of command buffers and heaps at initialization of immediate command list.") DECLARE_DEBUG_VARIABLE(int32_t, SetAmountOfReusableAllocationsPerCmdQueue, -1, "-1: default, 0:disabled, > 1: enabled. If enabled, driver will fill reusable allocation lists with given amount of command buffers for each initialized opencl command queue.") diff --git a/shared/source/helpers/common_types.h b/shared/source/helpers/common_types.h index 009db59a90..d1b411d71e 100644 --- a/shared/source/helpers/common_types.h +++ b/shared/source/helpers/common_types.h @@ -148,6 +148,7 @@ struct BcsSplitSettings { BcsInfoMask allEngines = {}; BcsInfoMask h2dEngines = {}; BcsInfoMask d2hEngines = {}; + size_t perEngineMaxSize = 1; uint32_t minRequiredTotalCsrCount = 0; uint32_t requiredTileCount = 0; bool enabled = false; diff --git a/shared/source/xe_hpc_core/pvc/os_agnostic_product_helper_pvc.inl b/shared/source/xe_hpc_core/pvc/os_agnostic_product_helper_pvc.inl index 32a7def151..abe3e5887c 100644 --- a/shared/source/xe_hpc_core/pvc/os_agnostic_product_helper_pvc.inl +++ b/shared/source/xe_hpc_core/pvc/os_agnostic_product_helper_pvc.inl @@ -187,6 +187,7 @@ BcsSplitSettings ProductHelperHw::getBcsSplitSettings(const Hardware .allEngines = oddLinkedCopyEnginesMask, .h2dEngines = NEO::EngineHelpers::h2dCopyEngineMask, .d2hEngines = NEO::EngineHelpers::d2hCopyEngineMask, + .perEngineMaxSize = 1, // split evenly on all available engines .minRequiredTotalCsrCount = static_cast(oddLinkedCopyEnginesMask.count()), .requiredTileCount = 1, .enabled = true, diff --git a/shared/test/common/test_files/igdrcl.config b/shared/test/common/test_files/igdrcl.config index 3fc066e2b6..3b15f0f147 100644 --- a/shared/test/common/test_files/igdrcl.config +++ b/shared/test/common/test_files/igdrcl.config @@ -672,4 +672,5 @@ LimitIsaPrefetchSize = -1 EnableUsmAllocationPoolManager = -1 ForceTotalWMTPDataSize = -1 CopyLockedMemoryBeforeWrite = 0 +SplitBcsPerEngineMaxSize = -1 # Please don't edit below this line diff --git a/shared/test/unit_test/xe_hpc_core/pvc/linux/product_helper_tests_pvc.cpp b/shared/test/unit_test/xe_hpc_core/pvc/linux/product_helper_tests_pvc.cpp index 24cf7edc47..bd8f364815 100644 --- a/shared/test/unit_test/xe_hpc_core/pvc/linux/product_helper_tests_pvc.cpp +++ b/shared/test/unit_test/xe_hpc_core/pvc/linux/product_helper_tests_pvc.cpp @@ -97,6 +97,7 @@ PVCTEST_F(PvcProductHelperLinux, givenProductHelperWhenAskedIsBlitSplitEnqueueWA EXPECT_EQ(NEO::EngineHelpers::h2dCopyEngineMask, bcsSplitSettings.h2dEngines.to_ulong()); EXPECT_EQ(NEO::EngineHelpers::d2hCopyEngineMask, bcsSplitSettings.d2hEngines.to_ulong()); EXPECT_EQ(static_cast(bcsSplitSettings.allEngines.count()), bcsSplitSettings.minRequiredTotalCsrCount); + EXPECT_EQ(1u, bcsSplitSettings.perEngineMaxSize); EXPECT_EQ(1u, bcsSplitSettings.requiredTileCount); }