From 0597f064e57139f51455af170fdafd31ecabc4ef Mon Sep 17 00:00:00 2001
From: Bartosz Dunajski
Date: Wed, 8 Oct 2025 14:53:49 +0000
Subject: [PATCH] refactor: bcs split class structure

Signed-off-by: Bartosz Dunajski
---
 .../source/cmdlist/cmdlist_hw_immediate.inl   |  27 ++-
 level_zero/core/source/device/CMakeLists.txt  |   1 +
 level_zero/core/source/device/bcs_split.cpp   |  83 +++----
 level_zero/core/source/device/bcs_split.h     | 209 ++++++------------
 level_zero/core/source/device/bcs_split.inl   | 137 ++++++++++++
 level_zero/core/source/device/device_imp.h    |   2 +-
 .../xe_hpc_core/test_cmdqueue_xe_hpc_core.cpp |   4 +-
 .../debug_settings/debug_settings_manager.h   |   8 +
 8 files changed, 266 insertions(+), 205 deletions(-)
 create mode 100644 level_zero/core/source/device/bcs_split.inl

diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl
index 1962d03b03..aa8bede6e8 100644
--- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl
+++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl
@@ -718,12 +718,14 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy(
     if (isSplitNeeded) {
         setupFlagsForBcsSplit(memoryCopyParams, hasStallingCmds, copyOffloadFlush, srcptr, dstptr, size, size);
 
-        auto splitCall = [&](CommandListCoreFamilyImmediate<gfxCoreFamily> *subCmdList, void *dstptrParam, const void *srcptrParam, size_t sizeParam, ze_event_handle_t hSignalEventParam, uint64_t aggregatedEventIncValue) {
+        auto splitCall = [&](CommandListCoreFamilyImmediate<gfxCoreFamily> *subCmdList, const BcsSplitParams::CopyParams &copyParams, size_t sizeParam, ze_event_handle_t hSignalEventParam, uint64_t aggregatedEventIncValue) {
             memoryCopyParams.forceAggregatedEventIncValue = aggregatedEventIncValue;
-            return subCmdList->CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(dstptrParam, srcptrParam, sizeParam, hSignalEventParam, 0u, nullptr, memoryCopyParams);
+            auto &params = std::get<BcsSplitParams::MemCopy>(copyParams);
+            return subCmdList->CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(params.dst, params.src, sizeParam, hSignalEventParam, 0u, nullptr, memoryCopyParams);
         };
 
-        ret = static_cast<DeviceImp *>(this->device)->bcsSplit->appendSplitCall(this, dstptr, srcptr, size, hSignalEvent, numWaitEvents, phWaitEvents, true, memoryCopyParams.relaxedOrderingDispatch, direction, estimatedSize, splitCall);
+        BcsSplitParams::CopyParams copyParams = BcsSplitParams::MemCopy{dstptr, srcptr};
+        ret = static_cast<DeviceImp *>(this->device)->bcsSplit->appendSplitCall(this, copyParams, size, hSignalEvent, numWaitEvents, phWaitEvents, true, memoryCopyParams.relaxedOrderingDispatch, direction, estimatedSize, splitCall);
 
     } else if (this->isValidForStagingTransfer(dstptr, srcptr, size, numWaitEvents > 0)) {
         return this->appendStagingMemoryCopy(dstptr, srcptr, size, hSignalEvent, memoryCopyParams);
@@ -774,14 +776,16 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopyRegio
                               this->getTotalSizeForCopyRegion(srcRegion, srcPitch, srcSlicePitch),
                               this->getTotalSizeForCopyRegion(dstRegion, dstPitch, dstSlicePitch));
 
-        auto splitCall = [&](CommandListCoreFamilyImmediate<gfxCoreFamily> *subCmdList, uint32_t dstOriginXParam, uint32_t srcOriginXParam, size_t sizeParam, ze_event_handle_t hSignalEventParam, uint64_t aggregatedEventIncValue) {
+        auto splitCall = [&](CommandListCoreFamilyImmediate<gfxCoreFamily> *subCmdList, const BcsSplitParams::CopyParams &copyParams, size_t sizeParam, ze_event_handle_t hSignalEventParam, uint64_t aggregatedEventIncValue) {
+            auto &params = std::get<BcsSplitParams::RegionCopy>(copyParams);
+
             ze_copy_region_t dstRegionLocal = {};
             ze_copy_region_t srcRegionLocal = {};
             memcpy(&dstRegionLocal, dstRegion, sizeof(ze_copy_region_t));
            memcpy(&srcRegionLocal, srcRegion, sizeof(ze_copy_region_t));
-            dstRegionLocal.originX = dstOriginXParam;
+            dstRegionLocal.originX = params.dst;
             dstRegionLocal.width = static_cast<uint32_t>(sizeParam);
-            srcRegionLocal.originX = srcOriginXParam;
+            srcRegionLocal.originX = params.src;
             srcRegionLocal.width = static_cast<uint32_t>(sizeParam);
             memoryCopyParams.forceAggregatedEventIncValue = aggregatedEventIncValue;
             return subCmdList->CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyRegion(dstPtr, &dstRegionLocal, dstPitch, dstSlicePitch,
@@ -789,7 +793,8 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopyRegio
                                                                                             hSignalEventParam, 0u, nullptr, memoryCopyParams);
         };
 
-        ret = static_cast<DeviceImp *>(this->device)->bcsSplit->appendSplitCall(this, dstRegion->originX, srcRegion->originX, dstRegion->width, hSignalEvent, numWaitEvents, phWaitEvents, true, memoryCopyParams.relaxedOrderingDispatch, direction, estimatedSize, splitCall);
+        BcsSplitParams::CopyParams copyParams = BcsSplitParams::RegionCopy{dstRegion->originX, srcRegion->originX};
+        ret = static_cast<DeviceImp *>(this->device)->bcsSplit->appendSplitCall(this, copyParams, dstRegion->width, hSignalEvent, numWaitEvents, phWaitEvents, true, memoryCopyParams.relaxedOrderingDispatch, direction, estimatedSize, splitCall);
     } else {
         ret = CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyRegion(dstPtr, dstRegion, dstPitch, dstSlicePitch,
                                                                            srcPtr, srcRegion, srcPitch, srcSlicePitch,
@@ -860,12 +865,14 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendPageFaultCopy(N
         setupFlagsForBcsSplit(bcsSplitMemoryCopyParams, hasStallingCmds, copyOffloadFlush, srcAddress, dstAddress, size, size);
 
-        auto splitCall = [&](CommandListCoreFamilyImmediate<gfxCoreFamily> *subCmdList, void *dstAddressParam, const void *srcAddressParam, size_t sizeParam, ze_event_handle_t hSignalEventParam, uint64_t aggregatedEventIncValue) {
+        auto splitCall = [&](CommandListCoreFamilyImmediate<gfxCoreFamily> *subCmdList, const BcsSplitParams::CopyParams &copyParams, size_t sizeParam, ze_event_handle_t hSignalEventParam, uint64_t aggregatedEventIncValue) {
             bcsSplitMemoryCopyParams.forceAggregatedEventIncValue = aggregatedEventIncValue;
-            return subCmdList->CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(dstAddressParam, srcAddressParam, sizeParam, hSignalEventParam, 0u, nullptr, bcsSplitMemoryCopyParams);
+            auto &params = std::get<BcsSplitParams::MemCopy>(copyParams);
+            return subCmdList->CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(params.dst, params.src, sizeParam, hSignalEventParam, 0u, nullptr, bcsSplitMemoryCopyParams);
         };
 
-        ret = static_cast<DeviceImp *>(this->device)->bcsSplit->appendSplitCall(this, dstAddress, srcAddress, size, nullptr, 0u, nullptr, false, bcsSplitMemoryCopyParams.relaxedOrderingDispatch, direction, commonImmediateCommandSize, splitCall);
+        BcsSplitParams::CopyParams copyParams = BcsSplitParams::MemCopy{dstAddress, srcAddress};
+        ret = static_cast<DeviceImp *>(this->device)->bcsSplit->appendSplitCall(this, copyParams, size, nullptr, 0u, nullptr, false, bcsSplitMemoryCopyParams.relaxedOrderingDispatch, direction, commonImmediateCommandSize, splitCall);
     } else {
         ret = CommandListCoreFamily<gfxCoreFamily>::appendPageFaultCopy(dstAllocation, srcAllocation, size, flushHost);
     }
diff --git a/level_zero/core/source/device/CMakeLists.txt b/level_zero/core/source/device/CMakeLists.txt
index d34a3de7ea..1e09195928 100644
--- a/level_zero/core/source/device/CMakeLists.txt
+++ b/level_zero/core/source/device/CMakeLists.txt
@@ -10,6 +10,7 @@ target_sources(${L0_STATIC_LIB_NAME}
                ${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}device_imp_helper.cpp
                ${CMAKE_CURRENT_SOURCE_DIR}/bcs_split.cpp
                ${CMAKE_CURRENT_SOURCE_DIR}/bcs_split.h
+               ${CMAKE_CURRENT_SOURCE_DIR}/bcs_split.inl
               ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
               ${CMAKE_CURRENT_SOURCE_DIR}/device.h
               ${CMAKE_CURRENT_SOURCE_DIR}/device_imp_${DRIVER_MODEL}/device_imp_${DRIVER_MODEL}.cpp
diff --git a/level_zero/core/source/device/bcs_split.cpp b/level_zero/core/source/device/bcs_split.cpp
index 548a4bd41d..311f7ab4be 100644
--- a/level_zero/core/source/device/bcs_split.cpp
+++ b/level_zero/core/source/device/bcs_split.cpp
@@ -21,14 +21,12 @@ namespace L0 {
 bool BcsSplit::setupDevice(NEO::CommandStreamReceiver *csr, bool copyOffloadEnabled) {
     auto &productHelper = this->device.getProductHelper();
-    auto bcsSplitSettings = productHelper.getBcsSplitSettings(this->device.getHwInfo());
+    this->splitSettings = productHelper.getBcsSplitSettings(this->device.getHwInfo());
 
-    if (NEO::debugManager.flags.SplitBcsRequiredTileCount.get() != -1) {
-        bcsSplitSettings.requiredTileCount = static_cast<uint32_t>(NEO::debugManager.flags.SplitBcsRequiredTileCount.get());
-    }
+    NEO::debugManager.flags.SplitBcsRequiredTileCount.assignIfNotDefault(splitSettings.requiredTileCount);
 
     // If expectedTileCount==1, route root device to Tile0, otherwise use all Tiles
-    bool tileCountMatch = (bcsSplitSettings.requiredTileCount == 1) || (this->device.getNEODevice()->getNumSubDevices() == bcsSplitSettings.requiredTileCount);
+    bool tileCountMatch = (splitSettings.requiredTileCount == 1) || (this->device.getNEODevice()->getNumSubDevices() == splitSettings.requiredTileCount);
 
     bool engineMatch = (csr->getOsContext().getEngineType() == productHelper.getDefaultCopyEngine());
     if (copyOffloadEnabled && NEO::debugManager.flags.SplitBcsForCopyOffload.get() != 0) {
         engineMatch = NEO::EngineHelpers::isComputeEngine(csr->getOsContext().getEngineType());
@@ -46,39 +44,35 @@ bool BcsSplit::setupDevice(NEO::CommandStreamReceiver *csr, bool copyOffloadEnab
         return true;
     }
 
-    events.aggregatedEventsMode = device.getL0GfxCoreHelper().bcsSplitAggregatedModeEnabled();
+    events.aggregatedEventsMode = NEO::debugManager.flags.SplitBcsAggregatedEventsMode.getIfNotDefault(device.getL0GfxCoreHelper().bcsSplitAggregatedModeEnabled());
 
-    if (NEO::debugManager.flags.SplitBcsAggregatedEventsMode.get() != -1) {
-        events.aggregatedEventsMode = !!NEO::debugManager.flags.SplitBcsAggregatedEventsMode.get();
-    }
+    setupEnginesMask();
 
-    setupEnginesMask(bcsSplitSettings);
-
-    return setupQueues(bcsSplitSettings);
+    return setupQueues();
 }
 
-bool BcsSplit::setupQueues(const NEO::BcsSplitSettings &settings) {
+bool BcsSplit::setupQueues() {
     CsrContainer csrs;
 
-    for (uint32_t tileId = 0; tileId < settings.requiredTileCount; tileId++) {
+    for (uint32_t tileId = 0; tileId < splitSettings.requiredTileCount; tileId++) {
         auto subDevice = this->device.getNEODevice()->getNearestGenericSubDevice(tileId);
         UNRECOVERABLE_IF(!subDevice);
 
         for (uint32_t engineId = 0; engineId < NEO::bcsInfoMaskSize; engineId++) {
-            if (settings.allEngines.test(engineId)) {
+            if (splitSettings.allEngines.test(engineId)) {
                 if (auto engine = subDevice->tryGetEngine(NEO::EngineHelpers::getBcsEngineAtIdx(engineId), NEO::EngineUsage::regular)) {
                     csrs.push_back(engine->commandStreamReceiver);
                 }
             }
-            if (csrs.size() >= settings.minRequiredTotalCsrCount) {
+            if (csrs.size() >= splitSettings.minRequiredTotalCsrCount) {
                 break;
            }
        }
    }
 
-    if (csrs.size() < settings.minRequiredTotalCsrCount) {
+    if (csrs.size() < splitSettings.minRequiredTotalCsrCount) {
        return false;
    }
@@ -98,10 +92,10 @@ bool BcsSplit::setupQueues(const NEO::BcsSplitSettings &settings) {
         auto engineType = csr->getOsContext().getEngineType();
         auto bcsId = NEO::EngineHelpers::getBcsIndex(engineType);
 
-        if (settings.h2dEngines.test(bcsId)) {
+        if (splitSettings.h2dEngines.test(bcsId)) {
             this->h2dCmdLists.push_back(cmdList);
         }
-        if (settings.d2hEngines.test(bcsId)) {
+        if (splitSettings.d2hEngines.test(bcsId)) {
             this->d2hCmdLists.push_back(cmdList);
         }
     }
@@ -109,20 +103,11 @@ bool BcsSplit::setupQueues(const NEO::BcsSplitSettings &settings) {
     return true;
 }
 
-void BcsSplit::setupEnginesMask(NEO::BcsSplitSettings &settings) {
-    if (NEO::debugManager.flags.SplitBcsMask.get() > 0) {
-        settings.allEngines = NEO::debugManager.flags.SplitBcsMask.get();
-    }
-    if (NEO::debugManager.flags.SplitBcsMaskH2D.get() > 0) {
-        settings.h2dEngines = NEO::debugManager.flags.SplitBcsMaskH2D.get();
-    }
-    if (NEO::debugManager.flags.SplitBcsMaskD2H.get() > 0) {
-        settings.d2hEngines = NEO::debugManager.flags.SplitBcsMaskD2H.get();
-    }
-
-    if (NEO::debugManager.flags.SplitBcsRequiredEnginesCount.get() != -1) {
-        settings.minRequiredTotalCsrCount = static_cast<uint32_t>(NEO::debugManager.flags.SplitBcsRequiredEnginesCount.get());
-    }
+void BcsSplit::setupEnginesMask() {
+    NEO::debugManager.flags.SplitBcsMask.assignIfNotDefault(splitSettings.allEngines);
+    NEO::debugManager.flags.SplitBcsMaskH2D.assignIfNotDefault(splitSettings.h2dEngines);
+    NEO::debugManager.flags.SplitBcsMaskD2H.assignIfNotDefault(splitSettings.d2hEngines);
+    NEO::debugManager.flags.SplitBcsRequiredEnginesCount.assignIfNotDefault(splitSettings.minRequiredTotalCsrCount);
 }
 
 void BcsSplit::releaseResources() {
@@ -150,7 +135,7 @@ std::vector<CommandList *> &BcsSplit::getCmdListsForSplit(NEO::TransferDirection
     return this->cmdLists;
 }
 
-size_t BcsSplit::Events::obtainAggregatedEventsForSplit(Context *context) {
+size_t BcsSplitEvents::obtainAggregatedEventsForSplit(Context *context) {
     for (size_t i = 0; i < this->marker.size(); i++) {
         if (this->marker[i]->queryStatus() == ZE_RESULT_SUCCESS) {
             resetAggregatedEventState(i, false);
@@ -161,7 +146,7 @@ size_t BcsSplit::Events::obtainAggregatedEventsForSplit(Context *context) {
     return this->createAggregatedEvent(context);
 }
 
-std::optional<size_t> BcsSplit::Events::obtainForSplit(Context *context, size_t maxEventCountInPool) {
+std::optional<size_t> BcsSplitEvents::obtainForSplit(Context *context, size_t maxEventCountInPool) {
     std::lock_guard lock(this->mtx);
 
     if (this->aggregatedEventsMode) {
@@ -186,7 +171,7 @@ std::optional<size_t> BcsSplit::Events::obtainForSplit(Context *context, size_t
     return 0;
 }
 
-uint64_t *BcsSplit::Events::getNextAllocationForAggregatedEvent() {
+uint64_t *BcsSplitEvents::getNextAllocationForAggregatedEvent() {
     constexpr size_t allocationSize = MemoryConstants::pageSize64k;
 
     if (!this->allocsForAggregatedEvents.empty() && (currentAggregatedAllocOffset + MemoryConstants::cacheLineSize) < allocationSize) {
@@ -195,9 +180,9 @@ uint64_t *BcsSplit::Events::getNextAllocationForAggregatedEvent() {
     ze_device_mem_alloc_desc_t desc = {ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC};
     void *ptr = nullptr;
 
-    auto context = Context::fromHandle(bcsSplit.device.getDriverHandle()->getDefaultContext());
+    auto context = Context::fromHandle(bcsSplit.getDevice().getDriverHandle()->getDefaultContext());
 
-    context->allocDeviceMem(bcsSplit.device.toHandle(), &desc, allocationSize, MemoryConstants::pageSize64k, &ptr);
+    context->allocDeviceMem(bcsSplit.getDevice().toHandle(), &desc, allocationSize, MemoryConstants::pageSize64k, &ptr);
     UNRECOVERABLE_IF(!ptr);
 
     currentAggregatedAllocOffset = 0;
@@ -209,7 +194,7 @@ uint64_t *BcsSplit::Events::getNextAllocationForAggregatedEvent() {
     return ptrOffset(basePtr, currentAggregatedAllocOffset);
 }
 
-size_t BcsSplit::Events::createAggregatedEvent(Context *context) {
+size_t BcsSplitEvents::createAggregatedEvent(Context *context) {
     constexpr int preallocationCount = 8;
 
     size_t returnIndex = this->subcopy.size();
@@ -230,13 +215,13 @@ size_t BcsSplit::Events::createAggregatedEvent(Context *context) {
         externalStorageAllocProperties.deviceAddress = getNextAllocationForAggregatedEvent();
 
         ze_event_handle_t handle = nullptr;
-        zexCounterBasedEventCreate2(context, bcsSplit.device.toHandle(), &counterBasedDesc, &handle);
+        zexCounterBasedEventCreate2(context, bcsSplit.getDevice().toHandle(), &counterBasedDesc, &handle);
         UNRECOVERABLE_IF(handle == nullptr);
 
         this->subcopy.push_back(Event::fromHandle(handle));
 
         ze_event_handle_t markerHandle = nullptr;
-        zexCounterBasedEventCreate2(context, bcsSplit.device.toHandle(), &markerCounterBasedDesc, &markerHandle);
+        zexCounterBasedEventCreate2(context, bcsSplit.getDevice().toHandle(), &markerCounterBasedDesc, &markerHandle);
         UNRECOVERABLE_IF(markerHandle == nullptr);
 
         this->marker.push_back(Event::fromHandle(markerHandle));
@@ -247,14 +232,14 @@ size_t BcsSplit::Events::createAggregatedEvent(Context *context) {
     return returnIndex;
 }
 
-bool BcsSplit::Events::allocatePool(Context *context, size_t maxEventCountInPool, size_t neededEvents) {
+bool BcsSplitEvents::allocatePool(Context *context, size_t maxEventCountInPool, size_t neededEvents) {
     if (this->pools.empty() || this->createdFromLatestPool + neededEvents > maxEventCountInPool) {
         ze_result_t result;
         ze_event_pool_desc_t desc = {ZE_STRUCTURE_TYPE_EVENT_POOL_DESC};
         desc.count = static_cast<uint32_t>(maxEventCountInPool);
 
-        auto hDevice = this->bcsSplit.device.toHandle();
-        auto pool = EventPool::create(this->bcsSplit.device.getDriverHandle(), context, 1, &hDevice, &desc, result);
+        auto hDevice = this->bcsSplit.getDevice().toHandle();
+        auto pool = EventPool::create(this->bcsSplit.getDevice().getDriverHandle(), context, 1, &hDevice, &desc, result);
         if (!pool) {
             return false;
         }
@@ -265,7 +250,7 @@ bool BcsSplit::Events::allocatePool(Context *context, size_t maxEventCountInPool
     return true;
 }
 
-std::optional<size_t> BcsSplit::Events::createFromPool(Context *context, size_t maxEventCountInPool) {
+std::optional<size_t> BcsSplitEvents::createFromPool(Context *context, size_t maxEventCountInPool) {
     /* Internal events needed for split:
      * - event per subcopy to signal completion of given subcopy (vector of subcopy events),
      * - 1 event to signal completion of entire split (vector of marker events),
@@ -311,7 +296,7 @@ std::optional<size_t> BcsSplit::Events::createFromPool(Context *context, size_t
     return this->marker.size() - 1;
 }
 
-void BcsSplit::Events::resetEventPackage(size_t index) {
+void BcsSplitEvents::resetEventPackage(size_t index) {
     this->marker[index]->reset();
     this->barrier[index]->reset();
     for (size_t j = 0; j < this->bcsSplit.cmdLists.size(); j++) {
@@ -319,7 +304,7 @@ void BcsSplit::Events::resetEventPackage(size_t index) {
     }
 }
 
-void BcsSplit::Events::resetAggregatedEventState(size_t index, bool markerCompleted) {
+void BcsSplitEvents::resetAggregatedEventState(size_t index, bool markerCompleted) {
     *this->subcopy[index]->getInOrderExecInfo()->getBaseHostAddress() = 0;
 
     auto markerEvent = this->marker[index];
@@ -328,7 +313,7 @@ void BcsSplit::Events::resetAggregatedEventState(size_t index, bool markerComple
     markerEvent->setReportEmptyCbEventAsReady(markerCompleted);
 }
 
-void BcsSplit::Events::releaseResources() {
+void BcsSplitEvents::releaseResources() {
     for (auto &markerEvent : this->marker) {
         markerEvent->destroy();
     }
@@ -346,7 +331,7 @@ void BcsSplit::Events::releaseResources() {
     }
     pools.clear();
 
-    auto context = Context::fromHandle(bcsSplit.device.getDriverHandle()->getDefaultContext());
+    auto context = Context::fromHandle(bcsSplit.getDevice().getDriverHandle()->getDefaultContext());
     for (auto &ptr : this->allocsForAggregatedEvents) {
         context->freeMem(ptr);
     }
diff --git a/level_zero/core/source/device/bcs_split.h b/level_zero/core/source/device/bcs_split.h
index 881c8e91bf..8da3988c8c 100644
--- a/level_zero/core/source/device/bcs_split.h
+++ b/level_zero/core/source/device/bcs_split.h
@@ -16,6 +16,7 @@
 #include
 #include
+#include <variant>
 #include
 
 namespace NEO {
@@ -26,51 +27,68 @@ enum class TransferDirection;
 namespace L0 {
 struct CommandQueue;
 struct DeviceImp;
+class BcsSplit;
 
-struct BcsSplit {
-    template <GFXCOREFAMILY gfxCoreFamily, typename T, typename K>
-    using AppendCallFuncT = std::function<ze_result_t(CommandListCoreFamilyImmediate<gfxCoreFamily> *, T, K, size_t, ze_event_handle_t, uint64_t)>;
-    using CsrContainer = StackVec;
+namespace BcsSplitParams {
+struct MemCopy {
+    void *dst = nullptr;
+    const void *src = nullptr;
+};
 
-    DeviceImp &device;
-    uint32_t clientCount = 0u;
+struct RegionCopy {
+    // originXParams
+    uint32_t dst = 0;
+    uint32_t src = 0;
+};
+
+using CopyParams = std::variant<MemCopy, RegionCopy>;
+} // namespace BcsSplitParams
+
+struct BcsSplitEvents {
+    BcsSplit &bcsSplit;
 
     std::mutex mtx;
+    std::vector<EventPool *> pools;
+    std::vector<Event *> barrier;
+    std::vector<Event *> subcopy;
+    std::vector<Event *> marker;
+    std::vector<void *> allocsForAggregatedEvents;
+    size_t currentAggregatedAllocOffset = 0;
+    size_t createdFromLatestPool = 0u;
+    bool aggregatedEventsMode = false;
 
-    struct Events {
-        BcsSplit &bcsSplit;
+    std::optional<size_t> obtainForSplit(Context *context, size_t maxEventCountInPool);
+    size_t obtainAggregatedEventsForSplit(Context *context);
+    void resetEventPackage(size_t index);
+    void resetAggregatedEventState(size_t index, bool markerCompleted);
+    void releaseResources();
+    bool allocatePool(Context *context, size_t maxEventCountInPool, size_t neededEvents);
+    std::optional<size_t> createFromPool(Context *context, size_t maxEventCountInPool);
+    size_t createAggregatedEvent(Context *context);
+    uint64_t *getNextAllocationForAggregatedEvent();
 
-        std::mutex mtx;
-        std::vector<EventPool *> pools;
-        std::vector<Event *> barrier;
-        std::vector<Event *> subcopy;
-        std::vector<Event *> marker;
-        std::vector<void *> allocsForAggregatedEvents;
-        size_t currentAggregatedAllocOffset = 0;
-        size_t createdFromLatestPool = 0u;
-        bool aggregatedEventsMode = false;
+    BcsSplitEvents(BcsSplit &bcsSplit) : bcsSplit(bcsSplit) {}
+};
 
-        std::optional<size_t> obtainForSplit(Context *context, size_t maxEventCountInPool);
-        size_t obtainAggregatedEventsForSplit(Context *context);
-        void resetEventPackage(size_t index);
-        void resetAggregatedEventState(size_t index, bool markerCompleted);
-        void releaseResources();
-        bool allocatePool(Context *context, size_t maxEventCountInPool, size_t neededEvents);
-        std::optional<size_t> createFromPool(Context *context, size_t maxEventCountInPool);
-        size_t createAggregatedEvent(Context *context);
-        uint64_t *getNextAllocationForAggregatedEvent();
+class BcsSplit {
+  public:
+    template <GFXCOREFAMILY gfxCoreFamily>
+    using AppendCallFuncT = std::function<ze_result_t(CommandListCoreFamilyImmediate<gfxCoreFamily> *, const BcsSplitParams::CopyParams &, size_t, ze_event_handle_t, uint64_t)>;
 
-        Events(BcsSplit &bcsSplit) : bcsSplit(bcsSplit) {}
-    } events;
+    template <typename GfxFamily>
+    static constexpr size_t maxEventCountInPool = MemoryConstants::pageSize64k / sizeof(typename GfxFamily::TimestampPacketType);
+
+    using CsrContainer = StackVec;
+
+    BcsSplitEvents events;
 
     std::vector<CommandList *> cmdLists;
     std::vector<CommandList *> h2dCmdLists;
     std::vector<CommandList *> d2hCmdLists;
 
-    template <GFXCOREFAMILY gfxCoreFamily, typename T, typename K>
+    template <GFXCOREFAMILY gfxCoreFamily>
     ze_result_t appendSplitCall(CommandListCoreFamilyImmediate<gfxCoreFamily> *cmdList,
-                                T dstptr,
-                                K srcptr,
+                                const BcsSplitParams::CopyParams &copyParams,
                                 size_t size,
                                 ze_event_handle_t hSignalEvent,
                                 uint32_t numWaitEvents,
@@ -79,121 +97,26 @@ struct BcsSplit {
                                 bool hasRelaxedOrderingDependencies,
                                 NEO::TransferDirection direction,
                                 size_t estimatedCmdBufferSize,
-                                AppendCallFuncT<gfxCoreFamily, T, K> appendCall) {
-        constexpr size_t maxEventCountInPool = MemoryConstants::pageSize64k / sizeof(typename CommandListCoreFamilyImmediate<gfxCoreFamily>::GfxFamily::TimestampPacketType);
-
-        const auto aggregatedEventsMode = this->events.aggregatedEventsMode;
-        auto signalEvent = Event::fromHandle(hSignalEvent);
-
-        ze_result_t result = ZE_RESULT_SUCCESS;
-        auto &cmdListsForSplit = this->getCmdListsForSplit(direction);
-        auto engineCount = cmdListsForSplit.size();
-        size_t markerEventIndex = 0;
-        uint64_t aggregatedEventIncrementVal = 1;
-
-        const bool useSignalEventForSubcopy = aggregatedEventsMode && cmdList->isUsingAdditionalBlitProperties() && Event::isAggregatedEvent(signalEvent) &&
-                                              (signalEvent->getInOrderIncrementValue(1) % engineCount == 0);
-
-        if (useSignalEventForSubcopy) {
-            aggregatedEventIncrementVal = signalEvent->getInOrderIncrementValue(1) / engineCount;
-        } else {
-            auto markerEventIndexRet = this->events.obtainForSplit(Context::fromHandle(cmdList->getCmdListContext()), maxEventCountInPool);
-            if (!markerEventIndexRet.has_value()) {
-                return ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY;
-            }
-            markerEventIndex = *markerEventIndexRet;
-        }
-
-        auto barrierRequired = !cmdList->isInOrderExecutionEnabled() && cmdList->isBarrierRequired();
-        if (barrierRequired) {
-            cmdList->appendSignalEvent(this->events.barrier[markerEventIndex]->toHandle(), false);
-        }
-
-        auto subcopyEventIndex = markerEventIndex * this->cmdLists.size();
-        StackVec eventHandles;
-
-        if (!cmdList->handleCounterBasedEventOperations(signalEvent, false)) {
-            return ZE_RESULT_ERROR_INVALID_ARGUMENT;
-        }
-
-        auto totalSize = size;
-        for (size_t i = 0; i < cmdListsForSplit.size(); i++) {
-            auto subCmdList = static_cast<CommandListCoreFamilyImmediate<gfxCoreFamily> *>(cmdListsForSplit[i]);
-
-            auto lock = subCmdList->getCsr(false)->obtainUniqueOwnership();
-
-            subCmdList->checkAvailableSpace(numWaitEvents, hasRelaxedOrderingDependencies, estimatedCmdBufferSize, false);
-
-            if (barrierRequired) {
-                auto barrierEventHandle = this->events.barrier[markerEventIndex]->toHandle();
-                subCmdList->addEventsToCmdList(1u, &barrierEventHandle, nullptr, hasRelaxedOrderingDependencies, false, true, false, false);
-            }
-
-            if (cmdList->hasInOrderDependencies()) {
-                auto &inOrderExecInfo = cmdList->getInOrderExecInfo();
-                subCmdList->appendWaitOnInOrderDependency(inOrderExecInfo, nullptr, inOrderExecInfo->getCounterValue(), inOrderExecInfo->getAllocationOffset(), hasRelaxedOrderingDependencies, false, false, false, false);
-            }
-            subCmdList->addEventsToCmdList(numWaitEvents, phWaitEvents, nullptr, hasRelaxedOrderingDependencies, false, false, false, false);
-
-            if (!useSignalEventForSubcopy && signalEvent && i == 0u) {
-                subCmdList->appendEventForProfilingAllWalkers(signalEvent, nullptr, nullptr, true, true, false, true);
-            }
-
-            auto localSize = totalSize / engineCount;
-            auto localDstPtr = ptrOffset(dstptr, size - totalSize);
-            auto localSrcPtr = ptrOffset(srcptr, size - totalSize);
-
-            auto copyEventIndex = aggregatedEventsMode ? markerEventIndex : subcopyEventIndex + i;
-            auto eventHandle = useSignalEventForSubcopy ? signalEvent : this->events.subcopy[copyEventIndex]->toHandle();
-            result = appendCall(subCmdList, localDstPtr, localSrcPtr, localSize, eventHandle, aggregatedEventIncrementVal);
-            subCmdList->flushImmediate(result, true, !hasRelaxedOrderingDependencies, hasRelaxedOrderingDependencies, NEO::AppendOperations::nonKernel, false, nullptr, true, &lock, nullptr);
-
-            if ((aggregatedEventsMode && i == 0) || !aggregatedEventsMode) {
-                eventHandles.push_back(eventHandle);
-            }
-
-            totalSize -= localSize;
-            engineCount--;
-
-            if (signalEvent) {
-                signalEvent->appendAdditionalCsr(subCmdList->getCsr(false));
-            }
-        }
-
-        const bool dualStreamCopyOffload = cmdList->isDualStreamCopyOffloadOperation(cmdList->isCopyOffloadEnabled());
-
-        cmdList->addEventsToCmdList(static_cast<uint32_t>(eventHandles.size()), eventHandles.data(), nullptr, hasRelaxedOrderingDependencies, false, true, false, dualStreamCopyOffload);
-
-        const auto isCopyCmdList = cmdList->isCopyOnly(dualStreamCopyOffload);
-
-        if (!useSignalEventForSubcopy && signalEvent) {
-            cmdList->appendSignalEventPostWalker(signalEvent, nullptr, nullptr, !isCopyCmdList, false, isCopyCmdList);
-        }
-
-        if (!aggregatedEventsMode) {
-            cmdList->appendSignalEventPostWalker(this->events.marker[markerEventIndex], nullptr, nullptr, !isCopyCmdList, false, isCopyCmdList);
-        }
-
-        if (cmdList->isInOrderExecutionEnabled()) {
-            cmdList->appendSignalInOrderDependencyCounter(signalEvent, dualStreamCopyOffload, false, false, useSignalEventForSubcopy);
-        }
-        cmdList->handleInOrderDependencyCounter(signalEvent, false, dualStreamCopyOffload);
-
-        if (aggregatedEventsMode && !useSignalEventForSubcopy) {
-            std::lock_guard lock(events.mtx);
-            cmdList->assignInOrderExecInfoToEvent(this->events.marker[markerEventIndex]);
-        }
-
-        return result;
-    }
+                                AppendCallFuncT<gfxCoreFamily> appendCall);
 
     bool setupDevice(NEO::CommandStreamReceiver *csr, bool copyOffloadEnabled);
     void releaseResources();
-    std::vector<CommandList *> &getCmdListsForSplit(NEO::TransferDirection direction);
-    void setupEnginesMask(NEO::BcsSplitSettings &settings);
-    bool setupQueues(const NEO::BcsSplitSettings &settings);
+    DeviceImp &getDevice() const { return device; }
 
-    BcsSplit(DeviceImp &device) : device(device), events(*this){};
+    BcsSplit(DeviceImp &device) : events(*this), device(device){};
+
+  protected:
+    std::vector<CommandList *> &getCmdListsForSplit(NEO::TransferDirection direction);
+    void setupEnginesMask();
+    bool setupQueues();
+
+    DeviceImp &device;
+    NEO::BcsSplitSettings splitSettings = {};
+    uint32_t clientCount = 0u;
+
+    std::mutex mtx;
 };
-} // namespace L0
\ No newline at end of file
+} // namespace L0
+
+#include "level_zero/core/source/device/bcs_split.inl"
diff --git a/level_zero/core/source/device/bcs_split.inl b/level_zero/core/source/device/bcs_split.inl
new file mode 100644
index 0000000000..e26e028787
--- /dev/null
+++ b/level_zero/core/source/device/bcs_split.inl
@@ -0,0 +1,137 @@
+/*
+ * Copyright (C) 2025 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+namespace L0 {
+
+template <GFXCOREFAMILY gfxCoreFamily>
+ze_result_t BcsSplit::appendSplitCall(CommandListCoreFamilyImmediate<gfxCoreFamily> *cmdList,
+                                      const BcsSplitParams::CopyParams &copyParams,
+                                      size_t size,
+                                      ze_event_handle_t hSignalEvent,
+                                      uint32_t numWaitEvents,
+                                      ze_event_handle_t *phWaitEvents,
+                                      bool performMigration,
+                                      bool hasRelaxedOrderingDependencies,
+                                      NEO::TransferDirection direction,
+                                      size_t estimatedCmdBufferSize,
+                                      AppendCallFuncT<gfxCoreFamily> appendCall) {
+
+    using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
+
+    const auto aggregatedEventsMode = this->events.aggregatedEventsMode;
+    auto signalEvent = Event::fromHandle(hSignalEvent);
+
+    ze_result_t result = ZE_RESULT_SUCCESS;
+    auto &cmdListsForSplit = this->getCmdListsForSplit(direction);
+    auto engineCount = cmdListsForSplit.size();
+    size_t markerEventIndex = 0;
+    uint64_t aggregatedEventIncrementVal = 1;
+
+    const bool useSignalEventForSubcopy = aggregatedEventsMode && cmdList->isUsingAdditionalBlitProperties() && Event::isAggregatedEvent(signalEvent) &&
+                                          (signalEvent->getInOrderIncrementValue(1) % engineCount == 0);
+
+    if (useSignalEventForSubcopy) {
+        aggregatedEventIncrementVal = signalEvent->getInOrderIncrementValue(1) / engineCount;
+    } else {
+        auto markerEventIndexRet = this->events.obtainForSplit(Context::fromHandle(cmdList->getCmdListContext()), maxEventCountInPool<GfxFamily>);
+        if (!markerEventIndexRet.has_value()) {
+            return ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY;
+        }
+        markerEventIndex = *markerEventIndexRet;
+    }
+
+    auto barrierRequired = !cmdList->isInOrderExecutionEnabled() && cmdList->isBarrierRequired();
+    if (barrierRequired) {
+        cmdList->appendSignalEvent(this->events.barrier[markerEventIndex]->toHandle(), false);
+    }
+
+    auto subcopyEventIndex = markerEventIndex * this->cmdLists.size();
+    StackVec eventHandles;
+
+    if (!cmdList->handleCounterBasedEventOperations(signalEvent, false)) {
+        return ZE_RESULT_ERROR_INVALID_ARGUMENT;
+    }
+
+    auto totalSize = size;
+    for (size_t i = 0; i < cmdListsForSplit.size(); i++) {
+        auto subCmdList = static_cast<CommandListCoreFamilyImmediate<gfxCoreFamily> *>(cmdListsForSplit[i]);
+
+        auto lock = subCmdList->getCsr(false)->obtainUniqueOwnership();
+
+        subCmdList->checkAvailableSpace(numWaitEvents, hasRelaxedOrderingDependencies, estimatedCmdBufferSize, false);
+
+        if (barrierRequired) {
+            auto barrierEventHandle = this->events.barrier[markerEventIndex]->toHandle();
+            subCmdList->addEventsToCmdList(1u, &barrierEventHandle, nullptr, hasRelaxedOrderingDependencies, false, true, false, false);
+        }
+
+        if (cmdList->hasInOrderDependencies()) {
+            auto &inOrderExecInfo = cmdList->getInOrderExecInfo();
+            subCmdList->appendWaitOnInOrderDependency(inOrderExecInfo, nullptr, inOrderExecInfo->getCounterValue(), inOrderExecInfo->getAllocationOffset(), hasRelaxedOrderingDependencies, false, false, false, false);
+        }
+        subCmdList->addEventsToCmdList(numWaitEvents, phWaitEvents, nullptr, hasRelaxedOrderingDependencies, false, false, false, false);
+
+        if (!useSignalEventForSubcopy && signalEvent && i == 0u) {
+            subCmdList->appendEventForProfilingAllWalkers(signalEvent, nullptr, nullptr, true, true, false, true);
+        }
+
+        auto localSize = totalSize / engineCount;
+
+        BcsSplitParams::CopyParams localCopyParams;
+
+        std::visit([&](auto &&arg) {
+            using T = std::decay_t<decltype(arg)>;
+            localCopyParams = T{ptrOffset(arg.dst, size - totalSize),
+                                ptrOffset(arg.src, size - totalSize)};
+        },
+                   copyParams);
+
+        auto copyEventIndex = aggregatedEventsMode ? markerEventIndex : subcopyEventIndex + i;
+        auto eventHandle = useSignalEventForSubcopy ? signalEvent : this->events.subcopy[copyEventIndex]->toHandle();
+        result = appendCall(subCmdList, localCopyParams, localSize, eventHandle, aggregatedEventIncrementVal);
+        subCmdList->flushImmediate(result, true, !hasRelaxedOrderingDependencies, hasRelaxedOrderingDependencies, NEO::AppendOperations::nonKernel, false, nullptr, true, &lock, nullptr);
+
+        if ((aggregatedEventsMode && i == 0) || !aggregatedEventsMode) {
+            eventHandles.push_back(eventHandle);
+        }
+
+        totalSize -= localSize;
+        engineCount--;
+
+        if (signalEvent) {
+            signalEvent->appendAdditionalCsr(subCmdList->getCsr(false));
+        }
+    }
+
+    const bool dualStreamCopyOffload = cmdList->isDualStreamCopyOffloadOperation(cmdList->isCopyOffloadEnabled());
+
+    cmdList->addEventsToCmdList(static_cast<uint32_t>(eventHandles.size()), eventHandles.data(), nullptr, hasRelaxedOrderingDependencies, false, true, false, dualStreamCopyOffload);
+
+    const auto isCopyCmdList = cmdList->isCopyOnly(dualStreamCopyOffload);
+
+    if (!useSignalEventForSubcopy && signalEvent) {
+        cmdList->appendSignalEventPostWalker(signalEvent, nullptr, nullptr, !isCopyCmdList, false, isCopyCmdList);
+    }
+
+    if (!aggregatedEventsMode) {
+        cmdList->appendSignalEventPostWalker(this->events.marker[markerEventIndex], nullptr, nullptr, !isCopyCmdList, false, isCopyCmdList);
+    }
+
+    if (cmdList->isInOrderExecutionEnabled()) {
+        cmdList->appendSignalInOrderDependencyCounter(signalEvent, dualStreamCopyOffload, false, false, useSignalEventForSubcopy);
+    }
+    cmdList->handleInOrderDependencyCounter(signalEvent, false, dualStreamCopyOffload);
+
+    if (aggregatedEventsMode && !useSignalEventForSubcopy) {
+        std::lock_guard lock(events.mtx);
+        cmdList->assignInOrderExecInfoToEvent(this->events.marker[markerEventIndex]);
+    }
+
+    return result;
+}
+
+} // namespace L0
\ No newline at end of file
diff --git a/level_zero/core/source/device/device_imp.h b/level_zero/core/source/device/device_imp.h
index 8045efac27..68b54d71d7 100644
--- a/level_zero/core/source/device/device_imp.h
+++ b/level_zero/core/source/device/device_imp.h
@@ -33,7 +33,7 @@ struct HardwareInfo;
 } // namespace NEO
 
 namespace L0 {
-struct BcsSplit;
+class BcsSplit;
 struct Image;
 struct SysmanDevice;
 struct FabricVertex;
diff --git a/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdqueue_xe_hpc_core.cpp b/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdqueue_xe_hpc_core.cpp
index 7d6188639f..7bf1e71bc3 100644
--- a/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdqueue_xe_hpc_core.cpp
+++ b/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdqueue_xe_hpc_core.cpp
@@ -1691,7 +1691,7 @@ HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhe
     NEO::debugManager.flags.OverrideEventSynchronizeTimeout.set(0);
     NEO::debugManager.flags.EnableTimestampPoolAllocator.set(0);
 
-    auto memoryManager = reinterpret_cast<NEO::MockMemoryManager *>(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit->device.getDriverHandle()->getMemoryManager());
+    auto memoryManager = reinterpret_cast<NEO::MockMemoryManager *>(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit->getDevice().getDriverHandle()->getMemoryManager());
     memoryManager->isMockHostMemoryManager = true;
     memoryManager->forceFailureInPrimaryAllocation = true;
 
@@ -1741,7 +1741,7 @@ HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhe
     NEO::debugManager.flags.OverrideEventSynchronizeTimeout.set(0);
     NEO::debugManager.flags.EnableTimestampPoolAllocator.set(0);
 
-    auto memoryManager = reinterpret_cast<NEO::MockMemoryManager *>(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit->device.getDriverHandle()->getMemoryManager());
+    auto memoryManager = reinterpret_cast<NEO::MockMemoryManager *>(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit->getDevice().getDriverHandle()->getMemoryManager());
     memoryManager->isMockHostMemoryManager = true;
     memoryManager->forceFailureInPrimaryAllocation = true;
 
diff --git a/shared/source/debug_settings/debug_settings_manager.h b/shared/source/debug_settings/debug_settings_manager.h
index d07cde4005..5791800175 100644
--- a/shared/source/debug_settings/debug_settings_manager.h
+++ b/shared/source/debug_settings/debug_settings_manager.h
@@ -102,6 +102,7 @@ struct DebugVarBase {
             this->set(data);
         }
     }
+
     void setPrefixType(DebugVarPrefix data) {
         prefixType = std::move(data);
     }
@@ -117,6 +118,13 @@ struct DebugVarBase {
         return (value != defaultValue) ? static_cast<UserType>(value) : userValue;
     }
 
+    template <typename UserType>
+    void assignIfNotDefault(UserType &userDataForAssignment) const {
+        if (value != defaultValue) {
+            userDataForAssignment = static_cast<UserType>(value);
+        }
+    }
+
   private:
     T value;
     T defaultValue;
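
The following is an illustrative, standalone sketch and is not part of the patch above. It shows the copy-parameter dispatch the refactor introduces: callers pack either a pointer pair (MemCopy) or a pair of originX offsets (RegionCopy) into one std::variant, and the split loop advances whichever alternative is active with std::visit before invoking the per-engine append callback. The type names mirror bcs_split.h; everything else (the advance() helper, buffer sizes, main) is a simplified assumption for illustration only.

// Simplified sketch of the BcsSplitParams::CopyParams dispatch; not runtime code.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <type_traits>
#include <variant>

namespace BcsSplitParams {
struct MemCopy {
    void *dst = nullptr;
    const void *src = nullptr;
};
struct RegionCopy {
    uint32_t dst = 0; // destination originX
    uint32_t src = 0; // source originX
};
using CopyParams = std::variant<MemCopy, RegionCopy>;
} // namespace BcsSplitParams

// Shift both sides of whichever alternative is active by `offset`,
// mirroring the std::visit block that computes localCopyParams in bcs_split.inl.
BcsSplitParams::CopyParams advance(const BcsSplitParams::CopyParams &params, size_t offset) {
    return std::visit([&](auto &&arg) -> BcsSplitParams::CopyParams {
        using T = std::decay_t<decltype(arg)>;
        if constexpr (std::is_same_v<T, BcsSplitParams::MemCopy>) {
            return T{static_cast<char *>(arg.dst) + offset,
                     static_cast<const char *>(arg.src) + offset};
        } else {
            return T{static_cast<uint32_t>(arg.dst + offset),
                     static_cast<uint32_t>(arg.src + offset)};
        }
    },
                      params);
}

int main() {
    char src[64] = {};
    char dst[64] = {};

    // One variant type serves both copy flavours; only the active alternative differs.
    BcsSplitParams::CopyParams memCopy = BcsSplitParams::MemCopy{dst, src};
    BcsSplitParams::CopyParams regionCopy = BcsSplitParams::RegionCopy{0u, 16u};

    auto memChunk = advance(memCopy, 32);       // second half of a 64-byte copy
    auto regionChunk = advance(regionCopy, 32); // originX moved by 32 for the next engine

    std::printf("mem dst offset: %td\n",
                static_cast<char *>(std::get<BcsSplitParams::MemCopy>(memChunk).dst) - dst);
    std::printf("region dst originX: %u\n",
                std::get<BcsSplitParams::RegionCopy>(regionChunk).dst);
    return 0;
}

The split loop in appendSplitCall only ever hands a chunk-local CopyParams to the callback registered by each append path, so each caller (appendMemoryCopy, appendMemoryCopyRegion, appendPageFaultCopy) unpacks the alternative it knows it stored via std::get.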