From 94d01b4d40600e2878c6abc37b3c52a0a2b8c779 Mon Sep 17 00:00:00 2001 From: Bartosz Dunajski Date: Tue, 23 Sep 2025 16:35:04 +0000 Subject: [PATCH] feature: use User event in bcs split path if increment value is the same Related-To: NEO-14557 Signed-off-by: Bartosz Dunajski --- level_zero/core/source/cmdlist/cmdlist_hw.h | 5 +- level_zero/core/source/cmdlist/cmdlist_hw.inl | 32 +-- .../cmdlist/cmdlist_hw_gen12lp_to_xe3.inl | 2 +- .../source/cmdlist/cmdlist_hw_immediate.inl | 17 +- .../cmdlist/cmdlist_hw_xehp_and_later.inl | 4 +- .../cmdlist/cmdlist_memory_copy_params.h | 2 +- level_zero/core/source/device/bcs_split.h | 46 +++-- .../core/source/gen12lp/cmdlist_gen12lp.cpp | 2 +- .../fixtures/in_order_cmd_list_fixture.h | 128 ++++++++++++ .../sources/cmdlist/test_cmdlist_2.cpp | 2 +- .../sources/cmdlist/test_cmdlist_blit.cpp | 183 +++++++----------- .../cmdlist/test_cmdlist_memory_extension.cpp | 2 +- .../cmdlist/test_in_order_cmdlist_1.cpp | 6 +- .../cmdlist/test_in_order_cmdlist_2.cpp | 2 +- 14 files changed, 265 insertions(+), 168 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h index d12ee75b82..29f6d0cecd 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw.h @@ -216,7 +216,7 @@ struct CommandListCoreFamily : public CommandListImp { void appendWaitOnInOrderDependency(std::shared_ptr &inOrderExecInfo, CommandToPatchContainer *outListCommands, uint64_t waitValue, uint32_t offset, bool relaxedOrderingAllowed, bool implicitDependency, bool skipAddingWaitEventsToResidency, bool noopDispatch, bool dualStreamCopyOffloadOperation); - MOCKABLE_VIRTUAL void appendSignalInOrderDependencyCounter(Event *signalEvent, bool copyOffloadOperation, bool stall, bool textureFlushRequired); + MOCKABLE_VIRTUAL void appendSignalInOrderDependencyCounter(Event *signalEvent, bool copyOffloadOperation, bool stall, bool textureFlushRequired, bool skipAggregatedEventSignaling); void handleInOrderDependencyCounter(Event *signalEvent, bool nonWalkerInOrderCmdsChaining, bool copyOffloadOperation); void handleInOrderCounterOverflow(bool copyOffloadOperation); @@ -251,6 +251,7 @@ struct CommandListCoreFamily : public CommandListImp { void assignInOrderExecInfoToEvent(Event *event); bool hasInOrderDependencies() const; void appendSignalEventPostWalker(Event *event, void **syncCmdBuffer, CommandToPatchContainer *outTimeStampSyncCmds, bool skipBarrierForEndProfiling, bool skipAddingEventToResidency, bool copyOperation); + bool isUsingAdditionalBlitProperties() const { return useAdditionalBlitProperties; } protected: MOCKABLE_VIRTUAL ze_result_t appendMemoryCopyKernelWithGA(uintptr_t dstPtr, NEO::GraphicsAllocation *dstPtrAlloc, @@ -372,7 +373,7 @@ struct CommandListCoreFamily : public CommandListImp { uint32_t getRegionOffsetForAppendMemoryCopyBlitRegion(AlignedAllocationData *allocationData); void handlePostSubmissionState(); - MOCKABLE_VIRTUAL void setAdditionalBlitProperties(NEO::BlitProperties &blitProperties, Event *signalEvent, uint32_t forceAggregatedEventIncValue, bool useAdditionalTimestamp); + MOCKABLE_VIRTUAL void setAdditionalBlitProperties(NEO::BlitProperties &blitProperties, Event *signalEvent, uint64_t forceAggregatedEventIncValue, bool useAdditionalTimestamp); void setupFillKernelArguments(size_t baseOffset, size_t patternSize, diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index 0d69186a9d..156df587d2 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -228,7 +228,7 @@ void CommandListCoreFamily::handleInOrderCounterOverflow(bool cop inOrderExecInfo->setAllocationOffset(newOffset); inOrderExecInfo->initializeAllocationsFromHost(); - CommandListCoreFamily::appendSignalInOrderDependencyCounter(nullptr, copyOffloadOperation, false, false); // signal counter on new offset + CommandListCoreFamily::appendSignalInOrderDependencyCounter(nullptr, copyOffloadOperation, false, false, false); // signal counter on new offset } } @@ -690,7 +690,7 @@ ze_result_t CommandListCoreFamily::appendEventReset(ze_event_hand } if (this->isInOrderExecutionEnabled()) { - appendSignalInOrderDependencyCounter(event, false, false, false); + appendSignalInOrderDependencyCounter(event, false, false, false, false); } handleInOrderDependencyCounter(event, false, false); event->unsetInOrderExecInfo(); @@ -736,7 +736,7 @@ ze_result_t CommandListCoreFamily::appendMemoryRangesBarrier(uint addToMappedEventList(signalEvent); if (this->isInOrderExecutionEnabled()) { - appendSignalInOrderDependencyCounter(signalEvent, false, false, false); + appendSignalInOrderDependencyCounter(signalEvent, false, false, false, false); } handleInOrderDependencyCounter(signalEvent, false, false); @@ -1728,7 +1728,7 @@ ze_result_t CommandListCoreFamily::appendCopyImageBlit(uintptr_t if (!useAdditionalBlitProperties) { appendSignalEventPostWalker(signalEvent, nullptr, nullptr, false, false, true); if (this->isInOrderExecutionEnabled()) { - appendSignalInOrderDependencyCounter(signalEvent, false, false, false); + appendSignalInOrderDependencyCounter(signalEvent, false, false, false, false); } } handleInOrderDependencyCounter(signalEvent, false, false); @@ -2038,7 +2038,7 @@ ze_result_t CommandListCoreFamily::appendMemoryCopy(void *dstptr, if ((!useAdditionalBlitProperties || !isCopyOnlyEnabled) && (launchParams.isKernelSplitOperation || inOrderCopyOnlySignalingAllowed || emitPipeControl)) { dispatchInOrderPostOperationBarrier(signalEvent, dcFlush, isCopyOnlyEnabled); - appendSignalInOrderDependencyCounter(signalEvent, memoryCopyParams.copyOffloadAllowed, false, false); + appendSignalInOrderDependencyCounter(signalEvent, memoryCopyParams.copyOffloadAllowed, false, false, false); } else if (!useAdditionalBlitProperties && isCopyOnlyEnabled && Event::isAggregatedEvent(signalEvent)) { appendSignalAggregatedEventAtomic(*signalEvent); } @@ -2175,7 +2175,7 @@ ze_result_t CommandListCoreFamily::appendMemoryCopyRegion(void *d if (this->isInOrderExecutionEnabled()) { if (inOrderCopyOnlySignalingAllowed) { if (!useAdditionalBlitProperties) { - appendSignalInOrderDependencyCounter(signalEvent, memoryCopyParams.copyOffloadAllowed, false, false); + appendSignalInOrderDependencyCounter(signalEvent, memoryCopyParams.copyOffloadAllowed, false, false, false); } handleInOrderDependencyCounter(signalEvent, false, isCopyOnlyEnabled); } else if (!useAdditionalBlitProperties && isCopyOnlyEnabled && Event::isAggregatedEvent(signalEvent)) { @@ -2722,7 +2722,7 @@ ze_result_t CommandListCoreFamily::appendMemoryFill(void *ptr, if (this->isInOrderExecutionEnabled()) { if (launchParams.isKernelSplitOperation || launchParams.pipeControlSignalling) { dispatchInOrderPostOperationBarrier(signalEvent, dcFlush, isCopyOnly(false)); - appendSignalInOrderDependencyCounter(signalEvent, false, false, false); + appendSignalInOrderDependencyCounter(signalEvent, false, false, false, false); } else { nonWalkerInOrderCmdChaining = isInOrderNonWalkerSignalingRequired(signalEvent); } @@ -2840,7 +2840,7 @@ ze_result_t CommandListCoreFamily::appendBlitFill(void *ptr, cons } if (isInOrderExecutionEnabled() && isCopyOnlySignaling) { - appendSignalInOrderDependencyCounter(signalEvent, false, false, false); + appendSignalInOrderDependencyCounter(signalEvent, false, false, false, false); } handleInOrderDependencyCounter(signalEvent, false, memoryCopyParams.copyOffloadAllowed); } @@ -3094,7 +3094,7 @@ ze_result_t CommandListCoreFamily::appendSignalEvent(ze_event_han } if (this->isInOrderExecutionEnabled()) { - appendSignalInOrderDependencyCounter(event, false, false, false); + appendSignalInOrderDependencyCounter(event, false, false, false, false); } handleInOrderDependencyCounter(event, false, false); @@ -3331,7 +3331,7 @@ ze_result_t CommandListCoreFamily::appendWaitOnEvents(uint32_t nu if (apiRequest) { if (this->isInOrderExecutionEnabled()) { - appendSignalInOrderDependencyCounter(nullptr, false, false, false); + appendSignalInOrderDependencyCounter(nullptr, false, false, false, false); } handleInOrderDependencyCounter(nullptr, false, false); } @@ -3380,7 +3380,7 @@ void CommandListCoreFamily::appendSignalAggregatedEventAtomic(Eve } template -void CommandListCoreFamily::appendSignalInOrderDependencyCounter(Event *signalEvent, bool copyOffloadOperation, bool stall, bool textureFlushRequired) { +void CommandListCoreFamily::appendSignalInOrderDependencyCounter(Event *signalEvent, bool copyOffloadOperation, bool stall, bool textureFlushRequired, bool skipAggregatedEventSignaling) { using ATOMIC_OPCODES = typename GfxFamily::MI_ATOMIC::ATOMIC_OPCODES; using DATA_SIZE = typename GfxFamily::MI_ATOMIC::DATA_SIZE; @@ -3431,7 +3431,7 @@ void CommandListCoreFamily::appendSignalInOrderDependencyCounter( appendSdiInOrderCounterSignalling(inOrderExecInfo->getBaseHostGpuAddress(), signalValue, copyOffloadOperation); } - if (Event::isAggregatedEvent(signalEvent)) { + if (!skipAggregatedEventSignaling && Event::isAggregatedEvent(signalEvent)) { appendSignalAggregatedEventAtomic(*signalEvent); } @@ -3639,7 +3639,7 @@ ze_result_t CommandListCoreFamily::appendWriteGlobalTimestamp( appendSignalEventPostWalker(signalEvent, nullptr, nullptr, false, false, isCopyOnly(false)); if (this->isInOrderExecutionEnabled()) { - appendSignalInOrderDependencyCounter(signalEvent, false, false, false); + appendSignalInOrderDependencyCounter(signalEvent, false, false, false, false); } handleInOrderDependencyCounter(signalEvent, false, false); @@ -4268,7 +4268,7 @@ ze_result_t CommandListCoreFamily::appendBarrier(ze_event_handle_ appendSignalEventPostWalker(signalEvent, nullptr, nullptr, skipPipeControl, false, isCopyOnly(false)); if (isInOrderExecutionEnabled()) { - appendSignalInOrderDependencyCounter(signalEvent, false, false, false); + appendSignalInOrderDependencyCounter(signalEvent, false, false, false, false); } handleInOrderDependencyCounter(signalEvent, false, false); @@ -4421,7 +4421,7 @@ ze_result_t CommandListCoreFamily::appendWaitOnMemory(void *desc, appendSignalEventPostWalker(signalEvent, nullptr, nullptr, false, false, isCopyOnly(false)); if (this->isInOrderExecutionEnabled()) { - appendSignalInOrderDependencyCounter(signalEvent, false, false, false); + appendSignalInOrderDependencyCounter(signalEvent, false, false, false, false); } handleInOrderDependencyCounter(signalEvent, false, false); @@ -4482,7 +4482,7 @@ ze_result_t CommandListCoreFamily::appendWriteToMemory(void *desc } if (this->isInOrderExecutionEnabled()) { - appendSignalInOrderDependencyCounter(nullptr, false, false, false); + appendSignalInOrderDependencyCounter(nullptr, false, false, false, false); } handleInOrderDependencyCounter(nullptr, false, false); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_gen12lp_to_xe3.inl b/level_zero/core/source/cmdlist/cmdlist_hw_gen12lp_to_xe3.inl index e9b552a9f4..6047a010bd 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_gen12lp_to_xe3.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_gen12lp_to_xe3.inl @@ -16,7 +16,7 @@ constexpr bool CommandListCoreFamily::checkIfAllocationImportedRe } template -void CommandListCoreFamily::setAdditionalBlitProperties(NEO::BlitProperties &blitProperties, Event *signalEvent, uint32_t forceAggregatedEventIncValue, bool useAdditionalTimestamp) { +void CommandListCoreFamily::setAdditionalBlitProperties(NEO::BlitProperties &blitProperties, Event *signalEvent, uint64_t forceAggregatedEventIncValue, bool useAdditionalTimestamp) { } template diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl index 1a7f8e4f31..d69330e4a1 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl @@ -596,7 +596,7 @@ void CommandListCoreFamilyImmediate::handleInOrderNonWalkerSignal } CommandListCoreFamily::appendWaitOnSingleEvent(event, nullptr, nonWalkerSignalingHasRelaxedOrdering, false, CommandToPatch::Invalid); - CommandListCoreFamily::appendSignalInOrderDependencyCounter(event, false, false, false); + CommandListCoreFamily::appendSignalInOrderDependencyCounter(event, false, false, false, false); } template @@ -696,8 +696,8 @@ ze_result_t CommandListCoreFamilyImmediate::appendMemoryCopy( if (isSplitNeeded) { setupFlagsForBcsSplit(memoryCopyParams, hasStallingCmds, copyOffloadFlush, srcptr, dstptr, size, size); - auto splitCall = [&](CommandListCoreFamilyImmediate *subCmdList, void *dstptrParam, const void *srcptrParam, size_t sizeParam, ze_event_handle_t hSignalEventParam, uint32_t aggregatedEventInvValue) { - memoryCopyParams.forceAggregatedEventIncValue = aggregatedEventInvValue; + auto splitCall = [&](CommandListCoreFamilyImmediate *subCmdList, void *dstptrParam, const void *srcptrParam, size_t sizeParam, ze_event_handle_t hSignalEventParam, uint64_t aggregatedEventIncValue) { + memoryCopyParams.forceAggregatedEventIncValue = aggregatedEventIncValue; return subCmdList->CommandListCoreFamily::appendMemoryCopy(dstptrParam, srcptrParam, sizeParam, hSignalEventParam, 0u, nullptr, memoryCopyParams); }; @@ -752,7 +752,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendMemoryCopyRegio this->getTotalSizeForCopyRegion(srcRegion, srcPitch, srcSlicePitch), this->getTotalSizeForCopyRegion(dstRegion, dstPitch, dstSlicePitch)); - auto splitCall = [&](CommandListCoreFamilyImmediate *subCmdList, uint32_t dstOriginXParam, uint32_t srcOriginXParam, size_t sizeParam, ze_event_handle_t hSignalEventParam, uint32_t aggregatedEventInvValue) { + auto splitCall = [&](CommandListCoreFamilyImmediate *subCmdList, uint32_t dstOriginXParam, uint32_t srcOriginXParam, size_t sizeParam, ze_event_handle_t hSignalEventParam, uint64_t aggregatedEventIncValue) { ze_copy_region_t dstRegionLocal = {}; ze_copy_region_t srcRegionLocal = {}; memcpy(&dstRegionLocal, dstRegion, sizeof(ze_copy_region_t)); @@ -761,7 +761,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendMemoryCopyRegio dstRegionLocal.width = static_cast(sizeParam); srcRegionLocal.originX = srcOriginXParam; srcRegionLocal.width = static_cast(sizeParam); - memoryCopyParams.forceAggregatedEventIncValue = aggregatedEventInvValue; + memoryCopyParams.forceAggregatedEventIncValue = aggregatedEventIncValue; return subCmdList->CommandListCoreFamily::appendMemoryCopyRegion(dstPtr, &dstRegionLocal, dstPitch, dstSlicePitch, srcPtr, &srcRegionLocal, srcPitch, srcSlicePitch, hSignalEventParam, 0u, nullptr, memoryCopyParams); @@ -838,8 +838,8 @@ ze_result_t CommandListCoreFamilyImmediate::appendPageFaultCopy(N setupFlagsForBcsSplit(bcsSplitMemoryCopyParams, hasStallingCmds, copyOffloadFlush, srcAddress, dstAddress, size, size); - auto splitCall = [&](CommandListCoreFamilyImmediate *subCmdList, void *dstAddressParam, const void *srcAddressParam, size_t sizeParam, ze_event_handle_t hSignalEventParam, uint32_t aggregatedEventInvValue) { - bcsSplitMemoryCopyParams.forceAggregatedEventIncValue = aggregatedEventInvValue; + auto splitCall = [&](CommandListCoreFamilyImmediate *subCmdList, void *dstAddressParam, const void *srcAddressParam, size_t sizeParam, ze_event_handle_t hSignalEventParam, uint64_t aggregatedEventIncValue) { + bcsSplitMemoryCopyParams.forceAggregatedEventIncValue = aggregatedEventIncValue; return subCmdList->CommandListCoreFamily::appendMemoryCopy(dstAddressParam, srcAddressParam, sizeParam, hSignalEventParam, 0u, nullptr, bcsSplitMemoryCopyParams); }; @@ -1377,7 +1377,7 @@ template ze_result_t CommandListCoreFamilyImmediate::flushInOrderCounterSignal(bool waitOnInOrderCounterRequired) { ze_result_t ret = ZE_RESULT_SUCCESS; if (waitOnInOrderCounterRequired && !this->isHeaplessModeEnabled() && this->latestOperationHasOptimizedCbEvent) { - this->appendSignalInOrderDependencyCounter(nullptr, false, true, false); + this->appendSignalInOrderDependencyCounter(nullptr, false, true, false, false); this->inOrderExecInfo->addCounterValue(this->getInOrderIncrementValue()); this->handleInOrderCounterOverflow(false); ret = flushImmediate(ret, false, true, false, NEO::AppendOperations::nonKernel, false, nullptr, false, nullptr, nullptr); @@ -1865,6 +1865,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendCommandLists(ui CommandListCoreFamily::appendSignalInOrderDependencyCounter(signalEvent, copyOffloadOperation, false, + false, false); } diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl index 7fb79469e9..0ae95925c9 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl @@ -507,12 +507,12 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(K if (compactEvent && compactEvent->isCounterBased()) { auto pcCmdPtr = this->commandContainer.getCommandStream()->getSpace(0u); inOrderCounterValue = this->inOrderExecInfo->getCounterValue() + getInOrderIncrementValue(); - appendSignalInOrderDependencyCounter(eventForInOrderExec, false, true, textureFlushRequired); + appendSignalInOrderDependencyCounter(eventForInOrderExec, false, true, textureFlushRequired, false); addCmdForPatching(nullptr, pcCmdPtr, nullptr, inOrderCounterValue, NEO::InOrderPatchCommandHelpers::PatchCmdType::pipeControl); textureFlushRequired = false; } else { appendWaitOnSingleEvent(eventForInOrderExec, launchParams.outListCommands, false, false, CommandToPatch::CbEventTimestampPostSyncSemaphoreWait); - appendSignalInOrderDependencyCounter(eventForInOrderExec, false, false, false); + appendSignalInOrderDependencyCounter(eventForInOrderExec, false, false, false, false); } } else { this->latestOperationHasOptimizedCbEvent = true; diff --git a/level_zero/core/source/cmdlist/cmdlist_memory_copy_params.h b/level_zero/core/source/cmdlist/cmdlist_memory_copy_params.h index 1fa811e71c..ee51756b91 100644 --- a/level_zero/core/source/cmdlist/cmdlist_memory_copy_params.h +++ b/level_zero/core/source/cmdlist/cmdlist_memory_copy_params.h @@ -14,11 +14,11 @@ namespace L0 { struct CmdListMemoryCopyParams { + uint64_t forceAggregatedEventIncValue = 0; const void *bcsSplitBaseSrcPtr = nullptr; void *bcsSplitBaseDstPtr = nullptr; size_t bcsSplitTotalSrcSize = 0; size_t bcsSplitTotalDstSize = 0; - uint32_t forceAggregatedEventIncValue = 0; bool relaxedOrderingDispatch = false; bool forceDisableCopyOnlyInOrderSignaling = false; bool copyOffloadAllowed = false; diff --git a/level_zero/core/source/device/bcs_split.h b/level_zero/core/source/device/bcs_split.h index 70837844ef..b22a813a74 100644 --- a/level_zero/core/source/device/bcs_split.h +++ b/level_zero/core/source/device/bcs_split.h @@ -30,7 +30,7 @@ struct DeviceImp; struct BcsSplit { template - using AppendCallFuncT = std::function *, T, K, size_t, ze_event_handle_t, uint32_t)>; + using AppendCallFuncT = std::function *, T, K, size_t, ze_event_handle_t, uint64_t)>; using CsrContainer = StackVec; DeviceImp &device; @@ -81,15 +81,30 @@ struct BcsSplit { NEO::TransferDirection direction, size_t estimatedCmdBufferSize, AppendCallFuncT appendCall) { + constexpr size_t maxEventCountInPool = MemoryConstants::pageSize64k / sizeof(typename CommandListCoreFamilyImmediate::GfxFamily::TimestampPacketType); + + const auto aggregatedEventsMode = this->events.aggregatedEventsMode; + auto signalEvent = Event::fromHandle(hSignalEvent); + ze_result_t result = ZE_RESULT_SUCCESS; + auto &cmdListsForSplit = this->getCmdListsForSplit(direction); + auto engineCount = cmdListsForSplit.size(); + size_t markerEventIndex = 0; + uint64_t aggregatedEventIncrementVal = 1; - auto markerEventIndexRet = this->events.obtainForSplit(Context::fromHandle(cmdList->getCmdListContext()), MemoryConstants::pageSize64k / sizeof(typename CommandListCoreFamilyImmediate::GfxFamily::TimestampPacketType)); - if (!markerEventIndexRet.has_value()) { - return ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY; + const bool useSignalEventForSubcopy = aggregatedEventsMode && cmdList->isUsingAdditionalBlitProperties() && Event::isAggregatedEvent(signalEvent) && + (signalEvent->getInOrderIncrementValue() % engineCount == 0); + + if (useSignalEventForSubcopy) { + aggregatedEventIncrementVal = signalEvent->getInOrderIncrementValue() / engineCount; + } else { + auto markerEventIndexRet = this->events.obtainForSplit(Context::fromHandle(cmdList->getCmdListContext()), maxEventCountInPool); + if (!markerEventIndexRet.has_value()) { + return ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY; + } + markerEventIndex = *markerEventIndexRet; } - auto markerEventIndex = *markerEventIndexRet; - auto barrierRequired = !cmdList->isInOrderExecutionEnabled() && cmdList->isBarrierRequired(); if (barrierRequired) { cmdList->appendSignalEvent(this->events.barrier[markerEventIndex]->toHandle(), false); @@ -98,18 +113,11 @@ struct BcsSplit { auto subcopyEventIndex = markerEventIndex * this->cmdLists.size(); StackVec eventHandles; - auto &cmdListsForSplit = this->getCmdListsForSplit(direction); - - auto signalEvent = Event::fromHandle(hSignalEvent); - if (!cmdList->handleCounterBasedEventOperations(signalEvent, false)) { return ZE_RESULT_ERROR_INVALID_ARGUMENT; } - const auto aggregatedEventsMode = this->events.aggregatedEventsMode; - auto totalSize = size; - auto engineCount = cmdListsForSplit.size(); for (size_t i = 0; i < cmdListsForSplit.size(); i++) { auto subCmdList = static_cast *>(cmdListsForSplit[i]); @@ -126,7 +134,7 @@ struct BcsSplit { } subCmdList->addEventsToCmdList(numWaitEvents, phWaitEvents, nullptr, hasRelaxedOrderingDependencies, false, false, false, false); - if (signalEvent && i == 0u) { + if (!useSignalEventForSubcopy && signalEvent && i == 0u) { subCmdList->appendEventForProfilingAllWalkers(signalEvent, nullptr, nullptr, true, true, false, true); } @@ -135,8 +143,8 @@ struct BcsSplit { auto localSrcPtr = ptrOffset(srcptr, size - totalSize); auto copyEventIndex = aggregatedEventsMode ? markerEventIndex : subcopyEventIndex + i; - auto eventHandle = this->events.subcopy[copyEventIndex]->toHandle(); - result = appendCall(subCmdList, localDstPtr, localSrcPtr, localSize, eventHandle, 1); + auto eventHandle = useSignalEventForSubcopy ? signalEvent : this->events.subcopy[copyEventIndex]->toHandle(); + result = appendCall(subCmdList, localDstPtr, localSrcPtr, localSize, eventHandle, aggregatedEventIncrementVal); subCmdList->flushImmediate(result, true, !hasRelaxedOrderingDependencies, hasRelaxedOrderingDependencies, NEO::AppendOperations::nonKernel, false, nullptr, true, nullptr, nullptr); if ((aggregatedEventsMode && i == 0) || !aggregatedEventsMode) { @@ -157,7 +165,7 @@ struct BcsSplit { const auto isCopyCmdList = cmdList->isCopyOnly(dualStreamCopyOffload); - if (signalEvent) { + if (!useSignalEventForSubcopy && signalEvent) { cmdList->appendSignalEventPostWalker(signalEvent, nullptr, nullptr, !isCopyCmdList, false, isCopyCmdList); } @@ -166,11 +174,11 @@ struct BcsSplit { } if (cmdList->isInOrderExecutionEnabled()) { - cmdList->appendSignalInOrderDependencyCounter(signalEvent, dualStreamCopyOffload, false, false); + cmdList->appendSignalInOrderDependencyCounter(signalEvent, dualStreamCopyOffload, false, false, useSignalEventForSubcopy); } cmdList->handleInOrderDependencyCounter(signalEvent, false, dualStreamCopyOffload); - if (aggregatedEventsMode) { + if (aggregatedEventsMode && !useSignalEventForSubcopy) { cmdList->assignInOrderExecInfoToEvent(this->events.marker[markerEventIndex]); } diff --git a/level_zero/core/source/gen12lp/cmdlist_gen12lp.cpp b/level_zero/core/source/gen12lp/cmdlist_gen12lp.cpp index f56c3ee8b0..0ab60112cf 100644 --- a/level_zero/core/source/gen12lp/cmdlist_gen12lp.cpp +++ b/level_zero/core/source/gen12lp/cmdlist_gen12lp.cpp @@ -322,7 +322,7 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(K NEO::MemorySynchronizationCommands::addSingleBarrier(*commandContainer.getCommandStream(), args); } - appendSignalInOrderDependencyCounter(event, false, false, false); + appendSignalInOrderDependencyCounter(event, false, false, false, false); } return ZE_RESULT_SUCCESS; diff --git a/level_zero/core/test/unit_tests/fixtures/in_order_cmd_list_fixture.h b/level_zero/core/test/unit_tests/fixtures/in_order_cmd_list_fixture.h index 7e0736b841..773291d10d 100644 --- a/level_zero/core/test/unit_tests/fixtures/in_order_cmd_list_fixture.h +++ b/level_zero/core/test/unit_tests/fixtures/in_order_cmd_list_fixture.h @@ -7,17 +7,20 @@ #pragma once +#include "shared/source/command_stream/transfer_direction.h" #include "shared/test/common/cmd_parse/gen_cmd_parse.h" #include "shared/test/common/helpers/engine_descriptor_helper.h" #include "shared/test/common/mocks/mock_os_context.h" #include "shared/test/common/test_macros/hw_test.h" #include "level_zero/core/source/cmdlist/cmdlist_memory_copy_params.h" +#include "level_zero/core/source/device/bcs_split.h" #include "level_zero/core/source/event/event_imp.h" #include "level_zero/core/source/gfx_core_helpers/l0_gfx_core_helper.h" #include "level_zero/core/test/unit_tests/fixtures/module_fixture.h" #include "level_zero/core/test/unit_tests/mocks/mock_cmdlist.h" #include "level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h" +#include "level_zero/core/test/unit_tests/mocks/mock_event.h" #include "level_zero/core/test/unit_tests/sources/helper/ze_object_utils.h" #include "level_zero/driver_experimental/zex_api.h" @@ -372,5 +375,130 @@ struct MultiTileSynchronizedDispatchFixture : public MultiTileInOrderCmdListFixt } }; +struct AggregatedBcsSplitTests : public ::testing::Test { + using MockEvent = WhiteBox>; + + void SetUp() override { + debugManager.flags.SplitBcsAggregatedEventsMode.set(1); + debugManager.flags.SplitBcsCopy.set(1); + debugManager.flags.SplitBcsRequiredTileCount.set(expectedTileCount); + debugManager.flags.SplitBcsRequiredEnginesCount.set(expectedEnginesCount); + debugManager.flags.SplitBcsMask.set(0b11110); + debugManager.flags.SplitBcsTransferDirectionMask.set(transferDirectionMask); + + createDevice(); + context = Context::fromHandle(driverHandle->getDefaultContext()); + cmdList = createCmdList(true); + } + + void createDevice() { + auto hwInfo = *NEO::defaultHwInfo; + hwInfo.featureTable.ftrBcsInfo = 0b111111111; + hwInfo.capabilityTable.blitterOperationsSupported = true; + auto neoDevice = NEO::MockDevice::createWithNewExecutionEnvironment(&hwInfo, 0); + + NEO::DeviceVector devices; + devices.push_back(std::unique_ptr(neoDevice)); + + for (uint32_t i = 1; i < expectedNumRootDevices; i++) { + auto neoRootDevice = NEO::MockDevice::createWithExecutionEnvironment(&hwInfo, neoDevice->getExecutionEnvironment(), i); + devices.push_back(std::unique_ptr(neoRootDevice)); + } + + driverHandle = std::make_unique>(); + driverHandle->initialize(std::move(devices)); + + this->device = driverHandle->devices[0]; + + bcsSplit = static_cast(device)->bcsSplit.get(); + } + + uint32_t queryCopyOrdinal() { + uint32_t count = 0; + device->getCommandQueueGroupProperties(&count, nullptr); + + std::vector groups; + groups.resize(count); + + device->getCommandQueueGroupProperties(&count, groups.data()); + + for (uint32_t i = 0; i < count; i++) { + if (groups[i].flags == ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) { + return i; + } + } + + EXPECT_TRUE(false); + return 0; + } + + DestroyableZeUniquePtr createCmdList(bool copyOnly) { + ze_result_t returnValue; + + ze_command_queue_desc_t desc = {}; + desc.flags = ZE_COMMAND_QUEUE_FLAG_IN_ORDER; + desc.ordinal = copyOnly ? queryCopyOrdinal() : 0; + + DestroyableZeUniquePtr commandList(CommandList::createImmediate(productFamily, + device, + &desc, + false, + copyOnly ? NEO::EngineGroupType::copy : NEO::EngineGroupType::compute, + returnValue)); + + *static_cast(commandList.get())->getInOrderExecInfo()->getBaseHostAddress() = std::numeric_limits::max(); + + return commandList; + } + + void *allocHostMem() { + void *alloc = nullptr; + ze_host_mem_alloc_desc_t deviceDesc = {}; + context->allocHostMem(&deviceDesc, copySize, 4096, &alloc); + + return alloc; + } + + void *allocDeviceMem(L0::Device *device) { + void *alloc = nullptr; + ze_device_mem_alloc_desc_t deviceDesc = {}; + ze_result_t result = context->allocDeviceMem(device->toHandle(), &deviceDesc, copySize, 4096u, &alloc); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + return alloc; + } + + DestroyableZeUniquePtr createExternalSyncStorageEvent(uint64_t counterValue, uint64_t incrementValue, uint64_t *deviceAddress) { + ze_event_handle_t outEvent = nullptr; + zex_counter_based_event_external_storage_properties_t externalStorageAllocProperties = {ZEX_STRUCTURE_COUNTER_BASED_EVENT_EXTERNAL_STORAGE_ALLOC_PROPERTIES}; + externalStorageAllocProperties.completionValue = counterValue; + externalStorageAllocProperties.deviceAddress = deviceAddress; + externalStorageAllocProperties.incrementValue = incrementValue; + + zex_counter_based_event_desc_t counterBasedDesc = {ZEX_STRUCTURE_COUNTER_BASED_EVENT_DESC}; + counterBasedDesc.flags = ZEX_COUNTER_BASED_EVENT_FLAG_IMMEDIATE | ZEX_COUNTER_BASED_EVENT_FLAG_NON_IMMEDIATE; + counterBasedDesc.pNext = &externalStorageAllocProperties; + + EXPECT_EQ(ZE_RESULT_SUCCESS, zexCounterBasedEventCreate2(context, device, &counterBasedDesc, &outEvent)); + + auto eventObj = static_cast(Event::fromHandle(outEvent)); + + return DestroyableZeUniquePtr(eventObj); + } + + DebugManagerStateRestore restore; + CmdListMemoryCopyParams copyParams = {}; + std::unique_ptr> driverHandle; + L0::Device *device = nullptr; + DestroyableZeUniquePtr cmdList; + BcsSplit *bcsSplit = nullptr; + Context *context = nullptr; + const size_t copySize = 4 * MemoryConstants::megaByte; + const int32_t transferDirectionMask = ~(1 << static_cast(TransferDirection::localToLocal)); + + uint32_t expectedTileCount = 1; + uint32_t expectedEnginesCount = 4; + uint32_t expectedNumRootDevices = 1; +}; + } // namespace ult } // namespace L0 diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp index 6869455a22..4c10788731 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp @@ -108,7 +108,7 @@ class MockCommandListHw : public WhiteBox<::L0::CommandListCoreFamily>; using BaseClass::useAdditionalBlitProperties; - void setAdditionalBlitProperties(NEO::BlitProperties &blitProperties, Event *signalEvent, uint32_t forceAggregatedEventIncValue, bool useAdditionalTimestamp) override { + void setAdditionalBlitProperties(NEO::BlitProperties &blitProperties, L0::Event *signalEvent, uint64_t forceAggregatedEventIncValue, bool useAdditionalTimestamp) override { additionalBlitPropertiesCalled++; BaseClass::setAdditionalBlitProperties(blitProperties, signalEvent, forceAggregatedEventIncValue, useAdditionalTimestamp); } - void appendSignalInOrderDependencyCounter(Event *signalEvent, bool copyOffloadOperation, bool stall, bool textureFlushRequired) override { + void appendSignalInOrderDependencyCounter(L0::Event *signalEvent, bool copyOffloadOperation, bool stall, bool textureFlushRequired, bool skipAggregatedEventSignaling) override { appendSignalInOrderDependencyCounterCalled++; - BaseClass::appendSignalInOrderDependencyCounter(signalEvent, copyOffloadOperation, stall, textureFlushRequired); + BaseClass::appendSignalInOrderDependencyCounter(signalEvent, copyOffloadOperation, stall, textureFlushRequired, skipAggregatedEventSignaling); } uint32_t additionalBlitPropertiesCalled = 0; uint32_t appendSignalInOrderDependencyCounterCalled = 0; @@ -1003,111 +1002,6 @@ HWTEST2_F(AppendMemoryCopyTests, givenCopyOnlyCommandListWithUseAdditionalBlitPr context->freeMem(dstBuffer); } -struct AggregatedBcsSplitTests : public ::testing::Test { - void SetUp() override { - debugManager.flags.SplitBcsAggregatedEventsMode.set(1); - debugManager.flags.SplitBcsCopy.set(1); - debugManager.flags.SplitBcsRequiredTileCount.set(expectedTileCount); - debugManager.flags.SplitBcsRequiredEnginesCount.set(expectedEnginesCount); - debugManager.flags.SplitBcsMask.set(0b11110); - debugManager.flags.SplitBcsTransferDirectionMask.set(transferDirectionMask); - - createDevice(); - context = Context::fromHandle(driverHandle->getDefaultContext()); - cmdList = createCmdList(true); - } - - void createDevice() { - auto hwInfo = *NEO::defaultHwInfo; - hwInfo.featureTable.ftrBcsInfo = 0b111111111; - hwInfo.capabilityTable.blitterOperationsSupported = true; - auto neoDevice = NEO::MockDevice::createWithNewExecutionEnvironment(&hwInfo, 0); - - NEO::DeviceVector devices; - devices.push_back(std::unique_ptr(neoDevice)); - - for (uint32_t i = 1; i < expectedNumRootDevices; i++) { - auto neoRootDevice = NEO::MockDevice::createWithExecutionEnvironment(&hwInfo, neoDevice->getExecutionEnvironment(), i); - devices.push_back(std::unique_ptr(neoRootDevice)); - } - - driverHandle = std::make_unique>(); - driverHandle->initialize(std::move(devices)); - - this->device = driverHandle->devices[0]; - - bcsSplit = static_cast(device)->bcsSplit.get(); - } - - uint32_t queryCopyOrdinal() { - uint32_t count = 0; - device->getCommandQueueGroupProperties(&count, nullptr); - - std::vector groups; - groups.resize(count); - - device->getCommandQueueGroupProperties(&count, groups.data()); - - for (uint32_t i = 0; i < count; i++) { - if (groups[i].flags == ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) { - return i; - } - } - - EXPECT_TRUE(false); - return 0; - } - - DestroyableZeUniquePtr createCmdList(bool copyOnly) { - ze_result_t returnValue; - - ze_command_queue_desc_t desc = {}; - desc.flags = ZE_COMMAND_QUEUE_FLAG_IN_ORDER; - desc.ordinal = copyOnly ? queryCopyOrdinal() : 0; - - DestroyableZeUniquePtr commandList(CommandList::createImmediate(productFamily, - device, - &desc, - false, - copyOnly ? NEO::EngineGroupType::copy : NEO::EngineGroupType::compute, - returnValue)); - - *static_cast(commandList.get())->getInOrderExecInfo()->getBaseHostAddress() = std::numeric_limits::max(); - - return commandList; - } - - void *allocHostMem() { - void *alloc = nullptr; - ze_host_mem_alloc_desc_t deviceDesc = {}; - context->allocHostMem(&deviceDesc, copySize, 4096, &alloc); - - return alloc; - } - - void *allocDeviceMem(L0::Device *device) { - void *alloc = nullptr; - ze_device_mem_alloc_desc_t deviceDesc = {}; - ze_result_t result = context->allocDeviceMem(device->toHandle(), &deviceDesc, copySize, 4096u, &alloc); - EXPECT_EQ(ZE_RESULT_SUCCESS, result); - return alloc; - } - - DebugManagerStateRestore restore; - CmdListMemoryCopyParams copyParams = {}; - std::unique_ptr> driverHandle; - L0::Device *device = nullptr; - DestroyableZeUniquePtr cmdList; - BcsSplit *bcsSplit = nullptr; - Context *context = nullptr; - const size_t copySize = 4 * MemoryConstants::megaByte; - const int32_t transferDirectionMask = ~(1 << static_cast(TransferDirection::localToLocal)); - - uint32_t expectedTileCount = 1; - uint32_t expectedEnginesCount = 4; - uint32_t expectedNumRootDevices = 1; -}; - HWTEST2_F(AggregatedBcsSplitTests, givenLimitedEnginesCountWhenCreatingBcsSplitThenCreateCorrectQueues, IsAtLeastXeHpcCore) { expectedEnginesCount = 2; debugManager.flags.SplitBcsRequiredEnginesCount.set(expectedEnginesCount); @@ -1178,6 +1072,71 @@ HWTEST2_F(AggregatedBcsSplitTests, givenCopyOffloadEnabledWhenAppendWithEventCal context->freeMem(ptr); } +HWTEST2_F(AggregatedBcsSplitTests, givenAggregatedEventWithMatchingCounterValueWhenAppendCopyCalledThenDontUseSubCopyEvents, IsAtLeastXeHpcCore) { + using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; + using MI_ATOMIC = typename FamilyType::MI_ATOMIC; + + auto cmdListHw = static_cast *>(cmdList.get()); + + auto ptr = allocHostMem(); + auto devAddress = castToUint64(allocDeviceMem(device)); + + uint64_t incValue = 5 * bcsSplit->cmdLists.size(); + uint64_t finalValue = 9 * incValue; + + auto event = createExternalSyncStorageEvent(finalValue, incValue, reinterpret_cast(devAddress)); + + auto mainCmdStream = cmdListHw->getCmdContainer().getCommandStream(); + auto mainOffset = mainCmdStream->getUsed(); + + cmdListHw->appendMemoryCopy(ptr, ptr, copySize, event->toHandle(), 0, nullptr, copyParams); + + EXPECT_EQ(cmdListHw->isUsingAdditionalBlitProperties(), bcsSplit->events.subcopy.empty()); + EXPECT_EQ(cmdListHw->isUsingAdditionalBlitProperties(), bcsSplit->events.marker.empty()); + + GenCmdList genCmdList; + ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(genCmdList, ptrOffset(mainCmdStream->getCpuBase(), mainOffset), (mainCmdStream->getUsed() - mainOffset))); + + bool miFlushFound = false; + auto itor = find(genCmdList.begin(), genCmdList.end()); + while (itor != genCmdList.end()) { + auto miFlushCmd = genCmdCast(*itor); + ASSERT_NE(nullptr, miFlushCmd); + if (devAddress == miFlushCmd->getDestinationAddress()) { + miFlushFound = true; + break; + } + + itor = find(++itor, genCmdList.end()); + } + + bool miAtomicFound = false; + itor = find(genCmdList.begin(), genCmdList.end()); + while (itor != genCmdList.end()) { + auto miAtomicCmd = genCmdCast(*itor); + ASSERT_NE(nullptr, miAtomicCmd); + if (devAddress == miAtomicCmd->getMemoryAddress()) { + miAtomicFound = true; + break; + } + itor = find(++itor, genCmdList.end()); + } + + bool found = miFlushFound || miAtomicFound; + + EXPECT_NE(cmdListHw->isUsingAdditionalBlitProperties(), found); + + auto event2 = createExternalSyncStorageEvent((incValue + 1) * 9, incValue + 1, reinterpret_cast(devAddress)); + + cmdListHw->appendMemoryCopy(ptr, ptr, copySize, event2->toHandle(), 0, nullptr, copyParams); + + EXPECT_FALSE(bcsSplit->events.subcopy.empty()); + EXPECT_FALSE(bcsSplit->events.marker.empty()); + + context->freeMem(ptr); + context->freeMem(reinterpret_cast(devAddress)); +} + HWTEST2_F(AggregatedBcsSplitTests, givenCopyOffloadEnabledWhenAppendThenUseCopyQueue, IsAtLeastXeHpcCore) { if (device->getProductHelper().isDcFlushAllowed()) { GTEST_SKIP(); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_memory_extension.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_memory_extension.cpp index 8f6e3a54aa..49343fab4c 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_memory_extension.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_memory_extension.cpp @@ -131,7 +131,7 @@ class MockCommandListExtensionHw : public WhiteBox<::L0::CommandListCoreFamilygetCmdContainer().getCommandStream(); immCmdList->inOrderAtomicSignalingEnabled = false; - immCmdList->appendSignalInOrderDependencyCounter(eventObj.get(), false, false, false); + immCmdList->appendSignalInOrderDependencyCounter(eventObj.get(), false, false, false, false); GenCmdList cmdList; ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(cmdList, cmdStream->getCpuBase(), cmdStream->getUsed())); @@ -6025,7 +6025,7 @@ HWTEST_F(InOrderCmdListTests, givenCopyOnlyCmdListAndDebugFlagWhenCounterSignale auto cmdStream = immCmdList->getCmdContainer().getCommandStream(); auto offset = cmdStream->getUsed(); - immCmdList->appendSignalInOrderDependencyCounter(nullptr, false, false, false); + immCmdList->appendSignalInOrderDependencyCounter(nullptr, false, false, false, false); { GenCmdList cmdList; @@ -6049,7 +6049,7 @@ HWTEST_F(InOrderCmdListTests, givenCopyOnlyCmdListAndDebugFlagWhenCounterSignale debugManager.flags.InOrderCopyMiFlushSync.set(0); offset = cmdStream->getUsed(); - immCmdList->appendSignalInOrderDependencyCounter(nullptr, false, false, false); + immCmdList->appendSignalInOrderDependencyCounter(nullptr, false, false, false, false); { GenCmdList cmdList; diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_2.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_2.cpp index 276969bcd6..7bfe4ac574 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_2.cpp @@ -3859,7 +3859,7 @@ HWTEST2_F(MultiTileInOrderCmdListTests, givenMultiTileInOrderModeWhenSignalingSy auto cmdStream = immCmdList->getCmdContainer().getCommandStream(); immCmdList->inOrderAtomicSignalingEnabled = false; - immCmdList->appendSignalInOrderDependencyCounter(nullptr, false, false, false); + immCmdList->appendSignalInOrderDependencyCounter(nullptr, false, false, false, false); GenCmdList cmdList; ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(cmdList, cmdStream->getCpuBase(), cmdStream->getUsed()));