From 201324f804319cfb0ed6f064609f3e47bcb2384f Mon Sep 17 00:00:00 2001 From: Bartosz Dunajski Date: Wed, 30 Apr 2025 13:49:08 +0000 Subject: [PATCH] feature: wait path improvements for dual stream offload Related-To: NEO-7067 Signed-off-by: Bartosz Dunajski --- level_zero/core/source/cmdlist/cmdlist.cpp | 2 +- level_zero/core/source/cmdlist/cmdlist.h | 3 + level_zero/core/source/cmdlist/cmdlist_hw.h | 10 +- level_zero/core/source/cmdlist/cmdlist_hw.inl | 97 ++++++++++++------- .../source/cmdlist/cmdlist_hw_immediate.h | 1 + .../source/cmdlist/cmdlist_hw_immediate.inl | 2 +- .../core/test/unit_tests/mocks/mock_cmdlist.h | 5 +- .../sources/cmdlist/test_cmdlist_2.cpp | 10 +- .../sources/cmdlist/test_cmdlist_blit.cpp | 2 +- .../cmdlist/test_cmdlist_memory_extension.cpp | 2 +- .../cmdlist/test_in_order_cmdlist_1.cpp | 4 +- .../cmdlist/test_in_order_cmdlist_2.cpp | 88 +++++++++++++++++ 12 files changed, 171 insertions(+), 55 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist.cpp b/level_zero/core/source/cmdlist/cmdlist.cpp index c9b80ab192..f49e956e74 100644 --- a/level_zero/core/source/cmdlist/cmdlist.cpp +++ b/level_zero/core/source/cmdlist/cmdlist.cpp @@ -230,7 +230,7 @@ void CommandList::synchronizeEventList(uint32_t numWaitEvents, ze_event_handle_t } NEO::CommandStreamReceiver *CommandList::getCsr(bool copyOffload) const { - auto queue = (getCopyOffloadModeForOperation(copyOffload) == CopyOffloadModes::dualStream) ? this->cmdQImmediateCopyOffload : this->cmdQImmediate; + auto queue = isDualStreamCopyOffloadOperation(copyOffload) ? 
this->cmdQImmediateCopyOffload : this->cmdQImmediate; return static_cast(queue)->getCsr(); } diff --git a/level_zero/core/source/cmdlist/cmdlist.h b/level_zero/core/source/cmdlist/cmdlist.h index b2f0238f94..27d3a88e09 100644 --- a/level_zero/core/source/cmdlist/cmdlist.h +++ b/level_zero/core/source/cmdlist/cmdlist.h @@ -448,6 +448,9 @@ struct CommandList : _ze_command_list_handle_t { } MOCKABLE_VIRTUAL void synchronizeEventList(uint32_t numWaitEvents, ze_event_handle_t *waitEventList); + bool isDualStreamCopyOffloadOperation(bool offloadOperation) const { return (getCopyOffloadModeForOperation(offloadOperation) == CopyOffloadModes::dualStream); } + bool isNonDualStreamCopyOffloadOperation(bool offloadOperation) const { return offloadOperation && !isDualStreamCopyOffloadOperation(offloadOperation); } + std::map hostPtrMap; NEO::PrivateAllocsToReuseContainer ownedPrivateAllocations; std::vector patternAllocations; diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h index 48ad18fe87..56e93f589b 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw.h @@ -195,7 +195,7 @@ struct CommandListCoreFamily : public CommandListImp { bool relaxedOrderingAllowed, bool trackDependencies, bool apiRequest, bool skipAddingWaitEventsToResidency, bool skipFlush, bool copyOffloadOperation) override; void appendWaitOnInOrderDependency(std::shared_ptr &inOrderExecInfo, CommandToPatchContainer *outListCommands, uint64_t waitValue, uint32_t offset, bool relaxedOrderingAllowed, bool implicitDependency, - bool skipAddingWaitEventsToResidency, bool noopDispatch, bool copyOffloadOperation); + bool skipAddingWaitEventsToResidency, bool noopDispatch, bool dualStreamCopyOffloadOperation); void appendSignalInOrderDependencyCounter(Event *signalEvent, bool copyOffloadOperation, bool stall, bool textureFlushRequired); void handleInOrderDependencyCounter(Event *signalEvent, bool 
nonWalkerInOrderCmdsChaining, bool copyOffloadOperation); void handleInOrderCounterOverflow(bool copyOffloadOperation); @@ -209,7 +209,7 @@ struct CommandListCoreFamily : public CommandListImp { void appendMultiPartitionEpilogue() override; void appendEventForProfilingAllWalkers(Event *event, void **syncCmdBuffer, CommandToPatchContainer *outTimeStampSyncCmds, bool beforeWalker, bool singlePacketEvent, bool skipAddingEventToResidency, bool copyOperation); ze_result_t addEventsToCmdList(uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, CommandToPatchContainer *outWaitCmds, - bool relaxedOrderingAllowed, bool trackDependencies, bool waitForImplicitInOrderDependency, bool skipAddingWaitEventsToResidency, bool copyOffloadOperation); + bool relaxedOrderingAllowed, bool trackDependencies, bool waitForImplicitInOrderDependency, bool skipAddingWaitEventsToResidency, bool dualStreamCopyOffloadOperation); MOCKABLE_VIRTUAL void appendSynchronizedDispatchInitializationSection(); MOCKABLE_VIRTUAL void appendSynchronizedDispatchCleanupSection(); @@ -251,7 +251,7 @@ struct CommandListCoreFamily : public CommandListImp { size_t dstRowPitch, size_t dstSlicePitch, const Vec3 &srcSize, const Vec3 &dstSize, Event *signalEvent, - uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch); + uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch, bool dualStreamCopyOffload); MOCKABLE_VIRTUAL ze_result_t appendMemoryCopyKernel2d(AlignedAllocationData *dstAlignedAllocation, AlignedAllocationData *srcAlignedAllocation, Builtin builtin, const ze_copy_region_t *dstRegion, @@ -299,7 +299,7 @@ struct CommandListCoreFamily : public CommandListImp { Event *signalEvent, CmdListKernelLaunchParams &launchParams); - void appendWaitOnSingleEvent(Event *event, CommandToPatchContainer *outWaitCmds, bool relaxedOrderingAllowed, bool copyOffloadOperation, CommandToPatch::CommandType storedSemaphore); + void appendWaitOnSingleEvent(Event 
*event, CommandToPatchContainer *outWaitCmds, bool relaxedOrderingAllowed, bool dualStreamCopyOffload, CommandToPatch::CommandType storedSemaphore); void appendSdiInOrderCounterSignalling(uint64_t baseGpuVa, uint64_t signalValue, bool copyOffloadOperation); @@ -383,7 +383,7 @@ struct CommandListCoreFamily : public CommandListImp { virtual bool isRelaxedOrderingDispatchAllowed(uint32_t numWaitEvents, bool copyOffload) { return false; } virtual void setupFlushMethod(const NEO::RootDeviceEnvironment &rootDeviceEnvironment) {} bool canSkipInOrderEventWait(Event &event, bool ignorCbEventBoundToCmdList) const; - bool handleInOrderImplicitDependencies(bool relaxedOrderingAllowed, bool copyOffloadOperation); + bool handleInOrderImplicitDependencies(bool relaxedOrderingAllowed, bool dualStreamCopyOffloadOperation); bool isQwordInOrderCounter() const { return GfxFamily::isQwordInOrderCounter; } bool isInOrderNonWalkerSignalingRequired(const Event *event) const; bool hasInOrderDependencies() const; diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index ca12edc648..70a9aaa804 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -195,7 +195,8 @@ void CommandListCoreFamily::handleInOrderDependencyCounter(Event template void CommandListCoreFamily::handleInOrderCounterOverflow(bool copyOffloadOperation) { if (!isQwordInOrderCounter() && ((inOrderExecInfo->getCounterValue() + 1) == std::numeric_limits::max())) { - CommandListCoreFamily::appendWaitOnInOrderDependency(inOrderExecInfo, nullptr, inOrderExecInfo->getCounterValue() + 1, inOrderExecInfo->getAllocationOffset(), false, true, false, false, copyOffloadOperation); + CommandListCoreFamily::appendWaitOnInOrderDependency(inOrderExecInfo, nullptr, inOrderExecInfo->getCounterValue() + 1, inOrderExecInfo->getAllocationOffset(), false, true, false, false, + isDualStreamCopyOffloadOperation(copyOffloadOperation)); 
inOrderExecInfo->resetCounterValue(); @@ -1459,7 +1460,8 @@ ze_result_t CommandListCoreFamily::appendMemoryCopyBlitRegion(Ali size_t dstRowPitch, size_t dstSlicePitch, const Vec3 &srcSize, const Vec3 &dstSize, Event *signalEvent, - uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch) { + uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, + bool relaxedOrderingDispatch, bool dualStreamCopyOffload) { srcRegion.originX += getRegionOffsetForAppendMemoryCopyBlitRegion(srcAllocationData); dstRegion.originX += getRegionOffsetForAppendMemoryCopyBlitRegion(dstAllocationData); @@ -1479,7 +1481,7 @@ ze_result_t CommandListCoreFamily::appendMemoryCopyBlitRegion(Ali blitProperties.srcSize = srcSize; blitProperties.dstSize = dstSize; - ze_result_t ret = addEventsToCmdList(numWaitEvents, phWaitEvents, nullptr, relaxedOrderingDispatch, false, true, false, true); + ze_result_t ret = addEventsToCmdList(numWaitEvents, phWaitEvents, nullptr, relaxedOrderingDispatch, false, true, false, dualStreamCopyOffload); if (ret) { return ret; } @@ -1488,7 +1490,11 @@ ze_result_t CommandListCoreFamily::appendMemoryCopyBlitRegion(Ali return ZE_RESULT_ERROR_INVALID_ARGUMENT; } - appendEventForProfiling(signalEvent, nullptr, true, false, false, true); + const bool copyOnly = isCopyOnly(dualStreamCopyOffload); + + if (copyOnly) { + appendEventForProfiling(signalEvent, nullptr, true, false, false, true); + } auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironmentRef(); bool copyRegionPreferred = NEO::BlitCommandsHelper::isCopyRegionPreferred(copySizeModified, rootDeviceEnvironment, blitProperties.isSystemMemoryPoolUsed); if (copyRegionPreferred) { @@ -1498,7 +1504,9 @@ ze_result_t CommandListCoreFamily::appendMemoryCopyBlitRegion(Ali } dummyBlitWa.isWaRequired = true; - appendSignalEventPostWalker(signalEvent, nullptr, nullptr, false, false, true); + if (copyOnly) { + appendSignalEventPostWalker(signalEvent, nullptr, nullptr, false, false, 
true); + } return ZE_RESULT_SUCCESS; } @@ -1685,7 +1693,8 @@ ze_result_t CommandListCoreFamily::appendMemoryCopy(void *dstptr, bool waitForImplicitInOrderDependency = !isCopyOnlyEnabled || inOrderCopyOnlySignalingAllowed; - ze_result_t ret = addEventsToCmdList(numWaitEvents, phWaitEvents, nullptr, memoryCopyParams.relaxedOrderingDispatch, false, waitForImplicitInOrderDependency, false, isCopyOnlyEnabled); + ze_result_t ret = addEventsToCmdList(numWaitEvents, phWaitEvents, nullptr, memoryCopyParams.relaxedOrderingDispatch, false, + waitForImplicitInOrderDependency, false, isDualStreamCopyOffloadOperation(memoryCopyParams.copyOffloadAllowed)); if (ret) { return ret; @@ -1714,8 +1723,10 @@ ze_result_t CommandListCoreFamily::appendMemoryCopy(void *dstptr, launchParams.pipeControlSignalling = (signalEvent && singlePipeControlPacket) || getDcFlushRequired(dstAllocationStruct.needsFlush); - if (!useAdditionalBlitProperties || !isCopyOnlyEnabled) { - appendEventForProfilingAllWalkers(signalEvent, nullptr, nullptr, true, singlePipeControlPacket, false, isCopyOnlyEnabled); + if (!isNonDualStreamCopyOffloadOperation(memoryCopyParams.copyOffloadAllowed)) { + if (!useAdditionalBlitProperties || !isCopyOnlyEnabled) { + appendEventForProfilingAllWalkers(signalEvent, nullptr, nullptr, true, singlePipeControlPacket, false, isCopyOnlyEnabled); + } } if (isCopyOnlyEnabled) { @@ -1788,8 +1799,10 @@ ze_result_t CommandListCoreFamily::appendMemoryCopy(void *dstptr, appendCopyOperationFence(signalEvent, srcAllocationStruct.alloc, dstAllocationStruct.alloc, isCopyOnlyEnabled); - if (!useAdditionalBlitProperties || !isCopyOnlyEnabled) { - appendEventForProfilingAllWalkers(signalEvent, nullptr, nullptr, false, singlePipeControlPacket, false, isCopyOnlyEnabled); + if (!isNonDualStreamCopyOffloadOperation(memoryCopyParams.copyOffloadAllowed)) { + if (!useAdditionalBlitProperties || !isCopyOnlyEnabled) { + appendEventForProfilingAllWalkers(signalEvent, nullptr, nullptr, false, 
singlePipeControlPacket, false, isCopyOnlyEnabled); + } } bool l3flushInPipeControl = !l3FlushAfterPostSyncRequired || isSplitOperation; @@ -1797,12 +1810,12 @@ ze_result_t CommandListCoreFamily::appendMemoryCopy(void *dstptr, addToMappedEventList(signalEvent); - if (this->isInOrderExecutionEnabled()) { + if (this->isInOrderExecutionEnabled() && !isNonDualStreamCopyOffloadOperation(memoryCopyParams.copyOffloadAllowed)) { bool emitPipeControl = !isCopyOnlyEnabled && launchParams.pipeControlSignalling; if (launchParams.isKernelSplitOperation || inOrderCopyOnlySignalingAllowed || emitPipeControl) { dispatchInOrderPostOperationBarrier(signalEvent, dcFlush, isCopyOnlyEnabled); - appendSignalInOrderDependencyCounter(signalEvent, isCopyOnlyEnabled, false, false); + appendSignalInOrderDependencyCounter(signalEvent, memoryCopyParams.copyOffloadAllowed, false, false); } if (!isCopyOnlyEnabled || inOrderCopyOnlySignalingAllowed) { @@ -1876,14 +1889,15 @@ ze_result_t CommandListCoreFamily::appendMemoryCopyRegion(void *d memoryCopyParams.copyOffloadAllowed = isCopyOffloadAllowed(*srcAllocationStruct.alloc, *dstAllocationStruct.alloc); const bool isCopyOnlyEnabled = isCopyOnly(memoryCopyParams.copyOffloadAllowed); - const bool inOrderCopyOnlySignalingAllowed = this->isInOrderExecutionEnabled() && !memoryCopyParams.forceDisableCopyOnlyInOrderSignaling && isCopyOnlyEnabled; + const bool inOrderCopyOnlySignalingAllowed = this->isInOrderExecutionEnabled() && !memoryCopyParams.forceDisableCopyOnlyInOrderSignaling && + isCopyOnlyEnabled && !isNonDualStreamCopyOffloadOperation(memoryCopyParams.copyOffloadAllowed); ze_result_t result = ZE_RESULT_SUCCESS; if (isCopyOnlyEnabled) { result = appendMemoryCopyBlitRegion(&srcAllocationStruct, &dstAllocationStruct, *srcRegion, *dstRegion, {srcRegion->width, srcRegion->height, srcRegion->depth}, srcPitch, srcSlicePitch, dstPitch, dstSlicePitch, srcSize3, dstSize3, - signalEvent, numWaitEvents, phWaitEvents, 
memoryCopyParams.relaxedOrderingDispatch); + signalEvent, numWaitEvents, phWaitEvents, memoryCopyParams.relaxedOrderingDispatch, isDualStreamCopyOffloadOperation(memoryCopyParams.copyOffloadAllowed)); } else if ((srcRegion->depth > 1) || (srcRegion->originZ != 0) || (dstRegion->originZ != 0)) { result = this->appendMemoryCopyKernel3d(&dstAllocationStruct, &srcAllocationStruct, Builtin::copyBufferRectBytes3d, dstRegion, dstPitch, dstSlicePitch, dstAllocationStruct.offset, @@ -1907,7 +1921,7 @@ ze_result_t CommandListCoreFamily::appendMemoryCopyRegion(void *d if (this->isInOrderExecutionEnabled()) { if (inOrderCopyOnlySignalingAllowed) { - appendSignalInOrderDependencyCounter(signalEvent, isCopyOnlyEnabled, false, false); + appendSignalInOrderDependencyCounter(signalEvent, memoryCopyParams.copyOffloadAllowed, false, false); handleInOrderDependencyCounter(signalEvent, false, isCopyOnlyEnabled); } } else { @@ -2607,17 +2621,17 @@ inline uint32_t CommandListCoreFamily::getRegionOffsetForAppendMe } template -bool CommandListCoreFamily::handleInOrderImplicitDependencies(bool relaxedOrderingAllowed, bool copyOffloadOperation) { +bool CommandListCoreFamily::handleInOrderImplicitDependencies(bool relaxedOrderingAllowed, bool dualStreamCopyOffloadOperation) { if (hasInOrderDependencies()) { if (inOrderExecInfo->isCounterAlreadyDone(inOrderExecInfo->getCounterValue())) { return false; } if (relaxedOrderingAllowed) { - NEO::RelaxedOrderingHelper::encodeRegistersBeforeDependencyCheckers(*commandContainer.getCommandStream(), isCopyOnly(copyOffloadOperation)); + NEO::RelaxedOrderingHelper::encodeRegistersBeforeDependencyCheckers(*commandContainer.getCommandStream(), isCopyOnly(dualStreamCopyOffloadOperation)); } - CommandListCoreFamily::appendWaitOnInOrderDependency(inOrderExecInfo, nullptr, inOrderExecInfo->getCounterValue(), inOrderExecInfo->getAllocationOffset(), relaxedOrderingAllowed, true, false, false, copyOffloadOperation); + 
CommandListCoreFamily::appendWaitOnInOrderDependency(inOrderExecInfo, nullptr, inOrderExecInfo->getCounterValue(), inOrderExecInfo->getAllocationOffset(), relaxedOrderingAllowed, true, false, false, dualStreamCopyOffloadOperation); return true; } @@ -2627,7 +2641,7 @@ bool CommandListCoreFamily::handleInOrderImplicitDependencies(boo template inline ze_result_t CommandListCoreFamily::addEventsToCmdList(uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, CommandToPatchContainer *outWaitCmds, - bool relaxedOrderingAllowed, bool trackDependencies, bool waitForImplicitInOrderDependency, bool skipAddingWaitEventsToResidency, bool copyOffloadOperation) { + bool relaxedOrderingAllowed, bool trackDependencies, bool waitForImplicitInOrderDependency, bool skipAddingWaitEventsToResidency, bool dualStreamCopyOffloadOperation) { bool inOrderDependenciesSent = false; if (this->latestOperationRequiredNonWalkerInOrderCmdsChaining && !relaxedOrderingAllowed) { @@ -2635,22 +2649,22 @@ inline ze_result_t CommandListCoreFamily::addEventsToCmdList(uint } if (waitForImplicitInOrderDependency) { - auto ret = this->flushInOrderCounterSignal(copyOffloadOperation || relaxedOrderingAllowed); + auto ret = this->flushInOrderCounterSignal(dualStreamCopyOffloadOperation || relaxedOrderingAllowed); if (ret != ZE_RESULT_SUCCESS) { return ret; } - inOrderDependenciesSent = handleInOrderImplicitDependencies(relaxedOrderingAllowed, copyOffloadOperation); + inOrderDependenciesSent = handleInOrderImplicitDependencies(relaxedOrderingAllowed, dualStreamCopyOffloadOperation); this->latestOperationHasOptimizedCbEvent = false; } if (relaxedOrderingAllowed && numWaitEvents > 0 && !inOrderDependenciesSent) { - NEO::RelaxedOrderingHelper::encodeRegistersBeforeDependencyCheckers(*commandContainer.getCommandStream(), isCopyOnly(copyOffloadOperation)); + NEO::RelaxedOrderingHelper::encodeRegistersBeforeDependencyCheckers(*commandContainer.getCommandStream(), isCopyOnly(dualStreamCopyOffloadOperation)); } if 
(numWaitEvents > 0) { if (phWaitEvents) { - return CommandListCoreFamily::appendWaitOnEvents(numWaitEvents, phWaitEvents, outWaitCmds, relaxedOrderingAllowed, trackDependencies, false, skipAddingWaitEventsToResidency, false, copyOffloadOperation); + return CommandListCoreFamily::appendWaitOnEvents(numWaitEvents, phWaitEvents, outWaitCmds, relaxedOrderingAllowed, trackDependencies, false, skipAddingWaitEventsToResidency, false, dualStreamCopyOffloadOperation); } else { return ZE_RESULT_ERROR_INVALID_ARGUMENT; } @@ -2728,7 +2742,7 @@ NEO::GraphicsAllocation *CommandListCoreFamily::getDeviceCounterA template void CommandListCoreFamily::appendWaitOnInOrderDependency(std::shared_ptr &inOrderExecInfo, CommandToPatchContainer *outListCommands, uint64_t waitValue, uint32_t offset, bool relaxedOrderingAllowed, bool implicitDependency, bool skipAddingWaitEventsToResidency, - bool noopDispatch, bool copyOffloadOperation) { + bool noopDispatch, bool dualStreamCopyOffloadOperation) { using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION; UNRECOVERABLE_IF(waitValue > static_cast(std::numeric_limits::max()) && !isQwordInOrderCounter()); @@ -2741,13 +2755,15 @@ void CommandListCoreFamily::appendWaitOnInOrderDependency(std::sh uint64_t gpuAddress = inOrderExecInfo->getBaseDeviceAddress() + offset; const uint32_t immWriteOffset = device->getL0GfxCoreHelper().getImmediateWritePostSyncOffset(); + const bool copyOnlyWait = isCopyOnly(dualStreamCopyOffloadOperation); for (uint32_t i = 0; i < inOrderExecInfo->getNumDevicePartitionsToWait(); i++) { if (relaxedOrderingAllowed) { - NEO::EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(*commandContainer.getCommandStream(), 0, gpuAddress, waitValue, NEO::CompareOperation::less, true, isQwordInOrderCounter(), isCopyOnly(copyOffloadOperation)); + NEO::EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(*commandContainer.getCommandStream(), 0, gpuAddress, waitValue, 
NEO::CompareOperation::less, true, + isQwordInOrderCounter(), copyOnlyWait); } else { - auto resolveDependenciesViaPipeControls = !this->isCopyOnly(copyOffloadOperation) && !this->asMutable() && implicitDependency && (this->dcFlushSupport || (!this->heaplessModeEnabled && this->latestOperationHasOptimizedCbEvent)); + auto resolveDependenciesViaPipeControls = !copyOnlyWait && !this->asMutable() && implicitDependency && (this->dcFlushSupport || (!this->heaplessModeEnabled && this->latestOperationHasOptimizedCbEvent)); if (NEO::debugManager.flags.ResolveDependenciesViaPipeControls.get() != -1) { resolveDependenciesViaPipeControls = NEO::debugManager.flags.ResolveDependenciesViaPipeControls.get(); @@ -2778,8 +2794,8 @@ void CommandListCoreFamily::appendWaitOnInOrderDependency(std::sh auto lri2 = commandContainer.getCommandStream()->template getSpaceForCmd(); if (!noopDispatch) { - NEO::LriHelper::program(lri1, firstRegister, getLowPart(waitValue), true, isCopyOnly(copyOffloadOperation)); - NEO::LriHelper::program(lri2, secondRegister, getHighPart(waitValue), true, isCopyOnly(copyOffloadOperation)); + NEO::LriHelper::program(lri1, firstRegister, getLowPart(waitValue), true, copyOnlyWait); + NEO::LriHelper::program(lri2, secondRegister, getHighPart(waitValue), true, copyOnlyWait); } else { memset(lri1, 0, sizeof(MI_LOAD_REGISTER_IMM)); memset(lri2, 0, sizeof(MI_LOAD_REGISTER_IMM)); @@ -2862,8 +2878,10 @@ ze_result_t CommandListCoreFamily::appendWaitOnEvents(uint32_t nu callId); } + const bool dualStreamCopyOffload = isDualStreamCopyOffloadOperation(copyOffloadOperation); + if (this->isInOrderExecutionEnabled() && apiRequest) { - handleInOrderImplicitDependencies(false, copyOffloadOperation); + handleInOrderImplicitDependencies(false, dualStreamCopyOffload); } bool dcFlushRequired = false; @@ -2875,6 +2893,7 @@ ze_result_t CommandListCoreFamily::appendWaitOnEvents(uint32_t nu } } if (dcFlushRequired) { + 
UNRECOVERABLE_IF(isNonDualStreamCopyOffloadOperation(copyOffloadOperation)); if (isCopyOnly(copyOffloadOperation)) { NEO::MiFlushArgs args{this->dummyBlitWa}; encodeMiFlush(0, 0, args); @@ -2905,7 +2924,7 @@ ze_result_t CommandListCoreFamily::appendWaitOnEvents(uint32_t nu CommandListCoreFamily::appendWaitOnInOrderDependency(event->getInOrderExecInfo(), outWaitCmds, waitValue, event->getInOrderAllocationOffset(), relaxedOrderingAllowed, false, skipAddingWaitEventsToResidency, - isCbEventBoundToCmdList(event), copyOffloadOperation); + isCbEventBoundToCmdList(event), dualStreamCopyOffload); continue; } @@ -2918,10 +2937,10 @@ ze_result_t CommandListCoreFamily::appendWaitOnEvents(uint32_t nu commandContainer.addToResidencyContainer(event->getAllocation(this->device)); } - appendWaitOnSingleEvent(event, outWaitCmds, relaxedOrderingAllowed, copyOffloadOperation, CommandToPatch::WaitEventSemaphoreWait); + appendWaitOnSingleEvent(event, outWaitCmds, relaxedOrderingAllowed, dualStreamCopyOffload, CommandToPatch::WaitEventSemaphoreWait); } - if (isImmediateType() && isCopyOnly(copyOffloadOperation) && trackDependencies) { + if (isImmediateType() && isCopyOnly(dualStreamCopyOffload) && trackDependencies) { NEO::MiFlushArgs args{this->dummyBlitWa}; args.commandWithPostSync = true; auto csr = getCsr(false); @@ -2931,9 +2950,9 @@ ze_result_t CommandListCoreFamily::appendWaitOnEvents(uint32_t nu if (apiRequest) { if (this->isInOrderExecutionEnabled()) { - appendSignalInOrderDependencyCounter(nullptr, copyOffloadOperation, false, false); + appendSignalInOrderDependencyCounter(nullptr, false, false, false); } - handleInOrderDependencyCounter(nullptr, false, copyOffloadOperation); + handleInOrderDependencyCounter(nullptr, false, false); } if (NEO::debugManager.flags.EnableSWTags.get()) { @@ -2951,6 +2970,8 @@ template void CommandListCoreFamily::appendSdiInOrderCounterSignalling(uint64_t baseGpuVa, uint64_t signalValue, bool copyOffloadOperation) { using MI_STORE_DATA_IMM = 
typename GfxFamily::MI_STORE_DATA_IMM; + UNRECOVERABLE_IF(isNonDualStreamCopyOffloadOperation(copyOffloadOperation)); + uint64_t gpuVa = baseGpuVa + inOrderExecInfo->getAllocationOffset(); uint32_t numWrites = 1; @@ -2978,6 +2999,8 @@ void CommandListCoreFamily::appendSignalInOrderDependencyCounter( using ATOMIC_OPCODES = typename GfxFamily::MI_ATOMIC::ATOMIC_OPCODES; using DATA_SIZE = typename GfxFamily::MI_ATOMIC::DATA_SIZE; + UNRECOVERABLE_IF(isNonDualStreamCopyOffloadOperation(copyOffloadOperation)); + uint64_t deviceAllocGpuVa = inOrderExecInfo->getBaseDeviceAddress(); uint64_t signalValue = inOrderExecInfo->getCounterValue() + getInOrderIncrementValue(); @@ -3022,7 +3045,7 @@ void CommandListCoreFamily::appendSignalInOrderDependencyCounter( DATA_SIZE::DATA_SIZE_QWORD, 0, 0, signalEvent->getInOrderIncrementValue(), 0); } - if ((NEO::debugManager.flags.ProgramUserInterruptOnResolvedDependency.get() == 1 || copyOffloadOperation) && signalEvent && signalEvent->isInterruptModeEnabled()) { + if ((NEO::debugManager.flags.ProgramUserInterruptOnResolvedDependency.get() == 1 || isCopyOnly(copyOffloadOperation)) && signalEvent && signalEvent->isInterruptModeEnabled()) { NEO::EnodeUserInterrupt::encode(*cmdStream); } } @@ -4214,7 +4237,7 @@ void CommandListCoreFamily::dispatchEventRemainingPacketsPostSync } template -void CommandListCoreFamily::appendWaitOnSingleEvent(Event *event, CommandToPatchContainer *outWaitCmds, bool relaxedOrderingAllowed, bool copyOffloadOperation, CommandToPatch::CommandType storedSemaphore) { +void CommandListCoreFamily::appendWaitOnSingleEvent(Event *event, CommandToPatchContainer *outWaitCmds, bool relaxedOrderingAllowed, bool dualStreamCopyOffload, CommandToPatch::CommandType storedSemaphore) { using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION; uint64_t gpuAddr = event->getCompletionFieldGpuAddress(this->device); @@ -4229,7 +4252,7 @@ void CommandListCoreFamily::appendWaitOnSingleEvent(Event *event, for 
(uint32_t i = 0u; i < packetsToWait; i++) { if (relaxedOrderingAllowed) { NEO::EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(*commandContainer.getCommandStream(), 0, gpuAddr, Event::STATE_CLEARED, - NEO::CompareOperation::equal, true, false, isCopyOnly(copyOffloadOperation)); + NEO::CompareOperation::equal, true, false, isCopyOnly(dualStreamCopyOffload)); } else { NEO::EncodeSemaphore::addMiSemaphoreWaitCommand(*commandContainer.getCommandStream(), gpuAddr, diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h index 0bc72d6690..9ce07a1984 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h @@ -53,6 +53,7 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily::hostSynchronize(uint6 bool mainStorageCleanupNeeded = !mainInternalAllocStorage->getTemporaryAllocations().peekIsEmpty(); bool copyOffloadStorageCleanupNeeded = false; - const bool dualStreamCopyOffload = (getCopyOffloadModeForOperation(isCopyOffloadEnabled()) == CopyOffloadModes::dualStream); + const bool dualStreamCopyOffload = isDualStreamCopyOffloadOperation(isCopyOffloadEnabled()); if (dualStreamCopyOffload) { copyOffloadTaskCount = this->cmdQImmediateCopyOffload->getTaskCount(); diff --git a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h index 8d4c292919..cad59e82b1 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h +++ b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h @@ -706,10 +706,11 @@ class MockCommandListCoreFamily : public CommandListCoreFamily { size_t dstRowPitch, size_t dstSlicePitch, const Vec3 &srcSize, const Vec3 &dstSize, L0::Event *signalEvent, - uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch) override { + uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool 
relaxedOrderingDispatch, bool doubleStreamCopyOffload) override { srcBlitCopyRegionOffset = srcAllocationData->offset; dstBlitCopyRegionOffset = dstAllocationData->offset; - return L0::CommandListCoreFamily::appendMemoryCopyBlitRegion(srcAllocationData, dstAllocationData, srcRegion, dstRegion, copySize, srcRowPitch, srcSlicePitch, dstRowPitch, dstSlicePitch, srcSize, dstSize, signalEvent, numWaitEvents, phWaitEvents, relaxedOrderingDispatch); + return L0::CommandListCoreFamily::appendMemoryCopyBlitRegion(srcAllocationData, dstAllocationData, srcRegion, dstRegion, copySize, srcRowPitch, srcSlicePitch, dstRowPitch, dstSlicePitch, + srcSize, dstSize, signalEvent, numWaitEvents, phWaitEvents, relaxedOrderingDispatch, doubleStreamCopyOffload); } uintptr_t srcAlignedPtr; uintptr_t dstAlignedPtr; diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp index f887c42db1..b4a53c021c 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp @@ -98,7 +98,7 @@ class MockCommandListHw : public WhiteBox<::L0::CommandListCoreFamily &srcSize, const Vec3 &dstSize, L0::Event *signalEvent, - uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch) override { + uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch, bool doubleStreamCopyOffload) override { if (signalEvent) { useEvents = true; } else { @@ -1417,7 +1417,7 @@ HWTEST2_F(CommandListCreateTests, givenCopyCommandListWhenCopyRegionWithinMaxBli AlignedAllocationData dstAllocationData = {mockAllocationDst.gpuAddress, 0, &mockAllocationDst, false}; size_t rowPitch = copySize.x; size_t slicePitch = copySize.x * copySize.y; - commandList->appendMemoryCopyBlitRegion(&srcAllocationData, &dstAllocationData, srcRegion, dstRegion, copySize, rowPitch, slicePitch, rowPitch, slicePitch, srcSize, 
dstSize, nullptr, 0, nullptr, false); + commandList->appendMemoryCopyBlitRegion(&srcAllocationData, &dstAllocationData, srcRegion, dstRegion, copySize, rowPitch, slicePitch, rowPitch, slicePitch, srcSize, dstSize, nullptr, 0, nullptr, false, false); GenCmdList cmdList; ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer( @@ -1468,7 +1468,7 @@ HWTEST2_F(CommandListCreateTests, givenCopyCommandListWhenCopyRegionWithinMaxBli AlignedAllocationData dstAllocationData = {mockAllocationDst.gpuAddress, 0, &mockAllocationDst, false}; size_t rowPitch = copySize.x; size_t slicePitch = copySize.x * copySize.y; - commandList->appendMemoryCopyBlitRegion(&srcAllocationData, &dstAllocationData, srcRegion, dstRegion, copySize, rowPitch, slicePitch, rowPitch, slicePitch, srcSize, dstSize, nullptr, 0, nullptr, false); + commandList->appendMemoryCopyBlitRegion(&srcAllocationData, &dstAllocationData, srcRegion, dstRegion, copySize, rowPitch, slicePitch, rowPitch, slicePitch, srcSize, dstSize, nullptr, 0, nullptr, false, false); uint32_t bytesPerPixel = NEO::BlitCommandsHelper::getAvailableBytesPerPixel(copySize.x, srcRegion.originX, dstRegion.originY, srcSize.x, dstSize.x); GenCmdList cmdList; @@ -1518,7 +1518,7 @@ HWTEST2_F(CommandListCreateTests, givenCopyCommandListWhenCopyRegionGreaterThanM AlignedAllocationData dstAllocationData = {mockAllocationDst.gpuAddress, 0, &mockAllocationDst, false}; size_t rowPitch = copySize.x; size_t slicePitch = copySize.x * copySize.y; - commandList->appendMemoryCopyBlitRegion(&srcAllocationData, &dstAllocationData, srcRegion, dstRegion, copySize, rowPitch, slicePitch, rowPitch, slicePitch, srcSize, dstSize, nullptr, 0, nullptr, false); + commandList->appendMemoryCopyBlitRegion(&srcAllocationData, &dstAllocationData, srcRegion, dstRegion, copySize, rowPitch, slicePitch, rowPitch, slicePitch, srcSize, dstSize, nullptr, 0, nullptr, false, false); GenCmdList cmdList; ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer( @@ -1545,7 +1545,7 @@ class 
MockCommandListForRegionSize : public WhiteBox<::L0::CommandListCoreFamily size_t dstRowPitch, size_t dstSlicePitch, const Vec3 &srcSize, const Vec3 &dstSize, L0::Event *signalEvent, - uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch) override { + uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch, bool doubleStreamCopyOffload) override { this->srcSize = srcSize; this->dstSize = dstSize; return ZE_RESULT_SUCCESS; diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_blit.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_blit.cpp index 6e7600be52..e45e52ad2f 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_blit.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_blit.cpp @@ -253,7 +253,7 @@ HWTEST2_F(AppendMemoryCopyTests, givenCopyCommandListWhenTimestampPassedToMemory AlignedAllocationData srcAllocationData = {mockAllocationSrc.gpuAddress, 0, &mockAllocationSrc, false}; AlignedAllocationData dstAllocationData = {mockAllocationDst.gpuAddress, 0, &mockAllocationDst, false}; - commandList->appendMemoryCopyBlitRegion(&srcAllocationData, &dstAllocationData, srcRegion, dstRegion, {0, 0, 0}, 0, 0, 0, 0, 0, 0, event.get(), 0, nullptr, false); + commandList->appendMemoryCopyBlitRegion(&srcAllocationData, &dstAllocationData, srcRegion, dstRegion, {0, 0, 0}, 0, 0, 0, 0, 0, 0, event.get(), 0, nullptr, false, false); GenCmdList cmdList; auto baseAddr = event->getGpuAddress(device); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_memory_extension.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_memory_extension.cpp index 5d5402e923..85618fd4e1 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_memory_extension.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_memory_extension.cpp @@ -138,7 +138,7 @@ class MockCommandListExtensionHw : public 
WhiteBox<::L0::CommandListCoreFamily &srcSize, const Vec3 &dstSize, Event *signalEvent, - uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch) override { + uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch, bool doubleStreamCopyOffload) override { if (signalEvent) { useEvents = true; } else { diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_1.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_1.cpp index 6d9b5d51ac..a1598d0ba3 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_1.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_1.cpp @@ -1755,7 +1755,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, InOrderCmdListTests, givenImmediateCmdListWhenDispa } events[0]->makeCounterBasedInitiallyDisabled(eventPool->getAllocation()); - copyOnlyCmdList->appendMemoryCopyBlitRegion(&allocationData, &allocationData, region, region, {0, 0, 0}, 0, 0, 0, 0, {0, 0, 0}, {0, 0, 0}, events[0].get(), 0, nullptr, false); + copyOnlyCmdList->appendMemoryCopyBlitRegion(&allocationData, &allocationData, region, region, {0, 0, 0}, 0, 0, 0, 0, {0, 0, 0}, {0, 0, 0}, events[0].get(), 0, nullptr, false, false); if (dcFlushRequired) { EXPECT_EQ(Event::CounterBasedMode::initiallyDisabled, events[0]->counterBasedMode); } else { @@ -1909,7 +1909,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, InOrderCmdListTests, givenNonInOrderCmdListWhenPass const void **ranges = reinterpret_cast(&copyData[0]); EXPECT_EQ(ZE_RESULT_ERROR_INVALID_ARGUMENT, immCmdList->appendMemoryRangesBarrier(1, &rangeSizes, ranges, eventHandle, 0, nullptr)); - EXPECT_EQ(ZE_RESULT_ERROR_INVALID_ARGUMENT, copyOnlyCmdList->appendMemoryCopyBlitRegion(&allocationData, &allocationData, region, region, {0, 0, 0}, 0, 0, 0, 0, {0, 0, 0}, {0, 0, 0}, events[0].get(), 0, nullptr, false)); + EXPECT_EQ(ZE_RESULT_ERROR_INVALID_ARGUMENT,
copyOnlyCmdList->appendMemoryCopyBlitRegion(&allocationData, &allocationData, region, region, {0, 0, 0}, 0, 0, 0, 0, {0, 0, 0}, {0, 0, 0}, events[0].get(), 0, nullptr, false, false)); EXPECT_EQ(ZE_RESULT_ERROR_INVALID_ARGUMENT, immCmdList->appendMemoryCopy(&copyData, &copyData, 1, eventHandle, 0, nullptr, copyParams)); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_2.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_2.cpp index 5cab6ce9be..95479bad81 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_2.cpp @@ -101,6 +101,94 @@ HWTEST2_F(CopyOffloadInOrderTests, givenNonDualStreamModeWhenSubmittedThenUseDef EXPECT_TRUE(immCmdList->latestFlushIsHostVisible); } +HWTEST2_F(CopyOffloadInOrderTests, givenNonDualStreamModeWhenSubmittedThenDontProgramBcsMmioBase, IsAtLeastXeHpCore) { + using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM; + debugManager.flags.OverrideCopyOffloadMode.set(nonDualStreamMode); + auto immCmdList0 = createImmCmdListWithOffload(); + auto immCmdList1 = createImmCmdListWithOffload(); + + auto eventPool = createEvents(2, true); + auto eventHandle0 = events[0]->toHandle(); + auto eventHandle1 = events[1]->toHandle(); + + immCmdList0->appendMemoryCopy(&copyData1, &copyData2, 1, eventHandle0, 0, nullptr, copyParams); + immCmdList1->appendMemoryCopy(&copyData1, &copyData2, 1, eventHandle1, 0, nullptr, copyParams); + immCmdList1->appendMemoryCopy(&copyData1, &copyData2, 1, nullptr, 1, &eventHandle0, copyParams); + auto cmdStream = immCmdList1->getCmdContainer().getCommandStream(); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(cmdList, cmdStream->getCpuBase(), 0)); + + auto itor = find(cmdList.begin(), cmdList.end()); + + while (itor != cmdList.end()) { + auto cmd = genCmdCast(*itor); + EXPECT_TRUE(cmd->getRegisterOffset() < RegisterOffsets::bcs0Base); + itor =
find(++itor, cmdList.end()); + } +} + +HWTEST2_F(CopyOffloadInOrderTests, givenNonDualStreamModeAndProfilingEventWithRelaxedOrderingWhenAppendingThenDontBcsCommands, IsAtLeastXeHpCore) { + using MI_LOAD_REGISTER_REG = typename FamilyType::MI_LOAD_REGISTER_REG; + using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM; + using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM; + debugManager.flags.DirectSubmissionRelaxedOrdering.set(1); + debugManager.flags.OverrideCopyOffloadMode.set(nonDualStreamMode); + + auto immCmdList = createImmCmdListWithOffload(); + + auto mainQueueCsr = static_cast *>(immCmdList->getCsr(false)); + auto copyQueueCsr = static_cast *>(immCmdList->getCsr(true)); + + auto mainQueueDirectSubmission = new MockDirectSubmissionHw>(*mainQueueCsr); + auto offloadDirectSubmission = new MockDirectSubmissionHw>(*copyQueueCsr); + + mainQueueCsr->directSubmission.reset(mainQueueDirectSubmission); + copyQueueCsr->blitterDirectSubmission.reset(offloadDirectSubmission); + + int client1, client2; + + mainQueueCsr->registerClient(&client1); + mainQueueCsr->registerClient(&client2); + copyQueueCsr->registerClient(&client1); + copyQueueCsr->registerClient(&client2); + + auto eventPool = createEvents(1, true); + + auto cmdStream = immCmdList->getCmdContainer().getCommandStream(); + auto offset = cmdStream->getUsed(); + + auto eventHandle = events[0]->toHandle(); + + immCmdList->appendMemoryCopy(&copyData1, &copyData2, 1, eventHandle, 0, nullptr, copyParams); + + ze_copy_region_t region = {0, 0, 0, 1, 1, 1}; + immCmdList->appendMemoryCopyRegion(&copyData1, &region, 1, 1, &copyData2, &region, 1, 1, eventHandle, 0, nullptr, copyParams); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(cmdList, ptrOffset(cmdStream->getCpuBase(), offset), (cmdStream->getUsed() - offset))); + + auto lrrCmds = findAll(cmdList.begin(), cmdList.end()); + auto lriCmds = findAll(cmdList.begin(), cmdList.end()); + auto lrmCmds = findAll(cmdList.begin(),
cmdList.end()); + + for (auto &lrr : lrrCmds) { + auto lrrCmd = genCmdCast(*lrr); + EXPECT_TRUE(lrrCmd->getSourceRegisterAddress() < RegisterOffsets::bcs0Base); + EXPECT_TRUE(lrrCmd->getDestinationRegisterAddress() < RegisterOffsets::bcs0Base); + } + + for (auto &lri : lriCmds) { + auto lriCmd = genCmdCast(*lri); + EXPECT_TRUE(lriCmd->getRegisterOffset() < RegisterOffsets::bcs0Base); + } + + for (auto &lrm : lrmCmds) { + auto lrmCmd = genCmdCast(*lrm); + EXPECT_TRUE(lrmCmd->getRegisterAddress() < RegisterOffsets::bcs0Base); + } +} + HWTEST2_F(CopyOffloadInOrderTests, givenDebugFlagSetWhenCreatingCmdListThenEnableCopyOffload, IsAtLeastXeHpCore) { NEO::debugManager.flags.ForceCopyOperationOffloadForComputeCmdList.set(1);