feature: wait path improvements for dual stream offload

Related-To: NEO-7067

Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
This commit is contained in:
Bartosz Dunajski
2025-04-30 13:49:08 +00:00
committed by Compute-Runtime-Automation
parent d3b11d1527
commit 201324f804
12 changed files with 171 additions and 55 deletions

View File

@@ -230,7 +230,7 @@ void CommandList::synchronizeEventList(uint32_t numWaitEvents, ze_event_handle_t
}
// Returns the command stream receiver of the immediate queue chosen for the operation:
// cmdQImmediateCopyOffload when the dual-stream condition below holds, cmdQImmediate otherwise.
// NOTE(review): this listing is a rendered diff without +/- markers; the two consecutive
// `auto queue` lines look like the pre-change and post-change form of a single statement - confirm against the commit.
NEO::CommandStreamReceiver *CommandList::getCsr(bool copyOffload) const {
auto queue = (getCopyOffloadModeForOperation(copyOffload) == CopyOffloadModes::dualStream) ? this->cmdQImmediateCopyOffload : this->cmdQImmediate;
auto queue = isDualStreamCopyOffloadOperation(copyOffload) ? this->cmdQImmediateCopyOffload : this->cmdQImmediate;
// The selected queue is downcast to CommandQueueImp in order to call getCsr() on it.
return static_cast<CommandQueueImp *>(queue)->getCsr();
}

View File

@@ -448,6 +448,9 @@ struct CommandList : _ze_command_list_handle_t {
}
MOCKABLE_VIRTUAL void synchronizeEventList(uint32_t numWaitEvents, ze_event_handle_t *waitEventList);
// Tells whether the copy offload mode resolved for this operation is CopyOffloadModes::dualStream.
// offloadOperation is forwarded unchanged to getCopyOffloadModeForOperation().
bool isDualStreamCopyOffloadOperation(bool offloadOperation) const {
    const auto resolvedMode = getCopyOffloadModeForOperation(offloadOperation);
    return resolvedMode == CopyOffloadModes::dualStream;
}
// Tells whether the operation is a copy offload operation that is NOT handled in dual stream mode.
// A non-offload operation always yields false without querying the offload mode.
bool isNonDualStreamCopyOffloadOperation(bool offloadOperation) const {
    if (!offloadOperation) {
        return false;
    }
    return !isDualStreamCopyOffloadOperation(offloadOperation);
}
std::map<const void *, NEO::GraphicsAllocation *> hostPtrMap;
NEO::PrivateAllocsToReuseContainer ownedPrivateAllocations;
std::vector<NEO::GraphicsAllocation *> patternAllocations;

View File

@@ -195,7 +195,7 @@ struct CommandListCoreFamily : public CommandListImp {
bool relaxedOrderingAllowed, bool trackDependencies, bool apiRequest, bool skipAddingWaitEventsToResidency, bool skipFlush, bool copyOffloadOperation) override;
void appendWaitOnInOrderDependency(std::shared_ptr<NEO::InOrderExecInfo> &inOrderExecInfo, CommandToPatchContainer *outListCommands,
uint64_t waitValue, uint32_t offset, bool relaxedOrderingAllowed, bool implicitDependency,
bool skipAddingWaitEventsToResidency, bool noopDispatch, bool copyOffloadOperation);
bool skipAddingWaitEventsToResidency, bool noopDispatch, bool dualStreamCopyOffloadOperation);
void appendSignalInOrderDependencyCounter(Event *signalEvent, bool copyOffloadOperation, bool stall, bool textureFlushRequired);
void handleInOrderDependencyCounter(Event *signalEvent, bool nonWalkerInOrderCmdsChaining, bool copyOffloadOperation);
void handleInOrderCounterOverflow(bool copyOffloadOperation);
@@ -209,7 +209,7 @@ struct CommandListCoreFamily : public CommandListImp {
void appendMultiPartitionEpilogue() override;
void appendEventForProfilingAllWalkers(Event *event, void **syncCmdBuffer, CommandToPatchContainer *outTimeStampSyncCmds, bool beforeWalker, bool singlePacketEvent, bool skipAddingEventToResidency, bool copyOperation);
ze_result_t addEventsToCmdList(uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, CommandToPatchContainer *outWaitCmds,
bool relaxedOrderingAllowed, bool trackDependencies, bool waitForImplicitInOrderDependency, bool skipAddingWaitEventsToResidency, bool copyOffloadOperation);
bool relaxedOrderingAllowed, bool trackDependencies, bool waitForImplicitInOrderDependency, bool skipAddingWaitEventsToResidency, bool dualStreamCopyOffloadOperation);
MOCKABLE_VIRTUAL void appendSynchronizedDispatchInitializationSection();
MOCKABLE_VIRTUAL void appendSynchronizedDispatchCleanupSection();
@@ -251,7 +251,7 @@ struct CommandListCoreFamily : public CommandListImp {
size_t dstRowPitch, size_t dstSlicePitch,
const Vec3<size_t> &srcSize, const Vec3<size_t> &dstSize,
Event *signalEvent,
uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch);
uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch, bool dualStreamCopyOffload);
MOCKABLE_VIRTUAL ze_result_t appendMemoryCopyKernel2d(AlignedAllocationData *dstAlignedAllocation, AlignedAllocationData *srcAlignedAllocation,
Builtin builtin, const ze_copy_region_t *dstRegion,
@@ -299,7 +299,7 @@ struct CommandListCoreFamily : public CommandListImp {
Event *signalEvent,
CmdListKernelLaunchParams &launchParams);
void appendWaitOnSingleEvent(Event *event, CommandToPatchContainer *outWaitCmds, bool relaxedOrderingAllowed, bool copyOffloadOperation, CommandToPatch::CommandType storedSemaphore);
void appendWaitOnSingleEvent(Event *event, CommandToPatchContainer *outWaitCmds, bool relaxedOrderingAllowed, bool dualStreamCopyOffload, CommandToPatch::CommandType storedSemaphore);
void appendSdiInOrderCounterSignalling(uint64_t baseGpuVa, uint64_t signalValue, bool copyOffloadOperation);
@@ -383,7 +383,7 @@ struct CommandListCoreFamily : public CommandListImp {
virtual bool isRelaxedOrderingDispatchAllowed(uint32_t numWaitEvents, bool copyOffload) { return false; }
virtual void setupFlushMethod(const NEO::RootDeviceEnvironment &rootDeviceEnvironment) {}
bool canSkipInOrderEventWait(Event &event, bool ignorCbEventBoundToCmdList) const;
bool handleInOrderImplicitDependencies(bool relaxedOrderingAllowed, bool copyOffloadOperation);
bool handleInOrderImplicitDependencies(bool relaxedOrderingAllowed, bool dualStreamCopyOffloadOperation);
bool isQwordInOrderCounter() const { return GfxFamily::isQwordInOrderCounter; }
bool isInOrderNonWalkerSignalingRequired(const Event *event) const;
bool hasInOrderDependencies() const;

View File

@@ -195,7 +195,8 @@ void CommandListCoreFamily<gfxCoreFamily>::handleInOrderDependencyCounter(Event
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::handleInOrderCounterOverflow(bool copyOffloadOperation) {
if (!isQwordInOrderCounter() && ((inOrderExecInfo->getCounterValue() + 1) == std::numeric_limits<uint32_t>::max())) {
CommandListCoreFamily<gfxCoreFamily>::appendWaitOnInOrderDependency(inOrderExecInfo, nullptr, inOrderExecInfo->getCounterValue() + 1, inOrderExecInfo->getAllocationOffset(), false, true, false, false, copyOffloadOperation);
CommandListCoreFamily<gfxCoreFamily>::appendWaitOnInOrderDependency(inOrderExecInfo, nullptr, inOrderExecInfo->getCounterValue() + 1, inOrderExecInfo->getAllocationOffset(), false, true, false, false,
isDualStreamCopyOffloadOperation(copyOffloadOperation));
inOrderExecInfo->resetCounterValue();
@@ -1459,7 +1460,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyBlitRegion(Ali
size_t dstRowPitch, size_t dstSlicePitch,
const Vec3<size_t> &srcSize, const Vec3<size_t> &dstSize,
Event *signalEvent,
uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch) {
uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents,
bool relaxedOrderingDispatch, bool dualStreamCopyOffload) {
srcRegion.originX += getRegionOffsetForAppendMemoryCopyBlitRegion(srcAllocationData);
dstRegion.originX += getRegionOffsetForAppendMemoryCopyBlitRegion(dstAllocationData);
@@ -1479,7 +1481,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyBlitRegion(Ali
blitProperties.srcSize = srcSize;
blitProperties.dstSize = dstSize;
ze_result_t ret = addEventsToCmdList(numWaitEvents, phWaitEvents, nullptr, relaxedOrderingDispatch, false, true, false, true);
ze_result_t ret = addEventsToCmdList(numWaitEvents, phWaitEvents, nullptr, relaxedOrderingDispatch, false, true, false, dualStreamCopyOffload);
if (ret) {
return ret;
}
@@ -1488,7 +1490,11 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyBlitRegion(Ali
return ZE_RESULT_ERROR_INVALID_ARGUMENT;
}
appendEventForProfiling(signalEvent, nullptr, true, false, false, true);
const bool copyOnly = isCopyOnly(dualStreamCopyOffload);
if (copyOnly) {
appendEventForProfiling(signalEvent, nullptr, true, false, false, true);
}
auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironmentRef();
bool copyRegionPreferred = NEO::BlitCommandsHelper<GfxFamily>::isCopyRegionPreferred(copySizeModified, rootDeviceEnvironment, blitProperties.isSystemMemoryPoolUsed);
if (copyRegionPreferred) {
@@ -1498,7 +1504,9 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyBlitRegion(Ali
}
dummyBlitWa.isWaRequired = true;
appendSignalEventPostWalker(signalEvent, nullptr, nullptr, false, false, true);
if (copyOnly) {
appendSignalEventPostWalker(signalEvent, nullptr, nullptr, false, false, true);
}
return ZE_RESULT_SUCCESS;
}
@@ -1685,7 +1693,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
bool waitForImplicitInOrderDependency = !isCopyOnlyEnabled || inOrderCopyOnlySignalingAllowed;
ze_result_t ret = addEventsToCmdList(numWaitEvents, phWaitEvents, nullptr, memoryCopyParams.relaxedOrderingDispatch, false, waitForImplicitInOrderDependency, false, isCopyOnlyEnabled);
ze_result_t ret = addEventsToCmdList(numWaitEvents, phWaitEvents, nullptr, memoryCopyParams.relaxedOrderingDispatch, false,
waitForImplicitInOrderDependency, false, isDualStreamCopyOffloadOperation(memoryCopyParams.copyOffloadAllowed));
if (ret) {
return ret;
@@ -1714,8 +1723,10 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
launchParams.pipeControlSignalling = (signalEvent && singlePipeControlPacket) || getDcFlushRequired(dstAllocationStruct.needsFlush);
if (!useAdditionalBlitProperties || !isCopyOnlyEnabled) {
appendEventForProfilingAllWalkers(signalEvent, nullptr, nullptr, true, singlePipeControlPacket, false, isCopyOnlyEnabled);
if (!isNonDualStreamCopyOffloadOperation(memoryCopyParams.copyOffloadAllowed)) {
if (!useAdditionalBlitProperties || !isCopyOnlyEnabled) {
appendEventForProfilingAllWalkers(signalEvent, nullptr, nullptr, true, singlePipeControlPacket, false, isCopyOnlyEnabled);
}
}
if (isCopyOnlyEnabled) {
@@ -1788,8 +1799,10 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
appendCopyOperationFence(signalEvent, srcAllocationStruct.alloc, dstAllocationStruct.alloc, isCopyOnlyEnabled);
if (!useAdditionalBlitProperties || !isCopyOnlyEnabled) {
appendEventForProfilingAllWalkers(signalEvent, nullptr, nullptr, false, singlePipeControlPacket, false, isCopyOnlyEnabled);
if (!isNonDualStreamCopyOffloadOperation(memoryCopyParams.copyOffloadAllowed)) {
if (!useAdditionalBlitProperties || !isCopyOnlyEnabled) {
appendEventForProfilingAllWalkers(signalEvent, nullptr, nullptr, false, singlePipeControlPacket, false, isCopyOnlyEnabled);
}
}
bool l3flushInPipeControl = !l3FlushAfterPostSyncRequired || isSplitOperation;
@@ -1797,12 +1810,12 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
addToMappedEventList(signalEvent);
if (this->isInOrderExecutionEnabled()) {
if (this->isInOrderExecutionEnabled() && !isNonDualStreamCopyOffloadOperation(memoryCopyParams.copyOffloadAllowed)) {
bool emitPipeControl = !isCopyOnlyEnabled && launchParams.pipeControlSignalling;
if (launchParams.isKernelSplitOperation || inOrderCopyOnlySignalingAllowed || emitPipeControl) {
dispatchInOrderPostOperationBarrier(signalEvent, dcFlush, isCopyOnlyEnabled);
appendSignalInOrderDependencyCounter(signalEvent, isCopyOnlyEnabled, false, false);
appendSignalInOrderDependencyCounter(signalEvent, memoryCopyParams.copyOffloadAllowed, false, false);
}
if (!isCopyOnlyEnabled || inOrderCopyOnlySignalingAllowed) {
@@ -1876,14 +1889,15 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyRegion(void *d
memoryCopyParams.copyOffloadAllowed = isCopyOffloadAllowed(*srcAllocationStruct.alloc, *dstAllocationStruct.alloc);
const bool isCopyOnlyEnabled = isCopyOnly(memoryCopyParams.copyOffloadAllowed);
const bool inOrderCopyOnlySignalingAllowed = this->isInOrderExecutionEnabled() && !memoryCopyParams.forceDisableCopyOnlyInOrderSignaling && isCopyOnlyEnabled;
const bool inOrderCopyOnlySignalingAllowed = this->isInOrderExecutionEnabled() && !memoryCopyParams.forceDisableCopyOnlyInOrderSignaling &&
isCopyOnlyEnabled && !isNonDualStreamCopyOffloadOperation(memoryCopyParams.copyOffloadAllowed);
ze_result_t result = ZE_RESULT_SUCCESS;
if (isCopyOnlyEnabled) {
result = appendMemoryCopyBlitRegion(&srcAllocationStruct, &dstAllocationStruct, *srcRegion, *dstRegion,
{srcRegion->width, srcRegion->height, srcRegion->depth},
srcPitch, srcSlicePitch, dstPitch, dstSlicePitch, srcSize3, dstSize3,
signalEvent, numWaitEvents, phWaitEvents, memoryCopyParams.relaxedOrderingDispatch);
signalEvent, numWaitEvents, phWaitEvents, memoryCopyParams.relaxedOrderingDispatch, isDualStreamCopyOffloadOperation(memoryCopyParams.copyOffloadAllowed));
} else if ((srcRegion->depth > 1) || (srcRegion->originZ != 0) || (dstRegion->originZ != 0)) {
result = this->appendMemoryCopyKernel3d(&dstAllocationStruct, &srcAllocationStruct, Builtin::copyBufferRectBytes3d,
dstRegion, dstPitch, dstSlicePitch, dstAllocationStruct.offset,
@@ -1907,7 +1921,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyRegion(void *d
if (this->isInOrderExecutionEnabled()) {
if (inOrderCopyOnlySignalingAllowed) {
appendSignalInOrderDependencyCounter(signalEvent, isCopyOnlyEnabled, false, false);
appendSignalInOrderDependencyCounter(signalEvent, memoryCopyParams.copyOffloadAllowed, false, false);
handleInOrderDependencyCounter(signalEvent, false, isCopyOnlyEnabled);
}
} else {
@@ -2607,17 +2621,17 @@ inline uint32_t CommandListCoreFamily<gfxCoreFamily>::getRegionOffsetForAppendMe
}
template <GFXCORE_FAMILY gfxCoreFamily>
bool CommandListCoreFamily<gfxCoreFamily>::handleInOrderImplicitDependencies(bool relaxedOrderingAllowed, bool copyOffloadOperation) {
bool CommandListCoreFamily<gfxCoreFamily>::handleInOrderImplicitDependencies(bool relaxedOrderingAllowed, bool dualStreamCopyOffloadOperation) {
if (hasInOrderDependencies()) {
if (inOrderExecInfo->isCounterAlreadyDone(inOrderExecInfo->getCounterValue())) {
return false;
}
if (relaxedOrderingAllowed) {
NEO::RelaxedOrderingHelper::encodeRegistersBeforeDependencyCheckers<GfxFamily>(*commandContainer.getCommandStream(), isCopyOnly(copyOffloadOperation));
NEO::RelaxedOrderingHelper::encodeRegistersBeforeDependencyCheckers<GfxFamily>(*commandContainer.getCommandStream(), isCopyOnly(dualStreamCopyOffloadOperation));
}
CommandListCoreFamily<gfxCoreFamily>::appendWaitOnInOrderDependency(inOrderExecInfo, nullptr, inOrderExecInfo->getCounterValue(), inOrderExecInfo->getAllocationOffset(), relaxedOrderingAllowed, true, false, false, copyOffloadOperation);
CommandListCoreFamily<gfxCoreFamily>::appendWaitOnInOrderDependency(inOrderExecInfo, nullptr, inOrderExecInfo->getCounterValue(), inOrderExecInfo->getAllocationOffset(), relaxedOrderingAllowed, true, false, false, dualStreamCopyOffloadOperation);
return true;
}
@@ -2627,7 +2641,7 @@ bool CommandListCoreFamily<gfxCoreFamily>::handleInOrderImplicitDependencies(boo
template <GFXCORE_FAMILY gfxCoreFamily>
inline ze_result_t CommandListCoreFamily<gfxCoreFamily>::addEventsToCmdList(uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, CommandToPatchContainer *outWaitCmds,
bool relaxedOrderingAllowed, bool trackDependencies, bool waitForImplicitInOrderDependency, bool skipAddingWaitEventsToResidency, bool copyOffloadOperation) {
bool relaxedOrderingAllowed, bool trackDependencies, bool waitForImplicitInOrderDependency, bool skipAddingWaitEventsToResidency, bool dualStreamCopyOffloadOperation) {
bool inOrderDependenciesSent = false;
if (this->latestOperationRequiredNonWalkerInOrderCmdsChaining && !relaxedOrderingAllowed) {
@@ -2635,22 +2649,22 @@ inline ze_result_t CommandListCoreFamily<gfxCoreFamily>::addEventsToCmdList(uint
}
if (waitForImplicitInOrderDependency) {
auto ret = this->flushInOrderCounterSignal(copyOffloadOperation || relaxedOrderingAllowed);
auto ret = this->flushInOrderCounterSignal(dualStreamCopyOffloadOperation || relaxedOrderingAllowed);
if (ret != ZE_RESULT_SUCCESS) {
return ret;
}
inOrderDependenciesSent = handleInOrderImplicitDependencies(relaxedOrderingAllowed, copyOffloadOperation);
inOrderDependenciesSent = handleInOrderImplicitDependencies(relaxedOrderingAllowed, dualStreamCopyOffloadOperation);
this->latestOperationHasOptimizedCbEvent = false;
}
if (relaxedOrderingAllowed && numWaitEvents > 0 && !inOrderDependenciesSent) {
NEO::RelaxedOrderingHelper::encodeRegistersBeforeDependencyCheckers<GfxFamily>(*commandContainer.getCommandStream(), isCopyOnly(copyOffloadOperation));
NEO::RelaxedOrderingHelper::encodeRegistersBeforeDependencyCheckers<GfxFamily>(*commandContainer.getCommandStream(), isCopyOnly(dualStreamCopyOffloadOperation));
}
if (numWaitEvents > 0) {
if (phWaitEvents) {
return CommandListCoreFamily<gfxCoreFamily>::appendWaitOnEvents(numWaitEvents, phWaitEvents, outWaitCmds, relaxedOrderingAllowed, trackDependencies, false, skipAddingWaitEventsToResidency, false, copyOffloadOperation);
return CommandListCoreFamily<gfxCoreFamily>::appendWaitOnEvents(numWaitEvents, phWaitEvents, outWaitCmds, relaxedOrderingAllowed, trackDependencies, false, skipAddingWaitEventsToResidency, false, dualStreamCopyOffloadOperation);
} else {
return ZE_RESULT_ERROR_INVALID_ARGUMENT;
}
@@ -2728,7 +2742,7 @@ NEO::GraphicsAllocation *CommandListCoreFamily<gfxCoreFamily>::getDeviceCounterA
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::appendWaitOnInOrderDependency(std::shared_ptr<NEO::InOrderExecInfo> &inOrderExecInfo, CommandToPatchContainer *outListCommands,
uint64_t waitValue, uint32_t offset, bool relaxedOrderingAllowed, bool implicitDependency, bool skipAddingWaitEventsToResidency,
bool noopDispatch, bool copyOffloadOperation) {
bool noopDispatch, bool dualStreamCopyOffloadOperation) {
using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;
UNRECOVERABLE_IF(waitValue > static_cast<uint64_t>(std::numeric_limits<uint32_t>::max()) && !isQwordInOrderCounter());
@@ -2741,13 +2755,15 @@ void CommandListCoreFamily<gfxCoreFamily>::appendWaitOnInOrderDependency(std::sh
uint64_t gpuAddress = inOrderExecInfo->getBaseDeviceAddress() + offset;
const uint32_t immWriteOffset = device->getL0GfxCoreHelper().getImmediateWritePostSyncOffset();
const bool copyOnlyWait = isCopyOnly(dualStreamCopyOffloadOperation);
for (uint32_t i = 0; i < inOrderExecInfo->getNumDevicePartitionsToWait(); i++) {
if (relaxedOrderingAllowed) {
NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programConditionalDataMemBatchBufferStart(*commandContainer.getCommandStream(), 0, gpuAddress, waitValue, NEO::CompareOperation::less, true, isQwordInOrderCounter(), isCopyOnly(copyOffloadOperation));
NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programConditionalDataMemBatchBufferStart(*commandContainer.getCommandStream(), 0, gpuAddress, waitValue, NEO::CompareOperation::less, true,
isQwordInOrderCounter(), copyOnlyWait);
} else {
auto resolveDependenciesViaPipeControls = !this->isCopyOnly(copyOffloadOperation) && !this->asMutable() && implicitDependency && (this->dcFlushSupport || (!this->heaplessModeEnabled && this->latestOperationHasOptimizedCbEvent));
auto resolveDependenciesViaPipeControls = !copyOnlyWait && !this->asMutable() && implicitDependency && (this->dcFlushSupport || (!this->heaplessModeEnabled && this->latestOperationHasOptimizedCbEvent));
if (NEO::debugManager.flags.ResolveDependenciesViaPipeControls.get() != -1) {
resolveDependenciesViaPipeControls = NEO::debugManager.flags.ResolveDependenciesViaPipeControls.get();
@@ -2778,8 +2794,8 @@ void CommandListCoreFamily<gfxCoreFamily>::appendWaitOnInOrderDependency(std::sh
auto lri2 = commandContainer.getCommandStream()->template getSpaceForCmd<MI_LOAD_REGISTER_IMM>();
if (!noopDispatch) {
NEO::LriHelper<GfxFamily>::program(lri1, firstRegister, getLowPart(waitValue), true, isCopyOnly(copyOffloadOperation));
NEO::LriHelper<GfxFamily>::program(lri2, secondRegister, getHighPart(waitValue), true, isCopyOnly(copyOffloadOperation));
NEO::LriHelper<GfxFamily>::program(lri1, firstRegister, getLowPart(waitValue), true, copyOnlyWait);
NEO::LriHelper<GfxFamily>::program(lri2, secondRegister, getHighPart(waitValue), true, copyOnlyWait);
} else {
memset(lri1, 0, sizeof(MI_LOAD_REGISTER_IMM));
memset(lri2, 0, sizeof(MI_LOAD_REGISTER_IMM));
@@ -2862,8 +2878,10 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWaitOnEvents(uint32_t nu
callId);
}
const bool dualStreamCopyOffload = isDualStreamCopyOffloadOperation(copyOffloadOperation);
if (this->isInOrderExecutionEnabled() && apiRequest) {
handleInOrderImplicitDependencies(false, copyOffloadOperation);
handleInOrderImplicitDependencies(false, dualStreamCopyOffload);
}
bool dcFlushRequired = false;
@@ -2875,6 +2893,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWaitOnEvents(uint32_t nu
}
}
if (dcFlushRequired) {
UNRECOVERABLE_IF(isNonDualStreamCopyOffloadOperation(copyOffloadOperation));
if (isCopyOnly(copyOffloadOperation)) {
NEO::MiFlushArgs args{this->dummyBlitWa};
encodeMiFlush(0, 0, args);
@@ -2905,7 +2924,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWaitOnEvents(uint32_t nu
CommandListCoreFamily<gfxCoreFamily>::appendWaitOnInOrderDependency(event->getInOrderExecInfo(), outWaitCmds,
waitValue, event->getInOrderAllocationOffset(),
relaxedOrderingAllowed, false, skipAddingWaitEventsToResidency,
isCbEventBoundToCmdList(event), copyOffloadOperation);
isCbEventBoundToCmdList(event), dualStreamCopyOffload);
continue;
}
@@ -2918,10 +2937,10 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWaitOnEvents(uint32_t nu
commandContainer.addToResidencyContainer(event->getAllocation(this->device));
}
appendWaitOnSingleEvent(event, outWaitCmds, relaxedOrderingAllowed, copyOffloadOperation, CommandToPatch::WaitEventSemaphoreWait);
appendWaitOnSingleEvent(event, outWaitCmds, relaxedOrderingAllowed, dualStreamCopyOffload, CommandToPatch::WaitEventSemaphoreWait);
}
if (isImmediateType() && isCopyOnly(copyOffloadOperation) && trackDependencies) {
if (isImmediateType() && isCopyOnly(dualStreamCopyOffload) && trackDependencies) {
NEO::MiFlushArgs args{this->dummyBlitWa};
args.commandWithPostSync = true;
auto csr = getCsr(false);
@@ -2931,9 +2950,9 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWaitOnEvents(uint32_t nu
if (apiRequest) {
if (this->isInOrderExecutionEnabled()) {
appendSignalInOrderDependencyCounter(nullptr, copyOffloadOperation, false, false);
appendSignalInOrderDependencyCounter(nullptr, false, false, false);
}
handleInOrderDependencyCounter(nullptr, false, copyOffloadOperation);
handleInOrderDependencyCounter(nullptr, false, false);
}
if (NEO::debugManager.flags.EnableSWTags.get()) {
@@ -2951,6 +2970,8 @@ template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::appendSdiInOrderCounterSignalling(uint64_t baseGpuVa, uint64_t signalValue, bool copyOffloadOperation) {
using MI_STORE_DATA_IMM = typename GfxFamily::MI_STORE_DATA_IMM;
UNRECOVERABLE_IF(isNonDualStreamCopyOffloadOperation(copyOffloadOperation));
uint64_t gpuVa = baseGpuVa + inOrderExecInfo->getAllocationOffset();
uint32_t numWrites = 1;
@@ -2978,6 +2999,8 @@ void CommandListCoreFamily<gfxCoreFamily>::appendSignalInOrderDependencyCounter(
using ATOMIC_OPCODES = typename GfxFamily::MI_ATOMIC::ATOMIC_OPCODES;
using DATA_SIZE = typename GfxFamily::MI_ATOMIC::DATA_SIZE;
UNRECOVERABLE_IF(isNonDualStreamCopyOffloadOperation(copyOffloadOperation));
uint64_t deviceAllocGpuVa = inOrderExecInfo->getBaseDeviceAddress();
uint64_t signalValue = inOrderExecInfo->getCounterValue() + getInOrderIncrementValue();
@@ -3022,7 +3045,7 @@ void CommandListCoreFamily<gfxCoreFamily>::appendSignalInOrderDependencyCounter(
DATA_SIZE::DATA_SIZE_QWORD, 0, 0, signalEvent->getInOrderIncrementValue(), 0);
}
if ((NEO::debugManager.flags.ProgramUserInterruptOnResolvedDependency.get() == 1 || copyOffloadOperation) && signalEvent && signalEvent->isInterruptModeEnabled()) {
if ((NEO::debugManager.flags.ProgramUserInterruptOnResolvedDependency.get() == 1 || isCopyOnly(copyOffloadOperation)) && signalEvent && signalEvent->isInterruptModeEnabled()) {
NEO::EnodeUserInterrupt<GfxFamily>::encode(*cmdStream);
}
}
@@ -4214,7 +4237,7 @@ void CommandListCoreFamily<gfxCoreFamily>::dispatchEventRemainingPacketsPostSync
}
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::appendWaitOnSingleEvent(Event *event, CommandToPatchContainer *outWaitCmds, bool relaxedOrderingAllowed, bool copyOffloadOperation, CommandToPatch::CommandType storedSemaphore) {
void CommandListCoreFamily<gfxCoreFamily>::appendWaitOnSingleEvent(Event *event, CommandToPatchContainer *outWaitCmds, bool relaxedOrderingAllowed, bool dualStreamCopyOffload, CommandToPatch::CommandType storedSemaphore) {
using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;
uint64_t gpuAddr = event->getCompletionFieldGpuAddress(this->device);
@@ -4229,7 +4252,7 @@ void CommandListCoreFamily<gfxCoreFamily>::appendWaitOnSingleEvent(Event *event,
for (uint32_t i = 0u; i < packetsToWait; i++) {
if (relaxedOrderingAllowed) {
NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programConditionalDataMemBatchBufferStart(*commandContainer.getCommandStream(), 0, gpuAddr, Event::STATE_CLEARED,
NEO::CompareOperation::equal, true, false, isCopyOnly(copyOffloadOperation));
NEO::CompareOperation::equal, true, false, isCopyOnly(dualStreamCopyOffload));
} else {
NEO::EncodeSemaphore<GfxFamily>::addMiSemaphoreWaitCommand(*commandContainer.getCommandStream(),
gpuAddr,

View File

@@ -53,6 +53,7 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily<gfxCoreFami
using BaseClass::getCsr;
using BaseClass::isCopyOffloadEnabled;
using BaseClass::isCopyOnly;
using BaseClass::isDualStreamCopyOffloadOperation;
using BaseClass::isInOrderExecutionEnabled;
using BaseClass::isSkippingInOrderBarrierAllowed;
using BaseClass::isTbxMode;

View File

@@ -1126,7 +1126,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::hostSynchronize(uint6
bool mainStorageCleanupNeeded = !mainInternalAllocStorage->getTemporaryAllocations().peekIsEmpty();
bool copyOffloadStorageCleanupNeeded = false;
const bool dualStreamCopyOffload = (getCopyOffloadModeForOperation(isCopyOffloadEnabled()) == CopyOffloadModes::dualStream);
const bool dualStreamCopyOffload = isDualStreamCopyOffloadOperation(isCopyOffloadEnabled());
if (dualStreamCopyOffload) {
copyOffloadTaskCount = this->cmdQImmediateCopyOffload->getTaskCount();

View File

@@ -706,10 +706,11 @@ class MockCommandListCoreFamily : public CommandListCoreFamily<gfxCoreFamily> {
size_t dstRowPitch, size_t dstSlicePitch,
const Vec3<size_t> &srcSize, const Vec3<size_t> &dstSize,
L0::Event *signalEvent,
uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch) override {
uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch, bool doubleStreamCopyOffload) override {
srcBlitCopyRegionOffset = srcAllocationData->offset;
dstBlitCopyRegionOffset = dstAllocationData->offset;
return L0::CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyBlitRegion(srcAllocationData, dstAllocationData, srcRegion, dstRegion, copySize, srcRowPitch, srcSlicePitch, dstRowPitch, dstSlicePitch, srcSize, dstSize, signalEvent, numWaitEvents, phWaitEvents, relaxedOrderingDispatch);
return L0::CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyBlitRegion(srcAllocationData, dstAllocationData, srcRegion, dstRegion, copySize, srcRowPitch, srcSlicePitch, dstRowPitch, dstSlicePitch,
srcSize, dstSize, signalEvent, numWaitEvents, phWaitEvents, relaxedOrderingDispatch, doubleStreamCopyOffload);
}
uintptr_t srcAlignedPtr;
uintptr_t dstAlignedPtr;

View File

@@ -98,7 +98,7 @@ class MockCommandListHw : public WhiteBox<::L0::CommandListCoreFamily<gfxCoreFam
size_t dstRowPitch, size_t dstSlicePitch,
const Vec3<size_t> &srcSize, const Vec3<size_t> &dstSize,
L0::Event *signalEvent,
uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch) override {
uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch, bool doubleStreamCopyOffload) override {
if (signalEvent) {
useEvents = true;
} else {
@@ -1417,7 +1417,7 @@ HWTEST2_F(CommandListCreateTests, givenCopyCommandListWhenCopyRegionWithinMaxBli
AlignedAllocationData dstAllocationData = {mockAllocationDst.gpuAddress, 0, &mockAllocationDst, false};
size_t rowPitch = copySize.x;
size_t slicePitch = copySize.x * copySize.y;
commandList->appendMemoryCopyBlitRegion(&srcAllocationData, &dstAllocationData, srcRegion, dstRegion, copySize, rowPitch, slicePitch, rowPitch, slicePitch, srcSize, dstSize, nullptr, 0, nullptr, false);
commandList->appendMemoryCopyBlitRegion(&srcAllocationData, &dstAllocationData, srcRegion, dstRegion, copySize, rowPitch, slicePitch, rowPitch, slicePitch, srcSize, dstSize, nullptr, 0, nullptr, false, false);
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(
@@ -1468,7 +1468,7 @@ HWTEST2_F(CommandListCreateTests, givenCopyCommandListWhenCopyRegionWithinMaxBli
AlignedAllocationData dstAllocationData = {mockAllocationDst.gpuAddress, 0, &mockAllocationDst, false};
size_t rowPitch = copySize.x;
size_t slicePitch = copySize.x * copySize.y;
commandList->appendMemoryCopyBlitRegion(&srcAllocationData, &dstAllocationData, srcRegion, dstRegion, copySize, rowPitch, slicePitch, rowPitch, slicePitch, srcSize, dstSize, nullptr, 0, nullptr, false);
commandList->appendMemoryCopyBlitRegion(&srcAllocationData, &dstAllocationData, srcRegion, dstRegion, copySize, rowPitch, slicePitch, rowPitch, slicePitch, srcSize, dstSize, nullptr, 0, nullptr, false, false);
uint32_t bytesPerPixel = NEO::BlitCommandsHelper<FamilyType>::getAvailableBytesPerPixel(copySize.x, srcRegion.originX, dstRegion.originY, srcSize.x, dstSize.x);
GenCmdList cmdList;
@@ -1518,7 +1518,7 @@ HWTEST2_F(CommandListCreateTests, givenCopyCommandListWhenCopyRegionGreaterThanM
AlignedAllocationData dstAllocationData = {mockAllocationDst.gpuAddress, 0, &mockAllocationDst, false};
size_t rowPitch = copySize.x;
size_t slicePitch = copySize.x * copySize.y;
commandList->appendMemoryCopyBlitRegion(&srcAllocationData, &dstAllocationData, srcRegion, dstRegion, copySize, rowPitch, slicePitch, rowPitch, slicePitch, srcSize, dstSize, nullptr, 0, nullptr, false);
commandList->appendMemoryCopyBlitRegion(&srcAllocationData, &dstAllocationData, srcRegion, dstRegion, copySize, rowPitch, slicePitch, rowPitch, slicePitch, srcSize, dstSize, nullptr, 0, nullptr, false, false);
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(
@@ -1545,7 +1545,7 @@ class MockCommandListForRegionSize : public WhiteBox<::L0::CommandListCoreFamily
size_t dstRowPitch, size_t dstSlicePitch,
const Vec3<size_t> &srcSize, const Vec3<size_t> &dstSize,
L0::Event *signalEvent,
uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch) override {
uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch, bool doubleStreamCopyOffload) override {
this->srcSize = srcSize;
this->dstSize = dstSize;
return ZE_RESULT_SUCCESS;

View File

@@ -253,7 +253,7 @@ HWTEST2_F(AppendMemoryCopyTests, givenCopyCommandListWhenTimestampPassedToMemory
AlignedAllocationData srcAllocationData = {mockAllocationSrc.gpuAddress, 0, &mockAllocationSrc, false};
AlignedAllocationData dstAllocationData = {mockAllocationDst.gpuAddress, 0, &mockAllocationDst, false};
commandList->appendMemoryCopyBlitRegion(&srcAllocationData, &dstAllocationData, srcRegion, dstRegion, {0, 0, 0}, 0, 0, 0, 0, 0, 0, event.get(), 0, nullptr, false);
commandList->appendMemoryCopyBlitRegion(&srcAllocationData, &dstAllocationData, srcRegion, dstRegion, {0, 0, 0}, 0, 0, 0, 0, 0, 0, event.get(), 0, nullptr, false, false);
GenCmdList cmdList;
auto baseAddr = event->getGpuAddress(device);

View File

@@ -138,7 +138,7 @@ class MockCommandListExtensionHw : public WhiteBox<::L0::CommandListCoreFamily<g
size_t dstRowPitch, size_t dstSlicePitch,
const Vec3<size_t> &srcSize, const Vec3<size_t> &dstSize,
Event *signalEvent,
uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch) override {
uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch, bool doubleStreamCopyOffload) override {
if (signalEvent) {
useEvents = true;
} else {

View File

@@ -1755,7 +1755,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, InOrderCmdListTests, givenImmediateCmdListWhenDispa
}
events[0]->makeCounterBasedInitiallyDisabled(eventPool->getAllocation());
copyOnlyCmdList->appendMemoryCopyBlitRegion(&allocationData, &allocationData, region, region, {0, 0, 0}, 0, 0, 0, 0, {0, 0, 0}, {0, 0, 0}, events[0].get(), 0, nullptr, false);
copyOnlyCmdList->appendMemoryCopyBlitRegion(&allocationData, &allocationData, region, region, {0, 0, 0}, 0, 0, 0, 0, {0, 0, 0}, {0, 0, 0}, events[0].get(), 0, nullptr, false, false);
if (dcFlushRequired) {
EXPECT_EQ(Event::CounterBasedMode::initiallyDisabled, events[0]->counterBasedMode);
} else {
@@ -1909,7 +1909,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, InOrderCmdListTests, givenNonInOrderCmdListWhenPass
const void **ranges = reinterpret_cast<const void **>(&copyData[0]);
EXPECT_EQ(ZE_RESULT_ERROR_INVALID_ARGUMENT, immCmdList->appendMemoryRangesBarrier(1, &rangeSizes, ranges, eventHandle, 0, nullptr));
EXPECT_EQ(ZE_RESULT_ERROR_INVALID_ARGUMENT, copyOnlyCmdList->appendMemoryCopyBlitRegion(&allocationData, &allocationData, region, region, {0, 0, 0}, 0, 0, 0, 0, {0, 0, 0}, {0, 0, 0}, events[0].get(), 0, nullptr, false));
EXPECT_EQ(ZE_RESULT_ERROR_INVALID_ARGUMENT, copyOnlyCmdList->appendMemoryCopyBlitRegion(&allocationData, &allocationData, region, region, {0, 0, 0}, 0, 0, 0, 0, {0, 0, 0}, {0, 0, 0}, events[0].get(), 0, nullptr, false, false));
EXPECT_EQ(ZE_RESULT_ERROR_INVALID_ARGUMENT, immCmdList->appendMemoryCopy(&copyData, &copyData, 1, eventHandle, 0, nullptr, copyParams));

View File

@@ -101,6 +101,94 @@ HWTEST2_F(CopyOffloadInOrderTests, givenNonDualStreamModeWhenSubmittedThenUseDef
EXPECT_TRUE(immCmdList->latestFlushIsHostVisible);
}
// With the copy offload mode overridden to nonDualStreamMode, records copies on two immediate
// command lists (the last one waiting on an event signaled by the other list) and verifies that
// no MI_LOAD_REGISTER_IMM command in the second list's stream targets a register offset at or
// above RegisterOffsets::bcs0Base.
HWTEST2_F(CopyOffloadInOrderTests, givenNonDualStreamModeWhenSubmittedThenDontProgramBcsMmioBase, IsAtLeastXeHpCore) {
    using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;

    debugManager.flags.OverrideCopyOffloadMode.set(nonDualStreamMode);

    auto immCmdList0 = createImmCmdListWithOffload<FamilyType::gfxCoreFamily>();
    auto immCmdList1 = createImmCmdListWithOffload<FamilyType::gfxCoreFamily>();

    auto eventPool = createEvents<FamilyType>(2, true);

    auto eventHandle0 = events[0]->toHandle();
    auto eventHandle1 = events[1]->toHandle();

    immCmdList0->appendMemoryCopy(&copyData1, &copyData2, 1, eventHandle0, 0, nullptr, copyParams);
    immCmdList1->appendMemoryCopy(&copyData1, &copyData2, 1, eventHandle1, 0, nullptr, copyParams);

    // Cross-list dependency: immCmdList1 waits on the event signaled by immCmdList0.
    immCmdList1->appendMemoryCopy(&copyData1, &copyData2, 1, nullptr, 1, &eventHandle0, copyParams);

    auto cmdStream = immCmdList1->getCmdContainer().getCommandStream();

    GenCmdList cmdList;
    // Parse everything recorded so far. Passing a zero length would produce an empty command
    // list and turn the verification loop below into a no-op.
    ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(cmdList, cmdStream->getCpuBase(), cmdStream->getUsed()));

    auto itor = find<MI_LOAD_REGISTER_IMM *>(cmdList.begin(), cmdList.end());

    while (itor != cmdList.end()) {
        auto cmd = genCmdCast<MI_LOAD_REGISTER_IMM *>(*itor);
        EXPECT_TRUE(cmd->getRegisterOffset() < RegisterOffsets::bcs0Base);

        itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
    }
}
// With relaxed ordering enabled and the copy offload mode overridden to nonDualStreamMode,
// appends a memory copy and a region copy that both signal events[0], then checks that none of
// the MI_LOAD_REGISTER_REG, MI_LOAD_REGISTER_IMM or MI_STORE_REGISTER_MEM commands recorded by
// those appends references a register at or above RegisterOffsets::bcs0Base.
HWTEST2_F(CopyOffloadInOrderTests, givenNonDualStreamModeAndProfilingEventWithRelaxedOrderingWhenAppendingThenDontBcsCommands, IsAtLeastXeHpCore) {
    using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM;
    using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;
    using MI_LOAD_REGISTER_REG = typename FamilyType::MI_LOAD_REGISTER_REG;

    debugManager.flags.DirectSubmissionRelaxedOrdering.set(1);
    debugManager.flags.OverrideCopyOffloadMode.set(nonDualStreamMode);

    auto immCmdList = createImmCmdListWithOffload<FamilyType::gfxCoreFamily>();

    auto computeCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(immCmdList->getCsr(false));
    auto offloadCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(immCmdList->getCsr(true));

    // Install mock direct submissions on both receivers; ownership passes to the receivers.
    auto renderSubmission = new MockDirectSubmissionHw<FamilyType, RenderDispatcher<FamilyType>>(*computeCsr);
    auto blitterSubmission = new MockDirectSubmissionHw<FamilyType, BlitterDispatcher<FamilyType>>(*offloadCsr);
    computeCsr->directSubmission.reset(renderSubmission);
    offloadCsr->blitterDirectSubmission.reset(blitterSubmission);

    // Only the addresses of these objects are used, as distinct client identities.
    int firstClient;
    int secondClient;
    for (auto csr : {computeCsr, offloadCsr}) {
        csr->registerClient(&firstClient);
        csr->registerClient(&secondClient);
    }

    auto eventPool = createEvents<FamilyType>(1, true);
    auto signalEventHandle = events[0]->toHandle();

    auto stream = immCmdList->getCmdContainer().getCommandStream();
    // Remember where the stream stood so that only the commands appended below get parsed.
    const auto startOffset = stream->getUsed();

    immCmdList->appendMemoryCopy(&copyData1, &copyData2, 1, signalEventHandle, 0, nullptr, copyParams);

    ze_copy_region_t region = {0, 0, 0, 1, 1, 1};
    immCmdList->appendMemoryCopyRegion(&copyData1, &region, 1, 1, &copyData2, &region, 1, 1, signalEventHandle, 0, nullptr, copyParams);

    GenCmdList cmdList;
    ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(cmdList, ptrOffset(stream->getCpuBase(), startOffset), (stream->getUsed() - startOffset)));

    const auto isBelowBcsBase = [](auto registerAddress) {
        return registerAddress < RegisterOffsets::bcs0Base;
    };

    for (const auto &it : findAll<MI_LOAD_REGISTER_REG *>(cmdList.begin(), cmdList.end())) {
        auto loadRegReg = genCmdCast<MI_LOAD_REGISTER_REG *>(*it);
        EXPECT_TRUE(isBelowBcsBase(loadRegReg->getSourceRegisterAddress()));
        EXPECT_TRUE(isBelowBcsBase(loadRegReg->getDestinationRegisterAddress()));
    }

    for (const auto &it : findAll<MI_LOAD_REGISTER_IMM *>(cmdList.begin(), cmdList.end())) {
        auto loadRegImm = genCmdCast<MI_LOAD_REGISTER_IMM *>(*it);
        EXPECT_TRUE(isBelowBcsBase(loadRegImm->getRegisterOffset()));
    }

    for (const auto &it : findAll<MI_STORE_REGISTER_MEM *>(cmdList.begin(), cmdList.end())) {
        auto storeRegMem = genCmdCast<MI_STORE_REGISTER_MEM *>(*it);
        EXPECT_TRUE(isBelowBcsBase(storeRegMem->getRegisterAddress()));
    }
}
HWTEST2_F(CopyOffloadInOrderTests, givenDebugFlagSetWhenCreatingCmdListThenEnableCopyOffload, IsAtLeastXeHpCore) {
NEO::debugManager.flags.ForceCopyOperationOffloadForComputeCmdList.set(1);