feature: use bcs stream for copy offload bcs split

Related-To: NEO-14557

Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
Author: Bartosz Dunajski
Date: 2025-09-02 07:59:07 +00:00
Committed by: Compute-Runtime-Automation
Parent: 36614e614f
Commit: ac8f8d1d8c
5 changed files with 78 additions and 24 deletions
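
In short, the hunks below make these changes: the isDualStreamCopyOffloadOperation check moves from the protected section of CommandList to the public one so that BcsSplit can call it; the immediate command list gains a setupFlagsForBcsSplit helper that, besides the existing split flags, enables copyOffloadAllowed and reports a copyOffloadFlush flag; appendMemoryCopy and appendMemoryCopyRegion forward that flag to flushImmediate (and rename the misspelled hasStallindCmds local to hasStallingCmds); and the BcsSplit wait/signal path is parameterized on whether the offload is dual-stream. The sketch below only models the mode distinction the commit relies on; apart from the two predicates and CopyOffloadModes::dualStream, every name and the getter's body are illustrative assumptions, not the driver's definitions.

    #include <cassert>

    // Illustrative enum; only CopyOffloadModes::dualStream is named in the diff.
    enum class CopyOffloadModes { disabled, dualStream, singleStream };

    struct ToyCommandList {
        CopyOffloadModes copyOffloadMode = CopyOffloadModes::singleStream;

        // Assumed behavior: a non-offloaded operation has no copy-offload mode.
        CopyOffloadModes getCopyOffloadModeForOperation(bool offloadOperation) const {
            return offloadOperation ? copyOffloadMode : CopyOffloadModes::disabled;
        }
        // Mirrors the predicate made public by this commit.
        bool isDualStreamCopyOffloadOperation(bool offloadOperation) const {
            return getCopyOffloadModeForOperation(offloadOperation) == CopyOffloadModes::dualStream;
        }
        bool isNonDualStreamCopyOffloadOperation(bool offloadOperation) const {
            return offloadOperation && !isDualStreamCopyOffloadOperation(offloadOperation);
        }
    };

    int main() {
        ToyCommandList cmdList{};
        assert(cmdList.isNonDualStreamCopyOffloadOperation(true));  // offloaded, but not dual-stream
        assert(!cmdList.isDualStreamCopyOffloadOperation(false));   // not an offloaded operation
        return 0;
    }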


@@ -502,6 +502,7 @@ struct CommandList : _ze_command_list_handle_t {
uint32_t getActiveScratchPatchElements() const {
return activeScratchPatchElements;
}
+bool isDualStreamCopyOffloadOperation(bool offloadOperation) const { return (getCopyOffloadModeForOperation(offloadOperation) == CopyOffloadModes::dualStream); }
protected:
NEO::GraphicsAllocation *getAllocationFromHostPtrMap(const void *buffer, uint64_t bufferSize, bool copyOffload);
@@ -513,7 +514,6 @@ struct CommandList : _ze_command_list_handle_t {
}
MOCKABLE_VIRTUAL void synchronizeEventList(uint32_t numWaitEvents, ze_event_handle_t *waitEventList);
-bool isDualStreamCopyOffloadOperation(bool offloadOperation) const { return (getCopyOffloadModeForOperation(offloadOperation) == CopyOffloadModes::dualStream); }
bool isNonDualStreamCopyOffloadOperation(bool offloadOperation) const { return offloadOperation && !isDualStreamCopyOffloadOperation(offloadOperation); }
void registerWalkerWithProfilingEnqueued(Event *event);


@@ -252,6 +252,7 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily<gfxCoreFami
ze_result_t appendStagingMemoryCopy(void *dstptr, const void *srcptr, size_t size, ze_event_handle_t hSignalEvent, CmdListMemoryCopyParams &memoryCopyParams);
ze_result_t stagingStatusToL0(const NEO::StagingTransferStatus &status) const;
size_t estimateAdditionalSizeAppendRegularCommandLists(uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists);
+void setupFlagsForBcsSplit(CmdListMemoryCopyParams &memoryCopyParams, bool &hasStallingCmds, bool &copyOffloadFlush);
MOCKABLE_VIRTUAL void checkAssert();
ComputeFlushMethodType computeFlushMethod = nullptr;


@@ -643,6 +643,16 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendBarrier(ze_even
return flushImmediate(ret, true, isStallingOperation, relaxedOrderingDispatch, NEO::AppendOperations::nonKernel, false, hSignalEvent, false, nullptr, nullptr);
}
+template <GFXCORE_FAMILY gfxCoreFamily>
+void CommandListCoreFamilyImmediate<gfxCoreFamily>::setupFlagsForBcsSplit(CmdListMemoryCopyParams &memoryCopyParams, bool &hasStallingCmds, bool &copyOffloadFlush) {
+memoryCopyParams.relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(1, false); // split generates more than 1 event
+memoryCopyParams.forceDisableCopyOnlyInOrderSignaling = true;
+memoryCopyParams.taskCountUpdateRequired = true;
+memoryCopyParams.copyOffloadAllowed = this->isCopyOffloadEnabled();
+copyOffloadFlush = memoryCopyParams.copyOffloadAllowed;
+hasStallingCmds = !memoryCopyParams.relaxedOrderingDispatch;
+}
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy(
void *dstptr,
@@ -652,6 +662,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy(
uint32_t numWaitEvents,
ze_event_handle_t *phWaitEvents, CmdListMemoryCopyParams &memoryCopyParams) {
memoryCopyParams.relaxedOrderingDispatch |= isRelaxedOrderingDispatchAllowed(numWaitEvents, isCopyOffloadEnabled());
+bool copyOffloadFlush = false;
auto estimatedSize = commonImmediateCommandSize;
if (isCopyOnly(true)) {
@@ -662,7 +673,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy(
}
checkAvailableSpace(numWaitEvents, memoryCopyParams.relaxedOrderingDispatch, estimatedSize, false);
-bool hasStallindCmds = hasStallingCmdsForRelaxedOrdering(numWaitEvents, memoryCopyParams.relaxedOrderingDispatch);
+bool hasStallingCmds = hasStallingCmdsForRelaxedOrdering(numWaitEvents, memoryCopyParams.relaxedOrderingDispatch);
ze_result_t ret;
CpuMemCopyInfo cpuMemCopyInfo(dstptr, const_cast<void *>(srcptr), size);
@@ -678,16 +689,14 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy(
NEO::TransferDirection direction;
auto isSplitNeeded = this->isAppendSplitNeeded(dstptr, srcptr, size, direction);
if (isSplitNeeded) {
-memoryCopyParams.relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(1, false); // split generates more than 1 event
-memoryCopyParams.forceDisableCopyOnlyInOrderSignaling = true;
-memoryCopyParams.taskCountUpdateRequired = true;
-hasStallindCmds = !memoryCopyParams.relaxedOrderingDispatch;
+setupFlagsForBcsSplit(memoryCopyParams, hasStallingCmds, copyOffloadFlush);
auto splitCall = [&](CommandListCoreFamilyImmediate<gfxCoreFamily> *subCmdList, void *dstptrParam, const void *srcptrParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) {
return subCmdList->CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(dstptrParam, srcptrParam, sizeParam, hSignalEventParam, 0u, nullptr, memoryCopyParams);
};
ret = static_cast<DeviceImp *>(this->device)->bcsSplit->appendSplitCall<gfxCoreFamily, void *, const void *>(this, dstptr, srcptr, size, hSignalEvent, numWaitEvents, phWaitEvents, true, memoryCopyParams.relaxedOrderingDispatch, direction, estimatedSize, splitCall);
} else if (this->isValidForStagingTransfer(dstptr, srcptr, size, numWaitEvents > 0)) {
return this->appendStagingMemoryCopy(dstptr, srcptr, size, hSignalEvent, memoryCopyParams);
} else {
@@ -695,7 +704,9 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy(
numWaitEvents, phWaitEvents, memoryCopyParams);
}
-return flushImmediate(ret, true, hasStallindCmds, memoryCopyParams.relaxedOrderingDispatch, NEO::AppendOperations::kernel, memoryCopyParams.copyOffloadAllowed, hSignalEvent, memoryCopyParams.taskCountUpdateRequired, nullptr, nullptr);
+copyOffloadFlush |= memoryCopyParams.copyOffloadAllowed;
+return flushImmediate(ret, true, hasStallingCmds, memoryCopyParams.relaxedOrderingDispatch, NEO::AppendOperations::kernel, copyOffloadFlush, hSignalEvent, memoryCopyParams.taskCountUpdateRequired, nullptr, nullptr);
}
template <GFXCORE_FAMILY gfxCoreFamily>
@@ -723,17 +734,15 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopyRegio
}
checkAvailableSpace(numWaitEvents, memoryCopyParams.relaxedOrderingDispatch, estimatedSize, false);
-bool hasStallindCmds = hasStallingCmdsForRelaxedOrdering(numWaitEvents, memoryCopyParams.relaxedOrderingDispatch);
+bool hasStallingCmds = hasStallingCmdsForRelaxedOrdering(numWaitEvents, memoryCopyParams.relaxedOrderingDispatch);
+bool copyOffloadFlush = false;
ze_result_t ret;
NEO::TransferDirection direction;
auto isSplitNeeded = this->isAppendSplitNeeded(dstPtr, srcPtr, this->getTotalSizeForCopyRegion(dstRegion, dstPitch, dstSlicePitch), direction);
if (isSplitNeeded) {
-memoryCopyParams.relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(1, false); // split generates more than 1 event
-memoryCopyParams.forceDisableCopyOnlyInOrderSignaling = true;
-memoryCopyParams.taskCountUpdateRequired = true;
-hasStallindCmds = !memoryCopyParams.relaxedOrderingDispatch;
+setupFlagsForBcsSplit(memoryCopyParams, hasStallingCmds, copyOffloadFlush);
auto splitCall = [&](CommandListCoreFamilyImmediate<gfxCoreFamily> *subCmdList, uint32_t dstOriginXParam, uint32_t srcOriginXParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) {
ze_copy_region_t dstRegionLocal = {};
@@ -756,7 +765,9 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopyRegio
hSignalEvent, numWaitEvents, phWaitEvents, memoryCopyParams);
}
-return flushImmediate(ret, true, hasStallindCmds, memoryCopyParams.relaxedOrderingDispatch, NEO::AppendOperations::kernel, memoryCopyParams.copyOffloadAllowed, hSignalEvent, memoryCopyParams.taskCountUpdateRequired, nullptr, nullptr);
+copyOffloadFlush |= memoryCopyParams.copyOffloadAllowed;
+return flushImmediate(ret, true, hasStallingCmds, memoryCopyParams.relaxedOrderingDispatch, NEO::AppendOperations::kernel, copyOffloadFlush, hSignalEvent, memoryCopyParams.taskCountUpdateRequired, nullptr, nullptr);
}
template <GFXCORE_FAMILY gfxCoreFamily>
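
Net effect of the hunks above: the flag setup for the split path, previously duplicated inline in appendMemoryCopy and appendMemoryCopyRegion, is centralized in setupFlagsForBcsSplit, which now additionally sets copyOffloadAllowed from isCopyOffloadEnabled and reports copyOffloadFlush; and flushImmediate receives copyOffloadFlush (which still folds in copyOffloadAllowed) instead of memoryCopyParams.copyOffloadAllowed directly. A condensed, self-contained sketch of that shared control flow follows; the stand-in types and the resolveCopyOffloadFlush wrapper are illustrative, not the driver's classes, and flushImmediate's real parameter list is omitted.

    // Stand-in types; only the booleans touched by this commit are modelled.
    struct CmdListMemoryCopyParams {
        bool relaxedOrderingDispatch = false;
        bool forceDisableCopyOnlyInOrderSignaling = false;
        bool taskCountUpdateRequired = false;
        bool copyOffloadAllowed = false;
    };

    struct ToyImmediateCmdList {
        bool copyOffloadEnabled = false;

        bool isCopyOffloadEnabled() const { return copyOffloadEnabled; }
        bool isRelaxedOrderingDispatchAllowed(unsigned /*numWaitEvents*/, bool /*copyOffload*/) const {
            return false; // placeholder policy
        }

        // Same flag setup as the helper introduced by the diff.
        void setupFlagsForBcsSplit(CmdListMemoryCopyParams &params, bool &hasStallingCmds, bool &copyOffloadFlush) const {
            params.relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(1, false); // split generates more than 1 event
            params.forceDisableCopyOnlyInOrderSignaling = true;
            params.taskCountUpdateRequired = true;
            params.copyOffloadAllowed = isCopyOffloadEnabled();
            copyOffloadFlush = params.copyOffloadAllowed;
            hasStallingCmds = !params.relaxedOrderingDispatch;
        }

        // Skeleton of the pattern both append paths now share before flushImmediate().
        bool resolveCopyOffloadFlush(bool isSplitNeeded, CmdListMemoryCopyParams &params, bool &hasStallingCmds) const {
            bool copyOffloadFlush = false;
            if (isSplitNeeded) {
                setupFlagsForBcsSplit(params, hasStallingCmds, copyOffloadFlush);
                // ... the split sub-copies are dispatched here ...
            }
            copyOffloadFlush |= params.copyOffloadAllowed;
            return copyOffloadFlush; // forwarded to flushImmediate() instead of params.copyOffloadAllowed
        }
    };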


@@ -148,9 +148,11 @@ struct BcsSplit {
}
}
-cmdList->addEventsToCmdList(static_cast<uint32_t>(eventHandles.size()), eventHandles.data(), nullptr, hasRelaxedOrderingDependencies, false, true, false, false);
-const auto isCopyCmdList = cmdList->isCopyOnly(false);
+const bool dualStreamCopyOffload = cmdList->isDualStreamCopyOffloadOperation(cmdList->isCopyOffloadEnabled());
+cmdList->addEventsToCmdList(static_cast<uint32_t>(eventHandles.size()), eventHandles.data(), nullptr, hasRelaxedOrderingDependencies, false, true, false, dualStreamCopyOffload);
+const auto isCopyCmdList = cmdList->isCopyOnly(dualStreamCopyOffload);
if (signalEvent) {
cmdList->appendSignalEventPostWalker(signalEvent, nullptr, nullptr, !isCopyCmdList, false, isCopyCmdList);
@@ -161,9 +163,9 @@ struct BcsSplit {
}
if (cmdList->isInOrderExecutionEnabled()) {
-cmdList->appendSignalInOrderDependencyCounter(signalEvent, false, false, false);
+cmdList->appendSignalInOrderDependencyCounter(signalEvent, dualStreamCopyOffload, false, false);
}
-cmdList->handleInOrderDependencyCounter(signalEvent, false, false);
+cmdList->handleInOrderDependencyCounter(signalEvent, false, dualStreamCopyOffload);
if (aggregatedEventsMode) {
cmdList->assignInOrderExecInfoToEvent(this->events.marker[markerEventIndex]);
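
In the BcsSplit marker handling above, the command list computes dualStreamCopyOffload once and threads it through addEventsToCmdList, isCopyOnly, appendSignalInOrderDependencyCounter and handleInOrderDependencyCounter, where false was previously hard-coded; together with the tests below, this routes the split's wait/signal handling to the copy (BCS) stream when dual-stream copy offload applies. A rough illustration of that selection; Engine, engineForMarkerCommands and the member layout are hypothetical, and only the isDualStreamCopyOffloadOperation(isCopyOffloadEnabled()) call reflects the diff.

    // Hypothetical illustration; the real calls take more parameters and
    // operate on L0 command list state rather than returning an enum.
    enum class Engine { compute, copy };

    struct ToySplitCmdList {
        bool copyOffloadEnabled = true;
        bool dualStreamOffload = false;

        bool isCopyOffloadEnabled() const { return copyOffloadEnabled; }
        bool isDualStreamCopyOffloadOperation(bool offloadOperation) const {
            return offloadOperation && dualStreamOffload;
        }
        Engine engineForMarkerCommands() const {
            const bool dualStreamCopyOffload = isDualStreamCopyOffloadOperation(isCopyOffloadEnabled());
            // When dual-stream copy offload applies, the split's wait/signal commands
            // are emitted for the copy (BCS) stream; otherwise behavior is unchanged.
            return dualStreamCopyOffload ? Engine::copy : Engine::compute;
        }
    };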


@@ -1171,18 +1171,58 @@ HWTEST2_F(AggregatedBcsSplitTests, givenCopyOffloadEnabledWhenAppendWithEventCal
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(cmdList, ptrOffset(cmdStream->getCpuBase(), offset), (cmdStream->getUsed() - offset)));
-auto lriItor = find<typename FamilyType::MI_LOAD_REGISTER_IMM *>(cmdList.begin(), cmdList.end());
-ASSERT_NE(cmdList.end(), lriItor);
-auto itor = find<typename FamilyType::PIPE_CONTROL *>(cmdList.begin(), lriItor);
-EXPECT_EQ(lriItor, itor);
-itor = find<typename FamilyType::MI_FLUSH_DW *>(cmdList.begin(), cmdList.end());
+auto itor = find<typename FamilyType::PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
EXPECT_EQ(cmdList.end(), itor);
context->freeMem(ptr);
}
+HWTEST2_F(AggregatedBcsSplitTests, givenCopyOffloadEnabledWhenAppendThenUseCopyQueue, IsAtLeastXeHpcCore) {
+if (device->getProductHelper().isDcFlushAllowed()) {
+GTEST_SKIP();
+}
+debugManager.flags.ForceCopyOperationOffloadForComputeCmdList.set(1);
+ze_result_t returnValue;
+auto computeCommandList = createCmdList(false);
+auto ptr = allocHostMem();
+ze_event_pool_desc_t eventPoolDesc = {.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP, .count = 1};
+ze_event_desc_t eventDesc = {};
+auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue));
+auto event = std::unique_ptr<L0::Event>(L0::Event::create<typename FamilyType::TimestampPacketType>(eventPool.get(), &eventDesc, device, returnValue));
+auto cmdStream = computeCommandList->getCmdContainer().getCommandStream();
+auto offset = cmdStream->getUsed();
+auto computeTaskCount = computeCommandList->getCsr(false)->peekTaskCount();
+TaskCountType copyTaskCount = 0;
+if (computeCommandList->isDualStreamCopyOffloadOperation(true)) {
+copyTaskCount = computeCommandList->getCsr(true)->peekTaskCount();
+}
+computeCommandList->appendMemoryCopy(ptr, ptr, copySize, event->toHandle(), 0, nullptr, copyParams);
+GenCmdList cmdList;
+ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(cmdList, ptrOffset(cmdStream->getCpuBase(), offset), (cmdStream->getUsed() - offset)));
+auto itor = find<typename FamilyType::MI_FLUSH_DW *>(cmdList.begin(), cmdList.end());
+if (computeCommandList->isDualStreamCopyOffloadOperation(true)) {
+EXPECT_EQ(computeTaskCount, computeCommandList->getCsr(false)->peekTaskCount());
+EXPECT_EQ(copyTaskCount + 1, computeCommandList->getCsr(true)->peekTaskCount());
+EXPECT_NE(cmdList.end(), itor);
+} else {
+EXPECT_EQ(computeTaskCount + 1, computeCommandList->getCsr(false)->peekTaskCount());
+EXPECT_EQ(cmdList.end(), itor);
+}
+context->freeMem(ptr);
+}
HWTEST_F(AggregatedBcsSplitTests, givenTransferDirectionWhenAskingIfSplitIsNeededThenReturnCorrectValue) {
debugManager.flags.SplitBcsTransferDirectionMask.set(-1);
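
The new givenCopyOffloadEnabledWhenAppendThenUseCopyQueue test above pins down the observable behavior: with dual-stream copy offload the copy CSR's task count advances and an MI_FLUSH_DW is parsed from the stream, while the compute CSR's task count stays put; otherwise the compute CSR advances and no MI_FLUSH_DW is expected. A tiny assert-based model of that expectation, with made-up names (ToyCsrPair, appendSplitCopyWithOffload) and none of the real CSR or command-stream bookkeeping:

    #include <cassert>
    #include <cstdint>

    struct ToyCsrPair {
        uint32_t computeTaskCount = 0;
        uint32_t copyTaskCount = 0;

        // Per the new test: dual-stream offload advances the copy CSR's task count,
        // otherwise the compute CSR's task count advances.
        void appendSplitCopyWithOffload(bool dualStreamCopyOffload) {
            if (dualStreamCopyOffload) {
                ++copyTaskCount;
            } else {
                ++computeTaskCount;
            }
        }
    };

    int main() {
        ToyCsrPair dual{}, single{};
        dual.appendSplitCopyWithOffload(true);
        assert(dual.copyTaskCount == 1 && dual.computeTaskCount == 0);
        single.appendSplitCopyWithOffload(false);
        assert(single.computeTaskCount == 1 && single.copyTaskCount == 0);
        return 0;
    }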