mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-08 14:02:58 +08:00
feature: use bcs stream for copy offload bcs split

Related-To: NEO-14557
Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
36614e614f
commit
ac8f8d1d8c
@@ -502,6 +502,7 @@ struct CommandList : _ze_command_list_handle_t {
|
||||
uint32_t getActiveScratchPatchElements() const {
|
||||
return activeScratchPatchElements;
|
||||
}
|
||||
bool isDualStreamCopyOffloadOperation(bool offloadOperation) const { return (getCopyOffloadModeForOperation(offloadOperation) == CopyOffloadModes::dualStream); }
|
||||
|
||||
protected:
|
||||
NEO::GraphicsAllocation *getAllocationFromHostPtrMap(const void *buffer, uint64_t bufferSize, bool copyOffload);
|
||||
@@ -513,7 +514,6 @@ struct CommandList : _ze_command_list_handle_t {
|
||||
}
|
||||
MOCKABLE_VIRTUAL void synchronizeEventList(uint32_t numWaitEvents, ze_event_handle_t *waitEventList);
|
||||
|
||||
bool isDualStreamCopyOffloadOperation(bool offloadOperation) const { return (getCopyOffloadModeForOperation(offloadOperation) == CopyOffloadModes::dualStream); }
|
||||
bool isNonDualStreamCopyOffloadOperation(bool offloadOperation) const { return offloadOperation && !isDualStreamCopyOffloadOperation(offloadOperation); }
|
||||
void registerWalkerWithProfilingEnqueued(Event *event);
|
||||
|
||||
|
||||
@@ -252,6 +252,7 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily<gfxCoreFami
|
||||
ze_result_t appendStagingMemoryCopy(void *dstptr, const void *srcptr, size_t size, ze_event_handle_t hSignalEvent, CmdListMemoryCopyParams &memoryCopyParams);
|
||||
ze_result_t stagingStatusToL0(const NEO::StagingTransferStatus &status) const;
|
||||
size_t estimateAdditionalSizeAppendRegularCommandLists(uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists);
|
||||
void setupFlagsForBcsSplit(CmdListMemoryCopyParams &memoryCopyParams, bool &hasStallingCmds, bool ©OffloadFlush);
|
||||
|
||||
MOCKABLE_VIRTUAL void checkAssert();
|
||||
ComputeFlushMethodType computeFlushMethod = nullptr;
|
||||
|
||||
@@ -643,6 +643,16 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendBarrier(ze_even
|
||||
return flushImmediate(ret, true, isStallingOperation, relaxedOrderingDispatch, NEO::AppendOperations::nonKernel, false, hSignalEvent, false, nullptr, nullptr);
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
void CommandListCoreFamilyImmediate<gfxCoreFamily>::setupFlagsForBcsSplit(CmdListMemoryCopyParams &memoryCopyParams, bool &hasStallingCmds, bool ©OffloadFlush) {
|
||||
memoryCopyParams.relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(1, false); // split generates more than 1 event
|
||||
memoryCopyParams.forceDisableCopyOnlyInOrderSignaling = true;
|
||||
memoryCopyParams.taskCountUpdateRequired = true;
|
||||
memoryCopyParams.copyOffloadAllowed = this->isCopyOffloadEnabled();
|
||||
copyOffloadFlush = memoryCopyParams.copyOffloadAllowed;
|
||||
hasStallingCmds = !memoryCopyParams.relaxedOrderingDispatch;
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy(
|
||||
void *dstptr,
|
||||
@@ -652,6 +662,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy(
|
||||
uint32_t numWaitEvents,
|
||||
ze_event_handle_t *phWaitEvents, CmdListMemoryCopyParams &memoryCopyParams) {
|
||||
memoryCopyParams.relaxedOrderingDispatch |= isRelaxedOrderingDispatchAllowed(numWaitEvents, isCopyOffloadEnabled());
|
||||
bool copyOffloadFlush = false;
|
||||
|
||||
auto estimatedSize = commonImmediateCommandSize;
|
||||
if (isCopyOnly(true)) {
|
||||
@@ -662,7 +673,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy(
|
||||
}
|
||||
checkAvailableSpace(numWaitEvents, memoryCopyParams.relaxedOrderingDispatch, estimatedSize, false);
|
||||
|
||||
bool hasStallindCmds = hasStallingCmdsForRelaxedOrdering(numWaitEvents, memoryCopyParams.relaxedOrderingDispatch);
|
||||
bool hasStallingCmds = hasStallingCmdsForRelaxedOrdering(numWaitEvents, memoryCopyParams.relaxedOrderingDispatch);
|
||||
|
||||
ze_result_t ret;
|
||||
CpuMemCopyInfo cpuMemCopyInfo(dstptr, const_cast<void *>(srcptr), size);
|
||||
@@ -678,16 +689,14 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy(
|
||||
NEO::TransferDirection direction;
|
||||
auto isSplitNeeded = this->isAppendSplitNeeded(dstptr, srcptr, size, direction);
|
||||
if (isSplitNeeded) {
|
||||
memoryCopyParams.relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(1, false); // split generates more than 1 event
|
||||
memoryCopyParams.forceDisableCopyOnlyInOrderSignaling = true;
|
||||
memoryCopyParams.taskCountUpdateRequired = true;
|
||||
hasStallindCmds = !memoryCopyParams.relaxedOrderingDispatch;
|
||||
setupFlagsForBcsSplit(memoryCopyParams, hasStallingCmds, copyOffloadFlush);
|
||||
|
||||
auto splitCall = [&](CommandListCoreFamilyImmediate<gfxCoreFamily> *subCmdList, void *dstptrParam, const void *srcptrParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) {
|
||||
return subCmdList->CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(dstptrParam, srcptrParam, sizeParam, hSignalEventParam, 0u, nullptr, memoryCopyParams);
|
||||
};
|
||||
|
||||
ret = static_cast<DeviceImp *>(this->device)->bcsSplit->appendSplitCall<gfxCoreFamily, void *, const void *>(this, dstptr, srcptr, size, hSignalEvent, numWaitEvents, phWaitEvents, true, memoryCopyParams.relaxedOrderingDispatch, direction, estimatedSize, splitCall);
|
||||
|
||||
} else if (this->isValidForStagingTransfer(dstptr, srcptr, size, numWaitEvents > 0)) {
|
||||
return this->appendStagingMemoryCopy(dstptr, srcptr, size, hSignalEvent, memoryCopyParams);
|
||||
} else {
|
||||
@@ -695,7 +704,9 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy(
|
||||
numWaitEvents, phWaitEvents, memoryCopyParams);
|
||||
}
|
||||
|
||||
return flushImmediate(ret, true, hasStallindCmds, memoryCopyParams.relaxedOrderingDispatch, NEO::AppendOperations::kernel, memoryCopyParams.copyOffloadAllowed, hSignalEvent, memoryCopyParams.taskCountUpdateRequired, nullptr, nullptr);
|
||||
copyOffloadFlush |= memoryCopyParams.copyOffloadAllowed;
|
||||
|
||||
return flushImmediate(ret, true, hasStallingCmds, memoryCopyParams.relaxedOrderingDispatch, NEO::AppendOperations::kernel, copyOffloadFlush, hSignalEvent, memoryCopyParams.taskCountUpdateRequired, nullptr, nullptr);
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
@@ -723,17 +734,15 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopyRegio
|
||||
}
|
||||
checkAvailableSpace(numWaitEvents, memoryCopyParams.relaxedOrderingDispatch, estimatedSize, false);
|
||||
|
||||
bool hasStallindCmds = hasStallingCmdsForRelaxedOrdering(numWaitEvents, memoryCopyParams.relaxedOrderingDispatch);
|
||||
bool hasStallingCmds = hasStallingCmdsForRelaxedOrdering(numWaitEvents, memoryCopyParams.relaxedOrderingDispatch);
|
||||
bool copyOffloadFlush = false;
|
||||
|
||||
ze_result_t ret;
|
||||
|
||||
NEO::TransferDirection direction;
|
||||
auto isSplitNeeded = this->isAppendSplitNeeded(dstPtr, srcPtr, this->getTotalSizeForCopyRegion(dstRegion, dstPitch, dstSlicePitch), direction);
|
||||
if (isSplitNeeded) {
|
||||
memoryCopyParams.relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(1, false); // split generates more than 1 event
|
||||
memoryCopyParams.forceDisableCopyOnlyInOrderSignaling = true;
|
||||
memoryCopyParams.taskCountUpdateRequired = true;
|
||||
hasStallindCmds = !memoryCopyParams.relaxedOrderingDispatch;
|
||||
setupFlagsForBcsSplit(memoryCopyParams, hasStallingCmds, copyOffloadFlush);
|
||||
|
||||
auto splitCall = [&](CommandListCoreFamilyImmediate<gfxCoreFamily> *subCmdList, uint32_t dstOriginXParam, uint32_t srcOriginXParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) {
|
||||
ze_copy_region_t dstRegionLocal = {};
|
||||
@@ -756,7 +765,9 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopyRegio
|
||||
hSignalEvent, numWaitEvents, phWaitEvents, memoryCopyParams);
|
||||
}
|
||||
|
||||
return flushImmediate(ret, true, hasStallindCmds, memoryCopyParams.relaxedOrderingDispatch, NEO::AppendOperations::kernel, memoryCopyParams.copyOffloadAllowed, hSignalEvent, memoryCopyParams.taskCountUpdateRequired, nullptr, nullptr);
|
||||
copyOffloadFlush |= memoryCopyParams.copyOffloadAllowed;
|
||||
|
||||
return flushImmediate(ret, true, hasStallingCmds, memoryCopyParams.relaxedOrderingDispatch, NEO::AppendOperations::kernel, copyOffloadFlush, hSignalEvent, memoryCopyParams.taskCountUpdateRequired, nullptr, nullptr);
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
|
||||
@@ -148,9 +148,11 @@ struct BcsSplit {
|
||||
}
|
||||
}
|
||||
|
||||
cmdList->addEventsToCmdList(static_cast<uint32_t>(eventHandles.size()), eventHandles.data(), nullptr, hasRelaxedOrderingDependencies, false, true, false, false);
|
||||
const bool dualStreamCopyOffload = cmdList->isDualStreamCopyOffloadOperation(cmdList->isCopyOffloadEnabled());
|
||||
|
||||
const auto isCopyCmdList = cmdList->isCopyOnly(false);
|
||||
cmdList->addEventsToCmdList(static_cast<uint32_t>(eventHandles.size()), eventHandles.data(), nullptr, hasRelaxedOrderingDependencies, false, true, false, dualStreamCopyOffload);
|
||||
|
||||
const auto isCopyCmdList = cmdList->isCopyOnly(dualStreamCopyOffload);
|
||||
|
||||
if (signalEvent) {
|
||||
cmdList->appendSignalEventPostWalker(signalEvent, nullptr, nullptr, !isCopyCmdList, false, isCopyCmdList);
|
||||
@@ -161,9 +163,9 @@ struct BcsSplit {
|
||||
}
|
||||
|
||||
if (cmdList->isInOrderExecutionEnabled()) {
|
||||
cmdList->appendSignalInOrderDependencyCounter(signalEvent, false, false, false);
|
||||
cmdList->appendSignalInOrderDependencyCounter(signalEvent, dualStreamCopyOffload, false, false);
|
||||
}
|
||||
cmdList->handleInOrderDependencyCounter(signalEvent, false, false);
|
||||
cmdList->handleInOrderDependencyCounter(signalEvent, false, dualStreamCopyOffload);
|
||||
|
||||
if (aggregatedEventsMode) {
|
||||
cmdList->assignInOrderExecInfoToEvent(this->events.marker[markerEventIndex]);
|
||||
|
||||
@@ -1171,18 +1171,58 @@ HWTEST2_F(AggregatedBcsSplitTests, givenCopyOffloadEnabledWhenAppendWithEventCal
|
||||
GenCmdList cmdList;
|
||||
ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(cmdList, ptrOffset(cmdStream->getCpuBase(), offset), (cmdStream->getUsed() - offset)));
|
||||
|
||||
auto lriItor = find<typename FamilyType::MI_LOAD_REGISTER_IMM *>(cmdList.begin(), cmdList.end());
|
||||
ASSERT_NE(cmdList.end(), lriItor);
|
||||
|
||||
auto itor = find<typename FamilyType::PIPE_CONTROL *>(cmdList.begin(), lriItor);
|
||||
EXPECT_EQ(lriItor, itor);
|
||||
|
||||
itor = find<typename FamilyType::MI_FLUSH_DW *>(cmdList.begin(), cmdList.end());
|
||||
auto itor = find<typename FamilyType::PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
|
||||
EXPECT_EQ(cmdList.end(), itor);
|
||||
|
||||
context->freeMem(ptr);
|
||||
}
|
||||
|
||||
HWTEST2_F(AggregatedBcsSplitTests, givenCopyOffloadEnabledWhenAppendThenUseCopyQueue, IsAtLeastXeHpcCore) {
    // DC-flush-required platforms take a different flush path, so the
    // MI_FLUSH_DW expectations below do not apply there.
    if (device->getProductHelper().isDcFlushAllowed()) {
        GTEST_SKIP();
    }

    debugManager.flags.ForceCopyOperationOffloadForComputeCmdList.set(1);

    ze_result_t returnValue;
    auto immCmdList = createCmdList(false);

    auto hostPtr = allocHostMem();

    // Timestamp event exercises the profiling path of the offloaded copy.
    ze_event_pool_desc_t eventPoolDesc = {.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP, .count = 1};
    ze_event_desc_t eventDesc = {};

    auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue));
    auto signalEvent = std::unique_ptr<L0::Event>(L0::Event::create<typename FamilyType::TimestampPacketType>(eventPool.get(), &eventDesc, device, returnValue));

    auto stream = immCmdList->getCmdContainer().getCommandStream();
    auto streamOffset = stream->getUsed();

    // Snapshot both CSR task counts so the post-append deltas reveal which
    // engine actually flushed the copy.
    auto computeTaskCountBefore = immCmdList->getCsr(false)->peekTaskCount();
    TaskCountType copyTaskCountBefore = 0;
    if (immCmdList->isDualStreamCopyOffloadOperation(true)) {
        copyTaskCountBefore = immCmdList->getCsr(true)->peekTaskCount();
    }

    immCmdList->appendMemoryCopy(hostPtr, hostPtr, copySize, signalEvent->toHandle(), 0, nullptr, copyParams);

    GenCmdList parsedCmds;
    ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(parsedCmds, ptrOffset(stream->getCpuBase(), streamOffset), (stream->getUsed() - streamOffset)));
    auto flushDwItor = find<typename FamilyType::MI_FLUSH_DW *>(parsedCmds.begin(), parsedCmds.end());

    if (immCmdList->isDualStreamCopyOffloadOperation(true)) {
        // Dual-stream offload: copy goes to the BCS CSR, compute CSR untouched,
        // and the blitter programs an MI_FLUSH_DW.
        EXPECT_EQ(computeTaskCountBefore, immCmdList->getCsr(false)->peekTaskCount());
        EXPECT_EQ(copyTaskCountBefore + 1, immCmdList->getCsr(true)->peekTaskCount());

        EXPECT_NE(parsedCmds.end(), flushDwItor);
    } else {
        // No dual-stream offload: the compute CSR flushes and no blitter
        // flush command is expected in the stream.
        EXPECT_EQ(computeTaskCountBefore + 1, immCmdList->getCsr(false)->peekTaskCount());
        EXPECT_EQ(parsedCmds.end(), flushDwItor);
    }

    context->freeMem(hostPtr);
}
|
||||
|
||||
HWTEST_F(AggregatedBcsSplitTests, givenTransferDirectionWhenAskingIfSplitIsNeededThenReturnCorrectValue) {
|
||||
debugManager.flags.SplitBcsTransferDirectionMask.set(-1);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user