feature: relaxed ordering in copy offload path

Related-To: NEO-11376

Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
This commit is contained in:
Bartosz Dunajski 2024-06-14 09:43:05 +00:00 committed by Compute-Runtime-Automation
parent 8d28f8d90e
commit b0c924d40e
6 changed files with 86 additions and 26 deletions

View File

@ -356,7 +356,7 @@ struct CommandListCoreFamily : public CommandListImp {
}
void postInitComputeSetup();
NEO::PreemptionMode obtainKernelPreemptionMode(Kernel *kernel);
virtual bool isRelaxedOrderingDispatchAllowed(uint32_t numWaitEvents) const { return false; }
virtual bool isRelaxedOrderingDispatchAllowed(uint32_t numWaitEvents, bool copyOffload) const { return false; }
virtual void setupFlushMethod(const NEO::RootDeviceEnvironment &rootDeviceEnvironment) {}
bool canSkipInOrderEventWait(Event &event, bool ignorCbEventBoundToCmdList) const;
bool handleInOrderImplicitDependencies(bool relaxedOrderingAllowed);

View File

@ -588,7 +588,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendEventReset(ze_event_hand
}
if (this->isInOrderExecutionEnabled()) {
handleInOrderImplicitDependencies(isRelaxedOrderingDispatchAllowed(0));
handleInOrderImplicitDependencies(isRelaxedOrderingDispatchAllowed(0, false));
}
appendSynchronizedDispatchInitializationSection();
@ -2476,7 +2476,7 @@ inline ze_result_t CommandListCoreFamily<gfxCoreFamily>::addEventsToCmdList(uint
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendSignalEvent(ze_event_handle_t hEvent) {
if (this->isInOrderExecutionEnabled()) {
handleInOrderImplicitDependencies(isRelaxedOrderingDispatchAllowed(0));
handleInOrderImplicitDependencies(isRelaxedOrderingDispatchAllowed(0, false));
}
auto event = Event::fromHandle(hEvent);

View File

@ -213,7 +213,7 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily<gfxCoreFami
TransferType getTransferType(const CpuMemCopyInfo &cpuMemCopyInfo);
size_t getTransferThreshold(TransferType transferType);
bool isBarrierRequired();
bool isRelaxedOrderingDispatchAllowed(uint32_t numWaitEvents) const override;
bool isRelaxedOrderingDispatchAllowed(uint32_t numWaitEvents, bool copyOffload) const override;
bool skipInOrderNonWalkerSignalingAllowed(ze_event_handle_t signalEvent) const override;
protected:

View File

@ -490,7 +490,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendLaunchKernel(
ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents,
CmdListKernelLaunchParams &launchParams, bool relaxedOrderingDispatch) {
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents);
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents, false);
bool stallingCmdsForRelaxedOrdering = hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch);
checkAvailableSpace(numWaitEvents, relaxedOrderingDispatch, commonImmediateCommandSize);
@ -525,7 +525,7 @@ void CommandListCoreFamilyImmediate<gfxCoreFamily>::handleInOrderNonWalkerSignal
bool nonWalkerSignalingHasRelaxedOrdering = false;
if (NEO::debugManager.flags.EnableInOrderRelaxedOrderingForEventsChaining.get() != 0) {
nonWalkerSignalingHasRelaxedOrdering = isRelaxedOrderingDispatchAllowed(1);
nonWalkerSignalingHasRelaxedOrdering = isRelaxedOrderingDispatchAllowed(1, false);
}
if (nonWalkerSignalingHasRelaxedOrdering) {
@ -543,7 +543,7 @@ template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendLaunchKernelIndirect(
ze_kernel_handle_t kernelHandle, const ze_group_count_t &pDispatchArgumentsBuffer,
ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch) {
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents);
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents, false);
checkAvailableSpace(numWaitEvents, relaxedOrderingDispatch, commonImmediateCommandSize);
@ -568,7 +568,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendBarrier(ze_even
return ZE_RESULT_SUCCESS;
}
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents);
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents, false);
isStallingOperation = hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch);
}
@ -588,7 +588,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy(
ze_event_handle_t hSignalEvent,
uint32_t numWaitEvents,
ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch, bool forceDisableCopyOnlyInOrderSignaling) {
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents);
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents, isCopyOffloadEnabled());
auto estimatedSize = commonImmediateCommandSize;
if (isCopyOnly() || isCopyOffloadEnabled()) {
@ -615,7 +615,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy(
NEO::TransferDirection direction;
auto isSplitNeeded = this->isAppendSplitNeeded(dstptr, srcptr, size, direction);
if (isSplitNeeded) {
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(1); // split generates more than 1 event
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(1, false); // split generates more than 1 event
hasStallindCmds = !relaxedOrderingDispatch;
ret = static_cast<DeviceImp *>(this->device)->bcsSplit.appendSplitCall<gfxCoreFamily, void *, const void *>(this, dstptr, srcptr, size, hSignalEvent, numWaitEvents, phWaitEvents, true, relaxedOrderingDispatch, direction, [&](void *dstptrParam, const void *srcptrParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) {
@ -642,7 +642,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopyRegio
ze_event_handle_t hSignalEvent,
uint32_t numWaitEvents,
ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch, bool forceDisableCopyOnlyInOrderSignaling) {
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents);
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents, isCopyOffloadEnabled());
auto estimatedSize = commonImmediateCommandSize;
if (isCopyOnly() || isCopyOffloadEnabled()) {
@ -661,7 +661,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopyRegio
NEO::TransferDirection direction;
auto isSplitNeeded = this->isAppendSplitNeeded(dstPtr, srcPtr, this->getTotalSizeForCopyRegion(dstRegion, dstPitch, dstSlicePitch), direction);
if (isSplitNeeded) {
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(1); // split generates more than 1 event
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(1, false); // split generates more than 1 event
hasStallindCmds = !relaxedOrderingDispatch;
ret = static_cast<DeviceImp *>(this->device)->bcsSplit.appendSplitCall<gfxCoreFamily, uint32_t, uint32_t>(this, dstRegion->originX, srcRegion->originX, dstRegion->width, hSignalEvent, numWaitEvents, phWaitEvents, true, relaxedOrderingDispatch, direction, [&](uint32_t dstOriginXParam, uint32_t srcOriginXParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) {
@ -692,7 +692,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryFill(void
ze_event_handle_t hSignalEvent,
uint32_t numWaitEvents,
ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch) {
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents);
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents, false);
checkAvailableSpace(numWaitEvents, relaxedOrderingDispatch, commonImmediateCommandSize);
@ -736,7 +736,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendPageFaultCopy(N
bool relaxedOrdering = false;
if (isSplitNeeded) {
relaxedOrdering = isRelaxedOrderingDispatchAllowed(1); // split generates more than 1 event
relaxedOrdering = isRelaxedOrderingDispatchAllowed(1, false); // split generates more than 1 event
uintptr_t dstAddress = static_cast<uintptr_t>(dstAllocation->getGpuAddress());
uintptr_t srcAddress = static_cast<uintptr_t>(srcAllocation->getGpuAddress());
ret = static_cast<DeviceImp *>(this->device)->bcsSplit.appendSplitCall<gfxCoreFamily, uintptr_t, uintptr_t>(this, dstAddress, srcAddress, size, nullptr, 0u, nullptr, false, relaxedOrdering, direction, [&](uintptr_t dstAddressParam, uintptr_t srcAddressParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) {
@ -815,7 +815,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendImageCopyRegion
ze_event_handle_t hSignalEvent,
uint32_t numWaitEvents,
ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch) {
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents);
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents, false);
auto estimatedSize = commonImmediateCommandSize;
if (isCopyOnly()) {
@ -840,7 +840,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendImageCopyFromMe
ze_event_handle_t hSignalEvent,
uint32_t numWaitEvents,
ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch) {
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents);
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents, false);
checkAvailableSpace(numWaitEvents, relaxedOrderingDispatch, commonImmediateCommandSize);
@ -858,7 +858,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendImageCopyToMemo
ze_event_handle_t hSignalEvent,
uint32_t numWaitEvents,
ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch) {
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents);
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents, false);
checkAvailableSpace(numWaitEvents, relaxedOrderingDispatch, commonImmediateCommandSize);
@ -878,7 +878,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendImageCopyFromMe
ze_event_handle_t hSignalEvent,
uint32_t numWaitEvents,
ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch) {
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents);
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents, false);
checkAvailableSpace(numWaitEvents, relaxedOrderingDispatch, commonImmediateCommandSize);
@ -898,7 +898,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendImageCopyToMemo
ze_event_handle_t hSignalEvent,
uint32_t numWaitEvents,
ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch) {
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents);
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents, false);
checkAvailableSpace(numWaitEvents, relaxedOrderingDispatch, commonImmediateCommandSize);
@ -927,7 +927,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendLaunchCooperati
ze_event_handle_t hSignalEvent,
uint32_t numWaitEvents,
ze_event_handle_t *waitEventHandles, bool relaxedOrderingDispatch) {
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents);
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents, false);
checkAvailableSpace(numWaitEvents, relaxedOrderingDispatch, commonImmediateCommandSize);
@ -1396,10 +1396,10 @@ void CommandListCoreFamilyImmediate<gfxCoreFamily>::checkAssert() {
}
template <GFXCORE_FAMILY gfxCoreFamily>
bool CommandListCoreFamilyImmediate<gfxCoreFamily>::isRelaxedOrderingDispatchAllowed(uint32_t numWaitEvents) const {
bool CommandListCoreFamilyImmediate<gfxCoreFamily>::isRelaxedOrderingDispatchAllowed(uint32_t numWaitEvents, bool copyOffload) const {
auto numEvents = numWaitEvents + (this->hasInOrderDependencies() ? 1 : 0);
return NEO::RelaxedOrderingHelper::isRelaxedOrderingDispatchAllowed(*getCsr(false), numEvents);
return NEO::RelaxedOrderingHelper::isRelaxedOrderingDispatchAllowed(*getCsr(copyOffload), numEvents);
}
template <GFXCORE_FAMILY gfxCoreFamily>

View File

@ -546,7 +546,7 @@ HWTEST2_F(CommandListTest, givenRegularCmdListWhenAskingForRelaxedOrderingThenRe
auto commandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>>();
commandList->initialize(device, NEO::EngineGroupType::renderCompute, 0u);
EXPECT_FALSE(commandList->isRelaxedOrderingDispatchAllowed(5));
EXPECT_FALSE(commandList->isRelaxedOrderingDispatchAllowed(5, false));
}
HWTEST2_F(CommandListTest,

View File

@ -7,6 +7,7 @@
#include "shared/source/command_container/command_encoder.h"
#include "shared/source/command_container/implicit_scaling.h"
#include "shared/source/direct_submission/dispatchers/blitter_dispatcher.h"
#include "shared/source/gmm_helper/gmm_helper.h"
#include "shared/source/helpers/constants.h"
#include "shared/source/helpers/register_offsets.h"
@ -1640,7 +1641,7 @@ HWTEST2_F(InOrderCmdListTests, givenCmdsChainingWhenDispatchingKernelWithRelaxed
immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, eventHandle, 0, nullptr, launchParams, false);
findConditionalBbStarts(1); // chaining
EXPECT_TRUE(immCmdList->isRelaxedOrderingDispatchAllowed(0));
EXPECT_TRUE(immCmdList->isRelaxedOrderingDispatchAllowed(0, false));
offset = cmdStream->getUsed();
immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false);
@ -2039,7 +2040,7 @@ HWTEST2_F(InOrderCmdListTests, givenRelaxedOrderingWhenProgrammingTimestampEvent
immCmdList->inOrderExecInfo->addCounterValue(1);
EXPECT_TRUE(immCmdList->isRelaxedOrderingDispatchAllowed(0));
EXPECT_TRUE(immCmdList->isRelaxedOrderingDispatchAllowed(0, false));
EXPECT_EQ(0u, immCmdList->flushData.size());
@ -2146,7 +2147,7 @@ HWTEST2_F(InOrderCmdListTests, givenDebugFlagSetWhenChainingWithRelaxedOrderingT
immCmdList->inOrderExecInfo->addCounterValue(1);
EXPECT_TRUE(immCmdList->isRelaxedOrderingDispatchAllowed(0));
EXPECT_TRUE(immCmdList->isRelaxedOrderingDispatchAllowed(0, false));
EXPECT_EQ(0u, immCmdList->flushCount);
@ -7231,6 +7232,65 @@ HWTEST2_F(CopyOffloadInOrderTests, givenCopyOperationWithHostVisibleEventThenMar
EXPECT_EQ(!immCmdList->dcFlushSupport, immCmdList->latestFlushIsHostVisible);
}
// Verifies that relaxed ordering is evaluated per CSR: kernel dispatches are expected to follow
// the state of getCsr(false), while memory copies are expected to follow the state of getCsr(true).
HWTEST2_F(CopyOffloadInOrderTests, givenRelaxedOrderingEnabledWhenDispatchingThenUseCorrectCsr, IsAtLeastXeHpcCore) {
    // Mock command list: flushImmediate submits nothing and only records the
    // relaxed ordering flag it was called with, so the test can inspect it afterwards.
    class FlushRecordingCmdList : public WhiteBox<L0::CommandListCoreFamilyImmediate<gfxCoreFamily>> {
      public:
        bool recordedRelaxedOrdering = false;

        ze_result_t flushImmediate(ze_result_t inputRet, bool performMigration, bool hasStallingCmds, bool hasRelaxedOrderingDependencies, bool kernelOperation, bool copyOffloadSubmission, ze_event_handle_t hSignalEvent) override {
            recordedRelaxedOrdering = hasRelaxedOrderingDependencies;
            return ZE_RESULT_SUCCESS;
        }
    };

    debugManager.flags.DirectSubmissionRelaxedOrdering.set(1);

    auto cmdList = createImmCmdListImpl<gfxCoreFamily, FlushRecordingCmdList>(true);

    using UltCsr = UltCommandStreamReceiver<FamilyType>;
    auto computeCsr = static_cast<UltCsr *>(cmdList->getCsr(false));
    auto offloadCsr = static_cast<UltCsr *>(cmdList->getCsr(true));

    // Install a mock direct submission on each CSR (render dispatcher for compute, blitter dispatcher for offload).
    auto computeDirectSubmission = new MockDirectSubmissionHw<FamilyType, RenderDispatcher<FamilyType>>(*computeCsr);
    auto blitterDirectSubmission = new MockDirectSubmissionHw<FamilyType, BlitterDispatcher<FamilyType>>(*offloadCsr);
    computeCsr->directSubmission.reset(computeDirectSubmission);
    offloadCsr->blitterDirectSubmission.reset(blitterDirectSubmission);

    // Only the addresses of these objects are used, as distinct client identities.
    int firstClient = 0;
    int secondClient = 0;

    const auto dispatchKernel = [&]() {
        cmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false);
    };
    const auto dispatchCopy = [&]() {
        cmdList->appendMemoryCopy(&copyData1, &copyData2, 1, nullptr, 0, nullptr, false, false);
    };
    const auto registerBothClients = [&](UltCsr *csr) {
        csr->registerClient(&firstClient);
        csr->registerClient(&secondClient);
    };
    const auto unregisterBothClients = [&](UltCsr *csr) {
        csr->unregisterClient(&firstClient);
        csr->unregisterClient(&secondClient);
    };

    // first dependency
    dispatchKernel();

    // Phase 1 - both clients attached to the compute CSR:
    // only the non-offload query and the kernel dispatch are expected to report relaxed ordering.
    registerBothClients(computeCsr);

    EXPECT_TRUE(cmdList->isRelaxedOrderingDispatchAllowed(0, false));
    EXPECT_FALSE(cmdList->isRelaxedOrderingDispatchAllowed(0, true));

    dispatchKernel();
    EXPECT_TRUE(cmdList->recordedRelaxedOrdering);

    dispatchCopy();
    EXPECT_FALSE(cmdList->recordedRelaxedOrdering);

    // Phase 2 - clients moved to the offload CSR: the expectations flip.
    unregisterBothClients(computeCsr);
    registerBothClients(offloadCsr);

    EXPECT_FALSE(cmdList->isRelaxedOrderingDispatchAllowed(0, false));
    EXPECT_TRUE(cmdList->isRelaxedOrderingDispatchAllowed(0, true));

    dispatchKernel();
    EXPECT_FALSE(cmdList->recordedRelaxedOrdering);

    dispatchCopy();
    EXPECT_TRUE(cmdList->recordedRelaxedOrdering);
}
HWTEST2_F(CopyOffloadInOrderTests, givenInOrderModeWhenCallingSyncThenHandleCompletionOnCorrectCsr, IsAtLeastXeHpCore) {
auto immCmdList = createImmCmdListWithOffload<gfxCoreFamily>();