feature: new heuristic to enable relaxed ordering 2

Related-To: NEO-13431

Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
Dunajski, Bartosz 2024-12-09 09:55:26 +00:00 committed by Compute-Runtime-Automation
parent 8f671cb6a8
commit 37e81d2a11
11 changed files with 214 additions and 6 deletions

View File

@ -354,7 +354,7 @@ struct CommandListCoreFamily : public CommandListImp {
}
void postInitComputeSetup();
NEO::PreemptionMode obtainKernelPreemptionMode(Kernel *kernel);
// Base-class default: relaxed ordering dispatch is never allowed (always returns false).
// NOTE(review): the two declarations below differ only in the const qualifier and look like the
// before/after lines of a diff hunk - confirm which one the real header keeps.
virtual bool isRelaxedOrderingDispatchAllowed(uint32_t numWaitEvents, bool copyOffload) const { return false; }
virtual bool isRelaxedOrderingDispatchAllowed(uint32_t numWaitEvents, bool copyOffload) { return false; }
virtual void setupFlushMethod(const NEO::RootDeviceEnvironment &rootDeviceEnvironment) {}
bool canSkipInOrderEventWait(Event &event, bool ignorCbEventBoundToCmdList) const;
bool handleInOrderImplicitDependencies(bool relaxedOrderingAllowed, bool copyOffloadOperation);

View File

@ -208,7 +208,7 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily<gfxCoreFami
TransferType getTransferType(const CpuMemCopyInfo &cpuMemCopyInfo);
size_t getTransferThreshold(TransferType transferType);
bool isBarrierRequired();
bool isRelaxedOrderingDispatchAllowed(uint32_t numWaitEvents, bool copyOffload) const override;
bool isRelaxedOrderingDispatchAllowed(uint32_t numWaitEvents, bool copyOffload) override;
bool skipInOrderNonWalkerSignalingAllowed(ze_event_handle_t signalEvent) const override;
protected:
@ -221,12 +221,15 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily<gfxCoreFami
void setupFlushMethod(const NEO::RootDeviceEnvironment &rootDeviceEnvironment) override;
void allocateOrReuseKernelPrivateMemoryIfNeeded(Kernel *kernel, uint32_t sizePerHwThread) override;
void handleInOrderNonWalkerSignaling(Event *event, bool &hasStallingCmds, bool &relaxedOrderingDispatch, ze_result_t &result);
CommandQueue *getCmdQImmediate(bool copyOffloadOperation) const;
MOCKABLE_VIRTUAL void checkAssert();
ComputeFlushMethodType computeFlushMethod = nullptr;
uint64_t relaxedOrderingCounter = 0;
std::atomic<bool> dependenciesPresent{false};
bool latestFlushIsHostVisible = false;
bool latestFlushIsCopyOffload = false;
bool keepRelaxedOrderingEnabled = false;
};
template <PRODUCT_FAMILY gfxProductFamily>

View File

@ -398,7 +398,7 @@ NEO::CompletionStamp CommandListCoreFamilyImmediate<gfxCoreFamily>::flushRegular
// Picks the immediate command queue for this submission (the copy-offload queue when
// copyOffloadSubmission is true, the regular immediate queue otherwise) and forwards the
// remaining arguments to executeCommandListImmediateWithFlushTaskImpl.
// NOTE(review): the two return statements below look like the before/after lines of a diff hunk;
// as written only the first one is reachable. Both select the queue by the same flag.
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::executeCommandListImmediateWithFlushTask(bool performMigration, bool hasStallingCmds, bool hasRelaxedOrderingDependencies, bool kernelOperation, bool copyOffloadSubmission, bool requireTaskCountUpdate) {
return executeCommandListImmediateWithFlushTaskImpl(performMigration, hasStallingCmds, hasRelaxedOrderingDependencies, kernelOperation, requireTaskCountUpdate, copyOffloadSubmission ? this->cmdQImmediateCopyOffload : this->cmdQImmediate);
return executeCommandListImmediateWithFlushTaskImpl(performMigration, hasStallingCmds, hasRelaxedOrderingDependencies, kernelOperation, requireTaskCountUpdate, getCmdQImmediate(copyOffloadSubmission));
}
template <GFXCORE_FAMILY gfxCoreFamily>
@ -546,7 +546,9 @@ void CommandListCoreFamilyImmediate<gfxCoreFamily>::handleInOrderNonWalkerSignal
bool nonWalkerSignalingHasRelaxedOrdering = false;
if (NEO::debugManager.flags.EnableInOrderRelaxedOrderingForEventsChaining.get() != 0) {
auto counterValueBeforeSecondCheck = this->relaxedOrderingCounter;
nonWalkerSignalingHasRelaxedOrdering = isRelaxedOrderingDispatchAllowed(1, false);
this->relaxedOrderingCounter = counterValueBeforeSecondCheck; // dont increment twice
}
if (nonWalkerSignalingHasRelaxedOrdering) {
@ -1064,13 +1066,18 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::hostSynchronize(uint6
return hostSynchronize(timeout, true);
}
// Returns the immediate command queue matching the kind of operation: the copy-offload
// queue when copyOffloadOperation is true, the regular immediate queue otherwise.
template <GFXCORE_FAMILY gfxCoreFamily>
CommandQueue *CommandListCoreFamilyImmediate<gfxCoreFamily>::getCmdQImmediate(bool copyOffloadOperation) const {
    if (copyOffloadOperation) {
        return this->cmdQImmediateCopyOffload;
    }

    return this->cmdQImmediate;
}
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::flushImmediate(ze_result_t inputRet, bool performMigration, bool hasStallingCmds,
bool hasRelaxedOrderingDependencies, bool kernelOperation, bool copyOffloadSubmission, ze_event_handle_t hSignalEvent,
bool requireTaskCountUpdate) {
auto signalEvent = Event::fromHandle(hSignalEvent);
auto queue = copyOffloadSubmission ? this->cmdQImmediateCopyOffload : this->cmdQImmediate;
auto queue = getCmdQImmediate(copyOffloadSubmission);
this->latestFlushIsCopyOffload = copyOffloadSubmission;
if (NEO::debugManager.flags.DeferStateInitSubmissionToFirstRegularUsage.get() == 1) {
@ -1431,10 +1438,46 @@ void CommandListCoreFamilyImmediate<gfxCoreFamily>::checkAssert() {
}
// Decides whether the current append on this immediate command list may be dispatched with
// relaxed ordering.
//
// numWaitEvents - number of explicit wait events of the append.
// copyOffload   - selects which CSR / immediate queue is inspected (passed to getCsr and getCmdQImmediate).
// Returns true when relaxed ordering dispatch is allowed.
//
// Not const: the counter heuristic below updates relaxedOrderingCounter and keepRelaxedOrderingEnabled.
// NOTE(review): this span appears to interleave pre-change and post-change lines of a diff hunk
// (two signatures, and an early unconditional return through RelaxedOrderingHelper before the
// heuristic). The comments describe the statements as written - confirm against the real file.
template <GFXCORE_FAMILY gfxCoreFamily>
bool CommandListCoreFamilyImmediate<gfxCoreFamily>::isRelaxedOrderingDispatchAllowed(uint32_t numWaitEvents, bool copyOffload) const {
bool CommandListCoreFamilyImmediate<gfxCoreFamily>::isRelaxedOrderingDispatchAllowed(uint32_t numWaitEvents, bool copyOffload) {
auto csr = getCsr(copyOffload);
// Never allowed when the CSR reports relaxed ordering as disabled for direct submission.
if (!csr->directSubmissionRelaxedOrderingEnabled()) {
return false;
}
// An in-order dependency of the command list counts as one additional event.
auto numEvents = numWaitEvents + (this->hasInOrderDependencies() ? 1 : 0);
return NEO::RelaxedOrderingHelper::isRelaxedOrderingDispatchAllowed(*getCsr(copyOffload), numEvents);
// Counter-based heuristic: used only when the debug flag is explicitly set to 1.
if (NEO::debugManager.flags.DirectSubmissionRelaxedOrderingCounterHeuristic.get() == 1) {
// Default threshold is the relaxed ordering queue depth reported by the CSR.
uint32_t relaxedOrderingCounterThreshold = csr->getDirectSubmissionRelaxedOrderingQueueDepth();
auto queueTaskCount = getCmdQImmediate(copyOffload)->getTaskCount();
auto csrTaskCount = csr->peekTaskCount();
// When the CSR task count is 1 and either init-with-first-submission is supported for this CSR
// type or heapless state init is enabled, the queue task count (expected to be 0 here) is
// treated as 1 so that it compares equal to the CSR task count below.
if ((this->device->getNEODevice()->isInitDeviceWithFirstSubmissionSupported(csr->getType()) || this->heaplessStateInitEnabled) && csr->peekTaskCount() == 1) {
DEBUG_BREAK_IF(queueTaskCount != 0);
queueTaskCount = 1;
}
// A threshold debug flag other than -1 overrides the CSR-provided threshold.
if (NEO::debugManager.flags.DirectSubmissionRelaxedOrderingCounterHeuristicTreshold.get() != -1) {
relaxedOrderingCounterThreshold = static_cast<uint32_t>(NEO::debugManager.flags.DirectSubmissionRelaxedOrderingCounterHeuristicTreshold.get());
}
// Equal task counts: count one more consecutive call made through this queue.
if (queueTaskCount == csrTaskCount) {
relaxedOrderingCounter++;
} else {
// Submission from another queue. Reset counter and keep relaxed ordering allowed
relaxedOrderingCounter = 0;
this->keepRelaxedOrderingEnabled = true;
}
// Once the counter exceeds the threshold, relaxed ordering is switched off; the keep flag is
// set again only by the "another queue" branch above.
if (relaxedOrderingCounter > static_cast<uint64_t>(relaxedOrderingCounterThreshold)) {
this->keepRelaxedOrderingEnabled = false;
return false;
}
// Allowed only while the keep flag is set and there is at least one event to wait on.
return (keepRelaxedOrderingEnabled && (numEvents > 0));
}
// Heuristic not enabled: defer to the shared helper.
return NEO::RelaxedOrderingHelper::isRelaxedOrderingDispatchAllowed(*csr, numEvents);
}
template <GFXCORE_FAMILY gfxCoreFamily>

View File

@ -206,6 +206,7 @@ struct WhiteBox<L0::CommandListCoreFamilyImmediate<gfxCoreFamily>>
using BaseClass::eventSignalPipeControl;
using BaseClass::finalStreamState;
using BaseClass::frontEndStateTracking;
using BaseClass::getCmdQImmediate;
using BaseClass::getDcFlushRequired;
using BaseClass::getHostPtrAlloc;
using BaseClass::getInOrderIncrementValue;
@ -227,6 +228,7 @@ struct WhiteBox<L0::CommandListCoreFamilyImmediate<gfxCoreFamily>>
using BaseClass::pipeControlMultiKernelEventSync;
using BaseClass::pipelineSelectStateTracking;
using BaseClass::programRegionGroupBarrier;
using BaseClass::relaxedOrderingCounter;
using BaseClass::requiredStreamState;
using BaseClass::requiresQueueUncachedMocs;
using BaseClass::signalAllEventPackets;

View File

@ -2057,6 +2057,146 @@ HWTEST2_F(InOrderCmdListTests, givenRelaxedOrderingEnabledWhenSignalEventCalledT
verifyFlags(false, true); // relaxed ordering disabled == stalling semaphore
}
// Exercises the counter-based relaxed ordering heuristic with two immediate command lists that
// submit to the same CSR. After every append the test checks the command list's
// relaxedOrderingCounter and whether the latest flushed batch buffer was marked as having
// relaxed ordering dependencies.
HWTEST2_F(InOrderCmdListTests, givenCounterHeuristicForRelaxedOrderingEnabledWhenAppendingThenEnableRelaxedOrderingCorrectly, IsAtLeastXeHpcCore) {
    debugManager.flags.DirectSubmissionRelaxedOrdering.set(1);
    debugManager.flags.DirectSubmissionRelaxedOrderingCounterHeuristic.set(1);

    auto ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(device->getNEODevice()->getDefaultEngine().commandStreamReceiver);
    ultCsr->recordFlushedBatchBuffer = true;

    auto directSubmission = new MockDirectSubmissionHw<FamilyType, RenderDispatcher<FamilyType>>(*ultCsr);
    ultCsr->directSubmission.reset(directSubmission);

    // Single place for the kernel append used by every step of the scenario.
    auto appendKernel = [&](auto &cmdList) {
        cmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false);
    };

    auto verifyFlags = [&ultCsr](bool relaxedOrderingExpected, auto &cmdList, uint64_t expectedCounter) {
        EXPECT_EQ(expectedCounter, cmdList->relaxedOrderingCounter);
        EXPECT_EQ(relaxedOrderingExpected, ultCsr->latestFlushedBatchBuffer.hasRelaxedOrderingDependencies);
    };

    auto immCmdList0 = createImmCmdList<gfxCoreFamily>();
    auto queue0 = immCmdList0->getCmdQImmediate(false);

    EXPECT_EQ(0u, queue0->getTaskCount());
    EXPECT_EQ(0u, immCmdList0->relaxedOrderingCounter);

    // First queue. Don't enable yet
    appendKernel(immCmdList0);
    verifyFlags(false, immCmdList0, 1);

    appendKernel(immCmdList0);
    verifyFlags(false, immCmdList0, 2);

    appendKernel(immCmdList0);
    verifyFlags(false, immCmdList0, 3);

    auto immCmdList1 = createImmCmdList<gfxCoreFamily>();
    auto queue1 = immCmdList1->getCmdQImmediate(false);

    EXPECT_EQ(0u, queue1->getTaskCount());
    EXPECT_EQ(0u, immCmdList1->relaxedOrderingCounter);

    // Reset to 0 - new queue
    appendKernel(immCmdList1);
    verifyFlags(false, immCmdList1, 0); // no dependencies

    appendKernel(immCmdList1);
    verifyFlags(true, immCmdList1, 1);

    appendKernel(immCmdList1);
    verifyFlags(true, immCmdList1, 2);

    // Back to queue0. Reset to 0 - new queue
    appendKernel(immCmdList0);
    verifyFlags(true, immCmdList0, 0);

    // EXPECT_GT reports both operand values on failure, unlike EXPECT_TRUE(a > b).
    EXPECT_GT(ultCsr->getDirectSubmissionRelaxedOrderingQueueDepth(), 1u);

    // The depth is re-read on every iteration on purpose, exactly as the appends may observe it.
    for (uint32_t i = 0; i < ultCsr->getDirectSubmissionRelaxedOrderingQueueDepth(); i++) {
        appendKernel(immCmdList0);
        verifyFlags(true, immCmdList0, i + 1);
    }

    // Threshold reached
    appendKernel(immCmdList0);
    verifyFlags(false, immCmdList0, ultCsr->getDirectSubmissionRelaxedOrderingQueueDepth() + 1);

    debugManager.flags.DirectSubmissionRelaxedOrderingCounterHeuristicTreshold.set(1);

    // Back to queue1. Reset to 0 - new queue
    appendKernel(immCmdList1);
    verifyFlags(true, immCmdList1, 0);

    appendKernel(immCmdList1);
    verifyFlags(true, immCmdList1, 1);

    // Threshold reached
    appendKernel(immCmdList1);
    verifyFlags(false, immCmdList1, 2);
}
// Covers the case where the CSR has already performed its device-initializing first submission
// (CSR task count 1) while the command list's queue task count is still 0: the first append must
// still bump relaxedOrderingCounter to 1.
HWTEST2_F(InOrderCmdListTests, givenCounterHeuristicForRelaxedOrderingEnabledWithFirstDeviceInitSubmissionWhenAppendingThenEnableRelaxedOrderingCorrectly, IsAtLeastXeHpcCore) {
    debugManager.flags.DirectSubmissionRelaxedOrdering.set(1);
    debugManager.flags.DirectSubmissionRelaxedOrderingCounterHeuristic.set(1);

    auto underlyingDevice = device->getNEODevice();
    auto engineCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(underlyingDevice->getDefaultEngine().commandStreamReceiver);
    engineCsr->recordFlushedBatchBuffer = true;

    // Restored automatically when the backup object goes out of scope.
    VariableBackup<UltHwConfig> hwConfigBackup(&ultHwConfig);
    ultHwConfig.useFirstSubmissionInitDevice = true;

    if (!underlyingDevice->isInitDeviceWithFirstSubmissionSupported(engineCsr->getType())) {
        GTEST_SKIP();
    }

    engineCsr->directSubmission.reset(new MockDirectSubmissionHw<FamilyType, RenderDispatcher<FamilyType>>(*engineCsr));

    EXPECT_EQ(0u, engineCsr->peekTaskCount());
    engineCsr->initializeDeviceWithFirstSubmission(*underlyingDevice);
    EXPECT_EQ(1u, engineCsr->peekTaskCount());

    auto cmdList = createImmCmdList<gfxCoreFamily>();
    auto immediateQueue = cmdList->getCmdQImmediate(false);

    EXPECT_EQ(0u, immediateQueue->getTaskCount());
    EXPECT_EQ(0u, cmdList->relaxedOrderingCounter);

    cmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false);

    EXPECT_EQ(1u, cmdList->relaxedOrderingCounter);
}
// With events chaining enabled, an append that signals an event goes through a second
// relaxed ordering check; this test verifies relaxedOrderingCounter still grows by exactly one
// per append.
HWTEST2_F(InOrderCmdListTests, givenRelaxedOrderingWithCounterHeuristicWhenSubmisionSplitThenDontIncrementCounterTwice, IsAtLeastXeHpcCore) {
    debugManager.flags.DirectSubmissionRelaxedOrdering.set(1);
    debugManager.flags.SkipInOrderNonWalkerSignalingAllowed.set(1);
    debugManager.flags.DirectSubmissionRelaxedOrderingCounterHeuristic.set(1);
    debugManager.flags.EnableInOrderRelaxedOrderingForEventsChaining.set(1);

    auto engineCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(device->getNEODevice()->getDefaultEngine().commandStreamReceiver);
    engineCsr->directSubmission.reset(new MockDirectSubmissionHw<FamilyType, RenderDispatcher<FamilyType>>(*engineCsr));

    // Only the addresses of these objects are handed to the CSR as client identifiers.
    int firstClient;
    int secondClient;
    engineCsr->registerClient(&firstClient);
    engineCsr->registerClient(&secondClient);

    auto cmdList = createImmCmdList<gfxCoreFamily>();

    auto eventPool = createEvents<FamilyType>(1, true);
    events[0]->signalScope = 0;

    if (!cmdList->skipInOrderNonWalkerSignalingAllowed(events[0].get())) {
        GTEST_SKIP(); // not supported
    }

    EXPECT_EQ(0u, cmdList->relaxedOrderingCounter);

    // Three identical appends; the counter must read 1, 2, 3 after them respectively.
    for (uint64_t expectedCounter = 1u; expectedCounter <= 3u; expectedCounter++) {
        zeCommandListAppendLaunchKernel(cmdList->toHandle(), kernel->toHandle(), &groupCount, events[0]->toHandle(), 0, nullptr);
        EXPECT_EQ(expectedCounter, cmdList->relaxedOrderingCounter);
    }
}
HWTEST2_F(InOrderCmdListTests, givenInOrderEventModeWhenWaitingForEventFromPreviousAppendThenSkip, IsAtLeastXeHpCore) {
auto immCmdList = createImmCmdList<gfxCoreFamily>();

View File

@ -377,6 +377,8 @@ class CommandStreamReceiver {
return false;
}
// Base implementation: reports a relaxed ordering queue depth of 0. Declared virtual so that
// derived command stream receivers can supply a real value.
virtual uint32_t getDirectSubmissionRelaxedOrderingQueueDepth() const {
    return 0;
}
virtual bool isKmdWaitOnTaskCountAllowed() const {
return false;
}

View File

@ -156,6 +156,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
}
bool directSubmissionRelaxedOrderingEnabled() const override;
uint32_t getDirectSubmissionRelaxedOrderingQueueDepth() const override;
void stopDirectSubmission(bool blocking) override;

View File

@ -966,6 +966,18 @@ bool CommandStreamReceiverHw<GfxFamily>::bcsRelaxedOrderingAllowed(const BlitPro
(blitPropertiesContainer.size() == 1) && !hasStallingCmds;
}
// Returns the relaxed ordering queue size of the active direct submission object:
// directSubmission is consulted first, blitterDirectSubmission second. Returns 0 when neither
// object exists.
template <typename GfxFamily>
uint32_t CommandStreamReceiverHw<GfxFamily>::getDirectSubmissionRelaxedOrderingQueueDepth() const {
    if (auto *primarySubmission = directSubmission.get()) {
        return primarySubmission->getRelaxedOrderingQueueSize();
    }

    if (auto *blitterSubmission = blitterDirectSubmission.get()) {
        return blitterSubmission->getRelaxedOrderingQueueSize();
    }

    return 0;
}
template <typename GfxFamily>
TaskCountType CommandStreamReceiverHw<GfxFamily>::flushBcsTask(const BlitPropertiesContainer &blitPropertiesContainer, bool blocking, bool profilingEnabled, Device &device) {
auto lock = obtainUniqueOwnership();

View File

@ -450,6 +450,8 @@ DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionRelaxedOrdering, -1, "-1: defaul
DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionRelaxedOrderingForBcs, -1, "-1: default, 0 - disable, 1 - enable. If set, enable RelaxedOrdering feature for BCS engine")
DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionRelaxedOrderingQueueSizeLimit, -1, "-1: default, >0: Max gpu queue size. If limit is reached, scheduler wont consume new work")
DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionRelaxedOrderingMinNumberOfClients, -1, "-1: default, >0: Enables RelaxedOrdering mode only if specified number of clients is assigned to given CSR.")
DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionRelaxedOrderingCounterHeuristic, -1, "-1: default, 0: disabled, 1: enabled. If set use counter based heuristic to allow for relaxed ordering dispatch")
DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionRelaxedOrderingCounterHeuristicTreshold, -1, "-1: default, >0: limit number of append calls to disable relaxed ordering dispatch")
DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionMonitorFenceInputPolicy, -1, "-1: default, 0: stalling command flag, 1: explicit monitor fence flag. Selects policy to dispatch monitor fence upon input flag, either for every stalling command or explicit motor fence dispatch")
DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionPrintSemaphoreUsage, -1, "-1: default, 0: disabled, 1: enabled. If set, print DirectSubmission semaphore programming and unlocking")
DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionSwitchSemaphoreMode, -1, "-1: default, 1: enable switch on unsuccessful, 0: disable switch on unsuccessful")

View File

@ -103,6 +103,7 @@ class DirectSubmissionHw {
}
virtual void unblockPagingFenceSemaphore(uint64_t pagingFenceValue){};
uint32_t getRelaxedOrderingQueueSize() const { return currentRelaxedOrderingQueueSize; }
protected:
static constexpr size_t prefetchSize = 8 * MemoryConstants::cacheLineSize;

View File

@ -650,4 +650,6 @@ PrintCalculatedTimestamps = 0
DisableIndirectDetectionForKernelNames = unk
ForceIndirectDetectionForCMKernels = -1
LogIndirectDetectionKernelDetails = 0
DirectSubmissionRelaxedOrderingCounterHeuristic = -1
DirectSubmissionRelaxedOrderingCounterHeuristicTreshold = -1
# Please don't edit below this line