feature: improve waiting and signaling Events via KMD calls

Related-To: NEO-8179

Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
Dunajski, Bartosz
2023-11-22 15:11:49 +00:00
committed by Compute-Runtime-Automation
parent 7ffd151ac3
commit aba1cd8f9c
12 changed files with 62 additions and 162 deletions

View File

@@ -172,7 +172,7 @@ struct CommandListCoreFamily : CommandListImp {
ze_result_t appendSignalEvent(ze_event_handle_t hEvent) override;
ze_result_t appendWaitOnEvents(uint32_t numEvents, ze_event_handle_t *phEvent, bool relaxedOrderingAllowed, bool trackDependencies, bool signalInOrderCompletion) override;
void appendWaitOnInOrderDependency(std::shared_ptr<InOrderExecInfo> &inOrderExecInfo, uint64_t waitValue, uint32_t offset, bool relaxedOrderingAllowed, bool implicitDependency);
void appendSignalInOrderDependencyCounter();
void appendSignalInOrderDependencyCounter(Event *signalEvent);
void handleInOrderDependencyCounter(Event *signalEvent, bool nonWalkerInOrderCmdsChaining);
ze_result_t appendWriteGlobalTimestamp(uint64_t *dstptr, ze_event_handle_t hSignalEvent,

View File

@@ -6,7 +6,6 @@
*/
#include "shared/source/built_ins/built_ins.h"
#include "shared/source/command_container/encode_interrupt_helper.h"
#include "shared/source/command_container/encode_surface_state.h"
#include "shared/source/command_stream/command_stream_receiver.h"
#include "shared/source/device/device.h"
@@ -169,7 +168,7 @@ void CommandListCoreFamily<gfxCoreFamily>::handleInOrderDependencyCounter(Event
UNRECOVERABLE_IF(inOrderAllocationOffset + offset >= inOrderExecInfo->inOrderDependencyCounterAllocation.getUnderlyingBufferSize());
CommandListCoreFamily<gfxCoreFamily>::appendSignalInOrderDependencyCounter(); // write 1 on new offset
CommandListCoreFamily<gfxCoreFamily>::appendSignalInOrderDependencyCounter(nullptr); // write 1 on new offset
}
inOrderExecInfo->inOrderDependencyCounter++;
@@ -545,7 +544,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendEventReset(ze_event_hand
}
if (this->isInOrderExecutionEnabled()) {
appendSignalInOrderDependencyCounter();
appendSignalInOrderDependencyCounter(event);
handleInOrderDependencyCounter(event, false);
}
@@ -586,7 +585,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryRangesBarrier(uint
addToMappedEventList(signalEvent);
if (this->isInOrderExecutionEnabled()) {
appendSignalInOrderDependencyCounter();
appendSignalInOrderDependencyCounter(signalEvent);
handleInOrderDependencyCounter(signalEvent, false);
}
@@ -1508,7 +1507,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
NEO::PipeControlArgs args;
NEO::MemorySynchronizationCommands<GfxFamily>::addSingleBarrier(*commandContainer.getCommandStream(), args);
}
appendSignalInOrderDependencyCounter();
appendSignalInOrderDependencyCounter(signalEvent);
}
if (!isCopyOnly() || inOrderCopyOnlySignalingAllowed) {
@@ -1604,7 +1603,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyRegion(void *d
if (this->isInOrderExecutionEnabled()) {
if (inOrderCopyOnlySignalingAllowed) {
appendSignalInOrderDependencyCounter();
appendSignalInOrderDependencyCounter(signalEvent);
}
if (!isCopyOnly() || inOrderCopyOnlySignalingAllowed) {
@@ -2058,7 +2057,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
NEO::PipeControlArgs args;
NEO::MemorySynchronizationCommands<GfxFamily>::addSingleBarrier(*commandContainer.getCommandStream(), args);
}
appendSignalInOrderDependencyCounter();
appendSignalInOrderDependencyCounter(signalEvent);
} else {
nonWalkerInOrderCmdChaining = isInOrderNonWalkerSignalingRequired(signalEvent);
}
@@ -2127,7 +2126,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendBlitFill(void *ptr,
appendSignalEventPostWalker(signalEvent, false);
if (isInOrderExecutionEnabled()) {
appendSignalInOrderDependencyCounter();
appendSignalInOrderDependencyCounter(signalEvent);
handleInOrderDependencyCounter(signalEvent, false);
}
}
@@ -2350,7 +2349,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendSignalEvent(ze_event_han
dispatchEventPostSyncOperation(event, Event::STATE_SIGNALED, false, false, appendPipeControlWithPostSync, false);
if (this->isInOrderExecutionEnabled()) {
appendSignalInOrderDependencyCounter();
appendSignalInOrderDependencyCounter(event);
handleInOrderDependencyCounter(event, false);
}
@@ -2409,10 +2408,6 @@ void CommandListCoreFamily<gfxCoreFamily>::appendWaitOnInOrderDependency(std::sh
gpuAddress += sizeof(uint64_t);
}
if (NEO::EncodeUserInterruptHelper::isOperationAllowed(NEO::EncodeUserInterruptHelper::afterSemaphoreMask)) {
NEO::EnodeUserInterrupt<GfxFamily>::encode(*commandContainer.getCommandStream());
}
}
template <GFXCORE_FAMILY gfxCoreFamily>
@@ -2498,7 +2493,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWaitOnEvents(uint32_t nu
}
if (signalInOrderCompletion) {
appendSignalInOrderDependencyCounter();
appendSignalInOrderDependencyCounter(nullptr);
handleInOrderDependencyCounter(nullptr, false);
}
@@ -2516,7 +2511,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWaitOnEvents(uint32_t nu
}
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::appendSignalInOrderDependencyCounter() {
void CommandListCoreFamily<gfxCoreFamily>::appendSignalInOrderDependencyCounter(Event *signalEvent) {
using MI_STORE_DATA_IMM = typename GfxFamily::MI_STORE_DATA_IMM;
uint64_t signalValue = inOrderExecInfo->inOrderDependencyCounter + 1;
@@ -2530,7 +2525,7 @@ void CommandListCoreFamily<gfxCoreFamily>::appendSignalInOrderDependencyCounter(
addCmdForPatching(nullptr, miStoreCmd, nullptr, signalValue, InOrderPatchCommandHelpers::PatchCmdType::Sdi);
if (NEO::EncodeUserInterruptHelper::isOperationAllowed(NEO::EncodeUserInterruptHelper::onSignalingFenceMask)) {
if ((NEO::DebugManager.flags.ProgramUserInterruptOnResolvedDependency.get() == 1) && signalEvent && signalEvent->isKmdWaitModeEnabled()) {
NEO::EnodeUserInterrupt<GfxFamily>::encode(*commandContainer.getCommandStream());
}
}
@@ -2670,7 +2665,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWriteGlobalTimestamp(
appendSignalEventPostWalker(signalEvent, false);
if (this->isInOrderExecutionEnabled()) {
appendSignalInOrderDependencyCounter();
appendSignalInOrderDependencyCounter(signalEvent);
handleInOrderDependencyCounter(signalEvent, false);
}
@@ -3181,7 +3176,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendBarrier(ze_event_handle_
appendSignalEventPostWalker(signalEvent, this->isInOrderExecutionEnabled());
if (isInOrderExecutionEnabled()) {
appendSignalInOrderDependencyCounter();
appendSignalInOrderDependencyCounter(signalEvent);
handleInOrderDependencyCounter(signalEvent, false);
}
@@ -3344,7 +3339,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWaitOnMemory(void *desc,
appendSignalEventPostWalker(signalEvent, false);
if (this->isInOrderExecutionEnabled()) {
appendSignalInOrderDependencyCounter();
appendSignalInOrderDependencyCounter(signalEvent);
handleInOrderDependencyCounter(signalEvent, false);
}
@@ -3392,7 +3387,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWriteToMemory(void *desc
}
if (this->isInOrderExecutionEnabled()) {
appendSignalInOrderDependencyCounter();
appendSignalInOrderDependencyCounter(nullptr);
handleInOrderDependencyCounter(nullptr, false);
}

View File

@@ -471,7 +471,7 @@ void CommandListCoreFamilyImmediate<gfxCoreFamily>::handleInOrderNonWalkerSignal
}
CommandListCoreFamily<gfxCoreFamily>::appendWaitOnSingleEvent(event, nonWalkerSignalingHasRelaxedOrdering);
CommandListCoreFamily<gfxCoreFamily>::appendSignalInOrderDependencyCounter();
CommandListCoreFamily<gfxCoreFamily>::appendSignalInOrderDependencyCounter(event);
}
template <GFXCORE_FAMILY gfxCoreFamily>

View File

@@ -278,7 +278,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
NEO::PipeControlArgs args;
NEO::MemorySynchronizationCommands<GfxFamily>::addSingleBarrier(*commandContainer.getCommandStream(), args);
appendSignalInOrderDependencyCounter();
appendSignalInOrderDependencyCounter(event);
}
return ZE_RESULT_SUCCESS;

View File

@@ -331,7 +331,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
if (inOrderNonWalkerSignalling) {
if (!launchParams.skipInOrderNonWalkerSignaling) {
appendWaitOnSingleEvent(eventForInOrderExec, false);
appendSignalInOrderDependencyCounter();
appendSignalInOrderDependencyCounter(eventForInOrderExec);
}
} else {
UNRECOVERABLE_IF(!dispatchKernelArgs.outWalkerPtr);

View File

@@ -134,7 +134,7 @@ struct BcsSplit {
cmdList->appendEventForProfilingAllWalkers(this->events.marker[markerEventIndex], false, true);
if (cmdList->isInOrderExecutionEnabled()) {
cmdList->appendSignalInOrderDependencyCounter();
cmdList->appendSignalInOrderDependencyCounter(signalEvent);
cmdList->handleInOrderDependencyCounter(signalEvent, false);
}

View File

@@ -242,6 +242,8 @@ struct Event : _ze_event_handle_t {
const CommandQueue *getLatestUsedCmdQueue() const { return latestUsedCmdQueue; }
bool hasKerneMappedTsCapability = false;
std::shared_ptr<InOrderExecInfo> &getInOrderExecInfo() { return inOrderExecInfo; }
void enableKmdWaitMode() { kmdWaitMode = true; }
bool isKmdWaitModeEnabled() const { return kmdWaitMode; }
protected:
Event(EventPool *eventPool, int index, Device *device) : device(device), eventPool(eventPool), index(index) {}
@@ -299,6 +301,7 @@ struct Event : _ze_event_handle_t {
bool usingContextEndOffset = false;
bool signalAllEventPackets = false;
bool isFromIpcPool = false;
bool kmdWaitMode = false;
uint64_t timestampRefreshIntervalInNanoSec = 0;
};

View File

@@ -87,6 +87,10 @@ Event *Event::create(EventPool *eventPool, const ze_event_desc_t *desc, Device *
event->enableCounterBasedMode(true);
}
if (NEO::DebugManager.flags.WaitForUserFenceOnEventHostSynchronize.get() == 1) {
event->enableKmdWaitMode();
}
return event;
}
@@ -457,7 +461,7 @@ ze_result_t EventImp<TagSizeT>::hostSynchronize(uint64_t timeout) {
waitStartTime = std::chrono::high_resolution_clock::now();
lastHangCheckTime = waitStartTime;
do {
if (NEO::DebugManager.flags.WaitForUserFenceOnEventHostSynchronize.get() == 1 && isCounterBased()) {
if (isKmdWaitModeEnabled() && isCounterBased()) {
ret = waitForUserFence(timeout);
} else {
ret = queryStatus();

View File

@@ -6,7 +6,6 @@
*/
#include "shared/source/command_container/command_encoder.h"
#include "shared/source/command_container/encode_interrupt_helper.h"
#include "shared/source/command_container/encode_surface_state.h"
#include "shared/source/command_container/implicit_scaling.h"
#include "shared/source/helpers/api_specific_config.h"
@@ -1031,6 +1030,8 @@ HWTEST2_F(InOrderCmdListTests, givenDebugFlagSetWhenEventHostSyncCalledThenCallW
auto immCmdList = createImmCmdList<gfxCoreFamily>();
auto eventPool = createEvents<FamilyType>(2, false);
EXPECT_TRUE(events[0]->isKmdWaitModeEnabled());
EXPECT_TRUE(events[1]->isKmdWaitModeEnabled());
EXPECT_EQ(ZE_RESULT_NOT_READY, events[0]->hostSynchronize(2));
@@ -1150,43 +1151,6 @@ HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenSubmittingThenProgramSemaphor
ASSERT_TRUE(verifyInOrderDependency<FamilyType>(itor, 1, immCmdList->inOrderExecInfo->inOrderDependencyCounterAllocation.getGpuAddress() + counterOffset, immCmdList->isQwordInOrderCounter()));
}
// Checks that, with ProgramUserInterruptOnResolvedDependency set to the
// "after semaphore" mask, the commands emitted for a barrier that waits on an
// event contain exactly one MI_USER_INTERRUPT, located directly after the
// first MI_SEMAPHORE_WAIT.
HWTEST2_F(InOrderCmdListTests, givenDebugFlagSetWhenDispatchingSemaphoreThenProgramUserInterrupt, IsAtLeastSkl) {
using MI_USER_INTERRUPT = typename FamilyType::MI_USER_INTERRUPT;
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
// Select only the "after semaphore" bit of the debug flag.
DebugManager.flags.ProgramUserInterruptOnResolvedDependency.set(NEO::EncodeUserInterruptHelper::afterSemaphoreMask);
auto eventPool = createEvents<FamilyType>(1, false);
auto eventHandle = events[0]->toHandle();
events[0]->makeCounterBasedInitiallyDisabled();
auto immCmdList = createImmCmdList<gfxCoreFamily>();
auto cmdStream = immCmdList->getCmdContainer().getCommandStream();
immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false);
// Remember where the kernel launch ended so only the barrier's commands are parsed below.
auto offset = cmdStream->getUsed();
// NOTE(review): first argument appears to be the signal event (none here); the barrier waits on one event - confirm against appendBarrier's declaration.
immCmdList->appendBarrier(nullptr, 1, &eventHandle, false);
GenCmdList cmdList;
// Parse only the bytes written after `offset`.
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
cmdList,
ptrOffset(cmdStream->getCpuBase(), offset),
cmdStream->getUsed() - offset));
auto itor = find<MI_SEMAPHORE_WAIT *>(cmdList.begin(), cmdList.end());
ASSERT_NE(cmdList.end(), itor);
// The command immediately following the semaphore wait must be the user interrupt.
auto userInterruptCmd = genCmdCast<MI_USER_INTERRUPT *>(*(++itor));
ASSERT_NE(nullptr, userInterruptCmd);
// No other user interrupt may be present in the parsed range.
auto allCmds = findAll<MI_USER_INTERRUPT *>(cmdList.begin(), cmdList.end());
EXPECT_EQ(1u, allCmds.size());
}
HWTEST2_F(InOrderCmdListTests, givenTimestmapEventWhenProgrammingBarrierThenDontAddPipeControl, IsAtLeastSkl) {
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
@@ -1218,22 +1182,22 @@ HWTEST2_F(InOrderCmdListTests, givenDebugFlagSetWhenDispatchingStoreDataImmThenP
using MI_USER_INTERRUPT = typename FamilyType::MI_USER_INTERRUPT;
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
DebugManager.flags.ProgramUserInterruptOnResolvedDependency.set(NEO::EncodeUserInterruptHelper::onSignalingFenceMask);
DebugManager.flags.ProgramUserInterruptOnResolvedDependency.set(1);
auto eventPool = createEvents<FamilyType>(1, false);
auto eventPool = createEvents<FamilyType>(2, false);
auto eventHandle = events[0]->toHandle();
events[0]->makeCounterBasedInitiallyDisabled();
EXPECT_FALSE(events[1]->isKmdWaitModeEnabled());
auto immCmdList = createImmCmdList<gfxCoreFamily>();
auto cmdStream = immCmdList->getCmdContainer().getCommandStream();
immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false);
auto offset = cmdStream->getUsed();
immCmdList->appendBarrier(nullptr, 1, &eventHandle, false);
auto validateInterrupt = [&](bool interruptExpected) {
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
cmdList,
@@ -1250,61 +1214,26 @@ HWTEST2_F(InOrderCmdListTests, givenDebugFlagSetWhenDispatchingStoreDataImmThenP
EXPECT_EQ(immCmdList->inOrderExecInfo->inOrderDependencyCounterAllocation.getGpuAddress(), sdiCmd->getAddress());
auto userInterruptCmd = genCmdCast<MI_USER_INTERRUPT *>(*(++itor));
ASSERT_NE(nullptr, userInterruptCmd);
ASSERT_EQ(interruptExpected, nullptr != userInterruptCmd);
auto allCmds = findAll<MI_USER_INTERRUPT *>(cmdList.begin(), cmdList.end());
EXPECT_EQ(1u, allCmds.size());
}
HWTEST2_F(InOrderCmdListTests, givenDebugFlagSetAsMaskWhenDispatchingStoreDataImmAndSemaphoreThenProgramUserInterrupt, IsAtLeastSkl) {
using MI_USER_INTERRUPT = typename FamilyType::MI_USER_INTERRUPT;
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
constexpr int32_t invalidMask = 0b100;
DebugManager.flags.ProgramUserInterruptOnResolvedDependency.set(invalidMask);
auto eventPool = createEvents<FamilyType>(1, false);
auto eventHandle = events[0]->toHandle();
events[0]->makeCounterBasedInitiallyDisabled();
auto immCmdList = createImmCmdList<gfxCoreFamily>();
auto cmdStream = immCmdList->getCmdContainer().getCommandStream();
immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false);
auto offset = cmdStream->getUsed();
EXPECT_EQ(interruptExpected ? 1u : 0u, allCmds.size());
};
// no signal Event
immCmdList->appendBarrier(nullptr, 1, &eventHandle, false);
validateInterrupt(false);
{
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
cmdList,
ptrOffset(cmdStream->getCpuBase(), offset),
cmdStream->getUsed() - offset));
auto allCmds = findAll<MI_USER_INTERRUPT *>(cmdList.begin(), cmdList.end());
EXPECT_EQ(0u, allCmds.size());
}
DebugManager.flags.ProgramUserInterruptOnResolvedDependency.set(NEO::EncodeUserInterruptHelper::onSignalingFenceMask | NEO::EncodeUserInterruptHelper::afterSemaphoreMask);
// regular signal Event
offset = cmdStream->getUsed();
immCmdList->appendBarrier(events[1]->toHandle(), 1, &eventHandle, false);
validateInterrupt(false);
immCmdList->appendBarrier(nullptr, 1, &eventHandle, false);
{
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
cmdList,
ptrOffset(cmdStream->getCpuBase(), offset),
cmdStream->getUsed() - offset));
auto allCmds = findAll<MI_USER_INTERRUPT *>(cmdList.begin(), cmdList.end());
EXPECT_EQ(2u, allCmds.size());
}
// signal Event with kmd wait mode
offset = cmdStream->getUsed();
events[1]->enableKmdWaitMode();
immCmdList->appendBarrier(events[1]->toHandle(), 1, &eventHandle, false);
validateInterrupt(true);
}
HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenWaitingForEventFromPreviousAppendThenSkip, IsAtLeastXeHpCore) {
@@ -4067,7 +3996,7 @@ HWTEST2_F(MultiTileInOrderCmdListTests, givenMultiTileInOrderModeWhenSignalingSy
auto cmdStream = immCmdList->getCmdContainer().getCommandStream();
immCmdList->appendSignalInOrderDependencyCounter();
immCmdList->appendSignalInOrderDependencyCounter(nullptr);
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, cmdStream->getCpuBase(), cmdStream->getUsed()));

View File

@@ -15,7 +15,6 @@ set(NEO_CORE_COMMAND_CONTAINER
${CMAKE_CURRENT_SOURCE_DIR}/encode_alu_helper.h
${CMAKE_CURRENT_SOURCE_DIR}/encode_compute_mode_bdw_and_later.inl
${CMAKE_CURRENT_SOURCE_DIR}/encode_compute_mode_tgllp_and_later.inl
${CMAKE_CURRENT_SOURCE_DIR}/encode_interrupt_helper.h
${CMAKE_CURRENT_SOURCE_DIR}/encode_surface_state.h
${CMAKE_CURRENT_SOURCE_DIR}/implicit_scaling.cpp
${CMAKE_CURRENT_SOURCE_DIR}/implicit_scaling.h

View File

@@ -1,30 +0,0 @@
/*
* Copyright (C) 2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
/*
* Copyright (C) 2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#pragma once
#include "shared/source/debug_settings/debug_settings_manager.h"
namespace NEO {
// Interprets the ProgramUserInterruptOnResolvedDependency debug flag as a
// bitfield and answers whether a given operation bit is enabled.
struct EncodeUserInterruptHelper {
    // Bit 0 of the debug flag.
    static constexpr int32_t afterSemaphoreMask = 0b01;
    // Bit 1 of the debug flag.
    static constexpr int32_t onSignalingFenceMask = 0b10;

    // Returns true when the debug flag is not at its -1 default and shares at
    // least one set bit with `mode`.
    static bool isOperationAllowed(int32_t mode) {
        const int32_t requestedBits = NEO::DebugManager.flags.ProgramUserInterruptOnResolvedDependency.get();
        if (requestedBits == -1) {
            // -1 has every bit set, so it must be rejected before masking.
            return false;
        }
        return (requestedBits & mode) != 0;
    }
};
} // namespace NEO

View File

@@ -258,7 +258,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, TrackNumCsrClientsOnSyncPoints, -1, "-1: default
DECLARE_DEBUG_VARIABLE(int32_t, OverrideDriverVersion, -1, "-1: default, >=0: Use value as reported driver version")
DECLARE_DEBUG_VARIABLE(int32_t, WaitForUserFenceOnEventHostSynchronize, -1, "-1: default, 0: Disabled, 1: Enabled. If enabled, use WaitUserFence KMD call for in-order Events instead of active polling on host.")
DECLARE_DEBUG_VARIABLE(int32_t, DisableSystemPointerKernelArgument, -1, "-1: default, 0: Disabled, 1: using a system pointer for kernel argument returns an error.")
DECLARE_DEBUG_VARIABLE(int32_t, ProgramUserInterruptOnResolvedDependency, -1, "-1: default, 0: Disabled, >=1: bitfield. 01b: program after semaphore, 10b: on signaling fence (non-walker append).")
DECLARE_DEBUG_VARIABLE(int32_t, ProgramUserInterruptOnResolvedDependency, -1, "-1: default, 0: Disabled, 1: On signaling append completion (if possible) - for example in-order counter update")
DECLARE_DEBUG_VARIABLE(int32_t, EnableInOrderRegularCmdListPatching, -1, "-1: default, 0: Disabled, 1: If set, patch counter value on execute call")
DECLARE_DEBUG_VARIABLE(int32_t, EnableInOrderRelaxedOrderingForEventsChaining, -1, "-1: default, 0: Disabled, 1: If set, send 2 immediate flushes to avoid stalling RelaxedOrdering Scheduler.")