feature: improve in-order Events chaining

- Clear timestamp (TS) event before chaining
- Don't chain non-TS events

Related-To: NEO-7966

Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
Dunajski, Bartosz
2023-07-05 10:55:01 +00:00
committed by Compute-Runtime-Automation
parent 69d80ee5bc
commit 9adfa4b355
4 changed files with 62 additions and 57 deletions

View File

@@ -260,6 +260,8 @@ struct CommandListCoreFamily : CommandListImp {
Event *signalEvent,
const CmdListKernelLaunchParams &launchParams);
void appendWaitOnSingleEvent(Event *event, bool relaxedOrderingAllowed);
ze_result_t prepareIndirectParams(const ze_group_count_t *threadGroupDimensions);
void updateStreamPropertiesForRegularCommandLists(Kernel &kernel, bool isCooperative, const ze_group_count_t *threadGroupDimensions, bool isIndirect);
void updateStreamPropertiesForFlushTaskDispatchFlags(Kernel &kernel, bool isCooperative, const ze_group_count_t *threadGroupDimensions, bool isIndirect);

View File

@@ -2212,8 +2212,6 @@ void CommandListCoreFamily<gfxCoreFamily>::appendWaitOnInOrderDependency(NEO::Gr
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWaitOnEvents(uint32_t numEvents, ze_event_handle_t *phEvent, bool relaxedOrderingAllowed, bool trackDependencies, bool signalInOrderCompletion) {
using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;
signalInOrderCompletion &= this->inOrderExecutionEnabled;
NEO::Device *neoDevice = device->getNEODevice();
@@ -2227,8 +2225,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWaitOnEvents(uint32_t nu
callId = neoDevice->getRootDeviceEnvironment().tagsManager->currentCallCount;
}
uint64_t gpuAddr = 0;
constexpr uint32_t eventStateClear = Event::State::STATE_CLEARED;
bool dcFlushRequired = false;
if (this->dcFlushSupport) {
@@ -2266,24 +2262,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWaitOnEvents(uint32_t nu
}
commandContainer.addToResidencyContainer(&event->getAllocation(this->device));
gpuAddr = event->getCompletionFieldGpuAddress(this->device);
uint32_t packetsToWait = event->getPacketsInUse();
if (this->signalAllEventPackets) {
packetsToWait = event->getMaxPacketsCount();
}
for (uint32_t i = 0u; i < packetsToWait; i++) {
if (relaxedOrderingAllowed) {
NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programConditionalDataMemBatchBufferStart(*commandContainer.getCommandStream(), 0, gpuAddr, eventStateClear,
NEO::CompareOperation::Equal, true);
} else {
NEO::EncodeSemaphore<GfxFamily>::addMiSemaphoreWaitCommand(*commandContainer.getCommandStream(),
gpuAddr,
eventStateClear,
COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD);
}
gpuAddr += event->getSinglePacketSize();
}
appendWaitOnSingleEvent(event, relaxedOrderingAllowed);
}
if (this->cmdListType == TYPE_IMMEDIATE && isCopyOnly() && trackDependencies) {
@@ -3248,4 +3228,26 @@ void CommandListCoreFamily<gfxCoreFamily>::dispatchEventRemainingPacketsPostSync
}
}
// Appends one wait command per packet of `event` to this command list's command stream.
//
// event                  - event whose packets are waited on; dereferenced unconditionally.
// relaxedOrderingAllowed - selects the wait encoding: when true, a conditional data-mem
//                          batch-buffer-start comparing for equality with Event::STATE_CLEARED;
//                          when false, an MI_SEMAPHORE_WAIT using SAD_NOT_EQUAL_SDD against
//                          Event::STATE_CLEARED.
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::appendWaitOnSingleEvent(Event *event, bool relaxedOrderingAllowed) {
    using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;

    // Address of the first packet's completion field; advanced by one packet per iteration.
    uint64_t packetGpuVa = event->getCompletionFieldGpuAddress(this->device);

    // Cover every packet the event can hold when all packets get signaled,
    // otherwise only the packets currently in use.
    uint32_t packetCount = 0u;
    if (this->signalAllEventPackets) {
        packetCount = event->getMaxPacketsCount();
    } else {
        packetCount = event->getPacketsInUse();
    }

    for (uint32_t remaining = packetCount; remaining > 0u; --remaining) {
        if (!relaxedOrderingAllowed) {
            NEO::EncodeSemaphore<GfxFamily>::addMiSemaphoreWaitCommand(*commandContainer.getCommandStream(),
                                                                       packetGpuVa,
                                                                       Event::STATE_CLEARED,
                                                                       COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD);
        } else {
            NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programConditionalDataMemBatchBufferStart(*commandContainer.getCommandStream(), 0, packetGpuVa, Event::STATE_CLEARED,
                                                                                                   NEO::CompareOperation::Equal, true);
        }

        packetGpuVa += event->getSinglePacketSize();
    }
}
} // namespace L0

View File

@@ -278,9 +278,13 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
bool inOrderExecSignalRequired = (this->inOrderExecutionEnabled && !launchParams.isKernelSplitOperation);
if (inOrderExecSignalRequired && !event) {
dispatchKernelArgs.eventAddress = this->inOrderDependencyCounterAllocation->getGpuAddress() + this->inOrderAllocationOffset;
dispatchKernelArgs.postSyncImmValue = this->inOrderDependencyCounter + 1;
if (inOrderExecSignalRequired) {
if (isTimestampEvent) {
dispatchEventPostSyncOperation(event, Event::STATE_CLEARED, false, false, false);
} else {
dispatchKernelArgs.eventAddress = this->inOrderDependencyCounterAllocation->getGpuAddress() + this->inOrderAllocationOffset;
dispatchKernelArgs.postSyncImmValue = this->inOrderDependencyCounter + 1;
}
}
NEO::EncodeDispatchKernel<GfxFamily>::encode(commandContainer, dispatchKernelArgs, getLogicalStateHelper());
@@ -301,9 +305,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
}
}
if (inOrderExecSignalRequired && event) {
auto eventHandle = event->toHandle();
CommandListCoreFamily<gfxCoreFamily>::appendWaitOnEvents(1, &eventHandle, false, false, false);
if (inOrderExecSignalRequired && isTimestampEvent) {
appendWaitOnSingleEvent(event, false);
appendSignalInOrderDependencyCounter();
}

View File

@@ -1095,25 +1095,8 @@ HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenProgrammingWalkerThenSignalSy
auto &postSync = walkerCmd->getPostSync();
EXPECT_EQ(POSTSYNC_DATA::OPERATION_WRITE_IMMEDIATE_DATA, postSync.getOperation());
EXPECT_EQ(static_cast<uint32_t>(Event::STATE_SIGNALED), postSync.getImmediateData());
EXPECT_EQ(events[0]->getPacketAddress(device), postSync.getDestinationAddress());
// chaining
auto eventEndGpuVa = events[0]->getCompletionFieldGpuAddress(device);
auto semaphoreCmd = genCmdCast<MI_SEMAPHORE_WAIT *>(++walkerCmd);
ASSERT_NE(nullptr, semaphoreCmd);
EXPECT_EQ(static_cast<uint32_t>(Event::State::STATE_CLEARED), semaphoreCmd->getSemaphoreDataDword());
EXPECT_EQ(eventEndGpuVa, semaphoreCmd->getSemaphoreGraphicsAddress());
EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD, semaphoreCmd->getCompareOperation());
auto sdiCmd = genCmdCast<MI_STORE_DATA_IMM *>(++semaphoreCmd);
ASSERT_NE(nullptr, sdiCmd);
EXPECT_EQ(immCmdList->inOrderDependencyCounterAllocation->getGpuAddress() + counterOffset, sdiCmd->getAddress());
EXPECT_EQ(0u, sdiCmd->getStoreQword());
EXPECT_EQ(2u, sdiCmd->getDataDword0());
EXPECT_EQ(2u, postSync.getImmediateData());
EXPECT_EQ(immCmdList->inOrderDependencyCounterAllocation->getGpuAddress() + counterOffset, postSync.getDestinationAddress());
}
auto hostAddress = static_cast<uint32_t *>(ptrOffset(immCmdList->inOrderDependencyCounterAllocation->getUnderlyingBuffer(), counterOffset));
@@ -1128,7 +1111,7 @@ HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenProgrammingWalkerThenSignalSy
EXPECT_EQ(ZE_RESULT_SUCCESS, events[0]->hostSynchronize(1));
}
HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenProgrammingTimestampEventThenChainWithSyncAllocSignaling, IsAtLeastXeHpCore) {
HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenProgrammingTimestampEventThenClearAndChainWithSyncAllocSignaling, IsAtLeastXeHpCore) {
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER;
@@ -1145,7 +1128,17 @@ HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenProgrammingTimestampEventThen
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, cmdStream->getCpuBase(), cmdStream->getUsed()));
auto walkerItor = find<COMPUTE_WALKER *>(cmdList.begin(), cmdList.end());
auto sdiItor = find<MI_STORE_DATA_IMM *>(cmdList.begin(), cmdList.end());
ASSERT_NE(cmdList.end(), sdiItor);
auto sdiCmd = genCmdCast<MI_STORE_DATA_IMM *>(*sdiItor);
ASSERT_NE(nullptr, sdiCmd);
EXPECT_EQ(events[0]->getCompletionFieldGpuAddress(device), sdiCmd->getAddress());
EXPECT_EQ(0u, sdiCmd->getStoreQword());
EXPECT_EQ(Event::STATE_CLEARED, sdiCmd->getDataDword0());
auto walkerItor = find<COMPUTE_WALKER *>(sdiItor, cmdList.end());
ASSERT_NE(cmdList.end(), walkerItor);
auto walkerCmd = genCmdCast<COMPUTE_WALKER *>(*walkerItor);
@@ -1164,7 +1157,7 @@ HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenProgrammingTimestampEventThen
EXPECT_EQ(eventEndGpuVa, semaphoreCmd->getSemaphoreGraphicsAddress());
EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD, semaphoreCmd->getCompareOperation());
auto sdiCmd = genCmdCast<MI_STORE_DATA_IMM *>(++semaphoreCmd);
sdiCmd = genCmdCast<MI_STORE_DATA_IMM *>(++semaphoreCmd);
ASSERT_NE(nullptr, sdiCmd);
EXPECT_EQ(immCmdList->inOrderDependencyCounterAllocation->getGpuAddress(), sdiCmd->getAddress());
@@ -1371,6 +1364,7 @@ HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenProgrammingAppendWaitOnEvents
HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenProgrammingCounterWithOverflowThenHandleItCorrectly, IsAtLeastXeHpCore) {
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER;
auto immCmdList = createImmCmdList<gfxCoreFamily>();
immCmdList->inOrderDependencyCounter = std::numeric_limits<uint32_t>::max() - 1;
@@ -1381,28 +1375,32 @@ HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenProgrammingCounterWithOverflo
auto eventHandle = events[0]->toHandle();
uint64_t baseGpuVa = immCmdList->inOrderDependencyCounterAllocation->getGpuAddress();
immCmdList->appendLaunchKernel(kernel->toHandle(), &groupCount, eventHandle, 0, nullptr, launchParams, false);
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, cmdStream->getCpuBase(), cmdStream->getUsed()));
auto sdiItor = find<MI_STORE_DATA_IMM *>(cmdList.begin(), cmdList.end());
ASSERT_NE(cmdList.end(), sdiItor);
auto walkerItor = find<COMPUTE_WALKER *>(cmdList.begin(), cmdList.end());
ASSERT_NE(cmdList.end(), walkerItor);
auto sdiCmd = genCmdCast<MI_STORE_DATA_IMM *>(*sdiItor);
auto walkerCmd = genCmdCast<COMPUTE_WALKER *>(*walkerItor);
auto &postSync = walkerCmd->getPostSync();
uint64_t baseGpuVa = immCmdList->inOrderDependencyCounterAllocation->getGpuAddress();
EXPECT_EQ(std::numeric_limits<uint32_t>::max(), postSync.getImmediateData());
EXPECT_EQ(immCmdList->inOrderDependencyCounterAllocation->getGpuAddress(), postSync.getDestinationAddress());
EXPECT_EQ(baseGpuVa, sdiCmd->getAddress());
EXPECT_EQ(std::numeric_limits<uint32_t>::max(), sdiCmd->getDataDword0());
auto semaphoreItor = find<MI_SEMAPHORE_WAIT *>(walkerItor, cmdList.end());
ASSERT_NE(cmdList.end(), semaphoreItor);
auto semaphoreCmd = genCmdCast<MI_SEMAPHORE_WAIT *>(++sdiCmd);
auto semaphoreCmd = genCmdCast<MI_SEMAPHORE_WAIT *>(*semaphoreItor);
ASSERT_NE(nullptr, semaphoreCmd);
EXPECT_EQ(std::numeric_limits<uint32_t>::max(), semaphoreCmd->getSemaphoreDataDword());
EXPECT_EQ(baseGpuVa, semaphoreCmd->getSemaphoreGraphicsAddress());
sdiCmd = genCmdCast<MI_STORE_DATA_IMM *>(++semaphoreCmd);
auto sdiCmd = genCmdCast<MI_STORE_DATA_IMM *>(++semaphoreCmd);
ASSERT_NE(nullptr, sdiCmd);
uint32_t offset = static_cast<uint32_t>(sizeof(uint64_t));