mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-29 09:03:14 +08:00
feature: improve in-order Events chaining
- Clear TS event before chaining - Dont chain non-TS events Related-To: NEO-7966 Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
69d80ee5bc
commit
9adfa4b355
@@ -260,6 +260,8 @@ struct CommandListCoreFamily : CommandListImp {
|
||||
Event *signalEvent,
|
||||
const CmdListKernelLaunchParams &launchParams);
|
||||
|
||||
void appendWaitOnSingleEvent(Event *event, bool relaxedOrderingAllowed);
|
||||
|
||||
ze_result_t prepareIndirectParams(const ze_group_count_t *threadGroupDimensions);
|
||||
void updateStreamPropertiesForRegularCommandLists(Kernel &kernel, bool isCooperative, const ze_group_count_t *threadGroupDimensions, bool isIndirect);
|
||||
void updateStreamPropertiesForFlushTaskDispatchFlags(Kernel &kernel, bool isCooperative, const ze_group_count_t *threadGroupDimensions, bool isIndirect);
|
||||
|
||||
@@ -2212,8 +2212,6 @@ void CommandListCoreFamily<gfxCoreFamily>::appendWaitOnInOrderDependency(NEO::Gr
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWaitOnEvents(uint32_t numEvents, ze_event_handle_t *phEvent, bool relaxedOrderingAllowed, bool trackDependencies, bool signalInOrderCompletion) {
|
||||
using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;
|
||||
|
||||
signalInOrderCompletion &= this->inOrderExecutionEnabled;
|
||||
|
||||
NEO::Device *neoDevice = device->getNEODevice();
|
||||
@@ -2227,8 +2225,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWaitOnEvents(uint32_t nu
|
||||
callId = neoDevice->getRootDeviceEnvironment().tagsManager->currentCallCount;
|
||||
}
|
||||
|
||||
uint64_t gpuAddr = 0;
|
||||
constexpr uint32_t eventStateClear = Event::State::STATE_CLEARED;
|
||||
bool dcFlushRequired = false;
|
||||
|
||||
if (this->dcFlushSupport) {
|
||||
@@ -2266,24 +2262,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWaitOnEvents(uint32_t nu
|
||||
}
|
||||
|
||||
commandContainer.addToResidencyContainer(&event->getAllocation(this->device));
|
||||
gpuAddr = event->getCompletionFieldGpuAddress(this->device);
|
||||
uint32_t packetsToWait = event->getPacketsInUse();
|
||||
if (this->signalAllEventPackets) {
|
||||
packetsToWait = event->getMaxPacketsCount();
|
||||
}
|
||||
for (uint32_t i = 0u; i < packetsToWait; i++) {
|
||||
if (relaxedOrderingAllowed) {
|
||||
NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programConditionalDataMemBatchBufferStart(*commandContainer.getCommandStream(), 0, gpuAddr, eventStateClear,
|
||||
NEO::CompareOperation::Equal, true);
|
||||
} else {
|
||||
NEO::EncodeSemaphore<GfxFamily>::addMiSemaphoreWaitCommand(*commandContainer.getCommandStream(),
|
||||
gpuAddr,
|
||||
eventStateClear,
|
||||
COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD);
|
||||
}
|
||||
|
||||
gpuAddr += event->getSinglePacketSize();
|
||||
}
|
||||
appendWaitOnSingleEvent(event, relaxedOrderingAllowed);
|
||||
}
|
||||
|
||||
if (this->cmdListType == TYPE_IMMEDIATE && isCopyOnly() && trackDependencies) {
|
||||
@@ -3248,4 +3228,26 @@ void CommandListCoreFamily<gfxCoreFamily>::dispatchEventRemainingPacketsPostSync
|
||||
}
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
void CommandListCoreFamily<gfxCoreFamily>::appendWaitOnSingleEvent(Event *event, bool relaxedOrderingAllowed) {
|
||||
using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;
|
||||
|
||||
uint64_t gpuAddr = event->getCompletionFieldGpuAddress(this->device);
|
||||
uint32_t packetsToWait = this->signalAllEventPackets ? event->getMaxPacketsCount() : event->getPacketsInUse();
|
||||
|
||||
for (uint32_t i = 0u; i < packetsToWait; i++) {
|
||||
if (relaxedOrderingAllowed) {
|
||||
NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programConditionalDataMemBatchBufferStart(*commandContainer.getCommandStream(), 0, gpuAddr, Event::STATE_CLEARED,
|
||||
NEO::CompareOperation::Equal, true);
|
||||
} else {
|
||||
NEO::EncodeSemaphore<GfxFamily>::addMiSemaphoreWaitCommand(*commandContainer.getCommandStream(),
|
||||
gpuAddr,
|
||||
Event::STATE_CLEARED,
|
||||
COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD);
|
||||
}
|
||||
|
||||
gpuAddr += event->getSinglePacketSize();
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace L0
|
||||
|
||||
@@ -278,9 +278,13 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
|
||||
|
||||
bool inOrderExecSignalRequired = (this->inOrderExecutionEnabled && !launchParams.isKernelSplitOperation);
|
||||
|
||||
if (inOrderExecSignalRequired && !event) {
|
||||
dispatchKernelArgs.eventAddress = this->inOrderDependencyCounterAllocation->getGpuAddress() + this->inOrderAllocationOffset;
|
||||
dispatchKernelArgs.postSyncImmValue = this->inOrderDependencyCounter + 1;
|
||||
if (inOrderExecSignalRequired) {
|
||||
if (isTimestampEvent) {
|
||||
dispatchEventPostSyncOperation(event, Event::STATE_CLEARED, false, false, false);
|
||||
} else {
|
||||
dispatchKernelArgs.eventAddress = this->inOrderDependencyCounterAllocation->getGpuAddress() + this->inOrderAllocationOffset;
|
||||
dispatchKernelArgs.postSyncImmValue = this->inOrderDependencyCounter + 1;
|
||||
}
|
||||
}
|
||||
|
||||
NEO::EncodeDispatchKernel<GfxFamily>::encode(commandContainer, dispatchKernelArgs, getLogicalStateHelper());
|
||||
@@ -301,9 +305,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
|
||||
}
|
||||
}
|
||||
|
||||
if (inOrderExecSignalRequired && event) {
|
||||
auto eventHandle = event->toHandle();
|
||||
CommandListCoreFamily<gfxCoreFamily>::appendWaitOnEvents(1, &eventHandle, false, false, false);
|
||||
if (inOrderExecSignalRequired && isTimestampEvent) {
|
||||
appendWaitOnSingleEvent(event, false);
|
||||
|
||||
appendSignalInOrderDependencyCounter();
|
||||
}
|
||||
|
||||
@@ -1095,25 +1095,8 @@ HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenProgrammingWalkerThenSignalSy
|
||||
auto &postSync = walkerCmd->getPostSync();
|
||||
|
||||
EXPECT_EQ(POSTSYNC_DATA::OPERATION_WRITE_IMMEDIATE_DATA, postSync.getOperation());
|
||||
EXPECT_EQ(static_cast<uint32_t>(Event::STATE_SIGNALED), postSync.getImmediateData());
|
||||
EXPECT_EQ(events[0]->getPacketAddress(device), postSync.getDestinationAddress());
|
||||
|
||||
// chaining
|
||||
auto eventEndGpuVa = events[0]->getCompletionFieldGpuAddress(device);
|
||||
|
||||
auto semaphoreCmd = genCmdCast<MI_SEMAPHORE_WAIT *>(++walkerCmd);
|
||||
ASSERT_NE(nullptr, semaphoreCmd);
|
||||
|
||||
EXPECT_EQ(static_cast<uint32_t>(Event::State::STATE_CLEARED), semaphoreCmd->getSemaphoreDataDword());
|
||||
EXPECT_EQ(eventEndGpuVa, semaphoreCmd->getSemaphoreGraphicsAddress());
|
||||
EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD, semaphoreCmd->getCompareOperation());
|
||||
|
||||
auto sdiCmd = genCmdCast<MI_STORE_DATA_IMM *>(++semaphoreCmd);
|
||||
ASSERT_NE(nullptr, sdiCmd);
|
||||
|
||||
EXPECT_EQ(immCmdList->inOrderDependencyCounterAllocation->getGpuAddress() + counterOffset, sdiCmd->getAddress());
|
||||
EXPECT_EQ(0u, sdiCmd->getStoreQword());
|
||||
EXPECT_EQ(2u, sdiCmd->getDataDword0());
|
||||
EXPECT_EQ(2u, postSync.getImmediateData());
|
||||
EXPECT_EQ(immCmdList->inOrderDependencyCounterAllocation->getGpuAddress() + counterOffset, postSync.getDestinationAddress());
|
||||
}
|
||||
|
||||
auto hostAddress = static_cast<uint32_t *>(ptrOffset(immCmdList->inOrderDependencyCounterAllocation->getUnderlyingBuffer(), counterOffset));
|
||||
@@ -1128,7 +1111,7 @@ HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenProgrammingWalkerThenSignalSy
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, events[0]->hostSynchronize(1));
|
||||
}
|
||||
|
||||
HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenProgrammingTimestampEventThenChainWithSyncAllocSignaling, IsAtLeastXeHpCore) {
|
||||
HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenProgrammingTimestampEventThenClearAndChainWithSyncAllocSignaling, IsAtLeastXeHpCore) {
|
||||
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
|
||||
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
|
||||
using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER;
|
||||
@@ -1145,7 +1128,17 @@ HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenProgrammingTimestampEventThen
|
||||
GenCmdList cmdList;
|
||||
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, cmdStream->getCpuBase(), cmdStream->getUsed()));
|
||||
|
||||
auto walkerItor = find<COMPUTE_WALKER *>(cmdList.begin(), cmdList.end());
|
||||
auto sdiItor = find<MI_STORE_DATA_IMM *>(cmdList.begin(), cmdList.end());
|
||||
ASSERT_NE(cmdList.end(), sdiItor);
|
||||
|
||||
auto sdiCmd = genCmdCast<MI_STORE_DATA_IMM *>(*sdiItor);
|
||||
ASSERT_NE(nullptr, sdiCmd);
|
||||
|
||||
EXPECT_EQ(events[0]->getCompletionFieldGpuAddress(device), sdiCmd->getAddress());
|
||||
EXPECT_EQ(0u, sdiCmd->getStoreQword());
|
||||
EXPECT_EQ(Event::STATE_CLEARED, sdiCmd->getDataDword0());
|
||||
|
||||
auto walkerItor = find<COMPUTE_WALKER *>(sdiItor, cmdList.end());
|
||||
ASSERT_NE(cmdList.end(), walkerItor);
|
||||
|
||||
auto walkerCmd = genCmdCast<COMPUTE_WALKER *>(*walkerItor);
|
||||
@@ -1164,7 +1157,7 @@ HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenProgrammingTimestampEventThen
|
||||
EXPECT_EQ(eventEndGpuVa, semaphoreCmd->getSemaphoreGraphicsAddress());
|
||||
EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD, semaphoreCmd->getCompareOperation());
|
||||
|
||||
auto sdiCmd = genCmdCast<MI_STORE_DATA_IMM *>(++semaphoreCmd);
|
||||
sdiCmd = genCmdCast<MI_STORE_DATA_IMM *>(++semaphoreCmd);
|
||||
ASSERT_NE(nullptr, sdiCmd);
|
||||
|
||||
EXPECT_EQ(immCmdList->inOrderDependencyCounterAllocation->getGpuAddress(), sdiCmd->getAddress());
|
||||
@@ -1371,6 +1364,7 @@ HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenProgrammingAppendWaitOnEvents
|
||||
HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenProgrammingCounterWithOverflowThenHandleItCorrectly, IsAtLeastXeHpCore) {
|
||||
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
|
||||
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
|
||||
using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER;
|
||||
|
||||
auto immCmdList = createImmCmdList<gfxCoreFamily>();
|
||||
immCmdList->inOrderDependencyCounter = std::numeric_limits<uint32_t>::max() - 1;
|
||||
@@ -1381,28 +1375,32 @@ HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenProgrammingCounterWithOverflo
|
||||
|
||||
auto eventHandle = events[0]->toHandle();
|
||||
|
||||
uint64_t baseGpuVa = immCmdList->inOrderDependencyCounterAllocation->getGpuAddress();
|
||||
|
||||
immCmdList->appendLaunchKernel(kernel->toHandle(), &groupCount, eventHandle, 0, nullptr, launchParams, false);
|
||||
|
||||
GenCmdList cmdList;
|
||||
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, cmdStream->getCpuBase(), cmdStream->getUsed()));
|
||||
|
||||
auto sdiItor = find<MI_STORE_DATA_IMM *>(cmdList.begin(), cmdList.end());
|
||||
ASSERT_NE(cmdList.end(), sdiItor);
|
||||
auto walkerItor = find<COMPUTE_WALKER *>(cmdList.begin(), cmdList.end());
|
||||
ASSERT_NE(cmdList.end(), walkerItor);
|
||||
|
||||
auto sdiCmd = genCmdCast<MI_STORE_DATA_IMM *>(*sdiItor);
|
||||
auto walkerCmd = genCmdCast<COMPUTE_WALKER *>(*walkerItor);
|
||||
auto &postSync = walkerCmd->getPostSync();
|
||||
|
||||
uint64_t baseGpuVa = immCmdList->inOrderDependencyCounterAllocation->getGpuAddress();
|
||||
EXPECT_EQ(std::numeric_limits<uint32_t>::max(), postSync.getImmediateData());
|
||||
EXPECT_EQ(immCmdList->inOrderDependencyCounterAllocation->getGpuAddress(), postSync.getDestinationAddress());
|
||||
|
||||
EXPECT_EQ(baseGpuVa, sdiCmd->getAddress());
|
||||
EXPECT_EQ(std::numeric_limits<uint32_t>::max(), sdiCmd->getDataDword0());
|
||||
auto semaphoreItor = find<MI_SEMAPHORE_WAIT *>(walkerItor, cmdList.end());
|
||||
ASSERT_NE(cmdList.end(), semaphoreItor);
|
||||
|
||||
auto semaphoreCmd = genCmdCast<MI_SEMAPHORE_WAIT *>(++sdiCmd);
|
||||
auto semaphoreCmd = genCmdCast<MI_SEMAPHORE_WAIT *>(*semaphoreItor);
|
||||
ASSERT_NE(nullptr, semaphoreCmd);
|
||||
|
||||
EXPECT_EQ(std::numeric_limits<uint32_t>::max(), semaphoreCmd->getSemaphoreDataDword());
|
||||
EXPECT_EQ(baseGpuVa, semaphoreCmd->getSemaphoreGraphicsAddress());
|
||||
|
||||
sdiCmd = genCmdCast<MI_STORE_DATA_IMM *>(++semaphoreCmd);
|
||||
auto sdiCmd = genCmdCast<MI_STORE_DATA_IMM *>(++semaphoreCmd);
|
||||
ASSERT_NE(nullptr, sdiCmd);
|
||||
|
||||
uint32_t offset = static_cast<uint32_t>(sizeof(uint64_t));
|
||||
|
||||
Reference in New Issue
Block a user