From ae8494d379127dededf19c457b544eca02587606 Mon Sep 17 00:00:00 2001 From: "Dunajski, Bartosz" Date: Tue, 10 Oct 2023 15:29:45 +0000 Subject: [PATCH] feature: dont program redundant in-order semaphore after cmds chaining Related-To: NEO-7966 Signed-off-by: Dunajski, Bartosz --- level_zero/core/source/cmdlist/cmdlist_hw.h | 4 +- level_zero/core/source/cmdlist/cmdlist_hw.inl | 48 ++- .../source/cmdlist/cmdlist_hw_immediate.inl | 2 +- .../core/source/cmdlist/cmdlist_imp.cpp | 2 +- level_zero/core/source/device/bcs_split.h | 2 +- .../core/test/unit_tests/mocks/mock_cmdlist.h | 3 + .../test_cmdlist_append_launch_kernel_3.cpp | 347 ++++++++++++++++-- 7 files changed, 353 insertions(+), 55 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h index 82ce17c55f..5a55ae3d83 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw.h @@ -173,7 +173,7 @@ struct CommandListCoreFamily : CommandListImp { ze_result_t appendWaitOnEvents(uint32_t numEvents, ze_event_handle_t *phEvent, bool relaxedOrderingAllowed, bool trackDependencies, bool signalInOrderCompletion) override; void appendWaitOnInOrderDependency(std::shared_ptr &inOrderExecInfo, uint64_t waitValue, uint32_t offset, bool relaxedOrderingAllowed, bool implicitDependency); void appendSignalInOrderDependencyCounter(); - void handleInOrderDependencyCounter(Event *signalEvent); + void handleInOrderDependencyCounter(Event *signalEvent, bool nonWalkerInOrderCmdsChaining); ze_result_t appendWriteGlobalTimestamp(uint64_t *dstptr, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override; @@ -340,6 +340,8 @@ struct CommandListCoreFamily : CommandListImp { void addCmdForPatching(std::shared_ptr *externalInOrderExecInfo, void *cmd, uint64_t counterValue, InOrderPatchCommandHelpers::PatchCmdType patchCmdType); InOrderPatchCommandsContainer inOrderPatchCmds; + + bool 
latestOperationRequiredNonWalkerInOrderCmdsChaining = false; }; template diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index 9f48b7bed4..29e5fe6be1 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -152,7 +152,7 @@ ze_result_t CommandListCoreFamily::reset() { } template -void CommandListCoreFamily::handleInOrderDependencyCounter(Event *signalEvent) { +void CommandListCoreFamily::handleInOrderDependencyCounter(Event *signalEvent, bool nonWalkerInOrderCmdsChaining) { if (!isQwordInOrderCounter() && ((inOrderExecInfo->inOrderDependencyCounter + 1) == std::numeric_limits::max())) { CommandListCoreFamily::appendWaitOnInOrderDependency(inOrderExecInfo, inOrderExecInfo->inOrderDependencyCounter + 1, inOrderAllocationOffset, false, true); @@ -175,6 +175,8 @@ void CommandListCoreFamily::handleInOrderDependencyCounter(Event if (signalEvent && signalEvent->isInOrderExecEvent()) { signalEvent->updateInOrderExecState(inOrderExecInfo, inOrderExecInfo->inOrderDependencyCounter, this->inOrderAllocationOffset); } + + this->latestOperationRequiredNonWalkerInOrderCmdsChaining = nonWalkerInOrderCmdsChaining; } template @@ -362,7 +364,7 @@ ze_result_t CommandListCoreFamily::appendLaunchKernel(ze_kernel_h event, launchParams); if (isInOrderExecutionEnabled() && !launchParams.skipInOrderNonWalkerSignaling) { - handleInOrderDependencyCounter(event); + handleInOrderDependencyCounter(event, isInOrderNonWalkerSignalingRequired(event)); } addToMappedEventList(event); @@ -403,7 +405,7 @@ ze_result_t CommandListCoreFamily::appendLaunchCooperativeKernel( addToMappedEventList(event); if (this->isInOrderExecutionEnabled()) { - handleInOrderDependencyCounter(event); + handleInOrderDependencyCounter(event, isInOrderNonWalkerSignalingRequired(event)); } return ret; } @@ -438,7 +440,7 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelIndirect(ze_ 
appendSignalEventPostWalker(event, false); if (isInOrderExecutionEnabled()) { - handleInOrderDependencyCounter(event); + handleInOrderDependencyCounter(event, isInOrderNonWalkerSignalingRequired(event)); } return ret; @@ -530,7 +532,7 @@ ze_result_t CommandListCoreFamily::appendEventReset(ze_event_hand if (this->isInOrderExecutionEnabled()) { appendSignalInOrderDependencyCounter(); - handleInOrderDependencyCounter(event); + handleInOrderDependencyCounter(event, false); } if (NEO::DebugManager.flags.EnableSWTags.get()) { @@ -569,7 +571,7 @@ ze_result_t CommandListCoreFamily::appendMemoryRangesBarrier(uint if (this->isInOrderExecutionEnabled()) { appendSignalInOrderDependencyCounter(); - handleInOrderDependencyCounter(signalEvent); + handleInOrderDependencyCounter(signalEvent, false); } return ZE_RESULT_SUCCESS; @@ -1479,7 +1481,9 @@ ze_result_t CommandListCoreFamily::appendMemoryCopy(void *dstptr, addToMappedEventList(signalEvent); if (this->isInOrderExecutionEnabled()) { - if (launchParams.isKernelSplitOperation || inOrderCopyOnlySignalingAllowed) { + bool emitPipeControl = !isCopyOnly() && eventSignalPipeControl(launchParams.isKernelSplitOperation, signalEvent ? 
getDcFlushRequired(signalEvent->isSignalScope()) : false); + + if (launchParams.isKernelSplitOperation || inOrderCopyOnlySignalingAllowed || emitPipeControl) { if (!signalEvent && !isCopyOnly()) { NEO::PipeControlArgs args; NEO::MemorySynchronizationCommands::addSingleBarrier(*commandContainer.getCommandStream(), args); @@ -1488,7 +1492,8 @@ ze_result_t CommandListCoreFamily::appendMemoryCopy(void *dstptr, } if (!isCopyOnly() || inOrderCopyOnlySignalingAllowed) { - handleInOrderDependencyCounter(signalEvent); + bool nonWalkerInOrderCmdChaining = !isCopyOnly() && isInOrderNonWalkerSignalingRequired(signalEvent) && !emitPipeControl; + handleInOrderDependencyCounter(signalEvent, nonWalkerInOrderCmdChaining); } } @@ -1583,7 +1588,8 @@ ze_result_t CommandListCoreFamily::appendMemoryCopyRegion(void *d } if (!isCopyOnly() || inOrderCopyOnlySignalingAllowed) { - handleInOrderDependencyCounter(signalEvent); + bool nonWalkerInOrderCmdChaining = !isCopyOnly() && isInOrderNonWalkerSignalingRequired(signalEvent); + handleInOrderDependencyCounter(signalEvent, nonWalkerInOrderCmdChaining); } } @@ -2022,15 +2028,18 @@ ze_result_t CommandListCoreFamily::appendMemoryFill(void *ptr, addFlushRequiredCommand(hostPointerNeedsFlush, signalEvent); if (this->isInOrderExecutionEnabled()) { + bool nonWalkerInOrderCmdChaining = false; if (launchParams.isKernelSplitOperation) { if (!signalEvent) { NEO::PipeControlArgs args; NEO::MemorySynchronizationCommands::addSingleBarrier(*commandContainer.getCommandStream(), args); } appendSignalInOrderDependencyCounter(); + } else { + nonWalkerInOrderCmdChaining = isInOrderNonWalkerSignalingRequired(signalEvent); } - handleInOrderDependencyCounter(signalEvent); + handleInOrderDependencyCounter(signalEvent, nonWalkerInOrderCmdChaining); } if (NEO::DebugManager.flags.EnableSWTags.get()) { @@ -2093,7 +2102,7 @@ ze_result_t CommandListCoreFamily::appendBlitFill(void *ptr, if (isInOrderExecutionEnabled()) { appendSignalInOrderDependencyCounter(); - 
handleInOrderDependencyCounter(signalEvent); + handleInOrderDependencyCounter(signalEvent, false); } } return ZE_RESULT_SUCCESS; @@ -2265,6 +2274,11 @@ void CommandListCoreFamily::handleInOrderImplicitDependencies(boo template inline ze_result_t CommandListCoreFamily::addEventsToCmdList(uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool relaxedOrderingAllowed, bool trackDependencies, bool waitForImplicitInOrderDependency) { bool inOrderDependencies = false; + + if (this->latestOperationRequiredNonWalkerInOrderCmdsChaining && !relaxedOrderingAllowed) { + waitForImplicitInOrderDependency = false; + } + if (waitForImplicitInOrderDependency) { handleInOrderImplicitDependencies(relaxedOrderingAllowed); inOrderDependencies = hasInOrderDependencies(); @@ -2312,7 +2326,7 @@ ze_result_t CommandListCoreFamily::appendSignalEvent(ze_event_han if (this->isInOrderExecutionEnabled()) { appendSignalInOrderDependencyCounter(); - handleInOrderDependencyCounter(event); + handleInOrderDependencyCounter(event, false); } if (NEO::DebugManager.flags.EnableSWTags.get()) { @@ -2447,7 +2461,7 @@ ze_result_t CommandListCoreFamily::appendWaitOnEvents(uint32_t nu if (signalInOrderCompletion) { appendSignalInOrderDependencyCounter(); - handleInOrderDependencyCounter(nullptr); + handleInOrderDependencyCounter(nullptr, false); } makeResidentDummyAllocation(); @@ -2617,7 +2631,7 @@ ze_result_t CommandListCoreFamily::appendWriteGlobalTimestamp( if (this->isInOrderExecutionEnabled()) { appendSignalInOrderDependencyCounter(); - handleInOrderDependencyCounter(signalEvent); + handleInOrderDependencyCounter(signalEvent, false); } addToMappedEventList(signalEvent); @@ -3125,7 +3139,7 @@ ze_result_t CommandListCoreFamily::appendBarrier(ze_event_handle_ appendSignalEventPostWalker(signalEvent, this->isInOrderExecutionEnabled()); if (isInOrderExecutionEnabled()) { - handleInOrderDependencyCounter(signalEvent); + handleInOrderDependencyCounter(signalEvent, false); } return ZE_RESULT_SUCCESS; @@ 
-3272,7 +3286,7 @@ ze_result_t CommandListCoreFamily::appendWaitOnMemory(void *desc, if (this->isInOrderExecutionEnabled()) { appendSignalInOrderDependencyCounter(); - handleInOrderDependencyCounter(signalEvent); + handleInOrderDependencyCounter(signalEvent, false); } return ZE_RESULT_SUCCESS; @@ -3320,7 +3334,7 @@ ze_result_t CommandListCoreFamily::appendWriteToMemory(void *desc if (this->isInOrderExecutionEnabled()) { appendSignalInOrderDependencyCounter(); - handleInOrderDependencyCounter(nullptr); + handleInOrderDependencyCounter(nullptr, false); } return ZE_RESULT_SUCCESS; diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl index b5f2bf20ff..c8f3f78744 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl @@ -446,7 +446,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendLaunchKernel( auto event = Event::fromHandle(hSignalEvent); handleInOrderNonWalkerSignaling(event, stallingCmdsForRelaxedOrdering, relaxedOrderingDispatch, ret); - CommandListCoreFamily::handleInOrderDependencyCounter(event); + CommandListCoreFamily::handleInOrderDependencyCounter(event, true); } return flushImmediate(ret, true, stallingCmdsForRelaxedOrdering, relaxedOrderingDispatch, true, hSignalEvent); diff --git a/level_zero/core/source/cmdlist/cmdlist_imp.cpp b/level_zero/core/source/cmdlist/cmdlist_imp.cpp index 9917c5d995..0ac7f9d0ac 100644 --- a/level_zero/core/source/cmdlist/cmdlist_imp.cpp +++ b/level_zero/core/source/cmdlist/cmdlist_imp.cpp @@ -262,7 +262,7 @@ void CommandListImp::addToMappedEventList(Event *event) { } void CommandListImp::incRegularCmdListSubmissionCounter() { - if (isInOrderExecutionEnabled() && inOrderExecInfo->isRegularCmdList) { + if (isInOrderExecutionEnabled()) { inOrderExecInfo->regularCmdListSubmissionCounter++; } } diff --git a/level_zero/core/source/device/bcs_split.h 
b/level_zero/core/source/device/bcs_split.h index 00295f809e..847cb54b10 100644 --- a/level_zero/core/source/device/bcs_split.h +++ b/level_zero/core/source/device/bcs_split.h @@ -133,7 +133,7 @@ struct BcsSplit { if (cmdList->isInOrderExecutionEnabled()) { cmdList->appendSignalInOrderDependencyCounter(); - cmdList->handleInOrderDependencyCounter(signalEvent); + cmdList->handleInOrderDependencyCounter(signalEvent, false); } return result; diff --git a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h index d7d28e3ac4..1f3bfc3cec 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h +++ b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h @@ -66,6 +66,7 @@ struct WhiteBox<::L0::CommandListCoreFamily> using BaseClass::doubleSbaWa; using BaseClass::engineGroupType; using BaseClass::estimateBufferSizeMultiTileBarrier; + using BaseClass::eventSignalPipeControl; using BaseClass::finalStreamState; using BaseClass::flags; using BaseClass::frontEndStateTracking; @@ -81,6 +82,7 @@ struct WhiteBox<::L0::CommandListCoreFamily> using BaseClass::inOrderExecInfo; using BaseClass::inOrderPatchCmds; using BaseClass::isFlushTaskSubmissionEnabled; + using BaseClass::isInOrderNonWalkerSignalingRequired; using BaseClass::isQwordInOrderCounter; using BaseClass::isRelaxedOrderingDispatchAllowed; using BaseClass::isSyncModeQueue; @@ -169,6 +171,7 @@ struct WhiteBox> using BaseClass::device; using BaseClass::doubleSbaWa; using BaseClass::engineGroupType; + using BaseClass::eventSignalPipeControl; using BaseClass::finalStreamState; using BaseClass::frontEndStateTracking; using BaseClass::getDcFlushRequired; diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp index fd673a2bef..5af6ba4eaf 100644 --- 
a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp @@ -1212,11 +1212,15 @@ HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenWaitingForEventFromPreviousAp auto itor = find(cmdList.begin(), cmdList.end()); - ASSERT_NE(cmdList.end(), itor); // implicit dependency + if (immCmdList->isInOrderNonWalkerSignalingRequired(events[0].get())) { + EXPECT_EQ(cmdList.end(), itor); // already waited on previous call + } else { + ASSERT_NE(cmdList.end(), itor); // implicit dependency - itor = find(++itor, cmdList.end()); + itor = find(++itor, cmdList.end()); - EXPECT_EQ(cmdList.end(), itor); + EXPECT_EQ(cmdList.end(), itor); + } } HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenWaitingForEventFromPreviousAppendOnRegularCmdListThenSkip, IsAtLeastSkl) { @@ -1240,17 +1244,21 @@ HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenWaitingForEventFromPreviousAp auto itor = find(cmdList.begin(), cmdList.end()); - ASSERT_NE(cmdList.end(), itor); // implicit dependency + if (regularCmdList->isInOrderNonWalkerSignalingRequired(events[0].get())) { + EXPECT_EQ(cmdList.end(), itor); // already waited on previous call + } else { + ASSERT_NE(cmdList.end(), itor); // implicit dependency - itor = find(++itor, cmdList.end()); + itor = find(++itor, cmdList.end()); - EXPECT_EQ(cmdList.end(), itor); + EXPECT_EQ(cmdList.end(), itor); + } } HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenWaitingForRegularEventFromPreviousAppendThenSkip, IsAtLeastSkl) { using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; - auto immCmdList = createImmCmdList(); + auto immCmdList = createCopyOnlyImmCmdList(); auto eventPool = createEvents(1, false); events[0]->inOrderExecEvent = false; @@ -1258,11 +1266,17 @@ HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenWaitingForRegularEventFromPre auto cmdStream = immCmdList->getCmdContainer().getCommandStream(); - 
immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, eventHandle, 0, nullptr, launchParams, false); + uint32_t copyData = 0; + void *deviceAlloc = nullptr; + ze_device_mem_alloc_desc_t deviceDesc = {}; + auto result = context->allocDeviceMem(device->toHandle(), &deviceDesc, 128, 128, &deviceAlloc); + ASSERT_EQ(result, ZE_RESULT_SUCCESS); + + immCmdList->appendMemoryCopy(deviceAlloc, &copyData, 1, eventHandle, 0, nullptr, false, false); auto offset = cmdStream->getUsed(); - immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 1, &eventHandle, launchParams, false); + immCmdList->appendMemoryCopy(deviceAlloc, &copyData, 1, nullptr, 1, &eventHandle, false, false); GenCmdList cmdList; ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, ptrOffset(cmdStream->getCpuBase(), offset), cmdStream->getUsed() - offset)); @@ -1274,6 +1288,8 @@ HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenWaitingForRegularEventFromPre itor = find(++itor, cmdList.end()); EXPECT_EQ(cmdList.end(), itor); + + context->freeMem(deviceAlloc); } HWTEST2_F(InOrderCmdListTests, givenInOrderEventModeWhenSubmittingThenProgramSemaphoreOnlyForExternalEvent, IsAtLeastXeHpCore) { @@ -1331,6 +1347,219 @@ HWTEST2_F(InOrderCmdListTests, givenInOrderEventModeWhenSubmittingThenProgramSem EXPECT_EQ(nullptr, semaphoreCmd); } +HWTEST2_F(InOrderCmdListTests, givenCmdsChainingWhenDispatchingKernelThenProgramSemaphoreOnce, IsAtLeastXeHpCore) { + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + + auto immCmdList = createImmCmdList(); + + auto eventPool = createEvents(1, false); + events[0]->inOrderExecEvent = false; + + auto cmdStream = immCmdList->getCmdContainer().getCommandStream(); + + auto eventHandle = events[0]->toHandle(); + + auto offset = cmdStream->getUsed(); + ze_copy_region_t region = {0, 0, 0, 1, 1, 1}; + uint32_t copyData = 0; + + void *alloc = nullptr; + ze_device_mem_alloc_desc_t deviceDesc = {}; + auto result = context->allocDeviceMem(device->toHandle(), 
&deviceDesc, 16384u, 4096u, &alloc); + ASSERT_EQ(result, ZE_RESULT_SUCCESS); + + auto findSemaphores = [&](size_t expectedNumSemaphores) { + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, ptrOffset(cmdStream->getCpuBase(), offset), cmdStream->getUsed() - offset)); + + auto cmds = findAll(cmdList.begin(), cmdList.end()); + + EXPECT_EQ(expectedNumSemaphores, cmds.size()); + }; + + immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, eventHandle, 0, nullptr, launchParams, false); + findSemaphores(1); // chaining + + offset = cmdStream->getUsed(); + immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false); + findSemaphores(0); // no implicit dependency semaphore + + offset = cmdStream->getUsed(); + immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, eventHandle, 0, nullptr, launchParams, false); + findSemaphores(2); // implicit dependency + chaining + + offset = cmdStream->getUsed(); + immCmdList->appendMemoryCopy(&copyData, &copyData, 1, nullptr, 0, nullptr, false, false); + findSemaphores(0); // no implicit dependency + + offset = cmdStream->getUsed(); + immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, eventHandle, 0, nullptr, launchParams, false); + findSemaphores(2); // implicit dependency + chaining + + offset = cmdStream->getUsed(); + immCmdList->appendMemoryCopyRegion(&copyData, &region, 1, 1, &copyData, &region, 1, 1, nullptr, 0, nullptr, false, false); + findSemaphores(0); // no implicit dependency + + offset = cmdStream->getUsed(); + immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, eventHandle, 0, nullptr, launchParams, false); + findSemaphores(2); // implicit dependency + chaining + + offset = cmdStream->getUsed(); + immCmdList->appendMemoryFill(alloc, &copyData, 1, 16, nullptr, 0, nullptr, false); + findSemaphores(0); // no implicit dependency + + offset = cmdStream->getUsed(); + immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, eventHandle, 
0, nullptr, launchParams, false); + findSemaphores(2); // implicit dependency + chaining + + offset = cmdStream->getUsed(); + immCmdList->appendLaunchKernelIndirect(kernel->toHandle(), *static_cast(alloc), nullptr, 0, nullptr, false); + findSemaphores(0); // no implicit dependency + + offset = cmdStream->getUsed(); + immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, eventHandle, 0, nullptr, launchParams, false); + findSemaphores(2); // implicit dependency + chaining + + offset = cmdStream->getUsed(); + immCmdList->appendLaunchCooperativeKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, false); + findSemaphores(0); // no implicit dependency + + context->freeMem(alloc); +} + +HWTEST2_F(InOrderCmdListTests, givenCmdsChainingFromAppendCopyWhenDispatchingKernelThenProgramSemaphoreOnce, IsAtLeastXeHpCore) { + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + + auto immCmdList = createImmCmdList(); + + auto eventPool = createEvents(1, false); + events[0]->inOrderExecEvent = false; + + auto cmdStream = immCmdList->getCmdContainer().getCommandStream(); + + auto eventHandle = events[0]->toHandle(); + + auto offset = cmdStream->getUsed(); + ze_copy_region_t region = {0, 0, 0, 1, 1, 1}; + uint32_t copyData = 0; + + auto findSemaphores = [&](size_t expectedNumSemaphores) { + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, ptrOffset(cmdStream->getCpuBase(), offset), cmdStream->getUsed() - offset)); + + auto cmds = findAll(cmdList.begin(), cmdList.end()); + + EXPECT_EQ(expectedNumSemaphores, cmds.size()); + }; + + immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false); + + uint32_t numSemaphores = immCmdList->eventSignalPipeControl(false, immCmdList->getDcFlushRequired(events[0]->isSignalScope())) ? 
1 : 2; + + offset = cmdStream->getUsed(); + immCmdList->appendMemoryCopy(&copyData, &copyData, 1, eventHandle, 0, nullptr, false, false); + findSemaphores(numSemaphores); // implicit dependency + optional chaining + + numSemaphores = immCmdList->eventSignalPipeControl(false, immCmdList->getDcFlushRequired(events[0]->isSignalScope())) ? 1 : 0; + + offset = cmdStream->getUsed(); + immCmdList->appendMemoryCopy(&copyData, &copyData, 1, nullptr, 0, nullptr, false, false); + findSemaphores(numSemaphores); // implicit dependency for Compact event or no semaphores for non-compact + + offset = cmdStream->getUsed(); + immCmdList->appendMemoryCopyRegion(&copyData, &region, 1, 1, &copyData, &region, 1, 1, eventHandle, 0, nullptr, false, false); + findSemaphores(2); // implicit dependency + chaining + + offset = cmdStream->getUsed(); + immCmdList->appendMemoryCopyRegion(&copyData, &region, 1, 1, &copyData, &region, 1, 1, nullptr, 0, nullptr, false, false); + findSemaphores(0); // no implicit dependency +} + +HWTEST2_F(InOrderCmdListTests, givenEventWithRequiredPipeControlWhenDispatchingCopyThenSignalInOrderAllocation, IsAtLeastXeHpCore) { + using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM; + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA; + + auto immCmdList = createImmCmdList(); + + auto eventPool = createEvents(1, false); + + auto cmdStream = immCmdList->getCmdContainer().getCommandStream(); + + auto eventHandle = events[0]->toHandle(); + + uint32_t copyData = 0; + + auto offset = cmdStream->getUsed(); + immCmdList->appendMemoryCopy(&copyData, &copyData, 1, eventHandle, 0, nullptr, false, false); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, ptrOffset(cmdStream->getCpuBase(), offset), cmdStream->getUsed() - offset)); + + auto sdiItor = find(cmdList.begin(), cmdList.end()); + + if (immCmdList->eventSignalPipeControl(false, immCmdList->getDcFlushRequired(events[0]->isSignalScope()))) { + 
EXPECT_NE(cmdList.end(), sdiItor); + } else { + EXPECT_EQ(cmdList.end(), sdiItor); + + auto walkerItor = find(cmdList.begin(), cmdList.end()); + ASSERT_NE(cmdList.end(), walkerItor); + + auto walkerCmd = genCmdCast(*walkerItor); + auto &postSync = walkerCmd->getPostSync(); + + EXPECT_EQ(POSTSYNC_DATA::OPERATION_WRITE_IMMEDIATE_DATA, postSync.getOperation()); + EXPECT_EQ(1u, postSync.getImmediateData()); + EXPECT_EQ(immCmdList->inOrderExecInfo->inOrderDependencyCounterAllocation.getGpuAddress(), postSync.getDestinationAddress()); + } +} + +HWTEST2_F(InOrderCmdListTests, givenCmdsChainingWhenDispatchingKernelWithRelaxedOrderingThenProgramAllDependencies, IsAtLeastXeHpCore) { + using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; + + DebugManager.flags.DirectSubmissionRelaxedOrdering.set(1); + + auto ultCsr = static_cast *>(device->getNEODevice()->getDefaultEngine().commandStreamReceiver); + + auto directSubmission = new MockDirectSubmissionHw>(*ultCsr); + ultCsr->directSubmission.reset(directSubmission); + int client1, client2; + ultCsr->registerClient(&client1); + ultCsr->registerClient(&client2); + + auto immCmdList = createImmCmdList(); + + auto eventPool = createEvents(1, false); + events[0]->inOrderExecEvent = false; + + auto cmdStream = immCmdList->getCmdContainer().getCommandStream(); + + auto eventHandle = events[0]->toHandle(); + size_t offset = 0; + + auto findConditionalBbStarts = [&](size_t expectedNumBbStarts) { + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, ptrOffset(cmdStream->getCpuBase(), offset), cmdStream->getUsed() - offset)); + + auto cmds = findAll(cmdList.begin(), cmdList.end()); + + EXPECT_EQ(expectedNumBbStarts, cmds.size()); + }; + + immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, eventHandle, 0, nullptr, launchParams, false); + + offset = cmdStream->getUsed(); + immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, eventHandle, 0, nullptr, launchParams, false); 
+ findConditionalBbStarts(1); // chaining + + EXPECT_TRUE(immCmdList->isRelaxedOrderingDispatchAllowed(0)); + + offset = cmdStream->getUsed(); + immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false); + findConditionalBbStarts(1); // implicit dependency +} + HWTEST2_F(InOrderCmdListTests, givenInOrderEventModeWhenWaitingForEventFromPreviousAppendThenSkip, IsAtLeastXeHpCore) { using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; @@ -1350,19 +1579,25 @@ HWTEST2_F(InOrderCmdListTests, givenInOrderEventModeWhenWaitingForEventFromPrevi immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 1, &event0Handle, launchParams, false); - GenCmdList cmdList; - ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( - cmdList, - ptrOffset(cmdStream->getCpuBase(), offset), - cmdStream->getUsed() - offset)); + { + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, + ptrOffset(cmdStream->getCpuBase(), offset), + cmdStream->getUsed() - offset)); - auto itor = find(cmdList.begin(), cmdList.end()); + auto itor = find(cmdList.begin(), cmdList.end()); - ASSERT_NE(cmdList.end(), itor); + if (immCmdList->isInOrderNonWalkerSignalingRequired(events[0].get())) { + EXPECT_EQ(cmdList.end(), itor); // already waited on previous call + } else { + ASSERT_NE(cmdList.end(), itor); - itor = find(++itor, cmdList.end()); + itor = find(++itor, cmdList.end()); - EXPECT_EQ(cmdList.end(), itor); + EXPECT_EQ(cmdList.end(), itor); + } + } } HWTEST2_F(InOrderCmdListTests, givenInOrderEventModeWhenSubmittingFromDifferentCmdListThenProgramSemaphoreForEvent, IsAtLeastSkl) { @@ -3342,30 +3577,74 @@ HWTEST2_F(MultiTileInOrderCmdListTests, givenMultiTileInOrderModeWhenProgramming auto eventPool = createEvents(1, false); auto eventHandle = events[0]->toHandle(); + size_t offset = cmdStream->getUsed(); + immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, eventHandle, 0, nullptr, launchParams, 
false); - size_t offset = cmdStream->getUsed(); + auto isCompactEvent = immCmdList->compactL3FlushEvent(immCmdList->getDcFlushRequired(events[0]->isSignalScope())); + + { + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, ptrOffset(cmdStream->getCpuBase(), offset), (cmdStream->getUsed() - offset))); + + auto semaphoreItor = find(cmdList.begin(), cmdList.end()); + + if (isCompactEvent) { + ASSERT_NE(cmdList.end(), semaphoreItor); + auto semaphoreCmd = genCmdCast(*semaphoreItor); + + ASSERT_NE(nullptr, semaphoreCmd); + + auto gpuAddress = events[0]->getCompletionFieldGpuAddress(device); + + while (gpuAddress != semaphoreCmd->getSemaphoreGraphicsAddress()) { + semaphoreItor = find(++semaphoreItor, cmdList.end()); + ASSERT_NE(cmdList.end(), semaphoreItor); + + semaphoreCmd = genCmdCast(*semaphoreItor); + ASSERT_NE(nullptr, semaphoreCmd); + } + + EXPECT_EQ(static_cast(Event::State::STATE_CLEARED), semaphoreCmd->getSemaphoreDataDword()); + EXPECT_EQ(gpuAddress, semaphoreCmd->getSemaphoreGraphicsAddress()); + + semaphoreCmd = genCmdCast(++semaphoreCmd); + ASSERT_NE(nullptr, semaphoreCmd); + + EXPECT_EQ(static_cast(Event::State::STATE_CLEARED), semaphoreCmd->getSemaphoreDataDword()); + EXPECT_EQ(gpuAddress + sizeof(uint64_t), semaphoreCmd->getSemaphoreGraphicsAddress()); + } + } + + offset = cmdStream->getUsed(); immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 1, &eventHandle, launchParams, false); - GenCmdList cmdList; - ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, - ptrOffset(cmdStream->getCpuBase(), offset), - (cmdStream->getUsed() - offset))); + { + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, + ptrOffset(cmdStream->getCpuBase(), offset), + (cmdStream->getUsed() - offset))); - auto semaphoreCmd = genCmdCast(*cmdList.begin()); - ASSERT_NE(nullptr, semaphoreCmd); + auto semaphoreCmd = genCmdCast(*cmdList.begin()); - auto gpuAddress = 
immCmdList->inOrderExecInfo->inOrderDependencyCounterAllocation.getGpuAddress(); + if (isCompactEvent) { + ASSERT_EQ(nullptr, semaphoreCmd); // already waited on previous call + } else { + ASSERT_NE(nullptr, semaphoreCmd); - EXPECT_EQ(1u, semaphoreCmd->getSemaphoreDataDword()); - EXPECT_EQ(gpuAddress, semaphoreCmd->getSemaphoreGraphicsAddress()); + auto gpuAddress = immCmdList->inOrderExecInfo->inOrderDependencyCounterAllocation.getGpuAddress(); - semaphoreCmd = genCmdCast(++semaphoreCmd); - ASSERT_NE(nullptr, semaphoreCmd); + EXPECT_EQ(1u, semaphoreCmd->getSemaphoreDataDword()); + EXPECT_EQ(gpuAddress, semaphoreCmd->getSemaphoreGraphicsAddress()); - EXPECT_EQ(1u, semaphoreCmd->getSemaphoreDataDword()); - EXPECT_EQ(gpuAddress + sizeof(uint64_t), semaphoreCmd->getSemaphoreGraphicsAddress()); + semaphoreCmd = genCmdCast(++semaphoreCmd); + ASSERT_NE(nullptr, semaphoreCmd); + + EXPECT_EQ(1u, semaphoreCmd->getSemaphoreDataDword()); + EXPECT_EQ(gpuAddress + sizeof(uint64_t), semaphoreCmd->getSemaphoreGraphicsAddress()); + } + } } HWTEST2_F(MultiTileInOrderCmdListTests, givenMultiTileInOrderModeWhenSignalingSyncAllocationThenEnablePartitionOffset, IsAtLeastXeHpCore) { @@ -3426,7 +3705,7 @@ HWTEST2_F(MultiTileInOrderCmdListTests, givenMultiTileInOrderModeWhenCallingSync EXPECT_EQ(ZE_RESULT_SUCCESS, events[0]->hostSynchronize(0)); } -HWTEST2_F(MultiTileInOrderCmdListTests, givenMultiTileInOrderModeWhenProgrammingTimestampEventThenHandleChaining, IsAtLeastXeHpcCore) { +HWTEST2_F(MultiTileInOrderCmdListTests, givenMultiTileInOrderModeWhenProgrammingTimestampEventThenHandleChaining, IsAtLeastXeHpCore) { using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; @@ -3475,7 +3754,7 @@ HWTEST2_F(MultiTileInOrderCmdListTests, givenMultiTileInOrderModeWhenProgramming EXPECT_EQ(eventEndGpuVa + events[0]->getSinglePacketSize(), semaphoreCmd->getSemaphoreGraphicsAddress()); } -HWTEST2_F(MultiTileInOrderCmdListTests, 
givenMultiTileInOrderModeWhenProgrammingTimestampEventThenHandlePacketsChaining, IsAtLeastXeHpcCore) { +HWTEST2_F(MultiTileInOrderCmdListTests, givenMultiTileInOrderModeWhenProgrammingTimestampEventThenHandlePacketsChaining, IsAtLeastXeHpCore) { using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER;