From 607832a8e60d36070043c7f3b9a11bf27182d5bd Mon Sep 17 00:00:00 2001 From: "Dunajski, Bartosz" Date: Fri, 15 Dec 2023 13:08:58 +0000 Subject: [PATCH] fix: skip walker in-order signaling when closing PC is required Related-To: NEO-7966 Signed-off-by: Dunajski, Bartosz --- level_zero/core/source/cmdlist/cmdlist.h | 1 + level_zero/core/source/cmdlist/cmdlist_hw.inl | 4 +- .../cmdlist/cmdlist_hw_xehp_and_later.inl | 2 +- .../fixtures/in_order_cmd_list_fixture.h | 8 ++ .../test_cmdlist_append_launch_kernel_3.cpp | 122 +++++++++++++++++- 5 files changed, 128 insertions(+), 9 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist.h b/level_zero/core/source/cmdlist/cmdlist.h index d6becb870c..644ee32904 100644 --- a/level_zero/core/source/cmdlist/cmdlist.h +++ b/level_zero/core/source/cmdlist/cmdlist.h @@ -48,6 +48,7 @@ struct CmdListKernelLaunchParams { bool isDestinationAllocationInSystemMemory = false; bool isHostSignalScopeEvent = false; bool skipInOrderNonWalkerSignaling = false; + bool pipeControlSignalling = false; }; struct CmdListReturnPoint { diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index 643f1dc7d8..7980e4fad7 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -1456,6 +1456,8 @@ ze_result_t CommandListCoreFamily::appendMemoryCopy(void *dstptr, launchParams.isKernelSplitOperation = kernelCounter > 1; bool singlePipeControlPacket = eventSignalPipeControl(launchParams.isKernelSplitOperation, dcFlush); + launchParams.pipeControlSignalling = (signalEvent && singlePipeControlPacket) || dstAllocationStruct.needsFlush; + appendEventForProfilingAllWalkers(signalEvent, true, singlePipeControlPacket); if (isCopyOnly()) { @@ -1525,7 +1527,7 @@ ze_result_t CommandListCoreFamily::appendMemoryCopy(void *dstptr, addToMappedEventList(signalEvent); if (this->isInOrderExecutionEnabled()) { - bool emitPipeControl = !isCopyOnly() && eventSignalPipeControl(launchParams.isKernelSplitOperation, signalEvent ? getDcFlushRequired(signalEvent->isSignalScope()) : false); + bool emitPipeControl = !isCopyOnly() && launchParams.pipeControlSignalling; if (launchParams.isKernelSplitOperation || inOrderCopyOnlySignalingAllowed || emitPipeControl) { if (!signalEvent && !isCopyOnly()) { diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl index 8fbcfdd004..40fe86efdd 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl @@ -277,7 +277,7 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(K appendEventForProfilingAllWalkers(compactEvent, true, true); } - bool inOrderExecSignalRequired = (this->isInOrderExecutionEnabled() && !launchParams.isKernelSplitOperation); + bool inOrderExecSignalRequired = (this->isInOrderExecutionEnabled() && !launchParams.isKernelSplitOperation && !launchParams.pipeControlSignalling); bool inOrderNonWalkerSignalling = isInOrderNonWalkerSignalingRequired(eventForInOrderExec); uint64_t inOrderCounterValue = 0; diff --git a/level_zero/core/test/unit_tests/fixtures/in_order_cmd_list_fixture.h b/level_zero/core/test/unit_tests/fixtures/in_order_cmd_list_fixture.h index 11387719c1..dd0fe55bb3 100644 --- a/level_zero/core/test/unit_tests/fixtures/in_order_cmd_list_fixture.h +++ b/level_zero/core/test/unit_tests/fixtures/in_order_cmd_list_fixture.h @@ -164,6 +164,14 @@ struct InOrderCmdListFixture : public ::Test { return ptr; } + void *allocDeviceMem(size_t size) { + void *alloc = nullptr; + ze_device_mem_alloc_desc_t deviceDesc = {}; + context->allocDeviceMem(device->toHandle(), &deviceDesc, size, 4096u, &alloc); + + return alloc; + } + template bool verifyInOrderDependency(GenCmdList::iterator &cmd, uint64_t counter, uint64_t syncVa, bool qwordCounter); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp index 978f6e3b1f..05a08069eb 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp @@ -1729,7 +1729,8 @@ HWTEST2_F(InOrderCmdListTests, givenCmdsChainingFromAppendCopyWhenDispatchingKer auto offset = cmdStream->getUsed(); ze_copy_region_t region = {0, 0, 0, 1, 1, 1}; - uint32_t copyData = 0; + + void *alloc = allocDeviceMem(16384u); auto findSemaphores = [&](size_t expectedNumSemaphores) { GenCmdList cmdList; @@ -1745,15 +1746,54 @@ HWTEST2_F(InOrderCmdListTests, givenCmdsChainingFromAppendCopyWhenDispatchingKer uint32_t numSemaphores = immCmdList->eventSignalPipeControl(false, immCmdList->getDcFlushRequired(events[0]->isSignalScope())) ? 1 : 2; offset = cmdStream->getUsed(); - immCmdList->appendMemoryCopy(©Data, ©Data, 1, eventHandle, 0, nullptr, false, false); + immCmdList->appendMemoryCopy(alloc, alloc, 1, eventHandle, 0, nullptr, false, false); findSemaphores(numSemaphores); // implicit dependency + optional chaining numSemaphores = immCmdList->eventSignalPipeControl(false, immCmdList->getDcFlushRequired(events[0]->isSignalScope())) ? 1 : 0; offset = cmdStream->getUsed(); - immCmdList->appendMemoryCopy(©Data, ©Data, 1, nullptr, 0, nullptr, false, false); + immCmdList->appendMemoryCopy(alloc, alloc, 1, nullptr, 0, nullptr, false, false); findSemaphores(numSemaphores); // implicit dependency for Compact event or no semaphores for non-compact + offset = cmdStream->getUsed(); + immCmdList->appendMemoryCopyRegion(alloc, ®ion, 1, 1, alloc, ®ion, 1, 1, eventHandle, 0, nullptr, false, false); + findSemaphores(2); // implicit dependency + chaining + + offset = cmdStream->getUsed(); + immCmdList->appendMemoryCopyRegion(alloc, ®ion, 1, 1, alloc, ®ion, 1, 1, nullptr, 0, nullptr, false, false); + findSemaphores(0); // no implicit dependency + + context->freeMem(alloc); +} + +HWTEST2_F(InOrderCmdListTests, givenCmdsChainingFromAppendCopyAndFlushRequiredWhenDispatchingKernelThenProgramSemaphoreOnce, IsAtLeastXeHpCore) { + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + auto immCmdList = createImmCmdList(); + auto eventPool = createEvents(1, false); + events[0]->makeCounterBasedImplicitlyDisabled(); + auto cmdStream = immCmdList->getCmdContainer().getCommandStream(); + auto eventHandle = events[0]->toHandle(); + + auto offset = cmdStream->getUsed(); + ze_copy_region_t region = {0, 0, 0, 1, 1, 1}; + uint32_t copyData = 0; + + auto findSemaphores = [&](size_t expectedNumSemaphores) { + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, ptrOffset(cmdStream->getCpuBase(), offset), cmdStream->getUsed() - offset)); + auto cmds = findAll(cmdList.begin(), cmdList.end()); + EXPECT_EQ(expectedNumSemaphores, cmds.size()); + }; + immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false); + + offset = cmdStream->getUsed(); + immCmdList->appendMemoryCopy(©Data, ©Data, 1, eventHandle, 0, nullptr, false, false); + findSemaphores(1); // implicit dependency + + offset = cmdStream->getUsed(); + immCmdList->appendMemoryCopy(©Data, ©Data, 1, nullptr, 0, nullptr, false, false); + findSemaphores(1); // implicit dependency + offset = cmdStream->getUsed(); immCmdList->appendMemoryCopyRegion(©Data, ®ion, 1, 1, ©Data, ®ion, 1, 1, eventHandle, 0, nullptr, false, false); findSemaphores(2); // implicit dependency + chaining @@ -1776,10 +1816,10 @@ HWTEST2_F(InOrderCmdListTests, givenEventWithRequiredPipeControlWhenDispatchingC auto eventHandle = events[0]->toHandle(); - uint32_t copyData = 0; + void *alloc = allocDeviceMem(16384u); auto offset = cmdStream->getUsed(); - immCmdList->appendMemoryCopy(©Data, ©Data, 1, eventHandle, 0, nullptr, false, false); + immCmdList->appendMemoryCopy(alloc, alloc, 1, eventHandle, 0, nullptr, false, false); GenCmdList cmdList; ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, ptrOffset(cmdStream->getCpuBase(), offset), cmdStream->getUsed() - offset)); @@ -1801,6 +1841,44 @@ HWTEST2_F(InOrderCmdListTests, givenEventWithRequiredPipeControlWhenDispatchingC EXPECT_EQ(1u, postSync.getImmediateData()); EXPECT_EQ(immCmdList->inOrderExecInfo->getDeviceCounterAllocation().getGpuAddress(), postSync.getDestinationAddress()); } + + context->freeMem(alloc); +} + +HWTEST2_F(InOrderCmdListTests, givenEventWithRequiredPipeControlAndAllocFlushWhenDispatchingCopyThenSignalInOrderAllocation, IsAtLeastXeHpCore) { + using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM; + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA; + auto immCmdList = createImmCmdList(); + auto eventPool = createEvents(1, false); + auto cmdStream = immCmdList->getCmdContainer().getCommandStream(); + + auto eventHandle = events[0]->toHandle(); + + uint32_t copyData = 0; + + auto offset = cmdStream->getUsed(); + immCmdList->appendMemoryCopy(©Data, ©Data, 1, eventHandle, 0, nullptr, false, false); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, ptrOffset(cmdStream->getCpuBase(), offset), cmdStream->getUsed() - offset)); + auto sdiItor = find(cmdList.begin(), cmdList.end()); + if (immCmdList->eventSignalPipeControl(false, immCmdList->getDcFlushRequired(events[0]->isSignalScope()))) { + EXPECT_NE(cmdList.end(), sdiItor); + } else { + EXPECT_NE(cmdList.end(), sdiItor); + + auto sdiCmd = genCmdCast(*sdiItor); + + EXPECT_EQ(immCmdList->inOrderExecInfo->getDeviceCounterAllocation().getGpuAddress(), sdiCmd->getAddress()); + + auto walkerItor = find(cmdList.begin(), cmdList.end()); + ASSERT_NE(cmdList.end(), walkerItor); + auto walkerCmd = genCmdCast(*walkerItor); + auto &postSync = walkerCmd->getPostSync(); + + EXPECT_NE(immCmdList->inOrderExecInfo->getDeviceCounterAllocation().getGpuAddress(), postSync.getDestinationAddress()); + } } HWTEST2_F(InOrderCmdListTests, givenCmdsChainingWhenDispatchingKernelWithRelaxedOrderingThenProgramAllDependencies, IsAtLeastXeHpCore) { @@ -3089,9 +3167,9 @@ HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenProgrammingComputeCopyThenDon auto cmdStream = immCmdList->getCmdContainer().getCommandStream(); - auto alignedPtr = alignedMalloc(MemoryConstants::cacheLineSize, MemoryConstants::cacheLineSize); + void *alloc = allocDeviceMem(16384u); - immCmdList->appendMemoryCopy(alignedPtr, alignedPtr, 1, nullptr, 0, nullptr, false, false); + immCmdList->appendMemoryCopy(alloc, alloc, 1, nullptr, 0, nullptr, false, false); GenCmdList cmdList; ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, cmdStream->getCpuBase(), cmdStream->getUsed())); @@ -3107,6 +3185,36 @@ HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenProgrammingComputeCopyThenDon auto sdiItor = find(walkerItor, cmdList.end()); EXPECT_EQ(cmdList.end(), sdiItor); + context->freeMem(alloc); +} + +HWTEST2_F(InOrderCmdListTests, givenAlocFlushRequiredhenProgrammingComputeCopyThenSingalFromSdi, IsAtLeastXeHpCore) { + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM; + auto immCmdList = createImmCmdList(); + + auto cmdStream = immCmdList->getCmdContainer().getCommandStream(); + + auto alignedPtr = alignedMalloc(MemoryConstants::cacheLineSize, MemoryConstants::cacheLineSize); + + immCmdList->appendMemoryCopy(alignedPtr, alignedPtr, 1, nullptr, 0, nullptr, false, false); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, cmdStream->getCpuBase(), cmdStream->getUsed())); + + auto walkerItor = find(cmdList.begin(), cmdList.end()); + ASSERT_NE(cmdList.end(), walkerItor); + + auto walkerCmd = genCmdCast(*walkerItor); + auto &postSync = walkerCmd->getPostSync(); + EXPECT_EQ(0u, postSync.getDestinationAddress()); + + auto sdiItor = find(walkerItor, cmdList.end()); + EXPECT_NE(cmdList.end(), sdiItor); + auto sdiCmd = genCmdCast(*sdiItor); + + EXPECT_EQ(immCmdList->inOrderExecInfo->getDeviceCounterAllocation().getGpuAddress(), sdiCmd->getAddress()); + alignedFree(alignedPtr); }