diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl b/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl index 346b4400d4..5c1270cbb5 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl @@ -240,6 +240,17 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(K event->setKernelForPrintf(kernel); } + /* In-order mode and not a kernel-split operation: append a single barrier with PostSyncMode::ImmediateData that targets the counter allocation GPU address plus inOrderAllocationOffset and carries inOrderDependencyCounter + 1 as data. NOTE(review): template arguments look stripped from this text (e.g. on CommandListCoreFamily and MemorySynchronizationCommands) - confirm against the original patch. */ if (this->inOrderExecutionEnabled && !launchParams.isKernelSplitOperation) { + NEO::PipeControlArgs args; + uint64_t counterAddress = this->inOrderDependencyCounterAllocation->getGpuAddress() + this->inOrderAllocationOffset; + + NEO::MemorySynchronizationCommands::addSingleBarrier(*commandContainer.getCommandStream(), + NEO::PostSyncMode::ImmediateData, + counterAddress, + this->inOrderDependencyCounter + 1, + args); + } + return ZE_RESULT_SUCCESS; } diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp index 0c983c62df..b438fde4d3 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp @@ -1213,6 +1213,90 @@ HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenProgrammingTimestampEventThen EXPECT_EQ(1u, sdiCmd->getDataDword0()); } +using NonPostSyncWalkerMatcher = IsWithinGfxCore; /* Matcher shared by the two tests added below. NOTE(review): IsWithinGfxCore presumably had template arguments naming a gfx-core range - they are not visible here. */ + +HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenProgrammingWalkerThenProgramPipeControlWithSignalAllocation, NonPostSyncWalkerMatcher) { + /* Checks that a kernel launch on the immediate command list emits a PIPE_CONTROL whose post-sync operation is an immediate-data write to the counter location. */ using WALKER = typename FamilyType::WALKER_TYPE; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + + /* Non-default offset and counter values, set before the append, feed the expectations below. */ auto immCmdList = createImmCmdList(); + immCmdList->inOrderAllocationOffset = 64; + immCmdList->inOrderDependencyCounter = 123; + + auto cmdStream = immCmdList->getCmdContainer().getCommandStream(); + + 
immCmdList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, cmdStream->getCpuBase(), cmdStream->getUsed())); + + /* Find the walker first, then search from it to the end for the command that is cast to pcCmd (presumably WALKER and PIPE_CONTROL; the template arguments of find and genCmdCast are not visible here). */ auto walkerItor = find(cmdList.begin(), cmdList.end()); + ASSERT_NE(cmdList.end(), walkerItor); + + auto pcItor = find(walkerItor, cmdList.end()); + ASSERT_NE(cmdList.end(), pcItor); + + auto pcCmd = genCmdCast(*pcItor); + ASSERT_NE(nullptr, pcCmd); + + /* The address is compared in two halves: the low 32 bits via getAddress and the high 32 bits via getAddressHigh. NOTE(review): the immediate data is compared to the counter as read after the append, while the production hunk writes counter + 1 - presumably the append increments inOrderDependencyCounter; confirm. */ uint64_t expectedAddress = immCmdList->inOrderDependencyCounterAllocation->getGpuAddress() + immCmdList->inOrderAllocationOffset; + + EXPECT_EQ(static_cast(expectedAddress & 0x0000FFFFFFFFULL), pcCmd->getAddress()); + EXPECT_EQ(static_cast(expectedAddress >> 32), pcCmd->getAddressHigh()); + EXPECT_EQ(static_cast(immCmdList->inOrderDependencyCounter), pcCmd->getImmediateData()); + EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, pcCmd->getPostSyncOperation()); +} + +HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenProgrammingKernelSplitThenProgramPcAndSignalAlloc, NonPostSyncWalkerMatcher) { + /* Checks the memory-copy path: behind the last walker a PIPE_CONTROL with no post-sync write is expected, and behind that a store-data-imm that writes the counter value to the counter location. */ using WALKER = typename FamilyType::WALKER_TYPE; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM; + + auto immCmdList = createImmCmdList(); + immCmdList->inOrderAllocationOffset = 64; + immCmdList->inOrderDependencyCounter = 123; + + auto cmdStream = immCmdList->getCmdContainer().getCommandStream(); + + const size_t ptrBaseSize = 256; + const size_t offset = 1; + + void *hostAlloc = nullptr; + ze_host_mem_alloc_desc_t hostDesc = {}; + context->allocHostMem(&hostDesc, ptrBaseSize, MemoryConstants::cacheLineSize, &hostAlloc); + + ASSERT_NE(nullptr, hostAlloc); + + /* One byte past the start of the host allocation; the same pointer is passed for both pointer arguments of the copy. NOTE(review): presumably this misalignment is what makes appendMemoryCopy split the kernel - confirm. */ auto unalignedPtr = ptrOffset(hostAlloc, offset); + + immCmdList->appendMemoryCopy(unalignedPtr, unalignedPtr, ptrBaseSize - offset, nullptr, 0, nullptr, false, false); + GenCmdList cmdList; + 
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, cmdStream->getCpuBase(), cmdStream->getUsed())); + + /* Reverse searches starting at rbegin: first up to rend, then up to the walker position, then up to the pcItor position (presumably looking for WALKER, PIPE_CONTROL and MI_STORE_DATA_IMM; the template arguments of reverseFind are not visible here). */ auto lastWalkerItor = reverseFind(cmdList.rbegin(), cmdList.rend()); + ASSERT_NE(cmdList.rend(), lastWalkerItor); + + auto pcItor = reverseFind(cmdList.rbegin(), lastWalkerItor); + ASSERT_NE(lastWalkerItor, pcItor); + + auto pcCmd = genCmdCast(*pcItor); + ASSERT_NE(nullptr, pcCmd); + EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_NO_WRITE, pcCmd->getPostSyncOperation()); + + auto sdiItor = reverseFind(cmdList.rbegin(), pcItor); + ASSERT_NE(pcItor, sdiItor); + + /* NOTE(review): unlike pcCmd, sdiCmd is dereferenced below without an ASSERT_NE(nullptr, ...) guard. */ auto sdiCmd = genCmdCast(*sdiItor); + + uint64_t expectedAddress = immCmdList->inOrderDependencyCounterAllocation->getGpuAddress() + immCmdList->inOrderAllocationOffset; + + EXPECT_EQ(expectedAddress, sdiCmd->getAddress()); + EXPECT_EQ(0u, sdiCmd->getStoreQword()); + EXPECT_EQ(immCmdList->inOrderDependencyCounter, sdiCmd->getDataDword0()); + + /* NOTE(review): not reached when an earlier ASSERT_* fails, assuming the usual gtest behaviour of returning from the test body - the host allocation would then stay unfreed. */ context->freeMem(hostAlloc); +} + HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenProgrammingAppendSignalEventThenSignalSyncAllocation, IsAtLeastXeHpCore) { using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM; using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;