From 138d76a8934f4e6329419604a046992575d81fcd Mon Sep 17 00:00:00 2001 From: Zbigniew Zdanowicz Date: Tue, 16 Sep 2025 16:48:00 +0000 Subject: [PATCH] feature: add flag to save task count and tag address in command list Related-To: NEO-16140 Signed-off-by: Zbigniew Zdanowicz --- level_zero/core/source/cmdlist/cmdlist.h | 15 ++++- level_zero/core/source/cmdlist/cmdlist_hw.inl | 2 + .../source/cmdlist/cmdlist_hw_immediate.h | 2 +- .../source/cmdlist/cmdlist_hw_immediate.inl | 4 +- level_zero/core/source/cmdqueue/cmdqueue.cpp | 10 ++++ level_zero/core/source/cmdqueue/cmdqueue.h | 9 ++- .../test/aub_tests/aub_hello_world_test.cpp | 2 +- .../unit_tests/fixtures/cmdlist_fixture.inl | 2 +- .../test/unit_tests/mocks/mock_cmdqueue.h | 2 + .../sources/cmdlist/test_cmdlist_1.cpp | 2 +- .../sources/cmdlist/test_cmdlist_4.cpp | 2 +- .../test_cmdlist_append_launch_kernel_1.cpp | 2 +- .../test_cmdlist_append_launch_kernel_3.cpp | 4 +- .../cmdlist/test_in_order_cmdlist_3.cpp | 2 +- .../test_cmdqueue_enqueue_cmdlist_2.cpp | 57 ++++++++++++++++++- .../experimental/source/graph/graph.cpp | 8 +-- 16 files changed, 106 insertions(+), 19 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist.h b/level_zero/core/source/cmdlist/cmdlist.h index 4f296dc1ac..8c5cc00bc0 100644 --- a/level_zero/core/source/cmdlist/cmdlist.h +++ b/level_zero/core/source/cmdlist/cmdlist.h @@ -515,12 +515,22 @@ struct CommandList : _ze_command_list_handle_t { return flags; } - virtual void setPatchingPreamble(bool value) {} + virtual void setPatchingPreamble(bool patching, bool saveWait) {} uint32_t getActiveScratchPatchElements() const { return activeScratchPatchElements; } bool isDualStreamCopyOffloadOperation(bool offloadOperation) const { return (getCopyOffloadModeForOperation(offloadOperation) == CopyOffloadModes::dualStream); } + void saveLatestTagAndTaskCount(uint64_t tagGpuAddress, TaskCountType submittedTaskCount) { + this->latestTagGpuAddress = tagGpuAddress; + this->latestTaskCount = submittedTaskCount; + } + uint64_t getLatestTagGpuAddress() const { + return this->latestTagGpuAddress; + } + TaskCountType getLatestTaskCount() const { + return this->latestTaskCount; + } protected: NEO::GraphicsAllocation *getAllocationFromHostPtrMap(const void *buffer, uint64_t bufferSize, bool copyOffload); @@ -553,11 +563,14 @@ struct CommandList : _ze_command_list_handle_t { NEO::L1CachePolicy l1CachePolicyData{}; NEO::EncodeDummyBlitWaArgs dummyBlitWa{}; + uint64_t latestTagGpuAddress = 0; int64_t currentSurfaceStateBaseAddress = NEO::StreamProperty64::initValue; int64_t currentDynamicStateBaseAddress = NEO::StreamProperty64::initValue; int64_t currentIndirectObjectBaseAddress = NEO::StreamProperty64::initValue; int64_t currentBindingTablePoolBaseAddress = NEO::StreamProperty64::initValue; + TaskCountType latestTaskCount = 0; + ze_context_handle_t hContext = nullptr; CommandQueue *cmdQImmediate = nullptr; CommandQueue *cmdQImmediateCopyOffload = nullptr; diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index 2f26aaa405..df3b4f70e2 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -153,6 +153,8 @@ ze_result_t CommandListCoreFamily::reset() { this->inOrderPatchCmds.clear(); this->totalNoopSpace = 0; + this->latestTagGpuAddress = 0; + this->latestTaskCount = 0; return ZE_RESULT_SUCCESS; } diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h index 4fd20f5de9..19f910f5e0 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h @@ -239,7 +239,7 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily::estimateAdditionalSizeAppe } template -inline void CommandListCoreFamilyImmediate::setPatchingPreamble(bool value) { - this->cmdQImmediate->setPatchingPreamble(value); +inline void CommandListCoreFamilyImmediate::setPatchingPreamble(bool patching, bool saveWait) { + this->cmdQImmediate->setPatchingPreamble(patching, saveWait); } } // namespace L0 diff --git a/level_zero/core/source/cmdqueue/cmdqueue.cpp b/level_zero/core/source/cmdqueue/cmdqueue.cpp index ad79f680d9..49fba95581 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue.cpp +++ b/level_zero/core/source/cmdqueue/cmdqueue.cpp @@ -42,6 +42,16 @@ bool CommandQueue::frontEndTrackingEnabled() const { return NEO::debugManager.flags.AllowPatchingVfeStateInCommandLists.get() || this->frontEndStateTracking; } +void CommandQueue::saveTagAndTaskCountForCommandLists(uint32_t numCommandLists, ze_command_list_handle_t *commandListHandles, + uint64_t tagGpuAddress, TaskCountType submittedTaskCount) { + if (this->saveWaitForPreamble) { + for (uint32_t i = 0; i < numCommandLists; i++) { + auto commandList = CommandList::fromHandle(commandListHandles[i]); + commandList->saveLatestTagAndTaskCount(tagGpuAddress, submittedTaskCount); + } + } +} + CommandQueueImp::CommandQueueImp(Device *device, NEO::CommandStreamReceiver *csr, const ze_command_queue_desc_t *desc) : desc(*desc), device(device), csr(csr) { int overrideCmdQueueSyncMode = NEO::debugManager.flags.OverrideCmdQueueSynchronousMode.get(); diff --git a/level_zero/core/source/cmdqueue/cmdqueue.h b/level_zero/core/source/cmdqueue/cmdqueue.h index 7856fc37f3..7371c1d14f 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue.h +++ b/level_zero/core/source/cmdqueue/cmdqueue.h @@ -90,12 +90,18 @@ struct CommandQueue : _ze_command_queue_handle_t { this->isWalkerWithProfilingEnqueued = false; return retVal; } - inline void setPatchingPreamble(bool patching) { + inline void setPatchingPreamble(bool patching, bool saveWait) { this->patchingPreamble = patching; + this->saveWaitForPreamble = saveWait; } inline bool getPatchingPreamble() const { return this->patchingPreamble; } + inline bool getSaveWaitForPreamble() const { + return this->saveWaitForPreamble; + } + void saveTagAndTaskCountForCommandLists(uint32_t numCommandLists, ze_command_list_handle_t *commandListHandles, + uint64_t tagGpuAddress, TaskCountType submittedTaskCount); protected: bool frontEndTrackingEnabled() const; @@ -119,6 +125,7 @@ struct CommandQueue : _ze_command_queue_handle_t { bool heaplessStateInitEnabled = false; bool isWalkerWithProfilingEnqueued = false; bool patchingPreamble = false; + bool saveWaitForPreamble = false; }; using CommandQueueAllocatorFn = CommandQueue *(*)(Device *device, NEO::CommandStreamReceiver *csr, diff --git a/level_zero/core/test/aub_tests/aub_hello_world_test.cpp b/level_zero/core/test/aub_tests/aub_hello_world_test.cpp index 4884444512..c60a6da15e 100644 --- a/level_zero/core/test/aub_tests/aub_hello_world_test.cpp +++ b/level_zero/core/test/aub_tests/aub_hello_world_test.cpp @@ -125,7 +125,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, uint32_t commandListCount = sizeof(commandLists) / sizeof(commandLists[0]); - pCmdq->setPatchingPreamble(true); + pCmdq->setPatchingPreamble(true, false); auto queueHandle = pCmdq->toHandle(); diff --git a/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.inl b/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.inl index b0c91caa2b..09e2e5b956 100644 --- a/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.inl +++ b/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.inl @@ -1642,7 +1642,7 @@ void CommandListScratchPatchFixtureInit::testScratchInline(bool useImmediate, bo void *queueCpuBase = commandQueue->commandStream.getCpuBase(); auto usedSpaceBefore = commandQueue->commandStream.getUsed(); - commandQueue->setPatchingPreamble(patchPreamble); + commandQueue->setPatchingPreamble(patchPreamble, false); result = commandQueue->executeCommandLists(1, &commandListHandle, nullptr, false, nullptr, nullptr); EXPECT_EQ(ZE_RESULT_SUCCESS, result); auto usedSpaceAfter = commandQueue->commandStream.getUsed(); diff --git a/level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h b/level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h index 85b5357bde..ee30759b45 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h +++ b/level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h @@ -63,6 +63,7 @@ struct WhiteBox<::L0::CommandQueue> : public ::L0::CommandQueueImp { using CommandQueue::partitionCount; using CommandQueue::patchingPreamble; using CommandQueue::pipelineSelectStateTracking; + using CommandQueue::saveWaitForPreamble; using CommandQueue::stateBaseAddressTracking; using CommandQueue::stateComputeModeTracking; @@ -109,6 +110,7 @@ struct MockCommandQueueHw : public L0::CommandQueueHw { using L0::CommandQueue::patchingPreamble; using L0::CommandQueue::pipelineSelectStateTracking; using L0::CommandQueue::preemptionCmdSyncProgramming; + using L0::CommandQueue::saveWaitForPreamble; using L0::CommandQueue::stateBaseAddressTracking; using L0::CommandQueue::stateComputeModeTracking; using L0::CommandQueueImp::csr; diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp index b9dc6fc26f..65d5838be4 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp @@ -2543,7 +2543,7 @@ TEST_F(CommandListCreateTests, givenCreatingRegularCommandlistAndppendCommandLis EXPECT_FALSE(commandList->isImmediateType()); auto result = commandList->appendCommandLists(0u, nullptr, nullptr, 0u, nullptr); EXPECT_EQ(result, ZE_RESULT_ERROR_INVALID_ARGUMENT); - commandList->setPatchingPreamble(true); + commandList->setPatchingPreamble(true, false); } HWTEST_F(CommandListCreateTests, GivenGpuHangWhenCreatingImmediateCommandListAndAppendingEventResetThenDeviceLostIsReturned) { diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_4.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_4.cpp index 047617d208..3b31961ad5 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_4.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_4.cpp @@ -1670,7 +1670,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, uint64_t queueGpuBase = cmdStream->getGpuBase(); auto commandListHandle = commandList->toHandle(); - commandListImmediate->setPatchingPreamble(true); + commandListImmediate->setPatchingPreamble(true, false); auto usedSpaceBefore = cmdStream->getUsed(); returnValue = commandListImmediate->appendCommandLists(1, &commandListHandle, nullptr, 0, nullptr); EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_1.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_1.cpp index 09f37307b6..ccc35913fb 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_1.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_1.cpp @@ -1568,7 +1568,7 @@ HWTEST2_F(CommandListAppendLaunchKernel, GivenPatchPreambleActiveWhenExecutingCo void *cfeInputPtr = commandList->commandsToPatch[0].pCommand; void *cfeInputPtr2 = commandList->commandsToPatch[1].pCommand; - commandQueue->setPatchingPreamble(true); + commandQueue->setPatchingPreamble(true, false); void *queueCpuBase = commandQueue->commandStream.getCpuBase(); auto usedSpaceBefore = commandQueue->commandStream.getUsed(); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp index 00cd0567bb..7ee4a54647 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp @@ -743,7 +743,7 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenPatchPreambleQueueWhenAppendedSync EXPECT_EQ((requiredSize1 + requiredSize2), commandList->getTotalNoopSpace()); - commandQueue->setPatchingPreamble(true); + commandQueue->setPatchingPreamble(true, false); void *queueCpuBase = commandQueue->commandStream.getCpuBase(); auto usedSpaceBefore = commandQueue->commandStream.getUsed(); @@ -993,7 +993,7 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenPatchPreambleQueueWhenAppendedRegi EXPECT_EQ((requiredSize1 + requiredSize2), commandList->getTotalNoopSpace()); - commandQueue->setPatchingPreamble(true); + commandQueue->setPatchingPreamble(true, false); void *queueCpuBase = commandQueue->commandStream.getCpuBase(); auto usedSpaceBefore = commandQueue->commandStream.getUsed(); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_3.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_3.cpp index 14ff7a7d17..b971632855 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_3.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_3.cpp @@ -350,7 +350,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ze_command_queue_desc_t desc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC}; auto mockCmdQHw = makeZeUniquePtr>(device, device->getNEODevice()->getDefaultEngine().commandStreamReceiver, &desc); mockCmdQHw->initialize(false, false, false); - mockCmdQHw->setPatchingPreamble(true); + mockCmdQHw->setPatchingPreamble(true, false); debugManager.flags.InOrderDuplicatedCounterStorageEnabled.set(0); diff --git a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_enqueue_cmdlist_2.cpp b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_enqueue_cmdlist_2.cpp index d23ed765fc..719801814a 100644 --- a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_enqueue_cmdlist_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_enqueue_cmdlist_2.cpp @@ -977,7 +977,7 @@ HWTEST_F(CommandQueueExecuteCommandListsSimpleTest, givenPatchPreambleWhenSingle uint64_t endGpuAddress = commandList->getCmdContainer().getEndCmdGpuAddress(); uint64_t startGpuAddress = commandList->getCmdContainer().getCmdBufferAllocations()[0]->getGpuAddress(); - commandQueue->setPatchingPreamble(true); + commandQueue->setPatchingPreamble(true, false); void *queueCpuBase = commandQueue->commandStream.getCpuBase(); uint64_t queueGpuBase = commandQueue->commandStream.getGpuBase(); @@ -1116,7 +1116,7 @@ HWTEST_F(CommandQueueExecuteCommandListsSimpleTest, givenPatchPreambleWhenTwoCmd uint64_t start2GpuAddress = commandList2->getCmdContainer().getCmdBufferAllocations()[0]->getGpuAddress(); uint64_t endGpuAddress = commandList2->getCmdContainer().getEndCmdGpuAddress(); - commandQueue->setPatchingPreamble(true); + commandQueue->setPatchingPreamble(true, false); void *queueCpuBase = commandQueue->commandStream.getCpuBase(); uint64_t queueGpuBase = commandQueue->commandStream.getGpuBase(); @@ -1293,5 +1293,58 @@ HWTEST_F(CommandQueueExecuteCommandListsSimpleTest, givenPatchPreambleWhenTwoCmd commandQueue->destroy(); } +HWTEST_F(CommandQueueExecuteCommandListsSimpleTest, givenPatchPreambleAndSavingWaitDataWhenQueueSavesDataThenCommandListsHaveCorrectData) { + ze_result_t returnValue; + ze_command_queue_desc_t queueDesc{ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC}; + queueDesc.ordinal = 0u; + queueDesc.index = 0u; + queueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL; + queueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS; + + WhiteBox *commandQueue = whiteboxCast(CommandQueue::create(productFamily, + device, + neoDevice->getDefaultEngine().commandStreamReceiver, + &queueDesc, + false, + false, + false, + returnValue)); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + + auto commandList = CommandList::create(productFamily, device, NEO::EngineGroupType::compute, 0u, returnValue, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + + ze_command_list_handle_t commandListHandle = commandList->toHandle(); + commandList->close(); + + commandQueue->setPatchingPreamble(true, false); + EXPECT_TRUE(commandQueue->getPatchingPreamble()); + EXPECT_FALSE(commandQueue->getSaveWaitForPreamble()); + + uint64_t expectedGpuAddress = 0x123000; + TaskCountType expectedTaskCount = 0x456; + + commandQueue->saveTagAndTaskCountForCommandLists(1, &commandListHandle, expectedGpuAddress, expectedTaskCount); + // save and wait is disabled, so nothing to be saved + EXPECT_EQ(0u, commandList->getLatestTagGpuAddress()); + EXPECT_EQ(0u, commandList->getLatestTaskCount()); + + commandQueue->setPatchingPreamble(true, true); + EXPECT_TRUE(commandQueue->getPatchingPreamble()); + EXPECT_TRUE(commandQueue->getSaveWaitForPreamble()); + + commandQueue->saveTagAndTaskCountForCommandLists(1, &commandListHandle, expectedGpuAddress, expectedTaskCount); + // save and wait is now enabled + EXPECT_EQ(expectedGpuAddress, commandList->getLatestTagGpuAddress()); + EXPECT_EQ(expectedTaskCount, commandList->getLatestTaskCount()); + + commandList->reset(); + EXPECT_EQ(0u, commandList->getLatestTagGpuAddress()); + EXPECT_EQ(0u, commandList->getLatestTaskCount()); + + commandList->destroy(); + commandQueue->destroy(); +} + } // namespace ult } // namespace L0 diff --git a/level_zero/experimental/source/graph/graph.cpp b/level_zero/experimental/source/graph/graph.cpp index fd6706ff7a..1c443d8c01 100644 --- a/level_zero/experimental/source/graph/graph.cpp +++ b/level_zero/experimental/source/graph/graph.cpp @@ -501,9 +501,9 @@ ze_result_t ExecutableGraph::execute(L0::CommandList *executionTarget, void *pNe auto currSignalEvent = (myLastCommandList == *cmdList) ? hSignalEvent : nullptr; ze_command_list_handle_t hCmdList = *cmdList; - executionTarget->setPatchingPreamble(true); + executionTarget->setPatchingPreamble(true, false); auto res = executionTarget->appendCommandLists(1, &hCmdList, currSignalEvent, numWaitEvents, phWaitEvents); - executionTarget->setPatchingPreamble(false); + executionTarget->setPatchingPreamble(false, false); if (ZE_RESULT_SUCCESS != res) { return res; } @@ -513,9 +513,9 @@ ze_result_t ExecutableGraph::execute(L0::CommandList *executionTarget, void *pNe if (L0::CommandList **cmdList = std::get_if(&this->submissionChain[submissioNodeId])) { auto currSignalEvent = (myLastCommandList == *cmdList) ? hSignalEvent : nullptr; ze_command_list_handle_t hCmdList = *cmdList; - executionTarget->setPatchingPreamble(true); + executionTarget->setPatchingPreamble(true, false); auto res = executionTarget->appendCommandLists(1, &hCmdList, currSignalEvent, 0, nullptr); - executionTarget->setPatchingPreamble(false); + executionTarget->setPatchingPreamble(false, false); if (ZE_RESULT_SUCCESS != res) { return res; }