From 529aa605639412f28dfaba6e840ce41b072a5c78 Mon Sep 17 00:00:00 2001 From: "Dunajski, Bartosz" Date: Mon, 25 Sep 2023 14:51:28 +0000 Subject: [PATCH] performance: optimize RelaxedOrdering in-order Barrier programming Related-To: NEO-7966 Signed-off-by: Dunajski, Bartosz --- .../source/cmdlist/cmdlist_hw_immediate.inl | 5 +- .../sources/cmdlist/test_cmdlist_1.cpp | 68 +++++++++++++++++++ 2 files changed, 72 insertions(+), 1 deletion(-) diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl index 9a2305f351..ae9b40c520 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl @@ -472,6 +472,8 @@ template ze_result_t CommandListCoreFamilyImmediate::appendBarrier(ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch) { ze_result_t ret = ZE_RESULT_SUCCESS; + bool isStallingOperation = true; + if (isInOrderExecutionEnabled()) { if (isSkippingInOrderBarrierAllowed(hSignalEvent, numWaitEvents, phWaitEvents)) { if (hSignalEvent) { @@ -482,6 +484,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendBarrier(ze_even } relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents); + isStallingOperation = hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch); } checkAvailableSpace(numWaitEvents, false, commonImmediateCommandSize); @@ -491,7 +494,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendBarrier(ze_even ret = CommandListCoreFamily::appendBarrier(hSignalEvent, numWaitEvents, phWaitEvents, relaxedOrderingDispatch); this->dependenciesPresent = true; - return flushImmediate(ret, true, !relaxedOrderingDispatch, relaxedOrderingDispatch, false, hSignalEvent); + return flushImmediate(ret, true, isStallingOperation, relaxedOrderingDispatch, false, hSignalEvent); } template diff --git 
a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp
index d92eaba895..2de6b84af6 100644
--- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp
+++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp
@@ -1646,6 +1646,74 @@ HWTEST2_F(CommandListCreate, givenInOrderExecutionWhenDispatchingBarrierThenAllo
     EXPECT_FALSE(ultCsr->latestFlushedBatchBuffer.hasStallingCmds);
 }
 
+HWTEST2_F(CommandListCreate, givenInOrderExecutionWhenDispatchingBarrierWithFlushAndWithoutDependenciesThenDontMarkAsStalling, IsAtLeastXeHpcCore) {
+    bool useImmediateFlushTask = getHelper().platformSupportsImmediateComputeFlushTask();
+
+    DebugManagerStateRestore restore;
+    DebugManager.flags.DirectSubmissionRelaxedOrdering.set(1);
+
+    ze_command_queue_desc_t desc = {};
+    desc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
+    ze_result_t returnValue;
+    auto commandList0 = zeUniquePtr(CommandList::createImmediate(productFamily, device, &desc, false, NEO::EngineGroupType::RenderCompute, returnValue));
+    auto commandList = zeUniquePtr(CommandList::createImmediate(productFamily, device, &desc, false, NEO::EngineGroupType::RenderCompute, returnValue));
+    ASSERT_NE(nullptr, commandList);
+    auto whiteBoxCmdList = static_cast(commandList.get());
+    whiteBoxCmdList->enableInOrderExecution(); // only commandList is switched to in-order; commandList0 is left as created
+
+    ze_event_pool_desc_t eventPoolDesc = {};
+    eventPoolDesc.count = 1;
+    eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE | ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
+
+    ze_event_desc_t eventDesc = {};
+    eventDesc.wait = ZE_EVENT_SCOPE_FLAG_HOST;
+
+    ze_event_handle_t event = nullptr;
+
+    std::unique_ptr eventPool(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue));
+    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
+
+    ASSERT_EQ(ZE_RESULT_SUCCESS, eventPool->createEvent(&eventDesc, &event));
+    std::unique_ptr eventObject(L0::Event::fromHandle(event));
+
+    auto ultCsr = static_cast *>(whiteBoxCmdList->csr);
+    ultCsr->recordFlusheBatchBuffer = true;
+
+    auto directSubmission = new MockDirectSubmissionHw>(*ultCsr);
+    ultCsr->directSubmission.reset(directSubmission);
+    int client1, client2; // only the addresses are used, as client identifiers
+    ultCsr->registerClient(&client1);
+    ultCsr->registerClient(&client2);
+
+    // Initialize NP state: barrier on the list without in-order execution, dispatched with relaxedOrderingDispatch=false, so it is expected to be stalling
+    commandList0->appendBarrier(nullptr, 1, &event, false);
+
+    if (useImmediateFlushTask) {
+        EXPECT_FALSE(ultCsr->recordedImmediateDispatchFlags.hasRelaxedOrderingDependencies);
+        EXPECT_TRUE(ultCsr->recordedImmediateDispatchFlags.hasStallingCmds);
+    } else { // same submission as above, so the expectations must match the immediate-flush path
+        EXPECT_FALSE(ultCsr->recordedDispatchFlags.hasRelaxedOrderingDependencies);
+        EXPECT_TRUE(ultCsr->recordedDispatchFlags.hasStallingCmds);
+    }
+    EXPECT_FALSE(ultCsr->latestFlushedBatchBuffer.hasRelaxedOrderingDependencies);
+    EXPECT_TRUE(ultCsr->latestFlushedBatchBuffer.hasStallingCmds);
+
+    ultCsr->unregisterClient(&client1);
+    ultCsr->unregisterClient(&client2);
+
+    commandList->appendBarrier(event, 0, nullptr, false); // in-order list, signal event only, no wait events: must not be marked as stalling
+
+    if (useImmediateFlushTask) {
+        EXPECT_FALSE(ultCsr->recordedImmediateDispatchFlags.hasRelaxedOrderingDependencies);
+        EXPECT_FALSE(ultCsr->recordedImmediateDispatchFlags.hasStallingCmds);
+    } else {
+        EXPECT_FALSE(ultCsr->recordedDispatchFlags.hasRelaxedOrderingDependencies);
+        EXPECT_FALSE(ultCsr->recordedDispatchFlags.hasStallingCmds);
+    }
+    EXPECT_FALSE(ultCsr->latestFlushedBatchBuffer.hasRelaxedOrderingDependencies);
+    EXPECT_FALSE(ultCsr->latestFlushedBatchBuffer.hasStallingCmds);
+}
+
 HWTEST2_F(CommandListCreate, givenInOrderExecutionWhenDispatchingRelaxedOrderingThenProgramConditionalBbStart, IsAtLeastXeHpcCore) {
     using MI_LOAD_REGISTER_REG = typename FamilyType::MI_LOAD_REGISTER_REG;