From c7d5a96dfdb44dcbae39996f991a97330b1ed2af Mon Sep 17 00:00:00 2001 From: Lukasz Jobczyk Date: Thu, 28 Oct 2021 09:21:44 +0000 Subject: [PATCH] Flush small task adjustments Signed-off-by: Lukasz Jobczyk --- .../source/cmdlist/cmdlist_hw_immediate.inl | 3 -- .../core/source/cmdlist/cmdlist_imp.cpp | 1 - .../core/source/cmdqueue/cmdqueue_hw.inl | 4 ++ .../sources/cmdqueue/test_cmdqueue.cpp | 28 ++++++++++++ .../command_queue/enqueue_thread_tests.cpp | 1 + ...and_stream_receiver_flush_task_2_tests.cpp | 2 + ...and_stream_receiver_flush_task_3_tests.cpp | 38 ++++++++++++++++ ...ceiver_flush_task_tests_xehp_and_later.cpp | 44 +++++++++++++++++++ .../command_stream_receiver_hw_2_tests.cpp | 32 ++++++++++++++ .../command_stream_receiver_tests.cpp | 24 ++++++++++ opencl/test/unit_test/kernel/kernel_tests.cpp | 1 + .../command_stream_receiver.cpp | 14 +++--- .../command_stream/command_stream_receiver.h | 1 + .../command_stream_receiver_hw.h | 2 +- .../command_stream_receiver_hw_base.inl | 43 +++++++++++------- .../mocks/mock_command_stream_receiver.h | 2 + 16 files changed, 213 insertions(+), 27 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl index 04243a1201..5ec800dc3c 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl @@ -176,7 +176,6 @@ ze_result_t CommandListCoreFamilyImmediate::appendBarrier( NEO::PipeControlArgs args; this->csr->flushNonKernelTask(nullptr, 0, 0, args, false, false, false); if (this->isSyncModeQueue) { - this->csr->flushTagUpdate(); auto timeoutMicroseconds = NEO::TimeoutControls::maxTimeout; this->csr->waitForCompletionWithTimeout(false, timeoutMicroseconds, this->csr->peekTaskCount()); } @@ -288,7 +287,6 @@ ze_result_t CommandListCoreFamilyImmediate::appendSignalEvent(ze_ } this->csr->flushNonKernelTask(&event->getAllocation(this->device), event->getGpuAddress(this->device), Event::STATE_SIGNALED, args, false, false, false); if (this->isSyncModeQueue) { - this->csr->flushTagUpdate(); auto timeoutMicroseconds = NEO::TimeoutControls::maxTimeout; this->csr->waitForCompletionWithTimeout(false, timeoutMicroseconds, this->csr->peekTaskCount()); } @@ -322,7 +320,6 @@ ze_result_t CommandListCoreFamilyImmediate::appendEventReset(ze_e } this->csr->flushNonKernelTask(&event->getAllocation(this->device), event->getGpuAddress(this->device), Event::STATE_CLEARED, args, false, false, false); if (this->isSyncModeQueue) { - this->csr->flushTagUpdate(); auto timeoutMicroseconds = NEO::TimeoutControls::maxTimeout; this->csr->waitForCompletionWithTimeout(false, timeoutMicroseconds, this->csr->peekTaskCount()); } diff --git a/level_zero/core/source/cmdlist/cmdlist_imp.cpp b/level_zero/core/source/cmdlist/cmdlist_imp.cpp index 67947fbd3e..0e17051589 100644 --- a/level_zero/core/source/cmdlist/cmdlist_imp.cpp +++ b/level_zero/core/source/cmdlist/cmdlist_imp.cpp @@ -28,7 +28,6 @@ CommandListAllocatorFn commandListFactoryImmediate[IGFX_MAX_PRODUCT] = {}; ze_result_t CommandListImp::destroy() { if (this->isFlushTaskSubmissionEnabled && !this->isSyncModeQueue) { - this->csr->flushTagUpdate(); auto timeoutMicroseconds = NEO::TimeoutControls::maxTimeout; this->csr->waitForCompletionWithTimeout(false, timeoutMicroseconds, this->csr->peekTaskCount()); } diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl index 0c6e46374e..f79aad25be 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl @@ -558,6 +558,10 @@ void CommandQueueHw::dispatchTaskCountWrite(NEO::LinearStream &co UNRECOVERABLE_IF(csr == nullptr); + if (csr->isUpdateTagFromWaitEnabled()) { + return; + } + auto taskCountToWrite = csr->peekTaskCount() + 1; auto gpuAddress = static_cast(csr->getTagAllocation()->getGpuAddress()); diff --git a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue.cpp b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue.cpp index 7cc96d3972..29059a1296 100644 --- a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue.cpp @@ -243,6 +243,34 @@ HWTEST_F(CommandQueueCreate, given100CmdListsWhenExecutingThenCommandStreamIsNot commandQueue->destroy(); } +HWTEST_F(CommandQueueCreate, givenUpdateTaskCountFromWaitWhenDispatchTaskCountWriteThenNoPipeControlFlushed) { + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + + DebugManagerStateRestore restorer; + DebugManager.flags.UpdateTaskCountFromWait.set(1); + + const ze_command_queue_desc_t desc = {}; + ze_result_t returnValue; + auto commandQueue = whitebox_cast(CommandQueue::create(productFamily, + device, + neoDevice->getDefaultEngine().commandStreamReceiver, + &desc, + false, + false, + returnValue)); + + commandQueue->dispatchTaskCountWrite(*commandQueue->commandStream, false); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, ptrOffset(commandQueue->commandStream->getCpuBase(), 0), commandQueue->commandStream->getUsed())); + + auto itor = find(cmdList.begin(), cmdList.end()); + EXPECT_EQ(cmdList.end(), itor); + + commandQueue->destroy(); +} + HWTEST_F(CommandQueueCreate, givenContainerWithAllocationsWhenResidencyContainerIsEmptyThenMakeResidentWasNotCalled) { auto csr = std::make_unique(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield()); csr->setupContext(*neoDevice->getDefaultEngine().osContext); diff --git a/opencl/test/unit_test/command_queue/enqueue_thread_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_thread_tests.cpp index 9867b40574..c3a85947f8 100644 --- a/opencl/test/unit_test/command_queue/enqueue_thread_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_thread_tests.cpp @@ -460,6 +460,7 @@ HWTEST_F(EnqueueThreading, WhenFinishingThenKernelHasOwnership) { auto csr = (CommandStreamReceiverMock *)&this->pCmdQ->getGpgpuCommandStreamReceiver(); csr->expectedToFreeCount = 0u; csr->latestSentTaskCount = 1; + csr->latestFlushedTaskCount = 1; pCmdQ->finish(); } diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_2_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_2_tests.cpp index 67ea8e77e6..97a62e1b03 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_2_tests.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_2_tests.cpp @@ -125,6 +125,8 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, GivenEmptyQueueWhenFinishingThenTa HWTEST_F(CommandStreamReceiverFlushTaskTests, givenTaskCountToWaitBiggerThanLatestSentTaskCountWhenWaitForCompletionThenFlushPipeControl) { typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL; + DebugManagerStateRestore restorer; + DebugManager.flags.UpdateTaskCountFromWait.set(1); auto &csr = pDevice->getUltCommandStreamReceiver(); diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp index 71acd02e3e..f703f18cc3 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp @@ -530,6 +530,38 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, givenCsrInBatchingModeWhenFlushTas EXPECT_EQ(1u, mockCsr->peekLatestFlushedTaskCount()); } +HWTEST_F(CommandStreamReceiverFlushTaskTests, givenUpdateTaskCountFromWaitWhenFlushBatchedIsCalledThenFlushedTaskCountIsNotModifed) { + DebugManagerStateRestore restorer; + DebugManager.flags.UpdateTaskCountFromWait.set(1); + + auto mockCsr = new MockCsrHw2(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + pDevice->resetCommandStreamReceiver(mockCsr); + mockCsr->useNewResourceImplicitFlush = false; + mockCsr->useGpuIdleImplicitFlush = false; + mockCsr->overrideDispatchPolicy(DispatchMode::BatchedDispatch); + + DispatchFlags dispatchFlags = DispatchFlagsHelper::createDefaultDispatchFlags(); + dispatchFlags.preemptionMode = PreemptionHelper::getDefaultPreemptionMode(pDevice->getHardwareInfo()); + dispatchFlags.guardCommandBufferWithPipeControl = true; + + mockCsr->flushTask(commandStream, + 0, + dsh, + ioh, + ssh, + taskLevel, + dispatchFlags, + *pDevice); + + EXPECT_EQ(1u, mockCsr->peekLatestSentTaskCount()); + EXPECT_EQ(0u, mockCsr->peekLatestFlushedTaskCount()); + + mockCsr->flushBatchedSubmissions(); + + EXPECT_EQ(1u, mockCsr->peekLatestSentTaskCount()); + EXPECT_EQ(0u, mockCsr->peekLatestFlushedTaskCount()); +} + HWTEST_F(CommandStreamReceiverFlushTaskTests, givenCsrInDefaultModeWhenFlushTaskIsCalledThenFlushedTaskCountIsModifed) { CommandQueueHw commandQueue(nullptr, pClDevice, 0, false); auto &commandStream = commandQueue.getCS(4096u); @@ -1024,12 +1056,15 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, givenUpdateTaskCountFromWaitSetWhe DebugManager.flags.UpdateTaskCountFromWait.set(1); CommandQueueHw commandQueue(nullptr, pClDevice, 0, false); + commandQueue.taskCount = 10; auto mockCsr = new MockCsrHw2(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); pDevice->resetCommandStreamReceiver(mockCsr); mockCsr->useNewResourceImplicitFlush = false; mockCsr->useGpuIdleImplicitFlush = false; mockCsr->overrideDispatchPolicy(DispatchMode::BatchedDispatch); + mockCsr->taskCount.store(10); + mockCsr->latestFlushedTaskCount.store(5); commandQueue.waitForAllEngines(false, nullptr); @@ -1052,12 +1087,15 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, givenEnabledDirectSubmissionUpdate }; CommandQueueHw commandQueue(nullptr, pClDevice, 0, false); + commandQueue.taskCount = 10; auto mockCsr = new MockCsrHwDirectSubmission(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); pDevice->resetCommandStreamReceiver(mockCsr); mockCsr->useNewResourceImplicitFlush = false; mockCsr->useGpuIdleImplicitFlush = false; mockCsr->overrideDispatchPolicy(DispatchMode::BatchedDispatch); + mockCsr->taskCount.store(10); + mockCsr->latestFlushedTaskCount.store(5); commandQueue.waitForAllEngines(false, nullptr); diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_tests_xehp_and_later.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_tests_xehp_and_later.cpp index 71381a3d47..0c037dcc98 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_tests_xehp_and_later.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_tests_xehp_and_later.cpp @@ -917,6 +917,27 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPAndLaterMultiTile verifyActivePartitionConfig(commandStreamReceiver, true); } +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPAndLaterMultiTileTests, + givenMultipleStaticActivePartitionsWhenFlushingTagUpdateThenExpectTagUpdatePipeControlWithPartitionFlagOnAndActivePartitionConfig) { + DebugManagerStateRestore restorer; + DebugManager.flags.UpdateTaskCountFromWait.set(1); + + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + if (pDevice->getPreemptionMode() == PreemptionMode::MidThread || pDevice->isDebuggerActive()) { + commandStreamReceiver.createPreemptionAllocation(); + } + EXPECT_EQ(1u, commandStreamReceiver.activePartitionsConfig); + commandStreamReceiver.activePartitions = 2; + commandStreamReceiver.taskCount = 3; + EXPECT_TRUE(commandStreamReceiver.staticWorkPartitioningEnabled); + flushTask(commandStreamReceiver, true); + commandStreamReceiver.flushTagUpdate(); + EXPECT_EQ(2u, commandStreamReceiver.activePartitionsConfig); + + prepareLinearStream(commandStream, 0); + verifyPipeControl(commandStreamReceiver, 4, true); +} + HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPAndLaterMultiTileTests, givenMultipleDynamicActivePartitionsWhenFlushingTaskThenExpectTagUpdatePipeControlWithoutPartitionFlagOnAndNoActivePartitionConfig) { auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); @@ -936,6 +957,29 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPAndLaterMultiTile verifyActivePartitionConfig(commandStreamReceiver, false); } +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPAndLaterMultiTileTests, + givenMultipleDynamicActivePartitionsWhenFlushingTagUpdateThenExpectTagUpdatePipeControlWithoutPartitionFlagOnAndNoActivePartitionConfig) { + DebugManagerStateRestore restorer; + DebugManager.flags.UpdateTaskCountFromWait.set(1); + + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + if (pDevice->getPreemptionMode() == PreemptionMode::MidThread || pDevice->isDebuggerActive()) { + commandStreamReceiver.createPreemptionAllocation(); + } + commandStreamReceiver.activePartitions = 2; + commandStreamReceiver.taskCount = 3; + commandStreamReceiver.staticWorkPartitioningEnabled = false; + flushTask(commandStreamReceiver, true); + commandStreamReceiver.flushTagUpdate(); + EXPECT_EQ(2u, commandStreamReceiver.activePartitionsConfig); + + prepareLinearStream(commandStream, 0); + verifyPipeControl(commandStreamReceiver, 4, false); + + prepareLinearStream(commandStreamReceiver.commandStream, 0); + verifyActivePartitionConfig(commandStreamReceiver, false); +} + HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPAndLaterMultiTileTests, givenSingleStaticActivePartitionWhenFlushingTaskThenExpectTagUpdatePipeControlWithoutPartitionFlagOnAndNoActivePartitionConfig) { auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp index dd09afb406..1daa9f0f0a 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp @@ -7,6 +7,7 @@ #include "shared/source/command_stream/scratch_space_controller_base.h" #include "shared/source/direct_submission/dispatchers/blitter_dispatcher.h" +#include "shared/test/common/helpers/debug_manager_state_restore.h" #include "shared/test/common/helpers/engine_descriptor_helper.h" #include "shared/test/common/helpers/ult_hw_config.h" #include "shared/test/common/mocks/mock_allocation_properties.h" @@ -348,6 +349,37 @@ HWTEST_F(BcsTests, whenBlitBufferThenCommandBufferHasProperTaskCount) { EXPECT_EQ(csr.getCS(0u).getGraphicsAllocation()->getResidencyTaskCount(csr.getOsContext().getContextId()), csr.peekTaskCount()); } +HWTEST_F(BcsTests, givenUpdateTaskCountFromWaitWhenBlitBufferThenCsrHasProperTaskCounts) { + DebugManagerStateRestore restorer; + DebugManager.flags.UpdateTaskCountFromWait.set(1); + + auto &csr = pDevice->getUltCommandStreamReceiver(); + + cl_int retVal = CL_SUCCESS; + auto buffer = clUniquePtr(Buffer::create(context.get(), CL_MEM_READ_WRITE, 1, nullptr, retVal)); + + constexpr size_t hostAllocationSize = MemoryConstants::pageSize; + auto hostAllocationPtr = allocateAlignedMemory(hostAllocationSize, MemoryConstants::pageSize); + void *hostPtr = reinterpret_cast(hostAllocationPtr.get()); + + auto graphicsAllocation = buffer->getGraphicsAllocation(pDevice->getRootDeviceIndex()); + + auto blitProperties = BlitProperties::constructPropertiesForReadWrite(BlitterConstants::BlitDirection::HostPtrToBuffer, + csr, graphicsAllocation, nullptr, hostPtr, + graphicsAllocation->getGpuAddress(), 0, + 0, 0, {1, 1, 1}, 0, 0, 0, 0); + + BlitPropertiesContainer blitPropertiesContainer; + blitPropertiesContainer.push_back(blitProperties); + + auto taskCount = csr.peekTaskCount(); + + csr.blitBuffer(blitPropertiesContainer, false, false, *pDevice); + + EXPECT_EQ(csr.peekTaskCount(), taskCount + 1); + EXPECT_EQ(csr.peekLatestFlushedTaskCount(), taskCount); +} + HWTEST_F(BcsTests, givenProfilingEnabledWhenBlitBufferThenCommandBufferIsConstructedProperly) { auto bcsOsContext = std::unique_ptr(OsContext::create(nullptr, 0, EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::Regular}, pDevice->getDeviceBitfield()))); diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_tests.cpp index d2ac21f6e7..5b8e41a21f 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_tests.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_tests.cpp @@ -318,6 +318,30 @@ TEST(CommandStreamReceiverSimpleTest, givenCsrWhenSubmitiingBatchBufferThenTaskC executionEnvironment.memoryManager->freeGraphicsMemoryImpl(commandBuffer); } +HWTEST_F(CommandStreamReceiverTest, givenUpdateTaskCountFromWaitWhenSubmitiingBatchBufferThenTaskCountIsIncrementedAndLatestsValuesSetCorrectly) { + DebugManagerStateRestore restorer; + DebugManager.flags.UpdateTaskCountFromWait.set(1); + + MockCsrHw csr(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + + GraphicsAllocation *commandBuffer = memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{csr.getRootDeviceIndex(), MemoryConstants::pageSize}); + ASSERT_NE(nullptr, commandBuffer); + LinearStream cs(commandBuffer); + + BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 0, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; + ResidencyContainer residencyList; + + auto previousTaskCount = csr.peekTaskCount(); + auto currentTaskCount = previousTaskCount + 1; + csr.submitBatchBuffer(batchBuffer, residencyList); + + EXPECT_EQ(currentTaskCount, csr.peekTaskCount()); + EXPECT_EQ(previousTaskCount, csr.peekLatestFlushedTaskCount()); + EXPECT_EQ(currentTaskCount, csr.peekLatestSentTaskCount()); + + memoryManager->freeGraphicsMemoryImpl(commandBuffer); +} + HWTEST_F(CommandStreamReceiverTest, givenOverrideCsrAllocationSizeWhenCreatingCommandStreamCsrGraphicsAllocationThenAllocationHasCorrectSize) { DebugManagerStateRestore restore; diff --git a/opencl/test/unit_test/kernel/kernel_tests.cpp b/opencl/test/unit_test/kernel/kernel_tests.cpp index 816012b61f..1c984e1c5f 100644 --- a/opencl/test/unit_test/kernel/kernel_tests.cpp +++ b/opencl/test/unit_test/kernel/kernel_tests.cpp @@ -472,6 +472,7 @@ class CommandStreamReceiverMock : public CommandStreamReceiver { void flushTagUpdate() override{}; void flushNonKernelTask(GraphicsAllocation *eventAlloc, uint64_t immediateGpuAddress, uint64_t immediateData, PipeControlArgs &args, bool isWaitOnEvents, bool startOfDispatch, bool endOfDispatch) override{}; void updateTagFromWait() override{}; + bool isUpdateTagFromWaitEnabled() override { return false; }; bool isMultiOsContextCapable() const override { return false; } diff --git a/shared/source/command_stream/command_stream_receiver.cpp b/shared/source/command_stream/command_stream_receiver.cpp index f26243fb22..29c5260929 100644 --- a/shared/source/command_stream/command_stream_receiver.cpp +++ b/shared/source/command_stream/command_stream_receiver.cpp @@ -89,7 +89,9 @@ int CommandStreamReceiver::submitBatchBuffer(BatchBuffer &batchBuffer, Residency this->latestSentTaskCount = taskCount + 1; auto flushed = this->flush(batchBuffer, allocationsForResidency); - this->latestFlushedTaskCount = taskCount + 1; + if (!isUpdateTagFromWaitEnabled()) { + this->latestFlushedTaskCount = taskCount + 1; + } taskCount++; return !flushed; @@ -261,10 +263,6 @@ void CommandStreamReceiver::cleanupResources() { } bool CommandStreamReceiver::waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait) { - if (this->latestSentTaskCount < taskCountToWait) { - this->flushTagUpdate(); - } - uint32_t latestSentTaskCount = this->latestFlushedTaskCount; if (latestSentTaskCount < taskCountToWait) { if (!this->flushBatchedSubmissions()) { @@ -279,7 +277,13 @@ bool CommandStreamReceiver::baseWaitFunction(volatile uint32_t *pollAddress, boo std::chrono::high_resolution_clock::time_point time1, time2; int64_t timeDiff = 0; + uint32_t latestSentTaskCount = this->latestFlushedTaskCount; + if (latestSentTaskCount < taskCountToWait) { + this->flushTagUpdate(); + } + volatile uint32_t *partitionAddress = pollAddress; + time1 = std::chrono::high_resolution_clock::now(); for (uint32_t i = 0; i < activePartitions; i++) { while (*partitionAddress < taskCountToWait && timeDiff <= timeoutMicroseconds) { diff --git a/shared/source/command_stream/command_stream_receiver.h b/shared/source/command_stream/command_stream_receiver.h index 4253291457..618e16cece 100644 --- a/shared/source/command_stream/command_stream_receiver.h +++ b/shared/source/command_stream/command_stream_receiver.h @@ -220,6 +220,7 @@ class CommandStreamReceiver { virtual void flushTagUpdate() = 0; virtual void flushNonKernelTask(GraphicsAllocation *eventAlloc, uint64_t immediateGpuAddress, uint64_t immediateData, PipeControlArgs &args, bool isWaitOnEvents, bool isStartOfDispatch, bool isEndOfDispatch) = 0; virtual void updateTagFromWait() = 0; + virtual bool isUpdateTagFromWaitEnabled() = 0; ScratchSpaceController *getScratchSpaceController() const { return scratchSpaceController.get(); diff --git a/shared/source/command_stream/command_stream_receiver_hw.h b/shared/source/command_stream/command_stream_receiver_hw.h index 64e89fe7e8..118e0af02f 100644 --- a/shared/source/command_stream/command_stream_receiver_hw.h +++ b/shared/source/command_stream/command_stream_receiver_hw.h @@ -108,7 +108,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver { size_t commandStreamStartTask); void flushHandler(BatchBuffer &batchBuffer, ResidencyContainer &allocationsForResidency); - bool isUpdateTagFromWaitEnabled(); + bool isUpdateTagFromWaitEnabled() override; void updateTagFromWait() override; bool isMultiOsContextCapable() const override; diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index 6c396f972c..89a5a74b7d 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -789,7 +789,10 @@ inline bool CommandStreamReceiverHw::flushBatchedSubmissions() { flushStampUpdateHelper.updateAll(flushStamp->peekStamp()); - this->latestFlushedTaskCount = lastTaskCount; + if (!isUpdateTagFromWaitEnabled()) { + this->latestFlushedTaskCount = lastTaskCount; + } + this->makeSurfacePackNonResident(surfacesForSubmit); resourcePackage.clear(); } @@ -882,8 +885,6 @@ inline void CommandStreamReceiverHw::emitNoop(LinearStream &commandSt template inline void CommandStreamReceiverHw::waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode) { - updateTagFromWait(); - int64_t waitTimeout = 0; bool enableTimeout = false; @@ -1088,14 +1089,18 @@ uint32_t CommandStreamReceiverHw::blitBuffer(const BlitPropertiesCont BlitCommandsHelper::programGlobalSequencerFlush(commandStream); - MemorySynchronizationCommands::addAdditionalSynchronization(commandStream, tagAllocation->getGpuAddress(), peekHwInfo()); + auto updateTag = !isUpdateTagFromWaitEnabled(); + updateTag |= blocking; + if (updateTag) { + MemorySynchronizationCommands::addAdditionalSynchronization(commandStream, tagAllocation->getGpuAddress(), peekHwInfo()); - MiFlushArgs args; - args.commandWithPostSync = true; - args.notifyEnable = isUsedNotifyEnableForPostSync(); - EncodeMiFlushDW::programMiFlushDw(commandStream, tagAllocation->getGpuAddress(), newTaskCount, args); + MiFlushArgs args; + args.commandWithPostSync = true; + args.notifyEnable = isUsedNotifyEnableForPostSync(); + EncodeMiFlushDW::programMiFlushDw(commandStream, tagAllocation->getGpuAddress(), newTaskCount, args); - MemorySynchronizationCommands::addAdditionalSynchronization(commandStream, tagAllocation->getGpuAddress(), peekHwInfo()); + MemorySynchronizationCommands::addAdditionalSynchronization(commandStream, tagAllocation->getGpuAddress(), peekHwInfo()); + } if (PauseOnGpuProperties::pauseModeAllowed(DebugManager.flags.PauseOnBlitCopy.get(), taskCount, PauseOnGpuProperties::PauseMode::AfterWorkload)) { BlitCommandsHelper::dispatchDebugPauseCommands(commandStream, getDebugPauseStateGPUAddress(), DebugPauseState::waitingForUserEndConfirmation, DebugPauseState::hasUserEndConfirmation); @@ -1129,7 +1134,10 @@ uint32_t CommandStreamReceiverHw::blitBuffer(const BlitPropertiesCont flush(batchBuffer, getResidencyAllocations()); makeSurfacePackNonResident(getResidencyAllocations()); - latestFlushedTaskCount = newTaskCount; + if (!isUpdateTagFromWaitEnabled()) { + latestFlushedTaskCount = newTaskCount; + } + taskCount = newTaskCount; auto flushStampToWait = flushStamp->peekStamp(); @@ -1145,7 +1153,7 @@ uint32_t CommandStreamReceiverHw::blitBuffer(const BlitPropertiesCont template inline void CommandStreamReceiverHw::flushTagUpdate() { if (this->osContext != nullptr) { - if (this->osContext->getEngineType() == aub_stream::ENGINE_BCS) { + if (EngineHelpers::isBcs(this->osContext->getEngineType())) { this->flushMiFlushDW(); } else { this->flushPipeControl(); @@ -1176,11 +1184,12 @@ inline void CommandStreamReceiverHw::flushMiFlushDW() { MiFlushArgs args; args.commandWithPostSync = true; args.notifyEnable = isUsedNotifyEnableForPostSync(); - EncodeMiFlushDW::programMiFlushDw(commandStream, tagAllocation->getGpuAddress(), taskCount, args); + EncodeMiFlushDW::programMiFlushDw(commandStream, tagAllocation->getGpuAddress(), taskCount + 1, args); makeResident(*tagAllocation); this->flushSmallTask(commandStream, commandStreamStart); + this->latestFlushedTaskCount = taskCount.load(); } template @@ -1215,8 +1224,9 @@ void CommandStreamReceiverHw::flushPipeControl() { PipeControlArgs args(true); args.notifyEnable = isUsedNotifyEnableForPostSync(); + args.workloadPartitionOffset = this->activePartitions > 1 && this->staticWorkPartitioningEnabled; MemorySynchronizationCommands::addPipeControlAndProgramPostSyncOperation(commandStream, - PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, + PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, getTagAllocation()->getGpuAddress(), taskCount + 1, peekHwInfo(), @@ -1225,10 +1235,7 @@ void CommandStreamReceiverHw::flushPipeControl() { makeResident(*tagAllocation); this->flushSmallTask(commandStream, commandStreamStart); - - this->latestFlushedTaskCount = taskCount + 1; - this->latestSentTaskCount = taskCount + 1; - taskCount++; + this->latestFlushedTaskCount = taskCount.load(); } template @@ -1321,7 +1328,9 @@ void CommandStreamReceiverHw::flushSmallTask(LinearStream &commandStr BatchBuffer batchBuffer{commandStreamTask.getGraphicsAllocation(), commandStreamStartTask, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, commandStreamTask.getUsed(), &commandStreamTask, endingCmdPtr, false}; + this->latestSentTaskCount = taskCount + 1; flushHandler(batchBuffer, getResidencyAllocations()); + taskCount++; } template diff --git a/shared/test/common/mocks/mock_command_stream_receiver.h b/shared/test/common/mocks/mock_command_stream_receiver.h index 26e81637f4..ac6bb97878 100644 --- a/shared/test/common/mocks/mock_command_stream_receiver.h +++ b/shared/test/common/mocks/mock_command_stream_receiver.h @@ -59,6 +59,7 @@ class MockCommandStreamReceiver : public CommandStreamReceiver { void flushTagUpdate() override{}; void flushNonKernelTask(GraphicsAllocation *eventAlloc, uint64_t immediateGpuAddress, uint64_t immediateData, PipeControlArgs &args, bool isWaitOnEvents, bool startOfDispatch, bool endOfDispatch) override{}; void updateTagFromWait() override{}; + bool isUpdateTagFromWaitEnabled() override { return false; }; bool isMultiOsContextCapable() const override { return multiOsContextCapable; } @@ -164,6 +165,7 @@ class MockCsrHw2 : public CommandStreamReceiverHw { using CommandStreamReceiver::globalFenceAllocation; using CommandStreamReceiver::isPreambleSent; using CommandStreamReceiver::lastSentCoherencyRequest; + using CommandStreamReceiver::latestFlushedTaskCount; using CommandStreamReceiver::mediaVfeStateDirty; using CommandStreamReceiver::nTo1SubmissionModelEnabled; using CommandStreamReceiver::pageTableManagerInitialized;