From c78c1515deafda5f4e8d1cf71965b5e2eabafcc5 Mon Sep 17 00:00:00 2001 From: Szymon Morek Date: Thu, 16 Oct 2025 14:35:52 +0000 Subject: [PATCH] performance: reuse cmd buffer without dc flush Related-To: NEO-16348 Signed-off-by: Szymon Morek --- .../command_queue/command_queue_tests.cpp | 1 + .../command_stream_receiver.cpp | 6 +++ .../command_stream/command_stream_receiver.h | 4 ++ .../command_stream_receiver_hw.h | 2 + .../command_stream_receiver_hw_base.inl | 23 ++++++++++- .../command_stream/tag_allocation_layout.h | 3 +- .../memory_manager/allocations_list.cpp | 14 ++++++- .../libult/ult_command_stream_receiver.h | 2 + .../command_container_tests.cpp | 3 +- .../command_stream_receiver_tests.cpp | 39 ++++++++++++++++++- 10 files changed, 90 insertions(+), 7 deletions(-) diff --git a/opencl/test/unit_test/command_queue/command_queue_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_tests.cpp index d5a9c0e783..bfe456e096 100644 --- a/opencl/test/unit_test/command_queue/command_queue_tests.cpp +++ b/opencl/test/unit_test/command_queue/command_queue_tests.cpp @@ -940,6 +940,7 @@ HWTEST_F(CommandQueueTests, givenMultipleCommandQueuesWhenMarkerIsEmittedThenGra std::unique_ptr commandQ(new MockCommandQueue(&context, device.get(), 0, false)); *device->getDefaultEngine().commandStreamReceiver->getTagAddress() = commandQ->getHeaplessStateInitEnabled() ? 1 : 0; + *device->getDefaultEngine().commandStreamReceiver->getUcTagAddress() = commandQ->getHeaplessStateInitEnabled() ? 
1 : 0; commandQ->enqueueMarkerWithWaitList(0, nullptr, nullptr); commandQ->enqueueMarkerWithWaitList(0, nullptr, nullptr); diff --git a/shared/source/command_stream/command_stream_receiver.cpp b/shared/source/command_stream/command_stream_receiver.cpp index fcea2ad29a..0947d6ff97 100644 --- a/shared/source/command_stream/command_stream_receiver.cpp +++ b/shared/source/command_stream/command_stream_receiver.cpp @@ -546,6 +546,7 @@ void CommandStreamReceiver::setTagAllocation(GraphicsAllocation *allocation) { this->tagAddress = reinterpret_cast(allocation->getUnderlyingBuffer()); this->debugPauseStateAddress = reinterpret_cast( reinterpret_cast(allocation->getUnderlyingBuffer()) + TagAllocationLayout::debugPauseStateAddressOffset); + this->ucTagAddress = static_cast(ptrOffset(allocation->getUnderlyingBuffer(), TagAllocationLayout::ucTagAddressOffset)); } MultiGraphicsAllocation &CommandStreamReceiver::createMultiAllocationInSystemMemoryPool(AllocationType allocationType) { @@ -877,14 +878,18 @@ bool CommandStreamReceiver::initializeTagAllocation() { } this->setTagAllocation(tagAllocation); + auto initValue = debugManager.flags.EnableNullHardware.get() ? 
static_cast(-1) : initialHardwareTag; auto tagAddress = this->tagAddress; + auto ucTagAddress = this->ucTagAddress; auto completionFence = reinterpret_cast(getCompletionAddress()); UNRECOVERABLE_IF(!completionFence); uint32_t subDevices = static_cast(this->deviceBitfield.count()); for (uint32_t i = 0; i < subDevices; i++) { *tagAddress = initValue; tagAddress = ptrOffset(tagAddress, this->immWritePostSyncWriteOffset); + *ucTagAddress = initValue; + ucTagAddress = ptrOffset(ucTagAddress, this->immWritePostSyncWriteOffset); *completionFence = 0; completionFence = ptrOffset(completionFence, this->immWritePostSyncWriteOffset); } @@ -1207,6 +1212,7 @@ TaskCountType CompletionStamp::getTaskCountFromSubmissionStatusError(SubmissionS } uint64_t CommandStreamReceiver::getBarrierCountGpuAddress() const { return ptrOffset(this->tagAllocation->getGpuAddress(), TagAllocationLayout::barrierCountOffset); } uint64_t CommandStreamReceiver::getDebugPauseStateGPUAddress() const { return tagAllocation->getGpuAddress() + TagAllocationLayout::debugPauseStateAddressOffset; } +uint64_t CommandStreamReceiver::getUcTagGPUAddress() const { return tagAllocation->getGpuAddress() + TagAllocationLayout::ucTagAddressOffset; } uint64_t CommandStreamReceiver::getCompletionAddress() const { uint64_t completionFenceAddress = castToUint64(const_cast(tagAddress)); if (completionFenceAddress == 0) { diff --git a/shared/source/command_stream/command_stream_receiver.h b/shared/source/command_stream/command_stream_receiver.h index 4ba8aa78f1..a4c0b88a34 100644 --- a/shared/source/command_stream/command_stream_receiver.h +++ b/shared/source/command_stream/command_stream_receiver.h @@ -166,9 +166,11 @@ class CommandStreamReceiver : NEO::NonCopyableAndNonMovableClass { TaskCountType getNextBarrierCount() { return this->barrierCount.fetch_add(1u); } TaskCountType peekBarrierCount() const { return this->barrierCount.load(); } volatile TagAddressType *getTagAddress() const { return tagAddress; } + volatile 
TagAddressType *getUcTagAddress() const { return ucTagAddress; } volatile TagAddressType *getBarrierCountTagAddress() const { return this->barrierCountTagAddress; } uint64_t getBarrierCountGpuAddress() const; uint64_t getDebugPauseStateGPUAddress() const; + uint64_t getUcTagGPUAddress() const; virtual bool waitForFlushStamp(FlushStamp &flushStampToWait) { return true; } @@ -627,6 +629,7 @@ class CommandStreamReceiver : NEO::NonCopyableAndNonMovableClass { uint64_t totalMemoryUsed = 0u; volatile TagAddressType *tagAddress = nullptr; + volatile TagAddressType *ucTagAddress = nullptr; volatile TagAddressType *barrierCountTagAddress = nullptr; volatile DebugPauseState *debugPauseStateAddress = nullptr; SpinLock debugPauseStateLock; @@ -694,6 +697,7 @@ class CommandStreamReceiver : NEO::NonCopyableAndNonMovableClass { bool gsbaStateDirty = true; bool bindingTableBaseAddressRequired = false; bool heapStorageRequiresRecyclingTag = false; + bool ucResourceRequiresTagUpdate = false; bool mediaVfeStateDirty = true; bool stateComputeModeDirty = true; bool btdCommandDirty = true; diff --git a/shared/source/command_stream/command_stream_receiver_hw.h b/shared/source/command_stream/command_stream_receiver_hw.h index fbb857def9..82ecdca12c 100644 --- a/shared/source/command_stream/command_stream_receiver_hw.h +++ b/shared/source/command_stream/command_stream_receiver_hw.h @@ -276,6 +276,8 @@ class CommandStreamReceiverHw : public CommandStreamReceiver { bool areMultipleSubDevicesInContext, bool setGeneralStateBaseAddress); + inline void emitTagUpdateWithoutDCFlush(LinearStream &commandStream); + inline void processBarrierWithPostSync(LinearStream &commandStreamTask, DispatchFlags &dispatchFlags, bool &levelClosed, diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index 83a6d387e3..bdf3727b06 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ 
b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -442,7 +442,9 @@ CompletionStamp CommandStreamReceiverHw::flushTaskHeapful( if (detectInitProgrammingFlagsRequired(dispatchFlags)) { initProgrammingFlags(); } - + if (this->ucResourceRequiresTagUpdate) { + this->emitTagUpdateWithoutDCFlush(commandStreamTask); + } const auto &hwInfo = peekHwInfo(); bool hasStallingCmdsOnTaskStream = false; @@ -1842,6 +1844,25 @@ inline void CommandStreamReceiverHw::programStateBaseAddressCommon( } } +template +inline void CommandStreamReceiverHw::emitTagUpdateWithoutDCFlush(LinearStream &commandStream) { + auto &rootDeviceEnvironment = this->peekRootDeviceEnvironment(); + auto address = this->getUcTagGPUAddress(); + + PipeControlArgs args = {}; + args.notifyEnable = isUsedNotifyEnableForPostSync(); + MemorySynchronizationCommands::addBarrierWithPostSyncOperation( + commandStream, + PostSyncMode::immediateData, + address, + taskCount + 1, + rootDeviceEnvironment, + args); + + makeResident(*tagAllocation); + this->ucResourceRequiresTagUpdate = false; +} + template inline void CommandStreamReceiverHw::processBarrierWithPostSync(LinearStream &commandStreamTask, DispatchFlags &dispatchFlags, bool &levelClosed, void *&currentPipeControlForNooping, void *&epiloguePipeControlLocation, bool &hasStallingCmdsOnTaskStream, PipeControlArgs &args) { diff --git a/shared/source/command_stream/tag_allocation_layout.h b/shared/source/command_stream/tag_allocation_layout.h index 889b1a813f..4ac5438712 100644 --- a/shared/source/command_stream/tag_allocation_layout.h +++ b/shared/source/command_stream/tag_allocation_layout.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022-2023 Intel Corporation + * Copyright (C) 2022-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -11,6 +11,7 @@ namespace NEO { namespace TagAllocationLayout { inline constexpr uint64_t debugPauseStateAddressOffset = MemoryConstants::kiloByte; +inline constexpr uint64_t ucTagAddressOffset = MemoryConstants::kiloByte + 
MemoryConstants::cacheLineSize; inline constexpr uint64_t completionFenceOffset = 2 * MemoryConstants::kiloByte; inline constexpr uint64_t barrierCountOffset = 3 * MemoryConstants::kiloByte; } // namespace TagAllocationLayout diff --git a/shared/source/memory_manager/allocations_list.cpp b/shared/source/memory_manager/allocations_list.cpp index 8822e84e4c..2beb283483 100644 --- a/shared/source/memory_manager/allocations_list.cpp +++ b/shared/source/memory_manager/allocations_list.cpp @@ -20,6 +20,7 @@ struct ReusableAllocationRequirements { if (csr) { csrTagAddress = csr->getTagAddress(); + csrUcTagAddress = csr->getUcTagAddress(); contextId = csr->getOsContext().getContextId(); rootDeviceIndex = csr->getRootDeviceIndex(); deviceBitfield = csr->getOsContext().getDeviceBitfield(); @@ -30,6 +31,7 @@ struct ReusableAllocationRequirements { const void *requiredPtr = nullptr; size_t requiredMinimalSize = 0; volatile TagAddressType *csrTagAddress = nullptr; + volatile TagAddressType *csrUcTagAddress = nullptr; NEO::AllocationType allocationType = NEO::AllocationType::unknown; NEO::DeviceBitfield deviceBitfield = 1; uint32_t contextId = std::numeric_limits::max(); @@ -38,8 +40,7 @@ struct ReusableAllocationRequirements { bool forceSystemMemoryFlag = false; }; -bool checkTagAddressReady(ReusableAllocationRequirements *requirements, NEO::GraphicsAllocation *gfxAllocation) { - auto tagAddress = requirements->csrTagAddress; +bool checkTagAddressReady(ReusableAllocationRequirements *requirements, NEO::GraphicsAllocation *gfxAllocation, volatile TagAddressType *tagAddress) { auto taskCount = gfxAllocation->getTaskCount(requirements->contextId); for (uint32_t count = 0; count < requirements->deviceBitfield.count(); count++) { if (*tagAddress < taskCount) { @@ -50,6 +51,15 @@ bool checkTagAddressReady(ReusableAllocationRequirements *requirements, NEO::Gra return true; } + +bool checkTagAddressReady(ReusableAllocationRequirements *requirements, NEO::GraphicsAllocation 
*gfxAllocation) { + if (requirements->allocationType == NEO::AllocationType::commandBuffer) { + if (checkTagAddressReady(requirements, gfxAllocation, requirements->csrUcTagAddress)) { + return true; + } + } + return checkTagAddressReady(requirements, gfxAllocation, requirements->csrTagAddress); +} } // namespace namespace NEO { diff --git a/shared/test/common/libult/ult_command_stream_receiver.h b/shared/test/common/libult/ult_command_stream_receiver.h index 17e3d2a800..30934a5c6b 100644 --- a/shared/test/common/libult/ult_command_stream_receiver.h +++ b/shared/test/common/libult/ult_command_stream_receiver.h @@ -102,6 +102,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw { using BaseClass::sshState; using BaseClass::staticWorkPartitioningEnabled; using BaseClass::streamProperties; + using BaseClass::ucResourceRequiresTagUpdate; using BaseClass::wasSubmittedToSingleSubdevice; using BaseClass::CommandStreamReceiver::activePartitions; using BaseClass::CommandStreamReceiver::activePartitionsConfig; @@ -173,6 +174,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw { using BaseClass::CommandStreamReceiver::timestampPacketAllocator; using BaseClass::CommandStreamReceiver::timestampPacketWriteEnabled; using BaseClass::CommandStreamReceiver::timeStampPostSyncWriteOffset; + using BaseClass::CommandStreamReceiver::ucTagAddress; using BaseClass::CommandStreamReceiver::use4GbHeaps; using BaseClass::CommandStreamReceiver::useGpuIdleImplicitFlush; using BaseClass::CommandStreamReceiver::useNewResourceImplicitFlush; diff --git a/shared/test/unit_test/command_container/command_container_tests.cpp b/shared/test/unit_test/command_container/command_container_tests.cpp index eac6a0dbcc..8e32eaf337 100644 --- a/shared/test/unit_test/command_container/command_container_tests.cpp +++ b/shared/test/unit_test/command_container/command_container_tests.cpp @@ -973,6 +973,7 @@ HWTEST_F(CommandContainerTest, 
givenCmdContainerWhenReuseExistingCmdBufferWithAl auto cmdContainer = std::make_unique(); auto &csr = pDevice->getUltCommandStreamReceiver(); *csr.tagAddress = 0u; + *csr.ucTagAddress = 0u; AllocationsList allocList; cmdContainer->initialize(pDevice, &allocList, HeapSize::defaultHeapSize, false, false); @@ -993,7 +994,7 @@ HWTEST_F(CommandContainerTest, givenCmdContainerWhenReuseExistingCmdBufferWithAl HWTEST_F(CommandContainerTest, givenCmdContainerWhenReuseExistingCmdBufferWithAllocationInListAndCsrTaskCountSameAsAllocationThenReturnAlloc) { auto cmdContainer = std::make_unique(); auto &csr = pDevice->getUltCommandStreamReceiver(); - *csr.tagAddress = 10u; + *csr.ucTagAddress = 10u; AllocationsList allocList; cmdContainer->initialize(pDevice, &allocList, HeapSize::defaultHeapSize, false, false); diff --git a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp index a07177468f..b99e826113 100644 --- a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp +++ b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp @@ -2102,7 +2102,7 @@ TEST_F(ReducedAddrSpaceCommandStreamReceiverTest, EXPECT_EQ(1u, mockMemoryManager->allocateGraphicsMemoryForNonSvmHostPtrCalled); } -TEST_F(CommandStreamReceiverTest, givenMinimumSizeDoesNotExceedCurrentWhenCallingEnsureCommandBufferAllocationThenDoNotReallocate) { +HWTEST_F(CommandStreamReceiverTest, givenMinimumSizeDoesNotExceedCurrentWhenCallingEnsureCommandBufferAllocationThenDoNotReallocate) { GraphicsAllocation *allocation = memoryManager->allocateGraphicsMemoryWithProperties({commandStreamReceiver->getRootDeviceIndex(), 128u, AllocationType::commandBuffer, pDevice->getDeviceBitfield()}); LinearStream commandStream{allocation}; @@ -2113,6 +2113,7 @@ TEST_F(CommandStreamReceiverTest, givenMinimumSizeDoesNotExceedCurrentWhenCallin commandStreamReceiver->ensureCommandBufferAllocation(commandStream, 128u, 
0u); EXPECT_EQ(allocation, commandStream.getGraphicsAllocation()); EXPECT_EQ(MemoryConstants::pageSize, commandStream.getMaxAvailableSpace()); + EXPECT_FALSE(pDevice->getUltCommandStreamReceiver().ucResourceRequiresTagUpdate); memoryManager->freeGraphicsMemory(commandStream.getGraphicsAllocation()); } @@ -2126,7 +2127,7 @@ TEST_F(CommandStreamReceiverTest, givenMinimumSizeExceedsCurrentWhenCallingEnsur memoryManager->freeGraphicsMemory(commandStream.getGraphicsAllocation()); } -TEST_F(CommandStreamReceiverTest, givenMinimumSizeExceedsCurrentWhenCallingEnsureCommandBufferAllocationThenReallocateAndAlignSizeTo64kb) { +HWTEST_F(CommandStreamReceiverTest, givenMinimumSizeExceedsCurrentWhenCallingEnsureCommandBufferAllocationThenReallocateAndAlignSizeTo64kb) { GraphicsAllocation *allocation = memoryManager->allocateGraphicsMemoryWithProperties({commandStreamReceiver->getRootDeviceIndex(), 128u, AllocationType::commandBuffer, pDevice->getDeviceBitfield()}); LinearStream commandStream{allocation}; @@ -2139,6 +2140,7 @@ TEST_F(CommandStreamReceiverTest, givenMinimumSizeExceedsCurrentWhenCallingEnsur EXPECT_EQ(2 * MemoryConstants::pageSize64k, commandStream.getGraphicsAllocation()->getUnderlyingBufferSize()); EXPECT_EQ(2 * MemoryConstants::pageSize64k, commandStream.getMaxAvailableSpace()); + EXPECT_FALSE(pDevice->getUltCommandStreamReceiver().ucResourceRequiresTagUpdate); memoryManager->freeGraphicsMemory(commandStream.getGraphicsAllocation()); } @@ -5451,6 +5453,39 @@ HWTEST_F(CommandStreamReceiverHwTest, GivenFlushGuardBufferWithPipeControlWhenFl EXPECT_EQ(18u, commandStreamReceiver.peekLatestFlushedTaskCount()); } +HWTEST_F(CommandStreamReceiverHwTest, givenUcResourceRequiresTagUpdateWhenFlushTaskThenFlushTagUpdate) { + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + commandStreamReceiver.ucResourceRequiresTagUpdate = true; + auto offset = commandStream.getUsed(); + + commandStreamReceiver.flushTask(commandStream, + 0, + &dsh, + &ioh, + nullptr, 
+ taskLevel, + flushTaskFlags, + *pDevice); + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(cmdList, ptrOffset(commandStream.getCpuBase(), offset), (commandStream.getUsed() - offset))); + + auto itorPipeControl = find(cmdList.begin(), cmdList.end()); + if (MemorySynchronizationCommands::isBarrierWaRequired(pDevice->getRootDeviceEnvironment())) { + itorPipeControl++; + } + ASSERT_NE(itorPipeControl, cmdList.end()); + auto cmd = genCmdCast(*itorPipeControl); + EXPECT_FALSE(cmd->getDcFlushEnable()); + EXPECT_EQ(cmd->getPostSyncOperation(), FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA); + EXPECT_EQ(cmd->getImmediateData(), commandStreamReceiver.peekTaskCount()); + + uint64_t address = cmd->getAddressHigh(); + address <<= 32; + address |= cmd->getAddress(); + auto csrAddress = commandStreamReceiver.getUcTagGPUAddress(); + EXPECT_EQ(csrAddress, address); +} + HWTEST_F(CommandStreamReceiverHwTest, GivenFlushHeapStorageRequiresRecyclingTagWhenFlushTaskCalledThenExpectMonitorFenceFlagTrue) { auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); commandStreamReceiver.recordFlushedBatchBuffer = true;