performance(ocl): cmd buffer prealloc per cmdqueue

Add a mechanism to preallocate command buffer allocations in the command stream receiver's reusable allocations list for each initialized command queue. This should limit additional allocations during hot loops. It needs to be enabled in subsequent commits by setting the product helper method.

Related-To: NEO-8152
Signed-off-by: Dominik Dabek <dominik.dabek@intel.com>
2026-01-06 19:32:25 +08:00 · 2023-10-27 15:54:45 +00:00
parent cfbf6219fe
commit 39cf653959
13 changed files with 120 additions and 6 deletions
--- a/opencl/source/command_queue/command_queue.cpp
+++ b/opencl/source/command_queue/command_queue.cpp
@@ -152,6 +152,9 @@ CommandQueue::~CommandQueue() {
        if (NEO::Debugger::isDebugEnabled(isInternalUsage) && device->getDevice().getL0Debugger()) {
            device->getDevice().getL0Debugger()->notifyCommandQueueDestroyed(&device->getDevice());
        }
+        if (gpgpuEngine) {
+            gpgpuEngine->commandStreamReceiver->releasePreallocationRequest();
+        }
    }

    timestampPacketContainer.reset();
@@ -215,6 +218,7 @@ void CommandQueue::initializeGpgpuInternals() const {
    }

    gpgpuEngine->commandStreamReceiver->initializeResources();
+    gpgpuEngine->commandStreamReceiver->requestPreallocation();
    gpgpuEngine->commandStreamReceiver->initDirectSubmission();

    if (getCmdQueueProperties<cl_queue_properties>(propertiesVector.data(), CL_QUEUE_PROPERTIES) & static_cast<cl_queue_properties>(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) && !this->gpgpuEngine->commandStreamReceiver->isUpdateTagFromWaitEnabled()) {
--- a/opencl/test/unit_test/command_queue/command_queue_tests.cpp
+++ b/opencl/test/unit_test/command_queue/command_queue_tests.cpp
@@ -981,6 +981,30 @@ HWTEST_F(CommandQueueTests, givenNodeOrdinalSetWithCcsEngineWhenCreatingCommandQ
    delete pCmdQ;
 }

+// Verifies that initializing the gpgpu engine of a command queue registers a preallocation
+// request in the command stream receiver and that destroying the queue withdraws it again.
+HWTEST_F(CommandQueueTests, givenPreallocationsPerQueueWhenInitializeGpgpuCalledThenCSRRequestPreallocationIsCalled) {
+    auto clDevice = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get()));
+    MockContext mockContext(clDevice.get());
+    auto queue = std::make_unique<MockCommandQueueHw<FamilyType>>(&mockContext, clDevice.get(), nullptr);
+    auto &csr = clDevice->getUltCommandStreamReceiver<FamilyType>();
+    DebugManagerStateRestore stateRestore;
+    DebugManager.flags.SetAmountOfReusableAllocationsPerCmdQueue.set(1);
+
+    // Before initialization: nothing requested, nothing stored for reuse, nothing resident.
+    EXPECT_EQ(0u, csr.requestedPreallocationsAmount);
+    EXPECT_TRUE(csr.getAllocationsForReuse().peekIsEmpty());
+    EXPECT_EQ(0u, csr.getResidencyAllocations().size());
+
+    queue->initializeGpgpu();
+
+    // After initialization: one request registered and one allocation preallocated.
+    EXPECT_EQ(1u, csr.requestedPreallocationsAmount);
+    EXPECT_FALSE(csr.getAllocationsForReuse().peekIsEmpty());
+    EXPECT_EQ(1u, csr.getResidencyAllocations().size());
+
+    // After queue destruction: the request is withdrawn, the allocation stays on the reuse list.
+    queue.reset();
+    EXPECT_EQ(0u, csr.requestedPreallocationsAmount);
+    EXPECT_FALSE(csr.getAllocationsForReuse().peekIsEmpty());
+    EXPECT_EQ(1u, csr.getResidencyAllocations().size());
+}
+
 struct WaitForQueueCompletionTests : public ::testing::Test {
    template <typename Family>
    struct MyCmdQueue : public CommandQueueHw<Family> {
--- a/opencl/test/unit_test/event/event_tests.cpp
+++ b/opencl/test/unit_test/event/event_tests.cpp
@@ -1210,7 +1210,7 @@ HWTEST_F(EventTest, givenVirtualEventWhenCommandSubmittedThenLockCsrOccurs) {

    virtualEvent->submitCommand(false);

-    uint32_t expectedLockCounter = pDevice->getDefaultEngine().commandStreamReceiver->getClearColorAllocation() ? 4u : 3u;
+    uint32_t expectedLockCounter = pDevice->getDefaultEngine().commandStreamReceiver->getClearColorAllocation() ? 5u : 4u;

    EXPECT_EQ(expectedLockCounter, pDevice->getUltCommandStreamReceiver<FamilyType>().recursiveLockCounter);
 }
--- a/shared/source/command_stream/command_stream_receiver.cpp
+++ b/shared/source/command_stream/command_stream_receiver.cpp
@@ -258,18 +258,48 @@ void CommandStreamReceiver::ensureCommandBufferAllocation(LinearStream &commandS
    commandStream.replaceGraphicsAllocation(allocation);
 }

+// Allocates a single COMMAND_BUFFER allocation of MemoryConstants::pageSize64k bytes, stores it
+// on the reusable allocations list of the internal allocation storage and makes it resident.
+// When the memory manager returns no allocation, nothing is stored and the call is a no-op.
+void CommandStreamReceiver::preallocateCommandBuffer() {
+    const AllocationProperties commandStreamAllocationProperties{rootDeviceIndex, true, MemoryConstants::pageSize64k, AllocationType::COMMAND_BUFFER,
+                                                                 isMultiOsContextCapable(), false, deviceBitfield};
+    auto allocation = this->getMemoryManager()->allocateGraphicsMemoryWithProperties(commandStreamAllocationProperties);
+    if (allocation == nullptr) {
+        // Do not store or dereference a null allocation; just skip this preallocation.
+        return;
+    }
+    getInternalAllocationStorage()->storeAllocation(std::unique_ptr<GraphicsAllocation>(allocation), REUSABLE_ALLOCATION);
+    this->makeResident(*allocation);
+}
+
 void CommandStreamReceiver::fillReusableAllocationsList() {
    auto &gfxCoreHelper = getGfxCoreHelper();
    auto amountToFill = gfxCoreHelper.getAmountOfAllocationsToFill();
    for (auto i = 0u; i < amountToFill; i++) {
-        const AllocationProperties commandStreamAllocationProperties{rootDeviceIndex, true, MemoryConstants::pageSize64k, AllocationType::COMMAND_BUFFER,
-                                                                     isMultiOsContextCapable(), false, deviceBitfield};
-        auto allocation = this->getMemoryManager()->allocateGraphicsMemoryWithProperties(commandStreamAllocationProperties);
-        getInternalAllocationStorage()->storeAllocation(std::unique_ptr<GraphicsAllocation>(allocation), REUSABLE_ALLOCATION);
-        this->makeResident(*allocation);
+        preallocateCommandBuffer();
    }
 }

+// Registers the preallocation demand of one more command queue.
+// The per-queue amount comes from the product helper unless the
+// SetAmountOfReusableAllocationsPerCmdQueue debug flag is set to a value other than -1.
+// Under the unique ownership lock the requested counter is raised and, if it now exceeds the
+// amount preallocated so far, the missing command buffers are allocated.
+void CommandStreamReceiver::requestPreallocation() {
+    uint32_t perQueueAmount = getProductHelper().getCommandBuffersPreallocatedPerCommandQueue();
+    const auto debugOverride = DebugManager.flags.SetAmountOfReusableAllocationsPerCmdQueue.get();
+    if (debugOverride != -1) {
+        perQueueAmount = debugOverride;
+    }
+
+    auto ownership = obtainUniqueOwnership();
+    requestedPreallocationsAmount += perQueueAmount;
+
+    // Signed difference: it is not positive when earlier preallocations already cover the demand.
+    const int64_t missingAllocations = static_cast<int64_t>(requestedPreallocationsAmount.load()) - preallocatedAmount;
+    DEBUG_BREAK_IF(missingAllocations > perQueueAmount);
+    if (missingAllocations <= 0) {
+        return;
+    }
+
+    for (int64_t created = 0; created < missingAllocations; created++) {
+        preallocateCommandBuffer();
+    }
+    preallocatedAmount += static_cast<uint32_t>(missingAllocations);
+}
+
+// Withdraws the preallocation demand of one command queue from requestedPreallocationsAmount.
+// The per-queue amount is resolved the same way as in requestPreallocation(): product helper
+// value, overridden by the SetAmountOfReusableAllocationsPerCmdQueue debug flag when it is not -1.
+// Only the requested counter is lowered here; no allocation is freed by this method.
+void CommandStreamReceiver::releasePreallocationRequest() {
+    auto preallocationsPerQueue = getProductHelper().getCommandBuffersPreallocatedPerCommandQueue();
+    if (DebugManager.flags.SetAmountOfReusableAllocationsPerCmdQueue.get() != -1) {
+        preallocationsPerQueue = DebugManager.flags.SetAmountOfReusableAllocationsPerCmdQueue.get();
+    }
+    auto currentlyRequested = requestedPreallocationsAmount.load();
+    DEBUG_BREAK_IF(preallocationsPerQueue > currentlyRequested);
+    // Saturate at zero instead of letting the unsigned counter wrap around: a wrapped value would
+    // make the next requestPreallocation() compute a huge amount of buffers to allocate.
+    // compare_exchange_weak reloads currentlyRequested on failure, so the loop retries with fresh data.
+    uint32_t remaining = 0u;
+    do {
+        remaining = currentlyRequested > preallocationsPerQueue ? currentlyRequested - preallocationsPerQueue : 0u;
+    } while (!requestedPreallocationsAmount.compare_exchange_weak(currentlyRequested, remaining));
+}
+
 bool CommandStreamReceiver::initializeResources() {
    if (!resourcesInitialized) {
        auto lock = obtainUniqueOwnership();
--- a/shared/source/command_stream/command_stream_receiver.h
+++ b/shared/source/command_stream/command_stream_receiver.h
@@ -432,6 +432,10 @@ class CommandStreamReceiver {

    virtual bool waitUserFence(TaskCountType waitValue, uint64_t hostAddress, int64_t timeout) { return false; }

+    void requestPreallocation();
+    void releasePreallocationRequest();
+    void preallocateCommandBuffer();
+
  protected:
    void cleanupResources();
    void printDeviceIndex();
@@ -448,6 +452,9 @@ class CommandStreamReceiver {
    std::unique_ptr<FlatBatchBufferHelper> flatBatchBufferHelper;
    std::unique_ptr<ExperimentalCommandBuffer> experimentalCmdBuffer;
    std::unique_ptr<InternalAllocationStorage> internalAllocationStorage;
+    std::atomic<uint32_t> preallocatedAmount{0};
+    std::atomic<uint32_t> requestedPreallocationsAmount{0};
+
    std::unique_ptr<KmdNotifyHelper> kmdNotifyHelper;
    std::unique_ptr<ScratchSpaceController> scratchSpaceController;
    std::unique_ptr<TagAllocatorBase> profilingTimeStampAllocator;
--- a/shared/source/debug_settings/debug_variables_base.inl
+++ b/shared/source/debug_settings/debug_variables_base.inl
@@ -345,6 +345,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, SplitBcsMaskH2D, 0, "0: default, >0: bitmask: in
 DECLARE_DEBUG_VARIABLE(int32_t, SplitBcsMaskD2H, 0, "0: default, >0: bitmask: indicates bcs engines for D2H split")
 DECLARE_DEBUG_VARIABLE(int32_t, ReuseKernelBinaries, -1, "-1: default, 0:disabled, 1: enabled. If enabled, driver reuses kernel binaries.")
 DECLARE_DEBUG_VARIABLE(int32_t, SetAmountOfReusableAllocations, -1, "-1: default, 0:disabled, > 1: enabled. If enabled, driver will fill reusable allocation lists with given amount of command buffers and heaps at initialization of immediate command list.")
+DECLARE_DEBUG_VARIABLE(int32_t, SetAmountOfReusableAllocationsPerCmdQueue, -1, "-1: default, 0:disabled, > 0: enabled. If enabled, driver will fill reusable allocation lists with given amount of command buffers for each initialized opencl command queue.")
 DECLARE_DEBUG_VARIABLE(int32_t, UseHighAlignmentForHeapExtended, -1, "-1: default, 0:disabled, > 1: enabled. If enabled, driver aligns HEAP_EXTENDED allocations to GPU VA that is next power of 2 for a given size, if disables GPU VA is using 2MB/64KB alignment.")
 DECLARE_DEBUG_VARIABLE(int32_t, DispatchCmdlistCmdBufferPrimary, -1, "-1: default, 0: dispatch command buffers as seconadry, 1: dispatch command buffers as primary and chain")
 DECLARE_DEBUG_VARIABLE(int32_t, UseImmediateFlushTask, -1, "-1: default, 0: use regular flush task, 1: use immediate flush task")
--- a/shared/source/os_interface/product_helper.h
+++ b/shared/source/os_interface/product_helper.h
@@ -165,6 +165,7 @@ class ProductHelper {
    virtual bool isLinearStoragePreferred(bool isImage1d, bool forceLinearStorage) const = 0;
    virtual bool isTranslationExceptionSupported() const = 0;
    virtual uint32_t getMaxNumSamplers() const = 0;
+    virtual uint32_t getCommandBuffersPreallocatedPerCommandQueue() const = 0;

    virtual bool getFrontEndPropertyScratchSizeSupport() const = 0;
    virtual bool getFrontEndPropertyPrivateScratchSizeSupport() const = 0;
--- a/shared/source/os_interface/product_helper.inl
+++ b/shared/source/os_interface/product_helper.inl
@@ -778,6 +778,11 @@ uint32_t ProductHelperHw<gfxProduct>::getMaxNumSamplers() const {
    return 16u;
 }

+// Generic implementation: reports zero command buffers to preallocate per command queue.
+template <PRODUCT_FAMILY gfxProduct>
+uint32_t ProductHelperHw<gfxProduct>::getCommandBuffersPreallocatedPerCommandQueue() const {
+    constexpr uint32_t preallocatedCommandBuffers = 0u;
+    return preallocatedCommandBuffers;
+}
+
 template <PRODUCT_FAMILY gfxProduct>
 bool ProductHelperHw<gfxProduct>::disableL3CacheForDebug(const HardwareInfo &) const {
    return false;
--- a/shared/source/os_interface/product_helper_hw.h
+++ b/shared/source/os_interface/product_helper_hw.h
@@ -119,6 +119,7 @@ class ProductHelperHw : public ProductHelper {
    bool isLinearStoragePreferred(bool isImage1d, bool forceLinearStorage) const override;
    bool isTranslationExceptionSupported() const override;
    uint32_t getMaxNumSamplers() const override;
+    uint32_t getCommandBuffersPreallocatedPerCommandQueue() const override;

    bool getFrontEndPropertyScratchSizeSupport() const override;
    bool getFrontEndPropertyPrivateScratchSizeSupport() const override;
--- a/shared/test/common/libult/ult_command_stream_receiver.h
+++ b/shared/test/common/libult/ult_command_stream_receiver.h
@@ -135,6 +135,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
    using BaseClass::CommandStreamReceiver::perfCounterAllocator;
    using BaseClass::CommandStreamReceiver::pipelineSupportFlags;
    using BaseClass::CommandStreamReceiver::profilingTimeStampAllocator;
+    using BaseClass::CommandStreamReceiver::requestedPreallocationsAmount;
    using BaseClass::CommandStreamReceiver::requiredPrivateScratchSize;
    using BaseClass::CommandStreamReceiver::requiredScratchSize;
    using BaseClass::CommandStreamReceiver::resourcesInitialized;
--- a/shared/test/common/mocks/mock_product_helper.cpp
+++ b/shared/test/common/mocks/mock_product_helper.cpp
@@ -367,6 +367,11 @@ uint32_t ProductHelperHw<IGFX_UNKNOWN>::getMaxNumSamplers() const {
    return 0u;
 }

+// Mock product (IGFX_UNKNOWN): reports zero command buffers to preallocate per command queue.
+template <>
+uint32_t ProductHelperHw<IGFX_UNKNOWN>::getCommandBuffersPreallocatedPerCommandQueue() const {
+    constexpr uint32_t preallocatedCommandBuffers = 0u;
+    return preallocatedCommandBuffers;
+}
+
 template <>
 uint32_t L1CachePolicyHelper<IGFX_UNKNOWN>::getL1CachePolicy(bool isDebuggerActive) {
    return L1CachePolicyHelper<IGFX_UNKNOWN>::getDefaultL1CachePolicy(isDebuggerActive);
--- a/shared/test/common/test_files/igdrcl.config
+++ b/shared/test/common/test_files/igdrcl.config
@@ -556,4 +556,5 @@ OverridePatIndexForDeviceMemory = -1
 PrintGmmCompressionParams = 0
 SkipInOrderNonWalkerSignalingAllowed = 0
 PrintKernelDispatchParameters = 0
+SetAmountOfReusableAllocationsPerCmdQueue = -1
 # Please don't edit below this line
--- a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp
+++ b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp
@@ -173,6 +173,40 @@ HWTEST_F(CommandStreamReceiverTest, givenFlagDisabledWhenCallFillReusableAllocat
    EXPECT_EQ(0u, commandStreamReceiver->getResidencyAllocations().size());
 }

+// With the per-queue amount forced to zero, requesting preallocation must leave both the
+// reusable allocations list and the residency container empty.
+HWTEST_F(CommandStreamReceiverTest, givenPreallocationsPerQueueEqualZeroWhenRequestPreallocationCalledThenDoNotAllocateCommandBuffer) {
+    DebugManagerStateRestore stateRestore;
+    DebugManager.flags.SetAmountOfReusableAllocationsPerCmdQueue.set(0);
+    auto &csr = *commandStreamReceiver;
+
+    // Initial state: nothing stored for reuse, nothing resident.
+    EXPECT_TRUE(csr.getAllocationsForReuse().peekIsEmpty());
+    EXPECT_EQ(0u, csr.getResidencyAllocations().size());
+
+    csr.requestPreallocation();
+
+    // State is unchanged by the request.
+    EXPECT_TRUE(csr.getAllocationsForReuse().peekIsEmpty());
+    EXPECT_EQ(0u, csr.getResidencyAllocations().size());
+}
+
+// With one preallocation per queue, a command buffer is allocated only when the number of
+// outstanding requests exceeds the number of buffers preallocated so far.
+HWTEST_F(CommandStreamReceiverTest, givenPreallocationsPerQueueWhenRequestPreallocationCalledThenAllocateCommandBufferIfNeeded) {
+    DebugManagerStateRestore stateRestore;
+    DebugManager.flags.SetAmountOfReusableAllocationsPerCmdQueue.set(1);
+    auto &csr = *commandStreamReceiver;
+
+    // Checks that the reuse list is populated and that the residency container has the given size.
+    auto expectPreallocated = [&](size_t expectedResidentCount) {
+        EXPECT_FALSE(csr.getAllocationsForReuse().peekIsEmpty());
+        EXPECT_EQ(expectedResidentCount, csr.getResidencyAllocations().size());
+    };
+
+    EXPECT_TRUE(csr.getAllocationsForReuse().peekIsEmpty());
+    EXPECT_EQ(0u, csr.getResidencyAllocations().size());
+
+    // First request allocates one command buffer.
+    csr.requestPreallocation();
+    expectPreallocated(1u);
+
+    // Releasing the request keeps the already preallocated buffer.
+    csr.releasePreallocationRequest();
+    expectPreallocated(1u);
+
+    // A new request is covered by the existing buffer, so nothing new is allocated.
+    csr.requestPreallocation();
+    expectPreallocated(1u);
+
+    // A second outstanding request exceeds the preallocated amount and adds one more buffer.
+    csr.requestPreallocation();
+    expectPreallocated(2u);
+}
+
 HWTEST_F(CommandStreamReceiverTest, whenRegisterClientThenIncrementClientNum) {
    auto &csr = pDevice->getUltCommandStreamReceiver<FamilyType>();
    auto numClients = csr.getNumClients();