refactor: split sync buffer and region allocation creation code

- split the allocation code from the command list or kernel
- allow the allocation code to be called from all parts of the driver

Related-To: NEO-13350
Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
2024-11-28 09:59:31 +00:00 · 2024-11-28 09:59:31 +00:00 · c5ed6bf73c
parent f2b0dad964
commit c5ed6bf73c
8 changed files with 71 additions and 36 deletions
--- a/level_zero/core/source/cmdlist/cmdlist_hw.inl
+++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl
@ -36,7 +36,6 @@
 #include "shared/source/memory_manager/unified_memory_manager.h"
 #include "shared/source/page_fault_manager/cpu_page_fault_manager.h"
 #include "shared/source/program/sync_buffer_handler.h"
-#include "shared/source/program/sync_buffer_handler.inl"
 #include "shared/source/utilities/software_tags_manager.h"

 #include "level_zero/api/driver_experimental/public/zex_cmdlist.h"
@ -2814,8 +2813,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::programSyncBuffer(Kernel &kern
        return ZE_RESULT_ERROR_INVALID_ARGUMENT;
    }

-    device.allocateSyncBufferHandler();
-    device.syncBufferHandler->prepareForEnqueue(requestedNumberOfWorkgroups, kernel);
+    auto patchData = NEO::KernelHelper::getSyncBufferAllocationOffset(device, requestedNumberOfWorkgroups);
+    kernel.patchSyncBuffer(patchData.first, patchData.second);

    return ZE_RESULT_SUCCESS;
 }
@ -2824,13 +2823,8 @@ template <GFXCORE_FAMILY gfxCoreFamily>
 void CommandListCoreFamily<gfxCoreFamily>::programRegionGroupBarrier(Kernel &kernel, const ze_group_count_t &threadGroupDimensions, size_t localRegionSize) {
    auto neoDevice = device->getNEODevice();

-    neoDevice->allocateSyncBufferHandler();
-
-    const size_t requestedNumberOfWorkgroups = threadGroupDimensions.groupCountX * threadGroupDimensions.groupCountY * threadGroupDimensions.groupCountZ;
-
-    size_t size = alignUp((requestedNumberOfWorkgroups / localRegionSize) * (localRegionSize + 1) * 2 * sizeof(uint32_t), MemoryConstants::cacheLineSize);
-
-    auto patchData = neoDevice->syncBufferHandler->obtainAllocationAndOffset(size);
+    // Total number of thread groups in the dispatch. The first factor is widened to size_t so the
+    // whole product is evaluated in size_t instead of the 32-bit type of the group-count fields,
+    // where a large dispatch could wrap around before the value reaches the helper.
+    const size_t threadGroupCount = static_cast<size_t>(threadGroupDimensions.groupCountX) * threadGroupDimensions.groupCountY * threadGroupDimensions.groupCountZ;
+    // Ask the shared helper for the {allocation, offset} pair backing the region group barrier.
+    auto patchData = NEO::KernelHelper::getRegionGroupBarrierAllocationOffset(*neoDevice, threadGroupCount, localRegionSize);

    kernel.patchRegionGroupBarrier(patchData.first, patchData.second);
 }
--- a/opencl/source/command_queue/enqueue_common.h
+++ b/opencl/source/command_queue/enqueue_common.h
@ -13,6 +13,7 @@
 #include "shared/source/helpers/engine_node_helper.h"
 #include "shared/source/helpers/flat_batch_buffer_helper.h"
 #include "shared/source/helpers/flush_stamp.h"
+#include "shared/source/helpers/kernel_helpers.h"
 #include "shared/source/helpers/pipe_control_args.h"
 #include "shared/source/helpers/timestamp_packet.h"
 #include "shared/source/memory_manager/internal_allocation_storage.h"
@ -20,7 +21,6 @@
 #include "shared/source/memory_manager/unified_memory_manager.h"
 #include "shared/source/os_interface/os_context.h"
 #include "shared/source/program/sync_buffer_handler.h"
-#include "shared/source/program/sync_buffer_handler.inl"
 #include "shared/source/utilities/range.h"
 #include "shared/source/utilities/tag_allocator.h"

@ -534,7 +534,8 @@ void CommandQueueHw<GfxFamily>::processDispatchForKernels(const MultiDispatchInf
        auto &lws = multiDispatchInfo.begin()->getLocalWorkgroupSize();
        size_t workGroupsCount = (gws.x * gws.y * gws.z) /
                                 (lws.x * lws.y * lws.z);
-        device->getDevice().syncBufferHandler->prepareForEnqueue(workGroupsCount, *multiDispatchInfo.peekMainKernel());
+        auto patchData = KernelHelper::getSyncBufferAllocationOffset(device->getDevice(), workGroupsCount);
+        multiDispatchInfo.peekMainKernel()->patchSyncBuffer(patchData.first, patchData.second);
    }

    if (event && this->isProfilingEnabled()) {
--- a/shared/source/helpers/kernel_helpers.cpp
+++ b/shared/source/helpers/kernel_helpers.cpp
@ -11,12 +11,10 @@
 #include "shared/source/device/device.h"
 #include "shared/source/execution_environment/root_device_environment.h"
 #include "shared/source/helpers/basic_math.h"
-#include "shared/source/helpers/constants.h"
 #include "shared/source/helpers/debug_helpers.h"
 #include "shared/source/helpers/gfx_core_helper.h"
 #include "shared/source/helpers/hw_info.h"
-
-#include <algorithm>
+#include "shared/source/program/sync_buffer_handler.h"

 namespace NEO {

@ -125,4 +123,20 @@ bool KernelHelper::isAnyArgumentPtrByValue(const KernelDescriptor &kernelDescrip
    return false;
 }

+std::pair<GraphicsAllocation *, size_t> KernelHelper::getRegionGroupBarrierAllocationOffset(Device &device, const size_t threadGroupCount, const size_t localRegionSize) {
+    // Returns the {allocation, offset} pair handed out by the device's sync buffer handler for a
+    // request of getRegionGroupBarrierSize(threadGroupCount, localRegionSize) bytes.
+    // Called before the handler is dereferenced; presumably ensures the handler exists - confirm against Device.
+    device.allocateSyncBufferHandler();
+
+    const size_t barrierBytes = KernelHelper::getRegionGroupBarrierSize(threadGroupCount, localRegionSize);
+    auto &handler = *device.syncBufferHandler;
+    return handler.obtainAllocationAndOffset(barrierBytes);
+}
+
+std::pair<GraphicsAllocation *, size_t> KernelHelper::getSyncBufferAllocationOffset(Device &device, const size_t requestedNumberOfWorkgroups) {
+    // Returns the {allocation, offset} pair handed out by the device's sync buffer handler for a
+    // request of getSyncBufferSize(requestedNumberOfWorkgroups) bytes.
+    // Called before the handler is dereferenced; presumably ensures the handler exists - confirm against Device.
+    device.allocateSyncBufferHandler();
+
+    const size_t syncBufferBytes = KernelHelper::getSyncBufferSize(requestedNumberOfWorkgroups);
+    auto &handler = *device.syncBufferHandler;
+    return handler.obtainAllocationAndOffset(syncBufferBytes);
+}
+
 } // namespace NEO
--- a/shared/source/helpers/kernel_helpers.h
+++ b/shared/source/helpers/kernel_helpers.h
@ -7,14 +7,18 @@

 #pragma once

+#include "shared/source/helpers/aligned_memory.h"
+#include "shared/source/helpers/constants.h"
 #include "shared/source/helpers/definitions/engine_group_types.h"
 #include "shared/source/kernel/kernel_descriptor.h"

+#include <algorithm>
 #include <cstddef>
 #include <cstdint>

 namespace NEO {
 class Device;
+class GraphicsAllocation;
 struct RootDeviceEnvironment;

 struct KernelHelper {
@ -39,6 +43,17 @@ struct KernelHelper {
    static ErrorCode checkIfThereIsSpaceForScratchOrPrivate(KernelDescriptor::KernelAttributes attributes, Device *device);

    static bool isAnyArgumentPtrByValue(const KernelDescriptor &kernelDescriptor);
+
+    static inline size_t getRegionGroupBarrierSize(const size_t threadGroupCount, const size_t localRegionSize) {
+        // Integer division: any remainder of threadGroupCount / localRegionSize is dropped.
+        // NOTE(review): localRegionSize == 0 is not guarded here - callers must pass a non-zero value.
+        const size_t regions = threadGroupCount / localRegionSize;
+        // (localRegionSize + 1) entries per region, doubled, each the size of a uint32_t.
+        const size_t unalignedBytes = regions * (localRegionSize + 1) * 2 * sizeof(uint32_t);
+        // Round the byte count up to a multiple of the cache line size.
+        return alignUp(unalignedBytes, MemoryConstants::cacheLineSize);
+    }
+
+    static std::pair<GraphicsAllocation *, size_t> getRegionGroupBarrierAllocationOffset(Device &device, const size_t threadGroupCount, const size_t localRegionSize);
+
+    static inline size_t getSyncBufferSize(const size_t requestedNumberOfWorkgroups) {
+        // Never request less than the minimal sync buffer size.
+        const size_t lowerBound = static_cast<size_t>(CommonConstants::minimalSyncBufferSize);
+        // The result is rounded up to a multiple of the maximal atomic type size.
+        const size_t granularity = static_cast<size_t>(CommonConstants::maximalSizeOfAtomicType);
+        return alignUp(std::max(requestedNumberOfWorkgroups, lowerBound), granularity);
+    }
+    static std::pair<GraphicsAllocation *, size_t> getSyncBufferAllocationOffset(Device &device, const size_t requestedNumberOfWorkgroups);
 };

 } // namespace NEO
--- a/shared/source/program/CMakeLists.txt
+++ b/shared/source/program/CMakeLists.txt
@ -1,5 +1,5 @@
 #
-# Copyright (C) 2019-2023 Intel Corporation
+# Copyright (C) 2019-2024 Intel Corporation
 #
 # SPDX-License-Identifier: MIT
 #
@ -21,7 +21,6 @@ set(NEO_CORE_PROGRAM
    ${CMAKE_CURRENT_SOURCE_DIR}/program_initialization.h
    ${CMAKE_CURRENT_SOURCE_DIR}/sync_buffer_handler.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/sync_buffer_handler.h
-    ${CMAKE_CURRENT_SOURCE_DIR}/sync_buffer_handler.inl
    ${CMAKE_CURRENT_SOURCE_DIR}/work_size_info.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/work_size_info.h
 )
--- a/shared/source/program/sync_buffer_handler.h
+++ b/shared/source/program/sync_buffer_handler.h
@ -25,8 +25,6 @@ class SyncBufferHandler : NonCopyableOrMovableClass {

    SyncBufferHandler(Device &device);

-    template <typename KernelT>
-    void prepareForEnqueue(size_t workGroupsCount, KernelT &kernel);
    void makeResident(CommandStreamReceiver &csr);

    std::pair<GraphicsAllocation *, size_t> obtainAllocationAndOffset(size_t requiredSize);
--- a/shared/source/program/sync_buffer_handler.inl
+++ b/shared/source/program/sync_buffer_handler.inl
@ -1,17 +0,0 @@
-/*
- * Copyright (C) 2021-2024 Intel Corporation
- *
- * SPDX-License-Identifier: MIT
- *
- */
-
-#include "shared/source/memory_manager/memory_manager.h"
-
-template <typename KernelT>
-void NEO::SyncBufferHandler::prepareForEnqueue(size_t workGroupsCount, KernelT &kernel) {
-    auto requiredSize = alignUp(std::max(workGroupsCount, static_cast<size_t>(CommonConstants::minimalSyncBufferSize)), CommonConstants::maximalSizeOfAtomicType);
-
-    auto patchData = obtainAllocationAndOffset(requiredSize);
-
-    kernel.patchSyncBuffer(patchData.first, patchData.second);
-}
--- a/shared/test/unit_test/helpers/kernel_helpers_tests.cpp
+++ b/shared/test/unit_test/helpers/kernel_helpers_tests.cpp
@ -283,3 +283,34 @@ TEST_F(KernelHelperTest, GivenPtrByValueWhenCheckingIsAnyArgumentPtrByValueThenT
    kernelDescriptor.payloadMappings.explicitArgs.push_back(valueArg);
    EXPECT_TRUE(KernelHelper::isAnyArgumentPtrByValue(kernelDescriptor));
 }
+
+TEST_F(KernelHelperTest, GivenThreadGroupCountWhenSyncBufferCreatedThenAllocationIsRetrieved) {
+    const size_t requestedNumberOfWorkgroups = 4;
+    // Size of one request; the second request is expected to start right after the first one.
+    const auto expectedSecondOffset = KernelHelper::getSyncBufferSize(requestedNumberOfWorkgroups);
+
+    // First request: a valid allocation is returned and the offset is zero.
+    const auto firstRequest = KernelHelper::getSyncBufferAllocationOffset(*pDevice, requestedNumberOfWorkgroups);
+    EXPECT_EQ(0u, firstRequest.second);
+    EXPECT_NE(nullptr, firstRequest.first);
+
+    // Second request: same allocation, offset advanced by the size of the first request.
+    const auto secondRequest = KernelHelper::getSyncBufferAllocationOffset(*pDevice, requestedNumberOfWorkgroups);
+    EXPECT_EQ(expectedSecondOffset, secondRequest.second);
+    EXPECT_EQ(firstRequest.first, secondRequest.first);
+}
+
+TEST_F(KernelHelperTest, GivenThreadGroupCountAndRegionSizeWhenRegionBarrierCreatedThenAllocationIsRetrieved) {
+    const size_t requestedNumberOfWorkgroups = 4;
+    const size_t localRegionSize = 2;
+    // Size of one request; the second request is expected to start right after the first one.
+    const auto expectedSecondOffset = KernelHelper::getRegionGroupBarrierSize(requestedNumberOfWorkgroups, localRegionSize);
+
+    // First request: a valid allocation is returned and the offset is zero.
+    const auto firstRequest = KernelHelper::getRegionGroupBarrierAllocationOffset(*pDevice, requestedNumberOfWorkgroups, localRegionSize);
+    EXPECT_EQ(0u, firstRequest.second);
+    EXPECT_NE(nullptr, firstRequest.first);
+
+    // Second request: same allocation, offset advanced by the size of the first request.
+    const auto secondRequest = KernelHelper::getRegionGroupBarrierAllocationOffset(*pDevice, requestedNumberOfWorkgroups, localRegionSize);
+    EXPECT_EQ(expectedSecondOffset, secondRequest.second);
+    EXPECT_EQ(firstRequest.first, secondRequest.first);
+}