From c5ed6bf73cf174478f350ef1ee6637f4940fbf36 Mon Sep 17 00:00:00 2001 From: Zbigniew Zdanowicz Date: Thu, 28 Nov 2024 09:59:31 +0000 Subject: [PATCH] refactor: split sync buffer and region allocation creation code - split the allocation code from command list or kernel - allow to call allocation code in all parts of the driver Related-To: NEO-13350 Signed-off-by: Zbigniew Zdanowicz --- level_zero/core/source/cmdlist/cmdlist_hw.inl | 14 +++------ opencl/source/command_queue/enqueue_common.h | 5 +-- shared/source/helpers/kernel_helpers.cpp | 20 ++++++++++-- shared/source/helpers/kernel_helpers.h | 15 +++++++++ shared/source/program/CMakeLists.txt | 3 +- shared/source/program/sync_buffer_handler.h | 2 -- shared/source/program/sync_buffer_handler.inl | 17 ---------- .../helpers/kernel_helpers_tests.cpp | 31 +++++++++++++++++++ 8 files changed, 71 insertions(+), 36 deletions(-) delete mode 100644 shared/source/program/sync_buffer_handler.inl diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index f514633c93..fd0097c80e 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -36,7 +36,6 @@ #include "shared/source/memory_manager/unified_memory_manager.h" #include "shared/source/page_fault_manager/cpu_page_fault_manager.h" #include "shared/source/program/sync_buffer_handler.h" -#include "shared/source/program/sync_buffer_handler.inl" #include "shared/source/utilities/software_tags_manager.h" #include "level_zero/api/driver_experimental/public/zex_cmdlist.h" @@ -2814,8 +2813,8 @@ ze_result_t CommandListCoreFamily::programSyncBuffer(Kernel &kern return ZE_RESULT_ERROR_INVALID_ARGUMENT; } - device.allocateSyncBufferHandler(); - device.syncBufferHandler->prepareForEnqueue(requestedNumberOfWorkgroups, kernel); + auto patchData = NEO::KernelHelper::getSyncBufferAllocationOffset(device, requestedNumberOfWorkgroups); + kernel.patchSyncBuffer(patchData.first, patchData.second); return ZE_RESULT_SUCCESS; } @@ -2824,13 +2823,8 @@ template void CommandListCoreFamily::programRegionGroupBarrier(Kernel &kernel, const ze_group_count_t &threadGroupDimensions, size_t localRegionSize) { auto neoDevice = device->getNEODevice(); - neoDevice->allocateSyncBufferHandler(); - - const size_t requestedNumberOfWorkgroups = threadGroupDimensions.groupCountX * threadGroupDimensions.groupCountY * threadGroupDimensions.groupCountZ; - - size_t size = alignUp((requestedNumberOfWorkgroups / localRegionSize) * (localRegionSize + 1) * 2 * sizeof(uint32_t), MemoryConstants::cacheLineSize); - - auto patchData = neoDevice->syncBufferHandler->obtainAllocationAndOffset(size); + auto threadGroupCount = threadGroupDimensions.groupCountX * threadGroupDimensions.groupCountY * threadGroupDimensions.groupCountZ; + auto patchData = NEO::KernelHelper::getRegionGroupBarrierAllocationOffset(*neoDevice, threadGroupCount, localRegionSize); kernel.patchRegionGroupBarrier(patchData.first, patchData.second); } diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index 3b1fcb9d72..7f0e57fb54 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -13,6 +13,7 @@ #include "shared/source/helpers/engine_node_helper.h" #include "shared/source/helpers/flat_batch_buffer_helper.h" #include "shared/source/helpers/flush_stamp.h" +#include "shared/source/helpers/kernel_helpers.h" #include "shared/source/helpers/pipe_control_args.h" #include "shared/source/helpers/timestamp_packet.h" #include "shared/source/memory_manager/internal_allocation_storage.h" @@ -20,7 +21,6 @@ #include "shared/source/memory_manager/unified_memory_manager.h" #include "shared/source/os_interface/os_context.h" #include "shared/source/program/sync_buffer_handler.h" -#include "shared/source/program/sync_buffer_handler.inl" #include "shared/source/utilities/range.h" #include "shared/source/utilities/tag_allocator.h" @@ -534,7 +534,8 @@ void CommandQueueHw::processDispatchForKernels(const MultiDispatchInf auto &lws = multiDispatchInfo.begin()->getLocalWorkgroupSize(); size_t workGroupsCount = (gws.x * gws.y * gws.z) / (lws.x * lws.y * lws.z); - device->getDevice().syncBufferHandler->prepareForEnqueue(workGroupsCount, *multiDispatchInfo.peekMainKernel()); + auto patchData = KernelHelper::getSyncBufferAllocationOffset(device->getDevice(), workGroupsCount); + multiDispatchInfo.peekMainKernel()->patchSyncBuffer(patchData.first, patchData.second); } if (event && this->isProfilingEnabled()) { diff --git a/shared/source/helpers/kernel_helpers.cpp b/shared/source/helpers/kernel_helpers.cpp index 9bf7beea19..e47e827d1a 100644 --- a/shared/source/helpers/kernel_helpers.cpp +++ b/shared/source/helpers/kernel_helpers.cpp @@ -11,12 +11,10 @@ #include "shared/source/device/device.h" #include "shared/source/execution_environment/root_device_environment.h" #include "shared/source/helpers/basic_math.h" -#include "shared/source/helpers/constants.h" #include "shared/source/helpers/debug_helpers.h" #include "shared/source/helpers/gfx_core_helper.h" #include "shared/source/helpers/hw_info.h" - -#include +#include "shared/source/program/sync_buffer_handler.h" namespace NEO { @@ -125,4 +123,20 @@ bool KernelHelper::isAnyArgumentPtrByValue(const KernelDescriptor &kernelDescrip return false; } +std::pair KernelHelper::getRegionGroupBarrierAllocationOffset(Device &device, const size_t threadGroupCount, const size_t localRegionSize) { + device.allocateSyncBufferHandler(); + + size_t size = KernelHelper::getRegionGroupBarrierSize(threadGroupCount, localRegionSize); + + return device.syncBufferHandler->obtainAllocationAndOffset(size); +} + +std::pair KernelHelper::getSyncBufferAllocationOffset(Device &device, const size_t requestedNumberOfWorkgroups) { + device.allocateSyncBufferHandler(); + + size_t requiredSize = KernelHelper::getSyncBufferSize(requestedNumberOfWorkgroups); + + return device.syncBufferHandler->obtainAllocationAndOffset(requiredSize); +} + } // namespace NEO diff --git a/shared/source/helpers/kernel_helpers.h b/shared/source/helpers/kernel_helpers.h index 6a47796554..e5a98feacb 100644 --- a/shared/source/helpers/kernel_helpers.h +++ b/shared/source/helpers/kernel_helpers.h @@ -7,14 +7,18 @@ #pragma once +#include "shared/source/helpers/aligned_memory.h" +#include "shared/source/helpers/constants.h" #include "shared/source/helpers/definitions/engine_group_types.h" #include "shared/source/kernel/kernel_descriptor.h" +#include #include #include namespace NEO { class Device; +class GraphicsAllocation; struct RootDeviceEnvironment; struct KernelHelper { @@ -39,6 +43,17 @@ struct KernelHelper { static ErrorCode checkIfThereIsSpaceForScratchOrPrivate(KernelDescriptor::KernelAttributes attributes, Device *device); static bool isAnyArgumentPtrByValue(const KernelDescriptor &kernelDescriptor); + + static inline size_t getRegionGroupBarrierSize(const size_t threadGroupCount, const size_t localRegionSize) { + return alignUp((threadGroupCount / localRegionSize) * (localRegionSize + 1) * 2 * sizeof(uint32_t), MemoryConstants::cacheLineSize); + } + + static std::pair getRegionGroupBarrierAllocationOffset(Device &device, const size_t threadGroupCount, const size_t localRegionSize); + + static inline size_t getSyncBufferSize(const size_t requestedNumberOfWorkgroups) { + return alignUp(std::max(requestedNumberOfWorkgroups, static_cast(CommonConstants::minimalSyncBufferSize)), static_cast(CommonConstants::maximalSizeOfAtomicType)); + } + static std::pair getSyncBufferAllocationOffset(Device &device, const size_t requestedNumberOfWorkgroups); }; } // namespace NEO diff --git a/shared/source/program/CMakeLists.txt b/shared/source/program/CMakeLists.txt index f45f6e2d97..669895b63c 100644 --- a/shared/source/program/CMakeLists.txt +++ b/shared/source/program/CMakeLists.txt @@ -1,5 +1,5 @@ # -# Copyright (C) 2019-2023 Intel Corporation +# Copyright (C) 2019-2024 Intel Corporation # # SPDX-License-Identifier: MIT # @@ -21,7 +21,6 @@ set(NEO_CORE_PROGRAM ${CMAKE_CURRENT_SOURCE_DIR}/program_initialization.h ${CMAKE_CURRENT_SOURCE_DIR}/sync_buffer_handler.cpp ${CMAKE_CURRENT_SOURCE_DIR}/sync_buffer_handler.h - ${CMAKE_CURRENT_SOURCE_DIR}/sync_buffer_handler.inl ${CMAKE_CURRENT_SOURCE_DIR}/work_size_info.cpp ${CMAKE_CURRENT_SOURCE_DIR}/work_size_info.h ) diff --git a/shared/source/program/sync_buffer_handler.h b/shared/source/program/sync_buffer_handler.h index eab3b52827..0960b11db9 100644 --- a/shared/source/program/sync_buffer_handler.h +++ b/shared/source/program/sync_buffer_handler.h @@ -25,8 +25,6 @@ class SyncBufferHandler : NonCopyableOrMovableClass { SyncBufferHandler(Device &device); - template - void prepareForEnqueue(size_t workGroupsCount, KernelT &kernel); void makeResident(CommandStreamReceiver &csr); std::pair obtainAllocationAndOffset(size_t requiredSize); diff --git a/shared/source/program/sync_buffer_handler.inl b/shared/source/program/sync_buffer_handler.inl deleted file mode 100644 index 2a8043dcd0..0000000000 --- a/shared/source/program/sync_buffer_handler.inl +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (C) 2021-2024 Intel Corporation - * - * SPDX-License-Identifier: MIT - * - */ - -#include "shared/source/memory_manager/memory_manager.h" - -template -void NEO::SyncBufferHandler::prepareForEnqueue(size_t workGroupsCount, KernelT &kernel) { - auto requiredSize = alignUp(std::max(workGroupsCount, static_cast(CommonConstants::minimalSyncBufferSize)), CommonConstants::maximalSizeOfAtomicType); - - auto patchData = obtainAllocationAndOffset(requiredSize); - - kernel.patchSyncBuffer(patchData.first, patchData.second); -} diff --git a/shared/test/unit_test/helpers/kernel_helpers_tests.cpp b/shared/test/unit_test/helpers/kernel_helpers_tests.cpp index 6a60ee7db5..47fd80778b 100644 --- a/shared/test/unit_test/helpers/kernel_helpers_tests.cpp +++ b/shared/test/unit_test/helpers/kernel_helpers_tests.cpp @@ -283,3 +283,34 @@ TEST_F(KernelHelperTest, GivenPtrByValueWhenCheckingIsAnyArgumentPtrByValueThenT kernelDescriptor.payloadMappings.explicitArgs.push_back(valueArg); EXPECT_TRUE(KernelHelper::isAnyArgumentPtrByValue(kernelDescriptor)); } + +TEST_F(KernelHelperTest, GivenThreadGroupCountWhenSyncBufferCreatedThenAllocationIsRetrieved) { + const size_t requestedNumberOfWorkgroups = 4; + auto offset = KernelHelper::getSyncBufferSize(requestedNumberOfWorkgroups); + + auto pair = KernelHelper::getSyncBufferAllocationOffset(*pDevice, requestedNumberOfWorkgroups); + auto allocation = pair.first; + + EXPECT_EQ(0u, pair.second); + EXPECT_NE(nullptr, allocation); + + pair = KernelHelper::getSyncBufferAllocationOffset(*pDevice, requestedNumberOfWorkgroups); + EXPECT_EQ(offset, pair.second); + EXPECT_EQ(allocation, pair.first); +} + +TEST_F(KernelHelperTest, GivenThreadGroupCountAndRegionSizeWhenRegionBarrierCreatedThenAllocationIsRetrieved) { + const size_t requestedNumberOfWorkgroups = 4; + const size_t localRegionSize = 2; + auto offset = KernelHelper::getRegionGroupBarrierSize(requestedNumberOfWorkgroups, localRegionSize); + + auto pair = KernelHelper::getRegionGroupBarrierAllocationOffset(*pDevice, requestedNumberOfWorkgroups, localRegionSize); + auto allocation = pair.first; + + EXPECT_EQ(0u, pair.second); + EXPECT_NE(nullptr, allocation); + + pair = KernelHelper::getRegionGroupBarrierAllocationOffset(*pDevice, requestedNumberOfWorkgroups, localRegionSize); + EXPECT_EQ(offset, pair.second); + EXPECT_EQ(allocation, pair.first); +}