refactor: split sync buffer and region allocation creation code

- split the allocation code out of the command list and kernel code
- allow the allocation code to be called from any part of the driver

Related-To: NEO-13350

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz 2024-11-28 09:59:31 +00:00 committed by Compute-Runtime-Automation
parent f2b0dad964
commit c5ed6bf73c
8 changed files with 71 additions and 36 deletions

View File

@ -36,7 +36,6 @@
#include "shared/source/memory_manager/unified_memory_manager.h"
#include "shared/source/page_fault_manager/cpu_page_fault_manager.h"
#include "shared/source/program/sync_buffer_handler.h"
#include "shared/source/program/sync_buffer_handler.inl"
#include "shared/source/utilities/software_tags_manager.h"
#include "level_zero/api/driver_experimental/public/zex_cmdlist.h"
@ -2814,8 +2813,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::programSyncBuffer(Kernel &kern
return ZE_RESULT_ERROR_INVALID_ARGUMENT;
}
device.allocateSyncBufferHandler();
device.syncBufferHandler->prepareForEnqueue(requestedNumberOfWorkgroups, kernel);
auto patchData = NEO::KernelHelper::getSyncBufferAllocationOffset(device, requestedNumberOfWorkgroups);
kernel.patchSyncBuffer(patchData.first, patchData.second);
return ZE_RESULT_SUCCESS;
}
@ -2824,13 +2823,8 @@ template <GFXCORE_FAMILY gfxCoreFamily>
// Obtains a region group barrier allocation for this dispatch and patches it into the kernel.
//
// kernel                - receives the allocation and offset through patchRegionGroupBarrier()
// threadGroupDimensions - per-dimension group counts; their product is the total thread group count
// localRegionSize       - forwarded to the barrier size computation
//
// NOTE(review): `patchData` is declared twice below and the buffer size is computed both inline and
// through KernelHelper::getRegionGroupBarrierAllocationOffset(). This looks like the pre-change and
// post-change lines of a diff shown together without +/- markers - confirm against the repository
// before treating this body as compilable code.
void CommandListCoreFamily<gfxCoreFamily>::programRegionGroupBarrier(Kernel &kernel, const ze_group_count_t &threadGroupDimensions, size_t localRegionSize) {
auto neoDevice = device->getNEODevice();
neoDevice->allocateSyncBufferHandler();
// Inline variant: total group count -> cache-line aligned size -> allocation straight from the handler.
// NOTE(review): the product is formed from the ze_group_count_t fields before it is stored in size_t;
// if those fields are 32-bit (as the Level Zero spec defines them - confirm) it can wrap for very
// large dispatches.
const size_t requestedNumberOfWorkgroups = threadGroupDimensions.groupCountX * threadGroupDimensions.groupCountY * threadGroupDimensions.groupCountZ;
size_t size = alignUp((requestedNumberOfWorkgroups / localRegionSize) * (localRegionSize + 1) * 2 * sizeof(uint32_t), MemoryConstants::cacheLineSize);
auto patchData = neoDevice->syncBufferHandler->obtainAllocationAndOffset(size);
// Helper variant: the same request routed through the shared KernelHelper entry point.
auto threadGroupCount = threadGroupDimensions.groupCountX * threadGroupDimensions.groupCountY * threadGroupDimensions.groupCountZ;
auto patchData = NEO::KernelHelper::getRegionGroupBarrierAllocationOffset(*neoDevice, threadGroupCount, localRegionSize);
// patchData.first is the allocation, patchData.second the offset handed out within it.
kernel.patchRegionGroupBarrier(patchData.first, patchData.second);
}

View File

@ -13,6 +13,7 @@
#include "shared/source/helpers/engine_node_helper.h"
#include "shared/source/helpers/flat_batch_buffer_helper.h"
#include "shared/source/helpers/flush_stamp.h"
#include "shared/source/helpers/kernel_helpers.h"
#include "shared/source/helpers/pipe_control_args.h"
#include "shared/source/helpers/timestamp_packet.h"
#include "shared/source/memory_manager/internal_allocation_storage.h"
@ -20,7 +21,6 @@
#include "shared/source/memory_manager/unified_memory_manager.h"
#include "shared/source/os_interface/os_context.h"
#include "shared/source/program/sync_buffer_handler.h"
#include "shared/source/program/sync_buffer_handler.inl"
#include "shared/source/utilities/range.h"
#include "shared/source/utilities/tag_allocator.h"
@ -534,7 +534,8 @@ void CommandQueueHw<GfxFamily>::processDispatchForKernels(const MultiDispatchInf
auto &lws = multiDispatchInfo.begin()->getLocalWorkgroupSize();
size_t workGroupsCount = (gws.x * gws.y * gws.z) /
(lws.x * lws.y * lws.z);
device->getDevice().syncBufferHandler->prepareForEnqueue(workGroupsCount, *multiDispatchInfo.peekMainKernel());
auto patchData = KernelHelper::getSyncBufferAllocationOffset(device->getDevice(), workGroupsCount);
multiDispatchInfo.peekMainKernel()->patchSyncBuffer(patchData.first, patchData.second);
}
if (event && this->isProfilingEnabled()) {

View File

@ -11,12 +11,10 @@
#include "shared/source/device/device.h"
#include "shared/source/execution_environment/root_device_environment.h"
#include "shared/source/helpers/basic_math.h"
#include "shared/source/helpers/constants.h"
#include "shared/source/helpers/debug_helpers.h"
#include "shared/source/helpers/gfx_core_helper.h"
#include "shared/source/helpers/hw_info.h"
#include <algorithm>
#include "shared/source/program/sync_buffer_handler.h"
namespace NEO {
@ -125,4 +123,20 @@ bool KernelHelper::isAnyArgumentPtrByValue(const KernelDescriptor &kernelDescrip
return false;
}
// Returns the {allocation, offset} pair handed out by the device's sync buffer handler for a
// region group barrier sized by getRegionGroupBarrierSize(threadGroupCount, localRegionSize).
std::pair<GraphicsAllocation *, size_t> KernelHelper::getRegionGroupBarrierAllocationOffset(Device &device, const size_t threadGroupCount, const size_t localRegionSize) {
    // Called before the handler is dereferenced below; presumably creates it on first use.
    device.allocateSyncBufferHandler();

    const size_t barrierBytes = getRegionGroupBarrierSize(threadGroupCount, localRegionSize);
    auto *handler = device.syncBufferHandler.operator->();
    return handler->obtainAllocationAndOffset(barrierBytes);
}
// Returns the {allocation, offset} pair handed out by the device's sync buffer handler for a
// sync buffer sized by getSyncBufferSize(requestedNumberOfWorkgroups).
std::pair<GraphicsAllocation *, size_t> KernelHelper::getSyncBufferAllocationOffset(Device &device, const size_t requestedNumberOfWorkgroups) {
    // Called before the handler is dereferenced below; presumably creates it on first use.
    device.allocateSyncBufferHandler();

    const size_t syncBufferBytes = getSyncBufferSize(requestedNumberOfWorkgroups);
    return device.syncBufferHandler->obtainAllocationAndOffset(syncBufferBytes);
}
} // namespace NEO

View File

@ -7,14 +7,18 @@
#pragma once
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/constants.h"
#include "shared/source/helpers/definitions/engine_group_types.h"
#include "shared/source/kernel/kernel_descriptor.h"
#include <algorithm>
#include <cstddef>
#include <cstdint>
namespace NEO {
class Device;
class GraphicsAllocation;
struct RootDeviceEnvironment;
struct KernelHelper {
@ -39,6 +43,17 @@ struct KernelHelper {
static ErrorCode checkIfThereIsSpaceForScratchOrPrivate(KernelDescriptor::KernelAttributes attributes, Device *device);
static bool isAnyArgumentPtrByValue(const KernelDescriptor &kernelDescriptor);
// Size in bytes of a region group barrier buffer, rounded up to a cache line:
// (threadGroupCount / localRegionSize) * (localRegionSize + 1) * 2 * sizeof(uint32_t).
// The division is an integer division; localRegionSize is used as the divisor as-is.
static inline size_t getRegionGroupBarrierSize(const size_t threadGroupCount, const size_t localRegionSize) {
    const size_t groupsPerRegionSize = threadGroupCount / localRegionSize;
    const size_t unalignedBytes = groupsPerRegionSize * (localRegionSize + 1) * 2 * sizeof(uint32_t);
    return alignUp(unalignedBytes, MemoryConstants::cacheLineSize);
}
static std::pair<GraphicsAllocation *, size_t> getRegionGroupBarrierAllocationOffset(Device &device, const size_t threadGroupCount, const size_t localRegionSize);
// Size of a sync buffer for the requested number of workgroups: never below
// CommonConstants::minimalSyncBufferSize, and rounded up to
// CommonConstants::maximalSizeOfAtomicType.
static inline size_t getSyncBufferSize(const size_t requestedNumberOfWorkgroups) {
    const size_t lowerBound = static_cast<size_t>(CommonConstants::minimalSyncBufferSize);
    const size_t alignment = static_cast<size_t>(CommonConstants::maximalSizeOfAtomicType);
    const size_t unalignedSize = std::max(requestedNumberOfWorkgroups, lowerBound);
    return alignUp(unalignedSize, alignment);
}
static std::pair<GraphicsAllocation *, size_t> getSyncBufferAllocationOffset(Device &device, const size_t requestedNumberOfWorkgroups);
};
} // namespace NEO

View File

@ -1,5 +1,5 @@
#
# Copyright (C) 2019-2023 Intel Corporation
# Copyright (C) 2019-2024 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
@ -21,7 +21,6 @@ set(NEO_CORE_PROGRAM
${CMAKE_CURRENT_SOURCE_DIR}/program_initialization.h
${CMAKE_CURRENT_SOURCE_DIR}/sync_buffer_handler.cpp
${CMAKE_CURRENT_SOURCE_DIR}/sync_buffer_handler.h
${CMAKE_CURRENT_SOURCE_DIR}/sync_buffer_handler.inl
${CMAKE_CURRENT_SOURCE_DIR}/work_size_info.cpp
${CMAKE_CURRENT_SOURCE_DIR}/work_size_info.h
)

View File

@ -25,8 +25,6 @@ class SyncBufferHandler : NonCopyableOrMovableClass {
SyncBufferHandler(Device &device);
template <typename KernelT>
void prepareForEnqueue(size_t workGroupsCount, KernelT &kernel);
void makeResident(CommandStreamReceiver &csr);
std::pair<GraphicsAllocation *, size_t> obtainAllocationAndOffset(size_t requiredSize);

View File

@ -1,17 +0,0 @@
/*
* Copyright (C) 2021-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/memory_manager/memory_manager.h"
// Reserves sync buffer space for `workGroupsCount` workgroups and patches the resulting
// allocation and offset into `kernel` via kernel.patchSyncBuffer().
// The requested size is never below CommonConstants::minimalSyncBufferSize and is aligned up to
// CommonConstants::maximalSizeOfAtomicType.
template <typename KernelT>
void NEO::SyncBufferHandler::prepareForEnqueue(size_t workGroupsCount, KernelT &kernel) {
    const size_t lowerBound = static_cast<size_t>(CommonConstants::minimalSyncBufferSize);
    auto alignedSize = alignUp(std::max(workGroupsCount, lowerBound), CommonConstants::maximalSizeOfAtomicType);

    // first: allocation, second: offset within it
    auto allocationAndOffset = obtainAllocationAndOffset(alignedSize);
    kernel.patchSyncBuffer(allocationAndOffset.first, allocationAndOffset.second);
}

View File

@ -283,3 +283,34 @@ TEST_F(KernelHelperTest, GivenPtrByValueWhenCheckingIsAnyArgumentPtrByValueThenT
kernelDescriptor.payloadMappings.explicitArgs.push_back(valueArg);
EXPECT_TRUE(KernelHelper::isAnyArgumentPtrByValue(kernelDescriptor));
}
// Two identical sync buffer requests on the same device: the first is expected at offset 0 of a
// valid allocation, the second in the same allocation at an offset equal to the first request's size.
TEST_F(KernelHelperTest, GivenThreadGroupCountWhenSyncBufferCreatedThenAllocationIsRetrieved) {
    const size_t workgroupCount = 4;
    const auto expectedSecondOffset = KernelHelper::getSyncBufferSize(workgroupCount);

    const auto firstRequest = KernelHelper::getSyncBufferAllocationOffset(*pDevice, workgroupCount);
    EXPECT_EQ(0u, firstRequest.second);
    EXPECT_NE(nullptr, firstRequest.first);

    const auto secondRequest = KernelHelper::getSyncBufferAllocationOffset(*pDevice, workgroupCount);
    EXPECT_EQ(expectedSecondOffset, secondRequest.second);
    EXPECT_EQ(firstRequest.first, secondRequest.first);
}
// Two identical region group barrier requests on the same device: the first is expected at
// offset 0 of a valid allocation, the second in the same allocation at an offset equal to the
// first request's size.
TEST_F(KernelHelperTest, GivenThreadGroupCountAndRegionSizeWhenRegionBarrierCreatedThenAllocationIsRetrieved) {
    const size_t workgroupCount = 4;
    const size_t regionSize = 2;
    const auto expectedSecondOffset = KernelHelper::getRegionGroupBarrierSize(workgroupCount, regionSize);

    const auto firstRequest = KernelHelper::getRegionGroupBarrierAllocationOffset(*pDevice, workgroupCount, regionSize);
    EXPECT_EQ(0u, firstRequest.second);
    EXPECT_NE(nullptr, firstRequest.first);

    const auto secondRequest = KernelHelper::getRegionGroupBarrierAllocationOffset(*pDevice, workgroupCount, regionSize);
    EXPECT_EQ(expectedSecondOffset, secondRequest.second);
    EXPECT_EQ(firstRequest.first, secondRequest.first);
}