refactor: split sync buffer and region allocation creation code
- split the allocation code out of the command list or kernel code - allow the allocation code to be called from all parts of the driver Related-To: NEO-13350 Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
parent
f2b0dad964
commit
c5ed6bf73c
|
@ -36,7 +36,6 @@
|
|||
#include "shared/source/memory_manager/unified_memory_manager.h"
|
||||
#include "shared/source/page_fault_manager/cpu_page_fault_manager.h"
|
||||
#include "shared/source/program/sync_buffer_handler.h"
|
||||
#include "shared/source/program/sync_buffer_handler.inl"
|
||||
#include "shared/source/utilities/software_tags_manager.h"
|
||||
|
||||
#include "level_zero/api/driver_experimental/public/zex_cmdlist.h"
|
||||
|
@ -2814,8 +2813,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::programSyncBuffer(Kernel &kern
|
|||
return ZE_RESULT_ERROR_INVALID_ARGUMENT;
|
||||
}
|
||||
|
||||
device.allocateSyncBufferHandler();
|
||||
device.syncBufferHandler->prepareForEnqueue(requestedNumberOfWorkgroups, kernel);
|
||||
auto patchData = NEO::KernelHelper::getSyncBufferAllocationOffset(device, requestedNumberOfWorkgroups);
|
||||
kernel.patchSyncBuffer(patchData.first, patchData.second);
|
||||
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
|
@ -2824,13 +2823,8 @@ template <GFXCORE_FAMILY gfxCoreFamily>
|
|||
// Obtains an (allocation, offset) pair for the region group barrier of a
// dispatch and patches it into `kernel` via patchRegionGroupBarrier().
//
// NOTE(review): this listing comes from a commit diff whose +/- markers were
// lost. The two `auto patchData` declarations below look like the removed
// (inline size computation + obtainAllocationAndOffset) and the added
// (KernelHelper::getRegionGroupBarrierAllocationOffset) variants of the same
// step - confirm against the repository before relying on either one.
void CommandListCoreFamily<gfxCoreFamily>::programRegionGroupBarrier(Kernel &kernel, const ze_group_count_t &threadGroupDimensions, size_t localRegionSize) {
|
||||
auto neoDevice = device->getNEODevice();
|
||||
|
||||
neoDevice->allocateSyncBufferHandler();
|
||||
|
||||
// Total number of thread groups = X * Y * Z group counts.
const size_t requestedNumberOfWorkgroups = threadGroupDimensions.groupCountX * threadGroupDimensions.groupCountY * threadGroupDimensions.groupCountZ;
|
||||
|
||||
size_t size = alignUp((requestedNumberOfWorkgroups / localRegionSize) * (localRegionSize + 1) * 2 * sizeof(uint32_t), MemoryConstants::cacheLineSize);
|
||||
|
||||
auto patchData = neoDevice->syncBufferHandler->obtainAllocationAndOffset(size);
|
||||
// Same product as above, here fed to the shared KernelHelper routine.
auto threadGroupCount = threadGroupDimensions.groupCountX * threadGroupDimensions.groupCountY * threadGroupDimensions.groupCountZ;
|
||||
auto patchData = NEO::KernelHelper::getRegionGroupBarrierAllocationOffset(*neoDevice, threadGroupCount, localRegionSize);
|
||||
|
||||
// first = graphics allocation, second = offset returned with it.
kernel.patchRegionGroupBarrier(patchData.first, patchData.second);
|
||||
}
|
||||
|
|
|
@ -13,6 +13,7 @@
|
|||
#include "shared/source/helpers/engine_node_helper.h"
|
||||
#include "shared/source/helpers/flat_batch_buffer_helper.h"
|
||||
#include "shared/source/helpers/flush_stamp.h"
|
||||
#include "shared/source/helpers/kernel_helpers.h"
|
||||
#include "shared/source/helpers/pipe_control_args.h"
|
||||
#include "shared/source/helpers/timestamp_packet.h"
|
||||
#include "shared/source/memory_manager/internal_allocation_storage.h"
|
||||
|
@ -20,7 +21,6 @@
|
|||
#include "shared/source/memory_manager/unified_memory_manager.h"
|
||||
#include "shared/source/os_interface/os_context.h"
|
||||
#include "shared/source/program/sync_buffer_handler.h"
|
||||
#include "shared/source/program/sync_buffer_handler.inl"
|
||||
#include "shared/source/utilities/range.h"
|
||||
#include "shared/source/utilities/tag_allocator.h"
|
||||
|
||||
|
@ -534,7 +534,8 @@ void CommandQueueHw<GfxFamily>::processDispatchForKernels(const MultiDispatchInf
|
|||
auto &lws = multiDispatchInfo.begin()->getLocalWorkgroupSize();
|
||||
size_t workGroupsCount = (gws.x * gws.y * gws.z) /
|
||||
(lws.x * lws.y * lws.z);
|
||||
device->getDevice().syncBufferHandler->prepareForEnqueue(workGroupsCount, *multiDispatchInfo.peekMainKernel());
|
||||
auto patchData = KernelHelper::getSyncBufferAllocationOffset(device->getDevice(), workGroupsCount);
|
||||
multiDispatchInfo.peekMainKernel()->patchSyncBuffer(patchData.first, patchData.second);
|
||||
}
|
||||
|
||||
if (event && this->isProfilingEnabled()) {
|
||||
|
|
|
@ -11,12 +11,10 @@
|
|||
#include "shared/source/device/device.h"
|
||||
#include "shared/source/execution_environment/root_device_environment.h"
|
||||
#include "shared/source/helpers/basic_math.h"
|
||||
#include "shared/source/helpers/constants.h"
|
||||
#include "shared/source/helpers/debug_helpers.h"
|
||||
#include "shared/source/helpers/gfx_core_helper.h"
|
||||
#include "shared/source/helpers/hw_info.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include "shared/source/program/sync_buffer_handler.h"
|
||||
|
||||
namespace NEO {
|
||||
|
||||
|
@ -125,4 +123,20 @@ bool KernelHelper::isAnyArgumentPtrByValue(const KernelDescriptor &kernelDescrip
|
|||
return false;
|
||||
}
|
||||
|
||||
// Returns the (allocation, offset) pair for a region group barrier buffer.
//
// The required size is computed with getRegionGroupBarrierSize() from
// `threadGroupCount` and `localRegionSize`, and the request is forwarded to
// the device's sync buffer handler via obtainAllocationAndOffset().
//
// @param device           device whose syncBufferHandler serves the request
// @param threadGroupCount total number of thread groups in the dispatch
// @param localRegionSize  region size; used as a divisor by
//                         getRegionGroupBarrierSize(), so it must be non-zero
// @return pair of graphics allocation and the offset handed out with it
std::pair<GraphicsAllocation *, size_t> KernelHelper::getRegionGroupBarrierAllocationOffset(Device &device, const size_t threadGroupCount, const size_t localRegionSize) {
|
||||
// Called before syncBufferHandler is dereferenced below; presumably creates
// the handler on first use - TODO confirm in Device.
device.allocateSyncBufferHandler();
|
||||
|
||||
size_t size = KernelHelper::getRegionGroupBarrierSize(threadGroupCount, localRegionSize);
|
||||
|
||||
return device.syncBufferHandler->obtainAllocationAndOffset(size);
|
||||
}
|
||||
|
||||
// Returns the (allocation, offset) pair for a sync buffer.
//
// The required size is computed with getSyncBufferSize() from
// `requestedNumberOfWorkgroups`, and the request is forwarded to the
// device's sync buffer handler via obtainAllocationAndOffset().
//
// @param device                      device whose syncBufferHandler serves
//                                    the request
// @param requestedNumberOfWorkgroups number of work groups the buffer is
//                                    sized for
// @return pair of graphics allocation and the offset handed out with it
std::pair<GraphicsAllocation *, size_t> KernelHelper::getSyncBufferAllocationOffset(Device &device, const size_t requestedNumberOfWorkgroups) {
|
||||
// Called before syncBufferHandler is dereferenced below; presumably creates
// the handler on first use - TODO confirm in Device.
device.allocateSyncBufferHandler();
|
||||
|
||||
size_t requiredSize = KernelHelper::getSyncBufferSize(requestedNumberOfWorkgroups);
|
||||
|
||||
return device.syncBufferHandler->obtainAllocationAndOffset(requiredSize);
|
||||
}
|
||||
|
||||
} // namespace NEO
|
||||
|
|
|
@ -7,14 +7,18 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
#include "shared/source/helpers/aligned_memory.h"
|
||||
#include "shared/source/helpers/constants.h"
|
||||
#include "shared/source/helpers/definitions/engine_group_types.h"
|
||||
#include "shared/source/kernel/kernel_descriptor.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
|
||||
namespace NEO {
|
||||
class Device;
|
||||
class GraphicsAllocation;
|
||||
struct RootDeviceEnvironment;
|
||||
|
||||
struct KernelHelper {
|
||||
|
@ -39,6 +43,17 @@ struct KernelHelper {
|
|||
static ErrorCode checkIfThereIsSpaceForScratchOrPrivate(KernelDescriptor::KernelAttributes attributes, Device *device);
|
||||
|
||||
static bool isAnyArgumentPtrByValue(const KernelDescriptor &kernelDescriptor);
|
||||
|
||||
// Computes the size requested for a region group barrier buffer:
//   (threadGroupCount / localRegionSize) * (localRegionSize + 1) * 2 * sizeof(uint32_t)
// passed through alignUp() with MemoryConstants::cacheLineSize.
//
// The division is an integer division, so a threadGroupCount that is not a
// multiple of localRegionSize is truncated. localRegionSize must be non-zero
// (it is used as the divisor).
static inline size_t getRegionGroupBarrierSize(const size_t threadGroupCount, const size_t localRegionSize) {
|
||||
return alignUp((threadGroupCount / localRegionSize) * (localRegionSize + 1) * 2 * sizeof(uint32_t), MemoryConstants::cacheLineSize);
|
||||
}
|
||||
|
||||
static std::pair<GraphicsAllocation *, size_t> getRegionGroupBarrierAllocationOffset(Device &device, const size_t threadGroupCount, const size_t localRegionSize);
|
||||
|
||||
// Computes the size requested for a sync buffer: the larger of
// `requestedNumberOfWorkgroups` and CommonConstants::minimalSyncBufferSize,
// passed through alignUp() with CommonConstants::maximalSizeOfAtomicType.
// Both constants are cast to size_t so std::max and alignUp operate on one type.
static inline size_t getSyncBufferSize(const size_t requestedNumberOfWorkgroups) {
|
||||
return alignUp(std::max(requestedNumberOfWorkgroups, static_cast<size_t>(CommonConstants::minimalSyncBufferSize)), static_cast<size_t>(CommonConstants::maximalSizeOfAtomicType));
|
||||
}
|
||||
static std::pair<GraphicsAllocation *, size_t> getSyncBufferAllocationOffset(Device &device, const size_t requestedNumberOfWorkgroups);
|
||||
};
|
||||
|
||||
} // namespace NEO
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
#
|
||||
# Copyright (C) 2019-2023 Intel Corporation
|
||||
# Copyright (C) 2019-2024 Intel Corporation
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
#
|
||||
|
@ -21,7 +21,6 @@ set(NEO_CORE_PROGRAM
|
|||
${CMAKE_CURRENT_SOURCE_DIR}/program_initialization.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/sync_buffer_handler.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/sync_buffer_handler.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/sync_buffer_handler.inl
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/work_size_info.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/work_size_info.h
|
||||
)
|
||||
|
|
|
@ -25,8 +25,6 @@ class SyncBufferHandler : NonCopyableOrMovableClass {
|
|||
|
||||
SyncBufferHandler(Device &device);
|
||||
|
||||
template <typename KernelT>
|
||||
void prepareForEnqueue(size_t workGroupsCount, KernelT &kernel);
|
||||
void makeResident(CommandStreamReceiver &csr);
|
||||
|
||||
std::pair<GraphicsAllocation *, size_t> obtainAllocationAndOffset(size_t requiredSize);
|
||||
|
|
|
@ -1,17 +0,0 @@
|
|||
/*
|
||||
* Copyright (C) 2021-2024 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#include "shared/source/memory_manager/memory_manager.h"
|
||||
|
||||
// Sizes a sync buffer for `workGroupsCount` work groups, obtains an
// (allocation, offset) pair from this handler and patches it into `kernel`.
//
// KernelT only needs to provide patchSyncBuffer(allocation, offset).
template <typename KernelT>
|
||||
void NEO::SyncBufferHandler::prepareForEnqueue(size_t workGroupsCount, KernelT &kernel) {
|
||||
// At least minimalSyncBufferSize, aligned with maximalSizeOfAtomicType.
auto requiredSize = alignUp(std::max(workGroupsCount, static_cast<size_t>(CommonConstants::minimalSyncBufferSize)), CommonConstants::maximalSizeOfAtomicType);
|
||||
|
||||
auto patchData = obtainAllocationAndOffset(requiredSize);
|
||||
|
||||
kernel.patchSyncBuffer(patchData.first, patchData.second);
|
||||
}
|
|
@ -283,3 +283,34 @@ TEST_F(KernelHelperTest, GivenPtrByValueWhenCheckingIsAnyArgumentPtrByValueThenT
|
|||
kernelDescriptor.payloadMappings.explicitArgs.push_back(valueArg);
|
||||
EXPECT_TRUE(KernelHelper::isAnyArgumentPtrByValue(kernelDescriptor));
|
||||
}
|
||||
|
||||
// Verifies KernelHelper::getSyncBufferAllocationOffset(): the first request
// yields a non-null allocation at offset 0, and a second identical request
// yields the same allocation at an offset equal to getSyncBufferSize().
TEST_F(KernelHelperTest, GivenThreadGroupCountWhenSyncBufferCreatedThenAllocationIsRetrieved) {
|
||||
const size_t requestedNumberOfWorkgroups = 4;
|
||||
// Size of one request; expected to be the offset of the second request.
auto offset = KernelHelper::getSyncBufferSize(requestedNumberOfWorkgroups);
|
||||
|
||||
auto pair = KernelHelper::getSyncBufferAllocationOffset(*pDevice, requestedNumberOfWorkgroups);
|
||||
auto allocation = pair.first;
|
||||
|
||||
EXPECT_EQ(0u, pair.second);
|
||||
EXPECT_NE(nullptr, allocation);
|
||||
|
||||
// Second request: same allocation, placed right after the first one.
pair = KernelHelper::getSyncBufferAllocationOffset(*pDevice, requestedNumberOfWorkgroups);
|
||||
EXPECT_EQ(offset, pair.second);
|
||||
EXPECT_EQ(allocation, pair.first);
|
||||
}
|
||||
|
||||
// Verifies KernelHelper::getRegionGroupBarrierAllocationOffset(): the first
// request yields a non-null allocation at offset 0, and a second identical
// request yields the same allocation at an offset equal to
// getRegionGroupBarrierSize().
TEST_F(KernelHelperTest, GivenThreadGroupCountAndRegionSizeWhenRegionBarrierCreatedThenAllocationIsRetrieved) {
|
||||
const size_t requestedNumberOfWorkgroups = 4;
|
||||
const size_t localRegionSize = 2;
|
||||
// Size of one request; expected to be the offset of the second request.
auto offset = KernelHelper::getRegionGroupBarrierSize(requestedNumberOfWorkgroups, localRegionSize);
|
||||
|
||||
auto pair = KernelHelper::getRegionGroupBarrierAllocationOffset(*pDevice, requestedNumberOfWorkgroups, localRegionSize);
|
||||
auto allocation = pair.first;
|
||||
|
||||
EXPECT_EQ(0u, pair.second);
|
||||
EXPECT_NE(nullptr, allocation);
|
||||
|
||||
// Second request: same allocation, placed right after the first one.
pair = KernelHelper::getRegionGroupBarrierAllocationOffset(*pDevice, requestedNumberOfWorkgroups, localRegionSize);
|
||||
EXPECT_EQ(offset, pair.second);
|
||||
EXPECT_EQ(allocation, pair.first);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue