feature: create single temporary allocation for bcs split

Related-To: NEO-14557

Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
This commit is contained in:
Bartosz Dunajski
2025-09-04 14:12:29 +00:00
committed by Compute-Runtime-Automation
parent aa2e76cae7
commit 6191f5aec8
6 changed files with 65 additions and 11 deletions

View File

@@ -33,6 +33,7 @@
#include "shared/source/helpers/register_offsets.h"
#include "shared/source/helpers/state_base_address_helper.h"
#include "shared/source/helpers/surface_format_info.h"
#include "shared/source/helpers/validators.h"
#include "shared/source/indirect_heap/indirect_heap.h"
#include "shared/source/memory_manager/allocation_properties.h"
#include "shared/source/memory_manager/graphics_allocation.h"
@@ -1816,8 +1817,9 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
callId);
}
auto dstAllocationStruct = getAlignedAllocationData(this->device, sharedSystemEnabled, dstptr, size, false, isCopyOffloadEnabled());
auto srcAllocationStruct = getAlignedAllocationData(this->device, sharedSystemEnabled, srcptr, size, true, isCopyOffloadEnabled());
auto allocSize = NEO::getIfValid(memoryCopyParams.bcsSplitTotalDstSize, size);
auto dstAllocationStruct = getAlignedAllocationData(this->device, sharedSystemEnabled, NEO::getIfValid(memoryCopyParams.bcsSplitBaseDstPtr, dstptr), allocSize, false, isCopyOffloadEnabled());
auto srcAllocationStruct = getAlignedAllocationData(this->device, sharedSystemEnabled, NEO::getIfValid(memoryCopyParams.bcsSplitBaseSrcPtr, srcptr), allocSize, true, isCopyOffloadEnabled());
if ((dstAllocationStruct.alloc == nullptr || srcAllocationStruct.alloc == nullptr) && (sharedSystemEnabled == false)) {
return ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY;
@@ -2054,11 +2056,11 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyRegion(void *d
callId);
}
size_t dstSize = this->getTotalSizeForCopyRegion(dstRegion, dstPitch, dstSlicePitch);
size_t srcSize = this->getTotalSizeForCopyRegion(srcRegion, srcPitch, srcSlicePitch);
size_t dstAllocSize = NEO::getIfValid(memoryCopyParams.bcsSplitTotalDstSize, this->getTotalSizeForCopyRegion(dstRegion, dstPitch, dstSlicePitch));
size_t srcAllocSize = NEO::getIfValid(memoryCopyParams.bcsSplitTotalSrcSize, this->getTotalSizeForCopyRegion(srcRegion, srcPitch, srcSlicePitch));
auto dstAllocationStruct = getAlignedAllocationData(this->device, false, dstPtr, dstSize, false, isCopyOffloadEnabled());
auto srcAllocationStruct = getAlignedAllocationData(this->device, false, srcPtr, srcSize, true, isCopyOffloadEnabled());
auto dstAllocationStruct = getAlignedAllocationData(this->device, false, NEO::getIfValid(memoryCopyParams.bcsSplitBaseDstPtr, dstPtr), dstAllocSize, false, isCopyOffloadEnabled());
auto srcAllocationStruct = getAlignedAllocationData(this->device, false, NEO::getIfValid(memoryCopyParams.bcsSplitBaseSrcPtr, srcPtr), srcAllocSize, true, isCopyOffloadEnabled());
UNRECOVERABLE_IF(srcSlicePitch && srcPitch == 0);
Vec3<size_t> srcSize3 = {srcPitch ? srcPitch : srcRegion->width + srcRegion->originX,

View File

@@ -252,7 +252,7 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily<gfxCoreFami
ze_result_t appendStagingMemoryCopy(void *dstptr, const void *srcptr, size_t size, ze_event_handle_t hSignalEvent, CmdListMemoryCopyParams &memoryCopyParams);
ze_result_t stagingStatusToL0(const NEO::StagingTransferStatus &status) const;
size_t estimateAdditionalSizeAppendRegularCommandLists(uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists);
void setupFlagsForBcsSplit(CmdListMemoryCopyParams &memoryCopyParams, bool &hasStallingCmds, bool &copyOffloadFlush);
void setupFlagsForBcsSplit(CmdListMemoryCopyParams &memoryCopyParams, bool &hasStallingCmds, bool &copyOffloadFlush, const void *srcPtr, void *dstPtr, size_t srcSize, size_t dstSize);
MOCKABLE_VIRTUAL void checkAssert();
ComputeFlushMethodType computeFlushMethod = nullptr;

View File

@@ -644,13 +644,18 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendBarrier(ze_even
}
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamilyImmediate<gfxCoreFamily>::setupFlagsForBcsSplit(CmdListMemoryCopyParams &memoryCopyParams, bool &hasStallingCmds, bool &copyOffloadFlush) {
void CommandListCoreFamilyImmediate<gfxCoreFamily>::setupFlagsForBcsSplit(CmdListMemoryCopyParams &memoryCopyParams, bool &hasStallingCmds, bool &copyOffloadFlush, const void *srcPtr, void *dstPtr, size_t srcSize, size_t dstSize) {
// Fills memoryCopyParams and the two caller-owned flags before a copy is handed to the split path.
// Note the argument order: source pointer/size come before destination pointer/size.
memoryCopyParams.relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(1, false); // split generates more than 1 event
memoryCopyParams.forceDisableCopyOnlyInOrderSignaling = true;
memoryCopyParams.taskCountUpdateRequired = true;
memoryCopyParams.copyOffloadAllowed = this->isCopyOffloadEnabled();
// Both output flags are derived from the values just stored in memoryCopyParams.
copyOffloadFlush = memoryCopyParams.copyOffloadAllowed;
hasStallingCmds = !memoryCopyParams.relaxedOrderingDispatch;
// Record the base pointers and total sizes of the whole copy request. The base appendMemoryCopy /
// appendMemoryCopyRegion pass these (through NEO::getIfValid) to getAlignedAllocationData in place of
// the per-call pointer and size they received.
memoryCopyParams.bcsSplitBaseDstPtr = dstPtr;
memoryCopyParams.bcsSplitBaseSrcPtr = srcPtr;
memoryCopyParams.bcsSplitTotalDstSize = dstSize;
memoryCopyParams.bcsSplitTotalSrcSize = srcSize;
}
template <GFXCORE_FAMILY gfxCoreFamily>
@@ -689,7 +694,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy(
NEO::TransferDirection direction;
auto isSplitNeeded = this->isAppendSplitNeeded(dstptr, srcptr, size, direction);
if (isSplitNeeded) {
setupFlagsForBcsSplit(memoryCopyParams, hasStallingCmds, copyOffloadFlush);
setupFlagsForBcsSplit(memoryCopyParams, hasStallingCmds, copyOffloadFlush, srcptr, dstptr, size, size);
auto splitCall = [&](CommandListCoreFamilyImmediate<gfxCoreFamily> *subCmdList, void *dstptrParam, const void *srcptrParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) {
return subCmdList->CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(dstptrParam, srcptrParam, sizeParam, hSignalEventParam, 0u, nullptr, memoryCopyParams);
@@ -742,7 +747,9 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopyRegio
NEO::TransferDirection direction;
auto isSplitNeeded = this->isAppendSplitNeeded(dstPtr, srcPtr, this->getTotalSizeForCopyRegion(dstRegion, dstPitch, dstSlicePitch), direction);
if (isSplitNeeded) {
setupFlagsForBcsSplit(memoryCopyParams, hasStallingCmds, copyOffloadFlush);
setupFlagsForBcsSplit(memoryCopyParams, hasStallingCmds, copyOffloadFlush, srcPtr, dstPtr,
this->getTotalSizeForCopyRegion(srcRegion, srcPitch, srcSlicePitch),
this->getTotalSizeForCopyRegion(dstRegion, dstPitch, dstSlicePitch));
auto splitCall = [&](CommandListCoreFamilyImmediate<gfxCoreFamily> *subCmdList, uint32_t dstOriginXParam, uint32_t srcOriginXParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) {
ze_copy_region_t dstRegionLocal = {};

View File

@@ -7,8 +7,14 @@
#pragma once
#include <cstddef>
namespace L0 {
struct CmdListMemoryCopyParams {
const void *bcsSplitBaseSrcPtr = nullptr;
void *bcsSplitBaseDstPtr = nullptr;
size_t bcsSplitTotalSrcSize = 0;
size_t bcsSplitTotalDstSize = 0;
bool relaxedOrderingDispatch = false;
bool forceDisableCopyOnlyInOrderSignaling = false;
bool copyOffloadAllowed = false;

View File

@@ -1370,6 +1370,40 @@ HWTEST2_F(AggregatedBcsSplitTests, givenMarkerEventWhenCheckingCompletionThenRes
*cmdListHw->inOrderExecInfo->getBaseHostAddress() = 3;
}
HWTEST2_F(AggregatedBcsSplitTests, givenUserPtrWhenAppendCalledThenCreateOnlyOneTempAlloc, IsAtLeastXeHpcCore) {
    // Destination comes from the fixture helper; the source is the address of a local variable.
    auto ptr = allocHostMem();
    uint64_t hostPtr = 0;

    auto cmdListHw = static_cast<WhiteBox<L0::CommandListCoreFamilyImmediate<FamilyType::gfxCoreFamily>> *>(cmdList.get());

    auto memoryManager = device->getNEODevice()->getMemoryManager();
    auto &tempAllocList = memoryManager->getTemporaryAllocationsList();

    // Returns the number of nodes currently linked in the temporary allocations list.
    auto tempAllocCount = [&tempAllocList]() {
        uint32_t nodes = 0;
        for (auto node = tempAllocList.peekHead(); node; node = node->next) {
            ++nodes;
        }
        return nodes;
    };

    // Nothing is tracked before the first append.
    EXPECT_EQ(0u, tempAllocCount());

    // Linear copy: exactly one temporary allocation is expected afterwards.
    cmdListHw->appendMemoryCopy(ptr, &hostPtr, copySize, nullptr, 0, nullptr, copyParams);
    EXPECT_EQ(1u, tempAllocCount());

    // After synchronizing, the list is expected to be empty again.
    cmdListHw->hostSynchronize(1, true);
    EXPECT_EQ(0u, tempAllocCount());

    // Region copy over a single row of copySize bytes: again exactly one temporary allocation.
    const auto regionWidth = static_cast<uint32_t>(copySize);
    ze_copy_region_t region = {0, 0, 0, regionWidth, 1, 1};

    cmdListHw->appendMemoryCopyRegion(ptr, &region, 0, 0, &hostPtr, &region, 0, 0, nullptr, 0, nullptr, copyParams);
    EXPECT_EQ(1u, tempAllocCount());

    context->freeMem(ptr);
}
HWTEST2_F(AggregatedBcsSplitTests, givenFullCmdBufferWhenAppendCalledThenAllocateNewBuffer, IsAtLeastXeHpcCore) {
auto ptr = allocHostMem();