feature: remote copy support for bcs split

Related-To: NEO-14557

Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
This commit is contained in:
Bartosz Dunajski
2025-07-01 13:41:48 +00:00
committed by Compute-Runtime-Automation
parent 6acf58633d
commit e2dff82741
11 changed files with 87 additions and 18 deletions

View File

@@ -319,7 +319,8 @@ struct CommandListCoreFamily : public CommandListImp {
size_t getTotalSizeForCopyRegion(const ze_copy_region_t *region, uint32_t pitch, uint32_t slicePitch);
bool isAppendSplitNeeded(void *dstPtr, const void *srcPtr, size_t size, NEO::TransferDirection &directionOut);
bool isAppendSplitNeeded(NEO::MemoryPool dstPool, NEO::MemoryPool srcPool, size_t size, NEO::TransferDirection &directionOut);
bool isAppendSplitNeeded(NEO::MemoryPool dstPool, NEO::MemoryPool srcPool, size_t size, NEO::TransferDirection &directionOut, bool remoteCopy);
bool isAppendSplitRemote(NEO::SvmAllocationData *allocData, void *ptr) const;
void applyMemoryRangesBarrier(uint32_t numRanges, const size_t *pRangeSizes,
const void **pRanges);

View File

@@ -3924,6 +3924,18 @@ inline NEO::MemoryPool getMemoryPoolFromAllocDataForSplit(bool allocFound, const
return NEO::MemoryPool::memoryNull;
}
// Reports whether the allocation described by allocData must be treated as a
// remote resource from the point of view of this command list's device.
//
// allocData - SVM allocation data for ptr; may be nullptr when no allocation
//             was found, in which case the copy is never considered remote.
// ptr       - pointer that allocData was looked up for.
//
// The decision itself is delegated to DriverHandleImp::isRemoteResourceNeeded.
template <GFXCORE_FAMILY gfxCoreFamily>
bool CommandListCoreFamily<gfxCoreFamily>::isAppendSplitRemote(NEO::SvmAllocationData *allocData, void *ptr) const {
    auto driverHandleImp = static_cast<DriverHandleImp *>(this->device->getDriverHandle());

    if (!allocData) {
        return false;
    }

    // Graphics allocation registered for this device's root device index.
    // NOTE(review): this may be nullptr for memory owned by another root device;
    // isRemoteResourceNeeded is presumably expected to handle that - confirm.
    auto localGraphicsAllocation = allocData->gpuAllocations.getGraphicsAllocation(this->device->getRootDeviceIndex());

    return driverHandleImp->isRemoteResourceNeeded(ptr, localGraphicsAllocation, allocData, this->device);
}
template <GFXCORE_FAMILY gfxCoreFamily>
bool CommandListCoreFamily<gfxCoreFamily>::isAppendSplitNeeded(void *dstPtr, const void *srcPtr, size_t size, NEO::TransferDirection &directionOut) {
if (size < minimalSizeForBcsSplit) {
@@ -3943,12 +3955,14 @@ bool CommandListCoreFamily<gfxCoreFamily>::isAppendSplitNeeded(void *dstPtr, con
}
}
return this->isAppendSplitNeeded(dstMemoryPool, srcMemoryPool, size, directionOut);
bool remoteCopy = isAppendSplitRemote(srcAllocData, const_cast<void *>(srcPtr)) || isAppendSplitRemote(dstAllocData, dstPtr);
return this->isAppendSplitNeeded(dstMemoryPool, srcMemoryPool, size, directionOut, remoteCopy);
}
template <GFXCORE_FAMILY gfxCoreFamily>
inline bool CommandListCoreFamily<gfxCoreFamily>::isAppendSplitNeeded(NEO::MemoryPool dstPool, NEO::MemoryPool srcPool, size_t size, NEO::TransferDirection &directionOut) {
directionOut = NEO::createTransferDirection(!NEO::MemoryPoolHelper::isSystemMemoryPool(srcPool), !NEO::MemoryPoolHelper::isSystemMemoryPool(dstPool));
inline bool CommandListCoreFamily<gfxCoreFamily>::isAppendSplitNeeded(NEO::MemoryPool dstPool, NEO::MemoryPool srcPool, size_t size, NEO::TransferDirection &directionOut, bool remoteCopy) {
directionOut = NEO::createTransferDirection(!NEO::MemoryPoolHelper::isSystemMemoryPool(srcPool), !NEO::MemoryPoolHelper::isSystemMemoryPool(dstPool), remoteCopy);
return this->isBcsSplitNeeded &&
size >= minimalSizeForBcsSplit &&

View File

@@ -803,7 +803,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendPageFaultCopy(N
ze_result_t ret;
NEO::TransferDirection direction;
auto isSplitNeeded = this->isAppendSplitNeeded(dstAllocation->getMemoryPool(), srcAllocation->getMemoryPool(), size, direction);
auto isSplitNeeded = this->isAppendSplitNeeded(dstAllocation->getMemoryPool(), srcAllocation->getMemoryPool(), size, direction, false);
bool relaxedOrdering = false;

View File

@@ -150,6 +150,7 @@ struct Device : _ze_device_handle_t {
virtual ze_result_t getFabricVertex(ze_fabric_vertex_handle_t *phVertex) = 0;
virtual uint32_t getEventMaxPacketCount() const = 0;
virtual uint32_t getEventMaxKernelCount() const = 0;
virtual void bcsSplitReleaseResources() = 0;
NEO::TagAllocatorBase *getDeviceInOrderCounterAllocator();
NEO::TagAllocatorBase *getHostInOrderCounterAllocator();
NEO::TagAllocatorBase *getInOrderTimestampAllocator();

View File

@@ -73,6 +73,10 @@ namespace L0 {
// Default constructor: the bcsSplit member is initialized with this device instance.
DeviceImp::DeviceImp() : bcsSplit(*this) {}
void DeviceImp::bcsSplitReleaseResources() {
bcsSplit.releaseResources();
}
// Returns the driver handle stored in this device's driverHandle member.
DriverHandle *DeviceImp::getDriverHandle() {
return this->driverHandle;
}

View File

@@ -179,6 +179,7 @@ struct DeviceImp : public Device, NEO::NonCopyableAndNonMovableClass {
NEO::EngineGroupType getInternalEngineGroupType();
uint32_t getCopyEngineOrdinal() const;
std::optional<uint32_t> tryGetCopyEngineOrdinal() const;
void bcsSplitReleaseResources() override;
protected:
ze_result_t queryPeerAccess(DeviceImp *peerDevice);

View File

@@ -214,6 +214,11 @@ ze_result_t DriverHandleImp::getExtensionProperties(uint32_t *pCount,
}
DriverHandleImp::~DriverHandleImp() {
for (auto &device : this->devices) {
// release temporary pointers before default context destruction
device->bcsSplitReleaseResources();
}
if (this->defaultContext) {
L0::Context::fromHandle(this->defaultContext)->destroy();
this->defaultContext = nullptr;

View File

@@ -80,6 +80,7 @@ struct MockDevice : public Device {
ADDMETHOD_NOBASE(getDebugProperties, ze_result_t, ZE_RESULT_SUCCESS, (zet_device_debug_properties_t * properties));
ADDMETHOD_NOBASE(getDebugSession, DebugSession *, nullptr, (const zet_debug_config_t &config));
ADDMETHOD_NOBASE_VOIDRETURN(removeDebugSession, ());
ADDMETHOD_NOBASE_VOIDRETURN(bcsSplitReleaseResources, ());
ADDMETHOD_NOBASE(obtainReusableAllocation, NEO::GraphicsAllocation *, nullptr, (size_t requiredSize, NEO::AllocationType type))
ADDMETHOD_NOBASE_VOIDRETURN(storeReusableAllocation, (NEO::GraphicsAllocation & alloc));
ADDMETHOD_NOBASE(getFabricVertex, ze_result_t, ZE_RESULT_SUCCESS, (ze_fabric_vertex_handle_t * phVertex));

View File

@@ -936,28 +936,31 @@ struct AggregatedBcsSplitTests : public ::testing::Test {
debugManager.flags.SplitBcsRequiredEnginesCount.set(expectedEnginesCount);
debugManager.flags.SplitBcsMask.set(0b11110);
device = createDevice();
createDevice();
context = Context::fromHandle(driverHandle->getDefaultContext());
cmdList = createCmdList();
}
std::unique_ptr<L0::Device> createDevice() {
ze_result_t returnValue;
void createDevice() {
auto hwInfo = *NEO::defaultHwInfo;
hwInfo.featureTable.ftrBcsInfo = 0b111111111;
hwInfo.capabilityTable.blitterOperationsSupported = true;
auto neoDevice = NEO::MockDevice::createWithNewExecutionEnvironment<NEO::MockDevice>(&hwInfo);
auto neoDevice = NEO::MockDevice::createWithNewExecutionEnvironment<NEO::MockDevice>(&hwInfo, 0);
NEO::DeviceVector devices;
devices.push_back(std::unique_ptr<NEO::Device>(neoDevice));
for (uint32_t i = 1; i < expectedNumRootDevices; i++) {
auto neoRootDevice = NEO::MockDevice::createWithExecutionEnvironment<NEO::MockDevice>(&hwInfo, neoDevice->getExecutionEnvironment(), i);
devices.push_back(std::unique_ptr<NEO::Device>(neoRootDevice));
}
driverHandle = std::make_unique<Mock<L0::DriverHandleImp>>();
driverHandle->initialize(std::move(devices));
auto device = std::unique_ptr<L0::Device>(L0::Device::create(driverHandle.get(), neoDevice, false, &returnValue));
this->device = driverHandle->devices[0];
bcsSplit = &static_cast<DeviceImp *>(device.get())->bcsSplit;
return device;
bcsSplit = &static_cast<DeviceImp *>(device)->bcsSplit;
}
uint32_t queryCopyOrdinal() {
@@ -987,7 +990,7 @@ struct AggregatedBcsSplitTests : public ::testing::Test {
desc.ordinal = queryCopyOrdinal();
std::unique_ptr<L0::CommandList> commandList(CommandList::createImmediate(productFamily,
device.get(),
device,
&desc,
false,
NEO::EngineGroupType::copy,
@@ -1004,16 +1007,25 @@ struct AggregatedBcsSplitTests : public ::testing::Test {
return alloc;
}
// Allocates copySize bytes of device memory on the given device through the
// fixture's context and returns the resulting pointer.
// Note: the parameter intentionally names the target device and shadows the
// fixture's own `device` member inside this helper.
// A failed allocation is reported as a non-fatal test failure (EXPECT_EQ).
void *allocDeviceMem(L0::Device *device) {
    ze_device_mem_alloc_desc_t allocDesc = {};
    void *devicePtr = nullptr;

    // 4096u is presumably the requested alignment - confirm against Context::allocDeviceMem.
    const ze_result_t status = context->allocDeviceMem(device->toHandle(), &allocDesc, copySize, 4096u, &devicePtr);
    EXPECT_EQ(ZE_RESULT_SUCCESS, status);

    return devicePtr;
}
DebugManagerStateRestore restore;
CmdListMemoryCopyParams copyParams = {};
std::unique_ptr<Mock<L0::DriverHandleImp>> driverHandle;
std::unique_ptr<L0::Device> device;
L0::Device *device = nullptr;
std::unique_ptr<L0::CommandList> cmdList;
BcsSplit *bcsSplit = nullptr;
Context *context = nullptr;
const size_t copySize = 4 * MemoryConstants::megaByte;
uint32_t expectedTileCount = 1;
uint32_t expectedEnginesCount = 4;
uint32_t expectedNumRootDevices = 1;
};
HWTEST2_F(AggregatedBcsSplitTests, whenObtainCalledThenAggregatedEventsCreated, IsAtLeastXeHpcCore) {
@@ -1137,6 +1149,31 @@ HWTEST2_F(AggregatedBcsSplitTests, givenMarkerEventWhenCheckingCompletionThenRes
context->freeMem(ptr);
}
// Fixture variant of AggregatedBcsSplitTests configured for two root devices.
struct MultiRootAggregatedBcsSplitTests : public AggregatedBcsSplitTests {
void SetUp() override {
// Both settings are applied before delegating to the base SetUp():
// createDevice() in the base fixture reads expectedNumRootDevices to decide
// how many root devices to create (the base SetUp() appears to call it).
expectedNumRootDevices = 2;
debugManager.flags.CreateMultipleRootDevices.set(expectedNumRootDevices);
AggregatedBcsSplitTests::SetUp();
}
};
// Copies between host memory and device memory allocated on the second root
// device, using the fixture's command list (created on driverHandle->devices[0]).
// After each copy the command list's in-order counter is expected to match the
// base signal value of the corresponding BCS split marker event.
HWTEST2_F(MultiRootAggregatedBcsSplitTests, givenRemoteAllocWhenCopyRequestedThenEnableSplit, IsAtLeastXeHpcCore) {
    auto remoteDevice = driverHandle->devices[1];

    auto hostPtr = allocHostMem();
    auto remotePtr = allocDeviceMem(remoteDevice);

    auto immediateCmdList = static_cast<WhiteBox<L0::CommandListCoreFamilyImmediate<FamilyType::gfxCoreFamily>> *>(cmdList.get());

    // Host -> remote device memory: first marker event.
    immediateCmdList->appendMemoryCopy(remotePtr, hostPtr, copySize, nullptr, 0, nullptr, copyParams);
    EXPECT_EQ(immediateCmdList->inOrderExecInfo->getCounterValue(), bcsSplit->events.marker[0]->getInOrderExecBaseSignalValue());

    // Remote device memory -> host: second marker event.
    immediateCmdList->appendMemoryCopy(hostPtr, remotePtr, copySize, nullptr, 0, nullptr, copyParams);
    EXPECT_EQ(immediateCmdList->inOrderExecInfo->getCounterValue(), bcsSplit->events.marker[1]->getInOrderExecBaseSignalValue());

    context->freeMem(hostPtr);
    context->freeMem(remotePtr);
}
struct MultiTileAggregatedBcsSplitTests : public AggregatedBcsSplitTests {
void SetUp() override {
expectedTileCount = 2;

View File

@@ -45,7 +45,7 @@ struct CsrSelectionArgs {
if (dst) {
processResource(*dst, rootDeviceIndex, this->dstResource);
}
this->direction = createTransferDirection(srcResource.isLocal, dstResource.isLocal);
this->direction = createTransferDirection(srcResource.isLocal, dstResource.isLocal, false);
}
CsrSelectionArgs(cl_command_type cmdType, Image *src, Image *dst, uint32_t rootDeviceIndex, const size_t *size, const size_t *srcOrigin, const size_t *dstOrigin)

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2023 Intel Corporation
* Copyright (C) 2023-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -13,9 +13,14 @@ enum class TransferDirection {
hostToLocal,
localToHost,
localToLocal,
remote,
};
inline TransferDirection createTransferDirection(bool srcLocal, bool dstLocal) {
inline TransferDirection createTransferDirection(bool srcLocal, bool dstLocal, bool remoteCopy) {
if (remoteCopy) {
return TransferDirection::remote;
}
if (srcLocal) {
if (dstLocal) {
return TransferDirection::localToLocal;