refactor: bcs split class structure

Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
Author: Bartosz Dunajski
Date: 2025-10-08 14:53:49 +00:00
Committed by: Compute-Runtime-Automation
Commit: 0597f064e5 (parent: 2aabe27531)
8 changed files with 266 additions and 205 deletions

View File

@@ -718,12 +718,14 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy(
if (isSplitNeeded) {
setupFlagsForBcsSplit(memoryCopyParams, hasStallingCmds, copyOffloadFlush, srcptr, dstptr, size, size);
auto splitCall = [&](CommandListCoreFamilyImmediate<gfxCoreFamily> *subCmdList, void *dstptrParam, const void *srcptrParam, size_t sizeParam, ze_event_handle_t hSignalEventParam, uint64_t aggregatedEventIncValue) {
auto splitCall = [&](CommandListCoreFamilyImmediate<gfxCoreFamily> *subCmdList, const BcsSplitParams::CopyParams &copyParams, size_t sizeParam, ze_event_handle_t hSignalEventParam, uint64_t aggregatedEventIncValue) {
memoryCopyParams.forceAggregatedEventIncValue = aggregatedEventIncValue;
return subCmdList->CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(dstptrParam, srcptrParam, sizeParam, hSignalEventParam, 0u, nullptr, memoryCopyParams);
auto &params = std::get<BcsSplitParams::MemCopy>(copyParams);
return subCmdList->CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(params.dst, params.src, sizeParam, hSignalEventParam, 0u, nullptr, memoryCopyParams);
};
ret = static_cast<DeviceImp *>(this->device)->bcsSplit->appendSplitCall<gfxCoreFamily, void *, const void *>(this, dstptr, srcptr, size, hSignalEvent, numWaitEvents, phWaitEvents, true, memoryCopyParams.relaxedOrderingDispatch, direction, estimatedSize, splitCall);
BcsSplitParams::CopyParams copyParams = BcsSplitParams::MemCopy{dstptr, srcptr};
ret = static_cast<DeviceImp *>(this->device)->bcsSplit->appendSplitCall<gfxCoreFamily>(this, copyParams, size, hSignalEvent, numWaitEvents, phWaitEvents, true, memoryCopyParams.relaxedOrderingDispatch, direction, estimatedSize, splitCall);
} else if (this->isValidForStagingTransfer(dstptr, srcptr, size, numWaitEvents > 0)) {
return this->appendStagingMemoryCopy(dstptr, srcptr, size, hSignalEvent, memoryCopyParams);
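Below, the split call shape in miniature: the destination and source pointers are packed into a single params object and the per-engine callback unpacks them, as the rewritten splitCall lambda does with std::get<BcsSplitParams::MemCopy>. This is a standalone sketch with simplified placeholder types, not the driver's real classes or signatures.

#include <cstddef>
#include <functional>
#include <variant>

struct MemCopy { void *dst = nullptr; const void *src = nullptr; };
struct RegionCopy { unsigned dst = 0; unsigned src = 0; }; // dst/src originX
using CopyParams = std::variant<MemCopy, RegionCopy>;
using AppendCall = std::function<int(const CopyParams &, std::size_t)>;

// Stand-in for BcsSplit::appendSplitCall: the real version splits size across engines
// and offsets the params per chunk before invoking the callback.
static int appendSplitCall(const CopyParams &copyParams, std::size_t size, const AppendCall &appendCall) {
    return appendCall(copyParams, size);
}

int main() {
    char src[64] = {};
    char dst[64] = {};
    CopyParams copyParams = MemCopy{dst, src};
    auto splitCall = [](const CopyParams &params, std::size_t chunkSize) {
        auto &memCopy = std::get<MemCopy>(params); // unpack the active alternative
        return (memCopy.dst && memCopy.src && chunkSize) ? 0 : 1;
    };
    return appendSplitCall(copyParams, sizeof(dst), splitCall);
}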
@@ -774,14 +776,16 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopyRegio
this->getTotalSizeForCopyRegion(srcRegion, srcPitch, srcSlicePitch),
this->getTotalSizeForCopyRegion(dstRegion, dstPitch, dstSlicePitch));
auto splitCall = [&](CommandListCoreFamilyImmediate<gfxCoreFamily> *subCmdList, uint32_t dstOriginXParam, uint32_t srcOriginXParam, size_t sizeParam, ze_event_handle_t hSignalEventParam, uint64_t aggregatedEventIncValue) {
auto splitCall = [&](CommandListCoreFamilyImmediate<gfxCoreFamily> *subCmdList, const BcsSplitParams::CopyParams &copyParams, size_t sizeParam, ze_event_handle_t hSignalEventParam, uint64_t aggregatedEventIncValue) {
auto &params = std::get<BcsSplitParams::RegionCopy>(copyParams);
ze_copy_region_t dstRegionLocal = {};
ze_copy_region_t srcRegionLocal = {};
memcpy(&dstRegionLocal, dstRegion, sizeof(ze_copy_region_t));
memcpy(&srcRegionLocal, srcRegion, sizeof(ze_copy_region_t));
dstRegionLocal.originX = dstOriginXParam;
dstRegionLocal.originX = params.dst;
dstRegionLocal.width = static_cast<uint32_t>(sizeParam);
srcRegionLocal.originX = srcOriginXParam;
srcRegionLocal.originX = params.src;
srcRegionLocal.width = static_cast<uint32_t>(sizeParam);
memoryCopyParams.forceAggregatedEventIncValue = aggregatedEventIncValue;
return subCmdList->CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyRegion(dstPtr, &dstRegionLocal, dstPitch, dstSlicePitch,
@@ -789,7 +793,8 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopyRegio
hSignalEventParam, 0u, nullptr, memoryCopyParams);
};
ret = static_cast<DeviceImp *>(this->device)->bcsSplit->appendSplitCall<gfxCoreFamily, uint32_t, uint32_t>(this, dstRegion->originX, srcRegion->originX, dstRegion->width, hSignalEvent, numWaitEvents, phWaitEvents, true, memoryCopyParams.relaxedOrderingDispatch, direction, estimatedSize, splitCall);
BcsSplitParams::CopyParams copyParams = BcsSplitParams::RegionCopy{dstRegion->originX, srcRegion->originX};
ret = static_cast<DeviceImp *>(this->device)->bcsSplit->appendSplitCall<gfxCoreFamily>(this, copyParams, dstRegion->width, hSignalEvent, numWaitEvents, phWaitEvents, true, memoryCopyParams.relaxedOrderingDispatch, direction, estimatedSize, splitCall);
} else {
ret = CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyRegion(dstPtr, dstRegion, dstPitch, dstSlicePitch,
srcPtr, srcRegion, srcPitch, srcSlicePitch,
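For the region flavour, only the two originX values travel through the params; each engine patches a local copy of the region descriptors before issuing its chunk. A reduced sketch of that step, with ze_copy_region_t cut down to the fields the lambda touches:

#include <cstddef>
#include <cstdint>
#include <cstring>

// Reduced stand-in for ze_copy_region_t, keeping only the fields the split path uses.
struct CopyRegion { uint32_t originX = 0; uint32_t originY = 0; uint32_t width = 0; uint32_t height = 0; };
struct RegionCopyParams { uint32_t dst = 0; uint32_t src = 0; }; // per-chunk originX values

// Mirrors the body of the splitCall lambda: copy the caller's regions, then patch the
// X origin and width for the chunk handled by one engine.
static void buildLocalRegions(const CopyRegion &dstRegion, const CopyRegion &srcRegion,
                              const RegionCopyParams &params, std::size_t chunkWidth,
                              CopyRegion &dstLocal, CopyRegion &srcLocal) {
    std::memcpy(&dstLocal, &dstRegion, sizeof(CopyRegion));
    std::memcpy(&srcLocal, &srcRegion, sizeof(CopyRegion));
    dstLocal.originX = params.dst;
    srcLocal.originX = params.src;
    dstLocal.width = static_cast<uint32_t>(chunkWidth);
    srcLocal.width = static_cast<uint32_t>(chunkWidth);
}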
@@ -860,12 +865,14 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendPageFaultCopy(N
setupFlagsForBcsSplit(bcsSplitMemoryCopyParams, hasStallingCmds, copyOffloadFlush, srcAddress, dstAddress, size, size);
auto splitCall = [&](CommandListCoreFamilyImmediate<gfxCoreFamily> *subCmdList, void *dstAddressParam, const void *srcAddressParam, size_t sizeParam, ze_event_handle_t hSignalEventParam, uint64_t aggregatedEventIncValue) {
auto splitCall = [&](CommandListCoreFamilyImmediate<gfxCoreFamily> *subCmdList, const BcsSplitParams::CopyParams &copyParams, size_t sizeParam, ze_event_handle_t hSignalEventParam, uint64_t aggregatedEventIncValue) {
bcsSplitMemoryCopyParams.forceAggregatedEventIncValue = aggregatedEventIncValue;
return subCmdList->CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(dstAddressParam, srcAddressParam, sizeParam, hSignalEventParam, 0u, nullptr, bcsSplitMemoryCopyParams);
auto &params = std::get<BcsSplitParams::MemCopy>(copyParams);
return subCmdList->CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(params.dst, params.src, sizeParam, hSignalEventParam, 0u, nullptr, bcsSplitMemoryCopyParams);
};
ret = static_cast<DeviceImp *>(this->device)->bcsSplit->appendSplitCall<gfxCoreFamily, void *, const void *>(this, dstAddress, srcAddress, size, nullptr, 0u, nullptr, false, bcsSplitMemoryCopyParams.relaxedOrderingDispatch, direction, commonImmediateCommandSize, splitCall);
BcsSplitParams::CopyParams copyParams = BcsSplitParams::MemCopy{dstAddress, srcAddress};
ret = static_cast<DeviceImp *>(this->device)->bcsSplit->appendSplitCall<gfxCoreFamily>(this, copyParams, size, nullptr, 0u, nullptr, false, bcsSplitMemoryCopyParams.relaxedOrderingDispatch, direction, commonImmediateCommandSize, splitCall);
} else {
ret = CommandListCoreFamily<gfxCoreFamily>::appendPageFaultCopy(dstAllocation, srcAllocation, size, flushHost);
}

View File

@@ -10,6 +10,7 @@ target_sources(${L0_STATIC_LIB_NAME}
${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}device_imp_helper.cpp
${CMAKE_CURRENT_SOURCE_DIR}/bcs_split.cpp
${CMAKE_CURRENT_SOURCE_DIR}/bcs_split.h
${CMAKE_CURRENT_SOURCE_DIR}/bcs_split.inl
${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
${CMAKE_CURRENT_SOURCE_DIR}/device.h
${CMAKE_CURRENT_SOURCE_DIR}/device_imp_${DRIVER_MODEL}/device_imp_${DRIVER_MODEL}.cpp

View File

@@ -21,14 +21,12 @@ namespace L0 {
bool BcsSplit::setupDevice(NEO::CommandStreamReceiver *csr, bool copyOffloadEnabled) {
auto &productHelper = this->device.getProductHelper();
auto bcsSplitSettings = productHelper.getBcsSplitSettings(this->device.getHwInfo());
this->splitSettings = productHelper.getBcsSplitSettings(this->device.getHwInfo());
if (NEO::debugManager.flags.SplitBcsRequiredTileCount.get() != -1) {
bcsSplitSettings.requiredTileCount = static_cast<uint32_t>(NEO::debugManager.flags.SplitBcsRequiredTileCount.get());
}
NEO::debugManager.flags.SplitBcsRequiredTileCount.assignIfNotDefault(splitSettings.requiredTileCount);
// If expectedTileCount==1, route root device to Tile0, otherwise use all Tiles
bool tileCountMatch = (bcsSplitSettings.requiredTileCount == 1) || (this->device.getNEODevice()->getNumSubDevices() == bcsSplitSettings.requiredTileCount);
bool tileCountMatch = (splitSettings.requiredTileCount == 1) || (this->device.getNEODevice()->getNumSubDevices() == splitSettings.requiredTileCount);
bool engineMatch = (csr->getOsContext().getEngineType() == productHelper.getDefaultCopyEngine());
if (copyOffloadEnabled && NEO::debugManager.flags.SplitBcsForCopyOffload.get() != 0) {
engineMatch = NEO::EngineHelpers::isComputeEngine(csr->getOsContext().getEngineType());
@@ -46,39 +44,35 @@ bool BcsSplit::setupDevice(NEO::CommandStreamReceiver *csr, bool copyOffloadEnab
return true;
}
events.aggregatedEventsMode = device.getL0GfxCoreHelper().bcsSplitAggregatedModeEnabled();
events.aggregatedEventsMode = NEO::debugManager.flags.SplitBcsAggregatedEventsMode.getIfNotDefault(device.getL0GfxCoreHelper().bcsSplitAggregatedModeEnabled());
if (NEO::debugManager.flags.SplitBcsAggregatedEventsMode.get() != -1) {
events.aggregatedEventsMode = !!NEO::debugManager.flags.SplitBcsAggregatedEventsMode.get();
}
setupEnginesMask();
setupEnginesMask(bcsSplitSettings);
return setupQueues(bcsSplitSettings);
return setupQueues();
}
bool BcsSplit::setupQueues(const NEO::BcsSplitSettings &settings) {
bool BcsSplit::setupQueues() {
CsrContainer csrs;
for (uint32_t tileId = 0; tileId < settings.requiredTileCount; tileId++) {
for (uint32_t tileId = 0; tileId < splitSettings.requiredTileCount; tileId++) {
auto subDevice = this->device.getNEODevice()->getNearestGenericSubDevice(tileId);
UNRECOVERABLE_IF(!subDevice);
for (uint32_t engineId = 0; engineId < NEO::bcsInfoMaskSize; engineId++) {
if (settings.allEngines.test(engineId)) {
if (splitSettings.allEngines.test(engineId)) {
if (auto engine = subDevice->tryGetEngine(NEO::EngineHelpers::getBcsEngineAtIdx(engineId), NEO::EngineUsage::regular)) {
csrs.push_back(engine->commandStreamReceiver);
}
}
if (csrs.size() >= settings.minRequiredTotalCsrCount) {
if (csrs.size() >= splitSettings.minRequiredTotalCsrCount) {
break;
}
}
}
if (csrs.size() < settings.minRequiredTotalCsrCount) {
if (csrs.size() < splitSettings.minRequiredTotalCsrCount) {
return false;
}
@@ -98,10 +92,10 @@ bool BcsSplit::setupQueues(const NEO::BcsSplitSettings &settings) {
auto engineType = csr->getOsContext().getEngineType();
auto bcsId = NEO::EngineHelpers::getBcsIndex(engineType);
if (settings.h2dEngines.test(bcsId)) {
if (splitSettings.h2dEngines.test(bcsId)) {
this->h2dCmdLists.push_back(cmdList);
}
if (settings.d2hEngines.test(bcsId)) {
if (splitSettings.d2hEngines.test(bcsId)) {
this->d2hCmdLists.push_back(cmdList);
}
}
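The hunk above routes each created command list into the H2D and/or D2H list according to the settings masks. A simplified sketch of that bucketing; the types stand in for NEO::BcsSplitSettings and the L0 command lists, and the bitset width here is arbitrary:

#include <bitset>
#include <cstdint>
#include <vector>

struct FakeSplitSettings {
    std::bitset<9> allEngines;   // which BCS engines participate in the split
    std::bitset<9> h2dEngines;   // engines used for host-to-device copies
    std::bitset<9> d2hEngines;   // engines used for device-to-host copies
};

struct FakeCmdList { uint32_t bcsId = 0; };

static void bucketCmdList(const FakeSplitSettings &settings, FakeCmdList *cmdList,
                          std::vector<FakeCmdList *> &h2dLists, std::vector<FakeCmdList *> &d2hLists) {
    if (settings.h2dEngines.test(cmdList->bcsId)) {
        h2dLists.push_back(cmdList); // the same list object may serve both directions
    }
    if (settings.d2hEngines.test(cmdList->bcsId)) {
        d2hLists.push_back(cmdList);
    }
}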
@@ -109,20 +103,11 @@ bool BcsSplit::setupQueues(const NEO::BcsSplitSettings &settings) {
return true;
}
void BcsSplit::setupEnginesMask(NEO::BcsSplitSettings &settings) {
if (NEO::debugManager.flags.SplitBcsMask.get() > 0) {
settings.allEngines = NEO::debugManager.flags.SplitBcsMask.get();
}
if (NEO::debugManager.flags.SplitBcsMaskH2D.get() > 0) {
settings.h2dEngines = NEO::debugManager.flags.SplitBcsMaskH2D.get();
}
if (NEO::debugManager.flags.SplitBcsMaskD2H.get() > 0) {
settings.d2hEngines = NEO::debugManager.flags.SplitBcsMaskD2H.get();
}
if (NEO::debugManager.flags.SplitBcsRequiredEnginesCount.get() != -1) {
settings.minRequiredTotalCsrCount = static_cast<uint32_t>(NEO::debugManager.flags.SplitBcsRequiredEnginesCount.get());
}
void BcsSplit::setupEnginesMask() {
NEO::debugManager.flags.SplitBcsMask.assignIfNotDefault(splitSettings.allEngines);
NEO::debugManager.flags.SplitBcsMaskH2D.assignIfNotDefault(splitSettings.h2dEngines);
NEO::debugManager.flags.SplitBcsMaskD2H.assignIfNotDefault(splitSettings.d2hEngines);
NEO::debugManager.flags.SplitBcsRequiredEnginesCount.assignIfNotDefault(splitSettings.minRequiredTotalCsrCount);
}
void BcsSplit::releaseResources() {
@@ -150,7 +135,7 @@ std::vector<CommandList *> &BcsSplit::getCmdListsForSplit(NEO::TransferDirection
return this->cmdLists;
}
size_t BcsSplit::Events::obtainAggregatedEventsForSplit(Context *context) {
size_t BcsSplitEvents::obtainAggregatedEventsForSplit(Context *context) {
for (size_t i = 0; i < this->marker.size(); i++) {
if (this->marker[i]->queryStatus() == ZE_RESULT_SUCCESS) {
resetAggregatedEventState(i, false);
@@ -161,7 +146,7 @@ size_t BcsSplit::Events::obtainAggregatedEventsForSplit(Context *context) {
return this->createAggregatedEvent(context);
}
std::optional<size_t> BcsSplit::Events::obtainForSplit(Context *context, size_t maxEventCountInPool) {
std::optional<size_t> BcsSplitEvents::obtainForSplit(Context *context, size_t maxEventCountInPool) {
std::lock_guard<std::mutex> lock(this->mtx);
if (this->aggregatedEventsMode) {
@@ -186,7 +171,7 @@ std::optional<size_t> BcsSplit::Events::obtainForSplit(Context *context, size_t
return 0;
}
uint64_t *BcsSplit::Events::getNextAllocationForAggregatedEvent() {
uint64_t *BcsSplitEvents::getNextAllocationForAggregatedEvent() {
constexpr size_t allocationSize = MemoryConstants::pageSize64k;
if (!this->allocsForAggregatedEvents.empty() && (currentAggregatedAllocOffset + MemoryConstants::cacheLineSize) < allocationSize) {
@@ -195,9 +180,9 @@ uint64_t *BcsSplit::Events::getNextAllocationForAggregatedEvent() {
ze_device_mem_alloc_desc_t desc = {ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC};
void *ptr = nullptr;
auto context = Context::fromHandle(bcsSplit.device.getDriverHandle()->getDefaultContext());
auto context = Context::fromHandle(bcsSplit.getDevice().getDriverHandle()->getDefaultContext());
context->allocDeviceMem(bcsSplit.device.toHandle(), &desc, allocationSize, MemoryConstants::pageSize64k, &ptr);
context->allocDeviceMem(bcsSplit.getDevice().toHandle(), &desc, allocationSize, MemoryConstants::pageSize64k, &ptr);
UNRECOVERABLE_IF(!ptr);
currentAggregatedAllocOffset = 0;
@@ -209,7 +194,7 @@ uint64_t *BcsSplit::Events::getNextAllocationForAggregatedEvent() {
return ptrOffset(basePtr, currentAggregatedAllocOffset);
}
size_t BcsSplit::Events::createAggregatedEvent(Context *context) {
size_t BcsSplitEvents::createAggregatedEvent(Context *context) {
constexpr int preallocationCount = 8;
size_t returnIndex = this->subcopy.size();
@@ -230,13 +215,13 @@ size_t BcsSplit::Events::createAggregatedEvent(Context *context) {
externalStorageAllocProperties.deviceAddress = getNextAllocationForAggregatedEvent();
ze_event_handle_t handle = nullptr;
zexCounterBasedEventCreate2(context, bcsSplit.device.toHandle(), &counterBasedDesc, &handle);
zexCounterBasedEventCreate2(context, bcsSplit.getDevice().toHandle(), &counterBasedDesc, &handle);
UNRECOVERABLE_IF(handle == nullptr);
this->subcopy.push_back(Event::fromHandle(handle));
ze_event_handle_t markerHandle = nullptr;
zexCounterBasedEventCreate2(context, bcsSplit.device.toHandle(), &markerCounterBasedDesc, &markerHandle);
zexCounterBasedEventCreate2(context, bcsSplit.getDevice().toHandle(), &markerCounterBasedDesc, &markerHandle);
UNRECOVERABLE_IF(markerHandle == nullptr);
this->marker.push_back(Event::fromHandle(markerHandle));
@@ -247,14 +232,14 @@ size_t BcsSplit::Events::createAggregatedEvent(Context *context) {
return returnIndex;
}
bool BcsSplit::Events::allocatePool(Context *context, size_t maxEventCountInPool, size_t neededEvents) {
bool BcsSplitEvents::allocatePool(Context *context, size_t maxEventCountInPool, size_t neededEvents) {
if (this->pools.empty() ||
this->createdFromLatestPool + neededEvents > maxEventCountInPool) {
ze_result_t result;
ze_event_pool_desc_t desc = {ZE_STRUCTURE_TYPE_EVENT_POOL_DESC};
desc.count = static_cast<uint32_t>(maxEventCountInPool);
auto hDevice = this->bcsSplit.device.toHandle();
auto pool = EventPool::create(this->bcsSplit.device.getDriverHandle(), context, 1, &hDevice, &desc, result);
auto hDevice = this->bcsSplit.getDevice().toHandle();
auto pool = EventPool::create(this->bcsSplit.getDevice().getDriverHandle(), context, 1, &hDevice, &desc, result);
if (!pool) {
return false;
}
@@ -265,7 +250,7 @@ bool BcsSplit::Events::allocatePool(Context *context, size_t maxEventCountInPool
return true;
}
std::optional<size_t> BcsSplit::Events::createFromPool(Context *context, size_t maxEventCountInPool) {
std::optional<size_t> BcsSplitEvents::createFromPool(Context *context, size_t maxEventCountInPool) {
/* Internal events needed for split:
* - event per subcopy to signal completion of given subcopy (vector of subcopy events),
* - 1 event to signal completion of entire split (vector of marker events),
@@ -311,7 +296,7 @@ std::optional<size_t> BcsSplit::Events::createFromPool(Context *context, size_t
return this->marker.size() - 1;
}
void BcsSplit::Events::resetEventPackage(size_t index) {
void BcsSplitEvents::resetEventPackage(size_t index) {
this->marker[index]->reset();
this->barrier[index]->reset();
for (size_t j = 0; j < this->bcsSplit.cmdLists.size(); j++) {
@@ -319,7 +304,7 @@ void BcsSplit::Events::resetEventPackage(size_t index) {
}
}
void BcsSplit::Events::resetAggregatedEventState(size_t index, bool markerCompleted) {
void BcsSplitEvents::resetAggregatedEventState(size_t index, bool markerCompleted) {
*this->subcopy[index]->getInOrderExecInfo()->getBaseHostAddress() = 0;
auto markerEvent = this->marker[index];
@@ -328,7 +313,7 @@ void BcsSplit::Events::resetAggregatedEventState(size_t index, bool markerComple
markerEvent->setReportEmptyCbEventAsReady(markerCompleted);
}
void BcsSplit::Events::releaseResources() {
void BcsSplitEvents::releaseResources() {
for (auto &markerEvent : this->marker) {
markerEvent->destroy();
}
@@ -346,7 +331,7 @@ void BcsSplit::Events::releaseResources() {
}
pools.clear();
auto context = Context::fromHandle(bcsSplit.device.getDriverHandle()->getDefaultContext());
auto context = Context::fromHandle(bcsSplit.getDevice().getDriverHandle()->getDefaultContext());
for (auto &ptr : this->allocsForAggregatedEvents) {
context->freeMem(ptr);
}
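Taken together with the header below, the structural change in this file is that the nested Events struct becomes a standalone BcsSplitEvents holding a back-reference, while BcsSplit moves its data behind a protected section and exposes getDevice(). A compressed sketch of that shape, with stub bodies and illustrative names:

struct FakeDevice {};

class FakeBcsSplit; // forward declaration, as device.h now does for BcsSplit

struct FakeBcsSplitEvents {
    FakeBcsSplit &bcsSplit; // back-reference instead of nesting
    explicit FakeBcsSplitEvents(FakeBcsSplit &owner) : bcsSplit(owner) {}
};

class FakeBcsSplit {
  public:
    FakeBcsSplitEvents events;
    explicit FakeBcsSplit(FakeDevice &device) : events(*this), device(device) {}
    FakeDevice &getDevice() const { return device; } // external code no longer touches .device

  protected:
    FakeDevice &device; // previously a public member of the old struct
};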

View File

@@ -16,6 +16,7 @@
#include <functional>
#include <mutex>
#include <variant>
#include <vector>
namespace NEO {
@@ -26,18 +27,24 @@ enum class TransferDirection;
namespace L0 {
struct CommandQueue;
struct DeviceImp;
class BcsSplit;
struct BcsSplit {
template <GFXCORE_FAMILY gfxCoreFamily, typename T, typename K>
using AppendCallFuncT = std::function<ze_result_t(CommandListCoreFamilyImmediate<gfxCoreFamily> *, T, K, size_t, ze_event_handle_t, uint64_t)>;
using CsrContainer = StackVec<NEO::CommandStreamReceiver *, 12u>;
namespace BcsSplitParams {
struct MemCopy {
void *dst = nullptr;
const void *src = nullptr;
};
DeviceImp &device;
uint32_t clientCount = 0u;
struct RegionCopy {
// originXParams
uint32_t dst = 0;
uint32_t src = 0;
};
std::mutex mtx;
using CopyParams = std::variant<MemCopy, RegionCopy>;
} // namespace BcsSplitParams
struct Events {
struct BcsSplitEvents {
BcsSplit &bcsSplit;
std::mutex mtx;
@@ -60,17 +67,28 @@ struct BcsSplit {
size_t createAggregatedEvent(Context *context);
uint64_t *getNextAllocationForAggregatedEvent();
Events(BcsSplit &bcsSplit) : bcsSplit(bcsSplit) {}
} events;
BcsSplitEvents(BcsSplit &bcsSplit) : bcsSplit(bcsSplit) {}
};
class BcsSplit {
public:
template <GFXCORE_FAMILY gfxCoreFamily>
using AppendCallFuncT = std::function<ze_result_t(CommandListCoreFamilyImmediate<gfxCoreFamily> *, const BcsSplitParams::CopyParams &, size_t, ze_event_handle_t, uint64_t)>;
template <typename GfxFamily>
static constexpr size_t maxEventCountInPool = MemoryConstants::pageSize64k / sizeof(typename GfxFamily::TimestampPacketType);
using CsrContainer = StackVec<NEO::CommandStreamReceiver *, 12u>;
BcsSplitEvents events;
std::vector<CommandList *> cmdLists;
std::vector<CommandList *> h2dCmdLists;
std::vector<CommandList *> d2hCmdLists;
template <GFXCORE_FAMILY gfxCoreFamily, typename T, typename K>
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t appendSplitCall(CommandListCoreFamilyImmediate<gfxCoreFamily> *cmdList,
T dstptr,
K srcptr,
const BcsSplitParams::CopyParams &copyParams,
size_t size,
ze_event_handle_t hSignalEvent,
uint32_t numWaitEvents,
@@ -79,121 +97,26 @@ struct BcsSplit {
bool hasRelaxedOrderingDependencies,
NEO::TransferDirection direction,
size_t estimatedCmdBufferSize,
AppendCallFuncT<gfxCoreFamily, T, K> appendCall) {
constexpr size_t maxEventCountInPool = MemoryConstants::pageSize64k / sizeof(typename CommandListCoreFamilyImmediate<gfxCoreFamily>::GfxFamily::TimestampPacketType);
const auto aggregatedEventsMode = this->events.aggregatedEventsMode;
auto signalEvent = Event::fromHandle(hSignalEvent);
ze_result_t result = ZE_RESULT_SUCCESS;
auto &cmdListsForSplit = this->getCmdListsForSplit(direction);
auto engineCount = cmdListsForSplit.size();
size_t markerEventIndex = 0;
uint64_t aggregatedEventIncrementVal = 1;
const bool useSignalEventForSubcopy = aggregatedEventsMode && cmdList->isUsingAdditionalBlitProperties() && Event::isAggregatedEvent(signalEvent) &&
(signalEvent->getInOrderIncrementValue(1) % engineCount == 0);
if (useSignalEventForSubcopy) {
aggregatedEventIncrementVal = signalEvent->getInOrderIncrementValue(1) / engineCount;
} else {
auto markerEventIndexRet = this->events.obtainForSplit(Context::fromHandle(cmdList->getCmdListContext()), maxEventCountInPool);
if (!markerEventIndexRet.has_value()) {
return ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY;
}
markerEventIndex = *markerEventIndexRet;
}
auto barrierRequired = !cmdList->isInOrderExecutionEnabled() && cmdList->isBarrierRequired();
if (barrierRequired) {
cmdList->appendSignalEvent(this->events.barrier[markerEventIndex]->toHandle(), false);
}
auto subcopyEventIndex = markerEventIndex * this->cmdLists.size();
StackVec<ze_event_handle_t, 16> eventHandles;
if (!cmdList->handleCounterBasedEventOperations(signalEvent, false)) {
return ZE_RESULT_ERROR_INVALID_ARGUMENT;
}
auto totalSize = size;
for (size_t i = 0; i < cmdListsForSplit.size(); i++) {
auto subCmdList = static_cast<CommandListCoreFamilyImmediate<gfxCoreFamily> *>(cmdListsForSplit[i]);
auto lock = subCmdList->getCsr(false)->obtainUniqueOwnership();
subCmdList->checkAvailableSpace(numWaitEvents, hasRelaxedOrderingDependencies, estimatedCmdBufferSize, false);
if (barrierRequired) {
auto barrierEventHandle = this->events.barrier[markerEventIndex]->toHandle();
subCmdList->addEventsToCmdList(1u, &barrierEventHandle, nullptr, hasRelaxedOrderingDependencies, false, true, false, false);
}
if (cmdList->hasInOrderDependencies()) {
auto &inOrderExecInfo = cmdList->getInOrderExecInfo();
subCmdList->appendWaitOnInOrderDependency(inOrderExecInfo, nullptr, inOrderExecInfo->getCounterValue(), inOrderExecInfo->getAllocationOffset(), hasRelaxedOrderingDependencies, false, false, false, false);
}
subCmdList->addEventsToCmdList(numWaitEvents, phWaitEvents, nullptr, hasRelaxedOrderingDependencies, false, false, false, false);
if (!useSignalEventForSubcopy && signalEvent && i == 0u) {
subCmdList->appendEventForProfilingAllWalkers(signalEvent, nullptr, nullptr, true, true, false, true);
}
auto localSize = totalSize / engineCount;
auto localDstPtr = ptrOffset(dstptr, size - totalSize);
auto localSrcPtr = ptrOffset(srcptr, size - totalSize);
auto copyEventIndex = aggregatedEventsMode ? markerEventIndex : subcopyEventIndex + i;
auto eventHandle = useSignalEventForSubcopy ? signalEvent : this->events.subcopy[copyEventIndex]->toHandle();
result = appendCall(subCmdList, localDstPtr, localSrcPtr, localSize, eventHandle, aggregatedEventIncrementVal);
subCmdList->flushImmediate(result, true, !hasRelaxedOrderingDependencies, hasRelaxedOrderingDependencies, NEO::AppendOperations::nonKernel, false, nullptr, true, &lock, nullptr);
if ((aggregatedEventsMode && i == 0) || !aggregatedEventsMode) {
eventHandles.push_back(eventHandle);
}
totalSize -= localSize;
engineCount--;
if (signalEvent) {
signalEvent->appendAdditionalCsr(subCmdList->getCsr(false));
}
}
const bool dualStreamCopyOffload = cmdList->isDualStreamCopyOffloadOperation(cmdList->isCopyOffloadEnabled());
cmdList->addEventsToCmdList(static_cast<uint32_t>(eventHandles.size()), eventHandles.data(), nullptr, hasRelaxedOrderingDependencies, false, true, false, dualStreamCopyOffload);
const auto isCopyCmdList = cmdList->isCopyOnly(dualStreamCopyOffload);
if (!useSignalEventForSubcopy && signalEvent) {
cmdList->appendSignalEventPostWalker(signalEvent, nullptr, nullptr, !isCopyCmdList, false, isCopyCmdList);
}
if (!aggregatedEventsMode) {
cmdList->appendSignalEventPostWalker(this->events.marker[markerEventIndex], nullptr, nullptr, !isCopyCmdList, false, isCopyCmdList);
}
if (cmdList->isInOrderExecutionEnabled()) {
cmdList->appendSignalInOrderDependencyCounter(signalEvent, dualStreamCopyOffload, false, false, useSignalEventForSubcopy);
}
cmdList->handleInOrderDependencyCounter(signalEvent, false, dualStreamCopyOffload);
if (aggregatedEventsMode && !useSignalEventForSubcopy) {
std::lock_guard<std::mutex> lock(events.mtx);
cmdList->assignInOrderExecInfoToEvent(this->events.marker[markerEventIndex]);
}
return result;
}
AppendCallFuncT<gfxCoreFamily> appendCall);
bool setupDevice(NEO::CommandStreamReceiver *csr, bool copyOffloadEnabled);
void releaseResources();
std::vector<CommandList *> &getCmdListsForSplit(NEO::TransferDirection direction);
void setupEnginesMask(NEO::BcsSplitSettings &settings);
bool setupQueues(const NEO::BcsSplitSettings &settings);
DeviceImp &getDevice() const { return device; }
BcsSplit(DeviceImp &device) : device(device), events(*this){};
BcsSplit(DeviceImp &device) : events(*this), device(device){};
protected:
std::vector<CommandList *> &getCmdListsForSplit(NEO::TransferDirection direction);
void setupEnginesMask();
bool setupQueues();
DeviceImp &device;
NEO::BcsSplitSettings splitSettings = {};
uint32_t clientCount = 0u;
std::mutex mtx;
};
} // namespace L0
#include "level_zero/core/source/device/bcs_split.inl"
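Moving appendSplitCall out of the class leaves only its declaration in the header; the template definition lives in bcs_split.inl, which the header includes at the end so every translation unit still sees it. The same pattern in miniature, with illustrative file and type names:

// ---- widget.h (illustrative) ----
#pragma once

struct Widget {
    template <typename T>
    T scaled(T factor) const; // declared here; defined in widget.inl
    int value = 2;
};

// The header ends with:  #include "widget.inl"
// so the template definition below stays visible wherever widget.h is included.

// ---- widget.inl (illustrative) ----
template <typename T>
T Widget::scaled(T factor) const {
    return static_cast<T>(value) * factor;
}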

View File

@@ -0,0 +1,137 @@
/*
* Copyright (C) 2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
namespace L0 {
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t BcsSplit::appendSplitCall(CommandListCoreFamilyImmediate<gfxCoreFamily> *cmdList,
const BcsSplitParams::CopyParams &copyParams,
size_t size,
ze_event_handle_t hSignalEvent,
uint32_t numWaitEvents,
ze_event_handle_t *phWaitEvents,
bool performMigration,
bool hasRelaxedOrderingDependencies,
NEO::TransferDirection direction,
size_t estimatedCmdBufferSize,
AppendCallFuncT<gfxCoreFamily> appendCall) {
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
const auto aggregatedEventsMode = this->events.aggregatedEventsMode;
auto signalEvent = Event::fromHandle(hSignalEvent);
ze_result_t result = ZE_RESULT_SUCCESS;
auto &cmdListsForSplit = this->getCmdListsForSplit(direction);
auto engineCount = cmdListsForSplit.size();
size_t markerEventIndex = 0;
uint64_t aggregatedEventIncrementVal = 1;
const bool useSignalEventForSubcopy = aggregatedEventsMode && cmdList->isUsingAdditionalBlitProperties() && Event::isAggregatedEvent(signalEvent) &&
(signalEvent->getInOrderIncrementValue(1) % engineCount == 0);
if (useSignalEventForSubcopy) {
aggregatedEventIncrementVal = signalEvent->getInOrderIncrementValue(1) / engineCount;
} else {
auto markerEventIndexRet = this->events.obtainForSplit(Context::fromHandle(cmdList->getCmdListContext()), maxEventCountInPool<GfxFamily>);
if (!markerEventIndexRet.has_value()) {
return ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY;
}
markerEventIndex = *markerEventIndexRet;
}
auto barrierRequired = !cmdList->isInOrderExecutionEnabled() && cmdList->isBarrierRequired();
if (barrierRequired) {
cmdList->appendSignalEvent(this->events.barrier[markerEventIndex]->toHandle(), false);
}
auto subcopyEventIndex = markerEventIndex * this->cmdLists.size();
StackVec<ze_event_handle_t, 16> eventHandles;
if (!cmdList->handleCounterBasedEventOperations(signalEvent, false)) {
return ZE_RESULT_ERROR_INVALID_ARGUMENT;
}
auto totalSize = size;
for (size_t i = 0; i < cmdListsForSplit.size(); i++) {
auto subCmdList = static_cast<CommandListCoreFamilyImmediate<gfxCoreFamily> *>(cmdListsForSplit[i]);
auto lock = subCmdList->getCsr(false)->obtainUniqueOwnership();
subCmdList->checkAvailableSpace(numWaitEvents, hasRelaxedOrderingDependencies, estimatedCmdBufferSize, false);
if (barrierRequired) {
auto barrierEventHandle = this->events.barrier[markerEventIndex]->toHandle();
subCmdList->addEventsToCmdList(1u, &barrierEventHandle, nullptr, hasRelaxedOrderingDependencies, false, true, false, false);
}
if (cmdList->hasInOrderDependencies()) {
auto &inOrderExecInfo = cmdList->getInOrderExecInfo();
subCmdList->appendWaitOnInOrderDependency(inOrderExecInfo, nullptr, inOrderExecInfo->getCounterValue(), inOrderExecInfo->getAllocationOffset(), hasRelaxedOrderingDependencies, false, false, false, false);
}
subCmdList->addEventsToCmdList(numWaitEvents, phWaitEvents, nullptr, hasRelaxedOrderingDependencies, false, false, false, false);
if (!useSignalEventForSubcopy && signalEvent && i == 0u) {
subCmdList->appendEventForProfilingAllWalkers(signalEvent, nullptr, nullptr, true, true, false, true);
}
auto localSize = totalSize / engineCount;
BcsSplitParams::CopyParams localCopyParams;
std::visit([&](auto &&arg) {
using T = std::decay_t<decltype(arg)>;
localCopyParams = T{ptrOffset(arg.dst, size - totalSize),
ptrOffset(arg.src, size - totalSize)};
},
copyParams);
auto copyEventIndex = aggregatedEventsMode ? markerEventIndex : subcopyEventIndex + i;
auto eventHandle = useSignalEventForSubcopy ? signalEvent : this->events.subcopy[copyEventIndex]->toHandle();
result = appendCall(subCmdList, localCopyParams, localSize, eventHandle, aggregatedEventIncrementVal);
subCmdList->flushImmediate(result, true, !hasRelaxedOrderingDependencies, hasRelaxedOrderingDependencies, NEO::AppendOperations::nonKernel, false, nullptr, true, &lock, nullptr);
if ((aggregatedEventsMode && i == 0) || !aggregatedEventsMode) {
eventHandles.push_back(eventHandle);
}
totalSize -= localSize;
engineCount--;
if (signalEvent) {
signalEvent->appendAdditionalCsr(subCmdList->getCsr(false));
}
}
const bool dualStreamCopyOffload = cmdList->isDualStreamCopyOffloadOperation(cmdList->isCopyOffloadEnabled());
cmdList->addEventsToCmdList(static_cast<uint32_t>(eventHandles.size()), eventHandles.data(), nullptr, hasRelaxedOrderingDependencies, false, true, false, dualStreamCopyOffload);
const auto isCopyCmdList = cmdList->isCopyOnly(dualStreamCopyOffload);
if (!useSignalEventForSubcopy && signalEvent) {
cmdList->appendSignalEventPostWalker(signalEvent, nullptr, nullptr, !isCopyCmdList, false, isCopyCmdList);
}
if (!aggregatedEventsMode) {
cmdList->appendSignalEventPostWalker(this->events.marker[markerEventIndex], nullptr, nullptr, !isCopyCmdList, false, isCopyCmdList);
}
if (cmdList->isInOrderExecutionEnabled()) {
cmdList->appendSignalInOrderDependencyCounter(signalEvent, dualStreamCopyOffload, false, false, useSignalEventForSubcopy);
}
cmdList->handleInOrderDependencyCounter(signalEvent, false, dualStreamCopyOffload);
if (aggregatedEventsMode && !useSignalEventForSubcopy) {
std::lock_guard<std::mutex> lock(events.mtx);
cmdList->assignInOrderExecInfoToEvent(this->events.marker[markerEventIndex]);
}
return result;
}
} // namespace L0
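The interesting step in the new .inl is the std::visit above: one generic lambda rebuilds whichever variant alternative is active, shifting both members by the amount already copied, which works because the offset helper applies to pointers and to integer origins alike. A standalone sketch of that idea; advanceBy below merely plays the role ptrOffset plays in the commit:

#include <cstddef>
#include <cstdint>
#include <type_traits>
#include <variant>

struct MemCopy { void *dst = nullptr; const void *src = nullptr; };
struct RegionCopy { uint32_t dst = 0; uint32_t src = 0; }; // originX values
using CopyParams = std::variant<MemCopy, RegionCopy>;

static void *advanceBy(void *ptr, std::size_t off) { return static_cast<char *>(ptr) + off; }
static const void *advanceBy(const void *ptr, std::size_t off) { return static_cast<const char *>(ptr) + off; }
static uint32_t advanceBy(uint32_t origin, std::size_t off) { return origin + static_cast<uint32_t>(off); }

// Rebuild the active alternative with both members shifted by the bytes (or X origin)
// already handled by previous engines.
static CopyParams advanceCopyParams(const CopyParams &copyParams, std::size_t alreadyCopied) {
    CopyParams local;
    std::visit([&](auto &&arg) {
        using T = std::decay_t<decltype(arg)>;
        local = T{advanceBy(arg.dst, alreadyCopied), advanceBy(arg.src, alreadyCopied)};
    },
               copyParams);
    return local;
}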

View File

@@ -33,7 +33,7 @@ struct HardwareInfo;
} // namespace NEO
namespace L0 {
struct BcsSplit;
class BcsSplit;
struct Image;
struct SysmanDevice;
struct FabricVertex;

View File

@@ -1691,7 +1691,7 @@ HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhe
NEO::debugManager.flags.OverrideEventSynchronizeTimeout.set(0);
NEO::debugManager.flags.EnableTimestampPoolAllocator.set(0);
auto memoryManager = reinterpret_cast<MockMemoryManager *>(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit->device.getDriverHandle()->getMemoryManager());
auto memoryManager = reinterpret_cast<MockMemoryManager *>(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit->getDevice().getDriverHandle()->getMemoryManager());
memoryManager->isMockHostMemoryManager = true;
memoryManager->forceFailureInPrimaryAllocation = true;
@@ -1741,7 +1741,7 @@ HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhe
NEO::debugManager.flags.OverrideEventSynchronizeTimeout.set(0);
NEO::debugManager.flags.EnableTimestampPoolAllocator.set(0);
auto memoryManager = reinterpret_cast<MockMemoryManager *>(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit->device.getDriverHandle()->getMemoryManager());
auto memoryManager = reinterpret_cast<MockMemoryManager *>(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit->getDevice().getDriverHandle()->getMemoryManager());
memoryManager->isMockHostMemoryManager = true;
memoryManager->forceFailureInPrimaryAllocation = true;

View File

@@ -102,6 +102,7 @@ struct DebugVarBase {
this->set(data);
}
}
void setPrefixType(DebugVarPrefix data) {
prefixType = std::move(data);
}
@@ -117,6 +118,13 @@ struct DebugVarBase {
return (value != defaultValue) ? static_cast<UserType>(value) : userValue;
}
template <typename UserType>
void assignIfNotDefault(UserType &userDataForAssignment) const {
if (value != defaultValue) {
userDataForAssignment = static_cast<UserType>(value);
}
}
private:
T value;
T defaultValue;
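The two helpers this refactor leans on behave as sketched below: getIfNotDefault returns the override or the supplied fallback, and assignIfNotDefault overwrites the target only when the debug variable was explicitly set, which is what lets the explicit `!= -1` checks in bcs_split.cpp disappear. FakeDebugVar is a stripped-down stand-in for DebugVarBase:

#include <cstdint>

template <typename T>
struct FakeDebugVar {
    T value;
    T defaultValue;

    template <typename UserType>
    UserType getIfNotDefault(UserType fallback) const {
        return (value != defaultValue) ? static_cast<UserType>(value) : fallback;
    }

    template <typename UserType>
    void assignIfNotDefault(UserType &target) const {
        if (value != defaultValue) {
            target = static_cast<UserType>(value); // only overwrite on an explicit override
        }
    }
};

int main() {
    FakeDebugVar<int32_t> requiredTileCount{-1, -1}; // left at its default
    uint32_t tiles = 2;
    requiredTileCount.assignIfNotDefault(tiles);      // tiles stays 2
    requiredTileCount.value = 4;                      // simulate an explicit override
    requiredTileCount.assignIfNotDefault(tiles);      // tiles becomes 4
    bool aggregated = FakeDebugVar<int32_t>{1, -1}.getIfNotDefault(false); // override wins
    return (tiles == 4 && aggregated) ? 0 : 1;
}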