mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-08 14:02:58 +08:00
refactor: bcs split class structure
Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
2aabe27531
commit
0597f064e5
@@ -718,12 +718,14 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy(
|
||||
if (isSplitNeeded) {
|
||||
setupFlagsForBcsSplit(memoryCopyParams, hasStallingCmds, copyOffloadFlush, srcptr, dstptr, size, size);
|
||||
|
||||
auto splitCall = [&](CommandListCoreFamilyImmediate<gfxCoreFamily> *subCmdList, void *dstptrParam, const void *srcptrParam, size_t sizeParam, ze_event_handle_t hSignalEventParam, uint64_t aggregatedEventIncValue) {
|
||||
auto splitCall = [&](CommandListCoreFamilyImmediate<gfxCoreFamily> *subCmdList, const BcsSplitParams::CopyParams ©Params, size_t sizeParam, ze_event_handle_t hSignalEventParam, uint64_t aggregatedEventIncValue) {
|
||||
memoryCopyParams.forceAggregatedEventIncValue = aggregatedEventIncValue;
|
||||
return subCmdList->CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(dstptrParam, srcptrParam, sizeParam, hSignalEventParam, 0u, nullptr, memoryCopyParams);
|
||||
auto ¶ms = std::get<BcsSplitParams::MemCopy>(copyParams);
|
||||
return subCmdList->CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(params.dst, params.src, sizeParam, hSignalEventParam, 0u, nullptr, memoryCopyParams);
|
||||
};
|
||||
|
||||
ret = static_cast<DeviceImp *>(this->device)->bcsSplit->appendSplitCall<gfxCoreFamily, void *, const void *>(this, dstptr, srcptr, size, hSignalEvent, numWaitEvents, phWaitEvents, true, memoryCopyParams.relaxedOrderingDispatch, direction, estimatedSize, splitCall);
|
||||
BcsSplitParams::CopyParams copyParams = BcsSplitParams::MemCopy{dstptr, srcptr};
|
||||
ret = static_cast<DeviceImp *>(this->device)->bcsSplit->appendSplitCall<gfxCoreFamily>(this, copyParams, size, hSignalEvent, numWaitEvents, phWaitEvents, true, memoryCopyParams.relaxedOrderingDispatch, direction, estimatedSize, splitCall);
|
||||
|
||||
} else if (this->isValidForStagingTransfer(dstptr, srcptr, size, numWaitEvents > 0)) {
|
||||
return this->appendStagingMemoryCopy(dstptr, srcptr, size, hSignalEvent, memoryCopyParams);
|
||||
@@ -774,14 +776,16 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopyRegio
|
||||
this->getTotalSizeForCopyRegion(srcRegion, srcPitch, srcSlicePitch),
|
||||
this->getTotalSizeForCopyRegion(dstRegion, dstPitch, dstSlicePitch));
|
||||
|
||||
auto splitCall = [&](CommandListCoreFamilyImmediate<gfxCoreFamily> *subCmdList, uint32_t dstOriginXParam, uint32_t srcOriginXParam, size_t sizeParam, ze_event_handle_t hSignalEventParam, uint64_t aggregatedEventIncValue) {
|
||||
auto splitCall = [&](CommandListCoreFamilyImmediate<gfxCoreFamily> *subCmdList, const BcsSplitParams::CopyParams ©Params, size_t sizeParam, ze_event_handle_t hSignalEventParam, uint64_t aggregatedEventIncValue) {
|
||||
auto ¶ms = std::get<BcsSplitParams::RegionCopy>(copyParams);
|
||||
|
||||
ze_copy_region_t dstRegionLocal = {};
|
||||
ze_copy_region_t srcRegionLocal = {};
|
||||
memcpy(&dstRegionLocal, dstRegion, sizeof(ze_copy_region_t));
|
||||
memcpy(&srcRegionLocal, srcRegion, sizeof(ze_copy_region_t));
|
||||
dstRegionLocal.originX = dstOriginXParam;
|
||||
dstRegionLocal.originX = params.dst;
|
||||
dstRegionLocal.width = static_cast<uint32_t>(sizeParam);
|
||||
srcRegionLocal.originX = srcOriginXParam;
|
||||
srcRegionLocal.originX = params.src;
|
||||
srcRegionLocal.width = static_cast<uint32_t>(sizeParam);
|
||||
memoryCopyParams.forceAggregatedEventIncValue = aggregatedEventIncValue;
|
||||
return subCmdList->CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyRegion(dstPtr, &dstRegionLocal, dstPitch, dstSlicePitch,
|
||||
@@ -789,7 +793,8 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopyRegio
|
||||
hSignalEventParam, 0u, nullptr, memoryCopyParams);
|
||||
};
|
||||
|
||||
ret = static_cast<DeviceImp *>(this->device)->bcsSplit->appendSplitCall<gfxCoreFamily, uint32_t, uint32_t>(this, dstRegion->originX, srcRegion->originX, dstRegion->width, hSignalEvent, numWaitEvents, phWaitEvents, true, memoryCopyParams.relaxedOrderingDispatch, direction, estimatedSize, splitCall);
|
||||
BcsSplitParams::CopyParams copyParams = BcsSplitParams::RegionCopy{dstRegion->originX, srcRegion->originX};
|
||||
ret = static_cast<DeviceImp *>(this->device)->bcsSplit->appendSplitCall<gfxCoreFamily>(this, copyParams, dstRegion->width, hSignalEvent, numWaitEvents, phWaitEvents, true, memoryCopyParams.relaxedOrderingDispatch, direction, estimatedSize, splitCall);
|
||||
} else {
|
||||
ret = CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyRegion(dstPtr, dstRegion, dstPitch, dstSlicePitch,
|
||||
srcPtr, srcRegion, srcPitch, srcSlicePitch,
|
||||
@@ -860,12 +865,14 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendPageFaultCopy(N
|
||||
|
||||
setupFlagsForBcsSplit(bcsSplitMemoryCopyParams, hasStallingCmds, copyOffloadFlush, srcAddress, dstAddress, size, size);
|
||||
|
||||
auto splitCall = [&](CommandListCoreFamilyImmediate<gfxCoreFamily> *subCmdList, void *dstAddressParam, const void *srcAddressParam, size_t sizeParam, ze_event_handle_t hSignalEventParam, uint64_t aggregatedEventIncValue) {
|
||||
auto splitCall = [&](CommandListCoreFamilyImmediate<gfxCoreFamily> *subCmdList, const BcsSplitParams::CopyParams ©Params, size_t sizeParam, ze_event_handle_t hSignalEventParam, uint64_t aggregatedEventIncValue) {
|
||||
bcsSplitMemoryCopyParams.forceAggregatedEventIncValue = aggregatedEventIncValue;
|
||||
return subCmdList->CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(dstAddressParam, srcAddressParam, sizeParam, hSignalEventParam, 0u, nullptr, bcsSplitMemoryCopyParams);
|
||||
auto ¶ms = std::get<BcsSplitParams::MemCopy>(copyParams);
|
||||
return subCmdList->CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(params.dst, params.src, sizeParam, hSignalEventParam, 0u, nullptr, bcsSplitMemoryCopyParams);
|
||||
};
|
||||
|
||||
ret = static_cast<DeviceImp *>(this->device)->bcsSplit->appendSplitCall<gfxCoreFamily, void *, const void *>(this, dstAddress, srcAddress, size, nullptr, 0u, nullptr, false, bcsSplitMemoryCopyParams.relaxedOrderingDispatch, direction, commonImmediateCommandSize, splitCall);
|
||||
BcsSplitParams::CopyParams copyParams = BcsSplitParams::MemCopy{dstAddress, srcAddress};
|
||||
ret = static_cast<DeviceImp *>(this->device)->bcsSplit->appendSplitCall<gfxCoreFamily>(this, copyParams, size, nullptr, 0u, nullptr, false, bcsSplitMemoryCopyParams.relaxedOrderingDispatch, direction, commonImmediateCommandSize, splitCall);
|
||||
} else {
|
||||
ret = CommandListCoreFamily<gfxCoreFamily>::appendPageFaultCopy(dstAllocation, srcAllocation, size, flushHost);
|
||||
}
|
||||
|
||||
@@ -10,6 +10,7 @@ target_sources(${L0_STATIC_LIB_NAME}
|
||||
${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}device_imp_helper.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bcs_split.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bcs_split.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/bcs_split.inl
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/device.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/device_imp_${DRIVER_MODEL}/device_imp_${DRIVER_MODEL}.cpp
|
||||
|
||||
@@ -21,14 +21,12 @@ namespace L0 {
|
||||
|
||||
bool BcsSplit::setupDevice(NEO::CommandStreamReceiver *csr, bool copyOffloadEnabled) {
|
||||
auto &productHelper = this->device.getProductHelper();
|
||||
auto bcsSplitSettings = productHelper.getBcsSplitSettings(this->device.getHwInfo());
|
||||
this->splitSettings = productHelper.getBcsSplitSettings(this->device.getHwInfo());
|
||||
|
||||
if (NEO::debugManager.flags.SplitBcsRequiredTileCount.get() != -1) {
|
||||
bcsSplitSettings.requiredTileCount = static_cast<uint32_t>(NEO::debugManager.flags.SplitBcsRequiredTileCount.get());
|
||||
}
|
||||
NEO::debugManager.flags.SplitBcsRequiredTileCount.assignIfNotDefault(splitSettings.requiredTileCount);
|
||||
|
||||
// If expectedTileCount==1, route root device to Tile0, otherwise use all Tiles
|
||||
bool tileCountMatch = (bcsSplitSettings.requiredTileCount == 1) || (this->device.getNEODevice()->getNumSubDevices() == bcsSplitSettings.requiredTileCount);
|
||||
bool tileCountMatch = (splitSettings.requiredTileCount == 1) || (this->device.getNEODevice()->getNumSubDevices() == splitSettings.requiredTileCount);
|
||||
bool engineMatch = (csr->getOsContext().getEngineType() == productHelper.getDefaultCopyEngine());
|
||||
if (copyOffloadEnabled && NEO::debugManager.flags.SplitBcsForCopyOffload.get() != 0) {
|
||||
engineMatch = NEO::EngineHelpers::isComputeEngine(csr->getOsContext().getEngineType());
|
||||
@@ -46,39 +44,35 @@ bool BcsSplit::setupDevice(NEO::CommandStreamReceiver *csr, bool copyOffloadEnab
|
||||
return true;
|
||||
}
|
||||
|
||||
events.aggregatedEventsMode = device.getL0GfxCoreHelper().bcsSplitAggregatedModeEnabled();
|
||||
events.aggregatedEventsMode = NEO::debugManager.flags.SplitBcsAggregatedEventsMode.getIfNotDefault(device.getL0GfxCoreHelper().bcsSplitAggregatedModeEnabled());
|
||||
|
||||
if (NEO::debugManager.flags.SplitBcsAggregatedEventsMode.get() != -1) {
|
||||
events.aggregatedEventsMode = !!NEO::debugManager.flags.SplitBcsAggregatedEventsMode.get();
|
||||
}
|
||||
setupEnginesMask();
|
||||
|
||||
setupEnginesMask(bcsSplitSettings);
|
||||
|
||||
return setupQueues(bcsSplitSettings);
|
||||
return setupQueues();
|
||||
}
|
||||
|
||||
bool BcsSplit::setupQueues(const NEO::BcsSplitSettings &settings) {
|
||||
bool BcsSplit::setupQueues() {
|
||||
CsrContainer csrs;
|
||||
|
||||
for (uint32_t tileId = 0; tileId < settings.requiredTileCount; tileId++) {
|
||||
for (uint32_t tileId = 0; tileId < splitSettings.requiredTileCount; tileId++) {
|
||||
auto subDevice = this->device.getNEODevice()->getNearestGenericSubDevice(tileId);
|
||||
|
||||
UNRECOVERABLE_IF(!subDevice);
|
||||
|
||||
for (uint32_t engineId = 0; engineId < NEO::bcsInfoMaskSize; engineId++) {
|
||||
if (settings.allEngines.test(engineId)) {
|
||||
if (splitSettings.allEngines.test(engineId)) {
|
||||
if (auto engine = subDevice->tryGetEngine(NEO::EngineHelpers::getBcsEngineAtIdx(engineId), NEO::EngineUsage::regular)) {
|
||||
csrs.push_back(engine->commandStreamReceiver);
|
||||
}
|
||||
}
|
||||
|
||||
if (csrs.size() >= settings.minRequiredTotalCsrCount) {
|
||||
if (csrs.size() >= splitSettings.minRequiredTotalCsrCount) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (csrs.size() < settings.minRequiredTotalCsrCount) {
|
||||
if (csrs.size() < splitSettings.minRequiredTotalCsrCount) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -98,10 +92,10 @@ bool BcsSplit::setupQueues(const NEO::BcsSplitSettings &settings) {
|
||||
auto engineType = csr->getOsContext().getEngineType();
|
||||
auto bcsId = NEO::EngineHelpers::getBcsIndex(engineType);
|
||||
|
||||
if (settings.h2dEngines.test(bcsId)) {
|
||||
if (splitSettings.h2dEngines.test(bcsId)) {
|
||||
this->h2dCmdLists.push_back(cmdList);
|
||||
}
|
||||
if (settings.d2hEngines.test(bcsId)) {
|
||||
if (splitSettings.d2hEngines.test(bcsId)) {
|
||||
this->d2hCmdLists.push_back(cmdList);
|
||||
}
|
||||
}
|
||||
@@ -109,20 +103,11 @@ bool BcsSplit::setupQueues(const NEO::BcsSplitSettings &settings) {
|
||||
return true;
|
||||
}
|
||||
|
||||
void BcsSplit::setupEnginesMask(NEO::BcsSplitSettings &settings) {
|
||||
if (NEO::debugManager.flags.SplitBcsMask.get() > 0) {
|
||||
settings.allEngines = NEO::debugManager.flags.SplitBcsMask.get();
|
||||
}
|
||||
if (NEO::debugManager.flags.SplitBcsMaskH2D.get() > 0) {
|
||||
settings.h2dEngines = NEO::debugManager.flags.SplitBcsMaskH2D.get();
|
||||
}
|
||||
if (NEO::debugManager.flags.SplitBcsMaskD2H.get() > 0) {
|
||||
settings.d2hEngines = NEO::debugManager.flags.SplitBcsMaskD2H.get();
|
||||
}
|
||||
|
||||
if (NEO::debugManager.flags.SplitBcsRequiredEnginesCount.get() != -1) {
|
||||
settings.minRequiredTotalCsrCount = static_cast<uint32_t>(NEO::debugManager.flags.SplitBcsRequiredEnginesCount.get());
|
||||
}
|
||||
void BcsSplit::setupEnginesMask() {
|
||||
NEO::debugManager.flags.SplitBcsMask.assignIfNotDefault(splitSettings.allEngines);
|
||||
NEO::debugManager.flags.SplitBcsMaskH2D.assignIfNotDefault(splitSettings.h2dEngines);
|
||||
NEO::debugManager.flags.SplitBcsMaskD2H.assignIfNotDefault(splitSettings.d2hEngines);
|
||||
NEO::debugManager.flags.SplitBcsRequiredEnginesCount.assignIfNotDefault(splitSettings.minRequiredTotalCsrCount);
|
||||
}
|
||||
|
||||
void BcsSplit::releaseResources() {
|
||||
@@ -150,7 +135,7 @@ std::vector<CommandList *> &BcsSplit::getCmdListsForSplit(NEO::TransferDirection
|
||||
return this->cmdLists;
|
||||
}
|
||||
|
||||
size_t BcsSplit::Events::obtainAggregatedEventsForSplit(Context *context) {
|
||||
size_t BcsSplitEvents::obtainAggregatedEventsForSplit(Context *context) {
|
||||
for (size_t i = 0; i < this->marker.size(); i++) {
|
||||
if (this->marker[i]->queryStatus() == ZE_RESULT_SUCCESS) {
|
||||
resetAggregatedEventState(i, false);
|
||||
@@ -161,7 +146,7 @@ size_t BcsSplit::Events::obtainAggregatedEventsForSplit(Context *context) {
|
||||
return this->createAggregatedEvent(context);
|
||||
}
|
||||
|
||||
std::optional<size_t> BcsSplit::Events::obtainForSplit(Context *context, size_t maxEventCountInPool) {
|
||||
std::optional<size_t> BcsSplitEvents::obtainForSplit(Context *context, size_t maxEventCountInPool) {
|
||||
std::lock_guard<std::mutex> lock(this->mtx);
|
||||
|
||||
if (this->aggregatedEventsMode) {
|
||||
@@ -186,7 +171,7 @@ std::optional<size_t> BcsSplit::Events::obtainForSplit(Context *context, size_t
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint64_t *BcsSplit::Events::getNextAllocationForAggregatedEvent() {
|
||||
uint64_t *BcsSplitEvents::getNextAllocationForAggregatedEvent() {
|
||||
constexpr size_t allocationSize = MemoryConstants::pageSize64k;
|
||||
|
||||
if (!this->allocsForAggregatedEvents.empty() && (currentAggregatedAllocOffset + MemoryConstants::cacheLineSize) < allocationSize) {
|
||||
@@ -195,9 +180,9 @@ uint64_t *BcsSplit::Events::getNextAllocationForAggregatedEvent() {
|
||||
ze_device_mem_alloc_desc_t desc = {ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC};
|
||||
void *ptr = nullptr;
|
||||
|
||||
auto context = Context::fromHandle(bcsSplit.device.getDriverHandle()->getDefaultContext());
|
||||
auto context = Context::fromHandle(bcsSplit.getDevice().getDriverHandle()->getDefaultContext());
|
||||
|
||||
context->allocDeviceMem(bcsSplit.device.toHandle(), &desc, allocationSize, MemoryConstants::pageSize64k, &ptr);
|
||||
context->allocDeviceMem(bcsSplit.getDevice().toHandle(), &desc, allocationSize, MemoryConstants::pageSize64k, &ptr);
|
||||
UNRECOVERABLE_IF(!ptr);
|
||||
currentAggregatedAllocOffset = 0;
|
||||
|
||||
@@ -209,7 +194,7 @@ uint64_t *BcsSplit::Events::getNextAllocationForAggregatedEvent() {
|
||||
return ptrOffset(basePtr, currentAggregatedAllocOffset);
|
||||
}
|
||||
|
||||
size_t BcsSplit::Events::createAggregatedEvent(Context *context) {
|
||||
size_t BcsSplitEvents::createAggregatedEvent(Context *context) {
|
||||
constexpr int preallocationCount = 8;
|
||||
size_t returnIndex = this->subcopy.size();
|
||||
|
||||
@@ -230,13 +215,13 @@ size_t BcsSplit::Events::createAggregatedEvent(Context *context) {
|
||||
externalStorageAllocProperties.deviceAddress = getNextAllocationForAggregatedEvent();
|
||||
|
||||
ze_event_handle_t handle = nullptr;
|
||||
zexCounterBasedEventCreate2(context, bcsSplit.device.toHandle(), &counterBasedDesc, &handle);
|
||||
zexCounterBasedEventCreate2(context, bcsSplit.getDevice().toHandle(), &counterBasedDesc, &handle);
|
||||
UNRECOVERABLE_IF(handle == nullptr);
|
||||
|
||||
this->subcopy.push_back(Event::fromHandle(handle));
|
||||
|
||||
ze_event_handle_t markerHandle = nullptr;
|
||||
zexCounterBasedEventCreate2(context, bcsSplit.device.toHandle(), &markerCounterBasedDesc, &markerHandle);
|
||||
zexCounterBasedEventCreate2(context, bcsSplit.getDevice().toHandle(), &markerCounterBasedDesc, &markerHandle);
|
||||
UNRECOVERABLE_IF(markerHandle == nullptr);
|
||||
|
||||
this->marker.push_back(Event::fromHandle(markerHandle));
|
||||
@@ -247,14 +232,14 @@ size_t BcsSplit::Events::createAggregatedEvent(Context *context) {
|
||||
return returnIndex;
|
||||
}
|
||||
|
||||
bool BcsSplit::Events::allocatePool(Context *context, size_t maxEventCountInPool, size_t neededEvents) {
|
||||
bool BcsSplitEvents::allocatePool(Context *context, size_t maxEventCountInPool, size_t neededEvents) {
|
||||
if (this->pools.empty() ||
|
||||
this->createdFromLatestPool + neededEvents > maxEventCountInPool) {
|
||||
ze_result_t result;
|
||||
ze_event_pool_desc_t desc = {ZE_STRUCTURE_TYPE_EVENT_POOL_DESC};
|
||||
desc.count = static_cast<uint32_t>(maxEventCountInPool);
|
||||
auto hDevice = this->bcsSplit.device.toHandle();
|
||||
auto pool = EventPool::create(this->bcsSplit.device.getDriverHandle(), context, 1, &hDevice, &desc, result);
|
||||
auto hDevice = this->bcsSplit.getDevice().toHandle();
|
||||
auto pool = EventPool::create(this->bcsSplit.getDevice().getDriverHandle(), context, 1, &hDevice, &desc, result);
|
||||
if (!pool) {
|
||||
return false;
|
||||
}
|
||||
@@ -265,7 +250,7 @@ bool BcsSplit::Events::allocatePool(Context *context, size_t maxEventCountInPool
|
||||
return true;
|
||||
}
|
||||
|
||||
std::optional<size_t> BcsSplit::Events::createFromPool(Context *context, size_t maxEventCountInPool) {
|
||||
std::optional<size_t> BcsSplitEvents::createFromPool(Context *context, size_t maxEventCountInPool) {
|
||||
/* Internal events needed for split:
|
||||
* - event per subcopy to signal completion of given subcopy (vector of subcopy events),
|
||||
* - 1 event to signal completion of entire split (vector of marker events),
|
||||
@@ -311,7 +296,7 @@ std::optional<size_t> BcsSplit::Events::createFromPool(Context *context, size_t
|
||||
return this->marker.size() - 1;
|
||||
}
|
||||
|
||||
void BcsSplit::Events::resetEventPackage(size_t index) {
|
||||
void BcsSplitEvents::resetEventPackage(size_t index) {
|
||||
this->marker[index]->reset();
|
||||
this->barrier[index]->reset();
|
||||
for (size_t j = 0; j < this->bcsSplit.cmdLists.size(); j++) {
|
||||
@@ -319,7 +304,7 @@ void BcsSplit::Events::resetEventPackage(size_t index) {
|
||||
}
|
||||
}
|
||||
|
||||
void BcsSplit::Events::resetAggregatedEventState(size_t index, bool markerCompleted) {
|
||||
void BcsSplitEvents::resetAggregatedEventState(size_t index, bool markerCompleted) {
|
||||
*this->subcopy[index]->getInOrderExecInfo()->getBaseHostAddress() = 0;
|
||||
|
||||
auto markerEvent = this->marker[index];
|
||||
@@ -328,7 +313,7 @@ void BcsSplit::Events::resetAggregatedEventState(size_t index, bool markerComple
|
||||
markerEvent->setReportEmptyCbEventAsReady(markerCompleted);
|
||||
}
|
||||
|
||||
void BcsSplit::Events::releaseResources() {
|
||||
void BcsSplitEvents::releaseResources() {
|
||||
for (auto &markerEvent : this->marker) {
|
||||
markerEvent->destroy();
|
||||
}
|
||||
@@ -346,7 +331,7 @@ void BcsSplit::Events::releaseResources() {
|
||||
}
|
||||
pools.clear();
|
||||
|
||||
auto context = Context::fromHandle(bcsSplit.device.getDriverHandle()->getDefaultContext());
|
||||
auto context = Context::fromHandle(bcsSplit.getDevice().getDriverHandle()->getDefaultContext());
|
||||
for (auto &ptr : this->allocsForAggregatedEvents) {
|
||||
context->freeMem(ptr);
|
||||
}
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
|
||||
#include <functional>
|
||||
#include <mutex>
|
||||
#include <variant>
|
||||
#include <vector>
|
||||
|
||||
namespace NEO {
|
||||
@@ -26,18 +27,24 @@ enum class TransferDirection;
|
||||
namespace L0 {
|
||||
struct CommandQueue;
|
||||
struct DeviceImp;
|
||||
class BcsSplit;
|
||||
|
||||
struct BcsSplit {
|
||||
template <GFXCORE_FAMILY gfxCoreFamily, typename T, typename K>
|
||||
using AppendCallFuncT = std::function<ze_result_t(CommandListCoreFamilyImmediate<gfxCoreFamily> *, T, K, size_t, ze_event_handle_t, uint64_t)>;
|
||||
using CsrContainer = StackVec<NEO::CommandStreamReceiver *, 12u>;
|
||||
namespace BcsSplitParams {
|
||||
struct MemCopy {
|
||||
void *dst = nullptr;
|
||||
const void *src = nullptr;
|
||||
};
|
||||
|
||||
DeviceImp &device;
|
||||
uint32_t clientCount = 0u;
|
||||
struct RegionCopy {
|
||||
// originXParams
|
||||
uint32_t dst = 0;
|
||||
uint32_t src = 0;
|
||||
};
|
||||
|
||||
std::mutex mtx;
|
||||
using CopyParams = std::variant<MemCopy, RegionCopy>;
|
||||
} // namespace BcsSplitParams
|
||||
|
||||
struct Events {
|
||||
struct BcsSplitEvents {
|
||||
BcsSplit &bcsSplit;
|
||||
|
||||
std::mutex mtx;
|
||||
@@ -60,17 +67,28 @@ struct BcsSplit {
|
||||
size_t createAggregatedEvent(Context *context);
|
||||
uint64_t *getNextAllocationForAggregatedEvent();
|
||||
|
||||
Events(BcsSplit &bcsSplit) : bcsSplit(bcsSplit) {}
|
||||
} events;
|
||||
BcsSplitEvents(BcsSplit &bcsSplit) : bcsSplit(bcsSplit) {}
|
||||
};
|
||||
|
||||
class BcsSplit {
|
||||
public:
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
using AppendCallFuncT = std::function<ze_result_t(CommandListCoreFamilyImmediate<gfxCoreFamily> *, const BcsSplitParams::CopyParams &, size_t, ze_event_handle_t, uint64_t)>;
|
||||
|
||||
template <typename GfxFamily>
|
||||
static constexpr size_t maxEventCountInPool = MemoryConstants::pageSize64k / sizeof(typename GfxFamily::TimestampPacketType);
|
||||
|
||||
using CsrContainer = StackVec<NEO::CommandStreamReceiver *, 12u>;
|
||||
|
||||
BcsSplitEvents events;
|
||||
|
||||
std::vector<CommandList *> cmdLists;
|
||||
std::vector<CommandList *> h2dCmdLists;
|
||||
std::vector<CommandList *> d2hCmdLists;
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily, typename T, typename K>
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
ze_result_t appendSplitCall(CommandListCoreFamilyImmediate<gfxCoreFamily> *cmdList,
|
||||
T dstptr,
|
||||
K srcptr,
|
||||
const BcsSplitParams::CopyParams ©Params,
|
||||
size_t size,
|
||||
ze_event_handle_t hSignalEvent,
|
||||
uint32_t numWaitEvents,
|
||||
@@ -79,121 +97,26 @@ struct BcsSplit {
|
||||
bool hasRelaxedOrderingDependencies,
|
||||
NEO::TransferDirection direction,
|
||||
size_t estimatedCmdBufferSize,
|
||||
AppendCallFuncT<gfxCoreFamily, T, K> appendCall) {
|
||||
constexpr size_t maxEventCountInPool = MemoryConstants::pageSize64k / sizeof(typename CommandListCoreFamilyImmediate<gfxCoreFamily>::GfxFamily::TimestampPacketType);
|
||||
|
||||
const auto aggregatedEventsMode = this->events.aggregatedEventsMode;
|
||||
auto signalEvent = Event::fromHandle(hSignalEvent);
|
||||
|
||||
ze_result_t result = ZE_RESULT_SUCCESS;
|
||||
auto &cmdListsForSplit = this->getCmdListsForSplit(direction);
|
||||
auto engineCount = cmdListsForSplit.size();
|
||||
size_t markerEventIndex = 0;
|
||||
uint64_t aggregatedEventIncrementVal = 1;
|
||||
|
||||
const bool useSignalEventForSubcopy = aggregatedEventsMode && cmdList->isUsingAdditionalBlitProperties() && Event::isAggregatedEvent(signalEvent) &&
|
||||
(signalEvent->getInOrderIncrementValue(1) % engineCount == 0);
|
||||
|
||||
if (useSignalEventForSubcopy) {
|
||||
aggregatedEventIncrementVal = signalEvent->getInOrderIncrementValue(1) / engineCount;
|
||||
} else {
|
||||
auto markerEventIndexRet = this->events.obtainForSplit(Context::fromHandle(cmdList->getCmdListContext()), maxEventCountInPool);
|
||||
if (!markerEventIndexRet.has_value()) {
|
||||
return ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY;
|
||||
}
|
||||
markerEventIndex = *markerEventIndexRet;
|
||||
}
|
||||
|
||||
auto barrierRequired = !cmdList->isInOrderExecutionEnabled() && cmdList->isBarrierRequired();
|
||||
if (barrierRequired) {
|
||||
cmdList->appendSignalEvent(this->events.barrier[markerEventIndex]->toHandle(), false);
|
||||
}
|
||||
|
||||
auto subcopyEventIndex = markerEventIndex * this->cmdLists.size();
|
||||
StackVec<ze_event_handle_t, 16> eventHandles;
|
||||
|
||||
if (!cmdList->handleCounterBasedEventOperations(signalEvent, false)) {
|
||||
return ZE_RESULT_ERROR_INVALID_ARGUMENT;
|
||||
}
|
||||
|
||||
auto totalSize = size;
|
||||
for (size_t i = 0; i < cmdListsForSplit.size(); i++) {
|
||||
auto subCmdList = static_cast<CommandListCoreFamilyImmediate<gfxCoreFamily> *>(cmdListsForSplit[i]);
|
||||
|
||||
auto lock = subCmdList->getCsr(false)->obtainUniqueOwnership();
|
||||
|
||||
subCmdList->checkAvailableSpace(numWaitEvents, hasRelaxedOrderingDependencies, estimatedCmdBufferSize, false);
|
||||
|
||||
if (barrierRequired) {
|
||||
auto barrierEventHandle = this->events.barrier[markerEventIndex]->toHandle();
|
||||
subCmdList->addEventsToCmdList(1u, &barrierEventHandle, nullptr, hasRelaxedOrderingDependencies, false, true, false, false);
|
||||
}
|
||||
|
||||
if (cmdList->hasInOrderDependencies()) {
|
||||
auto &inOrderExecInfo = cmdList->getInOrderExecInfo();
|
||||
subCmdList->appendWaitOnInOrderDependency(inOrderExecInfo, nullptr, inOrderExecInfo->getCounterValue(), inOrderExecInfo->getAllocationOffset(), hasRelaxedOrderingDependencies, false, false, false, false);
|
||||
}
|
||||
subCmdList->addEventsToCmdList(numWaitEvents, phWaitEvents, nullptr, hasRelaxedOrderingDependencies, false, false, false, false);
|
||||
|
||||
if (!useSignalEventForSubcopy && signalEvent && i == 0u) {
|
||||
subCmdList->appendEventForProfilingAllWalkers(signalEvent, nullptr, nullptr, true, true, false, true);
|
||||
}
|
||||
|
||||
auto localSize = totalSize / engineCount;
|
||||
auto localDstPtr = ptrOffset(dstptr, size - totalSize);
|
||||
auto localSrcPtr = ptrOffset(srcptr, size - totalSize);
|
||||
|
||||
auto copyEventIndex = aggregatedEventsMode ? markerEventIndex : subcopyEventIndex + i;
|
||||
auto eventHandle = useSignalEventForSubcopy ? signalEvent : this->events.subcopy[copyEventIndex]->toHandle();
|
||||
result = appendCall(subCmdList, localDstPtr, localSrcPtr, localSize, eventHandle, aggregatedEventIncrementVal);
|
||||
subCmdList->flushImmediate(result, true, !hasRelaxedOrderingDependencies, hasRelaxedOrderingDependencies, NEO::AppendOperations::nonKernel, false, nullptr, true, &lock, nullptr);
|
||||
|
||||
if ((aggregatedEventsMode && i == 0) || !aggregatedEventsMode) {
|
||||
eventHandles.push_back(eventHandle);
|
||||
}
|
||||
|
||||
totalSize -= localSize;
|
||||
engineCount--;
|
||||
|
||||
if (signalEvent) {
|
||||
signalEvent->appendAdditionalCsr(subCmdList->getCsr(false));
|
||||
}
|
||||
}
|
||||
|
||||
const bool dualStreamCopyOffload = cmdList->isDualStreamCopyOffloadOperation(cmdList->isCopyOffloadEnabled());
|
||||
|
||||
cmdList->addEventsToCmdList(static_cast<uint32_t>(eventHandles.size()), eventHandles.data(), nullptr, hasRelaxedOrderingDependencies, false, true, false, dualStreamCopyOffload);
|
||||
|
||||
const auto isCopyCmdList = cmdList->isCopyOnly(dualStreamCopyOffload);
|
||||
|
||||
if (!useSignalEventForSubcopy && signalEvent) {
|
||||
cmdList->appendSignalEventPostWalker(signalEvent, nullptr, nullptr, !isCopyCmdList, false, isCopyCmdList);
|
||||
}
|
||||
|
||||
if (!aggregatedEventsMode) {
|
||||
cmdList->appendSignalEventPostWalker(this->events.marker[markerEventIndex], nullptr, nullptr, !isCopyCmdList, false, isCopyCmdList);
|
||||
}
|
||||
|
||||
if (cmdList->isInOrderExecutionEnabled()) {
|
||||
cmdList->appendSignalInOrderDependencyCounter(signalEvent, dualStreamCopyOffload, false, false, useSignalEventForSubcopy);
|
||||
}
|
||||
cmdList->handleInOrderDependencyCounter(signalEvent, false, dualStreamCopyOffload);
|
||||
|
||||
if (aggregatedEventsMode && !useSignalEventForSubcopy) {
|
||||
std::lock_guard<std::mutex> lock(events.mtx);
|
||||
cmdList->assignInOrderExecInfoToEvent(this->events.marker[markerEventIndex]);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
AppendCallFuncT<gfxCoreFamily> appendCall);
|
||||
|
||||
bool setupDevice(NEO::CommandStreamReceiver *csr, bool copyOffloadEnabled);
|
||||
void releaseResources();
|
||||
std::vector<CommandList *> &getCmdListsForSplit(NEO::TransferDirection direction);
|
||||
void setupEnginesMask(NEO::BcsSplitSettings &settings);
|
||||
bool setupQueues(const NEO::BcsSplitSettings &settings);
|
||||
DeviceImp &getDevice() const { return device; }
|
||||
|
||||
BcsSplit(DeviceImp &device) : device(device), events(*this){};
|
||||
BcsSplit(DeviceImp &device) : events(*this), device(device){};
|
||||
|
||||
protected:
|
||||
std::vector<CommandList *> &getCmdListsForSplit(NEO::TransferDirection direction);
|
||||
void setupEnginesMask();
|
||||
bool setupQueues();
|
||||
|
||||
DeviceImp &device;
|
||||
NEO::BcsSplitSettings splitSettings = {};
|
||||
uint32_t clientCount = 0u;
|
||||
|
||||
std::mutex mtx;
|
||||
};
|
||||
|
||||
} // namespace L0
|
||||
|
||||
#include "level_zero/core/source/device/bcs_split.inl"
|
||||
|
||||
137
level_zero/core/source/device/bcs_split.inl
Normal file
137
level_zero/core/source/device/bcs_split.inl
Normal file
@@ -0,0 +1,137 @@
|
||||
/*
|
||||
* Copyright (C) 2025 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
namespace L0 {
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
ze_result_t BcsSplit::appendSplitCall(CommandListCoreFamilyImmediate<gfxCoreFamily> *cmdList,
|
||||
const BcsSplitParams::CopyParams ©Params,
|
||||
size_t size,
|
||||
ze_event_handle_t hSignalEvent,
|
||||
uint32_t numWaitEvents,
|
||||
ze_event_handle_t *phWaitEvents,
|
||||
bool performMigration,
|
||||
bool hasRelaxedOrderingDependencies,
|
||||
NEO::TransferDirection direction,
|
||||
size_t estimatedCmdBufferSize,
|
||||
AppendCallFuncT<gfxCoreFamily> appendCall) {
|
||||
|
||||
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
|
||||
|
||||
const auto aggregatedEventsMode = this->events.aggregatedEventsMode;
|
||||
auto signalEvent = Event::fromHandle(hSignalEvent);
|
||||
|
||||
ze_result_t result = ZE_RESULT_SUCCESS;
|
||||
auto &cmdListsForSplit = this->getCmdListsForSplit(direction);
|
||||
auto engineCount = cmdListsForSplit.size();
|
||||
size_t markerEventIndex = 0;
|
||||
uint64_t aggregatedEventIncrementVal = 1;
|
||||
|
||||
const bool useSignalEventForSubcopy = aggregatedEventsMode && cmdList->isUsingAdditionalBlitProperties() && Event::isAggregatedEvent(signalEvent) &&
|
||||
(signalEvent->getInOrderIncrementValue(1) % engineCount == 0);
|
||||
|
||||
if (useSignalEventForSubcopy) {
|
||||
aggregatedEventIncrementVal = signalEvent->getInOrderIncrementValue(1) / engineCount;
|
||||
} else {
|
||||
auto markerEventIndexRet = this->events.obtainForSplit(Context::fromHandle(cmdList->getCmdListContext()), maxEventCountInPool<GfxFamily>);
|
||||
if (!markerEventIndexRet.has_value()) {
|
||||
return ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY;
|
||||
}
|
||||
markerEventIndex = *markerEventIndexRet;
|
||||
}
|
||||
|
||||
auto barrierRequired = !cmdList->isInOrderExecutionEnabled() && cmdList->isBarrierRequired();
|
||||
if (barrierRequired) {
|
||||
cmdList->appendSignalEvent(this->events.barrier[markerEventIndex]->toHandle(), false);
|
||||
}
|
||||
|
||||
auto subcopyEventIndex = markerEventIndex * this->cmdLists.size();
|
||||
StackVec<ze_event_handle_t, 16> eventHandles;
|
||||
|
||||
if (!cmdList->handleCounterBasedEventOperations(signalEvent, false)) {
|
||||
return ZE_RESULT_ERROR_INVALID_ARGUMENT;
|
||||
}
|
||||
|
||||
auto totalSize = size;
|
||||
for (size_t i = 0; i < cmdListsForSplit.size(); i++) {
|
||||
auto subCmdList = static_cast<CommandListCoreFamilyImmediate<gfxCoreFamily> *>(cmdListsForSplit[i]);
|
||||
|
||||
auto lock = subCmdList->getCsr(false)->obtainUniqueOwnership();
|
||||
|
||||
subCmdList->checkAvailableSpace(numWaitEvents, hasRelaxedOrderingDependencies, estimatedCmdBufferSize, false);
|
||||
|
||||
if (barrierRequired) {
|
||||
auto barrierEventHandle = this->events.barrier[markerEventIndex]->toHandle();
|
||||
subCmdList->addEventsToCmdList(1u, &barrierEventHandle, nullptr, hasRelaxedOrderingDependencies, false, true, false, false);
|
||||
}
|
||||
|
||||
if (cmdList->hasInOrderDependencies()) {
|
||||
auto &inOrderExecInfo = cmdList->getInOrderExecInfo();
|
||||
subCmdList->appendWaitOnInOrderDependency(inOrderExecInfo, nullptr, inOrderExecInfo->getCounterValue(), inOrderExecInfo->getAllocationOffset(), hasRelaxedOrderingDependencies, false, false, false, false);
|
||||
}
|
||||
subCmdList->addEventsToCmdList(numWaitEvents, phWaitEvents, nullptr, hasRelaxedOrderingDependencies, false, false, false, false);
|
||||
|
||||
if (!useSignalEventForSubcopy && signalEvent && i == 0u) {
|
||||
subCmdList->appendEventForProfilingAllWalkers(signalEvent, nullptr, nullptr, true, true, false, true);
|
||||
}
|
||||
|
||||
auto localSize = totalSize / engineCount;
|
||||
|
||||
BcsSplitParams::CopyParams localCopyParams;
|
||||
|
||||
std::visit([&](auto &&arg) {
|
||||
using T = std::decay_t<decltype(arg)>;
|
||||
localCopyParams = T{ptrOffset(arg.dst, size - totalSize),
|
||||
ptrOffset(arg.src, size - totalSize)};
|
||||
},
|
||||
copyParams);
|
||||
|
||||
auto copyEventIndex = aggregatedEventsMode ? markerEventIndex : subcopyEventIndex + i;
|
||||
auto eventHandle = useSignalEventForSubcopy ? signalEvent : this->events.subcopy[copyEventIndex]->toHandle();
|
||||
result = appendCall(subCmdList, localCopyParams, localSize, eventHandle, aggregatedEventIncrementVal);
|
||||
subCmdList->flushImmediate(result, true, !hasRelaxedOrderingDependencies, hasRelaxedOrderingDependencies, NEO::AppendOperations::nonKernel, false, nullptr, true, &lock, nullptr);
|
||||
|
||||
if ((aggregatedEventsMode && i == 0) || !aggregatedEventsMode) {
|
||||
eventHandles.push_back(eventHandle);
|
||||
}
|
||||
|
||||
totalSize -= localSize;
|
||||
engineCount--;
|
||||
|
||||
if (signalEvent) {
|
||||
signalEvent->appendAdditionalCsr(subCmdList->getCsr(false));
|
||||
}
|
||||
}
|
||||
|
||||
const bool dualStreamCopyOffload = cmdList->isDualStreamCopyOffloadOperation(cmdList->isCopyOffloadEnabled());
|
||||
|
||||
cmdList->addEventsToCmdList(static_cast<uint32_t>(eventHandles.size()), eventHandles.data(), nullptr, hasRelaxedOrderingDependencies, false, true, false, dualStreamCopyOffload);
|
||||
|
||||
const auto isCopyCmdList = cmdList->isCopyOnly(dualStreamCopyOffload);
|
||||
|
||||
if (!useSignalEventForSubcopy && signalEvent) {
|
||||
cmdList->appendSignalEventPostWalker(signalEvent, nullptr, nullptr, !isCopyCmdList, false, isCopyCmdList);
|
||||
}
|
||||
|
||||
if (!aggregatedEventsMode) {
|
||||
cmdList->appendSignalEventPostWalker(this->events.marker[markerEventIndex], nullptr, nullptr, !isCopyCmdList, false, isCopyCmdList);
|
||||
}
|
||||
|
||||
if (cmdList->isInOrderExecutionEnabled()) {
|
||||
cmdList->appendSignalInOrderDependencyCounter(signalEvent, dualStreamCopyOffload, false, false, useSignalEventForSubcopy);
|
||||
}
|
||||
cmdList->handleInOrderDependencyCounter(signalEvent, false, dualStreamCopyOffload);
|
||||
|
||||
if (aggregatedEventsMode && !useSignalEventForSubcopy) {
|
||||
std::lock_guard<std::mutex> lock(events.mtx);
|
||||
cmdList->assignInOrderExecInfoToEvent(this->events.marker[markerEventIndex]);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace L0
|
||||
@@ -33,7 +33,7 @@ struct HardwareInfo;
|
||||
} // namespace NEO
|
||||
|
||||
namespace L0 {
|
||||
struct BcsSplit;
|
||||
class BcsSplit;
|
||||
struct Image;
|
||||
struct SysmanDevice;
|
||||
struct FabricVertex;
|
||||
|
||||
@@ -1691,7 +1691,7 @@ HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhe
|
||||
NEO::debugManager.flags.OverrideEventSynchronizeTimeout.set(0);
|
||||
NEO::debugManager.flags.EnableTimestampPoolAllocator.set(0);
|
||||
|
||||
auto memoryManager = reinterpret_cast<MockMemoryManager *>(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit->device.getDriverHandle()->getMemoryManager());
|
||||
auto memoryManager = reinterpret_cast<MockMemoryManager *>(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit->getDevice().getDriverHandle()->getMemoryManager());
|
||||
memoryManager->isMockHostMemoryManager = true;
|
||||
memoryManager->forceFailureInPrimaryAllocation = true;
|
||||
|
||||
@@ -1741,7 +1741,7 @@ HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhe
|
||||
NEO::debugManager.flags.OverrideEventSynchronizeTimeout.set(0);
|
||||
NEO::debugManager.flags.EnableTimestampPoolAllocator.set(0);
|
||||
|
||||
auto memoryManager = reinterpret_cast<MockMemoryManager *>(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit->device.getDriverHandle()->getMemoryManager());
|
||||
auto memoryManager = reinterpret_cast<MockMemoryManager *>(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit->getDevice().getDriverHandle()->getMemoryManager());
|
||||
memoryManager->isMockHostMemoryManager = true;
|
||||
memoryManager->forceFailureInPrimaryAllocation = true;
|
||||
|
||||
|
||||
@@ -102,6 +102,7 @@ struct DebugVarBase {
|
||||
this->set(data);
|
||||
}
|
||||
}
|
||||
|
||||
// Records which environment-variable prefix this debug variable was read with.
// NOTE(review): std::move is a no-op if DebugVarPrefix is a trivially copyable
// enum — harmless, but confirm the type before relying on move semantics here.
void setPrefixType(DebugVarPrefix data) {
    prefixType = std::move(data);
}
|
||||
@@ -117,6 +118,13 @@ struct DebugVarBase {
|
||||
return (value != defaultValue) ? static_cast<UserType>(value) : userValue;
|
||||
}
|
||||
|
||||
template <typename UserType>
|
||||
void assignIfNotDefault(UserType &userDataForAssignment) const {
|
||||
if (value != defaultValue) {
|
||||
userDataForAssignment = static_cast<UserType>(value);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
T value;
|
||||
T defaultValue;
|
||||
|
||||
Reference in New Issue
Block a user