feature: initial support for aggregated bcs split

Related-To: NEO-14557

Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
This commit is contained in:
Bartosz Dunajski
2025-06-23 07:50:44 +00:00
committed by Compute-Runtime-Automation
parent b7580a3998
commit de8f98b43e
7 changed files with 278 additions and 38 deletions

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2022-2024 Intel Corporation
* Copyright (C) 2022-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -12,12 +12,16 @@
#include "shared/source/os_interface/os_context.h"
#include "level_zero/core/source/device/device_imp.h"
#include "level_zero/core/source/driver/driver_handle.h"
#include "level_zero/driver_experimental/zex_api.h"
namespace L0 {
bool BcsSplit::setupDevice(uint32_t productFamily, bool internalUsage, const ze_command_queue_desc_t *desc, NEO::CommandStreamReceiver *csr) {
const auto defaultEngine = this->device.getProductHelper().getDefaultCopyEngine();
auto initializeBcsSplit = this->device.getNEODevice()->isBcsSplitSupported() &&
csr->getOsContext().getEngineType() == aub_stream::EngineType::ENGINE_BCS &&
csr->getOsContext().getEngineType() == defaultEngine &&
!internalUsage;
if (!initializeBcsSplit) {
@@ -54,8 +58,7 @@ bool BcsSplit::setupDevice(uint32_t productFamily, bool internalUsage, const ze_
return false;
}
ze_command_queue_desc_t splitDesc;
memcpy(&splitDesc, desc, sizeof(ze_command_queue_desc_t));
ze_command_queue_desc_t splitDesc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC};
splitDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
for (const auto &csr : csrs) {
@@ -114,8 +117,30 @@ std::vector<CommandQueue *> &BcsSplit::getCmdQsForSplit(NEO::TransferDirection d
return this->cmdQs;
}
// Binds the event storage to its owning BcsSplit and applies the
// SplitBcsAggregatedEventsMode debug flag when it carries an override.
BcsSplit::Events::Events(BcsSplit &bcsSplit) : bcsSplit(bcsSplit) {
    // A flag value of -1 is treated as "no override": aggregatedEventsMode is left untouched.
    const auto modeOverride = NEO::debugManager.flags.SplitBcsAggregatedEventsMode.get();
    if (modeOverride != -1) {
        // Any other value is collapsed to a boolean (non-zero enables the mode).
        aggregatedEventsMode = (modeOverride != 0);
    }
}
// Returns the index (into `subcopy`) of an aggregated event that can be used for a new split.
//
// The first event whose queryStatus() reports ZE_RESULT_SUCCESS is recycled: its state is
// reset with value 0 and its index is returned. If no event qualifies, a new batch is
// created through createAggregatedEvent() and that call's result is returned.
size_t BcsSplit::Events::obtainAggregatedEventsForSplit(Context *context) {
    size_t candidateIndex = 0;
    for (auto aggregatedEvent : this->subcopy) {
        if (aggregatedEvent->queryStatus() == ZE_RESULT_SUCCESS) {
            resetAggregatedEventState(candidateIndex, 0);
            return candidateIndex;
        }
        candidateIndex++;
    }

    // Nothing reusable was found.
    return this->createAggregatedEvent(context);
}
std::optional<size_t> BcsSplit::Events::obtainForSplit(Context *context, size_t maxEventCountInPool) {
std::lock_guard<std::mutex> lock(this->mtx);
if (this->aggregatedEventsMode) {
return obtainAggregatedEventsForSplit(context);
}
for (size_t i = 0; i < this->marker.size(); i++) {
auto ret = this->marker[i]->queryStatus();
if (ret == ZE_RESULT_SUCCESS) {
@@ -124,7 +149,7 @@ std::optional<size_t> BcsSplit::Events::obtainForSplit(Context *context, size_t
}
}
auto newEventIndex = this->allocateNew(context, maxEventCountInPool);
auto newEventIndex = this->createFromPool(context, maxEventCountInPool);
if (newEventIndex.has_value() || this->marker.empty()) {
return newEventIndex;
}
@@ -134,54 +159,115 @@ std::optional<size_t> BcsSplit::Events::obtainForSplit(Context *context, size_t
return 0;
}
std::optional<size_t> BcsSplit::Events::allocateNew(Context *context, size_t maxEventCountInPool) {
/* Internal events needed for split:
* - event per subcopy to signal completion of given subcopy (vector of subcopy events),
* - 1 event to signal completion of entire split (vector of marker events),
* - 1 event to handle barrier (vector of barrier events).
*/
const size_t neededEvents = this->bcsSplit.cmdQs.size() + 2;
uint64_t *BcsSplit::Events::getNextAllocationForAggregatedEvent() {
constexpr size_t allocationSize = MemoryConstants::pageSize64k;
if (!this->allocsForAggregatedEvents.empty() && (currentAggregatedAllocOffset + MemoryConstants::cacheLineSize) < allocationSize) {
currentAggregatedAllocOffset += MemoryConstants::cacheLineSize;
} else {
ze_device_mem_alloc_desc_t desc = {ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC};
void *ptr = nullptr;
auto context = Context::fromHandle(bcsSplit.device.getDriverHandle()->getDefaultContext());
context->allocDeviceMem(bcsSplit.device.toHandle(), &desc, allocationSize, MemoryConstants::pageSize64k, &ptr);
UNRECOVERABLE_IF(!ptr);
currentAggregatedAllocOffset = 0;
this->allocsForAggregatedEvents.push_back(ptr);
}
auto basePtr = reinterpret_cast<uint64_t *>(this->allocsForAggregatedEvents.back());
return ptrOffset(basePtr, currentAggregatedAllocOffset);
}
// Creates a batch of counter-based events backed by external storage and appends them
// to `subcopy`.
//
// Each event gets its own slot from getNextAllocationForAggregatedEvent(), a completion
// value equal to the number of split command queues and an increment value of 1.
// The first event of the batch is initialised with value 0; the remaining ones are
// initialised with the completion value.
//
// Returns the index of the first event of the new batch.
// Aborts through UNRECOVERABLE_IF when event creation leaves the handle null.
size_t BcsSplit::Events::createAggregatedEvent(Context *context) {
    constexpr int preallocationCount = 8;

    const size_t firstNewIndex = this->subcopy.size();
    const auto completionValue = static_cast<uint64_t>(bcsSplit.cmdQs.size());

    zex_counter_based_event_external_storage_properties_t storageProperties = {ZEX_STRUCTURE_COUNTER_BASED_EVENT_EXTERNAL_STORAGE_ALLOC_PROPERTIES};
    storageProperties.completionValue = completionValue;
    storageProperties.incrementValue = 1;

    zex_counter_based_event_desc_t eventDesc = {ZEX_STRUCTURE_COUNTER_BASED_EVENT_DESC};
    eventDesc.flags = ZEX_COUNTER_BASED_EVENT_FLAG_IMMEDIATE;
    eventDesc.signalScope = ZE_EVENT_SCOPE_FLAG_DEVICE;
    eventDesc.pNext = &storageProperties;

    for (int created = 0; created < preallocationCount; created++) {
        // Every event points at its own storage slot.
        storageProperties.deviceAddress = getNextAllocationForAggregatedEvent();

        ze_event_handle_t newHandle = nullptr;
        zexCounterBasedEventCreate2(context, bcsSplit.device.toHandle(), &eventDesc, &newHandle);
        UNRECOVERABLE_IF(newHandle == nullptr);

        const size_t newIndex = this->subcopy.size();
        this->subcopy.push_back(Event::fromHandle(newHandle));

        // Only the first event (the one whose index is returned) starts from 0.
        const uint64_t initialValue = (created == 0) ? 0 : completionValue;
        resetAggregatedEventState(newIndex, initialValue);
    }

    return firstNewIndex;
}
// Makes sure the most recent event pool has room for `neededEvents` more events.
// A new pool with `maxEventCountInPool` entries is created when no pool exists yet or
// when createdFromLatestPool + neededEvents would exceed maxEventCountInPool.
// NOTE(review): this listing appears to carry both pre- and post-change lines of a diff
// (two `desc` declarations and two failure returns below) — confirm which ones are live.
bool BcsSplit::Events::allocatePool(Context *context, size_t maxEventCountInPool, size_t neededEvents) {
if (this->pools.empty() ||
this->createdFromLatestPool + neededEvents > maxEventCountInPool) {
ze_result_t result;
ze_event_pool_desc_t desc{};
desc.stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC;
ze_event_pool_desc_t desc = {ZE_STRUCTURE_TYPE_EVENT_POOL_DESC};
desc.count = static_cast<uint32_t>(maxEventCountInPool);
auto hDevice = this->bcsSplit.device.toHandle();
auto pool = EventPool::create(this->bcsSplit.device.getDriverHandle(), context, 1, &hDevice, &desc, result);
// Pool creation failed: report it to the caller without touching `pools`.
if (!pool) {
return std::nullopt;
return false;
}
this->pools.push_back(pool);
// The freshly added pool has no events created from it yet.
this->createdFromLatestPool = 0u;
}
return true;
}
std::optional<size_t> BcsSplit::Events::createFromPool(Context *context, size_t maxEventCountInPool) {
/* Internal events needed for split:
* - event per subcopy to signal completion of given subcopy (vector of subcopy events),
* - 1 event to signal completion of entire split (vector of marker events),
* - 1 event to handle barrier (vector of barrier events).
*/
const size_t neededEvents = this->bcsSplit.cmdQs.size() + 2;
if (!allocatePool(context, maxEventCountInPool, neededEvents)) {
return std::nullopt;
}
auto pool = this->pools[this->pools.size() - 1];
ze_event_desc_t desc{};
desc.stype = ZE_STRUCTURE_TYPE_EVENT_DESC;
desc.signal = ZE_EVENT_SCOPE_FLAG_DEVICE;
ze_event_desc_t desc = {ZE_STRUCTURE_TYPE_EVENT_DESC};
for (size_t i = 0; i < neededEvents; i++) {
// Marker event is the only one of internal split events that will be read from host, so create it at the end with appended scope flag.
bool markerEvent = (i == neededEvents - 1);
bool barrierEvent = (i == neededEvents - 2);
desc.signal = markerEvent ? ZE_EVENT_SCOPE_FLAG_HOST : ZE_EVENT_SCOPE_FLAG_DEVICE;
desc.index = static_cast<uint32_t>(this->createdFromLatestPool++);
// Marker event is the only one of internal split events that will be read from host, so create it at the end with appended scope flag.
if (i == neededEvents - 1) {
desc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
}
ze_event_handle_t hEvent{};
ze_event_handle_t hEvent = {};
pool->createEvent(&desc, &hEvent);
Event::fromHandle(hEvent)->disableImplicitCounterBasedMode();
auto event = Event::fromHandle(hEvent);
event->disableImplicitCounterBasedMode();
// Last event, created with host scope flag, is marker event.
if (i == neededEvents - 1) {
this->marker.push_back(Event::fromHandle(hEvent));
if (markerEvent) {
this->marker.push_back(event);
// One event to handle barrier and others to handle subcopy completion.
} else if (i == neededEvents - 2) {
this->barrier.push_back(Event::fromHandle(hEvent));
} else if (barrierEvent) {
this->barrier.push_back(event);
} else {
this->subcopy.push_back(Event::fromHandle(hEvent));
this->subcopy.push_back(event);
}
}
@@ -196,6 +282,17 @@ void BcsSplit::Events::resetEventPackage(size_t index) {
}
}
// Writes `value` to the host address exposed by the in-order exec info of the
// subcopy event at `index`, then updates the event's completion state to match:
// value 0 resets the completion status, any other value marks the event completed.
void BcsSplit::Events::resetAggregatedEventState(size_t index, uint64_t value) {
    auto aggregatedEvent = this->subcopy[index];

    auto hostCounter = aggregatedEvent->getInOrderExecInfo()->getBaseHostAddress();
    *hostCounter = value;

    if (value != 0) {
        aggregatedEvent->setIsCompleted();
        return;
    }
    aggregatedEvent->resetCompletionStatus();
}
void BcsSplit::Events::releaseResources() {
for (auto &markerEvent : this->marker) {
markerEvent->destroy();
@@ -213,5 +310,11 @@ void BcsSplit::Events::releaseResources() {
pool->destroy();
}
pools.clear();
auto context = Context::fromHandle(bcsSplit.device.getDriverHandle()->getDefaultContext());
for (auto &ptr : this->allocsForAggregatedEvents) {
context->freeMem(ptr);
}
allocsForAggregatedEvents.clear();
}
} // namespace L0

View File

@@ -42,15 +42,22 @@ struct BcsSplit {
std::vector<Event *> barrier;
std::vector<Event *> subcopy;
std::vector<Event *> marker;
std::vector<void *> allocsForAggregatedEvents;
size_t currentAggregatedAllocOffset = 0;
size_t createdFromLatestPool = 0u;
bool aggregatedEventsMode = false;
std::optional<size_t> obtainForSplit(Context *context, size_t maxEventCountInPool);
std::optional<size_t> allocateNew(Context *context, size_t maxEventCountInPool);
size_t obtainAggregatedEventsForSplit(Context *context);
void resetEventPackage(size_t index);
void resetAggregatedEventState(size_t index, uint64_t value);
void releaseResources();
bool allocatePool(Context *context, size_t maxEventCountInPool, size_t neededEvents);
std::optional<size_t> createFromPool(Context *context, size_t maxEventCountInPool);
size_t createAggregatedEvent(Context *context);
uint64_t *getNextAllocationForAggregatedEvent();
Events(BcsSplit &bcsSplit) : bcsSplit(bcsSplit){};
Events(BcsSplit &bcsSplit);
} events;
std::vector<CommandQueue *> cmdQs;