mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-03 06:49:52 +08:00
feature: initial support for aggregated bcs split
Related-To: NEO-14557 Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
b7580a3998
commit
de8f98b43e
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2022-2024 Intel Corporation
|
||||
* Copyright (C) 2022-2025 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -12,12 +12,16 @@
|
||||
#include "shared/source/os_interface/os_context.h"
|
||||
|
||||
#include "level_zero/core/source/device/device_imp.h"
|
||||
#include "level_zero/core/source/driver/driver_handle.h"
|
||||
#include "level_zero/driver_experimental/zex_api.h"
|
||||
|
||||
namespace L0 {
|
||||
|
||||
bool BcsSplit::setupDevice(uint32_t productFamily, bool internalUsage, const ze_command_queue_desc_t *desc, NEO::CommandStreamReceiver *csr) {
|
||||
const auto defaultEngine = this->device.getProductHelper().getDefaultCopyEngine();
|
||||
|
||||
auto initializeBcsSplit = this->device.getNEODevice()->isBcsSplitSupported() &&
|
||||
csr->getOsContext().getEngineType() == aub_stream::EngineType::ENGINE_BCS &&
|
||||
csr->getOsContext().getEngineType() == defaultEngine &&
|
||||
!internalUsage;
|
||||
|
||||
if (!initializeBcsSplit) {
|
||||
@@ -54,8 +58,7 @@ bool BcsSplit::setupDevice(uint32_t productFamily, bool internalUsage, const ze_
|
||||
return false;
|
||||
}
|
||||
|
||||
ze_command_queue_desc_t splitDesc;
|
||||
memcpy(&splitDesc, desc, sizeof(ze_command_queue_desc_t));
|
||||
ze_command_queue_desc_t splitDesc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC};
|
||||
splitDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
|
||||
|
||||
for (const auto &csr : csrs) {
|
||||
@@ -114,8 +117,30 @@ std::vector<CommandQueue *> &BcsSplit::getCmdQsForSplit(NEO::TransferDirection d
|
||||
return this->cmdQs;
|
||||
}
|
||||
|
||||
// Binds the events container to its owning BcsSplit.
// The SplitBcsAggregatedEventsMode debug flag overrides aggregatedEventsMode
// only when it holds a value other than -1; any non-zero value enables the mode.
BcsSplit::Events::Events(BcsSplit &bcsSplit) : bcsSplit(bcsSplit) {
    const auto aggregatedModeOverride = NEO::debugManager.flags.SplitBcsAggregatedEventsMode.get();

    if (aggregatedModeOverride == -1) {
        // Flag left untouched: keep the in-class default.
        return;
    }

    aggregatedEventsMode = (aggregatedModeOverride != 0);
}
|
||||
|
||||
// Returns the index (into the subcopy vector) of an event to use for a new
// aggregated split. The first event whose queryStatus() reports
// ZE_RESULT_SUCCESS is reset to counter value 0 and reused; when no such
// event exists, the index returned by createAggregatedEvent() is used.
size_t BcsSplit::Events::obtainAggregatedEventsForSplit(Context *context) {
    size_t eventIndex = 0;

    for (auto candidate : this->subcopy) {
        if (candidate->queryStatus() == ZE_RESULT_SUCCESS) {
            resetAggregatedEventState(eventIndex, 0);
            return eventIndex;
        }
        eventIndex++;
    }

    // Nothing reusable yet - create more events.
    return this->createAggregatedEvent(context);
}
|
||||
|
||||
std::optional<size_t> BcsSplit::Events::obtainForSplit(Context *context, size_t maxEventCountInPool) {
|
||||
std::lock_guard<std::mutex> lock(this->mtx);
|
||||
|
||||
if (this->aggregatedEventsMode) {
|
||||
return obtainAggregatedEventsForSplit(context);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < this->marker.size(); i++) {
|
||||
auto ret = this->marker[i]->queryStatus();
|
||||
if (ret == ZE_RESULT_SUCCESS) {
|
||||
@@ -124,7 +149,7 @@ std::optional<size_t> BcsSplit::Events::obtainForSplit(Context *context, size_t
|
||||
}
|
||||
}
|
||||
|
||||
auto newEventIndex = this->allocateNew(context, maxEventCountInPool);
|
||||
auto newEventIndex = this->createFromPool(context, maxEventCountInPool);
|
||||
if (newEventIndex.has_value() || this->marker.empty()) {
|
||||
return newEventIndex;
|
||||
}
|
||||
@@ -134,54 +159,115 @@ std::optional<size_t> BcsSplit::Events::obtainForSplit(Context *context, size_t
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::optional<size_t> BcsSplit::Events::allocateNew(Context *context, size_t maxEventCountInPool) {
|
||||
/* Internal events needed for split:
|
||||
* - event per subcopy to signal completion of given subcopy (vector of subcopy events),
|
||||
* - 1 event to signal completion of entire split (vector of marker events),
|
||||
* - 1 event to handle barrier (vector of barrier events).
|
||||
*/
|
||||
const size_t neededEvents = this->bcsSplit.cmdQs.size() + 2;
|
||||
uint64_t *BcsSplit::Events::getNextAllocationForAggregatedEvent() {
|
||||
constexpr size_t allocationSize = MemoryConstants::pageSize64k;
|
||||
|
||||
if (!this->allocsForAggregatedEvents.empty() && (currentAggregatedAllocOffset + MemoryConstants::cacheLineSize) < allocationSize) {
|
||||
currentAggregatedAllocOffset += MemoryConstants::cacheLineSize;
|
||||
} else {
|
||||
ze_device_mem_alloc_desc_t desc = {ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC};
|
||||
void *ptr = nullptr;
|
||||
|
||||
auto context = Context::fromHandle(bcsSplit.device.getDriverHandle()->getDefaultContext());
|
||||
|
||||
context->allocDeviceMem(bcsSplit.device.toHandle(), &desc, allocationSize, MemoryConstants::pageSize64k, &ptr);
|
||||
UNRECOVERABLE_IF(!ptr);
|
||||
currentAggregatedAllocOffset = 0;
|
||||
|
||||
this->allocsForAggregatedEvents.push_back(ptr);
|
||||
}
|
||||
|
||||
auto basePtr = reinterpret_cast<uint64_t *>(this->allocsForAggregatedEvents.back());
|
||||
|
||||
return ptrOffset(basePtr, currentAggregatedAllocOffset);
|
||||
}
|
||||
|
||||
// Creates a batch of counter-based events backed by external storage and
// appends them to the subcopy vector. Each event completes at a counter value
// equal to the number of split command queues, incremented by 1 per signal.
// The first event of the batch is initialized with counter value 0; the
// remaining ones are initialized with the completion value.
// Returns the index of the first event created by this call.
size_t BcsSplit::Events::createAggregatedEvent(Context *context) {
    constexpr int preallocationCount = 8;
    const size_t firstNewEventIndex = this->subcopy.size();

    zex_counter_based_event_external_storage_properties_t externalStorageAllocProperties = {ZEX_STRUCTURE_COUNTER_BASED_EVENT_EXTERNAL_STORAGE_ALLOC_PROPERTIES};
    externalStorageAllocProperties.incrementValue = 1;
    externalStorageAllocProperties.completionValue = static_cast<uint64_t>(bcsSplit.cmdQs.size());

    zex_counter_based_event_desc_t counterBasedDesc = {ZEX_STRUCTURE_COUNTER_BASED_EVENT_DESC};
    counterBasedDesc.pNext = &externalStorageAllocProperties;
    counterBasedDesc.flags = ZEX_COUNTER_BASED_EVENT_FLAG_IMMEDIATE;
    counterBasedDesc.signalScope = ZE_EVENT_SCOPE_FLAG_DEVICE;

    for (int created = 0; created < preallocationCount; created++) {
        // Every event gets its own storage slot; the descriptor chain is reused.
        externalStorageAllocProperties.deviceAddress = getNextAllocationForAggregatedEvent();

        ze_event_handle_t handle = nullptr;
        zexCounterBasedEventCreate2(context, bcsSplit.device.toHandle(), &counterBasedDesc, &handle);
        UNRECOVERABLE_IF(handle == nullptr);

        this->subcopy.push_back(Event::fromHandle(handle));

        const size_t newEventIndex = this->subcopy.size() - 1;
        const bool firstInBatch = (created == 0);

        resetAggregatedEventState(newEventIndex, firstInBatch ? 0 : externalStorageAllocProperties.completionValue);
    }

    return firstNewEventIndex;
}
|
||||
|
||||
// Makes sure the most recent event pool can hold `neededEvents` more events.
// A new pool sized for `maxEventCountInPool` events is created when no pool
// exists yet or the latest pool would overflow; createdFromLatestPool is then
// reset to 0.
// Returns false only when a new pool was required and could not be created;
// returns true otherwise.
bool BcsSplit::Events::allocatePool(Context *context, size_t maxEventCountInPool, size_t neededEvents) {
    const bool latestPoolExhausted = this->pools.empty() ||
                                     this->createdFromLatestPool + neededEvents > maxEventCountInPool;

    if (!latestPoolExhausted) {
        return true;
    }

    ze_result_t result = ZE_RESULT_SUCCESS;
    ze_event_pool_desc_t desc = {ZE_STRUCTURE_TYPE_EVENT_POOL_DESC};
    desc.count = static_cast<uint32_t>(maxEventCountInPool);

    auto hDevice = this->bcsSplit.device.toHandle();
    auto pool = EventPool::create(this->bcsSplit.device.getDriverHandle(), context, 1, &hDevice, &desc, result);
    if (!pool) {
        return false;
    }

    this->pools.push_back(pool);
    this->createdFromLatestPool = 0u;

    return true;
}
|
||||
|
||||
std::optional<size_t> BcsSplit::Events::createFromPool(Context *context, size_t maxEventCountInPool) {
|
||||
/* Internal events needed for split:
|
||||
* - event per subcopy to signal completion of given subcopy (vector of subcopy events),
|
||||
* - 1 event to signal completion of entire split (vector of marker events),
|
||||
* - 1 event to handle barrier (vector of barrier events).
|
||||
*/
|
||||
|
||||
const size_t neededEvents = this->bcsSplit.cmdQs.size() + 2;
|
||||
|
||||
if (!allocatePool(context, maxEventCountInPool, neededEvents)) {
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
auto pool = this->pools[this->pools.size() - 1];
|
||||
ze_event_desc_t desc{};
|
||||
desc.stype = ZE_STRUCTURE_TYPE_EVENT_DESC;
|
||||
desc.signal = ZE_EVENT_SCOPE_FLAG_DEVICE;
|
||||
ze_event_desc_t desc = {ZE_STRUCTURE_TYPE_EVENT_DESC};
|
||||
|
||||
for (size_t i = 0; i < neededEvents; i++) {
|
||||
// Marker event is the only one of internal split events that will be read from host, so create it at the end with appended scope flag.
|
||||
bool markerEvent = (i == neededEvents - 1);
|
||||
bool barrierEvent = (i == neededEvents - 2);
|
||||
|
||||
desc.signal = markerEvent ? ZE_EVENT_SCOPE_FLAG_HOST : ZE_EVENT_SCOPE_FLAG_DEVICE;
|
||||
desc.index = static_cast<uint32_t>(this->createdFromLatestPool++);
|
||||
|
||||
// Marker event is the only one of internal split events that will be read from host, so create it at the end with appended scope flag.
|
||||
if (i == neededEvents - 1) {
|
||||
desc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
|
||||
}
|
||||
|
||||
ze_event_handle_t hEvent{};
|
||||
ze_event_handle_t hEvent = {};
|
||||
pool->createEvent(&desc, &hEvent);
|
||||
Event::fromHandle(hEvent)->disableImplicitCounterBasedMode();
|
||||
|
||||
auto event = Event::fromHandle(hEvent);
|
||||
|
||||
event->disableImplicitCounterBasedMode();
|
||||
|
||||
// Last event, created with host scope flag, is marker event.
|
||||
if (i == neededEvents - 1) {
|
||||
this->marker.push_back(Event::fromHandle(hEvent));
|
||||
if (markerEvent) {
|
||||
this->marker.push_back(event);
|
||||
|
||||
// One event to handle barrier and others to handle subcopy completion.
|
||||
} else if (i == neededEvents - 2) {
|
||||
this->barrier.push_back(Event::fromHandle(hEvent));
|
||||
} else if (barrierEvent) {
|
||||
this->barrier.push_back(event);
|
||||
} else {
|
||||
this->subcopy.push_back(Event::fromHandle(hEvent));
|
||||
this->subcopy.push_back(event);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -196,6 +282,17 @@ void BcsSplit::Events::resetEventPackage(size_t index) {
|
||||
}
|
||||
}
|
||||
|
||||
void BcsSplit::Events::resetAggregatedEventState(size_t index, uint64_t value) {
|
||||
auto event = this->subcopy[index];
|
||||
*event->getInOrderExecInfo()->getBaseHostAddress() = value;
|
||||
|
||||
if (value == 0) {
|
||||
event->resetCompletionStatus();
|
||||
} else {
|
||||
event->setIsCompleted();
|
||||
}
|
||||
}
|
||||
|
||||
void BcsSplit::Events::releaseResources() {
|
||||
for (auto &markerEvent : this->marker) {
|
||||
markerEvent->destroy();
|
||||
@@ -213,5 +310,11 @@ void BcsSplit::Events::releaseResources() {
|
||||
pool->destroy();
|
||||
}
|
||||
pools.clear();
|
||||
|
||||
auto context = Context::fromHandle(bcsSplit.device.getDriverHandle()->getDefaultContext());
|
||||
for (auto &ptr : this->allocsForAggregatedEvents) {
|
||||
context->freeMem(ptr);
|
||||
}
|
||||
allocsForAggregatedEvents.clear();
|
||||
}
|
||||
} // namespace L0
|
||||
|
||||
@@ -42,15 +42,22 @@ struct BcsSplit {
|
||||
std::vector<Event *> barrier;
|
||||
std::vector<Event *> subcopy;
|
||||
std::vector<Event *> marker;
|
||||
std::vector<void *> allocsForAggregatedEvents;
|
||||
size_t currentAggregatedAllocOffset = 0;
|
||||
size_t createdFromLatestPool = 0u;
|
||||
bool aggregatedEventsMode = false;
|
||||
|
||||
std::optional<size_t> obtainForSplit(Context *context, size_t maxEventCountInPool);
|
||||
std::optional<size_t> allocateNew(Context *context, size_t maxEventCountInPool);
|
||||
size_t obtainAggregatedEventsForSplit(Context *context);
|
||||
void resetEventPackage(size_t index);
|
||||
|
||||
void resetAggregatedEventState(size_t index, uint64_t value);
|
||||
void releaseResources();
|
||||
bool allocatePool(Context *context, size_t maxEventCountInPool, size_t neededEvents);
|
||||
std::optional<size_t> createFromPool(Context *context, size_t maxEventCountInPool);
|
||||
size_t createAggregatedEvent(Context *context);
|
||||
uint64_t *getNextAllocationForAggregatedEvent();
|
||||
|
||||
Events(BcsSplit &bcsSplit) : bcsSplit(bcsSplit){};
|
||||
Events(BcsSplit &bcsSplit);
|
||||
} events;
|
||||
|
||||
std::vector<CommandQueue *> cmdQs;
|
||||
|
||||
Reference in New Issue
Block a user