performance: reuse cmd buffer without dc flush

Related-To: NEO-16348

Signed-off-by: Szymon Morek <szymon.morek@intel.com>
This commit is contained in:
Szymon Morek
2025-10-16 14:35:52 +00:00
committed by Compute-Runtime-Automation
parent 6a8b20edb8
commit c78c1515de
10 changed files with 90 additions and 7 deletions

View File

@@ -546,6 +546,7 @@ void CommandStreamReceiver::setTagAllocation(GraphicsAllocation *allocation) {
this->tagAddress = reinterpret_cast<TagAddressType *>(allocation->getUnderlyingBuffer());
this->debugPauseStateAddress = reinterpret_cast<DebugPauseState *>(
reinterpret_cast<uint8_t *>(allocation->getUnderlyingBuffer()) + TagAllocationLayout::debugPauseStateAddressOffset);
this->ucTagAddress = static_cast<TagAddressType *>(ptrOffset(allocation->getUnderlyingBuffer(), TagAllocationLayout::ucTagAddressOffset));
}
MultiGraphicsAllocation &CommandStreamReceiver::createMultiAllocationInSystemMemoryPool(AllocationType allocationType) {
@@ -877,14 +878,18 @@ bool CommandStreamReceiver::initializeTagAllocation() {
}
this->setTagAllocation(tagAllocation);
auto initValue = debugManager.flags.EnableNullHardware.get() ? static_cast<uint32_t>(-1) : initialHardwareTag;
auto tagAddress = this->tagAddress;
auto ucTagAddress = this->ucTagAddress;
auto completionFence = reinterpret_cast<TaskCountType *>(getCompletionAddress());
UNRECOVERABLE_IF(!completionFence);
uint32_t subDevices = static_cast<uint32_t>(this->deviceBitfield.count());
for (uint32_t i = 0; i < subDevices; i++) {
*tagAddress = initValue;
tagAddress = ptrOffset(tagAddress, this->immWritePostSyncWriteOffset);
*ucTagAddress = initValue;
ucTagAddress = ptrOffset(ucTagAddress, this->immWritePostSyncWriteOffset);
*completionFence = 0;
completionFence = ptrOffset(completionFence, this->immWritePostSyncWriteOffset);
}
@@ -1207,6 +1212,7 @@ TaskCountType CompletionStamp::getTaskCountFromSubmissionStatusError(SubmissionS
}
// Returns the GPU address of the barrier-count slot, located
// TagAllocationLayout::barrierCountOffset bytes into the tag allocation.
uint64_t CommandStreamReceiver::getBarrierCountGpuAddress() const {
    const auto tagGpuBase = this->tagAllocation->getGpuAddress();
    return ptrOffset(tagGpuBase, TagAllocationLayout::barrierCountOffset);
}
// Returns the GPU address of the debug pause state slot, located
// TagAllocationLayout::debugPauseStateAddressOffset bytes into the tag allocation.
uint64_t CommandStreamReceiver::getDebugPauseStateGPUAddress() const {
    const auto tagGpuBase = tagAllocation->getGpuAddress();
    return tagGpuBase + TagAllocationLayout::debugPauseStateAddressOffset;
}
// Returns the GPU address of the UC tag slot, located
// TagAllocationLayout::ucTagAddressOffset bytes into the tag allocation.
uint64_t CommandStreamReceiver::getUcTagGPUAddress() const {
    const auto tagGpuBase = tagAllocation->getGpuAddress();
    return tagGpuBase + TagAllocationLayout::ucTagAddressOffset;
}
uint64_t CommandStreamReceiver::getCompletionAddress() const {
uint64_t completionFenceAddress = castToUint64(const_cast<TagAddressType *>(tagAddress));
if (completionFenceAddress == 0) {

View File

@@ -166,9 +166,11 @@ class CommandStreamReceiver : NEO::NonCopyableAndNonMovableClass {
// Increments the barrier counter via fetch_add and returns the value it held before the increment.
TaskCountType getNextBarrierCount() {
    return barrierCount.fetch_add(1u);
}
// Reads the current barrier counter value without modifying it.
TaskCountType peekBarrierCount() const {
    return barrierCount.load();
}
// Returns the stored CPU pointer to the tag slot (may be nullptr until a tag allocation is set).
volatile TagAddressType *getTagAddress() const {
    return this->tagAddress;
}
// Returns the stored CPU pointer to the UC tag slot (may be nullptr until a tag allocation is set).
volatile TagAddressType *getUcTagAddress() const {
    return this->ucTagAddress;
}
// Returns the stored CPU pointer to the barrier-count tag slot.
volatile TagAddressType *getBarrierCountTagAddress() const {
    return barrierCountTagAddress;
}
// GPU address of the barrier-count slot: tag allocation GPU address + TagAllocationLayout::barrierCountOffset.
uint64_t getBarrierCountGpuAddress() const;
// GPU address of the debug pause state slot: tag allocation GPU address + TagAllocationLayout::debugPauseStateAddressOffset.
uint64_t getDebugPauseStateGPUAddress() const;
// GPU address of the UC tag slot: tag allocation GPU address + TagAllocationLayout::ucTagAddressOffset.
uint64_t getUcTagGPUAddress() const;
// Default implementation: ignores the flush stamp and reports success immediately.
// Declared virtual so that derived classes can override it.
virtual bool waitForFlushStamp(FlushStamp &flushStampToWait) {
    return true;
}
@@ -627,6 +629,7 @@ class CommandStreamReceiver : NEO::NonCopyableAndNonMovableClass {
uint64_t totalMemoryUsed = 0u;
// CPU pointer to the start of the tag allocation's underlying buffer; assigned in setTagAllocation().
volatile TagAddressType *tagAddress = nullptr;
// CPU pointer to the UC tag slot: underlying buffer + TagAllocationLayout::ucTagAddressOffset;
// assigned in setTagAllocation().
volatile TagAddressType *ucTagAddress = nullptr;
// NOTE(review): presumably points at TagAllocationLayout::barrierCountOffset inside the tag
// allocation - the assignment is not visible here, confirm.
volatile TagAddressType *barrierCountTagAddress = nullptr;
// CPU pointer to the debug pause state: underlying buffer + TagAllocationLayout::debugPauseStateAddressOffset;
// assigned in setTagAllocation().
volatile DebugPauseState *debugPauseStateAddress = nullptr;
SpinLock debugPauseStateLock;
@@ -694,6 +697,7 @@ class CommandStreamReceiver : NEO::NonCopyableAndNonMovableClass {
bool gsbaStateDirty = true;
bool bindingTableBaseAddressRequired = false;
bool heapStorageRequiresRecyclingTag = false;
// When true, flushTaskHeapful() calls emitTagUpdateWithoutDCFlush() on the task command stream;
// that call resets the flag to false. Where the flag gets raised is not visible here.
bool ucResourceRequiresTagUpdate = false;
bool mediaVfeStateDirty = true;
bool stateComputeModeDirty = true;
bool btdCommandDirty = true;

View File

@@ -276,6 +276,8 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
bool areMultipleSubDevicesInContext,
bool setGeneralStateBaseAddress);
// Adds a barrier with an immediate-data post-sync write of (taskCount + 1) to the UC tag GPU
// address into commandStream. The PipeControlArgs are default-initialized apart from notifyEnable.
// Also makes the tag allocation resident and clears ucResourceRequiresTagUpdate.
inline void emitTagUpdateWithoutDCFlush(LinearStream &commandStream);
inline void processBarrierWithPostSync(LinearStream &commandStreamTask,
DispatchFlags &dispatchFlags,
bool &levelClosed,

View File

@@ -442,7 +442,9 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTaskHeapful(
if (detectInitProgrammingFlagsRequired(dispatchFlags)) {
initProgrammingFlags();
}
if (this->ucResourceRequiresTagUpdate) {
this->emitTagUpdateWithoutDCFlush(commandStreamTask);
}
const auto &hwInfo = peekHwInfo();
bool hasStallingCmdsOnTaskStream = false;
@@ -1842,6 +1844,25 @@ inline void CommandStreamReceiverHw<GfxFamily>::programStateBaseAddressCommon(
}
}
// Programs a barrier with a post-sync immediate-data write into commandStream.
// The write targets the UC tag GPU address and stores taskCount + 1.
// The barrier arguments are value-initialized; only notifyEnable is set explicitly.
// Afterwards the tag allocation is made resident and the pending-update flag is cleared.
template <typename GfxFamily>
inline void CommandStreamReceiverHw<GfxFamily>::emitTagUpdateWithoutDCFlush(LinearStream &commandStream) {
    auto &rootDeviceEnv = this->peekRootDeviceEnvironment();
    auto ucTagGpuVa = this->getUcTagGPUAddress();

    PipeControlArgs barrierArgs{};
    barrierArgs.notifyEnable = isUsedNotifyEnableForPostSync();

    MemorySynchronizationCommands<GfxFamily>::addBarrierWithPostSyncOperation(commandStream,
                                                                              PostSyncMode::immediateData,
                                                                              ucTagGpuVa,
                                                                              taskCount + 1,
                                                                              rootDeviceEnv,
                                                                              barrierArgs);

    // The post-sync write lands in the tag allocation, so it has to be resident for this submission.
    makeResident(*tagAllocation);

    // The request has been serviced; do not emit again until the flag is raised anew.
    this->ucResourceRequiresTagUpdate = false;
}
template <typename GfxFamily>
inline void CommandStreamReceiverHw<GfxFamily>::processBarrierWithPostSync(LinearStream &commandStreamTask, DispatchFlags &dispatchFlags, bool &levelClosed, void *&currentPipeControlForNooping, void *&epiloguePipeControlLocation, bool &hasStallingCmdsOnTaskStream, PipeControlArgs &args) {

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2022-2023 Intel Corporation
* Copyright (C) 2022-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -11,6 +11,7 @@
namespace NEO {
// Byte offsets of the individual slots inside the tag allocation.
namespace TagAllocationLayout {
// Debug pause state sits one MemoryConstants::kiloByte into the allocation.
inline constexpr uint64_t debugPauseStateAddressOffset = MemoryConstants::kiloByte;
// UC tag follows the debug pause state by one cache line.
inline constexpr uint64_t ucTagAddressOffset = debugPauseStateAddressOffset + MemoryConstants::cacheLineSize;
// Completion fence slot.
inline constexpr uint64_t completionFenceOffset = MemoryConstants::kiloByte * 2;
// Barrier count slot.
inline constexpr uint64_t barrierCountOffset = MemoryConstants::kiloByte * 3;
} // namespace TagAllocationLayout