mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-31 20:13:04 +08:00
performance: reuse cmd buffer without dc flush
Related-To: NEO-16348
Signed-off-by: Szymon Morek <szymon.morek@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
6a8b20edb8
commit
c78c1515de
@@ -546,6 +546,7 @@ void CommandStreamReceiver::setTagAllocation(GraphicsAllocation *allocation) {
|
||||
this->tagAddress = reinterpret_cast<TagAddressType *>(allocation->getUnderlyingBuffer());
|
||||
this->debugPauseStateAddress = reinterpret_cast<DebugPauseState *>(
|
||||
reinterpret_cast<uint8_t *>(allocation->getUnderlyingBuffer()) + TagAllocationLayout::debugPauseStateAddressOffset);
|
||||
this->ucTagAddress = static_cast<TagAddressType *>(ptrOffset(allocation->getUnderlyingBuffer(), TagAllocationLayout::ucTagAddressOffset));
|
||||
}
|
||||
|
||||
MultiGraphicsAllocation &CommandStreamReceiver::createMultiAllocationInSystemMemoryPool(AllocationType allocationType) {
|
||||
@@ -877,14 +878,18 @@ bool CommandStreamReceiver::initializeTagAllocation() {
|
||||
}
|
||||
|
||||
this->setTagAllocation(tagAllocation);
|
||||
|
||||
auto initValue = debugManager.flags.EnableNullHardware.get() ? static_cast<uint32_t>(-1) : initialHardwareTag;
|
||||
auto tagAddress = this->tagAddress;
|
||||
auto ucTagAddress = this->ucTagAddress;
|
||||
auto completionFence = reinterpret_cast<TaskCountType *>(getCompletionAddress());
|
||||
UNRECOVERABLE_IF(!completionFence);
|
||||
uint32_t subDevices = static_cast<uint32_t>(this->deviceBitfield.count());
|
||||
for (uint32_t i = 0; i < subDevices; i++) {
|
||||
*tagAddress = initValue;
|
||||
tagAddress = ptrOffset(tagAddress, this->immWritePostSyncWriteOffset);
|
||||
*ucTagAddress = initValue;
|
||||
ucTagAddress = ptrOffset(ucTagAddress, this->immWritePostSyncWriteOffset);
|
||||
*completionFence = 0;
|
||||
completionFence = ptrOffset(completionFence, this->immWritePostSyncWriteOffset);
|
||||
}
|
||||
@@ -1207,6 +1212,7 @@ TaskCountType CompletionStamp::getTaskCountFromSubmissionStatusError(SubmissionS
|
||||
}
|
||||
// Returns the GPU address obtained by advancing the tag allocation's GPU
// address by TagAllocationLayout::barrierCountOffset.
uint64_t CommandStreamReceiver::getBarrierCountGpuAddress() const {
    const auto tagGpuBase = this->tagAllocation->getGpuAddress();
    return ptrOffset(tagGpuBase, TagAllocationLayout::barrierCountOffset);
}
|
||||
// Returns the tag allocation's GPU address plus
// TagAllocationLayout::debugPauseStateAddressOffset.
uint64_t CommandStreamReceiver::getDebugPauseStateGPUAddress() const {
    const auto tagGpuBase = tagAllocation->getGpuAddress();
    return tagGpuBase + TagAllocationLayout::debugPauseStateAddressOffset;
}
|
||||
// Returns the tag allocation's GPU address plus
// TagAllocationLayout::ucTagAddressOffset.
uint64_t CommandStreamReceiver::getUcTagGPUAddress() const {
    const auto tagGpuBase = tagAllocation->getGpuAddress();
    return tagGpuBase + TagAllocationLayout::ucTagAddressOffset;
}
|
||||
uint64_t CommandStreamReceiver::getCompletionAddress() const {
|
||||
uint64_t completionFenceAddress = castToUint64(const_cast<TagAddressType *>(tagAddress));
|
||||
if (completionFenceAddress == 0) {
|
||||
|
||||
@@ -166,9 +166,11 @@ class CommandStreamReceiver : NEO::NonCopyableAndNonMovableClass {
|
||||
// Adds one to barrierCount via fetch_add and returns whatever fetch_add yields
// (for a std::atomic that is the value held before the increment -
// NOTE(review): confirm barrierCount's type at its declaration).
TaskCountType getNextBarrierCount() {
    return barrierCount.fetch_add(1u);
}
|
||||
// Reads the current barrierCount through load() without modifying it.
TaskCountType peekBarrierCount() const {
    return barrierCount.load();
}
|
||||
// Returns the pointer stored in tagAddress (nullptr until it has been assigned).
volatile TagAddressType *getTagAddress() const {
    return this->tagAddress;
}
|
||||
// Returns the pointer stored in ucTagAddress (nullptr until it has been assigned).
volatile TagAddressType *getUcTagAddress() const {
    return this->ucTagAddress;
}
|
||||
// Returns the pointer stored in barrierCountTagAddress (nullptr until it has been assigned).
volatile TagAddressType *getBarrierCountTagAddress() const {
    return barrierCountTagAddress;
}
|
||||
uint64_t getBarrierCountGpuAddress() const;
|
||||
uint64_t getDebugPauseStateGPUAddress() const;
|
||||
uint64_t getUcTagGPUAddress() const;
|
||||
|
||||
// Base implementation: ignores flushStampToWait and always reports true.
// Declared virtual, so derived receivers may replace it.
virtual bool waitForFlushStamp(FlushStamp &flushStampToWait) {
    return true;
}
|
||||
|
||||
@@ -627,6 +629,7 @@ class CommandStreamReceiver : NEO::NonCopyableAndNonMovableClass {
|
||||
uint64_t totalMemoryUsed = 0u;
|
||||
|
||||
volatile TagAddressType *tagAddress = nullptr;
|
||||
volatile TagAddressType *ucTagAddress = nullptr;
|
||||
volatile TagAddressType *barrierCountTagAddress = nullptr;
|
||||
volatile DebugPauseState *debugPauseStateAddress = nullptr;
|
||||
SpinLock debugPauseStateLock;
|
||||
@@ -694,6 +697,7 @@ class CommandStreamReceiver : NEO::NonCopyableAndNonMovableClass {
|
||||
bool gsbaStateDirty = true;
|
||||
bool bindingTableBaseAddressRequired = false;
|
||||
bool heapStorageRequiresRecyclingTag = false;
|
||||
bool ucResourceRequiresTagUpdate = false;
|
||||
bool mediaVfeStateDirty = true;
|
||||
bool stateComputeModeDirty = true;
|
||||
bool btdCommandDirty = true;
|
||||
|
||||
@@ -276,6 +276,8 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
|
||||
bool areMultipleSubDevicesInContext,
|
||||
bool setGeneralStateBaseAddress);
|
||||
|
||||
inline void emitTagUpdateWithoutDCFlush(LinearStream &commandStream);
|
||||
|
||||
inline void processBarrierWithPostSync(LinearStream &commandStreamTask,
|
||||
DispatchFlags &dispatchFlags,
|
||||
bool &levelClosed,
|
||||
|
||||
@@ -442,7 +442,9 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTaskHeapful(
|
||||
if (detectInitProgrammingFlagsRequired(dispatchFlags)) {
|
||||
initProgrammingFlags();
|
||||
}
|
||||
|
||||
if (this->ucResourceRequiresTagUpdate) {
|
||||
this->emitTagUpdateWithoutDCFlush(commandStreamTask);
|
||||
}
|
||||
const auto &hwInfo = peekHwInfo();
|
||||
|
||||
bool hasStallingCmdsOnTaskStream = false;
|
||||
@@ -1842,6 +1844,25 @@ inline void CommandStreamReceiverHw<GfxFamily>::programStateBaseAddressCommon(
|
||||
}
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
inline void CommandStreamReceiverHw<GfxFamily>::emitTagUpdateWithoutDCFlush(LinearStream &commandStream) {
|
||||
auto &rootDeviceEnvironment = this->peekRootDeviceEnvironment();
|
||||
auto address = this->getUcTagGPUAddress();
|
||||
|
||||
PipeControlArgs args = {};
|
||||
args.notifyEnable = isUsedNotifyEnableForPostSync();
|
||||
MemorySynchronizationCommands<GfxFamily>::addBarrierWithPostSyncOperation(
|
||||
commandStream,
|
||||
PostSyncMode::immediateData,
|
||||
address,
|
||||
taskCount + 1,
|
||||
rootDeviceEnvironment,
|
||||
args);
|
||||
|
||||
makeResident(*tagAllocation);
|
||||
this->ucResourceRequiresTagUpdate = false;
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
inline void CommandStreamReceiverHw<GfxFamily>::processBarrierWithPostSync(LinearStream &commandStreamTask, DispatchFlags &dispatchFlags, bool &levelClosed, void *&currentPipeControlForNooping, void *&epiloguePipeControlLocation, bool &hasStallingCmdsOnTaskStream, PipeControlArgs &args) {
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2022-2023 Intel Corporation
|
||||
* Copyright (C) 2022-2025 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -11,6 +11,7 @@
|
||||
namespace NEO {
|
||||
namespace TagAllocationLayout {
|
||||
inline constexpr uint64_t debugPauseStateAddressOffset = MemoryConstants::kiloByte;
|
||||
inline constexpr uint64_t ucTagAddressOffset = MemoryConstants::kiloByte + MemoryConstants::cacheLineSize;
|
||||
inline constexpr uint64_t completionFenceOffset = 2 * MemoryConstants::kiloByte;
|
||||
inline constexpr uint64_t barrierCountOffset = 3 * MemoryConstants::kiloByte;
|
||||
} // namespace TagAllocationLayout
|
||||
|
||||
Reference in New Issue
Block a user