performance: Optimize ULLS start on submit path

Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
This commit is contained in:
Lukasz Jobczyk
2023-12-27 06:50:44 +00:00
committed by Compute-Runtime-Automation
parent fd68e4f0cf
commit ea78831e28
6 changed files with 84 additions and 160 deletions

View File

@@ -82,8 +82,6 @@ class DirectSubmissionHw {
MOCKABLE_VIRTUAL bool stopRingBuffer(bool blocking);
bool startRingBuffer();
MOCKABLE_VIRTUAL bool dispatchCommandBuffer(BatchBuffer &batchBuffer, FlushStampTracker &flushStamp);
uint32_t getDispatchErrorCode();
@@ -121,6 +119,7 @@ class DirectSubmissionHw {
virtual bool dispatchMonitorFenceRequired(bool requireMonitorFence);
virtual void getTagAddressValue(TagData &tagData) = 0;
void unblockGpu();
bool submitCommandBufferToGpu(bool needStart, uint64_t gpuAddress, size_t size);
bool copyCommandBufferIntoRing(BatchBuffer &batchBuffer);
void cpuCachelineFlush(void *ptr, size_t size);
@@ -135,6 +134,9 @@ class DirectSubmissionHw {
void dispatchStartSection(uint64_t gpuStartAddress);
size_t getSizeStartSection();
size_t getUllsStateSize();
void dispatchUllsState();
void dispatchSwitchRingBufferSection(uint64_t nextBufferGpuAddress);
size_t getSizeSwitchRingBufferSection();

View File

@@ -499,55 +499,6 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::initialize(bool submitOnInit, bo
return ret;
}
// Starts the ring buffer when it is not running yet.
//
// Emits every one-time state section that is still pending (partition register
// configuration, system memory fence address, relaxed ordering initialization),
// then a semaphore section, and passes the emitted range to submit().
//
// Returns true immediately when ringStart is already set; otherwise returns the
// value reported by submit(), which is also stored in ringStart.
template <typename GfxFamily, typename Dispatcher>
bool DirectSubmissionHw<GfxFamily, Dispatcher>::startRingBuffer() {
    if (ringStart) {
        return true;
    }

    // Size handed to submit(): the semaphore section plus whatever one-time state is still pending.
    size_t initialSectionSize = getSizeSemaphoreSection(false);
    initialSectionSize += this->partitionConfigSet ? size_t{0} : getSizePartitionRegisterConfigurationSection();
    if (this->miMemFenceRequired && !this->systemMemoryFenceAddressSet) {
        initialSectionSize += getSizeSystemMemoryFenceAddress();
    }
    if (this->relaxedOrderingEnabled && !this->relaxedOrderingInitialized) {
        initialSectionSize += RelaxedOrderingHelper::getSizeRegistersInit<GfxFamily>();
    }

    // The space check additionally accounts for a dispatch section and an end section.
    const size_t spaceNeeded = initialSectionSize + getSizeDispatch(false, false, dispatchMonitorFenceRequired(true)) + getSizeEnd(false);
    if (spaceNeeded > ringCommandStream.getAvailableSpace()) {
        switchRingBuffers(nullptr);
    }

    // Read the start address only after a possible ring switch, so it refers to the active ring.
    const uint64_t ringStartGpuVa = ringCommandStream.getCurrentGpuAddressPosition();

    if (!this->partitionConfigSet) {
        dispatchPartitionRegisterConfiguration();
        this->partitionConfigSet = true;
    }

    if (this->miMemFenceRequired && !this->systemMemoryFenceAddressSet) {
        dispatchSystemMemoryFenceAddress();
        this->systemMemoryFenceAddressSet = true;
    }

    if (this->relaxedOrderingEnabled && !this->relaxedOrderingInitialized) {
        preinitializeRelaxedOrderingSections();
        dispatchStaticRelaxedOrderingScheduler();
        initRelaxedOrderingRegisters();
        this->relaxedOrderingInitialized = true;
    }

    // The semaphore section is emitted with the already incremented work count.
    dispatchSemaphoreSection(++currentQueueWorkCount);

    ringStart = submit(ringStartGpuVa, initialSectionSize);
    return ringStart;
}
template <typename GfxFamily, typename Dispatcher>
bool DirectSubmissionHw<GfxFamily, Dispatcher>::stopRingBuffer(bool blocking) {
if (!ringStart) {
@@ -940,15 +891,46 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::copyCommandBufferIntoRing(BatchB
return ret;
}
// Returns the combined size of the one-time state sections that have not been
// emitted yet. A section contributes only while its "already done" flag is unset:
//   - partition register configuration (partitionConfigSet),
//   - system memory fence address, when miMemFenceRequired (systemMemoryFenceAddressSet),
//   - relaxed ordering register init, when relaxedOrderingEnabled (relaxedOrderingInitialized).
// Yields 0 when nothing is pending.
template <typename GfxFamily, typename Dispatcher>
size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getUllsStateSize() {
    size_t pendingStateSize = 0u;

    pendingStateSize += this->partitionConfigSet ? size_t{0} : getSizePartitionRegisterConfigurationSection();

    const bool fenceAddressPending = this->miMemFenceRequired && !this->systemMemoryFenceAddressSet;
    if (fenceAddressPending) {
        pendingStateSize += getSizeSystemMemoryFenceAddress();
    }

    const bool relaxedOrderingInitPending = this->relaxedOrderingEnabled && !this->relaxedOrderingInitialized;
    if (relaxedOrderingInitPending) {
        pendingStateSize += RelaxedOrderingHelper::getSizeRegistersInit<GfxFamily>();
    }

    return pendingStateSize;
}
// Emits the one-time state sections that are still pending and sets the
// matching flag after each one, so a later call skips it. Does nothing when
// every flag is already set.
template <typename GfxFamily, typename Dispatcher>
void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchUllsState() {
    // Each condition is evaluated right before its own dispatch, in this fixed order.
    const bool partitionConfigPending = !this->partitionConfigSet;
    if (partitionConfigPending) {
        dispatchPartitionRegisterConfiguration();
        this->partitionConfigSet = true;
    }

    const bool fenceAddressPending = this->miMemFenceRequired && !this->systemMemoryFenceAddressSet;
    if (fenceAddressPending) {
        dispatchSystemMemoryFenceAddress();
        this->systemMemoryFenceAddressSet = true;
    }

    const bool relaxedOrderingInitPending = this->relaxedOrderingEnabled && !this->relaxedOrderingInitialized;
    if (relaxedOrderingInitPending) {
        // Three-step relaxed ordering setup, performed once.
        preinitializeRelaxedOrderingSections();
        dispatchStaticRelaxedOrderingScheduler();
        initRelaxedOrderingRegisters();
        this->relaxedOrderingInitialized = true;
    }
}
template <typename GfxFamily, typename Dispatcher>
bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffer &batchBuffer, FlushStampTracker &flushStamp) {
if (batchBuffer.ringBufferRestartRequest) {
this->stopRingBuffer(false);
}
if (!this->startRingBuffer()) {
return false;
}
lastSubmittedThrottle = batchBuffer.throttle;
bool relaxedOrderingSchedulerWillBeNeeded = (this->relaxedOrderingSchedulerRequired || batchBuffer.hasRelaxedOrderingDependencies);
bool inputRequiredMonitorFence = false;
@@ -959,7 +941,7 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffe
}
bool dispatchMonitorFence = this->dispatchMonitorFenceRequired(inputRequiredMonitorFence);
size_t dispatchSize = getSizeDispatch(relaxedOrderingSchedulerWillBeNeeded, batchBuffer.hasRelaxedOrderingDependencies, dispatchMonitorFence);
size_t dispatchSize = this->getUllsStateSize() + getSizeDispatch(relaxedOrderingSchedulerWillBeNeeded, batchBuffer.hasRelaxedOrderingDependencies, dispatchMonitorFence);
if (this->copyCommandBufferIntoRing(batchBuffer)) {
dispatchSize += (batchBuffer.stream->getUsed() - batchBuffer.startOffset) - 2 * getSizeStartSection();
@@ -978,8 +960,14 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffe
}
}
auto needStart = !this->ringStart;
this->ringStart = true;
auto startVA = ringCommandStream.getCurrentGpuAddressPosition();
this->switchRingBuffersNeeded(requiredMinimalSize, batchBuffer.allocationsForResidency);
this->dispatchUllsState();
if (this->relaxedOrderingEnabled && batchBuffer.hasStallingCmds && this->relaxedOrderingSchedulerRequired) {
dispatchRelaxedOrderingQueueStall();
}
@@ -991,9 +979,10 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffe
void *currentPosition = dispatchWorkloadSection(batchBuffer, dispatchMonitorFence);
cpuCachelineFlush(currentPosition, dispatchSize);
handleResidency();
this->unblockGpu();
if (!this->submitCommandBufferToGpu(needStart, startVA, requiredMinimalSize)) {
return false;
}
cpuCachelineFlush(semaphorePtr, MemoryConstants::cacheLineSize);
currentQueueWorkCount++;
@@ -1008,6 +997,17 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffe
return ringStart;
}
// Hands prepared ring content to the GPU.
//
// needStart  - when true, the range is passed to submit() and its result is returned.
// gpuAddress - start address forwarded to submit(); unused when needStart is false.
// size       - size forwarded to submit(); unused when needStart is false.
//
// When needStart is false, handleResidency() and unblockGpu() are called instead
// and the function returns true.
template <typename GfxFamily, typename Dispatcher>
bool DirectSubmissionHw<GfxFamily, Dispatcher>::submitCommandBufferToGpu(bool needStart, uint64_t gpuAddress, size_t size) {
    if (!needStart) {
        handleResidency();
        this->unblockGpu();
        return true;
    }

    return this->submit(gpuAddress, size);
}
template <typename GfxFamily, typename Dispatcher>
inline void DirectSubmissionHw<GfxFamily, Dispatcher>::setReturnAddress(void *returnCmd, uint64_t returnAddress) {
using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;

View File

@@ -55,7 +55,9 @@ WddmDirectSubmission<GfxFamily, Dispatcher>::~WddmDirectSubmission() {
template <typename GfxFamily, typename Dispatcher>
inline void WddmDirectSubmission<GfxFamily, Dispatcher>::flushMonitorFence() {
this->startRingBuffer();
auto needStart = !this->ringStart;
this->ringStart = true;
auto startVA = this->ringCommandStream.getCurrentGpuAddressPosition();
size_t requiredMinimalSize = this->getSizeSemaphoreSection(false) +
Dispatcher::getSizeMonitorFence(this->rootDeviceEnvironment) +
@@ -71,8 +73,7 @@ inline void WddmDirectSubmission<GfxFamily, Dispatcher>::flushMonitorFence() {
Dispatcher::dispatchMonitorFence(this->ringCommandStream, currentTagData.tagAddress, currentTagData.tagValue, this->rootDeviceEnvironment, this->useNotifyForPostSync, this->partitionedMode, this->dcFlushRequired);
this->dispatchSemaphoreSection(this->currentQueueWorkCount + 1);
this->handleResidency();
this->unblockGpu();
this->submitCommandBufferToGpu(needStart, startVA, requiredMinimalSize);
this->currentQueueWorkCount++;
this->updateTagValueImpl(this->currentRingBuffer);