mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-08 14:02:58 +08:00
performance: Optimize ULLS start on submit path
Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
fd68e4f0cf
commit
ea78831e28
@@ -82,8 +82,6 @@ class DirectSubmissionHw {
|
||||
|
||||
MOCKABLE_VIRTUAL bool stopRingBuffer(bool blocking);
|
||||
|
||||
bool startRingBuffer();
|
||||
|
||||
MOCKABLE_VIRTUAL bool dispatchCommandBuffer(BatchBuffer &batchBuffer, FlushStampTracker &flushStamp);
|
||||
uint32_t getDispatchErrorCode();
|
||||
|
||||
@@ -121,6 +119,7 @@ class DirectSubmissionHw {
|
||||
virtual bool dispatchMonitorFenceRequired(bool requireMonitorFence);
|
||||
virtual void getTagAddressValue(TagData &tagData) = 0;
|
||||
void unblockGpu();
|
||||
bool submitCommandBufferToGpu(bool needStart, uint64_t gpuAddress, size_t size);
|
||||
bool copyCommandBufferIntoRing(BatchBuffer &batchBuffer);
|
||||
|
||||
void cpuCachelineFlush(void *ptr, size_t size);
|
||||
@@ -135,6 +134,9 @@ class DirectSubmissionHw {
|
||||
void dispatchStartSection(uint64_t gpuStartAddress);
|
||||
size_t getSizeStartSection();
|
||||
|
||||
size_t getUllsStateSize();
|
||||
void dispatchUllsState();
|
||||
|
||||
void dispatchSwitchRingBufferSection(uint64_t nextBufferGpuAddress);
|
||||
size_t getSizeSwitchRingBufferSection();
|
||||
|
||||
|
||||
@@ -499,55 +499,6 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::initialize(bool submitOnInit, bo
|
||||
return ret;
|
||||
}
|
||||
|
||||
template <typename GfxFamily, typename Dispatcher>
|
||||
bool DirectSubmissionHw<GfxFamily, Dispatcher>::startRingBuffer() {
|
||||
if (ringStart) {
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t startSize = getSizeSemaphoreSection(false);
|
||||
if (!this->partitionConfigSet) {
|
||||
startSize += getSizePartitionRegisterConfigurationSection();
|
||||
}
|
||||
if (this->miMemFenceRequired && !this->systemMemoryFenceAddressSet) {
|
||||
startSize += getSizeSystemMemoryFenceAddress();
|
||||
}
|
||||
if (this->relaxedOrderingEnabled && !this->relaxedOrderingInitialized) {
|
||||
startSize += RelaxedOrderingHelper::getSizeRegistersInit<GfxFamily>();
|
||||
}
|
||||
|
||||
size_t requiredSize = startSize + getSizeDispatch(false, false, dispatchMonitorFenceRequired(true)) + getSizeEnd(false);
|
||||
if (ringCommandStream.getAvailableSpace() < requiredSize) {
|
||||
switchRingBuffers(nullptr);
|
||||
}
|
||||
uint64_t gpuStartVa = ringCommandStream.getCurrentGpuAddressPosition();
|
||||
|
||||
if (!this->partitionConfigSet) {
|
||||
dispatchPartitionRegisterConfiguration();
|
||||
this->partitionConfigSet = true;
|
||||
}
|
||||
|
||||
if (this->miMemFenceRequired && !this->systemMemoryFenceAddressSet) {
|
||||
dispatchSystemMemoryFenceAddress();
|
||||
this->systemMemoryFenceAddressSet = true;
|
||||
}
|
||||
|
||||
if (this->relaxedOrderingEnabled && !this->relaxedOrderingInitialized) {
|
||||
preinitializeRelaxedOrderingSections();
|
||||
dispatchStaticRelaxedOrderingScheduler();
|
||||
initRelaxedOrderingRegisters();
|
||||
|
||||
this->relaxedOrderingInitialized = true;
|
||||
}
|
||||
|
||||
currentQueueWorkCount++;
|
||||
dispatchSemaphoreSection(currentQueueWorkCount);
|
||||
|
||||
ringStart = submit(gpuStartVa, startSize);
|
||||
|
||||
return ringStart;
|
||||
}
|
||||
|
||||
template <typename GfxFamily, typename Dispatcher>
|
||||
bool DirectSubmissionHw<GfxFamily, Dispatcher>::stopRingBuffer(bool blocking) {
|
||||
if (!ringStart) {
|
||||
@@ -940,15 +891,46 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::copyCommandBufferIntoRing(BatchB
|
||||
return ret;
|
||||
}
|
||||
|
||||
template <typename GfxFamily, typename Dispatcher>
|
||||
size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getUllsStateSize() {
|
||||
size_t startSize = 0u;
|
||||
if (!this->partitionConfigSet) {
|
||||
startSize += getSizePartitionRegisterConfigurationSection();
|
||||
}
|
||||
if (this->miMemFenceRequired && !this->systemMemoryFenceAddressSet) {
|
||||
startSize += getSizeSystemMemoryFenceAddress();
|
||||
}
|
||||
if (this->relaxedOrderingEnabled && !this->relaxedOrderingInitialized) {
|
||||
startSize += RelaxedOrderingHelper::getSizeRegistersInit<GfxFamily>();
|
||||
}
|
||||
return startSize;
|
||||
}
|
||||
|
||||
template <typename GfxFamily, typename Dispatcher>
|
||||
void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchUllsState() {
|
||||
if (!this->partitionConfigSet) {
|
||||
dispatchPartitionRegisterConfiguration();
|
||||
this->partitionConfigSet = true;
|
||||
}
|
||||
if (this->miMemFenceRequired && !this->systemMemoryFenceAddressSet) {
|
||||
dispatchSystemMemoryFenceAddress();
|
||||
this->systemMemoryFenceAddressSet = true;
|
||||
}
|
||||
if (this->relaxedOrderingEnabled && !this->relaxedOrderingInitialized) {
|
||||
preinitializeRelaxedOrderingSections();
|
||||
dispatchStaticRelaxedOrderingScheduler();
|
||||
initRelaxedOrderingRegisters();
|
||||
|
||||
this->relaxedOrderingInitialized = true;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename GfxFamily, typename Dispatcher>
|
||||
bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffer &batchBuffer, FlushStampTracker &flushStamp) {
|
||||
if (batchBuffer.ringBufferRestartRequest) {
|
||||
this->stopRingBuffer(false);
|
||||
}
|
||||
|
||||
if (!this->startRingBuffer()) {
|
||||
return false;
|
||||
}
|
||||
lastSubmittedThrottle = batchBuffer.throttle;
|
||||
bool relaxedOrderingSchedulerWillBeNeeded = (this->relaxedOrderingSchedulerRequired || batchBuffer.hasRelaxedOrderingDependencies);
|
||||
bool inputRequiredMonitorFence = false;
|
||||
@@ -959,7 +941,7 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffe
|
||||
}
|
||||
bool dispatchMonitorFence = this->dispatchMonitorFenceRequired(inputRequiredMonitorFence);
|
||||
|
||||
size_t dispatchSize = getSizeDispatch(relaxedOrderingSchedulerWillBeNeeded, batchBuffer.hasRelaxedOrderingDependencies, dispatchMonitorFence);
|
||||
size_t dispatchSize = this->getUllsStateSize() + getSizeDispatch(relaxedOrderingSchedulerWillBeNeeded, batchBuffer.hasRelaxedOrderingDependencies, dispatchMonitorFence);
|
||||
|
||||
if (this->copyCommandBufferIntoRing(batchBuffer)) {
|
||||
dispatchSize += (batchBuffer.stream->getUsed() - batchBuffer.startOffset) - 2 * getSizeStartSection();
|
||||
@@ -978,8 +960,14 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffe
|
||||
}
|
||||
}
|
||||
|
||||
auto needStart = !this->ringStart;
|
||||
this->ringStart = true;
|
||||
auto startVA = ringCommandStream.getCurrentGpuAddressPosition();
|
||||
|
||||
this->switchRingBuffersNeeded(requiredMinimalSize, batchBuffer.allocationsForResidency);
|
||||
|
||||
this->dispatchUllsState();
|
||||
|
||||
if (this->relaxedOrderingEnabled && batchBuffer.hasStallingCmds && this->relaxedOrderingSchedulerRequired) {
|
||||
dispatchRelaxedOrderingQueueStall();
|
||||
}
|
||||
@@ -991,9 +979,10 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffe
|
||||
void *currentPosition = dispatchWorkloadSection(batchBuffer, dispatchMonitorFence);
|
||||
|
||||
cpuCachelineFlush(currentPosition, dispatchSize);
|
||||
handleResidency();
|
||||
|
||||
this->unblockGpu();
|
||||
if (!this->submitCommandBufferToGpu(needStart, startVA, requiredMinimalSize)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
cpuCachelineFlush(semaphorePtr, MemoryConstants::cacheLineSize);
|
||||
currentQueueWorkCount++;
|
||||
@@ -1008,6 +997,17 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffe
|
||||
return ringStart;
|
||||
}
|
||||
|
||||
template <typename GfxFamily, typename Dispatcher>
|
||||
bool DirectSubmissionHw<GfxFamily, Dispatcher>::submitCommandBufferToGpu(bool needStart, uint64_t gpuAddress, size_t size) {
|
||||
if (needStart) {
|
||||
return this->submit(gpuAddress, size);
|
||||
} else {
|
||||
handleResidency();
|
||||
this->unblockGpu();
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename GfxFamily, typename Dispatcher>
|
||||
inline void DirectSubmissionHw<GfxFamily, Dispatcher>::setReturnAddress(void *returnCmd, uint64_t returnAddress) {
|
||||
using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;
|
||||
|
||||
@@ -55,7 +55,9 @@ WddmDirectSubmission<GfxFamily, Dispatcher>::~WddmDirectSubmission() {
|
||||
|
||||
template <typename GfxFamily, typename Dispatcher>
|
||||
inline void WddmDirectSubmission<GfxFamily, Dispatcher>::flushMonitorFence() {
|
||||
this->startRingBuffer();
|
||||
auto needStart = !this->ringStart;
|
||||
this->ringStart = true;
|
||||
auto startVA = this->ringCommandStream.getCurrentGpuAddressPosition();
|
||||
|
||||
size_t requiredMinimalSize = this->getSizeSemaphoreSection(false) +
|
||||
Dispatcher::getSizeMonitorFence(this->rootDeviceEnvironment) +
|
||||
@@ -71,8 +73,7 @@ inline void WddmDirectSubmission<GfxFamily, Dispatcher>::flushMonitorFence() {
|
||||
Dispatcher::dispatchMonitorFence(this->ringCommandStream, currentTagData.tagAddress, currentTagData.tagValue, this->rootDeviceEnvironment, this->useNotifyForPostSync, this->partitionedMode, this->dcFlushRequired);
|
||||
|
||||
this->dispatchSemaphoreSection(this->currentQueueWorkCount + 1);
|
||||
this->handleResidency();
|
||||
this->unblockGpu();
|
||||
this->submitCommandBufferToGpu(needStart, startVA, requiredMinimalSize);
|
||||
this->currentQueueWorkCount++;
|
||||
|
||||
this->updateTagValueImpl(this->currentRingBuffer);
|
||||
|
||||
Reference in New Issue
Block a user