Revert "performance: Optimize ULLS start on submit path"

This reverts commit ea78831e28.

Signed-off-by: Compute-Runtime-Validation <compute-runtime-validation@intel.com>
This commit is contained in:
Compute-Runtime-Validation 2023-12-28 01:25:36 +01:00 committed by Compute-Runtime-Automation
parent d238a68bae
commit 1d02f7fdd9
6 changed files with 160 additions and 84 deletions

View File

@ -82,6 +82,8 @@ class DirectSubmissionHw {
MOCKABLE_VIRTUAL bool stopRingBuffer(bool blocking);
bool startRingBuffer();
MOCKABLE_VIRTUAL bool dispatchCommandBuffer(BatchBuffer &batchBuffer, FlushStampTracker &flushStamp);
uint32_t getDispatchErrorCode();
@ -119,7 +121,6 @@ class DirectSubmissionHw {
virtual bool dispatchMonitorFenceRequired(bool requireMonitorFence);
virtual void getTagAddressValue(TagData &tagData) = 0;
void unblockGpu();
bool submitCommandBufferToGpu(bool needStart, uint64_t gpuAddress, size_t size);
bool copyCommandBufferIntoRing(BatchBuffer &batchBuffer);
void cpuCachelineFlush(void *ptr, size_t size);
@ -134,9 +135,6 @@ class DirectSubmissionHw {
void dispatchStartSection(uint64_t gpuStartAddress);
size_t getSizeStartSection();
size_t getUllsStateSize();
void dispatchUllsState();
void dispatchSwitchRingBufferSection(uint64_t nextBufferGpuAddress);
size_t getSizeSwitchRingBufferSection();

View File

@ -499,6 +499,55 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::initialize(bool submitOnInit, bo
return ret;
}
// Starts the ring buffer unless it is already running.
//
// Emits every piece of one-time state that has not been programmed yet
// (partition register configuration, system memory fence address, relaxed
// ordering initialization), appends a semaphore section and submits exactly
// that range.
//
// Returns true when the ring was already started or submit() succeeded,
// false when submit() failed. The result is also stored in ringStart.
template <typename GfxFamily, typename Dispatcher>
bool DirectSubmissionHw<GfxFamily, Dispatcher>::startRingBuffer() {
    if (this->ringStart) {
        return true;
    }

    // Each predicate is evaluated at the point of use, so the size accounting
    // and the dispatch below always look at the current flag values.
    const auto partitionConfigPending = [this]() { return !this->partitionConfigSet; };
    const auto memFencePending = [this]() { return this->miMemFenceRequired && !this->systemMemoryFenceAddressSet; };
    const auto relaxedOrderingPending = [this]() { return this->relaxedOrderingEnabled && !this->relaxedOrderingInitialized; };

    // Size of what this function emits; this is also the size handed to submit().
    size_t submitSize = getSizeSemaphoreSection(false);
    if (partitionConfigPending()) {
        submitSize += getSizePartitionRegisterConfigurationSection();
    }
    if (memFencePending()) {
        submitSize += getSizeSystemMemoryFenceAddress();
    }
    if (relaxedOrderingPending()) {
        submitSize += RelaxedOrderingHelper::getSizeRegistersInit<GfxFamily>();
    }

    // The space check additionally reserves room for one dispatch and an end
    // section; if the current ring cannot hold that, move to another ring first.
    const size_t spaceNeeded = submitSize + getSizeDispatch(false, false, dispatchMonitorFenceRequired(true)) + getSizeEnd(false);
    if (ringCommandStream.getAvailableSpace() < spaceNeeded) {
        switchRingBuffers(nullptr);
    }

    // Captured after the potential switch so it points at the first command emitted below.
    const uint64_t submitGpuVa = ringCommandStream.getCurrentGpuAddressPosition();

    if (partitionConfigPending()) {
        dispatchPartitionRegisterConfiguration();
        this->partitionConfigSet = true;
    }
    if (memFencePending()) {
        dispatchSystemMemoryFenceAddress();
        this->systemMemoryFenceAddressSet = true;
    }
    if (relaxedOrderingPending()) {
        preinitializeRelaxedOrderingSections();
        dispatchStaticRelaxedOrderingScheduler();
        initRelaxedOrderingRegisters();
        this->relaxedOrderingInitialized = true;
    }

    // The semaphore section is programmed with the already incremented work count.
    ++currentQueueWorkCount;
    dispatchSemaphoreSection(currentQueueWorkCount);

    ringStart = submit(submitGpuVa, submitSize);
    return ringStart;
}
template <typename GfxFamily, typename Dispatcher>
bool DirectSubmissionHw<GfxFamily, Dispatcher>::stopRingBuffer(bool blocking) {
if (!ringStart) {
@ -891,46 +940,15 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::copyCommandBufferIntoRing(BatchB
return ret;
}
// Returns the number of bytes needed for the one-time state that has not been
// programmed yet: partition register configuration, system memory fence
// address and relaxed ordering register initialization. Sections whose flag
// says they are already programmed contribute nothing.
template <typename GfxFamily, typename Dispatcher>
size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getUllsStateSize() {
    size_t pendingStateSize = 0u;

    const bool partitionConfigPending = !this->partitionConfigSet;
    if (partitionConfigPending) {
        pendingStateSize += getSizePartitionRegisterConfigurationSection();
    }

    const bool memFencePending = this->miMemFenceRequired && !this->systemMemoryFenceAddressSet;
    if (memFencePending) {
        pendingStateSize += getSizeSystemMemoryFenceAddress();
    }

    const bool relaxedOrderingPending = this->relaxedOrderingEnabled && !this->relaxedOrderingInitialized;
    if (relaxedOrderingPending) {
        pendingStateSize += RelaxedOrderingHelper::getSizeRegistersInit<GfxFamily>();
    }

    return pendingStateSize;
}
// Emits the one-time state that has not been programmed yet and records each
// piece as done, so a later call does not emit it again:
//  - partition register configuration,
//  - system memory fence address (only when miMemFenceRequired),
//  - relaxed ordering sections, static scheduler and registers
//    (only when relaxedOrderingEnabled).
template <typename GfxFamily, typename Dispatcher>
void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchUllsState() {
    const bool partitionConfigPending = !this->partitionConfigSet;
    if (partitionConfigPending) {
        dispatchPartitionRegisterConfiguration();
        this->partitionConfigSet = true;
    }

    const bool memFencePending = this->miMemFenceRequired && !this->systemMemoryFenceAddressSet;
    if (memFencePending) {
        dispatchSystemMemoryFenceAddress();
        this->systemMemoryFenceAddressSet = true;
    }

    const bool relaxedOrderingPending = this->relaxedOrderingEnabled && !this->relaxedOrderingInitialized;
    if (relaxedOrderingPending) {
        preinitializeRelaxedOrderingSections();
        dispatchStaticRelaxedOrderingScheduler();
        initRelaxedOrderingRegisters();
        this->relaxedOrderingInitialized = true;
    }
}
template <typename GfxFamily, typename Dispatcher>
bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffer &batchBuffer, FlushStampTracker &flushStamp) {
if (batchBuffer.ringBufferRestartRequest) {
this->stopRingBuffer(false);
}
if (!this->startRingBuffer()) {
return false;
}
lastSubmittedThrottle = batchBuffer.throttle;
bool relaxedOrderingSchedulerWillBeNeeded = (this->relaxedOrderingSchedulerRequired || batchBuffer.hasRelaxedOrderingDependencies);
bool inputRequiredMonitorFence = false;
@ -941,7 +959,7 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffe
}
bool dispatchMonitorFence = this->dispatchMonitorFenceRequired(inputRequiredMonitorFence);
size_t dispatchSize = this->getUllsStateSize() + getSizeDispatch(relaxedOrderingSchedulerWillBeNeeded, batchBuffer.hasRelaxedOrderingDependencies, dispatchMonitorFence);
size_t dispatchSize = getSizeDispatch(relaxedOrderingSchedulerWillBeNeeded, batchBuffer.hasRelaxedOrderingDependencies, dispatchMonitorFence);
if (this->copyCommandBufferIntoRing(batchBuffer)) {
dispatchSize += (batchBuffer.stream->getUsed() - batchBuffer.startOffset) - 2 * getSizeStartSection();
@ -960,14 +978,8 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffe
}
}
auto needStart = !this->ringStart;
this->ringStart = true;
auto startVA = ringCommandStream.getCurrentGpuAddressPosition();
this->switchRingBuffersNeeded(requiredMinimalSize, batchBuffer.allocationsForResidency);
this->dispatchUllsState();
if (this->relaxedOrderingEnabled && batchBuffer.hasStallingCmds && this->relaxedOrderingSchedulerRequired) {
dispatchRelaxedOrderingQueueStall();
}
@ -979,10 +991,9 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffe
void *currentPosition = dispatchWorkloadSection(batchBuffer, dispatchMonitorFence);
cpuCachelineFlush(currentPosition, dispatchSize);
handleResidency();
if (!this->submitCommandBufferToGpu(needStart, startVA, requiredMinimalSize)) {
return false;
}
this->unblockGpu();
cpuCachelineFlush(semaphorePtr, MemoryConstants::cacheLineSize);
currentQueueWorkCount++;
@ -997,17 +1008,6 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffe
return ringStart;
}
// Hands freshly dispatched commands over to the GPU.
//
// needStart  - when true, the range [gpuAddress, gpuAddress + size) is passed
//              to submit() and its result is returned.
// gpuAddress - GPU address forwarded to submit(); unused when needStart is false.
// size       - size forwarded to submit(); unused when needStart is false.
//
// When needStart is false, residency is handled, the GPU is unblocked and the
// function returns true.
template <typename GfxFamily, typename Dispatcher>
bool DirectSubmissionHw<GfxFamily, Dispatcher>::submitCommandBufferToGpu(bool needStart, uint64_t gpuAddress, size_t size) {
    if (!needStart) {
        handleResidency();
        this->unblockGpu();
        return true;
    }
    return this->submit(gpuAddress, size);
}
template <typename GfxFamily, typename Dispatcher>
inline void DirectSubmissionHw<GfxFamily, Dispatcher>::setReturnAddress(void *returnCmd, uint64_t returnAddress) {
using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;

View File

@ -55,9 +55,7 @@ WddmDirectSubmission<GfxFamily, Dispatcher>::~WddmDirectSubmission() {
template <typename GfxFamily, typename Dispatcher>
inline void WddmDirectSubmission<GfxFamily, Dispatcher>::flushMonitorFence() {
auto needStart = !this->ringStart;
this->ringStart = true;
auto startVA = this->ringCommandStream.getCurrentGpuAddressPosition();
this->startRingBuffer();
size_t requiredMinimalSize = this->getSizeSemaphoreSection(false) +
Dispatcher::getSizeMonitorFence(this->rootDeviceEnvironment) +
@ -73,7 +71,8 @@ inline void WddmDirectSubmission<GfxFamily, Dispatcher>::flushMonitorFence() {
Dispatcher::dispatchMonitorFence(this->ringCommandStream, currentTagData.tagAddress, currentTagData.tagValue, this->rootDeviceEnvironment, this->useNotifyForPostSync, this->partitionedMode, this->dcFlushRequired);
this->dispatchSemaphoreSection(this->currentQueueWorkCount + 1);
this->submitCommandBufferToGpu(needStart, startVA, requiredMinimalSize);
this->handleResidency();
this->unblockGpu();
this->currentQueueWorkCount++;
this->updateTagValueImpl(this->currentRingBuffer);

View File

@ -40,7 +40,6 @@ struct MockDirectSubmissionHw : public DirectSubmissionHw<GfxFamily, Dispatcher>
using BaseClass::dispatchSemaphoreSection;
using BaseClass::dispatchStartSection;
using BaseClass::dispatchSwitchRingBufferSection;
using BaseClass::dispatchUllsState;
using BaseClass::dispatchWorkloadSection;
using BaseClass::getDiagnosticModeSection;
using BaseClass::getSizeDisablePrefetcher;
@ -80,6 +79,7 @@ struct MockDirectSubmissionHw : public DirectSubmissionHw<GfxFamily, Dispatcher>
using BaseClass::semaphorePtr;
using BaseClass::semaphores;
using BaseClass::setReturnAddress;
using BaseClass::startRingBuffer;
using BaseClass::stopRingBuffer;
using BaseClass::switchRingBuffersAllocations;
using BaseClass::switchRingBuffersNeeded;

View File

@ -375,6 +375,74 @@ HWTEST_F(DirectSubmissionTest, givenDirectSubmissionSubmitFailWhenRingIsStartedT
EXPECT_NE(0u, directSubmission.ringCommandStream.getUsed());
}
// Calling startRingBuffer() after initialize(submitOnInit = true) must succeed
// without adding anything to the ring command stream.
HWTEST_F(DirectSubmissionTest, givenDirectSubmissionStartWhenRingIsStartedThenExpectNoStartCommandsDispatched) {
    MockDirectSubmissionHw<FamilyType, RenderDispatcher<FamilyType>> directSubmission(*pDevice->getDefaultEngine().commandStreamReceiver);

    EXPECT_TRUE(directSubmission.initialize(true, false));

    const size_t usedBeforeStart = directSubmission.ringCommandStream.getUsed();

    EXPECT_TRUE(directSubmission.startRingBuffer());
    // No new commands: the used size is unchanged.
    EXPECT_EQ(usedBeforeStart, directSubmission.ringCommandStream.getUsed());
}
// Calling startRingBuffer() after initialize(submitOnInit = false) must
// succeed, mark the ring as started and add commands to the ring command stream.
HWTEST_F(DirectSubmissionTest, givenDirectSubmissionStartWhenRingIsNotStartedThenExpectStartCommandsDispatched) {
    MockDirectSubmissionHw<FamilyType, RenderDispatcher<FamilyType>> directSubmission(*pDevice->getDefaultEngine().commandStreamReceiver);

    EXPECT_TRUE(directSubmission.initialize(false, false));

    const size_t usedBeforeStart = directSubmission.ringCommandStream.getUsed();

    EXPECT_TRUE(directSubmission.startRingBuffer());
    EXPECT_TRUE(directSubmission.ringStart);
    // Start commands were emitted: the used size changed.
    EXPECT_NE(usedBeforeStart, directSubmission.ringCommandStream.getUsed());
}
// When the submission fails, startRingBuffer() must report failure and leave
// ringStart false, even though the start commands were already written to the
// ring command stream.
HWTEST_F(DirectSubmissionTest, givenDirectSubmissionStartWhenRingIsNotStartedSubmitFailThenExpectStartCommandsDispatchedRingNotStarted) {
    MockDirectSubmissionHw<FamilyType, RenderDispatcher<FamilyType>> directSubmission(*pDevice->getDefaultEngine().commandStreamReceiver);

    EXPECT_TRUE(directSubmission.initialize(false, false));

    const size_t usedBeforeStart = directSubmission.ringCommandStream.getUsed();

    // NOTE(review): presumably makes the mock's submit() return false - confirm against the mock.
    directSubmission.submitReturn = false;

    EXPECT_FALSE(directSubmission.startRingBuffer());
    EXPECT_FALSE(directSubmission.ringStart);
    EXPECT_NE(usedBeforeStart, directSubmission.ringCommandStream.getUsed());
}
// When the current ring has too little free space, startRingBuffer() must
// switch to a different ring allocation and emit the start commands there.
HWTEST_F(DirectSubmissionTest, givenDirectSubmissionStartWhenRingIsNotStartedAndSwitchBufferIsNeededThenExpectRingAllocationChangedStartCommandsDispatched) {
    MockDirectSubmissionHw<FamilyType, RenderDispatcher<FamilyType>> directSubmission(*pDevice->getDefaultEngine().commandStreamReceiver);

    EXPECT_TRUE(directSubmission.initialize(false, false));

    // Snapshot of the ring selection before the start.
    auto ringIndexBeforeStart = directSubmission.currentRingBuffer;
    GraphicsAllocation *allocationBeforeStart = directSubmission.ringCommandStream.getGraphicsAllocation();

    // Size of the commands the test expects startRingBuffer() to emit.
    auto requiredSize = directSubmission.getSizeSemaphoreSection(false);
    if (directSubmission.miMemFenceRequired) {
        requiredSize += directSubmission.getSizeSystemMemoryFenceAddress();
    }
    if (directSubmission.isRelaxedOrderingEnabled()) {
        requiredSize += RelaxedOrderingHelper::getSizeRegistersInit<FamilyType>();
    }

    // Consume the ring until exactly requiredSize bytes remain available.
    const auto bytesToConsume = directSubmission.ringCommandStream.getAvailableSpace() - requiredSize;
    directSubmission.ringCommandStream.getSpace(bytesToConsume);

    const bool started = directSubmission.startRingBuffer();
    auto ringIndexAfterStart = directSubmission.currentRingBuffer;

    EXPECT_TRUE(started);
    EXPECT_TRUE(directSubmission.ringStart);
    EXPECT_NE(allocationBeforeStart, directSubmission.ringCommandStream.getGraphicsAllocation());
    // The command stream now in use contains only the start commands.
    EXPECT_EQ(requiredSize, directSubmission.ringCommandStream.getUsed());
    EXPECT_NE(ringIndexBeforeStart, ringIndexAfterStart);
}
HWTEST_F(DirectSubmissionTest, givenDirectSubmissionStopWhenStopRingIsCalledThenExpectStopCommandDispatched) {
MockDirectSubmissionHw<FamilyType, RenderDispatcher<FamilyType>> directSubmission(*pDevice->getDefaultEngine().commandStreamReceiver);

View File

@ -139,7 +139,7 @@ HWTEST_F(DirectSubmissionDispatchMiMemFenceTest, givenMiMemFenceSupportedWhenDis
EXPECT_TRUE(directSubmission.dispatchCommandBuffer(batchBuffer, flushStamp));
validateFenceProgramming<FamilyType>(directSubmission, 1, 1);
validateFenceProgramming<FamilyType>(directSubmission, 2, 1);
EXPECT_EQ(miMemFenceSupported, directSubmission.systemMemoryFenceAddressSet);
}
@ -155,7 +155,7 @@ HWTEST_F(DirectSubmissionDispatchMiMemFenceTest, givenMiMemFenceSupportedWhenSys
EXPECT_TRUE(directSubmission.dispatchCommandBuffer(batchBuffer, flushStamp));
validateFenceProgramming<FamilyType>(directSubmission, 1, 0);
validateFenceProgramming<FamilyType>(directSubmission, 2, 0);
EXPECT_TRUE(directSubmission.systemMemoryFenceAddressSet);
}
@ -651,20 +651,23 @@ HWTEST_F(DirectSubmissionDispatchBufferTest,
ret = directSubmission.dispatchCommandBuffer(batchBuffer, flushStamp);
EXPECT_TRUE(ret);
EXPECT_EQ(oldRingAllocation, directSubmission.ringCommandStream.getGraphicsAllocation());
EXPECT_EQ(0u, directSubmission.semaphoreData->queueWorkCount);
EXPECT_EQ(2u, directSubmission.currentQueueWorkCount);
EXPECT_EQ(2u, directSubmission.semaphoreData->queueWorkCount);
EXPECT_EQ(3u, directSubmission.currentQueueWorkCount);
EXPECT_EQ(1u, directSubmission.submitCount);
EXPECT_EQ(oldRingAllocation->getGpuAddress(), directSubmission.submitGpuAddress);
EXPECT_EQ(1u, directSubmission.handleResidencyCount);
size_t submitSize = directSubmission.getSizeDispatch(false, false, directSubmission.dispatchMonitorFenceRequired(false)) - directSubmission.getSizeNewResourceHandler();
size_t submitSize = directSubmission.getSizeSemaphoreSection(false);
if (directSubmission.miMemFenceRequired) {
submitSize += directSubmission.getSizeSystemMemoryFenceAddress();
}
if (directSubmission.isRelaxedOrderingEnabled()) {
submitSize += RelaxedOrderingHelper::getSizeRegistersInit<FamilyType>();
}
EXPECT_EQ(submitSize, directSubmission.ringCommandStream.getUsed());
EXPECT_EQ(submitSize, directSubmission.submitSize);
EXPECT_EQ(oldRingAllocation->getGpuAddress(), directSubmission.submitGpuAddress);
EXPECT_EQ(2u, directSubmission.handleResidencyCount);
size_t dispatchSize = submitSize + directSubmission.getSizeDispatch(false, false, directSubmission.dispatchMonitorFenceRequired(false)) - directSubmission.getSizeNewResourceHandler();
EXPECT_EQ(dispatchSize, directSubmission.ringCommandStream.getUsed());
EXPECT_TRUE(directSubmission.ringStart);
}
@ -729,19 +732,22 @@ HWTEST_F(DirectSubmissionDispatchBufferTest,
ret = directSubmission.dispatchCommandBuffer(batchBuffer, flushStamp);
EXPECT_TRUE(ret);
EXPECT_NE(oldRingAllocation, directSubmission.ringCommandStream.getGraphicsAllocation());
EXPECT_EQ(0u, directSubmission.semaphoreData->queueWorkCount);
EXPECT_EQ(2u, directSubmission.currentQueueWorkCount);
EXPECT_EQ(2u, directSubmission.semaphoreData->queueWorkCount);
EXPECT_EQ(3u, directSubmission.currentQueueWorkCount);
EXPECT_EQ(1u, directSubmission.submitCount);
EXPECT_EQ(1u, directSubmission.handleResidencyCount);
size_t submitSize = directSubmission.getSizeDispatch(false, false, directSubmission.dispatchMonitorFenceRequired(false)) - directSubmission.getSizeNewResourceHandler();
size_t submitSize = directSubmission.getSizeSemaphoreSection(false);
if (directSubmission.miMemFenceRequired) {
submitSize += directSubmission.getSizeSystemMemoryFenceAddress();
}
if (directSubmission.isRelaxedOrderingEnabled()) {
submitSize += RelaxedOrderingHelper::getSizeRegistersInit<FamilyType>();
}
EXPECT_EQ(submitSize, directSubmission.ringCommandStream.getUsed());
EXPECT_EQ(submitSize, directSubmission.submitSize);
EXPECT_EQ(2u, directSubmission.handleResidencyCount);
size_t dispatchSize = submitSize + directSubmission.getSizeDispatch(false, false, directSubmission.dispatchMonitorFenceRequired(false)) - directSubmission.getSizeNewResourceHandler();
EXPECT_EQ(dispatchSize, directSubmission.ringCommandStream.getUsed());
EXPECT_TRUE(directSubmission.ringStart);
}
@ -867,8 +873,10 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, DirectSubmissionDispatchBufferTest,
EXPECT_FALSE(directSubmission.ringStart);
EXPECT_EQ(0x0u, directSubmission.ringCommandStream.getUsed());
directSubmission.dispatchUllsState();
ret = directSubmission.startRingBuffer();
EXPECT_TRUE(ret);
EXPECT_TRUE(directSubmission.partitionConfigSet);
EXPECT_TRUE(directSubmission.ringStart);
HardwareParse hwParse;
hwParse.parseCommands<FamilyType>(directSubmission.ringCommandStream, 0);
@ -1636,10 +1644,13 @@ HWTEST2_F(DirectSubmissionRelaxedOrderingTests, whenInitializingThenDispatchStat
directSubmission.initialize(false, false);
EXPECT_EQ(0u, directSubmission.dispatchStaticRelaxedOrderingSchedulerCalled);
directSubmission.dispatchUllsState();
directSubmission.startRingBuffer();
EXPECT_EQ(1u, directSubmission.dispatchStaticRelaxedOrderingSchedulerCalled);
directSubmission.startRingBuffer();
EXPECT_EQ(1u, directSubmission.dispatchStaticRelaxedOrderingSchedulerCalled);
directSubmission.dispatchCommandBuffer(batchBuffer, flushStamp);
EXPECT_EQ(1u, directSubmission.dispatchStaticRelaxedOrderingSchedulerCalled);
}
@ -1708,7 +1719,7 @@ HWTEST_F(DirectSubmissionRelaxedOrderingTests, whenInitializingThenPreinitialize
size_t offset = directSubmission.ringCommandStream.getUsed();
directSubmission.dispatchUllsState();
directSubmission.startRingBuffer();
EXPECT_FALSE(verifyInitRegisters(directSubmission.ringCommandStream, offset));
EXPECT_EQ(1u, directSubmission.preinitializeRelaxedOrderingSectionsCalled);
@ -1719,7 +1730,7 @@ HWTEST_F(DirectSubmissionRelaxedOrderingTests, whenInitializingThenPreinitialize
directSubmission.initialize(false, false);
EXPECT_EQ(0u, directSubmission.preinitializeRelaxedOrderingSectionsCalled);
directSubmission.dispatchUllsState();
directSubmission.startRingBuffer();
EXPECT_EQ(1u, directSubmission.preinitializeRelaxedOrderingSectionsCalled);
EXPECT_TRUE(directSubmission.relaxedOrderingInitialized);
@ -1727,7 +1738,7 @@ HWTEST_F(DirectSubmissionRelaxedOrderingTests, whenInitializingThenPreinitialize
EXPECT_NE(nullptr, directSubmission.preinitializedRelaxedOrderingScheduler.get());
size_t offset = directSubmission.ringCommandStream.getUsed();
directSubmission.dispatchUllsState();
directSubmission.startRingBuffer();
EXPECT_FALSE(verifyInitRegisters(directSubmission.ringCommandStream, offset));
EXPECT_EQ(1u, directSubmission.preinitializeRelaxedOrderingSectionsCalled);
}