mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-21 09:14:47 +08:00
RelaxedOrdering: Improve dependencies tracking
Avoid unnecessary scheduler programming.
Related-To: NEO-7458
Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
ad6237478f
commit
3f962bf3e8
@@ -615,7 +615,8 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
|
||||
auto &streamToSubmit = submitCommandStreamFromCsr ? commandStreamCSR : commandStreamTask;
|
||||
BatchBuffer batchBuffer{streamToSubmit.getGraphicsAllocation(), startOffset, chainedBatchBufferStartOffset, taskStartAddress, chainedBatchBuffer,
|
||||
dispatchFlags.requiresCoherency, dispatchFlags.lowPriority, dispatchFlags.throttle, dispatchFlags.sliceCount,
|
||||
streamToSubmit.getUsed(), &streamToSubmit, bbEndLocation, dispatchFlags.useSingleSubdevice, (submitCSR || dispatchFlags.hasStallingCmds)};
|
||||
streamToSubmit.getUsed(), &streamToSubmit, bbEndLocation, dispatchFlags.useSingleSubdevice, (submitCSR || dispatchFlags.hasStallingCmds),
|
||||
dispatchFlags.hasRelaxedOrderingDependencies};
|
||||
streamToSubmit.getGraphicsAllocation()->updateTaskCount(this->taskCount + 1, this->osContext->getContextId());
|
||||
streamToSubmit.getGraphicsAllocation()->updateResidencyTaskCount(this->taskCount + 1, this->osContext->getContextId());
|
||||
|
||||
@@ -1178,7 +1179,7 @@ uint32_t CommandStreamReceiverHw<GfxFamily>::flushBcsTask(const BlitPropertiesCo
|
||||
uint64_t taskStartAddress = commandStream.getGpuBase() + commandStreamStart;
|
||||
|
||||
BatchBuffer batchBuffer{commandStream.getGraphicsAllocation(), commandStreamStart, 0, taskStartAddress, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount,
|
||||
commandStream.getUsed(), &commandStream, endingCmdPtr, false, false};
|
||||
commandStream.getUsed(), &commandStream, endingCmdPtr, false, false, false};
|
||||
|
||||
commandStream.getGraphicsAllocation()->updateTaskCount(newTaskCount, this->osContext->getContextId());
|
||||
commandStream.getGraphicsAllocation()->updateResidencyTaskCount(newTaskCount, this->osContext->getContextId());
|
||||
@@ -1290,7 +1291,7 @@ SubmissionStatus CommandStreamReceiverHw<GfxFamily>::flushSmallTask(LinearStream
|
||||
|
||||
BatchBuffer batchBuffer{commandStreamTask.getGraphicsAllocation(), commandStreamStartTask, 0, taskStartAddress,
|
||||
nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount,
|
||||
commandStreamTask.getUsed(), &commandStreamTask, endingCmdPtr, false, true};
|
||||
commandStreamTask.getUsed(), &commandStreamTask, endingCmdPtr, false, true, false};
|
||||
|
||||
this->latestSentTaskCount = taskCount + 1;
|
||||
auto submissionStatus = flushHandler(batchBuffer, getResidencyAllocations());
|
||||
|
||||
@@ -57,36 +57,37 @@ struct DispatchFlags {
|
||||
uint64_t sliceCountP, bool blockingP, bool dcFlushP, bool useSLMP, bool guardCommandBufferWithPipeControlP, bool gsba32BitRequiredP,
|
||||
bool requiresCoherencyP, bool lowPriorityP, bool implicitFlushP, bool outOfOrderExecutionAllowedP, bool epilogueRequiredP,
|
||||
bool usePerDSSbackedBufferP, bool useSingleSubdeviceP, bool useGlobalAtomicsP, bool areMultipleSubDevicesInContextP, bool memoryMigrationRequiredP, bool textureCacheFlush,
|
||||
bool hasStallingCmds) : csrDependencies(csrDependenciesP),
|
||||
barrierTimestampPacketNodes(barrierTimestampPacketNodesP),
|
||||
pipelineSelectArgs(pipelineSelectArgsP),
|
||||
flushStampReference(flushStampReferenceP),
|
||||
throttle(throttleP),
|
||||
preemptionMode(preemptionModeP),
|
||||
numGrfRequired(numGrfRequiredP),
|
||||
l3CacheSettings(l3CacheSettingsP),
|
||||
threadArbitrationPolicy(threadArbitrationPolicyP),
|
||||
additionalKernelExecInfo(additionalKernelExecInfoP),
|
||||
kernelExecutionType(kernelExecutionTypeP),
|
||||
memoryCompressionState(memoryCompressionStateP),
|
||||
sliceCount(sliceCountP),
|
||||
blocking(blockingP),
|
||||
dcFlush(dcFlushP),
|
||||
useSLM(useSLMP),
|
||||
guardCommandBufferWithPipeControl(guardCommandBufferWithPipeControlP),
|
||||
gsba32BitRequired(gsba32BitRequiredP),
|
||||
requiresCoherency(requiresCoherencyP),
|
||||
lowPriority(lowPriorityP),
|
||||
implicitFlush(implicitFlushP),
|
||||
outOfOrderExecutionAllowed(outOfOrderExecutionAllowedP),
|
||||
epilogueRequired(epilogueRequiredP),
|
||||
usePerDssBackedBuffer(usePerDSSbackedBufferP),
|
||||
useSingleSubdevice(useSingleSubdeviceP),
|
||||
useGlobalAtomics(useGlobalAtomicsP),
|
||||
areMultipleSubDevicesInContext(areMultipleSubDevicesInContextP),
|
||||
memoryMigrationRequired(memoryMigrationRequiredP),
|
||||
textureCacheFlush(textureCacheFlush),
|
||||
hasStallingCmds(hasStallingCmds){};
|
||||
bool hasStallingCmds, bool hasRelaxedOrderingDependencies) : csrDependencies(csrDependenciesP),
|
||||
barrierTimestampPacketNodes(barrierTimestampPacketNodesP),
|
||||
pipelineSelectArgs(pipelineSelectArgsP),
|
||||
flushStampReference(flushStampReferenceP),
|
||||
throttle(throttleP),
|
||||
preemptionMode(preemptionModeP),
|
||||
numGrfRequired(numGrfRequiredP),
|
||||
l3CacheSettings(l3CacheSettingsP),
|
||||
threadArbitrationPolicy(threadArbitrationPolicyP),
|
||||
additionalKernelExecInfo(additionalKernelExecInfoP),
|
||||
kernelExecutionType(kernelExecutionTypeP),
|
||||
memoryCompressionState(memoryCompressionStateP),
|
||||
sliceCount(sliceCountP),
|
||||
blocking(blockingP),
|
||||
dcFlush(dcFlushP),
|
||||
useSLM(useSLMP),
|
||||
guardCommandBufferWithPipeControl(guardCommandBufferWithPipeControlP),
|
||||
gsba32BitRequired(gsba32BitRequiredP),
|
||||
requiresCoherency(requiresCoherencyP),
|
||||
lowPriority(lowPriorityP),
|
||||
implicitFlush(implicitFlushP),
|
||||
outOfOrderExecutionAllowed(outOfOrderExecutionAllowedP),
|
||||
epilogueRequired(epilogueRequiredP),
|
||||
usePerDssBackedBuffer(usePerDSSbackedBufferP),
|
||||
useSingleSubdevice(useSingleSubdeviceP),
|
||||
useGlobalAtomics(useGlobalAtomicsP),
|
||||
areMultipleSubDevicesInContext(areMultipleSubDevicesInContextP),
|
||||
memoryMigrationRequired(memoryMigrationRequiredP),
|
||||
textureCacheFlush(textureCacheFlush),
|
||||
hasStallingCmds(hasStallingCmds),
|
||||
hasRelaxedOrderingDependencies(hasRelaxedOrderingDependencies){};
|
||||
|
||||
CsrDependencies csrDependencies;
|
||||
TimestampPacketContainer *barrierTimestampPacketNodes = nullptr;
|
||||
@@ -119,6 +120,7 @@ struct DispatchFlags {
|
||||
bool memoryMigrationRequired = false;
|
||||
bool textureCacheFlush = false;
|
||||
bool hasStallingCmds = false;
|
||||
bool hasRelaxedOrderingDependencies = false;
|
||||
bool disableEUFusion = false;
|
||||
};
|
||||
|
||||
|
||||
@@ -100,14 +100,15 @@ void NEO::SubmissionAggregator::aggregateCommandBuffers(ResourcePackage &resourc
|
||||
|
||||
NEO::BatchBuffer::BatchBuffer(GraphicsAllocation *commandBufferAllocation, size_t startOffset,
|
||||
size_t chainedBatchBufferStartOffset, uint64_t taskStartAddress, GraphicsAllocation *chainedBatchBuffer,
|
||||
bool requiresCoherency, bool lowPriority,
|
||||
QueueThrottle throttle, uint64_t sliceCount,
|
||||
size_t usedSize, LinearStream *stream, void *endCmdPtr, bool useSingleSubdevice, bool hasStallingCmds)
|
||||
bool requiresCoherency, bool lowPriority, QueueThrottle throttle, uint64_t sliceCount,
|
||||
size_t usedSize, LinearStream *stream, void *endCmdPtr, bool useSingleSubdevice, bool hasStallingCmds,
|
||||
bool hasRelaxedOrderingDependencies)
|
||||
: commandBufferAllocation(commandBufferAllocation), startOffset(startOffset),
|
||||
chainedBatchBufferStartOffset(chainedBatchBufferStartOffset), taskStartAddress(taskStartAddress), chainedBatchBuffer(chainedBatchBuffer),
|
||||
requiresCoherency(requiresCoherency), low_priority(lowPriority),
|
||||
throttle(throttle), sliceCount(sliceCount),
|
||||
usedSize(usedSize), stream(stream), endCmdPtr(endCmdPtr), useSingleSubdevice(useSingleSubdevice), hasStallingCmds(hasStallingCmds) {}
|
||||
usedSize(usedSize), stream(stream), endCmdPtr(endCmdPtr), useSingleSubdevice(useSingleSubdevice), hasStallingCmds(hasStallingCmds),
|
||||
hasRelaxedOrderingDependencies(hasRelaxedOrderingDependencies) {}
|
||||
|
||||
NEO::CommandBuffer::CommandBuffer(Device &device) : device(device) {
|
||||
flushStamp.reset(new FlushStampTracker(false));
|
||||
|
||||
@@ -32,7 +32,8 @@ struct BatchBuffer {
|
||||
LinearStream *stream,
|
||||
void *endCmdPtr,
|
||||
bool useSingleSubdevice,
|
||||
bool hasStallingCmds);
|
||||
bool hasStallingCmds,
|
||||
bool hasRelaxedOrderingDependencies);
|
||||
BatchBuffer() {}
|
||||
GraphicsAllocation *commandBufferAllocation = nullptr;
|
||||
size_t startOffset = 0u;
|
||||
@@ -52,6 +53,7 @@ struct BatchBuffer {
|
||||
|
||||
bool useSingleSubdevice = false;
|
||||
bool hasStallingCmds = false;
|
||||
bool hasRelaxedOrderingDependencies = false;
|
||||
bool ringBufferRestartRequest = false;
|
||||
};
|
||||
|
||||
|
||||
@@ -114,12 +114,12 @@ class DirectSubmissionHw {
|
||||
|
||||
void cpuCachelineFlush(void *ptr, size_t size);
|
||||
|
||||
void dispatchSemaphoreSection(uint32_t value, bool firstSubmission);
|
||||
size_t getSizeSemaphoreSection(bool firstSubmission);
|
||||
void dispatchSemaphoreSection(uint32_t value);
|
||||
size_t getSizeSemaphoreSection(bool relaxedOrderingSchedulerRequired);
|
||||
|
||||
void dispatchRelaxedOrderingSchedulerSection(uint32_t value);
|
||||
MOCKABLE_VIRTUAL void dispatchRelaxedOrderingSchedulerSection(uint32_t value);
|
||||
|
||||
void dispatchRelaxedOrderingReturnPtrRegs(LinearStream &cmdStream, uint64_t returnPtr);
|
||||
void dispatchRelaxedOrderingReturnPtrRegs(LinearStream &cmdStream, uint64_t returnPtr, bool hasRelaxedOrderingDependencies);
|
||||
|
||||
void dispatchStartSection(uint64_t gpuStartAddress);
|
||||
size_t getSizeStartSection();
|
||||
@@ -127,10 +127,10 @@ class DirectSubmissionHw {
|
||||
void dispatchSwitchRingBufferSection(uint64_t nextBufferGpuAddress);
|
||||
size_t getSizeSwitchRingBufferSection();
|
||||
|
||||
void dispatchRelaxedOrderingQueueStall();
|
||||
MOCKABLE_VIRTUAL void dispatchRelaxedOrderingQueueStall();
|
||||
size_t getSizeDispatchRelaxedOrderingQueueStall();
|
||||
|
||||
void dispatchTaskStoreSection(uint64_t taskStartSectionVa);
|
||||
MOCKABLE_VIRTUAL void dispatchTaskStoreSection(uint64_t taskStartSectionVa);
|
||||
MOCKABLE_VIRTUAL void preinitializeRelaxedOrderingSections();
|
||||
|
||||
void initRelaxedOrderingRegisters();
|
||||
@@ -138,7 +138,7 @@ class DirectSubmissionHw {
|
||||
void setReturnAddress(void *returnCmd, uint64_t returnAddress);
|
||||
|
||||
void *dispatchWorkloadSection(BatchBuffer &batchBuffer);
|
||||
size_t getSizeDispatch();
|
||||
size_t getSizeDispatch(bool relaxedOrderingSchedulerRequired);
|
||||
|
||||
void dispatchPrefetchMitigation();
|
||||
size_t getSizePrefetchMitigation();
|
||||
@@ -148,7 +148,7 @@ class DirectSubmissionHw {
|
||||
|
||||
MOCKABLE_VIRTUAL void dispatchStaticRelaxedOrderingScheduler();
|
||||
|
||||
size_t getSizeEnd();
|
||||
size_t getSizeEnd(bool relaxedOrderingSchedulerRequired);
|
||||
|
||||
void dispatchPartitionRegisterConfiguration();
|
||||
size_t getSizePartitionRegisterConfigurationSection();
|
||||
@@ -226,6 +226,6 @@ class DirectSubmissionHw {
|
||||
bool dcFlushRequired = false;
|
||||
bool relaxedOrderingEnabled = false;
|
||||
bool relaxedOrderingInitialized = false;
|
||||
bool firstSubmissionAfterRingStart = true;
|
||||
bool relaxedOrderingSchedulerRequired = false;
|
||||
};
|
||||
} // namespace NEO
|
||||
|
||||
@@ -403,7 +403,7 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::initialize(bool submitOnInit, bo
|
||||
initDiagnostic(submitOnInit);
|
||||
if (ret && submitOnInit) {
|
||||
size_t startBufferSize = Dispatcher::getSizePreemption() +
|
||||
getSizeSemaphoreSection(true);
|
||||
getSizeSemaphoreSection(false);
|
||||
|
||||
Dispatcher::dispatchPreemption(ringCommandStream);
|
||||
if (this->partitionedMode) {
|
||||
@@ -431,7 +431,7 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::initialize(bool submitOnInit, bo
|
||||
dispatchDiagnosticModeSection();
|
||||
startBufferSize += getDiagnosticModeSection();
|
||||
}
|
||||
dispatchSemaphoreSection(currentQueueWorkCount, true);
|
||||
dispatchSemaphoreSection(currentQueueWorkCount);
|
||||
|
||||
ringStart = submit(ringCommandStream.getGraphicsAllocation()->getGpuAddress(), startBufferSize);
|
||||
performDiagnosticMode();
|
||||
@@ -446,7 +446,7 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::startRingBuffer() {
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t startSize = getSizeSemaphoreSection(true);
|
||||
size_t startSize = getSizeSemaphoreSection(false);
|
||||
if (!this->partitionConfigSet) {
|
||||
startSize += getSizePartitionRegisterConfigurationSection();
|
||||
}
|
||||
@@ -457,7 +457,7 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::startRingBuffer() {
|
||||
startSize += RelaxedOrderingHelper::getSizeRegistersInit<GfxFamily>();
|
||||
}
|
||||
|
||||
size_t requiredSize = startSize + getSizeDispatch() + getSizeEnd();
|
||||
size_t requiredSize = startSize + getSizeDispatch(false) + getSizeEnd(false);
|
||||
if (ringCommandStream.getAvailableSpace() < requiredSize) {
|
||||
switchRingBuffers();
|
||||
}
|
||||
@@ -482,12 +482,10 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::startRingBuffer() {
|
||||
}
|
||||
|
||||
currentQueueWorkCount++;
|
||||
dispatchSemaphoreSection(currentQueueWorkCount, true);
|
||||
dispatchSemaphoreSection(currentQueueWorkCount);
|
||||
|
||||
ringStart = submit(gpuStartVa, startSize);
|
||||
|
||||
firstSubmissionAfterRingStart = true;
|
||||
|
||||
return ringStart;
|
||||
}
|
||||
|
||||
@@ -497,7 +495,8 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::stopRingBuffer() {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (this->relaxedOrderingEnabled && !firstSubmissionAfterRingStart) {
|
||||
bool relaxedOrderingSchedulerWasRequired = this->relaxedOrderingSchedulerRequired;
|
||||
if (this->relaxedOrderingEnabled && this->relaxedOrderingSchedulerRequired) {
|
||||
dispatchRelaxedOrderingQueueStall();
|
||||
}
|
||||
|
||||
@@ -515,7 +514,7 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::stopRingBuffer() {
|
||||
EncodeNoop<GfxFamily>::emitNoop(ringCommandStream, bytesToPad);
|
||||
EncodeNoop<GfxFamily>::alignToCacheLine(ringCommandStream);
|
||||
|
||||
cpuCachelineFlush(flushPtr, getSizeEnd());
|
||||
cpuCachelineFlush(flushPtr, getSizeEnd(relaxedOrderingSchedulerWasRequired));
|
||||
this->unblockGpu();
|
||||
cpuCachelineFlush(semaphorePtr, MemoryConstants::cacheLineSize);
|
||||
|
||||
@@ -526,13 +525,13 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::stopRingBuffer() {
|
||||
}
|
||||
|
||||
template <typename GfxFamily, typename Dispatcher>
|
||||
inline void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchSemaphoreSection(uint32_t value, bool firstSubmission) {
|
||||
inline void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchSemaphoreSection(uint32_t value) {
|
||||
using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;
|
||||
using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;
|
||||
|
||||
dispatchDisablePrefetcher(true);
|
||||
|
||||
if (this->relaxedOrderingEnabled && !firstSubmission) {
|
||||
if (this->relaxedOrderingEnabled && this->relaxedOrderingSchedulerRequired) {
|
||||
dispatchRelaxedOrderingSchedulerSection(value);
|
||||
} else {
|
||||
EncodeSempahore<GfxFamily>::addMiSemaphoreWaitCommand(ringCommandStream,
|
||||
@@ -550,9 +549,9 @@ inline void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchSemaphoreSection(
|
||||
}
|
||||
|
||||
template <typename GfxFamily, typename Dispatcher>
|
||||
inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeSemaphoreSection(bool firstSubmission) {
|
||||
size_t semaphoreSize = (this->relaxedOrderingEnabled && !firstSubmission) ? RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection<GfxFamily>::totalSize
|
||||
: EncodeSempahore<GfxFamily>::getSizeMiSemaphoreWait();
|
||||
inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeSemaphoreSection(bool relaxedOrderingSchedulerRequired) {
|
||||
size_t semaphoreSize = (this->relaxedOrderingEnabled && relaxedOrderingSchedulerRequired) ? RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection<GfxFamily>::totalSize
|
||||
: EncodeSempahore<GfxFamily>::getSizeMiSemaphoreWait();
|
||||
semaphoreSize += getSizePrefetchMitigation();
|
||||
|
||||
if (isDisablePrefetcherRequired) {
|
||||
@@ -597,7 +596,7 @@ inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeSwitchRingBuffer
|
||||
}
|
||||
|
||||
template <typename GfxFamily, typename Dispatcher>
|
||||
inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeEnd() {
|
||||
inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeEnd(bool relaxedOrderingSchedulerRequired) {
|
||||
size_t size = Dispatcher::getSizeStopCommandBuffer() +
|
||||
Dispatcher::getSizeCacheFlush(*hwInfo) +
|
||||
(Dispatcher::getSizeStartCommandBuffer() - Dispatcher::getSizeStopCommandBuffer()) +
|
||||
@@ -605,15 +604,15 @@ inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeEnd() {
|
||||
if (disableMonitorFence) {
|
||||
size += Dispatcher::getSizeMonitorFence(*hwInfo);
|
||||
}
|
||||
if (this->relaxedOrderingEnabled) {
|
||||
if (this->relaxedOrderingEnabled && relaxedOrderingSchedulerRequired) {
|
||||
size += getSizeDispatchRelaxedOrderingQueueStall();
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
template <typename GfxFamily, typename Dispatcher>
|
||||
inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeDispatch() {
|
||||
size_t size = getSizeSemaphoreSection(false);
|
||||
inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeDispatch(bool relaxedOrderingSchedulerRequired) {
|
||||
size_t size = getSizeSemaphoreSection(relaxedOrderingSchedulerRequired);
|
||||
if (workloadMode == 0) {
|
||||
size += getSizeStartSection();
|
||||
if (this->relaxedOrderingEnabled) {
|
||||
@@ -673,7 +672,7 @@ void *DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchWorkloadSection(BatchBu
|
||||
uint64_t returnGpuPointer = ringCommandStream.getCurrentGpuAddressPosition();
|
||||
|
||||
if (this->relaxedOrderingEnabled) {
|
||||
dispatchRelaxedOrderingReturnPtrRegs(relaxedOrderingReturnPtrCmdStream, returnGpuPointer);
|
||||
dispatchRelaxedOrderingReturnPtrRegs(relaxedOrderingReturnPtrCmdStream, returnGpuPointer, batchBuffer.hasRelaxedOrderingDependencies);
|
||||
} else {
|
||||
setReturnAddress(returnCmd, returnGpuPointer);
|
||||
}
|
||||
@@ -683,7 +682,7 @@ void *DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchWorkloadSection(BatchBu
|
||||
}
|
||||
// mode 2 does not dispatch any commands
|
||||
|
||||
if (this->relaxedOrderingEnabled) {
|
||||
if (this->relaxedOrderingEnabled && batchBuffer.hasRelaxedOrderingDependencies) {
|
||||
dispatchTaskStoreSection(batchBuffer.taskStartAddress);
|
||||
}
|
||||
|
||||
@@ -698,7 +697,7 @@ void *DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchWorkloadSection(BatchBu
|
||||
this->useNotifyForPostSync, this->partitionedMode, this->dcFlushRequired);
|
||||
}
|
||||
|
||||
dispatchSemaphoreSection(currentQueueWorkCount + 1, false);
|
||||
dispatchSemaphoreSection(currentQueueWorkCount + 1);
|
||||
return currentPosition;
|
||||
}
|
||||
|
||||
@@ -708,25 +707,31 @@ void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchRelaxedOrderingQueueStal
|
||||
EncodeBatchBufferStartOrEnd<GfxFamily>::getCmdSizeConditionalDataRegBatchBufferStart());
|
||||
|
||||
LriHelper<GfxFamily>::program(&ringCommandStream, CS_GPR_R5, 1, true);
|
||||
dispatchSemaphoreSection(currentQueueWorkCount, false);
|
||||
dispatchSemaphoreSection(currentQueueWorkCount);
|
||||
|
||||
// patch conditional bb_start with current GPU address
|
||||
EncodeBatchBufferStartOrEnd<GfxFamily>::programConditionalDataRegBatchBufferStart(bbStartStream, ringCommandStream.getCurrentGpuAddressPosition(),
|
||||
CS_GPR_R1, 0, CompareOperation::Equal, false);
|
||||
|
||||
relaxedOrderingSchedulerRequired = false;
|
||||
}
|
||||
|
||||
template <typename GfxFamily, typename Dispatcher>
|
||||
size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeDispatchRelaxedOrderingQueueStall() {
|
||||
return getSizeSemaphoreSection(false) + sizeof(typename GfxFamily::MI_LOAD_REGISTER_IMM) +
|
||||
return getSizeSemaphoreSection(true) + sizeof(typename GfxFamily::MI_LOAD_REGISTER_IMM) +
|
||||
EncodeBatchBufferStartOrEnd<GfxFamily>::getCmdSizeConditionalDataRegBatchBufferStart();
|
||||
}
|
||||
|
||||
template <typename GfxFamily, typename Dispatcher>
|
||||
void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchRelaxedOrderingReturnPtrRegs(LinearStream &cmdStream, uint64_t returnPtr) {
|
||||
void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchRelaxedOrderingReturnPtrRegs(LinearStream &cmdStream, uint64_t returnPtr, bool hasRelaxedOrderingDependencies) {
|
||||
LriHelper<GfxFamily>::program(&cmdStream, CS_GPR_R4, static_cast<uint32_t>(returnPtr & 0xFFFF'FFFFULL), true);
|
||||
LriHelper<GfxFamily>::program(&cmdStream, CS_GPR_R4 + 4, static_cast<uint32_t>(returnPtr >> 32), true);
|
||||
|
||||
uint64_t returnPtrAfterTaskStoreSection = returnPtr + RelaxedOrderingHelper::getSizeTaskStoreSection<GfxFamily>();
|
||||
uint64_t returnPtrAfterTaskStoreSection = returnPtr;
|
||||
|
||||
if (hasRelaxedOrderingDependencies) {
|
||||
returnPtrAfterTaskStoreSection += RelaxedOrderingHelper::getSizeTaskStoreSection<GfxFamily>();
|
||||
}
|
||||
|
||||
LriHelper<GfxFamily>::program(&cmdStream, CS_GPR_R3, static_cast<uint32_t>(returnPtrAfterTaskStoreSection & 0xFFFF'FFFFULL), true);
|
||||
LriHelper<GfxFamily>::program(&cmdStream, CS_GPR_R3 + 4, static_cast<uint32_t>(returnPtrAfterTaskStoreSection >> 32), true);
|
||||
@@ -846,24 +851,32 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffe
|
||||
|
||||
this->startRingBuffer();
|
||||
|
||||
size_t dispatchSize = getSizeDispatch();
|
||||
bool relaxedOrderingSchedulerWillBeNeeded = (this->relaxedOrderingSchedulerRequired || batchBuffer.hasRelaxedOrderingDependencies);
|
||||
|
||||
size_t dispatchSize = getSizeDispatch(relaxedOrderingSchedulerWillBeNeeded);
|
||||
size_t cycleSize = getSizeSwitchRingBufferSection();
|
||||
size_t requiredMinimalSize = dispatchSize + cycleSize + getSizeEnd();
|
||||
size_t requiredMinimalSize = dispatchSize + cycleSize + getSizeEnd(relaxedOrderingSchedulerWillBeNeeded);
|
||||
if (this->relaxedOrderingEnabled) {
|
||||
if (batchBuffer.hasStallingCmds && !firstSubmissionAfterRingStart) {
|
||||
requiredMinimalSize += +RelaxedOrderingHelper::getSizeReturnPtrRegs<GfxFamily>();
|
||||
|
||||
if (batchBuffer.hasStallingCmds && this->relaxedOrderingSchedulerRequired) {
|
||||
requiredMinimalSize += getSizeDispatchRelaxedOrderingQueueStall();
|
||||
}
|
||||
requiredMinimalSize += RelaxedOrderingHelper::getSizeTaskStoreSection<GfxFamily>() + RelaxedOrderingHelper::getSizeReturnPtrRegs<GfxFamily>();
|
||||
if (batchBuffer.hasRelaxedOrderingDependencies) {
|
||||
requiredMinimalSize += RelaxedOrderingHelper::getSizeTaskStoreSection<GfxFamily>();
|
||||
}
|
||||
}
|
||||
|
||||
if (ringCommandStream.getAvailableSpace() < requiredMinimalSize) {
|
||||
switchRingBuffers();
|
||||
}
|
||||
|
||||
if (this->relaxedOrderingEnabled && batchBuffer.hasStallingCmds && !firstSubmissionAfterRingStart) {
|
||||
if (this->relaxedOrderingEnabled && batchBuffer.hasStallingCmds && this->relaxedOrderingSchedulerRequired) {
|
||||
dispatchRelaxedOrderingQueueStall();
|
||||
}
|
||||
|
||||
this->relaxedOrderingSchedulerRequired |= batchBuffer.hasRelaxedOrderingDependencies;
|
||||
|
||||
handleNewResourcesSubmission();
|
||||
|
||||
void *currentPosition = dispatchWorkloadSection(batchBuffer);
|
||||
@@ -890,8 +903,6 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffe
|
||||
uint64_t flushValue = updateTagValue();
|
||||
flushStamp.setStamp(flushValue);
|
||||
|
||||
firstSubmissionAfterRingStart = false;
|
||||
|
||||
return ringStart;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user