Optimize BCS flushing scheme [3/n]

Change-Id: I806d642c869bccfe40a1eb0c58b6a2f53e071cd8
Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
This commit is contained in:
Bartosz Dunajski
2020-06-26 11:21:07 +02:00
committed by sys_ocldev
parent 0d5d793a01
commit 107f07eb08
11 changed files with 166 additions and 22 deletions

View File

@@ -141,10 +141,19 @@ volatile uint32_t *CommandQueue::getHwTagAddress() const {
return getGpgpuCommandStreamReceiver().getTagAddress();
}
bool CommandQueue::isCompleted(uint32_t taskCount) const {
uint32_t tag = getHwTag();
DEBUG_BREAK_IF(tag == CompletionStamp::notReady);
return tag >= taskCount;
bool CommandQueue::isCompleted(uint32_t gpgpuTaskCount, uint32_t bcsTaskCount) const {
uint32_t gpgpuHwTag = getHwTag();
DEBUG_BREAK_IF(gpgpuHwTag == CompletionStamp::notReady);
if (gpgpuHwTag >= gpgpuTaskCount) {
if (auto bcsCsr = getBcsCommandStreamReceiver()) {
return (*bcsCsr->getTagAddress()) >= bcsTaskCount;
}
return true;
}
return false;
}
void CommandQueue::waitUntilComplete(uint32_t gpgpuTaskCountToWait, uint32_t bcsTaskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep) {
@@ -687,11 +696,4 @@ void CommandQueue::aubCaptureHook(bool &blocking, bool &clearAllDependencies, co
}
}
// Decides whether a blit enqueue also needs a GPGPU submission.
// Any non-zero value of the ForceGpgpuSubmissionForBcsEnqueue debug flag forces "yes";
// otherwise the answer is "yes" only when the latest sent enqueue was neither a Blit nor None.
bool CommandQueue::isGpgpuSubmissionForBcsRequired() const {
    if (DebugManager.flags.ForceGpgpuSubmissionForBcsEnqueue.get() != 0) {
        return true;
    }

    const bool previousWasBlit = (latestSentEnqueueType == EnqueueProperties::Operation::Blit);
    const bool nothingSentYet = (latestSentEnqueueType == EnqueueProperties::Operation::None);
    return !(previousWasBlit || nothingSentYet);
}
} // namespace NEO

View File

@@ -211,7 +211,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
volatile uint32_t *getHwTagAddress() const;
bool isCompleted(uint32_t taskCount) const;
bool isCompleted(uint32_t gpgpuTaskCount, uint32_t bcsTaskCount) const;
MOCKABLE_VIRTUAL bool isQueueBlocked();
@@ -301,6 +301,8 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
void updateBcsTaskCount(uint32_t newBcsTaskCount) { this->bcsTaskCount = newBcsTaskCount; }
uint32_t peekBcsTaskCount() const { return bcsTaskCount; }
// Remembers newEnqueueType as the type of the most recently sent enqueue.
void updateLatestSentEnqueueType(EnqueueProperties::Operation newEnqueueType) {
    latestSentEnqueueType = newEnqueueType;
}
// taskCount of last task
uint32_t taskCount = 0;
@@ -338,7 +340,6 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
bool blitEnqueueAllowed(cl_command_type cmdType) const;
void aubCaptureHook(bool &blocking, bool &clearAllDependencies, const MultiDispatchInfo &multiDispatchInfo);
virtual bool obtainTimestampPacketForCacheFlush(bool isCacheFlushRequired) const = 0;
bool isGpgpuSubmissionForBcsRequired() const;
Context *context = nullptr;
ClDevice *device = nullptr;

View File

@@ -474,5 +474,7 @@ class CommandQueueHw : public CommandQueue {
CsrDependencies &csrDeps,
KernelOperation *blockedCommandsData,
TimestampPacketDependencies &timestampPacketDependencies);
bool isGpgpuSubmissionForBcsRequired(bool queueBlocked) const;
};
} // namespace NEO

View File

@@ -151,4 +151,18 @@ bool CommandQueueHw<Family>::obtainTimestampPacketForCacheFlush(bool isCacheFlus
return isCacheFlushRequired;
}
// Decides whether a blit enqueue also needs a GPGPU submission.
//  - A blocked queue always requires it.
//  - Otherwise it is required when a cache flush for BCS is required and the latest
//    sent enqueue was neither a Blit nor None.
//  - ForceGpgpuSubmissionForBcsEnqueue == 1 overrides the result to true.
template <typename Family>
bool CommandQueueHw<Family>::isGpgpuSubmissionForBcsRequired(bool queueBlocked) const {
    if (queueBlocked) {
        return true;
    }

    // Evaluated before the debug flag is read, mirroring the original evaluation order.
    const bool cacheFlushNeeded = isCacheFlushForBcsRequired();
    const bool previousWasBlit = (latestSentEnqueueType == EnqueueProperties::Operation::Blit);
    const bool nothingSentYet = (latestSentEnqueueType == EnqueueProperties::Operation::None);
    const bool forcedByDebugFlag = (DebugManager.flags.ForceGpgpuSubmissionForBcsEnqueue.get() == 1);

    return forcedByDebugFlag || (cacheFlushNeeded && !(previousWasBlit || nothingSentYet));
}
} // namespace NEO

View File

@@ -215,7 +215,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
if (isCacheFlushForBcsRequired()) {
// Cache flush for aux translation is always required (if supported)
if ((blitEnqueue && isGpgpuSubmissionForBcsRequired()) || (enqueueWithBlitAuxTranslation)) {
if ((blitEnqueue && isGpgpuSubmissionForBcsRequired(blockQueue)) || (enqueueWithBlitAuxTranslation)) {
timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag());
}
}
@@ -490,7 +490,7 @@ BlitProperties CommandQueueHw<GfxFamily>::processDispatchForBlitEnqueue(const Mu
auto currentTimestampPacketNode = timestampPacketContainer->peekNodes().at(0);
blitProperties.outputTimestampPacket = currentTimestampPacketNode;
if (isGpgpuSubmissionForBcsRequired()) {
if (isGpgpuSubmissionForBcsRequired(queueBlocked)) {
if (isCacheFlushForBcsRequired()) {
auto cacheFlushTimestampPacketGpuAddress = TimestampPacketHelper::getContextEndGpuAddress(*timestampPacketDependencies.cacheFlushNodes.peekNodes()[0]);
PipeControlArgs args(true);
@@ -956,7 +956,7 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueCommandWithoutKernel(
CompletionStamp completionStamp = {this->taskCount, this->taskLevel, this->flushStamp->peekStamp()};
bool flushGpgpuCsr = true;
if ((enqueueProperties.operation == EnqueueProperties::Operation::Blit) && !isGpgpuSubmissionForBcsRequired()) {
if ((enqueueProperties.operation == EnqueueProperties::Operation::Blit) && !isGpgpuSubmissionForBcsRequired(false)) {
flushGpgpuCsr = false;
}

View File

@@ -408,7 +408,7 @@ void Event::updateExecutionStatus() {
// Note : Intentional fallthrough (no return) to check for CL_COMPLETE
}
if ((cmdQueue != nullptr) && (cmdQueue->isCompleted(getCompletionStamp()))) {
if ((cmdQueue != nullptr) && (cmdQueue->isCompleted(getCompletionStamp(), this->bcsTaskCount))) {
transitionExecutionStatus(CL_COMPLETE);
executeCallbacks(CL_COMPLETE);
unblockEventsBlockedByThis(CL_COMPLETE);

View File

@@ -88,7 +88,7 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
~Event() override;
uint32_t getCompletionStamp(void) const;
uint32_t getCompletionStamp() const;
void updateCompletionStamp(uint32_t taskCount, uint32_t bcsTaskCount, uint32_t tasklevel, FlushStamp flushStamp);
cl_ulong getDelta(cl_ulong startTime,
cl_ulong endTime);

View File

@@ -87,6 +87,8 @@ CompletionStamp &CommandMapUnmap::submit(uint32_t taskLevel, bool terminated) {
dispatchFlags,
commandQueue.getDevice());
commandQueue.updateLatestSentEnqueueType(EnqueueProperties::Operation::DependencyResolveOnGpu);
if (!memObj.isMemObjZeroCopy()) {
commandQueue.waitUntilComplete(completionStamp.taskCount, commandQueue.peekBcsTaskCount(), completionStamp.flushStamp, false);
if (operationType == MAP) {
@@ -262,6 +264,7 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
auto bcsTaskCount = commandQueue.getBcsCommandStreamReceiver()->blitBuffer(kernelOperation->blitPropertiesContainer, false, commandQueue.isProfilingEnabled());
commandQueue.updateBcsTaskCount(bcsTaskCount);
}
commandQueue.updateLatestSentEnqueueType(EnqueueProperties::Operation::GpuKernel);
if (gtpinIsGTPinInitialized()) {
gtpinNotifyFlushTask(completionStamp.taskCount);
@@ -314,7 +317,11 @@ CompletionStamp &CommandWithoutKernel::submit(uint32_t taskLevel, bool terminate
auto lockCSR = commandStreamReceiver.obtainUniqueOwnership();
auto enqueueOperationType = EnqueueProperties::Operation::DependencyResolveOnGpu;
if (kernelOperation->blitEnqueue) {
enqueueOperationType = EnqueueProperties::Operation::Blit;
if (commandStreamReceiver.isStallingPipeControlOnNextFlushRequired()) {
timestampPacketDependencies->barrierNodes.add(commandStreamReceiver.getTimestampPacketAllocator()->getTag());
}
@@ -364,6 +371,8 @@ CompletionStamp &CommandWithoutKernel::submit(uint32_t taskLevel, bool terminate
dispatchBlitOperation();
}
commandQueue.updateLatestSentEnqueueType(enqueueOperationType);
return completionStamp;
}