Optimize BCS flushing scheme [2/n]

Change-Id: I6f1e0115b9c45f89afb86f8fd2304604243541df
Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
This commit is contained in:
Bartosz Dunajski
2020-06-25 11:35:29 +02:00
committed by sys_ocldev
parent d4b12c97d4
commit 86dc5bacc7
19 changed files with 291 additions and 134 deletions

View File

@@ -147,28 +147,28 @@ bool CommandQueue::isCompleted(uint32_t taskCount) const {
return tag >= taskCount;
}
void CommandQueue::waitUntilComplete(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep) {
void CommandQueue::waitUntilComplete(uint32_t gpgpuTaskCountToWait, uint32_t bcsTaskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep) {
WAIT_ENTER()
DBG_LOG(LogTaskCounts, __FUNCTION__, "Waiting for taskCount:", taskCountToWait);
DBG_LOG(LogTaskCounts, __FUNCTION__, "Waiting for taskCount:", gpgpuTaskCountToWait);
DBG_LOG(LogTaskCounts, __FUNCTION__, "Line: ", __LINE__, "Current taskCount:", getHwTag());
bool forcePowerSavingMode = this->throttle == QueueThrottle::LOW;
getGpgpuCommandStreamReceiver().waitForTaskCountWithKmdNotifyFallback(taskCountToWait, flushStampToWait,
getGpgpuCommandStreamReceiver().waitForTaskCountWithKmdNotifyFallback(gpgpuTaskCountToWait, flushStampToWait,
useQuickKmdSleep, forcePowerSavingMode);
DEBUG_BREAK_IF(getHwTag() < taskCountToWait);
DEBUG_BREAK_IF(getHwTag() < gpgpuTaskCountToWait);
if (gtpinIsGTPinInitialized()) {
gtpinNotifyTaskCompletion(taskCountToWait);
gtpinNotifyTaskCompletion(gpgpuTaskCountToWait);
}
if (auto bcsCsr = getBcsCommandStreamReceiver()) {
bcsCsr->waitForTaskCountWithKmdNotifyFallback(bcsTaskCount, 0, false, false);
bcsCsr->waitForTaskCountAndCleanTemporaryAllocationList(bcsTaskCount);
bcsCsr->waitForTaskCountWithKmdNotifyFallback(bcsTaskCountToWait, 0, false, false);
bcsCsr->waitForTaskCountAndCleanTemporaryAllocationList(bcsTaskCountToWait);
}
getGpgpuCommandStreamReceiver().waitForTaskCountAndCleanTemporaryAllocationList(taskCountToWait);
getGpgpuCommandStreamReceiver().waitForTaskCountAndCleanTemporaryAllocationList(gpgpuTaskCountToWait);
WAIT_LEAVE()
}

View File

@@ -215,7 +215,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
MOCKABLE_VIRTUAL bool isQueueBlocked();
MOCKABLE_VIRTUAL void waitUntilComplete(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep);
MOCKABLE_VIRTUAL void waitUntilComplete(uint32_t gpgpuTaskCountToWait, uint32_t bcsTaskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep);
static uint32_t getTaskLevelFromWaitList(uint32_t taskLevel,
cl_uint numEventsInWaitList,
@@ -299,6 +299,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
}
void updateBcsTaskCount(uint32_t newBcsTaskCount) { this->bcsTaskCount = newBcsTaskCount; }
uint32_t peekBcsTaskCount() const { return bcsTaskCount; }
// taskCount of last task
uint32_t taskCount = 0;

View File

@@ -145,7 +145,7 @@ void *CommandQueue::cpuDataTransferHandler(TransferProperties &transferPropertie
if (outEventObj) {
outEventObj->setEndTimeStamp();
outEventObj->updateTaskCount(this->taskCount);
outEventObj->updateTaskCount(this->taskCount, this->bcsTaskCount);
outEventObj->flushStamp->replaceStampObject(this->flushStamp->getStampReference());
if (eventCompleted) {
outEventObj->setStatus(CL_COMPLETE);

View File

@@ -305,7 +305,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
getGpgpuCommandStreamReceiver().setMediaVFEStateDirty(true);
if (devQueueHw->getSchedulerReturnInstance() > 0) {
waitUntilComplete(completionStamp.taskCount, completionStamp.flushStamp, false);
waitUntilComplete(completionStamp.taskCount, bcsTaskCount, completionStamp.flushStamp, false);
this->runSchedulerSimulation(*devQueueHw, *parentKernel);
}
}
@@ -353,7 +353,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
updateFromCompletionStamp(completionStamp);
if (eventBuilder.getEvent()) {
eventBuilder.getEvent()->updateCompletionStamp(completionStamp.taskCount, completionStamp.taskLevel, completionStamp.flushStamp);
eventBuilder.getEvent()->updateCompletionStamp(completionStamp.taskCount, bcsTaskCount, completionStamp.taskLevel, completionStamp.flushStamp);
FileLoggerInstance().log(DebugManager.flags.EventsDebugEnable.get(), "updateCompletionStamp Event", eventBuilder.getEvent(), "taskLevel", eventBuilder.getEvent()->taskLevel.load());
}
@@ -382,9 +382,9 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
if (blockQueue) {
while (isQueueBlocked()) {
}
waitUntilComplete(taskCount, flushStamp->peekStamp(), false);
waitUntilComplete(taskCount, bcsTaskCount, flushStamp->peekStamp(), false);
} else {
waitUntilComplete(taskCount, flushStamp->peekStamp(), false);
waitUntilComplete(taskCount, bcsTaskCount, flushStamp->peekStamp(), false);
if (printfHandler) {
printfHandler->printEnqueueOutput();
}

View File

@@ -27,7 +27,7 @@ cl_int CommandQueueHw<GfxFamily>::finish() {
auto flushStampToWaitFor = this->flushStamp->peekStamp();
// Stall until HW reaches CQ taskCount
waitUntilComplete(taskCountToWaitFor, flushStampToWaitFor, false);
waitUntilComplete(taskCountToWaitFor, this->bcsTaskCount, flushStampToWaitFor, false);
return CL_SUCCESS;
}

View File

@@ -224,8 +224,9 @@ uint32_t Event::getCompletionStamp() const {
return this->taskCount;
}
void Event::updateCompletionStamp(uint32_t taskCount, uint32_t tasklevel, FlushStamp flushStamp) {
this->taskCount = taskCount;
void Event::updateCompletionStamp(uint32_t gpgpuTaskCount, uint32_t bcsTaskCount, uint32_t tasklevel, FlushStamp flushStamp) {
this->taskCount = gpgpuTaskCount;
this->bcsTaskCount = bcsTaskCount;
this->taskLevel = tasklevel;
this->flushStamp->setStamp(flushStamp);
}
@@ -370,7 +371,7 @@ inline bool Event::wait(bool blocking, bool useQuickKmdSleep) {
}
}
cmdQueue->waitUntilComplete(taskCount.load(), flushStamp->peekStamp(), useQuickKmdSleep);
cmdQueue->waitUntilComplete(taskCount.load(), this->bcsTaskCount, flushStamp->peekStamp(), useQuickKmdSleep);
updateExecutionStatus();
DEBUG_BREAK_IF(this->taskLevel == CompletionStamp::notReady && this->executionStatus >= 0);
@@ -510,11 +511,9 @@ void Event::transitionExecutionStatus(int32_t newExecutionStatus) const {
void Event::submitCommand(bool abortTasks) {
std::unique_ptr<Command> cmdToProcess(cmdToSubmit.exchange(nullptr));
if (cmdToProcess.get() != nullptr) {
std::unique_lock<CommandStreamReceiver::MutexType> lockCSR;
if (this->cmdQueue) {
lockCSR = this->getCommandQueue()->getGpgpuCommandStreamReceiver().obtainUniqueOwnership();
}
if ((this->isProfilingEnabled()) && (this->cmdQueue != nullptr)) {
auto lockCSR = getCommandQueue()->getGpgpuCommandStreamReceiver().obtainUniqueOwnership();
if (this->isProfilingEnabled()) {
if (timeStampNode) {
this->cmdQueue->getGpgpuCommandStreamReceiver().makeResident(*timeStampNode->getBaseGraphicsAllocation());
cmdToProcess->timestamp = timeStampNode;
@@ -530,10 +529,10 @@ void Event::submitCommand(bool abortTasks) {
}
}
auto &complStamp = cmdToProcess->submit(taskLevel, abortTasks);
if (profilingCpuPath && this->isProfilingEnabled() && (this->cmdQueue != nullptr)) {
if (profilingCpuPath && this->isProfilingEnabled()) {
setEndTimeStamp();
}
updateTaskCount(complStamp.taskCount);
updateTaskCount(complStamp.taskCount, cmdQueue->peekBcsTaskCount());
flushStamp->setStamp(complStamp.flushStamp);
submittedCmd.exchange(cmdToProcess.release());
} else if (profilingCpuPath && endTimeStamp == 0) {
@@ -543,7 +542,7 @@ void Event::submitCommand(bool abortTasks) {
if (!this->isUserEvent() && this->eventWithoutCommand) {
if (this->cmdQueue) {
auto lockCSR = this->getCommandQueue()->getGpgpuCommandStreamReceiver().obtainUniqueOwnership();
updateTaskCount(this->cmdQueue->getGpgpuCommandStreamReceiver().peekTaskCount());
updateTaskCount(this->cmdQueue->getGpgpuCommandStreamReceiver().peekTaskCount(), cmdQueue->peekBcsTaskCount());
}
}
//make sure that task count is synchronized for events with kernels

View File

@@ -89,7 +89,7 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
~Event() override;
uint32_t getCompletionStamp(void) const;
void updateCompletionStamp(uint32_t taskCount, uint32_t tasklevel, FlushStamp flushStamp);
void updateCompletionStamp(uint32_t taskCount, uint32_t bcsTaskCount, uint32_t tasklevel, FlushStamp flushStamp);
cl_ulong getDelta(cl_ulong startTime,
cl_ulong endTime);
void setCPUProfilingPath(bool isCPUPath) { this->profilingCpuPath = isCPUPath; }
@@ -243,14 +243,15 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
virtual void unblockEventBy(Event &event, uint32_t taskLevel, int32_t transitionStatus);
void updateTaskCount(uint32_t taskCount) {
if (taskCount == CompletionStamp::notReady) {
void updateTaskCount(uint32_t gpgpuTaskCount, uint32_t bcsTaskCount) {
if (gpgpuTaskCount == CompletionStamp::notReady) {
DEBUG_BREAK_IF(true);
return;
}
uint32_t prevTaskCount = this->taskCount.exchange(taskCount);
if ((prevTaskCount != CompletionStamp::notReady) && (prevTaskCount > taskCount)) {
this->bcsTaskCount = bcsTaskCount;
uint32_t prevTaskCount = this->taskCount.exchange(gpgpuTaskCount);
if ((prevTaskCount != CompletionStamp::notReady) && (prevTaskCount > gpgpuTaskCount)) {
this->taskCount = prevTaskCount;
DEBUG_BREAK_IF(true);
}
@@ -363,6 +364,7 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
uint64_t startTimeStamp;
uint64_t endTimeStamp;
uint64_t completeTimeStamp;
uint32_t bcsTaskCount = 0;
bool perfCountersEnabled;
TagNode<HwTimeStamps> *timeStampNode = nullptr;
TagNode<HwPerfCounter> *perfCounterNode = nullptr;

View File

@@ -88,7 +88,7 @@ CompletionStamp &CommandMapUnmap::submit(uint32_t taskLevel, bool terminated) {
commandQueue.getDevice());
if (!memObj.isMemObjZeroCopy()) {
commandQueue.waitUntilComplete(completionStamp.taskCount, completionStamp.flushStamp, false);
commandQueue.waitUntilComplete(completionStamp.taskCount, commandQueue.peekBcsTaskCount(), completionStamp.flushStamp, false);
if (operationType == MAP) {
memObj.transferDataToHostPtr(copySize, copyOffset);
} else if (!readOnly) {
@@ -268,7 +268,7 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
}
if (printfHandler) {
commandQueue.waitUntilComplete(completionStamp.taskCount, completionStamp.flushStamp, false);
commandQueue.waitUntilComplete(completionStamp.taskCount, commandQueue.peekBcsTaskCount(), completionStamp.flushStamp, false);
printfHandler.get()->printEnqueueOutput();
}