mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-03 23:03:02 +08:00
Optimize BCS flushing scheme [2/n]
Change-Id: I6f1e0115b9c45f89afb86f8fd2304604243541df Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
This commit is contained in:
committed by
sys_ocldev
parent
d4b12c97d4
commit
86dc5bacc7
@@ -147,28 +147,28 @@ bool CommandQueue::isCompleted(uint32_t taskCount) const {
|
||||
return tag >= taskCount;
|
||||
}
|
||||
|
||||
void CommandQueue::waitUntilComplete(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep) {
|
||||
void CommandQueue::waitUntilComplete(uint32_t gpgpuTaskCountToWait, uint32_t bcsTaskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep) {
|
||||
WAIT_ENTER()
|
||||
|
||||
DBG_LOG(LogTaskCounts, __FUNCTION__, "Waiting for taskCount:", taskCountToWait);
|
||||
DBG_LOG(LogTaskCounts, __FUNCTION__, "Waiting for taskCount:", gpgpuTaskCountToWait);
|
||||
DBG_LOG(LogTaskCounts, __FUNCTION__, "Line: ", __LINE__, "Current taskCount:", getHwTag());
|
||||
|
||||
bool forcePowerSavingMode = this->throttle == QueueThrottle::LOW;
|
||||
|
||||
getGpgpuCommandStreamReceiver().waitForTaskCountWithKmdNotifyFallback(taskCountToWait, flushStampToWait,
|
||||
getGpgpuCommandStreamReceiver().waitForTaskCountWithKmdNotifyFallback(gpgpuTaskCountToWait, flushStampToWait,
|
||||
useQuickKmdSleep, forcePowerSavingMode);
|
||||
DEBUG_BREAK_IF(getHwTag() < taskCountToWait);
|
||||
DEBUG_BREAK_IF(getHwTag() < gpgpuTaskCountToWait);
|
||||
|
||||
if (gtpinIsGTPinInitialized()) {
|
||||
gtpinNotifyTaskCompletion(taskCountToWait);
|
||||
gtpinNotifyTaskCompletion(gpgpuTaskCountToWait);
|
||||
}
|
||||
|
||||
if (auto bcsCsr = getBcsCommandStreamReceiver()) {
|
||||
bcsCsr->waitForTaskCountWithKmdNotifyFallback(bcsTaskCount, 0, false, false);
|
||||
bcsCsr->waitForTaskCountAndCleanTemporaryAllocationList(bcsTaskCount);
|
||||
bcsCsr->waitForTaskCountWithKmdNotifyFallback(bcsTaskCountToWait, 0, false, false);
|
||||
bcsCsr->waitForTaskCountAndCleanTemporaryAllocationList(bcsTaskCountToWait);
|
||||
}
|
||||
|
||||
getGpgpuCommandStreamReceiver().waitForTaskCountAndCleanTemporaryAllocationList(taskCountToWait);
|
||||
getGpgpuCommandStreamReceiver().waitForTaskCountAndCleanTemporaryAllocationList(gpgpuTaskCountToWait);
|
||||
|
||||
WAIT_LEAVE()
|
||||
}
|
||||
|
||||
@@ -215,7 +215,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
|
||||
|
||||
MOCKABLE_VIRTUAL bool isQueueBlocked();
|
||||
|
||||
MOCKABLE_VIRTUAL void waitUntilComplete(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep);
|
||||
MOCKABLE_VIRTUAL void waitUntilComplete(uint32_t gpgpuTaskCountToWait, uint32_t bcsTaskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep);
|
||||
|
||||
static uint32_t getTaskLevelFromWaitList(uint32_t taskLevel,
|
||||
cl_uint numEventsInWaitList,
|
||||
@@ -299,6 +299,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
|
||||
}
|
||||
|
||||
void updateBcsTaskCount(uint32_t newBcsTaskCount) { this->bcsTaskCount = newBcsTaskCount; }
|
||||
uint32_t peekBcsTaskCount() const { return bcsTaskCount; }
|
||||
|
||||
// taskCount of last task
|
||||
uint32_t taskCount = 0;
|
||||
|
||||
@@ -145,7 +145,7 @@ void *CommandQueue::cpuDataTransferHandler(TransferProperties &transferPropertie
|
||||
|
||||
if (outEventObj) {
|
||||
outEventObj->setEndTimeStamp();
|
||||
outEventObj->updateTaskCount(this->taskCount);
|
||||
outEventObj->updateTaskCount(this->taskCount, this->bcsTaskCount);
|
||||
outEventObj->flushStamp->replaceStampObject(this->flushStamp->getStampReference());
|
||||
if (eventCompleted) {
|
||||
outEventObj->setStatus(CL_COMPLETE);
|
||||
|
||||
@@ -305,7 +305,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
|
||||
getGpgpuCommandStreamReceiver().setMediaVFEStateDirty(true);
|
||||
|
||||
if (devQueueHw->getSchedulerReturnInstance() > 0) {
|
||||
waitUntilComplete(completionStamp.taskCount, completionStamp.flushStamp, false);
|
||||
waitUntilComplete(completionStamp.taskCount, bcsTaskCount, completionStamp.flushStamp, false);
|
||||
this->runSchedulerSimulation(*devQueueHw, *parentKernel);
|
||||
}
|
||||
}
|
||||
@@ -353,7 +353,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
|
||||
updateFromCompletionStamp(completionStamp);
|
||||
|
||||
if (eventBuilder.getEvent()) {
|
||||
eventBuilder.getEvent()->updateCompletionStamp(completionStamp.taskCount, completionStamp.taskLevel, completionStamp.flushStamp);
|
||||
eventBuilder.getEvent()->updateCompletionStamp(completionStamp.taskCount, bcsTaskCount, completionStamp.taskLevel, completionStamp.flushStamp);
|
||||
FileLoggerInstance().log(DebugManager.flags.EventsDebugEnable.get(), "updateCompletionStamp Event", eventBuilder.getEvent(), "taskLevel", eventBuilder.getEvent()->taskLevel.load());
|
||||
}
|
||||
|
||||
@@ -382,9 +382,9 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
|
||||
if (blockQueue) {
|
||||
while (isQueueBlocked()) {
|
||||
}
|
||||
waitUntilComplete(taskCount, flushStamp->peekStamp(), false);
|
||||
waitUntilComplete(taskCount, bcsTaskCount, flushStamp->peekStamp(), false);
|
||||
} else {
|
||||
waitUntilComplete(taskCount, flushStamp->peekStamp(), false);
|
||||
waitUntilComplete(taskCount, bcsTaskCount, flushStamp->peekStamp(), false);
|
||||
if (printfHandler) {
|
||||
printfHandler->printEnqueueOutput();
|
||||
}
|
||||
|
||||
@@ -27,7 +27,7 @@ cl_int CommandQueueHw<GfxFamily>::finish() {
|
||||
auto flushStampToWaitFor = this->flushStamp->peekStamp();
|
||||
|
||||
// Stall until HW reaches CQ taskCount
|
||||
waitUntilComplete(taskCountToWaitFor, flushStampToWaitFor, false);
|
||||
waitUntilComplete(taskCountToWaitFor, this->bcsTaskCount, flushStampToWaitFor, false);
|
||||
|
||||
return CL_SUCCESS;
|
||||
}
|
||||
|
||||
@@ -224,8 +224,9 @@ uint32_t Event::getCompletionStamp() const {
|
||||
return this->taskCount;
|
||||
}
|
||||
|
||||
void Event::updateCompletionStamp(uint32_t taskCount, uint32_t tasklevel, FlushStamp flushStamp) {
|
||||
this->taskCount = taskCount;
|
||||
void Event::updateCompletionStamp(uint32_t gpgpuTaskCount, uint32_t bcsTaskCount, uint32_t tasklevel, FlushStamp flushStamp) {
|
||||
this->taskCount = gpgpuTaskCount;
|
||||
this->bcsTaskCount = bcsTaskCount;
|
||||
this->taskLevel = tasklevel;
|
||||
this->flushStamp->setStamp(flushStamp);
|
||||
}
|
||||
@@ -370,7 +371,7 @@ inline bool Event::wait(bool blocking, bool useQuickKmdSleep) {
|
||||
}
|
||||
}
|
||||
|
||||
cmdQueue->waitUntilComplete(taskCount.load(), flushStamp->peekStamp(), useQuickKmdSleep);
|
||||
cmdQueue->waitUntilComplete(taskCount.load(), this->bcsTaskCount, flushStamp->peekStamp(), useQuickKmdSleep);
|
||||
updateExecutionStatus();
|
||||
|
||||
DEBUG_BREAK_IF(this->taskLevel == CompletionStamp::notReady && this->executionStatus >= 0);
|
||||
@@ -510,11 +511,9 @@ void Event::transitionExecutionStatus(int32_t newExecutionStatus) const {
|
||||
void Event::submitCommand(bool abortTasks) {
|
||||
std::unique_ptr<Command> cmdToProcess(cmdToSubmit.exchange(nullptr));
|
||||
if (cmdToProcess.get() != nullptr) {
|
||||
std::unique_lock<CommandStreamReceiver::MutexType> lockCSR;
|
||||
if (this->cmdQueue) {
|
||||
lockCSR = this->getCommandQueue()->getGpgpuCommandStreamReceiver().obtainUniqueOwnership();
|
||||
}
|
||||
if ((this->isProfilingEnabled()) && (this->cmdQueue != nullptr)) {
|
||||
auto lockCSR = getCommandQueue()->getGpgpuCommandStreamReceiver().obtainUniqueOwnership();
|
||||
|
||||
if (this->isProfilingEnabled()) {
|
||||
if (timeStampNode) {
|
||||
this->cmdQueue->getGpgpuCommandStreamReceiver().makeResident(*timeStampNode->getBaseGraphicsAllocation());
|
||||
cmdToProcess->timestamp = timeStampNode;
|
||||
@@ -530,10 +529,10 @@ void Event::submitCommand(bool abortTasks) {
|
||||
}
|
||||
}
|
||||
auto &complStamp = cmdToProcess->submit(taskLevel, abortTasks);
|
||||
if (profilingCpuPath && this->isProfilingEnabled() && (this->cmdQueue != nullptr)) {
|
||||
if (profilingCpuPath && this->isProfilingEnabled()) {
|
||||
setEndTimeStamp();
|
||||
}
|
||||
updateTaskCount(complStamp.taskCount);
|
||||
updateTaskCount(complStamp.taskCount, cmdQueue->peekBcsTaskCount());
|
||||
flushStamp->setStamp(complStamp.flushStamp);
|
||||
submittedCmd.exchange(cmdToProcess.release());
|
||||
} else if (profilingCpuPath && endTimeStamp == 0) {
|
||||
@@ -543,7 +542,7 @@ void Event::submitCommand(bool abortTasks) {
|
||||
if (!this->isUserEvent() && this->eventWithoutCommand) {
|
||||
if (this->cmdQueue) {
|
||||
auto lockCSR = this->getCommandQueue()->getGpgpuCommandStreamReceiver().obtainUniqueOwnership();
|
||||
updateTaskCount(this->cmdQueue->getGpgpuCommandStreamReceiver().peekTaskCount());
|
||||
updateTaskCount(this->cmdQueue->getGpgpuCommandStreamReceiver().peekTaskCount(), cmdQueue->peekBcsTaskCount());
|
||||
}
|
||||
}
|
||||
//make sure that task count is synchronized for events with kernels
|
||||
|
||||
@@ -89,7 +89,7 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
|
||||
~Event() override;
|
||||
|
||||
uint32_t getCompletionStamp(void) const;
|
||||
void updateCompletionStamp(uint32_t taskCount, uint32_t tasklevel, FlushStamp flushStamp);
|
||||
void updateCompletionStamp(uint32_t taskCount, uint32_t bcsTaskCount, uint32_t tasklevel, FlushStamp flushStamp);
|
||||
cl_ulong getDelta(cl_ulong startTime,
|
||||
cl_ulong endTime);
|
||||
void setCPUProfilingPath(bool isCPUPath) { this->profilingCpuPath = isCPUPath; }
|
||||
@@ -243,14 +243,15 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
|
||||
|
||||
virtual void unblockEventBy(Event &event, uint32_t taskLevel, int32_t transitionStatus);
|
||||
|
||||
void updateTaskCount(uint32_t taskCount) {
|
||||
if (taskCount == CompletionStamp::notReady) {
|
||||
void updateTaskCount(uint32_t gpgpuTaskCount, uint32_t bcsTaskCount) {
|
||||
if (gpgpuTaskCount == CompletionStamp::notReady) {
|
||||
DEBUG_BREAK_IF(true);
|
||||
return;
|
||||
}
|
||||
|
||||
uint32_t prevTaskCount = this->taskCount.exchange(taskCount);
|
||||
if ((prevTaskCount != CompletionStamp::notReady) && (prevTaskCount > taskCount)) {
|
||||
this->bcsTaskCount = bcsTaskCount;
|
||||
uint32_t prevTaskCount = this->taskCount.exchange(gpgpuTaskCount);
|
||||
if ((prevTaskCount != CompletionStamp::notReady) && (prevTaskCount > gpgpuTaskCount)) {
|
||||
this->taskCount = prevTaskCount;
|
||||
DEBUG_BREAK_IF(true);
|
||||
}
|
||||
@@ -363,6 +364,7 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
|
||||
uint64_t startTimeStamp;
|
||||
uint64_t endTimeStamp;
|
||||
uint64_t completeTimeStamp;
|
||||
uint32_t bcsTaskCount = 0;
|
||||
bool perfCountersEnabled;
|
||||
TagNode<HwTimeStamps> *timeStampNode = nullptr;
|
||||
TagNode<HwPerfCounter> *perfCounterNode = nullptr;
|
||||
|
||||
@@ -88,7 +88,7 @@ CompletionStamp &CommandMapUnmap::submit(uint32_t taskLevel, bool terminated) {
|
||||
commandQueue.getDevice());
|
||||
|
||||
if (!memObj.isMemObjZeroCopy()) {
|
||||
commandQueue.waitUntilComplete(completionStamp.taskCount, completionStamp.flushStamp, false);
|
||||
commandQueue.waitUntilComplete(completionStamp.taskCount, commandQueue.peekBcsTaskCount(), completionStamp.flushStamp, false);
|
||||
if (operationType == MAP) {
|
||||
memObj.transferDataToHostPtr(copySize, copyOffset);
|
||||
} else if (!readOnly) {
|
||||
@@ -268,7 +268,7 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
|
||||
}
|
||||
|
||||
if (printfHandler) {
|
||||
commandQueue.waitUntilComplete(completionStamp.taskCount, completionStamp.flushStamp, false);
|
||||
commandQueue.waitUntilComplete(completionStamp.taskCount, commandQueue.peekBcsTaskCount(), completionStamp.flushStamp, false);
|
||||
printfHandler.get()->printEnqueueOutput();
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user