Optimize BCS flushing scheme [2/n]

Change-Id: I6f1e0115b9c45f89afb86f8fd2304604243541df
Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
2026-01-03 23:03:02 +08:00 · 2020-06-25 11:35:29 +02:00
parent d4b12c97d4
commit 86dc5bacc7
19 changed files with 291 additions and 134 deletions
--- a/opencl/source/command_queue/command_queue.cpp
+++ b/opencl/source/command_queue/command_queue.cpp
@@ -147,28 +147,28 @@ bool CommandQueue::isCompleted(uint32_t taskCount) const {
    return tag >= taskCount;
 }

-void CommandQueue::waitUntilComplete(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep) {
+void CommandQueue::waitUntilComplete(uint32_t gpgpuTaskCountToWait, uint32_t bcsTaskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep) {
    WAIT_ENTER()

-    DBG_LOG(LogTaskCounts, __FUNCTION__, "Waiting for taskCount:", taskCountToWait);
+    DBG_LOG(LogTaskCounts, __FUNCTION__, "Waiting for taskCount:", gpgpuTaskCountToWait);
    DBG_LOG(LogTaskCounts, __FUNCTION__, "Line: ", __LINE__, "Current taskCount:", getHwTag());

    bool forcePowerSavingMode = this->throttle == QueueThrottle::LOW;

-    getGpgpuCommandStreamReceiver().waitForTaskCountWithKmdNotifyFallback(taskCountToWait, flushStampToWait,
+    getGpgpuCommandStreamReceiver().waitForTaskCountWithKmdNotifyFallback(gpgpuTaskCountToWait, flushStampToWait,
                                                                          useQuickKmdSleep, forcePowerSavingMode);
-    DEBUG_BREAK_IF(getHwTag() < taskCountToWait);
+    DEBUG_BREAK_IF(getHwTag() < gpgpuTaskCountToWait);

    if (gtpinIsGTPinInitialized()) {
-        gtpinNotifyTaskCompletion(taskCountToWait);
+        gtpinNotifyTaskCompletion(gpgpuTaskCountToWait);
    }

    if (auto bcsCsr = getBcsCommandStreamReceiver()) {
-        bcsCsr->waitForTaskCountWithKmdNotifyFallback(bcsTaskCount, 0, false, false);
-        bcsCsr->waitForTaskCountAndCleanTemporaryAllocationList(bcsTaskCount);
+        bcsCsr->waitForTaskCountWithKmdNotifyFallback(bcsTaskCountToWait, 0, false, false);
+        bcsCsr->waitForTaskCountAndCleanTemporaryAllocationList(bcsTaskCountToWait);
    }

-    getGpgpuCommandStreamReceiver().waitForTaskCountAndCleanTemporaryAllocationList(taskCountToWait);
+    getGpgpuCommandStreamReceiver().waitForTaskCountAndCleanTemporaryAllocationList(gpgpuTaskCountToWait);

    WAIT_LEAVE()
 }
--- a/opencl/source/command_queue/command_queue.h
+++ b/opencl/source/command_queue/command_queue.h
@@ -215,7 +215,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {

    MOCKABLE_VIRTUAL bool isQueueBlocked();

-    MOCKABLE_VIRTUAL void waitUntilComplete(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep);
+    MOCKABLE_VIRTUAL void waitUntilComplete(uint32_t gpgpuTaskCountToWait, uint32_t bcsTaskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep);

    static uint32_t getTaskLevelFromWaitList(uint32_t taskLevel,
                                             cl_uint numEventsInWaitList,
@@ -299,6 +299,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
    }

    void updateBcsTaskCount(uint32_t newBcsTaskCount) { this->bcsTaskCount = newBcsTaskCount; }
+    uint32_t peekBcsTaskCount() const { return bcsTaskCount; }

    // taskCount of last task
    uint32_t taskCount = 0;
--- a/opencl/source/command_queue/cpu_data_transfer_handler.cpp
+++ b/opencl/source/command_queue/cpu_data_transfer_handler.cpp
@@ -145,7 +145,7 @@ void *CommandQueue::cpuDataTransferHandler(TransferProperties &transferPropertie

        if (outEventObj) {
            outEventObj->setEndTimeStamp();
-            outEventObj->updateTaskCount(this->taskCount);
+            outEventObj->updateTaskCount(this->taskCount, this->bcsTaskCount);
            outEventObj->flushStamp->replaceStampObject(this->flushStamp->getStampReference());
            if (eventCompleted) {
                outEventObj->setStatus(CL_COMPLETE);
--- a/opencl/source/command_queue/enqueue_common.h
+++ b/opencl/source/command_queue/enqueue_common.h
@@ -305,7 +305,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
                getGpgpuCommandStreamReceiver().setMediaVFEStateDirty(true);

                if (devQueueHw->getSchedulerReturnInstance() > 0) {
-                    waitUntilComplete(completionStamp.taskCount, completionStamp.flushStamp, false);
+                    waitUntilComplete(completionStamp.taskCount, bcsTaskCount, completionStamp.flushStamp, false);
                    this->runSchedulerSimulation(*devQueueHw, *parentKernel);
                }
            }
@@ -353,7 +353,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
    updateFromCompletionStamp(completionStamp);

    if (eventBuilder.getEvent()) {
-        eventBuilder.getEvent()->updateCompletionStamp(completionStamp.taskCount, completionStamp.taskLevel, completionStamp.flushStamp);
+        eventBuilder.getEvent()->updateCompletionStamp(completionStamp.taskCount, bcsTaskCount, completionStamp.taskLevel, completionStamp.flushStamp);
        FileLoggerInstance().log(DebugManager.flags.EventsDebugEnable.get(), "updateCompletionStamp Event", eventBuilder.getEvent(), "taskLevel", eventBuilder.getEvent()->taskLevel.load());
    }

@@ -382,9 +382,9 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
        if (blockQueue) {
            while (isQueueBlocked()) {
            }
-            waitUntilComplete(taskCount, flushStamp->peekStamp(), false);
+            waitUntilComplete(taskCount, bcsTaskCount, flushStamp->peekStamp(), false);
        } else {
-            waitUntilComplete(taskCount, flushStamp->peekStamp(), false);
+            waitUntilComplete(taskCount, bcsTaskCount, flushStamp->peekStamp(), false);
            if (printfHandler) {
                printfHandler->printEnqueueOutput();
            }
--- a/opencl/source/command_queue/finish.h
+++ b/opencl/source/command_queue/finish.h
@@ -27,7 +27,7 @@ cl_int CommandQueueHw<GfxFamily>::finish() {
    auto flushStampToWaitFor = this->flushStamp->peekStamp();

    // Stall until HW reaches CQ taskCount
-    waitUntilComplete(taskCountToWaitFor, flushStampToWaitFor, false);
+    waitUntilComplete(taskCountToWaitFor, this->bcsTaskCount, flushStampToWaitFor, false);

    return CL_SUCCESS;
 }
--- a/opencl/source/event/event.cpp
+++ b/opencl/source/event/event.cpp
@@ -224,8 +224,9 @@ uint32_t Event::getCompletionStamp() const {
    return this->taskCount;
 }

-void Event::updateCompletionStamp(uint32_t taskCount, uint32_t tasklevel, FlushStamp flushStamp) {
-    this->taskCount = taskCount;
+void Event::updateCompletionStamp(uint32_t gpgpuTaskCount, uint32_t bcsTaskCount, uint32_t tasklevel, FlushStamp flushStamp) {
+    this->taskCount = gpgpuTaskCount;
+    this->bcsTaskCount = bcsTaskCount;
    this->taskLevel = tasklevel;
    this->flushStamp->setStamp(flushStamp);
 }
@@ -370,7 +371,7 @@ inline bool Event::wait(bool blocking, bool useQuickKmdSleep) {
        }
    }

-    cmdQueue->waitUntilComplete(taskCount.load(), flushStamp->peekStamp(), useQuickKmdSleep);
+    cmdQueue->waitUntilComplete(taskCount.load(), this->bcsTaskCount, flushStamp->peekStamp(), useQuickKmdSleep);
    updateExecutionStatus();

    DEBUG_BREAK_IF(this->taskLevel == CompletionStamp::notReady && this->executionStatus >= 0);
@@ -510,11 +511,9 @@ void Event::transitionExecutionStatus(int32_t newExecutionStatus) const {
 void Event::submitCommand(bool abortTasks) {
    std::unique_ptr<Command> cmdToProcess(cmdToSubmit.exchange(nullptr));
    if (cmdToProcess.get() != nullptr) {
-        std::unique_lock<CommandStreamReceiver::MutexType> lockCSR;
-        if (this->cmdQueue) {
-            lockCSR = this->getCommandQueue()->getGpgpuCommandStreamReceiver().obtainUniqueOwnership();
-        }
-        if ((this->isProfilingEnabled()) && (this->cmdQueue != nullptr)) {
+        auto lockCSR = getCommandQueue()->getGpgpuCommandStreamReceiver().obtainUniqueOwnership();
+
+        if (this->isProfilingEnabled()) {
            if (timeStampNode) {
                this->cmdQueue->getGpgpuCommandStreamReceiver().makeResident(*timeStampNode->getBaseGraphicsAllocation());
                cmdToProcess->timestamp = timeStampNode;
@@ -530,10 +529,10 @@ void Event::submitCommand(bool abortTasks) {
            }
        }
        auto &complStamp = cmdToProcess->submit(taskLevel, abortTasks);
-        if (profilingCpuPath && this->isProfilingEnabled() && (this->cmdQueue != nullptr)) {
+        if (profilingCpuPath && this->isProfilingEnabled()) {
            setEndTimeStamp();
        }
-        updateTaskCount(complStamp.taskCount);
+        updateTaskCount(complStamp.taskCount, cmdQueue->peekBcsTaskCount());
        flushStamp->setStamp(complStamp.flushStamp);
        submittedCmd.exchange(cmdToProcess.release());
    } else if (profilingCpuPath && endTimeStamp == 0) {
@@ -543,7 +542,7 @@ void Event::submitCommand(bool abortTasks) {
        if (!this->isUserEvent() && this->eventWithoutCommand) {
            if (this->cmdQueue) {
                auto lockCSR = this->getCommandQueue()->getGpgpuCommandStreamReceiver().obtainUniqueOwnership();
-                updateTaskCount(this->cmdQueue->getGpgpuCommandStreamReceiver().peekTaskCount());
+                updateTaskCount(this->cmdQueue->getGpgpuCommandStreamReceiver().peekTaskCount(), cmdQueue->peekBcsTaskCount());
            }
        }
        //make sure that task count is synchronized for events with kernels
--- a/opencl/source/event/event.h
+++ b/opencl/source/event/event.h
@@ -89,7 +89,7 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
    ~Event() override;

    uint32_t getCompletionStamp(void) const;
-    void updateCompletionStamp(uint32_t taskCount, uint32_t tasklevel, FlushStamp flushStamp);
+    void updateCompletionStamp(uint32_t taskCount, uint32_t bcsTaskCount, uint32_t tasklevel, FlushStamp flushStamp);
    cl_ulong getDelta(cl_ulong startTime,
                      cl_ulong endTime);
    void setCPUProfilingPath(bool isCPUPath) { this->profilingCpuPath = isCPUPath; }
@@ -243,14 +243,15 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {

    virtual void unblockEventBy(Event &event, uint32_t taskLevel, int32_t transitionStatus);

-    void updateTaskCount(uint32_t taskCount) {
-        if (taskCount == CompletionStamp::notReady) {
+    void updateTaskCount(uint32_t gpgpuTaskCount, uint32_t bcsTaskCount) {
+        if (gpgpuTaskCount == CompletionStamp::notReady) {
            DEBUG_BREAK_IF(true);
            return;
        }

-        uint32_t prevTaskCount = this->taskCount.exchange(taskCount);
-        if ((prevTaskCount != CompletionStamp::notReady) && (prevTaskCount > taskCount)) {
+        this->bcsTaskCount = bcsTaskCount;
+        uint32_t prevTaskCount = this->taskCount.exchange(gpgpuTaskCount);
+        if ((prevTaskCount != CompletionStamp::notReady) && (prevTaskCount > gpgpuTaskCount)) {
            this->taskCount = prevTaskCount;
            DEBUG_BREAK_IF(true);
        }
@@ -363,6 +364,7 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
    uint64_t startTimeStamp;
    uint64_t endTimeStamp;
    uint64_t completeTimeStamp;
+    uint32_t bcsTaskCount = 0;
    bool perfCountersEnabled;
    TagNode<HwTimeStamps> *timeStampNode = nullptr;
    TagNode<HwPerfCounter> *perfCounterNode = nullptr;
--- a/opencl/source/helpers/task_information.cpp
+++ b/opencl/source/helpers/task_information.cpp
@@ -88,7 +88,7 @@ CompletionStamp &CommandMapUnmap::submit(uint32_t taskLevel, bool terminated) {
                                                      commandQueue.getDevice());

    if (!memObj.isMemObjZeroCopy()) {
-        commandQueue.waitUntilComplete(completionStamp.taskCount, completionStamp.flushStamp, false);
+        commandQueue.waitUntilComplete(completionStamp.taskCount, commandQueue.peekBcsTaskCount(), completionStamp.flushStamp, false);
        if (operationType == MAP) {
            memObj.transferDataToHostPtr(copySize, copyOffset);
        } else if (!readOnly) {
@@ -268,7 +268,7 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
    }

    if (printfHandler) {
-        commandQueue.waitUntilComplete(completionStamp.taskCount, completionStamp.flushStamp, false);
+        commandQueue.waitUntilComplete(completionStamp.taskCount, commandQueue.peekBcsTaskCount(), completionStamp.flushStamp, false);
        printfHandler.get()->printEnqueueOutput();
    }