Optimize BCS flushing scheme [2/n]

Change-Id: I6f1e0115b9c45f89afb86f8fd2304604243541df
Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
This commit is contained in:
Bartosz Dunajski
2020-06-25 11:35:29 +02:00
committed by sys_ocldev
parent d4b12c97d4
commit 86dc5bacc7
19 changed files with 291 additions and 134 deletions

View File

@@ -224,8 +224,9 @@ uint32_t Event::getCompletionStamp() const {
return this->taskCount;
}
// NOTE(review): this is a diff rendering with the +/- markers stripped. Going by the hunk
// header line counts (8 old lines, 9 new lines), the signature and assignment immediately
// below appear to be the removed (pre-change) lines - confirm against the real source file.
void Event::updateCompletionStamp(uint32_t taskCount, uint32_t tasklevel, FlushStamp flushStamp) {
this->taskCount = taskCount;
// Added (post-change) form: takes separate gpgpuTaskCount and bcsTaskCount arguments and
// stores them in taskCount and bcsTaskCount respectively.
void Event::updateCompletionStamp(uint32_t gpgpuTaskCount, uint32_t bcsTaskCount, uint32_t tasklevel, FlushStamp flushStamp) {
this->taskCount = gpgpuTaskCount;
this->bcsTaskCount = bcsTaskCount;
// Lines common to both versions: record the task level and forward the flush stamp value
// to the event's flushStamp object via setStamp().
this->taskLevel = tasklevel;
this->flushStamp->setStamp(flushStamp);
}
@@ -370,7 +371,7 @@ inline bool Event::wait(bool blocking, bool useQuickKmdSleep) {
}
}
cmdQueue->waitUntilComplete(taskCount.load(), flushStamp->peekStamp(), useQuickKmdSleep);
cmdQueue->waitUntilComplete(taskCount.load(), this->bcsTaskCount, flushStamp->peekStamp(), useQuickKmdSleep);
updateExecutionStatus();
DEBUG_BREAK_IF(this->taskLevel == CompletionStamp::notReady && this->executionStatus >= 0);
@@ -510,11 +511,9 @@ void Event::transitionExecutionStatus(int32_t newExecutionStatus) const {
void Event::submitCommand(bool abortTasks) {
std::unique_ptr<Command> cmdToProcess(cmdToSubmit.exchange(nullptr));
if (cmdToProcess.get() != nullptr) {
std::unique_lock<CommandStreamReceiver::MutexType> lockCSR;
if (this->cmdQueue) {
lockCSR = this->getCommandQueue()->getGpgpuCommandStreamReceiver().obtainUniqueOwnership();
}
if ((this->isProfilingEnabled()) && (this->cmdQueue != nullptr)) {
auto lockCSR = getCommandQueue()->getGpgpuCommandStreamReceiver().obtainUniqueOwnership();
if (this->isProfilingEnabled()) {
if (timeStampNode) {
this->cmdQueue->getGpgpuCommandStreamReceiver().makeResident(*timeStampNode->getBaseGraphicsAllocation());
cmdToProcess->timestamp = timeStampNode;
@@ -530,10 +529,10 @@ void Event::submitCommand(bool abortTasks) {
}
}
auto &complStamp = cmdToProcess->submit(taskLevel, abortTasks);
if (profilingCpuPath && this->isProfilingEnabled() && (this->cmdQueue != nullptr)) {
if (profilingCpuPath && this->isProfilingEnabled()) {
setEndTimeStamp();
}
updateTaskCount(complStamp.taskCount);
updateTaskCount(complStamp.taskCount, cmdQueue->peekBcsTaskCount());
flushStamp->setStamp(complStamp.flushStamp);
submittedCmd.exchange(cmdToProcess.release());
} else if (profilingCpuPath && endTimeStamp == 0) {
@@ -543,7 +542,7 @@ void Event::submitCommand(bool abortTasks) {
if (!this->isUserEvent() && this->eventWithoutCommand) {
if (this->cmdQueue) {
auto lockCSR = this->getCommandQueue()->getGpgpuCommandStreamReceiver().obtainUniqueOwnership();
updateTaskCount(this->cmdQueue->getGpgpuCommandStreamReceiver().peekTaskCount());
updateTaskCount(this->cmdQueue->getGpgpuCommandStreamReceiver().peekTaskCount(), cmdQueue->peekBcsTaskCount());
}
}
//make sure that task count is synchronized for events with kernels

View File

@@ -89,7 +89,7 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
~Event() override;
uint32_t getCompletionStamp(void) const;
void updateCompletionStamp(uint32_t taskCount, uint32_t tasklevel, FlushStamp flushStamp);
void updateCompletionStamp(uint32_t taskCount, uint32_t bcsTaskCount, uint32_t tasklevel, FlushStamp flushStamp);
cl_ulong getDelta(cl_ulong startTime,
cl_ulong endTime);
void setCPUProfilingPath(bool isCPUPath) { this->profilingCpuPath = isCPUPath; }
@@ -243,14 +243,15 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
virtual void unblockEventBy(Event &event, uint32_t taskLevel, int32_t transitionStatus);
void updateTaskCount(uint32_t taskCount) {
if (taskCount == CompletionStamp::notReady) {
void updateTaskCount(uint32_t gpgpuTaskCount, uint32_t bcsTaskCount) {
if (gpgpuTaskCount == CompletionStamp::notReady) {
DEBUG_BREAK_IF(true);
return;
}
uint32_t prevTaskCount = this->taskCount.exchange(taskCount);
if ((prevTaskCount != CompletionStamp::notReady) && (prevTaskCount > taskCount)) {
this->bcsTaskCount = bcsTaskCount;
uint32_t prevTaskCount = this->taskCount.exchange(gpgpuTaskCount);
if ((prevTaskCount != CompletionStamp::notReady) && (prevTaskCount > gpgpuTaskCount)) {
this->taskCount = prevTaskCount;
DEBUG_BREAK_IF(true);
}
@@ -363,6 +364,7 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
uint64_t startTimeStamp;
uint64_t endTimeStamp;
uint64_t completeTimeStamp;
uint32_t bcsTaskCount = 0;
bool perfCountersEnabled;
TagNode<HwTimeStamps> *timeStampNode = nullptr;
TagNode<HwPerfCounter> *perfCounterNode = nullptr;