mirror of
https://github.com/intel/compute-runtime.git
synced 2025-11-10 05:49:51 +08:00
Optimize BCS flushing scheme [2/n]
Change-Id: I6f1e0115b9c45f89afb86f8fd2304604243541df Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
This commit is contained in:
committed by
sys_ocldev
parent
d4b12c97d4
commit
86dc5bacc7
@@ -224,8 +224,9 @@ uint32_t Event::getCompletionStamp() const {
|
||||
return this->taskCount;
|
||||
}
|
||||
|
||||
void Event::updateCompletionStamp(uint32_t taskCount, uint32_t tasklevel, FlushStamp flushStamp) {
|
||||
this->taskCount = taskCount;
|
||||
void Event::updateCompletionStamp(uint32_t gpgpuTaskCount, uint32_t bcsTaskCount, uint32_t tasklevel, FlushStamp flushStamp) {
|
||||
this->taskCount = gpgpuTaskCount;
|
||||
this->bcsTaskCount = bcsTaskCount;
|
||||
this->taskLevel = tasklevel;
|
||||
this->flushStamp->setStamp(flushStamp);
|
||||
}
|
||||
@@ -370,7 +371,7 @@ inline bool Event::wait(bool blocking, bool useQuickKmdSleep) {
|
||||
}
|
||||
}
|
||||
|
||||
cmdQueue->waitUntilComplete(taskCount.load(), flushStamp->peekStamp(), useQuickKmdSleep);
|
||||
cmdQueue->waitUntilComplete(taskCount.load(), this->bcsTaskCount, flushStamp->peekStamp(), useQuickKmdSleep);
|
||||
updateExecutionStatus();
|
||||
|
||||
DEBUG_BREAK_IF(this->taskLevel == CompletionStamp::notReady && this->executionStatus >= 0);
|
||||
@@ -510,11 +511,9 @@ void Event::transitionExecutionStatus(int32_t newExecutionStatus) const {
|
||||
void Event::submitCommand(bool abortTasks) {
|
||||
std::unique_ptr<Command> cmdToProcess(cmdToSubmit.exchange(nullptr));
|
||||
if (cmdToProcess.get() != nullptr) {
|
||||
std::unique_lock<CommandStreamReceiver::MutexType> lockCSR;
|
||||
if (this->cmdQueue) {
|
||||
lockCSR = this->getCommandQueue()->getGpgpuCommandStreamReceiver().obtainUniqueOwnership();
|
||||
}
|
||||
if ((this->isProfilingEnabled()) && (this->cmdQueue != nullptr)) {
|
||||
auto lockCSR = getCommandQueue()->getGpgpuCommandStreamReceiver().obtainUniqueOwnership();
|
||||
|
||||
if (this->isProfilingEnabled()) {
|
||||
if (timeStampNode) {
|
||||
this->cmdQueue->getGpgpuCommandStreamReceiver().makeResident(*timeStampNode->getBaseGraphicsAllocation());
|
||||
cmdToProcess->timestamp = timeStampNode;
|
||||
@@ -530,10 +529,10 @@ void Event::submitCommand(bool abortTasks) {
|
||||
}
|
||||
}
|
||||
auto &complStamp = cmdToProcess->submit(taskLevel, abortTasks);
|
||||
if (profilingCpuPath && this->isProfilingEnabled() && (this->cmdQueue != nullptr)) {
|
||||
if (profilingCpuPath && this->isProfilingEnabled()) {
|
||||
setEndTimeStamp();
|
||||
}
|
||||
updateTaskCount(complStamp.taskCount);
|
||||
updateTaskCount(complStamp.taskCount, cmdQueue->peekBcsTaskCount());
|
||||
flushStamp->setStamp(complStamp.flushStamp);
|
||||
submittedCmd.exchange(cmdToProcess.release());
|
||||
} else if (profilingCpuPath && endTimeStamp == 0) {
|
||||
@@ -543,7 +542,7 @@ void Event::submitCommand(bool abortTasks) {
|
||||
if (!this->isUserEvent() && this->eventWithoutCommand) {
|
||||
if (this->cmdQueue) {
|
||||
auto lockCSR = this->getCommandQueue()->getGpgpuCommandStreamReceiver().obtainUniqueOwnership();
|
||||
updateTaskCount(this->cmdQueue->getGpgpuCommandStreamReceiver().peekTaskCount());
|
||||
updateTaskCount(this->cmdQueue->getGpgpuCommandStreamReceiver().peekTaskCount(), cmdQueue->peekBcsTaskCount());
|
||||
}
|
||||
}
|
||||
//make sure that task count is synchronized for events with kernels
|
||||
|
||||
@@ -89,7 +89,7 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
|
||||
~Event() override;
|
||||
|
||||
uint32_t getCompletionStamp(void) const;
|
||||
void updateCompletionStamp(uint32_t taskCount, uint32_t tasklevel, FlushStamp flushStamp);
|
||||
void updateCompletionStamp(uint32_t taskCount, uint32_t bcsTaskCount, uint32_t tasklevel, FlushStamp flushStamp);
|
||||
cl_ulong getDelta(cl_ulong startTime,
|
||||
cl_ulong endTime);
|
||||
void setCPUProfilingPath(bool isCPUPath) { this->profilingCpuPath = isCPUPath; }
|
||||
@@ -243,14 +243,15 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
|
||||
|
||||
virtual void unblockEventBy(Event &event, uint32_t taskLevel, int32_t transitionStatus);
|
||||
|
||||
void updateTaskCount(uint32_t taskCount) {
|
||||
if (taskCount == CompletionStamp::notReady) {
|
||||
void updateTaskCount(uint32_t gpgpuTaskCount, uint32_t bcsTaskCount) {
|
||||
if (gpgpuTaskCount == CompletionStamp::notReady) {
|
||||
DEBUG_BREAK_IF(true);
|
||||
return;
|
||||
}
|
||||
|
||||
uint32_t prevTaskCount = this->taskCount.exchange(taskCount);
|
||||
if ((prevTaskCount != CompletionStamp::notReady) && (prevTaskCount > taskCount)) {
|
||||
this->bcsTaskCount = bcsTaskCount;
|
||||
uint32_t prevTaskCount = this->taskCount.exchange(gpgpuTaskCount);
|
||||
if ((prevTaskCount != CompletionStamp::notReady) && (prevTaskCount > gpgpuTaskCount)) {
|
||||
this->taskCount = prevTaskCount;
|
||||
DEBUG_BREAK_IF(true);
|
||||
}
|
||||
@@ -363,6 +364,7 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
|
||||
uint64_t startTimeStamp;
|
||||
uint64_t endTimeStamp;
|
||||
uint64_t completeTimeStamp;
|
||||
uint32_t bcsTaskCount = 0;
|
||||
bool perfCountersEnabled;
|
||||
TagNode<HwTimeStamps> *timeStampNode = nullptr;
|
||||
TagNode<HwPerfCounter> *perfCounterNode = nullptr;
|
||||
|
||||
Reference in New Issue
Block a user