mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-03 23:03:02 +08:00
performance: limit barrier usage in non-usm copies
Related-To: NEO-11501. Don't emit a barrier when a non-USM copy through staging buffers on an out-of-order queue (OOQ) issued only a single transfer: with a single transfer there is nothing to aggregate, so no barrier is needed. Signed-off-by: Szymon Morek <szymon.morek@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
f171a22c0a
commit
daeb6e897a
@@ -1517,13 +1517,21 @@ cl_int CommandQueue::enqueueStagingBufferMemcpy(cl_bool blockingCopy, void *dstP
|
||||
profilingEvent.setQueueTimeStamp();
|
||||
}
|
||||
|
||||
// If there was only one chunk copy, no barrier for OOQ is needed
|
||||
bool isSingleTransfer = false;
|
||||
auto chunkCopy = [&](void *chunkDst, void *stagingBuffer, const void *chunkSrc, size_t chunkSize) -> int32_t {
|
||||
auto isFirstTransfer = (chunkDst == dstPtr);
|
||||
auto isLastTransfer = ptrOffset(chunkDst, chunkSize) == ptrOffset(dstPtr, size);
|
||||
isSingleTransfer = isFirstTransfer && isLastTransfer;
|
||||
|
||||
if (isFirstTransfer && isProfilingEnabled()) {
|
||||
profilingEvent.setSubmitTimeStamp();
|
||||
}
|
||||
memcpy(stagingBuffer, chunkSrc, chunkSize);
|
||||
if (isSingleTransfer) {
|
||||
return this->enqueueSVMMemcpy(false, chunkDst, stagingBuffer, chunkSize, 0, nullptr, event);
|
||||
}
|
||||
|
||||
if (isFirstTransfer && isProfilingEnabled()) {
|
||||
profilingEvent.setStartTimeStamp();
|
||||
}
|
||||
@@ -1543,12 +1551,12 @@ cl_int CommandQueue::enqueueStagingBufferMemcpy(cl_bool blockingCopy, void *dstP
|
||||
}
|
||||
|
||||
if (event != nullptr) {
|
||||
if (this->isOOQEnabled()) {
|
||||
if (!isSingleTransfer && this->isOOQEnabled()) {
|
||||
ret = this->enqueueBarrierWithWaitList(0, nullptr, event);
|
||||
}
|
||||
if (isProfilingEnabled()) {
|
||||
auto pEvent = castToObjectOrAbort<Event>(*event);
|
||||
pEvent->copyTimestamps(profilingEvent);
|
||||
pEvent->copyTimestamps(profilingEvent, !isSingleTransfer);
|
||||
pEvent->setCPUProfilingPath(false);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -397,7 +397,7 @@ void Event::calculateProfilingDataInternal(uint64_t contextStartTS, uint64_t con
|
||||
auto &device = this->cmdQueue->getDevice();
|
||||
auto &gfxCoreHelper = device.getGfxCoreHelper();
|
||||
auto resolution = device.getDeviceInfo().profilingTimerResolution;
|
||||
if (timestampsCopied) {
|
||||
if (isAdjustmentNeeded) {
|
||||
// Adjust startTS since we calculate profiling based on other event timestamps
|
||||
contextStartTS = startTimeStamp.gpuTimeStamp;
|
||||
}
|
||||
|
||||
@@ -312,12 +312,12 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
|
||||
|
||||
static void getBoundaryTimestampValues(TimestampPacketContainer *timestampContainer, uint64_t &globalStartTS, uint64_t &globalEndTS);
|
||||
|
||||
void copyTimestamps(const Event &srcEvent) {
|
||||
void copyTimestamps(const Event &srcEvent, bool isAdjustmentNeeded) {
|
||||
this->queueTimeStamp = srcEvent.queueTimeStamp;
|
||||
this->submitTimeStamp = srcEvent.submitTimeStamp;
|
||||
this->startTimeStamp = srcEvent.startTimeStamp;
|
||||
this->endTimeStamp = srcEvent.endTimeStamp;
|
||||
timestampsCopied = true;
|
||||
this->isAdjustmentNeeded = isAdjustmentNeeded;
|
||||
}
|
||||
|
||||
protected:
|
||||
@@ -391,7 +391,7 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
|
||||
bool profilingEnabled = false;
|
||||
bool profilingCpuPath = false;
|
||||
bool dataCalculated = false;
|
||||
bool timestampsCopied = false;
|
||||
bool isAdjustmentNeeded = false;
|
||||
|
||||
ProfilingInfo queueTimeStamp{};
|
||||
ProfilingInfo submitTimeStamp{};
|
||||
|
||||
Reference in New Issue
Block a user