performance: limit barrier usage in non-usm copies

Related-To: NEO-11501

Don't emit a barrier when a non-USM copy through staging
buffers on an out-of-order queue (OOQ) issued only a single transfer.
A barrier is not needed after a single transfer -
there is nothing to aggregate.

Signed-off-by: Szymon Morek <szymon.morek@intel.com>
This commit is contained in:
Szymon Morek
2024-07-02 13:41:00 +00:00
committed by Compute-Runtime-Automation
parent f171a22c0a
commit daeb6e897a
4 changed files with 42 additions and 6 deletions

View File

@@ -1517,13 +1517,21 @@ cl_int CommandQueue::enqueueStagingBufferMemcpy(cl_bool blockingCopy, void *dstP
profilingEvent.setQueueTimeStamp();
}
// If there was only one chunk copy, no barrier for OOQ is needed
bool isSingleTransfer = false;
auto chunkCopy = [&](void *chunkDst, void *stagingBuffer, const void *chunkSrc, size_t chunkSize) -> int32_t {
auto isFirstTransfer = (chunkDst == dstPtr);
auto isLastTransfer = ptrOffset(chunkDst, chunkSize) == ptrOffset(dstPtr, size);
isSingleTransfer = isFirstTransfer && isLastTransfer;
if (isFirstTransfer && isProfilingEnabled()) {
profilingEvent.setSubmitTimeStamp();
}
memcpy(stagingBuffer, chunkSrc, chunkSize);
if (isSingleTransfer) {
return this->enqueueSVMMemcpy(false, chunkDst, stagingBuffer, chunkSize, 0, nullptr, event);
}
if (isFirstTransfer && isProfilingEnabled()) {
profilingEvent.setStartTimeStamp();
}
@@ -1543,12 +1551,12 @@ cl_int CommandQueue::enqueueStagingBufferMemcpy(cl_bool blockingCopy, void *dstP
}
if (event != nullptr) {
if (this->isOOQEnabled()) {
if (!isSingleTransfer && this->isOOQEnabled()) {
ret = this->enqueueBarrierWithWaitList(0, nullptr, event);
}
if (isProfilingEnabled()) {
auto pEvent = castToObjectOrAbort<Event>(*event);
pEvent->copyTimestamps(profilingEvent);
pEvent->copyTimestamps(profilingEvent, !isSingleTransfer);
pEvent->setCPUProfilingPath(false);
}
}

View File

@@ -397,7 +397,7 @@ void Event::calculateProfilingDataInternal(uint64_t contextStartTS, uint64_t con
auto &device = this->cmdQueue->getDevice();
auto &gfxCoreHelper = device.getGfxCoreHelper();
auto resolution = device.getDeviceInfo().profilingTimerResolution;
if (timestampsCopied) {
if (isAdjustmentNeeded) {
// Adjust startTS since we calculate profiling based on other event timestamps
contextStartTS = startTimeStamp.gpuTimeStamp;
}

View File

@@ -312,12 +312,12 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
static void getBoundaryTimestampValues(TimestampPacketContainer *timestampContainer, uint64_t &globalStartTS, uint64_t &globalEndTS);
void copyTimestamps(const Event &srcEvent) {
void copyTimestamps(const Event &srcEvent, bool isAdjustmentNeeded) {
this->queueTimeStamp = srcEvent.queueTimeStamp;
this->submitTimeStamp = srcEvent.submitTimeStamp;
this->startTimeStamp = srcEvent.startTimeStamp;
this->endTimeStamp = srcEvent.endTimeStamp;
timestampsCopied = true;
this->isAdjustmentNeeded = isAdjustmentNeeded;
}
protected:
@@ -391,7 +391,7 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
bool profilingEnabled = false;
bool profilingCpuPath = false;
bool dataCalculated = false;
bool timestampsCopied = false;
bool isAdjustmentNeeded = false;
ProfilingInfo queueTimeStamp{};
ProfilingInfo submitTimeStamp{};