diff --git a/opencl/source/command_queue/command_queue.cpp b/opencl/source/command_queue/command_queue.cpp
index 91eea51729..161c2bf446 100644
--- a/opencl/source/command_queue/command_queue.cpp
+++ b/opencl/source/command_queue/command_queue.cpp
@@ -1517,13 +1517,21 @@ cl_int CommandQueue::enqueueStagingBufferMemcpy(cl_bool blockingCopy, void *dstP
         profilingEvent.setQueueTimeStamp();
     }
 
+    // If there was only one chunk copy, no barrier for OOQ is needed
+    bool isSingleTransfer = false;
     auto chunkCopy = [&](void *chunkDst, void *stagingBuffer, const void *chunkSrc, size_t chunkSize) -> int32_t {
         auto isFirstTransfer = (chunkDst == dstPtr);
         auto isLastTransfer = ptrOffset(chunkDst, chunkSize) == ptrOffset(dstPtr, size);
+        isSingleTransfer = isFirstTransfer && isLastTransfer;
+
         if (isFirstTransfer && isProfilingEnabled()) {
             profilingEvent.setSubmitTimeStamp();
         }
         memcpy(stagingBuffer, chunkSrc, chunkSize);
+        if (isSingleTransfer) {
+            return this->enqueueSVMMemcpy(false, chunkDst, stagingBuffer, chunkSize, 0, nullptr, event);
+        }
+
         if (isFirstTransfer && isProfilingEnabled()) {
             profilingEvent.setStartTimeStamp();
         }
@@ -1543,12 +1551,12 @@ cl_int CommandQueue::enqueueStagingBufferMemcpy(cl_bool blockingCopy, void *dstP
     }
 
     if (event != nullptr) {
-        if (this->isOOQEnabled()) {
+        if (!isSingleTransfer && this->isOOQEnabled()) {
             ret = this->enqueueBarrierWithWaitList(0, nullptr, event);
         }
         if (isProfilingEnabled()) {
             auto pEvent = castToObjectOrAbort<Event>(*event);
-            pEvent->copyTimestamps(profilingEvent);
+            pEvent->copyTimestamps(profilingEvent, !isSingleTransfer);
             pEvent->setCPUProfilingPath(false);
         }
     }
diff --git a/opencl/source/event/event.cpp b/opencl/source/event/event.cpp
index 236a3150a0..bb32a9c62b 100644
--- a/opencl/source/event/event.cpp
+++ b/opencl/source/event/event.cpp
@@ -397,7 +397,7 @@ void Event::calculateProfilingDataInternal(uint64_t contextStartTS, uint64_t con
     auto &device = this->cmdQueue->getDevice();
     auto &gfxCoreHelper = device.getGfxCoreHelper();
     auto resolution = device.getDeviceInfo().profilingTimerResolution;
-    if (timestampsCopied) {
+    if (isAdjustmentNeeded) {
         // Adjust startTS since we calculate profiling based on other event timestamps
         contextStartTS = startTimeStamp.gpuTimeStamp;
     }
diff --git a/opencl/source/event/event.h b/opencl/source/event/event.h
index 2731f7a727..53f9927252 100644
--- a/opencl/source/event/event.h
+++ b/opencl/source/event/event.h
@@ -312,12 +312,12 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
 
     static void getBoundaryTimestampValues(TimestampPacketContainer *timestampContainer, uint64_t &globalStartTS, uint64_t &globalEndTS);
 
-    void copyTimestamps(const Event &srcEvent) {
+    void copyTimestamps(const Event &srcEvent, bool isAdjustmentNeeded) {
         this->queueTimeStamp = srcEvent.queueTimeStamp;
         this->submitTimeStamp = srcEvent.submitTimeStamp;
         this->startTimeStamp = srcEvent.startTimeStamp;
         this->endTimeStamp = srcEvent.endTimeStamp;
-        timestampsCopied = true;
+        this->isAdjustmentNeeded = isAdjustmentNeeded;
     }
 
   protected:
@@ -391,7 +391,7 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
     bool profilingEnabled = false;
     bool profilingCpuPath = false;
     bool dataCalculated = false;
-    bool timestampsCopied = false;
+    bool isAdjustmentNeeded = false;
 
     ProfilingInfo queueTimeStamp{};
     ProfilingInfo submitTimeStamp{};
diff --git a/opencl/test/unit_test/command_queue/enqueue_svm_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_svm_tests.cpp
index a4c554e6b8..224ec46989 100644
--- a/opencl/test/unit_test/command_queue/enqueue_svm_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_svm_tests.cpp
@@ -2456,6 +2456,34 @@ HWTEST_F(StagingBufferTest, givenOutOfOrderCmdQueueWhenEnqueueStagingBufferMemcp
     clReleaseEvent(event);
 }
 
+HWTEST_F(StagingBufferTest, givenOutOfOrderCmdQueueWhenEnqueueStagingBufferMemcpyNonBlockingWithSingleTransferThenNoBarrierEnqueued) {
+    constexpr cl_command_type expectedLastCmd = CL_COMMAND_SVM_MEMCPY;
+
+    cl_event event;
+    MockCommandQueueHw<FamilyType> myCmdQ(context, pClDevice, 0);
+    myCmdQ.setOoqEnabled();
+
+    auto initialUsmAllocs = svmManager->getNumAllocs();
+    retVal = myCmdQ.enqueueStagingBufferMemcpy(
+        false,             // cl_bool blocking_copy
+        dstPtr,            // void *dst_ptr
+        srcPtr,            // const void *src_ptr
+        stagingBufferSize, // size_t size
+        &event             // cl_event *event
+    );
+
+    auto pEvent = (Event *)event;
+    auto numOfStagingBuffers = svmManager->getNumAllocs() - initialUsmAllocs;
+    EXPECT_EQ(CL_SUCCESS, retVal);
+    EXPECT_EQ(1u, numOfStagingBuffers);
+    EXPECT_EQ(1u, myCmdQ.enqueueSVMMemcpyCalledCount);
+    EXPECT_EQ(0u, myCmdQ.finishCalledCount);
+    EXPECT_EQ(expectedLastCmd, myCmdQ.lastCommandType);
+    EXPECT_EQ(expectedLastCmd, pEvent->getCommandType());
+
+    clReleaseEvent(event);
+}
+
 HWTEST_F(StagingBufferTest, givenEnqueueStagingBufferMemcpyWhenTaskCountNotReadyThenCopySucessfullAndBuffersNotReused) {
     MockCommandQueueHw<FamilyType> myCmdQ(context, pClDevice, 0);
     auto initialUsmAllocs = svmManager->getNumAllocs();