diff --git a/opencl/source/api/api.cpp b/opencl/source/api/api.cpp index ee85f9427e..f39ae56b68 100644 --- a/opencl/source/api/api.cpp +++ b/opencl/source/api/api.cpp @@ -2964,19 +2964,22 @@ cl_int CL_API_CALL clEnqueueWriteImage(cl_command_queue commandQueue, TRACING_EXIT(ClEnqueueWriteImage, &retVal); return retVal; } - - retVal = pCommandQueue->enqueueWriteImage( - pImage, - blockingWrite, - origin, - region, - inputRowPitch, - inputSlicePitch, - ptr, - nullptr, - numEventsInWaitList, - eventWaitList, - event); + if (pCommandQueue->isValidForStagingWriteImage(pImage, ptr, numEventsInWaitList > 0)) { + retVal = pCommandQueue->enqueueStagingWriteImage(pImage, blockingWrite, origin, region, inputRowPitch, inputSlicePitch, ptr, event); + } else { + retVal = pCommandQueue->enqueueWriteImage( + pImage, + blockingWrite, + origin, + region, + inputRowPitch, + inputSlicePitch, + ptr, + nullptr, + numEventsInWaitList, + eventWaitList, + event); + } } DBG_LOG_INPUTS("event", getClFileLogger().getEvents(reinterpret_cast(event), 1u)); TRACING_EXIT(ClEnqueueWriteImage, &retVal); diff --git a/opencl/source/command_queue/command_queue.cpp b/opencl/source/command_queue/command_queue.cpp index 219b4cce5c..dc3b887392 100644 --- a/opencl/source/command_queue/command_queue.cpp +++ b/opencl/source/command_queue/command_queue.cpp @@ -1569,7 +1569,7 @@ cl_int CommandQueue::enqueueStagingBufferMemcpy(cl_bool blockingCopy, void *dstP // If there was only one chunk copy, no barrier for OOQ is needed bool isSingleTransfer = false; - ChunkCopyFunction chunkCopy = [&](void *chunkDst, void *stagingBuffer, const void *chunkSrc, size_t chunkSize) -> int32_t { + ChunkCopyFunction chunkCopy = [&](void *stagingBuffer, size_t chunkSize, void *chunkDst, const void *chunkSrc) -> int32_t { auto isFirstTransfer = (chunkDst == dstPtr); auto isLastTransfer = ptrOffset(chunkDst, chunkSize) == ptrOffset(dstPtr, size); isSingleTransfer = isFirstTransfer && isLastTransfer; @@ -1599,19 +1599,71 @@ cl_int 
CommandQueue::enqueueStagingBufferMemcpy(cl_bool blockingCopy, void *dstP if (ret != CL_SUCCESS) { return ret; } + return postStagingTransferSync(event, profilingEvent, isSingleTransfer, blockingCopy); +} +cl_int CommandQueue::enqueueStagingWriteImage(Image *dstImage, cl_bool blockingCopy, const size_t *globalOrigin, const size_t *globalRegion, + size_t inputRowPitch, size_t inputSlicePitch, const void *ptr, cl_event *event) { + constexpr cl_command_type cmdType = CL_COMMAND_WRITE_IMAGE; + CsrSelectionArgs csrSelectionArgs{cmdType, nullptr, dstImage, this->getDevice().getRootDeviceIndex(), globalRegion, nullptr, globalOrigin}; + auto csr = &selectCsrForBuiltinOperation(csrSelectionArgs); + + Event profilingEvent{this, CL_COMMAND_WRITE_IMAGE, CompletionStamp::notReady, CompletionStamp::notReady}; + if (isProfilingEnabled()) { + profilingEvent.setQueueTimeStamp(); + } + + // If there was only one chunk write, no barrier for OOQ is needed + bool isSingleTransfer = false; + ChunkWriteImageFunc chunkWrite = [&](void *stagingBuffer, size_t bufferSize, const void *chunkPtr, const size_t *origin, const size_t *region) -> int32_t { + auto isFirstTransfer = (globalOrigin[1] == origin[1]); + auto isLastTransfer = (globalOrigin[1] + globalRegion[1] == origin[1] + region[1]); + isSingleTransfer = isFirstTransfer && isLastTransfer; + + if (isFirstTransfer && isProfilingEnabled()) { + profilingEvent.setSubmitTimeStamp(); + } + memcpy(stagingBuffer, chunkPtr, bufferSize); + if (isSingleTransfer) { + return this->enqueueWriteImage(dstImage, false, origin, region, inputRowPitch, inputSlicePitch, stagingBuffer, nullptr, 0, nullptr, event); + } + + if (isFirstTransfer && isProfilingEnabled()) { + profilingEvent.setStartTimeStamp(); + } + + cl_event *outEvent = nullptr; + if (isLastTransfer && !this->isOOQEnabled()) { + outEvent = event; + } + auto ret = this->enqueueWriteImage(dstImage, false, origin, region, inputRowPitch, inputSlicePitch, stagingBuffer, nullptr, 0, nullptr, 
outEvent); + return ret; + }; + auto bytesPerPixel = dstImage->getSurfaceFormatInfo().surfaceFormat.imageElementSizeInBytes; + auto dstRowPitch = inputRowPitch ? inputRowPitch : globalRegion[0] * bytesPerPixel; + auto stagingBufferManager = this->context->getStagingBufferManager(); + auto ret = stagingBufferManager->performImageWrite(ptr, globalOrigin, globalRegion, dstRowPitch, chunkWrite, csr); + if (ret != CL_SUCCESS) { + return ret; + } + return postStagingTransferSync(event, profilingEvent, isSingleTransfer, blockingCopy); +} + +cl_int CommandQueue::postStagingTransferSync(cl_event *event, const Event &profilingEvent, bool isSingleTransfer, bool isBlocking) { + cl_int ret = CL_SUCCESS; if (event != nullptr) { if (!isSingleTransfer && this->isOOQEnabled()) { ret = this->enqueueBarrierWithWaitList(0, nullptr, event); } + auto pEvent = castToObjectOrAbort(*event); if (isProfilingEnabled()) { - auto pEvent = castToObjectOrAbort(*event); pEvent->copyTimestamps(profilingEvent, !isSingleTransfer); pEvent->setCPUProfilingPath(false); } + pEvent->setCmdType(profilingEvent.getCommandType()); } - if (blockingCopy) { + if (isBlocking) { ret = this->finish(); } return ret; @@ -1633,12 +1685,18 @@ bool CommandQueue::isValidForStagingBufferCopy(Device &device, void *dstPtr, con return stagingBufferManager->isValidForCopy(device, dstPtr, srcPtr, size, hasDependencies, osContextId); } -bool CommandQueue::isValidForStagingWriteImage(size_t size) { +bool CommandQueue::isValidForStagingWriteImage(Image *image, const void *ptr, bool hasDependencies) { auto stagingBufferManager = context->getStagingBufferManager(); if (!stagingBufferManager) { return false; } - return stagingBufferManager->isValidForStagingWriteImage(this->getDevice(), size); + switch (image->getImageDesc().image_type) { + case CL_MEM_OBJECT_IMAGE1D: + case CL_MEM_OBJECT_IMAGE2D: + return stagingBufferManager->isValidForStagingWriteImage(this->getDevice(), ptr, hasDependencies); + default: + return false; + } } } 
// namespace NEO diff --git a/opencl/source/command_queue/command_queue.h b/opencl/source/command_queue/command_queue.h index 8c6abd7e56..e05af5de14 100644 --- a/opencl/source/command_queue/command_queue.h +++ b/opencl/source/command_queue/command_queue.h @@ -389,8 +389,10 @@ class CommandQueue : public BaseObject<_cl_command_queue> { bool isBcs() const { return isCopyOnly; }; cl_int enqueueStagingBufferMemcpy(cl_bool blockingCopy, void *dstPtr, const void *srcPtr, size_t size, cl_event *event); + cl_int enqueueStagingWriteImage(Image *dstImage, cl_bool blockingCopy, const size_t *globalOrigin, const size_t *globalRegion, + size_t inputRowPitch, size_t inputSlicePitch, const void *ptr, cl_event *event); bool isValidForStagingBufferCopy(Device &device, void *dstPtr, const void *srcPtr, size_t size, bool hasDependencies); - bool isValidForStagingWriteImage(size_t size); + bool isValidForStagingWriteImage(Image *image, const void *ptr, bool hasDependencies); protected: void *enqueueReadMemObjForMap(TransferProperties &transferProperties, EventsRequest &eventsRequest, cl_int &errcodeRet); @@ -434,6 +436,8 @@ class CommandQueue : public BaseObject<_cl_command_queue> { void unregisterGpgpuAndBcsCsrClients(); + cl_int postStagingTransferSync(cl_event *event, const Event &profilingEvent, bool isSingleTransfer, bool isBlocking); + Context *context = nullptr; ClDevice *device = nullptr; mutable EngineControl *gpgpuEngine = nullptr; diff --git a/opencl/source/command_queue/enqueue_write_image.h b/opencl/source/command_queue/enqueue_write_image.h index 1c8765ad70..265e532a57 100644 --- a/opencl/source/command_queue/enqueue_write_image.h +++ b/opencl/source/command_queue/enqueue_write_image.h @@ -62,7 +62,6 @@ cl_int CommandQueueHw::enqueueWriteImage( auto bcsSplit = this->isSplitEnqueueBlitNeeded(csrSelectionArgs.direction, getTotalSizeFromRectRegion(region), csr); - StagingBufferTracker stagingBufferTracker{}; if (!mapAllocation) { InternalMemoryType memoryType = 
InternalMemoryType::notSpecified; bool isCpuCopyAllowed = false; @@ -71,20 +70,6 @@ cl_int CommandQueueHw::enqueueWriteImage( return retVal; } - if (!mapAllocation && this->isValidForStagingWriteImage(hostPtrSize)) { - auto allocatedSize = hostPtrSize; - auto [heapAllocator, stagingBuffer] = getContext().getStagingBufferManager()->requestStagingBuffer(allocatedSize, &csr); - auto stagingBufferPtr = addrToPtr(stagingBuffer); - if (stagingBufferPtr != nullptr) { - stagingBufferTracker = StagingBufferTracker{heapAllocator, stagingBuffer, allocatedSize, 0}; - memcpy(stagingBufferPtr, srcPtr, hostPtrSize); - srcPtr = stagingBufferPtr; - - mapAllocation = getContext().getSVMAllocsManager()->getSVMAlloc(srcPtr)->gpuAllocations.getGraphicsAllocation(device->getRootDeviceIndex()); - UNRECOVERABLE_IF(mapAllocation == nullptr); - } - } - if (mapAllocation) { mapAllocation->setAubWritable(true, GraphicsAllocation::defaultBank); mapAllocation->setTbxWritable(true, GraphicsAllocation::defaultBank); @@ -134,10 +119,6 @@ cl_int CommandQueueHw::enqueueWriteImage( MultiDispatchInfo dispatchInfo(dc); const auto dispatchResult = dispatchBcsOrGpgpuEnqueue(dispatchInfo, surfaces, eBuiltInOps, numEventsInWaitList, eventWaitList, event, blockingWrite == CL_TRUE, csr); - if (stagingBufferTracker.chunkAddress != 0) { - stagingBufferTracker.taskCountToWait = csr.peekTaskCount(); - getContext().getStagingBufferManager()->trackChunk(stagingBufferTracker); - } if (dispatchResult != CL_SUCCESS) { return dispatchResult; diff --git a/opencl/source/event/event.h b/opencl/source/event/event.h index 53f9927252..ed11bb6a53 100644 --- a/opencl/source/event/event.h +++ b/opencl/source/event/event.h @@ -243,7 +243,7 @@ class Event : public BaseObject<_cl_event>, public IDNode { return cmdQueue; } - cl_command_type getCommandType() { + cl_command_type getCommandType() const { return cmdType; } diff --git a/opencl/test/unit_test/command_queue/enqueue_svm_tests.cpp 
b/opencl/test/unit_test/command_queue/enqueue_svm_tests.cpp index 1e5ab20eac..28e783b4c6 100644 --- a/opencl/test/unit_test/command_queue/enqueue_svm_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_svm_tests.cpp @@ -2431,8 +2431,6 @@ HWTEST_F(StagingBufferTest, givenInOrderCmdQueueWhenEnqueueStagingBufferMemcpyNo } HWTEST_F(StagingBufferTest, givenOutOfOrderCmdQueueWhenEnqueueStagingBufferMemcpyNonBlockingThenCopySucessfull) { - constexpr cl_command_type expectedLastCmd = CL_COMMAND_BARRIER; - cl_event event; MockCommandQueueHw myCmdQ(context, pClDevice, 0); myCmdQ.setOoqEnabled(); @@ -2452,8 +2450,8 @@ HWTEST_F(StagingBufferTest, givenOutOfOrderCmdQueueWhenEnqueueStagingBufferMemcp EXPECT_EQ(1u, numOfStagingBuffers); EXPECT_EQ(expectedNumOfCopies, myCmdQ.enqueueSVMMemcpyCalledCount); EXPECT_EQ(0u, myCmdQ.finishCalledCount); - EXPECT_EQ(expectedLastCmd, myCmdQ.lastCommandType); - EXPECT_EQ(expectedLastCmd, pEvent->getCommandType()); + EXPECT_EQ(static_cast(CL_COMMAND_BARRIER), myCmdQ.lastCommandType); + EXPECT_EQ(static_cast(CL_COMMAND_SVM_MEMCPY), pEvent->getCommandType()); clReleaseEvent(event); } diff --git a/opencl/test/unit_test/command_queue/enqueue_write_image_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_write_image_tests.cpp index 7a51e57660..deda7d6812 100644 --- a/opencl/test/unit_test/command_queue/enqueue_write_image_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_write_image_tests.cpp @@ -234,17 +234,13 @@ HWTEST_F(EnqueueWriteImageTest, GivenImage1DarrayWhenReadWriteImageIsCalledThenH EnqueueWriteImageHelper<>::enqueueWriteImage(pCmdQ, dstImage2.get(), CL_FALSE, origin, region); auto &csr = pCmdQ->getGpgpuCommandStreamReceiver(); - if (!pCmdQ->isValidForStagingWriteImage(imageSize)) { - auto temporaryAllocation1 = csr.getTemporaryAllocations().peekHead(); - ASSERT_NE(nullptr, temporaryAllocation1); - EXPECT_EQ(temporaryAllocation1->getUnderlyingBufferSize(), imageSize); - } + auto temporaryAllocation1 = 
csr.getTemporaryAllocations().peekHead(); + ASSERT_NE(nullptr, temporaryAllocation1); + EXPECT_EQ(temporaryAllocation1->getUnderlyingBufferSize(), imageSize); EnqueueReadImageHelper<>::enqueueReadImage(pCmdQ, dstImage2.get(), CL_FALSE, origin, region); - auto temporaryAllocation2 = csr.getTemporaryAllocations().peekHead(); - if (!pCmdQ->isValidForStagingWriteImage(imageSize)) { - temporaryAllocation2 = temporaryAllocation2->next; - } + auto temporaryAllocation2 = temporaryAllocation1->next; + ASSERT_NE(nullptr, temporaryAllocation2); EXPECT_EQ(temporaryAllocation2->getUnderlyingBufferSize(), imageSize); } @@ -299,17 +295,12 @@ HWTEST_F(EnqueueWriteImageTest, GivenImage2DarrayWhenReadWriteImageIsCalledThenH auto &csr = pCmdQ->getGpgpuCommandStreamReceiver(); - if (!pCmdQ->isValidForStagingWriteImage(imageSize)) { - auto temporaryAllocation1 = csr.getTemporaryAllocations().peekHead(); - ASSERT_NE(nullptr, temporaryAllocation1); - EXPECT_EQ(temporaryAllocation1->getUnderlyingBufferSize(), imageSize); - } + auto temporaryAllocation1 = csr.getTemporaryAllocations().peekHead(); + ASSERT_NE(nullptr, temporaryAllocation1); + EXPECT_EQ(temporaryAllocation1->getUnderlyingBufferSize(), imageSize); EnqueueReadImageHelper<>::enqueueReadImage(pCmdQ, dstImage.get(), CL_FALSE, origin, region); - auto temporaryAllocation2 = csr.getTemporaryAllocations().peekHead(); - if (!pCmdQ->isValidForStagingWriteImage(imageSize)) { - temporaryAllocation2 = temporaryAllocation2->next; - } + auto temporaryAllocation2 = temporaryAllocation1->next; ASSERT_NE(nullptr, temporaryAllocation2); EXPECT_EQ(temporaryAllocation2->getUnderlyingBufferSize(), imageSize); } @@ -810,52 +801,129 @@ HWTEST_F(EnqueueWriteImageTest, whenEnqueueWriteImageWithUsmPtrAndSizeLowerThanR svmManager->freeSVMAlloc(usmPtr); } -HWTEST_F(EnqueueWriteImageTest, whenEnqueueWriteImageWithStagingCopyEnabledThenDontImportAllocation) { +HWTEST_F(EnqueueWriteImageTest, whenIsValidForStagingWriteImageCalledThenReturnCorrectValue) { 
bool svmSupported = pDevice->getHardwareInfo().capabilityTable.ftrSvm; if (!svmSupported) { GTEST_SKIP(); } - DebugManagerStateRestore restorer{}; - debugManager.flags.EnableCopyWithStagingBuffers.set(1); - auto res = EnqueueWriteImageHelper<>::enqueueWriteImage(pCmdQ, dstImage, CL_FALSE, - EnqueueWriteImageTraits::origin, - EnqueueWriteImageTraits::region, - EnqueueWriteImageTraits::rowPitch, - EnqueueWriteImageTraits::slicePitch, - EnqueueWriteImageTraits::hostPtr, - nullptr, - 0u, - nullptr, - nullptr); + unsigned char ptr[16]; + + std::unique_ptr image(Image1dHelper<>::create(context)); + EXPECT_FALSE(pCmdQ->isValidForStagingWriteImage(image.get(), ptr, false)); + + image.reset(Image2dHelper<>::create(context)); + EXPECT_FALSE(pCmdQ->isValidForStagingWriteImage(image.get(), ptr, false)); + + image.reset(Image3dHelper<>::create(context)); + EXPECT_FALSE(pCmdQ->isValidForStagingWriteImage(image.get(), ptr, false)); +} + +struct WriteImageStagingBufferTest : public EnqueueWriteImageTest { + void SetUp() override { + REQUIRE_SVM_OR_SKIP(defaultHwInfo); + EnqueueWriteImageTest::SetUp(); + ptr = new unsigned char[writeSize]; + device.reset(new MockClDevice{MockClDevice::createWithNewExecutionEnvironment(nullptr)}); + } + + void TearDown() override { + if (defaultHwInfo->capabilityTable.ftrSvm == false) { + return; + } + delete[] ptr; + EnqueueWriteImageTest::TearDown(); + } + + static constexpr size_t stagingBufferSize = MemoryConstants::megaByte * 2; + static constexpr size_t writeSize = stagingBufferSize * 4; + unsigned char *ptr; + size_t origin[3] = {0, 0, 0}; + size_t region[3] = {4, 8, 1}; + std::unique_ptr device; + cl_queue_properties props = {}; +}; + +HWTEST_F(WriteImageStagingBufferTest, whenEnqueueStagingWriteImageCalledThenReturnSuccess) { + MockCommandQueueHw mockCommandQueueHw(context, device.get(), &props); + auto res = mockCommandQueueHw.enqueueStagingWriteImage(dstImage, false, origin, region, MemoryConstants::megaByte, MemoryConstants::megaByte, 
ptr, nullptr); + EXPECT_EQ(res, CL_SUCCESS); - pCmdQ->finish(); + EXPECT_EQ(4ul, mockCommandQueueHw.enqueueWriteImageCounter); auto &csr = pDevice->getUltCommandStreamReceiver(); EXPECT_EQ(0u, csr.createAllocationForHostSurfaceCalled); } -HWTEST_F(EnqueueWriteImageTest, whenEnqueueWriteImageWithStagingCopyEnabledAndStagingBufferFailedThenImportAllocation) { - bool svmSupported = pDevice->getHardwareInfo().capabilityTable.ftrSvm; - if (!svmSupported) { - GTEST_SKIP(); - } - DebugManagerStateRestore restorer{}; - debugManager.flags.EnableCopyWithStagingBuffers.set(1); - auto memoryManager = static_cast(pDevice->getMemoryManager()); - memoryManager->isMockHostMemoryManager = true; - memoryManager->forceFailureInPrimaryAllocation = true; - memoryManager->singleFailureInPrimaryAllocation = true; - auto res = EnqueueWriteImageHelper<>::enqueueWriteImage(pCmdQ, dstImage, CL_FALSE, - EnqueueWriteImageTraits::origin, - EnqueueWriteImageTraits::region, - EnqueueWriteImageTraits::rowPitch, - EnqueueWriteImageTraits::slicePitch, - EnqueueWriteImageTraits::hostPtr, - nullptr, - 0u, - nullptr, - nullptr); +HWTEST_F(WriteImageStagingBufferTest, whenBlockingEnqueueStagingWriteImageCalledThenFinishCalled) { + MockCommandQueueHw mockCommandQueueHw(context, device.get(), &props); + auto res = mockCommandQueueHw.enqueueStagingWriteImage(dstImage, true, origin, region, MemoryConstants::megaByte, MemoryConstants::megaByte, ptr, nullptr); + EXPECT_EQ(res, CL_SUCCESS); - pCmdQ->finish(); - auto &csr = pDevice->getUltCommandStreamReceiver(); - EXPECT_EQ(1u, csr.createAllocationForHostSurfaceCalled); + EXPECT_EQ(1u, mockCommandQueueHw.finishCalledCount); } + +HWTEST_F(WriteImageStagingBufferTest, whenEnqueueStagingWriteImageCalledWithEventThenReturnValidEvent) { + constexpr cl_command_type expectedLastCmd = CL_COMMAND_WRITE_IMAGE; + MockCommandQueueHw mockCommandQueueHw(context, device.get(), &props); + cl_event event; + auto res = mockCommandQueueHw.enqueueStagingWriteImage(dstImage, 
false, origin, region, MemoryConstants::megaByte, MemoryConstants::megaByte, ptr, &event); + EXPECT_EQ(res, CL_SUCCESS); + + auto pEvent = (Event *)event; + EXPECT_EQ(expectedLastCmd, mockCommandQueueHw.lastCommandType); + EXPECT_EQ(expectedLastCmd, pEvent->getCommandType()); + + clReleaseEvent(event); +} + +HWTEST_F(WriteImageStagingBufferTest, givenOutOfOrderQueueWhenEnqueueStagingWriteImageCalledWithEventThenReturnValidEvent) { + MockCommandQueueHw mockCommandQueueHw(context, device.get(), &props); + mockCommandQueueHw.setOoqEnabled(); + cl_event event; + auto res = mockCommandQueueHw.enqueueStagingWriteImage(dstImage, false, origin, region, MemoryConstants::megaByte, MemoryConstants::megaByte, ptr, &event); + EXPECT_EQ(res, CL_SUCCESS); + + auto pEvent = (Event *)event; + EXPECT_EQ(static_cast(CL_COMMAND_BARRIER), mockCommandQueueHw.lastCommandType); + EXPECT_EQ(static_cast(CL_COMMAND_WRITE_IMAGE), pEvent->getCommandType()); + + clReleaseEvent(event); +} + +HWTEST_F(WriteImageStagingBufferTest, givenOutOfOrderQueueWhenEnqueueStagingWriteImageCalledWithSingleTransferThenNoBarrierEnqueued) { + constexpr cl_command_type expectedLastCmd = CL_COMMAND_WRITE_IMAGE; + MockCommandQueueHw mockCommandQueueHw(context, device.get(), &props); + mockCommandQueueHw.setOoqEnabled(); + cl_event event; + region[1] = 1; + auto res = mockCommandQueueHw.enqueueStagingWriteImage(dstImage, false, origin, region, MemoryConstants::megaByte, MemoryConstants::megaByte, ptr, &event); + EXPECT_EQ(res, CL_SUCCESS); + + auto pEvent = (Event *)event; + EXPECT_EQ(expectedLastCmd, mockCommandQueueHw.lastCommandType); + EXPECT_EQ(expectedLastCmd, pEvent->getCommandType()); + + clReleaseEvent(event); +} + +HWTEST_F(WriteImageStagingBufferTest, givenCmdQueueWithProfilingWhenEnqueueStagingWriteImageThenTimestampsSetCorrectly) { + cl_event event; + MockCommandQueueHw mockCommandQueueHw(context, device.get(), &props); + mockCommandQueueHw.setProfilingEnabled(); + auto res = 
mockCommandQueueHw.enqueueStagingWriteImage(dstImage, false, origin, region, MemoryConstants::megaByte, MemoryConstants::megaByte, ptr, &event); + EXPECT_EQ(res, CL_SUCCESS); + + auto pEvent = (Event *)event; + EXPECT_FALSE(pEvent->isCPUProfilingPath()); + EXPECT_TRUE(pEvent->isProfilingEnabled()); + + clReleaseEvent(event); +} + +HWTEST_F(WriteImageStagingBufferTest, whenEnqueueStagingWriteImageFailedThenPropagateErrorCode) { + MockCommandQueueHw mockCommandQueueHw(context, device.get(), &props); + mockCommandQueueHw.enqueueWriteImageCallBase = false; + auto res = mockCommandQueueHw.enqueueStagingWriteImage(dstImage, false, origin, region, MemoryConstants::megaByte, MemoryConstants::megaByte, ptr, nullptr); + + EXPECT_EQ(res, CL_INVALID_OPERATION); + EXPECT_EQ(1ul, mockCommandQueueHw.enqueueWriteImageCounter); +} \ No newline at end of file diff --git a/opencl/test/unit_test/mocks/mock_command_queue.h b/opencl/test/unit_test/mocks/mock_command_queue.h index 0b379b6fc3..5d392751b7 100644 --- a/opencl/test/unit_test/mocks/mock_command_queue.h +++ b/opencl/test/unit_test/mocks/mock_command_queue.h @@ -357,17 +357,20 @@ class MockCommandQueueHw : public CommandQueueHw { const cl_event *eventWaitList, cl_event *event) override { enqueueWriteImageCounter++; - return BaseClass::enqueueWriteImage(dstImage, - blockingWrite, - origin, - region, - inputRowPitch, - inputSlicePitch, - ptr, - mapAllocation, - numEventsInWaitList, - eventWaitList, - event); + if (enqueueWriteImageCallBase) { + return BaseClass::enqueueWriteImage(dstImage, + blockingWrite, + origin, + region, + inputRowPitch, + inputSlicePitch, + ptr, + mapAllocation, + numEventsInWaitList, + eventWaitList, + event); + } + return CL_INVALID_OPERATION; } void *cpuDataTransferHandler(TransferProperties &transferProperties, EventsRequest &eventsRequest, cl_int &retVal) override { cpuDataTransferHandlerCalled = true; @@ -482,6 +485,7 @@ class MockCommandQueueHw : public CommandQueueHw { std::vector 
lastEnqueuedKernels; MultiDispatchInfo storedMultiDispatchInfo; size_t enqueueWriteImageCounter = 0; + bool enqueueWriteImageCallBase = true; size_t enqueueWriteBufferCounter = 0; size_t requestedCmdStreamSize = 0; bool blockingWriteBuffer = false; diff --git a/shared/source/utilities/staging_buffer_manager.cpp b/shared/source/utilities/staging_buffer_manager.cpp index 4f9c9d8bc6..3ece30693c 100644 --- a/shared/source/utilities/staging_buffer_manager.cpp +++ b/shared/source/utilities/staging_buffer_manager.cpp @@ -37,17 +37,18 @@ StagingBufferManager::~StagingBufferManager() { } /* - * This method performs 4 steps for single chunk copy + * This method performs 4 steps for single chunk transfer * 1. Get existing chunk of staging buffer, if can't - allocate new one, - * 2. Perform actual copy, + * 2. Perform actual transfer, * 3. Store used buffer to tracking container (with current task count) * 4. Update tag if required to reuse this buffer in next chunk copies */ -int32_t StagingBufferManager::performChunkCopy(void *chunkDst, const void *chunkSrc, size_t size, ChunkCopyFunction &chunkCopyFunc, CommandStreamReceiver *csr) { +template +int32_t StagingBufferManager::performChunkTransfer(CommandStreamReceiver *csr, size_t size, Func &func, Args... 
args) { auto allocatedSize = size; - auto [allocator, chunkBuffer] = requestStagingBuffer(allocatedSize, csr); - auto ret = chunkCopyFunc(chunkDst, addrToPtr(chunkBuffer), chunkSrc, size); - trackChunk({allocator, chunkBuffer, allocatedSize, csr->peekTaskCount()}); + auto [allocator, stagingBuffer] = requestStagingBuffer(allocatedSize, csr); + auto ret = func(addrToPtr(stagingBuffer), size, args...); + trackChunk({allocator, stagingBuffer, allocatedSize, csr->peekTaskCount()}); if (csr->isAnyDirectSubmissionEnabled()) { csr->flushTagUpdate(); } @@ -66,7 +67,7 @@ int32_t StagingBufferManager::performCopy(void *dstPtr, const void *srcPtr, size for (auto i = 0u; i < copiesNum; i++) { auto chunkDst = ptrOffset(dstPtr, i * chunkSize); auto chunkSrc = ptrOffset(srcPtr, i * chunkSize); - auto ret = performChunkCopy(chunkDst, chunkSrc, chunkSize, chunkCopyFunc, csr); + auto ret = performChunkTransfer(csr, chunkSize, chunkCopyFunc, chunkDst, chunkSrc); if (ret) { return ret; } @@ -75,7 +76,50 @@ int32_t StagingBufferManager::performCopy(void *dstPtr, const void *srcPtr, size if (remainder != 0) { auto chunkDst = ptrOffset(dstPtr, copiesNum * chunkSize); auto chunkSrc = ptrOffset(srcPtr, copiesNum * chunkSize); - auto ret = performChunkCopy(chunkDst, chunkSrc, remainder, chunkCopyFunc, csr); + auto ret = performChunkTransfer(csr, remainder, chunkCopyFunc, chunkDst, chunkSrc); if (ret) { return ret; } + } + return 0; +} + +/* + * This method orchestrates write operation for images with given origin and region. + * Transfer is split into chunks, each chunk represents sub-region to transfer. + * Each chunk contains staging buffer which should be used instead of non-usm memory during transfers on GPU. + * Several rows are packed into single chunk unless size of single row exceeds maximum chunk size (2MB). + * Caller provides actual function to enqueue write operation for single chunk. 
+ */ +int32_t StagingBufferManager::performImageWrite(const void *ptr, const size_t *globalOrigin, const size_t *globalRegion, size_t rowPitch, ChunkWriteImageFunc &chunkWriteImageFunc, CommandStreamReceiver *csr) { + size_t origin[3] = {}; + size_t region[3] = {}; + origin[0] = globalOrigin[0]; + origin[2] = globalOrigin[2]; + region[0] = globalRegion[0]; + region[2] = globalRegion[2]; + auto rowsPerChunk = std::max(1ul, chunkSize / rowPitch); + rowsPerChunk = std::min(rowsPerChunk, globalRegion[1]); + auto numOfChunks = globalRegion[1] / rowsPerChunk; + auto remainder = globalRegion[1] % (rowsPerChunk * numOfChunks); + + for (auto i = 0u; i < numOfChunks; i++) { + origin[1] = globalOrigin[1] + i * rowsPerChunk; + region[1] = rowsPerChunk; + auto size = region[1] * rowPitch; + auto chunkPtr = ptrOffset(ptr, i * rowsPerChunk * rowPitch); + auto ret = performChunkTransfer(csr, size, chunkWriteImageFunc, chunkPtr, origin, region); + if (ret) { + return ret; + } + } + + if (remainder != 0) { + origin[1] = globalOrigin[1] + numOfChunks * rowsPerChunk; + region[1] = remainder; + auto size = region[1] * rowPitch; + auto chunkPtr = ptrOffset(ptr, numOfChunks * rowsPerChunk * rowPitch); + auto ret = performChunkTransfer(csr, size, chunkWriteImageFunc, chunkPtr, origin, region); if (ret) { return ret; } @@ -152,13 +196,13 @@ bool StagingBufferManager::isValidForCopy(const Device &device, void *dstPtr, co return stagingCopyEnabled && hostToUsmCopy && !hasDependencies && (isUsedByOsContext || size <= chunkSize); } -bool StagingBufferManager::isValidForStagingWriteImage(const Device &device, size_t size) const { - auto thresholdSizeForImages = 32 * MemoryConstants::megaByte; - auto stagingCopyEnabled = true; +bool StagingBufferManager::isValidForStagingWriteImage(const Device &device, const void *ptr, bool hasDependencies) const { + auto stagingCopyEnabled = false; if (debugManager.flags.EnableCopyWithStagingBuffers.get() != -1) { stagingCopyEnabled = 
debugManager.flags.EnableCopyWithStagingBuffers.get(); } - return stagingCopyEnabled && (0 < size && size <= thresholdSizeForImages); + auto nonUsmPtr = ptr != nullptr && svmAllocsManager->getSVMAlloc(ptr) == nullptr; + return stagingCopyEnabled && !hasDependencies && nonUsmPtr; } void StagingBufferManager::clearTrackedChunks(CommandStreamReceiver *csr) { diff --git a/shared/source/utilities/staging_buffer_manager.h b/shared/source/utilities/staging_buffer_manager.h index 33a3cf8f13..dd29ea0dee 100644 --- a/shared/source/utilities/staging_buffer_manager.h +++ b/shared/source/utilities/staging_buffer_manager.h @@ -21,7 +21,8 @@ class CommandStreamReceiver; class Device; class HeapAllocator; -using ChunkCopyFunction = std::function; +using ChunkCopyFunction = std::function; +using ChunkWriteImageFunc = std::function; class StagingBuffer { public: @@ -60,9 +61,11 @@ class StagingBufferManager { StagingBufferManager &operator=(const StagingBufferManager &other) = delete; bool isValidForCopy(const Device &device, void *dstPtr, const void *srcPtr, size_t size, bool hasDependencies, uint32_t osContextId) const; - bool isValidForStagingWriteImage(const Device &device, size_t size) const; + bool isValidForStagingWriteImage(const Device &device, const void *ptr, bool hasDependencies) const; int32_t performCopy(void *dstPtr, const void *srcPtr, size_t size, ChunkCopyFunction &chunkCopyFunc, CommandStreamReceiver *csr); + int32_t performImageWrite(const void *ptr, const size_t *globalOrigin, const size_t *globalRegion, size_t rowPitch, ChunkWriteImageFunc &chunkWriteImageFunc, CommandStreamReceiver *csr); + std::pair requestStagingBuffer(size_t &size, CommandStreamReceiver *csr); void trackChunk(const StagingBufferTracker &tracker); @@ -71,7 +74,8 @@ class StagingBufferManager { void *allocateStagingBuffer(size_t size); void clearTrackedChunks(CommandStreamReceiver *csr); - int32_t performChunkCopy(void *chunkDst, const void *chunkSrc, size_t size, ChunkCopyFunction 
&chunkCopyFunc, CommandStreamReceiver *csr); + template + int32_t performChunkTransfer(CommandStreamReceiver *csr, size_t size, Func &chunkCopyFunc, Args... args); size_t chunkSize = MemoryConstants::pageSize2M; std::mutex mtx; diff --git a/shared/test/unit_test/utilities/staging_buffer_manager_tests.cpp b/shared/test/unit_test/utilities/staging_buffer_manager_tests.cpp index 04a9fec058..e083323b7b 100644 --- a/shared/test/unit_test/utilities/staging_buffer_manager_tests.cpp +++ b/shared/test/unit_test/utilities/staging_buffer_manager_tests.cpp @@ -55,7 +55,7 @@ class StagingBufferManagerFixture : public DeviceFixture { memset(usmBuffer, 0, copySize); memset(nonUsmBuffer, 0xFF, copySize); - ChunkCopyFunction chunkCopy = [&](void *chunkDst, void *stagingBuffer, const void *chunkSrc, size_t chunkSize) { + ChunkCopyFunction chunkCopy = [&](void *stagingBuffer, size_t chunkSize, void *chunkDst, const void *chunkSrc) { chunkCounter++; memcpy(stagingBuffer, chunkSrc, chunkSize); memcpy(chunkDst, stagingBuffer, chunkSize); @@ -74,6 +74,44 @@ class StagingBufferManagerFixture : public DeviceFixture { delete[] nonUsmBuffer; } + void imageWriteThroughStagingBuffers(size_t rowPitch, const size_t *globalOrigin, const size_t *globalRegion, size_t expectedChunks) { + auto ptr = new unsigned char[stagingBufferSize * expectedChunks]; + + size_t chunkCounter = 0; + size_t expectedOrigin = globalOrigin[1]; + auto expectedRowsPerChunk = std::min(std::max(1ul, stagingBufferSize / rowPitch), globalRegion[1]); + auto numOfChunks = globalRegion[1] / expectedRowsPerChunk; + auto remainder = globalRegion[1] % (expectedRowsPerChunk * numOfChunks); + ChunkWriteImageFunc chunkWrite = [&](void *stagingBuffer, size_t bufferSize, const void *chunkPtr, const size_t *origin, const size_t *region) -> int32_t { + EXPECT_NE(nullptr, stagingBuffer); + EXPECT_NE(nullptr, chunkPtr); + EXPECT_NE(nullptr, origin); + EXPECT_NE(nullptr, region); + + EXPECT_EQ(globalOrigin[0], origin[0]); + 
EXPECT_EQ(globalRegion[0], region[0]); + + EXPECT_EQ(expectedOrigin, origin[1]); + if (chunkCounter + 1 == expectedChunks && remainder != 0) { + EXPECT_EQ(remainder, region[1]); + } else { + EXPECT_EQ(expectedRowsPerChunk, region[1]); + } + expectedOrigin += region[1]; + chunkCounter++; + reinterpret_cast(csr)->taskCount++; + return 0; + }; + auto initialNumOfUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs(); + auto ret = stagingBufferManager->performImageWrite(ptr, globalOrigin, globalRegion, rowPitch, chunkWrite, csr); + auto newUsmAllocations = svmAllocsManager->svmAllocs.getNumAllocs() - initialNumOfUsmAllocations; + + EXPECT_EQ(0, ret); + EXPECT_EQ(expectedChunks, chunkCounter); + EXPECT_EQ(1u, newUsmAllocations); + delete[] ptr; + } + constexpr static size_t stagingBufferSize = MemoryConstants::megaByte * 2; DebugManagerStateRestore restorer; std::unique_ptr svmAllocsManager; @@ -124,6 +162,34 @@ TEST_F(StagingBufferManagerTest, givenStagingBufferEnabledWhenValidForCopyThenRe svmAllocsManager->freeSVMAlloc(usmBuffer); } +TEST_F(StagingBufferManagerTest, givenStagingBufferEnabledWhenValidForImageWriteThenReturnCorrectValue) { + constexpr size_t bufferSize = 1024; + auto usmBuffer = allocateDeviceBuffer(bufferSize); + unsigned char nonUsmBuffer[bufferSize]; + + struct { + void *ptr; + bool hasDependencies; + bool expectValid; + } copyParamsStruct[4]{ + {usmBuffer, false, false}, + {usmBuffer, true, false}, + {nonUsmBuffer, false, true}, + {nonUsmBuffer, true, false}, + }; + for (auto i = 0; i < 4; i++) { + auto actualValid = stagingBufferManager->isValidForStagingWriteImage(*pDevice, copyParamsStruct[i].ptr, copyParamsStruct[i].hasDependencies); + EXPECT_EQ(actualValid, copyParamsStruct[i].expectValid); + } + + debugManager.flags.EnableCopyWithStagingBuffers.set(0); + EXPECT_FALSE(stagingBufferManager->isValidForStagingWriteImage(*pDevice, nonUsmBuffer, false)); + + debugManager.flags.EnableCopyWithStagingBuffers.set(-1); + 
EXPECT_FALSE(stagingBufferManager->isValidForStagingWriteImage(*pDevice, nonUsmBuffer, false)); + svmAllocsManager->freeSVMAlloc(usmBuffer); +} + TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformCopyThenCopyData) { constexpr size_t numOfChunkCopies = 8; constexpr size_t remainder = 1024; @@ -181,7 +247,7 @@ TEST_F(StagingBufferManagerTest, givenStagingBufferWhenFailedChunkCopyThenEarlyR memset(usmBuffer, 0, totalCopySize); memset(nonUsmBuffer, 0xFF, totalCopySize); - ChunkCopyFunction chunkCopy = [&](void *chunkDst, void *stagingBuffer, const void *chunkSrc, size_t chunkSize) { + ChunkCopyFunction chunkCopy = [&](void *stagingBuffer, size_t chunkSize, void *chunkDst, const void *chunkSrc) { chunkCounter++; memcpy(stagingBuffer, chunkSrc, chunkSize); memcpy(chunkDst, stagingBuffer, chunkSize); @@ -211,7 +277,7 @@ TEST_F(StagingBufferManagerTest, givenStagingBufferWhenFailedRemainderCopyThenRe memset(usmBuffer, 0, totalCopySize); memset(nonUsmBuffer, 0xFF, totalCopySize); - ChunkCopyFunction chunkCopy = [&](void *chunkDst, void *stagingBuffer, const void *chunkSrc, size_t chunkSize) { + ChunkCopyFunction chunkCopy = [&](void *stagingBuffer, size_t chunkSize, void *chunkDst, const void *chunkSrc) { chunkCounter++; memcpy(stagingBuffer, chunkSrc, chunkSize); memcpy(chunkDst, stagingBuffer, chunkSize); @@ -256,7 +322,7 @@ HWTEST_F(StagingBufferManagerTest, givenStagingBufferWhenDirectSubmissionEnabled auto nonUsmBuffer = new unsigned char[totalCopySize]; size_t flushTagsCalled = 0; - ChunkCopyFunction chunkCopy = [&](void *chunkDst, void *stagingBuffer, const void *chunkSrc, size_t chunkSize) { + ChunkCopyFunction chunkCopy = [&](void *stagingBuffer, size_t chunkSize, void *chunkDst, const void *chunkSrc) { if (ultCsr->flushTagUpdateCalled) { flushTagsCalled++; ultCsr->flushTagUpdateCalled = false; @@ -274,19 +340,6 @@ HWTEST_F(StagingBufferManagerTest, givenStagingBufferWhenDirectSubmissionEnabled delete[] nonUsmBuffer; } 
-HWTEST_F(StagingBufferManagerTest, givenStagingBufferManagerWhenIsValidForStagingWriteImageCalledThenReturnCorrectValue) { - EXPECT_TRUE(stagingBufferManager->isValidForStagingWriteImage(*pDevice, MemoryConstants::pageSize2M)); - - EXPECT_FALSE(stagingBufferManager->isValidForStagingWriteImage(*pDevice, 0)); - EXPECT_FALSE(stagingBufferManager->isValidForStagingWriteImage(*pDevice, MemoryConstants::gigaByte)); - - debugManager.flags.EnableCopyWithStagingBuffers.set(0); - EXPECT_FALSE(stagingBufferManager->isValidForStagingWriteImage(*pDevice, MemoryConstants::pageSize2M)); - - debugManager.flags.EnableCopyWithStagingBuffers.set(-1); - EXPECT_TRUE(stagingBufferManager->isValidForStagingWriteImage(*pDevice, MemoryConstants::pageSize2M)); -} - HWTEST_F(StagingBufferManagerTest, givenFailedAllocationWhenRequestStagingBufferCalledThenReturnNullptr) { size_t size = MemoryConstants::pageSize2M; auto memoryManager = static_cast(pDevice->getMemoryManager()); @@ -295,3 +348,73 @@ HWTEST_F(StagingBufferManagerTest, givenFailedAllocationWhenRequestStagingBuffer auto [heapAllocator, stagingBuffer] = stagingBufferManager->requestStagingBuffer(size, csr); EXPECT_EQ(stagingBuffer, 0u); } + +TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformImageWriteThenWholeRegionCovered) { + size_t expectedChunks = 8; + const size_t globalOrigin[3] = {0, 0, 0}; + const size_t globalRegion[3] = {4, expectedChunks, 1}; + imageWriteThroughStagingBuffers(stagingBufferSize, globalOrigin, globalRegion, expectedChunks); +} + +TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformImageWriteWithOriginThenWholeRegionCovered) { + size_t expectedChunks = 8; + const size_t globalOrigin[3] = {4, 4, 0}; + const size_t globalRegion[3] = {4, expectedChunks, 1}; + imageWriteThroughStagingBuffers(stagingBufferSize, globalOrigin, globalRegion, expectedChunks); +} + +TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformImageWriteWithMultipleRowsPerChunkThenWholeRegionCovered) { + size_t 
expectedChunks = 4; + const size_t globalOrigin[3] = {0, 0, 0}; + const size_t globalRegion[3] = {4, 8, 1}; + imageWriteThroughStagingBuffers(MemoryConstants::megaByte, globalOrigin, globalRegion, expectedChunks); +} + +TEST_F(StagingBufferManagerTest, givenStagingBufferWhenPerformImageWriteWithRemainderThenWholeRegionCovered) { + size_t expectedChunks = 4; + const size_t globalOrigin[3] = {0, 0, 0}; + const size_t globalRegion[3] = {4, 7, 1}; + imageWriteThroughStagingBuffers(MemoryConstants::megaByte, globalOrigin, globalRegion, expectedChunks); +} + +TEST_F(StagingBufferManagerTest, givenStagingBufferWhenFailedChunkImageWriteThenEarlyReturnWithFailure) { + size_t expectedChunks = 4; + const size_t globalOrigin[3] = {0, 0, 0}; + const size_t globalRegion[3] = {4, 8, 1}; + constexpr int expectedErrorCode = 1; + auto ptr = new unsigned char[stagingBufferSize * expectedChunks]; + + size_t chunkCounter = 0; + ChunkWriteImageFunc chunkWrite = [&](void *stagingBuffer, size_t bufferSize, const void *chunkPtr, const size_t *origin, const size_t *region) -> int32_t { + ++chunkCounter; + return expectedErrorCode; + }; + + auto ret = stagingBufferManager->performImageWrite(ptr, globalOrigin, globalRegion, MemoryConstants::megaByte, chunkWrite, csr); + EXPECT_EQ(expectedErrorCode, ret); + EXPECT_EQ(1u, chunkCounter); + delete[] ptr; +} + +TEST_F(StagingBufferManagerTest, givenStagingBufferWhenFailedChunkImageWriteWithRemainderThenReturnWithFailure) { + size_t expectedChunks = 4; + const size_t globalOrigin[3] = {0, 0, 0}; + const size_t globalRegion[3] = {4, 7, 1}; + constexpr int expectedErrorCode = 1; + auto ptr = new unsigned char[stagingBufferSize * expectedChunks]; + + size_t chunkCounter = 0; + size_t remainderCounter = 4; + ChunkWriteImageFunc chunkWrite = [&](void *stagingBuffer, size_t bufferSize, const void *chunkPtr, const size_t *origin, const size_t *region) -> int32_t { + ++chunkCounter; + if (chunkCounter == remainderCounter) { + return expectedErrorCode; + } 
+ return 0; + }; + + auto ret = stagingBufferManager->performImageWrite(ptr, globalOrigin, globalRegion, MemoryConstants::megaByte, chunkWrite, csr); + EXPECT_EQ(expectedErrorCode, ret); + EXPECT_EQ(remainderCounter, chunkCounter); + delete[] ptr; +} \ No newline at end of file