diff --git a/opencl/source/api/api.cpp b/opencl/source/api/api.cpp index a5b46d5f4b..1871e27731 100644 --- a/opencl/source/api/api.cpp +++ b/opencl/source/api/api.cpp @@ -2899,18 +2899,22 @@ cl_int CL_API_CALL clEnqueueReadImage(cl_command_queue commandQueue, return retVal; } - retVal = pCommandQueue->enqueueReadImage( - pImage, - blockingRead, - origin, - region, - rowPitch, - slicePitch, - ptr, - nullptr, - numEventsInWaitList, - eventWaitList, - event); + if (pCommandQueue->isValidForStagingTransfer(pImage, ptr, numEventsInWaitList > 0)) { + retVal = pCommandQueue->enqueueStagingImageTransfer(CL_COMMAND_READ_IMAGE, pImage, blockingRead, origin, region, rowPitch, slicePitch, ptr, event); + } else { + retVal = pCommandQueue->enqueueReadImage( + pImage, + blockingRead, + origin, + region, + rowPitch, + slicePitch, + ptr, + nullptr, + numEventsInWaitList, + eventWaitList, + event); + } } DBG_LOG_INPUTS("event", getClFileLogger().getEvents(reinterpret_cast(event), 1u)); TRACING_EXIT(ClEnqueueReadImage, &retVal); diff --git a/opencl/source/command_queue/command_queue.cpp b/opencl/source/command_queue/command_queue.cpp index 7e6b885107..18bc7794b2 100644 --- a/opencl/source/command_queue/command_queue.cpp +++ b/opencl/source/command_queue/command_queue.cpp @@ -1558,4 +1558,12 @@ void CommandQueue::unregisterGpgpuAndBcsCsrClients() { } } +size_t CommandQueue::calculateHostPtrSizeForImage(const size_t *region, size_t rowPitch, size_t slicePitch, Image *image) { + auto bytesPerPixel = image->getSurfaceFormatInfo().surfaceFormat.imageElementSizeInBytes; + auto dstRowPitch = rowPitch ? rowPitch : region[0] * bytesPerPixel; + auto dstSlicePitch = slicePitch ? slicePitch : ((image->getImageDesc().image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY ? 
1 : region[1]) * dstRowPitch); + + return Image::calculateHostPtrSize(region, dstRowPitch, dstSlicePitch, bytesPerPixel, image->getImageDesc().image_type); +} + } // namespace NEO diff --git a/opencl/source/command_queue/command_queue.h b/opencl/source/command_queue/command_queue.h index a4eef0002e..f20ea08652 100644 --- a/opencl/source/command_queue/command_queue.h +++ b/opencl/source/command_queue/command_queue.h @@ -455,6 +455,8 @@ class CommandQueue : public BaseObject<_cl_command_queue> { cl_int postStagingTransferSync(const StagingTransferStatus &status, cl_event *event, const cl_event profilingEvent, bool isSingleTransfer, bool isBlocking, cl_command_type commandType); cl_event *assignEventForStaging(cl_event *userEvent, cl_event *profilingEvent, bool isFirstTransfer, bool isLastTransfer) const; + size_t calculateHostPtrSizeForImage(const size_t *region, size_t rowPitch, size_t slicePitch, Image *image); + Context *context = nullptr; ClDevice *device = nullptr; mutable EngineControl *gpgpuEngine = nullptr; diff --git a/opencl/source/command_queue/command_queue_hw.h b/opencl/source/command_queue/command_queue_hw.h index f49d450ab4..05195325d3 100644 --- a/opencl/source/command_queue/command_queue_hw.h +++ b/opencl/source/command_queue/command_queue_hw.h @@ -479,7 +479,6 @@ class CommandQueueHw : public CommandQueue { protected: MOCKABLE_VIRTUAL void enqueueHandlerHook(const unsigned int commandType, const MultiDispatchInfo &dispatchInfo){}; MOCKABLE_VIRTUAL bool prepareCsrDependency(CsrDependencies &csrDeps, CsrDependencyContainer &dependencyTags, TimestampPacketDependencies &timestampPacketDependencies, TagAllocatorBase *allocator, bool blockQueue); - size_t calculateHostPtrSizeForImage(const size_t *region, size_t rowPitch, size_t slicePitch, Image *image); cl_int enqueueReadWriteBufferOnCpuWithMemoryTransfer(cl_command_type commandType, Buffer *buffer, size_t offset, size_t size, void *ptr, cl_uint numEventsInWaitList, diff --git 
a/opencl/source/command_queue/command_queue_staging.cpp b/opencl/source/command_queue/command_queue_staging.cpp index 686653f61c..ee2f1ec0e2 100644 --- a/opencl/source/command_queue/command_queue_staging.cpp +++ b/opencl/source/command_queue/command_queue_staging.cpp @@ -71,6 +71,14 @@ cl_int CommandQueue::enqueueStagingImageTransfer(cl_command_type commandType, Im auto stagingBufferManager = this->context->getStagingBufferManager(); auto ret = stagingBufferManager->performImageTransfer(ptr, globalOrigin, globalRegion, dstRowPitch, bytesPerPixel, chunkWrite, &csr, isRead); + + if (isRead && context->isProvidingPerformanceHints()) { + auto hostPtrSize = calculateHostPtrSizeForImage(globalRegion, inputRowPitch, inputSlicePitch, image); + if (!isL3Capable(ptr, hostPtrSize)) { + context->providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL, CL_ENQUEUE_READ_IMAGE_DOESNT_MEET_ALIGNMENT_RESTRICTIONS, ptr, hostPtrSize, MemoryConstants::pageSize, MemoryConstants::pageSize); + } + } + return postStagingTransferSync(ret, event, profilingEvent, isSingleTransfer, blockingCopy, commandType); } diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index c47af03e69..5f8d11d9a9 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -1253,15 +1253,6 @@ void CommandQueueHw::computeOffsetsValueForRectCommands(size_t *buffe *hostOffset = hostOrigin[2] * computedHostSlicePitch + hostOrigin[1] * computedHostRowPitch + hostOrigin[0]; } -template -size_t CommandQueueHw::calculateHostPtrSizeForImage(const size_t *region, size_t rowPitch, size_t slicePitch, Image *image) { - auto bytesPerPixel = image->getSurfaceFormatInfo().surfaceFormat.imageElementSizeInBytes; - auto dstRowPitch = rowPitch ? rowPitch : region[0] * bytesPerPixel; - auto dstSlicePitch = slicePitch ? slicePitch : ((image->getImageDesc().image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY ? 
1 : region[1]) * dstRowPitch); - - return Image::calculateHostPtrSize(region, dstRowPitch, dstSlicePitch, bytesPerPixel, image->getImageDesc().image_type); -} - template bool CommandQueueHw::prepareCsrDependency(CsrDependencies &csrDeps, CsrDependencyContainer &dependencyTags, TimestampPacketDependencies &timestampPacketDependencies, TagAllocatorBase *allocator, bool blockQueue) { for (auto &dependentCsr : csrDeps.csrWithMultiEngineDependencies) { diff --git a/opencl/source/helpers/cache_policy.cpp b/opencl/source/helpers/cache_policy.cpp index d7bb74e958..de7494df36 100644 --- a/opencl/source/helpers/cache_policy.cpp +++ b/opencl/source/helpers/cache_policy.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2021 Intel Corporation + * Copyright (C) 2018-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -12,7 +12,7 @@ namespace NEO { -bool isL3Capable(void *ptr, size_t size) { +bool isL3Capable(const void *ptr, size_t size) { return isAligned(ptr) && isAligned(size); } diff --git a/opencl/test/unit_test/context/driver_diagnostics_enqueue_tests.cpp b/opencl/test/unit_test/context/driver_diagnostics_enqueue_tests.cpp index 2a710a0f7e..274a71ce6f 100644 --- a/opencl/test/unit_test/context/driver_diagnostics_enqueue_tests.cpp +++ b/opencl/test/unit_test/context/driver_diagnostics_enqueue_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2024 Intel Corporation + * Copyright (C) 2018-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -382,6 +382,41 @@ TEST_P(PerformanceHintEnqueueReadImageTest, GivenHostPtrAndSizeAlignmentsWhenEnq alignedFree(ptr); } +TEST_P(PerformanceHintEnqueueReadImageTest, GivenHostPtrAndSizeAlignmentsWhenEnqueueStagingReadImageIsCallingThenContextProvidesHintsAboutAlignments) { + REQUIRE_IMAGES_OR_SKIP(defaultHwInfo); + REQUIRE_SVM_OR_SKIP(pPlatform->getClDevice(0)); + + size_t hostOrigin[] = {0, 0, 0}; + size_t sizeForReadImageInPixels = MemoryConstants::cacheLineSize; + size_t sizeForReadImage = sizeForReadImageInPixels * 
image->getSurfaceFormatInfo().surfaceFormat.imageElementSizeInBytes; + void *ptr = alignedMalloc(sizeForReadImage + MemoryConstants::cacheLineSize, MemoryConstants::cacheLineSize); + uintptr_t addressForReadImage = (uintptr_t)ptr; + + bool hintWithMisalignment = !(alignedAddress && alignedSize); + if (!alignedAddress) { + addressForReadImage++; + } + if (!alignedSize) { + sizeForReadImageInPixels--; + sizeForReadImage -= image->getSurfaceFormatInfo().surfaceFormat.imageElementSizeInBytes; + } + size_t region[] = {sizeForReadImageInPixels, 1, 1}; + pCmdQ->enqueueStagingImageTransfer(CL_COMMAND_READ_IMAGE, + image, + CL_FALSE, + hostOrigin, + region, + 0, + 0, + (void *)addressForReadImage, + nullptr); + ASSERT_EQ(alignedSize, isAligned(sizeForReadImage)); + + snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[CL_ENQUEUE_READ_IMAGE_DOESNT_MEET_ALIGNMENT_RESTRICTIONS], addressForReadImage, sizeForReadImage, MemoryConstants::pageSize, MemoryConstants::pageSize); + EXPECT_EQ(hintWithMisalignment, containsHint(expectedHint, userData)); + alignedFree(ptr); +} + TEST_F(PerformanceHintEnqueueImageTest, GivenNonBlockingWriteWhenEnqueueWriteImageIsCallingThenContextProvidesProperHint) { size_t hostOrigin[] = {0, 0, 0}; diff --git a/shared/source/helpers/cache_policy.cpp b/shared/source/helpers/cache_policy.cpp index e8ee288a0f..a3c311d15f 100644 --- a/shared/source/helpers/cache_policy.cpp +++ b/shared/source/helpers/cache_policy.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -13,7 +13,7 @@ namespace NEO { -bool isL3Capable(void *ptr, size_t size) { +bool isL3Capable(const void *ptr, size_t size) { return isAligned(ptr) && isAligned(size); } diff --git a/shared/source/helpers/cache_policy.h b/shared/source/helpers/cache_policy.h index 65374c142c..7689737f68 100644 --- a/shared/source/helpers/cache_policy.h +++ 
b/shared/source/helpers/cache_policy.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -15,7 +15,7 @@ namespace NEO { class GraphicsAllocation; class ProductHelper; -bool isL3Capable(void *ptr, size_t size); +bool isL3Capable(const void *ptr, size_t size); bool isL3Capable(const GraphicsAllocation &graphicsAllocation); template