From bbcca976820bd71e7afffb65d0155b37e5e157fd Mon Sep 17 00:00:00 2001 From: Szymon Morek Date: Wed, 23 Jul 2025 11:39:49 +0000 Subject: [PATCH] fix: flush cache after accessing img from buffer Related-To: NEO-15391 Signed-off-by: Szymon Morek --- opencl/source/command_queue/enqueue_common.h | 12 ++++---- opencl/source/kernel/kernel.cpp | 4 ++- opencl/source/kernel/kernel.h | 3 ++ .../command_queue/enqueue_kernel_2_tests.cpp | 15 ++++++++++ .../enqueue_write_image_tests.cpp | 15 ++++++++++ .../kernel/kernel_image_arg_tests.cpp | 30 +++++++++++++++++++ opencl/test/unit_test/mocks/mock_kernel.h | 1 + .../command_stream_receiver_hw_base.inl | 8 +++-- 8 files changed, 80 insertions(+), 8 deletions(-) diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index 6f86a2cb94..1e5d1f6a49 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -833,6 +833,7 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( auto systolicPipelineSelectMode = false; Kernel *kernel = nullptr; bool auxTranslationRequired = false; + bool containsImageFromBuffer = false; for (auto &dispatchInfo : multiDispatchInfo) { if (kernel != dispatchInfo.getKernel()) { kernel = dispatchInfo.getKernel(); @@ -849,6 +850,7 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( anyUncacheableArgs = true; } this->isCacheFlushOnNextBcsWriteRequired |= kernel->usesImages(); + containsImageFromBuffer |= kernel->hasImageFromBufferArgs(); } UNRECOVERABLE_IF(kernel == nullptr); @@ -875,16 +877,16 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( dsh = &getIndirectHeap(IndirectHeap::Type::dynamicState, 0u); ioh = &getIndirectHeap(IndirectHeap::Type::indirectObject, 0u); - auto allocNeedsFlushDC = false; + auto dcFlush = shouldFlushDC(commandType, printfHandler) || containsImageFromBuffer; if (!device->isFullRangeSvm()) { if (std::any_of(csr.getResidencyAllocations().begin(), 
csr.getResidencyAllocations().end(), [](const auto allocation) { return allocation->isFlushL3Required(); })) { - allocNeedsFlushDC = true; + dcFlush |= true; } } auto memoryCompressionState = csr.getMemoryCompressionState(auxTranslationRequired); bool hasStallingCmds = enqueueProperties.hasStallingCmds || (!relaxedOrderingEnabled && (eventsRequest.numEventsInWaitList > 0 || timestampPacketDependencies.previousEnqueueNodes.peekNodes().size() > 0)); - + auto textureCacheFlush = isTextureCacheFlushNeeded(commandType) || containsImageFromBuffer; DispatchFlags dispatchFlags( &timestampPacketDependencies.barrierNodes, // barrierTimestampPacketNodes {}, // pipelineSelectArgs @@ -899,7 +901,7 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( memoryCompressionState, // memoryCompressionState getSliceCount(), // sliceCount blocking, // blocking - shouldFlushDC(commandType, printfHandler) || allocNeedsFlushDC, // dcFlush + dcFlush, // dcFlush multiDispatchInfo.usesSlm(), // useSLM !csr.isUpdateTagFromWaitEnabled() || commandType == CL_COMMAND_FILL_BUFFER, // guardCommandBufferWithPipeControl commandType == CL_COMMAND_NDRANGE_KERNEL, // GSBA32BitRequired @@ -910,7 +912,7 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( false, // usePerDssBackedBuffer kernel->areMultipleSubDevicesInContext(), // areMultipleSubDevicesInContext kernel->requiresMemoryMigration(), // memoryMigrationRequired - isTextureCacheFlushNeeded(commandType), // textureCacheFlush + textureCacheFlush, // textureCacheFlush hasStallingCmds, // hasStallingCmds relaxedOrderingEnabled, // hasRelaxedOrderingDependencies false, // stateCacheInvalidation diff --git a/opencl/source/kernel/kernel.cpp b/opencl/source/kernel/kernel.cpp index a8394c838c..3846e313fe 100644 --- a/opencl/source/kernel/kernel.cpp +++ b/opencl/source/kernel/kernel.cpp @@ -1567,8 +1567,10 @@ cl_int Kernel::setArgImageWithMipLevel(uint32_t argIndex, } DBG_LOG_INPUTS("setArgImage cl_mem", clMemObj); - + auto wasImageFromBuffer = 
kernelArguments[argIndex].isImageFromBuffer; storeKernelArg(argIndex, IMAGE_OBJ, clMemObj, argVal, argSize); + kernelArguments[argIndex].isImageFromBuffer = pImage->isImageFromBuffer(); + imageFromBufferArgsCount += (pImage->isImageFromBuffer() ? 1 : 0) - (wasImageFromBuffer ? 1 : 0); void *surfaceState = nullptr; if (isValidOffset(argAsImg.bindless)) { diff --git a/opencl/source/kernel/kernel.h b/opencl/source/kernel/kernel.h index 3189636f71..aa6ef2c5fd 100644 --- a/opencl/source/kernel/kernel.h +++ b/opencl/source/kernel/kernel.h @@ -75,6 +75,7 @@ class Kernel : public ReferenceTrackedObject, NEO::NonCopyableAndNonMova bool isPatched = false; bool isStatelessUncacheable = false; bool isSetToNullptr = false; + bool isImageFromBuffer = false; }; typedef int32_t (Kernel::*KernelArgHandler)(uint32_t argIndex, @@ -275,6 +276,7 @@ class Kernel : public ReferenceTrackedObject, NEO::NonCopyableAndNonMova void resetSharedObjectsPatchAddresses(); bool isUsingSharedObjArgs() const { return usingSharedObjArgs; } bool hasUncacheableStatelessArgs() const { return statelessUncacheableArgsCount > 0; } + bool hasImageFromBufferArgs() const { return imageFromBufferArgsCount > 0; } bool hasPrintfOutput() const; @@ -459,6 +461,7 @@ class Kernel : public ReferenceTrackedObject, NEO::NonCopyableAndNonMova uint32_t patchedArgumentsNum = 0; uint32_t startOffset = 0; uint32_t statelessUncacheableArgsCount = 0; + uint32_t imageFromBufferArgsCount = 0; uint32_t additionalKernelExecInfo = AdditionalKernelExecInfo::disableOverdispatch; uint32_t maxKernelWorkGroupSize = 0; uint32_t slmTotalSize = 0u; diff --git a/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp index d1cc5315e8..22bd45e221 100644 --- a/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp @@ -1365,3 +1365,18 @@ TEST(EnqueuePropertiesTest, 
givenGpuKernelEnqueuePropertiesThenStartTimestampOnC EnqueueProperties properties(false, true, false, false, false, false, nullptr); EXPECT_FALSE(properties.isStartTimestampOnCpuRequired()); } + +HWTEST_F(EnqueueKernelTest, whenEnqueueKernelWithImageFromBufferThenInvalidateTextureCache) { + auto &csr = pDevice->getUltCommandStreamReceiver(); + size_t off[3] = {0, 0, 0}; + size_t gws[3] = {1, 1, 1}; + MockKernelWithInternals mockKernel(*pClDevice); + auto res = pCmdQ->enqueueKernel(mockKernel.mockKernel, 1, off, gws, nullptr, 0, nullptr, nullptr); + EXPECT_EQ(CL_SUCCESS, res); + EXPECT_FALSE(csr.recordedDispatchFlags.textureCacheFlush); + + mockKernel.mockKernel->imageFromBufferArgsCount = 1; + res = pCmdQ->enqueueKernel(mockKernel.mockKernel, 1, off, gws, nullptr, 0, nullptr, nullptr); + EXPECT_EQ(CL_SUCCESS, res); + EXPECT_TRUE(csr.recordedDispatchFlags.textureCacheFlush); +} \ No newline at end of file diff --git a/opencl/test/unit_test/command_queue/enqueue_write_image_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_write_image_tests.cpp index 9b0309065f..09e839b1f5 100644 --- a/opencl/test/unit_test/command_queue/enqueue_write_image_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_write_image_tests.cpp @@ -13,6 +13,7 @@ #include "shared/test/common/helpers/debug_manager_state_restore.h" #include "shared/test/common/helpers/unit_test_helper.h" #include "shared/test/common/mocks/mock_builtins.h" +#include "shared/test/common/mocks/mock_direct_submission_hw.h" #include "shared/test/common/mocks/mock_gmm_resource_info.h" #include "shared/test/common/test_macros/test.h" @@ -107,6 +108,20 @@ HWTEST_F(EnqueueWriteImageTest, WhenWritingImageThenTaskLevelIsIncremented) { EXPECT_GT(pCmdQ->taskLevel, taskLevelBefore); } +HWTEST_F(EnqueueWriteImageTest, WhenWritingImageWithDirectSubmissionThenInvalidateTextureCache) { + auto directSubmission = new MockDirectSubmissionHw>(*pDevice->getDefaultEngine().commandStreamReceiver); + auto &ultCsr = 
this->pDevice->getUltCommandStreamReceiver(); + ultCsr.directSubmissionAvailable = true; + ultCsr.directSubmission.reset(directSubmission); + EnqueueWriteImageHelper<>::enqueueWriteImage(pCmdQ, dstImage, EnqueueWriteImageTraits::blocking); + EXPECT_TRUE(ultCsr.recordedDispatchFlags.textureCacheFlush); + + ultCsr.directSubmissionAvailable = false; + ultCsr.directSubmission.reset(nullptr); + EnqueueWriteImageHelper<>::enqueueWriteImage(pCmdQ, dstImage, EnqueueWriteImageTraits::blocking); + EXPECT_FALSE(ultCsr.recordedDispatchFlags.textureCacheFlush); +} + HWTEST_F(EnqueueWriteImageTest, WhenWritingImageThenCommandsAreAdded) { auto usedCmdBufferBefore = pCS->getUsed(); diff --git a/opencl/test/unit_test/kernel/kernel_image_arg_tests.cpp b/opencl/test/unit_test/kernel/kernel_image_arg_tests.cpp index b258e97fb4..b52e8fb068 100644 --- a/opencl/test/unit_test/kernel/kernel_image_arg_tests.cpp +++ b/opencl/test/unit_test/kernel/kernel_image_arg_tests.cpp @@ -87,6 +87,36 @@ TEST_F(KernelImageArgTest, givenKernelWithValidOffsetNumMipLevelsWhenImageArgIsS EXPECT_EQ(7U, *patchedNumMipLevels); } +TEST_F(KernelImageArgTest, givenKernelWithImageFromBufferThenIncrementCounter) { + uint32_t data = 0; + auto buffer = clCreateBuffer(context.get(), CL_MEM_USE_HOST_PTR, sizeof(data), &data, &retVal); + EXPECT_EQ(CL_SUCCESS, retVal); + + MockImageBase image; + cl_mem imageObj = &image; + pKernel->setArg(0, sizeof(cl_mem), &imageObj); + EXPECT_FALSE(pKernel->hasImageFromBufferArgs()); + + cl_image_format imgFormat = {CL_RGBA, CL_UNORM_INT8}; + cl_image_desc imgDesc = image.getImageDesc(); + imgDesc.image_type = CL_MEM_OBJECT_IMAGE2D; + imgDesc.mem_object = buffer; + auto memoryProperties = ClMemoryPropertiesHelper::createMemoryProperties(0, 0, 0, pDevice); + auto surfaceFormat = Image::getSurfaceFormatFromTable(0, &imgFormat, context->getDevice(0)->getHardwareInfo().capabilityTable.supportsOcl21Features); + auto imgFromBuffer = Image::create(context.get(), memoryProperties, 0, 0, 
surfaceFormat, &imgDesc, nullptr, retVal); + + imageObj = imgFromBuffer; + pKernel->setArg(0, sizeof(cl_mem), &imageObj); + EXPECT_TRUE(pKernel->hasImageFromBufferArgs()); + + imageObj = &image; + pKernel->setArg(0, sizeof(cl_mem), &imageObj); + EXPECT_FALSE(pKernel->hasImageFromBufferArgs()); + + clReleaseMemObject(imgFromBuffer); + clReleaseMemObject(buffer); +} + TEST_F(KernelImageArgTest, givenImageWithNumSamplesWhenSetArgIsCalledThenPatchNumSamplesInfo) { cl_image_format imgFormat = {CL_RGBA, CL_UNORM_INT8}; cl_image_desc imgDesc = {}; diff --git a/opencl/test/unit_test/mocks/mock_kernel.h b/opencl/test/unit_test/mocks/mock_kernel.h index 883a0d0ad8..c6872d4c50 100644 --- a/opencl/test/unit_test/mocks/mock_kernel.h +++ b/opencl/test/unit_test/mocks/mock_kernel.h @@ -108,6 +108,7 @@ class MockKernel : public Kernel { using Kernel::hasDirectStatelessAccessToHostMemory; using Kernel::hasDirectStatelessAccessToSharedBuffer; using Kernel::hasIndirectStatelessAccessToHostMemory; + using Kernel::imageFromBufferArgsCount; using Kernel::implicitArgsVersion; using Kernel::isBuiltIn; using Kernel::isUnifiedMemorySyncRequired; diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index 0ef4633be4..791ed027c4 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -447,8 +447,12 @@ CompletionStamp CommandStreamReceiverHw::flushTaskHeapful( const auto &hwInfo = peekHwInfo(); bool hasStallingCmdsOnTaskStream = false; - - if (dispatchFlags.blocking || dispatchFlags.dcFlush || dispatchFlags.guardCommandBufferWithPipeControl || this->heapStorageRequiresRecyclingTag) { + bool barrierWithPostSyncNeeded = dispatchFlags.blocking || + dispatchFlags.dcFlush || + dispatchFlags.guardCommandBufferWithPipeControl || + dispatchFlags.textureCacheFlush || + this->heapStorageRequiresRecyclingTag; + if 
(barrierWithPostSyncNeeded) { LinearStream &epilogueCommandStream = dispatchFlags.optionalEpilogueCmdStream != nullptr ? *dispatchFlags.optionalEpilogueCmdStream : commandStreamTask; processBarrierWithPostSync(epilogueCommandStream, dispatchFlags, levelClosed, currentPipeControlForNooping, epiloguePipeControlLocation, hasStallingCmdsOnTaskStream, args);