From c2377e76c19a9e246cfeccda82f22cf9836e2d59 Mon Sep 17 00:00:00 2001 From: Damian Tomczak Date: Tue, 9 Sep 2025 11:41:30 +0000 Subject: [PATCH] feature: force stateless for copy buffer to image Related-to: NEO-6075 Signed-off-by: Damian Tomczak --- .../enqueue_copy_buffer_to_image.h | 4 +- .../command_queue/enqueue_write_image.h | 4 +- .../command_queue_hw_2_tests.cpp | 3 +- .../enqueue_copy_buffer_to_image_tests.cpp | 63 +++++++++++++++++-- .../enqueue_write_image_tests.cpp | 51 ++++++++++++++- 5 files changed, 113 insertions(+), 12 deletions(-) diff --git a/opencl/source/command_queue/enqueue_copy_buffer_to_image.h b/opencl/source/command_queue/enqueue_copy_buffer_to_image.h index 0a92634922..f91390353a 100644 --- a/opencl/source/command_queue/enqueue_copy_buffer_to_image.h +++ b/opencl/source/command_queue/enqueue_copy_buffer_to_image.h @@ -25,8 +25,8 @@ cl_int CommandQueueHw::enqueueCopyBufferToImage( const cl_event *eventWaitList, cl_event *event) { - const bool useStateless = forceStateless(srcBuffer->getSize()); - auto builtInType = EBuiltInOps::adjustBuiltinType(useStateless, this->heaplessModeEnabled); + const bool isStateless = isForceStateless || forceStateless(srcBuffer->getSize()); + auto builtInType = EBuiltInOps::adjustBuiltinType(isStateless, this->heaplessModeEnabled); auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, this->getClDevice()); diff --git a/opencl/source/command_queue/enqueue_write_image.h b/opencl/source/command_queue/enqueue_write_image.h index 9d6b25cd3a..2fc445f23d 100644 --- a/opencl/source/command_queue/enqueue_write_image.h +++ b/opencl/source/command_queue/enqueue_write_image.h @@ -134,9 +134,9 @@ cl_int CommandQueueHw::enqueueWriteImageImpl( dc.bcsSplit = bcsSplit; dc.direction = csrSelectionArgs.direction; - const bool useStateless = forceStateless(dstImage->getSize()); + const bool isStateless = isForceStateless || forceStateless(dstImage->getSize()); const bool useHeapless = getHeaplessModeEnabled(); - auto eBuiltInOps = EBuiltInOps::adjustBuiltinType(useStateless, useHeapless); + auto eBuiltInOps = EBuiltInOps::adjustBuiltinType(isStateless, useHeapless); MultiDispatchInfo dispatchInfo(dc); const auto dispatchResult = dispatchBcsOrGpgpuEnqueue(dispatchInfo, surfaces, eBuiltInOps, numEventsInWaitList, eventWaitList, event, blockingWrite == CL_TRUE, csr); diff --git a/opencl/test/unit_test/command_queue/command_queue_hw_2_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_hw_2_tests.cpp index fb896049b3..d701a4e091 100644 --- a/opencl/test/unit_test/command_queue/command_queue_hw_2_tests.cpp +++ b/opencl/test/unit_test/command_queue/command_queue_hw_2_tests.cpp @@ -178,6 +178,7 @@ HWTEST_F(BuiltinParamsCommandQueueHwTests, givenEnqueueWriteImageCallWhenBuiltin debugManager.flags.EnableCopyWithStagingBuffers.set(0); bool heaplessAllowed = UnitTestHelper::isHeaplessAllowed(); + const bool useStateless = pDevice->getCompilerProductHelper().isForceToStatelessRequired(); for (auto useHeapless : {false, heaplessAllowed}) { if (useHeapless && !heaplessAllowed) { @@ -185,7 +186,7 @@ HWTEST_F(BuiltinParamsCommandQueueHwTests, givenEnqueueWriteImageCallWhenBuiltin } reinterpret_cast *>(pCmdQ)->heaplessModeEnabled = useHeapless; - setUpImpl(EBuiltInOps::adjustBuiltinType(false, useHeapless)); + setUpImpl(EBuiltInOps::adjustBuiltinType(useStateless, useHeapless)); std::unique_ptr dstImage(ImageHelperUlt>::create(context)); diff --git a/opencl/test/unit_test/command_queue/enqueue_copy_buffer_to_image_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_copy_buffer_to_image_tests.cpp index acdd703be6..b4f4002561 100644 --- a/opencl/test/unit_test/command_queue/enqueue_copy_buffer_to_image_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_copy_buffer_to_image_tests.cpp @@ -18,6 +18,7 @@ #include "opencl/test/unit_test/fixtures/one_mip_level_image_fixture.h" #include "opencl/test/unit_test/gen_common/gen_commands_common_validation.h" #include "opencl/test/unit_test/mocks/mock_buffer.h" +#include "opencl/test/unit_test/mocks/mock_builder.h" #include "opencl/test/unit_test/mocks/mock_builtin_dispatch_info_builder.h" #include "opencl/test/unit_test/mocks/mock_cl_execution_environment.h" #include "opencl/test/unit_test/mocks/mock_command_queue.h" @@ -214,15 +215,23 @@ typedef EnqueueCopyBufferToImageMipMapTest MipMapCopyBufferToImageTest; HWTEST_P(MipMapCopyBufferToImageTest, GivenImageWithMipLevelNonZeroWhenCopyBufferToImageIsCalledThenProperMipLevelIsSet) { auto imageType = (cl_mem_object_type)GetParam(); auto builtIns = new MockBuiltins(); + + auto builtInType = EBuiltInOps::copyBufferToImage3d; + + auto &compilerProductHelper = pDevice->getCompilerProductHelper(); + if (compilerProductHelper.isForceToStatelessRequired()) { + builtInType = EBuiltInOps::copyBufferToImage3dStateless; + } + MockRootDeviceEnvironment::resetBuiltins(pCmdQ->getDevice().getExecutionEnvironment()->rootDeviceEnvironments[pCmdQ->getDevice().getRootDeviceIndex()].get(), builtIns); auto &origBuilder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder( - adjustBuiltInType(pCmdQ->getHeaplessModeEnabled(), EBuiltInOps::copyBufferToImage3d), + adjustBuiltInType(pCmdQ->getHeaplessModeEnabled(), builtInType), pCmdQ->getClDevice()); // substitute original builder with mock builder auto oldBuilder = pClExecutionEnvironment->setBuiltinDispatchInfoBuilder( rootDeviceIndex, - adjustBuiltInType(pCmdQ->getHeaplessModeEnabled(), EBuiltInOps::copyBufferToImage3d), + adjustBuiltInType(pCmdQ->getHeaplessModeEnabled(), builtInType), std::unique_ptr(new MockBuiltinDispatchInfoBuilder(*builtIns, pCmdQ->getClDevice(), &origBuilder))); cl_int retVal = CL_SUCCESS; @@ -274,7 +283,7 @@ HWTEST_P(MipMapCopyBufferToImageTest, GivenImageWithMipLevelNonZeroWhenCopyBuffe EXPECT_EQ(CL_SUCCESS, retVal); - auto &mockBuilder = static_cast(BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(adjustBuiltInType(pCmdQ->getHeaplessModeEnabled(), EBuiltInOps::copyBufferToImage3d), + auto &mockBuilder = static_cast(BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(adjustBuiltInType(pCmdQ->getHeaplessModeEnabled(), builtInType), pCmdQ->getClDevice())); auto params = mockBuilder.getBuiltinOpParams(); @@ -283,7 +292,7 @@ HWTEST_P(MipMapCopyBufferToImageTest, GivenImageWithMipLevelNonZeroWhenCopyBuffe // restore original builder and retrieve mock builder auto newBuilder = pClExecutionEnvironment->setBuiltinDispatchInfoBuilder( rootDeviceIndex, - adjustBuiltInType(pCmdQ->getHeaplessModeEnabled(), EBuiltInOps::copyBufferToImage3d), + adjustBuiltInType(pCmdQ->getHeaplessModeEnabled(), builtInType), std::move(oldBuilder)); EXPECT_NE(nullptr, newBuilder); } @@ -359,7 +368,7 @@ HWTEST_F(EnqueueCopyBufferToImageStatelessTest, givenGpuHangAndBlockingCallAndBi using EnqueueCopyBufferToImageStatefulTest = EnqueueCopyBufferToImageHw; -HWTEST_F(EnqueueCopyBufferToImageStatefulTest, givenBigBufferWhenCopyingBufferToImageStatefulThenSuccessIsReturned) { +HWTEST2_F(EnqueueCopyBufferToImageStatefulTest, givenBigBufferWhenCopyingBufferToImageStatefulThenSuccessIsReturned, IsStatefulBufferPreferredForProduct) { auto cmdQ = std::make_unique>(context.get(), device.get()); if (cmdQ->getHeaplessModeEnabled()) { GTEST_SKIP(); @@ -397,3 +406,47 @@ HWTEST_F(OneMipLevelCopyBufferToImageImageTests, GivenNotMippedImageWhenCopyingB EXPECT_TRUE(builtinOpsParamsCaptured); EXPECT_EQ(0u, usedBuiltinOpsParams.dstMipLevel); } + +HWTEST_F(EnqueueCopyBufferToImageTest, given4gbBufferAndIsForceStatelessIsFalseWhenEnqueueCopyBufferToImageCallThenStatelessIsUsed) { + struct FourGbMockBuffer : MockBuffer { + size_t getSize() const override { return static_cast(4ull * MemoryConstants::gigaByte); } + }; + + REQUIRE_IMAGES_OR_SKIP(defaultHwInfo); + if (is32bit) { + GTEST_SKIP(); + } + + auto mockCmdQ = static_cast *>(pCmdQ); + mockCmdQ->isForceStateless = false; + + EBuiltInOps::Type copyBuiltIn = EBuiltInOps::adjustBuiltinType(true, pCmdQ->getHeaplessModeEnabled()); + + auto builtIns = new MockBuiltins(); + MockRootDeviceEnvironment::resetBuiltins(pCmdQ->getDevice().getExecutionEnvironment()->rootDeviceEnvironments[pCmdQ->getDevice().getRootDeviceIndex()].get(), builtIns); + + // substitute original builder with mock builder + auto oldBuilder = pClExecutionEnvironment->setBuiltinDispatchInfoBuilder( + rootDeviceIndex, + copyBuiltIn, + std::unique_ptr(new MockBuilder(*builtIns, pCmdQ->getClDevice()))); + + FourGbMockBuffer srcBuffer; + + auto mockBuilder = static_cast(&BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder( + copyBuiltIn, + *pClDevice)); + + EXPECT_FALSE(mockBuilder->wasBuildDispatchInfosWithBuiltinOpParamsCalled); + + EnqueueCopyBufferToImageHelper<>::enqueueCopyBufferToImage(pCmdQ, &srcBuffer, dstImage); + + EXPECT_TRUE(mockBuilder->wasBuildDispatchInfosWithBuiltinOpParamsCalled); + + // restore original builder and retrieve mock builder + auto newBuilder = pClExecutionEnvironment->setBuiltinDispatchInfoBuilder( + rootDeviceIndex, + copyBuiltIn, + std::move(oldBuilder)); + EXPECT_EQ(mockBuilder, newBuilder.get()); +} \ No newline at end of file diff --git a/opencl/test/unit_test/command_queue/enqueue_write_image_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_write_image_tests.cpp index 93643d879b..8712cefca7 100644 --- a/opencl/test/unit_test/command_queue/enqueue_write_image_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_write_image_tests.cpp @@ -22,9 +22,11 @@ #include "opencl/test/unit_test/command_queue/enqueue_write_image_fixture.h" #include "opencl/test/unit_test/fixtures/one_mip_level_image_fixture.h" #include "opencl/test/unit_test/gen_common/gen_commands_common_validation.h" +#include "opencl/test/unit_test/mocks/mock_builder.h" #include "opencl/test/unit_test/mocks/mock_builtin_dispatch_info_builder.h" #include "opencl/test/unit_test/mocks/mock_cl_execution_environment.h" #include "opencl/test/unit_test/mocks/mock_command_queue.h" +#include "opencl/test/unit_test/mocks/mock_image.h" using namespace NEO; @@ -267,7 +269,7 @@ HWTEST_F(EnqueueWriteImageTest, GivenImage1DarrayWhenWriteImageIsCalledThenRowPi auto builtIns = new MockBuiltins(); MockRootDeviceEnvironment::resetBuiltins(pCmdQ->getDevice().getExecutionEnvironment()->rootDeviceEnvironments[pCmdQ->getDevice().getRootDeviceIndex()].get(), builtIns); - const bool useStateless = false; + const bool useStateless = pDevice->getCompilerProductHelper().isForceToStatelessRequired(); auto copyBuiltIn = EBuiltInOps::adjustBuiltinType(useStateless, pCmdQ->getHeaplessModeEnabled()); auto &origBuilder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder( @@ -420,7 +422,8 @@ HWTEST_P(MipMapWriteImageTest, GivenImageWithMipLevelNonZeroWhenReadImageIsCalle auto builtIns = new MockBuiltins(); MockRootDeviceEnvironment::resetBuiltins(pCmdQ->getDevice().getExecutionEnvironment()->rootDeviceEnvironments[pCmdQ->getDevice().getRootDeviceIndex()].get(), builtIns); - EBuiltInOps::Type eBuiltInOp = EBuiltInOps::adjustBuiltinType(false, pCmdQ->getHeaplessModeEnabled()); + const bool useStateless = pDevice->getCompilerProductHelper().isForceToStatelessRequired(); + EBuiltInOps::Type eBuiltInOp = EBuiltInOps::adjustBuiltinType(useStateless, pCmdQ->getHeaplessModeEnabled()); auto &origBuilder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder( eBuiltInOp, @@ -999,4 +1002,48 @@ HWTEST_F(WriteImageStagingBufferTest, whenEnqueueStagingWriteImageCalledForMipMa auto res = mockCommandQueueHw.enqueueStagingImageTransfer(CL_COMMAND_WRITE_IMAGE, image.get(), false, origin, region, 4u, pitchSize, ptr, nullptr); EXPECT_EQ(res, CL_SUCCESS); +} + +HWTEST_F(EnqueueWriteImageTest, given4gbBufferAndIsForceStatelessIsFalseWhenEnqueueWriteImageCallThenStatelessIsUsed) { + struct FourGbMockImage : MockImageBase { + size_t getSize() const override { return static_cast(4ull * MemoryConstants::gigaByte); } + }; + + REQUIRE_IMAGES_OR_SKIP(defaultHwInfo); + if (is32bit) { + GTEST_SKIP(); + } + + auto mockCmdQ = static_cast *>(pCmdQ); + mockCmdQ->isForceStateless = false; + + EBuiltInOps::Type copyBuiltIn = EBuiltInOps::adjustBuiltinType(true, pCmdQ->getHeaplessModeEnabled()); + + auto builtIns = new MockBuiltins(); + MockRootDeviceEnvironment::resetBuiltins(pCmdQ->getDevice().getExecutionEnvironment()->rootDeviceEnvironments[pCmdQ->getDevice().getRootDeviceIndex()].get(), builtIns); + + // substitute original builder with mock builder + auto oldBuilder = pClExecutionEnvironment->setBuiltinDispatchInfoBuilder( + rootDeviceIndex, + copyBuiltIn, + std::unique_ptr(new MockBuilder(*builtIns, pCmdQ->getClDevice()))); + + FourGbMockImage dstImage; + + auto mockBuilder = static_cast(&BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder( + copyBuiltIn, + *pClDevice)); + + EXPECT_FALSE(mockBuilder->wasBuildDispatchInfosWithBuiltinOpParamsCalled); + + EnqueueWriteImageHelper<>::enqueueWriteImage(pCmdQ, &dstImage); + + EXPECT_TRUE(mockBuilder->wasBuildDispatchInfosWithBuiltinOpParamsCalled); + + // restore original builder and retrieve mock builder + auto newBuilder = pClExecutionEnvironment->setBuiltinDispatchInfoBuilder( + rootDeviceIndex, + copyBuiltIn, + std::move(oldBuilder)); + EXPECT_EQ(mockBuilder, newBuilder.get()); } \ No newline at end of file