From 2cd6809d1e54e22a16a7cd170668711acb7e230b Mon Sep 17 00:00:00 2001 From: Maciej Dziuban Date: Wed, 10 Feb 2021 17:41:08 +0000 Subject: [PATCH] Do not use blitter for clEnqueueCopyBuffer Signed-off-by: Maciej Dziuban --- opencl/source/command_queue/command_queue.cpp | 7 +++ opencl/source/command_queue/command_queue.h | 1 + opencl/source/command_queue/enqueue_common.h | 6 ++- .../command_queue/command_queue_hw_tests.cpp | 1 + .../unit_test/mem_obj/buffer_bcs_tests.cpp | 53 +++++++++++++++++++ .../test/unit_test/test_files/igdrcl.config | 3 +- .../debug_settings/debug_variables_base.inl | 1 + 7 files changed, 70 insertions(+), 2 deletions(-) diff --git a/opencl/source/command_queue/command_queue.cpp b/opencl/source/command_queue/command_queue.cpp index 2c60231011..e8de6d0da1 100644 --- a/opencl/source/command_queue/command_queue.cpp +++ b/opencl/source/command_queue/command_queue.cpp @@ -698,6 +698,13 @@ bool CommandQueue::blitEnqueueAllowed(cl_command_type cmdType) const { } } +bool CommandQueue::blitEnqueuePreferred(cl_command_type cmdType) const { + if (cmdType == CL_COMMAND_COPY_BUFFER) { + return DebugManager.flags.PreferCopyEngineForCopyBufferToBuffer.get() == 1; + } + return true; +} + bool CommandQueue::blitEnqueueImageAllowed(const size_t *origin, const size_t *region) { auto blitEnqueuImageAllowed = false; diff --git a/opencl/source/command_queue/command_queue.h b/opencl/source/command_queue/command_queue.h index 37b386f186..bb7dc31284 100644 --- a/opencl/source/command_queue/command_queue.h +++ b/opencl/source/command_queue/command_queue.h @@ -353,6 +353,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> { void providePerformanceHint(TransferProperties &transferProperties); bool queueDependenciesClearRequired() const; bool blitEnqueueAllowed(cl_command_type cmdType) const; + bool blitEnqueuePreferred(cl_command_type cmdType) const; MOCKABLE_VIRTUAL bool blitEnqueueImageAllowed(const size_t *origin, const size_t *region); void aubCaptureHook(bool &blocking, bool &clearAllDependencies, const MultiDispatchInfo &multiDispatchInfo); virtual bool obtainTimestampPacketForCacheFlush(bool isCacheFlushRequired) const = 0; diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index 45db8b21cc..ba0335b785 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -1136,7 +1136,11 @@ void CommandQueueHw::enqueueBlit(const MultiDispatchInfo &multiDispat template template void CommandQueueHw::dispatchBcsOrGpgpuEnqueue(MultiDispatchInfo &dispatchInfo, Surface *(&surfaces)[surfaceCount], EBuiltInOps::Type builtInOperation, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, bool blocking, bool blitAllowed) { - if (blitAllowed) { + const bool blitPreferred = blitEnqueuePreferred(cmdType); + const bool blitRequired = isCopyOnly; + const bool blit = blitAllowed && (blitPreferred || blitRequired); + + if (blit) { enqueueBlit(dispatchInfo, numEventsInWaitList, eventWaitList, event, blocking); } else { auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInOperation, diff --git a/opencl/test/unit_test/command_queue/command_queue_hw_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_hw_tests.cpp index c4cebc5284..19801c5318 100644 --- a/opencl/test/unit_test/command_queue/command_queue_hw_tests.cpp +++ b/opencl/test/unit_test/command_queue/command_queue_hw_tests.cpp @@ -1368,6 +1368,7 @@ struct CommandQueueHwBlitTest : ClDeviceFixture, ContextFixture, CommandQueueHwF DebugManager.flags.EnableBlitterOperationsSupport.set(1); DebugManager.flags.EnableTimestampPacket.set(1); + DebugManager.flags.PreferCopyEngineForCopyBufferToBuffer.set(1); ClDeviceFixture::SetUp(); pDevice->getRootDeviceEnvironment().getMutableHardwareInfo()->capabilityTable.blitterOperationsSupported = true; cl_device_id device = pClDevice; diff --git a/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp b/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp index ec5dbad6fd..e4176c7a01 100644 --- a/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp +++ b/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp @@ -95,6 +95,7 @@ struct BcsBufferTests : public ::testing::Test { DebugManager.flags.EnableTimestampPacket.set(1); DebugManager.flags.EnableBlitterForEnqueueOperations.set(1); DebugManager.flags.ForceGpgpuSubmissionForBcsEnqueue.set(1); + DebugManager.flags.PreferCopyEngineForCopyBufferToBuffer.set(1); device = std::make_unique(MockDevice::createWithNewExecutionEnvironment(nullptr)); auto &capabilityTable = device->getRootDeviceEnvironment().getMutableHardwareInfo()->capabilityTable; bool createBcsEngine = !capabilityTable.blitterOperationsSupported; @@ -1327,3 +1328,55 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenBlockedEnqueueWhenUsingBcsThenWaitForVal cmdQ->finish(); EXPECT_EQ(1u, myMockCsr->waitForTaskCountAndCleanAllocationListCalled); } + +HWTEST_TEMPLATED_F(BcsBufferTests, givenDebugFlagSetToOneWhenEnqueueingCopyBufferToBufferThenUseBlitter) { + auto bcsCsr = static_cast *>(commandQueue->getBcsCommandStreamReceiver()); + auto bufferForBlt0 = clUniquePtr(Buffer::create(bcsMockContext.get(), CL_MEM_READ_WRITE, 1, nullptr, retVal)); + auto bufferForBlt1 = clUniquePtr(Buffer::create(bcsMockContext.get(), CL_MEM_READ_WRITE, 1, nullptr, retVal)); + + DebugManager.flags.PreferCopyEngineForCopyBufferToBuffer.set(-1); + EXPECT_EQ(0u, bcsCsr->blitBufferCalled); + commandQueue->enqueueCopyBuffer(bufferForBlt0.get(), bufferForBlt1.get(), 0, 1, 1, 0, nullptr, nullptr); + EXPECT_EQ(0u, bcsCsr->blitBufferCalled); + + DebugManager.flags.PreferCopyEngineForCopyBufferToBuffer.set(0); + EXPECT_EQ(0u, bcsCsr->blitBufferCalled); + commandQueue->enqueueCopyBuffer(bufferForBlt0.get(), bufferForBlt1.get(), 0, 1, 1, 0, nullptr, nullptr); + EXPECT_EQ(0u, bcsCsr->blitBufferCalled); + + DebugManager.flags.PreferCopyEngineForCopyBufferToBuffer.set(1); + EXPECT_EQ(0u, bcsCsr->blitBufferCalled); + commandQueue->enqueueCopyBuffer(bufferForBlt0.get(), bufferForBlt1.get(), 0, 1, 1, 0, nullptr, nullptr); + EXPECT_EQ(1u, bcsCsr->blitBufferCalled); +} + +HWTEST_TEMPLATED_F(BcsBufferTests, givenBcsQueueWhenEnqueueingCopyBufferToBufferThenUseBlitterRegardlessOfPreference) { + REQUIRE_BLITTER_OR_SKIP(&device->getDevice().getHardwareInfo()); + + cl_command_queue_properties properties[] = { + CL_QUEUE_FAMILY_INTEL, + device->getDevice().getIndexOfNonEmptyEngineGroup(EngineGroupType::Copy), + CL_QUEUE_INDEX_INTEL, + 0, + 0, + }; + MockCommandQueueHw queue(bcsMockContext.get(), device.get(), properties); + auto bcsCsr = static_cast *>(queue.getBcsCommandStreamReceiver()); + auto bufferForBlt0 = clUniquePtr(Buffer::create(bcsMockContext.get(), CL_MEM_READ_WRITE, 1, nullptr, retVal)); + auto bufferForBlt1 = clUniquePtr(Buffer::create(bcsMockContext.get(), CL_MEM_READ_WRITE, 1, nullptr, retVal)); + + DebugManager.flags.PreferCopyEngineForCopyBufferToBuffer.set(-1); + EXPECT_EQ(0u, bcsCsr->blitBufferCalled); + queue.enqueueCopyBuffer(bufferForBlt0.get(), bufferForBlt1.get(), 0, 1, 1, 0, nullptr, nullptr); + EXPECT_EQ(1u, bcsCsr->blitBufferCalled); + + DebugManager.flags.PreferCopyEngineForCopyBufferToBuffer.set(0); + EXPECT_EQ(1u, bcsCsr->blitBufferCalled); + queue.enqueueCopyBuffer(bufferForBlt0.get(), bufferForBlt1.get(), 0, 1, 1, 0, nullptr, nullptr); + EXPECT_EQ(2u, bcsCsr->blitBufferCalled); + + DebugManager.flags.PreferCopyEngineForCopyBufferToBuffer.set(1); + EXPECT_EQ(2u, bcsCsr->blitBufferCalled); + queue.enqueueCopyBuffer(bufferForBlt0.get(), bufferForBlt1.get(), 0, 1, 1, 0, nullptr, nullptr); + EXPECT_EQ(3u, bcsCsr->blitBufferCalled); +} diff --git a/opencl/test/unit_test/test_files/igdrcl.config b/opencl/test/unit_test/test_files/igdrcl.config index 2ac1e2d79b..f0604a83e6 100644 --- a/opencl/test/unit_test/test_files/igdrcl.config +++ b/opencl/test/unit_test/test_files/igdrcl.config @@ -210,4 +210,5 @@ EnableMockSourceLevelDebugger = 0 EnableHostPointerImport = -1 EnableHostUsmSupport = -1 ForceBtpPrefetchMode = -1 -OverrideProfilingTimerResolution = -1 \ No newline at end of file +OverrideProfilingTimerResolution = -1 +PreferCopyEngineForCopyBufferToBuffer = -1 \ No newline at end of file diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index 5ac008391f..479e9cc9a4 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -218,6 +218,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, UseKmdMigration, 0, "-1: devices default mode, 0 DECLARE_DEBUG_VARIABLE(int32_t, ForceSemaphoreDelayBetweenWaits, -1, "Specifies the minimum number of microseconds allowed for command streamer to wait before re-fetching the data. 0 - poll interval will be equal to the memory latency of the read completion") DECLARE_DEBUG_VARIABLE(int32_t, ForceLocalMemoryAccessMode, -1, "-1: don't override, 0: default rules apply, 1: CPU can access local memory, 3: CPU never accesses local memory") DECLARE_DEBUG_VARIABLE(int32_t, ForceUserptrAlignment, -1, "-1: no force (4kb), >0: n kb alignment") +DECLARE_DEBUG_VARIABLE(int32_t, PreferCopyEngineForCopyBufferToBuffer, -1, "-1: default, 0: prefer EUs, 1: prefer blitter") DECLARE_DEBUG_VARIABLE(int64_t, ForceSystemMemoryPlacement, 0, "0: default, >0: (bitmask) for given Graphics Allocation Type, force system memory placement") DECLARE_DEBUG_VARIABLE(int64_t, ForceNonSystemMemoryPlacement, 0, "0: default, >0: (bitmask) for given Graphics Allocation Type, force non-system memory placement") DECLARE_DEBUG_VARIABLE(int64_t, DisableIndirectAccess, -1, "0: default, 0: Indirect access for L0 kernels is enabled, 1: Read IGC experimental properties to determine whether indirect access is required")