From 61510e9a92ae8b67f6e204d2ab931269bc2ed542 Mon Sep 17 00:00:00 2001 From: "Cencelewska, Katarzyna" Date: Wed, 3 Aug 2022 22:41:23 +0000 Subject: [PATCH] Revert optimization of gpgpu csr's mutex lock in the enqueue blit optimization available under flag ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission Related-To: NEO-7011 Signed-off-by: Cencelewska, Katarzyna --- .../source/command_queue/command_queue_hw.h | 2 +- opencl/source/command_queue/enqueue_common.h | 19 ++++- .../command_queue/blit_enqueue_2_tests.cpp | 75 +++++++++++++++++++ .../test/unit_test/mocks/mock_command_queue.h | 14 ++++ .../debug_settings/debug_variables_base.inl | 1 + shared/test/common/test_files/igdrcl.config | 3 +- 6 files changed, 108 insertions(+), 6 deletions(-) diff --git a/opencl/source/command_queue/command_queue_hw.h b/opencl/source/command_queue/command_queue_hw.h index 1672c32a20..4798c9f273 100644 --- a/opencl/source/command_queue/command_queue_hw.h +++ b/opencl/source/command_queue/command_queue_hw.h @@ -497,7 +497,7 @@ class CommandQueueHw : public CommandQueue { KernelOperation *blockedCommandsData, TimestampPacketDependencies ×tampPacketDependencies); - bool isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies ×tampPacketDependencies) const; + MOCKABLE_VIRTUAL bool isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies ×tampPacketDependencies) const; void setupEvent(EventBuilder &eventBuilder, cl_event *outEvent, uint32_t cmdType); bool isBlitAuxTranslationRequired(const MultiDispatchInfo &multiDispatchInfo); diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index a8f4a851f3..853af1d298 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -1117,6 +1117,9 @@ cl_int CommandQueueHw::enqueueBlit(const MultiDispatchInfo &multiDisp std::unique_ptr blockedCommandsData; TakeOwnershipWrapper> queueOwnership(*this); + if (DebugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.get() != 1) { + commandStreamReceiverOwnership = getGpgpuCommandStreamReceiver().obtainUniqueOwnership(); + } auto blockQueue = false; auto taskLevel = 0u; @@ -1165,7 +1168,9 @@ cl_int CommandQueueHw::enqueueBlit(const MultiDispatchInfo &multiDisp LinearStream *gpgpuCommandStream = {}; size_t gpgpuCommandStreamStart = {}; if (gpgpuSubmission) { - commandStreamReceiverOwnership = getGpgpuCommandStreamReceiver().obtainUniqueOwnership(); + if (DebugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.get() == 1) { + commandStreamReceiverOwnership = getGpgpuCommandStreamReceiver().obtainUniqueOwnership(); + } gpgpuCommandStream = obtainCommandStream(csrDeps, true, blockQueue, multiDispatchInfo, eventsRequest, blockedCommandsData, nullptr, 0, false); gpgpuCommandStreamStart = gpgpuCommandStream->getUsed(); } @@ -1182,7 +1187,9 @@ cl_int CommandQueueHw::enqueueBlit(const MultiDispatchInfo &multiDisp } if (gpgpuSubmission) { - commandStreamReceiverOwnership.unlock(); + if (DebugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.get() == 1) { + commandStreamReceiverOwnership.unlock(); + } } if (eventBuilder.getEvent()) { @@ -1199,13 +1206,17 @@ cl_int CommandQueueHw::enqueueBlit(const MultiDispatchInfo &multiDisp enqueueBlocked(cmdType, nullptr, 0, multiDispatchInfo, timestampPacketDependencies, blockedCommandsData, enqueueProperties, eventsRequest, eventBuilder, nullptr, &bcsCsr); if (gpgpuSubmission) { - commandStreamReceiverOwnership.unlock(); + if (DebugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.get() == 1) { + commandStreamReceiverOwnership.unlock(); + } } } timestampPacketDependencies.moveNodesToNewContainer(*deferredTimestampPackets); csrDeps.copyNodesToNewContainer(*deferredTimestampPackets); - + if (DebugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.get() != 1) { + commandStreamReceiverOwnership.unlock(); + } queueOwnership.unlock(); bcsCommandStreamReceiverOwnership.unlock(); diff --git a/opencl/test/unit_test/command_queue/blit_enqueue_2_tests.cpp b/opencl/test/unit_test/command_queue/blit_enqueue_2_tests.cpp index b2dd3bc0ff..c37cfe8131 100644 --- a/opencl/test/unit_test/command_queue/blit_enqueue_2_tests.cpp +++ b/opencl/test/unit_test/command_queue/blit_enqueue_2_tests.cpp @@ -390,6 +390,81 @@ HWTEST_TEMPLATED_F(BlitEnqueueWithDisabledGpgpuSubmissionTests, givenSubmissionT } } +using BlitEnqueueForceFlagsTests = BlitEnqueueTests<1>; +HWTEST_TEMPLATED_F(BlitEnqueueForceFlagsTests, givenFlagsToForceCsrLockAndNonBlockedQueueWhenEnqueueBlitThenLockAreSetCorrectly) { + using CsrType = UltCommandStreamReceiver; + auto mockCommandQueue = static_cast *>(commandQueue.get()); + auto mockCsr = static_cast(&mockCommandQueue->getGpgpuCommandStreamReceiver()); + + auto buffer = createBuffer(1, false); + buffer->forceDisallowCPUCopy = true; + mockCommandQueue->setQueueBlocked = false; + int hostPtr = 0; + { + DebugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.set(-1); + DebugManager.flags.ForceGpgpuSubmissionForBcsEnqueue.set(-1); + mockCsr->recursiveLockCounter = 0u; + mockCommandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr); + EXPECT_EQ(1u, mockCsr->recursiveLockCounter); + } + { + DebugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.set(-1); + DebugManager.flags.ForceGpgpuSubmissionForBcsEnqueue.set(1); + mockCsr->recursiveLockCounter = 0u; + mockCommandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr); + EXPECT_EQ(1u, mockCsr->recursiveLockCounter); + } + { + DebugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.set(1); + DebugManager.flags.ForceGpgpuSubmissionForBcsEnqueue.set(-1); + mockCsr->recursiveLockCounter = 0u; + mockCommandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr); + EXPECT_EQ(0u, mockCsr->recursiveLockCounter); + } + { + DebugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.set(1); + DebugManager.flags.ForceGpgpuSubmissionForBcsEnqueue.set(1); + mockCsr->recursiveLockCounter = 0u; + mockCommandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr); + EXPECT_EQ(1u, mockCsr->recursiveLockCounter); + } +} + +HWTEST_TEMPLATED_F(BlitEnqueueForceFlagsTests, givenFlagToForceCsrLockAndBlockedQueueWhenGpgpuSubmissionForBcsNotRequiredAndCallEnqueueBlitThenLockAreSetCorrectly) { + using CsrType = UltCommandStreamReceiver; + auto mockCommandQueue = static_cast *>(commandQueue.get()); + auto mockCsr = static_cast(&mockCommandQueue->getGpgpuCommandStreamReceiver()); + + auto buffer = createBuffer(1, false); + buffer->forceDisallowCPUCopy = true; + int hostPtr = 0; + + DebugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.set(1); + + mockCsr->recursiveLockCounter = 0u; + mockCommandQueue->setQueueBlocked = true; + mockCommandQueue->forceGpgpuSubmissionForBcsRequired = 0; + mockCommandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr); + EXPECT_EQ(0u, mockCsr->recursiveLockCounter); +} +HWTEST_TEMPLATED_F(BlitEnqueueForceFlagsTests, givenFlagToForceCsrLockAndBlockedQueueWhenGpgpuSubmissionForBcsRequiredAndCallEnqueueBlitThenLockAreSetCorrectly) { + using CsrType = UltCommandStreamReceiver; + auto mockCommandQueue = static_cast *>(commandQueue.get()); + auto mockCsr = static_cast(&mockCommandQueue->getGpgpuCommandStreamReceiver()); + + auto buffer = createBuffer(1, false); + buffer->forceDisallowCPUCopy = true; + int hostPtr = 0; + + DebugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.set(1); + + mockCsr->recursiveLockCounter = 0u; + mockCommandQueue->setQueueBlocked = true; + mockCommandQueue->forceGpgpuSubmissionForBcsRequired = 1; + mockCommandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr); + EXPECT_EQ(1u, mockCsr->recursiveLockCounter); +} + using BlitCopyTests = BlitEnqueueTests<1>; HWTEST_TEMPLATED_F(BlitCopyTests, givenKernelAllocationInLocalMemoryWhenCreatingWithoutAllowedCpuAccessThenUseBcsForTransfer) { diff --git a/opencl/test/unit_test/mocks/mock_command_queue.h b/opencl/test/unit_test/mocks/mock_command_queue.h index 26288b029a..269f16809d 100644 --- a/opencl/test/unit_test/mocks/mock_command_queue.h +++ b/opencl/test/unit_test/mocks/mock_command_queue.h @@ -382,6 +382,18 @@ class MockCommandQueueHw : public CommandQueueHw { isBlitEnqueueImageAllowed = BaseClass::blitEnqueueImageAllowed(origin, region, image); return isBlitEnqueueImageAllowed; } + bool isQueueBlocked() override { + if (setQueueBlocked != -1) { + return setQueueBlocked; + } + return BaseClass::isQueueBlocked(); + } + bool isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies ×tampPacketDependencies) const override { + if (forceGpgpuSubmissionForBcsRequired != -1) { + return forceGpgpuSubmissionForBcsRequired; + } + return BaseClass::isGpgpuSubmissionForBcsRequired(queueBlocked, timestampPacketDependencies); + } unsigned int lastCommandType; std::vector lastEnqueuedKernels; @@ -396,6 +408,8 @@ class MockCommandQueueHw : public CommandQueueHw { bool notifyEnqueueSVMMemcpyCalled = false; bool cpuDataTransferHandlerCalled = false; bool useBcsCsrOnNotifyEnabled = false; + int setQueueBlocked = -1; + int forceGpgpuSubmissionForBcsRequired = -1; mutable bool isBlitEnqueueImageAllowed = false; struct OverrideReturnValue { bool enabled = false; diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index dab01b4c19..576a49c834 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -112,6 +112,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, OverrideLeastOccupiedBank, -1, "-1: default, >= DECLARE_DEBUG_VARIABLE(int32_t, OverrideRevision, -1, "-1: default, >=0: Revision id") DECLARE_DEBUG_VARIABLE(int32_t, ForceCacheFlushForBcs, -1, "Force cache flush from gpgpu engine before dispatching BCS copy. -1: default, 1: enabled, 0: disabled") DECLARE_DEBUG_VARIABLE(int32_t, ForceGpgpuSubmissionForBcsEnqueue, -1, "-1: Default, 1: Submit gpgpu command buffer with cache flushing and completion synchronization, 0: Do nothing, if possible") +DECLARE_DEBUG_VARIABLE(int32_t, ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission, -1, "-1: Default, 1: Force gpgpu command stream receiver lock for bcs enqueue only when gpgpu submission, 0: Do nothing, if possible") DECLARE_DEBUG_VARIABLE(int32_t, EnableUsmCompression, -1, "enable compression support for L0 USM Device and Shared Device side: -1 default, 0: disable, 1: enable") DECLARE_DEBUG_VARIABLE(int32_t, EnableHostUsmSupport, -1, "-1: default, 0: disable, 1: enable, Enables USM host memory") DECLARE_DEBUG_VARIABLE(int32_t, MediaVfeStateMaxSubSlices, -1, ">=0: Programs Media Vfe State Maximum Number of Dual-Subslices to given value ") diff --git a/shared/test/common/test_files/igdrcl.config b/shared/test/common/test_files/igdrcl.config index afe1a85937..d96107ab19 100644 --- a/shared/test/common/test_files/igdrcl.config +++ b/shared/test/common/test_files/igdrcl.config @@ -440,4 +440,5 @@ OverrideL1CachePolicyInSurfaceStateAndStateless = -1 EnableBcsSwControlWa = -1 ExperimentalEnableL0DebuggerForOpenCL = 0 DebuggerDisableSingleAddressSbaTracking = 0 -ForceImagesSupport = -1 \ No newline at end of file +ForceImagesSupport = -1 +ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission = -1 \ No newline at end of file