diff --git a/opencl/source/command_queue/cpu_data_transfer_handler.cpp b/opencl/source/command_queue/cpu_data_transfer_handler.cpp
index 29d89b8ec7..46fddbcc6a 100644
--- a/opencl/source/command_queue/cpu_data_transfer_handler.cpp
+++ b/opencl/source/command_queue/cpu_data_transfer_handler.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2021 Intel Corporation
+ * Copyright (C) 2018-2022 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -52,8 +52,8 @@ void *CommandQueue::cpuDataTransferHandler(TransferProperties &transferPropertie
         *eventsRequest.outEvent = outEventObj;
     }
 
-    auto commandStreamReceieverOwnership = getGpgpuCommandStreamReceiver().obtainUniqueOwnership();
     TakeOwnershipWrapper<CommandQueue> queueOwnership(*this);
+    auto commandStreamReceieverOwnership = getGpgpuCommandStreamReceiver().obtainUniqueOwnership();
 
     auto blockQueue = false;
     auto taskLevel = 0u;
@@ -80,8 +80,8 @@ void *CommandQueue::cpuDataTransferHandler(TransferProperties &transferPropertie
                                         eventBuilder);
     }
 
-    queueOwnership.unlock();
     commandStreamReceieverOwnership.unlock();
+    queueOwnership.unlock();
 
     // read/write buffers are always blocking
     if (!blockQueue || transferProperties.blocking) {
diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h
index 016bcded4a..902a3623c4 100644
--- a/opencl/source/command_queue/enqueue_common.h
+++ b/opencl/source/command_queue/enqueue_common.h
@@ -127,7 +127,6 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
 
     TagNodeBase *hwTimeStamps = nullptr;
     CommandStreamReceiver &computeCommandStreamReceiver = getGpgpuCommandStreamReceiver();
-    auto commandStreamReceiverOwnership = computeCommandStreamReceiver.obtainUniqueOwnership();
 
     EventBuilder eventBuilder;
     setupEvent(eventBuilder, event, commandType);
@@ -137,6 +136,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
     std::unique_ptr<KernelOperation> blockedCommandsData;
     std::unique_ptr<PrintfHandler> printfHandler;
     TakeOwnershipWrapper<CommandQueueHw<GfxFamily>> queueOwnership(*this);
+    auto commandStreamReceiverOwnership = computeCommandStreamReceiver.obtainUniqueOwnership();
 
     auto blockQueue = false;
     auto taskLevel = 0u;
@@ -353,8 +353,8 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
         timestampPacketDependencies.moveNodesToNewContainer(*deferredTimestampPackets);
     }
 
-    queueOwnership.unlock();
     commandStreamReceiverOwnership.unlock();
+    queueOwnership.unlock();
 
     if (blocking) {
         auto &builtinOpParams = multiDispatchInfo.peekBuiltinOpParams();
@@ -950,7 +950,7 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueCommandWithoutKernel(
     CompletionStamp completionStamp = {this->taskCount, this->taskLevel, this->flushStamp->peekStamp()};
     bool flushGpgpuCsr = true;
 
-    if ((enqueueProperties.operation == EnqueueProperties::Operation::Blit) && !isGpgpuSubmissionForBcsRequired(false, timestampPacketDependencies)) {
+    if ((enqueueProperties.operation == EnqueueProperties::Operation::Blit) && commandStream == nullptr) {
         flushGpgpuCsr = false;
     } else {
         csrDeps.makeResident(getGpgpuCommandStreamReceiver());
@@ -1155,6 +1155,10 @@ void CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDispat
 
     if (blockQueue) {
         enqueueBlocked(cmdType, nullptr, 0, multiDispatchInfo, timestampPacketDependencies, blockedCommandsData, enqueueProperties, eventsRequest, eventBuilder, nullptr, &bcsCsr);
+
+        if (gpgpuSubmission) {
+            commandStreamReceiverOwnership.unlock();
+        }
     }
 
     timestampPacketDependencies.moveNodesToNewContainer(*deferredTimestampPackets);
diff --git a/opencl/test/unit_test/mt_tests/command_queue/ooq_task_tests_mt.cpp b/opencl/test/unit_test/mt_tests/command_queue/ooq_task_tests_mt.cpp
index e2e45ff7dc..d4e34cd2f2 100644
--- a/opencl/test/unit_test/mt_tests/command_queue/ooq_task_tests_mt.cpp
+++ b/opencl/test/unit_test/mt_tests/command_queue/ooq_task_tests_mt.cpp
@@ -8,6 +8,8 @@
 #include "opencl/test/unit_test/command_queue/enqueue_fixture.h"
 #include "opencl/test/unit_test/fixtures/hello_world_fixture.h"
 
+#include <future>
+
 using namespace NEO;
 
 struct OOQFixtureFactory : public HelloWorldFixtureFactory {
@@ -89,4 +91,76 @@ TEST_F(OOQTaskTestsMt, GivenBlockedOnUserEventWhenEnqueingMarkerThenSuccessIsRet
 
     retVal = clReleaseEvent(userEvent);
     EXPECT_EQ(CL_SUCCESS, retVal);
+}
+
+TEST_F(OOQTaskTestsMt, givenBlitterWhenEnqueueCopyAndKernelUsingMultipleThreadsThenSuccessReturned) { // Stress test: numThreads workers concurrently enqueue a buffer write, a kernel and a buffer read on one out-of-order queue.
+    auto hwInfo = *defaultHwInfo; // local copy; the capability change below is made on this copy
+    hwInfo.capabilityTable.blitterOperationsSupported = true;
+    REQUIRE_FULL_BLITTER_OR_SKIP(&hwInfo); // NOTE(review): presumably skips the test when full blitter support is missing - macro is defined elsewhere
+
+    DebugManagerStateRestore restorer; // NOTE(review): presumably restores the debug flags when it goes out of scope - confirm against its definition
+    DebugManager.flags.EnableBlitterForEnqueueOperations.set(1);
+    DebugManager.flags.DoCpuCopyOnReadBuffer.set(0); // NOTE(review): both CPU-copy flags are set to 0, presumably so transfers are not serviced by a CPU copy path
+    DebugManager.flags.DoCpuCopyOnWriteBuffer.set(0);
+
+    constexpr uint32_t numThreads = 32;
+    std::atomic_uint32_t barrier = numThreads; // countdown used by the workers as a start barrier
+    std::array<std::future<void>, numThreads> threads; // one future per worker, filled in the loop below
+
+    auto device = MockClDevice::createWithNewExecutionEnvironment<MockDevice>(&hwInfo, rootDeviceIndex);
+    MockClDevice clDevice(device);
+    auto cmdQ = createCommandQueue(&clDevice, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE); // single out-of-order queue shared by all workers
+    EXPECT_EQ(cmdQ->taskCount, 0u); // baseline: queue, GPGPU CSR and BCS task counts are all expected to start at zero
+    EXPECT_EQ(cmdQ->getGpgpuCommandStreamReceiver().peekTaskCount(), 0u);
+    EXPECT_EQ(cmdQ->peekBcsTaskCount(aub_stream::EngineType::ENGINE_BCS), 0u);
+    auto buffer = std::unique_ptr<Buffer>(BufferHelper<>::create()); // one buffer object shared by all workers
+
+    for (auto &thread : threads) {
+        thread = std::async(std::launch::async, [&]() { // [&] is safe here: every future is waited on with get() below, before the captured locals go out of scope
+            auto alignedReadPtr = alignedMalloc(BufferDefaults::sizeInBytes, MemoryConstants::cacheLineSize); // per-worker host memory, released at the end of the lambda
+            barrier.fetch_sub(1u); // announce that this worker has arrived
+            while (barrier.load() != 0u) { // spin until every worker has arrived, so the enqueues below start together
+                std::this_thread::yield();
+            }
+
+            auto retVal = EnqueueWriteBufferHelper<>::enqueueWriteBuffer(cmdQ,
+                                                                         buffer.get(),
+                                                                         CL_TRUE, // NOTE(review): presumably the blocking flag - helper signature is not visible here
+                                                                         0,
+                                                                         BufferDefaults::sizeInBytes,
+                                                                         alignedReadPtr, // host memory is not initialized before this write; only the return code is checked
+                                                                         nullptr,
+                                                                         0,
+                                                                         nullptr,
+                                                                         nullptr);
+            EXPECT_EQ(CL_SUCCESS, retVal);
+
+            size_t workSize[] = {64}; // NOTE(review): passed twice below, presumably as both global and local work size
+            retVal = EnqueueKernelHelper<>::enqueueKernel(cmdQ, KernelFixture::pKernel, 1, nullptr, workSize, workSize, 0, nullptr, nullptr);
+            EXPECT_EQ(CL_SUCCESS, retVal);
+
+            retVal = EnqueueReadBufferHelper<>::enqueueReadBuffer(cmdQ,
+                                                                  buffer.get(),
+                                                                  CL_TRUE, // NOTE(review): presumably the blocking flag, as for the write above
+                                                                  0,
+                                                                  BufferDefaults::sizeInBytes,
+                                                                  alignedReadPtr, // the same host memory is reused as the read destination
+                                                                  nullptr,
+                                                                  0,
+                                                                  nullptr,
+                                                                  nullptr);
+            EXPECT_EQ(CL_SUCCESS, retVal);
+
+            alignedFree(alignedReadPtr);
+        });
+    }
+    for (auto &thread : threads) { // wait for every worker to finish before inspecting the counters
+        thread.get();
+    }
+
+    EXPECT_NE(cmdQ->taskCount, 0u); // some work must have been recorded on the queue and on the GPGPU CSR
+    EXPECT_NE(cmdQ->getGpgpuCommandStreamReceiver().peekTaskCount(), 0u);
+    EXPECT_EQ(cmdQ->peekBcsTaskCount(aub_stream::EngineType::ENGINE_BCS), 2 * numThreads); // each worker issues one write and one read, hence 2 * numThreads
+
+    clReleaseCommandQueue(cmdQ); // return value is not checked
 }
\ No newline at end of file