diff --git a/opencl/source/command_queue/cpu_data_transfer_handler.cpp b/opencl/source/command_queue/cpu_data_transfer_handler.cpp index 29d89b8ec7..46fddbcc6a 100644 --- a/opencl/source/command_queue/cpu_data_transfer_handler.cpp +++ b/opencl/source/command_queue/cpu_data_transfer_handler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2021 Intel Corporation + * Copyright (C) 2018-2022 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -52,8 +52,8 @@ void *CommandQueue::cpuDataTransferHandler(TransferProperties &transferPropertie *eventsRequest.outEvent = outEventObj; } - auto commandStreamReceieverOwnership = getGpgpuCommandStreamReceiver().obtainUniqueOwnership(); TakeOwnershipWrapper queueOwnership(*this); + auto commandStreamReceieverOwnership = getGpgpuCommandStreamReceiver().obtainUniqueOwnership(); auto blockQueue = false; auto taskLevel = 0u; @@ -80,8 +80,8 @@ void *CommandQueue::cpuDataTransferHandler(TransferProperties &transferPropertie eventBuilder); } - queueOwnership.unlock(); commandStreamReceieverOwnership.unlock(); + queueOwnership.unlock(); // read/write buffers are always blocking if (!blockQueue || transferProperties.blocking) { diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index 016bcded4a..902a3623c4 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -127,7 +127,6 @@ void CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, TagNodeBase *hwTimeStamps = nullptr; CommandStreamReceiver &computeCommandStreamReceiver = getGpgpuCommandStreamReceiver(); - auto commandStreamReceiverOwnership = computeCommandStreamReceiver.obtainUniqueOwnership(); EventBuilder eventBuilder; setupEvent(eventBuilder, event, commandType); @@ -137,6 +136,7 @@ void CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, std::unique_ptr blockedCommandsData; std::unique_ptr printfHandler; TakeOwnershipWrapper> queueOwnership(*this); + 
auto commandStreamReceiverOwnership = computeCommandStreamReceiver.obtainUniqueOwnership(); auto blockQueue = false; auto taskLevel = 0u; @@ -353,8 +353,8 @@ void CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, timestampPacketDependencies.moveNodesToNewContainer(*deferredTimestampPackets); } - queueOwnership.unlock(); commandStreamReceiverOwnership.unlock(); + queueOwnership.unlock(); if (blocking) { auto &builtinOpParams = multiDispatchInfo.peekBuiltinOpParams(); @@ -950,7 +950,7 @@ CompletionStamp CommandQueueHw::enqueueCommandWithoutKernel( CompletionStamp completionStamp = {this->taskCount, this->taskLevel, this->flushStamp->peekStamp()}; bool flushGpgpuCsr = true; - if ((enqueueProperties.operation == EnqueueProperties::Operation::Blit) && !isGpgpuSubmissionForBcsRequired(false, timestampPacketDependencies)) { + if ((enqueueProperties.operation == EnqueueProperties::Operation::Blit) && commandStream == nullptr) { flushGpgpuCsr = false; } else { csrDeps.makeResident(getGpgpuCommandStreamReceiver()); @@ -1155,6 +1155,10 @@ void CommandQueueHw::enqueueBlit(const MultiDispatchInfo &multiDispat if (blockQueue) { enqueueBlocked(cmdType, nullptr, 0, multiDispatchInfo, timestampPacketDependencies, blockedCommandsData, enqueueProperties, eventsRequest, eventBuilder, nullptr, &bcsCsr); + + if (gpgpuSubmission) { + commandStreamReceiverOwnership.unlock(); + } } timestampPacketDependencies.moveNodesToNewContainer(*deferredTimestampPackets); diff --git a/opencl/test/unit_test/mt_tests/command_queue/ooq_task_tests_mt.cpp b/opencl/test/unit_test/mt_tests/command_queue/ooq_task_tests_mt.cpp index e2e45ff7dc..d4e34cd2f2 100644 --- a/opencl/test/unit_test/mt_tests/command_queue/ooq_task_tests_mt.cpp +++ b/opencl/test/unit_test/mt_tests/command_queue/ooq_task_tests_mt.cpp @@ -8,6 +8,8 @@ #include "opencl/test/unit_test/command_queue/enqueue_fixture.h" #include "opencl/test/unit_test/fixtures/hello_world_fixture.h" +#include <future> + using namespace NEO; struct
OOQFixtureFactory : public HelloWorldFixtureFactory {
@@ -89,4 +91,85 @@ TEST_F(OOQTaskTestsMt, GivenBlockedOnUserEventWhenEnqueingMarkerThenSuccessIsRet
     retVal = clReleaseEvent(userEvent);
     EXPECT_EQ(CL_SUCCESS, retVal);
+}
+
+// Stress test: numThreads asynchronous workers share one command queue created with
+// CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE. Each worker enqueues a buffer write, a kernel and a
+// buffer read, and every enqueue is expected to return CL_SUCCESS.
+TEST_F(OOQTaskTestsMt, givenBlitterWhenEnqueueCopyAndKernelUsingMultipleThreadsThenSuccessReturned) {
+    auto hwInfo = *defaultHwInfo;
+    hwInfo.capabilityTable.blitterOperationsSupported = true;
+    REQUIRE_FULL_BLITTER_OR_SKIP(&hwInfo);
+
+    // Going by the flag names, this enables the blitter for enqueue operations and turns off
+    // CPU copies for buffer reads and writes - NOTE(review): confirm against the flag definitions.
+    DebugManagerStateRestore restorer;
+    DebugManager.flags.EnableBlitterForEnqueueOperations.set(1);
+    DebugManager.flags.DoCpuCopyOnReadBuffer.set(0);
+    DebugManager.flags.DoCpuCopyOnWriteBuffer.set(0);
+
+    constexpr uint32_t numThreads = 32;
+    std::atomic_uint32_t barrier = numThreads;
+    std::array<std::future<void>, numThreads> threads;
+
+    auto device = MockClDevice::createWithNewExecutionEnvironment<MockDevice>(&hwInfo, rootDeviceIndex);
+    MockClDevice clDevice(device);
+    auto cmdQ = createCommandQueue(&clDevice, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE);
+    EXPECT_EQ(cmdQ->taskCount, 0u);
+    EXPECT_EQ(cmdQ->getGpgpuCommandStreamReceiver().peekTaskCount(), 0u);
+    EXPECT_EQ(cmdQ->peekBcsTaskCount(aub_stream::EngineType::ENGINE_BCS), 0u);
+    auto buffer = std::unique_ptr<Buffer>(BufferHelper<>::create());
+
+    for (auto &thread : threads) {
+        thread = std::async(std::launch::async, [&]() {
+            auto alignedReadPtr = alignedMalloc(BufferDefaults::sizeInBytes, MemoryConstants::cacheLineSize);
+            // Start gate: every worker decrements the counter, then spins until it reaches zero,
+            // so no worker enqueues anything before all of them are running.
+            barrier.fetch_sub(1u);
+            while (barrier.load() != 0u) {
+                std::this_thread::yield();
+            }
+
+            auto retVal = EnqueueWriteBufferHelper<>::enqueueWriteBuffer(cmdQ,
+                                                                         buffer.get(),
+                                                                         CL_TRUE,
+                                                                         0,
+                                                                         BufferDefaults::sizeInBytes,
+                                                                         alignedReadPtr,
+                                                                         nullptr,
+                                                                         0,
+                                                                         nullptr,
+                                                                         nullptr);
+            EXPECT_EQ(CL_SUCCESS, retVal);
+
+            size_t workSize[] = {64};
+            retVal = EnqueueKernelHelper<>::enqueueKernel(cmdQ, KernelFixture::pKernel, 1, nullptr, workSize, workSize, 0, nullptr, nullptr);
+            EXPECT_EQ(CL_SUCCESS, retVal);
+
+            retVal = EnqueueReadBufferHelper<>::enqueueReadBuffer(cmdQ,
+                                                                  buffer.get(),
+                                                                  CL_TRUE,
+                                                                  0,
+                                                                  BufferDefaults::sizeInBytes,
+                                                                  alignedReadPtr,
+                                                                  nullptr,
+                                                                  0,
+                                                                  nullptr,
+                                                                  nullptr);
+            EXPECT_EQ(CL_SUCCESS, retVal);
+
+            alignedFree(alignedReadPtr);
+        });
+    }
+    // Block until every worker has finished before inspecting the task counts.
+    for (auto &thread : threads) {
+        thread.get();
+    }
+
+    EXPECT_NE(cmdQ->taskCount, 0u);
+    EXPECT_NE(cmdQ->getGpgpuCommandStreamReceiver().peekTaskCount(), 0u);
+    // Each worker issued one buffer write and one buffer read, hence 2 * numThreads BCS tasks.
+    EXPECT_EQ(cmdQ->peekBcsTaskCount(aub_stream::EngineType::ENGINE_BCS), 2 * numThreads);
+
+    clReleaseCommandQueue(cmdQ);
 }
\ No newline at end of file