diff --git a/opencl/source/command_queue/command_queue.cpp b/opencl/source/command_queue/command_queue.cpp index 26212bfc04..a4f169c7d8 100644 --- a/opencl/source/command_queue/command_queue.cpp +++ b/opencl/source/command_queue/command_queue.cpp @@ -1293,7 +1293,7 @@ WaitStatus CommandQueue::waitForAllEngines(bool blockedQueue, PrintfHandler *pri waitStatus = waitUntilComplete(taskCount, activeBcsStates, flushStamp->peekStamp(), false, cleanTemporaryAllocationsList, waitedOnTimestamps); - tryReleaseDeferredNodes(false); + handlePostCompletionOperations(false); if (printfHandler) { if (!printfHandler->printEnqueueOutput()) { @@ -1381,13 +1381,15 @@ bool CommandQueue::migrateMultiGraphicsAllocationsIfRequired(const BuiltinOpPara return migrationHandled; } -void CommandQueue::tryReleaseDeferredNodes(bool checkEventsState) { +void CommandQueue::handlePostCompletionOperations(bool checkQueueCompletion) { TakeOwnershipWrapper queueOwnership(*this); - if (checkEventsState && !isCompleted(this->taskCount, this->bcsStates)) { + if (checkQueueCompletion && !isCompleted(this->taskCount, this->bcsStates)) { return; } + unregisterGpgpuAndBcsCsrClients(); + TimestampPacketContainer nodesToRelease; if (deferredTimestampPackets) { deferredTimestampPackets->swapNodes(nodesToRelease); @@ -1398,4 +1400,51 @@ void CommandQueue::tryReleaseDeferredNodes(bool checkEventsState) { } } +void CommandQueue::registerGpgpuCsrClient() { + if (!gpgpuCsrClientRegistered) { + gpgpuCsrClientRegistered = true; + + getGpgpuCommandStreamReceiver().registerClient(); + } +} + +void CommandQueue::registerBcsCsrClient(CommandStreamReceiver &bcsCsr) { + auto engineType = bcsCsr.getOsContext().getEngineType(); + + auto &bcsState = bcsStates[EngineHelpers::getBcsIndex(engineType)]; + + if (!bcsState.csrClientRegistered) { + bcsState.csrClientRegistered = true; + bcsCsr.registerClient(); + } +} + +void CommandQueue::unregisterGpgpuCsrClient() { + if (gpgpuCsrClientRegistered) { + 
gpgpuEngine->commandStreamReceiver->unregisterClient(); + gpgpuCsrClientRegistered = false; + } +} + +void CommandQueue::unregisterBcsCsrClient(CommandStreamReceiver &bcsCsr) { + auto engineType = bcsCsr.getOsContext().getEngineType(); + + auto &bcsState = bcsStates[EngineHelpers::getBcsIndex(engineType)]; + + if (bcsState.isValid() && bcsState.csrClientRegistered) { + bcsCsr.unregisterClient(); + bcsState.csrClientRegistered = false; + } +} + +void CommandQueue::unregisterGpgpuAndBcsCsrClients() { + unregisterGpgpuCsrClient(); + + for (auto &engine : this->bcsEngines) { + if (engine) { + unregisterBcsCsrClient(*engine->commandStreamReceiver); + } + } +} + } // namespace NEO diff --git a/opencl/source/command_queue/command_queue.h b/opencl/source/command_queue/command_queue.h index 2a673479eb..d9b469c5c7 100644 --- a/opencl/source/command_queue/command_queue.h +++ b/opencl/source/command_queue/command_queue.h @@ -377,7 +377,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> { const std::array &peekActiveBcsStates() const { return bcsStates; } - void tryReleaseDeferredNodes(bool checkEventsState); + void handlePostCompletionOperations(bool checkQueueCompletion); protected: void *enqueueReadMemObjForMap(TransferProperties &transferProperties, EventsRequest &eventsRequest, cl_int &errcodeRet); @@ -414,6 +414,14 @@ class CommandQueue : public BaseObject<_cl_command_queue> { virtual bool obtainTimestampPacketForCacheFlush(bool isCacheFlushRequired) const = 0; void assignDataToOverwrittenBcsNode(TagNodeBase *node); + void registerGpgpuCsrClient(); + void registerBcsCsrClient(CommandStreamReceiver &bcsCsr); + + void unregisterGpgpuCsrClient(); + void unregisterBcsCsrClient(CommandStreamReceiver &bcsCsr); + + void unregisterGpgpuAndBcsCsrClients(); + Context *context = nullptr; ClDevice *device = nullptr; mutable EngineControl *gpgpuEngine = nullptr; @@ -463,6 +471,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> { bool 
stallingCommandsOnNextFlushRequired = false; bool dcFlushRequiredOnStallingCommandsOnNextFlush = false; bool splitBarrierRequired = false; + bool gpgpuCsrClientRegistered = false; }; template diff --git a/opencl/source/command_queue/command_queue_hw.h b/opencl/source/command_queue/command_queue_hw.h index 2352d21449..7c87720578 100644 --- a/opencl/source/command_queue/command_queue_hw.h +++ b/opencl/source/command_queue/command_queue_hw.h @@ -521,10 +521,5 @@ class CommandQueueHw : public CommandQueue { bool isBlitAuxTranslationRequired(const MultiDispatchInfo &multiDispatchInfo); bool relaxedOrderingForGpgpuAllowed(uint32_t numWaitEvents) const; - - void registerGpgpuCsrClient(); - void registerBcsCsrClient(CommandStreamReceiver &bcsCsr); - - bool gpgpuCsrClientRegistered = false; }; } // namespace NEO diff --git a/opencl/source/command_queue/command_queue_hw_base.inl b/opencl/source/command_queue/command_queue_hw_base.inl index e4c34224d1..74fa74a50e 100644 --- a/opencl/source/command_queue/command_queue_hw_base.inl +++ b/opencl/source/command_queue/command_queue_hw_base.inl @@ -245,37 +245,8 @@ void CommandQueueHw::setupEvent(EventBuilder &eventBuilder, cl_event *ou } } -template -void CommandQueueHw::registerGpgpuCsrClient() { - if (!gpgpuCsrClientRegistered) { - gpgpuCsrClientRegistered = true; - - getGpgpuCommandStreamReceiver().registerClient(); - } -} - -template -void CommandQueueHw::registerBcsCsrClient(CommandStreamReceiver &bcsCsr) { - auto engineType = bcsCsr.getOsContext().getEngineType(); - - auto &bcsState = bcsStates[EngineHelpers::getBcsIndex(engineType)]; - - if (!bcsState.csrClientRegistered) { - bcsState.csrClientRegistered = true; - bcsCsr.registerClient(); - } -} - template CommandQueueHw::~CommandQueueHw() { - if (gpgpuCsrClientRegistered) { - gpgpuEngine->commandStreamReceiver->unregisterClient(); - } - - for (auto &copyEngine : bcsStates) { - if (copyEngine.isValid() && copyEngine.csrClientRegistered) { - 
bcsEngines[EngineHelpers::getBcsIndex(copyEngine.engineType)]->commandStreamReceiver->unregisterClient(); - } - } + unregisterGpgpuAndBcsCsrClients(); } } // namespace NEO diff --git a/opencl/source/event/event.cpp b/opencl/source/event/event.cpp index 398933e8db..18cb83da5a 100644 --- a/opencl/source/event/event.cpp +++ b/opencl/source/event/event.cpp @@ -454,7 +454,7 @@ inline WaitStatus Event::wait(bool blocking, bool useQuickKmdSleep) { DEBUG_BREAK_IF(this->taskLevel == CompletionStamp::notReady && this->executionStatus >= 0); - cmdQueue->tryReleaseDeferredNodes(true); + cmdQueue->handlePostCompletionOperations(true); auto *allocationStorage = cmdQueue->getGpgpuCommandStreamReceiver().getInternalAllocationStorage(); allocationStorage->cleanAllocationList(this->taskCount, TEMPORARY_ALLOCATION); diff --git a/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp index 8eb23a8433..3601f5f305 100644 --- a/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp +++ b/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp @@ -1099,6 +1099,47 @@ HWTEST_F(CommandQueueHwTest, givenCommandQueueWhenDispatchingWorkThenRegisterCsr EXPECT_EQ(baseNumClients, csr.getNumClients()); } +HWTEST_F(CommandQueueHwTest, givenCsrClientWhenCallingSyncPointsThenUnregister) { + MockKernelWithInternals mockKernelWithInternals(*pClDevice); + auto mockKernel = mockKernelWithInternals.mockKernel; + + auto &csr = pDevice->getUltCommandStreamReceiver(); + + size_t gws = 1; + + auto baseNumClients = csr.getNumClients(); + + MockCommandQueueHw mockCmdQueueHw{context, pClDevice, nullptr}; + + EXPECT_EQ(CL_SUCCESS, mockCmdQueueHw.enqueueKernel(mockKernel, 1, nullptr, &gws, nullptr, 0, nullptr, nullptr)); + + EXPECT_EQ(baseNumClients + 1, csr.getNumClients()); + + mockCmdQueueHw.finish(); + + EXPECT_EQ(baseNumClients, csr.getNumClients()); // queue synchronized + + cl_event e0, e1; + + 
EXPECT_EQ(CL_SUCCESS, mockCmdQueueHw.enqueueKernel(mockKernel, 1, nullptr, &gws, nullptr, 0, nullptr, &e0)); + EXPECT_EQ(baseNumClients + 1, csr.getNumClients()); + *csr.tagAddress = mockCmdQueueHw.taskCount; + + EXPECT_EQ(CL_SUCCESS, mockCmdQueueHw.enqueueKernel(mockKernel, 1, nullptr, &gws, nullptr, 0, nullptr, &e1)); + EXPECT_EQ(baseNumClients + 1, csr.getNumClients()); + + clWaitForEvents(1, &e0); + EXPECT_EQ(baseNumClients + 1, csr.getNumClients()); // CSR task count < queue task count + + *csr.tagAddress = mockCmdQueueHw.taskCount; + + clWaitForEvents(1, &e0); + EXPECT_EQ(baseNumClients, csr.getNumClients()); // queue ready + + clReleaseEvent(e0); + clReleaseEvent(e1); +} + HWTEST_F(CommandQueueHwTest, givenKernelSplitEnqueueReadBufferWhenBlockedThenEnqueueSurfacesMakeResidentIsCalledOnce) { UserEvent userEvent(context); auto &csr = pDevice->getUltCommandStreamReceiver(); diff --git a/opencl/test/unit_test/command_stream/cl_tbx_command_stream_tests.cpp b/opencl/test/unit_test/command_stream/cl_tbx_command_stream_tests.cpp index 4e178c784b..268cab8cc9 100644 --- a/opencl/test/unit_test/command_stream/cl_tbx_command_stream_tests.cpp +++ b/opencl/test/unit_test/command_stream/cl_tbx_command_stream_tests.cpp @@ -39,7 +39,7 @@ HWTEST_F(ClTbxCommandStreamTests, givenTbxCsrWhenDispatchBlitEnqueueThenProcessC EngineControl engineControl0{&tbxCsr0, &osContext0}; MockOsContext osContext1(1, EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::Regular}, pDevice->getDeviceBitfield())); - tbxCsr1.setupContext(osContext0); + tbxCsr1.setupContext(osContext1); EngineControl engineControl1{&tbxCsr1, &osContext1}; MockCommandQueueHw cmdQ(&context, pClDevice, nullptr); @@ -47,6 +47,8 @@ HWTEST_F(ClTbxCommandStreamTests, givenTbxCsrWhenDispatchBlitEnqueueThenProcessC cmdQ.clearBcsEngines(); cmdQ.bcsEngines[0] = &engineControl1; + cmdQ.bcsStates[0] = {aub_stream::ENGINE_BCS, 0, false}; + cl_int error = CL_SUCCESS; std::unique_ptr 
buffer(Buffer::create(&context, 0, 1, nullptr, error)); diff --git a/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp b/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp index 04503dd261..c05b4e8e57 100644 --- a/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp +++ b/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp @@ -609,6 +609,8 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenAllBcsEnginesReadyWhenWaitingForEventThe } clReleaseEvent(event); + + commandQueue.reset(); } HWTEST_TEMPLATED_F(BcsBufferTests, givenMapAllocationWhenEnqueueingReadOrWriteBufferThenStoreMapAllocationInDispatchParameters) { diff --git a/opencl/test/unit_test/mocks/mock_command_queue.h b/opencl/test/unit_test/mocks/mock_command_queue.h index 52854706d2..05d75e977b 100644 --- a/opencl/test/unit_test/mocks/mock_command_queue.h +++ b/opencl/test/unit_test/mocks/mock_command_queue.h @@ -272,6 +272,7 @@ class MockCommandQueueHw : public CommandQueueHw { using BaseClass::relaxedOrderingForGpgpuAllowed; using BaseClass::requiresCacheFlushAfterWalker; using BaseClass::splitBarrierRequired; + using BaseClass::taskCount; using BaseClass::throttle; using BaseClass::timestampPacketContainer;