diff --git a/opencl/source/command_queue/command_queue.cpp b/opencl/source/command_queue/command_queue.cpp index f5b7a23e32..2d22b0c9de 100644 --- a/opencl/source/command_queue/command_queue.cpp +++ b/opencl/source/command_queue/command_queue.cpp @@ -1281,17 +1281,10 @@ WaitStatus CommandQueue::waitForAllEngines(bool blockedQueue, PrintfHandler *pri return WaitStatus::GpuHang; } - TimestampPacketContainer nodesToRelease; - if (deferredTimestampPackets) { - deferredTimestampPackets->swapNodes(nodesToRelease); - } - TimestampPacketContainer multiRootSyncNodesToRelease; - if (deferredMultiRootSyncNodes.get()) { - deferredMultiRootSyncNodes->swapNodes(multiRootSyncNodesToRelease); - } - waitStatus = waitUntilComplete(taskCount, activeBcsStates, flushStamp->peekStamp(), false, cleanTemporaryAllocationsList, waitedOnTimestamps); + releaseDeferredNodes(); + if (printfHandler) { if (!printfHandler->printEnqueueOutput()) { return WaitStatus::GpuHang; @@ -1378,4 +1371,15 @@ bool CommandQueue::migrateMultiGraphicsAllocationsIfRequired(const BuiltinOpPara return migrationHandled; } +void CommandQueue::releaseDeferredNodes() { + TimestampPacketContainer nodesToRelease; + if (deferredTimestampPackets) { + deferredTimestampPackets->swapNodes(nodesToRelease); + } + TimestampPacketContainer multiRootSyncNodesToRelease; + if (deferredMultiRootSyncNodes.get()) { + deferredMultiRootSyncNodes->swapNodes(multiRootSyncNodesToRelease); + } +} + } // namespace NEO diff --git a/opencl/source/command_queue/command_queue.h b/opencl/source/command_queue/command_queue.h index 798c91d122..f1a12b3cc4 100644 --- a/opencl/source/command_queue/command_queue.h +++ b/opencl/source/command_queue/command_queue.h @@ -374,6 +374,10 @@ class CommandQueue : public BaseObject<_cl_command_queue> { const std::array &peekActiveBcsStates() const { return bcsStates; } + void releaseDeferredNodes(); + + TaskCountType peekTaskCount() const { return taskCount; } + protected: void *enqueueReadMemObjForMap(TransferProperties &transferProperties, EventsRequest &eventsRequest, cl_int &errcodeRet); cl_int enqueueWriteMemObjForUnmap(MemObj *memObj, void *mappedPtr, EventsRequest &eventsRequest); diff --git a/opencl/source/command_queue/command_queue_hw_base.inl b/opencl/source/command_queue/command_queue_hw_base.inl index 050567ec47..e3f1f23a1d 100644 --- a/opencl/source/command_queue/command_queue_hw_base.inl +++ b/opencl/source/command_queue/command_queue_hw_base.inl @@ -171,11 +171,13 @@ bool CommandQueueHw::waitForTimestamps(Range copyEngine using TSPacketType = typename Family::TimestampPacketType; bool waited = false; + if (isOOQEnabled()) { + // TSP for OOQ dispatch is optional. We need to wait for task count. + return waited; + } + if (isWaitForTimestampsEnabled()) { waited = waitForTimestampsWithinContainer(mainContainer, getGpgpuCommandStreamReceiver(), status); - if (isOOQEnabled()) { - waitForTimestampsWithinContainer(deferredContainer, getGpgpuCommandStreamReceiver(), status); - } if (waited) { getGpgpuCommandStreamReceiver().downloadAllocations(); diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index d2a5c112c8..a1c875cc1e 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -211,7 +211,12 @@ cl_int CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, if (isCacheFlushCommand(commandType) || isMarkerWithPostSyncWrite || isNonStallingIoqBarrierWithDependencies) { nodesCount = 1; } else if (!multiDispatchInfo.empty()) { - nodesCount = estimateTimestampPacketNodesCount(multiDispatchInfo); + if (isOOQEnabled() && !event) { + // TSP not needed. Release current node. + timestampPacketContainer->moveNodesToNewContainer(*deferredTimestampPackets); + } else { + nodesCount = estimateTimestampPacketNodesCount(multiDispatchInfo); + } } if (isCacheFlushForBcsRequired() && enqueueWithBlitAuxTranslation) { diff --git a/opencl/source/command_queue/hardware_interface_bdw_and_later.inl b/opencl/source/command_queue/hardware_interface_bdw_and_later.inl index 5626b1bba5..9d55cb60c5 100644 --- a/opencl/source/command_queue/hardware_interface_bdw_and_later.inl +++ b/opencl/source/command_queue/hardware_interface_bdw_and_later.inl @@ -68,7 +68,8 @@ inline void HardwareInterface::programWalker( size_t numWorkGroups[3] = {walkerArgs.numberOfWorkgroups->x, walkerArgs.numberOfWorkgroups->y, walkerArgs.numberOfWorkgroups->z}; auto threadGroupCount = static_cast(walkerArgs.numberOfWorkgroups->x * walkerArgs.numberOfWorkgroups->y * walkerArgs.numberOfWorkgroups->z); - if (walkerArgs.currentTimestampPacketNodes && commandQueue.getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { + if (walkerArgs.currentTimestampPacketNodes && walkerArgs.currentTimestampPacketNodes->peekNodes().size() > 0 && + commandQueue.getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { auto timestampPacketNode = walkerArgs.currentTimestampPacketNodes->peekNodes().at(walkerArgs.currentDispatchIndex); GpgpuWalkerHelper::setupTimestampPacket(&commandStream, &walkerCmd, timestampPacketNode, rootDeviceEnvironment); } diff --git a/opencl/source/command_queue/hardware_interface_xehp_and_later.inl b/opencl/source/command_queue/hardware_interface_xehp_and_later.inl index 405745beb6..c4fafe20e2 100644 --- a/opencl/source/command_queue/hardware_interface_xehp_and_later.inl +++ b/opencl/source/command_queue/hardware_interface_xehp_and_later.inl @@ -79,7 +79,8 @@ inline void HardwareInterface::programWalker( auto &queueCsr = commandQueue.getGpgpuCommandStreamReceiver(); auto &rootDeviceEnvironment = commandQueue.getDevice().getRootDeviceEnvironment(); - if (walkerArgs.currentTimestampPacketNodes && queueCsr.peekTimestampPacketWriteEnabled()) { + + if (walkerArgs.currentTimestampPacketNodes && (walkerArgs.currentTimestampPacketNodes->peekNodes().size() > 0)) { auto timestampPacket = walkerArgs.currentTimestampPacketNodes->peekNodes().at(walkerArgs.currentDispatchIndex); GpgpuWalkerHelper::setupTimestampPacket(&commandStream, &walkerCmd, timestampPacket, rootDeviceEnvironment); } @@ -125,7 +126,8 @@ inline void HardwareInterface::programWalker( auto devices = queueCsr.getOsContext().getDeviceBitfield(); auto partitionWalker = ImplicitScalingHelper::isImplicitScalingEnabled(devices, true); - if (walkerArgs.currentTimestampPacketNodes && DebugManager.flags.PrintTimestampPacketUsage.get() == 1) { + if (walkerArgs.currentTimestampPacketNodes && walkerArgs.currentTimestampPacketNodes->peekNodes().size() > 0 && + DebugManager.flags.PrintTimestampPacketUsage.get() == 1) { auto gpuVa = walkerArgs.currentTimestampPacketNodes->peekNodes()[walkerArgs.currentDispatchIndex]->getGpuAddress(); printf("\nPID:%u, TSP used for Walker: 0x%" PRIX64 ", cmdBuffer pos: 0x%" PRIX64, SysCalls::getProcessId(), gpuVa, commandStream.getCurrentGpuAddressPosition()); } diff --git a/opencl/source/event/event.cpp b/opencl/source/event/event.cpp index b2a534b2f6..746062b8dd 100644 --- a/opencl/source/event/event.cpp +++ b/opencl/source/event/event.cpp @@ -451,6 +451,19 @@ inline WaitStatus Event::wait(bool blocking, bool useQuickKmdSleep) { DEBUG_BREAK_IF(this->taskLevel == CompletionStamp::notReady && this->executionStatus >= 0); + { + TakeOwnershipWrapper queueOwnership(*cmdQueue); + + bool releaseNodes = (taskCount == cmdQueue->peekTaskCount()); + if (bcsState.isValid()) { + releaseNodes &= (bcsState.taskCount == cmdQueue->peekBcsTaskCount(bcsState.engineType)); + } + + if (releaseNodes) { + cmdQueue->releaseDeferredNodes(); + } + } + auto *allocationStorage = cmdQueue->getGpgpuCommandStreamReceiver().getInternalAllocationStorage(); allocationStorage->cleanAllocationList(this->taskCount, TEMPORARY_ALLOCATION); diff --git a/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp b/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp index 2b76c9dc72..8bc18d1e6c 100644 --- a/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp +++ b/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp @@ -756,7 +756,7 @@ HWTEST_F(TimestampPacketTests, givenEnableTimestampWaitForQueuesWhenFinishThenWa EXPECT_EQ(csr.waitForCompletionWithTimeoutTaskCountCalled, 0u); } -HWTEST_F(TimestampPacketTests, givenOOQAndEnableTimestampWaitForQueuesWhenFinishThenWaitOnTimestamp) { +HWTEST_F(TimestampPacketTests, givenOOQAndEnableTimestampWaitForQueuesWhenFinishThenDontWaitOnTimestamp) { DebugManagerStateRestore restorer; DebugManager.flags.UpdateTaskCountFromWait.set(3); DebugManager.flags.EnableTimestampWaitForQueues.set(1); @@ -774,19 +774,83 @@ HWTEST_F(TimestampPacketTests, givenOOQAndEnableTimestampWaitForQueuesWhenFinish cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); cmdQ->flush(); - EXPECT_EQ(1u, deferredTimestampPackets->peekNodes().size()); - EXPECT_EQ(1u, timestampPacketContainer->peekNodes().size()); - ASSERT_GT(deferredTimestampPackets->peekNodes().size(), 0u); - ASSERT_GT(timestampPacketContainer->peekNodes().size(), 0u); - typename FamilyType::TimestampPacketType timestampData[] = {2, 2, 2, 2}; - for (uint32_t i = 0; i < deferredTimestampPackets->peekNodes()[0]->getPacketsUsed(); i++) { - deferredTimestampPackets->peekNodes()[0]->assignDataToAllTimestamps(i, timestampData); - timestampPacketContainer->peekNodes()[0]->assignDataToAllTimestamps(i, timestampData); - } + EXPECT_EQ(0u, deferredTimestampPackets->peekNodes().size()); + EXPECT_EQ(0u, timestampPacketContainer->peekNodes().size()); cmdQ->finish(); - EXPECT_EQ(csr.waitForCompletionWithTimeoutTaskCountCalled, 0u); + EXPECT_EQ(csr.waitForCompletionWithTimeoutTaskCountCalled, 1u); + + cmdQ.reset(); +} + +HWTEST_F(TimestampPacketTests, givenOOQAndWithoutEventWhenEnqueueCalledThenMoveCurrentNodeToDeferredContainer) { + DebugManagerStateRestore restorer; + DebugManager.flags.UpdateTaskCountFromWait.set(3); + DebugManager.flags.EnableTimestampWaitForQueues.set(1); + + auto &csr = device->getUltCommandStreamReceiver(); + csr.timestampPacketWriteEnabled = true; + csr.callBaseWaitForCompletionWithTimeout = false; + cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, 0}; + auto cmdQ = std::make_unique>(context, device.get(), props); + + TimestampPacketContainer *deferredTimestampPackets = cmdQ->deferredTimestampPackets.get(); + TimestampPacketContainer *timestampPacketContainer = cmdQ->timestampPacketContainer.get(); + + cl_event event; + + cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, &event); + + EXPECT_EQ(0u, deferredTimestampPackets->peekNodes().size()); + EXPECT_EQ(1u, timestampPacketContainer->peekNodes().size()); + + cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); + cmdQ->flush(); + + EXPECT_EQ(1u, deferredTimestampPackets->peekNodes().size()); + EXPECT_EQ(0u, timestampPacketContainer->peekNodes().size()); + + cmdQ->finish(); + + clReleaseEvent(event); + + cmdQ.reset(); +} + +HWTEST_F(TimestampPacketTests, givenEventWithLatestTaskCountWhenWaitCalledThenClearDeferredNodes) { + DebugManagerStateRestore restorer; + DebugManager.flags.UpdateTaskCountFromWait.set(3); + DebugManager.flags.EnableTimestampWaitForQueues.set(1); + + auto &csr = device->getUltCommandStreamReceiver(); + csr.timestampPacketWriteEnabled = true; + csr.callBaseWaitForCompletionWithTimeout = false; + cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, 0}; + auto cmdQ = std::make_unique>(context, device.get(), props); + + TimestampPacketContainer *deferredTimestampPackets = cmdQ->deferredTimestampPackets.get(); + TimestampPacketContainer *timestampPacketContainer = cmdQ->timestampPacketContainer.get(); + + cl_event event1, event2; + + cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, &event1); + cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, &event2); + cmdQ->flush(); + + EXPECT_EQ(1u, deferredTimestampPackets->peekNodes().size()); + EXPECT_EQ(1u, timestampPacketContainer->peekNodes().size()); + + castToObjectOrAbort(event1)->wait(false, false); + EXPECT_EQ(1u, deferredTimestampPackets->peekNodes().size()); + EXPECT_EQ(1u, timestampPacketContainer->peekNodes().size()); + + castToObjectOrAbort(event2)->wait(false, false); + EXPECT_EQ(0u, deferredTimestampPackets->peekNodes().size()); + EXPECT_EQ(1u, timestampPacketContainer->peekNodes().size()); + + clReleaseEvent(event1); + clReleaseEvent(event2); cmdQ.reset(); } @@ -815,30 +879,28 @@ HWTEST_F(TimestampPacketTests, givenEnableTimestampWaitForQueuesWhenFinishThenCa cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); cmdQ->flush(); - EXPECT_EQ(1u, deferredTimestampPackets->peekNodes().size()); - EXPECT_EQ(1u, timestampPacketContainer->peekNodes().size()); + EXPECT_EQ(0u, deferredTimestampPackets->peekNodes().size()); + EXPECT_EQ(0u, timestampPacketContainer->peekNodes().size()); VariableBackup backupPauseAddress(&CpuIntrinsicsTests::pauseAddress); VariableBackup backupPauseValue(&CpuIntrinsicsTests::pauseValue); VariableBackup backupPauseOffset(&CpuIntrinsicsTests::pauseOffset); VariableBackup> backupSetupPauseAddress(&CpuIntrinsicsTests::setupPauseAddress); - ASSERT_GT(deferredTimestampPackets->peekNodes().size(), 0u); - ASSERT_GT(timestampPacketContainer->peekNodes().size(), 0u); - deferredTimestampPackets->peekNodes()[0]->setPacketsUsed(1u); - timestampPacketContainer->peekNodes()[0]->setPacketsUsed(1u); + auto &csr = cmdQ->getGpgpuCommandStreamReceiver(); + *csr.getTagAddress() = 0; - CpuIntrinsicsTests::pauseAddress = reinterpret_cast(const_cast(timestampPacketContainer->peekNodes()[0]->getContextEndAddress(0u))); - CpuIntrinsicsTests::pauseValue = 2u; + CpuIntrinsicsTests::pauseAddress = csr.getTagAddress(); + CpuIntrinsicsTests::pauseValue = 3u; CpuIntrinsicsTests::setupPauseAddress = [&]() { - CpuIntrinsicsTests::pauseAddress = reinterpret_cast(const_cast(deferredTimestampPackets->peekNodes()[0]->getContextEndAddress(0u))); + CpuIntrinsicsTests::pauseAddress = csr.getTagAddress(); }; CpuIntrinsicsTests::pauseCounter = 0u; EXPECT_FALSE(device->getUltCommandStreamReceiver().downloadAllocationCalled); cmdQ->finish(); - EXPECT_EQ(2u, CpuIntrinsicsTests::pauseCounter); + EXPECT_EQ(1u, CpuIntrinsicsTests::pauseCounter); EXPECT_TRUE(device->getUltCommandStreamReceiver().downloadAllocationCalled); cmdQ.reset(); @@ -852,12 +914,16 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenEnqueueingToO TimestampPacketContainer *deferredTimestampPackets = cmdQ->deferredTimestampPackets.get(); - cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); + cl_event event; + + cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, &event); EXPECT_EQ(0u, deferredTimestampPackets->peekNodes().size()); cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); EXPECT_EQ(1u, deferredTimestampPackets->peekNodes().size()); + + clReleaseEvent(event); } HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenEnqueueingBlockingThenTrackOwnershipUntilQueueIsCompleted) { diff --git a/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp b/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp index cce563dae7..c7726d675b 100644 --- a/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp +++ b/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp @@ -480,6 +480,62 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenBlockedBlitEnqueueWhenUnblockingThenMake EXPECT_TRUE(bcsCsr->isMadeResident(eventDependency->getBaseGraphicsAllocation()->getDefaultGraphicsAllocation(), bcsCsr->taskCount)); } +HWTEST_TEMPLATED_F(BcsBufferTests, givenEventWithLatestTaskCountWhenWaitCalledThenClearDeferredNodes) { + auto mockCmdQ = static_cast *>(commandQueue.get()); + + auto bufferForBlt = clUniquePtr(Buffer::create(bcsMockContext.get(), CL_MEM_READ_WRITE, 1, nullptr, retVal)); + bufferForBlt->forceDisallowCPUCopy = true; + + TimestampPacketContainer *deferredTimestampPackets = mockCmdQ->deferredTimestampPackets.get(); + TimestampPacketContainer *timestampPacketContainer = mockCmdQ->timestampPacketContainer.get(); + + cl_event event1, event2; + + mockCmdQ->enqueueReadBuffer(bufferForBlt.get(), CL_FALSE, 0, 1, &hostPtr, nullptr, 0, nullptr, &event1); + mockCmdQ->enqueueReadBuffer(bufferForBlt.get(), CL_FALSE, 0, 1, &hostPtr, nullptr, 0, nullptr, &event2); + + mockCmdQ->taskCount++; + + auto event1Obj = castToObjectOrAbort(event1); + auto event2Obj = castToObjectOrAbort(event2); + + size_t expectedSize = 1; + if (mockCmdQ->isCacheFlushForBcsRequired()) { + expectedSize += 2; + } + + EXPECT_EQ(expectedSize, deferredTimestampPackets->peekNodes().size()); + EXPECT_EQ(1u, timestampPacketContainer->peekNodes().size()); + + // gpgpu task count not equal + { + event1Obj->wait(false, false); + EXPECT_EQ(expectedSize, deferredTimestampPackets->peekNodes().size()); + EXPECT_EQ(1u, timestampPacketContainer->peekNodes().size()); + + event2Obj->wait(false, false); + EXPECT_EQ(expectedSize, deferredTimestampPackets->peekNodes().size()); + EXPECT_EQ(1u, timestampPacketContainer->peekNodes().size()); + } + + event1Obj->updateTaskCount(mockCmdQ->taskCount, event1Obj->peekBcsTaskCountFromCommandQueue() - 1); + event2Obj->updateTaskCount(mockCmdQ->taskCount, event1Obj->peekBcsTaskCountFromCommandQueue()); + + // gpgpu and bcs task count equal + { + event1Obj->wait(false, false); + EXPECT_EQ(expectedSize, deferredTimestampPackets->peekNodes().size()); + EXPECT_EQ(1u, timestampPacketContainer->peekNodes().size()); + + event2Obj->wait(false, false); + EXPECT_EQ(0u, deferredTimestampPackets->peekNodes().size()); + EXPECT_EQ(1u, timestampPacketContainer->peekNodes().size()); + } + + clReleaseEvent(event1); + clReleaseEvent(event2); +} + HWTEST_TEMPLATED_F(BcsBufferTests, givenMapAllocationWhenEnqueueingReadOrWriteBufferThenStoreMapAllocationInDispatchParameters) { DebugManager.flags.DisableZeroCopyForBuffers.set(true); auto mockCmdQ = static_cast *>(commandQueue.get());