diff --git a/opencl/source/command_queue/command_queue.cpp b/opencl/source/command_queue/command_queue.cpp index d73d28fee5..356bee337a 100644 --- a/opencl/source/command_queue/command_queue.cpp +++ b/opencl/source/command_queue/command_queue.cpp @@ -1033,13 +1033,6 @@ WaitStatus CommandQueue::waitForAllEngines(bool blockedQueue, PrintfHandler *pri } } - auto waitedOnTimestamps = waitForTimestamps(taskCount); - - TimestampPacketContainer nodesToRelease; - if (deferredTimestampPackets) { - deferredTimestampPackets->swapNodes(nodesToRelease); - } - StackVec activeBcsStates{}; for (CopyEngineState &state : this->bcsStates) { if (state.isValid()) { @@ -1047,6 +1040,13 @@ WaitStatus CommandQueue::waitForAllEngines(bool blockedQueue, PrintfHandler *pri } } + auto waitedOnTimestamps = waitForTimestamps(activeBcsStates, taskCount); + + TimestampPacketContainer nodesToRelease; + if (deferredTimestampPackets) { + deferredTimestampPackets->swapNodes(nodesToRelease); + } + const auto waitStatus = waitUntilComplete(taskCount, activeBcsStates, flushStamp->peekStamp(), false, cleanTemporaryAllocationsList, waitedOnTimestamps); if (printfHandler) { diff --git a/opencl/source/command_queue/command_queue.h b/opencl/source/command_queue/command_queue.h index 84115d5760..d47cf3e255 100644 --- a/opencl/source/command_queue/command_queue.h +++ b/opencl/source/command_queue/command_queue.h @@ -205,7 +205,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> { bool isCompleted(uint32_t gpgpuTaskCount, CopyEngineState bcsState) const; bool isWaitForTimestampsEnabled() const; - virtual bool waitForTimestamps(uint32_t taskCount) = 0; + virtual bool waitForTimestamps(Range copyEnginesToWait, uint32_t taskCount) = 0; MOCKABLE_VIRTUAL bool isQueueBlocked(); diff --git a/opencl/source/command_queue/command_queue_hw.h b/opencl/source/command_queue/command_queue_hw.h index 92dc5f397e..ec46f998fb 100644 --- a/opencl/source/command_queue/command_queue_hw.h +++ 
b/opencl/source/command_queue/command_queue_hw.h @@ -446,7 +446,7 @@ class CommandQueueHw : public CommandQueue { bool isCacheFlushCommand(uint32_t commandType) const override; - bool waitForTimestamps(uint32_t taskCount) override; + bool waitForTimestamps(Range copyEnginesToWait, uint32_t taskCount) override; MOCKABLE_VIRTUAL bool isCacheFlushForBcsRequired() const; diff --git a/opencl/source/command_queue/command_queue_hw_base.inl b/opencl/source/command_queue/command_queue_hw_base.inl index 38e3119df5..5386b0e646 100644 --- a/opencl/source/command_queue/command_queue_hw_base.inl +++ b/opencl/source/command_queue/command_queue_hw_base.inl @@ -160,7 +160,7 @@ inline bool waitForTimestampsWithinContainer(TimestampPacketContainer *container } template -bool CommandQueueHw::waitForTimestamps(uint32_t taskCount) { +bool CommandQueueHw::waitForTimestamps(Range copyEnginesToWait, uint32_t taskCount) { using TSPacketType = typename Family::TimestampPacketType; bool waited = false; @@ -169,6 +169,14 @@ bool CommandQueueHw::waitForTimestamps(uint32_t taskCount) { if (isOOQEnabled()) { waitForTimestampsWithinContainer(deferredTimestampPackets.get(), getGpgpuCommandStreamReceiver()); } + + if (waited) { + getGpgpuCommandStreamReceiver().downloadAllocations(); + for (const auto &copyEngine : copyEnginesToWait) { + auto bcsCsr = getBcsCommandStreamReceiver(copyEngine.engineType); + bcsCsr->downloadAllocations(); + } + } } return waited; diff --git a/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp index e6f6ae1147..7e5b38854a 100644 --- a/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp +++ b/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp @@ -49,7 +49,7 @@ HWTEST_F(CommandQueueHwTest, givenNoTimestampPacketsWhenWaitForTimestampsThenNoW MockCommandQueueHw cmdQ(context, device.get(), nullptr); auto taskCount = 
device->getUltCommandStreamReceiver().peekLatestFlushedTaskCount(); - cmdQ.waitForTimestamps(101u); + cmdQ.waitForTimestamps({}, 101u); EXPECT_EQ(device->getUltCommandStreamReceiver().peekLatestFlushedTaskCount(), taskCount); } diff --git a/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp b/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp index 89fb12d0a8..35a85130fd 100644 --- a/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp +++ b/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp @@ -833,11 +833,13 @@ HWTEST_F(TimestampPacketTests, givenTimestampWaitEnabledWhenEnqueueWithEventThen EXPECT_TRUE(event2.isCompleted()); EXPECT_EQ(csr.waitForCompletionWithTimeoutTaskCountCalled, 0u); EXPECT_TRUE(csr.downloadAllocationCalled); + EXPECT_TRUE(csr.downloadAllocationsCalled); for (CopyEngineState &state : cmdQ->bcsStates) { if (state.isValid()) { auto bcsCsr = static_cast *>(cmdQ->getBcsCommandStreamReceiver(state.engineType)); EXPECT_EQ(bcsCsr->waitForCompletionWithTimeoutTaskCountCalled, 0u); + EXPECT_TRUE(csr.downloadAllocationsCalled); } } diff --git a/opencl/test/unit_test/mocks/mock_command_queue.h b/opencl/test/unit_test/mocks/mock_command_queue.h index f3d21ae2f4..40ef5362d2 100644 --- a/opencl/test/unit_test/mocks/mock_command_queue.h +++ b/opencl/test/unit_test/mocks/mock_command_queue.h @@ -212,7 +212,7 @@ class MockCommandQueue : public CommandQueue { bool obtainTimestampPacketForCacheFlush(bool isCacheFlushRequired) const override { return isCacheFlushRequired; } - bool waitForTimestamps(uint32_t taskCount) override { return false; }; + bool waitForTimestamps(Range copyEnginesToWait, uint32_t taskCount) override { return false; }; bool releaseIndirectHeapCalled = false; diff --git a/shared/source/command_stream/command_stream_receiver.cpp b/shared/source/command_stream/command_stream_receiver.cpp index 0b0e2afc4e..fc9bd2b109 100644 --- a/shared/source/command_stream/command_stream_receiver.cpp +++ 
b/shared/source/command_stream/command_stream_receiver.cpp @@ -344,6 +344,8 @@ WaitStatus CommandStreamReceiver::baseWaitFunction(volatile uint32_t *pollAddres lastHangCheckTime = waitStartTime; for (uint32_t i = 0; i < activePartitions; i++) { while (*partitionAddress < taskCountToWait && timeDiff <= params.waitTimeout) { + this->downloadTagAllocation(taskCountToWait); + if (!params.indefinitelyPoll && WaitUtils::waitFunction(partitionAddress, taskCountToWait)) { break; } diff --git a/shared/test/common/libult/ult_command_stream_receiver.h b/shared/test/common/libult/ult_command_stream_receiver.h index 8e97cbc51b..607e7a04f0 100644 --- a/shared/test/common/libult/ult_command_stream_receiver.h +++ b/shared/test/common/libult/ult_command_stream_receiver.h @@ -175,6 +175,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw, publ void downloadAllocations() override { downloadAllocationCalled = true; + downloadAllocationsCalled = true; } void downloadAllocationUlt(GraphicsAllocation &gfxAllocation) { @@ -355,6 +356,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw, publ bool checkAndActivateAubSubCaptureCalled = false; bool addAubCommentCalled = false; std::atomic_bool downloadAllocationCalled = false; + std::atomic_bool downloadAllocationsCalled = false; bool flushBatchedSubmissionsCalled = false; bool initProgrammingFlagsCalled = false; bool multiOsContextCapable = false; diff --git a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp index 66389ffe16..2e95ecd9d4 100644 --- a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp +++ b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp @@ -308,7 +308,7 @@ HWTEST_F(CommandStreamReceiverTest, givenGpuHangWhenWaititingForTaskCountThenGpu constexpr auto taskCountToWait = 1; const auto waitStatus = csr.waitForTaskCount(taskCountToWait); 
EXPECT_EQ(WaitStatus::GpuHang, waitStatus); - EXPECT_FALSE(csr.downloadAllocationCalled); + EXPECT_TRUE(csr.downloadAllocationCalled); } HWTEST_F(CommandStreamReceiverTest, whenDownloadTagAllocationThenDonwloadOnlyIfTagAllocationWasFlushed) {