From 1f0c58d0bfa78d6fbdeeccc68c4d5e39b7ce36a1 Mon Sep 17 00:00:00 2001 From: Lukasz Jobczyk Date: Thu, 9 Dec 2021 11:59:52 +0000 Subject: [PATCH] Refactor timestamp wait mechanism Signed-off-by: Lukasz Jobczyk --- opencl/source/command_queue/command_queue.cpp | 41 +++++++++++-------- opencl/source/command_queue/command_queue.h | 9 ++-- .../source/command_queue/command_queue_hw.h | 2 +- .../command_queue/command_queue_hw_base.inl | 25 +++++------ opencl/source/dll/CMakeLists.txt | 3 +- ...ue_linux_dll.cpp => command_queue_dll.cpp} | 6 +++ .../dll/windows/command_queue_windows_dll.cpp | 20 --------- .../command_queue/command_queue_hw_tests.cpp | 15 +++++++ .../command_queue/command_queue_tests.cpp | 16 ++++---- .../command_queue/enqueue_handler_tests.cpp | 4 +- .../command_queue/enqueue_kernel_2_tests.cpp | 4 +- .../helpers/timestamp_packet_1_tests.cpp | 34 +++++++-------- .../unit_test/libult/command_queue_ult.cpp | 8 ++++ opencl/test/unit_test/linux/CMakeLists.txt | 2 +- .../test/unit_test/linux/main_linux_dll.cpp | 6 ++- .../test/unit_test/mocks/mock_command_queue.h | 10 ++--- .../command_stream_receiver.cpp | 10 ----- .../command_stream/command_stream_receiver.h | 2 - shared/source/helpers/hw_helper.h | 3 ++ .../helpers/hw_helper_bdw_and_later.inl | 5 +++ .../helpers/hw_helper_pvc_and_later.inl | 5 +++ .../helpers/hw_helper_xehp_and_later.inl | 5 +++ shared/test/common/helpers/ult_hw_config.h | 1 + .../unit_test/base_ult_config_listener.cpp | 2 +- 24 files changed, 131 insertions(+), 107 deletions(-) rename opencl/source/dll/{linux/command_queue_linux_dll.cpp => command_queue_dll.cpp} (87%) delete mode 100644 opencl/source/dll/windows/command_queue_windows_dll.cpp diff --git a/opencl/source/command_queue/command_queue.cpp b/opencl/source/command_queue/command_queue.cpp index be9f5564d4..3935fe6d34 100644 --- a/opencl/source/command_queue/command_queue.cpp +++ b/opencl/source/command_queue/command_queue.cpp @@ -230,28 +230,30 @@ bool 
CommandQueue::isCompleted(uint32_t gpgpuTaskCount, CopyEngineState bcsState return false; } -void CommandQueue::waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool cleanTemporaryAllocationList) { +void CommandQueue::waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool cleanTemporaryAllocationList, bool skipWait) { WAIT_ENTER() DBG_LOG(LogTaskCounts, __FUNCTION__, "Waiting for taskCount:", gpgpuTaskCountToWait); DBG_LOG(LogTaskCounts, __FUNCTION__, "Line: ", __LINE__, "Current taskCount:", getHwTag()); - bool forcePowerSavingMode = this->throttle == QueueThrottle::LOW; + if (!skipWait) { + bool forcePowerSavingMode = this->throttle == QueueThrottle::LOW; - getGpgpuCommandStreamReceiver().waitForTaskCountWithKmdNotifyFallback(gpgpuTaskCountToWait, - flushStampToWait, - useQuickKmdSleep, - forcePowerSavingMode); - DEBUG_BREAK_IF(getHwTag() < gpgpuTaskCountToWait); + getGpgpuCommandStreamReceiver().waitForTaskCountWithKmdNotifyFallback(gpgpuTaskCountToWait, + flushStampToWait, + useQuickKmdSleep, + forcePowerSavingMode); + DEBUG_BREAK_IF(getHwTag() < gpgpuTaskCountToWait); - if (gtpinIsGTPinInitialized()) { - gtpinNotifyTaskCompletion(gpgpuTaskCountToWait); - } + if (gtpinIsGTPinInitialized()) { + gtpinNotifyTaskCompletion(gpgpuTaskCountToWait); + } - for (const CopyEngineState &copyEngine : copyEnginesToWait) { - auto bcsCsr = getBcsCommandStreamReceiver(copyEngine.engineType); - bcsCsr->waitForTaskCountWithKmdNotifyFallback(copyEngine.taskCount, 0, false, false); - bcsCsr->waitForTaskCountAndCleanTemporaryAllocationList(copyEngine.taskCount); + for (const CopyEngineState &copyEngine : copyEnginesToWait) { + auto bcsCsr = getBcsCommandStreamReceiver(copyEngine.engineType); + bcsCsr->waitForTaskCountWithKmdNotifyFallback(copyEngine.taskCount, 0, false, false); +
bcsCsr->waitForTaskCountAndCleanTemporaryAllocationList(copyEngine.taskCount); + } } if (cleanTemporaryAllocationList) { @@ -957,8 +959,10 @@ void CommandQueue::aubCaptureHook(bool &blocking, bool &clearAllDependencies, co } } -bool CommandQueue::isTimestampWaitEnabled() { - auto enabled = false; +bool CommandQueue::isWaitForTimestampsEnabled() { + auto &hwHelper = HwHelper::get(getDevice().getHardwareInfo().platform.eRenderCoreFamily); + auto enabled = CommandQueue::isTimestampWaitEnabled(); + enabled &= hwHelper.isTimestampWaitSupported(); switch (DebugManager.flags.EnableTimestampWait.get()) { case 0: @@ -987,9 +991,10 @@ void CommandQueue::waitForAllEngines(bool blockedQueue, PrintfHandler *printfHan } } + auto waitedOnTimestamps = waitForTimestamps(taskCount); + TimestampPacketContainer nodesToRelease; if (deferredTimestampPackets) { - waitForTimestamps(taskCount); deferredTimestampPackets->swapNodes(nodesToRelease); } @@ -999,7 +1004,7 @@ void CommandQueue::waitForAllEngines(bool blockedQueue, PrintfHandler *printfHan activeBcsStates.push_back(state); } } - waitUntilComplete(taskCount, activeBcsStates, flushStamp->peekStamp(), false, cleanTemporaryAllocationsList); + waitUntilComplete(taskCount, activeBcsStates, flushStamp->peekStamp(), false, cleanTemporaryAllocationsList, waitedOnTimestamps); if (printfHandler) { printfHandler->printEnqueueOutput(); diff --git a/opencl/source/command_queue/command_queue.h b/opencl/source/command_queue/command_queue.h index 3b0dc7acd1..5572ad3a7f 100644 --- a/opencl/source/command_queue/command_queue.h +++ b/opencl/source/command_queue/command_queue.h @@ -204,14 +204,14 @@ class CommandQueue : public BaseObject<_cl_command_queue> { bool isCompleted(uint32_t gpgpuTaskCount, CopyEngineState bcsState) const; - bool isTimestampWaitEnabled(); - virtual void waitForTimestamps(uint32_t taskCount) = 0; + bool isWaitForTimestampsEnabled(); + virtual bool waitForTimestamps(uint32_t taskCount) = 0; MOCKABLE_VIRTUAL bool 
isQueueBlocked(); - MOCKABLE_VIRTUAL void waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool cleanTemporaryAllocationList); + MOCKABLE_VIRTUAL void waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool cleanTemporaryAllocationList, bool skipWait); MOCKABLE_VIRTUAL void waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep) { - this->waitUntilComplete(gpgpuTaskCountToWait, copyEnginesToWait, flushStampToWait, useQuickKmdSleep, true); + this->waitUntilComplete(gpgpuTaskCountToWait, copyEnginesToWait, flushStampToWait, useQuickKmdSleep, true, false); } MOCKABLE_VIRTUAL void waitForAllEngines(bool blockedQueue, PrintfHandler *printfHandler, bool cleanTemporaryAllocationsList); MOCKABLE_VIRTUAL void waitForAllEngines(bool blockedQueue, PrintfHandler *printfHandler) { @@ -240,6 +240,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> { size_t minRequiredSize, IndirectHeap *&indirectHeap); static bool isAssignEngineRoundRobinEnabled(); + static bool isTimestampWaitEnabled(); MOCKABLE_VIRTUAL void releaseIndirectHeap(IndirectHeap::Type heapType); diff --git a/opencl/source/command_queue/command_queue_hw.h b/opencl/source/command_queue/command_queue_hw.h index 5147757359..43d3eafc0b 100644 --- a/opencl/source/command_queue/command_queue_hw.h +++ b/opencl/source/command_queue/command_queue_hw.h @@ -446,7 +446,7 @@ class CommandQueueHw : public CommandQueue { bool isCacheFlushCommand(uint32_t commandType) const override; - void waitForTimestamps(uint32_t taskCount) override; + bool waitForTimestamps(uint32_t taskCount) override; MOCKABLE_VIRTUAL bool isCacheFlushForBcsRequired() const; diff --git a/opencl/source/command_queue/command_queue_hw_base.inl b/opencl/source/command_queue/command_queue_hw_base.inl index b7ed0de3ea..a1345d2655 100644 
--- a/opencl/source/command_queue/command_queue_hw_base.inl +++ b/opencl/source/command_queue/command_queue_hw_base.inl @@ -139,12 +139,14 @@ template inline bool waitForTimestampsWithinContainer(TimestampPacketContainer *container) { bool waited = false; - for (const auto &timestamp : container->peekNodes()) { - for (uint32_t i = 0; i < timestamp->getPacketsUsed(); i++) { - while (timestamp->getContextEndValue(i) == 1) { - WaitUtils::waitFunctionWithPredicate(static_cast(timestamp->getContextEndAddress(i)), 1u, std::not_equal_to()); + if (container) { + for (const auto &timestamp : container->peekNodes()) { + for (uint32_t i = 0; i < timestamp->getPacketsUsed(); i++) { + while (timestamp->getContextEndValue(i) == 1) { + WaitUtils::waitFunctionWithPredicate(static_cast(timestamp->getContextEndAddress(i)), 1u, std::not_equal_to()); + } + waited = true; } - waited = true; } } @@ -152,20 +154,19 @@ inline bool waitForTimestampsWithinContainer(TimestampPacketContainer *container } template -void CommandQueueHw::waitForTimestamps(uint32_t taskCount) { +bool CommandQueueHw::waitForTimestamps(uint32_t taskCount) { using TSPacketType = typename Family::TimestampPacketType; + bool waited = false; - if (isTimestampWaitEnabled()) { - bool waited = waitForTimestampsWithinContainer(timestampPacketContainer.get()); + if (isWaitForTimestampsEnabled()) { + waited = waitForTimestampsWithinContainer(timestampPacketContainer.get()); if (isOOQEnabled()) { waited |= waitForTimestampsWithinContainer(deferredTimestampPackets.get()); } - - if (waited) { - getGpgpuCommandStreamReceiver().updateTagFromCpu(taskCount); - } } + + return waited; } template diff --git a/opencl/source/dll/CMakeLists.txt b/opencl/source/dll/CMakeLists.txt index 8d0a5b133c..e8dc8eb89a 100644 --- a/opencl/source/dll/CMakeLists.txt +++ b/opencl/source/dll/CMakeLists.txt @@ -12,6 +12,7 @@ endif() set(RUNTIME_SRCS_DLL_BASE ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt + ${CMAKE_CURRENT_SOURCE_DIR}/command_queue_dll.cpp
${NEO_SHARED_DIRECTORY}/dll/create_deferred_deleter.cpp ${NEO_SHARED_DIRECTORY}/dll/create_memory_manager_${DRIVER_MODEL}.cpp ${NEO_SHARED_DIRECTORY}/dll/create_tbx_sockets.cpp @@ -42,7 +43,6 @@ set(RUNTIME_SRCS_DLL_BASE append_sources_from_properties(RUNTIME_SRCS_DLL_BASE NEO_CORE_SRCS_LINK) set(RUNTIME_SRCS_DLL_LINUX - ${CMAKE_CURRENT_SOURCE_DIR}/linux/command_queue_linux_dll.cpp ${NEO_SHARED_DIRECTORY}/dll/linux/drm_neo_create.cpp ${NEO_SHARED_DIRECTORY}/dll/linux/options_linux.cpp ${NEO_SHARED_DIRECTORY}/dll/linux/os_interface.cpp @@ -54,7 +54,6 @@ set(RUNTIME_SRCS_DLL_LINUX ) set(RUNTIME_SRCS_DLL_WINDOWS - ${CMAKE_CURRENT_SOURCE_DIR}/windows/command_queue_windows_dll.cpp ${NEO_SHARED_DIRECTORY}/dll/windows/options_windows.cpp ${NEO_SHARED_DIRECTORY}/dll/windows/os_interface.cpp ${NEO_SHARED_DIRECTORY}/dll/windows/environment_variables.cpp diff --git a/opencl/source/dll/linux/command_queue_linux_dll.cpp b/opencl/source/dll/command_queue_dll.cpp similarity index 87% rename from opencl/source/dll/linux/command_queue_linux_dll.cpp rename to opencl/source/dll/command_queue_dll.cpp index 3819fd80ed..3efcc13a85 100644 --- a/opencl/source/dll/linux/command_queue_linux_dll.cpp +++ b/opencl/source/dll/command_queue_dll.cpp @@ -8,6 +8,7 @@ #include "opencl/source/command_queue/command_queue.h" namespace NEO { + bool CommandQueue::isAssignEngineRoundRobinEnabled() { auto assignEngineRoundRobin = false; @@ -17,4 +18,9 @@ bool CommandQueue::isAssignEngineRoundRobinEnabled() { return assignEngineRoundRobin; } + +bool CommandQueue::isTimestampWaitEnabled() { + return false; +} + } // namespace NEO \ No newline at end of file diff --git a/opencl/source/dll/windows/command_queue_windows_dll.cpp b/opencl/source/dll/windows/command_queue_windows_dll.cpp deleted file mode 100644 index 3819fd80ed..0000000000 --- a/opencl/source/dll/windows/command_queue_windows_dll.cpp +++ /dev/null @@ -1,20 +0,0 @@ -/* - * Copyright (C) 2020-2021 Intel Corporation - * - * SPDX-License-Identifier: 
MIT - * - */ - -#include "opencl/source/command_queue/command_queue.h" - -namespace NEO { -bool CommandQueue::isAssignEngineRoundRobinEnabled() { - auto assignEngineRoundRobin = false; - - if (DebugManager.flags.EnableCmdQRoundRobindEngineAssign.get() != -1) { - assignEngineRoundRobin = DebugManager.flags.EnableCmdQRoundRobindEngineAssign.get(); - } - - return assignEngineRoundRobin; -} -} // namespace NEO \ No newline at end of file diff --git a/opencl/test/unit_test/command_queue/command_queue_hw_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_hw_tests.cpp index 9136149c63..c27092a7fe 100644 --- a/opencl/test/unit_test/command_queue/command_queue_hw_tests.cpp +++ b/opencl/test/unit_test/command_queue/command_queue_hw_tests.cpp @@ -100,6 +100,21 @@ HWTEST_F(CommandQueueHwTest, WhenConstructingTwoCommandQueuesThenOnlyOneDebugSur EXPECT_EQ(dbgSurface, device->getGpgpuCommandStreamReceiver().getDebugSurfaceAllocation()); } +HWTEST_F(CommandQueueHwTest, givenNoTimestampPacketsWhenWaitForTimestampsThenNoWaitAndTagIsNotUpdated) { + DebugManagerStateRestore restorer; + DebugManager.flags.EnableTimestampPacket.set(0); + DebugManager.flags.EnableTimestampWait.set(4); + ExecutionEnvironment *executionEnvironment = platform()->peekExecutionEnvironment(); + auto device = std::make_unique(MockDevice::create(executionEnvironment, 0u)); + device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = false; + MockCommandQueueHw cmdQ(context, device.get(), nullptr); + auto taskCount = device->getUltCommandStreamReceiver().peekLatestFlushedTaskCount(); + + cmdQ.waitForTimestamps(101u); + + EXPECT_EQ(device->getUltCommandStreamReceiver().peekLatestFlushedTaskCount(), taskCount); +} + HWTEST_F(CommandQueueHwTest, WhenDebugSurfaceIsAllocatedThenBufferIsZeroed) { ExecutionEnvironment *executionEnvironment = platform()->peekExecutionEnvironment(); executionEnvironment->rootDeviceEnvironments[0]->debugger.reset(new MockActiveSourceLevelDebugger(new MockOsLibrary)); 
diff --git a/opencl/test/unit_test/command_queue/command_queue_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_tests.cpp index 5aec6c82fa..97267a75fd 100644 --- a/opencl/test/unit_test/command_queue/command_queue_tests.cpp +++ b/opencl/test/unit_test/command_queue/command_queue_tests.cpp @@ -140,32 +140,34 @@ TEST(CommandQueue, WhenConstructingCommandQueueThenQueueFamilyIsNotSelected) { TEST(CommandQueue, givenEnableTimestampWaitWhenCheckIsTimestampWaitEnabledThenReturnProperValue) { DebugManagerStateRestore restorer; + VariableBackup backup(&ultHwConfig); + ultHwConfig.useWaitForTimestamps = true; auto mockDevice = std::make_unique(MockDevice::createWithNewExecutionEnvironment(defaultHwInfo.get())); MockCommandQueue cmdQ(nullptr, mockDevice.get(), 0, false); { DebugManager.flags.EnableTimestampWait.set(0); - EXPECT_FALSE(cmdQ.isTimestampWaitEnabled()); + EXPECT_FALSE(cmdQ.isWaitForTimestampsEnabled()); } { DebugManager.flags.EnableTimestampWait.set(1); - EXPECT_EQ(cmdQ.isTimestampWaitEnabled(), cmdQ.getGpgpuCommandStreamReceiver().isUpdateTagFromWaitEnabled()); + EXPECT_EQ(cmdQ.isWaitForTimestampsEnabled(), cmdQ.getGpgpuCommandStreamReceiver().isUpdateTagFromWaitEnabled()); } { DebugManager.flags.EnableTimestampWait.set(2); - EXPECT_EQ(cmdQ.isTimestampWaitEnabled(), cmdQ.getGpgpuCommandStreamReceiver().isDirectSubmissionEnabled()); + EXPECT_EQ(cmdQ.isWaitForTimestampsEnabled(), cmdQ.getGpgpuCommandStreamReceiver().isDirectSubmissionEnabled()); } { DebugManager.flags.EnableTimestampWait.set(3); - EXPECT_EQ(cmdQ.isTimestampWaitEnabled(), cmdQ.getGpgpuCommandStreamReceiver().isAnyDirectSubmissionEnabled()); + EXPECT_EQ(cmdQ.isWaitForTimestampsEnabled(), cmdQ.getGpgpuCommandStreamReceiver().isAnyDirectSubmissionEnabled()); } { DebugManager.flags.EnableTimestampWait.set(4); - EXPECT_TRUE(cmdQ.isTimestampWaitEnabled()); + EXPECT_TRUE(cmdQ.isWaitForTimestampsEnabled()); } } @@ -884,7 +886,7 @@ struct WaitForQueueCompletionTests : public ::testing::Test { 
template struct MyCmdQueue : public CommandQueueHw { MyCmdQueue(Context *context, ClDevice *device) : CommandQueueHw(context, device, nullptr, false){}; - void waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool cleanTemporaryAllocationList) override { + void waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool cleanTemporaryAllocationList, bool skipWait) override { requestedUseQuickKmdSleep = useQuickKmdSleep; waitUntilCompleteCounter++; } @@ -977,7 +979,7 @@ HWTEST_F(WaitUntilCompletionTests, givenCommandQueueAndCleanTemporaryAllocationL cmdQ->gpgpuEngine->commandStreamReceiver = cmdStream.get(); uint32_t taskCount = 0u; StackVec activeBcsStates{}; - cmdQ->waitUntilComplete(taskCount, activeBcsStates, cmdQ->flushStamp->peekStamp(), false, false); + cmdQ->waitUntilComplete(taskCount, activeBcsStates, cmdQ->flushStamp->peekStamp(), false, false, false); auto cmdStreamPtr = &device->getGpgpuCommandStreamReceiver(); diff --git a/opencl/test/unit_test/command_queue/enqueue_handler_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_handler_tests.cpp index 8da9541f81..12d5058bc1 100644 --- a/opencl/test/unit_test/command_queue/enqueue_handler_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_handler_tests.cpp @@ -111,9 +111,9 @@ struct EnqueueHandlerWithAubSubCaptureTests : public EnqueueHandlerTest { public: MockCmdQWithAubSubCapture(Context *context, ClDevice *device) : CommandQueueHw(context, device, nullptr, false) {} - void waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool cleanTemporaryAllocationList) override { + void waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool cleanTemporaryAllocationList, bool skipWait) override { 
waitUntilCompleteCalled = true; - CommandQueueHw::waitUntilComplete(gpgpuTaskCountToWait, copyEnginesToWait, flushStampToWait, useQuickKmdSleep, cleanTemporaryAllocationList); + CommandQueueHw::waitUntilComplete(gpgpuTaskCountToWait, copyEnginesToWait, flushStampToWait, useQuickKmdSleep, cleanTemporaryAllocationList, skipWait); } void obtainNewTimestampPacketNodes(size_t numberOfNodes, TimestampPacketContainer &previousNodes, bool clearAllDependencies, CommandStreamReceiver &csr) override { diff --git a/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp index d422c32b5a..726afe07cb 100644 --- a/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp @@ -803,9 +803,9 @@ class MyCmdQ : public MockCommandQueueHw { auxTranslationDirection); } - void waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool cleanTemporaryAllocationList) override { + void waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool cleanTemporaryAllocationList, bool skipWait) override { waitCalled++; - MockCommandQueueHw::waitUntilComplete(gpgpuTaskCountToWait, copyEnginesToWait, flushStampToWait, useQuickKmdSleep, cleanTemporaryAllocationList); + MockCommandQueueHw::waitUntilComplete(gpgpuTaskCountToWait, copyEnginesToWait, flushStampToWait, useQuickKmdSleep, cleanTemporaryAllocationList, skipWait); } std::vector auxTranslationDirections; diff --git a/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp b/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp index 7cdfb56962..285e03e5b8 100644 --- a/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp +++ b/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp @@ -775,18 +775,16 @@ HWTEST_F(TimestampPacketTests, 
givenTimestampPacketWriteEnabledWhenEnqueueingThe EXPECT_EQ(0u, deferredTimestampPackets->peekNodes().size()); } -HWTEST_F(TimestampPacketTests, givenEnableTimestampWaitWhenFinishWithoutEnqueueThenWaitOnTimestampAndDoNotUpdateTagFromCpu) { +HWTEST_F(TimestampPacketTests, givenEnableTimestampWaitWhenFinishWithoutEnqueueThenDoNotWaitOnTimestamp) { DebugManagerStateRestore restorer; DebugManager.flags.UpdateTaskCountFromWait.set(3); DebugManager.flags.EnableTimestampWait.set(1); - device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; + auto &csr = device->getUltCommandStreamReceiver(); + csr.timestampPacketWriteEnabled = true; + csr.callBaseWaitForCompletionWithTimeout = false; auto cmdQ = std::make_unique>(context, device.get(), nullptr); - const auto &csr = cmdQ->getGpgpuCommandStreamReceiver(); - auto taskCount = *csr.getTagAddress(); - auto latestFlushedTaskCount = csr.peekLatestFlushedTaskCount(); - TimestampPacketContainer *deferredTimestampPackets = cmdQ->deferredTimestampPackets.get(); TimestampPacketContainer *timestampPacketContainer = cmdQ->timestampPacketContainer.get(); @@ -795,16 +793,17 @@ HWTEST_F(TimestampPacketTests, givenEnableTimestampWaitWhenFinishWithoutEnqueueT cmdQ->finish(); - EXPECT_EQ(csr.peekLatestFlushedTaskCount(), latestFlushedTaskCount); - EXPECT_EQ(*csr.getTagAddress(), taskCount); + EXPECT_EQ(csr.waitForCompletionWithTimeoutTaskCountCalled, 1u); } -HWTEST_F(TimestampPacketTests, givenEnableTimestampWaitWhenFinishThenWaitOnTimestampAndUpdateTagFromCpu) { +HWTEST_F(TimestampPacketTests, givenEnableTimestampWaitWhenFinishThenWaitOnTimestamp) { DebugManagerStateRestore restorer; DebugManager.flags.UpdateTaskCountFromWait.set(3); DebugManager.flags.EnableTimestampWait.set(1); - device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; + auto &csr = device->getUltCommandStreamReceiver(); + csr.timestampPacketWriteEnabled = true; + csr.callBaseWaitForCompletionWithTimeout = false; auto cmdQ = 
std::make_unique>(context, device.get(), nullptr); TimestampPacketContainer *deferredTimestampPackets = cmdQ->deferredTimestampPackets.get(); @@ -819,23 +818,22 @@ HWTEST_F(TimestampPacketTests, givenEnableTimestampWaitWhenFinishThenWaitOnTimes typename FamilyType::TimestampPacketType timestampData[] = {2, 2, 2, 2}; for (uint32_t i = 0; i < deferredTimestampPackets->peekNodes()[0]->getPacketsUsed(); i++) { - deferredTimestampPackets->peekNodes()[0]->assignDataToAllTimestamps(i, timestampData); timestampPacketContainer->peekNodes()[0]->assignDataToAllTimestamps(i, timestampData); } cmdQ->finish(); - const auto &csr = cmdQ->getGpgpuCommandStreamReceiver(); - EXPECT_EQ(csr.peekLatestFlushedTaskCount(), 2u); - EXPECT_EQ(*csr.getTagAddress(), 2u); + EXPECT_EQ(csr.waitForCompletionWithTimeoutTaskCountCalled, 0u); } -HWTEST_F(TimestampPacketTests, givenOOQAndEnableTimestampWaitWhenFinishThenWaitOnTimestampAndUpdateTagFromCpu) { +HWTEST_F(TimestampPacketTests, givenOOQAndEnableTimestampWaitWhenFinishThenWaitOnTimestamp) { DebugManagerStateRestore restorer; DebugManager.flags.UpdateTaskCountFromWait.set(3); DebugManager.flags.EnableTimestampWait.set(1); - device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; + auto &csr = device->getUltCommandStreamReceiver(); + csr.timestampPacketWriteEnabled = true; + csr.callBaseWaitForCompletionWithTimeout = false; cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, 0}; auto cmdQ = std::make_unique>(context, device.get(), props); @@ -857,9 +855,7 @@ HWTEST_F(TimestampPacketTests, givenOOQAndEnableTimestampWaitWhenFinishThenWaitO cmdQ->finish(); - const auto &csr = cmdQ->getGpgpuCommandStreamReceiver(); - EXPECT_EQ(csr.peekLatestFlushedTaskCount(), 2u); - EXPECT_EQ(*csr.getTagAddress(), 2u); + EXPECT_EQ(csr.waitForCompletionWithTimeoutTaskCountCalled, 0u); cmdQ.reset(); } diff --git a/opencl/test/unit_test/libult/command_queue_ult.cpp 
b/opencl/test/unit_test/libult/command_queue_ult.cpp index 3819fd80ed..d3845f978f 100644 --- a/opencl/test/unit_test/libult/command_queue_ult.cpp +++ b/opencl/test/unit_test/libult/command_queue_ult.cpp @@ -5,9 +5,12 @@ * */ +#include "shared/test/common/helpers/ult_hw_config.h" + #include "opencl/source/command_queue/command_queue.h" namespace NEO { + bool CommandQueue::isAssignEngineRoundRobinEnabled() { auto assignEngineRoundRobin = false; @@ -17,4 +20,9 @@ bool CommandQueue::isAssignEngineRoundRobinEnabled() { return assignEngineRoundRobin; } + +bool CommandQueue::isTimestampWaitEnabled() { + return ultHwConfig.useWaitForTimestamps; +} + } // namespace NEO \ No newline at end of file diff --git a/opencl/test/unit_test/linux/CMakeLists.txt b/opencl/test/unit_test/linux/CMakeLists.txt index b1a0ab8359..000429960b 100644 --- a/opencl/test/unit_test/linux/CMakeLists.txt +++ b/opencl/test/unit_test/linux/CMakeLists.txt @@ -27,7 +27,7 @@ add_executable(igdrcl_${target_name} ${NEO_SHARED_DIRECTORY}/dll/linux/drm_neo_create.cpp ${NEO_SHARED_DIRECTORY}/dll/linux/options_linux.cpp ${NEO_SHARED_DIRECTORY}/dll/linux/os_interface.cpp - ${NEO_SOURCE_DIR}/opencl/source/dll/linux/command_queue_linux_dll.cpp + ${NEO_SOURCE_DIR}/opencl/source/dll/command_queue_dll.cpp ${NEO_SOURCE_DIR}/opencl/source/os_interface/linux/platform_teardown_linux.cpp ${NEO_SOURCE_DIR}/opencl/test/unit_test/linux${BRANCH_DIR_SUFFIX}drm_other_requests.cpp ) diff --git a/opencl/test/unit_test/linux/main_linux_dll.cpp b/opencl/test/unit_test/linux/main_linux_dll.cpp index 728168c800..262ec68e1e 100644 --- a/opencl/test/unit_test/linux/main_linux_dll.cpp +++ b/opencl/test/unit_test/linux/main_linux_dll.cpp @@ -803,10 +803,14 @@ TEST(DirectSubmissionControllerTest, whenCheckDirectSubmissionControllerSupportT EXPECT_TRUE(DirectSubmissionController::isSupported()); } -TEST(CommandQueueTest, whenCheckEngineRoundRobinAssignThenReturnsTrue) { +TEST(CommandQueueTest, 
whenCheckEngineRoundRobinAssignThenReturnsFalse) { EXPECT_FALSE(CommandQueue::isAssignEngineRoundRobinEnabled()); } +TEST(CommandQueueTest, whenCheckEngineTimestampWaitEnabledThenReturnsFalse) { + EXPECT_FALSE(CommandQueue::isTimestampWaitEnabled()); +} + TEST(CommandQueueTest, givenEnableCmdQRoundRobindEngineAssignSetWhenCheckEngineRoundRobinAssignThenReturnsTrue) { DebugManagerStateRestore restorer; DebugManager.flags.EnableCmdQRoundRobindEngineAssign.set(1); diff --git a/opencl/test/unit_test/mocks/mock_command_queue.h b/opencl/test/unit_test/mocks/mock_command_queue.h index cdfc4fe623..c418b7d3ae 100644 --- a/opencl/test/unit_test/mocks/mock_command_queue.h +++ b/opencl/test/unit_test/mocks/mock_command_queue.h @@ -90,9 +90,9 @@ class MockCommandQueue : public CommandQueue { return writeBufferRetValue; } - void waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool cleanTemporaryAllocationList) override { + void waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool cleanTemporaryAllocationList, bool skipWait) override { latestTaskCountWaited = gpgpuTaskCountToWait; - return CommandQueue::waitUntilComplete(gpgpuTaskCountToWait, copyEnginesToWait, flushStampToWait, useQuickKmdSleep, cleanTemporaryAllocationList); + return CommandQueue::waitUntilComplete(gpgpuTaskCountToWait, copyEnginesToWait, flushStampToWait, useQuickKmdSleep, cleanTemporaryAllocationList, skipWait); } void waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep) override { @@ -202,7 +202,7 @@ class MockCommandQueue : public CommandQueue { bool obtainTimestampPacketForCacheFlush(bool isCacheFlushRequired) const override { return isCacheFlushRequired; } - void waitForTimestamps(uint32_t taskCount) override{}; + bool waitForTimestamps(uint32_t taskCount) override { return false; 
}; bool releaseIndirectHeapCalled = false; @@ -333,9 +333,9 @@ class MockCommandQueueHw : public CommandQueueHw { useBcsCsrOnNotifyEnabled = notifyBcsCsr; } - void waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool cleanTemporaryAllocationList) override { + void waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool cleanTemporaryAllocationList, bool skipWait) override { latestTaskCountWaited = gpgpuTaskCountToWait; - return BaseClass::waitUntilComplete(gpgpuTaskCountToWait, copyEnginesToWait, flushStampToWait, useQuickKmdSleep, cleanTemporaryAllocationList); + return BaseClass::waitUntilComplete(gpgpuTaskCountToWait, copyEnginesToWait, flushStampToWait, useQuickKmdSleep, cleanTemporaryAllocationList, skipWait); } bool isCacheFlushForBcsRequired() const override { diff --git a/shared/source/command_stream/command_stream_receiver.cpp b/shared/source/command_stream/command_stream_receiver.cpp index 4309441fc7..e4e56a32ef 100644 --- a/shared/source/command_stream/command_stream_receiver.cpp +++ b/shared/source/command_stream/command_stream_receiver.cpp @@ -674,16 +674,6 @@ bool CommandStreamReceiver::createAllocationForHostSurface(HostPtrSurface &surfa return true; } -void CommandStreamReceiver::updateTagFromCpu(uint32_t taskCount) { - this->latestFlushedTaskCount.store(taskCount); - - auto partitionAddress = getTagAddress(); - for (uint32_t i = 0; i < activePartitions; i++) { - *partitionAddress = taskCount; - partitionAddress = ptrOffset(partitionAddress, this->postSyncWriteOffset); - } -} - TagAllocatorBase *CommandStreamReceiver::getEventTsAllocator() { if (profilingTimeStampAllocator.get() == nullptr) { std::vector rootDeviceIndices = {rootDeviceIndex}; diff --git a/shared/source/command_stream/command_stream_receiver.h b/shared/source/command_stream/command_stream_receiver.h index 24f94017f5..422bf4d669 
100644 --- a/shared/source/command_stream/command_stream_receiver.h +++ b/shared/source/command_stream/command_stream_receiver.h @@ -225,8 +225,6 @@ class CommandStreamReceiver { virtual void updateTagFromWait() = 0; virtual bool isUpdateTagFromWaitEnabled() = 0; - void updateTagFromCpu(uint32_t taskCount); - ScratchSpaceController *getScratchSpaceController() const { return scratchSpaceController.get(); } diff --git a/shared/source/helpers/hw_helper.h b/shared/source/helpers/hw_helper.h index 39c85471e8..834e4926de 100644 --- a/shared/source/helpers/hw_helper.h +++ b/shared/source/helpers/hw_helper.h @@ -64,6 +64,7 @@ class HwHelper { static bool compressedImagesSupported(const HardwareInfo &hwInfo); static bool cacheFlushAfterWalkerSupported(const HardwareInfo &hwInfo); virtual bool timestampPacketWriteSupported() const = 0; + virtual bool isTimestampWaitSupported() const = 0; virtual size_t getRenderSurfaceStateSize() const = 0; virtual void setRenderSurfaceStateForBuffer(const RootDeviceEnvironment &rootDeviceEnvironment, void *surfaceStateBuffer, @@ -239,6 +240,8 @@ class HwHelperHw : public HwHelper { bool timestampPacketWriteSupported() const override; + bool isTimestampWaitSupported() const override; + bool is1MbAlignmentSupported(const HardwareInfo &hwInfo, bool isCompressionEnabled) const override; bool isFenceAllocationRequired(const HardwareInfo &hwInfo) const override; diff --git a/shared/source/helpers/hw_helper_bdw_and_later.inl b/shared/source/helpers/hw_helper_bdw_and_later.inl index ca136c7281..624933ef0e 100644 --- a/shared/source/helpers/hw_helper_bdw_and_later.inl +++ b/shared/source/helpers/hw_helper_bdw_and_later.inl @@ -40,6 +40,11 @@ bool HwHelperHw::timestampPacketWriteSupported() const { return false; } +template +bool HwHelperHw::isTimestampWaitSupported() const { + return false; +} + template bool HwHelperHw::isAssignEngineRoundRobinSupported() const { return false; diff --git a/shared/source/helpers/hw_helper_pvc_and_later.inl 
b/shared/source/helpers/hw_helper_pvc_and_later.inl index a51c1690e2..f144834ab5 100644 --- a/shared/source/helpers/hw_helper_pvc_and_later.inl +++ b/shared/source/helpers/hw_helper_pvc_and_later.inl @@ -36,6 +36,11 @@ bool HwHelperHw::isCooperativeDispatchSupported(const EngineGroupType en return true; } +template <> +bool HwHelperHw::isTimestampWaitSupported() const { + return true; +} + template <> uint32_t HwHelperHw::adjustMaxWorkGroupCount(uint32_t maxWorkGroupCount, const EngineGroupType engineGroupType, const HardwareInfo &hwInfo, bool isEngineInstanced) const { diff --git a/shared/source/helpers/hw_helper_xehp_and_later.inl b/shared/source/helpers/hw_helper_xehp_and_later.inl index d796a77dfe..acd43d20ab 100644 --- a/shared/source/helpers/hw_helper_xehp_and_later.inl +++ b/shared/source/helpers/hw_helper_xehp_and_later.inl @@ -54,6 +54,11 @@ bool HwHelperHw::timestampPacketWriteSupported() const { return true; } +template +bool HwHelperHw::isTimestampWaitSupported() const { + return false; +} + template const EngineInstancesContainer HwHelperHw::getGpgpuEngineInstances(const HardwareInfo &hwInfo) const { auto defaultEngine = getChosenEngineType(hwInfo); diff --git a/shared/test/common/helpers/ult_hw_config.h b/shared/test/common/helpers/ult_hw_config.h index 328c9e3080..38e858e704 100644 --- a/shared/test/common/helpers/ult_hw_config.h +++ b/shared/test/common/helpers/ult_hw_config.h @@ -12,6 +12,7 @@ struct UltHwConfig { bool useHwCsr = false; bool useMockedPrepareDeviceEnvironmentsFunc = true; bool forceOsAgnosticMemoryManager = true; + bool useWaitForTimestamps = false; bool csrFailInitDirectSubmission = false; bool csrBaseCallDirectSubmissionAvailable = false; diff --git a/shared/test/unit_test/base_ult_config_listener.cpp b/shared/test/unit_test/base_ult_config_listener.cpp index 8773153d3a..48f1cdf819 100644 --- a/shared/test/unit_test/base_ult_config_listener.cpp +++ b/shared/test/unit_test/base_ult_config_listener.cpp @@ -32,7 +32,7 @@ void 
NEO::BaseUltConfigListener::OnTestEnd(const ::testing::TestInfo &) { // Ensure that global state is restored UltHwConfig expectedState{}; - static_assert(sizeof(UltHwConfig) == 11 * sizeof(bool), ""); // Ensure that there is no internal padding + static_assert(sizeof(UltHwConfig) == 12 * sizeof(bool), ""); // Ensure that there is no internal padding EXPECT_EQ(0, memcmp(&expectedState, &ultHwConfig, sizeof(UltHwConfig))); EXPECT_EQ(0, memcmp(&referencedHwInfo.platform, &defaultHwInfo->platform, sizeof(PLATFORM)));