diff --git a/opencl/test/unit_test/helpers/hw_helper_tests_pvc_and_later.cpp b/opencl/test/unit_test/helpers/hw_helper_tests_pvc_and_later.cpp index a857181b2d..ca36e803e3 100644 --- a/opencl/test/unit_test/helpers/hw_helper_tests_pvc_and_later.cpp +++ b/opencl/test/unit_test/helpers/hw_helper_tests_pvc_and_later.cpp @@ -86,6 +86,12 @@ HWTEST2_F(HwHelperTestPvcAndLater, GivenVariousValuesWhenCallingGetBarriersCount EXPECT_EQ(32u, hwHelper.getBarriersCountFromHasBarriers(7u)); } +HWTEST2_F(HwHelperTestPvcAndLater, givenHwHelperWhenCheckIsUpdateTaskCountFromWaitSupportedThenReturnsTrue, IsAtLeastXeHpcCore) { + auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily); + + EXPECT_TRUE(hwHelper.isUpdateTaskCountFromWaitSupported()); +} + HWTEST2_F(HwHelperTestPvcAndLater, givenCooperativeContextSupportedWhenGetEngineInstancesThenReturnCorrectAmountOfCooperativeCcs, IsAtLeastXeHpcCore) { HardwareInfo hwInfo = *defaultHwInfo; hwInfo.gtSystemInfo.CCSInfo.NumberOfCCSEnabled = 2; diff --git a/opencl/test/unit_test/os_interface/linux/drm_command_stream_tests_1.cpp b/opencl/test/unit_test/os_interface/linux/drm_command_stream_tests_1.cpp index f38a200aac..ecc208c9f9 100644 --- a/opencl/test/unit_test/os_interface/linux/drm_command_stream_tests_1.cpp +++ b/opencl/test/unit_test/os_interface/linux/drm_command_stream_tests_1.cpp @@ -931,6 +931,33 @@ HWTEST_TEMPLATED_F(DrmCommandStreamTest, givenLocalMemoryEnabledWhenCreatingDrmC { DebugManagerStateRestore restore; DebugManager.flags.EnableLocalMemory.set(1); + DebugManager.flags.UpdateTaskCountFromWait.set(3); + + MockDrmCsr csr1(executionEnvironment, 0, 1, gemCloseWorkerMode::gemCloseWorkerInactive); + EXPECT_EQ(DispatchMode::ImmediateDispatch, csr1.dispatchMode); + + DebugManager.flags.CsrDispatchMode.set(static_cast(DispatchMode::BatchedDispatch)); + MockDrmCsr csr2(executionEnvironment, 0, 1, gemCloseWorkerMode::gemCloseWorkerInactive); + EXPECT_EQ(DispatchMode::BatchedDispatch, csr2.dispatchMode); + } + + { + DebugManagerStateRestore restore; + DebugManager.flags.EnableLocalMemory.set(0); + DebugManager.flags.UpdateTaskCountFromWait.set(3); + + MockDrmCsr csr1(executionEnvironment, 0, 1, gemCloseWorkerMode::gemCloseWorkerInactive); + EXPECT_EQ(DispatchMode::ImmediateDispatch, csr1.dispatchMode); + + DebugManager.flags.CsrDispatchMode.set(static_cast(DispatchMode::BatchedDispatch)); + MockDrmCsr csr2(executionEnvironment, 0, 1, gemCloseWorkerMode::gemCloseWorkerInactive); + EXPECT_EQ(DispatchMode::BatchedDispatch, csr2.dispatchMode); + } + + { + DebugManagerStateRestore restore; + DebugManager.flags.EnableLocalMemory.set(1); + DebugManager.flags.UpdateTaskCountFromWait.set(0); MockDrmCsr csr1(executionEnvironment, 0, 1, gemCloseWorkerMode::gemCloseWorkerInactive); EXPECT_EQ(DispatchMode::BatchedDispatch, csr1.dispatchMode); @@ -943,6 +970,7 @@ HWTEST_TEMPLATED_F(DrmCommandStreamTest, givenLocalMemoryEnabledWhenCreatingDrmC { DebugManagerStateRestore restore; DebugManager.flags.EnableLocalMemory.set(0); + DebugManager.flags.UpdateTaskCountFromWait.set(0); MockDrmCsr csr1(executionEnvironment, 0, 1, gemCloseWorkerMode::gemCloseWorkerInactive); EXPECT_EQ(DispatchMode::ImmediateDispatch, csr1.dispatchMode); diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index e1aa01899d..3e550bb184 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -187,12 +187,14 @@ CompletionStamp CommandStreamReceiverHw::flushTask( bool updateTag = false; if (dispatchFlags.blocking || dispatchFlags.dcFlush || dispatchFlags.guardCommandBufferWithPipeControl) { + updateTag = !isUpdateTagFromWaitEnabled(); + if (this->dispatchMode == DispatchMode::ImmediateDispatch) { //for ImmediateDispatch we will send this right away, therefore this pipe control will close the level //for BatchedSubmissions it will be nooped and only last ppc in batch will be emitted. levelClosed = true; //if we guard with ppc, flush dc as well to speed up completion latency - if (dispatchFlags.guardCommandBufferWithPipeControl) { + if (dispatchFlags.guardCommandBufferWithPipeControl && updateTag) { dispatchFlags.dcFlush = true; } } @@ -206,7 +208,6 @@ CompletionStamp CommandStreamReceiverHw::flushTask( auto address = getTagAllocation()->getGpuAddress(); - updateTag = !isUpdateTagFromWaitEnabled(); updateTag |= dispatchFlags.blocking; updateTag |= dispatchFlags.dcFlush; @@ -1311,7 +1312,9 @@ inline void CommandStreamReceiverHw::flushHandler(BatchBuffer &batchB template inline bool CommandStreamReceiverHw::isUpdateTagFromWaitEnabled() { - bool enabled = false; + auto &hwHelper = HwHelper::get(peekHwInfo().platform.eRenderCoreFamily); + auto enabled = hwHelper.isUpdateTaskCountFromWaitSupported(); + enabled &= this->isAnyDirectSubmissionEnabled(); switch (DebugManager.flags.UpdateTaskCountFromWait.get()) { case 0: diff --git a/shared/source/helpers/hw_helper.h b/shared/source/helpers/hw_helper.h index 834e4926de..e6bea92da8 100644 --- a/shared/source/helpers/hw_helper.h +++ b/shared/source/helpers/hw_helper.h @@ -65,6 +65,7 @@ class HwHelper { static bool cacheFlushAfterWalkerSupported(const HardwareInfo &hwInfo); virtual bool timestampPacketWriteSupported() const = 0; virtual bool isTimestampWaitSupported() const = 0; + virtual bool isUpdateTaskCountFromWaitSupported() const = 0; virtual size_t getRenderSurfaceStateSize() const = 0; virtual void setRenderSurfaceStateForBuffer(const RootDeviceEnvironment &rootDeviceEnvironment, void *surfaceStateBuffer, @@ -242,6 +243,8 @@ class HwHelperHw : public HwHelper { bool isTimestampWaitSupported() const override; + bool isUpdateTaskCountFromWaitSupported() const override; + bool is1MbAlignmentSupported(const HardwareInfo &hwInfo, bool isCompressionEnabled) const override; bool isFenceAllocationRequired(const HardwareInfo &hwInfo) const override; diff --git a/shared/source/helpers/hw_helper_bdw_and_later.inl b/shared/source/helpers/hw_helper_bdw_and_later.inl index 624933ef0e..cb01bc28b7 100644 --- a/shared/source/helpers/hw_helper_bdw_and_later.inl +++ b/shared/source/helpers/hw_helper_bdw_and_later.inl @@ -45,6 +45,11 @@ bool HwHelperHw::isTimestampWaitSupported() const { return false; } +template +bool HwHelperHw::isUpdateTaskCountFromWaitSupported() const { + return false; +} + template bool HwHelperHw::isAssignEngineRoundRobinSupported() const { return false; diff --git a/shared/source/helpers/hw_helper_pvc_and_later.inl b/shared/source/helpers/hw_helper_pvc_and_later.inl index f144834ab5..0ece986b7f 100644 --- a/shared/source/helpers/hw_helper_pvc_and_later.inl +++ b/shared/source/helpers/hw_helper_pvc_and_later.inl @@ -41,6 +41,11 @@ bool HwHelperHw::isTimestampWaitSupported() const { return true; } +template <> +bool HwHelperHw::isUpdateTaskCountFromWaitSupported() const { + return true; +} + template <> uint32_t HwHelperHw::adjustMaxWorkGroupCount(uint32_t maxWorkGroupCount, const EngineGroupType engineGroupType, const HardwareInfo &hwInfo, bool isEngineInstanced) const { diff --git a/shared/source/helpers/hw_helper_xehp_and_later.inl b/shared/source/helpers/hw_helper_xehp_and_later.inl index acd43d20ab..80012b41c8 100644 --- a/shared/source/helpers/hw_helper_xehp_and_later.inl +++ b/shared/source/helpers/hw_helper_xehp_and_later.inl @@ -59,6 +59,11 @@ bool HwHelperHw::isTimestampWaitSupported() const { return false; } +template +bool HwHelperHw::isUpdateTaskCountFromWaitSupported() const { + return false; +} + template const EngineInstancesContainer HwHelperHw::getGpgpuEngineInstances(const HardwareInfo &hwInfo) const { auto defaultEngine = getChosenEngineType(hwInfo); diff --git a/shared/source/os_interface/linux/drm_command_stream.inl b/shared/source/os_interface/linux/drm_command_stream.inl index 1f65df931f..5f1dd6247c 100644 --- a/shared/source/os_interface/linux/drm_command_stream.inl +++ b/shared/source/os_interface/linux/drm_command_stream.inl @@ -55,7 +55,7 @@ DrmCommandStreamReceiver::DrmCommandStreamReceiver(ExecutionEnvironme auto hwInfo = rootDeviceEnvironment->getHardwareInfo(); auto localMemoryEnabled = HwHelper::get(hwInfo->platform.eRenderCoreFamily).getEnableLocalMemory(*hwInfo); - this->dispatchMode = localMemoryEnabled ? DispatchMode::BatchedDispatch : DispatchMode::ImmediateDispatch; + this->dispatchMode = localMemoryEnabled && !this->isUpdateTagFromWaitEnabled() ? DispatchMode::BatchedDispatch : DispatchMode::ImmediateDispatch; if (DebugManager.flags.CsrDispatchMode.get()) { this->dispatchMode = static_cast(DebugManager.flags.CsrDispatchMode.get()); diff --git a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp index fa6526cf7b..92b4e6c712 100644 --- a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp +++ b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp @@ -388,6 +388,21 @@ HWTEST_F(CommandStreamReceiverTest, givenUpdateTaskCountFromWaitWhenCheckTaskCou } } +HWTEST_F(CommandStreamReceiverTest, givenUpdateTaskCountFromWaitWhenCheckIfEnabledThenCanBeEnabledOnlyWithDirectSubmission) { + auto &csr = pDevice->getUltCommandStreamReceiver(); + auto &hwHelper = HwHelper::get(csr.peekHwInfo().platform.eRenderCoreFamily); + + { + csr.directSubmissionAvailable = true; + EXPECT_EQ(csr.isUpdateTagFromWaitEnabled(), hwHelper.isUpdateTaskCountFromWaitSupported()); + } + + { + csr.directSubmissionAvailable = false; + EXPECT_FALSE(csr.isUpdateTagFromWaitEnabled()); + } +} + struct InitDirectSubmissionFixture { void SetUp() { DebugManager.flags.EnableDirectSubmission.set(1);