diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index 4c558ab060..2e25880dd4 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -771,7 +771,7 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( blocking, //blocking shouldFlushDC(commandType, printfHandler) || allocNeedsFlushDC, //dcFlush multiDispatchInfo.usesSlm(), //useSLM - !getGpgpuCommandStreamReceiver().isUpdateTagFromWaitEnabled(), //guardCommandBufferWithPipeControl + true, //guardCommandBufferWithPipeControl commandType == CL_COMMAND_NDRANGE_KERNEL, //GSBA32BitRequired requiresCoherency, //requiresCoherency (QueuePriority::LOW == priority), //lowPriority @@ -1008,7 +1008,7 @@ CompletionStamp CommandQueueHw::enqueueCommandWithoutKernel( blocking, //blocking false, //dcFlush false, //useSLM - !getGpgpuCommandStreamReceiver().isUpdateTagFromWaitEnabled(), //guardCommandBufferWithPipeControl + true, //guardCommandBufferWithPipeControl false, //GSBA32BitRequired false, //requiresCoherency false, //lowPriority diff --git a/opencl/source/helpers/task_information.cpp b/opencl/source/helpers/task_information.cpp index 3a14b69a60..b8e37c0ad0 100644 --- a/opencl/source/helpers/task_information.cpp +++ b/opencl/source/helpers/task_information.cpp @@ -70,7 +70,7 @@ CompletionStamp &CommandMapUnmap::submit(uint32_t taskLevel, bool terminated) { true, //blocking true, //dcFlush false, //useSLM - !commandQueue.getGpgpuCommandStreamReceiver().isUpdateTagFromWaitEnabled(), //guardCommandBufferWithPipeControl + true, //guardCommandBufferWithPipeControl false, //GSBA32BitRequired false, //requiresCoherency commandQueue.getPriority() == QueuePriority::LOW, //lowPriority @@ -199,7 +199,7 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate true, //blocking flushDC, //dcFlush slmUsed, //useSLM - !commandQueue.getGpgpuCommandStreamReceiver().isUpdateTagFromWaitEnabled(), //guardCommandBufferWithPipeControl + true, //guardCommandBufferWithPipeControl commandType == CL_COMMAND_NDRANGE_KERNEL, //GSBA32BitRequired requiresCoherency, //requiresCoherency commandQueue.getPriority() == QueuePriority::LOW, //lowPriority @@ -357,7 +357,7 @@ CompletionStamp &CommandWithoutKernel::submit(uint32_t taskLevel, bool terminate true, //blocking false, //dcFlush false, //useSLM - !commandStreamReceiver.isUpdateTagFromWaitEnabled(), //guardCommandBufferWithPipeControl + true, //guardCommandBufferWithPipeControl false, //GSBA32BitRequired false, //requiresCoherency commandQueue.getPriority() == QueuePriority::LOW, //lowPriority diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_2_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_2_tests.cpp index 9f4735df71..597eaa8715 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_2_tests.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_2_tests.cpp @@ -102,8 +102,24 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, GivenEnableUpdateTaskFromWaitWhenN // Parse command list parseCommands(commandStreamTask, 0); + auto pipeControlExpected = MemorySynchronizationCommands::getDcFlushEnable(true, *defaultHwInfo); + auto itorPC = find(cmdList.begin(), cmdList.end()); - EXPECT_EQ(cmdList.end(), itorPC); + + if (pipeControlExpected) { + EXPECT_NE(cmdList.end(), itorPC); + if (UnitTestHelper::isPipeControlWArequired(pDevice->getHardwareInfo())) { + itorPC++; + itorPC = find(itorPC, cmdList.end()); + EXPECT_NE(cmdList.end(), itorPC); + } + + // Verify that the dcFlushEnabled bit is set in PC + auto pCmdWA = reinterpret_cast(*itorPC); + EXPECT_EQ(MemorySynchronizationCommands::getDcFlushEnable(true, *defaultHwInfo), pCmdWA->getDcFlushEnable()); + } else { + EXPECT_EQ(cmdList.end(), itorPC); + } buffer->release(); } diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp index 3cba0fd321..c5f12c0dae 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp @@ -1032,38 +1032,6 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, givenCsrInBatchingModeWithOutOfOrd EXPECT_EQ(MemorySynchronizationCommands::getDcFlushEnable(true, *defaultHwInfo), pipeControl->getDcFlushEnable()); } -HWTEST_F(CommandStreamReceiverFlushTaskTests, givenUpdateTaskCountFromWaitSetAndGuardCommandBufferWithPipeControlWhenFlushTaskThenThereIsPipeControlForUpdateTaskCount) { - DebugManagerStateRestore restorer; - DebugManager.flags.UpdateTaskCountFromWait.set(3); - - CommandQueueHw commandQueue(nullptr, pClDevice, 0, false); - auto &commandStream = commandQueue.getCS(4096u); - - auto mockCsr = new MockCsrHw2(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); - pDevice->resetCommandStreamReceiver(mockCsr); - mockCsr->useNewResourceImplicitFlush = false; - mockCsr->useGpuIdleImplicitFlush = false; - mockCsr->overrideDispatchPolicy(DispatchMode::BatchedDispatch); - - DispatchFlags dispatchFlags = DispatchFlagsHelper::createDefaultDispatchFlags(); - dispatchFlags.preemptionMode = PreemptionHelper::getDefaultPreemptionMode(pDevice->getHardwareInfo()); - dispatchFlags.guardCommandBufferWithPipeControl = true; - - mockCsr->flushTask(commandStream, - 0, - dsh, - ioh, - ssh, - taskLevel, - dispatchFlags, - *pDevice); - - parseCommands(commandStream); - auto itorPipeControl = find(cmdList.begin(), cmdList.end()); - - EXPECT_NE(itorPipeControl, cmdList.end()); -} - HWTEST_F(CommandStreamReceiverFlushTaskTests, givenUpdateTaskCountFromWaitSetWhenFlushTaskThenThereIsNoPipeControlForUpdateTaskCount) { DebugManagerStateRestore restorer; DebugManager.flags.UpdateTaskCountFromWait.set(3); @@ -1079,6 +1047,7 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, givenUpdateTaskCountFromWaitSetWhe DispatchFlags dispatchFlags = DispatchFlagsHelper::createDefaultDispatchFlags(); dispatchFlags.preemptionMode = PreemptionHelper::getDefaultPreemptionMode(pDevice->getHardwareInfo()); + dispatchFlags.guardCommandBufferWithPipeControl = true; mockCsr->flushTask(commandStream, 0, diff --git a/opencl/test/unit_test/helpers/hw_helper_tests_pvc_and_later.cpp b/opencl/test/unit_test/helpers/hw_helper_tests_pvc_and_later.cpp index d53e810a02..2bfc0059e3 100644 --- a/opencl/test/unit_test/helpers/hw_helper_tests_pvc_and_later.cpp +++ b/opencl/test/unit_test/helpers/hw_helper_tests_pvc_and_later.cpp @@ -86,12 +86,6 @@ HWTEST2_F(HwHelperTestPvcAndLater, GivenVariousValuesWhenCallingGetBarriersCount EXPECT_EQ(32u, hwHelper.getBarriersCountFromHasBarriers(7u)); } -HWTEST2_F(HwHelperTestPvcAndLater, givenHwHelperWhenCheckIsUpdateTaskCountFromWaitSupportedThenReturnsTrue, IsAtLeastXeHpcCore) { - auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily); - - EXPECT_TRUE(hwHelper.isUpdateTaskCountFromWaitSupported()); -} - HWTEST2_F(HwHelperTestPvcAndLater, givenCooperativeContextSupportedWhenGetEngineInstancesThenReturnCorrectAmountOfCooperativeCcs, IsAtLeastXeHpcCore) { HardwareInfo hwInfo = *defaultHwInfo; hwInfo.gtSystemInfo.CCSInfo.NumberOfCCSEnabled = 2; diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index eafbff673c..dfeffeeddd 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -197,6 +197,7 @@ CompletionStamp CommandStreamReceiverHw::flushTask( const auto &hwInfo = peekHwInfo(); auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily); + bool updateTag = false; if (dispatchFlags.blocking || dispatchFlags.dcFlush || dispatchFlags.guardCommandBufferWithPipeControl) { if (this->dispatchMode == DispatchMode::ImmediateDispatch) { //for ImmediateDispatch we will send this right away, therefore this pipe control will close the level @@ -220,20 +221,29 @@ CompletionStamp CommandStreamReceiverHw::flushTask( auto address = getTagAllocation()->getGpuAddress(); - PipeControlArgs args; - args.dcFlushEnable = MemorySynchronizationCommands::getDcFlushEnable(dispatchFlags.dcFlush, hwInfo); - args.notifyEnable = isUsedNotifyEnableForPostSync(); - args.tlbInvalidation |= dispatchFlags.memoryMigrationRequired; - args.textureCacheInvalidationEnable |= dispatchFlags.textureCacheFlush; - args.workloadPartitionOffset = isMultiTileOperationEnabled(); - MemorySynchronizationCommands::addPipeControlAndProgramPostSyncOperation( - commandStreamTask, - PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, - address, - taskCount + 1, - hwInfo, - args); + updateTag = !isUpdateTagFromWaitEnabled(); + updateTag |= dispatchFlags.blocking; + updateTag |= dispatchFlags.dcFlush; + if (updateTag) { + PipeControlArgs args; + args.dcFlushEnable = MemorySynchronizationCommands::getDcFlushEnable(dispatchFlags.dcFlush, hwInfo); + args.notifyEnable = isUsedNotifyEnableForPostSync(); + args.tlbInvalidation |= dispatchFlags.memoryMigrationRequired; + args.textureCacheInvalidationEnable |= dispatchFlags.textureCacheFlush; + args.workloadPartitionOffset = isMultiTileOperationEnabled(); + MemorySynchronizationCommands::addPipeControlAndProgramPostSyncOperation( + commandStreamTask, + PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, + address, + taskCount + 1, + hwInfo, + args); + } else { + currentPipeControlForNooping = nullptr; + } + + this->latestSentTaskCount = taskCount + 1; DBG_LOG(LogTaskCounts, __FUNCTION__, "Line: ", __LINE__, "taskCount", peekTaskCount()); if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) { flatBatchBufferHelper->setPatchInfoData(PatchInfoData(address, 0u, @@ -248,7 +258,6 @@ CompletionStamp CommandStreamReceiverHw::flushTask( PatchInfoAllocationType::Default)); } } - this->latestSentTaskCount = taskCount + 1; if (DebugManager.flags.ForceSLML3Config.get()) { dispatchFlags.useSLM = true; @@ -576,7 +585,7 @@ CompletionStamp CommandStreamReceiverHw::flushTask( if (submitCSR | submitTask) { if (this->dispatchMode == DispatchMode::ImmediateDispatch) { flushHandler(batchBuffer, this->getResidencyAllocations()); - if (dispatchFlags.blocking || dispatchFlags.dcFlush || dispatchFlags.guardCommandBufferWithPipeControl) { + if (updateTag) { this->latestFlushedTaskCount = this->taskCount + 1; } } else { @@ -1318,9 +1327,7 @@ inline void CommandStreamReceiverHw::flushHandler(BatchBuffer &batchB template inline bool CommandStreamReceiverHw::isUpdateTagFromWaitEnabled() { - auto &hwHelper = HwHelper::get(peekHwInfo().platform.eRenderCoreFamily); - auto enabled = hwHelper.isUpdateTaskCountFromWaitSupported(); - enabled &= this->isAnyDirectSubmissionEnabled(); + bool enabled = false; switch (DebugManager.flags.UpdateTaskCountFromWait.get()) { case 0: @@ -1426,9 +1433,6 @@ inline bool CommandStreamReceiverHw::initDirectSubmission(Device &dev if (directSubmissionController) { directSubmissionController->registerDirectSubmission(this); } - if (this->isUpdateTagFromWaitEnabled()) { - this->overrideDispatchPolicy(DispatchMode::ImmediateDispatch); - } } osContext.setDirectSubmissionActive(); } diff --git a/shared/source/helpers/hw_helper.h b/shared/source/helpers/hw_helper.h index f7b6cd4e2a..f03802c2a8 100644 --- a/shared/source/helpers/hw_helper.h +++ b/shared/source/helpers/hw_helper.h @@ -65,7 +65,6 @@ class HwHelper { static bool cacheFlushAfterWalkerSupported(const HardwareInfo &hwInfo); virtual bool timestampPacketWriteSupported() const = 0; virtual bool isTimestampWaitSupported() const = 0; - virtual bool isUpdateTaskCountFromWaitSupported() const = 0; virtual size_t getRenderSurfaceStateSize() const = 0; virtual void setRenderSurfaceStateForBuffer(const RootDeviceEnvironment &rootDeviceEnvironment, void *surfaceStateBuffer, @@ -250,8 +249,6 @@ class HwHelperHw : public HwHelper { bool isTimestampWaitSupported() const override; - bool isUpdateTaskCountFromWaitSupported() const override; - bool is1MbAlignmentSupported(const HardwareInfo &hwInfo, bool isCompressionEnabled) const override; bool isFenceAllocationRequired(const HardwareInfo &hwInfo) const override; diff --git a/shared/source/helpers/hw_helper_bdw_and_later.inl b/shared/source/helpers/hw_helper_bdw_and_later.inl index 6ecd23300e..bf1abad4df 100644 --- a/shared/source/helpers/hw_helper_bdw_and_later.inl +++ b/shared/source/helpers/hw_helper_bdw_and_later.inl @@ -45,11 +45,6 @@ bool HwHelperHw::isTimestampWaitSupported() const { return false; } -template -bool HwHelperHw::isUpdateTaskCountFromWaitSupported() const { - return false; -} - template bool HwHelperHw::isAssignEngineRoundRobinSupported() const { return false; diff --git a/shared/source/helpers/hw_helper_pvc_and_later.inl b/shared/source/helpers/hw_helper_pvc_and_later.inl index 3686a86820..0d45ced42d 100644 --- a/shared/source/helpers/hw_helper_pvc_and_later.inl +++ b/shared/source/helpers/hw_helper_pvc_and_later.inl @@ -41,11 +41,6 @@ bool HwHelperHw::isTimestampWaitSupported() const { return true; } -template <> -bool HwHelperHw::isUpdateTaskCountFromWaitSupported() const { - return true; -} - template <> uint32_t HwHelperHw::adjustMaxWorkGroupCount(uint32_t maxWorkGroupCount, const EngineGroupType engineGroupType, const HardwareInfo &hwInfo, bool isEngineInstanced) const { diff --git a/shared/source/helpers/hw_helper_xehp_and_later.inl b/shared/source/helpers/hw_helper_xehp_and_later.inl index c48d00b2f5..7134a09f70 100644 --- a/shared/source/helpers/hw_helper_xehp_and_later.inl +++ b/shared/source/helpers/hw_helper_xehp_and_later.inl @@ -60,11 +60,6 @@ bool HwHelperHw::isTimestampWaitSupported() const { return false; } -template -bool HwHelperHw::isUpdateTaskCountFromWaitSupported() const { - return false; -} - template const EngineInstancesContainer HwHelperHw::getGpgpuEngineInstances(const HardwareInfo &hwInfo) const { auto defaultEngine = getChosenEngineType(hwInfo); diff --git a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp index bd05ff2081..dd7002aac9 100644 --- a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp +++ b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp @@ -561,21 +561,6 @@ HWTEST_F(CommandStreamReceiverTest, givenUpdateTaskCountFromWaitWhenCheckTaskCou } } -HWTEST_F(CommandStreamReceiverTest, givenUpdateTaskCountFromWaitWhenCheckIfEnabledThenCanBeEnabledOnlyWithDirectSubmission) { - auto &csr = pDevice->getUltCommandStreamReceiver(); - auto &hwHelper = HwHelper::get(csr.peekHwInfo().platform.eRenderCoreFamily); - - { - csr.directSubmissionAvailable = true; - EXPECT_EQ(csr.isUpdateTagFromWaitEnabled(), hwHelper.isUpdateTaskCountFromWaitSupported()); - } - - { - csr.directSubmissionAvailable = false; - EXPECT_FALSE(csr.isUpdateTagFromWaitEnabled()); - } -} - struct InitDirectSubmissionFixture { void SetUp() { DebugManager.flags.EnableDirectSubmission.set(1);