diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl index 1fd46b4c05..4514e29552 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl @@ -239,7 +239,9 @@ NEO::CompletionStamp CommandListCoreFamilyImmediate::flushRegular hasStallingCmds, // hasStallingCmds hasRelaxedOrderingDependencies, // hasRelaxedOrderingDependencies false, // stateCacheInvalidation - false); // isStallingCommandsOnNextFlushRequired + false, // isStallingCommandsOnNextFlushRequired + false // isDcFlushRequiredOnStallingCommandsOnNextFlush + ); this->updateDispatchFlagsWithRequiredStreamState(dispatchFlags); this->csr->setRequiredScratchSizes(this->getCommandListPerThreadScratchSize(), this->getCommandListPerThreadPrivateScratchSize()); diff --git a/opencl/source/command_queue/command_queue.h b/opencl/source/command_queue/command_queue.h index b97f0856c7..eaf67b5e52 100644 --- a/opencl/source/command_queue/command_queue.h +++ b/opencl/source/command_queue/command_queue.h @@ -348,9 +348,12 @@ class CommandQueue : public BaseObject<_cl_command_queue> { void fillCsrDependenciesWithLastBcsPackets(CsrDependencies &csrDeps); void clearLastBcsPackets(); - void setStallingCommandsOnNextFlush(bool isStallingCommandsOnNextFlushRequired) { stallingCommandsOnNextFlushRequired = isStallingCommandsOnNextFlushRequired; } + void setStallingCommandsOnNextFlush(const bool isStallingCommandsOnNextFlushRequired) { stallingCommandsOnNextFlushRequired = isStallingCommandsOnNextFlushRequired; } bool isStallingCommandsOnNextFlushRequired() const { return stallingCommandsOnNextFlushRequired; } + void setDcFlushRequiredOnStallingCommandsOnNextFlush(const bool isDcFlushRequiredOnStallingCommandsOnNextFlush) { dcFlushRequiredOnStallingCommandsOnNextFlush = isDcFlushRequiredOnStallingCommandsOnNextFlush; } + bool isDcFlushRequiredOnStallingCommandsOnNextFlush() const { return dcFlushRequiredOnStallingCommandsOnNextFlush; } + // taskCount of last task TaskCountType taskCount = 0; @@ -460,6 +463,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> { }; std::array bcsTimestampPacketContainers; bool stallingCommandsOnNextFlushRequired = false; + bool dcFlushRequiredOnStallingCommandsOnNextFlush = false; bool splitBarrierRequired = false; }; diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index a1c875cc1e..718061991b 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -272,6 +272,11 @@ cl_int CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, } else if (computeCommandStreamReceiver.peekTimestampPacketWriteEnabled()) { if (CL_COMMAND_BARRIER == commandType && !isNonStallingIoqBarrier) { setStallingCommandsOnNextFlush(true); + if (NEO::DebugManager.flags.SkipDcFlushOnBarrierWithoutEvents.get() == 1 && !event && !getGpgpuCommandStreamReceiver().isMultiTileOperationEnabled()) { + // skip dcFlush + } else { + setDcFlushRequiredOnStallingCommandsOnNextFlush(true); + } this->splitBarrierRequired = true; } @@ -642,6 +647,7 @@ void CommandQueueHw::processDispatchForBlitAuxTranslation(CommandStre *this->timestampPacketContainer, csrDeps, getGpgpuCommandStreamReceiver(), bcsCsr); setStallingCommandsOnNextFlush(true); + setDcFlushRequiredOnStallingCommandsOnNextFlush(true); } eventsRequest.setupBcsCsrForOutputEvent(bcsCsr); @@ -899,7 +905,9 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( false, // hasStallingCmds relaxedOrderingEnabled, // hasRelaxedOrderingDependencies false, // stateCacheInvalidation - isStallingCommandsOnNextFlushRequired()); // isStallingCommandsOnNextFlushRequired + isStallingCommandsOnNextFlushRequired(), // isStallingCommandsOnNextFlushRequired + isDcFlushRequiredOnStallingCommandsOnNextFlush() // isDcFlushRequiredOnStallingCommandsOnNextFlush + ); dispatchFlags.pipelineSelectArgs.mediaSamplerRequired = mediaSamplerRequired; dispatchFlags.pipelineSelectArgs.systolicPipelineSelectMode = systolicPipelineSelectMode; @@ -962,6 +970,7 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( if (isHandlingBarrier) { clearLastBcsPackets(); setStallingCommandsOnNextFlush(false); + setDcFlushRequiredOnStallingCommandsOnNextFlush(false); } if (gtpinIsGTPinInitialized()) { @@ -1156,7 +1165,9 @@ CompletionStamp CommandQueueHw::enqueueCommandWithoutKernel( !hasRelaxedOrderingDependencies, // hasStallingCmds hasRelaxedOrderingDependencies, // hasRelaxedOrderingDependencies stateCacheInvalidationNeeded, // stateCacheInvalidation - isStallingCommandsOnNextFlushRequired()); // isStallingCommandsOnNextFlushRequired + isStallingCommandsOnNextFlushRequired(), // isStallingCommandsOnNextFlushRequired + isDcFlushRequiredOnStallingCommandsOnNextFlush() // isDcFlushRequiredOnStallingCommandsOnNextFlush + ); const bool isHandlingBarrier = isStallingCommandsOnNextFlushRequired(); @@ -1180,6 +1191,7 @@ CompletionStamp CommandQueueHw::enqueueCommandWithoutKernel( if (isHandlingBarrier) { clearLastBcsPackets(); setStallingCommandsOnNextFlush(false); + setDcFlushRequiredOnStallingCommandsOnNextFlush(false); } } @@ -1280,6 +1292,7 @@ cl_int CommandQueueHw::enqueueBlitSplit(MultiDispatchInfo &dispatchIn if (isOOQEnabled() && (isStallingCommandsOnNextFlushRequired() || this->splitBarrierRequired)) { this->setStallingCommandsOnNextFlush(true); + this->setDcFlushRequiredOnStallingCommandsOnNextFlush(true); NullSurface s; Surface *surfaces[] = {&s}; BuiltinOpParams params{}; diff --git a/opencl/source/helpers/task_information.cpp b/opencl/source/helpers/task_information.cpp index 27209b1f89..830ef3ade8 100644 --- a/opencl/source/helpers/task_information.cpp +++ b/opencl/source/helpers/task_information.cpp @@ -84,7 +84,9 @@ CompletionStamp &CommandMapUnmap::submit(TaskCountType taskLevel, bool terminate false, // hasStallingCmds false, // hasRelaxedOrderingDependencies false, // stateCacheInvalidation - commandQueue.isStallingCommandsOnNextFlushRequired()); // isStallingCommandsOnNextFlushRequired + commandQueue.isStallingCommandsOnNextFlushRequired(), // isStallingCommandsOnNextFlushRequired + commandQueue.isDcFlushRequiredOnStallingCommandsOnNextFlush() // isDcFlushRequiredOnStallingCommandsOnNextFlush + ); DEBUG_BREAK_IF(taskLevel >= CompletionStamp::notReady); @@ -179,6 +181,7 @@ CompletionStamp &CommandComputeKernel::submit(TaskCountType taskLevel, bool term *currentTimestampPacketNodes, csrDeps, commandQueue.getGpgpuCommandStreamReceiver(), *bcsCsrForAuxTranslation); commandQueue.setStallingCommandsOnNextFlush(true); + commandQueue.setDcFlushRequiredOnStallingCommandsOnNextFlush(true); } if (timestampPacketDependencies && commandQueue.isOOQEnabled()) { @@ -221,8 +224,9 @@ CompletionStamp &CommandComputeKernel::submit(TaskCountType taskLevel, bool term false, // hasStallingCmds false, // hasRelaxedOrderingDependencies false, // stateCacheInvalidation - commandQueue.isStallingCommandsOnNextFlushRequired()); // isStallingCommandsOnNextFlushRequired - + commandQueue.isStallingCommandsOnNextFlushRequired(), // isStallingCommandsOnNextFlushRequired + commandQueue.isDcFlushRequiredOnStallingCommandsOnNextFlush() // isDcFlushRequiredOnStallingCommandsOnNextFlush + ); if (commandQueue.getContext().getRootDeviceIndices().size() > 1) { eventsRequest.fillCsrDependenciesForRootDevices(dispatchFlags.csrDependencies, commandStreamReceiver); } @@ -269,6 +273,7 @@ CompletionStamp &CommandComputeKernel::submit(TaskCountType taskLevel, bool term if (isHandlingBarrier) { commandQueue.clearLastBcsPackets(); commandQueue.setStallingCommandsOnNextFlush(false); + commandQueue.setDcFlushRequiredOnStallingCommandsOnNextFlush(false); } if (kernelOperation->blitPropertiesContainer.size() > 0) { @@ -398,7 +403,9 @@ CompletionStamp &CommandWithoutKernel::submit(TaskCountType taskLevel, bool term false, // hasStallingCmds false, // hasRelaxedOrderingDependencies false, // stateCacheInvalidation - commandQueue.isStallingCommandsOnNextFlushRequired()); // isStallingCommandsOnNextFlushRequired + commandQueue.isStallingCommandsOnNextFlushRequired(), // isStallingCommandsOnNextFlushRequired + commandQueue.isDcFlushRequiredOnStallingCommandsOnNextFlush() // isDcFlushRequiredOnStallingCommandsOnNextFlush + ); if (commandQueue.getContext().getRootDeviceIndices().size() > 1) { eventsRequest.fillCsrDependenciesForRootDevices(dispatchFlags.csrDependencies, commandStreamReceiver); @@ -427,6 +434,7 @@ CompletionStamp &CommandWithoutKernel::submit(TaskCountType taskLevel, bool term if (isHandlingBarrier) { commandQueue.clearLastBcsPackets(); commandQueue.setStallingCommandsOnNextFlush(false); + commandQueue.setDcFlushRequiredOnStallingCommandsOnNextFlush(false); } if (kernelOperation->blitEnqueue) { diff --git a/opencl/test/unit_test/command_queue/ooq_task_tests.cpp b/opencl/test/unit_test/command_queue/ooq_task_tests.cpp index 52ab88dc8b..e4867a62a0 100644 --- a/opencl/test/unit_test/command_queue/ooq_task_tests.cpp +++ b/opencl/test/unit_test/command_queue/ooq_task_tests.cpp @@ -322,3 +322,89 @@ TEST_F(OOQTaskTests, givenOutOfOrderCommandQueueWhenBarrierIsCalledThenTaskLevel EXPECT_GT(newTaskLevel, currentTaskLevel); } + +HWTEST_F(OOQTaskTests, givenSkipDcFlushOnBarrierWithEventsEnabledWhenEnqueingBarrierWithWaitListThenDcFlushNotSet) { + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + if (false == commandStreamReceiver.peekTimestampPacketWriteEnabled()) { + GTEST_SKIP(); + } + DebugManagerStateRestore restorer; + DebugManager.flags.SkipDcFlushOnBarrierWithoutEvents.set(1); + + const cl_uint numEventsInWaitList = 0; + const cl_event *eventWaitList = nullptr; + auto retVal = pCmdQ->enqueueBarrierWithWaitList( + numEventsInWaitList, + eventWaitList, + nullptr); + EXPECT_EQ(CL_SUCCESS, retVal); + + EXPECT_TRUE(pCmdQ->isStallingCommandsOnNextFlushRequired()); + EXPECT_FALSE(pCmdQ->isDcFlushRequiredOnStallingCommandsOnNextFlush()); +} + +HWTEST_F(OOQTaskTests, givenSkipDcFlushOnBarrierWithEventsEnabledWhenEnqueingBarrierWithWaitListWithEventThenDcFlushSet) { + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + if (false == commandStreamReceiver.peekTimestampPacketWriteEnabled()) { + GTEST_SKIP(); + } + DebugManagerStateRestore restorer; + DebugManager.flags.SkipDcFlushOnBarrierWithoutEvents.set(1); + + const cl_uint numEventsInWaitList = 0; + const cl_event *eventWaitList = nullptr; + cl_event clEvent{}; + auto retVal = pCmdQ->enqueueBarrierWithWaitList( + numEventsInWaitList, + eventWaitList, + &clEvent); + EXPECT_EQ(CL_SUCCESS, retVal); + + EXPECT_TRUE(pCmdQ->isStallingCommandsOnNextFlushRequired()); + EXPECT_TRUE(pCmdQ->isDcFlushRequiredOnStallingCommandsOnNextFlush()); + auto outEvent = castToObject(clEvent); + outEvent->release(); +} + +HWTEST_F(OOQTaskTests, givenSkipDcFlushOnBarrierWithoutEventsDisableddWhenEnqueingBarrierWithWaitListThenDcFlushSet) { + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + if (false == commandStreamReceiver.peekTimestampPacketWriteEnabled()) { + GTEST_SKIP(); + } + DebugManagerStateRestore restorer; + DebugManager.flags.SkipDcFlushOnBarrierWithoutEvents.set(0); + + const cl_uint numEventsInWaitList = 0; + const cl_event *eventWaitList = nullptr; + auto retVal = pCmdQ->enqueueBarrierWithWaitList( + numEventsInWaitList, + eventWaitList, + nullptr); + EXPECT_EQ(CL_SUCCESS, retVal); + + EXPECT_TRUE(pCmdQ->isStallingCommandsOnNextFlushRequired()); + EXPECT_TRUE(pCmdQ->isDcFlushRequiredOnStallingCommandsOnNextFlush()); +} + +HWTEST_F(OOQTaskTests, givenSkipDcFlushOnBarrierWithoutEventsAndMultiTileContextWhenEnqueuingBarrierWithWaitlistThenDcFlushSet) { + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + if (false == commandStreamReceiver.peekTimestampPacketWriteEnabled()) { + GTEST_SKIP(); + } + commandStreamReceiver.setActivePartitions(2u); + commandStreamReceiver.staticWorkPartitioningEnabled = true; + EXPECT_TRUE(commandStreamReceiver.isMultiTileOperationEnabled()); + DebugManagerStateRestore restorer; + DebugManager.flags.SkipDcFlushOnBarrierWithoutEvents.set(1); + + const cl_uint numEventsInWaitList = 0; + const cl_event *eventWaitList = nullptr; + auto retVal = pCmdQ->enqueueBarrierWithWaitList( + numEventsInWaitList, + eventWaitList, + nullptr); + EXPECT_EQ(CL_SUCCESS, retVal); + + EXPECT_TRUE(pCmdQ->isStallingCommandsOnNextFlushRequired()); + EXPECT_TRUE(pCmdQ->isDcFlushRequiredOnStallingCommandsOnNextFlush()); +} \ No newline at end of file diff --git a/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp b/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp index 6d91d9a388..8036d115f9 100644 --- a/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp +++ b/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp @@ -805,7 +805,7 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenPipeControlRequestWhenDispatchingBlitEnq } } -HWTEST_TEMPLATED_F(BcsBufferTests, givenBarrierWhenReleasingMultipleBlockedEnqueuesThenProgramBarrierOnce) { +HWTEST_TEMPLATED_F(BcsBufferTests, givenBarrierWithEmptyWaitlistWhenReleasingMultipleBlockedEnqueuesThenProgramBarrierOnce) { DebugManager.flags.OptimizeIoqBarriersHandling.set(0); using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; diff --git a/shared/source/command_stream/command_stream_receiver_hw.h b/shared/source/command_stream/command_stream_receiver_hw.h index 17096df2bc..3aacfcc8ed 100644 --- a/shared/source/command_stream/command_stream_receiver_hw.h +++ b/shared/source/command_stream/command_stream_receiver_hw.h @@ -189,7 +189,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver { void programVFEState(LinearStream &csr, DispatchFlags &dispatchFlags, uint32_t maxFrontEndThreads); void programStallingCommandsForBarrier(LinearStream &cmdStream, DispatchFlags &dispatchFlags); void programStallingNoPostSyncCommandsForBarrier(LinearStream &cmdStream); - void programStallingPostSyncCommandsForBarrier(LinearStream &cmdStream, TagNodeBase &tagNode); + void programStallingPostSyncCommandsForBarrier(LinearStream &cmdStream, TagNodeBase &tagNode, bool dcFlushRequired); void programEngineModeCommands(LinearStream &csr, const DispatchFlags &dispatchFlags); void programEngineModeEpliogue(LinearStream &csr, const DispatchFlags &dispatchFlags); void programActivePartitionConfigFlushTask(LinearStream &csr); diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index 6e4c6c5ac7..7bf9090d1b 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -743,7 +743,7 @@ inline void CommandStreamReceiverHw::programStallingCommandsForBarrie auto barrierTimestampPacketNodes = dispatchFlags.barrierTimestampPacketNodes; if (barrierTimestampPacketNodes && barrierTimestampPacketNodes->peekNodes().size() != 0) { - programStallingPostSyncCommandsForBarrier(cmdStream, *barrierTimestampPacketNodes->peekNodes()[0]); + programStallingPostSyncCommandsForBarrier(cmdStream, *barrierTimestampPacketNodes->peekNodes()[0], dispatchFlags.isDcFlushRequiredOnStallingCommandsOnNextFlush); barrierTimestampPacketNodes->makeResident(*this); } else { programStallingNoPostSyncCommandsForBarrier(cmdStream); diff --git a/shared/source/command_stream/command_stream_receiver_hw_bdw_and_later.inl b/shared/source/command_stream/command_stream_receiver_hw_bdw_and_later.inl index b2199c3489..0b51de0c6c 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_bdw_and_later.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_bdw_and_later.inl @@ -154,10 +154,10 @@ inline void CommandStreamReceiverHw::programStallingNoPostSyncCommand } template -inline void CommandStreamReceiverHw::programStallingPostSyncCommandsForBarrier(LinearStream &cmdStream, TagNodeBase &tagNode) { +inline void CommandStreamReceiverHw::programStallingPostSyncCommandsForBarrier(LinearStream &cmdStream, TagNodeBase &tagNode, bool dcFlushRequired) { auto barrierTimestampPacketGpuAddress = TimestampPacketHelper::getContextEndGpuAddress(tagNode); PipeControlArgs args; - args.dcFlushEnable = this->dcFlushSupport; + args.dcFlushEnable = this->dcFlushSupport && dcFlushRequired; MemorySynchronizationCommands::addBarrierWithPostSyncOperation( cmdStream, PostSyncMode::ImmediateData, diff --git a/shared/source/command_stream/command_stream_receiver_hw_xehp_and_later.inl b/shared/source/command_stream/command_stream_receiver_hw_xehp_and_later.inl index 3462e9f604..8f0945d995 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_xehp_and_later.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_xehp_and_later.inl @@ -189,10 +189,10 @@ inline void CommandStreamReceiverHw::programStallingNoPostSyncCommand } template -inline void CommandStreamReceiverHw::programStallingPostSyncCommandsForBarrier(LinearStream &cmdStream, TagNodeBase &tagNode) { +inline void CommandStreamReceiverHw::programStallingPostSyncCommandsForBarrier(LinearStream &cmdStream, TagNodeBase &tagNode, bool dcFlushRequired) { auto barrierTimestampPacketGpuAddress = TimestampPacketHelper::getContextEndGpuAddress(tagNode); PipeControlArgs args; - args.dcFlushEnable = this->dcFlushSupport; + args.dcFlushEnable = this->dcFlushSupport && dcFlushRequired; if (isMultiTileOperationEnabled()) { args.workloadPartitionOffset = true; ImplicitScalingDispatch::dispatchBarrierCommands(cmdStream, diff --git a/shared/source/command_stream/csr_definitions.h b/shared/source/command_stream/csr_definitions.h index 2b6c87f575..2d3e85e3a2 100644 --- a/shared/source/command_stream/csr_definitions.h +++ b/shared/source/command_stream/csr_definitions.h @@ -58,38 +58,39 @@ struct DispatchFlags { uint64_t sliceCountP, bool blockingP, bool dcFlushP, bool useSLMP, bool guardCommandBufferWithPipeControlP, bool gsba32BitRequiredP, bool requiresCoherencyP, bool lowPriorityP, bool implicitFlushP, bool outOfOrderExecutionAllowedP, bool epilogueRequiredP, bool usePerDSSbackedBufferP, bool useGlobalAtomicsP, bool areMultipleSubDevicesInContextP, bool memoryMigrationRequiredP, bool textureCacheFlush, - bool hasStallingCmds, bool hasRelaxedOrderingDependencies, bool stateCacheInvalidation, bool isStallingCommandsOnNextFlushRequired) : csrDependencies(csrDependenciesP), - barrierTimestampPacketNodes(barrierTimestampPacketNodesP), - pipelineSelectArgs(pipelineSelectArgsP), - flushStampReference(flushStampReferenceP), - throttle(throttleP), - preemptionMode(preemptionModeP), - numGrfRequired(numGrfRequiredP), - l3CacheSettings(l3CacheSettingsP), - threadArbitrationPolicy(threadArbitrationPolicyP), - additionalKernelExecInfo(additionalKernelExecInfoP), - kernelExecutionType(kernelExecutionTypeP), - memoryCompressionState(memoryCompressionStateP), - sliceCount(sliceCountP), - blocking(blockingP), - dcFlush(dcFlushP), - useSLM(useSLMP), - guardCommandBufferWithPipeControl(guardCommandBufferWithPipeControlP), - gsba32BitRequired(gsba32BitRequiredP), - requiresCoherency(requiresCoherencyP), - lowPriority(lowPriorityP), - implicitFlush(implicitFlushP), - outOfOrderExecutionAllowed(outOfOrderExecutionAllowedP), - epilogueRequired(epilogueRequiredP), - usePerDssBackedBuffer(usePerDSSbackedBufferP), - useGlobalAtomics(useGlobalAtomicsP), - areMultipleSubDevicesInContext(areMultipleSubDevicesInContextP), - memoryMigrationRequired(memoryMigrationRequiredP), - textureCacheFlush(textureCacheFlush), - hasStallingCmds(hasStallingCmds), - hasRelaxedOrderingDependencies(hasRelaxedOrderingDependencies), - stateCacheInvalidation(stateCacheInvalidation), - isStallingCommandsOnNextFlushRequired(isStallingCommandsOnNextFlushRequired){}; + bool hasStallingCmds, bool hasRelaxedOrderingDependencies, bool stateCacheInvalidation, bool isStallingCommandsOnNextFlushRequired, bool isDcFlushRequiredOnStallingCommandsOnNextFlush) : csrDependencies(csrDependenciesP), + barrierTimestampPacketNodes(barrierTimestampPacketNodesP), + pipelineSelectArgs(pipelineSelectArgsP), + flushStampReference(flushStampReferenceP), + throttle(throttleP), + preemptionMode(preemptionModeP), + numGrfRequired(numGrfRequiredP), + l3CacheSettings(l3CacheSettingsP), + threadArbitrationPolicy(threadArbitrationPolicyP), + additionalKernelExecInfo(additionalKernelExecInfoP), + kernelExecutionType(kernelExecutionTypeP), + memoryCompressionState(memoryCompressionStateP), + sliceCount(sliceCountP), + blocking(blockingP), + dcFlush(dcFlushP), + useSLM(useSLMP), + guardCommandBufferWithPipeControl(guardCommandBufferWithPipeControlP), + gsba32BitRequired(gsba32BitRequiredP), + requiresCoherency(requiresCoherencyP), + lowPriority(lowPriorityP), + implicitFlush(implicitFlushP), + outOfOrderExecutionAllowed(outOfOrderExecutionAllowedP), + epilogueRequired(epilogueRequiredP), + usePerDssBackedBuffer(usePerDSSbackedBufferP), + useGlobalAtomics(useGlobalAtomicsP), + areMultipleSubDevicesInContext(areMultipleSubDevicesInContextP), + memoryMigrationRequired(memoryMigrationRequiredP), + textureCacheFlush(textureCacheFlush), + hasStallingCmds(hasStallingCmds), + hasRelaxedOrderingDependencies(hasRelaxedOrderingDependencies), + stateCacheInvalidation(stateCacheInvalidation), + isStallingCommandsOnNextFlushRequired(isStallingCommandsOnNextFlushRequired), + isDcFlushRequiredOnStallingCommandsOnNextFlush(isDcFlushRequiredOnStallingCommandsOnNextFlush){}; CsrDependencies csrDependencies; TimestampPacketContainer *barrierTimestampPacketNodes = nullptr; @@ -125,6 +126,7 @@ struct DispatchFlags { bool disableEUFusion = false; bool stateCacheInvalidation = false; bool isStallingCommandsOnNextFlushRequired = false; + bool isDcFlushRequiredOnStallingCommandsOnNextFlush = false; }; struct CsrSizeRequestFlags { diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index dc71fc0d67..92eb29c514 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -338,6 +338,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, SetAmountOfReusableAllocations, -1, "-1: default DECLARE_DEBUG_VARIABLE(int32_t, UseHighAlignmentForHeapExtended, -1, "-1: default, 0:disabled, > 1: enabled. If enabled, driver aligns HEAP_EXTENDED allocations to GPU VA that is next power of 2 for a given size, if disables GPU VA is using 2MB/64KB alignment.") DECLARE_DEBUG_VARIABLE(int32_t, DispatchCmdlistCmdBufferPrimary, -1, "-1: default, 0: dispatch command buffers as seconadry, 1: dispatch command buffers as primary and chain") DECLARE_DEBUG_VARIABLE(int32_t, UseImmediateFlushTask, -1, "-1: default, 0: use regular flush task, 1: use immediate flush task") +DECLARE_DEBUG_VARIABLE(int32_t, SkipDcFlushOnBarrierWithoutEvents, -1, "-1: default (disabled), 0: disabled, 1: enabled") /*DIRECT SUBMISSION FLAGS*/ DECLARE_DEBUG_VARIABLE(int32_t, EnableDirectSubmission, -1, "-1: default (disabled), 0: disable, 1:enable. Enables direct submission of command buffers bypassing KMD") diff --git a/shared/test/common/helpers/dispatch_flags_helper.h b/shared/test/common/helpers/dispatch_flags_helper.h index cf7c341ba5..3ec6d75a27 100644 --- a/shared/test/common/helpers/dispatch_flags_helper.h +++ b/shared/test/common/helpers/dispatch_flags_helper.h @@ -45,7 +45,8 @@ struct DispatchFlagsHelper { false, // hasStallingCmds false, // hasRelaxedOrderingDependencies false, // stateCacheInvalidation - false // isStallingCommandsOnNextFlushRequired + false, // isStallingCommandsOnNextFlushRequired + false // isDcFlushRequiredOnStallingCommandsOnNextFlush ); } }; diff --git a/shared/test/common/libult/ult_command_stream_receiver.h b/shared/test/common/libult/ult_command_stream_receiver.h index db36778ead..41f99354e1 100644 --- a/shared/test/common/libult/ult_command_stream_receiver.h +++ b/shared/test/common/libult/ult_command_stream_receiver.h @@ -36,6 +36,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw, publ using BaseClass::checkPlatformSupportsNewResourceImplicitFlush; using BaseClass::createKernelArgsBufferAllocation; using BaseClass::csrSizeRequestFlags; + using BaseClass::dcFlushSupport; using BaseClass::directSubmission; using BaseClass::dshState; using BaseClass::getCmdSizeForPrologue; diff --git a/shared/test/common/test_files/igdrcl.config b/shared/test/common/test_files/igdrcl.config index c889be58be..be89f28d88 100644 --- a/shared/test/common/test_files/igdrcl.config +++ b/shared/test/common/test_files/igdrcl.config @@ -542,4 +542,5 @@ TrackNumCsrClientsOnSyncPoints = -1 CommandListTimestampRefreshIntervalInMilliSec = -1 SynchronizeEventBeforeReset = -1 RemoveRestrictionsOnNumberOfThreadsInGpgpuThreadGroup = 0 +SkipDcFlushOnBarrierWithoutEvents = -1 # Please don't edit below this line diff --git a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp index b1c4df3ebb..7a1f9299c6 100644 --- a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp +++ b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp @@ -601,7 +601,7 @@ TEST(CommandStreamReceiverSimpleTest, givenCsrWithoutTagAllocationWhenGetTagAllo EXPECT_EQ(nullptr, csr.getTagAllocation()); } -TEST(CommandStreamReceiverSimpleTest, givenCsrWhenSubmitiingBatchBufferThenTaskCountIsIncrementedAndLatestsValuesSetCorrectly) { +TEST(CommandStreamReceiverSimpleTest, givenCsrWhenSubmitingBatchBufferThenTaskCountIsIncrementedAndLatestsValuesSetCorrectly) { MockExecutionEnvironment executionEnvironment; executionEnvironment.prepareRootDeviceEnvironments(1); executionEnvironment.initializeMemoryManager(); @@ -4209,3 +4209,96 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverHwTest, givenScratchSpaceSurfa EXPECT_EQ(scratchController->privateScratchSizeBytes, alignedSizeForPrivateScratch * scratchController->computeUnitsUsedForScratch); EXPECT_EQ(scratchController->privateScratchSizeBytes, scratchController->getPrivateScratchSpaceAllocation()->getUnderlyingBufferSize()); } + +HWTEST_F(CommandStreamReceiverHwTest, givenDcFlushRequiredWhenProgramStallingPostSyncCommandsForBarrierCalledThenDcFlushSet) { + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + auto &ultCsr = pDevice->getUltCommandStreamReceiver(); + ultCsr.dcFlushSupport = true; + if (ultCsr.isMultiTileOperationEnabled()) { + GTEST_SKIP(); + } + char commandBuffer[MemoryConstants::pageSize]; + LinearStream commandStream(commandBuffer, MemoryConstants::pageSize); + TagNodeBase *tagNode = ultCsr.getTimestampPacketAllocator()->getTag(); + constexpr bool dcFlushRequired = true; + ultCsr.programStallingPostSyncCommandsForBarrier(commandStream, *tagNode, dcFlushRequired); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, + commandStream.getCpuBase(), + commandStream.getUsed())); + auto pipeControlIteratorVector = findAll(cmdList.begin(), cmdList.end()); + ASSERT_GE(pipeControlIteratorVector.size(), 1u); + auto pipeControlIterator = pipeControlIteratorVector[0]; + const bool barrierWaRequired = MemorySynchronizationCommands::isBarrierWaRequired(pDevice->getRootDeviceEnvironment()); + if (barrierWaRequired) { + ASSERT_GE(pipeControlIteratorVector.size(), 2u); + pipeControlIterator = pipeControlIteratorVector[1]; + } + auto pipeControl = genCmdCast(*pipeControlIterator); + ASSERT_NE(nullptr, pipeControl); + EXPECT_TRUE(pipeControl->getDcFlushEnable()); +} + +HWTEST_F(CommandStreamReceiverHwTest, givenDcFlushRequiredButNoDcFlushSupportWhenProgramStallingPostSyncCommandsForBarrierCalledThenDcFlushNotSet) { + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + auto &ultCsr = pDevice->getUltCommandStreamReceiver(); + ultCsr.dcFlushSupport = false; + if (ultCsr.isMultiTileOperationEnabled()) { + GTEST_SKIP(); + } + char commandBuffer[MemoryConstants::pageSize]; + LinearStream commandStream(commandBuffer, MemoryConstants::pageSize); + TagNodeBase *tagNode = ultCsr.getTimestampPacketAllocator()->getTag(); + constexpr bool dcFlushRequired = true; + ultCsr.programStallingPostSyncCommandsForBarrier(commandStream, *tagNode, dcFlushRequired); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, + commandStream.getCpuBase(), + commandStream.getUsed())); + auto pipeControlIteratorVector = findAll(cmdList.begin(), cmdList.end()); + ASSERT_GE(pipeControlIteratorVector.size(), 1u); + auto pipeControlIterator = pipeControlIteratorVector[0]; + const bool barrierWaRequired = MemorySynchronizationCommands::isBarrierWaRequired(pDevice->getRootDeviceEnvironment()); + if (barrierWaRequired) { + ASSERT_GE(pipeControlIteratorVector.size(), 2u); + pipeControlIterator = pipeControlIteratorVector[1]; + } + auto pipeControl = genCmdCast(*pipeControlIterator); + ASSERT_NE(nullptr, pipeControl); + EXPECT_FALSE(pipeControl->getDcFlushEnable()); +} + +HWTEST_F(CommandStreamReceiverHwTest, givenDcFlushRequiredFalseWhenProgramStallingPostSyncCommandsForBarrierCalledThenDcFlushNotSet) { + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + auto &ultCsr = pDevice->getUltCommandStreamReceiver(); + ultCsr.dcFlushSupport = true; + if (ultCsr.isMultiTileOperationEnabled()) { + GTEST_SKIP(); + } + char commandBuffer[MemoryConstants::pageSize]; + LinearStream commandStream(commandBuffer, MemoryConstants::pageSize); + TagNodeBase *tagNode = ultCsr.getTimestampPacketAllocator()->getTag(); + constexpr bool dcFlushRequired = false; + ultCsr.programStallingPostSyncCommandsForBarrier(commandStream, *tagNode, dcFlushRequired); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, + commandStream.getCpuBase(), + commandStream.getUsed())); + auto pipeControlIteratorVector = findAll(cmdList.begin(), cmdList.end()); + ASSERT_GE(pipeControlIteratorVector.size(), 1u); + auto pipeControlIterator = pipeControlIteratorVector[0]; + const bool barrierWaRequired = MemorySynchronizationCommands::isBarrierWaRequired(pDevice->getRootDeviceEnvironment()); + if (barrierWaRequired) { + ASSERT_GE(pipeControlIteratorVector.size(), 2u); + pipeControlIterator = pipeControlIteratorVector[1]; + } + auto pipeControl = genCmdCast(*pipeControlIterator); + ASSERT_NE(nullptr, pipeControl); + EXPECT_FALSE(pipeControl->getDcFlushEnable()); +} diff --git a/shared/test/unit_test/command_stream/compute_mode_tests.h b/shared/test/unit_test/command_stream/compute_mode_tests.h index 2c4b6232b7..8e8bbdb557 100644 --- a/shared/test/unit_test/command_stream/compute_mode_tests.h +++ b/shared/test/unit_test/command_stream/compute_mode_tests.h @@ -95,6 +95,6 @@ struct ComputeModeRequirements : public ::testing::Test { CommandStreamReceiver *csr = nullptr; std::unique_ptr device; - DispatchFlags flags{{}, nullptr, {}, nullptr, QueueThrottle::MEDIUM, PreemptionMode::Disabled, GrfConfig::DefaultGrfNumber, L3CachingSettings::l3CacheOn, ThreadArbitrationPolicy::NotPresent, AdditionalKernelExecInfo::NotApplicable, KernelExecutionType::NotApplicable, MemoryCompressionState::NotApplicable, QueueSliceCount::defaultSliceCount, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false}; + DispatchFlags flags{{}, nullptr, {}, nullptr, QueueThrottle::MEDIUM, PreemptionMode::Disabled, GrfConfig::DefaultGrfNumber, L3CachingSettings::l3CacheOn, ThreadArbitrationPolicy::NotPresent, AdditionalKernelExecInfo::NotApplicable, KernelExecutionType::NotApplicable, MemoryCompressionState::NotApplicable, QueueSliceCount::defaultSliceCount, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false}; GraphicsAllocation *alloc = nullptr; };