Add the EnableL1FlushUavCoherencyMode debug variable and a matching
state-compute-mode stream property.

When the property is dirty, the Xe2 HPG and Xe3 encoders program
UAV_COHERENCY_MODE_FLUSH_DATAPORT_L1 and set
stateComputeModeUavCoherencyModeMask (bit 6) in mask2. Product support is
declared false for Xe2HpgCore, Xe3Core and Xe3pCore; in the Xe2 HPG and Xe3
product helpers a debug-variable value other than -1 overrides it.

diff --git a/shared/source/command_stream/definitions/stream_properties.inl b/shared/source/command_stream/definitions/stream_properties.inl
index 02e49385e4..b4a288f964 100644
--- a/shared/source/command_stream/definitions/stream_properties.inl
+++ b/shared/source/command_stream/definitions/stream_properties.inl
@@ -23,6 +23,7 @@ struct StateComputeModePropertiesSupport {
     bool allocationForScratchAndMidthreadPreemption = false;
     bool enableVariableRegisterSizeAllocation = false;
     bool pipelinedEuThreadArbitration = false;
+    bool enableL1FlushUavCoherencyMode = false;
     bool lscSamplerBackingThreshold = false;
     bool enableOutOfBoundariesInTranslationException = false;
     bool enablePageFaultException = false;
@@ -42,6 +43,7 @@ struct StateComputeModeProperties {
     StreamProperty memoryAllocationForScratchAndMidthreadPreemptionBuffers{};
     StreamProperty enableVariableRegisterSizeAllocation{};
     StreamProperty pipelinedEuThreadArbitration{};
+    StreamProperty enableL1FlushUavCoherencyMode{};
     StreamProperty lscSamplerBackingThreshold{};
     StreamProperty enableOutOfBoundariesInTranslationException{};
     StreamProperty enablePageFaultException{};
diff --git a/shared/source/command_stream/stream_properties.cpp b/shared/source/command_stream/stream_properties.cpp
index e73ed3a21e..eb7d3f3bff 100644
--- a/shared/source/command_stream/stream_properties.cpp
+++ b/shared/source/command_stream/stream_properties.cpp
@@ -62,6 +62,7 @@ void StateComputeModeProperties::copyPropertiesAll(const StateComputeModePropert
     memoryAllocationForScratchAndMidthreadPreemptionBuffers.set(properties.memoryAllocationForScratchAndMidthreadPreemptionBuffers.value);
     enableVariableRegisterSizeAllocation.set(properties.enableVariableRegisterSizeAllocation.value);
     pipelinedEuThreadArbitration.set(properties.pipelinedEuThreadArbitration.value);
+    enableL1FlushUavCoherencyMode.set(properties.enableL1FlushUavCoherencyMode.value);
     enablePageFaultException.set(properties.enablePageFaultException.value);
     enableSystemMemoryReadFence.set(properties.enableSystemMemoryReadFence.value);
     enableMemoryException.set(properties.enableMemoryException.value);
@@ -93,6 +94,7 @@ bool StateComputeModeProperties::isDirty() const {
            memoryAllocationForScratchAndMidthreadPreemptionBuffers.isDirty ||
            enableVariableRegisterSizeAllocation.isDirty ||
            pipelinedEuThreadArbitration.isDirty ||
+           enableL1FlushUavCoherencyMode.isDirty ||
            enablePageFaultException.isDirty ||
            enableSystemMemoryReadFence.isDirty ||
            enableMemoryException.isDirty ||
@@ -118,6 +120,7 @@ void StateComputeModeProperties::clearIsDirtyPerContext() {
     devicePreemptionMode.isDirty = false;
     enableVariableRegisterSizeAllocation.isDirty = false;
     pipelinedEuThreadArbitration.isDirty = false;
+    enableL1FlushUavCoherencyMode.isDirty = false;
     enablePageFaultException.isDirty = false;
     enableSystemMemoryReadFence.isDirty = false;
     enableMemoryException.isDirty = false;
@@ -188,6 +191,7 @@ void StateComputeModeProperties::resetState() {
     this->memoryAllocationForScratchAndMidthreadPreemptionBuffers.value = StreamProperty::initValue;
     this->enableVariableRegisterSizeAllocation.value = StreamProperty::initValue;
     this->pipelinedEuThreadArbitration.value = StreamProperty::initValue;
+    this->enableL1FlushUavCoherencyMode.value = StreamProperty::initValue;
     this->enablePageFaultException.value = StreamProperty::initValue;
     this->enableSystemMemoryReadFence.value = StreamProperty::initValue;
     this->enableMemoryException.value = StreamProperty::initValue;
@@ -216,6 +220,10 @@ void StateComputeModeProperties::setPropertiesPerContext(bool requiresCoherency,
         this->pipelinedEuThreadArbitration.set(true);
     }
 
+    if (this->scmPropertiesSupport.enableL1FlushUavCoherencyMode) {
+        this->enableL1FlushUavCoherencyMode.set(this->scmPropertiesSupport.enableL1FlushUavCoherencyMode);
+    }
+
     if (this->scmPropertiesSupport.enablePageFaultException) {
         this->enablePageFaultException.set(this->scmPropertiesSupport.enablePageFaultException);
     }
diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl
index 8aaa36f980..ae10370e60 100644
--- a/shared/source/debug_settings/debug_variables_base.inl
+++ b/shared/source/debug_settings/debug_variables_base.inl
@@ -584,6 +584,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, ForceDispatchTimeoutCounter, -1, "Set timeout fo
 DECLARE_DEBUG_VARIABLE(int32_t, OverrideNumThreadsPerEu, -1, "-1: default, >0: force number of threads per EU")
 DECLARE_DEBUG_VARIABLE(int32_t, Enable64bAddressingForRayTracing, -1, "-1: default, 0: disabled, 1: enabled. Enable support for 64 bit addressing for RayTracing HSD-14016042915")
 DECLARE_DEBUG_VARIABLE(int32_t, EnableXe3VariableRegisterSizeAllocation, -1, "When enabled, use new Xe3 Variable Register per Thread (VRT) feature, -1: default, 0: disabled, 1: enabled")
+DECLARE_DEBUG_VARIABLE(int32_t, EnableL1FlushUavCoherencyMode, -1, "When enabled, state compute mode is configured with L1 flush for UAV coherency mode, -1: default, 0: disabled, 1: enabled")
 DECLARE_DEBUG_VARIABLE(int32_t, ResourceBarrierL1FlushMode, -1, "Invalidate or flush L1 cache in RESOURCE_BARRIER instruction. -1: default, 0: disabled, 1: invalidate L1, 2: flush L1, 3: both")
 
 /* IMPLICIT SCALING */
diff --git a/shared/source/xe2_hpg_core/command_encoder_xe2_hpg_core.cpp b/shared/source/xe2_hpg_core/command_encoder_xe2_hpg_core.cpp
index 61e056fd1c..fcb6285126 100644
--- a/shared/source/xe2_hpg_core/command_encoder_xe2_hpg_core.cpp
+++ b/shared/source/xe2_hpg_core/command_encoder_xe2_hpg_core.cpp
@@ -78,6 +78,11 @@ void EncodeComputeMode<Family>::programComputeModeCommand(LinearStream &csr, Sta
         maskBits |= Family::stateComputeModeLargeGrfModeMask;
     }
 
+    if (properties.enableL1FlushUavCoherencyMode.isDirty) {
+        stateComputeMode.setUavCoherencyMode(STATE_COMPUTE_MODE::UAV_COHERENCY_MODE::UAV_COHERENCY_MODE_FLUSH_DATAPORT_L1);
+        maskBits2 |= Family::stateComputeModeUavCoherencyModeMask;
+    }
+
     stateComputeMode.setMask1(maskBits);
     stateComputeMode.setMask2(maskBits2);
 
diff --git a/shared/source/xe2_hpg_core/hw_cmds_base.h b/shared/source/xe2_hpg_core/hw_cmds_base.h
index f28b79a654..3f37e65c5f 100644
--- a/shared/source/xe2_hpg_core/hw_cmds_base.h
+++ b/shared/source/xe2_hpg_core/hw_cmds_base.h
@@ -24,6 +24,7 @@ struct Xe2HpgCore {
     static constexpr uint32_t stateComputeModeEuThreadSchedulingModeOverrideMask = (0b11u << 13);
     static constexpr uint32_t stateComputeModeLargeGrfModeMask = (1u << 15);
     // DW2
+    static constexpr uint32_t stateComputeModeUavCoherencyModeMask = (1u << 6);
     static constexpr uint32_t stateComputeModeMemoryAllocationForScratchAndMidthreadPreemptionBuffersMask = (1u << 11);
 
     static constexpr bool isUsingL3Control = false;
@@ -53,6 +54,7 @@ struct Xe2HpgCore {
         static constexpr bool devicePreemptionMode = false;
 
         static constexpr bool allocationForScratchAndMidthreadPreemption = true;
+        static constexpr bool enableL1FlushUavCoherencyMode = false;
     };
 
     struct StateBaseAddressStateSupport {
diff --git a/shared/source/xe2_hpg_core/os_agnostic_product_helper_xe2_hpg_core.inl b/shared/source/xe2_hpg_core/os_agnostic_product_helper_xe2_hpg_core.inl
index f3891968c3..510c9b0a60 100644
--- a/shared/source/xe2_hpg_core/os_agnostic_product_helper_xe2_hpg_core.inl
+++ b/shared/source/xe2_hpg_core/os_agnostic_product_helper_xe2_hpg_core.inl
@@ -51,6 +51,10 @@ void ProductHelperHw<gfxProduct>::fillScmPropertiesSupportStructure(StateCompute
 
     fillScmPropertiesSupportStructureBase(propertiesSupport);
     propertiesSupport.allocationForScratchAndMidthreadPreemption = GfxProduct::StateComputeModeStateSupport::allocationForScratchAndMidthreadPreemption;
+    propertiesSupport.enableL1FlushUavCoherencyMode = GfxProduct::StateComputeModeStateSupport::enableL1FlushUavCoherencyMode;
+    if (debugManager.flags.EnableL1FlushUavCoherencyMode.get() != -1) {
+        propertiesSupport.enableL1FlushUavCoherencyMode = !!debugManager.flags.EnableL1FlushUavCoherencyMode.get();
+    }
 }
 
 template <>
diff --git a/shared/source/xe3_core/command_encoder_xe3_core.cpp b/shared/source/xe3_core/command_encoder_xe3_core.cpp
index 9542778d59..9272459b9f 100644
--- a/shared/source/xe3_core/command_encoder_xe3_core.cpp
+++ b/shared/source/xe3_core/command_encoder_xe3_core.cpp
@@ -66,6 +66,11 @@ void EncodeComputeMode<Family>::programComputeModeCommand(LinearStream &csr, Sta
         maskBits |= Family::stateComputeModeEnableVariableRegisterSizeAllocationMask;
     }
 
+    if (properties.enableL1FlushUavCoherencyMode.isDirty) {
+        stateComputeMode.setUavCoherencyMode(STATE_COMPUTE_MODE::UAV_COHERENCY_MODE::UAV_COHERENCY_MODE_FLUSH_DATAPORT_L1);
+        maskBits2 |= Family::stateComputeModeUavCoherencyModeMask;
+    }
+
     stateComputeMode.setMask1(maskBits);
     stateComputeMode.setMask2(maskBits2);
 
diff --git a/shared/source/xe3_core/hw_cmds_base.h b/shared/source/xe3_core/hw_cmds_base.h
index 5489a4441c..c8df73dda0 100644
--- a/shared/source/xe3_core/hw_cmds_base.h
+++ b/shared/source/xe3_core/hw_cmds_base.h
@@ -26,6 +26,7 @@ struct Xe3Core {
     static constexpr uint32_t stateComputeModeEuThreadSchedulingModeOverrideMask = (0b11u << 13);
     static constexpr uint32_t stateComputeModeLargeGrfModeMask = (1u << 15);
     // DW2
+    static constexpr uint32_t stateComputeModeUavCoherencyModeMask = (1u << 6);
 
     static constexpr uint32_t bcsEngineCount = 1u;
     static constexpr uint32_t timestampPacketCount = 16u;
@@ -55,6 +56,7 @@ struct Xe3Core {
 
         static constexpr bool allocationForScratchAndMidthreadPreemption = true;
         static constexpr bool enableVariableRegisterSizeAllocation = true;
+        static constexpr bool enableL1FlushUavCoherencyMode = false;
     };
 
     struct StateBaseAddressStateSupport {
diff --git a/shared/source/xe3_core/os_agnostic_product_helper_xe3_core.inl b/shared/source/xe3_core/os_agnostic_product_helper_xe3_core.inl
index 095deb81db..76044858fd 100644
--- a/shared/source/xe3_core/os_agnostic_product_helper_xe3_core.inl
+++ b/shared/source/xe3_core/os_agnostic_product_helper_xe3_core.inl
@@ -52,6 +52,11 @@ void ProductHelperHw<gfxProduct>::fillScmPropertiesSupportStructure(StateCompute
     if (pipelinedEuThreadArbitration) {
         propertiesSupport.pipelinedEuThreadArbitration = true;
     }
+
+    propertiesSupport.enableL1FlushUavCoherencyMode = GfxProduct::StateComputeModeStateSupport::enableL1FlushUavCoherencyMode;
+    if (debugManager.flags.EnableL1FlushUavCoherencyMode.get() != -1) {
+        propertiesSupport.enableL1FlushUavCoherencyMode = !!debugManager.flags.EnableL1FlushUavCoherencyMode.get();
+    }
 }
 
 template <>
diff --git a/shared/source/xe3p_core/hw_cmds_base.h b/shared/source/xe3p_core/hw_cmds_base.h
index b8571ca52d..63dcf88044 100644
--- a/shared/source/xe3p_core/hw_cmds_base.h
+++ b/shared/source/xe3p_core/hw_cmds_base.h
@@ -30,6 +30,7 @@ struct Xe3pCore {
     static constexpr uint32_t stateComputeModeEuThreadSchedulingModeOverrideMask = (0b11u << 13);
     static constexpr uint32_t stateComputeModeLargeGrfModeMask = (1u << 15);
     // DW2
+    static constexpr uint32_t stateComputeModeUavCoherencyModeMask = (1u << 6);
     static constexpr uint32_t stateComputeModeEnableOutOfBoundariesInTranslationExceptionMask = (1u << 7);
     static constexpr uint32_t stateComputeModePageFaultExceptionEnableMask = (1u << 9);
     static constexpr uint32_t stateComputeModeSystemMemoryReadFenceEnableMask = (1u << 11);
@@ -66,6 +67,7 @@ struct Xe3pCore {
 
         static constexpr bool allocationForScratchAndMidthreadPreemption = true;
         static constexpr bool enableVariableRegisterSizeAllocation = true;
+        static constexpr bool enableL1FlushUavCoherencyMode = false;
         static constexpr bool enablePageFaultException = false;
         static constexpr bool enableSystemMemoryReadFence = false;
         static constexpr bool enableMemoryException = false;
diff --git a/shared/test/common/test_files/igdrcl.config b/shared/test/common/test_files/igdrcl.config
index 270684e040..45a6283ae2 100644
--- a/shared/test/common/test_files/igdrcl.config
+++ b/shared/test/common/test_files/igdrcl.config
@@ -685,6 +685,7 @@ Enable512NumGrfs = 1
 EnableUsmPoolResidencyTracking = -1
 EnableUsmPoolLazyInit = -1
 ForcePrintsRedirection = -1
+EnableL1FlushUavCoherencyMode = -1
 ResourceBarrierL1FlushMode = -1
 InitialCounterBasedEventValue = -1
 DirectSubmissionInitialSemaphoreValue = -1
diff --git a/shared/test/unit_test/encoders/command_encoder_tests_xe2_and_later.cpp b/shared/test/unit_test/encoders/command_encoder_tests_xe2_and_later.cpp
index 547075c1da..49dcb85a42 100644
--- a/shared/test/unit_test/encoders/command_encoder_tests_xe2_and_later.cpp
+++ b/shared/test/unit_test/encoders/command_encoder_tests_xe2_and_later.cpp
@@ -1,11 +1,12 @@
 /*
- * Copyright (C) 2024 Intel Corporation
+ * Copyright (C) 2024-2025 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
  */
 
 #include "shared/source/command_container/encode_surface_state.h"
+#include "shared/source/command_stream/stream_properties.h"
 #include "shared/source/gmm_helper/gmm_helper.h"
 #include "shared/source/gmm_helper/gmm_lib.h"
 #include "shared/source/os_interface/product_helper.h"
@@ -63,3 +64,54 @@ HWTEST2_F(CommandEncodeStatesTestXe2AndLater, whenDebugFlagIsDisabledForAdjustPi
 HWTEST2_F(ImplicitScalingTests, GivenXeAtLeastHpg2WhenCheckingPipeControlStallRequiredThenExpectTrue, IsAtLeastXe2HpgCore) {
     EXPECT_FALSE(ImplicitScalingDispatch<FamilyType>::getPipeControlStallRequired());
 }
+
+HWTEST2_F(CommandEncodeStatesTestXe2AndLater, givenDebugFlagWhenProgrammingStateComputeModeThenEnableL1FlushUavCoherencyMode, IsAtLeastXe2HpgCore) {
+    using STATE_COMPUTE_MODE = typename FamilyType::STATE_COMPUTE_MODE;
+
+    DebugManagerStateRestore restore;
+
+    uint8_t buffer[sizeof(STATE_COMPUTE_MODE)]{};
+    const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment();
+    {
+        // default: debug flag left untouched, drain mode is expected
+        LinearStream linearStream(buffer, sizeof(buffer));
+
+        StreamProperties streamProperties{};
+        streamProperties.initSupport(rootDeviceEnvironment);
+        streamProperties.stateComputeMode.setPropertiesAll(false, 0, 0, PreemptionMode::Disabled, false);
+        EncodeComputeMode<FamilyType>::programComputeModeCommand(linearStream, streamProperties.stateComputeMode, rootDeviceEnvironment);
+
+        auto &stateComputeModeCmd = *reinterpret_cast<STATE_COMPUTE_MODE *>(linearStream.getCpuBase());
+        EXPECT_EQ(STATE_COMPUTE_MODE::UAV_COHERENCY_MODE::UAV_COHERENCY_MODE_DRAIN_DATAPORT_MODE, stateComputeModeCmd.getUavCoherencyMode());
+    }
+
+    {
+        // enabled: debug flag set to 1, L1 flush mode is expected
+        debugManager.flags.EnableL1FlushUavCoherencyMode.set(1);
+
+        LinearStream linearStream(buffer, sizeof(buffer));
+
+        StreamProperties streamProperties{};
+        streamProperties.initSupport(rootDeviceEnvironment);
+        streamProperties.stateComputeMode.setPropertiesAll(false, 0, 0, PreemptionMode::Disabled, false);
+        EncodeComputeMode<FamilyType>::programComputeModeCommand(linearStream, streamProperties.stateComputeMode, rootDeviceEnvironment);
+
+        auto &stateComputeModeCmd = *reinterpret_cast<STATE_COMPUTE_MODE *>(linearStream.getCpuBase());
+        EXPECT_EQ(STATE_COMPUTE_MODE::UAV_COHERENCY_MODE::UAV_COHERENCY_MODE_FLUSH_DATAPORT_L1, stateComputeModeCmd.getUavCoherencyMode());
+    }
+
+    {
+        // disabled: debug flag set to 0, drain mode is expected
+        debugManager.flags.EnableL1FlushUavCoherencyMode.set(0);
+
+        LinearStream linearStream(buffer, sizeof(buffer));
+
+        StreamProperties streamProperties{};
+        streamProperties.initSupport(rootDeviceEnvironment);
+        streamProperties.stateComputeMode.setPropertiesAll(false, 0, 0, PreemptionMode::Disabled, false);
+        EncodeComputeMode<FamilyType>::programComputeModeCommand(linearStream, streamProperties.stateComputeMode, rootDeviceEnvironment);
+
+        auto &stateComputeModeCmd = *reinterpret_cast<STATE_COMPUTE_MODE *>(linearStream.getCpuBase());
+        EXPECT_EQ(STATE_COMPUTE_MODE::UAV_COHERENCY_MODE::UAV_COHERENCY_MODE_DRAIN_DATAPORT_MODE, stateComputeModeCmd.getUavCoherencyMode());
+    }
+}