From ca72dff1abfef2ce44e927776200734594f4fa63 Mon Sep 17 00:00:00 2001 From: "Vysochyn, Illia" Date: Tue, 14 Jan 2025 19:34:35 +0000 Subject: [PATCH] feature: Add missing pipelined EU thread arbitration on Xe3 Related-To: NEO-13682 Signed-off-by: Vysochyn, Illia --- .../definitions/stream_properties.inl | 4 +++ .../command_stream/stream_properties.cpp | 12 +++++++++ .../debug_settings/debug_variables_base.inl | 1 + .../xe3_core/hw_cmds_generated_xe3_core.inl | 11 ++++++-- .../xe3_core/command_encoder_xe3_core.cpp | 5 +++- shared/source/xe3_core/hw_cmds_base.h | 3 ++- .../os_agnostic_product_helper_xe3_core.inl | 9 +++++++ shared/test/common/test_files/igdrcl.config | 1 + .../xe3_core/compute_mode_tests_xe3_core.cpp | 2 +- .../xe3_core/test_encode_xe3_core.cpp | 25 +++++++++++++++++++ 10 files changed, 68 insertions(+), 5 deletions(-) diff --git a/shared/source/command_stream/definitions/stream_properties.inl b/shared/source/command_stream/definitions/stream_properties.inl index e879cea2aa..b02e7bf81b 100644 --- a/shared/source/command_stream/definitions/stream_properties.inl +++ b/shared/source/command_stream/definitions/stream_properties.inl @@ -21,6 +21,7 @@ struct StateComputeModePropertiesSupport { bool devicePreemptionMode = false; bool allocationForScratchAndMidthreadPreemption = false; bool enableVariableRegisterSizeAllocation = false; + bool pipelinedEuThreadArbitration = false; }; struct StateComputeModeProperties { @@ -42,6 +43,8 @@ struct StateComputeModeProperties { void copyPropertiesAll(const StateComputeModeProperties &properties); void copyPropertiesGrfNumberThreadArbitration(const StateComputeModeProperties &properties); + void setPipelinedEuThreadArbitration(); + bool isPipelinedEuThreadArbitrationEnabled() const; bool isDirty() const; void clearIsDirty(); @@ -64,6 +67,7 @@ struct StateComputeModeProperties { StateComputeModePropertiesSupport scmPropertiesSupport = {}; int32_t defaultThreadArbitrationPolicy = 0; bool propertiesSupportLoaded = false; + bool pipelinedEuThreadArbitration = false; }; struct FrontEndPropertiesSupport { diff --git a/shared/source/command_stream/stream_properties.cpp b/shared/source/command_stream/stream_properties.cpp index 7903381ec8..c693093031 100644 --- a/shared/source/command_stream/stream_properties.cpp +++ b/shared/source/command_stream/stream_properties.cpp @@ -179,6 +179,10 @@ void StateComputeModeProperties::setPropertiesPerContext(bool requiresCoherency, this->enableVariableRegisterSizeAllocation.set(this->scmPropertiesSupport.enableVariableRegisterSizeAllocation); } + if (this->scmPropertiesSupport.pipelinedEuThreadArbitration) { + setPipelinedEuThreadArbitration(); + } + setPropertiesExtraPerContext(); if (clearDirtyState) { clearIsDirtyPerContext(); @@ -534,3 +538,11 @@ void StateBaseAddressProperties::clearIsDirty() { dynamicStateBaseAddress.isDirty = false; indirectObjectBaseAddress.isDirty = false; } + +void StateComputeModeProperties::setPipelinedEuThreadArbitration() { + this->pipelinedEuThreadArbitration = true; +} + +bool StateComputeModeProperties::isPipelinedEuThreadArbitrationEnabled() const { + return pipelinedEuThreadArbitration; +} diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index f8b324ae40..243fda86f8 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -306,6 +306,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, DebugUmdFifoPollInterval, -1, "-1: default , > 0 DECLARE_DEBUG_VARIABLE(int32_t, DebugUmdInterruptTimeout, -1, "-1: default , > 0: interruptTimeout based on input in milliseconds. Default is 2000 milliseconds") DECLARE_DEBUG_VARIABLE(int32_t, DebugUmdMaxReadWriteRetry, -1, "-1: default , > 0: max pread/pwrite retry attempts in read/writeGpuMemory calls based on input in milliseconds. Default is 3") DECLARE_DEBUG_VARIABLE(int32_t, ForceIndirectDetectionForCMKernels, -1, "-1: default , 0 : disable indirect detection for CM kernels, 1 : enable indirect detection for CM kernels") +DECLARE_DEBUG_VARIABLE(int32_t, PipelinedEuThreadArbitration, -1, "-1: default. 1: Use Walker field, 0: Use StateComputeMode command to program pipelinedEuThreadArbitration") DECLARE_DEBUG_VARIABLE(bool, ForceUseOnlyGlobalTimestamps, 0, "0- default disabled, 1: enable use only global timestamp") /*LOGGING FLAGS*/ diff --git a/shared/source/generated/xe3_core/hw_cmds_generated_xe3_core.inl b/shared/source/generated/xe3_core/hw_cmds_generated_xe3_core.inl index 9205798ed5..4bf1f98488 100644 --- a/shared/source/generated/xe3_core/hw_cmds_generated_xe3_core.inl +++ b/shared/source/generated/xe3_core/hw_cmds_generated_xe3_core.inl @@ -6950,7 +6950,8 @@ typedef struct tagSTATE_COMPUTE_MODE { uint32_t Reserved_37 : BITFIELD_RANGE(5, 6); uint32_t AsyncComputeThreadLimit : BITFIELD_RANGE(7, 9); uint32_t EnableVariableRegisterSizeAllocation_Vrt : BITFIELD_RANGE(10, 10); - uint32_t Reserved_43 : BITFIELD_RANGE(11, 12); + uint32_t Reserved_43 : BITFIELD_RANGE(11, 11); + uint32_t EnablePipelinedEuThreadArbitration : BITFIELD_RANGE(12, 12); uint32_t EuThreadSchedulingMode : BITFIELD_RANGE(13, 14); uint32_t LargeGrfMode : BITFIELD_RANGE(15, 15); uint32_t Mask1 : BITFIELD_RANGE(16, 31); @@ -6959,7 +6960,7 @@ typedef struct tagSTATE_COMPUTE_MODE { uint32_t MidthreadPreemptionOverdispatchThreadGroupCount : BITFIELD_RANGE(3, 4); uint32_t MidthreadPreemptionOverdispatchTestMode : BITFIELD_RANGE(5, 5); uint32_t UavCoherencyMode : BITFIELD_RANGE(6, 6); - uint32_t Reserved_76 : BITFIELD_RANGE(7, 15); + uint32_t Reserved_71 : BITFIELD_RANGE(7, 15); uint32_t Mask2 : BITFIELD_RANGE(16, 31); } Common; uint32_t RawData[3]; @@ -7076,6 +7077,12 @@ typedef struct tagSTATE_COMPUTE_MODE { inline bool getEnableVariableRegisterSizeAllocationVrt() const { return TheStructure.Common.EnableVariableRegisterSizeAllocation_Vrt; } + inline void setEnablePipelinedEuThreadArbitration(const bool value) { + TheStructure.Common.EnablePipelinedEuThreadArbitration = value; + } + inline bool getEnablePipelinedEuThreadArbitration() const { + return TheStructure.Common.EnablePipelinedEuThreadArbitration; + } inline void setEuThreadSchedulingMode(const EU_THREAD_SCHEDULING_MODE value) { TheStructure.Common.EuThreadSchedulingMode = value; } diff --git a/shared/source/xe3_core/command_encoder_xe3_core.cpp b/shared/source/xe3_core/command_encoder_xe3_core.cpp index 466b71c408..e60bc78382 100644 --- a/shared/source/xe3_core/command_encoder_xe3_core.cpp +++ b/shared/source/xe3_core/command_encoder_xe3_core.cpp @@ -32,7 +32,10 @@ void EncodeComputeMode::programComputeModeCommand(LinearStream &csr, Sta auto maskBits = stateComputeMode.getMask1(); auto maskBits2 = stateComputeMode.getMask2(); - if (properties.threadArbitrationPolicy.isDirty) { + if (properties.isPipelinedEuThreadArbitrationEnabled()) { + stateComputeMode.setEnablePipelinedEuThreadArbitration(true); + maskBits |= Family::stateComputeModePipelinedEuThreadArbitrationMask; + } else if (properties.threadArbitrationPolicy.isDirty) { switch (properties.threadArbitrationPolicy.value) { case ThreadArbitrationPolicy::RoundRobin: stateComputeMode.setEuThreadSchedulingMode(STATE_COMPUTE_MODE::EU_THREAD_SCHEDULING_MODE::EU_THREAD_SCHEDULING_MODE_ROUND_ROBIN); diff --git a/shared/source/xe3_core/hw_cmds_base.h b/shared/source/xe3_core/hw_cmds_base.h index 19a3f856f6..b44fd65b90 100644 --- a/shared/source/xe3_core/hw_cmds_base.h +++ b/shared/source/xe3_core/hw_cmds_base.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2024 Intel Corporation + * Copyright (C) 2024-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -24,6 +24,7 @@ struct Xe3Core { #include "shared/source/generated/xe3_core/hw_cmds_generated_xe3_core.inl" static constexpr uint32_t stateComputeModeEnableVariableRegisterSizeAllocationMask = (1u << 10); + static constexpr uint32_t stateComputeModePipelinedEuThreadArbitrationMask = (1u << 12); static constexpr uint32_t stateComputeModeEuThreadSchedulingModeOverrideMask = (0b11u << 13); static constexpr uint32_t stateComputeModeLargeGrfModeMask = (1u << 15); // DW2 diff --git a/shared/source/xe3_core/os_agnostic_product_helper_xe3_core.inl b/shared/source/xe3_core/os_agnostic_product_helper_xe3_core.inl index d22a3c1a8b..e2386c5f9f 100644 --- a/shared/source/xe3_core/os_agnostic_product_helper_xe3_core.inl +++ b/shared/source/xe3_core/os_agnostic_product_helper_xe3_core.inl @@ -31,6 +31,15 @@ void ProductHelperHw::fillScmPropertiesSupportStructure(StateCompute propertiesSupport.enableVariableRegisterSizeAllocation = !!debugManager.flags.EnableXe3VariableRegisterSizeAllocation.get(); } propertiesSupport.largeGrfMode = !propertiesSupport.enableVariableRegisterSizeAllocation; + + bool pipelinedEuThreadArbitration = true; + if (debugManager.flags.PipelinedEuThreadArbitration.get() != -1) { + pipelinedEuThreadArbitration = !!debugManager.flags.PipelinedEuThreadArbitration.get(); + } + + if (pipelinedEuThreadArbitration) { + propertiesSupport.pipelinedEuThreadArbitration = true; + } } template <> diff --git a/shared/test/common/test_files/igdrcl.config b/shared/test/common/test_files/igdrcl.config index ae4e81bab8..edc24d8d04 100644 --- a/shared/test/common/test_files/igdrcl.config +++ b/shared/test/common/test_files/igdrcl.config @@ -662,4 +662,5 @@ LogIndirectDetectionKernelDetails = 0 DirectSubmissionRelaxedOrderingCounterHeuristic = -1 DirectSubmissionRelaxedOrderingCounterHeuristicTreshold = -1 ClearStandaloneInOrderTimestampAllocation = -1 +PipelinedEuThreadArbitration = -1 # Please don't edit below this line diff --git a/shared/test/unit_test/xe3_core/compute_mode_tests_xe3_core.cpp b/shared/test/unit_test/xe3_core/compute_mode_tests_xe3_core.cpp index 3ab2c1a15e..fa6accd984 100644 --- a/shared/test/unit_test/xe3_core/compute_mode_tests_xe3_core.cpp +++ b/shared/test/unit_test/xe3_core/compute_mode_tests_xe3_core.cpp @@ -153,7 +153,7 @@ XE3_CORETEST_F(ComputeModeRequirementsXe3Core, giventhreadArbitrationPolicyWitho hwParser.parseCommands(getCsrHw()->commandStream, startOffset); bool foundOne = false; - uint32_t expectedMask = FamilyType::stateComputeModeEuThreadSchedulingModeOverrideMask; + uint32_t expectedMask = FamilyType::stateComputeModePipelinedEuThreadArbitrationMask; for (auto it = hwParser.cmdList.begin(); it != hwParser.cmdList.end(); it++) { auto cmd = genCmdCast(*it); diff --git a/shared/test/unit_test/xe3_core/test_encode_xe3_core.cpp b/shared/test/unit_test/xe3_core/test_encode_xe3_core.cpp index ab8240ec8e..dcde7f1a17 100644 --- a/shared/test/unit_test/xe3_core/test_encode_xe3_core.cpp +++ b/shared/test/unit_test/xe3_core/test_encode_xe3_core.cpp @@ -382,6 +382,31 @@ XE3_CORETEST_F(EncodeKernelXe3CoreTest, givenDefaultSettingForFenceWhenKernelUse EXPECT_TRUE(postSyncData.getSystemMemoryFenceRequest()); } +XE3_CORETEST_F(EncodeKernelXe3CoreTest, givenDebugFlagSetWhenSetPropertiesAllCalledThenDisablePipelinedThreadArbitrationPolicy) { + DebugManagerStateRestore restore; + + MockExecutionEnvironment executionEnvironment{}; + auto &rootDeviceEnvironment = *executionEnvironment.rootDeviceEnvironments[0]; + + { + StreamProperties streamProperties{}; + streamProperties.initSupport(rootDeviceEnvironment); + + streamProperties.stateComputeMode.setPropertiesAll(false, 0, 0, PreemptionMode::Disabled); + EXPECT_TRUE(streamProperties.stateComputeMode.isPipelinedEuThreadArbitrationEnabled()); + } + + { + debugManager.flags.PipelinedEuThreadArbitration.set(0); + + StreamProperties streamProperties{}; + streamProperties.initSupport(rootDeviceEnvironment); + + streamProperties.stateComputeMode.setPropertiesAll(false, 0, 0, PreemptionMode::Disabled); + EXPECT_FALSE(streamProperties.stateComputeMode.isPipelinedEuThreadArbitrationEnabled()); + } +} + XE3_CORETEST_F(EncodeKernelXe3CoreTest, givenDebugFlagWhenProgrammingStateComputeModeThenEnableVrtFieldIsCorrectlySet) { using STATE_COMPUTE_MODE = typename FamilyType::STATE_COMPUTE_MODE;