From d17255fe1851f71c1f764b85184912ad0203bd8c Mon Sep 17 00:00:00 2001
From: Szymon Morek
Date: Thu, 21 Aug 2025 09:53:34 +0000
Subject: [PATCH] performance: add low latency hint on xe

Related-To: NEO-14708

Signed-off-by: Szymon Morek
---
Reviewer note (ignored by git am):
  * IoctlHelperXe::initialize() only logs the kernel-reported
    DRM_XE_QUERY_CONFIG_FLAG_HAS_LOW_LATENCY bit. isLowLatencyHintAvailable
    is reset to false and is overridden only when the ForceLowLatencyHint
    debug flag differs from -1 (0 -> false, non-zero -> true).
  * IoctlHelperXe::applyContextFlags() ORs DRM_XE_EXEC_QUEUE_LOW_LATENCY_HINT
    into the exec queue create flags when isLowLatencyHintAvailable is true;
    its allocateInterrupt argument is not read.

 .../debug_settings/debug_variables_base.inl   |  1 +
 .../os_interface/linux/xe/ioctl_helper_xe.cpp | 15 ++++
 .../os_interface/linux/xe/ioctl_helper_xe.h   |  3 +-
 .../linux/xe/mock_ioctl_helper_xe.h           |  1 +
 shared/test/common/test_files/igdrcl.config   |  1 +
 .../linux/xe/ioctl_helper_xe_tests.cpp        | 71 +++++++++++++++++++
 6 files changed, 91 insertions(+), 1 deletion(-)

diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl
index a5f58f370f..0270f4b531 100644
--- a/shared/source/debug_settings/debug_variables_base.inl
+++ b/shared/source/debug_settings/debug_variables_base.inl
@@ -433,6 +433,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, ForcePostSyncL1Flush, -1, "-1: default (do nothi
 DECLARE_DEBUG_VARIABLE(int32_t, AllowNotZeroForCompressedOnWddm, -1, "-1: default (do nothing), 0: do not set AllowNotZeroed for compressed resources, 1: set AllowNotZeroed for compressed resources");
 DECLARE_DEBUG_VARIABLE(int32_t, ForceWddmHugeChunkSizeMB, -1, "-1: default (do nothing), >0: set given huge chunk size in MegaBytes for WDDM");
 DECLARE_DEBUG_VARIABLE(int64_t, ForceGmmSystemMemoryBufferForAllocations, 0, "0: default, >0: (bitmask) for given Allocation Types, force GMM_RESOURCE_USAGE_OCL_SYSTEM_MEMORY_BUFFER gmm resource type");
+DECLARE_DEBUG_VARIABLE(int32_t, ForceLowLatencyHint, -1, "Force passing low latency hint during xe_exec_queue creation. -1: default, 0: disabled, 1: enabled");
 DECLARE_DEBUG_VARIABLE(int32_t, EmitMemAdvisePriorToCopyForNonUsm, -1, "Enable Memadvise to system memory for copy/fill with shared system input: -1: default, 0: disabled, 1: enabled")
 DECLARE_DEBUG_VARIABLE(int32_t, TreatNonUsmForTransfersAsSharedSystem, -1, "-1: default, 0: import non-usm as external host ptr on copy/fill (legacy mode), 1: treat non usm on copy/fill as shared system usm")

diff --git a/shared/source/os_interface/linux/xe/ioctl_helper_xe.cpp b/shared/source/os_interface/linux/xe/ioctl_helper_xe.cpp
index 965ef06063..3d937f7337 100644
--- a/shared/source/os_interface/linux/xe/ioctl_helper_xe.cpp
+++ b/shared/source/os_interface/linux/xe/ioctl_helper_xe.cpp
@@ -214,8 +214,17 @@ bool IoctlHelperXe::initialize() {

     xeLog("DRM_XE_QUERY_CONFIG_MAX_EXEC_QUEUE_PRIORITY\t\t%#llx\n",
           config->info[DRM_XE_QUERY_CONFIG_MAX_EXEC_QUEUE_PRIORITY]);
+    xeLog(" DRM_XE_QUERY_CONFIG_FLAG_HAS_LOW_LATENCY\t%s\n",
+          config->info[DRM_XE_QUERY_CONFIG_FLAGS] &
+                  DRM_XE_QUERY_CONFIG_FLAG_HAS_LOW_LATENCY
+              ? "ON"
+              : "OFF");

     maxExecQueuePriority = config->info[DRM_XE_QUERY_CONFIG_MAX_EXEC_QUEUE_PRIORITY] & 0xffff;
+    isLowLatencyHintAvailable = false;
+    if (debugManager.flags.ForceLowLatencyHint.get() != -1) {
+        isLowLatencyHintAvailable = !!debugManager.flags.ForceLowLatencyHint.get();
+    }

     memset(&queryConfig, 0, sizeof(queryConfig));
     queryConfig.query = DRM_XE_DEVICE_QUERY_HWCONFIG;
@@ -1335,6 +1344,12 @@ void IoctlHelperXe::xeShowBindTable() {
     }
 }

+void IoctlHelperXe::applyContextFlags(void *execQueueCreate, bool allocateInterrupt) {
+    if (this->isLowLatencyHintAvailable) {
+        reinterpret_cast<drm_xe_exec_queue_create *>(execQueueCreate)->flags |= DRM_XE_EXEC_QUEUE_LOW_LATENCY_HINT;
+    }
+}
+
 int IoctlHelperXe::createDrmContext(Drm &drm, OsContextLinux &osContext, uint32_t drmVmId, uint32_t deviceIndex, bool allocateInterrupt) {
     uint32_t drmContextId = 0;

diff --git a/shared/source/os_interface/linux/xe/ioctl_helper_xe.h b/shared/source/os_interface/linux/xe/ioctl_helper_xe.h
index d4655c2c99..b120ca3aff 100644
--- a/shared/source/os_interface/linux/xe/ioctl_helper_xe.h
+++ b/shared/source/os_interface/linux/xe/ioctl_helper_xe.h
@@ -178,7 +178,7 @@ class IoctlHelperXe : public IoctlHelper {
     uint16_t getDefaultEngineClass(const aub_stream::EngineType &defaultEngineType);
     void setOptionalContextProperties(Drm &drm, void *extProperties, uint32_t &extIndexInOut);
     virtual void setContextProperties(const OsContextLinux &osContext, uint32_t deviceIndex, void *extProperties, uint32_t &extIndexInOut);
-    virtual void applyContextFlags(void *execQueueCreate, bool allocateInterrupt){};
+    virtual void applyContextFlags(void *execQueueCreate, bool allocateInterrupt);

     struct GtIpVersion {
         uint16_t major;
@@ -187,6 +187,7 @@ class IoctlHelperXe : public IoctlHelper {
     };
     bool queryHwIpVersion(GtIpVersion &gtIpVersion);

+    bool isLowLatencyHintAvailable = false;
     int maxExecQueuePriority = 0;
     std::mutex xeLock;
     std::mutex gemCloseLock;
diff --git a/shared/test/common/os_interface/linux/xe/mock_ioctl_helper_xe.h b/shared/test/common/os_interface/linux/xe/mock_ioctl_helper_xe.h
index 375ca587f7..2ae0174e12 100644
--- a/shared/test/common/os_interface/linux/xe/mock_ioctl_helper_xe.h
+++ b/shared/test/common/os_interface/linux/xe/mock_ioctl_helper_xe.h
@@ -19,6 +19,7 @@ struct MockIoctlHelperXe : IoctlHelperXe {
     using IoctlHelperXe::getFdFromVmExport;
     using IoctlHelperXe::ioctl;
     using IoctlHelperXe::IoctlHelperXe;
+    using IoctlHelperXe::isLowLatencyHintAvailable;
     using IoctlHelperXe::maxContextSetProperties;
     using IoctlHelperXe::maxExecQueuePriority;
     using IoctlHelperXe::queryGtListData;
diff --git a/shared/test/common/test_files/igdrcl.config b/shared/test/common/test_files/igdrcl.config
index e91ec012a8..ecafb472be 100644
--- a/shared/test/common/test_files/igdrcl.config
+++ b/shared/test/common/test_files/igdrcl.config
@@ -648,6 +648,7 @@ PipelinedEuThreadArbitration = -1
 ExperimentalUSMAllocationReuseCleaner = -1
 DummyPageBackingEnabled = 0
 EnableDeferBacking = 0
+ForceLowLatencyHint = -1
 EmitMemAdvisePriorToCopyForNonUsm = -1
 TreatNonUsmForTransfersAsSharedSystem = -1
 SetMaxBVHLevels = -1
diff --git a/shared/test/unit_test/os_interface/linux/xe/ioctl_helper_xe_tests.cpp b/shared/test/unit_test/os_interface/linux/xe/ioctl_helper_xe_tests.cpp
index 04c188af8a..964aedb5d0 100644
--- a/shared/test/unit_test/os_interface/linux/xe/ioctl_helper_xe_tests.cpp
+++ b/shared/test/unit_test/os_interface/linux/xe/ioctl_helper_xe_tests.cpp
@@ -3016,6 +3016,77 @@ TEST_F(IoctlHelperXeTest, givenXeIoctlHelperAndDeferBackingFlagSetToTrueWhenMake
     EXPECT_TRUE(xeIoctlHelper->makeResidentBeforeLockNeeded());
 }

+TEST_F(IoctlHelperXeTest, givenXeIoctlHelperWhenCreateDrmContextAndLowLatencyHintNotAvailableThenNoFlagIsSet) {
+    class MockLinuxOsContext : public OsContextLinux {
+      public:
+        using OsContextLinux::initializeContext;
+        using OsContextLinux::OsContextLinux;
+    };
+
+    auto executionEnvironment = std::make_unique<MockExecutionEnvironment>();
+    auto drm = DrmMockXe::create(*executionEnvironment->rootDeviceEnvironments[0]);
+    auto xeIoctlHelper = static_cast<MockIoctlHelperXe *>(drm->getIoctlHelper());
+    xeIoctlHelper->contextParamEngine.push_back(drm_xe_engine_class_instance{});
+    MockLinuxOsContext osContext(*drm, 0, 5u, NEO::EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_CCS, EngineUsage::regular}));
+
+    osContext.initializeContext(false);
+    EXPECT_EQ(0u, drm->latestExecQueueCreate.flags);
+}
+
+TEST_F(IoctlHelperXeTest, givenXeIoctlHelperWhenCreateDrmContextAndLowLatencyHintAvailableThenFlagIsSet) {
+    class MockLinuxOsContext : public OsContextLinux {
+      public:
+        using OsContextLinux::initializeContext;
+        using OsContextLinux::OsContextLinux;
+    };
+
+    auto executionEnvironment = std::make_unique<MockExecutionEnvironment>();
+    auto drm = DrmMockXe::create(*executionEnvironment->rootDeviceEnvironments[0]);
+    auto xeIoctlHelper = static_cast<MockIoctlHelperXe *>(drm->getIoctlHelper());
+    xeIoctlHelper->contextParamEngine.push_back(drm_xe_engine_class_instance{});
+    MockLinuxOsContext osContext(*drm, 0, 5u, NEO::EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_CCS, EngineUsage::regular}));
+
+    xeIoctlHelper->isLowLatencyHintAvailable = true;
+    osContext.initializeContext(false);
+    EXPECT_EQ(static_cast<uint32_t>(DRM_XE_EXEC_QUEUE_LOW_LATENCY_HINT), drm->latestExecQueueCreate.flags);
+}
+
+TEST_F(IoctlHelperXeTest, whenInitializeIoctlHelperAndLowLatencyNotAvailableThenFlagNotSet) {
+    auto executionEnvironment = std::make_unique<MockExecutionEnvironment>();
+    auto drm = DrmMockXe::create(*executionEnvironment->rootDeviceEnvironments[0]);
+    auto xeIoctlHelper = static_cast<MockIoctlHelperXe *>(drm->getIoctlHelper());
+
+    auto xeQueryConfig = reinterpret_cast<drm_xe_query_config *>(drm->queryConfig);
+    xeQueryConfig->info[DRM_XE_QUERY_CONFIG_FLAGS] = 0;
+    xeIoctlHelper->initialize();
+    EXPECT_FALSE(static_cast<MockIoctlHelperXe *>(xeIoctlHelper)->isLowLatencyHintAvailable);
+}
+
+TEST_F(IoctlHelperXeTest, whenInitializeIoctlHelperAndLowLatencyAvailableThenFlagNotSet) {
+    auto executionEnvironment = std::make_unique<MockExecutionEnvironment>();
+    auto drm = DrmMockXe::create(*executionEnvironment->rootDeviceEnvironments[0]);
+    auto xeIoctlHelper = static_cast<MockIoctlHelperXe *>(drm->getIoctlHelper());
+
+    auto xeQueryConfig = reinterpret_cast<drm_xe_query_config *>(drm->queryConfig);
+    xeQueryConfig->info[DRM_XE_QUERY_CONFIG_FLAGS] = DRM_XE_QUERY_CONFIG_FLAG_HAS_LOW_LATENCY;
+    xeIoctlHelper->initialize();
+    EXPECT_FALSE(static_cast<MockIoctlHelperXe *>(xeIoctlHelper)->isLowLatencyHintAvailable);
+}
+
+TEST_F(IoctlHelperXeTest, whenInitializeIoctlHelperAndLowLatencyAvailableButDebugFlagEnabledThenFlagNotSet) {
+    DebugManagerStateRestore restorer{};
+    debugManager.flags.ForceLowLatencyHint.set(0);
+
+    auto executionEnvironment = std::make_unique<MockExecutionEnvironment>();
+    auto drm = DrmMockXe::create(*executionEnvironment->rootDeviceEnvironments[0]);
+    auto xeIoctlHelper = static_cast<MockIoctlHelperXe *>(drm->getIoctlHelper());
+
+    auto xeQueryConfig = reinterpret_cast<drm_xe_query_config *>(drm->queryConfig);
+    xeQueryConfig->info[DRM_XE_QUERY_CONFIG_FLAGS] = DRM_XE_QUERY_CONFIG_FLAG_HAS_LOW_LATENCY;
+    xeIoctlHelper->initialize();
+    EXPECT_FALSE(static_cast<MockIoctlHelperXe *>(xeIoctlHelper)->isLowLatencyHintAvailable);
+}
+
 TEST_F(IoctlHelperXeTest, givenXeIoctlHelperWhenCallingOverrideMaxSlicesSupportedThenResultIsFalse) {
     auto executionEnvironment = std::make_unique<MockExecutionEnvironment>();
     DrmMock drm{*executionEnvironment->rootDeviceEnvironments[0]};