From e55d4bf234c1285de62aa4db1dd79958a6df2907 Mon Sep 17 00:00:00 2001 From: Kamil Kopryk Date: Fri, 13 Mar 2020 13:14:28 +0100 Subject: [PATCH] Adjust preffered wgs multiple for specific configs Change-Id: Ib7e788760f0400b983e03044386f04637e12727e Signed-off-by: Kamil Kopryk Related-To: NEO-4331 --- opencl/source/gen12lp/hw_helper_gen12lp.cpp | 6 ++++- opencl/source/kernel/kernel.cpp | 6 ++++- .../gen12lp/hw_helper_tests_gen12lp.inl | 26 +++++++++++++++++++ .../unit_test/helpers/hw_helper_tests.cpp | 8 ++++++ .../debug_settings/debug_variables_base.inl | 2 +- shared/source/gen12lp/preamble_gen12lp.cpp | 9 +++---- shared/source/helpers/CMakeLists.txt | 1 + shared/source/helpers/hw_helper.h | 3 +++ shared/source/helpers/hw_helper_base.inl | 5 ++++ .../source/helpers/hw_helper_tgllp_plus.inl | 19 ++++++++++++++ 10 files changed, 77 insertions(+), 8 deletions(-) create mode 100644 shared/source/helpers/hw_helper_tgllp_plus.inl diff --git a/opencl/source/gen12lp/hw_helper_gen12lp.cpp b/opencl/source/gen12lp/hw_helper_gen12lp.cpp index ce6d339bb1..1e096adebd 100644 --- a/opencl/source/gen12lp/hw_helper_gen12lp.cpp +++ b/opencl/source/gen12lp/hw_helper_gen12lp.cpp @@ -5,8 +5,13 @@ * */ +#include "shared/source/gen12lp/hw_cmds.h" + +using Family = NEO::TGLLPFamily; + #include "shared/source/helpers/flat_batch_buffer_helper_hw.inl" #include "shared/source/helpers/hw_helper_bdw_plus.inl" +#include "shared/source/helpers/hw_helper_tgllp_plus.inl" #include "opencl/source/aub/aub_helper_bdw_plus.inl" #include "opencl/source/gen12lp/helpers_gen12lp.h" @@ -14,7 +19,6 @@ #include "engine_node.h" namespace NEO { -typedef TGLLPFamily Family; template <> bool HwHelperHw::isOffsetToSkipSetFFIDGPWARequired(const HardwareInfo &hwInfo) const { diff --git a/opencl/source/kernel/kernel.cpp b/opencl/source/kernel/kernel.cpp index dc7f3a6e83..e3a5bff2a3 100644 --- a/opencl/source/kernel/kernel.cpp +++ b/opencl/source/kernel/kernel.cpp @@ -581,7 +581,8 @@ cl_int 
Kernel::getWorkGroupInfo(cl_device_id device, cl_kernel_work_group_info p cl_ulong scratchSize; cl_ulong privateMemSize; size_t maxWorkgroupSize; - + const auto &hwInfo = getDevice().getHardwareInfo(); + auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily); GetInfoHelper info(paramValue, paramValueSize, paramValueSizeRet); switch (paramName) { @@ -612,6 +613,9 @@ cl_int Kernel::getWorkGroupInfo(cl_device_id device, cl_kernel_work_group_info p case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: DEBUG_BREAK_IF(!patchInfo.executionEnvironment); preferredWorkGroupSizeMultiple = patchInfo.executionEnvironment->LargestCompiledSIMDSize; + if (hwHelper.isFusedEuDispatchEnabled(hwInfo)) { + preferredWorkGroupSizeMultiple *= 2; + } retVal = changeGetInfoStatusToCLResultType((info.set(preferredWorkGroupSizeMultiple))); break; diff --git a/opencl/test/unit_test/gen12lp/hw_helper_tests_gen12lp.inl b/opencl/test/unit_test/gen12lp/hw_helper_tests_gen12lp.inl index cf3258081a..45f7b5012c 100644 --- a/opencl/test/unit_test/gen12lp/hw_helper_tests_gen12lp.inl +++ b/opencl/test/unit_test/gen12lp/hw_helper_tests_gen12lp.inl @@ -5,6 +5,8 @@ * */ +#include "shared/test/unit_test/helpers/debug_manager_state_restore.h" + #include "opencl/test/unit_test/gen12lp/special_ult_helper_gen12lp.h" #include "opencl/test/unit_test/helpers/hw_helper_tests.h" #include "opencl/test/unit_test/mocks/mock_context.h" @@ -144,6 +146,30 @@ GEN12LPTEST_F(HwHelperTestGen12Lp, givenFtrCcsNodeSetAndDefaultRcsWhenGetGpgpuEn EXPECT_EQ(aub_stream::ENGINE_CCS, engines[3]); } +GEN12LPTEST_F(HwHelperTestGen12Lp, givenTgllpWhenIsFusedEuDispatchEnabledIsCalledThenResultIsCorrect) { + DebugManagerStateRestore restorer; + auto &helper = HwHelper::get(renderCoreFamily); + auto &waTable = hardwareInfo.workaroundTable; + bool wa; + int32_t debugKey; + size_t expectedResult; + + const std::array, 6> testParams{std::make_tuple(true, false, -1), + std::make_tuple(false, true, -1), + std::make_tuple(true, false, 0), 
+ std::make_tuple(true, true, 0), + std::make_tuple(false, false, 1), + std::make_tuple(false, true, 1)}; + + for (const auto &params : testParams) { + std::tie(expectedResult, wa, debugKey) = params; + waTable.waDisableFusedThreadScheduling = wa; + DebugManager.flags.CFEFusedEUDispatch.set(debugKey); + + EXPECT_EQ(expectedResult, helper.isFusedEuDispatchEnabled(hardwareInfo)); + } +} + class HwHelperTestsGen12LpBuffer : public ::testing::Test { public: void SetUp() override { diff --git a/opencl/test/unit_test/helpers/hw_helper_tests.cpp b/opencl/test/unit_test/helpers/hw_helper_tests.cpp index bf2543cac9..f6cc175f9f 100644 --- a/opencl/test/unit_test/helpers/hw_helper_tests.cpp +++ b/opencl/test/unit_test/helpers/hw_helper_tests.cpp @@ -816,6 +816,14 @@ HWTEST_F(HwHelperTest, givenDefaultHwHelperHwWhenMinimalSIMDSizeIsQueriedThen8Is EXPECT_EQ(8u, helper.getMinimalSIMDSize()); } +HWCMDTEST_F(IGFX_GEN8_CORE, HwHelperTest, WhenIsFusedEuDispatchEnabledIsCalledThenFalseIsReturned) { + if (hardwareInfo.platform.eRenderCoreFamily == IGFX_GEN12LP_CORE) { + GTEST_SKIP(); + } + auto &helper = HwHelper::get(renderCoreFamily); + EXPECT_FALSE(helper.isFusedEuDispatchEnabled(hardwareInfo)); +} + HWTEST_F(PipeControlHelperTests, WhenGettingPipeControSizeForCacheFlushThenReturnCorrectValue) { using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; size_t actualSize = MemorySynchronizationCommands::getSizeForFullCacheFlush(); diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index b125b45cc9..248249cdc4 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -39,7 +39,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, SchedulerSimulationReturnInstance, 0, "prints ex DECLARE_DEBUG_VARIABLE(int32_t, SchedulerGWS, 0, "Forces gws of scheduler kernel, only multiple of 24 allowed or 0 - default selected") DECLARE_DEBUG_VARIABLE(int32_t, 
EnableExperimentalCommandBuffer, 0, "Enables injection of experimental command buffer") DECLARE_DEBUG_VARIABLE(int32_t, OverrideStatelessMocsIndex, -1, "-1: feature inactive, >=0 : following MOCS index will be programmed for stateless accesses in state base address") -DECLARE_DEBUG_VARIABLE(int32_t, CFEFusedEUDispatch, -1, "Set Fused EU dispatch in FrontEnd State command. -1 - do not set") +DECLARE_DEBUG_VARIABLE(int32_t, CFEFusedEUDispatch, -1, "Set Fused EU dispatch in FrontEnd State command. -1 - default, 0 - enabled, 1 - disabled") DECLARE_DEBUG_VARIABLE(int32_t, ForceAuxTranslationMode, -1, "-1: Default, 0: Builtin, 1: Blit") DECLARE_DEBUG_VARIABLE(int32_t, OverrideGpuAddressSpace, -1, "-1: Default, !=-1: GPU address space range in bits") DECLARE_DEBUG_VARIABLE(int32_t, OverrideMaxWorkgroupSize, -1, "-1: Default, !=-1: Overrides max worgkroup size to this value") diff --git a/shared/source/gen12lp/preamble_gen12lp.cpp b/shared/source/gen12lp/preamble_gen12lp.cpp index 91b2511ca5..a7b4e822dd 100644 --- a/shared/source/gen12lp/preamble_gen12lp.cpp +++ b/shared/source/gen12lp/preamble_gen12lp.cpp @@ -79,13 +79,12 @@ uint32_t PreambleHelper::getUrbEntryAllocationSize() { template <> void PreambleHelper::programAdditionalFieldsInVfeState(VFE_STATE_TYPE *mediaVfeState, const HardwareInfo &hwInfo) { - mediaVfeState->setDisableSlice0Subslice2(hwInfo.workaroundTable.waDisableFusedThreadScheduling); - - if (DebugManager.flags.CFEFusedEUDispatch.get() != -1) { - mediaVfeState->setDisableSlice0Subslice2(DebugManager.flags.CFEFusedEUDispatch.get()); + auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily); + if (!hwHelper.isFusedEuDispatchEnabled(hwInfo)) { + mediaVfeState->setDisableSlice0Subslice2(true); } } -// Explicitly instantiate PreambleHelper for TGLLP device family +// Explicitly instantiate PreambleHelper for TGLLP device family template struct PreambleHelper; } // namespace NEO diff --git a/shared/source/helpers/CMakeLists.txt 
b/shared/source/helpers/CMakeLists.txt index f48d2c9b24..03f4d051d7 100644 --- a/shared/source/helpers/CMakeLists.txt +++ b/shared/source/helpers/CMakeLists.txt @@ -48,6 +48,7 @@ set(NEO_CORE_HELPERS ${CMAKE_CURRENT_SOURCE_DIR}/hw_helper_base.inl ${CMAKE_CURRENT_SOURCE_DIR}/hw_helper_bdw_plus.inl ${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}/hw_helper_extended.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/hw_helper_tgllp_plus.inl ${CMAKE_CURRENT_SOURCE_DIR}/hw_info.cpp ${CMAKE_CURRENT_SOURCE_DIR}/hw_info.h ${CMAKE_CURRENT_SOURCE_DIR}/interlocked_max.h diff --git a/shared/source/helpers/hw_helper.h b/shared/source/helpers/hw_helper.h index a44d0d48ff..ec4f736219 100644 --- a/shared/source/helpers/hw_helper.h +++ b/shared/source/helpers/hw_helper.h @@ -79,6 +79,7 @@ class HwHelper { virtual bool isForceEmuInt32DivRemSPWARequired(const HardwareInfo &hwInfo) = 0; virtual uint32_t getMinimalSIMDSize() = 0; virtual bool isOffsetToSkipSetFFIDGPWARequired(const HardwareInfo &hwInfo) const = 0; + virtual bool isFusedEuDispatchEnabled(const HardwareInfo &hwInfo) const = 0; static uint32_t getSubDevicesCount(const HardwareInfo *pHwInfo); static uint32_t getEnginesCount(const HardwareInfo &hwInfo); @@ -200,6 +201,8 @@ class HwHelperHw : public HwHelper { bool isOffsetToSkipSetFFIDGPWARequired(const HardwareInfo &hwInfo) const override; + bool isFusedEuDispatchEnabled(const HardwareInfo &hwInfo) const override; + static bool isForceDefaultRCSEngineWARequired(const HardwareInfo &hwInfo); bool isForceEmuInt32DivRemSPWARequired(const HardwareInfo &hwInfo) override; diff --git a/shared/source/helpers/hw_helper_base.inl b/shared/source/helpers/hw_helper_base.inl index ed4726e53d..2d811d670c 100644 --- a/shared/source/helpers/hw_helper_base.inl +++ b/shared/source/helpers/hw_helper_base.inl @@ -306,6 +306,11 @@ uint32_t HwHelperHw::getMaxThreadsForWorkgroup(const HardwareInfo &hw return HwHelper::getMaxThreadsForWorkgroup(hwInfo, maxNumEUsPerSubSlice); } +template +inline bool 
HwHelperHw::isFusedEuDispatchEnabled(const HardwareInfo &hwInfo) const { + return false; +} + template size_t MemorySynchronizationCommands::getSizeForFullCacheFlush() { return sizeof(typename GfxFamily::PIPE_CONTROL); diff --git a/shared/source/helpers/hw_helper_tgllp_plus.inl b/shared/source/helpers/hw_helper_tgllp_plus.inl new file mode 100644 index 0000000000..e2ce10bba8 --- /dev/null +++ b/shared/source/helpers/hw_helper_tgllp_plus.inl @@ -0,0 +1,19 @@ +/* + * Copyright (C) 2020 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +namespace NEO { + +template <> +inline bool HwHelperHw::isFusedEuDispatchEnabled(const HardwareInfo &hwInfo) const { + auto fusedEuDispatchEnabled = !hwInfo.workaroundTable.waDisableFusedThreadScheduling; + if (DebugManager.flags.CFEFusedEUDispatch.get() != -1) { + fusedEuDispatchEnabled = (DebugManager.flags.CFEFusedEUDispatch.get() == 0); + } + return fusedEuDispatchEnabled; +} + +} // namespace NEO