From 08e38539820961b0c8d4a040577dc95d12c3d168 Mon Sep 17 00:00:00 2001 From: Bartosz Dunajski Date: Wed, 30 Mar 2022 14:11:26 +0000 Subject: [PATCH] Debug flag to add extra MI_MEM_FENCE for DirectSubmission Signed-off-by: Bartosz Dunajski --- level_zero/core/source/cmdlist/cmdlist_hw.inl | 4 +--- .../test/unit_test/test_files/igdrcl.config | 1 + .../hw_helper_tests_xe_hpc_core.cpp | 2 +- .../command_stream_receiver_hw_base.inl | 4 ++-- .../debug_settings/debug_variables_base.inl | 1 + .../direct_submission_hw.inl | 16 +++++++++++++ ...direct_submission_xe_hp_core_and_later.inl | 24 +++++++++++++++++-- shared/source/gen11/hw_cmds_base.h | 2 ++ shared/source/gen12lp/hw_cmds_base.h | 1 + shared/source/gen8/hw_cmds_base.h | 2 ++ shared/source/gen9/hw_cmds_base.h | 2 ++ shared/source/helpers/hw_helper.h | 4 ++-- shared/source/helpers/hw_helper_base.inl | 10 ++++---- shared/source/xe_hp_core/hw_cmds_base.h | 1 + shared/source/xe_hpc_core/hw_cmds_base.h | 1 + .../xe_hpc_core/hw_helper_xe_hpc_core.cpp | 8 +++++-- shared/source/xe_hpg_core/hw_cmds_base.h | 1 + 17 files changed, 67 insertions(+), 17 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index 05003d040f..2e839098a4 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -2006,9 +2006,7 @@ void CommandListCoreFamily::appendEventForProfiling(ze_event_hand NEO::MemorySynchronizationCommands::addPipeControl(*commandContainer.getCommandStream(), args); uint64_t baseAddr = event->getGpuAddress(this->device); - NEO::MemorySynchronizationCommands::addAdditionalSynchronization(*commandContainer.getCommandStream(), - baseAddr, - hwInfo); + NEO::MemorySynchronizationCommands::addAdditionalSynchronization(*commandContainer.getCommandStream(), baseAddr, false, hwInfo); appendWriteKernelTimestamp(hEvent, beforeWalker, true); } } diff --git a/opencl/test/unit_test/test_files/igdrcl.config 
b/opencl/test/unit_test/test_files/igdrcl.config index c38892686a..c23f517c58 100644 --- a/opencl/test/unit_test/test_files/igdrcl.config +++ b/opencl/test/unit_test/test_files/igdrcl.config @@ -409,3 +409,4 @@ DirectSubmissionReadBackCommandBuffer = -1 DirectSubmissionReadBackRingBuffer = -1 ReadBackCommandBufferAllocation = -1 PrintImageBlitBlockCopyCmdDetails = 0 +DirectSubmissionInsertExtraMiMemFenceCommands = -1 diff --git a/opencl/test/unit_test/xe_hpc_core/hw_helper_tests_xe_hpc_core.cpp b/opencl/test/unit_test/xe_hpc_core/hw_helper_tests_xe_hpc_core.cpp index 7da7e91290..a694f90a7e 100644 --- a/opencl/test/unit_test/xe_hpc_core/hw_helper_tests_xe_hpc_core.cpp +++ b/opencl/test/unit_test/xe_hpc_core/hw_helper_tests_xe_hpc_core.cpp @@ -684,7 +684,7 @@ XE_HPC_CORETEST_F(HwHelperTestsXeHpcCore, givenMemorySynchronizationCommandsWhen LinearStream commandStream(buffer, 128); auto synchronizationSize = MemorySynchronizationCommands::getSizeForSingleAdditionalSynchronization(hardwareInfo); - MemorySynchronizationCommands::addAdditionalSynchronization(commandStream, gpuAddress, hardwareInfo); + MemorySynchronizationCommands::addAdditionalSynchronization(commandStream, gpuAddress, false, hardwareInfo); HardwareParse hwParser; hwParser.parseCommands(commandStream); diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index 0f2538d36a..db09b52a50 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -1069,14 +1069,14 @@ uint32_t CommandStreamReceiverHw::flushBcsTask(const BlitPropertiesCo auto updateTag = !isUpdateTagFromWaitEnabled(); updateTag |= blocking; if (updateTag) { - MemorySynchronizationCommands::addAdditionalSynchronization(commandStream, tagAllocation->getGpuAddress(), peekHwInfo()); + MemorySynchronizationCommands::addAdditionalSynchronization(commandStream, 
tagAllocation->getGpuAddress(), false, peekHwInfo()); MiFlushArgs args; args.commandWithPostSync = true; args.notifyEnable = isUsedNotifyEnableForPostSync(); EncodeMiFlushDW::programMiFlushDw(commandStream, tagAllocation->getGpuAddress(), newTaskCount, args, hwInfo); - MemorySynchronizationCommands::addAdditionalSynchronization(commandStream, tagAllocation->getGpuAddress(), peekHwInfo()); + MemorySynchronizationCommands::addAdditionalSynchronization(commandStream, tagAllocation->getGpuAddress(), false, peekHwInfo()); } if (PauseOnGpuProperties::pauseModeAllowed(DebugManager.flags.PauseOnBlitCopy.get(), taskCount, PauseOnGpuProperties::PauseMode::AfterWorkload)) { diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index 41978d9fd0..9d888ed97c 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -289,6 +289,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionForceLocalMemoryStorageMode, -1, DECLARE_DEBUG_VARIABLE(int32_t, EnableRingSwitchTagUpdateWa, -1, "-1: default, 0 - disable, 1 - enable. If enabled, completionRingBuffers wont be updated if ring is not runnning.") DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionReadBackCommandBuffer, -1, "-1: default - disabled, 0 - disable, 1 - enable. If enabled, read first dword of cmd buffer after handling residency.") DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionReadBackRingBuffer, -1, "-1: default - disabled, 0 - disable, 1 - enable. If enabled, read first dword of ring buffer after handling residency.") +DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionInsertExtraMiMemFenceCommands, -1, "-1: default, 0 - disable, 1 - enable. 
If enabled, add extra MI_MEM_FENCE instructions with acquire bit set") /* IMPLICIT SCALING */ DECLARE_DEBUG_VARIABLE(int32_t, EnableWalkerPartition, -1, "-1: default, 0: disable, 1: enable, Enables Walker Partitioning via WPARID.") diff --git a/shared/source/direct_submission/direct_submission_hw.inl b/shared/source/direct_submission/direct_submission_hw.inl index 71d46bda1c..4e3460ce84 100644 --- a/shared/source/direct_submission/direct_submission_hw.inl +++ b/shared/source/direct_submission/direct_submission_hw.inl @@ -246,11 +246,20 @@ template inline void DirectSubmissionHw::dispatchSemaphoreSection(uint32_t value) { using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT; using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION; + dispatchDisablePrefetcher(true); EncodeSempahore::addMiSemaphoreWaitCommand(ringCommandStream, semaphoreGpuVa, value, COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD); + + if constexpr (GfxFamily::isUsingMiMemFence) { + if (DebugManager.flags.DirectSubmissionInsertExtraMiMemFenceCommands.get() == 1) { + + MemorySynchronizationCommands::addAdditionalSynchronization(ringCommandStream, 0, true, this->device.getHardwareInfo()); + } + } + dispatchPrefetchMitigation(); dispatchDisablePrefetcher(false); } @@ -260,6 +269,13 @@ inline size_t DirectSubmissionHw::getSizeSemaphoreSection size_t semaphoreSize = EncodeSempahore::getSizeMiSemaphoreWait(); semaphoreSize += getSizePrefetchMitigation(); semaphoreSize += 2 * getSizeDisablePrefetcher(); + + if constexpr (GfxFamily::isUsingMiMemFence) { + if (DebugManager.flags.DirectSubmissionInsertExtraMiMemFenceCommands.get() == 1) { + semaphoreSize += MemorySynchronizationCommands::getSizeForSingleAdditionalSynchronization(this->device.getHardwareInfo()); + } + } + return semaphoreSize; } diff --git a/shared/source/direct_submission/direct_submission_xe_hp_core_and_later.inl 
b/shared/source/direct_submission/direct_submission_xe_hp_core_and_later.inl index be74f94185..f637b6f5bb 100644 --- a/shared/source/direct_submission/direct_submission_xe_hp_core_and_later.inl +++ b/shared/source/direct_submission/direct_submission_xe_hp_core_and_later.inl @@ -1,11 +1,13 @@ /* - * Copyright (C) 2021 Intel Corporation + * Copyright (C) 2021-2022 Intel Corporation * * SPDX-License-Identifier: MIT * */ #include "shared/source/command_container/implicit_scaling.h" +#include "shared/source/command_container/memory_fence_encoder.h" +#include "shared/source/command_stream/command_stream_receiver.h" #include "shared/source/direct_submission/direct_submission_hw.h" namespace NEO { @@ -15,11 +17,29 @@ inline void DirectSubmissionHw::dispatchPartitionRegister ImplicitScalingDispatch::dispatchRegisterConfiguration(ringCommandStream, this->workPartitionAllocation->getGpuAddress(), this->postSyncOffset); + + if constexpr (GfxFamily::isUsingMiMemFence) { + if (DebugManager.flags.DirectSubmissionInsertExtraMiMemFenceCommands.get() == 1) { + auto &engineControl = device.getEngine(this->osContext.getEngineType(), this->osContext.getEngineUsage()); + + UNRECOVERABLE_IF(engineControl.osContext->getContextId() != this->osContext.getContextId()); /* abort if the engine looked up by type/usage is not the one backing this submission's own OS context */ + + EncodeMemoryFence::encodeSystemMemoryFence(ringCommandStream, engineControl.commandStreamReceiver->getGlobalFenceAllocation()); + } + } } template inline size_t DirectSubmissionHw::getSizePartitionRegisterConfigurationSection() { - return ImplicitScalingDispatch::getRegisterConfigurationSize(); + auto size = ImplicitScalingDispatch::getRegisterConfigurationSize(); + + if constexpr (GfxFamily::isUsingMiMemFence) { + if (DebugManager.flags.DirectSubmissionInsertExtraMiMemFenceCommands.get() == 1) { + size += EncodeMemoryFence::getSystemMemoryFenceSize(); + } + } + + return size; } template diff --git a/shared/source/gen11/hw_cmds_base.h b/shared/source/gen11/hw_cmds_base.h index 8b51022828..42431028f8 100644 --- 
a/shared/source/gen11/hw_cmds_base.h +++ b/shared/source/gen11/hw_cmds_base.h @@ -22,6 +22,8 @@ struct GEN11 { static constexpr bool supportsSampler = true; static constexpr bool isUsingGenericMediaStateClear = true; + static constexpr bool isUsingMiMemFence = false; + struct DataPortBindlessSurfaceExtendedMessageDescriptor { union { struct { diff --git a/shared/source/gen12lp/hw_cmds_base.h b/shared/source/gen12lp/hw_cmds_base.h index 71ef06a49d..320320c494 100644 --- a/shared/source/gen12lp/hw_cmds_base.h +++ b/shared/source/gen12lp/hw_cmds_base.h @@ -25,6 +25,7 @@ struct GEN12LP { static constexpr bool supportsSampler = true; static constexpr bool isUsingGenericMediaStateClear = true; static constexpr uint32_t stateComputeModeForceNonCoherentMask = (0b11u << 3); + static constexpr bool isUsingMiMemFence = false; struct DataPortBindlessSurfaceExtendedMessageDescriptor { union { diff --git a/shared/source/gen8/hw_cmds_base.h b/shared/source/gen8/hw_cmds_base.h index d261eaa30e..0ef8641bba 100644 --- a/shared/source/gen8/hw_cmds_base.h +++ b/shared/source/gen8/hw_cmds_base.h @@ -24,6 +24,8 @@ struct GEN8 { static constexpr bool supportsSampler = true; static constexpr bool isUsingGenericMediaStateClear = true; + static constexpr bool isUsingMiMemFence = false; + struct DataPortBindlessSurfaceExtendedMessageDescriptor { union { struct { diff --git a/shared/source/gen9/hw_cmds_base.h b/shared/source/gen9/hw_cmds_base.h index 54a1d6be64..d65f5eecb4 100644 --- a/shared/source/gen9/hw_cmds_base.h +++ b/shared/source/gen9/hw_cmds_base.h @@ -24,6 +24,8 @@ struct GEN9 { static constexpr bool supportsSampler = true; static constexpr bool isUsingGenericMediaStateClear = true; + static constexpr bool isUsingMiMemFence = false; + struct DataPortBindlessSurfaceExtendedMessageDescriptor { union { struct { diff --git a/shared/source/helpers/hw_helper.h b/shared/source/helpers/hw_helper.h index d3fb4f860b..ea0942a8a7 100644 --- a/shared/source/helpers/hw_helper.h +++ 
b/shared/source/helpers/hw_helper.h @@ -460,8 +460,8 @@ struct MemorySynchronizationCommands { static void addPipeControlWA(LinearStream &commandStream, uint64_t gpuAddress, const HardwareInfo &hwInfo); static void setPipeControlWA(void *&commandsBuffer, uint64_t gpuAddress, const HardwareInfo &hwInfo); - static void addAdditionalSynchronization(LinearStream &commandStream, uint64_t gpuAddress, const HardwareInfo &hwInfo); - static void setAdditionalSynchronization(void *&commandsBuffer, uint64_t gpuAddress, const HardwareInfo &hwInfo); + static void addAdditionalSynchronization(LinearStream &commandStream, uint64_t gpuAddress, bool acquire, const HardwareInfo &hwInfo); + static void setAdditionalSynchronization(void *&commandsBuffer, uint64_t gpuAddress, bool acquire, const HardwareInfo &hwInfo); static void addPipeControl(LinearStream &commandStream, PipeControlArgs &args); static void setPipeControl(PIPE_CONTROL &pipeControl, PipeControlArgs &args); diff --git a/shared/source/helpers/hw_helper_base.inl b/shared/source/helpers/hw_helper_base.inl index edb579f2ba..dedfe08e80 100644 --- a/shared/source/helpers/hw_helper_base.inl +++ b/shared/source/helpers/hw_helper_base.inl @@ -244,7 +244,7 @@ void MemorySynchronizationCommands::setPipeControlAndProgramPostSyncO setPostSyncExtraProperties(args, hwInfo); MemorySynchronizationCommands::setPipeControlWithPostSync(commandsBuffer, operation, gpuAddress, immediateData, args); - MemorySynchronizationCommands::setAdditionalSynchronization(commandsBuffer, gpuAddress, hwInfo); + MemorySynchronizationCommands::setAdditionalSynchronization(commandsBuffer, gpuAddress, false, hwInfo); } template @@ -292,15 +292,15 @@ void MemorySynchronizationCommands::setPipeControlWA(void *&commandsB *reinterpret_cast(commandsBuffer) = cmd; commandsBuffer = ptrOffset(commandsBuffer, sizeof(PIPE_CONTROL)); - MemorySynchronizationCommands::setAdditionalSynchronization(commandsBuffer, gpuAddress, hwInfo); + 
MemorySynchronizationCommands::setAdditionalSynchronization(commandsBuffer, gpuAddress, false, hwInfo); } } template -void MemorySynchronizationCommands::addAdditionalSynchronization(LinearStream &commandStream, uint64_t gpuAddress, const HardwareInfo &hwInfo) { +void MemorySynchronizationCommands::addAdditionalSynchronization(LinearStream &commandStream, uint64_t gpuAddress, bool acquire, const HardwareInfo &hwInfo) { size_t requiredSize = MemorySynchronizationCommands::getSizeForSingleAdditionalSynchronization(hwInfo); void *commandBuffer = commandStream.getSpace(requiredSize); - setAdditionalSynchronization(commandBuffer, gpuAddress, hwInfo); + setAdditionalSynchronization(commandBuffer, gpuAddress, acquire, hwInfo); } template @@ -396,7 +396,7 @@ size_t MemorySynchronizationCommands::getSizeForPipeControlWA(const H } template -void MemorySynchronizationCommands::setAdditionalSynchronization(void *&commandsBuffer, uint64_t gpuAddress, const HardwareInfo &hwInfo) { +void MemorySynchronizationCommands::setAdditionalSynchronization(void *&commandsBuffer, uint64_t gpuAddress, bool acquire, const HardwareInfo &hwInfo) { } template diff --git a/shared/source/xe_hp_core/hw_cmds_base.h b/shared/source/xe_hp_core/hw_cmds_base.h index b1df4ed3be..fc421ba569 100644 --- a/shared/source/xe_hp_core/hw_cmds_base.h +++ b/shared/source/xe_hp_core/hw_cmds_base.h @@ -31,6 +31,7 @@ struct XeHpCore { static constexpr bool isUsingMediaSamplerDopClockGate = true; static constexpr bool supportsSampler = true; static constexpr bool isUsingGenericMediaStateClear = true; + static constexpr bool isUsingMiMemFence = false; struct DataPortBindlessSurfaceExtendedMessageDescriptor { union { diff --git a/shared/source/xe_hpc_core/hw_cmds_base.h b/shared/source/xe_hpc_core/hw_cmds_base.h index 0efb2e187c..d8b8db884e 100644 --- a/shared/source/xe_hpc_core/hw_cmds_base.h +++ b/shared/source/xe_hpc_core/hw_cmds_base.h @@ -34,6 +34,7 @@ struct XE_HPC_CORE { static constexpr bool 
isUsingMediaSamplerDopClockGate = false; static constexpr bool supportsSampler = false; static constexpr bool isUsingGenericMediaStateClear = true; + static constexpr bool isUsingMiMemFence = true; struct DataPortBindlessSurfaceExtendedMessageDescriptor { union { diff --git a/shared/source/xe_hpc_core/hw_helper_xe_hpc_core.cpp b/shared/source/xe_hpc_core/hw_helper_xe_hpc_core.cpp index 0dec00ca00..29a83c1e05 100644 --- a/shared/source/xe_hpc_core/hw_helper_xe_hpc_core.cpp +++ b/shared/source/xe_hpc_core/hw_helper_xe_hpc_core.cpp @@ -186,7 +186,7 @@ size_t MemorySynchronizationCommands::getSizeForSingleAdditionalSynchron } template <> -void MemorySynchronizationCommands::setAdditionalSynchronization(void *&commandsBuffer, uint64_t gpuAddress, const HardwareInfo &hwInfo) { +void MemorySynchronizationCommands::setAdditionalSynchronization(void *&commandsBuffer, uint64_t gpuAddress, bool acquire, const HardwareInfo &hwInfo) { using MI_MEM_FENCE = typename Family::MI_MEM_FENCE; using MI_SEMAPHORE_WAIT = typename Family::MI_SEMAPHORE_WAIT; @@ -197,7 +197,11 @@ void MemorySynchronizationCommands::setAdditionalSynchronization(void *& } if (programGlobalFenceAsMiMemFenceCommandInCommandStream) { MI_MEM_FENCE miMemFence = Family::cmdInitMemFence; - miMemFence.setFenceType(Family::MI_MEM_FENCE::FENCE_TYPE::FENCE_TYPE_RELEASE); + if (acquire) { + miMemFence.setFenceType(Family::MI_MEM_FENCE::FENCE_TYPE::FENCE_TYPE_ACQUIRE); + } else { + miMemFence.setFenceType(Family::MI_MEM_FENCE::FENCE_TYPE::FENCE_TYPE_RELEASE); + } *reinterpret_cast(commandsBuffer) = miMemFence; commandsBuffer = ptrOffset(commandsBuffer, sizeof(MI_MEM_FENCE)); } else { diff --git a/shared/source/xe_hpg_core/hw_cmds_base.h b/shared/source/xe_hpg_core/hw_cmds_base.h index 950c115804..4168dd3c45 100644 --- a/shared/source/xe_hpg_core/hw_cmds_base.h +++ b/shared/source/xe_hpg_core/hw_cmds_base.h @@ -31,6 +31,7 @@ struct XE_HPG_CORE { static constexpr bool isUsingMediaSamplerDopClockGate = false; static 
constexpr bool supportsSampler = true; static constexpr bool isUsingGenericMediaStateClear = true; + static constexpr bool isUsingMiMemFence = false; struct DataPortBindlessSurfaceExtendedMessageDescriptor { union {