From 880464da7708d21bc4f5f06db1f77c32e85c3109 Mon Sep 17 00:00:00 2001 From: Lukasz Jobczyk Date: Tue, 5 Jul 2022 10:25:16 +0000 Subject: [PATCH] Apply additional synchronization WA to DG2 ULLS Signed-off-by: Lukasz Jobczyk --- .../direct_submission/direct_submission_hw.h | 2 ++ .../direct_submission_hw.inl | 7 +++--- .../linux/drm_direct_submission.h | 8 ++---- .../linux/drm_direct_submission.inl | 25 ++++++++++++++++--- shared/source/helpers/hw_helper.h | 2 ++ shared/source/helpers/hw_helper_base.inl | 10 ++++++++ shared/source/os_interface/hw_info_config.h | 2 ++ shared/source/os_interface/hw_info_config.inl | 5 ++++ .../dg2/os_agnostic_hw_info_config_dg2.inl | 5 ++++ .../xe_hpg_core/hw_helper_xe_hpg_core.cpp | 12 +++++++++ .../direct_submission_tests_2.cpp | 24 ++++++++++++------ .../linux/drm_direct_submission_tests.cpp | 13 ++++++++-- 12 files changed, 93 insertions(+), 22 deletions(-) diff --git a/shared/source/direct_submission/direct_submission_hw.h b/shared/source/direct_submission/direct_submission_hw.h index 3deff17059..848fb603b7 100644 --- a/shared/source/direct_submission/direct_submission_hw.h +++ b/shared/source/direct_submission/direct_submission_hw.h @@ -168,6 +168,7 @@ class DirectSubmissionHw { uint64_t semaphoreGpuVa = 0u; uint64_t gpuVaForMiFlush = 0u; + uint64_t gpuVaForAdditionalSynchronizationWA = 0u; OsContext &osContext; const uint32_t rootDeviceIndex; @@ -200,5 +201,6 @@ class DirectSubmissionHw { bool useNotifyForPostSync = false; bool miMemFenceRequired = false; bool systemMemoryFenceAddressSet = false; + bool completionFenceSupported = false; }; } // namespace NEO diff --git a/shared/source/direct_submission/direct_submission_hw.inl b/shared/source/direct_submission/direct_submission_hw.inl index 1d5c8adbeb..accdb51bad 100644 --- a/shared/source/direct_submission/direct_submission_hw.inl +++ b/shared/source/direct_submission/direct_submission_hw.inl @@ -52,7 +52,7 @@ DirectSubmissionHw::DirectSubmissionHw(const DirectSubmis disableCacheFlush = !!DebugManager.flags.DirectSubmissionDisableCacheFlush.get(); } - miMemFenceRequired = hwInfoConfig->isGlobalFenceInCommandStreamRequired(*hwInfo); + miMemFenceRequired = hwInfoConfig->isGlobalFenceInDirectSubmissionRequired(*hwInfo); if (DebugManager.flags.DirectSubmissionInsertExtraMiMemFenceCommands.get() == 0) { miMemFenceRequired = false; } @@ -287,7 +287,7 @@ inline void DirectSubmissionHw::dispatchSemaphoreSection( COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD); if (miMemFenceRequired) { - MemorySynchronizationCommands::addAdditionalSynchronization(ringCommandStream, 0, true, *hwInfo); + MemorySynchronizationCommands::addAdditionalSynchronizationForDirectSubmission(ringCommandStream, this->gpuVaForAdditionalSynchronizationWA, true, *hwInfo); } dispatchPrefetchMitigation(); @@ -301,7 +301,7 @@ inline size_t DirectSubmissionHw::getSizeSemaphoreSection semaphoreSize += 2 * getSizeDisablePrefetcher(); if (miMemFenceRequired) { - semaphoreSize += MemorySynchronizationCommands::getSizeForSingleAdditionalSynchronization(*hwInfo); + semaphoreSize += MemorySynchronizationCommands::getSizeForSingleAdditionalSynchronizationForDirectSubmission(*hwInfo); } return semaphoreSize; @@ -643,7 +643,6 @@ size_t DirectSubmissionHw::getDiagnosticModeSection() { template void DirectSubmissionHw::dispatchSystemMemoryFenceAddress() { - UNRECOVERABLE_IF(!this->globalFenceAllocation); EncodeMemoryFence::encodeSystemMemoryFence(ringCommandStream, this->globalFenceAllocation, this->logicalStateHelper); if (logicalStateHelper) { diff --git a/shared/source/direct_submission/linux/drm_direct_submission.h b/shared/source/direct_submission/linux/drm_direct_submission.h index fa00e28e99..9e865fb6cf 100644 --- a/shared/source/direct_submission/linux/drm_direct_submission.h +++ b/shared/source/direct_submission/linux/drm_direct_submission.h @@ -20,12 +20,7 @@ class DrmDirectSubmission : public DirectSubmissionHw { ~DrmDirectSubmission() override; - uint32_t *getCompletionValuePointer() override { - if (this->completionFenceAllocation) { - return &this->completionFenceValue; - } - return DirectSubmissionHw::getCompletionValuePointer(); - } + uint32_t *getCompletionValuePointer() override; protected: bool allocateOsResources() override; @@ -40,6 +35,7 @@ class DrmDirectSubmission : public DirectSubmissionHw { uint64_t updateTagValue() override; void getTagAddressValue(TagData &tagData) override; bool isCompleted(uint32_t ringBufferIndex) override; + bool isCompletionFenceSupported(); MOCKABLE_VIRTUAL void wait(uint32_t taskCountToWait); diff --git a/shared/source/direct_submission/linux/drm_direct_submission.inl b/shared/source/direct_submission/linux/drm_direct_submission.inl index 2c8ef7b5ab..fd462550a5 100644 --- a/shared/source/direct_submission/linux/drm_direct_submission.inl +++ b/shared/source/direct_submission/linux/drm_direct_submission.inl @@ -48,8 +48,14 @@ DrmDirectSubmission::DrmDirectSubmission(const DirectSubm UNRECOVERABLE_IF(this->workPartitionAllocation == nullptr); } - if (drm.completionFenceSupport()) { + if (this->miMemFenceRequired || drm.completionFenceSupport()) { this->completionFenceAllocation = inputParams.completionFenceAllocation; + if (this->completionFenceAllocation) { + this->gpuVaForAdditionalSynchronizationWA = this->completionFenceAllocation->getGpuAddress() + 8u; + if (drm.completionFenceSupport()) { + this->completionFenceSupported = true; + } + } } } @@ -59,7 +65,7 @@ inline DrmDirectSubmission::~DrmDirectSubmission() { this->stopRingBuffer(); this->wait(static_cast(this->currentTagData.tagValue)); } - if (this->completionFenceAllocation) { + if (this->isCompletionFenceSupported()) { auto osContextLinux = static_cast(&this->osContext); auto &drm = osContextLinux->getDrm(); auto completionFenceCpuAddress = reinterpret_cast(this->completionFenceAllocation->getUnderlyingBuffer()) + Drm::completionFenceOffset; @@ -68,6 +74,14 @@ inline DrmDirectSubmission::~DrmDirectSubmission() { this->deallocateResources(); } +template +uint32_t *DrmDirectSubmission::getCompletionValuePointer() { + if (this->isCompletionFenceSupported()) { + return &this->completionFenceValue; + } + return DirectSubmissionHw::getCompletionValuePointer(); +} + template bool DrmDirectSubmission::allocateOsResources() { this->currentTagData.tagAddress = this->semaphoreGpuVa + offsetof(RingSemaphoreData, tagAllocation); @@ -97,7 +111,7 @@ bool DrmDirectSubmission::submit(uint64_t gpuAddress, siz uint32_t completionValue = 0u; uint64_t completionFenceGpuAddress = 0u; - if (this->completionFenceAllocation) { + if (this->isCompletionFenceSupported()) { completionValue = ++completionFenceValue; completionFenceGpuAddress = this->completionFenceAllocation->getGpuAddress() + Drm::completionFenceOffset; } @@ -223,6 +237,11 @@ inline bool DrmDirectSubmission::isCompleted(uint32_t rin return true; } +template +bool DrmDirectSubmission::isCompletionFenceSupported() { + return this->completionFenceSupported; +} + template void DrmDirectSubmission::wait(uint32_t taskCountToWait) { auto pollAddress = this->tagAddress; diff --git a/shared/source/helpers/hw_helper.h b/shared/source/helpers/hw_helper.h index 4bcadce334..7c6b155c4f 100644 --- a/shared/source/helpers/hw_helper.h +++ b/shared/source/helpers/hw_helper.h @@ -462,6 +462,7 @@ struct MemorySynchronizationCommands { static void addPipeControlWA(LinearStream &commandStream, uint64_t gpuAddress, const HardwareInfo &hwInfo); static void setPipeControlWA(void *&commandsBuffer, uint64_t gpuAddress, const HardwareInfo &hwInfo); + static void addAdditionalSynchronizationForDirectSubmission(LinearStream &commandStream, uint64_t gpuAddress, bool acquire, const HardwareInfo &hwInfo); static void addAdditionalSynchronization(LinearStream &commandStream, uint64_t gpuAddress, bool acquire, const HardwareInfo &hwInfo); static void setAdditionalSynchronization(void *&commandsBuffer, uint64_t gpuAddress, bool acquire, const HardwareInfo &hwInfo); @@ -478,6 +479,7 @@ struct MemorySynchronizationCommands { static size_t getSizeForPipeControlWithPostSyncOperation(const HardwareInfo &hwInfo); static size_t getSizeForPipeControlWA(const HardwareInfo &hwInfo); static size_t getSizeForSinglePipeControl(); + static size_t getSizeForSingleAdditionalSynchronizationForDirectSubmission(const HardwareInfo &hwInfo); static size_t getSizeForSingleAdditionalSynchronization(const HardwareInfo &hwInfo); static size_t getSizeForAdditonalSynchronization(const HardwareInfo &hwInfo); static size_t getSizeForFullCacheFlush(); diff --git a/shared/source/helpers/hw_helper_base.inl b/shared/source/helpers/hw_helper_base.inl index 012421edcb..8f883f248d 100644 --- a/shared/source/helpers/hw_helper_base.inl +++ b/shared/source/helpers/hw_helper_base.inl @@ -306,6 +306,11 @@ void MemorySynchronizationCommands::addAdditionalSynchronization(Line setAdditionalSynchronization(commandBuffer, gpuAddress, acquire, hwInfo); } +template +void MemorySynchronizationCommands::addAdditionalSynchronizationForDirectSubmission(LinearStream &commandStream, uint64_t gpuAddress, bool acquire, const HardwareInfo &hwInfo) { + MemorySynchronizationCommands::addAdditionalSynchronization(commandStream, gpuAddress, acquire, hwInfo); +} + template void MemorySynchronizationCommands::setPipeControl(typename GfxFamily::PIPE_CONTROL &pipeControl, PipeControlArgs &args) { pipeControl.setCommandStreamerStallEnable(true); @@ -410,6 +415,11 @@ inline size_t MemorySynchronizationCommands::getSizeForSingleAddition return 0u; } +template +inline size_t MemorySynchronizationCommands::getSizeForSingleAdditionalSynchronizationForDirectSubmission(const HardwareInfo &hwInfo) { + return MemorySynchronizationCommands::getSizeForSingleAdditionalSynchronization(hwInfo); +} + template inline size_t MemorySynchronizationCommands::getSizeForAdditonalSynchronization(const HardwareInfo &hwInfo) { return 0u; diff --git a/shared/source/os_interface/hw_info_config.h b/shared/source/os_interface/hw_info_config.h index cccd7f93ba..77a1760046 100644 --- a/shared/source/os_interface/hw_info_config.h +++ b/shared/source/os_interface/hw_info_config.h @@ -101,6 +101,7 @@ class HwInfoConfig { virtual bool isSpecialPipelineSelectModeChanged(const HardwareInfo &hwInfo) const = 0; virtual bool isSystolicModeConfigurable(const HardwareInfo &hwInfo) const = 0; virtual bool isGlobalFenceInCommandStreamRequired(const HardwareInfo &hwInfo) const = 0; + virtual bool isGlobalFenceInDirectSubmissionRequired(const HardwareInfo &hwInfo) const = 0; virtual bool isComputeDispatchAllWalkerEnableInComputeWalkerRequired(const HardwareInfo &hwInfo) const = 0; virtual bool isAdjustProgrammableIdPreferredSlmSizeRequired(const HardwareInfo &hwInfo) const = 0; virtual uint32_t getThreadEuRatioForScratch(const HardwareInfo &hwInfo) const = 0; @@ -202,6 +203,7 @@ class HwInfoConfigHw : public HwInfoConfig { bool isSystolicModeConfigurable(const HardwareInfo &hwInfo) const override; bool isComputeDispatchAllWalkerEnableInComputeWalkerRequired(const HardwareInfo &hwInfo) const override; bool isGlobalFenceInCommandStreamRequired(const HardwareInfo &hwInfo) const override; + bool isGlobalFenceInDirectSubmissionRequired(const HardwareInfo &hwInfo) const override; bool isAdjustProgrammableIdPreferredSlmSizeRequired(const HardwareInfo &hwInfo) const override; uint32_t getThreadEuRatioForScratch(const HardwareInfo &hwInfo) const override; bool isComputeDispatchAllWalkerEnableInCfeStateRequired(const HardwareInfo &hwInfo) const override; diff --git a/shared/source/os_interface/hw_info_config.inl b/shared/source/os_interface/hw_info_config.inl index 6f43ff287e..e140718ad8 100644 --- a/shared/source/os_interface/hw_info_config.inl +++ b/shared/source/os_interface/hw_info_config.inl @@ -343,6 +343,11 @@ bool HwInfoConfigHw::isGlobalFenceInCommandStreamRequired(const Hard return false; } +template +bool HwInfoConfigHw::isGlobalFenceInDirectSubmissionRequired(const HardwareInfo &hwInfo) const { + return HwInfoConfigHw::isGlobalFenceInCommandStreamRequired(hwInfo); +}; + template bool HwInfoConfigHw::isAdjustProgrammableIdPreferredSlmSizeRequired(const HardwareInfo &hwInfo) const { return false; diff --git a/shared/source/xe_hpg_core/dg2/os_agnostic_hw_info_config_dg2.inl b/shared/source/xe_hpg_core/dg2/os_agnostic_hw_info_config_dg2.inl index f2110e2574..6f3428a6c7 100644 --- a/shared/source/xe_hpg_core/dg2/os_agnostic_hw_info_config_dg2.inl +++ b/shared/source/xe_hpg_core/dg2/os_agnostic_hw_info_config_dg2.inl @@ -56,6 +56,11 @@ uint32_t HwInfoConfigHw::getSteppingFromHwRevId(const HardwareInfo & } } +template <> +bool HwInfoConfigHw::isGlobalFenceInDirectSubmissionRequired(const HardwareInfo &hwInfo) const { + return true; +} + template <> bool HwInfoConfigHw::isDirectSubmissionSupported(const HardwareInfo &hwInfo) const { return true; diff --git a/shared/source/xe_hpg_core/hw_helper_xe_hpg_core.cpp b/shared/source/xe_hpg_core/hw_helper_xe_hpg_core.cpp index b0bd9c98f5..2f14af4dd1 100644 --- a/shared/source/xe_hpg_core/hw_helper_xe_hpg_core.cpp +++ b/shared/source/xe_hpg_core/hw_helper_xe_hpg_core.cpp @@ -78,6 +78,18 @@ bool HwHelperHw::isBankOverrideRequired(const HardwareInfo &hwInfo) cons return forceOverrideMemoryBankIndex; } +template <> +size_t MemorySynchronizationCommands::getSizeForSingleAdditionalSynchronizationForDirectSubmission(const HardwareInfo &hwInfo) { + return EncodeSempahore::getSizeMiSemaphoreWait(); +} + +template <> +void MemorySynchronizationCommands::addAdditionalSynchronizationForDirectSubmission(LinearStream &commandStream, uint64_t gpuAddress, bool acquire, const HardwareInfo &hwInfo) { + using MI_SEMAPHORE_WAIT = typename Family::MI_SEMAPHORE_WAIT; + + EncodeSempahore::addMiSemaphoreWaitCommand(commandStream, gpuAddress, EncodeSempahore::invalidHardwareTag, MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD); +} + template <> const StackVec HwHelperHw::getThreadsPerEUConfigs() const { return {4, 8}; diff --git a/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp b/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp index 62bdcf9b7a..13c8154d32 100644 --- a/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp +++ b/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp @@ -38,25 +38,24 @@ struct DirectSubmissionDispatchMiMemFenceTest : public DirectSubmissionDispatchB DirectSubmissionDispatchBufferTest::SetUp(); auto hwInfoConfig = HwInfoConfig::get(pDevice->getHardwareInfo().platform.eProductFamily); - miMemFenceSupported = hwInfoConfig->isGlobalFenceInCommandStreamRequired(pDevice->getHardwareInfo()); + miMemFenceSupported = hwInfoConfig->isGlobalFenceInDirectSubmissionRequired(pDevice->getHardwareInfo()); } template void validateFenceProgramming(MockDirectSubmissionHw> &directSubmission, uint32_t expectedFenceCount, uint32_t expectedSysMemFenceCount) { + int32_t id = 0; int32_t systemMemoryFenceId = -1; uint32_t fenceCount = 0; uint32_t sysMemFenceCount = 0; + HardwareParse hwParse; + hwParse.parseCommands(directSubmission.ringCommandStream, 0); + hwParse.findHardwareCommands(); + if constexpr (FamilyType::isUsingMiMemFence) { using STATE_SYSTEM_MEM_FENCE_ADDRESS = typename FamilyType::STATE_SYSTEM_MEM_FENCE_ADDRESS; using MI_MEM_FENCE = typename FamilyType::MI_MEM_FENCE; - HardwareParse hwParse; - hwParse.parseCommands(directSubmission.ringCommandStream, 0); - hwParse.findHardwareCommands(); - - int32_t id = 0; - for (auto &it : hwParse.cmdList) { if (auto sysFenceAddress = genCmdCast(it)) { EXPECT_EQ(-1, systemMemoryFenceId); @@ -74,6 +73,17 @@ struct DirectSubmissionDispatchMiMemFenceTest : public DirectSubmissionDispatchB id++; } + } else if (miMemFenceSupported) { + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + expectedSysMemFenceCount = 0u; + for (auto &it : hwParse.cmdList) { + if (auto sysFenceAddress = genCmdCast(it)) { + fenceCount++; + } + + id++; + } + fenceCount /= 2; } if (miMemFenceSupported) { diff --git a/shared/test/unit_test/direct_submission/linux/drm_direct_submission_tests.cpp b/shared/test/unit_test/direct_submission/linux/drm_direct_submission_tests.cpp index d0b9372ce7..687bd5de2d 100644 --- a/shared/test/unit_test/direct_submission/linux/drm_direct_submission_tests.cpp +++ b/shared/test/unit_test/direct_submission/linux/drm_direct_submission_tests.cpp @@ -72,6 +72,7 @@ struct MockDrmDirectSubmission : public DrmDirectSubmissioncompletionFenceSupport()); { MockDrmDirectSubmission> directSubmission(commandStreamReceiver); - EXPECT_EQ(nullptr, directSubmission.completionFenceAllocation); + EXPECT_EQ(directSubmission.miMemFenceRequired, directSubmission.completionFenceAllocation != nullptr); } { MockDrmDirectSubmission> directSubmission(commandStreamReceiver); - EXPECT_EQ(nullptr, directSubmission.completionFenceAllocation); + EXPECT_EQ(directSubmission.miMemFenceRequired, directSubmission.completionFenceAllocation != nullptr); } } @@ -309,6 +310,8 @@ HWTEST_F(DrmDirectSubmissionTest, givenCompletionFenceSupportAndFenceIsNotComple HWTEST_F(DrmDirectSubmissionTest, givenNoCompletionFenceSupportWhenSubmittingThenNoCompletionAddressIsPassedToExec) { uint64_t gpuAddress = 0x1000; size_t size = 0x1000; + DebugManagerStateRestore restorer; + DebugManager.flags.EnableDrmCompletionFence.set(0); MockDrmDirectSubmission> drmDirectSubmission(*device->getDefaultEngine().commandStreamReceiver); drmDirectSubmission.completionFenceAllocation = nullptr; @@ -334,6 +337,8 @@ HWTEST_F(DrmDirectSubmissionTest, givenNoCompletionFenceSupportWhenSubmittingThe HWTEST_F(DrmDirectSubmissionTest, givenTile0AndCompletionFenceSupportWhenSubmittingThenCompletionAddressAndValueArePassedToExec) { uint64_t gpuAddress = 0x1000; size_t size = 0x1000; + DebugManagerStateRestore restorer; + DebugManager.flags.EnableDrmCompletionFence.set(1); auto &commandStreamReceiver = *device->getDefaultEngine().commandStreamReceiver; auto drm = executionEnvironment.rootDeviceEnvironments[0]->osInterface->getDriverModel()->as(); @@ -371,6 +376,8 @@ HWTEST_F(DrmDirectSubmissionTest, givenTile0AndCompletionFenceSupportWhenSubmitt HWTEST_F(DrmDirectSubmissionTest, givenTile1AndCompletionFenceSupportWhenSubmittingThenCompletionAddressAndValueArePassedToExec) { uint64_t gpuAddress = 0x1000; size_t size = 0x1000; + DebugManagerStateRestore restorer; + DebugManager.flags.EnableDrmCompletionFence.set(1); auto &commandStreamReceiver = *device->getDefaultEngine().commandStreamReceiver; auto drm = executionEnvironment.rootDeviceEnvironments[0]->osInterface->getDriverModel()->as(); @@ -408,6 +415,8 @@ HWTEST_F(DrmDirectSubmissionTest, givenTile1AndCompletionFenceSupportWhenSubmitt HWTEST_F(DrmDirectSubmissionTest, givenTwoTilesAndCompletionFenceSupportWhenSubmittingThenCompletionAddressAndValueArePassedToExec) { uint64_t gpuAddress = 0x1000; size_t size = 0x1000; + DebugManagerStateRestore restorer; + DebugManager.flags.EnableDrmCompletionFence.set(1); auto &commandStreamReceiver = device->getUltCommandStreamReceiver(); auto drm = executionEnvironment.rootDeviceEnvironments[0]->osInterface->getDriverModel()->as();