diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index 44956dcb73..953baf98f4 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -344,6 +344,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionControllerMaxTimeout, -1, "Set d DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionControllerDivisor, -1, "Set direct submission controller timeout divider, -1: default 1, >0: divider value") DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionForceLocalMemoryStorageMode, -1, "Force local memory storage for command/ring/semaphore buffer, -1: default - for all engines, 0: disabled, 1: for multiOsContextCapable engine, 2: for all engines") DECLARE_DEBUG_VARIABLE(int32_t, EnableRingSwitchTagUpdateWa, -1, "-1: default, 0 - disable, 1 - enable. If enabled, completionFences wont be updated if ring is not running.") +DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionPCIBarrier, -1, "Use PCI barrier for data synchronization before semaphore unblock -1: default, 0 - disable, 1 - enable.") DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionInsertExtraMiMemFenceCommands, -1, "-1: default, 0 - disable, 1 - enable. 
If enabled, add extra MI_MEM_FENCE instructions with acquire bit set") DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionInsertSfenceInstructionPriorToSubmission, -1, "-1: default, 0 - disable, 1 - Insert _mm_sfence before unlocking semaphore only, 2 - insert before and after semaphore") DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionMaxRingBuffers, -1, "-1: default, >0: max ring buffer count, During switch ring buffer, if there is no available ring, wait for completion instead of allocating new one if DirectSubmissionMaxRingBuffers is reached") diff --git a/shared/source/direct_submission/direct_submission_hw.h b/shared/source/direct_submission/direct_submission_hw.h index 2d25a08b1d..7616af0478 100644 --- a/shared/source/direct_submission/direct_submission_hw.h +++ b/shared/source/direct_submission/direct_submission_hw.h @@ -211,6 +211,7 @@ class DirectSubmissionHw { void *semaphorePtr = nullptr; volatile RingSemaphoreData *semaphoreData = nullptr; volatile void *workloadModeOneStoreAddress = nullptr; + uint32_t *pciBarrierPtr = nullptr; uint32_t currentQueueWorkCount = 1u; uint32_t workloadMode = 0; diff --git a/shared/source/direct_submission/direct_submission_hw.inl b/shared/source/direct_submission/direct_submission_hw.inl index c73e4a8d31..e986b56656 100644 --- a/shared/source/direct_submission/direct_submission_hw.inl +++ b/shared/source/direct_submission/direct_submission_hw.inl @@ -408,6 +408,10 @@ inline void DirectSubmissionHw::unblockGpu() { CpuIntrinsics::sfence(); } + if (this->pciBarrierPtr) { /* PCI barrier write must be issued before the semaphore update below; the volatile access keeps the compiler from sinking or eliding the store */ + *static_cast<volatile uint32_t *>(this->pciBarrierPtr) = 0u; + } + semaphoreData->QueueWorkCount = currentQueueWorkCount; if (sfenceMode == DirectSubmissionSfenceMode::BeforeAndAfterSemaphore) { diff --git a/shared/source/direct_submission/linux/drm_direct_submission.inl b/shared/source/direct_submission/linux/drm_direct_submission.inl index 3b20561dfc..122ad0466a 100644 --- a/shared/source/direct_submission/linux/drm_direct_submission.inl +++ 
b/shared/source/direct_submission/linux/drm_direct_submission.inl @@ -16,6 +16,7 @@ #include "shared/source/os_interface/linux/drm_wrappers.h" #include "shared/source/os_interface/linux/ioctl_helper.h" #include "shared/source/os_interface/linux/os_context_linux.h" +#include "shared/source/os_interface/linux/sys_calls.h" #include "shared/source/utilities/wait_util.h" #include @@ -50,6 +51,23 @@ DrmDirectSubmission::DrmDirectSubmission(const DirectSubm auto &drm = osContextLinux->getDrm(); drm.setDirectSubmissionActive(true); + auto usePciBarrier = true; + if (DebugManager.flags.DirectSubmissionPCIBarrier.get() != -1) { + usePciBarrier = DebugManager.flags.DirectSubmissionPCIBarrier.get(); + } + + if (usePciBarrier) { + auto ptr = static_cast(drm.getIoctlHelper()->pciBarrierMmap()); + if (ptr != MAP_FAILED) { + this->pciBarrierPtr = ptr; + } + } + PRINT_DEBUG_STRING(DebugManager.flags.PrintDebugMessages.get(), stderr, "Using PCI barrier ptr: %p\n", this->pciBarrierPtr); + if (this->pciBarrierPtr) { + this->miMemFenceRequired = false; + this->sfenceMode = DirectSubmissionSfenceMode::Disabled; + } + if (this->partitionedMode) { this->workPartitionAllocation = inputParams.workPartitionAllocation; UNRECOVERABLE_IF(this->workPartitionAllocation == nullptr); @@ -86,6 +104,9 @@ inline DrmDirectSubmission::~DrmDirectSubmission() { drm.waitOnUserFences(*osContextLinux, completionFenceCpuAddress, this->completionFenceValue, this->activeTiles, this->postSyncOffset); } this->deallocateResources(); + if (this->pciBarrierPtr) { + SysCalls::munmap(this->pciBarrierPtr, MemoryConstants::pageSize); + } } template diff --git a/shared/source/os_interface/linux/ioctl_helper.h b/shared/source/os_interface/linux/ioctl_helper.h index a4c391d2a7..a94184f44b 100644 --- a/shared/source/os_interface/linux/ioctl_helper.h +++ b/shared/source/os_interface/linux/ioctl_helper.h @@ -139,6 +139,7 @@ class IoctlHelper { virtual std::string getFileForMaxMemoryFrequencyOfSubDevice(int subDeviceId) const; 
virtual bool getFabricLatency(uint32_t fabricId, uint32_t &latency, uint32_t &bandwidth) = 0; virtual bool isWaitBeforeBindRequired(bool bind) const = 0; + virtual void *pciBarrierMmap() { return nullptr; }; uint32_t getFlagsForPrimeHandleToFd() const; @@ -267,6 +268,7 @@ class IoctlHelperPrelim20 : public IoctlHelper { bool checkIfIoctlReinvokeRequired(int error, DrmIoctl ioctlRequest) const override; bool getFabricLatency(uint32_t fabricId, uint32_t &latency, uint32_t &bandwidth) override; bool isWaitBeforeBindRequired(bool bind) const override; + void *pciBarrierMmap() override; protected: bool queryHwIpVersion(EngineClassInstance &engineInfo, HardwareIpVersion &ipVersion, int &ret); diff --git a/shared/source/os_interface/linux/ioctl_helper_prelim.cpp b/shared/source/os_interface/linux/ioctl_helper_prelim.cpp index 443b8eed66..a63bb0aaed 100644 --- a/shared/source/os_interface/linux/ioctl_helper_prelim.cpp +++ b/shared/source/os_interface/linux/ioctl_helper_prelim.cpp @@ -8,6 +8,7 @@ #include "shared/source/debug_settings/debug_settings_manager.h" #include "shared/source/execution_environment/root_device_environment.h" #include "shared/source/helpers/common_types.h" +#include "shared/source/helpers/constants.h" #include "shared/source/helpers/debug_helpers.h" #include "shared/source/helpers/gfx_core_helper.h" #include "shared/source/helpers/hw_info.h" @@ -731,6 +732,11 @@ bool IoctlHelperPrelim20::isWaitBeforeBindRequired(bool bind) const { return bind; } +void *IoctlHelperPrelim20::pciBarrierMmap() { + static constexpr uint64_t pciBarrierMmapOffset = 0x50 << 12; + return SysCalls::mmap(NULL, MemoryConstants::pageSize, PROT_WRITE, MAP_SHARED, drm.getFileDescriptor(), pciBarrierMmapOffset); +} + bool IoctlHelperPrelim20::queryHwIpVersion(EngineClassInstance &engineInfo, HardwareIpVersion &ipVersion, int &ret) { QueryItem queryItem{}; queryItem.queryId = PRELIM_DRM_I915_QUERY_HW_IP_VERSION; diff --git a/shared/test/common/mocks/mock_direct_submission_hw.h 
b/shared/test/common/mocks/mock_direct_submission_hw.h index 566396d327..cb6452af33 100644 --- a/shared/test/common/mocks/mock_direct_submission_hw.h +++ b/shared/test/common/mocks/mock_direct_submission_hw.h @@ -56,6 +56,7 @@ struct MockDirectSubmissionHw : public DirectSubmissionHw using BaseClass::osContext; using BaseClass::partitionConfigSet; using BaseClass::partitionedMode; + using BaseClass::pciBarrierPtr; using BaseClass::performDiagnosticMode; using BaseClass::postSyncOffset; using BaseClass::preinitializedRelaxedOrderingScheduler; @@ -78,6 +79,7 @@ struct MockDirectSubmissionHw : public DirectSubmissionHw using BaseClass::stopRingBuffer; using BaseClass::switchRingBuffersAllocations; using BaseClass::systemMemoryFenceAddressSet; + using BaseClass::unblockGpu; using BaseClass::useNotifyForPostSync; using BaseClass::workloadMode; using BaseClass::workloadModeOneExpectedValue; diff --git a/shared/test/common/os_interface/linux/sys_calls_linux_ult.cpp b/shared/test/common/os_interface/linux/sys_calls_linux_ult.cpp index 7c9c456475..41d646dde7 100644 --- a/shared/test/common/os_interface/linux/sys_calls_linux_ult.cpp +++ b/shared/test/common/os_interface/linux/sys_calls_linux_ult.cpp @@ -50,6 +50,7 @@ std::vector mmapVector(64); std::vector mmapCapturedExtendedPointers(64); bool mmapCaptureExtendedPointers = false; bool mmapAllowExtendedPointers = false; +bool failMmap = false; uint32_t mmapFuncCalled = 0u; uint32_t munmapFuncCalled = 0u; @@ -201,6 +202,9 @@ ssize_t pwrite(int fd, const void *buf, size_t count, off_t offset) { void *mmap(void *addr, size_t size, int prot, int flags, int fd, off_t off) noexcept { mmapFuncCalled++; + if (failMmap) { + return reinterpret_cast(-1); + } if (reinterpret_cast(addr) > maxNBitValue(48)) { if (mmapCaptureExtendedPointers) { mmapCapturedExtendedPointers.push_back(addr); diff --git a/shared/test/common/test_files/igdrcl.config b/shared/test/common/test_files/igdrcl.config index 6d485a385e..f07876ea6e 100644 --- 
a/shared/test/common/test_files/igdrcl.config +++ b/shared/test/common/test_files/igdrcl.config @@ -107,6 +107,7 @@ DirectSubmissionEnableDebugBuffer = 0 DirectSubmissionDiagnosticExecutionCount = 30 DirectSubmissionNewResourceTlbFlush = -1 DirectSubmissionDisableCacheFlush = -1 +DirectSubmissionPCIBarrier = -1 DirectSubmissionDisableMonitorFence = -1 DirectSubmissionPrintBuffers = 0 DirectSubmissionMaxRingBuffers = -1 diff --git a/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp b/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp index 0f275882a6..f41f06db55 100644 --- a/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp +++ b/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp @@ -160,6 +160,17 @@ HWTEST_F(DirectSubmissionDispatchMiMemFenceTest, givenMiMemFenceSupportedWhenSys EXPECT_TRUE(directSubmission.systemMemoryFenceAddressSet); } +HWTEST_F(DirectSubmissionDispatchMiMemFenceTest, givenPciBarrierPtrSetWhenUnblockGpuThenWriteZero) { + MockDirectSubmissionHw> directSubmission(*pDevice->getDefaultEngine().commandStreamReceiver); + uint32_t pciBarrierMock = 1; + directSubmission.pciBarrierPtr = &pciBarrierMock; + EXPECT_TRUE(directSubmission.initialize(false, false)); + + directSubmission.unblockGpu(); + + EXPECT_EQ(*directSubmission.pciBarrierPtr, 0u); +} + HWTEST_F(DirectSubmissionDispatchMiMemFenceTest, givenDebugFlagSetWhenCreatingDirectSubmissionThenDontEnableMiMemFenceProgramming) { DebugManagerStateRestore restorer; DebugManager.flags.DirectSubmissionInsertExtraMiMemFenceCommands.set(0); diff --git a/shared/test/unit_test/direct_submission/linux/drm_direct_submission_tests.cpp b/shared/test/unit_test/direct_submission/linux/drm_direct_submission_tests.cpp index 9d1981f46b..2e838ab35d 100644 --- a/shared/test/unit_test/direct_submission/linux/drm_direct_submission_tests.cpp +++ b/shared/test/unit_test/direct_submission/linux/drm_direct_submission_tests.cpp @@ -12,6 +12,7 @@ 
#include "shared/source/direct_submission/linux/drm_direct_submission.h" #include "shared/source/os_interface/linux/drm_gem_close_worker.h" #include "shared/source/os_interface/linux/os_context_linux.h" +#include "shared/source/os_interface/linux/sys_calls.h" #include "shared/test/common/cmd_parse/hw_parse.h" #include "shared/test/common/helpers/debug_manager_state_restore.h" #include "shared/test/common/helpers/engine_descriptor_helper.h" @@ -31,6 +32,12 @@ namespace CpuIntrinsicsTests { extern std::atomic pauseCounter; } +namespace NEO { +namespace SysCalls { +extern bool failMmap; +} +} // namespace NEO + struct DrmDirectSubmissionTest : public DrmMemoryManagerBasic { void SetUp() override { DrmMemoryManagerBasic::SetUp(); @@ -78,9 +85,11 @@ struct MockDrmDirectSubmission : public DrmDirectSubmissionosInterface->getDriverModel()->as(); + auto ptr = drm->getIoctlHelper()->pciBarrierMmap(); + if (!ptr) { + GTEST_SKIP(); + } + + DebugManagerStateRestore restorer; + DebugManager.flags.DirectSubmissionPCIBarrier.set(1); + auto &commandStreamReceiver = *device->getDefaultEngine().commandStreamReceiver; + + MockDrmDirectSubmission> directSubmission(commandStreamReceiver); + + EXPECT_NE(nullptr, directSubmission.pciBarrierPtr); + EXPECT_EQ(DirectSubmissionSfenceMode::Disabled, directSubmission.sfenceMode); + EXPECT_FALSE(directSubmission.miMemFenceRequired); + + SysCalls::munmap(ptr, MemoryConstants::pageSize); +} + +HWTEST_F(DrmDirectSubmissionTest, givenPciBarrierWhenCreateDirectSubmissionAndMmapFailsThenPtrNotMappedAndOtherSyncMethodsRemain) { + auto drm = executionEnvironment.rootDeviceEnvironments[0]->osInterface->getDriverModel()->as(); + auto ptr = drm->getIoctlHelper()->pciBarrierMmap(); + if (!ptr) { + GTEST_SKIP(); + } + + DebugManagerStateRestore restorer; + DebugManager.flags.DirectSubmissionPCIBarrier.set(1); + auto &commandStreamReceiver = *device->getDefaultEngine().commandStreamReceiver; + VariableBackup backup(&SysCalls::failMmap, true); + + 
MockDrmDirectSubmission> directSubmission(commandStreamReceiver); + + EXPECT_EQ(nullptr, directSubmission.pciBarrierPtr); + EXPECT_NE(DirectSubmissionSfenceMode::Disabled, directSubmission.sfenceMode); + EXPECT_EQ(directSubmission.miMemFenceRequired, device->getRootDeviceEnvironment().getHelper().isGlobalFenceInDirectSubmissionRequired(device->getHardwareInfo())); + + SysCalls::munmap(ptr, MemoryConstants::pageSize); +} + +HWTEST_F(DrmDirectSubmissionTest, givenPciBarrierDisabledWhenCreateDirectSubmissionThenPtrNotMappedAndOtherSyncMethodsRemain) { + auto drm = executionEnvironment.rootDeviceEnvironments[0]->osInterface->getDriverModel()->as(); + auto ptr = drm->getIoctlHelper()->pciBarrierMmap(); + if (!ptr) { + GTEST_SKIP(); + } + + DebugManagerStateRestore restorer; + DebugManager.flags.DirectSubmissionPCIBarrier.set(0); + auto &commandStreamReceiver = *device->getDefaultEngine().commandStreamReceiver; + + MockDrmDirectSubmission> directSubmission(commandStreamReceiver); + + EXPECT_EQ(nullptr, directSubmission.pciBarrierPtr); + EXPECT_NE(DirectSubmissionSfenceMode::Disabled, directSubmission.sfenceMode); + EXPECT_EQ(directSubmission.miMemFenceRequired, device->getRootDeviceEnvironment().getHelper().isGlobalFenceInDirectSubmissionRequired(device->getHardwareInfo())); + + SysCalls::munmap(ptr, MemoryConstants::pageSize); +} + HWTEST_F(DrmDirectSubmissionTest, givenNoCompletionFenceSupportWhenCreateDrmDirectSubmissionThenCompletionFenceAllocationIsNotSet) { DebugManagerStateRestore restorer; DebugManager.flags.EnableDrmCompletionFence.set(0); diff --git a/shared/test/unit_test/os_interface/linux/xe/ioctl_helper_xe_tests.cpp b/shared/test/unit_test/os_interface/linux/xe/ioctl_helper_xe_tests.cpp index 6b48fcb21b..b1fd5e85a0 100644 --- a/shared/test/unit_test/os_interface/linux/xe/ioctl_helper_xe_tests.cpp +++ b/shared/test/unit_test/os_interface/linux/xe/ioctl_helper_xe_tests.cpp @@ -36,6 +36,15 @@ TEST(IoctlHelperXeTest, 
givenXeDrmVersionsWhenGettingIoctlHelperThenValidIoctlHe EXPECT_NE(nullptr, xeIoctlHelper); } +TEST(IoctlHelperXeTest, givenXeDrmWhenGetPciBarrierMmapThenReturnsNullptr) { + MockExecutionEnvironment executionEnvironment{}; + std::unique_ptr drm{Drm::create(std::make_unique(0, ""), *executionEnvironment.rootDeviceEnvironments[0])}; + IoctlHelperXe ioctlHelper{*drm}; + + auto ptr = ioctlHelper.pciBarrierMmap(); + EXPECT_EQ(ptr, nullptr); +} + TEST(IoctlHelperXeTest, whenChangingBufferBindingThenWaitIsNeededAlways) { MockExecutionEnvironment executionEnvironment{}; std::unique_ptr drm{Drm::create(std::make_unique(0, ""), *executionEnvironment.rootDeviceEnvironments[0])};