diff --git a/opencl/test/unit_test/os_interface/linux/CMakeLists.txt b/opencl/test/unit_test/os_interface/linux/CMakeLists.txt index b595495b98..7fd59d5d95 100644 --- a/opencl/test/unit_test/os_interface/linux/CMakeLists.txt +++ b/opencl/test/unit_test/os_interface/linux/CMakeLists.txt @@ -1,5 +1,5 @@ # -# Copyright (C) 2018-2021 Intel Corporation +# Copyright (C) 2018-2022 Intel Corporation # # SPDX-License-Identifier: MIT # @@ -65,6 +65,12 @@ if("${BRANCH_TYPE}" STREQUAL "") ) endif() +if(TESTS_XEHP_AND_LATER) + list(APPEND IGDRCL_SRCS_tests_os_interface_linux + ${CMAKE_CURRENT_SOURCE_DIR}/drm_command_stream_xehp_and_later_tests.cpp + ) +endif() + if(UNIX) target_sources(igdrcl_tests PRIVATE ${IGDRCL_SRCS_tests_os_interface_linux}) endif() diff --git a/opencl/test/unit_test/os_interface/linux/drm_command_stream_mm_tests.cpp b/opencl/test/unit_test/os_interface/linux/drm_command_stream_mm_tests.cpp index f8e5487946..10029f0a61 100644 --- a/opencl/test/unit_test/os_interface/linux/drm_command_stream_mm_tests.cpp +++ b/opencl/test/unit_test/os_interface/linux/drm_command_stream_mm_tests.cpp @@ -95,8 +95,10 @@ HWTEST_F(DrmCommandStreamMMTest, givenExecutionEnvironmentWithMoreThanOneRootDev } } -HWTEST_TEMPLATED_F(DrmCommandStreamMemExecTest, GivenDrmSupportsCompletionFenceWhenCallingCsrExecThenTagAllocationIsPassed) { +HWTEST_TEMPLATED_F(DrmCommandStreamMemExecTest, GivenDrmSupportsVmBindAndCompletionFenceWhenCallingCsrExecThenTagAllocationIsPassed) { mock->completionFenceSupported = true; + mock->isVmBindAvailableCall.callParent = false; + mock->isVmBindAvailableCall.returnValue = true; TestedBufferObject bo(mock, 128); MockDrmAllocation cmdBuffer(GraphicsAllocation::AllocationType::COMMAND_BUFFER, MemoryPool::System4KBPages); @@ -118,7 +120,7 @@ HWTEST_TEMPLATED_F(DrmCommandStreamMemExecTest, GivenDrmSupportsCompletionFenceW auto *testCsr = static_cast *>(csr); testCsr->latestSentTaskCount = 2; - int ret = testCsr->exec(batchBuffer, 1, 2); + int ret = testCsr->exec(batchBuffer, 1, 2, 0); EXPECT_EQ(0, ret); EXPECT_EQ(expectedCompletionGpuAddress, bo.receivedCompletionGpuAddress); @@ -126,3 +128,73 @@ HWTEST_TEMPLATED_F(DrmCommandStreamMemExecTest, GivenDrmSupportsCompletionFenceW mm->freeGraphicsMemory(allocation); } + +HWTEST_TEMPLATED_F(DrmCommandStreamMemExecTest, GivenDrmSupportsVmBindAndNotCompletionFenceWhenCallingCsrExecThenTagAllocationIsNotPassed) { + mock->completionFenceSupported = false; + mock->isVmBindAvailableCall.callParent = false; + mock->isVmBindAvailableCall.returnValue = true; + + TestedBufferObject bo(mock, 128); + MockDrmAllocation cmdBuffer(GraphicsAllocation::AllocationType::COMMAND_BUFFER, MemoryPool::System4KBPages); + cmdBuffer.bufferObjects[0] = &bo; + uint8_t buff[128]; + + LinearStream cs(&cmdBuffer, buff, 128); + CommandStreamReceiverHw::addBatchBufferEnd(cs, nullptr); + EncodeNoop::alignToCacheLine(cs); + + BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 0, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; + + auto allocation = mm->allocateGraphicsMemoryWithProperties(MockAllocationProperties{csr->getRootDeviceIndex(), MemoryConstants::pageSize}); + csr->makeResident(cmdBuffer); + csr->makeResident(*allocation); + csr->makeResident(*csr->getTagAllocation()); + + constexpr uint64_t expectedCompletionGpuAddress = 0; + constexpr uint32_t expectedCompletionValue = 0; + auto *testCsr = static_cast *>(csr); + testCsr->latestSentTaskCount = 2; + + int ret = testCsr->exec(batchBuffer, 1, 2, 0); + EXPECT_EQ(0, ret); + + EXPECT_EQ(expectedCompletionGpuAddress, bo.receivedCompletionGpuAddress); + EXPECT_EQ(expectedCompletionValue, bo.receivedCompletionValue); + + mm->freeGraphicsMemory(allocation); +} + +HWTEST_TEMPLATED_F(DrmCommandStreamMemExecTest, GivenDrmSupportsCompletionFenceAndNotVmBindWhenCallingCsrExecThenTagAllocationIsNotPassed) { + mock->completionFenceSupported = true; + mock->isVmBindAvailableCall.callParent = false; + mock->isVmBindAvailableCall.returnValue = false; + + TestedBufferObject bo(mock, 128); + MockDrmAllocation cmdBuffer(GraphicsAllocation::AllocationType::COMMAND_BUFFER, MemoryPool::System4KBPages); + cmdBuffer.bufferObjects[0] = &bo; + uint8_t buff[128]; + + LinearStream cs(&cmdBuffer, buff, 128); + CommandStreamReceiverHw::addBatchBufferEnd(cs, nullptr); + EncodeNoop::alignToCacheLine(cs); + + BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 0, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; + + auto allocation = mm->allocateGraphicsMemoryWithProperties(MockAllocationProperties{csr->getRootDeviceIndex(), MemoryConstants::pageSize}); + csr->makeResident(cmdBuffer); + csr->makeResident(*allocation); + csr->makeResident(*csr->getTagAllocation()); + + constexpr uint64_t expectedCompletionGpuAddress = 0; + constexpr uint32_t expectedCompletionValue = 0; + auto *testCsr = static_cast *>(csr); + testCsr->latestSentTaskCount = 2; + + int ret = testCsr->exec(batchBuffer, 1, 2, 0); + EXPECT_EQ(0, ret); + + EXPECT_EQ(expectedCompletionGpuAddress, bo.receivedCompletionGpuAddress); + EXPECT_EQ(expectedCompletionValue, bo.receivedCompletionValue); + + mm->freeGraphicsMemory(allocation); +} diff --git a/opencl/test/unit_test/os_interface/linux/drm_command_stream_xehp_and_later_tests.cpp b/opencl/test/unit_test/os_interface/linux/drm_command_stream_xehp_and_later_tests.cpp new file mode 100644 index 0000000000..093c2b5040 --- /dev/null +++ b/opencl/test/unit_test/os_interface/linux/drm_command_stream_xehp_and_later_tests.cpp @@ -0,0 +1,182 @@ +/* + * Copyright (C) 2022 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/source/execution_environment/root_device_environment.h" +#include "shared/source/os_interface/linux/drm_command_stream.h" +#include "shared/source/os_interface/linux/drm_memory_manager.h" +#include "shared/source/os_interface/linux/drm_memory_operations_handler.h" +#include "shared/source/os_interface/linux/os_context_linux.h" +#include "shared/source/os_interface/os_interface.h" +#include "shared/test/common/helpers/debug_manager_state_restore.h" +#include "shared/test/common/helpers/engine_descriptor_helper.h" +#include "shared/test/common/helpers/variable_backup.h" +#include "shared/test/common/libult/linux/drm_mock.h" +#include "shared/test/common/mocks/linux/mock_drm_allocation.h" +#include "shared/test/common/mocks/linux/mock_drm_memory_manager.h" +#include "shared/test/common/mocks/mock_allocation_properties.h" +#include "shared/test/common/mocks/mock_execution_environment.h" +#include "shared/test/common/os_interface/linux/device_command_stream_fixture.h" +#include "shared/test/common/os_interface/linux/drm_buffer_object_fixture.h" +#include "shared/test/common/test_macros/test.h" + +#include "opencl/test/unit_test/os_interface/linux/drm_command_stream_fixture.h" + +using namespace NEO; + +struct DrmCommandStreamMultiTileMemExecFixture { + void SetUp() { + DebugManager.flags.CreateMultipleSubDevices.set(2u); + DebugManager.flags.EnableImplicitScaling.set(1); + DebugManager.flags.EnableForcePin.set(false); + osLocalMemoryBackup = std::make_unique>(&OSInterface::osEnableLocalMemory, true); + + executionEnvironment = new MockExecutionEnvironment(); + executionEnvironment->incRefInternal(); + executionEnvironment->initGmm(); + + mock = new DrmMockCustom(*executionEnvironment->rootDeviceEnvironments[0]); + executionEnvironment->rootDeviceEnvironments[0]->osInterface = std::make_unique(); + executionEnvironment->rootDeviceEnvironments[0]->osInterface->setDriverModel(std::unique_ptr(mock)); + executionEnvironment->rootDeviceEnvironments[0]->memoryOperationsInterface = DrmMemoryOperationsHandler::create(*mock, 0); + + memoryManager = new DrmMemoryManager(gemCloseWorkerMode::gemCloseWorkerInactive, + DebugManager.flags.EnableForcePin.get(), + true, + *executionEnvironment); + executionEnvironment->memoryManager.reset(memoryManager); + executionEnvironment->prepareRootDeviceEnvironments(1u); + executionEnvironment->rootDeviceEnvironments[0]->setHwInfo(NEO::defaultHwInfo.get()); + executionEnvironment->initializeMemoryManager(); + device.reset(MockDevice::create(executionEnvironment, 0)); + + osContext = std::make_unique(*mock, 0u, EngineDescriptorHelper::getDefaultDescriptor(device->getDeviceBitfield())); + osContext->ensureContextInitialized(); + } + + void TearDown() { + executionEnvironment->decRefInternal(); + } + + DebugManagerStateRestore dbgRestore; + std::unique_ptr> osLocalMemoryBackup; + std::unique_ptr device; + std::unique_ptr osContext; + MockExecutionEnvironment *executionEnvironment = nullptr; + DrmMockCustom *mock = nullptr; + DrmMemoryManager *memoryManager = nullptr; +}; + +using DrmCommandStreamMultiTileMemExecTest = Test; + +HWCMDTEST_F(IGFX_XE_HP_CORE, DrmCommandStreamMultiTileMemExecTest, GivenDrmSupportsCompletionFenceAndVmBindWhenCallingCsrExecThenMultipleTagAllocationIsPassed) { + auto *testCsr = new TestedDrmCommandStreamReceiver(*executionEnvironment, 0, device->getDeviceBitfield()); + device->resetCommandStreamReceiver(testCsr); + EXPECT_EQ(2u, testCsr->activePartitions); + testCsr->setupContext(*osContext.get()); + + mock->completionFenceSupported = true; + mock->isVmBindAvailableCall.callParent = false; + mock->isVmBindAvailableCall.returnValue = true; + + TestedBufferObject bo(mock, 128); + MockDrmAllocation cmdBuffer(GraphicsAllocation::AllocationType::COMMAND_BUFFER, MemoryPool::System4KBPages); + cmdBuffer.bufferObjects[0] = &bo; + uint8_t buff[128]; + + LinearStream cs(&cmdBuffer, buff, 128); + CommandStreamReceiverHw::addBatchBufferEnd(cs, nullptr); + EncodeNoop::alignToCacheLine(cs); + + BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 0, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; + + auto allocation = memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{testCsr->getRootDeviceIndex(), MemoryConstants::pageSize}); + testCsr->makeResident(cmdBuffer); + testCsr->makeResident(*allocation); + testCsr->makeResident(*testCsr->getTagAllocation()); + + testCsr->latestSentTaskCount = 2; + testCsr->postSyncWriteOffset = 16; + + uint64_t expectedCompletionGpuAddress = testCsr->getTagAllocation()->getGpuAddress() + Drm::completionFenceOffset + testCsr->postSyncWriteOffset; + + int ret = testCsr->flushInternal(batchBuffer, testCsr->getResidencyAllocations()); + EXPECT_EQ(0, ret); + + EXPECT_EQ(expectedCompletionGpuAddress, bo.receivedCompletionGpuAddress); + EXPECT_EQ(testCsr->latestSentTaskCount, bo.receivedCompletionValue); + EXPECT_EQ(2u, bo.execCalled); + + memoryManager->freeGraphicsMemory(allocation); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, DrmCommandStreamMultiTileMemExecTest, GivenDrmSupportsCompletionFenceAndVmBindWhenHandlingCompletionThenExpectMultipleWaitCalls) { + EngineControl &defaultEngine = device->getDefaultEngine(); + EXPECT_EQ(2u, defaultEngine.commandStreamReceiver->getActivePartitions()); + + uint32_t postSyncOffset = defaultEngine.commandStreamReceiver->getPostSyncWriteOffset(); + EXPECT_NE(0u, postSyncOffset); + + mock->completionFenceSupported = true; + mock->isVmBindAvailableCall.callParent = false; + mock->isVmBindAvailableCall.returnValue = true; + + auto allocation = memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{0, 1024, GraphicsAllocation::AllocationType::COMMAND_BUFFER}); + allocation->updateTaskCount(2, defaultEngine.osContext->getContextId()); + + volatile uint32_t *completionAddress = defaultEngine.commandStreamReceiver->getTagAddress(); + completionAddress += (Drm::completionFenceOffset / sizeof(uint32_t)); + *completionAddress = 1; + completionAddress += (postSyncOffset / sizeof(uint32_t)); + *completionAddress = 1; + + memoryManager->handleFenceCompletion(allocation); + + uint64_t expectedAddress = castToUint64(const_cast(defaultEngine.commandStreamReceiver->getTagAddress())) + + Drm::completionFenceOffset + + postSyncOffset; + constexpr uint64_t expectedValue = 2; + + EXPECT_EQ(2u, mock->waitUserFenceCall.called); + EXPECT_EQ(expectedAddress, mock->waitUserFenceCall.address); + EXPECT_EQ(expectedValue, mock->waitUserFenceCall.value); + + memoryManager->freeGraphicsMemory(allocation); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, DrmCommandStreamMultiTileMemExecTest, GivenDrmSupportsCompletionFenceAndVmBindWhenHandlingCompletionAndOneContextIsReadyThenExpectOneWaitCall) { + EngineControl &defaultEngine = device->getDefaultEngine(); + EXPECT_EQ(2u, defaultEngine.commandStreamReceiver->getActivePartitions()); + + uint32_t postSyncOffset = defaultEngine.commandStreamReceiver->getPostSyncWriteOffset(); + EXPECT_NE(0u, postSyncOffset); + + mock->completionFenceSupported = true; + mock->isVmBindAvailableCall.callParent = false; + mock->isVmBindAvailableCall.returnValue = true; + + auto allocation = memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{0, 1024, GraphicsAllocation::AllocationType::COMMAND_BUFFER}); + allocation->updateTaskCount(2, defaultEngine.osContext->getContextId()); + + volatile uint32_t *completionAddress = defaultEngine.commandStreamReceiver->getTagAddress(); + completionAddress += (Drm::completionFenceOffset / sizeof(uint32_t)); + *completionAddress = 2; //1st context is ready + completionAddress += (postSyncOffset / sizeof(uint32_t)); + *completionAddress = 1; + + memoryManager->handleFenceCompletion(allocation); + + uint64_t expectedAddress = castToUint64(const_cast(defaultEngine.commandStreamReceiver->getTagAddress())) + + Drm::completionFenceOffset + + postSyncOffset; + constexpr uint64_t expectedValue = 2; + + EXPECT_EQ(1u, mock->waitUserFenceCall.called); + EXPECT_EQ(expectedAddress, mock->waitUserFenceCall.address); + EXPECT_EQ(expectedValue, mock->waitUserFenceCall.value); + + memoryManager->freeGraphicsMemory(allocation); +} diff --git a/opencl/test/unit_test/os_interface/linux/drm_memory_manager_tests.cpp b/opencl/test/unit_test/os_interface/linux/drm_memory_manager_tests.cpp index 582c63b604..2598c1f87d 100644 --- a/opencl/test/unit_test/os_interface/linux/drm_memory_manager_tests.cpp +++ b/opencl/test/unit_test/os_interface/linux/drm_memory_manager_tests.cpp @@ -5809,14 +5809,29 @@ TEST_F(DrmMemoryManagerTest, GivenEligbleAllocationTypeWhenCheckingAllocationEli } TEST_F(DrmMemoryManagerTest, GivenNotEligbleAllocationTypeWhenCheckingAllocationEligbleForCompletionFenceThenReturnFalse) { - GraphicsAllocation::AllocationType validAllocations[] = { + GraphicsAllocation::AllocationType invalidAllocations[] = { GraphicsAllocation::AllocationType::BUFFER_HOST_MEMORY, GraphicsAllocation::AllocationType::CONSTANT_SURFACE, GraphicsAllocation::AllocationType::FILL_PATTERN, GraphicsAllocation::AllocationType::GLOBAL_SURFACE}; for (size_t i = 0; i < 4; i++) { - EXPECT_FALSE(memoryManager->allocationTypeForCompletionFence(validAllocations[i])); + EXPECT_FALSE(memoryManager->allocationTypeForCompletionFence(invalidAllocations[i])); + } +} + +TEST_F(DrmMemoryManagerTest, GivenNotEligbleAllocationTypeAndDebugFlagOverridingWhenCheckingAllocationEligbleForCompletionFenceThenReturnTrue) { + DebugManagerStateRestore dbgState; + DebugManager.flags.UseDrmCompletionFenceForAllAllocations.set(1); + + GraphicsAllocation::AllocationType invalidAllocations[] = { + GraphicsAllocation::AllocationType::BUFFER_HOST_MEMORY, + GraphicsAllocation::AllocationType::CONSTANT_SURFACE, + GraphicsAllocation::AllocationType::FILL_PATTERN, + GraphicsAllocation::AllocationType::GLOBAL_SURFACE}; + + for (size_t i = 0; i < 4; i++) { + EXPECT_TRUE(memoryManager->allocationTypeForCompletionFence(invalidAllocations[i])); } } @@ -5877,4 +5892,27 @@ TEST_F(DrmMemoryManagerTest, givenCompletionFenceEnabledWhenHandlingCompletionOf memoryManager->freeGraphicsMemory(allocation); } +HWTEST_F(DrmMemoryManagerTest, givenCompletionFenceEnabledWhenHandlingCompletionAndTagAddressIsNullThenDoNotCallWaitUserFence) { + mock->ioctl_expected.total = -1; + + VariableBackup backupFenceSupported{&mock->completionFenceSupported, true}; + VariableBackup backupVmBindCallParent{&mock->isVmBindAvailableCall.callParent, false}; + VariableBackup backupVmBindReturnValue{&mock->isVmBindAvailableCall.returnValue, true}; + + auto allocation = memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{rootDeviceIndex, 1024, GraphicsAllocation::AllocationType::COMMAND_BUFFER}); + auto engine = memoryManager->getRegisteredEngines()[0]; + allocation->updateTaskCount(2, engine.osContext->getContextId()); + + auto testCsr = static_cast *>(engine.commandStreamReceiver); + auto backupTagAddress = testCsr->tagAddress; + testCsr->tagAddress = nullptr; + + memoryManager->handleFenceCompletion(allocation); + EXPECT_EQ(0u, mock->waitUserFenceCall.called); + + testCsr->tagAddress = backupTagAddress; + + memoryManager->freeGraphicsMemory(allocation); +} + } // namespace NEO diff --git a/opencl/test/unit_test/os_interface/linux/drm_tests.cpp b/opencl/test/unit_test/os_interface/linux/drm_tests.cpp index 8a37744783..b57fce7d2f 100644 --- a/opencl/test/unit_test/os_interface/linux/drm_tests.cpp +++ b/opencl/test/unit_test/os_interface/linux/drm_tests.cpp @@ -944,14 +944,23 @@ TEST(DrmTest, GivenCompletionFenceDebugFlagWhenCreatingDrmObjectThenExpectCorrec auto executionEnvironment = std::make_unique(); executionEnvironment->prepareRootDeviceEnvironments(1); + HardwareInfo *hwInfo = defaultHwInfo.get(); + executionEnvironment->rootDeviceEnvironments[0]->setHwInfo(hwInfo); + + auto &hwHelper = HwHelper::get(hwInfo->platform.eRenderCoreFamily); + DrmMock drmDefault{*executionEnvironment->rootDeviceEnvironments[0]}; - EXPECT_FALSE(drmDefault.completionFenceSupported); + if (hwHelper.isLinuxCompletionFenceSupported()) { + EXPECT_TRUE(drmDefault.completionFenceSupport()); + } else { + EXPECT_FALSE(drmDefault.completionFenceSupport()); + } DebugManager.flags.EnableDrmCompletionFence.set(1); DrmMock drmEnabled{*executionEnvironment->rootDeviceEnvironments[0]}; - EXPECT_TRUE(drmEnabled.completionFenceSupported); + EXPECT_TRUE(drmEnabled.completionFenceSupport()); DebugManager.flags.EnableDrmCompletionFence.set(0); DrmMock drmDisabled{*executionEnvironment->rootDeviceEnvironments[0]}; - EXPECT_FALSE(drmDisabled.completionFenceSupported); + EXPECT_FALSE(drmDisabled.completionFenceSupport()); } diff --git a/opencl/test/unit_test/test_files/igdrcl.config b/opencl/test/unit_test/test_files/igdrcl.config index 3d25e8f768..662027e812 100644 --- a/opencl/test/unit_test/test_files/igdrcl.config +++ b/opencl/test/unit_test/test_files/igdrcl.config @@ -365,6 +365,7 @@ UpdateCrossThreadDataSize = 0 ForceBcsEngineIndex = -1 ResolveDependenciesViaPipeControls = -1 EnableDrmCompletionFence = -1 +UseDrmCompletionFenceForAllAllocations = -1 ExperimentalEnableSourceLevelDebugger = 0 Force2dImageAsArray = -1 ForceExtendedBufferSize = -1 \ No newline at end of file diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index 0f719bb430..eb913554c1 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -343,6 +343,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, OverrideSystolicPipelineSelect, -1, "set SYSTOLI DECLARE_DEBUG_VARIABLE(int32_t, OverrideSystolicInComputeWalker, -1, "set SYSTOLIC MODE ENABLE in COMPUTE_WALKER cmd, -1:default, 0:disable, 1:enable") DECLARE_DEBUG_VARIABLE(int32_t, AddStatePrefetchCmdToMemoryPrefetchAPI, -1, "Add STATE_PREFETCH to zeCommandListAppendMemoryPrefetch, -1:default, 0:disable, 1:enable") DECLARE_DEBUG_VARIABLE(int32_t, EnableDrmCompletionFence, -1, "Enables DRM completion fence, -1:default (disabled), 0:disable, 1:enable") +DECLARE_DEBUG_VARIABLE(int32_t, UseDrmCompletionFenceForAllAllocations, -1, "Uses DRM completion fence for all allocations, -1:default (disabled), 0:disable, 1:enable") /*EXPERIMENTAL TOGGLES*/ DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalSetWalkerPartitionCount, 0, "Experimental implementation: Set number of COMPUTE_WALKERs for a given Partition Type, 0 - do not set the feature.") diff --git a/shared/source/helpers/hw_helper.h b/shared/source/helpers/hw_helper.h index 96ac0646f9..18e04914e6 100644 --- a/shared/source/helpers/hw_helper.h +++ b/shared/source/helpers/hw_helper.h @@ -154,6 +154,7 @@ class HwHelper { virtual bool isRevisionSpecificBinaryBuiltinRequired() const = 0; virtual bool forceNonGpuCoherencyWA(bool requiresCoherency) const = 0; virtual bool platformSupportsImplicitScaling(const NEO::HardwareInfo &hwInfo) const = 0; + virtual bool isLinuxCompletionFenceSupported() const = 0; protected: HwHelper() = default; @@ -389,6 +390,7 @@ class HwHelperHw : public HwHelper { bool isRevisionSpecificBinaryBuiltinRequired() const override; bool forceNonGpuCoherencyWA(bool requiresCoherency) const override; bool platformSupportsImplicitScaling(const NEO::HardwareInfo &hwInfo) const override; + bool isLinuxCompletionFenceSupported() const override; protected: static const AuxTranslationMode defaultAuxTranslationMode; diff --git a/shared/source/helpers/hw_helper_bdw_and_later.inl b/shared/source/helpers/hw_helper_bdw_and_later.inl index 783f3e4953..bf1abad4df 100644 --- a/shared/source/helpers/hw_helper_bdw_and_later.inl +++ b/shared/source/helpers/hw_helper_bdw_and_later.inl @@ -145,4 +145,9 @@ inline bool HwHelperHw::platformSupportsImplicitScaling(const NEO::Ha return false; } +template +inline bool HwHelperHw::isLinuxCompletionFenceSupported() const { + return false; +} + } // namespace NEO diff --git a/shared/source/helpers/hw_helper_xehp_and_later.inl b/shared/source/helpers/hw_helper_xehp_and_later.inl index f943c03598..7134a09f70 100644 --- a/shared/source/helpers/hw_helper_xehp_and_later.inl +++ b/shared/source/helpers/hw_helper_xehp_and_later.inl @@ -199,4 +199,9 @@ inline bool HwHelperHw::platformSupportsImplicitScaling(const NEO::Ha return ImplicitScalingDispatch::platformSupportsImplicitScaling(hwInfo); } +template +inline bool HwHelperHw::isLinuxCompletionFenceSupported() const { + return false; +} + } // namespace NEO diff --git a/shared/source/os_interface/linux/drm_buffer_object.cpp b/shared/source/os_interface/linux/drm_buffer_object.cpp index c0793ae3f2..436d63412b 100644 --- a/shared/source/os_interface/linux/drm_buffer_object.cpp +++ b/shared/source/os_interface/linux/drm_buffer_object.cpp @@ -34,7 +34,7 @@ namespace NEO { BufferObject::BufferObject(Drm *drm, int handle, size_t size, size_t maxOsContextCount) : drm(drm), refCount(1), handle(handle), size(size), isReused(false) { - this->tiling_mode = I915_TILING_NONE; + this->tilingMode = I915_TILING_NONE; this->lockedAddress = nullptr; perContextVmsUsed = drm->isPerContextVMRequired(); @@ -86,7 +86,7 @@ int BufferObject::wait(int64_t timeoutNs) { } bool BufferObject::setTiling(uint32_t mode, uint32_t stride) { - if (this->tiling_mode == mode) { + if (this->tilingMode == mode) { return true; } @@ -99,7 +99,7 @@ bool BufferObject::setTiling(uint32_t mode, uint32_t stride) { return false; } - this->tiling_mode = set_tiling.tiling_mode; + this->tilingMode = set_tiling.tiling_mode; return set_tiling.tiling_mode == mode; } diff --git a/shared/source/os_interface/linux/drm_buffer_object.h b/shared/source/os_interface/linux/drm_buffer_object.h index 5059aa5c6b..8c70362508 100644 --- a/shared/source/os_interface/linux/drm_buffer_object.h +++ b/shared/source/os_interface/linux/drm_buffer_object.h @@ -148,7 +148,7 @@ class BufferObject { bool isReused; //Tiling - uint32_t tiling_mode; + uint32_t tilingMode; bool allowCapture = false; bool requiresImmediateBinding = false; bool requiresExplicitResidency = false; diff --git a/shared/source/os_interface/linux/drm_command_stream.h b/shared/source/os_interface/linux/drm_command_stream.h index 0a9976ccde..8f75466ad8 100644 --- a/shared/source/os_interface/linux/drm_command_stream.h +++ b/shared/source/os_interface/linux/drm_command_stream.h @@ -67,7 +67,7 @@ class DrmCommandStreamReceiver : public DeviceCommandStreamReceiver { protected: MOCKABLE_VIRTUAL int flushInternal(const BatchBuffer &batchBuffer, const ResidencyContainer &allocationsForResidency); - MOCKABLE_VIRTUAL int exec(const BatchBuffer &batchBuffer, uint32_t vmHandleId, uint32_t drmContextId); + MOCKABLE_VIRTUAL int exec(const BatchBuffer &batchBuffer, uint32_t vmHandleId, uint32_t drmContextId, uint32_t index); MOCKABLE_VIRTUAL int waitUserFence(uint32_t waitValue); bool isUserFenceWaitActive(); diff --git a/shared/source/os_interface/linux/drm_command_stream.inl b/shared/source/os_interface/linux/drm_command_stream.inl index 092ff3acfa..219e9bed2b 100644 --- a/shared/source/os_interface/linux/drm_command_stream.inl +++ b/shared/source/os_interface/linux/drm_command_stream.inl @@ -183,7 +183,7 @@ void DrmCommandStreamReceiver::printBOsForSubmit(ResidencyContainer & } template -int DrmCommandStreamReceiver::exec(const BatchBuffer &batchBuffer, uint32_t vmHandleId, uint32_t drmContextId) { +int DrmCommandStreamReceiver::exec(const BatchBuffer &batchBuffer, uint32_t vmHandleId, uint32_t drmContextId, uint32_t index) { DrmAllocation *alloc = static_cast(batchBuffer.commandBufferAllocation); DEBUG_BREAK_IF(!alloc); BufferObject *bb = alloc->getBO(); @@ -199,8 +199,9 @@ int DrmCommandStreamReceiver::exec(const BatchBuffer &batchBuffer, ui uint64_t completionGpuAddress = 0; uint32_t completionValue = 0; - if (this->drm->completionFenceSupport()) { - completionGpuAddress = getTagAllocation()->getGpuAddress() + Drm::completionFenceOffset; + if (this->drm->isVmBindAvailable() && + this->drm->completionFenceSupport()) { + completionGpuAddress = getTagAllocation()->getGpuAddress() + (index * this->postSyncWriteOffset) + Drm::completionFenceOffset; completionValue = this->latestSentTaskCount; } diff --git a/shared/source/os_interface/linux/drm_command_stream_bdw_and_later.inl b/shared/source/os_interface/linux/drm_command_stream_bdw_and_later.inl index b375a43e42..fb182273f0 100644 --- a/shared/source/os_interface/linux/drm_command_stream_bdw_and_later.inl +++ b/shared/source/os_interface/linux/drm_command_stream_bdw_and_later.inl @@ -1,5 +1,5 @@ /* - * Copyright (C) 2019-2021 Intel Corporation + * Copyright (C) 2019-2022 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -13,7 +13,7 @@ namespace NEO { template int DrmCommandStreamReceiver::flushInternal(const BatchBuffer &batchBuffer, const ResidencyContainer &allocationsForResidency) { this->processResidency(allocationsForResidency, 0u); - int ret = this->exec(batchBuffer, 0u, static_cast(osContext)->getDrmContextIds()[0]); + int ret = this->exec(batchBuffer, 0u, static_cast(osContext)->getDrmContextIds()[0], 0); return ret; } diff --git a/shared/source/os_interface/linux/drm_command_stream_xehp_and_later.inl b/shared/source/os_interface/linux/drm_command_stream_xehp_and_later.inl index 1f54edc677..140ca12468 100644 --- a/shared/source/os_interface/linux/drm_command_stream_xehp_and_later.inl +++ b/shared/source/os_interface/linux/drm_command_stream_xehp_and_later.inl @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2021 Intel Corporation + * Copyright (C) 2018-2022 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -32,7 +32,7 @@ int DrmCommandStreamReceiver::flushInternal(const BatchBuffer &batchB printf("Drm Submission of contextIndex: %u, with context id %u\n", contextIndex, drmContextIds[contextIndex]); } - int ret = this->exec(batchBuffer, tileIterator, drmContextIds[contextIndex]); + int ret = this->exec(batchBuffer, tileIterator, drmContextIds[contextIndex], contextIndex); if (ret) { return ret; } diff --git a/shared/source/os_interface/linux/drm_memory_manager.cpp b/shared/source/os_interface/linux/drm_memory_manager.cpp index 47157e8d6c..f98cbd54e6 100644 --- a/shared/source/os_interface/linux/drm_memory_manager.cpp +++ b/shared/source/os_interface/linux/drm_memory_manager.cpp @@ -1466,6 +1466,11 @@ bool DrmMemoryManager::retrieveMmapOffsetForBufferObject(uint32_t rootDeviceInde } bool DrmMemoryManager::allocationTypeForCompletionFence(GraphicsAllocation::AllocationType allocationType) { + int32_t overrideAllowAllAllocations = DebugManager.flags.UseDrmCompletionFenceForAllAllocations.get(); + bool allowAllAllocations = overrideAllowAllAllocations == -1 ? false : !!overrideAllowAllAllocations; + if (allowAllAllocations) { + return true; + } if (allocationType == GraphicsAllocation::AllocationType::COMMAND_BUFFER || allocationType == GraphicsAllocation::AllocationType::RING_BUFFER || allocationType == GraphicsAllocation::AllocationType::SEMAPHORE_BUFFER || @@ -1482,15 +1487,28 @@ void DrmMemoryManager::waitOnCompletionFence(GraphicsAllocation *allocation) { OsContext *osContext = engine.osContext; CommandStreamReceiver *csr = engine.commandStreamReceiver; + uint32_t activeHwContexts = csr->getActivePartitions(); auto osContextId = osContext->getContextId(); auto allocationTaskCount = allocation->getTaskCount(osContextId); - uint64_t completionFenceAddress = castToUint64(const_cast(csr->getTagAddress())) + Drm::completionFenceOffset; + uint64_t completionFenceAddress = castToUint64(const_cast(csr->getTagAddress())); + if (completionFenceAddress == 0) { + continue; + } if (allocation->isUsedByOsContext(osContextId)) { - uint32_t ctxId = static_cast(osContext)->getDrmContextIds()[0]; - constexpr int64_t timeout = -1; - constexpr uint16_t flags = 0; - getDrm(csr->getRootDeviceIndex()).waitUserFence(ctxId, completionFenceAddress, allocationTaskCount, Drm::ValueWidth::U32, timeout, flags); + completionFenceAddress += Drm::completionFenceOffset; + Drm &drm = getDrm(csr->getRootDeviceIndex()); + auto &ctxVector = static_cast(osContext)->getDrmContextIds(); + + for (uint32_t i = 0; i < activeHwContexts; i++) { + uint32_t *fenceValue = reinterpret_cast(completionFenceAddress); + if (*fenceValue < allocationTaskCount) { + constexpr int64_t timeout = -1; + constexpr uint16_t flags = 0; + drm.waitUserFence(ctxVector[i], completionFenceAddress, allocationTaskCount, Drm::ValueWidth::U32, timeout, flags); + } + completionFenceAddress += csr->getPostSyncWriteOffset(); + } } } } else { diff --git a/shared/source/os_interface/linux/drm_neo.cpp b/shared/source/os_interface/linux/drm_neo.cpp index e63b7fe931..c4141a3a84 100644 --- a/shared/source/os_interface/linux/drm_neo.cpp +++ b/shared/source/os_interface/linux/drm_neo.cpp @@ -197,10 +197,6 @@ Drm::Drm(std::unique_ptr &&hwDeviceIdIn, RootDeviceEnvironment &r hwDeviceId(std::move(hwDeviceIdIn)), rootDeviceEnvironment(rootDeviceEnvironment) { pagingFence.fill(0u); fenceVal.fill(0u); - int32_t overrideCompletionFence = DebugManager.flags.EnableDrmCompletionFence.get(); - if (overrideCompletionFence != -1) { - completionFenceSupported = !!overrideCompletionFence; - } } int Drm::ioctl(unsigned long request, void *arg) { @@ -1035,4 +1031,17 @@ bool Drm::queryEngineInfo(bool isSysmanEnabled) { return true; } +bool Drm::completionFenceSupport() { + std::call_once(checkCompletionFenceOnce, [this]() { + bool support = IoctlHelper::get(this)->completionFenceExtensionSupported(*getRootDeviceEnvironment().getHardwareInfo()); + int32_t overrideCompletionFence = DebugManager.flags.EnableDrmCompletionFence.get(); + if (overrideCompletionFence != -1) { + support = !!overrideCompletionFence; + } + + completionFenceSupported = support; + }); + return completionFenceSupported; +} + } // namespace NEO diff --git a/shared/source/os_interface/linux/drm_neo.h b/shared/source/os_interface/linux/drm_neo.h index 19835a7549..b18b1cbe41 100644 --- a/shared/source/os_interface/linux/drm_neo.h +++ b/shared/source/os_interface/linux/drm_neo.h @@ -252,9 +252,7 @@ class Drm : public DriverModel { } MOCKABLE_VIRTUAL std::vector getMemoryRegions(); - bool completionFenceSupport() const { - return completionFenceSupported; - } + MOCKABLE_VIRTUAL bool completionFenceSupport(); protected: Drm(std::unique_ptr &&hwDeviceIdIn, RootDeviceEnvironment &rootDeviceEnvironment); @@ -322,6 +320,7 @@ class Drm : public DriverModel { std::unique_ptr memoryInfo; std::once_flag checkBindOnce; + std::once_flag checkCompletionFenceOnce; RootDeviceEnvironment &rootDeviceEnvironment; uint64_t uuid = 0; diff --git a/shared/source/os_interface/linux/ioctl_helper.h b/shared/source/os_interface/linux/ioctl_helper.h index fa1c35de44..b37062738a 100644 --- a/shared/source/os_interface/linux/ioctl_helper.h +++ b/shared/source/os_interface/linux/ioctl_helper.h @@ -20,6 +20,7 @@ namespace NEO { class Drm; class IoctlHelper; enum class CacheRegion : uint16_t; +struct HardwareInfo; extern IoctlHelper *ioctlHelperFactory[IGFX_MAX_PRODUCT]; @@ -74,6 +75,7 @@ class IoctlHelper { virtual uint32_t queryDistances(Drm *drm, std::vector &queryItems, std::vector &distanceInfos) = 0; virtual int32_t getComputeEngineClass() = 0; virtual int execBuffer(Drm *drm, drm_i915_gem_execbuffer2 *execBuffer, uint64_t completionGpuAddress, uint32_t counterValue) = 0; + virtual bool completionFenceExtensionSupported(const HardwareInfo &hwInfo) = 0; }; class IoctlHelperUpstream : public IoctlHelper { @@ -96,6 +98,7 @@ class IoctlHelperUpstream : public IoctlHelper { uint32_t queryDistances(Drm *drm, std::vector &queryItems, std::vector &distanceInfos) override; int32_t getComputeEngineClass() override; int execBuffer(Drm *drm, drm_i915_gem_execbuffer2 *execBuffer, uint64_t completionGpuAddress, uint32_t counterValue) override; + bool completionFenceExtensionSupported(const HardwareInfo &hwInfo) override; }; template @@ -129,6 +132,7 @@ class IoctlHelperPrelim20 : public IoctlHelper { uint32_t queryDistances(Drm *drm, std::vector &queryItems, std::vector &distanceInfos) override; int32_t getComputeEngineClass() override; int execBuffer(Drm *drm, drm_i915_gem_execbuffer2 *execBuffer, uint64_t completionGpuAddress, uint32_t counterValue) override; + bool completionFenceExtensionSupported(const HardwareInfo &hwInfo) override; }; } // namespace NEO diff --git a/shared/source/os_interface/linux/ioctl_helper_prelim_extended.cpp b/shared/source/os_interface/linux/ioctl_helper_prelim_extended.cpp index 97a9de3e7c..a087c5c8ef 100644 --- a/shared/source/os_interface/linux/ioctl_helper_prelim_extended.cpp +++ b/shared/source/os_interface/linux/ioctl_helper_prelim_extended.cpp @@ -15,4 +15,8 @@ int IoctlHelperPrelim20::execBuffer(Drm *drm, drm_i915_gem_execbuffer2 *execBuff return ioctl(drm, DRM_IOCTL_I915_GEM_EXECBUFFER2, execBuffer); } +bool IoctlHelperPrelim20::completionFenceExtensionSupported(const HardwareInfo &hwInfo) { + return false; +} + } // namespace NEO diff --git a/shared/source/os_interface/linux/ioctl_helper_upstream.cpp b/shared/source/os_interface/linux/ioctl_helper_upstream.cpp index a743214a63..b243164b51 100644 --- a/shared/source/os_interface/linux/ioctl_helper_upstream.cpp +++ b/shared/source/os_interface/linux/ioctl_helper_upstream.cpp @@ -134,4 +134,8 @@ int IoctlHelperUpstream::execBuffer(Drm *drm, drm_i915_gem_execbuffer2 *execBuff return ioctl(drm, DRM_IOCTL_I915_GEM_EXECBUFFER2, execBuffer); } +bool IoctlHelperUpstream::completionFenceExtensionSupported(const HardwareInfo &hwInfo) { + return false; +} + } // namespace NEO diff --git a/shared/source/xe_hpg_core/hw_helper_xe_hpg_core.cpp b/shared/source/xe_hpg_core/hw_helper_xe_hpg_core.cpp index 6b2a2df591..bd0071d294 100644 --- a/shared/source/xe_hpg_core/hw_helper_xe_hpg_core.cpp +++ b/shared/source/xe_hpg_core/hw_helper_xe_hpg_core.cpp @@ -109,6 +109,11 @@ bool HwHelperHw::disableL3CacheForDebug(const HardwareInfo &hwInfo) cons return isWorkaroundRequired(REVISION_A0, REVISION_B, hwInfo); } +template <> +inline bool HwHelperHw::isLinuxCompletionFenceSupported() const { + return false; +} + template class HwHelperHw; template class FlatBatchBufferHelperHw; template struct MemorySynchronizationCommands; diff --git a/shared/test/common/mocks/linux/mock_drm_command_stream_receiver.h b/shared/test/common/mocks/linux/mock_drm_command_stream_receiver.h index db9f0cfc0b..36156c3ba3 100644 --- a/shared/test/common/mocks/linux/mock_drm_command_stream_receiver.h +++ b/shared/test/common/mocks/linux/mock_drm_command_stream_receiver.h @@ -20,10 +20,12 @@ class TestedDrmCommandStreamReceiver : public DrmCommandStreamReceivertiling_mode = mode; + this->tilingMode = mode; } void fillExecObject(drm_i915_gem_exec_object2 &execObject, OsContext *osContext, uint32_t vmHandleId, uint32_t drmContextId) override { @@ -42,12 +42,14 @@ class TestedBufferObject : public BufferObject { BufferObject *const residency[], size_t residencyCount, drm_i915_gem_exec_object2 *execObjectsStorage, uint64_t completionGpuAddress, uint32_t completionValue) override { this->receivedCompletionGpuAddress = completionGpuAddress; this->receivedCompletionValue = completionValue; + this->execCalled++; return BufferObject::exec(used, startOffset, flags, requiresCoherency, osContext, vmHandleId, drmContextId, residency, residencyCount, execObjectsStorage, completionGpuAddress, completionValue); } uint64_t receivedCompletionGpuAddress = 0; drm_i915_gem_exec_object2 *execObjectPointerFilled = nullptr; uint32_t receivedCompletionValue = 0; + uint32_t execCalled = 0; }; template