diff --git a/opencl/source/os_interface/linux/drm_command_stream.h b/opencl/source/os_interface/linux/drm_command_stream.h index 730979b9ac..6c3b807b6c 100644 --- a/opencl/source/os_interface/linux/drm_command_stream.h +++ b/opencl/source/os_interface/linux/drm_command_stream.h @@ -24,6 +24,8 @@ class DrmCommandStreamReceiver : public DeviceCommandStreamReceiver { protected: typedef DeviceCommandStreamReceiver BaseClass; using CommandStreamReceiverHw::CommandStreamReceiver::getTagAddress; + using CommandStreamReceiverHw::CommandStreamReceiver::getTagAllocation; + using CommandStreamReceiverHw::CommandStreamReceiver::taskCount; using BaseClass::getScratchPatchAddress; using BaseClass::makeNonResident; using BaseClass::makeResident; @@ -61,10 +63,14 @@ class DrmCommandStreamReceiver : public DeviceCommandStreamReceiver { protected: MOCKABLE_VIRTUAL void flushInternal(const BatchBuffer &batchBuffer, const ResidencyContainer &allocationsForResidency); MOCKABLE_VIRTUAL void exec(const BatchBuffer &batchBuffer, uint32_t vmHandleId, uint32_t drmContextId); + MOCKABLE_VIRTUAL int waitUserFence(uint32_t waitValue); std::vector residency; std::vector execObjectsStorage; Drm *drm; gemCloseWorkerMode gemCloseWorkerOperationMode; + + bool useUserFenceWait = false; + bool useContextForUserFenceWait = false; }; } // namespace NEO diff --git a/opencl/source/os_interface/linux/drm_command_stream.inl b/opencl/source/os_interface/linux/drm_command_stream.inl index 4d6662fd1c..793d51dc33 100644 --- a/opencl/source/os_interface/linux/drm_command_stream.inl +++ b/opencl/source/os_interface/linux/drm_command_stream.inl @@ -52,6 +52,12 @@ DrmCommandStreamReceiver::DrmCommandStreamReceiver(ExecutionEnvironme if (DebugManager.flags.CsrDispatchMode.get()) { this->dispatchMode = static_cast(DebugManager.flags.CsrDispatchMode.get()); } + if (DebugManager.flags.EnableUserFenceForCompletionWait.get() == 1) { + useUserFenceWait = true; + } + if (DebugManager.flags.EnableUserFenceUseCtxId.get() == 1) { + useContextForUserFenceWait = true; + } } template @@ -93,7 +99,11 @@ bool DrmCommandStreamReceiver::flush(BatchBuffer &batchBuffer, Reside return this->blitterDirectSubmission->dispatchCommandBuffer(batchBuffer, *this->flushStamp.get()); } - this->flushStamp->setStamp(bb->peekHandle()); + if (useUserFenceWait) { + this->flushStamp->setStamp(taskCount + 1); + } else { + this->flushStamp->setStamp(bb->peekHandle()); + } this->flushInternal(batchBuffer, allocationsForResidency); if (this->gemCloseWorkerOperationMode == gemCloseWorkerMode::gemCloseWorkerActive) { @@ -193,11 +203,13 @@ GmmPageTableMngr *DrmCommandStreamReceiver::createPageTableManager() template bool DrmCommandStreamReceiver::waitForFlushStamp(FlushStamp &flushStamp) { - drm_i915_gem_wait wait = {}; - wait.bo_handle = static_cast(flushStamp); - wait.timeout_ns = -1; + auto waitValue = static_cast(flushStamp); + if (useUserFenceWait) { + waitUserFence(waitValue); + } else { + this->drm->waitHandle(waitValue); + } - drm->ioctl(DRM_IOCTL_I915_GEM_WAIT, &wait); return true; } diff --git a/opencl/source/os_interface/linux/drm_command_stream_bdw_plus.inl b/opencl/source/os_interface/linux/drm_command_stream_bdw_plus.inl index 84d090d209..ae28c23ddd 100644 --- a/opencl/source/os_interface/linux/drm_command_stream_bdw_plus.inl +++ b/opencl/source/os_interface/linux/drm_command_stream_bdw_plus.inl @@ -1,5 +1,5 @@ /* - * Copyright (C) 2019-2020 Intel Corporation + * Copyright (C) 2019-2021 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -17,4 +17,13 @@ void DrmCommandStreamReceiver::flushInternal(const BatchBuffer &batch this->exec(batchBuffer, 0u, static_cast(osContext)->getDrmContextIds()[0]); } +template +int DrmCommandStreamReceiver::waitUserFence(uint32_t waitValue) { + uint32_t ctxId = 0u; + if (useContextForUserFenceWait) { + ctxId = static_cast(osContext)->getDrmContextIds()[0]; + } + return this->drm->waitUserFence(ctxId, getTagAllocation()->getGpuAddress(), waitValue, Drm::ValueWidth::U32); +} + } // namespace NEO diff --git a/opencl/test/unit_test/mocks/linux/mock_drm_command_stream_receiver.h b/opencl/test/unit_test/mocks/linux/mock_drm_command_stream_receiver.h index 0fad1fbd14..c16b37f09b 100644 --- a/opencl/test/unit_test/mocks/linux/mock_drm_command_stream_receiver.h +++ b/opencl/test/unit_test/mocks/linux/mock_drm_command_stream_receiver.h @@ -18,11 +18,15 @@ class TestedDrmCommandStreamReceiver : public DrmCommandStreamReceiver::residency; + using DrmCommandStreamReceiver::useContextForUserFenceWait; + using DrmCommandStreamReceiver::useUserFenceWait; using CommandStreamReceiverHw::directSubmission; using CommandStreamReceiverHw::blitterDirectSubmission; using CommandStreamReceiverHw::CommandStreamReceiver::lastSentSliceCount; @@ -74,4 +78,24 @@ class TestedDrmCommandStreamReceiver : public DrmCommandStreamReceiver::waitUserFence(waitValue); + } else { + return waitUserFenceResult.returnValue; + } + } }; diff --git a/opencl/test/unit_test/os_interface/linux/device_command_stream_fixture.h b/opencl/test/unit_test/os_interface/linux/device_command_stream_fixture.h index 3fecb1eac2..c27a54ec18 100644 --- a/opencl/test/unit_test/os_interface/linux/device_command_stream_fixture.h +++ b/opencl/test/unit_test/os_interface/linux/device_command_stream_fixture.h @@ -350,4 +350,24 @@ class DrmMockCustom : public Drm { int getErrno() override { return errnoValue; } + + struct WaitUserFenceCall { + uint64_t address = 0u; + uint64_t value = 0u; + uint32_t ctxId = 0u; + ValueWidth dataWidth = ValueWidth::U8; + + uint32_t called = 0u; + }; + + WaitUserFenceCall waitUserFenceCall{}; + + int waitUserFence(uint32_t ctxId, uint64_t address, uint64_t value, ValueWidth dataWidth) override { + waitUserFenceCall.called++; + waitUserFenceCall.ctxId = ctxId; + waitUserFenceCall.address = address; + waitUserFenceCall.dataWidth = dataWidth; + waitUserFenceCall.value = value; + return Drm::waitUserFence(ctxId, address, value, dataWidth); + } }; diff --git a/opencl/test/unit_test/os_interface/linux/drm_command_stream_tests.cpp b/opencl/test/unit_test/os_interface/linux/drm_command_stream_tests.cpp index 31e89a88b2..11648f1c2f 100644 --- a/opencl/test/unit_test/os_interface/linux/drm_command_stream_tests.cpp +++ b/opencl/test/unit_test/os_interface/linux/drm_command_stream_tests.cpp @@ -1663,3 +1663,82 @@ HWTEST_TEMPLATED_F(DrmCommandStreamTest, givenPageTableManagerAndMapFalseWhenUpd EXPECT_TRUE(result); } + +HWTEST_TEMPLATED_F(DrmCommandStreamEnhancedTest, givenWaitUserFenceFlagSetWhenDrmCsrFlushedThenExpectTaskCountPlusOneStoredAsFlushStamp) { + DebugManagerStateRestore restorer; + DebugManager.flags.EnableUserFenceForCompletionWait.set(1); + + TestedDrmCommandStreamReceiver *testedCsr = + new TestedDrmCommandStreamReceiver(gemCloseWorkerMode::gemCloseWorkerInactive, + *this->executionEnvironment, + 1); + EXPECT_TRUE(testedCsr->useUserFenceWait); + device->resetCommandStreamReceiver(testedCsr); + + auto commandBuffer = mm->allocateGraphicsMemoryWithProperties(MockAllocationProperties{testedCsr->getRootDeviceIndex(), MemoryConstants::pageSize}); + ASSERT_NE(nullptr, commandBuffer); + LinearStream cs(commandBuffer); + + CommandStreamReceiverHw::addBatchBufferEnd(cs, nullptr); + CommandStreamReceiverHw::alignToCacheLine(cs); + BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 0, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; + + testedCsr->taskCount = 10u; + testedCsr->flush(batchBuffer, testedCsr->getResidencyAllocations()); + + EXPECT_EQ(11u, testedCsr->flushStamp->peekStamp()); + + mm->freeGraphicsMemory(commandBuffer); +} + +HWTEST_TEMPLATED_F(DrmCommandStreamEnhancedTest, givenWaitUserFenceFlagSetWhenDrmCsrThenExpectUseDrmWaitUserFenceCallWithZeroContext) { + DebugManagerStateRestore restorer; + DebugManager.flags.EnableUserFenceForCompletionWait.set(1); + + TestedDrmCommandStreamReceiver *testedCsr = + new TestedDrmCommandStreamReceiver(gemCloseWorkerMode::gemCloseWorkerInactive, + *this->executionEnvironment, + 1); + EXPECT_TRUE(testedCsr->useUserFenceWait); + EXPECT_FALSE(testedCsr->useContextForUserFenceWait); + device->resetCommandStreamReceiver(testedCsr); + + FlushStamp handleToWait = 123; + testedCsr->waitForFlushStamp(handleToWait); + + EXPECT_EQ(1u, testedCsr->waitUserFenceResult.called); + EXPECT_EQ(123u, testedCsr->waitUserFenceResult.waitValue); + EXPECT_EQ(0u, mock->waitUserFenceCall.ctxId); + EXPECT_EQ(1u, mock->waitUserFenceCall.called); + EXPECT_EQ(Drm::ValueWidth::U32, mock->waitUserFenceCall.dataWidth); +} + +HWTEST_TEMPLATED_F(DrmCommandStreamEnhancedTest, givenWaitUserFenceAndUseCtxFlagsSetWhenDrmCsrThenExpectUseDrmWaitUserFenceCallWithNonZeroContext) { + DebugManagerStateRestore restorer; + DebugManager.flags.EnableUserFenceForCompletionWait.set(1); + DebugManager.flags.EnableUserFenceUseCtxId.set(1); + + TestedDrmCommandStreamReceiver *testedCsr = + new TestedDrmCommandStreamReceiver(gemCloseWorkerMode::gemCloseWorkerInactive, + *this->executionEnvironment, + 1); + EXPECT_TRUE(testedCsr->useUserFenceWait); + EXPECT_TRUE(testedCsr->useContextForUserFenceWait); + device->resetCommandStreamReceiver(testedCsr); + + auto osContextLinux = static_cast(device->getDefaultEngine().osContext); + std::vector &drmCtxIds = const_cast &>(osContextLinux->getDrmContextIds()); + size_t drmCtxSize = drmCtxIds.size(); + for (uint32_t i = 0; i < drmCtxSize; i++) { + drmCtxIds[i] = 5u + i; + } + + FlushStamp handleToWait = 123; + testedCsr->waitForFlushStamp(handleToWait); + + EXPECT_EQ(1u, testedCsr->waitUserFenceResult.called); + EXPECT_EQ(123u, testedCsr->waitUserFenceResult.waitValue); + EXPECT_NE(0u, mock->waitUserFenceCall.ctxId); + EXPECT_EQ(1u, mock->waitUserFenceCall.called); + EXPECT_EQ(Drm::ValueWidth::U32, mock->waitUserFenceCall.dataWidth); +} diff --git a/opencl/test/unit_test/test_files/igdrcl.config b/opencl/test/unit_test/test_files/igdrcl.config index fd48ffe877..275ad67e02 100644 --- a/opencl/test/unit_test/test_files/igdrcl.config +++ b/opencl/test/unit_test/test_files/igdrcl.config @@ -247,3 +247,5 @@ DebugApiUsed = 0 ForceHostPointerImport = -1 OverrideMaxWorkGroupCount = -1 UseUmKmDataTranslator = 0 +EnableUserFenceForCompletionWait = -1 +EnableUserFenceUseCtxId = -1 diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index 853a8aea83..29237f4464 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -230,6 +230,8 @@ DECLARE_DEBUG_VARIABLE(int32_t, UseCyclesPerSecondTimer, 0, "0: default behavior DECLARE_DEBUG_VARIABLE(int32_t, WaitLoopCount, -1, "-1: use default, >=0: number of iterations in wait loop") DECLARE_DEBUG_VARIABLE(int32_t, GTPinAllocateBufferInSharedMemory, -1, "Force GTPin to allocate buffer in shared memory") DECLARE_DEBUG_VARIABLE(int32_t, AlignLocalMemoryVaTo2MB, -1, "Allow 2MB pages for allocations with size>=2MB. On Linux it means aligned VA, on Windows it means aligned size. -1: default, 0: disabled, 1: enabled") +DECLARE_DEBUG_VARIABLE(int32_t, EnableUserFenceForCompletionWait, -1, "-1: default (disabled), 0: disable, 1: enable : Use Wait User Fence instead Gem Wait") +DECLARE_DEBUG_VARIABLE(int32_t, EnableUserFenceUseCtxId, -1, "-1: default (disabled), 0: disable, 1: enable : Use Context Id in Wait User Fence when waiting for completion tag") /*EXPERIMENTAL TOGGLES*/ DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalEnableCustomLocalMemoryAlignment, 0, "Align local memory allocations to a given value. Works only with allocations at least as big as the value. 0: no effect, 2097152: 2 megabytes, 1073741824: 1 gigabyte") diff --git a/shared/source/os_interface/linux/drm_neo.cpp b/shared/source/os_interface/linux/drm_neo.cpp index 1231e11e84..828465cf48 100644 --- a/shared/source/os_interface/linux/drm_neo.cpp +++ b/shared/source/os_interface/linux/drm_neo.cpp @@ -743,4 +743,12 @@ const std::vector &Drm::getSliceMappings(uint32_t deviceIndex) { return topologyMap[deviceIndex].sliceIndices; } +int Drm::waitHandle(uint32_t waitHandle) { + drm_i915_gem_wait wait = {}; + wait.bo_handle = waitHandle; + wait.timeout_ns = -1; + + return ioctl(DRM_IOCTL_I915_GEM_WAIT, &wait); +} + } // namespace NEO diff --git a/shared/source/os_interface/linux/drm_neo.h b/shared/source/os_interface/linux/drm_neo.h index a54237832f..5dc49e3b8c 100644 --- a/shared/source/os_interface/linux/drm_neo.h +++ b/shared/source/os_interface/linux/drm_neo.h @@ -204,6 +204,15 @@ class Drm : public DriverModel { uint64_t getNextFenceVal(uint32_t vmHandleId) { return ++fenceVal[vmHandleId]; } uint64_t *getFenceAddr(uint32_t vmHandleId) { return &pagingFence[vmHandleId]; } + int waitHandle(uint32_t waitHandle); + enum class ValueWidth : uint32_t { + U8, + U16, + U32, + U64 + }; + MOCKABLE_VIRTUAL int waitUserFence(uint32_t ctxId, uint64_t address, uint64_t value, ValueWidth dataWidth); + void setNewResourceBound(bool value) { this->newResourceBound = value; }; bool getNewResourceBound() { return this->newResourceBound; }; diff --git a/shared/source/os_interface/linux/drm_query.cpp b/shared/source/os_interface/linux/drm_query.cpp index e71210da47..0bb11f98bd 100644 --- a/shared/source/os_interface/linux/drm_query.cpp +++ b/shared/source/os_interface/linux/drm_query.cpp @@ -75,6 +75,10 @@ int Drm::unbindBufferObject(OsContext *osContext, uint32_t vmHandleId, BufferObj void Drm::waitForBind(uint32_t vmHandleId) { } +int Drm::waitUserFence(uint32_t ctx, uint64_t address, uint64_t value, ValueWidth dataWidth) { + return 0; +} + bool Drm::isVmBindAvailable() { return this->bindAvailable; } diff --git a/shared/source/os_interface/linux/drm_query_dg1.cpp b/shared/source/os_interface/linux/drm_query_dg1.cpp index db0d44a2d5..06455a8b4f 100644 --- a/shared/source/os_interface/linux/drm_query_dg1.cpp +++ b/shared/source/os_interface/linux/drm_query_dg1.cpp @@ -84,6 +84,10 @@ int Drm::unbindBufferObject(OsContext *osContext, uint32_t vmHandleId, BufferObj void Drm::waitForBind(uint32_t vmHandleId) { } +int Drm::waitUserFence(uint32_t ctx, uint64_t address, uint64_t value, ValueWidth dataWidth) { + return 0; +} + bool Drm::isVmBindAvailable() { return this->bindAvailable; }