From 03185f71115621146d2eeccaed621386362f10c0 Mon Sep 17 00:00:00 2001 From: Mateusz Jablonski Date: Wed, 20 Apr 2022 17:32:39 +0000 Subject: [PATCH] feature direct submission: use tag allocation as a completion fence use tag allocation address as a completion address in exec call wait for completion value before destroying drm direct submission Related-To: NEO-6643 Signed-off-by: Mateusz Jablonski --- .../command_stream/command_stream_receiver.h | 2 +- .../direct_submission_hw.cpp | 1 + .../direct_submission/direct_submission_hw.h | 2 + .../direct_submission_hw.inl | 4 + .../linux/drm_direct_submission.h | 3 +- .../linux/drm_direct_submission.inl | 40 ++- .../libult/ult_command_stream_receiver.h | 1 + .../common/mocks/linux/mock_drm_allocation.h | 12 + .../common/mocks/mock_direct_submission_hw.h | 1 + .../mocks/mock_memory_operations_handler.h | 7 + .../direct_submission_tests_1.cpp | 48 +++ .../linux/drm_direct_submission_tests.cpp | 292 ++++++++++++++++++ 12 files changed, 408 insertions(+), 5 deletions(-) diff --git a/shared/source/command_stream/command_stream_receiver.h b/shared/source/command_stream/command_stream_receiver.h index bf6d15175d..46bc7b05cb 100644 --- a/shared/source/command_stream/command_stream_receiver.h +++ b/shared/source/command_stream/command_stream_receiver.h @@ -321,7 +321,7 @@ class CommandStreamReceiver { MOCKABLE_VIRTUAL bool isGpuHangDetected() const; - uint64_t getCompletionAddress() { + uint64_t getCompletionAddress() const { uint64_t completionFenceAddress = castToUint64(const_cast(getTagAddress())); if (completionFenceAddress == 0) { return 0; diff --git a/shared/source/direct_submission/direct_submission_hw.cpp b/shared/source/direct_submission/direct_submission_hw.cpp index c9fb0caa41..ad16d42c92 100644 --- a/shared/source/direct_submission/direct_submission_hw.cpp +++ b/shared/source/direct_submission/direct_submission_hw.cpp @@ -14,6 +14,7 @@ DirectSubmissionInputParams::DirectSubmissionInputParams(const CommandStreamRece memoryManager = commandStreamReceiver.getMemoryManager(); globalFenceAllocation = commandStreamReceiver.getGlobalFenceAllocation(); workPartitionAllocation = commandStreamReceiver.getWorkPartitionAllocation(); + completionFenceAllocation = commandStreamReceiver.getTagAllocation(); } } // namespace NEO diff --git a/shared/source/direct_submission/direct_submission_hw.h b/shared/source/direct_submission/direct_submission_hw.h index cc5a11b33f..45ff43692d 100644 --- a/shared/source/direct_submission/direct_submission_hw.h +++ b/shared/source/direct_submission/direct_submission_hw.h @@ -64,6 +64,7 @@ struct DirectSubmissionInputParams : NonCopyableClass { MemoryManager *memoryManager = nullptr; const GraphicsAllocation *globalFenceAllocation = nullptr; GraphicsAllocation *workPartitionAllocation = nullptr; + GraphicsAllocation *completionFenceAllocation = nullptr; const uint32_t rootDeviceIndex; }; @@ -160,6 +161,7 @@ class DirectSubmissionHw { MemoryOperationsHandler *memoryOperationHandler = nullptr; const HardwareInfo *hwInfo = nullptr; const GraphicsAllocation *globalFenceAllocation = nullptr; + GraphicsAllocation *completionFenceAllocation = nullptr; GraphicsAllocation *ringBuffer = nullptr; GraphicsAllocation *ringBuffer2 = nullptr; GraphicsAllocation *semaphores = nullptr; diff --git a/shared/source/direct_submission/direct_submission_hw.inl b/shared/source/direct_submission/direct_submission_hw.inl index 102085290b..a7244174a9 100644 --- a/shared/source/direct_submission/direct_submission_hw.inl +++ b/shared/source/direct_submission/direct_submission_hw.inl @@ -100,6 +100,10 @@ bool DirectSubmissionHw::allocateResources() { allocations.push_back(workPartitionAllocation); } + if (completionFenceAllocation != nullptr) { + allocations.push_back(completionFenceAllocation); + } + if (DebugManager.flags.DirectSubmissionPrintBuffers.get()) { printf("Ring buffer 1 - gpu address: %" PRIx64 " - %" PRIx64 ", cpu address: %p - %p, size: %zu \n", ringBuffer->getGpuAddress(), diff --git a/shared/source/direct_submission/linux/drm_direct_submission.h b/shared/source/direct_submission/linux/drm_direct_submission.h index 0ae396d1d2..b1d7c9062c 100644 --- a/shared/source/direct_submission/linux/drm_direct_submission.h +++ b/shared/source/direct_submission/linux/drm_direct_submission.h @@ -35,7 +35,8 @@ class DrmDirectSubmission : public DirectSubmissionHw { MOCKABLE_VIRTUAL void wait(uint32_t taskCountToWait); - TagData currentTagData; + TagData currentTagData{}; volatile uint32_t *tagAddress; + uint32_t completionFenceValue{}; }; } // namespace NEO diff --git a/shared/source/direct_submission/linux/drm_direct_submission.inl b/shared/source/direct_submission/linux/drm_direct_submission.inl index 613c330b8d..73bad97f9b 100644 --- a/shared/source/direct_submission/linux/drm_direct_submission.inl +++ b/shared/source/direct_submission/linux/drm_direct_submission.inl @@ -39,12 +39,17 @@ DrmDirectSubmission::DrmDirectSubmission(const DirectSubm this->partitionedMode = this->activeTiles > 1u; this->partitionConfigSet = !this->partitionedMode; - osContextLinux->getDrm().setDirectSubmissionActive(true); + auto &drm = osContextLinux->getDrm(); + drm.setDirectSubmissionActive(true); if (this->partitionedMode) { this->workPartitionAllocation = inputParams.workPartitionAllocation; UNRECOVERABLE_IF(this->workPartitionAllocation == nullptr); } + + if (drm.completionFenceSupport()) { + this->completionFenceAllocation = inputParams.completionFenceAllocation; + } } template @@ -53,6 +58,24 @@ inline DrmDirectSubmission::~DrmDirectSubmission() { this->stopRingBuffer(); this->wait(static_cast(this->currentTagData.tagValue)); } + if (this->completionFenceAllocation) { + auto osContextLinux = static_cast(&this->osContext); + auto &drm = osContextLinux->getDrm(); + auto &drmContextIds = osContextLinux->getDrmContextIds(); + uint32_t drmContextId = 0u; + auto completionFenceCpuAddress = reinterpret_cast(this->completionFenceAllocation->getUnderlyingBuffer()) + Drm::completionFenceOffset; + for (auto drmIterator = 0u; drmIterator < osContextLinux->getDeviceBitfield().size(); drmIterator++) { + if (osContextLinux->getDeviceBitfield().test(drmIterator)) { + if (*reinterpret_cast(completionFenceCpuAddress) < completionFenceValue) { + constexpr int64_t timeout = -1; + constexpr uint16_t flags = 0; + drm.waitUserFence(drmContextIds[drmContextId], completionFenceCpuAddress, completionFenceValue, Drm::ValueWidth::U32, timeout, flags); + } + drmContextId++; + completionFenceCpuAddress = ptrOffset(completionFenceCpuAddress, this->postSyncOffset); + } + } + } this->deallocateResources(); } @@ -81,6 +104,14 @@ bool DrmDirectSubmission::submit(uint64_t gpuAddress, siz bool ret = false; uint32_t drmContextId = 0u; + + uint32_t completionValue = 0u; + uint64_t completionFenceGpuAddress = 0u; + if (this->completionFenceAllocation) { + completionValue = ++completionFenceValue; + completionFenceGpuAddress = this->completionFenceAllocation->getGpuAddress() + Drm::completionFenceOffset; + } + for (auto drmIterator = 0u; drmIterator < osContextLinux->getDeviceBitfield().size(); drmIterator++) { if (osContextLinux->getDeviceBitfield().test(drmIterator)) { ret |= !!bb->exec(static_cast(size), @@ -93,9 +124,12 @@ bool DrmDirectSubmission::submit(uint64_t gpuAddress, siz nullptr, 0, &execObject, - 0, - 0); + completionFenceGpuAddress, + completionValue); drmContextId++; + if (completionFenceGpuAddress) { + completionFenceGpuAddress += this->postSyncOffset; + } } } diff --git a/shared/test/common/libult/ult_command_stream_receiver.h b/shared/test/common/libult/ult_command_stream_receiver.h index c5fcc013da..bf0d4cc900 100644 --- a/shared/test/common/libult/ult_command_stream_receiver.h +++ b/shared/test/common/libult/ult_command_stream_receiver.h @@ -122,6 +122,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw, publ using BaseClass::CommandStreamReceiver::useNotifyEnableForPostSync; using BaseClass::CommandStreamReceiver::userPauseConfirmation; using BaseClass::CommandStreamReceiver::waitForTaskCountAndCleanAllocationList; + using BaseClass::CommandStreamReceiver::workPartitionAllocation; UltCommandStreamReceiver(ExecutionEnvironment &executionEnvironment, uint32_t rootDeviceIndex, diff --git a/shared/test/common/mocks/linux/mock_drm_allocation.h b/shared/test/common/mocks/linux/mock_drm_allocation.h index 19718bc04b..b7966f7c88 100644 --- a/shared/test/common/mocks/linux/mock_drm_allocation.h +++ b/shared/test/common/mocks/linux/mock_drm_allocation.h @@ -18,8 +18,20 @@ class MockBufferObject : public BufferObject { using BufferObject::BufferObject; using BufferObject::handle; + struct ExecParams { + uint64_t completionGpuAddress = 0; + uint32_t completionValue = 0; + }; + + std::vector passedExecParams{}; MockBufferObject(Drm *drm) : BufferObject(drm, CommonConstants::unsupportedPatIndex, 0, 0, 1) { } + int exec(uint32_t used, size_t startOffset, unsigned int flags, bool requiresCoherency, OsContext *osContext, uint32_t vmHandleId, uint32_t drmContextId, + BufferObject *const residency[], size_t residencyCount, drm_i915_gem_exec_object2 *execObjectsStorage, uint64_t completionGpuAddress, uint32_t completionValue) override { + passedExecParams.push_back({completionGpuAddress, completionValue}); + return BufferObject::exec(used, startOffset, flags, requiresCoherency, osContext, vmHandleId, drmContextId, + residency, residencyCount, execObjectsStorage, completionGpuAddress, completionValue); + } }; class MockDrmAllocation : public DrmAllocation { diff --git a/shared/test/common/mocks/mock_direct_submission_hw.h b/shared/test/common/mocks/mock_direct_submission_hw.h index c3143d7a63..87cd0b9f80 100644 --- a/shared/test/common/mocks/mock_direct_submission_hw.h +++ b/shared/test/common/mocks/mock_direct_submission_hw.h @@ -17,6 +17,7 @@ struct MockDirectSubmissionHw : public DirectSubmissionHw using BaseClass = DirectSubmissionHw; using BaseClass::activeTiles; using BaseClass::allocateResources; + using BaseClass::completionFenceAllocation; using BaseClass::completionRingBuffers; using BaseClass::cpuCachelineFlush; using BaseClass::currentQueueWorkCount; diff --git a/shared/test/common/mocks/mock_memory_operations_handler.h b/shared/test/common/mocks/mock_memory_operations_handler.h index 48dbd6f4b7..6b833bb521 100644 --- a/shared/test/common/mocks/mock_memory_operations_handler.h +++ b/shared/test/common/mocks/mock_memory_operations_handler.h @@ -57,6 +57,11 @@ class MockMemoryOperations : public MemoryOperationsHandler { if (osContext) { makeResidentContextId = osContext->getContextId(); } + if (captureGfxAllocationsForMakeResident) { + for (auto &gfxAllocation : gfxAllocations) { + gfxAllocationsForMakeResident.push_back(gfxAllocation); + } + } return MemoryOperationsStatus::SUCCESS; } MemoryOperationsStatus evictWithinOsContext(OsContext *osContext, GraphicsAllocation &gfxAllocation) override { @@ -64,9 +69,11 @@ class MockMemoryOperations : public MemoryOperationsHandler { return MemoryOperationsStatus::SUCCESS; } + std::vector gfxAllocationsForMakeResident{}; int makeResidentCalledCount = 0; int evictCalledCount = 0; uint32_t makeResidentContextId = std::numeric_limits::max(); + bool captureGfxAllocationsForMakeResident = false; }; } // namespace NEO diff --git a/shared/test/unit_test/direct_submission/direct_submission_tests_1.cpp b/shared/test/unit_test/direct_submission/direct_submission_tests_1.cpp index 261ab1b744..81ade6107e 100644 --- a/shared/test/unit_test/direct_submission/direct_submission_tests_1.cpp +++ b/shared/test/unit_test/direct_submission/direct_submission_tests_1.cpp @@ -118,6 +118,54 @@ HWTEST_F(DirectSubmissionTest, givenDirectSubmissionWhenMakingResourcesResidentT pDevice->getRootDeviceEnvironmentRef().memoryOperationsInterface.release(); } +HWTEST_F(DirectSubmissionTest, givenDirectSubmissionWithoutCompletionFenceAllocationWhenAllocatingResourcesThenMakeResidentIsCalledForRingAndSemaphoreBuffers) { + auto mockMemoryOperations = std::make_unique(); + mockMemoryOperations->captureGfxAllocationsForMakeResident = true; + pDevice->getRootDeviceEnvironmentRef().memoryOperationsInterface.reset(mockMemoryOperations.get()); + + MockDirectSubmissionHw> directSubmission(*pDevice->getDefaultEngine().commandStreamReceiver); + + directSubmission.callBaseResident = true; + bool ret = directSubmission.initialize(true, false); + EXPECT_TRUE(ret); + EXPECT_EQ(nullptr, directSubmission.completionFenceAllocation); + + EXPECT_EQ(1, mockMemoryOperations->makeResidentCalledCount); + ASSERT_EQ(3u, mockMemoryOperations->gfxAllocationsForMakeResident.size()); + EXPECT_EQ(directSubmission.ringBuffer, mockMemoryOperations->gfxAllocationsForMakeResident[0]); + EXPECT_EQ(directSubmission.ringBuffer2, mockMemoryOperations->gfxAllocationsForMakeResident[1]); + EXPECT_EQ(directSubmission.semaphores, mockMemoryOperations->gfxAllocationsForMakeResident[2]); + + pDevice->getRootDeviceEnvironmentRef().memoryOperationsInterface.release(); +} + +HWTEST_F(DirectSubmissionTest, givenDirectSubmissionWithCompletionFenceAllocationWhenAllocatingResourcesThenMakeResidentIsCalledForRingAndSemaphoreBuffersAndCompletionFenceAllocation) { + auto mockMemoryOperations = std::make_unique(); + mockMemoryOperations->captureGfxAllocationsForMakeResident = true; + + pDevice->getRootDeviceEnvironmentRef().memoryOperationsInterface.reset(mockMemoryOperations.get()); + + MockDirectSubmissionHw> directSubmission(*pDevice->getDefaultEngine().commandStreamReceiver); + + MockGraphicsAllocation completionFenceAllocation{}; + + directSubmission.completionFenceAllocation = &completionFenceAllocation; + + directSubmission.callBaseResident = true; + bool ret = directSubmission.initialize(true, false); + EXPECT_TRUE(ret); + EXPECT_EQ(&completionFenceAllocation, directSubmission.completionFenceAllocation); + + EXPECT_EQ(1, mockMemoryOperations->makeResidentCalledCount); + ASSERT_EQ(4u, mockMemoryOperations->gfxAllocationsForMakeResident.size()); + EXPECT_EQ(directSubmission.ringBuffer, mockMemoryOperations->gfxAllocationsForMakeResident[0]); + EXPECT_EQ(directSubmission.ringBuffer2, mockMemoryOperations->gfxAllocationsForMakeResident[1]); + EXPECT_EQ(directSubmission.semaphores, mockMemoryOperations->gfxAllocationsForMakeResident[2]); + EXPECT_EQ(directSubmission.completionFenceAllocation, mockMemoryOperations->gfxAllocationsForMakeResident[3]); + + pDevice->getRootDeviceEnvironmentRef().memoryOperationsInterface.release(); +} + HWTEST_F(DirectSubmissionTest, givenDirectSubmissionInitializedWhenRingIsStartedThenExpectAllocationsCreatedAndCommandsDispatched) { MockDirectSubmissionHw> directSubmission(*pDevice->getDefaultEngine().commandStreamReceiver); EXPECT_TRUE(directSubmission.disableCpuCacheFlush); diff --git a/shared/test/unit_test/direct_submission/linux/drm_direct_submission_tests.cpp b/shared/test/unit_test/direct_submission/linux/drm_direct_submission_tests.cpp index 69ddb5c344..16eeb44062 100644 --- a/shared/test/unit_test/direct_submission/linux/drm_direct_submission_tests.cpp +++ b/shared/test/unit_test/direct_submission/linux/drm_direct_submission_tests.cpp @@ -17,6 +17,7 @@ #include "shared/test/common/helpers/variable_backup.h" #include "shared/test/common/libult/linux/drm_mock.h" #include "shared/test/common/libult/ult_command_stream_receiver.h" +#include "shared/test/common/mocks/linux/mock_drm_allocation.h" #include "shared/test/common/mocks/mock_device.h" #include "shared/test/common/os_interface/linux/drm_memory_manager_tests.h" #include "shared/test/common/test_macros/test.h" @@ -57,6 +58,8 @@ struct MockDrmDirectSubmission : public DrmDirectSubmission; using BaseClass::activeTiles; using BaseClass::allocateResources; + using BaseClass::completionFenceAllocation; + using BaseClass::completionFenceValue; using BaseClass::currentTagData; using BaseClass::disableMonitorFence; using BaseClass::dispatchSwitchRingBufferSection; @@ -70,6 +73,7 @@ struct MockDrmDirectSubmission : public DrmDirectSubmissiongetDefaultEngine().commandStreamReceiver; + auto drm = executionEnvironment.rootDeviceEnvironments[0]->osInterface->getDriverModel()->as(); + + ASSERT_TRUE(drm->completionFenceSupport()); + + auto expectedCompletionFenceAllocation = commandStreamReceiver.getTagAllocation(); + EXPECT_NE(nullptr, expectedCompletionFenceAllocation); + { + MockDrmDirectSubmission> directSubmission(commandStreamReceiver); + EXPECT_EQ(expectedCompletionFenceAllocation, directSubmission.completionFenceAllocation); + } + { + MockDrmDirectSubmission> directSubmission(commandStreamReceiver); + EXPECT_EQ(expectedCompletionFenceAllocation, directSubmission.completionFenceAllocation); + } +} + +HWTEST_F(DrmDirectSubmissionTest, givenNoCompletionFenceSupportWhenCreateDrmDirectSubmissionThenCompletionFenceAllocationIsNotSet) { + DebugManagerStateRestore restorer; + DebugManager.flags.EnableDrmCompletionFence.set(0); + auto &commandStreamReceiver = *device->getDefaultEngine().commandStreamReceiver; + auto drm = executionEnvironment.rootDeviceEnvironments[0]->osInterface->getDriverModel()->as(); + + ASSERT_FALSE(drm->completionFenceSupport()); + { + MockDrmDirectSubmission> directSubmission(commandStreamReceiver); + EXPECT_EQ(nullptr, directSubmission.completionFenceAllocation); + } + { + MockDrmDirectSubmission> directSubmission(commandStreamReceiver); + EXPECT_EQ(nullptr, directSubmission.completionFenceAllocation); + } +} + +HWTEST_F(DrmDirectSubmissionTest, givenDirectSubmissionWithoutCompletionFenceAllocationWhenDestroyingThenNoWaitForUserFenceIsCalled) { + DebugManagerStateRestore restorer; + DebugManager.flags.EnableDrmCompletionFence.set(0); + auto &commandStreamReceiver = *device->getDefaultEngine().commandStreamReceiver; + auto drm = static_cast(executionEnvironment.rootDeviceEnvironments[0]->osInterface->getDriverModel()->as()); + + ASSERT_FALSE(drm->completionFenceSupport()); + + drm->waitUserFenceParams.clear(); + { + MockDrmDirectSubmission> directSubmission(commandStreamReceiver); + directSubmission.completionFenceValue = 10; + } + + EXPECT_EQ(0u, drm->waitUserFenceParams.size()); +} + +HWTEST_F(DrmDirectSubmissionTest, givenCompletionFenceSupportAndFenceIsNotCompletedWhenDestroyingThenWaitForUserFenceIsCalled) { + DebugManagerStateRestore restorer; + DebugManager.flags.EnableDrmCompletionFence.set(1); + + auto &commandStreamReceiver = *device->getDefaultEngine().commandStreamReceiver; + auto drm = static_cast(executionEnvironment.rootDeviceEnvironments[0]->osInterface->getDriverModel()->as()); + + ASSERT_TRUE(drm->completionFenceSupport()); + + drm->waitUserFenceParams.clear(); + { + MockDrmDirectSubmission> directSubmission(commandStreamReceiver); + directSubmission.completionFenceValue = 10; + } + + EXPECT_EQ(osContext->getDrmContextIds().size(), drm->waitUserFenceParams.size()); +} + +HWTEST_F(DrmDirectSubmissionTest, givenCompletionFenceSupportAndFenceIsNotCompletedWhenWaitOnSpecificAddressesPerOsContext) { + DebugManagerStateRestore restorer; + DebugManager.flags.EnableDrmCompletionFence.set(1); + + auto &commandStreamReceiver = device->getUltCommandStreamReceiver(); + memset(commandStreamReceiver.getTagAllocation()->getUnderlyingBuffer(), 0, commandStreamReceiver.getTagAllocation()->getUnderlyingBufferSize()); + auto drm = static_cast(executionEnvironment.rootDeviceEnvironments[0]->osInterface->getDriverModel()->as()); + + ASSERT_TRUE(drm->completionFenceSupport()); + auto completionFenceBaseCpuAddress = reinterpret_cast(commandStreamReceiver.getTagAddress()) + Drm::completionFenceOffset; + uint32_t expectedCompletionValueToWait = 10u; + + { + DeviceBitfield firstTileBitfield{0b01}; + OsContextLinux osContext(*drm, 0u, + EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_RCS, EngineUsage::Regular}, + PreemptionMode::ThreadGroup, firstTileBitfield)); + osContext.ensureContextInitialized(); + commandStreamReceiver.setupContext(osContext); + drm->waitUserFenceParams.clear(); + { + MockDrmDirectSubmission> directSubmission(commandStreamReceiver); + directSubmission.completionFenceValue = expectedCompletionValueToWait; + } + EXPECT_EQ(1u, drm->waitUserFenceParams.size()); + EXPECT_EQ(expectedCompletionValueToWait, drm->waitUserFenceParams[0].value); + EXPECT_EQ(completionFenceBaseCpuAddress, drm->waitUserFenceParams[0].address); + } + { + DeviceBitfield secondTileBitfield{0b10}; + OsContextLinux osContext(*drm, 0u, + EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_RCS, EngineUsage::Regular}, + PreemptionMode::ThreadGroup, secondTileBitfield)); + osContext.ensureContextInitialized(); + commandStreamReceiver.setupContext(osContext); + drm->waitUserFenceParams.clear(); + { + MockDrmDirectSubmission> directSubmission(commandStreamReceiver); + directSubmission.completionFenceValue = expectedCompletionValueToWait; + } + EXPECT_EQ(1u, drm->waitUserFenceParams.size()); + EXPECT_EQ(expectedCompletionValueToWait, drm->waitUserFenceParams[0].value); + EXPECT_EQ(completionFenceBaseCpuAddress, drm->waitUserFenceParams[0].address); + } + + { + DeviceBitfield twoTilesBitfield{0b11}; + OsContextLinux osContext(*drm, 0u, + EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_RCS, EngineUsage::Regular}, + PreemptionMode::ThreadGroup, twoTilesBitfield)); + osContext.ensureContextInitialized(); + commandStreamReceiver.setupContext(osContext); + drm->waitUserFenceParams.clear(); + MockGraphicsAllocation workPartitionAllocation{}; + commandStreamReceiver.workPartitionAllocation = &workPartitionAllocation; + { + MockDrmDirectSubmission> directSubmission(commandStreamReceiver); + directSubmission.completionFenceValue = expectedCompletionValueToWait; + } + commandStreamReceiver.workPartitionAllocation = nullptr; + + EXPECT_EQ(2u, drm->waitUserFenceParams.size()); + EXPECT_EQ(expectedCompletionValueToWait, drm->waitUserFenceParams[0].value); + EXPECT_EQ(completionFenceBaseCpuAddress, drm->waitUserFenceParams[0].address); + + EXPECT_EQ(expectedCompletionValueToWait, drm->waitUserFenceParams[1].value); + EXPECT_EQ(completionFenceBaseCpuAddress + commandStreamReceiver.getPostSyncWriteOffset(), drm->waitUserFenceParams[1].address); + } + commandStreamReceiver.setupContext(*osContext); +} + +HWTEST_F(DrmDirectSubmissionTest, givenNoCompletionFenceSupportWhenSubmittingThenNoCompletionAddressIsPassedToExec) { + uint64_t gpuAddress = 0x1000; + size_t size = 0x1000; + + MockDrmDirectSubmission> drmDirectSubmission(*device->getDefaultEngine().commandStreamReceiver); + drmDirectSubmission.completionFenceAllocation = nullptr; + EXPECT_TRUE(drmDirectSubmission.allocateResources()); + auto ringBuffer = static_cast(drmDirectSubmission.ringBuffer); + auto initialBO = ringBuffer->getBufferObjectToModify(0); + + auto drm = executionEnvironment.rootDeviceEnvironments[0]->osInterface->getDriverModel()->as(); + MockBufferObject mockBO(drm); + ringBuffer->getBufferObjectToModify(0) = &mockBO; + + for (auto i = 0; i < 2; i++) { + mockBO.passedExecParams.clear(); + EXPECT_TRUE(drmDirectSubmission.submit(gpuAddress, size)); + + ASSERT_EQ(1u, mockBO.passedExecParams.size()); + EXPECT_EQ(0u, mockBO.passedExecParams[0].completionGpuAddress); + EXPECT_EQ(0u, mockBO.passedExecParams[0].completionValue); + } + ringBuffer->getBufferObjectToModify(0) = initialBO; +} + +HWTEST_F(DrmDirectSubmissionTest, givenTile0AndCompletionFenceSupportWhenSubmittingThenCompletionAddressAndValueArePassedToExec) { + uint64_t gpuAddress = 0x1000; + size_t size = 0x1000; + + auto &commandStreamReceiver = *device->getDefaultEngine().commandStreamReceiver; + auto drm = executionEnvironment.rootDeviceEnvironments[0]->osInterface->getDriverModel()->as(); + auto completionFenceBaseGpuAddress = commandStreamReceiver.getTagAllocation()->getGpuAddress() + Drm::completionFenceOffset; + + DeviceBitfield firstTileBitfield{0b01}; + OsContextLinux osContextTile0(*drm, 0u, + EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_RCS, EngineUsage::Regular}, + PreemptionMode::ThreadGroup, firstTileBitfield)); + osContextTile0.ensureContextInitialized(); + commandStreamReceiver.setupContext(osContextTile0); + + MockDrmDirectSubmission> drmDirectSubmission(commandStreamReceiver); + drmDirectSubmission.completionFenceAllocation = commandStreamReceiver.getTagAllocation(); + EXPECT_TRUE(drmDirectSubmission.allocateResources()); + auto ringBuffer = static_cast(drmDirectSubmission.ringBuffer); + auto initialBO = ringBuffer->getBufferObjectToModify(0); + + MockBufferObject mockBO(drm); + ringBuffer->getBufferObjectToModify(0) = &mockBO; + + for (auto i = 0u; i < 2; i++) { + mockBO.passedExecParams.clear(); + EXPECT_TRUE(drmDirectSubmission.submit(gpuAddress, size)); + + ASSERT_EQ(1u, mockBO.passedExecParams.size()); + EXPECT_EQ(completionFenceBaseGpuAddress, mockBO.passedExecParams[0].completionGpuAddress); + EXPECT_EQ(i + 1, mockBO.passedExecParams[0].completionValue); + } + ringBuffer->getBufferObjectToModify(0) = initialBO; + + commandStreamReceiver.setupContext(*osContext); +} + +HWTEST_F(DrmDirectSubmissionTest, givenTile1AndCompletionFenceSupportWhenSubmittingThenCompletionAddressAndValueArePassedToExec) { + uint64_t gpuAddress = 0x1000; + size_t size = 0x1000; + + auto &commandStreamReceiver = *device->getDefaultEngine().commandStreamReceiver; + auto drm = executionEnvironment.rootDeviceEnvironments[0]->osInterface->getDriverModel()->as(); + auto completionFenceBaseGpuAddress = commandStreamReceiver.getTagAllocation()->getGpuAddress() + Drm::completionFenceOffset; + + DeviceBitfield secondTileBitfield{0b10}; + OsContextLinux osContextTile1(*drm, 0u, + EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_RCS, EngineUsage::Regular}, + PreemptionMode::ThreadGroup, secondTileBitfield)); + osContextTile1.ensureContextInitialized(); + commandStreamReceiver.setupContext(osContextTile1); + + MockDrmDirectSubmission> drmDirectSubmission(commandStreamReceiver); + drmDirectSubmission.completionFenceAllocation = commandStreamReceiver.getTagAllocation(); + EXPECT_TRUE(drmDirectSubmission.allocateResources()); + auto ringBuffer = static_cast(drmDirectSubmission.ringBuffer); + auto initialBO = ringBuffer->getBufferObjectToModify(0); + + MockBufferObject mockBO(drm); + ringBuffer->getBufferObjectToModify(0) = &mockBO; + + for (auto i = 0u; i < 2; i++) { + mockBO.passedExecParams.clear(); + EXPECT_TRUE(drmDirectSubmission.submit(gpuAddress, size)); + + ASSERT_EQ(1u, mockBO.passedExecParams.size()); + EXPECT_EQ(completionFenceBaseGpuAddress, mockBO.passedExecParams[0].completionGpuAddress); + EXPECT_EQ(i + 1, mockBO.passedExecParams[0].completionValue); + } + ringBuffer->getBufferObjectToModify(0) = initialBO; + + commandStreamReceiver.setupContext(*osContext); +} + +HWTEST_F(DrmDirectSubmissionTest, givenTwoTilesAndCompletionFenceSupportWhenSubmittingThenCompletionAddressAndValueArePassedToExec) { + uint64_t gpuAddress = 0x1000; + size_t size = 0x1000; + + auto &commandStreamReceiver = device->getUltCommandStreamReceiver(); + auto drm = executionEnvironment.rootDeviceEnvironments[0]->osInterface->getDriverModel()->as(); + auto completionFenceBaseGpuAddress = commandStreamReceiver.getTagAllocation()->getGpuAddress() + Drm::completionFenceOffset; + + DeviceBitfield twoTilesBitfield{0b11}; + OsContextLinux osContextBothTiles(*drm, 0u, + EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_RCS, EngineUsage::Regular}, + PreemptionMode::ThreadGroup, twoTilesBitfield)); + osContextBothTiles.ensureContextInitialized(); + commandStreamReceiver.setupContext(osContextBothTiles); + + MockGraphicsAllocation workPartitionAllocation{}; + commandStreamReceiver.workPartitionAllocation = &workPartitionAllocation; + + MockDrmDirectSubmission> drmDirectSubmission(commandStreamReceiver); + + commandStreamReceiver.workPartitionAllocation = nullptr; + + drmDirectSubmission.completionFenceAllocation = commandStreamReceiver.getTagAllocation(); + EXPECT_TRUE(drmDirectSubmission.allocateResources()); + auto ringBuffer = static_cast(drmDirectSubmission.ringBuffer); + auto initialBO = ringBuffer->getBufferObjectToModify(0); + + MockBufferObject mockBO(drm); + ringBuffer->getBufferObjectToModify(0) = &mockBO; + + for (auto i = 0u; i < 2; i++) { + mockBO.passedExecParams.clear(); + EXPECT_TRUE(drmDirectSubmission.submit(gpuAddress, size)); + + ASSERT_EQ(2u, mockBO.passedExecParams.size()); + EXPECT_EQ(completionFenceBaseGpuAddress, mockBO.passedExecParams[0].completionGpuAddress); + EXPECT_EQ(i + 1, mockBO.passedExecParams[0].completionValue); + + EXPECT_EQ(completionFenceBaseGpuAddress + commandStreamReceiver.getPostSyncWriteOffset(), mockBO.passedExecParams[1].completionGpuAddress); + EXPECT_EQ(i + 1, mockBO.passedExecParams[1].completionValue); + } + ringBuffer->getBufferObjectToModify(0) = initialBO; + + commandStreamReceiver.setupContext(*osContext); +} + HWTEST_F(DrmDirectSubmissionTest, givenDisabledMonitorFenceWhenDispatchSwitchRingBufferThenDispatchPipeControl) { using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; using Dispatcher = RenderDispatcher;