From fed90f5c8ea82c556a7d1ff2371fe7856dd00eb3 Mon Sep 17 00:00:00 2001 From: Lukasz Jobczyk Date: Fri, 21 Jun 2024 12:00:59 +0000 Subject: [PATCH] fix: Add infrastructure to force dc flush when mitigate dc -force dc on next tag update after RT kernel -force dc when release shared object Related-To: NEO-10556 Signed-off-by: Lukasz Jobczyk --- level_zero/core/source/cmdlist/cmdlist.cpp | 7 +++ level_zero/core/source/cmdlist/cmdlist.h | 3 + .../source/cmdlist/cmdlist_hw_immediate.inl | 1 + .../cmdlist/cmdlist_hw_xehp_and_later.inl | 4 ++ .../core/source/cmdqueue/cmdqueue_hw.inl | 3 + .../core/test/unit_tests/mocks/mock_cmdlist.h | 1 + .../cmdlist/test_cmdlist_xehp_and_later.cpp | 63 +++++++++++++++++++ opencl/source/command_queue/command_queue.cpp | 9 ++- .../command_queue/command_queue_tests.cpp | 32 ++++++++++ .../command_stream/command_stream_receiver.h | 11 ++++ .../command_stream_receiver_hw_base.inl | 4 +- .../libult/ult_command_stream_receiver.h | 8 +++ .../command_stream_receiver_tests.cpp | 31 +++++++++ 13 files changed, 174 insertions(+), 3 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist.cpp b/level_zero/core/source/cmdlist/cmdlist.cpp index 9ddb29a10f..fe043745c2 100644 --- a/level_zero/core/source/cmdlist/cmdlist.cpp +++ b/level_zero/core/source/cmdlist/cmdlist.cpp @@ -89,6 +89,13 @@ void CommandList::removeMemoryPrefetchAllocations() { } } +void CommandList::registerCsrDcFlushForDcMitigation(NEO::CommandStreamReceiver &csr) { + if (this->requiresDcFlushForDcMitigation) { + csr.registerDcFlushForDcMitigation(); + this->requiresDcFlushForDcMitigation = false; + } +} + NEO::GraphicsAllocation *CommandList::getAllocationFromHostPtrMap(const void *buffer, uint64_t bufferSize, bool copyOffload) { auto allocation = hostPtrMap.lower_bound(buffer); if (allocation != hostPtrMap.end()) { diff --git a/level_zero/core/source/cmdlist/cmdlist.h b/level_zero/core/source/cmdlist/cmdlist.h index fed044d505..93e9545b19 100644 --- 
a/level_zero/core/source/cmdlist/cmdlist.h +++ b/level_zero/core/source/cmdlist/cmdlist.h @@ -369,6 +369,8 @@ struct CommandList : _ze_command_list_handle_t { return taskCountUpdateFenceRequired; } + void registerCsrDcFlushForDcMitigation(NEO::CommandStreamReceiver &csr); + protected: NEO::GraphicsAllocation *getAllocationFromHostPtrMap(const void *buffer, uint64_t bufferSize, bool copyOffload); NEO::GraphicsAllocation *getHostPtrAlloc(const void *buffer, uint64_t bufferSize, bool hostCopyAllowed, bool copyOffload); @@ -455,6 +457,7 @@ struct CommandList : _ze_command_list_handle_t { bool heaplessStateInitEnabled = false; bool scratchAddressPatchingEnabled = false; bool taskCountUpdateFenceRequired = false; + bool requiresDcFlushForDcMitigation = false; }; using CommandListAllocatorFn = CommandList *(*)(uint32_t); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl index 2c1b7abc23..cf34bf2056 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl @@ -434,6 +434,7 @@ inline ze_result_t CommandListCoreFamilyImmediate::executeCommand if (cmdQ->peekIsCopyOnlyCommandQueue()) { completionStamp = flushBcsTask(*commandStream, commandStreamStart, hasStallingCmds, hasRelaxedOrderingDependencies, csr); } else { + this->registerCsrDcFlushForDcMitigation(*csr); completionStamp = (this->*computeFlushMethod)(*commandStream, commandStreamStart, hasStallingCmds, hasRelaxedOrderingDependencies, kernelOperation); } diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl index 07db7a2fd1..582d589953 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl @@ -454,6 +454,10 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(K } if 
(kernelImp->usesRayTracing()) { + if (this->device->getProductHelper().isDcFlushMitigated()) { + this->requiresDcFlushForDcMitigation = true; + } + NEO::PipeControlArgs args{}; args.stateCacheInvalidationEnable = true; NEO::MemorySynchronizationCommands::addSingleBarrier(*commandContainer.getCommandStream(), args); diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl index de1d97e3b3..7df82e5d0f 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl @@ -764,6 +764,8 @@ void CommandQueueHw::setupCmdListsAndContextParams( this->partitionCount = std::max(this->partitionCount, commandList->getPartitionCount()); ctx.cmdListScratchAddressPatchingEnabled |= commandList->getCmdListScratchAddressPatchingEnabled(); + + commandList->registerCsrDcFlushForDcMitigation(*this->getCsr()); } makeResidentAndMigrate(ctx.isMigrationRequested, commandContainer.getResidencyContainer()); @@ -1307,6 +1309,7 @@ void CommandQueueHw::dispatchTaskCountPostSyncRegular( NEO::PipeControlArgs args; args.dcFlushEnable = this->csr->getDcFlushSupport(); + args.dcFlushEnable |= this->csr->checkDcFlushRequiredForDcMitigationAndReset(); args.workloadPartitionOffset = this->partitionCount > 1; args.notifyEnable = this->csr->isUsedNotifyEnableForPostSync(); NEO::MemorySynchronizationCommands::addBarrierWithPostSyncOperation( diff --git a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h index 1914537f16..2455956e7e 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h +++ b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h @@ -105,6 +105,7 @@ struct WhiteBox<::L0::CommandListCoreFamily> using BaseClass::pipeControlMultiKernelEventSync; using BaseClass::pipelineSelectStateTracking; using BaseClass::requiredStreamState; + using BaseClass::requiresDcFlushForDcMitigation; using BaseClass::requiresQueueUncachedMocs; using 
BaseClass::scratchAddressPatchingEnabled; using BaseClass::setupTimestampEventForMultiTile; diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_xehp_and_later.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_xehp_and_later.cpp index 9215e25fe6..45f44de562 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_xehp_and_later.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_xehp_and_later.cpp @@ -1576,6 +1576,31 @@ HWTEST2_F(CommandListAppendLaunchRayTracingKernelTest, givenKernelUsingRayTracin neoDevice->rtMemoryBackedBuffer = nullptr; } +HWTEST2_F(CommandListAppendLaunchRayTracingKernelTest, givenDcFlushMitigationWhenAppendLaunchKernelWithRayTracingIsCalledThenRequireDcFlush, RayTracingMatcher) { + DebugManagerStateRestore restorer; + debugManager.flags.AllowDcFlush.set(0); + + Mock<::L0::KernelImp> kernel; + auto pMockModule = std::unique_ptr(new Mock(device, nullptr)); + kernel.module = pMockModule.get(); + + kernel.setGroupSize(4, 1, 1); + ze_group_count_t groupCount{8, 1, 1}; + auto pCommandList = std::make_unique>>(); + auto result = pCommandList->initialize(device, NEO::EngineGroupType::compute, 0); + ASSERT_EQ(ZE_RESULT_SUCCESS, result); + + kernel.immutableData.kernelDescriptor->kernelAttributes.flags.hasRTCalls = true; + neoDevice->rtMemoryBackedBuffer = buffer1; + CmdListKernelLaunchParams launchParams = {}; + + result = pCommandList->appendLaunchKernel(kernel.toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(pCommandList->requiresDcFlushForDcMitigation, device->getProductHelper().isDcFlushMitigated()); + + neoDevice->rtMemoryBackedBuffer = nullptr; +} + using RayTracingCmdListTest = Test; template @@ -1694,6 +1719,28 @@ HWTEST2_F(RayTracingCmdListTest, ultCsr->isMadeResident(rtAllocation, residentCount); } +HWTEST2_F(RayTracingCmdListTest, + 
givenDcFlushMitigationWhenRegularAppendLaunchKernelAndExecuteThenRegisterDcFlushForDcFlushMitigation, + RayTracingMatcher) { + DebugManagerStateRestore restorer; + debugManager.flags.AllowDcFlush.set(0); + + auto ultCsr = static_cast *>(commandQueue->getCsr()); + + ze_group_count_t groupCount{1, 1, 1}; + CmdListKernelLaunchParams launchParams = {}; + auto result = commandList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + result = commandList->close(); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + ze_command_list_handle_t cmdListHandle = commandList->toHandle(); + result = commandQueue->executeCommandLists(1, &cmdListHandle, nullptr, true, nullptr); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + EXPECT_EQ(ultCsr->registeredDcFlushForDcFlushMitigation, device->getProductHelper().isDcFlushMitigated()); +} + HWTEST2_F(RayTracingCmdListTest, givenRayTracingKernelWhenRegularCmdListExecutedAndImmediateExecutedAgainThenDispatch3dBtdCommandOnceMakeResidentTwiceAndPipeControlWithStateCacheFlushAfterWalker, RayTracingMatcher) { @@ -1796,6 +1843,22 @@ HWTEST2_F(RayTracingCmdListTest, ultCsr->isMadeResident(rtAllocation, residentCount); } +HWTEST2_F(RayTracingCmdListTest, + givenDcFlushMitigationWhenImmediateAppendLaunchKernelThenRegisterDcFlushForDcFlushMitigation, + RayTracingMatcher) { + DebugManagerStateRestore restorer; + debugManager.flags.AllowDcFlush.set(0); + + commandListImmediate->isSyncModeQueue = true; + auto ultCsr = static_cast *>(commandQueue->getCsr()); + + ze_group_count_t groupCount{1, 1, 1}; + CmdListKernelLaunchParams launchParams = {}; + auto result = commandListImmediate->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(ultCsr->registeredDcFlushForDcFlushMitigation, device->getProductHelper().isDcFlushMitigated()); +} + HWTEST2_F(RayTracingCmdListTest, 
givenRayTracingKernelWhenImmediateCmdListExecutedAndRegularExecutedAgainThenDispatch3dBtdCommandOnceMakeResidentTwiceAndPipeControlWithStateCacheFlushAfterWalker, RayTracingMatcher) { diff --git a/opencl/source/command_queue/command_queue.cpp b/opencl/source/command_queue/command_queue.cpp index dc92bb10f1..af65709aaf 100644 --- a/opencl/source/command_queue/command_queue.cpp +++ b/opencl/source/command_queue/command_queue.cpp @@ -633,8 +633,13 @@ cl_int CommandQueue::enqueueReleaseSharedObjects(cl_uint numObjects, const cl_me memObject->acquireCount--; } - if (isImageReleased && this->getGpgpuCommandStreamReceiver().isDirectSubmissionEnabled()) { - this->getGpgpuCommandStreamReceiver().sendRenderStateCacheFlush(); + if (this->getGpgpuCommandStreamReceiver().isDirectSubmissionEnabled()) { + if (this->getDevice().getProductHelper().isDcFlushMitigated()) { + this->getGpgpuCommandStreamReceiver().registerDcFlushForDcMitigation(); + this->getGpgpuCommandStreamReceiver().sendRenderStateCacheFlush(); + } else if (isImageReleased) { + this->getGpgpuCommandStreamReceiver().sendRenderStateCacheFlush(); + } } auto status = enqueueMarkerWithWaitList( diff --git a/opencl/test/unit_test/command_queue/command_queue_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_tests.cpp index c6e3c15c2c..d58332a914 100644 --- a/opencl/test/unit_test/command_queue/command_queue_tests.cpp +++ b/opencl/test/unit_test/command_queue/command_queue_tests.cpp @@ -1368,6 +1368,38 @@ HWTEST_F(CommandQueueTests, givenDirectSubmissionAndSharedImageWhenReleasingShar result = cmdQ.enqueueReleaseSharedObjects(numObjects, memObjects, 0, nullptr, nullptr, 0); EXPECT_EQ(result, CL_SUCCESS); EXPECT_TRUE(ultCsr->renderStateCacheFlushed); + EXPECT_EQ(ultCsr->renderStateCacheDcFlushForced, context.getDevice(0)->getProductHelper().isDcFlushMitigated()); +} + +HWTEST_F(CommandQueueTests, givenDcFlushMitigationAndDirectSubmissionAndBufferWhenReleasingSharedObjectThenFlushRenderStateCacheAndForceDcFlush) 
{ + DebugManagerStateRestore restorer; + debugManager.flags.AllowDcFlush.set(0); + + MockContext context; + MockCommandQueue cmdQ(&context, context.getDevice(0), 0, false); + MockSharingHandler *mockSharingHandler = new MockSharingHandler; + + auto buffer = std::unique_ptr(BufferHelper<>::create(&context)); + buffer->setSharingHandler(mockSharingHandler); + buffer->getGraphicsAllocation(0u)->setAllocationType(AllocationType::sharedBuffer); + + cl_mem memObject = buffer.get(); + cl_uint numObjects = 1; + cl_mem *memObjects = &memObject; + + cl_int result = cmdQ.enqueueAcquireSharedObjects(numObjects, memObjects, 0, nullptr, nullptr, 0); + EXPECT_EQ(result, CL_SUCCESS); + + auto ultCsr = static_cast *>(&cmdQ.getGpgpuCommandStreamReceiver()); + ultCsr->directSubmissionAvailable = true; + ultCsr->callBaseSendRenderStateCacheFlush = false; + ultCsr->flushReturnValue = SubmissionStatus::success; + EXPECT_FALSE(ultCsr->renderStateCacheFlushed); + + result = cmdQ.enqueueReleaseSharedObjects(numObjects, memObjects, 0, nullptr, nullptr, 0); + EXPECT_EQ(result, CL_SUCCESS); + EXPECT_EQ(ultCsr->renderStateCacheFlushed, context.getDevice(0)->getProductHelper().isDcFlushMitigated()); + EXPECT_EQ(ultCsr->renderStateCacheDcFlushForced, context.getDevice(0)->getProductHelper().isDcFlushMitigated()); } TEST(CommandQueue, givenEnqueuesForSharedObjectsWithImageWhenUsingSharingHandlerWithEventThenReturnSuccess) { diff --git a/shared/source/command_stream/command_stream_receiver.h b/shared/source/command_stream/command_stream_receiver.h index 1be2d63337..ad00300735 100644 --- a/shared/source/command_stream/command_stream_receiver.h +++ b/shared/source/command_stream/command_stream_receiver.h @@ -329,6 +329,16 @@ class CommandStreamReceiver { requiresInstructionCacheFlush = true; } + MOCKABLE_VIRTUAL bool checkDcFlushRequiredForDcMitigationAndReset() { + auto ret = this->requiresDcFlush; + this->requiresDcFlush = false; + return ret; + } + + void registerDcFlushForDcMitigation() { + 
this->requiresDcFlush = true; + } + bool isLocalMemoryEnabled() const { return localMemoryEnabled; } uint32_t getRootDeviceIndex() const { return rootDeviceIndex; } @@ -637,6 +647,7 @@ class CommandStreamReceiver { bool nTo1SubmissionModelEnabled = false; bool lastSystolicPipelineSelectMode = false; bool requiresInstructionCacheFlush = false; + bool requiresDcFlush = false; bool localMemoryEnabled = false; bool pageTableManagerInitialized = false; diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index 91f9cfe06f..5f6b314743 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -1206,7 +1206,7 @@ SubmissionStatus CommandStreamReceiverHw::flushPipeControl(bool state auto lock = obtainUniqueOwnership(); PipeControlArgs args; - args.dcFlushEnable = this->dcFlushSupport; + args.dcFlushEnable = this->dcFlushSupport || this->checkDcFlushRequiredForDcMitigationAndReset(); args.notifyEnable = isUsedNotifyEnableForPostSync(); args.workloadPartitionOffset = isMultiTileOperationEnabled(); @@ -1794,6 +1794,7 @@ inline void CommandStreamReceiverHw::processBarrierWithPostSync(Linea auto &rootDeviceEnvironment = this->peekRootDeviceEnvironment(); args.dcFlushEnable = getDcFlushRequired(dispatchFlags.dcFlush); + args.dcFlushEnable |= this->checkDcFlushRequiredForDcMitigationAndReset(); args.notifyEnable = isUsedNotifyEnableForPostSync(); args.tlbInvalidation |= dispatchFlags.memoryMigrationRequired; args.textureCacheInvalidationEnable |= dispatchFlags.textureCacheFlush; @@ -2153,6 +2154,7 @@ void CommandStreamReceiverHw::dispatchImmediateFlushClientBufferComma PipeControlArgs args = {}; args.dcFlushEnable = this->dcFlushSupport; + args.dcFlushEnable |= this->checkDcFlushRequiredForDcMitigationAndReset(); args.notifyEnable = isUsedNotifyEnableForPostSync(); args.workloadPartitionOffset = 
isMultiTileOperationEnabled(); MemorySynchronizationCommands::addBarrierWithPostSyncOperation( diff --git a/shared/test/common/libult/ult_command_stream_receiver.h b/shared/test/common/libult/ult_command_stream_receiver.h index 0672ebee76..f8dfd48ad9 100644 --- a/shared/test/common/libult/ult_command_stream_receiver.h +++ b/shared/test/common/libult/ult_command_stream_receiver.h @@ -256,6 +256,11 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw, publ downloadAllocationCalled = true; } + bool checkDcFlushRequiredForDcMitigationAndReset() override { + this->registeredDcFlushForDcFlushMitigation = this->requiresDcFlush; + return BaseClass::checkDcFlushRequiredForDcMitigationAndReset(); + } + WaitStatus waitForCompletionWithTimeout(const WaitParams &params, TaskCountType taskCountToWait) override { std::lock_guard guard(mutex); latestWaitForCompletionWithTimeoutTaskCount.store(taskCountToWait); @@ -457,6 +462,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw, publ SubmissionStatus sendRenderStateCacheFlush() override { this->renderStateCacheFlushed = true; + this->renderStateCacheDcFlushForced = this->requiresDcFlush; if (callBaseSendRenderStateCacheFlush) { return BaseClass::sendRenderStateCacheFlush(); } @@ -523,6 +529,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw, publ std::atomic downloadAllocationsCalledCount = 0; bool renderStateCacheFlushed = false; + bool renderStateCacheDcFlushForced = false; bool cpuCopyForHostPtrSurfaceAllowed = false; bool createPageTableManagerCalled = false; bool recordFlusheBatchBuffer = false; @@ -551,6 +558,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw, publ bool isKmdWaitOnTaskCountAllowedValue = false; bool stopDirectSubmissionCalled = false; bool stopDirectSubmissionCalledBlocking = false; + bool registeredDcFlushForDcFlushMitigation = false; }; } // namespace NEO diff --git a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp
b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp index 85eb69452e..2b30d733de 100644 --- a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp +++ b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp @@ -3440,6 +3440,37 @@ HWTEST_F(CommandStreamReceiverHwTest, givenFlushPipeControlWhenFlushWithStateCac EXPECT_TRUE(UnitTestHelper::findStateCacheFlushPipeControl(commandStreamReceiver, commandStreamReceiver.commandStream)); } +HWTEST_F(CommandStreamReceiverHwTest, givenDcFlushForcedWhenSendRenderStateCacheFlushThenExpectDcFlush) { + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + + commandStreamReceiver.registerDcFlushForDcMitigation(); + commandStreamReceiver.sendRenderStateCacheFlush(); + + HardwareParse hwParserCsr; + hwParserCsr.parsePipeControl = true; + hwParserCsr.parseCommands(commandStreamReceiver.commandStream, 0); + hwParserCsr.findHardwareCommands(); + + bool stateCacheFlushFound = false; + auto itorPipeControl = hwParserCsr.pipeControlList.begin(); + while (itorPipeControl != hwParserCsr.pipeControlList.end()) { + auto pipeControl = reinterpret_cast(*itorPipeControl); + + if (pipeControl->getDcFlushEnable() && + pipeControl->getRenderTargetCacheFlushEnable() && + pipeControl->getStateCacheInvalidationEnable() && + pipeControl->getTextureCacheInvalidationEnable() && + ((commandStreamReceiver.isTlbFlushRequiredForStateCacheFlush() && pipeControl->getTlbInvalidate()) || (!commandStreamReceiver.isTlbFlushRequiredForStateCacheFlush() && !pipeControl->getTlbInvalidate()))) { + stateCacheFlushFound = true; + break; + } + itorPipeControl++; + } + + EXPECT_TRUE(stateCacheFlushFound); +} + HWTEST2_F(CommandStreamReceiverHwTest, givenRayTracingAllocationPresentWhenFlushingTaskThenDispatchBtdStateCommandOnceAndResidentAlways, IsAtLeastXeHpCore) {