From 69f5ca634554877d753eebeb4356a31597c93b05 Mon Sep 17 00:00:00 2001 From: Mateusz Hoppe Date: Tue, 19 Sep 2023 11:10:12 +0000 Subject: [PATCH] feature: bindless addressing - flush state cache after reusing SS slot - when Surface State is reused for new resource, State Cache needs to be invalidated Related-To: NEO-7063 Signed-off-by: Mateusz Hoppe --- .../core/source/cmdqueue/cmdqueue_hw.inl | 10 +++ .../test_cmdqueue_enqueue_cmdlist_2.cpp | 55 +++++++++++++ .../command_stream_receiver_hw_base.inl | 23 +++++- shared/source/helpers/gfx_core_helper.h | 1 + .../source/helpers/gfx_core_helper_base.inl | 14 ++++ .../command_stream_receiver_tests.cpp | 79 +++++++++++++++++++ 6 files changed, 181 insertions(+), 1 deletion(-) diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl index cd219c707f..45b377eb67 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl @@ -125,6 +125,12 @@ ze_result_t CommandQueueHw::executeCommandListsRegular( auto neoDevice = this->device->getNEODevice(); + bool stateCacheFlushRequired = neoDevice->getBindlessHeapsHelper() ? neoDevice->getBindlessHeapsHelper()->getStateDirtyForContext(this->csr->getOsContext().getContextId()) : false; + if (stateCacheFlushRequired) { + linearStreamSizeEstimate += NEO::MemorySynchronizationCommands::getSizeForFullCacheFlush(); + neoDevice->getBindlessHeapsHelper()->clearStateDirtyForContext(this->csr->getOsContext().getContextId()); + } + if (ctx.isDispatchTaskCountPostSyncRequired) { linearStreamSizeEstimate += NEO::MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(neoDevice->getRootDeviceEnvironment(), false); } @@ -146,6 +152,10 @@ ze_result_t CommandQueueHw::executeCommandListsRegular( this->makeSbaTrackingBufferResidentIfL0DebuggerEnabled(ctx.isDebugEnabled); this->makeCsrTagAllocationResident(); + if (stateCacheFlushRequired) { + NEO::MemorySynchronizationCommands::addStateCacheFlush(child, neoDevice->getRootDeviceEnvironment()); + } + if (ctx.globalInit) { this->getTagsManagerHeapsAndMakeThemResidentIfSWTagsEnabled(child); this->csr->programHardwareContext(child); diff --git a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_enqueue_cmdlist_2.cpp b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_enqueue_cmdlist_2.cpp index ae1f0cd193..f6fed827cc 100644 --- a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_enqueue_cmdlist_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_enqueue_cmdlist_2.cpp @@ -9,6 +9,7 @@ #include "shared/source/helpers/pause_on_gpu_properties.h" #include "shared/test/common/cmd_parse/gen_cmd_parse.h" #include "shared/test/common/helpers/unit_test_helper.h" +#include "shared/test/common/mocks/mock_bindless_heaps_helper.h" #include "shared/test/common/test_macros/hw_test.h" #include "level_zero/core/source/cmdlist/cmdlist.h" @@ -780,5 +781,59 @@ HWTEST2_F(CmdListLargeGrfTest, testBody(); } +HWTEST_F(CommandQueueExecuteCommandListsSimpleTest, GivenDirtyFlagForContextInBindlessHelperWhenExecutingCmdListsThenStateCacheInvalidateIsSent) { + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + ze_command_queue_desc_t queueDesc = {}; + ze_result_t returnValue; + + auto bindlessHeapsHelper = std::make_unique(neoDevice->getMemoryManager(), neoDevice->getNumGenericSubDevices() > 1, neoDevice->getRootDeviceIndex(), neoDevice->getDeviceBitfield()); + MockBindlesHeapsHelper *bindlessHeapsHelperPtr = bindlessHeapsHelper.get(); + + neoDevice->getExecutionEnvironment()->rootDeviceEnvironments[neoDevice->getRootDeviceIndex()]->bindlessHeapsHelper.reset(bindlessHeapsHelper.release()); + + queueDesc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS; + auto commandQueue = whiteboxCast(CommandQueue::create(productFamily, device, neoDevice->getDefaultEngine().commandStreamReceiver, &queueDesc, false, false, false, returnValue)); + ASSERT_NE(nullptr, commandQueue); + + bindlessHeapsHelperPtr->stateCacheDirtyForContext.set(commandQueue->getCsr()->getOsContext().getContextId()); + + auto usedSpaceBefore = commandQueue->commandStream.getUsed(); + + ze_command_list_handle_t commandLists[] = { + CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, returnValue)->toHandle(), + CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, returnValue)->toHandle()}; + uint32_t numCommandLists = sizeof(commandLists) / sizeof(commandLists[0]); + CommandList::fromHandle(commandLists[0])->close(); + CommandList::fromHandle(commandLists[1])->close(); + auto result = commandQueue->executeCommandLists(numCommandLists, commandLists, nullptr, true); + + ASSERT_EQ(ZE_RESULT_SUCCESS, result); + + auto usedSpaceAfter = commandQueue->commandStream.getUsed(); + ASSERT_GT(usedSpaceAfter, usedSpaceBefore); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, ptrOffset(commandQueue->commandStream.getCpuBase(), 0), usedSpaceAfter)); + + auto pipeControls = findAll(cmdList.begin(), cmdList.end()); + ASSERT_NE(0u, pipeControls.size()); + + auto pipeControl = reinterpret_cast(*pipeControls[0]); + EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable()); + EXPECT_TRUE(pipeControl->getStateCacheInvalidationEnable()); + EXPECT_TRUE(pipeControl->getTextureCacheInvalidationEnable()); + EXPECT_TRUE(pipeControl->getRenderTargetCacheFlushEnable()); + + EXPECT_FALSE(bindlessHeapsHelperPtr->getStateDirtyForContext(commandQueue->getCsr()->getOsContext().getContextId())); + + for (auto i = 0u; i < numCommandLists; i++) { + auto commandList = CommandList::fromHandle(commandLists[i]); + commandList->destroy(); + } + + commandQueue->destroy(); +} + } // namespace ult } // namespace L0 diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index 0b556b7801..f13289125e 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -302,9 +302,19 @@ CompletionStamp CommandStreamReceiverHw::flushImmediateTask( handleImmediateFlushJumpToImmediate(flushData); + bool stateCacheFlushRequired = device.getBindlessHeapsHelper() ? device.getBindlessHeapsHelper()->getStateDirtyForContext(getOsContext().getContextId()) : false; + if (stateCacheFlushRequired) { + flushData.estimatedSize += MemorySynchronizationCommands::getSizeForFullCacheFlush(); + } + auto &csrCommandStream = getCS(flushData.estimatedSize); flushData.csrStartOffset = csrCommandStream.getUsed(); + if (stateCacheFlushRequired) { + device.getBindlessHeapsHelper()->clearStateDirtyForContext(getOsContext().getContextId()); + MemorySynchronizationCommands::addStateCacheFlush(csrCommandStream, device.getRootDeviceEnvironment()); + } + dispatchImmediateFlushPipelineSelectCommand(flushData, csrCommandStream); dispatchImmediateFlushFrontEndCommand(flushData, device, csrCommandStream); dispatchImmediateFlushStateComputeModeCommand(flushData, csrCommandStream); @@ -468,7 +478,13 @@ CompletionStamp CommandStreamReceiverHw::flushTask( handleFrontEndStateTransition(dispatchFlags); - auto &commandStreamCSR = this->getCS(getRequiredCmdStreamSizeAligned(dispatchFlags, device)); + auto estimatedSize = getRequiredCmdStreamSizeAligned(dispatchFlags, device); + + bool stateCacheFlushRequired = device.getBindlessHeapsHelper() ? device.getBindlessHeapsHelper()->getStateDirtyForContext(getOsContext().getContextId()) : false; + if (stateCacheFlushRequired) { + estimatedSize += MemorySynchronizationCommands::getSizeForFullCacheFlush(); + } + auto &commandStreamCSR = this->getCS(estimatedSize); auto commandStreamStartCSR = commandStreamCSR.getUsed(); TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer(commandStreamCSR, dispatchFlags.csrDependencies, false); @@ -517,6 +533,11 @@ CompletionStamp CommandStreamReceiverHw::flushTask( experimentalCmdBuffer->injectBufferStart(commandStreamCSR, startingOffset); } + if (stateCacheFlushRequired) { + device.getBindlessHeapsHelper()->clearStateDirtyForContext(getOsContext().getContextId()); + MemorySynchronizationCommands::addStateCacheFlush(commandStreamCSR, device.getRootDeviceEnvironment()); + } + if (requiresInstructionCacheFlush) { PipeControlArgs args; args.instructionCacheInvalidateEnable = true; diff --git a/shared/source/helpers/gfx_core_helper.h b/shared/source/helpers/gfx_core_helper.h index c0bca817f4..7e776d6992 100644 --- a/shared/source/helpers/gfx_core_helper.h +++ b/shared/source/helpers/gfx_core_helper.h @@ -433,6 +433,7 @@ struct MemorySynchronizationCommands { static void addFullCacheFlush(LinearStream &commandStream, const RootDeviceEnvironment &rootDeviceEnvironment); static void setCacheFlushExtraProperties(PipeControlArgs &args); + static void addStateCacheFlush(LinearStream &commandStream, const RootDeviceEnvironment &rootDeviceEnvironment); static size_t getSizeForBarrierWithPostSyncOperation(const RootDeviceEnvironment &rootDeviceEnvironment, bool tlbInvalidationRequired); static size_t getSizeForBarrierWa(const RootDeviceEnvironment &rootDeviceEnvironment); diff --git a/shared/source/helpers/gfx_core_helper_base.inl b/shared/source/helpers/gfx_core_helper_base.inl index d6009e3dca..4378d64317 100644 --- a/shared/source/helpers/gfx_core_helper_base.inl +++ b/shared/source/helpers/gfx_core_helper_base.inl @@ -512,6 +512,20 @@ void MemorySynchronizationCommands::addFullCacheFlush(LinearStream &c MemorySynchronizationCommands::addSingleBarrier(commandStream, args); } +template +void MemorySynchronizationCommands::addStateCacheFlush(LinearStream &commandStream, const RootDeviceEnvironment &rootDeviceEnvironment) { + using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL; + + PIPE_CONTROL cmd = GfxFamily::cmdInitPipeControl; + cmd.setCommandStreamerStallEnable(true); + cmd.setRenderTargetCacheFlushEnable(true); + cmd.setStateCacheInvalidationEnable(true); + cmd.setTextureCacheInvalidationEnable(true); + + auto commandsBuffer = commandStream.getSpace(sizeof(PIPE_CONTROL)); + *reinterpret_cast(commandsBuffer) = cmd; +} + template const StackVec GfxCoreHelperHw::getDeviceSubGroupSizes() const { return {8, 16, 32}; diff --git a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp index 270ff24cd0..6bfd61c9ab 100644 --- a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp +++ b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp @@ -36,6 +36,7 @@ #include "shared/test/common/helpers/gtest_helpers.h" #include "shared/test/common/helpers/unit_test_helper.h" #include "shared/test/common/mocks/mock_allocation_properties.h" +#include "shared/test/common/mocks/mock_bindless_heaps_helper.h" #include "shared/test/common/mocks/mock_csr.h" #include "shared/test/common/mocks/mock_device.h" #include "shared/test/common/mocks/mock_driver_model.h" @@ -4489,3 +4490,81 @@ HWTEST_F(CommandStreamReceiverTest, givenCsrWhenCleanUpResourcesThenOwnedPrivate csr.cleanupResources(); EXPECT_EQ(mapForReuse->size(), 0u); } + +HWTEST_F(CommandStreamReceiverHwTest, GivenDirtyFlagForContextInBindlessHelperWhenFlushTaskCalledThenStateCacheInvalidateIsSent) { + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + + auto bindlessHeapsHelper = std::make_unique(pDevice->getMemoryManager(), pDevice->getNumGenericSubDevices() > 1, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + MockBindlesHeapsHelper *bindlessHeapsHelperPtr = bindlessHeapsHelper.get(); + pDevice->getExecutionEnvironment()->rootDeviceEnvironments[pDevice->getRootDeviceIndex()]->bindlessHeapsHelper.reset(bindlessHeapsHelper.release()); + + bindlessHeapsHelperPtr->stateCacheDirtyForContext.set(commandStreamReceiver.getOsContext().getContextId()); + + flushTaskFlags.implicitFlush = true; + auto usedSpaceBefore = commandStreamReceiver.commandStream.getUsed(); + + commandStreamReceiver.flushTask(commandStream, + 0, + &dsh, + &ioh, + nullptr, + taskLevel, + flushTaskFlags, + *pDevice); + + auto usedSpaceAfter = commandStreamReceiver.commandStream.getUsed(); + ASSERT_GT(usedSpaceAfter, usedSpaceBefore); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, ptrOffset(commandStreamReceiver.commandStream.getCpuBase(), 0), usedSpaceAfter)); + + auto pipeControls = findAll(cmdList.begin(), cmdList.end()); + ASSERT_NE(0u, pipeControls.size()); + + bool pcFound = false; + for (size_t i = 0; i < pipeControls.size(); i++) { + auto pipeControl = reinterpret_cast(*pipeControls[i]); + bool csStall = pipeControl->getCommandStreamerStallEnable(); + bool stateCache = pipeControl->getStateCacheInvalidationEnable(); + bool texCache = pipeControl->getTextureCacheInvalidationEnable(); + bool renderTargetCache = pipeControl->getRenderTargetCacheFlushEnable(); + + if (csStall && stateCache && texCache && renderTargetCache) { + pcFound = true; + break; + } + } + EXPECT_TRUE(pcFound); + EXPECT_FALSE(bindlessHeapsHelperPtr->getStateDirtyForContext(commandStreamReceiver.getOsContext().getContextId())); +} + +HWTEST_F(CommandStreamReceiverHwTest, GivenDirtyFlagForContextInBindlessHelperWhenFlushImmediateTaskCalledThenStateCacheInvalidateIsSent) { + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + + auto bindlessHeapsHelper = std::make_unique(pDevice->getMemoryManager(), pDevice->getNumGenericSubDevices() > 1, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + MockBindlesHeapsHelper *bindlessHeapsHelperPtr = bindlessHeapsHelper.get(); + pDevice->getExecutionEnvironment()->rootDeviceEnvironments[pDevice->getRootDeviceIndex()]->bindlessHeapsHelper.reset(bindlessHeapsHelper.release()); + + bindlessHeapsHelperPtr->stateCacheDirtyForContext.set(commandStreamReceiver.getOsContext().getContextId()); + + this->requiredStreamProperties.stateComputeMode.setPropertiesAll(false, GrfConfig::DefaultGrfNumber, ThreadArbitrationPolicy::AgeBased, NEO::PreemptionMode::ThreadGroup); + + commandStreamReceiver.flushImmediateTask(commandStream, commandStream.getUsed(), immediateFlushTaskFlags, *pDevice); + + HardwareParse hwParserCsr; + hwParserCsr.parseCommands(commandStreamReceiver.commandStream, 0); + auto pcCmd = hwParserCsr.getCommand(); + ASSERT_NE(nullptr, pcCmd); + + EXPECT_TRUE(pcCmd->getCommandStreamerStallEnable()); + EXPECT_TRUE(pcCmd->getStateCacheInvalidationEnable()); + EXPECT_TRUE(pcCmd->getTextureCacheInvalidationEnable()); + EXPECT_TRUE(pcCmd->getRenderTargetCacheFlushEnable()); + + EXPECT_FALSE(bindlessHeapsHelperPtr->getStateDirtyForContext(commandStreamReceiver.getOsContext().getContextId())); +}