From c51b656d2cadfac186a74e10fda6a03a1d32c44a Mon Sep 17 00:00:00 2001 From: Fabian Zwolinski Date: Wed, 24 Jan 2024 15:42:31 +0000 Subject: [PATCH] fix: request instruction cache invalidation on module destroy Invalidation is requested on both Linux and Windows, on CSRs that used the ISA allocation. Related-To: NEO-10045 Signed-off-by: Fabian Zwolinski --- .../core/source/cmdqueue/cmdqueue_hw.inl | 9 ++++ level_zero/core/source/module/module_imp.cpp | 15 ++++++ .../test_cmdqueue_enqueue_cmdlist_2.cpp | 52 ++++++++++++++++++- .../sources/module/test_module_2.cpp | 35 ++++++++++++- .../command_stream/command_stream_receiver.h | 8 +++ .../command_stream_receiver_hw_base.inl | 15 ++++-- shared/source/helpers/gfx_core_helper.h | 2 + .../source/helpers/gfx_core_helper_base.inl | 13 +++++ .../mocks/mock_command_stream_receiver.h | 5 ++ .../command_stream_receiver_tests.cpp | 19 +++++++ .../helpers/gfx_core_helper_tests.cpp | 19 +++++++ .../windows/device_command_stream_tests.cpp | 3 +- 12 files changed, 188 insertions(+), 7 deletions(-) diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl index 754f6cf909..7ab53b4419 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl @@ -135,6 +135,10 @@ ze_result_t CommandQueueHw::executeCommandListsRegular( linearStreamSizeEstimate += NEO::MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(neoDevice->getRootDeviceEnvironment(), false); } + if (this->csr->isInstructionCacheFlushRequired()) { + linearStreamSizeEstimate += NEO::MemorySynchronizationCommands::getSizeForInstructionCacheFlush(); + } + this->csr->getResidencyAllocations().reserve(ctx.spaceForResidency); NEO::LinearStream child(nullptr); @@ -226,6 +230,11 @@ ze_result_t CommandQueueHw::executeCommandListsRegular( this->assignCsrTaskCountToFenceIfAvailable(hFence); 
this->dispatchTaskCountPostSyncRegular(ctx.isDispatchTaskCountPostSyncRequired, child); + if (this->csr->isInstructionCacheFlushRequired()) { + NEO::MemorySynchronizationCommands::addInstructionCacheFlush(child); + this->csr->setInstructionCacheFlushed(); + } + auto submitResult = this->prepareAndSubmitBatchBuffer(ctx, child); this->csr->setPreemptionMode(ctx.statePreemption); diff --git a/level_zero/core/source/module/module_imp.cpp b/level_zero/core/source/module/module_imp.cpp index 324c34782d..bcc05ff59f 100644 --- a/level_zero/core/source/module/module_imp.cpp +++ b/level_zero/core/source/module/module_imp.cpp @@ -7,6 +7,7 @@ #include "level_zero/core/source/module/module_imp.h" +#include "shared/source/command_stream/command_stream_receiver.h" #include "shared/source/compiler_interface/compiler_options.h" #include "shared/source/compiler_interface/compiler_options_extra.h" #include "shared/source/compiler_interface/compiler_warnings/compiler_warnings.h" @@ -37,6 +38,7 @@ #include "shared/source/memory_manager/memory_manager.h" #include "shared/source/memory_manager/memory_operations_handler.h" #include "shared/source/memory_manager/unified_memory_manager.h" +#include "shared/source/os_interface/os_context.h" #include "shared/source/program/kernel_info.h" #include "shared/source/program/program_initialization.h" @@ -1540,6 +1542,19 @@ ze_result_t ModuleImp::destroy() { auto tempHandle = debugModuleHandle; auto tempDevice = device; + + auto rootDeviceIndex = getDevice()->getNEODevice()->getRootDeviceIndex(); + auto &executionEnvironment = getDevice()->getNEODevice()->getRootDeviceEnvironment().executionEnvironment; + + for (const auto &kernelImmData : this->kernelImmDatas) { + for (auto &engine : executionEnvironment.memoryManager->getRegisteredEngines(rootDeviceIndex)) { + auto contextId = engine.osContext->getContextId(); + if (kernelImmData->getIsaGraphicsAllocation()->isUsedByOsContext(contextId)) { + 
engine.commandStreamReceiver->registerInstructionCacheFlush(); + } + } + } + delete this; if (tempDevice->getL0Debugger() && tempHandle != 0) { diff --git a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_enqueue_cmdlist_2.cpp b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_enqueue_cmdlist_2.cpp index 7ba90d5d68..e94ebcd866 100644 --- a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_enqueue_cmdlist_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_enqueue_cmdlist_2.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022-2023 Intel Corporation + * Copyright (C) 2022-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -837,5 +837,55 @@ HWTEST_F(CommandQueueExecuteCommandListsSimpleTest, GivenDirtyFlagForContextInBi commandQueue->destroy(); } +HWTEST_F(CommandQueueExecuteCommandListsSimpleTest, GivenRegisterInstructionCacheFlushWhenExecutingCmdListsThenInstructionCacheInvalidateIsSent) { + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + ze_command_queue_desc_t queueDesc = {}; + ze_result_t returnValue; + + neoDevice->getDefaultEngine().commandStreamReceiver->registerInstructionCacheFlush(); + + queueDesc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS; + auto commandQueue = whiteboxCast(CommandQueue::create(productFamily, device, neoDevice->getDefaultEngine().commandStreamReceiver, &queueDesc, false, false, false, returnValue)); + ASSERT_NE(nullptr, commandQueue); + + auto usedSpaceBefore = commandQueue->commandStream.getUsed(); + + ze_command_list_handle_t commandLists[] = { + CommandList::create(productFamily, device, NEO::EngineGroupType::renderCompute, 0u, returnValue, false)->toHandle()}; + uint32_t numCommandLists = 1; + CommandList::fromHandle(commandLists[0])->close(); + auto result = commandQueue->executeCommandLists(numCommandLists, commandLists, nullptr, true); + + ASSERT_EQ(ZE_RESULT_SUCCESS, result); + + auto usedSpaceAfter = commandQueue->commandStream.getUsed(); + 
ASSERT_GT(usedSpaceAfter, usedSpaceBefore); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer( + cmdList, ptrOffset(commandQueue->commandStream.getCpuBase(), 0), usedSpaceAfter)); + + auto pipeControls = findAll(cmdList.begin(), cmdList.end()); + ASSERT_NE(0u, pipeControls.size()); + + bool foundInstructionCacheInvalidate = false; + for (auto pipeControlIT : pipeControls) { + auto pipeControl = reinterpret_cast(*pipeControlIT); + if (pipeControl->getInstructionCacheInvalidateEnable()) { + foundInstructionCacheInvalidate = true; + break; + } + } + + EXPECT_TRUE(foundInstructionCacheInvalidate); + + for (auto i = 0u; i < numCommandLists; i++) { + auto commandList = CommandList::fromHandle(commandLists[i]); + commandList->destroy(); + } + + commandQueue->destroy(); +} + } // namespace ult } // namespace L0 diff --git a/level_zero/core/test/unit_tests/sources/module/test_module_2.cpp b/level_zero/core/test/unit_tests/sources/module/test_module_2.cpp index dcfd1bea8c..66da46e1ca 100644 --- a/level_zero/core/test/unit_tests/sources/module/test_module_2.cpp +++ b/level_zero/core/test/unit_tests/sources/module/test_module_2.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022-2023 Intel Corporation + * Copyright (C) 2022-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -9,6 +9,7 @@ #include "shared/source/helpers/aligned_memory.h" #include "shared/source/helpers/file_io.h" #include "shared/test/common/helpers/test_files.h" +#include "shared/test/common/mocks/mock_command_stream_receiver.h" #include "shared/test/common/mocks/mock_device.h" #include "shared/test/common/mocks/mock_modules_zebin.h" #include "shared/test/common/test_macros/test.h" @@ -17,7 +18,9 @@ #include "level_zero/core/source/kernel/kernel.h" #include "level_zero/core/source/module/module_build_log.h" #include "level_zero/core/test/unit_tests/fixtures/device_fixture.h" +#include "level_zero/core/test/unit_tests/mocks/mock_device.h" #include 
"level_zero/core/test/unit_tests/mocks/mock_module.h" + namespace L0 { namespace ult { @@ -75,6 +78,36 @@ TEST_F(ModuleTests, whenCreatingAutoGrfBuildOptionsThenOptionsParsedCorrectly) { EXPECT_TRUE(NEO::CompilerOptions::contains(internalBuildOptions, NEO::CompilerOptions::autoGrf)); } +TEST(ModuleDestroyTest, givenIsaAllocationWhenIsModuleDestroyedThenRequireInstructionCacheFlushInCsrThatUsedTheAllocation) { + const uint32_t rootDeviceIndex = 0u; + NEO::HardwareInfo hwInfo = *NEO::defaultHwInfo.get(); + auto *neoMockDevice = NEO::MockDevice::createWithNewExecutionEnvironment(&hwInfo, rootDeviceIndex); + + MockCommandStreamReceiver *mockCommandStreamReceiver = new MockCommandStreamReceiver(*neoMockDevice->executionEnvironment, neoMockDevice->getRootDeviceIndex(), neoMockDevice->getDeviceBitfield()); + mockCommandStreamReceiver->makeResidentParentCall = true; + + neoMockDevice->resetCommandStreamReceiver(mockCommandStreamReceiver); + + MockDeviceImp deviceImp(neoMockDevice, neoMockDevice->getExecutionEnvironment()); + + auto module = new MockModule{&deviceImp, nullptr, ModuleType::user}; + module->translationUnit.reset(new MockModuleTranslationUnit{&deviceImp}); + + auto kernelInfo = new KernelInfo{}; + kernelInfo->heapInfo.pKernelHeap = reinterpret_cast(0xdeadbeef0000); + kernelInfo->heapInfo.kernelHeapSize = static_cast(0x40); + module->translationUnit->programInfo.kernelInfos.push_back(kernelInfo); + + module->initializeKernelImmutableDatas(); + auto &kernelImmDatas = module->getKernelImmutableDataVector(); + auto csr = deviceImp.getNEODevice()->getEngine(0).commandStreamReceiver; + csr->makeResident(*kernelImmDatas[0]->getIsaParentAllocation()); + + module->destroy(); + + EXPECT_TRUE(mockCommandStreamReceiver->requiresInstructionCacheFlush); +} + TEST(ModuleBuildLog, WhenCreatingModuleBuildLogThenNonNullPointerReturned) { auto moduleBuildLog = ModuleBuildLog::create(); ASSERT_NE(nullptr, moduleBuildLog); diff --git 
a/shared/source/command_stream/command_stream_receiver.h b/shared/source/command_stream/command_stream_receiver.h index 6017271693..8c42d4e83d 100644 --- a/shared/source/command_stream/command_stream_receiver.h +++ b/shared/source/command_stream/command_stream_receiver.h @@ -273,6 +273,14 @@ class CommandStreamReceiver { void downloadAllocation(GraphicsAllocation &gfxAllocation); + bool isInstructionCacheFlushRequired() const { + return requiresInstructionCacheFlush; + } + + void setInstructionCacheFlushed() { + requiresInstructionCacheFlush = false; + } + void registerInstructionCacheFlush() { auto mutex = obtainUniqueOwnership(); requiresInstructionCacheFlush = true; diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index 2e8d371145..f43973a8e7 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -304,6 +304,10 @@ CompletionStamp CommandStreamReceiverHw::flushImmediateTask( flushData.estimatedSize += MemorySynchronizationCommands::getSizeForFullCacheFlush(); } + if (requiresInstructionCacheFlush) { + flushData.estimatedSize += MemorySynchronizationCommands::getSizeForInstructionCacheFlush(); + } + auto &csrCommandStream = getCS(flushData.estimatedSize); flushData.csrStartOffset = csrCommandStream.getUsed(); @@ -312,6 +316,11 @@ CompletionStamp CommandStreamReceiverHw::flushImmediateTask( MemorySynchronizationCommands::addStateCacheFlush(csrCommandStream, device.getRootDeviceEnvironment()); } + if (requiresInstructionCacheFlush) { + MemorySynchronizationCommands::addInstructionCacheFlush(csrCommandStream); + requiresInstructionCacheFlush = false; + } + dispatchImmediateFlushPipelineSelectCommand(flushData, csrCommandStream); dispatchImmediateFlushFrontEndCommand(flushData, device, csrCommandStream); dispatchImmediateFlushStateComputeModeCommand(flushData, csrCommandStream); 
@@ -540,9 +549,7 @@ CompletionStamp CommandStreamReceiverHw::flushTask( } if (requiresInstructionCacheFlush) { - PipeControlArgs args; - args.instructionCacheInvalidateEnable = true; - MemorySynchronizationCommands::addSingleBarrier(commandStreamCSR, args); + MemorySynchronizationCommands::addInstructionCacheFlush(commandStreamCSR); requiresInstructionCacheFlush = false; } @@ -942,7 +949,7 @@ size_t CommandStreamReceiverHw::getRequiredCmdStreamSize(const Dispat } if (requiresInstructionCacheFlush) { - size += MemorySynchronizationCommands::getSizeForSingleBarrier(false); + size += MemorySynchronizationCommands::getSizeForInstructionCacheFlush(); } if (debugManager.flags.ForcePipeControlPriorToWalker.get()) { diff --git a/shared/source/helpers/gfx_core_helper.h b/shared/source/helpers/gfx_core_helper.h index 846e039572..8557f14843 100644 --- a/shared/source/helpers/gfx_core_helper.h +++ b/shared/source/helpers/gfx_core_helper.h @@ -459,6 +459,7 @@ struct MemorySynchronizationCommands { static void addFullCacheFlush(LinearStream &commandStream, const RootDeviceEnvironment &rootDeviceEnvironment); static void setCacheFlushExtraProperties(PipeControlArgs &args); static void addStateCacheFlush(LinearStream &commandStream, const RootDeviceEnvironment &rootDeviceEnvironment); + static void addInstructionCacheFlush(LinearStream &commandStream); static size_t getSizeForBarrierWithPostSyncOperation(const RootDeviceEnvironment &rootDeviceEnvironment, bool tlbInvalidationRequired); static size_t getSizeForBarrierWa(const RootDeviceEnvironment &rootDeviceEnvironment); @@ -466,6 +467,7 @@ struct MemorySynchronizationCommands { static size_t getSizeForSingleAdditionalSynchronizationForDirectSubmission(const RootDeviceEnvironment &rootDeviceEnvironment); static size_t getSizeForSingleAdditionalSynchronization(const RootDeviceEnvironment &rootDeviceEnvironment); static size_t getSizeForAdditonalSynchronization(const RootDeviceEnvironment &rootDeviceEnvironment); + static size_t 
getSizeForInstructionCacheFlush(); static size_t getSizeForFullCacheFlush(); static bool isBarrierWaRequired(const RootDeviceEnvironment &rootDeviceEnvironment); diff --git a/shared/source/helpers/gfx_core_helper_base.inl b/shared/source/helpers/gfx_core_helper_base.inl index 3f437b5d86..f99acd2cd7 100644 --- a/shared/source/helpers/gfx_core_helper_base.inl +++ b/shared/source/helpers/gfx_core_helper_base.inl @@ -528,6 +528,19 @@ void MemorySynchronizationCommands::addStateCacheFlush(LinearStream & *reinterpret_cast(commandsBuffer) = cmd; } +template +size_t MemorySynchronizationCommands::getSizeForInstructionCacheFlush() { + return MemorySynchronizationCommands::getSizeForSingleBarrier(false); +} + +template +void MemorySynchronizationCommands::addInstructionCacheFlush(LinearStream &commandStream) { + PipeControlArgs args; + args.instructionCacheInvalidateEnable = true; + + MemorySynchronizationCommands::addSingleBarrier(commandStream, args); +} + template const StackVec GfxCoreHelperHw::getDeviceSubGroupSizes() const { return {8, 16, 32}; diff --git a/shared/test/common/mocks/mock_command_stream_receiver.h b/shared/test/common/mocks/mock_command_stream_receiver.h index b18d0e7319..ceab2fe496 100644 --- a/shared/test/common/mocks/mock_command_stream_receiver.h +++ b/shared/test/common/mocks/mock_command_stream_receiver.h @@ -49,6 +49,7 @@ class MockCommandStreamReceiver : public CommandStreamReceiver { using CommandStreamReceiver::osContext; using CommandStreamReceiver::ownershipMutex; using CommandStreamReceiver::preemptionAllocation; + using CommandStreamReceiver::requiresInstructionCacheFlush; using CommandStreamReceiver::tagAddress; using CommandStreamReceiver::tagsMultiAllocation; using CommandStreamReceiver::taskCount; @@ -183,6 +184,9 @@ class MockCommandStreamReceiver : public CommandStreamReceiver { GraphicsAllocation *getClearColorAllocation() override { return nullptr; } void makeResident(GraphicsAllocation &gfxAllocation) override { 
makeResidentCalledTimes++; + if (makeResidentParentCall) { + return CommandStreamReceiver::makeResident(gfxAllocation); + } } std::unique_lock obtainHostPtrSurfaceCreationLock() override { @@ -220,6 +224,7 @@ class MockCommandStreamReceiver : public CommandStreamReceiver { bool programHardwareContextCalled = false; bool createPreemptionAllocationReturn = true; bool createPreemptionAllocationParentCall = false; + bool makeResidentParentCall = false; bool programComputeBarrierCommandCalled = false; bool programStallingCommandsForBarrierCalled = false; std::optional isGpuHangDetectedReturnValue{}; diff --git a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp index d12d4e6e8e..3f757c8af2 100644 --- a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp +++ b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp @@ -4729,6 +4729,25 @@ HWTEST_F(CommandStreamReceiverHwTest, GivenDirtyFlagForContextInBindlessHelperWh EXPECT_FALSE(bindlessHeapsHelperPtr->getStateDirtyForContext(commandStreamReceiver.getOsContext().getContextId())); } +HWTEST_F(CommandStreamReceiverHwTest, givenRequiresInstructionCacheFlushWhenFlushImmediateThenInstructionCacheInvalidateEnableIsSent) { + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + commandStreamReceiver.registerInstructionCacheFlush(); + + this->requiredStreamProperties.stateComputeMode.setPropertiesAll(false, GrfConfig::defaultGrfNumber, ThreadArbitrationPolicy::AgeBased, NEO::PreemptionMode::ThreadGroup); + + commandStreamReceiver.flushImmediateTask(commandStream, commandStream.getUsed(), immediateFlushTaskFlags, *pDevice); + + HardwareParse hwParserCsr; + hwParserCsr.parseCommands(commandStreamReceiver.commandStream, 0); + auto pcCmd = hwParserCsr.getCommand(); + ASSERT_NE(nullptr, pcCmd); + + 
EXPECT_TRUE(pcCmd->getInstructionCacheInvalidateEnable()); + EXPECT_FALSE(commandStreamReceiver.requiresInstructionCacheFlush); +} + HWTEST_F(CommandStreamReceiverHwTest, GivenFlushIsBlockingWhenFlushTaskCalledThenExpectMonitorFenceFlagTrue) { auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); commandStreamReceiver.recordFlusheBatchBuffer = true; diff --git a/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp b/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp index 114c463774..1e83bcbbb3 100644 --- a/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp +++ b/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp @@ -1008,6 +1008,25 @@ HWTEST_F(PipeControlHelperTests, WhenProgrammingCacheFlushThenExpectBasicFieldsS EXPECT_TRUE(pipeControl->getTlbInvalidate()); } +HWTEST_F(PipeControlHelperTests, WhenGettingPipeControSizeForInstructionCacheFlushThenReturnCorrectValue) { + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + size_t actualSize = MemorySynchronizationCommands::getSizeForInstructionCacheFlush(); + EXPECT_EQ(sizeof(PIPE_CONTROL), actualSize); +} + +HWTEST_F(PipeControlHelperTests, WhenProgrammingInstructionCacheFlushThenExpectInstructionCacheInvalidateEnableFieldSet) { + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + std::unique_ptr buffer(new uint8_t[128]); + + LinearStream stream(buffer.get(), 128); + MockExecutionEnvironment mockExecutionEnvironment{}; + MemorySynchronizationCommands::addInstructionCacheFlush(stream); + PIPE_CONTROL *pipeControl = genCmdCast(buffer.get()); + ASSERT_NE(nullptr, pipeControl); + + EXPECT_TRUE(pipeControl->getInstructionCacheInvalidateEnable()); +} + using ProductHelperCommonTest = Test; HWTEST2_F(ProductHelperCommonTest, givenBlitterPreferenceWhenEnablingBlitterOperationsSupportThenHonorThePreference, IsAtLeastGen12lp) { diff --git a/shared/test/unit_test/os_interface/windows/device_command_stream_tests.cpp 
b/shared/test/unit_test/os_interface/windows/device_command_stream_tests.cpp index 23ae4264b5..9fe8c6eb40 100644 --- a/shared/test/unit_test/os_interface/windows/device_command_stream_tests.cpp +++ b/shared/test/unit_test/os_interface/windows/device_command_stream_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -74,6 +74,7 @@ struct MockWddmCsr : public WddmCommandStreamReceiver { using CommandStreamReceiver::dispatchMode; using CommandStreamReceiver::getCS; using CommandStreamReceiver::globalFenceAllocation; + using CommandStreamReceiver::requiresInstructionCacheFlush; using CommandStreamReceiver::useGpuIdleImplicitFlush; using CommandStreamReceiver::useNewResourceImplicitFlush; using CommandStreamReceiverHw::blitterDirectSubmission;