diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h index 3b6f67d1d9..eca604e99b 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw.h @@ -169,7 +169,7 @@ struct CommandListCoreFamily : CommandListImp { ze_result_t appendSignalEvent(ze_event_handle_t hEvent) override; ze_result_t appendWaitOnEvents(uint32_t numEvents, ze_event_handle_t *phEvent, bool relaxedOrderingAllowed, bool trackDependencies, bool signalInOrderCompletion) override; - void appendWaitOnInOrderDependency(NEO::GraphicsAllocation *dependencyCounterAllocation, uint64_t waitValue, uint32_t offset, bool relaxedOrderingAllowed); + void appendWaitOnInOrderDependency(NEO::GraphicsAllocation *dependencyCounterAllocation, uint64_t waitValue, uint32_t offset, bool relaxedOrderingAllowed, bool implicitDependency); void appendSignalInOrderDependencyCounter(); ze_result_t appendWriteGlobalTimestamp(uint64_t *dstptr, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override; @@ -186,6 +186,7 @@ struct CommandListCoreFamily : CommandListImp { ze_result_t executeCommandListImmediate(bool performMigration) override; ze_result_t executeCommandListImmediateImpl(bool performMigration, L0::CommandQueue *cmdQImmediate); size_t getReserveSshSize(); + void patchInOrderCmds() override; protected: MOCKABLE_VIRTUAL ze_result_t appendMemoryCopyKernelWithGA(void *dstPtr, NEO::GraphicsAllocation *dstPtrAlloc, @@ -330,6 +331,10 @@ struct CommandListCoreFamily : CommandListImp { void handleInOrderImplicitDependencies(bool relaxedOrderingAllowed); virtual void handleInOrderDependencyCounter(); bool isQwordInOrderCounter() const { return GfxFamily::isQwordInOrderCounter; } + + void addCmdForPatching(void *cmd, uint64_t counterValue, InOrderPatchCommandTypes::CmdType cmdType); + + InOrderPatchCommandsContainer inOrderPatchCmds; }; template diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index a4a1d6fbd3..5f9f4cb716 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -151,7 +151,7 @@ ze_result_t CommandListCoreFamily::reset() { template void CommandListCoreFamily::handleInOrderDependencyCounter() { if (!isQwordInOrderCounter() && ((inOrderDependencyCounter + 1) == std::numeric_limits::max())) { - CommandListCoreFamily::appendWaitOnInOrderDependency(inOrderDependencyCounterAllocation, inOrderDependencyCounter + 1, inOrderAllocationOffset, false); + CommandListCoreFamily::appendWaitOnInOrderDependency(inOrderDependencyCounterAllocation, inOrderDependencyCounter + 1, inOrderAllocationOffset, false, true); inOrderDependencyCounter = 0; @@ -2226,7 +2226,7 @@ void CommandListCoreFamily::handleInOrderImplicitDependencies(boo NEO::RelaxedOrderingHelper::encodeRegistersBeforeDependencyCheckers(*commandContainer.getCommandStream()); } - CommandListCoreFamily::appendWaitOnInOrderDependency(this->inOrderDependencyCounterAllocation, this->inOrderDependencyCounter, this->inOrderAllocationOffset, relaxedOrderingAllowed); + CommandListCoreFamily::appendWaitOnInOrderDependency(this->inOrderDependencyCounterAllocation, this->inOrderDependencyCounter, this->inOrderAllocationOffset, relaxedOrderingAllowed, true); } } @@ -2293,7 +2293,7 @@ ze_result_t CommandListCoreFamily::appendSignalEvent(ze_event_han } template -void CommandListCoreFamily::appendWaitOnInOrderDependency(NEO::GraphicsAllocation *dependencyCounterAllocation, uint64_t waitValue, uint32_t offset, bool relaxedOrderingAllowed) { +void CommandListCoreFamily::appendWaitOnInOrderDependency(NEO::GraphicsAllocation *dependencyCounterAllocation, uint64_t waitValue, uint32_t offset, bool relaxedOrderingAllowed, bool implicitDependency) { using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION; UNRECOVERABLE_IF(waitValue > std::numeric_limits::max()); @@ -2307,9 +2307,16 @@ void CommandListCoreFamily::appendWaitOnInOrderDependency(NEO::Gr NEO::EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(*commandContainer.getCommandStream(), 0, gpuAddress, waitValue, NEO::CompareOperation::Less, true, isQwordInOrderCounter()); } else { - NEO::EncodeSemaphore::addMiSemaphoreWaitCommand(*commandContainer.getCommandStream(), - gpuAddress, waitValue, - COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, false, isQwordInOrderCounter(), false); + using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT; + + auto semaphoreCommand = reinterpret_cast(commandContainer.getCommandStream()->getSpace(sizeof(MI_SEMAPHORE_WAIT))); + + NEO::EncodeSemaphore::programMiSemaphoreWait(semaphoreCommand, gpuAddress, waitValue, COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, + false, true, isQwordInOrderCounter(), false); + + if (implicitDependency) { + addCmdForPatching(semaphoreCommand, waitValue, InOrderPatchCommandTypes::CmdType::Semaphore); + } } gpuAddress += sizeof(uint64_t); @@ -2376,7 +2383,7 @@ ze_result_t CommandListCoreFamily::appendWaitOnEvents(uint32_t nu return ZE_RESULT_ERROR_INVALID_ARGUMENT; // in-order event not signaled yet } if (isInOrderEventWaitRequired(*event)) { - CommandListCoreFamily::appendWaitOnInOrderDependency(event->getInOrderExecDataAllocation(), event->getInOrderExecSignalValue(), event->getInOrderAllocationOffset(), relaxedOrderingAllowed); + CommandListCoreFamily::appendWaitOnInOrderDependency(event->getInOrderExecDataAllocation(), event->getInOrderExecSignalValue(), event->getInOrderAllocationOffset(), relaxedOrderingAllowed, false); } continue; } @@ -2413,12 +2420,18 @@ ze_result_t CommandListCoreFamily::appendWaitOnEvents(uint32_t nu template void CommandListCoreFamily::appendSignalInOrderDependencyCounter() { + using MI_STORE_DATA_IMM = typename GfxFamily::MI_STORE_DATA_IMM; + uint64_t signalValue = this->inOrderDependencyCounter + 1; uint64_t gpuVa = this->inOrderDependencyCounterAllocation->getGpuAddress() + this->inOrderAllocationOffset; - NEO::EncodeStoreMemory::programStoreDataImm(*commandContainer.getCommandStream(), gpuVa, - getLowPart(signalValue), getHighPart(signalValue), isQwordInOrderCounter(), (this->partitionCount > 1)); + auto miStoreCmd = reinterpret_cast(commandContainer.getCommandStream()->getSpace(sizeof(MI_STORE_DATA_IMM))); + + NEO::EncodeStoreMemory::programStoreDataImm(miStoreCmd, gpuVa, getLowPart(signalValue), getHighPart(signalValue), + isQwordInOrderCounter(), (this->partitionCount > 1)); + + addCmdForPatching(miStoreCmd, signalValue, InOrderPatchCommandTypes::CmdType::Sdi); if (NEO::EncodeUserInterruptHelper::isOperationAllowed(NEO::EncodeUserInterruptHelper::onSignalingFenceMask)) { NEO::EnodeUserInterrupt::encode(*commandContainer.getCommandStream()); @@ -3439,4 +3452,21 @@ void CommandListCoreFamily::appendWaitOnSingleEvent(Event *event, } } +template +void CommandListCoreFamily::addCmdForPatching(void *cmd, uint64_t counterValue, InOrderPatchCommandTypes::CmdType cmdType) { + if ((NEO::DebugManager.flags.EnableInOrderRegularCmdListPatching.get() == 1) && (this->cmdListType == TYPE_REGULAR)) { + this->inOrderPatchCmds.emplace_back(cmd, counterValue, cmdType); + } +} + +template +void CommandListCoreFamily::patchInOrderCmds() { + if (this->regularCmdListSubmissionCounter > 0) { + auto appendCounter = this->regularCmdListSubmissionCounter * inOrderDependencyCounter; + + for (auto &cmd : inOrderPatchCmds) { + cmd.patch(appendCounter); + } + } +} } // namespace L0 diff --git a/level_zero/core/source/cmdlist/cmdlist_imp.h b/level_zero/core/source/cmdlist/cmdlist_imp.h index f5453ec79a..cd6433ef70 100644 --- a/level_zero/core/source/cmdlist/cmdlist_imp.h +++ b/level_zero/core/source/cmdlist/cmdlist_imp.h @@ -9,6 +9,7 @@ #include "shared/source/os_interface/os_time.h" #include "level_zero/core/source/cmdlist/cmdlist.h" +#include "level_zero/core/source/helpers/in_order_patch_cmds.h" #include @@ -35,6 +36,7 @@ struct CommandListImp : CommandList { void addToMappedEventList(Event *event); const std::vector &peekMappedEventList() { return mappedTsEventList; } void incRegularCmdListSubmissionCounter() { regularCmdListSubmissionCounter++; } + virtual void patchInOrderCmds() = 0; protected: NEO::GraphicsAllocation *inOrderDependencyCounterAllocation = nullptr; diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl index 45b377eb67..090e3fc66f 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl @@ -540,6 +540,7 @@ void CommandQueueHw::setupCmdListsAndContextParams( auto commandList = static_cast(CommandList::fromHandle(phCommandLists[i])); commandList->setCsr(this->csr); commandList->storeReferenceTsToMappedEvents(false); + commandList->patchInOrderCmds(); commandList->incRegularCmdListSubmissionCounter(); auto &commandContainer = commandList->getCmdContainer(); diff --git a/level_zero/core/source/helpers/CMakeLists.txt b/level_zero/core/source/helpers/CMakeLists.txt index 116bf74008..a9cbeef44a 100644 --- a/level_zero/core/source/helpers/CMakeLists.txt +++ b/level_zero/core/source/helpers/CMakeLists.txt @@ -11,6 +11,7 @@ target_sources(${L0_STATIC_LIB_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/error_code_helper_l0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/error_code_helper_l0.h ${CMAKE_CURRENT_SOURCE_DIR}/implicit_scaling_l0.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/in_order_patch_cmds.h ${CMAKE_CURRENT_SOURCE_DIR}/l0_gfx_core_helper_factory_init.inl ${CMAKE_CURRENT_SOURCE_DIR}/l0_populate_factory.h ${CMAKE_CURRENT_SOURCE_DIR}/properties_parser.h diff --git a/level_zero/core/source/helpers/in_order_patch_cmds.h b/level_zero/core/source/helpers/in_order_patch_cmds.h new file mode 100644 index 0000000000..c7ab90ac1d --- /dev/null +++ b/level_zero/core/source/helpers/in_order_patch_cmds.h @@ -0,0 +1,59 @@ +/* + * Copyright (C) 2023 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#pragma once + +#include "shared/source/helpers/ptr_math.h" + +#include +#include + +namespace L0 { +namespace InOrderPatchCommandTypes { +enum class CmdType { + Sdi, + Semaphore +}; + +template +struct BaseCmd { + BaseCmd(void *cmd, uint64_t baseCounterValue, CmdType cmdType) : cmd(cmd), baseCounterValue(baseCounterValue), cmdType(cmdType) {} + + void patch(uint64_t appendCunterValue) { + if (CmdType::Sdi == cmdType) { + patchSdi(appendCunterValue); + } else { + UNRECOVERABLE_IF(CmdType::Semaphore != cmdType); + patchSemaphore(appendCunterValue); + } + } + + void *cmd = nullptr; + const uint64_t baseCounterValue = 0; + const CmdType cmdType; + + protected: + void patchSdi(uint64_t appendCunterValue) { + auto sdiCmd = reinterpret_cast(cmd); + sdiCmd->setDataDword0(getLowPart(baseCounterValue + appendCunterValue)); + sdiCmd->setDataDword1(getHighPart(baseCounterValue + appendCunterValue)); + } + + void patchSemaphore(uint64_t appendCunterValue) { + auto semaphoreCmd = reinterpret_cast(cmd); + semaphoreCmd->setSemaphoreDataDword(static_cast(baseCounterValue + appendCunterValue)); + } + + BaseCmd() = delete; +}; + +} // namespace InOrderPatchCommandTypes + +template +using InOrderPatchCommandsContainer = std::vector>; + +} // namespace L0 diff --git a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h index 3953014a8d..9537c93881 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h +++ b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h @@ -79,6 +79,7 @@ struct WhiteBox<::L0::CommandListCoreFamily> using BaseClass::inOrderAllocationOffset; using BaseClass::inOrderDependencyCounter; using BaseClass::inOrderDependencyCounterAllocation; + using BaseClass::inOrderPatchCmds; using BaseClass::isFlushTaskSubmissionEnabled; using BaseClass::isQwordInOrderCounter; using BaseClass::isRelaxedOrderingDispatchAllowed; @@ -172,6 +173,7 @@ struct WhiteBox> using BaseClass::immediateCmdListHeapSharing; using BaseClass::inOrderDependencyCounter; using BaseClass::inOrderDependencyCounterAllocation; + using BaseClass::inOrderPatchCmds; using BaseClass::isBcsSplitNeeded; using BaseClass::isFlushTaskSubmissionEnabled; using BaseClass::isQwordInOrderCounter; @@ -269,6 +271,7 @@ struct MockCommandList : public CommandList { ADDMETHOD_NOBASE(close, ze_result_t, ZE_RESULT_SUCCESS, ()); ADDMETHOD_NOBASE(destroy, ze_result_t, ZE_RESULT_SUCCESS, ()); + ADDMETHOD_NOBASE_VOIDRETURN(patchInOrderCmds, (void)); ADDMETHOD_NOBASE(appendLaunchKernel, ze_result_t, ZE_RESULT_SUCCESS, (ze_kernel_handle_t kernelHandle, diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp index 5b06620163..977fa05519 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp @@ -2737,6 +2737,18 @@ HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenDoingCpuCopyThenSynchronize, context->freeMem(deviceAlloc); } +HWTEST2_F(InOrderCmdListTests, givenDebugFlagSetWhenUsingImmediateCmdListThenDontAddCmdsToPatch, IsAtLeastXeHpCore) { + DebugManager.flags.EnableInOrderRegularCmdListPatching.set(1); + + auto immCmdList = createCopyOnlyImmCmdList(); + + uint32_t copyData = 0; + + immCmdList->appendMemoryCopy(©Data, ©Data, 1, nullptr, 0, nullptr, false, false); + + EXPECT_EQ(0u, immCmdList->inOrderPatchCmds.size()); +} + HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenGpuHangDetectedInCpuCopyPathThenReportError, IsAtLeastXeHpCore) { auto immCmdList = createImmCmdList(); immCmdList->copyThroughLockedPtrEnabled = true; @@ -3216,6 +3228,99 @@ HWTEST2_F(InOrderRegularCmdListTests, givenInOrderFlagWhenCreatingCmdListThenEna EXPECT_EQ(ZE_RESULT_SUCCESS, zeCommandListDestroy(cmdList)); } +HWTEST2_F(InOrderRegularCmdListTests, givenDebugFlagSetWhenUsingRegularCmdListThenAddCmdsToPatch, IsAtLeastXeHpCore) { + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM; + + DebugManager.flags.EnableInOrderRegularCmdListPatching.set(1); + + ze_command_queue_desc_t desc = {}; + + auto mockCmdQHw = makeZeUniquePtr>(device, device->getNEODevice()->getDefaultEngine().commandStreamReceiver, &desc); + mockCmdQHw->initialize(true, false, false); + auto regularCmdList = createRegularCmdList(true); + + auto cmdStream = regularCmdList->getCmdContainer().getCommandStream(); + + size_t offset = cmdStream->getUsed(); + + uint32_t copyData = 0; + + regularCmdList->appendMemoryCopy(©Data, ©Data, 1, nullptr, 0, nullptr, false, false); + + EXPECT_EQ(1u, regularCmdList->inOrderPatchCmds.size()); // SDI + + auto sdiFromContainer1 = reinterpret_cast(regularCmdList->inOrderPatchCmds[0].cmd); + MI_STORE_DATA_IMM *sdiFromParser1 = nullptr; + + { + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, + ptrOffset(cmdStream->getCpuBase(), offset), + (cmdStream->getUsed() - offset))); + + auto itor = find(cmdList.begin(), cmdList.end()); + ASSERT_NE(cmdList.end(), itor); + + sdiFromParser1 = genCmdCast(*itor); + } + + offset = cmdStream->getUsed(); + regularCmdList->appendMemoryCopy(©Data, ©Data, 1, nullptr, 0, nullptr, false, false); + ASSERT_EQ(3u, regularCmdList->inOrderPatchCmds.size()); // SDI + Semaphore + SDI + + auto semaphoreFromContainer2 = reinterpret_cast(regularCmdList->inOrderPatchCmds[1].cmd); + MI_SEMAPHORE_WAIT *semaphoreFromParser2 = nullptr; + + auto sdiFromContainer2 = reinterpret_cast(regularCmdList->inOrderPatchCmds[2].cmd); + MI_STORE_DATA_IMM *sdiFromParser2 = nullptr; + + { + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, + ptrOffset(cmdStream->getCpuBase(), offset), + (cmdStream->getUsed() - offset))); + + auto semaphoreItor = find(cmdList.begin(), cmdList.end()); + ASSERT_NE(cmdList.end(), semaphoreItor); + + semaphoreFromParser2 = genCmdCast(*semaphoreItor); + + auto sdiItor = find(semaphoreItor, cmdList.end()); + ASSERT_NE(cmdList.end(), sdiItor); + + sdiFromParser2 = genCmdCast(*sdiItor); + } + + EXPECT_EQ(2u, regularCmdList->inOrderDependencyCounter); + + auto verifyPatching = [&](uint64_t executionCounter) { + auto appendValue = regularCmdList->inOrderDependencyCounter * executionCounter; + + EXPECT_EQ(1u + appendValue, sdiFromContainer1->getDataDword0()); + EXPECT_EQ(1u + appendValue, sdiFromParser1->getDataDword0()); + + EXPECT_EQ(1u + appendValue, semaphoreFromContainer2->getSemaphoreDataDword()); + EXPECT_EQ(1u + appendValue, semaphoreFromParser2->getSemaphoreDataDword()); + + EXPECT_EQ(2u + appendValue, sdiFromContainer2->getDataDword0()); + EXPECT_EQ(2u + appendValue, sdiFromParser2->getDataDword0()); + }; + + regularCmdList->close(); + + auto handle = regularCmdList->toHandle(); + + mockCmdQHw->executeCommandLists(1, &handle, nullptr, false); + verifyPatching(0); + + mockCmdQHw->executeCommandLists(1, &handle, nullptr, false); + verifyPatching(1); + + mockCmdQHw->executeCommandLists(1, &handle, nullptr, false); + verifyPatching(2); +} + HWTEST2_F(InOrderRegularCmdListTests, givenInOrderModeWhenDispatchingRegularCmdListThenProgramPipeControlsToHandleDependencies, IsAtLeastXeHpCore) { using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index 43704dadae..4c4805cb40 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -253,6 +253,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, OverrideDriverVersion, -1, "-1: default, >=0: Us DECLARE_DEBUG_VARIABLE(int32_t, WaitForUserFenceOnEventHostSynchronize, -1, "-1: default, 0: Disabled, 1: Enabled. If enabled, use WaitUserFence KMD call for in-order Events instead of active polling on host.") DECLARE_DEBUG_VARIABLE(int32_t, DisableSystemPointerKernelArgument, -1, "-1: default, 0: Disabled, 1: using a system pointer for kernel argument returns an error.") DECLARE_DEBUG_VARIABLE(int32_t, ProgramUserInterruptOnResolvedDependency, -1, "-1: default, 0: Disabled, >=1: bitfield. 01b: program after semaphore, 10b: on signaling fence (non-walker append).") +DECLARE_DEBUG_VARIABLE(int32_t, EnableInOrderRegularCmdListPatching, -1, "-1: default, 0: Disabled, 1: If set, patch counter value on execute call") /*LOGGING FLAGS*/ DECLARE_DEBUG_VARIABLE(int32_t, PrintDriverDiagnostics, -1, "prints driver diagnostics messages to standard output, value corresponds to hint level") diff --git a/shared/test/common/test_files/igdrcl.config b/shared/test/common/test_files/igdrcl.config index 5793b123f8..3812ef5848 100644 --- a/shared/test/common/test_files/igdrcl.config +++ b/shared/test/common/test_files/igdrcl.config @@ -548,5 +548,6 @@ WaitForUserFenceOnEventHostSynchronize = -1 ProgramUserInterruptOnResolvedDependency = -1 DisableSystemPointerKernelArgument = -1 DoNotValidateDriverPath = 0 +EnableInOrderRegularCmdListPatching = -1 ForceInOrderEvents = -1 # Please don't edit below this line