From 09ef0201c6030129df15daeb92f0ea8f94b2bfae Mon Sep 17 00:00:00 2001 From: Zbigniew Zdanowicz Date: Fri, 21 Apr 2023 20:51:02 +0000 Subject: [PATCH] fix: correctly assign state transition when same command list executed twice Single command list object can be passed multiple times to the execution command list. Not all command list instances might require dynamic preamble, as it depends what state is before particular command list instance. Correctly assign the particular instance of command list to state transition. Related-To: NEO-7828 Signed-off-by: Zbigniew Zdanowicz --- level_zero/core/source/cmdqueue/cmdqueue_hw.h | 8 +-- .../core/source/cmdqueue/cmdqueue_hw.inl | 34 +++++----- .../core/source/cmdqueue/cmdqueue_imp.h | 11 ++-- .../unit_tests/fixtures/cmdlist_fixture.cpp | 18 ++++++ .../unit_tests/fixtures/cmdlist_fixture.h | 8 +++ .../sources/cmdlist/test_cmdlist_2.cpp | 63 +++++++++++++++++++ 6 files changed, 115 insertions(+), 27 deletions(-) diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.h b/level_zero/core/source/cmdqueue/cmdqueue_hw.h index 6221b320be..868d55f143 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.h +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.h @@ -184,8 +184,7 @@ struct CommandQueueHw : public CommandQueueImp { NEO::StreamProperties &requiredState, bool &propertyDirty); inline size_t estimatePipelineSelectCmdSize(); - inline void programOneCmdListPipelineSelect(CommandList *commandList, - NEO::LinearStream &commandStream, + inline void programOneCmdListPipelineSelect(NEO::LinearStream &commandStream, CommandListRequiredStateChange &cmdListRequired); inline size_t estimateScmCmdSizeForMultipleCommandLists(NEO::StreamProperties &csrState, @@ -195,8 +194,7 @@ struct CommandQueueHw : public CommandQueueImp { NEO::StreamProperties &requiredState, bool &propertyDirty); - inline void programRequiredStateComputeModeForCommandList(CommandList *commandList, - NEO::LinearStream &commandStream, + inline void programRequiredStateComputeModeForCommandList(NEO::LinearStream &commandStream, CommandListRequiredStateChange &cmdListRequired); inline size_t estimateStateBaseAddressCmdDispatchSize(bool bindingTableBaseAddress); @@ -223,8 +221,6 @@ struct CommandQueueHw : public CommandQueueImp { inline void programRequiredStateBaseAddressForCommandList(CommandListExecutionContext &ctx, NEO::LinearStream &commandStream, - NEO::HeapAddressModel commandListHeapAddressModel, - bool indirectHeapInLocalMemory, CommandListRequiredStateChange &cmdListRequired); inline void updateBaseAddressState(CommandList *lastCommandList); diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl index a947636bfb..5acb3e5633 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl @@ -158,8 +158,8 @@ ze_result_t CommandQueueHw::executeCommandListsRegular( if (this->stateChanges.size() > 0) { auto &firstCmdListWithStateChange = this->stateChanges[0]; // check first required state change is for the first command list - if (firstCmdListWithStateChange.commandList == ctx.firstCommandList && firstCmdListWithStateChange.flags.propertyPsDirty) { - this->programOneCmdListPipelineSelect(ctx.firstCommandList, child, firstCmdListWithStateChange); + if (firstCmdListWithStateChange.cmdListIndex == 0 && firstCmdListWithStateChange.flags.propertyPsDirty) { + this->programOneCmdListPipelineSelect(child, firstCmdListWithStateChange); firstCmdListWithStateChange.flags.propertyPsDirty = false; } } @@ -187,12 +187,13 @@ ze_result_t CommandQueueHw::executeCommandListsRegular( if (this->stateChanges.size() > this->currentStateChangeIndex) { auto &stateChange = this->stateChanges[this->currentStateChangeIndex]; - if (stateChange.commandList == commandList) { + if (stateChange.cmdListIndex == i) { + DEBUG_BREAK_IF(commandList != stateChange.commandList); this->updateOneCmdListPreemptionModeAndCtxStatePreemption(child, stateChange); - this->programOneCmdListPipelineSelect(commandList, child, stateChange); + this->programOneCmdListPipelineSelect(child, stateChange); this->programOneCmdListFrontEndIfDirty(ctx, child, stateChange); - this->programRequiredStateComputeModeForCommandList(commandList, child, stateChange); - this->programRequiredStateBaseAddressForCommandList(ctx, child, commandList->getCmdListHeapAddressModel(), commandList->getCmdContainer().isIndirectHeapInLocalMemory(), stateChange); + this->programRequiredStateComputeModeForCommandList(child, stateChange); + this->programRequiredStateBaseAddressForCommandList(ctx, child, stateChange); this->currentStateChangeIndex++; } @@ -756,7 +757,7 @@ size_t CommandQueueHw::estimateLinearStreamSizeComplementary( if (propertyScmDirty || propertyFeDirty || propertyPsDirty || propertySbaDirty || frontEndReturnPoint || propertyPreemptionDirty) { CommandListDirtyFlags dirtyFlags = {propertyScmDirty, propertyFeDirty, propertyPsDirty, propertySbaDirty, frontEndReturnPoint, propertyPreemptionDirty}; - this->stateChanges.emplace_back(stagingState, cmdList, dirtyFlags, ctx.statePreemption); + this->stateChanges.emplace_back(stagingState, cmdList, dirtyFlags, ctx.statePreemption, i); } } @@ -1278,7 +1279,7 @@ size_t CommandQueueHw::estimatePipelineSelectCmdSizeForMultipleCo } template -void CommandQueueHw::programOneCmdListPipelineSelect(CommandList *commandList, NEO::LinearStream &commandStream, +void CommandQueueHw::programOneCmdListPipelineSelect(NEO::LinearStream &commandStream, CommandListRequiredStateChange &cmdListRequired) { if (!this->pipelineSelectStateTracking) { return; @@ -1290,7 +1291,7 @@ void CommandQueueHw::programOneCmdListPipelineSelect(CommandList systolic, false, false, - commandList->getSystolicModeSupport()}; + cmdListRequired.commandList->getSystolicModeSupport()}; NEO::PreambleHelper::programPipelineSelect(&commandStream, args, device->getNEODevice()->getRootDeviceEnvironment()); csr->setPreambleSetFlag(true); @@ -1337,8 +1338,7 @@ size_t CommandQueueHw::estimateScmCmdSizeForMultipleCommandLists( } template -void CommandQueueHw::programRequiredStateComputeModeForCommandList(CommandList *commandList, - NEO::LinearStream &commandStream, +void CommandQueueHw::programRequiredStateComputeModeForCommandList(NEO::LinearStream &commandStream, CommandListRequiredStateChange &cmdListRequired) { if (!this->stateComputeModeTracking) { return; @@ -1349,11 +1349,11 @@ void CommandQueueHw::programRequiredStateComputeModeForCommandLis cmdListRequired.requiredState.pipelineSelect.systolicMode.value == 1, false, false, - commandList->getSystolicModeSupport()}; + cmdListRequired.commandList->getSystolicModeSupport()}; - bool isRcs = this->getCsr()->isRcs(); NEO::EncodeComputeMode::programComputeModeCommandWithSynchronization(commandStream, cmdListRequired.requiredState.stateComputeMode, pipelineSelectArgs, - false, device->getNEODevice()->getRootDeviceEnvironment(), isRcs, this->getCsr()->getDcFlushSupport(), nullptr); + false, device->getNEODevice()->getRootDeviceEnvironment(), this->csr->isRcs(), + this->csr->getDcFlushSupport(), nullptr); this->csr->setStateComputeModeDirty(false); } } @@ -1361,14 +1361,14 @@ void CommandQueueHw::programRequiredStateComputeModeForCommandLis template void CommandQueueHw::programRequiredStateBaseAddressForCommandList(CommandListExecutionContext &ctx, NEO::LinearStream &commandStream, - NEO::HeapAddressModel commandListHeapAddressModel, - bool indirectHeapInLocalMemory, CommandListRequiredStateChange &cmdListRequired) { if (!this->stateBaseAddressTracking) { return; } + if (cmdListRequired.flags.propertySbaDirty) { + bool indirectHeapInLocalMemory = cmdListRequired.commandList->getCmdContainer().isIndirectHeapInLocalMemory(); programStateBaseAddress(ctx.scratchGsba, indirectHeapInLocalMemory, commandStream, @@ -1439,7 +1439,7 @@ size_t CommandQueueHw::estimateStateBaseAddressCmdSizeForGlobalSt if (baseAddressStateDirty) { csrState.stateBaseAddress.copyPropertiesAll(cmdListRequired.stateBaseAddress); } else { - csrState.stateBaseAddress.copyPropertiesStatelessMocs(cmdListFinal.stateBaseAddress); + csrState.stateBaseAddress.copyPropertiesStatelessMocs(cmdListRequired.stateBaseAddress); } csrState.stateBaseAddress.setPropertiesSurfaceState(globalStatelessHeap->getHeapGpuBase(), globalStatelessHeap->getHeapSizeInPages()); diff --git a/level_zero/core/source/cmdqueue/cmdqueue_imp.h b/level_zero/core/source/cmdqueue/cmdqueue_imp.h index 26f1563140..a4483bacf1 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_imp.h +++ b/level_zero/core/source/cmdqueue/cmdqueue_imp.h @@ -112,14 +112,17 @@ struct CommandQueueImp : public CommandQueue { CommandListRequiredStateChange() = default; CommandListRequiredStateChange(NEO::StreamProperties &requiredState, CommandList *commandList, CommandListDirtyFlags flags, - NEO::PreemptionMode newMode) : requiredState(requiredState), - commandList(commandList), - flags(flags), - newMode(newMode) {} + NEO::PreemptionMode newMode, + uint32_t cmdListIndex) : requiredState(requiredState), + commandList(commandList), + flags(flags), + newMode(newMode), + cmdListIndex(cmdListIndex) {} NEO::StreamProperties requiredState{}; CommandList *commandList = nullptr; CommandListDirtyFlags flags; NEO::PreemptionMode newMode = NEO::PreemptionMode::Initial; + uint32_t cmdListIndex = 0; }; using CommandListStateChangeList = std::vector; diff --git a/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.cpp b/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.cpp index 2a24e053ba..395b8331a3 100644 --- a/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.cpp +++ b/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.cpp @@ -339,5 +339,23 @@ void PrimaryBatchBufferCmdListFixture::setUp() { ModuleMutableCommandListFixture::setUp(); } +void PrimaryBatchBufferPreamblelessCmdListFixture::setUp() { + DebugManager.flags.SelectCmdListHeapAddressModel.set(static_cast(NEO::HeapAddressModel::GlobalStateless)); + DebugManager.flags.ForceL1Caching.set(0); + + PrimaryBatchBufferCmdListFixture::setUp(); + + ze_result_t returnValue; + commandList2.reset(whiteboxCast(CommandList::create(productFamily, device, engineGroupType, 0u, returnValue))); + commandList3.reset(whiteboxCast(CommandList::create(productFamily, device, engineGroupType, 0u, returnValue))); +} + +void PrimaryBatchBufferPreamblelessCmdListFixture::tearDown() { + commandList2.reset(nullptr); + commandList3.reset(nullptr); + + PrimaryBatchBufferCmdListFixture::tearDown(); +} + } // namespace ult } // namespace L0 diff --git a/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.h b/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.h index 19a8e4278a..6ec423aac3 100644 --- a/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.h +++ b/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.h @@ -287,5 +287,13 @@ struct PrimaryBatchBufferCmdListFixture : public ModuleMutableCommandListFixture void setUp(); }; +struct PrimaryBatchBufferPreamblelessCmdListFixture : public PrimaryBatchBufferCmdListFixture { + void setUp(); + void tearDown(); + + std::unique_ptr commandList2; + std::unique_ptr commandList3; +}; + } // namespace ult } // namespace L0 diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp index 26a307640f..a20795a3da 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp @@ -9,6 +9,7 @@ #include "shared/source/helpers/blit_commands_helper.h" #include "shared/source/helpers/gfx_core_helper.h" #include "shared/test/common/cmd_parse/gen_cmd_parse.h" +#include "shared/test/common/libult/ult_command_stream_receiver.h" #include "shared/test/common/mocks/mock_graphics_allocation.h" #include "shared/test/common/test_macros/hw_test.h" @@ -1529,5 +1530,67 @@ HWTEST_F(PrimaryBatchBufferCmdListTest, givenPrimaryBatchBufferWhenCommandListHa EXPECT_EQ(expectedEndPtr, cmdContainer.getEndCmdPtr()); } +using PrimaryBatchBufferPreamblelessCmdListTest = Test; + +HWTEST2_F(PrimaryBatchBufferPreamblelessCmdListTest, + givenPrimaryBatchBufferWhenExecutingSingleCommandListTwiceInSingleCallAndFirstTimeNotExpectsPreambleThenProperlyDispatchPreambleForSecondInstance, + IsAtLeastXeHpCore) { + using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; + using STATE_BASE_ADDRESS = typename FamilyType::STATE_BASE_ADDRESS; + + // command list 1 will have two kernels, transition from cached MOCS to uncached MOCS state + ze_group_count_t groupCount{1, 1, 1}; + CmdListKernelLaunchParams launchParams = {}; + ze_result_t result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + kernel->kernelRequiresUncachedMocsCount++; + + result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + result = commandList->close(); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + // command list 2 will have two kernels, transition from uncached MOCS to cached MOCS state + result = commandList2->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + kernel->kernelRequiresUncachedMocsCount--; + + result = commandList2->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + result = commandList2->close(); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + // first command list settles global init and leaves state as uncached MOCS + auto commandListHandle = commandList->toHandle(); + result = commandQueue->executeCommandLists(1, &commandListHandle, nullptr, true); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + auto &cmdQueueStream = commandQueue->commandStream; + auto queueSizeUsed = cmdQueueStream.getUsed(); + + ze_command_list_handle_t sameCommandListTwice[] = {commandList2->toHandle(), commandList2->toHandle()}; + // second command list requires uncached MOCS state, so no dynamic preamble for the fist instance, but leaves cached MOCS state + // second instance require dynamic preamble to reload MOCS to uncached state + result = commandQueue->executeCommandLists(2, sameCommandListTwice, nullptr, true); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, + ptrOffset(cmdQueueStream.getCpuBase(), queueSizeUsed), + cmdQueueStream.getUsed() - queueSizeUsed)); + + auto cmdQueueSbaDirtyCmds = findAll(cmdList.begin(), cmdList.end()); + ASSERT_TRUE(cmdQueueSbaDirtyCmds.size() >= 1u); + auto sbaCmd = reinterpret_cast(*cmdQueueSbaDirtyCmds[0]); + + auto uncachedMocs = device->getMOCS(false, false) >> 1; + EXPECT_EQ((uncachedMocs << 1), sbaCmd->getStatelessDataPortAccessMemoryObjectControlState()); +} + } // namespace ult } // namespace L0