diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.h b/level_zero/core/source/cmdqueue/cmdqueue_hw.h index 6221b320be..868d55f143 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.h +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.h @@ -184,8 +184,7 @@ struct CommandQueueHw : public CommandQueueImp { NEO::StreamProperties &requiredState, bool &propertyDirty); inline size_t estimatePipelineSelectCmdSize(); - inline void programOneCmdListPipelineSelect(CommandList *commandList, - NEO::LinearStream &commandStream, + inline void programOneCmdListPipelineSelect(NEO::LinearStream &commandStream, CommandListRequiredStateChange &cmdListRequired); inline size_t estimateScmCmdSizeForMultipleCommandLists(NEO::StreamProperties &csrState, @@ -195,8 +194,7 @@ struct CommandQueueHw : public CommandQueueImp { NEO::StreamProperties &requiredState, bool &propertyDirty); - inline void programRequiredStateComputeModeForCommandList(CommandList *commandList, - NEO::LinearStream &commandStream, + inline void programRequiredStateComputeModeForCommandList(NEO::LinearStream &commandStream, CommandListRequiredStateChange &cmdListRequired); inline size_t estimateStateBaseAddressCmdDispatchSize(bool bindingTableBaseAddress); @@ -223,8 +221,6 @@ struct CommandQueueHw : public CommandQueueImp { inline void programRequiredStateBaseAddressForCommandList(CommandListExecutionContext &ctx, NEO::LinearStream &commandStream, - NEO::HeapAddressModel commandListHeapAddressModel, - bool indirectHeapInLocalMemory, CommandListRequiredStateChange &cmdListRequired); inline void updateBaseAddressState(CommandList *lastCommandList); diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl index a947636bfb..5acb3e5633 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl @@ -158,8 +158,8 @@ ze_result_t CommandQueueHw::executeCommandListsRegular( if (this->stateChanges.size() > 0) { auto &firstCmdListWithStateChange = this->stateChanges[0]; // check first required state change is for the first command list - if (firstCmdListWithStateChange.commandList == ctx.firstCommandList && firstCmdListWithStateChange.flags.propertyPsDirty) { - this->programOneCmdListPipelineSelect(ctx.firstCommandList, child, firstCmdListWithStateChange); + if (firstCmdListWithStateChange.cmdListIndex == 0 && firstCmdListWithStateChange.flags.propertyPsDirty) { + this->programOneCmdListPipelineSelect(child, firstCmdListWithStateChange); firstCmdListWithStateChange.flags.propertyPsDirty = false; } } @@ -187,12 +187,13 @@ ze_result_t CommandQueueHw::executeCommandListsRegular( if (this->stateChanges.size() > this->currentStateChangeIndex) { auto &stateChange = this->stateChanges[this->currentStateChangeIndex]; - if (stateChange.commandList == commandList) { + if (stateChange.cmdListIndex == i) { + DEBUG_BREAK_IF(commandList != stateChange.commandList); this->updateOneCmdListPreemptionModeAndCtxStatePreemption(child, stateChange); - this->programOneCmdListPipelineSelect(commandList, child, stateChange); + this->programOneCmdListPipelineSelect(child, stateChange); this->programOneCmdListFrontEndIfDirty(ctx, child, stateChange); - this->programRequiredStateComputeModeForCommandList(commandList, child, stateChange); - this->programRequiredStateBaseAddressForCommandList(ctx, child, commandList->getCmdListHeapAddressModel(), commandList->getCmdContainer().isIndirectHeapInLocalMemory(), stateChange); + this->programRequiredStateComputeModeForCommandList(child, stateChange); + this->programRequiredStateBaseAddressForCommandList(ctx, child, stateChange); this->currentStateChangeIndex++; } @@ -756,7 +757,7 @@ size_t CommandQueueHw::estimateLinearStreamSizeComplementary( if (propertyScmDirty || propertyFeDirty || propertyPsDirty || propertySbaDirty || frontEndReturnPoint || propertyPreemptionDirty) { CommandListDirtyFlags dirtyFlags = {propertyScmDirty, propertyFeDirty, propertyPsDirty, propertySbaDirty, frontEndReturnPoint, propertyPreemptionDirty}; - this->stateChanges.emplace_back(stagingState, cmdList, dirtyFlags, ctx.statePreemption); + this->stateChanges.emplace_back(stagingState, cmdList, dirtyFlags, ctx.statePreemption, i); } } @@ -1278,7 +1279,7 @@ size_t CommandQueueHw::estimatePipelineSelectCmdSizeForMultipleCo } template -void CommandQueueHw::programOneCmdListPipelineSelect(CommandList *commandList, NEO::LinearStream &commandStream, +void CommandQueueHw::programOneCmdListPipelineSelect(NEO::LinearStream &commandStream, CommandListRequiredStateChange &cmdListRequired) { if (!this->pipelineSelectStateTracking) { return; @@ -1290,7 +1291,7 @@ void CommandQueueHw::programOneCmdListPipelineSelect(CommandList systolic, false, false, - commandList->getSystolicModeSupport()}; + cmdListRequired.commandList->getSystolicModeSupport()}; NEO::PreambleHelper::programPipelineSelect(&commandStream, args, device->getNEODevice()->getRootDeviceEnvironment()); csr->setPreambleSetFlag(true); @@ -1337,8 +1338,7 @@ size_t CommandQueueHw::estimateScmCmdSizeForMultipleCommandLists( } template -void CommandQueueHw::programRequiredStateComputeModeForCommandList(CommandList *commandList, - NEO::LinearStream &commandStream, +void CommandQueueHw::programRequiredStateComputeModeForCommandList(NEO::LinearStream &commandStream, CommandListRequiredStateChange &cmdListRequired) { if (!this->stateComputeModeTracking) { return; @@ -1349,11 +1349,11 @@ void CommandQueueHw::programRequiredStateComputeModeForCommandLis cmdListRequired.requiredState.pipelineSelect.systolicMode.value == 1, false, false, - commandList->getSystolicModeSupport()}; + cmdListRequired.commandList->getSystolicModeSupport()}; - bool isRcs = this->getCsr()->isRcs(); NEO::EncodeComputeMode::programComputeModeCommandWithSynchronization(commandStream, cmdListRequired.requiredState.stateComputeMode, pipelineSelectArgs, - false, device->getNEODevice()->getRootDeviceEnvironment(), isRcs, this->getCsr()->getDcFlushSupport(), nullptr); + false, device->getNEODevice()->getRootDeviceEnvironment(), this->csr->isRcs(), + this->csr->getDcFlushSupport(), nullptr); this->csr->setStateComputeModeDirty(false); } } @@ -1361,14 +1361,14 @@ void CommandQueueHw::programRequiredStateComputeModeForCommandLis template void CommandQueueHw::programRequiredStateBaseAddressForCommandList(CommandListExecutionContext &ctx, NEO::LinearStream &commandStream, - NEO::HeapAddressModel commandListHeapAddressModel, - bool indirectHeapInLocalMemory, CommandListRequiredStateChange &cmdListRequired) { if (!this->stateBaseAddressTracking) { return; } + if (cmdListRequired.flags.propertySbaDirty) { + bool indirectHeapInLocalMemory = cmdListRequired.commandList->getCmdContainer().isIndirectHeapInLocalMemory(); programStateBaseAddress(ctx.scratchGsba, indirectHeapInLocalMemory, commandStream, @@ -1439,7 +1439,7 @@ size_t CommandQueueHw::estimateStateBaseAddressCmdSizeForGlobalSt if (baseAddressStateDirty) { csrState.stateBaseAddress.copyPropertiesAll(cmdListRequired.stateBaseAddress); } else { - csrState.stateBaseAddress.copyPropertiesStatelessMocs(cmdListFinal.stateBaseAddress); + csrState.stateBaseAddress.copyPropertiesStatelessMocs(cmdListRequired.stateBaseAddress); } csrState.stateBaseAddress.setPropertiesSurfaceState(globalStatelessHeap->getHeapGpuBase(), globalStatelessHeap->getHeapSizeInPages()); diff --git a/level_zero/core/source/cmdqueue/cmdqueue_imp.h b/level_zero/core/source/cmdqueue/cmdqueue_imp.h index 26f1563140..a4483bacf1 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_imp.h +++ b/level_zero/core/source/cmdqueue/cmdqueue_imp.h @@ -112,14 +112,17 @@ struct CommandQueueImp : public CommandQueue { CommandListRequiredStateChange() = default; CommandListRequiredStateChange(NEO::StreamProperties &requiredState, CommandList *commandList, CommandListDirtyFlags flags, - NEO::PreemptionMode newMode) : requiredState(requiredState), - commandList(commandList), - flags(flags), - newMode(newMode) {} + NEO::PreemptionMode newMode, + uint32_t cmdListIndex) : requiredState(requiredState), + commandList(commandList), + flags(flags), + newMode(newMode), + cmdListIndex(cmdListIndex) {} NEO::StreamProperties requiredState{}; CommandList *commandList = nullptr; CommandListDirtyFlags flags; NEO::PreemptionMode newMode = NEO::PreemptionMode::Initial; + uint32_t cmdListIndex = 0; }; using CommandListStateChangeList = std::vector; diff --git a/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.cpp b/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.cpp index 2a24e053ba..395b8331a3 100644 --- a/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.cpp +++ b/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.cpp @@ -339,5 +339,23 @@ void PrimaryBatchBufferCmdListFixture::setUp() { ModuleMutableCommandListFixture::setUp(); } +void PrimaryBatchBufferPreamblelessCmdListFixture::setUp() { + DebugManager.flags.SelectCmdListHeapAddressModel.set(static_cast(NEO::HeapAddressModel::GlobalStateless)); + DebugManager.flags.ForceL1Caching.set(0); + + PrimaryBatchBufferCmdListFixture::setUp(); + + ze_result_t returnValue; + commandList2.reset(whiteboxCast(CommandList::create(productFamily, device, engineGroupType, 0u, returnValue))); + commandList3.reset(whiteboxCast(CommandList::create(productFamily, device, engineGroupType, 0u, returnValue))); +} + +void PrimaryBatchBufferPreamblelessCmdListFixture::tearDown() { + commandList2.reset(nullptr); + commandList3.reset(nullptr); + + PrimaryBatchBufferCmdListFixture::tearDown(); +} + } // namespace ult } // namespace L0 diff --git a/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.h b/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.h index 19a8e4278a..6ec423aac3 100644 --- a/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.h +++ b/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.h @@ -287,5 +287,13 @@ struct PrimaryBatchBufferCmdListFixture : public ModuleMutableCommandListFixture void setUp(); }; +struct PrimaryBatchBufferPreamblelessCmdListFixture : public PrimaryBatchBufferCmdListFixture { + void setUp(); + void tearDown(); + + std::unique_ptr commandList2; + std::unique_ptr commandList3; +}; + } // namespace ult } // namespace L0 diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp index 26a307640f..a20795a3da 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp @@ -9,6 +9,7 @@ #include "shared/source/helpers/blit_commands_helper.h" #include "shared/source/helpers/gfx_core_helper.h" #include "shared/test/common/cmd_parse/gen_cmd_parse.h" +#include "shared/test/common/libult/ult_command_stream_receiver.h" #include "shared/test/common/mocks/mock_graphics_allocation.h" #include "shared/test/common/test_macros/hw_test.h" @@ -1529,5 +1530,67 @@ HWTEST_F(PrimaryBatchBufferCmdListTest, givenPrimaryBatchBufferWhenCommandListHa EXPECT_EQ(expectedEndPtr, cmdContainer.getEndCmdPtr()); } +using PrimaryBatchBufferPreamblelessCmdListTest = Test; + +HWTEST2_F(PrimaryBatchBufferPreamblelessCmdListTest, + givenPrimaryBatchBufferWhenExecutingSingleCommandListTwiceInSingleCallAndFirstTimeNotExpectsPreambleThenProperlyDispatchPreambleForSecondInstance, + IsAtLeastXeHpCore) { + using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; + using STATE_BASE_ADDRESS = typename FamilyType::STATE_BASE_ADDRESS; + + // command list 1 will have two kernels, transition from cached MOCS to uncached MOCS state + ze_group_count_t groupCount{1, 1, 1}; + CmdListKernelLaunchParams launchParams = {}; + ze_result_t result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + kernel->kernelRequiresUncachedMocsCount++; + + result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + result = commandList->close(); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + // command list 2 will have two kernels, transition from uncached MOCS to cached MOCS state + result = commandList2->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + kernel->kernelRequiresUncachedMocsCount--; + + result = commandList2->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + result = commandList2->close(); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + // first command list settles global init and leaves state as uncached MOCS + auto commandListHandle = commandList->toHandle(); + result = commandQueue->executeCommandLists(1, &commandListHandle, nullptr, true); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + auto &cmdQueueStream = commandQueue->commandStream; + auto queueSizeUsed = cmdQueueStream.getUsed(); + + ze_command_list_handle_t sameCommandListTwice[] = {commandList2->toHandle(), commandList2->toHandle()}; + // second command list requires uncached MOCS state, so no dynamic preamble for the fist instance, but leaves cached MOCS state + // second instance require dynamic preamble to reload MOCS to uncached state + result = commandQueue->executeCommandLists(2, sameCommandListTwice, nullptr, true); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, + ptrOffset(cmdQueueStream.getCpuBase(), queueSizeUsed), + cmdQueueStream.getUsed() - queueSizeUsed)); + + auto cmdQueueSbaDirtyCmds = findAll(cmdList.begin(), cmdList.end()); + ASSERT_TRUE(cmdQueueSbaDirtyCmds.size() >= 1u); + auto sbaCmd = reinterpret_cast(*cmdQueueSbaDirtyCmds[0]); + + auto uncachedMocs = device->getMOCS(false, false) >> 1; + EXPECT_EQ((uncachedMocs << 1), sbaCmd->getStatelessDataPortAccessMemoryObjectControlState()); +} + } // namespace ult } // namespace L0