diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl index 8757c1691b..6812980db2 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl @@ -311,19 +311,14 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(z template void CommandListCoreFamily::appendMultiPartitionPrologue(uint32_t partitionDataSize) { - - const uint64_t workPartitionAllocationGpuVa = device->getNEODevice()->getDefaultEngine().commandStreamReceiver->getWorkPartitionAllocationGpuAddress(); - size_t estimatedSizeRequired = NEO::ImplicitScalingDispatch::getRegisterConfigurationSize(); + size_t estimatedSizeRequired = NEO::ImplicitScalingDispatch::getOffsetRegisterSize(); increaseCommandStreamSpace(estimatedSizeRequired); - - NEO::ImplicitScalingDispatch::dispatchRegisterConfiguration(*commandContainer.getCommandStream(), - workPartitionAllocationGpuVa, - partitionDataSize); + NEO::ImplicitScalingDispatch::dispatchOffsetRegister(*commandContainer.getCommandStream(), + partitionDataSize); } template void CommandListCoreFamily::appendMultiPartitionEpilogue() { - const size_t estimatedSizeRequired = NEO::ImplicitScalingDispatch::getOffsetRegisterSize(); increaseCommandStreamSpace(estimatedSizeRequired); NEO::ImplicitScalingDispatch::dispatchOffsetRegister(*commandContainer.getCommandStream(), diff --git a/level_zero/core/source/cmdqueue/cmdqueue.cpp b/level_zero/core/source/cmdqueue/cmdqueue.cpp index 6004eedb7d..b12a4ddfb7 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue.cpp +++ b/level_zero/core/source/cmdqueue/cmdqueue.cpp @@ -54,6 +54,9 @@ ze_result_t CommandQueueImp::initialize(bool copyOnly, bool isInternal) { isCopyOnlyCommandQueue = copyOnly; preemptionCmdSyncProgramming = getPreemptionCmdProgramming(); activeSubDevices = static_cast(csr->getOsContext().getDeviceBitfield().count()); + if (!isInternal) { + partitionCount = csr->getActivePartitions(); + } } return returnValue; } diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.h b/level_zero/core/source/cmdqueue/cmdqueue_hw.h index 47fd7b4f95..8233c826e2 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.h +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.h @@ -52,9 +52,6 @@ struct CommandQueueHw : public CommandQueueImp { bool getPreemptionCmdProgramming() override; void patchCommands(CommandList &commandList, uint64_t scratchAddress); - - size_t getPartitionProgrammingSize(); - void programPartitionConfiguration(NEO::LinearStream &stream); }; } // namespace L0 diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl index 01a38b3534..925432e478 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl @@ -149,7 +149,7 @@ ze_result_t CommandQueueHw::executeCommandLists( } bool directSubmissionEnabled = isCopyOnlyCommandQueue ? csr->isBlitterDirectSubmissionEnabled() : csr->isDirectSubmissionEnabled(); - partitionCount = csr->getActivePartitions(); + bool programActivePartitionConfig = csr->isProgramActivePartitionConfigRequired(); L0::Fence *fence = nullptr; @@ -215,6 +215,11 @@ ze_result_t CommandQueueHw::executeCommandLists( linearStreamSizeEstimate += sizeof(MI_BATCH_BUFFER_END); } + auto csrHw = reinterpret_cast *>(csr); + if (programActivePartitionConfig) { + linearStreamSizeEstimate += csrHw->getCmdSizeForActivePartitionConfig(); + } + auto &hwInfo = device->getHwInfo(); if (hFence) { fence = Fence::fromHandle(hFence); @@ -269,9 +274,6 @@ ze_result_t CommandQueueHw::executeCommandLists( } linearStreamSizeEstimate += isCopyOnlyCommandQueue ? NEO::EncodeMiFlushDW::getMiFlushDwCmdSizeForDataWrite() : NEO::MemorySynchronizationCommands::getSizeForPipeControlWithPostSyncOperation(hwInfo); - if (partitionCount > 1) { - linearStreamSizeEstimate += getPartitionProgrammingSize(); - } size_t alignedSize = alignUp(linearStreamSizeEstimate, minCmdBufferPtrAlign); size_t padding = alignedSize - linearStreamSizeEstimate; @@ -282,6 +284,7 @@ ze_result_t CommandQueueHw::executeCommandLists( if (globalFenceAllocation) { csr->makeResident(*globalFenceAllocation); } + const auto workPartitionAllocation = csr->getWorkPartitionAllocation(); if (workPartitionAllocation) { csr->makeResident(*workPartitionAllocation); @@ -352,6 +355,10 @@ ze_result_t CommandQueueHw::executeCommandLists( } } + if (programActivePartitionConfig) { + csrHw->programActivePartitionConfig(child); + } + for (auto i = 0u; i < numCommandLists; ++i) { auto commandList = CommandList::fromHandle(phCommandLists[i]); auto cmdBufferAllocations = commandList->commandContainer.getCmdBufferAllocations(); @@ -419,10 +426,6 @@ ze_result_t CommandQueueHw::executeCommandLists( commandQueuePreemptionMode = statePreemption; - if (partitionCount > 1) { - programPartitionConfiguration(child); - } - if (hFence) { csr->makeResident(fence->getAllocation()); if (isCopyOnlyCommandQueue) { diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw_base.inl b/level_zero/core/source/cmdqueue/cmdqueue_hw_base.inl index 9cd6c65f34..12570f9027 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw_base.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw_base.inl @@ -123,13 +123,4 @@ void CommandQueueHw::patchCommands(CommandList &commandList, uint UNRECOVERABLE_IF(!commandsToPatch.empty()); } -template -size_t CommandQueueHw::getPartitionProgrammingSize() { - return 0; -} - -template -void CommandQueueHw::programPartitionConfiguration(NEO::LinearStream &stream) { -} - } // namespace L0 diff --git a/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl b/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl index 7626484187..1350c6cbf5 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl @@ -153,20 +153,4 @@ void CommandQueueHw::patchCommands(CommandList &commandList, uint } } -template -size_t CommandQueueHw::getPartitionProgrammingSize() { - using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; - return NEO::ImplicitScalingDispatch::getRegisterConfigurationSize(); -} - -template -void CommandQueueHw::programPartitionConfiguration(NEO::LinearStream &stream) { - using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; - - uint64_t workPartitionAddress = csr->getWorkPartitionAllocationGpuAddress(); - NEO::ImplicitScalingDispatch::dispatchRegisterConfiguration(stream, - workPartitionAddress, - CommonConstants::partitionAddressOffset); -} - } // namespace L0 diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_multipartition_prologue.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_multipartition_prologue.cpp index f8ce556d2f..f4a90e4d34 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_multipartition_prologue.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_multipartition_prologue.cpp @@ -19,7 +19,6 @@ namespace ult { using MultiPartitionPrologueTest = Test; HWTEST2_F(MultiPartitionPrologueTest, whenAppendMultiPartitionPrologueIsCalledThenCommandListIsUpdated, IsAtLeastXeHpCore) { - using MI_LOAD_REGISTER_MEM = typename FamilyType::MI_LOAD_REGISTER_MEM; using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM; @@ -41,27 +40,22 @@ HWTEST2_F(MultiPartitionPrologueTest, whenAppendMultiPartitionPrologueIsCalledTh ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( cmdList, ptrOffset(commandContainer.getCommandStream()->getCpuBase(), 0), usedSpaceAfter)); - auto itorPc = find(cmdList.begin(), cmdList.end()); - ASSERT_NE(cmdList.end(), itorPc); + auto itorLrm = find(cmdList.begin(), cmdList.end()); + EXPECT_EQ(cmdList.end(), itorLrm); - auto lrmCmdPc = genCmdCast(*itorPc); - ASSERT_EQ(NEO::PartitionRegisters::wparidCCSOffset, lrmCmdPc->getRegisterAddress()); - ASSERT_EQ(lrmCmdPc->getMmioRemapEnable(), true); + auto itorLri = find(cmdList.begin(), cmdList.end()); + ASSERT_NE(cmdList.end(), itorLri); - itorPc = find(cmdList.begin(), cmdList.end()); - ASSERT_NE(cmdList.end(), itorPc); - - auto lriCmdPc = genCmdCast(*itorPc); - ASSERT_EQ(NEO::PartitionRegisters::addressOffsetCCSOffset, static_cast(lriCmdPc->getRegisterOffset())); - ASSERT_EQ(static_cast(lriCmdPc->getDataDword()), dataPartitionSize); - ASSERT_EQ(lriCmdPc->getMmioRemapEnable(), true); + auto lriCmd = genCmdCast(*itorLri); + EXPECT_EQ(NEO::PartitionRegisters::addressOffsetCCSOffset, static_cast(lriCmd->getRegisterOffset())); + EXPECT_EQ(dataPartitionSize, static_cast(lriCmd->getDataDword())); + EXPECT_EQ(true, lriCmd->getMmioRemapEnable()); auto result = commandList->close(); - ASSERT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); } HWTEST2_F(MultiPartitionPrologueTest, whenAppendMultiPartitionPrologueIsCalledThenCommandListIsNotUpdated, IsAtMostGen12lp) { - ze_result_t returnValue; std::unique_ptr commandList(CommandList::create(productFamily, device, NEO::EngineGroupType::Compute, 0u, returnValue)); auto &commandContainer = commandList->commandContainer; @@ -81,7 +75,6 @@ HWTEST2_F(MultiPartitionPrologueTest, whenAppendMultiPartitionPrologueIsCalledTh } using MultiPartitionEpilogueTest = Test; HWTEST2_F(MultiPartitionEpilogueTest, whenAppendMultiPartitionEpilogueIsCalledThenCommandListIsUpdated, IsAtLeastXeHpCore) { - using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM; ze_result_t returnValue; @@ -101,16 +94,16 @@ HWTEST2_F(MultiPartitionEpilogueTest, whenAppendMultiPartitionEpilogueIsCalledTh ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( cmdList, ptrOffset(commandContainer.getCommandStream()->getCpuBase(), 0), usedSpaceAfter)); - auto itorPc = find(cmdList.begin(), cmdList.end()); - ASSERT_NE(cmdList.end(), itorPc); + auto itorLri = find(cmdList.begin(), cmdList.end()); + ASSERT_NE(cmdList.end(), itorLri); - auto lriCmdPc = genCmdCast(*itorPc); - ASSERT_EQ(NEO::PartitionRegisters::addressOffsetCCSOffset, static_cast(lriCmdPc->getRegisterOffset())); - ASSERT_EQ(static_cast(lriCmdPc->getDataDword()), CommonConstants::partitionAddressOffset); - ASSERT_EQ(lriCmdPc->getMmioRemapEnable(), true); + auto lriCmd = genCmdCast(*itorLri); + EXPECT_EQ(NEO::PartitionRegisters::addressOffsetCCSOffset, static_cast(lriCmd->getRegisterOffset())); + EXPECT_EQ(CommonConstants::partitionAddressOffset, static_cast(lriCmd->getDataDword())); + EXPECT_EQ(true, lriCmd->getMmioRemapEnable()); auto result = commandList->close(); - ASSERT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); } HWTEST2_F(MultiPartitionEpilogueTest, whenAppendMultiPartitionPrologueIsCalledThenCommandListIsNotUpdated, IsAtMostGen12lp) { diff --git a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue.cpp b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue.cpp index c64afce62a..4313dc20f1 100644 --- a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue.cpp @@ -674,7 +674,7 @@ HWTEST_F(CommandQueueCommandsSingleTile, givenCommandQueueWhenExecutingCommandLi commandQueue->destroy(); } -HWTEST_F(CommandQueueCommandsMultiTile, givenCommandQueueOnMultiTileWhenExecutingCommandListsThenWorkPartitionAllocationIsMadeResident) { +HWTEST2_F(CommandQueueCommandsMultiTile, givenCommandQueueOnMultiTileWhenExecutingCommandListsThenWorkPartitionAllocationIsMadeResident, IsAtLeastXeHpCore) { DebugManagerStateRestore restorer; DebugManager.flags.EnableWalkerPartition.set(1); @@ -691,6 +691,7 @@ HWTEST_F(CommandQueueCommandsMultiTile, givenCommandQueueOnMultiTileWhenExecutin bool expectedGAWasMadeResident = false; }; MyCsrMock csr(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield()); + EXPECT_EQ(2u, csr.activePartitions); csr.initializeTagAllocation(); csr.createWorkPartitionAllocation(*neoDevice); csr.setupContext(*neoDevice->getDefaultEngine().osContext); @@ -712,13 +713,14 @@ HWTEST_F(CommandQueueCommandsMultiTile, givenCommandQueueOnMultiTileWhenExecutin auto status = commandQueue->executeCommandLists(1, &commandListHandle, nullptr, false); EXPECT_EQ(status, ZE_RESULT_SUCCESS); + EXPECT_EQ(2u, csr.activePartitionsConfig); ASSERT_NE(nullptr, workPartitionAllocation); EXPECT_TRUE(csr.expectedGAWasMadeResident); commandQueue->destroy(); } -HWTEST_F(CommandQueueCommandsMultiTile, givenCommandQueueOnMultiTileWhenWalkerPartitionIsDisabledThenWorkPartitionAllocationIsNotCreated) { +HWTEST2_F(CommandQueueCommandsMultiTile, givenCommandQueueOnMultiTileWhenWalkerPartitionIsDisabledThenWorkPartitionAllocationIsNotCreated, IsAtLeastXeHpCore) { DebugManagerStateRestore restorer; DebugManager.flags.EnableWalkerPartition.set(0); @@ -2109,26 +2111,5 @@ HWTEST2_F(DeviceWithDualStorage, givenCmdListWithAppendedKernelAndUsmTransferAnd commandQueue->destroy(); } -HWTEST2_F(CommandQueueSynchronizeTest, givenBasePlatformsWhenProgrammingPartitionRegistersThenExpectNoAction, CommandQueueSBASupport) { - ze_result_t returnValue; - ze_command_queue_desc_t desc = {}; - auto csr = neoDevice->getDefaultEngine().commandStreamReceiver; - - auto commandQueue = new MockCommandQueueHw(device, csr, &desc); - returnValue = commandQueue->initialize(false, false); - EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); - - constexpr size_t expectedSize = 0; - EXPECT_EQ(expectedSize, commandQueue->getPartitionProgrammingSize()); - - size_t usedBefore = commandQueue->commandStream->getUsed(); - commandQueue->programPartitionConfiguration(*commandQueue->commandStream); - size_t usedAfter = commandQueue->commandStream->getUsed(); - - EXPECT_EQ(expectedSize, usedAfter - usedBefore); - - commandQueue->destroy(); -} - } // namespace ult } // namespace L0 diff --git a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_enqueue_cmdlist.cpp b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_enqueue_cmdlist.cpp index 3846863e19..8ebe939bbb 100644 --- a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_enqueue_cmdlist.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_enqueue_cmdlist.cpp @@ -5,6 +5,7 @@ * */ +#include "shared/source/command_container/implicit_scaling.h" #include "shared/source/command_stream/command_stream_receiver_hw.h" #include "shared/source/command_stream/preemption.h" #include "shared/source/utilities/software_tags_manager.h" @@ -823,11 +824,41 @@ HWTEST_F(CommandQueueExecuteCommandListSWTagsTests, givenEnableSWTagsAndCommandL EXPECT_TRUE(tagFound); } +template +void findPartitionRegister(GenCmdList &cmdList, bool expectToFind) { + using MI_LOAD_REGISTER_MEM = typename GfxFamily::MI_LOAD_REGISTER_MEM; + using MI_LOAD_REGISTER_IMM = typename GfxFamily::MI_LOAD_REGISTER_IMM; + + auto loadRegisterMemList = findAll(cmdList.begin(), cmdList.end()); + bool wparidRegisterFound = false; + for (size_t i = 0; i < loadRegisterMemList.size(); i++) { + auto loadRegMem = reinterpret_cast(*loadRegisterMemList[i]); + if (NEO::PartitionRegisters::wparidCCSOffset == loadRegMem->getRegisterAddress()) { + wparidRegisterFound = true; + } + } + + auto loadRegisterImmList = findAll(cmdList.begin(), cmdList.end()); + bool offsetRegisterFound = false; + for (size_t i = 0; i < loadRegisterImmList.size(); i++) { + auto loadRegImm = reinterpret_cast(*loadRegisterImmList[i]); + if (NEO::PartitionRegisters::addressOffsetCCSOffset == loadRegImm->getRegisterOffset()) { + offsetRegisterFound = true; + } + } + + if (expectToFind) { + EXPECT_TRUE(wparidRegisterFound); + EXPECT_TRUE(offsetRegisterFound); + } else { + EXPECT_FALSE(wparidRegisterFound); + EXPECT_FALSE(offsetRegisterFound); + } +} + HWTEST2_F(MultiDeviceCommandQueueExecuteCommandLists, givenMultiplePartitionCountWhenExecutingCmdListThenExpectMmioProgrammingAndCorrectEstimation, IsAtLeastXeHpCore) { using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; using POST_SYNC_OPERATION = typename FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION; - using MI_LOAD_REGISTER_MEM = typename FamilyType::MI_LOAD_REGISTER_MEM; - using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM; using PARSE = typename FamilyType::PARSE; ze_command_queue_desc_t desc{}; @@ -846,6 +877,8 @@ HWTEST2_F(MultiDeviceCommandQueueExecuteCommandLists, givenMultiplePartitionCoun false, returnValue)); EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + EXPECT_EQ(2u, commandQueue->partitionCount); + ASSERT_NE(nullptr, commandQueue->commandStream); auto &commandStreamReceiver = device->getNEODevice()->getDefaultEngine().commandStreamReceiver; if (device->getNEODevice()->getPreemptionMode() == PreemptionMode::MidThread || device->getNEODevice()->isDebuggerActive()) { @@ -858,28 +891,34 @@ HWTEST2_F(MultiDeviceCommandQueueExecuteCommandLists, givenMultiplePartitionCoun EXPECT_EQ(1u, fence->partitionCount); ze_fence_handle_t fenceHandle = fence->toHandle(); - ASSERT_NE(nullptr, commandQueue->commandStream); - - fence->partitionCount = 2; //1st execute call initialized pipeline + auto usedSpaceBefore = commandQueue->commandStream->getUsed(); auto result = commandQueue->executeCommandLists(numCommandLists, commandLists, fenceHandle, true); EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + auto usedSpaceAfter = commandQueue->commandStream->getUsed(); - auto usedSpaceBefore = commandQueue->commandStream->getUsed(); + //1st call then initialize registers + GenCmdList cmdList; + ASSERT_TRUE(PARSE::parseCommandBuffer(cmdList, ptrOffset(commandQueue->commandStream->getCpuBase(), usedSpaceBefore), usedSpaceAfter)); + findPartitionRegister(cmdList, true); + + usedSpaceBefore = commandQueue->commandStream->getUsed(); result = commandQueue->executeCommandLists(numCommandLists, commandLists, fenceHandle, true); ASSERT_EQ(ZE_RESULT_SUCCESS, result); - auto usedSpaceAfter = commandQueue->commandStream->getUsed(); + usedSpaceAfter = commandQueue->commandStream->getUsed(); ASSERT_GT(usedSpaceAfter, usedSpaceBefore); size_t cmdBufferSizeWithoutMmioProgramming = usedSpaceAfter - usedSpaceBefore; - EXPECT_EQ(1u, fence->partitionCount); - - auto workPartitionAddress = device->getNEODevice()->getDefaultEngine().commandStreamReceiver->getWorkPartitionAllocationGpuAddress(); + EXPECT_EQ(2u, fence->partitionCount); for (auto i = 0u; i < numCommandLists; i++) { auto commandList = CommandList::fromHandle(commandLists[i]); commandList->partitionCount = 2; } + cmdList.clear(); + ASSERT_TRUE(PARSE::parseCommandBuffer(cmdList, ptrOffset(commandQueue->commandStream->getCpuBase(), usedSpaceBefore), usedSpaceAfter)); + findPartitionRegister(cmdList, false); + usedSpaceBefore = commandQueue->commandStream->getUsed(); result = commandQueue->executeCommandLists(numCommandLists, commandLists, fenceHandle, true); ASSERT_EQ(ZE_RESULT_SUCCESS, result); @@ -888,24 +927,12 @@ HWTEST2_F(MultiDeviceCommandQueueExecuteCommandLists, givenMultiplePartitionCoun size_t cmdBufferSizeWithtMmioProgramming = usedSpaceAfter - usedSpaceBefore; EXPECT_EQ(2u, fence->partitionCount); - size_t expectedSizeWithMmioProgramming = cmdBufferSizeWithoutMmioProgramming + sizeof(MI_LOAD_REGISTER_IMM) + sizeof(MI_LOAD_REGISTER_MEM); + size_t expectedSizeWithMmioProgramming = cmdBufferSizeWithoutMmioProgramming; EXPECT_GE(expectedSizeWithMmioProgramming, cmdBufferSizeWithtMmioProgramming); - GenCmdList cmdList; + cmdList.clear(); ASSERT_TRUE(PARSE::parseCommandBuffer(cmdList, ptrOffset(commandQueue->commandStream->getCpuBase(), usedSpaceBefore), usedSpaceAfter)); - - auto itorLri = find(cmdList.begin(), cmdList.end()); - ASSERT_NE(cmdList.end(), itorLri); - auto itorLrm = find(cmdList.begin(), cmdList.end()); - ASSERT_NE(cmdList.end(), itorLrm); - - auto loadRegisterImm = static_cast(*itorLri); - EXPECT_EQ(0x23B4u, loadRegisterImm->getRegisterOffset()); - EXPECT_EQ(8u, loadRegisterImm->getDataDword()); - - auto loadRegisterMem = static_cast(*itorLrm); - EXPECT_EQ(0x221Cu, loadRegisterMem->getRegisterAddress()); - EXPECT_EQ(workPartitionAddress, loadRegisterMem->getMemoryAddress()); + findPartitionRegister(cmdList, false); auto pipeControlList = findAll(cmdList.begin(), cmdList.end()); diff --git a/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp b/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp index ca6440e8fe..6eba37194e 100644 --- a/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp +++ b/opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp @@ -1559,7 +1559,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTestStaticPartition, if (device->getPreemptionMode() == PreemptionMode::MidThread || device->isDebuggerActive()) { commandStreamReceiver.createPreemptionAllocation(); } - EXPECT_EQ(1u, commandStreamReceiver.activePartitions); + EXPECT_EQ(2u, commandStreamReceiver.activePartitions); cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr); EXPECT_EQ(2u, commandStreamReceiver.activePartitions); @@ -1585,7 +1585,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTestStaticPartition, if (device->getPreemptionMode() == PreemptionMode::MidThread || device->isDebuggerActive()) { commandStreamReceiver.createPreemptionAllocation(); } - EXPECT_EQ(1u, commandStreamReceiver.activePartitions); + EXPECT_EQ(2u, commandStreamReceiver.activePartitions); kernel->mockProgram->allowNonUniform = true; cmdQ->enqueueKernel(kernel->mockKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr); EXPECT_EQ(2u, commandStreamReceiver.activePartitions); diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_tests_xehp_and_later.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_tests_xehp_and_later.cpp index 834e344d17..63b5c569ce 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_tests_xehp_and_later.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_tests_xehp_and_later.cpp @@ -935,48 +935,6 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPAndLaterMultiTile verifyPipeControl(commandStreamReceiver, 4, true); } -HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPAndLaterMultiTileTests, - givenMultipleDynamicActivePartitionsWhenFlushingTaskThenExpectTagUpdatePipeControlWithoutPartitionFlagOnAndNoActivePartitionConfig) { - auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); - if (pDevice->getPreemptionMode() == PreemptionMode::MidThread || pDevice->isDebuggerActive()) { - commandStreamReceiver.createPreemptionAllocation(); - } - commandStreamReceiver.activePartitions = 2; - commandStreamReceiver.taskCount = 3; - commandStreamReceiver.staticWorkPartitioningEnabled = false; - flushTask(commandStreamReceiver, true); - EXPECT_EQ(2u, commandStreamReceiver.activePartitionsConfig); - - prepareLinearStream(commandStream, 0); - verifyPipeControl(commandStreamReceiver, 4, false); - - prepareLinearStream(commandStreamReceiver.commandStream, 0); - verifyActivePartitionConfig(commandStreamReceiver, false); -} - -HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPAndLaterMultiTileTests, - givenMultipleDynamicActivePartitionsWhenFlushingTagUpdateThenExpectTagUpdatePipeControlWithoutPartitionFlagOnAndNoActivePartitionConfig) { - DebugManagerStateRestore restorer; - DebugManager.flags.UpdateTaskCountFromWait.set(1); - - auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); - if (pDevice->getPreemptionMode() == PreemptionMode::MidThread || pDevice->isDebuggerActive()) { - commandStreamReceiver.createPreemptionAllocation(); - } - commandStreamReceiver.activePartitions = 2; - commandStreamReceiver.taskCount = 3; - commandStreamReceiver.staticWorkPartitioningEnabled = false; - flushTask(commandStreamReceiver, true); - commandStreamReceiver.flushTagUpdate(); - EXPECT_EQ(2u, commandStreamReceiver.activePartitionsConfig); - - prepareLinearStream(commandStream, 0); - verifyPipeControl(commandStreamReceiver, 4, false); - - prepareLinearStream(commandStreamReceiver.commandStream, 0); - verifyActivePartitionConfig(commandStreamReceiver, false); -} - HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPAndLaterMultiTileTests, givenSingleStaticActivePartitionWhenFlushingTaskThenExpectTagUpdatePipeControlWithoutPartitionFlagOnAndNoActivePartitionConfig) { auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); @@ -1028,3 +986,65 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPAndLaterMultiTile prepareLinearStream(commandStreamReceiver.commandStream, usedBeforeCsrCmdStream); verifyActivePartitionConfig(commandStreamReceiver, false); } + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPAndLaterMultiTileTests, + givenMultipleDynamicActivePartitionsWhenFlushingTaskTwiceThenExpectTagUpdatePipeControlWithoutPartitionFlagAndPartitionRegisters) { + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + if (pDevice->getPreemptionMode() == PreemptionMode::MidThread || pDevice->isDebuggerActive()) { + commandStreamReceiver.createPreemptionAllocation(); + } + + commandStreamReceiver.activePartitions = 2; + commandStreamReceiver.taskCount = 3; + commandStreamReceiver.staticWorkPartitioningEnabled = false; + flushTask(commandStreamReceiver, true); + EXPECT_EQ(2u, commandStreamReceiver.activePartitionsConfig); + + prepareLinearStream(commandStream, 0); + verifyPipeControl(commandStreamReceiver, 4, false); + + prepareLinearStream(commandStreamReceiver.commandStream, 0); + verifyActivePartitionConfig(commandStreamReceiver, false); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPAndLaterMultiTileTests, + givenMultipleDynamicActivePartitionsWhenFlushingTagUpdateThenExpectTagUpdatePipeControlWithoutPartitionFlag) { + DebugManagerStateRestore restorer; + DebugManager.flags.UpdateTaskCountFromWait.set(1); + + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + if (pDevice->getPreemptionMode() == PreemptionMode::MidThread || pDevice->isDebuggerActive()) { + commandStreamReceiver.createPreemptionAllocation(); + } + commandStreamReceiver.activePartitions = 2; + commandStreamReceiver.taskCount = 3; + commandStreamReceiver.staticWorkPartitioningEnabled = false; + commandStreamReceiver.flushTagUpdate(); + + prepareLinearStream(commandStreamReceiver.commandStream, 0); + verifyPipeControl(commandStreamReceiver, 4, false); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPAndLaterMultiTileTests, + givenMultipleStaticActivePartitionsAndDirectSubmissionActiveWhenFlushingTaskThenExpectTagUpdatePipeControlWithPartitionFlagOnAndNoActivePartitionConfig) { + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + if (pDevice->getPreemptionMode() == PreemptionMode::MidThread || pDevice->isDebuggerActive()) { + commandStreamReceiver.createPreemptionAllocation(); + } + + commandStreamReceiver.directSubmissionAvailable = true; + + EXPECT_EQ(1u, commandStreamReceiver.activePartitionsConfig); + EXPECT_EQ(2u, commandStreamReceiver.activePartitions); + EXPECT_TRUE(commandStreamReceiver.staticWorkPartitioningEnabled); + + commandStreamReceiver.taskCount = 3; + flushTask(commandStreamReceiver, true); + EXPECT_EQ(1u, commandStreamReceiver.activePartitionsConfig); + + prepareLinearStream(commandStream, 0); + verifyPipeControl(commandStreamReceiver, 4, true); + + prepareLinearStream(commandStreamReceiver.commandStream, 0); + verifyActivePartitionConfig(commandStreamReceiver, false); +} diff --git a/opencl/test/unit_test/helpers/timestamp_packet_2_tests.cpp b/opencl/test/unit_test/helpers/timestamp_packet_2_tests.cpp index 7e0ca69237..86ca0e1ef6 100644 --- a/opencl/test/unit_test/helpers/timestamp_packet_2_tests.cpp +++ b/opencl/test/unit_test/helpers/timestamp_packet_2_tests.cpp @@ -237,11 +237,13 @@ HWTEST_F(TimestampPacketTests, givenPipeControlRequestWhenFlushingThenProgramPip EXPECT_FALSE(csr.stallingCommandsOnNextFlushRequired); HardwareParse hwParser; + hwParser.parsePipeControl = true; hwParser.parseCommands(csr.commandStream, 0); + hwParser.findHardwareCommands(); auto secondEnqueueOffset = csr.commandStream.getUsed(); - auto pipeControl = genCmdCast(*hwParser.cmdList.begin()); - EXPECT_NE(nullptr, pipeControl); + auto pipeControl = genCmdCast(*hwParser.pipeControlList.begin()); + ASSERT_NE(nullptr, pipeControl); EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_NO_WRITE, pipeControl->getPostSyncOperation()); EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable()); diff --git a/shared/source/command_stream/command_stream_receiver.cpp b/shared/source/command_stream/command_stream_receiver.cpp index 1b3baf7cc6..481c7e0e24 100644 --- a/shared/source/command_stream/command_stream_receiver.cpp +++ b/shared/source/command_stream/command_stream_receiver.cpp @@ -55,7 +55,11 @@ CommandStreamReceiver::CommandStreamReceiver(ExecutionEnvironment &executionEnvi } internalAllocationStorage = std::make_unique(*this); - if (deviceBitfield.count() > 1 && DebugManager.flags.EnableStaticPartitioning.get() != 0 && NEO::ImplicitScalingHelper::isImplicitScalingEnabled(deviceBitfield, true)) { + uint32_t subDeviceCount = static_cast(deviceBitfield.count()); + if (NEO::ImplicitScalingHelper::isImplicitScalingEnabled(deviceBitfield, true) && + subDeviceCount > 1 && + DebugManager.flags.EnableStaticPartitioning.get() != 0) { + this->activePartitions = subDeviceCount; this->staticWorkPartitioningEnabled = true; } } diff --git a/shared/source/command_stream/command_stream_receiver.h b/shared/source/command_stream/command_stream_receiver.h index 200c41f38e..02982e5a47 100644 --- a/shared/source/command_stream/command_stream_receiver.h +++ b/shared/source/command_stream/command_stream_receiver.h @@ -290,6 +290,10 @@ class CommandStreamReceiver { bool skipResourceCleanup() const; + inline bool isProgramActivePartitionConfigRequired() const { + return this->isDirectSubmissionEnabled() ? false : this->activePartitionsConfig != this->activePartitions; + } + std::unique_ptr pageTableManager; protected: diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index 2c7e239ac9..701fff5bbd 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -5,6 +5,7 @@ * */ +#include "shared/source/command_container/command_encoder.h" #include "shared/source/command_stream/command_stream_receiver_hw.h" #include "shared/source/command_stream/experimental_command_buffer.h" #include "shared/source/command_stream/linear_stream.h" @@ -267,8 +268,7 @@ CompletionStamp CommandStreamReceiverHw::flushTask( csrSizeRequestFlags.numGrfRequiredChanged = this->lastSentNumGrfRequired != dispatchFlags.numGrfRequired; lastSentNumGrfRequired = dispatchFlags.numGrfRequired; - csrSizeRequestFlags.activePartitionsChanged = this->activePartitionsConfig != this->activePartitions; - this->activePartitionsConfig = this->activePartitions; + csrSizeRequestFlags.activePartitionsChanged = isProgramActivePartitionConfigRequired(); if (dispatchFlags.threadArbitrationPolicy != ThreadArbitrationPolicy::NotPresent) { this->requiredThreadArbitrationPolicy = dispatchFlags.threadArbitrationPolicy; @@ -807,7 +807,9 @@ size_t CommandStreamReceiverHw::getRequiredCmdStreamSize(const Dispat } size += getCmdSizeForEpilogue(dispatchFlags); size += getCmdsSizeForHardwareContext(); - size += getCmdSizeForActivePartitionConfig(); + if (csrSizeRequestFlags.activePartitionsChanged) { + size += getCmdSizeForActivePartitionConfig(); + } if (executionEnvironment.rootDeviceEnvironments[rootDeviceIndex]->getHardwareInfo()->workaroundTable.waSamplerCacheFlushBetweenRedescribedSurfaceReads) { if (this->samplerCacheFlushRequired != SamplerCacheFlushState::samplerCacheFlushNotRequired) { diff --git a/shared/source/command_stream/command_stream_receiver_hw_xehp_and_later.inl b/shared/source/command_stream/command_stream_receiver_hw_xehp_and_later.inl index 9976d9c9e8..f0eb060106 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_xehp_and_later.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_xehp_and_later.inl @@ -157,6 +157,7 @@ inline void CommandStreamReceiverHw::programActivePartitionConfig(Lin uint64_t workPartitionAddress = getWorkPartitionAllocationGpuAddress(); ImplicitScalingDispatch::dispatchRegisterConfiguration(csr, workPartitionAddress, CommonConstants::partitionAddressOffset); } + this->activePartitionsConfig = this->activePartitions; } template diff --git a/shared/test/common/mocks/mock_command_stream_receiver.h b/shared/test/common/mocks/mock_command_stream_receiver.h index ac6bb97878..41b621f362 100644 --- a/shared/test/common/mocks/mock_command_stream_receiver.h +++ b/shared/test/common/mocks/mock_command_stream_receiver.h @@ -159,6 +159,7 @@ class MockCsrHw2 : public CommandStreamReceiverHw { using CommandStreamReceiverHw::programL3; using CommandStreamReceiverHw::programVFEState; using CommandStreamReceiver::activePartitions; + using CommandStreamReceiver::activePartitionsConfig; using CommandStreamReceiver::clearColorAllocation; using CommandStreamReceiver::commandStream; using CommandStreamReceiver::dispatchMode;