diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl index 60fe1fc9eb..6f4ea26602 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl @@ -438,7 +438,9 @@ inline ze_result_t CommandListCoreFamilyImmediate::executeCommand size_t commandStreamStart = this->cmdListCurrentStartOffset; if (appendOperation == NEO::AppendOperations::cmdList && this->dispatchCmdListBatchBufferAsPrimary) { auto cmdListStartCmdBufferStream = reinterpret_cast(cmdQ)->getStartingCmdBuffer(); - // check if queue starting stream is the same as immediate, if not - regular cmdlist is the starting command buffer + // check if queue starting stream is the same as immediate, + // if they are the same - immediate command list buffer has preamble in it including jump from immediate to regular cmdlist - proceed normal + // if not - regular cmdlist is the starting command buffer - no queue preamble or waiting commands if (cmdListStartCmdBufferStream != commandStream) { commandStream = cmdListStartCmdBufferStream; commandStreamStart = 0u; @@ -1720,7 +1722,16 @@ ze_result_t CommandListCoreFamilyImmediate::appendCommandLists(ui return ret; } - auto mainAppendLock = static_cast(this->cmdQImmediate)->getCsr()->obtainUniqueOwnership(); + auto queueImp = static_cast(this->cmdQImmediate); + + auto mainAppendLock = queueImp->getCsr()->obtainUniqueOwnership(); + + if (this->dispatchCmdListBatchBufferAsPrimary) { + // check if wait event preamble or implicit synchronization is present and force bb start jump in queue, even when no preamble is required there + if (this->commandContainer.getCommandStream()->getUsed() != this->cmdListCurrentStartOffset) { + queueImp->triggerBbStartJump(); + } + } ret = this->cmdQImmediate->executeCommandLists(numCommandLists, phCommandLists, nullptr, true, this->commandContainer.getCommandStream()); if (ret != ZE_RESULT_SUCCESS) { return ret; diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl index 58e759fd26..5458a9744c 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl @@ -253,7 +253,7 @@ size_t CommandQueueHw::estimateStreamSizeForExecuteCommandListsRe linearStreamSizeEstimate += NEO::EncodeBatchBufferStartOrEnd::getBatchBufferEndSize(); } - linearStreamSizeEstimate += this->estimateCommandListPrimaryStart(ctx.globalInit); + linearStreamSizeEstimate += this->estimateCommandListPrimaryStart(ctx.globalInit || this->forceBbStartJump); for (uint32_t i = 0; i < numCommandLists; i++) { auto cmdList = CommandList::fromHandle(commandListHandles[i]); linearStreamSizeEstimate += estimateCommandListSecondaryStart(cmdList); @@ -486,7 +486,7 @@ ze_result_t CommandQueueHw::executeCommandListsCopyOnly( ctx.spaceForResidency += estimateCommandListResidencySize(commandList); } - linearStreamSizeEstimate += this->estimateCommandListPrimaryStart(ctx.globalInit); + linearStreamSizeEstimate += this->estimateCommandListPrimaryStart(ctx.globalInit || this->forceBbStartJump); if (fenceRequired) { linearStreamSizeEstimate += NEO::MemorySynchronizationCommands::getSizeForSingleAdditionalSynchronization(device->getNEODevice()->getRootDeviceEnvironment()); } @@ -1020,7 +1020,7 @@ size_t CommandQueueHw::estimateLinearStreamSizeComplementary( } bool firstCmdlistDynamicPreamble = (this->stateChanges.size() > 0 && this->stateChanges[0].cmdListIndex == 0); - bool estimateBbStartForGlobalInitOnly = !firstCmdlistDynamicPreamble && ctx.globalInit; + bool estimateBbStartForGlobalInitOnly = !firstCmdlistDynamicPreamble && (ctx.globalInit || this->forceBbStartJump); linearStreamSizeEstimate += this->estimateCommandListPrimaryStart(estimateBbStartForGlobalInitOnly); return linearStreamSizeEstimate; @@ -1217,7 +1217,7 @@ void CommandQueueHw::programOneCmdListBatchBufferStartPrimaryBatc auto bbStartPatchLocation = reinterpret_cast(ctx.currentPatchForChainedBbStart); bool dynamicPreamble = ctx.childGpuAddressPositionBeforeDynamicPreamble != commandStream.getCurrentGpuAddressPosition(); - if (ctx.globalInit || dynamicPreamble) { + if (ctx.globalInit || dynamicPreamble || this->forceBbStartJump) { if (ctx.currentPatchForChainedBbStart) { // dynamic preamble, 2nd or later command list // jump from previous command list to the position before dynamic preamble @@ -1230,6 +1230,7 @@ void CommandQueueHw::programOneCmdListBatchBufferStartPrimaryBatc NEO::EncodeBatchBufferStartOrEnd::programBatchBufferStart(&commandStream, cmdListFirstCmdBuffer->getGpuAddress(), false, false, false); ctx.globalInit = false; + this->forceBbStartJump = false; } else { if (ctx.currentPatchForChainedBbStart == nullptr) { // nothing to dispatch from queue, first command list will be used as submitting batch buffer to KMD or ULLS diff --git a/level_zero/core/source/cmdqueue/cmdqueue_imp.h b/level_zero/core/source/cmdqueue/cmdqueue_imp.h index 0e6717fad0..4f29ef8474 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_imp.h +++ b/level_zero/core/source/cmdqueue/cmdqueue_imp.h @@ -114,6 +114,9 @@ struct CommandQueueImp : public CommandQueue { NEO::LinearStream *getStartingCmdBuffer() const { return startingCmdBuffer; } + void triggerBbStartJump() { + forceBbStartJump = true; + } protected: MOCKABLE_VIRTUAL NEO::SubmissionStatus submitBatchBuffer(size_t offset, NEO::ResidencyContainer &residencyContainer, void *endingCmdPtr, @@ -171,6 +174,7 @@ struct CommandQueueImp : public CommandQueue { std::atomic cmdListWithAssertExecuted = false; bool useKmdWaitFunction = false; + bool forceBbStartJump = false; }; } // namespace L0 diff --git a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h index fe9f3dd4ca..24b8e99fb4 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h +++ b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h @@ -92,6 +92,7 @@ struct WhiteBox<::L0::CommandListCoreFamily> using BaseClass::inOrderAtomicSignalingEnabled; using BaseClass::inOrderExecInfo; using BaseClass::inOrderPatchCmds; + using BaseClass::internalUsage; using BaseClass::interruptEvents; using BaseClass::isFlushTaskSubmissionEnabled; using BaseClass::isInOrderNonWalkerSignalingRequired; @@ -217,6 +218,7 @@ struct WhiteBox> using BaseClass::inOrderAtomicSignalingEnabled; using BaseClass::inOrderExecInfo; using BaseClass::inOrderPatchCmds; + using BaseClass::internalUsage; using BaseClass::interruptEvents; using BaseClass::isBcsSplitNeeded; using BaseClass::isFlushTaskSubmissionEnabled; @@ -264,6 +266,7 @@ struct MockCommandListImmediate : public CommandListCoreFamilyImmediate : public ::L0::CommandQueueImp { using BaseClass::desc; using BaseClass::device; using BaseClass::firstCmdListStream; + using BaseClass::forceBbStartJump; using BaseClass::preemptionCmdSyncProgramming; using BaseClass::printfKernelContainer; using BaseClass::startingCmdBuffer; @@ -77,6 +78,7 @@ struct MockCommandQueueHw : public L0::CommandQueueHw { using BaseClass::commandStream; using BaseClass::estimateStreamSizeForExecuteCommandListsRegularHeapless; using BaseClass::executeCommandListsRegularHeapless; + using BaseClass::forceBbStartJump; using BaseClass::prepareAndSubmitBatchBuffer; using BaseClass::printfKernelContainer; using BaseClass::startingCmdBuffer; diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_4.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_4.cpp index 7ff4332c28..d04028b728 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_4.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_4.cpp @@ -1584,8 +1584,8 @@ HWTEST2_F(ImmediateCommandListTest, givenImmediateCmdListWhenAppendingRegularThe auto startStream = static_cast(commandListImmediate->cmdQImmediate)->getStartingCmdBuffer(); if (commandListImmediate->getCmdListBatchBufferFlag()) { - auto expectedStream = commandList->getCmdContainer().getCommandStream(); - EXPECT_EQ(expectedStream, startStream); + auto expectedStreamAllocation = commandList->getCmdContainer().getCommandStream()->getGraphicsAllocation(); + EXPECT_EQ(expectedStreamAllocation, startStream->getGraphicsAllocation()); } else { auto expectedStream = commandListImmediate->getCmdContainer().getCommandStream(); EXPECT_EQ(expectedStream, startStream); @@ -1696,5 +1696,66 @@ HWTEST2_F(ImmediateCommandListTest, EXPECT_NE(cmdList.end(), iterator); } +HWTEST2_F(ImmediateCommandListTest, + givenImmediateCmdListWithPrimaryBatchBufferWhenAppendingRegularCmdListWithWaitEventThenDispatchSemaphoreAndJumpFromImmediateToRegular, MatchAny) { + using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + + ze_event_pool_desc_t eventPoolDesc = {ZE_STRUCTURE_TYPE_EVENT_POOL_DESC}; + eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE; + eventPoolDesc.count = 2; + + ze_event_desc_t eventDesc = {ZE_STRUCTURE_TYPE_EVENT_DESC}; + eventDesc.index = 0; + eventDesc.wait = 0; + eventDesc.signal = 0; + + ze_result_t returnValue; + std::unique_ptr eventPool = std::unique_ptr(static_cast(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue))); + std::unique_ptr event = std::unique_ptr(getHelper().createEvent(eventPool.get(), &eventDesc, device)); + auto eventHandle = event->toHandle(); + + commandList->close(); + auto cmdListHandle = commandList->toHandle(); + + auto regularCmdBufferStream = commandList->getCmdContainer().getCommandStream(); + auto regularCmdBufferAllocation = regularCmdBufferStream->getGraphicsAllocation(); + + auto cmdQImmediate = static_cast *>(commandListImmediate->cmdQImmediate); + + commandListImmediate->dispatchCmdListBatchBufferAsPrimary = true; + cmdQImmediate->dispatchCmdListBatchBufferAsPrimary = true; + + // first append can carry preamble + returnValue = commandListImmediate->appendCommandLists(1, &cmdListHandle, nullptr, 0, nullptr); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + + auto immediateCmdBufferStream = commandListImmediate->getCmdContainer().getCommandStream(); + auto offsetBefore = immediateCmdBufferStream->getUsed(); + + // no preamble but wait event as first, then bb_start jumping to regular cmdlist + returnValue = commandListImmediate->appendCommandLists(1, &cmdListHandle, nullptr, 1, &eventHandle); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + + auto offsetAfter = immediateCmdBufferStream->getUsed(); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer( + cmdList, + ptrOffset(immediateCmdBufferStream->getCpuBase(), offsetBefore), + offsetAfter - offsetBefore)); + + auto iteratorWait = find(cmdList.begin(), cmdList.end()); + ASSERT_NE(cmdList.end(), iteratorWait); + + auto iteratorBbStart = find(iteratorWait, cmdList.end()); + ASSERT_NE(cmdList.end(), iteratorBbStart); + + auto bbStart = genCmdCast(*iteratorBbStart); + + EXPECT_EQ(regularCmdBufferAllocation->getGpuAddress(), bbStart->getBatchBufferStartAddress()); + EXPECT_EQ(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER::SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH, bbStart->getSecondLevelBatchBuffer()); +} + } // namespace ult } // namespace L0 diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp index c0d9789e7e..f847e55446 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp @@ -1201,52 +1201,5 @@ HWTEST2_F(MultiTileImmediateCommandListAppendLaunchKernelXeHpCoreTest, givenImpl EXPECT_EQ(cmdList.end(), itorSemaphoreWait); } -HWTEST2_F(MultiTileImmediateCommandListAppendLaunchKernelXeHpCoreTest, givenImplicitScalingWhenUsingImmediateCommandListWithoutFlushTaskThenUseSecondaryBuffer, IsAtLeastXeHpCore) { - using WalkerVariant = typename FamilyType::WalkerVariant; - - using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; - - debugManager.flags.UsePipeControlAfterPartitionedWalker.set(1); - - ze_group_count_t groupCount{128, 1, 1}; - - ze_command_queue_desc_t queueDesc = {}; - auto queue = std::make_unique>(device, device->getNEODevice()->getDefaultEngine().commandStreamReceiver, &queueDesc); - - auto immediateCmdList = std::make_unique>>(); - immediateCmdList->cmdListType = ::L0::CommandList::CommandListType::typeImmediate; - immediateCmdList->isFlushTaskSubmissionEnabled = false; - immediateCmdList->cmdQImmediate = queue.get(); - auto result = immediateCmdList->initialize(device, NEO::EngineGroupType::compute, 0u); - ASSERT_EQ(ZE_RESULT_SUCCESS, result); - - auto cmdStream = immediateCmdList->getCmdContainer().getCommandStream(); - - auto sizeBefore = cmdStream->getUsed(); - CmdListKernelLaunchParams launchParams = {}; - result = immediateCmdList->appendLaunchKernelWithParams(kernel.get(), groupCount, nullptr, launchParams); - ASSERT_EQ(ZE_RESULT_SUCCESS, result); - auto sizeAfter = cmdStream->getUsed(); - - GenCmdList cmdList; - ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer( - cmdList, - ptrOffset(cmdStream->getCpuBase(), sizeBefore), - sizeAfter - sizeBefore)); - - auto itorWalker = NEO::UnitTestHelper::findWalkerTypeCmd(cmdList.begin(), cmdList.end()); - ASSERT_NE(cmdList.end(), itorWalker); - - WalkerVariant walkerCmd = NEO::UnitTestHelper::getWalkerVariant(*itorWalker); - std::visit([](auto &&walker) { - EXPECT_TRUE(walker->getWorkloadPartitionEnable()); - }, - walkerCmd); - - auto itorBbStart = find(cmdList.begin(), cmdList.end()); - ASSERT_NE(cmdList.end(), itorBbStart); - auto cmdBbStart = genCmdCast(*itorBbStart); - EXPECT_EQ(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER::SECOND_LEVEL_BATCH_BUFFER_SECOND_LEVEL_BATCH, cmdBbStart->getSecondLevelBatchBuffer()); -} } // namespace ult } // namespace L0 diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_wait_on_events.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_wait_on_events.cpp index 0ae3937a86..f05f1cf374 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_wait_on_events.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_wait_on_events.cpp @@ -214,16 +214,23 @@ HWTEST2_F(CommandListAppendWaitOnEvent, givenImmediateCmdListAndAppendingRegular std::unique_ptr commandListRegular(CommandList::create(productFamily, device, NEO::EngineGroupType::compute, 0u, returnValue, false)); commandListRegular->close(); auto commandListHandle = commandListRegular->toHandle(); - auto result = immCommandList->appendCommandLists(1u, &commandListHandle, nullptr, 1u, &hEventHandle); + // 1st append can carry preamble + auto result = immCommandList->appendCommandLists(1u, &commandListHandle, nullptr, 1u, &hEventHandle); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + // 2nd append should carry only wait events and bb_start to regular command list + auto usedSpaceBefore = immCommandList->getCmdContainer().getCommandStream()->getUsed(); + + result = immCommandList->appendCommandLists(1u, &commandListHandle, nullptr, 1u, &hEventHandle); EXPECT_EQ(ZE_RESULT_SUCCESS, result); auto usedSpaceAfter = immCommandList->getCmdContainer().getCommandStream()->getUsed(); GenCmdList cmdList; ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(cmdList, - immCommandList->getCmdContainer().getCommandStream()->getCpuBase(), - usedSpaceAfter)); + ptrOffset(immCommandList->getCmdContainer().getCommandStream()->getCpuBase(), usedSpaceBefore), + usedSpaceAfter - usedSpaceBefore)); auto itor = find(cmdList.begin(), cmdList.end()); ASSERT_NE(cmdList.end(), itor); diff --git a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_1.cpp b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_1.cpp index a7d59b6bbf..0d9289fa66 100644 --- a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_1.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_1.cpp @@ -540,48 +540,6 @@ HWTEST_F(CommandQueueCreate, givenUpdateTaskCountFromWaitAndRegularCmdListWhenDi commandQueue->destroy(); } -HWTEST_F(CommandQueueCreate, givenUpdateTaskCountFromWaitAndImmediateCmdListWhenDispatchTaskCountWriteThenNoPipeControlFlushed) { - using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; - using POST_SYNC_OPERATION = typename FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION; - - DebugManagerStateRestore restorer; - debugManager.flags.UpdateTaskCountFromWait.set(3); - - const ze_command_queue_desc_t desc = {}; - ze_result_t returnValue; - auto commandQueue = whiteboxCast(CommandQueue::create(productFamily, - device, - neoDevice->getDefaultEngine().commandStreamReceiver, - &desc, - false, - false, - true, - returnValue)); - - auto commandList = CommandList::createImmediate(productFamily, device, &desc, false, NEO::EngineGroupType::renderCompute, returnValue); - ASSERT_NE(nullptr, commandList); - - ze_command_list_handle_t cmdListHandle = commandList->toHandle(); - commandQueue->executeCommandLists(1, &cmdListHandle, nullptr, false, nullptr); - - GenCmdList cmdList; - ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer( - cmdList, ptrOffset(commandQueue->commandStream.getCpuBase(), 0), commandQueue->commandStream.getUsed())); - - auto pipeControls = findAll(cmdList.begin(), cmdList.end()); - bool pipeControlsPostSync = false; - for (size_t i = 0; i < pipeControls.size(); i++) { - auto pipeControl = reinterpret_cast(*pipeControls[i]); - if (pipeControl->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) { - pipeControlsPostSync = true; - } - } - EXPECT_FALSE(pipeControlsPostSync); - - commandList->destroy(); - commandQueue->destroy(); -} - HWTEST_F(CommandQueueCreate, givenContainerWithAllocationsWhenResidencyContainerIsEmptyThenMakeResidentWasNotCalled) { auto csr = std::make_unique(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield()); csr->setupContext(*neoDevice->getDefaultEngine().osContext); diff --git a/shared/source/command_container/cmdcontainer.cpp b/shared/source/command_container/cmdcontainer.cpp index 7d403296ac..258e271046 100644 --- a/shared/source/command_container/cmdcontainer.cpp +++ b/shared/source/command_container/cmdcontainer.cpp @@ -112,7 +112,7 @@ CommandContainer::ErrorCode CommandContainer::initialize(Device *device, Allocat return ErrorCode::outOfDeviceMemory; } secondaryCommandStreamForImmediateCmdList = std::make_unique(cmdBufferAllocationHost->getUnderlyingBuffer(), - usableSize, this, this->selectedBbCmdSize); + usableSize, cmdcontainer, this->selectedBbCmdSize); secondaryCommandStreamForImmediateCmdList->replaceGraphicsAllocation(cmdBufferAllocationHost); cmdBufferAllocations.push_back(cmdBufferAllocationHost); addToResidencyContainer(cmdBufferAllocationHost); diff --git a/shared/test/unit_test/command_container/command_container_tests.cpp b/shared/test/unit_test/command_container/command_container_tests.cpp index b24a7fc900..281638a02d 100644 --- a/shared/test/unit_test/command_container/command_container_tests.cpp +++ b/shared/test/unit_test/command_container/command_container_tests.cpp @@ -1919,6 +1919,15 @@ TEST_F(CommandContainerTest, givenCmdContainerWhenImmediateCmdListCsrIsSetThenCo EXPECT_EQ(cmdContainer.getCommandStream()->getCmdContainer(), nullptr); } +TEST_F(CommandContainerTest, givenCmdContainerWithImmediateCsrWhenCreatingSecondaryCmdBufferThenSecondaryStreamHasCmdContainerSetToNullptr) { + MyMockCommandContainer cmdContainer; + cmdContainer.setImmediateCmdListCsr(pDevice->getDefaultEngine().commandStreamReceiver); + constexpr bool createSecondary = true; + cmdContainer.initialize(pDevice, nullptr, HeapSize::defaultHeapSize, false, createSecondary); + ASSERT_NE(nullptr, cmdContainer.secondaryCommandStreamForImmediateCmdList.get()); + EXPECT_EQ(cmdContainer.secondaryCommandStreamForImmediateCmdList->getCmdContainer(), nullptr); +} + TEST_F(CommandContainerTest, givenCmdContainerWhenOldHeapIsStoredAndResetContainerThenUseStorageForReuseForStoredHeap) { MyMockCommandContainer cmdContainer;