diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.h b/level_zero/core/source/cmdqueue/cmdqueue_hw.h index f25c1dab84..4e1001de71 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.h +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.h @@ -133,8 +133,7 @@ struct CommandQueueHw : public CommandQueueImp { uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, ze_fence_handle_t hFence, - ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, - ze_event_handle_t *phWaitEvents); + NEO::LinearStream *parentImmediateCommandlistLinearStream); inline size_t computeDebuggerCmdsSize(const CommandListExecutionContext &ctx); inline size_t computePreemptionSizeForCommandList(CommandListExecutionContext &ctx, CommandList *commandList, diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl index da25ff8185..1b87a57f78 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl @@ -114,7 +114,7 @@ ze_result_t CommandQueueHw::executeCommandLists( this->startingCmdBuffer = &this->commandStream; if (this->isCopyOnlyCommandQueue) { - ret = this->executeCommandListsCopyOnly(ctx, numCommandLists, phCommandLists, hFence, nullptr, 0, nullptr); + ret = this->executeCommandListsCopyOnly(ctx, numCommandLists, phCommandLists, hFence, parentImmediateCommandlistLinearStream); } else if (this->heaplessStateInitEnabled) { ctx.globalInit = false; ret = this->executeCommandListsRegularHeapless(ctx, numCommandLists, phCommandLists, hFence, nullptr, 0, nullptr); @@ -413,7 +413,6 @@ ze_result_t CommandQueueHw::executeCommandListsRegular( this->dispatchTaskCountPostSyncRegular(ctx.isDispatchTaskCountPostSyncRequired, *streamForDispatch); auto submitResult = NEO::SubmissionStatus::failed; - if (parentImmediateCommandlistLinearStream) { submitResult = NEO::SubmissionStatus::success; } else { @@ -446,10 +445,9 @@ ze_result_t CommandQueueHw::executeCommandListsCopyOnly( 
uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, ze_fence_handle_t hFence, - ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, - ze_event_handle_t *phWaitEvents) { + NEO::LinearStream *parentImmediateCommandlistLinearStream) { - this->setupCmdListsAndContextParams(ctx, phCommandLists, numCommandLists, hFence, nullptr); + this->setupCmdListsAndContextParams(ctx, phCommandLists, numCommandLists, hFence, parentImmediateCommandlistLinearStream); ctx.isDirectSubmissionEnabled = this->csr->isBlitterDirectSubmissionEnabled(); size_t linearStreamSizeEstimate = this->estimateLinearStreamSizeInitial(ctx); @@ -478,34 +476,48 @@ ze_result_t CommandQueueHw::executeCommandListsCopyOnly( return ret; } + NEO::LinearStream *streamForDispatch = parentImmediateCommandlistLinearStream ? parentImmediateCommandlistLinearStream : &child; + this->getGlobalFenceAndMakeItResident(); - this->getTagsManagerHeapsAndMakeThemResidentIfSWTagsEnabled(child); - this->csr->programHardwareContext(child); + this->getTagsManagerHeapsAndMakeThemResidentIfSWTagsEnabled(*streamForDispatch); + this->csr->programHardwareContext(*streamForDispatch); for (auto i = 0u; i < numCommandLists; ++i) { auto commandList = CommandList::fromHandle(phCommandLists[i]); - ctx.childGpuAddressPositionBeforeDynamicPreamble = child.getCurrentGpuAddressPosition(); + ctx.childGpuAddressPositionBeforeDynamicPreamble = (*streamForDispatch).getCurrentGpuAddressPosition(); - this->programOneCmdListBatchBufferStart(commandList, child, ctx); + this->programOneCmdListBatchBufferStart(commandList, *streamForDispatch, ctx); this->prefetchMemoryToDeviceAssociatedWithCmdList(commandList); } this->migrateSharedAllocationsIfRequested(ctx.isMigrationRequested, ctx.firstCommandList); this->assignCsrTaskCountToFenceIfAvailable(hFence); - this->programLastCommandListReturnBbStart(child, ctx); + this->programLastCommandListReturnBbStart(*streamForDispatch, ctx); - 
this->dispatchTaskCountPostSyncByMiFlushDw(ctx.isDispatchTaskCountPostSyncRequired, fenceRequired, child); + this->dispatchTaskCountPostSyncByMiFlushDw(ctx.isDispatchTaskCountPostSyncRequired, fenceRequired, *streamForDispatch); this->makeCsrTagAllocationResident(); - auto submitResult = this->prepareAndSubmitBatchBuffer(ctx, child); + + auto submitResult = NEO::SubmissionStatus::failed; + if (parentImmediateCommandlistLinearStream) { + submitResult = NEO::SubmissionStatus::success; + } else { + submitResult = this->prepareAndSubmitBatchBuffer(ctx, *streamForDispatch); + } + this->updateTaskCountAndPostSync(ctx.isDispatchTaskCountPostSyncRequired); this->csr->makeSurfacePackNonResident(this->csr->getResidencyAllocations(), false); - auto completionResult = this->waitForCommandQueueCompletionAndCleanHeapContainer(); + auto completionResult = ZE_RESULT_SUCCESS; + if (!parentImmediateCommandlistLinearStream) { + completionResult = this->waitForCommandQueueCompletionAndCleanHeapContainer(); + } ze_result_t retVal = this->handleSubmissionAndCompletionResults(submitResult, completionResult); - this->csr->getResidencyAllocations().clear(); + if (!parentImmediateCommandlistLinearStream) { + this->csr->getResidencyAllocations().clear(); + } return retVal; } diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_signal_event.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_signal_event.cpp index e23a8ecc26..b517bd3bf2 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_signal_event.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_signal_event.cpp @@ -224,6 +224,70 @@ HWTEST2_F(CommandListAppendSignalEvent, givenImmediateCmdListAndAppendingRegular ASSERT_TRUE(postSyncFound); } +HWTEST2_F(CommandListAppendSignalEvent, givenCopyOnlyImmediateCmdListAndAppendingRegularCommandlistWithWaitOnEventsAndSignalEventThenUseSemaphoreAndFlushDw, IsAtLeastXeHpcCore) { /* Appends one closed regular copy command list to a synchronous copy-engine immediate command list, passing one wait event and one signal event, then parses the immediate command stream and checks which commands were emitted. NOTE(review): template argument lists (on std::unique_ptr, find, findAll, static_cast, ...) appear to be missing in this copy of the patch; presumably the searches target MI_SEMAPHORE_WAIT, MI_BATCH_BUFFER_START and MI_FLUSH_DW given the aliases and iterator names - confirm against the upstream commit. */ + using 
MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; + using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; + + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.count = 1; + eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE; + + ze_event_desc_t eventDesc = {}; + eventDesc.index = 0; + eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; + + ze_result_t result = ZE_RESULT_SUCCESS; + auto eventPoolHostVisible = std::unique_ptr(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result)); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + auto eventHostVisible = std::unique_ptr(Event::create(eventPoolHostVisible.get(), &eventDesc, device)); /* used below as the signal event */ + + auto waitEventPool = std::unique_ptr(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result)); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + auto waitEvent = std::unique_ptr(Event::create(waitEventPool.get(), &eventDesc, device)); /* used below as the single wait event; it comes from its own pool because each pool is created with count = 1 */ + + ze_command_queue_desc_t desc = {}; + desc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS; + ze_result_t returnValue; + std::unique_ptr immCommandList(CommandList::createImmediate(productFamily, device, &desc, false, NEO::EngineGroupType::copy, returnValue)); + ASSERT_NE(nullptr, immCommandList); + + ze_event_handle_t hSignalEventHandle = eventHostVisible->toHandle(); + ze_event_handle_t hWaitEventHandle = waitEvent->toHandle(); + std::unique_ptr commandListRegular(CommandList::create(productFamily, device, NEO::EngineGroupType::copy, 0u, returnValue, false)); + commandListRegular->close(); /* closed right after creation: the test appends nothing to the regular list itself */ + auto commandListHandle = commandListRegular->toHandle(); + auto usedSpaceBefore = immCommandList->getCmdContainer().getCommandStream()->getUsed(); /* usage snapshot, compared with usedSpaceAfter below */ + result = immCommandList->appendCommandLists(1u, &commandListHandle, hSignalEventHandle, 1u, &hWaitEventHandle); + + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + auto usedSpaceAfter = immCommandList->getCmdContainer().getCommandStream()->getUsed(); + 
ASSERT_GT(usedSpaceAfter, usedSpaceBefore); /* appendCommandLists must have grown the immediate command stream */ + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(cmdList, + immCommandList->getCmdContainer().getCommandStream()->getCpuBase(), + usedSpaceAfter)); + + auto itorSemaphore = find(cmdList.begin(), cmdList.end()); + ASSERT_NE(cmdList.end(), itorSemaphore); + + auto itorBBStart = find(itorSemaphore, cmdList.end()); /* search begins at itorSemaphore, so a match cannot precede the semaphore match */ + ASSERT_NE(cmdList.end(), itorBBStart); + + uint32_t expectedMiFlushCount = 1; /* baseline of one; raised to two below when getWaSize(waArgs) reports a non-zero size */ + NEO::EncodeDummyBlitWaArgs waArgs{false, &(device->getNEODevice()->getRootDeviceEnvironmentRef())}; + if (MockEncodeMiFlushDW::getWaSize(waArgs) > 0) { + expectedMiFlushCount = 2; + } + // Add pair of MIFlush for task count update + expectedMiFlushCount += 2; + auto itorMiFlush = findAll(cmdList.begin(), cmdList.end()); + + EXPECT_EQ(expectedMiFlushCount, static_cast(itorMiFlush.size())); +} + HWTEST2_F(CommandListAppendSignalEvent, givenTimestampEventUsedInSignalThenPipeControlAppendedCorrectly, IsAtLeastSkl) { using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;