diff --git a/level_zero/core/source/cmdqueue/cmdqueue.cpp b/level_zero/core/source/cmdqueue/cmdqueue.cpp index 6b4928eece..03c43bda22 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue.cpp +++ b/level_zero/core/source/cmdqueue/cmdqueue.cpp @@ -124,9 +124,13 @@ NEO::SubmissionStatus CommandQueueImp::submitBatchBuffer(size_t offset, NEO::Res bool isCooperative) { UNRECOVERABLE_IF(csr == nullptr); - NEO::BatchBuffer batchBuffer(commandStream.getGraphicsAllocation(), offset, 0, 0, nullptr, false, false, + NEO::BatchBuffer batchBuffer(this->startingCmdBuffer->getGraphicsAllocation(), offset, 0, 0, nullptr, false, false, NEO::QueueThrottle::HIGH, NEO::QueueSliceCount::defaultSliceCount, - commandStream.getUsed(), &commandStream, endingCmdPtr, csr->getNumClients(), false, false); + this->startingCmdBuffer->getUsed(), this->startingCmdBuffer, endingCmdPtr, csr->getNumClients(), false, false); + + if (this->startingCmdBuffer != &this->commandStream) { + this->csr->makeResident(*this->commandStream.getGraphicsAllocation()); + } commandStream.getGraphicsAllocation()->updateTaskCount(csr->peekTaskCount() + 1, csr->getOsContext().getContextId()); commandStream.getGraphicsAllocation()->updateResidencyTaskCount(csr->peekTaskCount() + 1, csr->getOsContext().getContextId()); diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.h b/level_zero/core/source/cmdqueue/cmdqueue_hw.h index 868d55f143..fd42ded9b3 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.h +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.h @@ -74,9 +74,13 @@ struct CommandQueueHw : public CommandQueueImp { NEO::StreamProperties cmdListBeginState{}; uint64_t scratchGsba = 0; + uint64_t childGpuAddressPositionBeforeDynamicPreamble = 0; + size_t spaceForResidency = 10; CommandList *firstCommandList = nullptr; CommandList *lastCommandList = nullptr; + void *currentPatchForChainedBbStart = nullptr; + NEO::PreemptionMode preemptionMode{}; NEO::PreemptionMode statePreemption{}; uint32_t perThreadScratchSpaceSize = 0; @@ -125,6 +129,7 @@ struct CommandQueueHw : public CommandQueueImp { MOCKABLE_VIRTUAL bool isDispatchTaskCountPostSyncRequired(ze_fence_handle_t hFence, bool containsAnyRegularCmdList) const; inline size_t estimateLinearStreamSizeInitial(CommandListExecutionContext &ctx); inline size_t estimateCommandListSecondaryStart(CommandList *commandList); + inline size_t estimateCommandListPrimaryStart(bool required); inline size_t estimateCommandListResidencySize(CommandList *commandList); inline void setFrontEndStateProperties(CommandListExecutionContext &ctx); inline void handleScratchSpaceAndUpdateGSBAStateDirtyFlag(CommandListExecutionContext &ctx); @@ -157,8 +162,12 @@ struct CommandQueueHw : public CommandQueueImp { inline void programOneCmdListFrontEndIfDirty(CommandListExecutionContext &ctx, NEO::LinearStream &commandStream, CommandListRequiredStateChange &cmdListRequiredState); - inline void programOneCmdListBatchBufferStart(CommandList *commandList, NEO::LinearStream &commandStream); inline void programOneCmdListBatchBufferStart(CommandList *commandList, NEO::LinearStream &commandStream, CommandListExecutionContext &ctx); + inline void programOneCmdListBatchBufferStartPrimaryBatchBuffer(CommandList *commandList, NEO::LinearStream &commandStream, CommandListExecutionContext &ctx); + inline void programOneCmdListBatchBufferStartSecondaryBatchBuffer(CommandList *commandList, NEO::LinearStream &commandStream, CommandListExecutionContext &ctx); + inline void programLastCommandListReturnBbStart( + NEO::LinearStream &commandStream, + CommandListExecutionContext &ctx); inline void mergeOneCmdListPipelinedState(CommandList *commandList); inline void programFrontEndAndClearDirtyFlag(bool shouldFrontEndBeProgrammed, CommandListExecutionContext &ctx, diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl index 5acb3e5633..ff37c94f1a 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl @@ -13,6 +13,7 @@ #include "shared/source/command_stream/command_stream_receiver_hw.h" #include "shared/source/command_stream/linear_stream.h" #include "shared/source/command_stream/preemption.h" +#include "shared/source/command_stream/preemption_mode.h" #include "shared/source/command_stream/scratch_space_controller.h" #include "shared/source/command_stream/wait_status.h" #include "shared/source/debugger/debugger_l0.h" @@ -78,13 +79,14 @@ ze_result_t CommandQueueHw::executeCommandLists( auto neoDevice = device->getNEODevice(); auto ctx = CommandListExecutionContext{phCommandLists, numCommandLists, - csr->getPreemptionMode(), + this->isCopyOnlyCommandQueue ? NEO::PreemptionMode::Disabled : csr->getPreemptionMode(), device, NEO::Debugger::isDebugEnabled(internalUsage), csr->isProgramActivePartitionConfigRequired(), performMigration}; ctx.globalInit |= ctx.isDebugEnabled && !this->commandQueueDebugCmdsProgrammed && (neoDevice->getSourceLevelDebugger() || device->getL0Debugger()); + this->startingCmdBuffer = &this->commandStream; this->device->activateMetricGroups(); if (this->isCopyOnlyCommandQueue) { @@ -185,6 +187,8 @@ ze_result_t CommandQueueHw::executeCommandListsRegular( for (auto i = 0u; i < numCommandLists; ++i) { auto commandList = CommandList::fromHandle(commandListHandles[i]); + ctx.childGpuAddressPositionBeforeDynamicPreamble = child.getCurrentGpuAddressPosition(); + if (this->stateChanges.size() > this->currentStateChangeIndex) { auto &stateChange = this->stateChanges[this->currentStateChangeIndex]; if (stateChange.cmdListIndex == i) { @@ -214,6 +218,7 @@ ze_result_t CommandQueueHw::executeCommandListsRegular( this->updateBaseAddressState(ctx.lastCommandList); this->migrateSharedAllocationsIfRequested(ctx.isMigrationRequested, ctx.firstCommandList); + this->programLastCommandListReturnBbStart(child, ctx); this->programStateSipEndWA(ctx.stateSipRequired, child); this->assignCsrTaskCountToFenceIfAvailable(hFence); this->dispatchTaskCountPostSyncRegular(ctx.isDispatchTaskCountPostSyncRequired, child); @@ -250,6 +255,8 @@ ze_result_t CommandQueueHw::executeCommandListsCopyOnly( ctx.spaceForResidency += estimateCommandListResidencySize(commandList); } + linearStreamSizeEstimate += this->estimateCommandListPrimaryStart(ctx.globalInit); + this->csr->getResidencyAllocations().reserve(ctx.spaceForResidency); NEO::EncodeDummyBlitWaArgs waArgs{false, &(this->device->getNEODevice()->getRootDeviceEnvironmentRef())}; @@ -270,7 +277,9 @@ ze_result_t CommandQueueHw::executeCommandListsCopyOnly( for (auto i = 0u; i < numCommandLists; ++i) { auto commandList = CommandList::fromHandle(phCommandLists[i]); - this->programOneCmdListBatchBufferStart(commandList, child); + ctx.childGpuAddressPositionBeforeDynamicPreamble = child.getCurrentGpuAddressPosition(); + + this->programOneCmdListBatchBufferStart(commandList, child, ctx); this->mergeOneCmdListPipelinedState(commandList); this->prefetchMemoryToDeviceAssociatedWithCmdList(commandList); } @@ -278,6 +287,7 @@ ze_result_t CommandQueueHw::executeCommandListsCopyOnly( this->assignCsrTaskCountToFenceIfAvailable(hFence); + this->programLastCommandListReturnBbStart(child, ctx); this->dispatchTaskCountPostSyncByMiFlushDw(ctx.isDispatchTaskCountPostSyncRequired, child); this->makeCsrTagAllocationResident(); @@ -559,7 +569,7 @@ void CommandQueueHw::setupCmdListsAndContextParams( uint32_t numCommandLists, ze_fence_handle_t hFence) { - ctx.containsAnyRegularCmdList |= ctx.firstCommandList->getCmdListType() == CommandList::CommandListType::TYPE_REGULAR; + ctx.containsAnyRegularCmdList = ctx.firstCommandList->getCmdListType() == CommandList::CommandListType::TYPE_REGULAR; for (auto i = 0u; i < numCommandLists; i++) { auto commandList = CommandList::fromHandle(phCommandLists[i]); @@ -625,7 +635,7 @@ size_t CommandQueueHw::estimateLinearStreamSizeInitial( auto hwContextSizeEstimate = this->csr->getCmdsSizeForHardwareContext(); if (hwContextSizeEstimate > 0) { linearStreamSizeEstimate += hwContextSizeEstimate; - ctx.globalInit |= true; + ctx.globalInit = true; } if (ctx.isDirectSubmissionEnabled) { @@ -644,7 +654,7 @@ size_t CommandQueueHw::estimateLinearStreamSizeInitial( if (NEO::DebugManager.flags.EnableSWTags.get()) { linearStreamSizeEstimate += NEO::SWTagsManager::estimateSpaceForSWTags(); - ctx.globalInit |= true; + ctx.globalInit = true; } linearStreamSizeEstimate += NEO::EncodeKernelArgsBuffer::getKernelArgsBufferCmdsSize(this->csr->getKernelArgsBufferAllocation(), @@ -669,8 +679,18 @@ size_t CommandQueueHw::estimateLinearStreamSizeInitial( template size_t CommandQueueHw::estimateCommandListSecondaryStart(CommandList *commandList) { - using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START; - return (commandList->getCmdContainer().getCmdBufferAllocations().size() * NEO::EncodeBatchBufferStartOrEnd::getBatchBufferStartSize()); + if (!this->dispatchCmdListBatchBufferAsPrimary) { + return (commandList->getCmdContainer().getCmdBufferAllocations().size() * NEO::EncodeBatchBufferStartOrEnd::getBatchBufferStartSize()); + } + return 0; +} + +template +size_t CommandQueueHw::estimateCommandListPrimaryStart(bool required) { + if (this->dispatchCmdListBatchBufferAsPrimary && required) { + return NEO::EncodeBatchBufferStartOrEnd::getBatchBufferStartSize(); + } + return 0; } template @@ -758,6 +778,7 @@ size_t CommandQueueHw::estimateLinearStreamSizeComplementary( if (propertyScmDirty || propertyFeDirty || propertyPsDirty || propertySbaDirty || frontEndReturnPoint || propertyPreemptionDirty) { CommandListDirtyFlags dirtyFlags = {propertyScmDirty, propertyFeDirty, propertyPsDirty, propertySbaDirty, frontEndReturnPoint, propertyPreemptionDirty}; this->stateChanges.emplace_back(stagingState, cmdList, dirtyFlags, ctx.statePreemption, i); + linearStreamSizeEstimate += this->estimateCommandListPrimaryStart(true); } } @@ -770,7 +791,7 @@ size_t CommandQueueHw::estimateLinearStreamSizeComplementary( auto csrHw = static_cast *>(this->csr); linearStreamSizeEstimate += csrHw->getCmdSizeForPerDssBackedBuffer(this->device->getHwInfo()); - ctx.globalInit |= true; + ctx.globalInit = true; } NEO::Device *neoDevice = this->device->getNEODevice(); @@ -781,6 +802,10 @@ size_t CommandQueueHw::estimateLinearStreamSizeComplementary( linearStreamSizeEstimate += NEO::PreemptionHelper::getRequiredStateSipCmdSize(*neoDevice, this->csr->isRcs()); } + bool firstCmdlistDynamicPreamble = (this->stateChanges.size() > 0 && this->stateChanges[0].cmdListIndex == 0); + bool estimateBbStartForGlobalInitOnly = !firstCmdlistDynamicPreamble && ctx.globalInit; + linearStreamSizeEstimate += this->estimateCommandListPrimaryStart(estimateBbStartForGlobalInitOnly); + return linearStreamSizeEstimate; } @@ -983,18 +1008,63 @@ void CommandQueueHw::writeCsrStreamInlineIfLogicalStateHelperAvai } template -void CommandQueueHw::programOneCmdListBatchBufferStart(CommandList *commandList, NEO::LinearStream &cmdStream) { - CommandListExecutionContext ctx = {}; - programOneCmdListBatchBufferStart(commandList, cmdStream, ctx); +void CommandQueueHw::programOneCmdListBatchBufferStart(CommandList *commandList, NEO::LinearStream &commandStream, CommandListExecutionContext &ctx) { + if (this->dispatchCmdListBatchBufferAsPrimary) { + programOneCmdListBatchBufferStartPrimaryBatchBuffer(commandList, commandStream, ctx); + } else { + programOneCmdListBatchBufferStartSecondaryBatchBuffer(commandList, commandStream, ctx); + } } template -void CommandQueueHw::programOneCmdListBatchBufferStart(CommandList *commandList, NEO::LinearStream &cmdStream, CommandListExecutionContext &ctx) { +void CommandQueueHw::programOneCmdListBatchBufferStartPrimaryBatchBuffer(CommandList *commandList, NEO::LinearStream &commandStream, CommandListExecutionContext &ctx) { + using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START; + + NEO::CommandContainer &cmdListContainer = commandList->getCmdContainer(); + NEO::GraphicsAllocation *cmdListFirstCmdBuffer = cmdListContainer.getCmdBufferAllocations()[0]; + auto bbStartPatchLocation = reinterpret_cast(ctx.currentPatchForChainedBbStart); + + bool dynamicPreamble = ctx.childGpuAddressPositionBeforeDynamicPreamble != commandStream.getCurrentGpuAddressPosition(); + if (ctx.globalInit || dynamicPreamble) { + if (ctx.currentPatchForChainedBbStart) { + // dynamic preamble, 2nd or later command list + // jump from previous command list to the position before dynamic preamble + NEO::EncodeBatchBufferStartOrEnd::programBatchBufferStart( + bbStartPatchLocation, + ctx.childGpuAddressPositionBeforeDynamicPreamble, + false, false, false); + } + // dynamic preamble, jump from current position, after dynamic preamble to the current command list + NEO::EncodeBatchBufferStartOrEnd::programBatchBufferStart(&commandStream, cmdListFirstCmdBuffer->getGpuAddress(), false, false, false); + + ctx.globalInit = false; + } else { + if (ctx.currentPatchForChainedBbStart == nullptr) { + // nothing to dispatch from queue, first command list will be used as submitting batch buffer to KMD or ULLS + size_t firstCmdBufferAlignedSize = cmdListContainer.getAlignedPrimarySize(); + this->firstCmdListStream.replaceGraphicsAllocation(cmdListFirstCmdBuffer); + this->firstCmdListStream.replaceBuffer(cmdListFirstCmdBuffer->getUnderlyingBuffer(), firstCmdBufferAlignedSize); + this->firstCmdListStream.getSpace(firstCmdBufferAlignedSize); + this->startingCmdBuffer = &this->firstCmdListStream; + } else { + // chain between command lists when no dynamic preamble required between 2nd and next command list + NEO::EncodeBatchBufferStartOrEnd::programBatchBufferStart( + bbStartPatchLocation, + cmdListFirstCmdBuffer->getGpuAddress(), + false, false, false); + } + } + + ctx.currentPatchForChainedBbStart = cmdListContainer.getEndCmdPtr(); +} + +template +void CommandQueueHw::programOneCmdListBatchBufferStartSecondaryBatchBuffer(CommandList *commandList, NEO::LinearStream &commandStream, CommandListExecutionContext &ctx) { auto &commandContainer = commandList->getCmdContainer(); auto &cmdBufferAllocations = commandContainer.getCmdBufferAllocations(); auto cmdBufferCount = cmdBufferAllocations.size(); - bool isCommandListImmediate = (commandList->getCmdListType() == CommandList::CommandListType::TYPE_IMMEDIATE) ? true : false; + bool isCommandListImmediate = !ctx.containsAnyRegularCmdList; auto &returnPoints = commandList->getReturnPoints(); uint32_t returnPointsSize = commandList->getReturnPointsSize(); @@ -1006,7 +1076,7 @@ void CommandQueueHw::programOneCmdListBatchBufferStart(CommandLis if (isCommandListImmediate && (iter == (cmdBufferCount - 1))) { startOffset = ptrOffset(allocation->getGpuAddress(), commandContainer.currentLinearStreamStartOffsetRef()); } - NEO::EncodeBatchBufferStartOrEnd::programBatchBufferStart(&cmdStream, startOffset, true, false, false); + NEO::EncodeBatchBufferStartOrEnd::programBatchBufferStart(&commandStream, startOffset, true, false, false); if (returnPointsSize > 0) { bool cmdBufferHasRestarts = std::find_if( std::next(returnPoints.begin(), returnPointIdx), @@ -1020,9 +1090,9 @@ void CommandQueueHw::programOneCmdListBatchBufferStart(CommandLis ctx.cmdListBeginState.frontEndState.copyPropertiesComputeDispatchAllWalkerEnableDisableEuFusion(returnPoints[returnPointIdx].configSnapshot.frontEndState); programFrontEnd(scratchSpaceController->getScratchPatchAddress(), scratchSpaceController->getPerThreadScratchSpaceSize(), - cmdStream, + commandStream, ctx.cmdListBeginState); - NEO::EncodeBatchBufferStartOrEnd::programBatchBufferStart(&cmdStream, + NEO::EncodeBatchBufferStartOrEnd::programBatchBufferStart(&commandStream, returnPoints[returnPointIdx].gpuAddress, true, false, false); returnPointIdx++; @@ -1032,6 +1102,20 @@ void CommandQueueHw::programOneCmdListBatchBufferStart(CommandLis } } +template +void CommandQueueHw::programLastCommandListReturnBbStart( + NEO::LinearStream &commandStream, + CommandListExecutionContext &ctx) { + using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START; + if (this->dispatchCmdListBatchBufferAsPrimary) { + auto finalReturnPosition = commandStream.getCurrentGpuAddressPosition(); + auto bbStartCmd = reinterpret_cast(ctx.currentPatchForChainedBbStart); + NEO::EncodeBatchBufferStartOrEnd::programBatchBufferStart(bbStartCmd, + finalReturnPosition, + false, false, false); + } +} + template void CommandQueueHw::mergeOneCmdListPipelinedState(CommandList *commandList) { @@ -1166,8 +1250,11 @@ NEO::SubmissionStatus CommandQueueHw::prepareAndSubmitBatchBuffer void *paddingPtr = innerCommandStream.getSpace(this->alignedChildStreamPadding); memset(paddingPtr, 0, this->alignedChildStreamPadding); } + size_t startOffset = (this->startingCmdBuffer == &this->firstCmdListStream) + ? 0 + : ptrDiff(innerCommandStream.getCpuBase(), outerCommandStream.getCpuBase()); - return submitBatchBuffer(ptrDiff(innerCommandStream.getCpuBase(), outerCommandStream.getCpuBase()), + return submitBatchBuffer(startOffset, csr->getResidencyAllocations(), endingCmd, ctx.anyCommandListWithCooperativeKernels); diff --git a/level_zero/core/source/cmdqueue/cmdqueue_imp.h b/level_zero/core/source/cmdqueue/cmdqueue_imp.h index a4483bacf1..cdf8104800 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_imp.h +++ b/level_zero/core/source/cmdqueue/cmdqueue_imp.h @@ -129,6 +129,7 @@ struct CommandQueueImp : public CommandQueue { CommandBufferManager buffers; NEO::LinearStream commandStream{}; + NEO::LinearStream firstCmdListStream{}; NEO::HeapContainer heapContainer; ze_command_queue_desc_t desc; std::vector printfKernelContainer; @@ -138,6 +139,7 @@ struct CommandQueueImp : public CommandQueue { Device *device = nullptr; NEO::CommandStreamReceiver *csr = nullptr; + NEO::LinearStream *startingCmdBuffer = nullptr; uint32_t currentStateChangeIndex = 0; diff --git a/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl b/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl index b8a2f5e32f..5bd329aa33 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl @@ -187,7 +187,6 @@ void CommandQueueHw::patchCommands(CommandList &commandList, uint for (auto &commandToPatch : commandsToPatch) { switch (commandToPatch.type) { case CommandList::CommandToPatch::FrontEndState: { - UNRECOVERABLE_IF(scratchAddress == 0u); uint32_t lowScratchAddress = uint32_t(0xFFFFFFFF & scratchAddress); CFE_STATE *cfeStateCmd = nullptr; cfeStateCmd = reinterpret_cast(commandToPatch.pCommand); diff --git a/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.cpp b/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.cpp index 395b8331a3..6023bcea8a 100644 --- a/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.cpp +++ b/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.cpp @@ -334,7 +334,7 @@ void CommandListAppendLaunchRayTracingKernelFixture::tearDown() { } void PrimaryBatchBufferCmdListFixture::setUp() { - NEO::DebugManager.flags.DispatchCmdlistCmdBufferPrimary.set(1); + DebugManager.flags.DispatchCmdlistCmdBufferPrimary.set(1); ModuleMutableCommandListFixture::setUp(); } diff --git a/level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h b/level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h index 87fe4cf620..4f2a380628 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h +++ b/level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h @@ -30,6 +30,7 @@ struct WhiteBox<::L0::CommandQueue> : public ::L0::CommandQueueImp { using BaseClass::device; using BaseClass::preemptionCmdSyncProgramming; using BaseClass::printfKernelContainer; + using BaseClass::startingCmdBuffer; using BaseClass::submitBatchBuffer; using BaseClass::synchronizeByPollingForTaskCount; using BaseClass::taskCount; @@ -71,6 +72,7 @@ struct MockCommandQueueHw : public L0::CommandQueueHw { using BaseClass::commandStream; using BaseClass::prepareAndSubmitBatchBuffer; using BaseClass::printfKernelContainer; + using BaseClass::startingCmdBuffer; using L0::CommandQueue::activeSubDevices; using L0::CommandQueue::cmdListHeapAddressModel; using L0::CommandQueue::dispatchCmdListBatchBufferAsPrimary; @@ -106,6 +108,9 @@ struct MockCommandQueueHw : public L0::CommandQueueHw { if (submitBatchBufferReturnValue.has_value()) { return *submitBatchBufferReturnValue; } + if (this->startingCmdBuffer == nullptr) { + this->startingCmdBuffer = &this->commandStream; + } return BaseClass::submitBatchBuffer(offset, residencyContainer, endingCmdPtr, isCooperative); } diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp index a20795a3da..f37e06058f 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp @@ -14,6 +14,7 @@ #include "shared/test/common/test_macros/hw_test.h" #include "level_zero/core/source/builtin/builtin_functions_lib.h" +#include "level_zero/core/source/device/device.h" #include "level_zero/core/source/gfx_core_helpers/l0_gfx_core_helper.h" #include "level_zero/core/source/image/image_hw.h" #include "level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.h" @@ -1530,6 +1531,113 @@ HWTEST_F(PrimaryBatchBufferCmdListTest, givenPrimaryBatchBufferWhenCommandListHa EXPECT_EQ(expectedEndPtr, cmdContainer.getEndCmdPtr()); } +HWTEST_F(PrimaryBatchBufferCmdListTest, givenPrimaryBatchBufferWhenCopyCommandListAndQueueAreCreatedThenFirstDispatchCreatesGlobalInitPreambleAndLaterDispatchProvideCmdListBuffer) { + using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; + + ze_result_t returnValue; + uint32_t count = 0u; + returnValue = device->getCommandQueueGroupProperties(&count, nullptr); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + EXPECT_GT(count, 0u); + + std::vector properties(count); + returnValue = device->getCommandQueueGroupProperties(&count, properties.data()); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + + uint32_t ordinal = 0u; + for (ordinal = 0u; ordinal < count; ordinal++) { + if ((properties[ordinal].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) && + !(properties[ordinal].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE)) { + if (properties[ordinal].numQueues == 0) { + continue; + } + break; + } + } + + if (ordinal == count) { + GTEST_SKIP(); + } + + void *dstPtr = nullptr; + void *srcPtr = nullptr; + const size_t size = 64; + ze_device_mem_alloc_desc_t deviceDesc = {}; + returnValue = context->allocDeviceMem(device->toHandle(), &deviceDesc, size, 4u, &dstPtr); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + + returnValue = context->allocDeviceMem(device->toHandle(), &deviceDesc, size, 4u, &srcPtr); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + + ze_command_queue_desc_t desc{}; + desc.ordinal = ordinal; + desc.index = 0u; + + ze_command_queue_handle_t commandQueueHandle; + returnValue = device->createCommandQueue(&desc, &commandQueueHandle); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + auto commandQueueCopy = static_cast(L0::CommandQueue::fromHandle(commandQueueHandle)); + ASSERT_NE(commandQueueCopy, nullptr); + + auto ultCsr = static_cast *>(commandQueueCopy->getCsr()); + ultCsr->recordFlusheBatchBuffer = true; + + std::unique_ptr commandListCopy; + commandListCopy.reset(whiteboxCast(CommandList::create(productFamily, device, NEO::EngineGroupType::Copy, 0u, returnValue))); + ASSERT_EQ(ZE_RESULT_SUCCESS, returnValue); + + auto &cmdContainerCopy = commandListCopy->getCmdContainer(); + auto &cmdListStream = *cmdContainerCopy.getCommandStream(); + auto firstCmdBufferAllocation = cmdContainerCopy.getCmdBufferAllocations()[0]; + + returnValue = commandListCopy->appendMemoryCopy(dstPtr, srcPtr, size, nullptr, 0, nullptr, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + + size_t firstCmdBufferUsed = cmdListStream.getUsed(); + auto bbStartSpace = ptrOffset(cmdListStream.getCpuBase(), firstCmdBufferUsed); + + returnValue = commandListCopy->close(); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + + EXPECT_EQ(bbStartSpace, cmdContainerCopy.getEndCmdPtr()); + size_t expectedAlignedUse = alignUp(firstCmdBufferUsed + sizeof(MI_BATCH_BUFFER_START), NEO::CommandContainer::minCmdBufferPtrAlign); + EXPECT_EQ(expectedAlignedUse, cmdContainerCopy.getAlignedPrimarySize()); + + size_t blitterContextInitSize = ultCsr->getCmdsSizeForHardwareContext(); + + auto cmdListHandle = commandListCopy->toHandle(); + returnValue = commandQueueCopy->executeCommandLists(1, &cmdListHandle, nullptr, true); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + + auto bbStartCmd = genCmdCast(bbStartSpace); + ASSERT_NE(nullptr, bbStartCmd); + + auto &cmdQueueStream = commandQueueCopy->commandStream; + if (blitterContextInitSize > 0) { + EXPECT_EQ(cmdQueueStream.getGraphicsAllocation(), ultCsr->latestFlushedBatchBuffer.commandBufferAllocation); + } else { + EXPECT_EQ(firstCmdBufferAllocation, ultCsr->latestFlushedBatchBuffer.commandBufferAllocation); + EXPECT_EQ(cmdQueueStream.getGpuBase(), bbStartCmd->getBatchBufferStartAddress()); + } + size_t queueSizeUsed = cmdQueueStream.getUsed(); + + returnValue = commandQueueCopy->executeCommandLists(1, &cmdListHandle, nullptr, true); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + + bbStartCmd = genCmdCast(bbStartSpace); + ASSERT_NE(nullptr, bbStartCmd); + + EXPECT_EQ(cmdQueueStream.getGpuBase() + queueSizeUsed, bbStartCmd->getBatchBufferStartAddress()); + + commandQueueCopy->destroy(); + commandListCopy.reset(nullptr); + + returnValue = context->freeMem(dstPtr); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + returnValue = context->freeMem(srcPtr); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); +} + using PrimaryBatchBufferPreamblelessCmdListTest = Test; HWTEST2_F(PrimaryBatchBufferPreamblelessCmdListTest, @@ -1592,5 +1700,213 @@ HWTEST2_F(PrimaryBatchBufferPreamblelessCmdListTest, EXPECT_EQ((uncachedMocs << 1), sbaCmd->getStatelessDataPortAccessMemoryObjectControlState()); } +HWTEST2_F(PrimaryBatchBufferPreamblelessCmdListTest, + givenPrimaryBatchBufferWhenExecutingCommandWithoutPreambleThenUseCommandListBufferAsStartingBuffer, + IsAtLeastXeHpCore) { + using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; + + auto ultCsr = static_cast *>(commandQueue->getCsr()); + ultCsr->recordFlusheBatchBuffer = true; + + ze_group_count_t groupCount{1, 1, 1}; + CmdListKernelLaunchParams launchParams = {}; + ze_result_t result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + result = commandList->close(); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + auto commandListHandle = commandList->toHandle(); + result = commandQueue->executeCommandLists(1, &commandListHandle, nullptr, true); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + auto &cmdQueueStream = commandQueue->commandStream; + EXPECT_EQ(cmdQueueStream.getGraphicsAllocation(), ultCsr->latestFlushedBatchBuffer.commandBufferAllocation); + + size_t queueUsedSize = cmdQueueStream.getUsed(); + auto gpuReturnAddress = cmdQueueStream.getGpuBase() + queueUsedSize; + + result = commandQueue->executeCommandLists(1, &commandListHandle, nullptr, true); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + auto &cmdContainer = commandList->getCmdContainer(); + auto firstCmdBufferAllocation = cmdContainer.getCmdBufferAllocations()[0]; + EXPECT_EQ(firstCmdBufferAllocation, ultCsr->latestFlushedBatchBuffer.commandBufferAllocation); + + auto bbStartCmd = genCmdCast(cmdContainer.getEndCmdPtr()); + ASSERT_NE(nullptr, bbStartCmd); + EXPECT_EQ(gpuReturnAddress, bbStartCmd->getBatchBufferStartAddress()); +} + +HWTEST2_F(PrimaryBatchBufferPreamblelessCmdListTest, + givenPrimaryBatchBufferWhenExecutingMultipleCommandListsAndEachWithoutPreambleThenUseCommandListBufferAsStartingBufferAndChainAllCommandLists, + IsAtLeastXeHpCore) { + + using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; + + auto ultCsr = static_cast *>(commandQueue->getCsr()); + ultCsr->recordFlusheBatchBuffer = true; + + ze_group_count_t groupCount{1, 1, 1}; + CmdListKernelLaunchParams launchParams = {}; + ze_result_t result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + result = commandList->close(); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + ze_command_list_handle_t commandLists[] = {commandList->toHandle(), + commandList2->toHandle(), + commandList3->toHandle()}; + + result = commandQueue->executeCommandLists(1, commandLists, nullptr, true); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + auto &cmdQueueStream = commandQueue->commandStream; + EXPECT_EQ(cmdQueueStream.getGraphicsAllocation(), ultCsr->latestFlushedBatchBuffer.commandBufferAllocation); + + size_t queueUsedSize = cmdQueueStream.getUsed(); + auto gpuReturnAddress = cmdQueueStream.getGpuBase() + queueUsedSize; + + result = commandList2->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + result = commandList2->close(); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + result = commandList3->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + result = commandList3->close(); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + result = commandQueue->executeCommandLists(3, commandLists, nullptr, true); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, + ptrOffset(cmdQueueStream.getCpuBase(), queueUsedSize), + cmdQueueStream.getUsed() - queueUsedSize)); + auto cmdQueueBbStartCmds = findAll(cmdList.begin(), cmdList.end()); + EXPECT_EQ(0u, cmdQueueBbStartCmds.size()); + + auto &cmdContainer1stCmdList = commandList->getCmdContainer(); + auto dispatchCmdBufferAllocation = cmdContainer1stCmdList.getCmdBufferAllocations()[0]; + EXPECT_EQ(dispatchCmdBufferAllocation, ultCsr->latestFlushedBatchBuffer.commandBufferAllocation); + + auto bbStartCmd = genCmdCast(cmdContainer1stCmdList.getEndCmdPtr()); + ASSERT_NE(nullptr, bbStartCmd); + + auto &cmdContainer2ndCmdList = commandList2->getCmdContainer(); + auto secondCmdBufferAllocation = cmdContainer2ndCmdList.getCmdBufferAllocations()[0]; + EXPECT_EQ(secondCmdBufferAllocation->getGpuAddress(), bbStartCmd->getBatchBufferStartAddress()); + + bbStartCmd = genCmdCast(cmdContainer2ndCmdList.getEndCmdPtr()); + ASSERT_NE(nullptr, bbStartCmd); + + auto &cmdContainer3rdCmdList = commandList3->getCmdContainer(); + auto thirdCmdBufferAllocation = cmdContainer3rdCmdList.getCmdBufferAllocations()[0]; + EXPECT_EQ(thirdCmdBufferAllocation->getGpuAddress(), bbStartCmd->getBatchBufferStartAddress()); + + bbStartCmd = genCmdCast(cmdContainer3rdCmdList.getEndCmdPtr()); + ASSERT_NE(nullptr, bbStartCmd); + EXPECT_EQ(gpuReturnAddress, bbStartCmd->getBatchBufferStartAddress()); +} + +HWTEST2_F(PrimaryBatchBufferPreamblelessCmdListTest, + givenPrimaryBatchBufferWhenExecutingMultipleCommandListsAndSecondWithPreambleThenUseCommandListBufferAsStartingBufferAndChainFirstListToQueuePreambleAndAfterToSecondList, + IsAtLeastXeHpCore) { + using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; + using STATE_BASE_ADDRESS = typename FamilyType::STATE_BASE_ADDRESS; + + auto ultCsr = static_cast *>(commandQueue->getCsr()); + ultCsr->recordFlusheBatchBuffer = true; + + ze_group_count_t groupCount{1, 1, 1}; + CmdListKernelLaunchParams launchParams = {}; + ze_result_t result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + result = commandList->close(); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + ze_command_list_handle_t commandLists[] = {commandList->toHandle(), + commandList2->toHandle(), + commandList3->toHandle()}; + + result = commandQueue->executeCommandLists(1, commandLists, nullptr, true); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + auto &cmdQueueStream = commandQueue->commandStream; + EXPECT_EQ(cmdQueueStream.getGraphicsAllocation(), ultCsr->latestFlushedBatchBuffer.commandBufferAllocation); + + size_t queueUsedSize = cmdQueueStream.getUsed(); + auto gpuReturnAddress = cmdQueueStream.getGpuBase() + queueUsedSize; + + kernel->kernelRequiresUncachedMocsCount++; + + result = commandList2->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + result = commandList2->close(); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + result = commandList3->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + result = commandList3->close(); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + result = commandQueue->executeCommandLists(3, commandLists, nullptr, true); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + // 1st command list is preambleless + auto &cmdContainer1stCmdList = commandList->getCmdContainer(); + auto dispatchCmdBufferAllocation = cmdContainer1stCmdList.getCmdBufferAllocations()[0]; + EXPECT_EQ(dispatchCmdBufferAllocation, ultCsr->latestFlushedBatchBuffer.commandBufferAllocation); + + auto bbStartCmd = genCmdCast(cmdContainer1stCmdList.getEndCmdPtr()); + ASSERT_NE(nullptr, bbStartCmd); + + // ending BB_START of 1st command list points to dynamic preamble - dirty stateless mocs SBA command + EXPECT_EQ(gpuReturnAddress, bbStartCmd->getBatchBufferStartAddress()); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, + ptrOffset(cmdQueueStream.getCpuBase(), queueUsedSize), + cmdQueueStream.getUsed() - queueUsedSize)); + auto cmdQueueSbaDirtyCmds = findAll(cmdList.begin(), cmdList.end()); + ASSERT_TRUE(cmdQueueSbaDirtyCmds.size() >= 1u); + + auto cmdQueueBbStartCmds = findAll(cmdList.begin(), cmdList.end()); + ASSERT_EQ(1u, cmdQueueBbStartCmds.size()); + + auto chainFromPreambleToSecondBbStartCmd = reinterpret_cast(*cmdQueueBbStartCmds[0]); + + auto &cmdContainer2ndCmdList = commandList2->getCmdContainer(); + auto secondCmdBufferAllocation = cmdContainer2ndCmdList.getCmdBufferAllocations()[0]; + EXPECT_EQ(secondCmdBufferAllocation->getGpuAddress(), chainFromPreambleToSecondBbStartCmd->getBatchBufferStartAddress()); + + bbStartCmd = genCmdCast(cmdContainer2ndCmdList.getEndCmdPtr()); + ASSERT_NE(nullptr, bbStartCmd); + + auto &cmdContainer3rdCmdList = commandList3->getCmdContainer(); + auto thirdCmdBufferAllocation = cmdContainer3rdCmdList.getCmdBufferAllocations()[0]; + EXPECT_EQ(thirdCmdBufferAllocation->getGpuAddress(), bbStartCmd->getBatchBufferStartAddress()); + + bbStartCmd = genCmdCast(cmdContainer3rdCmdList.getEndCmdPtr()); + ASSERT_NE(nullptr, bbStartCmd); + + size_t sbaSize = sizeof(STATE_BASE_ADDRESS) + NEO::MemorySynchronizationCommands::getSizeForSingleBarrier(false); + if (commandQueue->doubleSbaWa) { + sbaSize += sizeof(STATE_BASE_ADDRESS); + } + + gpuReturnAddress += sizeof(MI_BATCH_BUFFER_START) + sbaSize; + EXPECT_EQ(gpuReturnAddress, bbStartCmd->getBatchBufferStartAddress()); +} + } // namespace ult } // namespace L0 diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_3.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_3.cpp index 00a682b0c9..e506b76c75 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_3.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_3.cpp @@ -1534,15 +1534,19 @@ HWTEST2_F(FrontEndPrimaryBatchBufferCommandListTest, ze_group_count_t groupCount{1, 1, 1}; CmdListKernelLaunchParams launchParams = {}; - commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + ze_result_t result; + result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); auto &commandsToPatch = commandList->commandsToPatch; EXPECT_EQ(0u, commandsToPatch.size()); + mockKernelImmData->kernelDescriptor->kernelAttributes.perThreadScratchSize[0] = 0x40; mockKernelImmData->kernelDescriptor->kernelAttributes.flags.requiresDisabledEUFusion = 1; size_t usedBefore = cmdStream.getUsed(); - commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); if (fePropertiesSupport.disableEuFusion) { ASSERT_EQ(1u, commandsToPatch.size()); @@ -1555,11 +1559,13 @@ HWTEST2_F(FrontEndPrimaryBatchBufferCommandListTest, auto cfeCmd = genCmdCast(cfePatch.pCommand); ASSERT_NE(nullptr, cfeCmd); EXPECT_TRUE(NEO::UnitTestHelper::getDisableFusionStateFromFrontEndCommand(*cfeCmd)); + EXPECT_EQ(0u, cfeCmd->getScratchSpaceBuffer()); } else { EXPECT_EQ(0u, commandsToPatch.size()); } - commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); if (fePropertiesSupport.disableEuFusion) { EXPECT_EQ(1u, commandsToPatch.size()); @@ -1570,7 +1576,8 @@ HWTEST2_F(FrontEndPrimaryBatchBufferCommandListTest, mockKernelImmData->kernelDescriptor->kernelAttributes.flags.requiresDisabledEUFusion = 0; usedBefore = cmdStream.getUsed(); - commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); if (fePropertiesSupport.disableEuFusion) { ASSERT_EQ(2u, commandsToPatch.size()); @@ -1583,12 +1590,14 @@ HWTEST2_F(FrontEndPrimaryBatchBufferCommandListTest, auto cfeCmd = genCmdCast(cfePatch.pCommand); ASSERT_NE(nullptr, cfeCmd); EXPECT_FALSE(NEO::UnitTestHelper::getDisableFusionStateFromFrontEndCommand(*cfeCmd)); + EXPECT_EQ(0u, cfeCmd->getScratchSpaceBuffer()); } mockKernelImmData->kernelDescriptor->kernelAttributes.flags.requiresDisabledEUFusion = 1; usedBefore = cmdStream.getUsed(); - commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); if (fePropertiesSupport.disableEuFusion) { ASSERT_EQ(3u, commandsToPatch.size()); @@ -1601,15 +1610,40 @@ HWTEST2_F(FrontEndPrimaryBatchBufferCommandListTest, auto cfeCmd = genCmdCast(cfePatch.pCommand); ASSERT_NE(nullptr, cfeCmd); EXPECT_TRUE(NEO::UnitTestHelper::getDisableFusionStateFromFrontEndCommand(*cfeCmd)); + EXPECT_EQ(0u, cfeCmd->getScratchSpaceBuffer()); } else { EXPECT_EQ(0u, commandsToPatch.size()); } + result = commandList->close(); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + auto commandListHandle = commandList->toHandle(); + result = commandQueue->executeCommandLists(1, &commandListHandle, nullptr, true); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + if (fePropertiesSupport.disableEuFusion) { - commandList->reset(); + ASSERT_EQ(3u, commandsToPatch.size()); + + bool disableFusionStates[] = {true, false, true}; + uint32_t disableFusionStatesIdx = 0; + + for (const auto &cfeToPatch : commandsToPatch) { + EXPECT_EQ(CommandList::CommandToPatch::FrontEndState, cfeToPatch.type); + auto cfeCmd = genCmdCast(cfeToPatch.pDestination); + ASSERT_NE(nullptr, cfeCmd); + + EXPECT_EQ(disableFusionStates[disableFusionStatesIdx++], + NEO::UnitTestHelper::getDisableFusionStateFromFrontEndCommand(*cfeCmd)); + EXPECT_NE(0u, cfeCmd->getScratchSpaceBuffer()); + } + + result = commandList->reset(); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); EXPECT_EQ(0u, commandsToPatch.size()); } } + HWTEST2_F(FrontEndPrimaryBatchBufferCommandListTest, givenFrontEndTrackingCmdListIsExecutedWhenPropertyComputeDispatchAllWalkerSupportedThenExpectFrontEndAddedToPatchlist, IsAtLeastXeHpCore) { @@ -1619,6 +1653,8 @@ HWTEST2_F(FrontEndPrimaryBatchBufferCommandListTest, auto &productHelper = device->getProductHelper(); productHelper.fillFrontEndPropertiesSupportStructure(fePropertiesSupport, device->getHwInfo()); + mockKernelImmData->kernelDescriptor->kernelAttributes.perThreadScratchSize[0] = 0x40; + NEO::DebugManager.flags.AllowMixingRegularAndCooperativeKernels.set(1); EXPECT_TRUE(commandList->frontEndStateTracking); @@ -1644,6 +1680,7 @@ HWTEST2_F(FrontEndPrimaryBatchBufferCommandListTest, auto cfeCmd = genCmdCast(cfePatch.pCommand); ASSERT_NE(nullptr, cfeCmd); EXPECT_TRUE(NEO::UnitTestHelper::getComputeDispatchAllWalkerFromFrontEndCommand(*cfeCmd)); + EXPECT_EQ(0u, cfeCmd->getScratchSpaceBuffer()); } else { EXPECT_EQ(0u, commandsToPatch.size()); } @@ -1668,6 +1705,7 @@ HWTEST2_F(FrontEndPrimaryBatchBufferCommandListTest, auto cfeCmd = genCmdCast(cfePatch.pCommand); ASSERT_NE(nullptr, cfeCmd); EXPECT_FALSE(NEO::UnitTestHelper::getComputeDispatchAllWalkerFromFrontEndCommand(*cfeCmd)); + EXPECT_EQ(0u, cfeCmd->getScratchSpaceBuffer()); } else { EXPECT_EQ(0u, commandsToPatch.size()); } @@ -1683,12 +1721,36 @@ HWTEST2_F(FrontEndPrimaryBatchBufferCommandListTest, auto cfeCmd = genCmdCast(cfePatch.pCommand); ASSERT_NE(nullptr, cfeCmd); EXPECT_TRUE(NEO::UnitTestHelper::getComputeDispatchAllWalkerFromFrontEndCommand(*cfeCmd)); + EXPECT_EQ(0u, cfeCmd->getScratchSpaceBuffer()); } else { EXPECT_EQ(0u, commandsToPatch.size()); } + result = commandList->close(); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + auto commandListHandle = commandList->toHandle(); + result = commandQueue->executeCommandLists(1, &commandListHandle, nullptr, true); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + if (fePropertiesSupport.computeDispatchAllWalker) { - commandList->reset(); + ASSERT_EQ(3u, commandsToPatch.size()); + + bool computeDispatchAllWalkerStates[] = {true, false, true}; + uint32_t computeDispatchAllWalkerStatesIdx = 0; + + for (const auto &cfeToPatch : commandsToPatch) { + EXPECT_EQ(CommandList::CommandToPatch::FrontEndState, cfeToPatch.type); + auto cfeCmd = genCmdCast(cfeToPatch.pDestination); + ASSERT_NE(nullptr, cfeCmd); + + EXPECT_EQ(computeDispatchAllWalkerStates[computeDispatchAllWalkerStatesIdx++], + NEO::UnitTestHelper::getComputeDispatchAllWalkerFromFrontEndCommand(*cfeCmd)); + EXPECT_NE(0u, cfeCmd->getScratchSpaceBuffer()); + } + + result = commandList->reset(); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); EXPECT_EQ(0u, commandsToPatch.size()); } } diff --git a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_1.cpp b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_1.cpp index c3baf72565..80a5e06a17 100644 --- a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_1.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_1.cpp @@ -650,6 +650,7 @@ HWTEST_F(CommandQueueCreate, givenContainerWithAllocationsWhenResidencyContainer false, false, returnValue)); + commandQueue->startingCmdBuffer = &commandQueue->commandStream; ResidencyContainer container; TaskCountType peekTaskCountBefore = commandQueue->csr->peekTaskCount(); TaskCountType flushedTaskCountBefore = commandQueue->csr->peekLatestFlushedTaskCount(); @@ -676,6 +677,7 @@ HWTEST_F(CommandQueueCreate, givenCommandStreamReceiverFailsThenSubmitBatchBuffe false, false, returnValue)); + commandQueue->startingCmdBuffer = &commandQueue->commandStream; ResidencyContainer container; TaskCountType peekTaskCountBefore = commandQueue->csr->peekTaskCount(); TaskCountType flushedTaskCountBefore = commandQueue->csr->peekLatestFlushedTaskCount(); @@ -701,6 +703,7 @@ HWTEST_F(CommandQueueCreate, givenOutOfMemoryThenSubmitBatchBufferReturnsOutOfMe false, false, returnValue)); + commandQueue->startingCmdBuffer = &commandQueue->commandStream; ResidencyContainer container; NEO::SubmissionStatus ret = commandQueue->submitBatchBuffer(0, container, nullptr, false); EXPECT_EQ(ret, NEO::SubmissionStatus::OUT_OF_MEMORY); diff --git a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_2.cpp b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_2.cpp index fe09fa7fb3..940f91f373 100644 --- a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_2.cpp @@ -1014,29 +1014,6 @@ HWTEST2_F(CommandQueueScratchTests, whenPatchCommandsIsCalledThenCommandsAreCorr } } -HWTEST2_F(CommandQueueScratchTests, givenInvalidScratchAddressWhenPatchCommandsIsCalledThenAbortIsThrown, IsAtLeastXeHpCore) { - using CFE_STATE = typename FamilyType::CFE_STATE; - - ze_command_queue_desc_t desc = {}; - NEO::CommandStreamReceiver *csr = nullptr; - device->getCsrForOrdinalAndIndex(&csr, 0u, 0u); - auto commandQueue = std::make_unique>(device, csr, &desc); - auto commandList = std::make_unique>>(); - - CFE_STATE destinationCfeState; - auto sourceCfeState = new CFE_STATE; - *sourceCfeState = FamilyType::cmdInitCfeState; - - CommandList::CommandToPatch commandToPatch; - commandToPatch.pDestination = &destinationCfeState; - commandToPatch.pCommand = sourceCfeState; - commandToPatch.type = CommandList::CommandToPatch::CommandType::FrontEndState; - commandList->commandsToPatch.push_back(commandToPatch); - - uint64_t invalidScratchAddress = 0u; - EXPECT_ANY_THROW(commandQueue->patchCommands(*commandList, invalidScratchAddress)); -} - using IsWithinNotSupported = IsWithinGfxCore; HWTEST2_F(CommandQueueScratchTests, givenCommandsToPatchToNotSupportedPlatformWhenPatchCommandsIsCalledThenAbortIsThrown, IsWithinNotSupported) {