From f4512073725e6adc670abfaa9855672411a069a2 Mon Sep 17 00:00:00 2001 From: Zbigniew Zdanowicz Date: Wed, 12 Apr 2023 20:24:09 +0000 Subject: [PATCH] performance: dispatch and chain command list batch buffers as primary Command list batch buffers should be chained when no dynamic or global preamble is present in the command queue. Return to the command queue when a preamble is required. Chain the last command list to the command queue epilog. Provide the first command list batch buffer to KMD/ULLS when there is no command queue preamble. Related-To: NEO-7807 Signed-off-by: Zbigniew Zdanowicz --- level_zero/core/source/cmdqueue/cmdqueue.cpp | 8 +- level_zero/core/source/cmdqueue/cmdqueue_hw.h | 11 +- .../core/source/cmdqueue/cmdqueue_hw.inl | 121 ++++++- .../core/source/cmdqueue/cmdqueue_imp.h | 2 + .../cmdqueue_xe_hp_core_and_later.inl | 1 - .../unit_tests/fixtures/cmdlist_fixture.cpp | 2 +- .../test/unit_tests/mocks/mock_cmdqueue.h | 5 + .../sources/cmdlist/test_cmdlist_2.cpp | 316 ++++++++++++++++++ .../sources/cmdlist/test_cmdlist_3.cpp | 76 ++++- .../sources/cmdqueue/test_cmdqueue_1.cpp | 3 + .../sources/cmdqueue/test_cmdqueue_2.cpp | 23 -- 11 files changed, 516 insertions(+), 52 deletions(-) diff --git a/level_zero/core/source/cmdqueue/cmdqueue.cpp b/level_zero/core/source/cmdqueue/cmdqueue.cpp index 6b4928eece..03c43bda22 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue.cpp +++ b/level_zero/core/source/cmdqueue/cmdqueue.cpp @@ -124,9 +124,13 @@ NEO::SubmissionStatus CommandQueueImp::submitBatchBuffer(size_t offset, NEO::Res bool isCooperative) { UNRECOVERABLE_IF(csr == nullptr); - NEO::BatchBuffer batchBuffer(commandStream.getGraphicsAllocation(), offset, 0, 0, nullptr, false, false, + NEO::BatchBuffer batchBuffer(this->startingCmdBuffer->getGraphicsAllocation(), offset, 0, 0, nullptr, false, false, NEO::QueueThrottle::HIGH, NEO::QueueSliceCount::defaultSliceCount, - commandStream.getUsed(), &commandStream, endingCmdPtr, csr->getNumClients(), false, false); + 
this->startingCmdBuffer->getUsed(), this->startingCmdBuffer, endingCmdPtr, csr->getNumClients(), false, false); + + if (this->startingCmdBuffer != &this->commandStream) { + this->csr->makeResident(*this->commandStream.getGraphicsAllocation()); + } commandStream.getGraphicsAllocation()->updateTaskCount(csr->peekTaskCount() + 1, csr->getOsContext().getContextId()); commandStream.getGraphicsAllocation()->updateResidencyTaskCount(csr->peekTaskCount() + 1, csr->getOsContext().getContextId()); diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.h b/level_zero/core/source/cmdqueue/cmdqueue_hw.h index 868d55f143..fd42ded9b3 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.h +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.h @@ -74,9 +74,13 @@ struct CommandQueueHw : public CommandQueueImp { NEO::StreamProperties cmdListBeginState{}; uint64_t scratchGsba = 0; + uint64_t childGpuAddressPositionBeforeDynamicPreamble = 0; + size_t spaceForResidency = 10; CommandList *firstCommandList = nullptr; CommandList *lastCommandList = nullptr; + void *currentPatchForChainedBbStart = nullptr; + NEO::PreemptionMode preemptionMode{}; NEO::PreemptionMode statePreemption{}; uint32_t perThreadScratchSpaceSize = 0; @@ -125,6 +129,7 @@ struct CommandQueueHw : public CommandQueueImp { MOCKABLE_VIRTUAL bool isDispatchTaskCountPostSyncRequired(ze_fence_handle_t hFence, bool containsAnyRegularCmdList) const; inline size_t estimateLinearStreamSizeInitial(CommandListExecutionContext &ctx); inline size_t estimateCommandListSecondaryStart(CommandList *commandList); + inline size_t estimateCommandListPrimaryStart(bool required); inline size_t estimateCommandListResidencySize(CommandList *commandList); inline void setFrontEndStateProperties(CommandListExecutionContext &ctx); inline void handleScratchSpaceAndUpdateGSBAStateDirtyFlag(CommandListExecutionContext &ctx); @@ -157,8 +162,12 @@ struct CommandQueueHw : public CommandQueueImp { inline void 
programOneCmdListFrontEndIfDirty(CommandListExecutionContext &ctx, NEO::LinearStream &commandStream, CommandListRequiredStateChange &cmdListRequiredState); - inline void programOneCmdListBatchBufferStart(CommandList *commandList, NEO::LinearStream &commandStream); inline void programOneCmdListBatchBufferStart(CommandList *commandList, NEO::LinearStream &commandStream, CommandListExecutionContext &ctx); + inline void programOneCmdListBatchBufferStartPrimaryBatchBuffer(CommandList *commandList, NEO::LinearStream &commandStream, CommandListExecutionContext &ctx); + inline void programOneCmdListBatchBufferStartSecondaryBatchBuffer(CommandList *commandList, NEO::LinearStream &commandStream, CommandListExecutionContext &ctx); + inline void programLastCommandListReturnBbStart( + NEO::LinearStream &commandStream, + CommandListExecutionContext &ctx); inline void mergeOneCmdListPipelinedState(CommandList *commandList); inline void programFrontEndAndClearDirtyFlag(bool shouldFrontEndBeProgrammed, CommandListExecutionContext &ctx, diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl index 5acb3e5633..ff37c94f1a 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl @@ -13,6 +13,7 @@ #include "shared/source/command_stream/command_stream_receiver_hw.h" #include "shared/source/command_stream/linear_stream.h" #include "shared/source/command_stream/preemption.h" +#include "shared/source/command_stream/preemption_mode.h" #include "shared/source/command_stream/scratch_space_controller.h" #include "shared/source/command_stream/wait_status.h" #include "shared/source/debugger/debugger_l0.h" @@ -78,13 +79,14 @@ ze_result_t CommandQueueHw::executeCommandLists( auto neoDevice = device->getNEODevice(); auto ctx = CommandListExecutionContext{phCommandLists, numCommandLists, - csr->getPreemptionMode(), + this->isCopyOnlyCommandQueue ? 
NEO::PreemptionMode::Disabled : csr->getPreemptionMode(), device, NEO::Debugger::isDebugEnabled(internalUsage), csr->isProgramActivePartitionConfigRequired(), performMigration}; ctx.globalInit |= ctx.isDebugEnabled && !this->commandQueueDebugCmdsProgrammed && (neoDevice->getSourceLevelDebugger() || device->getL0Debugger()); + this->startingCmdBuffer = &this->commandStream; this->device->activateMetricGroups(); if (this->isCopyOnlyCommandQueue) { @@ -185,6 +187,8 @@ ze_result_t CommandQueueHw::executeCommandListsRegular( for (auto i = 0u; i < numCommandLists; ++i) { auto commandList = CommandList::fromHandle(commandListHandles[i]); + ctx.childGpuAddressPositionBeforeDynamicPreamble = child.getCurrentGpuAddressPosition(); + if (this->stateChanges.size() > this->currentStateChangeIndex) { auto &stateChange = this->stateChanges[this->currentStateChangeIndex]; if (stateChange.cmdListIndex == i) { @@ -214,6 +218,7 @@ ze_result_t CommandQueueHw::executeCommandListsRegular( this->updateBaseAddressState(ctx.lastCommandList); this->migrateSharedAllocationsIfRequested(ctx.isMigrationRequested, ctx.firstCommandList); + this->programLastCommandListReturnBbStart(child, ctx); this->programStateSipEndWA(ctx.stateSipRequired, child); this->assignCsrTaskCountToFenceIfAvailable(hFence); this->dispatchTaskCountPostSyncRegular(ctx.isDispatchTaskCountPostSyncRequired, child); @@ -250,6 +255,8 @@ ze_result_t CommandQueueHw::executeCommandListsCopyOnly( ctx.spaceForResidency += estimateCommandListResidencySize(commandList); } + linearStreamSizeEstimate += this->estimateCommandListPrimaryStart(ctx.globalInit); + this->csr->getResidencyAllocations().reserve(ctx.spaceForResidency); NEO::EncodeDummyBlitWaArgs waArgs{false, &(this->device->getNEODevice()->getRootDeviceEnvironmentRef())}; @@ -270,7 +277,9 @@ ze_result_t CommandQueueHw::executeCommandListsCopyOnly( for (auto i = 0u; i < numCommandLists; ++i) { auto commandList = CommandList::fromHandle(phCommandLists[i]); - 
this->programOneCmdListBatchBufferStart(commandList, child); + ctx.childGpuAddressPositionBeforeDynamicPreamble = child.getCurrentGpuAddressPosition(); + + this->programOneCmdListBatchBufferStart(commandList, child, ctx); this->mergeOneCmdListPipelinedState(commandList); this->prefetchMemoryToDeviceAssociatedWithCmdList(commandList); } @@ -278,6 +287,7 @@ ze_result_t CommandQueueHw::executeCommandListsCopyOnly( this->assignCsrTaskCountToFenceIfAvailable(hFence); + this->programLastCommandListReturnBbStart(child, ctx); this->dispatchTaskCountPostSyncByMiFlushDw(ctx.isDispatchTaskCountPostSyncRequired, child); this->makeCsrTagAllocationResident(); @@ -559,7 +569,7 @@ void CommandQueueHw::setupCmdListsAndContextParams( uint32_t numCommandLists, ze_fence_handle_t hFence) { - ctx.containsAnyRegularCmdList |= ctx.firstCommandList->getCmdListType() == CommandList::CommandListType::TYPE_REGULAR; + ctx.containsAnyRegularCmdList = ctx.firstCommandList->getCmdListType() == CommandList::CommandListType::TYPE_REGULAR; for (auto i = 0u; i < numCommandLists; i++) { auto commandList = CommandList::fromHandle(phCommandLists[i]); @@ -625,7 +635,7 @@ size_t CommandQueueHw::estimateLinearStreamSizeInitial( auto hwContextSizeEstimate = this->csr->getCmdsSizeForHardwareContext(); if (hwContextSizeEstimate > 0) { linearStreamSizeEstimate += hwContextSizeEstimate; - ctx.globalInit |= true; + ctx.globalInit = true; } if (ctx.isDirectSubmissionEnabled) { @@ -644,7 +654,7 @@ size_t CommandQueueHw::estimateLinearStreamSizeInitial( if (NEO::DebugManager.flags.EnableSWTags.get()) { linearStreamSizeEstimate += NEO::SWTagsManager::estimateSpaceForSWTags(); - ctx.globalInit |= true; + ctx.globalInit = true; } linearStreamSizeEstimate += NEO::EncodeKernelArgsBuffer::getKernelArgsBufferCmdsSize(this->csr->getKernelArgsBufferAllocation(), @@ -669,8 +679,18 @@ size_t CommandQueueHw::estimateLinearStreamSizeInitial( template size_t CommandQueueHw::estimateCommandListSecondaryStart(CommandList 
*commandList) { - using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START; - return (commandList->getCmdContainer().getCmdBufferAllocations().size() * NEO::EncodeBatchBufferStartOrEnd::getBatchBufferStartSize()); + if (!this->dispatchCmdListBatchBufferAsPrimary) { + return (commandList->getCmdContainer().getCmdBufferAllocations().size() * NEO::EncodeBatchBufferStartOrEnd::getBatchBufferStartSize()); + } + return 0; +} + +template +size_t CommandQueueHw::estimateCommandListPrimaryStart(bool required) { + if (this->dispatchCmdListBatchBufferAsPrimary && required) { + return NEO::EncodeBatchBufferStartOrEnd::getBatchBufferStartSize(); + } + return 0; } template @@ -758,6 +778,7 @@ size_t CommandQueueHw::estimateLinearStreamSizeComplementary( if (propertyScmDirty || propertyFeDirty || propertyPsDirty || propertySbaDirty || frontEndReturnPoint || propertyPreemptionDirty) { CommandListDirtyFlags dirtyFlags = {propertyScmDirty, propertyFeDirty, propertyPsDirty, propertySbaDirty, frontEndReturnPoint, propertyPreemptionDirty}; this->stateChanges.emplace_back(stagingState, cmdList, dirtyFlags, ctx.statePreemption, i); + linearStreamSizeEstimate += this->estimateCommandListPrimaryStart(true); } } @@ -770,7 +791,7 @@ size_t CommandQueueHw::estimateLinearStreamSizeComplementary( auto csrHw = static_cast *>(this->csr); linearStreamSizeEstimate += csrHw->getCmdSizeForPerDssBackedBuffer(this->device->getHwInfo()); - ctx.globalInit |= true; + ctx.globalInit = true; } NEO::Device *neoDevice = this->device->getNEODevice(); @@ -781,6 +802,10 @@ size_t CommandQueueHw::estimateLinearStreamSizeComplementary( linearStreamSizeEstimate += NEO::PreemptionHelper::getRequiredStateSipCmdSize(*neoDevice, this->csr->isRcs()); } + bool firstCmdlistDynamicPreamble = (this->stateChanges.size() > 0 && this->stateChanges[0].cmdListIndex == 0); + bool estimateBbStartForGlobalInitOnly = !firstCmdlistDynamicPreamble && ctx.globalInit; + linearStreamSizeEstimate += 
this->estimateCommandListPrimaryStart(estimateBbStartForGlobalInitOnly); + return linearStreamSizeEstimate; } @@ -983,18 +1008,63 @@ void CommandQueueHw::writeCsrStreamInlineIfLogicalStateHelperAvai } template -void CommandQueueHw::programOneCmdListBatchBufferStart(CommandList *commandList, NEO::LinearStream &cmdStream) { - CommandListExecutionContext ctx = {}; - programOneCmdListBatchBufferStart(commandList, cmdStream, ctx); +void CommandQueueHw::programOneCmdListBatchBufferStart(CommandList *commandList, NEO::LinearStream &commandStream, CommandListExecutionContext &ctx) { + if (this->dispatchCmdListBatchBufferAsPrimary) { + programOneCmdListBatchBufferStartPrimaryBatchBuffer(commandList, commandStream, ctx); + } else { + programOneCmdListBatchBufferStartSecondaryBatchBuffer(commandList, commandStream, ctx); + } } template -void CommandQueueHw::programOneCmdListBatchBufferStart(CommandList *commandList, NEO::LinearStream &cmdStream, CommandListExecutionContext &ctx) { +void CommandQueueHw::programOneCmdListBatchBufferStartPrimaryBatchBuffer(CommandList *commandList, NEO::LinearStream &commandStream, CommandListExecutionContext &ctx) { + using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START; + + NEO::CommandContainer &cmdListContainer = commandList->getCmdContainer(); + NEO::GraphicsAllocation *cmdListFirstCmdBuffer = cmdListContainer.getCmdBufferAllocations()[0]; + auto bbStartPatchLocation = reinterpret_cast(ctx.currentPatchForChainedBbStart); + + bool dynamicPreamble = ctx.childGpuAddressPositionBeforeDynamicPreamble != commandStream.getCurrentGpuAddressPosition(); + if (ctx.globalInit || dynamicPreamble) { + if (ctx.currentPatchForChainedBbStart) { + // dynamic preamble, 2nd or later command list + // jump from previous command list to the position before dynamic preamble + NEO::EncodeBatchBufferStartOrEnd::programBatchBufferStart( + bbStartPatchLocation, + ctx.childGpuAddressPositionBeforeDynamicPreamble, + false, false, false); + } + // 
dynamic preamble, jump from current position, after dynamic preamble to the current command list + NEO::EncodeBatchBufferStartOrEnd::programBatchBufferStart(&commandStream, cmdListFirstCmdBuffer->getGpuAddress(), false, false, false); + + ctx.globalInit = false; + } else { + if (ctx.currentPatchForChainedBbStart == nullptr) { + // nothing to dispatch from queue, first command list will be used as submitting batch buffer to KMD or ULLS + size_t firstCmdBufferAlignedSize = cmdListContainer.getAlignedPrimarySize(); + this->firstCmdListStream.replaceGraphicsAllocation(cmdListFirstCmdBuffer); + this->firstCmdListStream.replaceBuffer(cmdListFirstCmdBuffer->getUnderlyingBuffer(), firstCmdBufferAlignedSize); + this->firstCmdListStream.getSpace(firstCmdBufferAlignedSize); + this->startingCmdBuffer = &this->firstCmdListStream; + } else { + // chain between command lists when no dynamic preamble required between 2nd and next command list + NEO::EncodeBatchBufferStartOrEnd::programBatchBufferStart( + bbStartPatchLocation, + cmdListFirstCmdBuffer->getGpuAddress(), + false, false, false); + } + } + + ctx.currentPatchForChainedBbStart = cmdListContainer.getEndCmdPtr(); +} + +template +void CommandQueueHw::programOneCmdListBatchBufferStartSecondaryBatchBuffer(CommandList *commandList, NEO::LinearStream &commandStream, CommandListExecutionContext &ctx) { auto &commandContainer = commandList->getCmdContainer(); auto &cmdBufferAllocations = commandContainer.getCmdBufferAllocations(); auto cmdBufferCount = cmdBufferAllocations.size(); - bool isCommandListImmediate = (commandList->getCmdListType() == CommandList::CommandListType::TYPE_IMMEDIATE) ? 
true : false; + bool isCommandListImmediate = !ctx.containsAnyRegularCmdList; auto &returnPoints = commandList->getReturnPoints(); uint32_t returnPointsSize = commandList->getReturnPointsSize(); @@ -1006,7 +1076,7 @@ void CommandQueueHw::programOneCmdListBatchBufferStart(CommandLis if (isCommandListImmediate && (iter == (cmdBufferCount - 1))) { startOffset = ptrOffset(allocation->getGpuAddress(), commandContainer.currentLinearStreamStartOffsetRef()); } - NEO::EncodeBatchBufferStartOrEnd::programBatchBufferStart(&cmdStream, startOffset, true, false, false); + NEO::EncodeBatchBufferStartOrEnd::programBatchBufferStart(&commandStream, startOffset, true, false, false); if (returnPointsSize > 0) { bool cmdBufferHasRestarts = std::find_if( std::next(returnPoints.begin(), returnPointIdx), @@ -1020,9 +1090,9 @@ void CommandQueueHw::programOneCmdListBatchBufferStart(CommandLis ctx.cmdListBeginState.frontEndState.copyPropertiesComputeDispatchAllWalkerEnableDisableEuFusion(returnPoints[returnPointIdx].configSnapshot.frontEndState); programFrontEnd(scratchSpaceController->getScratchPatchAddress(), scratchSpaceController->getPerThreadScratchSpaceSize(), - cmdStream, + commandStream, ctx.cmdListBeginState); - NEO::EncodeBatchBufferStartOrEnd::programBatchBufferStart(&cmdStream, + NEO::EncodeBatchBufferStartOrEnd::programBatchBufferStart(&commandStream, returnPoints[returnPointIdx].gpuAddress, true, false, false); returnPointIdx++; @@ -1032,6 +1102,20 @@ void CommandQueueHw::programOneCmdListBatchBufferStart(CommandLis } } +template +void CommandQueueHw::programLastCommandListReturnBbStart( + NEO::LinearStream &commandStream, + CommandListExecutionContext &ctx) { + using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START; + if (this->dispatchCmdListBatchBufferAsPrimary) { + auto finalReturnPosition = commandStream.getCurrentGpuAddressPosition(); + auto bbStartCmd = reinterpret_cast(ctx.currentPatchForChainedBbStart); + 
NEO::EncodeBatchBufferStartOrEnd::programBatchBufferStart(bbStartCmd, + finalReturnPosition, + false, false, false); + } +} + template void CommandQueueHw::mergeOneCmdListPipelinedState(CommandList *commandList) { @@ -1166,8 +1250,11 @@ NEO::SubmissionStatus CommandQueueHw::prepareAndSubmitBatchBuffer void *paddingPtr = innerCommandStream.getSpace(this->alignedChildStreamPadding); memset(paddingPtr, 0, this->alignedChildStreamPadding); } + size_t startOffset = (this->startingCmdBuffer == &this->firstCmdListStream) + ? 0 + : ptrDiff(innerCommandStream.getCpuBase(), outerCommandStream.getCpuBase()); - return submitBatchBuffer(ptrDiff(innerCommandStream.getCpuBase(), outerCommandStream.getCpuBase()), + return submitBatchBuffer(startOffset, csr->getResidencyAllocations(), endingCmd, ctx.anyCommandListWithCooperativeKernels); diff --git a/level_zero/core/source/cmdqueue/cmdqueue_imp.h b/level_zero/core/source/cmdqueue/cmdqueue_imp.h index a4483bacf1..cdf8104800 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_imp.h +++ b/level_zero/core/source/cmdqueue/cmdqueue_imp.h @@ -129,6 +129,7 @@ struct CommandQueueImp : public CommandQueue { CommandBufferManager buffers; NEO::LinearStream commandStream{}; + NEO::LinearStream firstCmdListStream{}; NEO::HeapContainer heapContainer; ze_command_queue_desc_t desc; std::vector printfKernelContainer; @@ -138,6 +139,7 @@ struct CommandQueueImp : public CommandQueue { Device *device = nullptr; NEO::CommandStreamReceiver *csr = nullptr; + NEO::LinearStream *startingCmdBuffer = nullptr; uint32_t currentStateChangeIndex = 0; diff --git a/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl b/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl index b8a2f5e32f..5bd329aa33 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl @@ -187,7 +187,6 @@ void CommandQueueHw::patchCommands(CommandList &commandList, uint for 
(auto &commandToPatch : commandsToPatch) { switch (commandToPatch.type) { case CommandList::CommandToPatch::FrontEndState: { - UNRECOVERABLE_IF(scratchAddress == 0u); uint32_t lowScratchAddress = uint32_t(0xFFFFFFFF & scratchAddress); CFE_STATE *cfeStateCmd = nullptr; cfeStateCmd = reinterpret_cast(commandToPatch.pCommand); diff --git a/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.cpp b/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.cpp index 395b8331a3..6023bcea8a 100644 --- a/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.cpp +++ b/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.cpp @@ -334,7 +334,7 @@ void CommandListAppendLaunchRayTracingKernelFixture::tearDown() { } void PrimaryBatchBufferCmdListFixture::setUp() { - NEO::DebugManager.flags.DispatchCmdlistCmdBufferPrimary.set(1); + DebugManager.flags.DispatchCmdlistCmdBufferPrimary.set(1); ModuleMutableCommandListFixture::setUp(); } diff --git a/level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h b/level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h index 87fe4cf620..4f2a380628 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h +++ b/level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h @@ -30,6 +30,7 @@ struct WhiteBox<::L0::CommandQueue> : public ::L0::CommandQueueImp { using BaseClass::device; using BaseClass::preemptionCmdSyncProgramming; using BaseClass::printfKernelContainer; + using BaseClass::startingCmdBuffer; using BaseClass::submitBatchBuffer; using BaseClass::synchronizeByPollingForTaskCount; using BaseClass::taskCount; @@ -71,6 +72,7 @@ struct MockCommandQueueHw : public L0::CommandQueueHw { using BaseClass::commandStream; using BaseClass::prepareAndSubmitBatchBuffer; using BaseClass::printfKernelContainer; + using BaseClass::startingCmdBuffer; using L0::CommandQueue::activeSubDevices; using L0::CommandQueue::cmdListHeapAddressModel; using L0::CommandQueue::dispatchCmdListBatchBufferAsPrimary; @@ -106,6 +108,9 @@ struct MockCommandQueueHw 
: public L0::CommandQueueHw { if (submitBatchBufferReturnValue.has_value()) { return *submitBatchBufferReturnValue; } + if (this->startingCmdBuffer == nullptr) { + this->startingCmdBuffer = &this->commandStream; + } return BaseClass::submitBatchBuffer(offset, residencyContainer, endingCmdPtr, isCooperative); } diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp index a20795a3da..f37e06058f 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp @@ -14,6 +14,7 @@ #include "shared/test/common/test_macros/hw_test.h" #include "level_zero/core/source/builtin/builtin_functions_lib.h" +#include "level_zero/core/source/device/device.h" #include "level_zero/core/source/gfx_core_helpers/l0_gfx_core_helper.h" #include "level_zero/core/source/image/image_hw.h" #include "level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.h" @@ -1530,6 +1531,113 @@ HWTEST_F(PrimaryBatchBufferCmdListTest, givenPrimaryBatchBufferWhenCommandListHa EXPECT_EQ(expectedEndPtr, cmdContainer.getEndCmdPtr()); } +HWTEST_F(PrimaryBatchBufferCmdListTest, givenPrimaryBatchBufferWhenCopyCommandListAndQueueAreCreatedThenFirstDispatchCreatesGlobalInitPreambleAndLaterDispatchProvideCmdListBuffer) { + using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; + + ze_result_t returnValue; + uint32_t count = 0u; + returnValue = device->getCommandQueueGroupProperties(&count, nullptr); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + EXPECT_GT(count, 0u); + + std::vector properties(count); + returnValue = device->getCommandQueueGroupProperties(&count, properties.data()); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + + uint32_t ordinal = 0u; + for (ordinal = 0u; ordinal < count; ordinal++) { + if ((properties[ordinal].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) && + !(properties[ordinal].flags & 
ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE)) { + if (properties[ordinal].numQueues == 0) { + continue; + } + break; + } + } + + if (ordinal == count) { + GTEST_SKIP(); + } + + void *dstPtr = nullptr; + void *srcPtr = nullptr; + const size_t size = 64; + ze_device_mem_alloc_desc_t deviceDesc = {}; + returnValue = context->allocDeviceMem(device->toHandle(), &deviceDesc, size, 4u, &dstPtr); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + + returnValue = context->allocDeviceMem(device->toHandle(), &deviceDesc, size, 4u, &srcPtr); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + + ze_command_queue_desc_t desc{}; + desc.ordinal = ordinal; + desc.index = 0u; + + ze_command_queue_handle_t commandQueueHandle; + returnValue = device->createCommandQueue(&desc, &commandQueueHandle); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + auto commandQueueCopy = static_cast(L0::CommandQueue::fromHandle(commandQueueHandle)); + ASSERT_NE(commandQueueCopy, nullptr); + + auto ultCsr = static_cast *>(commandQueueCopy->getCsr()); + ultCsr->recordFlusheBatchBuffer = true; + + std::unique_ptr commandListCopy; + commandListCopy.reset(whiteboxCast(CommandList::create(productFamily, device, NEO::EngineGroupType::Copy, 0u, returnValue))); + ASSERT_EQ(ZE_RESULT_SUCCESS, returnValue); + + auto &cmdContainerCopy = commandListCopy->getCmdContainer(); + auto &cmdListStream = *cmdContainerCopy.getCommandStream(); + auto firstCmdBufferAllocation = cmdContainerCopy.getCmdBufferAllocations()[0]; + + returnValue = commandListCopy->appendMemoryCopy(dstPtr, srcPtr, size, nullptr, 0, nullptr, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + + size_t firstCmdBufferUsed = cmdListStream.getUsed(); + auto bbStartSpace = ptrOffset(cmdListStream.getCpuBase(), firstCmdBufferUsed); + + returnValue = commandListCopy->close(); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + + EXPECT_EQ(bbStartSpace, cmdContainerCopy.getEndCmdPtr()); + size_t expectedAlignedUse = alignUp(firstCmdBufferUsed + 
sizeof(MI_BATCH_BUFFER_START), NEO::CommandContainer::minCmdBufferPtrAlign); + EXPECT_EQ(expectedAlignedUse, cmdContainerCopy.getAlignedPrimarySize()); + + size_t blitterContextInitSize = ultCsr->getCmdsSizeForHardwareContext(); + + auto cmdListHandle = commandListCopy->toHandle(); + returnValue = commandQueueCopy->executeCommandLists(1, &cmdListHandle, nullptr, true); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + + auto bbStartCmd = genCmdCast(bbStartSpace); + ASSERT_NE(nullptr, bbStartCmd); + + auto &cmdQueueStream = commandQueueCopy->commandStream; + if (blitterContextInitSize > 0) { + EXPECT_EQ(cmdQueueStream.getGraphicsAllocation(), ultCsr->latestFlushedBatchBuffer.commandBufferAllocation); + } else { + EXPECT_EQ(firstCmdBufferAllocation, ultCsr->latestFlushedBatchBuffer.commandBufferAllocation); + EXPECT_EQ(cmdQueueStream.getGpuBase(), bbStartCmd->getBatchBufferStartAddress()); + } + size_t queueSizeUsed = cmdQueueStream.getUsed(); + + returnValue = commandQueueCopy->executeCommandLists(1, &cmdListHandle, nullptr, true); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + + bbStartCmd = genCmdCast(bbStartSpace); + ASSERT_NE(nullptr, bbStartCmd); + + EXPECT_EQ(cmdQueueStream.getGpuBase() + queueSizeUsed, bbStartCmd->getBatchBufferStartAddress()); + + commandQueueCopy->destroy(); + commandListCopy.reset(nullptr); + + returnValue = context->freeMem(dstPtr); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + returnValue = context->freeMem(srcPtr); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); +} + using PrimaryBatchBufferPreamblelessCmdListTest = Test; HWTEST2_F(PrimaryBatchBufferPreamblelessCmdListTest, @@ -1592,5 +1700,213 @@ HWTEST2_F(PrimaryBatchBufferPreamblelessCmdListTest, EXPECT_EQ((uncachedMocs << 1), sbaCmd->getStatelessDataPortAccessMemoryObjectControlState()); } +HWTEST2_F(PrimaryBatchBufferPreamblelessCmdListTest, + givenPrimaryBatchBufferWhenExecutingCommandWithoutPreambleThenUseCommandListBufferAsStartingBuffer, + IsAtLeastXeHpCore) { + using 
MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; + + auto ultCsr = static_cast *>(commandQueue->getCsr()); + ultCsr->recordFlusheBatchBuffer = true; + + ze_group_count_t groupCount{1, 1, 1}; + CmdListKernelLaunchParams launchParams = {}; + ze_result_t result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + result = commandList->close(); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + auto commandListHandle = commandList->toHandle(); + result = commandQueue->executeCommandLists(1, &commandListHandle, nullptr, true); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + auto &cmdQueueStream = commandQueue->commandStream; + EXPECT_EQ(cmdQueueStream.getGraphicsAllocation(), ultCsr->latestFlushedBatchBuffer.commandBufferAllocation); + + size_t queueUsedSize = cmdQueueStream.getUsed(); + auto gpuReturnAddress = cmdQueueStream.getGpuBase() + queueUsedSize; + + result = commandQueue->executeCommandLists(1, &commandListHandle, nullptr, true); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + auto &cmdContainer = commandList->getCmdContainer(); + auto firstCmdBufferAllocation = cmdContainer.getCmdBufferAllocations()[0]; + EXPECT_EQ(firstCmdBufferAllocation, ultCsr->latestFlushedBatchBuffer.commandBufferAllocation); + + auto bbStartCmd = genCmdCast(cmdContainer.getEndCmdPtr()); + ASSERT_NE(nullptr, bbStartCmd); + EXPECT_EQ(gpuReturnAddress, bbStartCmd->getBatchBufferStartAddress()); +} + +HWTEST2_F(PrimaryBatchBufferPreamblelessCmdListTest, + givenPrimaryBatchBufferWhenExecutingMultipleCommandListsAndEachWithoutPreambleThenUseCommandListBufferAsStartingBufferAndChainAllCommandLists, + IsAtLeastXeHpCore) { + + using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; + + auto ultCsr = static_cast *>(commandQueue->getCsr()); + ultCsr->recordFlusheBatchBuffer = true; + + ze_group_count_t groupCount{1, 1, 1}; + CmdListKernelLaunchParams launchParams = {}; + 
ze_result_t result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + result = commandList->close(); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + ze_command_list_handle_t commandLists[] = {commandList->toHandle(), + commandList2->toHandle(), + commandList3->toHandle()}; + + result = commandQueue->executeCommandLists(1, commandLists, nullptr, true); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + auto &cmdQueueStream = commandQueue->commandStream; + EXPECT_EQ(cmdQueueStream.getGraphicsAllocation(), ultCsr->latestFlushedBatchBuffer.commandBufferAllocation); + + size_t queueUsedSize = cmdQueueStream.getUsed(); + auto gpuReturnAddress = cmdQueueStream.getGpuBase() + queueUsedSize; + + result = commandList2->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + result = commandList2->close(); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + result = commandList3->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + result = commandList3->close(); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + result = commandQueue->executeCommandLists(3, commandLists, nullptr, true); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, + ptrOffset(cmdQueueStream.getCpuBase(), queueUsedSize), + cmdQueueStream.getUsed() - queueUsedSize)); + auto cmdQueueBbStartCmds = findAll(cmdList.begin(), cmdList.end()); + EXPECT_EQ(0u, cmdQueueBbStartCmds.size()); + + auto &cmdContainer1stCmdList = commandList->getCmdContainer(); + auto dispatchCmdBufferAllocation = cmdContainer1stCmdList.getCmdBufferAllocations()[0]; + EXPECT_EQ(dispatchCmdBufferAllocation, ultCsr->latestFlushedBatchBuffer.commandBufferAllocation); + + auto bbStartCmd = 
genCmdCast(cmdContainer1stCmdList.getEndCmdPtr()); + ASSERT_NE(nullptr, bbStartCmd); + + auto &cmdContainer2ndCmdList = commandList2->getCmdContainer(); + auto secondCmdBufferAllocation = cmdContainer2ndCmdList.getCmdBufferAllocations()[0]; + EXPECT_EQ(secondCmdBufferAllocation->getGpuAddress(), bbStartCmd->getBatchBufferStartAddress()); + + bbStartCmd = genCmdCast(cmdContainer2ndCmdList.getEndCmdPtr()); + ASSERT_NE(nullptr, bbStartCmd); + + auto &cmdContainer3rdCmdList = commandList3->getCmdContainer(); + auto thirdCmdBufferAllocation = cmdContainer3rdCmdList.getCmdBufferAllocations()[0]; + EXPECT_EQ(thirdCmdBufferAllocation->getGpuAddress(), bbStartCmd->getBatchBufferStartAddress()); + + bbStartCmd = genCmdCast(cmdContainer3rdCmdList.getEndCmdPtr()); + ASSERT_NE(nullptr, bbStartCmd); + EXPECT_EQ(gpuReturnAddress, bbStartCmd->getBatchBufferStartAddress()); +} + +/* Executes three command lists in one submission where the 2nd list is recorded after the kernel's uncached-MOCS counter was bumped. Asserts: the 1st list's buffer is the flushed batch buffer; the 1st list's ending BB_START targets the queue stream; the queue stream holds exactly one BB_START, which targets the 2nd list; the 2nd list chains to the 3rd; the 3rd returns to the queue stream past the preamble commands. NOTE(review): template argument lists look stripped in this text (e.g. "static_cast *>(", "findAll(", "genCmdCast(") - confirm against the upstream source before applying. */ HWTEST2_F(PrimaryBatchBufferPreamblelessCmdListTest, + givenPrimaryBatchBufferWhenExecutingMultipleCommandListsAndSecondWithPreambleThenUseCommandListBufferAsStartingBufferAndChainFirstListToQueuePreambleAndAfterToSecondList, + IsAtLeastXeHpCore) { + using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; + using STATE_BASE_ADDRESS = typename FamilyType::STATE_BASE_ADDRESS; + + auto ultCsr = static_cast *>(commandQueue->getCsr()); + ultCsr->recordFlusheBatchBuffer = true; + + ze_group_count_t groupCount{1, 1, 1}; + CmdListKernelLaunchParams launchParams = {}; + ze_result_t result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + result = commandList->close(); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + ze_command_list_handle_t commandLists[] = {commandList->toHandle(), + commandList2->toHandle(), + commandList3->toHandle()}; + + /* warm-up submission of the 1st list only; the check that follows expects the queue's own stream to be the flushed batch buffer here */ result = commandQueue->executeCommandLists(1, commandLists, nullptr, true); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + auto &cmdQueueStream =
commandQueue->commandStream; + EXPECT_EQ(cmdQueueStream.getGraphicsAllocation(), ultCsr->latestFlushedBatchBuffer.commandBufferAllocation); + + /* remember where the next submission's queue commands will start; used both as parse offset and as expected chain target */ size_t queueUsedSize = cmdQueueStream.getUsed(); + auto gpuReturnAddress = cmdQueueStream.getGpuBase() + queueUsedSize; + + /* presumably makes the 2nd list require a state-base-address reprogramming (dynamic preamble) in the queue - TODO confirm against appendLaunchKernel */ kernel->kernelRequiresUncachedMocsCount++; + + result = commandList2->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + result = commandList2->close(); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + result = commandList3->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + result = commandList3->close(); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + result = commandQueue->executeCommandLists(3, commandLists, nullptr, true); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + // 1st command list is preambleless + auto &cmdContainer1stCmdList = commandList->getCmdContainer(); + auto dispatchCmdBufferAllocation = cmdContainer1stCmdList.getCmdBufferAllocations()[0]; + EXPECT_EQ(dispatchCmdBufferAllocation, ultCsr->latestFlushedBatchBuffer.commandBufferAllocation); + + auto bbStartCmd = genCmdCast(cmdContainer1stCmdList.getEndCmdPtr()); + ASSERT_NE(nullptr, bbStartCmd); + + // ending BB_START of 1st command list points to dynamic preamble - dirty stateless mocs SBA command + EXPECT_EQ(gpuReturnAddress, bbStartCmd->getBatchBufferStartAddress()); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, + ptrOffset(cmdQueueStream.getCpuBase(), queueUsedSize), + cmdQueueStream.getUsed() - queueUsedSize)); + auto cmdQueueSbaDirtyCmds = findAll(cmdList.begin(), cmdList.end()); + ASSERT_TRUE(cmdQueueSbaDirtyCmds.size() >= 1u); + + auto cmdQueueBbStartCmds = findAll(cmdList.begin(), cmdList.end()); + ASSERT_EQ(1u, cmdQueueBbStartCmds.size()); + + auto chainFromPreambleToSecondBbStartCmd =
reinterpret_cast(*cmdQueueBbStartCmds[0]); + + auto &cmdContainer2ndCmdList = commandList2->getCmdContainer(); + auto secondCmdBufferAllocation = cmdContainer2ndCmdList.getCmdBufferAllocations()[0]; + EXPECT_EQ(secondCmdBufferAllocation->getGpuAddress(), chainFromPreambleToSecondBbStartCmd->getBatchBufferStartAddress()); + + bbStartCmd = genCmdCast(cmdContainer2ndCmdList.getEndCmdPtr()); + ASSERT_NE(nullptr, bbStartCmd); + + auto &cmdContainer3rdCmdList = commandList3->getCmdContainer(); + auto thirdCmdBufferAllocation = cmdContainer3rdCmdList.getCmdBufferAllocations()[0]; + EXPECT_EQ(thirdCmdBufferAllocation->getGpuAddress(), bbStartCmd->getBatchBufferStartAddress()); + + bbStartCmd = genCmdCast(cmdContainer3rdCmdList.getEndCmdPtr()); + ASSERT_NE(nullptr, bbStartCmd); + + /* the 3rd list must return to the queue stream just past what the queue emitted for this submission: one STATE_BASE_ADDRESS plus a single barrier (plus a second STATE_BASE_ADDRESS when doubleSbaWa is set), followed by the BB_START that chained to the 2nd list */ size_t sbaSize = sizeof(STATE_BASE_ADDRESS) + NEO::MemorySynchronizationCommands::getSizeForSingleBarrier(false); + if (commandQueue->doubleSbaWa) { + sbaSize += sizeof(STATE_BASE_ADDRESS); + } + + gpuReturnAddress += sizeof(MI_BATCH_BUFFER_START) + sbaSize; + EXPECT_EQ(gpuReturnAddress, bbStartCmd->getBatchBufferStartAddress()); +} + } // namespace ult } // namespace L0 diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_3.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_3.cpp index 00a682b0c9..e506b76c75 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_3.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_3.cpp @@ -1534,15 +1534,19 @@ HWTEST2_F(FrontEndPrimaryBatchBufferCommandListTest, ze_group_count_t groupCount{1, 1, 1}; CmdListKernelLaunchParams launchParams = {}; - commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + ze_result_t result; + result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); auto &commandsToPatch = commandList->commandsToPatch;
EXPECT_EQ(0u, commandsToPatch.size()); + mockKernelImmData->kernelDescriptor->kernelAttributes.perThreadScratchSize[0] = 0x40; mockKernelImmData->kernelDescriptor->kernelAttributes.flags.requiresDisabledEUFusion = 1; size_t usedBefore = cmdStream.getUsed(); - commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); if (fePropertiesSupport.disableEuFusion) { ASSERT_EQ(1u, commandsToPatch.size()); @@ -1555,11 +1559,13 @@ HWTEST2_F(FrontEndPrimaryBatchBufferCommandListTest, auto cfeCmd = genCmdCast(cfePatch.pCommand); ASSERT_NE(nullptr, cfeCmd); EXPECT_TRUE(NEO::UnitTestHelper::getDisableFusionStateFromFrontEndCommand(*cfeCmd)); + EXPECT_EQ(0u, cfeCmd->getScratchSpaceBuffer()); } else { EXPECT_EQ(0u, commandsToPatch.size()); } - commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); if (fePropertiesSupport.disableEuFusion) { EXPECT_EQ(1u, commandsToPatch.size()); @@ -1570,7 +1576,8 @@ HWTEST2_F(FrontEndPrimaryBatchBufferCommandListTest, mockKernelImmData->kernelDescriptor->kernelAttributes.flags.requiresDisabledEUFusion = 0; usedBefore = cmdStream.getUsed(); - commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); if (fePropertiesSupport.disableEuFusion) { ASSERT_EQ(2u, commandsToPatch.size()); @@ -1583,12 +1590,14 @@ HWTEST2_F(FrontEndPrimaryBatchBufferCommandListTest, auto cfeCmd = genCmdCast(cfePatch.pCommand); ASSERT_NE(nullptr, cfeCmd); 
EXPECT_FALSE(NEO::UnitTestHelper::getDisableFusionStateFromFrontEndCommand(*cfeCmd)); + EXPECT_EQ(0u, cfeCmd->getScratchSpaceBuffer()); } mockKernelImmData->kernelDescriptor->kernelAttributes.flags.requiresDisabledEUFusion = 1; usedBefore = cmdStream.getUsed(); - commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); if (fePropertiesSupport.disableEuFusion) { ASSERT_EQ(3u, commandsToPatch.size()); @@ -1601,15 +1610,40 @@ HWTEST2_F(FrontEndPrimaryBatchBufferCommandListTest, auto cfeCmd = genCmdCast(cfePatch.pCommand); ASSERT_NE(nullptr, cfeCmd); EXPECT_TRUE(NEO::UnitTestHelper::getDisableFusionStateFromFrontEndCommand(*cfeCmd)); + EXPECT_EQ(0u, cfeCmd->getScratchSpaceBuffer()); } else { EXPECT_EQ(0u, commandsToPatch.size()); } + result = commandList->close(); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + auto commandListHandle = commandList->toHandle(); + result = commandQueue->executeCommandLists(1, &commandListHandle, nullptr, true); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + if (fePropertiesSupport.disableEuFusion) { - commandList->reset(); + ASSERT_EQ(3u, commandsToPatch.size()); + + bool disableFusionStates[] = {true, false, true}; + uint32_t disableFusionStatesIdx = 0; + + for (const auto &cfeToPatch : commandsToPatch) { + EXPECT_EQ(CommandList::CommandToPatch::FrontEndState, cfeToPatch.type); + auto cfeCmd = genCmdCast(cfeToPatch.pDestination); + ASSERT_NE(nullptr, cfeCmd); + + EXPECT_EQ(disableFusionStates[disableFusionStatesIdx++], + NEO::UnitTestHelper::getDisableFusionStateFromFrontEndCommand(*cfeCmd)); + EXPECT_NE(0u, cfeCmd->getScratchSpaceBuffer()); + } + + result = commandList->reset(); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); EXPECT_EQ(0u, commandsToPatch.size()); } } + HWTEST2_F(FrontEndPrimaryBatchBufferCommandListTest, 
givenFrontEndTrackingCmdListIsExecutedWhenPropertyComputeDispatchAllWalkerSupportedThenExpectFrontEndAddedToPatchlist, IsAtLeastXeHpCore) { @@ -1619,6 +1653,8 @@ HWTEST2_F(FrontEndPrimaryBatchBufferCommandListTest, auto &productHelper = device->getProductHelper(); productHelper.fillFrontEndPropertiesSupportStructure(fePropertiesSupport, device->getHwInfo()); + mockKernelImmData->kernelDescriptor->kernelAttributes.perThreadScratchSize[0] = 0x40; + NEO::DebugManager.flags.AllowMixingRegularAndCooperativeKernels.set(1); EXPECT_TRUE(commandList->frontEndStateTracking); @@ -1644,6 +1680,7 @@ HWTEST2_F(FrontEndPrimaryBatchBufferCommandListTest, auto cfeCmd = genCmdCast(cfePatch.pCommand); ASSERT_NE(nullptr, cfeCmd); EXPECT_TRUE(NEO::UnitTestHelper::getComputeDispatchAllWalkerFromFrontEndCommand(*cfeCmd)); + EXPECT_EQ(0u, cfeCmd->getScratchSpaceBuffer()); } else { EXPECT_EQ(0u, commandsToPatch.size()); } @@ -1668,6 +1705,7 @@ HWTEST2_F(FrontEndPrimaryBatchBufferCommandListTest, auto cfeCmd = genCmdCast(cfePatch.pCommand); ASSERT_NE(nullptr, cfeCmd); EXPECT_FALSE(NEO::UnitTestHelper::getComputeDispatchAllWalkerFromFrontEndCommand(*cfeCmd)); + EXPECT_EQ(0u, cfeCmd->getScratchSpaceBuffer()); } else { EXPECT_EQ(0u, commandsToPatch.size()); } @@ -1683,12 +1721,36 @@ HWTEST2_F(FrontEndPrimaryBatchBufferCommandListTest, auto cfeCmd = genCmdCast(cfePatch.pCommand); ASSERT_NE(nullptr, cfeCmd); EXPECT_TRUE(NEO::UnitTestHelper::getComputeDispatchAllWalkerFromFrontEndCommand(*cfeCmd)); + EXPECT_EQ(0u, cfeCmd->getScratchSpaceBuffer()); } else { EXPECT_EQ(0u, commandsToPatch.size()); } + result = commandList->close(); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + auto commandListHandle = commandList->toHandle(); + result = commandQueue->executeCommandLists(1, &commandListHandle, nullptr, true); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + if (fePropertiesSupport.computeDispatchAllWalker) { - commandList->reset(); + ASSERT_EQ(3u, commandsToPatch.size()); + + bool 
computeDispatchAllWalkerStates[] = {true, false, true}; + uint32_t computeDispatchAllWalkerStatesIdx = 0; + + for (const auto &cfeToPatch : commandsToPatch) { + EXPECT_EQ(CommandList::CommandToPatch::FrontEndState, cfeToPatch.type); + auto cfeCmd = genCmdCast(cfeToPatch.pDestination); + ASSERT_NE(nullptr, cfeCmd); + + EXPECT_EQ(computeDispatchAllWalkerStates[computeDispatchAllWalkerStatesIdx++], + NEO::UnitTestHelper::getComputeDispatchAllWalkerFromFrontEndCommand(*cfeCmd)); + EXPECT_NE(0u, cfeCmd->getScratchSpaceBuffer()); + } + + result = commandList->reset(); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); EXPECT_EQ(0u, commandsToPatch.size()); } } diff --git a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_1.cpp b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_1.cpp index c3baf72565..80a5e06a17 100644 --- a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_1.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_1.cpp @@ -650,6 +650,7 @@ HWTEST_F(CommandQueueCreate, givenContainerWithAllocationsWhenResidencyContainer false, false, returnValue)); + commandQueue->startingCmdBuffer = &commandQueue->commandStream; ResidencyContainer container; TaskCountType peekTaskCountBefore = commandQueue->csr->peekTaskCount(); TaskCountType flushedTaskCountBefore = commandQueue->csr->peekLatestFlushedTaskCount(); @@ -676,6 +677,7 @@ HWTEST_F(CommandQueueCreate, givenCommandStreamReceiverFailsThenSubmitBatchBuffe false, false, returnValue)); + commandQueue->startingCmdBuffer = &commandQueue->commandStream; ResidencyContainer container; TaskCountType peekTaskCountBefore = commandQueue->csr->peekTaskCount(); TaskCountType flushedTaskCountBefore = commandQueue->csr->peekLatestFlushedTaskCount(); @@ -701,6 +703,7 @@ HWTEST_F(CommandQueueCreate, givenOutOfMemoryThenSubmitBatchBufferReturnsOutOfMe false, false, returnValue)); + commandQueue->startingCmdBuffer = &commandQueue->commandStream; ResidencyContainer container; 
NEO::SubmissionStatus ret = commandQueue->submitBatchBuffer(0, container, nullptr, false); EXPECT_EQ(ret, NEO::SubmissionStatus::OUT_OF_MEMORY); diff --git a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_2.cpp b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_2.cpp index fe09fa7fb3..940f91f373 100644 --- a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_2.cpp @@ -1014,29 +1014,6 @@ HWTEST2_F(CommandQueueScratchTests, whenPatchCommandsIsCalledThenCommandsAreCorr } } -HWTEST2_F(CommandQueueScratchTests, givenInvalidScratchAddressWhenPatchCommandsIsCalledThenAbortIsThrown, IsAtLeastXeHpCore) { - using CFE_STATE = typename FamilyType::CFE_STATE; - - ze_command_queue_desc_t desc = {}; - NEO::CommandStreamReceiver *csr = nullptr; - device->getCsrForOrdinalAndIndex(&csr, 0u, 0u); - auto commandQueue = std::make_unique>(device, csr, &desc); - auto commandList = std::make_unique>>(); - - CFE_STATE destinationCfeState; - auto sourceCfeState = new CFE_STATE; - *sourceCfeState = FamilyType::cmdInitCfeState; - - CommandList::CommandToPatch commandToPatch; - commandToPatch.pDestination = &destinationCfeState; - commandToPatch.pCommand = sourceCfeState; - commandToPatch.type = CommandList::CommandToPatch::CommandType::FrontEndState; - commandList->commandsToPatch.push_back(commandToPatch); - - uint64_t invalidScratchAddress = 0u; - EXPECT_ANY_THROW(commandQueue->patchCommands(*commandList, invalidScratchAddress)); -} - using IsWithinNotSupported = IsWithinGfxCore; HWTEST2_F(CommandQueueScratchTests, givenCommandsToPatchToNotSupportedPlatformWhenPatchCommandsIsCalledThenAbortIsThrown, IsWithinNotSupported) {