performance: dispatch and chain command list batch buffers as primary

Command list batch buffers are chained to each other when no dynamic or global
preamble is present in the command queue.
When a preamble is required, execution returns to the command queue.
The last command list is chained to the command queue epilog.
When the command queue has no preamble, the first command list batch buffer is
provided to KMD/ULLS.

Related-To: NEO-7807

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2023-04-12 20:24:09 +00:00
committed by Compute-Runtime-Automation
parent 2022592f3d
commit f451207372
11 changed files with 516 additions and 52 deletions

View File

@@ -124,9 +124,13 @@ NEO::SubmissionStatus CommandQueueImp::submitBatchBuffer(size_t offset, NEO::Res
bool isCooperative) {
UNRECOVERABLE_IF(csr == nullptr);
NEO::BatchBuffer batchBuffer(commandStream.getGraphicsAllocation(), offset, 0, 0, nullptr, false, false,
NEO::BatchBuffer batchBuffer(this->startingCmdBuffer->getGraphicsAllocation(), offset, 0, 0, nullptr, false, false,
NEO::QueueThrottle::HIGH, NEO::QueueSliceCount::defaultSliceCount,
commandStream.getUsed(), &commandStream, endingCmdPtr, csr->getNumClients(), false, false);
this->startingCmdBuffer->getUsed(), this->startingCmdBuffer, endingCmdPtr, csr->getNumClients(), false, false);
if (this->startingCmdBuffer != &this->commandStream) {
this->csr->makeResident(*this->commandStream.getGraphicsAllocation());
}
commandStream.getGraphicsAllocation()->updateTaskCount(csr->peekTaskCount() + 1, csr->getOsContext().getContextId());
commandStream.getGraphicsAllocation()->updateResidencyTaskCount(csr->peekTaskCount() + 1, csr->getOsContext().getContextId());

View File

@@ -74,9 +74,13 @@ struct CommandQueueHw : public CommandQueueImp {
NEO::StreamProperties cmdListBeginState{};
uint64_t scratchGsba = 0;
uint64_t childGpuAddressPositionBeforeDynamicPreamble = 0;
size_t spaceForResidency = 10;
CommandList *firstCommandList = nullptr;
CommandList *lastCommandList = nullptr;
void *currentPatchForChainedBbStart = nullptr;
NEO::PreemptionMode preemptionMode{};
NEO::PreemptionMode statePreemption{};
uint32_t perThreadScratchSpaceSize = 0;
@@ -125,6 +129,7 @@ struct CommandQueueHw : public CommandQueueImp {
MOCKABLE_VIRTUAL bool isDispatchTaskCountPostSyncRequired(ze_fence_handle_t hFence, bool containsAnyRegularCmdList) const;
inline size_t estimateLinearStreamSizeInitial(CommandListExecutionContext &ctx);
inline size_t estimateCommandListSecondaryStart(CommandList *commandList);
inline size_t estimateCommandListPrimaryStart(bool required);
inline size_t estimateCommandListResidencySize(CommandList *commandList);
inline void setFrontEndStateProperties(CommandListExecutionContext &ctx);
inline void handleScratchSpaceAndUpdateGSBAStateDirtyFlag(CommandListExecutionContext &ctx);
@@ -157,8 +162,12 @@ struct CommandQueueHw : public CommandQueueImp {
inline void programOneCmdListFrontEndIfDirty(CommandListExecutionContext &ctx,
NEO::LinearStream &commandStream,
CommandListRequiredStateChange &cmdListRequiredState);
inline void programOneCmdListBatchBufferStart(CommandList *commandList, NEO::LinearStream &commandStream);
inline void programOneCmdListBatchBufferStart(CommandList *commandList, NEO::LinearStream &commandStream, CommandListExecutionContext &ctx);
inline void programOneCmdListBatchBufferStartPrimaryBatchBuffer(CommandList *commandList, NEO::LinearStream &commandStream, CommandListExecutionContext &ctx);
inline void programOneCmdListBatchBufferStartSecondaryBatchBuffer(CommandList *commandList, NEO::LinearStream &commandStream, CommandListExecutionContext &ctx);
inline void programLastCommandListReturnBbStart(
NEO::LinearStream &commandStream,
CommandListExecutionContext &ctx);
inline void mergeOneCmdListPipelinedState(CommandList *commandList);
inline void programFrontEndAndClearDirtyFlag(bool shouldFrontEndBeProgrammed,
CommandListExecutionContext &ctx,

View File

@@ -13,6 +13,7 @@
#include "shared/source/command_stream/command_stream_receiver_hw.h"
#include "shared/source/command_stream/linear_stream.h"
#include "shared/source/command_stream/preemption.h"
#include "shared/source/command_stream/preemption_mode.h"
#include "shared/source/command_stream/scratch_space_controller.h"
#include "shared/source/command_stream/wait_status.h"
#include "shared/source/debugger/debugger_l0.h"
@@ -78,13 +79,14 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
auto neoDevice = device->getNEODevice();
auto ctx = CommandListExecutionContext{phCommandLists,
numCommandLists,
csr->getPreemptionMode(),
this->isCopyOnlyCommandQueue ? NEO::PreemptionMode::Disabled : csr->getPreemptionMode(),
device,
NEO::Debugger::isDebugEnabled(internalUsage),
csr->isProgramActivePartitionConfigRequired(),
performMigration};
ctx.globalInit |= ctx.isDebugEnabled && !this->commandQueueDebugCmdsProgrammed && (neoDevice->getSourceLevelDebugger() || device->getL0Debugger());
this->startingCmdBuffer = &this->commandStream;
this->device->activateMetricGroups();
if (this->isCopyOnlyCommandQueue) {
@@ -185,6 +187,8 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegular(
for (auto i = 0u; i < numCommandLists; ++i) {
auto commandList = CommandList::fromHandle(commandListHandles[i]);
ctx.childGpuAddressPositionBeforeDynamicPreamble = child.getCurrentGpuAddressPosition();
if (this->stateChanges.size() > this->currentStateChangeIndex) {
auto &stateChange = this->stateChanges[this->currentStateChangeIndex];
if (stateChange.cmdListIndex == i) {
@@ -214,6 +218,7 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegular(
this->updateBaseAddressState(ctx.lastCommandList);
this->migrateSharedAllocationsIfRequested(ctx.isMigrationRequested, ctx.firstCommandList);
this->programLastCommandListReturnBbStart(child, ctx);
this->programStateSipEndWA(ctx.stateSipRequired, child);
this->assignCsrTaskCountToFenceIfAvailable(hFence);
this->dispatchTaskCountPostSyncRegular(ctx.isDispatchTaskCountPostSyncRequired, child);
@@ -250,6 +255,8 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsCopyOnly(
ctx.spaceForResidency += estimateCommandListResidencySize(commandList);
}
linearStreamSizeEstimate += this->estimateCommandListPrimaryStart(ctx.globalInit);
this->csr->getResidencyAllocations().reserve(ctx.spaceForResidency);
NEO::EncodeDummyBlitWaArgs waArgs{false, &(this->device->getNEODevice()->getRootDeviceEnvironmentRef())};
@@ -270,7 +277,9 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsCopyOnly(
for (auto i = 0u; i < numCommandLists; ++i) {
auto commandList = CommandList::fromHandle(phCommandLists[i]);
this->programOneCmdListBatchBufferStart(commandList, child);
ctx.childGpuAddressPositionBeforeDynamicPreamble = child.getCurrentGpuAddressPosition();
this->programOneCmdListBatchBufferStart(commandList, child, ctx);
this->mergeOneCmdListPipelinedState(commandList);
this->prefetchMemoryToDeviceAssociatedWithCmdList(commandList);
}
@@ -278,6 +287,7 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsCopyOnly(
this->assignCsrTaskCountToFenceIfAvailable(hFence);
this->programLastCommandListReturnBbStart(child, ctx);
this->dispatchTaskCountPostSyncByMiFlushDw(ctx.isDispatchTaskCountPostSyncRequired, child);
this->makeCsrTagAllocationResident();
@@ -559,7 +569,7 @@ void CommandQueueHw<gfxCoreFamily>::setupCmdListsAndContextParams(
uint32_t numCommandLists,
ze_fence_handle_t hFence) {
ctx.containsAnyRegularCmdList |= ctx.firstCommandList->getCmdListType() == CommandList::CommandListType::TYPE_REGULAR;
ctx.containsAnyRegularCmdList = ctx.firstCommandList->getCmdListType() == CommandList::CommandListType::TYPE_REGULAR;
for (auto i = 0u; i < numCommandLists; i++) {
auto commandList = CommandList::fromHandle(phCommandLists[i]);
@@ -625,7 +635,7 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateLinearStreamSizeInitial(
auto hwContextSizeEstimate = this->csr->getCmdsSizeForHardwareContext();
if (hwContextSizeEstimate > 0) {
linearStreamSizeEstimate += hwContextSizeEstimate;
ctx.globalInit |= true;
ctx.globalInit = true;
}
if (ctx.isDirectSubmissionEnabled) {
@@ -644,7 +654,7 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateLinearStreamSizeInitial(
if (NEO::DebugManager.flags.EnableSWTags.get()) {
linearStreamSizeEstimate += NEO::SWTagsManager::estimateSpaceForSWTags<GfxFamily>();
ctx.globalInit |= true;
ctx.globalInit = true;
}
linearStreamSizeEstimate += NEO::EncodeKernelArgsBuffer<GfxFamily>::getKernelArgsBufferCmdsSize(this->csr->getKernelArgsBufferAllocation(),
@@ -669,8 +679,18 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateLinearStreamSizeInitial(
// Estimates the space taken in the command queue stream by batch buffer start commands
// that launch the command buffers of one command list.
//
// commandList - list whose command buffer allocations are counted
//
// Returns 0 when dispatchCmdListBatchBufferAsPrimary is set; otherwise one batch buffer
// start size per command buffer allocation owned by the list.
template <GFXCORE_FAMILY gfxCoreFamily>
size_t CommandQueueHw<gfxCoreFamily>::estimateCommandListSecondaryStart(CommandList *commandList) {
    // the check has to come first: an unconditional return ahead of it would make the
    // primary-dispatch case unreachable and always charge the per-buffer size
    if (this->dispatchCmdListBatchBufferAsPrimary) {
        return 0;
    }
    return (commandList->getCmdContainer().getCmdBufferAllocations().size() * NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::getBatchBufferStartSize());
}
// Size estimate for a single batch buffer start command in the command queue stream.
//
// required - caller's indication that such a command will be needed
//
// Returns the size of one batch buffer start when `required` is true and
// dispatchCmdListBatchBufferAsPrimary is set; 0 in every other case.
template <GFXCORE_FAMILY gfxCoreFamily>
size_t CommandQueueHw<gfxCoreFamily>::estimateCommandListPrimaryStart(bool required) {
    const bool bbStartNeeded = required && this->dispatchCmdListBatchBufferAsPrimary;
    return bbStartNeeded ? NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::getBatchBufferStartSize() : 0;
}
template <GFXCORE_FAMILY gfxCoreFamily>
@@ -758,6 +778,7 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateLinearStreamSizeComplementary(
if (propertyScmDirty || propertyFeDirty || propertyPsDirty || propertySbaDirty || frontEndReturnPoint || propertyPreemptionDirty) {
CommandListDirtyFlags dirtyFlags = {propertyScmDirty, propertyFeDirty, propertyPsDirty, propertySbaDirty, frontEndReturnPoint, propertyPreemptionDirty};
this->stateChanges.emplace_back(stagingState, cmdList, dirtyFlags, ctx.statePreemption, i);
linearStreamSizeEstimate += this->estimateCommandListPrimaryStart(true);
}
}
@@ -770,7 +791,7 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateLinearStreamSizeComplementary(
auto csrHw = static_cast<NEO::CommandStreamReceiverHw<GfxFamily> *>(this->csr);
linearStreamSizeEstimate += csrHw->getCmdSizeForPerDssBackedBuffer(this->device->getHwInfo());
ctx.globalInit |= true;
ctx.globalInit = true;
}
NEO::Device *neoDevice = this->device->getNEODevice();
@@ -781,6 +802,10 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateLinearStreamSizeComplementary(
linearStreamSizeEstimate += NEO::PreemptionHelper::getRequiredStateSipCmdSize<GfxFamily>(*neoDevice, this->csr->isRcs());
}
bool firstCmdlistDynamicPreamble = (this->stateChanges.size() > 0 && this->stateChanges[0].cmdListIndex == 0);
bool estimateBbStartForGlobalInitOnly = !firstCmdlistDynamicPreamble && ctx.globalInit;
linearStreamSizeEstimate += this->estimateCommandListPrimaryStart(estimateBbStartForGlobalInitOnly);
return linearStreamSizeEstimate;
}
@@ -983,18 +1008,63 @@ void CommandQueueHw<gfxCoreFamily>::writeCsrStreamInlineIfLogicalStateHelperAvai
}
// Convenience overload: forwards to the three-argument version with a
// default-initialized execution context.
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandQueueHw<gfxCoreFamily>::programOneCmdListBatchBufferStart(CommandList *commandList, NEO::LinearStream &cmdStream) {
    CommandListExecutionContext ctx = {};
    programOneCmdListBatchBufferStart(commandList, cmdStream, ctx);
}

// Programs the start of one command list, selecting the strategy from
// dispatchCmdListBatchBufferAsPrimary.
//
// commandList   - list to be started
// commandStream - command queue stream handed to the selected strategy
// ctx           - per-execution state handed to the selected strategy
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandQueueHw<gfxCoreFamily>::programOneCmdListBatchBufferStart(CommandList *commandList, NEO::LinearStream &commandStream, CommandListExecutionContext &ctx) {
    if (this->dispatchCmdListBatchBufferAsPrimary) {
        programOneCmdListBatchBufferStartPrimaryBatchBuffer(commandList, commandStream, ctx);
    } else {
        programOneCmdListBatchBufferStartSecondaryBatchBuffer(commandList, commandStream, ctx);
    }
}
// Links one command list into the submission when command lists are dispatched as
// primary batch buffers.
//
// ctx.currentPatchForChainedBbStart carries the end-command location of the previously
// processed command list (nullptr when no list has been processed yet). That location is
// patched here with a batch buffer start, and the field is then updated to this list's
// end-command location so the next call (or the final return programming) can patch it.
//
// commandList   - list to link; its first command buffer allocation is the jump target
// commandStream - command queue stream; its current GPU position is compared against
//                 ctx.childGpuAddressPositionBeforeDynamicPreamble
// ctx           - per-execution state; globalInit is cleared once a batch buffer start
//                 has been programmed in the queue stream
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandQueueHw<gfxCoreFamily>::programOneCmdListBatchBufferStartPrimaryBatchBuffer(CommandList *commandList, NEO::LinearStream &commandStream, CommandListExecutionContext &ctx) {
    using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;

    NEO::CommandContainer &cmdListContainer = commandList->getCmdContainer();
    NEO::GraphicsAllocation *cmdListFirstCmdBuffer = cmdListContainer.getCmdBufferAllocations()[0];
    auto bbStartPatchLocation = reinterpret_cast<MI_BATCH_BUFFER_START *>(ctx.currentPatchForChainedBbStart);

    // the queue stream moved past the position recorded in ctx, so something was
    // programmed into the queue for this command list
    bool dynamicPreamble = ctx.childGpuAddressPositionBeforeDynamicPreamble != commandStream.getCurrentGpuAddressPosition();
    if (ctx.globalInit || dynamicPreamble) {
        if (ctx.currentPatchForChainedBbStart) {
            // dynamic preamble, 2nd or later command list
            // jump from previous command list to the position before dynamic preamble
            NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(
                bbStartPatchLocation,
                ctx.childGpuAddressPositionBeforeDynamicPreamble,
                false, false, false);
        }
        // dynamic preamble, jump from current position, after dynamic preamble to the current command list
        NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(&commandStream, cmdListFirstCmdBuffer->getGpuAddress(), false, false, false);
        ctx.globalInit = false;
    } else {
        if (ctx.currentPatchForChainedBbStart == nullptr) {
            // nothing to dispatch from queue, first command list will be used as submitting batch buffer to KMD or ULLS
            size_t firstCmdBufferAlignedSize = cmdListContainer.getAlignedPrimarySize();
            this->firstCmdListStream.replaceGraphicsAllocation(cmdListFirstCmdBuffer);
            this->firstCmdListStream.replaceBuffer(cmdListFirstCmdBuffer->getUnderlyingBuffer(), firstCmdBufferAlignedSize);
            // mark the whole aligned size as used so the stream reports it through getUsed()
            this->firstCmdListStream.getSpace(firstCmdBufferAlignedSize);
            this->startingCmdBuffer = &this->firstCmdListStream;
        } else {
            // chain between command lists when no dynamic preamble required between 2nd and next command list
            NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(
                bbStartPatchLocation,
                cmdListFirstCmdBuffer->getGpuAddress(),
                false, false, false);
        }
    }

    // remember where this list ends; the next list or the final return patches it
    ctx.currentPatchForChainedBbStart = cmdListContainer.getEndCmdPtr();
}
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandQueueHw<gfxCoreFamily>::programOneCmdListBatchBufferStartSecondaryBatchBuffer(CommandList *commandList, NEO::LinearStream &commandStream, CommandListExecutionContext &ctx) {
auto &commandContainer = commandList->getCmdContainer();
auto &cmdBufferAllocations = commandContainer.getCmdBufferAllocations();
auto cmdBufferCount = cmdBufferAllocations.size();
bool isCommandListImmediate = (commandList->getCmdListType() == CommandList::CommandListType::TYPE_IMMEDIATE) ? true : false;
bool isCommandListImmediate = !ctx.containsAnyRegularCmdList;
auto &returnPoints = commandList->getReturnPoints();
uint32_t returnPointsSize = commandList->getReturnPointsSize();
@@ -1006,7 +1076,7 @@ void CommandQueueHw<gfxCoreFamily>::programOneCmdListBatchBufferStart(CommandLis
if (isCommandListImmediate && (iter == (cmdBufferCount - 1))) {
startOffset = ptrOffset(allocation->getGpuAddress(), commandContainer.currentLinearStreamStartOffsetRef());
}
NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(&cmdStream, startOffset, true, false, false);
NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(&commandStream, startOffset, true, false, false);
if (returnPointsSize > 0) {
bool cmdBufferHasRestarts = std::find_if(
std::next(returnPoints.begin(), returnPointIdx),
@@ -1020,9 +1090,9 @@ void CommandQueueHw<gfxCoreFamily>::programOneCmdListBatchBufferStart(CommandLis
ctx.cmdListBeginState.frontEndState.copyPropertiesComputeDispatchAllWalkerEnableDisableEuFusion(returnPoints[returnPointIdx].configSnapshot.frontEndState);
programFrontEnd(scratchSpaceController->getScratchPatchAddress(),
scratchSpaceController->getPerThreadScratchSpaceSize(),
cmdStream,
commandStream,
ctx.cmdListBeginState);
NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(&cmdStream,
NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(&commandStream,
returnPoints[returnPointIdx].gpuAddress,
true, false, false);
returnPointIdx++;
@@ -1032,6 +1102,20 @@ void CommandQueueHw<gfxCoreFamily>::programOneCmdListBatchBufferStart(CommandLis
}
}
// Patches the end-command location stored in ctx.currentPatchForChainedBbStart with a
// batch buffer start that targets the current GPU position of the command queue stream.
// Does nothing unless dispatchCmdListBatchBufferAsPrimary is set.
//
// commandStream - command queue stream whose current GPU position becomes the jump target
// ctx           - per-execution state providing the location to patch
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandQueueHw<gfxCoreFamily>::programLastCommandListReturnBbStart(
    NEO::LinearStream &commandStream,
    CommandListExecutionContext &ctx) {
    if (!this->dispatchCmdListBatchBufferAsPrimary) {
        return;
    }

    using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;
    auto *lastListEndCmd = reinterpret_cast<MI_BATCH_BUFFER_START *>(ctx.currentPatchForChainedBbStart);
    const auto queueResumeAddress = commandStream.getCurrentGpuAddressPosition();

    NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(lastListEndCmd, queueResumeAddress, false, false, false);
}
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandQueueHw<gfxCoreFamily>::mergeOneCmdListPipelinedState(CommandList *commandList) {
@@ -1166,8 +1250,11 @@ NEO::SubmissionStatus CommandQueueHw<gfxCoreFamily>::prepareAndSubmitBatchBuffer
void *paddingPtr = innerCommandStream.getSpace(this->alignedChildStreamPadding);
memset(paddingPtr, 0, this->alignedChildStreamPadding);
}
size_t startOffset = (this->startingCmdBuffer == &this->firstCmdListStream)
? 0
: ptrDiff(innerCommandStream.getCpuBase(), outerCommandStream.getCpuBase());
return submitBatchBuffer(ptrDiff(innerCommandStream.getCpuBase(), outerCommandStream.getCpuBase()),
return submitBatchBuffer(startOffset,
csr->getResidencyAllocations(),
endingCmd,
ctx.anyCommandListWithCooperativeKernels);

View File

@@ -129,6 +129,7 @@ struct CommandQueueImp : public CommandQueue {
CommandBufferManager buffers;
NEO::LinearStream commandStream{};
NEO::LinearStream firstCmdListStream{};
NEO::HeapContainer heapContainer;
ze_command_queue_desc_t desc;
std::vector<Kernel *> printfKernelContainer;
@@ -138,6 +139,7 @@ struct CommandQueueImp : public CommandQueue {
Device *device = nullptr;
NEO::CommandStreamReceiver *csr = nullptr;
NEO::LinearStream *startingCmdBuffer = nullptr;
uint32_t currentStateChangeIndex = 0;

View File

@@ -187,7 +187,6 @@ void CommandQueueHw<gfxCoreFamily>::patchCommands(CommandList &commandList, uint
for (auto &commandToPatch : commandsToPatch) {
switch (commandToPatch.type) {
case CommandList::CommandToPatch::FrontEndState: {
UNRECOVERABLE_IF(scratchAddress == 0u);
uint32_t lowScratchAddress = uint32_t(0xFFFFFFFF & scratchAddress);
CFE_STATE *cfeStateCmd = nullptr;
cfeStateCmd = reinterpret_cast<CFE_STATE *>(commandToPatch.pCommand);

View File

@@ -334,7 +334,7 @@ void CommandListAppendLaunchRayTracingKernelFixture::tearDown() {
}
// Sets the DispatchCmdlistCmdBufferPrimary debug flag to 1 and then runs the base
// fixture setup; the flag is written first so it is already in place when
// ModuleMutableCommandListFixture::setUp() executes.
void PrimaryBatchBufferCmdListFixture::setUp() {
    DebugManager.flags.DispatchCmdlistCmdBufferPrimary.set(1);

    ModuleMutableCommandListFixture::setUp();
}

View File

@@ -30,6 +30,7 @@ struct WhiteBox<::L0::CommandQueue> : public ::L0::CommandQueueImp {
using BaseClass::device;
using BaseClass::preemptionCmdSyncProgramming;
using BaseClass::printfKernelContainer;
using BaseClass::startingCmdBuffer;
using BaseClass::submitBatchBuffer;
using BaseClass::synchronizeByPollingForTaskCount;
using BaseClass::taskCount;
@@ -71,6 +72,7 @@ struct MockCommandQueueHw : public L0::CommandQueueHw<gfxCoreFamily> {
using BaseClass::commandStream;
using BaseClass::prepareAndSubmitBatchBuffer;
using BaseClass::printfKernelContainer;
using BaseClass::startingCmdBuffer;
using L0::CommandQueue::activeSubDevices;
using L0::CommandQueue::cmdListHeapAddressModel;
using L0::CommandQueue::dispatchCmdListBatchBufferAsPrimary;
@@ -106,6 +108,9 @@ struct MockCommandQueueHw : public L0::CommandQueueHw<gfxCoreFamily> {
if (submitBatchBufferReturnValue.has_value()) {
return *submitBatchBufferReturnValue;
}
if (this->startingCmdBuffer == nullptr) {
this->startingCmdBuffer = &this->commandStream;
}
return BaseClass::submitBatchBuffer(offset, residencyContainer, endingCmdPtr, isCooperative);
}

View File

@@ -14,6 +14,7 @@
#include "shared/test/common/test_macros/hw_test.h"
#include "level_zero/core/source/builtin/builtin_functions_lib.h"
#include "level_zero/core/source/device/device.h"
#include "level_zero/core/source/gfx_core_helpers/l0_gfx_core_helper.h"
#include "level_zero/core/source/image/image_hw.h"
#include "level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.h"
@@ -1530,6 +1531,113 @@ HWTEST_F(PrimaryBatchBufferCmdListTest, givenPrimaryBatchBufferWhenCommandListHa
EXPECT_EQ(expectedEndPtr, cmdContainer.getEndCmdPtr());
}
// Executes the same copy command list twice on a copy-only queue and checks, after each
// execution, which allocation was flushed and where the MI_BATCH_BUFFER_START placed at
// the command list's end-command location points.
// The test is skipped when no queue group reports COPY without COMPUTE with at least one queue.
HWTEST_F(PrimaryBatchBufferCmdListTest, givenPrimaryBatchBufferWhenCopyCommandListAndQueueAreCreatedThenFirstDispatchCreatesGlobalInitPreambleAndLaterDispatchProvideCmdListBuffer) {
using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
ze_result_t returnValue;
// first call obtains the number of queue groups, second call fills the properties
uint32_t count = 0u;
returnValue = device->getCommandQueueGroupProperties(&count, nullptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
EXPECT_GT(count, 0u);
std::vector<ze_command_queue_group_properties_t> properties(count);
returnValue = device->getCommandQueueGroupProperties(&count, properties.data());
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
// pick the first group that has the COPY flag, lacks the COMPUTE flag and exposes a queue
uint32_t ordinal = 0u;
for (ordinal = 0u; ordinal < count; ordinal++) {
if ((properties[ordinal].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) &&
!(properties[ordinal].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE)) {
if (properties[ordinal].numQueues == 0) {
continue;
}
break;
}
}
// loop ran to the end without a match
if (ordinal == count) {
GTEST_SKIP();
}
// source and destination device allocations for the appended copy
void *dstPtr = nullptr;
void *srcPtr = nullptr;
const size_t size = 64;
ze_device_mem_alloc_desc_t deviceDesc = {};
returnValue = context->allocDeviceMem(device->toHandle(), &deviceDesc, size, 4u, &dstPtr);
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
returnValue = context->allocDeviceMem(device->toHandle(), &deviceDesc, size, 4u, &srcPtr);
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
ze_command_queue_desc_t desc{};
desc.ordinal = ordinal;
desc.index = 0u;
ze_command_queue_handle_t commandQueueHandle;
returnValue = device->createCommandQueue(&desc, &commandQueueHandle);
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
auto commandQueueCopy = static_cast<L0::ult::CommandQueue *>(L0::CommandQueue::fromHandle(commandQueueHandle));
ASSERT_NE(commandQueueCopy, nullptr);
auto ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(commandQueueCopy->getCsr());
// presumably makes the CSR keep latestFlushedBatchBuffer, which is inspected below - confirm in UltCommandStreamReceiver
ultCsr->recordFlusheBatchBuffer = true;
std::unique_ptr<L0::ult::CommandList> commandListCopy;
commandListCopy.reset(whiteboxCast(CommandList::create(productFamily, device, NEO::EngineGroupType::Copy, 0u, returnValue)));
ASSERT_EQ(ZE_RESULT_SUCCESS, returnValue);
auto &cmdContainerCopy = commandListCopy->getCmdContainer();
auto &cmdListStream = *cmdContainerCopy.getCommandStream();
auto firstCmdBufferAllocation = cmdContainerCopy.getCmdBufferAllocations()[0];
returnValue = commandListCopy->appendMemoryCopy(dstPtr, srcPtr, size, nullptr, 0, nullptr, false);
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
// position right behind the appended commands; after close() the end command is expected exactly here
size_t firstCmdBufferUsed = cmdListStream.getUsed();
auto bbStartSpace = ptrOffset(cmdListStream.getCpuBase(), firstCmdBufferUsed);
returnValue = commandListCopy->close();
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
EXPECT_EQ(bbStartSpace, cmdContainerCopy.getEndCmdPtr());
// used bytes plus one MI_BATCH_BUFFER_START, rounded up to minCmdBufferPtrAlign
size_t expectedAlignedUse = alignUp(firstCmdBufferUsed + sizeof(MI_BATCH_BUFFER_START), NEO::CommandContainer::minCmdBufferPtrAlign);
EXPECT_EQ(expectedAlignedUse, cmdContainerCopy.getAlignedPrimarySize());
// sampled before the first execution; selects which expectation applies to that execution
size_t blitterContextInitSize = ultCsr->getCmdsSizeForHardwareContext();
auto cmdListHandle = commandListCopy->toHandle();
returnValue = commandQueueCopy->executeCommandLists(1, &cmdListHandle, nullptr, true);
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
auto bbStartCmd = genCmdCast<MI_BATCH_BUFFER_START *>(bbStartSpace);
ASSERT_NE(nullptr, bbStartCmd);
auto &cmdQueueStream = commandQueueCopy->commandStream;
if (blitterContextInitSize > 0) {
// hardware context commands reported: the queue stream allocation is expected to be flushed
EXPECT_EQ(cmdQueueStream.getGraphicsAllocation(), ultCsr->latestFlushedBatchBuffer.commandBufferAllocation);
} else {
// no hardware context commands: the command list buffer is expected to be flushed and to jump to the queue stream base
EXPECT_EQ(firstCmdBufferAllocation, ultCsr->latestFlushedBatchBuffer.commandBufferAllocation);
EXPECT_EQ(cmdQueueStream.getGpuBase(), bbStartCmd->getBatchBufferStartAddress());
}
// queue stream usage before the second execution; the end command is expected to jump to this offset
size_t queueSizeUsed = cmdQueueStream.getUsed();
returnValue = commandQueueCopy->executeCommandLists(1, &cmdListHandle, nullptr, true);
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
bbStartCmd = genCmdCast<MI_BATCH_BUFFER_START *>(bbStartSpace);
ASSERT_NE(nullptr, bbStartCmd);
EXPECT_EQ(cmdQueueStream.getGpuBase() + queueSizeUsed, bbStartCmd->getBatchBufferStartAddress());
// release the queue, the command list and both device allocations
commandQueueCopy->destroy();
commandListCopy.reset(nullptr);
returnValue = context->freeMem(dstPtr);
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
returnValue = context->freeMem(srcPtr);
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
}
using PrimaryBatchBufferPreamblelessCmdListTest = Test<PrimaryBatchBufferPreamblelessCmdListFixture>;
HWTEST2_F(PrimaryBatchBufferPreamblelessCmdListTest,
@@ -1592,5 +1700,213 @@ HWTEST2_F(PrimaryBatchBufferPreamblelessCmdListTest,
EXPECT_EQ((uncachedMocs << 1), sbaCmd->getStatelessDataPortAccessMemoryObjectControlState());
}
// Executes one command list twice. After the first execution the queue stream allocation is
// expected to be the flushed buffer; after the second execution the command list's first
// command buffer is expected to be flushed, with its end command jumping to the queue
// stream offset recorded between the two executions.
HWTEST2_F(PrimaryBatchBufferPreamblelessCmdListTest,
givenPrimaryBatchBufferWhenExecutingCommandWithoutPreambleThenUseCommandListBufferAsStartingBuffer,
IsAtLeastXeHpCore) {
using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
auto ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(commandQueue->getCsr());
// presumably makes the CSR keep latestFlushedBatchBuffer, which is inspected below - confirm in UltCommandStreamReceiver
ultCsr->recordFlusheBatchBuffer = true;
ze_group_count_t groupCount{1, 1, 1};
CmdListKernelLaunchParams launchParams = {};
ze_result_t result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
result = commandList->close();
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto commandListHandle = commandList->toHandle();
// first execution: the queue stream allocation is expected to be the flushed buffer
result = commandQueue->executeCommandLists(1, &commandListHandle, nullptr, true);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto &cmdQueueStream = commandQueue->commandStream;
EXPECT_EQ(cmdQueueStream.getGraphicsAllocation(), ultCsr->latestFlushedBatchBuffer.commandBufferAllocation);
// queue stream position before the second execution; expected jump target of the list's end command
size_t queueUsedSize = cmdQueueStream.getUsed();
auto gpuReturnAddress = cmdQueueStream.getGpuBase() + queueUsedSize;
// second execution: the command list's first command buffer is expected to be the flushed buffer
result = commandQueue->executeCommandLists(1, &commandListHandle, nullptr, true);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto &cmdContainer = commandList->getCmdContainer();
auto firstCmdBufferAllocation = cmdContainer.getCmdBufferAllocations()[0];
EXPECT_EQ(firstCmdBufferAllocation, ultCsr->latestFlushedBatchBuffer.commandBufferAllocation);
auto bbStartCmd = genCmdCast<MI_BATCH_BUFFER_START *>(cmdContainer.getEndCmdPtr());
ASSERT_NE(nullptr, bbStartCmd);
EXPECT_EQ(gpuReturnAddress, bbStartCmd->getBatchBufferStartAddress());
}
// Executes the first command list alone, then all three command lists together, and checks
// for the second execution that: the queue stream portion added by it holds no
// MI_BATCH_BUFFER_START, the first list's command buffer is the flushed buffer, and the end
// commands chain list 1 -> list 2 -> list 3 -> queue stream offset recorded before that execution.
HWTEST2_F(PrimaryBatchBufferPreamblelessCmdListTest,
givenPrimaryBatchBufferWhenExecutingMultipleCommandListsAndEachWithoutPreambleThenUseCommandListBufferAsStartingBufferAndChainAllCommandLists,
IsAtLeastXeHpCore) {
using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
auto ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(commandQueue->getCsr());
// presumably makes the CSR keep latestFlushedBatchBuffer, which is inspected below - confirm in UltCommandStreamReceiver
ultCsr->recordFlusheBatchBuffer = true;
ze_group_count_t groupCount{1, 1, 1};
CmdListKernelLaunchParams launchParams = {};
ze_result_t result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
result = commandList->close();
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
ze_command_list_handle_t commandLists[] = {commandList->toHandle(),
commandList2->toHandle(),
commandList3->toHandle()};
// first execution submits only the first handle of the array
result = commandQueue->executeCommandLists(1, commandLists, nullptr, true);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto &cmdQueueStream = commandQueue->commandStream;
EXPECT_EQ(cmdQueueStream.getGraphicsAllocation(), ultCsr->latestFlushedBatchBuffer.commandBufferAllocation);
// queue stream position before the second execution; expected jump target of the last list's end command
size_t queueUsedSize = cmdQueueStream.getUsed();
auto gpuReturnAddress = cmdQueueStream.getGpuBase() + queueUsedSize;
// fill and close the remaining two command lists
result = commandList2->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
result = commandList2->close();
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
result = commandList3->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
result = commandList3->close();
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
// second execution submits all three command lists
result = commandQueue->executeCommandLists(3, commandLists, nullptr, true);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
// parse only what the second execution added to the queue stream
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
cmdList,
ptrOffset(cmdQueueStream.getCpuBase(), queueUsedSize),
cmdQueueStream.getUsed() - queueUsedSize));
// no batch buffer start is expected in the queue stream itself
auto cmdQueueBbStartCmds = findAll<MI_BATCH_BUFFER_START *>(cmdList.begin(), cmdList.end());
EXPECT_EQ(0u, cmdQueueBbStartCmds.size());
auto &cmdContainer1stCmdList = commandList->getCmdContainer();
auto dispatchCmdBufferAllocation = cmdContainer1stCmdList.getCmdBufferAllocations()[0];
EXPECT_EQ(dispatchCmdBufferAllocation, ultCsr->latestFlushedBatchBuffer.commandBufferAllocation);
// end command of the 1st list is expected to jump to the 2nd list's first command buffer
auto bbStartCmd = genCmdCast<MI_BATCH_BUFFER_START *>(cmdContainer1stCmdList.getEndCmdPtr());
ASSERT_NE(nullptr, bbStartCmd);
auto &cmdContainer2ndCmdList = commandList2->getCmdContainer();
auto secondCmdBufferAllocation = cmdContainer2ndCmdList.getCmdBufferAllocations()[0];
EXPECT_EQ(secondCmdBufferAllocation->getGpuAddress(), bbStartCmd->getBatchBufferStartAddress());
// end command of the 2nd list is expected to jump to the 3rd list's first command buffer
bbStartCmd = genCmdCast<MI_BATCH_BUFFER_START *>(cmdContainer2ndCmdList.getEndCmdPtr());
ASSERT_NE(nullptr, bbStartCmd);
auto &cmdContainer3rdCmdList = commandList3->getCmdContainer();
auto thirdCmdBufferAllocation = cmdContainer3rdCmdList.getCmdBufferAllocations()[0];
EXPECT_EQ(thirdCmdBufferAllocation->getGpuAddress(), bbStartCmd->getBatchBufferStartAddress());
// end command of the 3rd list is expected to jump back to the recorded queue stream position
bbStartCmd = genCmdCast<MI_BATCH_BUFFER_START *>(cmdContainer3rdCmdList.getEndCmdPtr());
ASSERT_NE(nullptr, bbStartCmd);
EXPECT_EQ(gpuReturnAddress, bbStartCmd->getBatchBufferStartAddress());
}
// Scenario: three command lists are executed together and the 2nd one needs a dynamic preamble.
// Expected chaining, as asserted below:
//   1st list buffer (flushed as the starting batch buffer)
//     -> queue stream (dynamic preamble with STATE_BASE_ADDRESS)
//     -> 2nd list buffer -> 3rd list buffer
//     -> back to the queue stream, right behind the preamble and its chaining BB_START.
HWTEST2_F(PrimaryBatchBufferPreamblelessCmdListTest,
givenPrimaryBatchBufferWhenExecutingMultipleCommandListsAndSecondWithPreambleThenUseCommandListBufferAsStartingBufferAndChainFirstListToQueuePreambleAndAfterToSecondList,
IsAtLeastXeHpCore) {
using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
using STATE_BASE_ADDRESS = typename FamilyType::STATE_BASE_ADDRESS;
auto ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(commandQueue->getCsr());
// Ask the ULT CSR to record flushed batch buffers; latestFlushedBatchBuffer is inspected after each submission.
ultCsr->recordFlusheBatchBuffer = true;
ze_group_count_t groupCount{1, 1, 1};
CmdListKernelLaunchParams launchParams = {};
ze_result_t result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
result = commandList->close();
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
// The array holds all three handles, but the first submission passes a count of 1,
// so only the 1st command list is executed here.
ze_command_list_handle_t commandLists[] = {commandList->toHandle(),
commandList2->toHandle(),
commandList3->toHandle()};
result = commandQueue->executeCommandLists(1, commandLists, nullptr, true);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto &cmdQueueStream = commandQueue->commandStream;
// For this first submission the queue's own stream is expected to be the flushed batch buffer
// (presumably because the initial submission requires a queue preamble - confirm against the fixture).
EXPECT_EQ(cmdQueueStream.getGraphicsAllocation(), ultCsr->latestFlushedBatchBuffer.commandBufferAllocation);
// Remember where the queue stream currently ends: anything the second submission writes into
// the queue stream starts at this GPU address.
size_t queueUsedSize = cmdQueueStream.getUsed();
auto gpuReturnAddress = cmdQueueStream.getGpuBase() + queueUsedSize;
// Change the kernel's uncached MOCS requirement before recording the 2nd list; per the comment
// further below this is what makes the stateless MOCS dirty and forces a dynamic preamble.
kernel->kernelRequiresUncachedMocsCount++;
result = commandList2->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
result = commandList2->close();
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
result = commandList3->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
result = commandList3->close();
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
// Second submission: all three command lists in one call.
result = commandQueue->executeCommandLists(3, commandLists, nullptr, true);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
// 1st command list is preambleless
auto &cmdContainer1stCmdList = commandList->getCmdContainer();
auto dispatchCmdBufferAllocation = cmdContainer1stCmdList.getCmdBufferAllocations()[0];
// The first buffer of the 1st list, not the queue stream, must be the flushed batch buffer.
EXPECT_EQ(dispatchCmdBufferAllocation, ultCsr->latestFlushedBatchBuffer.commandBufferAllocation);
auto bbStartCmd = genCmdCast<MI_BATCH_BUFFER_START *>(cmdContainer1stCmdList.getEndCmdPtr());
ASSERT_NE(nullptr, bbStartCmd);
// ending BB_START of 1st command list points to dynamic preamble - dirty stateless mocs SBA command
EXPECT_EQ(gpuReturnAddress, bbStartCmd->getBatchBufferStartAddress());
// Parse only the part of the queue stream written by the second submission
// (from the offset captured above up to the current used size).
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
cmdList,
ptrOffset(cmdQueueStream.getCpuBase(), queueUsedSize),
cmdQueueStream.getUsed() - queueUsedSize));
// That part must contain at least one STATE_BASE_ADDRESS (the dynamic preamble) ...
auto cmdQueueSbaDirtyCmds = findAll<STATE_BASE_ADDRESS *>(cmdList.begin(), cmdList.end());
ASSERT_TRUE(cmdQueueSbaDirtyCmds.size() >= 1u);
// ... and exactly one BB_START, which chains from the preamble into the 2nd command list.
auto cmdQueueBbStartCmds = findAll<MI_BATCH_BUFFER_START *>(cmdList.begin(), cmdList.end());
ASSERT_EQ(1u, cmdQueueBbStartCmds.size());
auto chainFromPreambleToSecondBbStartCmd = reinterpret_cast<MI_BATCH_BUFFER_START *>(*cmdQueueBbStartCmds[0]);
auto &cmdContainer2ndCmdList = commandList2->getCmdContainer();
auto secondCmdBufferAllocation = cmdContainer2ndCmdList.getCmdBufferAllocations()[0];
EXPECT_EQ(secondCmdBufferAllocation->getGpuAddress(), chainFromPreambleToSecondBbStartCmd->getBatchBufferStartAddress());
// The 2nd list ends with a BB_START that jumps directly to the first buffer of the 3rd list.
bbStartCmd = genCmdCast<MI_BATCH_BUFFER_START *>(cmdContainer2ndCmdList.getEndCmdPtr());
ASSERT_NE(nullptr, bbStartCmd);
auto &cmdContainer3rdCmdList = commandList3->getCmdContainer();
auto thirdCmdBufferAllocation = cmdContainer3rdCmdList.getCmdBufferAllocations()[0];
EXPECT_EQ(thirdCmdBufferAllocation->getGpuAddress(), bbStartCmd->getBatchBufferStartAddress());
// The 3rd (last) list ends with a BB_START back into the queue stream.
bbStartCmd = genCmdCast<MI_BATCH_BUFFER_START *>(cmdContainer3rdCmdList.getEndCmdPtr());
ASSERT_NE(nullptr, bbStartCmd);
// Expected size of the preamble in the queue stream: one STATE_BASE_ADDRESS plus a single barrier,
// and a second STATE_BASE_ADDRESS when the queue reports doubleSbaWa.
size_t sbaSize = sizeof(STATE_BASE_ADDRESS) + NEO::MemorySynchronizationCommands<FamilyType>::getSizeForSingleBarrier(false);
if (commandQueue->doubleSbaWa) {
sbaSize += sizeof(STATE_BASE_ADDRESS);
}
// The return address skips the preamble and the BB_START that chained into the 2nd list.
gpuReturnAddress += sizeof(MI_BATCH_BUFFER_START) + sbaSize;
EXPECT_EQ(gpuReturnAddress, bbStartCmd->getBatchBufferStartAddress());
}
} // namespace ult
} // namespace L0

View File

@@ -1534,15 +1534,19 @@ HWTEST2_F(FrontEndPrimaryBatchBufferCommandListTest,
ze_group_count_t groupCount{1, 1, 1};
CmdListKernelLaunchParams launchParams = {};
commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
ze_result_t result;
result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto &commandsToPatch = commandList->commandsToPatch;
EXPECT_EQ(0u, commandsToPatch.size());
mockKernelImmData->kernelDescriptor->kernelAttributes.perThreadScratchSize[0] = 0x40;
mockKernelImmData->kernelDescriptor->kernelAttributes.flags.requiresDisabledEUFusion = 1;
size_t usedBefore = cmdStream.getUsed();
commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
if (fePropertiesSupport.disableEuFusion) {
ASSERT_EQ(1u, commandsToPatch.size());
@@ -1555,11 +1559,13 @@ HWTEST2_F(FrontEndPrimaryBatchBufferCommandListTest,
auto cfeCmd = genCmdCast<CFE_STATE *>(cfePatch.pCommand);
ASSERT_NE(nullptr, cfeCmd);
EXPECT_TRUE(NEO::UnitTestHelper<FamilyType>::getDisableFusionStateFromFrontEndCommand(*cfeCmd));
EXPECT_EQ(0u, cfeCmd->getScratchSpaceBuffer());
} else {
EXPECT_EQ(0u, commandsToPatch.size());
}
commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
if (fePropertiesSupport.disableEuFusion) {
EXPECT_EQ(1u, commandsToPatch.size());
@@ -1570,7 +1576,8 @@ HWTEST2_F(FrontEndPrimaryBatchBufferCommandListTest,
mockKernelImmData->kernelDescriptor->kernelAttributes.flags.requiresDisabledEUFusion = 0;
usedBefore = cmdStream.getUsed();
commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
if (fePropertiesSupport.disableEuFusion) {
ASSERT_EQ(2u, commandsToPatch.size());
@@ -1583,12 +1590,14 @@ HWTEST2_F(FrontEndPrimaryBatchBufferCommandListTest,
auto cfeCmd = genCmdCast<CFE_STATE *>(cfePatch.pCommand);
ASSERT_NE(nullptr, cfeCmd);
EXPECT_FALSE(NEO::UnitTestHelper<FamilyType>::getDisableFusionStateFromFrontEndCommand(*cfeCmd));
EXPECT_EQ(0u, cfeCmd->getScratchSpaceBuffer());
}
mockKernelImmData->kernelDescriptor->kernelAttributes.flags.requiresDisabledEUFusion = 1;
usedBefore = cmdStream.getUsed();
commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
if (fePropertiesSupport.disableEuFusion) {
ASSERT_EQ(3u, commandsToPatch.size());
@@ -1601,15 +1610,40 @@ HWTEST2_F(FrontEndPrimaryBatchBufferCommandListTest,
auto cfeCmd = genCmdCast<CFE_STATE *>(cfePatch.pCommand);
ASSERT_NE(nullptr, cfeCmd);
EXPECT_TRUE(NEO::UnitTestHelper<FamilyType>::getDisableFusionStateFromFrontEndCommand(*cfeCmd));
EXPECT_EQ(0u, cfeCmd->getScratchSpaceBuffer());
} else {
EXPECT_EQ(0u, commandsToPatch.size());
}
result = commandList->close();
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto commandListHandle = commandList->toHandle();
result = commandQueue->executeCommandLists(1, &commandListHandle, nullptr, true);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
if (fePropertiesSupport.disableEuFusion) {
commandList->reset();
ASSERT_EQ(3u, commandsToPatch.size());
bool disableFusionStates[] = {true, false, true};
uint32_t disableFusionStatesIdx = 0;
for (const auto &cfeToPatch : commandsToPatch) {
EXPECT_EQ(CommandList::CommandToPatch::FrontEndState, cfeToPatch.type);
auto cfeCmd = genCmdCast<CFE_STATE *>(cfeToPatch.pDestination);
ASSERT_NE(nullptr, cfeCmd);
EXPECT_EQ(disableFusionStates[disableFusionStatesIdx++],
NEO::UnitTestHelper<FamilyType>::getDisableFusionStateFromFrontEndCommand(*cfeCmd));
EXPECT_NE(0u, cfeCmd->getScratchSpaceBuffer());
}
result = commandList->reset();
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
EXPECT_EQ(0u, commandsToPatch.size());
}
}
HWTEST2_F(FrontEndPrimaryBatchBufferCommandListTest,
givenFrontEndTrackingCmdListIsExecutedWhenPropertyComputeDispatchAllWalkerSupportedThenExpectFrontEndAddedToPatchlist,
IsAtLeastXeHpCore) {
@@ -1619,6 +1653,8 @@ HWTEST2_F(FrontEndPrimaryBatchBufferCommandListTest,
auto &productHelper = device->getProductHelper();
productHelper.fillFrontEndPropertiesSupportStructure(fePropertiesSupport, device->getHwInfo());
mockKernelImmData->kernelDescriptor->kernelAttributes.perThreadScratchSize[0] = 0x40;
NEO::DebugManager.flags.AllowMixingRegularAndCooperativeKernels.set(1);
EXPECT_TRUE(commandList->frontEndStateTracking);
@@ -1644,6 +1680,7 @@ HWTEST2_F(FrontEndPrimaryBatchBufferCommandListTest,
auto cfeCmd = genCmdCast<CFE_STATE *>(cfePatch.pCommand);
ASSERT_NE(nullptr, cfeCmd);
EXPECT_TRUE(NEO::UnitTestHelper<FamilyType>::getComputeDispatchAllWalkerFromFrontEndCommand(*cfeCmd));
EXPECT_EQ(0u, cfeCmd->getScratchSpaceBuffer());
} else {
EXPECT_EQ(0u, commandsToPatch.size());
}
@@ -1668,6 +1705,7 @@ HWTEST2_F(FrontEndPrimaryBatchBufferCommandListTest,
auto cfeCmd = genCmdCast<CFE_STATE *>(cfePatch.pCommand);
ASSERT_NE(nullptr, cfeCmd);
EXPECT_FALSE(NEO::UnitTestHelper<FamilyType>::getComputeDispatchAllWalkerFromFrontEndCommand(*cfeCmd));
EXPECT_EQ(0u, cfeCmd->getScratchSpaceBuffer());
} else {
EXPECT_EQ(0u, commandsToPatch.size());
}
@@ -1683,12 +1721,36 @@ HWTEST2_F(FrontEndPrimaryBatchBufferCommandListTest,
auto cfeCmd = genCmdCast<CFE_STATE *>(cfePatch.pCommand);
ASSERT_NE(nullptr, cfeCmd);
EXPECT_TRUE(NEO::UnitTestHelper<FamilyType>::getComputeDispatchAllWalkerFromFrontEndCommand(*cfeCmd));
EXPECT_EQ(0u, cfeCmd->getScratchSpaceBuffer());
} else {
EXPECT_EQ(0u, commandsToPatch.size());
}
result = commandList->close();
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto commandListHandle = commandList->toHandle();
result = commandQueue->executeCommandLists(1, &commandListHandle, nullptr, true);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
if (fePropertiesSupport.computeDispatchAllWalker) {
commandList->reset();
ASSERT_EQ(3u, commandsToPatch.size());
bool computeDispatchAllWalkerStates[] = {true, false, true};
uint32_t computeDispatchAllWalkerStatesIdx = 0;
for (const auto &cfeToPatch : commandsToPatch) {
EXPECT_EQ(CommandList::CommandToPatch::FrontEndState, cfeToPatch.type);
auto cfeCmd = genCmdCast<CFE_STATE *>(cfeToPatch.pDestination);
ASSERT_NE(nullptr, cfeCmd);
EXPECT_EQ(computeDispatchAllWalkerStates[computeDispatchAllWalkerStatesIdx++],
NEO::UnitTestHelper<FamilyType>::getComputeDispatchAllWalkerFromFrontEndCommand(*cfeCmd));
EXPECT_NE(0u, cfeCmd->getScratchSpaceBuffer());
}
result = commandList->reset();
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
EXPECT_EQ(0u, commandsToPatch.size());
}
}

View File

@@ -650,6 +650,7 @@ HWTEST_F(CommandQueueCreate, givenContainerWithAllocationsWhenResidencyContainer
false,
false,
returnValue));
commandQueue->startingCmdBuffer = &commandQueue->commandStream;
ResidencyContainer container;
TaskCountType peekTaskCountBefore = commandQueue->csr->peekTaskCount();
TaskCountType flushedTaskCountBefore = commandQueue->csr->peekLatestFlushedTaskCount();
@@ -676,6 +677,7 @@ HWTEST_F(CommandQueueCreate, givenCommandStreamReceiverFailsThenSubmitBatchBuffe
false,
false,
returnValue));
commandQueue->startingCmdBuffer = &commandQueue->commandStream;
ResidencyContainer container;
TaskCountType peekTaskCountBefore = commandQueue->csr->peekTaskCount();
TaskCountType flushedTaskCountBefore = commandQueue->csr->peekLatestFlushedTaskCount();
@@ -701,6 +703,7 @@ HWTEST_F(CommandQueueCreate, givenOutOfMemoryThenSubmitBatchBufferReturnsOutOfMe
false,
false,
returnValue));
commandQueue->startingCmdBuffer = &commandQueue->commandStream;
ResidencyContainer container;
NEO::SubmissionStatus ret = commandQueue->submitBatchBuffer(0, container, nullptr, false);
EXPECT_EQ(ret, NEO::SubmissionStatus::OUT_OF_MEMORY);

View File

@@ -1014,29 +1014,6 @@ HWTEST2_F(CommandQueueScratchTests, whenPatchCommandsIsCalledThenCommandsAreCorr
}
}
// Checks that patchCommands throws when a FrontEndState patch entry is processed
// together with a scratch address of zero.
HWTEST2_F(CommandQueueScratchTests, givenInvalidScratchAddressWhenPatchCommandsIsCalledThenAbortIsThrown, IsAtLeastXeHpCore) {
    using CFE_STATE = typename FamilyType::CFE_STATE;

    // Queue under test, bound to the CSR of ordinal 0 / index 0.
    ze_command_queue_desc_t queueDesc = {};
    NEO::CommandStreamReceiver *receiver = nullptr;
    device->getCsrForOrdinalAndIndex(&receiver, 0u, 0u);
    auto queue = std::make_unique<MockCommandQueueHw<gfxCoreFamily>>(device, receiver, &queueDesc);
    auto cmdList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>>();

    // Patch entry: copy of the default CFE_STATE as source, a local CFE_STATE as destination.
    // NOTE(review): the source is a raw allocation that this test never frees itself -
    // presumably the command list releases it along with commandsToPatch; confirm.
    CFE_STATE patchTarget;
    auto patchSource = new CFE_STATE(FamilyType::cmdInitCfeState);

    CommandList::CommandToPatch frontEndPatch;
    frontEndPatch.type = CommandList::CommandToPatch::CommandType::FrontEndState;
    frontEndPatch.pCommand = patchSource;
    frontEndPatch.pDestination = &patchTarget;
    cmdList->commandsToPatch.push_back(frontEndPatch);

    // A scratch address of zero is the invalid input being exercised.
    uint64_t zeroScratchAddress = 0u;
    EXPECT_ANY_THROW(queue->patchCommands(*cmdList, zeroScratchAddress));
}
using IsWithinNotSupported = IsWithinGfxCore<IGFX_GEN9_CORE, IGFX_GEN12LP_CORE>;
HWTEST2_F(CommandQueueScratchTests, givenCommandsToPatchToNotSupportedPlatformWhenPatchCommandsIsCalledThenAbortIsThrown, IsWithinNotSupported) {