feature: Append recorded commandlist into immediate (5/N)

- add support for heapless mode

Related-To: NEO-10356

Signed-off-by: Aravind Gopalakrishnan <aravind.gopalakrishnan@intel.com>
This commit is contained in:
Aravind Gopalakrishnan
2024-08-28 23:14:23 +00:00
committed by Compute-Runtime-Automation
parent 63528e70a7
commit 20aa853369
2 changed files with 26 additions and 15 deletions

View File

@@ -121,8 +121,7 @@ struct CommandQueueHw : public CommandQueueImp {
uint32_t numCommandLists, uint32_t numCommandLists,
ze_command_list_handle_t *commandListHandles, ze_command_list_handle_t *commandListHandles,
ze_fence_handle_t hFence, ze_fence_handle_t hFence,
ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, NEO::LinearStream *parentImmediateCommandlistLinearStream);
ze_event_handle_t *phWaitEvents);
MOCKABLE_VIRTUAL ze_result_t executeCommandListsRegular(CommandListExecutionContext &ctx, MOCKABLE_VIRTUAL ze_result_t executeCommandListsRegular(CommandListExecutionContext &ctx,
uint32_t numCommandLists, uint32_t numCommandLists,

View File

@@ -117,7 +117,7 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
ret = this->executeCommandListsCopyOnly(ctx, numCommandLists, phCommandLists, hFence, parentImmediateCommandlistLinearStream); ret = this->executeCommandListsCopyOnly(ctx, numCommandLists, phCommandLists, hFence, parentImmediateCommandlistLinearStream);
} else if (this->heaplessStateInitEnabled) { } else if (this->heaplessStateInitEnabled) {
ctx.globalInit = false; ctx.globalInit = false;
ret = this->executeCommandListsRegularHeapless(ctx, numCommandLists, phCommandLists, hFence, nullptr, 0, nullptr); ret = this->executeCommandListsRegularHeapless(ctx, numCommandLists, phCommandLists, hFence, parentImmediateCommandlistLinearStream);
} else { } else {
ret = this->executeCommandListsRegular(ctx, numCommandLists, phCommandLists, hFence, parentImmediateCommandlistLinearStream); ret = this->executeCommandListsRegular(ctx, numCommandLists, phCommandLists, hFence, parentImmediateCommandlistLinearStream);
} }
@@ -135,13 +135,12 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegularHeapless(
uint32_t numCommandLists, uint32_t numCommandLists,
ze_command_list_handle_t *commandListHandles, ze_command_list_handle_t *commandListHandles,
ze_fence_handle_t hFence, ze_fence_handle_t hFence,
ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, NEO::LinearStream *parentImmediateCommandlistLinearStream) {
ze_event_handle_t *phWaitEvents) {
auto neoDevice = this->device->getNEODevice(); auto neoDevice = this->device->getNEODevice();
this->csr->initializeDeviceWithFirstSubmission(*neoDevice); this->csr->initializeDeviceWithFirstSubmission(*neoDevice);
this->setupCmdListsAndContextParams(ctx, commandListHandles, numCommandLists, hFence, nullptr); this->setupCmdListsAndContextParams(ctx, commandListHandles, numCommandLists, hFence, parentImmediateCommandlistLinearStream);
ctx.isDirectSubmissionEnabled = this->csr->isDirectSubmissionEnabled(); ctx.isDirectSubmissionEnabled = this->csr->isDirectSubmissionEnabled();
bool instructionCacheFlushRequired = this->csr->isInstructionCacheFlushRequired(); bool instructionCacheFlushRequired = this->csr->isInstructionCacheFlushRequired();
bool stateCacheFlushRequired = neoDevice->getBindlessHeapsHelper() ? neoDevice->getBindlessHeapsHelper()->getStateDirtyForContext(this->csr->getOsContext().getContextId()) : false; bool stateCacheFlushRequired = neoDevice->getBindlessHeapsHelper() ? neoDevice->getBindlessHeapsHelper()->getStateDirtyForContext(this->csr->getOsContext().getContextId()) : false;
@@ -164,6 +163,8 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegularHeapless(
return ret; return ret;
} }
NEO::LinearStream *streamForDispatch = parentImmediateCommandlistLinearStream ? parentImmediateCommandlistLinearStream : &child;
this->getGlobalFenceAndMakeItResident(); this->getGlobalFenceAndMakeItResident();
this->getWorkPartitionAndMakeItResident(); this->getWorkPartitionAndMakeItResident();
this->getGlobalStatelessHeapAndMakeItResident(ctx); this->getGlobalStatelessHeapAndMakeItResident(ctx);
@@ -176,22 +177,22 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegularHeapless(
this->makeCsrTagAllocationResident(); this->makeCsrTagAllocationResident();
if (instructionCacheFlushRequired) { if (instructionCacheFlushRequired) {
NEO::MemorySynchronizationCommands<GfxFamily>::addInstructionCacheFlush(child); NEO::MemorySynchronizationCommands<GfxFamily>::addInstructionCacheFlush(*streamForDispatch);
this->csr->setInstructionCacheFlushed(); this->csr->setInstructionCacheFlushed();
} }
if (stateCacheFlushRequired) { if (stateCacheFlushRequired) {
NEO::MemorySynchronizationCommands<GfxFamily>::addStateCacheFlush(child, neoDevice->getRootDeviceEnvironment()); NEO::MemorySynchronizationCommands<GfxFamily>::addStateCacheFlush(*streamForDispatch, neoDevice->getRootDeviceEnvironment());
neoDevice->getBindlessHeapsHelper()->clearStateDirtyForContext(this->csr->getOsContext().getContextId()); neoDevice->getBindlessHeapsHelper()->clearStateDirtyForContext(this->csr->getOsContext().getContextId());
} }
for (auto i = 0u; i < numCommandLists; ++i) { for (auto i = 0u; i < numCommandLists; ++i) {
auto commandList = CommandList::fromHandle(commandListHandles[i]); auto commandList = CommandList::fromHandle(commandListHandles[i]);
ctx.childGpuAddressPositionBeforeDynamicPreamble = child.getCurrentGpuAddressPosition(); ctx.childGpuAddressPositionBeforeDynamicPreamble = (*streamForDispatch).getCurrentGpuAddressPosition();
this->patchCommands(*commandList, ctx); this->patchCommands(*commandList, ctx);
this->programOneCmdListBatchBufferStart(commandList, child, ctx); this->programOneCmdListBatchBufferStart(commandList, *streamForDispatch, ctx);
this->prefetchMemoryToDeviceAssociatedWithCmdList(commandList); this->prefetchMemoryToDeviceAssociatedWithCmdList(commandList);
if (commandList->hasKernelWithAssert()) { if (commandList->hasKernelWithAssert()) {
@@ -202,18 +203,29 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegularHeapless(
} }
this->migrateSharedAllocationsIfRequested(ctx.isMigrationRequested, ctx.firstCommandList); this->migrateSharedAllocationsIfRequested(ctx.isMigrationRequested, ctx.firstCommandList);
this->programLastCommandListReturnBbStart(child, ctx); this->programLastCommandListReturnBbStart(*streamForDispatch, ctx);
this->assignCsrTaskCountToFenceIfAvailable(hFence); this->assignCsrTaskCountToFenceIfAvailable(hFence);
this->dispatchTaskCountPostSyncRegular(ctx.isDispatchTaskCountPostSyncRequired, child); this->dispatchTaskCountPostSyncRegular(ctx.isDispatchTaskCountPostSyncRequired, *streamForDispatch);
auto submitResult = NEO::SubmissionStatus::failed;
if (parentImmediateCommandlistLinearStream) {
submitResult = NEO::SubmissionStatus::success;
} else {
submitResult = this->prepareAndSubmitBatchBuffer(ctx, *streamForDispatch);
}
auto submitResult = this->prepareAndSubmitBatchBuffer(ctx, child);
this->updateTaskCountAndPostSync(ctx.isDispatchTaskCountPostSyncRequired); this->updateTaskCountAndPostSync(ctx.isDispatchTaskCountPostSyncRequired);
this->csr->makeSurfacePackNonResident(this->csr->getResidencyAllocations(), false); this->csr->makeSurfacePackNonResident(this->csr->getResidencyAllocations(), false);
auto completionResult = this->waitForCommandQueueCompletionAndCleanHeapContainer(); auto completionResult = ZE_RESULT_SUCCESS;
if (!parentImmediateCommandlistLinearStream) {
completionResult = this->waitForCommandQueueCompletionAndCleanHeapContainer();
}
ze_result_t retVal = this->handleSubmissionAndCompletionResults(submitResult, completionResult); ze_result_t retVal = this->handleSubmissionAndCompletionResults(submitResult, completionResult);
this->csr->getResidencyAllocations().clear(); if (!parentImmediateCommandlistLinearStream) {
this->csr->getResidencyAllocations().clear();
}
return retVal; return retVal;
} }