diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h index cc417a69d6..7f8a06309f 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h @@ -14,6 +14,7 @@ #include #include +#include namespace NEO { struct SvmAllocationData; @@ -22,6 +23,8 @@ class LinearStream; } // namespace NEO namespace L0 { +using CsrMutex = std::recursive_mutex; +using MutexLock = std::unique_lock; struct EventPool; struct Event; @@ -53,7 +56,7 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily::*)(NEO::LinearStream &, size_t, bool, bool, bool, bool); + using ComputeFlushMethodType = NEO::CompletionStamp (CommandListCoreFamilyImmediate::*)(NEO::LinearStream &, size_t, bool, bool, NEO::AppendOperations, bool); CommandListCoreFamilyImmediate(uint32_t numIddsPerBlock); @@ -186,24 +189,31 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily void handleHeapsAndResidencyForImmediateRegularTask(void *&sshCpuBaseAddress); void handleDebugSurfaceStateUpdate(NEO::IndirectHeap *ssh); - void checkAvailableSpace(uint32_t numEvents, bool hasRelaxedOrderingDependencies, size_t commandSize); + void checkAvailableSpace(uint32_t numEvents, bool hasRelaxedOrderingDependencies, size_t commandSize, bool requestCommandBufferInLocalMem); void updateDispatchFlagsWithRequiredStreamState(NEO::DispatchFlags &dispatchFlags); - MOCKABLE_VIRTUAL ze_result_t flushImmediate(ze_result_t inputRet, bool performMigration, bool hasStallingCmds, bool hasRelaxedOrderingDependencies, bool kernelOperation, bool copyOffloadSubmission, ze_event_handle_t hSignalEvent, bool requireTaskCountUpdate); + MOCKABLE_VIRTUAL ze_result_t flushImmediate(ze_result_t inputRet, bool performMigration, bool hasStallingCmds, bool hasRelaxedOrderingDependencies, + NEO::AppendOperations appendOperation, bool copyOffloadSubmission, ze_event_handle_t hSignalEvent, bool requireTaskCountUpdate, + MutexLock *outerLock); bool preferCopyThroughLockedPtr(CpuMemCopyInfo &cpuMemCopyInfo, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents); bool isSuitableUSMHostAlloc(NEO::SvmAllocationData *alloc); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl index 257bb883b8..1f49ca75e3 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl @@ -50,7 +50,7 @@ CommandListCoreFamilyImmediate::CommandListCoreFamilyImmediate(ui } template -void CommandListCoreFamilyImmediate::checkAvailableSpace(uint32_t numEvents, bool hasRelaxedOrderingDependencies, size_t commandSize) { +void CommandListCoreFamilyImmediate::checkAvailableSpace(uint32_t numEvents, bool hasRelaxedOrderingDependencies, size_t commandSize, bool requestCommandBufferInLocalMem) { this->commandContainer.fillReusableAllocationLists(); /* Command container might has two command buffers. If it has, one is in local memory, because relaxed ordering requires that and one in system for copying it into ring buffer. @@ -104,12 +104,16 @@ void CommandListCoreFamilyImmediate::updateDispatchFlagsWithRequi } template -NEO::CompletionStamp CommandListCoreFamilyImmediate::flushBcsTask(NEO::LinearStream &cmdStreamTask, size_t taskStartOffset, bool hasStallingCmds, bool hasRelaxedOrderingDependencies, NEO::CommandStreamReceiver *csr) { +NEO::CompletionStamp CommandListCoreFamilyImmediate::flushBcsTask(NEO::LinearStream &cmdStreamTask, size_t taskStartOffset, bool hasStallingCmds, bool hasRelaxedOrderingDependencies, bool requireTaskCountUpdate, NEO::AppendOperations appendOperation, NEO::CommandStreamReceiver *csr) { + NEO::LinearStream *optionalEpilogueCmdStream = nullptr; + NEO::DispatchBcsFlags dispatchBcsFlags( this->isSyncModeQueue, // flushTaskCount hasStallingCmds, // hasStallingCmds hasRelaxedOrderingDependencies // hasRelaxedOrderingDependencies ); + dispatchBcsFlags.optionalEpilogueCmdStream = optionalEpilogueCmdStream; + dispatchBcsFlags.dispatchOperation = appendOperation; CommandListImp::storeReferenceTsToMappedEvents(true); @@ -235,17 +239,21 @@ void CommandListCoreFamilyImmediate::handleHeapsAndResidencyForIm template NEO::CompletionStamp CommandListCoreFamilyImmediate::flushImmediateRegularTask(NEO::LinearStream &cmdStreamTask, size_t taskStartOffset, bool hasStallingCmds, - bool hasRelaxedOrderingDependencies, bool kernelOperation, bool requireTaskCountUpdate) { + bool hasRelaxedOrderingDependencies, NEO::AppendOperations appendOperation, bool requireTaskCountUpdate) { void *sshCpuPointer = nullptr; constexpr bool streamStatesSupported = true; - if (kernelOperation) { + if (appendOperation == NEO::AppendOperations::kernel) { handleHeapsAndResidencyForImmediateRegularTask(sshCpuPointer); } + NEO::LinearStream *optionalEpilogueCmdStream = nullptr; + NEO::ImmediateDispatchFlags dispatchFlags{ &this->requiredStreamState, // requiredState sshCpuPointer, // sshCpuBase + optionalEpilogueCmdStream, // optionalEpilogueCmdStream + appendOperation, // dispatchOperation this->isSyncModeQueue, // blockingAppend requireTaskCountUpdate, // requireTaskCountUpdate hasRelaxedOrderingDependencies, // hasRelaxedOrderingDependencies @@ -261,18 +269,22 @@ NEO::CompletionStamp CommandListCoreFamilyImmediate::flushImmedia template NEO::CompletionStamp CommandListCoreFamilyImmediate::flushImmediateRegularTaskStateless(NEO::LinearStream &cmdStreamTask, size_t taskStartOffset, bool hasStallingCmds, - bool hasRelaxedOrderingDependencies, bool kernelOperation, bool requireTaskCountUpdate) { + bool hasRelaxedOrderingDependencies, NEO::AppendOperations appendOperation, bool requireTaskCountUpdate) { void *sshCpuPointer = nullptr; constexpr bool streamStatesSupported = false; - if (kernelOperation) { + if (appendOperation == NEO::AppendOperations::kernel) { handleHeapsAndResidencyForImmediateRegularTask(sshCpuPointer); } + NEO::LinearStream *optionalEpilogueCmdStream = nullptr; + NEO::ImmediateDispatchFlags dispatchFlags{ nullptr, // requiredState sshCpuPointer, // sshCpuBase + optionalEpilogueCmdStream, // optionalEpilogueCmdStream + appendOperation, // dispatchOperation this->isSyncModeQueue, // blockingAppend requireTaskCountUpdate, // requireTaskCountUpdate hasRelaxedOrderingDependencies, // hasRelaxedOrderingDependencies @@ -287,7 +299,7 @@ NEO::CompletionStamp CommandListCoreFamilyImmediate::flushImmedia } template -NEO::CompletionStamp CommandListCoreFamilyImmediate::flushRegularTask(NEO::LinearStream &cmdStreamTask, size_t taskStartOffset, bool hasStallingCmds, bool hasRelaxedOrderingDependencies, bool kernelOperation, bool requireTaskCountUpdate) { +NEO::CompletionStamp CommandListCoreFamilyImmediate::flushRegularTask(NEO::LinearStream &cmdStreamTask, size_t taskStartOffset, bool hasStallingCmds, bool hasRelaxedOrderingDependencies, NEO::AppendOperations appendOperation, bool requireTaskCountUpdate) { auto csr = getCsr(false); NEO::DispatchFlags dispatchFlags( @@ -327,7 +339,7 @@ NEO::CompletionStamp CommandListCoreFamilyImmediate::flushRegular NEO::IndirectHeap *dsh = nullptr; NEO::IndirectHeap *ssh = nullptr; - if (kernelOperation) { + if (appendOperation == NEO::AppendOperations::kernel) { this->updateDispatchFlagsWithRequiredStreamState(dispatchFlags); csr->setRequiredScratchSizes(this->getCommandListPerThreadScratchSize(0u), this->getCommandListPerThreadScratchSize(1u)); @@ -396,12 +408,16 @@ NEO::CompletionStamp CommandListCoreFamilyImmediate::flushRegular } template -ze_result_t CommandListCoreFamilyImmediate::executeCommandListImmediateWithFlushTask(bool performMigration, bool hasStallingCmds, bool hasRelaxedOrderingDependencies, bool kernelOperation, bool copyOffloadSubmission, bool requireTaskCountUpdate) { - return executeCommandListImmediateWithFlushTaskImpl(performMigration, hasStallingCmds, hasRelaxedOrderingDependencies, kernelOperation, requireTaskCountUpdate, getCmdQImmediate(copyOffloadSubmission)); +ze_result_t CommandListCoreFamilyImmediate::executeCommandListImmediateWithFlushTask(bool performMigration, bool hasStallingCmds, bool hasRelaxedOrderingDependencies, NEO::AppendOperations appendOperation, + bool copyOffloadSubmission, bool requireTaskCountUpdate, + MutexLock *outerLock) { + return executeCommandListImmediateWithFlushTaskImpl(performMigration, hasStallingCmds, hasRelaxedOrderingDependencies, appendOperation, requireTaskCountUpdate, getCmdQImmediate(copyOffloadSubmission), outerLock); } template -inline ze_result_t CommandListCoreFamilyImmediate::executeCommandListImmediateWithFlushTaskImpl(bool performMigration, bool hasStallingCmds, bool hasRelaxedOrderingDependencies, bool kernelOperation, bool requireTaskCountUpdate, CommandQueue *cmdQ) { +inline ze_result_t CommandListCoreFamilyImmediate::executeCommandListImmediateWithFlushTaskImpl(bool performMigration, bool hasStallingCmds, bool hasRelaxedOrderingDependencies, NEO::AppendOperations appendOperation, + bool requireTaskCountUpdate, CommandQueue *cmdQ, + MutexLock *outerLock) { this->commandContainer.removeDuplicatesFromResidencyContainer(); auto commandStream = this->commandContainer.getCommandStream(); @@ -448,10 +464,10 @@ inline ze_result_t CommandListCoreFamilyImmediate::executeCommand NEO::CompletionStamp completionStamp; if (cmdQ->peekIsCopyOnlyCommandQueue()) { - completionStamp = flushBcsTask(*commandStream, commandStreamStart, hasStallingCmds, hasRelaxedOrderingDependencies, csr); + completionStamp = flushBcsTask(*commandStream, commandStreamStart, hasStallingCmds, hasRelaxedOrderingDependencies, requireTaskCountUpdate, appendOperation, csr); } else { this->registerCsrDcFlushForDcMitigation(*csr); - completionStamp = (this->*computeFlushMethod)(*commandStream, commandStreamStart, hasStallingCmds, hasRelaxedOrderingDependencies, kernelOperation, requireTaskCountUpdate); + completionStamp = (this->*computeFlushMethod)(*commandStream, commandStreamStart, hasStallingCmds, hasRelaxedOrderingDependencies, appendOperation, requireTaskCountUpdate); } if (completionStamp.taskCount > NEO::CompletionStamp::notReady) { @@ -513,7 +529,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendLaunchKernel( relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents, false); bool stallingCmdsForRelaxedOrdering = hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch); - checkAvailableSpace(numWaitEvents, relaxedOrderingDispatch, commonImmediateCommandSize); + checkAvailableSpace(numWaitEvents, relaxedOrderingDispatch, commonImmediateCommandSize, false); bool hostWait = waitForEventsFromHost(); if (hostWait) { this->synchronizeEventList(numWaitEvents, phWaitEvents); @@ -537,7 +553,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendLaunchKernel( CommandListCoreFamily::handleInOrderDependencyCounter(event, true, false); } - return flushImmediate(ret, true, stallingCmdsForRelaxedOrdering, relaxedOrderingDispatch, true, false, hSignalEvent, false); + return flushImmediate(ret, true, stallingCmdsForRelaxedOrdering, relaxedOrderingDispatch, NEO::AppendOperations::kernel, false, hSignalEvent, false, nullptr); } template @@ -554,7 +570,7 @@ void CommandListCoreFamilyImmediate::handleInOrderNonWalkerSignal if (event && event->isCounterBased()) { event->hostEventSetValue(Event::STATE_INITIAL); } - result = flushImmediate(result, true, hasStallingCmds, relaxedOrderingDispatch, true, false, nullptr, false); + result = flushImmediate(result, true, hasStallingCmds, relaxedOrderingDispatch, NEO::AppendOperations::kernel, false, nullptr, false, nullptr); NEO::RelaxedOrderingHelper::encodeRegistersBeforeDependencyCheckers(*this->commandContainer.getCommandStream(), isCopyOnly(false)); relaxedOrderingDispatch = true; hasStallingCmds = hasStallingCmdsForRelaxedOrdering(1, relaxedOrderingDispatch); @@ -570,12 +586,12 @@ ze_result_t CommandListCoreFamilyImmediate::appendLaunchKernelInd ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch) { relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents, false); - checkAvailableSpace(numWaitEvents, relaxedOrderingDispatch, commonImmediateCommandSize); + checkAvailableSpace(numWaitEvents, relaxedOrderingDispatch, commonImmediateCommandSize, false); auto ret = CommandListCoreFamily::appendLaunchKernelIndirect(kernelHandle, pDispatchArgumentsBuffer, hSignalEvent, numWaitEvents, phWaitEvents, relaxedOrderingDispatch); - return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, true, false, hSignalEvent, false); + return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, NEO::AppendOperations::kernel, false, hSignalEvent, false, nullptr); } template @@ -597,12 +613,12 @@ ze_result_t CommandListCoreFamilyImmediate::appendBarrier(ze_even isStallingOperation = hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch); } - checkAvailableSpace(numWaitEvents, false, commonImmediateCommandSize); + checkAvailableSpace(numWaitEvents, false, commonImmediateCommandSize, false); ret = CommandListCoreFamily::appendBarrier(hSignalEvent, numWaitEvents, phWaitEvents, relaxedOrderingDispatch); this->dependenciesPresent = true; - return flushImmediate(ret, true, isStallingOperation, relaxedOrderingDispatch, false, false, hSignalEvent, false); + return flushImmediate(ret, true, isStallingOperation, relaxedOrderingDispatch, NEO::AppendOperations::nonKernel, false, hSignalEvent, false, nullptr); } template @@ -622,7 +638,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendMemoryCopy( auto sizePerBlit = sizeof(typename GfxFamily::XY_COPY_BLT) + NEO::BlitCommandsHelper::estimatePostBlitCommandSize(); estimatedSize += nBlits * sizePerBlit; } - checkAvailableSpace(numWaitEvents, memoryCopyParams.relaxedOrderingDispatch, estimatedSize); + checkAvailableSpace(numWaitEvents, memoryCopyParams.relaxedOrderingDispatch, estimatedSize, false); bool hasStallindCmds = hasStallingCmdsForRelaxedOrdering(numWaitEvents, memoryCopyParams.relaxedOrderingDispatch); @@ -652,7 +668,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendMemoryCopy( numWaitEvents, phWaitEvents, memoryCopyParams); } - return flushImmediate(ret, true, hasStallindCmds, memoryCopyParams.relaxedOrderingDispatch, true, memoryCopyParams.copyOffloadAllowed, hSignalEvent, false); + return flushImmediate(ret, true, hasStallindCmds, memoryCopyParams.relaxedOrderingDispatch, NEO::AppendOperations::kernel, memoryCopyParams.copyOffloadAllowed, hSignalEvent, false, nullptr); } template @@ -678,7 +694,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendMemoryCopyRegio auto sizePerBlit = sizeof(typename GfxFamily::XY_COPY_BLT) + NEO::BlitCommandsHelper::estimatePostBlitCommandSize(); estimatedSize += xBlits * yBlits * zBlits * sizePerBlit; } - checkAvailableSpace(numWaitEvents, memoryCopyParams.relaxedOrderingDispatch, estimatedSize); + checkAvailableSpace(numWaitEvents, memoryCopyParams.relaxedOrderingDispatch, estimatedSize, false); bool hasStallindCmds = hasStallingCmdsForRelaxedOrdering(numWaitEvents, memoryCopyParams.relaxedOrderingDispatch); @@ -710,7 +726,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendMemoryCopyRegio hSignalEvent, numWaitEvents, phWaitEvents, memoryCopyParams); } - return flushImmediate(ret, true, hasStallindCmds, memoryCopyParams.relaxedOrderingDispatch, true, memoryCopyParams.copyOffloadAllowed, hSignalEvent, false); + return flushImmediate(ret, true, hasStallindCmds, memoryCopyParams.relaxedOrderingDispatch, NEO::AppendOperations::kernel, memoryCopyParams.copyOffloadAllowed, hSignalEvent, false, nullptr); } template @@ -721,11 +737,11 @@ ze_result_t CommandListCoreFamilyImmediate::appendMemoryFill(void ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch) { relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents, false); - checkAvailableSpace(numWaitEvents, relaxedOrderingDispatch, commonImmediateCommandSize); + checkAvailableSpace(numWaitEvents, relaxedOrderingDispatch, commonImmediateCommandSize, false); auto ret = CommandListCoreFamily::appendMemoryFill(ptr, pattern, patternSize, size, hSignalEvent, numWaitEvents, phWaitEvents, relaxedOrderingDispatch); - return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, true, false, hSignalEvent, false); + return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, NEO::AppendOperations::kernel, false, hSignalEvent, false, nullptr); } template @@ -735,18 +751,18 @@ ze_result_t CommandListCoreFamilyImmediate::appendSignalEvent(ze_ relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(0, false); bool hasStallingCmds = !Event::fromHandle(hSignalEvent)->isCounterBased() || hasStallingCmdsForRelaxedOrdering(0, relaxedOrderingDispatch); - checkAvailableSpace(0, false, commonImmediateCommandSize); + checkAvailableSpace(0, false, commonImmediateCommandSize, false); ret = CommandListCoreFamily::appendSignalEvent(hSignalEvent, relaxedOrderingDispatch); - return flushImmediate(ret, true, hasStallingCmds, relaxedOrderingDispatch, false, false, hSignalEvent, false); + return flushImmediate(ret, true, hasStallingCmds, relaxedOrderingDispatch, NEO::AppendOperations::nonKernel, false, hSignalEvent, false, nullptr); } template ze_result_t CommandListCoreFamilyImmediate::appendEventReset(ze_event_handle_t hSignalEvent) { ze_result_t ret = ZE_RESULT_SUCCESS; - checkAvailableSpace(0, false, commonImmediateCommandSize); + checkAvailableSpace(0, false, commonImmediateCommandSize, false); ret = CommandListCoreFamily::appendEventReset(hSignalEvent); - return flushImmediate(ret, true, true, false, false, false, hSignalEvent, false); + return flushImmediate(ret, true, true, false, NEO::AppendOperations::nonKernel, false, hSignalEvent, false, nullptr); } template @@ -754,7 +770,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendPageFaultCopy(N NEO::GraphicsAllocation *srcAllocation, size_t size, bool flushHost) { - checkAvailableSpace(0, false, commonImmediateCommandSize); + checkAvailableSpace(0, false, commonImmediateCommandSize, false); ze_result_t ret; @@ -776,7 +792,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendPageFaultCopy(N } else { ret = CommandListCoreFamily::appendPageFaultCopy(dstAllocation, srcAllocation, size, flushHost); } - return flushImmediate(ret, false, false, relaxedOrdering, true, false, nullptr, false); + return flushImmediate(ret, false, false, relaxedOrdering, NEO::AppendOperations::kernel, false, nullptr, false, nullptr); } template @@ -791,7 +807,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendWaitOnEvents(ui } if (!skipFlush) { - checkAvailableSpace(numEvents, false, commonImmediateCommandSize); + checkAvailableSpace(numEvents, false, commonImmediateCommandSize, false); } auto ret = CommandListCoreFamily::appendWaitOnEvents(numEvents, phWaitEvents, outWaitCmds, relaxedOrderingAllowed, trackDependencies, apiRequest, skipAddingWaitEventsToResidency, false, copyOffloadOperation); @@ -801,7 +817,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendWaitOnEvents(ui return ret; } - return flushImmediate(ret, true, true, false, false, false, nullptr, false); + return flushImmediate(ret, true, true, false, NEO::AppendOperations::nonKernel, false, nullptr, false, nullptr); } template @@ -809,11 +825,11 @@ ze_result_t CommandListCoreFamilyImmediate::appendWriteGlobalTime uint64_t *dstptr, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) { - checkAvailableSpace(numWaitEvents, false, commonImmediateCommandSize); + checkAvailableSpace(numWaitEvents, false, commonImmediateCommandSize, false); auto ret = CommandListCoreFamily::appendWriteGlobalTimestamp(dstptr, hSignalEvent, numWaitEvents, phWaitEvents); - return flushImmediate(ret, true, true, false, false, false, hSignalEvent, false); + return flushImmediate(ret, true, true, false, NEO::AppendOperations::nonKernel, false, hSignalEvent, false, nullptr); } template @@ -853,12 +869,12 @@ ze_result_t CommandListCoreFamilyImmediate::appendImageCopyRegion auto sizePerBlit = sizeof(typename GfxFamily::XY_BLOCK_COPY_BLT) + NEO::BlitCommandsHelper::estimatePostBlitCommandSize(); estimatedSize += nBlits * sizePerBlit; } - checkAvailableSpace(numWaitEvents, relaxedOrderingDispatch, estimatedSize); + checkAvailableSpace(numWaitEvents, relaxedOrderingDispatch, estimatedSize, false); auto ret = CommandListCoreFamily::appendImageCopyRegion(hDstImage, hSrcImage, pDstRegion, pSrcRegion, hSignalEvent, numWaitEvents, phWaitEvents, relaxedOrderingDispatch); - return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, true, false, hSignalEvent, false); + return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, NEO::AppendOperations::kernel, false, hSignalEvent, false, nullptr); } template @@ -871,12 +887,12 @@ ze_result_t CommandListCoreFamilyImmediate::appendImageCopyFromMe ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch) { relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents, false); - checkAvailableSpace(numWaitEvents, relaxedOrderingDispatch, commonImmediateCommandSize); + checkAvailableSpace(numWaitEvents, relaxedOrderingDispatch, commonImmediateCommandSize, false); auto ret = CommandListCoreFamily::appendImageCopyFromMemory(hDstImage, srcPtr, pDstRegion, hSignalEvent, numWaitEvents, phWaitEvents, relaxedOrderingDispatch); - return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, true, false, hSignalEvent, false); + return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, NEO::AppendOperations::kernel, false, hSignalEvent, false, nullptr); } template @@ -889,12 +905,12 @@ ze_result_t CommandListCoreFamilyImmediate::appendImageCopyToMemo ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch) { relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents, false); - checkAvailableSpace(numWaitEvents, relaxedOrderingDispatch, commonImmediateCommandSize); + checkAvailableSpace(numWaitEvents, relaxedOrderingDispatch, commonImmediateCommandSize, false); auto ret = CommandListCoreFamily::appendImageCopyToMemory(dstPtr, hSrcImage, pSrcRegion, hSignalEvent, numWaitEvents, phWaitEvents, relaxedOrderingDispatch); - return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, true, false, hSignalEvent, false); + return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, NEO::AppendOperations::kernel, false, hSignalEvent, false, nullptr); } template @@ -909,12 +925,12 @@ ze_result_t CommandListCoreFamilyImmediate::appendImageCopyFromMe ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch) { relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents, false); - checkAvailableSpace(numWaitEvents, relaxedOrderingDispatch, commonImmediateCommandSize); + checkAvailableSpace(numWaitEvents, relaxedOrderingDispatch, commonImmediateCommandSize, false); auto ret = CommandListCoreFamily::appendImageCopyFromMemoryExt(hDstImage, srcPtr, pDstRegion, srcRowPitch, srcSlicePitch, hSignalEvent, numWaitEvents, phWaitEvents, relaxedOrderingDispatch); - return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, true, false, hSignalEvent, false); + return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, NEO::AppendOperations::kernel, false, hSignalEvent, false, nullptr); } template @@ -929,12 +945,12 @@ ze_result_t CommandListCoreFamilyImmediate::appendImageCopyToMemo ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch) { relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents, false); - checkAvailableSpace(numWaitEvents, relaxedOrderingDispatch, commonImmediateCommandSize); + checkAvailableSpace(numWaitEvents, relaxedOrderingDispatch, commonImmediateCommandSize, false); auto ret = CommandListCoreFamily::appendImageCopyToMemoryExt(dstPtr, hSrcImage, pSrcRegion, destRowPitch, destSlicePitch, hSignalEvent, numWaitEvents, phWaitEvents, relaxedOrderingDispatch); - return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, true, false, hSignalEvent, false); + return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch), relaxedOrderingDispatch, NEO::AppendOperations::kernel, false, hSignalEvent, false, nullptr); } template @@ -944,24 +960,24 @@ ze_result_t CommandListCoreFamilyImmediate::appendMemoryRangesBar ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) { - checkAvailableSpace(numWaitEvents, false, commonImmediateCommandSize); + checkAvailableSpace(numWaitEvents, false, commonImmediateCommandSize, false); auto ret = CommandListCoreFamily::appendMemoryRangesBarrier(numRanges, pRangeSizes, pRanges, hSignalEvent, numWaitEvents, phWaitEvents); - return flushImmediate(ret, true, true, false, false, false, hSignalEvent, false); + return flushImmediate(ret, true, true, false, NEO::AppendOperations::nonKernel, false, hSignalEvent, false, nullptr); } template ze_result_t CommandListCoreFamilyImmediate::appendWaitOnMemory(void *desc, void *ptr, uint64_t data, ze_event_handle_t signalEventHandle, bool useQwordData) { - checkAvailableSpace(0, false, commonImmediateCommandSize); + checkAvailableSpace(0, false, commonImmediateCommandSize, false); auto ret = CommandListCoreFamily::appendWaitOnMemory(desc, ptr, data, signalEventHandle, useQwordData); - return flushImmediate(ret, true, false, false, false, false, signalEventHandle, false); + return flushImmediate(ret, true, false, false, NEO::AppendOperations::nonKernel, false, signalEventHandle, false, nullptr); } template ze_result_t CommandListCoreFamilyImmediate::appendWriteToMemory(void *desc, void *ptr, uint64_t data) { - checkAvailableSpace(0, false, commonImmediateCommandSize); + checkAvailableSpace(0, false, commonImmediateCommandSize, false); auto ret = CommandListCoreFamily::appendWriteToMemory(desc, ptr, data); - return flushImmediate(ret, true, false, false, false, false, nullptr, false); + return flushImmediate(ret, true, false, false, NEO::AppendOperations::nonKernel, false, nullptr, false, nullptr); } template @@ -969,7 +985,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendWaitExternalSem const ze_intel_external_semaphore_wait_params_exp_t *params, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) { - checkAvailableSpace(0, false, commonImmediateCommandSize); + checkAvailableSpace(0, false, commonImmediateCommandSize, false); auto ret = ZE_RESULT_SUCCESS; if (numWaitEvents) { @@ -1014,7 +1030,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendSignalExternalS const ze_intel_external_semaphore_signal_params_exp_t *params, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) { - checkAvailableSpace(0, false, commonImmediateCommandSize); + checkAvailableSpace(0, false, commonImmediateCommandSize, false); auto ret = ZE_RESULT_SUCCESS; if (numWaitEvents) { @@ -1157,9 +1173,9 @@ CommandQueue *CommandListCoreFamilyImmediate::getCmdQImmediate(bo } template -ze_result_t CommandListCoreFamilyImmediate::flushImmediate(ze_result_t inputRet, bool performMigration, bool hasStallingCmds, - bool hasRelaxedOrderingDependencies, bool kernelOperation, bool copyOffloadSubmission, ze_event_handle_t hSignalEvent, - bool requireTaskCountUpdate) { +ze_result_t CommandListCoreFamilyImmediate::flushImmediate(ze_result_t inputRet, bool performMigration, bool hasStallingCmds, bool hasRelaxedOrderingDependencies, + NEO::AppendOperations appendOperation, bool copyOffloadSubmission, ze_event_handle_t hSignalEvent, bool requireTaskCountUpdate, + MutexLock *outerLock) { auto signalEvent = Event::fromHandle(hSignalEvent); auto queue = getCmdQImmediate(copyOffloadSubmission); @@ -1174,7 +1190,7 @@ ze_result_t CommandListCoreFamilyImmediate::flushImmediate(ze_res if (signalEvent && (NEO::debugManager.flags.TrackNumCsrClientsOnSyncPoints.get() != 0)) { signalEvent->setLatestUsedCmdQueue(queue); } - inputRet = executeCommandListImmediateWithFlushTask(performMigration, hasStallingCmds, hasRelaxedOrderingDependencies, kernelOperation, copyOffloadSubmission, requireTaskCountUpdate); + inputRet = executeCommandListImmediateWithFlushTask(performMigration, hasStallingCmds, hasRelaxedOrderingDependencies, appendOperation, copyOffloadSubmission, requireTaskCountUpdate, outerLock); } else { inputRet = executeCommandListImmediate(performMigration); } @@ -1255,7 +1271,7 @@ ze_result_t CommandListCoreFamilyImmediate::flushInOrderCounterSi this->appendSignalInOrderDependencyCounter(nullptr, false, true); this->inOrderExecInfo->addCounterValue(this->getInOrderIncrementValue()); this->handleInOrderCounterOverflow(false); - ret = flushImmediate(ret, false, true, false, false, false, nullptr, false); + ret = flushImmediate(ret, false, true, false, NEO::AppendOperations::nonKernel, false, nullptr, false, nullptr); } return ret; } @@ -1660,7 +1676,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendCommandLists(ui ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) { auto ret = ZE_RESULT_SUCCESS; - checkAvailableSpace(numWaitEvents, false, commonImmediateCommandSize); + checkAvailableSpace(numWaitEvents, false, commonImmediateCommandSize, false); if (numWaitEvents) { ret = this->appendWaitOnEvents(numWaitEvents, phWaitEvents, nullptr, false, true, true, true, true, false); } @@ -1685,7 +1701,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendCommandLists(ui } bool hasStallingCmds = true; - return flushImmediate(ret, true, hasStallingCmds, relaxedOrderingDispatch, true, false, hSignalEvent, true); + return flushImmediate(ret, true, hasStallingCmds, relaxedOrderingDispatch, NEO::AppendOperations::kernel, false, hSignalEvent, true, nullptr); } } // namespace L0 diff --git a/level_zero/core/source/device/bcs_split.h b/level_zero/core/source/device/bcs_split.h index 86e24c750b..01238c9d47 100644 --- a/level_zero/core/source/device/bcs_split.h +++ b/level_zero/core/source/device/bcs_split.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022-2024 Intel Corporation + * Copyright (C) 2022-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -121,7 +121,7 @@ struct BcsSplit { result = appendCall(localDstPtr, localSrcPtr, localSize, eventHandle); if (cmdList->flushTaskSubmissionEnabled()) { - cmdList->executeCommandListImmediateWithFlushTaskImpl(performMigration, hasStallingCmds, hasRelaxedOrderingDependencies, false, false, cmdQsForSplit[i]); + cmdList->executeCommandListImmediateWithFlushTaskImpl(performMigration, hasStallingCmds, hasRelaxedOrderingDependencies, NEO::AppendOperations::nonKernel, false, cmdQsForSplit[i], nullptr); } else { cmdList->executeCommandListImmediateImpl(performMigration, cmdQsForSplit[i]); } diff --git a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h index 2c662a2a1d..8e80dd8f91 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h +++ b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h @@ -718,10 +718,12 @@ class MockCommandListImmediateHw : public WhiteBox<::L0::CommandListCoreFamilyIm return executeCommandListImmediateReturnValue; } - ze_result_t executeCommandListImmediateWithFlushTask(bool performMigration, bool hasStallingCmds, bool hasRelaxedOrderingDependencies, bool kernelOperation, bool copyOffloadSubmission, bool requireTaskCountUpdate) override { + ze_result_t executeCommandListImmediateWithFlushTask(bool performMigration, bool hasStallingCmds, bool hasRelaxedOrderingDependencies, NEO::AppendOperations appendOperation, + bool copyOffloadSubmission, bool requireTaskCountUpdate, + MutexLock *outerLock) override { ++executeCommandListImmediateWithFlushTaskCalledCount; if (callBaseExecute) { - return BaseClass::executeCommandListImmediateWithFlushTask(performMigration, hasStallingCmds, hasRelaxedOrderingDependencies, kernelOperation, copyOffloadSubmission, requireTaskCountUpdate); + return BaseClass::executeCommandListImmediateWithFlushTask(performMigration, hasStallingCmds, hasRelaxedOrderingDependencies, appendOperation, copyOffloadSubmission, requireTaskCountUpdate, outerLock); } return executeCommandListImmediateWithFlushTaskReturnValue; } diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_5.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_5.cpp index 8f0e8575c6..f555b4c9e0 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_5.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_5.cpp @@ -929,7 +929,7 @@ HWTEST2_F(CommandListCreate, givenSecondaryCommandStreamForImmediateCmdListWhenC auto immediateCmdList = static_cast *>(commandList.get()); auto secondaryCmdStream = reinterpret_cast(&commandList->getCmdContainer())->secondaryCommandStreamForImmediateCmdList.get(); - immediateCmdList->checkAvailableSpace(0u, false, commonImmediateCommandSize); + immediateCmdList->checkAvailableSpace(0u, false, commonImmediateCommandSize, false); EXPECT_EQ(commandList->getCmdContainer().getCommandStream(), secondaryCmdStream); EXPECT_TRUE(MemoryPoolHelper::isSystemMemoryPool(commandList->getCmdContainer().getCommandStream()->getGraphicsAllocation()->getMemoryPool())); @@ -958,7 +958,7 @@ HWTEST2_F(CommandListCreate, givenNoSecondaryCommandStreamForImmediateCmdListWhe auto immediateCmdList = static_cast *>(commandList.get()); auto cmdStream = commandList->getCmdContainer().getCommandStream(); - immediateCmdList->checkAvailableSpace(0u, false, commonImmediateCommandSize); + immediateCmdList->checkAvailableSpace(0u, false, commonImmediateCommandSize, false); EXPECT_EQ(commandList->getCmdContainer().getCommandStream(), cmdStream); EXPECT_FALSE(MemoryPoolHelper::isSystemMemoryPool(commandList->getCmdContainer().getCommandStream()->getGraphicsAllocation()->getMemoryPool())); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_6.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_6.cpp index a3fec615e2..a889ffeecf 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_6.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_6.cpp @@ -128,7 +128,7 @@ HWTEST2_F(CommandListExecuteImmediate, whenExecutingCommandListImmediateWithFlus commandListImmediate.requiredStreamState.stateComputeMode.isCoherencyRequired.value = 0; commandListImmediate.requiredStreamState.stateComputeMode.largeGrfMode.value = 1; commandListImmediate.requiredStreamState.stateComputeMode.threadArbitrationPolicy.value = NEO::ThreadArbitrationPolicy::RoundRobin; - commandListImmediate.executeCommandListImmediateWithFlushTask(false, false, false, true, false, false); + commandListImmediate.executeCommandListImmediateWithFlushTask(false, false, false, NEO::AppendOperations::kernel, false, false, nullptr); NEO::StateComputeModePropertiesSupport scmPropertiesSupport = {}; productHelper.fillScmPropertiesSupportStructure(scmPropertiesSupport); @@ -158,7 +158,7 @@ HWTEST2_F(CommandListExecuteImmediate, whenExecutingCommandListImmediateWithFlus commandListImmediate.requiredStreamState.stateComputeMode.isCoherencyRequired.value = 0; commandListImmediate.requiredStreamState.stateComputeMode.largeGrfMode.value = 0; commandListImmediate.requiredStreamState.stateComputeMode.threadArbitrationPolicy.value = NEO::ThreadArbitrationPolicy::AgeBased; - commandListImmediate.executeCommandListImmediateWithFlushTask(false, false, false, true, false, false); + commandListImmediate.executeCommandListImmediateWithFlushTask(false, false, false, NEO::AppendOperations::kernel, false, false, nullptr); expectedLargeGrfMode = scmPropertiesSupport.largeGrfMode ? 0 : -1; expectedThreadArbitrationPolicy = scmPropertiesSupport.threadArbitrationPolicy ? NEO::ThreadArbitrationPolicy::AgeBased : -1; @@ -196,7 +196,7 @@ HWTEST2_F(CommandListExecuteImmediate, whenExecutingCommandListImmediateWithFlus commandList.reset(CommandList::createImmediate(productFamily, device, &desc, false, NEO::EngineGroupType::renderCompute, returnValue)); auto &commandListImmediate = static_cast &>(*commandList); commandListImmediate.containsAnyKernel = true; - commandListImmediate.executeCommandListImmediateWithFlushTask(false, false, false, true, false, false); + commandListImmediate.executeCommandListImmediateWithFlushTask(false, false, false, NEO::AppendOperations::kernel, false, false, nullptr); EXPECT_FALSE(commandListImmediate.containsAnyKernel); } @@ -209,7 +209,7 @@ HWTEST2_F(CommandListExecuteImmediate, whenExecutingCommandListImmediateWithFlus commandList.reset(CommandList::createImmediate(productFamily, device, &desc, false, NEO::EngineGroupType::renderCompute, returnValue)); auto &commandListImmediate = static_cast &>(*commandList); - EXPECT_EQ(ZE_RESULT_SUCCESS, commandListImmediate.executeCommandListImmediateWithFlushTask(false, false, false, false, false, false)); + EXPECT_EQ(ZE_RESULT_SUCCESS, commandListImmediate.executeCommandListImmediateWithFlushTask(false, false, false, NEO::AppendOperations::nonKernel, false, false, nullptr)); } HWTEST2_F(CommandListExecuteImmediate, givenOutOfHostMemoryErrorOnFlushWhenExecutingCommandListImmediateWithFlushTaskThenProperErrorIsReturned, MatchAny) { @@ -221,7 +221,7 @@ HWTEST2_F(CommandListExecuteImmediate, givenOutOfHostMemoryErrorOnFlushWhenExecu auto &commandStreamReceiver = neoDevice->getUltCommandStreamReceiver(); commandStreamReceiver.flushReturnValue = SubmissionStatus::outOfHostMemory; - EXPECT_EQ(ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY, commandListImmediate.executeCommandListImmediateWithFlushTask(false, false, false, false, false, false)); + EXPECT_EQ(ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY, commandListImmediate.executeCommandListImmediateWithFlushTask(false, false, false, NEO::AppendOperations::nonKernel, false, false, nullptr)); } HWTEST2_F(CommandListExecuteImmediate, givenOutOfDeviceMemoryErrorOnFlushWhenExecutingCommandListImmediateWithFlushTaskThenProperErrorIsReturned, MatchAny) { @@ -233,7 +233,7 @@ HWTEST2_F(CommandListExecuteImmediate, givenOutOfDeviceMemoryErrorOnFlushWhenExe auto &commandStreamReceiver = neoDevice->getUltCommandStreamReceiver(); commandStreamReceiver.flushReturnValue = SubmissionStatus::outOfMemory; - EXPECT_EQ(ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY, commandListImmediate.executeCommandListImmediateWithFlushTask(false, false, false, false, false, false)); + EXPECT_EQ(ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY, commandListImmediate.executeCommandListImmediateWithFlushTask(false, false, false, NEO::AppendOperations::nonKernel, false, false, nullptr)); } HWTEST2_F(CommandListExecuteImmediate, GivenImmediateCommandListWhenCommandListIsCreatedThenCsrStateIsNotSet, MatchAny) { @@ -549,7 +549,7 @@ HWTEST2_F(CommandListTest, givenImmediateCommandListWhenFlushImmediateThenOverri auto event = std::unique_ptr(static_cast(L0::Event::create(eventPool.get(), &eventDesc, device))); event->csrs[0] = &mockCommandStreamReceiver; - cmdList.flushImmediate(ZE_RESULT_SUCCESS, false, false, false, false, false, event->toHandle(), false); + cmdList.flushImmediate(ZE_RESULT_SUCCESS, false, false, false, NEO::AppendOperations::nonKernel, false, event->toHandle(), false, nullptr); EXPECT_EQ(event->csrs[0], cmdList.getCsr(false)); } @@ -1247,7 +1247,7 @@ HWTEST2_F(CommandListTest, givenCmdListWithIndirectAccessWhenExecutingCommandLis auto oldCommandQueue = commandListImmediate.cmdQImmediate; commandListImmediate.cmdQImmediate = &mockCommandQueue; commandListImmediate.indirectAllocationsAllowed = true; - commandListImmediate.executeCommandListImmediateWithFlushTask(false, false, false, false, false, false); + commandListImmediate.executeCommandListImmediateWithFlushTask(false, false, false, NEO::AppendOperations::nonKernel, false, false, nullptr); EXPECT_EQ(mockCommandQueue.handleIndirectAllocationResidencyCalledTimes, 1u); commandListImmediate.cmdQImmediate = oldCommandQueue; } @@ -1266,7 +1266,7 @@ HWTEST2_F(CommandListTest, givenCmdListWithNoIndirectAccessWhenExecutingCommandL auto oldCommandQueue = commandListImmediate.cmdQImmediate; commandListImmediate.cmdQImmediate = &mockCommandQueue; commandListImmediate.indirectAllocationsAllowed = false; - commandListImmediate.executeCommandListImmediateWithFlushTask(false, false, false, false, false, false); + commandListImmediate.executeCommandListImmediateWithFlushTask(false, false, false, NEO::AppendOperations::nonKernel, false, false, nullptr); EXPECT_EQ(mockCommandQueue.handleIndirectAllocationResidencyCalledTimes, 0u); commandListImmediate.cmdQImmediate = oldCommandQueue; } diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_7.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_7.cpp index 28d80d61bb..3c4f47c6ea 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_7.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_7.cpp @@ -1896,12 +1896,12 @@ HWTEST2_F(CommandListCreate, givenImmediateCommandListWhenThereIsNoEnoughSpaceFo commandList->getCmdContainer().getCommandStream()->getGraphicsAllocation()->updateTaskCount(0u, 0u); commandList->getCmdContainer().getCommandStream()->getSpace(useSize); - reinterpret_cast *>(commandList.get())->checkAvailableSpace(0, false, commonImmediateCommandSize); + reinterpret_cast *>(commandList.get())->checkAvailableSpace(0, false, commonImmediateCommandSize, false); EXPECT_EQ(1U, commandList->getCmdContainer().getCmdBufferAllocations().size()); commandList->getCmdContainer().getCommandStream()->getSpace(useSize); auto latestFlushedTaskCount = whiteBoxCmdList->getCsr(false)->peekLatestFlushedTaskCount(); - reinterpret_cast *>(commandList.get())->checkAvailableSpace(0, false, commonImmediateCommandSize); + reinterpret_cast *>(commandList.get())->checkAvailableSpace(0, false, commonImmediateCommandSize, false); EXPECT_EQ(1U, commandList->getCmdContainer().getCmdBufferAllocations().size()); EXPECT_EQ(latestFlushedTaskCount + 1, whiteBoxCmdList->getCsr(false)->peekLatestFlushedTaskCount()); } @@ -1923,12 +1923,12 @@ HWTEST2_F(CommandListCreate, givenImmediateCommandListWhenThereIsNoEnoughSpaceFo commandList->getCmdContainer().getCommandStream()->getGraphicsAllocation()->updateTaskCount(0u, 0u); commandList->getCmdContainer().getCommandStream()->getSpace(useSize); - reinterpret_cast *>(commandList.get())->checkAvailableSpace(numEvents, false, commonImmediateCommandSize); + reinterpret_cast *>(commandList.get())->checkAvailableSpace(numEvents, false, commonImmediateCommandSize, false); EXPECT_EQ(1U, commandList->getCmdContainer().getCmdBufferAllocations().size()); commandList->getCmdContainer().getCommandStream()->getSpace(useSize); auto latestFlushedTaskCount = whiteBoxCmdList->getCsr(false)->peekLatestFlushedTaskCount(); - reinterpret_cast *>(commandList.get())->checkAvailableSpace(numEvents, false, commonImmediateCommandSize); + reinterpret_cast *>(commandList.get())->checkAvailableSpace(numEvents, false, commonImmediateCommandSize, false); EXPECT_EQ(1U, commandList->getCmdContainer().getCmdBufferAllocations().size()); EXPECT_EQ(latestFlushedTaskCount + 1, whiteBoxCmdList->getCsr(false)->peekLatestFlushedTaskCount()); } diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_signal_event.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_signal_event.cpp index b6ec1bd1e8..3aa421f3a2 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_signal_event.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_signal_event.cpp @@ -664,7 +664,7 @@ HWTEST2_F(CommandListAppendUsedPacketSignalEvent, event->signalScope = ZE_EVENT_SCOPE_FLAG_HOST; commandList->partitionCount = packets; - commandList->checkAvailableSpace(0, false, commonImmediateCommandSize); + commandList->checkAvailableSpace(0, false, commonImmediateCommandSize, false); commandList->appendSignalEventPostWalker(event.get(), nullptr, nullptr, false, false, false); EXPECT_EQ(packets, event->getPacketsInUse()); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_wait_on_events.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_wait_on_events.cpp index b1beccc213..78ed0bb215 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_wait_on_events.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_wait_on_events.cpp @@ -277,10 +277,12 @@ class MockCommandListImmediateHwWithWaitEventFail : public WhiteBox<::L0::Comman return executeCommandListImmediateReturnValue; } - ze_result_t executeCommandListImmediateWithFlushTask(bool performMigration, bool hasStallingCmds, bool hasRelaxedOrderingDependencies, bool kernelOperation, bool copyOffloadSubmission, bool requireTaskCountUpdate) override { + ze_result_t executeCommandListImmediateWithFlushTask(bool performMigration, bool hasStallingCmds, bool hasRelaxedOrderingDependencies, NEO::AppendOperations appendOperation, + bool copyOffloadSubmission, bool requireTaskCountUpdate, + MutexLock *outerLock) override { ++executeCommandListImmediateWithFlushTaskCalledCount; if (callBaseExecute) { - return BaseClass::executeCommandListImmediateWithFlushTask(performMigration, hasStallingCmds, hasRelaxedOrderingDependencies, kernelOperation, copyOffloadSubmission, requireTaskCountUpdate); + return BaseClass::executeCommandListImmediateWithFlushTask(performMigration, hasStallingCmds, hasRelaxedOrderingDependencies, appendOperation, copyOffloadSubmission, requireTaskCountUpdate, outerLock); } return executeCommandListImmediateWithFlushTaskReturnValue; } diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_1.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_1.cpp index f31a3407ab..86075ee6d5 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_1.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_1.cpp @@ -2769,7 +2769,9 @@ HWTEST2_F(InOrderCmdListTests, givenRelaxedOrderingWhenProgrammingTimestampEvent using BaseClass = WhiteBox>; using BaseClass::BaseClass; - ze_result_t flushImmediate(ze_result_t inputRet, bool performMigration, bool hasStallingCmds, bool hasRelaxedOrderingDependencies, bool kernelOperation, bool copyOffloadSubmission, ze_event_handle_t hSignalEvent, bool requireTaskCountUpdate) override { + ze_result_t flushImmediate(ze_result_t inputRet, bool performMigration, bool hasStallingCmds, bool hasRelaxedOrderingDependencies, + NEO::AppendOperations appendOperation, bool copyOffloadSubmission, ze_event_handle_t hSignalEvent, bool requireTaskCountUpdate, + MutexLock *outerLock) override { flushData.push_back(this->cmdListCurrentStartOffset); this->cmdListCurrentStartOffset = this->commandContainer.getCommandStream()->getUsed(); @@ -2900,7 +2902,9 @@ HWTEST2_F(InOrderCmdListTests, givenRelaxedOrderingWhenProgrammingTimestampEvent return ret; } - ze_result_t flushImmediate(ze_result_t inputRet, bool performMigration, bool hasStallingCmds, bool hasRelaxedOrderingDependencies, bool kernelOperation, bool copyOffloadSubmission, ze_event_handle_t hSignalEvent, bool requireTaskCountUpdate) override { + ze_result_t flushImmediate(ze_result_t inputRet, bool performMigration, bool hasStallingCmds, bool hasRelaxedOrderingDependencies, + NEO::AppendOperations appendOperation, bool copyOffloadSubmission, ze_event_handle_t hSignalEvent, bool requireTaskCountUpdate, + MutexLock *outerLock) override { auto hostAddr = reinterpret_cast(usedEvent->getCompletionFieldHostAddress()); eventCompletionData.push_back(*hostAddr); @@ -2965,7 +2969,9 @@ HWTEST2_F(InOrderCmdListTests, givenRegularNonTimestampEventWhenSkipItsConverted using BaseClass = WhiteBox>; using BaseClass::BaseClass; - ze_result_t flushImmediate(ze_result_t inputRet, bool performMigration, bool hasStallingCmds, bool hasRelaxedOrderingDependencies, bool kernelOperation, bool copyOffloadSubmission, ze_event_handle_t hSignalEvent, bool requireTaskCountUpdate) override { + ze_result_t flushImmediate(ze_result_t inputRet, bool performMigration, bool hasStallingCmds, bool hasRelaxedOrderingDependencies, + NEO::AppendOperations appendOperation, bool copyOffloadSubmission, ze_event_handle_t hSignalEvent, bool requireTaskCountUpdate, + MutexLock *outerLock) override { flushCounter++; this->cmdListCurrentStartOffset = this->commandContainer.getCommandStream()->getUsed(); @@ -3016,7 +3022,9 @@ HWTEST2_F(InOrderCmdListTests, givenDebugFlagSetWhenChainingWithRelaxedOrderingT using BaseClass = WhiteBox>; using BaseClass::BaseClass; - ze_result_t flushImmediate(ze_result_t inputRet, bool performMigration, bool hasStallingCmds, bool hasRelaxedOrderingDependencies, bool kernelOperation, bool copyOffloadSubmission, ze_event_handle_t hSignalEvent, bool requireTaskCountUpdate) override { + ze_result_t flushImmediate(ze_result_t inputRet, bool performMigration, bool hasStallingCmds, bool hasRelaxedOrderingDependencies, + NEO::AppendOperations appendOperation, bool copyOffloadSubmission, ze_event_handle_t hSignalEvent, bool requireTaskCountUpdate, + MutexLock *outerLock) override { flushCount++; return ZE_RESULT_SUCCESS; diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_2.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_2.cpp index b9af0f83a1..1a8d9f4a9f 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_2.cpp @@ -627,7 +627,9 @@ HWTEST2_F(CopyOffloadInOrderTests, givenCopyOperationWithHostVisibleEventThenMar HWTEST2_F(CopyOffloadInOrderTests, givenRelaxedOrderingEnabledWhenDispatchingThenUseCorrectCsr, IsAtLeastXeHpcCore) { class MyMockCmdList : public WhiteBox> { public: - ze_result_t flushImmediate(ze_result_t inputRet, bool performMigration, bool hasStallingCmds, bool hasRelaxedOrderingDependencies, bool kernelOperation, bool copyOffloadSubmission, ze_event_handle_t hSignalEvent, bool requireTaskCountUpdate) override { + ze_result_t flushImmediate(ze_result_t inputRet, bool performMigration, bool hasStallingCmds, bool hasRelaxedOrderingDependencies, + NEO::AppendOperations appendOperation, bool copyOffloadSubmission, ze_event_handle_t hSignalEvent, bool requireTaskCountUpdate, + MutexLock *outerLock) override { latestRelaxedOrderingMode = hasRelaxedOrderingDependencies; return ZE_RESULT_SUCCESS; diff --git a/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdlist_xe_hpc_core.cpp b/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdlist_xe_hpc_core.cpp index 8016eede1e..401d9fa50b 100644 --- a/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdlist_xe_hpc_core.cpp +++ b/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdlist_xe_hpc_core.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021-2024 Intel Corporation + * Copyright (C) 2021-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -249,7 +249,7 @@ HWTEST2_F(CommandListStatePrefetchXeHpcCore, givenForceMemoryPrefetchForKmdMigra std::unique_ptr commandList(CommandList::createImmediate(productFamily, device, &desc, false, NEO::EngineGroupType::renderCompute, returnValue)); auto &commandListImmediate = static_cast &>(*commandList); - result = commandListImmediate.executeCommandListImmediateWithFlushTask(false, false, false, false, false, false); + result = commandListImmediate.executeCommandListImmediateWithFlushTask(false, false, false, NEO::AppendOperations::nonKernel, false, false, nullptr); EXPECT_EQ(ZE_RESULT_SUCCESS, result); auto mockMemoryManager = reinterpret_cast(neoDevice->getMemoryManager()); diff --git a/shared/source/command_stream/command_stream_receiver_hw.h b/shared/source/command_stream/command_stream_receiver_hw.h index fdd5574055..418dd64402 100644 --- a/shared/source/command_stream/command_stream_receiver_hw.h +++ b/shared/source/command_stream/command_stream_receiver_hw.h @@ -39,6 +39,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver { bool stateBaseAddressFullConfigurationNeeded = false; bool stateBaseAddressDirty = false; bool contextOneTimeInit = false; + bool stateCacheFlushRequired = false; }; public: diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index 4dfe38c9ac..621f3b3a62 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -329,8 +329,8 @@ CompletionStamp CommandStreamReceiverHw::flushImmediateTask( handleImmediateFlushStateBaseAddressState(dispatchFlags, flushData, device); handleImmediateFlushOneTimeContextInitState(dispatchFlags, flushData, device); - bool stateCacheFlushRequired = device.getBindlessHeapsHelper() ? device.getBindlessHeapsHelper()->getStateDirtyForContext(getOsContext().getContextId()) : false; - if (stateCacheFlushRequired) { + flushData.stateCacheFlushRequired = device.getBindlessHeapsHelper() ? device.getBindlessHeapsHelper()->getStateDirtyForContext(getOsContext().getContextId()) : false; + if (flushData.stateCacheFlushRequired) { flushData.estimatedSize += MemorySynchronizationCommands::getSizeForFullCacheFlush(); } @@ -344,7 +344,7 @@ CompletionStamp CommandStreamReceiverHw::flushImmediateTask( auto &csrCommandStream = getCS(flushData.estimatedSize); flushData.csrStartOffset = csrCommandStream.getUsed(); - if (stateCacheFlushRequired) { + if (flushData.stateCacheFlushRequired) { device.getBindlessHeapsHelper()->clearStateDirtyForContext(getOsContext().getContextId()); MemorySynchronizationCommands::addStateCacheFlush(csrCommandStream, device.getRootDeviceEnvironment()); } diff --git a/shared/source/command_stream/csr_definitions.h b/shared/source/command_stream/csr_definitions.h index e9ab9d9d64..cc689fba19 100644 --- a/shared/source/command_stream/csr_definitions.h +++ b/shared/source/command_stream/csr_definitions.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2024 Intel Corporation + * Copyright (C) 2018-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -13,6 +13,7 @@ #include "shared/source/command_stream/preemption_mode.h" #include "shared/source/command_stream/queue_throttle.h" #include "shared/source/command_stream/thread_arbitration_policy.h" +#include "shared/source/helpers/append_operations.h" #include "shared/source/helpers/pipeline_select_args.h" #include "shared/source/kernel/grf_config.h" #include "shared/source/kernel/kernel_execution_type.h" @@ -20,6 +21,7 @@ #include namespace NEO { +class LinearStream; struct FlushStampTrackingObj; struct StreamProperties; @@ -44,6 +46,8 @@ struct DispatchBcsFlags { DispatchBcsFlags(bool flushTaskCount, bool hasStallingCmds, bool hasRelaxedOrderingDependencies) : flushTaskCount(flushTaskCount), hasStallingCmds(hasStallingCmds), hasRelaxedOrderingDependencies(hasRelaxedOrderingDependencies) {} + LinearStream *optionalEpilogueCmdStream = nullptr; + AppendOperations dispatchOperation = AppendOperations::none; bool flushTaskCount = false; bool hasStallingCmds = false; bool hasRelaxedOrderingDependencies = false; @@ -136,6 +140,8 @@ struct CsrSizeRequestFlags { struct ImmediateDispatchFlags { StreamProperties *requiredState = nullptr; void *sshCpuBase = nullptr; + LinearStream *optionalEpilogueCmdStream = nullptr; + AppendOperations dispatchOperation = AppendOperations::none; bool blockingAppend = false; bool requireTaskCountUpdate = false; bool hasRelaxedOrderingDependencies = false; diff --git a/shared/source/helpers/CMakeLists.txt b/shared/source/helpers/CMakeLists.txt index 892affcd93..00f86b63e5 100644 --- a/shared/source/helpers/CMakeLists.txt +++ b/shared/source/helpers/CMakeLists.txt @@ -13,10 +13,11 @@ set(NEO_CORE_HELPERS ${CMAKE_CURRENT_SOURCE_DIR}/affinity_mask.h ${CMAKE_CURRENT_SOURCE_DIR}/aligned_memory.h ${CMAKE_CURRENT_SOURCE_DIR}/api_gfx_core_helper.h + ${CMAKE_CURRENT_SOURCE_DIR}/api_specific_config.h ${CMAKE_CURRENT_SOURCE_DIR}/app_resource_defines.h ${CMAKE_CURRENT_SOURCE_DIR}/app_resource_helper.h ${CMAKE_CURRENT_SOURCE_DIR}/app_resource_helper.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/api_specific_config.h + ${CMAKE_CURRENT_SOURCE_DIR}/append_operations.h ${CMAKE_CURRENT_SOURCE_DIR}/array_count.h ${CMAKE_CURRENT_SOURCE_DIR}/aux_translation.h ${CMAKE_CURRENT_SOURCE_DIR}/basic_math.h diff --git a/shared/source/helpers/append_operations.h b/shared/source/helpers/append_operations.h new file mode 100644 index 0000000000..cbf9c9db15 --- /dev/null +++ b/shared/source/helpers/append_operations.h @@ -0,0 +1,19 @@ +/* + * Copyright (C) 2025 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#pragma once + +namespace NEO { + +enum AppendOperations { + none = 0, + kernel = 1u, + nonKernel = 2u, + cmdList = 3u +}; + +} // namespace NEO