Share appendEventForProfilingAllWalkers() and programEventL3Flush() across core families.

* cmdlist_hw.h: declare CommandListCoreFamily::singleEventPacketRequired() and
  CommandListCoreFamily::programEventL3Flush().
* cmdlist_hw.inl: add the common appendEventForProfilingAllWalkers() and the
  member programEventL3Flush(). The member takes the root device environment
  from device->getNEODevice() where the removed free function used
  commandContainer.getDevice().
* cmdlist_hw_skl_to_tgllp.inl: remove the family-local
  appendEventForProfilingAllWalkers(). singleEventPacketRequired() returns
  true here, so the common function always takes the
  appendEventForProfiling() / appendSignalEventPostWalker() branch that the
  removed function used.
* cmdlist_hw_xehp_and_later.inl: remove the free function template
  programEventL3Flush() and the family-local
  appendEventForProfilingAllWalkers(). singleEventPacketRequired() returns its
  argument unchanged.

NOTE(review): this patch text was rebuilt from a copy whose line breaks and
'<...>' template argument lists had been lost. Blank context lines were placed
so that every hunk matches the line counts in its '@@' header, and indentation
follows the usual 4-space layout. The context lines containing 'std::max(' and
'NEO::InOrderPatchCommandsContainer' are reproduced as found. Verify against
the repository before applying.

diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h
index 79e9d43df5..ec7298aa12 100644
--- a/level_zero/core/source/cmdlist/cmdlist_hw.h
+++ b/level_zero/core/source/cmdlist/cmdlist_hw.h
@@ -375,6 +375,8 @@ struct CommandListCoreFamily : public CommandListImp {
     void appendCopyOperationFence(Event *signalEvent, NEO::GraphicsAllocation *srcAllocation, NEO::GraphicsAllocation *dstAllocation, bool copyOffloadOperation);
     bool isDeviceToHostCopyEventFenceRequired(Event *signalEvent) const;
     bool isDeviceToHostBcsCopy(NEO::GraphicsAllocation *srcAllocation, NEO::GraphicsAllocation *dstAllocation, bool copyOffloadOperation) const;
+    bool singleEventPacketRequired(bool inputSinglePacketEventRequest) const;
+    void programEventL3Flush(Event *event);
 
     NEO::InOrderPatchCommandsContainer inOrderPatchCmds;
 
diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl
index 390d98cf48..a7701f8884 100644
--- a/level_zero/core/source/cmdlist/cmdlist_hw.inl
+++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl
@@ -4183,4 +4183,60 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendCommandLists(uint32_t nu
     return ZE_RESULT_ERROR_INVALID_ARGUMENT;
 }
 
+template <GFXCORE_FAMILY gfxCoreFamily>
+void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfilingAllWalkers(Event *event, void **syncCmdBuffer, CommandToPatchContainer *outTimeStampSyncCmds, bool beforeWalker, bool singlePacketEvent, bool skipAddingEventToResidency, bool copyOperation) {
+    if (copyOperation || singleEventPacketRequired(singlePacketEvent)) {
+        if (beforeWalker) {
+            appendEventForProfiling(event, outTimeStampSyncCmds, true, false, skipAddingEventToResidency, copyOperation);
+        } else {
+            appendSignalEventPostWalker(event, syncCmdBuffer, outTimeStampSyncCmds, false, skipAddingEventToResidency, copyOperation);
+        }
+    } else {
+        if (event) {
+            if (beforeWalker) {
+                event->resetKernelCountAndPacketUsedCount();
+                event->zeroKernelCount();
+            } else {
+                if (event->getKernelCount() > 1) {
+                    if (getDcFlushRequired(event->isSignalScope())) {
+                        programEventL3Flush(event);
+                    }
+                    dispatchEventRemainingPacketsPostSyncOperation(event, copyOperation);
+                }
+            }
+        }
+    }
+}
+
+template <GFXCORE_FAMILY gfxCoreFamily>
+void CommandListCoreFamily<gfxCoreFamily>::programEventL3Flush(Event *event) {
+    auto eventPartitionOffset = (partitionCount > 1) ? (partitionCount * event->getSinglePacketSize())
+                                                     : event->getSinglePacketSize();
+    uint64_t eventAddress = event->getPacketAddress(device) + eventPartitionOffset;
+    if (event->isUsingContextEndOffset()) {
+        eventAddress += event->getContextEndOffset();
+    }
+
+    if (partitionCount > 1) {
+        event->setPacketsInUse(event->getPacketsUsedInLastKernel() + partitionCount);
+    } else {
+        event->setPacketsInUse(event->getPacketsUsedInLastKernel() + 1);
+    }
+
+    event->setL3FlushForCurrentKernel();
+
+    auto &cmdListStream = *commandContainer.getCommandStream();
+    NEO::PipeControlArgs args;
+    args.dcFlushEnable = true;
+    args.workloadPartitionOffset = partitionCount > 1;
+
+    NEO::MemorySynchronizationCommands<GfxFamily>::addBarrierWithPostSyncOperation(
+        cmdListStream,
+        NEO::PostSyncMode::immediateData,
+        eventAddress,
+        Event::STATE_SIGNALED,
+        device->getNEODevice()->getRootDeviceEnvironment(),
+        args);
+}
+
 } // namespace L0
diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl b/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl
index 15eccb459e..41979f7398 100644
--- a/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl
+++ b/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl
@@ -110,6 +110,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
     }
 
     appendEventForProfiling(event, nullptr, true, false, false, false);
+
     auto perThreadScratchSize = std::max(this->getCommandListPerThreadScratchSize(0u), kernel->getImmutableData()->getDescriptor().kernelAttributes.perThreadScratchSize[0]);
     this->setCommandListPerThreadScratchSize(0u, perThreadScratchSize);
 
@@ -341,15 +342,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelSplit(Kernel
     return appendLaunchKernelWithParams(kernel, threadGroupDimensions, nullptr, launchParams);
 }
 
-template <GFXCORE_FAMILY gfxCoreFamily>
-void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfilingAllWalkers(Event *event, void **syncCmdBuffer, CommandToPatchContainer *outTimeStampSyncCmds, bool beforeWalker, bool singlePacketEvent, bool skipAddingEventToResidency, bool copyOperation) {
-    if (beforeWalker) {
-        appendEventForProfiling(event, outTimeStampSyncCmds, true, false, skipAddingEventToResidency, copyOperation);
-    } else {
-        appendSignalEventPostWalker(event, syncCmdBuffer, outTimeStampSyncCmds, false, skipAddingEventToResidency, copyOperation);
-    }
-}
-
 template <GFXCORE_FAMILY gfxCoreFamily>
 inline NEO::PreemptionMode CommandListCoreFamily<gfxCoreFamily>::obtainKernelPreemptionMode(Kernel *kernel) {
     NEO::PreemptionFlags flags = NEO::PreemptionHelper::createPreemptionLevelFlags(*device->getNEODevice(), &kernel->getImmutableData()->getDescriptor());
@@ -360,4 +352,9 @@ template <GFXCORE_FAMILY gfxCoreFamily>
 void CommandListCoreFamily<gfxCoreFamily>::appendDispatchOffsetRegister(bool workloadPartitionEvent, bool beforeProfilingCmds) {
 }
 
+template <GFXCORE_FAMILY gfxCoreFamily>
+bool CommandListCoreFamily<gfxCoreFamily>::singleEventPacketRequired(bool inputSinglePacketEventRequest) const {
+    return true;
+}
+
 } // namespace L0
diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl
index 15b5747762..3edff0c8be 100644
--- a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl
+++ b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl
@@ -39,42 +39,6 @@ size_t CommandListCoreFamily<gfxCoreFamily>::getReserveSshSize() {
     return 4 * MemoryConstants::pageSize;
 }
 
-template <GFXCORE_FAMILY gfxCoreFamily>
-void programEventL3Flush(Event *event,
-                         Device *device,
-                         uint32_t partitionCount,
-                         NEO::CommandContainer &commandContainer) {
-    using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
-
-    auto eventPartitionOffset = (partitionCount > 1) ? (partitionCount * event->getSinglePacketSize())
-                                                     : event->getSinglePacketSize();
-    uint64_t eventAddress = event->getPacketAddress(device) + eventPartitionOffset;
-    if (event->isUsingContextEndOffset()) {
-        eventAddress += event->getContextEndOffset();
-    }
-
-    if (partitionCount > 1) {
-        event->setPacketsInUse(event->getPacketsUsedInLastKernel() + partitionCount);
-    } else {
-        event->setPacketsInUse(event->getPacketsUsedInLastKernel() + 1);
-    }
-
-    event->setL3FlushForCurrentKernel();
-
-    auto &cmdListStream = *commandContainer.getCommandStream();
-    NEO::PipeControlArgs args;
-    args.dcFlushEnable = true;
-    args.workloadPartitionOffset = partitionCount > 1;
-
-    NEO::MemorySynchronizationCommands<GfxFamily>::addBarrierWithPostSyncOperation(
-        cmdListStream,
-        NEO::PostSyncMode::immediateData,
-        eventAddress,
-        Event::STATE_SIGNALED,
-        commandContainer.getDevice()->getRootDeviceEnvironment(),
-        args);
-}
-
 template <GFXCORE_FAMILY gfxCoreFamily>
 bool CommandListCoreFamily<gfxCoreFamily>::isInOrderNonWalkerSignalingRequired(const Event *event) const {
     if (event && compactL3FlushEvent(getDcFlushRequired(event->isSignalScope()))) {
@@ -426,7 +390,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
     } else if (event) {
         event->setPacketsInUse(partitionCount);
         if (l3FlushEnable) {
-            programEventL3Flush<gfxCoreFamily>(event, this->device, partitionCount, commandContainer);
+            programEventL3Flush(event);
         }
         if (!launchParams.isKernelSplitOperation) {
             dispatchEventRemainingPacketsPostSyncOperation(event, false);
@@ -591,31 +555,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelSplit(Kernel
     return appendLaunchKernelWithParams(kernel, threadGroupDimensions, event, launchParams);
 }
 
-template <GFXCORE_FAMILY gfxCoreFamily>
-void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfilingAllWalkers(Event *event, void **syncCmdBuffer, CommandToPatchContainer *outTimeStampSyncCmds, bool beforeWalker, bool singlePacketEvent, bool skipAddingEventToResidency, bool copyOperation) {
-    if (copyOperation || singlePacketEvent) {
-        if (beforeWalker) {
-            appendEventForProfiling(event, outTimeStampSyncCmds, true, false, skipAddingEventToResidency, copyOperation);
-        } else {
-            appendSignalEventPostWalker(event, syncCmdBuffer, outTimeStampSyncCmds, false, skipAddingEventToResidency, copyOperation);
-        }
-    } else {
-        if (event) {
-            if (beforeWalker) {
-                event->resetKernelCountAndPacketUsedCount();
-                event->zeroKernelCount();
-            } else {
-                if (event->getKernelCount() > 1) {
-                    if (getDcFlushRequired(event->isSignalScope())) {
-                        programEventL3Flush<gfxCoreFamily>(event, this->device, this->partitionCount, this->commandContainer);
-                    }
-                    dispatchEventRemainingPacketsPostSyncOperation(event, copyOperation);
-                }
-            }
-        }
-    }
-}
-
 template <GFXCORE_FAMILY gfxCoreFamily>
 void CommandListCoreFamily<gfxCoreFamily>::appendDispatchOffsetRegister(bool workloadPartitionEvent, bool beforeProfilingCmds) {
     if (workloadPartitionEvent && !device->getL0GfxCoreHelper().hasUnifiedPostSyncAllocationLayout()) {
@@ -625,4 +564,9 @@ void CommandListCoreFamily<gfxCoreFamily>::appendDispatchOffsetRegister(bool wor
     }
 }
 
+template <GFXCORE_FAMILY gfxCoreFamily>
+bool CommandListCoreFamily<gfxCoreFamily>::singleEventPacketRequired(bool inputSinglePacketEventRequest) const {
+    return inputSinglePacketEventRequest;
+}
+
 } // namespace L0