diff --git a/runtime/command_queue/command_queue_hw.h b/runtime/command_queue/command_queue_hw.h index 202edfd960..3e1eea8813 100644 --- a/runtime/command_queue/command_queue_hw.h +++ b/runtime/command_queue/command_queue_hw.h @@ -8,6 +8,7 @@ #pragma once #include "runtime/command_stream/command_stream_receiver.h" #include "runtime/command_queue/command_queue.h" +#include "runtime/device_queue/device_queue_hw.h" #include "runtime/mem_obj/mem_obj.h" #include "runtime/memory_manager/graphics_allocation.h" #include "runtime/program/printf_handler.h" @@ -352,5 +353,24 @@ class CommandQueueHw : public CommandQueue { size_t bufferSlicePitch, size_t hostRowPitch, size_t hostSlicePitch); + void processDeviceEnqueue(Kernel *parentKernel, + DeviceQueueHw *devQueueHw, + const MultiDispatchInfo &multiDispatchInfo, + TagNode *hwTimeStamps, + PreemptionMode preemption, + bool &blocking); + + template + void processDispatchForKernels(const MultiDispatchInfo &multiDispatchInfo, + std::unique_ptr &printfHandler, + Event *event, + TagNode *&hwTimeStamps, + Kernel *parentKernel, + bool blockQueue, + DeviceQueueHw *devQueueHw, + CsrDependencies &csrDeps, + KernelOperation *&blockedCommandsData, + TimestampPacketContainer &previousTimestampPacketNodes, + PreemptionMode preemption); }; } // namespace OCLRT diff --git a/runtime/command_queue/enqueue_common.h b/runtime/command_queue/enqueue_common.h index 2d1cf9c0b4..5e414ed482 100644 --- a/runtime/command_queue/enqueue_common.h +++ b/runtime/command_queue/enqueue_common.h @@ -170,7 +170,7 @@ void CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, perfCountersRequired = (this->isPerfCountersEnabled() && event != nullptr); KernelOperation *blockedCommandsData = nullptr; std::unique_ptr printfHandler; - bool slmUsed = false; + bool slmUsed = multiDispatchInfo.usesSlm() || parentKernel; auto preemption = PreemptionHelper::taskPreemptionMode(*device, multiDispatchInfo); TakeOwnershipWrapper> queueOwnership(*this); @@ -212,70 +212,9 @@ void CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, auto commandStreamStart = commandStream.getUsed(); if (multiDispatchInfo.empty() == false) { - HwPerfCounter *hwPerfCounter = nullptr; - DebugManager.dumpKernelArgs(&multiDispatchInfo); - - printfHandler.reset(PrintfHandler::create(multiDispatchInfo, *device)); - if (printfHandler) { - printfHandler.get()->prepareDispatch(multiDispatchInfo); - } - - if (commandType == CL_COMMAND_NDRANGE_KERNEL) { - if (multiDispatchInfo.peekMainKernel()->getProgram()->isKernelDebugEnabled()) { - setupDebugSurface(multiDispatchInfo.peekMainKernel()); - } - } - - if (eventBuilder.getEvent()) { - if (getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { - eventBuilder.getEvent()->addTimestampPacketNodes(*timestampPacketContainer); - } - if (this->isProfilingEnabled()) { - // Get allocation for timestamps - hwTimeStamps = eventBuilder.getEvent()->getHwTimeStampNode(); - if (this->isPerfCountersEnabled()) { - hwPerfCounter = eventBuilder.getEvent()->getHwPerfCounterNode()->tagForCpuAccess; - // PERF COUNTER: copy current configuration from queue to event - eventBuilder.getEvent()->copyPerfCounters(this->getPerfCountersConfigData()); - } - } - } - - if (parentKernel) { - parentKernel->createReflectionSurface(); - parentKernel->patchDefaultDeviceQueue(context->getDefaultDeviceQueue()); - parentKernel->patchEventPool(context->getDefaultDeviceQueue()); - parentKernel->patchReflectionSurface(context->getDefaultDeviceQueue(), printfHandler.get()); - if (!blockQueue) { - devQueueHw->resetDeviceQueue(); - devQueueHw->acquireEMCriticalSection(); - } - } - - HardwareInterface::dispatchWalker( - *this, - multiDispatchInfo, - csrDeps, - &blockedCommandsData, - hwTimeStamps, - hwPerfCounter, - &previousTimestampPacketNodes, - timestampPacketContainer.get(), - preemption, - blockQueue, - commandType); - - if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) { - for (auto &dispatchInfo : multiDispatchInfo) { - for (auto &patchInfoData : dispatchInfo.getKernel()->getPatchInfoDataList()) { - getCommandStreamReceiver().getFlatBatchBufferHelper().setPatchInfoData(patchInfoData); - } - } - } - - getCommandStreamReceiver().setRequiredScratchSize(multiDispatchInfo.getRequiredScratchSize()); - - slmUsed = multiDispatchInfo.usesSlm(); + processDispatchForKernels(multiDispatchInfo, printfHandler, eventBuilder.getEvent(), + hwTimeStamps, parentKernel, blockQueue, devQueueHw, csrDeps, blockedCommandsData, + previousTimestampPacketNodes, preemption); } else if (getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { if (CL_COMMAND_BARRIER == commandType) { getCommandStreamReceiver().requestStallingPipeControlOnNextFlush(); @@ -295,46 +234,7 @@ void CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, CompletionStamp completionStamp; if (!blockQueue) { if (parentKernel) { - size_t minSizeSSHForEM = KernelCommandsHelper::template getSizeRequiredForExecutionModel(*parentKernel); - - uint32_t taskCount = getCommandStreamReceiver().peekTaskCount() + 1; - devQueueHw->setupExecutionModelDispatch(getIndirectHeap(IndirectHeap::SURFACE_STATE, minSizeSSHForEM), - *devQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE), - parentKernel, - (uint32_t)multiDispatchInfo.size(), - taskCount, - hwTimeStamps); - - BuiltIns &builtIns = *getDevice().getExecutionEnvironment()->getBuiltIns(); - SchedulerKernel &scheduler = builtIns.getSchedulerKernel(this->getContext()); - - scheduler.setArgs(devQueueHw->getQueueBuffer(), - devQueueHw->getStackBuffer(), - devQueueHw->getEventPoolBuffer(), - devQueueHw->getSlbBuffer(), - devQueueHw->getDshBuffer(), - parentKernel->getKernelReflectionSurface(), - devQueueHw->getQueueStorageBuffer(), - this->getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u).getGraphicsAllocation(), - devQueueHw->getDebugQueue()); - - GpgpuWalkerHelper::dispatchScheduler( - *this, - *devQueueHw, - preemption, - scheduler, - &getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u), - devQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE)); - - scheduler.makeResident(getCommandStreamReceiver()); - - // Update SLM usage - slmUsed |= scheduler.slmTotalSize > 0; - - parentKernel->getProgram()->getBlockKernelManager()->makeInternalAllocationsResident(getCommandStreamReceiver()); - if (parentKernel->isAuxTranslationRequired()) { - blocking = true; - } + processDeviceEnqueue(parentKernel, devQueueHw, multiDispatchInfo, hwTimeStamps, preemption, blocking); } auto submissionRequired = !isCommandWithoutKernel(commandType); @@ -446,6 +346,128 @@ void CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, } } +template +template +void CommandQueueHw::processDispatchForKernels(const MultiDispatchInfo &multiDispatchInfo, + std::unique_ptr &printfHandler, + Event *event, + TagNode *&hwTimeStamps, + Kernel *parentKernel, + bool blockQueue, + DeviceQueueHw *devQueueHw, + CsrDependencies &csrDeps, + KernelOperation *&blockedCommandsData, + TimestampPacketContainer &previousTimestampPacketNodes, + PreemptionMode preemption) { + HwPerfCounter *hwPerfCounter = nullptr; + DebugManager.dumpKernelArgs(&multiDispatchInfo); + + printfHandler.reset(PrintfHandler::create(multiDispatchInfo, *device)); + if (printfHandler) { + printfHandler.get()->prepareDispatch(multiDispatchInfo); + } + + if (commandType == CL_COMMAND_NDRANGE_KERNEL) { + if (multiDispatchInfo.peekMainKernel()->getProgram()->isKernelDebugEnabled()) { + setupDebugSurface(multiDispatchInfo.peekMainKernel()); + } + } + + if (event) { + if (getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { + event->addTimestampPacketNodes(*timestampPacketContainer); + } + if (this->isProfilingEnabled()) { + // Get allocation for timestamps + hwTimeStamps = event->getHwTimeStampNode(); + if (this->isPerfCountersEnabled()) { + hwPerfCounter = event->getHwPerfCounterNode()->tagForCpuAccess; + // PERF COUNTER: copy current configuration from queue to event + event->copyPerfCounters(this->getPerfCountersConfigData()); + } + } + } + + if (parentKernel) { + parentKernel->createReflectionSurface(); + parentKernel->patchDefaultDeviceQueue(context->getDefaultDeviceQueue()); + parentKernel->patchEventPool(context->getDefaultDeviceQueue()); + parentKernel->patchReflectionSurface(context->getDefaultDeviceQueue(), printfHandler.get()); + if (!blockQueue) { + devQueueHw->resetDeviceQueue(); + devQueueHw->acquireEMCriticalSection(); + } + } + + HardwareInterface::dispatchWalker( + *this, + multiDispatchInfo, + csrDeps, + &blockedCommandsData, + hwTimeStamps, + hwPerfCounter, + &previousTimestampPacketNodes, + timestampPacketContainer.get(), + preemption, + blockQueue, + commandType); + + if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) { + for (auto &dispatchInfo : multiDispatchInfo) { + for (auto &patchInfoData : dispatchInfo.getKernel()->getPatchInfoDataList()) { + getCommandStreamReceiver().getFlatBatchBufferHelper().setPatchInfoData(patchInfoData); + } + } + } + + getCommandStreamReceiver().setRequiredScratchSize(multiDispatchInfo.getRequiredScratchSize()); +} +template +void CommandQueueHw::processDeviceEnqueue(Kernel *parentKernel, + DeviceQueueHw *devQueueHw, + const MultiDispatchInfo &multiDispatchInfo, + TagNode *hwTimeStamps, + PreemptionMode preemption, + bool &blocking) { + size_t minSizeSSHForEM = KernelCommandsHelper::template getSizeRequiredForExecutionModel(*parentKernel); + + uint32_t taskCount = getCommandStreamReceiver().peekTaskCount() + 1; + devQueueHw->setupExecutionModelDispatch(getIndirectHeap(IndirectHeap::SURFACE_STATE, minSizeSSHForEM), + *devQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE), + parentKernel, + (uint32_t)multiDispatchInfo.size(), + taskCount, + hwTimeStamps); + + BuiltIns &builtIns = *getDevice().getExecutionEnvironment()->getBuiltIns(); + SchedulerKernel &scheduler = builtIns.getSchedulerKernel(this->getContext()); + + scheduler.setArgs(devQueueHw->getQueueBuffer(), + devQueueHw->getStackBuffer(), + devQueueHw->getEventPoolBuffer(), + devQueueHw->getSlbBuffer(), + devQueueHw->getDshBuffer(), + parentKernel->getKernelReflectionSurface(), + devQueueHw->getQueueStorageBuffer(), + this->getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u).getGraphicsAllocation(), + devQueueHw->getDebugQueue()); + + GpgpuWalkerHelper::dispatchScheduler( + *this, + *devQueueHw, + preemption, + scheduler, + &getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u), + devQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE)); + + scheduler.makeResident(getCommandStreamReceiver()); + + parentKernel->getProgram()->getBlockKernelManager()->makeInternalAllocationsResident(getCommandStreamReceiver()); + if (parentKernel->isAuxTranslationRequired()) { + blocking = true; + } +} + template void CommandQueueHw::obtainTaskLevelAndBlockedStatus(unsigned int &taskLevel, cl_uint &numEventsInWaitList, const cl_event *&eventWaitList, bool &blockQueue, unsigned int commandType) { auto isQueueBlockedStatus = isQueueBlocked();