diff --git a/runtime/command_queue/command_queue_hw.h b/runtime/command_queue/command_queue_hw.h
index 202edfd960..3e1eea8813 100644
--- a/runtime/command_queue/command_queue_hw.h
+++ b/runtime/command_queue/command_queue_hw.h
@@ -8,6 +8,7 @@
 #pragma once
 #include "runtime/command_stream/command_stream_receiver.h"
 #include "runtime/command_queue/command_queue.h"
+#include "runtime/device_queue/device_queue_hw.h"
 #include "runtime/mem_obj/mem_obj.h"
 #include "runtime/memory_manager/graphics_allocation.h"
 #include "runtime/program/printf_handler.h"
@@ -352,5 +353,24 @@ class CommandQueueHw : public CommandQueue {
                                                    size_t bufferSlicePitch,
                                                    size_t hostRowPitch,
                                                    size_t hostSlicePitch);
+    // Device-side enqueue step for a parent kernel: sets up the execution model on devQueueHw and
+    // dispatches the scheduler kernel. `blocking` is an in/out flag the call may raise to true.
+    void processDeviceEnqueue(Kernel *parentKernel, DeviceQueueHw<GfxFamily> *devQueueHw,
+                              const MultiDispatchInfo &multiDispatchInfo,
+                              TagNode<HwTimeStamps> *hwTimeStamps, PreemptionMode preemption,
+                              bool &blocking);
+
+    // Kernel-dispatch step of enqueueHandler, used there when multiDispatchInfo is non-empty.
+    // Resets printfHandler to a freshly created handler, may overwrite hwTimeStamps with the
+    // event's node, patches parentKernel when it is non-null, and calls dispatchWalker.
+    template <uint32_t commandType>
+    void processDispatchForKernels(const MultiDispatchInfo &multiDispatchInfo,
+                                   std::unique_ptr<PrintfHandler> &printfHandler, Event *event,
+                                   TagNode<HwTimeStamps> *&hwTimeStamps, Kernel *parentKernel,
+                                   bool blockQueue, DeviceQueueHw<GfxFamily> *devQueueHw,
+                                   CsrDependencies &csrDeps,
+                                   KernelOperation *&blockedCommandsData,
+                                   TimestampPacketContainer &previousTimestampPacketNodes,
+                                   PreemptionMode preemption);
 };
 } // namespace OCLRT
diff --git a/runtime/command_queue/enqueue_common.h b/runtime/command_queue/enqueue_common.h
index 2d1cf9c0b4..5e414ed482 100644
--- a/runtime/command_queue/enqueue_common.h
+++ b/runtime/command_queue/enqueue_common.h
@@ -170,7 +170,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
     perfCountersRequired = (this->isPerfCountersEnabled() && event != nullptr);
     KernelOperation *blockedCommandsData = nullptr;
     std::unique_ptr<PrintfHandler> printfHandler;
-    bool slmUsed = false;
+    bool slmUsed = multiDispatchInfo.usesSlm() || parentKernel;
     auto preemption = PreemptionHelper::taskPreemptionMode(*device, multiDispatchInfo);
     TakeOwnershipWrapper<CommandQueueHw<GfxFamily>> queueOwnership(*this);
 
@@ -212,70 +212,9 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
     auto commandStreamStart = commandStream.getUsed();
 
     if (multiDispatchInfo.empty() == false) {
-        HwPerfCounter *hwPerfCounter = nullptr;
-        DebugManager.dumpKernelArgs(&multiDispatchInfo);
-
-        printfHandler.reset(PrintfHandler::create(multiDispatchInfo, *device));
-        if (printfHandler) {
-            printfHandler.get()->prepareDispatch(multiDispatchInfo);
-        }
-
-        if (commandType == CL_COMMAND_NDRANGE_KERNEL) {
-            if (multiDispatchInfo.peekMainKernel()->getProgram()->isKernelDebugEnabled()) {
-                setupDebugSurface(multiDispatchInfo.peekMainKernel());
-            }
-        }
-
-        if (eventBuilder.getEvent()) {
-            if (getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
-                eventBuilder.getEvent()->addTimestampPacketNodes(*timestampPacketContainer);
-            }
-            if (this->isProfilingEnabled()) {
-                // Get allocation for timestamps
-                hwTimeStamps = eventBuilder.getEvent()->getHwTimeStampNode();
-                if (this->isPerfCountersEnabled()) {
-                    hwPerfCounter = eventBuilder.getEvent()->getHwPerfCounterNode()->tagForCpuAccess;
-                    // PERF COUNTER: copy current configuration from queue to event
-                    eventBuilder.getEvent()->copyPerfCounters(this->getPerfCountersConfigData());
-                }
-            }
-        }
-
-        if (parentKernel) {
-            parentKernel->createReflectionSurface();
-            parentKernel->patchDefaultDeviceQueue(context->getDefaultDeviceQueue());
-            parentKernel->patchEventPool(context->getDefaultDeviceQueue());
-            parentKernel->patchReflectionSurface(context->getDefaultDeviceQueue(), printfHandler.get());
-            if (!blockQueue) {
-                devQueueHw->resetDeviceQueue();
-                devQueueHw->acquireEMCriticalSection();
-            }
-        }
-
-        HardwareInterface<GfxFamily>::dispatchWalker(
-            *this,
-            multiDispatchInfo,
-            csrDeps,
-            &blockedCommandsData,
-            hwTimeStamps,
-            hwPerfCounter,
-            &previousTimestampPacketNodes,
-            timestampPacketContainer.get(),
-            preemption,
-            blockQueue,
-            commandType);
-
-        if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
-            for (auto &dispatchInfo : multiDispatchInfo) {
-                for (auto &patchInfoData : dispatchInfo.getKernel()->getPatchInfoDataList()) {
-                    getCommandStreamReceiver().getFlatBatchBufferHelper().setPatchInfoData(patchInfoData);
-                }
-            }
-        }
-
-        getCommandStreamReceiver().setRequiredScratchSize(multiDispatchInfo.getRequiredScratchSize());
-
-        slmUsed = multiDispatchInfo.usesSlm();
+        processDispatchForKernels<commandType>(multiDispatchInfo, printfHandler, eventBuilder.getEvent(),
+                                               hwTimeStamps, parentKernel, blockQueue, devQueueHw, csrDeps, blockedCommandsData,
+                                               previousTimestampPacketNodes, preemption);
     } else if (getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
         if (CL_COMMAND_BARRIER == commandType) {
             getCommandStreamReceiver().requestStallingPipeControlOnNextFlush();
@@ -295,46 +234,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
     CompletionStamp completionStamp;
     if (!blockQueue) {
         if (parentKernel) {
-            size_t minSizeSSHForEM = KernelCommandsHelper<GfxFamily>::template getSizeRequiredForExecutionModel<IndirectHeap::SURFACE_STATE>(*parentKernel);
-
-            uint32_t taskCount = getCommandStreamReceiver().peekTaskCount() + 1;
-            devQueueHw->setupExecutionModelDispatch(getIndirectHeap(IndirectHeap::SURFACE_STATE, minSizeSSHForEM),
-                                                    *devQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE),
-                                                    parentKernel,
-                                                    (uint32_t)multiDispatchInfo.size(),
-                                                    taskCount,
-                                                    hwTimeStamps);
-
-            BuiltIns &builtIns = *getDevice().getExecutionEnvironment()->getBuiltIns();
-            SchedulerKernel &scheduler = builtIns.getSchedulerKernel(this->getContext());
-
-            scheduler.setArgs(devQueueHw->getQueueBuffer(),
-                              devQueueHw->getStackBuffer(),
-                              devQueueHw->getEventPoolBuffer(),
-                              devQueueHw->getSlbBuffer(),
-                              devQueueHw->getDshBuffer(),
-                              parentKernel->getKernelReflectionSurface(),
-                              devQueueHw->getQueueStorageBuffer(),
-                              this->getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u).getGraphicsAllocation(),
-                              devQueueHw->getDebugQueue());
-
-            GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
-                *this,
-                *devQueueHw,
-                preemption,
-                scheduler,
-                &getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u),
-                devQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE));
-
-            scheduler.makeResident(getCommandStreamReceiver());
-
-            // Update SLM usage
-            slmUsed |= scheduler.slmTotalSize > 0;
-
-            parentKernel->getProgram()->getBlockKernelManager()->makeInternalAllocationsResident(getCommandStreamReceiver());
-            if (parentKernel->isAuxTranslationRequired()) {
-                blocking = true;
-            }
+            processDeviceEnqueue(parentKernel, devQueueHw, multiDispatchInfo, hwTimeStamps, preemption, blocking);
         }
 
         auto submissionRequired = !isCommandWithoutKernel(commandType);
@@ -446,6 +346,128 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
     }
 }
 
+// Kernel-dispatch stage of enqueueHandler, invoked there when multiDispatchInfo is non-empty.
+// Steps, in order:
+//   1. dump kernel args; (re)create the printf handler and prepare it if one was returned;
+//   2. for CL_COMMAND_NDRANGE_KERNEL with kernel debug enabled, set up the debug surface;
+//   3. if `event` is non-null, attach timestamp packet nodes (when the CSR reports timestamp
+//      packet writes enabled) and, with profiling enabled, fetch the event's profiling nodes;
+//   4. if `parentKernel` is non-null, patch it against the context's default device queue;
+//   5. call dispatchWalker, optionally forward patch info, set the CSR's required scratch size.
+//
+// In/out: printfHandler is reset to the new handler; hwTimeStamps is overwritten only when
+// profiling is enabled on an event-bearing enqueue; blockedCommandsData is handed to
+// dispatchWalker by address. devQueueHw is used only when parentKernel is set and !blockQueue.
+template <typename GfxFamily>
+template <uint32_t commandType>
+void CommandQueueHw<GfxFamily>::processDispatchForKernels(
+    const MultiDispatchInfo &multiDispatchInfo, std::unique_ptr<PrintfHandler> &printfHandler,
+    Event *event, TagNode<HwTimeStamps> *&hwTimeStamps, Kernel *parentKernel, bool blockQueue,
+    DeviceQueueHw<GfxFamily> *devQueueHw, CsrDependencies &csrDeps,
+    KernelOperation *&blockedCommandsData, TimestampPacketContainer &previousTimestampPacketNodes,
+    PreemptionMode preemption) {
+    DebugManager.dumpKernelArgs(&multiDispatchInfo);
+
+    printfHandler.reset(PrintfHandler::create(multiDispatchInfo, *device));
+    if (printfHandler) {
+        printfHandler->prepareDispatch(multiDispatchInfo);
+    }
+
+    // commandType is a template argument, so the left operand is fixed per instantiation.
+    if (commandType == CL_COMMAND_NDRANGE_KERNEL &&
+        multiDispatchInfo.peekMainKernel()->getProgram()->isKernelDebugEnabled()) {
+        setupDebugSurface(multiDispatchInfo.peekMainKernel());
+    }
+
+    // Stays null unless profiling and perf counters are both enabled for an event-bearing enqueue.
+    HwPerfCounter *perfCounterTag = nullptr;
+    if (event) {
+        if (getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
+            event->addTimestampPacketNodes(*timestampPacketContainer);
+        }
+        if (this->isProfilingEnabled()) {
+            // The timestamp node comes from the event.
+            hwTimeStamps = event->getHwTimeStampNode();
+            if (this->isPerfCountersEnabled()) {
+                perfCounterTag = event->getHwPerfCounterNode()->tagForCpuAccess;
+                // Copy the queue's current perf-counter configuration to the event.
+                event->copyPerfCounters(this->getPerfCountersConfigData());
+            }
+        }
+    }
+
+    if (parentKernel) {
+        parentKernel->createReflectionSurface();
+        parentKernel->patchDefaultDeviceQueue(context->getDefaultDeviceQueue());
+        parentKernel->patchEventPool(context->getDefaultDeviceQueue());
+        parentKernel->patchReflectionSurface(context->getDefaultDeviceQueue(), printfHandler.get());
+        if (!blockQueue) {
+            devQueueHw->resetDeviceQueue();
+            devQueueHw->acquireEMCriticalSection();
+        }
+    }
+
+    HardwareInterface<GfxFamily>::dispatchWalker(*this, multiDispatchInfo, csrDeps, &blockedCommandsData,
+                                                 hwTimeStamps, perfCounterTag, &previousTimestampPacketNodes,
+                                                 timestampPacketContainer.get(), preemption, blockQueue,
+                                                 commandType);
+
+    if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
+        for (auto &info : multiDispatchInfo) {
+            for (auto &patch : info.getKernel()->getPatchInfoDataList()) {
+                getCommandStreamReceiver().getFlatBatchBufferHelper().setPatchInfoData(patch);
+            }
+        }
+    }
+
+    getCommandStreamReceiver().setRequiredScratchSize(multiDispatchInfo.getRequiredScratchSize());
+}
+
+// Device-side enqueue stage of enqueueHandler for a parent kernel on a non-blocked queue.
+//
+// Prepares the device queue's execution-model dispatch, sets the scheduler built-in's arguments
+// from the device queue's buffers and the parent kernel's reflection surface, dispatches the
+// scheduler, then makes the scheduler and the block kernels' internal allocations resident.
+//
+// hwTimeStamps is forwarded as-is to setupExecutionModelDispatch. `blocking` is in/out: it is
+// set to true when the parent kernel requires aux translation and left unchanged otherwise.
+template <typename GfxFamily>
+void CommandQueueHw<GfxFamily>::processDeviceEnqueue(
+    Kernel *parentKernel, DeviceQueueHw<GfxFamily> *devQueueHw,
+    const MultiDispatchInfo &multiDispatchInfo, TagNode<HwTimeStamps> *hwTimeStamps,
+    PreemptionMode preemption, bool &blocking) {
+    size_t sshSizeForExecModel =
+        KernelCommandsHelper<GfxFamily>::template getSizeRequiredForExecutionModel<IndirectHeap::SURFACE_STATE>(*parentKernel);
+
+    // The task count passed on is one past the CSR's current task count.
+    uint32_t nextTaskCount = getCommandStreamReceiver().peekTaskCount() + 1;
+    devQueueHw->setupExecutionModelDispatch(
+        getIndirectHeap(IndirectHeap::SURFACE_STATE, sshSizeForExecModel),
+        *devQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE), parentKernel,
+        static_cast<uint32_t>(multiDispatchInfo.size()), nextTaskCount, hwTimeStamps);
+
+    SchedulerKernel &schedulerKernel =
+        getDevice().getExecutionEnvironment()->getBuiltIns()->getSchedulerKernel(this->getContext());
+
+    schedulerKernel.setArgs(
+        devQueueHw->getQueueBuffer(), devQueueHw->getStackBuffer(), devQueueHw->getEventPoolBuffer(),
+        devQueueHw->getSlbBuffer(), devQueueHw->getDshBuffer(), parentKernel->getKernelReflectionSurface(),
+        devQueueHw->getQueueStorageBuffer(),
+        this->getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u).getGraphicsAllocation(),
+        devQueueHw->getDebugQueue());
+
+    GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(*this, *devQueueHw, preemption, schedulerKernel,
+                                                    &getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u),
+                                                    devQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE));
+
+    schedulerKernel.makeResident(getCommandStreamReceiver());
+    parentKernel->getProgram()->getBlockKernelManager()->makeInternalAllocationsResident(getCommandStreamReceiver());
+
+    if (parentKernel->isAuxTranslationRequired()) {
+        blocking = true;
+    }
+}
+
 template <typename GfxFamily>
 void CommandQueueHw<GfxFamily>::obtainTaskLevelAndBlockedStatus(unsigned int &taskLevel, cl_uint &numEventsInWaitList, const cl_event *&eventWaitList, bool &blockQueue, unsigned int commandType) {
     auto isQueueBlockedStatus = isQueueBlocked();