[26/n] Internal 4GB allocator.

- Change the way we handle blocked commands. - Instead of allocating a CPU pointer and populating it with commands, create a real IndirectHeap that may later be submitted to the GPU. - This removes a lot of copy operations that were happening at submit time. - For device enqueue, this requires dsh & ssh to be passed directly to the underlying commands; in that scenario the device queue buffers are not used. Change-Id: I1124a8edbb46777ea7f7d3a5946f302e7fdf9665
2025-12-20 00:24:58 +08:00 · 2018-04-05 15:12:28 +02:00
parent 100f559daa
commit ffa9b097f5
20 changed files with 331 additions and 319 deletions
--- a/runtime/command_queue/command_queue.cpp
+++ b/runtime/command_queue/command_queue.cpp
@@ -239,35 +239,7 @@ IndirectHeap &CommandQueue::getIndirectHeap(IndirectHeap::Type heapType,
    }
    if (!heapMemory) {
-        size_t reservedSize = 0;
+        allocateHeapMemory(heapType, minRequiredSize, heap);
        auto finalHeapSize = defaultHeapSize;
        minRequiredSize += reservedSize;
        finalHeapSize = alignUp(std::max(finalHeapSize, minRequiredSize), MemoryConstants::pageSize);
        heapMemory = memoryManager->obtainReusableAllocation(finalHeapSize).release();
        if (!heapMemory) {
            heapMemory = memoryManager->allocateGraphicsMemory(finalHeapSize, MemoryConstants::pageSize);
        } else {
            finalHeapSize = std::max(heapMemory->getUnderlyingBufferSize(), finalHeapSize);
        }
        heapMemory->setAllocationType(GraphicsAllocation::ALLOCATION_TYPE_LINEAR_STREAM);
        if (IndirectHeap::SURFACE_STATE == heapType) {
            DEBUG_BREAK_IF(minRequiredSize > maxSshSize);
            finalHeapSize = maxSshSize;
        }
        if (heap) {
            heap->replaceBuffer(heapMemory->getUnderlyingBuffer(), finalHeapSize);
            heap->replaceGraphicsAllocation(heapMemory);
        } else {
            heap = new IndirectHeap(heapMemory);
            heap->overrideMaxSize(finalHeapSize);
        }
    }
    return *heap;
@@ -650,4 +622,37 @@ bool CommandQueue::setupDebugSurface(Kernel *kernel) {
    return true;
 }
 // Backs an indirect heap of the given type with graphics memory.
 //
 // heapType        - kind of heap being set up; SURFACE_STATE heaps always report maxSshSize.
 // minRequiredSize - smallest size requested by the caller; the heap is never sized below
 //                   defaultHeapSize and the size is aligned up to a whole page.
 // indirectHeap    - in/out: when non-null the existing heap object is re-pointed at the new
 //                   allocation, otherwise a new IndirectHeap is created with `new` and
 //                   returned through this reference.
 void CommandQueue::allocateHeapMemory(IndirectHeap::Type heapType,
                                       size_t minRequiredSize, IndirectHeap *&indirectHeap) {
    size_t reservedSize = 0;
    minRequiredSize += reservedSize;

    // Work on a local copy of the default size and grow it to the page-aligned request.
    auto heapSize = defaultHeapSize;
    heapSize = alignUp(std::max(heapSize, minRequiredSize), MemoryConstants::pageSize);

    // Prefer recycling an allocation from the reusable pool; fall back to a fresh one.
    auto memoryManager = device->getMemoryManager();
    auto allocation = memoryManager->obtainReusableAllocation(heapSize).release();
    if (allocation) {
        // A recycled allocation may be larger than requested - expose all of it.
        heapSize = std::max(allocation->getUnderlyingBufferSize(), heapSize);
    } else {
        allocation = memoryManager->allocateGraphicsMemory(heapSize, MemoryConstants::pageSize);
    }
    allocation->setAllocationType(GraphicsAllocation::ALLOCATION_TYPE_LINEAR_STREAM);

    if (IndirectHeap::SURFACE_STATE == heapType) {
        DEBUG_BREAK_IF(minRequiredSize > maxSshSize);
        heapSize = maxSshSize;
    }

    if (!indirectHeap) {
        // No heap object yet: create one on top of the allocation.
        indirectHeap = new IndirectHeap(allocation);
        indirectHeap->overrideMaxSize(heapSize);
        return;
    }

    // Re-use the caller's heap object, swapping in the new buffer and allocation.
    indirectHeap->replaceBuffer(allocation->getUnderlyingBuffer(), heapSize);
    indirectHeap->replaceGraphicsAllocation(allocation);
 }
 } // namespace OCLRT
--- a/runtime/command_queue/command_queue.h
+++ b/runtime/command_queue/command_queue.h
@@ -336,6 +336,9 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
    IndirectHeap &getIndirectHeap(IndirectHeap::Type heapType,
                                  size_t minRequiredSize = 0u);
    void allocateHeapMemory(IndirectHeap::Type heapType,
                            size_t minRequiredSize, IndirectHeap *&indirectHeap);
    MOCKABLE_VIRTUAL void releaseIndirectHeap(IndirectHeap::Type heapType);
    cl_command_queue_properties getCommandQueueProperties() const {
--- a/runtime/command_queue/enqueue_common.h
+++ b/runtime/command_queue/enqueue_common.h
@@ -275,6 +275,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
            uint32_t taskCount = commandStreamReceiver.peekTaskCount() + 1;
            devQueueHw->setupExecutionModelDispatch(getIndirectHeap(IndirectHeap::SURFACE_STATE, minSizeSSHForEM),
                                                    *devQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE),
                                                    multiDispatchInfo.begin()->getKernel(),
                                                    (uint32_t)multiDispatchInfo.size(),
                                                    taskCount,
@@ -297,7 +298,9 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
                *this,
                *devQueueHw,
                preemption,
-                scheduler);
+                scheduler,
                &getIndirectHeap(IndirectHeap::SURFACE_STATE),
                devQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE));
            scheduler.makeResident(commandStreamReceiver);
--- a/runtime/command_queue/gpgpu_walker.h
+++ b/runtime/command_queue/gpgpu_walker.h
@@ -119,13 +119,6 @@ inline cl_uint computeDimensions(const size_t workItems[3]) {
    return (workItems[2] > 1) ? 3 : (workItems[1] > 1) ? 2 : 1;
 }
 // Creates an IndirectHeap over a freshly allocated, page-aligned CPU buffer.
 // The buffer size is whatever `calc` returns when invoked with the forwarded `args`.
 // Both the heap object (`new`) and its buffer (`alignedMalloc`) are handed to the caller.
 template <typename SizeAndAllocCalcT, typename... CalcArgsT>
 IndirectHeap *allocateIndirectHeap(SizeAndAllocCalcT &&calc, CalcArgsT &&... args) {
    size_t heapSize = calc(std::forward<CalcArgsT>(args)...);
    size_t pageAlignment = MemoryConstants::pageSize;
    auto cpuBuffer = alignedMalloc(heapSize, pageAlignment);
    return new IndirectHeap(cpuBuffer, heapSize);
 }
 template <typename GfxFamily>
 class GpgpuWalkerHelper {
  public:
@@ -227,7 +220,9 @@ class GpgpuWalkerHelper {
        CommandQueue &commandQueue,
        DeviceQueueHw<GfxFamily> &devQueueHw,
        PreemptionMode preemptionMode,
-        SchedulerKernel &scheduler);
+        SchedulerKernel &scheduler,
        IndirectHeap *ssh,
        IndirectHeap *dsh);
 };
 template <typename GfxFamily, uint32_t eventType>
--- a/runtime/command_queue/gpgpu_walker.inl
+++ b/runtime/command_queue/gpgpu_walker.inl
@@ -458,20 +458,27 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchWalker(
        using KCH = KernelCommandsHelper<GfxFamily>;
        commandStream = new LinearStream(alignedMalloc(MemoryConstants::pageSize, MemoryConstants::pageSize), MemoryConstants::pageSize);
        if (executionModelKernel) {
            uint32_t offsetDsh = commandQueue.getContext().getDefaultDeviceQueue()->getDshOffset();
            uint32_t colorCalcSize = commandQueue.getContext().getDefaultDeviceQueue()->colorCalcStateSize;
-            dsh = allocateIndirectHeap([&multiDispatchInfo, offsetDsh] { return KCH::getTotalSizeRequiredDSH(multiDispatchInfo) + KCH::getTotalSizeRequiredIOH(multiDispatchInfo) + offsetDsh; });
+            commandQueue.allocateHeapMemory(IndirectHeap::DYNAMIC_STATE,
                                            commandQueue.getContext().getDefaultDeviceQueue()->getDshBuffer()->getUnderlyingBufferSize(),
                                            dsh);
            dsh->getSpace(colorCalcSize);
            ioh = dsh;
            commandQueue.allocateHeapMemory(IndirectHeap::SURFACE_STATE,
                                            KernelCommandsHelper<GfxFamily>::template getSizeRequiredForExecutionModel<IndirectHeap::SURFACE_STATE>(*(multiDispatchInfo.begin()->getKernel())) +
                                                KCH::getTotalSizeRequiredSSH(multiDispatchInfo),
                                            ssh);
        } else {
-            dsh = allocateIndirectHeap([&multiDispatchInfo] { return KCH::getTotalSizeRequiredDSH(multiDispatchInfo); });
+            commandQueue.allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, KCH::getTotalSizeRequiredDSH(multiDispatchInfo), dsh);
-            ioh = allocateIndirectHeap([&multiDispatchInfo] { return KCH::getTotalSizeRequiredIOH(multiDispatchInfo); });
+            commandQueue.allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, KCH::getTotalSizeRequiredIOH(multiDispatchInfo), ioh);
            commandQueue.allocateHeapMemory(IndirectHeap::SURFACE_STATE, KCH::getTotalSizeRequiredSSH(multiDispatchInfo), ssh);
        }
        ssh = allocateIndirectHeap([&multiDispatchInfo] { return KCH::getTotalSizeRequiredSSH(multiDispatchInfo); });
        using UniqueIH = std::unique_ptr<IndirectHeap>;
-        *blockedCommandsData = new KernelOperation(std::unique_ptr<LinearStream>(commandStream), UniqueIH(dsh), UniqueIH(ioh), UniqueIH(ssh));
+        *blockedCommandsData = new KernelOperation(std::unique_ptr<LinearStream>(commandStream), UniqueIH(dsh), UniqueIH(ioh), UniqueIH(ssh),
                                                   *commandQueue.getDevice().getMemoryManager());
        if (executionModelKernel) {
            (*blockedCommandsData)->doNotFreeISH = true;
        }
@@ -671,7 +678,9 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
    CommandQueue &commandQueue,
    DeviceQueueHw<GfxFamily> &devQueueHw,
    PreemptionMode preemptionMode,
-    SchedulerKernel &scheduler) {
+    SchedulerKernel &scheduler,
    IndirectHeap *ssh,
    IndirectHeap *dsh) {
    using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
    using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
@@ -679,13 +688,9 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
    using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;
    OCLRT::LinearStream *commandStream = nullptr;
-    OCLRT::IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
+    OCLRT::IndirectHeap *ioh = nullptr;
    commandStream = &commandQueue.getCS(0);
    // note : below code assumes that caller to dispatchScheduler "preallocated" memory
    //        required for execution model in below heap managers
    dsh = devQueueHw.getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
    ssh = &commandQueue.getIndirectHeap(IndirectHeap::SURFACE_STATE);
    bool dcFlush = false;
    commandQueue.getDevice().getCommandStreamReceiver().addPipeControl(*commandStream, dcFlush);
--- a/runtime/device_queue/device_queue.cpp
+++ b/runtime/device_queue/device_queue.cpp
@@ -156,12 +156,12 @@ void DeviceQueue::initDeviceQueue() {
    igilEventPool->m_size = caps.maxOnDeviceEvents;
 }
-void DeviceQueue::setupExecutionModelDispatch(IndirectHeap &surfaceStateHeap, Kernel *parentKernel, uint32_t parentCount, uint32_t taskCount, HwTimeStamps *hwTimeStamp) {
+void DeviceQueue::setupExecutionModelDispatch(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentCount, uint32_t taskCount, HwTimeStamps *hwTimeStamp) {
-    setupIndirectState(surfaceStateHeap, parentKernel, parentCount);
+    setupIndirectState(surfaceStateHeap, dynamicStateHeap, parentKernel, parentCount);
    addExecutionModelCleanUpSection(parentKernel, hwTimeStamp, taskCount);
 }
-void DeviceQueue::setupIndirectState(IndirectHeap &surfaceStateHeap, Kernel *parentKernel, uint32_t parentIDCount) {
+void DeviceQueue::setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount) {
    return;
 }
@@ -173,7 +173,7 @@ void DeviceQueue::resetDeviceQueue() {
    return;
 }
-void DeviceQueue::dispatchScheduler(CommandQueue &cmdQ, SchedulerKernel &scheduler, PreemptionMode preemptionMode) {
+void DeviceQueue::dispatchScheduler(CommandQueue &cmdQ, SchedulerKernel &scheduler, PreemptionMode preemptionMode, IndirectHeap *ssh, IndirectHeap *dsh) {
    return;
 }
--- a/runtime/device_queue/device_queue.h
+++ b/runtime/device_queue/device_queue.h
@@ -81,9 +81,9 @@ class DeviceQueue : public BaseObject<_device_queue> {
                               size_t paramValueSize, void *paramValue,
                               size_t *paramValueSizeRet);
-    void setupExecutionModelDispatch(IndirectHeap &surfaceStateHeap, Kernel *parentKernel, uint32_t parentCount, uint32_t taskCount, HwTimeStamps *hwTimeStamp);
+    void setupExecutionModelDispatch(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentCount, uint32_t taskCount, HwTimeStamps *hwTimeStamp);
-    virtual void setupIndirectState(IndirectHeap &surfaceStateHeap, Kernel *parentKernel, uint32_t parentIDCount);
+    virtual void setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount);
    virtual void addExecutionModelCleanUpSection(Kernel *parentKernel, HwTimeStamps *hwTimeStamp, uint32_t taskCount);
    MOCKABLE_VIRTUAL bool isEMCriticalSectionFree() {
@@ -93,7 +93,7 @@ class DeviceQueue : public BaseObject<_device_queue> {
    }
    virtual void resetDeviceQueue();
-    virtual void dispatchScheduler(CommandQueue &cmdQ, SchedulerKernel &scheduler, PreemptionMode preemptionMode);
+    virtual void dispatchScheduler(CommandQueue &cmdQ, SchedulerKernel &scheduler, PreemptionMode preemptionMode, IndirectHeap *ssh, IndirectHeap *dsh);
    virtual IndirectHeap *getIndirectHeap(IndirectHeap::Type type);
    void acquireEMCriticalSection() {
--- a/runtime/device_queue/device_queue_hw.h
+++ b/runtime/device_queue/device_queue_hw.h
@@ -72,11 +72,11 @@ class DeviceQueueHw : public DeviceQueue {
    size_t setSchedulerCrossThreadData(SchedulerKernel &scheduler);
-    void setupIndirectState(IndirectHeap &surfaceStateHeap, Kernel *parentKernel, uint32_t parentIDCount) override;
+    void setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount) override;
    void addExecutionModelCleanUpSection(Kernel *parentKernel, HwTimeStamps *hwTimeStamp, uint32_t taskCount) override;
    void resetDeviceQueue() override;
-    void dispatchScheduler(CommandQueue &cmdQ, SchedulerKernel &scheduler, PreemptionMode preemptionMode) override;
+    void dispatchScheduler(CommandQueue &cmdQ, SchedulerKernel &scheduler, PreemptionMode preemptionMode, IndirectHeap *ssh, IndirectHeap *dsh) override;
    uint32_t getSchedulerReturnInstance() {
        return igilQueue->m_controls.m_SchedulerEarlyReturn;
--- a/runtime/device_queue/device_queue_hw.inl
+++ b/runtime/device_queue/device_queue_hw.inl
@@ -290,11 +290,8 @@ IndirectHeap *DeviceQueueHw<GfxFamily>::getIndirectHeap(IndirectHeap::Type type)
 }
 template <typename GfxFamily>
-void DeviceQueueHw<GfxFamily>::setupIndirectState(IndirectHeap &surfaceStateHeap, Kernel *parentKernel, uint32_t parentIDCount) {
+void DeviceQueueHw<GfxFamily>::setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount) {
-    void *pDSH = dshBuffer->getUnderlyingBuffer();
+    void *pDSH = dynamicStateHeap.getCpuBase();
    // Heap and dshBuffer should be the same if heap is created
    DEBUG_BREAK_IF(!((heaps[IndirectHeap::DYNAMIC_STATE] == nullptr) || (heaps[IndirectHeap::DYNAMIC_STATE]->getCpuBase() == pDSH)));
    // Set scheduler ID to last entry in first table, it will have ID == 0, blocks will have following entries.
    auto igilCmdQueue = reinterpret_cast<IGIL_CommandQueue *>(queueBuffer->getUnderlyingBuffer());
@@ -386,11 +383,13 @@ size_t DeviceQueueHw<GfxFamily>::setSchedulerCrossThreadData(SchedulerKernel &sc
 }
 template <typename GfxFamily>
-void DeviceQueueHw<GfxFamily>::dispatchScheduler(CommandQueue &cmdQ, SchedulerKernel &scheduler, PreemptionMode preemptionMode) {
+void DeviceQueueHw<GfxFamily>::dispatchScheduler(CommandQueue &cmdQ, SchedulerKernel &scheduler, PreemptionMode preemptionMode, IndirectHeap *ssh, IndirectHeap *dsh) {
    GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(cmdQ,
                                                    *this,
                                                    preemptionMode,
-                                                    scheduler);
+                                                    scheduler,
                                                    ssh,
                                                    dsh);
    return;
 }
--- a/runtime/helpers/task_information.cpp
+++ b/runtime/helpers/task_information.cpp
@@ -28,6 +28,7 @@
 #include "runtime/device_queue/device_queue.h"
 #include "runtime/gtpin/gtpin_notify.h"
 #include "runtime/mem_obj/mem_obj.h"
 #include "runtime/memory_manager/memory_manager.h"
 #include "runtime/memory_manager/surface.h"
 #include "runtime/helpers/aligned_memory.h"
 #include "runtime/helpers/string.h"
@@ -35,13 +36,14 @@
 namespace OCLRT {
 KernelOperation::~KernelOperation() {
-    alignedFree(dsh->getCpuBase());
+    memoryManager.storeAllocation(std::unique_ptr<GraphicsAllocation>(dsh->getGraphicsAllocation()), REUSABLE_ALLOCATION);
-    if (doNotFreeISH) {
+    if (ioh.get() == dsh.get()) {
        ioh.release();
    } else {
        alignedFree(ioh->getCpuBase());
    }
-    alignedFree(ssh->getCpuBase());
+    if (ioh) {
        memoryManager.storeAllocation(std::unique_ptr<GraphicsAllocation>(ioh->getGraphicsAllocation()), REUSABLE_ALLOCATION);
    }
    memoryManager.storeAllocation(std::unique_ptr<GraphicsAllocation>(ssh->getGraphicsAllocation()), REUSABLE_ALLOCATION);
    alignedFree(commandStream->getCpuBase());
 }
@@ -163,43 +165,9 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
    //transfer the memory to commandStream of the queue.
    memcpy_s(pDst, commandsSize, commandStream.getCpuBase(), commandsSize);
-    size_t requestedDshSize = kernelOperation->dsh->getUsed();
+    IndirectHeap *dsh = kernelOperation->dsh.get();
-    size_t requestedIohSize = kernelOperation->ioh->getUsed();
+    IndirectHeap *ioh = kernelOperation->ioh.get();
-    size_t requestedSshSize = kernelOperation->ssh->getUsed() + kernelOperation->surfaceStateHeapSizeEM;
+    IndirectHeap *ssh = kernelOperation->ssh.get();
    IndirectHeap *dsh = nullptr;
    IndirectHeap *ioh = nullptr;
    IndirectHeap::Type trackedHeaps[] = {IndirectHeap::SURFACE_STATE, IndirectHeap::INDIRECT_OBJECT, IndirectHeap::DYNAMIC_STATE};
    for (auto trackedHeap = 0u; trackedHeap < ARRAY_COUNT(trackedHeaps); trackedHeap++) {
        if (commandQueue.getIndirectHeap(trackedHeaps[trackedHeap], 0).getUsed() > 0) {
            commandQueue.releaseIndirectHeap(trackedHeaps[trackedHeap]);
        }
    }
    if (executionModelKernel) {
        dsh = devQueue->getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
        // In ExecutionModel IOH is the same as DSH to eliminate StateBaseAddress reprogramming for scheduler kernel and blocks.
        ioh = dsh;
        memcpy_s(dsh->getSpace(0), dsh->getAvailableSpace(), ptrOffset(kernelOperation->dsh->getCpuBase(), devQueue->colorCalcStateSize), kernelOperation->dsh->getUsed() - devQueue->colorCalcStateSize);
        dsh->getSpace(kernelOperation->dsh->getUsed() - devQueue->colorCalcStateSize);
    } else {
        dsh = &commandQueue.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, requestedDshSize);
        ioh = &commandQueue.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, requestedIohSize);
        memcpy_s(dsh->getCpuBase(), requestedDshSize, kernelOperation->dsh->getCpuBase(), kernelOperation->dsh->getUsed());
        dsh->getSpace(requestedDshSize);
        memcpy_s(ioh->getCpuBase(), requestedIohSize, kernelOperation->ioh->getCpuBase(), kernelOperation->ioh->getUsed());
        ioh->getSpace(requestedIohSize);
    }
    IndirectHeap &ssh = commandQueue.getIndirectHeap(IndirectHeap::SURFACE_STATE, requestedSshSize);
    memcpy_s(ssh.getCpuBase(), requestedSshSize, kernelOperation->ssh->getCpuBase(), kernelOperation->ssh->getUsed());
    ssh.getSpace(kernelOperation->ssh->getUsed());
    auto requiresCoherency = false;
    for (auto &surface : surfaces) {
@@ -214,7 +182,7 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
    if (executionModelKernel) {
        uint32_t taskCount = commandStreamReceiver.peekTaskCount() + 1;
-        devQueue->setupExecutionModelDispatch(ssh, kernel, kernelCount, taskCount, timestamp);
+        devQueue->setupExecutionModelDispatch(*ssh, *dsh, kernel, kernelCount, taskCount, timestamp);
        BuiltIns &builtIns = BuiltIns::getInstance();
        SchedulerKernel &scheduler = builtIns.getSchedulerKernel(commandQueue.getContext());
@@ -223,16 +191,18 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
                          devQueue->getStackBuffer(),
                          devQueue->getEventPoolBuffer(),
                          devQueue->getSlbBuffer(),
-                          devQueue->getDshBuffer(),
+                          dsh->getGraphicsAllocation(),
                          kernel->getKernelReflectionSurface(),
                          devQueue->getQueueStorageBuffer(),
-                          ssh.getGraphicsAllocation(),
+                          ssh->getGraphicsAllocation(),
                          devQueue->getDebugQueue());
        devQueue->dispatchScheduler(
            commandQueue,
            scheduler,
-            preemptionMode);
+            preemptionMode,
            ssh,
            dsh);
        scheduler.makeResident(commandStreamReceiver);
@@ -261,14 +231,13 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
                                                      offset,
                                                      *dsh,
                                                      *ioh,
-                                                      ssh,
+                                                      *ssh,
                                                      taskLevel,
                                                      dispatchFlags);
    for (auto &surface : surfaces) {
        surface->setCompletionStamp(completionStamp, nullptr, nullptr);
    }
    commandQueue.waitUntilComplete(completionStamp.taskCount, completionStamp.flushStamp, false);
    if (printfHandler) {
        printfHandler.get()->printEnqueueOutput();
    }
--- a/runtime/helpers/task_information.h
+++ b/runtime/helpers/task_information.h
@@ -39,6 +39,7 @@ class MemObj;
 class Surface;
 class PrintfHandler;
 struct HwTimeStamps;
 class MemoryManager;
 enum MapOperationType {
    MAP,
@@ -77,10 +78,11 @@ class CommandMapUnmap : public Command {
 };
 struct KernelOperation {
-    KernelOperation(std::unique_ptr<LinearStream> commandStream, std::unique_ptr<IndirectHeap> dsh, std::unique_ptr<IndirectHeap> ioh, std::unique_ptr<IndirectHeap> ssh)
+    KernelOperation(std::unique_ptr<LinearStream> commandStream, std::unique_ptr<IndirectHeap> dsh, std::unique_ptr<IndirectHeap> ioh, std::unique_ptr<IndirectHeap> ssh,
                    MemoryManager &memoryManager)
        : commandStream(std::move(commandStream)), dsh(std::move(dsh)),
          ioh(std::move(ioh)), ssh(std::move(ssh)),
-          surfaceStateHeapSizeEM(0), doNotFreeISH(false) {
+          surfaceStateHeapSizeEM(0), doNotFreeISH(false), memoryManager(memoryManager) {
    }
    ~KernelOperation();
@@ -92,6 +94,7 @@ struct KernelOperation {
    size_t surfaceStateHeapSizeEM;
    bool doNotFreeISH;
    MemoryManager &memoryManager;
 };
 class CommandComputeKernel : public Command {
--- a/unit_tests/command_queue/command_queue_hw_tests.cpp
+++ b/unit_tests/command_queue/command_queue_hw_tests.cpp
@@ -392,95 +392,55 @@ HWTEST_F(CommandQueueHwTest, GivenNotCompleteUserEventPassedToEnqueueWhenEventIs
 }
 typedef CommandQueueHwTest BlockedCommandQueueTest;
 HWTEST_F(BlockedCommandQueueTest, givenCommandQueueWhichHasSomeUsedHeapsWhenBlockedCommandIsBeingSubmittedItReloadsThemToZeroToKeepProperOffsets) {
    DebugManagerStateRestore debugStateRestore;
    bool oldMemsetAllocationsFlag = MemoryManagement::memsetNewAllocations;
    MemoryManagement::memsetNewAllocations = true;
    DebugManager.flags.ForcePreemptionMode.set(-1); // allow default preemption mode
    auto deviceWithDefaultPreemptionMode = std::unique_ptr<MockDevice>(DeviceHelper<>::create(nullptr));
    this->pDevice->setPreemptionMode(deviceWithDefaultPreemptionMode->getPreemptionMode());
    this->pDevice->getCommandStreamReceiver().setPreemptionCsrAllocation(deviceWithDefaultPreemptionMode->getPreemptionAllocation());
    DebugManager.flags.DisableResourceRecycling.set(true);
 HWTEST_F(BlockedCommandQueueTest, givenCommandQueueWhenBlockedCommandIsBeingSubmittedThenQueueHeapsAreNotUsed) {
    UserEvent userEvent(context);
    cl_event blockedEvent = &userEvent;
    MockKernelWithInternals mockKernelWithInternals(*pDevice);
    mockKernelWithInternals.kernelHeader.KernelHeapSize = sizeof(mockKernelWithInternals.kernelIsa);
    auto mockKernel = mockKernelWithInternals.mockKernel;
    IndirectHeap::Type heaps[] = {IndirectHeap::INDIRECT_OBJECT, IndirectHeap::DYNAMIC_STATE, IndirectHeap::SURFACE_STATE};
    size_t prealocatedHeapSize = 2 * 64 * KB;
    for (auto heapType : heaps) {
        auto &heap = pCmdQ->getIndirectHeap(heapType, prealocatedHeapSize);
        heap.getSpace(16);
        memset(heap.getCpuBase(), 0, prealocatedHeapSize);
    }
    // preallocating memsetted allocations to get predictable results
    pCmdQ->getDevice().getMemoryManager()->cleanAllocationList(-1, REUSABLE_ALLOCATION);
    DebugManager.flags.DisableResourceRecycling.set(false);
    std::set<void *> reusableHeaps;
    for (unsigned int i = 0; i < 4; ++i) {
        auto allocSize = prealocatedHeapSize;
        void *mem = alignedMalloc(allocSize, 64);
        reusableHeaps.insert(mem);
        memset(mem, 0, allocSize);
        std::unique_ptr<GraphicsAllocation> reusableAlloc{new MockGraphicsAllocation(mem, allocSize)};
        pCmdQ->getDevice().getMemoryManager()->storeAllocation(std::move(reusableAlloc), REUSABLE_ALLOCATION);
    }
    // disable further allocation reuse
    DebugManager.flags.DisableResourceRecycling.set(true);
    size_t offset = 0;
    size_t size = 1;
-    pCmdQ->enqueueKernel(mockKernel, 1, &offset, &size, &size, 1, &blockedEvent, nullptr); // blocked command
+
    cl_event blockedEvent = &userEvent;
    pCmdQ->enqueueKernel(mockKernel, 1, &offset, &size, &size, 1, &blockedEvent, nullptr);
    userEvent.setStatus(CL_COMPLETE);
-    // make sure used heaps are from preallocated pool
+    auto &ioh = pCmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 4096u);
-    EXPECT_NE(reusableHeaps.end(), reusableHeaps.find(pCmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0).getCpuBase()));
+    auto &dsh = pCmdQ->getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 4096u);
-    EXPECT_NE(reusableHeaps.end(), reusableHeaps.find(pCmdQ->getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 0).getCpuBase()));
+    auto &ssh = pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, 4096u);
    EXPECT_NE(reusableHeaps.end(), reusableHeaps.find(pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, 0).getCpuBase()));
-    pCmdQ->getDevice().getMemoryManager()->cleanAllocationList(-1, REUSABLE_ALLOCATION);
+    EXPECT_EQ(0u, ioh.getUsed());
-    std::unordered_map<int, std::vector<char>> blockedCommandHeaps;
+    EXPECT_EQ(0u, dsh.getUsed());
-    int i = 0;
+    EXPECT_EQ(0u, ssh.getUsed());
-    for (auto heapType : heaps) {
+}
        auto &heap = pCmdQ->getIndirectHeap(heapType, 0);
        blockedCommandHeaps[static_cast<int>(heaps[i])].assign(reinterpret_cast<char *>(heap.getCpuBase()), reinterpret_cast<char *>(heap.getCpuBase()) + heap.getUsed());
-        // prepare new heaps for nonblocked command
+HWTEST_F(BlockedCommandQueueTest, givenCommandQueueWithUsedHeapsWhenBlockedCommandIsBeingSubmittedThenQueueHeapsAreNotUsed) {
-        pCmdQ->releaseIndirectHeap(heapType);
+    UserEvent userEvent(context);
-        ++i;
+    MockKernelWithInternals mockKernelWithInternals(*pDevice);
-    }
+    auto mockKernel = mockKernelWithInternals.mockKernel;
-    pCmdQ->enqueueKernel(mockKernel, 1, &offset, &size, &size, 0, nullptr, nullptr); // nonblocked command
+    size_t offset = 0;
-    i = 0;
+    size_t size = 1;
    std::unordered_map<int, std::vector<char>> nonblockedCommandHeaps;
    for (auto heapType : heaps) {
        auto &heap = pCmdQ->getIndirectHeap(heapType, 0);
        nonblockedCommandHeaps[static_cast<int>(heaps[i])].assign(reinterpret_cast<char *>(heap.getCpuBase()), reinterpret_cast<char *>(heap.getCpuBase()) + heap.getUsed());
        ++i;
    }
-    // expecting blocked command to be programmed indentically to a non-blocked counterpart
+    cl_event blockedEvent = &userEvent;
    EXPECT_THAT(nonblockedCommandHeaps[static_cast<int>(IndirectHeap::INDIRECT_OBJECT)],
                testing::ContainerEq(blockedCommandHeaps[static_cast<int>(IndirectHeap::INDIRECT_OBJECT)]));
    EXPECT_THAT(nonblockedCommandHeaps[static_cast<int>(IndirectHeap::DYNAMIC_STATE)],
                testing::ContainerEq(blockedCommandHeaps[static_cast<int>(IndirectHeap::DYNAMIC_STATE)]));
    EXPECT_THAT(nonblockedCommandHeaps[static_cast<int>(IndirectHeap::SURFACE_STATE)],
                testing::ContainerEq(blockedCommandHeaps[static_cast<int>(IndirectHeap::SURFACE_STATE)]));
-    for (auto ptr : reusableHeaps) {
+    auto &ioh = pCmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 4096u);
-        alignedFree(ptr);
+    auto &dsh = pCmdQ->getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 4096u);
-    }
+    auto &ssh = pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, 4096u);
-    BuiltIns::shutDown();
+    auto spaceToUse = 4u;
-    MemoryManagement::memsetNewAllocations = oldMemsetAllocationsFlag;
+
    ioh.getSpace(spaceToUse);
    dsh.getSpace(spaceToUse);
    ssh.getSpace(spaceToUse);
    pCmdQ->enqueueKernel(mockKernel, 1, &offset, &size, &size, 1, &blockedEvent, nullptr);
    userEvent.setStatus(CL_COMPLETE);
    EXPECT_EQ(spaceToUse, ioh.getUsed());
    EXPECT_EQ(spaceToUse, dsh.getUsed());
    EXPECT_EQ(spaceToUse, ssh.getUsed());
 }
 HWTEST_F(BlockedCommandQueueTest, givenCommandQueueWhichHasSomeUnusedHeapsWhenBlockedCommandIsBeingSubmittedThenThoseHeapsAreBeingUsed) {
--- a/unit_tests/command_queue/command_queue_tests.cpp
+++ b/unit_tests/command_queue/command_queue_tests.cpp
@@ -616,6 +616,33 @@ TEST_P(CommandQueueIndirectHeapTest, givenCommandQueueWhenGetIndirectHeapIsCalle
    EXPECT_EQ(GraphicsAllocation::ALLOCATION_TYPE_LINEAR_STREAM, indirectHeapAllocation->getAllocationType());
 }
 // Starting from a null heap pointer, allocateHeapMemory must hand back a heap object
 // that already has a graphics allocation attached.
 TEST_P(CommandQueueIndirectHeapTest, givenCommandQueueWhenGetHeapMemoryIsCalledThenHeapIsCreated) {
    const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, 0, 0};
    CommandQueue cmdQ(&context, pDevice, props);
    IndirectHeap *indirectHeap = nullptr;
    cmdQ.allocateHeapMemory(this->GetParam(), 100, indirectHeap);

    // Fatal check: the pointer is dereferenced below, so a null result must stop the
    // test here instead of crashing the whole test binary.
    ASSERT_NE(nullptr, indirectHeap);
    EXPECT_NE(nullptr, indirectHeap->getGraphicsAllocation());

    // The test owns both the allocation and the heap object, so release them explicitly.
    pDevice->getMemoryManager()->freeGraphicsMemory(indirectHeap->getGraphicsAllocation());
    delete indirectHeap;
 }
 // When the caller already has a heap object (here built over a null buffer),
 // allocateHeapMemory must keep that object and only attach a graphics allocation to it.
 TEST_P(CommandQueueIndirectHeapTest, givenCommandQueueWhenGetHeapMemoryIsCalledWithAlreadyAllocatedHeapThenGraphicsAllocationIsCreated) {
    const cl_queue_properties queueProperties[3] = {CL_QUEUE_PROPERTIES, 0, 0};
    CommandQueue queue(&context, pDevice, queueProperties);

    IndirectHeap existingHeap(nullptr, 100);
    IndirectHeap *heapUnderTest = &existingHeap;
    queue.allocateHeapMemory(this->GetParam(), 100, heapUnderTest);

    // The pointer must still refer to the caller's heap object.
    EXPECT_EQ(&existingHeap, heapUnderTest);
    EXPECT_NE(nullptr, heapUnderTest->getGraphicsAllocation());

    // Only the allocation needs freeing; the heap object itself lives on the stack.
    auto memoryManager = pDevice->getMemoryManager();
    memoryManager->freeGraphicsMemory(heapUnderTest->getGraphicsAllocation());
 }
 INSTANTIATE_TEST_CASE_P(
    Device,
    CommandQueueIndirectHeapTest,
--- a/unit_tests/command_queue/dispatch_walker_tests.cpp
+++ b/unit_tests/command_queue/dispatch_walker_tests.cpp
@@ -710,9 +710,9 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerShouldGetRequiredHeapSizesFromKernelW
    auto expectedSizeSSH = KernelCommandsHelper<FamilyType>::getSizeRequiredSSH(kernel);
    EXPECT_EQ(expectedSizeCS, blockedCommandsData->commandStream->getMaxAvailableSpace());
-    EXPECT_EQ(expectedSizeDSH, blockedCommandsData->dsh->getMaxAvailableSpace());
+    EXPECT_LE(expectedSizeDSH, blockedCommandsData->dsh->getMaxAvailableSpace());
-    EXPECT_EQ(expectedSizeIOH, blockedCommandsData->ioh->getMaxAvailableSpace());
+    EXPECT_LE(expectedSizeIOH, blockedCommandsData->ioh->getMaxAvailableSpace());
-    EXPECT_EQ(expectedSizeSSH, blockedCommandsData->ssh->getMaxAvailableSpace());
+    EXPECT_LE(expectedSizeSSH, blockedCommandsData->ssh->getMaxAvailableSpace());
    delete blockedCommandsData;
 }
@@ -745,9 +745,9 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerShouldGetRequiredHeapSizesFromMdiWhen
    auto expectedSizeSSH = KernelCommandsHelper<FamilyType>::getTotalSizeRequiredSSH(multiDispatchInfo);
    EXPECT_EQ(expectedSizeCS, blockedCommandsData->commandStream->getMaxAvailableSpace());
-    EXPECT_EQ(expectedSizeDSH, blockedCommandsData->dsh->getMaxAvailableSpace());
+    EXPECT_LE(expectedSizeDSH, blockedCommandsData->dsh->getMaxAvailableSpace());
-    EXPECT_EQ(expectedSizeIOH, blockedCommandsData->ioh->getMaxAvailableSpace());
+    EXPECT_LE(expectedSizeIOH, blockedCommandsData->ioh->getMaxAvailableSpace());
-    EXPECT_EQ(expectedSizeSSH, blockedCommandsData->ssh->getMaxAvailableSpace());
+    EXPECT_LE(expectedSizeSSH, blockedCommandsData->ssh->getMaxAvailableSpace());
    delete blockedCommandsData;
 }
--- a/unit_tests/device_queue/device_queue_hw_tests.cpp
+++ b/unit_tests/device_queue/device_queue_hw_tests.cpp
@@ -534,7 +534,7 @@ HWTEST_P(DeviceQueueHwWithKernel, setupIndirectState) {
        auto usedBeforeSSH = ssh->getUsed();
        auto usedBeforeDSH = dsh->getUsed();
-        devQueueHw->setupIndirectState(*ssh, pKernel, 1);
+        devQueueHw->setupIndirectState(*ssh, *dsh, pKernel, 1);
        auto usedAfterSSH = ssh->getUsed();
        auto usedAfterDSH = dsh->getUsed();
@@ -564,7 +564,7 @@ HWTEST_P(DeviceQueueHwWithKernel, setupIndirectStateSetsCorrectStartBlockID) {
        uint32_t parentCount = 4;
-        devQueueHw->setupIndirectState(*ssh, pKernel, parentCount);
+        devQueueHw->setupIndirectState(*ssh, *dsh, pKernel, parentCount);
        auto *igilQueue = reinterpret_cast<IGIL_CommandQueue *>(devQueueHw->getQueueBuffer()->getUnderlyingBuffer());
        EXPECT_EQ(parentCount, igilQueue->m_controls.m_StartBlockID);
@@ -594,7 +594,7 @@ HWTEST_P(DeviceQueueHwWithKernel, setupIndirectStateSetsCorrectDSHValues) {
        uint32_t parentCount = 1;
-        devQueueHw->setupIndirectState(*ssh, pKernel, parentCount);
+        devQueueHw->setupIndirectState(*ssh, *dsh, pKernel, parentCount);
        auto *igilQueue = reinterpret_cast<IGIL_CommandQueue *>(devQueueHw->getQueueBuffer()->getUnderlyingBuffer());
        EXPECT_EQ(igilQueue->m_controls.m_DynamicHeapStart, devQueueHw->offsetDsh + alignUp((uint32_t)pKernel->getDynamicStateHeapSize(), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE));
--- a/unit_tests/device_queue/device_queue_tests.cpp
+++ b/unit_tests/device_queue/device_queue_tests.cpp
@@ -40,7 +40,8 @@ TEST(DeviceQueueSimpleTest, setupExecutionModelDispatchDoesNothing) {
    size_t size = 20;
    IndirectHeap ssh(buffer, size);
-    devQueue.setupExecutionModelDispatch(ssh, nullptr, 0, 0, 0);
+    IndirectHeap dsh(buffer, size);
    devQueue.setupExecutionModelDispatch(ssh, dsh, nullptr, 0, 0, 0);
    EXPECT_EQ(0u, ssh.getUsed());
@@ -320,7 +321,7 @@ TEST_F(DeviceQueueTest, dispatchScheduler) {
    CommandQueue cmdQ(nullptr, nullptr, 0);
    KernelInfo info;
    MockSchedulerKernel *kernel = new MockSchedulerKernel(&program, info, *device);
-    devQueue.dispatchScheduler(cmdQ, *kernel, device->getPreemptionMode());
+    devQueue.dispatchScheduler(cmdQ, *kernel, device->getPreemptionMode(), nullptr, nullptr);
    delete kernel;
 }
--- a/unit_tests/event/event_tests.cpp
+++ b/unit_tests/event/event_tests.cpp
@@ -449,60 +449,18 @@ class SurfaceMock : public Surface {
    SurfaceMock(SurfaceMock *parent) : parent(parent){};
 };
 // Submits a CommandComputeKernel whose DSH/IOH are larger than the command queue's heaps
 // and checks that, after submit, the queue's heaps are at least as large as the command's.
 // NOTE(review): the enclosing hunk header (-449,60 +449,18) suggests this test is removed
 // by the patch and its +/- markers were lost — confirm before relying on it.
 TEST_F(InternalsEventTest, resizeCmdQueueHeapsWhenKernelOparationHeapsAreBigger) {
    CommandQueue *pCmdQ = new CommandQueue(mockContext, pDevice, 0);
    IndirectHeap &cmdQueueDsh = pCmdQ->getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 4096);
    IndirectHeap &cmdQueueIoh = pCmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 4096);
    IndirectHeap &cmdQueueSsh = pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, 4096);
    // Request twice the queue DSH capacity so the command's heaps exceed the queue's.
    auto requestedSize = cmdQueueDsh.getMaxAvailableSpace() * 2;
    auto cmdStream = new LinearStream(alignedMalloc(requestedSize, requestedSize), requestedSize);
    // Builds a heap of the given size and consumes all of its available space.
    auto createFullHeap = [](size_t size) {
        auto heap = new IndirectHeap(alignedMalloc(size, size), size);
        heap->getSpace(heap->getAvailableSpace());
        return heap;
    };
    auto dsh = createFullHeap(requestedSize);
    auto ioh = createFullHeap(requestedSize);
    auto ssh = createFullHeap(maxSshSize);
    using UniqueIH = std::unique_ptr<IndirectHeap>;
    // The raw heap/stream pointers are wrapped in unique_ptr when handed to KernelOperation.
    auto kernelOperation = new KernelOperation(std::unique_ptr<LinearStream>(cmdStream), UniqueIH(dsh),
                                               UniqueIH(ioh), UniqueIH(ssh));
    std::vector<Surface *> v;
    SurfaceMock *surface = new SurfaceMock;
    v.push_back(surface);
    PreemptionMode preemptionMode = pDevice->getPreemptionMode();
    auto cmdComputeKernel = new CommandComputeKernel(*pCmdQ, pDevice->getCommandStreamReceiver(),
                                                     std::unique_ptr<KernelOperation>(kernelOperation), v, false, false, false, nullptr, preemptionMode);
    // Precondition: before submit the queue DSH/IOH are smaller than the command's heaps.
    EXPECT_LT(cmdQueueDsh.getMaxAvailableSpace(), dsh->getMaxAvailableSpace());
    EXPECT_LT(cmdQueueIoh.getMaxAvailableSpace(), ioh->getMaxAvailableSpace());
    EXPECT_EQ(maxSshSize, ssh->getMaxAvailableSpace());
    cmdComputeKernel->submit(0, false);
    // After submit every queue heap must be at least as large as the corresponding command heap.
    EXPECT_GE(cmdQueueDsh.getMaxAvailableSpace(), dsh->getMaxAvailableSpace());
    EXPECT_GE(cmdQueueIoh.getMaxAvailableSpace(), ioh->getMaxAvailableSpace());
    EXPECT_GE(cmdQueueSsh.getMaxAvailableSpace(), ssh->getMaxAvailableSpace());
    delete pCmdQ;
    delete cmdComputeKernel;
 }
 TEST_F(InternalsEventTest, processBlockedCommandsKernelOperation) {
    MockEvent<Event> event(nullptr, CL_COMMAND_NDRANGE_KERNEL, 0, 0);
    CommandQueue *pCmdQ = new CommandQueue(mockContext, pDevice, 0);
    auto cmdStream = new LinearStream(alignedMalloc(4096, 4096), 4096);
-    auto dsh = new IndirectHeap(alignedMalloc(4096, 4096), 4096);
+    IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
-    auto ioh = new IndirectHeap(alignedMalloc(4096, 4096), 4096);
+    pCmdQ->allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, 4096u, dsh);
-    auto ssh = new IndirectHeap(alignedMalloc(4096, 4096), 4096);
+    pCmdQ->allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, 4096u, ioh);
    pCmdQ->allocateHeapMemory(IndirectHeap::SURFACE_STATE, 4096u, ssh);
    using UniqueIH = std::unique_ptr<IndirectHeap>;
    auto blockedCommandsData = new KernelOperation(std::unique_ptr<LinearStream>(cmdStream), UniqueIH(dsh),
-                                                   UniqueIH(ioh), UniqueIH(ssh));
+                                                   UniqueIH(ioh), UniqueIH(ssh), *pCmdQ->getDevice().getMemoryManager());
    auto &csr = pDevice->getCommandStreamReceiver();
    std::vector<Surface *> v;
@@ -534,12 +492,13 @@ TEST_F(InternalsEventTest, processBlockedCommandsAbortKernelOperation) {
    CommandQueue *pCmdQ = new CommandQueue(mockContext, pDevice, 0);
    auto cmdStream = new LinearStream(alignedMalloc(4096, 4096), 4096);
-    auto dsh = new IndirectHeap(alignedMalloc(4096, 4096), 4096);
+    IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
-    auto ioh = new IndirectHeap(alignedMalloc(4096, 4096), 4096);
+    pCmdQ->allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, 4096u, dsh);
-    auto ssh = new IndirectHeap(alignedMalloc(4096, 4096), 4096);
+    pCmdQ->allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, 4096u, ioh);
    pCmdQ->allocateHeapMemory(IndirectHeap::SURFACE_STATE, 4096u, ssh);
    using UniqueIH = std::unique_ptr<IndirectHeap>;
    auto blockedCommandsData = new KernelOperation(std::unique_ptr<LinearStream>(cmdStream), UniqueIH(dsh),
-                                                   UniqueIH(ioh), UniqueIH(ssh));
+                                                   UniqueIH(ioh), UniqueIH(ssh), *pCmdQ->getDevice().getMemoryManager());
    auto &csr = pDevice->getCommandStreamReceiver();
    std::vector<Surface *> v;
@@ -565,12 +524,13 @@ TEST_F(InternalsEventTest, givenBlockedKernelWithPrintfWhenSubmittedThenPrintOut
    CommandQueue *pCmdQ = new CommandQueue(mockContext, pDevice, 0);
    auto cmdStream = new LinearStream(alignedMalloc(4096, 4096), 4096);
-    auto dsh = new IndirectHeap(alignedMalloc(4096, 4096), 4096);
+    IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
-    auto ioh = new IndirectHeap(alignedMalloc(4096, 4096), 4096);
+    pCmdQ->allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, 4096u, dsh);
-    auto ssh = new IndirectHeap(alignedMalloc(4096, 4096), 4096);
+    pCmdQ->allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, 4096u, ioh);
    pCmdQ->allocateHeapMemory(IndirectHeap::SURFACE_STATE, 4096u, ssh);
    using UniqueIH = std::unique_ptr<IndirectHeap>;
    auto blockedCommandsData = new KernelOperation(std::unique_ptr<LinearStream>(cmdStream), UniqueIH(dsh),
-                                                   UniqueIH(ioh), UniqueIH(ssh));
+                                                   UniqueIH(ioh), UniqueIH(ssh), *pCmdQ->getDevice().getMemoryManager());
    SPatchAllocateStatelessPrintfSurface *pPrintfSurface = new SPatchAllocateStatelessPrintfSurface();
    pPrintfSurface->DataParamOffset = 0;
@@ -1477,12 +1437,13 @@ HWTEST_F(InternalsEventTest, givenAbortedCommandWhenSubmitCalledThenDontUpdateFl
    csr.flushStamp->setStamp(5);
    auto cmdStream = new LinearStream(alignedMalloc(4096, 4096), 4096);
-    auto dsh = new IndirectHeap(alignedMalloc(4096, 4096), 4096);
+    IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
-    auto ioh = new IndirectHeap(alignedMalloc(4096, 4096), 4096);
+    pCmdQ->allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, 4096u, dsh);
-    auto ssh = new IndirectHeap(alignedMalloc(4096, 4096), 4096);
+    pCmdQ->allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, 4096u, ioh);
    pCmdQ->allocateHeapMemory(IndirectHeap::SURFACE_STATE, 4096u, ssh);
    using UniqueIH = std::unique_ptr<IndirectHeap>;
    auto blockedCommandsData = new KernelOperation(std::unique_ptr<LinearStream>(cmdStream), UniqueIH(dsh),
-                                                   UniqueIH(ioh), UniqueIH(ssh));
+                                                   UniqueIH(ioh), UniqueIH(ssh), *pCmdQ->getDevice().getMemoryManager());
    PreemptionMode preemptionMode = pDevice->getPreemptionMode();
    std::vector<Surface *> v;
    auto cmd = new CommandComputeKernel(*pCmdQ, csr, std::unique_ptr<KernelOperation>(blockedCommandsData), v, false, false, false, nullptr, preemptionMode);
--- a/unit_tests/execution_model/scheduler_dispatch_tests.cpp
+++ b/unit_tests/execution_model/scheduler_dispatch_tests.cpp
@@ -76,7 +76,9 @@ HWTEST_F(ExecutionModelSchedulerFixture, dispatchScheduler) {
            *pCmdQ,
            *pDevQueueHw,
            pDevice->getPreemptionMode(),
-            scheduler);
+            scheduler,
            &pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE),
            pDevQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE));
        EXPECT_EQ(0u, *scheduler.globalWorkOffsetX);
        EXPECT_EQ(0u, *scheduler.globalWorkOffsetY);
@@ -192,7 +194,9 @@ HWTEST_F(ExecutionModelSchedulerFixture, dispatchSchedulerDoesNotUseStandardCmdQ
            *pCmdQ,
            *pDevQueueHw,
            pDevice->getPreemptionMode(),
-            scheduler);
+            scheduler,
            &pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE),
            pDevQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE));
        auto &ioh = pCmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT);
@@ -223,7 +227,9 @@ HWTEST_F(ParentKernelCommandQueueFixture, dispatchSchedulerWithEarlyReturnSetToF
            *pCmdQ,
            mockDevQueue,
            device->getPreemptionMode(),
-            scheduler);
+            scheduler,
            &pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE),
            mockDevQueue.getIndirectHeap(IndirectHeap::DYNAMIC_STATE));
        HardwareParse hwParser;
        hwParser.parseCommands<FamilyType>(commandStream, 0);
--- a/unit_tests/execution_model/submit_blocked_parent_kernel_tests.cpp
+++ b/unit_tests/execution_model/submit_blocked_parent_kernel_tests.cpp
@@ -20,6 +20,7 @@
 * OTHER DEALINGS IN THE SOFTWARE.
 */
 #include "runtime/command_queue/gpgpu_walker.h"
 #include "runtime/event/hw_timestamps.h"
 #include "runtime/helpers/kernel_commands.h"
 #include "runtime/helpers/task_information.h"
@@ -65,18 +66,18 @@ class MockDeviceQueueHwWithCriticalSectionRelease : public DeviceQueueHw<GfxFami
        return igilCmdQueue->m_controls.m_CriticalSection == DeviceQueueHw<GfxFamily>::ExecutionModelCriticalSection::Free;
    }
-    void setupIndirectState(IndirectHeap &surfaceStateHeap, Kernel *parentKernel, uint32_t parentIDCount) override {
+    void setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount) override {
        indirectStateSetup = true;
-        return BaseClass::setupIndirectState(surfaceStateHeap, parentKernel, parentIDCount);
+        return BaseClass::setupIndirectState(surfaceStateHeap, dynamicStateHeap, parentKernel, parentIDCount);
    }
    // Test hook: records the timestamp pointer it was given and that the clean-up section
    // was requested, then delegates to the base-class implementation.
    void addExecutionModelCleanUpSection(Kernel *parentKernel, HwTimeStamps *hwTimeStamp, uint32_t taskCount) override {
        timestampAddedInCleanupSection = hwTimeStamp;
        cleanupSectionAdded = true;
        BaseClass::addExecutionModelCleanUpSection(parentKernel, hwTimeStamp, taskCount);
    }
-    void dispatchScheduler(CommandQueue &cmdQ, SchedulerKernel &scheduler, PreemptionMode preemptionMode) override {
+    void dispatchScheduler(CommandQueue &cmdQ, SchedulerKernel &scheduler, PreemptionMode preemptionMode, IndirectHeap *ssh, IndirectHeap *dsh) override {
        schedulerDispatched = true;
-        return BaseClass::dispatchScheduler(cmdQ, scheduler, preemptionMode);
+        return BaseClass::dispatchScheduler(cmdQ, scheduler, preemptionMode, ssh, dsh);
    }
    uint32_t criticalSectioncheckCounter = 0;
@@ -98,17 +99,22 @@ HWTEST_F(ParentKernelCommandQueueFixture, givenLockedEMcritcalSectionWhenParentK
        mockDevQueue.acquireEMCriticalSection();
        size_t heapSize = 20;
-        size_t alignement = 64;
+        size_t dshSize = mockDevQueue.getDshBuffer()->getUnderlyingBufferSize();
-        size_t dshSize = heapSize + mockDevQueue.getDshOffset();
+
-        IndirectHeap *dsh = new IndirectHeap(alignedMalloc(dshSize, alignement), dshSize);
+        IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
        pCmdQ->allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, dshSize, dsh);
        pCmdQ->allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, heapSize, ioh);
        pCmdQ->allocateHeapMemory(IndirectHeap::SURFACE_STATE, heapSize, ssh);
        dsh->getSpace(mockDevQueue.getDshOffset());
        size_t minSizeSSHForEM = KernelCommandsHelper<FamilyType>::template getSizeRequiredForExecutionModel<IndirectHeap::SURFACE_STATE>(*parentKernel);
        KernelOperation *blockedCommandData = new KernelOperation(std::unique_ptr<LinearStream>(new LinearStream()),
                                                                  std::unique_ptr<IndirectHeap>(dsh),
-                                                                  std::unique_ptr<IndirectHeap>(new IndirectHeap(alignedMalloc(heapSize, alignement), heapSize)),
+                                                                  std::unique_ptr<IndirectHeap>(ioh),
-                                                                  std::unique_ptr<IndirectHeap>(new IndirectHeap(alignedMalloc(heapSize, alignement), heapSize)));
+                                                                  std::unique_ptr<IndirectHeap>(ssh),
                                                                  *pCmdQ->getDevice().getMemoryManager());
        blockedCommandData->surfaceStateHeapSizeEM = minSizeSSHForEM;
        PreemptionMode preemptionMode = device->getPreemptionMode();
@@ -124,7 +130,7 @@ HWTEST_F(ParentKernelCommandQueueFixture, givenLockedEMcritcalSectionWhenParentK
    }
 }
-HWTEST_F(ParentKernelCommandQueueFixture, givenParentKernelWhenCommandIsSubmittedThenDeviceQueueDshIsUsed) {
+HWTEST_F(ParentKernelCommandQueueFixture, givenParentKernelWhenCommandIsSubmittedThenPassedDshIsUsed) {
    if (device->getSupportedClVersion() >= 20) {
        cl_queue_properties properties[3] = {0};
        MockParentKernel *parentKernel = MockParentKernel::create(*device);
@@ -135,14 +141,19 @@ HWTEST_F(ParentKernelCommandQueueFixture, givenParentKernelWhenCommandIsSubmitte
        auto *dshOfDevQueue = mockDevQueue.getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
        size_t heapSize = 20;
-        size_t alignement = 64;
+        size_t dshSize = mockDevQueue.getDshBuffer()->getUnderlyingBufferSize();
-        size_t dshSize = heapSize + mockDevQueue.getDshOffset();
+
-        IndirectHeap *dsh = new IndirectHeap(alignedMalloc(dshSize, alignement), dshSize);
+        IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
        pCmdQ->allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, dshSize, dsh);
        pCmdQ->allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, heapSize, ioh);
        pCmdQ->allocateHeapMemory(IndirectHeap::SURFACE_STATE, heapSize, ssh);
        // add initial offset of colorCalState
        dsh->getSpace(DeviceQueue::colorCalcStateSize);
        uint64_t ValueToFillDsh = 5;
        uint64_t *dshVal = (uint64_t *)dsh->getSpace(sizeof(uint64_t));
        // Fill Interface Descriptor Data
        *dshVal = ValueToFillDsh;
@@ -155,15 +166,15 @@ HWTEST_F(ParentKernelCommandQueueFixture, givenParentKernelWhenCommandIsSubmitte
        *dshVal = ValueToFillDsh;
        size_t usedDSHBeforeSubmit = dshOfDevQueue->getUsed();
        uint64_t *devQueueDshValue = (uint64_t *)dshOfDevQueue->getSpace(0);
        uint32_t colorCalcSizeDevQueue = DeviceQueue::colorCalcStateSize;
        EXPECT_EQ(colorCalcSizeDevQueue, usedDSHBeforeSubmit);
        KernelOperation *blockedCommandData = new KernelOperation(std::unique_ptr<LinearStream>(new LinearStream()),
                                                                  std::unique_ptr<IndirectHeap>(dsh),
-                                                                  std::unique_ptr<IndirectHeap>(new IndirectHeap(alignedMalloc(heapSize, alignement), heapSize)),
+                                                                  std::unique_ptr<IndirectHeap>(ioh),
-                                                                  std::unique_ptr<IndirectHeap>(new IndirectHeap(alignedMalloc(heapSize, alignement), heapSize)));
+                                                                  std::unique_ptr<IndirectHeap>(ssh),
                                                                  *pCmdQ->getDevice().getMemoryManager());
        size_t minSizeSSHForEM = KernelCommandsHelper<FamilyType>::template getSizeRequiredForExecutionModel<IndirectHeap::SURFACE_STATE>(*parentKernel);
@@ -175,13 +186,9 @@ HWTEST_F(ParentKernelCommandQueueFixture, givenParentKernelWhenCommandIsSubmitte
        cmdComputeKernel->submit(0, false);
        //device queue dsh is not changed
        size_t usedDSHAfterSubmit = dshOfDevQueue->getUsed();
-
+        // Compare with the value captured before submit; comparing the post-submit value with itself can never fail.
+        EXPECT_EQ(usedDSHBeforeSubmit, usedDSHAfterSubmit);
        EXPECT_EQ(mockDevQueue.getDshOffset() + sizeof(uint64_t), usedDSHAfterSubmit);
        EXPECT_EQ(ValueToFillDsh, *devQueueDshValue);
        uint64_t *devQueueDshParent = (uint64_t *)ptrOffset((char *)dshOfDevQueue->getCpuBase(), mockDevQueue.getDshOffset());
        EXPECT_EQ(ValueToFillDsh, *devQueueDshParent);
        delete cmdComputeKernel;
        delete parentKernel;
@@ -197,15 +204,20 @@ HWTEST_F(ParentKernelCommandQueueFixture, givenParentKernelWhenCommandIsSubmitte
        context->setDefaultDeviceQueue(&mockDevQueue);
        size_t heapSize = 20;
-        size_t alignement = 64;
+        size_t dshSize = mockDevQueue.getDshBuffer()->getUnderlyingBufferSize();
-        size_t dshSize = heapSize + mockDevQueue.getDshOffset();
+
-        IndirectHeap *dsh = new IndirectHeap(alignedMalloc(dshSize, alignement), dshSize);
+        IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
        pCmdQ->allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, dshSize, dsh);
        pCmdQ->allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, heapSize, ioh);
        pCmdQ->allocateHeapMemory(IndirectHeap::SURFACE_STATE, heapSize, ssh);
        dsh->getSpace(mockDevQueue.getDshOffset());
        KernelOperation *blockedCommandData = new KernelOperation(std::unique_ptr<LinearStream>(new LinearStream()),
                                                                  std::unique_ptr<IndirectHeap>(dsh),
-                                                                  std::unique_ptr<IndirectHeap>(new IndirectHeap(alignedMalloc(heapSize, alignement), heapSize)),
+                                                                  std::unique_ptr<IndirectHeap>(ioh),
-                                                                  std::unique_ptr<IndirectHeap>(new IndirectHeap(alignedMalloc(heapSize, alignement), heapSize)));
+                                                                  std::unique_ptr<IndirectHeap>(ssh),
                                                                  *pCmdQ->getDevice().getMemoryManager());
        size_t minSizeSSHForEM = KernelCommandsHelper<FamilyType>::template getSizeRequiredForExecutionModel<IndirectHeap::SURFACE_STATE>(*parentKernel);
@@ -234,15 +246,18 @@ HWTEST_F(ParentKernelCommandQueueFixture, givenBlockedParentKernelWithProfilingW
        context->setDefaultDeviceQueue(&mockDevQueue);
        size_t heapSize = 20;
-        size_t alignement = 64;
+        size_t dshSize = mockDevQueue.getDshBuffer()->getUnderlyingBufferSize();
-        size_t dshSize = heapSize + mockDevQueue.getDshOffset();
+        IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
-        IndirectHeap *dsh = new IndirectHeap(alignedMalloc(dshSize, alignement), dshSize);
+        pCmdQ->allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, dshSize, dsh);
        pCmdQ->allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, heapSize, ioh);
        pCmdQ->allocateHeapMemory(IndirectHeap::SURFACE_STATE, heapSize, ssh);
        dsh->getSpace(mockDevQueue.getDshOffset());
        KernelOperation *blockedCommandData = new KernelOperation(std::unique_ptr<LinearStream>(new LinearStream()),
                                                                  std::unique_ptr<IndirectHeap>(dsh),
-                                                                  std::unique_ptr<IndirectHeap>(new IndirectHeap(alignedMalloc(heapSize, alignement), heapSize)),
+                                                                  std::unique_ptr<IndirectHeap>(ioh),
-                                                                  std::unique_ptr<IndirectHeap>(new IndirectHeap(alignedMalloc(heapSize, alignement), heapSize)));
+                                                                  std::unique_ptr<IndirectHeap>(ssh),
                                                                  *pCmdQ->getDevice().getMemoryManager());
        size_t minSizeSSHForEM = KernelCommandsHelper<FamilyType>::template getSizeRequiredForExecutionModel<IndirectHeap::SURFACE_STATE>(*parentKernel);
@@ -274,15 +289,19 @@ HWTEST_F(ParentKernelCommandQueueFixture, givenParentKernelWhenCommandIsSubmitte
        context->setDefaultDeviceQueue(&mockDevQueue);
        size_t heapSize = 20;
-        size_t alignement = 64;
+        size_t dshSize = mockDevQueue.getDshBuffer()->getUnderlyingBufferSize();
-        size_t dshSize = heapSize + mockDevQueue.getDshOffset();
+
-        IndirectHeap *dsh = new IndirectHeap(alignedMalloc(dshSize, alignement), dshSize);
+        IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
        pCmdQ->allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, dshSize, dsh);
        pCmdQ->allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, heapSize, ioh);
        pCmdQ->allocateHeapMemory(IndirectHeap::SURFACE_STATE, heapSize, ssh);
        dsh->getSpace(mockDevQueue.getDshOffset());
        KernelOperation *blockedCommandData = new KernelOperation(std::unique_ptr<LinearStream>(new LinearStream()),
                                                                  std::unique_ptr<IndirectHeap>(dsh),
-                                                                  std::unique_ptr<IndirectHeap>(new IndirectHeap(alignedMalloc(heapSize, alignement), heapSize)),
+                                                                  std::unique_ptr<IndirectHeap>(ioh),
-                                                                  std::unique_ptr<IndirectHeap>(new IndirectHeap(alignedMalloc(heapSize, alignement), heapSize)));
+                                                                  std::unique_ptr<IndirectHeap>(ssh),
                                                                  *pCmdQ->getDevice().getMemoryManager());
        size_t minSizeSSHForEM = KernelCommandsHelper<FamilyType>::template getSizeRequiredForExecutionModel<IndirectHeap::SURFACE_STATE>(*parentKernel);
@@ -301,7 +320,7 @@ HWTEST_F(ParentKernelCommandQueueFixture, givenParentKernelWhenCommandIsSubmitte
    }
 }
-HWTEST_F(ParentKernelCommandQueueFixture, givenUsedSSHWhenParentKernelIsSubmittedThenNewSSHIsAllocated) {
+HWTEST_F(ParentKernelCommandQueueFixture, givenUsedCommandQueueHeapshenParentKernelIsSubmittedThenQueueHeapsAreNotUsed) {
    if (device->getSupportedClVersion() >= 20) {
        cl_queue_properties properties[3] = {0};
        MockParentKernel *parentKernel = MockParentKernel::create(*device);
@@ -314,20 +333,30 @@ HWTEST_F(ParentKernelCommandQueueFixture, givenUsedSSHWhenParentKernelIsSubmitte
        size_t minSizeSSHForEM = KernelCommandsHelper<FamilyType>::template getSizeRequiredForExecutionModel<IndirectHeap::SURFACE_STATE>(*parentKernel);
        size_t heapSize = 20;
        size_t alignement = 64;
-        size_t dshSize = heapSize + mockDevQueue.getDshOffset();
+        size_t dshSize = mockDevQueue.getDshBuffer()->getUnderlyingBufferSize();
-        IndirectHeap *dsh = new IndirectHeap(alignedMalloc(dshSize, alignement), dshSize);
+        IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
        pCmdQ->allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, dshSize, dsh);
        pCmdQ->allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, heapSize, ioh);
        pCmdQ->allocateHeapMemory(IndirectHeap::SURFACE_STATE, heapSize, ssh);
        dsh->getSpace(mockDevQueue.getDshOffset());
-        cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 100);
+        auto &queueSsh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 100);
-        // use some SSH
+        auto &queueDsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 100);
-        cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE).getSpace(4);
+        auto &queueIoh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 100);
        size_t usedSize = 4u;
        queueSsh.getSpace(usedSize);
        queueDsh.getSpace(usedSize);
        queueIoh.getSpace(usedSize);
        KernelOperation *blockedCommandData = new KernelOperation(std::unique_ptr<LinearStream>(new LinearStream()),
                                                                  std::unique_ptr<IndirectHeap>(dsh),
-                                                                  std::unique_ptr<IndirectHeap>(new IndirectHeap(alignedMalloc(heapSize, alignement), heapSize)),
+                                                                  std::unique_ptr<IndirectHeap>(ioh),
-                                                                  std::unique_ptr<IndirectHeap>(new IndirectHeap(alignedMalloc(heapSize, alignement), heapSize)));
+                                                                  std::unique_ptr<IndirectHeap>(ssh),
                                                                  *pCmdQ->getDevice().getMemoryManager());
        blockedCommandData->surfaceStateHeapSizeEM = minSizeSSHForEM;
        PreemptionMode preemptionMode = device->getPreemptionMode();
@@ -337,7 +366,10 @@ HWTEST_F(ParentKernelCommandQueueFixture, givenUsedSSHWhenParentKernelIsSubmitte
        cmdComputeKernel->submit(0, false);
-        EXPECT_TRUE(cmdQ.releaseIndirectHeapCalled);
+        EXPECT_FALSE(cmdQ.releaseIndirectHeapCalled);
        EXPECT_EQ(usedSize, queueDsh.getUsed());
        EXPECT_EQ(usedSize, queueIoh.getUsed());
        EXPECT_EQ(usedSize, queueSsh.getUsed());
        delete cmdComputeKernel;
        delete parentKernel;
@@ -355,14 +387,14 @@ HWTEST_F(ParentKernelCommandQueueFixture, givenNotUsedSSHWhenParentKernelIsSubmi
        size_t minSizeSSHForEM = KernelCommandsHelper<FamilyType>::template getSizeRequiredForExecutionModel<IndirectHeap::SURFACE_STATE>(*parentKernel);
        size_t heapSize = 20;
        size_t alignement = 64;
        size_t dshSize = heapSize + mockDevQueue.getDshOffset();
        IndirectHeap *dsh = new IndirectHeap(alignedMalloc(dshSize, alignement), dshSize);
        dsh->getSpace(mockDevQueue.getDshOffset());
        size_t dshSize = mockDevQueue.getDshBuffer()->getUnderlyingBufferSize();
        size_t sshSize = 1000;
-        IndirectHeap *ssh = new IndirectHeap(alignedMalloc(sshSize, 4096), sshSize);
+        IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
        pCmdQ->allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, dshSize, dsh);
        pCmdQ->allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, heapSize, ioh);
        pCmdQ->allocateHeapMemory(IndirectHeap::SURFACE_STATE, sshSize, ssh);
        dsh->getSpace(mockDevQueue.getDshOffset());
        EXPECT_EQ(0u, ssh->getUsed());
@@ -372,8 +404,9 @@ HWTEST_F(ParentKernelCommandQueueFixture, givenNotUsedSSHWhenParentKernelIsSubmi
        KernelOperation *blockedCommandData = new KernelOperation(std::unique_ptr<LinearStream>(new LinearStream()),
                                                                  std::unique_ptr<IndirectHeap>(dsh),
-                                                                  std::unique_ptr<IndirectHeap>(new IndirectHeap(alignedMalloc(heapSize, alignement), heapSize)),
+                                                                  std::unique_ptr<IndirectHeap>(ioh),
-                                                                  std::unique_ptr<IndirectHeap>(ssh));
+                                                                  std::unique_ptr<IndirectHeap>(ssh),
                                                                  *pCmdQ->getDevice().getMemoryManager());
        blockedCommandData->surfaceStateHeapSizeEM = minSizeSSHForEM;
        PreemptionMode preemptionMode = device->getPreemptionMode();
@@ -391,3 +424,43 @@ HWTEST_F(ParentKernelCommandQueueFixture, givenNotUsedSSHWhenParentKernelIsSubmi
        delete parentKernel;
    }
 }
 // Checks the heaps that dispatchWalker returns through blockedCommandsData for
 // a parent kernel:
 //  - the DSH reports as much available space as the device queue's DSH buffer,
 //  - DSH and IOH compare equal and share a single graphics allocation,
 //  - DSH, IOH and SSH are each backed by a graphics allocation.
 HWTEST_F(ParentKernelCommandQueueFixture, givenBlockedCommandQueueWhenDispatchWalkerIsCalledThenHeapsHaveProperSizes) {
    // The body only runs when the device reports CL version >= 20
    // (presumably OpenCL 2.0, i.e. device-side enqueue support - confirm).
    if (device->getSupportedClVersion() >= 20) {
        cl_queue_properties properties[3] = {0};
        std::unique_ptr<MockParentKernel> parentKernel(MockParentKernel::create(*device));
        MockDeviceQueueHw<FamilyType> mockDevQueue(context, device, properties[0]);
        parentKernel->createReflectionSurface();
        context->setDefaultDeviceQueue(&mockDevQueue);

        // Out-parameter filled in by dispatchWalker.
        KernelOperation *blockedCommandsData = nullptr;
        const size_t globalOffsets[3] = {0, 0, 0};
        const size_t workItems[3] = {1, 1, 1};
        // NOTE(review): the trailing `true` is presumably the "queue is blocked"
        // flag suggested by the test name - confirm against dispatchWalker's signature.
        GpgpuWalkerHelper<FamilyType>::dispatchWalker(*pCmdQ,
                                                      *parentKernel,
                                                      1,
                                                      globalOffsets,
                                                      workItems,
                                                      nullptr,
                                                      0,
                                                      nullptr,
                                                      &blockedCommandsData,
                                                      nullptr,
                                                      nullptr,
                                                      device->getPreemptionMode(),
                                                      true);

        // Take ownership immediately so the object is released on every exit
        // path, including the early return of a failed ASSERT_*.
        std::unique_ptr<KernelOperation> blockedCommandsOwner(blockedCommandsData);

        // Must be fatal: every check below dereferences the pointer, so a
        // non-fatal EXPECT_NE would let a null pointer crash the test binary
        // instead of reporting a failure.
        ASSERT_NE(nullptr, blockedCommandsData);

        EXPECT_EQ(blockedCommandsData->dsh->getMaxAvailableSpace(), mockDevQueue.getDshBuffer()->getUnderlyingBufferSize());
        EXPECT_EQ(blockedCommandsData->dsh, blockedCommandsData->ioh);
        EXPECT_NE(nullptr, blockedCommandsData->dsh->getGraphicsAllocation());
        EXPECT_NE(nullptr, blockedCommandsData->ioh->getGraphicsAllocation());
        EXPECT_NE(nullptr, blockedCommandsData->ssh->getGraphicsAllocation());
        EXPECT_EQ(blockedCommandsData->dsh->getGraphicsAllocation(), blockedCommandsData->ioh->getGraphicsAllocation());
    }
 }
--- a/unit_tests/gen8/scheduler_dispatch_tests.cpp
+++ b/unit_tests/gen8/scheduler_dispatch_tests.cpp
@@ -55,7 +55,9 @@ BDWTEST_F(BdwSchedulerTest, givenCallToDispatchSchedulerWhenPipeControlWithCSSta
            *pCmdQ,
            *pDevQueueHw,
            pDevice->getPreemptionMode(),
-            scheduler);
+            scheduler,
            &pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE),
            pDevQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE));
        HardwareParse hwParser;
        hwParser.parseCommands<FamilyType>(commandStream, 0);