[26/n] Internal 4GB allocator.

- Change the way blocked commands are handled.
- Instead of allocating a CPU pointer and populating it with commands, create a real IndirectHeap that can later be submitted to the GPU.
- This removes many of the copy operations that used to happen at submit time.
- For device enqueue, this requires the DSH and SSH to be passed directly to the underlying commands; in that scenario the device queue buffers are not used.

Change-Id: I1124a8edbb46777ea7f7d3a5946f302e7fdf9665
2025-12-19 06:24:51 +08:00 · 2018-04-05 15:12:28 +02:00
parent 100f559daa
commit ffa9b097f5
20 changed files with 331 additions and 319 deletions
--- a/runtime/command_queue/command_queue.cpp
+++ b/runtime/command_queue/command_queue.cpp
@@ -239,35 +239,7 @@ IndirectHeap &CommandQueue::getIndirectHeap(IndirectHeap::Type heapType,
    }

    if (!heapMemory) {
-        size_t reservedSize = 0;
-        auto finalHeapSize = defaultHeapSize;
-
-        minRequiredSize += reservedSize;
-
-        finalHeapSize = alignUp(std::max(finalHeapSize, minRequiredSize), MemoryConstants::pageSize);
-
-        heapMemory = memoryManager->obtainReusableAllocation(finalHeapSize).release();
-
-        if (!heapMemory) {
-            heapMemory = memoryManager->allocateGraphicsMemory(finalHeapSize, MemoryConstants::pageSize);
-        } else {
-            finalHeapSize = std::max(heapMemory->getUnderlyingBufferSize(), finalHeapSize);
-        }
-
-        heapMemory->setAllocationType(GraphicsAllocation::ALLOCATION_TYPE_LINEAR_STREAM);
-
-        if (IndirectHeap::SURFACE_STATE == heapType) {
-            DEBUG_BREAK_IF(minRequiredSize > maxSshSize);
-            finalHeapSize = maxSshSize;
-        }
-
-        if (heap) {
-            heap->replaceBuffer(heapMemory->getUnderlyingBuffer(), finalHeapSize);
-            heap->replaceGraphicsAllocation(heapMemory);
-        } else {
-            heap = new IndirectHeap(heapMemory);
-            heap->overrideMaxSize(finalHeapSize);
-        }
+        allocateHeapMemory(heapType, minRequiredSize, heap);
    }

    return *heap;
@@ -650,4 +622,37 @@ bool CommandQueue::setupDebugSurface(Kernel *kernel) {
    return true;
 }

+void CommandQueue::allocateHeapMemory(IndirectHeap::Type heapType,
+                                      size_t minRequiredSize, IndirectHeap *&indirectHeap) { // Backs indirectHeap with graphics memory; creates the heap when the pointer is null.
+    auto memoryManager = device->getMemoryManager();
+    size_t reservedSize = 0; // Currently nothing is added on top of the caller's request.
+    auto finalHeapSize = defaultHeapSize;
+
+    minRequiredSize += reservedSize;
+
+    finalHeapSize = alignUp(std::max(finalHeapSize, minRequiredSize), MemoryConstants::pageSize); // At least defaultHeapSize, then aligned to the page size with alignUp.
+
+    auto heapMemory = memoryManager->obtainReusableAllocation(finalHeapSize).release(); // Ask the memory manager for a reusable allocation first.
+
+    if (!heapMemory) {
+        heapMemory = memoryManager->allocateGraphicsMemory(finalHeapSize, MemoryConstants::pageSize); // Nothing to reuse: allocate new graphics memory.
+    } else {
+        finalHeapSize = std::max(heapMemory->getUnderlyingBufferSize(), finalHeapSize); // A reused allocation may be bigger than requested; expose all of it.
+    }
+
+    heapMemory->setAllocationType(GraphicsAllocation::ALLOCATION_TYPE_LINEAR_STREAM); // NOTE(review): no null check after allocateGraphicsMemory - confirm it cannot return nullptr here.
+
+    if (IndirectHeap::SURFACE_STATE == heapType) {
+        DEBUG_BREAK_IF(minRequiredSize > maxSshSize);
+        finalHeapSize = maxSshSize; // SSH size is always set to maxSshSize. NOTE(review): presumably the allocation is at least that large - confirm.
+    }
+
+    if (indirectHeap) {
+        indirectHeap->replaceBuffer(heapMemory->getUnderlyingBuffer(), finalHeapSize); // Existing heap object: point it at the new buffer and allocation.
+        indirectHeap->replaceGraphicsAllocation(heapMemory);
+    } else {
+        indirectHeap = new IndirectHeap(heapMemory); // No heap yet: create one; the caller's pointer is updated through the reference.
+        indirectHeap->overrideMaxSize(finalHeapSize);
+    }
+}
 } // namespace OCLRT
--- a/runtime/command_queue/command_queue.h
+++ b/runtime/command_queue/command_queue.h
@@ -336,6 +336,9 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
    IndirectHeap &getIndirectHeap(IndirectHeap::Type heapType,
                                  size_t minRequiredSize = 0u);

+    void allocateHeapMemory(IndirectHeap::Type heapType,
+                            size_t minRequiredSize, IndirectHeap *&indirectHeap);
+
    MOCKABLE_VIRTUAL void releaseIndirectHeap(IndirectHeap::Type heapType);

    cl_command_queue_properties getCommandQueueProperties() const {
--- a/runtime/command_queue/enqueue_common.h
+++ b/runtime/command_queue/enqueue_common.h
@@ -275,6 +275,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,

            uint32_t taskCount = commandStreamReceiver.peekTaskCount() + 1;
            devQueueHw->setupExecutionModelDispatch(getIndirectHeap(IndirectHeap::SURFACE_STATE, minSizeSSHForEM),
+                                                    *devQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE),
                                                    multiDispatchInfo.begin()->getKernel(),
                                                    (uint32_t)multiDispatchInfo.size(),
                                                    taskCount,
@@ -297,7 +298,9 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
                *this,
                *devQueueHw,
                preemption,
-                scheduler);
+                scheduler,
+                &getIndirectHeap(IndirectHeap::SURFACE_STATE),
+                devQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE));

            scheduler.makeResident(commandStreamReceiver);

--- a/runtime/command_queue/gpgpu_walker.h
+++ b/runtime/command_queue/gpgpu_walker.h
@@ -119,13 +119,6 @@ inline cl_uint computeDimensions(const size_t workItems[3]) {
    return (workItems[2] > 1) ? 3 : (workItems[1] > 1) ? 2 : 1;
 }

-template <typename SizeAndAllocCalcT, typename... CalcArgsT>
-IndirectHeap *allocateIndirectHeap(SizeAndAllocCalcT &&calc, CalcArgsT &&... args) {
-    size_t alignment = MemoryConstants::pageSize;
-    size_t size = calc(std::forward<CalcArgsT>(args)...);
-    return new IndirectHeap(alignedMalloc(size, alignment), size);
-}
-
 template <typename GfxFamily>
 class GpgpuWalkerHelper {
  public:
@@ -227,7 +220,9 @@ class GpgpuWalkerHelper {
        CommandQueue &commandQueue,
        DeviceQueueHw<GfxFamily> &devQueueHw,
        PreemptionMode preemptionMode,
-        SchedulerKernel &scheduler);
+        SchedulerKernel &scheduler,
+        IndirectHeap *ssh,
+        IndirectHeap *dsh);
 };

 template <typename GfxFamily, uint32_t eventType>
--- a/runtime/command_queue/gpgpu_walker.inl
+++ b/runtime/command_queue/gpgpu_walker.inl
@@ -458,20 +458,27 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchWalker(
        using KCH = KernelCommandsHelper<GfxFamily>;
        commandStream = new LinearStream(alignedMalloc(MemoryConstants::pageSize, MemoryConstants::pageSize), MemoryConstants::pageSize);
        if (executionModelKernel) {
-            uint32_t offsetDsh = commandQueue.getContext().getDefaultDeviceQueue()->getDshOffset();
            uint32_t colorCalcSize = commandQueue.getContext().getDefaultDeviceQueue()->colorCalcStateSize;

-            dsh = allocateIndirectHeap([&multiDispatchInfo, offsetDsh] { return KCH::getTotalSizeRequiredDSH(multiDispatchInfo) + KCH::getTotalSizeRequiredIOH(multiDispatchInfo) + offsetDsh; });
+            commandQueue.allocateHeapMemory(IndirectHeap::DYNAMIC_STATE,
+                                            commandQueue.getContext().getDefaultDeviceQueue()->getDshBuffer()->getUnderlyingBufferSize(),
+                                            dsh);
+
            dsh->getSpace(colorCalcSize);
            ioh = dsh;
+            commandQueue.allocateHeapMemory(IndirectHeap::SURFACE_STATE,
+                                            KernelCommandsHelper<GfxFamily>::template getSizeRequiredForExecutionModel<IndirectHeap::SURFACE_STATE>(*(multiDispatchInfo.begin()->getKernel())) +
+                                                KCH::getTotalSizeRequiredSSH(multiDispatchInfo),
+                                            ssh);
        } else {
-            dsh = allocateIndirectHeap([&multiDispatchInfo] { return KCH::getTotalSizeRequiredDSH(multiDispatchInfo); });
-            ioh = allocateIndirectHeap([&multiDispatchInfo] { return KCH::getTotalSizeRequiredIOH(multiDispatchInfo); });
+            commandQueue.allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, KCH::getTotalSizeRequiredDSH(multiDispatchInfo), dsh);
+            commandQueue.allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, KCH::getTotalSizeRequiredIOH(multiDispatchInfo), ioh);
+            commandQueue.allocateHeapMemory(IndirectHeap::SURFACE_STATE, KCH::getTotalSizeRequiredSSH(multiDispatchInfo), ssh);
        }

-        ssh = allocateIndirectHeap([&multiDispatchInfo] { return KCH::getTotalSizeRequiredSSH(multiDispatchInfo); });
        using UniqueIH = std::unique_ptr<IndirectHeap>;
-        *blockedCommandsData = new KernelOperation(std::unique_ptr<LinearStream>(commandStream), UniqueIH(dsh), UniqueIH(ioh), UniqueIH(ssh));
+        *blockedCommandsData = new KernelOperation(std::unique_ptr<LinearStream>(commandStream), UniqueIH(dsh), UniqueIH(ioh), UniqueIH(ssh),
+                                                   *commandQueue.getDevice().getMemoryManager());
        if (executionModelKernel) {
            (*blockedCommandsData)->doNotFreeISH = true;
        }
@@ -671,7 +678,9 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
    CommandQueue &commandQueue,
    DeviceQueueHw<GfxFamily> &devQueueHw,
    PreemptionMode preemptionMode,
-    SchedulerKernel &scheduler) {
+    SchedulerKernel &scheduler,
+    IndirectHeap *ssh,
+    IndirectHeap *dsh) {

    using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
    using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
@@ -679,13 +688,9 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
    using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;

    OCLRT::LinearStream *commandStream = nullptr;
-    OCLRT::IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
+    OCLRT::IndirectHeap *ioh = nullptr;

    commandStream = &commandQueue.getCS(0);
-    // note : below code assumes that caller to dispatchScheduler "preallocated" memory
-    //        required for execution model in below heap managers
-    dsh = devQueueHw.getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
-    ssh = &commandQueue.getIndirectHeap(IndirectHeap::SURFACE_STATE);

    bool dcFlush = false;
    commandQueue.getDevice().getCommandStreamReceiver().addPipeControl(*commandStream, dcFlush);
--- a/runtime/device_queue/device_queue.cpp
+++ b/runtime/device_queue/device_queue.cpp
@@ -156,12 +156,12 @@ void DeviceQueue::initDeviceQueue() {
    igilEventPool->m_size = caps.maxOnDeviceEvents;
 }

-void DeviceQueue::setupExecutionModelDispatch(IndirectHeap &surfaceStateHeap, Kernel *parentKernel, uint32_t parentCount, uint32_t taskCount, HwTimeStamps *hwTimeStamp) {
-    setupIndirectState(surfaceStateHeap, parentKernel, parentCount);
+void DeviceQueue::setupExecutionModelDispatch(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentCount, uint32_t taskCount, HwTimeStamps *hwTimeStamp) { // Both heaps are supplied by the caller.
+    setupIndirectState(surfaceStateHeap, dynamicStateHeap, parentKernel, parentCount); // Forward both heaps to the virtual indirect-state setup.
    addExecutionModelCleanUpSection(parentKernel, hwTimeStamp, taskCount);
 }

-void DeviceQueue::setupIndirectState(IndirectHeap &surfaceStateHeap, Kernel *parentKernel, uint32_t parentIDCount) {
+void DeviceQueue::setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount) { // Base-class version does nothing; DeviceQueueHw overrides it.
    return;
 }

@@ -173,7 +173,7 @@ void DeviceQueue::resetDeviceQueue() {
    return;
 }

-void DeviceQueue::dispatchScheduler(CommandQueue &cmdQ, SchedulerKernel &scheduler, PreemptionMode preemptionMode) {
+void DeviceQueue::dispatchScheduler(CommandQueue &cmdQ, SchedulerKernel &scheduler, PreemptionMode preemptionMode, IndirectHeap *ssh, IndirectHeap *dsh) { // Base-class version does nothing; DeviceQueueHw overrides it.
    return;
 }

--- a/runtime/device_queue/device_queue.h
+++ b/runtime/device_queue/device_queue.h
@@ -81,9 +81,9 @@ class DeviceQueue : public BaseObject<_device_queue> {
                               size_t paramValueSize, void *paramValue,
                               size_t *paramValueSizeRet);

-    void setupExecutionModelDispatch(IndirectHeap &surfaceStateHeap, Kernel *parentKernel, uint32_t parentCount, uint32_t taskCount, HwTimeStamps *hwTimeStamp);
+    void setupExecutionModelDispatch(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentCount, uint32_t taskCount, HwTimeStamps *hwTimeStamp);

-    virtual void setupIndirectState(IndirectHeap &surfaceStateHeap, Kernel *parentKernel, uint32_t parentIDCount);
+    virtual void setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount);
    virtual void addExecutionModelCleanUpSection(Kernel *parentKernel, HwTimeStamps *hwTimeStamp, uint32_t taskCount);

    MOCKABLE_VIRTUAL bool isEMCriticalSectionFree() {
@@ -93,7 +93,7 @@ class DeviceQueue : public BaseObject<_device_queue> {
    }

    virtual void resetDeviceQueue();
-    virtual void dispatchScheduler(CommandQueue &cmdQ, SchedulerKernel &scheduler, PreemptionMode preemptionMode);
+    virtual void dispatchScheduler(CommandQueue &cmdQ, SchedulerKernel &scheduler, PreemptionMode preemptionMode, IndirectHeap *ssh, IndirectHeap *dsh);
    virtual IndirectHeap *getIndirectHeap(IndirectHeap::Type type);

    void acquireEMCriticalSection() {
--- a/runtime/device_queue/device_queue_hw.h
+++ b/runtime/device_queue/device_queue_hw.h
@@ -72,11 +72,11 @@ class DeviceQueueHw : public DeviceQueue {

    size_t setSchedulerCrossThreadData(SchedulerKernel &scheduler);

-    void setupIndirectState(IndirectHeap &surfaceStateHeap, Kernel *parentKernel, uint32_t parentIDCount) override;
+    void setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount) override;

    void addExecutionModelCleanUpSection(Kernel *parentKernel, HwTimeStamps *hwTimeStamp, uint32_t taskCount) override;
    void resetDeviceQueue() override;
-    void dispatchScheduler(CommandQueue &cmdQ, SchedulerKernel &scheduler, PreemptionMode preemptionMode) override;
+    void dispatchScheduler(CommandQueue &cmdQ, SchedulerKernel &scheduler, PreemptionMode preemptionMode, IndirectHeap *ssh, IndirectHeap *dsh) override;

    uint32_t getSchedulerReturnInstance() {
        return igilQueue->m_controls.m_SchedulerEarlyReturn;
--- a/runtime/device_queue/device_queue_hw.inl
+++ b/runtime/device_queue/device_queue_hw.inl
@@ -290,11 +290,8 @@ IndirectHeap *DeviceQueueHw<GfxFamily>::getIndirectHeap(IndirectHeap::Type type)
 }

 template <typename GfxFamily>
-void DeviceQueueHw<GfxFamily>::setupIndirectState(IndirectHeap &surfaceStateHeap, Kernel *parentKernel, uint32_t parentIDCount) {
-    void *pDSH = dshBuffer->getUnderlyingBuffer();
-
-    // Heap and dshBuffer shoud be the same if heap is created
-    DEBUG_BREAK_IF(!((heaps[IndirectHeap::DYNAMIC_STATE] == nullptr) || (heaps[IndirectHeap::DYNAMIC_STATE]->getCpuBase() == pDSH)));
+void DeviceQueueHw<GfxFamily>::setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount) {
+    void *pDSH = dynamicStateHeap.getCpuBase();

    // Set scheduler ID to last entry in first table, it will have ID == 0, blocks will have following entries.
    auto igilCmdQueue = reinterpret_cast<IGIL_CommandQueue *>(queueBuffer->getUnderlyingBuffer());
@@ -386,11 +383,13 @@ size_t DeviceQueueHw<GfxFamily>::setSchedulerCrossThreadData(SchedulerKernel &sc
 }

 template <typename GfxFamily>
-void DeviceQueueHw<GfxFamily>::dispatchScheduler(CommandQueue &cmdQ, SchedulerKernel &scheduler, PreemptionMode preemptionMode) {
+void DeviceQueueHw<GfxFamily>::dispatchScheduler(CommandQueue &cmdQ, SchedulerKernel &scheduler, PreemptionMode preemptionMode, IndirectHeap *ssh, IndirectHeap *dsh) {
    GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(cmdQ,
                                                    *this,
                                                    preemptionMode,
-                                                    scheduler);
+                                                    scheduler,
+                                                    ssh,
+                                                    dsh);
    return;
 }

--- a/runtime/helpers/task_information.cpp
+++ b/runtime/helpers/task_information.cpp
@@ -28,6 +28,7 @@
 #include "runtime/device_queue/device_queue.h"
 #include "runtime/gtpin/gtpin_notify.h"
 #include "runtime/mem_obj/mem_obj.h"
+#include "runtime/memory_manager/memory_manager.h"
 #include "runtime/memory_manager/surface.h"
 #include "runtime/helpers/aligned_memory.h"
 #include "runtime/helpers/string.h"
@@ -35,13 +36,14 @@

 namespace OCLRT {
 KernelOperation::~KernelOperation() {
-    alignedFree(dsh->getCpuBase());
-    if (doNotFreeISH) {
+    memoryManager.storeAllocation(std::unique_ptr<GraphicsAllocation>(dsh->getGraphicsAllocation()), REUSABLE_ALLOCATION); // Hand the DSH allocation back to the memory manager as reusable.
+    if (ioh.get() == dsh.get()) { // ioh aliases dsh: give up ioh's ownership so the heap is not deleted twice.
        ioh.release();
-    } else {
-        alignedFree(ioh->getCpuBase());
    }
-    alignedFree(ssh->getCpuBase());
+    if (ioh) { // Non-null here only when IOH is a separate heap with its own allocation.
+        memoryManager.storeAllocation(std::unique_ptr<GraphicsAllocation>(ioh->getGraphicsAllocation()), REUSABLE_ALLOCATION);
+    }
+    memoryManager.storeAllocation(std::unique_ptr<GraphicsAllocation>(ssh->getGraphicsAllocation()), REUSABLE_ALLOCATION); // The command stream below is released with alignedFree instead.
    alignedFree(commandStream->getCpuBase());
 }

@@ -163,43 +165,9 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
    //transfer the memory to commandStream of the queue.
    memcpy_s(pDst, commandsSize, commandStream.getCpuBase(), commandsSize);

-    size_t requestedDshSize = kernelOperation->dsh->getUsed();
-    size_t requestedIohSize = kernelOperation->ioh->getUsed();
-    size_t requestedSshSize = kernelOperation->ssh->getUsed() + kernelOperation->surfaceStateHeapSizeEM;
-
-    IndirectHeap *dsh = nullptr;
-    IndirectHeap *ioh = nullptr;
-
-    IndirectHeap::Type trackedHeaps[] = {IndirectHeap::SURFACE_STATE, IndirectHeap::INDIRECT_OBJECT, IndirectHeap::DYNAMIC_STATE};
-
-    for (auto trackedHeap = 0u; trackedHeap < ARRAY_COUNT(trackedHeaps); trackedHeap++) {
-        if (commandQueue.getIndirectHeap(trackedHeaps[trackedHeap], 0).getUsed() > 0) {
-            commandQueue.releaseIndirectHeap(trackedHeaps[trackedHeap]);
-        }
-    }
-
-    if (executionModelKernel) {
-        dsh = devQueue->getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
-        // In ExecutionModel IOH is the same as DSH to eliminate StateBaseAddress reprogramming for scheduler kernel and blocks.
-        ioh = dsh;
-
-        memcpy_s(dsh->getSpace(0), dsh->getAvailableSpace(), ptrOffset(kernelOperation->dsh->getCpuBase(), devQueue->colorCalcStateSize), kernelOperation->dsh->getUsed() - devQueue->colorCalcStateSize);
-        dsh->getSpace(kernelOperation->dsh->getUsed() - devQueue->colorCalcStateSize);
-    } else {
-        dsh = &commandQueue.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, requestedDshSize);
-        ioh = &commandQueue.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, requestedIohSize);
-
-        memcpy_s(dsh->getCpuBase(), requestedDshSize, kernelOperation->dsh->getCpuBase(), kernelOperation->dsh->getUsed());
-        dsh->getSpace(requestedDshSize);
-
-        memcpy_s(ioh->getCpuBase(), requestedIohSize, kernelOperation->ioh->getCpuBase(), kernelOperation->ioh->getUsed());
-        ioh->getSpace(requestedIohSize);
-    }
-
-    IndirectHeap &ssh = commandQueue.getIndirectHeap(IndirectHeap::SURFACE_STATE, requestedSshSize);
-
-    memcpy_s(ssh.getCpuBase(), requestedSshSize, kernelOperation->ssh->getCpuBase(), kernelOperation->ssh->getUsed());
-    ssh.getSpace(kernelOperation->ssh->getUsed());
+    IndirectHeap *dsh = kernelOperation->dsh.get();
+    IndirectHeap *ioh = kernelOperation->ioh.get();
+    IndirectHeap *ssh = kernelOperation->ssh.get();

    auto requiresCoherency = false;
    for (auto &surface : surfaces) {
@@ -214,7 +182,7 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate

    if (executionModelKernel) {
        uint32_t taskCount = commandStreamReceiver.peekTaskCount() + 1;
-        devQueue->setupExecutionModelDispatch(ssh, kernel, kernelCount, taskCount, timestamp);
+        devQueue->setupExecutionModelDispatch(*ssh, *dsh, kernel, kernelCount, taskCount, timestamp);

        BuiltIns &builtIns = BuiltIns::getInstance();
        SchedulerKernel &scheduler = builtIns.getSchedulerKernel(commandQueue.getContext());
@@ -223,16 +191,18 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
                          devQueue->getStackBuffer(),
                          devQueue->getEventPoolBuffer(),
                          devQueue->getSlbBuffer(),
-                          devQueue->getDshBuffer(),
+                          dsh->getGraphicsAllocation(),
                          kernel->getKernelReflectionSurface(),
                          devQueue->getQueueStorageBuffer(),
-                          ssh.getGraphicsAllocation(),
+                          ssh->getGraphicsAllocation(),
                          devQueue->getDebugQueue());

        devQueue->dispatchScheduler(
            commandQueue,
            scheduler,
-            preemptionMode);
+            preemptionMode,
+            ssh,
+            dsh);

        scheduler.makeResident(commandStreamReceiver);

@@ -261,14 +231,13 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
                                                      offset,
                                                      *dsh,
                                                      *ioh,
-                                                      ssh,
+                                                      *ssh,
                                                      taskLevel,
                                                      dispatchFlags);
    for (auto &surface : surfaces) {
        surface->setCompletionStamp(completionStamp, nullptr, nullptr);
    }
    commandQueue.waitUntilComplete(completionStamp.taskCount, completionStamp.flushStamp, false);
-
    if (printfHandler) {
        printfHandler.get()->printEnqueueOutput();
    }
--- a/runtime/helpers/task_information.h
+++ b/runtime/helpers/task_information.h
@@ -39,6 +39,7 @@ class MemObj;
 class Surface;
 class PrintfHandler;
 struct HwTimeStamps;
+class MemoryManager;

 enum MapOperationType {
    MAP,
@@ -77,10 +78,11 @@ class CommandMapUnmap : public Command {
 };

 struct KernelOperation {
-    KernelOperation(std::unique_ptr<LinearStream> commandStream, std::unique_ptr<IndirectHeap> dsh, std::unique_ptr<IndirectHeap> ioh, std::unique_ptr<IndirectHeap> ssh)
+    KernelOperation(std::unique_ptr<LinearStream> commandStream, std::unique_ptr<IndirectHeap> dsh, std::unique_ptr<IndirectHeap> ioh, std::unique_ptr<IndirectHeap> ssh,
+                    MemoryManager &memoryManager)
        : commandStream(std::move(commandStream)), dsh(std::move(dsh)),
          ioh(std::move(ioh)), ssh(std::move(ssh)),
-          surfaceStateHeapSizeEM(0), doNotFreeISH(false) {
+          surfaceStateHeapSizeEM(0), doNotFreeISH(false), memoryManager(memoryManager) {
    }

    ~KernelOperation();
@@ -92,6 +94,7 @@ struct KernelOperation {

    size_t surfaceStateHeapSizeEM;
    bool doNotFreeISH;
+    MemoryManager &memoryManager;
 };

 class CommandComputeKernel : public Command {
--- a/unit_tests/command_queue/command_queue_hw_tests.cpp
+++ b/unit_tests/command_queue/command_queue_hw_tests.cpp
@@ -392,95 +392,55 @@ HWTEST_F(CommandQueueHwTest, GivenNotCompleteUserEventPassedToEnqueueWhenEventIs
 }

 typedef CommandQueueHwTest BlockedCommandQueueTest;
-HWTEST_F(BlockedCommandQueueTest, givenCommandQueueWhichHasSomeUsedHeapsWhenBlockedCommandIsBeingSubmittedItReloadsThemToZeroToKeepProperOffsets) {
-    DebugManagerStateRestore debugStateRestore;
-    bool oldMemsetAllocationsFlag = MemoryManagement::memsetNewAllocations;
-    MemoryManagement::memsetNewAllocations = true;
-
-    DebugManager.flags.ForcePreemptionMode.set(-1); // allow default preemption mode
-    auto deviceWithDefaultPreemptionMode = std::unique_ptr<MockDevice>(DeviceHelper<>::create(nullptr));
-    this->pDevice->setPreemptionMode(deviceWithDefaultPreemptionMode->getPreemptionMode());
-    this->pDevice->getCommandStreamReceiver().setPreemptionCsrAllocation(deviceWithDefaultPreemptionMode->getPreemptionAllocation());
-
-    DebugManager.flags.DisableResourceRecycling.set(true);

+HWTEST_F(BlockedCommandQueueTest, givenCommandQueueWhenBlockedCommandIsBeingSubmittedThenQueueHeapsAreNotUsed) {
    UserEvent userEvent(context);
-    cl_event blockedEvent = &userEvent;
    MockKernelWithInternals mockKernelWithInternals(*pDevice);
-    mockKernelWithInternals.kernelHeader.KernelHeapSize = sizeof(mockKernelWithInternals.kernelIsa);
    auto mockKernel = mockKernelWithInternals.mockKernel;

-    IndirectHeap::Type heaps[] = {IndirectHeap::INDIRECT_OBJECT, IndirectHeap::DYNAMIC_STATE, IndirectHeap::SURFACE_STATE};
-
-    size_t prealocatedHeapSize = 2 * 64 * KB;
-    for (auto heapType : heaps) {
-        auto &heap = pCmdQ->getIndirectHeap(heapType, prealocatedHeapSize);
-        heap.getSpace(16);
-        memset(heap.getCpuBase(), 0, prealocatedHeapSize);
-    }
-
-    // preallocating memsetted allocations to get predictable results
-    pCmdQ->getDevice().getMemoryManager()->cleanAllocationList(-1, REUSABLE_ALLOCATION);
-    DebugManager.flags.DisableResourceRecycling.set(false);
-
-    std::set<void *> reusableHeaps;
-    for (unsigned int i = 0; i < 4; ++i) {
-        auto allocSize = prealocatedHeapSize;
-        void *mem = alignedMalloc(allocSize, 64);
-        reusableHeaps.insert(mem);
-        memset(mem, 0, allocSize);
-        std::unique_ptr<GraphicsAllocation> reusableAlloc{new MockGraphicsAllocation(mem, allocSize)};
-        pCmdQ->getDevice().getMemoryManager()->storeAllocation(std::move(reusableAlloc), REUSABLE_ALLOCATION);
-    }
-
-    // disable further allocation reuse
-    DebugManager.flags.DisableResourceRecycling.set(true);
-
    size_t offset = 0;
    size_t size = 1;
-    pCmdQ->enqueueKernel(mockKernel, 1, &offset, &size, &size, 1, &blockedEvent, nullptr); // blocked command
+
+    cl_event blockedEvent = &userEvent;
+
+    pCmdQ->enqueueKernel(mockKernel, 1, &offset, &size, &size, 1, &blockedEvent, nullptr);
    userEvent.setStatus(CL_COMPLETE);

-    // make sure used heaps are from preallocated pool
-    EXPECT_NE(reusableHeaps.end(), reusableHeaps.find(pCmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0).getCpuBase()));
-    EXPECT_NE(reusableHeaps.end(), reusableHeaps.find(pCmdQ->getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 0).getCpuBase()));
-    EXPECT_NE(reusableHeaps.end(), reusableHeaps.find(pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, 0).getCpuBase()));
+    auto &ioh = pCmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 4096u);
+    auto &dsh = pCmdQ->getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 4096u);
+    auto &ssh = pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, 4096u);

-    pCmdQ->getDevice().getMemoryManager()->cleanAllocationList(-1, REUSABLE_ALLOCATION);
-    std::unordered_map<int, std::vector<char>> blockedCommandHeaps;
-    int i = 0;
-    for (auto heapType : heaps) {
-        auto &heap = pCmdQ->getIndirectHeap(heapType, 0);
-        blockedCommandHeaps[static_cast<int>(heaps[i])].assign(reinterpret_cast<char *>(heap.getCpuBase()), reinterpret_cast<char *>(heap.getCpuBase()) + heap.getUsed());
+    EXPECT_EQ(0u, ioh.getUsed());
+    EXPECT_EQ(0u, dsh.getUsed());
+    EXPECT_EQ(0u, ssh.getUsed());
+}

-        // prepare new heaps for nonblocked command
-        pCmdQ->releaseIndirectHeap(heapType);
-        ++i;
-    }
+HWTEST_F(BlockedCommandQueueTest, givenCommandQueueWithUsedHeapsWhenBlockedCommandIsBeingSubmittedThenQueueHeapsAreNotUsed) {
+    UserEvent userEvent(context);
+    MockKernelWithInternals mockKernelWithInternals(*pDevice);
+    auto mockKernel = mockKernelWithInternals.mockKernel;

-    pCmdQ->enqueueKernel(mockKernel, 1, &offset, &size, &size, 0, nullptr, nullptr); // nonblocked command
-    i = 0;
-    std::unordered_map<int, std::vector<char>> nonblockedCommandHeaps;
-    for (auto heapType : heaps) {
-        auto &heap = pCmdQ->getIndirectHeap(heapType, 0);
-        nonblockedCommandHeaps[static_cast<int>(heaps[i])].assign(reinterpret_cast<char *>(heap.getCpuBase()), reinterpret_cast<char *>(heap.getCpuBase()) + heap.getUsed());
-        ++i;
-    }
+    size_t offset = 0;
+    size_t size = 1;

-    // expecting blocked command to be programmed indentically to a non-blocked counterpart
-    EXPECT_THAT(nonblockedCommandHeaps[static_cast<int>(IndirectHeap::INDIRECT_OBJECT)],
-                testing::ContainerEq(blockedCommandHeaps[static_cast<int>(IndirectHeap::INDIRECT_OBJECT)]));
-    EXPECT_THAT(nonblockedCommandHeaps[static_cast<int>(IndirectHeap::DYNAMIC_STATE)],
-                testing::ContainerEq(blockedCommandHeaps[static_cast<int>(IndirectHeap::DYNAMIC_STATE)]));
-    EXPECT_THAT(nonblockedCommandHeaps[static_cast<int>(IndirectHeap::SURFACE_STATE)],
-                testing::ContainerEq(blockedCommandHeaps[static_cast<int>(IndirectHeap::SURFACE_STATE)]));
+    cl_event blockedEvent = &userEvent;

-    for (auto ptr : reusableHeaps) {
-        alignedFree(ptr);
-    }
+    auto &ioh = pCmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 4096u);
+    auto &dsh = pCmdQ->getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 4096u);
+    auto &ssh = pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, 4096u);

-    BuiltIns::shutDown();
-    MemoryManagement::memsetNewAllocations = oldMemsetAllocationsFlag;
+    auto spaceToUse = 4u;
+
+    ioh.getSpace(spaceToUse);
+    dsh.getSpace(spaceToUse);
+    ssh.getSpace(spaceToUse);
+
+    pCmdQ->enqueueKernel(mockKernel, 1, &offset, &size, &size, 1, &blockedEvent, nullptr);
+    userEvent.setStatus(CL_COMPLETE);
+
+    EXPECT_EQ(spaceToUse, ioh.getUsed());
+    EXPECT_EQ(spaceToUse, dsh.getUsed());
+    EXPECT_EQ(spaceToUse, ssh.getUsed());
 }

 HWTEST_F(BlockedCommandQueueTest, givenCommandQueueWhichHasSomeUnusedHeapsWhenBlockedCommandIsBeingSubmittedThenThoseHeapsAreBeingUsed) {
--- a/unit_tests/command_queue/command_queue_tests.cpp
+++ b/unit_tests/command_queue/command_queue_tests.cpp
@@ -616,6 +616,33 @@ TEST_P(CommandQueueIndirectHeapTest, givenCommandQueueWhenGetIndirectHeapIsCalle
    EXPECT_EQ(GraphicsAllocation::ALLOCATION_TYPE_LINEAR_STREAM, indirectHeapAllocation->getAllocationType());
 }

+TEST_P(CommandQueueIndirectHeapTest, givenCommandQueueWhenGetHeapMemoryIsCalledThenHeapIsCreated) { // allocateHeapMemory must create the heap when given a null pointer.
+    const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, 0, 0};
+    CommandQueue cmdQ(&context, pDevice, props);
+
+    IndirectHeap *indirectHeap = nullptr;
+    cmdQ.allocateHeapMemory(this->GetParam(), 100, indirectHeap); // Heap type comes from the test parameter.
+    EXPECT_NE(nullptr, indirectHeap);
+    EXPECT_NE(nullptr, indirectHeap->getGraphicsAllocation());
+
+    pDevice->getMemoryManager()->freeGraphicsMemory(indirectHeap->getGraphicsAllocation()); // The test frees both the allocation and the heap object itself.
+    delete indirectHeap;
+}
+
+TEST_P(CommandQueueIndirectHeapTest, givenCommandQueueWhenGetHeapMemoryIsCalledWithAlreadyAllocatedHeapThenGraphicsAllocationIsCreated) { // An existing heap object must be kept and given a graphics allocation.
+    const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, 0, 0};
+    CommandQueue cmdQ(&context, pDevice, props);
+
+    IndirectHeap heap(nullptr, 100); // Heap object constructed with a null buffer.
+
+    IndirectHeap *indirectHeap = &heap;
+    cmdQ.allocateHeapMemory(this->GetParam(), 100, indirectHeap);
+    EXPECT_EQ(&heap, indirectHeap); // The pointer must still refer to the same heap object.
+    EXPECT_NE(nullptr, indirectHeap->getGraphicsAllocation());
+
+    pDevice->getMemoryManager()->freeGraphicsMemory(indirectHeap->getGraphicsAllocation()); // Only the allocation is freed; the heap object lives on the stack.
+}
+
 INSTANTIATE_TEST_CASE_P(
    Device,
    CommandQueueIndirectHeapTest,
--- a/unit_tests/command_queue/dispatch_walker_tests.cpp
+++ b/unit_tests/command_queue/dispatch_walker_tests.cpp
@@ -710,9 +710,9 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerShouldGetRequiredHeapSizesFromKernelW
    auto expectedSizeSSH = KernelCommandsHelper<FamilyType>::getSizeRequiredSSH(kernel);

    EXPECT_EQ(expectedSizeCS, blockedCommandsData->commandStream->getMaxAvailableSpace());
-    EXPECT_EQ(expectedSizeDSH, blockedCommandsData->dsh->getMaxAvailableSpace());
-    EXPECT_EQ(expectedSizeIOH, blockedCommandsData->ioh->getMaxAvailableSpace());
-    EXPECT_EQ(expectedSizeSSH, blockedCommandsData->ssh->getMaxAvailableSpace());
+    EXPECT_LE(expectedSizeDSH, blockedCommandsData->dsh->getMaxAvailableSpace());
+    EXPECT_LE(expectedSizeIOH, blockedCommandsData->ioh->getMaxAvailableSpace());
+    EXPECT_LE(expectedSizeSSH, blockedCommandsData->ssh->getMaxAvailableSpace());

    delete blockedCommandsData;
 }
@@ -745,9 +745,9 @@ HWTEST_F(DispatchWalkerTest, dispatchWalkerShouldGetRequiredHeapSizesFromMdiWhen
    auto expectedSizeSSH = KernelCommandsHelper<FamilyType>::getTotalSizeRequiredSSH(multiDispatchInfo);

    EXPECT_EQ(expectedSizeCS, blockedCommandsData->commandStream->getMaxAvailableSpace());
-    EXPECT_EQ(expectedSizeDSH, blockedCommandsData->dsh->getMaxAvailableSpace());
-    EXPECT_EQ(expectedSizeIOH, blockedCommandsData->ioh->getMaxAvailableSpace());
-    EXPECT_EQ(expectedSizeSSH, blockedCommandsData->ssh->getMaxAvailableSpace());
+    EXPECT_LE(expectedSizeDSH, blockedCommandsData->dsh->getMaxAvailableSpace());
+    EXPECT_LE(expectedSizeIOH, blockedCommandsData->ioh->getMaxAvailableSpace());
+    EXPECT_LE(expectedSizeSSH, blockedCommandsData->ssh->getMaxAvailableSpace());

    delete blockedCommandsData;
 }
--- a/unit_tests/device_queue/device_queue_hw_tests.cpp
+++ b/unit_tests/device_queue/device_queue_hw_tests.cpp
@@ -534,7 +534,7 @@ HWTEST_P(DeviceQueueHwWithKernel, setupIndirectState) {
        auto usedBeforeSSH = ssh->getUsed();
        auto usedBeforeDSH = dsh->getUsed();

-        devQueueHw->setupIndirectState(*ssh, pKernel, 1);
+        devQueueHw->setupIndirectState(*ssh, *dsh, pKernel, 1);
        auto usedAfterSSH = ssh->getUsed();
        auto usedAfterDSH = dsh->getUsed();

@@ -564,7 +564,7 @@ HWTEST_P(DeviceQueueHwWithKernel, setupIndirectStateSetsCorrectStartBlockID) {

        uint32_t parentCount = 4;

-        devQueueHw->setupIndirectState(*ssh, pKernel, parentCount);
+        devQueueHw->setupIndirectState(*ssh, *dsh, pKernel, parentCount);
        auto *igilQueue = reinterpret_cast<IGIL_CommandQueue *>(devQueueHw->getQueueBuffer()->getUnderlyingBuffer());

        EXPECT_EQ(parentCount, igilQueue->m_controls.m_StartBlockID);
@@ -594,7 +594,7 @@ HWTEST_P(DeviceQueueHwWithKernel, setupIndirectStateSetsCorrectDSHValues) {

        uint32_t parentCount = 1;

-        devQueueHw->setupIndirectState(*ssh, pKernel, parentCount);
+        devQueueHw->setupIndirectState(*ssh, *dsh, pKernel, parentCount);
        auto *igilQueue = reinterpret_cast<IGIL_CommandQueue *>(devQueueHw->getQueueBuffer()->getUnderlyingBuffer());

        EXPECT_EQ(igilQueue->m_controls.m_DynamicHeapStart, devQueueHw->offsetDsh + alignUp((uint32_t)pKernel->getDynamicStateHeapSize(), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE));
--- a/unit_tests/device_queue/device_queue_tests.cpp
+++ b/unit_tests/device_queue/device_queue_tests.cpp
@@ -40,7 +40,8 @@ TEST(DeviceQueueSimpleTest, setupExecutionModelDispatchDoesNothing) {

    size_t size = 20;
    IndirectHeap ssh(buffer, size);
-    devQueue.setupExecutionModelDispatch(ssh, nullptr, 0, 0, 0);
+    IndirectHeap dsh(buffer, size);
+    devQueue.setupExecutionModelDispatch(ssh, dsh, nullptr, 0, 0, 0);

    EXPECT_EQ(0u, ssh.getUsed());

@@ -320,7 +321,7 @@ TEST_F(DeviceQueueTest, dispatchScheduler) {
    CommandQueue cmdQ(nullptr, nullptr, 0);
    KernelInfo info;
    MockSchedulerKernel *kernel = new MockSchedulerKernel(&program, info, *device);
-    devQueue.dispatchScheduler(cmdQ, *kernel, device->getPreemptionMode());
+    devQueue.dispatchScheduler(cmdQ, *kernel, device->getPreemptionMode(), nullptr, nullptr);
    delete kernel;
 }

--- a/unit_tests/event/event_tests.cpp
+++ b/unit_tests/event/event_tests.cpp
@@ -449,60 +449,18 @@ class SurfaceMock : public Surface {
    SurfaceMock(SurfaceMock *parent) : parent(parent){};
 };

-TEST_F(InternalsEventTest, resizeCmdQueueHeapsWhenKernelOparationHeapsAreBigger) {
-    CommandQueue *pCmdQ = new CommandQueue(mockContext, pDevice, 0);
-    IndirectHeap &cmdQueueDsh = pCmdQ->getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 4096);
-    IndirectHeap &cmdQueueIoh = pCmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 4096);
-    IndirectHeap &cmdQueueSsh = pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, 4096);
-
-    auto requestedSize = cmdQueueDsh.getMaxAvailableSpace() * 2;
-    auto cmdStream = new LinearStream(alignedMalloc(requestedSize, requestedSize), requestedSize);
-
-    auto createFullHeap = [](size_t size) {
-        auto heap = new IndirectHeap(alignedMalloc(size, size), size);
-        heap->getSpace(heap->getAvailableSpace());
-        return heap;
-    };
-
-    auto dsh = createFullHeap(requestedSize);
-    auto ioh = createFullHeap(requestedSize);
-    auto ssh = createFullHeap(maxSshSize);
-
-    using UniqueIH = std::unique_ptr<IndirectHeap>;
-    auto kernelOperation = new KernelOperation(std::unique_ptr<LinearStream>(cmdStream), UniqueIH(dsh),
-                                               UniqueIH(ioh), UniqueIH(ssh));
-    std::vector<Surface *> v;
-    SurfaceMock *surface = new SurfaceMock;
-    v.push_back(surface);
-    PreemptionMode preemptionMode = pDevice->getPreemptionMode();
-    auto cmdComputeKernel = new CommandComputeKernel(*pCmdQ, pDevice->getCommandStreamReceiver(),
-                                                     std::unique_ptr<KernelOperation>(kernelOperation), v, false, false, false, nullptr, preemptionMode);
-
-    EXPECT_LT(cmdQueueDsh.getMaxAvailableSpace(), dsh->getMaxAvailableSpace());
-    EXPECT_LT(cmdQueueIoh.getMaxAvailableSpace(), ioh->getMaxAvailableSpace());
-    EXPECT_EQ(maxSshSize, ssh->getMaxAvailableSpace());
-
-    cmdComputeKernel->submit(0, false);
-
-    EXPECT_GE(cmdQueueDsh.getMaxAvailableSpace(), dsh->getMaxAvailableSpace());
-    EXPECT_GE(cmdQueueIoh.getMaxAvailableSpace(), ioh->getMaxAvailableSpace());
-    EXPECT_GE(cmdQueueSsh.getMaxAvailableSpace(), ssh->getMaxAvailableSpace());
-
-    delete pCmdQ;
-    delete cmdComputeKernel;
-}
-
 TEST_F(InternalsEventTest, processBlockedCommandsKernelOperation) {
    MockEvent<Event> event(nullptr, CL_COMMAND_NDRANGE_KERNEL, 0, 0);
    CommandQueue *pCmdQ = new CommandQueue(mockContext, pDevice, 0);

    auto cmdStream = new LinearStream(alignedMalloc(4096, 4096), 4096);
-    auto dsh = new IndirectHeap(alignedMalloc(4096, 4096), 4096);
-    auto ioh = new IndirectHeap(alignedMalloc(4096, 4096), 4096);
-    auto ssh = new IndirectHeap(alignedMalloc(4096, 4096), 4096);
+    IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
+    pCmdQ->allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, 4096u, dsh);
+    pCmdQ->allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, 4096u, ioh);
+    pCmdQ->allocateHeapMemory(IndirectHeap::SURFACE_STATE, 4096u, ssh);
    using UniqueIH = std::unique_ptr<IndirectHeap>;
    auto blockedCommandsData = new KernelOperation(std::unique_ptr<LinearStream>(cmdStream), UniqueIH(dsh),
-                                                   UniqueIH(ioh), UniqueIH(ssh));
+                                                   UniqueIH(ioh), UniqueIH(ssh), *pCmdQ->getDevice().getMemoryManager());

    auto &csr = pDevice->getCommandStreamReceiver();
    std::vector<Surface *> v;
@@ -534,12 +492,13 @@ TEST_F(InternalsEventTest, processBlockedCommandsAbortKernelOperation) {
    CommandQueue *pCmdQ = new CommandQueue(mockContext, pDevice, 0);

    auto cmdStream = new LinearStream(alignedMalloc(4096, 4096), 4096);
-    auto dsh = new IndirectHeap(alignedMalloc(4096, 4096), 4096);
-    auto ioh = new IndirectHeap(alignedMalloc(4096, 4096), 4096);
-    auto ssh = new IndirectHeap(alignedMalloc(4096, 4096), 4096);
+    IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
+    pCmdQ->allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, 4096u, dsh);
+    pCmdQ->allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, 4096u, ioh);
+    pCmdQ->allocateHeapMemory(IndirectHeap::SURFACE_STATE, 4096u, ssh);
    using UniqueIH = std::unique_ptr<IndirectHeap>;
    auto blockedCommandsData = new KernelOperation(std::unique_ptr<LinearStream>(cmdStream), UniqueIH(dsh),
-                                                   UniqueIH(ioh), UniqueIH(ssh));
+                                                   UniqueIH(ioh), UniqueIH(ssh), *pCmdQ->getDevice().getMemoryManager());

    auto &csr = pDevice->getCommandStreamReceiver();
    std::vector<Surface *> v;
@@ -565,12 +524,13 @@ TEST_F(InternalsEventTest, givenBlockedKernelWithPrintfWhenSubmittedThenPrintOut
    CommandQueue *pCmdQ = new CommandQueue(mockContext, pDevice, 0);

    auto cmdStream = new LinearStream(alignedMalloc(4096, 4096), 4096);
-    auto dsh = new IndirectHeap(alignedMalloc(4096, 4096), 4096);
-    auto ioh = new IndirectHeap(alignedMalloc(4096, 4096), 4096);
-    auto ssh = new IndirectHeap(alignedMalloc(4096, 4096), 4096);
+    IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
+    pCmdQ->allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, 4096u, dsh);
+    pCmdQ->allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, 4096u, ioh);
+    pCmdQ->allocateHeapMemory(IndirectHeap::SURFACE_STATE, 4096u, ssh);
    using UniqueIH = std::unique_ptr<IndirectHeap>;
    auto blockedCommandsData = new KernelOperation(std::unique_ptr<LinearStream>(cmdStream), UniqueIH(dsh),
-                                                   UniqueIH(ioh), UniqueIH(ssh));
+                                                   UniqueIH(ioh), UniqueIH(ssh), *pCmdQ->getDevice().getMemoryManager());

    SPatchAllocateStatelessPrintfSurface *pPrintfSurface = new SPatchAllocateStatelessPrintfSurface();
    pPrintfSurface->DataParamOffset = 0;
@@ -1477,12 +1437,13 @@ HWTEST_F(InternalsEventTest, givenAbortedCommandWhenSubmitCalledThenDontUpdateFl
    csr.flushStamp->setStamp(5);

    auto cmdStream = new LinearStream(alignedMalloc(4096, 4096), 4096);
-    auto dsh = new IndirectHeap(alignedMalloc(4096, 4096), 4096);
-    auto ioh = new IndirectHeap(alignedMalloc(4096, 4096), 4096);
-    auto ssh = new IndirectHeap(alignedMalloc(4096, 4096), 4096);
+    IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
+    pCmdQ->allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, 4096u, dsh);
+    pCmdQ->allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, 4096u, ioh);
+    pCmdQ->allocateHeapMemory(IndirectHeap::SURFACE_STATE, 4096u, ssh);
    using UniqueIH = std::unique_ptr<IndirectHeap>;
    auto blockedCommandsData = new KernelOperation(std::unique_ptr<LinearStream>(cmdStream), UniqueIH(dsh),
-                                                   UniqueIH(ioh), UniqueIH(ssh));
+                                                   UniqueIH(ioh), UniqueIH(ssh), *pCmdQ->getDevice().getMemoryManager());
    PreemptionMode preemptionMode = pDevice->getPreemptionMode();
    std::vector<Surface *> v;
    auto cmd = new CommandComputeKernel(*pCmdQ, csr, std::unique_ptr<KernelOperation>(blockedCommandsData), v, false, false, false, nullptr, preemptionMode);
--- a/unit_tests/execution_model/scheduler_dispatch_tests.cpp
+++ b/unit_tests/execution_model/scheduler_dispatch_tests.cpp
@@ -76,7 +76,9 @@ HWTEST_F(ExecutionModelSchedulerFixture, dispatchScheduler) {
            *pCmdQ,
            *pDevQueueHw,
            pDevice->getPreemptionMode(),
-            scheduler);
+            scheduler,
+            &pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE),
+            pDevQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE));

        EXPECT_EQ(0u, *scheduler.globalWorkOffsetX);
        EXPECT_EQ(0u, *scheduler.globalWorkOffsetY);
@@ -192,7 +194,9 @@ HWTEST_F(ExecutionModelSchedulerFixture, dispatchSchedulerDoesNotUseStandardCmdQ
            *pCmdQ,
            *pDevQueueHw,
            pDevice->getPreemptionMode(),
-            scheduler);
+            scheduler,
+            &pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE),
+            pDevQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE));

        auto &ioh = pCmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT);

@@ -223,7 +227,9 @@ HWTEST_F(ParentKernelCommandQueueFixture, dispatchSchedulerWithEarlyReturnSetToF
            *pCmdQ,
            mockDevQueue,
            device->getPreemptionMode(),
-            scheduler);
+            scheduler,
+            &pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE),
+            mockDevQueue.getIndirectHeap(IndirectHeap::DYNAMIC_STATE));

        HardwareParse hwParser;
        hwParser.parseCommands<FamilyType>(commandStream, 0);
--- a/unit_tests/execution_model/submit_blocked_parent_kernel_tests.cpp
+++ b/unit_tests/execution_model/submit_blocked_parent_kernel_tests.cpp
@@ -20,6 +20,7 @@
 * OTHER DEALINGS IN THE SOFTWARE.
 */

+#include "runtime/command_queue/gpgpu_walker.h"
 #include "runtime/event/hw_timestamps.h"
 #include "runtime/helpers/kernel_commands.h"
 #include "runtime/helpers/task_information.h"
@@ -65,18 +66,18 @@ class MockDeviceQueueHwWithCriticalSectionRelease : public DeviceQueueHw<GfxFami
        return igilCmdQueue->m_controls.m_CriticalSection == DeviceQueueHw<GfxFamily>::ExecutionModelCriticalSection::Free;
    }

-    void setupIndirectState(IndirectHeap &surfaceStateHeap, Kernel *parentKernel, uint32_t parentIDCount) override {
+    void setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount) override {
        indirectStateSetup = true;
-        return BaseClass::setupIndirectState(surfaceStateHeap, parentKernel, parentIDCount);
+        return BaseClass::setupIndirectState(surfaceStateHeap, dynamicStateHeap, parentKernel, parentIDCount);
    }
    void addExecutionModelCleanUpSection(Kernel *parentKernel, HwTimeStamps *hwTimeStamp, uint32_t taskCount) override {
        cleanupSectionAdded = true;
        timestampAddedInCleanupSection = hwTimeStamp;
        return BaseClass::addExecutionModelCleanUpSection(parentKernel, hwTimeStamp, taskCount);
    }
-    void dispatchScheduler(CommandQueue &cmdQ, SchedulerKernel &scheduler, PreemptionMode preemptionMode) override {
+    void dispatchScheduler(CommandQueue &cmdQ, SchedulerKernel &scheduler, PreemptionMode preemptionMode, IndirectHeap *ssh, IndirectHeap *dsh) override {
        schedulerDispatched = true;
-        return BaseClass::dispatchScheduler(cmdQ, scheduler, preemptionMode);
+        return BaseClass::dispatchScheduler(cmdQ, scheduler, preemptionMode, ssh, dsh);
    }

    uint32_t criticalSectioncheckCounter = 0;
@@ -98,17 +99,22 @@ HWTEST_F(ParentKernelCommandQueueFixture, givenLockedEMcritcalSectionWhenParentK
        mockDevQueue.acquireEMCriticalSection();

        size_t heapSize = 20;
-        size_t alignement = 64;
-        size_t dshSize = heapSize + mockDevQueue.getDshOffset();
-        IndirectHeap *dsh = new IndirectHeap(alignedMalloc(dshSize, alignement), dshSize);
+        size_t dshSize = mockDevQueue.getDshBuffer()->getUnderlyingBufferSize();
+
+        IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
+        pCmdQ->allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, dshSize, dsh);
+        pCmdQ->allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, heapSize, ioh);
+        pCmdQ->allocateHeapMemory(IndirectHeap::SURFACE_STATE, heapSize, ssh);
+
        dsh->getSpace(mockDevQueue.getDshOffset());

        size_t minSizeSSHForEM = KernelCommandsHelper<FamilyType>::template getSizeRequiredForExecutionModel<IndirectHeap::SURFACE_STATE>(*parentKernel);

        KernelOperation *blockedCommandData = new KernelOperation(std::unique_ptr<LinearStream>(new LinearStream()),
                                                                  std::unique_ptr<IndirectHeap>(dsh),
-                                                                  std::unique_ptr<IndirectHeap>(new IndirectHeap(alignedMalloc(heapSize, alignement), heapSize)),
-                                                                  std::unique_ptr<IndirectHeap>(new IndirectHeap(alignedMalloc(heapSize, alignement), heapSize)));
+                                                                  std::unique_ptr<IndirectHeap>(ioh),
+                                                                  std::unique_ptr<IndirectHeap>(ssh),
+                                                                  *pCmdQ->getDevice().getMemoryManager());

        blockedCommandData->surfaceStateHeapSizeEM = minSizeSSHForEM;
        PreemptionMode preemptionMode = device->getPreemptionMode();
@@ -124,7 +130,7 @@ HWTEST_F(ParentKernelCommandQueueFixture, givenLockedEMcritcalSectionWhenParentK
    }
 }

-HWTEST_F(ParentKernelCommandQueueFixture, givenParentKernelWhenCommandIsSubmittedThenDeviceQueueDshIsUsed) {
+HWTEST_F(ParentKernelCommandQueueFixture, givenParentKernelWhenCommandIsSubmittedThenPassedDshIsUsed) {
    if (device->getSupportedClVersion() >= 20) {
        cl_queue_properties properties[3] = {0};
        MockParentKernel *parentKernel = MockParentKernel::create(*device);
@@ -135,14 +141,19 @@ HWTEST_F(ParentKernelCommandQueueFixture, givenParentKernelWhenCommandIsSubmitte
        auto *dshOfDevQueue = mockDevQueue.getIndirectHeap(IndirectHeap::DYNAMIC_STATE);

        size_t heapSize = 20;
-        size_t alignement = 64;
-        size_t dshSize = heapSize + mockDevQueue.getDshOffset();
-        IndirectHeap *dsh = new IndirectHeap(alignedMalloc(dshSize, alignement), dshSize);
+        size_t dshSize = mockDevQueue.getDshBuffer()->getUnderlyingBufferSize();
+
+        IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
+        pCmdQ->allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, dshSize, dsh);
+        pCmdQ->allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, heapSize, ioh);
+        pCmdQ->allocateHeapMemory(IndirectHeap::SURFACE_STATE, heapSize, ssh);
+
        // add initial offset of colorCalState
        dsh->getSpace(DeviceQueue::colorCalcStateSize);

        uint64_t ValueToFillDsh = 5;
        uint64_t *dshVal = (uint64_t *)dsh->getSpace(sizeof(uint64_t));
+
        // Fill Interface Descriptor Data
        *dshVal = ValueToFillDsh;

@@ -155,15 +166,15 @@ HWTEST_F(ParentKernelCommandQueueFixture, givenParentKernelWhenCommandIsSubmitte
        *dshVal = ValueToFillDsh;

        size_t usedDSHBeforeSubmit = dshOfDevQueue->getUsed();
-        uint64_t *devQueueDshValue = (uint64_t *)dshOfDevQueue->getSpace(0);

        uint32_t colorCalcSizeDevQueue = DeviceQueue::colorCalcStateSize;
        EXPECT_EQ(colorCalcSizeDevQueue, usedDSHBeforeSubmit);

        KernelOperation *blockedCommandData = new KernelOperation(std::unique_ptr<LinearStream>(new LinearStream()),
                                                                  std::unique_ptr<IndirectHeap>(dsh),
-                                                                  std::unique_ptr<IndirectHeap>(new IndirectHeap(alignedMalloc(heapSize, alignement), heapSize)),
-                                                                  std::unique_ptr<IndirectHeap>(new IndirectHeap(alignedMalloc(heapSize, alignement), heapSize)));
+                                                                  std::unique_ptr<IndirectHeap>(ioh),
+                                                                  std::unique_ptr<IndirectHeap>(ssh),
+                                                                  *pCmdQ->getDevice().getMemoryManager());

        size_t minSizeSSHForEM = KernelCommandsHelper<FamilyType>::template getSizeRequiredForExecutionModel<IndirectHeap::SURFACE_STATE>(*parentKernel);

@@ -175,13 +186,9 @@ HWTEST_F(ParentKernelCommandQueueFixture, givenParentKernelWhenCommandIsSubmitte

        cmdComputeKernel->submit(0, false);

+        // the device queue DSH must be untouched by submit: compare its usage against the value captured before
         size_t usedDSHAfterSubmit = dshOfDevQueue->getUsed();
-
-        EXPECT_EQ(mockDevQueue.getDshOffset() + sizeof(uint64_t), usedDSHAfterSubmit);
-        EXPECT_EQ(ValueToFillDsh, *devQueueDshValue);
-
-        uint64_t *devQueueDshParent = (uint64_t *)ptrOffset((char *)dshOfDevQueue->getCpuBase(), mockDevQueue.getDshOffset());
-        EXPECT_EQ(ValueToFillDsh, *devQueueDshParent);
+        EXPECT_EQ(usedDSHBeforeSubmit, usedDSHAfterSubmit);

        delete cmdComputeKernel;
        delete parentKernel;
@@ -197,15 +204,20 @@ HWTEST_F(ParentKernelCommandQueueFixture, givenParentKernelWhenCommandIsSubmitte
        context->setDefaultDeviceQueue(&mockDevQueue);

        size_t heapSize = 20;
-        size_t alignement = 64;
-        size_t dshSize = heapSize + mockDevQueue.getDshOffset();
-        IndirectHeap *dsh = new IndirectHeap(alignedMalloc(dshSize, alignement), dshSize);
+        size_t dshSize = mockDevQueue.getDshBuffer()->getUnderlyingBufferSize();
+
+        IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
+        pCmdQ->allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, dshSize, dsh);
+        pCmdQ->allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, heapSize, ioh);
+        pCmdQ->allocateHeapMemory(IndirectHeap::SURFACE_STATE, heapSize, ssh);
+
        dsh->getSpace(mockDevQueue.getDshOffset());

        KernelOperation *blockedCommandData = new KernelOperation(std::unique_ptr<LinearStream>(new LinearStream()),
                                                                  std::unique_ptr<IndirectHeap>(dsh),
-                                                                  std::unique_ptr<IndirectHeap>(new IndirectHeap(alignedMalloc(heapSize, alignement), heapSize)),
-                                                                  std::unique_ptr<IndirectHeap>(new IndirectHeap(alignedMalloc(heapSize, alignement), heapSize)));
+                                                                  std::unique_ptr<IndirectHeap>(ioh),
+                                                                  std::unique_ptr<IndirectHeap>(ssh),
+                                                                  *pCmdQ->getDevice().getMemoryManager());

        size_t minSizeSSHForEM = KernelCommandsHelper<FamilyType>::template getSizeRequiredForExecutionModel<IndirectHeap::SURFACE_STATE>(*parentKernel);

@@ -234,15 +246,18 @@ HWTEST_F(ParentKernelCommandQueueFixture, givenBlockedParentKernelWithProfilingW
        context->setDefaultDeviceQueue(&mockDevQueue);

        size_t heapSize = 20;
-        size_t alignement = 64;
-        size_t dshSize = heapSize + mockDevQueue.getDshOffset();
-        IndirectHeap *dsh = new IndirectHeap(alignedMalloc(dshSize, alignement), dshSize);
+        size_t dshSize = mockDevQueue.getDshBuffer()->getUnderlyingBufferSize();
+        IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
+        pCmdQ->allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, dshSize, dsh);
+        pCmdQ->allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, heapSize, ioh);
+        pCmdQ->allocateHeapMemory(IndirectHeap::SURFACE_STATE, heapSize, ssh);
        dsh->getSpace(mockDevQueue.getDshOffset());

        KernelOperation *blockedCommandData = new KernelOperation(std::unique_ptr<LinearStream>(new LinearStream()),
                                                                  std::unique_ptr<IndirectHeap>(dsh),
-                                                                  std::unique_ptr<IndirectHeap>(new IndirectHeap(alignedMalloc(heapSize, alignement), heapSize)),
-                                                                  std::unique_ptr<IndirectHeap>(new IndirectHeap(alignedMalloc(heapSize, alignement), heapSize)));
+                                                                  std::unique_ptr<IndirectHeap>(ioh),
+                                                                  std::unique_ptr<IndirectHeap>(ssh),
+                                                                  *pCmdQ->getDevice().getMemoryManager());

        size_t minSizeSSHForEM = KernelCommandsHelper<FamilyType>::template getSizeRequiredForExecutionModel<IndirectHeap::SURFACE_STATE>(*parentKernel);

@@ -274,15 +289,19 @@ HWTEST_F(ParentKernelCommandQueueFixture, givenParentKernelWhenCommandIsSubmitte
        context->setDefaultDeviceQueue(&mockDevQueue);

        size_t heapSize = 20;
-        size_t alignement = 64;
-        size_t dshSize = heapSize + mockDevQueue.getDshOffset();
-        IndirectHeap *dsh = new IndirectHeap(alignedMalloc(dshSize, alignement), dshSize);
+        size_t dshSize = mockDevQueue.getDshBuffer()->getUnderlyingBufferSize();
+
+        IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
+        pCmdQ->allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, dshSize, dsh);
+        pCmdQ->allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, heapSize, ioh);
+        pCmdQ->allocateHeapMemory(IndirectHeap::SURFACE_STATE, heapSize, ssh);
        dsh->getSpace(mockDevQueue.getDshOffset());

        KernelOperation *blockedCommandData = new KernelOperation(std::unique_ptr<LinearStream>(new LinearStream()),
                                                                  std::unique_ptr<IndirectHeap>(dsh),
-                                                                  std::unique_ptr<IndirectHeap>(new IndirectHeap(alignedMalloc(heapSize, alignement), heapSize)),
-                                                                  std::unique_ptr<IndirectHeap>(new IndirectHeap(alignedMalloc(heapSize, alignement), heapSize)));
+                                                                  std::unique_ptr<IndirectHeap>(ioh),
+                                                                  std::unique_ptr<IndirectHeap>(ssh),
+                                                                  *pCmdQ->getDevice().getMemoryManager());

        size_t minSizeSSHForEM = KernelCommandsHelper<FamilyType>::template getSizeRequiredForExecutionModel<IndirectHeap::SURFACE_STATE>(*parentKernel);

@@ -301,7 +320,7 @@ HWTEST_F(ParentKernelCommandQueueFixture, givenParentKernelWhenCommandIsSubmitte
    }
 }

-HWTEST_F(ParentKernelCommandQueueFixture, givenUsedSSHWhenParentKernelIsSubmittedThenNewSSHIsAllocated) {
+HWTEST_F(ParentKernelCommandQueueFixture, givenUsedCommandQueueHeapshenParentKernelIsSubmittedThenQueueHeapsAreNotUsed) {
    if (device->getSupportedClVersion() >= 20) {
        cl_queue_properties properties[3] = {0};
        MockParentKernel *parentKernel = MockParentKernel::create(*device);
@@ -314,20 +333,30 @@ HWTEST_F(ParentKernelCommandQueueFixture, givenUsedSSHWhenParentKernelIsSubmitte
        size_t minSizeSSHForEM = KernelCommandsHelper<FamilyType>::template getSizeRequiredForExecutionModel<IndirectHeap::SURFACE_STATE>(*parentKernel);

        size_t heapSize = 20;
-        size_t alignement = 64;

-        size_t dshSize = heapSize + mockDevQueue.getDshOffset();
-        IndirectHeap *dsh = new IndirectHeap(alignedMalloc(dshSize, alignement), dshSize);
+        size_t dshSize = mockDevQueue.getDshBuffer()->getUnderlyingBufferSize();
+        IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
+        pCmdQ->allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, dshSize, dsh);
+        pCmdQ->allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, heapSize, ioh);
+        pCmdQ->allocateHeapMemory(IndirectHeap::SURFACE_STATE, heapSize, ssh);
+
        dsh->getSpace(mockDevQueue.getDshOffset());

-        cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 100);
-        // use some SSH
-        cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE).getSpace(4);
+        auto &queueSsh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 100);
+        auto &queueDsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 100);
+        auto &queueIoh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 100);
+
+        size_t usedSize = 4u;
+
+        queueSsh.getSpace(usedSize);
+        queueDsh.getSpace(usedSize);
+        queueIoh.getSpace(usedSize);

        KernelOperation *blockedCommandData = new KernelOperation(std::unique_ptr<LinearStream>(new LinearStream()),
                                                                  std::unique_ptr<IndirectHeap>(dsh),
-                                                                  std::unique_ptr<IndirectHeap>(new IndirectHeap(alignedMalloc(heapSize, alignement), heapSize)),
-                                                                  std::unique_ptr<IndirectHeap>(new IndirectHeap(alignedMalloc(heapSize, alignement), heapSize)));
+                                                                  std::unique_ptr<IndirectHeap>(ioh),
+                                                                  std::unique_ptr<IndirectHeap>(ssh),
+                                                                  *pCmdQ->getDevice().getMemoryManager());

        blockedCommandData->surfaceStateHeapSizeEM = minSizeSSHForEM;
        PreemptionMode preemptionMode = device->getPreemptionMode();
@@ -337,7 +366,10 @@ HWTEST_F(ParentKernelCommandQueueFixture, givenUsedSSHWhenParentKernelIsSubmitte

        cmdComputeKernel->submit(0, false);

-        EXPECT_TRUE(cmdQ.releaseIndirectHeapCalled);
+        EXPECT_FALSE(cmdQ.releaseIndirectHeapCalled);
+        EXPECT_EQ(usedSize, queueDsh.getUsed());
+        EXPECT_EQ(usedSize, queueIoh.getUsed());
+        EXPECT_EQ(usedSize, queueSsh.getUsed());

        delete cmdComputeKernel;
        delete parentKernel;
@@ -355,14 +387,14 @@ HWTEST_F(ParentKernelCommandQueueFixture, givenNotUsedSSHWhenParentKernelIsSubmi
        size_t minSizeSSHForEM = KernelCommandsHelper<FamilyType>::template getSizeRequiredForExecutionModel<IndirectHeap::SURFACE_STATE>(*parentKernel);

        size_t heapSize = 20;
-        size_t alignement = 64;
-
-        size_t dshSize = heapSize + mockDevQueue.getDshOffset();
-        IndirectHeap *dsh = new IndirectHeap(alignedMalloc(dshSize, alignement), dshSize);
-        dsh->getSpace(mockDevQueue.getDshOffset());

+        size_t dshSize = mockDevQueue.getDshBuffer()->getUnderlyingBufferSize();
        size_t sshSize = 1000;
-        IndirectHeap *ssh = new IndirectHeap(alignedMalloc(sshSize, 4096), sshSize);
+        IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
+        pCmdQ->allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, dshSize, dsh);
+        pCmdQ->allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, heapSize, ioh);
+        pCmdQ->allocateHeapMemory(IndirectHeap::SURFACE_STATE, sshSize, ssh);
+        dsh->getSpace(mockDevQueue.getDshOffset());

        EXPECT_EQ(0u, ssh->getUsed());

@@ -372,8 +404,9 @@ HWTEST_F(ParentKernelCommandQueueFixture, givenNotUsedSSHWhenParentKernelIsSubmi

        KernelOperation *blockedCommandData = new KernelOperation(std::unique_ptr<LinearStream>(new LinearStream()),
                                                                  std::unique_ptr<IndirectHeap>(dsh),
-                                                                  std::unique_ptr<IndirectHeap>(new IndirectHeap(alignedMalloc(heapSize, alignement), heapSize)),
-                                                                  std::unique_ptr<IndirectHeap>(ssh));
+                                                                  std::unique_ptr<IndirectHeap>(ioh),
+                                                                  std::unique_ptr<IndirectHeap>(ssh),
+                                                                  *pCmdQ->getDevice().getMemoryManager());

        blockedCommandData->surfaceStateHeapSizeEM = minSizeSSHForEM;
        PreemptionMode preemptionMode = device->getPreemptionMode();
@@ -391,3 +424,43 @@ HWTEST_F(ParentKernelCommandQueueFixture, givenNotUsedSSHWhenParentKernelIsSubmi
        delete parentKernel;
    }
 }
+
+HWTEST_F(ParentKernelCommandQueueFixture, givenBlockedCommandQueueWhenDispatchWalkerIsCalledThenHeapsHaveProperSizes) {
+    if (device->getSupportedClVersion() >= 20) {
+        cl_queue_properties properties[3] = {0};
+        std::unique_ptr<MockParentKernel> parentKernel(MockParentKernel::create(*device));
+        // Install a mock device queue as the context's default and create the kernel's reflection surface.
+        MockDeviceQueueHw<FamilyType> mockDevQueue(context, device, properties[0]);
+        parentKernel->createReflectionSurface();
+        context->setDefaultDeviceQueue(&mockDevQueue);
+        // Output slot handed to dispatchWalker by address; checked against nullptr after the call.
+        KernelOperation *blockedCommandsData = nullptr;
+        const size_t globalOffsets[3] = {0, 0, 0};
+        const size_t workItems[3] = {1, 1, 1};
+        // NOTE(review): the trailing 'true' is presumably the blocked-queue flag (per the test name) - confirm against dispatchWalker.
+        GpgpuWalkerHelper<FamilyType>::dispatchWalker(*pCmdQ,
+                                                      *parentKernel,
+                                                      1,
+                                                      globalOffsets,
+                                                      workItems,
+                                                      nullptr,
+                                                      0,
+                                                      nullptr,
+                                                      &blockedCommandsData,
+                                                      nullptr,
+                                                      nullptr,
+                                                      device->getPreemptionMode(),
+                                                      true);
+        // ASSERT rather than EXPECT: every check below dereferences blockedCommandsData, so a null result must stop the test.
+        ASSERT_NE(nullptr, blockedCommandsData);
+        EXPECT_EQ(blockedCommandsData->dsh->getMaxAvailableSpace(), mockDevQueue.getDshBuffer()->getUnderlyingBufferSize());
+        EXPECT_EQ(blockedCommandsData->dsh, blockedCommandsData->ioh);
+        // Every heap must be backed by a graphics allocation; DSH and IOH are expected to report the same one.
+        EXPECT_NE(nullptr, blockedCommandsData->dsh->getGraphicsAllocation());
+        EXPECT_NE(nullptr, blockedCommandsData->ioh->getGraphicsAllocation());
+        EXPECT_NE(nullptr, blockedCommandsData->ssh->getGraphicsAllocation());
+        EXPECT_EQ(blockedCommandsData->dsh->getGraphicsAllocation(), blockedCommandsData->ioh->getGraphicsAllocation());
+        // The test holds the only (raw) pointer to the KernelOperation, so it deletes it here.
+        delete blockedCommandsData;
+    }
+}
--- a/unit_tests/gen8/scheduler_dispatch_tests.cpp
+++ b/unit_tests/gen8/scheduler_dispatch_tests.cpp
@@ -55,7 +55,9 @@ BDWTEST_F(BdwSchedulerTest, givenCallToDispatchSchedulerWhenPipeControlWithCSSta
            *pCmdQ,
            *pDevQueueHw,
            pDevice->getPreemptionMode(),
-            scheduler);
+            scheduler,
+            &pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE),
+            pDevQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE));

        HardwareParse hwParser;
        hwParser.parseCommands<FamilyType>(commandStream, 0);