From 7ec69c33f9ee797699f5f4882b690cce400eee9a Mon Sep 17 00:00:00 2001
From: Mateusz Jablonski <mateusz.jablonski@intel.com>
Date: Mon, 23 Nov 2020 18:01:38 +0000
Subject: [PATCH] Store surface state heap (SSH) per root device in Kernel

Related-To: NEO-5001
Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
---
 opencl/source/command_queue/command_queue.cpp |  2 +-
 opencl/source/command_queue/enqueue_common.h  |  5 +-
 opencl/source/command_queue/gpgpu_walker.h    |  2 +-
 .../command_queue/hardware_interface_base.inl |  3 +-
 opencl/source/gtpin/gtpin_callbacks.cpp       | 12 ++--
 opencl/source/gtpin/gtpin_hw_helper.h         |  8 +--
 opencl/source/gtpin/gtpin_hw_helper.inl       | 16 +++---
 .../source/helpers/hardware_commands_helper.h |  4 +-
 .../helpers/hardware_commands_helper_base.inl | 12 ++--
 opencl/source/kernel/kernel.cpp               | 55 ++++++++++---------
 opencl/source/kernel/kernel.h                 | 14 ++---
 opencl/source/program/printf_handler.cpp      |  2 +-
 .../accelerators/media_image_arg_tests.cpp    |  4 +-
 ...cl_mem_locally_uncached_resource_tests.cpp |  3 +-
 .../unit_test/built_ins/built_in_tests.cpp    | 16 +++---
 .../command_queue/command_queue_tests.cpp     |  8 +--
 .../command_queue/dispatch_walker_tests.cpp   |  2 +-
 .../command_queue/enqueue_handler_tests.cpp   |  2 +-
 .../get_size_required_buffer_tests.cpp        |  6 +-
 .../get_size_required_image_tests.cpp         | 12 ++--
 .../sync_buffer_handler_tests.cpp             |  2 +-
 .../device_queue/device_queue_hw_tests.cpp    |  8 +--
 .../enqueue_execution_model_kernel_tests.cpp  |  4 +-
 .../parent_kernel_dispatch_tests.cpp          |  4 +-
 .../scheduler_dispatch_tests.cpp              |  6 +-
 .../submit_blocked_parent_kernel_tests.cpp    | 14 ++---
 .../fixtures/execution_model_fixture.h        |  3 +-
 .../gen8/scheduler_dispatch_tests_gen8.cpp    |  2 +-
 opencl/test/unit_test/gtpin/gtpin_tests.cpp   | 40 +++++++-------
 .../hardware_commands_helper_tests.cpp        |  8 +--
 .../helpers/hardware_commands_helper_tests.h  |  2 +-
 .../kernel/kernel_arg_buffer_tests.cpp        |  6 +-
 .../kernel/kernel_arg_pipe_tests.cpp          |  6 +-
 .../unit_test/kernel/kernel_arg_svm_tests.cpp | 30 +++++-----
 opencl/test/unit_test/kernel/kernel_tests.cpp | 44 +++++++--------
 .../kernel/kernel_transformable_tests.cpp     | 18 +++---
 .../mem_obj/buffer_set_arg_tests.cpp          |  6 +-
 .../unit_test/mem_obj/image_set_arg_tests.cpp | 38 ++++++-------
 .../memory_manager/memory_manager_tests.cpp   |  6 +-
 opencl/test/unit_test/mocks/mock_kernel.h     | 16 +++---
 40 files changed, 231 insertions(+), 220 deletions(-)
diff --git a/opencl/source/command_queue/command_queue.cpp b/opencl/source/command_queue/command_queue.cpp
index e61c0e633f..a30e30b157 100644
--- a/opencl/source/command_queue/command_queue.cpp
+++ b/opencl/source/command_queue/command_queue.cpp
@@ -530,7 +530,7 @@ bool CommandQueue::setupDebugSurface(Kernel *kernel) {
 
     DEBUG_BREAK_IF(!kernel->requiresSshForBuffers());
 
-    auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(kernel->getSurfaceStateHeap()),
+    auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(kernel->getSurfaceStateHeap(device->getRootDeviceIndex())),
                                   kernel->getKernelInfo().patchInfo.pAllocateSystemThreadSurface->Offset);
     void *addressToPatch = reinterpret_cast<void *>(debugSurface->getGpuAddress());
     size_t sizeToPatch = debugSurface->getUnderlyingBufferSize();
diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h
index 221708dcd2..f5e9a6ae42 100644
--- a/opencl/source/command_queue/enqueue_common.h
+++ b/opencl/source/command_queue/enqueue_common.h
@@ -330,7 +330,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
 
     if (blockQueue) {
         if (parentKernel) {
-            size_t minSizeSSHForEM = HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(*parentKernel);
+            size_t minSizeSSHForEM = HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(*parentKernel, device->getRootDeviceIndex());
             blockedCommandsData->surfaceStateHeapSizeEM = minSizeSSHForEM;
         }
 
@@ -534,7 +534,8 @@ void CommandQueueHw<GfxFamily>::processDeviceEnqueue(DeviceQueueHw<GfxFamily> *d
                                                      TagNode<HwTimeStamps> *hwTimeStamps,
                                                      bool &blocking) {
     auto parentKernel = multiDispatchInfo.peekParentKernel();
-    size_t minSizeSSHForEM = HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(*parentKernel);
+    auto rootDeviceIndex = devQueueHw->getDevice().getRootDeviceIndex();
+    size_t minSizeSSHForEM = HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(*parentKernel, rootDeviceIndex);
     bool isCcsUsed = EngineHelpers::isCcs(gpgpuEngine->osContext->getEngineType());
 
     uint32_t taskCount = getGpgpuCommandStreamReceiver().peekTaskCount() + 1;
diff --git a/opencl/source/command_queue/gpgpu_walker.h b/opencl/source/command_queue/gpgpu_walker.h
index 32fb391d2f..d0817f6713 100644
--- a/opencl/source/command_queue/gpgpu_walker.h
+++ b/opencl/source/command_queue/gpgpu_walker.h
@@ -200,7 +200,7 @@ IndirectHeap &getIndirectHeap(CommandQueue &commandQueue, const MultiDispatchInf
 
     if (Kernel *parentKernel = multiDispatchInfo.peekParentKernel()) {
         if (heapType == IndirectHeap::SURFACE_STATE) {
-            expectedSize += HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(*parentKernel);
+            expectedSize += HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(*parentKernel, commandQueue.getDevice().getRootDeviceIndex());
         } else //if (heapType == IndirectHeap::DYNAMIC_STATE || heapType == IndirectHeap::INDIRECT_OBJECT)
         {
             DeviceQueueHw<GfxFamily> *pDevQueue = castToObject<DeviceQueueHw<GfxFamily>>(commandQueue.getContext().getDefaultDeviceQueue());
diff --git a/opencl/source/command_queue/hardware_interface_base.inl b/opencl/source/command_queue/hardware_interface_base.inl
index 55c0f3350d..d21218a80d 100644
--- a/opencl/source/command_queue/hardware_interface_base.inl
+++ b/opencl/source/command_queue/hardware_interface_base.inl
@@ -248,6 +248,7 @@ template <typename GfxFamily>
 void HardwareInterface<GfxFamily>::obtainIndirectHeaps(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo,
                                                        bool blockedQueue, IndirectHeap *&dsh, IndirectHeap *&ioh, IndirectHeap *&ssh) {
     auto parentKernel = multiDispatchInfo.peekParentKernel();
+    auto rootDeviceIndex = commandQueue.getDevice().getRootDeviceIndex();
 
     if (blockedQueue) {
         size_t dshSize = 0;
@@ -257,7 +258,7 @@ void HardwareInterface<GfxFamily>::obtainIndirectHeaps(CommandQueue &commandQueu
 
         if (parentKernel) {
             dshSize = commandQueue.getContext().getDefaultDeviceQueue()->getDshBuffer()->getUnderlyingBufferSize();
-            sshSize += HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(*parentKernel);
+            sshSize += HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(*parentKernel, rootDeviceIndex);
             iohEqualsDsh = true;
             colorCalcSize = static_cast<size_t>(commandQueue.getContext().getDefaultDeviceQueue()->colorCalcStateSize);
         } else {
diff --git a/opencl/source/gtpin/gtpin_callbacks.cpp b/opencl/source/gtpin/gtpin_callbacks.cpp
index 533c5c87d7..9a185bc9a8 100644
--- a/opencl/source/gtpin/gtpin_callbacks.cpp
+++ b/opencl/source/gtpin/gtpin_callbacks.cpp
@@ -66,9 +66,10 @@ void gtpinNotifyKernelCreate(cl_kernel kernel) {
         size_t gtpinBTI = pKernel->getNumberOfBindingTableStates();
         // Enlarge local copy of SSH by 1 SS
         auto device = pKernel->getDevices()[0];
+        auto rootDeviceIndex = device->getRootDeviceIndex();
         GFXCORE_FAMILY genFamily = device->getHardwareInfo().platform.eRenderCoreFamily;
         GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(genFamily);
-        if (pKernel->isParentKernel || !gtpinHelper.addSurfaceState(pKernel)) {
+        if (pKernel->isParentKernel || !gtpinHelper.addSurfaceState(pKernel, rootDeviceIndex)) {
             // Kernel with no SSH or Kernel EM, not supported
             return;
         }
@@ -103,8 +104,10 @@ void gtpinNotifyKernelCreate(cl_kernel kernel) {
 
 void gtpinNotifyKernelSubmit(cl_kernel kernel, void *pCmdQueue) {
     if (isGTPinInitialized) {
+        auto pCmdQ = reinterpret_cast<CommandQueue *>(pCmdQueue);
+        auto &device = pCmdQ->getDevice();
         auto pKernel = castToObjectOrAbort<Kernel>(kernel);
-        if (pKernel->isParentKernel || pKernel->getSurfaceStateHeapSize() == 0) {
+        if (pKernel->isParentKernel || pKernel->getSurfaceStateHeapSize(device.getRootDeviceIndex()) == 0) {
             // Kernel with no SSH, not supported
             return;
         }
@@ -132,14 +135,13 @@ void gtpinNotifyKernelSubmit(cl_kernel kernel, void *pCmdQueue) {
         if (!resource) {
             return;
         }
-        auto &device = *pKernel->getDevices()[0];
         GFXCORE_FAMILY genFamily = device.getHardwareInfo().platform.eRenderCoreFamily;
         GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(genFamily);
         size_t gtpinBTI = pKernel->getNumberOfBindingTableStates() - 1;
-        void *pSurfaceState = gtpinHelper.getSurfaceState(pKernel, gtpinBTI);
+        void *pSurfaceState = gtpinHelper.getSurfaceState(pKernel, gtpinBTI, device.getRootDeviceIndex());
         cl_mem buffer = (cl_mem)resource;
         auto pBuffer = castToObjectOrAbort<Buffer>(buffer);
-        pBuffer->setArgStateful(pSurfaceState, false, false, false, false, device.getDevice());
+        pBuffer->setArgStateful(pSurfaceState, false, false, false, false, device);
     }
 }
 
diff --git a/opencl/source/gtpin/gtpin_hw_helper.h b/opencl/source/gtpin/gtpin_hw_helper.h
index 7ff896be0f..bcf49ed333 100644
--- a/opencl/source/gtpin/gtpin_hw_helper.h
+++ b/opencl/source/gtpin/gtpin_hw_helper.h
@@ -15,8 +15,8 @@ class GTPinHwHelper {
   public:
     static GTPinHwHelper &get(GFXCORE_FAMILY gfxCore);
     virtual uint32_t getGenVersion() = 0;
-    virtual bool addSurfaceState(Kernel *pKernel) = 0;
-    virtual void *getSurfaceState(Kernel *pKernel, size_t bti) = 0;
+    virtual bool addSurfaceState(Kernel *pKernel, uint32_t rootDeviceIndex) = 0;
+    virtual void *getSurfaceState(Kernel *pKernel, size_t bti, uint32_t rootDeviceIndex) = 0;
 
   protected:
     GTPinHwHelper(){};
@@ -30,8 +30,8 @@ class GTPinHwHelperHw : public GTPinHwHelper {
         return gtpinHwHelper;
     }
     uint32_t getGenVersion() override;
-    bool addSurfaceState(Kernel *pKernel) override;
-    void *getSurfaceState(Kernel *pKernel, size_t bti) override;
+    bool addSurfaceState(Kernel *pKernel, uint32_t rootDeviceIndex) override;
+    void *getSurfaceState(Kernel *pKernel, size_t bti, uint32_t rootDeviceIndex) override;
 
   private:
     GTPinHwHelperHw(){};
diff --git a/opencl/source/gtpin/gtpin_hw_helper.inl b/opencl/source/gtpin/gtpin_hw_helper.inl
index a74719ef8a..e481d22294 100644
--- a/opencl/source/gtpin/gtpin_hw_helper.inl
+++ b/opencl/source/gtpin/gtpin_hw_helper.inl
@@ -15,11 +15,11 @@
 namespace NEO {
 
 template <typename GfxFamily>
-bool GTPinHwHelperHw<GfxFamily>::addSurfaceState(Kernel *pKernel) {
+bool GTPinHwHelperHw<GfxFamily>::addSurfaceState(Kernel *pKernel, uint32_t rootDeviceIndex) {
     using RENDER_SURFACE_STATE = typename GfxFamily::RENDER_SURFACE_STATE;
     using BINDING_TABLE_STATE = typename GfxFamily::BINDING_TABLE_STATE;
 
-    size_t sshSize = pKernel->getSurfaceStateHeapSize();
+    size_t sshSize = pKernel->getSurfaceStateHeapSize(rootDeviceIndex);
     if ((sshSize == 0) || pKernel->isParentKernel) {
         // Kernels which do not use SSH or use Execution Model are not supported (yet)
         return false;
@@ -29,7 +29,7 @@ bool GTPinHwHelperHw<GfxFamily>::addSurfaceState(Kernel *pKernel) {
     size_t sizeToEnlarge = ssSize + btsSize;
     size_t currBTOffset = pKernel->getBindingTableOffset();
     size_t currSurfaceStateSize = currBTOffset;
-    char *pSsh = static_cast<char *>(pKernel->getSurfaceStateHeap());
+    char *pSsh = static_cast<char *>(pKernel->getSurfaceStateHeap(rootDeviceIndex));
     char *pNewSsh = new char[sshSize + sizeToEnlarge];
     memcpy_s(pNewSsh, sshSize + sizeToEnlarge, pSsh, currSurfaceStateSize);
     RENDER_SURFACE_STATE *pSS = reinterpret_cast<RENDER_SURFACE_STATE *>(pNewSsh + currSurfaceStateSize);
@@ -40,19 +40,19 @@ bool GTPinHwHelperHw<GfxFamily>::addSurfaceState(Kernel *pKernel) {
     BINDING_TABLE_STATE *pNewBTS = reinterpret_cast<BINDING_TABLE_STATE *>(pNewSsh + newSurfaceStateSize + currBTCount * btsSize);
     *pNewBTS = GfxFamily::cmdInitBindingTableState;
     pNewBTS->setSurfaceStatePointer((uint64_t)currBTOffset);
-    pKernel->resizeSurfaceStateHeap(pNewSsh, sshSize + sizeToEnlarge, currBTCount + 1, newSurfaceStateSize);
+    pKernel->resizeSurfaceStateHeap(rootDeviceIndex, pNewSsh, sshSize + sizeToEnlarge, currBTCount + 1, newSurfaceStateSize);
     return true;
 }
 
 template <typename GfxFamily>
-void *GTPinHwHelperHw<GfxFamily>::getSurfaceState(Kernel *pKernel, size_t bti) {
+void *GTPinHwHelperHw<GfxFamily>::getSurfaceState(Kernel *pKernel, size_t bti, uint32_t rootDeviceIndex) {
     using BINDING_TABLE_STATE = typename GfxFamily::BINDING_TABLE_STATE;
 
-    if ((nullptr == pKernel->getSurfaceStateHeap()) || (bti >= pKernel->getNumberOfBindingTableStates())) {
+    if ((nullptr == pKernel->getSurfaceStateHeap(rootDeviceIndex)) || (bti >= pKernel->getNumberOfBindingTableStates())) {
         return nullptr;
     }
-    auto *pBts = reinterpret_cast<BINDING_TABLE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(), (pKernel->getBindingTableOffset() + bti * sizeof(BINDING_TABLE_STATE))));
-    auto pSurfaceState = ptrOffset(pKernel->getSurfaceStateHeap(), pBts->getSurfaceStatePointer());
+    auto *pBts = reinterpret_cast<BINDING_TABLE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex), (pKernel->getBindingTableOffset() + bti * sizeof(BINDING_TABLE_STATE))));
+    auto pSurfaceState = ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex), pBts->getSurfaceStatePointer());
     return pSurfaceState;
 }
 
diff --git a/opencl/source/helpers/hardware_commands_helper.h b/opencl/source/helpers/hardware_commands_helper.h
index 07c1339c1d..31d52c68c8 100644
--- a/opencl/source/helpers/hardware_commands_helper.h
+++ b/opencl/source/helpers/hardware_commands_helper.h
@@ -126,7 +126,7 @@ struct HardwareCommandsHelper : public PerThreadDataHelper {
         const Kernel &kernel,
         size_t localWorkSize = 256);
     static size_t getSizeRequiredSSH(
-        const Kernel &kernel);
+        const Kernel &kernel, uint32_t rootDeviceIndex);
 
     static size_t getTotalSizeRequiredDSH(
         const MultiDispatchInfo &multiDispatchInfo);
@@ -135,7 +135,7 @@ struct HardwareCommandsHelper : public PerThreadDataHelper {
     static size_t getTotalSizeRequiredSSH(
         const MultiDispatchInfo &multiDispatchInfo);
 
-    static size_t getSshSizeForExecutionModel(const Kernel &kernel);
+    static size_t getSshSizeForExecutionModel(const Kernel &kernel, uint32_t rootDeviceIndex);
     static void setInterfaceDescriptorOffset(
         WALKER_TYPE<GfxFamily> *walkerCmd,
         uint32_t &interfaceDescriptorIndex);
diff --git a/opencl/source/helpers/hardware_commands_helper_base.inl b/opencl/source/helpers/hardware_commands_helper_base.inl
index 4038a91657..be7aa425cd 100644
--- a/opencl/source/helpers/hardware_commands_helper_base.inl
+++ b/opencl/source/helpers/hardware_commands_helper_base.inl
@@ -75,9 +75,9 @@ size_t HardwareCommandsHelper<GfxFamily>::getSizeRequiredIOH(
 
 template <typename GfxFamily>
 size_t HardwareCommandsHelper<GfxFamily>::getSizeRequiredSSH(
-    const Kernel &kernel) {
+    const Kernel &kernel, uint32_t rootDeviceIndex) {
     typedef typename GfxFamily::BINDING_TABLE_STATE BINDING_TABLE_STATE;
-    auto sizeSSH = kernel.getSurfaceStateHeapSize();
+    auto sizeSSH = kernel.getSurfaceStateHeapSize(rootDeviceIndex);
     sizeSSH += sizeSSH ? BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE : 0;
     return sizeSSH;
 }
@@ -112,11 +112,11 @@ size_t HardwareCommandsHelper<GfxFamily>::getTotalSizeRequiredIOH(
 template <typename GfxFamily>
 size_t HardwareCommandsHelper<GfxFamily>::getTotalSizeRequiredSSH(
     const MultiDispatchInfo &multiDispatchInfo) {
-    return getSizeRequired(multiDispatchInfo, [](const DispatchInfo &dispatchInfo) { return getSizeRequiredSSH(*dispatchInfo.getKernel()); });
+    return getSizeRequired(multiDispatchInfo, [](const DispatchInfo &dispatchInfo) { return getSizeRequiredSSH(*dispatchInfo.getKernel(), dispatchInfo.getClDevice().getRootDeviceIndex()); });
 }
 
 template <typename GfxFamily>
-size_t HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(const Kernel &kernel) {
+size_t HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(const Kernel &kernel, uint32_t rootDeviceIndex) {
     typedef typename GfxFamily::BINDING_TABLE_STATE BINDING_TABLE_STATE;
 
     size_t totalSize = 0;
@@ -136,7 +136,7 @@ size_t HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(const Kern
 
     SchedulerKernel &scheduler = kernel.getContext().getSchedulerKernel();
 
-    totalSize += getSizeRequiredSSH(scheduler);
+    totalSize += getSizeRequiredSSH(scheduler, rootDeviceIndex);
 
     totalSize += maxBindingTableCount * sizeof(BINDING_TABLE_STATE) * DeviceQueue::interfaceDescriptorEntries;
     totalSize = alignUp(totalSize, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
@@ -237,7 +237,7 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
     kernel.patchBindlessSurfaceStateOffsets(device, ssh.getUsed());
 
     auto dstBindingTablePointer = EncodeSurfaceState<GfxFamily>::pushBindingTableAndSurfaceStates(ssh, (kernelInfo.patchInfo.bindingTableState != nullptr) ? kernelInfo.patchInfo.bindingTableState->Count : 0,
-                                                                                                  kernel.getSurfaceStateHeap(), kernel.getSurfaceStateHeapSize(),
+                                                                                                  kernel.getSurfaceStateHeap(rootDeviceIndex), kernel.getSurfaceStateHeapSize(rootDeviceIndex),
                                                                                                   kernel.getNumberOfBindingTableStates(), kernel.getBindingTableOffset());
 
     // Copy our sampler state if it exists
diff --git a/opencl/source/kernel/kernel.cpp b/opencl/source/kernel/kernel.cpp
index 3e7fef9029..130262cb37 100644
--- a/opencl/source/kernel/kernel.cpp
+++ b/opencl/source/kernel/kernel.cpp
@@ -140,7 +140,7 @@ void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, Graphic
     uint32_t sshOffset = patch.SurfaceStateHeapOffset;
     auto rootDeviceIndex = allocation.getRootDeviceIndex();
     void *crossThreadData = getCrossThreadData(rootDeviceIndex);
-    void *ssh = getSurfaceStateHeap();
+    void *ssh = getSurfaceStateHeap(rootDeviceIndex);
     if (crossThreadData != nullptr) {
         auto pp = ptrOffset(crossThreadData, crossThreadDataOffset);
         uintptr_t addressToPatch = reinterpret_cast<uintptr_t>(ptrToPatchInCrossThreadData);
@@ -235,13 +235,14 @@ cl_int Kernel::initialize() {
         }
 
         // allocate our own SSH, if necessary
-        sshLocalSize = heapInfo.SurfaceStateHeapSize;
+        kernelDeviceInfos[rootDeviceIndex].sshLocalSize = heapInfo.SurfaceStateHeapSize;
 
-        if (sshLocalSize) {
-            pSshLocal = std::make_unique<char[]>(sshLocalSize);
+        if (kernelDeviceInfos[rootDeviceIndex].sshLocalSize) {
+            kernelDeviceInfos[rootDeviceIndex].pSshLocal = std::make_unique<char[]>(kernelDeviceInfos[rootDeviceIndex].sshLocalSize);
 
             // copy the ssh into our local copy
-            memcpy_s(pSshLocal.get(), sshLocalSize, heapInfo.pSsh, sshLocalSize);
+            memcpy_s(kernelDeviceInfos[rootDeviceIndex].pSshLocal.get(), kernelDeviceInfos[rootDeviceIndex].sshLocalSize,
+                     heapInfo.pSsh, kernelDeviceInfos[rootDeviceIndex].sshLocalSize);
         }
         numberOfBindingTableStates = (patchInfo.bindingTableState != nullptr) ? patchInfo.bindingTableState->Count : 0;
         localBindingTableOffset = (patchInfo.bindingTableState != nullptr) ? patchInfo.bindingTableState->Offset : 0;
@@ -287,7 +288,7 @@ cl_int Kernel::initialize() {
 
         if (patchInfo.pAllocateStatelessEventPoolSurface) {
             if (requiresSshForBuffers()) {
-                auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap()),
+                auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap(rootDeviceIndex)),
                                               patchInfo.pAllocateStatelessEventPoolSurface->SurfaceStateHeapOffset);
                 Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, 0, nullptr, 0, nullptr, 0, 0);
             }
@@ -296,7 +297,7 @@ cl_int Kernel::initialize() {
         if (patchInfo.pAllocateStatelessDefaultDeviceQueueSurface) {
 
             if (requiresSshForBuffers()) {
-                auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap()),
+                auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap(rootDeviceIndex)),
                                               patchInfo.pAllocateStatelessDefaultDeviceQueueSurface->SurfaceStateHeapOffset);
                 Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, 0, nullptr, 0, nullptr, 0, 0);
             }
@@ -783,8 +784,8 @@ void Kernel::setStartOffset(uint32_t offset) {
     this->startOffset = offset;
 }
 
-void *Kernel::getSurfaceStateHeap() const {
-    return kernelInfo.usesSsh ? pSshLocal.get() : nullptr;
+void *Kernel::getSurfaceStateHeap(uint32_t rootDeviceIndex) const {
+    return kernelInfo.usesSsh ? kernelDeviceInfos[rootDeviceIndex].pSshLocal.get() : nullptr;
 }
 
 size_t Kernel::getDynamicStateHeapSize() const {
@@ -795,9 +796,9 @@ const void *Kernel::getDynamicStateHeap() const {
     return kernelInfo.heapInfo.pDsh;
 }
 
-size_t Kernel::getSurfaceStateHeapSize() const {
+size_t Kernel::getSurfaceStateHeapSize(uint32_t rootDeviceIndex) const {
     return kernelInfo.usesSsh
-               ? sshLocalSize
+               ? kernelDeviceInfos[rootDeviceIndex].sshLocalSize
                : 0;
 }
 
@@ -805,9 +806,9 @@ size_t Kernel::getNumberOfBindingTableStates() const {
     return numberOfBindingTableStates;
 }
 
-void Kernel::resizeSurfaceStateHeap(void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset) {
-    pSshLocal.reset(static_cast<char *>(pNewSsh));
-    sshLocalSize = static_cast<uint32_t>(newSshSize);
+void Kernel::resizeSurfaceStateHeap(uint32_t rootDeviceIndex, void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset) {
+    kernelDeviceInfos[rootDeviceIndex].pSshLocal.reset(static_cast<char *>(pNewSsh));
+    kernelDeviceInfos[rootDeviceIndex].sshLocalSize = static_cast<uint32_t>(newSshSize);
     numberOfBindingTableStates = newBindingTableCount;
     localBindingTableOffset = newBindingTableOffset;
 }
@@ -882,7 +883,7 @@ cl_int Kernel::setArgSvm(uint32_t argIndex, size_t svmAllocSize, void *svmPtr, G
 
     if (requiresSshForBuffers()) {
         const auto &kernelArgInfo = kernelInfo.kernelArgInfo[argIndex];
-        auto surfaceState = ptrOffset(getSurfaceStateHeap(), kernelArgInfo.offsetHeap);
+        auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap);
         Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, svmAllocSize + ptrDiff(svmPtr, ptrToPatch), ptrToPatch, 0, svmAlloc, svmFlags, 0);
     }
     if (!kernelArguments[argIndex].isPatched) {
@@ -913,7 +914,7 @@ cl_int Kernel::setArgSvmAlloc(uint32_t argIndex, void *svmPtr, GraphicsAllocatio
 
     if (requiresSshForBuffers()) {
         const auto &kernelArgInfo = kernelInfo.kernelArgInfo[argIndex];
-        auto surfaceState = ptrOffset(getSurfaceStateHeap(), kernelArgInfo.offsetHeap);
+        auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap);
         size_t allocSize = 0;
         size_t offset = 0;
         if (svmAlloc != nullptr) {
@@ -1317,7 +1318,7 @@ cl_int Kernel::setArgBuffer(uint32_t argIndex,
         }
 
         if (requiresSshForBuffers()) {
-            auto surfaceState = ptrOffset(getSurfaceStateHeap(), kernelArgInfo.offsetHeap);
+            auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap);
             buffer->setArgStateful(surfaceState, forceNonAuxMode, disableL3, isAuxTranslationKernel, kernelArgInfo.isReadOnly, getDevice().getDevice());
         }
 
@@ -1342,7 +1343,7 @@ cl_int Kernel::setArgBuffer(uint32_t argIndex,
         storeKernelArg(argIndex, BUFFER_OBJ, nullptr, argVal, argSize);
 
         if (requiresSshForBuffers()) {
-            auto surfaceState = ptrOffset(getSurfaceStateHeap(), kernelArgInfo.offsetHeap);
+            auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap);
             Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, 0, nullptr, 0, nullptr, 0, 0);
         }
 
@@ -1391,7 +1392,7 @@ cl_int Kernel::setArgPipe(uint32_t argIndex,
         auto graphicsAllocation = pipe->getGraphicsAllocation(getDevice().getRootDeviceIndex());
 
         if (requiresSshForBuffers()) {
-            auto surfaceState = ptrOffset(getSurfaceStateHeap(), kernelArgInfo.offsetHeap);
+            auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap);
             Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState,
                                     pipe->getSize(), pipe->getCpuAddress(), 0,
                                     graphicsAllocation, 0, 0);
@@ -1429,7 +1430,7 @@ cl_int Kernel::setArgImageWithMipLevel(uint32_t argIndex,
 
         storeKernelArg(argIndex, IMAGE_OBJ, clMemObj, argVal, argSize);
 
-        auto surfaceState = ptrOffset(getSurfaceStateHeap(), kernelArgInfo.offsetHeap);
+        auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap);
         DEBUG_BREAK_IF(!kernelArgInfo.isImage);
 
         // Sets SS structure
@@ -2250,7 +2251,7 @@ void Kernel::patchDefaultDeviceQueue(DeviceQueue *devQueue) {
                                   static_cast<uintptr_t>(devQueue->getQueueBuffer()->getGpuAddressToPatch()));
         }
         if (requiresSshForBuffers()) {
-            auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap()),
+            auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap(rootDeviceIndex)),
                                           patchInfo.pAllocateStatelessDefaultDeviceQueueSurface->SurfaceStateHeapOffset);
             Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, devQueue->getQueueBuffer()->getUnderlyingBufferSize(),
                                     (void *)devQueue->getQueueBuffer()->getGpuAddress(), 0, devQueue->getQueueBuffer(), 0, 0);
@@ -2272,7 +2273,7 @@ void Kernel::patchEventPool(DeviceQueue *devQueue) {
         }
 
         if (requiresSshForBuffers()) {
-            auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap()),
+            auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap(rootDeviceIndex)),
                                           patchInfo.pAllocateStatelessEventPoolSurface->SurfaceStateHeapOffset);
             Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, devQueue->getEventPoolBuffer()->getUnderlyingBufferSize(),
                                     (void *)devQueue->getEventPoolBuffer()->getGpuAddress(), 0, devQueue->getEventPoolBuffer(), 0, 0);
@@ -2298,13 +2299,14 @@ bool Kernel::usesSyncBuffer() {
 }
 
 void Kernel::patchSyncBuffer(Device &device, GraphicsAllocation *gfxAllocation, size_t bufferOffset) {
+    auto rootDeviceIndex = device.getRootDeviceIndex();
     auto &patchInfo = kernelInfo.patchInfo;
-    auto bufferPatchAddress = ptrOffset(getCrossThreadData(device.getRootDeviceIndex()), patchInfo.pAllocateSyncBuffer->DataParamOffset);
+    auto bufferPatchAddress = ptrOffset(getCrossThreadData(rootDeviceIndex), patchInfo.pAllocateSyncBuffer->DataParamOffset);
     patchWithRequiredSize(bufferPatchAddress, patchInfo.pAllocateSyncBuffer->DataParamSize,
                           ptrOffset(gfxAllocation->getGpuAddressToPatch(), bufferOffset));
 
     if (requiresSshForBuffers()) {
-        auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap()),
+        auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap(rootDeviceIndex)),
                                       patchInfo.pAllocateSyncBuffer->SurfaceStateHeapOffset);
         auto addressToPatch = gfxAllocation->getUnderlyingBuffer();
         auto sizeToPatch = gfxAllocation->getUnderlyingBufferSize();
@@ -2353,10 +2355,11 @@ void Kernel::resolveArgs() {
             }
         }
     }
+    auto rootDeviceIndex = getDevice().getRootDeviceIndex();
     if (canTransformImageTo2dArray) {
-        imageTransformer->transformImagesTo2dArray(kernelInfo, kernelArguments, getSurfaceStateHeap());
+        imageTransformer->transformImagesTo2dArray(kernelInfo, kernelArguments, getSurfaceStateHeap(rootDeviceIndex));
     } else if (imageTransformer->didTransform()) {
-        imageTransformer->transformImagesTo3d(kernelInfo, kernelArguments, getSurfaceStateHeap());
+        imageTransformer->transformImagesTo3d(kernelInfo, kernelArguments, getSurfaceStateHeap(rootDeviceIndex));
     }
 }
 
diff --git a/opencl/source/kernel/kernel.h b/opencl/source/kernel/kernel.h
index 2cf9b651bf..e2f6d89e07 100644
--- a/opencl/source/kernel/kernel.h
+++ b/opencl/source/kernel/kernel.h
@@ -153,18 +153,18 @@ class Kernel : public BaseObject<_cl_kernel> {
                            size_t *paramValueSizeRet) const;
 
     const void *getKernelHeap() const;
-    void *getSurfaceStateHeap() const;
+    void *getSurfaceStateHeap(uint32_t rootDeviceIndex) const;
     const void *getDynamicStateHeap() const;
 
     size_t getKernelHeapSize() const;
-    size_t getSurfaceStateHeapSize() const;
+    size_t getSurfaceStateHeapSize(uint32_t rootDeviceIndex) const;
     size_t getDynamicStateHeapSize() const;
     size_t getNumberOfBindingTableStates() const;
     size_t getBindingTableOffset() const {
         return localBindingTableOffset;
     }
 
-    void resizeSurfaceStateHeap(void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset);
+    void resizeSurfaceStateHeap(uint32_t rootDeviceIndex, void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset);
 
     void substituteKernelHeap(void *newKernelHeap, size_t newKernelHeapSize);
     bool isKernelHeapSubstituted() const;
@@ -524,8 +524,6 @@ class Kernel : public BaseObject<_cl_kernel> {
 
     size_t numberOfBindingTableStates = 0u;
     size_t localBindingTableOffset = 0u;
-    std::unique_ptr<char[]> pSshLocal;
-    uint32_t sshLocalSize = 0u;
 
     GraphicsAllocation *kernelReflectionSurface = nullptr;
 
@@ -550,13 +548,15 @@ class Kernel : public BaseObject<_cl_kernel> {
     bool debugEnabled = false;
     uint32_t additionalKernelExecInfo = AdditionalKernelExecInfo::NotSet;
 
-    struct KernelDeviceInfo {
+    struct KernelDeviceInfo : public NonCopyableClass {
+        std::unique_ptr<char[]> pSshLocal;
+        uint32_t sshLocalSize = 0u;
         char *crossThreadData = nullptr;
         uint32_t crossThreadDataSize = 0u;
 
         GraphicsAllocation *privateSurface = nullptr;
         uint64_t privateSurfaceSize = 0u;
     };
-    StackVec<KernelDeviceInfo, 1> kernelDeviceInfos;
+    std::vector<KernelDeviceInfo> kernelDeviceInfos;
 };
 } // namespace NEO
diff --git a/opencl/source/program/printf_handler.cpp b/opencl/source/program/printf_handler.cpp
index c12052d804..44ade48fd1 100644
--- a/opencl/source/program/printf_handler.cpp
+++ b/opencl/source/program/printf_handler.cpp
@@ -62,7 +62,7 @@ void PrintfHandler::prepareDispatch(const MultiDispatchInfo &multiDispatchInfo)
 
     patchWithRequiredSize(printfPatchAddress, kernel->getKernelInfo().patchInfo.pAllocateStatelessPrintfSurface->DataParamSize, (uintptr_t)printfSurface->getGpuAddressToPatch());
     if (kernel->requiresSshForBuffers()) {
-        auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(kernel->getSurfaceStateHeap()),
+        auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(kernel->getSurfaceStateHeap(rootDeviceIndex)),
                                       kernel->getKernelInfo().patchInfo.pAllocateStatelessPrintfSurface->SurfaceStateHeapOffset);
         void *addressToPatch = printfSurface->getUnderlyingBuffer();
         size_t sizeToPatch = printfSurface->getUnderlyingBufferSize();
diff --git a/opencl/test/unit_test/accelerators/media_image_arg_tests.cpp b/opencl/test/unit_test/accelerators/media_image_arg_tests.cpp
index 83971ffe2a..87f57c7a24 100644
--- a/opencl/test/unit_test/accelerators/media_image_arg_tests.cpp
+++ b/opencl/test/unit_test/accelerators/media_image_arg_tests.cpp
@@ -81,7 +81,7 @@ HWTEST_F(MediaImageSetArgTest, WhenSettingMediaImageArgThenArgsSetCorrectly) {
     typedef typename FamilyType::MEDIA_SURFACE_STATE MEDIA_SURFACE_STATE;
 
     auto pSurfaceState = reinterpret_cast<const MEDIA_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                   pKernelInfo->kernelArgInfo[0].offsetHeap));
 
     srcImage->setMediaImageArg(const_cast<MEDIA_SURFACE_STATE *>(pSurfaceState), pClDevice->getRootDeviceIndex());
@@ -109,7 +109,7 @@ HWTEST_F(MediaImageSetArgTest, WhenSettingKernelArgImageThenArgsSetCorrectly) {
     ASSERT_EQ(CL_SUCCESS, retVal);
 
     auto pSurfaceState = reinterpret_cast<const MEDIA_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                   pKernelInfo->kernelArgInfo[0].offsetHeap));
 
     uint64_t surfaceAddress = pSurfaceState->getSurfaceBaseAddress();
diff --git a/opencl/test/unit_test/api/cl_mem_locally_uncached_resource_tests.cpp b/opencl/test/unit_test/api/cl_mem_locally_uncached_resource_tests.cpp
index 8417b075c1..7cd82259fc 100644
--- a/opencl/test/unit_test/api/cl_mem_locally_uncached_resource_tests.cpp
+++ b/opencl/test/unit_test/api/cl_mem_locally_uncached_resource_tests.cpp
@@ -27,8 +27,9 @@ namespace clMemLocallyUncachedResourceTests {
 
 template <typename FamilyType>
 uint32_t argMocs(Kernel &kernel, size_t argIndex) {
+    auto rootDeviceIndex = kernel.getDevices()[0]->getRootDeviceIndex();
     using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
-    auto surfaceStateHeapAddress = kernel.getSurfaceStateHeap();
+    auto surfaceStateHeapAddress = kernel.getSurfaceStateHeap(rootDeviceIndex);
     auto surfaceStateHeapAddressOffset = kernel.getKernelInfo().kernelArgInfo[argIndex].offsetHeap;
     auto surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(surfaceStateHeapAddress, surfaceStateHeapAddressOffset));
     return surfaceState->getMemoryObjectControlState();
diff --git a/opencl/test/unit_test/built_ins/built_in_tests.cpp b/opencl/test/unit_test/built_ins/built_in_tests.cpp
index 57e24fad99..62fa4102b1 100644
--- a/opencl/test/unit_test/built_ins/built_in_tests.cpp
+++ b/opencl/test/unit_test/built_ins/built_in_tests.cpp
@@ -484,12 +484,12 @@ HWCMDTEST_F(IGFX_GEN8_CORE, BuiltInTests, givenAuxTranslationKernelWhenSettingKe
         // read args
         auto argNum = 0;
         auto expectedMocs = pDevice->getGmmHelper()->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER_CACHELINE_MISALIGNED);
-        auto sshBase = mockAuxBuiltInOp.convertToAuxKernel[0]->getSurfaceStateHeap();
+        auto sshBase = mockAuxBuiltInOp.convertToAuxKernel[0]->getSurfaceStateHeap(rootDeviceIndex);
         auto sshOffset = mockAuxBuiltInOp.convertToAuxKernel[0]->getKernelInfo().kernelArgInfo[argNum].offsetHeap;
         auto surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(sshBase, sshOffset));
         EXPECT_EQ(expectedMocs, surfaceState->getMemoryObjectControlState());
 
-        sshBase = mockAuxBuiltInOp.convertToNonAuxKernel[0]->getSurfaceStateHeap();
+        sshBase = mockAuxBuiltInOp.convertToNonAuxKernel[0]->getSurfaceStateHeap(rootDeviceIndex);
         sshOffset = mockAuxBuiltInOp.convertToNonAuxKernel[0]->getKernelInfo().kernelArgInfo[argNum].offsetHeap;
         surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(sshBase, sshOffset));
         EXPECT_EQ(expectedMocs, surfaceState->getMemoryObjectControlState());
@@ -499,12 +499,12 @@ HWCMDTEST_F(IGFX_GEN8_CORE, BuiltInTests, givenAuxTranslationKernelWhenSettingKe
         // write args
         auto argNum = 1;
         auto expectedMocs = pDevice->getGmmHelper()->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER);
-        auto sshBase = mockAuxBuiltInOp.convertToAuxKernel[0]->getSurfaceStateHeap();
+        auto sshBase = mockAuxBuiltInOp.convertToAuxKernel[0]->getSurfaceStateHeap(rootDeviceIndex);
         auto sshOffset = mockAuxBuiltInOp.convertToAuxKernel[0]->getKernelInfo().kernelArgInfo[argNum].offsetHeap;
         auto surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(sshBase, sshOffset));
         EXPECT_EQ(expectedMocs, surfaceState->getMemoryObjectControlState());
 
-        sshBase = mockAuxBuiltInOp.convertToNonAuxKernel[0]->getSurfaceStateHeap();
+        sshBase = mockAuxBuiltInOp.convertToNonAuxKernel[0]->getSurfaceStateHeap(rootDeviceIndex);
         sshOffset = mockAuxBuiltInOp.convertToNonAuxKernel[0]->getKernelInfo().kernelArgInfo[argNum].offsetHeap;
         surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(sshBase, sshOffset));
         EXPECT_EQ(expectedMocs, surfaceState->getMemoryObjectControlState());
@@ -541,7 +541,7 @@ HWTEST_F(BuiltInTests, givenAuxToNonAuxTranslationWhenSettingSurfaceStateThenSet
     {
         // read arg
         auto argNum = 0;
-        auto sshBase = mockAuxBuiltInOp.convertToNonAuxKernel[0]->getSurfaceStateHeap();
+        auto sshBase = mockAuxBuiltInOp.convertToNonAuxKernel[0]->getSurfaceStateHeap(rootDeviceIndex);
         auto sshOffset = mockAuxBuiltInOp.convertToNonAuxKernel[0]->getKernelInfo().kernelArgInfo[argNum].offsetHeap;
         auto surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(sshBase, sshOffset));
         EXPECT_EQ(AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_CCS_E, surfaceState->getAuxiliarySurfaceMode());
@@ -550,7 +550,7 @@ HWTEST_F(BuiltInTests, givenAuxToNonAuxTranslationWhenSettingSurfaceStateThenSet
     {
         // write arg
         auto argNum = 1;
-        auto sshBase = mockAuxBuiltInOp.convertToNonAuxKernel[0]->getSurfaceStateHeap();
+        auto sshBase = mockAuxBuiltInOp.convertToNonAuxKernel[0]->getSurfaceStateHeap(rootDeviceIndex);
         auto sshOffset = mockAuxBuiltInOp.convertToNonAuxKernel[0]->getKernelInfo().kernelArgInfo[argNum].offsetHeap;
         auto surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(sshBase, sshOffset));
         EXPECT_EQ(AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_NONE, surfaceState->getAuxiliarySurfaceMode());
@@ -586,7 +586,7 @@ HWTEST_F(BuiltInTests, givenNonAuxToAuxTranslationWhenSettingSurfaceStateThenSet
     {
         // read arg
         auto argNum = 0;
-        auto sshBase = mockAuxBuiltInOp.convertToAuxKernel[0]->getSurfaceStateHeap();
+        auto sshBase = mockAuxBuiltInOp.convertToAuxKernel[0]->getSurfaceStateHeap(rootDeviceIndex);
         auto sshOffset = mockAuxBuiltInOp.convertToAuxKernel[0]->getKernelInfo().kernelArgInfo[argNum].offsetHeap;
         auto surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(sshBase, sshOffset));
         EXPECT_EQ(AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_NONE, surfaceState->getAuxiliarySurfaceMode());
@@ -595,7 +595,7 @@ HWTEST_F(BuiltInTests, givenNonAuxToAuxTranslationWhenSettingSurfaceStateThenSet
     {
         // write arg
         auto argNum = 1;
-        auto sshBase = mockAuxBuiltInOp.convertToAuxKernel[0]->getSurfaceStateHeap();
+        auto sshBase = mockAuxBuiltInOp.convertToAuxKernel[0]->getSurfaceStateHeap(rootDeviceIndex);
         auto sshOffset = mockAuxBuiltInOp.convertToAuxKernel[0]->getKernelInfo().kernelArgInfo[argNum].offsetHeap;
         auto surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(sshBase, sshOffset));
         EXPECT_EQ(AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_CCS_E, surfaceState->getAuxiliarySurfaceMode());
diff --git a/opencl/test/unit_test/command_queue/command_queue_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_tests.cpp
index f17abddff5..d02c49191a 100644
--- a/opencl/test/unit_test/command_queue/command_queue_tests.cpp
+++ b/opencl/test/unit_test/command_queue/command_queue_tests.cpp
@@ -1038,7 +1038,7 @@ HWTEST_F(CommandQueueCommandStreamTest, givenDebugKernelWhenSetupDebugSurfaceIsC
     std::unique_ptr<MockDebugKernel> kernel(MockKernel::create<MockDebugKernel>(*pDevice, &program));
     MockCommandQueue cmdQ(context.get(), pClDevice, 0);
 
-    kernel->setSshLocal(nullptr, sizeof(RENDER_SURFACE_STATE) + kernel->getAllocatedKernelInfo()->patchInfo.pAllocateSystemThreadSurface->Offset);
+    kernel->setSshLocal(nullptr, sizeof(RENDER_SURFACE_STATE) + kernel->getAllocatedKernelInfo()->patchInfo.pAllocateSystemThreadSurface->Offset, rootDeviceIndex);
     kernel->getAllocatedKernelInfo()->usesSsh = true;
     auto &commandStreamReceiver = cmdQ.getGpgpuCommandStreamReceiver();
 
@@ -1047,7 +1047,7 @@ HWTEST_F(CommandQueueCommandStreamTest, givenDebugKernelWhenSetupDebugSurfaceIsC
 
     auto debugSurface = commandStreamReceiver.getDebugSurfaceAllocation();
     ASSERT_NE(nullptr, debugSurface);
-    RENDER_SURFACE_STATE *surfaceState = (RENDER_SURFACE_STATE *)kernel->getSurfaceStateHeap();
+    RENDER_SURFACE_STATE *surfaceState = (RENDER_SURFACE_STATE *)kernel->getSurfaceStateHeap(rootDeviceIndex);
     EXPECT_EQ(debugSurface->getGpuAddress(), surfaceState->getSurfaceBaseAddress());
 }
 
@@ -1058,7 +1058,7 @@ HWTEST_F(CommandQueueCommandStreamTest, givenCsrWithDebugSurfaceAllocatedWhenSet
     std::unique_ptr<MockDebugKernel> kernel(MockKernel::create<MockDebugKernel>(*pDevice, &program));
     MockCommandQueue cmdQ(context.get(), pClDevice, 0);
 
-    kernel->setSshLocal(nullptr, sizeof(RENDER_SURFACE_STATE) + kernel->getAllocatedKernelInfo()->patchInfo.pAllocateSystemThreadSurface->Offset);
+    kernel->setSshLocal(nullptr, sizeof(RENDER_SURFACE_STATE) + kernel->getAllocatedKernelInfo()->patchInfo.pAllocateSystemThreadSurface->Offset, rootDeviceIndex);
     kernel->getAllocatedKernelInfo()->usesSsh = true;
     auto &commandStreamReceiver = cmdQ.getGpgpuCommandStreamReceiver();
     commandStreamReceiver.allocateDebugSurface(SipKernel::maxDbgSurfaceSize);
@@ -1068,7 +1068,7 @@ HWTEST_F(CommandQueueCommandStreamTest, givenCsrWithDebugSurfaceAllocatedWhenSet
     cmdQ.setupDebugSurface(kernel.get());
 
     EXPECT_EQ(debugSurface, commandStreamReceiver.getDebugSurfaceAllocation());
-    RENDER_SURFACE_STATE *surfaceState = (RENDER_SURFACE_STATE *)kernel->getSurfaceStateHeap();
+    RENDER_SURFACE_STATE *surfaceState = (RENDER_SURFACE_STATE *)kernel->getSurfaceStateHeap(rootDeviceIndex);
     EXPECT_EQ(debugSurface->getGpuAddress(), surfaceState->getSurfaceBaseAddress());
 }
 
diff --git a/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp b/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp
index 04bf812539..87e1bda34e 100644
--- a/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp
+++ b/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp
@@ -733,7 +733,7 @@ HWTEST_F(DispatchWalkerTest, GivenBlockedQueueWhenDispatchingWalkerThenRequiredH
 
     auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(kernel);
     auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(rootDeviceIndex, kernel, Math::computeTotalElementsCount(localWorkgroupSize));
-    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(kernel);
+    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(kernel, rootDeviceIndex);
 
     EXPECT_LE(expectedSizeDSH, blockedCommandsData->dsh->getMaxAvailableSpace());
     EXPECT_LE(expectedSizeIOH, blockedCommandsData->ioh->getMaxAvailableSpace());
diff --git a/opencl/test/unit_test/command_queue/enqueue_handler_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_handler_tests.cpp
index 0047af8e33..e875952a5a 100644
--- a/opencl/test/unit_test/command_queue/enqueue_handler_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_handler_tests.cpp
@@ -578,7 +578,7 @@ HWTEST_F(EnqueueHandlerTest, givenKernelUsingSyncBufferWhenEnqueuingKernelThenSs
         kernel->initialize();
 
         auto bindingTableState = reinterpret_cast<BINDING_TABLE_STATE *>(
-            ptrOffset(kernel->getSurfaceStateHeap(), sPatchBindingTableState.Offset));
+            ptrOffset(kernel->getSurfaceStateHeap(rootDeviceIndex), sPatchBindingTableState.Offset));
         bindingTableState->setSurfaceStatePointer(0);
 
         auto mockCmdQ = clUniquePtr(new MockCommandQueueHw<FamilyType>(context, pClDevice, 0));
diff --git a/opencl/test/unit_test/command_queue/get_size_required_buffer_tests.cpp b/opencl/test/unit_test/command_queue/get_size_required_buffer_tests.cpp
index 8a2468cb44..e79b6b52f1 100644
--- a/opencl/test/unit_test/command_queue/get_size_required_buffer_tests.cpp
+++ b/opencl/test/unit_test/command_queue/get_size_required_buffer_tests.cpp
@@ -400,7 +400,7 @@ HWTEST_F(GetSizeRequiredBufferTest, givenMultipleKernelRequiringSshWhenTotalSize
     builder.buildDispatchInfos(multiDispatchInfo);
     builder.buildDispatchInfos(multiDispatchInfo);
 
-    auto sizeSSH = multiDispatchInfo.begin()->getKernel()->getSurfaceStateHeapSize();
+    auto sizeSSH = multiDispatchInfo.begin()->getKernel()->getSurfaceStateHeapSize(rootDeviceIndex);
     sizeSSH += sizeSSH ? FamilyType::BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE : 0;
 
     sizeSSH = alignUp(sizeSSH, MemoryConstants::cacheLineSize);
@@ -439,7 +439,7 @@ HWTEST_F(GetSizeRequiredBufferTest, GivenHelloWorldKernelWhenEnqueingKernelThenH
     auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *pCmdQ, KernelFixture::pKernel);
     auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*KernelFixture::pKernel);
     auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(rootDeviceIndex, *KernelFixture::pKernel, workSize[0]);
-    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*KernelFixture::pKernel);
+    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*KernelFixture::pKernel, rootDeviceIndex);
 
     // Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
     expectedSizeCS += sizeof(typename FamilyType::MI_BATCH_BUFFER_END);
@@ -478,7 +478,7 @@ HWTEST_F(GetSizeRequiredBufferTest, GivenKernelWithSimpleArgWhenEnqueingKernelTh
     auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *pCmdQ, KernelFixture::pKernel);
     auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*KernelFixture::pKernel);
     auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(rootDeviceIndex, *KernelFixture::pKernel, workSize[0]);
-    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*KernelFixture::pKernel);
+    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*KernelFixture::pKernel, rootDeviceIndex);
 
     EXPECT_EQ(0u, expectedSizeIOH % GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
     EXPECT_EQ(0u, expectedSizeDSH % 64);
diff --git a/opencl/test/unit_test/command_queue/get_size_required_image_tests.cpp b/opencl/test/unit_test/command_queue/get_size_required_image_tests.cpp
index d636cce3f7..3d00f31654 100644
--- a/opencl/test/unit_test/command_queue/get_size_required_image_tests.cpp
+++ b/opencl/test/unit_test/command_queue/get_size_required_image_tests.cpp
@@ -93,7 +93,7 @@ HWTEST_F(GetSizeRequiredImageTest, WhenCopyingImageThenHeapsAndCommandBufferCons
     auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_COPY_IMAGE, false, false, *pCmdQ, kernel);
     auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
     auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(rootDeviceIndex, *kernel);
-    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);
+    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel, rootDeviceIndex);
 
     // Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
     expectedSizeCS += sizeof(typename FamilyType::MI_BATCH_BUFFER_END);
@@ -140,7 +140,7 @@ HWTEST_F(GetSizeRequiredImageTest, WhenCopyingReadWriteImageThenHeapsAndCommandB
     auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_COPY_IMAGE, false, false, *pCmdQ, kernel.get());
     auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel.get());
     auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(rootDeviceIndex, *kernel.get());
-    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel.get());
+    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel.get(), rootDeviceIndex);
 
     // Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
     expectedSizeCS += sizeof(typename FamilyType::MI_BATCH_BUFFER_END);
@@ -197,7 +197,7 @@ HWTEST_F(GetSizeRequiredImageTest, WhenReadingImageNonBlockingThenHeapsAndComman
     auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_READ_IMAGE, false, false, *pCmdQ, kernel);
     auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
     auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(rootDeviceIndex, *kernel);
-    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);
+    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel, rootDeviceIndex);
 
     // Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
     expectedSizeCS += sizeof(typename FamilyType::MI_BATCH_BUFFER_END);
@@ -252,7 +252,7 @@ HWTEST_F(GetSizeRequiredImageTest, WhenReadingImageBlockingThenHeapsAndCommandBu
     auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_READ_IMAGE, false, false, *pCmdQ, kernel);
     auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
     auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(rootDeviceIndex, *kernel);
-    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);
+    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel, rootDeviceIndex);
 
     // Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
     expectedSizeCS += sizeof(typename FamilyType::MI_BATCH_BUFFER_END);
@@ -307,7 +307,7 @@ HWTEST_F(GetSizeRequiredImageTest, WhenWritingImageNonBlockingThenHeapsAndComman
     auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_WRITE_IMAGE, false, false, *pCmdQ, kernel);
     auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
     auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(rootDeviceIndex, *kernel);
-    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);
+    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel, rootDeviceIndex);
 
     // Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
     expectedSizeCS += sizeof(typename FamilyType::MI_BATCH_BUFFER_END);
@@ -362,7 +362,7 @@ HWTEST_F(GetSizeRequiredImageTest, WhenWritingImageBlockingThenHeapsAndCommandBu
     auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_WRITE_IMAGE, false, false, *pCmdQ, kernel);
     auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
     auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(rootDeviceIndex, *kernel);
-    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);
+    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel, rootDeviceIndex);
 
     // Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
     expectedSizeCS += sizeof(typename FamilyType::MI_BATCH_BUFFER_END);
diff --git a/opencl/test/unit_test/command_queue/sync_buffer_handler_tests.cpp b/opencl/test/unit_test/command_queue/sync_buffer_handler_tests.cpp
index f308c2ba2e..aa988999ef 100644
--- a/opencl/test/unit_test/command_queue/sync_buffer_handler_tests.cpp
+++ b/opencl/test/unit_test/command_queue/sync_buffer_handler_tests.cpp
@@ -180,7 +180,7 @@ HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenSshRequiredWhenPatchingSyncBuffer
 
     pClDevice->allocateSyncBufferHandler();
     auto syncBufferHandler = getSyncBufferHandler();
-    auto surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(kernel->getSurfaceStateHeap(),
+    auto surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(kernel->getSurfaceStateHeap(rootDeviceIndex),
                                                                            sPatchAllocateSyncBuffer.SurfaceStateHeapOffset));
     auto bufferAddress = syncBufferHandler->graphicsAllocation->getGpuAddress();
     surfaceState->setSurfaceBaseAddress(bufferAddress + 1);
diff --git a/opencl/test/unit_test/device_queue/device_queue_hw_tests.cpp b/opencl/test/unit_test/device_queue/device_queue_hw_tests.cpp
index 4015d48e44..76bdc1e888 100644
--- a/opencl/test/unit_test/device_queue/device_queue_hw_tests.cpp
+++ b/opencl/test/unit_test/device_queue/device_queue_hw_tests.cpp
@@ -537,7 +537,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, DeviceQueueHwWithKernel, WhenSetiingIUpIndirectState
     auto dsh = devQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
     ASSERT_NE(nullptr, dsh);
 
-    size_t surfaceStateHeapSize = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(const_cast<const Kernel &>(*pKernel));
+    size_t surfaceStateHeapSize = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(const_cast<const Kernel &>(*pKernel), rootDeviceIndex);
 
     auto ssh = new IndirectHeap(alignedMalloc(surfaceStateHeapSize, MemoryConstants::pageSize), surfaceStateHeapSize);
     auto usedBeforeSSH = ssh->getUsed();
@@ -565,7 +565,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, DeviceQueueHwWithKernel, WhenSettingUpIndirectStateT
     auto dsh = devQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
     ASSERT_NE(nullptr, dsh);
 
-    size_t surfaceStateHeapSize = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(const_cast<const Kernel &>(*pKernel));
+    size_t surfaceStateHeapSize = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(const_cast<const Kernel &>(*pKernel), rootDeviceIndex);
 
     auto ssh = new IndirectHeap(alignedMalloc(surfaceStateHeapSize, MemoryConstants::pageSize), surfaceStateHeapSize);
 
@@ -593,7 +593,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, DeviceQueueHwWithKernel, WhenSettingUpIndirectStateT
     auto dsh = devQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
     ASSERT_NE(nullptr, dsh);
 
-    size_t surfaceStateHeapSize = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(const_cast<const Kernel &>(*pKernel));
+    size_t surfaceStateHeapSize = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(const_cast<const Kernel &>(*pKernel), rootDeviceIndex);
 
     auto ssh = new IndirectHeap(alignedMalloc(surfaceStateHeapSize, MemoryConstants::pageSize), surfaceStateHeapSize);
 
@@ -631,7 +631,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, DeviceQueueHwWithKernel, GivenHasBarriersSetWhenCall
     }
 
     auto surfaceStateHeapSize =
-        HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(const_cast<const Kernel &>(*pKernel));
+        HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(const_cast<const Kernel &>(*pKernel), rootDeviceIndex);
     auto ssh = std::make_unique<IndirectHeap>(alignedMalloc(surfaceStateHeapSize, MemoryConstants::pageSize), surfaceStateHeapSize);
 
     devQueueHw->setupIndirectState(*ssh, *dsh, pKernel, parentCount, false);
diff --git a/opencl/test/unit_test/execution_model/enqueue_execution_model_kernel_tests.cpp b/opencl/test/unit_test/execution_model/enqueue_execution_model_kernel_tests.cpp
index 49e1e87bf9..ca9564c6df 100644
--- a/opencl/test/unit_test/execution_model/enqueue_execution_model_kernel_tests.cpp
+++ b/opencl/test/unit_test/execution_model/enqueue_execution_model_kernel_tests.cpp
@@ -297,7 +297,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, ParentKernelEnqueueTest, givenParentKernelWhenEnqueu
     BlockKernelManager *blockManager = pProgram->getBlockKernelManager();
     uint32_t blockCount = static_cast<uint32_t>(blockManager->getCount());
 
-    size_t parentKernelSSHSize = pKernel->getSurfaceStateHeapSize();
+    size_t parentKernelSSHSize = pKernel->getSurfaceStateHeapSize(rootDeviceIndex);
 
     MockMultiDispatchInfo multiDispatchInfo(pClDevice, pKernel);
 
@@ -340,7 +340,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, ParentKernelEnqueueTest, givenParentKernelWhenEnqueu
                 EXPECT_EQ(0, memcmp(srcSurfaceState, dstSurfaceState, sizeof(RENDER_SURFACE_STATE)));
             }
 
-            blockSSH = ptrOffset(blockSSH, blockKernel->getSurfaceStateHeapSize());
+            blockSSH = ptrOffset(blockSSH, blockKernel->getSurfaceStateHeapSize(rootDeviceIndex));
         }
 
         delete blockKernel;
diff --git a/opencl/test/unit_test/execution_model/parent_kernel_dispatch_tests.cpp b/opencl/test/unit_test/execution_model/parent_kernel_dispatch_tests.cpp
index 25119134d0..464acd730c 100644
--- a/opencl/test/unit_test/execution_model/parent_kernel_dispatch_tests.cpp
+++ b/opencl/test/unit_test/execution_model/parent_kernel_dispatch_tests.cpp
@@ -132,7 +132,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, ParentKernelDispatchTest, givenParentKernelWhenQueue
     EXPECT_LE(pKernel->getKernelInfo().heapInfo.SurfaceStateHeapSize, ssh.getMaxAvailableSpace());
 
     size_t minRequiredSize = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredSSH(multiDispatchInfo);
-    size_t minRequiredSizeForEM = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*pKernel);
+    size_t minRequiredSizeForEM = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*pKernel, rootDeviceIndex);
 
     EXPECT_LE(minRequiredSize + minRequiredSizeForEM, ssh.getMaxAvailableSpace());
 }
@@ -162,7 +162,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, ParentKernelDispatchTest, givenParentKernelWhenQueue
     ASSERT_NE(nullptr, blockedCommandsData);
 
     size_t minRequiredSize = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredSSH(multiDispatchInfo) + UnitTestHelper<FamilyType>::getDefaultSshUsage();
-    size_t minRequiredSizeForEM = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*pKernel);
+    size_t minRequiredSizeForEM = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*pKernel, rootDeviceIndex);
 
     size_t sshUsed = blockedCommandsData->ssh->getUsed();
 
diff --git a/opencl/test/unit_test/execution_model/scheduler_dispatch_tests.cpp b/opencl/test/unit_test/execution_model/scheduler_dispatch_tests.cpp
index 04d2229b1a..27a2a1f419 100644
--- a/opencl/test/unit_test/execution_model/scheduler_dispatch_tests.cpp
+++ b/opencl/test/unit_test/execution_model/scheduler_dispatch_tests.cpp
@@ -53,7 +53,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ExecutionModelSchedulerFixture, WhenDispatchingSched
 
     EXPECT_NE(nullptr, executionModelDsh);
 
-    size_t minRequiredSizeForSchedulerSSH = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel);
+    size_t minRequiredSizeForSchedulerSSH = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel, rootDeviceIndex);
     // Setup heaps in pCmdQ
     MultiDispatchInfo multiDispatchinfo(&scheduler);
     LinearStream &commandStream = getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*pCmdQ, CsrDependencies(),
@@ -174,7 +174,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ExecutionModelSchedulerFixture, WhenDispatchingSched
     DeviceQueueHw<FamilyType> *pDevQueueHw = castToObject<DeviceQueueHw<FamilyType>>(pDevQueue);
     SchedulerKernel &scheduler = context->getSchedulerKernel();
 
-    size_t minRequiredSizeForSchedulerSSH = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel);
+    size_t minRequiredSizeForSchedulerSSH = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel, rootDeviceIndex);
     // Setup heaps in pCmdQ
 
     MultiDispatchInfo multiDispatchinfo(&scheduler);
@@ -209,7 +209,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ParentKernelCommandQueueFixture, GivenEarlyReturnSet
 
     SchedulerKernel &scheduler = context->getSchedulerKernel();
 
-    size_t minRequiredSizeForSchedulerSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(scheduler);
+    size_t minRequiredSizeForSchedulerSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(scheduler, rootDeviceIndex);
     // Setup heaps in pCmdQ
     MultiDispatchInfo multiDispatchinfo(&scheduler);
     LinearStream &commandStream = getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*pCmdQ, CsrDependencies(),
diff --git a/opencl/test/unit_test/execution_model/submit_blocked_parent_kernel_tests.cpp b/opencl/test/unit_test/execution_model/submit_blocked_parent_kernel_tests.cpp
index 172d5cb04e..7c73e59880 100644
--- a/opencl/test/unit_test/execution_model/submit_blocked_parent_kernel_tests.cpp
+++ b/opencl/test/unit_test/execution_model/submit_blocked_parent_kernel_tests.cpp
@@ -97,7 +97,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ParentKernelCommandQueueFixture, givenLockedEMcritca
 
     dsh->getSpace(mockDevQueue.getDshOffset());
 
-    size_t minSizeSSHForEM = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel);
+    size_t minSizeSSHForEM = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel, rootDeviceIndex);
 
     auto cmdStreamAllocation = device->getMemoryManager()->allocateGraphicsMemoryWithProperties({device->getRootDeviceIndex(), 4096, GraphicsAllocation::AllocationType::COMMAND_BUFFER, device->getDeviceBitfield()});
     auto blockedCommandData = std::make_unique<KernelOperation>(new LinearStream(cmdStreamAllocation),
@@ -162,7 +162,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ParentKernelCommandQueueFixture, givenParentKernelWh
                                                                 *pCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage());
     blockedCommandData->setHeaps(dsh, ioh, ssh);
 
-    size_t minSizeSSHForEM = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel);
+    size_t minSizeSSHForEM = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel, rootDeviceIndex);
 
     blockedCommandData->surfaceStateHeapSizeEM = minSizeSSHForEM;
     PreemptionMode preemptionMode = device->getPreemptionMode();
@@ -203,7 +203,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ParentKernelCommandQueueFixture, givenParentKernelWh
                                                                 *pCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage());
     blockedCommandData->setHeaps(dsh, ioh, ssh);
 
-    size_t minSizeSSHForEM = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel);
+    size_t minSizeSSHForEM = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel, rootDeviceIndex);
 
     blockedCommandData->surfaceStateHeapSizeEM = minSizeSSHForEM;
     PreemptionMode preemptionMode = device->getPreemptionMode();
@@ -241,7 +241,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ParentKernelCommandQueueFixture, givenBlockedParentK
                                                                 *pCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage());
     blockedCommandData->setHeaps(dsh, ioh, ssh);
 
-    size_t minSizeSSHForEM = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel);
+    size_t minSizeSSHForEM = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel, rootDeviceIndex);
 
     blockedCommandData->surfaceStateHeapSizeEM = minSizeSSHForEM;
     PreemptionMode preemptionMode = device->getPreemptionMode();
@@ -282,7 +282,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ParentKernelCommandQueueFixture, givenParentKernelWh
                                                                 *pCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage());
     blockedCommandData->setHeaps(dsh, ioh, ssh);
 
-    size_t minSizeSSHForEM = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel);
+    size_t minSizeSSHForEM = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel, rootDeviceIndex);
 
     blockedCommandData->surfaceStateHeapSizeEM = minSizeSSHForEM;
     PreemptionMode preemptionMode = device->getPreemptionMode();
@@ -308,7 +308,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ParentKernelCommandQueueFixture, givenUsedCommandQue
 
     MockCommandQueue cmdQ(context, device, properties);
 
-    size_t minSizeSSHForEM = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel);
+    size_t minSizeSSHForEM = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel, rootDeviceIndex);
 
     size_t heapSize = 20;
 
@@ -362,7 +362,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ParentKernelCommandQueueFixture, givenNotUsedSSHWhen
     parentKernel->createReflectionSurface();
     context->setDefaultDeviceQueue(&mockDevQueue);
 
-    size_t minSizeSSHForEM = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel);
+    size_t minSizeSSHForEM = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel, rootDeviceIndex);
 
     size_t heapSize = 20;
 
diff --git a/opencl/test/unit_test/fixtures/execution_model_fixture.h b/opencl/test/unit_test/fixtures/execution_model_fixture.h
index 912ca179bf..63318b225c 100644
--- a/opencl/test/unit_test/fixtures/execution_model_fixture.h
+++ b/opencl/test/unit_test/fixtures/execution_model_fixture.h
@@ -109,7 +109,7 @@ struct ParentKernelCommandQueueFixture : public CommandQueueHwFixture,
                                          testing::Test {
 
     void SetUp() override {
-        device = new MockClDevice{MockDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr)};
+        device = new MockClDevice{MockDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr, rootDeviceIndex)};
         CommandQueueHwFixture::SetUp(device, 0);
     }
     void TearDown() override {
@@ -125,4 +125,5 @@ struct ParentKernelCommandQueueFixture : public CommandQueueHwFixture,
 
         return std::make_unique<KernelOperation>(commandStream, *gpgpuCsr.getInternalAllocationStorage());
     }
+    const uint32_t rootDeviceIndex = 0u;
 };
diff --git a/opencl/test/unit_test/gen8/scheduler_dispatch_tests_gen8.cpp b/opencl/test/unit_test/gen8/scheduler_dispatch_tests_gen8.cpp
index c55257a7cf..20c007cbb7 100644
--- a/opencl/test/unit_test/gen8/scheduler_dispatch_tests_gen8.cpp
+++ b/opencl/test/unit_test/gen8/scheduler_dispatch_tests_gen8.cpp
@@ -31,7 +31,7 @@ BDWTEST_F(BdwSchedulerTest, givenCallToDispatchSchedulerWhenPipeControlWithCSSta
         DeviceQueueHw<FamilyType> *pDevQueueHw = castToObject<DeviceQueueHw<FamilyType>>(pDevQueue);
         SchedulerKernel &scheduler = context->getSchedulerKernel();
 
-        size_t minRequiredSizeForSchedulerSSH = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel);
+        size_t minRequiredSizeForSchedulerSSH = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel, rootDeviceIndex);
 
         // Setup heaps in pCmdQ
         MultiDispatchInfo multiDispatchinfo(&scheduler);
diff --git a/opencl/test/unit_test/gtpin/gtpin_tests.cpp b/opencl/test/unit_test/gtpin/gtpin_tests.cpp
index 20917137a7..1a54501083 100644
--- a/opencl/test/unit_test/gtpin/gtpin_tests.cpp
+++ b/opencl/test/unit_test/gtpin/gtpin_tests.cpp
@@ -160,7 +160,8 @@ class GTPinFixture : public ContextFixture, public MemoryManagementFixture {
         executionEnvironment->memoryManager.reset(memoryManager);
         initPlatform();
         pDevice = pPlatform->getClDevice(0);
-        cl_device_id device = (cl_device_id)pDevice;
+        rootDeviceIndex = pDevice->getRootDeviceIndex();
+        cl_device_id device = pDevice;
         ContextFixture::SetUp(1, &device);
 
         driverServices.bufferAllocate = nullptr;
@@ -193,6 +194,7 @@ class GTPinFixture : public ContextFixture, public MemoryManagementFixture {
     driver_services_t driverServices;
     gtpin::ocl::gtpin_events_t gtpinCallbacks;
     MockMemoryManagerWithFailures *memoryManager = nullptr;
+    uint32_t rootDeviceIndex = std::numeric_limits<uint32_t>::max();
 };
 
 typedef Test<GTPinFixture> GTPinTests;
@@ -1279,7 +1281,7 @@ TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenKernelWithoutSSHIsUsedThenG
 
     // Verify that when SSH is removed then during kernel execution
     // GT-Pin Kernel Submit, Command Buffer Create and Command Buffer Complete callbacks are not called.
-    pKernel->resizeSurfaceStateHeap(nullptr, 0, 0, 0);
+    pKernel->resizeSurfaceStateHeap(rootDeviceIndex, nullptr, 0, 0, 0);
 
     int prevCount2 = KernelSubmitCallbackCount;
     int prevCount3 = CommandBufferCreateCallbackCount;
@@ -1392,7 +1394,7 @@ TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenBlockedKernelWithoutSSHIsUs
 
     // Verify that when SSH is removed then during kernel execution
     // GT-Pin Kernel Submit, Command Buffer Create and Command Buffer Complete callbacks are not called.
-    pKernel->resizeSurfaceStateHeap(nullptr, 0, 0, 0);
+    pKernel->resizeSurfaceStateHeap(rootDeviceIndex, nullptr, 0, 0, 0);
 
     cl_event userEvent = clCreateUserEvent(context, &retVal);
     EXPECT_EQ(CL_SUCCESS, retVal);
@@ -2167,15 +2169,15 @@ TEST_F(GTPinTests, givenParentKernelWhenGtPinAddingSurfaceStateThenItIsNotAddedA
     std::unique_ptr<MockParentKernel> parentKernel(MockParentKernel::create(*pContext));
 
     parentKernel->mockKernelInfo->usesSsh = true;
-    parentKernel->sshLocalSize = 64;
-    parentKernel->pSshLocal.reset(new char[64]);
+    parentKernel->kernelDeviceInfos[rootDeviceIndex].sshLocalSize = 64;
+    parentKernel->kernelDeviceInfos[rootDeviceIndex].pSshLocal.reset(new char[64]);
 
-    size_t sizeSurfaceStates1 = parentKernel->getSurfaceStateHeapSize();
+    size_t sizeSurfaceStates1 = parentKernel->getSurfaceStateHeapSize(rootDeviceIndex);
 
-    bool surfaceAdded = gtpinHelper.addSurfaceState(parentKernel.get());
+    bool surfaceAdded = gtpinHelper.addSurfaceState(parentKernel.get(), rootDeviceIndex);
     EXPECT_FALSE(surfaceAdded);
 
-    size_t sizeSurfaceStates2 = parentKernel->getSurfaceStateHeapSize();
+    size_t sizeSurfaceStates2 = parentKernel->getSurfaceStateHeapSize(rootDeviceIndex);
     EXPECT_EQ(sizeSurfaceStates2, sizeSurfaceStates1);
 }
 
@@ -2225,47 +2227,47 @@ TEST_F(GTPinTests, givenKernelWithSSHThenVerifyThatSSHResizeWorksWell) {
 
     size_t numBTS1 = pKernel->getNumberOfBindingTableStates();
     EXPECT_EQ(2u, numBTS1);
-    size_t sizeSurfaceStates1 = pKernel->getSurfaceStateHeapSize();
+    size_t sizeSurfaceStates1 = pKernel->getSurfaceStateHeapSize(rootDeviceIndex);
     EXPECT_NE(0u, sizeSurfaceStates1);
     size_t offsetBTS1 = pKernel->getBindingTableOffset();
     EXPECT_NE(0u, offsetBTS1);
 
     GFXCORE_FAMILY genFamily = pDevice->getHardwareInfo().platform.eRenderCoreFamily;
     GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(genFamily);
-    void *pSS1 = gtpinHelper.getSurfaceState(pKernel, 0);
+    void *pSS1 = gtpinHelper.getSurfaceState(pKernel, 0, rootDeviceIndex);
     EXPECT_NE(nullptr, pSS1);
 
     // Enlarge SSH by one SURFACE STATE element
-    bool surfaceAdded = gtpinHelper.addSurfaceState(pKernel);
+    bool surfaceAdded = gtpinHelper.addSurfaceState(pKernel, rootDeviceIndex);
     EXPECT_TRUE(surfaceAdded);
 
     size_t numBTS2 = pKernel->getNumberOfBindingTableStates();
     EXPECT_EQ(numBTS1 + 1, numBTS2);
-    size_t sizeSurfaceStates2 = pKernel->getSurfaceStateHeapSize();
+    size_t sizeSurfaceStates2 = pKernel->getSurfaceStateHeapSize(rootDeviceIndex);
     EXPECT_GT(sizeSurfaceStates2, sizeSurfaceStates1);
     size_t offsetBTS2 = pKernel->getBindingTableOffset();
     EXPECT_GT(offsetBTS2, offsetBTS1);
 
-    void *pSS2 = gtpinHelper.getSurfaceState(pKernel, 0);
+    void *pSS2 = gtpinHelper.getSurfaceState(pKernel, 0, rootDeviceIndex);
     EXPECT_NE(pSS2, pSS1);
 
-    pSS2 = gtpinHelper.getSurfaceState(pKernel, numBTS2);
+    pSS2 = gtpinHelper.getSurfaceState(pKernel, numBTS2, rootDeviceIndex);
     EXPECT_EQ(nullptr, pSS2);
 
     // Remove kernel's SSH
-    pKernel->resizeSurfaceStateHeap(nullptr, 0, 0, 0);
+    pKernel->resizeSurfaceStateHeap(rootDeviceIndex, nullptr, 0, 0, 0);
 
     // Try to enlarge SSH once again, this time the operation must fail
-    surfaceAdded = gtpinHelper.addSurfaceState(pKernel);
+    surfaceAdded = gtpinHelper.addSurfaceState(pKernel, rootDeviceIndex);
     EXPECT_FALSE(surfaceAdded);
 
     size_t numBTS3 = pKernel->getNumberOfBindingTableStates();
     EXPECT_EQ(0u, numBTS3);
-    size_t sizeSurfaceStates3 = pKernel->getSurfaceStateHeapSize();
+    size_t sizeSurfaceStates3 = pKernel->getSurfaceStateHeapSize(rootDeviceIndex);
     EXPECT_EQ(0u, sizeSurfaceStates3);
     size_t offsetBTS3 = pKernel->getBindingTableOffset();
     EXPECT_EQ(0u, offsetBTS3);
-    void *pSS3 = gtpinHelper.getSurfaceState(pKernel, 0);
+    void *pSS3 = gtpinHelper.getSurfaceState(pKernel, 0, rootDeviceIndex);
     EXPECT_EQ(nullptr, pSS3);
 
     // Cleanup
@@ -2396,7 +2398,7 @@ TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenOnKernelSubitIsCalledThenCo
     std::unique_ptr<MockCommandQueue> cmdQ(new MockCommandQueue(context.get(), pDevice, nullptr));
     std::unique_ptr<MockKernel> pKernel(new MockKernel(pProgramm.get(), *pKernelInfo));
 
-    pKernel->setSshLocal(nullptr, sizeof(surfaceStateHeap));
+    pKernel->setSshLocal(nullptr, sizeof(surfaceStateHeap), rootDeviceIndex);
 
     kernelOffset = 0x1234;
     EXPECT_NE(pKernel->getStartOffset(), kernelOffset);
diff --git a/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp b/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp
index 38c18a0a37..04b4f53b93 100644
--- a/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp
+++ b/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp
@@ -363,7 +363,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenAllocatingIndirectStateRes
     auto usedAfterSSH = ssh.getUsed();
     auto sizeRequiredDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
     auto sizeRequiredIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(rootDeviceIndex, *kernel, localWorkSize);
-    auto sizeRequiredSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);
+    auto sizeRequiredSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel, rootDeviceIndex);
 
     EXPECT_GE(sizeRequiredDSH, usedAfterDSH - usedBeforeDSH);
     EXPECT_GE(sizeRequiredIOH, usedAfterIOH - usedBeforeIOH);
@@ -1005,7 +1005,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, GivenKernelWithSamplersWhenInd
     }
 
     mockKernelWithInternal->mockKernel->setCrossThreadData(mockKernelWithInternal->crossThreadData, sizeof(mockKernelWithInternal->crossThreadData));
-    mockKernelWithInternal->mockKernel->setSshLocal(mockKernelWithInternal->sshLocal, sizeof(mockKernelWithInternal->sshLocal));
+    mockKernelWithInternal->mockKernel->setSshLocal(mockKernelWithInternal->sshLocal, sizeof(mockKernelWithInternal->sshLocal), rootDeviceIndex);
     uint32_t interfaceDescriptorIndex = 0;
     auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
     auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel);
@@ -1096,12 +1096,12 @@ HWCMDTEST_P(IGFX_GEN8_CORE, ParentKernelCommandsFromBinaryTest, WhenGettingSizeR
     totalSize += maxBindingTableCount * sizeof(BINDING_TABLE_STATE) * DeviceQueue::interfaceDescriptorEntries;
 
     auto &scheduler = pContext->getSchedulerKernel();
-    auto schedulerSshSize = scheduler.getSurfaceStateHeapSize();
+    auto schedulerSshSize = scheduler.getSurfaceStateHeapSize(rootDeviceIndex);
     totalSize += schedulerSshSize + ((schedulerSshSize != 0) ? BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE : 0);
 
     totalSize = alignUp(totalSize, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
 
-    EXPECT_EQ(totalSize, HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*pKernel));
+    EXPECT_EQ(totalSize, HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*pKernel, rootDeviceIndex));
 }
 
 static const char *binaryFile = "simple_block_kernel";
diff --git a/opencl/test/unit_test/helpers/hardware_commands_helper_tests.h b/opencl/test/unit_test/helpers/hardware_commands_helper_tests.h
index 110ed65755..731abf1056 100644
--- a/opencl/test/unit_test/helpers/hardware_commands_helper_tests.h
+++ b/opencl/test/unit_test/helpers/hardware_commands_helper_tests.h
@@ -46,7 +46,7 @@ struct HardwareCommandsTest : ClDeviceFixture,
     template <typename GfxFamily>
     size_t pushBindingTableAndSurfaceStates(IndirectHeap &dstHeap, const Kernel &srcKernel) {
         return EncodeSurfaceState<GfxFamily>::pushBindingTableAndSurfaceStates(dstHeap, (srcKernel.getKernelInfo().patchInfo.bindingTableState != nullptr) ? srcKernel.getKernelInfo().patchInfo.bindingTableState->Count : 0,
-                                                                               srcKernel.getSurfaceStateHeap(), srcKernel.getSurfaceStateHeapSize(),
+                                                                               srcKernel.getSurfaceStateHeap(rootDeviceIndex), srcKernel.getSurfaceStateHeapSize(rootDeviceIndex),
                                                                                srcKernel.getNumberOfBindingTableStates(), srcKernel.getBindingTableOffset());
     }
 };
diff --git a/opencl/test/unit_test/kernel/kernel_arg_buffer_tests.cpp b/opencl/test/unit_test/kernel/kernel_arg_buffer_tests.cpp
index 3ee16a3343..b0c4ad48d5 100644
--- a/opencl/test/unit_test/kernel/kernel_arg_buffer_tests.cpp
+++ b/opencl/test/unit_test/kernel/kernel_arg_buffer_tests.cpp
@@ -58,7 +58,7 @@ TEST_F(KernelArgBufferTest, GivenSvmPtrStatelessWhenSettingKernelArgThenArgument
     EXPECT_EQ(CL_SUCCESS, retVal);
     EXPECT_FALSE(pKernel->requiresCoherency());
 
-    EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize());
+    EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));
 
     delete buffer;
 }
@@ -76,11 +76,11 @@ HWTEST_F(KernelArgBufferTest, GivenSvmPtrStatefulWhenSettingKernelArgThenArgumen
     EXPECT_EQ(CL_SUCCESS, retVal);
     EXPECT_FALSE(pKernel->requiresCoherency());
 
-    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize());
+    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));
 
     typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
     auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(), pKernelInfo->kernelArgInfo[0].offsetHeap));
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex), pKernelInfo->kernelArgInfo[0].offsetHeap));
 
     auto surfaceAddress = surfaceState->getSurfaceBaseAddress();
     EXPECT_EQ(buffer->getGraphicsAllocation(mockRootDeviceIndex)->getGpuAddress(), surfaceAddress);
diff --git a/opencl/test/unit_test/kernel/kernel_arg_pipe_tests.cpp b/opencl/test/unit_test/kernel/kernel_arg_pipe_tests.cpp
index ea61c47c23..43db09eb00 100644
--- a/opencl/test/unit_test/kernel/kernel_arg_pipe_tests.cpp
+++ b/opencl/test/unit_test/kernel/kernel_arg_pipe_tests.cpp
@@ -110,7 +110,7 @@ TEST_F(KernelArgPipeTest, GivenSvmPtrStatelessWhenSettingKernelArgThenArgumentsA
     auto retVal = this->pKernel->setArg(0, sizeof(cl_mem *), pVal);
     EXPECT_EQ(CL_SUCCESS, retVal);
 
-    EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize());
+    EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));
 
     delete pipe;
 }
@@ -127,11 +127,11 @@ HWTEST_F(KernelArgPipeTest, GivenSvmPtrStatefulWhenSettingKernelArgThenArguments
     auto retVal = this->pKernel->setArg(0, sizeof(cl_mem *), pVal);
     EXPECT_EQ(CL_SUCCESS, retVal);
 
-    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize());
+    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));
 
     typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
     auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                   pKernelInfo->kernelArgInfo[0].offsetHeap));
 
     void *surfaceAddress = reinterpret_cast<void *>(surfaceState->getSurfaceBaseAddress());
diff --git a/opencl/test/unit_test/kernel/kernel_arg_svm_tests.cpp b/opencl/test/unit_test/kernel/kernel_arg_svm_tests.cpp
index 47e87bbfb3..6040a74d83 100644
--- a/opencl/test/unit_test/kernel/kernel_arg_svm_tests.cpp
+++ b/opencl/test/unit_test/kernel/kernel_arg_svm_tests.cpp
@@ -101,7 +101,7 @@ TEST_F(KernelArgSvmTest, GivenSvmPtrStatelessWhenSettingKernelArgThenArgumentsAr
     auto retVal = pKernel->setArgSvm(0, 256, svmPtr, nullptr, 0u);
     EXPECT_EQ(CL_SUCCESS, retVal);
 
-    EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize());
+    EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));
 
     delete[] svmPtr;
 }
@@ -115,11 +115,11 @@ HWTEST_F(KernelArgSvmTest, GivenSvmPtrStatefulWhenSettingKernelArgThenArgumentsA
     auto retVal = pKernel->setArgSvm(0, 256, svmPtr, nullptr, 0u);
     EXPECT_EQ(CL_SUCCESS, retVal);
 
-    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize());
+    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));
 
     typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
     auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                   pKernelInfo->kernelArgInfo[0].offsetHeap));
 
     void *surfaceAddress = reinterpret_cast<void *>(surfaceState->getSurfaceBaseAddress());
@@ -154,7 +154,7 @@ TEST_F(KernelArgSvmTest, GivenValidSvmAllocStatelessWhenSettingKernelArgThenArgu
     auto retVal = pKernel->setArgSvmAlloc(0, svmPtr, &svmAlloc);
     EXPECT_EQ(CL_SUCCESS, retVal);
 
-    EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize());
+    EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));
 
     delete[] svmPtr;
 }
@@ -170,11 +170,11 @@ HWTEST_F(KernelArgSvmTest, GivenValidSvmAllocStatefulWhenSettingKernelArgThenArg
     auto retVal = pKernel->setArgSvmAlloc(0, svmPtr, &svmAlloc);
     EXPECT_EQ(CL_SUCCESS, retVal);
 
-    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize());
+    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));
 
     typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
     auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                   pKernelInfo->kernelArgInfo[0].offsetHeap));
 
     void *surfaceAddress = reinterpret_cast<void *>(surfaceState->getSurfaceBaseAddress());
@@ -196,7 +196,7 @@ HWTEST_F(KernelArgSvmTest, givenOffsetedSvmPointerWhenSetArgSvmAllocIsCalledThen
 
     typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
     auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                   pKernelInfo->kernelArgInfo[0].offsetHeap));
 
     void *surfaceAddress = reinterpret_cast<void *>(surfaceState->getSurfaceBaseAddress());
@@ -214,7 +214,7 @@ HWTEST_F(KernelArgSvmTest, givenDeviceSupportingSharedSystemAllocationsWhenSetAr
 
     typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
     auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                   pKernelInfo->kernelArgInfo[0].offsetHeap));
 
     void *surfaceAddress = reinterpret_cast<void *>(surfaceState->getSurfaceBaseAddress());
@@ -237,7 +237,7 @@ HWTEST_F(KernelArgSvmTest, WhenPatchingWithImplicitSurfaceThenPatchIsApplied) {
     svmPtr.resize(256);
 
     pKernel->setCrossThreadData(nullptr, sizeof(void *));
-    pKernel->setSshLocal(nullptr, rendSurfSize);
+    pKernel->setSshLocal(nullptr, rendSurfSize, rootDeviceIndex);
     pKernelInfo->requiresSshForBuffers = true;
     pKernelInfo->usesSsh = true;
     {
@@ -254,8 +254,8 @@ HWTEST_F(KernelArgSvmTest, WhenPatchingWithImplicitSurfaceThenPatchIsApplied) {
         ASSERT_GE(pKernel->getCrossThreadDataSize(rootDeviceIndex), sizeof(void *));
         *reinterpret_cast<void **>(pKernel->getCrossThreadData(rootDeviceIndex)) = 0U;
 
-        ASSERT_GE(pKernel->getSurfaceStateHeapSize(), rendSurfSize);
-        RENDER_SURFACE_STATE *surfState = reinterpret_cast<RENDER_SURFACE_STATE *>(pKernel->getSurfaceStateHeap());
+        ASSERT_GE(pKernel->getSurfaceStateHeapSize(rootDeviceIndex), rendSurfSize);
+        RENDER_SURFACE_STATE *surfState = reinterpret_cast<RENDER_SURFACE_STATE *>(pKernel->getSurfaceStateHeap(rootDeviceIndex));
         memset(surfState, 0, rendSurfSize);
 
         pKernel->patchWithImplicitSurface(ptrToPatch, svmAlloc, patch);
@@ -278,7 +278,7 @@ HWTEST_F(KernelArgSvmTest, WhenPatchingWithImplicitSurfaceThenPatchIsApplied) {
 
         // when cross thread and ssh data is not available then should not do anything
         pKernel->setCrossThreadData(nullptr, 0);
-        pKernel->setSshLocal(nullptr, 0);
+        pKernel->setSshLocal(nullptr, 0, rootDeviceIndex);
         pKernel->patchWithImplicitSurface(ptrToPatch, svmAlloc, patch);
     }
 }
@@ -389,7 +389,7 @@ HWTEST_TYPED_TEST(KernelArgSvmTestTyped, GivenBufferKernelArgWhenBufferOffsetIsN
     kai.offsetBufferOffset = kai.kernelArgPatchInfoVector[0].size;
 
     this->pKernel->setCrossThreadData(nullptr, kai.offsetBufferOffset + sizeof(uint32_t));
-    this->pKernel->setSshLocal(nullptr, rendSurfSize);
+    this->pKernel->setSshLocal(nullptr, rendSurfSize, rootDeviceIndex);
     this->pKernelInfo->requiresSshForBuffers = true;
     this->pKernelInfo->usesSsh = true;
     {
@@ -405,8 +405,8 @@ HWTEST_TYPED_TEST(KernelArgSvmTestTyped, GivenBufferKernelArgWhenBufferOffsetIsN
         *expectedPointerPatchPtr = reinterpret_cast<void *>(0U);
         *expectedOffsetPatchPtr = 0U;
 
-        ASSERT_GE(this->pKernel->getSurfaceStateHeapSize(), rendSurfSize);
-        RENDER_SURFACE_STATE *surfState = reinterpret_cast<RENDER_SURFACE_STATE *>(this->pKernel->getSurfaceStateHeap());
+        ASSERT_GE(this->pKernel->getSurfaceStateHeapSize(rootDeviceIndex), rendSurfSize);
+        RENDER_SURFACE_STATE *surfState = reinterpret_cast<RENDER_SURFACE_STATE *>(this->pKernel->getSurfaceStateHeap(rootDeviceIndex));
         memset(surfState, 0, rendSurfSize);
 
         TypeParam::setArg(*this->pKernel, 0U, ptrToPatch, sizeToPatch, svmAlloc);
diff --git a/opencl/test/unit_test/kernel/kernel_tests.cpp b/opencl/test/unit_test/kernel/kernel_tests.cpp
index 1caef7ee1a..28dabfbc6d 100644
--- a/opencl/test/unit_test/kernel/kernel_tests.cpp
+++ b/opencl/test/unit_test/kernel/kernel_tests.cpp
@@ -736,13 +736,13 @@ HWTEST_F(KernelPrivateSurfaceTest, givenStatefulKernelWhenKernelIsCreatedThenPri
 
     ASSERT_EQ(CL_SUCCESS, pKernel->initialize());
 
-    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize());
+    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));
 
     auto bufferAddress = pKernel->kernelDeviceInfos[pDevice->getRootDeviceIndex()].privateSurface->getGpuAddress();
 
     typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
     auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                   pKernelInfo->patchInfo.pAllocateStatelessPrivateSurface->SurfaceStateHeapOffset));
     auto surfaceAddress = surfaceState->getSurfaceBaseAddress();
 
@@ -766,7 +766,7 @@ TEST_F(KernelPrivateSurfaceTest, givenStatelessKernelWhenKernelIsCreatedThenPriv
     char buffer[16];
     MockGraphicsAllocation gfxAlloc(buffer, sizeof(buffer));
 
-    MockContext context;
+    MockContext context(pClDevice);
     MockProgram program(&context, false, toClDeviceVector(*pClDevice));
     program.setConstantSurface(&gfxAlloc);
 
@@ -779,8 +779,8 @@ TEST_F(KernelPrivateSurfaceTest, givenStatelessKernelWhenKernelIsCreatedThenPriv
 
     ASSERT_EQ(CL_SUCCESS, pKernel->initialize());
 
-    EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize());
-    EXPECT_EQ(nullptr, pKernel->getSurfaceStateHeap());
+    EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));
+    EXPECT_EQ(nullptr, pKernel->getSurfaceStateHeap(rootDeviceIndex));
 
     program.setConstantSurface(nullptr);
     delete pKernel;
@@ -984,11 +984,11 @@ HWTEST_F(KernelGlobalSurfaceTest, givenStatefulKernelWhenKernelIsCreatedThenGlob
 
     ASSERT_EQ(CL_SUCCESS, pKernel->initialize());
 
-    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize());
+    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));
 
     typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
     auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                   pKernelInfo->patchInfo.pAllocateStatelessGlobalMemorySurfaceWithInitialization->SurfaceStateHeapOffset));
     auto surfaceAddress = surfaceState->getSurfaceBaseAddress();
 
@@ -1025,8 +1025,8 @@ TEST_F(KernelGlobalSurfaceTest, givenStatelessKernelWhenKernelIsCreatedThenGloba
 
     ASSERT_EQ(CL_SUCCESS, pKernel->initialize());
 
-    EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize());
-    EXPECT_EQ(nullptr, pKernel->getSurfaceStateHeap());
+    EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));
+    EXPECT_EQ(nullptr, pKernel->getSurfaceStateHeap(rootDeviceIndex));
 
     program.setGlobalSurface(nullptr);
     delete pKernel;
@@ -1156,11 +1156,11 @@ HWTEST_F(KernelConstantSurfaceTest, givenStatefulKernelWhenKernelIsCreatedThenCo
 
     ASSERT_EQ(CL_SUCCESS, pKernel->initialize());
 
-    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize());
+    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));
 
     typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
     auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                   pKernelInfo->patchInfo.pAllocateStatelessConstantMemorySurfaceWithInitialization->SurfaceStateHeapOffset));
     auto surfaceAddress = surfaceState->getSurfaceBaseAddress();
 
@@ -1197,8 +1197,8 @@ TEST_F(KernelConstantSurfaceTest, givenStatelessKernelWhenKernelIsCreatedThenCon
 
     ASSERT_EQ(CL_SUCCESS, pKernel->initialize());
 
-    EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize());
-    EXPECT_EQ(nullptr, pKernel->getSurfaceStateHeap());
+    EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));
+    EXPECT_EQ(nullptr, pKernel->getSurfaceStateHeap(rootDeviceIndex));
 
     program.setConstantSurface(nullptr);
     delete pKernel;
@@ -1238,11 +1238,11 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelEventPoolSurfaceTest, givenStatefulKernelWhenK
 
     ASSERT_EQ(CL_SUCCESS, pKernel->initialize());
 
-    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize());
+    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));
 
     typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
     auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                   pKernelInfo->patchInfo.pAllocateStatelessEventPoolSurface->SurfaceStateHeapOffset));
     auto surfaceAddress = surfaceState->getSurfaceBaseAddress();
 
@@ -1291,7 +1291,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelEventPoolSurfaceTest, givenStatefulKernelWhenE
 
     typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
     auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                   pKernelInfo->patchInfo.pAllocateStatelessEventPoolSurface->SurfaceStateHeapOffset));
     auto surfaceAddress = surfaceState->getSurfaceBaseAddress();
 
@@ -1363,7 +1363,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelEventPoolSurfaceTest, givenStatelessKernelWhen
 
     ASSERT_EQ(CL_SUCCESS, pKernel->initialize());
     if (pClDevice->areOcl21FeaturesSupported() == false) {
-        EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize());
+        EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));
     } else {
     }
 
@@ -1442,11 +1442,11 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelDefaultDeviceQueueSurfaceTest, givenStatefulKe
 
     ASSERT_EQ(CL_SUCCESS, pKernel->initialize());
 
-    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize());
+    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));
 
     typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
     auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                   pKernelInfo->patchInfo.pAllocateStatelessDefaultDeviceQueueSurface->SurfaceStateHeapOffset));
     auto surfaceAddress = surfaceState->getSurfaceBaseAddress();
 
@@ -1493,11 +1493,11 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelDefaultDeviceQueueSurfaceTest, givenStatefulKe
 
     pKernel->patchDefaultDeviceQueue(pDevQueue);
 
-    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize());
+    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));
 
     typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
     auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                   pKernelInfo->patchInfo.pAllocateStatelessDefaultDeviceQueueSurface->SurfaceStateHeapOffset));
     auto surfaceAddress = surfaceState->getSurfaceBaseAddress();
 
@@ -1537,7 +1537,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelDefaultDeviceQueueSurfaceTest, givenStatelessK
 
     ASSERT_EQ(CL_SUCCESS, pKernel->initialize());
 
-    EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize());
+    EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));
 
     delete pKernel;
 }
diff --git a/opencl/test/unit_test/kernel/kernel_transformable_tests.cpp b/opencl/test/unit_test/kernel/kernel_transformable_tests.cpp
index a13c1c736d..0b02cff30c 100644
--- a/opencl/test/unit_test/kernel/kernel_transformable_tests.cpp
+++ b/opencl/test/unit_test/kernel/kernel_transformable_tests.cpp
@@ -21,6 +21,7 @@ using namespace NEO;
 class KernelTransformableTest : public ::testing::Test {
   public:
     void SetUp() override {
+        rootDeviceIndex = context.getDevice(0)->getRootDeviceIndex();
         pKernelInfo = std::make_unique<KernelInfo>();
         KernelArgPatchInfo kernelArgPatchInfo;
 
@@ -74,6 +75,7 @@ class KernelTransformableTest : public ::testing::Test {
     std::unique_ptr<Image> image;
     SKernelBinaryHeaderCommon kernelHeader;
     char surfaceStateHeap[0x80];
+    uint32_t rootDeviceIndex = std::numeric_limits<uint32_t>::max();
 };
 
 HWTEST_F(KernelTransformableTest, givenKernelThatCannotTranformImagesWithTwoTransformableImagesAndTwoTransformableSamplersWhenAllArgsAreSetThenImagesAreNotTransformed) {
@@ -93,7 +95,7 @@ HWTEST_F(KernelTransformableTest, givenKernelThatCannotTranformImagesWithTwoTran
     pKernel->setArg(2, sizeof(clImage), &clImage);
     pKernel->setArg(3, sizeof(clImage), &clImage);
 
-    auto ssh = pKernel->getSurfaceStateHeap();
+    auto ssh = pKernel->getSurfaceStateHeap(rootDeviceIndex);
 
     auto firstSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh, firstImageOffset));
     EXPECT_EQ(SURFACE_TYPE::SURFACE_TYPE_SURFTYPE_3D, firstSurfaceState->getSurfaceType());
@@ -120,7 +122,7 @@ HWTEST_F(KernelTransformableTest, givenKernelWithTwoTransformableImagesAndTwoTra
     pKernel->setArg(2, sizeof(clImage), &clImage);
     pKernel->setArg(3, sizeof(clImage), &clImage);
 
-    auto ssh = pKernel->getSurfaceStateHeap();
+    auto ssh = pKernel->getSurfaceStateHeap(rootDeviceIndex);
 
     auto firstSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh, firstImageOffset));
     EXPECT_EQ(SURFACE_TYPE::SURFACE_TYPE_SURFTYPE_2D, firstSurfaceState->getSurfaceType());
@@ -147,7 +149,7 @@ HWTEST_F(KernelTransformableTest, givenKernelWithTwoTransformableImagesAndTwoTra
     pKernel->setArg(2, sizeof(clImage), &clImage);
     pKernel->setArg(3, sizeof(clImage), &clImage);
 
-    auto ssh = pKernel->getSurfaceStateHeap();
+    auto ssh = pKernel->getSurfaceStateHeap(rootDeviceIndex);
 
     auto firstSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh, firstImageOffset));
     auto secondSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh, secondImageOffset));
@@ -179,7 +181,7 @@ HWTEST_F(KernelTransformableTest, givenKernelWithOneTransformableImageAndTwoTran
     pKernel->setArg(2, sizeof(clImage), &clImage);
     pKernel->setArg(3, sizeof(clImage), &clImage);
 
-    auto ssh = pKernel->getSurfaceStateHeap();
+    auto ssh = pKernel->getSurfaceStateHeap(rootDeviceIndex);
 
     auto firstSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh, firstImageOffset));
     auto secondSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh, secondImageOffset));
@@ -201,7 +203,7 @@ HWTEST_F(KernelTransformableTest, givenKernelWithImages2dAndTwoTransformableSamp
     pKernelInfo->kernelArgInfo[2].isTransformable = true;
     pKernelInfo->kernelArgInfo[3].isTransformable = true;
 
-    auto ssh = pKernel->getSurfaceStateHeap();
+    auto ssh = pKernel->getSurfaceStateHeap(rootDeviceIndex);
 
     auto firstSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh, firstImageOffset));
     auto secondSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh, secondImageOffset));
@@ -233,7 +235,7 @@ HWTEST_F(KernelTransformableTest, givenKernelWithTwoTransformableImagesAndTwoTra
     pKernel->setArg(2, sizeof(clImage), &clImage);
     pKernel->setArg(3, sizeof(clImage), &clImage);
 
-    auto ssh = pKernel->getSurfaceStateHeap();
+    auto ssh = pKernel->getSurfaceStateHeap(rootDeviceIndex);
 
     auto firstSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh, firstImageOffset));
     auto secondSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh, secondImageOffset));
@@ -265,7 +267,7 @@ HWTEST_F(KernelTransformableTest, givenKernelWithNonTransformableSamplersWhenRes
     pKernel->setArg(2, sizeof(clImage), &clImage);
     pKernel->setArg(3, sizeof(clImage), &clImage);
 
-    auto ssh = pKernel->getSurfaceStateHeap();
+    auto ssh = pKernel->getSurfaceStateHeap(rootDeviceIndex);
 
     auto firstSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh, firstImageOffset));
     auto secondSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh, secondImageOffset));
@@ -303,7 +305,7 @@ HWTEST_F(KernelTransformableTest, givenKernelWithoutSamplersAndTransformableImag
     pKernel->setArg(2, sizeof(clImage), &clImage);
     pKernel->setArg(3, sizeof(clImage), &clImage);
 
-    auto ssh = pKernel->getSurfaceStateHeap();
+    auto ssh = pKernel->getSurfaceStateHeap(rootDeviceIndex);
 
     auto firstSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh, firstImageOffset));
     auto secondSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh, secondImageOffset));
diff --git a/opencl/test/unit_test/mem_obj/buffer_set_arg_tests.cpp b/opencl/test/unit_test/mem_obj/buffer_set_arg_tests.cpp
index cec23012e1..39b01c90eb 100644
--- a/opencl/test/unit_test/mem_obj/buffer_set_arg_tests.cpp
+++ b/opencl/test/unit_test/mem_obj/buffer_set_arg_tests.cpp
@@ -125,7 +125,7 @@ HWTEST_F(BufferSetArgTest, givenSetArgBufferWhenNullArgStatefulThenProgramNullSu
     using SURFACE_FORMAT = typename RENDER_SURFACE_STATE::SURFACE_FORMAT;
 
     auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                   pKernelInfo->kernelArgInfo[0].offsetHeap));
 
     pKernelInfo->requiresSshForBuffers = true;
@@ -145,7 +145,7 @@ HWTEST_F(BufferSetArgTest, givenSetKernelArgOnReadOnlyBufferThatIsMisalingedWhen
     using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
 
     auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                   pKernelInfo->kernelArgInfo[0].offsetHeap));
 
     pKernelInfo->requiresSshForBuffers = true;
@@ -186,7 +186,7 @@ HWTEST_F(BufferSetArgTest, givenSetArgBufferWithNullArgStatelessThenDontProgramN
 HWTEST_F(BufferSetArgTest, givenNonPureStatefulArgWhenRenderCompressedBufferIsSetThenSetNonAuxMode) {
     using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
 
-    auto surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(), pKernelInfo->kernelArgInfo[0].offsetHeap));
+    auto surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex), pKernelInfo->kernelArgInfo[0].offsetHeap));
     auto graphicsAllocation = buffer->getGraphicsAllocation(pClDevice->getRootDeviceIndex());
     graphicsAllocation->setAllocationType(GraphicsAllocation::AllocationType::BUFFER_COMPRESSED);
     graphicsAllocation->setDefaultGmm(new Gmm(pDevice->getGmmClientContext(), graphicsAllocation->getUnderlyingBuffer(), buffer->getSize(), false));
diff --git a/opencl/test/unit_test/mem_obj/image_set_arg_tests.cpp b/opencl/test/unit_test/mem_obj/image_set_arg_tests.cpp
index 7c45dc5f38..72f980b23d 100644
--- a/opencl/test/unit_test/mem_obj/image_set_arg_tests.cpp
+++ b/opencl/test/unit_test/mem_obj/image_set_arg_tests.cpp
@@ -120,7 +120,7 @@ HWTEST_F(ImageSetArgTest, WhenSettingKernelArgImageThenSurfaceBaseAddressIsSetCo
     typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
 
     auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                   pKernelInfo->kernelArgInfo[0].offsetHeap));
 
     srcImage->setImageArg(const_cast<RENDER_SURFACE_STATE *>(surfaceState), false, 0, pClDevice->getRootDeviceIndex());
@@ -195,7 +195,7 @@ HWTEST_F(ImageSetArgTest, givenCubeMapIndexWhenSetKernelArgImageIsCalledThenModi
     src2dImage->setCubeFaceIndex(cubeFaceIndex);
 
     auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                   pKernelInfo->kernelArgInfo[0].offsetHeap));
 
     src2dImage->setImageArg(const_cast<RENDER_SURFACE_STATE *>(surfaceState), false, 0, pClDevice->getRootDeviceIndex());
@@ -298,7 +298,7 @@ HWTEST_F(ImageSetArgTest, givenNonCubeMapIndexWhenSetKernelArgImageIsCalledThenD
     typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
 
     auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                   pKernelInfo->kernelArgInfo[0].offsetHeap));
 
     EXPECT_EQ(srcImage->getCubeFaceIndex(), __GMM_NO_CUBE_MAP);
@@ -327,7 +327,7 @@ HWTEST_F(ImageSetArgTest, givenOffsetedBufferWhenSetKernelArgImageIscalledThenFu
     typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
 
     auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                   pKernelInfo->kernelArgInfo[0].offsetHeap));
 
     auto graphicsAllocation = srcAllocation;
@@ -357,7 +357,7 @@ HWTEST_F(ImageSetArgTest, WhenSettingKernelArgThenPropertiesAreSetCorrectly) {
     ASSERT_EQ(CL_SUCCESS, retVal);
 
     auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                   pKernelInfo->kernelArgInfo[0].offsetHeap));
 
     size_t rPitch = srcImage->getImageDesc().image_row_pitch;
@@ -408,7 +408,7 @@ HWTEST_F(ImageSetArgTest, givenImage2DWithMipMapsWhenSetKernelArgIsCalledThenMip
     ASSERT_EQ(CL_SUCCESS, retVal);
 
     auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                   pKernelInfo->kernelArgInfo[0].offsetHeap));
     EXPECT_EQ((uint32_t)mipLevel, surfaceState->getSurfaceMinLod());
     EXPECT_EQ((uint32_t)mipCount, surfaceState->getMipCountLod() + 1);
@@ -429,7 +429,7 @@ HWTEST_F(ImageSetArgTest, Given2dArrayWhenSettingKernelArgThenPropertiesAreSetCo
     ASSERT_EQ(CL_SUCCESS, retVal);
 
     auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                   pKernelInfo->kernelArgInfo[0].offsetHeap));
     auto surfaceAddress = surfaceState->getSurfaceBaseAddress();
 
@@ -477,7 +477,7 @@ HWTEST_F(ImageSetArgTest, Given1dArrayWhenSettingKernelArgThenPropertiesAreSetCo
     ASSERT_EQ(CL_SUCCESS, retVal);
 
     auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                   pKernelInfo->kernelArgInfo[0].offsetHeap));
     auto surfaceAddress = surfaceState->getSurfaceBaseAddress();
 
@@ -533,7 +533,7 @@ HWTEST_F(ImageSetArgTest, givenMcsAllocationWhenSetArgIsCalledWithoutUnifiedAuxC
     ASSERT_EQ(CL_SUCCESS, retVal);
 
     auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                   pKernelInfo->kernelArgInfo[0].offsetHeap));
 
     EXPECT_FALSE(Image::isDepthFormat(image->getImageFormat()));
@@ -569,7 +569,7 @@ HWTEST_F(ImageSetArgTest, givenDepthFormatWhenSetArgIsCalledThenProgramAuxFields
     ASSERT_EQ(CL_SUCCESS, retVal);
 
     auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                   pKernelInfo->kernelArgInfo[0].offsetHeap));
 
     EXPECT_TRUE(Image::isDepthFormat(image->getImageFormat()));
@@ -600,7 +600,7 @@ HWTEST_F(ImageSetArgTest, givenMultisampledR32Floatx8x24DepthStencilFormatWhenSe
     retVal = clSetKernelArg(pKernel, 0, sizeof(memObj), &memObj);
     ASSERT_EQ(CL_SUCCESS, retVal);
 
-    auto surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(),
+    auto surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                                                                            pKernelInfo->kernelArgInfo[0].offsetHeap));
 
     EXPECT_TRUE(Image::isDepthFormat(image->getImageFormat()));
@@ -627,7 +627,7 @@ HWTEST_F(ImageSetArgTest, givenMcsAllocationAndRenderCompressionWhenSetArgOnMult
     retVal = clSetKernelArg(pKernel, 0, sizeof(memObj), &memObj);
     ASSERT_EQ(CL_SUCCESS, retVal);
 
-    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(),
+    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                                                                                  pKernelInfo->kernelArgInfo[0].offsetHeap));
 
     EXPECT_TRUE(surfaceState->getMultisampledSurfaceStorageFormat() ==
@@ -657,7 +657,7 @@ HWTEST_F(ImageSetArgTest, givenDepthFormatAndRenderCompressionWhenSetArgOnMultis
     retVal = clSetKernelArg(pKernel, 0, sizeof(memObj), &memObj);
     ASSERT_EQ(CL_SUCCESS, retVal);
 
-    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(),
+    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                                                                                  pKernelInfo->kernelArgInfo[0].offsetHeap));
 
     EXPECT_TRUE(Image::isDepthFormat(image->getImageFormat()));
@@ -692,7 +692,7 @@ HWTEST_F(ImageSetArgTest, givenMcsAllocationWhenSetArgIsCalledWithUnifiedAuxCapa
     retVal = clSetKernelArg(pKernel, 0, sizeof(memObj), &memObj);
     ASSERT_EQ(CL_SUCCESS, retVal);
 
-    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(),
+    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                                                                                  pKernelInfo->kernelArgInfo[0].offsetHeap));
 
     EXPECT_TRUE(surfaceState->getAuxiliarySurfaceMode() == AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_CCS_E);
@@ -723,7 +723,7 @@ HWTEST_F(ImageSetArgTest, givenMcsAllocationWhenSetArgIsCalledWithUnifiedAuxCapa
     retVal = clSetKernelArg(pKernel, 0, sizeof(memObj), &memObj);
     ASSERT_EQ(CL_SUCCESS, retVal);
 
-    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(),
+    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                                                                                  pKernelInfo->kernelArgInfo[0].offsetHeap));
 
     EXPECT_NE(0u, surfaceState->getAuxiliarySurfaceBaseAddress());
@@ -758,7 +758,7 @@ HWTEST_F(ImageSetArgTest, givenMcsAllocationWhenSetArgIsCalledWithUnifiedAuxCapa
     retVal = clSetKernelArg(pKernel, 0, sizeof(memObj), &memObj);
     ASSERT_EQ(CL_SUCCESS, retVal);
 
-    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(),
+    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                                                                                  pKernelInfo->kernelArgInfo[0].offsetHeap));
 
     EXPECT_EQ(pitchValue, surfaceState->getAuxiliarySurfacePitch());
@@ -793,7 +793,7 @@ HWTEST_F(ImageSetArgTest, GivenImageFrom1dBufferWhenSettingKernelArgThenProperti
     ASSERT_EQ(CL_SUCCESS, retVal);
 
     auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                   pKernelInfo->kernelArgInfo[0].offsetHeap));
     auto surfaceAddress = surfaceState->getSurfaceBaseAddress();
     auto image = castToObject<Image>(imageFromBuffer);
@@ -837,7 +837,7 @@ HWTEST_F(ImageSetArgTest, GivenImageWithClLuminanceFormatWhenSettingKernelArgThe
     ASSERT_EQ(CL_SUCCESS, retVal);
 
     auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                   pKernelInfo->kernelArgInfo[0].offsetHeap));
     //for CL_LUMINANCE format we override channels to RED to be spec complaint.
     EXPECT_EQ(RENDER_SURFACE_STATE::SHADER_CHANNEL_SELECT_RED, surfaceState->getShaderChannelSelectRed());
@@ -963,7 +963,7 @@ HWTEST_F(ImageMediaBlockSetArgTest, WhenSettingKernelArgImageThenPropertiesAreCo
     ASSERT_EQ(CL_SUCCESS, retVal);
 
     auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                   pKernelInfo->kernelArgInfo[0].offsetHeap));
 
     size_t rPitch = srcImage->getImageDesc().image_row_pitch;
diff --git a/opencl/test/unit_test/memory_manager/memory_manager_tests.cpp b/opencl/test/unit_test/memory_manager/memory_manager_tests.cpp
index 3c3b108d93..7cc8a9755b 100644
--- a/opencl/test/unit_test/memory_manager/memory_manager_tests.cpp
+++ b/opencl/test/unit_test/memory_manager/memory_manager_tests.cpp
@@ -544,7 +544,7 @@ TEST_F(MemoryAllocatorTest, givenStatelessKernelWithPrintfWhenPrintfSurfaceIsCre
 
     EXPECT_EQ(allocationAddress, *(uintptr_t *)printfPatchAddress);
 
-    EXPECT_EQ(0u, kernel.mockKernel->getSurfaceStateHeapSize());
+    EXPECT_EQ(0u, kernel.mockKernel->getSurfaceStateHeapSize(rootDeviceIndex));
 
     delete printfHandler;
 }
@@ -575,11 +575,11 @@ HWTEST_F(MemoryAllocatorTest, givenStatefulKernelWithPrintfWhenPrintfSurfaceIsCr
     auto printfAllocation = printfHandler->getSurface();
     auto allocationAddress = printfAllocation->getGpuAddress();
 
-    EXPECT_NE(0u, kernel.mockKernel->getSurfaceStateHeapSize());
+    EXPECT_NE(0u, kernel.mockKernel->getSurfaceStateHeapSize(device->getRootDeviceIndex()));
 
     typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
     auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(kernel.mockKernel->getSurfaceStateHeap(),
+        ptrOffset(kernel.mockKernel->getSurfaceStateHeap(device->getRootDeviceIndex()),
                   kernel.mockKernel->getKernelInfo().patchInfo.pAllocateStatelessPrintfSurface->SurfaceStateHeapOffset));
     auto surfaceAddress = surfaceState->getSurfaceBaseAddress();
 
diff --git a/opencl/test/unit_test/mocks/mock_kernel.h b/opencl/test/unit_test/mocks/mock_kernel.h
index 34d94e2e57..d21e4dbef8 100644
--- a/opencl/test/unit_test/mocks/mock_kernel.h
+++ b/opencl/test/unit_test/mocks/mock_kernel.h
@@ -43,7 +43,6 @@ class MockKernel : public Kernel {
     using Kernel::numberOfBindingTableStates;
     using Kernel::patchBufferOffset;
     using Kernel::patchWithImplicitSurface;
-    using Kernel::sshLocalSize;
     using Kernel::svmAllocationsRequireCacheFlush;
     using Kernel::threadArbitrationPolicy;
     using Kernel::unifiedMemoryControls;
@@ -181,15 +180,15 @@ class MockKernel : public Kernel {
         kernelDeviceInfos[rootDeviceIndex].crossThreadDataSize = static_cast<uint32_t>(mockCrossThreadData.size());
     }
 
-    void setSshLocal(const void *sshPattern, uint32_t newSshSize) {
-        sshLocalSize = newSshSize;
+    void setSshLocal(const void *sshPattern, uint32_t newSshSize, uint32_t rootDeviceIndex) {
+        kernelDeviceInfos[rootDeviceIndex].sshLocalSize = newSshSize;
 
         if (newSshSize == 0) {
-            pSshLocal.reset(nullptr);
+            kernelDeviceInfos[rootDeviceIndex].pSshLocal.reset(nullptr);
         } else {
-            pSshLocal = std::make_unique<char[]>(newSshSize);
+            kernelDeviceInfos[rootDeviceIndex].pSshLocal = std::make_unique<char[]>(newSshSize);
             if (sshPattern) {
-                memcpy_s(pSshLocal.get(), newSshSize, sshPattern, newSshSize);
+                memcpy_s(kernelDeviceInfos[rootDeviceIndex].pSshLocal.get(), newSshSize, sshPattern, newSshSize);
             }
         }
     }
@@ -291,7 +290,7 @@ class MockKernelWithInternals {
         mockProgram = new MockProgram(context, false, deviceVector);
         mockKernel = new MockKernel(mockProgram, kernelInfo);
         mockKernel->setCrossThreadData(&crossThreadData, sizeof(crossThreadData));
-        mockKernel->setSshLocal(&sshLocal, sizeof(sshLocal));
+        mockKernel->setSshLocal(&sshLocal, sizeof(sshLocal), deviceArg.getRootDeviceIndex());
 
         if (addDefaultArg) {
             defaultKernelArguments.resize(2);
@@ -358,10 +357,9 @@ class MockKernelWithInternals {
 class MockParentKernel : public Kernel {
   public:
     using Kernel::auxTranslationRequired;
+    using Kernel::kernelDeviceInfos;
     using Kernel::kernelInfo;
     using Kernel::patchBlocksCurbeWithConstantValues;
-    using Kernel::pSshLocal;
-    using Kernel::sshLocalSize;
 
     static MockParentKernel *create(Context &context, bool addChildSimdSize = false, bool addChildGlobalMemory = false, bool addChildConstantMemory = false, bool addPrintfForParent = true, bool addPrintfForBlock = true) {
         auto clDevice = context.getDevice(0);