Store SSH per root device in Kernel

Related-To: NEO-5001 Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
2020-11-23 18:01:38 +00:00 · 2020-11-23 18:01:38 +00:00 · 7ec69c33f9
parent 52d96af5f0
commit 7ec69c33f9
40 changed files with 231 additions and 220 deletions
--- a/opencl/source/command_queue/command_queue.cpp
+++ b/opencl/source/command_queue/command_queue.cpp
@ -530,7 +530,7 @@ bool CommandQueue::setupDebugSurface(Kernel *kernel) {

    DEBUG_BREAK_IF(!kernel->requiresSshForBuffers());

-    auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(kernel->getSurfaceStateHeap()),
+    auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(kernel->getSurfaceStateHeap(device->getRootDeviceIndex())),
                                  kernel->getKernelInfo().patchInfo.pAllocateSystemThreadSurface->Offset);
    void *addressToPatch = reinterpret_cast<void *>(debugSurface->getGpuAddress());
    size_t sizeToPatch = debugSurface->getUnderlyingBufferSize();
--- a/opencl/source/command_queue/enqueue_common.h
+++ b/opencl/source/command_queue/enqueue_common.h
@ -330,7 +330,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,

    if (blockQueue) {
        if (parentKernel) {
-            size_t minSizeSSHForEM = HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(*parentKernel);
+            size_t minSizeSSHForEM = HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(*parentKernel, device->getRootDeviceIndex());
            blockedCommandsData->surfaceStateHeapSizeEM = minSizeSSHForEM;
        }

@ -534,7 +534,8 @@ void CommandQueueHw<GfxFamily>::processDeviceEnqueue(DeviceQueueHw<GfxFamily> *d
                                                     TagNode<HwTimeStamps> *hwTimeStamps,
                                                     bool &blocking) {
    auto parentKernel = multiDispatchInfo.peekParentKernel();
-    size_t minSizeSSHForEM = HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(*parentKernel);
+    auto rootDeviceIndex = devQueueHw->getDevice().getRootDeviceIndex();
+    size_t minSizeSSHForEM = HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(*parentKernel, rootDeviceIndex);
    bool isCcsUsed = EngineHelpers::isCcs(gpgpuEngine->osContext->getEngineType());

    uint32_t taskCount = getGpgpuCommandStreamReceiver().peekTaskCount() + 1;
--- a/opencl/source/command_queue/gpgpu_walker.h
+++ b/opencl/source/command_queue/gpgpu_walker.h
@ -200,7 +200,7 @@ IndirectHeap &getIndirectHeap(CommandQueue &commandQueue, const MultiDispatchInf

    if (Kernel *parentKernel = multiDispatchInfo.peekParentKernel()) {
        if (heapType == IndirectHeap::SURFACE_STATE) {
-            expectedSize += HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(*parentKernel);
+            expectedSize += HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(*parentKernel, commandQueue.getDevice().getRootDeviceIndex());
        } else //if (heapType == IndirectHeap::DYNAMIC_STATE || heapType == IndirectHeap::INDIRECT_OBJECT)
        {
            DeviceQueueHw<GfxFamily> *pDevQueue = castToObject<DeviceQueueHw<GfxFamily>>(commandQueue.getContext().getDefaultDeviceQueue());
--- a/opencl/source/command_queue/hardware_interface_base.inl
+++ b/opencl/source/command_queue/hardware_interface_base.inl
@ -248,6 +248,7 @@ template <typename GfxFamily>
 void HardwareInterface<GfxFamily>::obtainIndirectHeaps(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo,
                                                       bool blockedQueue, IndirectHeap *&dsh, IndirectHeap *&ioh, IndirectHeap *&ssh) {
    auto parentKernel = multiDispatchInfo.peekParentKernel();
+    auto rootDeviceIndex = commandQueue.getDevice().getRootDeviceIndex();

    if (blockedQueue) {
        size_t dshSize = 0;
@ -257,7 +258,7 @@ void HardwareInterface<GfxFamily>::obtainIndirectHeaps(CommandQueue &commandQueu

        if (parentKernel) {
            dshSize = commandQueue.getContext().getDefaultDeviceQueue()->getDshBuffer()->getUnderlyingBufferSize();
-            sshSize += HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(*parentKernel);
+            sshSize += HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(*parentKernel, rootDeviceIndex);
            iohEqualsDsh = true;
            colorCalcSize = static_cast<size_t>(commandQueue.getContext().getDefaultDeviceQueue()->colorCalcStateSize);
        } else {
--- a/opencl/source/gtpin/gtpin_callbacks.cpp
+++ b/opencl/source/gtpin/gtpin_callbacks.cpp
@ -66,9 +66,10 @@ void gtpinNotifyKernelCreate(cl_kernel kernel) {
        size_t gtpinBTI = pKernel->getNumberOfBindingTableStates();
        // Enlarge local copy of SSH by 1 SS
        auto device = pKernel->getDevices()[0];
+        auto rootDeviceIndex = device->getRootDeviceIndex();
        GFXCORE_FAMILY genFamily = device->getHardwareInfo().platform.eRenderCoreFamily;
        GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(genFamily);
-        if (pKernel->isParentKernel || !gtpinHelper.addSurfaceState(pKernel)) {
+        if (pKernel->isParentKernel || !gtpinHelper.addSurfaceState(pKernel, rootDeviceIndex)) {
            // Kernel with no SSH or Kernel EM, not supported
            return;
        }
@ -103,8 +104,10 @@ void gtpinNotifyKernelCreate(cl_kernel kernel) {

 void gtpinNotifyKernelSubmit(cl_kernel kernel, void *pCmdQueue) {
    if (isGTPinInitialized) {
+        auto pCmdQ = reinterpret_cast<CommandQueue *>(pCmdQueue);
+        auto &device = pCmdQ->getDevice();
        auto pKernel = castToObjectOrAbort<Kernel>(kernel);
-        if (pKernel->isParentKernel || pKernel->getSurfaceStateHeapSize() == 0) {
+        if (pKernel->isParentKernel || pKernel->getSurfaceStateHeapSize(device.getRootDeviceIndex()) == 0) {
            // Kernel with no SSH, not supported
            return;
        }
@ -132,14 +135,13 @@ void gtpinNotifyKernelSubmit(cl_kernel kernel, void *pCmdQueue) {
        if (!resource) {
            return;
        }
-        auto &device = *pKernel->getDevices()[0];
        GFXCORE_FAMILY genFamily = device.getHardwareInfo().platform.eRenderCoreFamily;
        GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(genFamily);
        size_t gtpinBTI = pKernel->getNumberOfBindingTableStates() - 1;
-        void *pSurfaceState = gtpinHelper.getSurfaceState(pKernel, gtpinBTI);
+        void *pSurfaceState = gtpinHelper.getSurfaceState(pKernel, gtpinBTI, device.getRootDeviceIndex());
        cl_mem buffer = (cl_mem)resource;
        auto pBuffer = castToObjectOrAbort<Buffer>(buffer);
-        pBuffer->setArgStateful(pSurfaceState, false, false, false, false, device.getDevice());
+        pBuffer->setArgStateful(pSurfaceState, false, false, false, false, device);
    }
 }

--- a/opencl/source/gtpin/gtpin_hw_helper.h
+++ b/opencl/source/gtpin/gtpin_hw_helper.h
@ -15,8 +15,8 @@ class GTPinHwHelper {
  public:
    static GTPinHwHelper &get(GFXCORE_FAMILY gfxCore);
    virtual uint32_t getGenVersion() = 0;
-    virtual bool addSurfaceState(Kernel *pKernel) = 0;
-    virtual void *getSurfaceState(Kernel *pKernel, size_t bti) = 0;
+    virtual bool addSurfaceState(Kernel *pKernel, uint32_t rootDeviceIndex) = 0;
+    virtual void *getSurfaceState(Kernel *pKernel, size_t bti, uint32_t rootDeviceIndex) = 0;

  protected:
    GTPinHwHelper(){};
@ -30,8 +30,8 @@ class GTPinHwHelperHw : public GTPinHwHelper {
        return gtpinHwHelper;
    }
    uint32_t getGenVersion() override;
-    bool addSurfaceState(Kernel *pKernel) override;
-    void *getSurfaceState(Kernel *pKernel, size_t bti) override;
+    bool addSurfaceState(Kernel *pKernel, uint32_t rootDeviceIndex) override;
+    void *getSurfaceState(Kernel *pKernel, size_t bti, uint32_t rootDeviceIndex) override;

  private:
    GTPinHwHelperHw(){};
--- a/opencl/source/gtpin/gtpin_hw_helper.inl
+++ b/opencl/source/gtpin/gtpin_hw_helper.inl
@ -15,11 +15,11 @@
 namespace NEO {

 template <typename GfxFamily>
-bool GTPinHwHelperHw<GfxFamily>::addSurfaceState(Kernel *pKernel) {
+bool GTPinHwHelperHw<GfxFamily>::addSurfaceState(Kernel *pKernel, uint32_t rootDeviceIndex) {
    using RENDER_SURFACE_STATE = typename GfxFamily::RENDER_SURFACE_STATE;
    using BINDING_TABLE_STATE = typename GfxFamily::BINDING_TABLE_STATE;

-    size_t sshSize = pKernel->getSurfaceStateHeapSize();
+    size_t sshSize = pKernel->getSurfaceStateHeapSize(rootDeviceIndex);
    if ((sshSize == 0) || pKernel->isParentKernel) {
        // Kernels which do not use SSH or use Execution Model are not supported (yet)
        return false;
@ -29,7 +29,7 @@ bool GTPinHwHelperHw<GfxFamily>::addSurfaceState(Kernel *pKernel) {
    size_t sizeToEnlarge = ssSize + btsSize;
    size_t currBTOffset = pKernel->getBindingTableOffset();
    size_t currSurfaceStateSize = currBTOffset;
-    char *pSsh = static_cast<char *>(pKernel->getSurfaceStateHeap());
+    char *pSsh = static_cast<char *>(pKernel->getSurfaceStateHeap(rootDeviceIndex));
    char *pNewSsh = new char[sshSize + sizeToEnlarge];
    memcpy_s(pNewSsh, sshSize + sizeToEnlarge, pSsh, currSurfaceStateSize);
    RENDER_SURFACE_STATE *pSS = reinterpret_cast<RENDER_SURFACE_STATE *>(pNewSsh + currSurfaceStateSize);
@ -40,19 +40,19 @@ bool GTPinHwHelperHw<GfxFamily>::addSurfaceState(Kernel *pKernel) {
    BINDING_TABLE_STATE *pNewBTS = reinterpret_cast<BINDING_TABLE_STATE *>(pNewSsh + newSurfaceStateSize + currBTCount * btsSize);
    *pNewBTS = GfxFamily::cmdInitBindingTableState;
    pNewBTS->setSurfaceStatePointer((uint64_t)currBTOffset);
-    pKernel->resizeSurfaceStateHeap(pNewSsh, sshSize + sizeToEnlarge, currBTCount + 1, newSurfaceStateSize);
+    pKernel->resizeSurfaceStateHeap(rootDeviceIndex, pNewSsh, sshSize + sizeToEnlarge, currBTCount + 1, newSurfaceStateSize);
    return true;
 }

 template <typename GfxFamily>
-void *GTPinHwHelperHw<GfxFamily>::getSurfaceState(Kernel *pKernel, size_t bti) {
+void *GTPinHwHelperHw<GfxFamily>::getSurfaceState(Kernel *pKernel, size_t bti, uint32_t rootDeviceIndex) {
    using BINDING_TABLE_STATE = typename GfxFamily::BINDING_TABLE_STATE;

-    if ((nullptr == pKernel->getSurfaceStateHeap()) || (bti >= pKernel->getNumberOfBindingTableStates())) {
+    if ((nullptr == pKernel->getSurfaceStateHeap(rootDeviceIndex)) || (bti >= pKernel->getNumberOfBindingTableStates())) {
        return nullptr;
    }
-    auto *pBts = reinterpret_cast<BINDING_TABLE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(), (pKernel->getBindingTableOffset() + bti * sizeof(BINDING_TABLE_STATE))));
-    auto pSurfaceState = ptrOffset(pKernel->getSurfaceStateHeap(), pBts->getSurfaceStatePointer());
+    auto *pBts = reinterpret_cast<BINDING_TABLE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex), (pKernel->getBindingTableOffset() + bti * sizeof(BINDING_TABLE_STATE))));
+    auto pSurfaceState = ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex), pBts->getSurfaceStatePointer());
    return pSurfaceState;
 }

--- a/opencl/source/helpers/hardware_commands_helper.h
+++ b/opencl/source/helpers/hardware_commands_helper.h
@ -126,7 +126,7 @@ struct HardwareCommandsHelper : public PerThreadDataHelper {
        const Kernel &kernel,
        size_t localWorkSize = 256);
    static size_t getSizeRequiredSSH(
-        const Kernel &kernel);
+        const Kernel &kernel, uint32_t rootDeviceIndex);

    static size_t getTotalSizeRequiredDSH(
        const MultiDispatchInfo &multiDispatchInfo);
@ -135,7 +135,7 @@ struct HardwareCommandsHelper : public PerThreadDataHelper {
    static size_t getTotalSizeRequiredSSH(
        const MultiDispatchInfo &multiDispatchInfo);

-    static size_t getSshSizeForExecutionModel(const Kernel &kernel);
+    static size_t getSshSizeForExecutionModel(const Kernel &kernel, uint32_t rootDeviceIndex);
    static void setInterfaceDescriptorOffset(
        WALKER_TYPE<GfxFamily> *walkerCmd,
        uint32_t &interfaceDescriptorIndex);
--- a/opencl/source/helpers/hardware_commands_helper_base.inl
+++ b/opencl/source/helpers/hardware_commands_helper_base.inl
@ -75,9 +75,9 @@ size_t HardwareCommandsHelper<GfxFamily>::getSizeRequiredIOH(

 template <typename GfxFamily>
 size_t HardwareCommandsHelper<GfxFamily>::getSizeRequiredSSH(
-    const Kernel &kernel) {
+    const Kernel &kernel, uint32_t rootDeviceIndex) {
    typedef typename GfxFamily::BINDING_TABLE_STATE BINDING_TABLE_STATE;
-    auto sizeSSH = kernel.getSurfaceStateHeapSize();
+    auto sizeSSH = kernel.getSurfaceStateHeapSize(rootDeviceIndex);
    sizeSSH += sizeSSH ? BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE : 0;
    return sizeSSH;
 }
@ -112,11 +112,11 @@ size_t HardwareCommandsHelper<GfxFamily>::getTotalSizeRequiredIOH(
 template <typename GfxFamily>
 size_t HardwareCommandsHelper<GfxFamily>::getTotalSizeRequiredSSH(
    const MultiDispatchInfo &multiDispatchInfo) {
-    return getSizeRequired(multiDispatchInfo, [](const DispatchInfo &dispatchInfo) { return getSizeRequiredSSH(*dispatchInfo.getKernel()); });
+    return getSizeRequired(multiDispatchInfo, [](const DispatchInfo &dispatchInfo) { return getSizeRequiredSSH(*dispatchInfo.getKernel(), dispatchInfo.getClDevice().getRootDeviceIndex()); });
 }

 template <typename GfxFamily>
-size_t HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(const Kernel &kernel) {
+size_t HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(const Kernel &kernel, uint32_t rootDeviceIndex) {
    typedef typename GfxFamily::BINDING_TABLE_STATE BINDING_TABLE_STATE;

    size_t totalSize = 0;
@ -136,7 +136,7 @@ size_t HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(const Kern

    SchedulerKernel &scheduler = kernel.getContext().getSchedulerKernel();

-    totalSize += getSizeRequiredSSH(scheduler);
+    totalSize += getSizeRequiredSSH(scheduler, rootDeviceIndex);

    totalSize += maxBindingTableCount * sizeof(BINDING_TABLE_STATE) * DeviceQueue::interfaceDescriptorEntries;
    totalSize = alignUp(totalSize, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
@ -237,7 +237,7 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
    kernel.patchBindlessSurfaceStateOffsets(device, ssh.getUsed());

    auto dstBindingTablePointer = EncodeSurfaceState<GfxFamily>::pushBindingTableAndSurfaceStates(ssh, (kernelInfo.patchInfo.bindingTableState != nullptr) ? kernelInfo.patchInfo.bindingTableState->Count : 0,
-                                                                                                  kernel.getSurfaceStateHeap(), kernel.getSurfaceStateHeapSize(),
+                                                                                                  kernel.getSurfaceStateHeap(rootDeviceIndex), kernel.getSurfaceStateHeapSize(rootDeviceIndex),
                                                                                                  kernel.getNumberOfBindingTableStates(), kernel.getBindingTableOffset());

    // Copy our sampler state if it exists
--- a/opencl/source/kernel/kernel.cpp
+++ b/opencl/source/kernel/kernel.cpp
@ -140,7 +140,7 @@ void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, Graphic
    uint32_t sshOffset = patch.SurfaceStateHeapOffset;
    auto rootDeviceIndex = allocation.getRootDeviceIndex();
    void *crossThreadData = getCrossThreadData(rootDeviceIndex);
-    void *ssh = getSurfaceStateHeap();
+    void *ssh = getSurfaceStateHeap(rootDeviceIndex);
    if (crossThreadData != nullptr) {
        auto pp = ptrOffset(crossThreadData, crossThreadDataOffset);
        uintptr_t addressToPatch = reinterpret_cast<uintptr_t>(ptrToPatchInCrossThreadData);
@ -235,13 +235,14 @@ cl_int Kernel::initialize() {
        }

        // allocate our own SSH, if necessary
-        sshLocalSize = heapInfo.SurfaceStateHeapSize;
+        kernelDeviceInfos[rootDeviceIndex].sshLocalSize = heapInfo.SurfaceStateHeapSize;

-        if (sshLocalSize) {
-            pSshLocal = std::make_unique<char[]>(sshLocalSize);
+        if (kernelDeviceInfos[rootDeviceIndex].sshLocalSize) {
+            kernelDeviceInfos[rootDeviceIndex].pSshLocal = std::make_unique<char[]>(kernelDeviceInfos[rootDeviceIndex].sshLocalSize);

            // copy the ssh into our local copy
-            memcpy_s(pSshLocal.get(), sshLocalSize, heapInfo.pSsh, sshLocalSize);
+            memcpy_s(kernelDeviceInfos[rootDeviceIndex].pSshLocal.get(), kernelDeviceInfos[rootDeviceIndex].sshLocalSize,
+                     heapInfo.pSsh, kernelDeviceInfos[rootDeviceIndex].sshLocalSize);
        }
        numberOfBindingTableStates = (patchInfo.bindingTableState != nullptr) ? patchInfo.bindingTableState->Count : 0;
        localBindingTableOffset = (patchInfo.bindingTableState != nullptr) ? patchInfo.bindingTableState->Offset : 0;
@ -287,7 +288,7 @@ cl_int Kernel::initialize() {

        if (patchInfo.pAllocateStatelessEventPoolSurface) {
            if (requiresSshForBuffers()) {
-                auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap()),
+                auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap(rootDeviceIndex)),
                                              patchInfo.pAllocateStatelessEventPoolSurface->SurfaceStateHeapOffset);
                Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, 0, nullptr, 0, nullptr, 0, 0);
            }
@ -296,7 +297,7 @@ cl_int Kernel::initialize() {
        if (patchInfo.pAllocateStatelessDefaultDeviceQueueSurface) {

            if (requiresSshForBuffers()) {
-                auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap()),
+                auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap(rootDeviceIndex)),
                                              patchInfo.pAllocateStatelessDefaultDeviceQueueSurface->SurfaceStateHeapOffset);
                Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, 0, nullptr, 0, nullptr, 0, 0);
            }
@ -783,8 +784,8 @@ void Kernel::setStartOffset(uint32_t offset) {
    this->startOffset = offset;
 }

-void *Kernel::getSurfaceStateHeap() const {
-    return kernelInfo.usesSsh ? pSshLocal.get() : nullptr;
+void *Kernel::getSurfaceStateHeap(uint32_t rootDeviceIndex) const {
+    return kernelInfo.usesSsh ? kernelDeviceInfos[rootDeviceIndex].pSshLocal.get() : nullptr;
 }

 size_t Kernel::getDynamicStateHeapSize() const {
@ -795,9 +796,9 @@ const void *Kernel::getDynamicStateHeap() const {
    return kernelInfo.heapInfo.pDsh;
 }

-size_t Kernel::getSurfaceStateHeapSize() const {
+size_t Kernel::getSurfaceStateHeapSize(uint32_t rootDeviceIndex) const {
    return kernelInfo.usesSsh
-               ? sshLocalSize
+               ? kernelDeviceInfos[rootDeviceIndex].sshLocalSize
               : 0;
 }

@ -805,9 +806,9 @@ size_t Kernel::getNumberOfBindingTableStates() const {
    return numberOfBindingTableStates;
 }

-void Kernel::resizeSurfaceStateHeap(void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset) {
-    pSshLocal.reset(static_cast<char *>(pNewSsh));
-    sshLocalSize = static_cast<uint32_t>(newSshSize);
+void Kernel::resizeSurfaceStateHeap(uint32_t rootDeviceIndex, void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset) {
+    kernelDeviceInfos[rootDeviceIndex].pSshLocal.reset(static_cast<char *>(pNewSsh));
+    kernelDeviceInfos[rootDeviceIndex].sshLocalSize = static_cast<uint32_t>(newSshSize);
    numberOfBindingTableStates = newBindingTableCount;
    localBindingTableOffset = newBindingTableOffset;
 }
@ -882,7 +883,7 @@ cl_int Kernel::setArgSvm(uint32_t argIndex, size_t svmAllocSize, void *svmPtr, G

    if (requiresSshForBuffers()) {
        const auto &kernelArgInfo = kernelInfo.kernelArgInfo[argIndex];
-        auto surfaceState = ptrOffset(getSurfaceStateHeap(), kernelArgInfo.offsetHeap);
+        auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap);
        Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, svmAllocSize + ptrDiff(svmPtr, ptrToPatch), ptrToPatch, 0, svmAlloc, svmFlags, 0);
    }
    if (!kernelArguments[argIndex].isPatched) {
@ -913,7 +914,7 @@ cl_int Kernel::setArgSvmAlloc(uint32_t argIndex, void *svmPtr, GraphicsAllocatio

    if (requiresSshForBuffers()) {
        const auto &kernelArgInfo = kernelInfo.kernelArgInfo[argIndex];
-        auto surfaceState = ptrOffset(getSurfaceStateHeap(), kernelArgInfo.offsetHeap);
+        auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap);
        size_t allocSize = 0;
        size_t offset = 0;
        if (svmAlloc != nullptr) {
@ -1317,7 +1318,7 @@ cl_int Kernel::setArgBuffer(uint32_t argIndex,
        }

        if (requiresSshForBuffers()) {
-            auto surfaceState = ptrOffset(getSurfaceStateHeap(), kernelArgInfo.offsetHeap);
+            auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap);
            buffer->setArgStateful(surfaceState, forceNonAuxMode, disableL3, isAuxTranslationKernel, kernelArgInfo.isReadOnly, getDevice().getDevice());
        }

@ -1342,7 +1343,7 @@ cl_int Kernel::setArgBuffer(uint32_t argIndex,
        storeKernelArg(argIndex, BUFFER_OBJ, nullptr, argVal, argSize);

        if (requiresSshForBuffers()) {
-            auto surfaceState = ptrOffset(getSurfaceStateHeap(), kernelArgInfo.offsetHeap);
+            auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap);
            Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, 0, nullptr, 0, nullptr, 0, 0);
        }

@ -1391,7 +1392,7 @@ cl_int Kernel::setArgPipe(uint32_t argIndex,
        auto graphicsAllocation = pipe->getGraphicsAllocation(getDevice().getRootDeviceIndex());

        if (requiresSshForBuffers()) {
-            auto surfaceState = ptrOffset(getSurfaceStateHeap(), kernelArgInfo.offsetHeap);
+            auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap);
            Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState,
                                    pipe->getSize(), pipe->getCpuAddress(), 0,
                                    graphicsAllocation, 0, 0);
@ -1429,7 +1430,7 @@ cl_int Kernel::setArgImageWithMipLevel(uint32_t argIndex,

        storeKernelArg(argIndex, IMAGE_OBJ, clMemObj, argVal, argSize);

-        auto surfaceState = ptrOffset(getSurfaceStateHeap(), kernelArgInfo.offsetHeap);
+        auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap);
        DEBUG_BREAK_IF(!kernelArgInfo.isImage);

        // Sets SS structure
@ -2250,7 +2251,7 @@ void Kernel::patchDefaultDeviceQueue(DeviceQueue *devQueue) {
                                  static_cast<uintptr_t>(devQueue->getQueueBuffer()->getGpuAddressToPatch()));
        }
        if (requiresSshForBuffers()) {
-            auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap()),
+            auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap(rootDeviceIndex)),
                                          patchInfo.pAllocateStatelessDefaultDeviceQueueSurface->SurfaceStateHeapOffset);
            Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, devQueue->getQueueBuffer()->getUnderlyingBufferSize(),
                                    (void *)devQueue->getQueueBuffer()->getGpuAddress(), 0, devQueue->getQueueBuffer(), 0, 0);
@ -2272,7 +2273,7 @@ void Kernel::patchEventPool(DeviceQueue *devQueue) {
        }

        if (requiresSshForBuffers()) {
-            auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap()),
+            auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap(rootDeviceIndex)),
                                          patchInfo.pAllocateStatelessEventPoolSurface->SurfaceStateHeapOffset);
            Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, devQueue->getEventPoolBuffer()->getUnderlyingBufferSize(),
                                    (void *)devQueue->getEventPoolBuffer()->getGpuAddress(), 0, devQueue->getEventPoolBuffer(), 0, 0);
@ -2298,13 +2299,14 @@ bool Kernel::usesSyncBuffer() {
 }

 void Kernel::patchSyncBuffer(Device &device, GraphicsAllocation *gfxAllocation, size_t bufferOffset) {
+    auto rootDeviceIndex = device.getRootDeviceIndex();
    auto &patchInfo = kernelInfo.patchInfo;
-    auto bufferPatchAddress = ptrOffset(getCrossThreadData(device.getRootDeviceIndex()), patchInfo.pAllocateSyncBuffer->DataParamOffset);
+    auto bufferPatchAddress = ptrOffset(getCrossThreadData(rootDeviceIndex), patchInfo.pAllocateSyncBuffer->DataParamOffset);
    patchWithRequiredSize(bufferPatchAddress, patchInfo.pAllocateSyncBuffer->DataParamSize,
                          ptrOffset(gfxAllocation->getGpuAddressToPatch(), bufferOffset));

    if (requiresSshForBuffers()) {
-        auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap()),
+        auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap(rootDeviceIndex)),
                                      patchInfo.pAllocateSyncBuffer->SurfaceStateHeapOffset);
        auto addressToPatch = gfxAllocation->getUnderlyingBuffer();
        auto sizeToPatch = gfxAllocation->getUnderlyingBufferSize();
@ -2353,10 +2355,11 @@ void Kernel::resolveArgs() {
            }
        }
    }
+    auto rootDeviceIndex = getDevice().getRootDeviceIndex();
    if (canTransformImageTo2dArray) {
-        imageTransformer->transformImagesTo2dArray(kernelInfo, kernelArguments, getSurfaceStateHeap());
+        imageTransformer->transformImagesTo2dArray(kernelInfo, kernelArguments, getSurfaceStateHeap(rootDeviceIndex));
    } else if (imageTransformer->didTransform()) {
-        imageTransformer->transformImagesTo3d(kernelInfo, kernelArguments, getSurfaceStateHeap());
+        imageTransformer->transformImagesTo3d(kernelInfo, kernelArguments, getSurfaceStateHeap(rootDeviceIndex));
    }
 }

--- a/opencl/source/kernel/kernel.h
+++ b/opencl/source/kernel/kernel.h
@ -153,18 +153,18 @@ class Kernel : public BaseObject<_cl_kernel> {
                           size_t *paramValueSizeRet) const;

    const void *getKernelHeap() const;
-    void *getSurfaceStateHeap() const;
+    void *getSurfaceStateHeap(uint32_t rootDeviceIndex) const;
    const void *getDynamicStateHeap() const;

    size_t getKernelHeapSize() const;
-    size_t getSurfaceStateHeapSize() const;
+    size_t getSurfaceStateHeapSize(uint32_t rootDeviceIndex) const;
    size_t getDynamicStateHeapSize() const;
    size_t getNumberOfBindingTableStates() const;
    size_t getBindingTableOffset() const {
        return localBindingTableOffset;
    }

-    void resizeSurfaceStateHeap(void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset);
+    void resizeSurfaceStateHeap(uint32_t rootDeviceIndex, void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset);

    void substituteKernelHeap(void *newKernelHeap, size_t newKernelHeapSize);
    bool isKernelHeapSubstituted() const;
@ -524,8 +524,6 @@ class Kernel : public BaseObject<_cl_kernel> {

    size_t numberOfBindingTableStates = 0u;
    size_t localBindingTableOffset = 0u;
-    std::unique_ptr<char[]> pSshLocal;
-    uint32_t sshLocalSize = 0u;

    GraphicsAllocation *kernelReflectionSurface = nullptr;

@ -550,13 +548,15 @@ class Kernel : public BaseObject<_cl_kernel> {
    bool debugEnabled = false;
    uint32_t additionalKernelExecInfo = AdditionalKernelExecInfo::NotSet;

-    struct KernelDeviceInfo {
+    struct KernelDeviceInfo : public NonCopyableClass {
+        std::unique_ptr<char[]> pSshLocal;
+        uint32_t sshLocalSize = 0u;
        char *crossThreadData = nullptr;
        uint32_t crossThreadDataSize = 0u;

        GraphicsAllocation *privateSurface = nullptr;
        uint64_t privateSurfaceSize = 0u;
    };
-    StackVec<KernelDeviceInfo, 1> kernelDeviceInfos;
+    std::vector<KernelDeviceInfo> kernelDeviceInfos;
 };
 } // namespace NEO
--- a/opencl/source/program/printf_handler.cpp
+++ b/opencl/source/program/printf_handler.cpp
@ -62,7 +62,7 @@ void PrintfHandler::prepareDispatch(const MultiDispatchInfo &multiDispatchInfo)

    patchWithRequiredSize(printfPatchAddress, kernel->getKernelInfo().patchInfo.pAllocateStatelessPrintfSurface->DataParamSize, (uintptr_t)printfSurface->getGpuAddressToPatch());
    if (kernel->requiresSshForBuffers()) {
-        auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(kernel->getSurfaceStateHeap()),
+        auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(kernel->getSurfaceStateHeap(rootDeviceIndex)),
                                      kernel->getKernelInfo().patchInfo.pAllocateStatelessPrintfSurface->SurfaceStateHeapOffset);
        void *addressToPatch = printfSurface->getUnderlyingBuffer();
        size_t sizeToPatch = printfSurface->getUnderlyingBufferSize();
--- a/opencl/test/unit_test/accelerators/media_image_arg_tests.cpp
+++ b/opencl/test/unit_test/accelerators/media_image_arg_tests.cpp
@ -81,7 +81,7 @@ HWTEST_F(MediaImageSetArgTest, WhenSettingMediaImageArgThenArgsSetCorrectly) {
    typedef typename FamilyType::MEDIA_SURFACE_STATE MEDIA_SURFACE_STATE;

    auto pSurfaceState = reinterpret_cast<const MEDIA_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                  pKernelInfo->kernelArgInfo[0].offsetHeap));

    srcImage->setMediaImageArg(const_cast<MEDIA_SURFACE_STATE *>(pSurfaceState), pClDevice->getRootDeviceIndex());
@ -109,7 +109,7 @@ HWTEST_F(MediaImageSetArgTest, WhenSettingKernelArgImageThenArgsSetCorrectly) {
    ASSERT_EQ(CL_SUCCESS, retVal);

    auto pSurfaceState = reinterpret_cast<const MEDIA_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                  pKernelInfo->kernelArgInfo[0].offsetHeap));

    uint64_t surfaceAddress = pSurfaceState->getSurfaceBaseAddress();
--- a/opencl/test/unit_test/api/cl_mem_locally_uncached_resource_tests.cpp
+++ b/opencl/test/unit_test/api/cl_mem_locally_uncached_resource_tests.cpp
@ -27,8 +27,9 @@ namespace clMemLocallyUncachedResourceTests {

 template <typename FamilyType>
 uint32_t argMocs(Kernel &kernel, size_t argIndex) {
+    auto rootDeviceIndex = kernel.getDevices()[0]->getRootDeviceIndex();
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
-    auto surfaceStateHeapAddress = kernel.getSurfaceStateHeap();
+    auto surfaceStateHeapAddress = kernel.getSurfaceStateHeap(rootDeviceIndex);
    auto surfaceStateHeapAddressOffset = kernel.getKernelInfo().kernelArgInfo[argIndex].offsetHeap;
    auto surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(surfaceStateHeapAddress, surfaceStateHeapAddressOffset));
    return surfaceState->getMemoryObjectControlState();
--- a/opencl/test/unit_test/built_ins/built_in_tests.cpp
+++ b/opencl/test/unit_test/built_ins/built_in_tests.cpp
@ -484,12 +484,12 @@ HWCMDTEST_F(IGFX_GEN8_CORE, BuiltInTests, givenAuxTranslationKernelWhenSettingKe
        // read args
        auto argNum = 0;
        auto expectedMocs = pDevice->getGmmHelper()->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER_CACHELINE_MISALIGNED);
-        auto sshBase = mockAuxBuiltInOp.convertToAuxKernel[0]->getSurfaceStateHeap();
+        auto sshBase = mockAuxBuiltInOp.convertToAuxKernel[0]->getSurfaceStateHeap(rootDeviceIndex);
        auto sshOffset = mockAuxBuiltInOp.convertToAuxKernel[0]->getKernelInfo().kernelArgInfo[argNum].offsetHeap;
        auto surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(sshBase, sshOffset));
        EXPECT_EQ(expectedMocs, surfaceState->getMemoryObjectControlState());

-        sshBase = mockAuxBuiltInOp.convertToNonAuxKernel[0]->getSurfaceStateHeap();
+        sshBase = mockAuxBuiltInOp.convertToNonAuxKernel[0]->getSurfaceStateHeap(rootDeviceIndex);
        sshOffset = mockAuxBuiltInOp.convertToNonAuxKernel[0]->getKernelInfo().kernelArgInfo[argNum].offsetHeap;
        surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(sshBase, sshOffset));
        EXPECT_EQ(expectedMocs, surfaceState->getMemoryObjectControlState());
@ -499,12 +499,12 @@ HWCMDTEST_F(IGFX_GEN8_CORE, BuiltInTests, givenAuxTranslationKernelWhenSettingKe
        // write args
        auto argNum = 1;
        auto expectedMocs = pDevice->getGmmHelper()->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER);
-        auto sshBase = mockAuxBuiltInOp.convertToAuxKernel[0]->getSurfaceStateHeap();
+        auto sshBase = mockAuxBuiltInOp.convertToAuxKernel[0]->getSurfaceStateHeap(rootDeviceIndex);
        auto sshOffset = mockAuxBuiltInOp.convertToAuxKernel[0]->getKernelInfo().kernelArgInfo[argNum].offsetHeap;
        auto surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(sshBase, sshOffset));
        EXPECT_EQ(expectedMocs, surfaceState->getMemoryObjectControlState());

-        sshBase = mockAuxBuiltInOp.convertToNonAuxKernel[0]->getSurfaceStateHeap();
+        sshBase = mockAuxBuiltInOp.convertToNonAuxKernel[0]->getSurfaceStateHeap(rootDeviceIndex);
        sshOffset = mockAuxBuiltInOp.convertToNonAuxKernel[0]->getKernelInfo().kernelArgInfo[argNum].offsetHeap;
        surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(sshBase, sshOffset));
        EXPECT_EQ(expectedMocs, surfaceState->getMemoryObjectControlState());
@ -541,7 +541,7 @@ HWTEST_F(BuiltInTests, givenAuxToNonAuxTranslationWhenSettingSurfaceStateThenSet
    {
        // read arg
        auto argNum = 0;
-        auto sshBase = mockAuxBuiltInOp.convertToNonAuxKernel[0]->getSurfaceStateHeap();
+        auto sshBase = mockAuxBuiltInOp.convertToNonAuxKernel[0]->getSurfaceStateHeap(rootDeviceIndex);
        auto sshOffset = mockAuxBuiltInOp.convertToNonAuxKernel[0]->getKernelInfo().kernelArgInfo[argNum].offsetHeap;
        auto surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(sshBase, sshOffset));
        EXPECT_EQ(AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_CCS_E, surfaceState->getAuxiliarySurfaceMode());
@ -550,7 +550,7 @@ HWTEST_F(BuiltInTests, givenAuxToNonAuxTranslationWhenSettingSurfaceStateThenSet
    {
        // write arg
        auto argNum = 1;
-        auto sshBase = mockAuxBuiltInOp.convertToNonAuxKernel[0]->getSurfaceStateHeap();
+        auto sshBase = mockAuxBuiltInOp.convertToNonAuxKernel[0]->getSurfaceStateHeap(rootDeviceIndex);
        auto sshOffset = mockAuxBuiltInOp.convertToNonAuxKernel[0]->getKernelInfo().kernelArgInfo[argNum].offsetHeap;
        auto surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(sshBase, sshOffset));
        EXPECT_EQ(AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_NONE, surfaceState->getAuxiliarySurfaceMode());
@ -586,7 +586,7 @@ HWTEST_F(BuiltInTests, givenNonAuxToAuxTranslationWhenSettingSurfaceStateThenSet
    {
        // read arg
        auto argNum = 0;
-        auto sshBase = mockAuxBuiltInOp.convertToAuxKernel[0]->getSurfaceStateHeap();
+        auto sshBase = mockAuxBuiltInOp.convertToAuxKernel[0]->getSurfaceStateHeap(rootDeviceIndex);
        auto sshOffset = mockAuxBuiltInOp.convertToAuxKernel[0]->getKernelInfo().kernelArgInfo[argNum].offsetHeap;
        auto surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(sshBase, sshOffset));
        EXPECT_EQ(AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_NONE, surfaceState->getAuxiliarySurfaceMode());
@ -595,7 +595,7 @@ HWTEST_F(BuiltInTests, givenNonAuxToAuxTranslationWhenSettingSurfaceStateThenSet
    {
        // write arg
        auto argNum = 1;
-        auto sshBase = mockAuxBuiltInOp.convertToAuxKernel[0]->getSurfaceStateHeap();
+        auto sshBase = mockAuxBuiltInOp.convertToAuxKernel[0]->getSurfaceStateHeap(rootDeviceIndex);
        auto sshOffset = mockAuxBuiltInOp.convertToAuxKernel[0]->getKernelInfo().kernelArgInfo[argNum].offsetHeap;
        auto surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(sshBase, sshOffset));
        EXPECT_EQ(AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_CCS_E, surfaceState->getAuxiliarySurfaceMode());
--- a/opencl/test/unit_test/command_queue/command_queue_tests.cpp
+++ b/opencl/test/unit_test/command_queue/command_queue_tests.cpp
@ -1038,7 +1038,7 @@ HWTEST_F(CommandQueueCommandStreamTest, givenDebugKernelWhenSetupDebugSurfaceIsC
    std::unique_ptr<MockDebugKernel> kernel(MockKernel::create<MockDebugKernel>(*pDevice, &program));
    MockCommandQueue cmdQ(context.get(), pClDevice, 0);

-    kernel->setSshLocal(nullptr, sizeof(RENDER_SURFACE_STATE) + kernel->getAllocatedKernelInfo()->patchInfo.pAllocateSystemThreadSurface->Offset);
+    kernel->setSshLocal(nullptr, sizeof(RENDER_SURFACE_STATE) + kernel->getAllocatedKernelInfo()->patchInfo.pAllocateSystemThreadSurface->Offset, rootDeviceIndex);
    kernel->getAllocatedKernelInfo()->usesSsh = true;
    auto &commandStreamReceiver = cmdQ.getGpgpuCommandStreamReceiver();

@ -1047,7 +1047,7 @@ HWTEST_F(CommandQueueCommandStreamTest, givenDebugKernelWhenSetupDebugSurfaceIsC

    auto debugSurface = commandStreamReceiver.getDebugSurfaceAllocation();
    ASSERT_NE(nullptr, debugSurface);
-    RENDER_SURFACE_STATE *surfaceState = (RENDER_SURFACE_STATE *)kernel->getSurfaceStateHeap();
+    RENDER_SURFACE_STATE *surfaceState = (RENDER_SURFACE_STATE *)kernel->getSurfaceStateHeap(rootDeviceIndex);
    EXPECT_EQ(debugSurface->getGpuAddress(), surfaceState->getSurfaceBaseAddress());
 }

@ -1058,7 +1058,7 @@ HWTEST_F(CommandQueueCommandStreamTest, givenCsrWithDebugSurfaceAllocatedWhenSet
    std::unique_ptr<MockDebugKernel> kernel(MockKernel::create<MockDebugKernel>(*pDevice, &program));
    MockCommandQueue cmdQ(context.get(), pClDevice, 0);

-    kernel->setSshLocal(nullptr, sizeof(RENDER_SURFACE_STATE) + kernel->getAllocatedKernelInfo()->patchInfo.pAllocateSystemThreadSurface->Offset);
+    kernel->setSshLocal(nullptr, sizeof(RENDER_SURFACE_STATE) + kernel->getAllocatedKernelInfo()->patchInfo.pAllocateSystemThreadSurface->Offset, rootDeviceIndex);
    kernel->getAllocatedKernelInfo()->usesSsh = true;
    auto &commandStreamReceiver = cmdQ.getGpgpuCommandStreamReceiver();
    commandStreamReceiver.allocateDebugSurface(SipKernel::maxDbgSurfaceSize);
@ -1068,7 +1068,7 @@ HWTEST_F(CommandQueueCommandStreamTest, givenCsrWithDebugSurfaceAllocatedWhenSet
    cmdQ.setupDebugSurface(kernel.get());

    EXPECT_EQ(debugSurface, commandStreamReceiver.getDebugSurfaceAllocation());
-    RENDER_SURFACE_STATE *surfaceState = (RENDER_SURFACE_STATE *)kernel->getSurfaceStateHeap();
+    RENDER_SURFACE_STATE *surfaceState = (RENDER_SURFACE_STATE *)kernel->getSurfaceStateHeap(rootDeviceIndex);
    EXPECT_EQ(debugSurface->getGpuAddress(), surfaceState->getSurfaceBaseAddress());
 }

--- a/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp
+++ b/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp
@ -733,7 +733,7 @@ HWTEST_F(DispatchWalkerTest, GivenBlockedQueueWhenDispatchingWalkerThenRequiredH

    auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(kernel);
    auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(rootDeviceIndex, kernel, Math::computeTotalElementsCount(localWorkgroupSize));
-    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(kernel);
+    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(kernel, rootDeviceIndex);

    EXPECT_LE(expectedSizeDSH, blockedCommandsData->dsh->getMaxAvailableSpace());
    EXPECT_LE(expectedSizeIOH, blockedCommandsData->ioh->getMaxAvailableSpace());
--- a/opencl/test/unit_test/command_queue/enqueue_handler_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_handler_tests.cpp
@ -578,7 +578,7 @@ HWTEST_F(EnqueueHandlerTest, givenKernelUsingSyncBufferWhenEnqueuingKernelThenSs
        kernel->initialize();

        auto bindingTableState = reinterpret_cast<BINDING_TABLE_STATE *>(
-            ptrOffset(kernel->getSurfaceStateHeap(), sPatchBindingTableState.Offset));
+            ptrOffset(kernel->getSurfaceStateHeap(rootDeviceIndex), sPatchBindingTableState.Offset));
        bindingTableState->setSurfaceStatePointer(0);

        auto mockCmdQ = clUniquePtr(new MockCommandQueueHw<FamilyType>(context, pClDevice, 0));
--- a/opencl/test/unit_test/command_queue/get_size_required_buffer_tests.cpp
+++ b/opencl/test/unit_test/command_queue/get_size_required_buffer_tests.cpp
@ -400,7 +400,7 @@ HWTEST_F(GetSizeRequiredBufferTest, givenMultipleKernelRequiringSshWhenTotalSize
    builder.buildDispatchInfos(multiDispatchInfo);
    builder.buildDispatchInfos(multiDispatchInfo);

-    auto sizeSSH = multiDispatchInfo.begin()->getKernel()->getSurfaceStateHeapSize();
+    auto sizeSSH = multiDispatchInfo.begin()->getKernel()->getSurfaceStateHeapSize(rootDeviceIndex);
    sizeSSH += sizeSSH ? FamilyType::BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE : 0;

    sizeSSH = alignUp(sizeSSH, MemoryConstants::cacheLineSize);
@ -439,7 +439,7 @@ HWTEST_F(GetSizeRequiredBufferTest, GivenHelloWorldKernelWhenEnqueingKernelThenH
    auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *pCmdQ, KernelFixture::pKernel);
    auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*KernelFixture::pKernel);
    auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(rootDeviceIndex, *KernelFixture::pKernel, workSize[0]);
-    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*KernelFixture::pKernel);
+    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*KernelFixture::pKernel, rootDeviceIndex);

    // Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
    expectedSizeCS += sizeof(typename FamilyType::MI_BATCH_BUFFER_END);
@ -478,7 +478,7 @@ HWTEST_F(GetSizeRequiredBufferTest, GivenKernelWithSimpleArgWhenEnqueingKernelTh
    auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *pCmdQ, KernelFixture::pKernel);
    auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*KernelFixture::pKernel);
    auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(rootDeviceIndex, *KernelFixture::pKernel, workSize[0]);
-    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*KernelFixture::pKernel);
+    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*KernelFixture::pKernel, rootDeviceIndex);

    EXPECT_EQ(0u, expectedSizeIOH % GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
    EXPECT_EQ(0u, expectedSizeDSH % 64);
--- a/opencl/test/unit_test/command_queue/get_size_required_image_tests.cpp
+++ b/opencl/test/unit_test/command_queue/get_size_required_image_tests.cpp
@ -93,7 +93,7 @@ HWTEST_F(GetSizeRequiredImageTest, WhenCopyingImageThenHeapsAndCommandBufferCons
    auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_COPY_IMAGE, false, false, *pCmdQ, kernel);
    auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
    auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(rootDeviceIndex, *kernel);
-    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);
+    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel, rootDeviceIndex);

    // Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
    expectedSizeCS += sizeof(typename FamilyType::MI_BATCH_BUFFER_END);
@ -140,7 +140,7 @@ HWTEST_F(GetSizeRequiredImageTest, WhenCopyingReadWriteImageThenHeapsAndCommandB
    auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_COPY_IMAGE, false, false, *pCmdQ, kernel.get());
    auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel.get());
    auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(rootDeviceIndex, *kernel.get());
-    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel.get());
+    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel.get(), rootDeviceIndex);

    // Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
    expectedSizeCS += sizeof(typename FamilyType::MI_BATCH_BUFFER_END);
@ -197,7 +197,7 @@ HWTEST_F(GetSizeRequiredImageTest, WhenReadingImageNonBlockingThenHeapsAndComman
    auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_READ_IMAGE, false, false, *pCmdQ, kernel);
    auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
    auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(rootDeviceIndex, *kernel);
-    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);
+    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel, rootDeviceIndex);

    // Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
    expectedSizeCS += sizeof(typename FamilyType::MI_BATCH_BUFFER_END);
@ -252,7 +252,7 @@ HWTEST_F(GetSizeRequiredImageTest, WhenReadingImageBlockingThenHeapsAndCommandBu
    auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_READ_IMAGE, false, false, *pCmdQ, kernel);
    auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
    auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(rootDeviceIndex, *kernel);
-    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);
+    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel, rootDeviceIndex);

    // Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
    expectedSizeCS += sizeof(typename FamilyType::MI_BATCH_BUFFER_END);
@ -307,7 +307,7 @@ HWTEST_F(GetSizeRequiredImageTest, WhenWritingImageNonBlockingThenHeapsAndComman
    auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_WRITE_IMAGE, false, false, *pCmdQ, kernel);
    auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
    auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(rootDeviceIndex, *kernel);
-    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);
+    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel, rootDeviceIndex);

    // Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
    expectedSizeCS += sizeof(typename FamilyType::MI_BATCH_BUFFER_END);
@ -362,7 +362,7 @@ HWTEST_F(GetSizeRequiredImageTest, WhenWritingImageBlockingThenHeapsAndCommandBu
    auto expectedSizeCS = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_WRITE_IMAGE, false, false, *pCmdQ, kernel);
    auto expectedSizeDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
    auto expectedSizeIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(rootDeviceIndex, *kernel);
-    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);
+    auto expectedSizeSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel, rootDeviceIndex);

    // Since each enqueue* may flush, we may see a MI_BATCH_BUFFER_END appended.
    expectedSizeCS += sizeof(typename FamilyType::MI_BATCH_BUFFER_END);
--- a/opencl/test/unit_test/command_queue/sync_buffer_handler_tests.cpp
+++ b/opencl/test/unit_test/command_queue/sync_buffer_handler_tests.cpp
@ -180,7 +180,7 @@ HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenSshRequiredWhenPatchingSyncBuffer

    pClDevice->allocateSyncBufferHandler();
    auto syncBufferHandler = getSyncBufferHandler();
-    auto surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(kernel->getSurfaceStateHeap(),
+    auto surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(kernel->getSurfaceStateHeap(rootDeviceIndex),
                                                                           sPatchAllocateSyncBuffer.SurfaceStateHeapOffset));
    auto bufferAddress = syncBufferHandler->graphicsAllocation->getGpuAddress();
    surfaceState->setSurfaceBaseAddress(bufferAddress + 1);
--- a/opencl/test/unit_test/device_queue/device_queue_hw_tests.cpp
+++ b/opencl/test/unit_test/device_queue/device_queue_hw_tests.cpp
@ -537,7 +537,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, DeviceQueueHwWithKernel, WhenSetiingIUpIndirectState
    auto dsh = devQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
    ASSERT_NE(nullptr, dsh);

-    size_t surfaceStateHeapSize = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(const_cast<const Kernel &>(*pKernel));
+    size_t surfaceStateHeapSize = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(const_cast<const Kernel &>(*pKernel), rootDeviceIndex);

    auto ssh = new IndirectHeap(alignedMalloc(surfaceStateHeapSize, MemoryConstants::pageSize), surfaceStateHeapSize);
    auto usedBeforeSSH = ssh->getUsed();
@ -565,7 +565,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, DeviceQueueHwWithKernel, WhenSettingUpIndirectStateT
    auto dsh = devQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
    ASSERT_NE(nullptr, dsh);

-    size_t surfaceStateHeapSize = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(const_cast<const Kernel &>(*pKernel));
+    size_t surfaceStateHeapSize = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(const_cast<const Kernel &>(*pKernel), rootDeviceIndex);

    auto ssh = new IndirectHeap(alignedMalloc(surfaceStateHeapSize, MemoryConstants::pageSize), surfaceStateHeapSize);

@ -593,7 +593,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, DeviceQueueHwWithKernel, WhenSettingUpIndirectStateT
    auto dsh = devQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
    ASSERT_NE(nullptr, dsh);

-    size_t surfaceStateHeapSize = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(const_cast<const Kernel &>(*pKernel));
+    size_t surfaceStateHeapSize = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(const_cast<const Kernel &>(*pKernel), rootDeviceIndex);

    auto ssh = new IndirectHeap(alignedMalloc(surfaceStateHeapSize, MemoryConstants::pageSize), surfaceStateHeapSize);

@ -631,7 +631,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, DeviceQueueHwWithKernel, GivenHasBarriersSetWhenCall
    }

    auto surfaceStateHeapSize =
-        HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(const_cast<const Kernel &>(*pKernel));
+        HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(const_cast<const Kernel &>(*pKernel), rootDeviceIndex);
    auto ssh = std::make_unique<IndirectHeap>(alignedMalloc(surfaceStateHeapSize, MemoryConstants::pageSize), surfaceStateHeapSize);

    devQueueHw->setupIndirectState(*ssh, *dsh, pKernel, parentCount, false);
--- a/opencl/test/unit_test/execution_model/enqueue_execution_model_kernel_tests.cpp
+++ b/opencl/test/unit_test/execution_model/enqueue_execution_model_kernel_tests.cpp
@ -297,7 +297,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, ParentKernelEnqueueTest, givenParentKernelWhenEnqueu
    BlockKernelManager *blockManager = pProgram->getBlockKernelManager();
    uint32_t blockCount = static_cast<uint32_t>(blockManager->getCount());

-    size_t parentKernelSSHSize = pKernel->getSurfaceStateHeapSize();
+    size_t parentKernelSSHSize = pKernel->getSurfaceStateHeapSize(rootDeviceIndex);

    MockMultiDispatchInfo multiDispatchInfo(pClDevice, pKernel);

@ -340,7 +340,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, ParentKernelEnqueueTest, givenParentKernelWhenEnqueu
                EXPECT_EQ(0, memcmp(srcSurfaceState, dstSurfaceState, sizeof(RENDER_SURFACE_STATE)));
            }

-            blockSSH = ptrOffset(blockSSH, blockKernel->getSurfaceStateHeapSize());
+            blockSSH = ptrOffset(blockSSH, blockKernel->getSurfaceStateHeapSize(rootDeviceIndex));
        }

        delete blockKernel;
--- a/opencl/test/unit_test/execution_model/parent_kernel_dispatch_tests.cpp
+++ b/opencl/test/unit_test/execution_model/parent_kernel_dispatch_tests.cpp
@ -132,7 +132,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, ParentKernelDispatchTest, givenParentKernelWhenQueue
    EXPECT_LE(pKernel->getKernelInfo().heapInfo.SurfaceStateHeapSize, ssh.getMaxAvailableSpace());

    size_t minRequiredSize = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredSSH(multiDispatchInfo);
-    size_t minRequiredSizeForEM = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*pKernel);
+    size_t minRequiredSizeForEM = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*pKernel, rootDeviceIndex);

    EXPECT_LE(minRequiredSize + minRequiredSizeForEM, ssh.getMaxAvailableSpace());
 }
@ -162,7 +162,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, ParentKernelDispatchTest, givenParentKernelWhenQueue
    ASSERT_NE(nullptr, blockedCommandsData);

    size_t minRequiredSize = HardwareCommandsHelper<FamilyType>::getTotalSizeRequiredSSH(multiDispatchInfo) + UnitTestHelper<FamilyType>::getDefaultSshUsage();
-    size_t minRequiredSizeForEM = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*pKernel);
+    size_t minRequiredSizeForEM = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*pKernel, rootDeviceIndex);

    size_t sshUsed = blockedCommandsData->ssh->getUsed();

--- a/opencl/test/unit_test/execution_model/scheduler_dispatch_tests.cpp
+++ b/opencl/test/unit_test/execution_model/scheduler_dispatch_tests.cpp
@ -53,7 +53,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ExecutionModelSchedulerFixture, WhenDispatchingSched

    EXPECT_NE(nullptr, executionModelDsh);

-    size_t minRequiredSizeForSchedulerSSH = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel);
+    size_t minRequiredSizeForSchedulerSSH = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel, rootDeviceIndex);
    // Setup heaps in pCmdQ
    MultiDispatchInfo multiDispatchinfo(&scheduler);
    LinearStream &commandStream = getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*pCmdQ, CsrDependencies(),
@ -174,7 +174,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ExecutionModelSchedulerFixture, WhenDispatchingSched
    DeviceQueueHw<FamilyType> *pDevQueueHw = castToObject<DeviceQueueHw<FamilyType>>(pDevQueue);
    SchedulerKernel &scheduler = context->getSchedulerKernel();

-    size_t minRequiredSizeForSchedulerSSH = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel);
+    size_t minRequiredSizeForSchedulerSSH = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel, rootDeviceIndex);
    // Setup heaps in pCmdQ

    MultiDispatchInfo multiDispatchinfo(&scheduler);
@ -209,7 +209,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ParentKernelCommandQueueFixture, GivenEarlyReturnSet

    SchedulerKernel &scheduler = context->getSchedulerKernel();

-    size_t minRequiredSizeForSchedulerSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(scheduler);
+    size_t minRequiredSizeForSchedulerSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(scheduler, rootDeviceIndex);
    // Setup heaps in pCmdQ
    MultiDispatchInfo multiDispatchinfo(&scheduler);
    LinearStream &commandStream = getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*pCmdQ, CsrDependencies(),
--- a/opencl/test/unit_test/execution_model/submit_blocked_parent_kernel_tests.cpp
+++ b/opencl/test/unit_test/execution_model/submit_blocked_parent_kernel_tests.cpp
@ -97,7 +97,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ParentKernelCommandQueueFixture, givenLockedEMcritca

    dsh->getSpace(mockDevQueue.getDshOffset());

-    size_t minSizeSSHForEM = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel);
+    size_t minSizeSSHForEM = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel, rootDeviceIndex);

    auto cmdStreamAllocation = device->getMemoryManager()->allocateGraphicsMemoryWithProperties({device->getRootDeviceIndex(), 4096, GraphicsAllocation::AllocationType::COMMAND_BUFFER, device->getDeviceBitfield()});
    auto blockedCommandData = std::make_unique<KernelOperation>(new LinearStream(cmdStreamAllocation),
@ -162,7 +162,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ParentKernelCommandQueueFixture, givenParentKernelWh
                                                                *pCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage());
    blockedCommandData->setHeaps(dsh, ioh, ssh);

-    size_t minSizeSSHForEM = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel);
+    size_t minSizeSSHForEM = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel, rootDeviceIndex);

    blockedCommandData->surfaceStateHeapSizeEM = minSizeSSHForEM;
    PreemptionMode preemptionMode = device->getPreemptionMode();
@ -203,7 +203,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ParentKernelCommandQueueFixture, givenParentKernelWh
                                                                *pCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage());
    blockedCommandData->setHeaps(dsh, ioh, ssh);

-    size_t minSizeSSHForEM = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel);
+    size_t minSizeSSHForEM = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel, rootDeviceIndex);

    blockedCommandData->surfaceStateHeapSizeEM = minSizeSSHForEM;
    PreemptionMode preemptionMode = device->getPreemptionMode();
@ -241,7 +241,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ParentKernelCommandQueueFixture, givenBlockedParentK
                                                                *pCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage());
    blockedCommandData->setHeaps(dsh, ioh, ssh);

-    size_t minSizeSSHForEM = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel);
+    size_t minSizeSSHForEM = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel, rootDeviceIndex);

    blockedCommandData->surfaceStateHeapSizeEM = minSizeSSHForEM;
    PreemptionMode preemptionMode = device->getPreemptionMode();
@ -282,7 +282,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ParentKernelCommandQueueFixture, givenParentKernelWh
                                                                *pCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage());
    blockedCommandData->setHeaps(dsh, ioh, ssh);

-    size_t minSizeSSHForEM = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel);
+    size_t minSizeSSHForEM = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel, rootDeviceIndex);

    blockedCommandData->surfaceStateHeapSizeEM = minSizeSSHForEM;
    PreemptionMode preemptionMode = device->getPreemptionMode();
@ -308,7 +308,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ParentKernelCommandQueueFixture, givenUsedCommandQue

    MockCommandQueue cmdQ(context, device, properties);

-    size_t minSizeSSHForEM = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel);
+    size_t minSizeSSHForEM = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel, rootDeviceIndex);

    size_t heapSize = 20;

@ -362,7 +362,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ParentKernelCommandQueueFixture, givenNotUsedSSHWhen
    parentKernel->createReflectionSurface();
    context->setDefaultDeviceQueue(&mockDevQueue);

-    size_t minSizeSSHForEM = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel);
+    size_t minSizeSSHForEM = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel, rootDeviceIndex);

    size_t heapSize = 20;

--- a/opencl/test/unit_test/fixtures/execution_model_fixture.h
+++ b/opencl/test/unit_test/fixtures/execution_model_fixture.h
@ -109,7 +109,7 @@ struct ParentKernelCommandQueueFixture : public CommandQueueHwFixture,
                                         testing::Test {

    void SetUp() override {
-        device = new MockClDevice{MockDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr)};
+        device = new MockClDevice{MockDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr, rootDeviceIndex)};
        CommandQueueHwFixture::SetUp(device, 0);
    }
    void TearDown() override {
@ -125,4 +125,5 @@ struct ParentKernelCommandQueueFixture : public CommandQueueHwFixture,

        return std::make_unique<KernelOperation>(commandStream, *gpgpuCsr.getInternalAllocationStorage());
    }
+    const uint32_t rootDeviceIndex = 0u;
 };
--- a/opencl/test/unit_test/gen8/scheduler_dispatch_tests_gen8.cpp
+++ b/opencl/test/unit_test/gen8/scheduler_dispatch_tests_gen8.cpp
@ -31,7 +31,7 @@ BDWTEST_F(BdwSchedulerTest, givenCallToDispatchSchedulerWhenPipeControlWithCSSta
        DeviceQueueHw<FamilyType> *pDevQueueHw = castToObject<DeviceQueueHw<FamilyType>>(pDevQueue);
        SchedulerKernel &scheduler = context->getSchedulerKernel();

-        size_t minRequiredSizeForSchedulerSSH = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel);
+        size_t minRequiredSizeForSchedulerSSH = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel, rootDeviceIndex);

        // Setup heaps in pCmdQ
        MultiDispatchInfo multiDispatchinfo(&scheduler);
--- a/opencl/test/unit_test/gtpin/gtpin_tests.cpp
+++ b/opencl/test/unit_test/gtpin/gtpin_tests.cpp
@ -160,7 +160,8 @@ class GTPinFixture : public ContextFixture, public MemoryManagementFixture {
        executionEnvironment->memoryManager.reset(memoryManager);
        initPlatform();
        pDevice = pPlatform->getClDevice(0);
-        cl_device_id device = (cl_device_id)pDevice;
+        rootDeviceIndex = pDevice->getRootDeviceIndex();
+        cl_device_id device = pDevice;
        ContextFixture::SetUp(1, &device);

        driverServices.bufferAllocate = nullptr;
@ -193,6 +194,7 @@ class GTPinFixture : public ContextFixture, public MemoryManagementFixture {
    driver_services_t driverServices;
    gtpin::ocl::gtpin_events_t gtpinCallbacks;
    MockMemoryManagerWithFailures *memoryManager = nullptr;
+    uint32_t rootDeviceIndex = std::numeric_limits<uint32_t>::max();
 };

 typedef Test<GTPinFixture> GTPinTests;
@ -1279,7 +1281,7 @@ TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenKernelWithoutSSHIsUsedThenG

    // Verify that when SSH is removed then during kernel execution
    // GT-Pin Kernel Submit, Command Buffer Create and Command Buffer Complete callbacks are not called.
-    pKernel->resizeSurfaceStateHeap(nullptr, 0, 0, 0);
+    pKernel->resizeSurfaceStateHeap(rootDeviceIndex, nullptr, 0, 0, 0);

    int prevCount2 = KernelSubmitCallbackCount;
    int prevCount3 = CommandBufferCreateCallbackCount;
@ -1392,7 +1394,7 @@ TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenBlockedKernelWithoutSSHIsUs

    // Verify that when SSH is removed then during kernel execution
    // GT-Pin Kernel Submit, Command Buffer Create and Command Buffer Complete callbacks are not called.
-    pKernel->resizeSurfaceStateHeap(nullptr, 0, 0, 0);
+    pKernel->resizeSurfaceStateHeap(rootDeviceIndex, nullptr, 0, 0, 0);

    cl_event userEvent = clCreateUserEvent(context, &retVal);
    EXPECT_EQ(CL_SUCCESS, retVal);
@ -2167,15 +2169,15 @@ TEST_F(GTPinTests, givenParentKernelWhenGtPinAddingSurfaceStateThenItIsNotAddedA
    std::unique_ptr<MockParentKernel> parentKernel(MockParentKernel::create(*pContext));

    parentKernel->mockKernelInfo->usesSsh = true;
-    parentKernel->sshLocalSize = 64;
-    parentKernel->pSshLocal.reset(new char[64]);
+    parentKernel->kernelDeviceInfos[rootDeviceIndex].sshLocalSize = 64;
+    parentKernel->kernelDeviceInfos[rootDeviceIndex].pSshLocal.reset(new char[64]);

-    size_t sizeSurfaceStates1 = parentKernel->getSurfaceStateHeapSize();
+    size_t sizeSurfaceStates1 = parentKernel->getSurfaceStateHeapSize(rootDeviceIndex);

-    bool surfaceAdded = gtpinHelper.addSurfaceState(parentKernel.get());
+    bool surfaceAdded = gtpinHelper.addSurfaceState(parentKernel.get(), rootDeviceIndex);
    EXPECT_FALSE(surfaceAdded);

-    size_t sizeSurfaceStates2 = parentKernel->getSurfaceStateHeapSize();
+    size_t sizeSurfaceStates2 = parentKernel->getSurfaceStateHeapSize(rootDeviceIndex);
    EXPECT_EQ(sizeSurfaceStates2, sizeSurfaceStates1);
 }

@ -2225,47 +2227,47 @@ TEST_F(GTPinTests, givenKernelWithSSHThenVerifyThatSSHResizeWorksWell) {

    size_t numBTS1 = pKernel->getNumberOfBindingTableStates();
    EXPECT_EQ(2u, numBTS1);
-    size_t sizeSurfaceStates1 = pKernel->getSurfaceStateHeapSize();
+    size_t sizeSurfaceStates1 = pKernel->getSurfaceStateHeapSize(rootDeviceIndex);
    EXPECT_NE(0u, sizeSurfaceStates1);
    size_t offsetBTS1 = pKernel->getBindingTableOffset();
    EXPECT_NE(0u, offsetBTS1);

    GFXCORE_FAMILY genFamily = pDevice->getHardwareInfo().platform.eRenderCoreFamily;
    GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(genFamily);
-    void *pSS1 = gtpinHelper.getSurfaceState(pKernel, 0);
+    void *pSS1 = gtpinHelper.getSurfaceState(pKernel, 0, rootDeviceIndex);
    EXPECT_NE(nullptr, pSS1);

    // Enlarge SSH by one SURFACE STATE element
-    bool surfaceAdded = gtpinHelper.addSurfaceState(pKernel);
+    bool surfaceAdded = gtpinHelper.addSurfaceState(pKernel, rootDeviceIndex);
    EXPECT_TRUE(surfaceAdded);

    size_t numBTS2 = pKernel->getNumberOfBindingTableStates();
    EXPECT_EQ(numBTS1 + 1, numBTS2);
-    size_t sizeSurfaceStates2 = pKernel->getSurfaceStateHeapSize();
+    size_t sizeSurfaceStates2 = pKernel->getSurfaceStateHeapSize(rootDeviceIndex);
    EXPECT_GT(sizeSurfaceStates2, sizeSurfaceStates1);
    size_t offsetBTS2 = pKernel->getBindingTableOffset();
    EXPECT_GT(offsetBTS2, offsetBTS1);

-    void *pSS2 = gtpinHelper.getSurfaceState(pKernel, 0);
+    void *pSS2 = gtpinHelper.getSurfaceState(pKernel, 0, rootDeviceIndex);
    EXPECT_NE(pSS2, pSS1);

-    pSS2 = gtpinHelper.getSurfaceState(pKernel, numBTS2);
+    pSS2 = gtpinHelper.getSurfaceState(pKernel, numBTS2, rootDeviceIndex);
    EXPECT_EQ(nullptr, pSS2);

    // Remove kernel's SSH
-    pKernel->resizeSurfaceStateHeap(nullptr, 0, 0, 0);
+    pKernel->resizeSurfaceStateHeap(rootDeviceIndex, nullptr, 0, 0, 0);

    // Try to enlarge SSH once again, this time the operation must fail
-    surfaceAdded = gtpinHelper.addSurfaceState(pKernel);
+    surfaceAdded = gtpinHelper.addSurfaceState(pKernel, rootDeviceIndex);
    EXPECT_FALSE(surfaceAdded);

    size_t numBTS3 = pKernel->getNumberOfBindingTableStates();
    EXPECT_EQ(0u, numBTS3);
-    size_t sizeSurfaceStates3 = pKernel->getSurfaceStateHeapSize();
+    size_t sizeSurfaceStates3 = pKernel->getSurfaceStateHeapSize(rootDeviceIndex);
    EXPECT_EQ(0u, sizeSurfaceStates3);
    size_t offsetBTS3 = pKernel->getBindingTableOffset();
    EXPECT_EQ(0u, offsetBTS3);
-    void *pSS3 = gtpinHelper.getSurfaceState(pKernel, 0);
+    void *pSS3 = gtpinHelper.getSurfaceState(pKernel, 0, rootDeviceIndex);
    EXPECT_EQ(nullptr, pSS3);

    // Cleanup
@ -2396,7 +2398,7 @@ TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenOnKernelSubitIsCalledThenCo
    std::unique_ptr<MockCommandQueue> cmdQ(new MockCommandQueue(context.get(), pDevice, nullptr));
    std::unique_ptr<MockKernel> pKernel(new MockKernel(pProgramm.get(), *pKernelInfo));

-    pKernel->setSshLocal(nullptr, sizeof(surfaceStateHeap));
+    pKernel->setSshLocal(nullptr, sizeof(surfaceStateHeap), rootDeviceIndex);

    kernelOffset = 0x1234;
    EXPECT_NE(pKernel->getStartOffset(), kernelOffset);
--- a/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp
+++ b/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp
@ -363,7 +363,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenAllocatingIndirectStateRes
    auto usedAfterSSH = ssh.getUsed();
    auto sizeRequiredDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
    auto sizeRequiredIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(rootDeviceIndex, *kernel, localWorkSize);
-    auto sizeRequiredSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);
+    auto sizeRequiredSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel, rootDeviceIndex);

    EXPECT_GE(sizeRequiredDSH, usedAfterDSH - usedBeforeDSH);
    EXPECT_GE(sizeRequiredIOH, usedAfterIOH - usedBeforeIOH);
@ -1005,7 +1005,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, GivenKernelWithSamplersWhenInd
    }

    mockKernelWithInternal->mockKernel->setCrossThreadData(mockKernelWithInternal->crossThreadData, sizeof(mockKernelWithInternal->crossThreadData));
-    mockKernelWithInternal->mockKernel->setSshLocal(mockKernelWithInternal->sshLocal, sizeof(mockKernelWithInternal->sshLocal));
+    mockKernelWithInternal->mockKernel->setSshLocal(mockKernelWithInternal->sshLocal, sizeof(mockKernelWithInternal->sshLocal), rootDeviceIndex);
    uint32_t interfaceDescriptorIndex = 0;
    auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
    auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel);
@ -1096,12 +1096,12 @@ HWCMDTEST_P(IGFX_GEN8_CORE, ParentKernelCommandsFromBinaryTest, WhenGettingSizeR
    totalSize += maxBindingTableCount * sizeof(BINDING_TABLE_STATE) * DeviceQueue::interfaceDescriptorEntries;

    auto &scheduler = pContext->getSchedulerKernel();
-    auto schedulerSshSize = scheduler.getSurfaceStateHeapSize();
+    auto schedulerSshSize = scheduler.getSurfaceStateHeapSize(rootDeviceIndex);
    totalSize += schedulerSshSize + ((schedulerSshSize != 0) ? BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE : 0);

    totalSize = alignUp(totalSize, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);

-    EXPECT_EQ(totalSize, HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*pKernel));
+    EXPECT_EQ(totalSize, HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*pKernel, rootDeviceIndex));
 }

 static const char *binaryFile = "simple_block_kernel";
--- a/opencl/test/unit_test/helpers/hardware_commands_helper_tests.h
+++ b/opencl/test/unit_test/helpers/hardware_commands_helper_tests.h
@ -46,7 +46,7 @@ struct HardwareCommandsTest : ClDeviceFixture,
    template <typename GfxFamily>
    size_t pushBindingTableAndSurfaceStates(IndirectHeap &dstHeap, const Kernel &srcKernel) {
        return EncodeSurfaceState<GfxFamily>::pushBindingTableAndSurfaceStates(dstHeap, (srcKernel.getKernelInfo().patchInfo.bindingTableState != nullptr) ? srcKernel.getKernelInfo().patchInfo.bindingTableState->Count : 0,
-                                                                               srcKernel.getSurfaceStateHeap(), srcKernel.getSurfaceStateHeapSize(),
+                                                                               srcKernel.getSurfaceStateHeap(rootDeviceIndex), srcKernel.getSurfaceStateHeapSize(rootDeviceIndex),
                                                                               srcKernel.getNumberOfBindingTableStates(), srcKernel.getBindingTableOffset());
    }
 };
--- a/opencl/test/unit_test/kernel/kernel_arg_buffer_tests.cpp
+++ b/opencl/test/unit_test/kernel/kernel_arg_buffer_tests.cpp
@ -58,7 +58,7 @@ TEST_F(KernelArgBufferTest, GivenSvmPtrStatelessWhenSettingKernelArgThenArgument
    EXPECT_EQ(CL_SUCCESS, retVal);
    EXPECT_FALSE(pKernel->requiresCoherency());

-    EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize());
+    EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));

    delete buffer;
 }
@ -76,11 +76,11 @@ HWTEST_F(KernelArgBufferTest, GivenSvmPtrStatefulWhenSettingKernelArgThenArgumen
    EXPECT_EQ(CL_SUCCESS, retVal);
    EXPECT_FALSE(pKernel->requiresCoherency());

-    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize());
+    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));

    typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(), pKernelInfo->kernelArgInfo[0].offsetHeap));
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex), pKernelInfo->kernelArgInfo[0].offsetHeap));

    auto surfaceAddress = surfaceState->getSurfaceBaseAddress();
    EXPECT_EQ(buffer->getGraphicsAllocation(mockRootDeviceIndex)->getGpuAddress(), surfaceAddress);
--- a/opencl/test/unit_test/kernel/kernel_arg_pipe_tests.cpp
+++ b/opencl/test/unit_test/kernel/kernel_arg_pipe_tests.cpp
@ -110,7 +110,7 @@ TEST_F(KernelArgPipeTest, GivenSvmPtrStatelessWhenSettingKernelArgThenArgumentsA
    auto retVal = this->pKernel->setArg(0, sizeof(cl_mem *), pVal);
    EXPECT_EQ(CL_SUCCESS, retVal);

-    EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize());
+    EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));

    delete pipe;
 }
@ -127,11 +127,11 @@ HWTEST_F(KernelArgPipeTest, GivenSvmPtrStatefulWhenSettingKernelArgThenArguments
    auto retVal = this->pKernel->setArg(0, sizeof(cl_mem *), pVal);
    EXPECT_EQ(CL_SUCCESS, retVal);

-    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize());
+    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));

    typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                  pKernelInfo->kernelArgInfo[0].offsetHeap));

    void *surfaceAddress = reinterpret_cast<void *>(surfaceState->getSurfaceBaseAddress());
--- a/opencl/test/unit_test/kernel/kernel_arg_svm_tests.cpp
+++ b/opencl/test/unit_test/kernel/kernel_arg_svm_tests.cpp
@ -101,7 +101,7 @@ TEST_F(KernelArgSvmTest, GivenSvmPtrStatelessWhenSettingKernelArgThenArgumentsAr
    auto retVal = pKernel->setArgSvm(0, 256, svmPtr, nullptr, 0u);
    EXPECT_EQ(CL_SUCCESS, retVal);

-    EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize());
+    EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));

    delete[] svmPtr;
 }
@ -115,11 +115,11 @@ HWTEST_F(KernelArgSvmTest, GivenSvmPtrStatefulWhenSettingKernelArgThenArgumentsA
    auto retVal = pKernel->setArgSvm(0, 256, svmPtr, nullptr, 0u);
    EXPECT_EQ(CL_SUCCESS, retVal);

-    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize());
+    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));

    typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                  pKernelInfo->kernelArgInfo[0].offsetHeap));

    void *surfaceAddress = reinterpret_cast<void *>(surfaceState->getSurfaceBaseAddress());
@ -154,7 +154,7 @@ TEST_F(KernelArgSvmTest, GivenValidSvmAllocStatelessWhenSettingKernelArgThenArgu
    auto retVal = pKernel->setArgSvmAlloc(0, svmPtr, &svmAlloc);
    EXPECT_EQ(CL_SUCCESS, retVal);

-    EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize());
+    EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));

    delete[] svmPtr;
 }
@ -170,11 +170,11 @@ HWTEST_F(KernelArgSvmTest, GivenValidSvmAllocStatefulWhenSettingKernelArgThenArg
    auto retVal = pKernel->setArgSvmAlloc(0, svmPtr, &svmAlloc);
    EXPECT_EQ(CL_SUCCESS, retVal);

-    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize());
+    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));

    typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                  pKernelInfo->kernelArgInfo[0].offsetHeap));

    void *surfaceAddress = reinterpret_cast<void *>(surfaceState->getSurfaceBaseAddress());
@ -196,7 +196,7 @@ HWTEST_F(KernelArgSvmTest, givenOffsetedSvmPointerWhenSetArgSvmAllocIsCalledThen

    typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                  pKernelInfo->kernelArgInfo[0].offsetHeap));

    void *surfaceAddress = reinterpret_cast<void *>(surfaceState->getSurfaceBaseAddress());
@ -214,7 +214,7 @@ HWTEST_F(KernelArgSvmTest, givenDeviceSupportingSharedSystemAllocationsWhenSetAr

    typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                  pKernelInfo->kernelArgInfo[0].offsetHeap));

    void *surfaceAddress = reinterpret_cast<void *>(surfaceState->getSurfaceBaseAddress());
@ -237,7 +237,7 @@ HWTEST_F(KernelArgSvmTest, WhenPatchingWithImplicitSurfaceThenPatchIsApplied) {
    svmPtr.resize(256);

    pKernel->setCrossThreadData(nullptr, sizeof(void *));
-    pKernel->setSshLocal(nullptr, rendSurfSize);
+    pKernel->setSshLocal(nullptr, rendSurfSize, rootDeviceIndex);
    pKernelInfo->requiresSshForBuffers = true;
    pKernelInfo->usesSsh = true;
    {
@ -254,8 +254,8 @@ HWTEST_F(KernelArgSvmTest, WhenPatchingWithImplicitSurfaceThenPatchIsApplied) {
        ASSERT_GE(pKernel->getCrossThreadDataSize(rootDeviceIndex), sizeof(void *));
        *reinterpret_cast<void **>(pKernel->getCrossThreadData(rootDeviceIndex)) = 0U;

-        ASSERT_GE(pKernel->getSurfaceStateHeapSize(), rendSurfSize);
-        RENDER_SURFACE_STATE *surfState = reinterpret_cast<RENDER_SURFACE_STATE *>(pKernel->getSurfaceStateHeap());
+        ASSERT_GE(pKernel->getSurfaceStateHeapSize(rootDeviceIndex), rendSurfSize);
+        RENDER_SURFACE_STATE *surfState = reinterpret_cast<RENDER_SURFACE_STATE *>(pKernel->getSurfaceStateHeap(rootDeviceIndex));
        memset(surfState, 0, rendSurfSize);

        pKernel->patchWithImplicitSurface(ptrToPatch, svmAlloc, patch);
@ -278,7 +278,7 @@ HWTEST_F(KernelArgSvmTest, WhenPatchingWithImplicitSurfaceThenPatchIsApplied) {

        // when cross thread and ssh data is not available then should not do anything
        pKernel->setCrossThreadData(nullptr, 0);
-        pKernel->setSshLocal(nullptr, 0);
+        pKernel->setSshLocal(nullptr, 0, rootDeviceIndex);
        pKernel->patchWithImplicitSurface(ptrToPatch, svmAlloc, patch);
    }
 }
@ -389,7 +389,7 @@ HWTEST_TYPED_TEST(KernelArgSvmTestTyped, GivenBufferKernelArgWhenBufferOffsetIsN
    kai.offsetBufferOffset = kai.kernelArgPatchInfoVector[0].size;

    this->pKernel->setCrossThreadData(nullptr, kai.offsetBufferOffset + sizeof(uint32_t));
-    this->pKernel->setSshLocal(nullptr, rendSurfSize);
+    this->pKernel->setSshLocal(nullptr, rendSurfSize, rootDeviceIndex);
    this->pKernelInfo->requiresSshForBuffers = true;
    this->pKernelInfo->usesSsh = true;
    {
@ -405,8 +405,8 @@ HWTEST_TYPED_TEST(KernelArgSvmTestTyped, GivenBufferKernelArgWhenBufferOffsetIsN
        *expectedPointerPatchPtr = reinterpret_cast<void *>(0U);
        *expectedOffsetPatchPtr = 0U;

-        ASSERT_GE(this->pKernel->getSurfaceStateHeapSize(), rendSurfSize);
-        RENDER_SURFACE_STATE *surfState = reinterpret_cast<RENDER_SURFACE_STATE *>(this->pKernel->getSurfaceStateHeap());
+        ASSERT_GE(this->pKernel->getSurfaceStateHeapSize(rootDeviceIndex), rendSurfSize);
+        RENDER_SURFACE_STATE *surfState = reinterpret_cast<RENDER_SURFACE_STATE *>(this->pKernel->getSurfaceStateHeap(rootDeviceIndex));
        memset(surfState, 0, rendSurfSize);

        TypeParam::setArg(*this->pKernel, 0U, ptrToPatch, sizeToPatch, svmAlloc);
--- a/opencl/test/unit_test/kernel/kernel_tests.cpp
+++ b/opencl/test/unit_test/kernel/kernel_tests.cpp
@ -736,13 +736,13 @@ HWTEST_F(KernelPrivateSurfaceTest, givenStatefulKernelWhenKernelIsCreatedThenPri

    ASSERT_EQ(CL_SUCCESS, pKernel->initialize());

-    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize());
+    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));

    auto bufferAddress = pKernel->kernelDeviceInfos[pDevice->getRootDeviceIndex()].privateSurface->getGpuAddress();

    typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                  pKernelInfo->patchInfo.pAllocateStatelessPrivateSurface->SurfaceStateHeapOffset));
    auto surfaceAddress = surfaceState->getSurfaceBaseAddress();

@ -766,7 +766,7 @@ TEST_F(KernelPrivateSurfaceTest, givenStatelessKernelWhenKernelIsCreatedThenPriv
    char buffer[16];
    MockGraphicsAllocation gfxAlloc(buffer, sizeof(buffer));

-    MockContext context;
+    MockContext context(pClDevice);
    MockProgram program(&context, false, toClDeviceVector(*pClDevice));
    program.setConstantSurface(&gfxAlloc);

@ -779,8 +779,8 @@ TEST_F(KernelPrivateSurfaceTest, givenStatelessKernelWhenKernelIsCreatedThenPriv

    ASSERT_EQ(CL_SUCCESS, pKernel->initialize());

-    EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize());
-    EXPECT_EQ(nullptr, pKernel->getSurfaceStateHeap());
+    EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));
+    EXPECT_EQ(nullptr, pKernel->getSurfaceStateHeap(rootDeviceIndex));

    program.setConstantSurface(nullptr);
    delete pKernel;
@ -984,11 +984,11 @@ HWTEST_F(KernelGlobalSurfaceTest, givenStatefulKernelWhenKernelIsCreatedThenGlob

    ASSERT_EQ(CL_SUCCESS, pKernel->initialize());

-    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize());
+    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));

    typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                  pKernelInfo->patchInfo.pAllocateStatelessGlobalMemorySurfaceWithInitialization->SurfaceStateHeapOffset));
    auto surfaceAddress = surfaceState->getSurfaceBaseAddress();

@ -1025,8 +1025,8 @@ TEST_F(KernelGlobalSurfaceTest, givenStatelessKernelWhenKernelIsCreatedThenGloba

    ASSERT_EQ(CL_SUCCESS, pKernel->initialize());

-    EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize());
-    EXPECT_EQ(nullptr, pKernel->getSurfaceStateHeap());
+    EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));
+    EXPECT_EQ(nullptr, pKernel->getSurfaceStateHeap(rootDeviceIndex));

    program.setGlobalSurface(nullptr);
    delete pKernel;
@ -1156,11 +1156,11 @@ HWTEST_F(KernelConstantSurfaceTest, givenStatefulKernelWhenKernelIsCreatedThenCo

    ASSERT_EQ(CL_SUCCESS, pKernel->initialize());

-    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize());
+    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));

    typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                  pKernelInfo->patchInfo.pAllocateStatelessConstantMemorySurfaceWithInitialization->SurfaceStateHeapOffset));
    auto surfaceAddress = surfaceState->getSurfaceBaseAddress();

@ -1197,8 +1197,8 @@ TEST_F(KernelConstantSurfaceTest, givenStatelessKernelWhenKernelIsCreatedThenCon

    ASSERT_EQ(CL_SUCCESS, pKernel->initialize());

-    EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize());
-    EXPECT_EQ(nullptr, pKernel->getSurfaceStateHeap());
+    EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));
+    EXPECT_EQ(nullptr, pKernel->getSurfaceStateHeap(rootDeviceIndex));

    program.setConstantSurface(nullptr);
    delete pKernel;
@ -1238,11 +1238,11 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelEventPoolSurfaceTest, givenStatefulKernelWhenK

    ASSERT_EQ(CL_SUCCESS, pKernel->initialize());

-    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize());
+    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));

    typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                  pKernelInfo->patchInfo.pAllocateStatelessEventPoolSurface->SurfaceStateHeapOffset));
    auto surfaceAddress = surfaceState->getSurfaceBaseAddress();

@ -1291,7 +1291,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelEventPoolSurfaceTest, givenStatefulKernelWhenE

    typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                  pKernelInfo->patchInfo.pAllocateStatelessEventPoolSurface->SurfaceStateHeapOffset));
    auto surfaceAddress = surfaceState->getSurfaceBaseAddress();

@ -1363,7 +1363,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelEventPoolSurfaceTest, givenStatelessKernelWhen

    ASSERT_EQ(CL_SUCCESS, pKernel->initialize());
    if (pClDevice->areOcl21FeaturesSupported() == false) {
-        EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize());
+        EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));
    } else {
    }

@ -1442,11 +1442,11 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelDefaultDeviceQueueSurfaceTest, givenStatefulKe

    ASSERT_EQ(CL_SUCCESS, pKernel->initialize());

-    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize());
+    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));

    typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                  pKernelInfo->patchInfo.pAllocateStatelessDefaultDeviceQueueSurface->SurfaceStateHeapOffset));
    auto surfaceAddress = surfaceState->getSurfaceBaseAddress();

@ -1493,11 +1493,11 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelDefaultDeviceQueueSurfaceTest, givenStatefulKe

    pKernel->patchDefaultDeviceQueue(pDevQueue);

-    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize());
+    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));

    typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                  pKernelInfo->patchInfo.pAllocateStatelessDefaultDeviceQueueSurface->SurfaceStateHeapOffset));
    auto surfaceAddress = surfaceState->getSurfaceBaseAddress();

@ -1537,7 +1537,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelDefaultDeviceQueueSurfaceTest, givenStatelessK

    ASSERT_EQ(CL_SUCCESS, pKernel->initialize());

-    EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize());
+    EXPECT_EQ(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));

    delete pKernel;
 }
--- a/opencl/test/unit_test/kernel/kernel_transformable_tests.cpp
+++ b/opencl/test/unit_test/kernel/kernel_transformable_tests.cpp
@ -21,6 +21,7 @@ using namespace NEO;
 class KernelTransformableTest : public ::testing::Test {
  public:
    void SetUp() override {
+        rootDeviceIndex = context.getDevice(0)->getRootDeviceIndex();
        pKernelInfo = std::make_unique<KernelInfo>();
        KernelArgPatchInfo kernelArgPatchInfo;

@ -74,6 +75,7 @@ class KernelTransformableTest : public ::testing::Test {
    std::unique_ptr<Image> image;
    SKernelBinaryHeaderCommon kernelHeader;
    char surfaceStateHeap[0x80];
+    uint32_t rootDeviceIndex = std::numeric_limits<uint32_t>::max();
 };

 HWTEST_F(KernelTransformableTest, givenKernelThatCannotTranformImagesWithTwoTransformableImagesAndTwoTransformableSamplersWhenAllArgsAreSetThenImagesAreNotTransformed) {
@ -93,7 +95,7 @@ HWTEST_F(KernelTransformableTest, givenKernelThatCannotTranformImagesWithTwoTran
    pKernel->setArg(2, sizeof(clImage), &clImage);
    pKernel->setArg(3, sizeof(clImage), &clImage);

-    auto ssh = pKernel->getSurfaceStateHeap();
+    auto ssh = pKernel->getSurfaceStateHeap(rootDeviceIndex);

    auto firstSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh, firstImageOffset));
    EXPECT_EQ(SURFACE_TYPE::SURFACE_TYPE_SURFTYPE_3D, firstSurfaceState->getSurfaceType());
@ -120,7 +122,7 @@ HWTEST_F(KernelTransformableTest, givenKernelWithTwoTransformableImagesAndTwoTra
    pKernel->setArg(2, sizeof(clImage), &clImage);
    pKernel->setArg(3, sizeof(clImage), &clImage);

-    auto ssh = pKernel->getSurfaceStateHeap();
+    auto ssh = pKernel->getSurfaceStateHeap(rootDeviceIndex);

    auto firstSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh, firstImageOffset));
    EXPECT_EQ(SURFACE_TYPE::SURFACE_TYPE_SURFTYPE_2D, firstSurfaceState->getSurfaceType());
@ -147,7 +149,7 @@ HWTEST_F(KernelTransformableTest, givenKernelWithTwoTransformableImagesAndTwoTra
    pKernel->setArg(2, sizeof(clImage), &clImage);
    pKernel->setArg(3, sizeof(clImage), &clImage);

-    auto ssh = pKernel->getSurfaceStateHeap();
+    auto ssh = pKernel->getSurfaceStateHeap(rootDeviceIndex);

    auto firstSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh, firstImageOffset));
    auto secondSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh, secondImageOffset));
@ -179,7 +181,7 @@ HWTEST_F(KernelTransformableTest, givenKernelWithOneTransformableImageAndTwoTran
    pKernel->setArg(2, sizeof(clImage), &clImage);
    pKernel->setArg(3, sizeof(clImage), &clImage);

-    auto ssh = pKernel->getSurfaceStateHeap();
+    auto ssh = pKernel->getSurfaceStateHeap(rootDeviceIndex);

    auto firstSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh, firstImageOffset));
    auto secondSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh, secondImageOffset));
@ -201,7 +203,7 @@ HWTEST_F(KernelTransformableTest, givenKernelWithImages2dAndTwoTransformableSamp
    pKernelInfo->kernelArgInfo[2].isTransformable = true;
    pKernelInfo->kernelArgInfo[3].isTransformable = true;

-    auto ssh = pKernel->getSurfaceStateHeap();
+    auto ssh = pKernel->getSurfaceStateHeap(rootDeviceIndex);

    auto firstSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh, firstImageOffset));
    auto secondSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh, secondImageOffset));
@ -233,7 +235,7 @@ HWTEST_F(KernelTransformableTest, givenKernelWithTwoTransformableImagesAndTwoTra
    pKernel->setArg(2, sizeof(clImage), &clImage);
    pKernel->setArg(3, sizeof(clImage), &clImage);

-    auto ssh = pKernel->getSurfaceStateHeap();
+    auto ssh = pKernel->getSurfaceStateHeap(rootDeviceIndex);

    auto firstSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh, firstImageOffset));
    auto secondSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh, secondImageOffset));
@ -265,7 +267,7 @@ HWTEST_F(KernelTransformableTest, givenKernelWithNonTransformableSamplersWhenRes
    pKernel->setArg(2, sizeof(clImage), &clImage);
    pKernel->setArg(3, sizeof(clImage), &clImage);

-    auto ssh = pKernel->getSurfaceStateHeap();
+    auto ssh = pKernel->getSurfaceStateHeap(rootDeviceIndex);

    auto firstSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh, firstImageOffset));
    auto secondSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh, secondImageOffset));
@ -303,7 +305,7 @@ HWTEST_F(KernelTransformableTest, givenKernelWithoutSamplersAndTransformableImag
    pKernel->setArg(2, sizeof(clImage), &clImage);
    pKernel->setArg(3, sizeof(clImage), &clImage);

-    auto ssh = pKernel->getSurfaceStateHeap();
+    auto ssh = pKernel->getSurfaceStateHeap(rootDeviceIndex);

    auto firstSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh, firstImageOffset));
    auto secondSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh, secondImageOffset));
--- a/opencl/test/unit_test/mem_obj/buffer_set_arg_tests.cpp
+++ b/opencl/test/unit_test/mem_obj/buffer_set_arg_tests.cpp
@ -125,7 +125,7 @@ HWTEST_F(BufferSetArgTest, givenSetArgBufferWhenNullArgStatefulThenProgramNullSu
    using SURFACE_FORMAT = typename RENDER_SURFACE_STATE::SURFACE_FORMAT;

    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                  pKernelInfo->kernelArgInfo[0].offsetHeap));

    pKernelInfo->requiresSshForBuffers = true;
@ -145,7 +145,7 @@ HWTEST_F(BufferSetArgTest, givenSetKernelArgOnReadOnlyBufferThatIsMisalingedWhen
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;

    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                  pKernelInfo->kernelArgInfo[0].offsetHeap));

    pKernelInfo->requiresSshForBuffers = true;
@ -186,7 +186,7 @@ HWTEST_F(BufferSetArgTest, givenSetArgBufferWithNullArgStatelessThenDontProgramN
 HWTEST_F(BufferSetArgTest, givenNonPureStatefulArgWhenRenderCompressedBufferIsSetThenSetNonAuxMode) {
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;

-    auto surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(), pKernelInfo->kernelArgInfo[0].offsetHeap));
+    auto surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex), pKernelInfo->kernelArgInfo[0].offsetHeap));
    auto graphicsAllocation = buffer->getGraphicsAllocation(pClDevice->getRootDeviceIndex());
    graphicsAllocation->setAllocationType(GraphicsAllocation::AllocationType::BUFFER_COMPRESSED);
    graphicsAllocation->setDefaultGmm(new Gmm(pDevice->getGmmClientContext(), graphicsAllocation->getUnderlyingBuffer(), buffer->getSize(), false));
--- a/opencl/test/unit_test/mem_obj/image_set_arg_tests.cpp
+++ b/opencl/test/unit_test/mem_obj/image_set_arg_tests.cpp
@ -120,7 +120,7 @@ HWTEST_F(ImageSetArgTest, WhenSettingKernelArgImageThenSurfaceBaseAddressIsSetCo
    typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;

    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                  pKernelInfo->kernelArgInfo[0].offsetHeap));

    srcImage->setImageArg(const_cast<RENDER_SURFACE_STATE *>(surfaceState), false, 0, pClDevice->getRootDeviceIndex());
@ -195,7 +195,7 @@ HWTEST_F(ImageSetArgTest, givenCubeMapIndexWhenSetKernelArgImageIsCalledThenModi
    src2dImage->setCubeFaceIndex(cubeFaceIndex);

    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                  pKernelInfo->kernelArgInfo[0].offsetHeap));

    src2dImage->setImageArg(const_cast<RENDER_SURFACE_STATE *>(surfaceState), false, 0, pClDevice->getRootDeviceIndex());
@ -298,7 +298,7 @@ HWTEST_F(ImageSetArgTest, givenNonCubeMapIndexWhenSetKernelArgImageIsCalledThenD
    typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;

    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                  pKernelInfo->kernelArgInfo[0].offsetHeap));

    EXPECT_EQ(srcImage->getCubeFaceIndex(), __GMM_NO_CUBE_MAP);
@ -327,7 +327,7 @@ HWTEST_F(ImageSetArgTest, givenOffsetedBufferWhenSetKernelArgImageIscalledThenFu
    typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;

    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                  pKernelInfo->kernelArgInfo[0].offsetHeap));

    auto graphicsAllocation = srcAllocation;
@ -357,7 +357,7 @@ HWTEST_F(ImageSetArgTest, WhenSettingKernelArgThenPropertiesAreSetCorrectly) {
    ASSERT_EQ(CL_SUCCESS, retVal);

    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                  pKernelInfo->kernelArgInfo[0].offsetHeap));

    size_t rPitch = srcImage->getImageDesc().image_row_pitch;
@ -408,7 +408,7 @@ HWTEST_F(ImageSetArgTest, givenImage2DWithMipMapsWhenSetKernelArgIsCalledThenMip
    ASSERT_EQ(CL_SUCCESS, retVal);

    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                  pKernelInfo->kernelArgInfo[0].offsetHeap));
    EXPECT_EQ((uint32_t)mipLevel, surfaceState->getSurfaceMinLod());
    EXPECT_EQ((uint32_t)mipCount, surfaceState->getMipCountLod() + 1);
@ -429,7 +429,7 @@ HWTEST_F(ImageSetArgTest, Given2dArrayWhenSettingKernelArgThenPropertiesAreSetCo
    ASSERT_EQ(CL_SUCCESS, retVal);

    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                  pKernelInfo->kernelArgInfo[0].offsetHeap));
    auto surfaceAddress = surfaceState->getSurfaceBaseAddress();

@ -477,7 +477,7 @@ HWTEST_F(ImageSetArgTest, Given1dArrayWhenSettingKernelArgThenPropertiesAreSetCo
    ASSERT_EQ(CL_SUCCESS, retVal);

    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                  pKernelInfo->kernelArgInfo[0].offsetHeap));
    auto surfaceAddress = surfaceState->getSurfaceBaseAddress();

@ -533,7 +533,7 @@ HWTEST_F(ImageSetArgTest, givenMcsAllocationWhenSetArgIsCalledWithoutUnifiedAuxC
    ASSERT_EQ(CL_SUCCESS, retVal);

    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                  pKernelInfo->kernelArgInfo[0].offsetHeap));

    EXPECT_FALSE(Image::isDepthFormat(image->getImageFormat()));
@ -569,7 +569,7 @@ HWTEST_F(ImageSetArgTest, givenDepthFormatWhenSetArgIsCalledThenProgramAuxFields
    ASSERT_EQ(CL_SUCCESS, retVal);

    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                  pKernelInfo->kernelArgInfo[0].offsetHeap));

    EXPECT_TRUE(Image::isDepthFormat(image->getImageFormat()));
@ -600,7 +600,7 @@ HWTEST_F(ImageSetArgTest, givenMultisampledR32Floatx8x24DepthStencilFormatWhenSe
    retVal = clSetKernelArg(pKernel, 0, sizeof(memObj), &memObj);
    ASSERT_EQ(CL_SUCCESS, retVal);

-    auto surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(),
+    auto surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                                                                           pKernelInfo->kernelArgInfo[0].offsetHeap));

    EXPECT_TRUE(Image::isDepthFormat(image->getImageFormat()));
@ -627,7 +627,7 @@ HWTEST_F(ImageSetArgTest, givenMcsAllocationAndRenderCompressionWhenSetArgOnMult
    retVal = clSetKernelArg(pKernel, 0, sizeof(memObj), &memObj);
    ASSERT_EQ(CL_SUCCESS, retVal);

-    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(),
+    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                                                                                 pKernelInfo->kernelArgInfo[0].offsetHeap));

    EXPECT_TRUE(surfaceState->getMultisampledSurfaceStorageFormat() ==
@ -657,7 +657,7 @@ HWTEST_F(ImageSetArgTest, givenDepthFormatAndRenderCompressionWhenSetArgOnMultis
    retVal = clSetKernelArg(pKernel, 0, sizeof(memObj), &memObj);
    ASSERT_EQ(CL_SUCCESS, retVal);

-    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(),
+    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                                                                                 pKernelInfo->kernelArgInfo[0].offsetHeap));

    EXPECT_TRUE(Image::isDepthFormat(image->getImageFormat()));
@ -692,7 +692,7 @@ HWTEST_F(ImageSetArgTest, givenMcsAllocationWhenSetArgIsCalledWithUnifiedAuxCapa
    retVal = clSetKernelArg(pKernel, 0, sizeof(memObj), &memObj);
    ASSERT_EQ(CL_SUCCESS, retVal);

-    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(),
+    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                                                                                 pKernelInfo->kernelArgInfo[0].offsetHeap));

    EXPECT_TRUE(surfaceState->getAuxiliarySurfaceMode() == AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_CCS_E);
@ -723,7 +723,7 @@ HWTEST_F(ImageSetArgTest, givenMcsAllocationWhenSetArgIsCalledWithUnifiedAuxCapa
    retVal = clSetKernelArg(pKernel, 0, sizeof(memObj), &memObj);
    ASSERT_EQ(CL_SUCCESS, retVal);

-    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(),
+    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                                                                                 pKernelInfo->kernelArgInfo[0].offsetHeap));

    EXPECT_NE(0u, surfaceState->getAuxiliarySurfaceBaseAddress());
@ -758,7 +758,7 @@ HWTEST_F(ImageSetArgTest, givenMcsAllocationWhenSetArgIsCalledWithUnifiedAuxCapa
    retVal = clSetKernelArg(pKernel, 0, sizeof(memObj), &memObj);
    ASSERT_EQ(CL_SUCCESS, retVal);

-    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(),
+    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                                                                                 pKernelInfo->kernelArgInfo[0].offsetHeap));

    EXPECT_EQ(pitchValue, surfaceState->getAuxiliarySurfacePitch());
@ -793,7 +793,7 @@ HWTEST_F(ImageSetArgTest, GivenImageFrom1dBufferWhenSettingKernelArgThenProperti
    ASSERT_EQ(CL_SUCCESS, retVal);

    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                  pKernelInfo->kernelArgInfo[0].offsetHeap));
    auto surfaceAddress = surfaceState->getSurfaceBaseAddress();
    auto image = castToObject<Image>(imageFromBuffer);
@ -837,7 +837,7 @@ HWTEST_F(ImageSetArgTest, GivenImageWithClLuminanceFormatWhenSettingKernelArgThe
    ASSERT_EQ(CL_SUCCESS, retVal);

    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                  pKernelInfo->kernelArgInfo[0].offsetHeap));
    //for CL_LUMINANCE format we override channels to RED to be spec complaint.
    EXPECT_EQ(RENDER_SURFACE_STATE::SHADER_CHANNEL_SELECT_RED, surfaceState->getShaderChannelSelectRed());
@ -963,7 +963,7 @@ HWTEST_F(ImageMediaBlockSetArgTest, WhenSettingKernelArgImageThenPropertiesAreCo
    ASSERT_EQ(CL_SUCCESS, retVal);

    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(pKernel->getSurfaceStateHeap(),
+        ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex),
                  pKernelInfo->kernelArgInfo[0].offsetHeap));

    size_t rPitch = srcImage->getImageDesc().image_row_pitch;
--- a/opencl/test/unit_test/memory_manager/memory_manager_tests.cpp
+++ b/opencl/test/unit_test/memory_manager/memory_manager_tests.cpp
@ -544,7 +544,7 @@ TEST_F(MemoryAllocatorTest, givenStatelessKernelWithPrintfWhenPrintfSurfaceIsCre

    EXPECT_EQ(allocationAddress, *(uintptr_t *)printfPatchAddress);

-    EXPECT_EQ(0u, kernel.mockKernel->getSurfaceStateHeapSize());
+    EXPECT_EQ(0u, kernel.mockKernel->getSurfaceStateHeapSize(rootDeviceIndex));

    delete printfHandler;
 }
@ -575,11 +575,11 @@ HWTEST_F(MemoryAllocatorTest, givenStatefulKernelWithPrintfWhenPrintfSurfaceIsCr
    auto printfAllocation = printfHandler->getSurface();
    auto allocationAddress = printfAllocation->getGpuAddress();

-    EXPECT_NE(0u, kernel.mockKernel->getSurfaceStateHeapSize());
+    EXPECT_NE(0u, kernel.mockKernel->getSurfaceStateHeapSize(device->getRootDeviceIndex()));

    typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
-        ptrOffset(kernel.mockKernel->getSurfaceStateHeap(),
+        ptrOffset(kernel.mockKernel->getSurfaceStateHeap(device->getRootDeviceIndex()),
                  kernel.mockKernel->getKernelInfo().patchInfo.pAllocateStatelessPrintfSurface->SurfaceStateHeapOffset));
    auto surfaceAddress = surfaceState->getSurfaceBaseAddress();

--- a/opencl/test/unit_test/mocks/mock_kernel.h
+++ b/opencl/test/unit_test/mocks/mock_kernel.h
@ -43,7 +43,6 @@ class MockKernel : public Kernel {
    using Kernel::numberOfBindingTableStates;
    using Kernel::patchBufferOffset;
    using Kernel::patchWithImplicitSurface;
-    using Kernel::sshLocalSize;
    using Kernel::svmAllocationsRequireCacheFlush;
    using Kernel::threadArbitrationPolicy;
    using Kernel::unifiedMemoryControls;
@ -181,15 +180,15 @@ class MockKernel : public Kernel {
        kernelDeviceInfos[rootDeviceIndex].crossThreadDataSize = static_cast<uint32_t>(mockCrossThreadData.size());
    }

-    void setSshLocal(const void *sshPattern, uint32_t newSshSize) {
-        sshLocalSize = newSshSize;
+    void setSshLocal(const void *sshPattern, uint32_t newSshSize, uint32_t rootDeviceIndex) {
+        kernelDeviceInfos[rootDeviceIndex].sshLocalSize = newSshSize;

        if (newSshSize == 0) {
-            pSshLocal.reset(nullptr);
+            kernelDeviceInfos[rootDeviceIndex].pSshLocal.reset(nullptr);
        } else {
-            pSshLocal = std::make_unique<char[]>(newSshSize);
+            kernelDeviceInfos[rootDeviceIndex].pSshLocal = std::make_unique<char[]>(newSshSize);
            if (sshPattern) {
-                memcpy_s(pSshLocal.get(), newSshSize, sshPattern, newSshSize);
+                memcpy_s(kernelDeviceInfos[rootDeviceIndex].pSshLocal.get(), newSshSize, sshPattern, newSshSize);
            }
        }
    }
@ -291,7 +290,7 @@ class MockKernelWithInternals {
        mockProgram = new MockProgram(context, false, deviceVector);
        mockKernel = new MockKernel(mockProgram, kernelInfo);
        mockKernel->setCrossThreadData(&crossThreadData, sizeof(crossThreadData));
-        mockKernel->setSshLocal(&sshLocal, sizeof(sshLocal));
+        mockKernel->setSshLocal(&sshLocal, sizeof(sshLocal), deviceArg.getRootDeviceIndex());

        if (addDefaultArg) {
            defaultKernelArguments.resize(2);
@ -358,10 +357,9 @@ class MockKernelWithInternals {
 class MockParentKernel : public Kernel {
  public:
    using Kernel::auxTranslationRequired;
+    using Kernel::kernelDeviceInfos;
    using Kernel::kernelInfo;
    using Kernel::patchBlocksCurbeWithConstantValues;
-    using Kernel::pSshLocal;
-    using Kernel::sshLocalSize;

    static MockParentKernel *create(Context &context, bool addChildSimdSize = false, bool addChildGlobalMemory = false, bool addChildConstantMemory = false, bool addPrintfForParent = true, bool addPrintfForBlock = true) {
        auto clDevice = context.getDevice(0);