Pass root device index to get proper kernel info

Related-To: NEO-5001 Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
2020-12-08 12:11:40 +00:00 · 2020-12-08 12:11:40 +00:00 · 350ec9f16b
parent d3f3730989
commit 350ec9f16b
49 changed files with 239 additions and 217 deletions
--- a/opencl/source/api/api.cpp
+++ b/opencl/source/api/api.cpp
@ -3410,7 +3410,7 @@ cl_int CL_API_CALL clEnqueueNDRangeKernel(cl_command_queue commandQueue,
    }

    if ((pKernel->getExecutionType() != KernelExecutionType::Default) ||
-        pKernel->isUsingSyncBuffer()) {
+        pKernel->isUsingSyncBuffer(pCommandQueue->getDevice().getRootDeviceIndex())) {
        retVal = CL_INVALID_KERNEL;
        TRACING_EXIT(clEnqueueNDRangeKernel, &retVal);
        return retVal;
@ -5818,7 +5818,9 @@ cl_int CL_API_CALL clEnqueueNDCountKernelINTEL(cl_command_queue commandQueue,
        return retVal;
    }

-    auto &hardwareInfo = pKernel->getDevices()[0]->getHardwareInfo();
+    auto &device = pCommandQueue->getClDevice();
+    auto rootDeviceIndex = device.getRootDeviceIndex();
+    auto &hardwareInfo = device.getHardwareInfo();
    auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
    if (!hwHelper.isCooperativeDispatchSupported(pCommandQueue->getGpgpuEngine().getEngineType(), hardwareInfo.platform.eProductFamily)) {
        retVal = CL_INVALID_COMMAND_QUEUE;
@ -5842,13 +5844,13 @@ cl_int CL_API_CALL clEnqueueNDCountKernelINTEL(cl_command_queue commandQueue,
        }
    }

-    if (pKernel->isUsingSyncBuffer()) {
+    if (pKernel->isUsingSyncBuffer(rootDeviceIndex)) {
        if (pKernel->getExecutionType() != KernelExecutionType::Concurrent) {
            retVal = CL_INVALID_KERNEL;
            return retVal;
        }

-        pCommandQueue->getDevice().getSpecializedDevice<ClDevice>()->allocateSyncBufferHandler();
+        device.allocateSyncBufferHandler();
    }

    if (!pCommandQueue->validateCapabilityForOperation(CL_QUEUE_CAPABILITY_KERNEL_INTEL, eventWaitList, event)) {
--- a/opencl/source/command_queue/command_queue.cpp
+++ b/opencl/source/command_queue/command_queue.cpp
@ -529,7 +529,7 @@ bool CommandQueue::setupDebugSurface(Kernel *kernel) {
    auto debugSurface = getGpgpuCommandStreamReceiver().getDebugSurfaceAllocation();

    auto rootDeviceIndex = device->getRootDeviceIndex();
-    DEBUG_BREAK_IF(!kernel->requiresSshForBuffers());
+    DEBUG_BREAK_IF(!kernel->requiresSshForBuffers(rootDeviceIndex));
    auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(kernel->getSurfaceStateHeap(rootDeviceIndex)),
                                  kernel->getKernelInfo(rootDeviceIndex).patchInfo.pAllocateSystemThreadSurface->Offset);
    void *addressToPatch = reinterpret_cast<void *>(debugSurface->getGpuAddress());
--- a/opencl/source/command_queue/enqueue_common.h
+++ b/opencl/source/command_queue/enqueue_common.h
@ -64,16 +64,16 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface *(&surfaces)[surfaceCount
    if (DebugManager.flags.ForceDispatchScheduler.get()) {
        forceDispatchScheduler(multiDispatchInfo);
    } else {
+        auto rootDeviceIndex = device->getRootDeviceIndex();
        if (kernel->isAuxTranslationRequired()) {
            auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(EBuiltInOps::AuxTranslation, getClDevice());
            builtInLock.takeOwnership(builder);
-            kernel->fillWithBuffersForAuxTranslation(memObjsForAuxTranslation);
+            kernel->fillWithBuffersForAuxTranslation(memObjsForAuxTranslation, rootDeviceIndex);
            multiDispatchInfo.setMemObjsForAuxTranslation(memObjsForAuxTranslation);
            if (!memObjsForAuxTranslation.empty()) {
                dispatchAuxTranslationBuiltin(multiDispatchInfo, AuxTranslationDirection::AuxToNonAux);
            }
        }
-        auto rootDeviceIndex = device->getRootDeviceIndex();
        if (kernel->getKernelInfo(rootDeviceIndex).builtinDispatchBuilder == nullptr) {
            DispatchInfoBuilder<SplitDispatch::Dim::d3D, SplitDispatch::SplitMode::WalkerSplit> builder(getClDevice());
            builder.setDispatchGeometry(workDim, workItems, enqueuedWorkSizes, globalOffsets, Vec3<size_t>{0, 0, 0}, localWorkSizesIn);
@ -373,7 +373,7 @@ void CommandQueueHw<GfxFamily>::processDispatchForKernels(const MultiDispatchInf
        printfHandler->prepareDispatch(multiDispatchInfo);
    }

-    if (multiDispatchInfo.peekMainKernel()->usesSyncBuffer()) {
+    if (multiDispatchInfo.peekMainKernel()->usesSyncBuffer(device->getRootDeviceIndex())) {
        auto &gws = multiDispatchInfo.begin()->getGWS();
        auto &lws = multiDispatchInfo.begin()->getLocalWorkgroupSize();
        size_t workGroupsCount = (gws.x * gws.y * gws.z) /
@ -649,7 +649,8 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(
        printfHandler->makeResident(getGpgpuCommandStreamReceiver());
    }

-    if (multiDispatchInfo.peekMainKernel()->usesSyncBuffer()) {
+    auto rootDeviceIndex = device->getRootDeviceIndex();
+    if (multiDispatchInfo.peekMainKernel()->usesSyncBuffer(rootDeviceIndex)) {
        device->syncBufferHandler->makeResident(getGpgpuCommandStreamReceiver());
    }

@ -674,7 +675,6 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(
    auto specialPipelineSelectMode = false;
    Kernel *kernel = nullptr;
    bool usePerDssBackedBuffer = false;
-    auto rootDeviceIndex = device->getRootDeviceIndex();

    for (auto &dispatchInfo : multiDispatchInfo) {
        if (kernel != dispatchInfo.getKernel()) {
--- a/opencl/source/context/context.cpp
+++ b/opencl/source/context/context.cpp
@ -412,7 +412,7 @@ SchedulerKernel &Context::getSchedulerKernel() {
            kernelInfos,
            &retVal);

-        UNRECOVERABLE_IF(schedulerBuiltIn->pKernel->getScratchSize() != 0);
+        UNRECOVERABLE_IF(schedulerBuiltIn->pKernel->getScratchSize(clDevice->getRootDeviceIndex()) != 0);

        DEBUG_BREAK_IF(retVal != CL_SUCCESS);
    };
--- a/opencl/source/device_queue/device_queue_hw_bdw_plus.inl
+++ b/opencl/source/device_queue/device_queue_hw_bdw_plus.inl
@ -151,13 +151,13 @@ template <typename GfxFamily>
 void DeviceQueueHw<GfxFamily>::setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount, bool isCcsUsed) {
    using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
    void *pDSH = dynamicStateHeap.getCpuBase();
-
+    auto rootDeviceIndex = device->getRootDeviceIndex();
    // Set scheduler ID to last entry in first table, it will have ID == 0, blocks will have following entries.
    auto igilCmdQueue = reinterpret_cast<IGIL_CommandQueue *>(queueBuffer->getUnderlyingBuffer());
    igilCmdQueue->m_controls.m_IDTstart = colorCalcStateSize + sizeof(INTERFACE_DESCRIPTOR_DATA) * (interfaceDescriptorEntries - 2);

    // Parent's dsh is located after ColorCalcState and 2 ID tables
-    igilCmdQueue->m_controls.m_DynamicHeapStart = offsetDsh + alignUp((uint32_t)parentKernel->getDynamicStateHeapSize(), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
+    igilCmdQueue->m_controls.m_DynamicHeapStart = offsetDsh + alignUp(static_cast<uint32_t>(parentKernel->getDynamicStateHeapSize(rootDeviceIndex)), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
    igilCmdQueue->m_controls.m_DynamicHeapSizeInBytes = (uint32_t)dshBuffer->getUnderlyingBufferSize();

    igilCmdQueue->m_controls.m_CurrentDSHoffset = igilCmdQueue->m_controls.m_DynamicHeapStart;
--- a/opencl/source/gtpin/gtpin_callbacks.cpp
+++ b/opencl/source/gtpin/gtpin_callbacks.cpp
@ -73,7 +73,7 @@ void gtpinNotifyKernelCreate(cl_kernel kernel) {
            // Kernel with no SSH or Kernel EM, not supported
            return;
        }
-        if (pKernel->isKernelHeapSubstituted()) {
+        if (pKernel->isKernelHeapSubstituted(rootDeviceIndex)) {
            // ISA for this kernel was already substituted
            return;
        }
@ -85,8 +85,8 @@ void gtpinNotifyKernelCreate(cl_kernel kernel) {

        paramsIn.kernel_type = GTPIN_KERNEL_TYPE_CS;
        paramsIn.simd = (GTPIN_SIMD_WIDTH)kernelInfo.getMaxSimdSize();
-        paramsIn.orig_kernel_binary = (uint8_t *)pKernel->getKernelHeap();
-        paramsIn.orig_kernel_size = static_cast<uint32_t>(pKernel->getKernelHeapSize());
+        paramsIn.orig_kernel_binary = (uint8_t *)pKernel->getKernelHeap(rootDeviceIndex);
+        paramsIn.orig_kernel_size = static_cast<uint32_t>(pKernel->getKernelHeapSize(rootDeviceIndex));
        paramsIn.buffer_type = GTPIN_BUFFER_BINDFULL;
        paramsIn.buffer_desc.BTI = static_cast<uint32_t>(gtpinBTI);
        paramsIn.igc_hash_id = kernelInfo.shaderHashCode;
@ -97,8 +97,8 @@ void gtpinNotifyKernelCreate(cl_kernel kernel) {
        instrument_params_out_t paramsOut = {0};
        (*GTPinCallbacks.onKernelCreate)((context_handle_t)(cl_context)context, &paramsIn, &paramsOut);
        // Substitute ISA of created kernel with instrumented code
-        pKernel->substituteKernelHeap(paramsOut.inst_kernel_binary, paramsOut.inst_kernel_size);
-        pKernel->setKernelId(paramsOut.kernel_id);
+        pKernel->substituteKernelHeap(rootDeviceIndex, paramsOut.inst_kernel_binary, paramsOut.inst_kernel_size);
+        pKernel->setKernelId(rootDeviceIndex, paramsOut.kernel_id);
    }
 }

@ -106,14 +106,15 @@ void gtpinNotifyKernelSubmit(cl_kernel kernel, void *pCmdQueue) {
    if (isGTPinInitialized) {
        auto pCmdQ = reinterpret_cast<CommandQueue *>(pCmdQueue);
        auto &device = pCmdQ->getDevice();
+        auto rootDeviceIndex = device.getRootDeviceIndex();
        auto pKernel = castToObjectOrAbort<Kernel>(kernel);
-        if (pKernel->isParentKernel || pKernel->getSurfaceStateHeapSize(device.getRootDeviceIndex()) == 0) {
+        if (pKernel->isParentKernel || pKernel->getSurfaceStateHeapSize(rootDeviceIndex) == 0) {
            // Kernel with no SSH, not supported
            return;
        }
        Context *pContext = &(pKernel->getContext());
        cl_context context = (cl_context)pContext;
-        uint64_t kernelId = pKernel->getKernelId();
+        uint64_t kernelId = pKernel->getKernelId(rootDeviceIndex);
        command_buffer_handle_t commandBuffer = (command_buffer_handle_t)((uintptr_t)(sequenceCount++));
        uint32_t kernelOffset = 0;
        resource_handle_t resource = 0;
@ -138,7 +139,7 @@ void gtpinNotifyKernelSubmit(cl_kernel kernel, void *pCmdQueue) {
        GFXCORE_FAMILY genFamily = device.getHardwareInfo().platform.eRenderCoreFamily;
        GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(genFamily);
        size_t gtpinBTI = pKernel->getNumberOfBindingTableStates() - 1;
-        void *pSurfaceState = gtpinHelper.getSurfaceState(pKernel, gtpinBTI, device.getRootDeviceIndex());
+        void *pSurfaceState = gtpinHelper.getSurfaceState(pKernel, gtpinBTI, rootDeviceIndex);
        cl_mem buffer = (cl_mem)resource;
        auto pBuffer = castToObjectOrAbort<Buffer>(buffer);
        pBuffer->setArgStateful(pSurfaceState, false, false, false, false, device);
--- a/opencl/source/helpers/dispatch_info.cpp
+++ b/opencl/source/helpers/dispatch_info.cpp
@ -19,11 +19,11 @@ bool DispatchInfo::usesStatelessPrintfSurface() const {
 }

 uint32_t DispatchInfo::getRequiredScratchSize() const {
-    return (kernel == nullptr) ? 0 : kernel->getScratchSize();
+    return (kernel == nullptr) ? 0 : kernel->getScratchSize(pClDevice->getRootDeviceIndex());
 }

 uint32_t DispatchInfo::getRequiredPrivateScratchSize() const {
-    return (kernel == nullptr) ? 0 : kernel->getPrivateScratchSize();
+    return (kernel == nullptr) ? 0 : kernel->getPrivateScratchSize(pClDevice->getRootDeviceIndex());
 }

 Kernel *MultiDispatchInfo::peekMainKernel() const {
--- a/opencl/source/helpers/hardware_commands_helper_base.inl
+++ b/opencl/source/helpers/hardware_commands_helper_base.inl
@ -52,7 +52,7 @@ size_t HardwareCommandsHelper<GfxFamily>::getSizeRequiredDSH(

    totalSize += borderColorSize + additionalSizeRequiredDsh();

-    DEBUG_BREAK_IF(!(totalSize >= kernel.getDynamicStateHeapSize() || kernel.getKernelInfo(rootDeviceIndex).isVmeWorkload));
+    DEBUG_BREAK_IF(!(totalSize >= kernel.getDynamicStateHeapSize(rootDeviceIndex) || kernel.getKernelInfo(rootDeviceIndex).isVmeWorkload));

    return alignUp(totalSize, EncodeStates<GfxFamily>::alignInterfaceDescriptorData);
 }
@ -245,7 +245,9 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
    uint32_t samplerCount = 0;
    if (patchInfo.samplerStateArray) {
        samplerCount = patchInfo.samplerStateArray->Count;
-        samplerStateOffset = EncodeStates<GfxFamily>::copySamplerState(&dsh, patchInfo.samplerStateArray->Offset, samplerCount, patchInfo.samplerStateArray->BorderColorOffset, kernel.getDynamicStateHeap(), device.getBindlessHeapsHelper());
+        samplerStateOffset = EncodeStates<GfxFamily>::copySamplerState(&dsh, patchInfo.samplerStateArray->Offset,
+                                                                       samplerCount, patchInfo.samplerStateArray->BorderColorOffset,
+                                                                       kernel.getDynamicStateHeap(rootDeviceIndex), device.getBindlessHeapsHelper());
    }

    auto threadPayload = kernelInfo.patchInfo.threadPayload;
--- a/opencl/source/kernel/get_additional_kernel_info.cpp
+++ b/opencl/source/kernel/get_additional_kernel_info.cpp
@ -11,6 +11,6 @@ namespace NEO {
 void Kernel::getAdditionalInfo(cl_kernel_info paramName, const void *&paramValue, size_t &paramValueSizeRet) const {
 }

-void Kernel::getAdditionalWorkGroupInfo(cl_kernel_work_group_info paramName, const void *&paramValue, size_t &paramValueSizeRet) const {
+void Kernel::getAdditionalWorkGroupInfo(cl_kernel_work_group_info paramName, const void *&paramValue, size_t &paramValueSizeRet, uint32_t rootDeviceIndex) const {
 }
 } // namespace NEO
--- a/opencl/source/kernel/kernel.cpp
+++ b/opencl/source/kernel/kernel.cpp
@ -290,7 +290,7 @@ cl_int Kernel::initialize() {
        }

        if (patchInfo.pAllocateStatelessEventPoolSurface) {
-            if (requiresSshForBuffers()) {
+            if (requiresSshForBuffers(rootDeviceIndex)) {
                auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap(rootDeviceIndex)),
                                              patchInfo.pAllocateStatelessEventPoolSurface->SurfaceStateHeapOffset);
                Buffer::setSurfaceState(&pClDevice->getDevice(), surfaceState, 0, nullptr, 0, nullptr, 0, 0);
@ -299,7 +299,7 @@ cl_int Kernel::initialize() {

        if (patchInfo.pAllocateStatelessDefaultDeviceQueueSurface) {

-            if (requiresSshForBuffers()) {
+            if (requiresSshForBuffers(rootDeviceIndex)) {
                auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap(rootDeviceIndex)),
                                              patchInfo.pAllocateStatelessDefaultDeviceQueueSurface->SurfaceStateHeapOffset);
                Buffer::setSurfaceState(&pClDevice->getDevice(), surfaceState, 0, nullptr, 0, nullptr, 0, 0);
@ -442,16 +442,18 @@ cl_int Kernel::getInfo(cl_kernel_info paramName, size_t paramValueSize,
    const _cl_context *ctxt;
    cl_uint refCount = 0;
    uint64_t nonCannonizedGpuAddress = 0llu;
+    auto defaultRootDeviceIndex = getDevices()[0]->getRootDeviceIndex();
+    auto &defaultKernelInfo = getKernelInfo(defaultRootDeviceIndex);

    switch (paramName) {
    case CL_KERNEL_FUNCTION_NAME:
-        pSrc = getDefaultKernelInfo().kernelDescriptor.kernelMetadata.kernelName.c_str();
-        srcSize = getDefaultKernelInfo().kernelDescriptor.kernelMetadata.kernelName.length() + 1;
+        pSrc = defaultKernelInfo.kernelDescriptor.kernelMetadata.kernelName.c_str();
+        srcSize = defaultKernelInfo.kernelDescriptor.kernelMetadata.kernelName.length() + 1;
        break;

    case CL_KERNEL_NUM_ARGS:
        srcSize = sizeof(cl_uint);
-        numArgs = (cl_uint)getDefaultKernelInfo().kernelArgInfo.size();
+        numArgs = static_cast<cl_uint>(defaultKernelInfo.kernelArgInfo.size());
        pSrc = &numArgs;
        break;

@ -474,16 +476,16 @@ cl_int Kernel::getInfo(cl_kernel_info paramName, size_t paramValueSize,
        break;

    case CL_KERNEL_ATTRIBUTES:
-        pSrc = getDefaultKernelInfo().attributes.c_str();
-        srcSize = getDefaultKernelInfo().attributes.length() + 1;
+        pSrc = defaultKernelInfo.attributes.c_str();
+        srcSize = defaultKernelInfo.attributes.length() + 1;
        break;

    case CL_KERNEL_BINARY_PROGRAM_INTEL:
-        pSrc = getKernelHeap();
-        srcSize = getKernelHeapSize();
+        pSrc = getKernelHeap(defaultRootDeviceIndex);
+        srcSize = getKernelHeapSize(defaultRootDeviceIndex);
        break;
    case CL_KERNEL_BINARY_GPU_ADDRESS_INTEL:
-        nonCannonizedGpuAddress = GmmHelper::decanonize(getDefaultKernelInfo().kernelAllocation->getGpuAddress());
+        nonCannonizedGpuAddress = GmmHelper::decanonize(defaultKernelInfo.kernelAllocation->getGpuAddress());
        pSrc = &nonCannonizedGpuAddress;
        srcSize = sizeof(nonCannonizedGpuAddress);
        break;
@ -504,8 +506,9 @@ cl_int Kernel::getArgInfo(cl_uint argIndx, cl_kernel_arg_info paramName, size_t
    cl_int retVal;
    const void *pSrc = nullptr;
    size_t srcSize = GetInfo::invalidSourceSize;
-    auto numArgs = (cl_uint)getDefaultKernelInfo().kernelArgInfo.size();
-    const auto &argInfo = getDefaultKernelInfo().kernelArgInfo[argIndx];
+    auto &defaultKernelInfo = getDefaultKernelInfo();
+    auto numArgs = static_cast<cl_uint>(defaultKernelInfo.kernelArgInfo.size());
+    const auto &argInfo = defaultKernelInfo.kernelArgInfo[argIndx];

    if (argIndx >= numArgs) {
        retVal = CL_INVALID_ARG_INDEX;
@ -626,7 +629,7 @@ cl_int Kernel::getWorkGroupInfo(ClDevice &device, cl_kernel_work_group_info para
        pSrc = &privateMemSize;
        break;
    default:
-        getAdditionalWorkGroupInfo(paramName, pSrc, srcSize);
+        getAdditionalWorkGroupInfo(paramName, pSrc, srcSize, rootDeviceIndex);
        break;
    }

@ -742,16 +745,16 @@ cl_int Kernel::getSubGroupInfo(ClDevice &clDevice, cl_kernel_sub_group_info para
    }
 }

-const void *Kernel::getKernelHeap() const {
-    return getDefaultKernelInfo().heapInfo.pKernelHeap;
+const void *Kernel::getKernelHeap(uint32_t rootDeviceIndex) const {
+    return getKernelInfo(rootDeviceIndex).heapInfo.pKernelHeap;
 }

-size_t Kernel::getKernelHeapSize() const {
-    return getDefaultKernelInfo().heapInfo.KernelHeapSize;
+size_t Kernel::getKernelHeapSize(uint32_t rootDeviceIndex) const {
+    return getKernelInfo(rootDeviceIndex).heapInfo.KernelHeapSize;
 }

-void Kernel::substituteKernelHeap(void *newKernelHeap, size_t newKernelHeapSize) {
-    KernelInfo *pKernelInfo = const_cast<KernelInfo *>(&getDefaultKernelInfo());
+void Kernel::substituteKernelHeap(uint32_t rootDeviceIndex, void *newKernelHeap, size_t newKernelHeapSize) {
+    KernelInfo *pKernelInfo = const_cast<KernelInfo *>(&getKernelInfo(rootDeviceIndex));
    void **pKernelHeap = const_cast<void **>(&pKernelInfo->heapInfo.pKernelHeap);
    *pKernelHeap = newKernelHeap;
    auto &heapInfo = pKernelInfo->heapInfo;
@ -771,16 +774,16 @@ void Kernel::substituteKernelHeap(void *newKernelHeap, size_t newKernelHeapSize)
    UNRECOVERABLE_IF(!status);
 }

-bool Kernel::isKernelHeapSubstituted() const {
-    return getDefaultKernelInfo().isKernelHeapSubstituted;
+bool Kernel::isKernelHeapSubstituted(uint32_t rootDeviceIndex) const {
+    return getKernelInfo(rootDeviceIndex).isKernelHeapSubstituted;
 }

-uint64_t Kernel::getKernelId() const {
-    return getDefaultKernelInfo().kernelId;
+uint64_t Kernel::getKernelId(uint32_t rootDeviceIndex) const {
+    return getKernelInfo(rootDeviceIndex).kernelId;
 }

-void Kernel::setKernelId(uint64_t newKernelId) {
-    KernelInfo *pKernelInfo = const_cast<KernelInfo *>(&getDefaultKernelInfo());
+void Kernel::setKernelId(uint32_t rootDeviceIndex, uint64_t newKernelId) {
+    KernelInfo *pKernelInfo = const_cast<KernelInfo *>(&getKernelInfo(rootDeviceIndex));
    pKernelInfo->kernelId = newKernelId;
 }
 uint32_t Kernel::getStartOffset() const {
@ -794,12 +797,12 @@ void *Kernel::getSurfaceStateHeap(uint32_t rootDeviceIndex) const {
    return kernelInfos[rootDeviceIndex]->usesSsh ? kernelDeviceInfos[rootDeviceIndex].pSshLocal.get() : nullptr;
 }

-size_t Kernel::getDynamicStateHeapSize() const {
-    return getDefaultKernelInfo().heapInfo.DynamicStateHeapSize;
+size_t Kernel::getDynamicStateHeapSize(uint32_t rootDeviceIndex) const {
+    return getKernelInfo(rootDeviceIndex).heapInfo.DynamicStateHeapSize;
 }

-const void *Kernel::getDynamicStateHeap() const {
-    return getDefaultKernelInfo().heapInfo.pDsh;
+const void *Kernel::getDynamicStateHeap(uint32_t rootDeviceIndex) const {
+    return getKernelInfo(rootDeviceIndex).heapInfo.pDsh;
 }

 size_t Kernel::getSurfaceStateHeapSize(uint32_t rootDeviceIndex) const {
@ -823,8 +826,9 @@ cl_int Kernel::setArg(uint32_t argIndex, size_t argSize, const void *argVal) {
    cl_int retVal = CL_SUCCESS;
    bool updateExposedKernel = true;
    auto argWasUncacheable = false;
-    if (getDefaultKernelInfo().builtinDispatchBuilder != nullptr) {
-        updateExposedKernel = getDefaultKernelInfo().builtinDispatchBuilder->setExplicitArg(argIndex, argSize, argVal, retVal);
+    auto &defaultKernelInfo = getDefaultKernelInfo();
+    if (defaultKernelInfo.builtinDispatchBuilder != nullptr) {
+        updateExposedKernel = defaultKernelInfo.builtinDispatchBuilder->setExplicitArg(argIndex, argSize, argVal, retVal);
    }
    if (updateExposedKernel) {
        if (argIndex >= kernelArgHandlers.size()) {
@ -882,13 +886,14 @@ void *Kernel::patchBufferOffset(const KernelArgInfo &argInfo, void *svmPtr, Grap

 cl_int Kernel::setArgSvm(uint32_t argIndex, size_t svmAllocSize, void *svmPtr, GraphicsAllocation *svmAlloc, cl_mem_flags svmFlags) {
    auto rootDeviceIndex = getDevice().getRootDeviceIndex();
-    void *ptrToPatch = patchBufferOffset(getDefaultKernelInfo().kernelArgInfo[argIndex], svmPtr, svmAlloc, rootDeviceIndex);
+    auto &kernelInfo = getKernelInfo(rootDeviceIndex);
+    void *ptrToPatch = patchBufferOffset(kernelInfo.kernelArgInfo[argIndex], svmPtr, svmAlloc, rootDeviceIndex);
    setArgImmediate(argIndex, sizeof(void *), &svmPtr);

    storeKernelArg(argIndex, SVM_OBJ, nullptr, svmPtr, sizeof(void *), svmAlloc, svmFlags);

-    if (requiresSshForBuffers()) {
-        const auto &kernelArgInfo = getDefaultKernelInfo().kernelArgInfo[argIndex];
+    if (requiresSshForBuffers(rootDeviceIndex)) {
+        const auto &kernelArgInfo = kernelInfo.kernelArgInfo[argIndex];
        auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap);
        Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, svmAllocSize + ptrDiff(svmPtr, ptrToPatch), ptrToPatch, 0, svmAlloc, svmFlags, 0);
    }
@ -905,7 +910,8 @@ cl_int Kernel::setArgSvmAlloc(uint32_t argIndex, void *svmPtr, GraphicsAllocatio
    DBG_LOG_INPUTS("setArgBuffer svm_alloc", svmAlloc);

    auto rootDeviceIndex = getDevice().getRootDeviceIndex();
-    const auto &kernelArgInfo = getDefaultKernelInfo().kernelArgInfo[argIndex];
+    auto &kernelInfo = getKernelInfo(rootDeviceIndex);
+    const auto &kernelArgInfo = kernelInfo.kernelArgInfo[argIndex];

    storeKernelArg(argIndex, SVM_ALLOC_OBJ, svmAlloc, svmPtr, sizeof(uintptr_t));

@ -918,8 +924,8 @@ cl_int Kernel::setArgSvmAlloc(uint32_t argIndex, void *svmPtr, GraphicsAllocatio

    patchWithRequiredSize(patchLocation, patchSize, reinterpret_cast<uintptr_t>(svmPtr));

-    if (requiresSshForBuffers()) {
-        const auto &kernelArgInfo = getDefaultKernelInfo().kernelArgInfo[argIndex];
+    if (requiresSshForBuffers(rootDeviceIndex)) {
+        const auto &kernelArgInfo = kernelInfo.kernelArgInfo[argIndex];
        auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap);
        size_t allocSize = 0;
        size_t offset = 0;
@ -1049,7 +1055,7 @@ uint32_t Kernel::getMaxWorkGroupCount(const cl_uint workDim, const size_t *local
        return 0;
    }

-    auto executionEnvironment = getDefaultKernelInfo().patchInfo.executionEnvironment;
+    auto executionEnvironment = getKernelInfo(rootDeviceIndex).patchInfo.executionEnvironment;
    auto dssCount = hardwareInfo.gtSystemInfo.DualSubSliceCount;
    if (dssCount == 0) {
        dssCount = hardwareInfo.gtSystemInfo.SubSliceCount;
@ -1227,6 +1233,7 @@ cl_int Kernel::setArgLocal(uint32_t argIndex,
                           const void *argVal) {
    auto rootDeviceIndex = getDevice().getRootDeviceIndex();
    auto crossThreadData = reinterpret_cast<uint32_t *>(getCrossThreadData(rootDeviceIndex));
+    auto &defaultKernelInfo = getDefaultKernelInfo();

    storeKernelArg(argIndex, SLM_OBJ, nullptr, argVal, argSize);

@ -1234,7 +1241,7 @@ cl_int Kernel::setArgLocal(uint32_t argIndex,

    // Extract our current slmOffset
    auto slmOffset = *ptrOffset(crossThreadData,
-                                getDefaultKernelInfo().kernelArgInfo[argIndex].kernelArgPatchInfoVector[0].crossthreadOffset);
+                                defaultKernelInfo.kernelArgInfo[argIndex].kernelArgPatchInfoVector[0].crossthreadOffset);

    // Add our size
    slmOffset += static_cast<uint32_t>(argSize);
@ -1242,7 +1249,7 @@ cl_int Kernel::setArgLocal(uint32_t argIndex,
    // Update all slm offsets after this argIndex
    ++argIndex;
    while (argIndex < slmSizes.size()) {
-        const auto &kernelArgInfo = getDefaultKernelInfo().kernelArgInfo[argIndex];
+        const auto &kernelArgInfo = defaultKernelInfo.kernelArgInfo[argIndex];
        auto slmAlignment = kernelArgInfo.slmAlignment;

        // If an local argument, alignment should be non-zero
@ -1260,7 +1267,7 @@ cl_int Kernel::setArgLocal(uint32_t argIndex,
        ++argIndex;
    }

-    slmTotalSize = getDefaultKernelInfo().workloadInfo.slmStaticSize + alignUp(slmOffset, KB);
+    slmTotalSize = defaultKernelInfo.workloadInfo.slmStaticSize + alignUp(slmOffset, KB);

    return CL_SUCCESS;
 }
@ -1272,9 +1279,9 @@ cl_int Kernel::setArgBuffer(uint32_t argIndex,
    if (argSize != sizeof(cl_mem *))
        return CL_INVALID_ARG_SIZE;

-    const auto &kernelArgInfo = getDefaultKernelInfo().kernelArgInfo[argIndex];
    auto clMem = reinterpret_cast<const cl_mem *>(argVal);
    auto rootDeviceIndex = getDevice().getRootDeviceIndex();
+    const auto &kernelArgInfo = getKernelInfo(rootDeviceIndex).kernelArgInfo[argIndex];
    patchBufferOffset(kernelArgInfo, nullptr, nullptr, rootDeviceIndex);

    if (clMem && *clMem) {
@ -1323,7 +1330,7 @@ cl_int Kernel::setArgBuffer(uint32_t argIndex,
            forceNonAuxMode = true;
        }

-        if (requiresSshForBuffers()) {
+        if (requiresSshForBuffers(rootDeviceIndex)) {
            auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap);
            buffer->setArgStateful(surfaceState, forceNonAuxMode, disableL3, isAuxTranslationKernel, kernelArgInfo.isReadOnly, getDevice().getDevice());
        }
@ -1348,7 +1355,7 @@ cl_int Kernel::setArgBuffer(uint32_t argIndex,

        storeKernelArg(argIndex, BUFFER_OBJ, nullptr, argVal, argSize);

-        if (requiresSshForBuffers()) {
+        if (requiresSshForBuffers(rootDeviceIndex)) {
            auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap);
            Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, 0, nullptr, 0, nullptr, 0, 0);
        }
@ -1365,7 +1372,6 @@ cl_int Kernel::setArgPipe(uint32_t argIndex,
        return CL_INVALID_ARG_SIZE;
    }

-    const auto &kernelArgInfo = getDefaultKernelInfo().kernelArgInfo[argIndex];
    auto clMem = reinterpret_cast<const cl_mem *>(argVal);

    if (clMem && *clMem) {
@ -1388,6 +1394,7 @@ cl_int Kernel::setArgPipe(uint32_t argIndex,
            return CL_INVALID_MEM_OBJECT;
        }
        auto rootDeviceIndex = getDevice().getRootDeviceIndex();
+        const auto &kernelArgInfo = getKernelInfo(rootDeviceIndex).kernelArgInfo[argIndex];
        auto patchLocation = ptrOffset(getCrossThreadData(rootDeviceIndex),
                                       kernelArgInfo.kernelArgPatchInfoVector[0].crossthreadOffset);

@ -1397,7 +1404,7 @@ cl_int Kernel::setArgPipe(uint32_t argIndex,

        auto graphicsAllocation = pipe->getGraphicsAllocation(rootDeviceIndex);

-        if (requiresSshForBuffers()) {
+        if (requiresSshForBuffers(rootDeviceIndex)) {
            auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap);
            Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState,
                                    pipe->getSize(), pipe->getCpuAddress(), 0,
@ -1421,7 +1428,8 @@ cl_int Kernel::setArgImageWithMipLevel(uint32_t argIndex,
                                       const void *argVal, uint32_t mipLevel) {
    auto retVal = CL_INVALID_ARG_VALUE;
    auto rootDeviceIndex = getDevice().getRootDeviceIndex();
-    patchBufferOffset(getDefaultKernelInfo().kernelArgInfo[argIndex], nullptr, nullptr, rootDeviceIndex);
+    auto &kernelInfo = getKernelInfo(rootDeviceIndex);
+    patchBufferOffset(kernelInfo.kernelArgInfo[argIndex], nullptr, nullptr, rootDeviceIndex);

    auto clMemObj = *(static_cast<const cl_mem *>(argVal));
    auto pImage = castToObject<Image>(clMemObj);
@ -1430,7 +1438,7 @@ cl_int Kernel::setArgImageWithMipLevel(uint32_t argIndex,
        if (pImage->peekSharingHandler()) {
            usingSharedObjArgs = true;
        }
-        const auto &kernelArgInfo = getDefaultKernelInfo().kernelArgInfo[argIndex];
+        const auto &kernelArgInfo = kernelInfo.kernelArgInfo[argIndex];

        DBG_LOG_INPUTS("setArgImage cl_mem", clMemObj);

@ -1441,7 +1449,7 @@ cl_int Kernel::setArgImageWithMipLevel(uint32_t argIndex,

        // Sets SS structure
        if (kernelArgInfo.isMediaImage) {
-            DEBUG_BREAK_IF(!getDefaultKernelInfo().isVmeWorkload);
+            DEBUG_BREAK_IF(!kernelInfo.isVmeWorkload);
            pImage->setMediaImageArg(surfaceState, rootDeviceIndex);
        } else {
            pImage->setImageArg(surfaceState, kernelArgInfo.isMediaBlockImage, mipLevel, rootDeviceIndex);
@ -1487,7 +1495,7 @@ cl_int Kernel::setArgImmediate(uint32_t argIndex,
    auto rootDeviceIndex = getDevice().getRootDeviceIndex();

    if (argVal) {
-        const auto &kernelArgInfo = getDefaultKernelInfo().kernelArgInfo[argIndex];
+        const auto &kernelArgInfo = getKernelInfo(rootDeviceIndex).kernelArgInfo[argIndex];
        DEBUG_BREAK_IF(kernelArgInfo.kernelArgPatchInfoVector.size() <= 0);

        storeKernelArg(argIndex, NONE_OBJ, nullptr, nullptr, argSize);
@ -1540,11 +1548,11 @@ cl_int Kernel::setArgSampler(uint32_t argIndex,
    }

    if (pSampler && argSize == sizeof(cl_sampler *)) {
-        const auto &kernelArgInfo = getDefaultKernelInfo().kernelArgInfo[argIndex];
+        const auto &kernelArgInfo = getKernelInfo(rootDeviceIndex).kernelArgInfo[argIndex];

        storeKernelArg(argIndex, SAMPLER_OBJ, clSamplerObj, argVal, argSize);

-        auto dsh = getDynamicStateHeap();
+        auto dsh = getDynamicStateHeap(rootDeviceIndex);
        auto samplerState = ptrOffset(dsh, kernelArgInfo.offsetHeap);

        pSampler->setArg(const_cast<void *>(samplerState));
@ -1583,7 +1591,7 @@ cl_int Kernel::setArgAccelerator(uint32_t argIndex,
    if (pAccelerator) {
        storeKernelArg(argIndex, ACCELERATOR_OBJ, clAcceleratorObj, argVal, argSize);

-        const auto &kernelArgInfo = getDefaultKernelInfo().kernelArgInfo[argIndex];
+        const auto &kernelArgInfo = getKernelInfo(rootDeviceIndex).kernelArgInfo[argIndex];

        if (kernelArgInfo.samplerArgumentType == iOpenCL::SAMPLER_OBJECT_VME) {
            auto crossThreadData = getCrossThreadData(rootDeviceIndex);
@ -2267,7 +2275,7 @@ void Kernel::patchDefaultDeviceQueue(DeviceQueue *devQueue) {
            patchWithRequiredSize(patchLocation, patchInfo.pAllocateStatelessDefaultDeviceQueueSurface->DataParamSize,
                                  static_cast<uintptr_t>(devQueue->getQueueBuffer()->getGpuAddressToPatch()));
        }
-        if (requiresSshForBuffers()) {
+        if (requiresSshForBuffers(rootDeviceIndex)) {
            auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap(rootDeviceIndex)),
                                          patchInfo.pAllocateStatelessDefaultDeviceQueueSurface->SurfaceStateHeapOffset);
            Buffer::setSurfaceState(&devQueue->getDevice(), surfaceState, devQueue->getQueueBuffer()->getUnderlyingBufferSize(),
@ -2289,7 +2297,7 @@ void Kernel::patchEventPool(DeviceQueue *devQueue) {
                                  static_cast<uintptr_t>(devQueue->getEventPoolBuffer()->getGpuAddressToPatch()));
        }

-        if (requiresSshForBuffers()) {
+        if (requiresSshForBuffers(rootDeviceIndex)) {
            auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap(rootDeviceIndex)),
                                          patchInfo.pAllocateStatelessEventPoolSurface->SurfaceStateHeapOffset);
            Buffer::setSurfaceState(&devQueue->getDevice(), surfaceState, devQueue->getEventPoolBuffer()->getUnderlyingBufferSize(),
@ -2311,8 +2319,8 @@ void Kernel::patchBlocksSimdSize(uint32_t rootDeviceIndex) {
    }
 }

-bool Kernel::usesSyncBuffer() {
-    return (getDefaultKernelInfo().patchInfo.pAllocateSyncBuffer != nullptr);
+bool Kernel::usesSyncBuffer(uint32_t rootDeviceIndex) {
+    return (getKernelInfo(rootDeviceIndex).patchInfo.pAllocateSyncBuffer != nullptr);
 }

 void Kernel::patchSyncBuffer(Device &device, GraphicsAllocation *gfxAllocation, size_t bufferOffset) {
@ -2322,7 +2330,7 @@ void Kernel::patchSyncBuffer(Device &device, GraphicsAllocation *gfxAllocation,
    patchWithRequiredSize(bufferPatchAddress, patchInfo.pAllocateSyncBuffer->DataParamSize,
                          ptrOffset(gfxAllocation->getGpuAddressToPatch(), bufferOffset));

-    if (requiresSshForBuffers()) {
+    if (requiresSshForBuffers(rootDeviceIndex)) {
        auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap(rootDeviceIndex)),
                                      patchInfo.pAllocateSyncBuffer->SurfaceStateHeapOffset);
        auto addressToPatch = gfxAllocation->getUnderlyingBuffer();
@ -2339,12 +2347,13 @@ bool Kernel::isPatched() const {
 cl_int Kernel::checkCorrectImageAccessQualifier(cl_uint argIndex,
                                                size_t argSize,
                                                const void *argValue) const {
-    if (getDefaultKernelInfo().kernelArgInfo[argIndex].isImage) {
+    auto &defaultKernelInfo = getDefaultKernelInfo();
+    if (defaultKernelInfo.kernelArgInfo[argIndex].isImage) {
        cl_mem mem = *(static_cast<const cl_mem *>(argValue));
        MemObj *pMemObj = nullptr;
        WithCastToInternal(mem, &pMemObj);
        if (pMemObj) {
-            auto accessQualifier = getDefaultKernelInfo().kernelArgInfo[argIndex].metadata.accessQualifier;
+            auto accessQualifier = defaultKernelInfo.kernelArgInfo[argIndex].metadata.accessQualifier;
            cl_mem_flags flags = pMemObj->getFlags();
            if ((accessQualifier == KernelArgMetadata::AccessReadOnly && ((flags | CL_MEM_WRITE_ONLY) == flags)) ||
                (accessQualifier == KernelArgMetadata::AccessWriteOnly && ((flags | CL_MEM_READ_ONLY) == flags))) {
@ -2372,11 +2381,16 @@ void Kernel::resolveArgs() {
            }
        }
    }
-    auto rootDeviceIndex = getDevice().getRootDeviceIndex();
+    for (auto rootDeviceIndex = 0u; rootDeviceIndex < kernelInfos.size(); rootDeviceIndex++) {
+        auto pKernelInfo = kernelInfos[rootDeviceIndex];
+        if (!pKernelInfo) {
+            continue;
+        }
        if (canTransformImageTo2dArray) {
-        imageTransformer->transformImagesTo2dArray(getDefaultKernelInfo(), kernelArguments, getSurfaceStateHeap(rootDeviceIndex));
+            imageTransformer->transformImagesTo2dArray(*pKernelInfo, kernelArguments, getSurfaceStateHeap(rootDeviceIndex));
        } else if (imageTransformer->didTransform()) {
-        imageTransformer->transformImagesTo3d(getDefaultKernelInfo(), kernelArguments, getSurfaceStateHeap(rootDeviceIndex));
+            imageTransformer->transformImagesTo3d(*pKernelInfo, kernelArguments, getSurfaceStateHeap(rootDeviceIndex));
+        }
    }
 }

@ -2385,10 +2399,11 @@ bool Kernel::canTransformImages() const {
    return renderCoreFamily >= IGFX_GEN9_CORE && renderCoreFamily <= IGFX_GEN11LP_CORE;
 }

-void Kernel::fillWithBuffersForAuxTranslation(MemObjsForAuxTranslation &memObjsForAuxTranslation) {
+void Kernel::fillWithBuffersForAuxTranslation(MemObjsForAuxTranslation &memObjsForAuxTranslation, uint32_t rootDeviceIndex) {
    memObjsForAuxTranslation.reserve(getKernelArgsNumber());
+    auto &kernelInfo = getKernelInfo(rootDeviceIndex);
    for (uint32_t i = 0; i < getKernelArgsNumber(); i++) {
-        if (BUFFER_OBJ == kernelArguments.at(i).type && !getDefaultKernelInfo().kernelArgInfo.at(i).pureStatefulBufferAccess) {
+        if (BUFFER_OBJ == kernelArguments.at(i).type && !kernelInfo.kernelArgInfo.at(i).pureStatefulBufferAccess) {
            auto buffer = castToObject<Buffer>(getKernelArg(i));
            if (buffer && buffer->getMultiGraphicsAllocation().getAllocationType() == GraphicsAllocation::AllocationType::BUFFER_COMPRESSED) {
                memObjsForAuxTranslation.insert(buffer);
@ -2396,7 +2411,7 @@ void Kernel::fillWithBuffersForAuxTranslation(MemObjsForAuxTranslation &memObjsF
                auto &context = this->program->getContext();
                if (context.isProvidingPerformanceHints()) {
                    context.providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL, KERNEL_ARGUMENT_AUX_TRANSLATION,
-                                                   getDefaultKernelInfo().kernelDescriptor.kernelMetadata.kernelName.c_str(), i, getDefaultKernelInfo().kernelArgInfo.at(i).metadataExtended->argName.c_str());
+                                                   kernelInfo.kernelDescriptor.kernelMetadata.kernelName.c_str(), i, kernelInfo.kernelArgInfo.at(i).metadataExtended->argName.c_str());
                }
            }
        }
--- a/opencl/source/kernel/kernel.h
+++ b/opencl/source/kernel/kernel.h
@ -141,7 +141,7 @@ class Kernel : public BaseObject<_cl_kernel> {
    cl_int getInfo(cl_kernel_info paramName, size_t paramValueSize,
                   void *paramValue, size_t *paramValueSizeRet) const;
    void getAdditionalInfo(cl_kernel_info paramName, const void *&paramValue, size_t &paramValueSizeRet) const;
-    void getAdditionalWorkGroupInfo(cl_kernel_work_group_info paramName, const void *&paramValue, size_t &paramValueSizeRet) const;
+    void getAdditionalWorkGroupInfo(cl_kernel_work_group_info paramName, const void *&paramValue, size_t &paramValueSizeRet, uint32_t rootDeviceIndex) const;

    cl_int getArgInfo(cl_uint argIndx, cl_kernel_arg_info paramName,
                      size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet) const;
@ -154,13 +154,13 @@ class Kernel : public BaseObject<_cl_kernel> {
                           size_t paramValueSize, void *paramValue,
                           size_t *paramValueSizeRet) const;

-    const void *getKernelHeap() const;
+    const void *getKernelHeap(uint32_t rootDeviceIndex) const;
    void *getSurfaceStateHeap(uint32_t rootDeviceIndex) const;
-    const void *getDynamicStateHeap() const;
+    const void *getDynamicStateHeap(uint32_t rootDeviceIndex) const;

-    size_t getKernelHeapSize() const;
+    size_t getKernelHeapSize(uint32_t rootDeviceIndex) const;
    size_t getSurfaceStateHeapSize(uint32_t rootDeviceIndex) const;
-    size_t getDynamicStateHeapSize() const;
+    size_t getDynamicStateHeapSize(uint32_t rootDeviceIndex) const;
    size_t getNumberOfBindingTableStates() const;
    size_t getBindingTableOffset() const {
        return localBindingTableOffset;
@ -168,10 +168,10 @@ class Kernel : public BaseObject<_cl_kernel> {

    void resizeSurfaceStateHeap(uint32_t rootDeviceIndex, void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset);

-    void substituteKernelHeap(void *newKernelHeap, size_t newKernelHeapSize);
-    bool isKernelHeapSubstituted() const;
-    uint64_t getKernelId() const;
-    void setKernelId(uint64_t newKernelId);
+    void substituteKernelHeap(uint32_t rootDeviceIndex, void *newKernelHeap, size_t newKernelHeapSize);
+    bool isKernelHeapSubstituted(uint32_t rootDeviceIndex) const;
+    uint64_t getKernelId(uint32_t rootDeviceIndex) const;
+    void setKernelId(uint32_t rootDeviceIndex, uint64_t newKernelId);
    uint32_t getStartOffset() const;
    void setStartOffset(uint32_t offset);

@ -183,8 +183,8 @@ class Kernel : public BaseObject<_cl_kernel> {
        return getDefaultKernelInfo().kernelArgInfo.size();
    }

-    bool requiresSshForBuffers() const {
-        return getDefaultKernelInfo().requiresSshForBuffers;
+    bool requiresSshForBuffers(uint32_t rootDeviceIndex) const {
+        return getKernelInfo(rootDeviceIndex).requiresSshForBuffers;
    }

    const KernelInfo &getKernelInfo(uint32_t rootDeviceIndex) const {
@ -200,12 +200,12 @@ class Kernel : public BaseObject<_cl_kernel> {

    Program *getProgram() const { return program; }

-    uint32_t getScratchSize() {
-        return getDefaultKernelInfo().patchInfo.mediavfestate ? getDefaultKernelInfo().patchInfo.mediavfestate->PerThreadScratchSpace : 0;
+    uint32_t getScratchSize(uint32_t rootDeviceIndex) {
+        return getKernelInfo(rootDeviceIndex).patchInfo.mediavfestate ? getKernelInfo(rootDeviceIndex).patchInfo.mediavfestate->PerThreadScratchSpace : 0;
    }

-    uint32_t getPrivateScratchSize() {
-        return getDefaultKernelInfo().patchInfo.mediaVfeStateSlot1 ? getDefaultKernelInfo().patchInfo.mediaVfeStateSlot1->PerThreadScratchSpace : 0;
+    uint32_t getPrivateScratchSize(uint32_t rootDeviceIndex) {
+        return getKernelInfo(rootDeviceIndex).patchInfo.mediaVfeStateSlot1 ? getKernelInfo(rootDeviceIndex).patchInfo.mediaVfeStateSlot1->PerThreadScratchSpace : 0;
    }

    void createReflectionSurface();
@ -215,7 +215,7 @@ class Kernel : public BaseObject<_cl_kernel> {
    void patchDefaultDeviceQueue(DeviceQueue *devQueue);
    void patchEventPool(DeviceQueue *devQueue);
    void patchBlocksSimdSize(uint32_t rootDeviceIndex);
-    bool usesSyncBuffer();
+    bool usesSyncBuffer(uint32_t rootDeviceIndex);
    void patchSyncBuffer(Device &device, GraphicsAllocation *gfxAllocation, size_t bufferOffset);
    void patchBindlessSurfaceStateOffsets(const Device &device, const size_t sshOffset);

@ -352,26 +352,26 @@ class Kernel : public BaseObject<_cl_kernel> {
    KernelExecutionType getExecutionType() const {
        return executionType;
    }
-    bool isUsingSyncBuffer() const {
-        return (getDefaultKernelInfo().patchInfo.pAllocateSyncBuffer != nullptr);
+    bool isUsingSyncBuffer(uint32_t rootDeviceIndex) const {
+        return (getKernelInfo(rootDeviceIndex).patchInfo.pAllocateSyncBuffer != nullptr);
    }

    bool checkIfIsParentKernelAndBlocksUsesPrintf();

-    bool is32Bit() const {
-        return getDefaultKernelInfo().gpuPointerSize == 4;
+    bool is32Bit(uint32_t rootDeviceIndex) const {
+        return getKernelInfo(rootDeviceIndex).gpuPointerSize == 4;
    }

-    int32_t getDebugSurfaceBti() const {
-        if (getDefaultKernelInfo().patchInfo.pAllocateSystemThreadSurface) {
-            return getDefaultKernelInfo().patchInfo.pAllocateSystemThreadSurface->BTI;
+    int32_t getDebugSurfaceBti(uint32_t rootDeviceIndex) const {
+        if (getKernelInfo(rootDeviceIndex).patchInfo.pAllocateSystemThreadSurface) {
+            return getKernelInfo(rootDeviceIndex).patchInfo.pAllocateSystemThreadSurface->BTI;
        }
        return -1;
    }

-    size_t getPerThreadSystemThreadSurfaceSize() const {
-        if (getDefaultKernelInfo().patchInfo.pAllocateSystemThreadSurface) {
-            return getDefaultKernelInfo().patchInfo.pAllocateSystemThreadSurface->PerThreadSystemThreadSurfaceSize;
+    size_t getPerThreadSystemThreadSurfaceSize(uint32_t rootDeviceIndex) const {
+        if (getKernelInfo(rootDeviceIndex).patchInfo.pAllocateSystemThreadSurface) {
+            return getKernelInfo(rootDeviceIndex).patchInfo.pAllocateSystemThreadSurface->PerThreadSystemThreadSurfaceSize;
        }
        return 0;
    }
@ -381,7 +381,7 @@ class Kernel : public BaseObject<_cl_kernel> {
        return usingImagesOnly;
    }

-    void fillWithBuffersForAuxTranslation(MemObjsForAuxTranslation &memObjsForAuxTranslation);
+    void fillWithBuffersForAuxTranslation(MemObjsForAuxTranslation &memObjsForAuxTranslation, uint32_t rootDeviceIndex);

    MOCKABLE_VIRTUAL bool requiresCacheFlushCommand(const CommandQueue &commandQueue) const;

--- a/opencl/source/program/kernel_info.cpp
+++ b/opencl/source/program/kernel_info.cpp
@ -143,13 +143,12 @@ WorkSizeInfo::WorkSizeInfo(const DispatchInfo &dispatchInfo) {
    this->numThreadsPerSubSlice = static_cast<uint32_t>(device.getSharedDeviceInfo().maxNumEUsPerSubSlice) *
                                  device.getSharedDeviceInfo().numThreadsPerEU;
    this->localMemSize = static_cast<uint32_t>(device.getSharedDeviceInfo().localMemSize);
-    setIfUseImg(dispatchInfo.getKernel());
+    setIfUseImg(kernelInfo);
    setMinWorkGroupSize();
 }
-void WorkSizeInfo::setIfUseImg(Kernel *pKernel) {
-    auto ParamsCount = pKernel->getKernelArgsNumber();
-    for (auto i = 0u; i < ParamsCount; i++) {
-        if (pKernel->getDefaultKernelInfo().kernelArgInfo[i].isImage) {
+void WorkSizeInfo::setIfUseImg(const KernelInfo &kernelInfo) {
+    for (auto i = 0u; i < kernelInfo.kernelArgInfo.size(); i++) {
+        if (kernelInfo.kernelArgInfo[i].isImage) {
            imgUsed = true;
            yTiledSurfaces = true;
        }
--- a/opencl/source/program/kernel_info.h
+++ b/opencl/source/program/kernel_info.h
@ -84,7 +84,7 @@ struct WorkSizeInfo {

    WorkSizeInfo(uint32_t maxWorkGroupSize, bool hasBarriers, uint32_t simdSize, uint32_t slmTotalSize, GFXCORE_FAMILY coreFamily, uint32_t numThreadsPerSubSlice, uint32_t localMemSize, bool imgUsed, bool yTiledSurface);
    WorkSizeInfo(const DispatchInfo &dispatchInfo);
-    void setIfUseImg(Kernel *pKernel);
+    void setIfUseImg(const KernelInfo &kernelInfo);
    void setMinWorkGroupSize();
    void checkRatio(const size_t workItems[3]);
 };
--- a/opencl/source/program/printf_handler.cpp
+++ b/opencl/source/program/printf_handler.cpp
@ -61,7 +61,7 @@ void PrintfHandler::prepareDispatch(const MultiDispatchInfo &multiDispatchInfo)
                                        kernel->getKernelInfo(rootDeviceIndex).patchInfo.pAllocateStatelessPrintfSurface->DataParamOffset);

    patchWithRequiredSize(printfPatchAddress, kernel->getKernelInfo(rootDeviceIndex).patchInfo.pAllocateStatelessPrintfSurface->DataParamSize, (uintptr_t)printfSurface->getGpuAddressToPatch());
-    if (kernel->requiresSshForBuffers()) {
+    if (kernel->requiresSshForBuffers(rootDeviceIndex)) {
        auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(kernel->getSurfaceStateHeap(rootDeviceIndex)),
                                      kernel->getKernelInfo(rootDeviceIndex).patchInfo.pAllocateStatelessPrintfSurface->SurfaceStateHeapOffset);
        void *addressToPatch = printfSurface->getUnderlyingBuffer();
@ -75,8 +75,9 @@ void PrintfHandler::makeResident(CommandStreamReceiver &commandStreamReceiver) {
 }

 void PrintfHandler::printEnqueueOutput() {
+    auto rootDeviceIndex = device.getRootDeviceIndex();
    PrintFormatter printFormatter(reinterpret_cast<const uint8_t *>(printfSurface->getUnderlyingBuffer()), static_cast<uint32_t>(printfSurface->getUnderlyingBufferSize()),
-                                  kernel->is32Bit(), kernel->getKernelInfo(device.getRootDeviceIndex()).patchInfo.stringDataMap);
+                                  kernel->is32Bit(rootDeviceIndex), kernel->getKernelInfo(rootDeviceIndex).patchInfo.stringDataMap);
    printFormatter.printKernelOutput();
 }
 } // namespace NEO
--- a/opencl/source/scheduler/scheduler_kernel.h
+++ b/opencl/source/scheduler/scheduler_kernel.h
@ -34,8 +34,9 @@ class SchedulerKernel : public Kernel {
    }

    size_t getCurbeSize() {
-        size_t crossTrheadDataSize = getDefaultKernelInfo().patchInfo.dataParameterStream ? getDefaultKernelInfo().patchInfo.dataParameterStream->DataParameterStreamSize : 0;
-        size_t dshSize = getDefaultKernelInfo().heapInfo.DynamicStateHeapSize;
+        auto &defaultKernelInfo = getDefaultKernelInfo();
+        size_t crossTrheadDataSize = defaultKernelInfo.patchInfo.dataParameterStream ? defaultKernelInfo.patchInfo.dataParameterStream->DataParameterStreamSize : 0;
+        size_t dshSize = defaultKernelInfo.heapInfo.DynamicStateHeapSize;

        crossTrheadDataSize = alignUp(crossTrheadDataSize, 64);
        dshSize = alignUp(dshSize, 64);
--- a/opencl/test/unit_test/api/cl_enqueue_nd_range_kernel_tests.inl
+++ b/opencl/test/unit_test/api/cl_enqueue_nd_range_kernel_tests.inl
@ -137,7 +137,7 @@ TEST_F(clEnqueueNDRangeKernelTests, GivenKernelWithAllocateSyncBufferPatchWhenEx
    SPatchAllocateSyncBuffer patchAllocateSyncBuffer;
    pProgram->mockKernelInfo.patchInfo.pAllocateSyncBuffer = &patchAllocateSyncBuffer;

-    EXPECT_TRUE(pKernel->isUsingSyncBuffer());
+    EXPECT_TRUE(pKernel->isUsingSyncBuffer(testedRootDeviceIndex));

    retVal = clEnqueueNDRangeKernel(
        pCommandQueue,
--- a/opencl/test/unit_test/api/cl_get_kernel_work_group_info_tests.inl
+++ b/opencl/test/unit_test/api/cl_get_kernel_work_group_info_tests.inl
@ -156,7 +156,7 @@ TEST_F(clGetKernelWorkGroupInfoTests, GivenKernelRequiringScratchSpaceWhenGettin
    mediaVFEstate.PerThreadScratchSpace = 1024; //whatever greater than 0
    mockKernel.kernelInfo.patchInfo.mediavfestate = &mediaVFEstate;

-    cl_ulong scratchSpaceSize = static_cast<cl_ulong>(mockKernel.mockKernel->getScratchSize());
+    cl_ulong scratchSpaceSize = static_cast<cl_ulong>(mockKernel.mockKernel->getScratchSize(testedRootDeviceIndex));
    EXPECT_EQ(scratchSpaceSize, 1024u);

    retVal = clGetKernelWorkGroupInfo(
--- a/opencl/test/unit_test/aub_tests/command_queue/enqueue_kernel_aub_tests.cpp
+++ b/opencl/test/unit_test/aub_tests/command_queue/enqueue_kernel_aub_tests.cpp
@ -109,8 +109,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, AUBHelloWorld, simple) {
    EXPECT_EQ(0u, addrIDD % alignmentIDD);

    // Check kernel start pointer matches hard-coded kernel.
-    auto pExpectedISA = pKernel->getKernelHeap();
-    auto expectedSize = pKernel->getKernelHeapSize();
+    auto pExpectedISA = pKernel->getKernelHeap(rootDeviceIndex);
+    auto expectedSize = pKernel->getKernelHeapSize(rootDeviceIndex);

    auto pSBA = reinterpret_cast<STATE_BASE_ADDRESS *>(cmdStateBaseAddress);
    ASSERT_NE(nullptr, pSBA);
@ -268,8 +268,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, AUBSimpleArg, simple) {
    EXPECT_EQ(0u, addrIDD % alignmentIDD);

    // Check kernel start pointer matches hard-coded kernel.
-    auto pExpectedISA = pKernel->getKernelHeap();
-    auto expectedSize = pKernel->getKernelHeapSize();
+    auto pExpectedISA = pKernel->getKernelHeap(rootDeviceIndex);
+    auto expectedSize = pKernel->getKernelHeapSize(rootDeviceIndex);

    auto pSBA = reinterpret_cast<STATE_BASE_ADDRESS *>(cmdStateBaseAddress);
    ASSERT_NE(nullptr, pSBA);
--- a/opencl/test/unit_test/command_queue/enqueue_copy_buffer_rect_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_copy_buffer_rect_tests.cpp
@ -183,7 +183,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, EnqueueCopyBufferRectTest, WhenCopyingBufferRect2DTh

    EXPECT_NE(dshBefore, pDSH->getUsed());
    EXPECT_NE(iohBefore, pIOH->getUsed());
-    if (kernel->requiresSshForBuffers()) {
+    if (kernel->requiresSshForBuffers(rootDeviceIndex)) {
        EXPECT_NE(sshBefore, pSSH->getUsed());
    }
 }
--- a/opencl/test/unit_test/command_queue/enqueue_copy_buffer_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_copy_buffer_tests.cpp
@ -202,9 +202,9 @@ HWTEST_F(EnqueueCopyBufferTest, WhenCopyingBufferThenIndirectDataGetsAdded) {

    auto kernel = multiDispatchInfo.begin()->getKernel();

-    EXPECT_TRUE(UnitTestHelper<FamilyType>::evaluateDshUsage(dshBefore, pDSH->getUsed(), kernel));
+    EXPECT_TRUE(UnitTestHelper<FamilyType>::evaluateDshUsage(dshBefore, pDSH->getUsed(), kernel, rootDeviceIndex));
    EXPECT_NE(iohBefore, pIOH->getUsed());
-    if (kernel->requiresSshForBuffers()) {
+    if (kernel->requiresSshForBuffers(rootDeviceIndex)) {
        EXPECT_NE(sshBefore, pSSH->getUsed());
    }
 }
@ -329,7 +329,7 @@ HWTEST_F(EnqueueCopyBufferTest, WhenCopyingBufferThenArgumentZeroMatchesSourceAd
    ASSERT_NE(nullptr, kernel);

    // Determine where the argument is
-    auto pArgument = (void **)getStatelessArgumentPointer<FamilyType>(*kernel, 0u, pCmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0));
+    auto pArgument = (void **)getStatelessArgumentPointer<FamilyType>(*kernel, 0u, pCmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0), rootDeviceIndex);

    EXPECT_EQ(reinterpret_cast<void *>(srcBuffer->getGraphicsAllocation(pClDevice->getRootDeviceIndex())->getGpuAddress()), *pArgument);
 }
@ -357,7 +357,7 @@ HWTEST_F(EnqueueCopyBufferTest, WhenCopyingBufferThenArgumentOneMatchesDestinati
    ASSERT_NE(nullptr, kernel);

    // Determine where the argument is
-    auto pArgument = (void **)getStatelessArgumentPointer<FamilyType>(*kernel, 1u, pCmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0));
+    auto pArgument = (void **)getStatelessArgumentPointer<FamilyType>(*kernel, 1u, pCmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0), rootDeviceIndex);

    EXPECT_EQ(reinterpret_cast<void *>(dstBuffer->getGraphicsAllocation(pClDevice->getRootDeviceIndex())->getGpuAddress()), *pArgument);
 }
--- a/opencl/test/unit_test/command_queue/enqueue_copy_buffer_to_image_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_copy_buffer_to_image_tests.cpp
@ -85,7 +85,7 @@ HWTEST_F(EnqueueCopyBufferToImageTest, WhenCopyingBufferToImageThenIndirectDataG
    auto sshBefore = pSSH->getUsed();

    EnqueueCopyBufferToImageHelper<>::enqueueCopyBufferToImage(pCmdQ, srcBuffer, dstImage);
-    EXPECT_TRUE(UnitTestHelper<FamilyType>::evaluateDshUsage(dshBefore, pDSH->getUsed(), nullptr));
+    EXPECT_TRUE(UnitTestHelper<FamilyType>::evaluateDshUsage(dshBefore, pDSH->getUsed(), nullptr, rootDeviceIndex));
    EXPECT_NE(iohBefore, pIOH->getUsed());
    EXPECT_NE(sshBefore, pSSH->getUsed());
 }
--- a/opencl/test/unit_test/command_queue/enqueue_copy_image_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_copy_image_tests.cpp
@ -85,7 +85,7 @@ HWTEST_F(EnqueueCopyImageTest, WhenCopyingImageThenIndirectDataGetsAdded) {
    auto sshBefore = pSSH->getUsed();

    EnqueueCopyImageHelper<>::enqueueCopyImage(pCmdQ, srcImage, dstImage);
-    EXPECT_TRUE(UnitTestHelper<FamilyType>::evaluateDshUsage(dshBefore, pDSH->getUsed(), nullptr));
+    EXPECT_TRUE(UnitTestHelper<FamilyType>::evaluateDshUsage(dshBefore, pDSH->getUsed(), nullptr, rootDeviceIndex));
    EXPECT_NE(iohBefore, pIOH->getUsed());
    EXPECT_NE(sshBefore, pSSH->getUsed());
 }
--- a/opencl/test/unit_test/command_queue/enqueue_copy_image_to_buffer_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_copy_image_to_buffer_tests.cpp
@ -84,7 +84,7 @@ HWTEST_F(EnqueueCopyImageToBufferTest, WhenCopyingImageToBufferThenIndirectDataG
    auto sshBefore = pSSH->getUsed();

    enqueueCopyImageToBuffer<FamilyType>();
-    EXPECT_TRUE(UnitTestHelper<FamilyType>::evaluateDshUsage(dshBefore, pDSH->getUsed(), nullptr));
+    EXPECT_TRUE(UnitTestHelper<FamilyType>::evaluateDshUsage(dshBefore, pDSH->getUsed(), nullptr, rootDeviceIndex));
    EXPECT_NE(iohBefore, pIOH->getUsed());
    EXPECT_NE(sshBefore, pSSH->getUsed());
 }
--- a/opencl/test/unit_test/command_queue/enqueue_fill_buffer_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_fill_buffer_tests.cpp
@ -115,9 +115,9 @@ HWTEST_F(EnqueueFillBufferCmdTests, WhenFillingBufferThenIndirectDataGetsAdded)

    auto kernel = multiDispatchInfo.begin()->getKernel();

-    EXPECT_TRUE(UnitTestHelper<FamilyType>::evaluateDshUsage(dshBefore, pDSH->getUsed(), kernel));
+    EXPECT_TRUE(UnitTestHelper<FamilyType>::evaluateDshUsage(dshBefore, pDSH->getUsed(), kernel, rootDeviceIndex));
    EXPECT_NE(iohBefore, pIOH->getUsed());
-    if (kernel->requiresSshForBuffers()) {
+    if (kernel->requiresSshForBuffers(rootDeviceIndex)) {
        EXPECT_NE(sshBefore, pSSH->getUsed());
    }

@ -302,7 +302,7 @@ HWTEST_F(EnqueueFillBufferCmdTests, WhenFillingBufferThenArgumentZeroShouldMatch
    ASSERT_NE(nullptr, kernel);

    // Determine where the argument is
-    auto pArgument = (void **)getStatelessArgumentPointer<FamilyType>(*kernel, 0u, pCmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0));
+    auto pArgument = (void **)getStatelessArgumentPointer<FamilyType>(*kernel, 0u, pCmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0), rootDeviceIndex);

    EXPECT_EQ((void *)((uintptr_t)buffer->getGraphicsAllocation(pClDevice->getRootDeviceIndex())->getGpuAddress()), *pArgument);

@ -335,7 +335,7 @@ HWTEST_F(EnqueueFillBufferCmdTests, WhenFillingBufferThenArgumentTwoShouldMatchP
    ASSERT_NE(nullptr, kernel);

    // Determine where the argument is
-    auto pArgument = (void **)getStatelessArgumentPointer<FamilyType>(*kernel, 2u, pCmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0));
+    auto pArgument = (void **)getStatelessArgumentPointer<FamilyType>(*kernel, 2u, pCmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0), rootDeviceIndex);
    EXPECT_NE(nullptr, *pArgument);

    context.getMemoryManager()->freeGraphicsMemory(patternAllocation);
--- a/opencl/test/unit_test/command_queue/enqueue_fill_image_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_fill_image_tests.cpp
@ -92,7 +92,7 @@ HWTEST_F(EnqueueFillImageTest, WhenFillingImageThenIndirectDataGetsAdded) {
    auto sshBefore = pSSH->getUsed();

    EnqueueFillImageHelper<>::enqueueFillImage(pCmdQ, image);
-    EXPECT_TRUE(UnitTestHelper<FamilyType>::evaluateDshUsage(dshBefore, pDSH->getUsed(), nullptr));
+    EXPECT_TRUE(UnitTestHelper<FamilyType>::evaluateDshUsage(dshBefore, pDSH->getUsed(), nullptr, rootDeviceIndex));
    EXPECT_NE(iohBefore, pIOH->getUsed());
    EXPECT_NE(sshBefore, pSSH->getUsed());
 }
--- a/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp
@ -415,9 +415,9 @@ HWTEST_F(EnqueueKernelTest, addsIndirectData) {
    auto sshBefore = pSSH->getUsed();

    callOneWorkItemNDRKernel();
-    EXPECT_TRUE(UnitTestHelper<FamilyType>::evaluateDshUsage(dshBefore, pDSH->getUsed(), pKernel));
+    EXPECT_TRUE(UnitTestHelper<FamilyType>::evaluateDshUsage(dshBefore, pDSH->getUsed(), pKernel, rootDeviceIndex));
    EXPECT_NE(iohBefore, pIOH->getUsed());
-    if (pKernel->requiresSshForBuffers() || (pKernel->getKernelInfo(rootDeviceIndex).patchInfo.imageMemObjKernelArgs.size() > 0)) {
+    if (pKernel->requiresSshForBuffers(rootDeviceIndex) || (pKernel->getKernelInfo(rootDeviceIndex).patchInfo.imageMemObjKernelArgs.size() > 0)) {
        EXPECT_NE(sshBefore, pSSH->getUsed());
    }
 }
--- a/opencl/test/unit_test/command_queue/enqueue_read_buffer_rect_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_read_buffer_rect_tests.cpp
@ -196,7 +196,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, EnqueueReadBufferRectTest, WhenReadingBufferThenIndi

    EXPECT_NE(dshBefore, pDSH->getUsed());
    EXPECT_NE(iohBefore, pIOH->getUsed());
-    if (kernel->requiresSshForBuffers()) {
+    if (kernel->requiresSshForBuffers(rootDeviceIndex)) {
        EXPECT_NE(sshBefore, pSSH->getUsed());
    }
 }
--- a/opencl/test/unit_test/command_queue/enqueue_read_buffer_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_read_buffer_tests.cpp
@ -156,9 +156,9 @@ HWTEST_F(EnqueueReadBufferTypeTest, addsIndirectData) {

    auto kernel = multiDispatchInfo.begin()->getKernel();

-    EXPECT_TRUE(UnitTestHelper<FamilyType>::evaluateDshUsage(dshBefore, pDSH->getUsed(), kernel));
+    EXPECT_TRUE(UnitTestHelper<FamilyType>::evaluateDshUsage(dshBefore, pDSH->getUsed(), kernel, rootDeviceIndex));
    EXPECT_NE(iohBefore, pIOH->getUsed());
-    if (kernel->requiresSshForBuffers()) {
+    if (kernel->requiresSshForBuffers(rootDeviceIndex)) {
        EXPECT_NE(sshBefore, pSSH->getUsed());
    }
 }
--- a/opencl/test/unit_test/command_queue/enqueue_read_image_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_read_image_tests.cpp
@ -95,7 +95,7 @@ HWTEST_F(EnqueueReadImageTest, WhenReadingImageThenIndirectDataIsAdded) {
    auto sshBefore = pSSH->getUsed();

    EnqueueReadImageHelper<>::enqueueReadImage(pCmdQ, srcImage, EnqueueReadImageTraits::blocking);
-    EXPECT_TRUE(UnitTestHelper<FamilyType>::evaluateDshUsage(dshBefore, pDSH->getUsed(), nullptr));
+    EXPECT_TRUE(UnitTestHelper<FamilyType>::evaluateDshUsage(dshBefore, pDSH->getUsed(), nullptr, rootDeviceIndex));
    EXPECT_NE(iohBefore, pIOH->getUsed());
    EXPECT_NE(sshBefore, pSSH->getUsed());
 }
--- a/opencl/test/unit_test/command_queue/enqueue_write_buffer_rect_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_write_buffer_rect_tests.cpp
@ -170,7 +170,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, EnqueueWriteBufferRectTest, WhenWritingBufferThenInd

    EXPECT_NE(dshBefore, pDSH->getUsed());
    EXPECT_NE(iohBefore, pIOH->getUsed());
-    if (kernel->requiresSshForBuffers()) {
+    if (kernel->requiresSshForBuffers(rootDeviceIndex)) {
        EXPECT_NE(sshBefore, pSSH->getUsed());
    }
 }
--- a/opencl/test/unit_test/command_queue/enqueue_write_buffer_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_write_buffer_tests.cpp
@ -154,9 +154,9 @@ HWTEST_F(EnqueueWriteBufferTypeTest, WhenWritingBufferThenIndirectDataIsAdded) {

    auto kernel = multiDispatchInfo.begin()->getKernel();

-    EXPECT_TRUE(UnitTestHelper<FamilyType>::evaluateDshUsage(dshBefore, pDSH->getUsed(), kernel));
+    EXPECT_TRUE(UnitTestHelper<FamilyType>::evaluateDshUsage(dshBefore, pDSH->getUsed(), kernel, rootDeviceIndex));
    EXPECT_NE(iohBefore, pIOH->getUsed());
-    if (kernel->requiresSshForBuffers()) {
+    if (kernel->requiresSshForBuffers(rootDeviceIndex)) {
        EXPECT_NE(sshBefore, pSSH->getUsed());
    }
 }
--- a/opencl/test/unit_test/command_queue/enqueue_write_image_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_write_image_tests.cpp
@ -97,7 +97,7 @@ HWTEST_F(EnqueueWriteImageTest, WhenWritingImageThenIndirectDataIsAdded) {
    auto sshBefore = pSSH->getUsed();

    EnqueueWriteImageHelper<>::enqueueWriteImage(pCmdQ, dstImage, EnqueueWriteImageTraits::blocking);
-    EXPECT_TRUE(UnitTestHelper<FamilyType>::evaluateDshUsage(dshBefore, pDSH->getUsed(), nullptr));
+    EXPECT_TRUE(UnitTestHelper<FamilyType>::evaluateDshUsage(dshBefore, pDSH->getUsed(), nullptr, rootDeviceIndex));
    EXPECT_NE(iohBefore, pIOH->getUsed());
    EXPECT_NE(sshBefore, pSSH->getUsed());
 }
--- a/opencl/test/unit_test/context/driver_diagnostics_tests.cpp
+++ b/opencl/test/unit_test/context/driver_diagnostics_tests.cpp
@ -452,7 +452,7 @@ TEST_F(PerformanceHintTest, givenPrintDriverDiagnosticsDebugModeEnabledWhenCallF

    testing::internal::CaptureStdout();
    MemObjsForAuxTranslation memObjects;
-    mockKernel.mockKernel->fillWithBuffersForAuxTranslation(memObjects);
+    mockKernel.mockKernel->fillWithBuffersForAuxTranslation(memObjects, rootDeviceIndex);

    snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[KERNEL_ARGUMENT_AUX_TRANSLATION],
             mockKernel.mockKernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(), 0, mockKernel.mockKernel->getKernelInfo(rootDeviceIndex).kernelArgInfo.at(0).metadataExtended->argName.c_str());
--- a/opencl/test/unit_test/device_queue/device_queue_hw_tests.cpp
+++ b/opencl/test/unit_test/device_queue/device_queue_hw_tests.cpp
@ -603,9 +603,9 @@ HWCMDTEST_P(IGFX_GEN8_CORE, DeviceQueueHwWithKernel, WhenSettingUpIndirectStateT
    devQueueHw->setupIndirectState(*ssh, *dsh, pKernel, parentCount, false);
    auto *igilQueue = reinterpret_cast<IGIL_CommandQueue *>(devQueueHw->getQueueBuffer()->getUnderlyingBuffer());

-    EXPECT_EQ(igilQueue->m_controls.m_DynamicHeapStart, devQueueHw->offsetDsh + alignUp((uint32_t)pKernel->getDynamicStateHeapSize(), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE));
+    EXPECT_EQ(igilQueue->m_controls.m_DynamicHeapStart, devQueueHw->offsetDsh + alignUp((uint32_t)pKernel->getDynamicStateHeapSize(rootDeviceIndex), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE));
    EXPECT_EQ(igilQueue->m_controls.m_DynamicHeapSizeInBytes, (uint32_t)devQueueHw->getDshBuffer()->getUnderlyingBufferSize());
-    EXPECT_EQ(igilQueue->m_controls.m_CurrentDSHoffset, devQueueHw->offsetDsh + alignUp((uint32_t)pKernel->getDynamicStateHeapSize(), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE));
+    EXPECT_EQ(igilQueue->m_controls.m_CurrentDSHoffset, devQueueHw->offsetDsh + alignUp((uint32_t)pKernel->getDynamicStateHeapSize(rootDeviceIndex), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE));
    EXPECT_EQ(igilQueue->m_controls.m_ParentDSHOffset, devQueueHw->offsetDsh);

    alignedFree(ssh->getCpuBase());
--- a/opencl/test/unit_test/execution_model/parent_kernel_dispatch_tests.cpp
+++ b/opencl/test/unit_test/execution_model/parent_kernel_dispatch_tests.cpp
@ -170,7 +170,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, ParentKernelDispatchTest, givenParentKernelWhenQueue
                             pKernel->getKernelInfo(rootDeviceIndex).patchInfo.bindingTableState->Count * sizeof(BINDING_TABLE_STATE) +
                             UnitTestHelper<FamilyType>::getDefaultSshUsage();

-    if ((pKernel->requiresSshForBuffers()) || (pKernel->getKernelInfo(rootDeviceIndex).patchInfo.imageMemObjKernelArgs.size() > 0)) {
+    if ((pKernel->requiresSshForBuffers(rootDeviceIndex)) || (pKernel->getKernelInfo(rootDeviceIndex).patchInfo.imageMemObjKernelArgs.size() > 0)) {
        EXPECT_EQ(expectedSizeSSH, sshUsed);
    }

--- a/opencl/test/unit_test/gtpin/gtpin_tests.cpp
+++ b/opencl/test/unit_test/gtpin/gtpin_tests.cpp
@ -763,7 +763,7 @@ TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenKernelIsExecutedThenGTPinCa

    Kernel *pKernel1 = (Kernel *)kernel1;
    const KernelInfo &kInfo1 = pKernel1->getKernelInfo(rootDeviceIndex);
-    uint64_t gtpinKernelId1 = pKernel1->getKernelId();
+    uint64_t gtpinKernelId1 = pKernel1->getKernelId(rootDeviceIndex);
    EXPECT_EQ(kInfo1.shaderHashCode, gtpinKernelId1);

    constexpr size_t n = 256;
@ -797,7 +797,7 @@ TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenKernelIsExecutedThenGTPinCa

    Kernel *pKernel2 = (Kernel *)kernel2;
    const KernelInfo &kInfo2 = pKernel2->getKernelInfo(rootDeviceIndex);
-    uint64_t gtpinKernelId2 = pKernel2->getKernelId();
+    uint64_t gtpinKernelId2 = pKernel2->getKernelId(rootDeviceIndex);
    EXPECT_EQ(kInfo2.shaderHashCode, gtpinKernelId2);

    auto buff20 = clCreateBuffer(context, 0, n * sizeof(unsigned int), nullptr, nullptr);
@ -911,7 +911,7 @@ TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenKernelINTELIsExecutedThenGT

    Kernel *pKernel1 = (Kernel *)kernel1;
    const KernelInfo &kInfo1 = pKernel1->getKernelInfo(rootDeviceIndex);
-    uint64_t gtpinKernelId1 = pKernel1->getKernelId();
+    uint64_t gtpinKernelId1 = pKernel1->getKernelId(rootDeviceIndex);
    EXPECT_EQ(kInfo1.shaderHashCode, gtpinKernelId1);

    cl_uint workDim = 1;
@ -951,7 +951,7 @@ TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenKernelINTELIsExecutedThenGT

    Kernel *pKernel2 = (Kernel *)kernel2;
    const KernelInfo &kInfo2 = pKernel2->getKernelInfo(rootDeviceIndex);
-    uint64_t gtpinKernelId2 = pKernel2->getKernelId();
+    uint64_t gtpinKernelId2 = pKernel2->getKernelId(rootDeviceIndex);
    EXPECT_EQ(kInfo2.shaderHashCode, gtpinKernelId2);

    auto buff20 = clCreateBuffer(context, 0, n * sizeof(unsigned int), nullptr, nullptr);
@ -1267,7 +1267,7 @@ TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenKernelWithoutSSHIsUsedThenG

    Kernel *pKernel = (Kernel *)kernel;
    const KernelInfo &kInfo = pKernel->getKernelInfo(rootDeviceIndex);
-    uint64_t gtpinKernelId = pKernel->getKernelId();
+    uint64_t gtpinKernelId = pKernel->getKernelId(rootDeviceIndex);
    EXPECT_EQ(kInfo.shaderHashCode, gtpinKernelId);

    constexpr size_t n = 256;
@ -1380,7 +1380,7 @@ TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenBlockedKernelWithoutSSHIsUs

    Kernel *pKernel = (Kernel *)kernel;
    const KernelInfo &kInfo = pKernel->getKernelInfo(rootDeviceIndex);
-    uint64_t gtpinKernelId = pKernel->getKernelId();
+    uint64_t gtpinKernelId = pKernel->getKernelId(rootDeviceIndex);
    EXPECT_EQ(kInfo.shaderHashCode, gtpinKernelId);

    constexpr size_t n = 256;
@ -1504,7 +1504,7 @@ TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenTheSameKerneIsExecutedTwice

    Kernel *pKernel1 = (Kernel *)kernel1;
    const KernelInfo &kInfo1 = pKernel1->getKernelInfo(rootDeviceIndex);
-    uint64_t gtpinKernelId1 = pKernel1->getKernelId();
+    uint64_t gtpinKernelId1 = pKernel1->getKernelId(rootDeviceIndex);
    EXPECT_EQ(kInfo1.shaderHashCode, gtpinKernelId1);

    constexpr size_t n = 256;
@ -1542,7 +1542,7 @@ TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenTheSameKerneIsExecutedTwice

    Kernel *pKernel2 = (Kernel *)kernel2;
    const KernelInfo &kInfo2 = pKernel2->getKernelInfo(rootDeviceIndex);
-    uint64_t gtpinKernelId2 = pKernel2->getKernelId();
+    uint64_t gtpinKernelId2 = pKernel2->getKernelId(rootDeviceIndex);
    EXPECT_EQ(kInfo2.shaderHashCode, gtpinKernelId2);

    auto buff20 = clCreateBuffer(context, 0, n * sizeof(unsigned int), nullptr, nullptr);
@ -2325,18 +2325,18 @@ TEST_F(GTPinTests, givenKernelThenVerifyThatKernelCodeSubstitutionWorksWell) {
    Kernel *pKernel = castToObject<Kernel>(kernel);
    ASSERT_NE(nullptr, pKernel);

-    bool isKernelCodeSubstituted = pKernel->isKernelHeapSubstituted();
+    bool isKernelCodeSubstituted = pKernel->isKernelHeapSubstituted(rootDeviceIndex);
    EXPECT_FALSE(isKernelCodeSubstituted);

    // Substitute new kernel code
    constexpr size_t newCodeSize = 64;
    uint8_t newCode[newCodeSize] = {0x0, 0x1, 0x2, 0x3, 0x4};
-    pKernel->substituteKernelHeap(&newCode[0], newCodeSize);
+    pKernel->substituteKernelHeap(rootDeviceIndex, &newCode[0], newCodeSize);

    // Verify that substitution went properly
-    isKernelCodeSubstituted = pKernel->isKernelHeapSubstituted();
+    isKernelCodeSubstituted = pKernel->isKernelHeapSubstituted(rootDeviceIndex);
    EXPECT_TRUE(isKernelCodeSubstituted);
-    uint8_t *pBin2 = reinterpret_cast<uint8_t *>(const_cast<void *>(pKernel->getKernelHeap()));
+    uint8_t *pBin2 = reinterpret_cast<uint8_t *>(const_cast<void *>(pKernel->getKernelHeap(rootDeviceIndex)));
    EXPECT_EQ(pBin2, &newCode[0]);

    auto kernelIsa = pKernel->getKernelInfo(rootDeviceIndex).kernelAllocation->getUnderlyingBuffer();
--- a/opencl/test/unit_test/helpers/unit_test_helper.h
+++ b/opencl/test/unit_test/helpers/unit_test_helper.h
@ -21,7 +21,7 @@ template <typename GfxFamily>
 struct UnitTestHelper {
    static bool isL3ConfigProgrammable();

-    static bool evaluateDshUsage(size_t sizeBeforeEnqueue, size_t sizeAfterEnqueue, Kernel *kernel);
+    static bool evaluateDshUsage(size_t sizeBeforeEnqueue, size_t sizeAfterEnqueue, Kernel *kernel, uint32_t rootDeviceIndex);

    static bool isPageTableManagerSupported(const HardwareInfo &hwInfo);

--- a/opencl/test/unit_test/helpers/unit_test_helper.inl
+++ b/opencl/test/unit_test/helpers/unit_test_helper.inl
@ -13,7 +13,7 @@ bool UnitTestHelper<GfxFamily>::isL3ConfigProgrammable() {
 };

 template <typename GfxFamily>
-bool UnitTestHelper<GfxFamily>::evaluateDshUsage(size_t sizeBeforeEnqueue, size_t sizeAfterEnqueue, Kernel *kernel) {
+bool UnitTestHelper<GfxFamily>::evaluateDshUsage(size_t sizeBeforeEnqueue, size_t sizeAfterEnqueue, Kernel *kernel, uint32_t rootDeviceIndex) {
    if (sizeBeforeEnqueue != sizeAfterEnqueue) {
        return true;
    }
--- a/opencl/test/unit_test/kernel/debug_kernel_tests.cpp
+++ b/opencl/test/unit_test/kernel/debug_kernel_tests.cpp
@ -22,7 +22,7 @@ TEST(DebugKernelTest, givenKernelCompiledForDebuggingWhenGetDebugSurfaceBtiIsCal
    program.enableKernelDebug();
    std::unique_ptr<MockKernel> kernel(MockKernel::create<MockDebugKernel>(device->getDevice(), &program));

-    EXPECT_EQ(0, kernel->getDebugSurfaceBti());
+    EXPECT_EQ(0, kernel->getDebugSurfaceBti(device->getRootDeviceIndex()));
 }

 TEST(DebugKernelTest, givenKernelCompiledForDebuggingWhenGetPerThreadSystemThreadSurfaceSizeIsCalledThenCorrectValueIsReturned) {
@ -31,7 +31,7 @@ TEST(DebugKernelTest, givenKernelCompiledForDebuggingWhenGetPerThreadSystemThrea
    program.enableKernelDebug();
    std::unique_ptr<MockDebugKernel> kernel(MockKernel::create<MockDebugKernel>(device->getDevice(), &program));

-    EXPECT_EQ(MockDebugKernel::perThreadSystemThreadSurfaceSize, kernel->getPerThreadSystemThreadSurfaceSize());
+    EXPECT_EQ(MockDebugKernel::perThreadSystemThreadSurfaceSize, kernel->getPerThreadSystemThreadSurfaceSize(device->getRootDeviceIndex()));
 }

 TEST(DebugKernelTest, givenKernelCompiledForDebuggingWhenQueryingIsKernelDebugEnabledThenTrueIsReturned) {
@ -60,7 +60,7 @@ TEST(DebugKernelTest, givenKernelWithoutDebugFlagWhenGetDebugSurfaceBtiIsCalledT
    program.enableKernelDebug();
    std::unique_ptr<MockKernel> kernel(MockKernel::create<MockKernel>(device->getDevice(), &program));

-    EXPECT_EQ(-1, kernel->getDebugSurfaceBti());
+    EXPECT_EQ(-1, kernel->getDebugSurfaceBti(device->getRootDeviceIndex()));
 }

 TEST(DebugKernelTest, givenKernelWithoutDebugFlagWhenGetPerThreadSystemThreadSurfaceSizeIsCalledThenZeroIsReturned) {
@ -69,5 +69,5 @@ TEST(DebugKernelTest, givenKernelWithoutDebugFlagWhenGetPerThreadSystemThreadSur
    program.enableKernelDebug();
    std::unique_ptr<MockKernel> kernel(MockKernel::create<MockKernel>(device->getDevice(), &program));

-    EXPECT_EQ(0u, kernel->getPerThreadSystemThreadSurfaceSize());
+    EXPECT_EQ(0u, kernel->getPerThreadSystemThreadSurfaceSize(device->getRootDeviceIndex()));
 }
--- a/opencl/test/unit_test/kernel/kernel_tests.cpp
+++ b/opencl/test/unit_test/kernel/kernel_tests.cpp
@ -97,8 +97,8 @@ TEST(KernelTest, WhenKernelIsCreatedThenCorrectMembersAreMemObjects) {
 }

 TEST_F(KernelTests, WhenKernelIsCreatedThenKernelHeapIsCorrect) {
-    EXPECT_EQ(pKernel->getKernelInfo(rootDeviceIndex).heapInfo.pKernelHeap, pKernel->getKernelHeap());
-    EXPECT_EQ(pKernel->getKernelInfo(rootDeviceIndex).heapInfo.KernelHeapSize, pKernel->getKernelHeapSize());
+    EXPECT_EQ(pKernel->getKernelInfo(rootDeviceIndex).heapInfo.pKernelHeap, pKernel->getKernelHeap(rootDeviceIndex));
+    EXPECT_EQ(pKernel->getKernelInfo(rootDeviceIndex).heapInfo.KernelHeapSize, pKernel->getKernelHeapSize(rootDeviceIndex));
 }

 TEST_F(KernelTests, GivenInvalidParamNameWhenGettingInfoThenInvalidValueErrorIsReturned) {
@ -167,7 +167,7 @@ TEST_F(KernelTests, GivenKernelBinaryProgramIntelWhenGettingInfoThenKernelBinary
    size_t paramValueSize = 0;
    char *paramValue = nullptr;
    size_t paramValueSizeRet = 0;
-    const char *pKernelData = reinterpret_cast<const char *>(pKernel->getKernelHeap());
+    const char *pKernelData = reinterpret_cast<const char *>(pKernel->getKernelHeap(rootDeviceIndex));
    EXPECT_NE(nullptr, pKernelData);

    // get size of kernel binary
@ -2670,7 +2670,7 @@ TEST(KernelTest, givenKernelWithKernelInfoWith32bitPointerSizeThenReport32bit) {
    MockProgram program(&context, false, toClDeviceVector(*device));
    std::unique_ptr<MockKernel> kernel(new MockKernel(&program, MockKernel::toKernelInfoContainer(info, rootDeviceIndex)));

-    EXPECT_TRUE(kernel->is32Bit());
+    EXPECT_TRUE(kernel->is32Bit(rootDeviceIndex));
 }

 TEST(KernelTest, givenKernelWithKernelInfoWith64bitPointerSizeThenReport64bit) {
@ -2683,7 +2683,7 @@ TEST(KernelTest, givenKernelWithKernelInfoWith64bitPointerSizeThenReport64bit) {
    MockProgram program(&context, false, toClDeviceVector(*device));
    std::unique_ptr<MockKernel> kernel(new MockKernel(&program, MockKernel::toKernelInfoContainer(info, rootDeviceIndex)));

-    EXPECT_FALSE(kernel->is32Bit());
+    EXPECT_FALSE(kernel->is32Bit(rootDeviceIndex));
 }

 TEST(KernelTest, givenFtrRenderCompressedBuffersWhenInitializingArgsWithNonStatefulAccessThenMarkKernelForAuxTranslation) {
@ -2978,7 +2978,7 @@ TEST(KernelTest, givenKernelRequiringPrivateScratchSpaceWhenGettingSizeForPrivat
    mockKernel.kernelInfo.patchInfo.mediavfestate = &mediaVFEstate;
    mockKernel.kernelInfo.patchInfo.mediaVfeStateSlot1 = &mediaVFEstateSlot1;

-    EXPECT_EQ(1024u, mockKernel.mockKernel->getPrivateScratchSize());
+    EXPECT_EQ(1024u, mockKernel.mockKernel->getPrivateScratchSize(device->getRootDeviceIndex()));
 }

 TEST(KernelTest, givenKernelWithoutMediaVfeStateSlot1WhenGettingSizeForPrivateScratchSpaceThenCorrectSizeIsReturned) {
@ -2987,7 +2987,7 @@ TEST(KernelTest, givenKernelWithoutMediaVfeStateSlot1WhenGettingSizeForPrivateSc
    MockKernelWithInternals mockKernel(*device);
    mockKernel.kernelInfo.patchInfo.mediaVfeStateSlot1 = nullptr;

-    EXPECT_EQ(0u, mockKernel.mockKernel->getPrivateScratchSize());
+    EXPECT_EQ(0u, mockKernel.mockKernel->getPrivateScratchSize(device->getRootDeviceIndex()));
 }

 TEST(KernelTest, givenKernelWithPatchInfoCollectionEnabledWhenPatchWithImplicitSurfaceCalledThenPatchInfoDataIsCollected) {
--- a/opencl/test/unit_test/kernel/kernel_transformable_tests.cpp
+++ b/opencl/test/unit_test/kernel/kernel_transformable_tests.cpp
@ -21,7 +21,7 @@ using namespace NEO;
 class KernelTransformableTest : public ::testing::Test {
  public:
    void SetUp() override {
-        rootDeviceIndex = context.getDevice(0)->getRootDeviceIndex();
+        context = std::make_unique<MockContext>(deviceFactory.rootDevices[rootDeviceIndex]);
        pKernelInfo = std::make_unique<KernelInfo>();
        KernelArgPatchInfo kernelArgPatchInfo;

@ -45,7 +45,7 @@ class KernelTransformableTest : public ::testing::Test {
        pKernelInfo->kernelArgInfo[3].isImage = true;
        pKernelInfo->argumentsToPatchNum = 4;

-        program = std::make_unique<MockProgram>(toClDeviceVector(*context.getDevice(0)));
+        program = std::make_unique<MockProgram>(context.get(), false, toClDeviceVector(*context->getDevice(0)));
        pKernel.reset(new MockKernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex)));
        ASSERT_EQ(CL_SUCCESS, pKernel->initialize());

@ -66,7 +66,8 @@ class KernelTransformableTest : public ::testing::Test {
    const int secondImageOffset = 0x40;

    cl_int retVal = CL_SUCCESS;
-    MockContext context;
+    UltClDeviceFactory deviceFactory{2, 0};
+    std::unique_ptr<MockContext> context;
    std::unique_ptr<MockProgram> program;
    std::unique_ptr<Sampler> sampler;
    std::unique_ptr<KernelInfo> pKernelInfo;
@ -75,14 +76,14 @@ class KernelTransformableTest : public ::testing::Test {
    std::unique_ptr<Image> image;
    SKernelBinaryHeaderCommon kernelHeader;
    char surfaceStateHeap[0x80];
-    uint32_t rootDeviceIndex = std::numeric_limits<uint32_t>::max();
+    const uint32_t rootDeviceIndex = 1;
 };

 HWTEST_F(KernelTransformableTest, givenKernelThatCannotTranformImagesWithTwoTransformableImagesAndTwoTransformableSamplersWhenAllArgsAreSetThenImagesAreNotTransformed) {
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
    using SURFACE_TYPE = typename RENDER_SURFACE_STATE::SURFACE_TYPE;

-    image.reset(Image3dHelper<>::create(&context));
+    image.reset(Image3dHelper<>::create(context.get()));
    sampler.reset(createTransformableSampler());
    cl_mem clImage = image.get();
    cl_sampler clSampler = sampler.get();
@ -110,7 +111,7 @@ HWTEST_F(KernelTransformableTest, givenKernelWithTwoTransformableImagesAndTwoTra
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
    using SURFACE_TYPE = typename RENDER_SURFACE_STATE::SURFACE_TYPE;

-    image.reset(Image3dHelper<>::create(&context));
+    image.reset(Image3dHelper<>::create(context.get()));
    sampler.reset(createTransformableSampler());
    cl_mem clImage = image.get();
    cl_sampler clSampler = sampler.get();
@ -137,7 +138,7 @@ HWTEST_F(KernelTransformableTest, givenKernelWithTwoTransformableImagesAndTwoTra
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
    using SURFACE_TYPE = typename RENDER_SURFACE_STATE::SURFACE_TYPE;

-    image.reset(Image3dHelper<>::create(&context));
+    image.reset(Image3dHelper<>::create(context.get()));
    sampler.reset(createTransformableSampler());
    cl_mem clImage = image.get();
    cl_sampler clSampler = sampler.get();
@ -169,7 +170,7 @@ HWTEST_F(KernelTransformableTest, givenKernelWithOneTransformableImageAndTwoTran
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
    using SURFACE_TYPE = typename RENDER_SURFACE_STATE::SURFACE_TYPE;

-    image.reset(Image3dHelper<>::create(&context));
+    image.reset(Image3dHelper<>::create(context.get()));
    sampler.reset(createTransformableSampler());
    cl_mem clImage = image.get();
    cl_sampler clSampler = sampler.get();
@ -196,7 +197,7 @@ HWTEST_F(KernelTransformableTest, givenKernelWithImages2dAndTwoTransformableSamp
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
    using SURFACE_TYPE = typename RENDER_SURFACE_STATE::SURFACE_TYPE;

-    image.reset(Image2dHelper<>::create(&context));
+    image.reset(Image2dHelper<>::create(context.get()));
    sampler.reset(createTransformableSampler());
    cl_mem clImage = image.get();
    cl_sampler clSampler = sampler.get();
@ -223,7 +224,7 @@ HWTEST_F(KernelTransformableTest, givenKernelWithTwoTransformableImagesAndTwoTra
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
    using SURFACE_TYPE = typename RENDER_SURFACE_STATE::SURFACE_TYPE;

-    image.reset(Image3dHelper<>::create(&context));
+    image.reset(Image3dHelper<>::create(context.get()));
    sampler.reset(createTransformableSampler());
    cl_mem clImage = image.get();
    cl_sampler clSampler = sampler.get();
@ -255,7 +256,7 @@ HWTEST_F(KernelTransformableTest, givenKernelWithNonTransformableSamplersWhenRes
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
    using SURFACE_TYPE = typename RENDER_SURFACE_STATE::SURFACE_TYPE;

-    image.reset(Image3dHelper<>::create(&context));
+    image.reset(Image3dHelper<>::create(context.get()));
    sampler.reset(createNonTransformableSampler());
    cl_mem clImage = image.get();
    cl_sampler clSampler = sampler.get();
@ -287,7 +288,7 @@ HWTEST_F(KernelTransformableTest, givenKernelWithoutSamplersAndTransformableImag
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
    using SURFACE_TYPE = typename RENDER_SURFACE_STATE::SURFACE_TYPE;

-    image.reset(Image3dHelper<>::create(&context));
+    image.reset(Image3dHelper<>::create(context.get()));
    cl_mem clImage = image.get();

    pKernelInfo->kernelArgInfo[0].isSampler = false;
--- a/opencl/test/unit_test/kernel/substitute_kernel_heap_tests.cpp
+++ b/opencl/test/unit_test/kernel/substitute_kernel_heap_tests.cpp
@ -34,7 +34,7 @@ TEST_F(KernelSubstituteTest, givenKernelWhenSubstituteKernelHeapWithGreaterSizeT
    const size_t newHeapSize = initialHeapSize + 1;
    char newHeap[newHeapSize];

-    kernel.mockKernel->substituteKernelHeap(newHeap, newHeapSize);
+    kernel.mockKernel->substituteKernelHeap(rootDeviceIndex, newHeap, newHeapSize);
    auto secondAllocation = kernel.kernelInfo.kernelAllocation;
    EXPECT_NE(nullptr, secondAllocation);
    auto secondAllocationSize = secondAllocation->getUnderlyingBufferSize();
@ -64,7 +64,7 @@ TEST_F(KernelSubstituteTest, givenKernelWhenSubstituteKernelHeapWithSameSizeThen
    const size_t newHeapSize = initialHeapSize;
    char newHeap[newHeapSize];

-    kernel.mockKernel->substituteKernelHeap(newHeap, newHeapSize);
+    kernel.mockKernel->substituteKernelHeap(rootDeviceIndex, newHeap, newHeapSize);
    auto secondAllocation = kernel.kernelInfo.kernelAllocation;
    EXPECT_NE(nullptr, secondAllocation);
    auto secondAllocationSize = secondAllocation->getUnderlyingBufferSize();
@ -93,7 +93,7 @@ TEST_F(KernelSubstituteTest, givenKernelWhenSubstituteKernelHeapWithSmallerSizeT
    const size_t newHeapSize = initialHeapSize - 1;
    char newHeap[newHeapSize];

-    kernel.mockKernel->substituteKernelHeap(newHeap, newHeapSize);
+    kernel.mockKernel->substituteKernelHeap(rootDeviceIndex, newHeap, newHeapSize);
    auto secondAllocation = kernel.kernelInfo.kernelAllocation;
    EXPECT_NE(nullptr, secondAllocation);
    auto secondAllocationSize = secondAllocation->getUnderlyingBufferSize();
@ -125,7 +125,7 @@ TEST_F(KernelSubstituteTest, givenKernelWithUsedKernelAllocationWhenSubstituteKe

    EXPECT_TRUE(commandStreamReceiver.getTemporaryAllocations().peekIsEmpty());

-    kernel.mockKernel->substituteKernelHeap(newHeap, newHeapSize);
+    kernel.mockKernel->substituteKernelHeap(rootDeviceIndex, newHeap, newHeapSize);
    auto secondAllocation = kernel.kernelInfo.kernelAllocation;

    EXPECT_FALSE(commandStreamReceiver.getTemporaryAllocations().peekIsEmpty());
--- a/opencl/test/unit_test/sampler/sampler_set_arg_tests.cpp
+++ b/opencl/test/unit_test/sampler/sampler_set_arg_tests.cpp
@ -122,7 +122,7 @@ HWTEST_F(SamplerSetArgTest, WhenSettingKernelArgSamplerThenSamplerStatesAreCorre
    ASSERT_EQ(CL_SUCCESS, retVal);

    auto samplerState = reinterpret_cast<const SAMPLER_STATE *>(
-        ptrOffset(pKernel->getDynamicStateHeap(),
+        ptrOffset(pKernel->getDynamicStateHeap(rootDeviceIndex),
                  pKernelInfo->kernelArgInfo[0].offsetHeap));
    EXPECT_EQ(static_cast<cl_bool>(CL_TRUE), static_cast<cl_bool>(!samplerState->getNonNormalizedCoordinateEnable()));
    EXPECT_EQ(SAMPLER_STATE::TEXTURE_COORDINATE_MODE_MIRROR, samplerState->getTcxAddressControlMode());
@ -426,7 +426,7 @@ HWTEST_P(NormalizedTest, WhenSettingKernelArgSamplerThenCoordsAreCorrect) {
    ASSERT_EQ(CL_SUCCESS, retVal);

    auto samplerState = reinterpret_cast<const SAMPLER_STATE *>(
-        ptrOffset(pKernel->getDynamicStateHeap(),
+        ptrOffset(pKernel->getDynamicStateHeap(rootDeviceIndex),
                  pKernelInfo->kernelArgInfo[0].offsetHeap));

    EXPECT_EQ(normalizedCoordinates, static_cast<cl_bool>(!samplerState->getNonNormalizedCoordinateEnable()));
@ -477,7 +477,7 @@ HWTEST_P(AddressingModeTest, WhenSettingKernelArgSamplerThenModesAreCorrect) {
    ASSERT_EQ(CL_SUCCESS, retVal);

    auto samplerState = reinterpret_cast<const SAMPLER_STATE *>(
-        ptrOffset(pKernel->getDynamicStateHeap(),
+        ptrOffset(pKernel->getDynamicStateHeap(rootDeviceIndex),
                  pKernelInfo->kernelArgInfo[0].offsetHeap));

    auto expectedModeX = SAMPLER_STATE::TEXTURE_COORDINATE_MODE_MIRROR;
@ -557,7 +557,7 @@ HWTEST_F(SamplerSetArgTest, GivenMipmapsWhenSettingKernelArgSamplerThenLodAreCor
    ASSERT_EQ(CL_SUCCESS, retVal);

    auto samplerState = reinterpret_cast<const SAMPLER_STATE *>(
-        ptrOffset(pKernel->getDynamicStateHeap(),
+        ptrOffset(pKernel->getDynamicStateHeap(rootDeviceIndex),
                  pKernelInfo->kernelArgInfo[0].offsetHeap));

    EXPECT_EQ(FamilyType::SAMPLER_STATE::MIP_MODE_FILTER_LINEAR, samplerState->getMipModeFilter());
@ -588,7 +588,7 @@ HWTEST_P(FilterModeTest, WhenSettingKernelArgSamplerThenFiltersAreCorrect) {
        retVal);

    auto samplerState = reinterpret_cast<const SAMPLER_STATE *>(
-        ptrOffset(pKernel->getDynamicStateHeap(),
+        ptrOffset(pKernel->getDynamicStateHeap(rootDeviceIndex),
                  pKernelInfo->kernelArgInfo[0].offsetHeap));

    sampler->setArg(const_cast<SAMPLER_STATE *>(samplerState));
--- a/shared/test/unit_test/cmd_parse/hw_parse.h
+++ b/shared/test/unit_test/cmd_parse/hw_parse.h
@ -131,7 +131,7 @@ struct HardwareParse {
    }

    template <typename FamilyType>
-    const void *getStatelessArgumentPointer(const Kernel &kernel, uint32_t indexArg, IndirectHeap &ioh);
+    const void *getStatelessArgumentPointer(const Kernel &kernel, uint32_t indexArg, IndirectHeap &ioh, uint32_t rootDeviceIndex);

    template <typename CmdType>
    CmdType *getCommand(GenCmdList::iterator itorStart, GenCmdList::iterator itorEnd) {
--- a/shared/test/unit_test/cmd_parse/hw_parse.inl
+++ b/shared/test/unit_test/cmd_parse/hw_parse.inl
@ -94,7 +94,7 @@ void HardwareParse::findHardwareCommands() {
 }

 template <typename FamilyType>
-const void *HardwareParse::getStatelessArgumentPointer(const Kernel &kernel, uint32_t indexArg, IndirectHeap &ioh) {
+const void *HardwareParse::getStatelessArgumentPointer(const Kernel &kernel, uint32_t indexArg, IndirectHeap &ioh, uint32_t rootDeviceIndex) {
    typedef typename FamilyType::GPGPU_WALKER GPGPU_WALKER;
    typedef typename FamilyType::STATE_BASE_ADDRESS STATE_BASE_ADDRESS;

@ -115,7 +115,7 @@ const void *HardwareParse::getStatelessArgumentPointer(const Kernel &kernel, uin
        offsetCrossThreadData);

    // Determine where the argument is
-    auto &patchInfo = kernel.getDefaultKernelInfo().patchInfo;
+    auto &patchInfo = kernel.getKernelInfo(rootDeviceIndex).patchInfo;
    for (auto &arg : patchInfo.statelessGlobalMemObjKernelArgs) {
        if (arg->ArgumentNumber == indexArg) {
            return ptrOffset(pCrossThreadData, arg->DataParamOffset);
--- a/shared/test/unit_test/gen11/cmd_parse_gen11.cpp
+++ b/shared/test/unit_test/gen11/cmd_parse_gen11.cpp
@ -85,5 +85,5 @@ template struct CmdParse<GenGfxFamily>;
 namespace NEO {
 template void HardwareParse::findHardwareCommands<ICLFamily>();
 template void HardwareParse::findHardwareCommands<ICLFamily>(IndirectHeap *);
-template const void *HardwareParse::getStatelessArgumentPointer<ICLFamily>(const Kernel &kernel, uint32_t indexArg, IndirectHeap &ioh);
+template const void *HardwareParse::getStatelessArgumentPointer<ICLFamily>(const Kernel &kernel, uint32_t indexArg, IndirectHeap &ioh, uint32_t rootDeviceIndex);
 } // namespace NEO
--- a/shared/test/unit_test/gen12lp/cmd_parse_gen12lp.cpp
+++ b/shared/test/unit_test/gen12lp/cmd_parse_gen12lp.cpp
@ -99,5 +99,5 @@ template struct CmdParse<GenGfxFamily>;
 namespace NEO {
 template void HardwareParse::findHardwareCommands<TGLLPFamily>();
 template void HardwareParse::findHardwareCommands<TGLLPFamily>(IndirectHeap *);
-template const void *HardwareParse::getStatelessArgumentPointer<TGLLPFamily>(const Kernel &kernel, uint32_t indexArg, IndirectHeap &ioh);
+template const void *HardwareParse::getStatelessArgumentPointer<TGLLPFamily>(const Kernel &kernel, uint32_t indexArg, IndirectHeap &ioh, uint32_t rootDeviceIndex);
 } // namespace NEO
--- a/shared/test/unit_test/gen8/cmd_parse_gen8.cpp
+++ b/shared/test/unit_test/gen8/cmd_parse_gen8.cpp
@ -67,5 +67,5 @@ template struct CmdParse<GenGfxFamily>;
 namespace NEO {
 template void HardwareParse::findHardwareCommands<BDWFamily>();
 template void HardwareParse::findHardwareCommands<BDWFamily>(IndirectHeap *);
-template const void *HardwareParse::getStatelessArgumentPointer<BDWFamily>(const Kernel &kernel, uint32_t indexArg, IndirectHeap &ioh);
+template const void *HardwareParse::getStatelessArgumentPointer<BDWFamily>(const Kernel &kernel, uint32_t indexArg, IndirectHeap &ioh, uint32_t rootDeviceIndex);
 } // namespace NEO
--- a/shared/test/unit_test/gen9/cmd_parse_gen9.cpp
+++ b/shared/test/unit_test/gen9/cmd_parse_gen9.cpp
@ -85,5 +85,5 @@ template struct CmdParse<GenGfxFamily>;
 namespace NEO {
 template void HardwareParse::findHardwareCommands<SKLFamily>();
 template void HardwareParse::findHardwareCommands<SKLFamily>(IndirectHeap *);
-template const void *HardwareParse::getStatelessArgumentPointer<SKLFamily>(const Kernel &kernel, uint32_t indexArg, IndirectHeap &ioh);
+template const void *HardwareParse::getStatelessArgumentPointer<SKLFamily>(const Kernel &kernel, uint32_t indexArg, IndirectHeap &ioh, uint32_t rootDeviceIndex);
 } // namespace NEO