Store device specific kernel members per root device

Related-To: NEO-5001 Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
2025-12-20 08:53:55 +08:00 · 2020-12-10 13:22:10 +00:00
parent 8d2cfd87ae
commit aa1fc85257
30 changed files with 446 additions and 306 deletions
--- a/opencl/source/command_queue/enqueue_kernel.h
+++ b/opencl/source/command_queue/enqueue_kernel.h
@@ -132,7 +132,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueKernel(
            ",", globalWorkSizeIn[2],
            ",SIMD:, ", kernelInfo.getMaxSimdSize());

-    if (totalWorkItems > kernel.maxKernelWorkGroupSize) {
+    if (totalWorkItems > kernel.getMaxKernelWorkGroupSize(rootDeviceIndex)) {
        return CL_INVALID_WORK_GROUP_SIZE;
    }

--- a/opencl/source/command_queue/gpgpu_walker_bdw_plus.inl
+++ b/opencl/source/command_queue/gpgpu_walker_bdw_plus.inl
@@ -96,31 +96,13 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
    DEBUG_BREAK_IF(simd != PARALLEL_SCHEDULER_COMPILATION_SIZE_20);

    // Patch our kernel constants
-    *scheduler.globalWorkOffsetX = 0;
-    *scheduler.globalWorkOffsetY = 0;
-    *scheduler.globalWorkOffsetZ = 0;
-
-    *scheduler.globalWorkSizeX = (uint32_t)scheduler.getGws();
-    *scheduler.globalWorkSizeY = 1;
-    *scheduler.globalWorkSizeZ = 1;
-
-    *scheduler.localWorkSizeX = (uint32_t)scheduler.getLws();
-    *scheduler.localWorkSizeY = 1;
-    *scheduler.localWorkSizeZ = 1;
-
-    *scheduler.localWorkSizeX2 = (uint32_t)scheduler.getLws();
-    *scheduler.localWorkSizeY2 = 1;
-    *scheduler.localWorkSizeZ2 = 1;
-
-    *scheduler.enqueuedLocalWorkSizeX = (uint32_t)scheduler.getLws();
-    *scheduler.enqueuedLocalWorkSizeY = 1;
-    *scheduler.enqueuedLocalWorkSizeZ = 1;
-
-    *scheduler.numWorkGroupsX = (uint32_t)(scheduler.getGws() / scheduler.getLws());
-    *scheduler.numWorkGroupsY = 0;
-    *scheduler.numWorkGroupsZ = 0;
-
-    *scheduler.workDim = 1;
+    scheduler.setGlobalWorkOffsetValues(rootDeviceIndex, 0, 0, 0);
+    scheduler.setGlobalWorkSizeValues(rootDeviceIndex, static_cast<uint32_t>(scheduler.getGws()), 1, 1);
+    scheduler.setLocalWorkSizeValues(rootDeviceIndex, static_cast<uint32_t>(scheduler.getLws()), 1, 1);
+    scheduler.setLocalWorkSize2Values(rootDeviceIndex, static_cast<uint32_t>(scheduler.getLws()), 1, 1);
+    scheduler.setEnqueuedLocalWorkSizeValues(rootDeviceIndex, static_cast<uint32_t>(scheduler.getLws()), 1, 1);
+    scheduler.setNumWorkGroupsValues(rootDeviceIndex, static_cast<uint32_t>(scheduler.getGws() / scheduler.getLws()), 0, 0);
+    scheduler.setWorkDim(rootDeviceIndex, 1);

    // Send our indirect object data
    size_t localWorkSizes[3] = {scheduler.getLws(), 1, 1};
--- a/opencl/source/command_queue/hardware_interface_base.inl
+++ b/opencl/source/command_queue/hardware_interface_base.inl
@@ -196,36 +196,23 @@ void HardwareInterface<GfxFamily>::dispatchKernelCommands(CommandQueue &commandQ

    size_t globalWorkSizes[3] = {gws.x, gws.y, gws.z};

+    auto rootDeviceIndex = commandQueue.getDevice().getRootDeviceIndex();
    // Patch our kernel constants
-    *kernel.globalWorkOffsetX = static_cast<uint32_t>(offset.x);
-    *kernel.globalWorkOffsetY = static_cast<uint32_t>(offset.y);
-    *kernel.globalWorkOffsetZ = static_cast<uint32_t>(offset.z);
+    kernel.setGlobalWorkOffsetValues(rootDeviceIndex, static_cast<uint32_t>(offset.x), static_cast<uint32_t>(offset.y), static_cast<uint32_t>(offset.z));
+    kernel.setGlobalWorkSizeValues(rootDeviceIndex, static_cast<uint32_t>(gws.x), static_cast<uint32_t>(gws.y), static_cast<uint32_t>(gws.z));

-    *kernel.globalWorkSizeX = static_cast<uint32_t>(gws.x);
-    *kernel.globalWorkSizeY = static_cast<uint32_t>(gws.y);
-    *kernel.globalWorkSizeZ = static_cast<uint32_t>(gws.z);
-
-    if (isMainKernel || (kernel.localWorkSizeX2 == &Kernel::dummyPatchLocation)) {
-        *kernel.localWorkSizeX = static_cast<uint32_t>(lws.x);
-        *kernel.localWorkSizeY = static_cast<uint32_t>(lws.y);
-        *kernel.localWorkSizeZ = static_cast<uint32_t>(lws.z);
+    if (isMainKernel || (!kernel.isLocalWorkSize2Patched(rootDeviceIndex))) {
+        kernel.setLocalWorkSizeValues(rootDeviceIndex, static_cast<uint32_t>(lws.x), static_cast<uint32_t>(lws.y), static_cast<uint32_t>(lws.z));
    }

-    *kernel.localWorkSizeX2 = static_cast<uint32_t>(lws.x);
-    *kernel.localWorkSizeY2 = static_cast<uint32_t>(lws.y);
-    *kernel.localWorkSizeZ2 = static_cast<uint32_t>(lws.z);
-
-    *kernel.enqueuedLocalWorkSizeX = static_cast<uint32_t>(elws.x);
-    *kernel.enqueuedLocalWorkSizeY = static_cast<uint32_t>(elws.y);
-    *kernel.enqueuedLocalWorkSizeZ = static_cast<uint32_t>(elws.z);
+    kernel.setLocalWorkSize2Values(rootDeviceIndex, static_cast<uint32_t>(lws.x), static_cast<uint32_t>(lws.y), static_cast<uint32_t>(lws.z));
+    kernel.setEnqueuedLocalWorkSizeValues(rootDeviceIndex, static_cast<uint32_t>(elws.x), static_cast<uint32_t>(elws.y), static_cast<uint32_t>(elws.z));

    if (isMainKernel) {
-        *kernel.numWorkGroupsX = static_cast<uint32_t>(totalNumberOfWorkgroups.x);
-        *kernel.numWorkGroupsY = static_cast<uint32_t>(totalNumberOfWorkgroups.y);
-        *kernel.numWorkGroupsZ = static_cast<uint32_t>(totalNumberOfWorkgroups.z);
+        kernel.setNumWorkGroupsValues(rootDeviceIndex, static_cast<uint32_t>(totalNumberOfWorkgroups.x), static_cast<uint32_t>(totalNumberOfWorkgroups.y), static_cast<uint32_t>(totalNumberOfWorkgroups.z));
    }

-    *kernel.workDim = dim;
+    kernel.setWorkDim(rootDeviceIndex, dim);

    // Send our indirect object data
    size_t localWorkSizes[3] = {lws.x, lws.y, lws.z};
--- a/opencl/source/command_queue/local_work_size.cpp
+++ b/opencl/source/command_queue/local_work_size.cpp
@@ -427,7 +427,7 @@ Vec3<size_t> computeWorkgroupSize(const DispatchInfo &dispatchInfo) {
            size_t workItems[3] = {dispatchInfo.getGWS().x, dispatchInfo.getGWS().y, dispatchInfo.getGWS().z};
            computeWorkgroupSizeND(wsInfo, workGroupSize, workItems, dispatchInfo.getDim());
        } else {
-            auto maxWorkGroupSize = kernel->maxKernelWorkGroupSize;
+            auto maxWorkGroupSize = kernel->getMaxKernelWorkGroupSize(rootDeviceIndex);
            auto simd = kernel->getKernelInfo(rootDeviceIndex).getMaxSimdSize();
            size_t workItems[3] = {dispatchInfo.getGWS().x, dispatchInfo.getGWS().y, dispatchInfo.getGWS().z};
            if (dispatchInfo.getDim() == 1) {
--- a/opencl/source/context/context.h
+++ b/opencl/source/context/context.h
@@ -143,7 +143,7 @@ class Context : public BaseObject<_cl_context> {

    ContextType peekContextType() const { return contextType; }

-    SchedulerKernel &getSchedulerKernel();
+    MOCKABLE_VIRTUAL SchedulerKernel &getSchedulerKernel();

    bool isDeviceAssociated(const ClDevice &clDevice) const;
    ClDevice *getSubDeviceByIndex(uint32_t subDeviceIndex) const;
--- a/opencl/source/gtpin/gtpin_callbacks.cpp
+++ b/opencl/source/gtpin/gtpin_callbacks.cpp
@@ -63,10 +63,10 @@ void gtpinNotifyKernelCreate(cl_kernel kernel) {
    }
    if (isGTPinInitialized) {
        auto pKernel = castToObjectOrAbort<Kernel>(kernel);
-        size_t gtpinBTI = pKernel->getNumberOfBindingTableStates();
-        // Enlarge local copy of SSH by 1 SS
        auto device = pKernel->getDevices()[0];
        auto rootDeviceIndex = device->getRootDeviceIndex();
+        size_t gtpinBTI = pKernel->getNumberOfBindingTableStates(rootDeviceIndex);
+        // Enlarge local copy of SSH by 1 SS
        GFXCORE_FAMILY genFamily = device->getHardwareInfo().platform.eRenderCoreFamily;
        GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(genFamily);
        if (pKernel->isParentKernel || !gtpinHelper.addSurfaceState(pKernel, rootDeviceIndex)) {
@@ -138,7 +138,7 @@ void gtpinNotifyKernelSubmit(cl_kernel kernel, void *pCmdQueue) {
        }
        GFXCORE_FAMILY genFamily = device.getHardwareInfo().platform.eRenderCoreFamily;
        GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(genFamily);
-        size_t gtpinBTI = pKernel->getNumberOfBindingTableStates() - 1;
+        size_t gtpinBTI = pKernel->getNumberOfBindingTableStates(rootDeviceIndex) - 1;
        void *pSurfaceState = gtpinHelper.getSurfaceState(pKernel, gtpinBTI, rootDeviceIndex);
        cl_mem buffer = (cl_mem)resource;
        auto pBuffer = castToObjectOrAbort<Buffer>(buffer);
--- a/opencl/source/gtpin/gtpin_hw_helper.inl
+++ b/opencl/source/gtpin/gtpin_hw_helper.inl
@@ -27,7 +27,7 @@ bool GTPinHwHelperHw<GfxFamily>::addSurfaceState(Kernel *pKernel, uint32_t rootD
    size_t ssSize = sizeof(RENDER_SURFACE_STATE);
    size_t btsSize = sizeof(BINDING_TABLE_STATE);
    size_t sizeToEnlarge = ssSize + btsSize;
-    size_t currBTOffset = pKernel->getBindingTableOffset();
+    size_t currBTOffset = pKernel->getBindingTableOffset(rootDeviceIndex);
    size_t currSurfaceStateSize = currBTOffset;
    char *pSsh = static_cast<char *>(pKernel->getSurfaceStateHeap(rootDeviceIndex));
    char *pNewSsh = new char[sshSize + sizeToEnlarge];
@@ -35,7 +35,7 @@ bool GTPinHwHelperHw<GfxFamily>::addSurfaceState(Kernel *pKernel, uint32_t rootD
    RENDER_SURFACE_STATE *pSS = reinterpret_cast<RENDER_SURFACE_STATE *>(pNewSsh + currSurfaceStateSize);
    *pSS = GfxFamily::cmdInitRenderSurfaceState;
    size_t newSurfaceStateSize = currSurfaceStateSize + ssSize;
-    size_t currBTCount = pKernel->getNumberOfBindingTableStates();
+    size_t currBTCount = pKernel->getNumberOfBindingTableStates(rootDeviceIndex);
    memcpy_s(pNewSsh + newSurfaceStateSize, sshSize + sizeToEnlarge - newSurfaceStateSize, pSsh + currBTOffset, currBTCount * btsSize);
    BINDING_TABLE_STATE *pNewBTS = reinterpret_cast<BINDING_TABLE_STATE *>(pNewSsh + newSurfaceStateSize + currBTCount * btsSize);
    *pNewBTS = GfxFamily::cmdInitBindingTableState;
@@ -48,10 +48,10 @@ template <typename GfxFamily>
 void *GTPinHwHelperHw<GfxFamily>::getSurfaceState(Kernel *pKernel, size_t bti, uint32_t rootDeviceIndex) {
    using BINDING_TABLE_STATE = typename GfxFamily::BINDING_TABLE_STATE;

-    if ((nullptr == pKernel->getSurfaceStateHeap(rootDeviceIndex)) || (bti >= pKernel->getNumberOfBindingTableStates())) {
+    if ((nullptr == pKernel->getSurfaceStateHeap(rootDeviceIndex)) || (bti >= pKernel->getNumberOfBindingTableStates(rootDeviceIndex))) {
        return nullptr;
    }
-    auto *pBts = reinterpret_cast<BINDING_TABLE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex), (pKernel->getBindingTableOffset() + bti * sizeof(BINDING_TABLE_STATE))));
+    auto *pBts = reinterpret_cast<BINDING_TABLE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex), (pKernel->getBindingTableOffset(rootDeviceIndex) + bti * sizeof(BINDING_TABLE_STATE))));
    auto pSurfaceState = ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex), pBts->getSurfaceStatePointer());
    return pSurfaceState;
 }
--- a/opencl/source/helpers/hardware_commands_helper_base.inl
+++ b/opencl/source/helpers/hardware_commands_helper_base.inl
@@ -238,7 +238,7 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(

    auto dstBindingTablePointer = EncodeSurfaceState<GfxFamily>::pushBindingTableAndSurfaceStates(ssh, (kernelInfo.patchInfo.bindingTableState != nullptr) ? kernelInfo.patchInfo.bindingTableState->Count : 0,
                                                                                                  kernel.getSurfaceStateHeap(rootDeviceIndex), kernel.getSurfaceStateHeapSize(rootDeviceIndex),
-                                                                                                  kernel.getNumberOfBindingTableStates(), kernel.getBindingTableOffset());
+                                                                                                  kernel.getNumberOfBindingTableStates(rootDeviceIndex), kernel.getBindingTableOffset(rootDeviceIndex));

    // Copy our sampler state if it exists
    uint32_t samplerStateOffset = 0;
@@ -281,7 +281,7 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
    uint64_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable + interfaceDescriptorIndex * sizeof(INTERFACE_DESCRIPTOR_DATA);
    DEBUG_BREAK_IF(patchInfo.executionEnvironment == nullptr);

-    auto bindingTablePrefetchSize = std::min(31u, static_cast<uint32_t>(kernel.getNumberOfBindingTableStates()));
+    auto bindingTablePrefetchSize = std::min(31u, static_cast<uint32_t>(kernel.getNumberOfBindingTableStates(rootDeviceIndex)));
    if (resetBindingTablePrefetch(kernel)) {
        bindingTablePrefetchSize = 0;
    }
--- a/opencl/source/kernel/kernel.cpp
+++ b/opencl/source/kernel/kernel.cpp
@@ -78,8 +78,9 @@ Kernel::Kernel(Program *programArg, const KernelInfoContainer &kernelInfosArg, b
    program->retain();
    program->retainForKernel();
    imageTransformer.reset(new ImageTransformer);
-
-    maxKernelWorkGroupSize = static_cast<uint32_t>(deviceVector[0]->getSharedDeviceInfo().maxWorkGroupSize);
+    for (const auto &pClDevice : deviceVector) {
+        kernelDeviceInfos[pClDevice->getRootDeviceIndex()].maxKernelWorkGroupSize = static_cast<uint32_t>(pClDevice->getSharedDeviceInfo().maxWorkGroupSize);
+    }
 }

 Kernel::~Kernel() {
@@ -170,9 +171,9 @@ template void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData
 cl_int Kernel::initialize() {
    cl_int retVal = CL_OUT_OF_HOST_MEMORY;
    do {
-        reconfigureKernel();
        auto pClDevice = &getDevice();
        auto rootDeviceIndex = pClDevice->getRootDeviceIndex();
+        reconfigureKernel(rootDeviceIndex);
        auto &hwInfo = pClDevice->getHardwareInfo();
        auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
        auto &kernelInfo = *kernelInfos[rootDeviceIndex];
@@ -201,40 +202,84 @@ cl_int Kernel::initialize() {
            }

            auto crossThread = reinterpret_cast<uint32_t *>(kernelDeviceInfos[rootDeviceIndex].crossThreadData);
-            globalWorkOffsetX = workloadInfo.globalWorkOffsetOffsets[0] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.globalWorkOffsetOffsets[0]) : globalWorkOffsetX;
-            globalWorkOffsetY = workloadInfo.globalWorkOffsetOffsets[1] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.globalWorkOffsetOffsets[1]) : globalWorkOffsetY;
-            globalWorkOffsetZ = workloadInfo.globalWorkOffsetOffsets[2] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.globalWorkOffsetOffsets[2]) : globalWorkOffsetZ;
+            kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetX = workloadInfo.globalWorkOffsetOffsets[0] != WorkloadInfo::undefinedOffset
+                                                                       ? ptrOffset(crossThread, workloadInfo.globalWorkOffsetOffsets[0])
+                                                                       : kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetX;
+            kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetY = workloadInfo.globalWorkOffsetOffsets[1] != WorkloadInfo::undefinedOffset
+                                                                       ? ptrOffset(crossThread, workloadInfo.globalWorkOffsetOffsets[1])
+                                                                       : kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetY;
+            kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetZ = workloadInfo.globalWorkOffsetOffsets[2] != WorkloadInfo::undefinedOffset
+                                                                       ? ptrOffset(crossThread, workloadInfo.globalWorkOffsetOffsets[2])
+                                                                       : kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetZ;

-            localWorkSizeX = workloadInfo.localWorkSizeOffsets[0] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets[0]) : localWorkSizeX;
-            localWorkSizeY = workloadInfo.localWorkSizeOffsets[1] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets[1]) : localWorkSizeY;
-            localWorkSizeZ = workloadInfo.localWorkSizeOffsets[2] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets[2]) : localWorkSizeZ;
+            kernelDeviceInfos[rootDeviceIndex].localWorkSizeX = workloadInfo.localWorkSizeOffsets[0] != WorkloadInfo::undefinedOffset
+                                                                    ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets[0])
+                                                                    : kernelDeviceInfos[rootDeviceIndex].localWorkSizeX;
+            kernelDeviceInfos[rootDeviceIndex].localWorkSizeY = workloadInfo.localWorkSizeOffsets[1] != WorkloadInfo::undefinedOffset
+                                                                    ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets[1])
+                                                                    : kernelDeviceInfos[rootDeviceIndex].localWorkSizeY;
+            kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ = workloadInfo.localWorkSizeOffsets[2] != WorkloadInfo::undefinedOffset
+                                                                    ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets[2])
+                                                                    : kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ;

-            localWorkSizeX2 = workloadInfo.localWorkSizeOffsets2[0] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets2[0]) : localWorkSizeX2;
-            localWorkSizeY2 = workloadInfo.localWorkSizeOffsets2[1] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets2[1]) : localWorkSizeY2;
-            localWorkSizeZ2 = workloadInfo.localWorkSizeOffsets2[2] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets2[2]) : localWorkSizeZ2;
+            kernelDeviceInfos[rootDeviceIndex].localWorkSizeX2 = workloadInfo.localWorkSizeOffsets2[0] != WorkloadInfo::undefinedOffset
+                                                                     ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets2[0])
+                                                                     : kernelDeviceInfos[rootDeviceIndex].localWorkSizeX2;
+            kernelDeviceInfos[rootDeviceIndex].localWorkSizeY2 = workloadInfo.localWorkSizeOffsets2[1] != WorkloadInfo::undefinedOffset
+                                                                     ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets2[1])
+                                                                     : kernelDeviceInfos[rootDeviceIndex].localWorkSizeY2;
+            kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ2 = workloadInfo.localWorkSizeOffsets2[2] != WorkloadInfo::undefinedOffset
+                                                                     ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets2[2])
+                                                                     : kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ2;

-            globalWorkSizeX = workloadInfo.globalWorkSizeOffsets[0] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.globalWorkSizeOffsets[0]) : globalWorkSizeX;
-            globalWorkSizeY = workloadInfo.globalWorkSizeOffsets[1] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.globalWorkSizeOffsets[1]) : globalWorkSizeY;
-            globalWorkSizeZ = workloadInfo.globalWorkSizeOffsets[2] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.globalWorkSizeOffsets[2]) : globalWorkSizeZ;
+            kernelDeviceInfos[rootDeviceIndex].globalWorkSizeX = workloadInfo.globalWorkSizeOffsets[0] != WorkloadInfo::undefinedOffset
+                                                                     ? ptrOffset(crossThread, workloadInfo.globalWorkSizeOffsets[0])
+                                                                     : kernelDeviceInfos[rootDeviceIndex].globalWorkSizeX;
+            kernelDeviceInfos[rootDeviceIndex].globalWorkSizeY = workloadInfo.globalWorkSizeOffsets[1] != WorkloadInfo::undefinedOffset
+                                                                     ? ptrOffset(crossThread, workloadInfo.globalWorkSizeOffsets[1])
+                                                                     : kernelDeviceInfos[rootDeviceIndex].globalWorkSizeY;
+            kernelDeviceInfos[rootDeviceIndex].globalWorkSizeZ = workloadInfo.globalWorkSizeOffsets[2] != WorkloadInfo::undefinedOffset
+                                                                     ? ptrOffset(crossThread, workloadInfo.globalWorkSizeOffsets[2])
+                                                                     : kernelDeviceInfos[rootDeviceIndex].globalWorkSizeZ;

-            enqueuedLocalWorkSizeX = workloadInfo.enqueuedLocalWorkSizeOffsets[0] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.enqueuedLocalWorkSizeOffsets[0]) : enqueuedLocalWorkSizeX;
-            enqueuedLocalWorkSizeY = workloadInfo.enqueuedLocalWorkSizeOffsets[1] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.enqueuedLocalWorkSizeOffsets[1]) : enqueuedLocalWorkSizeY;
-            enqueuedLocalWorkSizeZ = workloadInfo.enqueuedLocalWorkSizeOffsets[2] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.enqueuedLocalWorkSizeOffsets[2]) : enqueuedLocalWorkSizeZ;
+            kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeX = workloadInfo.enqueuedLocalWorkSizeOffsets[0] != WorkloadInfo::undefinedOffset
+                                                                            ? ptrOffset(crossThread, workloadInfo.enqueuedLocalWorkSizeOffsets[0])
+                                                                            : kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeX;
+            kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeY = workloadInfo.enqueuedLocalWorkSizeOffsets[1] != WorkloadInfo::undefinedOffset
+                                                                            ? ptrOffset(crossThread, workloadInfo.enqueuedLocalWorkSizeOffsets[1])
+                                                                            : kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeY;
+            kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeZ = workloadInfo.enqueuedLocalWorkSizeOffsets[2] != WorkloadInfo::undefinedOffset
+                                                                            ? ptrOffset(crossThread, workloadInfo.enqueuedLocalWorkSizeOffsets[2])
+                                                                            : kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeZ;

-            numWorkGroupsX = workloadInfo.numWorkGroupsOffset[0] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.numWorkGroupsOffset[0]) : numWorkGroupsX;
-            numWorkGroupsY = workloadInfo.numWorkGroupsOffset[1] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.numWorkGroupsOffset[1]) : numWorkGroupsY;
-            numWorkGroupsZ = workloadInfo.numWorkGroupsOffset[2] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.numWorkGroupsOffset[2]) : numWorkGroupsZ;
+            kernelDeviceInfos[rootDeviceIndex].numWorkGroupsX = workloadInfo.numWorkGroupsOffset[0] != WorkloadInfo::undefinedOffset
+                                                                    ? ptrOffset(crossThread, workloadInfo.numWorkGroupsOffset[0])
+                                                                    : kernelDeviceInfos[rootDeviceIndex].numWorkGroupsX;
+            kernelDeviceInfos[rootDeviceIndex].numWorkGroupsY = workloadInfo.numWorkGroupsOffset[1] != WorkloadInfo::undefinedOffset
+                                                                    ? ptrOffset(crossThread, workloadInfo.numWorkGroupsOffset[1])
+                                                                    : kernelDeviceInfos[rootDeviceIndex].numWorkGroupsY;
+            kernelDeviceInfos[rootDeviceIndex].numWorkGroupsZ = workloadInfo.numWorkGroupsOffset[2] != WorkloadInfo::undefinedOffset
+                                                                    ? ptrOffset(crossThread, workloadInfo.numWorkGroupsOffset[2])
+                                                                    : kernelDeviceInfos[rootDeviceIndex].numWorkGroupsZ;

-            maxWorkGroupSizeForCrossThreadData = workloadInfo.maxWorkGroupSizeOffset != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.maxWorkGroupSizeOffset) : maxWorkGroupSizeForCrossThreadData;
-            workDim = workloadInfo.workDimOffset != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.workDimOffset) : workDim;
-            dataParameterSimdSize = workloadInfo.simdSizeOffset != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.simdSizeOffset) : dataParameterSimdSize;
-            parentEventOffset = workloadInfo.parentEventOffset != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.parentEventOffset) : parentEventOffset;
-            preferredWkgMultipleOffset = workloadInfo.preferredWkgMultipleOffset != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.preferredWkgMultipleOffset) : preferredWkgMultipleOffset;
+            kernelDeviceInfos[rootDeviceIndex].maxWorkGroupSizeForCrossThreadData = workloadInfo.maxWorkGroupSizeOffset != WorkloadInfo::undefinedOffset
+                                                                                        ? ptrOffset(crossThread, workloadInfo.maxWorkGroupSizeOffset)
+                                                                                        : kernelDeviceInfos[rootDeviceIndex].maxWorkGroupSizeForCrossThreadData;
+            kernelDeviceInfos[rootDeviceIndex].workDim = workloadInfo.workDimOffset != WorkloadInfo::undefinedOffset
+                                                             ? ptrOffset(crossThread, workloadInfo.workDimOffset)
+                                                             : kernelDeviceInfos[rootDeviceIndex].workDim;
+            kernelDeviceInfos[rootDeviceIndex].dataParameterSimdSize = workloadInfo.simdSizeOffset != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.simdSizeOffset) : kernelDeviceInfos[rootDeviceIndex].dataParameterSimdSize;
+            kernelDeviceInfos[rootDeviceIndex].parentEventOffset = workloadInfo.parentEventOffset != WorkloadInfo::undefinedOffset
+                                                                       ? ptrOffset(crossThread, workloadInfo.parentEventOffset)
+                                                                       : kernelDeviceInfos[rootDeviceIndex].parentEventOffset;
+            kernelDeviceInfos[rootDeviceIndex].preferredWkgMultipleOffset = workloadInfo.preferredWkgMultipleOffset != WorkloadInfo::undefinedOffset
+                                                                                ? ptrOffset(crossThread, workloadInfo.preferredWkgMultipleOffset)
+                                                                                : kernelDeviceInfos[rootDeviceIndex].preferredWkgMultipleOffset;

-            *maxWorkGroupSizeForCrossThreadData = maxKernelWorkGroupSize;
-            *dataParameterSimdSize = maxSimdSize;
-            *preferredWkgMultipleOffset = maxSimdSize;
-            *parentEventOffset = WorkloadInfo::invalidParentEvent;
+            *kernelDeviceInfos[rootDeviceIndex].maxWorkGroupSizeForCrossThreadData = kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize;
+            *kernelDeviceInfos[rootDeviceIndex].dataParameterSimdSize = maxSimdSize;
+            *kernelDeviceInfos[rootDeviceIndex].preferredWkgMultipleOffset = maxSimdSize;
+            *kernelDeviceInfos[rootDeviceIndex].parentEventOffset = WorkloadInfo::invalidParentEvent;
        }

        // allocate our own SSH, if necessary
@@ -247,8 +292,8 @@ cl_int Kernel::initialize() {
            memcpy_s(kernelDeviceInfos[rootDeviceIndex].pSshLocal.get(), kernelDeviceInfos[rootDeviceIndex].sshLocalSize,
                     heapInfo.pSsh, kernelDeviceInfos[rootDeviceIndex].sshLocalSize);
        }
-        numberOfBindingTableStates = (patchInfo.bindingTableState != nullptr) ? patchInfo.bindingTableState->Count : 0;
-        localBindingTableOffset = (patchInfo.bindingTableState != nullptr) ? patchInfo.bindingTableState->Offset : 0;
+        kernelDeviceInfos[rootDeviceIndex].numberOfBindingTableStates = (patchInfo.bindingTableState != nullptr) ? patchInfo.bindingTableState->Count : 0;
+        kernelDeviceInfos[rootDeviceIndex].localBindingTableOffset = (patchInfo.bindingTableState != nullptr) ? patchInfo.bindingTableState->Offset : 0;

        // patch crossthread data and ssh with inline surfaces, if necessary
        auto perHwThreadPrivateMemorySize = PatchTokenBinary::getPerHwThreadPrivateSurfaceSize(patchInfo.pAllocateStatelessPrivateSurface, kernelInfo.getMaxSimdSize());
@@ -582,7 +627,7 @@ cl_int Kernel::getWorkGroupInfo(ClDevice &device, cl_kernel_work_group_info para

    switch (paramName) {
    case CL_KERNEL_WORK_GROUP_SIZE:
-        maxWorkgroupSize = this->maxKernelWorkGroupSize;
+        maxWorkgroupSize = kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize;
        if (DebugManager.flags.UseMaxSimdSizeToDeduceMaxWorkgroupSize.get()) {
            auto divisionSize = CommonConstants::maximalSimdSize / patchInfo.executionEnvironment->LargestCompiledSIMDSize;
            maxWorkgroupSize /= divisionSize;
@@ -646,9 +691,10 @@ cl_int Kernel::getSubGroupInfo(ClDevice &clDevice, cl_kernel_sub_group_info para
                               size_t *paramValueSizeRet) const {
    size_t numDimensions = 0;
    size_t WGS = 1;
-    const auto &kernelInfo = getKernelInfo(clDevice.getRootDeviceIndex());
+    auto rootDeviceIndex = clDevice.getRootDeviceIndex();
+    const auto &kernelInfo = getKernelInfo(rootDeviceIndex);
    auto maxSimdSize = static_cast<size_t>(kernelInfo.getMaxSimdSize());
-    auto maxRequiredWorkGroupSize = static_cast<size_t>(kernelInfo.getMaxRequiredWorkGroupSize(maxKernelWorkGroupSize));
+    auto maxRequiredWorkGroupSize = static_cast<size_t>(kernelInfo.getMaxRequiredWorkGroupSize(getMaxKernelWorkGroupSize(rootDeviceIndex)));
    auto largestCompiledSIMDSize = static_cast<size_t>(kernelInfo.patchInfo.executionEnvironment->LargestCompiledSIMDSize);

    GetInfoHelper info(paramValue, paramValueSize, paramValueSizeRet);
@@ -811,15 +857,15 @@ size_t Kernel::getSurfaceStateHeapSize(uint32_t rootDeviceIndex) const {
               : 0;
 }

-size_t Kernel::getNumberOfBindingTableStates() const {
-    return numberOfBindingTableStates;
+size_t Kernel::getNumberOfBindingTableStates(uint32_t rootDeviceIndex) const { // returns the binding table state count kept for the given root device
+    return kernelDeviceInfos[rootDeviceIndex].numberOfBindingTableStates; // indexed directly by rootDeviceIndex; no range check is done here
 }

 void Kernel::resizeSurfaceStateHeap(uint32_t rootDeviceIndex, void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset) {
    kernelDeviceInfos[rootDeviceIndex].pSshLocal.reset(static_cast<char *>(pNewSsh));
    kernelDeviceInfos[rootDeviceIndex].sshLocalSize = static_cast<uint32_t>(newSshSize);
-    numberOfBindingTableStates = newBindingTableCount;
-    localBindingTableOffset = newBindingTableOffset;
+    kernelDeviceInfos[rootDeviceIndex].numberOfBindingTableStates = newBindingTableCount; // binding table count is recorded for this root device's entry only
+    kernelDeviceInfos[rootDeviceIndex].localBindingTableOffset = newBindingTableOffset; // binding table offset is recorded alongside the heap pointer and size set above
 }

 cl_int Kernel::setArg(uint32_t argIndex, size_t argSize, const void *argVal) {
@@ -2564,4 +2610,51 @@ const KernelInfo &Kernel::getDefaultKernelInfo() const {
    UNRECOVERABLE_IF(!pKernelInfo);
    return *pKernelInfo;
 }
+void Kernel::setGlobalWorkOffsetValues(uint32_t rootDeviceIndex, uint32_t globalWorkOffsetX, uint32_t globalWorkOffsetY, uint32_t globalWorkOffsetZ) { // Writes the X/Y/Z global work offsets through the patch pointers of the given root device.
+    *kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetX = globalWorkOffsetX;
+    *kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetY = globalWorkOffsetY;
+    *kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetZ = globalWorkOffsetZ; // Z is written to its own slot so it cannot overwrite the Y offset
+}
+
+void Kernel::setGlobalWorkSizeValues(uint32_t rootDeviceIndex, uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ) { // Writes the X/Y/Z global work sizes through the globalWorkSize patch pointers of the given root device.
+    *kernelDeviceInfos[rootDeviceIndex].globalWorkSizeX = globalWorkSizeX;
+    *kernelDeviceInfos[rootDeviceIndex].globalWorkSizeY = globalWorkSizeY;
+    *kernelDeviceInfos[rootDeviceIndex].globalWorkSizeZ = globalWorkSizeZ;
+}
+
+void Kernel::setLocalWorkSizeValues(uint32_t rootDeviceIndex, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ) { // Writes the X/Y/Z local work sizes through the localWorkSize patch pointers of the given root device.
+    *kernelDeviceInfos[rootDeviceIndex].localWorkSizeX = localWorkSizeX;
+    *kernelDeviceInfos[rootDeviceIndex].localWorkSizeY = localWorkSizeY;
+    *kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ = localWorkSizeZ;
+}
+
+void Kernel::setLocalWorkSize2Values(uint32_t rootDeviceIndex, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ) { // Writes the X/Y/Z values through the second set of local work size patch pointers (localWorkSizeX2/Y2/Z2).
+    *kernelDeviceInfos[rootDeviceIndex].localWorkSizeX2 = localWorkSizeX;
+    *kernelDeviceInfos[rootDeviceIndex].localWorkSizeY2 = localWorkSizeY;
+    *kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ2 = localWorkSizeZ;
+}
+
+void Kernel::setEnqueuedLocalWorkSizeValues(uint32_t rootDeviceIndex, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ) { // Writes the X/Y/Z values through the enqueuedLocalWorkSize patch pointers of the given root device.
+    *kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeX = localWorkSizeX;
+    *kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeY = localWorkSizeY;
+    *kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeZ = localWorkSizeZ;
+}
+
+bool Kernel::isLocalWorkSize2Patched(uint32_t rootDeviceIndex) { // True when this root device's localWorkSizeX2 pointer no longer targets dummyPatchLocation.
+    return kernelDeviceInfos[rootDeviceIndex].localWorkSizeX2 != &dummyPatchLocation; // only the X2 pointer is inspected; Y2/Z2 are not checked
+}
+
+void Kernel::setNumWorkGroupsValues(uint32_t rootDeviceIndex, uint32_t numWorkGroupsX, uint32_t numWorkGroupsY, uint32_t numWorkGroupsZ) { // Writes the X/Y/Z work group counts through the numWorkGroups patch pointers of the given root device.
+    *kernelDeviceInfos[rootDeviceIndex].numWorkGroupsX = numWorkGroupsX;
+    *kernelDeviceInfos[rootDeviceIndex].numWorkGroupsY = numWorkGroupsY;
+    *kernelDeviceInfos[rootDeviceIndex].numWorkGroupsZ = numWorkGroupsZ;
+}
+
+void Kernel::setWorkDim(uint32_t rootDeviceIndex, uint32_t workDim) { // Writes workDim through the workDim patch pointer of the given root device.
+    *kernelDeviceInfos[rootDeviceIndex].workDim = workDim;
+}
+
+uint32_t Kernel::getMaxKernelWorkGroupSize(uint32_t rootDeviceIndex) const { // Returns the maxKernelWorkGroupSize value stored for the given root device.
+    return kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize;
+}
 } // namespace NEO
--- a/opencl/source/kernel/kernel.h
+++ b/opencl/source/kernel/kernel.h
@@ -161,9 +161,9 @@ class Kernel : public BaseObject<_cl_kernel> {
    size_t getKernelHeapSize(uint32_t rootDeviceIndex) const;
    size_t getSurfaceStateHeapSize(uint32_t rootDeviceIndex) const;
    size_t getDynamicStateHeapSize(uint32_t rootDeviceIndex) const;
-    size_t getNumberOfBindingTableStates() const;
-    size_t getBindingTableOffset() const {
-        return localBindingTableOffset;
+    size_t getNumberOfBindingTableStates(uint32_t rootDeviceIndex) const;
+    size_t getBindingTableOffset(uint32_t rootDeviceIndex) const {
+        return kernelDeviceInfos[rootDeviceIndex].localBindingTableOffset;
    }

    void resizeSurfaceStateHeap(uint32_t rootDeviceIndex, void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset);
@@ -304,37 +304,6 @@ class Kernel : public BaseObject<_cl_kernel> {
                                            size_t argSize,
                                            const void *argValue) const;

-    uint32_t *globalWorkOffsetX = &Kernel::dummyPatchLocation;
-    uint32_t *globalWorkOffsetY = &Kernel::dummyPatchLocation;
-    uint32_t *globalWorkOffsetZ = &Kernel::dummyPatchLocation;
-
-    uint32_t *localWorkSizeX = &Kernel::dummyPatchLocation;
-    uint32_t *localWorkSizeY = &Kernel::dummyPatchLocation;
-    uint32_t *localWorkSizeZ = &Kernel::dummyPatchLocation;
-
-    uint32_t *localWorkSizeX2 = &Kernel::dummyPatchLocation;
-    uint32_t *localWorkSizeY2 = &Kernel::dummyPatchLocation;
-    uint32_t *localWorkSizeZ2 = &Kernel::dummyPatchLocation;
-
-    uint32_t *globalWorkSizeX = &Kernel::dummyPatchLocation;
-    uint32_t *globalWorkSizeY = &Kernel::dummyPatchLocation;
-    uint32_t *globalWorkSizeZ = &Kernel::dummyPatchLocation;
-
-    uint32_t *enqueuedLocalWorkSizeX = &Kernel::dummyPatchLocation;
-    uint32_t *enqueuedLocalWorkSizeY = &Kernel::dummyPatchLocation;
-    uint32_t *enqueuedLocalWorkSizeZ = &Kernel::dummyPatchLocation;
-
-    uint32_t *numWorkGroupsX = &Kernel::dummyPatchLocation;
-    uint32_t *numWorkGroupsY = &Kernel::dummyPatchLocation;
-    uint32_t *numWorkGroupsZ = &Kernel::dummyPatchLocation;
-
-    uint32_t *maxWorkGroupSizeForCrossThreadData = &Kernel::dummyPatchLocation;
-    uint32_t maxKernelWorkGroupSize = 0;
-    uint32_t *workDim = &Kernel::dummyPatchLocation;
-    uint32_t *dataParameterSimdSize = &Kernel::dummyPatchLocation;
-    uint32_t *parentEventOffset = &Kernel::dummyPatchLocation;
-    uint32_t *preferredWkgMultipleOffset = &Kernel::dummyPatchLocation;
-
    static uint32_t dummyPatchLocation;

    std::vector<size_t> slmSizes;
@@ -426,6 +395,16 @@ class Kernel : public BaseObject<_cl_kernel> {
    }
    const KernelInfo &getDefaultKernelInfo() const;

+    void setGlobalWorkOffsetValues(uint32_t rootDeviceIndex, uint32_t globalWorkOffsetX, uint32_t globalWorkOffsetY, uint32_t globalWorkOffsetZ);
+    void setGlobalWorkSizeValues(uint32_t rootDeviceIndex, uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ);
+    void setLocalWorkSizeValues(uint32_t rootDeviceIndex, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ);
+    void setLocalWorkSize2Values(uint32_t rootDeviceIndex, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ);
+    void setEnqueuedLocalWorkSizeValues(uint32_t rootDeviceIndex, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ);
+    bool isLocalWorkSize2Patched(uint32_t rootDeviceIndex);
+    void setNumWorkGroupsValues(uint32_t rootDeviceIndex, uint32_t numWorkGroupsX, uint32_t numWorkGroupsY, uint32_t numWorkGroupsZ);
+    void setWorkDim(uint32_t rootDeviceIndex, uint32_t workDim);
+    uint32_t getMaxKernelWorkGroupSize(uint32_t rootDeviceIndex) const;
+
  protected:
    struct ObjectCounts {
        uint32_t imageCount;
@@ -511,7 +490,7 @@ class Kernel : public BaseObject<_cl_kernel> {

    void resolveArgs();

-    void reconfigureKernel();
+    void reconfigureKernel(uint32_t rootDeviceIndex);

    void addAllocationToCacheFlushVector(uint32_t argIndex, GraphicsAllocation *argAllocation);
    bool allocationForCacheFlush(GraphicsAllocation *argAllocation) const;
@@ -534,9 +513,6 @@ class Kernel : public BaseObject<_cl_kernel> {

    AuxTranslationDirection auxTranslationDirection = AuxTranslationDirection::None;

-    size_t numberOfBindingTableStates = 0u;
-    size_t localBindingTableOffset = 0u;
-
    GraphicsAllocation *kernelReflectionSurface = nullptr;

    bool usingSharedObjArgs = false;
@@ -561,6 +537,40 @@ class Kernel : public BaseObject<_cl_kernel> {
    uint32_t additionalKernelExecInfo = AdditionalKernelExecInfo::NotSet;

    struct KernelDeviceInfo : public NonCopyableClass {
+        uint32_t *globalWorkOffsetX = &Kernel::dummyPatchLocation;
+        uint32_t *globalWorkOffsetY = &Kernel::dummyPatchLocation;
+        uint32_t *globalWorkOffsetZ = &Kernel::dummyPatchLocation;
+
+        uint32_t *localWorkSizeX = &Kernel::dummyPatchLocation;
+        uint32_t *localWorkSizeY = &Kernel::dummyPatchLocation;
+        uint32_t *localWorkSizeZ = &Kernel::dummyPatchLocation;
+
+        uint32_t *localWorkSizeX2 = &Kernel::dummyPatchLocation;
+        uint32_t *localWorkSizeY2 = &Kernel::dummyPatchLocation;
+        uint32_t *localWorkSizeZ2 = &Kernel::dummyPatchLocation;
+
+        uint32_t *globalWorkSizeX = &Kernel::dummyPatchLocation;
+        uint32_t *globalWorkSizeY = &Kernel::dummyPatchLocation;
+        uint32_t *globalWorkSizeZ = &Kernel::dummyPatchLocation;
+
+        uint32_t *enqueuedLocalWorkSizeX = &Kernel::dummyPatchLocation;
+        uint32_t *enqueuedLocalWorkSizeY = &Kernel::dummyPatchLocation;
+        uint32_t *enqueuedLocalWorkSizeZ = &Kernel::dummyPatchLocation;
+
+        uint32_t *numWorkGroupsX = &Kernel::dummyPatchLocation;
+        uint32_t *numWorkGroupsY = &Kernel::dummyPatchLocation;
+        uint32_t *numWorkGroupsZ = &Kernel::dummyPatchLocation;
+
+        uint32_t *maxWorkGroupSizeForCrossThreadData = &Kernel::dummyPatchLocation;
+        uint32_t maxKernelWorkGroupSize = 0;
+        uint32_t *workDim = &Kernel::dummyPatchLocation;
+        uint32_t *dataParameterSimdSize = &Kernel::dummyPatchLocation;
+        uint32_t *parentEventOffset = &Kernel::dummyPatchLocation;
+        uint32_t *preferredWkgMultipleOffset = &Kernel::dummyPatchLocation;
+
+        size_t numberOfBindingTableStates = 0u;
+        size_t localBindingTableOffset = 0u;
+
        std::unique_ptr<char[]> pSshLocal;
        uint32_t sshLocalSize = 0u;
        char *crossThreadData = nullptr;
--- a/opencl/source/kernel/kernel_extra.cpp
+++ b/opencl/source/kernel/kernel_extra.cpp
@@ -13,7 +13,7 @@ namespace NEO {
 bool Kernel::requiresCacheFlushCommand(const CommandQueue &commandQueue) const {
    return false;
 }
-void Kernel::reconfigureKernel() {
+void Kernel::reconfigureKernel(uint32_t rootDeviceIndex) { // Empty body in this file; rootDeviceIndex is accepted but not used here.
 }
 int Kernel::setKernelThreadArbitrationPolicy(uint32_t policy) {
    if (policy == CL_KERNEL_EXEC_INFO_THREAD_ARBITRATION_POLICY_ROUND_ROBIN_INTEL) {
--- a/opencl/source/program/kernel_info.cpp
+++ b/opencl/source/program/kernel_info.cpp
@@ -133,8 +133,9 @@ WorkSizeInfo::WorkSizeInfo(uint32_t maxWorkGroupSize, bool hasBarriers, uint32_t
 }
 WorkSizeInfo::WorkSizeInfo(const DispatchInfo &dispatchInfo) {
    auto &device = dispatchInfo.getClDevice();
-    const auto &kernelInfo = dispatchInfo.getKernel()->getKernelInfo(device.getRootDeviceIndex());
-    this->maxWorkGroupSize = dispatchInfo.getKernel()->maxKernelWorkGroupSize;
+    auto rootDeviceIndex = device.getRootDeviceIndex();
+    const auto &kernelInfo = dispatchInfo.getKernel()->getKernelInfo(rootDeviceIndex);
+    this->maxWorkGroupSize = dispatchInfo.getKernel()->getMaxKernelWorkGroupSize(rootDeviceIndex);
    auto pExecutionEnvironment = kernelInfo.patchInfo.executionEnvironment;
    this->hasBarriers = (pExecutionEnvironment != nullptr) && (pExecutionEnvironment->HasBarriers);
    this->simdSize = static_cast<uint32_t>(kernelInfo.getMaxSimdSize());
--- a/opencl/test/unit_test/api/cl_get_kernel_sub_group_info_tests.inl
+++ b/opencl/test/unit_test/api/cl_get_kernel_sub_group_info_tests.inl
@@ -15,12 +15,12 @@ struct KernelSubGroupInfoFixture : HelloWorldFixture<HelloWorldFixtureFactory> {

    void SetUp() override {
        ParentClass::SetUp();
-        pKernel->maxKernelWorkGroupSize = static_cast<uint32_t>(pDevice->getDeviceInfo().maxWorkGroupSize / 2);
+        pKernel->kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize = static_cast<uint32_t>(pDevice->getDeviceInfo().maxWorkGroupSize / 2);
        maxSimdSize = static_cast<size_t>(pKernel->getKernelInfo(rootDeviceIndex).getMaxSimdSize());
        ASSERT_LE(8u, maxSimdSize);
        maxWorkDim = static_cast<size_t>(pClDevice->getDeviceInfo().maxWorkItemDimensions);
        ASSERT_EQ(3u, maxWorkDim);
-        maxWorkGroupSize = static_cast<size_t>(pKernel->maxKernelWorkGroupSize);
+        maxWorkGroupSize = static_cast<size_t>(pKernel->kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize);
        ASSERT_GE(1024u, maxWorkGroupSize);
        largestCompiledSIMDSize = static_cast<size_t>(pKernel->getKernelInfo(rootDeviceIndex).patchInfo.executionEnvironment->LargestCompiledSIMDSize);
        ASSERT_EQ(32u, largestCompiledSIMDSize);
@@ -30,8 +30,8 @@ struct KernelSubGroupInfoFixture : HelloWorldFixture<HelloWorldFixtureFactory> {
        auto requiredWorkGroupSizeZ = static_cast<size_t>(pKernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelAttributes.requiredWorkgroupSize[2]);

        calculatedMaxWorkgroupSize = requiredWorkGroupSizeX * requiredWorkGroupSizeY * requiredWorkGroupSizeZ;
-        if ((calculatedMaxWorkgroupSize == 0) || (calculatedMaxWorkgroupSize > static_cast<size_t>(pKernel->maxKernelWorkGroupSize))) {
-            calculatedMaxWorkgroupSize = static_cast<size_t>(pKernel->maxKernelWorkGroupSize);
+        if ((calculatedMaxWorkgroupSize == 0) || (calculatedMaxWorkgroupSize > static_cast<size_t>(pKernel->kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize))) {
+            calculatedMaxWorkgroupSize = static_cast<size_t>(pKernel->kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize);
        }
    }

--- a/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp
+++ b/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp
@@ -259,7 +259,7 @@ HWTEST_F(DispatchWalkerTest, GivenDefaultLwsAlgorithmWhenDispatchingWalkerThenDi
            nullptr,
            CL_COMMAND_NDRANGE_KERNEL);

-        EXPECT_EQ(dimension, *kernel.workDim);
+        EXPECT_EQ(dimension, *kernel.kernelDeviceInfos[rootDeviceIndex].workDim);
    }
 }

@@ -288,7 +288,7 @@ HWTEST_F(DispatchWalkerTest, GivenSquaredLwsAlgorithmWhenDispatchingWalkerThenDi
            nullptr,
            nullptr,
            CL_COMMAND_NDRANGE_KERNEL);
-        EXPECT_EQ(dimension, *kernel.workDim);
+        EXPECT_EQ(dimension, *kernel.kernelDeviceInfos[rootDeviceIndex].workDim);
    }
 }

@@ -316,7 +316,7 @@ HWTEST_F(DispatchWalkerTest, GivenNdLwsAlgorithmWhenDispatchingWalkerThenDimensi
            nullptr,
            nullptr,
            CL_COMMAND_NDRANGE_KERNEL);
-        EXPECT_EQ(dimension, *kernel.workDim);
+        EXPECT_EQ(dimension, *kernel.kernelDeviceInfos[rootDeviceIndex].workDim);
    }
 }

@@ -345,7 +345,7 @@ HWTEST_F(DispatchWalkerTest, GivenOldLwsAlgorithmWhenDispatchingWalkerThenDimens
            nullptr,
            nullptr,
            CL_COMMAND_NDRANGE_KERNEL);
-        EXPECT_EQ(dimension, *kernel.workDim);
+        EXPECT_EQ(dimension, *kernel.kernelDeviceInfos[rootDeviceIndex].workDim);
    }
 }

@@ -375,9 +375,9 @@ HWTEST_F(DispatchWalkerTest, GivenNumWorkGroupsWhenDispatchingWalkerThenNumWorkG
        nullptr,
        CL_COMMAND_NDRANGE_KERNEL);

-    EXPECT_EQ(2u, *kernel.numWorkGroupsX);
-    EXPECT_EQ(5u, *kernel.numWorkGroupsY);
-    EXPECT_EQ(10u, *kernel.numWorkGroupsZ);
+    EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsX);
+    EXPECT_EQ(5u, *kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsY);
+    EXPECT_EQ(10u, *kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsZ);
 }

 HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndDefaultAlgorithmWhenDispatchingWalkerThenLwsIsCorrect) {
@@ -405,9 +405,9 @@ HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndDefaultAlgorithmWhenDispatch
        nullptr,
        nullptr,
        CL_COMMAND_NDRANGE_KERNEL);
-    EXPECT_EQ(2u, *kernel.localWorkSizeX);
-    EXPECT_EQ(5u, *kernel.localWorkSizeY);
-    EXPECT_EQ(1u, *kernel.localWorkSizeZ);
+    EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
+    EXPECT_EQ(5u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
+    EXPECT_EQ(1u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
 }

 HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndNdOnWhenDispatchingWalkerThenLwsIsCorrect) {
@@ -435,9 +435,9 @@ HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndNdOnWhenDispatchingWalkerThe
        nullptr,
        nullptr,
        CL_COMMAND_NDRANGE_KERNEL);
-    EXPECT_EQ(2u, *kernel.localWorkSizeX);
-    EXPECT_EQ(5u, *kernel.localWorkSizeY);
-    EXPECT_EQ(10u, *kernel.localWorkSizeZ);
+    EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
+    EXPECT_EQ(5u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
+    EXPECT_EQ(10u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
 }

 HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndSquaredAlgorithmWhenDispatchingWalkerThenLwsIsCorrect) {
@@ -466,9 +466,9 @@ HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndSquaredAlgorithmWhenDispatch
        nullptr,
        nullptr,
        CL_COMMAND_NDRANGE_KERNEL);
-    EXPECT_EQ(2u, *kernel.localWorkSizeX);
-    EXPECT_EQ(5u, *kernel.localWorkSizeY);
-    EXPECT_EQ(1u, *kernel.localWorkSizeZ);
+    EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
+    EXPECT_EQ(5u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
+    EXPECT_EQ(1u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
 }

 HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndSquaredAlgorithmOffAndNdOffWhenDispatchingWalkerThenLwsIsCorrect) {
@@ -497,9 +497,9 @@ HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndSquaredAlgorithmOffAndNdOffW
        nullptr,
        nullptr,
        CL_COMMAND_NDRANGE_KERNEL);
-    EXPECT_EQ(2u, *kernel.localWorkSizeX);
-    EXPECT_EQ(5u, *kernel.localWorkSizeY);
-    EXPECT_EQ(1u, *kernel.localWorkSizeZ);
+    EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
+    EXPECT_EQ(5u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
+    EXPECT_EQ(1u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
 }

 HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeWhenDispatchingWalkerThenLwsIsCorrect) {
@@ -526,9 +526,9 @@ HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeWhenDispatchingWalkerThenLwsIsC
        nullptr,
        nullptr,
        CL_COMMAND_NDRANGE_KERNEL);
-    EXPECT_EQ(1u, *kernel.localWorkSizeX);
-    EXPECT_EQ(2u, *kernel.localWorkSizeY);
-    EXPECT_EQ(3u, *kernel.localWorkSizeZ);
+    EXPECT_EQ(1u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
+    EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
+    EXPECT_EQ(3u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
 }

 HWTEST_F(DispatchWalkerTest, GivenTwoSetsOfLwsOffsetsWhenDispatchingWalkerThenLwsIsCorrect) {
@@ -558,12 +558,12 @@ HWTEST_F(DispatchWalkerTest, GivenTwoSetsOfLwsOffsetsWhenDispatchingWalkerThenLw
        nullptr,
        nullptr,
        CL_COMMAND_NDRANGE_KERNEL);
-    EXPECT_EQ(1u, *kernel.localWorkSizeX);
-    EXPECT_EQ(2u, *kernel.localWorkSizeY);
-    EXPECT_EQ(3u, *kernel.localWorkSizeZ);
-    EXPECT_EQ(1u, *kernel.localWorkSizeX2);
-    EXPECT_EQ(2u, *kernel.localWorkSizeY2);
-    EXPECT_EQ(3u, *kernel.localWorkSizeZ2);
+    EXPECT_EQ(1u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
+    EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
+    EXPECT_EQ(3u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
+    EXPECT_EQ(1u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX2);
+    EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY2);
+    EXPECT_EQ(3u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ2);
 }

 HWTEST_F(DispatchWalkerTest, GivenSplitKernelWhenDispatchingWalkerThenLwsIsCorrect) {
@@ -597,16 +597,16 @@ HWTEST_F(DispatchWalkerTest, GivenSplitKernelWhenDispatchingWalkerThenLwsIsCorre

    auto dispatchId = 0;
    for (auto &dispatchInfo : multiDispatchInfo) {
-        auto &kernel = *dispatchInfo.getKernel();
+        auto &kernel = static_cast<MockKernel &>(*dispatchInfo.getKernel());
        if (dispatchId == 0) {
-            EXPECT_EQ(1u, *kernel.localWorkSizeX);
-            EXPECT_EQ(2u, *kernel.localWorkSizeY);
-            EXPECT_EQ(3u, *kernel.localWorkSizeZ);
+            EXPECT_EQ(1u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
+            EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
+            EXPECT_EQ(3u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
        }
        if (dispatchId == 1) {
-            EXPECT_EQ(4u, *kernel.localWorkSizeX);
-            EXPECT_EQ(5u, *kernel.localWorkSizeY);
-            EXPECT_EQ(6u, *kernel.localWorkSizeZ);
+            EXPECT_EQ(4u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
+            EXPECT_EQ(5u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
+            EXPECT_EQ(6u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
        }
        dispatchId++;
    }
@@ -646,27 +646,27 @@ HWTEST_F(DispatchWalkerTest, GivenSplitWalkerWhenDispatchingWalkerThenLwsIsCorre
        CL_COMMAND_NDRANGE_KERNEL);

    for (auto &dispatchInfo : multiDispatchInfo) {
-        auto &kernel = *dispatchInfo.getKernel();
+        auto &kernel = static_cast<MockKernel &>(*dispatchInfo.getKernel());
        if (&kernel == &mainKernel) {
-            EXPECT_EQ(4u, *kernel.localWorkSizeX);
-            EXPECT_EQ(5u, *kernel.localWorkSizeY);
-            EXPECT_EQ(6u, *kernel.localWorkSizeZ);
-            EXPECT_EQ(4u, *kernel.localWorkSizeX2);
-            EXPECT_EQ(5u, *kernel.localWorkSizeY2);
-            EXPECT_EQ(6u, *kernel.localWorkSizeZ2);
-            EXPECT_EQ(3u, *kernel.numWorkGroupsX);
-            EXPECT_EQ(2u, *kernel.numWorkGroupsY);
-            EXPECT_EQ(2u, *kernel.numWorkGroupsZ);
+            EXPECT_EQ(4u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
+            EXPECT_EQ(5u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
+            EXPECT_EQ(6u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
+            EXPECT_EQ(4u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX2);
+            EXPECT_EQ(5u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY2);
+            EXPECT_EQ(6u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ2);
+            EXPECT_EQ(3u, *kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsX);
+            EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsY);
+            EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsZ);
        } else {
-            EXPECT_EQ(0u, *kernel.localWorkSizeX);
-            EXPECT_EQ(0u, *kernel.localWorkSizeY);
-            EXPECT_EQ(0u, *kernel.localWorkSizeZ);
-            EXPECT_EQ(1u, *kernel.localWorkSizeX2);
-            EXPECT_EQ(2u, *kernel.localWorkSizeY2);
-            EXPECT_EQ(3u, *kernel.localWorkSizeZ2);
-            EXPECT_EQ(0u, *kernel.numWorkGroupsX);
-            EXPECT_EQ(0u, *kernel.numWorkGroupsY);
-            EXPECT_EQ(0u, *kernel.numWorkGroupsZ);
+            EXPECT_EQ(0u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
+            EXPECT_EQ(0u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
+            EXPECT_EQ(0u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
+            EXPECT_EQ(1u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX2);
+            EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY2);
+            EXPECT_EQ(3u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ2);
+            EXPECT_EQ(0u, *kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsX);
+            EXPECT_EQ(0u, *kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsY);
+            EXPECT_EQ(0u, *kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsZ);
        }
    }
 }
@@ -859,8 +859,8 @@ HWTEST_F(DispatchWalkerTest, GivenMultipleKernelsWhenDispatchingWalkerThenWorkDi
        CL_COMMAND_NDRANGE_KERNEL);

    for (auto &dispatchInfo : multiDispatchInfo) {
-        auto &kernel = *dispatchInfo.getKernel();
-        EXPECT_EQ(*kernel.workDim, dispatchInfo.getDim());
+        auto &kernel = static_cast<MockKernel &>(*dispatchInfo.getKernel());
+        EXPECT_EQ(*kernel.kernelDeviceInfos[rootDeviceIndex].workDim, dispatchInfo.getDim());
    }
 }

--- a/opencl/test/unit_test/command_queue/enqueue_debug_kernel_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_debug_kernel_tests.cpp
@@ -99,7 +99,7 @@ HWTEST_F(EnqueueDebugKernelTest, givenDebugKernelWhenEnqueuedThenSSHAndBtiAreCor

        mockCmdQ->enqueueKernel(debugKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr);

-        auto *dstBtiTableBase = reinterpret_cast<BINDING_TABLE_STATE *>(ptrOffset(surfaceStates, debugKernel->getBindingTableOffset()));
+        auto *dstBtiTableBase = reinterpret_cast<BINDING_TABLE_STATE *>(ptrOffset(surfaceStates, debugKernel->getBindingTableOffset(rootDeviceIndex)));
        uint32_t surfaceStateOffset = dstBtiTableBase[0].getSurfaceStatePointer();

        auto debugSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh.getCpuBase(), surfaceStateOffset));
--- a/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp
@@ -1277,9 +1277,9 @@ TEST_F(EnqueueKernelTest, givenKernelWhenAllArgsAreNotAndEventExistSetThenClEnqu
 TEST_F(EnqueueKernelTest, givenEnqueueCommandThatLwsExceedsDeviceCapabilitiesWhenEnqueueNDRangeKernelIsCalledThenErrorIsReturned) {
    MockKernelWithInternals mockKernel(*pClDevice);

-    mockKernel.mockKernel->maxKernelWorkGroupSize = static_cast<uint32_t>(pDevice->getDeviceInfo().maxWorkGroupSize / 2);
+    mockKernel.mockKernel->kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize = static_cast<uint32_t>(pDevice->getDeviceInfo().maxWorkGroupSize / 2);

-    auto maxKernelWorkgroupSize = mockKernel.mockKernel->maxKernelWorkGroupSize;
+    auto maxKernelWorkgroupSize = mockKernel.mockKernel->kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize;
    size_t globalWorkSize[3] = {maxKernelWorkgroupSize + 1, 1, 1};
    size_t localWorkSize[3] = {maxKernelWorkgroupSize + 1, 1, 1};

--- a/opencl/test/unit_test/command_queue/enqueue_kernel_local_work_size_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_kernel_local_work_size_tests.cpp
@@ -64,13 +64,13 @@ TEST_F(EnqueueKernelRequiredWorkSize, GivenUnspecifiedWorkGroupSizeWhenEnqeueing

    EXPECT_EQ(CL_SUCCESS, retVal);

-    EXPECT_EQ(*pKernel->localWorkSizeX, 8u);
-    EXPECT_EQ(*pKernel->localWorkSizeY, 4u);
-    EXPECT_EQ(*pKernel->localWorkSizeZ, 4u);
+    EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeX, 8u);
+    EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeY, 4u);
+    EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ, 4u);

-    EXPECT_EQ(*pKernel->enqueuedLocalWorkSizeX, 8u);
-    EXPECT_EQ(*pKernel->enqueuedLocalWorkSizeY, 4u);
-    EXPECT_EQ(*pKernel->enqueuedLocalWorkSizeZ, 4u);
+    EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeX, 8u);
+    EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeY, 4u);
+    EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeZ, 4u);
 }

 // Fully specified
@@ -91,13 +91,13 @@ TEST_F(EnqueueKernelRequiredWorkSize, GivenRequiredWorkGroupSizeWhenEnqeueingKer

    EXPECT_EQ(CL_SUCCESS, retVal);

-    EXPECT_EQ(*pKernel->enqueuedLocalWorkSizeX, 8u);
-    EXPECT_EQ(*pKernel->enqueuedLocalWorkSizeY, 4u);
-    EXPECT_EQ(*pKernel->enqueuedLocalWorkSizeZ, 4u);
+    EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeX, 8u);
+    EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeY, 4u);
+    EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeZ, 4u);

-    EXPECT_EQ(*pKernel->localWorkSizeX, 8u);
-    EXPECT_EQ(*pKernel->localWorkSizeY, 4u);
-    EXPECT_EQ(*pKernel->localWorkSizeZ, 4u);
+    EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeX, 8u);
+    EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeY, 4u);
+    EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ, 4u);
 }

 // Underspecified.  Won't permit.
--- a/opencl/test/unit_test/context/driver_diagnostics_enqueue_tests.cpp
+++ b/opencl/test/unit_test/context/driver_diagnostics_enqueue_tests.cpp
@@ -680,8 +680,11 @@ TEST_F(PerformanceHintEnqueueKernelTest, GivenNullLocalSizeAndEnableComputeWorkS
    retVal = pCmdQ->enqueueKernel(kernel, 1, nullptr, globalWorkGroupSize, nullptr, 0, nullptr, nullptr);
    EXPECT_EQ(CL_SUCCESS, retVal);

-    snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE], kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
-             *kernel->localWorkSizeX, *kernel->localWorkSizeY, *kernel->localWorkSizeZ);
+    snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE],
+             kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
+             *kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeX,
+             *kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeY,
+             *kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
    EXPECT_TRUE(containsHint(expectedHint, userData));
 }

@@ -692,8 +695,11 @@ TEST_F(PerformanceHintEnqueueKernelTest, GivenNullLocalSizeAndEnableComputeWorkS
    retVal = pCmdQ->enqueueKernel(kernel, 1, nullptr, globalWorkGroupSize, nullptr, 0, nullptr, nullptr);
    EXPECT_EQ(CL_SUCCESS, retVal);

-    snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE], kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
-             *kernel->localWorkSizeX, *kernel->localWorkSizeY, *kernel->localWorkSizeZ);
+    snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE],
+             kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
+             *kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeX,
+             *kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeY,
+             *kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
    EXPECT_TRUE(containsHint(expectedHint, userData));
    DebugManager.flags.EnableComputeWorkSizeND.set(isWorkGroupSizeEnabled);
 }
@@ -705,8 +711,11 @@ TEST_F(PerformanceHintEnqueueKernelTest, GivenNullLocalSizeAndEnableComputeWorkS
    retVal = pCmdQ->enqueueKernel(kernel, 1, nullptr, globalWorkGroupSize, nullptr, 0, nullptr, nullptr);
    EXPECT_EQ(CL_SUCCESS, retVal);

-    snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE], kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
-             *kernel->localWorkSizeX, *kernel->localWorkSizeY, *kernel->localWorkSizeZ);
+    snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE],
+             kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
+             *kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeX,
+             *kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeY,
+             *kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
    EXPECT_TRUE(containsHint(expectedHint, userData));
    DebugManager.flags.EnableComputeWorkSizeND.set(isWorkGroupSizeEnabled);
 }
@@ -716,8 +725,11 @@ TEST_F(PerformanceHintEnqueueKernelTest, GivenNullLocalSizeAndEnableComputeWorkS
    retVal = pCmdQ->enqueueKernel(kernel, 1, nullptr, globalWorkGroupSize, nullptr, 0, nullptr, nullptr);
    EXPECT_EQ(CL_SUCCESS, retVal);

-    snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE], kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
-             *kernel->localWorkSizeX, *kernel->localWorkSizeY, *kernel->localWorkSizeZ);
+    snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE],
+             kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
+             *kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeX,
+             *kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeY,
+             *kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
    EXPECT_TRUE(containsHint(expectedHint, userData));
 }

@@ -729,8 +741,11 @@ TEST_F(PerformanceHintEnqueueKernelTest, GivenNullLocalSizeAndEnableComputeWorkS
    retVal = pCmdQ->enqueueKernel(kernel, 1, nullptr, globalWorkGroupSize, nullptr, 0, nullptr, nullptr);
    EXPECT_EQ(CL_SUCCESS, retVal);

-    snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE], kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
-             *kernel->localWorkSizeX, *kernel->localWorkSizeY, *kernel->localWorkSizeZ);
+    snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE],
+             kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
+             *kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeX,
+             *kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeY,
+             *kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
    EXPECT_TRUE(containsHint(expectedHint, userData));
 }

@@ -742,8 +757,11 @@ TEST_F(PerformanceHintEnqueueKernelTest, GivenNullLocalSizeAndEnableComputeWorkS
    retVal = pCmdQ->enqueueKernel(kernel, 1, nullptr, globalWorkGroupSize, nullptr, 0, nullptr, nullptr);
    EXPECT_EQ(CL_SUCCESS, retVal);

-    snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE], kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
-             *kernel->localWorkSizeX, *kernel->localWorkSizeY, *kernel->localWorkSizeZ);
+    snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE],
+             kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
+             *kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeX,
+             *kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeY,
+             *kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
    EXPECT_TRUE(containsHint(expectedHint, userData));
 }

--- a/opencl/test/unit_test/context/driver_diagnostics_tests.h
+++ b/opencl/test/unit_test/context/driver_diagnostics_tests.h
@@ -241,7 +241,7 @@ struct PerformanceHintEnqueueKernelTest : public PerformanceHintEnqueueTest,
        ProgramFixture::TearDown();
        PerformanceHintEnqueueTest::TearDown();
    }
-    Kernel *kernel = nullptr;
+    MockKernel *kernel = nullptr;
    uint32_t rootDeviceIndex = std::numeric_limits<uint32_t>::max();
    size_t globalWorkGroupSize[3]{};
 };
--- a/opencl/test/unit_test/execution_model/enqueue_execution_model_kernel_tests.cpp
+++ b/opencl/test/unit_test/execution_model/enqueue_execution_model_kernel_tests.cpp
@@ -328,7 +328,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, ParentKernelEnqueueTest, givenParentKernelWhenEnqueu

        Kernel *blockKernel = Kernel::create(pKernel->getProgram(), MockKernel::toKernelInfoContainer(*pBlockInfo, rootDeviceIndex), nullptr);
        blockSSH = alignUp(blockSSH, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
-        if (blockKernel->getNumberOfBindingTableStates() > 0) {
+        if (blockKernel->getNumberOfBindingTableStates(rootDeviceIndex) > 0) {
            ASSERT_NE(nullptr, pBlockInfo->patchInfo.bindingTableState);
            auto dstBlockBti = ptrOffset(blockSSH, pBlockInfo->patchInfo.bindingTableState->Offset);
            EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(dstBlockBti) % INTERFACE_DESCRIPTOR_DATA::BINDINGTABLEPOINTER_ALIGN_SIZE);
@@ -336,7 +336,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, ParentKernelEnqueueTest, givenParentKernelWhenEnqueu

            auto srcBlockBti = ptrOffset(pBlockInfo->heapInfo.pSsh, pBlockInfo->patchInfo.bindingTableState->Offset);
            auto srcBindingTable = reinterpret_cast<const BINDING_TABLE_STATE *>(srcBlockBti);
-            for (uint32_t i = 0; i < blockKernel->getNumberOfBindingTableStates(); ++i) {
+            for (uint32_t i = 0; i < blockKernel->getNumberOfBindingTableStates(rootDeviceIndex); ++i) {
                uint32_t dstSurfaceStatePointer = dstBindingTable[i].getSurfaceStatePointer();
                uint32_t srcSurfaceStatePointer = srcBindingTable[i].getSurfaceStatePointer();
                auto *dstSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh->getCpuBase(), dstSurfaceStatePointer));
--- a/opencl/test/unit_test/execution_model/parent_kernel_dispatch_tests.cpp
+++ b/opencl/test/unit_test/execution_model/parent_kernel_dispatch_tests.cpp
@@ -166,7 +166,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, ParentKernelDispatchTest, givenParentKernelWhenQueue

    size_t sshUsed = blockedCommandsData->ssh->getUsed();

-    size_t expectedSizeSSH = pKernel->getNumberOfBindingTableStates() * sizeof(RENDER_SURFACE_STATE) +
+    size_t expectedSizeSSH = pKernel->getNumberOfBindingTableStates(rootDeviceIndex) * sizeof(RENDER_SURFACE_STATE) +
                             pKernel->getKernelInfo(rootDeviceIndex).patchInfo.bindingTableState->Count * sizeof(BINDING_TABLE_STATE) +
                             UnitTestHelper<FamilyType>::getDefaultSshUsage();

--- a/opencl/test/unit_test/execution_model/scheduler_dispatch_tests.cpp
+++ b/opencl/test/unit_test/execution_model/scheduler_dispatch_tests.cpp
@@ -45,7 +45,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ExecutionModelSchedulerFixture, WhenDispatchingSched
    using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;

    DeviceQueueHw<FamilyType> *pDevQueueHw = castToObject<DeviceQueueHw<FamilyType>>(pDevQueue);
-    SchedulerKernel &scheduler = context->getSchedulerKernel();
+    auto &scheduler = static_cast<MockSchedulerKernel &>(context->getSchedulerKernel());

    auto *executionModelDshAllocation = pDevQueueHw->getDshBuffer();
    auto *dshHeap = pDevQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
@@ -70,27 +70,27 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ExecutionModelSchedulerFixture, WhenDispatchingSched
        pDevQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE),
        false);

-    EXPECT_EQ(0u, *scheduler.globalWorkOffsetX);
-    EXPECT_EQ(0u, *scheduler.globalWorkOffsetY);
-    EXPECT_EQ(0u, *scheduler.globalWorkOffsetZ);
+    EXPECT_EQ(0u, *scheduler.kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetX);
+    EXPECT_EQ(0u, *scheduler.kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetY);
+    EXPECT_EQ(0u, *scheduler.kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetZ);

-    EXPECT_EQ((uint32_t)scheduler.getLws(), *scheduler.localWorkSizeX);
-    EXPECT_EQ(1u, *scheduler.localWorkSizeY);
-    EXPECT_EQ(1u, *scheduler.localWorkSizeZ);
+    EXPECT_EQ((uint32_t)scheduler.getLws(), *scheduler.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
+    EXPECT_EQ(1u, *scheduler.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
+    EXPECT_EQ(1u, *scheduler.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);

-    EXPECT_EQ((uint32_t)scheduler.getLws(), *scheduler.localWorkSizeX2);
-    EXPECT_EQ(1u, *scheduler.localWorkSizeY2);
-    EXPECT_EQ(1u, *scheduler.localWorkSizeZ2);
+    EXPECT_EQ((uint32_t)scheduler.getLws(), *scheduler.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX2);
+    EXPECT_EQ(1u, *scheduler.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY2);
+    EXPECT_EQ(1u, *scheduler.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ2);

-    if (scheduler.enqueuedLocalWorkSizeX != &Kernel::dummyPatchLocation) {
-        EXPECT_EQ((uint32_t)scheduler.getLws(), *scheduler.enqueuedLocalWorkSizeX);
+    if (scheduler.kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeX != &Kernel::dummyPatchLocation) {
+        EXPECT_EQ((uint32_t)scheduler.getLws(), *scheduler.kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeX);
    }
-    EXPECT_EQ(1u, *scheduler.enqueuedLocalWorkSizeY);
-    EXPECT_EQ(1u, *scheduler.enqueuedLocalWorkSizeZ);
+    EXPECT_EQ(1u, *scheduler.kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeY);
+    EXPECT_EQ(1u, *scheduler.kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeZ);

-    EXPECT_EQ((uint32_t)(scheduler.getGws() / scheduler.getLws()), *scheduler.numWorkGroupsX);
-    EXPECT_EQ(0u, *scheduler.numWorkGroupsY);
-    EXPECT_EQ(0u, *scheduler.numWorkGroupsZ);
+    EXPECT_EQ((uint32_t)(scheduler.getGws() / scheduler.getLws()), *scheduler.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsX);
+    EXPECT_EQ(0u, *scheduler.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsY);
+    EXPECT_EQ(0u, *scheduler.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsZ);

    HardwareParse hwParser;
    hwParser.parseCommands<FamilyType>(commandStream, 0);
--- a/opencl/test/unit_test/fixtures/hello_world_kernel_fixture.h
+++ b/opencl/test/unit_test/fixtures/hello_world_kernel_fixture.h
@@ -97,7 +97,7 @@ struct HelloWorldKernelFixture : public ProgramFixture {
    std::string *pKernelName = nullptr;
    cl_uint simd = 32;
    cl_int retVal = CL_SUCCESS;
-    Kernel *pKernel = nullptr;
+    MockKernel *pKernel = nullptr;
    MockContext *pContext = nullptr;
 };
 } // namespace NEO
--- a/opencl/test/unit_test/gtpin/gtpin_tests.cpp
+++ b/opencl/test/unit_test/gtpin/gtpin_tests.cpp
@@ -2225,11 +2225,11 @@ TEST_F(GTPinTests, givenKernelWithSSHThenVerifyThatSSHResizeWorksWell) {
    Kernel *pKernel = castToObject<Kernel>(kernel);
    ASSERT_NE(nullptr, pKernel);

-    size_t numBTS1 = pKernel->getNumberOfBindingTableStates();
+    size_t numBTS1 = pKernel->getNumberOfBindingTableStates(rootDeviceIndex);
    EXPECT_EQ(2u, numBTS1);
    size_t sizeSurfaceStates1 = pKernel->getSurfaceStateHeapSize(rootDeviceIndex);
    EXPECT_NE(0u, sizeSurfaceStates1);
-    size_t offsetBTS1 = pKernel->getBindingTableOffset();
+    size_t offsetBTS1 = pKernel->getBindingTableOffset(rootDeviceIndex);
    EXPECT_NE(0u, offsetBTS1);

    GFXCORE_FAMILY genFamily = pDevice->getHardwareInfo().platform.eRenderCoreFamily;
@@ -2241,11 +2241,11 @@ TEST_F(GTPinTests, givenKernelWithSSHThenVerifyThatSSHResizeWorksWell) {
    bool surfaceAdded = gtpinHelper.addSurfaceState(pKernel, rootDeviceIndex);
    EXPECT_TRUE(surfaceAdded);

-    size_t numBTS2 = pKernel->getNumberOfBindingTableStates();
+    size_t numBTS2 = pKernel->getNumberOfBindingTableStates(rootDeviceIndex);
    EXPECT_EQ(numBTS1 + 1, numBTS2);
    size_t sizeSurfaceStates2 = pKernel->getSurfaceStateHeapSize(rootDeviceIndex);
    EXPECT_GT(sizeSurfaceStates2, sizeSurfaceStates1);
-    size_t offsetBTS2 = pKernel->getBindingTableOffset();
+    size_t offsetBTS2 = pKernel->getBindingTableOffset(rootDeviceIndex);
    EXPECT_GT(offsetBTS2, offsetBTS1);

    void *pSS2 = gtpinHelper.getSurfaceState(pKernel, 0, rootDeviceIndex);
@@ -2261,11 +2261,11 @@ TEST_F(GTPinTests, givenKernelWithSSHThenVerifyThatSSHResizeWorksWell) {
    surfaceAdded = gtpinHelper.addSurfaceState(pKernel, rootDeviceIndex);
    EXPECT_FALSE(surfaceAdded);

-    size_t numBTS3 = pKernel->getNumberOfBindingTableStates();
+    size_t numBTS3 = pKernel->getNumberOfBindingTableStates(rootDeviceIndex);
    EXPECT_EQ(0u, numBTS3);
    size_t sizeSurfaceStates3 = pKernel->getSurfaceStateHeapSize(rootDeviceIndex);
    EXPECT_EQ(0u, sizeSurfaceStates3);
-    size_t offsetBTS3 = pKernel->getBindingTableOffset();
+    size_t offsetBTS3 = pKernel->getBindingTableOffset(rootDeviceIndex);
    EXPECT_EQ(0u, offsetBTS3);
    void *pSS3 = gtpinHelper.getSurfaceState(pKernel, 0, rootDeviceIndex);
    EXPECT_EQ(nullptr, pSS3);
--- a/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp
+++ b/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp
@@ -385,7 +385,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWithFourBindingTabl
    *pWalkerCmd = FamilyType::cmdInitGpgpuWalker;

    auto expectedBindingTableCount = 3u;
-    mockKernelWithInternal->mockKernel->numberOfBindingTableStates = expectedBindingTableCount;
+    mockKernelWithInternal->mockKernel->kernelDeviceInfos[rootDeviceIndex].numberOfBindingTableStates = expectedBindingTableCount;

    auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
    auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
@@ -431,7 +431,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelThatIsSchedulerWhen
    *pWalkerCmd = FamilyType::cmdInitGpgpuWalker;

    auto expectedBindingTableCount = 3u;
-    mockKernelWithInternal->mockKernel->numberOfBindingTableStates = expectedBindingTableCount;
+    mockKernelWithInternal->mockKernel->kernelDeviceInfos[rootDeviceIndex].numberOfBindingTableStates = expectedBindingTableCount;
    auto isScheduler = const_cast<bool *>(&mockKernelWithInternal->mockKernel->isSchedulerKernel);
    *isScheduler = true;

@@ -475,7 +475,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWith100BindingTable
    *pWalkerCmd = FamilyType::cmdInitGpgpuWalker;

    auto expectedBindingTableCount = 100u;
-    mockKernelWithInternal->mockKernel->numberOfBindingTableStates = expectedBindingTableCount;
+    mockKernelWithInternal->mockKernel->kernelDeviceInfos[rootDeviceIndex].numberOfBindingTableStates = expectedBindingTableCount;

    auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
    auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
@@ -802,7 +802,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenGettingBindingTableStateTh
        auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);

        // Initialize binding table state pointers with pattern
-        EXPECT_EQ(numSurfaces, pKernel->getNumberOfBindingTableStates());
+        EXPECT_EQ(numSurfaces, pKernel->getNumberOfBindingTableStates(rootDeviceIndex));

        const size_t localWorkSizes[3]{256, 1, 1};

@@ -890,7 +890,7 @@ HWTEST_F(HardwareCommandsTest, GivenBuffersNotRequiringSshWhenSettingBindingTabl
    auto usedBefore = ssh.getUsed();

    // Initialize binding table state pointers with pattern
-    auto numSurfaceStates = pKernel->getNumberOfBindingTableStates();
+    auto numSurfaceStates = pKernel->getNumberOfBindingTableStates(rootDeviceIndex);
    EXPECT_EQ(0u, numSurfaceStates);

    // set binding table states
@@ -933,7 +933,7 @@ HWTEST_F(HardwareCommandsTest, GivenZeroSurfaceStatesWhenSettingBindingTableStat
    auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);

    // Initialize binding table state pointers with pattern
-    auto numSurfaceStates = pKernel->getNumberOfBindingTableStates();
+    auto numSurfaceStates = pKernel->getNumberOfBindingTableStates(rootDeviceIndex);
    EXPECT_EQ(0u, numSurfaceStates);

    auto dstBindingTablePointer = pushBindingTableAndSurfaceStates<FamilyType>(ssh, *pKernel);
--- a/opencl/test/unit_test/helpers/hardware_commands_helper_tests.h
+++ b/opencl/test/unit_test/helpers/hardware_commands_helper_tests.h
@@ -47,6 +47,6 @@ struct HardwareCommandsTest : ClDeviceFixture,
    size_t pushBindingTableAndSurfaceStates(IndirectHeap &dstHeap, const Kernel &srcKernel) {
        return EncodeSurfaceState<GfxFamily>::pushBindingTableAndSurfaceStates(dstHeap, (srcKernel.getKernelInfo(rootDeviceIndex).patchInfo.bindingTableState != nullptr) ? srcKernel.getKernelInfo(rootDeviceIndex).patchInfo.bindingTableState->Count : 0,
                                                                               srcKernel.getSurfaceStateHeap(rootDeviceIndex), srcKernel.getSurfaceStateHeapSize(rootDeviceIndex),
-                                                                               srcKernel.getNumberOfBindingTableStates(), srcKernel.getBindingTableOffset());
+                                                                               srcKernel.getNumberOfBindingTableStates(rootDeviceIndex), srcKernel.getBindingTableOffset(rootDeviceIndex));
    }
 };
--- a/opencl/test/unit_test/kernel/kernel_tests.cpp
+++ b/opencl/test/unit_test/kernel/kernel_tests.cpp
@@ -65,7 +65,7 @@ class KernelTests : public ProgramFromBinaryFixture {
        ASSERT_EQ(CL_SUCCESS, retVal);

        // create a kernel
-        pKernel = Kernel::create(
+        pKernel = Kernel::create<MockKernel>(
            pProgram,
            pProgram->getKernelInfosForKernel(kernelName),
            &retVal);
@@ -81,7 +81,7 @@ class KernelTests : public ProgramFromBinaryFixture {
        ProgramFromBinaryFixture::TearDown();
    }

-    Kernel *pKernel = nullptr;
+    MockKernel *pKernel = nullptr;
    cl_int retVal = CL_SUCCESS;
 };

@@ -278,7 +278,7 @@ TEST_F(KernelTests, GivenKernelWorkGroupSizeWhenGettingWorkGroupInfoThenWorkGrou
    size_t paramValueSizeRet = 0;

    auto kernelMaxWorkGroupSize = pDevice->getDeviceInfo().maxWorkGroupSize - 1;
-    pKernel->maxKernelWorkGroupSize = static_cast<uint32_t>(kernelMaxWorkGroupSize);
+    pKernel->kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize = static_cast<uint32_t>(kernelMaxWorkGroupSize);

    retVal = pKernel->getWorkGroupInfo(
        *pClDevice,
@@ -2305,10 +2305,10 @@ TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenGlobalWorkOffsetIsCorr
    MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex));
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

-    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.globalWorkOffsetX);
-    EXPECT_NE(nullptr, kernel.globalWorkOffsetY);
-    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.globalWorkOffsetY);
-    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.globalWorkOffsetZ);
+    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetX);
+    EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetY);
+    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetY);
+    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetZ);
 }

 TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenLocalWorkSizeIsCorrect) {
@@ -2318,10 +2318,10 @@ TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenLocalWorkSizeIsCorrect
    MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex));
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

-    EXPECT_NE(nullptr, kernel.localWorkSizeX);
-    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.localWorkSizeX);
-    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.localWorkSizeY);
-    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.localWorkSizeZ);
+    EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
+    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
+    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
+    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
 }

 TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenLocalWorkSize2IsCorrect) {
@@ -2331,10 +2331,10 @@ TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenLocalWorkSize2IsCorrec
    MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex));
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

-    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.localWorkSizeX2);
-    EXPECT_NE(nullptr, kernel.localWorkSizeY2);
-    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.localWorkSizeY2);
-    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.localWorkSizeZ2);
+    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX2);
+    EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY2);
+    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY2);
+    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ2);
 }

 TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenGlobalWorkSizeIsCorrect) {
@@ -2344,10 +2344,10 @@ TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenGlobalWorkSizeIsCorrec
    MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex));
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

-    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.globalWorkSizeX);
-    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.globalWorkSizeY);
-    EXPECT_NE(nullptr, kernel.globalWorkSizeZ);
-    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.globalWorkSizeZ);
+    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].globalWorkSizeX);
+    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].globalWorkSizeY);
+    EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].globalWorkSizeZ);
+    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].globalWorkSizeZ);
 }

 TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenLocalWorkDimIsCorrect) {
@@ -2357,8 +2357,8 @@ TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenLocalWorkDimIsCorrect)
    MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex));
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

-    EXPECT_NE(nullptr, kernel.workDim);
-    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.workDim);
+    EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].workDim);
+    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].workDim);
 }

 TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenNumWorkGroupsIsCorrect) {
@@ -2370,12 +2370,12 @@ TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenNumWorkGroupsIsCorrect
    MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex));
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

-    EXPECT_NE(nullptr, kernel.numWorkGroupsX);
-    EXPECT_NE(nullptr, kernel.numWorkGroupsY);
-    EXPECT_NE(nullptr, kernel.numWorkGroupsZ);
-    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.numWorkGroupsX);
-    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.numWorkGroupsY);
-    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.numWorkGroupsZ);
+    EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsX);
+    EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsY);
+    EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsZ);
+    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsX);
+    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsY);
+    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsZ);
 }

 TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenEnqueuedLocalWorkSizeIsCorrect) {
@@ -2385,10 +2385,10 @@ TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenEnqueuedLocalWorkSizeI
    MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex));
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

-    EXPECT_NE(nullptr, kernel.enqueuedLocalWorkSizeX);
-    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.enqueuedLocalWorkSizeX);
-    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.enqueuedLocalWorkSizeY);
-    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.enqueuedLocalWorkSizeZ);
+    EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeX);
+    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeX);
+    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeY);
+    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeZ);
 }

 TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenEnqueuedMaxWorkGroupSizeIsCorrect) {
@@ -2398,11 +2398,11 @@ TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenEnqueuedMaxWorkGroupSi
    MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex));
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

-    EXPECT_NE(nullptr, kernel.maxWorkGroupSizeForCrossThreadData);
-    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.maxWorkGroupSizeForCrossThreadData);
-    EXPECT_EQ(static_cast<void *>(kernel.getCrossThreadData(rootDeviceIndex) + pKernelInfo->workloadInfo.maxWorkGroupSizeOffset), static_cast<void *>(kernel.maxWorkGroupSizeForCrossThreadData));
-    EXPECT_EQ(pDevice->getDeviceInfo().maxWorkGroupSize, *kernel.maxWorkGroupSizeForCrossThreadData);
-    EXPECT_EQ(pDevice->getDeviceInfo().maxWorkGroupSize, kernel.maxKernelWorkGroupSize);
+    EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].maxWorkGroupSizeForCrossThreadData);
+    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].maxWorkGroupSizeForCrossThreadData);
+    EXPECT_EQ(static_cast<void *>(kernel.getCrossThreadData(rootDeviceIndex) + pKernelInfo->workloadInfo.maxWorkGroupSizeOffset), static_cast<void *>(kernel.kernelDeviceInfos[rootDeviceIndex].maxWorkGroupSizeForCrossThreadData));
+    EXPECT_EQ(pDevice->getDeviceInfo().maxWorkGroupSize, *kernel.kernelDeviceInfos[rootDeviceIndex].maxWorkGroupSizeForCrossThreadData);
+    EXPECT_EQ(pDevice->getDeviceInfo().maxWorkGroupSize, kernel.kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize);
 }

 TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenDataParameterSimdSizeIsCorrect) {
@@ -2414,10 +2414,10 @@ TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenDataParameterSimdSizeI
    executionEnvironment.CompiledSIMD8 = true;
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

-    EXPECT_NE(nullptr, kernel.dataParameterSimdSize);
-    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.dataParameterSimdSize);
-    EXPECT_EQ(static_cast<void *>(kernel.getCrossThreadData(rootDeviceIndex) + pKernelInfo->workloadInfo.simdSizeOffset), static_cast<void *>(kernel.dataParameterSimdSize));
-    EXPECT_EQ_VAL(pKernelInfo->getMaxSimdSize(), *kernel.dataParameterSimdSize);
+    EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].dataParameterSimdSize);
+    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].dataParameterSimdSize);
+    EXPECT_EQ(static_cast<void *>(kernel.getCrossThreadData(rootDeviceIndex) + pKernelInfo->workloadInfo.simdSizeOffset), static_cast<void *>(kernel.kernelDeviceInfos[rootDeviceIndex].dataParameterSimdSize));
+    EXPECT_EQ_VAL(pKernelInfo->getMaxSimdSize(), *kernel.kernelDeviceInfos[rootDeviceIndex].dataParameterSimdSize);
 }

 TEST_F(KernelCrossThreadTests, GivenParentEventOffsetWhenKernelIsInitializedThenParentEventIsInitiatedWithInvalid) {
@@ -2425,10 +2425,10 @@ TEST_F(KernelCrossThreadTests, GivenParentEventOffsetWhenKernelIsInitializedThen
    MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex));
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

-    EXPECT_NE(nullptr, kernel.parentEventOffset);
-    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.parentEventOffset);
-    EXPECT_EQ(static_cast<void *>(kernel.getCrossThreadData(rootDeviceIndex) + pKernelInfo->workloadInfo.parentEventOffset), static_cast<void *>(kernel.parentEventOffset));
-    EXPECT_EQ(WorkloadInfo::invalidParentEvent, *kernel.parentEventOffset);
+    EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].parentEventOffset);
+    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].parentEventOffset);
+    EXPECT_EQ(static_cast<void *>(kernel.getCrossThreadData(rootDeviceIndex) + pKernelInfo->workloadInfo.parentEventOffset), static_cast<void *>(kernel.kernelDeviceInfos[rootDeviceIndex].parentEventOffset));
+    EXPECT_EQ(WorkloadInfo::invalidParentEvent, *kernel.kernelDeviceInfos[rootDeviceIndex].parentEventOffset);
 }

 TEST_F(KernelCrossThreadTests, WhenAddingKernelThenProgramRefCountIsIncremented) {
--- a/opencl/test/unit_test/mocks/mock_context.cpp
+++ b/opencl/test/unit_test/mocks/mock_context.cpp
@@ -17,6 +17,7 @@
 #include "opencl/source/sharings/sharing.h"
 #include "opencl/test/unit_test/fixtures/cl_device_fixture.h"
 #include "opencl/test/unit_test/mocks/mock_cl_device.h"
+#include "opencl/test/unit_test/mocks/mock_kernel.h"

 #include "d3d_sharing_functions.h"

@@ -128,6 +129,52 @@ void MockContext::initializeWithDevices(const ClDeviceVector &devices, bool noSp
    setupContextType();
 }

+SchedulerKernel &MockContext::getSchedulerKernel() { // Test override: lazily builds the scheduler built-in as a MockSchedulerKernel and returns a reference to it.
+    if (schedulerBuiltIn->pKernel) { // kernel already created: return it without touching the one-time initializer
+        return *static_cast<SchedulerKernel *>(schedulerBuiltIn->pKernel);
+    }
+
+    auto initializeSchedulerProgramAndKernel = [&] { // one-time setup; invoked through std::call_once below
+        cl_int retVal = CL_SUCCESS;
+        auto clDevice = getDevice(0); // device 0 is the one used to load the scheduler binary and to process it
+        auto src = SchedulerKernel::loadSchedulerKernel(&clDevice->getDevice());
+
+        auto program = Program::createBuiltInFromGenBinary(this,
+                                                           devices,
+                                                           src.resource.data(),
+                                                           src.resource.size(),
+                                                           &retVal);
+        DEBUG_BREAK_IF(retVal != CL_SUCCESS);
+        DEBUG_BREAK_IF(!program);
+
+        retVal = program->processGenBinary(*clDevice);
+        DEBUG_BREAK_IF(retVal != CL_SUCCESS);
+
+        schedulerBuiltIn->pProgram = program;
+
+        KernelInfoContainer kernelInfos;
+        kernelInfos.resize(getMaxRootDeviceIndex() + 1); // sized so it can be indexed directly by root device index
+        for (auto rootDeviceIndex : rootDeviceIndices) { // only the slots of this context's root devices are filled
+            auto kernelInfo = schedulerBuiltIn->pProgram->getKernelInfo(SchedulerKernel::schedulerName, rootDeviceIndex);
+            DEBUG_BREAK_IF(!kernelInfo);
+            kernelInfos[rootDeviceIndex] = kernelInfo;
+        }
+
+        schedulerBuiltIn->pKernel = Kernel::create<MockSchedulerKernel>( // created as the mock type rather than a plain SchedulerKernel
+            schedulerBuiltIn->pProgram,
+            kernelInfos,
+            &retVal);
+
+        UNRECOVERABLE_IF(schedulerBuiltIn->pKernel->getScratchSize(clDevice->getRootDeviceIndex()) != 0); // NOTE(review): pKernel is dereferenced here before retVal is inspected and before the nullptr check after call_once - confirm create() cannot return null
+
+        DEBUG_BREAK_IF(retVal != CL_SUCCESS);
+    };
+    std::call_once(schedulerBuiltIn->programIsInitialized, initializeSchedulerProgramAndKernel); // programIsInitialized guards the lambda so it executes only once
+
+    UNRECOVERABLE_IF(schedulerBuiltIn->pKernel == nullptr);
+    return *static_cast<SchedulerKernel *>(schedulerBuiltIn->pKernel);
+}
+
 MockDefaultContext::MockDefaultContext() : MockContext(nullptr, nullptr) {
    pRootDevice0 = ultClDeviceFactory.rootDevices[0];
    pRootDevice1 = ultClDeviceFactory.rootDevices[1];
--- a/opencl/test/unit_test/mocks/mock_context.h
+++ b/opencl/test/unit_test/mocks/mock_context.h
@@ -47,6 +47,8 @@ class MockContext : public Context {
    std::unique_ptr<AsyncEventsHandler> &getAsyncEventsHandlerUniquePtr();
    void initializeWithDevices(const ClDeviceVector &devices, bool noSpecialQueue);

+    SchedulerKernel &getSchedulerKernel() override;
+
  private:
    ClDevice *pDevice = nullptr;
 };
--- a/opencl/test/unit_test/mocks/mock_kernel.h
+++ b/opencl/test/unit_test/mocks/mock_kernel.h
@@ -40,7 +40,6 @@ class MockKernel : public Kernel {
    using Kernel::kernelDeviceInfos;
    using Kernel::kernelSvmGfxAllocations;
    using Kernel::kernelUnifiedMemoryGfxAllocations;
-    using Kernel::numberOfBindingTableStates;
    using Kernel::patchBufferOffset;
    using Kernel::patchWithImplicitSurface;
    using Kernel::svmAllocationsRequireCacheFlush;
@@ -595,6 +594,7 @@ class MockParentKernel : public Kernel {

 class MockSchedulerKernel : public SchedulerKernel {
   public:
+    using SchedulerKernel::kernelDeviceInfos; // re-declared under public: so tests can read kernelDeviceInfos[rootDeviceIndex] through the mock
     MockSchedulerKernel(Program *programArg, const KernelInfoContainer &kernelInfoArg) : SchedulerKernel(programArg, kernelInfoArg){};
 };