diff --git a/opencl/source/built_ins/vme_dispatch_builder.h b/opencl/source/built_ins/vme_dispatch_builder.h
index 33fc321cbc..9019c2ae21 100644
--- a/opencl/source/built_ins/vme_dispatch_builder.h
+++ b/opencl/source/built_ins/vme_dispatch_builder.h
@@ -173,7 +173,7 @@ class VmeBuiltinDispatchInfoBuilder : public BuiltinDispatchInfoBuilder {
         DEBUG_BREAK_IF(kernelArgInfo.kernelArgPatchInfoVector.size() != 1);
         const KernelArgPatchInfo &patchInfo = kernelArgInfo.kernelArgPatchInfoVector[0];
         DEBUG_BREAK_IF(sizeof(RetType) > patchInfo.size);
-        return *(RetType *)(vmeKernel->getCrossThreadData(clDevice.getRootDeviceIndex()) + patchInfo.crossthreadOffset);
+        return *(RetType *)(vmeKernel->getCrossThreadData() + patchInfo.crossthreadOffset);
     }
 
     cl_int validateImages(Vec3<size_t> inputRegion, Vec3<size_t> offset) const {
diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h
index 8b24954279..010882d18e 100644
--- a/opencl/source/command_queue/enqueue_common.h
+++ b/opencl/source/command_queue/enqueue_common.h
@@ -921,7 +921,7 @@ void CommandQueueHw<GfxFamily>::enqueueBlocked(
             } else {
                 continue;
             }
-            kernel->getResidency(allSurfaces, device->getRootDeviceIndex());
+            kernel->getResidency(allSurfaces);
         }
         for (auto &surface : CreateRange(surfaces, surfaceCount)) {
             allSurfaces.push_back(surface->duplicate());
diff --git a/opencl/source/command_queue/enqueue_kernel.h b/opencl/source/command_queue/enqueue_kernel.h
index e14bdecbe3..9339b1899f 100644
--- a/opencl/source/command_queue/enqueue_kernel.h
+++ b/opencl/source/command_queue/enqueue_kernel.h
@@ -132,7 +132,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueKernel(
             ",", globalWorkSizeIn[2],
             ",SIMD:, ", kernelInfo.getMaxSimdSize());
 
-    if (totalWorkItems > kernel.getMaxKernelWorkGroupSize(rootDeviceIndex)) {
+    if (totalWorkItems > kernel.getMaxKernelWorkGroupSize()) {
         return CL_INVALID_WORK_GROUP_SIZE;
     }
 
diff --git a/opencl/source/command_queue/gpgpu_walker_bdw_plus.inl b/opencl/source/command_queue/gpgpu_walker_bdw_plus.inl
index 02fa0d6fa2..9d703e98cc 100644
--- a/opencl/source/command_queue/gpgpu_walker_bdw_plus.inl
+++ b/opencl/source/command_queue/gpgpu_walker_bdw_plus.inl
@@ -96,13 +96,13 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
     DEBUG_BREAK_IF(simd != PARALLEL_SCHEDULER_COMPILATION_SIZE_20);
 
     // Patch our kernel constants
-    scheduler.setGlobalWorkOffsetValues(rootDeviceIndex, 0, 0, 0);
-    scheduler.setGlobalWorkSizeValues(rootDeviceIndex, static_cast<uint32_t>(scheduler.getGws()), 1, 1);
-    scheduler.setLocalWorkSizeValues(rootDeviceIndex, static_cast<uint32_t>(scheduler.getLws()), 1, 1);
-    scheduler.setLocalWorkSize2Values(rootDeviceIndex, static_cast<uint32_t>(scheduler.getLws()), 1, 1);
-    scheduler.setEnqueuedLocalWorkSizeValues(rootDeviceIndex, static_cast<uint32_t>(scheduler.getLws()), 1, 1);
-    scheduler.setNumWorkGroupsValues(rootDeviceIndex, static_cast<uint32_t>(scheduler.getGws() / scheduler.getLws()), 0, 0);
-    scheduler.setWorkDim(rootDeviceIndex, 1);
+    scheduler.setGlobalWorkOffsetValues(0, 0, 0);
+    scheduler.setGlobalWorkSizeValues(static_cast<uint32_t>(scheduler.getGws()), 1, 1);
+    scheduler.setLocalWorkSizeValues(static_cast<uint32_t>(scheduler.getLws()), 1, 1);
+    scheduler.setLocalWorkSize2Values(static_cast<uint32_t>(scheduler.getLws()), 1, 1);
+    scheduler.setEnqueuedLocalWorkSizeValues(static_cast<uint32_t>(scheduler.getLws()), 1, 1);
+    scheduler.setNumWorkGroupsValues(static_cast<uint32_t>(scheduler.getGws() / scheduler.getLws()), 0, 0);
+    scheduler.setWorkDim(1);
 
     // Send our indirect object data
     size_t localWorkSizes[3] = {scheduler.getLws(), 1, 1};
diff --git a/opencl/source/command_queue/hardware_interface_base.inl b/opencl/source/command_queue/hardware_interface_base.inl
index a42787e616..96e4d1b918 100644
--- a/opencl/source/command_queue/hardware_interface_base.inl
+++ b/opencl/source/command_queue/hardware_interface_base.inl
@@ -211,23 +211,22 @@ void HardwareInterface<GfxFamily>::dispatchKernelCommands(CommandQueue &commandQ
 
     size_t globalWorkSizes[3] = {gws.x, gws.y, gws.z};
 
-    auto rootDeviceIndex = commandQueue.getDevice().getRootDeviceIndex();
     // Patch our kernel constants
-    kernel.setGlobalWorkOffsetValues(rootDeviceIndex, static_cast<uint32_t>(offset.x), static_cast<uint32_t>(offset.y), static_cast<uint32_t>(offset.z));
-    kernel.setGlobalWorkSizeValues(rootDeviceIndex, static_cast<uint32_t>(gws.x), static_cast<uint32_t>(gws.y), static_cast<uint32_t>(gws.z));
+    kernel.setGlobalWorkOffsetValues(static_cast<uint32_t>(offset.x), static_cast<uint32_t>(offset.y), static_cast<uint32_t>(offset.z));
+    kernel.setGlobalWorkSizeValues(static_cast<uint32_t>(gws.x), static_cast<uint32_t>(gws.y), static_cast<uint32_t>(gws.z));
 
-    if (isMainKernel || (!kernel.isLocalWorkSize2Patched(rootDeviceIndex))) {
-        kernel.setLocalWorkSizeValues(rootDeviceIndex, static_cast<uint32_t>(lws.x), static_cast<uint32_t>(lws.y), static_cast<uint32_t>(lws.z));
+    if (isMainKernel || (!kernel.isLocalWorkSize2Patched())) {
+        kernel.setLocalWorkSizeValues(static_cast<uint32_t>(lws.x), static_cast<uint32_t>(lws.y), static_cast<uint32_t>(lws.z));
     }
 
-    kernel.setLocalWorkSize2Values(rootDeviceIndex, static_cast<uint32_t>(lws.x), static_cast<uint32_t>(lws.y), static_cast<uint32_t>(lws.z));
-    kernel.setEnqueuedLocalWorkSizeValues(rootDeviceIndex, static_cast<uint32_t>(elws.x), static_cast<uint32_t>(elws.y), static_cast<uint32_t>(elws.z));
+    kernel.setLocalWorkSize2Values(static_cast<uint32_t>(lws.x), static_cast<uint32_t>(lws.y), static_cast<uint32_t>(lws.z));
+    kernel.setEnqueuedLocalWorkSizeValues(static_cast<uint32_t>(elws.x), static_cast<uint32_t>(elws.y), static_cast<uint32_t>(elws.z));
 
     if (isMainKernel) {
-        kernel.setNumWorkGroupsValues(rootDeviceIndex, static_cast<uint32_t>(totalNumberOfWorkgroups.x), static_cast<uint32_t>(totalNumberOfWorkgroups.y), static_cast<uint32_t>(totalNumberOfWorkgroups.z));
+        kernel.setNumWorkGroupsValues(static_cast<uint32_t>(totalNumberOfWorkgroups.x), static_cast<uint32_t>(totalNumberOfWorkgroups.y), static_cast<uint32_t>(totalNumberOfWorkgroups.z));
     }
 
-    kernel.setWorkDim(rootDeviceIndex, dim);
+    kernel.setWorkDim(dim);
 
     // Send our indirect object data
     size_t localWorkSizes[3] = {lws.x, lws.y, lws.z};
diff --git a/opencl/source/command_queue/local_work_size.cpp b/opencl/source/command_queue/local_work_size.cpp
index 5e7f1b2f05..9c45bc9302 100644
--- a/opencl/source/command_queue/local_work_size.cpp
+++ b/opencl/source/command_queue/local_work_size.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017-2020 Intel Corporation
+ * Copyright (C) 2017-2021 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -427,7 +427,7 @@ Vec3<size_t> computeWorkgroupSize(const DispatchInfo &dispatchInfo) {
             size_t workItems[3] = {dispatchInfo.getGWS().x, dispatchInfo.getGWS().y, dispatchInfo.getGWS().z};
             computeWorkgroupSizeND(wsInfo, workGroupSize, workItems, dispatchInfo.getDim());
         } else {
-            auto maxWorkGroupSize = kernel->getMaxKernelWorkGroupSize(rootDeviceIndex);
+            auto maxWorkGroupSize = kernel->getMaxKernelWorkGroupSize();
             auto simd = kernel->getKernelInfo(rootDeviceIndex).getMaxSimdSize();
             size_t workItems[3] = {dispatchInfo.getGWS().x, dispatchInfo.getGWS().y, dispatchInfo.getGWS().z};
             if (dispatchInfo.getDim() == 1) {
diff --git a/opencl/source/gtpin/gtpin_callbacks.cpp b/opencl/source/gtpin/gtpin_callbacks.cpp
index 8008f25d7a..710655c71d 100644
--- a/opencl/source/gtpin/gtpin_callbacks.cpp
+++ b/opencl/source/gtpin/gtpin_callbacks.cpp
@@ -67,7 +67,7 @@ void gtpinNotifyKernelCreate(cl_kernel kernel) {
         auto pKernel = pMultiDeviceKernel->getDefaultKernel();
         auto &device = pKernel->getDevices()[0]->getDevice();
         auto rootDeviceIndex = device.getRootDeviceIndex();
-        size_t gtpinBTI = pKernel->getNumberOfBindingTableStates(rootDeviceIndex);
+        size_t gtpinBTI = pKernel->getNumberOfBindingTableStates();
         // Enlarge local copy of SSH by 1 SS
         GFXCORE_FAMILY genFamily = device.getHardwareInfo().platform.eRenderCoreFamily;
         GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(genFamily);
@@ -141,7 +141,7 @@ void gtpinNotifyKernelSubmit(cl_kernel kernel, void *pCmdQueue) {
         }
         GFXCORE_FAMILY genFamily = device.getHardwareInfo().platform.eRenderCoreFamily;
         GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(genFamily);
-        size_t gtpinBTI = pKernel->getNumberOfBindingTableStates(rootDeviceIndex) - 1;
+        size_t gtpinBTI = pKernel->getNumberOfBindingTableStates() - 1;
         void *pSurfaceState = gtpinHelper.getSurfaceState(pKernel, gtpinBTI, rootDeviceIndex);
         cl_mem buffer = (cl_mem)resource;
         auto pBuffer = castToObjectOrAbort<Buffer>(buffer);
diff --git a/opencl/source/gtpin/gtpin_hw_helper.inl b/opencl/source/gtpin/gtpin_hw_helper.inl
index a8a9579585..2e6283478d 100644
--- a/opencl/source/gtpin/gtpin_hw_helper.inl
+++ b/opencl/source/gtpin/gtpin_hw_helper.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2020 Intel Corporation
+ * Copyright (C) 2018-2021 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,7 +27,7 @@ bool GTPinHwHelperHw<GfxFamily>::addSurfaceState(Kernel *pKernel, uint32_t rootD
     size_t ssSize = sizeof(RENDER_SURFACE_STATE);
     size_t btsSize = sizeof(BINDING_TABLE_STATE);
     size_t sizeToEnlarge = ssSize + btsSize;
-    size_t currBTOffset = pKernel->getBindingTableOffset(rootDeviceIndex);
+    size_t currBTOffset = pKernel->getBindingTableOffset();
     size_t currSurfaceStateSize = currBTOffset;
     char *pSsh = static_cast<char *>(pKernel->getSurfaceStateHeap(rootDeviceIndex));
     char *pNewSsh = new char[sshSize + sizeToEnlarge];
@@ -35,12 +35,12 @@ bool GTPinHwHelperHw<GfxFamily>::addSurfaceState(Kernel *pKernel, uint32_t rootD
     RENDER_SURFACE_STATE *pSS = reinterpret_cast<RENDER_SURFACE_STATE *>(pNewSsh + currSurfaceStateSize);
     *pSS = GfxFamily::cmdInitRenderSurfaceState;
     size_t newSurfaceStateSize = currSurfaceStateSize + ssSize;
-    size_t currBTCount = pKernel->getNumberOfBindingTableStates(rootDeviceIndex);
+    size_t currBTCount = pKernel->getNumberOfBindingTableStates();
     memcpy_s(pNewSsh + newSurfaceStateSize, sshSize + sizeToEnlarge - newSurfaceStateSize, pSsh + currBTOffset, currBTCount * btsSize);
     BINDING_TABLE_STATE *pNewBTS = reinterpret_cast<BINDING_TABLE_STATE *>(pNewSsh + newSurfaceStateSize + currBTCount * btsSize);
     *pNewBTS = GfxFamily::cmdInitBindingTableState;
     pNewBTS->setSurfaceStatePointer((uint64_t)currBTOffset);
-    pKernel->resizeSurfaceStateHeap(rootDeviceIndex, pNewSsh, sshSize + sizeToEnlarge, currBTCount + 1, newSurfaceStateSize);
+    pKernel->resizeSurfaceStateHeap(pNewSsh, sshSize + sizeToEnlarge, currBTCount + 1, newSurfaceStateSize);
     return true;
 }
 
@@ -48,10 +48,10 @@ template <typename GfxFamily>
 void *GTPinHwHelperHw<GfxFamily>::getSurfaceState(Kernel *pKernel, size_t bti, uint32_t rootDeviceIndex) {
     using BINDING_TABLE_STATE = typename GfxFamily::BINDING_TABLE_STATE;
 
-    if ((nullptr == pKernel->getSurfaceStateHeap(rootDeviceIndex)) || (bti >= pKernel->getNumberOfBindingTableStates(rootDeviceIndex))) {
+    if ((nullptr == pKernel->getSurfaceStateHeap(rootDeviceIndex)) || (bti >= pKernel->getNumberOfBindingTableStates())) { // no surface state heap, or bti is not a valid binding table index: return nullptr
         return nullptr;
     }
-    auto *pBts = reinterpret_cast<BINDING_TABLE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex), (pKernel->getBindingTableOffset(rootDeviceIndex) + bti * sizeof(BINDING_TABLE_STATE))));
+    auto *pBts = reinterpret_cast<BINDING_TABLE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex), (pKernel->getBindingTableOffset() + bti * sizeof(BINDING_TABLE_STATE)))); // entry number bti of the binding table, which starts at the binding table offset inside the heap
     auto pSurfaceState = ptrOffset(pKernel->getSurfaceStateHeap(rootDeviceIndex), pBts->getSurfaceStatePointer());
     return pSurfaceState;
 }
diff --git a/opencl/source/helpers/dispatch_info.cpp b/opencl/source/helpers/dispatch_info.cpp
index 6e4dc93177..32801a9ade 100644
--- a/opencl/source/helpers/dispatch_info.cpp
+++ b/opencl/source/helpers/dispatch_info.cpp
@@ -11,7 +11,7 @@
 
 namespace NEO {
 bool DispatchInfo::usesSlm() const {
-    return (kernel == nullptr) ? false : kernel->getSlmTotalSize(pClDevice->getRootDeviceIndex()) > 0;
+    return (kernel == nullptr) ? false : kernel->getSlmTotalSize() > 0; // false when no kernel is attached; otherwise true iff the kernel's total SLM size is non-zero
 }
 
 bool DispatchInfo::usesStatelessPrintfSurface() const {
diff --git a/opencl/source/helpers/hardware_commands_helper.h b/opencl/source/helpers/hardware_commands_helper.h
index c932761498..a0e7e83592 100644
--- a/opencl/source/helpers/hardware_commands_helper.h
+++ b/opencl/source/helpers/hardware_commands_helper.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017-2020 Intel Corporation
+ * Copyright (C) 2017-2021 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -77,8 +77,7 @@ struct HardwareCommandsHelper : public PerThreadDataHelper {
         Kernel &kernel,
         bool inlineDataProgrammingRequired,
         WALKER_TYPE<GfxFamily> *walkerCmd,
-        uint32_t &sizeCrossThreadData,
-        uint32_t rootDeviceIndex);
+        uint32_t &sizeCrossThreadData);
 
     static size_t sendIndirectState(
         LinearStream &commandStream,
diff --git a/opencl/source/helpers/hardware_commands_helper_base.inl b/opencl/source/helpers/hardware_commands_helper_base.inl
index 13c9381ae6..a5a19f5a63 100644
--- a/opencl/source/helpers/hardware_commands_helper_base.inl
+++ b/opencl/source/helpers/hardware_commands_helper_base.inl
@@ -63,7 +63,7 @@ size_t HardwareCommandsHelper<GfxFamily>::getSizeRequiredIOH(
 
     auto numChannels = kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels;
     uint32_t grfSize = sizeof(typename GfxFamily::GRF);
-    return alignUp((kernel.getCrossThreadDataSize(rootDeviceIndex) +
+    return alignUp((kernel.getCrossThreadDataSize() +
                     getPerThreadDataSizeTotal(kernelInfo.getMaxSimdSize(), grfSize, numChannels, localWorkSize)),
                    WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
 }
@@ -174,7 +174,7 @@ size_t HardwareCommandsHelper<GfxFamily>::sendInterfaceDescriptorData(
 
     interfaceDescriptor.setDenormMode(INTERFACE_DESCRIPTOR_DATA::DENORM_MODE_SETBYKERNEL);
 
-    auto slmTotalSize = kernel.getSlmTotalSize(rootDeviceIndex);
+    auto slmTotalSize = kernel.getSlmTotalSize();
 
     setGrfInfo(&interfaceDescriptor, kernel, sizeCrossThreadData, sizePerThreadData, rootDeviceIndex);
     EncodeDispatchKernel<GfxFamily>::appendAdditionalIDDFields(&interfaceDescriptor, hardwareInfo, threadsPerThreadGroup, slmTotalSize, SlmPolicy::SlmPolicyNone);
@@ -237,7 +237,7 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
 
     auto dstBindingTablePointer = EncodeSurfaceState<GfxFamily>::pushBindingTableAndSurfaceStates(ssh, kernelInfo.kernelDescriptor.payloadMappings.bindingTable.numEntries,
                                                                                                   kernel.getSurfaceStateHeap(rootDeviceIndex), kernel.getSurfaceStateHeapSize(rootDeviceIndex),
-                                                                                                  kernel.getNumberOfBindingTableStates(rootDeviceIndex), kernel.getBindingTableOffset(rootDeviceIndex));
+                                                                                                  kernel.getNumberOfBindingTableStates(), kernel.getBindingTableOffset());
 
     // Copy our sampler state if it exists
     const auto &samplerTable = kernelInfo.kernelDescriptor.payloadMappings.samplerTable;
@@ -254,11 +254,11 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
     auto threadsPerThreadGroup = static_cast<uint32_t>(getThreadsPerWG(simd, localWorkItems));
     auto numChannels = static_cast<uint32_t>(kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels);
 
-    uint32_t sizeCrossThreadData = kernel.getCrossThreadDataSize(rootDeviceIndex);
+    uint32_t sizeCrossThreadData = kernel.getCrossThreadDataSize();
 
     size_t offsetCrossThreadData = HardwareCommandsHelper<GfxFamily>::sendCrossThreadData(
         ioh, kernel, inlineDataProgrammingRequired,
-        walkerCmd, sizeCrossThreadData, rootDeviceIndex);
+        walkerCmd, sizeCrossThreadData);
 
     size_t sizePerThreadDataTotal = 0;
     size_t sizePerThreadData = 0;
@@ -277,7 +277,7 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
 
     uint64_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable + interfaceDescriptorIndex * sizeof(INTERFACE_DESCRIPTOR_DATA);
 
-    auto bindingTablePrefetchSize = std::min(31u, static_cast<uint32_t>(kernel.getNumberOfBindingTableStates(rootDeviceIndex)));
+    auto bindingTablePrefetchSize = std::min(31u, static_cast<uint32_t>(kernel.getNumberOfBindingTableStates()));
     if (resetBindingTablePrefetch(kernel)) {
         bindingTablePrefetchSize = 0;
     }
diff --git a/opencl/source/helpers/hardware_commands_helper_bdw_plus.inl b/opencl/source/helpers/hardware_commands_helper_bdw_plus.inl
index fbda56410b..dc6d13fd47 100644
--- a/opencl/source/helpers/hardware_commands_helper_bdw_plus.inl
+++ b/opencl/source/helpers/hardware_commands_helper_bdw_plus.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2020 Intel Corporation
+ * Copyright (C) 2018-2021 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -127,13 +127,12 @@ size_t HardwareCommandsHelper<GfxFamily>::sendCrossThreadData(
     Kernel &kernel,
     bool inlineDataProgrammingRequired,
     WALKER_TYPE<GfxFamily> *walkerCmd,
-    uint32_t &sizeCrossThreadData,
-    uint32_t rootDeviceIndex) {
+    uint32_t &sizeCrossThreadData) {
     indirectHeap.align(WALKER_TYPE<GfxFamily>::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
 
     auto offsetCrossThreadData = indirectHeap.getUsed();
     char *pDest = static_cast<char *>(indirectHeap.getSpace(sizeCrossThreadData));
-    memcpy_s(pDest, sizeCrossThreadData, kernel.getCrossThreadData(rootDeviceIndex), sizeCrossThreadData);
+    memcpy_s(pDest, sizeCrossThreadData, kernel.getCrossThreadData(), sizeCrossThreadData);
 
     if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
         FlatBatchBufferHelper::fixCrossThreadDataInfo(kernel.getPatchInfoDataList(), offsetCrossThreadData, indirectHeap.getGraphicsAllocation()->getGpuAddress());
diff --git a/opencl/source/helpers/task_information.cpp b/opencl/source/helpers/task_information.cpp
index 8a14719391..39c3d05235 100644
--- a/opencl/source/helpers/task_information.cpp
+++ b/opencl/source/helpers/task_information.cpp
@@ -198,7 +198,7 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
         scheduler.makeResident(commandStreamReceiver);
 
         // Update SLM usage
-        slmUsed |= scheduler.getSlmTotalSize(rootDeviceIndex) > 0;
+        slmUsed |= scheduler.getSlmTotalSize() > 0;
 
         this->kernel->getProgram()->getBlockKernelManager()->makeInternalAllocationsResident(commandStreamReceiver);
     }
diff --git a/opencl/source/kernel/kernel.cpp b/opencl/source/kernel/kernel.cpp
index 58f648d165..19af26aab5 100644
--- a/opencl/source/kernel/kernel.cpp
+++ b/opencl/source/kernel/kernel.cpp
@@ -73,25 +73,22 @@ Kernel::Kernel(Program *programArg, const KernelInfoContainer &kernelInfosArg, C
       deviceVector(programArg->getDevices()),
       kernelInfos(kernelInfosArg),
       defaultRootDeviceIndex(clDeviceArg.getRootDeviceIndex()) {
-    kernelDeviceInfos.resize(program->getMaxRootDeviceIndex() + 1);
     program->retain();
     program->retainForKernel();
     imageTransformer.reset(new ImageTransformer);
     auto rootDeviceIndex = defaultRootDeviceIndex;
-    kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize = static_cast<uint32_t>(clDevice.getSharedDeviceInfo().maxWorkGroupSize);
-    kernelDeviceInfos[rootDeviceIndex].slmTotalSize = kernelInfosArg[rootDeviceIndex]->workloadInfo.slmStaticSize;
+    maxKernelWorkGroupSize = static_cast<uint32_t>(clDevice.getSharedDeviceInfo().maxWorkGroupSize);
+    slmTotalSize = kernelInfosArg[rootDeviceIndex]->workloadInfo.slmStaticSize;
 }
 
 Kernel::~Kernel() {
-    for (auto &kernelDeviceInfo : kernelDeviceInfos) {
-        delete[] kernelDeviceInfo.crossThreadData;
-        kernelDeviceInfo.crossThreadData = nullptr;
-        kernelDeviceInfo.crossThreadDataSize = 0;
+    delete[] crossThreadData;
+    crossThreadData = nullptr;
+    crossThreadDataSize = 0;
 
-        if (kernelDeviceInfo.privateSurface) {
-            program->peekExecutionEnvironment().memoryManager->checkGpuUsageAndDestroyGraphicsAllocations(kernelDeviceInfo.privateSurface);
-            kernelDeviceInfo.privateSurface = nullptr;
-        }
+    if (privateSurface) {
+        program->peekExecutionEnvironment().memoryManager->checkGpuUsageAndDestroyGraphicsAllocations(privateSurface);
+        privateSurface = nullptr;
     }
 
     if (kernelReflectionSurface) {
@@ -138,7 +135,6 @@ inline void patch(const SrcT &src, void *dst, uint32_t dstOffsetBytes) {
 void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const Device &device, const ArgDescPointer &arg) {
     auto rootDeviceIndex = device.getRootDeviceIndex();
 
-    void *crossThreadData = getCrossThreadData(rootDeviceIndex);
     if ((nullptr != crossThreadData) && isValidOffset(arg.stateless)) {
         auto pp = ptrOffset(crossThreadData, arg.stateless);
         uintptr_t addressToPatch = reinterpret_cast<uintptr_t>(ptrToPatchInCrossThreadData);
@@ -164,7 +160,6 @@ void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, Graphic
     auto rootDeviceIndex = device.getRootDeviceIndex();
     uint32_t pointerSize = patch.DataParamSize;
 
-    void *crossThreadData = getCrossThreadData(rootDeviceIndex);
     if (crossThreadData != nullptr) {
         uint32_t crossThreadDataOffset = patch.DataParamOffset;
         auto pp = ptrOffset(crossThreadData, crossThreadDataOffset);
@@ -200,7 +195,6 @@ cl_int Kernel::initialize() {
     reconfigureKernel(rootDeviceIndex);
     auto &hwInfo = pClDevice->getHardwareInfo();
     auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
-    auto &kernelDeviceInfo = kernelDeviceInfos[rootDeviceIndex];
     auto &kernelInfo = *kernelInfos[rootDeviceIndex];
     auto &kernelDescriptor = kernelInfo.kernelDescriptor;
     auto maxSimdSize = kernelInfo.getMaxSimdSize();
@@ -211,133 +205,133 @@ cl_int Kernel::initialize() {
         return CL_INVALID_KERNEL;
     }
 
-    kernelDeviceInfo.crossThreadDataSize = kernelDescriptor.kernelAttributes.crossThreadDataSize;
+    crossThreadDataSize = kernelDescriptor.kernelAttributes.crossThreadDataSize;
 
     // now allocate our own cross-thread data, if necessary
-    if (kernelDeviceInfo.crossThreadDataSize) {
-        kernelDeviceInfo.crossThreadData = new char[kernelDeviceInfo.crossThreadDataSize];
+    if (crossThreadDataSize) {
+        crossThreadData = new char[crossThreadDataSize];
 
         if (kernelInfo.crossThreadData) {
-            memcpy_s(kernelDeviceInfo.crossThreadData, kernelDeviceInfo.crossThreadDataSize,
-                     kernelInfo.crossThreadData, kernelDeviceInfo.crossThreadDataSize);
+            memcpy_s(crossThreadData, crossThreadDataSize,
+                     kernelInfo.crossThreadData, crossThreadDataSize);
         } else {
-            memset(kernelDeviceInfo.crossThreadData, 0x00, kernelDeviceInfo.crossThreadDataSize);
+            memset(crossThreadData, 0x00, crossThreadDataSize);
         }
 
-        auto crossThread = reinterpret_cast<uint32_t *>(kernelDeviceInfo.crossThreadData);
-        kernelDeviceInfo.globalWorkOffsetX = workloadInfo.globalWorkOffsetOffsets[0] != WorkloadInfo::undefinedOffset
-                                                 ? ptrOffset(crossThread, workloadInfo.globalWorkOffsetOffsets[0])
-                                                 : kernelDeviceInfo.globalWorkOffsetX;
-        kernelDeviceInfo.globalWorkOffsetY = workloadInfo.globalWorkOffsetOffsets[1] != WorkloadInfo::undefinedOffset
-                                                 ? ptrOffset(crossThread, workloadInfo.globalWorkOffsetOffsets[1])
-                                                 : kernelDeviceInfo.globalWorkOffsetY;
-        kernelDeviceInfo.globalWorkOffsetZ = workloadInfo.globalWorkOffsetOffsets[2] != WorkloadInfo::undefinedOffset
-                                                 ? ptrOffset(crossThread, workloadInfo.globalWorkOffsetOffsets[2])
-                                                 : kernelDeviceInfo.globalWorkOffsetZ;
+        auto crossThread = reinterpret_cast<uint32_t *>(crossThreadData);
+        globalWorkOffsetX = workloadInfo.globalWorkOffsetOffsets[0] != WorkloadInfo::undefinedOffset
+                                ? ptrOffset(crossThread, workloadInfo.globalWorkOffsetOffsets[0])
+                                : globalWorkOffsetX;
+        globalWorkOffsetY = workloadInfo.globalWorkOffsetOffsets[1] != WorkloadInfo::undefinedOffset
+                                ? ptrOffset(crossThread, workloadInfo.globalWorkOffsetOffsets[1])
+                                : globalWorkOffsetY;
+        globalWorkOffsetZ = workloadInfo.globalWorkOffsetOffsets[2] != WorkloadInfo::undefinedOffset
+                                ? ptrOffset(crossThread, workloadInfo.globalWorkOffsetOffsets[2])
+                                : globalWorkOffsetZ;
 
-        kernelDeviceInfo.localWorkSizeX = workloadInfo.localWorkSizeOffsets[0] != WorkloadInfo::undefinedOffset
-                                              ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets[0])
-                                              : kernelDeviceInfo.localWorkSizeX;
-        kernelDeviceInfo.localWorkSizeY = workloadInfo.localWorkSizeOffsets[1] != WorkloadInfo::undefinedOffset
-                                              ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets[1])
-                                              : kernelDeviceInfo.localWorkSizeY;
-        kernelDeviceInfo.localWorkSizeZ = workloadInfo.localWorkSizeOffsets[2] != WorkloadInfo::undefinedOffset
-                                              ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets[2])
-                                              : kernelDeviceInfo.localWorkSizeZ;
+        localWorkSizeX = workloadInfo.localWorkSizeOffsets[0] != WorkloadInfo::undefinedOffset
+                             ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets[0])
+                             : localWorkSizeX;
+        localWorkSizeY = workloadInfo.localWorkSizeOffsets[1] != WorkloadInfo::undefinedOffset
+                             ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets[1])
+                             : localWorkSizeY;
+        localWorkSizeZ = workloadInfo.localWorkSizeOffsets[2] != WorkloadInfo::undefinedOffset
+                             ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets[2])
+                             : localWorkSizeZ;
 
-        kernelDeviceInfo.localWorkSizeX2 = workloadInfo.localWorkSizeOffsets2[0] != WorkloadInfo::undefinedOffset
-                                               ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets2[0])
-                                               : kernelDeviceInfo.localWorkSizeX2;
-        kernelDeviceInfo.localWorkSizeY2 = workloadInfo.localWorkSizeOffsets2[1] != WorkloadInfo::undefinedOffset
-                                               ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets2[1])
-                                               : kernelDeviceInfo.localWorkSizeY2;
-        kernelDeviceInfo.localWorkSizeZ2 = workloadInfo.localWorkSizeOffsets2[2] != WorkloadInfo::undefinedOffset
-                                               ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets2[2])
-                                               : kernelDeviceInfo.localWorkSizeZ2;
+        localWorkSizeX2 = workloadInfo.localWorkSizeOffsets2[0] != WorkloadInfo::undefinedOffset
+                              ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets2[0])
+                              : localWorkSizeX2;
+        localWorkSizeY2 = workloadInfo.localWorkSizeOffsets2[1] != WorkloadInfo::undefinedOffset
+                              ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets2[1])
+                              : localWorkSizeY2;
+        localWorkSizeZ2 = workloadInfo.localWorkSizeOffsets2[2] != WorkloadInfo::undefinedOffset
+                              ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets2[2])
+                              : localWorkSizeZ2;
 
-        kernelDeviceInfo.globalWorkSizeX = workloadInfo.globalWorkSizeOffsets[0] != WorkloadInfo::undefinedOffset
-                                               ? ptrOffset(crossThread, workloadInfo.globalWorkSizeOffsets[0])
-                                               : kernelDeviceInfo.globalWorkSizeX;
-        kernelDeviceInfo.globalWorkSizeY = workloadInfo.globalWorkSizeOffsets[1] != WorkloadInfo::undefinedOffset
-                                               ? ptrOffset(crossThread, workloadInfo.globalWorkSizeOffsets[1])
-                                               : kernelDeviceInfo.globalWorkSizeY;
-        kernelDeviceInfo.globalWorkSizeZ = workloadInfo.globalWorkSizeOffsets[2] != WorkloadInfo::undefinedOffset
-                                               ? ptrOffset(crossThread, workloadInfo.globalWorkSizeOffsets[2])
-                                               : kernelDeviceInfo.globalWorkSizeZ;
+        globalWorkSizeX = workloadInfo.globalWorkSizeOffsets[0] != WorkloadInfo::undefinedOffset
+                              ? ptrOffset(crossThread, workloadInfo.globalWorkSizeOffsets[0])
+                              : globalWorkSizeX;
+        globalWorkSizeY = workloadInfo.globalWorkSizeOffsets[1] != WorkloadInfo::undefinedOffset
+                              ? ptrOffset(crossThread, workloadInfo.globalWorkSizeOffsets[1])
+                              : globalWorkSizeY;
+        globalWorkSizeZ = workloadInfo.globalWorkSizeOffsets[2] != WorkloadInfo::undefinedOffset
+                              ? ptrOffset(crossThread, workloadInfo.globalWorkSizeOffsets[2])
+                              : globalWorkSizeZ;
 
-        kernelDeviceInfo.enqueuedLocalWorkSizeX = workloadInfo.enqueuedLocalWorkSizeOffsets[0] != WorkloadInfo::undefinedOffset
-                                                      ? ptrOffset(crossThread, workloadInfo.enqueuedLocalWorkSizeOffsets[0])
-                                                      : kernelDeviceInfo.enqueuedLocalWorkSizeX;
-        kernelDeviceInfo.enqueuedLocalWorkSizeY = workloadInfo.enqueuedLocalWorkSizeOffsets[1] != WorkloadInfo::undefinedOffset
-                                                      ? ptrOffset(crossThread, workloadInfo.enqueuedLocalWorkSizeOffsets[1])
-                                                      : kernelDeviceInfo.enqueuedLocalWorkSizeY;
-        kernelDeviceInfo.enqueuedLocalWorkSizeZ = workloadInfo.enqueuedLocalWorkSizeOffsets[2] != WorkloadInfo::undefinedOffset
-                                                      ? ptrOffset(crossThread, workloadInfo.enqueuedLocalWorkSizeOffsets[2])
-                                                      : kernelDeviceInfo.enqueuedLocalWorkSizeZ;
+        enqueuedLocalWorkSizeX = workloadInfo.enqueuedLocalWorkSizeOffsets[0] != WorkloadInfo::undefinedOffset
+                                     ? ptrOffset(crossThread, workloadInfo.enqueuedLocalWorkSizeOffsets[0])
+                                     : enqueuedLocalWorkSizeX;
+        enqueuedLocalWorkSizeY = workloadInfo.enqueuedLocalWorkSizeOffsets[1] != WorkloadInfo::undefinedOffset
+                                     ? ptrOffset(crossThread, workloadInfo.enqueuedLocalWorkSizeOffsets[1])
+                                     : enqueuedLocalWorkSizeY;
+        enqueuedLocalWorkSizeZ = workloadInfo.enqueuedLocalWorkSizeOffsets[2] != WorkloadInfo::undefinedOffset
+                                     ? ptrOffset(crossThread, workloadInfo.enqueuedLocalWorkSizeOffsets[2])
+                                     : enqueuedLocalWorkSizeZ;
 
-        kernelDeviceInfo.numWorkGroupsX = workloadInfo.numWorkGroupsOffset[0] != WorkloadInfo::undefinedOffset
-                                              ? ptrOffset(crossThread, workloadInfo.numWorkGroupsOffset[0])
-                                              : kernelDeviceInfo.numWorkGroupsX;
-        kernelDeviceInfo.numWorkGroupsY = workloadInfo.numWorkGroupsOffset[1] != WorkloadInfo::undefinedOffset
-                                              ? ptrOffset(crossThread, workloadInfo.numWorkGroupsOffset[1])
-                                              : kernelDeviceInfo.numWorkGroupsY;
-        kernelDeviceInfo.numWorkGroupsZ = workloadInfo.numWorkGroupsOffset[2] != WorkloadInfo::undefinedOffset
-                                              ? ptrOffset(crossThread, workloadInfo.numWorkGroupsOffset[2])
-                                              : kernelDeviceInfo.numWorkGroupsZ;
+        numWorkGroupsX = workloadInfo.numWorkGroupsOffset[0] != WorkloadInfo::undefinedOffset
+                             ? ptrOffset(crossThread, workloadInfo.numWorkGroupsOffset[0])
+                             : numWorkGroupsX;
+        numWorkGroupsY = workloadInfo.numWorkGroupsOffset[1] != WorkloadInfo::undefinedOffset
+                             ? ptrOffset(crossThread, workloadInfo.numWorkGroupsOffset[1])
+                             : numWorkGroupsY;
+        numWorkGroupsZ = workloadInfo.numWorkGroupsOffset[2] != WorkloadInfo::undefinedOffset
+                             ? ptrOffset(crossThread, workloadInfo.numWorkGroupsOffset[2])
+                             : numWorkGroupsZ;
 
-        kernelDeviceInfo.maxWorkGroupSizeForCrossThreadData = workloadInfo.maxWorkGroupSizeOffset != WorkloadInfo::undefinedOffset
-                                                                  ? ptrOffset(crossThread, workloadInfo.maxWorkGroupSizeOffset)
-                                                                  : kernelDeviceInfo.maxWorkGroupSizeForCrossThreadData;
-        kernelDeviceInfo.workDim = workloadInfo.workDimOffset != WorkloadInfo::undefinedOffset
-                                       ? ptrOffset(crossThread, workloadInfo.workDimOffset)
-                                       : kernelDeviceInfo.workDim;
-        kernelDeviceInfo.dataParameterSimdSize = workloadInfo.simdSizeOffset != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.simdSizeOffset) : kernelDeviceInfo.dataParameterSimdSize;
-        kernelDeviceInfo.parentEventOffset = workloadInfo.parentEventOffset != WorkloadInfo::undefinedOffset
-                                                 ? ptrOffset(crossThread, workloadInfo.parentEventOffset)
-                                                 : kernelDeviceInfo.parentEventOffset;
-        kernelDeviceInfo.preferredWkgMultipleOffset = workloadInfo.preferredWkgMultipleOffset != WorkloadInfo::undefinedOffset
-                                                          ? ptrOffset(crossThread, workloadInfo.preferredWkgMultipleOffset)
-                                                          : kernelDeviceInfo.preferredWkgMultipleOffset;
+        maxWorkGroupSizeForCrossThreadData = workloadInfo.maxWorkGroupSizeOffset != WorkloadInfo::undefinedOffset
+                                                 ? ptrOffset(crossThread, workloadInfo.maxWorkGroupSizeOffset)
+                                                 : maxWorkGroupSizeForCrossThreadData;
+        workDim = workloadInfo.workDimOffset != WorkloadInfo::undefinedOffset
+                      ? ptrOffset(crossThread, workloadInfo.workDimOffset)
+                      : workDim;
+        dataParameterSimdSize = workloadInfo.simdSizeOffset != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.simdSizeOffset) : dataParameterSimdSize;
+        parentEventOffset = workloadInfo.parentEventOffset != WorkloadInfo::undefinedOffset
+                                ? ptrOffset(crossThread, workloadInfo.parentEventOffset)
+                                : parentEventOffset;
+        preferredWkgMultipleOffset = workloadInfo.preferredWkgMultipleOffset != WorkloadInfo::undefinedOffset
+                                         ? ptrOffset(crossThread, workloadInfo.preferredWkgMultipleOffset)
+                                         : preferredWkgMultipleOffset;
 
-        *kernelDeviceInfo.maxWorkGroupSizeForCrossThreadData = kernelDeviceInfo.maxKernelWorkGroupSize;
-        *kernelDeviceInfo.dataParameterSimdSize = maxSimdSize;
-        *kernelDeviceInfo.preferredWkgMultipleOffset = maxSimdSize;
-        *kernelDeviceInfo.parentEventOffset = WorkloadInfo::invalidParentEvent;
+        *maxWorkGroupSizeForCrossThreadData = maxKernelWorkGroupSize;
+        *dataParameterSimdSize = maxSimdSize;
+        *preferredWkgMultipleOffset = maxSimdSize;
+        *parentEventOffset = WorkloadInfo::invalidParentEvent;
     }
 
     // allocate our own SSH, if necessary
-    kernelDeviceInfo.sshLocalSize = heapInfo.SurfaceStateHeapSize;
+    sshLocalSize = heapInfo.SurfaceStateHeapSize;
 
-    if (kernelDeviceInfo.sshLocalSize) {
-        kernelDeviceInfo.pSshLocal = std::make_unique<char[]>(kernelDeviceInfo.sshLocalSize);
+    if (sshLocalSize) {
+        pSshLocal = std::make_unique<char[]>(sshLocalSize);
 
         // copy the ssh into our local copy
-        memcpy_s(kernelDeviceInfo.pSshLocal.get(), kernelDeviceInfo.sshLocalSize,
-                 heapInfo.pSsh, kernelDeviceInfo.sshLocalSize);
+        memcpy_s(pSshLocal.get(), sshLocalSize,
+                 heapInfo.pSsh, sshLocalSize);
     }
-    kernelDeviceInfo.numberOfBindingTableStates = kernelDescriptor.payloadMappings.bindingTable.numEntries;
-    kernelDeviceInfo.localBindingTableOffset = kernelDescriptor.payloadMappings.bindingTable.tableOffset;
+    numberOfBindingTableStates = kernelDescriptor.payloadMappings.bindingTable.numEntries;
+    localBindingTableOffset = kernelDescriptor.payloadMappings.bindingTable.tableOffset;
 
     // patch crossthread data and ssh with inline surfaces, if necessary
     auto perHwThreadPrivateMemorySize = kernelDescriptor.kernelAttributes.perHwThreadPrivateMemorySize;
 
     if (perHwThreadPrivateMemorySize) {
-        kernelDeviceInfo.privateSurfaceSize = KernelHelper::getPrivateSurfaceSize(perHwThreadPrivateMemorySize, pClDevice->getSharedDeviceInfo().computeUnitsUsedForScratch);
+        privateSurfaceSize = KernelHelper::getPrivateSurfaceSize(perHwThreadPrivateMemorySize, pClDevice->getSharedDeviceInfo().computeUnitsUsedForScratch);
 
-        DEBUG_BREAK_IF(kernelDeviceInfo.privateSurfaceSize == 0);
-        if (kernelDeviceInfo.privateSurfaceSize > std::numeric_limits<uint32_t>::max()) {
+        DEBUG_BREAK_IF(privateSurfaceSize == 0);
+        if (privateSurfaceSize > std::numeric_limits<uint32_t>::max()) {
             return CL_OUT_OF_RESOURCES;
         }
-        kernelDeviceInfo.privateSurface = executionEnvironment.memoryManager->allocateGraphicsMemoryWithProperties(
+        privateSurface = executionEnvironment.memoryManager->allocateGraphicsMemoryWithProperties(
             {rootDeviceIndex,
-             static_cast<size_t>(kernelDeviceInfo.privateSurfaceSize),
+             static_cast<size_t>(privateSurfaceSize),
              GraphicsAllocation::AllocationType::PRIVATE_SURFACE,
              pClDevice->getDeviceBitfield()});
-        if (kernelDeviceInfo.privateSurface == nullptr) {
+        if (privateSurface == nullptr) {
             return CL_OUT_OF_RESOURCES;
         }
         const auto &patch = kernelDescriptor.payloadMappings.implicitArgs.privateMemoryAddress;
-        patchWithImplicitSurface(reinterpret_cast<void *>(kernelDeviceInfo.privateSurface->getGpuAddressToPatch()), *kernelDeviceInfo.privateSurface, pClDevice->getDevice(), patch);
+        patchWithImplicitSurface(reinterpret_cast<void *>(privateSurface->getGpuAddressToPatch()), *privateSurface, pClDevice->getDevice(), patch);
     }
     if (isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress.stateless)) {
         DEBUG_BREAK_IF(program->getConstantSurface(rootDeviceIndex) == nullptr);
@@ -391,7 +385,7 @@ cl_int Kernel::initialize() {
         debugEnabled = true;
     }
     auto numArgs = kernelInfo.kernelArgInfo.size();
-    kernelDeviceInfo.slmSizes.resize(numArgs);
+    slmSizes.resize(numArgs);
 
     this->kernelHasIndirectAccess |= kernelInfo.kernelDescriptor.kernelAttributes.hasNonKernelArgLoad ||
                                      kernelInfo.kernelDescriptor.kernelAttributes.hasNonKernelArgStore ||
@@ -449,10 +443,9 @@ cl_int Kernel::initialize() {
 
 cl_int Kernel::cloneKernel(Kernel *pSourceKernel) {
     // copy cross thread data to store arguments set to source kernel with clSetKernelArg on immediate data (non-pointer types)
-    auto rootDeviceIndex = defaultRootDeviceIndex;
-    memcpy_s(kernelDeviceInfos[rootDeviceIndex].crossThreadData, kernelDeviceInfos[rootDeviceIndex].crossThreadDataSize,
-             pSourceKernel->kernelDeviceInfos[rootDeviceIndex].crossThreadData, pSourceKernel->kernelDeviceInfos[rootDeviceIndex].crossThreadDataSize);
-    DEBUG_BREAK_IF(pSourceKernel->kernelDeviceInfos[rootDeviceIndex].crossThreadDataSize != kernelDeviceInfos[rootDeviceIndex].crossThreadDataSize);
+    memcpy_s(crossThreadData, crossThreadDataSize,
+             pSourceKernel->crossThreadData, pSourceKernel->crossThreadDataSize);
+    DEBUG_BREAK_IF(pSourceKernel->crossThreadDataSize != crossThreadDataSize);
 
     // copy arguments set to source kernel with clSetKernelArg or clSetKernelArgSVMPointer
     for (uint32_t i = 0; i < pSourceKernel->kernelArguments.size(); i++) {
@@ -641,7 +634,7 @@ cl_int Kernel::getWorkGroupInfo(ClDevice &device, cl_kernel_work_group_info para
 
     switch (paramName) {
     case CL_KERNEL_WORK_GROUP_SIZE:
-        maxWorkgroupSize = kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize;
+        maxWorkgroupSize = maxKernelWorkGroupSize;
         if (DebugManager.flags.UseMaxSimdSizeToDeduceMaxWorkgroupSize.get()) {
             auto divisionSize = CommonConstants::maximalSimdSize / kernelInfo.getMaxSimdSize();
             maxWorkgroupSize /= divisionSize;
@@ -704,7 +697,7 @@ cl_int Kernel::getSubGroupInfo(ClDevice &clDevice, cl_kernel_sub_group_info para
     auto rootDeviceIndex = clDevice.getRootDeviceIndex();
     const auto &kernelInfo = getKernelInfo(rootDeviceIndex);
     auto maxSimdSize = static_cast<size_t>(kernelInfo.getMaxSimdSize());
-    auto maxRequiredWorkGroupSize = static_cast<size_t>(kernelInfo.getMaxRequiredWorkGroupSize(getMaxKernelWorkGroupSize(rootDeviceIndex)));
+    auto maxRequiredWorkGroupSize = static_cast<size_t>(kernelInfo.getMaxRequiredWorkGroupSize(getMaxKernelWorkGroupSize()));
     auto largestCompiledSIMDSize = static_cast<size_t>(kernelInfo.getMaxSimdSize());
 
     GetInfoHelper info(paramValue, paramValueSize, paramValueSizeRet);
@@ -850,7 +843,7 @@ void Kernel::setStartOffset(uint32_t offset) {
 }
 
 void *Kernel::getSurfaceStateHeap(uint32_t rootDeviceIndex) const {
-    return kernelInfos[rootDeviceIndex]->usesSsh ? kernelDeviceInfos[rootDeviceIndex].pSshLocal.get() : nullptr;
+    return kernelInfos[rootDeviceIndex]->usesSsh ? pSshLocal.get() : nullptr;
 }
 
 size_t Kernel::getDynamicStateHeapSize(uint32_t rootDeviceIndex) const {
@@ -863,19 +856,19 @@ const void *Kernel::getDynamicStateHeap(uint32_t rootDeviceIndex) const {
 
 size_t Kernel::getSurfaceStateHeapSize(uint32_t rootDeviceIndex) const {
     return kernelInfos[rootDeviceIndex]->usesSsh
-               ? kernelDeviceInfos[rootDeviceIndex].sshLocalSize
+               ? sshLocalSize
                : 0;
 }
 
-size_t Kernel::getNumberOfBindingTableStates(uint32_t rootDeviceIndex) const {
-    return kernelDeviceInfos[rootDeviceIndex].numberOfBindingTableStates;
+size_t Kernel::getNumberOfBindingTableStates() const {
+    return numberOfBindingTableStates;
 }
 
-void Kernel::resizeSurfaceStateHeap(uint32_t rootDeviceIndex, void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset) {
-    kernelDeviceInfos[rootDeviceIndex].pSshLocal.reset(static_cast<char *>(pNewSsh));
-    kernelDeviceInfos[rootDeviceIndex].sshLocalSize = static_cast<uint32_t>(newSshSize);
-    kernelDeviceInfos[rootDeviceIndex].numberOfBindingTableStates = newBindingTableCount;
-    kernelDeviceInfos[rootDeviceIndex].localBindingTableOffset = newBindingTableOffset;
+void Kernel::resizeSurfaceStateHeap(void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset) {
+    pSshLocal.reset(static_cast<char *>(pNewSsh));
+    sshLocalSize = static_cast<uint32_t>(newSshSize);
+    numberOfBindingTableStates = newBindingTableCount;
+    localBindingTableOffset = newBindingTableOffset;
 }
 
 cl_int Kernel::setArg(uint32_t argIndex, size_t argSize, const void *argVal) {
@@ -936,7 +929,7 @@ void *Kernel::patchBufferOffset(const KernelArgInfo &argInfo, void *svmPtr, Grap
     DEBUG_BREAK_IF(ptrDiff(svmPtr, ptrToPatch) != static_cast<uint32_t>(ptrDiff(svmPtr, ptrToPatch)));
     uint32_t offsetToPatch = static_cast<uint32_t>(ptrDiff(svmPtr, ptrToPatch));
 
-    patch<uint32_t, uint32_t>(offsetToPatch, getCrossThreadData(rootDeviceIndex), argInfo.offsetBufferOffset);
+    patch<uint32_t, uint32_t>(offsetToPatch, crossThreadData, argInfo.offsetBufferOffset);
     return ptrToPatch;
 }
 
@@ -974,7 +967,7 @@ cl_int Kernel::setArgSvmAlloc(uint32_t argIndex, void *svmPtr, GraphicsAllocatio
 
     void *ptrToPatch = patchBufferOffset(kernelArgInfo, svmPtr, svmAlloc, rootDeviceIndex);
 
-    auto patchLocation = ptrOffset(getCrossThreadData(rootDeviceIndex),
+    auto patchLocation = ptrOffset(crossThreadData,
                                    kernelArgInfo.kernelArgPatchInfoVector[0].crossthreadOffset);
 
     auto patchSize = kernelArgInfo.kernelArgPatchInfoVector[0].size;
@@ -1143,7 +1136,7 @@ uint32_t Kernel::getMaxWorkGroupCount(const cl_uint workDim, const size_t *local
                                               availableThreadCount,
                                               dssCount,
                                               dssCount * KB * hardwareInfo.capabilityTable.slmSize,
-                                              hwHelper.alignSlmSize(kernelDeviceInfos[rootDeviceIndex].slmTotalSize),
+                                              hwHelper.alignSlmSize(slmTotalSize),
                                               static_cast<uint32_t>(hwHelper.getMaxBarrierRegisterPerSlice()),
                                               hwHelper.getBarriersCountFromHasBarriers(barrierCount),
                                               workDim,
@@ -1264,8 +1257,8 @@ bool Kernel::isSingleSubdevicePreferred() const {
 
 void Kernel::makeResident(CommandStreamReceiver &commandStreamReceiver) {
     auto rootDeviceIndex = commandStreamReceiver.getRootDeviceIndex();
-    if (kernelDeviceInfos[rootDeviceIndex].privateSurface) {
-        commandStreamReceiver.makeResident(*kernelDeviceInfos[rootDeviceIndex].privateSurface);
+    if (privateSurface) {
+        commandStreamReceiver.makeResident(*privateSurface);
     }
 
     if (program->getConstantSurface(rootDeviceIndex)) {
@@ -1312,12 +1305,13 @@ void Kernel::makeResident(CommandStreamReceiver &commandStreamReceiver) {
     }
 }
 
-void Kernel::getResidency(std::vector<Surface *> &dst, uint32_t rootDeviceIndex) {
-    if (kernelDeviceInfos[rootDeviceIndex].privateSurface) {
-        GeneralSurface *surface = new GeneralSurface(kernelDeviceInfos[rootDeviceIndex].privateSurface);
+void Kernel::getResidency(std::vector<Surface *> &dst) {
+    if (privateSurface) {
+        GeneralSurface *surface = new GeneralSurface(privateSurface);
         dst.push_back(surface);
     }
 
+    auto rootDeviceIndex = getDevice().getRootDeviceIndex();
     if (program->getConstantSurface(rootDeviceIndex)) {
         GeneralSurface *surface = new GeneralSurface(program->getConstantSurface(rootDeviceIndex));
         dst.push_back(surface);
@@ -1391,13 +1385,12 @@ cl_int Kernel::setArgLocal(uint32_t argIndexIn,
     storeKernelArg(argIndexIn, SLM_OBJ, nullptr, argVal, argSize);
     auto pClDevice = &getDevice();
     auto rootDeviceIndex = pClDevice->getRootDeviceIndex();
-    auto crossThreadData = reinterpret_cast<uint32_t *>(getCrossThreadData(rootDeviceIndex));
     auto &kernelInfo = *kernelInfos[rootDeviceIndex];
-    auto &kernelDeviceInfo = kernelDeviceInfos[rootDeviceIndex];
+    uint32_t *crossThreadData = reinterpret_cast<uint32_t *>(this->crossThreadData);
 
     uint32_t argIndex = argIndexIn;
 
-    kernelDeviceInfo.slmSizes[argIndex] = argSize;
+    slmSizes[argIndex] = argSize;
 
     // Extract our current slmOffset
     auto slmOffset = *ptrOffset(crossThreadData,
@@ -1408,7 +1401,7 @@ cl_int Kernel::setArgLocal(uint32_t argIndexIn,
 
     // Update all slm offsets after this argIndex
     ++argIndex;
-    while (argIndex < kernelDeviceInfo.slmSizes.size()) {
+    while (argIndex < slmSizes.size()) {
         const auto &kernelArgInfo = kernelInfo.kernelArgInfo[argIndex];
         auto slmAlignment = kernelArgInfo.slmAlignment;
 
@@ -1423,11 +1416,11 @@ cl_int Kernel::setArgLocal(uint32_t argIndexIn,
             *patchLocation = slmOffset;
         }
 
-        slmOffset += static_cast<uint32_t>(kernelDeviceInfo.slmSizes[argIndex]);
+        slmOffset += static_cast<uint32_t>(slmSizes[argIndex]);
         ++argIndex;
     }
 
-    kernelDeviceInfo.slmTotalSize = kernelInfo.workloadInfo.slmStaticSize + alignUp(slmOffset, KB);
+    slmTotalSize = kernelInfo.workloadInfo.slmStaticSize + alignUp(slmOffset, KB);
 
     return CL_SUCCESS;
 }
@@ -1460,7 +1453,7 @@ cl_int Kernel::setArgBuffer(uint32_t argIndex,
         patchBufferOffset(kernelArgInfo, nullptr, nullptr, rootDeviceIndex);
         auto graphicsAllocation = buffer->getGraphicsAllocation(rootDeviceIndex);
 
-        auto patchLocation = ptrOffset(getCrossThreadData(rootDeviceIndex),
+        auto patchLocation = ptrOffset(crossThreadData,
                                        kernelArgInfo.kernelArgPatchInfoVector[0].crossthreadOffset);
 
         auto patchSize = kernelArgInfo.kernelArgPatchInfoVector[0].size;
@@ -1469,7 +1462,7 @@ cl_int Kernel::setArgBuffer(uint32_t argIndex,
 
         if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
             PatchInfoData patchInfoData(addressToPatch - buffer->getOffset(), static_cast<uint64_t>(buffer->getOffset()),
-                                        PatchInfoAllocationType::KernelArg, reinterpret_cast<uint64_t>(getCrossThreadData(rootDeviceIndex)),
+                                        PatchInfoAllocationType::KernelArg, reinterpret_cast<uint64_t>(crossThreadData),
                                         static_cast<uint64_t>(kernelArgInfo.kernelArgPatchInfoVector[0].crossthreadOffset),
                                         PatchInfoAllocationType::IndirectObjectHeap, patchSize);
             this->patchInfoDataList.push_back(patchInfoData);
@@ -1512,7 +1505,7 @@ cl_int Kernel::setArgBuffer(uint32_t argIndex,
         storeKernelArg(argIndex, BUFFER_OBJ, nullptr, argVal, argSize);
         const auto &kernelArgInfo = getKernelInfo(rootDeviceIndex).kernelArgInfo[argIndex];
         patchBufferOffset(kernelArgInfo, nullptr, nullptr, rootDeviceIndex);
-        auto patchLocation = ptrOffset(getCrossThreadData(rootDeviceIndex),
+        auto patchLocation = ptrOffset(crossThreadData,
                                        kernelArgInfo.kernelArgPatchInfoVector[0].crossthreadOffset);
 
         patchWithRequiredSize(patchLocation, kernelArgInfo.kernelArgPatchInfoVector[0].size, 0u);
@@ -1558,7 +1551,7 @@ cl_int Kernel::setArgPipe(uint32_t argIndex,
         }
         auto rootDeviceIndex = getDevice().getRootDeviceIndex();
         const auto &kernelArgInfo = getKernelInfo(rootDeviceIndex).kernelArgInfo[argIndex];
-        auto patchLocation = ptrOffset(getCrossThreadData(rootDeviceIndex),
+        auto patchLocation = ptrOffset(crossThreadData,
                                        kernelArgInfo.kernelArgPatchInfoVector[0].crossthreadOffset);
 
         auto patchSize = kernelArgInfo.kernelArgPatchInfoVector[0].size;
@@ -1595,6 +1588,7 @@ cl_int Kernel::setArgImageWithMipLevel(uint32_t argIndex,
     auto &kernelInfo = getKernelInfo(rootDeviceIndex);
     patchBufferOffset(kernelInfo.kernelArgInfo[argIndex], nullptr, nullptr, rootDeviceIndex);
 
+    uint32_t *crossThreadData = reinterpret_cast<uint32_t *>(this->crossThreadData);
     auto clMemObj = *(static_cast<const cl_mem *>(argVal));
     auto pImage = castToObject<Image>(clMemObj);
 
@@ -1619,7 +1613,6 @@ cl_int Kernel::setArgImageWithMipLevel(uint32_t argIndex,
             pImage->setImageArg(surfaceState, kernelArgInfo.isMediaBlockImage, mipLevel, rootDeviceIndex);
         }
 
-        auto crossThreadData = reinterpret_cast<uint32_t *>(getCrossThreadData(rootDeviceIndex));
         auto &imageDesc = pImage->getImageDesc();
         auto &imageFormat = pImage->getImageFormat();
         auto graphicsAllocation = pImage->getGraphicsAllocation(rootDeviceIndex);
@@ -1665,8 +1658,7 @@ cl_int Kernel::setArgImmediate(uint32_t argIndex,
         const auto &kernelArgInfo = getKernelInfo(rootDeviceIndex).kernelArgInfo[argIndex];
         DEBUG_BREAK_IF(kernelArgInfo.kernelArgPatchInfoVector.size() <= 0);
 
-        auto crossThreadData = getCrossThreadData(rootDeviceIndex);
-        auto crossThreadDataEnd = ptrOffset(crossThreadData, getCrossThreadDataSize(rootDeviceIndex));
+        auto crossThreadDataEnd = ptrOffset(crossThreadData, crossThreadDataSize);
 
         for (const auto &kernelArgPatchInfo : kernelArgInfo.kernelArgPatchInfoVector) {
             DEBUG_BREAK_IF(kernelArgPatchInfo.size <= 0);
@@ -1698,6 +1690,7 @@ cl_int Kernel::setArgSampler(uint32_t argIndex,
         return retVal;
     }
 
+    uint32_t *crossThreadData = reinterpret_cast<uint32_t *>(this->crossThreadData);
     auto clSamplerObj = *(static_cast<const cl_sampler *>(argVal));
     auto pSampler = castToObject<Sampler>(clSamplerObj);
     auto rootDeviceIndex = getDevice().getRootDeviceIndex();
@@ -1722,7 +1715,6 @@ cl_int Kernel::setArgSampler(uint32_t argIndex,
 
         pSampler->setArg(const_cast<void *>(samplerState), getProgram()->getDevices()[0]->getHardwareInfo());
 
-        auto crossThreadData = reinterpret_cast<uint32_t *>(getCrossThreadData(rootDeviceIndex));
         patch<uint32_t, unsigned int>(pSampler->getSnapWaValue(), crossThreadData, kernelArgInfo.offsetSamplerSnapWa);
         patch<uint32_t, uint32_t>(GetAddrModeEnum(pSampler->addressingMode), crossThreadData, kernelArgInfo.offsetSamplerAddressingMode);
         patch<uint32_t, uint32_t>(GetNormCoordsEnum(pSampler->normalizedCoordinates), crossThreadData, kernelArgInfo.offsetSamplerNormalizedCoords);
@@ -1759,7 +1751,6 @@ cl_int Kernel::setArgAccelerator(uint32_t argIndex,
         const auto &kernelArgInfo = getKernelInfo(rootDeviceIndex).kernelArgInfo[argIndex];
 
         if (kernelArgInfo.samplerArgumentType == iOpenCL::SAMPLER_OBJECT_VME) {
-            auto crossThreadData = getCrossThreadData(rootDeviceIndex);
 
             const auto pVmeAccelerator = castToObjectOrAbort<VmeAccelerator>(pAccelerator);
             auto pDesc = static_cast<const cl_motion_estimation_desc_intel *>(pVmeAccelerator->getDescriptor());
@@ -1808,7 +1799,7 @@ cl_int Kernel::setArgDevQueue(uint32_t argIndex,
     storeKernelArg(argIndex, DEVICE_QUEUE_OBJ, clDeviceQueue, argVal, argSize);
 
     const auto &kernelArgPatchInfo = kernelInfos[rootDeviceIndex]->kernelArgInfo[argIndex].kernelArgPatchInfoVector[0];
-    auto patchLocation = ptrOffset(reinterpret_cast<uint32_t *>(getCrossThreadData(rootDeviceIndex)),
+    auto patchLocation = ptrOffset(reinterpret_cast<uint32_t *>(crossThreadData),
                                    kernelArgPatchInfo.crossthreadOffset);
 
     patchWithRequiredSize(patchLocation, kernelArgPatchInfo.size,
@@ -2403,10 +2394,10 @@ void Kernel::provideInitializationHints() {
 
     auto pClDevice = &getDevice();
     auto rootDeviceIndex = pClDevice->getRootDeviceIndex();
-    if (kernelDeviceInfos[rootDeviceIndex].privateSurfaceSize) {
+    if (privateSurfaceSize) {
         context->providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL, PRIVATE_MEMORY_USAGE_TOO_HIGH,
                                         kernelInfos[rootDeviceIndex]->kernelDescriptor.kernelMetadata.kernelName.c_str(),
-                                        kernelDeviceInfos[rootDeviceIndex].privateSurfaceSize);
+                                        privateSurfaceSize);
     }
     auto scratchSize = kernelInfos[rootDeviceIndex]->kernelDescriptor.kernelAttributes.perThreadScratchSize[0] *
                        pClDevice->getSharedDeviceInfo().computeUnitsUsedForScratch * getKernelInfo(rootDeviceIndex).getMaxSimdSize();
@@ -2419,8 +2410,8 @@ void Kernel::provideInitializationHints() {
 void Kernel::patchDefaultDeviceQueue(DeviceQueue *devQueue) {
     auto rootDeviceIndex = devQueue->getDevice().getRootDeviceIndex();
     const auto &defaultQueueSurfaceAddress = kernelInfos[rootDeviceIndex]->kernelDescriptor.payloadMappings.implicitArgs.deviceSideEnqueueDefaultQueueSurfaceAddress;
-    if (isValidOffset(defaultQueueSurfaceAddress.stateless) && kernelDeviceInfos[rootDeviceIndex].crossThreadData) {
-        auto patchLocation = ptrOffset(reinterpret_cast<uint32_t *>(getCrossThreadData(rootDeviceIndex)), defaultQueueSurfaceAddress.stateless);
+    if (isValidOffset(defaultQueueSurfaceAddress.stateless) && crossThreadData) {
+        auto patchLocation = ptrOffset(reinterpret_cast<uint32_t *>(crossThreadData), defaultQueueSurfaceAddress.stateless);
         patchWithRequiredSize(patchLocation, defaultQueueSurfaceAddress.pointerSize,
                               static_cast<uintptr_t>(devQueue->getQueueBuffer()->getGpuAddressToPatch()));
     }
@@ -2436,8 +2427,8 @@ void Kernel::patchEventPool(DeviceQueue *devQueue) {
     auto rootDeviceIndex = devQueue->getDevice().getRootDeviceIndex();
     const auto &eventPoolSurfaceAddress = kernelInfos[rootDeviceIndex]->kernelDescriptor.payloadMappings.implicitArgs.deviceSideEnqueueEventPoolSurfaceAddress;
 
-    if (isValidOffset(eventPoolSurfaceAddress.stateless) && kernelDeviceInfos[rootDeviceIndex].crossThreadData) {
-        auto patchLocation = ptrOffset(reinterpret_cast<uint32_t *>(getCrossThreadData(rootDeviceIndex)), eventPoolSurfaceAddress.stateless);
+    if (isValidOffset(eventPoolSurfaceAddress.stateless) && crossThreadData) {
+        auto patchLocation = ptrOffset(reinterpret_cast<uint32_t *>(crossThreadData), eventPoolSurfaceAddress.stateless);
         patchWithRequiredSize(patchLocation, eventPoolSurfaceAddress.pointerSize,
                               static_cast<uintptr_t>(devQueue->getEventPoolBuffer()->getGpuAddressToPatch()));
     }
@@ -2459,7 +2450,7 @@ void Kernel::patchBlocksSimdSize(uint32_t rootDeviceIndex) {
         DEBUG_BREAK_IF(!(idOffset.first < static_cast<uint32_t>(blockManager->getCount())));
 
         const KernelInfo *blockInfo = blockManager->getBlockKernelInfo(idOffset.first);
-        uint32_t *simdSize = reinterpret_cast<uint32_t *>(&kernelDeviceInfos[rootDeviceIndex].crossThreadData[idOffset.second]);
+        uint32_t *simdSize = reinterpret_cast<uint32_t *>(&crossThreadData[idOffset.second]);
         *simdSize = blockInfo->getMaxSimdSize();
     }
 }
@@ -2471,7 +2462,7 @@ bool Kernel::usesSyncBuffer(uint32_t rootDeviceIndex) {
 void Kernel::patchSyncBuffer(Device &device, GraphicsAllocation *gfxAllocation, size_t bufferOffset) {
     auto rootDeviceIndex = device.getRootDeviceIndex();
     const auto &syncBuffer = kernelInfos[rootDeviceIndex]->kernelDescriptor.payloadMappings.implicitArgs.syncBufferAddress;
-    auto bufferPatchAddress = ptrOffset(getCrossThreadData(rootDeviceIndex), syncBuffer.stateless);
+    auto bufferPatchAddress = ptrOffset(crossThreadData, syncBuffer.stateless);
     patchWithRequiredSize(bufferPatchAddress, syncBuffer.pointerSize,
                           ptrOffset(gfxAllocation->getGpuAddressToPatch(), bufferOffset));
 
@@ -2699,7 +2690,7 @@ void Kernel::patchBindlessSurfaceStateOffsets(const Device &device, const size_t
             if ((kernelInfo.kernelArgInfo[i].isBuffer) ||
                 (kernelInfo.kernelArgInfo[i].isImage)) {
 
-                auto patchLocation = ptrOffset(getCrossThreadData(device.getRootDeviceIndex()),
+                auto patchLocation = ptrOffset(crossThreadData,
                                                kernelInfo.kernelArgInfo[i].kernelArgPatchInfoVector[0].crossthreadOffset);
 
                 auto bindlessOffset = static_cast<uint32_t>(sshOffset) + kernelInfo.kernelArgInfo[i].offsetHeap;
@@ -2746,56 +2737,56 @@ const HardwareInfo &Kernel::getHardwareInfo(uint32_t rootDeviceIndex) const {
 const KernelInfo &Kernel::getDefaultKernelInfo() const {
     return *kernelInfos[defaultRootDeviceIndex];
 }
-void Kernel::setGlobalWorkOffsetValues(uint32_t rootDeviceIndex, uint32_t globalWorkOffsetX, uint32_t globalWorkOffsetY, uint32_t globalWorkOffsetZ) {
-    *kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetX = globalWorkOffsetX;
-    *kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetY = globalWorkOffsetY;
-    *kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetZ = globalWorkOffsetZ;
+void Kernel::setGlobalWorkOffsetValues(uint32_t globalWorkOffsetX, uint32_t globalWorkOffsetY, uint32_t globalWorkOffsetZ) {
+    *this->globalWorkOffsetX = globalWorkOffsetX;
+    *this->globalWorkOffsetY = globalWorkOffsetY;
+    *this->globalWorkOffsetZ = globalWorkOffsetZ;
 }
 
-void Kernel::setGlobalWorkSizeValues(uint32_t rootDeviceIndex, uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ) {
-    *kernelDeviceInfos[rootDeviceIndex].globalWorkSizeX = globalWorkSizeX;
-    *kernelDeviceInfos[rootDeviceIndex].globalWorkSizeY = globalWorkSizeY;
-    *kernelDeviceInfos[rootDeviceIndex].globalWorkSizeZ = globalWorkSizeZ;
+void Kernel::setGlobalWorkSizeValues(uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ) {
+    *this->globalWorkSizeX = globalWorkSizeX;
+    *this->globalWorkSizeY = globalWorkSizeY;
+    *this->globalWorkSizeZ = globalWorkSizeZ;
 }
 
-void Kernel::setLocalWorkSizeValues(uint32_t rootDeviceIndex, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ) {
-    *kernelDeviceInfos[rootDeviceIndex].localWorkSizeX = localWorkSizeX;
-    *kernelDeviceInfos[rootDeviceIndex].localWorkSizeY = localWorkSizeY;
-    *kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ = localWorkSizeZ;
+void Kernel::setLocalWorkSizeValues(uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ) {
+    *this->localWorkSizeX = localWorkSizeX;
+    *this->localWorkSizeY = localWorkSizeY;
+    *this->localWorkSizeZ = localWorkSizeZ;
 }
 
-void Kernel::setLocalWorkSize2Values(uint32_t rootDeviceIndex, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ) {
-    *kernelDeviceInfos[rootDeviceIndex].localWorkSizeX2 = localWorkSizeX;
-    *kernelDeviceInfos[rootDeviceIndex].localWorkSizeY2 = localWorkSizeY;
-    *kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ2 = localWorkSizeZ;
+void Kernel::setLocalWorkSize2Values(uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ) {
+    *this->localWorkSizeX2 = localWorkSizeX;
+    *this->localWorkSizeY2 = localWorkSizeY;
+    *this->localWorkSizeZ2 = localWorkSizeZ;
 }
 
-void Kernel::setEnqueuedLocalWorkSizeValues(uint32_t rootDeviceIndex, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ) {
-    *kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeX = localWorkSizeX;
-    *kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeY = localWorkSizeY;
-    *kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeZ = localWorkSizeZ;
+void Kernel::setEnqueuedLocalWorkSizeValues(uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ) {
+    *this->enqueuedLocalWorkSizeX = localWorkSizeX;
+    *this->enqueuedLocalWorkSizeY = localWorkSizeY;
+    *this->enqueuedLocalWorkSizeZ = localWorkSizeZ;
 }
 
-bool Kernel::isLocalWorkSize2Patched(uint32_t rootDeviceIndex) {
-    return kernelDeviceInfos[rootDeviceIndex].localWorkSizeX2 != &dummyPatchLocation;
+bool Kernel::isLocalWorkSize2Patched() {
+    return localWorkSizeX2 != &dummyPatchLocation;
 }
 
-void Kernel::setNumWorkGroupsValues(uint32_t rootDeviceIndex, uint32_t numWorkGroupsX, uint32_t numWorkGroupsY, uint32_t numWorkGroupsZ) {
-    *kernelDeviceInfos[rootDeviceIndex].numWorkGroupsX = numWorkGroupsX;
-    *kernelDeviceInfos[rootDeviceIndex].numWorkGroupsY = numWorkGroupsY;
-    *kernelDeviceInfos[rootDeviceIndex].numWorkGroupsZ = numWorkGroupsZ;
+void Kernel::setNumWorkGroupsValues(uint32_t numWorkGroupsX, uint32_t numWorkGroupsY, uint32_t numWorkGroupsZ) {
+    *this->numWorkGroupsX = numWorkGroupsX;
+    *this->numWorkGroupsY = numWorkGroupsY;
+    *this->numWorkGroupsZ = numWorkGroupsZ;
 }
 
-void Kernel::setWorkDim(uint32_t rootDeviceIndex, uint32_t workDim) {
-    *kernelDeviceInfos[rootDeviceIndex].workDim = workDim;
+void Kernel::setWorkDim(uint32_t workDim) {
+    *this->workDim = workDim;
 }
 
-uint32_t Kernel::getMaxKernelWorkGroupSize(uint32_t rootDeviceIndex) const {
-    return kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize;
+uint32_t Kernel::getMaxKernelWorkGroupSize() const {
+    return maxKernelWorkGroupSize;
 }
 
-uint32_t Kernel::getSlmTotalSize(uint32_t rootDeviceIndex) const {
-    return kernelDeviceInfos[rootDeviceIndex].slmTotalSize;
+uint32_t Kernel::getSlmTotalSize() const {
+    return slmTotalSize;
 }
 
 size_t Kernel::getTotalNumDevicesInContext() const {
diff --git a/opencl/source/kernel/kernel.h b/opencl/source/kernel/kernel.h
index 54c49ed485..92b6c48b5f 100644
--- a/opencl/source/kernel/kernel.h
+++ b/opencl/source/kernel/kernel.h
@@ -126,12 +126,12 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
     void setAuxTranslationRequired(bool onOff) { auxTranslationRequired = onOff; }
     void updateAuxTranslationRequired();
 
-    char *getCrossThreadData(uint32_t rootDeviceIndex) const {
-        return kernelDeviceInfos[rootDeviceIndex].crossThreadData;
+    char *getCrossThreadData() const {
+        return crossThreadData;
     }
 
-    uint32_t getCrossThreadDataSize(uint32_t rootDeviceIndex) const {
-        return kernelDeviceInfos[rootDeviceIndex].crossThreadDataSize;
+    uint32_t getCrossThreadDataSize() const {
+        return crossThreadDataSize;
     }
 
     cl_int initialize();
@@ -172,12 +172,12 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
     size_t getKernelHeapSize(uint32_t rootDeviceIndex) const;
     size_t getSurfaceStateHeapSize(uint32_t rootDeviceIndex) const;
     size_t getDynamicStateHeapSize(uint32_t rootDeviceIndex) const;
-    size_t getNumberOfBindingTableStates(uint32_t rootDeviceIndex) const;
-    size_t getBindingTableOffset(uint32_t rootDeviceIndex) const {
-        return kernelDeviceInfos[rootDeviceIndex].localBindingTableOffset;
+    size_t getNumberOfBindingTableStates() const;
+    size_t getBindingTableOffset() const {
+        return localBindingTableOffset;
     }
 
-    void resizeSurfaceStateHeap(uint32_t rootDeviceIndex, void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset);
+    void resizeSurfaceStateHeap(void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset);
 
     void substituteKernelHeap(const Device &device, void *newKernelHeap, size_t newKernelHeapSize);
     bool isKernelHeapSubstituted(uint32_t rootDeviceIndex) const;
@@ -303,7 +303,7 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
 
     //residency for kernel surfaces
     MOCKABLE_VIRTUAL void makeResident(CommandStreamReceiver &commandStreamReceiver);
-    MOCKABLE_VIRTUAL void getResidency(std::vector<Surface *> &dst, uint32_t rootDeviceIndex);
+    MOCKABLE_VIRTUAL void getResidency(std::vector<Surface *> &dst);
     bool requiresCoherency();
     void resetSharedObjectsPatchAddresses();
     bool isUsingSharedObjArgs() const { return usingSharedObjArgs; }
@@ -392,16 +392,16 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
     }
     const KernelInfo &getDefaultKernelInfo() const;
 
-    void setGlobalWorkOffsetValues(uint32_t rootDeviceIndex, uint32_t globalWorkOffsetX, uint32_t globalWorkOffsetY, uint32_t globalWorkOffsetZ);
-    void setGlobalWorkSizeValues(uint32_t rootDeviceIndex, uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ);
-    void setLocalWorkSizeValues(uint32_t rootDeviceIndex, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ);
-    void setLocalWorkSize2Values(uint32_t rootDeviceIndex, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ);
-    void setEnqueuedLocalWorkSizeValues(uint32_t rootDeviceIndex, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ);
-    bool isLocalWorkSize2Patched(uint32_t rootDeviceIndex);
-    void setNumWorkGroupsValues(uint32_t rootDeviceIndex, uint32_t numWorkGroupsX, uint32_t numWorkGroupsY, uint32_t numWorkGroupsZ);
-    void setWorkDim(uint32_t rootDeviceIndex, uint32_t workDim);
-    uint32_t getMaxKernelWorkGroupSize(uint32_t rootDeviceIndex) const;
-    uint32_t getSlmTotalSize(uint32_t rootDeviceIndex) const;
+    void setGlobalWorkOffsetValues(uint32_t globalWorkOffsetX, uint32_t globalWorkOffsetY, uint32_t globalWorkOffsetZ);
+    void setGlobalWorkSizeValues(uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ);
+    void setLocalWorkSizeValues(uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ);
+    void setLocalWorkSize2Values(uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ);
+    void setEnqueuedLocalWorkSizeValues(uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ);
+    bool isLocalWorkSize2Patched();
+    void setNumWorkGroupsValues(uint32_t numWorkGroupsX, uint32_t numWorkGroupsY, uint32_t numWorkGroupsZ);
+    void setWorkDim(uint32_t workDim);
+    uint32_t getMaxKernelWorkGroupSize() const;
+    uint32_t getSlmTotalSize() const;
     bool getHasIndirectAccess() const {
         return this->kernelHasIndirectAccess;
     }
@@ -546,53 +546,50 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
     bool debugEnabled = false;
     uint32_t additionalKernelExecInfo = AdditionalKernelExecInfo::NotSet;
 
-    struct KernelDeviceInfo : public NonCopyableClass {
-        uint32_t *globalWorkOffsetX = &Kernel::dummyPatchLocation;
-        uint32_t *globalWorkOffsetY = &Kernel::dummyPatchLocation;
-        uint32_t *globalWorkOffsetZ = &Kernel::dummyPatchLocation;
+    uint32_t *globalWorkOffsetX = &Kernel::dummyPatchLocation;
+    uint32_t *globalWorkOffsetY = &Kernel::dummyPatchLocation;
+    uint32_t *globalWorkOffsetZ = &Kernel::dummyPatchLocation;
 
-        uint32_t *localWorkSizeX = &Kernel::dummyPatchLocation;
-        uint32_t *localWorkSizeY = &Kernel::dummyPatchLocation;
-        uint32_t *localWorkSizeZ = &Kernel::dummyPatchLocation;
+    uint32_t *localWorkSizeX = &Kernel::dummyPatchLocation;
+    uint32_t *localWorkSizeY = &Kernel::dummyPatchLocation;
+    uint32_t *localWorkSizeZ = &Kernel::dummyPatchLocation;
 
-        uint32_t *localWorkSizeX2 = &Kernel::dummyPatchLocation;
-        uint32_t *localWorkSizeY2 = &Kernel::dummyPatchLocation;
-        uint32_t *localWorkSizeZ2 = &Kernel::dummyPatchLocation;
+    uint32_t *localWorkSizeX2 = &Kernel::dummyPatchLocation;
+    uint32_t *localWorkSizeY2 = &Kernel::dummyPatchLocation;
+    uint32_t *localWorkSizeZ2 = &Kernel::dummyPatchLocation;
 
-        uint32_t *globalWorkSizeX = &Kernel::dummyPatchLocation;
-        uint32_t *globalWorkSizeY = &Kernel::dummyPatchLocation;
-        uint32_t *globalWorkSizeZ = &Kernel::dummyPatchLocation;
+    uint32_t *globalWorkSizeX = &Kernel::dummyPatchLocation;
+    uint32_t *globalWorkSizeY = &Kernel::dummyPatchLocation;
+    uint32_t *globalWorkSizeZ = &Kernel::dummyPatchLocation;
 
-        uint32_t *enqueuedLocalWorkSizeX = &Kernel::dummyPatchLocation;
-        uint32_t *enqueuedLocalWorkSizeY = &Kernel::dummyPatchLocation;
-        uint32_t *enqueuedLocalWorkSizeZ = &Kernel::dummyPatchLocation;
+    uint32_t *enqueuedLocalWorkSizeX = &Kernel::dummyPatchLocation;
+    uint32_t *enqueuedLocalWorkSizeY = &Kernel::dummyPatchLocation;
+    uint32_t *enqueuedLocalWorkSizeZ = &Kernel::dummyPatchLocation;
 
-        uint32_t *numWorkGroupsX = &Kernel::dummyPatchLocation;
-        uint32_t *numWorkGroupsY = &Kernel::dummyPatchLocation;
-        uint32_t *numWorkGroupsZ = &Kernel::dummyPatchLocation;
+    uint32_t *numWorkGroupsX = &Kernel::dummyPatchLocation;
+    uint32_t *numWorkGroupsY = &Kernel::dummyPatchLocation;
+    uint32_t *numWorkGroupsZ = &Kernel::dummyPatchLocation;
 
-        uint32_t *maxWorkGroupSizeForCrossThreadData = &Kernel::dummyPatchLocation;
-        uint32_t maxKernelWorkGroupSize = 0;
-        uint32_t *workDim = &Kernel::dummyPatchLocation;
-        uint32_t *dataParameterSimdSize = &Kernel::dummyPatchLocation;
-        uint32_t *parentEventOffset = &Kernel::dummyPatchLocation;
-        uint32_t *preferredWkgMultipleOffset = &Kernel::dummyPatchLocation;
+    uint32_t *maxWorkGroupSizeForCrossThreadData = &Kernel::dummyPatchLocation;
+    uint32_t maxKernelWorkGroupSize = 0;
+    uint32_t *workDim = &Kernel::dummyPatchLocation;
+    uint32_t *dataParameterSimdSize = &Kernel::dummyPatchLocation;
+    uint32_t *parentEventOffset = &Kernel::dummyPatchLocation;
+    uint32_t *preferredWkgMultipleOffset = &Kernel::dummyPatchLocation;
 
-        size_t numberOfBindingTableStates = 0u;
-        size_t localBindingTableOffset = 0u;
+    size_t numberOfBindingTableStates = 0u;
+    size_t localBindingTableOffset = 0u;
 
-        std::vector<size_t> slmSizes;
-        uint32_t slmTotalSize = 0u;
+    std::vector<size_t> slmSizes;
+    uint32_t slmTotalSize = 0u;
 
-        std::unique_ptr<char[]> pSshLocal;
-        uint32_t sshLocalSize = 0u;
-        char *crossThreadData = nullptr;
-        uint32_t crossThreadDataSize = 0u;
+    std::unique_ptr<char[]> pSshLocal;
+    uint32_t sshLocalSize = 0u;
+    char *crossThreadData = nullptr;
+    uint32_t crossThreadDataSize = 0u;
 
-        GraphicsAllocation *privateSurface = nullptr;
-        uint64_t privateSurfaceSize = 0u;
-    };
-    std::vector<KernelDeviceInfo> kernelDeviceInfos;
+    GraphicsAllocation *privateSurface = nullptr;
+    uint64_t privateSurfaceSize = 0u;
     const uint32_t defaultRootDeviceIndex;
 
     struct KernelConfig {
diff --git a/opencl/source/program/kernel_info.cpp b/opencl/source/program/kernel_info.cpp
index fc1a7e32bb..8708996a87 100644
--- a/opencl/source/program/kernel_info.cpp
+++ b/opencl/source/program/kernel_info.cpp
@@ -135,10 +135,10 @@ WorkSizeInfo::WorkSizeInfo(const DispatchInfo &dispatchInfo) {
     auto &device = dispatchInfo.getClDevice();
     auto rootDeviceIndex = device.getRootDeviceIndex();
     const auto &kernelInfo = dispatchInfo.getKernel()->getKernelInfo(rootDeviceIndex);
-    this->maxWorkGroupSize = dispatchInfo.getKernel()->getMaxKernelWorkGroupSize(rootDeviceIndex);
+    this->maxWorkGroupSize = dispatchInfo.getKernel()->getMaxKernelWorkGroupSize();
     this->hasBarriers = kernelInfo.kernelDescriptor.kernelAttributes.usesBarriers();
     this->simdSize = static_cast<uint32_t>(kernelInfo.getMaxSimdSize());
-    this->slmTotalSize = static_cast<uint32_t>(dispatchInfo.getKernel()->getSlmTotalSize(rootDeviceIndex));
+    this->slmTotalSize = static_cast<uint32_t>(dispatchInfo.getKernel()->getSlmTotalSize());
     this->coreFamily = device.getHardwareInfo().platform.eRenderCoreFamily;
     this->numThreadsPerSubSlice = static_cast<uint32_t>(device.getSharedDeviceInfo().maxNumEUsPerSubSlice) *
                                   device.getSharedDeviceInfo().numThreadsPerEU;
diff --git a/opencl/source/program/printf_handler.cpp b/opencl/source/program/printf_handler.cpp
index a21c297ec9..ce3d45b0f8 100644
--- a/opencl/source/program/printf_handler.cpp
+++ b/opencl/source/program/printf_handler.cpp
@@ -59,7 +59,7 @@ void PrintfHandler::prepareDispatch(const MultiDispatchInfo &multiDispatchInfo)
                                                      sizeof(printfSurfaceInitialDataSize));
 
     const auto &printfSurfaceArg = kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.payloadMappings.implicitArgs.printfSurfaceAddress;
-    auto printfPatchAddress = ptrOffset(reinterpret_cast<uintptr_t *>(kernel->getCrossThreadData(rootDeviceIndex)), printfSurfaceArg.stateless);
+    auto printfPatchAddress = ptrOffset(reinterpret_cast<uintptr_t *>(kernel->getCrossThreadData()), printfSurfaceArg.stateless);
     patchWithRequiredSize(printfPatchAddress, printfSurfaceArg.pointerSize, (uintptr_t)printfSurface->getGpuAddressToPatch());
     if (isValidOffset(printfSurfaceArg.bindful)) {
         auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(kernel->getSurfaceStateHeap(rootDeviceIndex)), printfSurfaceArg.bindful);
diff --git a/opencl/source/utilities/logger.cpp b/opencl/source/utilities/logger.cpp
index 97f8c88c46..a4469e8c21 100644
--- a/opencl/source/utilities/logger.cpp
+++ b/opencl/source/utilities/logger.cpp
@@ -206,9 +206,8 @@ void FileLogger<DebugLevel>::dumpKernelArgs(const Kernel *kernel) {
                 }
             } else {
                 type = "immediate";
-                auto rootDeviceIndex = kernel->getDevices()[0]->getRootDeviceIndex();
-                auto crossThreadData = kernel->getCrossThreadData(rootDeviceIndex);
-                auto crossThreadDataSize = kernel->getCrossThreadDataSize(rootDeviceIndex);
+                auto crossThreadData = kernel->getCrossThreadData();
+                auto crossThreadDataSize = kernel->getCrossThreadDataSize();
                 argVal = std::unique_ptr<char[]>(new char[crossThreadDataSize]);
 
                 size_t totalArgSize = 0;
diff --git a/opencl/test/unit_test/accelerators/media_image_arg_tests.cpp b/opencl/test/unit_test/accelerators/media_image_arg_tests.cpp
index a947ccd6a5..e6d9f5c1a3 100644
--- a/opencl/test/unit_test/accelerators/media_image_arg_tests.cpp
+++ b/opencl/test/unit_test/accelerators/media_image_arg_tests.cpp
@@ -98,7 +98,7 @@ HWTEST_F(MediaImageSetArgTest, WhenSettingMediaImageArgThenArgsSetCorrectly) {
               pSurfaceState->getSurfaceBaseAddress());
 
     std::vector<Surface *> surfaces;
-    pKernel->getResidency(surfaces, rootDeviceIndex);
+    pKernel->getResidency(surfaces);
     EXPECT_EQ(0u, surfaces.size());
 }
 
@@ -136,7 +136,7 @@ HWTEST_F(MediaImageSetArgTest, WhenSettingKernelArgImageThenArgsSetCorrectly) {
     EXPECT_EQ(MEDIA_SURFACE_STATE::PICTURE_STRUCTURE_FRAME_PICTURE, pSurfaceState->getPictureStructure());
 
     std::vector<Surface *> surfaces;
-    pKernel->getResidency(surfaces, rootDeviceIndex);
+    pKernel->getResidency(surfaces);
 
     for (auto &surface : surfaces) {
         delete surface;
diff --git a/opencl/test/unit_test/api/cl_get_kernel_sub_group_info_tests.inl b/opencl/test/unit_test/api/cl_get_kernel_sub_group_info_tests.inl
index f0ea6b20ca..897a633794 100644
--- a/opencl/test/unit_test/api/cl_get_kernel_sub_group_info_tests.inl
+++ b/opencl/test/unit_test/api/cl_get_kernel_sub_group_info_tests.inl
@@ -15,12 +15,12 @@ struct KernelSubGroupInfoFixture : HelloWorldFixture<HelloWorldFixtureFactory> {
 
     void SetUp() override {
         ParentClass::SetUp();
-        pKernel->kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize = static_cast<uint32_t>(pDevice->getDeviceInfo().maxWorkGroupSize / 2);
+        pKernel->maxKernelWorkGroupSize = static_cast<uint32_t>(pDevice->getDeviceInfo().maxWorkGroupSize / 2);
         maxSimdSize = static_cast<size_t>(pKernel->getKernelInfo(rootDeviceIndex).getMaxSimdSize());
         ASSERT_LE(8u, maxSimdSize);
         maxWorkDim = static_cast<size_t>(pClDevice->getDeviceInfo().maxWorkItemDimensions);
         ASSERT_EQ(3u, maxWorkDim);
-        maxWorkGroupSize = static_cast<size_t>(pKernel->kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize);
+        maxWorkGroupSize = static_cast<size_t>(pKernel->maxKernelWorkGroupSize);
         ASSERT_GE(1024u, maxWorkGroupSize);
         largestCompiledSIMDSize = static_cast<size_t>(pKernel->getKernelInfo(rootDeviceIndex).getMaxSimdSize());
         ASSERT_EQ(32u, largestCompiledSIMDSize);
@@ -30,8 +30,8 @@ struct KernelSubGroupInfoFixture : HelloWorldFixture<HelloWorldFixtureFactory> {
         auto requiredWorkGroupSizeZ = static_cast<size_t>(pKernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelAttributes.requiredWorkgroupSize[2]);
 
         calculatedMaxWorkgroupSize = requiredWorkGroupSizeX * requiredWorkGroupSizeY * requiredWorkGroupSizeZ;
-        if ((calculatedMaxWorkgroupSize == 0) || (calculatedMaxWorkgroupSize > static_cast<size_t>(pKernel->kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize))) {
-            calculatedMaxWorkgroupSize = static_cast<size_t>(pKernel->kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize);
+        if ((calculatedMaxWorkgroupSize == 0) || (calculatedMaxWorkgroupSize > static_cast<size_t>(pKernel->maxKernelWorkGroupSize))) {
+            calculatedMaxWorkgroupSize = static_cast<size_t>(pKernel->maxKernelWorkGroupSize);
         }
     }
 
diff --git a/opencl/test/unit_test/api/cl_unified_shared_memory_tests.inl b/opencl/test/unit_test/api/cl_unified_shared_memory_tests.inl
index 96fdd33d6e..852d02eb58 100644
--- a/opencl/test/unit_test/api/cl_unified_shared_memory_tests.inl
+++ b/opencl/test/unit_test/api/cl_unified_shared_memory_tests.inl
@@ -656,7 +656,7 @@ TEST(clUnifiedSharedMemoryTests, whenDeviceSupportSharedMemoryAllocationsAndSyst
     EXPECT_EQ(retVal, CL_SUCCESS);
 
     //check if cross thread is updated
-    auto crossThreadLocation = reinterpret_cast<uintptr_t *>(ptrOffset(mockKernel.mockKernel->getCrossThreadData(device->getRootDeviceIndex()), mockKernel.kernelInfo.kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset));
+    auto crossThreadLocation = reinterpret_cast<uintptr_t *>(ptrOffset(mockKernel.mockKernel->getCrossThreadData(), mockKernel.kernelInfo.kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset));
     auto systemAddress = reinterpret_cast<uintptr_t>(systemPointer);
 
     EXPECT_EQ(*crossThreadLocation, systemAddress);
diff --git a/opencl/test/unit_test/built_ins/built_in_tests.cpp b/opencl/test/unit_test/built_ins/built_in_tests.cpp
index a7ca790981..f0af4a4d4b 100644
--- a/opencl/test/unit_test/built_ins/built_in_tests.cpp
+++ b/opencl/test/unit_test/built_ins/built_in_tests.cpp
@@ -919,7 +919,7 @@ TEST_F(BuiltInTests, GivenUnalignedCopyBufferToBufferWhenDispatchInfoIsCreatedTh
 
     EXPECT_EQ(kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName, "CopyBufferToBufferMiddleMisaligned");
 
-    const auto crossThreadData = kernel->getCrossThreadData(rootDeviceIndex);
+    const auto crossThreadData = kernel->getCrossThreadData();
     const auto crossThreadOffset = kernel->getKernelInfo(rootDeviceIndex).kernelArgInfo[4].kernelArgPatchInfoVector[0].crossthreadOffset;
     EXPECT_EQ(8u, *reinterpret_cast<uint32_t *>(ptrOffset(crossThreadData, crossThreadOffset)));
 
@@ -1147,7 +1147,7 @@ TEST_F(VmeBuiltInTests, GivenVmeBuilderWhenGettingDispatchInfoThenParamsAreCorre
         auto &argInfo = outDi->getKernel()->getKernelInfo(rootDeviceIndex).kernelArgInfo[vmeImplicitArgsBase + i];
         ASSERT_EQ(1U, argInfo.kernelArgPatchInfoVector.size());
         auto off = argInfo.kernelArgPatchInfoVector[0].crossthreadOffset;
-        EXPECT_EQ(vmeExtraArgsExpectedVals[i], *((uint32_t *)(outDi->getKernel()->getCrossThreadData(rootDeviceIndex) + off)));
+        EXPECT_EQ(vmeExtraArgsExpectedVals[i], *((uint32_t *)(outDi->getKernel()->getCrossThreadData() + off)));
     }
 }
 
@@ -1209,7 +1209,7 @@ TEST_F(VmeBuiltInTests, GivenAdvancedVmeBuilderWhenGettingDispatchInfoThenParams
             auto &argInfo = outDi->getKernel()->getKernelInfo(rootDeviceIndex).kernelArgInfo[vmeImplicitArgsBase + i];
             ASSERT_EQ(1U, argInfo.kernelArgPatchInfoVector.size());
             auto off = argInfo.kernelArgPatchInfoVector[0].crossthreadOffset;
-            EXPECT_EQ(vmeExtraArgsExpectedVals[i], *((uint32_t *)(outDi->getKernel()->getCrossThreadData(rootDeviceIndex) + off)));
+            EXPECT_EQ(vmeExtraArgsExpectedVals[i], *((uint32_t *)(outDi->getKernel()->getCrossThreadData() + off)));
         }
     }
 }
diff --git a/opencl/test/unit_test/command_queue/command_queue_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_tests.cpp
index e291fbca91..d398f817ed 100644
--- a/opencl/test/unit_test/command_queue/command_queue_tests.cpp
+++ b/opencl/test/unit_test/command_queue/command_queue_tests.cpp
@@ -1043,7 +1043,7 @@ HWTEST_F(CommandQueueCommandStreamTest, givenDebugKernelWhenSetupDebugSurfaceIsC
 
     const auto &systemThreadSurfaceAddress = kernel->getAllocatedKernelInfo()->kernelDescriptor.payloadMappings.implicitArgs.systemThreadSurfaceAddress.bindful;
     kernel->getAllocatedKernelInfo()->usesSsh = true;
-    kernel->setSshLocal(nullptr, sizeof(RENDER_SURFACE_STATE) + systemThreadSurfaceAddress, rootDeviceIndex);
+    kernel->setSshLocal(nullptr, sizeof(RENDER_SURFACE_STATE) + systemThreadSurfaceAddress);
     auto &commandStreamReceiver = cmdQ.getGpgpuCommandStreamReceiver();
 
     cmdQ.getGpgpuCommandStreamReceiver().allocateDebugSurface(SipKernel::maxDbgSurfaceSize);
@@ -1064,7 +1064,7 @@ HWTEST_F(CommandQueueCommandStreamTest, givenCsrWithDebugSurfaceAllocatedWhenSet
 
     const auto &systemThreadSurfaceAddress = kernel->getAllocatedKernelInfo()->kernelDescriptor.payloadMappings.implicitArgs.systemThreadSurfaceAddress.bindful;
     kernel->getAllocatedKernelInfo()->usesSsh = true;
-    kernel->setSshLocal(nullptr, sizeof(RENDER_SURFACE_STATE) + systemThreadSurfaceAddress, rootDeviceIndex);
+    kernel->setSshLocal(nullptr, sizeof(RENDER_SURFACE_STATE) + systemThreadSurfaceAddress);
     auto &commandStreamReceiver = cmdQ.getGpgpuCommandStreamReceiver();
     commandStreamReceiver.allocateDebugSurface(SipKernel::maxDbgSurfaceSize);
     auto debugSurface = commandStreamReceiver.getDebugSurfaceAllocation();
diff --git a/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp b/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp
index c1d5d9c064..0b79c73bc1 100644
--- a/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp
+++ b/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp
@@ -273,7 +273,7 @@ HWTEST_F(DispatchWalkerTest, GivenDefaultLwsAlgorithmWhenDispatchingWalkerThenDi
             nullptr,
             CL_COMMAND_NDRANGE_KERNEL);
 
-        EXPECT_EQ(dimension, *kernel.kernelDeviceInfos[rootDeviceIndex].workDim);
+        EXPECT_EQ(dimension, *kernel.workDim);
     }
 }
 
@@ -304,7 +304,7 @@ HWTEST_F(DispatchWalkerTest, GivenSquaredLwsAlgorithmWhenDispatchingWalkerThenDi
             nullptr,
             nullptr,
             CL_COMMAND_NDRANGE_KERNEL);
-        EXPECT_EQ(dimension, *kernel.kernelDeviceInfos[rootDeviceIndex].workDim);
+        EXPECT_EQ(dimension, *kernel.workDim);
     }
 }
 
@@ -334,7 +334,7 @@ HWTEST_F(DispatchWalkerTest, GivenNdLwsAlgorithmWhenDispatchingWalkerThenDimensi
             nullptr,
             nullptr,
             CL_COMMAND_NDRANGE_KERNEL);
-        EXPECT_EQ(dimension, *kernel.kernelDeviceInfos[rootDeviceIndex].workDim);
+        EXPECT_EQ(dimension, *kernel.workDim);
     }
 }
 
@@ -365,7 +365,7 @@ HWTEST_F(DispatchWalkerTest, GivenOldLwsAlgorithmWhenDispatchingWalkerThenDimens
             nullptr,
             nullptr,
             CL_COMMAND_NDRANGE_KERNEL);
-        EXPECT_EQ(dimension, *kernel.kernelDeviceInfos[rootDeviceIndex].workDim);
+        EXPECT_EQ(dimension, *kernel.workDim);
     }
 }
 
@@ -397,9 +397,9 @@ HWTEST_F(DispatchWalkerTest, GivenNumWorkGroupsWhenDispatchingWalkerThenNumWorkG
         nullptr,
         CL_COMMAND_NDRANGE_KERNEL);
 
-    EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsX);
-    EXPECT_EQ(5u, *kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsY);
-    EXPECT_EQ(10u, *kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsZ);
+    EXPECT_EQ(2u, *kernel.numWorkGroupsX);
+    EXPECT_EQ(5u, *kernel.numWorkGroupsY);
+    EXPECT_EQ(10u, *kernel.numWorkGroupsZ);
 }
 
 HWTEST_F(DispatchWalkerTest, GivenGlobalWorkOffsetWhenDispatchingWalkerThenGlobalWorkOffsetIsCorrectlySet) {
@@ -430,9 +430,9 @@ HWTEST_F(DispatchWalkerTest, GivenGlobalWorkOffsetWhenDispatchingWalkerThenGloba
         nullptr,
         CL_COMMAND_NDRANGE_KERNEL);
 
-    EXPECT_EQ(1u, *kernel.kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetX);
-    EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetY);
-    EXPECT_EQ(3u, *kernel.kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetZ);
+    EXPECT_EQ(1u, *kernel.globalWorkOffsetX);
+    EXPECT_EQ(2u, *kernel.globalWorkOffsetY);
+    EXPECT_EQ(3u, *kernel.globalWorkOffsetZ);
 }
 
 HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndDefaultAlgorithmWhenDispatchingWalkerThenLwsIsCorrect) {
@@ -462,9 +462,9 @@ HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndDefaultAlgorithmWhenDispatch
         nullptr,
         nullptr,
         CL_COMMAND_NDRANGE_KERNEL);
-    EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
-    EXPECT_EQ(5u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
-    EXPECT_EQ(1u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
+    EXPECT_EQ(2u, *kernel.localWorkSizeX);
+    EXPECT_EQ(5u, *kernel.localWorkSizeY);
+    EXPECT_EQ(1u, *kernel.localWorkSizeZ);
 }
 
 HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndNdOnWhenDispatchingWalkerThenLwsIsCorrect) {
@@ -494,9 +494,9 @@ HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndNdOnWhenDispatchingWalkerThe
         nullptr,
         nullptr,
         CL_COMMAND_NDRANGE_KERNEL);
-    EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
-    EXPECT_EQ(3u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
-    EXPECT_EQ(5u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
+    EXPECT_EQ(2u, *kernel.localWorkSizeX);
+    EXPECT_EQ(3u, *kernel.localWorkSizeY);
+    EXPECT_EQ(5u, *kernel.localWorkSizeZ);
 }
 
 HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndSquaredAlgorithmWhenDispatchingWalkerThenLwsIsCorrect) {
@@ -527,9 +527,9 @@ HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndSquaredAlgorithmWhenDispatch
         nullptr,
         nullptr,
         CL_COMMAND_NDRANGE_KERNEL);
-    EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
-    EXPECT_EQ(5u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
-    EXPECT_EQ(1u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
+    EXPECT_EQ(2u, *kernel.localWorkSizeX);
+    EXPECT_EQ(5u, *kernel.localWorkSizeY);
+    EXPECT_EQ(1u, *kernel.localWorkSizeZ);
 }
 
 HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndSquaredAlgorithmOffAndNdOffWhenDispatchingWalkerThenLwsIsCorrect) {
@@ -560,9 +560,9 @@ HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndSquaredAlgorithmOffAndNdOffW
         nullptr,
         nullptr,
         CL_COMMAND_NDRANGE_KERNEL);
-    EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
-    EXPECT_EQ(5u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
-    EXPECT_EQ(1u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
+    EXPECT_EQ(2u, *kernel.localWorkSizeX);
+    EXPECT_EQ(5u, *kernel.localWorkSizeY);
+    EXPECT_EQ(1u, *kernel.localWorkSizeZ);
 }
 
 HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeWhenDispatchingWalkerThenLwsIsCorrect) {
@@ -591,9 +591,9 @@ HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeWhenDispatchingWalkerThenLwsIsC
         nullptr,
         nullptr,
         CL_COMMAND_NDRANGE_KERNEL);
-    EXPECT_EQ(1u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
-    EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
-    EXPECT_EQ(3u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
+    EXPECT_EQ(1u, *kernel.localWorkSizeX);
+    EXPECT_EQ(2u, *kernel.localWorkSizeY);
+    EXPECT_EQ(3u, *kernel.localWorkSizeZ);
 }
 
 HWTEST_F(DispatchWalkerTest, GivenTwoSetsOfLwsOffsetsWhenDispatchingWalkerThenLwsIsCorrect) {
@@ -625,12 +625,12 @@ HWTEST_F(DispatchWalkerTest, GivenTwoSetsOfLwsOffsetsWhenDispatchingWalkerThenLw
         nullptr,
         nullptr,
         CL_COMMAND_NDRANGE_KERNEL);
-    EXPECT_EQ(1u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
-    EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
-    EXPECT_EQ(3u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
-    EXPECT_EQ(1u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX2);
-    EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY2);
-    EXPECT_EQ(3u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ2);
+    EXPECT_EQ(1u, *kernel.localWorkSizeX);
+    EXPECT_EQ(2u, *kernel.localWorkSizeY);
+    EXPECT_EQ(3u, *kernel.localWorkSizeZ);
+    EXPECT_EQ(1u, *kernel.localWorkSizeX2);
+    EXPECT_EQ(2u, *kernel.localWorkSizeY2);
+    EXPECT_EQ(3u, *kernel.localWorkSizeZ2);
 }
 
 HWTEST_F(DispatchWalkerTest, GivenSplitKernelWhenDispatchingWalkerThenLwsIsCorrect) {
@@ -670,14 +670,14 @@ HWTEST_F(DispatchWalkerTest, GivenSplitKernelWhenDispatchingWalkerThenLwsIsCorre
     for (auto &dispatchInfo : multiDispatchInfo) {
         auto &kernel = static_cast<MockKernel &>(*dispatchInfo.getKernel());
         if (dispatchId == 0) {
-            EXPECT_EQ(1u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
-            EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
-            EXPECT_EQ(3u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
+            EXPECT_EQ(1u, *kernel.localWorkSizeX);
+            EXPECT_EQ(2u, *kernel.localWorkSizeY);
+            EXPECT_EQ(3u, *kernel.localWorkSizeZ);
         }
         if (dispatchId == 1) {
-            EXPECT_EQ(4u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
-            EXPECT_EQ(5u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
-            EXPECT_EQ(6u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
+            EXPECT_EQ(4u, *kernel.localWorkSizeX);
+            EXPECT_EQ(5u, *kernel.localWorkSizeY);
+            EXPECT_EQ(6u, *kernel.localWorkSizeZ);
         }
         dispatchId++;
     }
@@ -723,25 +723,25 @@ HWTEST_F(DispatchWalkerTest, GivenSplitWalkerWhenDispatchingWalkerThenLwsIsCorre
     for (auto &dispatchInfo : multiDispatchInfo) {
         auto &kernel = static_cast<MockKernel &>(*dispatchInfo.getKernel());
         if (&kernel == &mainKernel) {
-            EXPECT_EQ(4u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
-            EXPECT_EQ(5u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
-            EXPECT_EQ(6u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
-            EXPECT_EQ(4u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX2);
-            EXPECT_EQ(5u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY2);
-            EXPECT_EQ(6u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ2);
-            EXPECT_EQ(3u, *kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsX);
-            EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsY);
-            EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsZ);
+            EXPECT_EQ(4u, *kernel.localWorkSizeX);
+            EXPECT_EQ(5u, *kernel.localWorkSizeY);
+            EXPECT_EQ(6u, *kernel.localWorkSizeZ);
+            EXPECT_EQ(4u, *kernel.localWorkSizeX2);
+            EXPECT_EQ(5u, *kernel.localWorkSizeY2);
+            EXPECT_EQ(6u, *kernel.localWorkSizeZ2);
+            EXPECT_EQ(3u, *kernel.numWorkGroupsX);
+            EXPECT_EQ(2u, *kernel.numWorkGroupsY);
+            EXPECT_EQ(2u, *kernel.numWorkGroupsZ);
         } else {
-            EXPECT_EQ(0u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
-            EXPECT_EQ(0u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
-            EXPECT_EQ(0u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
-            EXPECT_EQ(1u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX2);
-            EXPECT_EQ(2u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY2);
-            EXPECT_EQ(3u, *kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ2);
-            EXPECT_EQ(0u, *kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsX);
-            EXPECT_EQ(0u, *kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsY);
-            EXPECT_EQ(0u, *kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsZ);
+            EXPECT_EQ(0u, *kernel.localWorkSizeX);
+            EXPECT_EQ(0u, *kernel.localWorkSizeY);
+            EXPECT_EQ(0u, *kernel.localWorkSizeZ);
+            EXPECT_EQ(1u, *kernel.localWorkSizeX2);
+            EXPECT_EQ(2u, *kernel.localWorkSizeY2);
+            EXPECT_EQ(3u, *kernel.localWorkSizeZ2);
+            EXPECT_EQ(0u, *kernel.numWorkGroupsX);
+            EXPECT_EQ(0u, *kernel.numWorkGroupsY);
+            EXPECT_EQ(0u, *kernel.numWorkGroupsZ);
         }
     }
 }
@@ -939,7 +939,7 @@ HWTEST_F(DispatchWalkerTest, GivenMultipleKernelsWhenDispatchingWalkerThenWorkDi
 
     for (auto &dispatchInfo : multiDispatchInfo) {
         auto &kernel = static_cast<MockKernel &>(*dispatchInfo.getKernel());
-        EXPECT_EQ(*kernel.kernelDeviceInfos[rootDeviceIndex].workDim, dispatchInfo.getDim());
+        EXPECT_EQ(*kernel.workDim, dispatchInfo.getDim());
     }
 }
 
diff --git a/opencl/test/unit_test/command_queue/enqueue_debug_kernel_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_debug_kernel_tests.cpp
index 5decccc349..598086d171 100644
--- a/opencl/test/unit_test/command_queue/enqueue_debug_kernel_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_debug_kernel_tests.cpp
@@ -101,7 +101,7 @@ HWTEST_F(EnqueueDebugKernelTest, givenDebugKernelWhenEnqueuedThenSSHAndBtiAreCor
 
         mockCmdQ->enqueueKernel(debugKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr);
 
-        auto *dstBtiTableBase = reinterpret_cast<BINDING_TABLE_STATE *>(ptrOffset(surfaceStates, debugKernel->getBindingTableOffset(rootDeviceIndex)));
+        auto *dstBtiTableBase = reinterpret_cast<BINDING_TABLE_STATE *>(ptrOffset(surfaceStates, debugKernel->getBindingTableOffset()));
         uint32_t surfaceStateOffset = dstBtiTableBase[0].getSurfaceStatePointer();
 
         auto debugSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh.getCpuBase(), surfaceStateOffset));
diff --git a/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp
index c3e3baf6d7..e73edaf3c1 100644
--- a/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp
@@ -1290,9 +1290,9 @@ TEST_F(EnqueueKernelTest, givenKernelWhenAllArgsAreNotAndEventExistSetThenClEnqu
 TEST_F(EnqueueKernelTest, givenEnqueueCommandThatLwsExceedsDeviceCapabilitiesWhenEnqueueNDRangeKernelIsCalledThenErrorIsReturned) {
     MockKernelWithInternals mockKernel(*pClDevice);
 
-    mockKernel.mockKernel->kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize = static_cast<uint32_t>(pDevice->getDeviceInfo().maxWorkGroupSize / 2);
+    mockKernel.mockKernel->maxKernelWorkGroupSize = static_cast<uint32_t>(pDevice->getDeviceInfo().maxWorkGroupSize / 2);
 
-    auto maxKernelWorkgroupSize = mockKernel.mockKernel->kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize;
+    auto maxKernelWorkgroupSize = mockKernel.mockKernel->maxKernelWorkGroupSize;
     size_t globalWorkSize[3] = {maxKernelWorkgroupSize + 1, 1, 1};
     size_t localWorkSize[3] = {maxKernelWorkgroupSize + 1, 1, 1};
 
diff --git a/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp
index 82a3f87432..919a400f10 100644
--- a/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp
@@ -667,7 +667,7 @@ HWTEST_P(EnqueueKernelPrintfTest, GivenKernelWithPrintfBlockedByEventWhenEventUn
         patchData.DataParamOffset = 0;
         populateKernelDescriptor(mockKernel.kernelInfo.kernelDescriptor, patchData);
 
-        auto crossThreadData = reinterpret_cast<uint64_t *>(mockKernel.mockKernel->getCrossThreadData(rootDeviceIndex));
+        auto crossThreadData = reinterpret_cast<uint64_t *>(mockKernel.mockKernel->getCrossThreadData());
 
         std::string testString = "test";
 
diff --git a/opencl/test/unit_test/command_queue/enqueue_kernel_local_work_size_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_kernel_local_work_size_tests.cpp
index d8c6700115..3afabe82bb 100644
--- a/opencl/test/unit_test/command_queue/enqueue_kernel_local_work_size_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_kernel_local_work_size_tests.cpp
@@ -64,13 +64,13 @@ TEST_F(EnqueueKernelRequiredWorkSize, GivenUnspecifiedWorkGroupSizeWhenEnqeueing
 
     EXPECT_EQ(CL_SUCCESS, retVal);
 
-    EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeX, 8u);
-    EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeY, 2u);
-    EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ, 2u);
+    EXPECT_EQ(*pKernel->localWorkSizeX, 8u);
+    EXPECT_EQ(*pKernel->localWorkSizeY, 2u);
+    EXPECT_EQ(*pKernel->localWorkSizeZ, 2u);
 
-    EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeX, 8u);
-    EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeY, 2u);
-    EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeZ, 2u);
+    EXPECT_EQ(*pKernel->enqueuedLocalWorkSizeX, 8u);
+    EXPECT_EQ(*pKernel->enqueuedLocalWorkSizeY, 2u);
+    EXPECT_EQ(*pKernel->enqueuedLocalWorkSizeZ, 2u);
 }
 
 // Fully specified
@@ -91,13 +91,13 @@ TEST_F(EnqueueKernelRequiredWorkSize, GivenRequiredWorkGroupSizeWhenEnqeueingKer
 
     EXPECT_EQ(CL_SUCCESS, retVal);
 
-    EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeX, 8u);
-    EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeY, 2u);
-    EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeZ, 2u);
+    EXPECT_EQ(*pKernel->enqueuedLocalWorkSizeX, 8u);
+    EXPECT_EQ(*pKernel->enqueuedLocalWorkSizeY, 2u);
+    EXPECT_EQ(*pKernel->enqueuedLocalWorkSizeZ, 2u);
 
-    EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeX, 8u);
-    EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeY, 2u);
-    EXPECT_EQ(*pKernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ, 2u);
+    EXPECT_EQ(*pKernel->localWorkSizeX, 8u);
+    EXPECT_EQ(*pKernel->localWorkSizeY, 2u);
+    EXPECT_EQ(*pKernel->localWorkSizeZ, 2u);
 }
 
 // Underspecified.  Won't permit.
diff --git a/opencl/test/unit_test/command_queue/enqueue_read_buffer_rect_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_read_buffer_rect_tests.cpp
index 2f3fe25b15..dd88bfce4a 100644
--- a/opencl/test/unit_test/command_queue/enqueue_read_buffer_rect_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_read_buffer_rect_tests.cpp
@@ -576,13 +576,13 @@ HWTEST_F(EnqueueReadWriteBufferRectDispatch, givenOffsetResultingInMisalignedPtr
         const auto &surfaceStateDst = getSurfaceState<FamilyType>(&cmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, 0), 1);
 
         if (kernelInfo.kernelArgInfo[1].kernelArgPatchInfoVector[0].size == sizeof(uint64_t)) {
-            auto pKernelArg = (uint64_t *)(kernel->getCrossThreadData(device->getRootDeviceIndex()) +
+            auto pKernelArg = (uint64_t *)(kernel->getCrossThreadData() +
                                            kernelInfo.kernelArgInfo[1].kernelArgPatchInfoVector[0].crossthreadOffset);
             EXPECT_EQ(reinterpret_cast<uint64_t>(alignDown(misalignedDstPtr, 4)), *pKernelArg);
             EXPECT_EQ(*pKernelArg, surfaceStateDst.getSurfaceBaseAddress());
 
         } else if (kernelInfo.kernelArgInfo[1].kernelArgPatchInfoVector[0].size == sizeof(uint32_t)) {
-            auto pKernelArg = (uint32_t *)(kernel->getCrossThreadData(device->getRootDeviceIndex()) +
+            auto pKernelArg = (uint32_t *)(kernel->getCrossThreadData() +
                                            kernelInfo.kernelArgInfo[1].kernelArgPatchInfoVector[0].crossthreadOffset);
             EXPECT_EQ(reinterpret_cast<uint64_t>(alignDown(misalignedDstPtr, 4)), static_cast<uint64_t>(*pKernelArg));
             EXPECT_EQ(static_cast<uint64_t>(*pKernelArg), surfaceStateDst.getSurfaceBaseAddress());
@@ -590,7 +590,7 @@ HWTEST_F(EnqueueReadWriteBufferRectDispatch, givenOffsetResultingInMisalignedPtr
     }
 
     if (kernelInfo.kernelArgInfo[3].kernelArgPatchInfoVector[0].size == 4 * sizeof(uint32_t)) { // size of  uint4 DstOrigin
-        auto dstOffset = (uint32_t *)(kernel->getCrossThreadData(device->getRootDeviceIndex()) +
+        auto dstOffset = (uint32_t *)(kernel->getCrossThreadData() +
                                       kernelInfo.kernelArgInfo[3].kernelArgPatchInfoVector[0].crossthreadOffset);
         EXPECT_EQ(hostOffset.x + ptrDiff(misalignedDstPtr, alignDown(misalignedDstPtr, 4)), *dstOffset);
     } else {
diff --git a/opencl/test/unit_test/command_queue/enqueue_svm_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_svm_tests.cpp
index 74a7f6d6bc..43f8295e12 100644
--- a/opencl/test/unit_test/command_queue/enqueue_svm_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_svm_tests.cpp
@@ -769,7 +769,7 @@ TEST_F(EnqueueSvmTest, givenEnqueueTaskBlockedOnUserEventWhenItIsEnqueuedThenSur
     auto pMultiDeviceKernel = clUniquePtr(MultiDeviceKernel::create<MockKernel>(program.get(), program->getKernelInfosForKernel("FillBufferBytes"), &retVal));
     auto kernel = static_cast<MockKernel *>(pMultiDeviceKernel->getKernel(rootDeviceIndex));
     std::vector<Surface *> allSurfaces;
-    kernel->getResidency(allSurfaces, rootDeviceIndex);
+    kernel->getResidency(allSurfaces);
     EXPECT_EQ(1u, allSurfaces.size());
 
     kernel->setSvmKernelExecInfo(pSvmAlloc);
@@ -789,7 +789,7 @@ TEST_F(EnqueueSvmTest, givenEnqueueTaskBlockedOnUserEventWhenItIsEnqueuedThenSur
         nullptr);
     EXPECT_EQ(CL_SUCCESS, retVal);
 
-    kernel->getResidency(allSurfaces, rootDeviceIndex);
+    kernel->getResidency(allSurfaces);
     EXPECT_EQ(3u, allSurfaces.size());
 
     for (auto &surface : allSurfaces)
diff --git a/opencl/test/unit_test/command_queue/enqueue_write_buffer_rect_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_write_buffer_rect_tests.cpp
index 2da32b4d6e..cd9909923a 100644
--- a/opencl/test/unit_test/command_queue/enqueue_write_buffer_rect_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_write_buffer_rect_tests.cpp
@@ -574,13 +574,13 @@ HWTEST_F(EnqueueReadWriteBufferRectDispatch, givenOffsetResultingInMisalignedPtr
         const auto &surfaceState = getSurfaceState<FamilyType>(&cmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, 0), 0);
 
         if (kernelInfo.kernelArgInfo[0].kernelArgPatchInfoVector[0].size == sizeof(uint64_t)) {
-            auto pKernelArg = (uint64_t *)(kernel->getCrossThreadData(device->getRootDeviceIndex()) +
+            auto pKernelArg = (uint64_t *)(kernel->getCrossThreadData() +
                                            kernelInfo.kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset);
             EXPECT_EQ(reinterpret_cast<uint64_t>(alignDown(misalignedHostPtr, 4)), *pKernelArg);
             EXPECT_EQ(*pKernelArg, surfaceState.getSurfaceBaseAddress());
 
         } else if (kernelInfo.kernelArgInfo[0].kernelArgPatchInfoVector[0].size == sizeof(uint32_t)) {
-            auto pKernelArg = (uint32_t *)(kernel->getCrossThreadData(device->getRootDeviceIndex()) +
+            auto pKernelArg = (uint32_t *)(kernel->getCrossThreadData() +
                                            kernelInfo.kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset);
             EXPECT_EQ(reinterpret_cast<uint64_t>(alignDown(misalignedHostPtr, 4)), static_cast<uint64_t>(*pKernelArg));
             EXPECT_EQ(static_cast<uint64_t>(*pKernelArg), surfaceState.getSurfaceBaseAddress());
@@ -588,7 +588,7 @@ HWTEST_F(EnqueueReadWriteBufferRectDispatch, givenOffsetResultingInMisalignedPtr
     }
 
     if (kernelInfo.kernelArgInfo[2].kernelArgPatchInfoVector[0].size == 4 * sizeof(uint32_t)) { // size of  uint4 SrcOrigin
-        auto dstOffset = (uint32_t *)(kernel->getCrossThreadData(device->getRootDeviceIndex()) +
+        auto dstOffset = (uint32_t *)(kernel->getCrossThreadData() +
                                       kernelInfo.kernelArgInfo[2].kernelArgPatchInfoVector[0].crossthreadOffset);
         EXPECT_EQ(hostOffset.x + ptrDiff(misalignedHostPtr, alignDown(misalignedHostPtr, 4)), *dstOffset);
     } else {
diff --git a/opencl/test/unit_test/command_queue/gl/windows/enqueue_kernel_gl_tests_windows.cpp b/opencl/test/unit_test/command_queue/gl/windows/enqueue_kernel_gl_tests_windows.cpp
index 28c8210b40..1f764d67fb 100644
--- a/opencl/test/unit_test/command_queue/gl/windows/enqueue_kernel_gl_tests_windows.cpp
+++ b/opencl/test/unit_test/command_queue/gl/windows/enqueue_kernel_gl_tests_windows.cpp
@@ -45,7 +45,7 @@ TEST_F(EnqueueKernelTest, givenKernelWithSharedObjArgsWhenEnqueueIsCalledThenRes
     auto &kernelInfo = pKernel->getKernelInfo(rootDeviceIndex);
 
     auto pKernelArg =
-        (uint32_t *)(pKernel->getCrossThreadData(rootDeviceIndex) + kernelInfo.kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset);
+        (uint32_t *)(pKernel->getCrossThreadData() + kernelInfo.kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset);
 
     auto address1 = static_cast<uint64_t>(*pKernelArg);
     auto sharedBufferGpuAddress =
diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_2_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_2_tests.cpp
index 4c1c8d31ea..46f1c06933 100644
--- a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_2_tests.cpp
+++ b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_2_tests.cpp
@@ -380,7 +380,7 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, GivenKernelWithSlmWhenPreviousSLML
     commandStreamReceiver->lastSentL3Config = L3Config;
     commandStreamReceiver->lastSentThreadArbitrationPolicy = kernel.mockKernel->getThreadArbitrationPolicy();
 
-    ((MockKernel *)kernel)->setTotalSLMSize(rootDeviceIndex, 1024);
+    ((MockKernel *)kernel)->setTotalSLMSize(1024);
 
     cmdList.clear();
     commandQueue.enqueueKernel(kernel, 1, nullptr, &GWS, nullptr, 0, nullptr, nullptr);
diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_tests.inl b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_tests.inl
index 4cd254d479..994e398eb3 100644
--- a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_tests.inl
+++ b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_tests.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2020 Intel Corporation
+ * Copyright (C) 2018-2021 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,7 +43,7 @@ void CommandStreamReceiverHwTest<GfxFamily>::givenKernelWithSlmWhenPreviousNOSLM
     commandStreamReceiver->isPreambleSent = true;
     commandStreamReceiver->lastSentL3Config = 0;
 
-    static_cast<MockKernel *>(kernel)->setTotalSLMSize(rootDeviceIndex, 1024);
+    static_cast<MockKernel *>(kernel)->setTotalSLMSize(1024);
 
     cmdList.clear();
     commandQueue.enqueueKernel(kernel, 1, nullptr, &GWS, nullptr, 0, nullptr, nullptr);
@@ -89,7 +89,7 @@ void CommandStreamReceiverHwTest<GfxFamily>::givenBlockedKernelWithSlmWhenPrevio
     commandStreamReceiver->isPreambleSent = true;
     commandStreamReceiver->lastSentL3Config = 0;
 
-    static_cast<MockKernel *>(kernel)->setTotalSLMSize(rootDeviceIndex, 1024);
+    static_cast<MockKernel *>(kernel)->setTotalSLMSize(1024);
 
     commandQueue.enqueueKernel(kernel, 1, nullptr, &GWS, nullptr, 1, &blockingEvent, nullptr);
 
diff --git a/opencl/test/unit_test/context/driver_diagnostics_enqueue_tests.cpp b/opencl/test/unit_test/context/driver_diagnostics_enqueue_tests.cpp
index 106dba3139..88afa15d0f 100644
--- a/opencl/test/unit_test/context/driver_diagnostics_enqueue_tests.cpp
+++ b/opencl/test/unit_test/context/driver_diagnostics_enqueue_tests.cpp
@@ -682,9 +682,9 @@ TEST_F(PerformanceHintEnqueueKernelTest, GivenNullLocalSizeAndEnableComputeWorkS
 
     snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE],
              kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
-             *kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeX,
-             *kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeY,
-             *kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
+             *kernel->localWorkSizeX,
+             *kernel->localWorkSizeY,
+             *kernel->localWorkSizeZ);
     EXPECT_TRUE(containsHint(expectedHint, userData));
 }
 
@@ -697,9 +697,9 @@ TEST_F(PerformanceHintEnqueueKernelTest, GivenNullLocalSizeAndEnableComputeWorkS
 
     snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE],
              kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
-             *kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeX,
-             *kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeY,
-             *kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
+             *kernel->localWorkSizeX,
+             *kernel->localWorkSizeY,
+             *kernel->localWorkSizeZ);
     EXPECT_TRUE(containsHint(expectedHint, userData));
     DebugManager.flags.EnableComputeWorkSizeND.set(isWorkGroupSizeEnabled);
 }
@@ -713,9 +713,9 @@ TEST_F(PerformanceHintEnqueueKernelTest, GivenNullLocalSizeAndEnableComputeWorkS
 
     snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE],
              kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
-             *kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeX,
-             *kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeY,
-             *kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
+             *kernel->localWorkSizeX,
+             *kernel->localWorkSizeY,
+             *kernel->localWorkSizeZ);
     EXPECT_TRUE(containsHint(expectedHint, userData));
     DebugManager.flags.EnableComputeWorkSizeND.set(isWorkGroupSizeEnabled);
 }
@@ -727,9 +727,9 @@ TEST_F(PerformanceHintEnqueueKernelTest, GivenNullLocalSizeAndEnableComputeWorkS
 
     snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE],
              kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
-             *kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeX,
-             *kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeY,
-             *kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
+             *kernel->localWorkSizeX,
+             *kernel->localWorkSizeY,
+             *kernel->localWorkSizeZ);
     EXPECT_TRUE(containsHint(expectedHint, userData));
 }
 
@@ -743,9 +743,9 @@ TEST_F(PerformanceHintEnqueueKernelTest, GivenNullLocalSizeAndEnableComputeWorkS
 
     snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE],
              kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
-             *kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeX,
-             *kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeY,
-             *kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
+             *kernel->localWorkSizeX,
+             *kernel->localWorkSizeY,
+             *kernel->localWorkSizeZ);
     EXPECT_TRUE(containsHint(expectedHint, userData));
 }
 
@@ -759,9 +759,9 @@ TEST_F(PerformanceHintEnqueueKernelTest, GivenNullLocalSizeAndEnableComputeWorkS
 
     snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE],
              kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelMetadata.kernelName.c_str(),
-             *kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeX,
-             *kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeY,
-             *kernel->kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
+             *kernel->localWorkSizeX,
+             *kernel->localWorkSizeY,
+             *kernel->localWorkSizeZ);
     EXPECT_TRUE(containsHint(expectedHint, userData));
 }
 
diff --git a/opencl/test/unit_test/execution_model/enqueue_execution_model_kernel_tests.cpp b/opencl/test/unit_test/execution_model/enqueue_execution_model_kernel_tests.cpp
index a0fd671562..4bcecf59fe 100644
--- a/opencl/test/unit_test/execution_model/enqueue_execution_model_kernel_tests.cpp
+++ b/opencl/test/unit_test/execution_model/enqueue_execution_model_kernel_tests.cpp
@@ -322,7 +322,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, ParentKernelEnqueueTest, givenParentKernelWhenEnqueu
 
         Kernel *blockKernel = Kernel::create(pKernel->getProgram(), MockKernel::toKernelInfoContainer(*pBlockInfo, rootDeviceIndex), *pClDevice, nullptr);
         blockSSH = alignUp(blockSSH, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
-        if (blockKernel->getNumberOfBindingTableStates(rootDeviceIndex) > 0) {
+        if (blockKernel->getNumberOfBindingTableStates() > 0) {
             ASSERT_TRUE(isValidOffset(pBlockInfo->kernelDescriptor.payloadMappings.bindingTable.tableOffset));
             auto dstBlockBti = ptrOffset(blockSSH, pBlockInfo->kernelDescriptor.payloadMappings.bindingTable.tableOffset);
             EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(dstBlockBti) % INTERFACE_DESCRIPTOR_DATA::BINDINGTABLEPOINTER_ALIGN_SIZE);
@@ -330,7 +330,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, ParentKernelEnqueueTest, givenParentKernelWhenEnqueu
 
             auto srcBlockBti = ptrOffset(pBlockInfo->heapInfo.pSsh, pBlockInfo->kernelDescriptor.payloadMappings.bindingTable.tableOffset);
             auto srcBindingTable = reinterpret_cast<const BINDING_TABLE_STATE *>(srcBlockBti);
-            for (uint32_t i = 0; i < blockKernel->getNumberOfBindingTableStates(rootDeviceIndex); ++i) {
+            for (uint32_t i = 0; i < blockKernel->getNumberOfBindingTableStates(); ++i) {
                 uint32_t dstSurfaceStatePointer = dstBindingTable[i].getSurfaceStatePointer();
                 uint32_t srcSurfaceStatePointer = srcBindingTable[i].getSurfaceStatePointer();
                 auto *dstSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh->getCpuBase(), dstSurfaceStatePointer));
@@ -454,13 +454,13 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ParentKernelEnqueueFixture, GivenParentKernelWhenEnq
 
         const auto &defaultQueueSurfaceAddress = implicitArgs.deviceSideEnqueueDefaultQueueSurfaceAddress;
         if (isValidOffset(defaultQueueSurfaceAddress.stateless)) {
-            auto patchLocation = ptrOffset(reinterpret_cast<uint64_t *>(parentKernel->getCrossThreadData(rootDeviceIndex)), defaultQueueSurfaceAddress.stateless);
+            auto patchLocation = ptrOffset(reinterpret_cast<uint64_t *>(parentKernel->getCrossThreadData()), defaultQueueSurfaceAddress.stateless);
             EXPECT_EQ(pDevQueue->getQueueBuffer()->getGpuAddressToPatch(), *patchLocation);
         }
 
         const auto &eventPoolSurfaceAddress = implicitArgs.deviceSideEnqueueEventPoolSurfaceAddress;
         if (isValidOffset(eventPoolSurfaceAddress.stateless)) {
-            auto patchLocation = ptrOffset(reinterpret_cast<uint64_t *>(parentKernel->getCrossThreadData(rootDeviceIndex)), eventPoolSurfaceAddress.stateless);
+            auto patchLocation = ptrOffset(reinterpret_cast<uint64_t *>(parentKernel->getCrossThreadData()), eventPoolSurfaceAddress.stateless);
             EXPECT_EQ(pDevQueue->getEventPoolBuffer()->getGpuAddressToPatch(), *patchLocation);
         }
     }
diff --git a/opencl/test/unit_test/execution_model/parent_kernel_dispatch_tests.cpp b/opencl/test/unit_test/execution_model/parent_kernel_dispatch_tests.cpp
index 1ec480899d..09f8cc48f8 100644
--- a/opencl/test/unit_test/execution_model/parent_kernel_dispatch_tests.cpp
+++ b/opencl/test/unit_test/execution_model/parent_kernel_dispatch_tests.cpp
@@ -174,7 +174,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, ParentKernelDispatchTest, givenParentKernelWhenQueue
 
     size_t sshUsed = blockedCommandsData->ssh->getUsed();
 
-    size_t expectedSizeSSH = pKernel->getNumberOfBindingTableStates(rootDeviceIndex) * sizeof(RENDER_SURFACE_STATE) +
+    size_t expectedSizeSSH = pKernel->getNumberOfBindingTableStates() * sizeof(RENDER_SURFACE_STATE) +
                              pKernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.payloadMappings.bindingTable.numEntries * sizeof(BINDING_TABLE_STATE) +
                              UnitTestHelper<FamilyType>::getDefaultSshUsage();
 
diff --git a/opencl/test/unit_test/execution_model/scheduler_dispatch_tests.cpp b/opencl/test/unit_test/execution_model/scheduler_dispatch_tests.cpp
index 00d1790dcd..dfefe9699d 100644
--- a/opencl/test/unit_test/execution_model/scheduler_dispatch_tests.cpp
+++ b/opencl/test/unit_test/execution_model/scheduler_dispatch_tests.cpp
@@ -70,27 +70,27 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ExecutionModelSchedulerFixture, WhenDispatchingSched
         pDevQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE),
         false);
 
-    EXPECT_EQ(0u, *scheduler.kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetX);
-    EXPECT_EQ(0u, *scheduler.kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetY);
-    EXPECT_EQ(0u, *scheduler.kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetZ);
+    EXPECT_EQ(0u, *scheduler.globalWorkOffsetX);
+    EXPECT_EQ(0u, *scheduler.globalWorkOffsetY);
+    EXPECT_EQ(0u, *scheduler.globalWorkOffsetZ);
 
-    EXPECT_EQ((uint32_t)scheduler.getLws(), *scheduler.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
-    EXPECT_EQ(1u, *scheduler.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
-    EXPECT_EQ(1u, *scheduler.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
+    EXPECT_EQ((uint32_t)scheduler.getLws(), *scheduler.localWorkSizeX);
+    EXPECT_EQ(1u, *scheduler.localWorkSizeY);
+    EXPECT_EQ(1u, *scheduler.localWorkSizeZ);
 
-    EXPECT_EQ((uint32_t)scheduler.getLws(), *scheduler.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX2);
-    EXPECT_EQ(1u, *scheduler.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY2);
-    EXPECT_EQ(1u, *scheduler.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ2);
+    EXPECT_EQ((uint32_t)scheduler.getLws(), *scheduler.localWorkSizeX2);
+    EXPECT_EQ(1u, *scheduler.localWorkSizeY2);
+    EXPECT_EQ(1u, *scheduler.localWorkSizeZ2);
 
-    if (scheduler.kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeX != &Kernel::dummyPatchLocation) {
-        EXPECT_EQ((uint32_t)scheduler.getLws(), *scheduler.kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeX);
+    if (scheduler.enqueuedLocalWorkSizeX != &Kernel::dummyPatchLocation) {
+        EXPECT_EQ((uint32_t)scheduler.getLws(), *scheduler.enqueuedLocalWorkSizeX);
     }
-    EXPECT_EQ(1u, *scheduler.kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeY);
-    EXPECT_EQ(1u, *scheduler.kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeZ);
+    EXPECT_EQ(1u, *scheduler.enqueuedLocalWorkSizeY);
+    EXPECT_EQ(1u, *scheduler.enqueuedLocalWorkSizeZ);
 
-    EXPECT_EQ((uint32_t)(scheduler.getGws() / scheduler.getLws()), *scheduler.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsX);
-    EXPECT_EQ(0u, *scheduler.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsY);
-    EXPECT_EQ(0u, *scheduler.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsZ);
+    EXPECT_EQ((uint32_t)(scheduler.getGws() / scheduler.getLws()), *scheduler.numWorkGroupsX);
+    EXPECT_EQ(0u, *scheduler.numWorkGroupsY);
+    EXPECT_EQ(0u, *scheduler.numWorkGroupsZ);
 
     HardwareParse hwParser;
     hwParser.parseCommands<FamilyType>(commandStream, 0);
@@ -151,7 +151,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ExecutionModelSchedulerFixture, WhenDispatchingSched
     auto grfSize = pDevice->getHardwareInfo().capabilityTable.grfSize;
     auto sizePerThreadDataTotal = PerThreadDataHelper::getPerThreadDataSizeTotal(scheduler.getKernelInfo(rootDeviceIndex).getMaxSimdSize(), grfSize, numChannels, scheduler.getLws());
 
-    auto sizeCrossThreadData = scheduler.getCrossThreadDataSize(rootDeviceIndex);
+    auto sizeCrossThreadData = scheduler.getCrossThreadDataSize();
     auto IndirectDataLength = alignUp((uint32_t)(sizeCrossThreadData + sizePerThreadDataTotal), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
     EXPECT_EQ(IndirectDataLength, walker->getIndirectDataLength());
 
diff --git a/opencl/test/unit_test/gtpin/gtpin_tests.cpp b/opencl/test/unit_test/gtpin/gtpin_tests.cpp
index 42627bbb37..7782af24fa 100644
--- a/opencl/test/unit_test/gtpin/gtpin_tests.cpp
+++ b/opencl/test/unit_test/gtpin/gtpin_tests.cpp
@@ -1282,7 +1282,7 @@ TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenKernelWithoutSSHIsUsedThenG
 
     // Verify that when SSH is removed then during kernel execution
     // GT-Pin Kernel Submit, Command Buffer Create and Command Buffer Complete callbacks are not called.
-    pKernel->resizeSurfaceStateHeap(rootDeviceIndex, nullptr, 0, 0, 0);
+    pKernel->resizeSurfaceStateHeap(nullptr, 0, 0, 0);
 
     int prevCount2 = KernelSubmitCallbackCount;
     int prevCount3 = CommandBufferCreateCallbackCount;
@@ -1396,7 +1396,7 @@ TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenBlockedKernelWithoutSSHIsUs
 
     // Verify that when SSH is removed then during kernel execution
     // GT-Pin Kernel Submit, Command Buffer Create and Command Buffer Complete callbacks are not called.
-    pKernel->resizeSurfaceStateHeap(rootDeviceIndex, nullptr, 0, 0, 0);
+    pKernel->resizeSurfaceStateHeap(nullptr, 0, 0, 0);
 
     cl_event userEvent = clCreateUserEvent(context, &retVal);
     EXPECT_EQ(CL_SUCCESS, retVal);
@@ -2177,8 +2177,8 @@ TEST_F(GTPinTests, givenParentKernelWhenGtPinAddingSurfaceStateThenItIsNotAddedA
     std::unique_ptr<MockParentKernel> parentKernel(MockParentKernel::create(*pContext));
 
     parentKernel->mockKernelInfo->usesSsh = true;
-    parentKernel->kernelDeviceInfos[rootDeviceIndex].sshLocalSize = 64;
-    parentKernel->kernelDeviceInfos[rootDeviceIndex].pSshLocal.reset(new char[64]);
+    parentKernel->sshLocalSize = 64;
+    parentKernel->pSshLocal.reset(new char[64]);
 
     size_t sizeSurfaceStates1 = parentKernel->getSurfaceStateHeapSize(rootDeviceIndex);
 
@@ -2234,11 +2234,11 @@ TEST_F(GTPinTests, givenKernelWithSSHThenVerifyThatSSHResizeWorksWell) {
     auto pKernel = pMultiDeviceKernel->getKernel(rootDeviceIndex);
     ASSERT_NE(nullptr, pKernel);
 
-    size_t numBTS1 = pKernel->getNumberOfBindingTableStates(rootDeviceIndex);
+    size_t numBTS1 = pKernel->getNumberOfBindingTableStates();
     EXPECT_EQ(2u, numBTS1);
     size_t sizeSurfaceStates1 = pKernel->getSurfaceStateHeapSize(rootDeviceIndex);
     EXPECT_NE(0u, sizeSurfaceStates1);
-    size_t offsetBTS1 = pKernel->getBindingTableOffset(rootDeviceIndex);
+    size_t offsetBTS1 = pKernel->getBindingTableOffset();
     EXPECT_NE(0u, offsetBTS1);
 
     GFXCORE_FAMILY genFamily = pDevice->getHardwareInfo().platform.eRenderCoreFamily;
@@ -2250,11 +2250,11 @@ TEST_F(GTPinTests, givenKernelWithSSHThenVerifyThatSSHResizeWorksWell) {
     bool surfaceAdded = gtpinHelper.addSurfaceState(pKernel, rootDeviceIndex);
     EXPECT_TRUE(surfaceAdded);
 
-    size_t numBTS2 = pKernel->getNumberOfBindingTableStates(rootDeviceIndex);
+    size_t numBTS2 = pKernel->getNumberOfBindingTableStates();
     EXPECT_EQ(numBTS1 + 1, numBTS2);
     size_t sizeSurfaceStates2 = pKernel->getSurfaceStateHeapSize(rootDeviceIndex);
     EXPECT_GT(sizeSurfaceStates2, sizeSurfaceStates1);
-    size_t offsetBTS2 = pKernel->getBindingTableOffset(rootDeviceIndex);
+    size_t offsetBTS2 = pKernel->getBindingTableOffset();
     EXPECT_GT(offsetBTS2, offsetBTS1);
 
     void *pSS2 = gtpinHelper.getSurfaceState(pKernel, 0, rootDeviceIndex);
@@ -2264,17 +2264,17 @@ TEST_F(GTPinTests, givenKernelWithSSHThenVerifyThatSSHResizeWorksWell) {
     EXPECT_EQ(nullptr, pSS2);
 
     // Remove kernel's SSH
-    pKernel->resizeSurfaceStateHeap(rootDeviceIndex, nullptr, 0, 0, 0);
+    pKernel->resizeSurfaceStateHeap(nullptr, 0, 0, 0);
 
     // Try to enlarge SSH once again, this time the operation must fail
     surfaceAdded = gtpinHelper.addSurfaceState(pKernel, rootDeviceIndex);
     EXPECT_FALSE(surfaceAdded);
 
-    size_t numBTS3 = pKernel->getNumberOfBindingTableStates(rootDeviceIndex);
+    size_t numBTS3 = pKernel->getNumberOfBindingTableStates();
     EXPECT_EQ(0u, numBTS3);
     size_t sizeSurfaceStates3 = pKernel->getSurfaceStateHeapSize(rootDeviceIndex);
     EXPECT_EQ(0u, sizeSurfaceStates3);
-    size_t offsetBTS3 = pKernel->getBindingTableOffset(rootDeviceIndex);
+    size_t offsetBTS3 = pKernel->getBindingTableOffset();
     EXPECT_EQ(0u, offsetBTS3);
     void *pSS3 = gtpinHelper.getSurfaceState(pKernel, 0, rootDeviceIndex);
     EXPECT_EQ(nullptr, pSS3);
@@ -2409,7 +2409,7 @@ TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenOnKernelSubitIsCalledThenCo
     std::unique_ptr<MultiDeviceKernel> pMultiDeviceKernel(MockMultiDeviceKernel::create<MockKernel>(pProgramm.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex)));
     auto pKernel = static_cast<MockKernel *>(pMultiDeviceKernel->getKernel(rootDeviceIndex));
 
-    pKernel->setSshLocal(nullptr, sizeof(surfaceStateHeap), rootDeviceIndex);
+    pKernel->setSshLocal(nullptr, sizeof(surfaceStateHeap));
 
     kernelOffset = 0x1234;
     EXPECT_NE(pKernel->getStartOffset(), kernelOffset);
diff --git a/opencl/test/unit_test/helpers/dispatch_info_builder_tests.cpp b/opencl/test/unit_test/helpers/dispatch_info_builder_tests.cpp
index e16d2c472f..dee0aa9530 100644
--- a/opencl/test/unit_test/helpers/dispatch_info_builder_tests.cpp
+++ b/opencl/test/unit_test/helpers/dispatch_info_builder_tests.cpp
@@ -69,7 +69,7 @@ class DispatchInfoBuilderFixture : public ContextFixture, public ClDeviceFixture
         pKernel->setCrossThreadData(pCrossThreadData, sizeof(pCrossThreadData));
         pKernel->setKernelArgHandler(0, &Kernel::setArgBuffer);
 
-        pKernel->kernelDeviceInfos[rootDeviceIndex].slmTotalSize = 128;
+        pKernel->slmTotalSize = 128;
         pKernel->isBuiltIn = true;
     }
 
@@ -874,11 +874,11 @@ TEST_F(DispatchInfoBuilderTest, WhenSettingKernelArgThenAddressesAreCorrect) {
 
     for (auto &dispatchInfo : multiDispatchInfo) {
         auto crossthreadOffset0 = pKernelInfo->kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset;
-        EXPECT_EQ(buffer->getCpuAddress(), *reinterpret_cast<void **>((dispatchInfo.getKernel()->getCrossThreadData(rootDeviceIndex) + crossthreadOffset0)));
+        EXPECT_EQ(buffer->getCpuAddress(), *reinterpret_cast<void **>((dispatchInfo.getKernel()->getCrossThreadData() + crossthreadOffset0)));
         auto crossthreadOffset1 = pKernelInfo->kernelArgInfo[1].kernelArgPatchInfoVector[0].crossthreadOffset;
-        EXPECT_EQ(svmPtr, *(reinterpret_cast<void **>(dispatchInfo.getKernel()->getCrossThreadData(rootDeviceIndex) + crossthreadOffset1)));
+        EXPECT_EQ(svmPtr, *(reinterpret_cast<void **>(dispatchInfo.getKernel()->getCrossThreadData() + crossthreadOffset1)));
         auto crossthreadOffset2 = pKernelInfo->kernelArgInfo[2].kernelArgPatchInfoVector[0].crossthreadOffset;
-        EXPECT_EQ(svmPtr, *(reinterpret_cast<void **>(dispatchInfo.getKernel()->getCrossThreadData(rootDeviceIndex) + crossthreadOffset2)));
+        EXPECT_EQ(svmPtr, *(reinterpret_cast<void **>(dispatchInfo.getKernel()->getCrossThreadData() + crossthreadOffset2)));
     }
 
     delete buffer;
@@ -920,34 +920,34 @@ TEST_F(DispatchInfoBuilderTest, GivenSplitWhenSettingKernelArgThenAddressesAreCo
     clearCrossThreadData();
     builder1D.setArg(SplitDispatch::RegionCoordX::Left, static_cast<uint32_t>(0), sizeof(cl_mem *), pVal);
     for (auto &dispatchInfo : mdi1D) {
-        EXPECT_EQ(buffer->getCpuAddress(), *reinterpret_cast<void **>((dispatchInfo.getKernel()->getCrossThreadData(rootDeviceIndex) + 0x10)));
+        EXPECT_EQ(buffer->getCpuAddress(), *reinterpret_cast<void **>((dispatchInfo.getKernel()->getCrossThreadData() + 0x10)));
     }
     clearCrossThreadData();
     builder2D.setArg(SplitDispatch::RegionCoordX::Left, SplitDispatch::RegionCoordY::Top, static_cast<uint32_t>(0), sizeof(cl_mem *), pVal);
     for (auto &dispatchInfo : mdi2D) {
-        EXPECT_EQ(buffer->getCpuAddress(), *reinterpret_cast<void **>((dispatchInfo.getKernel()->getCrossThreadData(rootDeviceIndex) + 0x10)));
+        EXPECT_EQ(buffer->getCpuAddress(), *reinterpret_cast<void **>((dispatchInfo.getKernel()->getCrossThreadData() + 0x10)));
     }
     clearCrossThreadData();
     builder3D.setArg(SplitDispatch::RegionCoordX::Left, SplitDispatch::RegionCoordY::Top, SplitDispatch::RegionCoordZ::Front, static_cast<uint32_t>(0), sizeof(cl_mem *), pVal);
     for (auto &dispatchInfo : mdi3D) {
-        EXPECT_EQ(buffer->getCpuAddress(), *reinterpret_cast<void **>((dispatchInfo.getKernel()->getCrossThreadData(rootDeviceIndex) + 0x10)));
+        EXPECT_EQ(buffer->getCpuAddress(), *reinterpret_cast<void **>((dispatchInfo.getKernel()->getCrossThreadData() + 0x10)));
     }
 
     //Set arg SVM
     clearCrossThreadData();
     builder1D.setArgSvm(SplitDispatch::RegionCoordX::Left, 1, sizeof(svmPtr), svmPtr, nullptr, 0u);
     for (auto &dispatchInfo : mdi1D) {
-        EXPECT_EQ(svmPtr, *(reinterpret_cast<void **>(dispatchInfo.getKernel()->getCrossThreadData(rootDeviceIndex) + 0x30)));
+        EXPECT_EQ(svmPtr, *(reinterpret_cast<void **>(dispatchInfo.getKernel()->getCrossThreadData() + 0x30)));
     }
     clearCrossThreadData();
     builder2D.setArgSvm(SplitDispatch::RegionCoordX::Left, SplitDispatch::RegionCoordY::Top, 1, sizeof(svmPtr), svmPtr, nullptr, 0u);
     for (auto &dispatchInfo : mdi2D) {
-        EXPECT_EQ(svmPtr, *(reinterpret_cast<void **>(dispatchInfo.getKernel()->getCrossThreadData(rootDeviceIndex) + 0x30)));
+        EXPECT_EQ(svmPtr, *(reinterpret_cast<void **>(dispatchInfo.getKernel()->getCrossThreadData() + 0x30)));
     }
     clearCrossThreadData();
     builder3D.setArgSvm(SplitDispatch::RegionCoordX::Left, SplitDispatch::RegionCoordY::Top, SplitDispatch::RegionCoordZ::Front, 1, sizeof(svmPtr), svmPtr, nullptr, 0u);
     for (auto &dispatchInfo : mdi3D) {
-        EXPECT_EQ(svmPtr, *(reinterpret_cast<void **>(dispatchInfo.getKernel()->getCrossThreadData(rootDeviceIndex) + 0x30)));
+        EXPECT_EQ(svmPtr, *(reinterpret_cast<void **>(dispatchInfo.getKernel()->getCrossThreadData() + 0x30)));
     }
 
     delete buffer;
diff --git a/opencl/test/unit_test/helpers/dispatch_info_tests.cpp b/opencl/test/unit_test/helpers/dispatch_info_tests.cpp
index 7ce73a25b7..5a46d91792 100644
--- a/opencl/test/unit_test/helpers/dispatch_info_tests.cpp
+++ b/opencl/test/unit_test/helpers/dispatch_info_tests.cpp
@@ -44,7 +44,7 @@ class DispatchInfoFixture : public ContextFixture, public ClDeviceFixture {
         pProgram = new MockProgram(pContext, false, toClDeviceVector(*pClDevice));
 
         pKernel = new MockKernel(pProgram, MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex), *pClDevice);
-        pKernel->kernelDeviceInfos[rootDeviceIndex].slmTotalSize = 128;
+        pKernel->slmTotalSize = 128;
     }
     void TearDown() override {
         delete pKernel;
diff --git a/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp b/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp
index be5babb87c..c8c929291c 100644
--- a/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp
+++ b/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp
@@ -91,7 +91,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenProgramInterfaceDescriptor
     auto usedIndirectHeapBefore = indirectHeap.getUsed();
     indirectHeap.getSpace(sizeof(INTERFACE_DESCRIPTOR_DATA));
 
-    size_t crossThreadDataSize = kernel->getCrossThreadDataSize(rootDeviceIndex);
+    size_t crossThreadDataSize = kernel->getCrossThreadDataSize();
     HardwareCommandsHelper<FamilyType>::sendInterfaceDescriptorData(
         indirectHeap, 0, 0, crossThreadDataSize, 64, 0, 0, 0, 1, *kernel, 0, pDevice->getPreemptionMode(), nullptr, *pDevice);
 
@@ -163,17 +163,16 @@ HWTEST_F(HardwareCommandsTest, WhenCrossThreadDataIsCreatedThenOnlyRequiredSpace
 
     auto &indirectHeap = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
     auto usedBefore = indirectHeap.getUsed();
-    auto sizeCrossThreadData = kernel->getCrossThreadDataSize(rootDeviceIndex);
+    auto sizeCrossThreadData = kernel->getCrossThreadDataSize();
     HardwareCommandsHelper<FamilyType>::sendCrossThreadData(
         indirectHeap,
         *kernel,
         false,
         nullptr,
-        sizeCrossThreadData,
-        rootDeviceIndex);
+        sizeCrossThreadData);
 
     auto usedAfter = indirectHeap.getUsed();
-    EXPECT_EQ(kernel->getCrossThreadDataSize(rootDeviceIndex), usedAfter - usedBefore);
+    EXPECT_EQ(kernel->getCrossThreadDataSize(), usedAfter - usedBefore);
 }
 
 HWTEST_F(HardwareCommandsTest, givenSendCrossThreadDataWhenWhenAddPatchInfoCommentsForAUBDumpIsNotSetThenAddPatchInfoDataOffsetsAreNotMoved) {
@@ -190,14 +189,13 @@ HWTEST_F(HardwareCommandsTest, givenSendCrossThreadDataWhenWhenAddPatchInfoComme
 
     PatchInfoData patchInfoData = {0xaaaaaaaa, 0, PatchInfoAllocationType::KernelArg, 0xbbbbbbbb, 0, PatchInfoAllocationType::IndirectObjectHeap};
     kernel->getPatchInfoDataList().push_back(patchInfoData);
-    auto sizeCrossThreadData = kernel->getCrossThreadDataSize(rootDeviceIndex);
+    auto sizeCrossThreadData = kernel->getCrossThreadDataSize();
     HardwareCommandsHelper<FamilyType>::sendCrossThreadData(
         indirectHeap,
         *kernel,
         false,
         nullptr,
-        sizeCrossThreadData,
-        rootDeviceIndex);
+        sizeCrossThreadData);
 
     ASSERT_EQ(1u, kernel->getPatchInfoDataList().size());
     EXPECT_EQ(0xaaaaaaaa, kernel->getPatchInfoDataList()[0].sourceAllocation);
@@ -212,14 +210,13 @@ HWTEST_F(HardwareCommandsTest, givenIndirectHeapNotAllocatedFromInternalPoolWhen
     auto nonInternalAllocation = pDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties(MockAllocationProperties{pDevice->getRootDeviceIndex(), MemoryConstants::pageSize});
     IndirectHeap indirectHeap(nonInternalAllocation, false);
 
-    auto sizeCrossThreadData = mockKernelWithInternal->mockKernel->getCrossThreadDataSize(rootDeviceIndex);
+    auto sizeCrossThreadData = mockKernelWithInternal->mockKernel->getCrossThreadDataSize();
     auto offset = HardwareCommandsHelper<FamilyType>::sendCrossThreadData(
         indirectHeap,
         *mockKernelWithInternal->mockKernel,
         false,
         nullptr,
-        sizeCrossThreadData,
-        rootDeviceIndex);
+        sizeCrossThreadData);
     EXPECT_EQ(0u, offset);
     pDevice->getMemoryManager()->freeGraphicsMemory(nonInternalAllocation);
 }
@@ -229,14 +226,13 @@ HWTEST_F(HardwareCommandsTest, givenIndirectHeapAllocatedFromInternalPoolWhenSen
     IndirectHeap indirectHeap(internalAllocation, true);
     auto expectedOffset = internalAllocation->getGpuAddressToPatch();
 
-    auto sizeCrossThreadData = mockKernelWithInternal->mockKernel->getCrossThreadDataSize(rootDeviceIndex);
+    auto sizeCrossThreadData = mockKernelWithInternal->mockKernel->getCrossThreadDataSize();
     auto offset = HardwareCommandsHelper<FamilyType>::sendCrossThreadData(
         indirectHeap,
         *mockKernelWithInternal->mockKernel,
         false,
         nullptr,
-        sizeCrossThreadData,
-        rootDeviceIndex);
+        sizeCrossThreadData);
     EXPECT_EQ(expectedOffset, offset);
 
     pDevice->getMemoryManager()->freeGraphicsMemory(internalAllocation);
@@ -263,14 +259,13 @@ HWTEST_F(HardwareCommandsTest, givenSendCrossThreadDataWhenWhenAddPatchInfoComme
 
     kernel->getPatchInfoDataList().push_back(patchInfoData1);
     kernel->getPatchInfoDataList().push_back(patchInfoData2);
-    auto sizeCrossThreadData = kernel->getCrossThreadDataSize(rootDeviceIndex);
+    auto sizeCrossThreadData = kernel->getCrossThreadDataSize();
     auto offsetCrossThreadData = HardwareCommandsHelper<FamilyType>::sendCrossThreadData(
         indirectHeap,
         *kernel,
         false,
         nullptr,
-        sizeCrossThreadData,
-        rootDeviceIndex);
+        sizeCrossThreadData);
 
     ASSERT_NE(0u, offsetCrossThreadData);
     EXPECT_EQ(128u, offsetCrossThreadData);
@@ -386,7 +381,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWithFourBindingTabl
     *pWalkerCmd = FamilyType::cmdInitGpgpuWalker;
 
     auto expectedBindingTableCount = 3u;
-    mockKernelWithInternal->mockKernel->kernelDeviceInfos[rootDeviceIndex].numberOfBindingTableStates = expectedBindingTableCount;
+    mockKernelWithInternal->mockKernel->numberOfBindingTableStates = expectedBindingTableCount;
 
     auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
     auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
@@ -432,7 +427,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelThatIsSchedulerWhen
     *pWalkerCmd = FamilyType::cmdInitGpgpuWalker;
 
     auto expectedBindingTableCount = 3u;
-    mockKernelWithInternal->mockKernel->kernelDeviceInfos[rootDeviceIndex].numberOfBindingTableStates = expectedBindingTableCount;
+    mockKernelWithInternal->mockKernel->numberOfBindingTableStates = expectedBindingTableCount;
     auto isScheduler = const_cast<bool *>(&mockKernelWithInternal->mockKernel->isSchedulerKernel);
     *isScheduler = true;
 
@@ -476,7 +471,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWith100BindingTable
     *pWalkerCmd = FamilyType::cmdInitGpgpuWalker;
 
     auto expectedBindingTableCount = 100u;
-    mockKernelWithInternal->mockKernel->kernelDeviceInfos[rootDeviceIndex].numberOfBindingTableStates = expectedBindingTableCount;
+    mockKernelWithInternal->mockKernel->numberOfBindingTableStates = expectedBindingTableCount;
 
     auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
     auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
@@ -799,7 +794,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenGettingBindingTableStateTh
         auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);
 
         // Initialize binding table state pointers with pattern
-        EXPECT_EQ(numSurfaces, pKernel->getNumberOfBindingTableStates(rootDeviceIndex));
+        EXPECT_EQ(numSurfaces, pKernel->getNumberOfBindingTableStates());
 
         const size_t localWorkSizes[3]{256, 1, 1};
 
@@ -888,7 +883,7 @@ HWTEST_F(HardwareCommandsTest, GivenBuffersNotRequiringSshWhenSettingBindingTabl
     auto usedBefore = ssh.getUsed();
 
     // Initialize binding table state pointers with pattern
-    auto numSurfaceStates = pKernel->getNumberOfBindingTableStates(rootDeviceIndex);
+    auto numSurfaceStates = pKernel->getNumberOfBindingTableStates();
     EXPECT_EQ(0u, numSurfaceStates);
 
     // set binding table states
@@ -932,7 +927,7 @@ HWTEST_F(HardwareCommandsTest, GivenZeroSurfaceStatesWhenSettingBindingTableStat
     auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);
 
     // Initialize binding table state pointers with pattern
-    auto numSurfaceStates = pKernel->getNumberOfBindingTableStates(rootDeviceIndex);
+    auto numSurfaceStates = pKernel->getNumberOfBindingTableStates();
     EXPECT_EQ(0u, numSurfaceStates);
 
     auto dstBindingTablePointer = pushBindingTableAndSurfaceStates<FamilyType>(ssh, *pKernel);
@@ -1078,7 +1073,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, GivenKernelWithSamplersWhenInd
     }
 
     mockKernelWithInternal->mockKernel->setCrossThreadData(mockKernelWithInternal->crossThreadData, sizeof(mockKernelWithInternal->crossThreadData));
-    mockKernelWithInternal->mockKernel->setSshLocal(mockKernelWithInternal->sshLocal, sizeof(mockKernelWithInternal->sshLocal), rootDeviceIndex);
+    mockKernelWithInternal->mockKernel->setSshLocal(mockKernelWithInternal->sshLocal, sizeof(mockKernelWithInternal->sshLocal));
     uint32_t interfaceDescriptorIndex = 0;
     auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
     auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel, rootDeviceIndex);
diff --git a/opencl/test/unit_test/helpers/hardware_commands_helper_tests.h b/opencl/test/unit_test/helpers/hardware_commands_helper_tests.h
index 543f0c605f..e68dcb4cbf 100644
--- a/opencl/test/unit_test/helpers/hardware_commands_helper_tests.h
+++ b/opencl/test/unit_test/helpers/hardware_commands_helper_tests.h
@@ -47,6 +47,6 @@ struct HardwareCommandsTest : ClDeviceFixture,
     size_t pushBindingTableAndSurfaceStates(IndirectHeap &dstHeap, const Kernel &srcKernel) {
         return EncodeSurfaceState<GfxFamily>::pushBindingTableAndSurfaceStates(dstHeap, srcKernel.getKernelInfo(rootDeviceIndex).kernelDescriptor.payloadMappings.bindingTable.numEntries,
                                                                                srcKernel.getSurfaceStateHeap(rootDeviceIndex), srcKernel.getSurfaceStateHeapSize(rootDeviceIndex),
-                                                                               srcKernel.getNumberOfBindingTableStates(rootDeviceIndex), srcKernel.getBindingTableOffset(rootDeviceIndex));
+                                                                               srcKernel.getNumberOfBindingTableStates(), srcKernel.getBindingTableOffset());
     }
 };
diff --git a/opencl/test/unit_test/kernel/clone_kernel_tests.cpp b/opencl/test/unit_test/kernel/clone_kernel_tests.cpp
index 25850d9b97..2b5fd98d78 100644
--- a/opencl/test/unit_test/kernel/clone_kernel_tests.cpp
+++ b/opencl/test/unit_test/kernel/clone_kernel_tests.cpp
@@ -183,7 +183,7 @@ TEST_F(CloneKernelTest, GivenArgLocalWhenCloningKernelThenKernelInfoIsCorrect) {
         EXPECT_EQ(pSourceKernel[rootDeviceIndex]->getPatchedArgumentsNum(), pClonedKernel[rootDeviceIndex]->getPatchedArgumentsNum());
         EXPECT_EQ(pSourceKernel[rootDeviceIndex]->getKernelArgInfo(0).isPatched, pClonedKernel[rootDeviceIndex]->getKernelArgInfo(0).isPatched);
 
-        EXPECT_EQ(alignUp(slmSize, 1024), pClonedKernel[rootDeviceIndex]->kernelDeviceInfos[rootDeviceIndex].slmTotalSize);
+        EXPECT_EQ(alignUp(slmSize, 1024), pClonedKernel[rootDeviceIndex]->slmTotalSize);
     }
 }
 
@@ -219,7 +219,7 @@ TEST_F(CloneKernelTest, GivenArgBufferWhenCloningKernelThenKernelInfoIsCorrect)
         EXPECT_EQ(pSourceKernel[rootDeviceIndex]->getPatchedArgumentsNum(), pClonedKernel[rootDeviceIndex]->getPatchedArgumentsNum());
         EXPECT_EQ(pSourceKernel[rootDeviceIndex]->getKernelArgInfo(0).isPatched, pClonedKernel[rootDeviceIndex]->getKernelArgInfo(0).isPatched);
 
-        auto pKernelArg = (cl_mem *)(pClonedKernel[rootDeviceIndex]->getCrossThreadData(rootDeviceIndex) +
+        auto pKernelArg = (cl_mem *)(pClonedKernel[rootDeviceIndex]->getCrossThreadData() +
                                      pClonedKernel[rootDeviceIndex]->getKernelInfo(rootDeviceIndex).kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset);
         EXPECT_EQ(buffer->getGraphicsAllocation(rootDeviceIndex)->getGpuAddressToPatch(), reinterpret_cast<uint64_t>(*pKernelArg));
     }
@@ -256,7 +256,7 @@ TEST_F(CloneKernelTest, GivenArgPipeWhenCloningKernelThenKernelInfoIsCorrect) {
     EXPECT_EQ(pSourceKernel[rootDeviceIndex]->getPatchedArgumentsNum(), pClonedKernel[rootDeviceIndex]->getPatchedArgumentsNum());
     EXPECT_EQ(pSourceKernel[rootDeviceIndex]->getKernelArgInfo(0).isPatched, pClonedKernel[rootDeviceIndex]->getKernelArgInfo(0).isPatched);
 
-    auto pKernelArg = (cl_mem *)(pClonedKernel[rootDeviceIndex]->getCrossThreadData(rootDeviceIndex) +
+    auto pKernelArg = (cl_mem *)(pClonedKernel[rootDeviceIndex]->getCrossThreadData() +
                                  pClonedKernel[rootDeviceIndex]->getKernelInfo(rootDeviceIndex).kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset);
     EXPECT_EQ(pipe->getGraphicsAllocation(rootDeviceIndex)->getGpuAddressToPatch(), reinterpret_cast<uint64_t>(*pKernelArg));
 }
@@ -296,7 +296,7 @@ TEST_F(CloneKernelTest, GivenArgImageWhenCloningKernelThenKernelInfoIsCorrect) {
     EXPECT_EQ(pSourceKernel[rootDeviceIndex]->getPatchedArgumentsNum(), pClonedKernel[rootDeviceIndex]->getPatchedArgumentsNum());
     EXPECT_EQ(pSourceKernel[rootDeviceIndex]->getKernelArgInfo(0).isPatched, pClonedKernel[rootDeviceIndex]->getKernelArgInfo(0).isPatched);
 
-    auto crossThreadData = reinterpret_cast<uint32_t *>(pClonedKernel[rootDeviceIndex]->getCrossThreadData(rootDeviceIndex));
+    auto crossThreadData = reinterpret_cast<uint32_t *>(pClonedKernel[rootDeviceIndex]->getCrossThreadData());
     EXPECT_EQ(objectId, *crossThreadData);
 
     const auto &argInfo = pClonedKernel[rootDeviceIndex]->getKernelInfo(rootDeviceIndex).kernelArgInfo[0];
@@ -349,7 +349,7 @@ TEST_F(CloneKernelTest, GivenArgAcceleratorWhenCloningKernelThenKernelInfoIsCorr
     EXPECT_EQ(pSourceKernel[rootDeviceIndex]->getPatchedArgumentsNum(), pClonedKernel[rootDeviceIndex]->getPatchedArgumentsNum());
     EXPECT_EQ(pSourceKernel[rootDeviceIndex]->getKernelArgInfo(0).isPatched, pClonedKernel[rootDeviceIndex]->getKernelArgInfo(0).isPatched);
 
-    auto crossThreadData = reinterpret_cast<uint32_t *>(pClonedKernel[rootDeviceIndex]->getCrossThreadData(rootDeviceIndex));
+    auto crossThreadData = reinterpret_cast<uint32_t *>(pClonedKernel[rootDeviceIndex]->getCrossThreadData());
 
     const auto &argInfo = pClonedKernel[rootDeviceIndex]->getKernelInfo(rootDeviceIndex).kernelArgInfo[0];
 
@@ -403,7 +403,7 @@ TEST_F(CloneKernelTest, GivenArgSamplerWhenCloningKernelThenKernelInfoIsCorrect)
     EXPECT_EQ(pSourceKernel[rootDeviceIndex]->getPatchedArgumentsNum(), pClonedKernel[rootDeviceIndex]->getPatchedArgumentsNum());
     EXPECT_EQ(pSourceKernel[rootDeviceIndex]->getKernelArgInfo(0).isPatched, pClonedKernel[rootDeviceIndex]->getKernelArgInfo(0).isPatched);
 
-    auto crossThreadData = reinterpret_cast<uint32_t *>(pClonedKernel[rootDeviceIndex]->getCrossThreadData(rootDeviceIndex));
+    auto crossThreadData = reinterpret_cast<uint32_t *>(pClonedKernel[rootDeviceIndex]->getCrossThreadData());
     EXPECT_EQ(objectId, *crossThreadData);
 
     const auto &argInfo = pClonedKernel[rootDeviceIndex]->getKernelInfo(rootDeviceIndex).kernelArgInfo[0];
@@ -454,7 +454,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, CloneKernelTest, GivenArgDeviceQueueWhenCloningKerne
     EXPECT_EQ(pSourceKernel[rootDeviceIndex]->getPatchedArgumentsNum(), pClonedKernel[rootDeviceIndex]->getPatchedArgumentsNum());
     EXPECT_EQ(pSourceKernel[rootDeviceIndex]->getKernelArgInfo(0).isPatched, pClonedKernel[rootDeviceIndex]->getKernelArgInfo(0).isPatched);
 
-    auto pKernelArg = (uintptr_t *)(pClonedKernel[rootDeviceIndex]->getCrossThreadData(rootDeviceIndex) +
+    auto pKernelArg = (uintptr_t *)(pClonedKernel[rootDeviceIndex]->getCrossThreadData() +
                                     pClonedKernel[rootDeviceIndex]->getKernelInfo(rootDeviceIndex).kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset);
     EXPECT_EQ(static_cast<uintptr_t>(mockDevQueue.getQueueBuffer()->getGpuAddressToPatch()), *pKernelArg);
 }
@@ -485,7 +485,7 @@ TEST_F(CloneKernelTest, GivenArgSvmWhenCloningKernelThenKernelInfoIsCorrect) {
         EXPECT_EQ(pSourceKernel[rootDeviceIndex]->getPatchedArgumentsNum(), pClonedKernel[rootDeviceIndex]->getPatchedArgumentsNum());
         EXPECT_EQ(pSourceKernel[rootDeviceIndex]->getKernelArgInfo(0).isPatched, pClonedKernel[rootDeviceIndex]->getKernelArgInfo(0).isPatched);
 
-        auto pKernelArg = (void **)(pClonedKernel[rootDeviceIndex]->getCrossThreadData(rootDeviceIndex) +
+        auto pKernelArg = (void **)(pClonedKernel[rootDeviceIndex]->getCrossThreadData() +
                                     pClonedKernel[rootDeviceIndex]->getKernelInfo(rootDeviceIndex).kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset);
         EXPECT_EQ(svmPtr, *pKernelArg);
     }
@@ -518,7 +518,7 @@ TEST_F(CloneKernelTest, GivenArgSvmAllocWhenCloningKernelThenKernelInfoIsCorrect
     EXPECT_EQ(pSourceKernel[rootDeviceIndex]->getPatchedArgumentsNum(), pClonedKernel[rootDeviceIndex]->getPatchedArgumentsNum());
     EXPECT_EQ(pSourceKernel[rootDeviceIndex]->getKernelArgInfo(0).isPatched, pClonedKernel[rootDeviceIndex]->getKernelArgInfo(0).isPatched);
 
-    auto pKernelArg = (void **)(pClonedKernel[rootDeviceIndex]->getCrossThreadData(rootDeviceIndex) +
+    auto pKernelArg = (void **)(pClonedKernel[rootDeviceIndex]->getCrossThreadData() +
                                 pClonedKernel[rootDeviceIndex]->getKernelInfo(rootDeviceIndex).kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset);
     EXPECT_EQ(svmPtr, *pKernelArg);
 
@@ -552,7 +552,7 @@ TEST_F(CloneKernelTest, GivenArgImmediateWhenCloningKernelThenKernelInfoIsCorrec
         EXPECT_EQ(pSourceKernel[rootDeviceIndex]->getPatchedArgumentsNum(), pClonedKernel[rootDeviceIndex]->getPatchedArgumentsNum());
         EXPECT_EQ(pSourceKernel[rootDeviceIndex]->getKernelArgInfo(0).isPatched, pClonedKernel[rootDeviceIndex]->getKernelArgInfo(0).isPatched);
 
-        auto pKernelArg = (TypeParam *)(pClonedKernel[rootDeviceIndex]->getCrossThreadData(rootDeviceIndex) +
+        auto pKernelArg = (TypeParam *)(pClonedKernel[rootDeviceIndex]->getCrossThreadData() +
                                         pClonedKernel[rootDeviceIndex]->getKernelInfo(rootDeviceIndex).kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset);
         EXPECT_EQ(value, *pKernelArg);
     }
diff --git a/opencl/test/unit_test/kernel/kernel_accelerator_arg_tests.cpp b/opencl/test/unit_test/kernel/kernel_accelerator_arg_tests.cpp
index 8a4846a86f..fda014c310 100644
--- a/opencl/test/unit_test/kernel/kernel_accelerator_arg_tests.cpp
+++ b/opencl/test/unit_test/kernel/kernel_accelerator_arg_tests.cpp
@@ -105,7 +105,7 @@ TEST_F(KernelArgAcceleratorTest, WhenCreatingVmeAcceleratorThenCorrectKernelArgs
     status = this->pKernel->setArg(0, sizeof(cl_accelerator_intel), &accelerator);
     ASSERT_EQ(CL_SUCCESS, status);
 
-    char *crossThreadData = pKernel->getCrossThreadData(rootDeviceIndex);
+    char *crossThreadData = pKernel->getCrossThreadData();
 
     const auto &arginfo = pKernelInfo->kernelArgInfo[0];
 
diff --git a/opencl/test/unit_test/kernel/kernel_arg_buffer_tests.cpp b/opencl/test/unit_test/kernel/kernel_arg_buffer_tests.cpp
index 3ae1b3a7c4..42a738ebed 100644
--- a/opencl/test/unit_test/kernel/kernel_arg_buffer_tests.cpp
+++ b/opencl/test/unit_test/kernel/kernel_arg_buffer_tests.cpp
@@ -41,7 +41,7 @@ TEST_F(KernelArgBufferTest, GivenValidBufferWhenSettingKernelArgThenBufferAddres
     auto retVal = this->pKernel->setArg(0, sizeof(cl_mem *), pVal);
     EXPECT_EQ(CL_SUCCESS, retVal);
 
-    auto pKernelArg = (cl_mem **)(this->pKernel->getCrossThreadData(rootDeviceIndex) +
+    auto pKernelArg = (cl_mem **)(this->pKernel->getCrossThreadData() +
                                   this->pKernelInfo->kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset);
     EXPECT_EQ(buffer->getCpuAddress(), *pKernelArg);
 
@@ -127,7 +127,7 @@ TEST_F(MultiDeviceKernelArgBufferTest, GivenValidBufferWhenSettingKernelArgThenB
 
     for (auto &rootDeviceIndex : pContext->getRootDeviceIndices()) {
         auto pKernel = static_cast<MockKernel *>(pMultiDeviceKernel->getKernel(rootDeviceIndex));
-        auto pKernelArg = reinterpret_cast<size_t *>(pKernel->getCrossThreadData(rootDeviceIndex) +
+        auto pKernelArg = reinterpret_cast<size_t *>(pKernel->getCrossThreadData() +
                                                      kernelInfos[rootDeviceIndex]->kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset);
         EXPECT_EQ(pBuffer->getGraphicsAllocation(rootDeviceIndex)->getGpuAddressToPatch(), *pKernelArg);
     }
@@ -266,7 +266,7 @@ TEST_F(KernelArgBufferTest, GivenNullPtrWhenSettingKernelArgThenKernelArgIsNull)
     auto pVal = &val;
     this->pKernel->setArg(0, sizeof(cl_mem *), pVal);
 
-    auto pKernelArg = (cl_mem **)(this->pKernel->getCrossThreadData(rootDeviceIndex) +
+    auto pKernelArg = (cl_mem **)(this->pKernel->getCrossThreadData() +
                                   this->pKernelInfo->kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset);
 
     EXPECT_EQ(nullptr, *pKernelArg);
@@ -283,7 +283,7 @@ TEST_F(MultiDeviceKernelArgBufferTest, GivenNullPtrWhenSettingKernelArgThenKerne
     pMultiDeviceKernel->setArg(0, sizeof(cl_mem *), pVal);
     for (auto &rootDeviceIndex : pContext->getRootDeviceIndices()) {
         auto pKernel = static_cast<MockKernel *>(pMultiDeviceKernel->getKernel(rootDeviceIndex));
-        auto pKernelArg = reinterpret_cast<void **>(pKernel->getCrossThreadData(rootDeviceIndex) +
+        auto pKernelArg = reinterpret_cast<void **>(pKernel->getCrossThreadData() +
                                                     kernelInfos[rootDeviceIndex]->kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset);
         EXPECT_EQ(nullptr, *pKernelArg);
     }
@@ -295,7 +295,7 @@ TEST_F(KernelArgBufferTest, given32BitDeviceWhenArgPtrPassedIsNullThenOnly4Bytes
 
     this->pKernelInfo->kernelArgInfo[0].kernelArgPatchInfoVector[0].size = 4;
 
-    auto pKernelArg64bit = (uint64_t *)(this->pKernel->getCrossThreadData(rootDeviceIndex) +
+    auto pKernelArg64bit = (uint64_t *)(this->pKernel->getCrossThreadData() +
                                         this->pKernelInfo->kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset);
 
     uint32_t *pKernelArg32bit = (uint32_t *)pKernelArg64bit;
@@ -312,7 +312,7 @@ TEST_F(KernelArgBufferTest, given32BitDeviceWhenArgPtrPassedIsNullThenOnly4Bytes
 TEST_F(KernelArgBufferTest, given32BitDeviceWhenArgPassedIsNullThenOnly4BytesAreBeingPatched) {
     auto pVal = nullptr;
     this->pKernelInfo->kernelArgInfo[0].kernelArgPatchInfoVector[0].size = 4;
-    auto pKernelArg64bit = (uint64_t *)(this->pKernel->getCrossThreadData(rootDeviceIndex) +
+    auto pKernelArg64bit = (uint64_t *)(this->pKernel->getCrossThreadData() +
                                         this->pKernelInfo->kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset);
 
     *pKernelArg64bit = 0xffffffffffffffff;
@@ -537,7 +537,7 @@ HWTEST_F(KernelArgBufferTestBindless, givenUsedBindlessBuffersWhenPatchingSurfac
     pKernelInfo->kernelArgInfo[0].offsetHeap = 64;
     pKernelInfo->kernelArgInfo[0].isBuffer = true;
 
-    auto patchLocation = reinterpret_cast<uint32_t *>(ptrOffset(pKernel->getCrossThreadData(rootDeviceIndex), crossThreadDataOffset));
+    auto patchLocation = reinterpret_cast<uint32_t *>(ptrOffset(pKernel->getCrossThreadData(), crossThreadDataOffset));
     *patchLocation = 0xdead;
 
     uint32_t sshOffset = 0x1000;
@@ -565,7 +565,7 @@ TEST_F(KernelArgBufferTest, givenUsedBindlessBuffersAndNonBufferArgWhenPatchingS
     pKernelInfo->kernelArgInfo[0].offsetHeap = 64;
     pKernelInfo->kernelArgInfo[0].isBuffer = false;
 
-    auto patchLocation = reinterpret_cast<uint32_t *>(ptrOffset(pKernel->getCrossThreadData(rootDeviceIndex), crossThreadDataOffset));
+    auto patchLocation = reinterpret_cast<uint32_t *>(ptrOffset(pKernel->getCrossThreadData(), crossThreadDataOffset));
     *patchLocation = 0xdead;
 
     uint32_t sshOffset = 4000;
@@ -584,7 +584,7 @@ TEST_F(KernelArgBufferTest, givenNotUsedBindlessBuffersAndBufferArgWhenPatchingS
     pKernelInfo->kernelArgInfo[0].offsetHeap = 64;
     pKernelInfo->kernelArgInfo[0].isBuffer = true;
 
-    auto patchLocation = reinterpret_cast<uint32_t *>(ptrOffset(pKernel->getCrossThreadData(rootDeviceIndex), crossThreadDataOffset));
+    auto patchLocation = reinterpret_cast<uint32_t *>(ptrOffset(pKernel->getCrossThreadData(), crossThreadDataOffset));
     *patchLocation = 0xdead;
 
     uint32_t sshOffset = 4000;
@@ -602,7 +602,7 @@ HWTEST_F(KernelArgBufferTestBindless, givenUsedBindlessBuffersAndBuiltinKernelWh
     pKernelInfo->kernelArgInfo[0].offsetHeap = 64;
     pKernelInfo->kernelArgInfo[0].isBuffer = true;
 
-    auto patchLocation = reinterpret_cast<uint32_t *>(ptrOffset(pKernel->getCrossThreadData(rootDeviceIndex), crossThreadDataOffset));
+    auto patchLocation = reinterpret_cast<uint32_t *>(ptrOffset(pKernel->getCrossThreadData(), crossThreadDataOffset));
     *patchLocation = 0xdead;
 
     pKernel->isBuiltIn = true;
diff --git a/opencl/test/unit_test/kernel/kernel_arg_pipe_tests.cpp b/opencl/test/unit_test/kernel/kernel_arg_pipe_tests.cpp
index 2272daaf87..af6758601a 100644
--- a/opencl/test/unit_test/kernel/kernel_arg_pipe_tests.cpp
+++ b/opencl/test/unit_test/kernel/kernel_arg_pipe_tests.cpp
@@ -92,7 +92,7 @@ TEST_F(KernelArgPipeTest, GivenValidPipeWhenSettingKernelArgThenPipeAddressIsCor
     auto retVal = this->pKernel->setArg(0, sizeof(cl_mem *), pVal);
     EXPECT_EQ(CL_SUCCESS, retVal);
 
-    auto pKernelArg = (cl_mem **)(this->pKernel->getCrossThreadData(rootDeviceIndex) +
+    auto pKernelArg = (cl_mem **)(this->pKernel->getCrossThreadData() +
                                   this->pKernelInfo->kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset);
     EXPECT_EQ(pipe->getCpuAddress(), *pKernelArg);
 
diff --git a/opencl/test/unit_test/kernel/kernel_arg_svm_tests.cpp b/opencl/test/unit_test/kernel/kernel_arg_svm_tests.cpp
index 37973babbd..1b8cd662c4 100644
--- a/opencl/test/unit_test/kernel/kernel_arg_svm_tests.cpp
+++ b/opencl/test/unit_test/kernel/kernel_arg_svm_tests.cpp
@@ -86,7 +86,7 @@ TEST_F(KernelArgSvmTest, GivenValidSvmPtrWhenSettingKernelArgThenSvmPtrIsCorrect
     auto retVal = pKernel->setArgSvm(0, 256, svmPtr, nullptr, 0u);
     EXPECT_EQ(CL_SUCCESS, retVal);
 
-    auto pKernelArg = (void **)(pKernel->getCrossThreadData(rootDeviceIndex) +
+    auto pKernelArg = (void **)(pKernel->getCrossThreadData() +
                                 pKernelInfo->kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset);
     EXPECT_EQ(svmPtr, *pKernelArg);
 
@@ -137,7 +137,7 @@ TEST_F(KernelArgSvmTest, GivenValidSvmAllocWhenSettingKernelArgThenArgumentsAreS
     auto retVal = pKernel->setArgSvmAlloc(0, svmPtr, &svmAlloc);
     EXPECT_EQ(CL_SUCCESS, retVal);
 
-    auto pKernelArg = (void **)(pKernel->getCrossThreadData(rootDeviceIndex) +
+    auto pKernelArg = (void **)(pKernel->getCrossThreadData() +
                                 pKernelInfo->kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset);
     EXPECT_EQ(svmPtr, *pKernelArg);
 
@@ -238,7 +238,7 @@ HWTEST_F(KernelArgSvmTest, WhenPatchingWithImplicitSurfaceThenPatchIsApplied) {
     svmPtr.resize(256);
 
     pKernel->setCrossThreadData(nullptr, sizeof(void *));
-    pKernel->setSshLocal(nullptr, rendSurfSize, rootDeviceIndex);
+    pKernel->setSshLocal(nullptr, rendSurfSize);
     pKernelInfo->requiresSshForBuffers = true;
     pKernelInfo->usesSsh = true;
     {
@@ -252,8 +252,8 @@ HWTEST_F(KernelArgSvmTest, WhenPatchingWithImplicitSurfaceThenPatchIsApplied) {
 
         constexpr size_t patchOffset = 16;
         void *ptrToPatch = svmPtr.data() + patchOffset;
-        ASSERT_GE(pKernel->getCrossThreadDataSize(rootDeviceIndex), sizeof(void *));
-        *reinterpret_cast<void **>(pKernel->getCrossThreadData(rootDeviceIndex)) = 0U;
+        ASSERT_GE(pKernel->getCrossThreadDataSize(), sizeof(void *));
+        *reinterpret_cast<void **>(pKernel->getCrossThreadData()) = 0U;
 
         ASSERT_GE(pKernel->getSurfaceStateHeapSize(rootDeviceIndex), rendSurfSize);
         RENDER_SURFACE_STATE *surfState = reinterpret_cast<RENDER_SURFACE_STATE *>(pKernel->getSurfaceStateHeap(rootDeviceIndex));
@@ -262,7 +262,7 @@ HWTEST_F(KernelArgSvmTest, WhenPatchingWithImplicitSurfaceThenPatchIsApplied) {
         pKernel->patchWithImplicitSurface(ptrToPatch, svmAlloc, *pDevice, patch);
 
         // verify cross thread data was properly patched
-        EXPECT_EQ(ptrToPatch, *reinterpret_cast<void **>(pKernel->getCrossThreadData(rootDeviceIndex)));
+        EXPECT_EQ(ptrToPatch, *reinterpret_cast<void **>(pKernel->getCrossThreadData()));
 
         // create surface state for comparison
         RENDER_SURFACE_STATE expectedSurfaceState;
@@ -279,7 +279,7 @@ HWTEST_F(KernelArgSvmTest, WhenPatchingWithImplicitSurfaceThenPatchIsApplied) {
 
         // when cross thread and ssh data is not available then should not do anything
         pKernel->setCrossThreadData(nullptr, 0);
-        pKernel->setSshLocal(nullptr, 0, rootDeviceIndex);
+        pKernel->setSshLocal(nullptr, 0);
         pKernel->patchWithImplicitSurface(ptrToPatch, svmAlloc, *pDevice, patch);
     }
 }
@@ -294,7 +294,7 @@ TEST_F(KernelArgSvmTest, WhenPatchingBufferOffsetThenPatchIsApplied) {
         constexpr uint32_t svmOffset = 13U;
 
         MockGraphicsAllocation svmAlloc(svmPtr.data(), 256);
-        uint32_t *expectedPatchPtr = reinterpret_cast<uint32_t *>(pKernel->getCrossThreadData(rootDeviceIndex));
+        uint32_t *expectedPatchPtr = reinterpret_cast<uint32_t *>(pKernel->getCrossThreadData());
 
         KernelArgInfo kai;
         void *returnedPtr = nullptr;
@@ -390,7 +390,7 @@ HWTEST_TYPED_TEST(KernelArgSvmTestTyped, GivenBufferKernelArgWhenBufferOffsetIsN
     kai.offsetBufferOffset = kai.kernelArgPatchInfoVector[0].size;
 
     this->pKernel->setCrossThreadData(nullptr, kai.offsetBufferOffset + sizeof(uint32_t));
-    this->pKernel->setSshLocal(nullptr, rendSurfSize, rootDeviceIndex);
+    this->pKernel->setSshLocal(nullptr, rendSurfSize);
     this->pKernelInfo->requiresSshForBuffers = true;
     this->pKernelInfo->usesSsh = true;
     {
@@ -399,10 +399,10 @@ HWTEST_TYPED_TEST(KernelArgSvmTestTyped, GivenBufferKernelArgWhenBufferOffsetIsN
         constexpr size_t patchOffset = 16;
         void *ptrToPatch = svmPtr + patchOffset;
         size_t sizeToPatch = svmSize - patchOffset;
-        ASSERT_GE(this->pKernel->getCrossThreadDataSize(rootDeviceIndex), kai.offsetBufferOffset + sizeof(uint32_t));
+        ASSERT_GE(this->pKernel->getCrossThreadDataSize(), kai.offsetBufferOffset + sizeof(uint32_t));
 
-        void **expectedPointerPatchPtr = reinterpret_cast<void **>(this->pKernel->getCrossThreadData(rootDeviceIndex));
-        uint32_t *expectedOffsetPatchPtr = reinterpret_cast<uint32_t *>(ptrOffset(this->pKernel->getCrossThreadData(rootDeviceIndex), kai.offsetBufferOffset));
+        void **expectedPointerPatchPtr = reinterpret_cast<void **>(this->pKernel->getCrossThreadData());
+        uint32_t *expectedOffsetPatchPtr = reinterpret_cast<uint32_t *>(ptrOffset(this->pKernel->getCrossThreadData(), kai.offsetBufferOffset));
         *expectedPointerPatchPtr = reinterpret_cast<void *>(0U);
         *expectedOffsetPatchPtr = 0U;
 
@@ -534,7 +534,7 @@ TEST_F(KernelArgSvmTest, givenCpuAddressIsNullWhenGpuAddressIsValidThenExpectSvm
     auto retVal = pKernel->setArgSvmAlloc(0, svmPtr, &svmAlloc);
     EXPECT_EQ(CL_SUCCESS, retVal);
 
-    auto pKernelArg = (void **)(pKernel->getCrossThreadData(rootDeviceIndex) +
+    auto pKernelArg = (void **)(pKernel->getCrossThreadData() +
                                 pKernelInfo->kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset);
     EXPECT_EQ(svmPtr, *pKernelArg);
 }
@@ -548,7 +548,7 @@ TEST_F(KernelArgSvmTest, givenCpuAddressIsNullWhenGpuAddressIsValidThenPatchBuff
     constexpr uint32_t initVal = 7U;
 
     MockGraphicsAllocation svmAlloc(nullptr, reinterpret_cast<uint64_t>(svmPtr.data()), 256);
-    uint32_t *expectedPatchPtr = reinterpret_cast<uint32_t *>(pKernel->getCrossThreadData(rootDeviceIndex));
+    uint32_t *expectedPatchPtr = reinterpret_cast<uint32_t *>(pKernel->getCrossThreadData());
 
     KernelArgInfo kai;
     void *returnedPtr = nullptr;
diff --git a/opencl/test/unit_test/kernel/kernel_image_arg_tests.cpp b/opencl/test/unit_test/kernel/kernel_image_arg_tests.cpp
index 181053819c..e3bb6bf4a7 100644
--- a/opencl/test/unit_test/kernel/kernel_image_arg_tests.cpp
+++ b/opencl/test/unit_test/kernel/kernel_image_arg_tests.cpp
@@ -38,7 +38,7 @@ TEST_F(KernelImageArgTest, GivenKernelWithImageArgsWhenCheckingDifferentScenario
     pKernel->setArg(3, sizeof(memObj), &memObj);
     pKernel->setArg(4, sizeof(memObj), &memObj);
 
-    auto crossThreadData = reinterpret_cast<uint32_t *>(pKernel->getCrossThreadData(rootDeviceIndex));
+    auto crossThreadData = reinterpret_cast<uint32_t *>(pKernel->getCrossThreadData());
     auto imgWidthOffset = ptrOffset(crossThreadData, 0x4);
     EXPECT_EQ(imageWidth, *imgWidthOffset);
 
@@ -63,7 +63,7 @@ TEST_F(KernelImageArgTest, givenKernelWithFlatImageTokensWhenArgIsSetThenPatchAl
     cl_mem memObj = image.get();
 
     pKernel->setArg(0, sizeof(memObj), &memObj);
-    auto crossThreadData = reinterpret_cast<uint32_t *>(pKernel->getCrossThreadData(rootDeviceIndex));
+    auto crossThreadData = reinterpret_cast<uint32_t *>(pKernel->getCrossThreadData());
     auto pixelSize = image->getSurfaceFormatInfo().surfaceFormat.ImageElementSizeInBytes;
 
     auto offsetFlatBaseOffset = ptrOffset(crossThreadData, pKernel->getKernelInfo(rootDeviceIndex).kernelArgInfo[0].offsetFlatBaseOffset);
@@ -85,7 +85,7 @@ TEST_F(KernelImageArgTest, givenKernelWithValidOffsetNumMipLevelsWhenImageArgIsS
     cl_mem imageObj = &image;
 
     pKernel->setArg(0, sizeof(imageObj), &imageObj);
-    auto crossThreadData = reinterpret_cast<uint32_t *>(pKernel->getCrossThreadData(rootDeviceIndex));
+    auto crossThreadData = reinterpret_cast<uint32_t *>(pKernel->getCrossThreadData());
     auto patchedNumMipLevels = ptrOffset(crossThreadData, offsetNumMipLevelsImage0);
     EXPECT_EQ(7U, *patchedNumMipLevels);
 }
@@ -107,7 +107,7 @@ TEST_F(KernelImageArgTest, givenImageWithNumSamplesWhenSetArgIsCalledThenPatchNu
 
     pKernel->setArg(0, sizeof(memObj), &memObj);
 
-    auto crossThreadData = reinterpret_cast<uint32_t *>(pKernel->getCrossThreadData(rootDeviceIndex));
+    auto crossThreadData = reinterpret_cast<uint32_t *>(pKernel->getCrossThreadData());
     auto patchedNumSamples = ptrOffset(crossThreadData, 0x3c);
     EXPECT_EQ(16u, *patchedNumSamples);
 
@@ -367,7 +367,7 @@ HWTEST_F(KernelImageArgTestBindless, givenUsedBindlessImagesWhenPatchingSurfaceS
     for (size_t i = 0; i < pKernelInfo->kernelArgInfo.size(); i++) {
         pKernelInfo->kernelArgInfo[i].kernelArgPatchInfoVector[0].crossthreadOffset = 0x20 + static_cast<uint32_t>(0x20 * i);
         auto crossThreadDataOffset = pKernelInfo->kernelArgInfo[i].kernelArgPatchInfoVector[0].crossthreadOffset;
-        auto patchLocation = reinterpret_cast<uint32_t *>(ptrOffset(pKernel->getCrossThreadData(rootDeviceIndex), crossThreadDataOffset));
+        auto patchLocation = reinterpret_cast<uint32_t *>(ptrOffset(pKernel->getCrossThreadData(), crossThreadDataOffset));
         *patchLocation = 0xdead;
     }
 
@@ -378,7 +378,7 @@ HWTEST_F(KernelImageArgTestBindless, givenUsedBindlessImagesWhenPatchingSurfaceS
 
     for (size_t i = 0; i < pKernelInfo->kernelArgInfo.size(); i++) {
         auto crossThreadDataOffset = pKernelInfo->kernelArgInfo[i].kernelArgPatchInfoVector[0].crossthreadOffset;
-        auto patchLocation = reinterpret_cast<uint32_t *>(ptrOffset(pKernel->getCrossThreadData(rootDeviceIndex), crossThreadDataOffset));
+        auto patchLocation = reinterpret_cast<uint32_t *>(ptrOffset(pKernel->getCrossThreadData(), crossThreadDataOffset));
 
         if (pKernelInfo->kernelArgInfo[i].isImage) {
             DataPortBindlessSurfaceExtendedMessageDescriptor extMessageDesc;
@@ -400,7 +400,7 @@ TEST_F(KernelImageArgTest, givenUsedBindlessImagesAndNonImageArgWhenPatchingSurf
     for (size_t i = 0; i < pKernelInfo->kernelArgInfo.size(); i++) {
         pKernelInfo->kernelArgInfo[i].kernelArgPatchInfoVector[0].crossthreadOffset = 0x20 + static_cast<uint32_t>(0x20 * i);
         auto crossThreadDataOffset = pKernelInfo->kernelArgInfo[i].kernelArgPatchInfoVector[0].crossthreadOffset;
-        auto patchLocation = reinterpret_cast<uint32_t *>(ptrOffset(pKernel->getCrossThreadData(rootDeviceIndex), crossThreadDataOffset));
+        auto patchLocation = reinterpret_cast<uint32_t *>(ptrOffset(pKernel->getCrossThreadData(), crossThreadDataOffset));
         *patchLocation = 0xdead;
     }
 
@@ -411,7 +411,7 @@ TEST_F(KernelImageArgTest, givenUsedBindlessImagesAndNonImageArgWhenPatchingSurf
     pKernel->patchBindlessSurfaceStateOffsets(*pDevice, sshOffset);
 
     auto crossThreadDataOffset = pKernelInfo->kernelArgInfo[nonImageIndex].kernelArgPatchInfoVector[0].crossthreadOffset;
-    auto patchLocation = reinterpret_cast<uint32_t *>(ptrOffset(pKernel->getCrossThreadData(rootDeviceIndex), crossThreadDataOffset));
+    auto patchLocation = reinterpret_cast<uint32_t *>(ptrOffset(pKernel->getCrossThreadData(), crossThreadDataOffset));
 
     EXPECT_EQ(0xdeadu, *patchLocation);
 }
@@ -425,7 +425,7 @@ TEST_F(KernelImageArgTest, givenNotUsedBindlessImagesAndImageArgWhenPatchingSurf
     for (size_t i = 0; i < pKernelInfo->kernelArgInfo.size(); i++) {
         pKernelInfo->kernelArgInfo[i].kernelArgPatchInfoVector[0].crossthreadOffset = 0x20 + static_cast<uint32_t>(0x20 * i);
         auto crossThreadDataOffset = pKernelInfo->kernelArgInfo[i].kernelArgPatchInfoVector[0].crossthreadOffset;
-        auto patchLocation = reinterpret_cast<uint32_t *>(ptrOffset(pKernel->getCrossThreadData(rootDeviceIndex), crossThreadDataOffset));
+        auto patchLocation = reinterpret_cast<uint32_t *>(ptrOffset(pKernel->getCrossThreadData(), crossThreadDataOffset));
         *patchLocation = 0xdead;
     }
 
@@ -436,7 +436,7 @@ TEST_F(KernelImageArgTest, givenNotUsedBindlessImagesAndImageArgWhenPatchingSurf
     pKernel->patchBindlessSurfaceStateOffsets(*pDevice, sshOffset);
 
     auto crossThreadDataOffset = pKernelInfo->kernelArgInfo[nonImageIndex].kernelArgPatchInfoVector[0].crossthreadOffset;
-    auto patchLocation = reinterpret_cast<uint32_t *>(ptrOffset(pKernel->getCrossThreadData(rootDeviceIndex), crossThreadDataOffset));
+    auto patchLocation = reinterpret_cast<uint32_t *>(ptrOffset(pKernel->getCrossThreadData(), crossThreadDataOffset));
 
     EXPECT_EQ(0xdeadu, *patchLocation);
 }
diff --git a/opencl/test/unit_test/kernel/kernel_immediate_arg_tests.cpp b/opencl/test/unit_test/kernel/kernel_immediate_arg_tests.cpp
index 518e0560ea..812745792e 100644
--- a/opencl/test/unit_test/kernel/kernel_immediate_arg_tests.cpp
+++ b/opencl/test/unit_test/kernel/kernel_immediate_arg_tests.cpp
@@ -109,7 +109,7 @@ TYPED_TEST(KernelArgImmediateTest, WhenSettingKernelArgThenArgIsSetCorrectly) {
 
     for (auto &rootDeviceIndex : this->context->getRootDeviceIndices()) {
         auto pKernel = this->pMultiDeviceKernel->getKernel(rootDeviceIndex);
-        auto pKernelArg = (TypeParam *)(pKernel->getCrossThreadData(rootDeviceIndex) +
+        auto pKernelArg = (TypeParam *)(pKernel->getCrossThreadData() +
                                         this->pKernelInfo[rootDeviceIndex]->kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset);
 
         EXPECT_EQ(val, *pKernelArg);
@@ -132,7 +132,7 @@ TYPED_TEST(KernelArgImmediateTest, GivenMultipleArgumentsWhenSettingKernelArgThe
     for (auto &rootDeviceIndex : this->context->getRootDeviceIndices()) {
         auto pKernel = this->pMultiDeviceKernel->getKernel(rootDeviceIndex);
 
-        auto pKernelArg = (TypeParam *)(pKernel->getCrossThreadData(rootDeviceIndex) +
+        auto pKernelArg = (TypeParam *)(pKernel->getCrossThreadData() +
                                         this->pKernelInfo[rootDeviceIndex]->kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset);
 
         EXPECT_EQ(val, *pKernelArg);
@@ -142,7 +142,7 @@ TYPED_TEST(KernelArgImmediateTest, GivenMultipleArgumentsWhenSettingKernelArgThe
 
     for (auto &rootDeviceIndex : this->context->getRootDeviceIndices()) {
         auto pKernel = this->pMultiDeviceKernel->getKernel(rootDeviceIndex);
-        auto pKernelArg = (TypeParam *)(pKernel->getCrossThreadData(rootDeviceIndex) +
+        auto pKernelArg = (TypeParam *)(pKernel->getCrossThreadData() +
                                         this->pKernelInfo[rootDeviceIndex]->kernelArgInfo[1].kernelArgPatchInfoVector[0].crossthreadOffset);
 
         EXPECT_EQ(val, *pKernelArg);
@@ -152,7 +152,7 @@ TYPED_TEST(KernelArgImmediateTest, GivenMultipleArgumentsWhenSettingKernelArgThe
 
     for (auto &rootDeviceIndex : this->context->getRootDeviceIndices()) {
         auto pKernel = this->pMultiDeviceKernel->getKernel(rootDeviceIndex);
-        auto pKernelArg = (TypeParam *)(pKernel->getCrossThreadData(rootDeviceIndex) +
+        auto pKernelArg = (TypeParam *)(pKernel->getCrossThreadData() +
                                         this->pKernelInfo[rootDeviceIndex]->kernelArgInfo[2].kernelArgPatchInfoVector[0].crossthreadOffset);
 
         EXPECT_EQ(val, *pKernelArg);
@@ -166,7 +166,7 @@ TYPED_TEST(KernelArgImmediateTest, GivenCrossThreadDataOverwritesWhenSettingKern
     for (auto &rootDeviceIndex : this->context->getRootDeviceIndices()) {
         auto pKernel = this->pMultiDeviceKernel->getKernel(rootDeviceIndex);
 
-        TypeParam *pKernelArg = (TypeParam *)(pKernel->getCrossThreadData(rootDeviceIndex) +
+        TypeParam *pKernelArg = (TypeParam *)(pKernel->getCrossThreadData() +
                                               this->pKernelInfo[rootDeviceIndex]->kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset);
 
         EXPECT_EQ(val, *pKernelArg);
@@ -177,7 +177,7 @@ TYPED_TEST(KernelArgImmediateTest, GivenCrossThreadDataOverwritesWhenSettingKern
 
     for (auto &rootDeviceIndex : this->context->getRootDeviceIndices()) {
         auto pKernel = this->pMultiDeviceKernel->getKernel(rootDeviceIndex);
-        auto pKernelArg = (TypeParam *)(pKernel->getCrossThreadData(rootDeviceIndex) +
+        auto pKernelArg = (TypeParam *)(pKernel->getCrossThreadData() +
                                         this->pKernelInfo[rootDeviceIndex]->kernelArgInfo[1].kernelArgPatchInfoVector[0].crossthreadOffset);
 
         EXPECT_EQ(val, *pKernelArg);
@@ -187,7 +187,7 @@ TYPED_TEST(KernelArgImmediateTest, GivenCrossThreadDataOverwritesWhenSettingKern
 
     for (auto &rootDeviceIndex : this->context->getRootDeviceIndices()) {
         auto pKernel = this->pMultiDeviceKernel->getKernel(rootDeviceIndex);
-        auto pKernelArg = (TypeParam *)(pKernel->getCrossThreadData(rootDeviceIndex) +
+        auto pKernelArg = (TypeParam *)(pKernel->getCrossThreadData() +
                                         this->pKernelInfo[rootDeviceIndex]->kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset);
 
         EXPECT_EQ(val, *pKernelArg);
@@ -215,11 +215,11 @@ TYPED_TEST(KernelArgImmediateTest, GivenMultipleStructElementsWhenSettingKernelA
 
     for (auto &rootDeviceIndex : this->context->getRootDeviceIndices()) {
         auto pKernel = this->pMultiDeviceKernel->getKernel(rootDeviceIndex);
-        auto pCrossthreadA = (TypeParam *)(pKernel->getCrossThreadData(rootDeviceIndex) +
+        auto pCrossthreadA = (TypeParam *)(pKernel->getCrossThreadData() +
                                            this->pKernelInfo[rootDeviceIndex]->kernelArgInfo[3].kernelArgPatchInfoVector[0].crossthreadOffset);
         EXPECT_EQ(immediateStruct.a, *pCrossthreadA);
 
-        auto pCrossthreadB = (TypeParam *)(pKernel->getCrossThreadData(rootDeviceIndex) +
+        auto pCrossthreadB = (TypeParam *)(pKernel->getCrossThreadData() +
                                            this->pKernelInfo[rootDeviceIndex]->kernelArgInfo[3].kernelArgPatchInfoVector[1].crossthreadOffset);
         EXPECT_EQ(immediateStruct.b, *pCrossthreadB);
     }
@@ -233,7 +233,7 @@ TYPED_TEST(KernelArgImmediateTest, givenTooLargePatchSizeWhenSettingArgThenDontR
         std::memset(&memory[0], 0xaa, sizeof(TypeParam));
         std::memset(&memory[1], 0xbb, sizeof(TypeParam));
 
-        const auto destinationMemoryAddress = pKernel->getCrossThreadData(rootDeviceIndex) +
+        const auto destinationMemoryAddress = pKernel->getCrossThreadData() +
                                               this->pKernelInfo[rootDeviceIndex]->kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset;
         const auto memoryBeyondLimitAddress = destinationMemoryAddress + sizeof(TypeParam);
 
@@ -258,7 +258,7 @@ TYPED_TEST(KernelArgImmediateTest, givenNotTooLargePatchSizeWhenSettingArgThenDo
         std::memset(&memory[0], 0xaa, sizeof(TypeParam));
         std::memset(&memory[1], 0xbb, sizeof(TypeParam));
 
-        const auto destinationMemoryAddress = pKernel->getCrossThreadData(rootDeviceIndex) +
+        const auto destinationMemoryAddress = pKernel->getCrossThreadData() +
                                               this->pKernelInfo[rootDeviceIndex]->kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset;
         const auto memoryBeyondLimitAddress = destinationMemoryAddress + sizeof(TypeParam);
 
@@ -285,9 +285,9 @@ TYPED_TEST(KernelArgImmediateTest, givenMulitplePatchesAndFirstPatchSizeTooLarge
         std::memset(&memory[0], 0xaa, sizeof(TypeParam));
         std::memset(&memory[1], 0xbb, sizeof(TypeParam));
 
-        const auto destinationMemoryAddress1 = pKernel->getCrossThreadData(rootDeviceIndex) +
+        const auto destinationMemoryAddress1 = pKernel->getCrossThreadData() +
                                                this->pKernelInfo[rootDeviceIndex]->kernelArgInfo[3].kernelArgPatchInfoVector[2].crossthreadOffset;
-        const auto destinationMemoryAddress2 = pKernel->getCrossThreadData(rootDeviceIndex) +
+        const auto destinationMemoryAddress2 = pKernel->getCrossThreadData() +
                                                this->pKernelInfo[rootDeviceIndex]->kernelArgInfo[3].kernelArgPatchInfoVector[1].crossthreadOffset;
         const auto memoryBeyondLimitAddress1 = destinationMemoryAddress1 + sizeof(TypeParam);
         const auto memoryBeyondLimitAddress2 = destinationMemoryAddress2 + sizeof(TypeParam) / 2;
@@ -321,9 +321,9 @@ TYPED_TEST(KernelArgImmediateTest, givenMulitplePatchesAndSecondPatchSizeTooLarg
         std::memset(&memory[0], 0xaa, sizeof(TypeParam));
         std::memset(&memory[1], 0xbb, sizeof(TypeParam));
 
-        const auto destinationMemoryAddress1 = pKernel->getCrossThreadData(rootDeviceIndex) +
+        const auto destinationMemoryAddress1 = pKernel->getCrossThreadData() +
                                                this->pKernelInfo[rootDeviceIndex]->kernelArgInfo[3].kernelArgPatchInfoVector[2].crossthreadOffset;
-        const auto destinationMemoryAddress2 = pKernel->getCrossThreadData(rootDeviceIndex) +
+        const auto destinationMemoryAddress2 = pKernel->getCrossThreadData() +
                                                this->pKernelInfo[rootDeviceIndex]->kernelArgInfo[3].kernelArgPatchInfoVector[1].crossthreadOffset;
         const auto memoryBeyondLimitAddress1 = destinationMemoryAddress1 + sizeof(TypeParam) / 2;
         const auto memoryBeyondLimitAddress2 = destinationMemoryAddress2 + sizeof(TypeParam) / 2;
@@ -355,9 +355,9 @@ TYPED_TEST(KernelArgImmediateTest, givenMultiplePatchesAndOneSourceOffsetBeyondA
         std::memset(&memory[0], 0xaa, sizeof(TypeParam));
         std::memset(&memory[1], 0xbb, sizeof(TypeParam));
 
-        const auto destinationMemoryAddress1 = pKernel->getCrossThreadData(rootDeviceIndex) +
+        const auto destinationMemoryAddress1 = pKernel->getCrossThreadData() +
                                                this->pKernelInfo[rootDeviceIndex]->kernelArgInfo[3].kernelArgPatchInfoVector[1].crossthreadOffset;
-        const auto destinationMemoryAddress2 = pKernel->getCrossThreadData(rootDeviceIndex) +
+        const auto destinationMemoryAddress2 = pKernel->getCrossThreadData() +
                                                this->pKernelInfo[rootDeviceIndex]->kernelArgInfo[3].kernelArgPatchInfoVector[2].crossthreadOffset;
         const auto memoryBeyondLimitAddress1 = destinationMemoryAddress1 + sizeof(TypeParam);
         const auto memoryBeyondLimitAddress2 = destinationMemoryAddress2;
diff --git a/opencl/test/unit_test/kernel/kernel_slm_arg_tests.cpp b/opencl/test/unit_test/kernel/kernel_slm_arg_tests.cpp
index 32db959dc0..bcf0f9fafa 100644
--- a/opencl/test/unit_test/kernel/kernel_slm_arg_tests.cpp
+++ b/opencl/test/unit_test/kernel/kernel_slm_arg_tests.cpp
@@ -87,7 +87,7 @@ TEST_F(KernelSlmArgTest, WhenSettingSizeThenAlignmentOfHigherSlmArgsIsUpdated) {
     pMultiDeviceKernel->setArg(2, slmSize2, nullptr);
 
     for (auto &rootDeviceIndex : this->context->getRootDeviceIndices()) {
-        auto crossThreadData = reinterpret_cast<uint32_t *>(pKernel[rootDeviceIndex]->getCrossThreadData(rootDeviceIndex));
+        auto crossThreadData = reinterpret_cast<uint32_t *>(pKernel[rootDeviceIndex]->getCrossThreadData());
         auto slmOffset = ptrOffset(crossThreadData, 0x10);
         EXPECT_EQ(0u, *slmOffset);
 
@@ -97,7 +97,7 @@ TEST_F(KernelSlmArgTest, WhenSettingSizeThenAlignmentOfHigherSlmArgsIsUpdated) {
         slmOffset = ptrOffset(crossThreadData, 0x30);
         EXPECT_EQ(0x400u, *slmOffset);
 
-        EXPECT_EQ(5 * KB, pKernel[rootDeviceIndex]->kernelDeviceInfos[rootDeviceIndex].slmTotalSize);
+        EXPECT_EQ(5 * KB, pKernel[rootDeviceIndex]->slmTotalSize);
     }
 }
 
@@ -106,7 +106,7 @@ TEST_F(KernelSlmArgTest, GivenReverseOrderWhenSettingSizeThenAlignmentOfHigherSl
     pMultiDeviceKernel->setArg(0, slmSize0, nullptr);
 
     for (auto &rootDeviceIndex : this->context->getRootDeviceIndices()) {
-        auto crossThreadData = reinterpret_cast<uint32_t *>(pKernel[rootDeviceIndex]->getCrossThreadData(rootDeviceIndex));
+        auto crossThreadData = reinterpret_cast<uint32_t *>(pKernel[rootDeviceIndex]->getCrossThreadData());
         auto slmOffset = ptrOffset(crossThreadData, 0x10);
         EXPECT_EQ(0u, *slmOffset);
 
@@ -116,6 +116,6 @@ TEST_F(KernelSlmArgTest, GivenReverseOrderWhenSettingSizeThenAlignmentOfHigherSl
         slmOffset = ptrOffset(crossThreadData, 0x30);
         EXPECT_EQ(0x400u, *slmOffset);
 
-        EXPECT_EQ(5 * KB, pKernel[rootDeviceIndex]->kernelDeviceInfos[rootDeviceIndex].slmTotalSize);
+        EXPECT_EQ(5 * KB, pKernel[rootDeviceIndex]->slmTotalSize);
     }
 }
diff --git a/opencl/test/unit_test/kernel/kernel_tests.cpp b/opencl/test/unit_test/kernel/kernel_tests.cpp
index 053806bb51..fc1ada99a7 100644
--- a/opencl/test/unit_test/kernel/kernel_tests.cpp
+++ b/opencl/test/unit_test/kernel/kernel_tests.cpp
@@ -282,7 +282,7 @@ TEST_F(KernelTests, GivenKernelWorkGroupSizeWhenGettingWorkGroupInfoThenWorkGrou
     size_t paramValueSizeRet = 0;
 
     auto kernelMaxWorkGroupSize = pDevice->getDeviceInfo().maxWorkGroupSize - 1;
-    pKernel->kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize = static_cast<uint32_t>(kernelMaxWorkGroupSize);
+    pKernel->maxKernelWorkGroupSize = static_cast<uint32_t>(kernelMaxWorkGroupSize);
 
     retVal = pKernel->getWorkGroupInfo(
         *pClDevice,
@@ -587,7 +587,7 @@ TEST_F(KernelPrivateSurfaceTest, givenKernelWithPrivateSurfaceThatIsInUseByGpuWh
 
     auto &csr = pDevice->getGpgpuCommandStreamReceiver();
 
-    auto privateSurface = pKernel->kernelDeviceInfos[pDevice->getRootDeviceIndex()].privateSurface;
+    auto privateSurface = pKernel->privateSurface;
     auto tagAddress = csr.getTagAddress();
 
     privateSurface->updateTaskCount(*tagAddress + 1, csr.getOsContext().getContextId());
@@ -667,7 +667,7 @@ TEST_F(KernelPrivateSurfaceTest, given32BitDeviceWhenKernelIsCreatedThenPrivateS
 
         ASSERT_EQ(CL_SUCCESS, pKernel->initialize());
 
-        EXPECT_TRUE(pKernel->kernelDeviceInfos[pDevice->getRootDeviceIndex()].privateSurface->is32BitAllocation());
+        EXPECT_TRUE(pKernel->privateSurface->is32BitAllocation());
 
         delete pKernel;
     }
@@ -707,7 +707,7 @@ HWTEST_F(KernelPrivateSurfaceTest, givenStatefulKernelWhenKernelIsCreatedThenPri
 
     EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize(rootDeviceIndex));
 
-    auto bufferAddress = pKernel->kernelDeviceInfos[pDevice->getRootDeviceIndex()].privateSurface->getGpuAddress();
+    auto bufferAddress = pKernel->privateSurface->getGpuAddress();
 
     typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
     auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
@@ -856,7 +856,7 @@ TEST_F(KernelGlobalSurfaceTest, givenBuiltInKernelWhenKernelIsCreatedThenGlobalS
 
     ASSERT_EQ(CL_SUCCESS, pKernel->initialize());
 
-    EXPECT_EQ(bufferAddress, *(uint64_t *)pKernel->getCrossThreadData(rootDeviceIndex));
+    EXPECT_EQ(bufferAddress, *(uint64_t *)pKernel->getCrossThreadData());
 
     program.setGlobalSurface(nullptr);
     delete pKernel;
@@ -891,7 +891,7 @@ TEST_F(KernelGlobalSurfaceTest, givenNDRangeKernelWhenKernelIsCreatedThenGlobalS
 
     ASSERT_EQ(CL_SUCCESS, pKernel->initialize());
 
-    EXPECT_EQ(bufferAddress, *(uint64_t *)pKernel->getCrossThreadData(rootDeviceIndex));
+    EXPECT_EQ(bufferAddress, *(uint64_t *)pKernel->getCrossThreadData());
 
     program.setGlobalSurface(nullptr);
 
@@ -1008,7 +1008,7 @@ TEST_F(KernelConstantSurfaceTest, givenBuiltInKernelWhenKernelIsCreatedThenConst
 
     ASSERT_EQ(CL_SUCCESS, pKernel->initialize());
 
-    EXPECT_EQ(bufferAddress, *(uint64_t *)pKernel->getCrossThreadData(rootDeviceIndex));
+    EXPECT_EQ(bufferAddress, *(uint64_t *)pKernel->getCrossThreadData());
 
     program.setConstantSurface(nullptr);
     delete pKernel;
@@ -1043,7 +1043,7 @@ TEST_F(KernelConstantSurfaceTest, givenNDRangeKernelWhenKernelIsCreatedThenConst
 
     ASSERT_EQ(CL_SUCCESS, pKernel->initialize());
 
-    EXPECT_EQ(bufferAddress, *(uint64_t *)pKernel->getCrossThreadData(rootDeviceIndex));
+    EXPECT_EQ(bufferAddress, *(uint64_t *)pKernel->getCrossThreadData());
 
     program.setConstantSurface(nullptr);
 
@@ -1234,7 +1234,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelEventPoolSurfaceTest, givenKernelWithNullEvent
 
     pKernel->patchEventPool(pDevQueue);
 
-    EXPECT_EQ(123u, *(uint64_t *)pKernel->getCrossThreadData(rootDeviceIndex));
+    EXPECT_EQ(123u, *(uint64_t *)pKernel->getCrossThreadData());
 
     delete pKernel;
 }
@@ -1296,7 +1296,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelEventPoolSurfaceTest, givenStatelessKernelWhen
 
     pKernel->patchEventPool(pDevQueue);
 
-    EXPECT_EQ(pDevQueue->getEventPoolBuffer()->getGpuAddressToPatch(), *(uint64_t *)pKernel->getCrossThreadData(rootDeviceIndex));
+    EXPECT_EQ(pDevQueue->getEventPoolBuffer()->getGpuAddressToPatch(), *(uint64_t *)pKernel->getCrossThreadData());
 
     delete pKernel;
 }
@@ -1438,7 +1438,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelDefaultDeviceQueueSurfaceTest, givenKernelWith
 
     pKernel->patchDefaultDeviceQueue(pDevQueue);
 
-    EXPECT_EQ(123u, *(uint64_t *)pKernel->getCrossThreadData(rootDeviceIndex));
+    EXPECT_EQ(123u, *(uint64_t *)pKernel->getCrossThreadData());
 
     delete pKernel;
 }
@@ -1470,7 +1470,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelDefaultDeviceQueueSurfaceTest, givenStatelessK
 
     pKernel->patchDefaultDeviceQueue(pDevQueue);
 
-    EXPECT_EQ(pDevQueue->getQueueBuffer()->getGpuAddressToPatch(), *(uint64_t *)pKernel->getCrossThreadData(rootDeviceIndex));
+    EXPECT_EQ(pDevQueue->getQueueBuffer()->getGpuAddressToPatch(), *(uint64_t *)pKernel->getCrossThreadData());
 
     delete pKernel;
 }
@@ -1542,7 +1542,7 @@ HWTEST_F(KernelResidencyTest, givenKernelWhenMakeResidentIsCalledThenExportedFun
 
     // check getResidency as well
     std::vector<NEO::Surface *> residencySurfaces;
-    pKernel->getResidency(residencySurfaces, rootDeviceIndex);
+    pKernel->getResidency(residencySurfaces);
     std::unique_ptr<NEO::ExecutionEnvironment> mockCsrExecEnv;
     {
         CommandStreamReceiverMock csrMock;
@@ -1580,7 +1580,7 @@ HWTEST_F(KernelResidencyTest, givenKernelWhenMakeResidentIsCalledThenGlobalBuffe
     EXPECT_TRUE(commandStreamReceiver.isMadeResident(program.buildInfos[pDevice->getRootDeviceIndex()].globalSurface));
 
     std::vector<NEO::Surface *> residencySurfaces;
-    pKernel->getResidency(residencySurfaces, rootDeviceIndex);
+    pKernel->getResidency(residencySurfaces);
     std::unique_ptr<NEO::ExecutionEnvironment> mockCsrExecEnv;
     {
         CommandStreamReceiverMock csrMock;
@@ -2468,10 +2468,10 @@ TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenGlobalWorkOffsetIsCorr
     MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex), *pClDevice);
     ASSERT_EQ(CL_SUCCESS, kernel.initialize());
 
-    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetX);
-    EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetY);
-    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetY);
-    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetZ);
+    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.globalWorkOffsetX);
+    EXPECT_NE(nullptr, kernel.globalWorkOffsetY);
+    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.globalWorkOffsetY);
+    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.globalWorkOffsetZ);
 }
 
 TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenLocalWorkSizeIsCorrect) {
@@ -2481,10 +2481,10 @@ TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenLocalWorkSizeIsCorrect
     MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex), *pClDevice);
     ASSERT_EQ(CL_SUCCESS, kernel.initialize());
 
-    EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
-    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX);
-    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY);
-    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ);
+    EXPECT_NE(nullptr, kernel.localWorkSizeX);
+    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.localWorkSizeX);
+    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.localWorkSizeY);
+    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.localWorkSizeZ);
 }
 
 TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenLocalWorkSize2IsCorrect) {
@@ -2494,10 +2494,10 @@ TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenLocalWorkSize2IsCorrec
     MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex), *pClDevice);
     ASSERT_EQ(CL_SUCCESS, kernel.initialize());
 
-    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeX2);
-    EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY2);
-    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeY2);
-    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ2);
+    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.localWorkSizeX2);
+    EXPECT_NE(nullptr, kernel.localWorkSizeY2);
+    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.localWorkSizeY2);
+    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.localWorkSizeZ2);
 }
 
 TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenGlobalWorkSizeIsCorrect) {
@@ -2507,10 +2507,10 @@ TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenGlobalWorkSizeIsCorrec
     MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex), *pClDevice);
     ASSERT_EQ(CL_SUCCESS, kernel.initialize());
 
-    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].globalWorkSizeX);
-    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].globalWorkSizeY);
-    EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].globalWorkSizeZ);
-    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].globalWorkSizeZ);
+    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.globalWorkSizeX);
+    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.globalWorkSizeY);
+    EXPECT_NE(nullptr, kernel.globalWorkSizeZ);
+    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.globalWorkSizeZ);
 }
 
 TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenLocalWorkDimIsCorrect) {
@@ -2520,8 +2520,8 @@ TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenLocalWorkDimIsCorrect)
     MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex), *pClDevice);
     ASSERT_EQ(CL_SUCCESS, kernel.initialize());
 
-    EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].workDim);
-    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].workDim);
+    EXPECT_NE(nullptr, kernel.workDim);
+    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.workDim);
 }
 
 TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenNumWorkGroupsIsCorrect) {
@@ -2533,12 +2533,12 @@ TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenNumWorkGroupsIsCorrect
     MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex), *pClDevice);
     ASSERT_EQ(CL_SUCCESS, kernel.initialize());
 
-    EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsX);
-    EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsY);
-    EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsZ);
-    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsX);
-    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsY);
-    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].numWorkGroupsZ);
+    EXPECT_NE(nullptr, kernel.numWorkGroupsX);
+    EXPECT_NE(nullptr, kernel.numWorkGroupsY);
+    EXPECT_NE(nullptr, kernel.numWorkGroupsZ);
+    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.numWorkGroupsX);
+    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.numWorkGroupsY);
+    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.numWorkGroupsZ);
 }
 
 TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenEnqueuedLocalWorkSizeIsCorrect) {
@@ -2548,10 +2548,10 @@ TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenEnqueuedLocalWorkSizeI
     MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex), *pClDevice);
     ASSERT_EQ(CL_SUCCESS, kernel.initialize());
 
-    EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeX);
-    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeX);
-    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeY);
-    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeZ);
+    EXPECT_NE(nullptr, kernel.enqueuedLocalWorkSizeX);
+    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.enqueuedLocalWorkSizeX);
+    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.enqueuedLocalWorkSizeY);
+    EXPECT_EQ(&Kernel::dummyPatchLocation, kernel.enqueuedLocalWorkSizeZ);
 }
 
 TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenEnqueuedMaxWorkGroupSizeIsCorrect) {
@@ -2560,11 +2560,11 @@ TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenEnqueuedMaxWorkGroupSi
     MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex), *pClDevice);
     ASSERT_EQ(CL_SUCCESS, kernel.initialize());
 
-    EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].maxWorkGroupSizeForCrossThreadData);
-    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].maxWorkGroupSizeForCrossThreadData);
-    EXPECT_EQ(static_cast<void *>(kernel.getCrossThreadData(rootDeviceIndex) + pKernelInfo->workloadInfo.maxWorkGroupSizeOffset), static_cast<void *>(kernel.kernelDeviceInfos[rootDeviceIndex].maxWorkGroupSizeForCrossThreadData));
-    EXPECT_EQ(pDevice->getDeviceInfo().maxWorkGroupSize, *kernel.kernelDeviceInfos[rootDeviceIndex].maxWorkGroupSizeForCrossThreadData);
-    EXPECT_EQ(pDevice->getDeviceInfo().maxWorkGroupSize, kernel.kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize);
+    EXPECT_NE(nullptr, kernel.maxWorkGroupSizeForCrossThreadData);
+    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.maxWorkGroupSizeForCrossThreadData);
+    EXPECT_EQ(static_cast<void *>(kernel.getCrossThreadData() + pKernelInfo->workloadInfo.maxWorkGroupSizeOffset), static_cast<void *>(kernel.maxWorkGroupSizeForCrossThreadData));
+    EXPECT_EQ(pDevice->getDeviceInfo().maxWorkGroupSize, *kernel.maxWorkGroupSizeForCrossThreadData);
+    EXPECT_EQ(pDevice->getDeviceInfo().maxWorkGroupSize, kernel.maxKernelWorkGroupSize);
 }
 
 TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenDataParameterSimdSizeIsCorrect) {
@@ -2573,10 +2573,10 @@ TEST_F(KernelCrossThreadTests, WhenKernelIsInitializedThenDataParameterSimdSizeI
     MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex), *pClDevice);
     ASSERT_EQ(CL_SUCCESS, kernel.initialize());
 
-    EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].dataParameterSimdSize);
-    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].dataParameterSimdSize);
-    EXPECT_EQ(static_cast<void *>(kernel.getCrossThreadData(rootDeviceIndex) + pKernelInfo->workloadInfo.simdSizeOffset), static_cast<void *>(kernel.kernelDeviceInfos[rootDeviceIndex].dataParameterSimdSize));
-    EXPECT_EQ_VAL(pKernelInfo->getMaxSimdSize(), *kernel.kernelDeviceInfos[rootDeviceIndex].dataParameterSimdSize);
+    EXPECT_NE(nullptr, kernel.dataParameterSimdSize);
+    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.dataParameterSimdSize);
+    EXPECT_EQ(static_cast<void *>(kernel.getCrossThreadData() + pKernelInfo->workloadInfo.simdSizeOffset), static_cast<void *>(kernel.dataParameterSimdSize));
+    EXPECT_EQ_VAL(pKernelInfo->getMaxSimdSize(), *kernel.dataParameterSimdSize);
 }
 
 TEST_F(KernelCrossThreadTests, GivenParentEventOffsetWhenKernelIsInitializedThenParentEventIsInitiatedWithInvalid) {
@@ -2584,10 +2584,10 @@ TEST_F(KernelCrossThreadTests, GivenParentEventOffsetWhenKernelIsInitializedThen
     MockKernel kernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex), *pClDevice);
     ASSERT_EQ(CL_SUCCESS, kernel.initialize());
 
-    EXPECT_NE(nullptr, kernel.kernelDeviceInfos[rootDeviceIndex].parentEventOffset);
-    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.kernelDeviceInfos[rootDeviceIndex].parentEventOffset);
-    EXPECT_EQ(static_cast<void *>(kernel.getCrossThreadData(rootDeviceIndex) + pKernelInfo->workloadInfo.parentEventOffset), static_cast<void *>(kernel.kernelDeviceInfos[rootDeviceIndex].parentEventOffset));
-    EXPECT_EQ(WorkloadInfo::invalidParentEvent, *kernel.kernelDeviceInfos[rootDeviceIndex].parentEventOffset);
+    EXPECT_NE(nullptr, kernel.parentEventOffset);
+    EXPECT_NE(&Kernel::dummyPatchLocation, kernel.parentEventOffset);
+    EXPECT_EQ(static_cast<void *>(kernel.getCrossThreadData() + pKernelInfo->workloadInfo.parentEventOffset), static_cast<void *>(kernel.parentEventOffset));
+    EXPECT_EQ(WorkloadInfo::invalidParentEvent, *kernel.parentEventOffset);
 }
 
 TEST_F(KernelCrossThreadTests, WhenAddingKernelThenProgramRefCountIsIncremented) {
@@ -2608,7 +2608,7 @@ TEST_F(KernelCrossThreadTests, GivenSlmStatisSizeWhenCreatingKernelThenSlmTotalS
 
     MockKernel *kernel = new MockKernel(program.get(), MockKernel::toKernelInfoContainer(*pKernelInfo, rootDeviceIndex), *pClDevice);
 
-    EXPECT_EQ(1024u, kernel->kernelDeviceInfos[rootDeviceIndex].slmTotalSize);
+    EXPECT_EQ(1024u, kernel->slmTotalSize);
 
     delete kernel;
 }
@@ -2623,9 +2623,9 @@ TEST_F(KernelCrossThreadTests, givenKernelWithPrivateMemoryWhenItIsCreatedThenCu
 
     kernel->initialize();
 
-    auto privateSurface = kernel->kernelDeviceInfos[pDevice->getRootDeviceIndex()].privateSurface;
+    auto privateSurface = kernel->privateSurface;
 
-    auto constantBuffer = kernel->getCrossThreadData(rootDeviceIndex);
+    auto constantBuffer = kernel->getCrossThreadData();
     auto privateAddress = (uintptr_t)privateSurface->getGpuAddressToPatch();
     auto ptrCurbe = (uint64_t *)constantBuffer;
     auto privateAddressFromCurbe = (uintptr_t)*ptrCurbe;
@@ -2642,7 +2642,7 @@ TEST_F(KernelCrossThreadTests, givenKernelWithPreferredWkgMultipleWhenItIsCreate
 
     kernel->initialize();
 
-    auto *crossThread = kernel->getCrossThreadData(rootDeviceIndex);
+    auto *crossThread = kernel->getCrossThreadData();
 
     uint32_t *preferredWkgMultipleOffset = (uint32_t *)ptrOffset(crossThread, 8);
 
@@ -2667,7 +2667,7 @@ TEST_F(KernelCrossThreadTests, WhenPatchingBlocksSimdSizeThenSimdSizeIsPatchedCo
     kernel->mockKernel->patchBlocksSimdSize(rootDeviceIndex);
 
     // obtain block's simd size from cross thread data
-    void *blockSimdSize = ptrOffset(kernel->mockKernel->getCrossThreadData(rootDeviceIndex), kernel->kernelInfo.childrenKernelsIdOffset[0].second);
+    void *blockSimdSize = ptrOffset(kernel->mockKernel->getCrossThreadData(), kernel->kernelInfo.childrenKernelsIdOffset[0].second);
     uint32_t *simdSize = reinterpret_cast<uint32_t *>(blockSimdSize);
 
     // check of block's simd size has been patched correctly
@@ -3419,7 +3419,7 @@ TEST_F(KernelMultiRootDeviceTest, givenKernelWithPrivateSurfaceWhenInitializeThe
 
     for (auto &rootDeviceIndex : context->getRootDeviceIndices()) {
         auto kernel = static_cast<MockKernel *>(pMultiDeviceKernel->getKernel(rootDeviceIndex));
-        auto privateSurface = kernel->kernelDeviceInfos[rootDeviceIndex].privateSurface;
+        auto privateSurface = kernel->privateSurface;
         ASSERT_NE(nullptr, privateSurface);
         EXPECT_EQ(rootDeviceIndex, privateSurface->getRootDeviceIndex());
     }
diff --git a/opencl/test/unit_test/kernel/parent_kernel_tests.cpp b/opencl/test/unit_test/kernel/parent_kernel_tests.cpp
index 888d875102..c6fa88e595 100644
--- a/opencl/test/unit_test/kernel/parent_kernel_tests.cpp
+++ b/opencl/test/unit_test/kernel/parent_kernel_tests.cpp
@@ -68,7 +68,7 @@ TEST(ParentKernelTest, WhenPatchingBlocksSimdSizeThenPatchIsAppliedCorrectly) {
 
     parentKernel->patchBlocksSimdSize(rootDeviceIndex);
 
-    void *blockSimdSize = ptrOffset(parentKernel->getCrossThreadData(rootDeviceIndex), parentKernel->getKernelInfo(rootDeviceIndex).childrenKernelsIdOffset[0].second);
+    void *blockSimdSize = ptrOffset(parentKernel->getCrossThreadData(), parentKernel->getKernelInfo(rootDeviceIndex).childrenKernelsIdOffset[0].second);
     uint32_t *simdSize = reinterpret_cast<uint32_t *>(blockSimdSize);
 
     EXPECT_EQ(program->blockKernelManager->getBlockKernelInfo(0)->getMaxSimdSize(), *simdSize);
@@ -99,7 +99,7 @@ TEST(ParentKernelTest, WhenInitializingParentKernelThenBlocksSimdSizeIsPatched)
 
     parentKernel->initialize();
 
-    void *blockSimdSize = ptrOffset(parentKernel->getCrossThreadData(rootDeviceIndex), parentKernel->getKernelInfo(rootDeviceIndex).childrenKernelsIdOffset[0].second);
+    void *blockSimdSize = ptrOffset(parentKernel->getCrossThreadData(), parentKernel->getKernelInfo(rootDeviceIndex).childrenKernelsIdOffset[0].second);
     uint32_t *simdSize = reinterpret_cast<uint32_t *>(blockSimdSize);
 
     EXPECT_EQ(program->blockKernelManager->getBlockKernelInfo(0)->getMaxSimdSize(), *simdSize);
diff --git a/opencl/test/unit_test/mem_obj/buffer_set_arg_tests.cpp b/opencl/test/unit_test/mem_obj/buffer_set_arg_tests.cpp
index 1c8b149e7d..19a3f0c02b 100644
--- a/opencl/test/unit_test/mem_obj/buffer_set_arg_tests.cpp
+++ b/opencl/test/unit_test/mem_obj/buffer_set_arg_tests.cpp
@@ -104,7 +104,7 @@ class BufferSetArgTest : public ContextFixture,
 };
 
 TEST_F(BufferSetArgTest, WhenSettingKernelArgBufferThenGpuAddressIsSet) {
-    auto pKernelArg = (void **)(pKernel->getCrossThreadData(rootDeviceIndex) +
+    auto pKernelArg = (void **)(pKernel->getCrossThreadData() +
                                 pKernelInfo->kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset);
 
     auto tokenSize = pKernelInfo->kernelArgInfo[0].kernelArgPatchInfoVector[0].size;
@@ -206,7 +206,7 @@ HWTEST_F(BufferSetArgTest, givenNonPureStatefulArgWhenRenderCompressedBufferIsSe
 }
 
 TEST_F(BufferSetArgTest, Given32BitAddressingWhenSettingArgStatelessThenGpuAddressIsSetCorrectly) {
-    auto pKernelArg = (void **)(pKernel->getCrossThreadData(rootDeviceIndex) +
+    auto pKernelArg = (void **)(pKernel->getCrossThreadData() +
                                 pKernelInfo->kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset);
 
     auto tokenSize = pKernelInfo->kernelArgInfo[0].kernelArgPatchInfoVector[0].size;
@@ -229,7 +229,7 @@ TEST_F(BufferSetArgTest, givenBufferWhenOffsetedSubbufferIsPassedToSetKernelArgT
 
     EXPECT_EQ(ptrOffset(buffer->getCpuAddress(), region.origin), subBuffer->getCpuAddress());
 
-    auto pKernelArg = (void **)(pKernel->getCrossThreadData(rootDeviceIndex) +
+    auto pKernelArg = (void **)(pKernel->getCrossThreadData() +
                                 pKernelInfo->kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset);
 
     auto tokenSize = pKernelInfo->kernelArgInfo[0].kernelArgPatchInfoVector[0].size;
@@ -241,7 +241,7 @@ TEST_F(BufferSetArgTest, givenBufferWhenOffsetedSubbufferIsPassedToSetKernelArgT
 }
 
 TEST_F(BufferSetArgTest, givenCurbeTokenThatSizeIs4BytesWhenStatelessArgIsPatchedThenOnly4BytesArePatchedInCurbe) {
-    auto pKernelArg = (void **)(pKernel->getCrossThreadData(rootDeviceIndex) +
+    auto pKernelArg = (void **)(pKernel->getCrossThreadData() +
                                 pKernelInfo->kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset);
 
     //fill 8 bytes with 0xffffffffffffffff;
@@ -275,13 +275,13 @@ TEST_F(BufferSetArgTest, WhenSettingKernelArgThenAddressToPatchIsSetCorrectlyAnd
         &memObj);
     ASSERT_EQ(CL_SUCCESS, retVal);
 
-    auto pKernelArg = (void **)(pKernel->getCrossThreadData(rootDeviceIndex) +
+    auto pKernelArg = (void **)(pKernel->getCrossThreadData() +
                                 pKernelInfo->kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset);
 
     EXPECT_EQ(reinterpret_cast<void *>(buffer->getGraphicsAllocation(pClDevice->getRootDeviceIndex())->getGpuAddressToPatch()), *pKernelArg);
 
     std::vector<Surface *> surfaces;
-    pKernel->getResidency(surfaces, rootDeviceIndex);
+    pKernel->getResidency(surfaces);
     EXPECT_EQ(1u, surfaces.size());
 
     for (auto &surface : surfaces) {
@@ -305,13 +305,13 @@ TEST_F(BufferSetArgTest, GivenSvmPointerWhenSettingKernelArgThenAddressToPatchIs
         pSvmAlloc);
     ASSERT_EQ(CL_SUCCESS, retVal);
 
-    auto pKernelArg = (void **)(pKernel->getCrossThreadData(rootDeviceIndex) +
+    auto pKernelArg = (void **)(pKernel->getCrossThreadData() +
                                 pKernelInfo->kernelArgInfo[0].kernelArgPatchInfoVector[0].crossthreadOffset);
 
     EXPECT_EQ(ptrSVM, *pKernelArg);
 
     std::vector<Surface *> surfaces;
-    pKernel->getResidency(surfaces, rootDeviceIndex);
+    pKernel->getResidency(surfaces);
     EXPECT_EQ(1u, surfaces.size());
     for (auto &surface : surfaces) {
         delete surface;
@@ -348,7 +348,7 @@ TEST_F(BufferSetArgTest, givenKernelArgBufferWhenAddPathInfoDataIsSetThenPatchIn
     EXPECT_EQ(PatchInfoAllocationType::KernelArg, pKernel->getPatchInfoDataList()[0].sourceType);
     EXPECT_EQ(PatchInfoAllocationType::IndirectObjectHeap, pKernel->getPatchInfoDataList()[0].targetType);
     EXPECT_EQ(buffer->getGraphicsAllocation(pClDevice->getRootDeviceIndex())->getGpuAddressToPatch(), pKernel->getPatchInfoDataList()[0].sourceAllocation);
-    EXPECT_EQ(reinterpret_cast<uint64_t>(pKernel->getCrossThreadData(rootDeviceIndex)), pKernel->getPatchInfoDataList()[0].targetAllocation);
+    EXPECT_EQ(reinterpret_cast<uint64_t>(pKernel->getCrossThreadData()), pKernel->getPatchInfoDataList()[0].targetAllocation);
     EXPECT_EQ(0u, pKernel->getPatchInfoDataList()[0].sourceAllocationOffset);
 }
 
diff --git a/opencl/test/unit_test/mem_obj/image_set_arg_tests.cpp b/opencl/test/unit_test/mem_obj/image_set_arg_tests.cpp
index c8835cc648..890c526275 100644
--- a/opencl/test/unit_test/mem_obj/image_set_arg_tests.cpp
+++ b/opencl/test/unit_test/mem_obj/image_set_arg_tests.cpp
@@ -133,7 +133,7 @@ HWTEST_F(ImageSetArgTest, WhenSettingKernelArgImageThenSurfaceBaseAddressIsSetCo
     EXPECT_EQ(srcAllocation->getGpuAddress(), surfaceAddress);
 
     std::vector<Surface *> surfaces;
-    pKernel->getResidency(surfaces, rootDeviceIndex);
+    pKernel->getResidency(surfaces);
     EXPECT_EQ(0u, surfaces.size());
 }
 
@@ -343,7 +343,7 @@ HWTEST_F(ImageSetArgTest, givenOffsetedBufferWhenSetKernelArgImageIscalledThenFu
     EXPECT_EQ(srcAllocation->getGpuAddress(), surfaceAddress);
 
     std::vector<Surface *> surfaces;
-    pKernel->getResidency(surfaces, rootDeviceIndex);
+    pKernel->getResidency(surfaces);
     EXPECT_EQ(0u, surfaces.size());
 }
 
@@ -386,7 +386,7 @@ HWTEST_F(ImageSetArgTest, WhenSettingKernelArgThenPropertiesAreSetCorrectly) {
     EXPECT_EQ(0u, surfaceState->getCoherencyType());
 
     std::vector<Surface *> surfaces;
-    pKernel->getResidency(surfaces, rootDeviceIndex);
+    pKernel->getResidency(surfaces);
     EXPECT_EQ(1u, surfaces.size());
 
     for (auto &surface : surfaces) {
@@ -458,7 +458,7 @@ HWTEST_F(ImageSetArgTest, Given2dArrayWhenSettingKernelArgThenPropertiesAreSetCo
     EXPECT_EQ(RENDER_SURFACE_STATE::SHADER_CHANNEL_SELECT_ALPHA, surfaceState->getShaderChannelSelectAlpha());
 
     std::vector<Surface *> surfaces;
-    pKernel->getResidency(surfaces, rootDeviceIndex);
+    pKernel->getResidency(surfaces);
     EXPECT_EQ(1u, surfaces.size());
     for (auto &surface : surfaces) {
         delete surface;
@@ -506,7 +506,7 @@ HWTEST_F(ImageSetArgTest, Given1dArrayWhenSettingKernelArgThenPropertiesAreSetCo
     EXPECT_EQ(RENDER_SURFACE_STATE::SHADER_CHANNEL_SELECT_ALPHA, surfaceState->getShaderChannelSelectAlpha());
 
     std::vector<Surface *> surfaces;
-    pKernel->getResidency(surfaces, rootDeviceIndex);
+    pKernel->getResidency(surfaces);
     EXPECT_EQ(1u, surfaces.size());
     for (auto &surface : surfaces) {
         delete surface;
@@ -852,7 +852,7 @@ HWTEST_F(ImageSetArgTest, GivenImageWithClLuminanceFormatWhenSettingKernelArgThe
     EXPECT_EQ(RENDER_SURFACE_STATE::SHADER_CHANNEL_SELECT_ALPHA, surfaceState->getShaderChannelSelectAlpha());
 
     std::vector<Surface *> surfaces;
-    pKernel->getResidency(surfaces, rootDeviceIndex);
+    pKernel->getResidency(surfaces);
     EXPECT_EQ(1u, surfaces.size());
     for (auto &surface : surfaces) {
         delete surface;
@@ -872,7 +872,7 @@ HWTEST_F(ImageSetArgTest, WhenSettingArgThenImageIsReturned) {
     EXPECT_EQ(memObj, pKernel->getKernelArg(0));
 
     std::vector<Surface *> surfaces;
-    pKernel->getResidency(surfaces, rootDeviceIndex);
+    pKernel->getResidency(surfaces);
     EXPECT_EQ(1u, surfaces.size());
 
     for (auto &surface : surfaces) {
@@ -999,7 +999,7 @@ HWTEST_F(ImageMediaBlockSetArgTest, WhenSettingKernelArgImageThenPropertiesAreCo
     EXPECT_EQ(imageMocs, surfaceState->getMemoryObjectControlState());
 
     std::vector<Surface *> surfaces;
-    pKernel->getResidency(surfaces, rootDeviceIndex);
+    pKernel->getResidency(surfaces);
     EXPECT_EQ(1u, surfaces.size());
 
     for (auto &surface : surfaces) {
diff --git a/opencl/test/unit_test/memory_manager/memory_manager_tests.cpp b/opencl/test/unit_test/memory_manager/memory_manager_tests.cpp
index 94b412ebaf..6f67712929 100644
--- a/opencl/test/unit_test/memory_manager/memory_manager_tests.cpp
+++ b/opencl/test/unit_test/memory_manager/memory_manager_tests.cpp
@@ -541,7 +541,7 @@ TEST_F(MemoryAllocatorTest, givenStatelessKernelWithPrintfWhenPrintfSurfaceIsCre
     auto printfAllocation = printfHandler->getSurface();
     auto allocationAddress = printfAllocation->getGpuAddressToPatch();
 
-    auto printfPatchAddress = ptrOffset(reinterpret_cast<uintptr_t *>(kernel.mockKernel->getCrossThreadData(rootDeviceIndex)),
+    auto printfPatchAddress = ptrOffset(reinterpret_cast<uintptr_t *>(kernel.mockKernel->getCrossThreadData()),
                                         kernel.mockKernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.payloadMappings.implicitArgs.printfSurfaceAddress.stateless);
 
     EXPECT_EQ(allocationAddress, *(uintptr_t *)printfPatchAddress);
diff --git a/opencl/test/unit_test/mocks/mock_kernel.cpp b/opencl/test/unit_test/mocks/mock_kernel.cpp
index b243dc1a83..43a0ea6cd1 100644
--- a/opencl/test/unit_test/mocks/mock_kernel.cpp
+++ b/opencl/test/unit_test/mocks/mock_kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017-2020 Intel Corporation
+ * Copyright (C) 2017-2021 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -56,9 +56,9 @@ void MockKernel::makeResident(CommandStreamReceiver &commandStreamReceiver) {
     Kernel::makeResident(commandStreamReceiver);
 }
 
-void MockKernel::getResidency(std::vector<Surface *> &dst, uint32_t rootDeviceIndex) {
+void MockKernel::getResidency(std::vector<Surface *> &dst) {
     getResidencyCalls++;
-    Kernel::getResidency(dst, rootDeviceIndex);
+    Kernel::getResidency(dst);
 }
 bool MockKernel::requiresCacheFlushCommand(const CommandQueue &commandQueue) const {
     if (DebugManager.flags.EnableCacheFlushAfterWalker.get() != -1) {
diff --git a/opencl/test/unit_test/mocks/mock_kernel.h b/opencl/test/unit_test/mocks/mock_kernel.h
index 8285ad6fa1..efd10b03f8 100644
--- a/opencl/test/unit_test/mocks/mock_kernel.h
+++ b/opencl/test/unit_test/mocks/mock_kernel.h
@@ -116,8 +116,18 @@ class MockKernel : public Kernel {
     using Kernel::allBufferArgsStateful;
     using Kernel::auxTranslationRequired;
     using Kernel::containsStatelessWrites;
+    using Kernel::dataParameterSimdSize;
+    using Kernel::enqueuedLocalWorkSizeX;
+    using Kernel::enqueuedLocalWorkSizeY;
+    using Kernel::enqueuedLocalWorkSizeZ;
     using Kernel::executionType;
     using Kernel::getDevice;
+    using Kernel::globalWorkOffsetX;
+    using Kernel::globalWorkOffsetY;
+    using Kernel::globalWorkOffsetZ;
+    using Kernel::globalWorkSizeX;
+    using Kernel::globalWorkSizeY;
+    using Kernel::globalWorkSizeZ;
     using Kernel::hasDirectStatelessAccessToHostMemory;
     using Kernel::hasIndirectStatelessAccessToHostMemory;
     using Kernel::isSchedulerKernel;
@@ -125,17 +135,35 @@ class MockKernel : public Kernel {
     using Kernel::kernelArgRequiresCacheFlush;
     using Kernel::kernelArguments;
     using Kernel::KernelConfig;
-    using Kernel::kernelDeviceInfos;
     using Kernel::kernelHasIndirectAccess;
     using Kernel::kernelSubmissionMap;
     using Kernel::kernelSvmGfxAllocations;
     using Kernel::kernelUnifiedMemoryGfxAllocations;
+    using Kernel::localWorkSizeX;
+    using Kernel::localWorkSizeX2;
+    using Kernel::localWorkSizeY;
+    using Kernel::localWorkSizeY2;
+    using Kernel::localWorkSizeZ;
+    using Kernel::localWorkSizeZ2;
+    using Kernel::maxKernelWorkGroupSize;
+    using Kernel::maxWorkGroupSizeForCrossThreadData;
+    using Kernel::numberOfBindingTableStates;
+    using Kernel::numWorkGroupsX;
+    using Kernel::numWorkGroupsY;
+    using Kernel::numWorkGroupsZ;
+    using Kernel::parentEventOffset;
     using Kernel::patchBufferOffset;
     using Kernel::patchWithImplicitSurface;
+    using Kernel::preferredWkgMultipleOffset;
+    using Kernel::privateSurface;
     using Kernel::singleSubdevicePreferedInCurrentEnqueue;
     using Kernel::svmAllocationsRequireCacheFlush;
     using Kernel::threadArbitrationPolicy;
     using Kernel::unifiedMemoryControls;
+    using Kernel::workDim;
+
+    using Kernel::slmSizes;
+    using Kernel::slmTotalSize;
 
     struct BlockPatchValues {
         uint64_t offset;
@@ -190,10 +218,8 @@ class MockKernel : public Kernel {
 
     ~MockKernel() override {
         // prevent double deletion
-        for (auto rootDeviceIndex = 0u; rootDeviceIndex < kernelDeviceInfos.size(); rootDeviceIndex++) {
-            if (kernelDeviceInfos[rootDeviceIndex].crossThreadData == mockCrossThreadData.data()) {
-                kernelDeviceInfos[rootDeviceIndex].crossThreadData = nullptr;
-            }
+        if (crossThreadData == mockCrossThreadData.data()) {
+            crossThreadData = nullptr;
         }
 
         if (kernelInfoAllocated) {
@@ -230,9 +256,9 @@ class MockKernel : public Kernel {
         kernelInfos.resize(rootDeviceIndex + 1);
         kernelInfos[rootDeviceIndex] = info;
         auto kernel = new KernelType(program, kernelInfos, *device.getSpecializedDevice<ClDevice>());
-        kernel->kernelDeviceInfos[rootDeviceIndex].crossThreadData = new char[crossThreadSize];
-        memset(kernel->kernelDeviceInfos[rootDeviceIndex].crossThreadData, 0, crossThreadSize);
-        kernel->kernelDeviceInfos[rootDeviceIndex].crossThreadDataSize = crossThreadSize;
+        kernel->crossThreadData = new char[crossThreadSize];
+        memset(kernel->crossThreadData, 0, crossThreadSize);
+        kernel->crossThreadDataSize = crossThreadSize;
 
         kernel->kernelInfoAllocated = info;
 
@@ -249,11 +275,10 @@ class MockKernel : public Kernel {
 
     ////////////////////////////////////////////////////////////////////////////////
     void setCrossThreadData(const void *crossThreadDataPattern, uint32_t newCrossThreadDataSize) {
-        auto rootDeviceIndex = defaultRootDeviceIndex;
-        if ((kernelDeviceInfos[rootDeviceIndex].crossThreadData != nullptr) && (kernelDeviceInfos[rootDeviceIndex].crossThreadData != mockCrossThreadData.data())) {
-            delete[] kernelDeviceInfos[rootDeviceIndex].crossThreadData;
-            kernelDeviceInfos[rootDeviceIndex].crossThreadData = nullptr;
-            kernelDeviceInfos[rootDeviceIndex].crossThreadDataSize = 0;
+        if ((crossThreadData != nullptr) && (crossThreadData != mockCrossThreadData.data())) {
+            delete[] crossThreadData;
+            crossThreadData = nullptr;
+            crossThreadDataSize = 0;
         }
         if (crossThreadDataPattern && (newCrossThreadDataSize > 0)) {
             mockCrossThreadData.clear();
@@ -263,41 +288,34 @@ class MockKernel : public Kernel {
         }
 
         if (newCrossThreadDataSize == 0) {
-            kernelDeviceInfos[rootDeviceIndex].crossThreadData = nullptr;
-            kernelDeviceInfos[rootDeviceIndex].crossThreadDataSize = 0;
+            crossThreadData = nullptr;
+            crossThreadDataSize = 0;
             return;
         }
-        kernelDeviceInfos[rootDeviceIndex].crossThreadData = mockCrossThreadData.data();
-        kernelDeviceInfos[rootDeviceIndex].crossThreadDataSize = static_cast<uint32_t>(mockCrossThreadData.size());
+        crossThreadData = mockCrossThreadData.data();
+        crossThreadDataSize = static_cast<uint32_t>(mockCrossThreadData.size());
     }
 
-    void setSshLocal(const void *sshPattern, uint32_t newSshSize, uint32_t rootDeviceIndex) {
-        kernelDeviceInfos[rootDeviceIndex].sshLocalSize = newSshSize;
+    void setSshLocal(const void *sshPattern, uint32_t newSshSize) {
+        sshLocalSize = newSshSize;
 
         if (newSshSize == 0) {
-            kernelDeviceInfos[rootDeviceIndex].pSshLocal.reset(nullptr);
+            pSshLocal.reset(nullptr);
         } else {
-            kernelDeviceInfos[rootDeviceIndex].pSshLocal = std::make_unique<char[]>(newSshSize);
+            pSshLocal = std::make_unique<char[]>(newSshSize);
             if (sshPattern) {
-                memcpy_s(kernelDeviceInfos[rootDeviceIndex].pSshLocal.get(), newSshSize, sshPattern, newSshSize);
+                memcpy_s(pSshLocal.get(), newSshSize, sshPattern, newSshSize);
             }
         }
     }
 
     void setPrivateSurface(GraphicsAllocation *gfxAllocation, uint32_t size) {
-        if (gfxAllocation) {
-            kernelDeviceInfos[gfxAllocation->getRootDeviceIndex()].privateSurface = gfxAllocation;
-            kernelDeviceInfos[gfxAllocation->getRootDeviceIndex()].privateSurfaceSize = size;
-        } else {
-            for (auto &kernelDeviceInfo : kernelDeviceInfos) {
-                kernelDeviceInfo.privateSurface = gfxAllocation;
-                kernelDeviceInfo.privateSurfaceSize = size;
-            }
-        }
+        privateSurface = gfxAllocation;
+        privateSurfaceSize = size;
     }
 
-    void setTotalSLMSize(uint32_t rootDeviceIndex, uint32_t size) {
-        kernelDeviceInfos[rootDeviceIndex].slmTotalSize = size;
+    void setTotalSLMSize(uint32_t size) {
+        slmTotalSize = size;
     }
 
     void setKernelArguments(std::vector<SimpleKernelArgInfo> kernelArguments) {
@@ -314,7 +332,7 @@ class MockKernel : public Kernel {
     void setUsingSharedArgs(bool usingSharedArgValue) { this->usingSharedObjArgs = usingSharedArgValue; }
 
     void makeResident(CommandStreamReceiver &commandStreamReceiver) override;
-    void getResidency(std::vector<Surface *> &dst, uint32_t rootDeviceIndex) override;
+    void getResidency(std::vector<Surface *> &dst) override;
 
     void setSpecialPipelineSelectMode(bool value) { specialPipelineSelectMode = value; }
 
@@ -391,9 +409,7 @@ class MockKernelWithInternals {
         }
         mockMultiDeviceKernel = new MockMultiDeviceKernel(std::move(mockKernels));
 
-        for (const auto &pClDevice : deviceVector) {
-            mockKernel->setSshLocal(&sshLocal, sizeof(sshLocal), pClDevice->getRootDeviceIndex());
-        }
+        mockKernel->setSshLocal(&sshLocal, sizeof(sshLocal));
 
         if (addDefaultArg) {
             defaultKernelArguments.resize(2);
@@ -462,9 +478,10 @@ class MockKernelWithInternals {
 class MockParentKernel : public Kernel {
   public:
     using Kernel::auxTranslationRequired;
-    using Kernel::kernelDeviceInfos;
     using Kernel::kernelInfos;
     using Kernel::patchBlocksCurbeWithConstantValues;
+    using Kernel::pSshLocal;
+    using Kernel::sshLocalSize;
 
     static MockParentKernel *create(Context &context, bool addChildSimdSize = false, bool addChildGlobalMemory = false, bool addChildConstantMemory = false, bool addPrintfForParent = true, bool addPrintfForBlock = true) {
         auto clDevice = context.getDevice(0);
@@ -531,9 +548,9 @@ class MockParentKernel : public Kernel {
         info->crossThreadData = new char[crossThreadSize];
 
         auto parent = new MockParentKernel(mockProgram, kernelInfos);
-        parent->kernelDeviceInfos[rootDeviceIndex].crossThreadData = new char[crossThreadSize];
-        memset(parent->kernelDeviceInfos[rootDeviceIndex].crossThreadData, 0, crossThreadSize);
-        parent->kernelDeviceInfos[rootDeviceIndex].crossThreadDataSize = crossThreadSize;
+        parent->crossThreadData = new char[crossThreadSize];
+        memset(parent->crossThreadData, 0, crossThreadSize);
+        parent->crossThreadDataSize = crossThreadSize;
         parent->mockKernelInfo = info;
 
         auto infoBlock = new KernelInfo();
@@ -665,7 +682,24 @@ class MockParentKernel : public Kernel {
 
 class MockSchedulerKernel : public SchedulerKernel {
   public:
-    using SchedulerKernel::kernelDeviceInfos;
+    using Kernel::enqueuedLocalWorkSizeX;
+    using Kernel::enqueuedLocalWorkSizeY;
+    using Kernel::enqueuedLocalWorkSizeZ;
+    using Kernel::globalWorkOffsetX;
+    using Kernel::globalWorkOffsetY;
+    using Kernel::globalWorkOffsetZ;
+    using Kernel::globalWorkSizeX;
+    using Kernel::globalWorkSizeY;
+    using Kernel::globalWorkSizeZ;
+    using Kernel::localWorkSizeX;
+    using Kernel::localWorkSizeX2;
+    using Kernel::localWorkSizeY;
+    using Kernel::localWorkSizeY2;
+    using Kernel::localWorkSizeZ;
+    using Kernel::localWorkSizeZ2;
+    using Kernel::numWorkGroupsX;
+    using Kernel::numWorkGroupsY;
+    using Kernel::numWorkGroupsZ;
     MockSchedulerKernel(Program *programArg, const KernelInfoContainer &kernelInfoArg, ClDevice &clDeviceArg) : SchedulerKernel(programArg, kernelInfoArg, clDeviceArg){};
 };
 
diff --git a/opencl/test/unit_test/program/program_tests.cpp b/opencl/test/unit_test/program/program_tests.cpp
index e394c11b3d..5875ba9008 100644
--- a/opencl/test/unit_test/program/program_tests.cpp
+++ b/opencl/test/unit_test/program/program_tests.cpp
@@ -1374,7 +1374,7 @@ HWTEST_F(PatchTokenTests, givenKernelRequiringConstantAllocationWhenMakeResident
     element = std::find(residencyVector.begin(), residencyVector.end(), constantAllocation);
     EXPECT_NE(residencyVector.end(), element);
 
-    auto crossThreadData = pKernel->getCrossThreadData(rootDeviceIndex);
+    auto crossThreadData = pKernel->getCrossThreadData();
     uint32_t *constBuffGpuAddr = reinterpret_cast<uint32_t *>(pProgram->getConstantSurface(pContext->getDevice(0)->getRootDeviceIndex())->getGpuAddressToPatch());
     uintptr_t *pDst = reinterpret_cast<uintptr_t *>(crossThreadData + pKernelInfo->kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress.stateless);
 
@@ -1384,7 +1384,7 @@ HWTEST_F(PatchTokenTests, givenKernelRequiringConstantAllocationWhenMakeResident
     EXPECT_EQ(0u, pCommandStreamReceiver->residency.size());
 
     std::vector<Surface *> surfaces;
-    pKernel->getResidency(surfaces, rootDeviceIndex);
+    pKernel->getResidency(surfaces);
     EXPECT_EQ(2u, surfaces.size());
 
     for (Surface *surface : surfaces) {
diff --git a/opencl/test/unit_test/sampler/sampler_set_arg_tests.cpp b/opencl/test/unit_test/sampler/sampler_set_arg_tests.cpp
index 75d7aedd53..854200b78d 100644
--- a/opencl/test/unit_test/sampler/sampler_set_arg_tests.cpp
+++ b/opencl/test/unit_test/sampler/sampler_set_arg_tests.cpp
@@ -137,7 +137,7 @@ HWTEST_F(SamplerSetArgTest, WhenSettingKernelArgSamplerThenSamplerStatesAreCorre
     EXPECT_EQ(SAMPLER_STATE::MIP_MODE_FILTER_NEAREST, samplerState->getMipModeFilter());
 
     std::vector<Surface *> surfaces;
-    pKernel->getResidency(surfaces, rootDeviceIndex);
+    pKernel->getResidency(surfaces);
     EXPECT_EQ(0u, surfaces.size());
 }
 
@@ -314,7 +314,7 @@ HWTEST_F(SamplerSetArgTest, GivenFilteringNearestAndAddressingClampWhenSettingKe
 
     EXPECT_EQ(samplerObj, pKernel->getKernelArg(0));
 
-    auto crossThreadData = reinterpret_cast<uint32_t *>(pKernel->getCrossThreadData(rootDeviceIndex));
+    auto crossThreadData = reinterpret_cast<uint32_t *>(pKernel->getCrossThreadData());
     auto snapWaCrossThreadData = ptrOffset(crossThreadData, 0x4);
 
     unsigned int snapWaValue = 0xffffffff;
@@ -435,7 +435,7 @@ HWTEST_P(NormalizedTest, WhenSettingKernelArgSamplerThenCoordsAreCorrect) {
 
     EXPECT_EQ(normalizedCoordinates, static_cast<cl_bool>(!samplerState->getNonNormalizedCoordinateEnable()));
 
-    auto crossThreadData = reinterpret_cast<uint32_t *>(pKernel->getCrossThreadData(rootDeviceIndex));
+    auto crossThreadData = reinterpret_cast<uint32_t *>(pKernel->getCrossThreadData());
     auto normalizedCoordsAddress = ptrOffset(crossThreadData, 0x10);
     unsigned int normalizedCoordsValue = GetNormCoordsEnum(normalizedCoordinates);
 
@@ -518,7 +518,7 @@ HWTEST_P(AddressingModeTest, WhenSettingKernelArgSamplerThenModesAreCorrect) {
     EXPECT_EQ(expectedModeY, samplerState->getTcyAddressControlMode());
     EXPECT_EQ(expectedModeZ, samplerState->getTczAddressControlMode());
 
-    auto crossThreadData = reinterpret_cast<uint32_t *>(pKernel->getCrossThreadData(rootDeviceIndex));
+    auto crossThreadData = reinterpret_cast<uint32_t *>(pKernel->getCrossThreadData());
     auto addressingModeAddress = ptrOffset(crossThreadData, 0x8);
 
     unsigned int addresingValue = GetAddrModeEnum(addressingMode);
diff --git a/opencl/test/unit_test/scenarios/windows/enqueue_read_write_buffer_scenarios_windows_tests.cpp b/opencl/test/unit_test/scenarios/windows/enqueue_read_write_buffer_scenarios_windows_tests.cpp
index 171338e3a3..8c8da60fdf 100644
--- a/opencl/test/unit_test/scenarios/windows/enqueue_read_write_buffer_scenarios_windows_tests.cpp
+++ b/opencl/test/unit_test/scenarios/windows/enqueue_read_write_buffer_scenarios_windows_tests.cpp
@@ -110,13 +110,13 @@ HWTEST_F(EnqueueBufferWindowsTest, givenMisalignedHostPtrWhenEnqueueReadBufferCa
         const auto &surfaceStateDst = getSurfaceState<FamilyType>(&cmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, 0), 1);
 
         if (kernelInfo.kernelArgInfo[1].kernelArgPatchInfoVector[0].size == sizeof(uint64_t)) {
-            auto pKernelArg = (uint64_t *)(kernel->getCrossThreadData(rootDeviceIndex) +
+            auto pKernelArg = (uint64_t *)(kernel->getCrossThreadData() +
                                            kernelInfo.kernelArgInfo[1].kernelArgPatchInfoVector[0].crossthreadOffset);
             EXPECT_EQ(alignDown(gpuVa, 4), static_cast<uint64_t>(*pKernelArg));
             EXPECT_EQ(*pKernelArg, surfaceStateDst.getSurfaceBaseAddress());
 
         } else if (kernelInfo.kernelArgInfo[1].kernelArgPatchInfoVector[0].size == sizeof(uint32_t)) {
-            auto pKernelArg = (uint32_t *)(kernel->getCrossThreadData(rootDeviceIndex) +
+            auto pKernelArg = (uint32_t *)(kernel->getCrossThreadData() +
                                            kernelInfo.kernelArgInfo[1].kernelArgPatchInfoVector[0].crossthreadOffset);
             EXPECT_EQ(alignDown(gpuVa, 4), static_cast<uint64_t>(*pKernelArg));
             EXPECT_EQ(static_cast<uint64_t>(*pKernelArg), surfaceStateDst.getSurfaceBaseAddress());
@@ -124,7 +124,7 @@ HWTEST_F(EnqueueBufferWindowsTest, givenMisalignedHostPtrWhenEnqueueReadBufferCa
     }
 
     if (kernelInfo.kernelArgInfo[3].kernelArgPatchInfoVector[0].size == sizeof(uint32_t)) {
-        auto dstOffset = (uint32_t *)(kernel->getCrossThreadData(rootDeviceIndex) +
+        auto dstOffset = (uint32_t *)(kernel->getCrossThreadData() +
                                       kernelInfo.kernelArgInfo[3].kernelArgPatchInfoVector[0].crossthreadOffset);
         EXPECT_EQ(ptrDiff(misalignedPtr, alignDown(misalignedPtr, 4)), *dstOffset);
     } else {