From 35ff2849449a9ed137f6a5c7d60fa731001a8e45 Mon Sep 17 00:00:00 2001
From: Mateusz Jablonski <mateusz.jablonski@intel.com>
Date: Tue, 23 Mar 2021 17:11:41 +0000
Subject: [PATCH] Cleanup Kernel class

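Remove deviceVector (and getDevices()) from Kernel; the device vector is now
obtained through MultiDeviceKernel::getDevices().
Remove the Device/ClDevice argument from Kernel's methods; they use the
kernel's own clDevice member instead.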

Related-To: NEO-5001
Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
---
 level_zero/core/source/kernel/kernel.h        |  2 +-
 level_zero/core/source/kernel/kernel_imp.cpp  |  2 +-
 level_zero/core/source/kernel/kernel_imp.h    |  2 +-
 opencl/source/api/api.cpp                     |  7 +--
 opencl/source/gtpin/gtpin_callbacks.cpp       |  4 +-
 opencl/source/kernel/kernel.cpp               | 58 +++++++++----------
 opencl/source/kernel/kernel.h                 | 21 +++----
 opencl/source/kernel/multi_device_kernel.cpp  |  5 +-
 opencl/source/kernel/multi_device_kernel.h    |  1 +
 .../sync_buffer_handler_tests.cpp             |  2 +-
 opencl/test/unit_test/gtpin/gtpin_tests.cpp   |  2 +-
 .../unit_test/kernel/kernel_arg_svm_tests.cpp |  4 +-
 opencl/test/unit_test/kernel/kernel_tests.cpp | 17 +++---
 .../kernel/substitute_kernel_heap_tests.cpp   | 10 ++--
 opencl/test/unit_test/mocks/mock_program.h    |  2 +-
 shared/source/program/sync_buffer_handler.inl |  2 +-
 16 files changed, 63 insertions(+), 78 deletions(-)

diff --git a/level_zero/core/source/kernel/kernel.h b/level_zero/core/source/kernel/kernel.h
index e96c30fd19..e006c67fc2 100644
--- a/level_zero/core/source/kernel/kernel.h
+++ b/level_zero/core/source/kernel/kernel.h
@@ -133,7 +133,7 @@ struct Kernel : _ze_kernel_handle_t, virtual NEO::DispatchKernelEncoderI {
     virtual void printPrintfOutput() = 0;
 
     virtual bool usesSyncBuffer() = 0;
-    virtual void patchSyncBuffer(NEO::Device &device, NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) = 0;
+    virtual void patchSyncBuffer(NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) = 0;
 
     Kernel() = default;
     Kernel(const Kernel &) = delete;
diff --git a/level_zero/core/source/kernel/kernel_imp.cpp b/level_zero/core/source/kernel/kernel_imp.cpp
index 841478cc27..a82d417375 100644
--- a/level_zero/core/source/kernel/kernel_imp.cpp
+++ b/level_zero/core/source/kernel/kernel_imp.cpp
@@ -778,7 +778,7 @@ bool KernelImp::usesSyncBuffer() {
     return this->kernelImmData->getDescriptor().kernelAttributes.flags.usesSyncBuffer;
 }
 
-void KernelImp::patchSyncBuffer(NEO::Device &device, NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) {
+void KernelImp::patchSyncBuffer(NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) {
     this->residencyContainer.push_back(gfxAllocation);
     NEO::patchPointer(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize),
                       this->getImmutableData()->getDescriptor().payloadMappings.implicitArgs.syncBufferAddress,
diff --git a/level_zero/core/source/kernel/kernel_imp.h b/level_zero/core/source/kernel/kernel_imp.h
index 0a71e3ea8a..721dd7bd9e 100644
--- a/level_zero/core/source/kernel/kernel_imp.h
+++ b/level_zero/core/source/kernel/kernel_imp.h
@@ -81,7 +81,7 @@ struct KernelImp : Kernel {
     void printPrintfOutput() override;
 
     bool usesSyncBuffer() override;
-    void patchSyncBuffer(NEO::Device &device, NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) override;
+    void patchSyncBuffer(NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) override;
 
     const uint8_t *getSurfaceStateHeapData() const override { return surfaceStateHeapData.get(); }
     uint32_t getSurfaceStateHeapDataSize() const override { return surfaceStateHeapDataSize; }
diff --git a/opencl/source/api/api.cpp b/opencl/source/api/api.cpp
index 03190a9825..ddc92e8dce 100644
--- a/opencl/source/api/api.cpp
+++ b/opencl/source/api/api.cpp
@@ -1951,7 +1951,6 @@ cl_int CL_API_CALL clGetKernelWorkGroupInfo(cl_kernel kernel,
     if (CL_SUCCESS == retVal) {
         auto pKernel = pMultiDeviceKernel->getKernel(pClDevice->getRootDeviceIndex());
         retVal = pKernel->getWorkGroupInfo(
-            *pClDevice,
             paramName,
             paramValueSize,
             paramValue,
@@ -5312,7 +5311,7 @@ cl_int CL_API_CALL clGetKernelSubGroupInfoKHR(cl_kernel kernel,
     case CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE:
     case CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE:
     case CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL:
-        return pKernel->getSubGroupInfo(*pClDevice, paramName,
+        return pKernel->getSubGroupInfo(paramName,
                                         inputValueSize, inputValue,
                                         paramValueSize, paramValue,
                                         paramValueSizeRet);
@@ -5418,7 +5417,7 @@ cl_int CL_API_CALL clGetKernelSubGroupInfo(cl_kernel kernel,
     }
 
     auto pKernel = pMultiDeviceKernel->getKernel(pClDevice->getRootDeviceIndex());
-    retVal = pKernel->getSubGroupInfo(*pClDevice, paramName,
+    retVal = pKernel->getSubGroupInfo(paramName,
                                       inputValueSize, inputValue,
                                       paramValueSize, paramValue,
                                       paramValueSizeRet);
@@ -5796,7 +5795,7 @@ cl_int CL_API_CALL clGetKernelSuggestedLocalWorkSizeINTEL(cl_command_queue comma
         return retVal;
     }
 
-    pKernel->getSuggestedLocalWorkSize(workDim, globalWorkSize, globalWorkOffset, suggestedLocalWorkSize, pCommandQueue->getClDevice());
+    pKernel->getSuggestedLocalWorkSize(workDim, globalWorkSize, globalWorkOffset, suggestedLocalWorkSize);
 
     return retVal;
 }
diff --git a/opencl/source/gtpin/gtpin_callbacks.cpp b/opencl/source/gtpin/gtpin_callbacks.cpp
index e20d17b481..a1107268bc 100644
--- a/opencl/source/gtpin/gtpin_callbacks.cpp
+++ b/opencl/source/gtpin/gtpin_callbacks.cpp
@@ -65,7 +65,7 @@ void gtpinNotifyKernelCreate(cl_kernel kernel) {
     if (isGTPinInitialized) {
         auto pMultiDeviceKernel = castToObjectOrAbort<MultiDeviceKernel>(kernel);
         auto pKernel = pMultiDeviceKernel->getDefaultKernel();
-        auto &device = pKernel->getDevices()[0]->getDevice();
+        auto &device = pMultiDeviceKernel->getDevices()[0]->getDevice();
         size_t gtpinBTI = pKernel->getNumberOfBindingTableStates();
         // Enlarge local copy of SSH by 1 SS
         GFXCORE_FAMILY genFamily = device.getHardwareInfo().platform.eRenderCoreFamily;
@@ -98,7 +98,7 @@ void gtpinNotifyKernelCreate(cl_kernel kernel) {
         instrument_params_out_t paramsOut = {0};
         (*GTPinCallbacks.onKernelCreate)((context_handle_t)(cl_context)context, &paramsIn, &paramsOut);
         // Substitute ISA of created kernel with instrumented code
-        pKernel->substituteKernelHeap(device, paramsOut.inst_kernel_binary, paramsOut.inst_kernel_size);
+        pKernel->substituteKernelHeap(paramsOut.inst_kernel_binary, paramsOut.inst_kernel_size);
         pKernel->setKernelId(paramsOut.kernel_id);
     }
 }
diff --git a/opencl/source/kernel/kernel.cpp b/opencl/source/kernel/kernel.cpp
index 6582abb21e..be0854fb6d 100644
--- a/opencl/source/kernel/kernel.cpp
+++ b/opencl/source/kernel/kernel.cpp
@@ -70,9 +70,7 @@ Kernel::Kernel(Program *programArg, const KernelInfo &kernelInfoArg, ClDevice &c
       executionEnvironment(programArg->getExecutionEnvironment()),
       program(programArg),
       clDevice(clDeviceArg),
-      deviceVector(programArg->getDevices()),
-      kernelInfo(kernelInfoArg),
-      defaultRootDeviceIndex(clDeviceArg.getRootDeviceIndex()) {
+      kernelInfo(kernelInfoArg) {
     program->retain();
     program->retainForKernel();
     imageTransformer.reset(new ImageTransformer);
@@ -131,7 +129,7 @@ inline void patch(const SrcT &src, void *dst, uint32_t dstOffsetBytes) {
     *patchLocation = static_cast<DstT>(src);
 }
 
-void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const Device &device, const ArgDescPointer &arg) {
+void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const ArgDescPointer &arg) {
     if ((nullptr != crossThreadData) && isValidOffset(arg.stateless)) {
         auto pp = ptrOffset(crossThreadData, arg.stateless);
         uintptr_t addressToPatch = reinterpret_cast<uintptr_t>(ptrToPatchInCrossThreadData);
@@ -147,13 +145,13 @@ void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, Graphic
         auto surfaceState = ptrOffset(ssh, arg.bindful);
         void *addressToPatch = reinterpret_cast<void *>(allocation.getGpuAddressToPatch());
         size_t sizeToPatch = allocation.getUnderlyingBufferSize();
-        Buffer::setSurfaceState(&device, surfaceState, false, false, sizeToPatch, addressToPatch, 0, &allocation, 0, 0,
+        Buffer::setSurfaceState(&clDevice.getDevice(), surfaceState, false, false, sizeToPatch, addressToPatch, 0, &allocation, 0, 0,
                                 kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, getTotalNumDevicesInContext());
     }
 }
 
 template <typename PatchTokenT>
-void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const Device &device, const PatchTokenT &patch) {
+void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const PatchTokenT &patch) {
     uint32_t pointerSize = patch.DataParamSize;
 
     if (crossThreadData != nullptr) {
@@ -173,16 +171,16 @@ void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, Graphic
         auto surfaceState = ptrOffset(ssh, sshOffset);
         void *addressToPatch = reinterpret_cast<void *>(allocation.getGpuAddressToPatch());
         size_t sizeToPatch = allocation.getUnderlyingBufferSize();
-        Buffer::setSurfaceState(&device, surfaceState, false, false, sizeToPatch, addressToPatch, 0, &allocation, 0, 0,
+        Buffer::setSurfaceState(&clDevice.getDevice(), surfaceState, false, false, sizeToPatch, addressToPatch, 0, &allocation, 0, 0,
                                 kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, getTotalNumDevicesInContext());
     }
 }
 
-template void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const Device &device, const SPatchAllocateStatelessGlobalMemorySurfaceWithInitialization &patch);
+template void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const SPatchAllocateStatelessGlobalMemorySurfaceWithInitialization &patch);
 
-template void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const Device &device, const SPatchAllocateStatelessPrivateSurface &patch);
+template void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const SPatchAllocateStatelessPrivateSurface &patch);
 
-template void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const Device &device, const SPatchAllocateStatelessConstantMemorySurfaceWithInitialization &patch);
+template void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const SPatchAllocateStatelessConstantMemorySurfaceWithInitialization &patch);
 
 cl_int Kernel::initialize() {
     this->kernelHasIndirectAccess = false;
@@ -326,14 +324,14 @@ cl_int Kernel::initialize() {
             return CL_OUT_OF_RESOURCES;
         }
         const auto &patch = kernelDescriptor.payloadMappings.implicitArgs.privateMemoryAddress;
-        patchWithImplicitSurface(reinterpret_cast<void *>(privateSurface->getGpuAddressToPatch()), *privateSurface, pClDevice->getDevice(), patch);
+        patchWithImplicitSurface(reinterpret_cast<void *>(privateSurface->getGpuAddressToPatch()), *privateSurface, patch);
     }
     if (isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress.stateless)) {
         DEBUG_BREAK_IF(program->getConstantSurface(rootDeviceIndex) == nullptr);
         uintptr_t constMemory = isBuiltIn ? (uintptr_t)program->getConstantSurface(rootDeviceIndex)->getUnderlyingBuffer() : (uintptr_t)program->getConstantSurface(rootDeviceIndex)->getGpuAddressToPatch();
 
         const auto &arg = kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress;
-        patchWithImplicitSurface(reinterpret_cast<void *>(constMemory), *program->getConstantSurface(rootDeviceIndex), pClDevice->getDevice(), arg);
+        patchWithImplicitSurface(reinterpret_cast<void *>(constMemory), *program->getConstantSurface(rootDeviceIndex), arg);
     }
 
     if (isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress.stateless)) {
@@ -341,7 +339,7 @@ cl_int Kernel::initialize() {
         uintptr_t globalMemory = isBuiltIn ? (uintptr_t)program->getGlobalSurface(rootDeviceIndex)->getUnderlyingBuffer() : (uintptr_t)program->getGlobalSurface(rootDeviceIndex)->getGpuAddressToPatch();
 
         const auto &arg = kernelDescriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress;
-        patchWithImplicitSurface(reinterpret_cast<void *>(globalMemory), *program->getGlobalSurface(rootDeviceIndex), pClDevice->getDevice(), arg);
+        patchWithImplicitSurface(reinterpret_cast<void *>(globalMemory), *program->getGlobalSurface(rootDeviceIndex), arg);
     }
 
     bool useGlobalAtomics = kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics;
@@ -392,7 +390,6 @@ cl_int Kernel::initialize() {
     // double check this assumption
     bool usingBuffers = false;
     bool usingImages = false;
-    auto &defaultKernelInfo = kernelInfo;
     kernelArguments.resize(numArgs);
     kernelArgHandlers.resize(numArgs);
     kernelArgRequiresCacheFlush.resize(numArgs);
@@ -401,7 +398,7 @@ cl_int Kernel::initialize() {
         storeKernelArg(i, NONE_OBJ, nullptr, nullptr, 0);
 
         // set the argument handler
-        auto &argInfo = defaultKernelInfo.kernelArgInfo[i];
+        auto &argInfo = kernelInfo.kernelArgInfo[i];
         if (argInfo.metadata.addressQualifier == KernelArgMetadata::AddrLocal) {
             kernelArgHandlers[i] = &Kernel::setArgLocal;
         } else if (argInfo.isAccelerator) {
@@ -551,9 +548,8 @@ cl_int Kernel::getArgInfo(cl_uint argIndx, cl_kernel_arg_info paramName, size_t
     cl_int retVal;
     const void *pSrc = nullptr;
     size_t srcSize = GetInfo::invalidSourceSize;
-    auto &defaultKernelInfo = kernelInfo;
-    auto numArgs = static_cast<cl_uint>(defaultKernelInfo.kernelArgInfo.size());
-    const auto &argInfo = defaultKernelInfo.kernelArgInfo[argIndx];
+    auto numArgs = static_cast<cl_uint>(kernelInfo.kernelArgInfo.size());
+    const auto &argInfo = kernelInfo.kernelArgInfo[argIndx];
 
     if (argIndx >= numArgs) {
         retVal = CL_INVALID_ARG_INDEX;
@@ -604,7 +600,7 @@ cl_int Kernel::getArgInfo(cl_uint argIndx, cl_kernel_arg_info paramName, size_t
     return retVal;
 }
 
-cl_int Kernel::getWorkGroupInfo(ClDevice &device, cl_kernel_work_group_info paramName,
+cl_int Kernel::getWorkGroupInfo(cl_kernel_work_group_info paramName,
                                 size_t paramValueSize, void *paramValue,
                                 size_t *paramValueSizeRet) const {
     cl_int retVal = CL_INVALID_VALUE;
@@ -619,7 +615,7 @@ cl_int Kernel::getWorkGroupInfo(ClDevice &device, cl_kernel_work_group_info para
     cl_ulong scratchSize;
     cl_ulong privateMemSize;
     size_t maxWorkgroupSize;
-    const auto &hwInfo = device.getHardwareInfo();
+    const auto &hwInfo = clDevice.getHardwareInfo();
     auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
     auto &clHwHelper = ClHwHelper::get(hwInfo.platform.eRenderCoreFamily);
     GetInfoHelper info(paramValue, paramValueSize, paramValueSizeRet);
@@ -680,7 +676,7 @@ cl_int Kernel::getWorkGroupInfo(ClDevice &device, cl_kernel_work_group_info para
     return retVal;
 }
 
-cl_int Kernel::getSubGroupInfo(ClDevice &clDevice, cl_kernel_sub_group_info paramName,
+cl_int Kernel::getSubGroupInfo(cl_kernel_sub_group_info paramName,
                                size_t inputValueSize, const void *inputValue,
                                size_t paramValueSize, void *paramValue,
                                size_t *paramValueSizeRet) const {
@@ -791,7 +787,7 @@ size_t Kernel::getKernelHeapSize() const {
     return kernelInfo.heapInfo.KernelHeapSize;
 }
 
-void Kernel::substituteKernelHeap(const Device &device, void *newKernelHeap, size_t newKernelHeapSize) {
+void Kernel::substituteKernelHeap(void *newKernelHeap, size_t newKernelHeapSize) {
     KernelInfo *pKernelInfo = const_cast<KernelInfo *>(&kernelInfo);
     void **pKernelHeap = const_cast<void **>(&pKernelInfo->heapInfo.pKernelHeap);
     *pKernelHeap = newKernelHeap;
@@ -807,7 +803,7 @@ void Kernel::substituteKernelHeap(const Device &device, void *newKernelHeap, siz
     } else {
         memoryManager->checkGpuUsageAndDestroyGraphicsAllocations(pKernelInfo->kernelAllocation);
         pKernelInfo->kernelAllocation = nullptr;
-        status = pKernelInfo->createKernelAllocation(device, isBuiltIn);
+        status = pKernelInfo->createKernelAllocation(clDevice.getDevice(), isBuiltIn);
     }
     UNRECOVERABLE_IF(!status);
 }
@@ -864,9 +860,8 @@ cl_int Kernel::setArg(uint32_t argIndex, size_t argSize, const void *argVal) {
     cl_int retVal = CL_SUCCESS;
     bool updateExposedKernel = true;
     auto argWasUncacheable = false;
-    auto &defaultKernelInfo = kernelInfo;
-    if (defaultKernelInfo.builtinDispatchBuilder != nullptr) {
-        updateExposedKernel = defaultKernelInfo.builtinDispatchBuilder->setExplicitArg(argIndex, argSize, argVal, retVal);
+    if (kernelInfo.builtinDispatchBuilder != nullptr) {
+        updateExposedKernel = kernelInfo.builtinDispatchBuilder->setExplicitArg(argIndex, argSize, argVal, retVal);
     }
     if (updateExposedKernel) {
         if (argIndex >= kernelArgHandlers.size()) {
@@ -1068,7 +1063,7 @@ cl_int Kernel::setKernelExecutionType(cl_execution_info_kernel_type_intel execut
 }
 
 void Kernel::getSuggestedLocalWorkSize(const cl_uint workDim, const size_t *globalWorkSize, const size_t *globalWorkOffset,
-                                       size_t *localWorkSize, ClDevice &clDevice) {
+                                       size_t *localWorkSize) {
     UNRECOVERABLE_IF((workDim == 0) || (workDim > 3));
     UNRECOVERABLE_IF(globalWorkSize == nullptr);
     Vec3<size_t> elws{0, 0, 0};
@@ -2431,7 +2426,7 @@ bool Kernel::usesSyncBuffer() {
     return kernelInfo.kernelDescriptor.kernelAttributes.flags.usesSyncBuffer;
 }
 
-void Kernel::patchSyncBuffer(Device &device, GraphicsAllocation *gfxAllocation, size_t bufferOffset) {
+void Kernel::patchSyncBuffer(GraphicsAllocation *gfxAllocation, size_t bufferOffset) {
     const auto &syncBuffer = kernelInfo.kernelDescriptor.payloadMappings.implicitArgs.syncBufferAddress;
     auto bufferPatchAddress = ptrOffset(crossThreadData, syncBuffer.stateless);
     patchWithRequiredSize(bufferPatchAddress, syncBuffer.pointerSize,
@@ -2441,7 +2436,7 @@ void Kernel::patchSyncBuffer(Device &device, GraphicsAllocation *gfxAllocation,
         auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap()), syncBuffer.bindful);
         auto addressToPatch = gfxAllocation->getUnderlyingBuffer();
         auto sizeToPatch = gfxAllocation->getUnderlyingBufferSize();
-        Buffer::setSurfaceState(&device, surfaceState, false, false, sizeToPatch, addressToPatch, 0, gfxAllocation, 0, 0,
+        Buffer::setSurfaceState(&clDevice.getDevice(), surfaceState, false, false, sizeToPatch, addressToPatch, 0, gfxAllocation, 0, 0,
                                 kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, getTotalNumDevicesInContext());
     }
 }
@@ -2454,13 +2449,12 @@ bool Kernel::isPatched() const {
 cl_int Kernel::checkCorrectImageAccessQualifier(cl_uint argIndex,
                                                 size_t argSize,
                                                 const void *argValue) const {
-    auto &defaultKernelInfo = kernelInfo;
-    if (defaultKernelInfo.kernelArgInfo[argIndex].isImage) {
+    if (kernelInfo.kernelArgInfo[argIndex].isImage) {
         cl_mem mem = *(static_cast<const cl_mem *>(argValue));
         MemObj *pMemObj = nullptr;
         WithCastToInternal(mem, &pMemObj);
         if (pMemObj) {
-            auto accessQualifier = defaultKernelInfo.kernelArgInfo[argIndex].metadata.accessQualifier;
+            auto accessQualifier = kernelInfo.kernelArgInfo[argIndex].metadata.accessQualifier;
             cl_mem_flags flags = pMemObj->getFlags();
             if ((accessQualifier == KernelArgMetadata::AccessReadOnly && ((flags | CL_MEM_WRITE_ONLY) == flags)) ||
                 (accessQualifier == KernelArgMetadata::AccessWriteOnly && ((flags | CL_MEM_READ_ONLY) == flags))) {
diff --git a/opencl/source/kernel/kernel.h b/opencl/source/kernel/kernel.h
index 1186d49e4f..0381ccc680 100644
--- a/opencl/source/kernel/kernel.h
+++ b/opencl/source/kernel/kernel.h
@@ -41,8 +41,6 @@ class Surface;
 class PrintfHandler;
 class MultiDeviceKernel;
 
-using KernelInfoContainer = StackVec<const KernelInfo *, 1>;
-
 class Kernel : public ReferenceTrackedObject<Kernel> {
   public:
     static const uint32_t kernelBinaryAlignement = 64;
@@ -157,10 +155,10 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
     cl_int getArgInfo(cl_uint argIndx, cl_kernel_arg_info paramName,
                       size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet) const;
 
-    cl_int getWorkGroupInfo(ClDevice &clDevice, cl_kernel_work_group_info paramName,
+    cl_int getWorkGroupInfo(cl_kernel_work_group_info paramName,
                             size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet) const;
 
-    cl_int getSubGroupInfo(ClDevice &device, cl_kernel_sub_group_info paramName,
+    cl_int getSubGroupInfo(cl_kernel_sub_group_info paramName,
                            size_t inputValueSize, const void *inputValue,
                            size_t paramValueSize, void *paramValue,
                            size_t *paramValueSizeRet) const;
@@ -179,7 +177,7 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
 
     void resizeSurfaceStateHeap(void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset);
 
-    void substituteKernelHeap(const Device &device, void *newKernelHeap, size_t newKernelHeapSize);
+    void substituteKernelHeap(void *newKernelHeap, size_t newKernelHeapSize);
     bool isKernelHeapSubstituted() const;
     uint64_t getKernelId() const;
     void setKernelId(uint64_t newKernelId);
@@ -224,7 +222,7 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
     void patchEventPool(DeviceQueue *devQueue);
     void patchBlocksSimdSize();
     bool usesSyncBuffer();
-    void patchSyncBuffer(Device &device, GraphicsAllocation *gfxAllocation, size_t bufferOffset);
+    void patchSyncBuffer(GraphicsAllocation *gfxAllocation, size_t bufferOffset);
     void patchBindlessSurfaceStateOffsets(const Device &device, const size_t sshOffset);
 
     GraphicsAllocation *getKernelReflectionSurface() const {
@@ -368,7 +366,7 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
         this->threadArbitrationPolicy = policy;
     }
     void getSuggestedLocalWorkSize(const cl_uint workDim, const size_t *globalWorkSize, const size_t *globalWorkOffset,
-                                   size_t *localWorkSize, ClDevice &clDevice);
+                                   size_t *localWorkSize);
     uint32_t getMaxWorkGroupCount(const cl_uint workDim, const size_t *localWorkSize, const CommandQueue *commandQueue) const;
 
     uint64_t getKernelStartOffset(
@@ -383,9 +381,6 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
     void setAdditionalKernelExecInfo(uint32_t additionalKernelExecInfo);
     uint32_t getAdditionalKernelExecInfo() const;
     MOCKABLE_VIRTUAL bool requiresWaDisableRccRhwoOptimization() const;
-    const ClDeviceVector &getDevices() const {
-        return program->getDevices();
-    }
 
     void setGlobalWorkOffsetValues(uint32_t globalWorkOffsetX, uint32_t globalWorkOffsetY, uint32_t globalWorkOffsetZ);
     void setGlobalWorkSizeValues(uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ);
@@ -479,10 +474,10 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
 
     void *patchBufferOffset(const KernelArgInfo &argInfo, void *svmPtr, GraphicsAllocation *svmAlloc);
 
-    void patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const Device &device, const ArgDescPointer &arg);
+    void patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const ArgDescPointer &arg);
     // Sets-up both crossThreadData and ssh for given implicit (private/constant, etc.) allocation
     template <typename PatchTokenT>
-    void patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const Device &device, const PatchTokenT &patch);
+    void patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const PatchTokenT &patch);
 
     void getParentObjectCounts(ObjectCounts &objectCount);
     Kernel(Program *programArg, const KernelInfo &kernelInfo, ClDevice &clDevice, bool schedulerKernel = false);
@@ -508,7 +503,6 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
     const ExecutionEnvironment &executionEnvironment;
     Program *program;
     ClDevice &clDevice;
-    const ClDeviceVector &deviceVector;
     const KernelInfo &kernelInfo;
 
     std::vector<SimpleKernelArgInfo> kernelArguments;
@@ -585,7 +579,6 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
 
     GraphicsAllocation *privateSurface = nullptr;
     uint64_t privateSurfaceSize = 0u;
-    const uint32_t defaultRootDeviceIndex;
 
     struct KernelConfig {
         Vec3<size_t> gws;
diff --git a/opencl/source/kernel/multi_device_kernel.cpp b/opencl/source/kernel/multi_device_kernel.cpp
index af60e4a695..0a88225a48 100644
--- a/opencl/source/kernel/multi_device_kernel.cpp
+++ b/opencl/source/kernel/multi_device_kernel.cpp
@@ -19,9 +19,10 @@ MultiDeviceKernel::~MultiDeviceKernel() {
 Kernel *MultiDeviceKernel::determineDefaultKernel(KernelVectorType &kernelVector) {
     for (auto &pKernel : kernelVector) {
         if (pKernel) {
-            return kernelVector[(*pKernel->getDevices().begin())->getRootDeviceIndex()];
+            return pKernel;
         }
     }
+    UNRECOVERABLE_IF(true);
     return nullptr;
 }
 MultiDeviceKernel::MultiDeviceKernel(KernelVectorType kernelVector, const KernelInfoContainer kernelInfosArg) : kernels(std::move(kernelVector)),
@@ -39,7 +40,7 @@ MultiDeviceKernel::MultiDeviceKernel(KernelVectorType kernelVector, const Kernel
 const std::vector<Kernel::SimpleKernelArgInfo> &MultiDeviceKernel::getKernelArguments() const { return defaultKernel->getKernelArguments(); }
 cl_int MultiDeviceKernel::getInfo(cl_kernel_info paramName, size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet) const { return defaultKernel->getInfo(paramName, paramValueSize, paramValue, paramValueSizeRet); }
 cl_int MultiDeviceKernel::getArgInfo(cl_uint argIndx, cl_kernel_arg_info paramName, size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet) const { return defaultKernel->getArgInfo(argIndx, paramName, paramValueSize, paramValue, paramValueSizeRet); }
-const ClDeviceVector &MultiDeviceKernel::getDevices() const { return defaultKernel->getDevices(); }
+const ClDeviceVector &MultiDeviceKernel::getDevices() const { return program->getDevices(); }
 size_t MultiDeviceKernel::getKernelArgsNumber() const { return defaultKernel->getKernelArgsNumber(); }
 Context &MultiDeviceKernel::getContext() const { return defaultKernel->getContext(); }
 bool MultiDeviceKernel::getHasIndirectAccess() const { return defaultKernel->getHasIndirectAccess(); }
diff --git a/opencl/source/kernel/multi_device_kernel.h b/opencl/source/kernel/multi_device_kernel.h
index f2469abc60..8bf6ab9d23 100644
--- a/opencl/source/kernel/multi_device_kernel.h
+++ b/opencl/source/kernel/multi_device_kernel.h
@@ -15,6 +15,7 @@ struct OpenCLObjectMapper<_cl_kernel> {
 };
 
 using KernelVectorType = StackVec<Kernel *, 4>;
+using KernelInfoContainer = StackVec<const KernelInfo *, 4>;
 
 class MultiDeviceKernel : public BaseObject<_cl_kernel> {
   public:
diff --git a/opencl/test/unit_test/command_queue/sync_buffer_handler_tests.cpp b/opencl/test/unit_test/command_queue/sync_buffer_handler_tests.cpp
index e2796738e6..0de8b5f645 100644
--- a/opencl/test/unit_test/command_queue/sync_buffer_handler_tests.cpp
+++ b/opencl/test/unit_test/command_queue/sync_buffer_handler_tests.cpp
@@ -190,7 +190,7 @@ HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenSshRequiredWhenPatchingSyncBuffer
     auto surfaceAddress = surfaceState->getSurfaceBaseAddress();
     EXPECT_NE(bufferAddress, surfaceAddress);
 
-    kernel->patchSyncBuffer(commandQueue->getDevice(), syncBufferHandler->graphicsAllocation, syncBufferHandler->usedBufferSize);
+    kernel->patchSyncBuffer(syncBufferHandler->graphicsAllocation, syncBufferHandler->usedBufferSize);
     surfaceAddress = surfaceState->getSurfaceBaseAddress();
     EXPECT_EQ(bufferAddress, surfaceAddress);
 }
diff --git a/opencl/test/unit_test/gtpin/gtpin_tests.cpp b/opencl/test/unit_test/gtpin/gtpin_tests.cpp
index 2a82096169..94a986505d 100644
--- a/opencl/test/unit_test/gtpin/gtpin_tests.cpp
+++ b/opencl/test/unit_test/gtpin/gtpin_tests.cpp
@@ -2344,7 +2344,7 @@ TEST_F(GTPinTests, givenKernelThenVerifyThatKernelCodeSubstitutionWorksWell) {
     // Substitute new kernel code
     constexpr size_t newCodeSize = 64;
     uint8_t newCode[newCodeSize] = {0x0, 0x1, 0x2, 0x3, 0x4};
-    pKernel->substituteKernelHeap(pDevice->getDevice(), &newCode[0], newCodeSize);
+    pKernel->substituteKernelHeap(&newCode[0], newCodeSize);
 
     // Verify that substitution went properly
     isKernelCodeSubstituted = pKernel->isKernelHeapSubstituted();
diff --git a/opencl/test/unit_test/kernel/kernel_arg_svm_tests.cpp b/opencl/test/unit_test/kernel/kernel_arg_svm_tests.cpp
index 2b0f279bbb..445c5f4e81 100644
--- a/opencl/test/unit_test/kernel/kernel_arg_svm_tests.cpp
+++ b/opencl/test/unit_test/kernel/kernel_arg_svm_tests.cpp
@@ -259,7 +259,7 @@ HWTEST_F(KernelArgSvmTest, WhenPatchingWithImplicitSurfaceThenPatchIsApplied) {
         RENDER_SURFACE_STATE *surfState = reinterpret_cast<RENDER_SURFACE_STATE *>(pKernel->getSurfaceStateHeap());
         memset(surfState, 0, rendSurfSize);
 
-        pKernel->patchWithImplicitSurface(ptrToPatch, svmAlloc, *pDevice, patch);
+        pKernel->patchWithImplicitSurface(ptrToPatch, svmAlloc, patch);
 
         // verify cross thread data was properly patched
         EXPECT_EQ(ptrToPatch, *reinterpret_cast<void **>(pKernel->getCrossThreadData()));
@@ -280,7 +280,7 @@ HWTEST_F(KernelArgSvmTest, WhenPatchingWithImplicitSurfaceThenPatchIsApplied) {
         // when cross thread and ssh data is not available then should not do anything
         pKernel->setCrossThreadData(nullptr, 0);
         pKernel->setSshLocal(nullptr, 0);
-        pKernel->patchWithImplicitSurface(ptrToPatch, svmAlloc, *pDevice, patch);
+        pKernel->patchWithImplicitSurface(ptrToPatch, svmAlloc, patch);
     }
 }
 
diff --git a/opencl/test/unit_test/kernel/kernel_tests.cpp b/opencl/test/unit_test/kernel/kernel_tests.cpp
index ef0aa46980..d1fae7c68e 100644
--- a/opencl/test/unit_test/kernel/kernel_tests.cpp
+++ b/opencl/test/unit_test/kernel/kernel_tests.cpp
@@ -285,7 +285,6 @@ TEST_F(KernelTests, GivenKernelWorkGroupSizeWhenGettingWorkGroupInfoThenWorkGrou
     pKernel->maxKernelWorkGroupSize = static_cast<uint32_t>(kernelMaxWorkGroupSize);
 
     retVal = pKernel->getWorkGroupInfo(
-        *pClDevice,
         paramName,
         paramValueSize,
         &paramValue,
@@ -303,7 +302,6 @@ TEST_F(KernelTests, GivenKernelCompileWorkGroupSizeWhenGettingWorkGroupInfoThenC
     size_t paramValueSizeRet = 0;
 
     retVal = pKernel->getWorkGroupInfo(
-        *pClDevice,
         paramName,
         paramValueSize,
         &paramValue,
@@ -317,7 +315,6 @@ TEST_F(KernelTests, GivenInvalidParamNameWhenGettingWorkGroupInfoThenInvalidValu
     size_t paramValueSizeRet = 0x1234u;
 
     retVal = pKernel->getWorkGroupInfo(
-        *pClDevice,
         0,
         0,
         nullptr,
@@ -2818,15 +2815,15 @@ HWTEST_F(KernelTest, givenKernelWhenDebugFlagToUseMaxSimdForCalculationsIsUsedTh
     size_t maxKernelWkgSize;
 
     kernel.kernelInfo.kernelDescriptor.kernelAttributes.simdSize = 32;
-    kernel.mockKernel->getWorkGroupInfo(*device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &maxKernelWkgSize, nullptr);
+    kernel.mockKernel->getWorkGroupInfo(CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &maxKernelWkgSize, nullptr);
     EXPECT_EQ(1024u, maxKernelWkgSize);
 
     kernel.kernelInfo.kernelDescriptor.kernelAttributes.simdSize = 16;
-    kernel.mockKernel->getWorkGroupInfo(*device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &maxKernelWkgSize, nullptr);
+    kernel.mockKernel->getWorkGroupInfo(CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &maxKernelWkgSize, nullptr);
     EXPECT_EQ(512u, maxKernelWkgSize);
 
     kernel.kernelInfo.kernelDescriptor.kernelAttributes.simdSize = 8;
-    kernel.mockKernel->getWorkGroupInfo(*device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &maxKernelWkgSize, nullptr);
+    kernel.mockKernel->getWorkGroupInfo(CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &maxKernelWkgSize, nullptr);
     EXPECT_EQ(256u, maxKernelWkgSize);
 }
 
@@ -3166,7 +3163,7 @@ TEST(KernelTest, givenKernelWithPatchInfoCollectionEnabledWhenPatchWithImplicitS
     SPatchAllocateStatelessGlobalMemorySurfaceWithInitialization patchToken{};
     uint64_t crossThreadData = 0;
     EXPECT_EQ(0u, kernel.mockKernel->getPatchInfoDataList().size());
-    kernel.mockKernel->patchWithImplicitSurface(&crossThreadData, mockAllocation, device->getDevice(), patchToken);
+    kernel.mockKernel->patchWithImplicitSurface(&crossThreadData, mockAllocation, patchToken);
     EXPECT_EQ(1u, kernel.mockKernel->getPatchInfoDataList().size());
 }
 
@@ -3176,7 +3173,7 @@ TEST(KernelTest, givenKernelWithPatchInfoCollecitonEnabledAndArgumentWithInvalid
     MockGraphicsAllocation mockAllocation;
     ArgDescPointer arg;
     uint64_t ptr = 0;
-    kernel.mockKernel->patchWithImplicitSurface(&ptr, mockAllocation, device->getDevice(), arg);
+    kernel.mockKernel->patchWithImplicitSurface(&ptr, mockAllocation, arg);
     EXPECT_EQ(0u, kernel.mockKernel->getPatchInfoDataList().size());
 }
 
@@ -3191,7 +3188,7 @@ TEST(KernelTest, givenKernelWithPatchInfoCollectionEnabledAndValidArgumentWhenPa
     arg.stateless = 0;
     uint64_t crossThreadData = 0;
     EXPECT_EQ(0u, kernel.mockKernel->getPatchInfoDataList().size());
-    kernel.mockKernel->patchWithImplicitSurface(&crossThreadData, mockAllocation, device->getDevice(), arg);
+    kernel.mockKernel->patchWithImplicitSurface(&crossThreadData, mockAllocation, arg);
     EXPECT_EQ(1u, kernel.mockKernel->getPatchInfoDataList().size());
 }
 
@@ -3202,7 +3199,7 @@ TEST(KernelTest, givenKernelWithPatchInfoCollectionDisabledWhenPatchWithImplicit
     SPatchAllocateStatelessGlobalMemorySurfaceWithInitialization patchToken{};
     uint64_t crossThreadData = 0;
     EXPECT_EQ(0u, kernel.mockKernel->getPatchInfoDataList().size());
-    kernel.mockKernel->patchWithImplicitSurface(&crossThreadData, mockAllocation, device->getDevice(), patchToken);
+    kernel.mockKernel->patchWithImplicitSurface(&crossThreadData, mockAllocation, patchToken);
     EXPECT_EQ(0u, kernel.mockKernel->getPatchInfoDataList().size());
 }
 
diff --git a/opencl/test/unit_test/kernel/substitute_kernel_heap_tests.cpp b/opencl/test/unit_test/kernel/substitute_kernel_heap_tests.cpp
index 510dd1024e..d38ceb70d3 100644
--- a/opencl/test/unit_test/kernel/substitute_kernel_heap_tests.cpp
+++ b/opencl/test/unit_test/kernel/substitute_kernel_heap_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2020 Intel Corporation
+ * Copyright (C) 2018-2021 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,7 +34,7 @@ TEST_F(KernelSubstituteTest, givenKernelWhenSubstituteKernelHeapWithGreaterSizeT
     const size_t newHeapSize = initialHeapSize + 1;
     char newHeap[newHeapSize];
 
-    kernel.mockKernel->substituteKernelHeap(*pDevice, newHeap, newHeapSize);
+    kernel.mockKernel->substituteKernelHeap(newHeap, newHeapSize);
     auto secondAllocation = kernel.kernelInfo.kernelAllocation;
     EXPECT_NE(nullptr, secondAllocation);
     auto secondAllocationSize = secondAllocation->getUnderlyingBufferSize();
@@ -64,7 +64,7 @@ TEST_F(KernelSubstituteTest, givenKernelWhenSubstituteKernelHeapWithSameSizeThen
     const size_t newHeapSize = initialHeapSize;
     char newHeap[newHeapSize];
 
-    kernel.mockKernel->substituteKernelHeap(*pDevice, newHeap, newHeapSize);
+    kernel.mockKernel->substituteKernelHeap(newHeap, newHeapSize);
     auto secondAllocation = kernel.kernelInfo.kernelAllocation;
     EXPECT_NE(nullptr, secondAllocation);
     auto secondAllocationSize = secondAllocation->getUnderlyingBufferSize();
@@ -93,7 +93,7 @@ TEST_F(KernelSubstituteTest, givenKernelWhenSubstituteKernelHeapWithSmallerSizeT
     const size_t newHeapSize = initialHeapSize - 1;
     char newHeap[newHeapSize];
 
-    kernel.mockKernel->substituteKernelHeap(*pDevice, newHeap, newHeapSize);
+    kernel.mockKernel->substituteKernelHeap(newHeap, newHeapSize);
     auto secondAllocation = kernel.kernelInfo.kernelAllocation;
     EXPECT_NE(nullptr, secondAllocation);
     auto secondAllocationSize = secondAllocation->getUnderlyingBufferSize();
@@ -125,7 +125,7 @@ TEST_F(KernelSubstituteTest, givenKernelWithUsedKernelAllocationWhenSubstituteKe
 
     EXPECT_TRUE(commandStreamReceiver.getTemporaryAllocations().peekIsEmpty());
 
-    kernel.mockKernel->substituteKernelHeap(*pDevice, newHeap, newHeapSize);
+    kernel.mockKernel->substituteKernelHeap(newHeap, newHeapSize);
     auto secondAllocation = kernel.kernelInfo.kernelAllocation;
 
     EXPECT_FALSE(commandStreamReceiver.getTemporaryAllocations().peekIsEmpty());
diff --git a/opencl/test/unit_test/mocks/mock_program.h b/opencl/test/unit_test/mocks/mock_program.h
index 558e9e3be4..a970cd1576 100644
--- a/opencl/test/unit_test/mocks/mock_program.h
+++ b/opencl/test/unit_test/mocks/mock_program.h
@@ -11,7 +11,7 @@
 #include "shared/source/helpers/string.h"
 
 #include "opencl/source/cl_device/cl_device.h"
-#include "opencl/source/kernel/kernel.h"
+#include "opencl/source/kernel/multi_device_kernel.h"
 #include "opencl/source/program/kernel_info.h"
 #include "opencl/source/program/program.h"
 
diff --git a/shared/source/program/sync_buffer_handler.inl b/shared/source/program/sync_buffer_handler.inl
index 1e7d5d7dba..351b6b40d9 100644
--- a/shared/source/program/sync_buffer_handler.inl
+++ b/shared/source/program/sync_buffer_handler.inl
@@ -19,7 +19,7 @@ void NEO::SyncBufferHandler::prepareForEnqueue(size_t workGroupsCount, KernelT &
         usedBufferSize = 0;
     }
 
-    kernel.patchSyncBuffer(device, graphicsAllocation, usedBufferSize);
+    kernel.patchSyncBuffer(graphicsAllocation, usedBufferSize);
 
     usedBufferSize += requiredSize;
 }