mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-05 09:09:04 +08:00
Cleanup Kernel class
Move deviceVector to the MultiDeviceKernel class; remove the Device arg from Kernel's methods. Related-To: NEO-5001 Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
a86cb2d4db
commit
35ff284944
@@ -70,9 +70,7 @@ Kernel::Kernel(Program *programArg, const KernelInfo &kernelInfoArg, ClDevice &c
|
||||
executionEnvironment(programArg->getExecutionEnvironment()),
|
||||
program(programArg),
|
||||
clDevice(clDeviceArg),
|
||||
deviceVector(programArg->getDevices()),
|
||||
kernelInfo(kernelInfoArg),
|
||||
defaultRootDeviceIndex(clDeviceArg.getRootDeviceIndex()) {
|
||||
kernelInfo(kernelInfoArg) {
|
||||
program->retain();
|
||||
program->retainForKernel();
|
||||
imageTransformer.reset(new ImageTransformer);
|
||||
@@ -131,7 +129,7 @@ inline void patch(const SrcT &src, void *dst, uint32_t dstOffsetBytes) {
|
||||
*patchLocation = static_cast<DstT>(src);
|
||||
}
|
||||
|
||||
void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const Device &device, const ArgDescPointer &arg) {
|
||||
void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const ArgDescPointer &arg) {
|
||||
if ((nullptr != crossThreadData) && isValidOffset(arg.stateless)) {
|
||||
auto pp = ptrOffset(crossThreadData, arg.stateless);
|
||||
uintptr_t addressToPatch = reinterpret_cast<uintptr_t>(ptrToPatchInCrossThreadData);
|
||||
@@ -147,13 +145,13 @@ void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, Graphic
|
||||
auto surfaceState = ptrOffset(ssh, arg.bindful);
|
||||
void *addressToPatch = reinterpret_cast<void *>(allocation.getGpuAddressToPatch());
|
||||
size_t sizeToPatch = allocation.getUnderlyingBufferSize();
|
||||
Buffer::setSurfaceState(&device, surfaceState, false, false, sizeToPatch, addressToPatch, 0, &allocation, 0, 0,
|
||||
Buffer::setSurfaceState(&clDevice.getDevice(), surfaceState, false, false, sizeToPatch, addressToPatch, 0, &allocation, 0, 0,
|
||||
kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, getTotalNumDevicesInContext());
|
||||
}
|
||||
}
|
||||
|
||||
template <typename PatchTokenT>
|
||||
void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const Device &device, const PatchTokenT &patch) {
|
||||
void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const PatchTokenT &patch) {
|
||||
uint32_t pointerSize = patch.DataParamSize;
|
||||
|
||||
if (crossThreadData != nullptr) {
|
||||
@@ -173,16 +171,16 @@ void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, Graphic
|
||||
auto surfaceState = ptrOffset(ssh, sshOffset);
|
||||
void *addressToPatch = reinterpret_cast<void *>(allocation.getGpuAddressToPatch());
|
||||
size_t sizeToPatch = allocation.getUnderlyingBufferSize();
|
||||
Buffer::setSurfaceState(&device, surfaceState, false, false, sizeToPatch, addressToPatch, 0, &allocation, 0, 0,
|
||||
Buffer::setSurfaceState(&clDevice.getDevice(), surfaceState, false, false, sizeToPatch, addressToPatch, 0, &allocation, 0, 0,
|
||||
kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, getTotalNumDevicesInContext());
|
||||
}
|
||||
}
|
||||
|
||||
template void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const Device &device, const SPatchAllocateStatelessGlobalMemorySurfaceWithInitialization &patch);
|
||||
template void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const SPatchAllocateStatelessGlobalMemorySurfaceWithInitialization &patch);
|
||||
|
||||
template void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const Device &device, const SPatchAllocateStatelessPrivateSurface &patch);
|
||||
template void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const SPatchAllocateStatelessPrivateSurface &patch);
|
||||
|
||||
template void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const Device &device, const SPatchAllocateStatelessConstantMemorySurfaceWithInitialization &patch);
|
||||
template void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const SPatchAllocateStatelessConstantMemorySurfaceWithInitialization &patch);
|
||||
|
||||
cl_int Kernel::initialize() {
|
||||
this->kernelHasIndirectAccess = false;
|
||||
@@ -326,14 +324,14 @@ cl_int Kernel::initialize() {
|
||||
return CL_OUT_OF_RESOURCES;
|
||||
}
|
||||
const auto &patch = kernelDescriptor.payloadMappings.implicitArgs.privateMemoryAddress;
|
||||
patchWithImplicitSurface(reinterpret_cast<void *>(privateSurface->getGpuAddressToPatch()), *privateSurface, pClDevice->getDevice(), patch);
|
||||
patchWithImplicitSurface(reinterpret_cast<void *>(privateSurface->getGpuAddressToPatch()), *privateSurface, patch);
|
||||
}
|
||||
if (isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress.stateless)) {
|
||||
DEBUG_BREAK_IF(program->getConstantSurface(rootDeviceIndex) == nullptr);
|
||||
uintptr_t constMemory = isBuiltIn ? (uintptr_t)program->getConstantSurface(rootDeviceIndex)->getUnderlyingBuffer() : (uintptr_t)program->getConstantSurface(rootDeviceIndex)->getGpuAddressToPatch();
|
||||
|
||||
const auto &arg = kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress;
|
||||
patchWithImplicitSurface(reinterpret_cast<void *>(constMemory), *program->getConstantSurface(rootDeviceIndex), pClDevice->getDevice(), arg);
|
||||
patchWithImplicitSurface(reinterpret_cast<void *>(constMemory), *program->getConstantSurface(rootDeviceIndex), arg);
|
||||
}
|
||||
|
||||
if (isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress.stateless)) {
|
||||
@@ -341,7 +339,7 @@ cl_int Kernel::initialize() {
|
||||
uintptr_t globalMemory = isBuiltIn ? (uintptr_t)program->getGlobalSurface(rootDeviceIndex)->getUnderlyingBuffer() : (uintptr_t)program->getGlobalSurface(rootDeviceIndex)->getGpuAddressToPatch();
|
||||
|
||||
const auto &arg = kernelDescriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress;
|
||||
patchWithImplicitSurface(reinterpret_cast<void *>(globalMemory), *program->getGlobalSurface(rootDeviceIndex), pClDevice->getDevice(), arg);
|
||||
patchWithImplicitSurface(reinterpret_cast<void *>(globalMemory), *program->getGlobalSurface(rootDeviceIndex), arg);
|
||||
}
|
||||
|
||||
bool useGlobalAtomics = kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics;
|
||||
@@ -392,7 +390,6 @@ cl_int Kernel::initialize() {
|
||||
// double check this assumption
|
||||
bool usingBuffers = false;
|
||||
bool usingImages = false;
|
||||
auto &defaultKernelInfo = kernelInfo;
|
||||
kernelArguments.resize(numArgs);
|
||||
kernelArgHandlers.resize(numArgs);
|
||||
kernelArgRequiresCacheFlush.resize(numArgs);
|
||||
@@ -401,7 +398,7 @@ cl_int Kernel::initialize() {
|
||||
storeKernelArg(i, NONE_OBJ, nullptr, nullptr, 0);
|
||||
|
||||
// set the argument handler
|
||||
auto &argInfo = defaultKernelInfo.kernelArgInfo[i];
|
||||
auto &argInfo = kernelInfo.kernelArgInfo[i];
|
||||
if (argInfo.metadata.addressQualifier == KernelArgMetadata::AddrLocal) {
|
||||
kernelArgHandlers[i] = &Kernel::setArgLocal;
|
||||
} else if (argInfo.isAccelerator) {
|
||||
@@ -551,9 +548,8 @@ cl_int Kernel::getArgInfo(cl_uint argIndx, cl_kernel_arg_info paramName, size_t
|
||||
cl_int retVal;
|
||||
const void *pSrc = nullptr;
|
||||
size_t srcSize = GetInfo::invalidSourceSize;
|
||||
auto &defaultKernelInfo = kernelInfo;
|
||||
auto numArgs = static_cast<cl_uint>(defaultKernelInfo.kernelArgInfo.size());
|
||||
const auto &argInfo = defaultKernelInfo.kernelArgInfo[argIndx];
|
||||
auto numArgs = static_cast<cl_uint>(kernelInfo.kernelArgInfo.size());
|
||||
const auto &argInfo = kernelInfo.kernelArgInfo[argIndx];
|
||||
|
||||
if (argIndx >= numArgs) {
|
||||
retVal = CL_INVALID_ARG_INDEX;
|
||||
@@ -604,7 +600,7 @@ cl_int Kernel::getArgInfo(cl_uint argIndx, cl_kernel_arg_info paramName, size_t
|
||||
return retVal;
|
||||
}
|
||||
|
||||
cl_int Kernel::getWorkGroupInfo(ClDevice &device, cl_kernel_work_group_info paramName,
|
||||
cl_int Kernel::getWorkGroupInfo(cl_kernel_work_group_info paramName,
|
||||
size_t paramValueSize, void *paramValue,
|
||||
size_t *paramValueSizeRet) const {
|
||||
cl_int retVal = CL_INVALID_VALUE;
|
||||
@@ -619,7 +615,7 @@ cl_int Kernel::getWorkGroupInfo(ClDevice &device, cl_kernel_work_group_info para
|
||||
cl_ulong scratchSize;
|
||||
cl_ulong privateMemSize;
|
||||
size_t maxWorkgroupSize;
|
||||
const auto &hwInfo = device.getHardwareInfo();
|
||||
const auto &hwInfo = clDevice.getHardwareInfo();
|
||||
auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
|
||||
auto &clHwHelper = ClHwHelper::get(hwInfo.platform.eRenderCoreFamily);
|
||||
GetInfoHelper info(paramValue, paramValueSize, paramValueSizeRet);
|
||||
@@ -680,7 +676,7 @@ cl_int Kernel::getWorkGroupInfo(ClDevice &device, cl_kernel_work_group_info para
|
||||
return retVal;
|
||||
}
|
||||
|
||||
cl_int Kernel::getSubGroupInfo(ClDevice &clDevice, cl_kernel_sub_group_info paramName,
|
||||
cl_int Kernel::getSubGroupInfo(cl_kernel_sub_group_info paramName,
|
||||
size_t inputValueSize, const void *inputValue,
|
||||
size_t paramValueSize, void *paramValue,
|
||||
size_t *paramValueSizeRet) const {
|
||||
@@ -791,7 +787,7 @@ size_t Kernel::getKernelHeapSize() const {
|
||||
return kernelInfo.heapInfo.KernelHeapSize;
|
||||
}
|
||||
|
||||
void Kernel::substituteKernelHeap(const Device &device, void *newKernelHeap, size_t newKernelHeapSize) {
|
||||
void Kernel::substituteKernelHeap(void *newKernelHeap, size_t newKernelHeapSize) {
|
||||
KernelInfo *pKernelInfo = const_cast<KernelInfo *>(&kernelInfo);
|
||||
void **pKernelHeap = const_cast<void **>(&pKernelInfo->heapInfo.pKernelHeap);
|
||||
*pKernelHeap = newKernelHeap;
|
||||
@@ -807,7 +803,7 @@ void Kernel::substituteKernelHeap(const Device &device, void *newKernelHeap, siz
|
||||
} else {
|
||||
memoryManager->checkGpuUsageAndDestroyGraphicsAllocations(pKernelInfo->kernelAllocation);
|
||||
pKernelInfo->kernelAllocation = nullptr;
|
||||
status = pKernelInfo->createKernelAllocation(device, isBuiltIn);
|
||||
status = pKernelInfo->createKernelAllocation(clDevice.getDevice(), isBuiltIn);
|
||||
}
|
||||
UNRECOVERABLE_IF(!status);
|
||||
}
|
||||
@@ -864,9 +860,8 @@ cl_int Kernel::setArg(uint32_t argIndex, size_t argSize, const void *argVal) {
|
||||
cl_int retVal = CL_SUCCESS;
|
||||
bool updateExposedKernel = true;
|
||||
auto argWasUncacheable = false;
|
||||
auto &defaultKernelInfo = kernelInfo;
|
||||
if (defaultKernelInfo.builtinDispatchBuilder != nullptr) {
|
||||
updateExposedKernel = defaultKernelInfo.builtinDispatchBuilder->setExplicitArg(argIndex, argSize, argVal, retVal);
|
||||
if (kernelInfo.builtinDispatchBuilder != nullptr) {
|
||||
updateExposedKernel = kernelInfo.builtinDispatchBuilder->setExplicitArg(argIndex, argSize, argVal, retVal);
|
||||
}
|
||||
if (updateExposedKernel) {
|
||||
if (argIndex >= kernelArgHandlers.size()) {
|
||||
@@ -1068,7 +1063,7 @@ cl_int Kernel::setKernelExecutionType(cl_execution_info_kernel_type_intel execut
|
||||
}
|
||||
|
||||
void Kernel::getSuggestedLocalWorkSize(const cl_uint workDim, const size_t *globalWorkSize, const size_t *globalWorkOffset,
|
||||
size_t *localWorkSize, ClDevice &clDevice) {
|
||||
size_t *localWorkSize) {
|
||||
UNRECOVERABLE_IF((workDim == 0) || (workDim > 3));
|
||||
UNRECOVERABLE_IF(globalWorkSize == nullptr);
|
||||
Vec3<size_t> elws{0, 0, 0};
|
||||
@@ -2431,7 +2426,7 @@ bool Kernel::usesSyncBuffer() {
|
||||
return kernelInfo.kernelDescriptor.kernelAttributes.flags.usesSyncBuffer;
|
||||
}
|
||||
|
||||
void Kernel::patchSyncBuffer(Device &device, GraphicsAllocation *gfxAllocation, size_t bufferOffset) {
|
||||
void Kernel::patchSyncBuffer(GraphicsAllocation *gfxAllocation, size_t bufferOffset) {
|
||||
const auto &syncBuffer = kernelInfo.kernelDescriptor.payloadMappings.implicitArgs.syncBufferAddress;
|
||||
auto bufferPatchAddress = ptrOffset(crossThreadData, syncBuffer.stateless);
|
||||
patchWithRequiredSize(bufferPatchAddress, syncBuffer.pointerSize,
|
||||
@@ -2441,7 +2436,7 @@ void Kernel::patchSyncBuffer(Device &device, GraphicsAllocation *gfxAllocation,
|
||||
auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap()), syncBuffer.bindful);
|
||||
auto addressToPatch = gfxAllocation->getUnderlyingBuffer();
|
||||
auto sizeToPatch = gfxAllocation->getUnderlyingBufferSize();
|
||||
Buffer::setSurfaceState(&device, surfaceState, false, false, sizeToPatch, addressToPatch, 0, gfxAllocation, 0, 0,
|
||||
Buffer::setSurfaceState(&clDevice.getDevice(), surfaceState, false, false, sizeToPatch, addressToPatch, 0, gfxAllocation, 0, 0,
|
||||
kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, getTotalNumDevicesInContext());
|
||||
}
|
||||
}
|
||||
@@ -2454,13 +2449,12 @@ bool Kernel::isPatched() const {
|
||||
cl_int Kernel::checkCorrectImageAccessQualifier(cl_uint argIndex,
|
||||
size_t argSize,
|
||||
const void *argValue) const {
|
||||
auto &defaultKernelInfo = kernelInfo;
|
||||
if (defaultKernelInfo.kernelArgInfo[argIndex].isImage) {
|
||||
if (kernelInfo.kernelArgInfo[argIndex].isImage) {
|
||||
cl_mem mem = *(static_cast<const cl_mem *>(argValue));
|
||||
MemObj *pMemObj = nullptr;
|
||||
WithCastToInternal(mem, &pMemObj);
|
||||
if (pMemObj) {
|
||||
auto accessQualifier = defaultKernelInfo.kernelArgInfo[argIndex].metadata.accessQualifier;
|
||||
auto accessQualifier = kernelInfo.kernelArgInfo[argIndex].metadata.accessQualifier;
|
||||
cl_mem_flags flags = pMemObj->getFlags();
|
||||
if ((accessQualifier == KernelArgMetadata::AccessReadOnly && ((flags | CL_MEM_WRITE_ONLY) == flags)) ||
|
||||
(accessQualifier == KernelArgMetadata::AccessWriteOnly && ((flags | CL_MEM_READ_ONLY) == flags))) {
|
||||
|
||||
@@ -41,8 +41,6 @@ class Surface;
|
||||
class PrintfHandler;
|
||||
class MultiDeviceKernel;
|
||||
|
||||
using KernelInfoContainer = StackVec<const KernelInfo *, 1>;
|
||||
|
||||
class Kernel : public ReferenceTrackedObject<Kernel> {
|
||||
public:
|
||||
static const uint32_t kernelBinaryAlignement = 64;
|
||||
@@ -157,10 +155,10 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
|
||||
cl_int getArgInfo(cl_uint argIndx, cl_kernel_arg_info paramName,
|
||||
size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet) const;
|
||||
|
||||
cl_int getWorkGroupInfo(ClDevice &clDevice, cl_kernel_work_group_info paramName,
|
||||
cl_int getWorkGroupInfo(cl_kernel_work_group_info paramName,
|
||||
size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet) const;
|
||||
|
||||
cl_int getSubGroupInfo(ClDevice &device, cl_kernel_sub_group_info paramName,
|
||||
cl_int getSubGroupInfo(cl_kernel_sub_group_info paramName,
|
||||
size_t inputValueSize, const void *inputValue,
|
||||
size_t paramValueSize, void *paramValue,
|
||||
size_t *paramValueSizeRet) const;
|
||||
@@ -179,7 +177,7 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
|
||||
|
||||
void resizeSurfaceStateHeap(void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset);
|
||||
|
||||
void substituteKernelHeap(const Device &device, void *newKernelHeap, size_t newKernelHeapSize);
|
||||
void substituteKernelHeap(void *newKernelHeap, size_t newKernelHeapSize);
|
||||
bool isKernelHeapSubstituted() const;
|
||||
uint64_t getKernelId() const;
|
||||
void setKernelId(uint64_t newKernelId);
|
||||
@@ -224,7 +222,7 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
|
||||
void patchEventPool(DeviceQueue *devQueue);
|
||||
void patchBlocksSimdSize();
|
||||
bool usesSyncBuffer();
|
||||
void patchSyncBuffer(Device &device, GraphicsAllocation *gfxAllocation, size_t bufferOffset);
|
||||
void patchSyncBuffer(GraphicsAllocation *gfxAllocation, size_t bufferOffset);
|
||||
void patchBindlessSurfaceStateOffsets(const Device &device, const size_t sshOffset);
|
||||
|
||||
GraphicsAllocation *getKernelReflectionSurface() const {
|
||||
@@ -368,7 +366,7 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
|
||||
this->threadArbitrationPolicy = policy;
|
||||
}
|
||||
void getSuggestedLocalWorkSize(const cl_uint workDim, const size_t *globalWorkSize, const size_t *globalWorkOffset,
|
||||
size_t *localWorkSize, ClDevice &clDevice);
|
||||
size_t *localWorkSize);
|
||||
uint32_t getMaxWorkGroupCount(const cl_uint workDim, const size_t *localWorkSize, const CommandQueue *commandQueue) const;
|
||||
|
||||
uint64_t getKernelStartOffset(
|
||||
@@ -383,9 +381,6 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
|
||||
void setAdditionalKernelExecInfo(uint32_t additionalKernelExecInfo);
|
||||
uint32_t getAdditionalKernelExecInfo() const;
|
||||
MOCKABLE_VIRTUAL bool requiresWaDisableRccRhwoOptimization() const;
|
||||
const ClDeviceVector &getDevices() const {
|
||||
return program->getDevices();
|
||||
}
|
||||
|
||||
void setGlobalWorkOffsetValues(uint32_t globalWorkOffsetX, uint32_t globalWorkOffsetY, uint32_t globalWorkOffsetZ);
|
||||
void setGlobalWorkSizeValues(uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ);
|
||||
@@ -479,10 +474,10 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
|
||||
|
||||
void *patchBufferOffset(const KernelArgInfo &argInfo, void *svmPtr, GraphicsAllocation *svmAlloc);
|
||||
|
||||
void patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const Device &device, const ArgDescPointer &arg);
|
||||
void patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const ArgDescPointer &arg);
|
||||
// Sets-up both crossThreadData and ssh for given implicit (private/constant, etc.) allocation
|
||||
template <typename PatchTokenT>
|
||||
void patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const Device &device, const PatchTokenT &patch);
|
||||
void patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const PatchTokenT &patch);
|
||||
|
||||
void getParentObjectCounts(ObjectCounts &objectCount);
|
||||
Kernel(Program *programArg, const KernelInfo &kernelInfo, ClDevice &clDevice, bool schedulerKernel = false);
|
||||
@@ -508,7 +503,6 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
|
||||
const ExecutionEnvironment &executionEnvironment;
|
||||
Program *program;
|
||||
ClDevice &clDevice;
|
||||
const ClDeviceVector &deviceVector;
|
||||
const KernelInfo &kernelInfo;
|
||||
|
||||
std::vector<SimpleKernelArgInfo> kernelArguments;
|
||||
@@ -585,7 +579,6 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
|
||||
|
||||
GraphicsAllocation *privateSurface = nullptr;
|
||||
uint64_t privateSurfaceSize = 0u;
|
||||
const uint32_t defaultRootDeviceIndex;
|
||||
|
||||
struct KernelConfig {
|
||||
Vec3<size_t> gws;
|
||||
|
||||
@@ -19,9 +19,10 @@ MultiDeviceKernel::~MultiDeviceKernel() {
|
||||
Kernel *MultiDeviceKernel::determineDefaultKernel(KernelVectorType &kernelVector) {
|
||||
for (auto &pKernel : kernelVector) {
|
||||
if (pKernel) {
|
||||
return kernelVector[(*pKernel->getDevices().begin())->getRootDeviceIndex()];
|
||||
return pKernel;
|
||||
}
|
||||
}
|
||||
UNRECOVERABLE_IF(true);
|
||||
return nullptr;
|
||||
}
|
||||
MultiDeviceKernel::MultiDeviceKernel(KernelVectorType kernelVector, const KernelInfoContainer kernelInfosArg) : kernels(std::move(kernelVector)),
|
||||
@@ -39,7 +40,7 @@ MultiDeviceKernel::MultiDeviceKernel(KernelVectorType kernelVector, const Kernel
|
||||
const std::vector<Kernel::SimpleKernelArgInfo> &MultiDeviceKernel::getKernelArguments() const { return defaultKernel->getKernelArguments(); }
|
||||
cl_int MultiDeviceKernel::getInfo(cl_kernel_info paramName, size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet) const { return defaultKernel->getInfo(paramName, paramValueSize, paramValue, paramValueSizeRet); }
|
||||
cl_int MultiDeviceKernel::getArgInfo(cl_uint argIndx, cl_kernel_arg_info paramName, size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet) const { return defaultKernel->getArgInfo(argIndx, paramName, paramValueSize, paramValue, paramValueSizeRet); }
|
||||
const ClDeviceVector &MultiDeviceKernel::getDevices() const { return defaultKernel->getDevices(); }
|
||||
const ClDeviceVector &MultiDeviceKernel::getDevices() const { return program->getDevices(); }
|
||||
size_t MultiDeviceKernel::getKernelArgsNumber() const { return defaultKernel->getKernelArgsNumber(); }
|
||||
Context &MultiDeviceKernel::getContext() const { return defaultKernel->getContext(); }
|
||||
bool MultiDeviceKernel::getHasIndirectAccess() const { return defaultKernel->getHasIndirectAccess(); }
|
||||
|
||||
@@ -15,6 +15,7 @@ struct OpenCLObjectMapper<_cl_kernel> {
|
||||
};
|
||||
|
||||
using KernelVectorType = StackVec<Kernel *, 4>;
|
||||
using KernelInfoContainer = StackVec<const KernelInfo *, 4>;
|
||||
|
||||
class MultiDeviceKernel : public BaseObject<_cl_kernel> {
|
||||
public:
|
||||
|
||||
Reference in New Issue
Block a user