Remove device enqueue part 6

- isParentKernel, peekParentKernel, parentKernel
- structs: AUBParentKernelFixture, MockParentKernel, ParentKernelCommandQueueFixture

Related-To: NEO-6559
Signed-off-by: Katarzyna Cencelewska <katarzyna.cencelewska@intel.com>
2025-09-10 12:53:42 +08:00 · 2022-01-13 15:27:58 +00:00
parent d9aae805c7
commit 59683ec491
27 changed files with 24 additions and 895 deletions
--- a/opencl/source/command_queue/enqueue_common.h
+++ b/opencl/source/command_queue/enqueue_common.h
@ -98,7 +98,6 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface *(&surfaces)[surfaceCount
    }
    if (AuxTranslationMode::Builtin == auxTranslationMode) {
        UNRECOVERABLE_IF(kernel->isParentKernel);
        dispatchAuxTranslationBuiltin(multiDispatchInfo, AuxTranslationDirection::NonAuxToAux);
    }
@ -127,8 +126,6 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
        return;
    }
    Kernel *parentKernel = multiDispatchInfo.peekParentKernel();
    TagNodeBase *hwTimeStamps = nullptr;
    CommandStreamReceiver &computeCommandStreamReceiver = getGpgpuCommandStreamReceiver();
    auto commandStreamReceiverOwnership = computeCommandStreamReceiver.obtainUniqueOwnership();
@ -333,11 +330,6 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
    updateFromCompletionStamp(completionStamp, eventBuilder.getEvent());
    if (blockQueue) {
        if (parentKernel) {
            size_t minSizeSSHForEM = HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(*parentKernel);
            blockedCommandsData->surfaceStateHeapSizeEM = minSizeSSHForEM;
        }
        enqueueBlocked(commandType,
                       surfacesForResidency,
                       numSurfaceForResidency,
@ -414,13 +406,6 @@ void CommandQueueHw<GfxFamily>::processDispatchForKernels(const MultiDispatchInf
        hwTimeStamps = event->getHwTimeStampNode();
    }
    if (auto parentKernel = multiDispatchInfo.peekParentKernel()) {
        parentKernel->createReflectionSurface();
        parentKernel->patchDefaultDeviceQueue(context->getDefaultDeviceQueue());
        parentKernel->patchEventPool(context->getDefaultDeviceQueue());
        parentKernel->patchReflectionSurface(context->getDefaultDeviceQueue(), printfHandler.get());
    }
    if (event && this->isPerfCountersEnabled()) {
        hwPerfCounter = event->getHwPerfCounterNode();
    }
@ -761,7 +746,7 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(
        getSliceCount(),                                                                            //sliceCount
        blocking,                                                                                   //blocking
        shouldFlushDC(commandType, printfHandler) || allocNeedsFlushDC,                             //dcFlush
-        multiDispatchInfo.usesSlm() || multiDispatchInfo.peekParentKernel(),                        //useSLM
+        multiDispatchInfo.usesSlm(),                                                                //useSLM
        true,                                                                                       //guardCommandBufferWithPipeControl
        commandType == CL_COMMAND_NDRANGE_KERNEL,                                                   //GSBA32BitRequired
        requiresCoherency,                                                                          //requiresCoherency
@ -905,7 +890,7 @@ void CommandQueueHw<GfxFamily>::enqueueBlocked(
        }
        PreemptionMode preemptionMode = ClPreemptionHelper::taskPreemptionMode(getDevice(), multiDispatchInfo);
-        bool slmUsed = multiDispatchInfo.usesSlm() || multiDispatchInfo.peekParentKernel();
+        bool slmUsed = multiDispatchInfo.usesSlm();
        command = std::make_unique<CommandComputeKernel>(*this,
                                                         blockedCommandsData,
                                                         allSurfaces,
--- a/opencl/source/command_queue/enqueue_kernel.h
+++ b/opencl/source/command_queue/enqueue_kernel.h
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2021 Intel Corporation
+ * Copyright (C) 2018-2022 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -43,10 +43,6 @@ cl_int CommandQueueHw<GfxFamily>::enqueueKernel(
    auto &kernel = *pKernel;
    const auto &kernelInfo = kernel.getKernelInfo();
    if (kernel.isParentKernel && !this->context->getDefaultDeviceQueue()) {
        return CL_INVALID_OPERATION;
    }
    if (!kernel.isPatched()) {
        if (event) {
            *event = nullptr;
--- a/opencl/source/command_queue/gpgpu_walker.h
+++ b/opencl/source/command_queue/gpgpu_walker.h
@ -129,12 +129,6 @@ IndirectHeap &getIndirectHeap(CommandQueue &commandQueue, const MultiDispatchInf
    }
    // clang-format on
    if (Kernel *parentKernel = multiDispatchInfo.peekParentKernel()) {
        if (heapType == IndirectHeap::SURFACE_STATE) {
            expectedSize += HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(*parentKernel);
        }
    }
    if (ih == nullptr)
        ih = &commandQueue.getIndirectHeap(heapType, expectedSize);
--- a/opencl/source/command_queue/hardware_interface.h
+++ b/opencl/source/command_queue/hardware_interface.h
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2021 Intel Corporation
+ * Copyright (C) 2018-2022 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -49,7 +49,6 @@ class HardwareInterface {
        CommandQueue &commandQueue,
        const MultiDispatchInfo &multiDispatchInfo,
        size_t &totalInterfaceDescriptorTableSize,
        Kernel *parentKernel,
        IndirectHeap *dsh,
        LinearStream *commandStream);
--- a/opencl/source/command_queue/hardware_interface_base.inl
+++ b/opencl/source/command_queue/hardware_interface_base.inl
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2021 Intel Corporation
+ * Copyright (C) 2018-2022 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -73,7 +73,6 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
    LinearStream *commandStream = nullptr;
    IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
    auto parentKernel = multiDispatchInfo.peekParentKernel();
    auto mainKernel = multiDispatchInfo.peekMainKernel();
    auto preemptionMode = ClPreemptionHelper::taskPreemptionMode(commandQueue.getDevice(), multiDispatchInfo);
@ -125,8 +124,7 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
    size_t totalInterfaceDescriptorTableSize = sizeof(INTERFACE_DESCRIPTOR_DATA);
-    getDefaultDshSpace(offsetInterfaceDescriptorTable, commandQueue, multiDispatchInfo, totalInterfaceDescriptorTableSize,
+    getDefaultDshSpace(offsetInterfaceDescriptorTable, commandQueue, multiDispatchInfo, totalInterfaceDescriptorTableSize, dsh, commandStream);
                       parentKernel, dsh, commandStream);
    // Program media interface descriptor load
    HardwareCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
@ -255,22 +253,13 @@ void HardwareInterface<GfxFamily>::dispatchKernelCommands(CommandQueue &commandQ
 template <typename GfxFamily>
 void HardwareInterface<GfxFamily>::obtainIndirectHeaps(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo,
                                                       bool blockedQueue, IndirectHeap *&dsh, IndirectHeap *&ioh, IndirectHeap *&ssh) {
    auto parentKernel = multiDispatchInfo.peekParentKernel();
    if (blockedQueue) {
        size_t dshSize = 0;
        size_t colorCalcSize = 0;
        size_t sshSize = HardwareCommandsHelper<GfxFamily>::getTotalSizeRequiredSSH(multiDispatchInfo);
        bool iohEqualsDsh = false;
-        if (parentKernel) {
+        dshSize = HardwareCommandsHelper<GfxFamily>::getTotalSizeRequiredDSH(multiDispatchInfo);
            dshSize = commandQueue.getContext().getDefaultDeviceQueue()->getDshBuffer()->getUnderlyingBufferSize();
            sshSize += HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(*parentKernel);
            iohEqualsDsh = true;
            colorCalcSize = static_cast<size_t>(commandQueue.getContext().getDefaultDeviceQueue()->colorCalcStateSize);
        } else {
            dshSize = HardwareCommandsHelper<GfxFamily>::getTotalSizeRequiredDSH(multiDispatchInfo);
        }
        commandQueue.allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, dshSize, dsh);
        dsh->getSpace(colorCalcSize);
@ -284,12 +273,6 @@ void HardwareInterface<GfxFamily>::obtainIndirectHeaps(CommandQueue &commandQueu
                                            HardwareCommandsHelper<GfxFamily>::getTotalSizeRequiredIOH(multiDispatchInfo), ioh);
        }
    } else {
        if (parentKernel && (commandQueue.getIndirectHeap(IndirectHeap::SURFACE_STATE, 0).getUsed() > 0)) {
            commandQueue.releaseIndirectHeap(IndirectHeap::SURFACE_STATE);
            // clean reserved bindless offsets
            ssh = &getIndirectHeap<GfxFamily, IndirectHeap::SURFACE_STATE>(commandQueue, multiDispatchInfo);
            ssh->replaceBuffer(ssh->getCpuBase(), ssh->getMaxAvailableSpace());
        }
        dsh = &getIndirectHeap<GfxFamily, IndirectHeap::DYNAMIC_STATE>(commandQueue, multiDispatchInfo);
        ioh = &getIndirectHeap<GfxFamily, IndirectHeap::INDIRECT_OBJECT>(commandQueue, multiDispatchInfo);
        ssh = &getIndirectHeap<GfxFamily, IndirectHeap::SURFACE_STATE>(commandQueue, multiDispatchInfo);
--- a/opencl/source/command_queue/hardware_interface_bdw_and_later.inl
+++ b/opencl/source/command_queue/hardware_interface_bdw_and_later.inl
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2019-2021 Intel Corporation
+ * Copyright (C) 2019-2022 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -19,18 +19,13 @@ inline void HardwareInterface<GfxFamily>::getDefaultDshSpace(
    CommandQueue &commandQueue,
    const MultiDispatchInfo &multiDispatchInfo,
    size_t &totalInterfaceDescriptorTableSize,
    Kernel *parentKernel,
    IndirectHeap *dsh,
    LinearStream *commandStream) {
    size_t numDispatches = multiDispatchInfo.size();
    totalInterfaceDescriptorTableSize *= numDispatches;
-    if (!parentKernel) {
+    dsh->getSpace(totalInterfaceDescriptorTableSize);
        dsh->getSpace(totalInterfaceDescriptorTableSize);
    } else {
        dsh->getSpace(commandQueue.getContext().getDefaultDeviceQueue()->getDshOffset() - dsh->getUsed());
    }
 }
 template <typename GfxFamily>
--- a/opencl/source/command_queue/hardware_interface_xehp_and_later.inl
+++ b/opencl/source/command_queue/hardware_interface_xehp_and_later.inl
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2021 Intel Corporation
+ * Copyright (C) 2021-2022 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -24,13 +24,8 @@ inline void HardwareInterface<GfxFamily>::getDefaultDshSpace(
    CommandQueue &commandQueue,
    const MultiDispatchInfo &multiDispatchInfo,
    size_t &totalInterfaceDescriptorTableSize,
    Kernel *parentKernel,
    IndirectHeap *dsh,
    LinearStream *commandStream) {
    if (parentKernel) {
        dsh->getSpace(commandQueue.getContext().getDefaultDeviceQueue()->getDshOffset() - dsh->getUsed());
    }
 }
 template <typename GfxFamily>
--- a/opencl/source/gtpin/gtpin_callbacks.cpp
+++ b/opencl/source/gtpin/gtpin_callbacks.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2021 Intel Corporation
+ * Copyright (C) 2018-2022 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -72,7 +72,7 @@ void gtpinNotifyKernelCreate(cl_kernel kernel) {
        // Enlarge local copy of SSH by 1 SS
        GFXCORE_FAMILY genFamily = device.getHardwareInfo().platform.eRenderCoreFamily;
        GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(genFamily);
-        if (pKernel->isParentKernel || !gtpinHelper.addSurfaceState(pKernel)) {
+        if (!gtpinHelper.addSurfaceState(pKernel)) {
            // Kernel with no SSH or Kernel EM, not supported
            return;
        }
@ -117,7 +117,7 @@ void gtpinNotifyKernelSubmit(cl_kernel kernel, void *pCmdQueue) {
        auto rootDeviceIndex = device.getRootDeviceIndex();
        auto pMultiDeviceKernel = castToObjectOrAbort<MultiDeviceKernel>(kernel);
        auto pKernel = pMultiDeviceKernel->getKernel(rootDeviceIndex);
-        if (pKernel->isParentKernel || pKernel->getSurfaceStateHeapSize() == 0) {
+        if (pKernel->getSurfaceStateHeapSize() == 0) {
            // Kernel with no SSH, not supported
            return;
        }
--- a/opencl/source/gtpin/gtpin_hw_helper.inl
+++ b/opencl/source/gtpin/gtpin_hw_helper.inl
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2021 Intel Corporation
+ * Copyright (C) 2018-2022 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -20,7 +20,7 @@ bool GTPinHwHelperHw<GfxFamily>::addSurfaceState(Kernel *pKernel) {
    using BINDING_TABLE_STATE = typename GfxFamily::BINDING_TABLE_STATE;
    size_t sshSize = pKernel->getSurfaceStateHeapSize();
-    if ((sshSize == 0) || pKernel->isParentKernel) {
+    if (sshSize == 0) {
        // Kernels which do not use SSH or use Execution Model are not supported (yet)
        return false;
    }
--- a/opencl/source/helpers/dispatch_info.cpp
+++ b/opencl/source/helpers/dispatch_info.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2021 Intel Corporation
+ * Copyright (C) 2018-2022 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -33,10 +33,6 @@ Kernel *MultiDispatchInfo::peekMainKernel() const {
    return mainKernel ? mainKernel : dispatchInfos.begin()->getKernel();
 }
 // Returns mainKernel when it is set and its isParentKernel flag is true;
 // returns nullptr otherwise (including when no main kernel has been set).
 Kernel *MultiDispatchInfo::peekParentKernel() const {
    return (mainKernel && mainKernel->isParentKernel) ? mainKernel : nullptr;
 }
 void MultiDispatchInfo::backupUnifiedMemorySyncRequirement() {
    for (const auto &dispatchInfo : dispatchInfos) {
        dispatchInfo.getKernel()->setUnifiedMemorySyncRequirement(true);
--- a/opencl/source/helpers/dispatch_info.h
+++ b/opencl/source/helpers/dispatch_info.h
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2021 Intel Corporation
+ * Copyright (C) 2018-2022 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -187,7 +187,6 @@ struct MultiDispatchInfo {
        redescribedSurfaces.push_back(memObj.release());
    }
    Kernel *peekParentKernel() const;
    Kernel *peekMainKernel() const;
    void setBuiltinOpParams(const BuiltinOpParams &builtinOpParams) {
--- a/opencl/source/helpers/task_information.cpp
+++ b/opencl/source/helpers/task_information.cpp
@ -140,20 +140,10 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
        return completionStamp;
    }
    auto &commandStreamReceiver = commandQueue.getGpgpuCommandStreamReceiver();
    bool executionModelKernel = kernel->isParentKernel;
    auto devQueue = commandQueue.getContext().getDefaultDeviceQueue();
    auto bcsCsrForAuxTranslation = commandQueue.getBcsForAuxTranslation();
    auto commandStreamReceiverOwnership = commandStreamReceiver.obtainUniqueOwnership();
    if (executionModelKernel) {
        while (!devQueue->isEMCriticalSectionFree())
            ;
        devQueue->resetDeviceQueue();
        devQueue->acquireEMCriticalSection();
    }
    IndirectHeap *dsh = kernelOperation->dsh.get();
    IndirectHeap *ioh = kernelOperation->ioh.get();
    IndirectHeap *ssh = kernelOperation->ssh.get();
--- a/opencl/source/kernel/kernel.cpp
+++ b/opencl/source/kernel/kernel.cpp
@ -68,8 +68,7 @@ class Surface;
 uint32_t Kernel::dummyPatchLocation = 0xbaddf00d;
 Kernel::Kernel(Program *programArg, const KernelInfo &kernelInfoArg, ClDevice &clDeviceArg)
-    : isParentKernel(kernelInfoArg.kernelDescriptor.kernelAttributes.flags.usesDeviceSideEnqueue),
+    : executionEnvironment(programArg->getExecutionEnvironment()),
      executionEnvironment(programArg->getExecutionEnvironment()),
      program(programArg),
      clDevice(clDeviceArg),
      kernelInfo(kernelInfoArg) {
@ -262,9 +261,6 @@ cl_int Kernel::initialize() {
        program->getContextPtr()->setResolvesRequiredInKernels(true);
    }
    if (isParentKernel) {
        program->allocateBlockPrivateSurfaces(*pClDevice);
    }
    if (program->isKernelDebugEnabled() && isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.systemThreadSurfaceAddress.bindful)) {
        debugEnabled = true;
    }
@ -1791,129 +1787,6 @@ void Kernel::unsetArg(uint32_t argIndex) {
 void Kernel::createReflectionSurface() {
    auto pClDevice = &clDevice;
    if (this->isParentKernel && kernelReflectionSurface == nullptr) {
        auto &hwInfo = pClDevice->getHardwareInfo();
        auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
        BlockKernelManager *blockManager = program->getBlockKernelManager();
        uint32_t blockCount = static_cast<uint32_t>(blockManager->getCount());
        ObjectCounts objectCount;
        getParentObjectCounts(objectCount);
        uint32_t parentImageCount = objectCount.imageCount;
        uint32_t parentSamplerCount = objectCount.samplerCount;
        size_t maxConstantBufferSize = 0;
        std::vector<IGIL_KernelCurbeParams> *curbeParamsForBlocks = new std::vector<IGIL_KernelCurbeParams>[blockCount];
        uint64_t *tokenMask = new uint64_t[blockCount];
        uint32_t *sshTokenOffsetsFromKernelData = new uint32_t[blockCount];
        size_t kernelReflectionSize = alignUp(sizeof(IGIL_KernelDataHeader) + blockCount * sizeof(IGIL_KernelAddressData), sizeof(void *));
        uint32_t kernelDataOffset = static_cast<uint32_t>(kernelReflectionSize);
        uint32_t parentSSHAlignedSize = alignUp(this->kernelInfo.heapInfo.SurfaceStateHeapSize, hwHelper.getBindingTableStateAlignement());
        uint32_t btOffset = parentSSHAlignedSize;
        for (uint32_t i = 0; i < blockCount; i++) {
            const KernelInfo *pBlockInfo = blockManager->getBlockKernelInfo(i);
            size_t samplerStateAndBorderColorSize = 0;
            uint32_t firstSSHTokenIndex = 0;
            ReflectionSurfaceHelper::getCurbeParams(curbeParamsForBlocks[i], tokenMask[i], firstSSHTokenIndex, *pBlockInfo, hwInfo);
            maxConstantBufferSize = std::max(maxConstantBufferSize, static_cast<size_t>(pBlockInfo->kernelDescriptor.kernelAttributes.crossThreadDataSize));
            samplerStateAndBorderColorSize = pBlockInfo->getSamplerStateArraySize(hwInfo);
            samplerStateAndBorderColorSize = alignUp(samplerStateAndBorderColorSize, Sampler::samplerStateArrayAlignment);
            samplerStateAndBorderColorSize += pBlockInfo->getBorderColorStateSize();
            samplerStateAndBorderColorSize = alignUp(samplerStateAndBorderColorSize, sizeof(void *));
            sshTokenOffsetsFromKernelData[i] = offsetof(IGIL_KernelData, m_data) + sizeof(IGIL_KernelCurbeParams) * firstSSHTokenIndex;
            kernelReflectionSize += alignUp(sizeof(IGIL_KernelData) + sizeof(IGIL_KernelCurbeParams) * curbeParamsForBlocks[i].size(), sizeof(void *));
            kernelReflectionSize += parentSamplerCount * sizeof(IGIL_SamplerParams) + samplerStateAndBorderColorSize;
        }
        maxConstantBufferSize = alignUp(maxConstantBufferSize, sizeof(void *));
        kernelReflectionSize += blockCount * alignUp(maxConstantBufferSize, sizeof(void *));
        kernelReflectionSize += parentImageCount * sizeof(IGIL_ImageParamters);
        kernelReflectionSize += parentSamplerCount * sizeof(IGIL_ParentSamplerParams);
        kernelReflectionSurface = executionEnvironment.memoryManager->allocateGraphicsMemoryWithProperties(
            {pClDevice->getRootDeviceIndex(), kernelReflectionSize,
             GraphicsAllocation::AllocationType::DEVICE_QUEUE_BUFFER,
             pClDevice->getDeviceBitfield()});
        for (uint32_t i = 0; i < blockCount; i++) {
            const KernelInfo *pBlockInfo = blockManager->getBlockKernelInfo(i);
            uint32_t newKernelDataOffset = ReflectionSurfaceHelper::setKernelData(kernelReflectionSurface->getUnderlyingBuffer(),
                                                                                  kernelDataOffset,
                                                                                  curbeParamsForBlocks[i],
                                                                                  tokenMask[i],
                                                                                  maxConstantBufferSize,
                                                                                  parentSamplerCount,
                                                                                  *pBlockInfo,
                                                                                  hwInfo);
            uint32_t offset = static_cast<uint32_t>(offsetof(IGIL_KernelDataHeader, m_data) + sizeof(IGIL_KernelAddressData) * i);
            uint32_t samplerHeapOffset = static_cast<uint32_t>(alignUp(kernelDataOffset + sizeof(IGIL_KernelData) + curbeParamsForBlocks[i].size() * sizeof(IGIL_KernelCurbeParams), sizeof(void *)));
            uint32_t samplerHeapSize = static_cast<uint32_t>(alignUp(pBlockInfo->getSamplerStateArraySize(hwInfo), Sampler::samplerStateArrayAlignment) + pBlockInfo->getBorderColorStateSize());
            uint32_t constantBufferOffset = alignUp(samplerHeapOffset + samplerHeapSize, sizeof(void *));
            uint32_t samplerParamsOffset = 0;
            if (parentSamplerCount) {
                samplerParamsOffset = newKernelDataOffset - sizeof(IGIL_SamplerParams) * parentSamplerCount;
                IGIL_SamplerParams *pSamplerParams = (IGIL_SamplerParams *)ptrOffset(kernelReflectionSurface->getUnderlyingBuffer(), samplerParamsOffset);
                uint32_t sampler = 0;
                const auto &args = pBlockInfo->kernelDescriptor.payloadMappings.explicitArgs;
                for (uint32_t argID = 0; argID < args.size(); argID++) {
                    if (args[argID].is<ArgDescriptor::ArgTSampler>()) {
                        pSamplerParams[sampler].m_ArgID = argID;
                        pSamplerParams[sampler].m_SamplerStateOffset = args[argID].as<ArgDescSampler>().bindful;
                        sampler++;
                    }
                }
            }
            ReflectionSurfaceHelper::setKernelAddressData(kernelReflectionSurface->getUnderlyingBuffer(),
                                                          offset,
                                                          kernelDataOffset,
                                                          samplerHeapOffset,
                                                          constantBufferOffset,
                                                          samplerParamsOffset,
                                                          sshTokenOffsetsFromKernelData[i] + kernelDataOffset,
                                                          btOffset,
                                                          *pBlockInfo,
                                                          hwInfo);
            if (samplerHeapSize > 0) {
                void *pDst = ptrOffset(kernelReflectionSurface->getUnderlyingBuffer(), samplerHeapOffset);
                const void *pSrc = ptrOffset(pBlockInfo->heapInfo.pDsh, pBlockInfo->getBorderColorOffset());
                memcpy_s(pDst, samplerHeapSize, pSrc, samplerHeapSize);
            }
            void *pDst = ptrOffset(kernelReflectionSurface->getUnderlyingBuffer(), constantBufferOffset);
            const char *pSrc = pBlockInfo->crossThreadData;
            memcpy_s(pDst, pBlockInfo->getConstantBufferSize(), pSrc, pBlockInfo->getConstantBufferSize());
            btOffset += pBlockInfo->kernelDescriptor.payloadMappings.bindingTable.tableOffset;
            kernelDataOffset = newKernelDataOffset;
        }
        uint32_t samplerOffset = 0;
        if (parentSamplerCount) {
            samplerOffset = kernelDataOffset + parentImageCount * sizeof(IGIL_ImageParamters);
        }
        ReflectionSurfaceHelper::setKernelDataHeader(kernelReflectionSurface->getUnderlyingBuffer(), blockCount, parentImageCount, parentSamplerCount, kernelDataOffset, samplerOffset);
        delete[] curbeParamsForBlocks;
        delete[] tokenMask;
        delete[] sshTokenOffsetsFromKernelData;
        // Patch constant values once after reflection surface creation
        patchBlocksCurbeWithConstantValues();
    }
    if (DebugManager.flags.ForceDispatchScheduler.get()) {
        if (kernelReflectionSurface == nullptr) {
            kernelReflectionSurface = executionEnvironment.memoryManager->allocateGraphicsMemoryWithProperties(
@ -1927,7 +1800,6 @@ void Kernel::createReflectionSurface() {
 void Kernel::getParentObjectCounts(ObjectCounts &objectCount) {
    objectCount.imageCount = 0;
    objectCount.samplerCount = 0;
    DEBUG_BREAK_IF(!isParentKernel);
    for (const auto &arg : this->kernelArguments) {
        if (arg.type == SAMPLER_OBJ) {
@ -1942,22 +1814,6 @@ bool Kernel::hasPrintfOutput() const {
    return kernelInfo.kernelDescriptor.kernelAttributes.flags.usesPrintf;
 }
 // Computes the instruction heap size needed for the block kernels of a parent kernel.
 // Returns 0 when this kernel is not a parent kernel. Otherwise starts from
 // (kernelBinaryAlignment - 1) and, for each block kernel reported by the program's
 // BlockKernelManager, adds its KernelHeapSize and rounds the running total up to
 // kernelBinaryAlignment.
 size_t Kernel::getInstructionHeapSizeForExecutionModel() const {
    BlockKernelManager *blockManager = program->getBlockKernelManager();
    uint32_t blockCount = static_cast<uint32_t>(blockManager->getCount());
    size_t totalSize = 0;
    if (isParentKernel) {
        totalSize = kernelBinaryAlignment - 1; // for initial alignment
        for (uint32_t i = 0; i < blockCount; i++) {
            const KernelInfo *pBlockInfo = blockManager->getBlockKernelInfo(i);
            totalSize += pBlockInfo->heapInfo.KernelHeapSize;
            // Align after every block, so each block's contribution is padded to the alignment.
            totalSize = alignUp(totalSize, kernelBinaryAlignment);
        }
    }
    return totalSize;
 }
 void Kernel::patchBlocksCurbeWithConstantValues() {
    auto rootDeviceIndex = clDevice.getRootDeviceIndex();
    BlockKernelManager *blockManager = program->getBlockKernelManager();
@ -2622,10 +2478,6 @@ void Kernel::setReflectionSurfaceBlockBtOffset(uint32_t blockID, uint32_t offset
    ReflectionSurfaceHelper::setKernelAddressDataBtOffset(getKernelReflectionSurface()->getUnderlyingBuffer(), blockID, offset);
 }
 // Returns true only when this kernel is a parent kernel and the program's
 // BlockKernelManager reports getIfBlockUsesPrintf(). The block manager is not
 // queried for non-parent kernels because && short-circuits.
 bool Kernel::checkIfIsParentKernelAndBlocksUsesPrintf() {
    return isParentKernel && getProgram()->getBlockKernelManager()->getIfBlockUsesPrintf();
 }
 uint64_t Kernel::getKernelStartOffset(
    const bool localIdsGenerationByRuntime,
    const bool kernelUsesLocalIds,
--- a/opencl/source/kernel/kernel.h
+++ b/opencl/source/kernel/kernel.h
@ -236,8 +236,6 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
        return kernelReflectionSurface;
    }
    size_t getInstructionHeapSizeForExecutionModel() const;
    // Helpers
    cl_int setArg(uint32_t argIndex, uint32_t argValue);
    cl_int setArg(uint32_t argIndex, uint64_t argValue);
@ -324,7 +322,6 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
    uint32_t allBufferArgsStateful = CL_TRUE;
    bool isBuiltIn = false;
    const bool isParentKernel;
    uint32_t getThreadArbitrationPolicy() const {
        return threadArbitrationPolicy;
@ -333,8 +330,6 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
        return executionType;
    }
    bool checkIfIsParentKernelAndBlocksUsesPrintf();
    bool is32Bit() const {
        return kernelInfo.kernelDescriptor.kernelAttributes.gpuPointerSize == 4;
    }
--- a/opencl/source/program/printf_handler.cpp
+++ b/opencl/source/program/printf_handler.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2021 Intel Corporation
+ * Copyright (C) 2018-2022 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -38,7 +38,7 @@ PrintfHandler *PrintfHandler::create(const MultiDispatchInfo &multiDispatchInfo,
    }
    auto mainKernel = multiDispatchInfo.peekMainKernel();
    if (mainKernel != nullptr) {
-        if (mainKernel->checkIfIsParentKernelAndBlocksUsesPrintf() || mainKernel->getImplicitArgs()) {
+        if (mainKernel->getImplicitArgs()) {
            return new PrintfHandler(device);
        }
    }
--- a/opencl/test/unit_test/aub_tests/fixtures/CMakeLists.txt
+++ b/opencl/test/unit_test/aub_tests/fixtures/CMakeLists.txt
@ -1,5 +1,5 @@
 #
-# Copyright (C) 2018-2021 Intel Corporation
+# Copyright (C) 2018-2022 Intel Corporation
 #
 # SPDX-License-Identifier: MIT
 #
@ -8,7 +8,6 @@ target_sources(igdrcl_aub_tests PRIVATE
               ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
               ${CMAKE_CURRENT_SOURCE_DIR}/aub_fixture.cpp
               ${CMAKE_CURRENT_SOURCE_DIR}/aub_fixture.h
               ${CMAKE_CURRENT_SOURCE_DIR}/aub_parent_kernel_fixture.h
               ${CMAKE_CURRENT_SOURCE_DIR}/hello_world_fixture.h
               ${CMAKE_CURRENT_SOURCE_DIR}/run_kernel_fixture.h
               ${CMAKE_CURRENT_SOURCE_DIR}/simple_arg_fixture.h
--- a/opencl/test/unit_test/aub_tests/fixtures/aub_parent_kernel_fixture.h
+++ b/opencl/test/unit_test/aub_tests/fixtures/aub_parent_kernel_fixture.h
@ -1,37 +0,0 @@
 /*
 * Copyright (C) 2018-2021 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */
 #pragma once
 #include "opencl/test/unit_test/aub_tests/command_queue/command_enqueue_fixture.h"
 #include "opencl/test/unit_test/fixtures/hello_world_kernel_fixture.h"
 #include "opencl/test/unit_test/test_macros/test_checks_ocl.h"
 namespace NEO {
 static const char programFile[] = "simple_block_kernel";
 static const char kernelName[] = "kernel_reflection";
 // Test fixture that combines CommandEnqueueAUBFixture and HelloWorldKernelFixture.
 // It sets up the kernel named by kernelName ("kernel_reflection") from the program
 // named by programFile ("simple_block_kernel"), built with "-cl-std=CL2.0".
 class AUBParentKernelFixture : public CommandEnqueueAUBFixture,
                               public HelloWorldKernelFixture,
                               public testing::Test {
  public:
    // Keep the base SetUp overloads visible next to the SetUp() override below.
    using HelloWorldKernelFixture::SetUp;
    void SetUp() override {
        // NOTE(review): presumably skips the test when OpenCL 2.1 is unsupported on defaultHwInfo - confirm against the macro.
        REQUIRE_OCL_21_OR_SKIP(defaultHwInfo);
        CommandEnqueueAUBFixture::SetUp();
        ASSERT_NE(nullptr, pClDevice);
        HelloWorldKernelFixture::SetUp(pClDevice, programFile, kernelName, "-cl-std=CL2.0");
    }
    void TearDown() override {
        // A skipped test returned from SetUp() before either base fixture was set up,
        // so there is nothing to tear down.
        if (IsSkipped()) {
            return;
        }
        // Tear down in the reverse order of SetUp().
        HelloWorldKernelFixture::TearDown();
        CommandEnqueueAUBFixture::TearDown();
    }
 };
 } // namespace NEO
--- a/opencl/test/unit_test/fixtures/CMakeLists.txt
+++ b/opencl/test/unit_test/fixtures/CMakeLists.txt
@ -19,7 +19,6 @@ set(IGDRCL_SRCS_tests_fixtures
    ${CMAKE_CURRENT_SOURCE_DIR}/device_instrumentation_fixture.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/device_instrumentation_fixture.h
    ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_handler_fixture.h
    ${CMAKE_CURRENT_SOURCE_DIR}/execution_model_fixture.h
    ${CMAKE_CURRENT_SOURCE_DIR}/hello_world_fixture.h
    ${CMAKE_CURRENT_SOURCE_DIR}/hello_world_kernel_fixture.h
    ${CMAKE_CURRENT_SOURCE_DIR}/image_fixture.cpp
--- a/opencl/test/unit_test/fixtures/execution_model_fixture.h
+++ b/opencl/test/unit_test/fixtures/execution_model_fixture.h
@ -1,39 +0,0 @@
 /*
 * Copyright (C) 2018-2022 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */
 #pragma once
 #include "shared/test/common/helpers/debug_manager_state_restore.h"
 #include "opencl/source/device_queue/device_queue.h"
 #include "opencl/test/unit_test/command_queue/command_queue_fixture.h"
 #include "opencl/test/unit_test/mocks/mock_cl_device.h"
 #include "opencl/test/unit_test/mocks/mock_kernel.h"
 #include "opencl/test/unit_test/test_macros/test_checks_ocl.h"
 // Test fixture built on CommandQueueHwFixture. It creates its own MockClDevice in
 // SetUp() and deletes it in TearDown().
 struct ParentKernelCommandQueueFixture : public CommandQueueHwFixture,
                                         testing::Test {
    void SetUp() override {
        // The device is allocated with new here and released with delete in TearDown().
        device = new MockClDevice{MockDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr, rootDeviceIndex)};
        CommandQueueHwFixture::SetUp(device, 0);
    }
    void TearDown() override {
        CommandQueueHwFixture::TearDown();
        delete device;
    }
    // Builds a KernelOperation from a newly allocated LinearStream and the internal
    // allocation storage of the queue's GPGPU command stream receiver.
    // NOTE(review): commandStream is allocated with new and not freed here; presumably
    // KernelOperation takes ownership - confirm.
    std::unique_ptr<KernelOperation> createBlockedCommandsData(CommandQueue &commandQueue) {
        auto commandStream = new LinearStream();
        auto &gpgpuCsr = commandQueue.getGpgpuCommandStreamReceiver();
        gpgpuCsr.ensureCommandBufferAllocation(*commandStream, 1, 1);
        return std::make_unique<KernelOperation>(commandStream, *gpgpuCsr.getInternalAllocationStorage());
    }
    // Root device index passed to the mock device factory in SetUp().
    const uint32_t rootDeviceIndex = 0u;
 };
--- a/opencl/test/unit_test/gtpin/gtpin_tests.cpp
+++ b/opencl/test/unit_test/gtpin/gtpin_tests.cpp
@ -2033,23 +2033,6 @@ TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenLowMemoryConditionOccursThe
    injectFailures(allocBufferFunc);
 }
 // Expects GTPinHwHelper::addSurfaceState to return false for a parent kernel
 // and the kernel's surface state heap size to be the same before and after.
 TEST_F(GTPinTests, givenParentKernelWhenGtPinAddingSurfaceStateThenItIsNotAddedAndFalseIsReturned) {
    GFXCORE_FAMILY coreFamily = pDevice->getHardwareInfo().platform.eRenderCoreFamily;
    GTPinHwHelper &helper = GTPinHwHelper::get(coreFamily);
    std::unique_ptr<MockParentKernel> kernel(MockParentKernel::create(*pContext));
    // Give the kernel a 64-byte local SSH so there is a size to compare.
    kernel->sshLocalSize = 64;
    kernel->pSshLocal.reset(new char[64]);
    const size_t sshSizeBefore = kernel->getSurfaceStateHeapSize();
    const bool wasAdded = helper.addSurfaceState(kernel.get());
    EXPECT_FALSE(wasAdded);
    const size_t sshSizeAfter = kernel->getSurfaceStateHeapSize();
    EXPECT_EQ(sshSizeAfter, sshSizeBefore);
 }
 TEST_F(GTPinTests, givenKernelWithSSHThenVerifyThatSSHResizeWorksWell) {
    cl_kernel kernel = nullptr;
    cl_program pProgram = nullptr;
--- a/opencl/test/unit_test/helpers/dispatch_info_tests.cpp
+++ b/opencl/test/unit_test/helpers/dispatch_info_tests.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2021 Intel Corporation
+ * Copyright (C) 2018-2022 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -266,46 +266,32 @@ TEST_F(DispatchInfoTest, WhenSettingValuesInDispatchInfoThenThoseValuesAreSet) {
    EXPECT_EQ(swgs, dispatchInfo.getStartOfWorkgroups());
 }
-TEST_F(DispatchInfoTest, givenKernelWhenMultiDispatchInfoIsCreatedThenQueryParentAndMainKernel) {
+TEST_F(DispatchInfoTest, givenKernelWhenMultiDispatchInfoIsCreatedThenQueryMainKernel) {
    std::unique_ptr<MockParentKernel> parentKernel(MockParentKernel::create(*pContext));
    std::unique_ptr<MockKernel> baseKernel(MockKernel::create(*pDevice, pProgram));
    std::unique_ptr<MockKernel> builtInKernel(MockKernel::create(*pDevice, pProgram));
    builtInKernel->isBuiltIn = true;
    DispatchInfo parentKernelDispatchInfo(pClDevice, parentKernel.get(), 1, {1, 1, 1}, {1, 1, 1}, {1, 1, 1});
    DispatchInfo baseDispatchInfo(pClDevice, baseKernel.get(), 1, {1, 1, 1}, {1, 1, 1}, {1, 1, 1});
    DispatchInfo builtInDispatchInfo(pClDevice, builtInKernel.get(), 1, {1, 1, 1}, {1, 1, 1}, {1, 1, 1});
    {
        MultiDispatchInfo multiDispatchInfo(parentKernel.get());
        multiDispatchInfo.push(parentKernelDispatchInfo);
        EXPECT_EQ(parentKernel.get(), multiDispatchInfo.peekParentKernel());
        EXPECT_EQ(parentKernel.get(), multiDispatchInfo.peekMainKernel());
    }
    {
        MultiDispatchInfo multiDispatchInfo(baseKernel.get());
        multiDispatchInfo.push(builtInDispatchInfo);
        EXPECT_EQ(nullptr, multiDispatchInfo.peekParentKernel());
        EXPECT_EQ(baseKernel.get(), multiDispatchInfo.peekMainKernel()); // dont pick builtin kernel
        multiDispatchInfo.push(baseDispatchInfo);
        EXPECT_EQ(nullptr, multiDispatchInfo.peekParentKernel());
        EXPECT_EQ(baseKernel.get(), multiDispatchInfo.peekMainKernel());
    }
    {
        MultiDispatchInfo multiDispatchInfo;
        EXPECT_EQ(nullptr, multiDispatchInfo.peekParentKernel());
        EXPECT_EQ(nullptr, multiDispatchInfo.peekMainKernel());
        multiDispatchInfo.push(builtInDispatchInfo);
        EXPECT_EQ(nullptr, multiDispatchInfo.peekParentKernel());
        EXPECT_EQ(builtInKernel.get(), multiDispatchInfo.peekMainKernel());
    }
    {
        MultiDispatchInfo multiDispatchInfo;
        multiDispatchInfo.push(parentKernelDispatchInfo);
        multiDispatchInfo.push(baseDispatchInfo);
        multiDispatchInfo.push(builtInDispatchInfo);
--- a/opencl/test/unit_test/kernel/CMakeLists.txt
+++ b/opencl/test/unit_test/kernel/CMakeLists.txt
@ -27,7 +27,6 @@ set(IGDRCL_SRCS_tests_kernel
    ${CMAKE_CURRENT_SOURCE_DIR}/kernel_tests.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/kernel_transformable_tests.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/debug_kernel_tests.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/parent_kernel_tests.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/substitute_kernel_heap_tests.cpp
 )
--- a/opencl/test/unit_test/kernel/kernel_reflection_surface_tests.cpp
+++ b/opencl/test/unit_test/kernel/kernel_reflection_surface_tests.cpp
@ -13,7 +13,6 @@
 #include "opencl/source/kernel/kernel.h"
 #include "opencl/source/program/printf_handler.h"
 #include "opencl/source/sampler/sampler.h"
 #include "opencl/test/unit_test/fixtures/execution_model_fixture.h"
 #include "opencl/test/unit_test/fixtures/image_fixture.h"
 #include "opencl/test/unit_test/fixtures/multi_root_device_fixture.h"
 #include "opencl/test/unit_test/fixtures/platform_fixture.h"
@ -32,21 +31,6 @@
 using namespace NEO;
 // A kernel that is not a parent kernel must end up without a reflection surface
 // after createReflectionSurface() is called.
 TEST(KernelReflectionSurfaceTestSingle, GivenNonParentKernelWhenCreatingKernelReflectionSurfaceThenKernelReflectionSurfaceIsNotCreated) {
    MockClDevice clDevice{new MockDevice};
    MockProgram mockProgram(toClDeviceVector(clDevice));
    KernelInfo kernelInfo;
    MockKernel mockKernel(&mockProgram, kernelInfo, clDevice);
    EXPECT_FALSE(mockKernel.isParentKernel);
    mockKernel.createReflectionSurface();
    EXPECT_EQ(nullptr, mockKernel.getKernelReflectionSurface());
 }
 class ReflectionSurfaceHelperTest : public testing::TestWithParam<std::tuple<const IGIL_KernelCurbeParams, const IGIL_KernelCurbeParams, bool>> {
  protected:
@ -678,168 +662,3 @@ TEST_F(ReflectionSurfaceHelperFixture, GivenUndefinedOffsetsWhenPatchingBlocksCu
    EXPECT_THAT(patchedValues, MemCompare(reference.get(), 10 * sizeof(IGIL_KernelDataHeader) - constBufferOffset));
 }
 class ReflectionSurfaceConstantValuesPatchingTest : public ClDeviceFixture,
                                                    public ::testing::Test {
  public:
    void SetUp() override {
        ClDeviceFixture::SetUp();
    }
    void TearDown() override {
        ClDeviceFixture::TearDown();
    }
 };
 // Expects the block's global-variables slot inside the reflection surface to hold
 // the GPU address of the program's global surface after patching.
 TEST_F(ReflectionSurfaceConstantValuesPatchingTest, GivenBlockWithGlobalMemoryWhenReflectionSurfaceIsPatchedWithConstantValuesThenProgramGlobalMemoryAddressIsPatched) {
    MockContext context(pClDevice);
    MockParentKernel::CreateParams params{};
    params.addChildGlobalMemory = true;
    std::unique_ptr<MockParentKernel> kernel(MockParentKernel::create(context, params));
    auto memoryManager = pDevice->getMemoryManager();
    // graphicsMemory is released by Program
    GraphicsAllocation *globalSurface = memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{pDevice->getRootDeviceIndex(), MemoryConstants::pageSize});
    kernel->mockProgram->setGlobalSurface(globalSurface);
    // Allocate the reflection surface; two pages should be enough
    GraphicsAllocation *surface = memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{pDevice->getRootDeviceIndex(), 2 * MemoryConstants::pageSize});
    kernel->setReflectionSurface(surface);
    void *surfaceBase = surface->getUnderlyingBuffer();
    memset(surfaceBase, 0, surface->getUnderlyingBufferSize());
    // Place the single block's constant buffer right after the header structures, 8-byte aligned.
    const uint32_t constBufferOffset = static_cast<uint32_t>(alignUp(sizeof(IGIL_KernelDataHeader) + sizeof(IGIL_KernelAddressData) + sizeof(IGIL_KernelData) + sizeof(IGIL_KernelCurbeParams), sizeof(uint64_t)));
    auto header = reinterpret_cast<IGIL_KernelDataHeader *>(surfaceBase);
    header->m_numberOfKernels = 1;
    header->m_data[0].m_ConstantBufferOffset = constBufferOffset;
    kernel->patchBlocksCurbeWithConstantValues();
    auto *blockInfo = kernel->mockProgram->blockKernelManager->getBlockKernelInfo(0);
    const uint32_t patchOffset = blockInfo->kernelDescriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress.stateless;
    auto patchedSlot = reinterpret_cast<uint64_t *>(ptrOffset(surface->getUnderlyingBuffer(), constBufferOffset + patchOffset));
    EXPECT_EQ(globalSurface->getGpuAddressToPatch(), *patchedSlot);
 }
 // Expects the block's global-variables slot to read zero after patching when the
 // program has no global surface.
 TEST_F(ReflectionSurfaceConstantValuesPatchingTest, GivenBlockWithGlobalMemoryAndProgramWithoutGlobalMemortWhenReflectionSurfaceIsPatchedWithConstantValuesThenZeroAddressIsPatched) {
    MockContext context(pClDevice);
    MockParentKernel::CreateParams params{};
    params.addChildGlobalMemory = true;
    std::unique_ptr<MockParentKernel> kernel(MockParentKernel::create(context, params));
    // Make sure the program holds no global surface before patching.
    if (kernel->mockProgram->getGlobalSurface(pClDevice->getRootDeviceIndex())) {
        auto staleSurface = kernel->mockProgram->getGlobalSurface(pClDevice->getRootDeviceIndex());
        pDevice->getMemoryManager()->freeGraphicsMemory(staleSurface);
        kernel->mockProgram->setGlobalSurface(nullptr);
    }
    // Allocate the reflection surface; two pages should be enough
    GraphicsAllocation *surface = pDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties(MockAllocationProperties{pDevice->getRootDeviceIndex(), 2 * MemoryConstants::pageSize});
    kernel->setReflectionSurface(surface);
    void *surfaceBase = surface->getUnderlyingBuffer();
    memset(surfaceBase, 0, surface->getUnderlyingBufferSize());
    // Place the single block's constant buffer right after the header structures, 8-byte aligned.
    const uint32_t constBufferOffset = static_cast<uint32_t>(alignUp(sizeof(IGIL_KernelDataHeader) + sizeof(IGIL_KernelAddressData) + sizeof(IGIL_KernelData) + sizeof(IGIL_KernelCurbeParams), sizeof(uint64_t)));
    auto header = reinterpret_cast<IGIL_KernelDataHeader *>(surfaceBase);
    header->m_numberOfKernels = 1;
    header->m_data[0].m_ConstantBufferOffset = constBufferOffset;
    kernel->patchBlocksCurbeWithConstantValues();
    auto *blockInfo = kernel->mockProgram->blockKernelManager->getBlockKernelInfo(0);
    const uint32_t patchOffset = blockInfo->kernelDescriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress.stateless;
    auto patchedSlot = reinterpret_cast<uint64_t *>(ptrOffset(surface->getUnderlyingBuffer(), constBufferOffset + patchOffset));
    EXPECT_EQ(0u, *patchedSlot);
 }
 // Expects the block's global-constants slot inside the reflection surface to hold
 // the GPU address of the program's constant surface after patching, and the bytes
 // before and after that slot to remain zero.
 TEST_F(ReflectionSurfaceConstantValuesPatchingTest, GivenBlockWithConstantMemoryWhenReflectionSurfaceIsPatchedWithConstantValuesThenProgramConstantMemoryAddressIsPatched) {
    MockContext context(pClDevice);
    MockParentKernel::CreateParams createParams{};
    createParams.addChildConstantMemory = true;
    std::unique_ptr<MockParentKernel> parentKernel(MockParentKernel::create(context, createParams));
    // graphicsMemory is released by Program
    GraphicsAllocation *constantMemory = pDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties(MockAllocationProperties{pDevice->getRootDeviceIndex(), MemoryConstants::pageSize});
    parentKernel->mockProgram->setConstantSurface(constantMemory);
    // Allocate reflectionSurface, 2 * 4096 should be enough
    GraphicsAllocation *reflectionSurface = pDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties(MockAllocationProperties{pDevice->getRootDeviceIndex(), 2 * MemoryConstants::pageSize});
    parentKernel->setReflectionSurface(reflectionSurface);
    memset(reflectionSurface->getUnderlyingBuffer(), 0, reflectionSurface->getUnderlyingBufferSize());
    // Place the single block's constant buffer right after the header structures, 8-byte aligned.
    const uint32_t constBufferOffset = static_cast<uint32_t>(alignUp(sizeof(IGIL_KernelDataHeader) + sizeof(IGIL_KernelAddressData) + sizeof(IGIL_KernelData) + sizeof(IGIL_KernelCurbeParams), sizeof(uint64_t)));
    IGIL_KernelDataHeader *pKernelHeader = reinterpret_cast<IGIL_KernelDataHeader *>(reflectionSurface->getUnderlyingBuffer());
    pKernelHeader->m_numberOfKernels = 1;
    pKernelHeader->m_data[0].m_ConstantBufferOffset = constBufferOffset;
    parentKernel->patchBlocksCurbeWithConstantValues();
    auto *blockInfo = parentKernel->mockProgram->blockKernelManager->getBlockKernelInfo(0);
    uint32_t blockPatchOffset = blockInfo->kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress.stateless;
    uint64_t *pCurbe = reinterpret_cast<uint64_t *>(ptrOffset(reflectionSurface->getUnderlyingBuffer(), constBufferOffset));
    uint64_t *pCurbeToPatch = reinterpret_cast<uint64_t *>(ptrOffset(reflectionSurface->getUnderlyingBuffer(), constBufferOffset + blockPatchOffset));
    EXPECT_EQ(constantMemory->getGpuAddressToPatch(), *pCurbeToPatch);
    // Array form of unique_ptr so the buffer is released with delete[];
    // std::make_unique<char[]> value-initializes, so the reference buffer is all zeros.
    std::unique_ptr<char[]> zeroMemory = std::make_unique<char[]>(4096);
    // memory before is not written
    EXPECT_THAT(zeroMemory.get(), MemCompare(pCurbe, std::min(4096u, blockPatchOffset)));
    // memory after is not written
    EXPECT_THAT(zeroMemory.get(), MemCompare(pCurbeToPatch + 1, std::min(4096u, 8192u - constBufferOffset - blockPatchOffset - static_cast<uint32_t>(sizeof(uint64_t)))));
 }
 // Expects the block's global-constants slot to read zero after patching when the
 // program has no constant surface, and the bytes before and after that slot to
 // remain zero.
 TEST_F(ReflectionSurfaceConstantValuesPatchingTest, GivenBlockWithConstantMemoryAndProgramWithoutConstantMemortWhenReflectionSurfaceIsPatchedWithConstantValuesThenZeroAddressIsPatched) {
    MockContext context(pClDevice);
    MockParentKernel::CreateParams createParams{};
    createParams.addChildConstantMemory = true;
    std::unique_ptr<MockParentKernel> parentKernel(MockParentKernel::create(context, createParams));
    // Make sure the program holds no constant surface before patching.
    if (parentKernel->mockProgram->getConstantSurface(pClDevice->getRootDeviceIndex())) {
        pDevice->getMemoryManager()->freeGraphicsMemory(parentKernel->mockProgram->getConstantSurface(pClDevice->getRootDeviceIndex()));
        parentKernel->mockProgram->setConstantSurface(nullptr);
    }
    // Allocate reflectionSurface, 2 * 4096 should be enough
    GraphicsAllocation *reflectionSurface = pDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties(MockAllocationProperties{pDevice->getRootDeviceIndex(), 2 * MemoryConstants::pageSize});
    parentKernel->setReflectionSurface(reflectionSurface);
    memset(reflectionSurface->getUnderlyingBuffer(), 0, reflectionSurface->getUnderlyingBufferSize());
    // Place the single block's constant buffer right after the header structures, 8-byte aligned.
    const uint32_t constBufferOffset = static_cast<uint32_t>(alignUp(sizeof(IGIL_KernelDataHeader) + sizeof(IGIL_KernelAddressData) + sizeof(IGIL_KernelData) + sizeof(IGIL_KernelCurbeParams), sizeof(uint64_t)));
    IGIL_KernelDataHeader *pKernelHeader = reinterpret_cast<IGIL_KernelDataHeader *>(reflectionSurface->getUnderlyingBuffer());
    pKernelHeader->m_numberOfKernels = 1;
    pKernelHeader->m_data[0].m_ConstantBufferOffset = constBufferOffset;
    parentKernel->patchBlocksCurbeWithConstantValues();
    auto *blockInfo = parentKernel->mockProgram->blockKernelManager->getBlockKernelInfo(0);
    uint32_t blockPatchOffset = blockInfo->kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress.stateless;
    uint64_t *pCurbe = reinterpret_cast<uint64_t *>(ptrOffset(reflectionSurface->getUnderlyingBuffer(), constBufferOffset));
    uint64_t *pCurbeToPatch = reinterpret_cast<uint64_t *>(ptrOffset(reflectionSurface->getUnderlyingBuffer(), constBufferOffset + blockPatchOffset));
    EXPECT_EQ(0u, *pCurbeToPatch);
    // Array form of unique_ptr so the buffer is released with delete[];
    // std::make_unique<char[]> value-initializes, so the reference buffer is all zeros.
    std::unique_ptr<char[]> zeroMemory = std::make_unique<char[]>(4096);
    // memory before is not written
    EXPECT_THAT(zeroMemory.get(), MemCompare(pCurbe, std::min(4096u, blockPatchOffset)));
    // memory after is not written
    EXPECT_THAT(zeroMemory.get(), MemCompare(pCurbeToPatch + 1, std::min(4096u, 8192u - constBufferOffset - blockPatchOffset - static_cast<uint32_t>(sizeof(uint64_t)))));
 }
--- a/opencl/test/unit_test/kernel/kernel_tests.cpp
+++ b/opencl/test/unit_test/kernel/kernel_tests.cpp
@ -34,7 +34,6 @@
 #include "opencl/source/mem_obj/image.h"
 #include "opencl/test/unit_test/fixtures/cl_device_fixture.h"
 #include "opencl/test/unit_test/fixtures/device_host_queue_fixture.h"
 #include "opencl/test/unit_test/fixtures/execution_model_fixture.h"
 #include "opencl/test/unit_test/fixtures/multi_root_device_fixture.h"
 #include "opencl/test/unit_test/helpers/gtest_helpers.h"
 #include "opencl/test/unit_test/mocks/mock_command_queue.h"
@ -2182,13 +2181,6 @@ TEST(KernelInfoTest, givenHwHelperWhenCreatingKernelAllocationThenCorrectPadding
    clDevice->getMemoryManager()->freeGraphicsMemory(mockKernel->kernelInfo.getGraphicsAllocation());
 }
 // A regular kernel is expected to report an execution-model instruction heap size of zero.
 TEST(KernelTest, givenNormalKernelWhenGettingInstructionHeapSizeForExecutionModelThenZeroIsReturned) {
    auto mockDevice = MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get());
    auto clDevice = clUniquePtr(new MockClDevice(mockDevice));
    MockKernelWithInternals kernelWithInternals(*clDevice);
    const auto heapSize = kernelWithInternals.mockKernel->getInstructionHeapSizeForExecutionModel();
    EXPECT_EQ(0u, heapSize);
 }
 TEST(KernelTest, WhenSettingKernelArgThenBuiltinDispatchInfoBuilderIsUsed) {
    struct MockBuiltinDispatchBuilder : BuiltinDispatchInfoBuilder {
        using BuiltinDispatchInfoBuilder::BuiltinDispatchInfoBuilder;
--- a/opencl/test/unit_test/kernel/parent_kernel_tests.cpp
+++ b/opencl/test/unit_test/kernel/parent_kernel_tests.cpp
@ -1,146 +0,0 @@
 /*
 * Copyright (C) 2018-2022 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */
 #include "shared/test/common/mocks/mock_device.h"
 #include "shared/test/common/test_macros/test.h"
 #include "opencl/test/unit_test/mocks/mock_cl_device.h"
 #include "opencl/test/unit_test/mocks/mock_kernel.h"
 #include "opencl/test/unit_test/mocks/mock_program.h"
 #include <memory>
 using namespace NEO;
 // Kernel subclass that exposes the argument list and getParentObjectCounts() to tests.
 class MockKernelWithArgumentAccess : public Kernel {
  public:
    // Publicly nameable alias type for Kernel::ObjectCounts.
    class ObjectCountsPublic : public Kernel::ObjectCounts {
    };
    MockKernelWithArgumentAccess(Program *programArg, KernelInfo &kernelInfoArg, ClDevice &clDeviceArg)
        : Kernel(programArg, kernelInfoArg, clDeviceArg) {
    }
    // Returns a mutable reference to the kernel's argument vector.
    std::vector<SimpleKernelArgInfo> &getKernelArguments() {
        return kernelArguments;
    }
    // Forwards to Kernel::getParentObjectCounts, filling objectCount.
    void getParentObjectCountsPublic(ObjectCountsPublic &objectCount) {
        getParentObjectCounts(objectCount);
    }
 };
 // Adds one sampler argument and one image argument to a kernel flagged as using
 // device-side enqueue and expects getParentObjectCounts to report one of each.
 TEST(ParentKernelTest, WhenArgsAddedThenObjectCountsAreIncremented) {
    // The device lives on the stack and is declared first, so it is destroyed after
    // the program and kernel that reference it (same pattern as the other tests here).
    MockClDevice device{new MockDevice};
    MockProgram program(toClDeviceVector(device));
    KernelInfo info;
    info.kernelDescriptor.kernelAttributes.flags.usesDeviceSideEnqueue = true;
    MockKernelWithArgumentAccess kernel(&program, info, device);
    std::vector<Kernel::SimpleKernelArgInfo> &args = kernel.getKernelArguments();
    Kernel::SimpleKernelArgInfo argInfo;
    argInfo.type = Kernel::kernelArgType::SAMPLER_OBJ;
    args.push_back(argInfo);
    argInfo.type = Kernel::kernelArgType::IMAGE_OBJ;
    args.push_back(argInfo);
    MockKernelWithArgumentAccess::ObjectCountsPublic objectCounts;
    kernel.getParentObjectCountsPublic(objectCounts);
    EXPECT_EQ(1u, objectCounts.imageCount);
    EXPECT_EQ(1u, objectCounts.samplerCount);
 }
 // After patchBlocksSimdSize(), the parent's cross-thread data at the child's
 // registered offset must equal the block kernel's max SIMD size.
 TEST(ParentKernelTest, WhenPatchingBlocksSimdSizeThenPatchIsAppliedCorrectly) {
    MockClDevice clDevice{new MockDevice};
    MockContext context(&clDevice);
    MockParentKernel::CreateParams params{};
    params.addChildSimdSize = true;
    std::unique_ptr<MockParentKernel> kernel(MockParentKernel::create(context, params));
    MockProgram *mockProgram = kernel->mockProgram;
    kernel->patchBlocksSimdSize();
    const auto simdSlotOffset = kernel->getKernelInfo().childrenKernelsIdOffset[0].second;
    void *simdSlot = ptrOffset(kernel->getCrossThreadData(), simdSlotOffset);
    const uint32_t patchedSimd = *reinterpret_cast<uint32_t *>(simdSlot);
    EXPECT_EQ(mockProgram->blockKernelManager->getBlockKernelInfo(0)->getMaxSimdSize(), patchedSimd);
 }
 // A kernel built by MockParentKernel::create must report hasDeviceEnqueue() == true.
 TEST(ParentKernelTest, GivenParentKernelWhenCheckingForDeviceEnqueueThenTrueIsReturned) {
    MockClDevice clDevice{new MockDevice};
    MockContext context(&clDevice);
    std::unique_ptr<MockParentKernel> kernel(MockParentKernel::create(context));
    const auto &kernelInfo = kernel->getKernelInfo();
    EXPECT_TRUE(kernelInfo.hasDeviceEnqueue());
 }
 // A default MockKernelWithInternals must report hasDeviceEnqueue() == false.
 TEST(ParentKernelTest, GivenNormalKernelWhenCheckingForDeviceEnqueueThenFalseIsReturned) {
    MockClDevice clDevice{new MockDevice};
    MockKernelWithInternals kernelWithInternals(clDevice);
    const bool usesDeviceEnqueue = kernelWithInternals.kernelInfo.hasDeviceEnqueue();
    EXPECT_FALSE(usesDeviceEnqueue);
 }
 // After initialize(), the parent's cross-thread data at the child's registered
 // offset must equal the block kernel's max SIMD size.
 TEST(ParentKernelTest, WhenInitializingParentKernelThenBlocksSimdSizeIsPatched) {
    MockClDevice clDevice{new MockDevice};
    MockContext context(&clDevice);
    MockParentKernel::CreateParams params{};
    params.addChildSimdSize = true;
    std::unique_ptr<MockParentKernel> kernel(MockParentKernel::create(context, params));
    MockProgram *mockProgram = kernel->mockProgram;
    kernel->initialize();
    const auto simdSlotOffset = kernel->getKernelInfo().childrenKernelsIdOffset[0].second;
    void *simdSlot = ptrOffset(kernel->getCrossThreadData(), simdSlotOffset);
    const uint32_t patchedSimd = *reinterpret_cast<uint32_t *>(simdSlot);
    EXPECT_EQ(mockProgram->blockKernelManager->getBlockKernelInfo(0)->getMaxSimdSize(), patchedSimd);
 }
 // Registers an extra block kernel that declares private memory and expects
 // initialize() to leave a non-null private surface for that last block.
 TEST(ParentKernelTest, WhenInitializingParentKernelThenPrivateMemoryForBlocksIsAllocated) {
    MockClDevice clDevice{new MockDevice};
    MockContext context(&clDevice);
    MockParentKernel::CreateParams params{};
    params.addChildSimdSize = true;
    std::unique_ptr<MockParentKernel> kernel(MockParentKernel::create(context, params));
    MockProgram *mockProgram = kernel->mockProgram;
    auto blockInfo = new MockKernelInfo();
    blockInfo->kernelDescriptor.kernelAttributes.bufferAddressingMode = KernelDescriptor::Stateless;
    // Lay out three consecutive 8-byte slots: default queue, event pool, private memory.
    uint32_t slotOffset = 0;
    blockInfo->setDeviceSideEnqueueDefaultQueueSurface(8, slotOffset);
    slotOffset += 8;
    blockInfo->setDeviceSideEnqueueEventPoolSurface(8, slotOffset);
    slotOffset += 8;
    blockInfo->setPrivateMemory(1000, false, 8, slotOffset);
    slotOffset += 8;
    blockInfo->setLocalIds({0, 0, 0});
    blockInfo->kernelDescriptor.kernelAttributes.flags.usesDeviceSideEnqueue = true;
    blockInfo->setDeviceSideEnqueueBlockInterfaceDescriptorOffset(0);
    // The DSH array is released by ~MockParentKernel, which walks all registered blocks.
    blockInfo->heapInfo.pDsh = static_cast<void *>(new uint64_t[64]);
    blockInfo->heapInfo.DynamicStateHeapSize = 64 * sizeof(uint64_t);
    blockInfo->setCrossThreadDataSize(slotOffset);
    blockInfo->crossThreadData = new char[slotOffset];
    mockProgram->blockKernelManager->addBlockKernelInfo(blockInfo);
    kernel->initialize();
    auto blockManager = mockProgram->getBlockKernelManager();
    EXPECT_NE(nullptr, blockManager->getPrivateSurface(mockProgram->getBlockKernelManager()->getCount() - 1));
 }
--- a/opencl/test/unit_test/mocks/mock_kernel.h
+++ b/opencl/test/unit_test/mocks/mock_kernel.h
@ -409,144 +409,6 @@ class MockKernelWithInternals {
    char dshLocal[128];
    std::vector<Kernel::SimpleKernelArgInfo> defaultKernelArguments;
 };
 // Test double for a kernel that uses device-side enqueue. create() builds the
 // parent KernelInfo, a MockProgram and one block KernelInfo registered in the
 // program's block kernel manager; the destructor releases what create() allocated.
 class MockParentKernel : public Kernel {
  public:
    // Optional surfaces/slots to add to the parent or block kernel description.
    struct CreateParams {
        bool addChildSimdSize = false;
        bool addChildGlobalMemory = false;
        bool addChildConstantMemory = false;
        bool addPrintfForParent = false;
        bool addPrintfForBlock = false;
    };
    using Kernel::auxTranslationRequired;
    using Kernel::kernelInfo;
    using Kernel::patchBlocksCurbeWithConstantValues;
    using Kernel::pImplicitArgs;
    using Kernel::pSshLocal;
    using Kernel::sshLocalSize;
    // Creates a parent kernel with default CreateParams. Caller owns the result.
    static MockParentKernel *create(Context &context) {
        CreateParams createParams{};
        return create(context, createParams);
    }
    // Creates a parent kernel configured by createParams. Caller owns the result.
    static MockParentKernel *create(Context &context, const CreateParams &createParams) {
        auto clDevice = context.getDevice(0);
        auto info = new MockKernelInfo();
        const size_t crossThreadSize = 160;
        // Running offsets into the parent's and the block's cross-thread data.
        uint32_t crossThreadOffset = 0;
        uint32_t crossThreadOffsetBlock = 0;
        info->setLocalIds({0, 0, 0});
        info->kernelDescriptor.kernelAttributes.bufferAddressingMode = KernelDescriptor::Stateless;
        info->kernelDescriptor.kernelAttributes.flags.usesDeviceSideEnqueue = true;
        info->kernelDescriptor.kernelAttributes.numGrfRequired = GrfConfig::DefaultGrfNumber;
        info->kernelDescriptor.kernelAttributes.simdSize = 32;
        info->setDeviceSideEnqueueDefaultQueueSurface(8, crossThreadOffset);
        crossThreadOffset += 8;
        info->setDeviceSideEnqueueEventPoolSurface(8, crossThreadOffset);
        crossThreadOffset += 8;
        if (createParams.addPrintfForParent) {
            info->setPrintfSurface(8, crossThreadOffset);
            crossThreadOffset += 8;
        }
        ClDeviceVector deviceVector;
        deviceVector.push_back(clDevice);
        MockProgram *mockProgram = new MockProgram(&context, false, deviceVector);
        if (createParams.addChildSimdSize) {
            info->childrenKernelsIdOffset.push_back({0, crossThreadOffset});
        }
        // The child SIMD-size slot (if any) must still fit in the fixed-size buffer.
        UNRECOVERABLE_IF(crossThreadSize < crossThreadOffset + 8);
        info->crossThreadData = new char[crossThreadSize];
        auto parent = new MockParentKernel(mockProgram, *info);
        parent->crossThreadData = new char[crossThreadSize];
        memset(parent->crossThreadData, 0, crossThreadSize);
        parent->crossThreadDataSize = crossThreadSize;
        parent->mockKernelInfo = info;
        auto infoBlock = new MockKernelInfo();
        infoBlock->kernelDescriptor.kernelAttributes.bufferAddressingMode = KernelDescriptor::Stateless;
        infoBlock->setDeviceSideEnqueueDefaultQueueSurface(8, crossThreadOffsetBlock);
        crossThreadOffsetBlock += 8;
        // Use the block's own running offset so this slot cannot overlap the
        // printf/global/constant slots assigned to the block below.
        infoBlock->setDeviceSideEnqueueEventPoolSurface(8, crossThreadOffsetBlock);
        crossThreadOffsetBlock += 8;
        if (createParams.addPrintfForBlock) {
            infoBlock->setPrintfSurface(8, crossThreadOffsetBlock);
            crossThreadOffsetBlock += 8;
        }
        if (createParams.addChildGlobalMemory) {
            infoBlock->setGlobalVariablesSurface(8, crossThreadOffsetBlock);
            crossThreadOffsetBlock += 8;
        }
        if (createParams.addChildConstantMemory) {
            infoBlock->setGlobalConstantsSurface(8, crossThreadOffsetBlock);
            crossThreadOffsetBlock += 8;
        }
        infoBlock->setLocalIds({0, 0, 0});
        infoBlock->kernelDescriptor.kernelAttributes.flags.usesDeviceSideEnqueue = true;
        infoBlock->kernelDescriptor.kernelAttributes.numGrfRequired = GrfConfig::DefaultGrfNumber;
        infoBlock->kernelDescriptor.kernelAttributes.simdSize = 32;
        infoBlock->setDeviceSideEnqueueBlockInterfaceDescriptorOffset(0);
        // Released in ~MockParentKernel via delete[] on a uint64_t pointer.
        infoBlock->heapInfo.pDsh = (void *)new uint64_t[64];
        infoBlock->heapInfo.DynamicStateHeapSize = 64 * sizeof(uint64_t);
        size_t crossThreadDataSize = crossThreadOffsetBlock > crossThreadSize ? crossThreadOffsetBlock : crossThreadSize;
        infoBlock->crossThreadData = new char[crossThreadDataSize];
        infoBlock->setCrossThreadDataSize(static_cast<uint16_t>(crossThreadDataSize));
        mockProgram->blockKernelManager->addBlockKernelInfo(infoBlock);
        parent->mockProgram = mockProgram;
        return parent;
    }
    // Binds the kernel to the first device of programArg.
    MockParentKernel(Program *programArg, const KernelInfo &kernelInfoArg) : Kernel(programArg, kernelInfoArg, *programArg->getDevices()[0]) {
    }
    // Deletes the parent KernelInfo, every block's DSH array, and drops the
    // internal reference on the mock program when one was attached.
    ~MockParentKernel() override {
        delete &kernelInfo;
        BlockKernelManager *blockManager = program->getBlockKernelManager();
        for (uint32_t i = 0; i < blockManager->getCount(); i++) {
            const KernelInfo *blockInfo = blockManager->getBlockKernelInfo(i);
            delete[](uint64_t *) blockInfo->heapInfo.pDsh;
        }
        if (mockProgram) {
            mockProgram->decRefInternal();
        }
    }
    // Returns the context of the attached mock program.
    Context *getContext() {
        return &mockProgram->getContext();
    }
    // Installs reflectionSurface as the kernel's reflection surface.
    void setReflectionSurface(GraphicsAllocation *reflectionSurface) {
        kernelReflectionSurface = reflectionSurface;
    }
    // Null until create() attaches a program; the destructor checks this pointer,
    // so it must not be left indeterminate when the constructor is used directly.
    MockProgram *mockProgram = nullptr;
    KernelInfo *mockKernelInfo = nullptr;
 };
 class MockDebugKernel : public MockKernel {
  public:
    MockDebugKernel(Program *program, const KernelInfo &kernelInfo, ClDevice &clDeviceArg) : MockKernel(program, kernelInfo, clDeviceArg) {
--- a/opencl/test/unit_test/program/printf_handler_tests.cpp
+++ b/opencl/test/unit_test/program/printf_handler_tests.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2021 Intel Corporation
+ * Copyright (C) 2018-2022 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -256,73 +256,6 @@ HWTEST_F(PrintfHandlerTests, givenPrintfHandlerWhenEnqueueIsBlockedThenDontUsePr
    EXPECT_FALSE(cmdQ.isQueueBlocked());
 }
 // Parent without printf, block with printf: PrintfHandler::create must return an object.
 TEST_F(PrintfHandlerTests, givenParentKernelWithoutPrintfAndBlockKernelWithPrintfWhenPrintfHandlerCreateCalledThenResultIsAnObject) {
    auto clDevice = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr));
    MockContext context(clDevice.get());
    MockParentKernel::CreateParams params{};
    params.addPrintfForBlock = true;
    params.addPrintfForParent = false;
    std::unique_ptr<MockParentKernel> kernel(MockParentKernel::create(context, params));
    MockMultiDispatchInfo dispatchInfo(clDevice.get(), kernel.get());
    std::unique_ptr<PrintfHandler> handler(PrintfHandler::create(dispatchInfo, *clDevice));
    ASSERT_NE(nullptr, handler.get());
 }
 // No printf on parent or block, but implicit args are present:
 // PrintfHandler::create must still return an object.
 TEST_F(PrintfHandlerTests, givenKernelWithImplicitArgsButWithoutPrintfWhenPrintfHandlerCreateCalledThenResultIsAnObject) {
    auto clDevice = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr));
    MockContext context(clDevice.get());
    MockParentKernel::CreateParams params{};
    params.addPrintfForBlock = false;
    params.addPrintfForParent = false;
    std::unique_ptr<MockParentKernel> kernel(MockParentKernel::create(context, params));
    // Attach implicit args and reset them to their default state.
    kernel->pImplicitArgs = std::make_unique<ImplicitArgs>();
    *kernel->pImplicitArgs = {};
    MockMultiDispatchInfo dispatchInfo(clDevice.get(), kernel.get());
    std::unique_ptr<PrintfHandler> handler(PrintfHandler::create(dispatchInfo, *clDevice));
    ASSERT_NE(nullptr, handler.get());
 }
 // Neither parent nor block uses printf: PrintfHandler::create must return nullptr.
 TEST_F(PrintfHandlerTests, givenParentKernelAndBlockKernelWithoutPrintfWhenPrintfHandlerCreateCalledThenResaultIsNullptr) {
    auto clDevice = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr));
    MockContext context(clDevice.get());
    MockParentKernel::CreateParams params{};
    params.addPrintfForParent = false;
    params.addPrintfForBlock = false;
    std::unique_ptr<MockParentKernel> kernel(MockParentKernel::create(context, params));
    MockMultiDispatchInfo dispatchInfo(clDevice.get(), kernel.get());
    std::unique_ptr<PrintfHandler> handler(PrintfHandler::create(dispatchInfo, *clDevice));
    ASSERT_EQ(nullptr, handler.get());
 }
 // Parent with printf, block without: PrintfHandler::create must return an object.
 TEST_F(PrintfHandlerTests, givenParentKernelWithPrintfAndBlockKernelWithoutPrintfWhenPrintfHandlerCreateCalledThenResaultIsAnObject) {
    auto clDevice = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr));
    MockContext context(clDevice.get());
    MockParentKernel::CreateParams params{};
    params.addPrintfForParent = true;
    params.addPrintfForBlock = false;
    std::unique_ptr<MockParentKernel> kernel(MockParentKernel::create(context, params));
    MockMultiDispatchInfo dispatchInfo(clDevice.get(), kernel.get());
    std::unique_ptr<PrintfHandler> handler(PrintfHandler::create(dispatchInfo, *clDevice));
    ASSERT_NE(nullptr, handler.get());
 }
 TEST_F(PrintfHandlerTests, givenMultiDispatchInfoWithMultipleKernelsWhenCreatingAndDispatchingPrintfHandlerThenPickMainKernel) {
    MockContext context;
    auto device = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr));