diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h
index df908fd7ec..8356dc16e3 100644
--- a/level_zero/core/source/cmdlist/cmdlist_hw.h
+++ b/level_zero/core/source/cmdlist/cmdlist_hw.h
@@ -203,7 +203,8 @@ struct CommandListCoreFamily : CommandListImp {
                                                               const ze_group_count_t *pThreadGroupDimensions,
                                                               ze_event_handle_t hEvent,
                                                               bool isIndirect,
-                                                              bool isPredicate);
+                                                              bool isPredicate,
+                                                              bool isCooperative);
     ze_result_t appendLaunchKernelSplit(ze_kernel_handle_t hKernel, const ze_group_count_t *pThreadGroupDimensions, ze_event_handle_t hEvent);
     ze_result_t prepareIndirectParams(const ze_group_count_t *pThreadGroupDimensions);
 
@@ -211,6 +212,7 @@ struct CommandListCoreFamily : CommandListImp {
                                   const void **pRanges);
 
     ze_result_t setGlobalWorkSizeIndirect(NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress, uint32_t lws[3]);
+    ze_result_t programSyncBuffer(Kernel &kernel, NEO::Device &device, const ze_group_count_t *pThreadGroupDimensions);
     void appendWriteKernelTimestamp(ze_event_handle_t hEvent, bool beforeWalker, bool maskLsb);
     void appendEventForProfiling(ze_event_handle_t hEvent, bool beforeWalker);
     void appendEventForProfilingAllWalkers(ze_event_handle_t hEvent, bool beforeWalker);
diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl
index 0e9ac7127d..8e10274ab8 100644
--- a/level_zero/core/source/cmdlist/cmdlist_hw.inl
+++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl
@@ -24,6 +24,8 @@
 #include "shared/source/memory_manager/allocation_properties.h"
 #include "shared/source/memory_manager/graphics_allocation.h"
 #include "shared/source/memory_manager/memory_manager.h"
+#include "shared/source/program/sync_buffer_handler.h"
+#include "shared/source/program/sync_buffer_handler.inl"
 
 #include "level_zero/core/source/cmdlist/cmdlist_hw.h"
 #include "level_zero/core/source/device/device_imp.h"
@@ -146,7 +148,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernel(ze_kernel_h
     }
 
     return appendLaunchKernelWithParams(hKernel, pThreadGroupDimensions,
-                                        hEvent, false, false);
+                                        hEvent, false, false, false);
 }
 
 template <GFXCORE_FAMILY gfxCoreFamily>
@@ -156,7 +158,13 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchCooperativeKernel(
                                                                                 uint32_t numWaitEvents,
                                                                                 ze_event_handle_t *phWaitEvents) {
 
-    return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    ze_result_t ret = addEventsToCmdList(numWaitEvents, phWaitEvents);
+    if (ret) {
+        return ret;
+    }
+
+    return appendLaunchKernelWithParams(hKernel, pLaunchFuncArgs,
+                                        hSignalEvent, false, false, true);
 }
 
 template <GFXCORE_FAMILY gfxCoreFamily>
@@ -172,7 +180,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelIndirect(ze_
     }
     appendEventForProfiling(hEvent, true);
     ret = appendLaunchKernelWithParams(hKernel, pDispatchArgumentsBuffer,
-                                       nullptr, true, false);
+                                       nullptr, true, false, false);
     appendSignalEventPostWalker(hEvent);
 
     return ret;
@@ -203,7 +211,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchMultipleKernelsInd
 
         ret = appendLaunchKernelWithParams(phKernels[i],
                                            haveLaunchArguments ? &pLaunchArgumentsBuffer[i] : nullptr,
-                                           nullptr, true, true);
+                                           nullptr, true, true, false);
         if (ret) {
             return ret;
         }
@@ -668,7 +676,7 @@ template <GFXCORE_FAMILY gfxCoreFamily>
 ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelSplit(ze_kernel_handle_t hKernel,
                                                                           const ze_group_count_t *pThreadGroupDimensions,
                                                                           ze_event_handle_t hEvent) {
-    return appendLaunchKernelWithParams(hKernel, pThreadGroupDimensions, nullptr, false, false);
+    return appendLaunchKernelWithParams(hKernel, pThreadGroupDimensions, nullptr, false, false, false);
 }
 
 template <GFXCORE_FAMILY gfxCoreFamily>
@@ -850,7 +858,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendPageFaultCopy(NEO::Graph
     ze_group_count_t dispatchFuncArgs{groups, 1u, 1u};
 
     ze_result_t ret = appendLaunchKernelWithParams(builtinFunction->toHandle(), &dispatchFuncArgs,
-                                                   nullptr, false, false);
+                                                   nullptr, false, false, false);
     if (ret != ZE_RESULT_SUCCESS) {
         return ret;
     }
@@ -1539,6 +1547,30 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWaitOnEvents(uint32_t nu
     return ZE_RESULT_SUCCESS;
 }
 
+// Validates a cooperative launch (engine support, kernel's maximal cooperative group count), then calls prepareForEnqueue on the device's sync buffer handler.
+template <GFXCORE_FAMILY gfxCoreFamily>
+ze_result_t CommandListCoreFamily<gfxCoreFamily>::programSyncBuffer(Kernel &kernel, NEO::Device &device, const ze_group_count_t *pThreadGroupDimensions) {
+    auto &hwInfo = device.getHardwareInfo();
+    auto &hwHelper = NEO::HwHelper::get(hwInfo.platform.eRenderCoreFamily);
+    if (!hwHelper.isCooperativeDispatchSupported(this->engineGroupType, hwInfo.platform.eProductFamily)) {
+        return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; // this engine group / product cannot dispatch cooperative kernels
+    }
+
+    uint32_t maximalNumberOfWorkgroupsAllowed = 0;
+    auto ret = kernel.suggestMaxCooperativeGroupCount(&maximalNumberOfWorkgroupsAllowed);
+    UNRECOVERABLE_IF(ret != ZE_RESULT_SUCCESS);
+    // Compare in 64 bits without forming the full X*Y*Z product: multiplying the three uint32_t counts directly can wrap and slip past the limit.
+    const uint64_t groupsXY = static_cast<uint64_t>(pThreadGroupDimensions->groupCountX) * pThreadGroupDimensions->groupCountY;
+    const uint64_t groupsZ = pThreadGroupDimensions->groupCountZ;
+    if (groupsZ != 0 && groupsXY > maximalNumberOfWorkgroupsAllowed / groupsZ) {
+        return ZE_RESULT_ERROR_INVALID_ARGUMENT; // more work-groups requested than suggestMaxCooperativeGroupCount allows
+    }
+
+    device.allocateSyncBufferHandler();
+    device.syncBufferHandler->prepareForEnqueue(static_cast<size_t>(groupsXY * groupsZ), kernel); // product is <= the uint32_t limit (or 0) here
+    return ZE_RESULT_SUCCESS;
+}
+
 template <GFXCORE_FAMILY gfxCoreFamily>
 void CommandListCoreFamily<gfxCoreFamily>::appendWriteKernelTimestamp(ze_event_handle_t hEvent, bool beforeWalker, bool maskLsb) {
     constexpr uint32_t mask = 0xfffffffe;
diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_base.inl b/level_zero/core/source/cmdlist/cmdlist_hw_base.inl
index 003d1d5633..c6642303a0 100644
--- a/level_zero/core/source/cmdlist/cmdlist_hw_base.inl
+++ b/level_zero/core/source/cmdlist/cmdlist_hw_base.inl
@@ -37,7 +37,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
                                                                                const ze_group_count_t *pThreadGroupDimensions,
                                                                                ze_event_handle_t hEvent,
                                                                                bool isIndirect,
-                                                                               bool isPredicate) {
+                                                                               bool isPredicate,
+                                                                               bool isCooperative) {
     const auto kernel = Kernel::fromHandle(hKernel);
     UNRECOVERABLE_IF(kernel == nullptr);
     appendEventForProfiling(hEvent, true);
@@ -78,6 +79,15 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
         this->indirectAllocationsAllowed = true;
     }
 
+    if (kernel->usesSyncBuffer()) {
+        auto retVal = (isCooperative
+                           ? programSyncBuffer(*kernel, *device->getNEODevice(), pThreadGroupDimensions)
+                           : ZE_RESULT_ERROR_INVALID_ARGUMENT);
+        if (retVal) {
+            return retVal;
+        }
+    }
+
     KernelImp *kernelImp = static_cast<KernelImp *>(kernel);
     this->containsStatelessUncachedResource |= kernelImp->getKernelRequiresUncachedMocs();
     uint32_t partitionCount = 0;
diff --git a/level_zero/core/source/kernel/kernel.h b/level_zero/core/source/kernel/kernel.h
index 2b1a3b9967..e96c30fd19 100644
--- a/level_zero/core/source/kernel/kernel.h
+++ b/level_zero/core/source/kernel/kernel.h
@@ -132,6 +132,9 @@ struct Kernel : _ze_kernel_handle_t, virtual NEO::DispatchKernelEncoderI {
     virtual NEO::GraphicsAllocation *getPrintfBufferAllocation() = 0;
     virtual void printPrintfOutput() = 0;
 
+    virtual bool usesSyncBuffer() = 0;
+    virtual void patchSyncBuffer(NEO::Device &device, NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) = 0;
+
     Kernel() = default;
     Kernel(const Kernel &) = delete;
     Kernel(Kernel &&) = delete;
diff --git a/level_zero/core/source/kernel/kernel_imp.cpp b/level_zero/core/source/kernel/kernel_imp.cpp
index 6b1589b4f1..841478cc27 100644
--- a/level_zero/core/source/kernel/kernel_imp.cpp
+++ b/level_zero/core/source/kernel/kernel_imp.cpp
@@ -774,6 +774,17 @@ void KernelImp::printPrintfOutput() {
     PrintfHandler::printOutput(kernelImmData, this->printfBuffer, module->getDevice());
 }
 
+// Tells whether the kernel descriptor flags this kernel as using a sync buffer;
+// the value is read straight from the kernelAttributes flags of the immutable kernel data.
+bool KernelImp::usesSyncBuffer() { return kernelImmData->getDescriptor().kernelAttributes.flags.usesSyncBuffer; }
+
+// Appends the sync buffer allocation to residencyContainer and patches its GPU address, advanced by bufferOffset, into the cross-thread data.
+void KernelImp::patchSyncBuffer(NEO::Device &device, NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) {
+    residencyContainer.push_back(gfxAllocation);
+    const auto &syncBufferArg = this->getImmutableData()->getDescriptor().payloadMappings.implicitArgs.syncBufferAddress;
+    NEO::patchPointer(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), syncBufferArg, static_cast<uintptr_t>(ptrOffset(gfxAllocation->getGpuAddressToPatch(), bufferOffset)));
+}
+
 void KernelImp::setDebugSurface() {
     auto device = module->getDevice();
     if (module->isDebugEnabled() && device->getNEODevice()->getDebugger()) {
diff --git a/level_zero/core/source/kernel/kernel_imp.h b/level_zero/core/source/kernel/kernel_imp.h
index a2d7bda37c..0a71e3ea8a 100644
--- a/level_zero/core/source/kernel/kernel_imp.h
+++ b/level_zero/core/source/kernel/kernel_imp.h
@@ -16,8 +16,6 @@
 
 namespace L0 {
 
-struct GraphicsAllocation;
-
 struct KernelImp : Kernel {
     KernelImp(Module *module);
 
@@ -82,6 +80,9 @@ struct KernelImp : Kernel {
     NEO::GraphicsAllocation *getPrintfBufferAllocation() override { return this->printfBuffer; }
     void printPrintfOutput() override;
 
+    bool usesSyncBuffer() override;
+    void patchSyncBuffer(NEO::Device &device, NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) override;
+
     const uint8_t *getSurfaceStateHeapData() const override { return surfaceStateHeapData.get(); }
     uint32_t getSurfaceStateHeapDataSize() const override { return surfaceStateHeapDataSize; }
 
diff --git a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h
index 8b312a9a43..5bb7777a87 100644
--- a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h
+++ b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2020 Intel Corporation
+ * Copyright (C) 2020-2021 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,6 +37,7 @@ struct WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>
     using BaseClass::applyMemoryRangesBarrier;
     using BaseClass::commandListPerThreadScratchSize;
     using BaseClass::commandListPreemptionMode;
+    using BaseClass::engineGroupType;
     using BaseClass::getAlignedAllocation;
     using BaseClass::getAllocationFromHostPtrMap;
     using BaseClass::getHostPtrAlloc;
diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel.cpp
index 796e60c40c..3390838674 100644
--- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel.cpp
+++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel.cpp
@@ -18,6 +18,8 @@
 #include "level_zero/core/test/unit_tests/fixtures/module_fixture.h"
 #include "level_zero/core/test/unit_tests/mocks/mock_cmdlist.h"
 #include "level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h"
+#include "level_zero/core/test/unit_tests/mocks/mock_module.h"
+
 namespace L0 {
 namespace ult {
 
@@ -250,7 +252,7 @@ HWTEST2_F(CommandListAppendLaunchKernel, WhenAppendingFunctionThenUsedCmdBufferS
 
     auto sizeBefore = commandList->commandContainer.getCommandStream()->getUsed();
 
-    auto result = commandList->appendLaunchKernelWithParams(kernel->toHandle(), &groupCount, nullptr, false, false);
+    auto result = commandList->appendLaunchKernelWithParams(kernel->toHandle(), &groupCount, nullptr, false, false, false);
     ASSERT_EQ(ZE_RESULT_SUCCESS, result);
 
     auto sizeAfter = commandList->commandContainer.getCommandStream()->getUsed();
@@ -260,7 +262,7 @@ HWTEST2_F(CommandListAppendLaunchKernel, WhenAppendingFunctionThenUsedCmdBufferS
 
     sizeBefore = commandList->commandContainer.getCommandStream()->getUsed();
 
-    result = commandList->appendLaunchKernelWithParams(kernel->toHandle(), &groupCount, nullptr, true, false);
+    result = commandList->appendLaunchKernelWithParams(kernel->toHandle(), &groupCount, nullptr, true, false, false);
     ASSERT_EQ(ZE_RESULT_SUCCESS, result);
 
     sizeAfter = commandList->commandContainer.getCommandStream()->getUsed();
@@ -1009,5 +1011,59 @@ HWCMDTEST_F(IGFX_GEN8_CORE, CommandListAppendLaunchKernel, givenAppendLaunchMult
     device->getDriverHandle()->freeMem(reinterpret_cast<void *>(numLaunchArgs));
 }
 
+HWTEST_F(CommandListAppendLaunchKernel, givenInvalidEventListWhenAppendLaunchCooperativeKernelIsCalledThenErrorIsReturned) {
+    // One wait event is announced but the wait-event array is null, so the append is expected to fail.
+    createKernel();
+    ze_result_t result;
+    std::unique_ptr<L0::CommandList> pCommandList(CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, result));
+
+    ze_group_count_t launchArgs{1, 1, 1};
+    result = pCommandList->appendLaunchCooperativeKernel(kernel->toHandle(), &launchArgs, nullptr, 1, nullptr);
+    EXPECT_EQ(ZE_RESULT_ERROR_INVALID_ARGUMENT, result);
+}
+
+HWTEST2_F(CommandListAppendLaunchKernel, givenKernelUsingSyncBufferWhenAppendLaunchCooperativeKernelIsCalledThenCorrectValueIsReturned, SklPlusMatcher) {
+    Mock<::L0::Kernel> kernel;
+    auto pMockModule = std::unique_ptr<Module>(new Mock<Module>(device, nullptr));
+    kernel.module = pMockModule.get();
+    // Baseline launch: group size 4x1x1, 8 groups, on a command list initialized for the Compute engine group.
+    kernel.setGroupSize(4, 1, 1);
+    ze_group_count_t groupCount{8, 1, 1};
+    auto pCommandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>>();
+    auto result = pCommandList->initialize(device, NEO::EngineGroupType::Compute);
+    ASSERT_EQ(ZE_RESULT_SUCCESS, result);
+    // Flag the mock kernel as a sync buffer user so the sync buffer branch of appendLaunchKernelWithParams is taken.
+    auto &kernelAttributes = kernel.immutableData.kernelDescriptor->kernelAttributes;
+    kernelAttributes.flags.usesSyncBuffer = true;
+    kernelAttributes.numGrfRequired = GrfConfig::DefaultGrfNumber;
+    bool isCooperative = true;
+    result = pCommandList->appendLaunchCooperativeKernel(kernel.toHandle(), &groupCount, nullptr, 0, nullptr);
+    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
+    // The same launch issued directly through appendLaunchKernelWithParams with isCooperative == true must also succeed.
+    result = pCommandList->appendLaunchKernelWithParams(kernel.toHandle(), &groupCount, nullptr, false, false, isCooperative);
+    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
+    // Each scope below changes one input through a VariableBackup (presumably restored at scope exit - confirm) and repeats the call.
+    { // packed flag word set to 0 - presumably clears usesSyncBuffer along with every other flag; the launch still succeeds
+        VariableBackup<uint32_t> usesSyncBuffer{&kernelAttributes.flags.packed};
+        usesSyncBuffer = false;
+        result = pCommandList->appendLaunchKernelWithParams(kernel.toHandle(), &groupCount, nullptr, false, false, isCooperative);
+        EXPECT_EQ(ZE_RESULT_SUCCESS, result);
+    }
+    { // one group more than suggestMaxCooperativeGroupCount reports must be rejected
+        VariableBackup<uint32_t> groupCountX{&groupCount.groupCountX};
+        uint32_t maximalNumberOfWorkgroupsAllowed;
+        kernel.suggestMaxCooperativeGroupCount(&maximalNumberOfWorkgroupsAllowed);
+        groupCountX = maximalNumberOfWorkgroupsAllowed + 1;
+        result = pCommandList->appendLaunchKernelWithParams(kernel.toHandle(), &groupCount, nullptr, false, false, isCooperative);
+        EXPECT_EQ(ZE_RESULT_ERROR_INVALID_ARGUMENT, result);
+    }
+    { // a kernel using the sync buffer launched with isCooperative == false must be rejected
+        VariableBackup<bool> cooperative{&isCooperative};
+        cooperative = false;
+        result = pCommandList->appendLaunchKernelWithParams(kernel.toHandle(), &groupCount, nullptr, false, false, isCooperative);
+        EXPECT_EQ(ZE_RESULT_ERROR_INVALID_ARGUMENT, result);
+    }
+}
+
 } // namespace ult
 } // namespace L0
diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_fill.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_fill.cpp
index e4a55b8f1f..4cb01c6183 100644
--- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_fill.cpp
+++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_fill.cpp
@@ -49,7 +49,8 @@ class AppendFillFixture : public DeviceFixture, public ::testing::Test {
                                                  const ze_group_count_t *pThreadGroupDimensions,
                                                  ze_event_handle_t hEvent,
                                                  bool isIndirect,
-                                                 bool isPredicate) override {
+                                                 bool isPredicate,
+                                                 bool isCooperative) override {
             if (numberOfCallsToAppendLaunchKernelWithParams == thresholdOfCallsToAppendLaunchKernelWithParamsToFail) {
                 return ZE_RESULT_ERROR_UNKNOWN;
             }
@@ -59,7 +60,8 @@ class AppendFillFixture : public DeviceFixture, public ::testing::Test {
                                                                                       pThreadGroupDimensions,
                                                                                       hEvent,
                                                                                       isIndirect,
-                                                                                      isPredicate);
+                                                                                      isPredicate,
+                                                                                      isCooperative);
         }
 
         uint32_t thresholdOfCallsToAppendLaunchKernelWithParamsToFail = std::numeric_limits<uint32_t>::max();
diff --git a/opencl/source/api/api.cpp b/opencl/source/api/api.cpp
index cfbf53ccbd..f0d7e0396b 100644
--- a/opencl/source/api/api.cpp
+++ b/opencl/source/api/api.cpp
@@ -5892,7 +5892,8 @@ cl_int CL_API_CALL clEnqueueNDCountKernelINTEL(cl_command_queue commandQueue,
     auto rootDeviceIndex = device.getRootDeviceIndex();
     auto &hardwareInfo = device.getHardwareInfo();
     auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
-    if (!hwHelper.isCooperativeDispatchSupported(pCommandQueue->getGpgpuEngine().getEngineType(), hardwareInfo.platform.eProductFamily)) {
+    auto engineGroupType = hwHelper.getEngineGroupType(pCommandQueue->getGpgpuEngine().getEngineType(), hardwareInfo);
+    if (!hwHelper.isCooperativeDispatchSupported(engineGroupType, hardwareInfo.platform.eProductFamily)) {
         retVal = CL_INVALID_COMMAND_QUEUE;
         return retVal;
     }
@@ -5921,7 +5922,7 @@ cl_int CL_API_CALL clEnqueueNDCountKernelINTEL(cl_command_queue commandQueue,
             return retVal;
         }
 
-        device.allocateSyncBufferHandler();
+        device.getDevice().allocateSyncBufferHandler();
     }
 
     if (!pCommandQueue->validateCapabilityForOperation(CL_QUEUE_CAPABILITY_KERNEL_INTEL, numEventsInWaitList, eventWaitList, event)) {
diff --git a/opencl/source/cl_device/cl_device.cpp b/opencl/source/cl_device/cl_device.cpp
index 857edd11be..76b356d126 100644
--- a/opencl/source/cl_device/cl_device.cpp
+++ b/opencl/source/cl_device/cl_device.cpp
@@ -15,7 +15,6 @@
 #include "shared/source/helpers/string.h"
 #include "shared/source/os_interface/driver_info.h"
 #include "shared/source/os_interface/os_interface.h"
-#include "shared/source/program/sync_buffer_handler.h"
 #include "shared/source/source_level_debugger/source_level_debugger.h"
 
 #include "opencl/source/helpers/cl_hw_helper.h"
@@ -68,7 +67,6 @@ ClDevice::~ClDevice() {
         getSourceLevelDebugger()->notifyDeviceDestruction();
     }
 
-    syncBufferHandler.reset();
     for (auto &subDevice : subDevices) {
         subDevice.reset();
     }
@@ -98,14 +96,6 @@ bool ClDevice::isOcl21Conformant() const {
             hwInfo.capabilityTable.supportsPipes && hwInfo.capabilityTable.supportsIndependentForwardProgress);
 }
 
-void ClDevice::allocateSyncBufferHandler() {
-    TakeOwnershipWrapper<ClDevice> lock(*this);
-    if (syncBufferHandler.get() == nullptr) {
-        syncBufferHandler = std::make_unique<SyncBufferHandler>(this->getDevice());
-        UNRECOVERABLE_IF(syncBufferHandler.get() == nullptr);
-    }
-}
-
 void ClDevice::retainApi() {
     auto parentDeviceId = deviceInfo.parentDevice;
     if (parentDeviceId) {
diff --git a/opencl/source/cl_device/cl_device.h b/opencl/source/cl_device/cl_device.h
index 95f7430400..1d1f49ceb2 100644
--- a/opencl/source/cl_device/cl_device.h
+++ b/opencl/source/cl_device/cl_device.h
@@ -31,7 +31,6 @@ class MemoryManager;
 class PerformanceCounters;
 class Platform;
 class SourceLevelDebugger;
-class SyncBufferHandler;
 struct DeviceInfo;
 struct EngineControl;
 struct HardwareCapabilities;
@@ -77,7 +76,6 @@ class ClDevice : public BaseObject<_cl_device_id> {
     double getPlatformHostTimerResolution() const;
     bool isSimulation() const;
     GFXCORE_FAMILY getRenderCoreFamily() const;
-    void allocateSyncBufferHandler();
     PerformanceCounters *getPerformanceCounters();
     PreemptionMode getPreemptionMode() const;
     bool isDebuggerActive() const;
@@ -119,7 +117,6 @@ class ClDevice : public BaseObject<_cl_device_id> {
     ClDevice *getDeviceById(uint32_t deviceId);
     const std::string &peekCompilerExtensions() const;
     const std::string &peekCompilerExtensionsWithFeatures() const;
-    std::unique_ptr<SyncBufferHandler> syncBufferHandler;
     DeviceBitfield getDeviceBitfield() const;
     bool isDeviceEnqueueSupported() const;
     bool arePipesSupported() const;
diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h
index 010882d18e..c2a0a7ddef 100644
--- a/opencl/source/command_queue/enqueue_common.h
+++ b/opencl/source/command_queue/enqueue_common.h
@@ -15,6 +15,7 @@
 #include "shared/source/memory_manager/surface.h"
 #include "shared/source/os_interface/os_context.h"
 #include "shared/source/program/sync_buffer_handler.h"
+#include "shared/source/program/sync_buffer_handler.inl"
 #include "shared/source/utilities/range.h"
 #include "shared/source/utilities/tag_allocator.h"
 
@@ -404,7 +405,7 @@ void CommandQueueHw<GfxFamily>::processDispatchForKernels(const MultiDispatchInf
         auto &lws = multiDispatchInfo.begin()->getLocalWorkgroupSize();
         size_t workGroupsCount = (gws.x * gws.y * gws.z) /
                                  (lws.x * lws.y * lws.z);
-        device->syncBufferHandler->prepareForEnqueue(workGroupsCount, *multiDispatchInfo.peekMainKernel());
+        device->getDevice().syncBufferHandler->prepareForEnqueue(workGroupsCount, *multiDispatchInfo.peekMainKernel());
     }
 
     if (commandType == CL_COMMAND_NDRANGE_KERNEL) {
@@ -685,7 +686,7 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(
 
     auto rootDeviceIndex = device->getRootDeviceIndex();
     if (multiDispatchInfo.peekMainKernel()->usesSyncBuffer(rootDeviceIndex)) {
-        device->syncBufferHandler->makeResident(getGpgpuCommandStreamReceiver());
+        device->getDevice().syncBufferHandler->makeResident(getGpgpuCommandStreamReceiver());
     }
 
     if (timestampPacketContainer) {
diff --git a/opencl/source/kernel/kernel.cpp b/opencl/source/kernel/kernel.cpp
index 19af26aab5..ad8b2ecc9e 100644
--- a/opencl/source/kernel/kernel.cpp
+++ b/opencl/source/kernel/kernel.cpp
@@ -1117,7 +1117,8 @@ uint32_t Kernel::getMaxWorkGroupCount(const cl_uint workDim, const size_t *local
     auto &hardwareInfo = getHardwareInfo(rootDeviceIndex);
     auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
 
-    if (!hwHelper.isCooperativeDispatchSupported(commandQueue->getGpgpuEngine().getEngineType(), hardwareInfo.platform.eProductFamily)) {
+    auto engineGroupType = hwHelper.getEngineGroupType(commandQueue->getGpgpuEngine().getEngineType(), hardwareInfo);
+    if (!hwHelper.isCooperativeDispatchSupported(engineGroupType, hardwareInfo.platform.eProductFamily)) {
         return 0;
     }
 
diff --git a/opencl/test/unit_test/command_queue/enqueue_handler_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_handler_tests.cpp
index 491006cd0c..53304de7ca 100644
--- a/opencl/test/unit_test/command_queue/enqueue_handler_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_handler_tests.cpp
@@ -547,7 +547,7 @@ HWTEST_F(EnqueueHandlerTest, givenKernelUsingSyncBufferWhenEnqueuingKernelThenSs
     sPatchBindingTableState.Count = 1;
     sPatchBindingTableState.SurfaceStateOffset = 0;
 
-    pClDevice->allocateSyncBufferHandler();
+    pDevice->allocateSyncBufferHandler();
 
     size_t offset = 0;
     size_t size = 1;
@@ -591,7 +591,7 @@ HWTEST_F(EnqueueHandlerTest, givenKernelUsingSyncBufferWhenEnqueuingKernelThenSs
         hwParser.parseCommands<FamilyType>(*mockCmdQ);
 
         auto &surfaceState = hwParser.getSurfaceState<FamilyType>(&surfaceStateHeap, 0);
-        auto pSyncBufferHandler = static_cast<MockSyncBufferHandler *>(pClDevice->syncBufferHandler.get());
+        auto pSyncBufferHandler = static_cast<MockSyncBufferHandler *>(pDevice->syncBufferHandler.get());
         EXPECT_EQ(pSyncBufferHandler->graphicsAllocation->getGpuAddress(), surfaceState.getSurfaceBaseAddress());
     }
 }
diff --git a/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp
index e73edaf3c1..faeae8cf8a 100644
--- a/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp
@@ -218,7 +218,8 @@ using clEnqueueNDCountKernelTests = api_tests;
 
 TEST_F(clEnqueueNDCountKernelTests, GivenQueueIncapableWhenEnqueuingNDCountKernelINTELThenInvalidOperationIsReturned) {
     auto &hwHelper = HwHelper::get(::defaultHwInfo->platform.eRenderCoreFamily);
-    if (!hwHelper.isCooperativeDispatchSupported(pCommandQueue->getGpgpuEngine().getEngineType(), ::defaultHwInfo->platform.eProductFamily)) {
+    auto engineGroupType = hwHelper.getEngineGroupType(pCommandQueue->getGpgpuEngine().getEngineType(), *::defaultHwInfo);
+    if (!hwHelper.isCooperativeDispatchSupported(engineGroupType, ::defaultHwInfo->platform.eProductFamily)) {
         GTEST_SKIP();
     }
 
@@ -250,7 +251,8 @@ TEST_F(EnqueueKernelTest, givenKernelWhenAllArgsAreSetThenClEnqueueNDCountKernel
     CommandQueue *pCmdQ2 = createCommandQueue(pClDevice);
 
     HwHelper &hwHelper = HwHelper::get(pClDevice->getDevice().getHardwareInfo().platform.eRenderCoreFamily);
-    if (!hwHelper.isCooperativeDispatchSupported(pCmdQ2->getGpgpuEngine().getEngineType(), pClDevice->getDevice().getHardwareInfo().platform.eProductFamily)) {
+    auto engineGroupType = hwHelper.getEngineGroupType(pCmdQ2->getGpgpuEngine().getEngineType(), hardwareInfo);
+    if (!hwHelper.isCooperativeDispatchSupported(engineGroupType, pClDevice->getDevice().getHardwareInfo().platform.eProductFamily)) {
         pCmdQ2->getGpgpuEngine().osContext = pCmdQ2->getDevice().getEngine(aub_stream::ENGINE_CCS, EngineUsage::LowPriority).osContext;
     }
 
@@ -296,7 +298,8 @@ TEST_F(EnqueueKernelTest, givenKernelWhenNotAllArgsAreSetButSetKernelArgIsCalled
     CommandQueue *pCmdQ2 = createCommandQueue(pClDevice);
 
     HwHelper &hwHelper = HwHelper::get(pClDevice->getDevice().getHardwareInfo().platform.eRenderCoreFamily);
-    if (!hwHelper.isCooperativeDispatchSupported(pCmdQ2->getGpgpuEngine().getEngineType(), pClDevice->getDevice().getHardwareInfo().platform.eProductFamily)) {
+    auto engineGroupType = hwHelper.getEngineGroupType(pCmdQ2->getGpgpuEngine().getEngineType(), hardwareInfo);
+    if (!hwHelper.isCooperativeDispatchSupported(engineGroupType, pClDevice->getDevice().getHardwareInfo().platform.eProductFamily)) {
         pCmdQ2->getGpgpuEngine().osContext = pCmdQ2->getDevice().getEngine(aub_stream::ENGINE_CCS, EngineUsage::LowPriority).osContext;
     }
 
@@ -342,7 +345,8 @@ TEST_F(EnqueueKernelTest, givenKernelWhenSetKernelArgIsCalledForEachArgButAtLeas
     CommandQueue *pCmdQ2 = createCommandQueue(pClDevice);
 
     HwHelper &hwHelper = HwHelper::get(pClDevice->getDevice().getHardwareInfo().platform.eRenderCoreFamily);
-    if (!hwHelper.isCooperativeDispatchSupported(pCmdQ2->getGpgpuEngine().getEngineType(), pClDevice->getDevice().getHardwareInfo().platform.eProductFamily)) {
+    auto engineGroupType = hwHelper.getEngineGroupType(pCmdQ2->getGpgpuEngine().getEngineType(), hardwareInfo);
+    if (!hwHelper.isCooperativeDispatchSupported(engineGroupType, pClDevice->getDevice().getHardwareInfo().platform.eProductFamily)) {
         pCmdQ2->getGpgpuEngine().osContext = pCmdQ2->getDevice().getEngine(aub_stream::ENGINE_CCS, EngineUsage::LowPriority).osContext;
     }
 
diff --git a/opencl/test/unit_test/command_queue/sync_buffer_handler_tests.cpp b/opencl/test/unit_test/command_queue/sync_buffer_handler_tests.cpp
index 1f38106a62..1c0102812b 100644
--- a/opencl/test/unit_test/command_queue/sync_buffer_handler_tests.cpp
+++ b/opencl/test/unit_test/command_queue/sync_buffer_handler_tests.cpp
@@ -6,7 +6,7 @@
  */
 
 #include "shared/source/program/sync_buffer_handler.h"
-#include "shared/test/common/helpers/debug_manager_state_restore.h"
+#include "shared/test/common/mocks/ult_device_factory.h"
 
 #include "opencl/source/api/api.h"
 #include "opencl/test/unit_test/fixtures/enqueue_handler_fixture.h"
@@ -86,7 +86,7 @@ class SyncBufferHandlerTest : public SyncBufferEnqueueHandlerTest {
     }
 
     MockSyncBufferHandler *getSyncBufferHandler() {
-        return reinterpret_cast<MockSyncBufferHandler *>(pClDevice->syncBufferHandler.get());
+        return reinterpret_cast<MockSyncBufferHandler *>(pDevice->syncBufferHandler.get());
     }
 
     cl_int enqueueNDCount() {
@@ -94,7 +94,8 @@ class SyncBufferHandlerTest : public SyncBufferEnqueueHandlerTest {
     }
 
     bool isCooperativeDispatchSupported() {
-        return hwHelper->isCooperativeDispatchSupported(commandQueue->getGpgpuEngine().getEngineType(), commandQueue->getDevice().getHardwareInfo().platform.eProductFamily);
+        auto engineGroupType = hwHelper->getEngineGroupType(commandQueue->getGpgpuEngine().getEngineType(), hardwareInfo);
+        return hwHelper->isCooperativeDispatchSupported(engineGroupType, commandQueue->getDevice().getHardwareInfo().platform.eProductFamily);
     }
 
     const cl_uint workDim = 1;
@@ -157,7 +158,7 @@ HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenMaxWorkgroupCountWhenEnqueuingCon
 HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenTooHighWorkgroupCountWhenEnqueuingConcurrentKernelThenErrorIsReturned) {
     size_t maxWorkGroupCount = kernel->getMaxWorkGroupCount(workDim, lws, commandQueue);
     workgroupCount[0] = maxWorkGroupCount + 1;
-    globalWorkSize[0] = maxWorkGroupCount * lws[0] + 1;
+    globalWorkSize[0] = maxWorkGroupCount * lws[0];
 
     auto retVal = enqueueNDCount();
     EXPECT_EQ(CL_INVALID_VALUE, retVal);
@@ -180,7 +181,7 @@ HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenSshRequiredWhenPatchingSyncBuffer
     kernelInternals->kernelInfo.kernelDescriptor.kernelAttributes.bufferAddressingMode = KernelDescriptor::BindfulAndStateless;
     patchAllocateSyncBuffer();
 
-    pClDevice->allocateSyncBufferHandler();
+    pDevice->allocateSyncBufferHandler();
     auto syncBufferHandler = getSyncBufferHandler();
     auto surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(kernel->getSurfaceStateHeap(rootDeviceIndex),
                                                                            sPatchAllocateSyncBuffer.SurfaceStateHeapOffset));
@@ -196,7 +197,7 @@ HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenSshRequiredWhenPatchingSyncBuffer
 
 TEST(SyncBufferHandlerDeviceTest, GivenRootDeviceWhenAllocateSyncBufferIsCalledTwiceThenTheObjectIsCreatedOnlyOnce) {
     const size_t testUsedBufferSize = 100;
-    MockClDevice rootDevice{new MockDevice};
+    MockDevice rootDevice;
     rootDevice.allocateSyncBufferHandler();
     auto syncBufferHandler = reinterpret_cast<MockSyncBufferHandler *>(rootDevice.syncBufferHandler.get());
 
@@ -210,20 +211,17 @@ TEST(SyncBufferHandlerDeviceTest, GivenRootDeviceWhenAllocateSyncBufferIsCalledT
 }
 
 TEST(SyncBufferHandlerDeviceTest, GivenSubDeviceWhenAllocateSyncBufferIsCalledTwiceThenTheObjectIsCreatedOnlyOnce) {
-    DebugManagerStateRestore restorer;
-    DebugManager.flags.CreateMultipleSubDevices.set(2);
-    VariableBackup<bool> mockDeviceFlagBackup(&MockDevice::createSingleDevice, false);
-    auto rootDevice = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get()));
-    auto &subDevice = rootDevice->subDevices[0];
-    subDevice->allocateSyncBufferHandler();
-    auto syncBufferHandler = reinterpret_cast<MockSyncBufferHandler *>(subDevice->syncBufferHandler.get());
+    UltDeviceFactory ultDeviceFactory{1, 2}; // NOTE(review): presumably 1 root device with 2 sub-devices - confirm against the UltDeviceFactory constructor
+    auto pSubDevice = ultDeviceFactory.subDevices[0]; // the test only exercises the first sub-device
+    pSubDevice->allocateSyncBufferHandler(); // first call: the handler is expected to be created here
+    auto syncBufferHandler = reinterpret_cast<MockSyncBufferHandler *>(pSubDevice->syncBufferHandler.get());
 
     const size_t testUsedBufferSize = 100;
     ASSERT_NE(syncBufferHandler->usedBufferSize, testUsedBufferSize);
     syncBufferHandler->usedBufferSize = testUsedBufferSize;
 
-    subDevice->allocateSyncBufferHandler();
-    syncBufferHandler = reinterpret_cast<MockSyncBufferHandler *>(subDevice->syncBufferHandler.get());
+    pSubDevice->allocateSyncBufferHandler(); // second call: must keep the existing handler, so the marker value written above survives
+    syncBufferHandler = reinterpret_cast<MockSyncBufferHandler *>(pSubDevice->syncBufferHandler.get()); // re-read the pointer after the second call
 
     EXPECT_EQ(testUsedBufferSize, syncBufferHandler->usedBufferSize);
 }
diff --git a/opencl/test/unit_test/gtpin/gtpin_tests.cpp b/opencl/test/unit_test/gtpin/gtpin_tests.cpp
index 7782af24fa..777a32cc83 100644
--- a/opencl/test/unit_test/gtpin/gtpin_tests.cpp
+++ b/opencl/test/unit_test/gtpin/gtpin_tests.cpp
@@ -921,8 +921,10 @@ TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenKernelINTELIsExecutedThenGT
     size_t localWorkSize[3] = {1, 1, 1};
     CommandQueue *commandQueue = nullptr;
     WithCastToInternal(cmdQ, &commandQueue);
-    HwHelper &hwHelper = HwHelper::get(pDevice->getDevice().getHardwareInfo().platform.eRenderCoreFamily);
-    if (!hwHelper.isCooperativeDispatchSupported(commandQueue->getGpgpuEngine().getEngineType(), pDevice->getDevice().getHardwareInfo().platform.eProductFamily)) {
+    auto &hwInfo = pDevice->getDevice().getHardwareInfo();
+    HwHelper &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
+    auto engineGroupType = hwHelper.getEngineGroupType(commandQueue->getGpgpuEngine().getEngineType(), hwInfo);
+    if (!hwHelper.isCooperativeDispatchSupported(engineGroupType, pDevice->getDevice().getHardwareInfo().platform.eProductFamily)) {
         commandQueue->getGpgpuEngine().osContext = commandQueue->getDevice().getEngine(aub_stream::ENGINE_CCS, EngineUsage::LowPriority).osContext;
     }
     size_t n = pKernel1->getMaxWorkGroupCount(workDim, localWorkSize, commandQueue);
diff --git a/shared/source/device/device.cpp b/shared/source/device/device.cpp
index 862eebf345..9d219cfe86 100644
--- a/shared/source/device/device.cpp
+++ b/shared/source/device/device.cpp
@@ -44,6 +44,7 @@ Device::~Device() {
         engine.commandStreamReceiver->flushBatchedSubmissions();
     }
 
+    syncBufferHandler.reset();
     commandStreamReceivers.clear();
     executionEnvironment->memoryManager->waitForDeletions();
 
@@ -308,6 +309,15 @@ GmmClientContext *Device::getGmmClientContext() const {
     return getGmmHelper()->getClientContext();
 }
 
+void Device::allocateSyncBufferHandler() { // creates this device's SyncBufferHandler on the first call; later calls leave the existing one untouched
+    static std::mutex mutex; // function-local static: one mutex shared by every Device instance
+    std::unique_lock<std::mutex> lock(mutex); // held until the function returns
+    if (syncBufferHandler.get() == nullptr) { // allocate only when no handler exists yet
+        syncBufferHandler = std::make_unique<SyncBufferHandler>(*this); // the handler is constructed with a reference to this device
+        UNRECOVERABLE_IF(syncBufferHandler.get() == nullptr); // NOTE(review): std::make_unique reports failure by throwing, so this check looks unreachable - confirm
+    }
+}
+
 uint64_t Device::getGlobalMemorySize(uint32_t deviceBitfield) const {
     auto globalMemorySize = getMemoryManager()->isLocalMemorySupported(this->getRootDeviceIndex())
                                 ? getMemoryManager()->getLocalMemorySize(this->getRootDeviceIndex(), deviceBitfield)
diff --git a/shared/source/device/device.h b/shared/source/device/device.h
index e2dbd6edd2..c6a2449679 100644
--- a/shared/source/device/device.h
+++ b/shared/source/device/device.h
@@ -15,6 +15,7 @@
 #include "shared/source/helpers/engine_control.h"
 #include "shared/source/helpers/engine_node_helper.h"
 #include "shared/source/helpers/hw_info.h"
+#include "shared/source/program/sync_buffer_handler.h"
 
 #include "opencl/source/os_interface/performance_counters.h"
 
@@ -92,6 +93,7 @@ class Device : public ReferenceTrackedObject<Device> {
     }
     MOCKABLE_VIRTUAL CompilerInterface *getCompilerInterface() const;
     BuiltIns *getBuiltIns() const;
+    void allocateSyncBufferHandler();
 
     virtual uint32_t getRootDeviceIndex() const = 0;
     virtual uint32_t getNumAvailableDevices() const = 0;
@@ -101,6 +103,7 @@ class Device : public ReferenceTrackedObject<Device> {
     virtual BindlessHeapsHelper *getBindlessHeapsHelper() const = 0;
 
     static decltype(&PerformanceCounters::create) createPerformanceCountersFunc;
+    std::unique_ptr<SyncBufferHandler> syncBufferHandler;
 
   protected:
     Device() = delete;
diff --git a/shared/source/helpers/hw_helper.h b/shared/source/helpers/hw_helper.h
index 04e1590d39..906925f33a 100644
--- a/shared/source/helpers/hw_helper.h
+++ b/shared/source/helpers/hw_helper.h
@@ -124,7 +124,7 @@ class HwHelper {
     virtual bool useOnlyGlobalTimestamps() const = 0;
     virtual bool useSystemMemoryPlacementForISA(const HardwareInfo &hwInfo) const = 0;
     virtual bool packedFormatsSupported() const = 0;
-    virtual bool isCooperativeDispatchSupported(const aub_stream::EngineType engine, const PRODUCT_FAMILY productFamily) const = 0;
+    virtual bool isCooperativeDispatchSupported(const EngineGroupType engineGroupType, const PRODUCT_FAMILY productFamily) const = 0;
     virtual size_t getMaxFillPaternSizeForCopyEngine() const = 0;
     virtual bool isMediaBlockIOSupported(const HardwareInfo &hwInfo) const = 0;
     virtual bool isCopyOnlyEngineType(EngineGroupType type) const = 0;
@@ -325,7 +325,7 @@ class HwHelperHw : public HwHelper {
 
     bool packedFormatsSupported() const override;
 
-    bool isCooperativeDispatchSupported(const aub_stream::EngineType engine, const PRODUCT_FAMILY productFamily) const override;
+    bool isCooperativeDispatchSupported(const EngineGroupType engineGroupType, const PRODUCT_FAMILY productFamily) const override;
 
     size_t getMaxFillPaternSizeForCopyEngine() const override;
 
diff --git a/shared/source/helpers/hw_helper_base.inl b/shared/source/helpers/hw_helper_base.inl
index 49ed996927..f9de825950 100644
--- a/shared/source/helpers/hw_helper_base.inl
+++ b/shared/source/helpers/hw_helper_base.inl
@@ -536,7 +536,7 @@ bool MemorySynchronizationCommands<GfxFamily>::isPipeControlPriorToPipelineSelec
 }
 
 template <typename GfxFamily>
-bool HwHelperHw<GfxFamily>::isCooperativeDispatchSupported(const aub_stream::EngineType engine, const PRODUCT_FAMILY productFamily) const {
+bool HwHelperHw<GfxFamily>::isCooperativeDispatchSupported(const EngineGroupType engineGroupType, const PRODUCT_FAMILY productFamily) const { // generic implementation: reports support regardless of both arguments
     return true;
 }
 
diff --git a/shared/source/program/CMakeLists.txt b/shared/source/program/CMakeLists.txt
index a619d9281b..5d001f178c 100644
--- a/shared/source/program/CMakeLists.txt
+++ b/shared/source/program/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2019-2020 Intel Corporation
+# Copyright (C) 2019-2021 Intel Corporation
 #
 # SPDX-License-Identifier: MIT
 #
@@ -16,6 +16,7 @@ set(NEO_CORE_PROGRAM
     ${CMAKE_CURRENT_SOURCE_DIR}/program_initialization.h
     ${CMAKE_CURRENT_SOURCE_DIR}/sync_buffer_handler.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/sync_buffer_handler.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/sync_buffer_handler.inl
 )
 
 set_property(GLOBAL PROPERTY NEO_CORE_PROGRAM ${NEO_CORE_PROGRAM})
diff --git a/shared/source/program/sync_buffer_handler.cpp b/shared/source/program/sync_buffer_handler.cpp
index bd46449c8d..d297a72592 100644
--- a/shared/source/program/sync_buffer_handler.cpp
+++ b/shared/source/program/sync_buffer_handler.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2019-2020 Intel Corporation
+ * Copyright (C) 2019-2021 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -11,8 +11,6 @@
 #include "shared/source/memory_manager/graphics_allocation.h"
 #include "shared/source/memory_manager/memory_manager.h"
 
-#include "opencl/source/kernel/kernel.h"
-
 namespace NEO {
 
 SyncBufferHandler::~SyncBufferHandler() {
@@ -24,22 +22,6 @@ SyncBufferHandler::SyncBufferHandler(Device &device)
     allocateNewBuffer();
 }
 
-void SyncBufferHandler::prepareForEnqueue(size_t workGroupsCount, Kernel &kernel) {
-    auto requiredSize = workGroupsCount;
-    std::lock_guard<std::mutex> guard(this->mutex);
-
-    bool isCurrentBufferFull = (usedBufferSize + requiredSize > bufferSize);
-    if (isCurrentBufferFull) {
-        memoryManager.checkGpuUsageAndDestroyGraphicsAllocations(graphicsAllocation);
-        allocateNewBuffer();
-        usedBufferSize = 0;
-    }
-
-    kernel.patchSyncBuffer(device, graphicsAllocation, usedBufferSize);
-
-    usedBufferSize += requiredSize;
-}
-
 void SyncBufferHandler::makeResident(CommandStreamReceiver &csr) {
     csr.makeResident(*graphicsAllocation);
 }
diff --git a/shared/source/program/sync_buffer_handler.h b/shared/source/program/sync_buffer_handler.h
index 29de7dac4b..9a27d467ab 100644
--- a/shared/source/program/sync_buffer_handler.h
+++ b/shared/source/program/sync_buffer_handler.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2019-2020 Intel Corporation
+ * Copyright (C) 2019-2021 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -17,8 +17,8 @@ class CommandStreamReceiver;
 class Context;
 class Device;
 class GraphicsAllocation;
-class MemoryManager;
 class Kernel;
+class MemoryManager;
 
 class SyncBufferHandler {
   public:
@@ -26,7 +26,8 @@ class SyncBufferHandler {
 
     SyncBufferHandler(Device &device);
 
-    void prepareForEnqueue(size_t workGroupsCount, Kernel &kernel);
+    template <typename KernelT>
+    void prepareForEnqueue(size_t workGroupsCount, KernelT &kernel);
     void makeResident(CommandStreamReceiver &csr);
 
   protected:
diff --git a/shared/source/program/sync_buffer_handler.inl b/shared/source/program/sync_buffer_handler.inl
new file mode 100644
index 0000000000..1e7d5d7dba
--- /dev/null
+++ b/shared/source/program/sync_buffer_handler.inl
@@ -0,0 +1,25 @@
+/*
+ * Copyright (C) 2021 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#include "shared/source/memory_manager/memory_manager.h"
+
+template <typename KernelT> // KernelT: any type providing patchSyncBuffer(device, allocation, offset) as called below
+void NEO::SyncBufferHandler::prepareForEnqueue(size_t workGroupsCount, KernelT &kernel) { // reserves a slice of the sync buffer for this enqueue and patches the kernel with its location
+    auto requiredSize = workGroupsCount; // the slice size equals the work-group count
+    std::lock_guard<std::mutex> guard(this->mutex); // held for the rest of the function, covering the bookkeeping below
+
+    bool isCurrentBufferFull = (usedBufferSize + requiredSize > bufferSize); // the request does not fit in what is left of the current buffer
+    if (isCurrentBufferFull) {
+        memoryManager.checkGpuUsageAndDestroyGraphicsAllocations(graphicsAllocation); // NOTE(review): by its name, checks GPU usage before destroying the old allocation - confirm in MemoryManager
+        allocateNewBuffer(); // switch to a fresh buffer
+        usedBufferSize = 0; // nothing has been handed out from the fresh buffer yet
+    }
+
+    kernel.patchSyncBuffer(device, graphicsAllocation, usedBufferSize); // the kernel receives the allocation and the offset where its slice starts
+
+    usedBufferSize += requiredSize; // advance past the slice just handed out
+}