Add GTPin feature to allocate buffer in shared memory

Related-To: NEO-5667
Signed-off-by: Milczarek, Slawomir <slawomir.milczarek@intel.com>
2025-09-15 13:01:45 +08:00 · 2021-03-25 13:57:58 +00:00
parent 671d916c70
commit 255e85c124
7 changed files with 279 additions and 39 deletions
--- a/opencl/source/gtpin/gtpin_callbacks.cpp
+++ b/opencl/source/gtpin/gtpin_callbacks.cpp
@ -7,7 +7,7 @@
 #include "shared/source/command_stream/command_stream_receiver.h"
 #include "shared/source/memory_manager/surface.h"
-#include "shared/source/utilities/spinlock.h"
+#include "shared/source/memory_manager/unified_memory_manager.h"
 #include "opencl/source/cl_device/cl_device.h"
 #include "opencl/source/command_queue/command_queue.h"
@ -30,13 +30,15 @@ using namespace gtpin;
 namespace NEO {
 using GTPinLockType = std::recursive_mutex;
 extern gtpin::ocl::gtpin_events_t GTPinCallbacks;
 igc_init_t *pIgcInit = nullptr;
 std::atomic<int> sequenceCount(1);
 CommandQueue *pCmdQueueForFlushTask = nullptr;
 std::deque<gtpinkexec_t> kernelExecQueue;
-SpinLock kernelExecQueueLock;
+GTPinLockType kernelExecQueueLock;
 void gtpinNotifyContextCreate(cl_context context) {
    if (isGTPinInitialized) {
@ -131,7 +133,7 @@ void gtpinNotifyKernelSubmit(cl_kernel kernel, void *pCmdQueue) {
        kExec.gtpinResource = (cl_mem)resource;
        kExec.commandBuffer = commandBuffer;
        kExec.pCommandQueue = (CommandQueue *)pCmdQueue;
-        std::unique_lock<SpinLock> lock{kernelExecQueueLock};
+        std::unique_lock<GTPinLockType> lock{kernelExecQueueLock};
        kernelExecQueue.push_back(kExec);
        lock.unlock();
        // Patch SSH[gtpinBTI] with GT-Pin resource
@ -142,10 +144,19 @@ void gtpinNotifyKernelSubmit(cl_kernel kernel, void *pCmdQueue) {
        GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(genFamily);
        size_t gtpinBTI = pKernel->getNumberOfBindingTableStates() - 1;
        void *pSurfaceState = gtpinHelper.getSurfaceState(pKernel, gtpinBTI);
-        cl_mem buffer = (cl_mem)resource;
+        if (gtpinHelper.canUseSharedAllocation(device.getHardwareInfo())) {
-        auto pBuffer = castToObjectOrAbort<Buffer>(buffer);
+            auto allocData = reinterpret_cast<SvmAllocationData *>(resource);
-        pBuffer->setArgStateful(pSurfaceState, false, false, false, false, device,
+            auto gpuAllocation = allocData->gpuAllocations.getGraphicsAllocation(rootDeviceIndex);
-                                pKernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, pContext->getNumDevices());
+            size_t size = gpuAllocation->getUnderlyingBufferSize();
            Buffer::setSurfaceState(&device, pSurfaceState, false, false, size, gpuAllocation->getUnderlyingBuffer(), 0, gpuAllocation, 0, 0,
                                    pKernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, pContext->getNumDevices());
            pKernel->setUnifiedMemoryExecInfo(gpuAllocation);
        } else {
            cl_mem buffer = (cl_mem)resource;
            auto pBuffer = castToObjectOrAbort<Buffer>(buffer);
            pBuffer->setArgStateful(pSurfaceState, false, false, false, false, device,
                                    pKernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, pContext->getNumDevices());
        }
    }
 }
@ -157,7 +168,7 @@ void gtpinNotifyPreFlushTask(void *pCmdQueue) {
 void gtpinNotifyFlushTask(uint32_t flushedTaskCount) {
    if (isGTPinInitialized) {
-        std::unique_lock<SpinLock> lock{kernelExecQueueLock};
+        std::unique_lock<GTPinLockType> lock{kernelExecQueueLock};
        size_t numElems = kernelExecQueue.size();
        for (size_t n = 0; n < numElems; n++) {
            if ((kernelExecQueue[n].pCommandQueue == pCmdQueueForFlushTask) && !kernelExecQueue[n].isTaskCountValid) {
@ -173,7 +184,7 @@ void gtpinNotifyFlushTask(uint32_t flushedTaskCount) {
 void gtpinNotifyTaskCompletion(uint32_t completedTaskCount) {
    if (isGTPinInitialized) {
-        std::unique_lock<SpinLock> lock{kernelExecQueueLock};
+        std::unique_lock<GTPinLockType> lock{kernelExecQueueLock};
        size_t numElems = kernelExecQueue.size();
        for (size_t n = 0; n < numElems;) {
            if (kernelExecQueue[n].isTaskCountValid && (kernelExecQueue[n].taskCount <= completedTaskCount)) {
@ -191,15 +202,23 @@ void gtpinNotifyTaskCompletion(uint32_t completedTaskCount) {
 void gtpinNotifyMakeResident(void *pKernel, void *pCSR) {
    if (isGTPinInitialized) {
-        std::unique_lock<SpinLock> lock{kernelExecQueueLock};
+        std::unique_lock<GTPinLockType> lock{kernelExecQueueLock};
        Context &context = static_cast<Kernel *>(pKernel)->getContext();
        GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(context.getDevice(0)->getHardwareInfo().platform.eRenderCoreFamily);
        size_t numElems = kernelExecQueue.size();
        for (size_t n = 0; n < numElems; n++) {
            if ((kernelExecQueue[n].pKernel == pKernel) && !kernelExecQueue[n].isResourceResident && kernelExecQueue[n].gtpinResource) {
                // It's time for kernel to make resident its GT-Pin resource
                CommandStreamReceiver *pCommandStreamReceiver = reinterpret_cast<CommandStreamReceiver *>(pCSR);
-                cl_mem gtpinBuffer = kernelExecQueue[n].gtpinResource;
+                GraphicsAllocation *pGfxAlloc = nullptr;
-                auto pBuffer = castToObjectOrAbort<Buffer>(gtpinBuffer);
+                if (gtpinHelper.canUseSharedAllocation(context.getDevice(0)->getHardwareInfo())) {
-                GraphicsAllocation *pGfxAlloc = pBuffer->getGraphicsAllocation(pCommandStreamReceiver->getRootDeviceIndex());
+                    auto allocData = reinterpret_cast<SvmAllocationData *>(kernelExecQueue[n].gtpinResource);
                    pGfxAlloc = allocData->gpuAllocations.getGraphicsAllocation(pCommandStreamReceiver->getRootDeviceIndex());
                } else {
                    cl_mem gtpinBuffer = kernelExecQueue[n].gtpinResource;
                    auto pBuffer = castToObjectOrAbort<Buffer>(gtpinBuffer);
                    pGfxAlloc = pBuffer->getGraphicsAllocation(pCommandStreamReceiver->getRootDeviceIndex());
                }
                pCommandStreamReceiver->makeResident(*pGfxAlloc);
                kernelExecQueue[n].isResourceResident = true;
                break;
@ -210,7 +229,7 @@ void gtpinNotifyMakeResident(void *pKernel, void *pCSR) {
 void gtpinNotifyUpdateResidencyList(void *pKernel, void *pResVec) {
    if (isGTPinInitialized) {
-        std::unique_lock<SpinLock> lock{kernelExecQueueLock};
+        std::unique_lock<GTPinLockType> lock{kernelExecQueueLock};
        size_t numElems = kernelExecQueue.size();
        for (size_t n = 0; n < numElems; n++) {
            if ((kernelExecQueue[n].pKernel == pKernel) && !kernelExecQueue[n].isResourceResident && kernelExecQueue[n].gtpinResource) {
--- a/opencl/source/gtpin/gtpin_helpers.cpp
+++ b/opencl/source/gtpin/gtpin_helpers.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017-2020 Intel Corporation
+ * Copyright (C) 2017-2021 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -8,8 +8,12 @@
 #include "gtpin_helpers.h"
 #include "shared/source/memory_manager/memory_manager.h"
 #include "shared/source/memory_manager/unified_memory_manager.h"
 #include "opencl/source/api/api.h"
 #include "opencl/source/cl_device/cl_device.h"
 #include "opencl/source/context/context.h"
 #include "opencl/source/gtpin/gtpin_hw_helper.h"
 #include "opencl/source/helpers/validators.h"
 #include "opencl/source/mem_obj/buffer.h"
@ -27,27 +31,39 @@ GTPIN_DI_STATUS GTPIN_DRIVER_CALLCONV gtpinCreateBuffer(context_handle_t context
        return GTPIN_DI_ERROR_INVALID_ARGUMENT;
    }
    size_t size = alignUp(reqSize, MemoryConstants::cacheLineSize);
-    void *hostPtr = pContext->getMemoryManager()->allocateSystemMemory(size, MemoryConstants::pageSize);
+    GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(pContext->getDevice(0)->getHardwareInfo().platform.eRenderCoreFamily);
-    if (hostPtr == nullptr) {
+    if (gtpinHelper.canUseSharedAllocation(pContext->getDevice(0)->getHardwareInfo())) {
-        return GTPIN_DI_ERROR_ALLOCATION_FAILED;
+        void *unfiedMemorySharedAllocation = clSharedMemAllocINTEL(pContext, pContext->getDevice(0), 0, size, 0, &diag);
        auto allocationsManager = pContext->getSVMAllocsManager();
        auto graphicsAllocation = allocationsManager->getSVMAlloc(unfiedMemorySharedAllocation);
        *pResource = (resource_handle_t)graphicsAllocation;
    } else {
        void *hostPtr = pContext->getMemoryManager()->allocateSystemMemory(size, MemoryConstants::pageSize);
        if (hostPtr == nullptr) {
            return GTPIN_DI_ERROR_ALLOCATION_FAILED;
        }
        cl_mem buffer = Buffer::create(pContext, CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE | CL_MEM_FORCE_HOST_MEMORY_INTEL, size, hostPtr, diag);
        *pResource = (resource_handle_t)buffer;
    }
    cl_mem buffer = Buffer::create(pContext, CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE | CL_MEM_FORCE_HOST_MEMORY_INTEL, size, hostPtr, diag);
    *pResource = (resource_handle_t)buffer;
    return GTPIN_DI_SUCCESS;
 }
 // Releases a GT-Pin resource handle.
 // Returns GTPIN_DI_ERROR_INVALID_ARGUMENT when the context or the resource is invalid, GTPIN_DI_SUCCESS otherwise.
 // The shared-allocation path is selected with GTPinHwHelper::canUseSharedAllocation, the same predicate that
 // gtpinCreateBuffer, gtpinMapBuffer and gtpinUnmapBuffer use, so a handle is always released as the kind of
 // object it was created as (SvmAllocationData for shared memory, MemObj otherwise).
 GTPIN_DI_STATUS GTPIN_DRIVER_CALLCONV gtpinFreeBuffer(context_handle_t context, resource_handle_t resource) {
    cl_mem buffer = (cl_mem)resource;
    Context *pContext = castToObject<Context>((cl_context)context);
-    if ((pContext == nullptr) || (buffer == nullptr)) {
+    if ((pContext == nullptr) || (resource == nullptr)) {
        return GTPIN_DI_ERROR_INVALID_ARGUMENT;
    }
-    auto pMemObj = castToObject<MemObj>(buffer);
+    GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(pContext->getDevice(0)->getHardwareInfo().platform.eRenderCoreFamily);
-    if (pMemObj == nullptr) {
+    if (gtpinHelper.canUseSharedAllocation(pContext->getDevice(0)->getHardwareInfo())) {
-        return GTPIN_DI_ERROR_INVALID_ARGUMENT;
+        auto allocData = reinterpret_cast<SvmAllocationData *>(resource);
        // The unified memory allocation is freed through the pointer of its CPU-side allocation
        clMemFreeINTEL(pContext, allocData->cpuAllocation->getUnderlyingBuffer());
    } else {
        auto pMemObj = castToObject<MemObj>(resource);
        if (pMemObj == nullptr) {
            return GTPIN_DI_ERROR_INVALID_ARGUMENT;
        }
        alignedFree(pMemObj->getHostPtr());
        pMemObj->release();
    }
    alignedFree(pMemObj->getHostPtr());
    pMemObj->release();
    return GTPIN_DI_SUCCESS;
 }
@ -57,23 +73,31 @@ GTPIN_DI_STATUS GTPIN_DRIVER_CALLCONV gtpinMapBuffer(context_handle_t context, r
    if ((pContext == nullptr) || (buffer == nullptr) || (pAddress == nullptr)) {
        return GTPIN_DI_ERROR_INVALID_ARGUMENT;
    }
-    auto pMemObj = castToObject<MemObj>(buffer);
+    GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(pContext->getDevice(0)->getHardwareInfo().platform.eRenderCoreFamily);
-    if (pMemObj == nullptr) {
+    if (gtpinHelper.canUseSharedAllocation(pContext->getDevice(0)->getHardwareInfo())) {
-        return GTPIN_DI_ERROR_INVALID_ARGUMENT;
+        auto allocData = reinterpret_cast<SvmAllocationData *>(resource);
        *pAddress = reinterpret_cast<uint8_t *>(allocData->cpuAllocation->getUnderlyingBuffer());
    } else {
        auto pMemObj = castToObject<MemObj>(buffer);
        if (pMemObj == nullptr) {
            return GTPIN_DI_ERROR_INVALID_ARGUMENT;
        }
        *pAddress = reinterpret_cast<uint8_t *>(pMemObj->getHostPtr());
    }
    *pAddress = (uint8_t *)pMemObj->getHostPtr();
    return GTPIN_DI_SUCCESS;
 }
 // Unmaps a GT-Pin resource. The visible lines only validate the arguments:
 // a null context or resource is rejected, and when shared allocations are not in use
 // the handle must additionally cast to a MemObj. Returns GTPIN_DI_SUCCESS otherwise.
 // NOTE(review): the added lines reference `resource` only; confirm whether the `buffer` local is still needed.
 GTPIN_DI_STATUS GTPIN_DRIVER_CALLCONV gtpinUnmapBuffer(context_handle_t context, resource_handle_t resource) {
    cl_mem buffer = (cl_mem)resource;
    Context *pContext = castToObject<Context>((cl_context)context);
-    if ((pContext == nullptr) || (buffer == nullptr)) {
+    if ((pContext == nullptr) || (resource == nullptr)) {
        return GTPIN_DI_ERROR_INVALID_ARGUMENT;
    }
-    auto pMemObj = castToObject<MemObj>(buffer);
+    GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(pContext->getDevice(0)->getHardwareInfo().platform.eRenderCoreFamily);
-    if (pMemObj == nullptr) {
+    if (!gtpinHelper.canUseSharedAllocation(pContext->getDevice(0)->getHardwareInfo())) {
-        return GTPIN_DI_ERROR_INVALID_ARGUMENT;
+        auto pMemObj = castToObject<MemObj>(resource);
        if (pMemObj == nullptr) {
            return GTPIN_DI_ERROR_INVALID_ARGUMENT;
        }
    }
    return GTPIN_DI_SUCCESS;
 }
--- a/opencl/source/gtpin/gtpin_hw_helper.h
+++ b/opencl/source/gtpin/gtpin_hw_helper.h
@ -17,6 +17,7 @@ class GTPinHwHelper {
    virtual uint32_t getGenVersion() = 0;
    virtual bool addSurfaceState(Kernel *pKernel) = 0;
    virtual void *getSurfaceState(Kernel *pKernel, size_t bti) = 0;
    virtual bool canUseSharedAllocation(const HardwareInfo &hwInfo) const = 0;
  protected:
    GTPinHwHelper(){};
@ -32,8 +33,9 @@ class GTPinHwHelperHw : public GTPinHwHelper {
    uint32_t getGenVersion() override;
    bool addSurfaceState(Kernel *pKernel) override;
    void *getSurfaceState(Kernel *pKernel, size_t bti) override;
    bool canUseSharedAllocation(const HardwareInfo &hwInfo) const override;
-  private:
+  protected:
    GTPinHwHelperHw(){};
 };
 } // namespace NEO
--- a/opencl/source/gtpin/gtpin_hw_helper.inl
+++ b/opencl/source/gtpin/gtpin_hw_helper.inl
@ -56,4 +56,14 @@ void *GTPinHwHelperHw<GfxFamily>::getSurfaceState(Kernel *pKernel, size_t bti) {
    return pSurfaceState;
 }
 // Tells whether GT-Pin may place its buffer in a shared memory allocation.
 // The GTPinAllocateBufferInSharedMemory debug flag has to request it (-1 keeps the default, which is off)
 // and the capability table of hwInfo has to report SVM support.
 template <typename GfxFamily>
 bool GTPinHwHelperHw<GfxFamily>::canUseSharedAllocation(const HardwareInfo &hwInfo) const {
    const auto debugOverride = DebugManager.flags.GTPinAllocateBufferInSharedMemory.get();
    // Any value other than -1 is read as a boolean request
    bool sharedAllocationAllowed = (debugOverride != -1) && (debugOverride != 0);
    // The platform capability can only veto the request, never enable it
    sharedAllocationAllowed &= hwInfo.capabilityTable.ftrSvm;
    return sharedAllocationAllowed;
 }
 } // namespace NEO
--- a/opencl/test/unit_test/gtpin/gtpin_tests.cpp
+++ b/opencl/test/unit_test/gtpin/gtpin_tests.cpp
@ -11,7 +11,9 @@
 #include "shared/source/helpers/file_io.h"
 #include "shared/source/helpers/hash.h"
 #include "shared/source/memory_manager/surface.h"
 #include "shared/source/memory_manager/unified_memory_manager.h"
 #include "shared/source/os_interface/os_context.h"
 #include "shared/test/common/helpers/debug_manager_state_restore.h"
 #include "shared/test/common/helpers/test_files.h"
 #include "shared/test/common/helpers/variable_backup.h"
 #include "shared/test/common/mocks/mock_device.h"
@ -52,7 +54,8 @@ using namespace gtpin;
 namespace NEO {
 extern std::deque<gtpinkexec_t> kernelExecQueue;
-}
+extern GTPinHwHelper *gtpinHwHelperFactory[IGFX_MAX_CORE];
 } // namespace NEO
 namespace ULT {
@ -149,6 +152,11 @@ class GTPinFixture : public ContextFixture, public MemoryManagementFixture {
  public:
    // Default fixture set-up: turns the GTPinAllocateBufferInSharedMemory debug flag off
    // and then runs the shared set-up steps in SetUpImpl().
    void SetUp() override {
        DebugManager.flags.GTPinAllocateBufferInSharedMemory.set(false);
        SetUpImpl();
    }
    void SetUpImpl() {
        platformsImpl->clear();
        MemoryManagementFixture::SetUp();
        constructPlatform();
@ -195,6 +203,7 @@ class GTPinFixture : public ContextFixture, public MemoryManagementFixture {
    gtpin::ocl::gtpin_events_t gtpinCallbacks;
    MockMemoryManagerWithFailures *memoryManager = nullptr;
    uint32_t rootDeviceIndex = std::numeric_limits<uint32_t>::max();
    DebugManagerStateRestore restore;
 };
 typedef Test<GTPinFixture> GTPinTests;
@ -2495,4 +2504,178 @@ HWTEST_F(GTPinTests, givenGtPinInitializedWhenSubmittingKernelCommandThenFlushed
    EXPECT_EQ(kernelExecQueue[0].taskCount, stamp.taskCount);
 }
 // GT-Pin fixture variant that enables local memory and requests shared-memory GT-Pin buffers
 // before running the common set-up steps of GTPinFixture.
 // The debug flags changed here are restored by the DebugManagerStateRestore member inherited from
 // GTPinFixture, so no second (shadowing) restore object is declared in this class.
 class GTPinFixtureWithLocalMemory : public GTPinFixture {
  public:
    // Sets the debug flags first so that SetUpImpl() builds the platform with them in effect.
    void SetUp() override {
        DebugManager.flags.EnableLocalMemory.set(true);
        DebugManager.flags.GTPinAllocateBufferInSharedMemory.set(true);
        GTPinFixture::SetUpImpl();
    }
    void TearDown() override {
        GTPinFixture::TearDown();
    }
 };
 using GTPinTestsWithLocalMemory = Test<GTPinFixtureWithLocalMemory>;
 // Checks that a platform without SVM support never gets a shared-memory GT-Pin buffer.
 // The helper is queried twice: with the debug flag at -1 (default) and with the flag forcing the feature on.
 // Only the forced-on query exercises the SVM gate, because with -1 the helper reports false on its own.
 TEST_F(GTPinTestsWithLocalMemory, whenPlatformHasNoSvmSupportThenGtPinBufferCantBeAllocatedInSharedMemory) {
    const auto &hwInfo = pDevice->getHardwareInfo();
    GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(hwInfo.platform.eRenderCoreFamily);

    DebugManager.flags.GTPinAllocateBufferInSharedMemory.set(-1);
    const bool allowedByDefault = gtpinHelper.canUseSharedAllocation(hwInfo);

    // With the feature forced on, missing SVM support is the only thing left that can veto it
    DebugManager.flags.GTPinAllocateBufferInSharedMemory.set(1);
    const bool allowedWhenForced = gtpinHelper.canUseSharedAllocation(hwInfo);

    if (!hwInfo.capabilityTable.ftrSvm) {
        EXPECT_FALSE(allowedByDefault);
        EXPECT_FALSE(allowedWhenForced);
    }
 }
 // Creates a GT-Pin buffer while shared allocations are in use and checks that the returned handle is
 // SvmAllocationData with both a CPU and a GPU allocation, then maps, unmaps and frees the buffer.
 // Skipped when the helper reports that shared allocations cannot be used.
 HWTEST_F(GTPinTestsWithLocalMemory, givenGtPinCanUseSharedAllocationWhenGtPinBufferIsCreatedThenAllocateBufferInSharedMemory) {
    GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(pDevice->getHardwareInfo().platform.eRenderCoreFamily);
    if (!gtpinHelper.canUseSharedAllocation(pDevice->getHardwareInfo())) {
        GTEST_SKIP();
    }
    resource_handle_t resource = nullptr;
    cl_context ctxt = (cl_context)((Context *)pContext);
    GTPIN_DI_STATUS status = gtpinCreateBuffer((gtpin::context_handle_t)ctxt, 256, &resource);
    // Fatal checks: the handle is dereferenced right below
    ASSERT_EQ(GTPIN_DI_SUCCESS, status);
    ASSERT_NE(nullptr, resource);
    auto allocData = reinterpret_cast<SvmAllocationData *>(resource);
    auto cpuAllocation = allocData->cpuAllocation;
    ASSERT_NE(nullptr, cpuAllocation);
    EXPECT_NE(GraphicsAllocation::AllocationType::UNIFIED_SHARED_MEMORY, cpuAllocation->getAllocationType());
    auto gpuAllocation = allocData->gpuAllocations.getGraphicsAllocation(pDevice->getRootDeviceIndex());
    ASSERT_NE(nullptr, gpuAllocation);
    EXPECT_NE(GraphicsAllocation::AllocationType::UNIFIED_SHARED_MEMORY, gpuAllocation->getAllocationType());
    uint8_t *address = nullptr;
    status = gtpinMapBuffer((gtpin::context_handle_t)ctxt, resource, &address);
    EXPECT_EQ(GTPIN_DI_SUCCESS, status);
    // Mapping must hand out the CPU-side pointer of the shared allocation
    EXPECT_EQ(allocData->cpuAllocation->getUnderlyingBuffer(), address);
    status = gtpinUnmapBuffer((gtpin::context_handle_t)ctxt, resource);
    EXPECT_EQ(GTPIN_DI_SUCCESS, status);
    status = gtpinFreeBuffer((gtpin::context_handle_t)ctxt, resource);
    EXPECT_EQ(GTPIN_DI_SUCCESS, status);
 }
 // Submits a kernel with GT-Pin initialized while shared allocations are in use and checks that
 // (1) the GT-Pin surface state is programmed for the GPU allocation of the shared buffer and
 // (2) gtpinNotifyMakeResident makes that GPU allocation resident.
 // Every pointer that is dereferenced later is guarded by a fatal ASSERT so a failed step cannot crash the run.
 HWTEST_F(GTPinTestsWithLocalMemory, givenGtPinCanUseSharedAllocationWhenGtPinBufferIsAllocatedInSharedMemoryThenSetSurfaceStateForTheBufferAndMakeItResident) {
    GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(pDevice->getHardwareInfo().platform.eRenderCoreFamily);
    if (!gtpinHelper.canUseSharedAllocation(pDevice->getHardwareInfo())) {
        GTEST_SKIP();
    }
    gtpinCallbacks.onContextCreate = OnContextCreate;
    gtpinCallbacks.onContextDestroy = OnContextDestroy;
    gtpinCallbacks.onKernelCreate = OnKernelCreate;
    gtpinCallbacks.onKernelSubmit = OnKernelSubmit;
    gtpinCallbacks.onCommandBufferCreate = OnCommandBufferCreate;
    gtpinCallbacks.onCommandBufferComplete = OnCommandBufferComplete;
    GTPIN_DI_STATUS status = GTPin_Init(&gtpinCallbacks, &driverServices, nullptr);
    EXPECT_EQ(GTPIN_DI_SUCCESS, status);
    cl_kernel kernel = nullptr;
    cl_program pProgram = nullptr;
    cl_device_id device = (cl_device_id)pDevice;
    size_t sourceSize = 0;
    std::string testFile;
    cl_command_queue cmdQ = nullptr;
    cl_queue_properties properties = 0;
    cl_context context = nullptr;
    KernelBinaryHelper kbHelper("CopyBuffer_simd16", false);
    testFile.append(clFiles);
    testFile.append("CopyBuffer_simd16.cl");
    auto pSource = loadDataFromFile(testFile.c_str(), sourceSize);
    EXPECT_NE(0u, sourceSize);
    ASSERT_NE(nullptr, pSource);
    context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &retVal);
    EXPECT_EQ(CL_SUCCESS, retVal);
    ASSERT_NE(nullptr, context);
    cmdQ = clCreateCommandQueue(context, device, properties, &retVal);
    ASSERT_NE(nullptr, cmdQ);
    EXPECT_EQ(CL_SUCCESS, retVal);
    const char *sources[1] = {pSource.get()};
    pProgram = clCreateProgramWithSource(
        context,
        1,
        sources,
        &sourceSize,
        &retVal);
    ASSERT_NE(nullptr, pProgram);
    retVal = clBuildProgram(
        pProgram,
        1,
        &device,
        nullptr,
        nullptr,
        nullptr);
    EXPECT_EQ(CL_SUCCESS, retVal);
    kernel = clCreateKernel(pProgram, "CopyBuffer", &retVal);
    ASSERT_NE(nullptr, kernel);
    EXPECT_EQ(CL_SUCCESS, retVal);
    auto pMultiDeviceKernel = static_cast<MultiDeviceKernel *>(kernel);
    auto pKernel = pMultiDeviceKernel->getKernel(rootDeviceIndex);
    ASSERT_NE(nullptr, pKernel);
    auto pCmdQueue = castToObject<CommandQueue>(cmdQ);
    ASSERT_NE(nullptr, pCmdQueue);
    auto &csr = pCmdQueue->getGpgpuCommandStreamReceiver();
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
    constexpr size_t renderSurfaceSize = sizeof(RENDER_SURFACE_STATE);
    // The GT-Pin surface occupies the last binding table entry of the kernel
    size_t gtpinBTI = pKernel->getNumberOfBindingTableStates() - 1;
    void *pSurfaceState = gtpinHelper.getSurfaceState(pKernel, gtpinBTI);
    ASSERT_NE(nullptr, pSurfaceState);
    RENDER_SURFACE_STATE *surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(pSurfaceState);
    // Start from a zeroed surface state so the comparison below only sees what the submit notification wrote
    memset(pSurfaceState, 0, renderSurfaceSize);
    gtpinNotifyKernelSubmit(kernel, pCmdQueue);
    // The submit notification must have queued an entry before kernelExecQueue[0] may be inspected
    ASSERT_FALSE(kernelExecQueue.empty());
    auto allocData = reinterpret_cast<SvmAllocationData *>(kernelExecQueue[0].gtpinResource);
    ASSERT_NE(nullptr, allocData);
    auto gpuAllocation = allocData->gpuAllocations.getGraphicsAllocation(rootDeviceIndex);
    ASSERT_NE(nullptr, gpuAllocation);
    RENDER_SURFACE_STATE expectedSurfaceState;
    memset(&expectedSurfaceState, 0, renderSurfaceSize);
    {
        // Build the reference surface state with the same call and arguments the callback uses
        void *addressToPatch = gpuAllocation->getUnderlyingBuffer();
        size_t sizeToPatch = gpuAllocation->getUnderlyingBufferSize();
        Buffer::setSurfaceState(&pDevice->getDevice(), &expectedSurfaceState, false, false,
                                sizeToPatch, addressToPatch, 0, gpuAllocation, 0, 0,
                                pKernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, pContext->getNumDevices());
    }
    EXPECT_EQ(0, memcmp(&expectedSurfaceState, surfaceState, renderSurfaceSize));
    EXPECT_FALSE(gpuAllocation->isResident(csr.getOsContext().getContextId()));
    gtpinNotifyMakeResident(pKernel, &csr);
    EXPECT_TRUE(gpuAllocation->isResident(csr.getOsContext().getContextId()));
    // Mark the entry as flushed so the completion notification can process it
    kernelExecQueue[0].isTaskCountValid = true;
    gtpinNotifyTaskCompletion(kernelExecQueue[0].taskCount);
    retVal = clReleaseKernel(kernel);
    EXPECT_EQ(CL_SUCCESS, retVal);
    retVal = clReleaseProgram(pProgram);
    EXPECT_EQ(CL_SUCCESS, retVal);
    retVal = clReleaseCommandQueue(cmdQ);
    EXPECT_EQ(CL_SUCCESS, retVal);
    retVal = clReleaseContext(context);
    EXPECT_EQ(CL_SUCCESS, retVal);
 }
 } // namespace ULT
--- a/opencl/test/unit_test/test_files/igdrcl.config
+++ b/opencl/test/unit_test/test_files/igdrcl.config
@ -228,4 +228,5 @@ OverrideSlmAllocationSize = -1
 OverrideSlmSize = -1
 UseCyclesPerSecondTimer = 0
 WaitLoopCount = -1
-DebuggerLogBitmask = 0
+DebuggerLogBitmask = 0
 GTPinAllocateBufferInSharedMemory = -1
--- a/shared/source/debug_settings/debug_variables_base.inl
+++ b/shared/source/debug_settings/debug_variables_base.inl
@ -216,6 +216,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, UseBindlessMode, -1, "Use precompiled builtins i
 DECLARE_DEBUG_VARIABLE(int32_t, OverrideSlmSize, -1, "Force different slm size than default in kB")
 DECLARE_DEBUG_VARIABLE(int32_t, UseCyclesPerSecondTimer, 0, "0: default behavior, 0: disabled: Report L0 timer in nanosecond units, 1: enabled: Report L0 timer in cycles per second")
 DECLARE_DEBUG_VARIABLE(int32_t, WaitLoopCount, -1, "-1: use default, >=0: number of iterations in wait loop")
 DECLARE_DEBUG_VARIABLE(int32_t, GTPinAllocateBufferInSharedMemory, -1, "Force GTPin to allocate buffer in shared memory")
 /*DRIVER TOGGLES*/
 DECLARE_DEBUG_VARIABLE(int32_t, ForceOCLVersion, 0, "Force specific OpenCL API version")