Add GTPin feature to allocate buffer in shared memory

Related-To: NEO-5667
Signed-off-by: Milczarek, Slawomir <slawomir.milczarek@intel.com>
2025-09-15 13:01:45 +08:00 · 2021-03-25 13:57:58 +00:00
parent 671d916c70
commit 255e85c124
7 changed files with 279 additions and 39 deletions
--- a/opencl/source/gtpin/gtpin_callbacks.cpp
+++ b/opencl/source/gtpin/gtpin_callbacks.cpp
@ -7,7 +7,7 @@

 #include "shared/source/command_stream/command_stream_receiver.h"
 #include "shared/source/memory_manager/surface.h"
-#include "shared/source/utilities/spinlock.h"
+#include "shared/source/memory_manager/unified_memory_manager.h"

 #include "opencl/source/cl_device/cl_device.h"
 #include "opencl/source/command_queue/command_queue.h"
@ -30,13 +30,15 @@ using namespace gtpin;

 namespace NEO {

+using GTPinLockType = std::recursive_mutex;
+
 extern gtpin::ocl::gtpin_events_t GTPinCallbacks;

 igc_init_t *pIgcInit = nullptr;
 std::atomic<int> sequenceCount(1);
 CommandQueue *pCmdQueueForFlushTask = nullptr;
 std::deque<gtpinkexec_t> kernelExecQueue;
-SpinLock kernelExecQueueLock;
+GTPinLockType kernelExecQueueLock;

 void gtpinNotifyContextCreate(cl_context context) {
    if (isGTPinInitialized) {
@ -131,7 +133,7 @@ void gtpinNotifyKernelSubmit(cl_kernel kernel, void *pCmdQueue) {
        kExec.gtpinResource = (cl_mem)resource;
        kExec.commandBuffer = commandBuffer;
        kExec.pCommandQueue = (CommandQueue *)pCmdQueue;
-        std::unique_lock<SpinLock> lock{kernelExecQueueLock};
+        std::unique_lock<GTPinLockType> lock{kernelExecQueueLock};
        kernelExecQueue.push_back(kExec);
        lock.unlock();
        // Patch SSH[gtpinBTI] with GT-Pin resource
@ -142,12 +144,21 @@ void gtpinNotifyKernelSubmit(cl_kernel kernel, void *pCmdQueue) {
        GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(genFamily);
        size_t gtpinBTI = pKernel->getNumberOfBindingTableStates() - 1;
        void *pSurfaceState = gtpinHelper.getSurfaceState(pKernel, gtpinBTI);
+        if (gtpinHelper.canUseSharedAllocation(device.getHardwareInfo())) {
+            auto allocData = reinterpret_cast<SvmAllocationData *>(resource);
+            auto gpuAllocation = allocData->gpuAllocations.getGraphicsAllocation(rootDeviceIndex);
+            size_t size = gpuAllocation->getUnderlyingBufferSize();
+            Buffer::setSurfaceState(&device, pSurfaceState, false, false, size, gpuAllocation->getUnderlyingBuffer(), 0, gpuAllocation, 0, 0,
+                                    pKernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, pContext->getNumDevices());
+            pKernel->setUnifiedMemoryExecInfo(gpuAllocation);
+        } else {
            cl_mem buffer = (cl_mem)resource;
            auto pBuffer = castToObjectOrAbort<Buffer>(buffer);
            pBuffer->setArgStateful(pSurfaceState, false, false, false, false, device,
                                    pKernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, pContext->getNumDevices());
        }
    }
+}

 void gtpinNotifyPreFlushTask(void *pCmdQueue) {
    if (isGTPinInitialized) {
@ -157,7 +168,7 @@ void gtpinNotifyPreFlushTask(void *pCmdQueue) {

 void gtpinNotifyFlushTask(uint32_t flushedTaskCount) {
    if (isGTPinInitialized) {
-        std::unique_lock<SpinLock> lock{kernelExecQueueLock};
+        std::unique_lock<GTPinLockType> lock{kernelExecQueueLock};
        size_t numElems = kernelExecQueue.size();
        for (size_t n = 0; n < numElems; n++) {
            if ((kernelExecQueue[n].pCommandQueue == pCmdQueueForFlushTask) && !kernelExecQueue[n].isTaskCountValid) {
@ -173,7 +184,7 @@ void gtpinNotifyFlushTask(uint32_t flushedTaskCount) {

 void gtpinNotifyTaskCompletion(uint32_t completedTaskCount) {
    if (isGTPinInitialized) {
-        std::unique_lock<SpinLock> lock{kernelExecQueueLock};
+        std::unique_lock<GTPinLockType> lock{kernelExecQueueLock};
        size_t numElems = kernelExecQueue.size();
        for (size_t n = 0; n < numElems;) {
            if (kernelExecQueue[n].isTaskCountValid && (kernelExecQueue[n].taskCount <= completedTaskCount)) {
@ -191,15 +202,23 @@ void gtpinNotifyTaskCompletion(uint32_t completedTaskCount) {

 void gtpinNotifyMakeResident(void *pKernel, void *pCSR) {
    if (isGTPinInitialized) {
-        std::unique_lock<SpinLock> lock{kernelExecQueueLock};
+        std::unique_lock<GTPinLockType> lock{kernelExecQueueLock};
+        Context &context = static_cast<Kernel *>(pKernel)->getContext();
+        GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(context.getDevice(0)->getHardwareInfo().platform.eRenderCoreFamily);
        size_t numElems = kernelExecQueue.size();
        for (size_t n = 0; n < numElems; n++) {
            if ((kernelExecQueue[n].pKernel == pKernel) && !kernelExecQueue[n].isResourceResident && kernelExecQueue[n].gtpinResource) {
                // It's time for kernel to make resident its GT-Pin resource
                CommandStreamReceiver *pCommandStreamReceiver = reinterpret_cast<CommandStreamReceiver *>(pCSR);
+                GraphicsAllocation *pGfxAlloc = nullptr;
+                if (gtpinHelper.canUseSharedAllocation(context.getDevice(0)->getHardwareInfo())) {
+                    auto allocData = reinterpret_cast<SvmAllocationData *>(kernelExecQueue[n].gtpinResource);
+                    pGfxAlloc = allocData->gpuAllocations.getGraphicsAllocation(pCommandStreamReceiver->getRootDeviceIndex());
+                } else {
                    cl_mem gtpinBuffer = kernelExecQueue[n].gtpinResource;
                    auto pBuffer = castToObjectOrAbort<Buffer>(gtpinBuffer);
-                GraphicsAllocation *pGfxAlloc = pBuffer->getGraphicsAllocation(pCommandStreamReceiver->getRootDeviceIndex());
+                    pGfxAlloc = pBuffer->getGraphicsAllocation(pCommandStreamReceiver->getRootDeviceIndex());
+                }
                pCommandStreamReceiver->makeResident(*pGfxAlloc);
                kernelExecQueue[n].isResourceResident = true;
                break;
@ -210,7 +229,7 @@ void gtpinNotifyMakeResident(void *pKernel, void *pCSR) {

 void gtpinNotifyUpdateResidencyList(void *pKernel, void *pResVec) {
    if (isGTPinInitialized) {
-        std::unique_lock<SpinLock> lock{kernelExecQueueLock};
+        std::unique_lock<GTPinLockType> lock{kernelExecQueueLock};
        size_t numElems = kernelExecQueue.size();
        for (size_t n = 0; n < numElems; n++) {
            if ((kernelExecQueue[n].pKernel == pKernel) && !kernelExecQueue[n].isResourceResident && kernelExecQueue[n].gtpinResource) {
--- a/opencl/source/gtpin/gtpin_helpers.cpp
+++ b/opencl/source/gtpin/gtpin_helpers.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017-2020 Intel Corporation
+ * Copyright (C) 2017-2021 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -8,8 +8,12 @@
 #include "gtpin_helpers.h"

 #include "shared/source/memory_manager/memory_manager.h"
+#include "shared/source/memory_manager/unified_memory_manager.h"

+#include "opencl/source/api/api.h"
+#include "opencl/source/cl_device/cl_device.h"
 #include "opencl/source/context/context.h"
+#include "opencl/source/gtpin/gtpin_hw_helper.h"
 #include "opencl/source/helpers/validators.h"
 #include "opencl/source/mem_obj/buffer.h"

@ -27,27 +31,39 @@ GTPIN_DI_STATUS GTPIN_DRIVER_CALLCONV gtpinCreateBuffer(context_handle_t context
        return GTPIN_DI_ERROR_INVALID_ARGUMENT;
    }
    size_t size = alignUp(reqSize, MemoryConstants::cacheLineSize);
+    GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(pContext->getDevice(0)->getHardwareInfo().platform.eRenderCoreFamily);
+    if (gtpinHelper.canUseSharedAllocation(pContext->getDevice(0)->getHardwareInfo())) {
+        void *unfiedMemorySharedAllocation = clSharedMemAllocINTEL(pContext, pContext->getDevice(0), 0, size, 0, &diag);
+        auto allocationsManager = pContext->getSVMAllocsManager();
+        auto graphicsAllocation = allocationsManager->getSVMAlloc(unfiedMemorySharedAllocation);
+        *pResource = (resource_handle_t)graphicsAllocation;
+    } else {
        void *hostPtr = pContext->getMemoryManager()->allocateSystemMemory(size, MemoryConstants::pageSize);
        if (hostPtr == nullptr) {
            return GTPIN_DI_ERROR_ALLOCATION_FAILED;
        }
        cl_mem buffer = Buffer::create(pContext, CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE | CL_MEM_FORCE_HOST_MEMORY_INTEL, size, hostPtr, diag);
        *pResource = (resource_handle_t)buffer;
+    }
    return GTPIN_DI_SUCCESS;
 }

 GTPIN_DI_STATUS GTPIN_DRIVER_CALLCONV gtpinFreeBuffer(context_handle_t context, resource_handle_t resource) {
-    cl_mem buffer = (cl_mem)resource;
+    // Releases a GT-Pin resource that was handed out by gtpinCreateBuffer.
+    // Returns GTPIN_DI_ERROR_INVALID_ARGUMENT when the context or the resource handle is
+    // null/invalid, GTPIN_DI_SUCCESS otherwise.
    Context *pContext = castToObject<Context>((cl_context)context);
-    if ((pContext == nullptr) || (buffer == nullptr)) {
+    if ((pContext == nullptr) || (resource == nullptr)) {
        return GTPIN_DI_ERROR_INVALID_ARGUMENT;
    }
-    auto pMemObj = castToObject<MemObj>(buffer);
+    // gtpinCreateBuffer stores an SvmAllocationData * in the handle only when
+    // canUseSharedAllocation() is true and a cl_mem otherwise, so the release path has to be
+    // chosen with the very same predicate. Keying it on local-memory support instead would
+    // reinterpret a cl_mem as SvmAllocationData whenever local memory is available but the
+    // shared-allocation path is not in use.
+    GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(pContext->getDevice(0)->getHardwareInfo().platform.eRenderCoreFamily);
+    if (gtpinHelper.canUseSharedAllocation(pContext->getDevice(0)->getHardwareInfo())) {
+        auto allocData = reinterpret_cast<SvmAllocationData *>(resource);
+        clMemFreeINTEL(pContext, allocData->cpuAllocation->getUnderlyingBuffer());
+    } else {
+        auto pMemObj = castToObject<MemObj>(resource);
        if (pMemObj == nullptr) {
            return GTPIN_DI_ERROR_INVALID_ARGUMENT;
        }
        alignedFree(pMemObj->getHostPtr());
        pMemObj->release();
+    }
    return GTPIN_DI_SUCCESS;
 }

@ -57,24 +73,32 @@ GTPIN_DI_STATUS GTPIN_DRIVER_CALLCONV gtpinMapBuffer(context_handle_t context, r
    if ((pContext == nullptr) || (buffer == nullptr) || (pAddress == nullptr)) {
        return GTPIN_DI_ERROR_INVALID_ARGUMENT;
    }
+    GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(pContext->getDevice(0)->getHardwareInfo().platform.eRenderCoreFamily);
+    if (gtpinHelper.canUseSharedAllocation(pContext->getDevice(0)->getHardwareInfo())) {
+        auto allocData = reinterpret_cast<SvmAllocationData *>(resource);
+        *pAddress = reinterpret_cast<uint8_t *>(allocData->cpuAllocation->getUnderlyingBuffer());
+    } else {
        auto pMemObj = castToObject<MemObj>(buffer);
        if (pMemObj == nullptr) {
            return GTPIN_DI_ERROR_INVALID_ARGUMENT;
        }
-    *pAddress = (uint8_t *)pMemObj->getHostPtr();
+        *pAddress = reinterpret_cast<uint8_t *>(pMemObj->getHostPtr());
+    }
    return GTPIN_DI_SUCCESS;
 }

 GTPIN_DI_STATUS GTPIN_DRIVER_CALLCONV gtpinUnmapBuffer(context_handle_t context, resource_handle_t resource) {
-    cl_mem buffer = (cl_mem)resource;
    Context *pContext = castToObject<Context>((cl_context)context);
-    if ((pContext == nullptr) || (buffer == nullptr)) {
+    if ((pContext == nullptr) || (resource == nullptr)) {
        return GTPIN_DI_ERROR_INVALID_ARGUMENT;
    }
-    auto pMemObj = castToObject<MemObj>(buffer);
+    // No unmapping work is performed here; the call only validates its arguments.
+    // The handle is checked as a MemObj only when the shared-allocation path is not in use.
+    const auto &hwInfo = pContext->getDevice(0)->getHardwareInfo();
+    GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(hwInfo.platform.eRenderCoreFamily);
+    const bool resourceIsMemObj = !gtpinHelper.canUseSharedAllocation(hwInfo);
+    if (resourceIsMemObj) {
+        auto pMemObj = castToObject<MemObj>(resource);
        if (pMemObj == nullptr) {
            return GTPIN_DI_ERROR_INVALID_ARGUMENT;
        }
+    }
    return GTPIN_DI_SUCCESS;
 }
 } // namespace NEO
--- a/opencl/source/gtpin/gtpin_hw_helper.h
+++ b/opencl/source/gtpin/gtpin_hw_helper.h
@ -17,6 +17,7 @@ class GTPinHwHelper {
    virtual uint32_t getGenVersion() = 0;
    virtual bool addSurfaceState(Kernel *pKernel) = 0;
    virtual void *getSurfaceState(Kernel *pKernel, size_t bti) = 0;
+    virtual bool canUseSharedAllocation(const HardwareInfo &hwInfo) const = 0;

  protected:
    GTPinHwHelper(){};
@ -32,8 +33,9 @@ class GTPinHwHelperHw : public GTPinHwHelper {
    uint32_t getGenVersion() override;
    bool addSurfaceState(Kernel *pKernel) override;
    void *getSurfaceState(Kernel *pKernel, size_t bti) override;
+    bool canUseSharedAllocation(const HardwareInfo &hwInfo) const override;

-  private:
+  protected:
    GTPinHwHelperHw(){};
 };
 } // namespace NEO
--- a/opencl/source/gtpin/gtpin_hw_helper.inl
+++ b/opencl/source/gtpin/gtpin_hw_helper.inl
@ -56,4 +56,14 @@ void *GTPinHwHelperHw<GfxFamily>::getSurfaceState(Kernel *pKernel, size_t bti) {
    return pSurfaceState;
 }

+template <typename GfxFamily>
+bool GTPinHwHelperHw<GfxFamily>::canUseSharedAllocation(const HardwareInfo &hwInfo) const {
+    // Reports whether GT-Pin buffers may be placed in shared memory for the given platform.
+    // A flag value of -1 means "no override", which leaves the feature off; any other value is
+    // interpreted as a boolean switch.
+    const int32_t debugOverride = DebugManager.flags.GTPinAllocateBufferInSharedMemory.get();
+    bool sharedAllocationAllowed = (debugOverride != -1) && (debugOverride != 0);
+    // Regardless of the flag, the platform must advertise SVM support.
+    sharedAllocationAllowed &= hwInfo.capabilityTable.ftrSvm;
+    return sharedAllocationAllowed;
+}
+
 } // namespace NEO
--- a/opencl/test/unit_test/gtpin/gtpin_tests.cpp
+++ b/opencl/test/unit_test/gtpin/gtpin_tests.cpp
@ -11,7 +11,9 @@
 #include "shared/source/helpers/file_io.h"
 #include "shared/source/helpers/hash.h"
 #include "shared/source/memory_manager/surface.h"
+#include "shared/source/memory_manager/unified_memory_manager.h"
 #include "shared/source/os_interface/os_context.h"
+#include "shared/test/common/helpers/debug_manager_state_restore.h"
 #include "shared/test/common/helpers/test_files.h"
 #include "shared/test/common/helpers/variable_backup.h"
 #include "shared/test/common/mocks/mock_device.h"
@ -52,7 +54,8 @@ using namespace gtpin;

 namespace NEO {
 extern std::deque<gtpinkexec_t> kernelExecQueue;
-}
+extern GTPinHwHelper *gtpinHwHelperFactory[IGFX_MAX_CORE];
+} // namespace NEO

 namespace ULT {

@ -149,6 +152,11 @@ class GTPinFixture : public ContextFixture, public MemoryManagementFixture {

  public:
    void SetUp() override {
+        DebugManager.flags.GTPinAllocateBufferInSharedMemory.set(false);
+        SetUpImpl();
+    }
+
+    void SetUpImpl() {
        platformsImpl->clear();
        MemoryManagementFixture::SetUp();
        constructPlatform();
@ -195,6 +203,7 @@ class GTPinFixture : public ContextFixture, public MemoryManagementFixture {
    gtpin::ocl::gtpin_events_t gtpinCallbacks;
    MockMemoryManagerWithFailures *memoryManager = nullptr;
    uint32_t rootDeviceIndex = std::numeric_limits<uint32_t>::max();
+    DebugManagerStateRestore restore;
 };

 typedef Test<GTPinFixture> GTPinTests;
@ -2495,4 +2504,178 @@ HWTEST_F(GTPinTests, givenGtPinInitializedWhenSubmittingKernelCommandThenFlushed
    EXPECT_EQ(kernelExecQueue[0].taskCount, stamp.taskCount);
 }

+class GTPinFixtureWithLocalMemory : public GTPinFixture {
+  public:
+    // NOTE(review): presumably snapshots the debug flags and restores them on destruction - confirm
+    // against DebugManagerStateRestore.
+    DebugManagerStateRestore restore;
+
+    // Requests shared-memory GT-Pin buffers with local memory enabled, then runs the common
+    // setup. SetUpImpl() is called rather than GTPinFixture::SetUp() because the latter sets
+    // GTPinAllocateBufferInSharedMemory to false before delegating.
+    void SetUp() override {
+        DebugManager.flags.GTPinAllocateBufferInSharedMemory.set(true);
+        DebugManager.flags.EnableLocalMemory.set(true);
+        GTPinFixture::SetUpImpl();
+    }
+
+    // Forwards to the base fixture's teardown.
+    void TearDown() override {
+        GTPinFixture::TearDown();
+    }
+};
+
+using GTPinTestsWithLocalMemory = Test<GTPinFixtureWithLocalMemory>;
+
+TEST_F(GTPinTestsWithLocalMemory, whenPlatformHasNoSvmSupportThenGtPinBufferCantBeAllocatedInSharedMemory) {
+    // Return the debug flag to its "no override" value before querying the helper.
+    DebugManager.flags.GTPinAllocateBufferInSharedMemory.set(-1);
+    const auto &hwInfo = pDevice->getHardwareInfo();
+    GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(hwInfo.platform.eRenderCoreFamily);
+    const bool sharedAllocationAllowed = gtpinHelper.canUseSharedAllocation(hwInfo);
+    // Only platforms without SVM are checked; nothing is asserted when SVM is available.
+    if (hwInfo.capabilityTable.ftrSvm) {
+        return;
+    }
+    EXPECT_FALSE(sharedAllocationAllowed);
+}
+
+HWTEST_F(GTPinTestsWithLocalMemory, givenGtPinCanUseSharedAllocationWhenGtPinBufferIsCreatedThenAllocateBufferInSharedMemory) {
+    // Walks a GT-Pin buffer through create -> map -> unmap -> free on the shared-allocation
+    // path and checks that the handle is backed by both a CPU and a GPU allocation.
+    GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(pDevice->getHardwareInfo().platform.eRenderCoreFamily);
+    if (!gtpinHelper.canUseSharedAllocation(pDevice->getHardwareInfo())) {
+        GTEST_SKIP();
+    }
+
+    resource_handle_t resource = nullptr;
+    cl_context ctxt = (cl_context)((Context *)pContext);
+    GTPIN_DI_STATUS status = GTPIN_DI_SUCCESS;
+
+    // The handle is dereferenced right below, so a failed creation must stop the test here
+    // instead of continuing into a null dereference.
+    status = gtpinCreateBuffer((gtpin::context_handle_t)ctxt, 256, &resource);
+    ASSERT_EQ(GTPIN_DI_SUCCESS, status);
+    ASSERT_NE(nullptr, resource);
+
+    auto allocData = reinterpret_cast<SvmAllocationData *>(resource);
+
+    auto cpuAllocation = allocData->cpuAllocation;
+    ASSERT_NE(nullptr, cpuAllocation);
+    EXPECT_NE(GraphicsAllocation::AllocationType::UNIFIED_SHARED_MEMORY, cpuAllocation->getAllocationType());
+
+    auto gpuAllocation = allocData->gpuAllocations.getGraphicsAllocation(pDevice->getRootDeviceIndex());
+    ASSERT_NE(nullptr, gpuAllocation);
+    EXPECT_NE(GraphicsAllocation::AllocationType::UNIFIED_SHARED_MEMORY, gpuAllocation->getAllocationType());
+
+    // Mapping must expose the CPU-side storage of the shared allocation.
+    uint8_t *address = nullptr;
+    status = gtpinMapBuffer((gtpin::context_handle_t)ctxt, resource, &address);
+    EXPECT_EQ(GTPIN_DI_SUCCESS, status);
+    EXPECT_EQ(allocData->cpuAllocation->getUnderlyingBuffer(), address);
+
+    status = gtpinUnmapBuffer((gtpin::context_handle_t)ctxt, resource);
+    EXPECT_EQ(GTPIN_DI_SUCCESS, status);
+
+    status = gtpinFreeBuffer((gtpin::context_handle_t)ctxt, resource);
+    EXPECT_EQ(GTPIN_DI_SUCCESS, status);
+}
+
+HWTEST_F(GTPinTestsWithLocalMemory, givenGtPinCanUseSharedAllocationWhenGtPinBufferIsAllocatedInSharedMemoryThenSetSurfaceStateForTheBufferAndMakeItResident) {
+    // Checks that gtpinNotifyKernelSubmit programs the GT-Pin surface state from the GPU storage
+    // of the shared allocation and that gtpinNotifyMakeResident makes that storage resident.
+    // Every pointer that is dereferenced later is guarded with ASSERT_* so that a broken
+    // precondition fails this test instead of crashing the whole test binary.
+    GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(pDevice->getHardwareInfo().platform.eRenderCoreFamily);
+    if (!gtpinHelper.canUseSharedAllocation(pDevice->getHardwareInfo())) {
+        GTEST_SKIP();
+    }
+
+    gtpinCallbacks.onContextCreate = OnContextCreate;
+    gtpinCallbacks.onContextDestroy = OnContextDestroy;
+    gtpinCallbacks.onKernelCreate = OnKernelCreate;
+    gtpinCallbacks.onKernelSubmit = OnKernelSubmit;
+    gtpinCallbacks.onCommandBufferCreate = OnCommandBufferCreate;
+    gtpinCallbacks.onCommandBufferComplete = OnCommandBufferComplete;
+
+    GTPIN_DI_STATUS status = GTPin_Init(&gtpinCallbacks, &driverServices, nullptr);
+    EXPECT_EQ(GTPIN_DI_SUCCESS, status);
+
+    cl_kernel kernel = nullptr;
+    cl_program pProgram = nullptr;
+    cl_device_id device = (cl_device_id)pDevice;
+    size_t sourceSize = 0;
+    std::string testFile;
+    cl_command_queue cmdQ = nullptr;
+    cl_queue_properties properties = 0;
+    cl_context context = nullptr;
+
+    KernelBinaryHelper kbHelper("CopyBuffer_simd16", false);
+    testFile.append(clFiles);
+    testFile.append("CopyBuffer_simd16.cl");
+    auto pSource = loadDataFromFile(testFile.c_str(), sourceSize);
+    EXPECT_NE(0u, sourceSize);
+    ASSERT_NE(nullptr, pSource);
+
+    context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &retVal);
+    EXPECT_EQ(CL_SUCCESS, retVal);
+    ASSERT_NE(nullptr, context);
+
+    cmdQ = clCreateCommandQueue(context, device, properties, &retVal);
+    ASSERT_NE(nullptr, cmdQ);
+    EXPECT_EQ(CL_SUCCESS, retVal);
+
+    const char *sources[1] = {pSource.get()};
+    pProgram = clCreateProgramWithSource(
+        context,
+        1,
+        sources,
+        &sourceSize,
+        &retVal);
+    ASSERT_NE(nullptr, pProgram);
+
+    retVal = clBuildProgram(
+        pProgram,
+        1,
+        &device,
+        nullptr,
+        nullptr,
+        nullptr);
+    EXPECT_EQ(CL_SUCCESS, retVal);
+
+    // The kernel handle is cast and dereferenced below, so it must be valid to continue.
+    kernel = clCreateKernel(pProgram, "CopyBuffer", &retVal);
+    ASSERT_NE(nullptr, kernel);
+    EXPECT_EQ(CL_SUCCESS, retVal);
+
+    auto pMultiDeviceKernel = static_cast<MultiDeviceKernel *>(kernel);
+    auto pKernel = pMultiDeviceKernel->getKernel(rootDeviceIndex);
+    auto pCmdQueue = castToObject<CommandQueue>(cmdQ);
+    auto &csr = pCmdQueue->getGpgpuCommandStreamReceiver();
+
+    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
+    constexpr size_t renderSurfaceSize = sizeof(RENDER_SURFACE_STATE);
+
+    // The GT-Pin surface occupies the last binding table entry of the kernel.
+    size_t gtpinBTI = pKernel->getNumberOfBindingTableStates() - 1;
+    void *pSurfaceState = gtpinHelper.getSurfaceState(pKernel, gtpinBTI);
+    ASSERT_NE(nullptr, pSurfaceState);
+
+    // Clear the slot so the comparison below only sees what gtpinNotifyKernelSubmit wrote.
+    RENDER_SURFACE_STATE *surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(pSurfaceState);
+    memset(pSurfaceState, 0, renderSurfaceSize);
+
+    gtpinNotifyKernelSubmit(kernel, pCmdQueue);
+
+    // The submit notification is expected to have queued an entry; indexing an empty deque
+    // would be undefined behavior.
+    ASSERT_FALSE(kernelExecQueue.empty());
+    auto allocData = reinterpret_cast<SvmAllocationData *>(kernelExecQueue[0].gtpinResource);
+    ASSERT_NE(nullptr, allocData);
+    auto gpuAllocation = allocData->gpuAllocations.getGraphicsAllocation(rootDeviceIndex);
+    ASSERT_NE(nullptr, gpuAllocation);
+
+    // Build the reference surface state with the same inputs and compare it byte for byte.
+    RENDER_SURFACE_STATE expectedSurfaceState;
+    memset(&expectedSurfaceState, 0, renderSurfaceSize);
+    {
+        void *addressToPatch = gpuAllocation->getUnderlyingBuffer();
+        size_t sizeToPatch = gpuAllocation->getUnderlyingBufferSize();
+        Buffer::setSurfaceState(&pDevice->getDevice(), &expectedSurfaceState, false, false,
+                                sizeToPatch, addressToPatch, 0, gpuAllocation, 0, 0,
+                                pKernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, pContext->getNumDevices());
+    }
+    EXPECT_EQ(0, memcmp(&expectedSurfaceState, surfaceState, renderSurfaceSize));
+
+    EXPECT_FALSE(gpuAllocation->isResident(csr.getOsContext().getContextId()));
+    gtpinNotifyMakeResident(pKernel, &csr);
+    EXPECT_TRUE(gpuAllocation->isResident(csr.getOsContext().getContextId()));
+
+    // Mark the queued entry as having a valid task count and report it completed.
+    kernelExecQueue[0].isTaskCountValid = true;
+    gtpinNotifyTaskCompletion(kernelExecQueue[0].taskCount);
+
+    retVal = clReleaseKernel(kernel);
+    EXPECT_EQ(CL_SUCCESS, retVal);
+
+    retVal = clReleaseProgram(pProgram);
+    EXPECT_EQ(CL_SUCCESS, retVal);
+
+    retVal = clReleaseCommandQueue(cmdQ);
+    EXPECT_EQ(CL_SUCCESS, retVal);
+
+    retVal = clReleaseContext(context);
+    EXPECT_EQ(CL_SUCCESS, retVal);
+}
 } // namespace ULT
--- a/opencl/test/unit_test/test_files/igdrcl.config
+++ b/opencl/test/unit_test/test_files/igdrcl.config
@ -229,3 +229,4 @@ OverrideSlmSize = -1
 UseCyclesPerSecondTimer = 0
 WaitLoopCount = -1
 DebuggerLogBitmask = 0
+GTPinAllocateBufferInSharedMemory = -1
--- a/shared/source/debug_settings/debug_variables_base.inl
+++ b/shared/source/debug_settings/debug_variables_base.inl
@ -216,6 +216,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, UseBindlessMode, -1, "Use precompiled builtins i
 DECLARE_DEBUG_VARIABLE(int32_t, OverrideSlmSize, -1, "Force different slm size than default in kB")
 DECLARE_DEBUG_VARIABLE(int32_t, UseCyclesPerSecondTimer, 0, "0: default behavior, 0: disabled: Report L0 timer in nanosecond units, 1: enabled: Report L0 timer in cycles per second")
 DECLARE_DEBUG_VARIABLE(int32_t, WaitLoopCount, -1, "-1: use default, >=0: number of iterations in wait loop")
+DECLARE_DEBUG_VARIABLE(int32_t, GTPinAllocateBufferInSharedMemory, -1, "Force GTPin to allocate buffer in shared memory")

 /*DRIVER TOGGLES*/
 DECLARE_DEBUG_VARIABLE(int32_t, ForceOCLVersion, 0, "Force specific OpenCL API version")