Add support for GT-Pin Callbacks [3/n]

Change-Id: Iea4b49efc9a666fde310ece15a9c69686d22f627
2025-12-22 10:17:01 +08:00 · 2018-02-08 16:00:20 +01:00
parent 3e9a43fcb3
commit 5909a6b3d3
33 changed files with 1993 additions and 44 deletions
--- a/2
+++ b/2
@@ -2,4 +2,4 @@
 neoDependenciesRev='735095-769'
 strategy='EQUAL'
 allowedF=43
-allowedCD=341
+allowedCD=340
--- a/runtime/api/api.cpp
+++ b/runtime/api/api.cpp
@@ -359,7 +359,6 @@ cl_int CL_API_CALL clReleaseContext(cl_context context) {
    Context *pContext = castToObject<Context>(context);
    if (pContext) {
        pContext->release();
        gtpinNotifyContextDestroy(context);
        return CL_SUCCESS;
    }
@@ -1294,7 +1293,6 @@ cl_kernel CL_API_CALL clCreateKernel(cl_program clProgram,
    Program *pProgram = nullptr;
    cl_kernel kernel = nullptr;
    cl_int retVal = CL_SUCCESS;
    DBG_LOG_INPUTS("clProgram", clProgram, "kernelName", kernelName);
    do {
@@ -1350,6 +1348,9 @@ cl_int CL_API_CALL clCreateKernelsInProgram(cl_program clProgram,
                    program,
                    *kernelInfo,
                    nullptr);
                if (kernels[ordinal] != nullptr) {
                    gtpinNotifyKernelCreate(kernels[ordinal]);
                }
            }
        }
@@ -2559,6 +2560,11 @@ cl_int CL_API_CALL clEnqueueNDRangeKernel(cl_command_queue commandQueue,
        return retVal;
    }
    auto pKernel = castToObjectOrAbort<Kernel>(kernel);
    TakeOwnershipWrapper<Kernel> kernelOwnership(*pKernel, gtpinIsGTPinInitialized());
    gtpinNotifyKernelSubmit(kernel, pCommandQueue);
    retVal = pCommandQueue->enqueueKernel(
        kernel,
        workDim,
@@ -3767,6 +3773,9 @@ cl_kernel CL_API_CALL clCloneKernel(cl_kernel sourceKernel,
    if (errcodeRet) {
        *errcodeRet = retVal;
    }
    if (pClonedKernel != nullptr) {
        gtpinNotifyKernelCreate(pClonedKernel);
    }
    return pClonedKernel;
 }
--- a/runtime/command_queue/command_queue.cpp
+++ b/runtime/command_queue/command_queue.cpp
@@ -28,6 +28,7 @@
 #include "runtime/device_queue/device_queue.h"
 #include "runtime/event/event.h"
 #include "runtime/event/event_builder.h"
 #include "runtime/gtpin/gtpin_notify.h"
 #include "runtime/helpers/aligned_memory.h"
 #include "runtime/helpers/array_count.h"
 #include "runtime/helpers/get_info.h"
--- a/runtime/command_queue/enqueue_common.h
+++ b/runtime/command_queue/enqueue_common.h
@@ -27,6 +27,7 @@
 #include "runtime/command_queue/dispatch_walker.h"
 #include "runtime/command_stream/command_stream_receiver.h"
 #include "runtime/event/event_builder.h"
 #include "runtime/gtpin/gtpin_notify.h"
 #include "runtime/helpers/kernel_commands.h"
 #include "runtime/helpers/dispatch_info_builder.h"
 #include "runtime/mem_obj/buffer.h"
@@ -549,6 +550,8 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(
    DEBUG_BREAK_IF(taskLevel >= Event::eventNotReady);
    gtpinNotifyPreFlushTask(this);
    CompletionStamp completionStamp = commandStreamReceiver.flushTask(
        commandStream,
        commandStreamStart,
--- a/runtime/command_stream/command_stream_receiver.cpp
+++ b/runtime/command_stream/command_stream_receiver.cpp
@@ -24,6 +24,7 @@
 #include "runtime/command_stream/command_stream_receiver.h"
 #include "runtime/command_stream/preemption.h"
 #include "runtime/device/device.h"
 #include "runtime/gtpin/gtpin_notify.h"
 #include "runtime/memory_manager/memory_manager.h"
 #include "runtime/helpers/cache_policy.h"
 #include "runtime/os_interface/os_interface.h"
@@ -192,6 +193,7 @@ bool CommandStreamReceiver::waitForCompletionWithTimeout(bool enableTimeout, int
        }
    }
    if (*getTagAddress() >= taskCountToWait) {
        gtpinNotifyTaskCompletion(taskCountToWait);
        return true;
    }
    return false;
--- a/runtime/command_stream/command_stream_receiver_hw.inl
+++ b/runtime/command_stream/command_stream_receiver_hw.inl
@@ -23,6 +23,7 @@
 #include "runtime/command_stream/command_stream_receiver_hw.h"
 #include "runtime/command_stream/linear_stream.h"
 #include "runtime/device/device.h"
 #include "runtime/gtpin/gtpin_notify.h"
 #include "runtime/helpers/cache_policy.h"
 #include "runtime/helpers/preamble.h"
 #include "runtime/helpers/ptr_math.h"
@@ -377,6 +378,9 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
        engineType};
    this->taskLevel += levelClosed ? 1 : 0;
    gtpinNotifyFlushTask(completionStamp.taskCount);
    return completionStamp;
 }
--- a/runtime/context/context.cpp
+++ b/runtime/context/context.cpp
@@ -25,6 +25,7 @@
 #include "runtime/device/device.h"
 #include "runtime/device_queue/device_queue.h"
 #include "runtime/mem_obj/image.h"
 #include "runtime/gtpin/gtpin_notify.h"
 #include "runtime/helpers/get_info.h"
 #include "runtime/helpers/ptr_math.h"
 #include "runtime/platform/platform.h"
@@ -71,6 +72,7 @@ Context::~Context() {
    if (memoryManager && memoryManager->isAsyncDeleterEnabled()) {
        memoryManager->getDeferredDeleter()->removeClient();
    }
    gtpinNotifyContextDestroy((cl_context)this);
 }
 DeviceQueue *Context::getDefaultDeviceQueue() {
--- a/runtime/gen8/gtpin_setup_gen8.cpp
+++ b/runtime/gen8/gtpin_setup_gen8.cpp
@@ -22,6 +22,7 @@
 #include "gtpin_ocl_interface.h"
 #include "runtime/gtpin/gtpin_hw_helper.h"
 #include "runtime/gtpin/gtpin_hw_helper.inl"
 namespace OCLRT {
--- a/runtime/gen9/gtpin_setup_gen9.cpp
+++ b/runtime/gen9/gtpin_setup_gen9.cpp
@@ -22,6 +22,7 @@
 #include "gtpin_ocl_interface.h"
 #include "runtime/gtpin/gtpin_hw_helper.h"
 #include "runtime/gtpin/gtpin_hw_helper.inl"
 namespace OCLRT {
--- a/runtime/gtpin/CMakeLists.txt
+++ b/runtime/gtpin/CMakeLists.txt
@@ -26,9 +26,11 @@ if(GTPIN_HEADERS_DIR)
    ${CMAKE_CURRENT_SOURCE_DIR}/gtpin_helpers.h
    ${CMAKE_CURRENT_SOURCE_DIR}/gtpin_hw_helper.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/gtpin_hw_helper.h
    ${CMAKE_CURRENT_SOURCE_DIR}/gtpin_hw_helper.inl
    ${CMAKE_CURRENT_SOURCE_DIR}/gtpin_init.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/gtpin_init.h
    ${CMAKE_CURRENT_SOURCE_DIR}/gtpin_notify.h
    ${CMAKE_CURRENT_SOURCE_DIR}/gtpin_defs.h
    PARENT_SCOPE
  )
 else()
--- a/runtime/gtpin/gtpin_callback_stubs.cpp
+++ b/runtime/gtpin/gtpin_callback_stubs.cpp
@@ -32,4 +32,29 @@ void gtpinNotifyContextDestroy(cl_context context) {
 void gtpinNotifyKernelCreate(cl_kernel kernel) {
 }
 void gtpinNotifyKernelSubmit(cl_kernel kernel, void *pCmdQueue) {
 }
 void gtpinNotifyPreFlushTask(void *pCmdQueue) {
 }
 void gtpinNotifyFlushTask(uint32_t flushedTaskCount) {
 }
 void gtpinNotifyTaskCompletion(uint32_t completedTaskCount) {
 }
 void gtpinNotifyMakeResident(void *pKernel, void *pCommandStreamReceiver) {
 }
 void gtpinNotifyUpdateResidencyList(void *pKernel, void *pResidencyVector) {
 }
 void gtpinNotifyPlatformShutdown() {
 }
 bool gtpinIsGTPinInitialized() {
    return false;
 }
 }
--- a/runtime/gtpin/gtpin_callbacks.cpp
+++ b/runtime/gtpin/gtpin_callbacks.cpp
@@ -23,20 +23,33 @@
 #include "config.h"
 #include "gtpin_ocl_interface.h"
 #include "CL/cl.h"
 #include "runtime/command_queue/command_queue.h"
 #include "runtime/command_stream/command_stream_receiver.h"
 #include "runtime/context/context.h"
 #include "runtime/device/device.h"
 #include "runtime/device/device_info.h"
 #include "runtime/gtpin/gtpin_defs.h"
 #include "runtime/gtpin/gtpin_hw_helper.h"
 #include "runtime/kernel/kernel.h"
 #include "runtime/mem_obj/buffer.h"
 #include "runtime/memory_manager/surface.h"
 #include "runtime/platform/platform.h"
 #include "runtime/utilities/spinlock.h"
 #include <deque>
 #include <vector>
 using namespace gtpin;
 namespace OCLRT {
 extern bool isGTPinInitialized;
 extern gtpin::ocl::gtpin_events_t GTPinCallbacks;
 igc_init_t *pIgcInfo = nullptr;
 std::atomic<int> sequenceCount(1);
 CommandQueue *pCmdQueueForFlushTask = nullptr;
 std::deque<gtpinkexec_t> kernelExecQueue;
 std::atomic_flag kernelExecQueueLock = ATOMIC_FLAG_INIT;
 void gtpinNotifyContextCreate(cl_context context) {
    if (isGTPinInitialized) {
@@ -47,7 +60,6 @@ void gtpinNotifyContextCreate(cl_context context) {
        GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(genFamily);
        gtpinPlatformInfo.gen_version = (gtpin::GTPIN_GEN_VERSION)gtpinHelper.getGenVersion();
        gtpinPlatformInfo.device_id = static_cast<uint32_t>(pDevice->getHardwareInfo().pPlatform->usDeviceID);
        (*GTPinCallbacks.onContextCreate)((context_handle_t)context, &gtpinPlatformInfo, &pIgcInfo);
    }
 }
@@ -60,24 +72,178 @@ void gtpinNotifyContextDestroy(cl_context context) {
 void gtpinNotifyKernelCreate(cl_kernel kernel) {
    if (isGTPinInitialized) {
-        auto pKernel = castToObject<Kernel>(kernel);
+        auto pKernel = castToObjectOrAbort<Kernel>(kernel);
        size_t gtpinBTI = pKernel->getNumberOfBindingTableStates();
        // Enlarge local copy of SSH by 1 SS
        auto pPlatform = platform();
        auto pDevice = pPlatform->getDevice(0);
        GFXCORE_FAMILY genFamily = pDevice->getHardwareInfo().pPlatform->eRenderCoreFamily;
        GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(genFamily);
        if (!gtpinHelper.addSurfaceState(pKernel)) {
            // Kernel with no SSH or Kernel EM, not supported
            return;
        }
        if (pKernel->isKernelHeapSubstituted()) {
            // ISA for this kernel was already substituted
            return;
        }
        // Notify GT-Pin that new kernel was created
        Context *pContext = &(pKernel->getContext());
        cl_context context = (cl_context)pContext;
-        const KernelInfo &kInfo = pKernel->getKernelInfo();
+        auto &kernelInfo = pKernel->getKernelInfo();
        instrument_params_in_t paramsIn;
        paramsIn.kernel_type = GTPIN_KERNEL_TYPE_CS;
-        paramsIn.simd = (GTPIN_SIMD_WIDTH)kInfo.getMaxSimdSize();
+        paramsIn.simd = (GTPIN_SIMD_WIDTH)kernelInfo.getMaxSimdSize();
        paramsIn.orig_kernel_binary = (uint8_t *)pKernel->getKernelHeap();
        paramsIn.orig_kernel_size = static_cast<uint32_t>(pKernel->getKernelHeapSize());
        paramsIn.buffer_type = GTPIN_BUFFER_BINDFULL;
-        paramsIn.buffer_desc.BTI = kInfo.patchInfo.bindingTableState->Count;
+        paramsIn.buffer_desc.BTI = static_cast<uint32_t>(gtpinBTI);
-        paramsIn.igc_hash_id = kInfo.heapInfo.pKernelHeader->ShaderHashCode;
+        paramsIn.igc_hash_id = kernelInfo.heapInfo.pKernelHeader->ShaderHashCode;
-        paramsIn.kernel_name = (char *)kInfo.name.c_str();
+        paramsIn.kernel_name = (char *)kernelInfo.name.c_str();
        paramsIn.igc_info = nullptr;
        instrument_params_out_t paramsOut = {0};
        (*GTPinCallbacks.onKernelCreate)((context_handle_t)(cl_context)context, &paramsIn, &paramsOut);
        // Substitute ISA of created kernel with instrumented code
        pKernel->substituteKernelHeap(paramsOut.inst_kernel_binary, paramsOut.inst_kernel_size);
        pKernel->setKernelId(paramsOut.kernel_id);
    }
 }
 void gtpinNotifyKernelSubmit(cl_kernel kernel, void *pCmdQueue) {
    if (isGTPinInitialized) {
        auto pKernel = castToObjectOrAbort<Kernel>(kernel);
        if (pKernel->getSurfaceStateHeapSize() == 0) {
            // Kernel with no SSH, not supported
            return;
        }
        Context *pContext = &(pKernel->getContext());
        cl_context context = (cl_context)pContext;
        uint64_t kernelId = pKernel->getKernelId();
        command_buffer_handle_t commandBuffer = (command_buffer_handle_t)((uintptr_t)(sequenceCount++));
        uint32_t kernelOffset = 0;
        resource_handle_t resource = 0;
        // Notify GT-Pin that abstract "command buffer" was created
        (*GTPinCallbacks.onCommandBufferCreate)((context_handle_t)context, commandBuffer);
        // Notify GT-Pin that kernel was submited for execution
        (*GTPinCallbacks.onKernelSubmit)(commandBuffer, kernelId, &kernelOffset, &resource);
        // Create new record in Kernel Execution Queue describing submited kernel
        gtpinkexec_t kExec;
        kExec.pKernel = pKernel;
        kExec.gtpinResource = (cl_mem)resource;
        kExec.commandBuffer = commandBuffer;
        kExec.pCommandQueue = (CommandQueue *)pCmdQueue;
        SpinLock lock;
        lock.enter(kernelExecQueueLock);
        kernelExecQueue.push_back(kExec);
        lock.leave(kernelExecQueueLock);
        // Patch SSH[gtpinBTI] with GT-Pin resource
        auto pPlatform = platform();
        auto pDevice = pPlatform->getDevice(0);
        GFXCORE_FAMILY genFamily = pDevice->getHardwareInfo().pPlatform->eRenderCoreFamily;
        GTPinHwHelper &gtpinHelper = GTPinHwHelper::get(genFamily);
        size_t gtpinBTI = pKernel->getNumberOfBindingTableStates() - 1;
        void *pSurfaceState = gtpinHelper.getSurfaceState(pKernel, gtpinBTI);
        cl_mem buffer = (cl_mem)resource;
        auto pBuffer = castToObjectOrAbort<Buffer>(buffer);
        pBuffer->setArgStateful(const_cast<void *>(pSurfaceState));
    }
 }
 void gtpinNotifyPreFlushTask(void *pCmdQueue) {
    if (isGTPinInitialized) {
        pCmdQueueForFlushTask = (CommandQueue *)pCmdQueue;
    }
 }
 void gtpinNotifyFlushTask(uint32_t flushedTaskCount) {
    if (isGTPinInitialized) {
        SpinLock lock;
        lock.enter(kernelExecQueueLock);
        size_t numElems = kernelExecQueue.size();
        for (size_t n = 0; n < numElems; n++) {
            if ((kernelExecQueue[n].pCommandQueue == pCmdQueueForFlushTask) && !kernelExecQueue[n].isTaskCountValid) {
                // Update record in Kernel Execution Queue with kernel's TC
                kernelExecQueue[n].isTaskCountValid = true;
                kernelExecQueue[n].taskCount = flushedTaskCount;
                break;
            }
        }
        lock.leave(kernelExecQueueLock);
        pCmdQueueForFlushTask = nullptr;
    }
 }
 void gtpinNotifyTaskCompletion(uint32_t completedTaskCount) {
    if (isGTPinInitialized) {
        SpinLock lock;
        lock.enter(kernelExecQueueLock);
        size_t numElems = kernelExecQueue.size();
        for (size_t n = 0; n < numElems;) {
            if (kernelExecQueue[n].isTaskCountValid && (kernelExecQueue[n].taskCount <= completedTaskCount)) {
                // Notify GT-Pin that execution of "command buffer" was completed
                (*GTPinCallbacks.onCommandBufferComplete)(kernelExecQueue[n].commandBuffer);
                // Remove kernel's record from Kernel Execution Queue
                kernelExecQueue.erase(kernelExecQueue.begin() + n);
                numElems--;
            } else {
                n++;
            }
        }
        lock.leave(kernelExecQueueLock);
    }
 }
 void gtpinNotifyMakeResident(void *pKernel, void *pCSR) {
    if (isGTPinInitialized) {
        SpinLock lock;
        lock.enter(kernelExecQueueLock);
        size_t numElems = kernelExecQueue.size();
        for (size_t n = 0; n < numElems; n++) {
            if ((kernelExecQueue[n].pKernel == pKernel) && !kernelExecQueue[n].isResourceResident) {
                // It's time for kernel to make resident its GT-Pin resource
                CommandStreamReceiver *pCommandStreamReceiver = reinterpret_cast<CommandStreamReceiver *>(pCSR);
                cl_mem gtpinBuffer = kernelExecQueue[n].gtpinResource;
                auto pBuffer = castToObjectOrAbort<Buffer>(gtpinBuffer);
                GraphicsAllocation *pGfxAlloc = pBuffer->getGraphicsAllocation();
                pCommandStreamReceiver->makeResident(*pGfxAlloc);
                kernelExecQueue[n].isResourceResident = true;
                break;
            }
        }
        lock.leave(kernelExecQueueLock);
    }
 }
 void gtpinNotifyUpdateResidencyList(void *pKernel, void *pResVec) {
    if (isGTPinInitialized) {
        SpinLock lock;
        lock.enter(kernelExecQueueLock);
        size_t numElems = kernelExecQueue.size();
        for (size_t n = 0; n < numElems; n++) {
            if ((kernelExecQueue[n].pKernel == pKernel) && !kernelExecQueue[n].isResourceResident) {
                // It's time for kernel to update its residency list with its GT-Pin resource
                std::vector<Surface *> *pResidencyVector = (std::vector<Surface *> *)pResVec;
                cl_mem gtpinBuffer = kernelExecQueue[n].gtpinResource;
                auto pBuffer = castToObjectOrAbort<Buffer>(gtpinBuffer);
                GraphicsAllocation *pGfxAlloc = pBuffer->getGraphicsAllocation();
                GeneralSurface *pSurface = new GeneralSurface(pGfxAlloc);
                pResidencyVector->push_back(pSurface);
                kernelExecQueue[n].isResourceResident = true;
                break;
            }
        }
        lock.leave(kernelExecQueueLock);
    }
 }
 void gtpinNotifyPlatformShutdown() {
    if (isGTPinInitialized) {
        // Clear Kernel Execution Queue
        kernelExecQueue.clear();
    }
 }
 bool gtpinIsGTPinInitialized() {
    return isGTPinInitialized;
 }
 }
--- a/runtime/gtpin/gtpin_defs.h
+++ b/runtime/gtpin/gtpin_defs.h
@@ -0,0 +1,52 @@
 /*
 * Copyright (c) 2018, Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
 #include "config.h"
 #include "gtpin_ocl_interface.h"
 #include "CL/cl.h"
 #include "runtime/command_queue/command_queue.h"
 #include "runtime/kernel/kernel.h"
 namespace OCLRT {
 struct GTPinKernelExec {
    Kernel *pKernel;
    cl_mem gtpinResource;
    CommandQueue *pCommandQueue;
    gtpin::command_buffer_handle_t commandBuffer;
    uint32_t taskCount;
    bool isTaskCountValid;
    bool isResourceResident;
    GTPinKernelExec() {
        pKernel = nullptr;
        gtpinResource = nullptr;
        pCommandQueue = nullptr;
        commandBuffer = nullptr;
        taskCount = 0;
        isTaskCountValid = false;
        isResourceResident = false;
    }
 };
 typedef struct GTPinKernelExec gtpinkexec_t;
 } // OCLRT
--- a/runtime/gtpin/gtpin_hw_helper.h
+++ b/runtime/gtpin/gtpin_hw_helper.h
@@ -24,10 +24,14 @@
 #include "runtime/gen_common/hw_cmds.h"
 namespace OCLRT {
 class Kernel;
 class GTPinHwHelper {
  public:
    static GTPinHwHelper &get(GFXCORE_FAMILY gfxCore);
    virtual uint32_t getGenVersion() = 0;
    virtual bool addSurfaceState(Kernel *pKernel) = 0;
    virtual void *getSurfaceState(Kernel *pKernel, size_t bti) = 0;
  protected:
    GTPinHwHelper(){};
@@ -41,6 +45,8 @@ class GTPinHwHelperHw : public GTPinHwHelper {
        return gtpinHwHelper;
    }
    uint32_t getGenVersion() override;
    bool addSurfaceState(Kernel *pKernel) override;
    void *getSurfaceState(Kernel *pKernel, size_t bti) override;
  private:
    GTPinHwHelperHw(){};
--- a/runtime/gtpin/gtpin_hw_helper.inl
+++ b/runtime/gtpin/gtpin_hw_helper.inl
@@ -0,0 +1,74 @@
 /*
 * Copyright (c) 2018, Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
 #include "hw_cmds.h"
 #include "runtime/gtpin/gtpin_hw_helper.h"
 #include "runtime/helpers/string.h"
 #include "runtime/kernel/kernel.h"
 namespace OCLRT {
 template <typename GfxFamily>
 bool GTPinHwHelperHw<GfxFamily>::addSurfaceState(Kernel *pKernel) {
    using RENDER_SURFACE_STATE = typename GfxFamily::RENDER_SURFACE_STATE;
    using BINDING_TABLE_STATE = typename GfxFamily::BINDING_TABLE_STATE;
    size_t sshSize = pKernel->getSurfaceStateHeapSize();
    if ((sshSize == 0) || pKernel->isParentKernel) {
        // Kernels which do not use SSH or use Execution Model are not supported (yet)
        return false;
    }
    size_t ssSize = sizeof(RENDER_SURFACE_STATE);
    size_t btsSize = sizeof(BINDING_TABLE_STATE);
    size_t sizeToEnlarge = ssSize + btsSize;
    size_t currBTOffset = pKernel->getBindingTableOffset();
    size_t currSurfaceStateSize = currBTOffset;
    char *pSsh = reinterpret_cast<char *>(pKernel->getSurfaceStateHeap());
    char *pNewSsh = new char[sshSize + sizeToEnlarge];
    memcpy_s(pNewSsh, sshSize + sizeToEnlarge, pSsh, currSurfaceStateSize);
    RENDER_SURFACE_STATE *pSS = reinterpret_cast<RENDER_SURFACE_STATE *>(pNewSsh + currSurfaceStateSize);
    pSS->init();
    size_t newSurfaceStateSize = currSurfaceStateSize + ssSize;
    size_t currBTCount = pKernel->getNumberOfBindingTableStates();
    memcpy_s(pNewSsh + newSurfaceStateSize, sshSize + sizeToEnlarge - newSurfaceStateSize, pSsh + currBTOffset, currBTCount * btsSize);
    BINDING_TABLE_STATE *pNewBTS = reinterpret_cast<BINDING_TABLE_STATE *>(pNewSsh + newSurfaceStateSize + currBTCount * btsSize);
    BINDING_TABLE_STATE bti;
    bti.init();
    bti.setSurfaceStatePointer((uint64_t)currBTOffset);
    *pNewBTS = bti;
    pKernel->resizeSurfaceStateHeap(pNewSsh, sshSize + sizeToEnlarge, currBTCount + 1, newSurfaceStateSize);
    return true;
 }
 template <typename GfxFamily>
 void *GTPinHwHelperHw<GfxFamily>::getSurfaceState(Kernel *pKernel, size_t bti) {
    using BINDING_TABLE_STATE = typename GfxFamily::BINDING_TABLE_STATE;
    if ((nullptr == pKernel->getSurfaceStateHeap()) || (bti >= pKernel->getNumberOfBindingTableStates())) {
        return nullptr;
    }
    auto *pBts = reinterpret_cast<BINDING_TABLE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(), (pKernel->getBindingTableOffset() + bti * sizeof(BINDING_TABLE_STATE))));
    auto pSurfaceState = ptrOffset(pKernel->getSurfaceStateHeap(), pBts->getSurfaceStatePointer());
    return pSurfaceState;
 }
 } // namespace OCLRT
--- a/runtime/gtpin/gtpin_notify.h
+++ b/runtime/gtpin/gtpin_notify.h
@@ -26,4 +26,12 @@ namespace OCLRT {
 void gtpinNotifyContextCreate(cl_context context);
 void gtpinNotifyContextDestroy(cl_context context);
 void gtpinNotifyKernelCreate(cl_kernel kernel);
 void gtpinNotifyKernelSubmit(cl_kernel kernel, void *pCmdQueue);
 void gtpinNotifyPreFlushTask(void *pCmdQueue);
 void gtpinNotifyFlushTask(uint32_t flushedTaskCount);
 void gtpinNotifyTaskCompletion(uint32_t completedTaskCount);
 void gtpinNotifyMakeResident(void *pKernel, void *pCommandStreamReceiver);
 void gtpinNotifyUpdateResidencyList(void *pKernel, void *pResidencyVector);
 void gtpinNotifyPlatformShutdown();
 bool gtpinIsGTPinInitialized();
 }
--- a/runtime/helpers/base_object.h
+++ b/runtime/helpers/base_object.h
@@ -107,6 +107,12 @@ class TakeOwnershipWrapper {
        : obj(obj) {
        this->locked = obj.takeOwnership(true);
    }
    TakeOwnershipWrapper(T &obj, bool lockImmediately)
        : obj(obj) {
        if (lockImmediately) {
            this->locked = obj.takeOwnership(true);
        }
    }
    ~TakeOwnershipWrapper() {
        if (locked) {
            obj.releaseOwnership();
--- a/runtime/helpers/kernel_commands.h
+++ b/runtime/helpers/kernel_commands.h
@@ -76,16 +76,20 @@ struct KernelCommandsHelper : public PerThreadDataHelper {
        const Kernel &kernel);
    static size_t pushBindingTableAndSurfaceStates(IndirectHeap &dstHeap, const KernelInfo &srcKernelInfo,
-                                                   const void *srcKernelSsh, size_t srcKernelSshSize);
+                                                   const void *srcKernelSsh, size_t srcKernelSshSize,
                                                   size_t numberOfBindingTableStates, size_t offsetOfBindingTable);
    static size_t pushBindingTableAndSurfaceStates(IndirectHeap &dstHeap, const KernelInfo &srcKernelInfo) {
        return pushBindingTableAndSurfaceStates(dstHeap, srcKernelInfo, srcKernelInfo.heapInfo.pSsh,
-                                                srcKernelInfo.heapInfo.pKernelHeader->SurfaceStateHeapSize);
+                                                srcKernelInfo.heapInfo.pKernelHeader->SurfaceStateHeapSize,
                                                (srcKernelInfo.patchInfo.bindingTableState != nullptr) ? srcKernelInfo.patchInfo.bindingTableState->Count : 0,
                                                (srcKernelInfo.patchInfo.bindingTableState != nullptr) ? srcKernelInfo.patchInfo.bindingTableState->Offset : 0);
    }
    static size_t pushBindingTableAndSurfaceStates(IndirectHeap &dstHeap, const Kernel &srcKernel) {
        return pushBindingTableAndSurfaceStates(dstHeap, srcKernel.getKernelInfo(),
-                                                srcKernel.getSurfaceStateHeap(), srcKernel.getSurfaceStateHeapSize());
+                                                srcKernel.getSurfaceStateHeap(), srcKernel.getSurfaceStateHeapSize(),
                                                srcKernel.getNumberOfBindingTableStates(), srcKernel.getBindingTableOffset());
    }
    static size_t sendIndirectState(
--- a/runtime/helpers/kernel_commands.inl
+++ b/runtime/helpers/kernel_commands.inl
@@ -265,7 +265,8 @@ size_t KernelCommandsHelper<GfxFamily>::sendCrossThreadData(
 // as required by the INTERFACE_DESCRIPTOR_DATA.
 template <typename GfxFamily>
 size_t KernelCommandsHelper<GfxFamily>::pushBindingTableAndSurfaceStates(IndirectHeap &dstHeap, const KernelInfo &srcKernelInfo,
-                                                                         const void *srcKernelSsh, size_t srcKernelSshSize) {
+                                                                         const void *srcKernelSsh, size_t srcKernelSshSize,
                                                                         size_t numberOfBindingTableStates, size_t offsetOfBindingTable) {
    using BINDING_TABLE_STATE = typename GfxFamily::BINDING_TABLE_STATE;
    using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
    using RENDER_SURFACE_STATE = typename GfxFamily::RENDER_SURFACE_STATE;
@@ -274,9 +275,8 @@ size_t KernelCommandsHelper<GfxFamily>::pushBindingTableAndSurfaceStates(Indirec
        // according to compiler, kernel does not reference BTIs to stateful surfaces, so there's nothing to patch
        return 0;
    }
-    size_t sshSize = srcKernelInfo.heapInfo.pKernelHeader->SurfaceStateHeapSize;
+    size_t sshSize = srcKernelSshSize;
-    DEBUG_BREAK_IF(!((sshSize <= srcKernelSshSize) && (srcKernelSsh != nullptr)));
+    DEBUG_BREAK_IF(srcKernelSsh == nullptr);
    uint32_t localBtiOffset = srcKernelInfo.patchInfo.bindingTableState->Offset;
    auto srcSurfaceState = srcKernelSsh;
    // Align the heap and allocate space for new ssh data
@@ -289,21 +289,21 @@ size_t KernelCommandsHelper<GfxFamily>::pushBindingTableAndSurfaceStates(Indirec
        // nothing to patch, we're at the start of heap (which is assumed to be the surface state base address)
        // we need to simply copy the ssh (including BTIs from compiler)
        memcpy_s(dstSurfaceState, sshSize, srcSurfaceState, sshSize);
-        return localBtiOffset;
+        return offsetOfBindingTable;
    }
    // We can copy-over the surface states, but BTIs will need to be patched
-    memcpy_s(dstSurfaceState, sshSize, srcSurfaceState, localBtiOffset);
+    memcpy_s(dstSurfaceState, sshSize, srcSurfaceState, offsetOfBindingTable);
    uint32_t surfaceStatesOffset = static_cast<uint32_t>(ptrDiff(dstSurfaceState, dstHeap.getBase()));
    // march over BTIs and offset the pointers based on surface state base address
-    auto *dstBtiTableBase = reinterpret_cast<BINDING_TABLE_STATE *>(ptrOffset(dstSurfaceState, localBtiOffset));
+    auto *dstBtiTableBase = reinterpret_cast<BINDING_TABLE_STATE *>(ptrOffset(dstSurfaceState, offsetOfBindingTable));
    DEBUG_BREAK_IF(reinterpret_cast<uintptr_t>(dstBtiTableBase) % INTERFACE_DESCRIPTOR_DATA::BINDINGTABLEPOINTER_ALIGN_SIZE != 0);
-    auto *srcBtiTableBase = reinterpret_cast<const BINDING_TABLE_STATE *>(ptrOffset(srcSurfaceState, localBtiOffset));
+    auto *srcBtiTableBase = reinterpret_cast<const BINDING_TABLE_STATE *>(ptrOffset(srcSurfaceState, offsetOfBindingTable));
    BINDING_TABLE_STATE bti;
    bti.init(); // init whole DWORD - i.e. not just the SurfaceStatePointer bits
-    for (uint32_t i = 0, e = srcKernelInfo.patchInfo.bindingTableState->Count; i != e; ++i) {
+    for (uint32_t i = 0, e = (uint32_t)numberOfBindingTableStates; i != e; ++i) {
        uint32_t localSurfaceStateOffset = srcBtiTableBase[i].getSurfaceStatePointer();
        uint32_t offsetedSurfaceStateOffset = localSurfaceStateOffset + surfaceStatesOffset;
        bti.setSurfaceStatePointer(offsetedSurfaceStateOffset); // patch just the SurfaceStatePointer bits
--- a/runtime/helpers/task_information.cpp
+++ b/runtime/helpers/task_information.cpp
@@ -26,6 +26,7 @@
 #include "runtime/command_queue/enqueue_common.h"
 #include "runtime/device/device.h"
 #include "runtime/device_queue/device_queue.h"
 #include "runtime/gtpin/gtpin_notify.h"
 #include "runtime/mem_obj/image.h"
 #include "runtime/memory_manager/surface.h"
 #include "runtime/helpers/aligned_memory.h"
@@ -76,6 +77,8 @@ CompletionStamp &CommandMapUnmap::submit(uint32_t taskLevel, bool terminated) {
    DEBUG_BREAK_IF(taskLevel >= Event::eventNotReady);
    gtpinNotifyPreFlushTask(&cmdQ);
    completionStamp = csr.flushTask(queueCommandStream,
                                    offset,
                                    cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE),
@@ -267,6 +270,8 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
    DEBUG_BREAK_IF(taskLevel >= Event::eventNotReady);
    gtpinNotifyPreFlushTask(&commandQueue);
    completionStamp = commandStreamReceiver.flushTask(queueCommandStream,
                                                      offset,
                                                      *dsh,
@@ -275,12 +280,10 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
                                                      ssh,
                                                      taskLevel,
                                                      dispatchFlags);
    commandQueue.waitUntilComplete(completionStamp.taskCount, completionStamp.flushStamp);
    for (auto &surface : surfaces) {
        surface->setCompletionStamp(completionStamp, nullptr, nullptr);
    }
    commandQueue.waitUntilComplete(completionStamp.taskCount, completionStamp.flushStamp);
    if (printfHandler) {
        printfHandler.get()->printEnqueueOutput();
@@ -309,6 +312,8 @@ CompletionStamp &CommandMarker::submit(uint32_t taskLevel, bool terminated) {
    DEBUG_BREAK_IF(taskLevel >= Event::eventNotReady);
    gtpinNotifyPreFlushTask(&cmdQ);
    completionStamp = csr.flushTask(queueCommandStream,
                                    offset,
                                    cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE),
--- a/runtime/kernel/kernel.cpp
+++ b/runtime/kernel/kernel.cpp
@@ -30,6 +30,7 @@
 #include "runtime/helpers/surface_formats.h"
 #include "runtime/device_queue/device_queue.h"
 #include "runtime/execution_model/device_enqueue.h"
 #include "runtime/gtpin/gtpin_notify.h"
 #include "runtime/helpers/aligned_memory.h"
 #include "runtime/helpers/basic_math.h"
 #include "runtime/helpers/debug_helpers.h"
@@ -94,6 +95,8 @@ Kernel::Kernel(Program *programArg, const KernelInfo &kernelInfoArg, const Devic
      context(nullptr),
      device(deviceArg),
      kernelInfo(kernelInfoArg),
      numberOfBindingTableStates(0),
      localBindingTableOffset(0),
      pSshLocal(nullptr),
      sshLocalSize(0),
      crossThreadData(nullptr),
@@ -245,6 +248,8 @@ cl_int Kernel::initialize() {
            // copy the ssh into our local copy
            memcpy_s(pSshLocal, sshLocalSize, heapInfo.pSsh, sshLocalSize);
        }
        numberOfBindingTableStates = (patchInfo.bindingTableState != nullptr) ? patchInfo.bindingTableState->Count : 0;
        localBindingTableOffset = (patchInfo.bindingTableState != nullptr) ? patchInfo.bindingTableState->Offset : 0;
        // patch crossthread data and ssh with inline surfaces, if necessary
        privateSurfaceSize = patchInfo.pAllocateStatelessPrivateSurface
@@ -690,6 +695,11 @@ void Kernel::substituteKernelHeap(void *newKernelHeap, size_t newKernelHeapSize)
    *pKernelHeap = newKernelHeap;
    SKernelBinaryHeaderCommon *pHeader = const_cast<SKernelBinaryHeaderCommon *>(pKernelInfo->heapInfo.pKernelHeader);
    pHeader->KernelHeapSize = static_cast<uint32_t>(newKernelHeapSize);
    pKernelInfo->isKernelHeapSubstituted = true;
 }
 bool Kernel::isKernelHeapSubstituted() const {
    return kernelInfo.isKernelHeapSubstituted;
 }
 uint64_t Kernel::getKernelId() const {
@@ -721,16 +731,20 @@ const void *Kernel::getDynamicStateHeap() const {
 size_t Kernel::getSurfaceStateHeapSize() const {
    return kernelInfo.usesSsh
-               ? kernelInfo.heapInfo.pKernelHeader->SurfaceStateHeapSize
+               ? sshLocalSize
               : 0;
 }
-size_t Kernel::getNumberOfSurfaceStates() const {
+size_t Kernel::getNumberOfBindingTableStates() const {
-    const auto &patchInfo = kernelInfo.patchInfo;
+    return numberOfBindingTableStates;
-    if (patchInfo.bindingTableState == nullptr) {
+}
-        return 0;
+
-    }
+void Kernel::resizeSurfaceStateHeap(void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset) {
-    return patchInfo.bindingTableState->Count;
+    delete[] pSshLocal;
    pSshLocal = reinterpret_cast<char *>(pNewSsh);
    sshLocalSize = static_cast<uint32_t>(newSshSize);
    numberOfBindingTableStates = newBindingTableCount;
    localBindingTableOffset = newBindingTableOffset;
 }
 uint32_t Kernel::getScratchSizeValueToProgramMediaVfeState(int scratchSize) {
@@ -927,6 +941,8 @@ void Kernel::makeResident(CommandStreamReceiver &commandStreamReceiver) {
    }
    makeArgsResident(commandStreamReceiver);
    gtpinNotifyMakeResident(this, &commandStreamReceiver);
 }
 void Kernel::getResidency(std::vector<Surface *> &dst) {
@@ -964,6 +980,8 @@ void Kernel::getResidency(std::vector<Surface *> &dst) {
            }
        }
    }
    gtpinNotifyUpdateResidencyList(this, &dst);
 }
 bool Kernel::requiresCoherency() {
--- a/runtime/kernel/kernel.h
+++ b/runtime/kernel/kernel.h
@@ -156,9 +156,15 @@ class Kernel : public BaseObject<_cl_kernel> {
    size_t getKernelHeapSize() const;
    size_t getSurfaceStateHeapSize() const;
    size_t getDynamicStateHeapSize() const;
-    size_t getNumberOfSurfaceStates() const;
+    size_t getNumberOfBindingTableStates() const;
    size_t getBindingTableOffset() const {
        return localBindingTableOffset;
    }
    void resizeSurfaceStateHeap(void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset);
    void substituteKernelHeap(void *newKernelHeap, size_t newKernelHeapSize);
    bool isKernelHeapSubstituted() const;
    uint64_t getKernelId() const;
    void setKernelId(uint64_t newKernelId);
@@ -434,6 +440,8 @@ class Kernel : public BaseObject<_cl_kernel> {
    std::vector<KernelArgHandler> kernelArgHandlers;
    std::vector<GraphicsAllocation *> kernelSvmGfxAllocations;
    size_t numberOfBindingTableStates;
    size_t localBindingTableOffset;
    char *pSshLocal;
    uint32_t sshLocalSize;
--- a/runtime/platform/platform.cpp
+++ b/runtime/platform/platform.cpp
@@ -25,6 +25,7 @@
 #include "runtime/compiler_interface/compiler_interface.h"
 #include "CL/cl_ext.h"
 #include "runtime/device/device.h"
 #include "runtime/gtpin/gtpin_notify.h"
 #include "runtime/helpers/debug_helpers.h"
 #include "runtime/helpers/get_info.h"
 #include "runtime/helpers/options.h"
@@ -191,6 +192,8 @@ void Platform::shutdown() {
    DeviceFactory::releaseDevices();
    std::string().swap(compilerExtensions);
    gtpinNotifyPlatformShutdown();
 }
 Device *Platform::getDevice(size_t deviceOrdinal) {
--- a/runtime/program/kernel_info.h
+++ b/runtime/program/kernel_info.h
@@ -236,5 +236,6 @@ struct KernelInfo {
    uint32_t argumentsToPatchNum = 0;
    uint32_t systemKernelOffset = 0;
    uint64_t kernelId = 0;
    bool isKernelHeapSubstituted = false;
 };
 } // namespace OCLRT
--- a/runtime/utilities/CMakeLists.txt
+++ b/runtime/utilities/CMakeLists.txt
@@ -35,6 +35,7 @@ set(RUNTIME_SRCS_UTILITIES_BASE
  ${CMAKE_CURRENT_SOURCE_DIR}/perf_profiler.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/perf_profiler.h
  ${CMAKE_CURRENT_SOURCE_DIR}/reference_tracked_object.h
  ${CMAKE_CURRENT_SOURCE_DIR}/spinlock.h
  ${CMAKE_CURRENT_SOURCE_DIR}/stackvec.h
  ${CMAKE_CURRENT_SOURCE_DIR}/tag_allocator.h
  ${CMAKE_CURRENT_SOURCE_DIR}/tag_allocator_base.h
--- a/runtime/utilities/spinlock.h
+++ b/runtime/utilities/spinlock.h
@@ -0,0 +1,40 @@
 /*
 * Copyright (c) 2018, Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
 #pragma once
 #include <atomic>
 namespace OCLRT {
 class SpinLock {
  public:
    void enter(std::atomic_flag &spinLock) {
        while (spinLock.test_and_set(std::memory_order_acquire)) {
        };
    }
    void leave(std::atomic_flag &spinLock) {
        spinLock.clear(std::memory_order_release);
    }
 };
 } // OCLRT
--- a/unit_tests/execution_model/enqueue_execution_model_kernel_tests.cpp
+++ b/unit_tests/execution_model/enqueue_execution_model_kernel_tests.cpp
@@ -214,7 +214,7 @@ HWTEST_P(ParentKernelEnqueueTest, givenParentKernelWhenEnqueuedThenBlocksSurface
            Kernel *blockKernel = Kernel::create(pKernel->getProgram(), *pBlockInfo, nullptr);
            blockSSH = alignUp(blockSSH, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
-            if (blockKernel->getNumberOfSurfaceStates() > 0) {
+            if (blockKernel->getNumberOfBindingTableStates() > 0) {
                ASSERT_NE(nullptr, pBlockInfo->patchInfo.bindingTableState);
                auto dstBlockBti = ptrOffset(blockSSH, pBlockInfo->patchInfo.bindingTableState->Offset);
                EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(dstBlockBti) % INTERFACE_DESCRIPTOR_DATA::BINDINGTABLEPOINTER_ALIGN_SIZE);
@@ -222,7 +222,7 @@ HWTEST_P(ParentKernelEnqueueTest, givenParentKernelWhenEnqueuedThenBlocksSurface
                auto srcBlockBti = ptrOffset(pBlockInfo->heapInfo.pSsh, pBlockInfo->patchInfo.bindingTableState->Offset);
                auto srcBindingTable = reinterpret_cast<BINDING_TABLE_STATE *>(srcBlockBti);
-                for (uint32_t i = 0; i < blockKernel->getNumberOfSurfaceStates(); ++i) {
+                for (uint32_t i = 0; i < blockKernel->getNumberOfBindingTableStates(); ++i) {
                    uint32_t dstSurfaceStatePointer = dstBindingTable[i].getSurfaceStatePointer();
                    uint32_t srcSurfaceStatePointer = srcBindingTable[i].getSurfaceStatePointer();
                    auto *dstSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(ptrOffset(ssh->getBase(), dstSurfaceStatePointer));
--- a/unit_tests/execution_model/parent_kernel_dispatch_tests.cpp
+++ b/unit_tests/execution_model/parent_kernel_dispatch_tests.cpp
@@ -188,7 +188,7 @@ HWTEST_P(ParentKernelDispatchTest, givenParentKernelWhenQueueIsBlockedThenSSHSiz
        size_t sshUsed = blockedCommandsData->ssh->getUsed();
-        size_t expectedSizeSSH = pKernel->getNumberOfSurfaceStates() * sizeof(RENDER_SURFACE_STATE) + pKernel->getKernelInfo().patchInfo.bindingTableState->Count * sizeof(BINDING_TABLE_STATE);
+        size_t expectedSizeSSH = pKernel->getNumberOfBindingTableStates() * sizeof(RENDER_SURFACE_STATE) + pKernel->getKernelInfo().patchInfo.bindingTableState->Count * sizeof(BINDING_TABLE_STATE);
        if ((pKernel->requiresSshForBuffers()) || (pKernel->getKernelInfo().patchInfo.imageMemObjKernelArgs.size() > 0)) {
            EXPECT_EQ(expectedSizeSSH, sshUsed);
--- a/unit_tests/gtpin/gtpin_tests.cpp
+++ b/unit_tests/gtpin/gtpin_tests.cpp
--- a/unit_tests/helpers/base_object_tests.cpp
+++ b/unit_tests/helpers/base_object_tests.cpp
@@ -324,6 +324,20 @@ TEST(BaseObjectTest, takeOwnership) {
    EXPECT_FALSE(buffer.hasOwnership());
 }
 TEST(BaseObjectTest, takeOwnershipWrapper) {
    MockBuffer buffer;
    {
        TakeOwnershipWrapper<Buffer> bufferOwnership(buffer, false);
        EXPECT_FALSE(buffer.hasOwnership());
    }
    {
        TakeOwnershipWrapper<Buffer> bufferOwnership(buffer, true);
        EXPECT_TRUE(buffer.hasOwnership());
        bufferOwnership.unlock();
        EXPECT_FALSE(buffer.hasOwnership());
    }
 }
 TYPED_TEST(BaseObjectTests, getCond) {
    TypeParam *object = new TypeParam;
--- a/unit_tests/helpers/kernel_commands_tests.cpp
+++ b/unit_tests/helpers/kernel_commands_tests.cpp
@@ -486,7 +486,7 @@ HWTEST_F(KernelCommandsTest, usedBindingTableStatePointersForGlobalAndConstantAn
        auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);
        // Initialize binding table state pointers with pattern
-        EXPECT_EQ(numSurfaces, pKernel->getNumberOfSurfaceStates());
+        EXPECT_EQ(numSurfaces, pKernel->getNumberOfBindingTableStates());
        const size_t localWorkSizes[3]{256, 1, 1};
@@ -566,7 +566,7 @@ HWTEST_F(KernelCommandsTest, setBindingTableStatesForKernelWithBuffersNotRequiri
    auto usedBefore = ssh.getUsed();
    // Initialize binding table state pointers with pattern
-    auto numSurfaceStates = pKernel->getNumberOfSurfaceStates();
+    auto numSurfaceStates = pKernel->getNumberOfBindingTableStates();
    EXPECT_EQ(0u, numSurfaceStates);
    // set binding table states
@@ -613,10 +613,13 @@ HWTEST_F(KernelCommandsTest, setBindingTableStatesForNoSurfaces) {
    auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);
    // Initialize binding table state pointers with pattern
-    auto numSurfaceStates = pKernel->getNumberOfSurfaceStates();
+    auto numSurfaceStates = pKernel->getNumberOfBindingTableStates();
    EXPECT_EQ(0u, numSurfaceStates);
-    auto dstBindingTablePointer = KernelCommandsHelper<FamilyType>::pushBindingTableAndSurfaceStates(ssh, *pKernel);
+    auto dstBindingTablePointer = KernelCommandsHelper<FamilyType>::pushBindingTableAndSurfaceStates(ssh, *pKernelInfo);
    EXPECT_EQ(0u, dstBindingTablePointer);
    dstBindingTablePointer = KernelCommandsHelper<FamilyType>::pushBindingTableAndSurfaceStates(ssh, *pKernel);
    EXPECT_EQ(0u, dstBindingTablePointer);
    SPatchBindingTableState bindingTableState;
--- a/unit_tests/utilities/CMakeLists.txt
+++ b/unit_tests/utilities/CMakeLists.txt
@@ -31,6 +31,7 @@ set(IGDRCL_SRCS_tests_utilities
    "${CMAKE_CURRENT_SOURCE_DIR}/timer_util_tests.cpp"
    "${CMAKE_CURRENT_SOURCE_DIR}/cpuinfo_tests.cpp"
    "${CMAKE_CURRENT_SOURCE_DIR}/perf_profiler.cpp"
    "${CMAKE_CURRENT_SOURCE_DIR}/spinlock_tests.cpp"
    "${CMAKE_CURRENT_SOURCE_DIR}/vec_tests.cpp"
    PARENT_SCOPE
 )
--- a/unit_tests/utilities/spinlock_tests.cpp
+++ b/unit_tests/utilities/spinlock_tests.cpp
@@ -0,0 +1,64 @@
 /*
 * Copyright (c) 2017, Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
 #include "runtime/utilities/spinlock.h"
 #include "gtest/gtest.h"
 #include <thread>
 using namespace OCLRT;
 TEST(SpinLockTest, givenTwoThreadsThenVerifyThatTheySynchronizeWithSpinLock) {
    std::atomic_flag syncLock = ATOMIC_FLAG_INIT;
    std::atomic<bool> threadStarted(false);
    std::atomic<bool> threadFinished(false);
    SpinLock lock1;
    int sharedCount = 0;
    // Initially acquire spin lock so the worker thread will wait
    lock1.enter(syncLock);
    // Start worker thread
    std::thread t([&]() {
        threadStarted = true;
        SpinLock lock2;
        lock2.enter(syncLock);
        sharedCount++;
        EXPECT_EQ(2, sharedCount);
        lock2.leave(syncLock);
        threadFinished = true;
    });
    // Wait till worker thread is started
    while (!threadStarted) {
    };
    sharedCount++;
    EXPECT_EQ(1, sharedCount);
    // Release spin lock thus allowing worker thread to proceed
    lock1.leave(syncLock);
    // Wait till worker thread finishes
    while (!threadFinished) {
    };
    EXPECT_EQ(2, sharedCount);
    t.join();
 }