Initial commit

Change-Id: I4bf1707bd3dfeadf2c17b0a7daff372b1925ebbd
2025-09-15 13:01:45 +08:00 · 2017-12-21 00:45:38 +01:00
commit 7e9ad41290
1350 changed files with 233156 additions and 0 deletions
--- a/runtime/program/.clang-tidy
+++ b/runtime/program/.clang-tidy
@ -0,0 +1,39 @@
+---
+Checks:          'clang-diagnostic-*,clang-analyzer-*,google-default-arguments,readability-identifier-naming,modernize-use-override,modernize-use-default-member-init,-clang-analyzer-alpha*,-clang-analyzer-optin.performance.Padding'
+# -clang-analyzer-core.CallAndMessage
+# WarningsAsErrors: '.*'
+HeaderFilterRegex: 'runtime/'
+AnalyzeTemporaryDtors: false
+CheckOptions:    
+  - key:             google-readability-braces-around-statements.ShortStatementLines
+    value:           '1'
+  - key:             google-readability-function-size.StatementThreshold
+    value:           '800'
+  - key:             google-readability-namespace-comments.ShortNamespaceLines
+    value:           '10'
+  - key:             google-readability-namespace-comments.SpacesBeforeComments
+    value:           '2'
+  - key:             readability-identifier-naming.MethodCase
+    value:           camelBack
+  - key:             readability-identifier-naming.ParameterCase
+    value:           camelBack
+  - key:             readability-identifier-naming.ClassMemberCase
+    value:           camelBack
+  - key:             readability-identifier-naming.ClassMethodCase
+    value:           camelBack
+  - key:             modernize-loop-convert.MaxCopySize
+    value:           '16'
+  - key:             modernize-loop-convert.MinConfidence
+    value:           reasonable
+  - key:             modernize-loop-convert.NamingStyle
+    value:           CamelCase
+  - key:             modernize-pass-by-value.IncludeStyle
+    value:           llvm
+  - key:             modernize-replace-auto-ptr.IncludeStyle
+    value:           llvm
+  - key:             modernize-use-nullptr.NullMacros
+    value:           'NULL'
+  - key:             modernize-use-default-member-init.UseAssignment
+    value:           '1'
+...
+
--- a/runtime/program/block_kernel_manager.cpp
+++ b/runtime/program/block_kernel_manager.cpp
@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "block_kernel_manager.h"
+#include "runtime/helpers/debug_helpers.h"
+
+namespace OCLRT {
+
+// Appends blockKernelInfo to the list of block kernels and latches
+// blockUsesPrintf once any registered block has a non-null
+// patchInfo.pAllocateStatelessPrintfSurface.
+// Entries stored here are deleted by ~BlockKernelManager().
+void BlockKernelManager::addBlockKernelInfo(KernelInfo *blockKernelInfo) {
+    blockKernelInfoArray.push_back(blockKernelInfo);
+
+    const bool hasPrintfSurface = (nullptr != blockKernelInfo->patchInfo.pAllocateStatelessPrintfSurface);
+    if (hasPrintfSurface) {
+        blockUsesPrintf = true;
+    }
+}
+
+// Returns the block kernel stored at position `ordinal`.
+// An out-of-range ordinal is reported through DEBUG_BREAK_IF; the element
+// access that follows is not range-checked.
+const KernelInfo *BlockKernelManager::getBlockKernelInfo(size_t ordinal) {
+    DEBUG_BREAK_IF(!(ordinal < blockKernelInfoArray.size()));
+
+    const KernelInfo *requested = blockKernelInfoArray[ordinal];
+    return requested;
+}
+
+// Deletes every KernelInfo held in blockKernelInfoArray.
+// The pointers in blockPrivateSurfaceArray are not released here.
+BlockKernelManager::~BlockKernelManager() {
+    for (KernelInfo *ownedInfo : blockKernelInfoArray) {
+        delete ownedInfo;
+    }
+}
+// Records `allocation` as the private surface of the block kernel at `ordinal`.
+//
+// The surface table is grown on demand to the number of registered block
+// kernels. std::vector::resize() fills only the slots it appends (with
+// nullptr) and leaves existing slots untouched, so surfaces pushed before the
+// table had to grow are kept instead of being reset.
+//
+// An ordinal outside the table is reported through DEBUG_BREAK_IF; the store
+// that follows is not range-checked.
+void BlockKernelManager::pushPrivateSurface(GraphicsAllocation *allocation, size_t ordinal) {
+    if (blockPrivateSurfaceArray.size() < blockKernelInfoArray.size()) {
+        blockPrivateSurfaceArray.resize(blockKernelInfoArray.size(), nullptr);
+    }
+
+    DEBUG_BREAK_IF(ordinal >= blockPrivateSurfaceArray.size());
+
+    blockPrivateSurfaceArray[ordinal] = allocation;
+}
+
+// Looks up the private surface recorded for the block kernel at `ordinal`.
+// Returns nullptr when `ordinal` lies outside the surface table, which is the
+// case when no private surface has been pushed.
+GraphicsAllocation *BlockKernelManager::getPrivateSurface(size_t ordinal) {
+    const bool outOfRange = (ordinal >= blockPrivateSurfaceArray.size());
+    return outOfRange ? nullptr : blockPrivateSurfaceArray[ordinal];
+}
+} // namespace OCLRT
--- a/runtime/program/block_kernel_manager.h
+++ b/runtime/program/block_kernel_manager.h
@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+#include "kernel_info.h"
+#include "runtime/api/cl_types.h"
+#include <vector>
+
+namespace OCLRT {
+class GraphicsAllocation;
+
+// Keeps the KernelInfo objects of block kernels together with a per-block
+// table of private surface allocations.
+//
+// KernelInfo pointers passed to addBlockKernelInfo() are deleted by the
+// destructor. The GraphicsAllocation pointers are only stored; the destructor
+// does not release them.
+class BlockKernelManager {
+  public:
+    BlockKernelManager() = default;
+    virtual ~BlockKernelManager();
+
+    // The destructor deletes the stored KernelInfo pointers, so a memberwise
+    // copy would delete each of them twice. Copying is therefore disabled.
+    BlockKernelManager(const BlockKernelManager &) = delete;
+    BlockKernelManager &operator=(const BlockKernelManager &) = delete;
+
+    // Registers a block kernel; see the destructor note above for ownership.
+    void addBlockKernelInfo(KernelInfo *);
+    // Returns the block kernel registered at `ordinal`.
+    const KernelInfo *getBlockKernelInfo(size_t ordinal);
+    // Number of registered block kernels.
+    size_t getCount() const {
+        return blockKernelInfoArray.size();
+    }
+    // True once any registered block kernel has a printf surface entry.
+    bool getIfBlockUsesPrintf() const {
+        return blockUsesPrintf;
+    }
+
+    // Stores / retrieves the private surface associated with block `ordinal`.
+    void pushPrivateSurface(GraphicsAllocation *allocation, size_t ordinal);
+    GraphicsAllocation *getPrivateSurface(size_t ordinal);
+
+  protected:
+    bool blockUsesPrintf = false;
+    std::vector<KernelInfo *> blockKernelInfoArray;
+    std::vector<GraphicsAllocation *> blockPrivateSurfaceArray;
+};
+} // namespace OCLRT
--- a/runtime/program/build.cpp
+++ b/runtime/program/build.cpp
@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "config.h"
+#include "runtime/compiler_interface/compiler_interface.h"
+#include "runtime/os_interface/debug_settings_manager.h"
+#include "runtime/platform/platform.h"
+#include "runtime/helpers/validators.h"
+#include "program.h"
+#include <cstring>
+
+namespace OCLRT {
+
+// Builds the program: unless it was created from a binary, the source is
+// handed to the compiler interface; afterwards the generated binary is
+// processed and block kernels are separated.
+//
+// numDevices / deviceList: must both be empty or both be set, otherwise
+//     CL_INVALID_VALUE. Only the first entry of deviceList is validated.
+// buildOptions: option string for the compiler; nullptr is treated as "".
+// funcNotify / userData: optional callback. userData without a callback gives
+//     CL_INVALID_VALUE. A non-null callback is invoked before returning,
+//     whatever the outcome.
+// enableCaching: forwarded to CompilerInterface::build().
+//
+// Returns CL_SUCCESS or the first error encountered. buildStatus and
+// programBinaryType are updated from the result on every exit path.
+cl_int Program::build(
+    cl_uint numDevices,
+    const cl_device_id *deviceList,
+    const char *buildOptions,
+    void(CL_CALLBACK *funcNotify)(cl_program program, void *userData),
+    void *userData,
+    bool enableCaching) {
+    cl_int retVal = CL_SUCCESS;
+
+    // Single-pass loop: `break` jumps to the common status/callback epilogue.
+    do {
+        if (((deviceList == nullptr) && (numDevices != 0)) ||
+            ((deviceList != nullptr) && (numDevices == 0))) {
+            retVal = CL_INVALID_VALUE;
+            break;
+        }
+
+        if ((funcNotify == nullptr) &&
+            (userData != nullptr)) {
+            retVal = CL_INVALID_VALUE;
+            break;
+        }
+
+        // if a device_list is specified, make sure it points to our device
+        // NOTE: a null device_list is ok - it means "all devices"
+        if (deviceList && validateObject(*deviceList) != CL_SUCCESS) {
+            retVal = CL_INVALID_DEVICE;
+            break;
+        }
+
+        // check to see if a previous build request is in progress
+        if (buildStatus == CL_BUILD_IN_PROGRESS) {
+            retVal = CL_INVALID_OPERATION;
+            break;
+        }
+
+        if (isCreatedFromBinary == false) {
+            buildStatus = CL_BUILD_IN_PROGRESS;
+
+            options = (buildOptions) ? buildOptions : "";
+            const std::string reraStr = "-cl-intel-gtpin-rera";
+            const size_t pos = options.find(reraStr);
+            if (pos != std::string::npos) {
+                // build option "-cl-intel-gtpin-rera" is present, move it to internalOptions
+                options.erase(pos, reraStr.length());
+                // internalOptions may already end in a flag without a trailing
+                // blank; insert a separator so the two flags do not fuse into
+                // a single unrecognized token.
+                if (!internalOptions.empty() && internalOptions.back() != ' ') {
+                    internalOptions.append(" ");
+                }
+                internalOptions.append(reraStr);
+                internalOptions.append(" ");
+            }
+
+            CompilerInterface *pCompilerInterface = getCompilerInterface();
+            if (!pCompilerInterface) {
+                retVal = CL_OUT_OF_HOST_MEMORY;
+                break;
+            }
+
+            TranslationArgs inputArgs = {};
+            // Nothing to build when the source starts with a terminator.
+            if (strcmp(sourceCode.c_str(), "") == 0) {
+                retVal = CL_INVALID_PROGRAM;
+                break;
+            }
+
+            internalOptions.append(platform()->getCompilerExtensions());
+            inputArgs.pInput = const_cast<char *>(sourceCode.c_str());
+            inputArgs.InputSize = static_cast<uint32_t>(sourceCode.size());
+            inputArgs.pOptions = options.c_str();
+            inputArgs.OptionsSize = static_cast<uint32_t>(options.length());
+            inputArgs.pInternalOptions = internalOptions.c_str();
+            inputArgs.InternalOptionsSize = static_cast<uint32_t>(internalOptions.length());
+            inputArgs.pTracingOptions = nullptr;
+            inputArgs.TracingOptionsCount = 0;
+            DBG_LOG(LogApiCalls,
+                    "Build Options", inputArgs.pOptions,
+                    "\nBuild Internal Options", inputArgs.pInternalOptions);
+
+            retVal = pCompilerInterface->build(*this, inputArgs, enableCaching);
+            if (retVal != CL_SUCCESS) {
+                break;
+            }
+        }
+        updateNonUniformFlag();
+
+        retVal = processGenBinary();
+        if (retVal != CL_SUCCESS) {
+            break;
+        }
+
+        separateBlockKernels();
+    } while (false);
+
+    if (retVal != CL_SUCCESS) {
+        buildStatus = CL_BUILD_ERROR;
+        programBinaryType = CL_PROGRAM_BINARY_TYPE_NONE;
+    } else {
+        buildStatus = CL_BUILD_SUCCESS;
+        programBinaryType = CL_PROGRAM_BINARY_TYPE_EXECUTABLE;
+    }
+
+    if (funcNotify != nullptr) {
+        (*funcNotify)(this, userData);
+    }
+
+    return retVal;
+}
+
+// Builds the program for a single device and, on success, attaches to each
+// kernel the dispatch builder registered in builtinsMap under the kernel's
+// name. Kernels without a matching entry are left unchanged.
+// Returns the result of the underlying build() call.
+cl_int Program::build(const cl_device_id device, const char *buildOptions, bool enableCaching,
+                      std::unordered_map<std::string, BuiltinDispatchInfoBuilder *> &builtinsMap) {
+    const cl_int buildResult = this->build(1, &device, buildOptions, nullptr, nullptr, enableCaching);
+
+    if (buildResult == CL_SUCCESS) {
+        for (auto &kernelInfo : this->kernelInfoArray) {
+            auto match = builtinsMap.find(kernelInfo->name);
+            if (match != builtinsMap.end()) {
+                kernelInfo->builtinDispatchBuilder = match->second;
+            }
+        }
+    }
+
+    return buildResult;
+}
+
+// Processes a single kernel blob through processKernel() and returns the
+// status it reports. kernelDataSize is not consulted here.
+cl_int Program::build(
+    const char *pKernelData,
+    size_t kernelDataSize) {
+    cl_int status = CL_SUCCESS;
+
+    processKernel(pKernelData, status);
+    return status;
+}
+}
--- a/runtime/program/compile.cpp
+++ b/runtime/program/compile.cpp
@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "config.h"
+#include "elf/writer.h"
+#include "runtime/compiler_interface/compiler_interface.h"
+#include "runtime/platform/platform.h"
+#include "runtime/helpers/validators.h"
+#include "program.h"
+#include <cstring>
+
+namespace OCLRT {
+
+// Compiles the program source together with the supplied header programs.
+//
+// The main source and every header are packed as sections into an ELF
+// container, which is then handed to CompilerInterface::compile().
+//
+// numDevices / deviceList: must both be empty or both be set, otherwise
+//     CL_INVALID_VALUE. Only the first entry of deviceList is validated.
+// buildOptions: option string for the compiler; nullptr is treated as "".
+// numInputHeaders / inputHeaders / headerIncludeNames: header programs and
+//     their include names; the two arrays must be null exactly when
+//     numInputHeaders is 0, otherwise CL_INVALID_VALUE.
+// funcNotify / userData: optional callback. userData without a callback gives
+//     CL_INVALID_VALUE. A non-null callback is invoked before returning,
+//     whatever the outcome.
+//
+// Returns CL_SUCCESS or the first error encountered. buildStatus and
+// programBinaryType are updated from the result, and internalOptions is
+// cleared, on every exit path.
+cl_int Program::compile(
+    cl_uint numDevices,
+    const cl_device_id *deviceList,
+    const char *buildOptions,
+    cl_uint numInputHeaders,
+    const cl_program *inputHeaders,
+    const char **headerIncludeNames,
+    void(CL_CALLBACK *funcNotify)(cl_program program, void *userData),
+    void *userData) {
+    cl_int retVal = CL_SUCCESS;
+    cl_program program = nullptr;
+    CLElfLib::CElfWriter *pElfWriter = nullptr;
+    Program *pHeaderProgObj = nullptr;
+    size_t compileDataSize = 0;
+    char *pCompileData = nullptr;
+
+    // Single-pass loop: `break` jumps to the common cleanup/callback epilogue.
+    do {
+        if (((deviceList == nullptr) && (numDevices != 0)) ||
+            ((deviceList != nullptr) && (numDevices == 0))) {
+            retVal = CL_INVALID_VALUE;
+            break;
+        }
+
+        if (numInputHeaders == 0) {
+            if ((headerIncludeNames != nullptr) || (inputHeaders != nullptr)) {
+                retVal = CL_INVALID_VALUE;
+                break;
+            }
+        } else {
+            if ((headerIncludeNames == nullptr) || (inputHeaders == nullptr)) {
+                retVal = CL_INVALID_VALUE;
+                break;
+            }
+        }
+
+        if ((funcNotify == nullptr) &&
+            (userData != nullptr)) {
+            retVal = CL_INVALID_VALUE;
+            break;
+        }
+
+        // if a device_list is specified, make sure it points to our device
+        // NOTE: a null device_list is ok - it means "all devices"
+        if ((deviceList != nullptr) && validateObject(*deviceList) != CL_SUCCESS) {
+            retVal = CL_INVALID_DEVICE;
+            break;
+        }
+
+        if (buildStatus == CL_BUILD_IN_PROGRESS) {
+            retVal = CL_INVALID_OPERATION;
+            break;
+        }
+
+        buildStatus = CL_BUILD_IN_PROGRESS;
+
+        options = (buildOptions != nullptr) ? buildOptions : "";
+        const std::string reraStr = "-cl-intel-gtpin-rera";
+        const size_t pos = options.find(reraStr);
+        if (pos != std::string::npos) {
+            // compile option "-cl-intel-gtpin-rera" is present, move it to internalOptions
+            options.erase(pos, reraStr.length());
+            // internalOptions may already end in a flag without a trailing
+            // blank; insert a separator so the two flags do not fuse into a
+            // single unrecognized token.
+            if (!internalOptions.empty() && internalOptions.back() != ' ') {
+                internalOptions.append(" ");
+            }
+            internalOptions.append(reraStr);
+            internalOptions.append(" ");
+        }
+
+        // create ELF writer to process all sources to be compiled
+        pElfWriter = CLElfLib::CElfWriter::create(CLElfLib::EH_TYPE_OPENCL_SOURCE, CLElfLib::EH_MACHINE_NONE, 0);
+        UNRECOVERABLE_IF(pElfWriter == nullptr);
+
+        CLElfLib::SSectionNode sectionNode;
+
+        // create main section
+        sectionNode.Name = "CLMain";
+        sectionNode.pData = const_cast<char *>(sourceCode.c_str());
+        sectionNode.DataSize = static_cast<unsigned int>(strlen(sourceCode.c_str()) + 1);
+        sectionNode.Flags = 0;
+        sectionNode.Type = CLElfLib::SH_TYPE_OPENCL_SOURCE;
+
+        // add main program's source
+        pElfWriter->addSection(&sectionNode);
+
+        for (cl_uint i = 0; i < numInputHeaders; i++) {
+            program = inputHeaders[i];
+            if (program == nullptr) {
+                retVal = CL_INVALID_PROGRAM;
+                break;
+            }
+            pHeaderProgObj = castToObject<Program>(program);
+            if (pHeaderProgObj == nullptr) {
+                retVal = CL_INVALID_PROGRAM;
+                break;
+            }
+            sectionNode.Name = headerIncludeNames[i];
+            sectionNode.Type = CLElfLib::SH_TYPE_OPENCL_HEADER;
+            sectionNode.Flags = 0;
+            // collect required data from the header
+            retVal = pHeaderProgObj->getSource(sectionNode.pData, sectionNode.DataSize);
+            if (retVal != CL_SUCCESS) {
+                break;
+            }
+            pElfWriter->addSection(&sectionNode);
+        }
+        // the header loop only leaves itself; propagate its failure outward
+        if (retVal != CL_SUCCESS) {
+            break;
+        }
+
+        // first call with a null buffer obtains the size, second call fills the buffer
+        pElfWriter->resolveBinary(nullptr, compileDataSize);
+        pCompileData = new char[compileDataSize];
+        pElfWriter->resolveBinary(pCompileData, compileDataSize);
+
+        CompilerInterface *pCompilerInterface = getCompilerInterface();
+        if (!pCompilerInterface) {
+            retVal = CL_OUT_OF_HOST_MEMORY;
+            break;
+        }
+
+        TranslationArgs inputArgs = {};
+
+        // set parameters for compilation
+        internalOptions.append(platform()->getCompilerExtensions());
+        inputArgs.pInput = pCompileData;
+        inputArgs.InputSize = static_cast<uint32_t>(compileDataSize);
+        inputArgs.pOptions = options.c_str();
+        inputArgs.OptionsSize = static_cast<uint32_t>(options.length());
+        inputArgs.pInternalOptions = internalOptions.c_str();
+        inputArgs.InternalOptionsSize = static_cast<uint32_t>(internalOptions.length());
+        inputArgs.pTracingOptions = nullptr;
+        inputArgs.TracingOptionsCount = 0;
+
+        retVal = pCompilerInterface->compile(*this, inputArgs);
+        if (retVal != CL_SUCCESS) {
+            break;
+        }
+        updateNonUniformFlag();
+    } while (false);
+
+    if (retVal != CL_SUCCESS) {
+        buildStatus = CL_BUILD_ERROR;
+        programBinaryType = CL_PROGRAM_BINARY_TYPE_NONE;
+    } else {
+        buildStatus = CL_BUILD_SUCCESS;
+        programBinaryType = CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;
+    }
+
+    CLElfLib::CElfWriter::destroy(pElfWriter);
+    delete[] pCompileData;
+    internalOptions.clear();
+
+    if (funcNotify != nullptr) {
+        (*funcNotify)(this, userData);
+    }
+
+    return retVal;
+}
+} // namespace OCLRT
--- a/runtime/program/create.cpp
+++ b/runtime/program/create.cpp
@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "runtime/program/create.inl"
+#include "runtime/program/program.h"
+
+namespace OCLRT {
+// Explicit instantiations, for T = Program, of the factory templates defined
+// in create.inl: creation from binaries, from source strings, from a single
+// null-terminated source string, and from IL.
+template Program *Program::create<Program>(cl_context, cl_uint, const cl_device_id *, const size_t *, const unsigned char **, cl_int *, cl_int &);
+template Program *Program::create<Program>(cl_context, cl_uint, const char **, const size_t *, cl_int &);
+template Program *Program::create<Program>(const char *, Context *, Device &, bool, cl_int *);
+template Program *Program::createFromIL<Program>(Context *, const void *, size_t length, cl_int &);
+}
--- a/runtime/program/create.inl
+++ b/runtime/program/create.inl
@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "runtime/program/program.h"
+#include "runtime/context/context.h"
+
+namespace OCLRT {
+
+// Creates a program object of type T from a device binary.
+//
+// Only binaries[0] / lengths[0] are consumed; numDevices and deviceList are
+// not inspected here.
+// binaryStatus: optional; receives the load result of the binary, i.e. the
+//     value returned by createProgramFromBinary(), so that a failed load is
+//     no longer reported as CL_SUCCESS.
+// errcodeRet: receives the same result.
+//
+// Returns the new program, or nullptr when loading the binary failed.
+template <typename T>
+T *Program::create(
+    cl_context context,
+    cl_uint numDevices,
+    const cl_device_id *deviceList,
+    const size_t *lengths,
+    const unsigned char **binaries,
+    cl_int *binaryStatus,
+    cl_int &errcodeRet) {
+    auto pContext = castToObject<Context>(context);
+    DEBUG_BREAK_IF(!pContext);
+
+    auto program = new T(pContext);
+
+    auto retVal = program->createProgramFromBinary(binaries[0], lengths[0]);
+
+    if (binaryStatus) {
+        DEBUG_BREAK_IF(retVal != CL_SUCCESS);
+        *binaryStatus = retVal;
+    }
+
+    if (retVal != CL_SUCCESS) {
+        delete program;
+        program = nullptr;
+    }
+
+    errcodeRet = retVal;
+    return program;
+}
+
+// Creates a program object of type T from `count` source strings.
+//
+// The strings are merged by createCombinedString(); when that succeeds a new
+// T is constructed for the context and takes over the merged text as its
+// sourceCode. errcodeRet receives the merge status.
+//
+// Returns the new program, or nullptr when merging the strings failed.
+template <typename T>
+T *Program::create(
+    cl_context context,
+    cl_uint count,
+    const char **strings,
+    const size_t *lengths,
+    cl_int &errcodeRet) {
+    std::string mergedSource;
+    size_t mergedSourceSize = 0;
+
+    auto pContext = castToObject<Context>(context);
+    DEBUG_BREAK_IF(!pContext);
+
+    const auto status = createCombinedString(mergedSource, mergedSourceSize, count, strings, lengths);
+
+    T *program = nullptr;
+    if (status == CL_SUCCESS) {
+        program = new T(pContext);
+        program->sourceCode.swap(mergedSource);
+    }
+
+    errcodeRet = status;
+    return program;
+}
+
+// Creates a program object of type T from one null-terminated source string.
+//
+// context: stored in the program; its internal reference count is increased
+//     only when it is non-null and the program is not a built-in.
+// device: stored as the program's single device.
+// isBuiltIn: stored in the program.
+// errcodeRet: optional; receives CL_INVALID_VALUE when nullTerminatedString
+//     is nullptr, CL_SUCCESS otherwise.
+//
+// Returns the new program, or nullptr for a null source string.
+template <typename T>
+T *Program::create(
+    const char *nullTerminatedString,
+    Context *context,
+    Device &device,
+    bool isBuiltIn,
+    cl_int *errcodeRet) {
+    if (nullTerminatedString == nullptr) {
+        if (errcodeRet) {
+            *errcodeRet = CL_INVALID_VALUE;
+        }
+        return nullptr;
+    }
+
+    T *program = new T();
+    program->setSource((char *)nullTerminatedString);
+    program->context = context;
+    program->isBuiltIn = isBuiltIn;
+
+    const bool holdsContextRef = program->context && !program->isBuiltIn;
+    if (holdsContextRef) {
+        program->context->incRefInternal();
+    }
+
+    program->pDevice = &device;
+    program->numDevices = 1;
+
+    if (is32bit || DebugManager.flags.DisableStatelessToStatefulOptimization.get()) {
+        program->internalOptions += "-cl-intel-greater-than-4GB-buffer-required";
+    }
+
+    if (errcodeRet) {
+        *errcodeRet = CL_SUCCESS;
+    }
+    return program;
+}
+
+// Creates a program object of type T from an intermediate-language blob.
+//
+// errcodeRet receives CL_INVALID_BINARY when `il` is nullptr or `length` is 0,
+// otherwise the result of createProgramFromBinary().
+//
+// Returns the new program, or nullptr on any failure.
+template <typename T>
+T *Program::createFromIL(Context *ctx,
+                         const void *il,
+                         size_t length,
+                         cl_int &errcodeRet) {
+    const bool hasPayload = (il != nullptr) && (length != 0);
+    if (!hasPayload) {
+        errcodeRet = CL_INVALID_BINARY;
+        return nullptr;
+    }
+
+    errcodeRet = CL_SUCCESS;
+
+    T *program = new T(ctx, false);
+    errcodeRet = program->createProgramFromBinary(il, length);
+    if (errcodeRet == CL_SUCCESS) {
+        return program;
+    }
+
+    delete program;
+    return nullptr;
+}
+} // namespace OCLRT
--- a/runtime/program/evaluate_unhandled_token.cpp
+++ b/runtime/program/evaluate_unhandled_token.cpp
@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "runtime/program/program.h"
+
+namespace OCLRT {
+
+// Tells whether an unhandled token may be skipped.
+// This implementation does not look at `token` and always answers false.
+bool Program::isSafeToSkipUnhandledToken(unsigned int token) const {
+    return false;
+}
+
+} // namespace OCLRT
--- a/runtime/program/get_info.cpp
+++ b/runtime/program/get_info.cpp
@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "runtime/context/context.h"
+#include "runtime/helpers/base_object.h"
+#include "runtime/helpers/get_info.h"
+#include "runtime/helpers/validators.h"
+#include "program.h"
+
+namespace OCLRT {
+
+// Answers a program info query.
+//
+// paramName selects the value; the switch below determines where the value
+// lives (pSrc), how many bytes are copied (srcSize) and which size is reported
+// back (retSize). The copy itself is done by ::getInfo().
+//
+// paramValueSizeRet, when non-null, receives retSize even when an error is
+// returned (it stays 0 for an unknown paramName). The one exception is the
+// CL_PROGRAM_IL early return, which reports 0.
+//
+// Returns CL_INVALID_VALUE for an unknown paramName or a too small pointer
+// array, CL_INVALID_PROGRAM_EXECUTABLE for kernel queries on a program that
+// was not built successfully, otherwise the result of ::getInfo().
+cl_int Program::getInfo(cl_program_info paramName, size_t paramValueSize,
+                        void *paramValue, size_t *paramValueSizeRet) {
+    cl_int retVal = CL_SUCCESS;
+    const void *pSrc = nullptr;
+    size_t srcSize = 0;
+    size_t retSize = 0;
+    std::string kernelNamesString;
+    cl_device_id device_id = pDevice;
+    cl_uint refCount = 0;
+    size_t numKernels;
+    cl_context clContext = context;
+
+    switch (paramName) {
+    case CL_PROGRAM_CONTEXT:
+        pSrc = &clContext;
+        retSize = srcSize = sizeof(clContext);
+        break;
+
+    case CL_PROGRAM_BINARIES:
+        // paramValue is an array of destination pointers: the reported size is
+        // that of one pointer, while the binary itself is copied to the buffer
+        // the first array entry points at.
+        resolveProgramBinary();
+        pSrc = elfBinary;
+        retSize = sizeof(void **);
+        srcSize = elfBinarySize;
+        if (paramValue != nullptr) {
+            if (paramValueSize < retSize) {
+                retVal = CL_INVALID_VALUE;
+                break;
+            }
+            paramValueSize = srcSize;
+            paramValue = *(void **)paramValue;
+        }
+        break;
+
+    case CL_PROGRAM_BINARY_SIZES:
+        // the value handed out is the size_t stored in elfBinarySize
+        resolveProgramBinary();
+        pSrc = &elfBinarySize;
+        retSize = srcSize = sizeof(size_t);
+        break;
+
+    case CL_PROGRAM_KERNEL_NAMES:
+        kernelNamesString = getKernelNamesString();
+        pSrc = kernelNamesString.c_str();
+        retSize = srcSize = kernelNamesString.length() + 1;
+
+        if (buildStatus != CL_BUILD_SUCCESS) {
+            retVal = CL_INVALID_PROGRAM_EXECUTABLE;
+        }
+        break;
+
+    case CL_PROGRAM_NUM_KERNELS:
+        numKernels = kernelInfoArray.size();
+        pSrc = &numKernels;
+        retSize = srcSize = sizeof(numKernels);
+
+        if (buildStatus != CL_BUILD_SUCCESS) {
+            retVal = CL_INVALID_PROGRAM_EXECUTABLE;
+        }
+        break;
+
+    case CL_PROGRAM_NUM_DEVICES:
+        pSrc = &numDevices;
+        retSize = srcSize = sizeof(cl_uint);
+        break;
+
+    case CL_PROGRAM_DEVICES:
+        pSrc = &device_id;
+        retSize = srcSize = sizeof(cl_device_id);
+        break;
+
+    case CL_PROGRAM_REFERENCE_COUNT:
+        refCount = static_cast<cl_uint>(this->getReference());
+        retSize = srcSize = sizeof(refCount);
+        pSrc = &refCount;
+        break;
+
+    case CL_PROGRAM_SOURCE:
+        // size is measured up to the first terminator, plus the terminator
+        pSrc = sourceCode.c_str();
+        retSize = srcSize = strlen(sourceCode.c_str()) + 1;
+        break;
+
+    case CL_PROGRAM_IL:
+        pSrc = sourceCode.data();
+        retSize = srcSize = sourceCode.size();
+        // a program whose stored source is not a valid SPIR-V binary reports
+        // an empty IL: success, size 0, nothing copied
+        if (!Program::isValidSpirvBinary(pSrc, srcSize)) {
+            if (paramValueSizeRet) {
+                *paramValueSizeRet = 0;
+            }
+            return CL_SUCCESS;
+        }
+        break;
+
+    case CL_PROGRAM_DEBUG_INFO_SIZES_INTEL:
+        resolveProgramBinary();
+        retSize = srcSize = sizeof(debugDataSize);
+        pSrc = &debugDataSize;
+        break;
+
+    case CL_PROGRAM_DEBUG_INFO_INTEL:
+        // same indirection as CL_PROGRAM_BINARIES: paramValue is an array of
+        // destination pointers and the data is copied through its first entry
+        resolveProgramBinary();
+        pSrc = debugData;
+        retSize = numDevices * sizeof(void **);
+        srcSize = debugDataSize;
+        if (paramValue != nullptr) {
+            if (paramValueSize < retSize) {
+                retVal = CL_INVALID_VALUE;
+                break;
+            }
+            paramValueSize = srcSize;
+            paramValue = *(void **)paramValue;
+        }
+        break;
+
+    default:
+        retVal = CL_INVALID_VALUE;
+        break;
+    }
+
+    // copy only when the query itself was accepted
+    retVal = (retVal == CL_SUCCESS)
+                 ? ::getInfo(paramValue, paramValueSize, pSrc, srcSize)
+                 : retVal;
+    if (paramValueSizeRet) {
+        *paramValueSizeRet = retSize;
+    }
+    return retVal;
+}
+
+// Answers a program build info query for `device`.
+//
+// Returns CL_INVALID_DEVICE when `device` is not the program's device or does
+// not validate, CL_INVALID_VALUE for an unknown paramName, otherwise the
+// result of ::getInfo(). For every query that passes the device checks,
+// paramValueSizeRet (when non-null) receives the size of the value, which is
+// 0 for an unknown paramName.
+cl_int Program::getBuildInfo(cl_device_id device, cl_program_build_info paramName,
+                             size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet) const {
+    const cl_device_id programDevice = pDevice;
+    if (device != programDevice) {
+        return CL_INVALID_DEVICE;
+    }
+    if (validateObjects(device) != CL_SUCCESS) {
+        return CL_INVALID_DEVICE;
+    }
+
+    auto pDev = castToObject<Device>(device);
+
+    const void *valuePtr = nullptr;
+    size_t valueSize = 0;
+    bool knownParam = true;
+
+    switch (paramName) {
+    case CL_PROGRAM_BUILD_STATUS:
+        valuePtr = &buildStatus;
+        valueSize = sizeof(cl_build_status);
+        break;
+
+    case CL_PROGRAM_BUILD_OPTIONS:
+        valuePtr = options.c_str();
+        valueSize = strlen(options.c_str()) + 1;
+        break;
+
+    case CL_PROGRAM_BUILD_LOG: {
+        // a missing log is reported as an empty string
+        const char *logText = getBuildLog(pDev);
+        if (logText == nullptr) {
+            logText = "";
+        }
+        valuePtr = logText;
+        valueSize = strlen(logText) + 1;
+    } break;
+
+    case CL_PROGRAM_BINARY_TYPE:
+        valuePtr = &programBinaryType;
+        valueSize = sizeof(cl_program_binary_type);
+        break;
+
+    case CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE:
+        valuePtr = &globalVarTotalSize;
+        valueSize = sizeof(size_t);
+        break;
+
+    default:
+        knownParam = false;
+        break;
+    }
+
+    cl_int retVal = CL_INVALID_VALUE;
+    if (knownParam) {
+        retVal = ::getInfo(paramValue, paramValueSize, valuePtr, valueSize);
+    }
+
+    if (paramValueSizeRet) {
+        *paramValueSizeRet = valueSize;
+    }
+
+    return retVal;
+}
+} // namespace OCLRT
--- a/runtime/program/heap_info.h
+++ b/runtime/program/heap_info.h
@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+#include <cstdint>
+#include "patch_info.h"
+
+namespace OCLRT {
+
+// Collection of non-owning pointers describing where the parts of a kernel
+// binary live. Every pointer starts out null and blobSize starts at 0.
+// NOTE(review): pGsh / pDsh / pSsh presumably stand for the general, dynamic
+// and surface state heaps - confirm against the binary format definition.
+struct HeapInfo {
+    const SKernelBinaryHeaderCommon *pKernelHeader = nullptr;
+    const void *pKernelHeap = nullptr;
+    const void *pGsh = nullptr;
+    const void *pDsh = nullptr;
+    void *pSsh = nullptr; // the only heap pointer that is not pointer-to-const
+    const void *pPatchList = nullptr;
+    const void *pBlob = nullptr;
+    size_t blobSize = 0;
+
+    // Members are set by the default member initializers above.
+    HeapInfo() = default;
+};
+
+} // namespace OCLRT
--- a/runtime/program/kernel_arg_info.h
+++ b/runtime/program/kernel_arg_info.h
@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+#include "config.h"
+#include "CL/cl.h"
+#include <cstdint>
+#include <string>
+#include <vector>
+
+// One patch record for a kernel argument: a destination offset, a byte count
+// and an offset into the source value. All fields default to zero.
+struct KernelArgPatchInfo {
+    uint32_t crossthreadOffset{0};
+    uint32_t size{0};
+    uint32_t sourceOffset{0};
+};
+
+// Per-argument metadata of a kernel: the qualifier strings taken from the
+// binary, their resolved OpenCL enum values, classification flags, and the
+// offsets at which argument-related values get patched.
+struct KernelArgInfo {
+    // Sentinel for "this offset is not used by the argument" (all bits set).
+    static constexpr uint32_t undefinedOffset = static_cast<uint32_t>(-1);
+
+    // Textual description of the argument.
+    std::string name;
+    std::string typeStr;
+    std::string accessQualifierStr;
+    std::string addressQualifierStr;
+    std::string typeQualifierStr;
+
+    uint32_t offsetHeap = 0;
+    // One entry per patch record stored for this argument.
+    std::vector<KernelArgPatchInfo> kernelArgPatchInfoVector;
+    uint32_t slmAlignment = 0;
+
+    // Classification flags; all false until the matching patch token is stored.
+    bool isImage = false;
+    bool isMediaImage = false;
+    bool isMediaBlockImage = false;
+    bool isSampler = false;
+    bool isAccelerator = false;
+    bool isDeviceQueue = false;
+    bool isBuffer = false;
+    uint32_t samplerArgumentType = 0;
+
+    // Optional per-argument offsets; undefinedOffset means "not present".
+    uint32_t offsetImgWidth = undefinedOffset;
+    uint32_t offsetImgHeight = undefinedOffset;
+    uint32_t offsetImgDepth = undefinedOffset;
+    uint32_t offsetChannelDataType = undefinedOffset;
+    uint32_t offsetChannelOrder = undefinedOffset;
+    uint32_t offsetArraySize = undefinedOffset;
+    uint32_t offsetNumSamples = undefinedOffset;
+    uint32_t offsetSamplerSnapWa = undefinedOffset;
+    uint32_t offsetSamplerAddressingMode = undefinedOffset;
+    uint32_t offsetSamplerNormalizedCoords = undefinedOffset;
+    uint32_t offsetVmeMbBlockType = undefinedOffset;
+    uint32_t offsetVmeSubpixelMode = undefinedOffset;
+    uint32_t offsetVmeSadAdjustMode = undefinedOffset;
+    uint32_t offsetVmeSearchPathType = undefinedOffset;
+    uint32_t offsetObjectId = undefinedOffset;
+    uint32_t offsetBufferOffset = undefinedOffset;
+
+    // True once the argument has been registered as requiring patching.
+    bool needPatch = false;
+
+    // Resolved qualifier values; the defaults apply until resolution overwrites them.
+    cl_kernel_arg_access_qualifier accessQualifier = CL_KERNEL_ARG_ACCESS_NONE;
+    cl_kernel_arg_address_qualifier addressQualifier = CL_KERNEL_ARG_ADDRESS_GLOBAL;
+    cl_kernel_arg_type_qualifier typeQualifier = CL_KERNEL_ARG_TYPE_NONE;
+
+    KernelArgInfo() = default;
+};
--- a/runtime/program/kernel_info.cpp
+++ b/runtime/program/kernel_info.cpp
@ -0,0 +1,499 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "hw_cmds.h"
+#include "runtime/device/device.h"
+#include "runtime/helpers/aligned_memory.h"
+#include "runtime/helpers/ptr_math.h"
+#include "runtime/mem_obj/buffer.h"
+#include "runtime/mem_obj/image.h"
+#include "runtime/kernel/kernel.h"
+#include "runtime/sampler/sampler.h"
+#include "runtime/helpers/string.h"
+#include <cstdint>
+#include <cstring>
+#include <map>
+#include <unordered_map>
+#include <sstream>
+
+namespace OCLRT {
+
+// Out-of-class definitions of the WorkloadInfo sentinels; both are all-bits-set.
+const uint32_t WorkloadInfo::undefinedOffset = (uint32_t)-1;
+const uint32_t WorkloadInfo::invalidParentEvent = (uint32_t)-1;
+
+// Maps an access-qualifier spelling to its CL_KERNEL_ARG_ACCESS_* value.
+// Both the plain and the double-underscore spellings are accepted; an empty
+// string and "NONE" both mean CL_KERNEL_ARG_ACCESS_NONE.
+// KernelInfo::resolveKernelInfo() reports CL_INVALID_BINARY for any string not listed here.
+std::unordered_map<std::string, uint32_t> accessQualifierMap = {
+    {"", CL_KERNEL_ARG_ACCESS_NONE},
+    {"NONE", CL_KERNEL_ARG_ACCESS_NONE},
+    {"read_only", CL_KERNEL_ARG_ACCESS_READ_ONLY},
+    {"__read_only", CL_KERNEL_ARG_ACCESS_READ_ONLY},
+    {"write_only", CL_KERNEL_ARG_ACCESS_WRITE_ONLY},
+    {"__write_only", CL_KERNEL_ARG_ACCESS_WRITE_ONLY},
+    {"read_write", CL_KERNEL_ARG_ACCESS_READ_WRITE},
+    {"__read_write", CL_KERNEL_ARG_ACCESS_READ_WRITE},
+};
+
+// Maps an address-space qualifier spelling to its CL_KERNEL_ARG_ADDRESS_* value.
+// An empty string resolves to GLOBAL, while "not_specified" resolves to PRIVATE.
+// KernelInfo::resolveKernelInfo() reports CL_INVALID_BINARY for any string not listed here.
+std::unordered_map<std::string, uint32_t> addressQualifierMap = {
+    {"", CL_KERNEL_ARG_ADDRESS_GLOBAL},
+    {"__global", CL_KERNEL_ARG_ADDRESS_GLOBAL},
+    {"__local", CL_KERNEL_ARG_ADDRESS_LOCAL},
+    {"__private", CL_KERNEL_ARG_ADDRESS_PRIVATE},
+    {"__constant", CL_KERNEL_ARG_ADDRESS_CONSTANT},
+    {"not_specified", CL_KERNEL_ARG_ADDRESS_PRIVATE},
+};
+
+// Pairs a type-qualifier keyword with the bit value it contributes to
+// KernelArgInfo::typeQualifier.
+struct KernelArgumentType {
+    const char *argTypeQualifier;   // keyword searched for in the qualifier string
+    uint64_t argTypeQualifierValue; // value OR-ed in when the keyword is found
+};
+
+// Type-qualifier keywords recognized by KernelInfo::resolveKernelInfo().
+// Matching there is a substring search (strstr), so a keyword is detected
+// anywhere inside the argument's type-qualifier string.
+constexpr KernelArgumentType typeQualifiers[] = {
+    {"const", CL_KERNEL_ARG_TYPE_CONST},
+    {"volatile", CL_KERNEL_ARG_TYPE_VOLATILE},
+    {"restrict", CL_KERNEL_ARG_TYPE_RESTRICT},
+    {"pipe", CL_KERNEL_ARG_TYPE_PIPE},
+};
+
+// Maps a scalar or vector type name (e.g. "float4") to sizeof() of the
+// corresponding host-side cl_* type. Vector widths 2, 3, 4, 8 and 16 are
+// listed for every element type except "half", whose vector forms are only
+// present when the cl_khr_fp16 macro is defined.
+std::map<std::string, size_t> typeSizeMap = {
+    {"char", sizeof(cl_char)},
+    {"char2", sizeof(cl_char2)},
+    {"char3", sizeof(cl_char3)},
+    {"char4", sizeof(cl_char4)},
+    {"char8", sizeof(cl_char8)},
+    {"char16", sizeof(cl_char16)},
+
+    {"uchar", sizeof(cl_uchar)},
+    {"uchar2", sizeof(cl_uchar2)},
+    {"uchar3", sizeof(cl_uchar3)},
+    {"uchar4", sizeof(cl_uchar4)},
+    {"uchar8", sizeof(cl_uchar8)},
+    {"uchar16", sizeof(cl_uchar16)},
+
+    {"short", sizeof(cl_short)},
+    {"short2", sizeof(cl_short2)},
+    {"short3", sizeof(cl_short3)},
+    {"short4", sizeof(cl_short4)},
+    {"short8", sizeof(cl_short8)},
+    {"short16", sizeof(cl_short16)},
+
+    {"ushort", sizeof(cl_ushort)},
+    {"ushort2", sizeof(cl_ushort2)},
+    {"ushort3", sizeof(cl_ushort3)},
+    {"ushort4", sizeof(cl_ushort4)},
+    {"ushort8", sizeof(cl_ushort8)},
+    {"ushort16", sizeof(cl_ushort16)},
+
+    {"int", sizeof(cl_int)},
+    {"int2", sizeof(cl_int2)},
+    {"int3", sizeof(cl_int3)},
+    {"int4", sizeof(cl_int4)},
+    {"int8", sizeof(cl_int8)},
+    {"int16", sizeof(cl_int16)},
+
+    {"uint", sizeof(cl_uint)},
+    {"uint2", sizeof(cl_uint2)},
+    {"uint3", sizeof(cl_uint3)},
+    {"uint4", sizeof(cl_uint4)},
+    {"uint8", sizeof(cl_uint8)},
+    {"uint16", sizeof(cl_uint16)},
+
+    {"long", sizeof(cl_long)},
+    {"long2", sizeof(cl_long2)},
+    {"long3", sizeof(cl_long3)},
+    {"long4", sizeof(cl_long4)},
+    {"long8", sizeof(cl_long8)},
+    {"long16", sizeof(cl_long16)},
+
+    {"ulong", sizeof(cl_ulong)},
+    {"ulong2", sizeof(cl_ulong2)},
+    {"ulong3", sizeof(cl_ulong3)},
+    {"ulong4", sizeof(cl_ulong4)},
+    {"ulong8", sizeof(cl_ulong8)},
+    {"ulong16", sizeof(cl_ulong16)},
+
+    {"half", sizeof(cl_half)},
+
+    {"float", sizeof(cl_float)},
+    {"float2", sizeof(cl_float2)},
+    {"float3", sizeof(cl_float3)},
+    {"float4", sizeof(cl_float4)},
+    {"float8", sizeof(cl_float8)},
+    {"float16", sizeof(cl_float16)},
+
+// Half-precision vector types are only compiled in when cl_khr_fp16 is defined.
+#ifdef cl_khr_fp16
+    {"half2", sizeof(cl_half2)},
+    {"half3", sizeof(cl_half3)},
+    {"half4", sizeof(cl_half4)},
+    {"half8", sizeof(cl_half8)},
+    {"half16", sizeof(cl_half16)},
+#endif
+
+    {"double", sizeof(cl_double)},
+    {"double2", sizeof(cl_double2)},
+    {"double3", sizeof(cl_double3)},
+    {"double4", sizeof(cl_double4)},
+    {"double8", sizeof(cl_double8)},
+    {"double16", sizeof(cl_double16)},
+};
+// Builds a WorkSizeInfo from explicitly supplied values and then derives
+// minWorkGroupSize from them.
+WorkSizeInfo::WorkSizeInfo(uint32_t maxWorkGroupSize, uint32_t hasBarriers, uint32_t simdSize, uint32_t slmTotalSize, GFXCORE_FAMILY coreFamily, uint32_t numThreadsPerSlice, uint32_t localMemSize, bool imgUsed, bool yTiledSurface)
+    : maxWorkGroupSize(maxWorkGroupSize),
+      hasBarriers(hasBarriers),
+      simdSize(simdSize),
+      slmTotalSize(slmTotalSize),
+      coreFamily(coreFamily),
+      numThreadsPerSlice(numThreadsPerSlice),
+      localMemSize(localMemSize),
+      imgUsed(imgUsed),
+      yTiledSurfaces(yTiledSurface) {
+    // minWorkGroupSize depends on the members initialized above.
+    setMinWorkGroupSize();
+}
+// Builds a WorkSizeInfo from the kernel referenced by dispatchInfo and the
+// device that kernel reports, then derives the image flags and minWorkGroupSize.
+WorkSizeInfo::WorkSizeInfo(const DispatchInfo &dispatchInfo) {
+    auto pKernel = dispatchInfo.getKernel();
+    auto &&device = pKernel->getDevice();
+    auto &&deviceInfo = device.getDeviceInfo();
+    auto &&kernelInfo = pKernel->getKernelInfo();
+
+    // Device limits.
+    maxWorkGroupSize = static_cast<uint32_t>(deviceInfo.maxWorkGroupSize);
+    numThreadsPerSlice = static_cast<uint32_t>(deviceInfo.maxNumEUsPerSubSlice);
+    localMemSize = static_cast<uint32_t>(deviceInfo.localMemSize);
+    coreFamily = device.getHardwareInfo().pPlatform->eRenderCoreFamily;
+
+    // Kernel properties.
+    hasBarriers = static_cast<uint32_t>(kernelInfo.patchInfo.executionEnvironment->HasBarriers);
+    simdSize = static_cast<uint32_t>(kernelInfo.getMaxSimdSize());
+    slmTotalSize = static_cast<uint32_t>(pKernel->slmTotalSize);
+
+    setIfUseImg(pKernel);
+    setMinWorkGroupSize();
+}
+// Sets imgUsed and yTiledSurfaces when any argument of pKernel is an image.
+// The flags are never cleared here.
+void WorkSizeInfo::setIfUseImg(Kernel *pKernel) {
+    const auto argCount = pKernel->getKernelArgsNumber();
+    for (auto argIndex = 0u; argIndex < argCount; ++argIndex) {
+        if (!pKernel->getKernelInfo().kernelArgInfo[argIndex].isImage) {
+            continue;
+        }
+        imgUsed = true;
+        yTiledSurfaces = true;
+    }
+}
+// Derives minWorkGroupSize from the barrier and SLM usage of the kernel.
+//
+// - With barriers: numThreadsPerSlice * simdSize divided by the barrier limit
+//   (32 from IGFX_GEN9_CORE onwards, 16 before).
+// - With SLM: maxWorkGroupSize divided by how many times slmTotalSize fits
+//   into localMemSize; the larger of the two constraints wins.
+// - With neither: 0.
+void WorkSizeInfo::setMinWorkGroupSize() {
+    minWorkGroupSize = 0;
+    if (hasBarriers > 0) {
+        uint32_t maxBarriersPerHSlice = (coreFamily >= IGFX_GEN9_CORE) ? 32 : 16;
+        minWorkGroupSize = numThreadsPerSlice * simdSize / maxBarriersPerHSlice;
+    }
+    if (slmTotalSize > 0) {
+        // The integer quotient is 0 when slmTotalSize exceeds localMemSize;
+        // clamp it to 1 so the division below cannot divide by zero.
+        const uint32_t slmFitCount = std::max<uint32_t>(localMemSize / slmTotalSize, 1u);
+        minWorkGroupSize = std::max(maxWorkGroupSize / slmFitCount, minWorkGroupSize);
+    }
+}
+// Chooses the target ratio for the work-group dimensions.
+// With SLM in use the ratio is log(workItems[0]) - log(workItems[1]) and is
+// not strict; otherwise, with Y-tiled surfaces, the fixed YTilingRatioValue is
+// used and is strict. With neither, nothing is modified.
+void WorkSizeInfo::checkRatio(const size_t workItems[3]) {
+    if (slmTotalSize == 0 && !yTiledSurfaces) {
+        return;
+    }
+
+    useRatio = true;
+    if (slmTotalSize > 0) {
+        targetRatio = log((float)workItems[0]) - log((float)workItems[1]);
+        useStrictRatio = false;
+    } else {
+        targetRatio = YTilingRatioValue;
+        useStrictRatio = true;
+    }
+}
+
+// Allocates a default-constructed KernelInfo with new and returns the raw pointer.
+KernelInfo *KernelInfo::create() {
+    auto *pKernelInfo = new KernelInfo();
+    return pKernelInfo;
+}
+
+// Releases the printf string buffers allocated in storePatchToken(const SPatchString *)
+// and the crossThreadData array.
+KernelInfo::~KernelInfo() {
+    kernelArgInfo.clear();
+
+    auto &printfStrings = patchInfo.stringDataMap;
+    for (auto entry = printfStrings.begin(); entry != printfStrings.end(); ++entry) {
+        delete[] entry->second.pStringData;
+    }
+    printfStrings.clear();
+
+    delete[] crossThreadData;
+}
+
+// Stores the textual description of one kernel argument.
+//
+// The strings are read from the bytes that follow the SPatchKernelArgumentInfo
+// token, in this order: address qualifier, access qualifier, argument name,
+// type name (terminated by ';'), type qualifier. Each *Size field of the token
+// gives the distance to the next string.
+//
+// Returns CL_INVALID_BINARY when pkernelArgInfo is null or the type name has
+// no ';' terminator, CL_SUCCESS otherwise.
+cl_int KernelInfo::storeArgInfo(const SPatchKernelArgumentInfo *pkernelArgInfo) {
+    if (pkernelArgInfo == nullptr) {
+        return CL_INVALID_BINARY;
+    }
+
+    uint32_t argNum = pkernelArgInfo->ArgumentNumber;
+    auto pCurArgAttrib = ptrOffset(
+        reinterpret_cast<const char *>(pkernelArgInfo),
+        sizeof(SPatchKernelArgumentInfo));
+
+    resizeKernelArgInfoAndRegisterParameter(argNum);
+    auto &argInfo = kernelArgInfo[argNum];
+
+    argInfo.addressQualifierStr = pCurArgAttrib;
+    pCurArgAttrib += pkernelArgInfo->AddressQualifierSize;
+
+    argInfo.accessQualifierStr = pCurArgAttrib;
+    pCurArgAttrib += pkernelArgInfo->AccessQualifierSize;
+
+    argInfo.name = pCurArgAttrib;
+    pCurArgAttrib += pkernelArgInfo->ArgumentNameSize;
+
+    // Only the part of the type string before the first ';' is kept.
+    auto typeEnd = strchr(pCurArgAttrib, ';');
+    DEBUG_BREAK_IF(typeEnd == nullptr);
+    if (typeEnd == nullptr) {
+        // Without the terminator the length below would be computed from a
+        // null pointer; treat the binary as malformed instead.
+        return CL_INVALID_BINARY;
+    }
+    argInfo.typeStr.assign(pCurArgAttrib, typeEnd - pCurArgAttrib);
+    pCurArgAttrib += pkernelArgInfo->TypeNameSize;
+
+    argInfo.typeQualifierStr = pCurArgAttrib;
+
+    patchInfo.kernelArgumentInfo.push_back(pkernelArgInfo);
+
+    return CL_SUCCESS;
+}
+
+// Records a data-parameter patch (size, offset, source offset) for the
+// argument named by the token; the heap offset is stored as 0.
+void KernelInfo::storeKernelArgument(
+    const SPatchDataParameterBuffer *pDataParameterKernelArg) {
+    const uint32_t argIndex = pDataParameterKernelArg->ArgumentNumber;
+    const uint32_t patchSize = pDataParameterKernelArg->DataSize;
+    const uint32_t patchOffset = pDataParameterKernelArg->Offset;
+    const uint32_t patchSourceOffset = pDataParameterKernelArg->SourceOffset;
+
+    storeKernelArgPatchInfo(argIndex, patchSize, patchOffset, patchSourceOffset, 0);
+}
+
+// Registers a stateless global memory object argument: marks the kernel as
+// using the SSH, records the patch data, flags the argument as a buffer and
+// keeps the token pointer in patchInfo.
+void KernelInfo::storeKernelArgument(
+    const SPatchStatelessGlobalMemoryObjectKernelArgument *pStatelessGlobalKernelArg) {
+    const uint32_t argIndex = pStatelessGlobalKernelArg->ArgumentNumber;
+    const uint32_t sshOffset = pStatelessGlobalKernelArg->SurfaceStateHeapOffset;
+
+    usesSsh = true;
+    storeKernelArgPatchInfo(argIndex,
+                            pStatelessGlobalKernelArg->DataParamSize,
+                            pStatelessGlobalKernelArg->DataParamOffset,
+                            0,
+                            sshOffset);
+    kernelArgInfo[argIndex].isBuffer = true;
+    patchInfo.statelessGlobalMemObjKernelArgs.push_back(pStatelessGlobalKernelArg);
+}
+
+// Registers an image argument: marks the kernel as using the SSH, records the
+// surface state offset, sets the image/media flags from the token type and
+// derives the access qualifier from the token's Writeable field.
+void KernelInfo::storeKernelArgument(
+    const SPatchImageMemoryObjectKernelArgument *pImageMemObjKernelArg) {
+    const uint32_t argIndex = pImageMemObjKernelArg->ArgumentNumber;
+    const uint32_t surfaceStateOffset = pImageMemObjKernelArg->Offset;
+
+    usesSsh = true;
+    storeKernelArgPatchInfo(argIndex, 0, 0, 0, surfaceStateOffset);
+
+    auto &argInfo = kernelArgInfo[argIndex];
+    argInfo.isImage = true;
+
+    if (pImageMemObjKernelArg->Type == iOpenCL::IMAGE_MEMORY_OBJECT_2D_MEDIA) {
+        argInfo.isMediaImage = true;
+    }
+    if (pImageMemObjKernelArg->Type == iOpenCL::IMAGE_MEMORY_OBJECT_2D_MEDIA_BLOCK) {
+        argInfo.isMediaBlockImage = true;
+    }
+
+    if (pImageMemObjKernelArg->Writeable) {
+        argInfo.accessQualifier = CL_KERNEL_ARG_ACCESS_READ_WRITE;
+    } else {
+        argInfo.accessQualifier = CL_KERNEL_ARG_ACCESS_READ_ONLY;
+    }
+
+    patchInfo.imageMemObjKernelArgs.push_back(pImageMemObjKernelArg);
+}
+
+// Registers a global memory object argument: marks the kernel as using the
+// SSH, records the surface state offset, flags the argument as a buffer and
+// keeps the token pointer in patchInfo.
+void KernelInfo::storeKernelArgument(
+    const SPatchGlobalMemoryObjectKernelArgument *pGlobalMemObjKernelArg) {
+    const uint32_t argIndex = pGlobalMemObjKernelArg->ArgumentNumber;
+
+    usesSsh = true;
+    storeKernelArgPatchInfo(argIndex, 0, 0, 0, pGlobalMemObjKernelArg->Offset);
+    kernelArgInfo[argIndex].isBuffer = true;
+
+    patchInfo.globalMemObjKernelArgs.push_back(pGlobalMemObjKernelArg);
+}
+
+// Registers a sampler-like argument and classifies it:
+//  - in the three "ve_*_enhance_intel" kernels it is an accelerator of type SAMPLER_OBJECT_VE;
+//  - otherwise a texture sampler is flagged as a plain sampler;
+//  - any other type is an accelerator and marks the kernel as a VME workload.
+void KernelInfo::storeKernelArgument(
+    const SPatchSamplerKernelArgument *pSamplerArgument) {
+    const uint32_t argIndex = pSamplerArgument->ArgumentNumber;
+
+    storeKernelArgPatchInfo(argIndex, 0, 0, 0, pSamplerArgument->Offset);
+
+    auto &argInfo = kernelArgInfo[argIndex];
+    argInfo.samplerArgumentType = pSamplerArgument->Type;
+
+    const bool isVeKernel = this->name == "ve_enhance_intel" ||
+                            this->name == "ve_dn_enhance_intel" ||
+                            this->name == "ve_dn_di_enhance_intel";
+
+    if (isVeKernel) {
+        argInfo.isAccelerator = true;
+        argInfo.samplerArgumentType = iOpenCL::SAMPLER_OBJECT_VE;
+        return;
+    }
+
+    if (pSamplerArgument->Type == iOpenCL::SAMPLER_OBJECT_TEXTURE) {
+        argInfo.isSampler = true;
+        return;
+    }
+
+    // Only the VME, VE and VD sampler object types are expected on this path.
+    DEBUG_BREAK_IF(pSamplerArgument->Type != iOpenCL::SAMPLER_OBJECT_VME &&
+                   pSamplerArgument->Type != iOpenCL::SAMPLER_OBJECT_VE &&
+                   pSamplerArgument->Type != iOpenCL::SAMPLER_OBJECT_VD);
+    argInfo.isAccelerator = true;
+    isVmeWorkload = true;
+}
+
+// Registers a stateless constant memory object argument. It is handled like a
+// stateless global argument: the token pointer is reinterpreted and appended
+// to patchInfo.statelessGlobalMemObjKernelArgs.
+void KernelInfo::storeKernelArgument(
+    const SPatchStatelessConstantMemoryObjectKernelArgument *pStatelessConstMemObjKernelArg) {
+    const uint32_t argIndex = pStatelessConstMemObjKernelArg->ArgumentNumber;
+    const uint32_t sshOffset = pStatelessConstMemObjKernelArg->SurfaceStateHeapOffset;
+
+    usesSsh = true;
+    storeKernelArgPatchInfo(argIndex,
+                            pStatelessConstMemObjKernelArg->DataParamSize,
+                            pStatelessConstMemObjKernelArg->DataParamOffset,
+                            0,
+                            sshOffset);
+    kernelArgInfo[argIndex].isBuffer = true;
+
+    auto pAsGlobalArg = reinterpret_cast<const SPatchStatelessGlobalMemoryObjectKernelArgument *>(pStatelessConstMemObjKernelArg);
+    patchInfo.statelessGlobalMemObjKernelArgs.push_back(pAsGlobalArg);
+}
+
+// Registers a device-queue argument: flags it as a device queue and records
+// its patch data together with the surface state heap offset.
+void KernelInfo::storeKernelArgument(const SPatchStatelessDeviceQueueKernelArgument *pStatelessDeviceQueueKernelArg) {
+    const uint32_t argIndex = pStatelessDeviceQueueKernelArg->ArgumentNumber;
+
+    // Make sure the slot exists before it is flagged.
+    resizeKernelArgInfoAndRegisterParameter(argIndex);
+    kernelArgInfo[argIndex].isDeviceQueue = true;
+
+    storeKernelArgPatchInfo(argIndex,
+                            pStatelessDeviceQueueKernelArg->DataParamSize,
+                            pStatelessDeviceQueueKernelArg->DataParamOffset,
+                            0,
+                            pStatelessDeviceQueueKernelArg->SurfaceStateHeapOffset);
+}
+
+// Remembers the stateless private surface token and marks the kernel as using the SSH.
+void KernelInfo::storePatchToken(
+    const SPatchAllocateStatelessPrivateSurface *pStatelessPrivateSurfaceArg) {
+    patchInfo.pAllocateStatelessPrivateSurface = pStatelessPrivateSurfaceArg;
+    usesSsh = true;
+}
+
+// Remembers the stateless constant memory surface (with initialization) token
+// and marks the kernel as using the SSH.
+void KernelInfo::storePatchToken(const SPatchAllocateStatelessConstantMemorySurfaceWithInitialization *pStatelessConstantMemorySurfaceWithInitializationArg) {
+    patchInfo.pAllocateStatelessConstantMemorySurfaceWithInitialization = pStatelessConstantMemorySurfaceWithInitializationArg;
+    usesSsh = true;
+}
+
+// Remembers the stateless global memory surface (with initialization) token
+// and marks the kernel as using the SSH.
+void KernelInfo::storePatchToken(const SPatchAllocateStatelessGlobalMemorySurfaceWithInitialization *pStatelessGlobalMemorySurfaceWithInitializationArg) {
+    patchInfo.pAllocateStatelessGlobalMemorySurfaceWithInitialization = pStatelessGlobalMemorySurfaceWithInitializationArg;
+    usesSsh = true;
+}
+
+// Remembers the stateless printf surface token and marks the kernel as using the SSH.
+void KernelInfo::storePatchToken(const SPatchAllocateStatelessPrintfSurface *pStatelessPrintfSurfaceArg) {
+    patchInfo.pAllocateStatelessPrintfSurface = pStatelessPrintfSurfaceArg;
+    usesSsh = true;
+}
+
+// Remembers the stateless event pool surface token and marks the kernel as using the SSH.
+void KernelInfo::storePatchToken(const SPatchAllocateStatelessEventPoolSurface *pStatelessEventPoolSurfaceArg) {
+    patchInfo.pAllocateStatelessEventPoolSurface = pStatelessEventPoolSurfaceArg;
+    usesSsh = true;
+}
+
+// Remembers the stateless default device queue surface token and marks the
+// kernel as using the SSH.
+void KernelInfo::storePatchToken(const SPatchAllocateStatelessDefaultDeviceQueueSurface *pStatelessDefaultDeviceQueueSurfaceArg) {
+    patchInfo.pAllocateStatelessDefaultDeviceQueueSurface = pStatelessDefaultDeviceQueueSurfaceArg;
+    usesSsh = true;
+}
+
+// Copies a printf format string out of the binary and stores it in
+// patchInfo.stringDataMap under the token's Index.
+//
+// The string bytes are read from directly behind the SPatchString token.
+// Tokens with StringSize == 0 are ignored. The copy is released in ~KernelInfo().
+void KernelInfo::storePatchToken(const SPatchString *pStringArg) {
+    PrintfStringInfo printfStringInfo;
+    printfStringInfo.SizeInBytes = pStringArg->StringSize;
+    if (printfStringInfo.SizeInBytes == 0) {
+        return;
+    }
+
+    const uint32_t stringIndex = pStringArg->Index;
+    auto pStringSource = reinterpret_cast<const char *>(pStringArg) + sizeof(SPatchString);
+
+    // new[] reports failure by throwing, so no null check is needed here.
+    printfStringInfo.pStringData = new char[printfStringInfo.SizeInBytes];
+    memcpy_s(printfStringInfo.pStringData, printfStringInfo.SizeInBytes, pStringSource, printfStringInfo.SizeInBytes);
+
+    auto insertResult = patchInfo.stringDataMap.insert(std::pair<uint32_t, PrintfStringInfo>(stringIndex, printfStringInfo));
+    if (!insertResult.second) {
+        // An entry with this index already exists and was kept; free the
+        // buffer that was just allocated so it does not leak.
+        delete[] printfStringInfo.pStringData;
+    }
+}
+
+// Stores the kernel attribute string (read from directly behind the token)
+// and, when it contains "intel_reqd_sub_group_size(<value>)", parses <value>
+// into requiredSubGroupSize.
+void KernelInfo::storePatchToken(const SPatchKernelAttributesInfo *pKernelAttributesInfo) {
+    attributes = reinterpret_cast<const char *>(pKernelAttributesInfo) + sizeof(SPatchKernelAttributesInfo);
+
+    const char *subGroupSizeAttribute = "intel_reqd_sub_group_size(";
+    auto valueBegin = attributes.find(subGroupSizeAttribute);
+    if (valueBegin == std::string::npos) {
+        return;
+    }
+
+    valueBegin += strlen(subGroupSizeAttribute);
+    const auto valueEnd = attributes.find(")", valueBegin);
+
+    std::stringstream valueParser(attributes.substr(valueBegin, valueEnd - valueBegin));
+    valueParser >> requiredSubGroupSize;
+}
+
+// Returns the printf string stored under index, or nullptr when there is none.
+const char *KernelInfo::queryPrintfString(uint32_t index) const {
+    const auto entry = patchInfo.stringDataMap.find(index);
+    if (entry == patchInfo.stringDataMap.end()) {
+        return nullptr;
+    }
+    return entry->second.pStringData;
+}
+
+// Translates the qualifier strings of every kernel argument into their
+// OpenCL enum/bitfield values.
+//
+// Returns CL_INVALID_BINARY as soon as an access or address qualifier string
+// is not found in its lookup map; arguments after the failing one are left
+// untouched. Returns CL_SUCCESS otherwise.
+cl_int KernelInfo::resolveKernelInfo() {
+    for (auto &argInfo : kernelArgInfo) {
+        const auto accessEntry = accessQualifierMap.find(argInfo.accessQualifierStr);
+        if (accessEntry == accessQualifierMap.end()) {
+            return CL_INVALID_BINARY;
+        }
+        argInfo.accessQualifier = accessEntry->second;
+
+        const auto addressEntry = addressQualifierMap.find(argInfo.addressQualifierStr);
+        if (addressEntry == addressQualifierMap.end()) {
+            return CL_INVALID_BINARY;
+        }
+        argInfo.addressQualifier = addressEntry->second;
+
+        // Type qualifiers are detected by substring search and accumulated as bits.
+        for (const auto &qualifier : typeQualifiers) {
+            if (strstr(argInfo.typeQualifierStr.c_str(), qualifier.argTypeQualifier) != nullptr) {
+                argInfo.typeQualifier |= qualifier.argTypeQualifierValue;
+            }
+        }
+    }
+
+    return CL_SUCCESS;
+}
+
+// Appends one patch record to argument argNum (growing kernelArgInfo when
+// needed) and overwrites that argument's heap offset with offsetSSH.
+void KernelInfo::storeKernelArgPatchInfo(uint32_t argNum, uint32_t dataSize, uint32_t dataOffset, uint32_t sourceOffset, uint32_t offsetSSH) {
+    resizeKernelArgInfoAndRegisterParameter(argNum);
+
+    KernelArgPatchInfo patchRecord;
+    patchRecord.size = dataSize;
+    patchRecord.crossthreadOffset = dataOffset;
+    patchRecord.sourceOffset = sourceOffset;
+
+    auto &argInfo = kernelArgInfo[argNum];
+    argInfo.kernelArgPatchInfoVector.push_back(patchRecord);
+    argInfo.offsetHeap = offsetSSH;
+}
+
+// Returns the Count of the sampler state array token, or 0 when the kernel has none.
+size_t KernelInfo::getSamplerStateArrayCount() const {
+    if (!patchInfo.samplerStateArray) {
+        return 0;
+    }
+    return (size_t)patchInfo.samplerStateArray->Count;
+}
+// Returns the sampler count multiplied by the per-sampler state size reported
+// by Sampler::getSamplerStateSize() for hwInfo.
+size_t KernelInfo::getSamplerStateArraySize(const HardwareInfo &hwInfo) const {
+    return getSamplerStateArrayCount() * Sampler::getSamplerStateSize(hwInfo);
+}
+
+// Returns the distance between the sampler state array Offset and its
+// BorderColorOffset, or 0 when the kernel has no sampler state array.
+size_t KernelInfo::getBorderColorStateSize() const {
+    if (!patchInfo.samplerStateArray) {
+        return 0;
+    }
+    size_t stateSize = patchInfo.samplerStateArray->Offset - patchInfo.samplerStateArray->BorderColorOffset;
+    return stateSize;
+}
+
+// Returns the BorderColorOffset of the sampler state array token, or 0 when
+// the kernel has no sampler state array.
+size_t KernelInfo::getBorderColorOffset() const {
+    if (!patchInfo.samplerStateArray) {
+        return 0;
+    }
+    size_t offset = patchInfo.samplerStateArray->BorderColorOffset;
+    return offset;
+}
+
+// Returns DataParameterStreamSize of the data parameter stream token, or 0
+// when the kernel has none.
+uint32_t KernelInfo::getConstantBufferSize() const {
+    if (!patchInfo.dataParameterStream) {
+        return 0;
+    }
+    return patchInfo.dataParameterStream->DataParameterStreamSize;
+}
+} // namespace OCLRT
--- a/runtime/program/kernel_info.h
+++ b/runtime/program/kernel_info.h
@ -0,0 +1,239 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+#include "config.h"
+#include "CL/cl.h"
+#include "heap_info.h"
+#include "kernel_arg_info.h"
+#include "patch_info.h"
+#include "runtime/helpers/hw_info.h"
+#include "runtime/helpers/dispatch_info.h"
+#include <algorithm>
+#include <cstdint>
+#include <cmath>
+#include <vector>
+#include <string>
+#include <unordered_map>
+#include <map>
+
+namespace OCLRT {
+// Forward declarations: these types are only used through pointers,
+// references or declarations in this header.
+class BuiltinDispatchInfoBuilder;
+class Device;
+class Kernel;
+struct KernelInfo;
+struct KernelArgumentType;
+
+// Lookup tables defined in kernel_info.cpp: qualifier spelling -> OpenCL value,
+// and type name -> size in bytes.
+extern std::unordered_map<std::string, uint32_t> accessQualifierMap;
+extern std::unordered_map<std::string, uint32_t> addressQualifierMap;
+extern std::map<std::string, size_t> typeSizeMap;
+
+// Offsets of the dispatch-related values a kernel consumes. Every offset
+// starts out as undefinedOffset; slmStaticSize starts at 0.
+struct WorkloadInfo {
+    // Per-dimension offsets (index 0..2).
+    uint32_t globalWorkOffsetOffsets[3];
+    uint32_t globalWorkSizeOffsets[3];
+    uint32_t localWorkSizeOffsets[3];
+    uint32_t localWorkSizeOffsets2[3];
+    uint32_t enqueuedLocalWorkSizeOffsets[3];
+    uint32_t numWorkGroupsOffset[3];
+    // Scalar offsets and sizes.
+    uint32_t maxWorkGroupSizeOffset;
+    uint32_t workDimOffset;
+    uint32_t slmStaticSize = 0;
+    uint32_t simdSizeOffset;
+    uint32_t parentEventOffset;
+    uint32_t prefferedWkgMultipleOffset;
+
+    static const uint32_t undefinedOffset;
+    static const uint32_t invalidParentEvent;
+
+    WorkloadInfo() {
+        for (uint32_t dim = 0; dim < 3; ++dim) {
+            globalWorkOffsetOffsets[dim] = undefinedOffset;
+            globalWorkSizeOffsets[dim] = undefinedOffset;
+            localWorkSizeOffsets[dim] = undefinedOffset;
+            localWorkSizeOffsets2[dim] = undefinedOffset;
+            enqueuedLocalWorkSizeOffsets[dim] = undefinedOffset;
+            numWorkGroupsOffset[dim] = undefinedOffset;
+        }
+        maxWorkGroupSizeOffset = undefinedOffset;
+        workDimOffset = undefinedOffset;
+        simdSizeOffset = undefinedOffset;
+        parentEventOffset = undefinedOffset;
+        prefferedWkgMultipleOffset = undefinedOffset;
+    }
+};
+
+// Target ratio used by WorkSizeInfo::checkRatio() for Y-tiled surfaces.
+// The value is the natural logarithm of 4; checkRatio() works with
+// differences of logarithms, so this corresponds to a 4:1 ratio.
+static const float YTilingRatioValue = 1.3862943611198906188344642429164f;
+
+// Inputs and derived limits used when choosing a work-group size.
+struct WorkSizeInfo {
+
+    uint32_t maxWorkGroupSize;
+    uint32_t minWorkGroupSize; // computed by setMinWorkGroupSize()
+    uint32_t hasBarriers;
+    uint32_t simdSize;
+    uint32_t slmTotalSize;
+    GFXCORE_FAMILY coreFamily;
+    uint32_t numThreadsPerSlice;
+    uint32_t localMemSize;
+    bool imgUsed{false};
+    bool yTiledSurfaces{false};
+    // Ratio settings; written by checkRatio().
+    bool useRatio{false};
+    bool useStrictRatio{false};
+    float targetRatio{0};
+
+    // Takes every input explicitly.
+    WorkSizeInfo(uint32_t maxWorkGroupSize, uint32_t hasBarriers, uint32_t simdSize, uint32_t slmTotalSize, GFXCORE_FAMILY coreFamily, uint32_t numThreadsPerSlice, uint32_t localMemSize, bool imgUsed, bool yTiledSurface);
+    // Reads the inputs from the kernel and device behind dispatchInfo.
+    WorkSizeInfo(const DispatchInfo &dispatchInfo);
+    void setIfUseImg(Kernel *pKernel);
+    void setMinWorkGroupSize();
+    void checkRatio(const size_t workItems[3]);
+};
+
+// Everything the runtime records about one kernel of a program binary: its
+// name and attributes, heap pointers, patch tokens, per-argument metadata and
+// a handful of derived properties. Instances are non-copyable; the destructor
+// frees the printf string buffers and crossThreadData.
+struct KernelInfo {
+  public:
+    // Allocates a default-constructed instance with new.
+    static KernelInfo *create();
+    KernelInfo() {
+        heapInfo = {};
+        patchInfo = {};
+        workloadInfo = {};
+        kernelArgInfo = {};
+        kernelNonArgInfo = {};
+        childrenKernelsIdOffset = {};
+        reqdWorkGroupSize[0] = WorkloadInfo::undefinedOffset;
+        reqdWorkGroupSize[1] = WorkloadInfo::undefinedOffset;
+        reqdWorkGroupSize[2] = WorkloadInfo::undefinedOffset;
+    }
+
+    KernelInfo(const KernelInfo &) = delete;
+    KernelInfo &operator=(const KernelInfo &) = delete;
+
+    ~KernelInfo();
+
+    // Patch-token intake: each overload records one kind of token.
+    cl_int storeArgInfo(const SPatchKernelArgumentInfo *pkernelArgInfo);
+    void storeKernelArgument(const SPatchDataParameterBuffer *pDataParameterKernelArg);
+    void storeKernelArgument(const SPatchStatelessGlobalMemoryObjectKernelArgument *pStatelessGlobalKernelArg);
+    void storeKernelArgument(const SPatchImageMemoryObjectKernelArgument *pImageMemObjKernelArg);
+    void storeKernelArgument(const SPatchGlobalMemoryObjectKernelArgument *pGlobalMemObjKernelArg);
+    void storeKernelArgument(const SPatchStatelessConstantMemoryObjectKernelArgument *pStatelessConstMemObjKernelArg);
+    void storeKernelArgument(const SPatchStatelessDeviceQueueKernelArgument *pStatelessDeviceQueueKernelArg);
+    void storeKernelArgument(const SPatchSamplerKernelArgument *pSamplerKernelArg);
+    void storePatchToken(const SPatchAllocateStatelessPrivateSurface *pStatelessPrivateSurfaceArg);
+    void storePatchToken(const SPatchAllocateStatelessConstantMemorySurfaceWithInitialization *pStatelessConstantMemorySurfaceWithInitializationArg);
+    void storePatchToken(const SPatchAllocateStatelessGlobalMemorySurfaceWithInitialization *pStatelessGlobalMemorySurfaceWithInitializationArg);
+    void storePatchToken(const SPatchAllocateStatelessPrintfSurface *pStatelessPrintfSurfaceArg);
+    void storePatchToken(const SPatchAllocateStatelessEventPoolSurface *pStatelessEventPoolSurfaceArg);
+    void storePatchToken(const SPatchAllocateStatelessDefaultDeviceQueueSurface *pStatelessDefaultDeviceQueueSurfaceArg);
+    void storePatchToken(const SPatchString *pStringArg);
+    void storePatchToken(const SPatchKernelAttributesInfo *pKernelAttributesInfo);
+    // Converts the stored qualifier strings into OpenCL values.
+    cl_int resolveKernelInfo();
+    // Ensures kernelArgInfo has a slot at index argCount and counts the
+    // argument in argumentsToPatchNum the first time it is seen.
+    void resizeKernelArgInfoAndRegisterParameter(uint32_t argCount) {
+        if (kernelArgInfo.size() <= argCount) {
+            kernelArgInfo.resize(argCount + 1);
+        }
+        if (!kernelArgInfo[argCount].needPatch) {
+            kernelArgInfo[argCount].needPatch = true;
+            argumentsToPatchNum++;
+        }
+    }
+
+    void storeKernelArgPatchInfo(uint32_t argNum, uint32_t dataSize, uint32_t crossthreadOffset, uint32_t sourceOffset, uint32_t offsetSSH);
+
+    // Returns the printf string stored under index, or nullptr.
+    const char *queryPrintfString(uint32_t index) const;
+
+    size_t getSamplerStateArrayCount() const;
+    size_t getSamplerStateArraySize(const HardwareInfo &hwInfo) const;
+    size_t getBorderColorStateSize() const;
+    size_t getBorderColorOffset() const;
+    // Returns 32, 16 or 8 depending on the compiled SIMD flags, or 1 when no
+    // execution environment token is present.
+    unsigned int getMaxSimdSize() const {
+        const auto executionEnvironment = patchInfo.executionEnvironment;
+        if (executionEnvironment == nullptr) {
+            return 1;
+        }
+
+        if (executionEnvironment->CompiledSIMD32) {
+            return 32;
+        }
+
+        if (executionEnvironment->CompiledSIMD16) {
+            return 16;
+        }
+
+        return 8;
+    }
+    bool hasDeviceEnqueue() const {
+        return patchInfo.executionEnvironment ? !!patchInfo.executionEnvironment->HasDeviceEnqueue : false;
+    }
+    bool requiresSubgroupIndependentForwardProgress() const {
+        return patchInfo.executionEnvironment ? !!patchInfo.executionEnvironment->SubgroupIndependentForwardProgressRequired : false;
+    }
+    // Returns the product of the required work-group sizes, or maxWorkGroupSize
+    // when that product is 0, exceeds maxWorkGroupSize, or no execution
+    // environment token is present.
+    size_t getMaxRequiredWorkGroupSize(size_t maxWorkGroupSize) const {
+        const auto executionEnvironment = patchInfo.executionEnvironment;
+        if (executionEnvironment == nullptr) {
+            // Same null handling as the other executionEnvironment accessors.
+            return maxWorkGroupSize;
+        }
+        // Widen before multiplying so the product is computed in size_t.
+        size_t maxRequiredWorkGroupSize = static_cast<size_t>(executionEnvironment->RequiredWorkGroupSizeX) *
+                                          executionEnvironment->RequiredWorkGroupSizeY *
+                                          executionEnvironment->RequiredWorkGroupSizeZ;
+        if ((maxRequiredWorkGroupSize == 0) || (maxRequiredWorkGroupSize > maxWorkGroupSize)) {
+            maxRequiredWorkGroupSize = maxWorkGroupSize;
+        }
+        return maxRequiredWorkGroupSize;
+    }
+
+    uint32_t getConstantBufferSize() const;
+    // Returns the index of the first argument called name, or -1 when there is none.
+    int32_t getArgNumByName(const char *name) const {
+        int32_t argNum = 0;
+        for (auto &arg : kernelArgInfo) {
+            if (arg.name == name) {
+                return argNum;
+            }
+            ++argNum;
+        }
+        return -1;
+    }
+
+    std::string name;
+    std::string attributes;
+    HeapInfo heapInfo;
+    PatchInfo patchInfo;
+    std::vector<KernelArgInfo> kernelArgInfo;
+    std::vector<KernelArgInfo> kernelNonArgInfo;
+    WorkloadInfo workloadInfo;
+    std::vector<std::pair<uint32_t, uint32_t>> childrenKernelsIdOffset;
+    bool usesSsh = false;
+    bool requiresSshForBuffers = false;
+    bool isValid = false;
+    bool isVmeWorkload = false;
+    char *crossThreadData = nullptr; // released with delete[] in the destructor
+    size_t reqdWorkGroupSize[3];
+    size_t requiredSubGroupSize = 0;
+    uint32_t gpuPointerSize = 0;
+    const BuiltinDispatchInfoBuilder *builtinDispatchBuilder = nullptr;
+    uint32_t argumentsToPatchNum = 0;
+    uint32_t systemKernelOffset = 0;
+};
+} // namespace OCLRT
--- a/runtime/program/link.cpp
+++ b/runtime/program/link.cpp
@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "config.h"
+#include "runtime/compiler_interface/compiler_interface.h"
+#include "runtime/platform/platform.h"
+#include "runtime/helpers/validators.h"
+#include "program.h"
+#include "elf/writer.h"
+#include <cstring>
+
+namespace OCLRT {
+
+// Links the LLVM/SPIR-V binaries of inputPrograms into this program.
+// Every input binary is added as a section of an ELF container, which is handed to the
+// compiler interface. When buildOptions contains "-create-library" a library is produced,
+// otherwise an executable is linked and processed with processGenBinary().
+//
+// Returns:
+//   CL_INVALID_VALUE       - deviceList/numDevices mismatch, no input programs, or userData without funcNotify
+//   CL_INVALID_DEVICE      - the first entry of deviceList fails validateObject()
+//   CL_INVALID_OPERATION   - a build of this program is already in progress
+//   CL_INVALID_PROGRAM     - an input program is null, not a Program, or carries no LLVM binary
+//   CL_OUT_OF_HOST_MEMORY  - the ELF writer or the compiler interface is not available
+//   otherwise the status reported by the compiler interface / processGenBinary().
+//
+// funcNotify, when provided, is invoked before returning regardless of the outcome.
+// NOTE(review): every failure, including the early argument checks, ends with
+// buildStatus = CL_BUILD_ERROR and programBinaryType = NONE — confirm this is intended when
+// another build was already in progress.
+cl_int Program::link(
+    cl_uint numDevices,
+    const cl_device_id *deviceList,
+    const char *buildOptions,
+    cl_uint numInputPrograms,
+    const cl_program *inputPrograms,
+    void(CL_CALLBACK *funcNotify)(cl_program program, void *userData),
+    void *userData) {
+    cl_int retVal = CL_SUCCESS;
+    cl_program program;
+    CLElfLib::CElfWriter *pElfWriter = nullptr;
+    Program *pInputProgObj;
+    size_t dataSize = 0;
+    char *pData = nullptr;
+    bool isCreateLibrary;
+    CLElfLib::SSectionNode sectionNode;
+
+    do {
+        if (((deviceList == nullptr) && (numDevices != 0)) ||
+            ((deviceList != nullptr) && (numDevices == 0))) {
+            retVal = CL_INVALID_VALUE;
+            break;
+        }
+
+        if ((numInputPrograms == 0) || (inputPrograms == nullptr)) {
+            retVal = CL_INVALID_VALUE;
+            break;
+        }
+
+        if ((funcNotify == nullptr) &&
+            (userData != nullptr)) {
+            retVal = CL_INVALID_VALUE;
+            break;
+        }
+
+        if ((deviceList != nullptr) && validateObject(*deviceList) != CL_SUCCESS) {
+            retVal = CL_INVALID_DEVICE;
+            break;
+        }
+
+        if (buildStatus == CL_BUILD_IN_PROGRESS) {
+            retVal = CL_INVALID_OPERATION;
+            break;
+        }
+
+        options = (buildOptions != nullptr) ? buildOptions : "";
+
+        isCreateLibrary = (strstr(options.c_str(), "-create-library") != nullptr);
+
+        buildStatus = CL_BUILD_IN_PROGRESS;
+
+        pElfWriter = CLElfLib::CElfWriter::create(CLElfLib::EH_TYPE_OPENCL_OBJECTS, CLElfLib::EH_MACHINE_NONE, 0);
+        if (pElfWriter == nullptr) {
+            // without a writer there is nothing to add the sections to
+            retVal = CL_OUT_OF_HOST_MEMORY;
+            break;
+        }
+
+        // collect the inputs and add each LLVM/SPIR-V binary as one ELF section
+        StackVec<const Program *, 16> inputProgramsInternal;
+        for (cl_uint i = 0; i < numInputPrograms; i++) {
+            program = inputPrograms[i];
+            if (program == nullptr) {
+                retVal = CL_INVALID_PROGRAM;
+                break;
+            }
+            pInputProgObj = castToObject<Program>(program);
+            if (pInputProgObj == nullptr) {
+                retVal = CL_INVALID_PROGRAM;
+                break;
+            }
+            inputProgramsInternal.push_back(pInputProgObj);
+            if ((pInputProgObj->llvmBinary == nullptr) || (pInputProgObj->llvmBinarySize == 0)) {
+                retVal = CL_INVALID_PROGRAM;
+                break;
+            }
+            sectionNode.Name = "";
+            if (pInputProgObj->getIsSpirV()) {
+                sectionNode.Type = CLElfLib::SH_TYPE_SPIRV;
+            } else {
+                sectionNode.Type = CLElfLib::SH_TYPE_OPENCL_LLVM_BINARY;
+            }
+            sectionNode.Flags = 0;
+            sectionNode.pData = pInputProgObj->llvmBinary;
+            sectionNode.DataSize = static_cast<unsigned int>(pInputProgObj->llvmBinarySize);
+
+            pElfWriter->addSection(&sectionNode);
+        }
+        if (retVal != CL_SUCCESS) {
+            break;
+        }
+
+        // first call only queries the required size, second call fills the buffer
+        pElfWriter->resolveBinary(nullptr, dataSize);
+        pData = new char[dataSize];
+        pElfWriter->resolveBinary(pData, dataSize);
+
+        CompilerInterface *pCompilerInterface = getCompilerInterface();
+        if (!pCompilerInterface) {
+            retVal = CL_OUT_OF_HOST_MEMORY;
+            break;
+        }
+
+        TranslationArgs inputArgs = {};
+
+        inputArgs.pInput = pData;
+        inputArgs.InputSize = static_cast<uint32_t>(dataSize);
+        inputArgs.pOptions = options.c_str();
+        inputArgs.OptionsSize = static_cast<uint32_t>(options.length());
+        inputArgs.pInternalOptions = internalOptions.c_str();
+        inputArgs.InternalOptionsSize = static_cast<uint32_t>(internalOptions.length());
+        inputArgs.pTracingOptions = nullptr;
+        inputArgs.TracingOptionsCount = 0;
+
+        if (!isCreateLibrary) {
+            retVal = pCompilerInterface->link(*this, inputArgs);
+            if (retVal != CL_SUCCESS) {
+                break;
+            }
+
+            retVal = processGenBinary();
+            if (retVal != CL_SUCCESS) {
+                break;
+            }
+            programBinaryType = CL_PROGRAM_BINARY_TYPE_EXECUTABLE;
+        } else {
+            retVal = pCompilerInterface->createLibrary(*this, inputArgs);
+            if (retVal != CL_SUCCESS) {
+                break;
+            }
+            programBinaryType = CL_PROGRAM_BINARY_TYPE_LIBRARY;
+        }
+        updateNonUniformFlag(&*inputProgramsInternal.begin(), inputProgramsInternal.size());
+        separateBlockKernels();
+    } while (false);
+
+    if (retVal != CL_SUCCESS) {
+        buildStatus = CL_BUILD_ERROR;
+        programBinaryType = CL_PROGRAM_BINARY_TYPE_NONE;
+    } else {
+        buildStatus = CL_BUILD_SUCCESS;
+    }
+
+    // common cleanup for every exit path; pElfWriter and pData may still be nullptr here
+    CLElfLib::CElfWriter::destroy(pElfWriter);
+    delete[] pData;
+    internalOptions.clear();
+
+    if (funcNotify != nullptr) {
+        (*funcNotify)(this, userData);
+    }
+
+    return retVal;
+}
+} // namespace OCLRT
--- a/runtime/program/patch_info.h
+++ b/runtime/program/patch_info.h
@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+#include "patch_list.h"
+#include "patch_g7.h"
+#include <vector>
+#include <map>
+
+namespace OCLRT {
+using iOpenCL::SPatchMediaInterfaceDescriptorLoad;
+using iOpenCL::SPatchAllocateLocalSurface;
+using iOpenCL::SPatchMediaVFEState;
+using iOpenCL::SPatchInterfaceDescriptorData;
+using iOpenCL::SPatchSamplerStateArray;
+using iOpenCL::SPatchBindingTableState;
+using iOpenCL::SPatchDataParameterBuffer;
+using iOpenCL::SPatchStatelessGlobalMemoryObjectKernelArgument;
+using iOpenCL::SPatchGlobalMemoryObjectKernelArgument;
+using iOpenCL::SPatchStatelessConstantMemoryObjectKernelArgument;
+using iOpenCL::SPatchStatelessDeviceQueueKernelArgument;
+using iOpenCL::SPatchImageMemoryObjectKernelArgument;
+using iOpenCL::SPatchSamplerKernelArgument;
+using iOpenCL::SPatchDataParameterStream;
+using iOpenCL::SPatchThreadPayload;
+using iOpenCL::SPatchExecutionEnvironment;
+using iOpenCL::SPatchKernelAttributesInfo;
+using iOpenCL::SPatchKernelArgumentInfo;
+using iOpenCL::SKernelBinaryHeaderCommon;
+using iOpenCL::SProgramBinaryHeader;
+using iOpenCL::SPatchAllocateStatelessPrivateSurface;
+using iOpenCL::SPatchAllocateStatelessConstantMemorySurfaceWithInitialization;
+using iOpenCL::SPatchAllocateStatelessGlobalMemorySurfaceWithInitialization;
+using iOpenCL::SPatchAllocateStatelessPrintfSurface;
+using iOpenCL::SPatchAllocateStatelessEventPoolSurface;
+using iOpenCL::SPatchAllocateStatelessDefaultDeviceQueueSurface;
+using iOpenCL::SPatchString;
+using iOpenCL::SPatchGtpinFreeGRFInfo;
+using iOpenCL::SPatchStateSIP;
+
+// Size/pointer pair describing one printf string; PatchInfo::stringDataMap stores one per key.
+struct TagPrintfStringInfo {
+    size_t SizeInBytes;
+    char *pStringData;
+};
+using PrintfStringInfo = TagPrintfStringInfo;
+using PPrintfStringInfo = TagPrintfStringInfo *;
+
+struct PatchInfo { // Groups pointers to iOpenCL patch tokens — NOTE(review): presumably filled while a kernel's patch list is decoded; confirm
+    const SPatchMediaInterfaceDescriptorLoad *interfaceDescriptorDataLoad = nullptr;
+    const SPatchAllocateLocalSurface *localsurface = nullptr;
+    const SPatchMediaVFEState *mediavfestate = nullptr;
+    const SPatchInterfaceDescriptorData *interfaceDescriptorData = nullptr;
+    const SPatchSamplerStateArray *samplerStateArray = nullptr;
+    const SPatchBindingTableState *bindingTableState = nullptr;
+    ::std::vector<const SPatchDataParameterBuffer *> dataParameterBuffers; // tokens that may occur several times are kept in vectors
+    ::std::vector<const SPatchStatelessGlobalMemoryObjectKernelArgument *>
+        statelessGlobalMemObjKernelArgs;
+    ::std::vector<const SPatchImageMemoryObjectKernelArgument *>
+        imageMemObjKernelArgs;
+    ::std::vector<const SPatchGlobalMemoryObjectKernelArgument *>
+        globalMemObjKernelArgs;
+    const SPatchDataParameterStream *dataParameterStream = nullptr;
+    const SPatchThreadPayload *threadPayload = nullptr;
+    const SPatchExecutionEnvironment *executionEnvironment = nullptr; // stays nullptr until a token is stored here, so readers must null-check
+    const SPatchKernelAttributesInfo *pKernelAttributesInfo = nullptr;
+    const SPatchAllocateStatelessPrivateSurface *pAllocateStatelessPrivateSurface = nullptr;
+    const SPatchAllocateStatelessConstantMemorySurfaceWithInitialization *pAllocateStatelessConstantMemorySurfaceWithInitialization = nullptr;
+    const SPatchAllocateStatelessGlobalMemorySurfaceWithInitialization *pAllocateStatelessGlobalMemorySurfaceWithInitialization = nullptr;
+    const SPatchAllocateStatelessPrintfSurface *pAllocateStatelessPrintfSurface = nullptr; // nullptr by default; provides the offsets and size PrintfHandler::prepareDispatch() patches with
+    const SPatchAllocateStatelessEventPoolSurface *pAllocateStatelessEventPoolSurface = nullptr;
+    const SPatchAllocateStatelessDefaultDeviceQueueSurface *pAllocateStatelessDefaultDeviceQueueSurface = nullptr;
+    ::std::map<uint32_t, PrintfStringInfo> stringDataMap; // PrintfStringInfo entries keyed by a uint32_t — NOTE(review): presumably the printf string index; confirm
+    ::std::vector<const SPatchKernelArgumentInfo *> kernelArgumentInfo;
+
+    PatchInfo() { // nothing to do: every pointer member is set to nullptr by its in-class initializer
+    }
+};
+
+} // namespace OCLRT
--- a/runtime/program/print_formatter.cpp
+++ b/runtime/program/print_formatter.cpp
@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "print_formatter.h"
+
+#include "runtime/helpers/string.h"
+#include "runtime/memory_manager/graphics_allocation.h"
+#include <iostream>
+
+namespace OCLRT {
+
+// Binds the formatter to the kernel and to the allocation holding its printf data.
+// The parsing state (buffer, bufferSize, offset) starts empty and is set up by printKernelOutput().
+PrintFormatter::PrintFormatter(Kernel &kernelArg, GraphicsAllocation &dataArg)
+    : kernel(kernelArg), data(dataArg), buffer(nullptr), bufferSize(0), offset(0) {
+}
+
+// Walks the printf buffer of the bound allocation and passes every formatted string to `print`.
+// The buffer starts with a 32-bit size word; the rest is a sequence of records, each introduced
+// by a 32-bit format-string index that is resolved through the kernel's KernelInfo.
+// Records whose index has no format string are skipped.
+void PrintFormatter::printKernelOutput(const std::function<void(char *)> &print) {
+    offset = 0;
+    bufferSize = 0;
+    buffer = reinterpret_cast<uint8_t *>(data.getUnderlyingBuffer());
+
+    const size_t allocationSize = data.getUnderlyingBufferSize();
+    if ((buffer == nullptr) || (allocationSize < sizeof(bufferSize))) {
+        // nothing that could hold even the size word
+        return;
+    }
+
+    // first 4 bytes of the buffer store its own size
+    // before reading it size needs to be set to 4 because read() checks bounds and would fail if bufferSize was 0
+    bufferSize = 4;
+    read(&bufferSize);
+
+    // the size word is data found in the buffer itself; never parse past the end of the allocation
+    if (bufferSize > allocationSize) {
+        bufferSize = static_cast<uint32_t>(allocationSize);
+    }
+
+    uint32_t stringIndex = 0;
+
+    while (offset + 4 <= bufferSize) {
+        read(&stringIndex);
+        const char *formatString = kernel.getKernelInfo().queryPrintfString(stringIndex);
+        if (formatString != nullptr) {
+            printString(formatString, print);
+        }
+    }
+}
+
+// Expands one format string into a local buffer of maxPrintfOutputLength bytes and passes the
+// NUL-terminated result to `print`.
+//  - "\x" sequences are translated with escapeChar()
+//  - "%%" produces a single '%'
+//  - any other '%' starts a conversion that extends up to the next conversion specifier; its
+//    argument is taken from the printf buffer by printToken() / printStringToken()
+// Output that does not fit is truncated; nothing is written past the end of the buffer.
+void PrintFormatter::printString(const char *formatString, const std::function<void(char *)> &print) {
+    const size_t capacity = maxPrintfOutputLength;
+    const size_t length = strnlen_s(formatString, capacity);
+    char output[maxPrintfOutputLength];
+
+    size_t cursor = 0;
+    // one byte of output is always kept free for the terminator written after the loop
+    for (size_t i = 0; i < length && cursor + 1 < capacity; i++) {
+        if (formatString[i] == '\\') {
+            // a trailing backslash has nothing to escape and is dropped
+            if (i + 1 < length) {
+                output[cursor++] = escapeChar(formatString[++i]);
+            }
+        } else if (formatString[i] == '%') {
+            if (i + 1 < length && formatString[i + 1] == '%') {
+                output[cursor++] = '%';
+                // consume the second '%' as well, otherwise it would be parsed as the start of a conversion
+                ++i;
+                continue;
+            }
+
+            // find the end of the conversion: one past its specifier, or the end of the string
+            size_t end = i;
+            while (isConversionSpecifier(formatString[end++]) == false && end < length)
+                ;
+
+            size_t tokenLength = end - i;
+            if (tokenLength > capacity - 1) {
+                tokenLength = capacity - 1;
+            }
+            char dataFormat[maxPrintfOutputLength];
+            memcpy_s(dataFormat, capacity, formatString + i, tokenLength);
+            dataFormat[tokenLength] = '\0';
+
+            const size_t remaining = capacity - cursor;
+            size_t printed = 0;
+            if (formatString[end - 1] == 's') {
+                printed = printStringToken(output + cursor, remaining, dataFormat);
+            } else {
+                printed = printToken(output + cursor, remaining, dataFormat);
+            }
+            // the helpers may report more characters than fit (presumably snprintf-like — TODO confirm),
+            // so only advance by what can really be stored
+            cursor += (printed < remaining) ? printed : remaining - 1;
+
+            i = end - 1;
+        } else {
+            output[cursor++] = formatString[i];
+        }
+    }
+    output[cursor] = '\0';
+
+    print(output);
+}
+
+// Copies `format` into `stripped` without the vector-length part of a conversion:
+// a 'v' followed by '1' drops three characters (e.g. "v16"), any other 'v' drops two (e.g. "v4").
+// `stripped` must be able to hold strlen(format) + 1 bytes.
+void PrintFormatter::stripVectorFormat(const char *format, char *stripped) {
+    while (*format != '\0') {
+        if (*format != 'v') {
+            *stripped++ = *format++;
+            continue;
+        }
+
+        size_t charsToSkip = (*(format + 1) != '1') ? 2 : 3;
+        // stop at the terminator so an incomplete vector specifier cannot move the read past the string
+        while (charsToSkip > 0 && *format != '\0') {
+            format++;
+            charsToSkip--;
+        }
+    }
+    *stripped = '\0';
+}
+
+// Removes an "hl" length modifier that directly precedes the last character of `format`,
+// in place: "...hlX" becomes "...X". Formats of three characters or fewer are left untouched.
+void PrintFormatter::stripVectorTypeConversion(char *format) {
+    const size_t length = strlen(format);
+    if (length <= 3) {
+        return;
+    }
+
+    char *tail = format + (length - 3);
+    if (tail[0] != 'h' || tail[1] != 'l') {
+        return;
+    }
+
+    tail[0] = tail[2];
+    tail[1] = '\0';
+}
+
+// Reads the data-type tag of the next argument from the printf buffer and formats the value
+// that follows it with `formatString` into `output` (at most `size` bytes).
+// Returns what the typed helper returned, or 0 for a tag this function does not handle.
+size_t PrintFormatter::printToken(char *output, size_t size, const char *formatString) {
+    PRINTF_DATA_TYPE dataType(PRINTF_DATA_TYPE::INVALID);
+    read(&dataType);
+
+    size_t printed = 0;
+    switch (dataType) {
+    // scalar values
+    case PRINTF_DATA_TYPE::BYTE:
+        printed = typedPrintToken<int8_t>(output, size, formatString);
+        break;
+    case PRINTF_DATA_TYPE::SHORT:
+        printed = typedPrintToken<int16_t>(output, size, formatString);
+        break;
+    case PRINTF_DATA_TYPE::INT:
+        printed = typedPrintToken<int>(output, size, formatString);
+        break;
+    case PRINTF_DATA_TYPE::FLOAT:
+        printed = typedPrintToken<float>(output, size, formatString);
+        break;
+    case PRINTF_DATA_TYPE::LONG:
+        printed = typedPrintToken<int64_t>(output, size, formatString);
+        break;
+    case PRINTF_DATA_TYPE::POINTER:
+        printed = typedPrintToken<void *>(output, size, formatString);
+        break;
+    case PRINTF_DATA_TYPE::DOUBLE:
+        printed = typedPrintToken<double>(output, size, formatString);
+        break;
+    // vector values
+    case PRINTF_DATA_TYPE::VECTOR_BYTE:
+        printed = typedPrintVectorToken<int8_t>(output, size, formatString);
+        break;
+    case PRINTF_DATA_TYPE::VECTOR_SHORT:
+        printed = typedPrintVectorToken<int16_t>(output, size, formatString);
+        break;
+    case PRINTF_DATA_TYPE::VECTOR_INT:
+        printed = typedPrintVectorToken<int>(output, size, formatString);
+        break;
+    case PRINTF_DATA_TYPE::VECTOR_LONG:
+        printed = typedPrintVectorToken<int64_t>(output, size, formatString);
+        break;
+    case PRINTF_DATA_TYPE::VECTOR_FLOAT:
+        printed = typedPrintVectorToken<float>(output, size, formatString);
+        break;
+    case PRINTF_DATA_TYPE::VECTOR_DOUBLE:
+        printed = typedPrintVectorToken<double>(output, size, formatString);
+        break;
+    default:
+        break;
+    }
+    return printed;
+}
+
+// Translates the character following a backslash: 'n' becomes a line feed,
+// every other character is returned unchanged.
+char PrintFormatter::escapeChar(char escape) {
+    return (escape == 'n') ? '\n' : escape;
+}
+
+// Tells whether `c` is one of the characters that terminate a printf conversion
+// (d i o u x X a A e E f F g G s c p).
+bool PrintFormatter::isConversionSpecifier(char c) {
+    static const char specifiers[] = {'d', 'i', 'o', 'u', 'x', 'X',
+                                      'a', 'A', 'e', 'E', 'f', 'F', 'g', 'G',
+                                      's', 'c', 'p'};
+    for (char specifier : specifiers) {
+        if (c == specifier) {
+            return true;
+        }
+    }
+    return false;
+}
+} // namespace OCLRT
--- a/runtime/program/print_formatter.h
+++ b/runtime/program/print_formatter.h
@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include "runtime/helpers/aligned_memory.h"
+#include "runtime/kernel/kernel.h"
+#include "runtime/os_interface/print.h"
+
+#include <algorithm>
+#include <cctype>
+#include <cstdint>
+#include <functional>
+
+extern int memcpy_s(void *dst, size_t destSize, const void *src, size_t count);
+
+namespace OCLRT {
+
+// Type tag stored in front of every printf argument in the printf buffer.
+// The numeric values are what PrintFormatter reads back from the buffer, so they are spelled out.
+enum class PRINTF_DATA_TYPE : int {
+    INVALID = 0,
+    BYTE = 1,
+    SHORT = 2,
+    INT = 3,
+    FLOAT = 4,
+    STRING = 5,
+    LONG = 6,
+    POINTER = 7,
+    DOUBLE = 8,
+    VECTOR_BYTE = 9,
+    VECTOR_SHORT = 10,
+    VECTOR_INT = 11,
+    VECTOR_LONG = 12,
+    VECTOR_FLOAT = 13,
+    VECTOR_DOUBLE = 14
+};
+
+// Turns the contents of a kernel's printf buffer into text.
+// The buffer (taken from the GraphicsAllocation passed to the constructor) is parsed
+// sequentially with read(); format strings are looked up through the kernel's KernelInfo.
+class PrintFormatter {
+  public:
+    PrintFormatter(Kernel &kernelArg, GraphicsAllocation &dataArg);
+    // Formats everything stored in the buffer and passes each string to `print` (stdout by default).
+    void printKernelOutput(const std::function<void(char *)> &print = [](char *str) { printToSTDOUT(str); });
+
+    // size of the buffers used for one formatted string and for one conversion specification
+    static const size_t maxPrintfOutputLength = 1024;
+
+  protected:
+    void printString(const char *formatString, const std::function<void(char *)> &print);
+    size_t printToken(char *output, size_t size, const char *formatString);
+
+    char escapeChar(char escape);
+    bool isConversionSpecifier(char c);
+    void stripVectorFormat(const char *format, char *stripped);
+    void stripVectorTypeConversion(char *format);
+
+    // Copies the next sizeof(T) bytes of the buffer into *value and advances the offset.
+    // Returns false, leaving *value and the offset untouched, when fewer than sizeof(T) bytes remain.
+    template <class T>
+    bool read(T *value) {
+        if (offset + sizeof(T) > bufferSize) {
+            return false;
+        }
+        // a byte-wise copy works for any alignment of the source; the destination holds exactly sizeof(T) bytes
+        memcpy_s(value, sizeof(T), buffer + offset, sizeof(T));
+        offset += sizeof(T);
+        return true;
+    }
+
+    // Number of characters that are really stored in a buffer of `size` bytes when a print call
+    // reported `printed` characters; the reported count may exceed what fits.
+    static size_t storedCharacters(size_t printed, size_t size) {
+        if (size == 0) {
+            return 0;
+        }
+        return (printed < size) ? printed : size - 1;
+    }
+
+    // Reads one value of type T from the buffer and formats it with formatString.
+    template <class T>
+    size_t typedPrintToken(char *output, size_t size, const char *formatString) {
+        T value = {0};
+        read(&value);
+        return simple_sprintf(output, size, formatString, value);
+    }
+
+    // Reads an element count followed by that many values of type T and prints them
+    // comma-separated, using formatString with its vector-specific parts removed.
+    // Returns the number of characters stored in `output`, never more than size - 1.
+    template <class T>
+    size_t typedPrintVectorToken(char *output, size_t size, const char *formatString) {
+        T value = {0};
+        int valueCount = 0;
+        read(&valueCount);
+
+        size_t charactersPrinted = 0;
+        char strippedFormat[maxPrintfOutputLength];
+
+        stripVectorFormat(formatString, strippedFormat);
+        stripVectorTypeConversion(strippedFormat);
+
+        int valuesRead = 0;
+        for (int i = 0; i < valueCount; i++) {
+            if (!read(&value)) {
+                // the count comes from the buffer itself; stop as soon as the data runs out
+                break;
+            }
+            valuesRead++;
+
+            size_t remaining = size - charactersPrinted;
+            charactersPrinted += storedCharacters(simple_sprintf(output + charactersPrinted, remaining, strippedFormat, value), remaining);
+            if (i < valueCount - 1) {
+                remaining = size - charactersPrinted;
+                charactersPrinted += storedCharacters(simple_sprintf(output + charactersPrinted, remaining, "%c", ','), remaining);
+            }
+        }
+
+        // elements smaller than 4 bytes are followed by padding: skip (4 - sizeof(T)) bytes per element read
+        if (sizeof(T) < 4) {
+            offset += static_cast<uint32_t>((4 - sizeof(T)) * static_cast<size_t>(valuesRead));
+        }
+
+        return charactersPrinted;
+    }
+
+    // Reads a type tag and a string index from the buffer and prints the referenced string.
+    // NOTE(review): when the tag is not STRING an int 0 is passed for a string conversion — confirm
+    // that simple_sprintf tolerates this.
+    size_t printStringToken(char *output, size_t size, const char *formatString) {
+        int index = 0;
+        int type = 0;
+        // additional read to discard the data type
+        read(&type);
+        read(&index);
+        if (type == static_cast<int>(PRINTF_DATA_TYPE::STRING))
+            return simple_sprintf(output, size, formatString, kernel.getKernelInfo().queryPrintfString(index));
+        else
+            return simple_sprintf(output, size, formatString, 0);
+    }
+
+    Kernel &kernel;
+    GraphicsAllocation &data;
+
+    uint8_t *buffer;     // buffer extracted from the kernel, contains values to be printed
+    uint32_t bufferSize; // size of the data contained in the buffer
+    uint32_t offset;     // current position in currently parsed buffer
+};
+};
--- a/runtime/program/printf_handler.cpp
+++ b/runtime/program/printf_handler.cpp
@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "printf_handler.h"
+
+#include "runtime/mem_obj/buffer.h"
+#include "runtime/program/print_formatter.h"
+#include "runtime/kernel/kernel.h"
+#include "runtime/helpers/dispatch_info.h"
+#include "runtime/helpers/ptr_math.h"
+#include "runtime/helpers/aligned_memory.h"
+#include "runtime/memory_manager/memory_manager.h"
+
+namespace OCLRT {
+
+// Creates a handler bound to deviceArg; kernel and printf surface are set later by prepareDispatch().
+PrintfHandler::PrintfHandler(Device &deviceArg)
+    : device(deviceArg) {
+}
+
+// Returns the printf surface to the device's memory manager, if one was allocated.
+PrintfHandler::~PrintfHandler() {
+    // prepareDispatch() may never have run or may have returned before allocating
+    if (printfSurface != nullptr) {
+        device.getMemoryManager()->freeGraphicsMemory(printfSurface);
+    }
+}
+
+// Returns a new PrintfHandler when the dispatch uses a stateless printf surface or when its
+// first kernel reports that it is a parent kernel whose blocks use printf; nullptr otherwise.
+// The caller owns the returned object.
+PrintfHandler *PrintfHandler::create(const MultiDispatchInfo &multiDispatchInfo, Device &device) {
+    bool printfNeeded = multiDispatchInfo.usesStatelessPrintfSurface();
+    if (!printfNeeded) {
+        // only consulted when the dispatch itself does not use the printf surface
+        printfNeeded = multiDispatchInfo.begin()->getKernel()->checkIfIsParentKernelAndBlocksUsesPrintf();
+    }
+    return printfNeeded ? new PrintfHandler(device) : nullptr;
+}
+
+// Allocates the printf surface for the first kernel of the dispatch and patches its address
+// into the kernel's cross-thread data (and surface state heap, when the kernel requires SSH
+// for buffers).
+// Does nothing when the device reports a printf buffer size of 0. When the allocation fails,
+// or the kernel carries no printf-surface patch token, the function returns without patching.
+void PrintfHandler::prepareDispatch(const MultiDispatchInfo &multiDispatchInfo) {
+    auto printfSurfaceSize = device.getDeviceInfo().printfBufferSize;
+    if (printfSurfaceSize == 0) {
+        return;
+    }
+    kernel = multiDispatchInfo.begin()->getKernel();
+
+    printfSurface = device.getMemoryManager()->createGraphicsAllocationWithRequiredBitness(printfSurfaceSize, nullptr);
+    if (printfSurface == nullptr) {
+        return;
+    }
+
+    // the first 32-bit word of the surface holds the size of the data stored in it, the word itself included
+    *reinterpret_cast<uint32_t *>(printfSurface->getUnderlyingBuffer()) = printfSurfaceInitialDataSize;
+
+    // PatchInfo initialises this token pointer to nullptr, so it cannot be assumed to be set
+    auto printfSurfacePatch = kernel->getKernelInfo().patchInfo.pAllocateStatelessPrintfSurface;
+    if (printfSurfacePatch == nullptr) {
+        return;
+    }
+
+    auto printfPatchAddress = ptrOffset(reinterpret_cast<uintptr_t *>(kernel->getCrossThreadData()),
+                                        printfSurfacePatch->DataParamOffset);
+
+    patchWithRequiredSize(printfPatchAddress, printfSurfacePatch->DataParamSize, (uintptr_t)printfSurface->getGpuAddressToPatch());
+    if (kernel->requiresSshForBuffers()) {
+        auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(kernel->getSurfaceStateHeap()),
+                                      printfSurfacePatch->SurfaceStateHeapOffset);
+        void *addressToPatch = printfSurface->getUnderlyingBuffer();
+        size_t sizeToPatch = printfSurface->getUnderlyingBufferSize();
+        Buffer::setSurfaceState(&kernel->getContext(), surfaceState, sizeToPatch, addressToPatch, printfSurface);
+    }
+}
+
+// Makes the printf surface resident through the given command stream receiver.
+// Does nothing when no surface has been allocated.
+void PrintfHandler::makeResident(CommandStreamReceiver &commandStreamReceiver) {
+    // prepareDispatch() leaves printfSurface at nullptr when the device's printf buffer size is 0
+    if (printfSurface == nullptr) {
+        return;
+    }
+    commandStreamReceiver.makeResident(*printfSurface);
+}
+
+// Formats and prints whatever the kernel stored in the printf surface.
+// Does nothing when prepareDispatch() did not set up both the kernel and the surface.
+void PrintfHandler::printEnqueueOutput() {
+    if ((kernel == nullptr) || (printfSurface == nullptr)) {
+        return;
+    }
+    PrintFormatter printFormatter(*kernel, *printfSurface);
+    printFormatter.printKernelOutput();
+}
+} // namespace OCLRT
--- a/runtime/program/printf_handler.h
+++ b/runtime/program/printf_handler.h
@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include "runtime/kernel/kernel.h"
+#include "runtime/command_stream/command_stream_receiver.h"
+
+namespace OCLRT {
+
+struct MultiDispatchInfo;
+
+// Manages the printf surface of one enqueue: allocation and patching (prepareDispatch),
+// residency (makeResident) and printing of the collected output (printEnqueueOutput).
+// Instances are obtained through create(); the destructor releases the surface.
+class PrintfHandler {
+  public:
+    // Returns a new handler when the dispatch needs one, nullptr otherwise; the caller owns it.
+    static PrintfHandler *create(const MultiDispatchInfo &multiDispatchInfo, Device &deviceArg);
+
+    ~PrintfHandler();
+
+    // the handler frees printfSurface in its destructor, so a copy would free it a second time
+    PrintfHandler(const PrintfHandler &) = delete;
+    PrintfHandler &operator=(const PrintfHandler &) = delete;
+
+    void prepareDispatch(const MultiDispatchInfo &multiDispatchInfo);
+    void makeResident(CommandStreamReceiver &commandStreamReceiver);
+    void printEnqueueOutput();
+
+    // Returns the printf surface, or nullptr when none has been allocated.
+    GraphicsAllocation *getSurface() {
+        return printfSurface;
+    }
+
+  protected:
+    explicit PrintfHandler(Device &device);
+
+    // value written into the first 32-bit word of a freshly allocated surface
+    static const uint32_t printfSurfaceInitialDataSize = sizeof(uint32_t);
+    Device &device;
+    Kernel *kernel = nullptr;
+    GraphicsAllocation *printfSurface = nullptr;
+};
+} // namespace OCLRT
--- a/runtime/program/process_elf_binary.cpp
+++ b/runtime/program/process_elf_binary.cpp
@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "elf/reader.h"
+#include "elf/writer.h"
+#include "program.h"
+#include "runtime/helpers/string.h"
+
+namespace OCLRT {
+
+// Parses an OpenCL ELF container and populates this program from its sections.
+//
+// pBinary, binarySize - ELF image to parse; once it passes the ELF64 check a
+//                       private copy is stored in elfBinary / elfBinarySize.
+// binaryVersion       - out: set to iOpenCL::CURRENT_ICBE_VERSION up front and
+//                       handed to getProgramCompilerVersion() when a device
+//                       binary section is rejected.
+//
+// Returns CL_SUCCESS on success, CL_INVALID_BINARY when the image is not
+// valid ELF64, has an unknown header or section type, or carries a rejected
+// device binary, and CL_OUT_OF_HOST_MEMORY when the ELF reader cannot be
+// created.
+cl_int Program::processElfBinary(
+    const void *pBinary,
+    size_t binarySize,
+    uint32_t &binaryVersion) {
+    cl_int retVal = CL_SUCCESS;
+    CLElfLib::CElfReader *pElfReader = nullptr;
+    const CLElfLib::SElf64Header *pElfHeader = nullptr;
+    char *pSectionData = nullptr;
+    size_t sectionDataSize = 0;
+
+    binaryVersion = iOpenCL::CURRENT_ICBE_VERSION;
+
+    if (CLElfLib::CElfReader::isValidElf64(pBinary, binarySize) == false) {
+        retVal = CL_INVALID_BINARY;
+    }
+
+    if (retVal == CL_SUCCESS) {
+        delete[] elfBinary;
+        // Reset the pointer before allocating again: if new[] throws, the
+        // member must not keep pointing at the block that was just freed.
+        elfBinary = nullptr;
+        elfBinarySize = 0;
+
+        elfBinary = new char[binarySize];
+
+        elfBinarySize = binarySize;
+        memcpy_s(elfBinary, elfBinarySize, pBinary, binarySize);
+    }
+
+    if (retVal == CL_SUCCESS) {
+        pElfReader = CLElfLib::CElfReader::create(
+            static_cast<const char *>(pBinary),
+            binarySize);
+
+        if (pElfReader == nullptr) {
+            retVal = CL_OUT_OF_HOST_MEMORY;
+        }
+    }
+
+    if (retVal == CL_SUCCESS) {
+        pElfHeader = pElfReader->getElfHeader();
+
+        // Map the ELF header type onto the OpenCL program binary type.
+        switch (pElfHeader->Type) {
+        case CLElfLib::EH_TYPE_OPENCL_EXECUTABLE:
+            programBinaryType = CL_PROGRAM_BINARY_TYPE_EXECUTABLE;
+            break;
+
+        case CLElfLib::EH_TYPE_OPENCL_LIBRARY:
+            programBinaryType = CL_PROGRAM_BINARY_TYPE_LIBRARY;
+            break;
+
+        case CLElfLib::EH_TYPE_OPENCL_OBJECTS:
+            programBinaryType = CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;
+            break;
+
+        default:
+            retVal = CL_INVALID_BINARY;
+        }
+    }
+
+    if (retVal == CL_SUCCESS) {
+        // section 0 is always null
+        for (uint32_t i = 1; i < pElfHeader->NumSectionHeaderEntries; i++) {
+            const CLElfLib::SElf64SectionHeader *pSectionHeader = pElfReader->getSectionHeader(i);
+
+            // Defensive: the image is caller supplied, so do not dereference a
+            // header the reader could not provide.
+            if (pSectionHeader == nullptr) {
+                retVal = CL_INVALID_BINARY;
+                break;
+            }
+
+            pSectionData = nullptr;
+            sectionDataSize = 0;
+
+            switch (pSectionHeader->Type) {
+            case CLElfLib::SH_TYPE_SPIRV:
+                isSpirV = true;
+            // FALLTHROUGH
+            case CLElfLib::SH_TYPE_OPENCL_LLVM_BINARY:
+                pElfReader->getSectionData(i, pSectionData, sectionDataSize);
+                if (pSectionData && sectionDataSize) {
+                    storeLlvmBinary(pSectionData, sectionDataSize);
+                }
+                break;
+
+            case CLElfLib::SH_TYPE_OPENCL_DEV_BINARY:
+                // NOTE(review): sectionDataSize is not compared against the size
+                // of SProgramBinaryHeader before the header is inspected - confirm
+                // that validateGenBinaryHeader() copes with short sections.
+                pElfReader->getSectionData(i, pSectionData, sectionDataSize);
+                if (pSectionData && sectionDataSize && validateGenBinaryHeader(reinterpret_cast<SProgramBinaryHeader *>(pSectionData))) {
+                    storeGenBinary(pSectionData, sectionDataSize);
+                    isCreatedFromBinary = true;
+                } else {
+                    getProgramCompilerVersion(reinterpret_cast<SProgramBinaryHeader *>(pSectionData), binaryVersion);
+                    retVal = CL_INVALID_BINARY;
+                }
+                break;
+
+            case CLElfLib::SH_TYPE_OPENCL_OPTIONS:
+                pElfReader->getSectionData(i, pSectionData, sectionDataSize);
+                if (pSectionData && sectionDataSize) {
+                    // The section may lack a terminating NUL; stop at the first
+                    // NUL or at the end of the section, whichever comes first,
+                    // so the copy never reads past sectionDataSize.
+                    size_t optionsLength = 0;
+                    while ((optionsLength < sectionDataSize) && (pSectionData[optionsLength] != '\0')) {
+                        optionsLength++;
+                    }
+                    options.assign(pSectionData, optionsLength);
+                }
+                break;
+
+            case CLElfLib::SH_TYPE_STR_TBL:
+                // We can skip the string table
+                break;
+
+            default:
+                retVal = CL_INVALID_BINARY;
+            }
+
+            if (retVal != CL_SUCCESS) {
+                break;
+            }
+        }
+    }
+
+    if (retVal == CL_SUCCESS) {
+        isProgramBinaryResolved = true;
+        buildStatus = CL_BUILD_SUCCESS;
+
+        // Create an empty build log since program is effectively built
+        updateBuildLog(pDevice, "", 1);
+    }
+
+    CLElfLib::CElfReader::destroy(pElfReader);
+    return retVal;
+}
+
+// Rebuilds the ELF image of this program (elfBinary / elfBinarySize) from the
+// stored build options and binaries, unless the image is already resolved.
+//
+// Returns CL_SUCCESS when the image is (or already was) resolved,
+// CL_INVALID_BINARY when the program binary type is unknown, the binary
+// required for that type is missing, or the ELF writer reports a failure,
+// and CL_OUT_OF_HOST_MEMORY when the ELF writer cannot be created.
+cl_int Program::resolveProgramBinary() {
+    if (isProgramBinaryResolved) {
+        return CL_SUCCESS;
+    }
+
+    // Discard the stale image before building a new one.
+    delete[] elfBinary;
+    elfBinary = nullptr;
+    elfBinarySize = 0;
+
+    // Pick the ELF header type and check that the binary it requires exists.
+    CLElfLib::E_EH_TYPE headerType;
+    bool requiredBinaryPresent = false;
+
+    switch (programBinaryType) {
+    case CL_PROGRAM_BINARY_TYPE_EXECUTABLE:
+        headerType = CLElfLib::EH_TYPE_OPENCL_EXECUTABLE;
+        requiredBinaryPresent = genBinary && genBinarySize;
+        break;
+
+    case CL_PROGRAM_BINARY_TYPE_LIBRARY:
+        headerType = CLElfLib::EH_TYPE_OPENCL_LIBRARY;
+        requiredBinaryPresent = llvmBinary && llvmBinarySize;
+        break;
+
+    case CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT:
+        headerType = CLElfLib::EH_TYPE_OPENCL_OBJECTS;
+        requiredBinaryPresent = llvmBinary && llvmBinarySize;
+        break;
+
+    default:
+        return CL_INVALID_BINARY;
+    }
+
+    if (!requiredBinaryPresent) {
+        return CL_INVALID_BINARY;
+    }
+
+    CLElfLib::CElfWriter *pElfWriter = CLElfLib::CElfWriter::create(headerType, CLElfLib::EH_MACHINE_NONE, 0);
+    cl_int retVal = CL_OUT_OF_HOST_MEMORY;
+
+    if (pElfWriter) {
+        CLElfLib::SSectionNode sectionNode;
+
+        // The build options section is always emitted (including the NUL).
+        sectionNode.Name = "BuildOptions";
+        sectionNode.Type = CLElfLib::SH_TYPE_OPENCL_OPTIONS;
+        sectionNode.pData = (char *)options.c_str();
+        sectionNode.DataSize = (uint32_t)(strlen(options.c_str()) + 1);
+
+        auto elfRetVal = pElfWriter->addSection(&sectionNode);
+
+        // LLVM / SPIR-V section; only its name depends on the header type.
+        if (elfRetVal) {
+            sectionNode.Type = getIsSpirV() ? CLElfLib::SH_TYPE_SPIRV
+                                            : CLElfLib::SH_TYPE_OPENCL_LLVM_BINARY;
+            sectionNode.Name = (headerType == CLElfLib::EH_TYPE_OPENCL_LIBRARY)
+                                   ? "Intel(R) OpenCL LLVM Archive"
+                                   : "Intel(R) OpenCL LLVM Object";
+            sectionNode.pData = (char *)llvmBinary;
+            sectionNode.DataSize = (uint32_t)llvmBinarySize;
+
+            elfRetVal = pElfWriter->addSection(&sectionNode);
+        }
+
+        // Device binary section, only when a device binary is stored.
+        if (elfRetVal && genBinary) {
+            sectionNode.Name = "Intel(R) OpenCL Device Binary";
+            sectionNode.Type = CLElfLib::SH_TYPE_OPENCL_DEV_BINARY;
+            sectionNode.pData = (char *)genBinary;
+            sectionNode.DataSize = (uint32_t)genBinarySize;
+
+            elfRetVal = pElfWriter->addSection(&sectionNode);
+        }
+
+        // Device debug section, only when debug data is stored.
+        if (elfRetVal && (debugData != nullptr)) {
+            sectionNode.Name = "Intel(R) OpenCL Device Debug";
+            sectionNode.Type = CLElfLib::SH_TYPE_OPENCL_DEV_DEBUG;
+            sectionNode.pData = debugData;
+            sectionNode.DataSize = (uint32_t)debugDataSize;
+
+            elfRetVal = pElfWriter->addSection(&sectionNode);
+        }
+
+        // First pass runs with elfBinary still null and updates elfBinarySize;
+        // the second pass runs against the freshly allocated buffer.
+        if (elfRetVal) {
+            elfRetVal = pElfWriter->resolveBinary(elfBinary, elfBinarySize);
+        }
+
+        if (elfRetVal) {
+            elfBinary = new char[elfBinarySize];
+
+            elfRetVal = pElfWriter->resolveBinary(elfBinary, elfBinarySize);
+        }
+
+        if (elfRetVal) {
+            isProgramBinaryResolved = true;
+            retVal = CL_SUCCESS;
+        } else {
+            retVal = CL_INVALID_BINARY;
+        }
+    }
+
+    CLElfLib::CElfWriter::destroy(pElfWriter);
+    return retVal;
+}
+}
--- a/runtime/program/process_gen_binary.cpp
+++ b/runtime/program/process_gen_binary.cpp
@ -0,0 +1,976 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "runtime/helpers/aligned_memory.h"
+#include "runtime/helpers/debug_helpers.h"
+#include "runtime/helpers/hash.h"
+#include "runtime/helpers/ptr_math.h"
+#include "runtime/helpers/string.h"
+#include "runtime/memory_manager/memory_manager.h"
+#include "patch_list.h"
+#include "patch_shared.h"
+#include "program.h"
+#include "runtime/kernel/kernel.h"
+
+#include <algorithm>
+
+using namespace iOpenCL;
+
+namespace OCLRT {
+extern bool familyEnabled[];
+
+// Looks a kernel up by name.
+// Returns the first entry of kernelInfoArray whose name compares equal
+// (strcmp) to kernelName, or nullptr when kernelName is null or nothing
+// matches.
+const KernelInfo *Program::getKernelInfo(
+    const char *kernelName) const {
+    if (kernelName != nullptr) {
+        for (const KernelInfo *candidate : kernelInfoArray) {
+            if (strcmp(candidate->name.c_str(), kernelName) == 0) {
+                return candidate;
+            }
+        }
+    }
+
+    return nullptr;
+}
+
+// Returns the number of entries currently held in kernelInfoArray.
+size_t Program::getNumKernels() const {
+    const size_t kernelCount = kernelInfoArray.size();
+    return kernelCount;
+}
+
+// Returns the kernel info stored at position `ordinal` in kernelInfoArray.
+// An out-of-range ordinal still triggers DEBUG_BREAK_IF, but now yields
+// nullptr instead of indexing past the end of the array when the debug
+// check does not stop execution.
+const KernelInfo *Program::getKernelInfo(size_t ordinal) const {
+    const bool outOfRange = (ordinal >= kernelInfoArray.size());
+    DEBUG_BREAK_IF(outOfRange);
+    if (outOfRange) {
+        return nullptr;
+    }
+    return kernelInfoArray[ordinal];
+}
+
+// Joins the names of all kernels in kernelInfoArray, in array order, into a
+// single string separated by ';' (no leading or trailing separator).
+// Returns an empty string when there are no kernels.
+std::string Program::getKernelNamesString() const {
+    std::string joinedNames;
+    bool needsSeparator = false;
+
+    for (const KernelInfo *info : kernelInfoArray) {
+        if (needsSeparator) {
+            joinedNames += ";";
+        }
+        joinedNames += info->name;
+        needsSeparator = true;
+    }
+
+    return joinedNames;
+}
+
+// Parses one kernel record starting at pKernelBlob and, on success, registers
+// the resulting KernelInfo with this program.
+//
+// pKernelBlob - start of the record (SKernelBinaryHeaderCommon followed by the
+//               sections described by that header).
+// retVal      - out: CL_OUT_OF_HOST_MEMORY when no KernelInfo could be
+//               created, otherwise the result of parsePatchList().
+//
+// Returns the record size (header plus all sections) on success, the offset
+// of the patch list from the start of the blob when parsePatchList() fails,
+// and 0 when the KernelInfo could not be created.
+size_t Program::processKernel(
+    const void *pKernelBlob,
+    cl_int &retVal) {
+    auto pKernelInfo = KernelInfo::create();
+    if (!pKernelInfo) {
+        retVal = CL_OUT_OF_HOST_MEMORY;
+        return 0;
+    }
+
+    // Record layout: header | name | kernel heap | GSH | DSH | SSH | patch list.
+    const auto pBlobHeader = reinterpret_cast<const SKernelBinaryHeaderCommon *>(pKernelBlob);
+    const void *pNameSection = ptrOffset(pKernelBlob, sizeof(SKernelBinaryHeaderCommon));
+    const void *pKernelHeapSection = ptrOffset(pNameSection, pBlobHeader->KernelNameSize);
+    const void *pGshSection = ptrOffset(pKernelHeapSection, pBlobHeader->KernelHeapSize);
+    const void *pDshSection = ptrOffset(pGshSection, pBlobHeader->GeneralStateHeapSize);
+    const void *pSshSection = ptrOffset(pDshSection, pBlobHeader->DynamicStateHeapSize);
+    const void *pPatchListSection = ptrOffset(pSshSection, pBlobHeader->SurfaceStateHeapSize);
+
+    auto &heapInfo = pKernelInfo->heapInfo;
+    heapInfo.pBlob = pKernelBlob;
+    heapInfo.pKernelHeader = pBlobHeader;
+    heapInfo.pKernelHeap = pKernelHeapSection;
+    heapInfo.pGsh = pGshSection;
+    heapInfo.pDsh = pDshSection;
+    heapInfo.pSsh = const_cast<void *>(pSshSection);
+    heapInfo.pPatchList = pPatchListSection;
+
+    // Going through c_str() cuts the name at the first NUL inside the field.
+    pKernelInfo->name = std::string(reinterpret_cast<const char *>(pNameSection), pBlobHeader->KernelNameSize).c_str();
+
+    retVal = parsePatchList(*pKernelInfo);
+    if (retVal != CL_SUCCESS) {
+        delete pKernelInfo;
+
+        const size_t bytesBeforePatchList = ptrDiff(pPatchListSection, pKernelBlob);
+        return bytesBeforePatchList;
+    }
+
+    auto pKernelHeader = pKernelInfo->heapInfo.pKernelHeader;
+
+    if (genBinary) {
+        pKernelInfo->gpuPointerSize = reinterpret_cast<const SProgramBinaryHeader *>(genBinary)->GPUPointerSizeInBytes;
+    }
+
+    // Size of everything that follows the common header.
+    uint32_t kernelSize = pKernelHeader->KernelNameSize;
+    kernelSize += pKernelHeader->KernelHeapSize;
+    kernelSize += pKernelHeader->GeneralStateHeapSize;
+    kernelSize += pKernelHeader->DynamicStateHeapSize;
+    kernelSize += pKernelHeader->SurfaceStateHeapSize;
+    kernelSize += pKernelHeader->PatchListSize;
+
+    pKernelInfo->heapInfo.blobSize = kernelSize + sizeof(SKernelBinaryHeaderCommon);
+
+    // The kernel is valid when the low 32 bits of the hash over everything
+    // after the header match the checksum stored in the header.
+    const uint32_t storedCheckSum = pKernelInfo->heapInfo.pKernelHeader->CheckSum;
+    const uint64_t hashValue = Hash::hash(reinterpret_cast<const char *>(pNameSection), kernelSize);
+    const uint32_t computedCheckSum = hashValue & 0xFFFFFFFF;
+    pKernelInfo->isValid = (computedCheckSum == storedCheckSum);
+
+    retVal = CL_SUCCESS;
+
+    kernelInfoArray.push_back(pKernelInfo);
+    if (pKernelInfo->hasDeviceEnqueue()) {
+        parentKernelInfoArray.push_back(pKernelInfo);
+    }
+    if (pKernelInfo->requiresSubgroupIndependentForwardProgress()) {
+        subgroupKernelInfoArray.push_back(pKernelInfo);
+    }
+
+    return sizeof(SKernelBinaryHeaderCommon) + kernelSize;
+}
+
+cl_int Program::parsePatchList(KernelInfo &kernelInfo) {
+    cl_int retVal = CL_SUCCESS;
+
+    auto pPatchList = kernelInfo.heapInfo.pPatchList;
+    auto patchListSize = kernelInfo.heapInfo.pKernelHeader->PatchListSize;
+    auto pCurPatchListPtr = pPatchList;
+    uint32_t PrivateMemoryStatelessSizeOffset = 0xFFffFFff;
+    uint32_t LocalMemoryStatelessWindowSizeOffset = 0xFFffFFff;
+    uint32_t LocalMemoryStatelessWindowStartAddressOffset = 0xFFffFFff;
+
+    //Speed up containers by giving some pre-allocated storage
+    kernelInfo.kernelArgInfo.reserve(10);
+    kernelInfo.patchInfo.kernelArgumentInfo.reserve(10);
+    kernelInfo.patchInfo.dataParameterBuffers.reserve(20);
+    std::stringstream PatchTokens;
+
+    DBG_LOG(LogPatchTokens, "\nPATCH_TOKENs for kernel", kernelInfo.name);
+
+    while (ptrDiff(pCurPatchListPtr, pPatchList) < patchListSize) {
+        uint32_t index = 0;
+        uint32_t argNum = 0;
+        auto pPatch = reinterpret_cast<const SPatchItemHeader *>(pCurPatchListPtr);
+        const SPatchDataParameterBuffer *pDataParameterBuffer = nullptr;
+
+        switch (pPatch->Token) {
+        case PATCH_TOKEN_SAMPLER_STATE_ARRAY:
+            kernelInfo.patchInfo.samplerStateArray =
+                reinterpret_cast<const SPatchSamplerStateArray *>(pPatch);
+            DBG_LOG(LogPatchTokens,
+                    "\n.SAMPLER_STATE_ARRAY", pPatch->Token,
+                    "\n  .Size", pPatch->Size,
+                    "\n  .Offset", kernelInfo.patchInfo.samplerStateArray->Offset,
+                    "\n  .Count", kernelInfo.patchInfo.samplerStateArray->Count,
+                    "\n  .BorderColorOffset", kernelInfo.patchInfo.samplerStateArray->BorderColorOffset);
+            break;
+
+        case PATCH_TOKEN_BINDING_TABLE_STATE:
+            kernelInfo.patchInfo.bindingTableState =
+                reinterpret_cast<const SPatchBindingTableState *>(pPatch);
+            kernelInfo.usesSsh = (kernelInfo.patchInfo.bindingTableState->Count > 0);
+            DBG_LOG(LogPatchTokens,
+                    "\n.BINDING_TABLE_STATE", pPatch->Token,
+                    "\n  .Size", pPatch->Size,
+                    "\n  .Offset", kernelInfo.patchInfo.bindingTableState->Offset,
+                    "\n  .Count", kernelInfo.patchInfo.bindingTableState->Count,
+                    "\n  .SurfaceStateOffset", kernelInfo.patchInfo.bindingTableState->SurfaceStateOffset);
+            break;
+
+        case PATCH_TOKEN_ALLOCATE_LOCAL_SURFACE:
+            kernelInfo.patchInfo.localsurface =
+                reinterpret_cast<const SPatchAllocateLocalSurface *>(pPatch);
+            kernelInfo.workloadInfo.slmStaticSize = kernelInfo.patchInfo.localsurface->TotalInlineLocalMemorySize;
+            DBG_LOG(LogPatchTokens,
+                    "\n.ALLOCATE_LOCAL_SURFACE", pPatch->Token,
+                    "\n  .Size", pPatch->Size,
+                    "\n  .TotalInlineLocalMemorySize", kernelInfo.patchInfo.localsurface->TotalInlineLocalMemorySize);
+            break;
+
+        case PATCH_TOKEN_MEDIA_VFE_STATE:
+            kernelInfo.patchInfo.mediavfestate =
+                reinterpret_cast<const SPatchMediaVFEState *>(pPatch);
+            DBG_LOG(LogPatchTokens,
+                    "\n.MEDIA_VFE_STATE", pPatch->Token,
+                    "\n  .Size", pPatch->Size,
+                    "\n  .ScratchSpaceOffset", kernelInfo.patchInfo.mediavfestate->ScratchSpaceOffset,
+                    "\n  .PerThreadScratchSpace", kernelInfo.patchInfo.mediavfestate->PerThreadScratchSpace);
+            break;
+
+        case PATCH_TOKEN_DATA_PARAMETER_BUFFER:
+            DBG_LOG(LogPatchTokens,
+                    "\n.DATA_PARAMETER_BUFFER", pPatch->Token,
+                    "\n  .Size", pPatch->Size);
+
+            pDataParameterBuffer = reinterpret_cast<const SPatchDataParameterBuffer *>(pPatch);
+            kernelInfo.patchInfo.dataParameterBuffers.push_back(
+                pDataParameterBuffer);
+            argNum = pDataParameterBuffer->ArgumentNumber;
+            switch (pDataParameterBuffer->Type) {
+            case DATA_PARAMETER_KERNEL_ARGUMENT:
+                kernelInfo.storeKernelArgument(pDataParameterBuffer);
+                DBG_LOG(LogPatchTokens, "\n  .Type", "KERNEL_ARGUMENT");
+                break;
+
+            case DATA_PARAMETER_LOCAL_WORK_SIZE: {
+                DBG_LOG(LogPatchTokens, "\n  .Type", "LOCAL_WORK_SIZE");
+                index = pDataParameterBuffer->SourceOffset / sizeof(uint32_t);
+                if (kernelInfo.workloadInfo.localWorkSizeOffsets[2] == WorkloadInfo::undefinedOffset) {
+                    kernelInfo.workloadInfo.localWorkSizeOffsets[index] =
+                        pDataParameterBuffer->Offset;
+                } else {
+                    kernelInfo.workloadInfo.localWorkSizeOffsets2[index] =
+                        pDataParameterBuffer->Offset;
+                }
+                break;
+            }
+
+            case DATA_PARAMETER_GLOBAL_WORK_OFFSET:
+                DBG_LOG(LogPatchTokens, "\n  .Type", "GLOBAL_WORK_OFFSET");
+                index = pDataParameterBuffer->SourceOffset / sizeof(uint32_t);
+                kernelInfo.workloadInfo.globalWorkOffsetOffsets[index] =
+                    pDataParameterBuffer->Offset;
+                break;
+
+            case DATA_PARAMETER_ENQUEUED_LOCAL_WORK_SIZE:
+                DBG_LOG(LogPatchTokens, "\n  .Type", "ENQUEUED_LOCAL_WORK_SIZE");
+                index = pDataParameterBuffer->SourceOffset / sizeof(uint32_t);
+                kernelInfo.workloadInfo.enqueuedLocalWorkSizeOffsets[index] =
+                    pDataParameterBuffer->Offset;
+                break;
+
+            case DATA_PARAMETER_GLOBAL_WORK_SIZE:
+                DBG_LOG(LogPatchTokens, "\n  .Type", "GLOBAL_WORK_SIZE");
+                index = pDataParameterBuffer->SourceOffset / sizeof(uint32_t);
+                kernelInfo.workloadInfo.globalWorkSizeOffsets[index] =
+                    pDataParameterBuffer->Offset;
+                break;
+
+            case DATA_PARAMETER_NUM_WORK_GROUPS:
+                DBG_LOG(LogPatchTokens, "\n  .Type", "NUM_WORK_GROUPS");
+                index = pDataParameterBuffer->SourceOffset / sizeof(uint32_t);
+                kernelInfo.workloadInfo.numWorkGroupsOffset[index] =
+                    pDataParameterBuffer->Offset;
+                break;
+
+            case DATA_PARAMETER_MAX_WORKGROUP_SIZE:
+                DBG_LOG(LogPatchTokens, "\n  .Type", "MAX_WORKGROUP_SIZE");
+                kernelInfo.workloadInfo.maxWorkGroupSizeOffset = pDataParameterBuffer->Offset;
+                break;
+
+            case DATA_PARAMETER_WORK_DIMENSIONS:
+                DBG_LOG(LogPatchTokens, "\n  .Type", "WORK_DIMENSIONS");
+                kernelInfo.workloadInfo.workDimOffset = pDataParameterBuffer->Offset;
+                break;
+
+            case DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_OBJECT_ARGUMENT_SIZES: {
+                DBG_LOG(LogPatchTokens, "\n  .Type", "SUM_OF_LOCAL_MEMORY_OBJECT_ARGUMENT_SIZES");
+                kernelInfo.resizeKernelArgInfoAndRegisterParameter(argNum);
+
+                KernelArgPatchInfo kernelArgPatchInfo;
+                kernelArgPatchInfo.size = pDataParameterBuffer->DataSize;
+                kernelArgPatchInfo.crossthreadOffset = pDataParameterBuffer->Offset;
+
+                kernelInfo.kernelArgInfo[argNum].slmAlignment = pDataParameterBuffer->SourceOffset;
+                kernelInfo.kernelArgInfo[argNum].kernelArgPatchInfoVector.push_back(kernelArgPatchInfo);
+            } break;
+
+            case DATA_PARAMETER_IMAGE_WIDTH:
+                DBG_LOG(LogPatchTokens, "\n  .Type", "IMAGE_WIDTH");
+                kernelInfo.resizeKernelArgInfoAndRegisterParameter(argNum);
+                kernelInfo.kernelArgInfo[argNum].offsetImgWidth = pDataParameterBuffer->Offset;
+                break;
+
+            case DATA_PARAMETER_IMAGE_HEIGHT:
+                DBG_LOG(LogPatchTokens, "\n  .Type", "IMAGE_HEIGHT");
+                kernelInfo.resizeKernelArgInfoAndRegisterParameter(argNum);
+                kernelInfo.kernelArgInfo[argNum].offsetImgHeight = pDataParameterBuffer->Offset;
+                break;
+
+            case DATA_PARAMETER_IMAGE_DEPTH:
+                DBG_LOG(LogPatchTokens, "\n  .Type", "IMAGE_DEPTH");
+                kernelInfo.resizeKernelArgInfoAndRegisterParameter(argNum);
+                kernelInfo.kernelArgInfo[argNum].offsetImgDepth = pDataParameterBuffer->Offset;
+                break;
+
+            case DATA_PARAMETER_SAMPLER_COORDINATE_SNAP_WA_REQUIRED:
+                DBG_LOG(LogPatchTokens, "\n  .Type", "SAMPLER_COORDINATE_SNAP_WA_REQUIRED");
+                kernelInfo.resizeKernelArgInfoAndRegisterParameter(argNum);
+                kernelInfo.kernelArgInfo[argNum].offsetSamplerSnapWa = pDataParameterBuffer->Offset;
+                break;
+            case DATA_PARAMETER_SAMPLER_ADDRESS_MODE:
+                DBG_LOG(LogPatchTokens, "\n  .Type", "SAMPLER_ADDRESS_MODE");
+                kernelInfo.resizeKernelArgInfoAndRegisterParameter(argNum);
+                kernelInfo.kernelArgInfo[argNum].offsetSamplerAddressingMode = pDataParameterBuffer->Offset;
+                break;
+            case DATA_PARAMETER_SAMPLER_NORMALIZED_COORDS:
+                DBG_LOG(LogPatchTokens, "\n  .Type", "SAMPLER_ADDRESS_MODE");
+                kernelInfo.resizeKernelArgInfoAndRegisterParameter(argNum);
+                kernelInfo.kernelArgInfo[argNum].offsetSamplerNormalizedCoords = pDataParameterBuffer->Offset;
+                break;
+            case DATA_PARAMETER_IMAGE_CHANNEL_DATA_TYPE:
+                DBG_LOG(LogPatchTokens, "\n  .Type", "SAMPLER_ADDRESS_MODE");
+                kernelInfo.resizeKernelArgInfoAndRegisterParameter(argNum);
+                kernelInfo.kernelArgInfo[argNum].offsetChannelDataType = pDataParameterBuffer->Offset;
+                break;
+
+            case DATA_PARAMETER_IMAGE_CHANNEL_ORDER:
+                DBG_LOG(LogPatchTokens, "\n  .Type", "IMAGE_CHANNEL_ORDER");
+                kernelInfo.resizeKernelArgInfoAndRegisterParameter(argNum);
+                kernelInfo.kernelArgInfo[argNum].offsetChannelOrder = pDataParameterBuffer->Offset;
+                break;
+            case DATA_PARAMETER_IMAGE_ARRAY_SIZE:
+                DBG_LOG(LogPatchTokens, "\n  .Type", "IMAGE_ARRAY_SIZE");
+                kernelInfo.resizeKernelArgInfoAndRegisterParameter(argNum);
+                kernelInfo.kernelArgInfo[argNum].offsetArraySize = pDataParameterBuffer->Offset;
+                break;
+
+            case DATA_PARAMETER_OBJECT_ID:
+                DBG_LOG(LogPatchTokens, "\n  .Type", "OBJECT_ID");
+                kernelInfo.resizeKernelArgInfoAndRegisterParameter(argNum);
+                kernelInfo.kernelArgInfo[argNum].offsetObjectId = pDataParameterBuffer->Offset;
+                break;
+
+            case DATA_PARAMETER_SIMD_SIZE:
+                DBG_LOG(LogPatchTokens, "\n  .Type", "SIMD_SIZE");
+                kernelInfo.workloadInfo.simdSizeOffset = pDataParameterBuffer->Offset;
+                break;
+
+            case DATA_PARAMETER_PARENT_EVENT:
+                DBG_LOG(LogPatchTokens, "\n  .Type", "PARENT_EVENT");
+                kernelInfo.workloadInfo.parentEventOffset = pDataParameterBuffer->Offset;
+                break;
+
+            case DATA_PARAMETER_CHILD_BLOCK_SIMD_SIZE:
+                DBG_LOG(LogPatchTokens, "\n  .Type", "CHILD_BLOCK_SIMD_SIZE");
+                kernelInfo.childrenKernelsIdOffset.push_back({argNum, pDataParameterBuffer->Offset});
+                break;
+
+            case DATA_PARAMETER_PRIVATE_MEMORY_STATELESS_SIZE:
+                DBG_LOG(LogPatchTokens, "\n  .Type", "PRIVATE_MEMORY_STATELESS_SIZE");
+                PrivateMemoryStatelessSizeOffset = pDataParameterBuffer->Offset;
+                break;
+            case DATA_PARAMETER_LOCAL_MEMORY_STATELESS_WINDOW_SIZE:
+                DBG_LOG(LogPatchTokens, "\n  .Type", "LOCAL_MEMORY_STATELESS_WINDOW_SIZE");
+                LocalMemoryStatelessWindowSizeOffset = pDataParameterBuffer->Offset;
+                break;
+            case DATA_PARAMETER_LOCAL_MEMORY_STATELESS_WINDOW_START_ADDRESS:
+                DBG_LOG(LogPatchTokens, "\n  .Type", "LOCAL_MEMORY_STATELESS_WINDOW_START_ADDRESS");
+                LocalMemoryStatelessWindowStartAddressOffset = pDataParameterBuffer->Offset;
+                pDevice->prepareSLMWindow();
+                break;
+            case DATA_PARAMETER_PREFERRED_WORKGROUP_MULTIPLE:
+                DBG_LOG(LogPatchTokens, "\n  .Type", "PREFERRED_WORKGROUP_MULTIPLE");
+                kernelInfo.workloadInfo.prefferedWkgMultipleOffset = pDataParameterBuffer->Offset;
+                break;
+            case DATA_PARAMETER_BUFFER_OFFSET:
+                DBG_LOG(LogPatchTokens, "\n  .Type", "DATA_PARAMETER_BUFFER_OFFSET");
+                kernelInfo.resizeKernelArgInfoAndRegisterParameter(argNum);
+                kernelInfo.kernelArgInfo[argNum].offsetBufferOffset = pDataParameterBuffer->Offset;
+                break;
+            case DATA_PARAMETER_NUM_HARDWARE_THREADS:
+            case DATA_PARAMETER_PRINTF_SURFACE_SIZE:
+                DBG_LOG(LogPatchTokens, "\n  .Type", "Unhandled", pDataParameterBuffer->Type);
+                printDebugString(DebugManager.flags.PrintDebugMessages.get(), stderr,
+                                 "Program::parsePatchList.Unhandled Data parameter: %d\n", pDataParameterBuffer->Type);
+                break;
+
+            case DATA_PARAMETER_VME_MB_BLOCK_TYPE:
+                DBG_LOG(LogPatchTokens, "\n  .Type", "VME_MB_BLOCK_TYPE");
+                kernelInfo.resizeKernelArgInfoAndRegisterParameter(argNum);
+                kernelInfo.kernelArgInfo[argNum].offsetVmeMbBlockType = pDataParameterBuffer->Offset;
+                DEBUG_BREAK_IF(pDataParameterBuffer->DataSize != sizeof(uint32_t));
+                break;
+            case DATA_PARAMETER_VME_SUBPIXEL_MODE:
+                DBG_LOG(LogPatchTokens, "\n  .Type", "VME_SUBPIXEL_MODE");
+                kernelInfo.resizeKernelArgInfoAndRegisterParameter(argNum);
+                kernelInfo.kernelArgInfo[argNum].offsetVmeSubpixelMode = pDataParameterBuffer->Offset;
+                DEBUG_BREAK_IF(pDataParameterBuffer->DataSize != sizeof(uint32_t));
+                break;
+            case DATA_PARAMETER_VME_SAD_ADJUST_MODE:
+                DBG_LOG(LogPatchTokens, "\n  .Type", "VME_SAD_ADJUST_MODE");
+                kernelInfo.resizeKernelArgInfoAndRegisterParameter(argNum);
+                kernelInfo.kernelArgInfo[argNum].offsetVmeSadAdjustMode = pDataParameterBuffer->Offset;
+                DEBUG_BREAK_IF(pDataParameterBuffer->DataSize != sizeof(uint32_t));
+                break;
+            case DATA_PARAMETER_VME_SEARCH_PATH_TYPE:
+                DBG_LOG(LogPatchTokens, "\n  .Type", "VME_SEARCH_PATH_TYPE");
+                kernelInfo.resizeKernelArgInfoAndRegisterParameter(argNum);
+                kernelInfo.kernelArgInfo[argNum].offsetVmeSearchPathType = pDataParameterBuffer->Offset;
+                DEBUG_BREAK_IF(pDataParameterBuffer->DataSize != sizeof(uint32_t));
+                break;
+            case DATA_PARAMETER_IMAGE_NUM_SAMPLES:
+                DBG_LOG(LogPatchTokens, "\n  .Type", "IMAGE_NUM_SAMPLES");
+                kernelInfo.resizeKernelArgInfoAndRegisterParameter(argNum);
+                kernelInfo.kernelArgInfo[argNum].offsetNumSamples = pDataParameterBuffer->Offset;
+                DEBUG_BREAK_IF(pDataParameterBuffer->DataSize != sizeof(uint32_t));
+                break;
+
+            case DATA_PARAMETER_IMAGE_NUM_MIP_LEVELS:
+            case DATA_PARAMETER_IMAGE_SRGB_CHANNEL_ORDER:
+            case DATA_PARAMETER_STAGE_IN_GRID_ORIGIN:
+            case DATA_PARAMETER_STAGE_IN_GRID_SIZE:
+                break;
+
+            case DATA_PARAMETER_LOCAL_ID:
+            case DATA_PARAMETER_EXECUTION_MASK:
+            case DATA_PARAMETER_VME_IMAGE_TYPE:
+            case DATA_PARAMETER_VME_MB_SKIP_BLOCK_TYPE:
+                break;
+
+            default:
+                DBG_LOG(LogPatchTokens, "\n  .Type", "Unhandled", pDataParameterBuffer->Type);
+                DEBUG_BREAK_IF(true);
+            }
+
+            DBG_LOG(LogPatchTokens,
+                    "\n  .ArgumentNumber", pDataParameterBuffer->ArgumentNumber,
+                    "\n  .Offset", pDataParameterBuffer->Offset,
+                    "\n  .DataSize", pDataParameterBuffer->DataSize,
+                    "\n  .SourceOffset", pDataParameterBuffer->SourceOffset);
+
+            break;
+
+        case PATCH_TOKEN_MEDIA_INTERFACE_DESCRIPTOR_LOAD:
+            kernelInfo.patchInfo.interfaceDescriptorDataLoad =
+                reinterpret_cast<const SPatchMediaInterfaceDescriptorLoad *>(pPatch);
+            DBG_LOG(LogPatchTokens,
+                    "\n.MEDIA_INTERFACE_DESCRIPTOR_LOAD", pPatch->Token,
+                    "\n  .Size", pPatch->Size,
+                    "\n  .InterfaceDescriptorDataOffset", kernelInfo.patchInfo.interfaceDescriptorDataLoad->InterfaceDescriptorDataOffset);
+            break;
+
+        case PATCH_TOKEN_INTERFACE_DESCRIPTOR_DATA:
+            kernelInfo.patchInfo.interfaceDescriptorData =
+                reinterpret_cast<const SPatchInterfaceDescriptorData *>(pPatch);
+            DBG_LOG(LogPatchTokens,
+                    "\n.INTERFACE_DESCRIPTOR_DATA", pPatch->Token,
+                    "\n  .Size", pPatch->Size,
+                    "\n  .Offset", kernelInfo.patchInfo.interfaceDescriptorData->Offset,
+                    "\n  .SamplerStateOffset", kernelInfo.patchInfo.interfaceDescriptorData->SamplerStateOffset,
+                    "\n  .KernelOffset", kernelInfo.patchInfo.interfaceDescriptorData->KernelOffset,
+                    "\n  .BindingTableOffset", kernelInfo.patchInfo.interfaceDescriptorData->BindingTableOffset);
+            break;
+
+        case PATCH_TOKEN_THREAD_PAYLOAD:
+            kernelInfo.patchInfo.threadPayload =
+                reinterpret_cast<const SPatchThreadPayload *>(pPatch);
+            DBG_LOG(LogPatchTokens,
+                    "\n.THREAD_PAYLOAD", pPatch->Token,
+                    "\n  .Size", pPatch->Size,
+                    "\n  .HeaderPresent", kernelInfo.patchInfo.threadPayload->HeaderPresent,
+                    "\n  .LocalIDXPresent", kernelInfo.patchInfo.threadPayload->LocalIDXPresent,
+                    "\n  .LocalIDYPresent", kernelInfo.patchInfo.threadPayload->LocalIDYPresent,
+                    "\n  .LocalIDZPresent", kernelInfo.patchInfo.threadPayload->LocalIDZPresent,
+                    "\n  .LocalIDFlattenedPresent", kernelInfo.patchInfo.threadPayload->LocalIDFlattenedPresent,
+                    "\n  .IndirectPayloadStorage", kernelInfo.patchInfo.threadPayload->IndirectPayloadStorage,
+                    "\n  .UnusedPerThreadConstantPresent", kernelInfo.patchInfo.threadPayload->UnusedPerThreadConstantPresent,
+                    "\n  .GetLocalIDPresent", kernelInfo.patchInfo.threadPayload->GetLocalIDPresent,
+                    "\n  .GetGroupIDPresent", kernelInfo.patchInfo.threadPayload->GetGroupIDPresent,
+                    "\n  .GetGlobalOffsetPresent", kernelInfo.patchInfo.threadPayload->GetGlobalOffsetPresent);
+            break;
+
+        case PATCH_TOKEN_EXECUTION_ENVIRONMENT:
+            kernelInfo.patchInfo.executionEnvironment =
+                reinterpret_cast<const SPatchExecutionEnvironment *>(pPatch);
+            if (kernelInfo.patchInfo.executionEnvironment->RequiredWorkGroupSizeX != 0) {
+                kernelInfo.reqdWorkGroupSize[0] = kernelInfo.patchInfo.executionEnvironment->RequiredWorkGroupSizeX;
+                kernelInfo.reqdWorkGroupSize[1] = kernelInfo.patchInfo.executionEnvironment->RequiredWorkGroupSizeY;
+                kernelInfo.reqdWorkGroupSize[2] = kernelInfo.patchInfo.executionEnvironment->RequiredWorkGroupSizeZ;
+                DEBUG_BREAK_IF(!(kernelInfo.patchInfo.executionEnvironment->RequiredWorkGroupSizeY > 0));
+                DEBUG_BREAK_IF(!(kernelInfo.patchInfo.executionEnvironment->RequiredWorkGroupSizeZ > 0));
+            }
+            if (kernelInfo.patchInfo.executionEnvironment->CompiledForGreaterThan4GBBuffers == false) {
+                kernelInfo.requiresSshForBuffers = true;
+            }
+            DBG_LOG(LogPatchTokens,
+                    "\n.EXECUTION_ENVIRONMENT", pPatch->Token,
+                    "\n  .Size", pPatch->Size,
+                    "\n  .RequiredWorkGroupSizeX", kernelInfo.patchInfo.executionEnvironment->RequiredWorkGroupSizeX,
+                    "\n  .RequiredWorkGroupSizeY", kernelInfo.patchInfo.executionEnvironment->RequiredWorkGroupSizeY,
+                    "\n  .RequiredWorkGroupSizeZ", kernelInfo.patchInfo.executionEnvironment->RequiredWorkGroupSizeZ,
+                    "\n  .LargestCompiledSIMDSize", kernelInfo.patchInfo.executionEnvironment->LargestCompiledSIMDSize,
+                    "\n  .CompiledSubGroupsNumber", kernelInfo.patchInfo.executionEnvironment->CompiledSubGroupsNumber,
+                    "\n  .HasBarriers", kernelInfo.patchInfo.executionEnvironment->HasBarriers,
+                    "\n  .DisableMidThreadPreemption", kernelInfo.patchInfo.executionEnvironment->DisableMidThreadPreemption,
+                    "\n  .CompiledSIMD8", kernelInfo.patchInfo.executionEnvironment->CompiledSIMD8,
+                    "\n  .CompiledSIMD16", kernelInfo.patchInfo.executionEnvironment->CompiledSIMD16,
+                    "\n  .CompiledSIMD32", kernelInfo.patchInfo.executionEnvironment->CompiledSIMD32,
+                    "\n  .HasDeviceEnqueue", kernelInfo.patchInfo.executionEnvironment->HasDeviceEnqueue,
+                    "\n  .MayAccessUndeclaredResource", kernelInfo.patchInfo.executionEnvironment->MayAccessUndeclaredResource,
+                    "\n  .UsesFencesForReadWriteImages", kernelInfo.patchInfo.executionEnvironment->UsesFencesForReadWriteImages,
+                    "\n  .UsesStatelessSpillFill", kernelInfo.patchInfo.executionEnvironment->UsesStatelessSpillFill,
+                    "\n  .IsCoherent", kernelInfo.patchInfo.executionEnvironment->IsCoherent,
+                    "\n  .SubgroupIndependentForwardProgressRequired", kernelInfo.patchInfo.executionEnvironment->SubgroupIndependentForwardProgressRequired);
+            break;
+
+        case PATCH_TOKEN_DATA_PARAMETER_STREAM:
+            kernelInfo.patchInfo.dataParameterStream =
+                reinterpret_cast<const SPatchDataParameterStream *>(pPatch);
+            DBG_LOG(LogPatchTokens,
+                    "\n.DATA_PARAMETER_STREAM", pPatch->Token,
+                    "\n  .Size", pPatch->Size,
+                    "\n  .DataParameterStreamSize", kernelInfo.patchInfo.dataParameterStream->DataParameterStreamSize);
+            break;
+
+        case PATCH_TOKEN_KERNEL_ARGUMENT_INFO: {
+            auto pkernelArgInfo = reinterpret_cast<const SPatchKernelArgumentInfo *>(pPatch);
+            kernelInfo.storeArgInfo(pkernelArgInfo);
+            DBG_LOG(LogPatchTokens,
+                    "\n.KERNEL_ARGUMENT_INFO", pPatch->Token,
+                    "\n  .Size", pPatch->Size,
+                    "\n  .ArgumentNumber", pkernelArgInfo->ArgumentNumber,
+                    "\n  .AddressQualifierSize", pkernelArgInfo->AddressQualifierSize,
+                    "\n  .AccessQualifierSize", pkernelArgInfo->AccessQualifierSize,
+                    "\n  .ArgumentNameSize", pkernelArgInfo->ArgumentNameSize,
+                    "\n  .TypeNameSize", pkernelArgInfo->TypeNameSize,
+                    "\n  .TypeQualifierSize", pkernelArgInfo->TypeQualifierSize);
+            break;
+        }
+
+        case PATCH_TOKEN_KERNEL_ATTRIBUTES_INFO:
+            kernelInfo.patchInfo.pKernelAttributesInfo =
+                reinterpret_cast<const SPatchKernelAttributesInfo *>(pPatch);
+            kernelInfo.storePatchToken(kernelInfo.patchInfo.pKernelAttributesInfo);
+            DBG_LOG(LogPatchTokens,
+                    "\n.KERNEL_ATTRIBUTES_INFO", pPatch->Token,
+                    "\n  .Size", pPatch->Size,
+                    "\n  .AttributesSize", kernelInfo.patchInfo.pKernelAttributesInfo->AttributesSize);
+            break;
+
+        case PATCH_TOKEN_SAMPLER_KERNEL_ARGUMENT: {
+            const SPatchSamplerKernelArgument *pSamplerKernelObjectKernelArg = nullptr;
+
+            pSamplerKernelObjectKernelArg = reinterpret_cast<const SPatchSamplerKernelArgument *>(pPatch);
+            kernelInfo.storeKernelArgument(pSamplerKernelObjectKernelArg);
+            DBG_LOG(LogPatchTokens,
+                    "\n.SAMPLER_KERNEL_ARGUMENT", pPatch->Token,
+                    "\n  .Size", pPatch->Size,
+                    "\n  .ArgumentNumber", pSamplerKernelObjectKernelArg->ArgumentNumber,
+                    "\n  .Type", pSamplerKernelObjectKernelArg->Type,
+                    "\n  .Offset", pSamplerKernelObjectKernelArg->Offset);
+        };
+            break;
+
+        case PATCH_TOKEN_IMAGE_MEMORY_OBJECT_KERNEL_ARGUMENT: {
+            const SPatchImageMemoryObjectKernelArgument *pImageMemObjectKernelArg = nullptr;
+
+            pImageMemObjectKernelArg =
+                reinterpret_cast<const SPatchImageMemoryObjectKernelArgument *>(pPatch);
+            kernelInfo.storeKernelArgument(pImageMemObjectKernelArg);
+            DBG_LOG(LogPatchTokens,
+                    "\n.IMAGE_MEMORY_OBJECT_KERNEL_ARGUMENT", pPatch->Token,
+                    "\n  .Size", pPatch->Size,
+                    "\n  .ArgumentNumber", pImageMemObjectKernelArg->ArgumentNumber,
+                    "\n  .Type", pImageMemObjectKernelArg->Type,
+                    "\n  .Offset", pImageMemObjectKernelArg->Offset,
+                    "\n  .LocationIndex", pImageMemObjectKernelArg->LocationIndex,
+                    "\n  .LocationIndex2", pImageMemObjectKernelArg->LocationIndex2,
+                    "\n  .Transformable", pImageMemObjectKernelArg->Transformable);
+        };
+            break;
+
+        case PATCH_TOKEN_GLOBAL_MEMORY_OBJECT_KERNEL_ARGUMENT: {
+            const SPatchGlobalMemoryObjectKernelArgument *pGlobalMemObjectKernelArg = nullptr;
+            pGlobalMemObjectKernelArg =
+                reinterpret_cast<const SPatchGlobalMemoryObjectKernelArgument *>(pPatch);
+            kernelInfo.storeKernelArgument(pGlobalMemObjectKernelArg);
+            DBG_LOG(LogPatchTokens,
+                    "\n.GLOBAL_MEMORY_OBJECT_KERNEL_ARGUMENT", pPatch->Token,
+                    "\n  .Size", pPatch->Size,
+                    "\n  .ArgumentNumber", pGlobalMemObjectKernelArg->ArgumentNumber,
+                    "\n  .Offset", pGlobalMemObjectKernelArg->Offset,
+                    "\n  .LocationIndex", pGlobalMemObjectKernelArg->LocationIndex,
+                    "\n  .LocationIndex2", pGlobalMemObjectKernelArg->LocationIndex2);
+        };
+            break;
+
+        case PATCH_TOKEN_STATELESS_GLOBAL_MEMORY_OBJECT_KERNEL_ARGUMENT: {
+            const SPatchStatelessGlobalMemoryObjectKernelArgument *pStatelessGlobalMemObjKernelArg = nullptr;
+
+            pStatelessGlobalMemObjKernelArg =
+                reinterpret_cast<const SPatchStatelessGlobalMemoryObjectKernelArgument *>(pPatch);
+            kernelInfo.storeKernelArgument(pStatelessGlobalMemObjKernelArg);
+            DBG_LOG(LogPatchTokens,
+                    "\n.STATELESS_GLOBAL_MEMORY_OBJECT_KERNEL_ARGUMENT", pPatch->Token,
+                    "\n  .Size", pPatch->Size,
+                    "\n  .ArgumentNumber", pStatelessGlobalMemObjKernelArg->ArgumentNumber,
+                    "\n  .SurfaceStateHeapOffset", pStatelessGlobalMemObjKernelArg->SurfaceStateHeapOffset,
+                    "\n  .DataParamOffset", pStatelessGlobalMemObjKernelArg->DataParamOffset,
+                    "\n  .DataParamSize", pStatelessGlobalMemObjKernelArg->DataParamSize,
+                    "\n  .LocationIndex", pStatelessGlobalMemObjKernelArg->LocationIndex,
+                    "\n  .LocationIndex2", pStatelessGlobalMemObjKernelArg->LocationIndex2);
+        };
+            break;
+
+        case PATCH_TOKEN_STATELESS_CONSTANT_MEMORY_OBJECT_KERNEL_ARGUMENT: {
+            const SPatchStatelessConstantMemoryObjectKernelArgument *pPatchToken = reinterpret_cast<const SPatchStatelessConstantMemoryObjectKernelArgument *>(pPatch);
+            kernelInfo.storeKernelArgument(pPatchToken);
+            DBG_LOG(LogPatchTokens,
+                    "\n.STATELESS_CONSTANT_MEMORY_OBJECT_KERNEL_ARGUMENT", pPatch->Token,
+                    "\n  .Size", pPatch->Size,
+                    "\n  .ArgumentNumber", pPatchToken->ArgumentNumber,
+                    "\n  .SurfaceStateHeapOffset", pPatchToken->SurfaceStateHeapOffset,
+                    "\n  .DataParamOffset", pPatchToken->DataParamOffset,
+                    "\n  .DataParamSize", pPatchToken->DataParamSize);
+        } break;
+
+        case PATCH_TOKEN_STATELESS_DEVICE_QUEUE_KERNEL_ARGUMENT: {
+            const SPatchStatelessDeviceQueueKernelArgument *pPatchToken = reinterpret_cast<const SPatchStatelessDeviceQueueKernelArgument *>(pPatch);
+            kernelInfo.storeKernelArgument(pPatchToken);
+            DBG_LOG(LogPatchTokens,
+                    "\n.STATELESS_DEVICE_QUEUE_KERNEL_ARGUMENT", pPatch->Token,
+                    "\n  .Size", pPatch->Size,
+                    "\n  .ArgumentNumber", pPatchToken->ArgumentNumber,
+                    "\n  .SurfaceStateHeapOffset", pPatchToken->SurfaceStateHeapOffset,
+                    "\n  .DataParamOffset", pPatchToken->DataParamOffset,
+                    "\n  .DataParamSize", pPatchToken->DataParamSize);
+        } break;
+
+        case PATCH_TOKEN_ALLOCATE_STATELESS_PRIVATE_MEMORY: {
+            const SPatchAllocateStatelessPrivateSurface *pPatchToken = reinterpret_cast<const SPatchAllocateStatelessPrivateSurface *>(pPatch);
+            kernelInfo.storePatchToken(pPatchToken);
+            DBG_LOG(LogPatchTokens,
+                    "\n.ALLOCATE_STATELESS_PRIVATE_MEMORY", pPatch->Token,
+                    "\n  .Size", pPatch->Size,
+                    "\n  .SurfaceStateHeapOffset", pPatchToken->SurfaceStateHeapOffset,
+                    "\n  .DataParamOffset", pPatchToken->DataParamOffset,
+                    "\n  .DataParamSize", pPatchToken->DataParamSize,
+                    "\n  .PerThreadPrivateMemorySize", pPatchToken->PerThreadPrivateMemorySize);
+        } break;
+
+        case PATCH_TOKEN_ALLOCATE_STATELESS_CONSTANT_MEMORY_SURFACE_WITH_INITIALIZATION: {
+            const SPatchAllocateStatelessConstantMemorySurfaceWithInitialization *pPatchToken = reinterpret_cast<const SPatchAllocateStatelessConstantMemorySurfaceWithInitialization *>(pPatch);
+            kernelInfo.storePatchToken(pPatchToken);
+            DBG_LOG(LogPatchTokens,
+                    "\n.ALLOCATE_STATELESS_CONSTANT_MEMORY_SURFACE_WITH_INITIALIZATION", pPatch->Token,
+                    "\n  .Size", pPatch->Size,
+                    "\n  .ConstantBufferIndex", pPatchToken->ConstantBufferIndex,
+                    "\n  .SurfaceStateHeapOffset", pPatchToken->SurfaceStateHeapOffset,
+                    "\n  .DataParamOffset", pPatchToken->DataParamOffset,
+                    "\n  .DataParamSize", pPatchToken->DataParamSize);
+        } break;
+
+        case PATCH_TOKEN_ALLOCATE_STATELESS_GLOBAL_MEMORY_SURFACE_WITH_INITIALIZATION: {
+            const SPatchAllocateStatelessGlobalMemorySurfaceWithInitialization *pPatchToken = reinterpret_cast<const SPatchAllocateStatelessGlobalMemorySurfaceWithInitialization *>(pPatch);
+            kernelInfo.storePatchToken(pPatchToken);
+            DBG_LOG(LogPatchTokens,
+                    "\n.ALLOCATE_STATELESS_GLOBAL_MEMORY_SURFACE_WITH_INITIALIZATION", pPatch->Token,
+                    "\n  .Size", pPatch->Size,
+                    "\n  .GlobalBufferIndex", pPatchToken->GlobalBufferIndex,
+                    "\n  .SurfaceStateHeapOffset", pPatchToken->SurfaceStateHeapOffset,
+                    "\n  .DataParamOffset", pPatchToken->DataParamOffset,
+                    "\n  .DataParamSize", pPatchToken->DataParamSize);
+        } break;
+
+        case PATCH_TOKEN_ALLOCATE_STATELESS_PRINTF_SURFACE: {
+            const SPatchAllocateStatelessPrintfSurface *pPatchToken = reinterpret_cast<const SPatchAllocateStatelessPrintfSurface *>(pPatch);
+            kernelInfo.storePatchToken(pPatchToken);
+            DBG_LOG(LogPatchTokens,
+                    "\n.ALLOCATE_STATELESS_PRINTF_SURFACE", pPatch->Token,
+                    "\n  .Size", pPatch->Size,
+                    "\n  .PrintfSurfaceIndex", pPatchToken->PrintfSurfaceIndex,
+                    "\n  .SurfaceStateHeapOffset", pPatchToken->SurfaceStateHeapOffset,
+                    "\n  .DataParamOffset", pPatchToken->DataParamOffset,
+                    "\n  .DataParamSize", pPatchToken->DataParamSize);
+        } break;
+
+        case PATCH_TOKEN_ALLOCATE_STATELESS_EVENT_POOL_SURFACE: {
+            const SPatchAllocateStatelessEventPoolSurface *pPatchToken = reinterpret_cast<const SPatchAllocateStatelessEventPoolSurface *>(pPatch);
+            kernelInfo.storePatchToken(pPatchToken);
+            DBG_LOG(LogPatchTokens,
+                    "\n.ALLOCATE_STATELESS_EVENT_POOL_SURFACE", pPatch->Token,
+                    "\n  .Size", pPatch->Size,
+                    "\n  .EventPoolSurfaceIndex", pPatchToken->EventPoolSurfaceIndex,
+                    "\n  .SurfaceStateHeapOffset", pPatchToken->SurfaceStateHeapOffset,
+                    "\n  .DataParamOffset", pPatchToken->DataParamOffset,
+                    "\n  .DataParamSize", pPatchToken->DataParamSize);
+        } break;
+
+        case PATCH_TOKEN_ALLOCATE_STATELESS_DEFAULT_DEVICE_QUEUE_SURFACE: {
+            const SPatchAllocateStatelessDefaultDeviceQueueSurface *pPatchToken = reinterpret_cast<const SPatchAllocateStatelessDefaultDeviceQueueSurface *>(pPatch);
+            kernelInfo.storePatchToken(pPatchToken);
+            DBG_LOG(LogPatchTokens,
+                    "\n.ALLOCATE_STATELESS_DEFAULT_DEVICE_QUEUE_SURFACE", pPatch->Token,
+                    "\n  .Size", pPatch->Size,
+                    "\n  .SurfaceStateHeapOffset", pPatchToken->SurfaceStateHeapOffset,
+                    "\n  .DataParamOffset", pPatchToken->DataParamOffset,
+                    "\n  .DataParamSize", pPatchToken->DataParamSize);
+        } break;
+
+        case PATCH_TOKEN_STRING: {
+            const SPatchString *pPatchToken = reinterpret_cast<const SPatchString *>(pPatch);
+            kernelInfo.storePatchToken(pPatchToken);
+            DBG_LOG(LogPatchTokens,
+                    "\n.STRING", pPatch->Token,
+                    "\n  .Size", pPatch->Size,
+                    "\n  .Index", pPatchToken->Index,
+                    "\n  .StringSize", pPatchToken->StringSize);
+        } break;
+
+        case PATCH_TOKEN_INLINE_VME_SAMPLER_INFO:
+            kernelInfo.isVmeWorkload = true;
+            DBG_LOG(LogPatchTokens,
+                    "\n.INLINE_VME_SAMPLER_INFO", pPatch->Token,
+                    "\n  .Size", pPatch->Size);
+            break;
+
+        case PATCH_TOKEN_GTPIN_FREE_GRF_INFO: {
+            const SPatchGtpinFreeGRFInfo *pPatchToken = reinterpret_cast<const SPatchGtpinFreeGRFInfo *>(pPatch);
+            DBG_LOG(LogPatchTokens,
+                    "\n.PATCH_TOKEN_GTPIN_FREE_GRF_INFO", pPatch->Token,
+                    "\n  .Size", pPatch->Size,
+                    "\n  .BufferSize", pPatchToken->BufferSize);
+        } break;
+
+        case PATCH_TOKEN_STATE_SIP: {
+            const SPatchStateSIP *pPatchToken = reinterpret_cast<const SPatchStateSIP *>(pPatch);
+            kernelInfo.systemKernelOffset = pPatchToken->SystemKernelOffset;
+            DBG_LOG(LogPatchTokens,
+                    "\n.PATCH_TOKEN_STATE_SIP", pPatch->Token,
+                    "\n  .Size", pPatch->Size,
+                    "\n  .SystemKernelOffset", pPatchToken->SystemKernelOffset);
+        } break;
+
+        default:
+            printDebugString(DebugManager.flags.PrintDebugMessages.get(), stderr, " Program::parsePatchList. Unknown Patch Token: %d\n", pPatch->Token);
+            if (false == isSafeToSkipUnhandledToken(pPatch->Token)) {
+                retVal = CL_INVALID_KERNEL;
+            }
+            break;
+        }
+
+        if (retVal != CL_SUCCESS) {
+            break;
+        }
+        pCurPatchListPtr = ptrOffset(pCurPatchListPtr, pPatch->Size);
+    }
+
+    if (retVal == CL_SUCCESS) {
+        retVal = kernelInfo.resolveKernelInfo();
+    }
+
+    if (kernelInfo.patchInfo.dataParameterStream && kernelInfo.patchInfo.dataParameterStream->DataParameterStreamSize) {
+        uint32_t crossThreadDataSize = kernelInfo.patchInfo.dataParameterStream->DataParameterStreamSize;
+        kernelInfo.crossThreadData = new char[crossThreadDataSize];
+        memset(kernelInfo.crossThreadData, 0x00, crossThreadDataSize);
+
+        if (LocalMemoryStatelessWindowStartAddressOffset != 0xFFffFFff) {
+            *(uintptr_t *)&(kernelInfo.crossThreadData[LocalMemoryStatelessWindowStartAddressOffset]) = reinterpret_cast<uintptr_t>(this->pDevice->getSLMWindowStartAddress());
+        }
+
+        if (LocalMemoryStatelessWindowSizeOffset != 0xFFffFFff) {
+            *(uint32_t *)&(kernelInfo.crossThreadData[LocalMemoryStatelessWindowSizeOffset]) = (uint32_t)this->pDevice->getDeviceInfo().localMemSize;
+        }
+
+        if (kernelInfo.patchInfo.pAllocateStatelessPrivateSurface && (PrivateMemoryStatelessSizeOffset != 0xFFffFFff)) {
+            *(uint32_t *)&(kernelInfo.crossThreadData[PrivateMemoryStatelessSizeOffset]) = kernelInfo.patchInfo.pAllocateStatelessPrivateSurface->PerThreadPrivateMemorySize * this->getDevice(0).getDeviceInfo().computeUnitsUsedForScratch * kernelInfo.getMaxSimdSize();
+        }
+
+        if (kernelInfo.workloadInfo.maxWorkGroupSizeOffset != WorkloadInfo::undefinedOffset) {
+            *(uint32_t *)&(kernelInfo.crossThreadData[kernelInfo.workloadInfo.maxWorkGroupSizeOffset]) = (uint32_t)this->getDevice(0).getDeviceInfo().maxWorkGroupSize;
+        }
+    }
+
+    return retVal;
+}
+
+// Walks the program-scope patch token list (programScopePatchList, of
+// programScopePatchListSize bytes) and applies every token it understands:
+//  - ALLOCATE_{CONSTANT,GLOBAL}_MEMORY_SURFACE_PROGRAM_BINARY_INFO: releases any
+//    previously created surface, allocates a new one and fills it with the inline
+//    data that directly follows the token structure.
+//  - {GLOBAL,CONSTANT}_POINTER_PROGRAM_BINARY_INFO: adds the surface address to the
+//    value stored at the given offset inside the already created surface.
+// Unknown tokens are skipped when isSafeToSkipUnhandledToken() permits it.
+//
+// Returns CL_SUCCESS on success, CL_INVALID_BINARY for a zero-sized or unsupported
+// token, and CL_OUT_OF_HOST_MEMORY when a surface allocation yields nullptr.
+cl_int Program::parseProgramScopePatchList() {
+    cl_int retVal = CL_SUCCESS;
+    cl_uint surfaceSize = 0;
+
+    auto pPatchList = programScopePatchList;
+    auto patchListSize = programScopePatchListSize;
+    auto pCurPatchListPtr = pPatchList;
+    cl_uint headerSize = 0;
+
+    while (ptrDiff(pCurPatchListPtr, pPatchList) < patchListSize) {
+        auto pPatch = reinterpret_cast<const SPatchItemHeader *>(pCurPatchListPtr);
+
+        // The cursor is advanced by pPatch->Size at the bottom of the loop. A token that
+        // reports a size of zero would never move it forward and the loop would not terminate.
+        if (pPatch->Size == 0) {
+            printDebugString(DebugManager.flags.PrintDebugMessages.get(), stderr, " Program::parseProgramScopePatchList. Zero-sized Patch Token: %d\n", pPatch->Token);
+            retVal = CL_INVALID_BINARY;
+            break;
+        }
+
+        switch (pPatch->Token) {
+        case PATCH_TOKEN_ALLOCATE_CONSTANT_MEMORY_SURFACE_PROGRAM_BINARY_INFO: {
+            auto patch = *reinterpret_cast<const SPatchAllocateConstantMemorySurfaceProgramBinaryInfo *>(pPatch);
+
+            // A later token replaces the surface created by an earlier one.
+            if (constantSurface) {
+                pDevice->getMemoryManager()->freeGraphicsMemory(constantSurface);
+            }
+
+            surfaceSize = patch.InlineDataSize;
+            headerSize = sizeof(SPatchAllocateConstantMemorySurfaceProgramBinaryInfo);
+
+            constantSurface = pDevice->getMemoryManager()->createGraphicsAllocationWithRequiredBitness(surfaceSize, nullptr);
+            if (constantSurface == nullptr) {
+                // Do not dereference a failed allocation; report it to the caller instead.
+                retVal = CL_OUT_OF_HOST_MEMORY;
+                break;
+            }
+
+            // The initialization data is stored inline, right behind the token structure.
+            memcpy_s(constantSurface->getUnderlyingBuffer(), surfaceSize, reinterpret_cast<const cl_char *>(pPatch) + headerSize, surfaceSize);
+            // Skip the inline data here; the token itself is skipped at the bottom of the loop.
+            pCurPatchListPtr = ptrOffset(pCurPatchListPtr, surfaceSize);
+            DBG_LOG(LogPatchTokens,
+                    "\n  .ALLOCATE_CONSTANT_MEMORY_SURFACE_PROGRAM_BINARY_INFO", pPatch->Token,
+                    "\n  .Size", pPatch->Size,
+                    "\n  .ConstantBufferIndex", patch.ConstantBufferIndex,
+                    "\n  .InitializationDataSize", patch.InlineDataSize);
+        } break;
+
+        case PATCH_TOKEN_ALLOCATE_GLOBAL_MEMORY_SURFACE_PROGRAM_BINARY_INFO: {
+            auto patch = *reinterpret_cast<const SPatchAllocateGlobalMemorySurfaceProgramBinaryInfo *>(pPatch);
+
+            // A later token replaces the surface created by an earlier one.
+            if (globalSurface) {
+                pDevice->getMemoryManager()->freeGraphicsMemory(globalSurface);
+            }
+
+            surfaceSize = patch.InlineDataSize;
+            globalVarTotalSize += static_cast<size_t>(surfaceSize);
+            headerSize = sizeof(SPatchAllocateGlobalMemorySurfaceProgramBinaryInfo);
+
+            globalSurface = pDevice->getMemoryManager()->createGraphicsAllocationWithRequiredBitness(surfaceSize, nullptr);
+            if (globalSurface == nullptr) {
+                // Do not dereference a failed allocation; report it to the caller instead.
+                retVal = CL_OUT_OF_HOST_MEMORY;
+                break;
+            }
+
+            // The initialization data is stored inline, right behind the token structure.
+            memcpy_s(globalSurface->getUnderlyingBuffer(), surfaceSize, reinterpret_cast<const cl_char *>(pPatch) + headerSize, surfaceSize);
+            // Skip the inline data here; the token itself is skipped at the bottom of the loop.
+            pCurPatchListPtr = ptrOffset(pCurPatchListPtr, surfaceSize);
+            DBG_LOG(LogPatchTokens,
+                    "\n  .ALLOCATE_GLOBAL_MEMORY_SURFACE_PROGRAM_BINARY_INFO", pPatch->Token,
+                    "\n  .Size", pPatch->Size,
+                    "\n  .BufferType", patch.Type,
+                    "\n  .GlobalBufferIndex", patch.GlobalBufferIndex,
+                    "\n  .InitializationDataSize", patch.InlineDataSize);
+        } break;
+
+        case PATCH_TOKEN_GLOBAL_POINTER_PROGRAM_BINARY_INFO:
+            // Pointer tokens are ignored until the global surface exists.
+            if (globalSurface != nullptr) {
+                auto patch = *reinterpret_cast<const SPatchGlobalPointerProgramBinaryInfo *>(pPatch);
+                if ((patch.GlobalBufferIndex == 0) && (patch.BufferIndex == 0) && (patch.BufferType == PROGRAM_SCOPE_GLOBAL_BUFFER)) {
+                    void *pPtr = reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(globalSurface->getUnderlyingBuffer()) + static_cast<uintptr_t>(patch.GlobalPointerOffset));
+                    // The stored value is rebased by the surface address: a 32-bit wide GPU
+                    // address for 32-bit allocations, the underlying buffer address otherwise.
+                    if (globalSurface->is32BitAllocation) {
+                        *reinterpret_cast<uint32_t *>(pPtr) += static_cast<uint32_t>(globalSurface->getGpuAddressToPatch());
+                    } else {
+                        *reinterpret_cast<uintptr_t *>(pPtr) += reinterpret_cast<uintptr_t>(globalSurface->getUnderlyingBuffer());
+                    }
+                } else {
+                    printDebugString(DebugManager.flags.PrintDebugMessages.get(), stderr, "Program::parseProgramScopePatchList. Unhandled Data parameter: %d\n", pPatch->Token);
+                }
+                DBG_LOG(LogPatchTokens,
+                        "\n  .GLOBAL_POINTER_PROGRAM_BINARY_INFO", pPatch->Token,
+                        "\n  .Size", pPatch->Size,
+                        "\n  .GlobalBufferIndex", patch.GlobalBufferIndex,
+                        "\n  .GlobalPointerOffset", patch.GlobalPointerOffset,
+                        "\n  .BufferType", patch.BufferType,
+                        "\n  .BufferIndex", patch.BufferIndex);
+            }
+            break;
+
+        case PATCH_TOKEN_CONSTANT_POINTER_PROGRAM_BINARY_INFO:
+            // Pointer tokens are ignored until the constant surface exists.
+            if (constantSurface != nullptr) {
+                auto patch = *reinterpret_cast<const SPatchConstantPointerProgramBinaryInfo *>(pPatch);
+                if ((patch.ConstantBufferIndex == 0) && (patch.BufferIndex == 0) && (patch.BufferType == PROGRAM_SCOPE_CONSTANT_BUFFER)) {
+                    void *pPtr = reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(constantSurface->getUnderlyingBuffer()) + static_cast<uintptr_t>(patch.ConstantPointerOffset));
+                    // The stored value is rebased by the surface address: a 32-bit wide GPU
+                    // address for 32-bit allocations, the underlying buffer address otherwise.
+                    if (constantSurface->is32BitAllocation) {
+                        *reinterpret_cast<uint32_t *>(pPtr) += static_cast<uint32_t>(constantSurface->getGpuAddressToPatch());
+                    } else {
+                        *reinterpret_cast<uintptr_t *>(pPtr) += reinterpret_cast<uintptr_t>(constantSurface->getUnderlyingBuffer());
+                    }
+                } else {
+                    printDebugString(DebugManager.flags.PrintDebugMessages.get(), stderr, "Program::parseProgramScopePatchList. Unhandled Data parameter: %d\n", pPatch->Token);
+                }
+                DBG_LOG(LogPatchTokens,
+                        "\n  .CONSTANT_POINTER_PROGRAM_BINARY_INFO", pPatch->Token,
+                        "\n  .Size", pPatch->Size,
+                        "\n  .ConstantBufferIndex", patch.ConstantBufferIndex,
+                        "\n  .ConstantPointerOffset", patch.ConstantPointerOffset,
+                        "\n  .BufferType", patch.BufferType,
+                        "\n  .BufferIndex", patch.BufferIndex);
+            }
+            break;
+
+        default:
+            // Tokens this parser does not know are fatal unless explicitly whitelisted.
+            if (false == isSafeToSkipUnhandledToken(pPatch->Token)) {
+                retVal = CL_INVALID_BINARY;
+            }
+            printDebugString(DebugManager.flags.PrintDebugMessages.get(), stderr, " Program::parseProgramScopePatchList. Unknown Patch Token: %d\n", pPatch->Token);
+            DBG_LOG(LogPatchTokens,
+                    "\n  .Program Unknown Patch Token", pPatch->Token,
+                    "\n  .Size", pPatch->Size);
+            break;
+        }
+
+        if (retVal != CL_SUCCESS) {
+            break;
+        }
+        pCurPatchListPtr = ptrOffset(pCurPatchListPtr, pPatch->Size);
+    }
+
+    return retVal;
+}
+
+// Decodes the device binary held in genBinary / genBinarySize.
+//
+// Any previously decoded kernel infos are destroyed first. The binary header is then
+// validated, the program-scope patch list that follows it is parsed, and finally
+// NumberOfKernels kernels are processed one after another with processKernel().
+//
+// Returns CL_SUCCESS on success. CL_INVALID_BINARY is returned when the binary is
+// missing, too small to hold the header (or the patch list the header announces),
+// or when the header fails validation; errors from patch-list or kernel processing
+// are passed through.
+cl_int Program::processGenBinary() {
+    for (auto &pKernelInfo : kernelInfoArray) {
+        delete pKernelInfo;
+    }
+    kernelInfoArray.clear();
+
+    // The header is read straight from the buffer, so the buffer must at least contain it.
+    if (!genBinary || genBinarySize < sizeof(SProgramBinaryHeader)) {
+        return CL_INVALID_BINARY;
+    }
+
+    auto pCurBinaryPtr = genBinary;
+    auto pGenBinaryHeader = reinterpret_cast<const SProgramBinaryHeader *>(pCurBinaryPtr);
+    if (!validateGenBinaryHeader(pGenBinaryHeader)) {
+        return CL_INVALID_BINARY;
+    }
+
+    // The patch list sits directly behind the header; reject a header that claims
+    // more patch-list bytes than the binary actually provides.
+    if (pGenBinaryHeader->PatchListSize > genBinarySize - sizeof(SProgramBinaryHeader)) {
+        return CL_INVALID_BINARY;
+    }
+
+    pCurBinaryPtr = ptrOffset(pCurBinaryPtr, sizeof(SProgramBinaryHeader));
+    programScopePatchList = pCurBinaryPtr;
+    programScopePatchListSize = pGenBinaryHeader->PatchListSize;
+
+    cl_int retVal = CL_SUCCESS;
+    if (programScopePatchListSize != 0u) {
+        retVal = parseProgramScopePatchList();
+    }
+
+    pCurBinaryPtr = ptrOffset(pCurBinaryPtr, pGenBinaryHeader->PatchListSize);
+
+    // Kernels follow the program-scope patch list; stop at the first failure.
+    auto numKernels = pGenBinaryHeader->NumberOfKernels;
+    for (uint32_t i = 0; i < numKernels && retVal == CL_SUCCESS; i++) {
+        size_t bytesProcessed = processKernel(pCurBinaryPtr, retVal);
+        pCurBinaryPtr = ptrOffset(pCurBinaryPtr, bytesProcessed);
+    }
+
+    return retVal;
+}
+
+// Reports whether binaries for the given GFX core family are accepted, by looking
+// the family up in the familyEnabled table.
+// NOTE(review): `device` indexes familyEnabled without a range check here - confirm
+// that every caller passes a value inside the table's bounds.
+bool Program::validateGenBinaryDevice(GFXCORE_FAMILY device) const {
+    return familyEnabled[device];
+}
+
+// Checks a program binary header: the magic must be MAGIC_CL, the version must be
+// CURRENT_ICBE_VERSION, and the target device family must pass
+// validateGenBinaryDevice(). The checks run in that order and stop at the first failure.
+bool Program::validateGenBinaryHeader(const iOpenCL::SProgramBinaryHeader *pGenBinaryHeader) const {
+    if (pGenBinaryHeader->Magic != MAGIC_CL) {
+        return false;
+    }
+    if (pGenBinaryHeader->Version != CURRENT_ICBE_VERSION) {
+        return false;
+    }
+
+    const auto family = static_cast<GFXCORE_FAMILY>(pGenBinaryHeader->Device);
+    return validateGenBinaryDevice(family);
+}
+} // namespace OCLRT
--- a/runtime/program/process_spir_binary.cpp
+++ b/runtime/program/process_spir_binary.cpp
@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "program.h"
+
+namespace OCLRT {
+
+// Returns true when the buffer starts with the SPIR-V magic number, in either byte
+// order. A null buffer, or one that holds nothing beyond the 4-byte magic word, is
+// rejected.
+bool Program::isValidSpirvBinary(
+    const void *pBinary,
+    size_t binarySize) {
+
+    if (pBinary == nullptr || binarySize <= sizeof(uint32_t)) {
+        return false;
+    }
+
+    // 0x07230203 is the SPIR-V magic number; 0x03022307 is the same word byte-swapped.
+    const uint32_t swappedMagic = 0x03022307;
+    const uint32_t nativeMagic = 0x07230203;
+
+    return (memcmp(pBinary, &swappedMagic, sizeof(uint32_t)) == 0) ||
+           (memcmp(pBinary, &nativeMagic, sizeof(uint32_t)) == 0);
+}
+
+// Takes a copy of an intermediate (SPIR / SPIR-V) binary and resets the build state.
+//
+// pBinary    - start of the binary; binarySize bytes are copied into sourceCode.
+// binarySize - number of bytes to copy.
+// isSpirV    - stored in this->isSpirV.
+//
+// The program type becomes CL_PROGRAM_BINARY_TYPE_INTERMEDIATE and the build status
+// CL_BUILD_NONE. Always returns CL_SUCCESS.
+cl_int Program::processSpirBinary(
+    const void *pBinary,
+    size_t binarySize,
+    bool isSpirV) {
+    programBinaryType = CL_PROGRAM_BINARY_TYPE_INTERMEDIATE;
+
+    // The raw bytes are kept in sourceCode; the length is explicit, so embedded
+    // zero bytes are preserved.
+    const char *rawBytes = static_cast<const char *>(pBinary);
+    sourceCode = std::string(rawBytes, binarySize);
+
+    this->isSpirV = isSpirV;
+    buildStatus = CL_BUILD_NONE;
+
+    return CL_SUCCESS;
+}
+}
--- a/runtime/program/program.cpp
+++ b/runtime/program/program.cpp
@ -0,0 +1,443 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "program.h"
+#include "elf/writer.h"
+#include "runtime/context/context.h"
+#include "runtime/helpers/debug_helpers.h"
+#include "runtime/helpers/string.h"
+#include "runtime/memory_manager/memory_manager.h"
+#include "runtime/compiler_interface/compiler_interface.h"
+
+#include <sstream>
+
+namespace OCLRT {
+
+// Build-option substrings that updateNonUniformFlag() searches for in the options string.
+const std::string Program::clOptNameClVer{"-cl-std=CL"};
+const std::string Program::clOptNameUniformWgs{"-cl-uniform-work-group-size"};
+
+// Default constructor: delegates to Program(nullptr), so no context or device is attached,
+// and then overrides the numDevices value set by that constructor with 0.
+Program::Program() : Program(nullptr) {
+    numDevices = 0;
+}
+
+// Creates a program attached to `context` (which may be null) and seeds internalOptions from
+// the first device of that context and from the debug flags.
+//
+// context   - owning context; it is internally ref-counted unless the program is built-in.
+// isBuiltIn - marks the program as built-in, which skips the context reference.
+Program::Program(Context *context, bool isBuiltIn) : context(context), isBuiltIn(isBuiltIn) {
+    if (this->context && !this->isBuiltIn) {
+        this->context->incRefInternal();
+    }
+
+    blockKernelManager = new BlockKernelManager();
+    pDevice = context ? context->getDevice(0) : nullptr;
+    numDevices = 1;
+
+    // No binaries of any kind are held yet.
+    elfBinary = nullptr;
+    elfBinarySize = 0;
+    genBinary = nullptr;
+    genBinarySize = 0;
+    llvmBinary = nullptr;
+    llvmBinarySize = 0;
+    debugData = nullptr;
+    debugDataSize = 0;
+
+    // Build state.
+    buildStatus = CL_BUILD_NONE;
+    programBinaryType = CL_PROGRAM_BINARY_TYPE_NONE;
+    isCreatedFromBinary = false;
+    isProgramBinaryResolved = false;
+
+    // Program-scope data.
+    constantSurface = nullptr;
+    globalSurface = nullptr;
+    globalVarTotalSize = 0;
+    programScopePatchListSize = 0;
+    programScopePatchList = nullptr;
+
+    programOptionVersion = 12u;
+    allowNonUniform = false;
+
+    if (pDevice) {
+        char deviceVersion[32] = {};
+        pDevice->getDeviceInfo(CL_DEVICE_VERSION, sizeof(deviceVersion), deviceVersion, nullptr);
+
+        // Map the version text reported by the device to the matching -ocl-version option.
+        if (strstr(deviceVersion, "2.1")) {
+            internalOptions = "-ocl-version=210 ";
+        } else if (strstr(deviceVersion, "2.0")) {
+            internalOptions = "-ocl-version=200 ";
+        } else if (strstr(deviceVersion, "1.2")) {
+            internalOptions = "-ocl-version=120 ";
+        }
+
+        const bool use32BitAddresses = pDevice->getDeviceInfo().force32BitAddressess;
+        if (use32BitAddresses) {
+            internalOptions += "-m32 ";
+        }
+        pDevice->increaseProgramCount();
+
+        const bool statelessRequested = is32bit | DebugManager.flags.DisableStatelessToStatefulOptimization.get();
+
+        // Forced 32-bit addressing also forces stateless access.
+        if (statelessRequested || use32BitAddresses) {
+            internalOptions += "-cl-intel-greater-than-4GB-buffer-required ";
+        }
+    }
+
+    if (DebugManager.flags.EnableStatelessToStatefulBufferOffsetOpt.get()) {
+        internalOptions += "-cl-intel-has-buffer-offset-arg ";
+    }
+}
+
+// Releases everything the program owns: the context reference (non-built-in programs only),
+// the stored binaries, the kernel infos, the block kernel manager together with its private
+// surfaces, and the constant/global surfaces.
+Program::~Program() {
+    if (context && !isBuiltIn) {
+        context->decRefInternal();
+    }
+
+    delete[] genBinary;
+    genBinary = nullptr;
+    delete[] llvmBinary;
+    llvmBinary = nullptr;
+    delete[] debugData;
+    debugData = nullptr;
+    delete[] elfBinary;
+    elfBinary = nullptr;
+    elfBinarySize = 0;
+
+    for (auto &kernelInfo : kernelInfoArray) {
+        delete kernelInfo;
+    }
+
+    // Private surfaces are looked up through the manager, so free them before deleting it.
+    freeBlockPrivateSurfaces();
+    delete blockKernelManager;
+
+    if (constantSurface) {
+        pDevice->getMemoryManager()->freeGraphicsMemory(constantSurface);
+        constantSurface = nullptr;
+    }
+
+    if (globalSurface) {
+        pDevice->getMemoryManager()->freeGraphicsMemory(globalSurface);
+        globalSurface = nullptr;
+    }
+}
+
+// Initializes the program from a user-supplied binary.
+//
+// The buffer is classified in this order: LLVM binary, SPIR-V binary, otherwise it is handed
+// to processElfBinary(). Returns the status of whichever path handled the buffer.
+cl_int Program::createProgramFromBinary(
+    const void *pBinary,
+    size_t binarySize) {
+
+    if (Program::isValidLlvmBinary(pBinary, binarySize)) {
+        return processSpirBinary(pBinary, binarySize, false);
+    }
+
+    if (Program::isValidSpirvBinary(pBinary, binarySize)) {
+        return processSpirBinary(pBinary, binarySize, true);
+    }
+
+    uint32_t compilerVersion = iOpenCL::CURRENT_ICBE_VERSION;
+    const cl_int elfStatus = processElfBinary(pBinary, binarySize, compilerVersion);
+
+    if (elfStatus == CL_SUCCESS) {
+        isCreatedFromBinary = true;
+        return CL_SUCCESS;
+    }
+
+    // The binary was produced by a different compiler version: try to recompile it from its
+    // LLVM section (if available). When that also fails, the ELF processing error is reported.
+    const bool versionMismatch = (compilerVersion != iOpenCL::CURRENT_ICBE_VERSION);
+    if (versionMismatch && (rebuildProgramFromLLVM() == CL_SUCCESS)) {
+        return CL_SUCCESS;
+    }
+
+    return elfStatus;
+}
+
+// Recompiles the program from the LLVM binary held in llvmBinary.
+//
+// The LLVM binary is wrapped in an ELF container, passed to the compiler interface's link()
+// step and the resulting gen binary is processed. On success the program is marked as an
+// executable, binary-created, resolved program.
+//
+// Returns CL_SUCCESS on success, CL_INVALID_PROGRAM when no valid LLVM binary is stored,
+// CL_OUT_OF_HOST_MEMORY when the ELF writer or the compiler interface is unavailable, or the
+// error reported by link() / processGenBinary().
+cl_int Program::rebuildProgramFromLLVM() {
+    cl_int retVal = CL_SUCCESS;
+    // Initialized so that a resolveBinary() call that does not report a size cannot leave it
+    // indeterminate before it is used as an allocation size.
+    size_t dataSize = 0;
+    char *pData = nullptr;
+    CLElfLib::CElfWriter *pElfWriter = nullptr;
+
+    do {
+        if (!Program::isValidLlvmBinary(llvmBinary, llvmBinarySize)) {
+            retVal = CL_INVALID_PROGRAM;
+            break;
+        }
+
+        pElfWriter = CLElfLib::CElfWriter::create(CLElfLib::EH_TYPE_OPENCL_OBJECTS, CLElfLib::EH_MACHINE_NONE, 0);
+        // The writer is dereferenced below, so a failed creation must not fall through.
+        if (nullptr == pElfWriter) {
+            retVal = CL_OUT_OF_HOST_MEMORY;
+            break;
+        }
+
+        CLElfLib::SSectionNode sectionNode;
+        sectionNode.Name = "";
+        sectionNode.Type = CLElfLib::SH_TYPE_OPENCL_LLVM_BINARY;
+        sectionNode.Flags = 0;
+        sectionNode.pData = llvmBinary;
+        sectionNode.DataSize = static_cast<unsigned int>(llvmBinarySize);
+        pElfWriter->addSection(&sectionNode);
+
+        // First call queries the required size, second call fills the buffer.
+        pElfWriter->resolveBinary(nullptr, dataSize);
+        pData = new char[dataSize];
+        pElfWriter->resolveBinary(pData, dataSize);
+
+        CompilerInterface *pCompilerInterface = getCompilerInterface();
+        if (nullptr == pCompilerInterface) {
+            retVal = CL_OUT_OF_HOST_MEMORY;
+            break;
+        }
+
+        TranslationArgs inputArgs = {};
+        inputArgs.pInput = pData;
+        inputArgs.InputSize = static_cast<unsigned int>(dataSize);
+        inputArgs.pOptions = options.c_str();
+        inputArgs.OptionsSize = static_cast<unsigned int>(options.length());
+        inputArgs.pInternalOptions = internalOptions.c_str();
+        inputArgs.InternalOptionsSize = static_cast<unsigned int>(internalOptions.length());
+        inputArgs.pTracingOptions = nullptr;
+        inputArgs.TracingOptionsCount = 0;
+
+        retVal = pCompilerInterface->link(*this, inputArgs);
+        if (retVal != CL_SUCCESS) {
+            break;
+        }
+
+        retVal = processGenBinary();
+        if (retVal != CL_SUCCESS) {
+            break;
+        }
+
+        programBinaryType = CL_PROGRAM_BINARY_TYPE_EXECUTABLE;
+        isCreatedFromBinary = true;
+        isProgramBinaryResolved = true;
+    } while (false);
+
+    // Common cleanup for every exit path of the block above.
+    CLElfLib::CElfWriter::destroy(pElfWriter);
+    delete[] pData;
+
+    return retVal;
+}
+
+// Copies the Version field of the given program binary header into binaryVersion.
+// binaryVersion is left untouched when pSectionData is null.
+void Program::getProgramCompilerVersion(
+    SProgramBinaryHeader *pSectionData,
+    uint32_t &binaryVersion) const {
+    if (nullptr == pSectionData) {
+        return;
+    }
+
+    binaryVersion = pSectionData->Version;
+}
+
+// Reports whether the buffer contains the LLVM bitcode magic "BC\xc0\xde".
+//
+// The accepted inputs match the strstr() based check this replaces: the magic may appear
+// anywhere before the first NUL byte. Unlike strstr(), the search never reads beyond
+// binarySize, so a buffer without a NUL terminator no longer causes an out-of-bounds read.
+//
+// pBinary    - start of the candidate binary; a null pointer is rejected.
+// binarySize - size of the buffer in bytes; must exceed the magic length plus one.
+bool Program::isValidLlvmBinary(
+    const void *pBinary,
+    size_t binarySize) {
+
+    const char llvmMagic[] = {'B', 'C', '\xc0', '\xde'};
+    const size_t magicSize = sizeof(llvmMagic);
+
+    if (!pBinary || (binarySize <= (magicSize + 1))) {
+        return false;
+    }
+
+    const char *pBegin = static_cast<const char *>(pBinary);
+
+    // Search up to the first NUL (where strstr would have stopped) or to the end of the
+    // buffer, whichever comes first.
+    const void *pTerminator = memchr(pBegin, '\0', binarySize);
+    const size_t searchableSize = pTerminator
+                                      ? static_cast<size_t>(static_cast<const char *>(pTerminator) - pBegin)
+                                      : binarySize;
+
+    for (size_t offset = 0; (offset + magicSize) <= searchableSize; ++offset) {
+        if (memcmp(pBegin + offset, llvmMagic, magicSize) == 0) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+// Replaces the stored source code with a copy of the given NUL-terminated string.
+// A null pointer clears the stored source instead of being assigned to the std::string,
+// which would be undefined behavior.
+void Program::setSource(char *pSourceString) {
+    if (pSourceString == nullptr) {
+        sourceCode.clear();
+        return;
+    }
+
+    sourceCode = pSourceString;
+}
+
+// Exposes the stored source code without copying it.
+//
+// pBinary  - receives a pointer into sourceCode, or nullptr when no source is stored.
+// dataSize - receives the source length in bytes, or 0 when no source is stored.
+// Returns CL_SUCCESS when source is available, CL_INVALID_PROGRAM otherwise.
+cl_int Program::getSource(char *&pBinary, unsigned int &dataSize) const {
+    if (sourceCode.empty()) {
+        pBinary = nullptr;
+        dataSize = 0;
+        return CL_INVALID_PROGRAM;
+    }
+
+    // The pointer refers to storage owned by sourceCode.
+    pBinary = const_cast<char *>(sourceCode.c_str());
+    dataSize = static_cast<unsigned int>(sourceCode.size());
+    return CL_SUCCESS;
+}
+
+// Stores a copy of the given bytes as the gen binary (genBinary / genBinarySize),
+// replacing any previously stored one. See storeBinary() for the copy semantics.
+void Program::storeGenBinary(
+    const void *pSrc,
+    const size_t srcSize) {
+    storeBinary(genBinary, genBinarySize, pSrc, srcSize);
+}
+
+// Stores a copy of the given bytes as the LLVM binary (llvmBinary / llvmBinarySize),
+// replacing any previously stored one. See storeBinary() for the copy semantics.
+void Program::storeLlvmBinary(
+    const void *pSrc,
+    const size_t srcSize) {
+    storeBinary(llvmBinary, llvmBinarySize, pSrc, srcSize);
+}
+
+// Stores a copy of the given bytes as the debug data (debugData / debugDataSize),
+// replacing any previously stored data. See storeBinary() for the copy semantics.
+void Program::storeDebugData(
+    const void *pSrc,
+    const size_t srcSize) {
+    storeBinary(debugData, debugDataSize, pSrc, srcSize);
+}
+
+// Replaces the buffer referenced by pDst with a freshly allocated copy of pSrc.
+//
+// pDst    - owning pointer to update; the old array is released with delete[].
+// dstSize - receives the number of bytes now held by pDst.
+// pSrc    - bytes to copy; a null pointer leaves pDst empty (nullptr, size 0).
+// srcSize - number of bytes to copy.
+void Program::storeBinary(
+    char *&pDst,
+    size_t &dstSize,
+    const void *pSrc,
+    const size_t srcSize) {
+    DEBUG_BREAK_IF(!(pSrc && srcSize > 0));
+
+    // Reset to a consistent empty state first, so pDst never dangles if the allocation
+    // below throws.
+    delete[] pDst;
+    pDst = nullptr;
+    dstSize = 0;
+
+    // Without a source there is nothing to copy; do not hand out an uninitialized buffer.
+    if (pSrc == nullptr) {
+        return;
+    }
+
+    pDst = new char[srcSize];
+
+    // Keep the full size_t value; narrowing it would under-report large binaries and make
+    // the bounded copy below fail.
+    dstSize = srcSize;
+    memcpy_s(pDst, dstSize, pSrc, srcSize);
+}
+
+// Adds a message to the build log kept for pDevice.
+//
+// The first message for a device starts its log; later messages are appended after a
+// newline. Null, zero-length or empty messages are ignored, and one trailing NUL counted
+// in errorStringSize is not stored.
+void Program::updateBuildLog(const Device *pDevice, const char *pErrorString,
+                             size_t errorStringSize) {
+    const bool nothingToLog = (pErrorString == nullptr) || (errorStringSize == 0) || (pErrorString[0] == '\0');
+    if (nothingToLog) {
+        return;
+    }
+
+    const bool hasTrailingNul = (pErrorString[errorStringSize - 1] == '\0');
+    const char *pMessageEnd = pErrorString + (hasTrailingNul ? errorStringSize - 1 : errorStringSize);
+
+    auto existingEntry = buildLog.find(pDevice);
+
+    if (existingEntry == buildLog.end()) {
+        buildLog[pDevice].assign(pErrorString, pMessageEnd);
+        return;
+    }
+
+    std::string &deviceLog = existingEntry->second;
+    deviceLog.append("\n");
+    deviceLog.append(pErrorString, pMessageEnd);
+}
+
+// Returns the NUL-terminated build log recorded for pDevice, or nullptr when no log exists
+// for that device. The pointer refers to storage owned by the buildLog map.
+const char *Program::getBuildLog(const Device *pDevice) const {
+    const auto logEntry = buildLog.find(pDevice);
+
+    return (logEntry != buildLog.end()) ? logEntry->second.c_str() : nullptr;
+}
+
+// Returns the instance provided by CompilerInterface::getInstance().
+// Callers in this file treat a null result as an error.
+CompilerInterface *Program::getCompilerInterface() const {
+    return CompilerInterface::getInstance();
+}
+
+// Moves block (child) kernels out of kernelInfoArray and into blockKernelManager.
+//
+// A kernel counts as a block kernel when its name contains "_dispatch_" and the part of the
+// name before the last "_dispatch_" equals the name of a parent or subgroup kernel. All other
+// kernels stay in kernelInfoArray in their original relative order. Nothing is done when there
+// are neither parent nor subgroup kernels.
+void Program::separateBlockKernels() {
+    if (parentKernelInfoArray.empty() && subgroupKernelInfoArray.empty()) {
+        return;
+    }
+
+    // True when `candidates` holds a kernel whose name equals `wantedName`.
+    auto hasKernelNamed = [](const std::vector<KernelInfo *> &candidates, const std::string &wantedName) -> bool {
+        for (const auto &candidate : candidates) {
+            if (candidate->name.compare(wantedName) == 0) {
+                return true;
+            }
+        }
+        return false;
+    };
+
+    const std::vector<KernelInfo *> unsortedKernels(kernelInfoArray);
+    kernelInfoArray.clear();
+
+    for (KernelInfo *kernelInfo : unsortedKernels) {
+        bool isBlockKernel = false;
+
+        const auto suffixPos = kernelInfo->name.rfind("_dispatch_");
+        if (suffixPos != std::string::npos) {
+            const std::string baseKernelName(kernelInfo->name, 0, suffixPos);
+            isBlockKernel = hasKernelNamed(parentKernelInfoArray, baseKernelName) ||
+                            hasKernelNamed(subgroupKernelInfoArray, baseKernelName);
+        }
+
+        if (isBlockKernel) {
+            // Parent or subgroup kernel found -> child kernel
+            blockKernelManager->addBlockKernelInfo(kernelInfo);
+        } else {
+            // Regular kernel
+            kernelInfoArray.push_back(kernelInfo);
+        }
+    }
+}
+
+// Allocates a private surface for every block kernel that requests per-thread private memory
+// and does not have a surface yet. The surface size is the per-thread size multiplied by the
+// device's computeUnitsUsedForScratch and the kernel's maximum SIMD size.
+void Program::allocateBlockPrivateSurfaces() {
+    const size_t blockCount = blockKernelManager->getCount();
+
+    for (uint32_t blockIndex = 0; blockIndex < blockCount; blockIndex++) {
+        const KernelInfo *blockInfo = blockKernelManager->getBlockKernelInfo(blockIndex);
+
+        if (!blockInfo->patchInfo.pAllocateStatelessPrivateSurface) {
+            continue;
+        }
+
+        size_t surfaceSize = blockInfo->patchInfo.pAllocateStatelessPrivateSurface->PerThreadPrivateMemorySize;
+
+        // Skip kernels that need no private memory or already have a surface.
+        if (!(surfaceSize > 0 && blockKernelManager->getPrivateSurface(blockIndex) == nullptr)) {
+            continue;
+        }
+
+        surfaceSize *= getDevice(0).getDeviceInfo().computeUnitsUsedForScratch * blockInfo->getMaxSimdSize();
+
+        auto *newSurface = getDevice(0).getMemoryManager()->createGraphicsAllocationWithRequiredBitness(surfaceSize, nullptr);
+        blockKernelManager->pushPrivateSurface(newSurface, blockIndex);
+    }
+}
+
+// Releases the private surface of every block kernel that has one. The manager's slot is
+// cleared before the allocation is returned to the memory manager.
+void Program::freeBlockPrivateSurfaces() {
+    const size_t blockCount = blockKernelManager->getCount();
+
+    for (uint32_t blockIndex = 0; blockIndex < blockCount; blockIndex++) {
+        auto *surfaceToFree = blockKernelManager->getPrivateSurface(blockIndex);
+
+        if (surfaceToFree == nullptr) {
+            continue;
+        }
+
+        blockKernelManager->pushPrivateSurface(nullptr, blockIndex);
+        getDevice(0).getMemoryManager()->freeGraphicsMemory(surfaceToFree);
+    }
+}
+
+// Derives programOptionVersion from the "-cl-std=CL<major>.<minor>" build option and enables
+// non-uniform work-groups when that version is at least 2.0 and the
+// "-cl-uniform-work-group-size" option is absent.
+//
+// The version is stored as major * 10 + minor (1.2 -> 12, 2.0 -> 20, 2.1 -> 21) and defaults
+// to 12 when no "-cl-std=CL" option is present. Components that cannot be parsed count as 0.
+void Program::updateNonUniformFlag() {
+    //Look for -cl-std=CL substring and extract value behind which can be 1.2 2.0 2.1 and convert to value
+    auto pos = options.find(clOptNameClVer);
+    if (pos == std::string::npos) {
+        programOptionVersion = 12u; //Default is 1.2
+    } else {
+        std::stringstream ss{options.c_str() + pos + clOptNameClVer.size()};
+        // Zero-initialized: once an extraction fails, later extractions leave their target
+        // untouched, so an uninitialized value would otherwise be read below.
+        uint32_t majorV = 0u;
+        uint32_t minorV = 0u;
+        char dot = '\0';
+        ss >> majorV;
+        ss >> dot;
+        ss >> minorV;
+        programOptionVersion = majorV * 10u + minorV;
+    }
+
+    if (programOptionVersion >= 20u && options.find(clOptNameUniformWgs) == std::string::npos) {
+        allowNonUniform = true;
+    }
+}
+
+// Sets allowNonUniform for a linked program: it is true only when every input program
+// allows non-uniform work-groups (and therefore true when there are no input programs).
+void Program::updateNonUniformFlag(const Program **inputPrograms, size_t numInputPrograms) {
+    bool everyInputAllows = true;
+
+    // Stop at the first input that disallows it; the result cannot become true again.
+    for (cl_uint inputIndex = 0; everyInputAllows && (inputIndex < numInputPrograms); inputIndex++) {
+        everyInputAllows = inputPrograms[inputIndex]->getAllowNonUniform();
+    }
+
+    this->allowNonUniform = everyInputAllows;
+}
+} // namespace OCLRT
--- a/runtime/program/program.h
+++ b/runtime/program/program.h
@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+#include "block_kernel_manager.h"
+#include "elf/reader.h"
+#include "kernel_info.h"
+#include "runtime/api/cl_types.h"
+#include "runtime/device/device.h"
+#include "runtime/helpers/base_object.h"
+#include "runtime/helpers/stdio.h"
+#include "runtime/helpers/string_helpers.h"
+#include "igfxfmid.h"
+#include "patch_list.h"
+#include <vector>
+#include <string>
+#include <map>
+
+// Rounds `a` up to the next multiple of `b`; yields `a` unchanged when it already is a multiple.
+// Both arguments are evaluated more than once, so avoid side effects in them, and `b` must be
+// non-zero because it is used as a modulo divisor.
+#define OCLRT_ALIGN(a, b) ((((a) % (b)) != 0) ? ((a) - ((a) % (b)) + (b)) : (a))
+
+namespace OCLRT {
+class Context;
+class CompilerInterface;
+
+// Associates the API-level _cl_program type with the runtime's Program class.
+template <>
+struct OpenCLObjectMapper<_cl_program> {
+    using DerivedType = class Program;
+};
+
+// Free function declared here and defined elsewhere.
+// NOTE(review): presumably reports whether an unrecognized patch token may be ignored during
+// parsing - confirm against the definition.
+bool isSafeToSkipUnhandledToken(unsigned int token);
+
+// Runtime representation of an OpenCL program object (cl_program).
+//
+// A Program keeps the source / intermediate / device binaries, the build options and logs,
+// and the KernelInfo objects produced by processing a device binary. It is non-copyable.
+class Program : public BaseObject<_cl_program> {
+  public:
+    static const cl_ulong objectMagic = 0x5651C89100AAACFELL;
+
+    // Create program from binary
+    template <typename T = Program>
+    static T *create(
+        cl_context context,
+        cl_uint numDevices,
+        const cl_device_id *deviceList,
+        const size_t *lengths,
+        const unsigned char **binaries,
+        cl_int *binaryStatus,
+        cl_int &errcodeRet);
+
+    // Create program from source
+    template <typename T = Program>
+    static T *create(
+        cl_context context,
+        cl_uint count,
+        const char **strings,
+        const size_t *lengths,
+        cl_int &errcodeRet);
+
+    // Create program from a single null-terminated string for the given device;
+    // isBuiltIn is forwarded to the program being created.
+    template <typename T = Program>
+    static T *create(
+        const char *nullTerminatedString,
+        Context *context,
+        Device &device,
+        bool isBuiltIn,
+        cl_int *errcodeRet);
+
+    // Create an already-built program from a gen (device) binary.
+    //
+    // The binary is copied into the new program, which is marked as an executable,
+    // binary-created, resolved program with build status CL_BUILD_SUCCESS.
+    // Returns nullptr and reports CL_INVALID_VALUE when binary is null or size is 0.
+    // errcodeRet is optional and only written when non-null.
+    template <typename T = Program>
+    static T *createFromGenBinary(
+        Context *context,
+        const void *binary,
+        size_t size,
+        bool isBuiltIn,
+        cl_int *errcodeRet) {
+        cl_int retVal = CL_SUCCESS;
+        T *program = nullptr;
+
+        if ((binary == nullptr) || (size == 0)) {
+            retVal = CL_INVALID_VALUE;
+        }
+
+        if (CL_SUCCESS == retVal) {
+            program = new T(context, isBuiltIn);
+            program->numDevices = 1;
+            program->storeGenBinary(binary, size);
+            program->isCreatedFromBinary = true;
+            program->programBinaryType = CL_PROGRAM_BINARY_TYPE_EXECUTABLE;
+            program->isProgramBinaryResolved = true;
+            program->buildStatus = CL_BUILD_SUCCESS;
+        }
+
+        if (errcodeRet) {
+            *errcodeRet = retVal;
+        }
+
+        return program;
+    }
+
+    // Create program from an intermediate language (IL) buffer
+    template <typename T = Program>
+    static T *createFromIL(Context *context,
+                           const void *il,
+                           size_t length,
+                           cl_int &errcodeRet);
+
+    Program(Context *context, bool isBuiltIn = false);
+    ~Program() override;
+
+    // Programs own raw buffers and allocations, so copying is disabled.
+    Program(const Program &) = delete;
+    Program &operator=(const Program &) = delete;
+
+    cl_int build(cl_uint numDevices, const cl_device_id *deviceList, const char *buildOptions,
+                 void(CL_CALLBACK *funcNotify)(cl_program program, void *userData),
+                 void *userData, bool enableCaching);
+
+    cl_int build(const cl_device_id device, const char *buildOptions, bool enableCaching,
+                 std::unordered_map<std::string, BuiltinDispatchInfoBuilder *> &builtinsMap);
+
+    cl_int build(const char *pKernelData, size_t kernelDataSize);
+
+    MOCKABLE_VIRTUAL cl_int processGenBinary();
+
+    cl_int compile(cl_uint numDevices, const cl_device_id *deviceList, const char *buildOptions,
+                   cl_uint numInputHeaders, const cl_program *inputHeaders, const char **headerIncludeNames,
+                   void(CL_CALLBACK *funcNotify)(cl_program program, void *userData),
+                   void *userData);
+
+    cl_int link(cl_uint numDevices, const cl_device_id *deviceList, const char *buildOptions,
+                cl_uint numInputPrograms, const cl_program *inputPrograms,
+                void(CL_CALLBACK *funcNotify)(cl_program program, void *userData),
+                void *userData);
+
+    size_t getNumKernels() const;
+    const KernelInfo *getKernelInfo(const char *kernelName) const;
+    const KernelInfo *getKernelInfo(size_t ordinal) const;
+
+    cl_int getInfo(cl_program_info paramName, size_t paramValueSize,
+                   void *paramValue, size_t *paramValueSizeRet);
+
+    cl_int getBuildInfo(cl_device_id device, cl_program_build_info paramName,
+                        size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet) const;
+
+    // Dereferences the stored context pointer; must not be called when it is null.
+    Context &getContext() const {
+        return *context;
+    }
+
+    Context *getContextPtr() const {
+        return context;
+    }
+
+    // Returns the single device held by the program; deviceOrdinal is not used.
+    const Device &getDevice(cl_uint deviceOrdinal) const {
+        return *pDevice;
+    }
+
+    // Always reports 1, independent of the numDevices member.
+    cl_uint getNumDevices() const {
+        return 1;
+    }
+
+    MOCKABLE_VIRTUAL cl_int processElfBinary(const void *pBinary, size_t binarySize, uint32_t &binaryVersion);
+    cl_int processSpirBinary(const void *pBinary, size_t binarySize, bool isSpirV);
+
+    void setSource(char *pSourceString);
+
+    cl_int getSource(char *&pBinary, unsigned int &dataSize) const;
+
+    void storeGenBinary(const void *pSrc, const size_t srcSize);
+
+    // Returns the stored gen binary and writes its size to genBinarySize.
+    // The buffer remains owned by the program.
+    char *getGenBinary(size_t &genBinarySize) const {
+        genBinarySize = this->genBinarySize;
+        return this->genBinary;
+    }
+
+    void storeLlvmBinary(const void *pSrc, const size_t srcSize);
+
+    void storeDebugData(const void *pSrc, const size_t srcSize);
+
+    void updateBuildLog(const Device *pDevice, const char *pErrorString, const size_t errorStringSize);
+
+    const char *getBuildLog(const Device *pDevice) const;
+
+    cl_uint getProgramBinaryType() const {
+        return programBinaryType;
+    }
+
+    bool getIsSpirV() const {
+        return isSpirV;
+    }
+
+    size_t getProgramScopePatchListSize() const {
+        return programScopePatchListSize;
+    }
+
+    GraphicsAllocation *getConstantSurface() const {
+        return constantSurface;
+    }
+
+    GraphicsAllocation *getGlobalSurface() const {
+        return globalSurface;
+    }
+
+    BlockKernelManager *getBlockKernelManager() const {
+        return blockKernelManager;
+    }
+
+    void allocateBlockPrivateSurfaces();
+    void freeBlockPrivateSurfaces();
+
+    const std::string &getOptions() const { return options; }
+
+    const std::string &getInternalOptions() const { return internalOptions; }
+
+    bool getAllowNonUniform() const {
+        return allowNonUniform;
+    }
+    bool getIsBuiltIn() const {
+        return isBuiltIn;
+    }
+    uint32_t getProgramOptionVersion() const {
+        return programOptionVersion;
+    }
+
+    // Format probes used to classify a user-supplied binary.
+    static bool isValidLlvmBinary(const void *pBinary, size_t binarySize);
+    static bool isValidSpirvBinary(const void *pBinary, size_t binarySize);
+
+  protected:
+    // Creates a program without a context; reachable only from derived classes and friends.
+    Program();
+
+    MOCKABLE_VIRTUAL bool isSafeToSkipUnhandledToken(unsigned int token) const;
+
+    MOCKABLE_VIRTUAL cl_int createProgramFromBinary(const void *pBinary, size_t binarySize);
+
+    bool optionsAreNew(const char *options) const;
+
+    cl_int processElfHeader(const CLElfLib::SElf64Header *pElfHeader,
+                            cl_program_binary_type &binaryType, uint32_t &numSections);
+
+    void getProgramCompilerVersion(SProgramBinaryHeader *pSectionData, uint32_t &binaryVersion) const;
+
+    cl_int resolveProgramBinary();
+
+    cl_int parseProgramScopePatchList();
+
+    MOCKABLE_VIRTUAL cl_int rebuildProgramFromLLVM();
+
+    cl_int parsePatchList(KernelInfo &pKernelInfo);
+
+    size_t processKernel(const void *pKernelBlob, cl_int &retVal);
+
+    // Shared implementation of storeGenBinary / storeLlvmBinary / storeDebugData.
+    void storeBinary(char *&pDst, size_t &dstSize, const void *pSrc, const size_t srcSize);
+
+    bool validateGenBinaryDevice(GFXCORE_FAMILY device) const;
+    bool validateGenBinaryHeader(const iOpenCL::SProgramBinaryHeader *pGenBinaryHeader) const;
+
+    std::string getKernelNamesString() const;
+
+    MOCKABLE_VIRTUAL CompilerInterface *getCompilerInterface() const;
+
+    void separateBlockKernels();
+
+    void updateNonUniformFlag();
+    void updateNonUniformFlag(const Program **inputProgram, size_t numInputPrograms);
+
+    // Build-option substrings ("-cl-std=CL", "-cl-uniform-work-group-size").
+    static const std::string clOptNameClVer;
+    static const std::string clOptNameUniformWgs;
+    // clang-format off
+    cl_program_binary_type    programBinaryType;
+    bool                      isSpirV = false;
+    // The binary buffers below are owned by the program and released with delete[].
+    char*                     elfBinary;
+    size_t                    elfBinarySize;
+
+    char*                     genBinary;
+    size_t                    genBinarySize;
+
+    char*                     llvmBinary;
+    size_t                    llvmBinarySize;
+
+    char*                     debugData;
+    size_t                    debugDataSize;
+
+    // Entries of kernelInfoArray are deleted by the destructor.
+    std::vector<KernelInfo*>  kernelInfoArray;
+    std::vector<KernelInfo*>  parentKernelInfoArray;
+    std::vector<KernelInfo*>  subgroupKernelInfoArray;
+    BlockKernelManager *      blockKernelManager;
+
+    const void*               programScopePatchList;
+    size_t                    programScopePatchListSize;
+
+    // Freed through the device's memory manager in the destructor.
+    GraphicsAllocation*       constantSurface;
+    GraphicsAllocation*       globalSurface;
+
+    size_t                    globalVarTotalSize;
+
+    cl_build_status           buildStatus;
+    bool                      isCreatedFromBinary;
+    bool                      isProgramBinaryResolved;
+
+    std::string               sourceCode;
+    std::string               options;
+    std::string               internalOptions;
+    std::string               hashFileName;
+    std::string               hashFilePath;
+
+    // OpenCL C version from the build options as major * 10 + minor (12 means 1.2).
+    uint32_t                  programOptionVersion;
+    bool                      allowNonUniform;
+
+    // Build log text per device.
+    std::map<const Device*, std::string>  buildLog;
+
+    Context*                  context;
+    Device*                   pDevice;
+    cl_uint                   numDevices;
+
+    bool                      isBuiltIn;
+
+    friend class OfflineCompiler;
+    // clang-format on
+};
+} // namespace OCLRT