Initial commit

Change-Id: I4bf1707bd3dfeadf2c17b0a7daff372b1925ebbd
2025-12-20 08:53:55 +08:00 · 2017-12-21 00:45:38 +01:00
commit 7e9ad41290
1350 changed files with 233156 additions and 0 deletions
--- a/.clang-format
+++ b/.clang-format
@@ -0,0 +1,92 @@
+---
+Language:        Cpp
+# BasedOnStyle:  LLVM
+AccessModifierOffset: -2
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlinesLeft: false
+AlignOperands:   true
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: false
+BinPackArguments: true
+BinPackParameters: true
+BraceWrapping:   
+  AfterClass:      false
+  AfterControlStatement: false
+  AfterEnum:       false
+  AfterFunction:   false
+  AfterNamespace:  false
+  AfterObjCDeclaration: false
+  AfterStruct:     false
+  AfterUnion:      false
+  BeforeCatch:     false
+  BeforeElse:      false
+  IndentBraces:    false
+BreakBeforeBinaryOperators: false
+BreakBeforeBraces: Attach
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+# clang-format > v3.8.0: BreakAfterJavaFieldAnnotations: false
+# clang-format > v3.8.0: BreakStringLiterals: true
+ColumnLimit:     0
+CommentPragmas:  '^ IWYU pragma:'
+ConstructorInitializerAllOnOneLineOrOnePerLine: false
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false
+DisableFormat:   false
+ExperimentalAutoDetectBinPacking: false
+ForEachMacros:   [ foreach, Q_FOREACH, BOOST_FOREACH ]
+IncludeCategories: 
+  - Regex:           '^"(llvm|llvm-c|clang|clang-c)/'
+    Priority:        2
+  - Regex:           '^(<|"(gtest|isl|json)/)'
+    Priority:        3
+  - Regex:           '.*'
+    Priority:        1
+IndentCaseLabels: false
+IndentWidth:     4
+IndentWrappedFunctionNames: false
+KeepEmptyLinesAtTheStartOfBlocks: true
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBlockIndentWidth: 2
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: true
+PenaltyBreakBeforeFirstCallParameter: 19
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 60
+PointerAlignment: Right
+ReflowComments:  true
+SortIncludes:    false
+SpaceAfterCStyleCast: false
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeParens: ControlStatements
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles:  false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard:        Cpp11
+TabWidth:        8
+UseTab:          Never
+...
+
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -0,0 +1,40 @@
+---
+Checks:          'clang-diagnostic-*,clang-analyzer-*,google-default-arguments,readability-identifier-naming,modernize-use-override,modernize-use-default-member-init,-clang-analyzer-alpha*,-clang-analyzer-core.StackAddressEscape,-clang-analyzer-optin.performance.Padding,-clang-analyzer-cplusplus.NewDeleteLeaks'
+# WarningsAsErrors: '.*'
+HeaderFilterRegex: 'runtime/'
+AnalyzeTemporaryDtors: false
+CheckOptions:    
+  - key:             google-readability-braces-around-statements.ShortStatementLines
+    value:           '1'
+  - key:             google-readability-function-size.StatementThreshold
+    value:           '800'
+  - key:             google-readability-namespace-comments.ShortNamespaceLines
+    value:           '10'
+  - key:             google-readability-namespace-comments.SpacesBeforeComments
+    value:           '2'
+  - key:             readability-identifier-naming.MethodCase
+    value:           camelBack
+  - key:             readability-identifier-naming.ParameterCase
+    value:           camelBack
+  - key:             readability-identifier-naming.StructMemberCase
+    value:           camelBack
+  - key:             readability-identifier-naming.ClassMemberCase
+    value:           camelBack
+  - key:             readability-identifier-naming.ClassMethodCase
+    value:           camelBack
+  - key:             modernize-loop-convert.MaxCopySize
+    value:           '16'
+  - key:             modernize-loop-convert.MinConfidence
+    value:           reasonable
+  - key:             modernize-loop-convert.NamingStyle
+    value:           CamelCase
+  - key:             modernize-pass-by-value.IncludeStyle
+    value:           llvm
+  - key:             modernize-replace-auto-ptr.IncludeStyle
+    value:           llvm
+  - key:             modernize-use-nullptr.NullMacros
+    value:           'NULL'
+  - key:             modernize-use-default-member-init.UseAssignment
+    value:           '1'
+...
+
--- a/.ctags
+++ b/.ctags
@@ -0,0 +1,3 @@
+-R
+-h .inl.h
+--langmap=c++:.inl.cpp.h
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1 @@
+manifests/manifest.yml filter=repo_converter
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,2 @@
+build/*
+build_linux/*
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -0,0 +1,550 @@
+# Copyright (c) 2017, Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+# We require cmake 3.2.0 or later
+cmake_minimum_required(VERSION 3.2.0 FATAL_ERROR)
+include(ExternalProject)
+
+project(igdrcl)
+
+if(TR_DEPRECATED)
+    add_definitions(-D_SILENCE_TR1_NAMESPACE_DEPRECATION_WARNING=1)
+endif(TR_DEPRECATED)
+
+if(NOT CMAKE_BUILD_TYPE)
+	set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type: [Release, Release-Internal, Debug]")
+endif()
+
+set(CMAKE_C_FLAGS_RELEASE-INTERNAL ${CMAKE_C_FLAGS_RELEASE})
+set(CMAKE_CXX_FLAGS_RELEASE-INTERNAL ${CMAKE_CXX_FLAGS_RELEASE})
+set(CMAKE_SHARED_LINKER_FLAGS_RELEASE-INTERNAL ${CMAKE_SHARED_LINKER_FLAGS_RELEASE})
+set(CMAKE_EXE_LINKER_FLAGS_RELEASE-INTERNAL ${CMAKE_EXE_LINKER_FLAGS_RELEASE})
+
+string(TOLOWER "${CMAKE_BUILD_TYPE}" BUILD_TYPE_lower)
+if("${BUILD_TYPE_lower}" STREQUAL "release-internal")
+	add_definitions(-D_RELEASE_INTERNAL)
+endif("${BUILD_TYPE_lower}" STREQUAL "release-internal")
+
+
+message(STATUS "${CMAKE_BUILD_TYPE} build configuration")
+
+# Set the runtime source directory
+if(NOT DEFINED IGDRCL_SOURCE_DIR)
+	set(IGDRCL_SOURCE_DIR ${CMAKE_SOURCE_DIR})
+endif()
+
+# Set our build directory
+if(NOT DEFINED IGDRCL_BUILD_DIR)
+	set(IGDRCL_BUILD_DIR ${CMAKE_BINARY_DIR})
+endif()
+
+if(NOT IGDRCL_BINARY_DIR)
+	set(IGDRCL_BINARY_DIR ${CMAKE_BINARY_DIR})
+endif()
+
+# we use c++11
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+if(NOT GTEST_SRC_DIR)
+	set(GTEST_SRC_DIR_tmp "${CMAKE_SOURCE_DIR}/../gtest")
+	get_filename_component(GTEST_SRC_DIR ${GTEST_SRC_DIR_tmp} ABSOLUTE)
+	set(GMOCK_SRC_DIR_tmp "${CMAKE_SOURCE_DIR}/../gmock")
+	get_filename_component(GMOCK_SRC_DIR ${GMOCK_SRC_DIR_tmp} ABSOLUTE)
+else(NOT GTEST_SRC_DIR)
+	get_filename_component(GTEST_SRC_DIR ${GTEST_SRC_DIR} ABSOLUTE)
+	set(GMOCK_SRC_DIR_tmp "${GTEST_SRC_DIR}/../gmock")
+	get_filename_component(GMOCK_SRC_DIR ${GMOCK_SRC_DIR_tmp} ABSOLUTE)
+endif(NOT GTEST_SRC_DIR)
+set(GTEST_INCLUDE_DIR "${GTEST_SRC_DIR}/include")
+set(GMOCK_INCLUDE_DIR "${GMOCK_SRC_DIR}/include")
+message(STATUS "Google Test source dir: ${GTEST_SRC_DIR}")
+message(STATUS "Google Mock source dir: ${GMOCK_SRC_DIR}")
+add_subdirectory(${GMOCK_SRC_DIR} ${IGDRCL_BINARY_DIR}/gmock)
+set_target_properties(gtest PROPERTIES CXX_STANDARD 11 CXX_STANDARD_REQUIRED ON)
+set_target_properties(gmock PROPERTIES CXX_STANDARD 11 CXX_STANDARD_REQUIRED ON)
+
+set(gtest_lib gtest)
+set(gmock_lib gmock)
+
+if("${BUILD_TYPE_lower}" STREQUAL "release-internal")
+	set(gtest_lib ${IGDRCL_BINARY_DIR}/gmock/gtest/Release/gtest${CMAKE_STATIC_LIBRARY_SUFFIX})
+	set(gmock_lib ${IGDRCL_BINARY_DIR}/gmock/Release/gmock${CMAKE_STATIC_LIBRARY_SUFFIX})
+endif("${BUILD_TYPE_lower}" STREQUAL "release-internal")
+
+if(CMAKE_SIZEOF_VOID_P EQUAL 8)
+	set(NEO_BITS "64")
+	set(NEO_ARCH "x64")
+else()
+	set(NEO_BITS "32")
+	set(NEO_ARCH "x86")
+endif()
+
+if(NOT ARTIFACTS_DIR)
+	get_filename_component(ARTIFACTS_DIR_tmp "${CMAKE_SOURCE_DIR}/../artifacts" ABSOLUTE)
+	if(IS_DIRECTORY "${ARTIFACTS_DIR_tmp}")
+		set(ARTIFACTS_DIR "${ARTIFACTS_DIR_tmp}")
+	endif()
+endif(NOT ARTIFACTS_DIR)
+if(ARTIFACTS_DIR)
+	message(STATUS "Artifact directory is ${ARTIFACTS_DIR}")
+endif(ARTIFACTS_DIR)
+
+if(NOT GTPIN_HEADERS_DIR)
+    if ((EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../internal/gtpin/gtpin_dx11_interface.h") AND (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../internal/gtpin/gtpin_driver_common.h"))
+	    get_filename_component(GTPIN_HEADERS_DIR "../internal/gtpin/" ABSOLUTE)
+        message(STATUS "GT-Pin headers dir: ${GTPIN_HEADERS_DIR}")
+    endif()
+endif(NOT GTPIN_HEADERS_DIR)
+
+if(NOT LIBDRM_DIR)
+	get_filename_component(LIBDRM_DIR "../libdrm/" ABSOLUTE)
+endif(NOT LIBDRM_DIR)
+message(STATUS "libdrm dir: ${LIBDRM_DIR}")
+
+if(NOT KHRONOS_HEADERS_DIR)
+	get_filename_component(KHRONOS_HEADERS_DIR "../khronos/opencl21/" ABSOLUTE)
+endif(NOT KHRONOS_HEADERS_DIR)
+message(STATUS "Khronos OpenCL headers dir: ${KHRONOS_HEADERS_DIR}")
+set(OCL_HEADERS_DIR ${KHRONOS_HEADERS_DIR})
+
+if(NOT THIRD_PARTY_DIR)
+	get_filename_component(THIRD_PARTY_DIR "../third_party/" ABSOLUTE)
+endif(NOT THIRD_PARTY_DIR)
+message(STATUS "Third party dir: ${THIRD_PARTY_DIR}")
+
+if(ARTIFACTS_DIR)
+	if(NOT IGC_PATH)
+		if(WIN32)
+			file(GLOB_RECURSE IGC_FILE_tmp "${ARTIFACTS_DIR}/igc32.dll")
+			get_filename_component(IGC_PATH ${IGC_FILE_tmp} DIRECTORY)
+		else(WIN32 )
+			file(GLOB_RECURSE IGC_FILE_tmp "${ARTIFACTS_DIR}/libigdccl.so")
+			# exclude those from the igdrcl binary location
+			foreach (TMP_PATH ${IGC_FILE_tmp})
+				string (FIND ${TMP_PATH} ${IGDRCL_BINARY_DIR} EXCLUDE_DIR_FOUND)
+				if (NOT ${EXCLUDE_DIR_FOUND} EQUAL -1)
+					list (REMOVE_ITEM IGC_FILE_tmp ${TMP_PATH})
+				endif ()
+			endforeach(TMP_PATH)
+			get_filename_component(IGC_PATH ${IGC_FILE_tmp} DIRECTORY)
+		endif(WIN32)
+	endif(NOT IGC_PATH)
+
+  if(NOT TARGET igc_dll)
+		add_library(igc_dll UNKNOWN IMPORTED)
+		if(WIN32)
+			set_property(TARGET igc_dll PROPERTY "IMPORTED_LOCATION" "${IGC_PATH}/${CMAKE_SHARED_LIBRARY_PREFIX}igc${IGDRCL_OPTION__BITS}${CMAKE_SHARED_LIBRARY_SUFFIX}")
+		else(WIN32)
+			set_property(TARGET igc_dll PROPERTY "IMPORTED_LOCATION" "${IGC_PATH}/${CMAKE_SHARED_LIBRARY_PREFIX}igdccl${IGDRCL_OPTION__BITS}${CMAKE_SHARED_LIBRARY_SUFFIX}")
+		endif(WIN32)
+	endif()
+	list(APPEND IGDRCL__IGC_TARGETS "igc_dll")
+
+  if(NOT TARGET fcl_dll)
+		add_library(fcl_dll UNKNOWN IMPORTED)
+		set_property(TARGET fcl_dll PROPERTY "IMPORTED_LOCATION" "${IGC_PATH}/${CMAKE_SHARED_LIBRARY_PREFIX}igdfcl${IGDRCL_OPTION__BITS}${CMAKE_SHARED_LIBRARY_SUFFIX}")
+	endif()
+	list(APPEND IGDRCL__IGC_TARGETS "fcl_dll")
+
+  if(NOT TARGET iga_dll)
+		add_library(iga_dll UNKNOWN IMPORTED)
+		set_property(TARGET iga_dll PROPERTY "IMPORTED_LOCATION" "${IGC_PATH}/${CMAKE_SHARED_LIBRARY_PREFIX}iga${NEO_BITS}${CMAKE_SHARED_LIBRARY_SUFFIX}")
+	endif()
+	list(APPEND IGDRCL__IGC_TARGETS "iga_dll")
+
+	if(NOT TARGET common_clang_dll)
+		add_library(common_clang_dll UNKNOWN IMPORTED)
+		set_property(TARGET common_clang_dll PROPERTY "IMPORTED_LOCATION" "${IGC_PATH}/${CMAKE_SHARED_LIBRARY_PREFIX}common_clang${IGDRCL_OPTION__BITS}${CMAKE_SHARED_LIBRARY_SUFFIX}")
+	endif()
+	list(APPEND IGDRCL__IGC_TARGETS "common_clang_dll")
+
+	# select proper gmm from artifacts
+	string(TOLOWER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_lower)
+	if(NOT GMM_LIB_PATHS)
+		if(WIN32)
+			set(GMM_LIB_PATHS "${ARTIFACTS_DIR}/windows/${CMAKE_BUILD_TYPE_lower}${IGDRCL_OPTION__BITS}/gmmocl")
+		else(WIN32)
+			set(GMM_LIB_PATHS "${ARTIFACTS_DIR}/linux/${CMAKE_BUILD_TYPE_lower}")
+		endif(WIN32)
+	endif()
+	message(STATUS "GmmLib binary path: ${GMM_LIB_PATHS}")
+
+	if(GMM_SOURCE_DIR)
+		get_filename_component(GMM_SOURCE_DIR "${GMM_SOURCE_DIR}" ABSOLUTE)
+	else(GMM_SOURCE_DIR)
+		get_filename_component(GMM_SOURCE_DIR "${CMAKE_SOURCE_DIR}/../gmmlib" ABSOLUTE)
+	endif(GMM_SOURCE_DIR)
+
+	if(NOT IS_DIRECTORY "${GMM_SOURCE_DIR}")
+		message(FATAL_ERROR "GmmLib public API not found!")
+	endif()
+
+	set(GMM_INCLUDE_PATHS
+		"${GMM_SOURCE_DIR}/GmmLib/inc"
+	)
+	set(UMKM_SHAREDDATA_INCLUDE_PATHS
+		"${GMM_SOURCE_DIR}/inc"
+		"${GMM_SOURCE_DIR}/inc/common"
+	)
+	set(IGDRCL__IGC_INCLUDE_DIR ${THIRD_PARTY_DIR})
+else(ARTIFACTS_DIR)
+	if(GMM_SOURCE_DIR)
+		get_filename_component(GMM_SOURCE_DIR "${GMM_SOURCE_DIR}" ABSOLUTE)
+	else(GMM_SOURCE_DIR)
+		get_filename_component(GMM_SOURCE_DIR_tmp "${CMAKE_SOURCE_DIR}/../gmmlib" ABSOLUTE)
+		if(IS_DIRECTORY "${GMM_SOURCE_DIR_tmp}")
+			set(GMM_SOURCE_DIR "${GMM_SOURCE_DIR_tmp}")
+		endif()
+	endif()
+	if(NOT IS_DIRECTORY "${GMM_SOURCE_DIR}")
+		message(FATAL_ERROR "GmmLib source not found!")
+	endif()
+	message(STATUS "GmmLib source dir is: ${GMM_SOURCE_DIR}")
+	add_subdirectory("${GMM_SOURCE_DIR}" "${IGDRCL_BUILD_DIR}/gmmlib")
+	set(UMKM_SHAREDDATA_INCLUDE_PATHS $<TARGET_PROPERTY:gmm_umd,INTERFACE_INCLUDE_DIRECTORIES>)
+
+	if(IGC_DIR)
+		get_filename_component(IGC_DIR "${IGC_DIR}" ABSOLUTE)
+	else(IGC_DIR)
+		get_filename_component(IGC_DIR_tmp "${CMAKE_SOURCE_DIR}/../igc" ABSOLUTE)
+		if(IS_DIRECTORY "${IGC_DIR_tmp}")
+			set(IGC_DIR "${IGC_DIR_tmp}")
+		endif()
+	endif()
+	message(STATUS "IGC source dir is: ${IGC_DIR}")
+	get_filename_component(IGC_PATH "${IGDRCL_BUILD_DIR}/igc" ABSOLUTE)
+	if(IS_DIRECTORY ${IGC_DIR})
+		set(IGC_OPTION__LIBRARY_NAME "igdccl")
+		set(IGC_OPTION__OUTPUT_DIR "${IGC_PATH}")
+		set(IGC_OPTION__INCLUDE_IGC_COMPILER_TOOLS OFF)
+		add_subdirectory("${IGC_DIR}" "${IGDRCL_BUILD_DIR}/igc" EXCLUDE_FROM_ALL)
+
+		set(IGDRCL__IGC_TARGETS "${IGC__IGC_TARGETS}")
+
+		foreach(TARGET_tmp ${IGDRCL__IGC_TARGETS})
+			list(APPEND IGDRCL__IGC_INCLUDE_DIR $<TARGET_PROPERTY:${TARGET_tmp},INTERFACE_INCLUDE_DIRECTORIES>)
+		endforeach(TARGET_tmp)
+		message(STATUS "IGC Includes: ${IGDRCL__IGC_INCLUDE_DIR}")
+	endif()
+endif(ARTIFACTS_DIR)
+
+add_definitions(-DGMM_OCL)
+
+if(IGC_PATH)
+	get_filename_component(IGC_PATH "${IGC_PATH}" ABSOLUTE)
+	message(STATUS "IGC binaries path: ${IGC_PATH}")
+endif(IGC_PATH)
+
+# We want to organize our IDE targets into folders
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+
+# Get available platforms
+include(platforms.cmake)
+
+# Enable/Disable BuiltIns compilation during build
+set(COMPILE_BUILT_INS TRUE CACHE BOOL "Enable built-in kernels compilation")
+
+# Changing the default executable and library output directories
+set(IGDRCL_OUTPUT_DIR "${IGDRCL_OPTION__OUTPUT_DIR}")
+
+# set output paths
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${IGDRCL_BINARY_DIR}/bin)
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${IGDRCL_BINARY_DIR}/bin)
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${IGDRCL_BINARY_DIR}/lib)
+
+# do not add rpath
+set(CMAKE_SKIP_RPATH YES CACHE BOOL "" FORCE )
+
+# Set the configuration type
+set(CMAKE_CONFIGURATION_TYPES
+	${CMAKE_BUILD_TYPE}
+)
+
+set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -D_DEBUG")
+
+option(APPVERIFIER_ALLOWED "allow use of AppVerifier" TRUE)
+
+option(CCACHE_ALLOWED "allow use of ccache" TRUE)
+find_program(CCACHE_EXE_FOUND ccache)
+if(CCACHE_EXE_FOUND AND CCACHE_ALLOWED)
+	message(STATUS "Found ccache: ${CCACHE_EXE_FOUND}")
+	set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache)
+	set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ccache)
+endif()
+
+if(MSVC)
+	if(NOT WDK_DIR)
+		get_filename_component(WDK_DIR "../wdk" ABSOLUTE)
+	endif(NOT WDK_DIR)
+	message(STATUS "WDK Directory: ${WDK_DIR}")
+else()
+	find_package(PkgConfig)
+  pkg_check_modules(LIBVA QUIET libva)
+  if(LIBVA_FOUND)
+    add_definitions(-DLIBVA)
+    message(STATUS "Using libva")
+  endif()
+endif()
+
+# Support for WUD
+# ENABLE_WUD(): takes no arguments. On MSVC it reconfigures linking for the
+# scope that invokes it (macros run in the caller's variable scope):
+#  - replaces CMAKE_CXX_STANDARD_LIBRARIES with "onecore.lib"
+#  - appends /NODEFAULTLIB:<lib> for every library named in the foreach list
+#    to the exe, module and shared linker flags
+#  - defines UNICODE and _UNICODE
+#  - adds the WDK "um" library directory for NEO_ARCH to the link directories
+# It does nothing for non-MSVC toolchains.
+# NOTE(review): WDK_DIR, WindowsTargetPlatformVersion and NEO_ARCH are expanded
+# at invocation time, so they must already be set when the macro is called.
+macro(ENABLE_WUD)
+	if(MSVC)
+		set(CMAKE_CXX_STANDARD_LIBRARIES "onecore.lib")
+		# Build one "/NODEFAULTLIB:<lib>" switch per library below.
+		set(LINKER_FLAGS "")
+		foreach(IT kernel32.lib;user32.lib;gdi32.lib;advapi32.lib;ole32.lib;)
+			set(LINKER_FLAGS "${LINKER_FLAGS} /NODEFAULTLIB:${IT}")
+		endforeach()
+		# Apply the same exclusions to executables, modules and shared libraries.
+		set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${LINKER_FLAGS}")
+		set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} ${LINKER_FLAGS}")
+		set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${LINKER_FLAGS}")
+		add_definitions(-DUNICODE -D_UNICODE)
+		link_directories("${WDK_DIR}/Win15/Lib/${WindowsTargetPlatformVersion}/um/${NEO_ARCH}/")
+	endif(MSVC)
+endmacro(ENABLE_WUD)
+
+# Miscellaneous options
+option(IGDRCL_GCOV "generate gcov report" OFF)
+option(HAVE_TBX_SERVER "Compile TBX server from TbxAccess library" OFF)
+option(USE_CL_CACHE "Use OpenCL program binary cache" ON)
+set(CL_CACHE_LOCATION "cl_cache" CACHE STRING "OpenCL program binary cache location")
+
+if(NOT NEO_DRIVER_VERSION)
+  set(NEO_DRIVER_VERSION 1.0)
+endif()
+
+# Put profiling enable flag into define
+if(OCL_RUNTIME_PROFILING)
+	add_definitions(-DOCL_RUNTIME_PROFILING=${OCL_RUNTIME_PROFILING})
+endif()
+
+# We want to build with the static, multithreaded runtime libraries (as opposed
+# to the multithreaded runtime DLLs)
+if(MSVC)
+	# Get WDK version from ${WDK_DIR}/Win15/WDKVersion.txt
+	file(READ "${WDK_DIR}/Win15/WDKVersion.txt" WindowsTargetPlatformVersion)
+	string(REPLACE " " ";" WindowsTargetPlatformVersion ${WindowsTargetPlatformVersion})
+	list(LENGTH WindowsTargetPlatformVersion versionListLength)
+	# WDKVersion.txt must split into exactly three space-separated fields; the
+	# third field is read right after this check as the target platform version.
+	# "ERROR" is not a message() mode keyword (it would only be printed as part
+	# of the text), so use FATAL_ERROR to stop configuration on a malformed file.
+	if(NOT versionListLength EQUAL 3)
+		message(FATAL_ERROR "Error reading content of WDKVersion.txt file")
+	endif(NOT versionListLength EQUAL 3)
+	list(GET WindowsTargetPlatformVersion 2 WindowsTargetPlatformVersion)
+
+	message(STATUS "WDK Version is ${WindowsTargetPlatformVersion}")
+
+	set(WDK_INCLUDE_PATHS
+			"${WDK_DIR}/Win15/Include/${WindowsTargetPlatformVersion}/um"
+			"${WDK_DIR}/Win15/Include/${WindowsTargetPlatformVersion}/shared"
+			"${WDK_DIR}/Win15/Include/${WindowsTargetPlatformVersion}/km"
+			)
+	# Force to treat warnings as errors
+	if(NOT CMAKE_CXX_FLAGS MATCHES "/WX")
+		 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /WX")
+	endif()
+	MESSAGE(STATUS "WDK include paths: ${WDK_INCLUDE_PATHS}")
+
+	string(REPLACE "/MDd" "/MTd" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
+	string(REPLACE "/MD" "/MT" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
+	string(REPLACE "/MD" "/MT" CMAKE_CXX_FLAGS_RELEASE-INTERNAL "${CMAKE_CXX_FLAGS_RELEASE-INTERNAL}")
+else()
+	if(IGDRCL_GCOV)
+		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fprofile-arcs -ftest-coverage --coverage")
+	endif()
+	option(USE_ASAN "Link with address sanitization support" OFF)
+	if(USE_ASAN)
+		if(CMAKE_COMPILER_IS_GNUCC)
+			set(ASAN_FLAGS " -fsanitize=address -fno-omit-frame-pointer")
+			link_libraries(asan)
+		else()
+			message(STATUS "Address sanitization with clang not yet support")
+		endif()
+	endif()
+	if(USE_TSAN)
+		if(${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")
+			set(TSAN_FLAGS " -fsanitize=thread")
+			link_libraries(tsan)
+		else()
+			message(STATUS "Thread sanitization with gcc is not fully supported")
+		endif()
+	endif()
+
+	include(CheckLibraryExists)
+	CHECK_LIBRARY_EXISTS(rt clock_gettime "time.h" HAVE_CLOCK_GETTIME)
+	if(HAVE_CLOCK_GETTIME)
+		link_libraries(rt)
+	endif(HAVE_CLOCK_GETTIME)
+endif(MSVC)
+
+# setup variables needed for custom configuration type
+# generate PDB files even for release build on MSVC
+if(MSVC)
+	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
+	set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zi")
+	set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF")
+endif()
+
+if(NOT MSVC)
+	SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ftemplate-depth=1024")
+endif(NOT MSVC)
+
+# Compiler warning flags
+if(NOT MSVC)
+	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall")
+	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wempty-body")
+	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wignored-qualifiers")
+	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wtype-limits")
+	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wuninitialized")
+	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wextra -Wno-unused-parameter -Wno-missing-field-initializers")
+
+	if("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang" )
+		# clang only
+		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wshorten-64-to-32")
+		if(USE_SANITIZE_UB)
+			message(STATUS "Enabling undefined behavior sanitizer")
+			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined -fno-sanitize-recover=undefined -fsanitize-recover=vptr -fno-rtti")
+		endif(USE_SANITIZE_UB)
+		if (NOT (CMAKE_C_COMPILER_VERSION VERSION_LESS 3.6))
+			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-local-typedefs")
+		endif()
+		if (NOT (CMAKE_C_COMPILER_VERSION VERSION_LESS 4.0))
+			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-register") # Added for htons()
+		endif()
+	else()
+		# gcc only
+		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-local-typedefs -Wno-unused-but-set-variable")
+		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wclobbered")
+		if (CMAKE_C_COMPILER_VERSION VERSION_LESS 7.0)
+		else()
+			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wimplicit-fallthrough=4")
+			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-noexcept-type")	# Added for gtest
+		endif()
+	endif()
+	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror")
+endif()
+
+# Compile code with defenses enabled (settings to be used for production release code)
+if("${CMAKE_BUILD_TYPE}" STREQUAL "Release")
+	if(MSVC)
+		set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /GS")
+		set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /sdl")
+		set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /NXCompat")
+		set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /DynamicBase")
+		# /SafeSEH is only meaningful for 32-bit x86 images. NEO_ARCH is set to
+		# either "x64" or "x86" earlier in this file, so compare against "x86";
+		# the previous "x32" value could never match and the flag was never added.
+		if("${NEO_ARCH}" STREQUAL "x86")
+			set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /SafeSEH")
+		endif()
+	else()
+		if(${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")
+			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstack-protector-strong")
+			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -D_FORTIFY_SOURCE=2")
+			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wformat -Wformat-security")
+		else()
+			# gcc, g++ only
+			if (CMAKE_C_COMPILER_VERSION VERSION_LESS 4.9)
+				set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstack-protector")
+			else()
+				set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstack-protector-strong")
+			endif()
+			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -D_FORTIFY_SOURCE=2")
+			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wformat -Wformat-security")
+			set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} -Wl,-z,noexecstack")
+			set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} -Wl,-z,relro")
+			set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} -Wl,-z,now")
+		endif()
+	endif()
+endif()
+
+# Project-wide include paths
+include_directories(${IGDRCL_SOURCE_DIR})
+include_directories(${IGDRCL_BUILD_DIR})
+
+# Define where to put binaries
+if(MSVC)
+	if ("${CMAKE_GENERATOR}" STREQUAL "Ninja")
+		set(TargetDir ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+	else()
+		set(TargetDir ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${CMAKE_BUILD_TYPE})
+	endif()
+else()
+	set(TargetDir ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+endif(MSVC)
+
+add_subdirectory(offline_compiler ${IGDRCL_BUILD_DIR}/offline_compiler)
+target_compile_definitions(cloc PUBLIC MOCKABLE_VIRTUAL=)
+
+# generate_runtime_lib(LIB_NAME MOCKABLE GENERATE_EXEC)
+# Adds the "runtime" subdirectory once per call, using
+# "${IGDRCL_BUILD_DIR}/${LIB_NAME}" as its binary directory.
+#   LIB_NAME      - stored in NEO_STATIC_LIB_NAME; also the prefix of
+#                   SHARINGS_ENABLE_LIB_NAME ("<LIB_NAME>_sharings_enable")
+#   MOCKABLE      - when true, MOCKABLE_VIRTUAL is defined as "virtual" on the
+#                   library targets, otherwise it is defined as empty
+#   GENERATE_EXEC - stored in GENERATE_EXECUTABLE before the subdirectory is
+#                   added (presumably consumed there; not visible in this file)
+macro(generate_runtime_lib LIB_NAME MOCKABLE GENERATE_EXEC)
+	# Variables set here are visible to the subdirectory added below.
+	set(NEO_STATIC_LIB_NAME ${LIB_NAME})
+	set(SHARINGS_ENABLE_LIB_NAME "${LIB_NAME}_sharings_enable")
+	set(GENERATE_EXECUTABLE ${GENERATE_EXEC})
+
+	add_subdirectory(runtime "${IGDRCL_BUILD_DIR}/${LIB_NAME}")
+	# The builtins and scheduler libraries always get an empty MOCKABLE_VIRTUAL.
+	target_compile_definitions(${BUILTINS_SOURCES_LIB_NAME} PUBLIC MOCKABLE_VIRTUAL=)
+	target_compile_definitions(${BUILTINS_BINARIES_LIB_NAME} PUBLIC MOCKABLE_VIRTUAL=)
+	target_compile_definitions(${SCHEDULER_BINARY_LIB_NAME} PUBLIC MOCKABLE_VIRTUAL=)
+
+
+	# Macro arguments are substituted textually, so this evaluates the literal
+	# value passed by the caller (e.g. TRUE or FALSE).
+	if(${MOCKABLE})
+		target_compile_definitions(${LIB_NAME} PUBLIC MOCKABLE_VIRTUAL=virtual)
+		target_compile_definitions(${SHARINGS_ENABLE_LIB_NAME} PUBLIC MOCKABLE_VIRTUAL=virtual)
+		# Mockable builds also receive the default test platform/family definitions.
+		target_compile_definitions(${LIB_NAME} PUBLIC DEFAULT_TEST_PLATFORM=${DEFAULT_TESTED_PLATFORM} DEFAULT_TEST_FAMILY_NAME=${DEFAULT_TESTED_FAMILY_NAME})
+	else()
+		target_compile_definitions(${LIB_NAME} PUBLIC MOCKABLE_VIRTUAL=)
+		target_compile_definitions(${SHARINGS_ENABLE_LIB_NAME} PUBLIC MOCKABLE_VIRTUAL=)
+	endif()
+endmacro(generate_runtime_lib)
+
+set(NEO_MOCKABLE_LIB_NAME "igdrcl_lib_mockable") # Used by ULTS
+set(NEO_RELEASE_LIB_NAME "igdrcl_lib_release") # Used by dll/so
+set(NEO_DYNAMIC_LIB_NAME "igdrcl_dll") # single NEO dll (when WUD-crosscompilation is disabled)
+set(NEO_DLL_NAME_BASE "igdrcl")
+
+set(BIKSIM_LIB_NAME "biksim")
+set(BUILTINS_SOURCES_LIB_NAME "builtins_sources")
+set(BUILTINS_BINARIES_LIB_NAME "builtins_binaries")
+set(SCHEDULER_BINARY_LIB_NAME "scheduler_binary")
+
+add_subdirectory(elf)
+generate_runtime_lib(${NEO_RELEASE_LIB_NAME} FALSE TRUE)
+generate_runtime_lib(${NEO_MOCKABLE_LIB_NAME} TRUE FALSE)
+
+# Add the optional sibling "icd" directory when it is present.
+# if(EXISTS ...) is only well-defined for full paths, so anchor the check to
+# CMAKE_CURRENT_SOURCE_DIR, which is also the base add_subdirectory() uses for
+# relative source paths.
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../icd")
+	add_subdirectory("${CMAKE_CURRENT_SOURCE_DIR}/../icd" ${IGDRCL_BINARY_DIR}/icd)
+endif()
+
+if(TARGET OpenCL)
+	target_include_directories(OpenCL PRIVATE ${KHRONOS_HEADERS_DIR})
+endif()
+
+if(DEFAULT_TESTED_PLATFORM)
+  add_subdirectory(unit_tests ${IGDRCL_BUILD_DIR}/unit_tests)
+endif()
+
+set(DONT_LINK_ELF_STATICALLY TRUE)
+if(EXISTS ${IGDRCL_SOURCE_DIR}/../internal)
+	add_subdirectory(${IGDRCL_SOURCE_DIR}/../internal ${IGDRCL_BUILD_DIR}/internal)
+endif(EXISTS ${IGDRCL_SOURCE_DIR}/../internal)
+
+set(CL_USE_DEPRECATED_OPENCL_1_1_APIS 1)
+set(CL_USE_DEPRECATED_OPENCL_1_2_APIS 1)
+set(CL_USE_DEPRECATED_OPENCL_2_0_APIS 1)
+set(_CRT_SECURE_NO_WARNINGS 1)
+
+include(package.cmake)
+
+configure_file(config.h.in ${IGDRCL_BUILD_DIR}/config.h)
--- a/5
+++ b/5
@@ -0,0 +1,5 @@
+#!groovy
+neoDependenciesRev='716918-671'
+strategy='EQUAL'
+allowedF=49
+allowedCD=381
--- a/README.md
+++ b/README.md
@@ -0,0 +1,76 @@
+# Intel(R) Graphics Compute Runtime for OpenCL(TM)
+
+## Introduction
+
+The Intel(R) Graphics Compute Runtime for OpenCL(TM) is an open source project to
+converge Intel's development efforts on OpenCL(TM) compute stacks supporting the
+GEN graphics hardware architecture.
+
+Please refer to http://01.org/compute-runtime for additional details regarding Intel's
+motivation and intentions with respect to OpenCL support in open source.
+
+## License
+
+The Intel(R) Graphics Compute Runtime for OpenCL(TM) is distributed under the MIT License.
+
+You may obtain a copy of the License at:
+
+https://opensource.org/licenses/MIT
+
+## Building
+
+<TODO:insert instructions here>
+
+
+### Install
+
+<TODO:insert instructions here>
+
+## Supported Platforms
+
+Intel Core Processors supporting Gen8 graphics devices - OpenCL 2.0  
+Intel Core Processors supporting Gen9 graphics devices - OpenCL 2.1  
+Intel Atom Processors supporting Gen9 graphics devices - OpenCL 1.2  
+
+## How to provide feedback
+
+By default, please submit an issue using native github.com interface: https://github.com/intel/compute-runtime/issues.  
+
+
+## How to contribute
+
+Create a pull request on github.com with your patch. Make sure your change is cleanly building and passing ULTs.
+A maintainer will contact you if there are questions or concerns.
+
+
+## Known Issues and Limitations
+
+OpenCL compliance of a driver built from open-source components should not be
+assumed by default. Intel will clearly designate / tag specific builds to
+indicate production quality including formal compliance. Other builds should be
+considered experimental. 
+
+The driver has the following functional delta compared to previously released drivers:
+* Intel's closed source SRB5.0 driver (aka Classic)  
+  https://software.intel.com/en-us/articles/opencl-drivers#latest_linux_driver
+* Intel's former open-source Beignet driver  
+  https://01.org/beignet
+
+## Generic extensions
+* cl_khr_mipmap
+* cl_khr_mipmap_writes
+* cl_khr_priority_hints
+* cl_khr_throttle_hints
+* cl_khr_fp64
+## Preview extensions
+* cl_intelx_video_enhancement
+* cl_intelx_video_enhancement_camera_pipeline
+* cl_intelx_video_enhancement_color_pipeline
+* cl_intelx_hevc_pak
+## Other capabilities
+* OpenGL sharing with MESA driver
+* CL_MEM_SVM_FINE_GRAIN_BUFFER (if using unpatched i915)
+
+
+___(*) Other names and brands may be claimed as property of others.___
+
--- a/config.h.in
+++ b/config.h.in
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef CONFIG_H
+#define CONFIG_H
+
+#cmakedefine CL_USE_DEPRECATED_OPENCL_1_1_APIS
+#cmakedefine CL_USE_DEPRECATED_OPENCL_1_2_APIS
+#cmakedefine CL_USE_DEPRECATED_OPENCL_2_0_APIS
+#cmakedefine _CRT_SECURE_NO_WARNINGS
+
+#cmakedefine USE_CL_CACHE
+#if defined(USE_CL_CACHE)
+static const bool clCacheEnabled = true;
+#else
+static const bool clCacheEnabled = false;
+#endif
+
+#cmakedefine CL_CACHE_LOCATION "${CL_CACHE_LOCATION}"
+
+#endif /* CONFIG_H */
--- a/elf/CMakeLists.txt
+++ b/elf/CMakeLists.txt
@@ -0,0 +1,38 @@
+# Copyright (c) 2017, Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+# We require cmake 3.2.0 or later
+cmake_minimum_required(VERSION 3.2.0 FATAL_ERROR)
+
+add_library(elflib STATIC
+	${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
+	${CMAKE_CURRENT_SOURCE_DIR}/reader.cpp
+	${CMAKE_CURRENT_SOURCE_DIR}/reader.h
+	${CMAKE_CURRENT_SOURCE_DIR}/types.h
+	${CMAKE_CURRENT_SOURCE_DIR}/writer.cpp
+	${CMAKE_CURRENT_SOURCE_DIR}/writer.h
+)
+
+target_include_directories(elflib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+target_include_directories(elflib PRIVATE ${IGDRCL_SOURCE_DIR})
+
+set_target_properties(elflib PROPERTIES FOLDER "elflib")
+
+set_target_properties(elflib PROPERTIES POSITION_INDEPENDENT_CODE ON)
--- a/elf/reader.cpp
+++ b/elf/reader.cpp
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "reader.h"
+#include <string.h>
+
+namespace CLElfLib {
+
+/******************************************************************************\
+ Constructor: CElfReader::CElfReader
+ Description: Stores pointers into the caller's ELF64 binary (no copy is made)
+              and caches the location and size of the section-name string
+              table. elfBinarySize is not consulted here.
+\******************************************************************************/
+CElfReader::CElfReader(
+    const char *pElfBinary,
+    const size_t elfBinarySize) {
+    m_pBinary = pElfBinary;
+    m_pElfHeader = (SElf64Header *)pElfBinary;
+    m_nameTableSize = 0;
+    m_pNameTable = NULL;
+
+    // nothing to look up when no binary was supplied
+    if (!m_pElfHeader) {
+        return;
+    }
+
+    // resolve the string table; if the lookup fails the defaults above remain
+    getSectionData(
+        m_pElfHeader->SectionNameTableIndex,
+        m_pNameTable, m_nameTableSize);
+}
+
+/******************************************************************************\
+ Destructor: CElfReader::~CElfReader
+ Description: Intentionally empty; the reader only holds pointers into the
+              binary handed to the constructor and frees nothing.
+\******************************************************************************/
+CElfReader::~CElfReader() {
+}
+
+/******************************************************************************\
+ Member Function: CElfReader::create
+ Description:     Returns a newly allocated reader for pElfBinary, or NULL
+                  when the binary does not pass isValidElf64. Release the
+                  result with destroy().
+\******************************************************************************/
+CElfReader *CElfReader::create(
+    const char *pElfBinary,
+    const size_t elfBinarySize) {
+    // refuse to wrap anything that fails validation
+    if (!isValidElf64(pElfBinary, elfBinarySize)) {
+        return NULL;
+    }
+
+    return new CElfReader(pElfBinary, elfBinarySize);
+}
+
+/******************************************************************************\
+ Member Function: CElfReader::destroy
+ Description:     Deletes the reader and resets the caller's pointer to NULL.
+                  Passing a NULL pointer is allowed.
+\******************************************************************************/
+void CElfReader::destroy(
+    CElfReader *&pElfReader) {
+    // deleting a NULL pointer is a no-op, so no guard is required
+    delete pElfReader;
+    pElfReader = NULL;
+}
+
+/******************************************************************************\
+ Member Function: isValidElf64
+ Description:     Determines if a binary is in the ELF64 format and checks for
+                  invalid offsets.
+
+                  pBinary    - start of the candidate binary (may be NULL)
+                  binarySize - number of readable bytes at pBinary
+
+                  Returns true only when:
+                   - the ELF magic and 64-bit class are present,
+                   - every section header lies completely inside the binary,
+                   - every section's data range lies inside the binary,
+                   - every non-zero section name offset lies inside the
+                     section-name string table (when the header names one),
+                   - header size + all section header entries + all section
+                     data add up to exactly binarySize.
+
+                  All range checks are done on offsets, not pointers, so a
+                  crafted offset or size cannot wrap around.
+\******************************************************************************/
+bool CElfReader::isValidElf64(
+    const void *pBinary,
+    const size_t binarySize) {
+    // the ELF header itself must fit in the buffer
+    if (!pBinary || (binarySize < sizeof(SElf64Header))) {
+        return false;
+    }
+
+    const char *pBase = (const char *)pBinary;
+    const SElf64Header *pElf64Header = (const SElf64Header *)pBinary;
+
+    // validate header
+    if ((pElf64Header->Identity[ID_IDX_MAGIC0] != ELF_MAG0) ||
+        (pElf64Header->Identity[ID_IDX_MAGIC1] != ELF_MAG1) ||
+        (pElf64Header->Identity[ID_IDX_MAGIC2] != ELF_MAG2) ||
+        (pElf64Header->Identity[ID_IDX_MAGIC3] != ELF_MAG3) ||
+        (pElf64Header->Identity[ID_IDX_CLASS] != EH_CLASS_64)) {
+        return false;
+    }
+
+    const uint64_t totalSize = binarySize;
+    const uint64_t headersOffset = pElf64Header->SectionHeadersOffset;
+    const uint64_t entrySize = pElf64Header->SectionHeaderEntrySize;
+    const unsigned int numSections = pElf64Header->NumSectionHeaderEntries;
+
+    // Section headers sit at non-decreasing offsets, so if the last one fits
+    // completely inside the binary, all of them do. Both counts are 16-bit,
+    // so the product below cannot overflow 64 bits.
+    if (numSections > 0) {
+        if (headersOffset > totalSize) {
+            return false;
+        }
+
+        const uint64_t lastHeaderOffset = (numSections - 1) * entrySize;
+        const uint64_t roomForHeaders = totalSize - headersOffset;
+
+        if ((lastHeaderOffset > roomForHeaders) ||
+            (sizeof(SElf64SectionHeader) > (roomForHeaders - lastHeaderOffset))) {
+            return false;
+        }
+    }
+
+    // size of the section-name string table, when the header names a valid one
+    bool hasNameTable = false;
+    uint64_t nameTableSize = 0;
+
+    if (pElf64Header->SectionNameTableIndex < numSections) {
+        const SElf64SectionHeader *pNameTableHeader =
+            (const SElf64SectionHeader *)(pBase +
+                                          (size_t)(headersOffset + (pElf64Header->SectionNameTableIndex * entrySize)));
+
+        nameTableSize = pNameTableHeader->DataSize;
+        hasNameTable = true;
+    }
+
+    // validate sections
+    size_t ourSize = pElf64Header->ElfHeaderSize;
+
+    for (unsigned int i = 0; i < numSections; i++) {
+        const SElf64SectionHeader *pSectionHeader =
+            (const SElf64SectionHeader *)(pBase + (size_t)(headersOffset + (i * entrySize)));
+
+        // check section data; written as a subtraction so it cannot wrap
+        if ((pSectionHeader->DataOffset > totalSize) ||
+            (pSectionHeader->DataSize > (totalSize - pSectionHeader->DataOffset))) {
+            return false;
+        }
+
+        // check section name index; offset 0 is accepted as the empty name
+        if (hasNameTable &&
+            (pSectionHeader->Name != 0) &&
+            (pSectionHeader->Name >= nameTableSize)) {
+            return false;
+        }
+
+        // tally up the sizes
+        ourSize += (size_t)pSectionHeader->DataSize;
+        ourSize += (size_t)entrySize;
+    }
+
+    // the pieces must account for every byte of the binary
+    return ourSize == binarySize;
+}
+
+/******************************************************************************\
+ Member Function: getElfHeader
+ Description:     Returns a pointer to the ELF header of the wrapped binary
+                  (the pointer stored by the constructor)
+\******************************************************************************/
+const SElf64Header *CElfReader::getElfHeader() {
+    return m_pElfHeader;
+}
+
+/******************************************************************************\
+ Member Function: getSectionHeader
+ Description:     Returns a pointer to the header of section sectionIndex, or
+                  NULL when the index is not below the section count stored
+                  in the ELF header.
+\******************************************************************************/
+const SElf64SectionHeader *CElfReader::getSectionHeader(
+    unsigned int sectionIndex) {
+    const size_t headerStride = m_pElfHeader->SectionHeaderEntrySize;
+
+    // indices at or beyond the section count have no header
+    if (sectionIndex >= m_pElfHeader->NumSectionHeaderEntries) {
+        return NULL;
+    }
+
+    // headers are laid out back to back, headerStride bytes apart
+    const size_t headerOffset = (size_t)m_pElfHeader->SectionHeadersOffset +
+                                (sectionIndex * headerStride);
+
+    return (SElf64SectionHeader *)((char *)m_pElfHeader + headerOffset);
+}
+
+/******************************************************************************\
+ Member Function: getSectionData
+ Description:     Looks a section up by index. On success stores a pointer to
+                  its data in pData, its size in dataSize, and returns true.
+                  Returns false (outputs untouched) for an invalid index.
+\******************************************************************************/
+bool CElfReader::getSectionData(
+    const unsigned int sectionIndex,
+    char *&pData,
+    size_t &dataSize) {
+    const SElf64SectionHeader *pHeader = getSectionHeader(sectionIndex);
+
+    if (!pHeader) {
+        return false;
+    }
+
+    // the data offset is relative to the start of the binary
+    dataSize = (size_t)pHeader->DataSize;
+    pData = (char *)m_pBinary + pHeader->DataOffset;
+    return true;
+}
+
+/******************************************************************************\
+ Member Function: getSectionData
+ Description:     Looks a section up by name. On success stores a pointer to
+                  the first matching section's data in pData, its size in
+                  dataSize, and returns true. Returns false when pName is NULL
+                  or no section carries that name. Section 0 is not searched.
+\******************************************************************************/
+bool CElfReader::getSectionData(
+    const char *pName,
+    char *&pData,
+    size_t &dataSize) {
+    // a NULL name can never match and must not reach strcmp
+    if (!pName) {
+        return false;
+    }
+
+    for (unsigned int i = 1; i < m_pElfHeader->NumSectionHeaderEntries; i++) {
+        const char *pSectionName = getSectionName(i);
+
+        if (pSectionName && (strcmp(pName, pSectionName) == 0)) {
+            // report the lookup's own result instead of assuming success
+            return getSectionData(i, pData, dataSize);
+        }
+    }
+
+    return false;
+}
+
+/******************************************************************************\
+ Member Function: getSectionName
+ Description:     Returns a pointer into the section-name string table for
+                  section sectionIndex. Returns NULL when the index is out of
+                  range, when no string table was resolved at construction,
+                  or when the section's name offset lies outside the table.
+                  NOTE(review): the string is assumed to be NULL terminated
+                  inside the table; that is not verified here.
+\******************************************************************************/
+const char *CElfReader::getSectionName(
+    unsigned int sectionIndex) {
+    const SElf64SectionHeader *pSectionHeader = getSectionHeader(sectionIndex);
+
+    // without a header or a string table there is nothing to point at
+    if (!pSectionHeader || !m_pNameTable) {
+        return NULL;
+    }
+
+    // never hand out a pointer past the end of the string table
+    if (pSectionHeader->Name >= m_nameTableSize) {
+        return NULL;
+    }
+
+    return m_pNameTable + pSectionHeader->Name;
+}
+
+} // namespace CLElfLib
--- a/elf/reader.h
+++ b/elf/reader.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+#include "types.h"
+
+#if defined(_WIN32)
+#define ELF_CALL __stdcall
+#else
+#define ELF_CALL
+#endif
+
+namespace CLElfLib {
+/******************************************************************************\
+
+ Class:         CElfReader
+
+ Description:   Class to provide simpler interaction with the ELF standard
+                binary object.  SElf64Header defines the ELF header type and 
+                SElf64SectionHeader defines the section header type.
+
+\******************************************************************************/
+class CElfReader {
+  public:
+    // Returns a new reader for the binary, or NULL when isValidElf64 rejects
+    // it. The reader keeps pointers into pElfBinary rather than a copy.
+    static CElfReader *ELF_CALL create(
+        const char *pElfBinary,
+        const size_t elfBinarySize);
+
+    // Deletes the reader (if any) and sets the caller's pointer to NULL.
+    static void ELF_CALL destroy(
+        CElfReader *&pElfObject);
+
+    // Checks the ELF64 identification bytes and the section layout of a
+    // binary against binarySize.
+    static bool ELF_CALL isValidElf64(
+        const void *pBinary,
+        const size_t binarySize);
+
+    // Returns the ELF header of the wrapped binary.
+    const SElf64Header *ELF_CALL getElfHeader();
+
+    // Returns the header of the given section, or NULL when sectionIndex is
+    // not below the section count in the ELF header.
+    const SElf64SectionHeader *ELF_CALL getSectionHeader(
+        unsigned int sectionIndex);
+
+    // Returns the section's name as a pointer into the string table, or NULL
+    // when the name cannot be resolved.
+    const char *ELF_CALL getSectionName(
+        unsigned int sectionIndex);
+
+    // Looks a section up by index; on success fills pData/dataSize and
+    // returns true.
+    bool ELF_CALL getSectionData(
+        const unsigned int sectionIndex,
+        char *&pData,
+        size_t &dataSize);
+
+    // Looks a section up by name; on success fills pData/dataSize and
+    // returns true.
+    bool ELF_CALL getSectionData(
+        const char *sectionName,
+        char *&pData,
+        size_t &dataSize);
+
+  protected:
+    // Construction and destruction go through create()/destroy().
+    ELF_CALL CElfReader(
+        const char *pElfBinary,
+        const size_t elfBinarySize);
+
+    ELF_CALL ~CElfReader();
+
+    SElf64Header *m_pElfHeader; // pointer to the ELF header
+    const char *m_pBinary;      // portable ELF binary
+    char *m_pNameTable;         // pointer to the string table
+    size_t m_nameTableSize;     // size of string table in bytes
+};
+} // namespace CLElfLib
--- a/elf/types.h
+++ b/elf/types.h
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+// Abstract: Defines the types used for ELF headers/sections.
+#pragma once
+
+#include <inttypes.h>
+#include <stddef.h>
+
+namespace CLElfLib {
+/******************************************************************************\
+ ELF Enumerates
+\******************************************************************************/
+
+// E_ID_IDX - Indices into SElf64Header::Identity, the leading bytes that
+//    identify a file as being ELF.
+enum E_ID_IDX {
+    ID_IDX_MAGIC0 = 0,
+    ID_IDX_MAGIC1 = 1,
+    ID_IDX_MAGIC2 = 2,
+    ID_IDX_MAGIC3 = 3,
+    ID_IDX_CLASS = 4,
+    ID_IDX_VERSION = 5,
+    ID_IDX_OSABI = 6,
+    ID_IDX_ABI_VERSION = 7,
+    ID_IDX_PADDING = 8,
+    ID_IDX_NUM_BYTES = 16, // total length of the Identity array
+};
+
+// E_EH_CLASS - Describes what data types the ELF structures will use.
+//    Stored in SElf64Header::Identity[ID_IDX_CLASS].
+enum E_EH_CLASS {
+    EH_CLASS_NONE = 0,
+    EH_CLASS_32 = 1, // Use Elf32 data types
+    EH_CLASS_64 = 2, // Use Elf64 data types
+};
+
+// E_EH_TYPE - List of pre-defined ELF header types.
+//    OS-specific codes start at 0xfe00 and run to 0xfeff.
+//    Processor-specific codes start at 0xff00 and end at 0xffff.
+enum E_EH_TYPE {
+    EH_TYPE_NONE = 0,
+    EH_TYPE_RELOCATABLE = 1,
+    EH_TYPE_EXECUTABLE = 2,
+    EH_TYPE_DYNAMIC = 3,
+    EH_TYPE_CORE = 4,
+    EH_TYPE_OPENCL_SOURCE = 0xff01,     // format used to pass CL text sections to FE
+    EH_TYPE_OPENCL_OBJECTS = 0xff02,    // format used to pass LLVM objects / store LLVM binary output
+    EH_TYPE_OPENCL_LIBRARY = 0xff03,    // format used to store LLVM archive output
+    EH_TYPE_OPENCL_EXECUTABLE = 0xff04, // format used to store executable output
+    EH_TYPE_OPENCL_DEBUG = 0xff05,      // format used to store debug output
+};
+
+// E_EH_MACHINE - List of pre-defined machine types.
+//    For OpenCL, currently, we do not need this information, so this is not
+//    fully defined.
+enum E_EH_MACHINE {
+    EH_MACHINE_NONE = 0,
+    //EHT_MACHINE_LO_RSVD    = 1,   // Beginning of range of reserved types.
+    //EHT_MACHINE_HI_RSVD    = 200, // End of range of reserved types.
+};
+
+// E_EHT_VERSION - ELF header version options.
+enum E_EHT_VERSION {
+    EH_VERSION_INVALID = 0,
+    EH_VERSION_CURRENT = 1,
+};
+
+// E_SH_TYPE - List of pre-defined section header types.
+//    Processor-specific codes start at 0xff00 and end at 0xffff.
+enum E_SH_TYPE {
+    SH_TYPE_NULL = 0,
+    SH_TYPE_PROG_BITS = 1,
+    SH_TYPE_SYM_TBL = 2,
+    SH_TYPE_STR_TBL = 3,
+    SH_TYPE_RELO_ADDS = 4,
+    SH_TYPE_HASH = 5,
+    SH_TYPE_DYN = 6,
+    SH_TYPE_NOTE = 7,
+    SH_TYPE_NOBITS = 8,
+    SH_TYPE_RELO_NO_ADDS = 9,
+    SH_TYPE_SHLIB = 10,
+    SH_TYPE_DYN_SYM_TBL = 11,
+    SH_TYPE_INIT = 14,
+    SH_TYPE_FINI = 15,
+    SH_TYPE_PRE_INIT = 16,
+    SH_TYPE_GROUP = 17,
+    SH_TYPE_SYMTBL_SHNDX = 18,
+    SH_TYPE_OPENCL_SOURCE = 0xff000000,           // CL source to link into LLVM binary
+    SH_TYPE_OPENCL_HEADER = 0xff000001,           // CL header to link into LLVM binary
+    SH_TYPE_OPENCL_LLVM_TEXT = 0xff000002,        // LLVM text
+    SH_TYPE_OPENCL_LLVM_BINARY = 0xff000003,      // LLVM byte code
+    SH_TYPE_OPENCL_LLVM_ARCHIVE = 0xff000004,     // LLVM archives(s)
+    SH_TYPE_OPENCL_DEV_BINARY = 0xff000005,       // Device binary (coherent by default)
+    SH_TYPE_OPENCL_OPTIONS = 0xff000006,          // CL Options
+    SH_TYPE_OPENCL_PCH = 0xff000007,              // PCH (pre-compiled headers)
+    SH_TYPE_OPENCL_DEV_DEBUG = 0xff000008,        // Device debug
+    SH_TYPE_SPIRV = 0xff000009,                   // SPIRV
+    SH_TYPE_NON_COHERENT_DEV_BINARY = 0xff00000a, // Non-coherent Device binary
+};
+
+// E_SH_FLAG - List of section header flags.
+enum E_SH_FLAG {
+    SH_FLAG_WRITE = 0x1,
+    SH_FLAG_ALLOC = 0x2,
+    SH_FLAG_EXEC_INSTR = 0x4,
+    SH_FLAG_MERGE = 0x8,
+    SH_FLAG_STRINGS = 0x10,
+    SH_FLAG_INFO_LINK = 0x20,
+    SH_FLAG_LINK_ORDER = 0x40,
+    SH_FLAG_OS_NONCONFORM = 0x100,
+    SH_FLAG_GROUP = 0x200,
+    SH_FLAG_TLS = 0x400,
+    SH_FLAG_MASK_OS = 0x0ff00000,
+    SH_FLAG_MASK_PROC = 0xf0000000,
+};
+
+/******************************************************************************\
+ ELF-64 Data Types
+\******************************************************************************/
+#if defined(_MSC_VER) // && (_MSC_VER < 1700)
+typedef unsigned __int64 Elf64_Addr;
+typedef unsigned __int64 Elf64_Off;
+typedef unsigned __int16 Elf64_Short; // Renaming Elf64_Half to Elf64_Short to avoid a conflict with Android
+typedef unsigned __int32 Elf64_Word;
+typedef __int32 Elf64_Sword;
+typedef unsigned __int64 Elf64_Xword;
+#else
+#if !defined(_UAPI_LINUX_ELF_H)
+typedef uint64_t Elf64_Addr;
+typedef uint64_t Elf64_Off;
+typedef uint32_t Elf64_Word;
+typedef int32_t Elf64_Sword;
+typedef uint64_t Elf64_Xword;
+#endif
+typedef uint16_t Elf64_Short; // Renaming Elf64_Half to Elf64_Short to avoid a conflict with Android
+#endif
+
+/******************************************************************************\
+ ELF Constants
+\******************************************************************************/
+static const unsigned char ELF_MAG0 = 0x7f;     // SElf64Header.Identity[ID_IDX_MAGIC0]
+static const unsigned char ELF_MAG1 = 'E';      // SElf64Header.Identity[ID_IDX_MAGIC1]
+static const unsigned char ELF_MAG2 = 'L';      // SElf64Header.Identity[ID_IDX_MAGIC2]
+static const unsigned char ELF_MAG3 = 'F';      // SElf64Header.Identity[ID_IDX_MAGIC3]
+static const unsigned int ELF_ALIGN_BYTES = 16; // Alignment set to 16-bytes
+
+/******************************************************************************\
+ ELF-64 Header
+ Located at the very start of the binary; all offsets below are measured
+ from the start of the binary.
+\******************************************************************************/
+struct SElf64Header {
+    unsigned char Identity[ID_IDX_NUM_BYTES]; // indexed with E_ID_IDX
+    Elf64_Short Type;
+    Elf64_Short Machine;
+    Elf64_Word Version;
+    Elf64_Addr EntryAddress;
+    Elf64_Off ProgramHeadersOffset;
+    Elf64_Off SectionHeadersOffset;      // offset of the first section header
+    Elf64_Word Flags;
+    Elf64_Short ElfHeaderSize;           // size of this header in bytes
+    Elf64_Short ProgramHeaderEntrySize;
+    Elf64_Short NumProgramHeaderEntries;
+    Elf64_Short SectionHeaderEntrySize;  // distance in bytes between section headers
+    Elf64_Short NumSectionHeaderEntries; // number of section headers
+    Elf64_Short SectionNameTableIndex;   // index of the section holding section names
+};
+
+/******************************************************************************\
+ ELF-64 Section Header
+ One entry per section in the section header table.
+\******************************************************************************/
+struct SElf64SectionHeader {
+    Elf64_Word Name;       // offset of the name inside the section-name string table
+    Elf64_Word Type;
+    Elf64_Xword Flags;
+    Elf64_Addr Address;
+    Elf64_Off DataOffset;  // offset of the section data from the start of the binary
+    Elf64_Xword DataSize;  // size of the section data in bytes
+    Elf64_Word Link;
+    Elf64_Word Info;
+    Elf64_Xword Alignment;
+    Elf64_Xword EntrySize;
+};
+
+} // namespace CLElfLib
--- a/elf/writer.cpp
+++ b/elf/writer.cpp
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include "writer.h"
+#include "runtime/helpers/string.h"
+#include <cstring>
+
+namespace CLElfLib {
+/******************************************************************************\
+ Constructor: CElfWriter::CElfWriter
+ Description: Records the ELF type, machine and flags that patchElfHeader
+              later writes into the ELF header. No sections are added here.
+\******************************************************************************/
+CElfWriter::CElfWriter(
+    E_EH_TYPE type,
+    E_EH_MACHINE machine,
+    Elf64_Xword flags) {
+    m_type = type;
+    m_machine = machine;
+    m_flags = flags;
+}
+
+/******************************************************************************\
+ Destructor: CElfWriter::~CElfWriter
+ Description: Releases every section node still waiting in the queue together
+              with the data buffer each node owns.
+\******************************************************************************/
+CElfWriter::~CElfWriter() {
+    // Drain the queue front to back
+    while (!m_nodeQueue.empty()) {
+        SSectionNode *pPending = m_nodeQueue.front();
+        m_nodeQueue.pop();
+
+        if (!pPending) {
+            continue;
+        }
+
+        // delete[] on a NULL data pointer is a no-op
+        delete[] pPending->pData;
+        pPending->pData = NULL;
+
+        delete pPending;
+    }
+}
+
+/******************************************************************************\
+ Member Function: CElfWriter::create
+ Description:     Allocates a writer and initializes it. Returns the writer,
+                  or NULL when initialization fails (the writer is deleted in
+                  that case). Release the result with destroy().
+\******************************************************************************/
+CElfWriter *CElfWriter::create(
+    E_EH_TYPE type,
+    E_EH_MACHINE machine,
+    Elf64_Xword flags) {
+    CElfWriter *pNewWriter = new CElfWriter(type, machine, flags);
+
+    if (pNewWriter->initialize()) {
+        return pNewWriter;
+    }
+
+    // initialization failed: destroy() deletes the writer and NULLs the pointer
+    destroy(pNewWriter);
+    return pNewWriter;
+}
+
+/******************************************************************************\
+ Member Function: CElfWriter::destroy
+ Description:     Deletes the writer and resets the caller's pointer to NULL.
+                  Passing a NULL pointer is allowed.
+\******************************************************************************/
+void CElfWriter::destroy(
+    CElfWriter *&pWriter) {
+    // deleting a NULL pointer is a no-op, so no guard is required
+    delete pWriter;
+    pWriter = NULL;
+}
+
+/******************************************************************************\
+ Member Function: CElfWriter::addSection
+ Description:     Queues a deep copy of *pSectionNode (type, flags, name and
+                  data) for the next resolveBinary call and updates the
+                  running data, string table and section counters.
+
+                  Returns false, leaving the writer unchanged, when
+                  pSectionNode is NULL or when it claims DataSize > 0 without
+                  supplying a data pointer. A section with DataSize == 0 and
+                  no data is valid.
+\******************************************************************************/
+bool CElfWriter::addSection(
+    SSectionNode *pSectionNode) {
+    // The section description must be non-NULL
+    if (!pSectionNode) {
+        return false;
+    }
+
+    const unsigned int dataSize = pSectionNode->DataSize;
+    const size_t nameSize = pSectionNode->Name.size() + 1; // +1 for the terminating '\0'
+
+    // A section that claims data must supply it; otherwise the copy below
+    // would leave a freshly allocated, unfilled buffer in the binary.
+    if ((dataSize > 0) && !pSectionNode->pData) {
+        return false;
+    }
+
+    // plain new reports failure by throwing, so no NULL checks are needed
+    SSectionNode *pNode = new SSectionNode();
+
+    pNode->Flags = pSectionNode->Flags;
+    pNode->Type = pSectionNode->Type;
+    pNode->Name = pSectionNode->Name;
+
+    // ok to have NULL data
+    if (dataSize > 0) {
+        pNode->pData = new char[dataSize];
+        memcpy_s(pNode->pData, dataSize, pSectionNode->pData, dataSize);
+        pNode->DataSize = dataSize;
+    }
+
+    // push the node onto the queue
+    m_nodeQueue.push(pNode);
+
+    // increment the sizes for each section
+    m_dataSize += dataSize;
+    m_stringTableSize += nameSize;
+    m_numSections++;
+
+    return true;
+}
+
+/******************************************************************************\
+ Member Function: CElfWriter::resolveBinary
+ Description:     Computes the size of the final ELF binary and, when pBinary
+                  is non-NULL, serializes the queued sections into it.
+
+                  Layout written: ELF header, one section header per queued
+                  section plus one for the string table, all section data,
+                  then the string table of NULL terminated section names.
+
+                  pBinary    - destination buffer, or NULL to only query the
+                               size. NOTE(review): its capacity is not passed
+                               in, so the caller must provide at least the
+                               size reported by a prior NULL call.
+                  binarySize - receives the total binary size on success.
+
+                  Writing consumes and frees the queued sections.
+\******************************************************************************/
+bool CElfWriter::resolveBinary(
+    char *const pBinary,
+    size_t &binarySize) {
+    bool retVal = true;
+    SSectionNode *pNode = NULL;
+    SElf64SectionHeader *pCurSectionHeader = NULL;
+    char *pData = NULL;
+    char *pStringTable = NULL;
+    char *pCurString = NULL;
+
+    m_totalBinarySize =
+        sizeof(SElf64Header) +
+        ((m_numSections + 1) * sizeof(SElf64SectionHeader)) + // +1 to account for string table entry
+        m_dataSize +
+        m_stringTableSize;
+
+    if (pBinary) {
+        // get a pointer to the first section header
+        pCurSectionHeader = (SElf64SectionHeader *)(pBinary + sizeof(SElf64Header));
+
+        // get a pointer to the data
+        pData = pBinary +
+                sizeof(SElf64Header) +
+                ((m_numSections + 1) * sizeof(SElf64SectionHeader)); // +1 to account for string table entry
+
+        // get a pointer to the string table
+        pStringTable = pBinary + sizeof(SElf64Header) +
+                       ((m_numSections + 1) * sizeof(SElf64SectionHeader)) + // +1 to account for string table entry
+                       m_dataSize;
+
+        pCurString = pStringTable;
+
+        // Walk through the section nodes
+        while (m_nodeQueue.empty() == false) {
+            pNode = m_nodeQueue.front();
+
+            // Always pop: leaving a NULL entry at the front of the queue
+            // would make this loop spin forever.
+            m_nodeQueue.pop();
+
+            if (pNode) {
+                // Copy data into the section header
+                memset(pCurSectionHeader, 0, sizeof(SElf64SectionHeader));
+                pCurSectionHeader->Type = pNode->Type;
+                pCurSectionHeader->Flags = pNode->Flags;
+                pCurSectionHeader->DataSize = pNode->DataSize;
+                pCurSectionHeader->DataOffset = pData - pBinary;
+                pCurSectionHeader->Name = (Elf64_Word)(pCurString - pStringTable);
+                pCurSectionHeader = (SElf64SectionHeader *)((unsigned char *)pCurSectionHeader + sizeof(SElf64SectionHeader));
+
+                // copy the data, move the data pointer
+                memcpy_s(pData, pNode->DataSize, pNode->pData, pNode->DataSize);
+                pData += pNode->DataSize;
+
+                // copy the name into the string table, move the string pointer
+                if (pNode->Name.size() > 0) {
+                    memcpy_s(pCurString, pNode->Name.size(), pNode->Name.c_str(), pNode->Name.size());
+                    pCurString += pNode->Name.size();
+                }
+                *(pCurString++) = '\0';
+
+                // delete the node and its data
+                if (pNode->pData) {
+                    delete[] pNode->pData;
+                    pNode->pData = NULL;
+                }
+
+                delete pNode;
+                pNode = nullptr;
+            }
+        }
+
+        // add the string table section header
+        SElf64SectionHeader stringSectionHeader = {0};
+        stringSectionHeader.Type = SH_TYPE_STR_TBL;
+        stringSectionHeader.Flags = 0;
+        stringSectionHeader.DataOffset = pStringTable - pBinary;
+        stringSectionHeader.DataSize = m_stringTableSize;
+        stringSectionHeader.Name = 0;
+
+        // Copy into the last section header
+        memcpy_s(pCurSectionHeader, sizeof(SElf64SectionHeader),
+                 &stringSectionHeader, sizeof(SElf64SectionHeader));
+
+        // Add to our section number
+        m_numSections++;
+
+        // patch up the ELF header
+        retVal = patchElfHeader(pBinary);
+    }
+
+    if (retVal) {
+        binarySize = m_totalBinarySize;
+    }
+
+    return retVal;
+}
+
+/******************************************************************************\
+ Member Function: CElfWriter::initialize
+ Description:     Queues the leading empty section (index 0) and returns
+                  whether adding it succeeded.
+\******************************************************************************/
+bool CElfWriter::initialize() {
+    // A default-constructed node is the empty section 0 (points to "no-bits")
+    SSectionNode nullSection;
+    const bool added = addSection(&nullSection);
+
+    return added;
+}
+
+/******************************************************************************\
+ Member Function: CElfWriter::patchElfHeader
+ Description:     Writes the ELF header at the start of pBinary from the
+                  writer's type, machine, flags and section count. Returns
+                  false when pBinary is NULL, true otherwise.
+\******************************************************************************/
+bool CElfWriter::patchElfHeader(char *const pBinary) {
+    SElf64Header *pHeader = (SElf64Header *)pBinary;
+
+    if (!pHeader) {
+        return false;
+    }
+
+    // Start from an all-zero header so untouched fields read as 0
+    memset(pHeader, 0x00, sizeof(SElf64Header));
+
+    // Identification bytes: magic, 64-bit class, current version
+    pHeader->Identity[ID_IDX_MAGIC0] = ELF_MAG0;
+    pHeader->Identity[ID_IDX_MAGIC1] = ELF_MAG1;
+    pHeader->Identity[ID_IDX_MAGIC2] = ELF_MAG2;
+    pHeader->Identity[ID_IDX_MAGIC3] = ELF_MAG3;
+    pHeader->Identity[ID_IDX_CLASS] = EH_CLASS_64;
+    pHeader->Identity[ID_IDX_VERSION] = EH_VERSION_CURRENT;
+
+    // Values supplied when the writer was created
+    pHeader->Type = m_type;
+    pHeader->Machine = m_machine;
+    pHeader->Flags = (unsigned int)m_flags;
+
+    // Section headers follow the ELF header directly; names live in the last section
+    pHeader->ElfHeaderSize = sizeof(SElf64Header);
+    pHeader->SectionHeadersOffset = (unsigned int)(sizeof(SElf64Header));
+    pHeader->SectionHeaderEntrySize = sizeof(SElf64SectionHeader);
+    pHeader->NumSectionHeaderEntries = (Elf64_Short)m_numSections;
+    pHeader->SectionNameTableIndex = m_numSections - 1; // last index
+
+    return true;
+}
+
+} // namespace CLElfLib
--- a/elf/writer.h
+++ b/elf/writer.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#pragma once
+#include "types.h"
+#include <queue>
+#include <string>
+
+#if defined(_WIN32)
+#define ELF_CALL __stdcall
+#else
+#define ELF_CALL
+#endif
+
+using namespace std;
+
+namespace CLElfLib {
+// Sizing and alignment constants for the ELF writer.
+static const unsigned int g_scElfHeaderAlignment = 16; // allocation alignment restriction
+static const unsigned int g_scInitialElfSize = 2048;   // initial elf size (in bytes)
+// NOTE(review): presumably the initial number of section headers reserved -- confirm against its users
+static const unsigned int g_scInitNumSectionHeaders = 8;
+
+// Describes one section passed to CElfWriter::addSection(): its type, flags,
+// name and a pointer/size pair for its payload.
+struct SSectionNode {
+    E_SH_TYPE Type;
+    unsigned int Flags;
+    string Name;
+    char *pData;
+    unsigned int DataSize;
+
+    // A fresh node is a null-type section with no flags, an empty name and
+    // no payload.
+    SSectionNode()
+        : Type(SH_TYPE_NULL),
+          Flags(0),
+          Name(),
+          pData(NULL),
+          DataSize(0) {
+    }
+
+    // Intentionally empty: the memory behind pData is not released here.
+    ~SSectionNode() {
+    }
+};
+
+/******************************************************************************\
+
+ Class:         CElfWriter
+
+ Description:   Class to provide simpler interaction with the ELF standard
+                binary object.  SElf64Header defines the ELF header type and
+                SElf64SectionHeader defines the section header type.
+
+                The constructor and destructor are protected; instances are
+                obtained through create() and released through destroy().
+
+\******************************************************************************/
+class CElfWriter {
+  public:
+    // Factory taking the ELF type, target machine and header flags that
+    // patchElfHeader() later writes into the ELF header.
+    static CElfWriter *ELF_CALL create(
+        E_EH_TYPE type,
+        E_EH_MACHINE machine,
+        Elf64_Xword flags);
+
+    // Counterpart of create(). Takes the pointer by reference.
+    // NOTE(review): presumably resets pElfWriter to null -- confirm in writer.cpp
+    static void ELF_CALL destroy(CElfWriter *&pElfWriter);
+
+    // Adds the section described by pSectionNode; returns true on success.
+    bool ELF_CALL addSection(
+        SSectionNode *pSectionNode);
+
+    // Produces the binary into pBinary and reports its size through dataSize;
+    // returns true on success.
+    // NOTE(review): behaviour when pBinary is null is not visible here -- confirm in writer.cpp
+    bool ELF_CALL resolveBinary(
+        char *const pBinary,
+        size_t &dataSize);
+
+    // Adds the empty section 0; returns the result of addSection().
+    bool ELF_CALL initialize();
+
+    // Writes the ELF header at the start of pBinary; returns false when
+    // pBinary is null.
+    bool ELF_CALL patchElfHeader(char *const pBinary);
+
+  protected:
+    ELF_CALL CElfWriter(
+        E_EH_TYPE type,
+        E_EH_MACHINE machine,
+        Elf64_Xword flags);
+
+    ELF_CALL ~CElfWriter();
+
+    // Values copied into the ELF header (Type, Machine, Flags)
+    E_EH_TYPE m_type = EH_TYPE_NONE;
+    E_EH_MACHINE m_machine = EH_MACHINE_NONE;
+    Elf64_Xword m_flags = 0U;
+
+    // Section nodes held by the writer
+    // NOTE(review): ownership of the queued nodes is not visible here -- confirm in writer.cpp
+    std::queue<SSectionNode *> m_nodeQueue;
+
+    unsigned int m_dataSize = 0U;
+    unsigned int m_numSections = 0U; // written to the header as the section header count
+    size_t m_stringTableSize = 0U;
+    size_t m_totalBinarySize = 0U;
+};
+} // namespace CLElfLib
--- a/manifests/manifest.yml
+++ b/manifests/manifest.yml
@@ -0,0 +1,61 @@
+components:
+  gmmlib:
+    branch: gmmlib
+    clean_on_sync: true
+    dest_dir: gmmlib
+    repository: https://github.com/intel/gmmlib.git
+    revision: 9a261a60bd990b237fe14138b7aaf5eaee342ff8
+    type: git
+  gmock:
+    branch: master
+    clean_on_sync: true
+    dest_dir: gmock
+    repository: https://github.com/google/googlemock.git
+    revision: c440c8fafc6f60301197720617ce64028e09c79d
+    type: git
+  gtest:
+    branch: master
+    clean_on_sync: true
+    dest_dir: gtest
+    repository: https://github.com/google/googletest.git
+    revision: c99458533a9b4c743ed51537e25989ea55944908
+    type: git
+  igc:
+    branch: igc
+    clean_on_sync: true
+    dest_dir: igc
+    repository: https://github.com/intel/intelgraphicscompiler
+    revision: d6379492df107094d0642f0ecf75a6f20ae573b2-2
+    type: git
+  infra:
+    branch: infra
+    clean_on_sync: true
+    dest_dir: infra
+    revision: c81cf66d7995e55cb8f11b24d3776e3fc013a809
+    type: git
+  internal:
+    branch: master
+    dest_dir: internal
+    revision: 93d1c17c98d8c051bcd4368686bc9cf2eddd8f8e
+    type: git
+  khronos:
+    branch: master
+    clean_on_sync: true
+    dest_dir: khronos
+    repository: https://github.com/KhronosGroup/OpenCL-Headers.git
+    revision: f039db6764d52388658ef15c30b2237bbda49803
+    type: git
+  libdrm:
+    branch: libdrm-2.4.84
+    clean_on_sync: true
+    dest_dir: libdrm
+    repository: https://anongit.freedesktop.org/git/mesa/drm.git
+    revision: 290d29d9794813a2fe0578dbb905ad09bc810516
+    type: git
+  wdk:
+    branch: wdk
+    clean_on_sync: true
+    dest_dir: wdk
+    revision: c67a2fa209d3ad3c3ab05f6f10e2234fd81fcebc
+    type: git
+version: '1'
--- a/offline_compiler/CMakeLists.txt
+++ b/offline_compiler/CMakeLists.txt
@@ -0,0 +1,139 @@
+# Copyright (c) 2017, Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+cmake_minimum_required(VERSION 3.2.0 FATAL_ERROR)
+
+project(cloc)
+
+# Platform-independent sources of the offline compiler, plus the runtime
+# helper sources it compiles in directly.
+set(CLOC_SRCS_LIB
+  ${IGDRCL_SOURCE_DIR}/offline_compiler/offline_compiler.cpp
+  ${IGDRCL_SOURCE_DIR}/offline_compiler/offline_compiler.h
+  ${IGDRCL_SOURCE_DIR}/offline_compiler/options.cpp
+  ${IGDRCL_SOURCE_DIR}/offline_compiler/helper.cpp
+  ${IGDRCL_SOURCE_DIR}/runtime/compiler_interface/create_main.cpp
+  ${IGDRCL_SOURCE_DIR}/runtime/helpers/hw_info.cpp
+  ${IGDRCL_SOURCE_DIR}/runtime/helpers/file_io.cpp
+  ${IGDRCL_SOURCE_DIR}/runtime/helpers/abort.cpp
+  ${IGDRCL_SOURCE_DIR}/runtime/helpers/debug_helpers.cpp
+)
+
+# OS-specific library loading and options sources (Windows)
+if (WIN32)
+  list (APPEND CLOC_SRCS_LIB
+    ${IGDRCL_SOURCE_DIR}/runtime/os_interface/windows/os_library.cpp
+    ${IGDRCL_SOURCE_DIR}/runtime/os_interface/windows/options.cpp
+  )
+endif (WIN32)
+
+# OS-specific library loading and options sources (Linux / other UNIX)
+if (UNIX)
+  list (APPEND CLOC_SRCS_LIB
+    ${IGDRCL_SOURCE_DIR}/runtime/os_interface/linux/os_library.cpp
+    ${IGDRCL_SOURCE_DIR}/runtime/os_interface/linux/options.cpp
+  )
+endif (UNIX)
+
+# Extend the include list with the gen-common directory
+list (APPEND HW_SRC_INCLUDES ${IGDRCL_SOURCE_DIR}/runtime/gen_common)
+
+# Compile definitions applied to the cloc target (also exported to the parent
+# scope at the end of this file)
+set(CLOC_LIB_FLAGS_DEFINITIONS
+  -DCIF_HEADERS_ONLY_BUILD
+)
+
+# Per-gen files that are added only when they exist on disk
+set(OPTIONAL_RUNTIME_GENX_FILES
+  hw_info.cpp
+)
+
+# For every gen from 0 to MAX_GEN that has supported platforms, add the
+# optional per-gen files (when present) and the hw_info_/enable_ source pair
+# of each supported platform.
+# NOTE(review): GEN_CONTAINS_PLATFORMS and GET_PLATFORMS_FOR_GEN are defined
+# outside this file; their exact semantics are assumed from their use here.
+foreach(GEN_NUM RANGE 0 ${MAX_GEN} 1)
+  GEN_CONTAINS_PLATFORMS("SUPPORTED" ${GEN_NUM} GENX_HAS_PLATFORMS)
+  if(${GENX_HAS_PLATFORMS})
+    # Optional files: skipped silently when the gen directory lacks them
+    foreach(SRC_IT ${OPTIONAL_RUNTIME_GENX_FILES})
+      set(SRC_FILE ${IGDRCL_SOURCE_DIR}/runtime/gen${GEN_NUM}/${SRC_IT})
+      if(EXISTS ${SRC_FILE})
+        list(APPEND CLOC_SRCS_LIB ${SRC_FILE})
+      endif()
+    endforeach()
+
+    # Platform files: added unconditionally, named after the lower-cased platform
+    GET_PLATFORMS_FOR_GEN("SUPPORTED" ${GEN_NUM} SUPPORTED_GENX_PLATFORMS)
+    foreach(PLATFORM_IT ${SUPPORTED_GENX_PLATFORMS})
+      string(TOLOWER ${PLATFORM_IT} PLATFORM_IT_LOWER)
+      list (APPEND CLOC_SRCS_LIB
+        ${IGDRCL_SOURCE_DIR}/runtime/gen${GEN_NUM}/hw_info_${PLATFORM_IT_LOWER}.cpp
+        ${IGDRCL_SOURCE_DIR}/runtime/gen${GEN_NUM}/enable_${PLATFORM_IT_LOWER}.cpp
+    )
+    endforeach(PLATFORM_IT)
+  endif(${GENX_HAS_PLATFORMS})
+endforeach(GEN_NUM)
+
+# Executable sources: the shared list plus main.cpp. This CMakeLists.txt is
+# listed as well so it is part of the target's source list.
+set(CLOC_SRCS
+  ${CLOC_SRCS_LIB}
+  main.cpp
+  ${IGDRCL_SOURCE_DIR}/offline_compiler/CMakeLists.txt
+)
+
+add_executable(cloc ${CLOC_SRCS})
+
+# Optional include directories, added only when the variables are set
+if(IGC_OCL_ADAPTOR_DIR) # IGC/AdaptorOCL
+  target_include_directories(cloc PUBLIC "${IGC_OCL_ADAPTOR_DIR}")
+endif(IGC_OCL_ADAPTOR_DIR)
+
+if(CIF_BASE_DIR)
+  target_include_directories(cloc PUBLIC "${CIF_BASE_DIR}")
+endif(CIF_BASE_DIR)
+
+# Export the include list to the parent scope. PARENT_SCOPE does not set the
+# variable in this scope, so the same list is repeated for the target below.
+SET(CLOC_INCLUDES
+  "${HW_SRC_INCLUDES}"
+  "${UMKM_SHAREDDATA_INCLUDE_PATHS}"
+  "${KHRONOS_HEADERS_DIR}"
+  "${IGDRCL__IGC_INCLUDE_DIR}"
+  "${THIRD_PARTY_DIR}"
+  PARENT_SCOPE
+)
+
+target_include_directories(cloc BEFORE PRIVATE
+  "${HW_SRC_INCLUDES}"
+  "${UMKM_SHAREDDATA_INCLUDE_PATHS}"
+  "${KHRONOS_HEADERS_DIR}"
+  "${IGDRCL__IGC_INCLUDE_DIR}"
+  "${THIRD_PARTY_DIR}"
+)
+# NOTE(review): SUPPORTED_GEN_FLAGS_DEFINITONS is spelled without the second
+# "I"; the name must match the spelling used where the variable is defined.
+target_compile_definitions(cloc PUBLIC ${CLOC_LIB_FLAGS_DEFINITIONS} ${SUPPORTED_GEN_FLAGS_DEFINITONS} DEFAULT_PLATFORM=${DEFAULT_SUPPORTED_PLATFORM})
+
+if(UNIX)
+  target_link_libraries(cloc dl pthread)
+endif(UNIX)
+
+target_link_libraries(cloc elflib)
+
+source_group("source files" FILES ${CLOC_SRCS})
+set_target_properties(cloc PROPERTIES FOLDER "offline_compiler")
+# NOTE(review): APPEND_STRING appends the values as a string with no separator
+# inserted -- confirm the sanitizer flag variables carry their own spacing.
+set_property(TARGET cloc APPEND_STRING PROPERTY COMPILE_FLAGS ${ASAN_FLAGS} ${TSAN_FLAGS})
+
+# Helper target that copies the built file of every target listed in
+# IGDRCL__IGC_TARGETS next to the cloc executable.
+add_custom_target(copy_compiler_files DEPENDS ${IGDRCL__IGC_TARGETS})
+set_target_properties(copy_compiler_files PROPERTIES FOLDER "opencl runtime")
+
+foreach(TARGET_tmp ${IGDRCL__IGC_TARGETS})
+  # copy_if_different leaves the destination untouched when it is already
+  # identical to the source file
+  add_custom_command(
+    TARGET copy_compiler_files
+    PRE_BUILD
+    COMMAND echo copying $<TARGET_FILE:${TARGET_tmp}> to "$<TARGET_FILE_DIR:cloc>"
+    COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:${TARGET_tmp}> $<TARGET_FILE_DIR:cloc>
+)
+endforeach(TARGET_tmp)
+
+# Make the source list and the compile definitions visible to the parent directory
+SET(CLOC_SRCS_LIB ${CLOC_SRCS_LIB} PARENT_SCOPE)
+SET(CLOC_LIB_FLAGS_DEFINITIONS ${CLOC_LIB_FLAGS_DEFINITIONS} PARENT_SCOPE)
--- a/offline_compiler/create_command_stream.cpp
+++ b/offline_compiler/create_command_stream.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "runtime/command_stream/command_stream_receiver.h"
+#include "hw_info.h"
+#include "runtime/helpers/debug_helpers.h"
+
+namespace OCLRT {
+extern CommandStreamReceiverCreateFunc commandStreamReceiverFactory[2 * IGFX_MAX_CORE];
+
+// Picks the creation function stored at index
+// IGFX_MAX_CORE + <render core family of the platform> in
+// commandStreamReceiverFactory and calls it with the hardware info.
+// Returns nullptr when that table entry is empty.
+CommandStreamReceiver *createCommandStream(const HardwareInfo *pHwInfo) {
+    DEBUG_BREAK_IF(nullptr == pHwInfo->pPlatform);
+
+    auto createFunc = commandStreamReceiverFactory[IGFX_MAX_CORE + pHwInfo->pPlatform->eRenderCoreFamily];
+    if (!createFunc) {
+        return nullptr;
+    }
+
+    return createFunc(*pHwInfo);
+}
+
+// Device query that never reports any device: clears the caller's hardware
+// info pointer, sets the device count to zero and returns true.
+bool getDevices(HardwareInfo **hwInfo, size_t &numDevicesReturned) {
+    *hwInfo = nullptr;
+    numDevicesReturned = 0;
+    return true;
+}
+}
--- a/offline_compiler/helper.cpp
+++ b/offline_compiler/helper.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "runtime/helpers/hw_info.h"
+#include "runtime/os_interface/debug_settings_manager.h"
+
+namespace OCLRT {
+
+// Constructor definition with an empty body: no work is done on construction.
+template <DebugFunctionalityLevel DebugLevel>
+DebugSettingsManager<DebugLevel>::DebugSettingsManager() {
+}
+
+// Destructor definition with an empty body: no work is done on destruction.
+template <DebugFunctionalityLevel DebugLevel>
+DebugSettingsManager<DebugLevel>::~DebugSettingsManager() {
+}
+
+// Writes `length` bytes starting at `str` to the file `filename`, opened with
+// the given mode. Nothing is written when the file cannot be opened.
+template <DebugFunctionalityLevel DebugLevel>
+void DebugSettingsManager<DebugLevel>::writeToFile(std::string filename, const char *str, size_t length, std::ios_base::openmode mode) {
+    std::ofstream stream(filename, mode);
+    if (!stream.is_open()) {
+        return;
+    }
+
+    stream.write(str, length);
+    stream.close();
+}
+
+// Global Debug Settings Manager
+DebugSettingsManager<globalDebugFunctionalityLevel> DebugManager;
+
+// Global table of hardware prefixes, one slot per product id.
+// Only the first entry is spelled out; the remaining entries are
+// value-initialized to nullptr.
+const char *hardwarePrefix[IGFX_MAX_PRODUCT] = {
+    nullptr,
+};
+
+// Global table of family names, one slot per core family (all nullptr initially)
+const char *familyName[IGFX_MAX_CORE] = {
+    nullptr,
+};
+
+// Global table of per-family "enabled" flags, one slot per core family
+// (all false initially)
+bool familyEnabled[IGFX_MAX_CORE] = {
+    false,
+};
+
+} // namespace OCLRT
--- a/offline_compiler/main.cpp
+++ b/offline_compiler/main.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "config.h"
+
+#include "offline_compiler/offline_compiler.h"
+#include "runtime/os_interface/os_library.h"
+
+#include <CL/cl.h>
+
+using namespace OCLRT;
+
+// Entry point of the offline compiler: creates an OfflineCompiler from the
+// command line, runs the build, prints the build log (when not empty) and the
+// outcome, and returns the resulting status code.
+int main(int numArgs, const char *argv[]) {
+    int status = CL_SUCCESS;
+    OfflineCompiler *compiler = OfflineCompiler::create(numArgs, argv, status);
+
+    if (status != CL_SUCCESS) {
+        // creation failed, there is nothing to build
+        delete compiler;
+        return status;
+    }
+
+    status = compiler->build();
+
+    std::string log = compiler->getBuildLog();
+    if (!log.empty()) {
+        printf("%s\n", log.c_str());
+    }
+
+    if (status != CL_SUCCESS) {
+        printf("Build failed with error code: %d\n", status);
+    } else if (!compiler->isQuiet()) {
+        printf("Build succeeded.\n");
+    }
+
+    delete compiler;
+    return status;
+}
--- a/offline_compiler/offline_compiler.cpp
+++ b/offline_compiler/offline_compiler.cpp
@@ -0,0 +1,781 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "cif/common/cif_main.h"
+#include "cif/helpers/error.h"
+#include "cif/import/library_api.h"
+#include "ocl_igc_interface/code_type.h"
+#include "ocl_igc_interface/fcl_ocl_device_ctx.h"
+#include "ocl_igc_interface/igc_ocl_device_ctx.h"
+#include "ocl_igc_interface/platform_helper.h"
+#include "offline_compiler.h"
+#include "igfxfmid.h"
+#include "runtime/helpers/file_io.h"
+#include "runtime/os_interface/debug_settings_manager.h"
+#include "runtime/os_interface/os_inc.h"
+#include "runtime/os_interface/os_library.h"
+#include "runtime/helpers/string.h"
+#include "runtime/helpers/debug_helpers.h"
+#include "runtime/helpers/hw_info.h"
+#include "runtime/helpers/validators.h"
+#include "elf/writer.h"
+#include <iomanip>
+#include <list>
+#include <algorithm>
+#include <iostream>
+
+#ifdef _WIN32
+#include <direct.h>
+#define MakeDirectory _mkdir
+#define GetCurrentWorkingDirectory _getcwd
+#else
+#include <sys/stat.h>
+#define MakeDirectory(dir) mkdir(dir, 0777)
+#define GetCurrentWorkingDirectory getcwd
+#endif
+
+namespace OCLRT {
+
+CIF::CIFMain *createMainNoSanitize(CIF::CreateCIFMainFunc_t createFunc);
+
+////////////////////////////////////////////////////////////////////////////////
+// stringsAreEqual
+//
+// Returns true only when both pointers are non-null and point to identical
+// NUL-terminated strings. A null pointer on either side compares unequal, so
+// strcmp is never called with a null argument.
+////////////////////////////////////////////////////////////////////////////////
+bool stringsAreEqual(const char *string1, const char *string2) {
+    if (string1 == nullptr || string2 == nullptr) {
+        return false;
+    }
+    return (strcmp(string1, string2) == 0);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// convertToPascalCase
+//
+// Builds a copy of inString in which every '_' is dropped and the first
+// alphabetic character at the start of the string, or following a '_', is
+// upper-cased. All other characters are copied unchanged.
+////////////////////////////////////////////////////////////////////////////////
+std::string convertToPascalCase(const std::string &inString) {
+    std::string outString;
+    outString.reserve(inString.length());
+    bool capitalize = true;
+
+    for (const char current : inString) {
+        // isalpha/toupper are only defined for values representable as
+        // unsigned char (or EOF); a negative plain char must be converted first
+        const unsigned char asUnsigned = static_cast<unsigned char>(current);
+
+        if (capitalize && isalpha(asUnsigned)) {
+            outString += static_cast<char>(toupper(asUnsigned));
+            capitalize = false;
+        } else if (current == '_') {
+            capitalize = true;
+        } else {
+            outString += current;
+        }
+    }
+    return outString;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// ctor
+//
+// Defaulted: no work is done in the constructor body.
+////////////////////////////////////////////////////////////////////////////////
+OfflineCompiler::OfflineCompiler() = default;
+
+////////////////////////////////////////////////////////////////////////////////
+// dtor
+//
+// Releases the three array buffers held by the compiler (LLVM, gen and ELF
+// binaries). delete[] on a null pointer is a no-op, so buffers that were
+// never produced are handled as well.
+////////////////////////////////////////////////////////////////////////////////
+OfflineCompiler::~OfflineCompiler() {
+    delete[] llvmBinary;
+    delete[] genBinary;
+    delete[] elfBinary;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// create
+//
+// Allocates an OfflineCompiler and initializes it from the command line.
+// retVal receives the initialization status. On success the new instance is
+// returned; on failure the instance is deleted and nullptr is returned.
+////////////////////////////////////////////////////////////////////////////////
+OfflineCompiler *OfflineCompiler::create(uint32_t numArgs, const char **argv, int &retVal) {
+    retVal = CL_SUCCESS;
+
+    OfflineCompiler *compiler = new OfflineCompiler();
+    if (compiler != nullptr) {
+        retVal = compiler->initialize(numArgs, argv);
+    }
+
+    if (retVal == CL_SUCCESS) {
+        return compiler;
+    }
+
+    // initialization failed: release the instance and hand back nothing
+    delete compiler;
+    return nullptr;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// buildSourceCode
+//
+// Translates sourceCode in two stages: the FCL context turns OpenCL C into the
+// intermediate representation (LLVM text or bitcode, depending on
+// useLlvmText), then the IGC context turns that into the gen binary. Both
+// outputs are stored and both build logs are appended to the build log.
+//
+// Returns CL_SUCCESS, CL_INVALID_PROGRAM (empty source),
+// CL_OUT_OF_HOST_MEMORY (a buffer, context or translation output could not be
+// created) or CL_BUILD_PROGRAM_FAILURE (either stage reported failure).
+////////////////////////////////////////////////////////////////////////////////
+int OfflineCompiler::buildSourceCode() {
+    if (strcmp(sourceCode.c_str(), "") == 0) {
+        return CL_INVALID_PROGRAM;
+    }
+    UNRECOVERABLE_IF(fclDeviceCtx == nullptr);
+    UNRECOVERABLE_IF(igcDeviceCtx == nullptr);
+
+    IGC::CodeType::CodeType_t intermediateRepresentation = useLlvmText ? IGC::CodeType::llvmLl : IGC::CodeType::llvmBc;
+
+    // Inputs of the front-end translation
+    auto fclSrc = CIF::Builtins::CreateConstBuffer(fclMain.get(), sourceCode.c_str(), sourceCode.size());
+    auto fclOptions = CIF::Builtins::CreateConstBuffer(fclMain.get(), options.c_str(), options.size());
+    auto fclInternalOptions = CIF::Builtins::CreateConstBuffer(fclMain.get(), internalOptions.c_str(), internalOptions.size());
+
+    // One translation context per stage
+    auto fclTranslationCtx = fclDeviceCtx->CreateTranslationCtx(IGC::CodeType::oclC, intermediateRepresentation);
+    auto igcTranslationCtx = igcDeviceCtx->CreateTranslationCtx(intermediateRepresentation, IGC::CodeType::oclGenBin);
+
+    const bool everythingCreated = OCLRT::areNotNullptr(fclSrc.get(), fclOptions.get(), fclInternalOptions.get(),
+                                                        fclTranslationCtx.get(), igcTranslationCtx.get());
+    if (!everythingCreated) {
+        return CL_OUT_OF_HOST_MEMORY;
+    }
+
+    // Stage 1: source -> intermediate representation
+    auto fclOutput = fclTranslationCtx->Translate(fclSrc.get(), fclOptions.get(),
+                                                  fclInternalOptions.get(), nullptr, 0);
+    if (fclOutput == nullptr) {
+        return CL_OUT_OF_HOST_MEMORY;
+    }
+
+    UNRECOVERABLE_IF(fclOutput->GetBuildLog() == nullptr);
+    UNRECOVERABLE_IF(fclOutput->GetOutput() == nullptr);
+
+    if (fclOutput->Successful() == false) {
+        // keep the front-end diagnostics, nothing is stored
+        updateBuildLog(fclOutput->GetBuildLog()->GetMemory<char>(), fclOutput->GetBuildLog()->GetSizeRaw());
+        return CL_BUILD_PROGRAM_FAILURE;
+    }
+
+    storeBinary(llvmBinary, llvmBinarySize, fclOutput->GetOutput()->GetMemory<char>(), fclOutput->GetOutput()->GetSizeRaw());
+    updateBuildLog(fclOutput->GetBuildLog()->GetMemory<char>(), fclOutput->GetBuildLog()->GetSizeRaw());
+
+    // Stage 2: intermediate representation -> gen binary
+    auto igcOutput = igcTranslationCtx->Translate(fclOutput->GetOutput(), fclOptions.get(),
+                                                  fclInternalOptions.get(),
+                                                  nullptr, 0);
+    if (igcOutput == nullptr) {
+        return CL_OUT_OF_HOST_MEMORY;
+    }
+
+    UNRECOVERABLE_IF(igcOutput->GetBuildLog() == nullptr);
+    UNRECOVERABLE_IF(igcOutput->GetOutput() == nullptr);
+
+    // The output and the log are recorded regardless of the stage's verdict
+    storeBinary(genBinary, genBinarySize, igcOutput->GetOutput()->GetMemory<char>(), igcOutput->GetOutput()->GetSizeRaw());
+    updateBuildLog(igcOutput->GetBuildLog()->GetMemory<char>(), igcOutput->GetBuildLog()->GetSizeRaw());
+
+    return igcOutput->Successful() ? CL_SUCCESS : CL_BUILD_PROGRAM_FAILURE;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// build
+//
+// Compiles the source and, only when compilation succeeded, generates the ELF
+// binary and writes out all files. Returns the status of buildSourceCode().
+////////////////////////////////////////////////////////////////////////////////
+int OfflineCompiler::build() {
+    const int status = buildSourceCode();
+    if (status != CL_SUCCESS) {
+        return status;
+    }
+
+    generateElfBinary();
+    writeOutAllFiles();
+
+    return status;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// updateBuildLog
+//
+// Appends errorStringSize bytes starting at pErrorString to the build log,
+// separated from earlier content by a newline. Nothing is appended when the
+// pointer is null, the size is zero or the text starts with a NUL character.
+////////////////////////////////////////////////////////////////////////////////
+void OfflineCompiler::updateBuildLog(const char *pErrorString, const size_t errorStringSize) {
+    if (pErrorString == nullptr || errorStringSize == 0) {
+        return;
+    }
+    if (pErrorString[0] == '\0') {
+        return;
+    }
+
+    const std::string message(pErrorString, errorStringSize);
+    if (!buildLog.empty()) {
+        buildLog.append("\n");
+    }
+    buildLog.append(message);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// getBuildLog
+//
+// Returns a mutable reference to the accumulated build log. The reference
+// refers to a member of this object.
+////////////////////////////////////////////////////////////////////////////////
+std::string &OfflineCompiler::getBuildLog() {
+    return buildLog;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// getHardwareInfo
+//
+// Searches the product tables for the first product whose hardware prefix
+// equals pDeviceName and which has a hardware info entry. On a hit hwInfo is
+// set to that entry and CL_SUCCESS is returned; otherwise CL_INVALID_DEVICE.
+////////////////////////////////////////////////////////////////////////////////
+int OfflineCompiler::getHardwareInfo(const char *pDeviceName) {
+    for (unsigned int id = 0; id < IGFX_MAX_PRODUCT; ++id) {
+        const bool nameMatches = stringsAreEqual(pDeviceName, hardwarePrefix[id]);
+        if (nameMatches && hardwareInfoTable[id]) {
+            hwInfo = hardwareInfoTable[id];
+            return CL_SUCCESS;
+        }
+    }
+
+    return CL_INVALID_DEVICE;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// getStringWithinDelimiters
+//
+// Returns the text enclosed by the raw-string markers R"===( and )===" in src.
+//
+// The closing marker is searched for after the opening one, so a stray
+// closing marker earlier in the text cannot produce a bogus length.
+// When the opening marker is absent src is returned unchanged; when only the
+// closing marker is absent everything after the opening marker is returned.
+// Both situations still trigger DEBUG_BREAK_IF.
+////////////////////////////////////////////////////////////////////////////////
+std::string OfflineCompiler::getStringWithinDelimiters(const std::string &src) {
+    static const char openingDelimiter[] = "R\"===(";
+    static const char closingDelimiter[] = ")===\"";
+
+    const size_t opening = src.find(openingDelimiter);
+    DEBUG_BREAK_IF(std::string::npos == opening);
+    if (std::string::npos == opening) {
+        // nothing to strip; avoids wrapping npos into a bogus start offset
+        return src;
+    }
+
+    const size_t start = opening + strlen(openingDelimiter);
+    const size_t stop = src.find(closingDelimiter, start);
+    DEBUG_BREAK_IF(std::string::npos == stop);
+
+    // a count of npos makes the substring constructor copy up to the end of src
+    const size_t size = (std::string::npos == stop) ? std::string::npos : (stop - start);
+
+    std::string dst(src, start, size);
+    return dst;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// initialize
+//
+// Prepares the compiler for a build:
+//   1. parses the command line and the debug settings,
+//   2. when no options were given, tries to read them from
+//      "<input>_options.txt" (a leading /* ... */ comment is stripped),
+//   3. loads the source file (files wrapped in R"===( ... )===" are accepted),
+//   4. loads the FCL and IGC libraries, creates their device contexts and
+//      populates the IGC context with platform, system info and feature data.
+//
+// Returns CL_SUCCESS on success, the error reported by parseCommandLine,
+// INVALID_FILE when the source file is empty or unreadable, or
+// CL_OUT_OF_HOST_MEMORY when a library, entry point or interface is missing.
+////////////////////////////////////////////////////////////////////////////////
+int OfflineCompiler::initialize(uint32_t numArgs, const char **argv) {
+    int retVal = CL_SUCCESS;
+    const char *pSource = nullptr;
+    void *pSourceFromFile = nullptr;
+    size_t sourceFromFileSize = 0;
+
+    retVal = parseCommandLine(numArgs, argv);
+    if (retVal != CL_SUCCESS) {
+        return retVal;
+    }
+
+    parseDebugSettings();
+
+    if (options.empty()) {
+        // try to read options from file if not provided by commandline
+        std::string optionsFileName = inputFile;
+        size_t ext_start = optionsFileName.find(".cl");
+        if (ext_start != std::string::npos) {
+            optionsFileName.replace(ext_start, strlen(".cl"), "_options.txt");
+            void *pOptions = nullptr;
+            size_t optionsSize = loadDataFromFile(optionsFileName.c_str(), pOptions);
+            if (optionsSize > 0) {
+                options = (char *)pOptions;
+                // Remove comment containing copyright header.
+                // The markers are matched as whole two-character sequences, so
+                // a lone '/' or '*' inside the options (e.g. an include path)
+                // is never mistaken for a comment boundary.
+                const size_t commentBegin = options.find("/*");
+                const size_t commentEnd = options.rfind("*/");
+                if (commentBegin != std::string::npos && commentEnd != std::string::npos &&
+                    commentEnd >= commentBegin + 2) {
+                    // erase from the opening marker through the closing marker
+                    options.erase(commentBegin, commentEnd - commentBegin + 2);
+                    size_t optionsBegin = options.find_first_not_of(" \t\n\r");
+                    if (optionsBegin != std::string::npos) {
+                        options = options.substr(optionsBegin, options.length());
+                    }
+                }
+                // npos + 1 wraps to 0, so an all-whitespace string becomes empty
+                auto trimPos = options.find_last_not_of(" \n\r");
+                options = options.substr(0, trimPos + 1);
+                if (!isQuiet())
+                    printf("Building with options:\n%s\n", options.c_str());
+            }
+            deleteDataReadFromFile(pOptions);
+        }
+    }
+
+    // set up the device inside the program
+    sourceFromFileSize = loadDataFromFile(inputFile.c_str(), pSourceFromFile);
+    struct Helper {
+        static void deleter(void *ptr) { deleteDataReadFromFile(ptr); }
+    };
+    // releases the file buffer on every return path below
+    auto sourceRaii = std::unique_ptr<void, decltype(&Helper::deleter)>{pSourceFromFile, Helper::deleter};
+    if (sourceFromFileSize == 0) {
+        retVal = INVALID_FILE;
+        return retVal;
+    }
+
+    // we also accept files used as runtime builtins
+    pSource = strstr((const char *)pSourceFromFile, "R\"===(");
+    sourceCode = (pSource != nullptr) ? getStringWithinDelimiters((char *)pSourceFromFile) : (char *)pSourceFromFile;
+
+    // front-end compiler (FCL): library -> CIF main -> device context
+    this->fclLib.reset(OsLibrary::load(Os::frontEndDllName));
+    if (this->fclLib == nullptr) {
+        return CL_OUT_OF_HOST_MEMORY;
+    }
+
+    auto fclCreateMain = reinterpret_cast<CIF::CreateCIFMainFunc_t>(this->fclLib->getProcAddress(CIF::CreateCIFMainFuncName));
+    if (fclCreateMain == nullptr) {
+        return CL_OUT_OF_HOST_MEMORY;
+    }
+
+    this->fclMain = CIF::RAII::UPtr(createMainNoSanitize(fclCreateMain));
+    if (this->fclMain == nullptr) {
+        return CL_OUT_OF_HOST_MEMORY;
+    }
+
+    if (false == this->fclMain->IsCompatible<IGC::FclOclDeviceCtx>()) {
+        // given FCL is not compatible
+        DEBUG_BREAK_IF(true);
+        return CL_OUT_OF_HOST_MEMORY;
+    }
+
+    this->fclDeviceCtx = this->fclMain->CreateInterface<IGC::FclOclDeviceCtxTagOCL>();
+    if (this->fclDeviceCtx == nullptr) {
+        return CL_OUT_OF_HOST_MEMORY;
+    }
+
+    fclDeviceCtx->SetOclApiVersion(hwInfo->capabilityTable.clVersionSupport * 10);
+
+    // back-end compiler (IGC): library -> CIF main -> device context
+    this->igcLib.reset(OsLibrary::load(Os::igcDllName));
+    if (this->igcLib == nullptr) {
+        return CL_OUT_OF_HOST_MEMORY;
+    }
+
+    auto igcCreateMain = reinterpret_cast<CIF::CreateCIFMainFunc_t>(this->igcLib->getProcAddress(CIF::CreateCIFMainFuncName));
+    if (igcCreateMain == nullptr) {
+        return CL_OUT_OF_HOST_MEMORY;
+    }
+
+    this->igcMain = CIF::RAII::UPtr(createMainNoSanitize(igcCreateMain));
+    if (this->igcMain == nullptr) {
+        return CL_OUT_OF_HOST_MEMORY;
+    }
+
+    if (false == this->igcMain->IsCompatible<IGC::IgcOclDeviceCtx>()) {
+        // given IGC is not compatible
+        DEBUG_BREAK_IF(true);
+        return CL_OUT_OF_HOST_MEMORY;
+    }
+
+    this->igcDeviceCtx = this->igcMain->CreateInterface<IGC::IgcOclDeviceCtxTagOCL>();
+    if (this->igcDeviceCtx == nullptr) {
+        return CL_OUT_OF_HOST_MEMORY;
+    }
+    this->igcDeviceCtx->SetProfilingTimerResolution(static_cast<float>(hwInfo->capabilityTable.defaultProfilingTimerResolution));
+    auto igcPlatform = this->igcDeviceCtx->GetPlatformHandle();
+    auto igcGtSystemInfo = this->igcDeviceCtx->GetGTSystemInfoHandle();
+    auto igcFeWa = this->igcDeviceCtx->GetIgcFeaturesAndWorkaroundsHandle();
+    if ((igcPlatform == nullptr) || (igcGtSystemInfo == nullptr) || (igcFeWa == nullptr)) {
+        return CL_OUT_OF_HOST_MEMORY;
+    }
+    IGC::PlatformHelper::PopulateInterfaceWith(*igcPlatform.get(), *hwInfo->pPlatform);
+    IGC::GtSysInfoHelper::PopulateInterfaceWith(*igcGtSystemInfo.get(), *hwInfo->pSysInfo);
+    // populate with features
+    igcFeWa.get()->SetFtrDesktop(hwInfo->pSkuTable->ftrDesktop);
+    igcFeWa.get()->SetFtrChannelSwizzlingXOREnabled(hwInfo->pSkuTable->ftrChannelSwizzlingXOREnabled);
+
+    igcFeWa.get()->SetFtrGtBigDie(hwInfo->pSkuTable->ftrGtBigDie);
+    igcFeWa.get()->SetFtrGtMediumDie(hwInfo->pSkuTable->ftrGtMediumDie);
+    igcFeWa.get()->SetFtrGtSmallDie(hwInfo->pSkuTable->ftrGtSmallDie);
+
+    igcFeWa.get()->SetFtrGT1(hwInfo->pSkuTable->ftrGT1);
+    igcFeWa.get()->SetFtrGT1_5(hwInfo->pSkuTable->ftrGT1_5);
+    igcFeWa.get()->SetFtrGT2(hwInfo->pSkuTable->ftrGT2);
+    igcFeWa.get()->SetFtrGT3(hwInfo->pSkuTable->ftrGT3);
+    igcFeWa.get()->SetFtrGT4(hwInfo->pSkuTable->ftrGT4);
+
+    igcFeWa.get()->SetFtrIVBM0M1Platform(hwInfo->pSkuTable->ftrIVBM0M1Platform);
+    // NOTE(review): GTL/GTM/GTH are fed from ftrGT1/ftrGT2/ftrGT3 -- kept as
+    // in the original; confirm this mapping is intended
+    igcFeWa.get()->SetFtrGTL(hwInfo->pSkuTable->ftrGT1);
+    igcFeWa.get()->SetFtrGTM(hwInfo->pSkuTable->ftrGT2);
+    igcFeWa.get()->SetFtrGTH(hwInfo->pSkuTable->ftrGT3);
+
+    igcFeWa.get()->SetFtrSGTPVSKUStrapPresent(hwInfo->pSkuTable->ftrSGTPVSKUStrapPresent);
+    igcFeWa.get()->SetFtrGTA(hwInfo->pSkuTable->ftrGTA);
+    igcFeWa.get()->SetFtrGTC(hwInfo->pSkuTable->ftrGTC);
+    igcFeWa.get()->SetFtrGTX(hwInfo->pSkuTable->ftrGTX);
+    igcFeWa.get()->SetFtr5Slice(hwInfo->pSkuTable->ftr5Slice);
+
+    igcFeWa.get()->SetFtrGpGpuMidThreadLevelPreempt(hwInfo->pSkuTable->ftrGpGpuMidThreadLevelPreempt);
+    igcFeWa.get()->SetFtrIoMmuPageFaulting(hwInfo->pSkuTable->ftrIoMmuPageFaulting);
+    igcFeWa.get()->SetFtrWddm2Svm(hwInfo->pSkuTable->ftrWddm2Svm);
+    igcFeWa.get()->SetFtrPooledEuEnabled(hwInfo->pSkuTable->ftrPooledEuEnabled);
+
+    igcFeWa.get()->SetFtrResourceStreamer(hwInfo->pSkuTable->ftrResourceStreamer);
+
+    return retVal;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// ParseCommandLine
+//
+// Reads the options in argv[1 .. numArgs - 1] into the corresponding members
+// and then validates the resulting configuration.
+//
+// Returns:
+//   PRINT_USAGE          - no arguments were given or "-?" was passed
+//   INVALID_COMMAND_LINE - unknown option, both -32 and -64 requested, or
+//                          the input file / device name is missing
+//   INVALID_FILE         - the input file does not exist
+//   otherwise the result of getHardwareInfo() for the requested device
+////////////////////////////////////////////////////////////////////////////////
+int OfflineCompiler::parseCommandLine(uint32_t numArgs, const char **argv) {
+    int retVal = CL_SUCCESS;
+    bool want32Bit = false;
+    bool want64Bit = false;
+
+    if (numArgs < 2) {
+        printUsage();
+        retVal = PRINT_USAGE;
+    }
+
+    for (uint32_t argIndex = 1; argIndex < numArgs; argIndex++) {
+        const char *currentArg = argv[argIndex];
+        // Options taking a value are only matched when another argument follows;
+        // otherwise they fall through to the "Invalid option" branch.
+        const bool valueFollows = (argIndex + 1 < numArgs);
+
+        if (stringsAreEqual(currentArg, "-file") && valueFollows) {
+            inputFile = argv[++argIndex];
+        } else if (stringsAreEqual(currentArg, "-32")) {
+            want32Bit = true;
+            internalOptions.append(" -m32 ");
+        } else if (stringsAreEqual(currentArg, "-64")) {
+            want64Bit = true;
+            internalOptions.append(" -m64 ");
+        } else if (stringsAreEqual(currentArg, "-cl-intel-greater-than-4GB-buffer-required")) {
+            internalOptions.append(" -cl-intel-greater-than-4GB-buffer-required ");
+        } else if (stringsAreEqual(currentArg, "-device") && valueFollows) {
+            deviceName = argv[++argIndex];
+        } else if (stringsAreEqual(currentArg, "-llvm_text")) {
+            useLlvmText = true;
+        } else if (stringsAreEqual(currentArg, "-cpp_file")) {
+            useCppFile = true;
+        } else if (stringsAreEqual(currentArg, "-options") && valueFollows) {
+            options = argv[++argIndex];
+        } else if (stringsAreEqual(currentArg, "-options_name")) {
+            useOptionsSuffix = true;
+        } else if (stringsAreEqual(currentArg, "-out_dir") && valueFollows) {
+            outputDirectory = argv[++argIndex];
+        } else if (stringsAreEqual(currentArg, "-q")) {
+            quiet = true;
+        } else if (stringsAreEqual(currentArg, "-?")) {
+            // Keep parsing: a later invalid option still overrides this result.
+            printUsage();
+            retVal = PRINT_USAGE;
+        } else {
+            printf("Invalid option (arg %d): %s\n", argIndex, currentArg);
+            retVal = INVALID_COMMAND_LINE;
+            break;
+        }
+    }
+
+    // Validation only runs when parsing itself raised no error / usage request.
+    if (retVal != CL_SUCCESS) {
+        return retVal;
+    }
+
+    if (want32Bit && want64Bit) {
+        printf("Error: Cannot compile for 32-bit and 64-bit, please choose one.\n");
+        return INVALID_COMMAND_LINE;
+    }
+    if (inputFile.empty()) {
+        printf("Error: Input file name missing.\n");
+        return INVALID_COMMAND_LINE;
+    }
+    if (deviceName.empty()) {
+        printf("Error: Device name missing.\n");
+        return INVALID_COMMAND_LINE;
+    }
+    if (!fileExists(inputFile)) {
+        printf("Error: Input file %s missing.\n", inputFile.c_str());
+        return INVALID_FILE;
+    }
+
+    retVal = getHardwareInfo(deviceName.c_str());
+    if (retVal != CL_SUCCESS) {
+        printf("Error: Cannot get HW Info for device %s.\n", deviceName.c_str());
+    }
+    return retVal;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// ParseDebugSettings
+//
+// Translates debug-manager flags into internal compiler options.
+////////////////////////////////////////////////////////////////////////////////
+void OfflineCompiler::parseDebugSettings() {
+    // Nothing to add unless the stateless-to-stateful buffer offset
+    // optimization flag is set.
+    if (!DebugManager.flags.EnableStatelessToStatefulBufferOffsetOpt.get()) {
+        return;
+    }
+    internalOptions.append("-cl-intel-has-buffer-offset-arg ");
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// ParseBinAsCharArray
+//
+// Renders `size` bytes of `binary` as C++ source text that defines
+//   size_t   <Builtin>BinarySize_<deviceName>
+//   uint32_t <Builtin>Binary_<deviceName>[(size + 3) / 4]
+// and registers the array as an embedded resource, where <Builtin> is
+// convertToPascalCase(fileName).
+//
+// Returns the generated source text.
+////////////////////////////////////////////////////////////////////////////////
+std::string OfflineCompiler::parseBinAsCharArray(uint8_t *binary, size_t size, std::string &deviceName, std::string &fileName) {
+    std::string builtinName = convertToPascalCase(fileName);
+    std::ostringstream out;
+    const size_t wordCount = (size + 3) / 4;
+
+    // Convert binary to cpp
+    out << "#include <cstddef>\n";
+    out << "#include <cstdint>\n\n";
+    out << "size_t " << builtinName << "BinarySize_" << deviceName << " = " << size << ";\n";
+    out << "uint32_t " << builtinName << "Binary_" << deviceName << "[" << wordCount << "] = {"
+        << std::endl
+        << "    ";
+
+    for (size_t i = 0; i < wordCount; i++) {
+        if (i != 0) {
+            out << ", ";
+            // Eight words per line.
+            if (i % 8 == 0) {
+                out << std::endl
+                    << "    ";
+            }
+        }
+
+        // Assemble each word byte by byte so that its in-memory layout equals
+        // the source bytes. The generated array is registered below through a
+        // (const char *) cast together with the original byte size, so the
+        // bytes of a partial last word must sit at the start of that word
+        // (zero padded), not at its end. Copying through uint8_t also avoids
+        // reading the input through a possibly misaligned uint32_t pointer.
+        const size_t offset = i * sizeof(uint32_t);
+        const size_t bytesLeft = size - offset;
+        const size_t bytesInWord = (bytesLeft < sizeof(uint32_t)) ? bytesLeft : sizeof(uint32_t);
+        uint32_t word = 0;
+        uint8_t *wordBytes = reinterpret_cast<uint8_t *>(&word);
+        for (size_t j = 0; j < bytesInWord; j++) {
+            wordBytes[j] = binary[offset + j];
+        }
+        out << "0x" << std::hex << std::setw(8) << std::setfill('0') << word;
+    }
+    out << "};" << std::endl;
+
+    // Emit the static registration object for the generated array.
+    out << std::endl
+        << "#include \"runtime/built_ins/registry/built_ins_registry.h\"\n"
+        << std::endl;
+    out << "namespace OCLRT {" << std::endl;
+    out << "static RegisterEmbeddedResource register" << builtinName << "Bin(" << std::endl;
+    out << "    createBuiltinResourceName(" << std::endl;
+    out << "        EBuiltInOps::" << builtinName << "," << std::endl;
+    out << "        BuiltinCode::getExtension(BuiltinCode::ECodeType::Binary), \"" << deviceName << "\", 0)" << std::endl;
+    out << "        .c_str()," << std::endl;
+    out << "    (const char *)" << builtinName << "Binary"
+        << "_" << deviceName << "," << std::endl;
+    out << "    " << builtinName << "BinarySize_" << deviceName << ");" << std::endl;
+    out << "}" << std::endl;
+
+    return out.str();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// GetFileNameTrunk
+//
+// Returns the file name portion of filePath with the directory part and the
+// extension (everything from the last '.') removed,
+// e.g. "dir/kernel.cl" -> "kernel".
+////////////////////////////////////////////////////////////////////////////////
+std::string OfflineCompiler::getFileNameTrunk(std::string &filePath) {
+    // Index of the first character after the last path separator (0 if none).
+    const size_t lastSeparator = filePath.find_last_of("\\/");
+    const size_t nameStart = (lastSeparator == std::string::npos) ? 0 : lastSeparator + 1;
+
+    // Only a dot inside the file name itself starts an extension. A dot that
+    // belongs to a directory component (e.g. "out.dir/kernel") is ignored so
+    // that the length handed to substr() never wraps around.
+    size_t nameEnd = filePath.find_last_of('.');
+    if ((nameEnd == std::string::npos) || (nameEnd < nameStart)) {
+        nameEnd = filePath.size();
+    }
+
+    return filePath.substr(nameStart, nameEnd - nameStart);
+}
+//
+// Builds a comma-separated list of all non-null entries of hardwarePrefix,
+// in index order.
+std::string getDevicesTypes() {
+    std::ostringstream joined;
+    bool needSeparator = false;
+
+    for (int productId = 0; productId < IGFX_MAX_PRODUCT; productId++) {
+        const char *prefix = hardwarePrefix[productId];
+        if (prefix != nullptr) {
+            if (needSeparator) {
+                joined << ",";
+            }
+            joined << prefix;
+            needSeparator = true;
+        }
+    }
+
+    return joined.str();
+}
+////////////////////////////////////////////////////////////////////////////////
+// PrintUsage
+//
+// Prints the command line help to stdout. The option names listed here must
+// match the ones recognized by parseCommandLine().
+////////////////////////////////////////////////////////////////////////////////
+void OfflineCompiler::printUsage() {
+
+    printf("Compiles CL files into llvm (.bc or .ll), gen isa (.gen), and binary files (.bin)\n\n");
+    printf("cloc -file <filename> -device <device_type> [-out_dir <output_dir>]\n\n");
+    printf("  -file <filename>        Indicates the CL kernel file to be compiled.\n");
+    printf("  -device <device_type>   Indicates which device for which we will compile.\n");
+    printf("                          <device_type> can be: %s\n", getDevicesTypes().c_str());
+    printf("  -out_dir <output_dir>   Indicates the directory into which the compiled files\n");
+    printf("                          will be placed.\n");
+    printf("  -llvm_text              Readable LLVM text will be output in a .ll file instead of\n");
+    printf("                          through the default llvm binary (.bc) file.\n");
+    printf("  -cpp_file               Cpp file with scheduler program binary will be generated.\n");
+    printf("  -options <options>      Compiler options.\n");
+    printf("  -options_name           Add suffix with compile options to filename\n");
+    printf("  -32                     Force compile to 32-bit binary.\n");
+    printf("  -64                     Force compile to 64-bit binary.\n");
+    printf("  -q                      Be more quiet. print only warnings and errors.\n");
+    printf("  -?                      Print this usage message.\n");
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// StoreBinary
+//
+// Replaces the buffer referenced by pDst with a newly allocated (new[]) copy
+// of srcSize bytes taken from pSrc and reports the new size through dstSize.
+// Any buffer previously held in pDst is released with delete[].
+////////////////////////////////////////////////////////////////////////////////
+void OfflineCompiler::storeBinary(
+    char *&pDst,
+    size_t &dstSize,
+    const void *pSrc,
+    const size_t srcSize) {
+    dstSize = 0;
+
+    DEBUG_BREAK_IF(!(pSrc && srcSize > 0));
+
+    // Clear the pointer before allocating so it never dangles if new[] throws.
+    delete[] pDst;
+    pDst = nullptr;
+    pDst = new char[srcSize];
+
+    // Keep the full size_t value: a 32-bit cast would under-report the size
+    // of the buffer that was just allocated for inputs of 4 GiB and above.
+    dstSize = srcSize;
+    memcpy_s(pDst, dstSize, pSrc, srcSize);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// GenerateElfBinary
+//
+// Packs the build options, the LLVM binary and the device binary into an ELF
+// image stored in elfBinary / elfBinarySize.
+//
+// Returns false when no device binary is available, when the ELF writer
+// cannot be created, or when any writer call fails; true otherwise.
+////////////////////////////////////////////////////////////////////////////////
+bool OfflineCompiler::generateElfBinary() {
+    // Without a device binary there is nothing to package.
+    if (!genBinary || !genBinarySize) {
+        return false;
+    }
+
+    CLElfLib::CElfWriter *elfWriter = CLElfLib::CElfWriter::create(CLElfLib::EH_TYPE_OPENCL_EXECUTABLE, CLElfLib::EH_MACHINE_NONE, 0);
+    bool success = (elfWriter != nullptr);
+
+    if (success) {
+        // One node is reused for all sections; only these four fields change.
+        CLElfLib::SSectionNode sectionNode;
+        auto appendSection = [&](const char *sectionName, decltype(sectionNode.Type) sectionType, char *sectionData, size_t sectionSize) -> bool {
+            sectionNode.Name = sectionName;
+            sectionNode.Type = sectionType;
+            sectionNode.pData = sectionData;
+            sectionNode.DataSize = (uint32_t)sectionSize;
+            return elfWriter->addSection(&sectionNode);
+        };
+
+        // Always add the options string (including its terminating NUL)
+        success = appendSection("BuildOptions", CLElfLib::SH_TYPE_OPENCL_OPTIONS, (char *)options.c_str(), strlen(options.c_str()) + 1);
+
+        if (success) {
+            success = appendSection("Intel(R) OpenCL LLVM Object", CLElfLib::SH_TYPE_OPENCL_LLVM_BINARY, llvmBinary, llvmBinarySize);
+        }
+
+        // Add the device binary if it exists
+        if (success && genBinary) {
+            success = appendSection("Intel(R) OpenCL Device Binary", CLElfLib::SH_TYPE_OPENCL_DEV_BINARY, genBinary, genBinarySize);
+        }
+
+        if (success) {
+            // First pass: get the size
+            success = elfWriter->resolveBinary(elfBinary, elfBinarySize);
+        }
+
+        if (success) {
+            // Second pass: allocate the binary and let the writer fill it.
+            // NOTE(review): a previously held elfBinary is not released here -
+            // confirm this method is only called once per instance.
+            elfBinary = new char[elfBinarySize];
+            success = elfWriter->resolveBinary(elfBinary, elfBinarySize);
+        }
+    }
+
+    // destroy() is also reached with a null writer when creation failed.
+    CLElfLib::CElfWriter::destroy(elfWriter);
+
+    return success;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// WriteOutAllFiles
+//
+// Creates the output directory (including its parents) when one was requested
+// and writes every binary that is currently available: LLVM (.bc / .ll),
+// gen ISA (.gen, optionally also a .cpp embedding it) and ELF (.bin).
+// File names have the form "<input trunk>_<deviceName><extension>".
+////////////////////////////////////////////////////////////////////////////////
+void OfflineCompiler::writeOutAllFiles() {
+    std::string fileTrunk = getFileNameTrunk(inputFile);
+    const std::string fileBase = fileTrunk + "_" + deviceName;
+
+    if (!outputDirectory.empty()) {
+        // Collect the directory and each of its parents, outermost first, by
+        // repeatedly cutting the path at its last separator.
+        std::list<std::string> pathsToCreate;
+        std::string currentPath = outputDirectory;
+        size_t separatorPos = outputDirectory.size() + 1;
+
+        for (;;) {
+            pathsToCreate.push_front(currentPath);
+            separatorPos = currentPath.find_last_of("/\\", separatorPos);
+            if (separatorPos == std::string::npos) {
+                break;
+            }
+            currentPath = currentPath.substr(0, separatorPos);
+        }
+
+        for (const auto &path : pathsToCreate) {
+            MakeDirectory(path.c_str());
+        }
+    }
+
+    const std::string outputPrefix = outputDirectory.empty() ? std::string() : outputDirectory + "/";
+
+    // Builds "<prefix><fileBase><extension>". When the options suffix is both
+    // allowed for this file and enabled, the options string (spaces replaced
+    // with underscores) is appended after the extension.
+    auto makeOutputPath = [&](const char *extension, bool withOptionsSuffix) -> std::string {
+        std::string path = outputPrefix + fileBase + extension;
+        if (withOptionsSuffix && useOptionsSuffix) {
+            std::string opts(options.c_str());
+            std::replace(opts.begin(), opts.end(), ' ', '_');
+            path.append(opts);
+        }
+        return path;
+    };
+
+    if (llvmBinary) {
+        const std::string llvmOutputFile = makeOutputPath(useLlvmText ? ".ll" : ".bc", true);
+        writeDataToFile(
+            llvmOutputFile.c_str(),
+            llvmBinary,
+            llvmBinarySize);
+    }
+
+    if (genBinary) {
+        const std::string genOutputFile = makeOutputPath(".gen", true);
+        writeDataToFile(
+            genOutputFile.c_str(),
+            genBinary,
+            genBinarySize);
+
+        if (useCppFile) {
+            // The generated .cpp never carries the options suffix.
+            const std::string cppOutputFile = makeOutputPath(".cpp", false);
+            std::string cpp = parseBinAsCharArray((uint8_t *)genBinary, genBinarySize, deviceName, fileTrunk);
+            writeDataToFile(cppOutputFile.c_str(), cpp.c_str(), cpp.size());
+        }
+    }
+
+    if (elfBinary) {
+        const std::string elfOutputFile = makeOutputPath(".bin", true);
+        writeDataToFile(
+            elfOutputFile.c_str(),
+            elfBinary,
+            elfBinarySize);
+    }
+}
+} // namespace OCLRT
--- a/offline_compiler/offline_compiler.h
+++ b/offline_compiler/offline_compiler.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+#include "cif/common/cif_main.h"
+#include "ocl_igc_interface/igc_ocl_device_ctx.h"
+#include "ocl_igc_interface/fcl_ocl_device_ctx.h"
+#include <cstdint>
+#include <string>
+#include <memory>
+
+namespace OCLRT {
+
+struct HardwareInfo;
+class OsLibrary;
+
+std::string convertToPascalCase(const std::string &inString);
+
+// Offline-compiler specific status codes, returned alongside OpenCL status
+// codes such as CL_SUCCESS.
+enum ErrorCode {
+    // Unknown option, both -32 and -64 requested, or -file / -device missing.
+    INVALID_COMMAND_LINE = -5150,
+    // The file passed with -file does not exist.
+    INVALID_FILE = -5151,
+    // The usage text was printed (no arguments given, or -? passed).
+    PRINT_USAGE = -5152,
+};
+
+// Command-line driven compiler front end: parses the options, builds the
+// input and writes the resulting LLVM / gen ISA / ELF files.
+// Instances are non-copyable and the constructor is protected, so objects are
+// obtained through create().
+class OfflineCompiler {
+  public:
+    // Factory taking the raw command line; a status code is reported through
+    // retVal. NOTE(review): implementation not visible in this section -
+    // confirm ownership of the returned pointer and the retVal values.
+    static OfflineCompiler *create(uint32_t numArgs, const char **argv, int &retVal);
+    int build();
+    std::string &getBuildLog();
+    // Prints the command line help to stdout.
+    void printUsage();
+
+    OfflineCompiler &operator=(const OfflineCompiler &) = delete;
+    OfflineCompiler(const OfflineCompiler &) = delete;
+    ~OfflineCompiler();
+
+    // True when -q was passed on the command line.
+    bool isQuiet() const {
+        return quiet;
+    }
+
+    // Renders `size` bytes of `binary` as C++ source text defining a uint32_t
+    // array plus its registration as an embedded resource.
+    std::string parseBinAsCharArray(uint8_t *binary, size_t size, std::string &deviceName, std::string &fileName);
+
+  protected:
+    OfflineCompiler();
+
+    int getHardwareInfo(const char *pDeviceName);
+    // Returns the file name without directory part and extension.
+    std::string getFileNameTrunk(std::string &filePath);
+    std::string getStringWithinDelimiters(const std::string &src);
+    int initialize(uint32_t numArgs, const char **argv);
+    // Fills the option members below from argv and validates them.
+    int parseCommandLine(uint32_t numArgs, const char **argv);
+    // Appends options derived from debug-manager flags to internalOptions.
+    void parseDebugSettings();
+    // Replaces pDst with a new[] copy of srcSize bytes from pSrc.
+    void storeBinary(char *&pDst, size_t &dstSize, const void *pSrc, const size_t srcSize);
+    int buildSourceCode();
+    void updateBuildLog(const char *pErrorString, const size_t errorStringSize);
+    // Packs options, LLVM binary and device binary into elfBinary.
+    bool generateElfBinary();
+    // Writes llvmBinary / genBinary / elfBinary to files when they are set.
+    void writeOutAllFiles();
+    const HardwareInfo *hwInfo = nullptr;
+
+    std::string deviceName;      // value of -device
+    std::string inputFile;       // value of -file
+    std::string outputFile;
+    std::string outputDirectory; // value of -out_dir
+    std::string options;         // value of -options
+    std::string internalOptions; // accumulated by parseCommandLine / parseDebugSettings
+    std::string sourceCode;
+    std::string buildLog;
+
+    bool useLlvmText = false;      // -llvm_text: write .ll instead of .bc
+    bool useCppFile = false;       // -cpp_file: also write a .cpp with the gen binary
+    bool useOptionsSuffix = false; // -options_name: append options to output file names
+    bool quiet = false;            // -q
+
+    // Raw buffers with their sizes; elfBinary is allocated with new[] in
+    // generateElfBinary(), buffers set through storeBinary() likewise use new[].
+    char *elfBinary = nullptr;
+    size_t elfBinarySize = 0;
+    char *genBinary = nullptr;
+    size_t genBinarySize = 0;
+    char *llvmBinary = nullptr;
+    size_t llvmBinarySize = 0;
+
+    // IGC: loaded library, its CIF main interface and the OCL device context.
+    std::unique_ptr<OsLibrary> igcLib = nullptr;
+    CIF::RAII::UPtr_t<CIF::CIFMain> igcMain = nullptr;
+    CIF::RAII::UPtr_t<IGC::IgcOclDeviceCtxTagOCL> igcDeviceCtx = nullptr;
+
+    // FCL: the same three handles for the FCL library.
+    std::unique_ptr<OsLibrary> fclLib = nullptr;
+    CIF::RAII::UPtr_t<CIF::CIFMain> fclMain = nullptr;
+    CIF::RAII::UPtr_t<IGC::FclOclDeviceCtxTagOCL> fclDeviceCtx = nullptr;
+};
+} // namespace OCLRT
--- a/offline_compiler/options.cpp
+++ b/offline_compiler/options.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "hw_cmds.h"
+#include "runtime/helpers/options.h"
+#include "runtime/helpers/array_count.h"
+#include <cstddef>
+
+namespace OCLRT {
+// AUB file folder location
+const char *folderAUB = "aub_out";
+
+// Initial value for HW tag: (uint32_t)-1, i.e. all 32 bits set
+uint32_t initialHardwareTag = (uint32_t)-1;
+
+// Default device table: contains only the hardware info of DEFAULT_PLATFORM
+static const HardwareInfo *DefaultPlatformDevices[] =
+    {
+        &DEFAULT_PLATFORM::hwInfo,
+};
+
+// Number of devices in the platform, taken from the table above, and the
+// externally visible pointer to that table
+size_t numPlatformDevices = ARRAY_COUNT(DefaultPlatformDevices);
+const HardwareInfo **platformDevices = DefaultPlatformDevices;
+} // namespace OCLRT
--- a/package.cmake
+++ b/package.cmake
@@ -0,0 +1,125 @@
+# Copyright (c) 2017, Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+if(UNIX)
+  set(package_input_dir ${IGDRCL_BINARY_DIR}/packageinput)
+  set(package_output_dir ${IGDRCL_BINARY_DIR}/packages)
+
+  # Default to version 1.0.0 for every component the caller did not provide.
+  if(NOT NEO_VERSION_MAJOR)
+    set(NEO_VERSION_MAJOR 1)
+  endif()
+  if(NOT NEO_VERSION_MINOR)
+    set(NEO_VERSION_MINOR 0)
+  endif()
+  if(NOT NEO_VERSION_BUILD)
+    set(NEO_VERSION_BUILD 0)
+  endif()
+
+  # All binaries are installed into a fixed location.
+  set(NEO_BINARY_INSTALL_DIR /opt/intel/opencl)
+  set(CMAKE_INSTALL_PREFIX ${NEO_BINARY_INSTALL_DIR})
+
+  # Shared libraries shipped with the package.
+  install(FILES
+    ${IGDRCL_BINARY_DIR}/bin/libigdrcl.so
+    ${IGDRCL_BINARY_DIR}/bin/libigdccl.so
+    ${IGDRCL_BINARY_DIR}/bin/libigdfcl.so
+    ${IGDRCL_BINARY_DIR}/bin/libiga64.so
+    ${IGDRCL_BINARY_DIR}/bin/libcommon_clang.so
+    DESTINATION ${NEO_BINARY_INSTALL_DIR}
+    COMPONENT igdrcl
+  )
+
+  # Files generated at install time:
+  #   libintelopencl.conf - ld.so.conf.d entry for the install directory
+  #   intel.icd           - ICD vendor file with the full path of the runtime
+  #   postinst / postrm   - package scripts that add / remove the install
+  #                         directory in /etc/ld.so.conf and run ldconfig
+  # Note: the install path is spelled out literally in these strings rather
+  # than taken from NEO_BINARY_INSTALL_DIR; keep both in sync.
+  set(OCL_ICD_RUNTIME_NAME libigdrcl.so)
+  install(
+    CODE "file( WRITE  ${IGDRCL_BINARY_DIR}/libintelopencl.conf \"/opt/intel/opencl\n\" )"
+    CODE "file( WRITE  ${IGDRCL_BINARY_DIR}/intel.icd \"/opt/intel/opencl/${OCL_ICD_RUNTIME_NAME}\n\" )"
+    CODE "file( WRITE  ${IGDRCL_BINARY_DIR}/postinst \"echo /opt/intel/opencl >> /etc/ld.so.conf\n\" )"
+    CODE "file( APPEND ${IGDRCL_BINARY_DIR}/postinst \"/sbin/ldconfig\n\" )"
+    CODE "file( WRITE  ${IGDRCL_BINARY_DIR}/postrm \"sed -i '/\\\\/opt\\\\/intel\\\\/opencl.*$/d' /etc/ld.so.conf\n\" )"
+    CODE "file( APPEND ${IGDRCL_BINARY_DIR}/postrm \"/sbin/ldconfig\n\" )"
+    COMPONENT igdrcl
+  )
+  install(FILES ${IGDRCL_BINARY_DIR}/libintelopencl.conf DESTINATION /etc/ld.so.conf.d COMPONENT igdrcl)
+  install(FILES ${IGDRCL_BINARY_DIR}/intel.icd DESTINATION /etc/OpenCL/vendors/ COMPONENT igdrcl)
+
+  # Add Khronos ICD loader - if available
+  # ICD_LIB_DIR may be supplied by the caller; otherwise a build of the loader
+  # next to the source tree is used when it exists.
+  if(NOT ICD_LIB_DIR)
+    # Try to find ICD in upper level directory
+    if(EXISTS ${IGDRCL_SOURCE_DIR}/../OpenCL-ICD-Loader/build/lib/libOpenCL.so)
+      set(ICD_LIB_DIR ${IGDRCL_SOURCE_DIR}/../OpenCL-ICD-Loader/build/lib)
+      message(STATUS "Taking ICD library from ${ICD_LIB_DIR}")
+    else()
+      get_filename_component(IGDRCL_PARENT_DIR ${IGDRCL_SOURCE_DIR} DIRECTORY)
+      message(WARNING "Missing Khronos ICD library. Generated package (.rpm, .deb, .tar.xz) may be incomplete.\nPlease download Khronos ICD loader to ${IGDRCL_PARENT_DIR} and build it, or point directory containing library libOpenCL.so using ICD_LIB_DIR.")
+    endif()
+  endif()
+
+  # When a loader directory is known, copy every libOpenCL.so* file from it
+  # into the install directory at install time. The install step fails if no
+  # such file can be found.
+  if(ICD_LIB_DIR)
+    get_filename_component(ICD_LIB_DIR ${ICD_LIB_DIR} ABSOLUTE)
+    set(ICD_LIB_NAME "libOpenCL.so*")
+    install(
+      CODE "if(NOT((EXISTS ${ICD_LIB_DIR}/libOpenCL.so) OR (IS_SYMLINK ${ICD_LIB_DIR}/libOpenCL.so)))\n execute_process( COMMAND ln -s ${NEO_BINARY_INSTALL_DIR}/libOpenCL.so.1 ${ICD_LIB_DIR}/libOpenCL.so)\n endif()\n"
+      CODE "file( GLOB _NeoIcdLibFiles \"${ICD_LIB_DIR}/${ICD_LIB_NAME}\" )"
+      CODE "if(NOT _NeoIcdLibFiles)\n message(FATAL_ERROR \"${ICD_LIB_NAME} cannot be found in ${ICD_LIB_DIR}\")\nendif()"
+      CODE "file( INSTALL \${_NeoIcdLibFiles} DESTINATION \"${NEO_BINARY_INSTALL_DIR}\" )"
+      COMPONENT igdrcl
+    )
+  endif()
+
+  # Package generator selection: an explicit NEO_CPACK_GENERATOR wins.
+  if(NEO_CPACK_GENERATOR)
+    set(CPACK_GENERATOR "${NEO_CPACK_GENERATOR}")
+  else()
+    # If the generator list was not defined, build a native package for the
+    # current distro: DEB on Debian-like systems, RPM on Red Hat-like systems,
+    # and a .tar.xz archive otherwise.
+    if(EXISTS "/etc/debian_version")
+      set(CPACK_GENERATOR "DEB")
+    elseif(EXISTS "/etc/redhat-release")
+      set(CPACK_GENERATOR "RPM")
+    else()
+      set(CPACK_GENERATOR "TXZ")
+    endif()
+  endif()
+
+  # Package metadata. These variables must be set before include(CPack).
+  set(CPACK_SET_DESTDIR TRUE)
+  set(CPACK_PACKAGE_RELOCATABLE FALSE)
+  set(CPACK_PACKAGE_NAME "intel-opencl")
+  set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Intel OpenCL GPU driver")
+  set(CPACK_PACKAGE_VENDOR "Intel")
+  set(CPACK_PACKAGE_VERSION_MAJOR ${NEO_VERSION_MAJOR})
+  set(CPACK_PACKAGE_VERSION_MINOR ${NEO_VERSION_MINOR})
+  set(CPACK_PACKAGE_VERSION_PATCH ${NEO_VERSION_BUILD})
+  set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE "amd64")
+  set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "postinst;postrm")
+  set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64")
+  set(CPACK_RPM_COMPRESSION_TYPE "xz")
+  set(CPACK_RPM_PACKAGE_DESCRIPTION "Intel OpenCL GPU driver")
+  set(CPACK_RPM_PACKAGE_GROUP "System Environment/Libraries")
+  set(CPACK_RPM_PACKAGE_LICENSE "MIT")
+  set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${IGDRCL_BINARY_DIR}/postinst")
+  set(CPACK_RPM_POST_UNINSTALL_SCRIPT_FILE "${IGDRCL_BINARY_DIR}/postrm")
+  set(CPACK_PACKAGE_INSTALL_DIRECTORY "/opt/intel/opencl")
+  set(CPACK_PACKAGE_CONTACT "Intel Corporation")
+  # The file name uses the RPM architecture string for every generator.
+  set(CPACK_PACKAGE_FILE_NAME "intel-opencl-${NEO_VERSION_MAJOR}.${NEO_VERSION_MINOR}-${NEO_VERSION_BUILD}.${CPACK_RPM_PACKAGE_ARCHITECTURE}")
+  # Package only the igdrcl install component.
+  set(CPACK_DEB_COMPONENT_INSTALL ON)
+  set(CPACK_RPM_COMPONENT_INSTALL ON)
+  set(CPACK_ARCHIVE_COMPONENT_INSTALL ON)
+  set(CPACK_COMPONENTS_ALL igdrcl)
+
+  include(CPack)
+
+endif(UNIX)
--- a/platforms.cmake
+++ b/platforms.cmake
@@ -0,0 +1,255 @@
+# Copyright (c) 2017, Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+# We require CMake 3.2.0 or later
+cmake_minimum_required(VERSION 3.2.0 FATAL_ERROR)
+
+set(MAX_GEN 64) # highest gen index used below; the per-gen lists get one slot for each index 0..MAX_GEN
+
+macro(INIT_LIST LIST_TYPE ELEMENT_TYPE) # appends one empty slot per gen index to ALL_<ELEMENT_TYPE>_<LIST_TYPE>
+  foreach(IT RANGE 0 ${MAX_GEN}) # inclusive range with the default step of 1, i.e. MAX_GEN + 1 iterations
+    list(APPEND ALL_${ELEMENT_TYPE}_${LIST_TYPE} " ") # a single space is the "no entry" placeholder
+  endforeach()
+endmacro()
+
+macro(GET_LIST_FOR_GEN LIST_TYPE ELEMENT_TYPE GEN_NUMBER OUT_LIST) # OUT_LIST <- items stored for GEN_NUMBER, as a CMake list
+  list(GET ALL_${ELEMENT_TYPE}_${LIST_TYPE} ${GEN_NUMBER} GEN_X_${LIST_TYPE}) # raw "_"-joined slot; being a macro, GEN_X_<LIST_TYPE> stays set for the caller
+  string(REPLACE "_" ";" ${OUT_LIST} ${GEN_X_${LIST_TYPE}}) # split on "_"; a slot nothing was added to yields the " " placeholder
+endmacro()
+
+macro(ADD_ITEM_FOR_GEN LIST_TYPE ELEMENT_TYPE GEN_NUMBER ITEM) # appends ITEM to slot GEN_NUMBER of ALL_<ELEMENT_TYPE>_<LIST_TYPE>
+  list(GET ALL_${ELEMENT_TYPE}_${LIST_TYPE} ${GEN_NUMBER} GEN_X_LIST)
+  string(REPLACE " " "" GEN_X_LIST ${GEN_X_LIST}) # drop the " " placeholder so an unused slot compares as empty
+  if("${GEN_X_LIST}" STREQUAL "")
+    set(GEN_X_LIST "${ITEM}") # first item for this gen
+  else()
+    set(GEN_X_LIST "${GEN_X_LIST}_${ITEM}") # items are joined with "_" (GET_LIST_FOR_GEN splits on it), so ITEM must not contain "_"
+  endif()
+  list(REMOVE_AT ALL_${ELEMENT_TYPE}_${LIST_TYPE} ${GEN_NUMBER}) # swap the old slot ...
+  list(INSERT ALL_${ELEMENT_TYPE}_${LIST_TYPE} ${GEN_NUMBER} ${GEN_X_LIST}) # ... for the updated value at the same index
+endmacro()
+
+macro(GEN_CONTAINS_PLATFORMS TYPE GEN_NUMBER OUT_FLAG) # OUT_FLAG <- TRUE when gen GEN_NUMBER has at least one <TYPE> platform
+  GET_LIST_FOR_GEN("PLATFORMS" ${TYPE} ${GEN_NUMBER} GEN_X_PLATFORMS)
+  string(REPLACE " " "" GEN_X_PLATFORMS ${GEN_X_PLATFORMS}) # the placeholder collapses to ""; a multi-item list is concatenated into one string
+  if("${GEN_X_PLATFORMS}" STREQUAL "")
+    set(${OUT_FLAG} FALSE)
+  else()
+    set(${OUT_FLAG} TRUE)
+  endif()
+endmacro()
+
+macro(GET_AVAILABLE_PLATFORMS TYPE FLAG_NAME OUT_STR) # fills <TYPE>_PLATFORM_LIST, <TYPE>_GEN_FLAGS_DEFINITONS, the DEFAULT_<TYPE>_* platforms and OUT_STR
+  set(${TYPE}_PLATFORM_LIST)
+  set(${TYPE}_GEN_FLAGS_DEFINITONS)
+  foreach(GEN_NUM RANGE ${MAX_GEN} 0 -1) # highest gen first, so the overall default comes from the highest populated gen
+    GEN_CONTAINS_PLATFORMS(${TYPE} ${GEN_NUM} GENX_HAS_PLATFORMS)
+    if(${GENX_HAS_PLATFORMS})
+      list(APPEND ${TYPE}_GEN_FLAGS_DEFINITONS ${FLAG_NAME}_GEN${GEN_NUM})
+      GET_LIST_FOR_GEN("PLATFORMS" ${TYPE} ${GEN_NUM} ${TYPE}_GENX_PLATFORMS)
+      list(APPEND ${TYPE}_PLATFORM_LIST ${${TYPE}_GENX_PLATFORMS})
+      if(NOT DEFAULT_${TYPE}_PLATFORM) # a default that is already set is kept
+        list(GET ${TYPE}_PLATFORM_LIST 0 DEFAULT_${TYPE}_PLATFORM) # the last argument of list(GET) is the output variable, so nothing may follow it
+      endif()
+      if(NOT DEFAULT_${TYPE}_GEN${GEN_NUM}_PLATFORM)
+        list(GET ${TYPE}_GENX_PLATFORMS 0 DEFAULT_${TYPE}_GEN${GEN_NUM}_PLATFORM) # per-gen default: first platform registered for this gen
+      endif()
+    endif()
+  endforeach()
+  foreach(PLATFORM_IT ${${TYPE}_PLATFORM_LIST})
+    set(${OUT_STR} "${${OUT_STR}} ${PLATFORM_IT}") # space-separated names, each preceded by a space
+    list(APPEND ${TYPE}_GEN_FLAGS_DEFINITONS ${FLAG_NAME}_${PLATFORM_IT})
+  endforeach()
+endmacro()
+
+macro(GET_PLATFORMS_FOR_GEN TYPE GEN_NUMBER OUT_LIST) # OUT_LIST <- <TYPE> platforms registered for GEN_NUMBER
+  GET_LIST_FOR_GEN("PLATFORMS" ${TYPE} ${GEN_NUMBER} ${OUT_LIST}) # thin wrapper that fixes LIST_TYPE to "PLATFORMS"
+endmacro()
+
+macro(GET_TEST_CONFIGURATIONS_FOR_PLATFORM TYPE GEN_NUMBER PLATFORM OUT_LIST) # OUT_LIST <- <TYPE> configurations of GEN_NUMBER whose first "/" field names PLATFORM
+  set(${OUT_LIST})
+  string(TOLOWER ${PLATFORM} PLATFORM_LOWER) # configurations in this file are registered in lower case, e.g. "skl/1/3/8"
+  GET_LIST_FOR_GEN("CONFIGURATIONS" ${TYPE} ${GEN_NUMBER} ALL_CONFIGURATIONS_FOR_GEN)
+  foreach(CONFIGURATION ${ALL_CONFIGURATIONS_FOR_GEN})
+    string(REPLACE "/" ";" CONFIGURATION_PARAMS ${CONFIGURATION})
+    list(GET CONFIGURATION_PARAMS 0 CONFIGURATION_PLATFORM)
+    if("${CONFIGURATION_PLATFORM}" STREQUAL "${PLATFORM_LOWER}") # quoted: compared as strings, not dereferenced as variable names (CMP0054)
+      list(APPEND ${OUT_LIST} ${CONFIGURATION})
+    endif()
+  endforeach()
+endmacro()
+
+macro(PLATFORM_HAS_2_0 GEN_NUMBER PLATFORM_NAME OUT_FLAG) # OUT_FLAG <- TRUE when PLATFORM_NAME is in the SUPPORTED_2_0 list of GEN_NUMBER
+  GET_LIST_FOR_GEN("PLATFORMS" "SUPPORTED_2_0" ${GEN_NUMBER} GEN_X_PLATFORMS)
+  list(FIND GEN_X_PLATFORMS ${PLATFORM_NAME} PLATFORM_EXISTS) # index of the match, or -1 when absent
+  if("${PLATFORM_EXISTS}" LESS 0)
+    set(${OUT_FLAG} FALSE)
+  else()
+    set(${OUT_FLAG} TRUE)
+  endif()
+
+endmacro()
+
+macro(PLATFORM_TESTED_WITH_APPVERIFIER GEN_NUMBER PLATFORM_NAME OUT_FLAG) # OUT_FLAG <- TRUE when PLATFORM_NAME is in the TESTED_APPVERIFIER list of GEN_NUMBER
+  GET_LIST_FOR_GEN("PLATFORMS" "TESTED_APPVERIFIER" ${GEN_NUMBER} GEN_X_PLATFORMS)
+  list(FIND GEN_X_PLATFORMS ${PLATFORM_NAME} PLATFORM_EXISTS) # index of the match, or -1 when absent
+  if("${PLATFORM_EXISTS}" LESS 0)
+    set(${OUT_FLAG} FALSE)
+  else()
+    set(${OUT_FLAG} TRUE)
+  endif()
+
+endmacro()
+
+# Default for the per-gen SUPPORT_GENx options below (a cache entry, so it can be overridden at configure time)
+set(SUPPORT_GEN_DEFAULT TRUE CACHE BOOL "default value for SUPPORT_GENx")
+# Default for the per-platform SUPPORT_<platform> options
+set(SUPPORT_PLATFORM_DEFAULT TRUE CACHE BOOL "default value for support platform")
+
+# Hardware generations we support; each option defaults to SUPPORT_GEN_DEFAULT
+set(SUPPORT_GEN8 ${SUPPORT_GEN_DEFAULT} CACHE BOOL "Support Gen8 devices")
+set(SUPPORT_GEN9 ${SUPPORT_GEN_DEFAULT} CACHE BOOL "Support Gen9 devices")
+
+# Hardware generations we build ULTs for; each option defaults to the matching SUPPORT_GENx value
+set(TESTS_GEN8 ${SUPPORT_GEN8} CACHE BOOL "Build ULTs for Gen8 devices")
+set(TESTS_GEN9 ${SUPPORT_GEN9} CACHE BOOL "Build ULTs for Gen9 devices")
+
+if(SUPPORT_GEN9) # per-platform support options are declared only when Gen9 support is on
+  set(SUPPORT_SKL ${SUPPORT_PLATFORM_DEFAULT} CACHE BOOL "Support SKL")
+  set(SUPPORT_KBL ${SUPPORT_PLATFORM_DEFAULT} CACHE BOOL "Support KBL")
+  set(SUPPORT_BXT ${SUPPORT_PLATFORM_DEFAULT} CACHE BOOL "Support BXT")
+  set(SUPPORT_GLK ${SUPPORT_PLATFORM_DEFAULT} CACHE BOOL "Support GLK")
+endif()
+
+if(TESTS_GEN9) # a TESTS_<platform> option is declared only for a supported platform; same platform order as above
+  if(SUPPORT_SKL)
+    set(TESTS_SKL ${TESTS_GEN9} CACHE BOOL "Build ULTs for SKL")
+  endif()
+  if(SUPPORT_KBL)
+    set(TESTS_KBL ${TESTS_GEN9} CACHE BOOL "Build ULTs for KBL")
+  endif()
+  if(SUPPORT_BXT)
+    set(TESTS_BXT ${TESTS_GEN9} CACHE BOOL "Build ULTs for BXT")
+  endif()
+  if(SUPPORT_GLK)
+    set(TESTS_GLK ${TESTS_GEN9} CACHE BOOL "Build ULTs for GLK")
+  endif()
+endif()
+
+# Init lists: INIT_LIST(<LIST_TYPE> <ELEMENT_TYPE>) creates ALL_<ELEMENT_TYPE>_<LIST_TYPE>, e.g. ALL_TESTED_FAMILY_NAME
+INIT_LIST("FAMILY_NAME" "TESTED")
+INIT_LIST("PLATFORMS" "SUPPORTED")
+INIT_LIST("PLATFORMS" "SUPPORTED_2_0") # queried by PLATFORM_HAS_2_0
+INIT_LIST("PLATFORMS" "TESTED")
+INIT_LIST("PLATFORMS" "TESTED_APPVERIFIER") # queried by PLATFORM_TESTED_WITH_APPVERIFIER
+INIT_LIST("CONFIGURATIONS" "UNIT_TESTS")
+INIT_LIST("CONFIGURATIONS" "AUB_TESTS")
+INIT_LIST("CONFIGURATIONS" "MT_TESTS")
+
+# Register supported and tested platforms per gen
+if(SUPPORT_GEN8)
+  ADD_ITEM_FOR_GEN("PLATFORMS" "SUPPORTED" 8 "BDW")
+  ADD_ITEM_FOR_GEN("PLATFORMS" "SUPPORTED_2_0" 8 "BDW")
+  if(TESTS_GEN8) # test registrations only when ULTs for Gen8 are enabled
+    ADD_ITEM_FOR_GEN("FAMILY_NAME" "TESTED" 8 "BDWFamily")
+    ADD_ITEM_FOR_GEN("PLATFORMS" "TESTED" 8 "BDW")
+    ADD_ITEM_FOR_GEN("PLATFORMS" "TESTED_APPVERIFIER" 8 "BDW")
+    ADD_ITEM_FOR_GEN("CONFIGURATIONS" "AUB_TESTS" 8 "bdw/1/3/8") # "<platform>/<n>/<n>/<n>"; the numeric fields are presumably a hardware configuration - confirm
+    ADD_ITEM_FOR_GEN("CONFIGURATIONS" "MT_TESTS" 8 "bdw/1/3/8")
+    ADD_ITEM_FOR_GEN("CONFIGURATIONS" "UNIT_TESTS" 8 "bdw/1/3/8")
+  endif()
+endif()
+
+if(SUPPORT_GEN9) # platform order below defines the list order, and with it the default Gen9 platform
+  if(TESTS_GEN9)
+    ADD_ITEM_FOR_GEN("FAMILY_NAME" "TESTED" 9 "SKLFamily") # one family name for the whole gen, independent of the per-platform switches
+  endif()
+  if(SUPPORT_SKL)
+    ADD_ITEM_FOR_GEN("PLATFORMS" "SUPPORTED" 9 "SKL")
+    ADD_ITEM_FOR_GEN("PLATFORMS" "SUPPORTED_2_0" 9 "SKL")
+    if(TESTS_SKL)
+      ADD_ITEM_FOR_GEN("PLATFORMS" "TESTED" 9 "SKL")
+      ADD_ITEM_FOR_GEN("PLATFORMS" "TESTED_APPVERIFIER" 9 "SKL")
+      ADD_ITEM_FOR_GEN("CONFIGURATIONS" "AUB_TESTS" 9 "skl/1/3/8")
+      ADD_ITEM_FOR_GEN("CONFIGURATIONS" "MT_TESTS" 9 "skl/1/3/8")
+      ADD_ITEM_FOR_GEN("CONFIGURATIONS" "UNIT_TESTS" 9 "skl/1/3/8")
+    endif()
+  endif()
+
+  if(SUPPORT_KBL)
+    ADD_ITEM_FOR_GEN("PLATFORMS" "SUPPORTED" 9 "KBL")
+    ADD_ITEM_FOR_GEN("PLATFORMS" "SUPPORTED_2_0" 9 "KBL")
+    if(TESTS_KBL) # unit tests only; no AUB or MT configuration is registered for KBL
+      ADD_ITEM_FOR_GEN("PLATFORMS" "TESTED" 9 "KBL")
+      ADD_ITEM_FOR_GEN("CONFIGURATIONS" "UNIT_TESTS" 9 "kbl/1/3/6")
+    endif()
+  endif()
+
+  if(SUPPORT_GLK) # GLK is not added to the SUPPORTED_2_0 list
+    ADD_ITEM_FOR_GEN("PLATFORMS" "SUPPORTED" 9 "GLK")
+    if(TESTS_GLK)
+      ADD_ITEM_FOR_GEN("PLATFORMS" "TESTED" 9 "GLK")
+      ADD_ITEM_FOR_GEN("CONFIGURATIONS" "UNIT_TESTS" 9 "glk/1/3/6")
+    endif()
+  endif()
+
+  if(SUPPORT_BXT) # BXT is not added to the SUPPORTED_2_0 list
+    ADD_ITEM_FOR_GEN("PLATFORMS" "SUPPORTED" 9 "BXT")
+    if(TESTS_BXT)
+      ADD_ITEM_FOR_GEN("PLATFORMS" "TESTED" 9 "BXT")
+      ADD_ITEM_FOR_GEN("CONFIGURATIONS" "AUB_TESTS" 9 "bxt/1/3/6")
+      ADD_ITEM_FOR_GEN("CONFIGURATIONS" "UNIT_TESTS" 9 "bxt/1/3/6")
+    endif()
+  endif()
+endif()
+
+# Build the platform lists and flag definitions, pick default platforms, then validate the defaults
+GET_AVAILABLE_PLATFORMS("SUPPORTED" "SUPPORT" ALL_AVAILABLE_SUPPORTED_PLATFORMS)
+GET_AVAILABLE_PLATFORMS("TESTED" "TESTS" ALL_AVAILABLE_TESTED_PLATFORMS)
+
+message(STATUS "All supported platforms: ${ALL_AVAILABLE_SUPPORTED_PLATFORMS}")
+message(STATUS "All tested platforms: ${ALL_AVAILABLE_TESTED_PLATFORMS}")
+
+message(STATUS "Default supported platform: ${DEFAULT_SUPPORTED_PLATFORM}")
+
+list(FIND SUPPORTED_PLATFORM_LIST "${DEFAULT_SUPPORTED_PLATFORM}" VALID_DEFAULT_SUPPORTED_PLATFORM) # quoted: an empty default reaches the FATAL_ERROR below instead of breaking list(FIND)'s argument count
+if(VALID_DEFAULT_SUPPORTED_PLATFORM LESS 0)
+  message(FATAL_ERROR "Not a valid supported platform: ${DEFAULT_SUPPORTED_PLATFORM}")
+endif()
+
+message(STATUS "Default tested platform: ${DEFAULT_TESTED_PLATFORM}")
+
+if(DEFAULT_TESTED_PLATFORM) # a tested platform is optional, so it is validated only when one is set
+  list(FIND TESTED_PLATFORM_LIST "${DEFAULT_TESTED_PLATFORM}" VALID_DEFAULT_TESTED_PLATFORM)
+  if(VALID_DEFAULT_TESTED_PLATFORM LESS 0)
+    message(FATAL_ERROR "Not a valid tested platform: ${DEFAULT_TESTED_PLATFORM}")
+  endif()
+endif()
+
+if(NOT DEFAULT_TESTED_FAMILY_NAME) # pick a default only when none is set
+  foreach(GEN_NUM RANGE ${MAX_GEN} 0 -1) # scan from the highest gen index down
+    list(GET ALL_TESTED_FAMILY_NAME ${GEN_NUM} GEN_FAMILY_NAME)
+    if(NOT GEN_FAMILY_NAME STREQUAL " ") # " " is the placeholder INIT_LIST stores in unused slots
+      set(DEFAULT_TESTED_FAMILY_NAME ${GEN_FAMILY_NAME})
+      break() # the first hit is the highest gen with a tested family
+    endif()
+  endforeach()
+endif()
+message(STATUS "Default tested family name: ${DEFAULT_TESTED_FAMILY_NAME}")
--- a/public/cl_ext_private.h
+++ b/public/cl_ext_private.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#pragma once
+
+/***************************************
+ * Internal-only driver version tokens *
+ ***************************************/
+// Under Intel evaluation; remove once approved for public release
+#define CL_DEVICE_DRIVER_VERSION_INTEL 0x10010
+
+#define CL_DEVICE_DRIVER_VERSION_INTEL_NEO1 0x454E4831 // bytes spell "ENH1" in ASCII (0x45 'E', 0x4E 'N', 0x48 'H', 0x31 '1'); NOTE(review): the macro name says NEO1 - confirm which is intended
+
+/***************************************
+ * cl_intel_debug_info extension       *
+ ***************************************/
+#define cl_intel_debug_info 1
+
+// New queries for clGetProgramInfo:
+#define CL_PROGRAM_DEBUG_INFO_INTEL 0x4100
+#define CL_PROGRAM_DEBUG_INFO_SIZES_INTEL 0x4101
+
+// New queries for clGetKernelInfo (note that 0x407D lies outside the 0x41xx range used by the other tokens here):
+#define CL_KERNEL_BINARY_PROGRAM_INTEL 0x407D
+#define CL_KERNEL_BINARIES_INTEL 0x4102
+#define CL_KERNEL_BINARY_SIZES_INTEL 0x4103
--- a/public/cl_vebox_intel.h
+++ b/public/cl_vebox_intel.h
@@ -0,0 +1,414 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef __CL_EXT_VEBOX_INTEL_H
+#define __CL_EXT_VEBOX_INTEL_H
+
+#include <CL/cl.h> // included ahead of the linkage block so the header is not parsed inside extern "C"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/***************************************
+* cl_intel_video_enhancement extension *
+****************************************/
+#define CL_ACCELERATOR_TYPE_VE_INTEL 0x9
+#define CL_DEVICE_VE_VERSION_INTEL 0x4160
+#define CL_DEVICE_VE_ENGINE_COUNT_INTEL 0x4161
+#define CL_DEVICE_VE_COLOR_PIPE_VERSION_INTEL 0x416A
+#define CL_DEVICE_VE_CAMERA_PIPE_VERSION_INTEL 0x4177
+#define CL_VE_VERSION_VER_1_INTEL 0x1 // CL_VE_VERSION_VER_*: presumably the values reported for CL_DEVICE_VE_VERSION_INTEL - confirm
+#define CL_VE_VERSION_VER_2_INTEL 0x2
+#define CL_VE_VERSION_VER_3_INTEL 0x3
+#define CL_QUEUE_VE_ENABLE_INTEL 0x4162
+// VE attribute ids (presumably the values stored in cl_ve_attrib_desc_intel::attrib_id - confirm); not contiguous: 0x4166..0x416A and 0x4177 belong to other tokens in this header
+#define CL_VE_ACCELERATOR_ATTRIB_DENOISE_INTEL 0x4163
+#define CL_VE_ACCELERATOR_ATTRIB_DEINTERLACE_INTEL 0x4164
+#define CL_VE_ACCELERATOR_ATTRIB_HPC_INTEL 0x4165
+#define CL_VE_ACCELERATOR_ATTRIB_STD_STE_INTEL 0x416B
+#define CL_VE_ACCELERATOR_ATTRIB_GAMUT_COMP_INTEL 0x416C
+#define CL_VE_ACCELERATOR_ATTRIB_GECC_INTEL 0x416D
+#define CL_VE_ACCELERATOR_ATTRIB_ACE_INTEL 0x416E
+#define CL_VE_ACCELERATOR_ATTRIB_ACE_ADVANCED_INTEL 0x416F
+#define CL_VE_ACCELERATOR_ATTRIB_TCC_INTEL 0x4170
+#define CL_VE_ACCELERATOR_ATTRIB_PROC_AMP_INTEL 0x4171
+#define CL_VE_ACCELERATOR_ATTRIB_BACK_END_CSC_INTEL 0x4172
+#define CL_VE_ACCELERATOR_ATTRIB_AOI_ALPHA_INTEL 0x4173
+#define CL_VE_ACCELERATOR_ATTRIB_CCM_INTEL 0x4174
+#define CL_VE_ACCELERATOR_ATTRIB_FWD_GAMMA_CORRECTION_INTEL 0x4175
+#define CL_VE_ACCELERATOR_ATTRIB_FRONT_END_CSC_INTEL 0x4176
+#define CL_VE_ACCELERATOR_ATTRIB_BLC_INTEL 0x4178
+#define CL_VE_ACCELERATOR_ATTRIB_DEMOSAIC_INTEL 0x4179
+#define CL_VE_ACCELERATOR_ATTRIB_WBC_INTEL 0x417A
+#define CL_VE_ACCELERATOR_ATTRIB_VIGNETTE_INTEL 0x417B
+
+// VE statistics tokens (0x4166..0x4169)
+#define CL_VE_ACCELERATOR_HISTOGRAMS_INTEL 0x4166
+#define CL_VE_ACCELERATOR_STATISTICS_INTEL 0x4167
+#define CL_VE_ACCELERATOR_STMM_INPUT_INTEL 0x4168
+#define CL_VE_ACCELERATOR_STMM_OUTPUT_INTEL 0x4169
+
+// Denoise control: range and default of the denoise factor (presumably cl_ve_dn_attrib_intel::denoise_factor - confirm)
+#define CL_VE_DENOISE_FACTOR_MAX_INTEL 64
+#define CL_VE_DENOISE_FACTOR_MIN_INTEL 0
+#define CL_VE_DENOISE_FACTOR_DEFAULT_INTEL 32
+
+// Hot Pixel Correction ranges (presumably for cl_ve_hpc_attrib_intel::threshold and ::count - confirm)
+#define CL_VE_HPC_THRESHOLD_MAX_INTEL 255
+#define CL_VE_HPC_THRESHOLD_MIN_INTEL 0
+#define CL_VE_HPC_THRESHOLD_DEFAULT_INTEL 0
+#define CL_VE_HPC_PIXEL_COUNT_MAX_INTEL 8
+#define CL_VE_HPC_PIXEL_COUNT_MIN_INTEL 0
+#define CL_VE_HPC_PIXEL_COUNT_DEFAULT_INTEL 0
+
+// Skin tone detection/enhancement ranges (presumably for cl_ve_std_ste_attrib_intel::ste_factor - confirm)
+#define CL_VE_STE_FACTOR_MIN_INTEL 0
+#define CL_VE_STE_FACTOR_MAX_INTEL 10
+#define CL_VE_STE_FACTOR_DEFAULT_INTEL 3
+
+// Gamut compression constants: scaling-factor range, chromaticity-control range, then default and sRGB x/y values per R/G/B. The macro names spell CONTRL; renaming them would break users of this header.
+#define CL_VE_GAMUT_SCALING_FACTOR_MAX_INTEL 4.0f
+#define CL_VE_GAMUT_SCALING_FACTOR_MIN_INTEL 0.0f
+#define CL_VE_GAMUT_SCALING_FACTOR_DEFAULT_INTEL 0.0f
+#define CL_VE_GAMUT_CHROMATICITY_CONTROLS_MAX_INTEL 1.0f
+#define CL_VE_GAMUT_CHROMATICITY_CONTROLS_MIN_INTEL 0.0f
+#define CL_VE_GAMUT_CHROMATICITY_CONTRL_RX_DEFAULT_INTEL 0.576f
+#define CL_VE_GAMUT_CHROMATICITY_CONTRL_GX_DEFAULT_INTEL 0.331f
+#define CL_VE_GAMUT_CHROMATICITY_CONTRL_BX_DEFAULT_INTEL 0.143f
+#define CL_VE_GAMUT_CHROMATICITY_CONTRL_RY_DEFAULT_INTEL 0.343f
+#define CL_VE_GAMUT_CHROMATICITY_CONTRL_GY_DEFAULT_INTEL 0.555f
+#define CL_VE_GAMUT_CHROMATICITY_CONTRL_BY_DEFAULT_INTEL 0.104f
+#define CL_VE_GAMUT_CHROMATICITY_CONTRL_RX_SRGB_INTEL 0.640f
+#define CL_VE_GAMUT_CHROMATICITY_CONTRL_GX_SRGB_INTEL 0.300f
+#define CL_VE_GAMUT_CHROMATICITY_CONTRL_BX_SRGB_INTEL 0.150f
+#define CL_VE_GAMUT_CHROMATICITY_CONTRL_RY_SRGB_INTEL 0.330f
+#define CL_VE_GAMUT_CHROMATICITY_CONTRL_GY_SRGB_INTEL 0.600f
+#define CL_VE_GAMUT_CHROMATICITY_CONTRL_BY_SRGB_INTEL 0.060f
+
+// Constants for gamut expansion / color correction; negative values are parenthesized so they expand safely inside any expression
+#define CL_VE_GECC_PIECE_COUNT_INTEL 11
+#define CL_VE_GECC_TX_COEFFICIENTS_MIN_INTEL (-4.0f)
+#define CL_VE_GECC_TX_COEFFICIENTS_MAX_INTEL 4.0f
+#define CL_VE_GECC_TX_COEFF_C0_DEFAULT_INTEL 0.681f
+#define CL_VE_GECC_TX_COEFF_C1_DEFAULT_INTEL 0.278f
+#define CL_VE_GECC_TX_COEFF_C2_DEFAULT_INTEL 0.008f
+#define CL_VE_GECC_TX_COEFF_C3_DEFAULT_INTEL 0.017f
+#define CL_VE_GECC_TX_COEFF_C4_DEFAULT_INTEL 0.894f
+#define CL_VE_GECC_TX_COEFF_C5_DEFAULT_INTEL (-0.012f)
+#define CL_VE_GECC_TX_COEFF_C6_DEFAULT_INTEL (-0.002f)
+#define CL_VE_GECC_TX_COEFF_C7_DEFAULT_INTEL 0.041f
+#define CL_VE_GECC_TX_COEFF_C8_DEFAULT_INTEL 0.838f
+#define CL_VE_GECC_TX_OFFSET_IN_MIN_INTEL (-16384)
+#define CL_VE_GECC_TX_OFFSET_IN_MAX_INTEL 16383
+#define CL_VE_GECC_TX_OFFSET_OUT_MIN_INTEL (-4.0f)
+#define CL_VE_GECC_TX_OFFSET_OUT_MAX_INTEL 4.0f
+
+// AOI Parameter defaults
+#define CL_VE_AOI_RANGE_DEFAULT_INTEL 0
+#define CL_VE_AOI_ALPHA_DEFAULT_INTEL 0
+
+// CCM Config Parameter Range
+#define CL_VE_CCM_COEFFICIENTS_MIN_INTEL (-16.0f)
+#define CL_VE_CCM_COEFFICIENTS_MAX_INTEL 16.0f
+#define CL_VE_CCM_COEFFICIENTS_DEFAULT_INTEL 0.0f
+
+// CSC Config Parameter Range
+#define CL_VE_CSC_OFFSET_MIN_INTEL (-256.0f)
+#define CL_VE_CSC_OFFSET_MAX_INTEL 256.0f
+#define CL_VE_CSC_COEFF_MIN_INTEL (-4.0f)
+#define CL_VE_CSC_COEFF_MAX_INTEL 4.0f
+
+// Constants for specific color spaces (presumably the values of cl_ve_gamut_comp_attrib_intel::src_color_space - confirm)
+#define CL_VE_GAMUT_CS_BT601_INTEL 0x0
+#define CL_VE_GAMUT_CS_BT709_INTEL 0x1
+#define CL_VE_GAMUT_CS_XVYCC601_INTEL 0x2
+#define CL_VE_GAMUT_CS_XVYCC709_INTEL 0x3
+
+// LACE/ACE control: piece count sizes the luma_in/luma_out arrays of cl_ve_ace_advanced_attrib_intel; the rest are ranges and defaults
+#define CL_VE_ACE_PIECE_COUNT_INTEL 10
+#define CL_VE_ACE_LEVEL_MIN_INTEL 0
+#define CL_VE_ACE_LEVEL_MAX_INTEL 9
+#define CL_VE_ACE_LEVEL_DEFAULT_INTEL 5
+#define CL_VE_ACE_STRENGTH_MIN_INTEL 0
+#define CL_VE_ACE_STRENGTH_MAX_INTEL 6
+#define CL_VE_ACE_STRENGTH_DEFAULT_INTEL 1
+#define CL_VE_ACE_SKIN_THRESHOLD_MIN_INTEL 1
+#define CL_VE_ACE_SKIN_THRESHOLD_MAX_INTEL 31
+#define CL_VE_ACE_SKIN_THRESHOLD_DEFAULT_INTEL 26
+
+// TCC parameter range (presumably shared by all six saturation fields of cl_ve_tcc_attrib_intel - confirm)
+#define CL_VE_TCC_MIN_INTEL 0
+#define CL_VE_TCC_MAX_INTEL 255
+#define CL_VE_TCC_DEFAULT_INTEL 220
+
+// Proc-Amp ranges (min / max / default per control); negative values are parenthesized so they expand safely inside any expression
+#define CL_VE_PROCAMP_BRIGHTNESS_MIN_INTEL (-100.0f)
+#define CL_VE_PROCAMP_BRIGHTNESS_MAX_INTEL 100.0f
+#define CL_VE_PROCAMP_BRIGHTNESS_DEFAULT_INTEL 0.0f
+
+#define CL_VE_PROCAMP_CONTRAST_MIN_INTEL 0.0f
+#define CL_VE_PROCAMP_CONTRAST_MAX_INTEL 15.0f
+#define CL_VE_PROCAMP_CONTRAST_DEFAULT_INTEL 1.0f
+
+#define CL_VE_PROCAMP_HUE_MIN_INTEL (-180.0f)
+#define CL_VE_PROCAMP_HUE_MAX_INTEL 180.0f
+#define CL_VE_PROCAMP_HUE_DEFAULT_INTEL 0.0f
+
+#define CL_VE_PROCAMP_SATURATION_MIN_INTEL 0.0f
+#define CL_VE_PROCAMP_SATURATION_MAX_INTEL 8.0f
+#define CL_VE_PROCAMP_SATURATION_DEFAULT_INTEL 1.0f
+
+// BLC Parameter Range
+#define CL_VE_BLC_MIN_INTEL (-65536)
+#define CL_VE_BLC_MAX_INTEL 65535
+#define CL_VE_BLC_DEFAULT_INTEL 0
+
+// WBC Parameter Range
+#define CL_VE_WBC_MIN_INTEL 0.0f
+#define CL_VE_WBC_MAX_INTEL 16.0f
+#define CL_VE_WBC_DEFAULT_INTEL 0.0f
+
+// FGC Parameter Range
+#define CL_VE_FGC_DEFAULT_INTEL 0
+
+// Video enhancement kernel flags: single-bit values (bits 0..6) that can be OR-ed together
+#define CL_VE_FIRST_FRAME_INTEL (1 << 0)
+#define CL_VE_RESET_DN_HISTORY_INTEL (1 << 1)
+#define CL_VE_RESET_DI_HISTORY_INTEL (1 << 2)
+#define CL_VE_RESET_ACE_HISTORY_INTEL (1 << 3)
+#define CL_VE_RESET_STE_HISTORY_INTEL (1 << 4)
+#define CL_VE_GENERATE_LACE_HISTOGRAM_128_BINS_INTEL (1 << 5)
+#define CL_VE_GENERATE_LACE_HISTOGRAM_256_BINS_INTEL (1 << 6)
+
+// Bayer pattern controls (presumably the values of cl_ve_demosaic_attrib_intel::bayer_pattern_format and ::bayer_pattern_offset - confirm)
+#define CL_VE_BAYER_PATTERN_FORMAT_8BIT_INTEL 0x0
+#define CL_VE_BAYER_PATTERN_FORMAT_16BIT_INTEL 0x1
+#define CL_VE_BAYER_PATTERN_OFFSET_BG_INTEL 0x0
+#define CL_VE_BAYER_PATTERN_OFFSET_RG_INTEL 0x1
+#define CL_VE_BAYER_PATTERN_OFFSET_GR_INTEL 0x2
+#define CL_VE_BAYER_PATTERN_OFFSET_GB_INTEL 0x3
+
+// Default color-space conversion coefficients: three input offsets, a 3x3 matrix (TX_COEFF_<row>_<col>, presumably matching matrix[3][3] in the CSC structs - confirm) and three output offsets. Unlike the other macros in this header, these names carry no _INTEL suffix.
+#define CL_VE_CSC_DEFAULT_YUV_TO_RGB_OFFSET_IN_0 (-16.0f)
+#define CL_VE_CSC_DEFAULT_YUV_TO_RGB_OFFSET_IN_1 (-128.0f)
+#define CL_VE_CSC_DEFAULT_YUV_TO_RGB_OFFSET_IN_2 (-128.0f)
+#define CL_VE_CSC_DEFAULT_YUV_TO_RGB_TX_COEFF_0_0 (1.164f)
+#define CL_VE_CSC_DEFAULT_YUV_TO_RGB_TX_COEFF_0_1 (0.0f)
+#define CL_VE_CSC_DEFAULT_YUV_TO_RGB_TX_COEFF_0_2 (1.596f)
+#define CL_VE_CSC_DEFAULT_YUV_TO_RGB_TX_COEFF_1_0 (1.164f)
+#define CL_VE_CSC_DEFAULT_YUV_TO_RGB_TX_COEFF_1_1 (-0.392f)
+#define CL_VE_CSC_DEFAULT_YUV_TO_RGB_TX_COEFF_1_2 (-0.813f)
+#define CL_VE_CSC_DEFAULT_YUV_TO_RGB_TX_COEFF_2_0 (1.164f)
+#define CL_VE_CSC_DEFAULT_YUV_TO_RGB_TX_COEFF_2_1 (2.017f)
+#define CL_VE_CSC_DEFAULT_YUV_TO_RGB_TX_COEFF_2_2 (0.0f)
+#define CL_VE_CSC_DEFAULT_YUV_TO_RGB_OFFSET_OUT_0 (0.0f)
+#define CL_VE_CSC_DEFAULT_YUV_TO_RGB_OFFSET_OUT_1 (0.0f)
+#define CL_VE_CSC_DEFAULT_YUV_TO_RGB_OFFSET_OUT_2 (0.0f)
+
+#define CL_VE_CSC_DEFAULT_RGB_TO_YUV_OFFSET_IN_0 (0.0f)
+#define CL_VE_CSC_DEFAULT_RGB_TO_YUV_OFFSET_IN_1 (0.0f)
+#define CL_VE_CSC_DEFAULT_RGB_TO_YUV_OFFSET_IN_2 (0.0f)
+#define CL_VE_CSC_DEFAULT_RGB_TO_YUV_TX_COEFF_0_0 (0.257f)
+#define CL_VE_CSC_DEFAULT_RGB_TO_YUV_TX_COEFF_0_1 (0.504f)
+#define CL_VE_CSC_DEFAULT_RGB_TO_YUV_TX_COEFF_0_2 (0.098f)
+#define CL_VE_CSC_DEFAULT_RGB_TO_YUV_TX_COEFF_1_0 (-0.148f)
+#define CL_VE_CSC_DEFAULT_RGB_TO_YUV_TX_COEFF_1_1 (-0.291f)
+#define CL_VE_CSC_DEFAULT_RGB_TO_YUV_TX_COEFF_1_2 (0.439f)
+#define CL_VE_CSC_DEFAULT_RGB_TO_YUV_TX_COEFF_2_0 (0.439f)
+#define CL_VE_CSC_DEFAULT_RGB_TO_YUV_TX_COEFF_2_1 (-0.368f)
+#define CL_VE_CSC_DEFAULT_RGB_TO_YUV_TX_COEFF_2_2 (-0.071f)
+#define CL_VE_CSC_DEFAULT_RGB_TO_YUV_OFFSET_OUT_0 (16.0f)
+#define CL_VE_CSC_DEFAULT_RGB_TO_YUV_OFFSET_OUT_1 (128.0f)
+#define CL_VE_CSC_DEFAULT_RGB_TO_YUV_OFFSET_OUT_2 (128.0f)
+
+// Forward Gamma Correction controls: number of entries in each array of cl_ve_fgc_attrib_intel
+#define CL_VE_FWD_GAMMA_SEGMENT_COUNT 64
+
+typedef cl_uint cl_ve_accelerator_attrib_id; // type of cl_ve_attrib_desc_intel::attrib_id
+
+typedef struct _cl_ve_dn_attrib_intel { // denoise settings; presumably the payload for CL_VE_ACCELERATOR_ATTRIB_DENOISE_INTEL - confirm
+    cl_bool enable_luma;
+    cl_bool enable_chroma;
+    cl_bool auto_detect;
+    cl_uint denoise_factor; // presumably within CL_VE_DENOISE_FACTOR_MIN/MAX_INTEL (0..64, default 32) - confirm
+} cl_ve_dn_attrib_intel;
+
+typedef struct _cl_ve_di_attrib_intel { // deinterlace settings; presumably the payload for CL_VE_ACCELERATOR_ATTRIB_DEINTERLACE_INTEL - confirm
+    cl_bool enabled;
+    cl_bool motion_compensation_enabled;
+    cl_bool top_first; // presumably selects top-field-first ordering - confirm
+} cl_ve_di_attrib_intel;
+
+typedef struct _cl_ve_std_ste_attrib_intel { // skin tone detection/enhancement settings; presumably the payload for CL_VE_ACCELERATOR_ATTRIB_STD_STE_INTEL - confirm
+    cl_bool enabled;
+    cl_uint ste_factor; // presumably within CL_VE_STE_FACTOR_MIN/MAX_INTEL (0..10, default 3) - confirm
+    cl_bool write_std_decisions_only;
+} cl_ve_std_ste_attrib_intel;
+
+typedef struct _cl_ve_gamut_comp_attrib_intel { // gamut compression settings; presumably the payload for CL_VE_ACCELERATOR_ATTRIB_GAMUT_COMP_INTEL - confirm
+    cl_bool enabled;
+    cl_bool advanced_mode_enable;
+    cl_uint src_color_space; // presumably one of CL_VE_GAMUT_CS_*_INTEL - confirm
+    cl_float basic_mode_scaling_factor; // presumably within CL_VE_GAMUT_SCALING_FACTOR_MIN/MAX_INTEL (0.0..4.0) - confirm
+    cl_float display_rgb_x[3]; // three entries, presumably the R, G, B chromaticity x values - confirm
+    cl_float display_rgb_y[3]; // three entries, presumably the R, G, B chromaticity y values - confirm
+} cl_ve_gamut_comp_attrib_intel;
+
+typedef struct _cl_ve_gecc_attrib_intel { // gamut expansion / color correction settings; presumably the payload for CL_VE_ACCELERATOR_ATTRIB_GECC_INTEL - confirm
+    cl_bool enabled;
+    cl_bool use_advanced_mode;
+    cl_float matrix[3][3]; // nine coefficients; presumably defaulting to CL_VE_GECC_TX_COEFF_C0..C8_DEFAULT_INTEL - confirm
+    cl_int offset_in[3]; // integer offsets; presumably within CL_VE_GECC_TX_OFFSET_IN_MIN/MAX_INTEL - confirm
+    cl_float offset_out[3]; // float offsets; presumably within CL_VE_GECC_TX_OFFSET_OUT_MIN/MAX_INTEL - confirm
+    cl_uchar gamma_correction_in[CL_VE_GECC_PIECE_COUNT_INTEL]; // each of the four tables below holds CL_VE_GECC_PIECE_COUNT_INTEL (11) entries
+    cl_uchar gamma_correction_out[CL_VE_GECC_PIECE_COUNT_INTEL];
+    cl_uchar inv_gamma_correction_in[CL_VE_GECC_PIECE_COUNT_INTEL];
+    cl_uchar inv_gamma_correction_out[CL_VE_GECC_PIECE_COUNT_INTEL];
+} cl_ve_gecc_attrib_intel;
+
+typedef struct _cl_ve_ace_attrib_intel { // ACE settings; presumably the payload for CL_VE_ACCELERATOR_ATTRIB_ACE_INTEL - confirm
+    cl_bool enabled;
+    cl_uchar skin_threshold; // presumably within CL_VE_ACE_SKIN_THRESHOLD_MIN/MAX_INTEL (1..31, default 26) - confirm
+    cl_uint level; // presumably within CL_VE_ACE_LEVEL_MIN/MAX_INTEL (0..9, default 5) - confirm
+    cl_uint strength; // presumably within CL_VE_ACE_STRENGTH_MIN/MAX_INTEL (0..6, default 1) - confirm
+} cl_ve_ace_attrib_intel;
+
+typedef struct _cl_ve_ace_advanced_attrib_intel { // advanced ACE settings; presumably the payload for CL_VE_ACCELERATOR_ATTRIB_ACE_ADVANCED_INTEL - confirm
+    cl_bool enabled;
+    cl_uchar luma_min;
+    cl_uchar luma_max;
+    cl_uchar luma_in[CL_VE_ACE_PIECE_COUNT_INTEL]; // both tables hold CL_VE_ACE_PIECE_COUNT_INTEL (10) entries
+    cl_uchar luma_out[CL_VE_ACE_PIECE_COUNT_INTEL];
+} cl_ve_ace_advanced_attrib_intel;
+
+typedef struct _cl_ve_tcc_attrib_intel { // TCC settings; presumably the payload for CL_VE_ACCELERATOR_ATTRIB_TCC_INTEL - confirm
+    cl_bool enabled;
+    cl_uchar red_saturation; // the six saturation fields are presumably bounded by CL_VE_TCC_MIN/MAX_INTEL (0..255, default 220) - confirm
+    cl_uchar green_saturation;
+    cl_uchar blue_saturation;
+    cl_uchar cyan_saturation;
+    cl_uchar magenta_saturation;
+    cl_uchar yellow_saturation;
+} cl_ve_tcc_attrib_intel;
+
+typedef struct _cl_ve_procamp_attrib_intel { // Proc-Amp settings; presumably the payload for CL_VE_ACCELERATOR_ATTRIB_PROC_AMP_INTEL - confirm
+    cl_bool enabled;
+    cl_float brightness; // presumably bounded by CL_VE_PROCAMP_BRIGHTNESS_MIN/MAX_INTEL - confirm
+    cl_float contrast; // presumably bounded by CL_VE_PROCAMP_CONTRAST_MIN/MAX_INTEL - confirm
+    cl_float hue; // presumably bounded by CL_VE_PROCAMP_HUE_MIN/MAX_INTEL - confirm
+    cl_float saturation; // presumably bounded by CL_VE_PROCAMP_SATURATION_MIN/MAX_INTEL - confirm
+} cl_ve_procamp_attrib_intel;
+
+typedef struct _cl_ve_becsc_attrib_intel { // back-end CSC settings; presumably the payload for CL_VE_ACCELERATOR_ATTRIB_BACK_END_CSC_INTEL - confirm
+    cl_bool enabled;
+    cl_bool yuv_channel_swap; // present only here; cl_ve_fecsc_attrib_intel has no such field
+    cl_float offset_in[3]; // offsets are presumably bounded by CL_VE_CSC_OFFSET_MIN/MAX_INTEL - confirm
+    cl_float matrix[3][3]; // coefficients are presumably bounded by CL_VE_CSC_COEFF_MIN/MAX_INTEL - confirm
+    cl_float offset_out[3];
+} cl_ve_becsc_attrib_intel;
+
+typedef struct _cl_ve_aoi_alpha_attrib_intel { // AOI and alpha settings; presumably the payload for CL_VE_ACCELERATOR_ATTRIB_AOI_ALPHA_INTEL - confirm
+    cl_bool aoi_enabled;
+    cl_uint x_min; // x_min..y_max presumably bound the AOI rectangle - confirm units and inclusiveness
+    cl_uint x_max;
+    cl_uint y_min;
+    cl_uint y_max;
+    cl_bool alpha_enable; // NOTE(review): named *_enable while the AOI flag above is *_enabled
+    cl_ushort alpha_value;
+} cl_ve_aoi_alpha_attrib_intel;
+
+typedef struct _cl_ve_hpc_attrib_intel { // hot pixel correction settings; presumably the payload for CL_VE_ACCELERATOR_ATTRIB_HPC_INTEL - confirm
+    cl_bool enabled;
+    cl_uchar threshold; // presumably within CL_VE_HPC_THRESHOLD_MIN/MAX_INTEL (0..255) - confirm
+    cl_uchar count; // presumably within CL_VE_HPC_PIXEL_COUNT_MIN/MAX_INTEL (0..8) - confirm
+} cl_ve_hpc_attrib_intel;
+
+typedef struct _cl_ve_blc_attrib_intel { // BLC settings; presumably the payload for CL_VE_ACCELERATOR_ATTRIB_BLC_INTEL - confirm
+    cl_bool enabled;
+    cl_int black_point_offset_red; // the four offsets are presumably bounded by CL_VE_BLC_MIN/MAX_INTEL (-65536..65535) - confirm
+    cl_int black_point_offset_green_top;
+    cl_int black_point_offset_green_bottom;
+    cl_int black_point_offset_blue;
+} cl_ve_blc_attrib_intel;
+
+typedef struct _cl_ve_demosaic_attrib_intel { // demosaic settings; presumably the payload for CL_VE_ACCELERATOR_ATTRIB_DEMOSAIC_INTEL - confirm; note there is no enabled flag
+    cl_uint input_width;
+    cl_uint input_height;
+    cl_uint input_stride; // units (bytes or pixels) are not stated in this header - confirm
+    cl_uint bayer_pattern_offset; // presumably one of CL_VE_BAYER_PATTERN_OFFSET_*_INTEL - confirm
+    cl_uint bayer_pattern_format; // presumably one of CL_VE_BAYER_PATTERN_FORMAT_*_INTEL - confirm
+} cl_ve_demosaic_attrib_intel;
+
+typedef struct _cl_ve_wbc_attrib_intel { // WBC settings; presumably the payload for CL_VE_ACCELERATOR_ATTRIB_WBC_INTEL - confirm
+    cl_bool enabled;
+    cl_float white_balance_red_correction; // the four corrections are presumably bounded by CL_VE_WBC_MIN/MAX_INTEL (0.0..16.0) - confirm
+    cl_float white_balance_green_top_correction;
+    cl_float white_balance_green_bottom_correction;
+    cl_float white_balance_blue_correction;
+} cl_ve_wbc_attrib_intel;
+
+typedef struct _cl_ve_vignette_attrib_intel { // vignette settings; presumably the payload for CL_VE_ACCELERATOR_ATTRIB_VIGNETTE_INTEL - confirm
+    cl_bool enabled; // the only field: an on/off switch
+} cl_ve_vignette_attrib_intel;
+
+typedef struct _cl_ve_ccm_attrib_intel { // CCM settings; presumably the payload for CL_VE_ACCELERATOR_ATTRIB_CCM_INTEL - confirm
+    cl_bool enabled;
+    cl_float matrix[3][3]; // coefficients are presumably bounded by CL_VE_CCM_COEFFICIENTS_MIN/MAX_INTEL (-16.0..16.0) - confirm
+} cl_ve_ccm_attrib_intel;
+
+typedef struct _cl_ve_fgc_attrib_intel { // forward gamma correction settings; presumably the payload for CL_VE_ACCELERATOR_ATTRIB_FWD_GAMMA_CORRECTION_INTEL - confirm
+    cl_bool enabled;
+    cl_ushort pixel_value[CL_VE_FWD_GAMMA_SEGMENT_COUNT]; // all four tables hold CL_VE_FWD_GAMMA_SEGMENT_COUNT (64) entries
+    cl_ushort red_channel_corrected_value[CL_VE_FWD_GAMMA_SEGMENT_COUNT];
+    cl_ushort green_channel_corrected_value[CL_VE_FWD_GAMMA_SEGMENT_COUNT];
+    cl_ushort blue_channel_corrected_value[CL_VE_FWD_GAMMA_SEGMENT_COUNT];
+} cl_ve_fgc_attrib_intel;
+
+typedef struct _cl_ve_fecsc_attrib_intel { // front-end CSC settings; presumably the payload for CL_VE_ACCELERATOR_ATTRIB_FRONT_END_CSC_INTEL - confirm
+    cl_bool enabled;
+    cl_float offset_in[3]; // offsets are presumably bounded by CL_VE_CSC_OFFSET_MIN/MAX_INTEL - confirm
+    cl_float matrix[3][3]; // coefficients are presumably bounded by CL_VE_CSC_COEFF_MIN/MAX_INTEL - confirm
+    cl_float offset_out[3];
+} cl_ve_fecsc_attrib_intel;
+
+typedef struct _cl_ve_attrib_desc_intel { // one attribute: an id plus an untyped pointer to its data
+    cl_ve_accelerator_attrib_id attrib_id; // presumably one of CL_VE_ACCELERATOR_ATTRIB_*_INTEL - confirm
+    void *attrib_data; // presumably points at the cl_ve_*_attrib_intel struct matching attrib_id - confirm; ownership is not stated in this header
+} cl_ve_attrib_desc_intel;
+
+typedef struct _cl_ve_desc_intel { // a counted array of attribute descriptors
+    cl_uint attrib_count; // presumably the number of entries attribs points to - confirm
+    cl_ve_attrib_desc_intel *attribs;
+} cl_ve_desc_intel;
+
+typedef struct _cl_vignette_format_intel { // four 16-bit channel values; the only struct in this header with PascalCase field names
+    cl_ushort Red;
+    cl_ushort GreenTop;
+    cl_ushort Blue;
+    cl_ushort GreenBottom; // NOTE(review): field order is Red, GreenTop, Blue, GreenBottom - confirm it matches the expected data layout
+} cl_vignette_format_intel;
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif /* __CL_EXT_VEBOX_INTEL_H */
--- a/runtime/CMakeLists.txt
+++ b/runtime/CMakeLists.txt
@@ -0,0 +1,908 @@
+# Copyright (c) 2017, Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+cmake_minimum_required (VERSION 3.0)
+
+# Opt in to NEW behavior for each policy the running CMake knows about:
+# CMP0042 (MACOSX_RPATH enabled by default), CMP0063 (visibility for all target types).
+foreach (NEO_POLICY_ID CMP0042 CMP0063)
+  if (POLICY ${NEO_POLICY_ID})
+    cmake_policy (SET ${NEO_POLICY_ID} NEW)
+  endif ()
+endforeach ()
+
+project (neo)
+
+#set (CMAKE_CXX_VISIBILITY_PRESET default)
+#set (CMAKE_VISIBILITY_INLINES_HIDDEN 1)
+
+# Support for Windows Universal Drivers
+ENABLE_WUD()
+
+if(NOT WIN32)  # OS-specific per-GEN sources; referenced from RUNTIME_SRCS_GENX below
+  set(GEN_OS_SRC
+    linux/command_stream_receiver.cpp
+    linux/drm_engine_mapper.cpp
+  )
+else()
+  set(GEN_OS_SRC
+    windows/command_stream_receiver.cpp
+    windows/wddm_engine_mapper.cpp
+    windows/wddm.cpp
+  )
+endif()
+
+set(RUNTIME_SRCS_GENX
+  aub_command_stream_receiver.cpp
+  aub_mapper.h
+  aub_mem_dump.cpp
+  command_queue.cpp
+  device_enqueue.h
+  device_queue.cpp
+  command_stream_receiver_hw.cpp
+  hw_cmds.h
+  hw_cmds_generated.h
+  hw_helper.cpp
+  hw_info.cpp
+  hw_info.h
+  buffer.cpp
+  image.cpp
+  kernel_commands.cpp
+  preamble.cpp
+  preemption.cpp
+  reg_configs.h
+  sampler.cpp
+  scheduler_definitions.h
+  scheduler_igdrcl_built_in.inl
+  state_base_address.cpp
+  tbx_command_stream_receiver.cpp
+  ${GEN_OS_SRC}
+)
+if(NOT (TARGET ${BIKSIM_LIB_NAME}))  # each guarded sub-project is added only if its target does not exist yet
+  add_subdirectory("builtin_kernels_simulation")
+endif()
+
+if(NOT (TARGET ${SCHEDULER_BINARY_LIB_NAME}))
+  add_subdirectory("scheduler")
+endif()
+
+if(NOT (TARGET ${BUILTINS_BINARIES_LIB_NAME}))
+  add_subdirectory("built_ins")
+endif()
+
+add_subdirectory("api")  # api and accelerators are always added
+add_subdirectory("accelerators")
+
+set (RUNTIME_SRCS_AUB_MEM_DUMP
+  aub_mem_dump/aub_mem_dump.cpp
+  aub_mem_dump/aub_mem_dump.h
+  aub_mem_dump/aub_mem_dump.inl
+  aub_mem_dump/aub_header.h
+  aub_mem_dump/aub_services.h
+)
+
+
+set (RUNTIME_SRCS_BUILT_INS
+  built_ins/built_ins_storage.cpp
+  built_ins/built_ins.cpp
+  built_ins/built_ins.h
+  built_ins/sip.cpp
+  built_ins/sip.h
+  built_ins/vme_dispatch_builder.h
+)
+
+set (RUNTIME_SRCS_BUILT_IN_KERNELS
+  built_ins/kernels/copy_buffer_rect.igdrcl_built_in
+  built_ins/kernels/copy_buffer_to_buffer.igdrcl_built_in
+  built_ins/kernels/copy_buffer_to_image3d.igdrcl_built_in
+  built_ins/kernels/copy_image3d_to_buffer.igdrcl_built_in
+  built_ins/kernels/copy_image_to_image1d.igdrcl_built_in
+  built_ins/kernels/copy_image_to_image2d.igdrcl_built_in
+  built_ins/kernels/copy_image_to_image3d.igdrcl_built_in
+  built_ins/kernels/fill_buffer.igdrcl_built_in
+  built_ins/kernels/fill_image1d.igdrcl_built_in
+  built_ins/kernels/fill_image2d.igdrcl_built_in
+  built_ins/kernels/fill_image3d.igdrcl_built_in
+  built_ins/kernels/vme_block_motion_estimate_intel.igdrcl_built_in
+  built_ins/kernels/vme_block_advanced_motion_estimate_check_intel.igdrcl_built_in
+  built_ins/kernels/vme_block_advanced_motion_estimate_bidirectional_check_intel.igdrcl_built_in
+  built_ins/kernels/vebox_ve_enhance_intel.igdrcl_built_in
+  built_ins/kernels/vebox_ve_dn_enhance_intel.igdrcl_built_in
+  built_ins/kernels/vebox_ve_dn_di_enhance_intel.igdrcl_built_in
+)
+
+set (RUNTIME_SRCS_COMMANDS
+  commands/bxml_generator_glue.h
+)
+
+set (RUNTIME_SRCS_COMMAND_QUEUE
+  command_queue/cpu_data_transfer_handler.h
+  command_queue/command_queue.cpp
+  command_queue/command_queue.h
+  command_queue/command_queue_hw.h
+  command_queue/command_queue_hw.inl
+  command_queue/dispatch_walker.h
+  command_queue/dispatch_walker_helper.h
+  command_queue/dispatch_walker_helper.inl
+  command_queue/enqueue_barrier.h
+  command_queue/enqueue_common.h
+  command_queue/enqueue_copy_buffer.h
+  command_queue/enqueue_copy_buffer_rect.h
+  command_queue/enqueue_copy_buffer_to_image.h
+  command_queue/enqueue_copy_image.h
+  command_queue/enqueue_copy_image_to_buffer.h
+  command_queue/enqueue_fill_buffer.h
+  command_queue/enqueue_fill_image.h
+  command_queue/enqueue_kernel.h
+  command_queue/enqueue_map_buffer.h
+  command_queue/enqueue_map_image.h
+  command_queue/enqueue_svm.h
+  command_queue/enqueue_marker.h
+  command_queue/enqueue_migrate_mem_objects.h
+  command_queue/enqueue_read_buffer.h
+  command_queue/enqueue_read_buffer_rect.h
+  command_queue/enqueue_read_image.h
+  command_queue/enqueue_write_buffer.h
+  command_queue/enqueue_write_buffer_rect.h
+  command_queue/enqueue_write_image.h
+  command_queue/finish.h
+  command_queue/flush.h
+  command_queue/local_id_gen.cpp
+  command_queue/local_id_gen_avx2.cpp
+  command_queue/local_id_gen_sse4.cpp
+  command_queue/local_id_gen.h
+  command_queue/local_id_gen.inl
+  command_queue/local_work_size.cpp
+)
+
+set (RUNTIME_SRCS_COMMAND_STREAM
+  command_stream/aub_command_stream_receiver.cpp
+  command_stream/aub_command_stream_receiver.h
+  command_stream/aub_command_stream_receiver_hw.h
+  command_stream/aub_command_stream_receiver_hw.inl
+  command_stream/command_stream_receiver.cpp
+  command_stream/command_stream_receiver.h
+  command_stream/command_stream_receiver_hw.h
+  command_stream/command_stream_receiver_hw.inl
+  command_stream/csr_definitions.h
+  command_stream/device_command_stream.h
+  command_stream/linear_stream.cpp
+  command_stream/linear_stream.h
+  command_stream/submissions_aggregator.cpp
+  command_stream/submissions_aggregator.h
+  command_stream/tbx_command_stream_receiver.cpp
+  command_stream/tbx_command_stream_receiver.h
+  command_stream/tbx_command_stream_receiver_hw.h
+  command_stream/tbx_command_stream_receiver_hw.inl
+  command_stream/tbx_stream.cpp
+  command_stream/thread_arbitration_policy.h
+  command_stream/preemption.h
+  command_stream/preemption.cpp
+)
+
+set (RUNTIME_SRCS_COMPILER_INTERFACE
+  compiler_interface/binary_cache.cpp
+  compiler_interface/compiler_interface.cpp
+  compiler_interface/compiler_interface.h
+  compiler_interface/compiler_interface.inl
+  compiler_interface/create_main.cpp
+)
+
+set (RUNTIME_SRCS_CONTEXT
+  context/context.cpp
+  context/context.h
+  context/context.inl
+  context/driver_diagnostics.cpp
+  context/driver_diagnostics.h
+)
+
+set (RUNTIME_SRCS_DEVICE
+  device/device.cpp
+  device/device.h
+  device/device_caps.cpp
+  device/device_info.cpp
+  device/device_info.h
+  device/device_info_map.h
+  device/device_vector.h
+)
+
+set (RUNTIME_SRCS_DEVICE_QUEUE
+  device_queue/device_queue.cpp
+  device_queue/device_queue.h
+  device_queue/device_queue_hw.h
+  device_queue/device_queue_hw.inl
+  device_queue/device_queue_hw_profiling.inl
+)
+
+set (RUNTIME_SRCS_EVENT
+  event/async_events_handler.h
+  event/async_events_handler.cpp
+  event/event.cpp
+  event/event.h
+  event/event_builder.cpp
+  event/event_builder.h
+  event/event_registry.cpp
+  event/event_registry.h
+  event/user_event.cpp
+  event/user_event.h
+  event/hw_timestamps.h
+  event/perf_counter.h
+)
+
+set (RUNTIME_SRCS_EXECUTION_MODEL
+  execution_model/device_enqueue.h
+)
+
+if(GTPIN_HEADERS_DIR)
+  set (RUNTIME_SRCS_GTPIN
+    gtpin/gtpin_init.cpp
+    gtpin/gtpin_init.h
+    gtpin/gtpin_helpers.cpp
+    gtpin/gtpin_helpers.h
+  )
+endif(GTPIN_HEADERS_DIR)
+
+set (RUNTIME_SRCS_HELPERS
+  helpers/abort.h
+  helpers/aligned_memory.h
+  helpers/array_count.h
+  helpers/base_object.cpp
+  helpers/base_object.h
+  helpers/base_object_allocator.cpp
+  helpers/basic_math.h
+  helpers/cache_policy.cpp
+  helpers/cache_policy.h
+  helpers/dirty_state_helpers.h
+  helpers/dirty_state_helpers.cpp
+  helpers/dispatch_info.h
+  helpers/dispatch_info.cpp
+  helpers/dispatch_info_builder.h
+  helpers/completion_stamp.h
+  helpers/debug_helpers.h
+  helpers/engine_node.h
+  helpers/error_mappers.h
+  helpers/file_io.cpp
+  helpers/file_io.h
+  helpers/flush_stamp.h
+  helpers/flush_stamp.cpp
+  helpers/get_info.h
+  helpers/hash.h
+  helpers/hw_helper.cpp
+  helpers/hw_helper.h
+  helpers/hw_helper.inl
+  helpers/hw_info.cpp
+  helpers/hw_info.h
+  helpers/kernel_commands.h
+  helpers/kernel_commands.inl
+  helpers/options.cpp
+  helpers/options.h
+  helpers/per_thread_data.cpp
+  helpers/per_thread_data.h
+  helpers/preamble.h
+  helpers/preamble.inl
+  helpers/ptr_math.h
+  helpers/queue_helpers.h
+  helpers/sampler_helpers.h
+  helpers/selectors.h
+  helpers/state_base_address.h
+  helpers/state_base_address.inl
+  helpers/stdio.h
+  helpers/string.h
+  helpers/string_helpers.h
+  helpers/surface_formats.cpp
+  helpers/surface_formats.h
+  helpers/task_information.cpp
+  helpers/task_information.h
+  helpers/uint16_avx2.h
+  helpers/uint16_sse4.h
+  helpers/wddm_helper.h
+  helpers/validators.cpp
+  helpers/validators.h
+)
+
+if (WIN32)
+  list (APPEND RUNTIME_SRCS_HELPERS
+    helpers/translationtable_callbacks.h
+  )
+endif(WIN32)
+
+set (RUNTIME_SRCS_INDIRECT_HEAP
+  indirect_heap/indirect_heap.cpp
+  indirect_heap/indirect_heap.h
+)
+
+set (RUNTIME_SRCS_INSTRUMENTATION
+  instrumentation/instrumentation.cpp
+  instrumentation/instrumentation.h
+)
+
+set (RUNTIME_SRCS_KERNEL
+  kernel/dynamic_kernel_info.h
+  kernel/kernel.cpp
+  kernel/kernel.h
+  kernel/kernel.inl
+)
+
+set (RUNTIME_SRCS_MEMORY_MANAGER
+  memory_manager/deferrable_deletion.h
+  memory_manager/deferred_deleter.cpp
+  memory_manager/deferred_deleter.h
+  memory_manager/graphics_allocation.h
+  memory_manager/graphics_allocation.cpp
+  memory_manager/host_ptr_defines.h
+  memory_manager/host_ptr_manager.h
+  memory_manager/host_ptr_manager.cpp
+  memory_manager/memory_manager.cpp
+  memory_manager/memory_manager.h
+  memory_manager/svm_memory_manager.cpp
+  memory_manager/svm_memory_manager.h
+  memory_manager/os_agnostic_memory_manager.cpp
+  memory_manager/os_agnostic_memory_manager.h
+  memory_manager/page_table.cpp
+  memory_manager/page_table.h
+  memory_manager/address_mapper.cpp
+  memory_manager/address_mapper.h
+  memory_manager/surface.h
+)
+
+set (RUNTIME_SRCS_GMM_HELPER
+  gmm_helper/gmm_helper.cpp
+  gmm_helper/gmm_helper.h
+  gmm_helper/gmm_lib.h
+  gmm_helper/resource_info.h
+ )
+
+if (WIN32)
+  list (APPEND RUNTIME_SRCS_GMM_HELPER
+    gmm_helper/page_table_mngr.h
+    gmm_helper/gmm_memory.h
+  )
+endif(WIN32)
+
+set (RUNTIME_SRCS_MEM_OBJ
+  mem_obj/buffer.cpp
+  mem_obj/buffer.h
+  mem_obj/buffer.inl
+  mem_obj/image.cpp
+  mem_obj/image.h
+  mem_obj/image.inl
+  mem_obj/mem_obj.cpp
+  mem_obj/mem_obj.h
+  mem_obj/buffer_factory_init.inl
+  mem_obj/image_factory_init.inl
+  mem_obj/pipe.h
+  mem_obj/pipe.cpp
+)
+
+set (RUNTIME_SRCS_OS_INTERFACE
+  os_interface/32bit_memory.h
+  os_interface/os_library.h
+  os_interface/linux/linux_inc.h
+  os_interface/windows/windows_inc.h
+  os_interface/device_factory.h
+  os_interface/os_inc.h
+  os_interface/os_interface.h
+  os_interface/os_time.h
+  os_interface/os_time.cpp
+  os_interface/debug_settings_manager.cpp
+  os_interface/debug_settings_manager.h
+  os_interface/performance_counters.cpp
+  os_interface/performance_counters.h
+  os_interface/print.h
+)
+
+set (RUNTIME_SRCS_PLATFORM
+  platform/platform.cpp
+  platform/platform.h
+  platform/platform_info.h
+)
+
+set (RUNTIME_SRCS_PROGRAM
+  program/block_kernel_manager.cpp
+  program/block_kernel_manager.h
+  program/build.cpp
+  program/compile.cpp
+  program/create.cpp
+  program/get_info.cpp
+  program/heap_info.h
+  program/kernel_arg_info.h
+  program/kernel_info.cpp
+  program/kernel_info.h
+  program/link.cpp
+  program/patch_info.h
+  program/process_elf_binary.cpp
+  program/process_spir_binary.cpp
+  program/process_gen_binary.cpp
+  program/program.cpp
+  program/program.h
+  program/printf_handler.h
+  program/printf_handler.cpp
+  program/print_formatter.h
+  program/print_formatter.cpp
+)
+
+set (RUNTIME_SRCS_SAMPLER
+  sampler/sampler.cpp
+  sampler/sampler.h
+  sampler/sampler.inl
+  sampler/sampler_factory_init.inl
+)
+
+list (APPEND RUNTIME_SRCS_SCHEDULER
+  scheduler/scheduler_kernel.cpp
+  scheduler/scheduler_kernel.h
+  scheduler/CMakeLists.txt
+)
+
+set (RUNTIME_SRCS_SHARINGS
+  sharings/sharing.h
+  sharings/sharing.cpp
+)
+
+set (RUNTIME_SRCS_TBX
+  tbx/tbx_proto.h
+  tbx/tbx_sockets.cpp
+  tbx/tbx_sockets.h
+)
+
+set (RUNTIME_SRCS_UTILITIES
+  utilities/api_intercept.h
+  utilities/arrayref.h
+  utilities/cpu_info.h
+  utilities/debug_file_reader.cpp
+  utilities/debug_file_reader.h
+  utilities/debug_settings_reader.cpp
+  utilities/debug_settings_reader.h
+  utilities/directory.h
+  utilities/heap_allocator.cpp
+  utilities/heap_allocator.h
+  utilities/iflist.h
+  utilities/idlist.h
+  utilities/stackvec.h
+  utilities/perf_profiler.cpp
+  utilities/perf_profiler.h
+  utilities/reference_tracked_object.h
+  utilities/tag_allocator.h
+  utilities/timer_util.h
+  utilities/vec.h
+)
+
+set (RUNTIME_SRCS_GEN_COMMON
+  gen_common/aub_mapper.h
+  gen_common/aub_mapper_base.h
+  gen_common/hw_cmds.h
+  gen_common/reg_configs.h
+)
+
+if (WIN32)
+  list (APPEND RUNTIME_SRCS_UTILITIES
+    utilities/windows/directory.cpp
+    utilities/windows/timer_util.cpp
+    utilities/windows/cpu_info.cpp
+  )
+else(WIN32)
+  list (APPEND RUNTIME_SRCS_UTILITIES
+    utilities/linux/directory.cpp
+    utilities/linux/timer_util.cpp
+    utilities/linux/cpu_info.cpp
+  )
+endif (WIN32)
+
+if (WIN32)
+  list (APPEND RUNTIME_SRCS_OS_INTERFACE
+    os_interface/windows/api.cpp
+    os_interface/windows/d3d_sharing_functions.h
+    os_interface/windows/d3d9_sharing_functions.cpp
+    os_interface/windows/d3d10_11_sharing_functions.cpp
+    os_interface/windows/debug_registry_reader.cpp
+    os_interface/windows/deferrable_deletion_win.cpp
+    os_interface/windows/deferrable_deletion_win.h
+    os_interface/windows/device_command_stream.inl
+    os_interface/windows/device_factory.cpp
+    os_interface/windows/gdi_interface.cpp
+    os_interface/windows/gdi_interface.h
+    os_interface/windows/options.cpp
+    os_interface/windows/os_interface.cpp
+    os_interface/windows/os_interface.h
+    os_interface/windows/os_library.cpp
+    os_interface/windows/os_library.h
+    os_interface/windows/os_time.cpp
+    os_interface/windows/os_time.h
+    os_interface/windows/registry_reader.h
+    os_interface/windows/thk_wrapper.h
+    os_interface/windows/wddm.cpp
+    os_interface/windows/wddm.h
+    os_interface/windows/wddm.inl
+    os_interface/windows/wddm_32bit_memory.cpp
+    os_interface/windows/wddm_allocation.h
+    os_interface/windows/wddm_device_command_stream.inl
+    os_interface/windows/wddm_device_command_stream.h
+    os_interface/windows/wddm_engine_mapper.h
+    os_interface/windows/wddm_memory_manager.cpp
+    os_interface/windows/wddm_memory_manager.h
+    os_interface/windows/windows_inc.cpp
+    os_interface/windows/windows_wrapper.h
+    os_interface/windows/performance_counters_win.cpp
+    os_interface/windows/performance_counters_win.h
+    os_interface/windows/print.cpp
+    os_interface/windows/driver_info.h
+    os_interface/windows/driver_info.cpp
+  )
+
+  if ("${IGDRCL_OPTION__BITS}" STREQUAL "32" )
+    set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /SAFESEH:NO")
+  endif ("${IGDRCL_OPTION__BITS}" STREQUAL "32")
+endif (WIN32)
+
+if (UNIX)
+  list (APPEND RUNTIME_SRCS_OS_INTERFACE
+    os_interface/linux/api.cpp
+    os_interface/linux/d3d_sharing_functions.h
+    os_interface/linux/debug_env_reader.cpp
+    os_interface/linux/device_command_stream.inl
+    os_interface/linux/device_factory.cpp
+    os_interface/linux/drm_32bit_memory.cpp
+    os_interface/linux/drm_allocation.h
+    os_interface/linux/drm_buffer_object.cpp
+    os_interface/linux/drm_buffer_object.h
+    os_interface/linux/drm_command_stream.inl
+    os_interface/linux/drm_command_stream.h
+    os_interface/linux/drm_engine_mapper.h
+    os_interface/linux/drm_null_device.h
+    os_interface/linux/drm_gem_close_worker.cpp
+    os_interface/linux/drm_gem_close_worker.h
+    os_interface/linux/drm_memory_manager.cpp
+    os_interface/linux/drm_memory_manager.h
+    os_interface/linux/drm_neo.cpp
+    os_interface/linux/drm_neo.h
+    os_interface/linux/drm_neo_create.cpp
+    os_interface/linux/hw_info_config.cpp
+    os_interface/linux/hw_info_config.h
+    os_interface/linux/linux_inc.cpp
+    os_interface/linux/options.cpp
+    os_interface/linux/os_interface.cpp
+    os_interface/linux/os_interface.h
+    os_interface/linux/os_library.cpp
+    os_interface/linux/os_library.h
+    os_interface/linux/os_time.cpp
+    os_interface/linux/os_time.h
+    os_interface/linux/performance_counters_linux.cpp
+    os_interface/linux/performance_counters_linux.h
+    os_interface/linux/print.cpp
+    os_interface/linux/driver_info.cpp
+  )
+endif (UNIX)
+
+add_subdirectory("sharings")
+
+set (RUNTIME_SRCS
+  ${RUNTIME_SRCS_API}
+  ${RUNTIME_SRCS_ACCELERATORS}
+  ${RUNTIME_SRCS_AUB_MEM_DUMP}
+  ${RUNTIME_SRCS_BUILT_INS}
+  ${RUNTIME_SRCS_BUILT_IN_KERNELS}
+  ${RUNTIME_SRCS_COMMANDS}
+  ${RUNTIME_SRCS_COMMAND_QUEUE}
+  ${RUNTIME_SRCS_COMMAND_STREAM}
+  ${RUNTIME_SRCS_COMPILER_INTERFACE}
+  ${RUNTIME_SRCS_CONTEXT}
+  ${RUNTIME_SRCS_DEVICE}
+  ${RUNTIME_SRCS_DEVICE_QUEUE}
+  ${RUNTIME_SRCS_EVENT}
+  ${RUNTIME_SRCS_EXECUTION_MODEL}
+  ${RUNTIME_SRCS_GEN_COMMON}
+  ${RUNTIME_SRCS_GTPIN}
+  ${RUNTIME_SRCS_HELPERS}
+  ${RUNTIME_SRCS_INDIRECT_HEAP}
+  ${RUNTIME_SRCS_INSTRUMENTATION}
+  ${RUNTIME_SRCS_KERNEL}
+  ${RUNTIME_SRCS_MEMORY_MANAGER}
+  ${RUNTIME_SRCS_GMM_HELPER}
+  ${RUNTIME_SRCS_MEM_OBJ}
+  ${RUNTIME_SRCS_OS_INTERFACE}
+  ${RUNTIME_SRCS_PLATFORM}
+  ${RUNTIME_SRCS_PROGRAM}
+  ${RUNTIME_SRCS_SAMPLER}
+  ${RUNTIME_SRCS_SCHEDULER}
+  ${RUNTIME_SRCS_SHARINGS}
+  ${RUNTIME_SRCS_TBX}
+  ${RUNTIME_SRCS_UTILITIES}
+  CMakeLists.txt
+)
+
+# Enable SSE4/AVX2 options for files that need them
+if(NOT MSVC)  # GCC/Clang-style flags; each SIMD translation unit gets its own -m option
+  set_source_files_properties(command_queue/local_id_gen_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
+  set_source_files_properties(command_queue/local_id_gen_sse4.cpp PROPERTIES COMPILE_FLAGS -msse4.2)
+else()  # MSVC: only the AVX2 file receives an /arch switch
+  set_source_files_properties(command_queue/local_id_gen_avx2.cpp PROPERTIES COMPILE_FLAGS /arch:AVX2)
+endif()
+
+# Put Driver version into define
+if(NEO_DRIVER_VERSION)  # the define is attached to device_caps.cpp only, and only when a version was configured
+  set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/device/device_caps.cpp PROPERTIES COMPILE_DEFINITIONS NEO_DRIVER_VERSION="${NEO_DRIVER_VERSION}")
+endif()
+
+list (APPEND HW_SRC_INCLUDES ${CMAKE_CURRENT_SOURCE_DIR}/gen_common)
+
+# Include/enable each GEN
+# Reverse order so that GEN N+1 includes GEN N
+foreach(GEN_NUM RANGE ${MAX_GEN} 0 -1)  # counts down from MAX_GEN to 0
+  GEN_CONTAINS_PLATFORMS("SUPPORTED" ${GEN_NUM} GENX_HAS_PLATFORMS)  # helper defined outside this file; presumably writes GENX_HAS_PLATFORMS
+  if(${GENX_HAS_PLATFORMS})
+    # Add GEN-specific files (optional RUNTIME_SRCS_GEN<N>_SPECIFIC list, empty if never set)
+    set(RUNTIME_SRCS_GEN${GEN_NUM} ${RUNTIME_SRCS_GEN${GEN_NUM}_SPECIFIC})
+
+    # Add default GEN files: every RUNTIME_SRCS_GENX entry, prefixed with gen<N>/
+    foreach(SRC_IT ${RUNTIME_SRCS_GENX})
+      list (APPEND RUNTIME_SRCS_GEN${GEN_NUM} gen${GEN_NUM}/${SRC_IT})
+    endforeach(SRC_IT)
+
+    # Get all supported platforms for this GEN
+    GET_PLATFORMS_FOR_GEN("SUPPORTED" ${GEN_NUM} SUPPORTED_GENX_PLATFORMS)  # helper defined outside this file
+
+    # Add platform-specific files
+    foreach(PLATFORM_IT ${SUPPORTED_GENX_PLATFORMS})
+      string(TOLOWER ${PLATFORM_IT} PLATFORM_IT_LOWER)  # file names use the lower-case platform name
+      list(APPEND RUNTIME_SRCS_GEN${GEN_NUM} gen${GEN_NUM}/hw_cmds_${PLATFORM_IT_LOWER}.h)
+      list(APPEND RUNTIME_SRCS_GEN${GEN_NUM} gen${GEN_NUM}/hw_info_${PLATFORM_IT_LOWER}.cpp)
+      list(APPEND RUNTIME_SRCS_GEN${GEN_NUM} ${RUNTIME_SRCS_GEN${GEN_NUM}_${PLATFORM_IT}})  # optional extra per-platform list, keyed by the original-case name
+      if(UNIX)
+        list(APPEND RUNTIME_SRCS_GEN${GEN_NUM} gen${GEN_NUM}/linux/hw_info_config_${PLATFORM_IT_LOWER}.cpp)
+      endif(UNIX)
+
+      # Enable platform (these files go to the *_SRC_LINK list, not to the static library sources)
+      list(APPEND GEN${GEN_NUM}_SRC_LINK gen${GEN_NUM}/enable_${PLATFORM_IT_LOWER}.cpp)
+      if(UNIX)
+        list(APPEND GEN${GEN_NUM}_SRC_LINK gen${GEN_NUM}/linux/enable_${PLATFORM_IT_LOWER}.cpp)
+      endif(UNIX)
+    endforeach(PLATFORM_IT)
+
+    list(APPEND GEN${GEN_NUM}_SRC_LINK gen${GEN_NUM}/enable_family_full.cpp)
+
+    # Append this GEN's sources to the list of all sources; entries missing on disk are skipped silently
+    foreach(SRC_IT ${RUNTIME_SRCS_GEN${GEN_NUM}})
+      if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/${SRC_IT}")
+        list(APPEND RUNTIME_SRCS ${SRC_IT})
+      endif(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/${SRC_IT}")
+    endforeach(SRC_IT)
+
+    # Set-up gen include dir and sources for the dll (the *_SRC_LINK files get no EXISTS check)
+    list(APPEND HW_SRC_INCLUDES ${CMAKE_CURRENT_SOURCE_DIR}/gen${GEN_NUM})
+    list(APPEND HW_SRC_LINK ${GEN${GEN_NUM}_SRC_LINK})
+
+    source_group("gen${GEN_NUM}" FILES ${RUNTIME_SRCS_GEN${GEN_NUM}} ${GEN${GEN_NUM}_SRC_LINK} )
+  endif(${GENX_HAS_PLATFORMS})
+endforeach(GEN_NUM)
+
+add_library(${NEO_STATIC_LIB_NAME} STATIC $<TARGET_OBJECTS:${BIKSIM_LIB_NAME}>  # static runtime library: object files of BIKSIM_LIB_NAME plus RUNTIME_SRCS
+  ${RUNTIME_SRCS}
+)
+
+target_link_libraries(${NEO_STATIC_LIB_NAME} elflib)  # NOTE(review): elflib is not defined in this file; presumably a target from another directory
+
+target_include_directories(${NEO_STATIC_LIB_NAME} PRIVATE
+  ${CMAKE_CURRENT_BINARY_DIR}
+  ${GMM_INCLUDE_PATHS}
+  ${UMKM_SHAREDDATA_INCLUDE_PATHS}
+)
+
+target_include_directories(${NEO_STATIC_LIB_NAME} PUBLIC
+	${KHRONOS_HEADERS_DIR}
+	${IGDRCL__IGC_INCLUDE_DIR}
+	${THIRD_PARTY_DIR}
+)
+
+if(GTPIN_HEADERS_DIR)
+  target_include_directories(${NEO_STATIC_LIB_NAME} PUBLIC
+	${GTPIN_HEADERS_DIR}
+  )
+endif(GTPIN_HEADERS_DIR)
+
+if (WIN32)
+  target_include_directories(${NEO_STATIC_LIB_NAME} PUBLIC
+    ${WDK_INCLUDE_PATHS}
+    os_interface/windows
+  )
+  target_compile_definitions(${NEO_STATIC_LIB_NAME} PUBLIC OGL=1)
+  target_compile_definitions(${NEO_STATIC_LIB_NAME} PUBLIC INSTR_WIN_UMD=1)
+
+endif (WIN32)
+
+if (UNIX)
+  target_compile_definitions(${NEO_STATIC_LIB_NAME} PUBLIC OGL_GEM=1)
+  target_include_directories(${NEO_STATIC_LIB_NAME} PUBLIC
+    os_interface/linux
+    "${LIBDRM_DIR}/include"
+  )
+endif (UNIX)
+
+# cl_khr_priority_hints support
+if(NOT MSVC)  # priority hints are enabled for every non-MSVC toolchain
+  target_compile_definitions(${NEO_STATIC_LIB_NAME} PUBLIC SUPPORT_PRIORITY_HINTS)  # bare name; the command supplies the -D itself
+  message(STATUS "Supporting priority hints")
+endif()
+
+target_compile_definitions(${NEO_STATIC_LIB_NAME} PUBLIC DEFAULT_PLATFORM=${DEFAULT_SUPPORTED_PLATFORM})
+
+link_directories(${GMM_LIB_PATHS})  # NOTE(review): per CMake docs this only applies to targets created after this call; ${NEO_STATIC_LIB_NAME} already exists here
+
+if(NOT GMMUMD_LIB_NAME)  # fall back to the default library name only when none was provided
+  set(GMMUMD_LIB_NAME "gmm_umd" CACHE STRING "name of gmm static library")
+endif()
+target_link_libraries(${NEO_STATIC_LIB_NAME} ${GMMUMD_LIB_NAME})
+
+if( NOT "${IGDRCL_OPTION__BITS}" STREQUAL "32" )  # pick the export definition file by bitness; anything other than "32" gets the 64-bit file
+  set( DEF_FILE "${CMAKE_CURRENT_SOURCE_DIR}/dll/windows/OpenCLExports64.def" )
+else()
+  set( DEF_FILE "${CMAKE_CURRENT_SOURCE_DIR}/dll/windows/OpenCLExports32.def" )
+endif()
+
+list(APPEND LIB_FLAGS_DEFINITIONS -DCIF_HEADERS_ONLY_BUILD ${SUPPORTED_GEN_FLAGS_DEFINITONS})
+
+target_compile_definitions(${NEO_STATIC_LIB_NAME} PUBLIC ${LIB_FLAGS_DEFINITIONS})
+if(IGC_OCL_ADAPTOR_DIR) # IGC/AdaptorOCL
+  target_include_directories("${NEO_STATIC_LIB_NAME}" PUBLIC "${IGC_OCL_ADAPTOR_DIR}")
+endif(IGC_OCL_ADAPTOR_DIR)
+
+if(CIF_BASE_DIR)
+  target_include_directories("${NEO_STATIC_LIB_NAME}" PUBLIC "${CIF_BASE_DIR}")
+endif(CIF_BASE_DIR)
+
+set(IGDRCL_LIB_FLAGS_DEFINITIONS ${LIB_FLAGS_DEFINITIONS} PARENT_SCOPE)
+set_target_properties(${NEO_STATIC_LIB_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
+set_property(TARGET ${NEO_STATIC_LIB_NAME} APPEND_STRING PROPERTY COMPILE_FLAGS " ${ASAN_FLAGS} ${TSAN_FLAGS}")  # APPEND_STRING inserts no separator, so pass one space-delimited string with a leading space
+
+set_target_properties(${NEO_STATIC_LIB_NAME} PROPERTIES FOLDER "opencl runtime")
+
+target_include_directories(${NEO_STATIC_LIB_NAME} BEFORE PRIVATE ${HW_SRC_INCLUDES})
+
+if(${GENERATE_EXECUTABLE})
+	set (RUNTIME_SRCS_DLL
+	  dll/options.cpp
+	  dll/create_command_stream.cpp
+	  dll/create_deferred_deleter.cpp
+	  helpers/abort.cpp
+	  helpers/debug_helpers.cpp
+	  gmm_helper/resource_info.cpp
+	  program/evaluate_unhandled_token.cpp
+	  "${DEF_FILE}"
+	)
+
+	list (APPEND RUNTIME_SRCS_DLL ${HW_SRC_LINK})
+
+	if (UNIX)
+		list (APPEND RUNTIME_SRCS_DLL dll/linux/drm_neo_create.cpp)
+	endif (UNIX)
+
+	if (WIN32)
+	  list (APPEND RUNTIME_SRCS_DLL os_interface/windows/wddm_create.cpp)
+	  list (APPEND RUNTIME_SRCS_DLL gmm_helper/page_table_mngr.cpp)
+	  list (APPEND RUNTIME_SRCS_DLL gmm_helper/gmm_memory.cpp)
+	endif (WIN32)
+
+	list (APPEND RUNTIME_SRCS_DLL api/api.cpp)
+
+	if(GTPIN_HEADERS_DIR)
+		list (APPEND RUNTIME_SRCS_DLL gtpin/gtpin_init.cpp)
+	endif(GTPIN_HEADERS_DIR)
+
+	add_library(${NEO_DYNAMIC_LIB_NAME} SHARED
+	  ${RUNTIME_SRCS_DLL}
+	  $<TARGET_OBJECTS:${SHARINGS_ENABLE_LIB_NAME}>
+	  $<TARGET_OBJECTS:${BUILTINS_SOURCES_LIB_NAME}>
+	  $<TARGET_OBJECTS:${BUILTINS_BINARIES_LIB_NAME}>
+	  $<TARGET_OBJECTS:${SCHEDULER_BINARY_LIB_NAME}>
+	)
+
+	target_include_directories(${NEO_DYNAMIC_LIB_NAME} BEFORE PRIVATE
+	  ${CMAKE_CURRENT_BINARY_DIR}
+	  ${HW_SRC_INCLUDES}
+	)
+
+	target_link_libraries(${NEO_DYNAMIC_LIB_NAME} ${NEO_STATIC_LIB_NAME})
+
+	if (WIN32)
+	  target_include_directories(${NEO_DYNAMIC_LIB_NAME} PUBLIC
+		${WDK_INCLUDE_PATHS}
+		${GMM_INCLUDE_PATHS}
+		${UMKM_SHAREDDATA_INCLUDE_PATHS}
+		${INSTRUMENTATION_INCLUDE_PATH}
+	  )
+	  target_link_libraries(${NEO_DYNAMIC_LIB_NAME} ${NEO_STATIC_LIB_NAME} dxgi Ws2_32.lib)
+	else(WIN32)
+	  target_include_directories(${NEO_DYNAMIC_LIB_NAME} PUBLIC
+		${GMM_INCLUDE_PATHS}
+		${UMKM_SHAREDDATA_INCLUDE_PATHS}
+		${INSTRUMENTATION_INCLUDE_PATH}
+	  )
+	endif (WIN32)
+
+	if (UNIX)
+	  target_link_libraries(${NEO_DYNAMIC_LIB_NAME} dl pthread)
+	  set_property(TARGET ${NEO_DYNAMIC_LIB_NAME}
+		APPEND_STRING PROPERTY LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/dll/linux/ocl.exports"
+	  )
+	endif (UNIX)
+
+	set_target_properties(${NEO_DYNAMIC_LIB_NAME} PROPERTIES
+	  DEBUG_OUTPUT_NAME "${NEO_DLL_NAME_BASE}${IGDRCL_NAME_POSTFIX}${IGDRCL_OPTION__BITS}"
+	  RELEASE_OUTPUT_NAME "${NEO_DLL_NAME_BASE}${IGDRCL_NAME_POSTFIX}${IGDRCL_OPTION__BITS}"
+	  RELEASE-INTERNAL_OUTPUT_NAME "${NEO_DLL_NAME_BASE}${IGDRCL_NAME_POSTFIX}${IGDRCL_OPTION__BITS}"
+	  OUTPUT_NAME "${NEO_DLL_NAME_BASE}${IGDRCL_NAME_POSTFIX}${IGDRCL_OPTION__BITS}"
+	)
+
+	set_property(TARGET ${NEO_DYNAMIC_LIB_NAME} APPEND_STRING PROPERTY COMPILE_FLAGS " ${ASAN_FLAGS}")  # leading space: APPEND_STRING inserts no separator before the appended text
+	set_target_properties(${NEO_DYNAMIC_LIB_NAME} PROPERTIES FOLDER "opencl runtime")
+endif(${GENERATE_EXECUTABLE})
+
+if (WIN32)
+  source_group("source files" FILES ${RUNTIME_SRCS_DLL})  # DLL sources are collected in RUNTIME_SRCS_DLL; IGDRCL_SRCS_DLL is never set in this file
+  source_group("source files\\api" FILES ${RUNTIME_SRCS_API})
+  source_group("source files\\accelerators" FILES ${RUNTIME_SRCS_ACCELERATORS})
+  source_group("source files\\aub_mem_dump" FILES ${RUNTIME_SRCS_AUB_MEM_DUMP})
+  source_group("source files\\built_ins" FILES ${RUNTIME_SRCS_BUILT_INS})
+  source_group("source files\\built_ins\\kernels" FILES ${RUNTIME_SRCS_BUILT_IN_KERNELS})
+  source_group("source files\\commands" FILES ${RUNTIME_SRCS_COMMANDS})
+  source_group("source files\\command_queue" FILES ${RUNTIME_SRCS_COMMAND_QUEUE})
+  source_group("source files\\command_stream" FILES ${RUNTIME_SRCS_COMMAND_STREAM})
+  source_group("source files\\compiler_interface" FILES ${RUNTIME_SRCS_COMPILER_INTERFACE})
+  source_group("source files\\context" FILES ${RUNTIME_SRCS_CONTEXT})
+  source_group("source files\\device" FILES ${RUNTIME_SRCS_DEVICE})
+  source_group("source files\\device_queue" FILES ${RUNTIME_SRCS_DEVICE_QUEUE})
+  source_group("source files\\event" FILES ${RUNTIME_SRCS_EVENT})
+  source_group("source files\\execution_model" FILES ${RUNTIME_SRCS_EXECUTION_MODEL})
+  source_group("source files\\gen_common" FILES ${RUNTIME_SRCS_GEN_COMMON})
+  source_group("source files\\helpers" FILES ${RUNTIME_SRCS_HELPERS})
+  source_group("source files\\indirect_heap" FILES ${RUNTIME_SRCS_INDIRECT_HEAP})
+  source_group("source files\\instrumentation" FILES ${RUNTIME_SRCS_INSTRUMENTATION})
+  source_group("source files\\kernel" FILES ${RUNTIME_SRCS_KERNEL})
+  source_group("source files\\memory_manager" FILES ${RUNTIME_SRCS_MEMORY_MANAGER})
+  source_group("source files\\gmm_helper" FILES ${RUNTIME_SRCS_GMM_HELPER})
+  if(GTPIN_HEADERS_DIR)
+    source_group("source files\\gtpin" FILES ${RUNTIME_SRCS_GTPIN})
+  endif(GTPIN_HEADERS_DIR)
+  source_group("source files\\mem_obj" FILES ${RUNTIME_SRCS_MEM_OBJ})
+  source_group("source files\\os_interface" FILES ${RUNTIME_SRCS_OS_INTERFACE})
+  source_group("source files\\platform" FILES ${RUNTIME_SRCS_PLATFORM})
+  source_group("source files\\program" FILES ${RUNTIME_SRCS_PROGRAM})
+  source_group("source files\\sampler" FILES ${RUNTIME_SRCS_SAMPLER})
+  source_group("source files\\scheduler" FILES ${RUNTIME_SRCS_SCHEDULER})
+  source_group("source files\\sharings" FILES ${RUNTIME_SRCS_SHARINGS})
+  source_group("source files\\tbx" FILES ${RUNTIME_SRCS_TBX})
+  source_group("source files\\utilities" FILES ${RUNTIME_SRCS_UTILITIES})
+endif (WIN32)
+
+if (UNIX)
+  if(NOT (TARGET clang-tidy))  # declare the helper target only once
+    add_custom_target(clang-tidy  # building this target triggers the POST_BUILD commands attached below
+      DEPENDS scheduler
+    )
+
+    add_custom_command(
+      TARGET clang-tidy
+      POST_BUILD
+      COMMAND echo clang-tidy...
+      COMMAND find ${CMAKE_CURRENT_SOURCE_DIR} -name *.cpp -print0 | xargs -0 -I{} -P`nproc` clang-tidy -p ${IGDRCL_BINARY_DIR} {} | tee ${IGDRCL_BINARY_DIR}/clang-tidy.log
+      WORKING_DIRECTORY ${IGDRCL_SOURCE_DIR}  # NOTE(review): the unquoted *.cpp pattern above may be expanded by the shell if this directory contains .cpp files
+    )
+  endif(NOT (TARGET clang-tidy))
+endif(UNIX)
+
--- a/runtime/accelerators/CMakeLists.txt
+++ b/runtime/accelerators/CMakeLists.txt
@@ -0,0 +1,31 @@
+# Copyright (c) 2017, Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+# We require cmake 3.2.0 or later
+cmake_minimum_required(VERSION 3.2.0 FATAL_ERROR)
+
+set (RUNTIME_SRCS_ACCELERATORS  # absolute paths of the accelerator sources, published to the including directory's scope
+	${CMAKE_CURRENT_SOURCE_DIR}/intel_accelerator.cpp
+	${CMAKE_CURRENT_SOURCE_DIR}/intel_accelerator.h
+	${CMAKE_CURRENT_SOURCE_DIR}/intel_motion_estimation.cpp
+	${CMAKE_CURRENT_SOURCE_DIR}/intel_motion_estimation.h
+	${CMAKE_CURRENT_SOURCE_DIR}/vebox_accelerator.h
+	PARENT_SCOPE  # without this the variable would stay local to this subdirectory
+)
--- a/runtime/accelerators/intel_accelerator.cpp
+++ b/runtime/accelerators/intel_accelerator.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "runtime/accelerators/intel_accelerator.h"
+#include "runtime/context/context.h"
+#include "runtime/helpers/string.h"
+#include "runtime/helpers/get_info.h"
+
+namespace OCLRT {
+
+// Queries a single property of this accelerator object.
+//
+// paramName         - property selector; the CL_ACCELERATOR_*_INTEL values
+//                     handled by the switch below are recognized.
+// paramValueSize    - size in bytes of the buffer behind paramValue.
+// paramValue        - destination buffer; forwarded to ::getInfo together
+//                     with paramValueSize.
+// paramValueSizeRet - optional; when non-null it receives the size in bytes
+//                     of the queried property, or 0 for an unrecognized
+//                     paramName. It is written regardless of the result.
+//
+// Returns the status produced by ::getInfo for a recognized paramName and
+// CL_INVALID_VALUE for an unrecognized one.
+cl_int IntelAccelerator::getInfo(cl_accelerator_info_intel paramName,
+                                 size_t paramValueSize,
+                                 void *paramValue,
+                                 size_t *paramValueSizeRet) const {
+    cl_int result = CL_INVALID_VALUE;
+    size_t ret = 0;
+
+    switch (paramName) {
+    case CL_ACCELERATOR_DESCRIPTOR_INTEL:
+        ret = getDescriptorSize();
+        result = ::getInfo(paramValue, paramValueSize, getDescriptor(), ret);
+        break;
+
+    case CL_ACCELERATOR_REFERENCE_COUNT_INTEL: {
+        // Materialize the count as a cl_uint so the object being copied is
+        // exactly as wide as the size reported to the caller, whatever
+        // integer type getReference() returns.
+        const cl_uint refCount = static_cast<cl_uint>(getReference());
+        ret = sizeof(refCount);
+        result = ::getInfo(paramValue, paramValueSize, &refCount, ret);
+        break;
+    }
+
+    case CL_ACCELERATOR_CONTEXT_INTEL: {
+        const cl_context ctx = static_cast<cl_context>(pContext);
+        ret = sizeof(ctx);
+        result = ::getInfo(paramValue, paramValueSize, &ctx, ret);
+        break;
+    }
+
+    case CL_ACCELERATOR_TYPE_INTEL: {
+        const cl_accelerator_type_intel type = getTypeId();
+        ret = sizeof(type);
+        result = ::getInfo(paramValue, paramValueSize, &type, ret);
+        break;
+    }
+
+    default:
+        // Unknown selector: result stays CL_INVALID_VALUE and ret stays 0.
+        break;
+    }
+
+    if (paramValueSizeRet != nullptr) {
+        *paramValueSizeRet = ret;
+    }
+
+    return result;
+}
+}
--- a/runtime/accelerators/intel_accelerator.h
+++ b/runtime/accelerators/intel_accelerator.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include "runtime/api/cl_types.h"
+#include "runtime/helpers/base_object.h"
+
+#include <vector>
+
+//------------------------------------------------------------------------------
+// cl_intel_accelerator Class Stuff
+//------------------------------------------------------------------------------
+
+namespace OCLRT {
+
+class Context;
+
+// Pair of accelerator type and accelerator flags.
+struct TagAcceleratorObjParams {
+    cl_uint AcceleratorType;
+    cl_uint AcceleratorFlags;
+};
+
+// Value and pointer aliases for TagAcceleratorObjParams.
+using OCLRT_ACCELERATOR_OBJECT_PARAMS = TagAcceleratorObjParams;
+using POCLRT_ACCELERATOR_OBJECT_PARAMS = TagAcceleratorObjParams *;
+
+class IntelAccelerator;
+
+// Associates the _cl_accelerator_intel handle type with IntelAccelerator
+// through the DerivedType member.
+template <>
+struct OpenCLObjectMapper<_cl_accelerator_intel> {
+    using DerivedType = IntelAccelerator;
+};
+
+// Base class for cl_intel_accelerator objects. It keeps the associated
+// context, the accelerator type id and the creation descriptor, and answers
+// property queries through getInfo().
+class IntelAccelerator : public BaseObject<_cl_accelerator_intel> {
+  public:
+    // Creates an accelerator of type typeId associated with context.
+    //
+    // The descriptorSize bytes behind descriptor are copied into storage owned
+    // by this object, so the caller's buffer does not have to outlive the
+    // accelerator. When descriptor is null or descriptorSize is 0 nothing is
+    // copied and the pointer is kept exactly as passed.
+    IntelAccelerator(Context *context,
+                     cl_accelerator_type_intel typeId,
+                     size_t descriptorSize,
+                     const void *descriptor)
+        : pContext(context),
+          typeId(typeId),
+          descriptorSize(descriptorSize),
+          descriptorStorage(copyDescriptor(descriptor, descriptorSize)),
+          pDescriptor(descriptorStorage.empty() ? descriptor : descriptorStorage.data()) {}
+
+    // Creates an accelerator with no context, no descriptor and the default
+    // member values declared below.
+    IntelAccelerator() {}
+
+    static const cl_ulong objectMagic = 0xC6D72FA2E81EA569ULL;
+
+    // Type id passed at construction.
+    cl_accelerator_type_intel getTypeId() const { return typeId; }
+
+    // Size in bytes of the descriptor, as passed at construction.
+    size_t getDescriptorSize() const { return descriptorSize; }
+
+    // Descriptor bytes. Returns this object's own copy whenever one exists, so
+    // the result never refers to storage owned by another instance.
+    const void *getDescriptor() const {
+        return descriptorStorage.empty() ? pDescriptor : descriptorStorage.data();
+    }
+
+    // Queries one property of the accelerator; defined out of line.
+    cl_int getInfo(cl_accelerator_info_intel paramName,
+                   size_t paramValueSize,
+                   void *paramValue,
+                   size_t *paramValueSizeRet) const;
+
+  protected:
+    // Returns a byte-wise copy of the descriptor, or an empty vector when
+    // there is nothing to copy.
+    static std::vector<unsigned char> copyDescriptor(const void *descriptor,
+                                                     size_t descriptorSize) {
+        if (descriptor == nullptr || descriptorSize == 0) {
+            return {};
+        }
+        const auto *bytes = static_cast<const unsigned char *>(descriptor);
+        return std::vector<unsigned char>(bytes, bytes + descriptorSize);
+    }
+
+    // NOTE: the declaration order below is also the initialization order used
+    // by the constructor; descriptorStorage must precede pDescriptor.
+    Context *pContext = nullptr;
+    const cl_accelerator_type_intel typeId = static_cast<cl_accelerator_type_intel>(-1);
+    const size_t descriptorSize = 0;
+    std::vector<unsigned char> descriptorStorage; // owned copy of the descriptor
+    const void *pDescriptor = nullptr;            // points into descriptorStorage when it is non-empty
+};
+} // namespace OCLRT
--- a/runtime/accelerators/intel_motion_estimation.cpp
+++ b/runtime/accelerators/intel_motion_estimation.cpp
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "runtime/accelerators/intel_motion_estimation.h"
+
+namespace OCLRT {
+
+// Checks the arguments used to create a motion-estimation accelerator.
+//
+// context and typeId are only inspected through DEBUG_BREAK_IF. The
+// descriptor must be non-null, exactly sizeof(cl_motion_estimation_desc_intel)
+// bytes long, and each of the four fields examined below must hold one of the
+// listed values.
+//
+// Returns CL_SUCCESS when every check passes and
+// CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL otherwise.
+cl_int VmeAccelerator::validateVmeArgs(Context *context,
+                                       cl_accelerator_type_intel typeId,
+                                       size_t descriptorSize,
+                                       const void *descriptor) {
+    DEBUG_BREAK_IF(!context);
+    DEBUG_BREAK_IF(typeId != CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL);
+
+    // Reject a missing or wrongly sized descriptor before reading any field.
+    if (descriptor == nullptr ||
+        descriptorSize != sizeof(cl_motion_estimation_desc_intel)) {
+        return CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL;
+    }
+
+    const auto *desc = static_cast<const cl_motion_estimation_desc_intel *>(descriptor);
+
+    const bool blockTypeValid =
+        desc->mb_block_type == CL_ME_MB_TYPE_16x16_INTEL ||
+        desc->mb_block_type == CL_ME_MB_TYPE_8x8_INTEL ||
+        desc->mb_block_type == CL_ME_MB_TYPE_4x4_INTEL;
+
+    const bool subpixelModeValid =
+        desc->subpixel_mode == CL_ME_SUBPIXEL_MODE_INTEGER_INTEL ||
+        desc->subpixel_mode == CL_ME_SUBPIXEL_MODE_HPEL_INTEL ||
+        desc->subpixel_mode == CL_ME_SUBPIXEL_MODE_QPEL_INTEL;
+
+    const bool sadAdjustModeValid =
+        desc->sad_adjust_mode == CL_ME_SAD_ADJUST_MODE_NONE_INTEL ||
+        desc->sad_adjust_mode == CL_ME_SAD_ADJUST_MODE_HAAR_INTEL;
+
+    const bool searchPathValid =
+        desc->search_path_type == CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL ||
+        desc->search_path_type == CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL ||
+        desc->search_path_type == CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL;
+
+    if (blockTypeValid && subpixelModeValid && sadAdjustModeValid && searchPathValid) {
+        return CL_SUCCESS;
+    }
+
+    return CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL;
+}
+}
--- a/runtime/accelerators/intel_motion_estimation.h
+++ b/runtime/accelerators/intel_motion_estimation.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include "runtime/accelerators/intel_accelerator.h"
+
+//------------------------------------------------------------------------------
+// VmeAccelerator Class Stuff
+//------------------------------------------------------------------------------
+
+namespace OCLRT {
+
+class Context;
+
+// Accelerator object created through create(), which runs validateVmeArgs()
+// on the arguments before constructing anything.
+class VmeAccelerator : public IntelAccelerator {
+  public:
+    // Validates the arguments and stores the validation status in result.
+    // Returns a newly allocated VmeAccelerator when the status is CL_SUCCESS
+    // and nullptr otherwise.
+    static VmeAccelerator *create(Context *context,
+                                  cl_accelerator_type_intel typeId,
+                                  size_t descriptorSize,
+                                  const void *descriptor,
+                                  cl_int &result) {
+        result = validateVmeArgs(context, typeId, descriptorSize, descriptor);
+        if (result != CL_SUCCESS) {
+            return nullptr;
+        }
+
+        return new VmeAccelerator(context, typeId, descriptorSize, descriptor);
+    }
+
+  private:
+    // Private so that instances can only be obtained through create().
+    VmeAccelerator(Context *context,
+                   cl_accelerator_type_intel typeId,
+                   size_t descriptorSize,
+                   const void *descriptor)
+        : IntelAccelerator(context, typeId, descriptorSize, descriptor) {}
+
+    // Argument validation used by create(); defined out of line.
+    static cl_int validateVmeArgs(Context *context,
+                                  cl_accelerator_type_intel typeId,
+                                  size_t descriptorSize,
+                                  const void *descriptor);
+};
+}
--- a/runtime/accelerators/vebox_accelerator.h
+++ b/runtime/accelerators/vebox_accelerator.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include "runtime/accelerators/intel_accelerator.h"
+
+//------------------------------------------------------------------------------
+// VeboxAccelerator Class Stuff
+//------------------------------------------------------------------------------
+
+namespace OCLRT {
+
+class Context;
+
+// Accelerator object created through create(); the arguments are forwarded
+// unchanged to the IntelAccelerator base.
+class VeboxAccelerator : public IntelAccelerator {
+  public:
+    // Allocates a VeboxAccelerator from the given arguments, then sets result
+    // to CL_SUCCESS and returns the new object. context and descriptor are
+    // only checked through DEBUG_BREAK_IF.
+    static VeboxAccelerator *create(Context *context,
+                                    cl_accelerator_type_intel typeId,
+                                    size_t descriptorSize,
+                                    const void *descriptor,
+                                    cl_int &result) {
+        DEBUG_BREAK_IF(!context);
+        DEBUG_BREAK_IF(!descriptor);
+
+        auto *accelerator = new VeboxAccelerator(context, typeId, descriptorSize, descriptor);
+        result = CL_SUCCESS;
+        return accelerator;
+    }
+
+  private:
+    // Private so that instances can only be obtained through create().
+    VeboxAccelerator(Context *context,
+                     cl_accelerator_type_intel typeId,
+                     size_t descriptorSize,
+                     const void *descriptor)
+        : IntelAccelerator(context, typeId, descriptorSize, descriptor) {}
+};
+}
--- a/runtime/api/CMakeLists.txt
+++ b/runtime/api/CMakeLists.txt
@@ -0,0 +1,31 @@
+# Copyright (c) 2017, Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+# Minimum CMake version needed to process this directory.
+cmake_minimum_required(VERSION 3.2.0 FATAL_ERROR)
+
+# Source and header files of the API component.
+# PARENT_SCOPE publishes the list to the scope that added this directory.
+set(RUNTIME_SRCS_API
+  ${CMAKE_CURRENT_SOURCE_DIR}/api.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/api.h
+  ${CMAKE_CURRENT_SOURCE_DIR}/cl_types.h
+  ${CMAKE_CURRENT_SOURCE_DIR}/dispatch.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/dispatch.h
+  PARENT_SCOPE
+)
--- a/runtime/api/api.cpp
+++ b/runtime/api/api.cpp
--- a/runtime/api/api.h
+++ b/runtime/api/api.h
@@ -0,0 +1,887 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+// Guard against multiple inclusion, as the other headers in this tree do.
+#pragma once
+
+#include "config.h"
+
+#include "CL/cl.h"
+#include "CL/cl_gl.h"
+#include "runtime/api/dispatch.h"
+
+// When compiled as C++, the declarations that follow get C linkage.
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Platform and device entry points: ID enumeration, info queries,
+// sub-device creation and device retain/release.
+cl_int CL_API_CALL clGetPlatformIDs(
+    cl_uint numEntries,
+    cl_platform_id *platforms,
+    cl_uint *numPlatforms);
+
+cl_int CL_API_CALL clGetPlatformInfo(
+    cl_platform_id platform,
+    cl_platform_info paramName,
+    size_t paramValueSize,
+    void *paramValue,
+    size_t *paramValueSizeRet);
+
+cl_int CL_API_CALL clGetDeviceIDs(
+    cl_platform_id platform,
+    cl_device_type deviceType,
+    cl_uint numEntries,
+    cl_device_id *devices,
+    cl_uint *numDevices);
+
+cl_int CL_API_CALL clGetDeviceInfo(
+    cl_device_id device,
+    cl_device_info paramName,
+    size_t paramValueSize,
+    void *paramValue,
+    size_t *paramValueSizeRet);
+
+cl_int CL_API_CALL clCreateSubDevices(
+    cl_device_id inDevice,
+    const cl_device_partition_property *properties,
+    cl_uint numDevices,
+    cl_device_id *outDevices,
+    cl_uint *numDevicesRet);
+
+cl_int CL_API_CALL clRetainDevice(
+    cl_device_id device);
+
+cl_int CL_API_CALL clReleaseDevice(
+    cl_device_id device);
+
+// Context entry points: creation (from a device list or a device type),
+// retain/release and info queries, including the GL context info query.
+cl_context CL_API_CALL clCreateContext(
+    const cl_context_properties *properties,
+    cl_uint numDevices,
+    const cl_device_id *devices,
+    void(CL_CALLBACK *funcNotify)(const char *, const void *, size_t, void *),
+    void *userData,
+    cl_int *errcodeRet);
+
+cl_context CL_API_CALL clCreateContextFromType(
+    const cl_context_properties *properties,
+    cl_device_type deviceType,
+    void(CL_CALLBACK *funcNotify)(const char *, const void *, size_t, void *),
+    void *userData,
+    cl_int *errcodeRet);
+
+cl_int CL_API_CALL clRetainContext(
+    cl_context context);
+
+cl_int CL_API_CALL clReleaseContext(
+    cl_context context);
+
+cl_int CL_API_CALL clGetContextInfo(
+    cl_context context,
+    cl_context_info paramName,
+    size_t paramValueSize,
+    void *paramValue,
+    size_t *paramValueSizeRet);
+
+cl_int CL_API_CALL clGetGLContextInfoKHR(
+    const cl_context_properties *properties,
+    cl_gl_context_info paramName,
+    size_t paramValueSize,
+    void *paramValue,
+    size_t *paramValueSizeRet);
+
+// Command-queue entry points: creation, retain/release, info query and the
+// property setter.
+cl_command_queue CL_API_CALL clCreateCommandQueue(
+    cl_context context,
+    cl_device_id device,
+    cl_command_queue_properties properties,
+    cl_int *errcodeRet);
+
+cl_int CL_API_CALL clRetainCommandQueue(
+    cl_command_queue commandQueue);
+
+cl_int CL_API_CALL clReleaseCommandQueue(
+    cl_command_queue commandQueue);
+
+cl_int CL_API_CALL clGetCommandQueueInfo(
+    cl_command_queue commandQueue,
+    cl_command_queue_info paramName,
+    size_t paramValueSize,
+    void *paramValue,
+    size_t *paramValueSizeRet);
+
+// deprecated OpenCL 1.0
+cl_int CL_API_CALL clSetCommandQueueProperty(
+    cl_command_queue commandQueue,
+    cl_command_queue_properties properties,
+    cl_bool enable,
+    cl_command_queue_properties *oldProperties);
+
+// Memory-object entry points: buffer, sub-buffer and image creation,
+// retain/release, supported-format and info queries, the INTEL image
+// parameter query and the destructor-callback registration.
+cl_mem CL_API_CALL clCreateBuffer(
+    cl_context context,
+    cl_mem_flags flags,
+    size_t size,
+    void *hostPtr,
+    cl_int *errcodeRet);
+
+cl_mem CL_API_CALL clCreateSubBuffer(
+    cl_mem buffer,
+    cl_mem_flags flags,
+    cl_buffer_create_type bufferCreateType,
+    const void *bufferCreateInfo,
+    cl_int *errcodeRet);
+
+cl_mem CL_API_CALL clCreateImage(
+    cl_context context,
+    cl_mem_flags flags,
+    const cl_image_format *imageFormat,
+    const cl_image_desc *imageDesc,
+    void *hostPtr,
+    cl_int *errcodeRet);
+
+// deprecated OpenCL 1.1
+cl_mem CL_API_CALL clCreateImage2D(
+    cl_context context,
+    cl_mem_flags flags,
+    const cl_image_format *imageFormat,
+    size_t imageWidth,
+    size_t imageHeight,
+    size_t imageRowPitch,
+    void *hostPtr,
+    cl_int *errcodeRet);
+
+// deprecated OpenCL 1.1
+cl_mem CL_API_CALL clCreateImage3D(
+    cl_context context,
+    cl_mem_flags flags,
+    const cl_image_format *imageFormat,
+    size_t imageWidth,
+    size_t imageHeight,
+    size_t imageDepth,
+    size_t imageRowPitch,
+    size_t imageSlicePitch,
+    void *hostPtr,
+    cl_int *errcodeRet);
+
+cl_int CL_API_CALL clRetainMemObject(
+    cl_mem memobj);
+
+cl_int CL_API_CALL clReleaseMemObject(
+    cl_mem memobj);
+
+cl_int CL_API_CALL clGetSupportedImageFormats(
+    cl_context context,
+    cl_mem_flags flags,
+    cl_mem_object_type imageType,
+    cl_uint numEntries,
+    cl_image_format *imageFormats,
+    cl_uint *numImageFormats);
+
+cl_int CL_API_CALL clGetMemObjectInfo(
+    cl_mem memobj,
+    cl_mem_info paramName,
+    size_t paramValueSize,
+    void *paramValue,
+    size_t *paramValueSizeRet);
+
+cl_int CL_API_CALL clGetImageInfo(
+    cl_mem image,
+    cl_image_info paramName,
+    size_t paramValueSize,
+    void *paramValue,
+    size_t *paramValueSizeRet);
+
+cl_int CL_API_CALL clGetImageParamsINTEL(
+    cl_context context,
+    const cl_image_format *imageFormat,
+    const cl_image_desc *imageDesc,
+    size_t *imageRowPitch,
+    size_t *imageSlicePitch);
+
+cl_int CL_API_CALL clSetMemObjectDestructorCallback(
+    cl_mem memobj,
+    void(CL_CALLBACK *funcNotify)(cl_mem, void *),
+    void *userData);
+
+// Sampler entry points: creation, retain/release and info query.
+cl_sampler CL_API_CALL clCreateSampler(
+    cl_context context,
+    cl_bool normalizedCoords,
+    cl_addressing_mode addressingMode,
+    cl_filter_mode filterMode,
+    cl_int *errcodeRet);
+
+cl_int CL_API_CALL clRetainSampler(
+    cl_sampler sampler);
+
+cl_int CL_API_CALL clReleaseSampler(
+    cl_sampler sampler);
+
+cl_int CL_API_CALL clGetSamplerInfo(
+    cl_sampler sampler,
+    cl_sampler_info paramName,
+    size_t paramValueSize,
+    void *paramValue,
+    size_t *paramValueSizeRet);
+
+// Program entry points: creation (from source, binaries or built-in kernel
+// names), retain/release, build/compile/link, compiler unloading and
+// program / build info queries.
+cl_program CL_API_CALL clCreateProgramWithSource(
+    cl_context context,
+    cl_uint count,
+    const char **strings,
+    const size_t *lengths,
+    cl_int *errcodeRet);
+
+cl_program CL_API_CALL clCreateProgramWithBinary(
+    cl_context context,
+    cl_uint numDevices,
+    const cl_device_id *deviceList,
+    const size_t *lengths,
+    const unsigned char **binaries,
+    cl_int *binaryStatus,
+    cl_int *errcodeRet);
+
+cl_program CL_API_CALL clCreateProgramWithBuiltInKernels(
+    cl_context context,
+    cl_uint numDevices,
+    const cl_device_id *deviceList,
+    const char *kernelNames,
+    cl_int *errcodeRet);
+
+cl_int CL_API_CALL clRetainProgram(
+    cl_program program);
+
+cl_int CL_API_CALL clReleaseProgram(
+    cl_program program);
+
+cl_int CL_API_CALL clBuildProgram(
+    cl_program program,
+    cl_uint numDevices,
+    const cl_device_id *deviceList,
+    const char *options,
+    void(CL_CALLBACK *funcNotify)(cl_program program, void *userData),
+    void *userData);
+
+cl_int CL_API_CALL clCompileProgram(
+    cl_program program,
+    cl_uint numDevices,
+    const cl_device_id *deviceList,
+    const char *options,
+    cl_uint numInputHeaders,
+    const cl_program *inputHeaders,
+    const char **headerIncludeNames,
+    void(CL_CALLBACK *funcNotify)(cl_program program, void *userData),
+    void *userData);
+
+cl_program CL_API_CALL clLinkProgram(
+    cl_context context,
+    cl_uint numDevices,
+    const cl_device_id *deviceList,
+    const char *options,
+    cl_uint numInputPrograms,
+    const cl_program *inputPrograms,
+    void(CL_CALLBACK *funcNotify)(cl_program program, void *userData),
+    void *userData,
+    cl_int *errcodeRet);
+
+cl_int CL_API_CALL clUnloadPlatformCompiler(
+    cl_platform_id platform);
+
+// deprecated OpenCL 1.1
+cl_int CL_API_CALL clUnloadCompiler(void);
+
+cl_int CL_API_CALL clGetProgramInfo(
+    cl_program program,
+    cl_program_info paramName,
+    size_t paramValueSize,
+    void *paramValue,
+    size_t *paramValueSizeRet);
+
+cl_int CL_API_CALL clGetProgramBuildInfo(
+    cl_program program,
+    cl_device_id device,
+    cl_program_build_info paramName,
+    size_t paramValueSize,
+    void *paramValue,
+    size_t *paramValueSizeRet);
+
+// Kernel entry points: creation (single kernel or all kernels in a program),
+// retain/release, argument setting and kernel / argument / work-group info
+// queries.
+cl_kernel CL_API_CALL clCreateKernel(
+    cl_program program,
+    const char *kernelName,
+    cl_int *errcodeRet);
+
+cl_int CL_API_CALL clCreateKernelsInProgram(
+    cl_program program,
+    cl_uint numKernels,
+    cl_kernel *kernels,
+    cl_uint *numKernelsRet);
+
+cl_int CL_API_CALL clRetainKernel(
+    cl_kernel kernel);
+
+cl_int CL_API_CALL clReleaseKernel(
+    cl_kernel kernel);
+
+cl_int CL_API_CALL clSetKernelArg(
+    cl_kernel kernel,
+    cl_uint argIndex,
+    size_t argSize,
+    const void *argValue);
+
+cl_int CL_API_CALL clGetKernelInfo(
+    cl_kernel kernel,
+    cl_kernel_info paramName,
+    size_t paramValueSize,
+    void *paramValue,
+    size_t *paramValueSizeRet);
+
+cl_int CL_API_CALL clGetKernelArgInfo(
+    cl_kernel kernel,
+    cl_uint argIndx,
+    cl_kernel_arg_info paramName,
+    size_t paramValueSize,
+    void *paramValue,
+    size_t *paramValueSizeRet);
+
+cl_int CL_API_CALL clGetKernelWorkGroupInfo(
+    cl_kernel kernel,
+    cl_device_id device,
+    cl_kernel_work_group_info paramName,
+    size_t paramValueSize,
+    void *paramValue,
+    size_t *paramValueSizeRet);
+
+// Event entry points: waiting, info and profiling queries, user-event
+// creation and status, retain/release and callback registration.
+cl_int CL_API_CALL clWaitForEvents(
+    cl_uint numEvents,
+    const cl_event *eventList);
+
+cl_int CL_API_CALL clGetEventInfo(
+    cl_event event,
+    cl_event_info paramName,
+    size_t paramValueSize,
+    void *paramValue,
+    size_t *paramValueSizeRet);
+
+cl_event CL_API_CALL clCreateUserEvent(
+    cl_context context,
+    cl_int *errcodeRet);
+
+cl_int CL_API_CALL clRetainEvent(
+    cl_event event);
+
+cl_int CL_API_CALL clReleaseEvent(
+    cl_event event);
+
+cl_int CL_API_CALL clSetUserEventStatus(
+    cl_event event,
+    cl_int executionStatus);
+
+cl_int CL_API_CALL clSetEventCallback(
+    cl_event event,
+    cl_int commandExecCallbackType,
+    void(CL_CALLBACK *funcNotify)(cl_event, cl_int, void *),
+    void *userData);
+
+cl_int CL_API_CALL clGetEventProfilingInfo(
+    cl_event event,
+    cl_profiling_info paramName,
+    size_t paramValueSize,
+    void *paramValue,
+    size_t *paramValueSizeRet);
+
+// Command-queue flush and finish entry points.
+cl_int CL_API_CALL clFlush(
+    cl_command_queue commandQueue);
+
+cl_int CL_API_CALL clFinish(
+    cl_command_queue commandQueue);
+
+// Buffer enqueue entry points: read, write, fill and copy, in linear and
+// rectangular variants. Each takes an event wait list and an optional output
+// event.
+cl_int CL_API_CALL clEnqueueReadBuffer(
+    cl_command_queue commandQueue,
+    cl_mem buffer,
+    cl_bool blockingRead,
+    size_t offset,
+    size_t cb,
+    void *ptr,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event);
+
+cl_int CL_API_CALL clEnqueueReadBufferRect(
+    cl_command_queue commandQueue,
+    cl_mem buffer,
+    cl_bool blockingRead,
+    const size_t *bufferOrigin,
+    const size_t *hostOrigin,
+    const size_t *region,
+    size_t bufferRowPitch,
+    size_t bufferSlicePitch,
+    size_t hostRowPitch,
+    size_t hostSlicePitch,
+    void *ptr,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event);
+
+cl_int CL_API_CALL clEnqueueWriteBuffer(
+    cl_command_queue commandQueue,
+    cl_mem buffer,
+    cl_bool blockingWrite,
+    size_t offset,
+    size_t cb,
+    const void *ptr,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event);
+
+cl_int CL_API_CALL clEnqueueWriteBufferRect(
+    cl_command_queue commandQueue,
+    cl_mem buffer,
+    cl_bool blockingWrite,
+    const size_t *bufferOrigin,
+    const size_t *hostOrigin,
+    const size_t *region,
+    size_t bufferRowPitch,
+    size_t bufferSlicePitch,
+    size_t hostRowPitch,
+    size_t hostSlicePitch,
+    const void *ptr,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event);
+
+cl_int CL_API_CALL clEnqueueFillBuffer(
+    cl_command_queue commandQueue,
+    cl_mem buffer,
+    const void *pattern,
+    size_t patternSize,
+    size_t offset,
+    size_t size,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event);
+
+cl_int CL_API_CALL clEnqueueCopyBuffer(
+    cl_command_queue commandQueue,
+    cl_mem srcBuffer,
+    cl_mem dstBuffer,
+    size_t srcOffset,
+    size_t dstOffset,
+    size_t cb,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event);
+
+cl_int CL_API_CALL clEnqueueCopyBufferRect(
+    cl_command_queue commandQueue,
+    cl_mem srcBuffer,
+    cl_mem dstBuffer,
+    const size_t *srcOrigin,
+    const size_t *dstOrigin,
+    const size_t *region,
+    size_t srcRowPitch,
+    size_t srcSlicePitch,
+    size_t dstRowPitch,
+    size_t dstSlicePitch,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event);
+
+// Image enqueue entry points: read, write, fill, image-to-image copy and
+// copies between images and buffers. Each takes an event wait list and an
+// optional output event.
+cl_int CL_API_CALL clEnqueueReadImage(
+    cl_command_queue commandQueue,
+    cl_mem image,
+    cl_bool blockingRead,
+    const size_t *origin,
+    const size_t *region,
+    size_t rowPitch,
+    size_t slicePitch,
+    void *ptr,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event);
+
+cl_int CL_API_CALL clEnqueueWriteImage(
+    cl_command_queue commandQueue,
+    cl_mem image,
+    cl_bool blockingWrite,
+    const size_t *origin,
+    const size_t *region,
+    size_t inputRowPitch,
+    size_t inputSlicePitch,
+    const void *ptr,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event);
+
+cl_int CL_API_CALL clEnqueueFillImage(
+    cl_command_queue commandQueue,
+    cl_mem image,
+    const void *fillColor,
+    const size_t *origin,
+    const size_t *region,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event);
+
+cl_int CL_API_CALL clEnqueueCopyImage(
+    cl_command_queue commandQueue,
+    cl_mem srcImage,
+    cl_mem dstImage,
+    const size_t *srcOrigin,
+    const size_t *dstOrigin,
+    const size_t *region,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event);
+
+cl_int CL_API_CALL clEnqueueCopyImageToBuffer(
+    cl_command_queue commandQueue,
+    cl_mem srcImage,
+    cl_mem dstBuffer,
+    const size_t *srcOrigin,
+    const size_t *region,
+    size_t dstOffset,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event);
+
+cl_int CL_API_CALL clEnqueueCopyBufferToImage(
+    cl_command_queue commandQueue,
+    cl_mem srcBuffer,
+    cl_mem dstImage,
+    size_t srcOffset,
+    const size_t *dstOrigin,
+    const size_t *region,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event);
+
+// Map / unmap / migrate enqueue entry points. The two map functions return a
+// pointer and report their status through errcodeRet; the others return a
+// cl_int.
+void *CL_API_CALL clEnqueueMapBuffer(
+    cl_command_queue commandQueue,
+    cl_mem buffer,
+    cl_bool blockingMap,
+    cl_map_flags mapFlags,
+    size_t offset,
+    size_t cb,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event,
+    cl_int *errcodeRet);
+
+void *CL_API_CALL clEnqueueMapImage(
+    cl_command_queue commandQueue,
+    cl_mem image,
+    cl_bool blockingMap,
+    cl_map_flags mapFlags,
+    const size_t *origin,
+    const size_t *region,
+    size_t *imageRowPitch,
+    size_t *imageSlicePitch,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event,
+    cl_int *errcodeRet);
+
+cl_int CL_API_CALL clEnqueueUnmapMemObject(
+    cl_command_queue commandQueue,
+    cl_mem memobj,
+    void *mappedPtr,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event);
+
+cl_int CL_API_CALL clEnqueueMigrateMemObjects(
+    cl_command_queue commandQueue,
+    cl_uint numMemObjects,
+    const cl_mem *memObjects,
+    cl_mem_migration_flags flags,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event);
+// clEnqueueNDRangeKernel: launches kernel over a workDim-dimensional range (global offset/size plus local size).
+cl_int CL_API_CALL clEnqueueNDRangeKernel(
+    cl_command_queue commandQueue,
+    cl_kernel kernel,
+    cl_uint workDim,
+    const size_t *globalWorkOffset,
+    const size_t *globalWorkSize,
+    const size_t *localWorkSize,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event);
+// clEnqueueTask: kernel launch without any range parameters.
+cl_int CL_API_CALL clEnqueueTask(
+    cl_command_queue commandQueue,
+    cl_kernel kernel,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event);
+// clEnqueueNativeKernel: enqueues the host callback userFunc with a cbArgs-byte args blob and a memList/argsMemLoc table.
+cl_int CL_API_CALL clEnqueueNativeKernel(
+    cl_command_queue commandQueue,
+    void(CL_CALLBACK *userFunc)(void *),
+    void *args,
+    size_t cbArgs,
+    cl_uint numMemObjects,
+    const cl_mem *memList,
+    const void **argsMemLoc,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event);
+
+// OpenCL 1.1 API; deprecated as of OpenCL 1.2
+cl_int CL_API_CALL clEnqueueMarker(
+    cl_command_queue commandQueue,
+    cl_event *event);
+
+// OpenCL 1.1 API; deprecated as of OpenCL 1.2
+cl_int CL_API_CALL clEnqueueWaitForEvents(
+    cl_command_queue commandQueue,
+    cl_uint numEvents,
+    const cl_event *eventList);
+
+// OpenCL 1.1 API; deprecated as of OpenCL 1.2
+cl_int CL_API_CALL clEnqueueBarrier(
+    cl_command_queue commandQueue);
+// OpenCL 1.2 marker call; unlike clEnqueueMarker it also takes an event wait list.
+cl_int CL_API_CALL clEnqueueMarkerWithWaitList(
+    cl_command_queue commandQueue,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event);
+// OpenCL 1.2 barrier call; unlike clEnqueueBarrier it takes an event wait list and returns an event.
+cl_int CL_API_CALL clEnqueueBarrierWithWaitList(
+    cl_command_queue commandQueue,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event);
+
+// OpenCL 1.1 API; deprecated as of OpenCL 1.2 in favor of the per-platform lookup declared right after it
+void *CL_API_CALL clGetExtensionFunctionAddress(
+    const char *funcName);
+// OpenCL 1.2: extension function lookup by funcName, scoped to platform.
+void *CL_API_CALL clGetExtensionFunctionAddressForPlatform(
+    cl_platform_id platform,
+    const char *funcName);
+
+// CL-GL Sharing
+// clCreateFromGLBuffer: creates a cl_mem from the GL buffer object bufobj; errcodeRet is spelled cl_int like all other entry points.
+cl_mem CL_API_CALL clCreateFromGLBuffer(
+    cl_context context,
+    cl_mem_flags flags,
+    cl_GLuint bufobj,
+    cl_int *errcodeRet);
+
+// OpenCL 1.2: GL texture import; target selects the kind of GL texture
+cl_mem CL_API_CALL clCreateFromGLTexture(
+    cl_context context,
+    cl_mem_flags flags,
+    cl_GLenum target,
+    cl_GLint miplevel,
+    cl_GLuint texture,
+    cl_int *errcodeRet);
+
+// OpenCL 1.1 API; deprecated as of OpenCL 1.2 (same parameter list as clCreateFromGLTexture)
+cl_mem CL_API_CALL clCreateFromGLTexture2D(
+    cl_context context,
+    cl_mem_flags flags,
+    cl_GLenum target,
+    cl_GLint miplevel,
+    cl_GLuint texture,
+    cl_int *errcodeRet);
+
+// OpenCL 1.1 API; deprecated as of OpenCL 1.2 (same parameter list as clCreateFromGLTexture)
+cl_mem CL_API_CALL clCreateFromGLTexture3D(
+    cl_context context,
+    cl_mem_flags flags,
+    cl_GLenum target,
+    cl_GLint miplevel,
+    cl_GLuint texture,
+    cl_int *errcodeRet);
+// clCreateFromGLRenderbuffer: creates a cl_mem from the GL renderbuffer object named renderbuffer.
+cl_mem CL_API_CALL clCreateFromGLRenderbuffer(
+    cl_context context,
+    cl_mem_flags flags,
+    cl_GLuint renderbuffer,
+    cl_int *errcodeRet);
+// clGetGLObjectInfo: reports the GL object type and GL object name behind memobj through the two out-pointers.
+cl_int CL_API_CALL clGetGLObjectInfo(
+    cl_mem memobj,
+    cl_gl_object_type *glObjectType,
+    cl_GLuint *glObjectName);
+// clGetGLTextureInfo: paramName / paramValue style query on the GL texture behind memobj.
+cl_int CL_API_CALL clGetGLTextureInfo(
+    cl_mem memobj,
+    cl_gl_texture_info paramName,
+    size_t paramValueSize,
+    void *paramValue,
+    size_t *paramValueSizeRet);
+// clEnqueueAcquireGLObjects: acquires the numObjects GL-backed objects in memObjects for OpenCL use.
+cl_int CL_API_CALL clEnqueueAcquireGLObjects(
+    cl_command_queue commandQueue,
+    cl_uint numObjects,
+    const cl_mem *memObjects,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event);
+// clEnqueueReleaseGLObjects: counterpart of clEnqueueAcquireGLObjects with the same parameter list.
+cl_int CL_API_CALL clEnqueueReleaseGLObjects(
+    cl_command_queue commandQueue,
+    cl_uint numObjects,
+    const cl_mem *memObjects,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event);
+
+// OpenCL 2.0
+// clSVMAlloc: allocates size bytes of shared virtual memory in context with the requested alignment.
+void *CL_API_CALL clSVMAlloc(
+    cl_context context,
+    cl_svm_mem_flags flags,
+    size_t size,
+    cl_uint alignment);
+// clSVMFree: frees svmPointer; it returns void, so no error code is reported to the caller.
+void CL_API_CALL clSVMFree(
+    cl_context context,
+    void *svmPointer);
+// clEnqueueSVMFree: queued free of numSvmPointers pointers; pfnFreeFunc is a caller-supplied callback that receives userData.
+cl_int CL_API_CALL clEnqueueSVMFree(
+    cl_command_queue commandQueue,
+    cl_uint numSvmPointers,
+    void *svmPointers[],
+    void(CL_CALLBACK *pfnFreeFunc)(
+        cl_command_queue queue,
+        cl_uint numSvmPointers,
+        void *svmPointers[],
+        void *userData),
+    void *userData,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event);
+// clEnqueueSVMMemcpy: copies size bytes from srcPtr to dstPtr; blockingCopy selects blocking behavior.
+cl_int CL_API_CALL clEnqueueSVMMemcpy(
+    cl_command_queue commandQueue,
+    cl_bool blockingCopy,
+    void *dstPtr,
+    const void *srcPtr,
+    size_t size,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event);
+// clEnqueueSVMMemFill: fills size bytes at svmPtr with the patternSize-byte pattern.
+cl_int CL_API_CALL clEnqueueSVMMemFill(
+    cl_command_queue commandQueue,
+    void *svmPtr,
+    const void *pattern,
+    size_t patternSize,
+    size_t size,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event);
+// clEnqueueSVMMap: maps size bytes at svmPtr according to mapFlags.
+cl_int CL_API_CALL clEnqueueSVMMap(
+    cl_command_queue commandQueue,
+    cl_bool blockingMap,
+    cl_map_flags mapFlags,
+    void *svmPtr,
+    size_t size,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event);
+// clEnqueueSVMUnmap: counterpart of clEnqueueSVMMap for svmPtr.
+cl_int CL_API_CALL clEnqueueSVMUnmap(
+    cl_command_queue commandQueue,
+    void *svmPtr,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event);
+// clSetKernelArgSVMPointer: sets argument argIndex of kernel to the SVM pointer argValue.
+cl_int CL_API_CALL clSetKernelArgSVMPointer(
+    cl_kernel kernel,
+    cl_uint argIndex,
+    const void *argValue);
+// clSetKernelExecInfo: passes paramValueSize bytes of paramName-specific execution info to kernel.
+cl_int CL_API_CALL clSetKernelExecInfo(
+    cl_kernel kernel,
+    cl_kernel_exec_info paramName,
+    size_t paramValueSize,
+    const void *paramValue);
+// clCreatePipe: creates a pipe sized by pipePacketSize (bytes per packet) and pipeMaxPackets.
+cl_mem CL_API_CALL clCreatePipe(
+    cl_context context,
+    cl_mem_flags flags,
+    cl_uint pipePacketSize,
+    cl_uint pipeMaxPackets,
+    const cl_pipe_properties *properties,
+    cl_int *errcodeRet);
+// clGetPipeInfo: paramName / paramValue style query on pipe.
+cl_int CL_API_CALL clGetPipeInfo(
+    cl_mem pipe,
+    cl_pipe_info paramName,
+    size_t paramValueSize,
+    void *paramValue,
+    size_t *paramValueSizeRet);
+// clCreateCommandQueueWithProperties: queue creation driven by a cl_queue_properties list.
+cl_command_queue CL_API_CALL clCreateCommandQueueWithProperties(
+    cl_context context,
+    cl_device_id device,
+    const cl_queue_properties *properties,
+    cl_int *errcodeRet);
+// INTEL-suffixed variant: same shape, but properties is a cl_queue_properties_intel list.
+cl_command_queue CL_API_CALL clCreateCommandQueueWithPropertiesINTEL(
+    cl_context context,
+    cl_device_id device,
+    const cl_queue_properties_intel *properties,
+    cl_int *errcodeRet);
+// clCreateSamplerWithProperties: sampler creation driven by a cl_sampler_properties list.
+cl_sampler CL_API_CALL clCreateSamplerWithProperties(
+    cl_context context,
+    const cl_sampler_properties *samplerProperties,
+    cl_int *errcodeRet);
+
+// OpenCL 2.1
+// clGetDeviceAndHostTimer: returns a device timestamp and a host timestamp through the two out-pointers.
+cl_int CL_API_CALL clGetDeviceAndHostTimer(cl_device_id device,
+                                           cl_ulong *deviceTimestamp,
+                                           cl_ulong *hostTimestamp);
+// clGetHostTimer: returns only the host timestamp for device.
+cl_int CL_API_CALL clGetHostTimer(cl_device_id device,
+                                  cl_ulong *hostTimestamp);
+// The remaining declarations carry INTEL / KHR suffixes and are spelled with extern CL_API_ENTRY.
+extern CL_API_ENTRY cl_command_queue CL_API_CALL
+clCreatePerfCountersCommandQueueINTEL(
+    cl_context context,
+    cl_device_id device,
+    cl_command_queue_properties properties,
+    cl_uint configuration,
+    cl_int *errcodeRet);
+// NOTE(review): presumably count entries are read from the parallel offsets/values arrays - confirm against the definition.
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetPerformanceConfigurationINTEL(
+    cl_device_id device,
+    cl_uint count,
+    cl_uint *offsets,
+    cl_uint *values);
+// clCreateEventFromGLsyncKHR: creates a cl_event from the GL sync object sync.
+extern CL_API_ENTRY cl_event CL_API_CALL
+clCreateEventFromGLsyncKHR(
+    cl_context context,
+    cl_GLsync sync,
+    cl_int *errcodeRet) CL_EXT_SUFFIX__VERSION_1_2;
+// clCreateProgramWithILKHR: creates a program from length bytes of intermediate language at il.
+extern CL_API_ENTRY cl_program CL_API_CALL clCreateProgramWithILKHR(
+    cl_context context,
+    const void *il,
+    size_t length,
+    cl_int *errcodeRet) CL_API_SUFFIX__VERSION_1_2;
+}
--- a/runtime/api/cl_types.h
+++ b/runtime/api/cl_types.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+#include "config.h"
+
+#include "CL/cl.h"
+#include "runtime/api/dispatch.h"
+#include <cstdint>
+
+// Base of the CL handle structs: each object carries its own copy of globalDispatchTable in `dispatch`.
+struct ClDispatch {
+    SEntryPointsTable dispatch;
+    ClDispatch() : dispatch(globalDispatchTable) {}
+};
+
+// Struct behind cl_accelerator_intel handles; like its siblings it only adds the ClDispatch base.
+struct _cl_accelerator_intel : ClDispatch {};
+
+// Struct behind cl_command_queue handles.
+struct _cl_command_queue : ClDispatch {};
+
+// device_queue is a type used internally
+struct _device_queue : _cl_command_queue {};
+// Pointer alias for the internal device queue struct.
+using device_queue = _device_queue *;
+
+struct _cl_context : ClDispatch {
+    bool isSharedContext = false; // starts out false for every new context object
+};
+
+// Struct behind cl_device_id handles.
+struct _cl_device_id : ClDispatch {};
+
+// Struct behind cl_event handles.
+struct _cl_event : ClDispatch {};
+
+// Struct behind cl_kernel handles.
+struct _cl_kernel : ClDispatch {};
+
+// Struct behind cl_mem handles.
+struct _cl_mem : ClDispatch {};
+
+// Struct behind cl_platform_id handles.
+struct _cl_platform_id : ClDispatch {};
+
+// Struct behind cl_program handles.
+struct _cl_program : ClDispatch {};
+
+// Struct behind cl_sampler handles.
+struct _cl_sampler : ClDispatch {};
+// True only when object is non-null and its dispatch.icdDispatch points at icdGlobalDispatchTable.
+template <typename Type>
+inline bool isValidObject(Type object) {
+    return object ? (&icdGlobalDispatchTable == object->dispatch.icdDispatch) : false;
+}
--- a/runtime/api/dispatch.cpp
+++ b/runtime/api/dispatch.cpp
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "dispatch.h"
+#include "api.h"
+// ICD dispatch table. The initializer is positional: every entry, including each nullptr, must stay in SDispatchTable member order.
+SDispatchTable icdGlobalDispatchTable =
+    {
+        clGetPlatformIDs,
+        clGetPlatformInfo,
+        clGetDeviceIDs,
+        clGetDeviceInfo,
+        clCreateContext,
+        clCreateContextFromType,
+        clRetainContext,
+        clReleaseContext,
+        clGetContextInfo,
+        clCreateCommandQueue,
+        clRetainCommandQueue,
+        clReleaseCommandQueue,
+        clGetCommandQueueInfo,
+        clSetCommandQueueProperty,
+        clCreateBuffer,
+        clCreateImage2D,
+        clCreateImage3D,
+        clRetainMemObject,
+        clReleaseMemObject,
+        clGetSupportedImageFormats,
+        clGetMemObjectInfo,
+        clGetImageInfo,
+        clCreateSampler,
+        clRetainSampler,
+        clReleaseSampler,
+        clGetSamplerInfo,
+        clCreateProgramWithSource,
+        clCreateProgramWithBinary,
+        clRetainProgram,
+        clReleaseProgram,
+        clBuildProgram,
+        clUnloadCompiler,
+        clGetProgramInfo,
+        clGetProgramBuildInfo,
+        clCreateKernel,
+        clCreateKernelsInProgram,
+        clRetainKernel,
+        clReleaseKernel,
+        clSetKernelArg,
+        clGetKernelInfo,
+        clGetKernelWorkGroupInfo,
+        clWaitForEvents,
+        clGetEventInfo,
+        clRetainEvent,
+        clReleaseEvent,
+        clGetEventProfilingInfo,
+        clFlush,
+        clFinish,
+        clEnqueueReadBuffer,
+        clEnqueueWriteBuffer,
+        clEnqueueCopyBuffer,
+        clEnqueueReadImage,
+        clEnqueueWriteImage,
+        clEnqueueCopyImage,
+        clEnqueueCopyImageToBuffer,
+        clEnqueueCopyBufferToImage,
+        clEnqueueMapBuffer,
+        clEnqueueMapImage,
+        clEnqueueUnmapMemObject,
+        clEnqueueNDRangeKernel,
+        clEnqueueTask,
+        clEnqueueNativeKernel,
+        clEnqueueMarker,
+        clEnqueueWaitForEvents,
+        clEnqueueBarrier,
+        clGetExtensionFunctionAddress,
+
+        /* cl_khr_gl_sharing: all nine slots are null in this static initializer */
+        nullptr, // NOTE(review): presumably these slots are filled in elsewhere when GL sharing is enabled - confirm
+        nullptr,
+        nullptr,
+        nullptr,
+        nullptr,
+        nullptr,
+        nullptr,
+        nullptr,
+        nullptr,
+
+        /* cl_khr_d3d10_sharing */
+        nullptr, // clGetDeviceIDsFromD3D10KHR,
+        nullptr, // clCreateFromD3D10BufferKHR,
+        nullptr, // clCreateFromD3D10Texture2DKHR,
+        nullptr, // clCreateFromD3D10Texture3DKHR,
+        nullptr, // clEnqueueAcquireD3D10ObjectsKHR,
+        nullptr, // clEnqueueReleaseD3D10ObjectsKHR,
+
+        /* OpenCL 1.1 */
+        clSetEventCallback,
+        clCreateSubBuffer,
+        clSetMemObjectDestructorCallback,
+        clCreateUserEvent,
+        clSetUserEventStatus,
+        clEnqueueReadBufferRect,
+        clEnqueueWriteBufferRect,
+        clEnqueueCopyBufferRect,
+
+        /* cl_ext_device_fission */
+        nullptr, //clCreateSubDevicesEXT,
+        nullptr, //clRetainDeviceEXT,
+        nullptr, //clReleaseDeviceEXT,
+
+        /* cl_khr_gl_event */
+        nullptr,
+
+        /* OpenCL 1.2 */
+        clCreateSubDevices,
+        clRetainDevice,
+        clReleaseDevice,
+        clCreateImage,
+        clCreateProgramWithBuiltInKernels,
+        clCompileProgram,
+        clLinkProgram,
+        clUnloadPlatformCompiler,
+        clGetKernelArgInfo,
+        clEnqueueFillBuffer,
+        clEnqueueFillImage,
+        clEnqueueMigrateMemObjects,
+        clEnqueueMarkerWithWaitList,
+        clEnqueueBarrierWithWaitList,
+        clGetExtensionFunctionAddressForPlatform,
+        nullptr, // NOTE(review): unnamed slot; by Khronos ICD ordering this is presumably clCreateFromGLTexture - confirm against SDispatchTable
+
+        /* cl_khr_d3d11_sharing */
+        nullptr, // clGetDeviceIDsFromD3D11KHR,
+        nullptr, // clCreateFromD3D11BufferKHR,
+        nullptr, // clCreateFromD3D11Texture2DKHR,
+        nullptr, // clCreateFromD3D11Texture3DKHR,
+        nullptr, // clCreateFromDX9MediaSurfaceKHR,
+        nullptr, // clEnqueueAcquireD3D11ObjectsKHR,
+        nullptr, // clEnqueueReleaseD3D11ObjectsKHR,
+
+        /* cl_khr_dx9_media_sharing */
+        nullptr, // clGetDeviceIDsFromDX9MediaAdapterKHR,
+        nullptr, // clEnqueueAcquireDX9MediaSurfacesKHR,
+        nullptr, // clEnqueueReleaseDX9MediaSurfacesKHR,
+
+        /* cl_khr_egl_image */
+        nullptr, //clCreateFromEGLImageKHR,
+        nullptr, //clEnqueueAcquireEGLObjectsKHR,
+        nullptr, //clEnqueueReleaseEGLObjectsKHR,
+
+        /* cl_khr_egl_event */
+        nullptr, //clCreateEventFromEGLSyncKHR,
+
+        /* OpenCL 2.0 */
+        clCreateCommandQueueWithProperties,
+        clCreatePipe,
+        clGetPipeInfo,
+        clSVMAlloc,
+        clSVMFree,
+        clEnqueueSVMFree,
+        clEnqueueSVMMemcpy,
+        clEnqueueSVMMemFill,
+        clEnqueueSVMMap,
+        clEnqueueSVMUnmap,
+        clCreateSamplerWithProperties,
+        clSetKernelArgSVMPointer,
+        clSetKernelExecInfo,
+        clGetKernelSubGroupInfoKHR,
+
+        /* OpenCL 2.1 */
+        clCloneKernel,
+        clCreateProgramWithIL,
+        clEnqueueSVMMigrateMem,
+        clGetDeviceAndHostTimer,
+        clGetHostTimer,
+        clGetKernelSubGroupInfo,
+        clSetDefaultDeviceCommandQueue,
+};
+SCRTDispatchTable crtGlobalDispatchTable = { // positional initializer: entries must stay in SCRTDispatchTable member order
+    clGetKernelArgInfo,
+
+    nullptr, // clGetDeviceIDsFromDX9INTEL,
+    nullptr, // clCreateFromDX9MediaSurfaceINTEL,
+    nullptr, // clEnqueueAcquireDX9ObjectsINTEL,
+    nullptr, // clEnqueueReleaseDX9ObjectsINTEL,
+    clGetImageParamsINTEL,
+    clCreatePerfCountersCommandQueueINTEL,
+
+    clCreateAcceleratorINTEL,
+    clGetAcceleratorInfoINTEL,
+    clRetainAcceleratorINTEL,
+    clReleaseAcceleratorINTEL,
+
+    nullptr, // NOTE(review): the ten unnamed null slots from here on are not identified in this file - check SCRTDispatchTable
+    nullptr,
+
+    nullptr,
+    nullptr,
+    nullptr,
+    nullptr,
+
+    nullptr,
+    nullptr,
+    nullptr,
+    nullptr,
+    clSetPerformanceConfigurationINTEL};
+// Pairs the addresses of the two tables defined in this file; ClDispatch copies this object into each CL object.
+SEntryPointsTable globalDispatchTable = {&icdGlobalDispatchTable, &crtGlobalDispatchTable};
--- a/runtime/api/dispatch.h
+++ b/runtime/api/dispatch.h
--- a/runtime/aub_mem_dump/aub_header.h
+++ b/runtime/aub_mem_dump/aub_header.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#pragma once
+
+#include <cstdint>
+#include <type_traits>
+
+#ifndef WIN32
+#pragma pack(4)
+#else
+#pragma pack(push, 4)
+#endif
+// 32-bit AUB command header: four bit-fields (16 + 7 + 6 + 3 bits) declared on a single uint32_t.
+struct AubCmdHdr {
+    uint32_t DwordLength : 16,
+        SubOp : 7,
+        Opcode : 6,
+        Type : 3;
+};
+static_assert(4 == sizeof(AubCmdHdr), "Invalid size for AubCmdHdr");
+// Bitmap-dump command record. The asserted 44-byte size depends on the enclosing pack(4): BaseAddr sits at offset 28.
+struct AubCmdDumpBmpHd {
+    AubCmdHdr Header;
+    uint32_t Xmin;
+    uint32_t Ymin;
+    uint32_t BufferPitch;
+    uint32_t BitsPerPixel : 8, // 8 + 8 + 16 bits fill one dword
+        Format : 8,
+        Reserved_0 : 16;
+    uint32_t Xsize;
+    uint32_t Ysize;
+    uint64_t BaseAddr; // 8-byte field at a 4-byte-aligned offset
+    uint32_t Secure : 1, // seven 1-bit flags followed by 25 reserved bits
+        UseFence : 1,
+        TileOn : 1,
+        WalkY : 1,
+        UsePPGTT : 1,
+        Use32BitDump : 1,
+        UseFullFormat : 1,
+        Reserved_1 : 25;
+    uint32_t DirectoryHandle;
+};
+static_assert(44 == sizeof(AubCmdDumpBmpHd), "Invalid size for AubCmdDumpBmpHd");
+// PPGTT context-create record: 12 bytes of header/handle/flags followed by four 64-bit page directory pointers (44 bytes under pack(4)).
+struct AubPpgttContextCreate {
+    AubCmdHdr Header;
+    uint32_t Handle;
+    uint32_t AdvancedContext : 1, // two 1-bit flags plus 30 reserved bits
+        SixtyFourBit : 1,
+        Reserved_31_2 : 30;
+    uint64_t PageDirPointer[4]; // starts at offset 12, which is only 4-byte aligned
+};
+static_assert(44 == sizeof(AubPpgttContextCreate), "Invalid size for AubPpgttContextCreate");
+// Binary-dump command record. The asserted 72-byte size depends on the enclosing pack(4): BaseAddr sits at offset 52.
+struct AubBinaryDump {
+    AubCmdHdr Header;
+    char OutputFile[40]; // fixed 40-byte name field
+    uint32_t Height;
+    uint32_t Width;
+    uint64_t BaseAddr;
+    uint32_t SurfaceType : 4, // 4 + 28 bits fill one dword
+        Pitch : 28;
+    uint32_t GttType : 2, // 2 + 30 bits fill one dword
+        Reserved_31_2 : 30;
+    uint32_t DirectoryHandle;
+};
+static_assert(72 == sizeof(AubBinaryDump), "Invalid size for AubBinaryDump");
+
+#ifndef WIN32
+#pragma pack()
+#else
+#pragma pack(pop)
+#endif
--- a/runtime/aub_mem_dump/aub_mem_dump.cpp
+++ b/runtime/aub_mem_dump/aub_mem_dump.cpp
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "config.h"
+#include "aub_mem_dump.h"
+#include "runtime/helpers/ptr_math.h"
+#include "runtime/helpers/debug_helpers.h"
+
+namespace AubMemDump {
+// All bits set except the low 12, i.e. 4096-byte granularity.
+const uint64_t g_pageMask = ~0xfffull;
+// 2^16 = 65536.
+const size_t g_dwordCountMax = 1u << 16;
+
+// Out-of-class definitions of the PageTableTraits<32> and PageTableTraits<48> static constants used in virtualizing the page tables.
+// clang-format off
+// 32 bit page table traits
+const uint64_t PageTableTraits<32>::physicalMemory = 0; // 1ull <<addressingBits;
+// PT level: entry count, table size in bytes (same count * sizeof(uint64_t)), and base address.
+const uint64_t PageTableTraits<32>::numPTEntries  = BIT(PageTableTraits<32>::addressingBits - PageTableTraits<32>::NUM_OFFSET_BITS);
+const uint64_t PageTableTraits<32>::sizePT        = BIT(PageTableTraits<32>::addressingBits - PageTableTraits<32>::NUM_OFFSET_BITS) * sizeof(uint64_t);
+const uint64_t PageTableTraits<32>::ptBaseAddress = BIT(38);
+// PD level: the exponent drops a further NUM_PTE_BITS.
+const uint64_t PageTableTraits<32>::numPDEntries  = BIT(PageTableTraits<32>::addressingBits - PageTableTraits<32>::NUM_OFFSET_BITS - PageTableTraits<32>::NUM_PTE_BITS);
+const uint64_t PageTableTraits<32>::sizePD        = BIT(PageTableTraits<32>::addressingBits - PageTableTraits<32>::NUM_OFFSET_BITS - PageTableTraits<32>::NUM_PTE_BITS) * sizeof(uint64_t);
+const uint64_t PageTableTraits<32>::pdBaseAddress = BIT(37);
+// PDP level: the exponent drops a further NUM_PDE_BITS.
+const uint64_t PageTableTraits<32>::numPDPEntries  = BIT(PageTableTraits<32>::addressingBits - PageTableTraits<32>::NUM_OFFSET_BITS - PageTableTraits<32>::NUM_PTE_BITS - PageTableTraits<32>::NUM_PDE_BITS);
+const uint64_t PageTableTraits<32>::sizePDP        = BIT(PageTableTraits<32>::addressingBits - PageTableTraits<32>::NUM_OFFSET_BITS - PageTableTraits<32>::NUM_PTE_BITS - PageTableTraits<32>::NUM_PDE_BITS) * sizeof(uint64_t);
+const uint64_t PageTableTraits<32>::pdpBaseAddress = BIT(36);
+
+// 48 bit page table traits (same formulas and the same PT/PD/PDP base addresses as the 32 bit set)
+const uint64_t PageTableTraits<48>::physicalMemory = 0; // 1ull <<addressingBits;
+// PT level
+const uint64_t PageTableTraits<48>::numPTEntries  = BIT(PageTableTraits<48>::addressingBits - PageTableTraits<48>::NUM_OFFSET_BITS);
+const uint64_t PageTableTraits<48>::sizePT        = BIT(PageTableTraits<48>::addressingBits - PageTableTraits<48>::NUM_OFFSET_BITS) * sizeof(uint64_t);
+const uint64_t PageTableTraits<48>::ptBaseAddress = BIT(38);
+// PD level
+const uint64_t PageTableTraits<48>::numPDEntries  = BIT(PageTableTraits<48>::addressingBits - PageTableTraits<48>::NUM_OFFSET_BITS - PageTableTraits<48>::NUM_PTE_BITS);
+const uint64_t PageTableTraits<48>::sizePD        = BIT(PageTableTraits<48>::addressingBits - PageTableTraits<48>::NUM_OFFSET_BITS - PageTableTraits<48>::NUM_PTE_BITS) * sizeof(uint64_t);
+const uint64_t PageTableTraits<48>::pdBaseAddress = BIT(37);
+// PDP level
+const uint64_t PageTableTraits<48>::numPDPEntries  = BIT(PageTableTraits<48>::addressingBits - PageTableTraits<48>::NUM_OFFSET_BITS - PageTableTraits<48>::NUM_PTE_BITS - PageTableTraits<48>::NUM_PDE_BITS);
+const uint64_t PageTableTraits<48>::sizePDP        = BIT(PageTableTraits<48>::addressingBits - PageTableTraits<48>::NUM_OFFSET_BITS - PageTableTraits<48>::NUM_PTE_BITS - PageTableTraits<48>::NUM_PDE_BITS) * sizeof(uint64_t);
+const uint64_t PageTableTraits<48>::pdpBaseAddress = BIT(36);
+const uint64_t PageTableTraits<48>::numPML4Entries  = BIT(NUM_PML4_BITS); // PML4 constants are defined for the 48 bit traits only
+const uint64_t PageTableTraits<48>::sizePML4        = BIT(NUM_PML4_BITS) * sizeof(uint64_t);
+const uint64_t PageTableTraits<48>::pml4BaseAddress = BIT(35);
+// clang-format on
+
+// Stores two dwords at the ring-tail slot of the LRCA image: mmioBase + 0x2030, then ringTail.
+void LrcaHelper::setRingTail(void *pLRCIn, uint32_t ringTail) const {
+    auto slot = ptrOffset(reinterpret_cast<uint32_t *>(pLRCIn), offsetContext + offsetRingRegisters + offsetRingTail);
+    slot[0] = mmioBase + 0x2030;
+    slot[1] = ringTail;
+}
+
+// Stores two dwords at the ring-head slot of the LRCA image: mmioBase + 0x2034, then ringHead.
+void LrcaHelper::setRingHead(void *pLRCIn, uint32_t ringHead) const {
+    auto slot = ptrOffset(reinterpret_cast<uint32_t *>(pLRCIn), offsetContext + offsetRingRegisters + offsetRingHead);
+    slot[0] = mmioBase + 0x2034;
+    slot[1] = ringHead;
+}
+
+// Stores two dwords at the ring-base slot of the LRCA image: mmioBase + 0x2038, then ringBase.
+void LrcaHelper::setRingBase(void *pLRCIn, uint32_t ringBase) const {
+    auto slot = ptrOffset(reinterpret_cast<uint32_t *>(pLRCIn), offsetContext + offsetRingRegisters + offsetRingBase);
+    slot[0] = mmioBase + 0x2038;
+    slot[1] = ringBase;
+}
+
+// Stores two dwords at the ring-control slot of the LRCA image: mmioBase + 0x203c, then ringCtrl.
+void LrcaHelper::setRingCtrl(void *pLRCIn, uint32_t ringCtrl) const {
+    auto slot = ptrOffset(reinterpret_cast<uint32_t *>(pLRCIn), offsetContext + offsetRingRegisters + offsetRingCtrl);
+    slot[0] = mmioBase + 0x203c;
+    slot[1] = ringCtrl;
+}
+
+// Stores the PDP0 entry as four dwords: (mmioBase + 0x2274, upper 32 bits of address),
+// then (mmioBase + 0x2270, lower 32 bits of address).
+void LrcaHelper::setPDP0(void *pLRCIn, uint64_t address) const {
+    auto slot = ptrOffset(reinterpret_cast<uint32_t *>(pLRCIn), offsetContext + offsetPageTableRegisters + offsetPDP0);
+    slot[0] = mmioBase + 0x2274;
+    slot[1] = static_cast<uint32_t>(address >> 32);
+    slot[2] = mmioBase + 0x2270;
+    slot[3] = static_cast<uint32_t>(address & 0xffffffff);
+}
+
+// Stores the PDP1 entry as four dwords: (mmioBase + 0x227c, upper 32 bits of address),
+// then (mmioBase + 0x2278, lower 32 bits of address).
+void LrcaHelper::setPDP1(void *pLRCIn, uint64_t address) const {
+    auto slot = ptrOffset(reinterpret_cast<uint32_t *>(pLRCIn), offsetContext + offsetPageTableRegisters + offsetPDP1);
+    slot[0] = mmioBase + 0x227c;
+    slot[1] = static_cast<uint32_t>(address >> 32);
+    slot[2] = mmioBase + 0x2278;
+    slot[3] = static_cast<uint32_t>(address & 0xffffffff);
+}
+
+// Stores the PDP2 entry as four dwords: (mmioBase + 0x2284, upper 32 bits of address),
+// then (mmioBase + 0x2280, lower 32 bits of address).
+void LrcaHelper::setPDP2(void *pLRCIn, uint64_t address) const {
+    auto slot = ptrOffset(reinterpret_cast<uint32_t *>(pLRCIn), offsetContext + offsetPageTableRegisters + offsetPDP2);
+    slot[0] = mmioBase + 0x2284;
+    slot[1] = static_cast<uint32_t>(address >> 32);
+    slot[2] = mmioBase + 0x2280;
+    slot[3] = static_cast<uint32_t>(address & 0xffffffff);
+}
+
+// Stores the PDP3 entry as four dwords: (mmioBase + 0x228c, upper 32 bits of address),
+// then (mmioBase + 0x2288, lower 32 bits of address).
+void LrcaHelper::setPDP3(void *pLRCIn, uint64_t address) const {
+    auto slot = ptrOffset(reinterpret_cast<uint32_t *>(pLRCIn), offsetContext + offsetPageTableRegisters + offsetPDP3);
+    slot[0] = mmioBase + 0x228c;
+    slot[1] = static_cast<uint32_t>(address >> 32);
+    slot[2] = mmioBase + 0x2288;
+    slot[3] = static_cast<uint32_t>(address & 0xffffffff);
+}
+// The PML4 address goes into the PDP0 slot: this forwards straight to setPDP0.
+void LrcaHelper::setPML4(void *pLRCIn, uint64_t address) const {
+    this->setPDP0(pLRCIn, address);
+}
+
+void LrcaHelper::initialize(void *pLRCIn) const {
+    auto image = reinterpret_cast<uint32_t *>(pLRCIn);
+
+    // Pre-fill every dword of the LRCA image with 0x1: known but benign garbage
+    for (size_t dword = 0; dword < sizeLRCA / sizeof(uint32_t); ++dword) {
+        image[dword] = 0x1;
+    }
+
+    auto context = ptrOffset(image, offsetContext);
+
+    // First LRI block (ring context): one header dword, then numRegsLRI0 register/value pairs
+    auto cursor = ptrOffset(context, offsetLRI0);
+    auto pairCount = numRegsLRI0;
+    *cursor++ = 0x11001000 | (2 * pairCount - 1);
+    for (auto left = pairCount; left > 0; --left) {
+        *cursor++ = mmioBase + 0x2244; // CTXT_SR_CTL
+        *cursor++ = 0x00010001;        // Inhibit context-restore
+    }
+
+    // Second LRI block; the debug check expects offsetLRI1 to equal 0x21 dwords
+    DEBUG_BREAK_IF(offsetLRI1 != 0x21 * sizeof(uint32_t));
+    cursor = ptrOffset(context, offsetLRI1);
+    pairCount = numRegsLRI1;
+    *cursor++ = 0x11001000 | (2 * pairCount - 1);
+    for (auto left = pairCount; left > 0; --left) {
+        *cursor++ = mmioBase + 0x2094; // NOP ID
+        *cursor++ = 0x00000000;
+    }
+
+    DEBUG_BREAK_IF(offsetLRI2 != 0x41 * sizeof(uint32_t));
+    cursor = ptrOffset(context, offsetLRI2);
+    pairCount = numRegsLRI2;
+    *cursor++ = 0x11000000 | (2 * pairCount - 1);
+    for (auto left = pairCount; left > 0; --left) {
+        *cursor++ = mmioBase + 0x2094; // NOP ID
+        *cursor++ = 0x00000000;
+    }
+
+    setRingHead(pLRCIn, 0);
+    setRingTail(pLRCIn, 0);
+    setRingBase(pLRCIn, 0);
+    setRingCtrl(pLRCIn, 0);
+
+    setPDP0(pLRCIn, 0);
+    setPDP1(pLRCIn, 0);
+    setPDP2(pLRCIn, 0);
+    setPDP3(pLRCIn, 0);
+}
+}
--- a/runtime/aub_mem_dump/aub_mem_dump.h
+++ b/runtime/aub_mem_dump/aub_mem_dump.h
@@ -0,0 +1,382 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+#include <cstdio>
+#include <cstdint>
+#include <fstream>
+
+#ifndef BIT
+#define BIT(x) (((uint64_t)1) << (x))
+#endif
+
+namespace AubMemDump {
+#include "aub_services.h"
+
+template <typename Cmd>
+inline void setAddress(Cmd &cmd, uint64_t address) {
+    cmd.address = address; // generic case: the command's own address field takes the value directly
+}
+// CmdServicesMemTraceMemoryCompare has separate address/addressHigh fields, so each 32-bit half is stored on its own.
+template <>
+inline void setAddress(CmdServicesMemTraceMemoryCompare &cmd, uint64_t address) {
+    cmd.addressHigh = static_cast<uint32_t>(address >> 32);
+    cmd.address = static_cast<uint32_t>(address & 0xffffffff);
+}
+
+template <typename TypeTrue, typename TypeFalse, bool is32Bits>
+struct TypeSelector { // primary template: used whenever the flag is true
+    using type = TypeTrue;
+};
+// Partial specialization: a false flag selects the second type instead.
+template <typename TypeTrue, typename TypeFalse>
+struct TypeSelector<TypeTrue, TypeFalse, false> {
+    using type = TypeFalse;
+};
+
+union IAPageTableEntry { // one 64-bit table entry, viewable as named bit-fields, two dwords, or a single qword
+    struct
+    {
+        uint64_t Present : 1;          //[0]
+        uint64_t Writable : 1;         //[1]
+        uint64_t UserSupervisor : 1;   //[2]
+        uint64_t PWT : 1;              //[3]
+        uint64_t PCD : 1;              //[4]
+        uint64_t Accessed : 1;         //[5]
+        uint64_t Dirty : 1;            //[6]
+        uint64_t PAT : 1;              //[7]
+        uint64_t Global : 1;           //[8]
+        uint64_t Reserved_11_9 : 3;    //[11:9]
+        uint64_t PhysicalAddress : 27; //[38:12] setGttEntry stores address / 4096 here
+        uint64_t Reserved_51_39 : 13;  //[51:39]
+        uint64_t Ignored : 11;         //[62:52]
+        uint64_t ExecuteDisable : 1;   //[63]
+    } pageConfig;          // bit-field view; the declared widths add up to 64 bits
+    uint32_t dwordData[2]; // the same storage as two 32-bit words
+    uint64_t uiData;       // the same storage as one integer; setGttEntry zeroes the entry through this member
+};
+
+typedef IAPageTableEntry MiGttEntry; // alias used by the GTT-writing code (see sizeof(MiGttEntry) in reserveAddress)
+
+static inline void setGttEntry(IAPageTableEntry &entry, uint64_t address) {
+    entry.uiData = 0; // clear every bit before setting individual fields
+    entry.pageConfig.UserSupervisor = 1;
+    entry.pageConfig.Writable = 1;
+    entry.pageConfig.Present = 1;
+    entry.pageConfig.PhysicalAddress = address >> 12; // 4096-byte page number, stored in the 27-bit field
+}
+
+// Use the latest DeviceValues enumerations available
+using DeviceValues = CmdServicesMemTraceVersion::DeviceValues;
+using SteppingValues = CmdServicesMemTraceVersion::SteppingValues;
+using AddressSpaceValues = CmdServicesMemTraceMemoryWrite::AddressSpaceValues;
+using DataTypeHintValues = CmdServicesMemTraceMemoryWrite::DataTypeHintValues;
+using TilingValues = CmdServicesMemTraceMemoryDump::TilingValues;
+using RepeatMemoryValues = CmdServicesMemTraceMemoryWrite::RepeatMemoryValues;
+using MessageSourceIdValues = CmdServicesMemTraceRegisterWrite::MessageSourceIdValues;
+using RegisterSizeValues = CmdServicesMemTraceRegisterWrite::RegisterSizeValues;
+using RegisterSpaceValues = CmdServicesMemTraceRegisterWrite::RegisterSpaceValues;
+using DataSizeValues = CmdServicesMemTraceMemoryPoll::DataSizeValues;
+
+template <int deviceIn, int addressingBitsIn>
+struct Traits {
+    using Stream = struct AubStream; // stream interface used by the helpers parameterised on these traits
+
+    enum { // the template arguments, republished as named compile-time constants
+        device = deviceIn,
+        addressingBits = addressingBitsIn
+    };
+};
+
+struct AubStream { // interface for sinks that receive AUB trace commands; AubFileStream derives from it
+    virtual ~AubStream() = default; // polymorphic base: keeps deletion through an AubStream pointer well-defined
+    virtual void open(const char *filePath) = 0;
+    virtual void close() = 0;
+    virtual bool init(uint32_t stepping, uint32_t device) = 0;
+    virtual void createContext(const AubPpgttContextCreate &cmd) {} // optional hook: the default does nothing
+    virtual void writeMemory(uint64_t physAddress, const void *memory, size_t sizeToDumpThisIteration, uint32_t addressSpace, uint32_t hint) = 0;
+    virtual void writeMemoryWriteHeader(uint64_t physAddress, size_t size, uint32_t addressSpace, uint32_t hint) = 0;
+    // Convenience overload without a hint: forwards to the four-argument version with TraceNotype.
+    virtual void writeMemoryWriteHeader(uint64_t physAddress, size_t size, uint32_t addressSpace) { writeMemoryWriteHeader(physAddress, size, addressSpace, CmdServicesMemTraceMemoryWrite::DataTypeHintValues::TraceNotype); }
+    virtual void writePTE(uint64_t physAddress, uint64_t entry) = 0;
+    virtual void writeGTT(uint32_t offset, uint64_t entry) = 0;
+    virtual void writeMMIO(uint32_t offset, uint32_t value) = 0;
+    virtual void registerPoll(uint32_t registerOffset, uint32_t mask, uint32_t value, bool pollNotEqual, uint32_t timeoutAction) = 0;
+};
+
+struct AubFileStream : public AubStream { // AubStream implementation that carries a std::ofstream member (fileHandle)
+    void open(const char *filePath) override;
+    void close() override;
+    bool init(uint32_t stepping, uint32_t device) override;
+    void createContext(const AubPpgttContextCreate &cmd) override;
+    void writeMemory(uint64_t physAddress, const void *memory, size_t size, uint32_t addressSpace, uint32_t hint) override;
+    void writeMemoryWriteHeader(uint64_t physAddress, size_t size, uint32_t addressSpace, uint32_t hint) override;
+    void writePTE(uint64_t physAddress, uint64_t entry) override;
+    void writeGTT(uint32_t offset, uint64_t entry) override;
+    void writeMMIO(uint32_t offset, uint32_t value) override;
+    void registerPoll(uint32_t registerOffset, uint32_t mask, uint32_t value, bool pollNotEqual, uint32_t timeoutAction) override;
+    void expectMemory(uint64_t physAddress, const void *memory, size_t size); // extra member, not declared in AubStream
+    void addComment(const char *message);                                     // extra member, not declared in AubStream
+    // Output file stream; presumably where the overrides above write (their definitions are outside this header).
+    std::ofstream fileHandle;
+};
+
+template <int addressingBits>
+struct PageTableTraits { // primary template is empty; the 32 and 48 specializations that follow carry the constants
+};
+
+template <>
+struct PageTableTraits<32> {
+    // clang-format off
+    enum {
+        addressingBits  = 32,
+        NUM_OFFSET_BITS = 12, // 2^12 = 4096-byte pages
+        NUM_PTE_BITS    =  9,
+        NUM_PDE_BITS    =  9,
+        NUM_PDP_BITS    = addressingBits - NUM_PDE_BITS - NUM_PTE_BITS - NUM_OFFSET_BITS, // 32 - 9 - 9 - 12 = 2
+    };
+    // The constants below are only declared here; their values are defined outside this header.
+    static const uint64_t physicalMemory;
+    static const uint64_t numPTEntries;
+    static const uint64_t sizePT;
+    static const uint64_t ptBaseAddress; // base used by AubPageTableHelper::getPTEAddress
+
+    static const uint64_t numPDEntries;
+    static const uint64_t sizePD;
+    static const uint64_t pdBaseAddress; // base used by AubPageTableHelper::getPDEAddress
+
+    static const uint64_t numPDPEntries;
+    static const uint64_t sizePDP;
+    static const uint64_t pdpBaseAddress; // base used by AubPageTableHelper::getPDPAddress
+    // clang-format on
+};
+
+template <>
+struct PageTableTraits<48> {
+    // clang-format off
+    enum {
+        addressingBits  = 48,
+        NUM_OFFSET_BITS = PageTableTraits<32>::NUM_OFFSET_BITS,
+        NUM_PTE_BITS    = PageTableTraits<32>::NUM_PTE_BITS,
+        NUM_PDE_BITS    = PageTableTraits<32>::NUM_PDE_BITS,
+        NUM_PDP_BITS    = PageTableTraits<32>::NUM_PDP_BITS, // NOTE(review): evaluates to 2 (inherited from the 32-bit traits)
+        NUM_PML4_BITS   = addressingBits - NUM_PDP_BITS - NUM_PDE_BITS - NUM_PTE_BITS - NUM_OFFSET_BITS // 48 - 2 - 9 - 9 - 12 = 16
+    };
+    // NOTE(review): AubPageTableHelper64::reserveAddressPPGTT shifts by 9 bits per level; confirm 2/16 is intended here.
+    static const uint64_t physicalMemory;
+    static const uint64_t numPTEntries;
+    static const uint64_t sizePT;
+    static const uint64_t ptBaseAddress; // base used by AubPageTableHelper::getPTEAddress
+
+    static const uint64_t numPDEntries;
+    static const uint64_t sizePD;
+    static const uint64_t pdBaseAddress; // base used by AubPageTableHelper::getPDEAddress
+
+    static const uint64_t numPDPEntries;
+    static const uint64_t sizePDP;
+    static const uint64_t pdpBaseAddress; // base used by AubPageTableHelper::getPDPAddress
+
+    static const uint64_t numPML4Entries;
+    static const uint64_t sizePML4;
+    static const uint64_t pml4BaseAddress; // base used by AubPageTableHelper64::getPML4Address
+    // clang-format on
+};
+
+template <typename Traits>
+struct AubPageTableHelper {
+    using PageTableTraits = AubMemDump::PageTableTraits<Traits::addressingBits>;
+
+    enum {
+        addressingBits = Traits::addressingBits
+    };
+    // Host pointer value used directly as a PPGTT address (no translation is applied).
+    static inline uintptr_t ptrToPPGTT(const void *memory) {
+        return reinterpret_cast<uintptr_t>(memory);
+    }
+    // Host pointer value narrowed to the 32 bits used for a GGTT address.
+    static inline uint32_t ptrToGGTT(const void *memory) {
+        return static_cast<uint32_t>(ptrToPPGTT(memory));
+    }
+    // Address of page-table entry ptIndex: 8-byte entries counted from ptBaseAddress.
+    static inline uint64_t getPTEAddress(uint64_t ptIndex) {
+        return ptIndex * sizeof(uint64_t) + PageTableTraits::ptBaseAddress;
+    }
+    // Address of page-directory entry pdIndex: 8-byte entries counted from pdBaseAddress.
+    static inline uint64_t getPDEAddress(uint64_t pdIndex) {
+        return pdIndex * sizeof(uint64_t) + PageTableTraits::pdBaseAddress;
+    }
+    // Address of page-directory-pointer entry pdpIndex: 8-byte entries counted from pdpBaseAddress.
+    static inline uint64_t getPDPAddress(uint64_t pdpIndex) {
+        return pdpIndex * sizeof(uint64_t) + PageTableTraits::pdpBaseAddress;
+    }
+};
+
+template <typename Traits>
+struct AubPageTableHelper32 : public AubPageTableHelper<Traits>, PageTableTraits<32> {
+    using BaseClass = AubPageTableHelper<Traits>;
+    // Stream-facing operations for the 32-bit layout; they are defined in aub_mem_dump.inl.
+    static void createContext(typename Traits::Stream &stream, uint32_t context);
+    static uint64_t reserveAddressPPGTT(typename Traits::Stream &stream, uintptr_t gfxAddress, size_t blockSize, uint64_t physAddress);
+    // Patches page-directory addresses into the context image that starts at pLrc.
+    static void fixupLRC(uint8_t *pLrc);
+};
+
+template <typename Traits>
+struct AubPageTableHelper64 : public AubPageTableHelper<Traits>, PageTableTraits<48> {
+    using BaseClass = AubPageTableHelper<Traits>;
+    // Address of PML4 entry pml4Index: 8-byte entries counted from pml4BaseAddress.
+    static inline uint64_t getPML4Address(uint64_t pml4Index) {
+        return pml4Index * sizeof(uint64_t) + pml4BaseAddress;
+    }
+    // Stream-facing operations for the 48-bit layout; they are defined in aub_mem_dump.inl.
+    static void createContext(typename Traits::Stream &stream, uint32_t context);
+    static uint64_t reserveAddressPPGTT(typename Traits::Stream &stream, uintptr_t gfxAddress, size_t blockSize, uint64_t physAddress);
+    // Patches the PML4 address into the context image that starts at pLrc.
+    static void fixupLRC(uint8_t *pLrc);
+};
+
+template <typename TraitsIn>
+struct AubDump : public TypeSelector<AubPageTableHelper32<TraitsIn>, AubPageTableHelper64<TraitsIn>, TraitsIn::addressingBits == 32>::type {
+    using Traits = TraitsIn;
+    using AddressType = typename TypeSelector<uint32_t, uint64_t, Traits::addressingBits == 32>::type;
+    using BaseHelper = typename TypeSelector<AubPageTableHelper32<Traits>, AubPageTableHelper64<Traits>, Traits::addressingBits == 32>::type;
+    using Stream = typename Traits::Stream;
+    // 64-bit context descriptor, viewable as named bit-fields, two dwords, or one qword.
+    typedef union _MiContextDescriptorReg_ {
+        struct {
+            uint64_t Valid : 1;                  //[0]
+            uint64_t ForcePageDirRestore : 1;    //[1]
+            uint64_t ForceRestore : 1;           //[2]
+            uint64_t Legacy : 1;                 //[3]
+            uint64_t ADor64bitSupport : 1;       //[4] Selects 64-bit PPGTT in Legacy mode
+            uint64_t LlcCoherencySupport : 1;    //[5]
+            uint64_t FaultSupport : 2;           //[7:6]
+            uint64_t PrivilegeAccessOrPPGTT : 1; //[8] Selects PPGTT in Legacy mode
+            uint64_t FunctionType : 3;           //[11:9]
+            uint64_t LogicalRingCtxAddress : 20; //[31:12]
+            uint64_t ContextID : 32;             //[63:32]
+        } sData;
+        uint32_t ulData[2];
+        uint64_t qwordData[2 / 2];
+    } MiContextDescriptorReg, *pMiContextDescriptorReg;
+
+    // Writes a block of memory to the given address space; the hint defaults to TraceNotype.
+    static void addMemoryWrite(Stream &stream, uint64_t addr, const void *memory, size_t blockSize, int addressSpace, int hint = DataTypeHintValues::TraceNotype);
+    static uint64_t reserveAddressGGTT(Stream &stream, uint32_t addr, size_t size, uint64_t physStart);     // GGTT address given as a number
+    static uint64_t reserveAddressGGTT(Stream &stream, const void *memory, size_t size, uint64_t physStart); // GGTT address taken from a pointer
+
+  private:
+    static uint64_t reserveAddress(Stream &stream, uint32_t addr, size_t size, unsigned int addressSpace /* = AddressSpaceValues::TraceGttEntry*/, uint64_t physStart);
+};
+
+struct LrcaHelper { // layout constants and field setters for a logical ring context image (LRCA)
+    LrcaHelper(uint32_t base) : mmioBase(base) {
+    }
+    // AUB data-type hints; the engine-specific subclasses (Rcs/Bcs/Vcs/Vecs) overwrite some of them.
+    int aubHintLRCA = DataTypeHintValues::TraceNotype;
+    int aubHintCommandBuffer = DataTypeHintValues::TraceCommandBuffer;
+    int aubHintBatchBuffer = DataTypeHintValues::TraceBatchBuffer;
+
+    const char *name = "XCS"; // label; subclasses replace it with "RCS", "BCS", "VCS" or "VECS"
+    uint32_t mmioBase = 0;    // set by the constructor; added to register offsets written into the image
+
+    size_t sizeLRCA = 0x2000;        // number of bytes initialize() fills; LrcaHelperRcs raises it to 0x11000
+    uint32_t alignLRCA = 0x1000;
+    uint32_t offsetContext = 0x1000; // byte offset of the context within the LRCA; the LRI offsets below are applied on top of it
+
+    uint32_t offsetLRI0 = 0x01 * sizeof(uint32_t);
+    uint32_t numRegsLRI0 = 14; // (register, value) pairs written after the first LRI header
+
+    uint32_t numNoops0 = 3; // extra dwords counted between the first block's pairs and offsetLRI1
+
+    uint32_t offsetLRI1 = offsetLRI0 + (1 + numRegsLRI0 * 2 + numNoops0) * sizeof(uint32_t); // must equal 0x21 * sizeof(uint32_t); initialize() checks this
+    uint32_t numRegsLRI1 = 9;
+
+    uint32_t numNoops1 = 13; // extra dwords counted between the second block's pairs and offsetLRI2
+
+    uint32_t offsetLRI2 = offsetLRI1 + (1 + numRegsLRI1 * 2 + numNoops1) * sizeof(uint32_t); // must equal 0x41 * sizeof(uint32_t); initialize() checks this
+    uint32_t numRegsLRI2 = 1;
+    // Ring registers: block base, then per-register offsets (presumably relative to that base; the setters are defined elsewhere).
+    uint32_t offsetRingRegisters = offsetLRI0 + (3 * sizeof(uint32_t));
+    uint32_t offsetRingHead = 0x0 * sizeof(uint32_t);
+    uint32_t offsetRingTail = 0x2 * sizeof(uint32_t);
+    uint32_t offsetRingBase = 0x4 * sizeof(uint32_t);
+    uint32_t offsetRingCtrl = 0x6 * sizeof(uint32_t);
+    // Page-table registers: setPDP3 adds offsetContext + offsetPageTableRegisters + offsetPDP3; PDP3 is first, PDP0 last.
+    uint32_t offsetPageTableRegisters = offsetLRI1 + (3 * sizeof(uint32_t));
+    uint32_t offsetPDP0 = 0xc * sizeof(uint32_t);
+    uint32_t offsetPDP1 = 0x8 * sizeof(uint32_t);
+    uint32_t offsetPDP2 = 0x4 * sizeof(uint32_t);
+    uint32_t offsetPDP3 = 0x0 * sizeof(uint32_t);
+
+    void initialize(void *pLRCIn) const; // fills the image, writes the three LRI blocks, zeroes ring and PDP fields
+    void setRingHead(void *pLRCIn, uint32_t ringHead) const;
+    void setRingTail(void *pLRCIn, uint32_t ringTail) const;
+    void setRingBase(void *pLRCIn, uint32_t ringBase) const;
+    void setRingCtrl(void *pLRCIn, uint32_t ringCtrl) const;
+
+    void setPDP0(void *pLRCIn, uint64_t address) const;
+    void setPDP1(void *pLRCIn, uint64_t address) const;
+    void setPDP2(void *pLRCIn, uint64_t address) const;
+    void setPDP3(void *pLRCIn, uint64_t address) const;
+
+    void setPML4(void *pLRCIn, uint64_t address) const; // forwards to setPDP0
+};
+
+struct LrcaHelperRcs : public LrcaHelper { // LrcaHelper preset labelled "RCS"
+    LrcaHelperRcs(uint32_t base) : LrcaHelper(base) {
+        name = "RCS";
+        sizeLRCA = 0x11000; // replaces the 0x2000 default from LrcaHelper
+        aubHintBatchBuffer = DataTypeHintValues::TraceBatchBufferPrimary;
+        aubHintCommandBuffer = DataTypeHintValues::TraceCommandBufferPrimary;
+        aubHintLRCA = DataTypeHintValues::TraceLogicalRingContextRcs;
+    }
+};
+
+struct LrcaHelperBcs : public LrcaHelper { // LrcaHelper preset labelled "BCS"; sizes and offsets keep the base defaults
+    LrcaHelperBcs(uint32_t base) : LrcaHelper(base) {
+        name = "BCS";
+        aubHintBatchBuffer = DataTypeHintValues::TraceBatchBufferBlt;
+        aubHintCommandBuffer = DataTypeHintValues::TraceCommandBufferBlt;
+        aubHintLRCA = DataTypeHintValues::TraceLogicalRingContextBcs;
+    }
+};
+
+struct LrcaHelperVcs : public LrcaHelper { // LrcaHelper preset labelled "VCS"; sizes and offsets keep the base defaults
+    LrcaHelperVcs(uint32_t base) : LrcaHelper(base) {
+        name = "VCS";
+        aubHintBatchBuffer = DataTypeHintValues::TraceBatchBufferMfx;
+        aubHintCommandBuffer = DataTypeHintValues::TraceCommandBufferMfx;
+        aubHintLRCA = DataTypeHintValues::TraceLogicalRingContextVcs;
+    }
+};
+
+struct LrcaHelperVecs : public LrcaHelper { // LrcaHelper preset labelled "VECS"
+    LrcaHelperVecs(uint32_t base) : LrcaHelper(base) {
+        name = "VECS";
+        aubHintLRCA = DataTypeHintValues::TraceLogicalRingContextVecs; // command/batch buffer hints keep the base defaults
+    }
+};
+
+extern const uint64_t g_pageMask;    // mask the .inl code ANDs with addresses to obtain page start values; defined elsewhere
+extern const size_t g_dwordCountMax; // dword limit addMemoryWrite uses to size each chunk; defined elsewhere
+}
--- a/runtime/aub_mem_dump/aub_mem_dump.inl
+++ b/runtime/aub_mem_dump/aub_mem_dump.inl
@@ -0,0 +1,328 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "config.h"
+#include "aub_mem_dump.h"
+#include "runtime/helpers/debug_helpers.h"
+#include "runtime/helpers/ptr_math.h"
+#include <algorithm>
+#include <cstring>
+
+namespace AubMemDump {
+
+template <typename Traits>
+void AubPageTableHelper32<Traits>::fixupLRC(uint8_t *pLRC) {
+    // Patches four page-directory addresses into the context image at pLRC. They occupy four
+    // 0x10-byte slots starting at byte offset 0x1090; inside a slot the upper 32 address bits
+    // go to +0x4 and the lower 32 bits to +0xc. The first slot receives the address of PD
+    // entry 0x600, the following slots those of entries 0x400, 0x200 and 0.
+    const uint64_t pdIndexPerSlot[] = {0x600, 0x400, 0x200, 0x000};
+
+    uint8_t *slot = pLRC + 0x1090;
+    for (const uint64_t pdIndex : pdIndexPerSlot) {
+        const uint64_t pdAddress = BaseClass::getPDEAddress(pdIndex);
+
+        // The upper half is stored before the lower half.
+        *reinterpret_cast<uint32_t *>(slot + 0x4) = static_cast<uint32_t>(pdAddress >> 32);
+        *reinterpret_cast<uint32_t *>(slot + 0xc) = static_cast<uint32_t>(pdAddress & 0xffffffff);
+
+        // Advance to the next 0x10-byte slot.
+        slot += 0x10;
+    }
+}
+
+template <typename Traits>
+void AubPageTableHelper64<Traits>::fixupLRC(uint8_t *pLRC) {
+    // Store the address of PML4 entry 0 into the image: upper 32 bits at byte offset 0x10c4, lower 32 bits at 0x10cc.
+    const uint64_t pml4Address = getPML4Address(0);
+    *reinterpret_cast<uint32_t *>(pLRC + 0x10c4) = static_cast<uint32_t>(pml4Address >> 32);
+    *reinterpret_cast<uint32_t *>(pLRC + 0x10cc) = static_cast<uint32_t>(pml4Address & 0xffffffff);
+}
+
+// Write a block of memory to a given address space using an optional hint; the data is split into chunks of at most blockSizeMax bytes.
+template <typename Traits>
+void AubDump<Traits>::addMemoryWrite(typename Traits::Stream &stream, uint64_t addr, const void *memory, size_t sizeRemaining, int addressSpace, int hint) {
+    // We can only dump a relatively small amount per CmdServicesMemTraceMemoryWrite
+    auto sizeMemoryWriteHeader = sizeof(CmdServicesMemTraceMemoryWrite) - sizeof(CmdServicesMemTraceMemoryWrite::data);
+    auto blockSizeMax = g_dwordCountMax * sizeof(uint32_t) - sizeMemoryWriteHeader;
+    // Logical-ring-context images get their page-table pointers patched in place before being dumped.
+    if (hint == CmdServicesMemTraceMemoryWrite::DataTypeHintValues::TraceLogicalRingContextRcs ||
+        hint == CmdServicesMemTraceMemoryWrite::DataTypeHintValues::TraceLogicalRingContextBcs ||
+        hint == CmdServicesMemTraceMemoryWrite::DataTypeHintValues::TraceLogicalRingContextVcs ||
+        hint == CmdServicesMemTraceMemoryWrite::DataTypeHintValues::TraceLogicalRingContextVecs) {
+        DEBUG_BREAK_IF(sizeRemaining < 0x10cc + sizeof(uint32_t)); // fixupLRC stores a whole dword at byte offset 0x10cc
+        uint8_t *pLRC = reinterpret_cast<uint8_t *>(const_cast<void *>(memory)); // NOTE: the caller's buffer is modified despite the const parameter
+        BaseHelper::fixupLRC(pLRC);
+    }
+
+    // loop to dump all of the blocks
+    while (sizeRemaining > 0) {
+        auto sizeThisIteration = std::min(blockSizeMax, sizeRemaining);
+        stream.writeMemory(addr, memory, sizeThisIteration, addressSpace, hint);
+
+        sizeRemaining -= sizeThisIteration;
+        memory = static_cast<const uint8_t *>(memory) + sizeThisIteration; // advance without dropping const
+        addr += sizeThisIteration;
+    }
+}
+
+// Reserve memory in the GGTT: writes one GTT entry for every 4096-byte page touched by the
+// byte range addr .. addr + size - 1, mapping consecutive pages to consecutive physical pages
+// starting at physStart. Returns physStart unchanged.
+template <typename Traits>
+uint64_t AubDump<Traits>::reserveAddress(typename Traits::Stream &stream, uint32_t addr, size_t size, unsigned int addressSpace, uint64_t physStart) {
+    const auto firstPage = addr & g_pageMask;
+    const auto lastPage = (addr + size - 1) & g_pageMask;
+    const auto pageCount = static_cast<uint32_t>(((lastPage - firstPage) / 4096) + 1);
+
+    // Can only handle 16 bits of dwordCount.
+    DEBUG_BREAK_IF(pageCount == 0 || (pageCount + 4) >= 65536);
+
+    // Byte offset of the first entry within the GTT: one MiGttEntry per page.
+    auto entryOffset = static_cast<uint32_t>((static_cast<uint32_t>(firstPage) / 4096) * sizeof(MiGttEntry));
+
+    // One header covering the whole run of entries, followed by the entries themselves.
+    stream.writeMemoryWriteHeader(entryOffset, pageCount * sizeof(AubMemDump::MiGttEntry), addressSpace, AubMemDump::CmdServicesMemTraceMemoryWrite::DataTypeHintValues::TraceNotype);
+
+    uint64_t physPage = physStart;
+    for (auto page = firstPage; page <= lastPage; page += 4096) {
+        MiGttEntry entry;
+        setGttEntry(entry, physPage);
+
+        stream.writeGTT(entryOffset, entry.uiData);
+        entryOffset += sizeof(entry);
+
+        physPage += 4096;
+    }
+
+    return physStart;
+}
+
+template <typename Traits>
+uint64_t AubDump<Traits>::reserveAddressGGTT(typename Traits::Stream &stream, uint32_t addr, size_t size, uint64_t physStart) {
+    return reserveAddress(stream, addr, size, AddressSpaceValues::TraceGttEntry, physStart); // GGTT entries use the TraceGttEntry address space
+}
+
+template <typename Traits>
+uint64_t AubDump<Traits>::reserveAddressGGTT(typename Traits::Stream &stream, const void *memory, size_t size, uint64_t physStart) {
+    // Pointer flavour: the GGTT address is the pointer value as converted by ptrToGGTT.
+    return reserveAddress(stream, BaseHelper::ptrToGGTT(memory), size, AddressSpaceValues::TraceGttEntry, physStart);
+}
+
+// Maps the graphics address range [gfxAddress, gfxAddress + blockSize - 1] in the 32-bit PPGTT.
+// Two runs are emitted to the stream, each as one memory-write header followed by 8-byte entries:
+// page-directory entries first, then page-table entries. Every entry is (target address | 7).
+//
+// stream      - receives the headers (writeMemoryWriteHeader) and the entries (writePTE).
+// gfxAddress  - first graphics address of the range.
+// blockSize   - length of the range in bytes.
+// physAddress - physical address backing gfxAddress; it is masked with g_pageMask and advanced by
+//               4096 for each page-table entry.
+// Returns physAddress unchanged.
+template <typename Traits>
+uint64_t AubPageTableHelper32<Traits>::reserveAddressPPGTT(typename Traits::Stream &stream, uintptr_t gfxAddress, size_t blockSize, uint64_t physAddress) {
+    // Index ranges: one PTE per 4096-byte page, one PDE per 512 PTEs.
+    const auto firstPTE = gfxAddress >> 12;
+    const auto lastPTE = (gfxAddress + blockSize - 1) >> 12;
+    const auto pteCount = lastPTE - firstPTE + 1;
+
+    const auto firstPDE = firstPTE >> 9;
+    const auto lastPDE = lastPTE >> 9;
+    const auto pdeCount = lastPDE - firstPDE + 1;
+
+    // PD entries (address space TracePpgttPdEntry): they start at getPDEAddress(firstPDE) and point
+    // at successive 4096-byte steps beginning at getPTEAddress(firstPTE) masked with g_pageMask.
+    {
+        auto entryAddress = BaseClass::getPDEAddress(firstPDE);
+        auto target = BaseClass::getPTEAddress(firstPTE) & g_pageMask;
+
+        stream.writeMemoryWriteHeader(entryAddress, pdeCount * sizeof(uint64_t), AddressSpaceValues::TracePpgttPdEntry);
+
+        for (auto pde = firstPDE; pde <= lastPDE; ++pde) {
+            stream.writePTE(entryAddress, target | 7);
+
+            entryAddress += sizeof(uint64_t);
+            target += 4096;
+        }
+    }
+
+    // PT entries (address space TracePpgttEntry): they start at getPTEAddress(firstPTE) and point
+    // at successive 4096-byte physical pages beginning at physAddress masked with g_pageMask.
+    {
+        auto entryAddress = BaseClass::getPTEAddress(firstPTE);
+        auto target = physAddress & g_pageMask;
+
+        stream.writeMemoryWriteHeader(entryAddress, pteCount * sizeof(uint64_t), AddressSpaceValues::TracePpgttEntry);
+
+        for (auto pte = firstPTE; pte <= lastPTE; ++pte) {
+            stream.writePTE(entryAddress, target | 7);
+
+            entryAddress += sizeof(uint64_t);
+            target += 4096;
+        }
+    }
+
+    return physAddress;
+}
+
+// Maps the graphics address range [gfxAddress, gfxAddress + blockSize - 1] in the four-level
+// PPGTT (PML4, PDP, PD, PT). For each level, from PML4 down to PT, one memory-write header is
+// emitted to the stream, followed by one 8-byte entry per table slot the range covers.
+//
+// stream      - receives the headers (writeMemoryWriteHeader) and the entries (writePTE).
+// gfxAddress  - first graphics address of the range.
+// blockSize   - length of the range in bytes.
+// physAddress - physical address backing gfxAddress; it is masked with g_pageMask and advanced by
+//               4096 for each page-table entry.
+//
+// Every entry is (target address | 7). Entries of the upper three levels target the table of the
+// next lower level, located through getPDPAddress / getPDEAddress / getPTEAddress.
+//
+// Returns physAddress unchanged.
+template <typename Traits>
+uint64_t AubPageTableHelper64<Traits>::reserveAddressPPGTT(typename Traits::Stream &stream, uintptr_t gfxAddress, size_t blockSize, uint64_t physAddress) {
+    // Index ranges per level: 4096-byte pages first, then 9 more address bits dropped per level up.
+    const auto firstPTE = gfxAddress >> 12;
+    const auto lastPTE = (gfxAddress + blockSize - 1) >> 12;
+    const auto pteCount = lastPTE - firstPTE + 1;
+
+    const auto firstPDE = firstPTE >> 9;
+    const auto lastPDE = lastPTE >> 9;
+    const auto pdeCount = lastPDE - firstPDE + 1;
+
+    const auto firstPDP = firstPDE >> 9;
+    const auto lastPDP = lastPDE >> 9;
+    const auto pdpCount = lastPDP - firstPDP + 1;
+
+    const auto firstPML4 = firstPDP >> 9;
+    const auto lastPML4 = lastPDP >> 9;
+    const auto pml4Count = lastPML4 - firstPML4 + 1;
+
+    // PML4 entries (address space TracePml4Entry): they start at getPML4Address(firstPML4) and
+    // point at successive 4096-byte steps beginning at getPDPAddress(firstPDP) masked with
+    // g_pageMask.
+    {
+        auto entryAddress = getPML4Address(firstPML4);
+        auto target = BaseClass::getPDPAddress(firstPDP) & g_pageMask;
+
+        stream.writeMemoryWriteHeader(entryAddress, pml4Count * sizeof(uint64_t), AddressSpaceValues::TracePml4Entry);
+
+        for (auto pml4 = firstPML4; pml4 <= lastPML4; ++pml4) {
+            stream.writePTE(entryAddress, target | 7);
+
+            entryAddress += sizeof(uint64_t);
+            target += 4096;
+        }
+    }
+
+    // PDP entries (address space TracePhysicalPdpEntry): they start at getPDPAddress(firstPDP)
+    // and point at successive 4096-byte steps beginning at getPDEAddress(firstPDE) masked with
+    // g_pageMask.
+    {
+        auto entryAddress = BaseClass::getPDPAddress(firstPDP);
+        auto target = BaseClass::getPDEAddress(firstPDE) & g_pageMask;
+
+        stream.writeMemoryWriteHeader(entryAddress, pdpCount * sizeof(uint64_t), AddressSpaceValues::TracePhysicalPdpEntry);
+
+        for (auto pdp = firstPDP; pdp <= lastPDP; ++pdp) {
+            stream.writePTE(entryAddress, target | 7);
+
+            entryAddress += sizeof(uint64_t);
+            target += 4096;
+        }
+    }
+
+    // PD entries (address space TracePpgttPdEntry): they start at getPDEAddress(firstPDE) and
+    // point at successive 4096-byte steps beginning at getPTEAddress(firstPTE) masked with
+    // g_pageMask.
+    {
+        auto entryAddress = BaseClass::getPDEAddress(firstPDE);
+        auto target = BaseClass::getPTEAddress(firstPTE) & g_pageMask;
+
+        stream.writeMemoryWriteHeader(entryAddress, pdeCount * sizeof(uint64_t), AddressSpaceValues::TracePpgttPdEntry);
+
+        for (auto pde = firstPDE; pde <= lastPDE; ++pde) {
+            stream.writePTE(entryAddress, target | 7);
+
+            entryAddress += sizeof(uint64_t);
+            target += 4096;
+        }
+    }
+
+    // PT entries (address space TracePpgttEntry): they start at getPTEAddress(firstPTE) and
+    // point at successive 4096-byte physical pages beginning at physAddress masked with
+    // g_pageMask.
+    {
+        auto entryAddress = BaseClass::getPTEAddress(firstPTE);
+        auto target = physAddress & g_pageMask;
+
+        stream.writeMemoryWriteHeader(entryAddress, pteCount * sizeof(uint64_t), AddressSpaceValues::TracePpgttEntry);
+
+        for (auto pte = firstPTE; pte <= lastPTE; ++pte) {
+            stream.writePTE(entryAddress, target | 7);
+
+            entryAddress += sizeof(uint64_t);
+            target += 4096;
+        }
+    }
+
+    return physAddress;
+}
+
+template <typename Traits>
+void AubPageTableHelper32<Traits>::createContext(typename Traits::Stream &stream, uint32_t context) {
+    // Build a zero-filled context-create command with SixtyFourBit cleared and hand it to the stream.
+    AubPpgttContextCreate cmd;
+    memset(&cmd, 0, sizeof(cmd));
+    cmd.Header.DwordLength = ((sizeof(cmd) - sizeof(cmd.Header)) / sizeof(uint32_t)) - 1;
+    cmd.Header.SubOp = 0x14;
+    cmd.Header.Opcode = 0x1;
+    cmd.Header.Type = 0x7;
+    cmd.Handle = context;
+    cmd.AdvancedContext = false;
+    cmd.SixtyFourBit = 0;
+
+    // Four page-directory pointers: the addresses of PD entries 0x000, 0x200, 0x400 and 0x600.
+    for (uint32_t pdp = 0; pdp < 4; ++pdp) {
+        cmd.PageDirPointer[pdp] = BaseClass::getPDEAddress(pdp * 0x200);
+    }
+    stream.createContext(cmd);
+}
+
+template <typename Traits>
+void AubPageTableHelper64<Traits>::createContext(typename Traits::Stream &stream, uint32_t context) {
+    // Zero-filled context-create command flagged as 64-bit; the only page-directory pointer set is the address of PML4 entry 0.
+    AubPpgttContextCreate cmd;
+    memset(&cmd, 0, sizeof(cmd));
+    cmd.SixtyFourBit = 1;
+    cmd.AdvancedContext = false;
+    cmd.Handle = context;
+    cmd.PageDirPointer[0] = getPML4Address(0);
+
+    cmd.Header.Type = 0x7;
+    cmd.Header.Opcode = 0x1;
+    cmd.Header.SubOp = 0x14;
+    cmd.Header.DwordLength = ((sizeof(cmd) - sizeof(cmd.Header)) / sizeof(uint32_t)) - 1;
+    stream.createContext(cmd);
+}
+
+}
--- a/runtime/aub_mem_dump/aub_services.h
+++ b/runtime/aub_mem_dump/aub_services.h
--- a/runtime/built_ins/CMakeLists.txt
+++ b/runtime/built_ins/CMakeLists.txt
@@ -0,0 +1,79 @@
+# Copyright (c) 2017, Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+# We require cmake 3.2.0 or later
+cmake_minimum_required(VERSION 3.2.0 FATAL_ERROR)
+# Object library seeded with this CMakeLists.txt as its only source; generated sources are attached further down.
+add_library(${BUILTINS_BINARIES_LIB_NAME} OBJECT CMakeLists.txt)
+
+# Add builtins sources
+add_subdirectory(registry)
+# Built-in names; each is combined with a GEN number and a platform below to form a RUNTIME_GENERATED_* variable name.
+set(GENERATED_BUILTINS "COPY_BUFFER_TO_BUFFER"
+                       "COPY_BUFFER_RECT"
+                       "FILL_BUFFER"
+                       "COPY_BUFFER_TO_IMAGE3D"
+                       "COPY_IMAGE3D_TO_BUFFER"
+                       "COPY_IMAGE_TO_IMAGE1D"
+                       "COPY_IMAGE_TO_IMAGE2D"
+                       "COPY_IMAGE_TO_IMAGE3D"
+                       "FILL_IMAGE1D"
+                       "FILL_IMAGE2D"
+                       "FILL_IMAGE3D"
+)
+
+# Generate builtins cpps
+if(COMPILE_BUILT_INS)
+  add_subdirectory(kernels)
+endif()
+
+# Reverse order so that GEN N+1 includes GEN N
+foreach(GEN_NUM RANGE ${MAX_GEN} 0 -1)
+  GEN_CONTAINS_PLATFORMS("SUPPORTED" ${GEN_NUM} GENX_HAS_PLATFORMS)
+  if(${GENX_HAS_PLATFORMS})
+    # Get all supported platforms for this GEN
+    GET_PLATFORMS_FOR_GEN("SUPPORTED" ${GEN_NUM} SUPPORTED_GENX_PLATFORMS)
+
+    # Add platform-specific files
+    foreach(PLATFORM_IT ${SUPPORTED_GENX_PLATFORMS})
+      foreach(GENERATED_BUILTIN ${GENERATED_BUILTINS})
+        list(APPEND GENERATED_BUILTINS_CPPS ${BUILTINS_INCLUDE_DIR}/${RUNTIME_GENERATED_${GENERATED_BUILTIN}_GEN${GEN_NUM}_${PLATFORM_IT}})
+      endforeach(GENERATED_BUILTIN)
+    endforeach(PLATFORM_IT)
+    # NOTE(review): GENERATED_BUILTINS_CPPS is never reset, so this call also lists files appended in earlier iterations - confirm intended.
+    source_group("generated files\\gen${GEN_NUM}" FILES ${GENERATED_BUILTINS_CPPS})
+  endif(${GENX_HAS_PLATFORMS})
+endforeach(GEN_NUM)
+
+# Attach the collected generated sources only when built-ins are being compiled, and mark them as GENERATED.
+if(COMPILE_BUILT_INS)
+  target_sources(${BUILTINS_BINARIES_LIB_NAME} PUBLIC ${GENERATED_BUILTINS_CPPS})
+  set_source_files_properties(${GENERATED_BUILTINS_CPPS} PROPERTIES GENERATED TRUE)
+endif(COMPILE_BUILT_INS)
+# The linker language is set explicitly to CXX and position-independent code is switched on for the target.
+set_target_properties(${BUILTINS_BINARIES_LIB_NAME} PROPERTIES LINKER_LANGUAGE CXX)
+set_target_properties(${BUILTINS_BINARIES_LIB_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+# Include paths are PRIVATE: they apply to this target's own sources only.
+target_include_directories(${BUILTINS_BINARIES_LIB_NAME} PRIVATE
+  ${KHRONOS_HEADERS_DIR}
+  ${UMKM_SHAREDDATA_INCLUDE_PATHS}
+  ${IGDRCL__IGC_INCLUDE_DIR}
+  ${THIRD_PARTY_DIR}
+)
--- a/runtime/built_ins/built_ins.cpp
+++ b/runtime/built_ins/built_ins.cpp
@@ -0,0 +1,890 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <cstdint>
+#include "built_ins.h"
+#include "runtime/built_ins/vme_dispatch_builder.h"
+#include "runtime/built_ins/sip.h"
+#include "runtime/compiler_interface/compiler_interface.h"
+#include "runtime/program/program.h"
+#include "runtime/mem_obj/image.h"
+#include "runtime/kernel/kernel.h"
+#include "runtime/helpers/basic_math.h"
+#include "runtime/helpers/convert_color.h"
+#include "runtime/helpers/dispatch_info_builder.h"
+#include "runtime/helpers/debug_helpers.h"
+#include <sstream>
+
+namespace OCLRT {
+// Singleton storage: set by getInstance(), cleared by shutDown().
+BuiltIns *BuiltIns::pInstance = nullptr;
+
+// Option string handed to Program::build() for the media built-in kernels.
+const char *mediaKernelsBuildOptions =
+    "-D cl_intel_device_side_advanced_vme_enable "
+    "-D cl_intel_device_side_avc_vme_enable "
+    "-D cl_intel_device_side_vme_enable "
+    "-D cl_intel_media_block_io "
+    "-cl-fast-relaxed-math ";
+
+// Creates the BuiltinsLib instance that built-in code is later requested from
+// (see the getBuiltinCode() calls in this file).
+BuiltIns::BuiltIns() {
+    builtinsLib.reset(new BuiltinsLib());
+}
+
+// Destroys the scheduler built-in - the kernel first, then its program - and
+// clears both pointers afterwards.
+BuiltIns::~BuiltIns() {
+    auto &scheduler = schedulerBuiltIn;
+
+    delete static_cast<SchedulerKernel *>(scheduler.pKernel);
+    delete scheduler.pProgram;
+
+    scheduler.pProgram = nullptr;
+    scheduler.pKernel = nullptr;
+}
+
+// Returns the singleton, allocating it on first use. A function-local mutex is
+// taken on every call so that concurrent callers create at most one instance.
+BuiltIns &BuiltIns::getInstance() {
+    static std::mutex creationMutex;
+    std::lock_guard<std::mutex> guard(creationMutex);
+
+    if (!pInstance) {
+        pInstance = new BuiltIns();
+    }
+
+    return *pInstance;
+}
+
+// Destroys the singleton, if one exists. pInstance is cleared before the
+// object is deleted.
+void BuiltIns::shutDown() {
+    if (pInstance == nullptr) {
+        return;
+    }
+
+    BuiltIns *doomed = pInstance;
+    pInstance = nullptr;
+    delete doomed;
+}
+
+// Returns the scheduler kernel, creating its program and kernel from the
+// Scheduler built-in binary the first time it is requested.
+//
+// The binary is looked up for device 0 of `context`. Initialization runs once,
+// guarded by schedulerBuiltIn.programIsInitialized; later calls return the
+// cached kernel.
+SchedulerKernel &BuiltIns::getSchedulerKernel(Context &context) {
+    if (schedulerBuiltIn.pKernel) {
+        return *static_cast<SchedulerKernel *>(schedulerBuiltIn.pKernel);
+    }
+
+    auto initializeSchedulerProgramAndKernel = [&] {
+        cl_int retVal = CL_SUCCESS;
+
+        auto src = getInstance().builtinsLib->getBuiltinCode(EBuiltInOps::Scheduler, BuiltinCode::ECodeType::Any, *context.getDevice(0));
+
+        auto program = Program::createFromGenBinary(&context,
+                                                    src.resource.data(),
+                                                    src.resource.size(),
+                                                    true,
+                                                    &retVal);
+        DEBUG_BREAK_IF(retVal != CL_SUCCESS);
+        // program is dereferenced right below, so a failed creation has to stop
+        // here in every build type, not only where DEBUG_BREAK_IF is active.
+        UNRECOVERABLE_IF(program == nullptr);
+
+        retVal = program->processGenBinary();
+        DEBUG_BREAK_IF(retVal != CL_SUCCESS);
+
+        schedulerBuiltIn.pProgram = program;
+
+        auto kernelInfo = schedulerBuiltIn.pProgram->getKernelInfo(SchedulerKernel::schedulerName);
+        // Same reasoning: *kernelInfo is passed to Kernel::create() below.
+        UNRECOVERABLE_IF(kernelInfo == nullptr);
+
+        schedulerBuiltIn.pKernel = Kernel::create<SchedulerKernel>(
+            schedulerBuiltIn.pProgram,
+            *kernelInfo,
+            &retVal);
+
+        DEBUG_BREAK_IF(retVal != CL_SUCCESS);
+    };
+    std::call_once(schedulerBuiltIn.programIsInitialized, initializeSchedulerProgramAndKernel);
+
+    return *static_cast<SchedulerKernel *>(schedulerBuiltIn.pKernel);
+}
+
+// Returns the SIP kernel of the requested type, creating it on first use.
+//
+// The binary is requested from the compiler interface for device 0 of
+// `context` and wrapped in a temporary program. The part of the kernel heap
+// that starts at systemKernelOffset is then handed to a new SipKernel, and the
+// temporary program is released. Initialization runs once per SIP type
+// (std::call_once); failures that make the kernel unusable abort through
+// UNRECOVERABLE_IF.
+SipKernel &BuiltIns::getSipKernel(SipKernelType type, Context &context) {
+    uint32_t kernelId = static_cast<uint32_t>(type);
+    UNRECOVERABLE_IF(kernelId >= static_cast<uint32_t>(SipKernelType::COUNT));
+    auto &sipBuiltIn = this->sipKernels[kernelId];
+
+    auto initializer = [&] {
+        cl_int retVal = CL_SUCCESS;
+
+        std::vector<char> sipBinary;
+        auto compilerInterface = CompilerInterface::getInstance();
+        UNRECOVERABLE_IF(compilerInterface == nullptr);
+
+        auto ret = compilerInterface->getSipKernelBinary(type, *context.getDevice(0), sipBinary);
+
+        UNRECOVERABLE_IF(ret != CL_SUCCESS);
+        UNRECOVERABLE_IF(sipBinary.size() == 0);
+        auto program = Program::createFromGenBinary(&context,
+                                                    sipBinary.data(),
+                                                    sipBinary.size(),
+                                                    true,
+                                                    &retVal);
+        DEBUG_BREAK_IF(retVal != CL_SUCCESS);
+        UNRECOVERABLE_IF(program == nullptr);
+
+        retVal = program->processGenBinary();
+        DEBUG_BREAK_IF(retVal != CL_SUCCESS);
+
+        auto kernelInfo = program->getKernelInfo(size_t{0});
+        UNRECOVERABLE_IF(kernelInfo == nullptr);
+
+        // Only the heap contents from the system kernel offset onwards are used.
+        uint32_t sipOffset = kernelInfo->systemKernelOffset;
+        UNRECOVERABLE_IF(sipOffset >= kernelInfo->heapInfo.pKernelHeader->KernelHeapSize);
+        sipBuiltIn.first.reset(new SipKernel(type, ptrOffset(kernelInfo->heapInfo.pKernelHeap, sipOffset),
+                                             kernelInfo->heapInfo.pKernelHeader->KernelHeapSize - sipOffset));
+
+        program->release();
+    };
+    std::call_once(sipBuiltIn.second, initializer);
+    UNRECOVERABLE_IF(sipBuiltIn.first == nullptr);
+    return *sipBuiltIn.first;
+}
+
+// Source strings of the media built-in kernels. Each pointer is initialized
+// from the contents of the included .igdrcl_built_in file, so those files are
+// expected to expand to something usable as a const char * initializer.
+
+// VME:
+static const char *blockMotionEstimateIntelSrc = {
+#include "kernels/vme_block_motion_estimate_intel_frontend.igdrcl_built_in"
+};
+
+static const char *blockAdvancedMotionEstimateCheckIntelSrc = {
+#include "kernels/vme_block_advanced_motion_estimate_check_intel_frontend.igdrcl_built_in"
+};
+
+static const char *blockAdvancedMotionEstimateBidirectionalCheckIntelSrc = {
+#include "kernels/vme_block_advanced_motion_estimate_bidirectional_check_intel_frontend.igdrcl_built_in"
+};
+
+// VEBOX:
+static const char *veEnhanceIntelSrc = {
+#include "kernels/vebox_ve_enhance_intel.igdrcl_built_in"
+};
+
+static const char *veDnEnhanceIntelSrc = {
+#include "kernels/vebox_ve_dn_enhance_intel.igdrcl_built_in"
+};
+
+static const char *veDnDiEnhanceIntelSrc = {
+#include "kernels/vebox_ve_dn_di_enhance_intel.igdrcl_built_in"
+};
+
+// Lookup table pairing each media built-in kernel name (element 0) with its
+// source string (element 1).
+static const std::tuple<const char *, const char *> mediaBuiltIns[] = {
+    // VME
+    std::tuple<const char *, const char *>("block_motion_estimate_intel", blockMotionEstimateIntelSrc),
+    std::tuple<const char *, const char *>("block_advanced_motion_estimate_check_intel", blockAdvancedMotionEstimateCheckIntelSrc),
+    std::tuple<const char *, const char *>("block_advanced_motion_estimate_bidirectional_check_intel", blockAdvancedMotionEstimateBidirectionalCheckIntelSrc),
+    // VEBOX
+    std::tuple<const char *, const char *>("ve_enhance_intel", veEnhanceIntelSrc),
+    std::tuple<const char *, const char *>("ve_dn_enhance_intel", veDnEnhanceIntelSrc),
+    std::tuple<const char *, const char *>("ve_dn_di_enhance_intel", veDnDiEnhanceIntelSrc),
+};
+
+// Unlike other built-ins media kernels are not stored in BuiltIns object.
+// Pointer to program with built in kernels is returned to the user through API
+// call and user is responsible for releasing it by calling clReleaseProgram.
+//
+// kernelNames is a semicolon-separated list of media built-in kernel names; the
+// sources of the named kernels are concatenated and built with
+// mediaKernelsBuildOptions.
+//
+// Returns nullptr with errcodeRet == CL_INVALID_VALUE when kernelNames is null,
+// names a kernel that is not in mediaBuiltIns, or yields no source at all, and
+// also when the program object cannot be created. Otherwise the program is
+// returned and errcodeRet carries the result of Program::build() - note that
+// the program is returned even if that build reported an error.
+Program *BuiltIns::createBuiltInProgram(
+    Context &context,
+    Device &device,
+    const char *kernelNames,
+    int &errcodeRet) {
+    // Constructing the stream (a std::string) from a null pointer is undefined
+    // behaviour, so reject it up front.
+    if (kernelNames == nullptr) {
+        errcodeRet = CL_INVALID_VALUE;
+        return nullptr;
+    }
+
+    std::string programSourceStr = "";
+    std::istringstream ss(kernelNames);
+    std::string currentKernelName;
+
+    while (std::getline(ss, currentKernelName, ';')) {
+        bool found = false;
+        for (auto &builtInTuple : mediaBuiltIns) {
+            if (currentKernelName == std::get<0>(builtInTuple)) {
+                programSourceStr += std::get<1>(builtInTuple);
+                found = true;
+                break;
+            }
+        }
+        if (!found) {
+            errcodeRet = CL_INVALID_VALUE;
+            return nullptr;
+        }
+    }
+    if (programSourceStr.empty()) {
+        errcodeRet = CL_INVALID_VALUE;
+        return nullptr;
+    }
+
+    Program *pBuiltInProgram = nullptr;
+
+    pBuiltInProgram = Program::create(programSourceStr.c_str(), &context, device, true, nullptr);
+
+    if (pBuiltInProgram) {
+        // Dispatch-info builders handed to the build, keyed by VME kernel name.
+        std::unordered_map<std::string, BuiltinDispatchInfoBuilder *> builtinsBuilders;
+        builtinsBuilders["block_motion_estimate_intel"] =
+            &BuiltIns::getInstance().getBuiltinDispatchInfoBuilder(EBuiltInOps::VmeBlockMotionEstimateIntel, context, device);
+        builtinsBuilders["block_advanced_motion_estimate_check_intel"] =
+            &BuiltIns::getInstance().getBuiltinDispatchInfoBuilder(EBuiltInOps::VmeBlockAdvancedMotionEstimateCheckIntel, context, device);
+        builtinsBuilders["block_advanced_motion_estimate_bidirectional_check_intel"] =
+            &BuiltIns::getInstance().getBuiltinDispatchInfoBuilder(EBuiltInOps::VmeBlockAdvancedMotionEstimateBidirectionalCheckIntel, context, device);
+        const cl_device_id clDevice = &device;
+        errcodeRet = pBuiltInProgram->build(
+            clDevice,
+            mediaKernelsBuildOptions,
+            enableCacheing,
+            builtinsBuilders);
+    } else {
+        errcodeRet = CL_INVALID_VALUE;
+    }
+    return pBuiltInProgram;
+}
+
+// For every kernel in usedKernels: take ownership of it, then attach `context`.
+void BuiltinDispatchInfoBuilder::takeOwnership(Context *context) {
+    for (auto kernelIt = usedKernels.begin(); kernelIt != usedKernels.end(); ++kernelIt) {
+        auto &kernel = *kernelIt;
+        kernel->takeOwnership(true);
+        kernel->setContext(context);
+    }
+}
+
+// For every kernel in usedKernels: detach its context, then release ownership.
+void BuiltinDispatchInfoBuilder::releaseOwnership() {
+    for (auto kernelIt = usedKernels.begin(); kernelIt != usedKernels.end(); ++kernelIt) {
+        auto &kernel = *kernelIt;
+        kernel->setContext(nullptr);
+        kernel->releaseOwnership();
+    }
+}
+
+// Fetches the code of built-in `op` for `device`, creates and builds a program
+// from it with `options`, and forwards `desc` to grabKernels().
+template <typename... KernelsDescArgsT>
+void BuiltinDispatchInfoBuilder::populate(Context &context, Device &device, EBuiltInOps op, const char *options, KernelsDescArgsT &&... desc) {
+    auto builtinCode = kernelsLib.getBuiltinsLib().getBuiltinCode(op, BuiltinCode::ECodeType::Any, device);
+
+    // Move the freshly created program into the builder's own holder.
+    auto createdProgram = BuiltinsLib::createProgramFromCode(builtinCode, context, device);
+    prog.reset(createdProgram.release());
+
+    prog->build(0, nullptr, options, nullptr, nullptr, kernelsLib.isCacheingEnabled());
+
+    grabKernels(std::forward<KernelsDescArgsT>(desc)...);
+}
+
+// Dispatch-info builder for buffer-to-buffer copies.
+//
+// The copy is split into up to three 1D regions relative to the destination
+// address: a left leftover up to the next cache-line boundary, a middle part
+// processed in 16-byte elements, and a right leftover past the last boundary.
+template <typename HWFamily>
+class BuiltInOp<HWFamily, EBuiltInOps::CopyBufferToBuffer> : public BuiltinDispatchInfoBuilder {
+  public:
+    // Builds the CopyBufferToBuffer program and resolves its three kernels.
+    BuiltInOp(BuiltIns &kernelsLib, Context &context, Device &device)
+        : BuiltinDispatchInfoBuilder(kernelsLib), kernLeftLeftover(nullptr), kernMiddle(nullptr), kernRightLeftover(nullptr) {
+        populate(context, device,
+                 EBuiltInOps::CopyBufferToBuffer,
+                 "",
+                 "CopyBufferToBufferLeftLeftover", kernLeftLeftover,
+                 "CopyBufferToBufferMiddle", kernMiddle,
+                 "CopyBufferToBufferRightLeftover", kernRightLeftover);
+    }
+
+    // Fills multiDispatchInfo with the left/middle/right dispatches for the copy
+    // described by operationParams (only the x components of size and offsets
+    // are used). Always returns true.
+    bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo, const BuiltinOpParams &operationParams) const override {
+        DispatchInfoBuilder<SplitDispatch::Dim::d1D, SplitDispatch::SplitMode::KernelSplit> kernelSplit1DBuilder;
+
+        // Alignment is computed on the destination address.
+        uintptr_t start = reinterpret_cast<uintptr_t>(operationParams.dstPtr) + operationParams.dstOffset.x;
+
+        size_t middleAlignment = MemoryConstants::cacheLineSize;
+        size_t middleElSize = sizeof(uint32_t) * 4;
+
+        uintptr_t leftSize = start % middleAlignment;
+        leftSize = (leftSize > 0) ? (middleAlignment - leftSize) : 0; // calc left leftover size
+        leftSize = std::min(leftSize, operationParams.size.x);        // clamp left leftover size to requested size
+
+        uintptr_t rightSize = (start + operationParams.size.x) % middleAlignment; // calc right leftover size
+        rightSize = std::min(rightSize, operationParams.size.x - leftSize);       // clamp
+
+        uintptr_t middleSizeBytes = operationParams.size.x - leftSize - rightSize; // calc middle size
+
+        if (!isAligned<4>(reinterpret_cast<uintptr_t>(operationParams.srcPtr) + operationParams.srcOffset.x + leftSize)) {
+            //corner case - src relative to dst does not have DWORD alignment
+            // The middle part is folded into the left leftover, so the middle
+            // dispatch gets zero work items.
+            leftSize += middleSizeBytes;
+            middleSizeBytes = 0;
+        }
+
+        auto middleSizeEls = middleSizeBytes / middleElSize; // num work items in middle walker
+
+        // Set-up ISA
+        kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Left, kernLeftLeftover);
+        kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Middle, kernMiddle);
+        kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Right, kernRightLeftover);
+
+        // Set-up common kernel args
+        // arg0 = source: SVM allocation, memory object, or plain pointer - in
+        // that order of precedence.
+        if (operationParams.srcSvmAlloc) {
+            kernelSplit1DBuilder.setArgSvmAlloc(0, operationParams.srcPtr, operationParams.srcSvmAlloc);
+        } else if (operationParams.srcMemObj) {
+            kernelSplit1DBuilder.setArg(0, operationParams.srcMemObj);
+        } else {
+            kernelSplit1DBuilder.setArgSvm(0, operationParams.size.x, operationParams.srcPtr, nullptr, CL_MEM_READ_ONLY);
+        }
+        // arg1 = destination, selected the same way.
+        if (operationParams.dstSvmAlloc) {
+            kernelSplit1DBuilder.setArgSvmAlloc(1, operationParams.dstPtr, operationParams.dstSvmAlloc);
+        } else if (operationParams.dstMemObj) {
+            kernelSplit1DBuilder.setArg(1, operationParams.dstMemObj);
+        } else {
+            kernelSplit1DBuilder.setArgSvm(1, operationParams.size.x, operationParams.dstPtr);
+        }
+
+        // Set-up srcOffset
+        // Each region starts where the previous one ended; offsets are passed
+        // to the kernels as 32-bit values.
+        kernelSplit1DBuilder.setArg(SplitDispatch::RegionCoordX::Left, 2, static_cast<uint32_t>(operationParams.srcOffset.x));
+        kernelSplit1DBuilder.setArg(SplitDispatch::RegionCoordX::Middle, 2, static_cast<uint32_t>(operationParams.srcOffset.x + leftSize));
+        kernelSplit1DBuilder.setArg(SplitDispatch::RegionCoordX::Right, 2, static_cast<uint32_t>(operationParams.srcOffset.x + leftSize + middleSizeBytes));
+
+        // Set-up dstOffset
+        kernelSplit1DBuilder.setArg(SplitDispatch::RegionCoordX::Left, 3, static_cast<uint32_t>(operationParams.dstOffset.x));
+        kernelSplit1DBuilder.setArg(SplitDispatch::RegionCoordX::Middle, 3, static_cast<uint32_t>(operationParams.dstOffset.x + leftSize));
+        kernelSplit1DBuilder.setArg(SplitDispatch::RegionCoordX::Right, 3, static_cast<uint32_t>(operationParams.dstOffset.x + leftSize + middleSizeBytes));
+
+        // Set-up work sizes
+        // Note for split walker, it would be just builder.setDispatchGeometry(GWS, ELWS, OFFSET)
+        // Left/right sizes are in bytes, the middle size is in 16-byte elements.
+        kernelSplit1DBuilder.setDispatchGeometry(SplitDispatch::RegionCoordX::Left, Vec3<size_t>{leftSize, 0, 0}, Vec3<size_t>{0, 0, 0}, Vec3<size_t>{0, 0, 0});
+        kernelSplit1DBuilder.setDispatchGeometry(SplitDispatch::RegionCoordX::Middle, Vec3<size_t>{middleSizeEls, 0, 0}, Vec3<size_t>{0, 0, 0}, Vec3<size_t>{0, 0, 0});
+        kernelSplit1DBuilder.setDispatchGeometry(SplitDispatch::RegionCoordX::Right, Vec3<size_t>{rightSize, 0, 0}, Vec3<size_t>{0, 0, 0}, Vec3<size_t>{0, 0, 0});
+        kernelSplit1DBuilder.bake(multiDispatchInfo);
+
+        return true;
+    }
+
+  protected:
+    // Kernels for the three regions; filled in by populate().
+    Kernel *kernLeftLeftover;
+    Kernel *kernMiddle;
+    Kernel *kernRightLeftover;
+};
+
+// Dispatch-info builder for rectangular buffer copies (2D or 3D), between
+// memory objects or between a memory object and a host pointer.
+template <typename HWFamily>
+class BuiltInOp<HWFamily, EBuiltInOps::CopyBufferRect> : public BuiltinDispatchInfoBuilder {
+  public:
+    // Builds the CopyBufferRect program. Slots 0 and 1 both hold the 2D kernel,
+    // slot 2 holds the 3D kernel.
+    BuiltInOp(BuiltIns &kernelsLib, Context &context, Device &device)
+        : BuiltinDispatchInfoBuilder(kernelsLib), kernelBytes{nullptr} {
+        populate(context, device,
+                 EBuiltInOps::CopyBufferRect,
+                 "",
+                 "CopyBufferRectBytes2d", kernelBytes[0],
+                 "CopyBufferRectBytes2d", kernelBytes[1],
+                 "CopyBufferRectBytes3d", kernelBytes[2]);
+    }
+
+    // Fills multiDispatchInfo with a single non-split dispatch for the rectangle
+    // copy described by operationParams and records the surfaces it touches.
+    // Always returns true.
+    bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo, const BuiltinOpParams &operationParams) const override {
+        DispatchInfoBuilder<SplitDispatch::Dim::d3D, SplitDispatch::SplitMode::NoSplit> kernelNoSplit3DBuilder;
+
+        size_t hostPtrSize = 0;
+        bool is3D = false;
+
+        const bool multipleSlices = operationParams.size.z > 1;
+        size_t region[] = {operationParams.size.x, operationParams.size.y, operationParams.size.z};
+
+        if (operationParams.srcMemObj && operationParams.dstMemObj) {
+            // Object-to-object copy: no host pointer is expected on either side.
+            DEBUG_BREAK_IF((operationParams.srcPtr != nullptr) || (operationParams.dstPtr != nullptr));
+            is3D = multipleSlices || (operationParams.srcOffset.z > 0) || (operationParams.dstOffset.z > 0);
+        } else if (operationParams.srcPtr) {
+            // Host pointer on the source side.
+            size_t origin[] = {operationParams.srcOffset.x, operationParams.srcOffset.y, operationParams.srcOffset.z};
+            hostPtrSize = Buffer::calculateHostPtrSize(origin, region, operationParams.srcRowPitch, operationParams.srcSlicePitch);
+            is3D = multipleSlices || (operationParams.dstOffset.z > 0);
+        } else if (operationParams.dstPtr) {
+            // Host pointer on the destination side.
+            size_t origin[] = {operationParams.dstOffset.x, operationParams.dstOffset.y, operationParams.dstOffset.z};
+            hostPtrSize = Buffer::calculateHostPtrSize(origin, region, operationParams.dstRowPitch, operationParams.dstSlicePitch);
+            is3D = multipleSlices || (operationParams.srcOffset.z > 0);
+        } else {
+            // Neither two memory objects nor any host pointer.
+            DEBUG_BREAK_IF(true);
+        }
+
+        // Pick the kernel: slot 1 (2D) or slot 2 (3D).
+        kernelNoSplit3DBuilder.setKernel(kernelBytes[is3D ? 2 : 1]);
+
+        // arg0 = src; for the 2D kernel a host pointer is advanced to its slice
+        if (operationParams.srcMemObj) {
+            kernelNoSplit3DBuilder.setArg(0, operationParams.srcMemObj);
+        } else {
+            auto srcArgPtr = is3D ? operationParams.srcPtr : ptrOffset(operationParams.srcPtr, operationParams.srcOffset.z * operationParams.srcSlicePitch);
+            kernelNoSplit3DBuilder.setArgSvm(0, hostPtrSize, srcArgPtr);
+        }
+
+        // arg1 = dst; same treatment as the source
+        if (operationParams.dstMemObj) {
+            kernelNoSplit3DBuilder.setArg(1, operationParams.dstMemObj);
+        } else {
+            auto dstArgPtr = is3D ? operationParams.dstPtr : ptrOffset(operationParams.dstPtr, operationParams.dstOffset.z * operationParams.dstSlicePitch);
+            kernelNoSplit3DBuilder.setArgSvm(1, hostPtrSize, dstArgPtr);
+        }
+
+        // arg2 = srcOrigin (x, y, z, 0)
+        uint32_t kSrcOrigin[4] = {
+            static_cast<uint32_t>(operationParams.srcOffset.x),
+            static_cast<uint32_t>(operationParams.srcOffset.y),
+            static_cast<uint32_t>(operationParams.srcOffset.z),
+            0};
+        kernelNoSplit3DBuilder.setArg(2, sizeof(kSrcOrigin), kSrcOrigin);
+
+        // arg3 = dstOrigin (x, y, z, 0)
+        uint32_t kDstOrigin[4] = {
+            static_cast<uint32_t>(operationParams.dstOffset.x),
+            static_cast<uint32_t>(operationParams.dstOffset.y),
+            static_cast<uint32_t>(operationParams.dstOffset.z),
+            0};
+        kernelNoSplit3DBuilder.setArg(3, sizeof(kDstOrigin), kDstOrigin);
+
+        // arg4 = srcPitch (row, slice)
+        uint32_t kSrcPitch[2] = {
+            static_cast<uint32_t>(operationParams.srcRowPitch),
+            static_cast<uint32_t>(operationParams.srcSlicePitch)};
+        kernelNoSplit3DBuilder.setArg(4, sizeof(kSrcPitch), kSrcPitch);
+
+        // arg5 = dstPitch (row, slice)
+        uint32_t kDstPitch[2] = {
+            static_cast<uint32_t>(operationParams.dstRowPitch),
+            static_cast<uint32_t>(operationParams.dstSlicePitch)};
+        kernelNoSplit3DBuilder.setArg(5, sizeof(kDstPitch), kDstPitch);
+
+        // Work size is the copied region itself
+        kernelNoSplit3DBuilder.setDispatchGeometry(operationParams.size, Vec3<size_t>{0, 0, 0}, Vec3<size_t>{0, 0, 0});
+        kernelNoSplit3DBuilder.bake(multiDispatchInfo);
+
+        // Record source, then destination, surface for residency purposes
+        if (operationParams.srcMemObj) {
+            multiDispatchInfo.pushUsedSurface(std::unique_ptr<Surface>(new MemObjSurface(operationParams.srcMemObj)));
+        } else {
+            multiDispatchInfo.pushUsedSurface(std::unique_ptr<Surface>(new HostPtrSurface(operationParams.srcPtr, hostPtrSize)));
+        }
+        if (operationParams.dstMemObj) {
+            multiDispatchInfo.pushUsedSurface(std::unique_ptr<Surface>(new MemObjSurface(operationParams.dstMemObj)));
+        } else {
+            multiDispatchInfo.pushUsedSurface(std::unique_ptr<Surface>(new HostPtrSurface(operationParams.dstPtr, hostPtrSize)));
+        }
+
+        return true;
+    }
+
+  protected:
+    // [0], [1]: 2D kernel; [2]: 3D kernel.
+    Kernel *kernelBytes[3];
+};
+
+// Dispatch-info builder for buffer fills.
+//
+// The filled range is split into up to three 1D regions relative to the
+// destination address: a left leftover up to the next cache-line boundary, a
+// middle part processed in 4-byte elements, and a right leftover. The fill
+// pattern is read from operationParams.srcMemObj.
+template <typename HWFamily>
+class BuiltInOp<HWFamily, EBuiltInOps::FillBuffer> : public BuiltinDispatchInfoBuilder {
+  public:
+    // Builds the FillBuffer program and resolves its three kernels.
+    BuiltInOp(BuiltIns &kernelsLib, Context &context, Device &device)
+        : BuiltinDispatchInfoBuilder(kernelsLib), kernLeftLeftover(nullptr), kernMiddle(nullptr), kernRightLeftover(nullptr) {
+        populate(context, device,
+                 EBuiltInOps::FillBuffer,
+                 "",
+                 "FillBufferLeftLeftover", kernLeftLeftover,
+                 "FillBufferMiddle", kernMiddle,
+                 "FillBufferRightLeftover", kernRightLeftover);
+    }
+
+    // Fills multiDispatchInfo with the left/middle/right dispatches for the fill
+    // described by operationParams (only the x components of size and dstOffset
+    // are used). Always returns true.
+    bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo, const BuiltinOpParams &operationParams) const override {
+        DispatchInfoBuilder<SplitDispatch::Dim::d1D, SplitDispatch::SplitMode::KernelSplit> kernelSplit1DBuilder;
+
+        // Alignment is computed on the destination address.
+        uintptr_t start = reinterpret_cast<uintptr_t>(operationParams.dstPtr) + operationParams.dstOffset.x;
+
+        size_t middleAlignment = MemoryConstants::cacheLineSize;
+        size_t middleElSize = sizeof(uint32_t);
+
+        uintptr_t leftSize = start % middleAlignment;
+        leftSize = (leftSize > 0) ? (middleAlignment - leftSize) : 0; // calc left leftover size
+        leftSize = std::min(leftSize, operationParams.size.x);        // clamp left leftover size to requested size
+
+        uintptr_t rightSize = (start + operationParams.size.x) % middleAlignment; // calc right leftover size
+        rightSize = std::min(rightSize, operationParams.size.x - leftSize);       // clamp
+
+        uintptr_t middleSizeBytes = operationParams.size.x - leftSize - rightSize; // calc middle size
+
+        auto middleSizeEls = middleSizeBytes / middleElSize; // num work items in middle walker
+
+        // Set-up ISA
+        kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Left, kernLeftLeftover);
+        kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Middle, kernMiddle);
+        kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Right, kernRightLeftover);
+
+        // Expected inputs: a pattern object with zero source offset, and a
+        // destination given either as a memory object or as an SVM allocation.
+        DEBUG_BREAK_IF((operationParams.srcMemObj == nullptr) || (operationParams.srcOffset != 0));
+        DEBUG_BREAK_IF((operationParams.dstMemObj == nullptr) && (operationParams.dstSvmAlloc == nullptr));
+
+        // Set-up dstMemObj with buffer
+        if (operationParams.dstSvmAlloc) {
+            kernelSplit1DBuilder.setArgSvmAlloc(0, operationParams.dstPtr, operationParams.dstSvmAlloc);
+        } else {
+            kernelSplit1DBuilder.setArg(0, operationParams.dstMemObj);
+        }
+
+        // Set-up dstOffset
+        // Each region starts where the previous one ended; offsets are passed
+        // to the kernels as 32-bit values.
+        kernelSplit1DBuilder.setArg(SplitDispatch::RegionCoordX::Left, 1, static_cast<uint32_t>(operationParams.dstOffset.x));
+        kernelSplit1DBuilder.setArg(SplitDispatch::RegionCoordX::Middle, 1, static_cast<uint32_t>(operationParams.dstOffset.x + leftSize));
+        kernelSplit1DBuilder.setArg(SplitDispatch::RegionCoordX::Right, 1, static_cast<uint32_t>(operationParams.dstOffset.x + leftSize + middleSizeBytes));
+
+        // Set-up srcMemObj with pattern
+        kernelSplit1DBuilder.setArgSvm(2, operationParams.srcMemObj->getSize(), operationParams.srcMemObj->getGraphicsAllocation()->getUnderlyingBuffer(), operationParams.srcMemObj->getGraphicsAllocation());
+
+        // Set-up patternSizeInEls
+        // Left/right kernels get the pattern size in bytes, the middle kernel
+        // gets it in 4-byte elements.
+        kernelSplit1DBuilder.setArg(SplitDispatch::RegionCoordX::Left, 3, static_cast<uint32_t>(operationParams.srcMemObj->getSize()));
+        kernelSplit1DBuilder.setArg(SplitDispatch::RegionCoordX::Middle, 3, static_cast<uint32_t>(operationParams.srcMemObj->getSize() / middleElSize));
+        kernelSplit1DBuilder.setArg(SplitDispatch::RegionCoordX::Right, 3, static_cast<uint32_t>(operationParams.srcMemObj->getSize()));
+
+        // Set-up work sizes
+        // Note for split walker, it would be just builder.setDispatchGeometry(GWS, ELWS, OFFSET)
+        // Left/right sizes are in bytes, the middle size is in 4-byte elements.
+        kernelSplit1DBuilder.setDispatchGeometry(SplitDispatch::RegionCoordX::Left, Vec3<size_t>{leftSize, 0, 0}, Vec3<size_t>{0, 0, 0}, Vec3<size_t>{0, 0, 0});
+        kernelSplit1DBuilder.setDispatchGeometry(SplitDispatch::RegionCoordX::Middle, Vec3<size_t>{middleSizeEls, 0, 0}, Vec3<size_t>{0, 0, 0}, Vec3<size_t>{0, 0, 0});
+        kernelSplit1DBuilder.setDispatchGeometry(SplitDispatch::RegionCoordX::Right, Vec3<size_t>{rightSize, 0, 0}, Vec3<size_t>{0, 0, 0}, Vec3<size_t>{0, 0, 0});
+        kernelSplit1DBuilder.bake(multiDispatchInfo);
+
+        return true;
+    }
+
+  protected:
+    // Kernels for the three regions; filled in by populate().
+    Kernel *kernLeftLeftover;
+    Kernel *kernMiddle;
+    Kernel *kernRightLeftover;
+};
+
+// Dispatch-info builder for copies from a buffer or host pointer into an image.
+//
+// One kernel exists per image element size (1, 2, 4, 8 and 16 bytes); the
+// destination image is redescribed before it is bound to the kernel.
+template <typename HWFamily>
+class BuiltInOp<HWFamily, EBuiltInOps::CopyBufferToImage3d> : public BuiltinDispatchInfoBuilder {
+  public:
+    // Builds the CopyBufferToImage3d program; kernelBytes[i] is the kernel for
+    // elements of 2^i bytes.
+    BuiltInOp(BuiltIns &kernelsLib, Context &context, Device &device)
+        : BuiltinDispatchInfoBuilder(kernelsLib), kernelBytes{nullptr} {
+        populate(context, device,
+                 EBuiltInOps::CopyBufferToImage3d,
+                 "",
+                 "CopyBufferToImage3dBytes", kernelBytes[0],
+                 "CopyBufferToImage3d2Bytes", kernelBytes[1],
+                 "CopyBufferToImage3d4Bytes", kernelBytes[2],
+                 "CopyBufferToImage3d8Bytes", kernelBytes[3],
+                 "CopyBufferToImage3d16Bytes", kernelBytes[4]);
+    }
+
+    // Fills multiDispatchInfo with a single non-split dispatch for the copy
+    // described by operationParams and records the surfaces it touches.
+    // Always returns true.
+    bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo, const BuiltinOpParams &operationParams) const override {
+        DispatchInfoBuilder<SplitDispatch::Dim::d3D, SplitDispatch::SplitMode::NoSplit> kernelNoSplit3DBuilder;
+
+        // Expected inputs: a source (host pointer or memory object) and no
+        // destination host pointer.
+        DEBUG_BREAK_IF(!(((operationParams.srcPtr != nullptr) || (operationParams.srcMemObj != nullptr)) && (operationParams.dstPtr == nullptr)));
+
+        auto dstImage = castToObjectOrAbort<Image>(operationParams.dstMemObj);
+
+        // Redescribe image to be byte-copy
+        auto dstImageRedescribed = dstImage->redescribe();
+        multiDispatchInfo.pushRedescribedMemObj(std::unique_ptr<MemObj>(dstImageRedescribed)); // life range same as mdi's
+
+        // Calculate srcRowPitch and srcSlicePitch
+        auto bytesPerPixel = dstImage->getSurfaceFormatInfo().ImageElementSizeInBytes;
+
+        size_t region[] = {operationParams.size.x, operationParams.size.y, operationParams.size.z};
+
+        // NOTE(review): the source pitches are read from the dst*Pitch fields of
+        // operationParams - confirm this matches how callers fill the params.
+        // A zero pitch falls back to the tightly packed value.
+        auto srcRowPitch = static_cast<uint32_t>(operationParams.dstRowPitch ? operationParams.dstRowPitch : region[0] * bytesPerPixel);
+
+        // For 1D image arrays the default slice pitch is one row.
+        auto srcSlicePitch = static_cast<uint32_t>(
+            operationParams.dstSlicePitch ? operationParams.dstSlicePitch : ((dstImage->getImageDesc().image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY ? 1 : region[1]) * srcRowPitch));
+
+        // Determine size of host ptr surface for residency purposes
+        size_t hostPtrSize = operationParams.srcPtr ? Image::calculateHostPtrSize(region, srcRowPitch, srcSlicePitch, bytesPerPixel, dstImage->getImageDesc().image_type) : 0;
+
+        // Set-up kernel
+        // The kernel index is log2 of the element size; only indices 0..4 have
+        // a kernel, which is checked in debug builds only.
+        auto bytesExponent = Math::log2(bytesPerPixel);
+        DEBUG_BREAK_IF(bytesExponent >= 5);
+        kernelNoSplit3DBuilder.setKernel(kernelBytes[bytesExponent]);
+
+        // Set-up source host ptr / buffer
+        if (operationParams.srcPtr) {
+            kernelNoSplit3DBuilder.setArgSvm(0, hostPtrSize, operationParams.srcPtr);
+        } else {
+            kernelNoSplit3DBuilder.setArg(0, operationParams.srcMemObj);
+        }
+
+        // Set-up destination image
+        kernelNoSplit3DBuilder.setArg(1, dstImageRedescribed);
+
+        // Set-up srcOffset
+        kernelNoSplit3DBuilder.setArg(2, static_cast<uint32_t>(operationParams.srcOffset.x));
+
+        // Set-up dstOrigin
+        {
+            uint32_t origin[] = {
+                static_cast<uint32_t>(operationParams.dstOffset.x),
+                static_cast<uint32_t>(operationParams.dstOffset.y),
+                static_cast<uint32_t>(operationParams.dstOffset.z),
+                0};
+            kernelNoSplit3DBuilder.setArg(3, sizeof(origin), origin);
+        }
+
+        // Set-up srcRowPitch
+        // Passed as a (row pitch, slice pitch) pair.
+        {
+            uint32_t pitch[] = {
+                static_cast<uint32_t>(srcRowPitch),
+                static_cast<uint32_t>(srcSlicePitch)};
+            kernelNoSplit3DBuilder.setArg(4, sizeof(pitch), pitch);
+        }
+
+        // Set-up work sizes
+        kernelNoSplit3DBuilder.setDispatchGeometry(operationParams.size, Vec3<size_t>{0, 0, 0}, Vec3<size_t>{0, 0, 0});
+        kernelNoSplit3DBuilder.bake(multiDispatchInfo);
+
+        // Store source and destination surfaces for residency purposes
+        if (operationParams.srcMemObj) {
+            multiDispatchInfo.pushUsedSurface(std::unique_ptr<Surface>(new MemObjSurface(operationParams.srcMemObj)));
+        } else {
+            multiDispatchInfo.pushUsedSurface(std::unique_ptr<HostPtrSurface>(new HostPtrSurface(operationParams.srcPtr, hostPtrSize)));
+        }
+        multiDispatchInfo.pushUsedSurface(std::unique_ptr<Surface>(new MemObjSurface(operationParams.dstMemObj)));
+
+        return true;
+    }
+
+  protected:
+    // Indexed by log2(element size in bytes): 1, 2, 4, 8, 16.
+    Kernel *kernelBytes[5];
+};
+
+template <typename HWFamily>
+class BuiltInOp<HWFamily, EBuiltInOps::CopyImage3dToBuffer> : public BuiltinDispatchInfoBuilder {
+  public:
+    BuiltInOp(BuiltIns &kernelsLib, Context &context, Device &device)
+        : BuiltinDispatchInfoBuilder(kernelsLib), kernelBytes{nullptr} {
+        populate(context, device,
+                 EBuiltInOps::CopyImage3dToBuffer,
+                 "",
+                 "CopyImage3dToBufferBytes", kernelBytes[0],
+                 "CopyImage3dToBuffer2Bytes", kernelBytes[1],
+                 "CopyImage3dToBuffer4Bytes", kernelBytes[2],
+                 "CopyImage3dToBuffer8Bytes", kernelBytes[3],
+                 "CopyImage3dToBuffer16Bytes", kernelBytes[4]);
+    }
+
+    // Builds the dispatch that copies a region of the source image into a buffer or host pointer.
+    //
+    // Inputs read from operationParams:
+    //   srcMemObj                   - source image (must cast to Image, otherwise aborts)
+    //   dstPtr / dstMemObj          - destination; dstPtr is checked first when setting kernel arg 1
+    //   srcOffset, dstOffset.x      - image origin and destination offset handed to the kernel
+    //   size                        - copied region, also used as the dispatch geometry
+    //   dstRowPitch / dstSlicePitch - destination pitches; 0 means "derive from the region"
+    //
+    // The baked dispatch, the redescribed image and the used surfaces are appended to
+    // multiDispatchInfo. Always returns true.
+    bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo, const BuiltinOpParams &operationParams) const override {
+        DispatchInfoBuilder<SplitDispatch::Dim::d3D, SplitDispatch::SplitMode::NoSplit> kernelNoSplit3DBuilder;
+
+        // The source has to be a mem object, and some destination (host pointer or mem object) must be given.
+        DEBUG_BREAK_IF(!((operationParams.srcPtr == nullptr) && ((operationParams.dstPtr != nullptr) || (operationParams.dstMemObj != nullptr))));
+
+        auto srcImage = castToObjectOrAbort<Image>(operationParams.srcMemObj);
+
+        // Redescribe image to be byte-copy
+        auto srcImageRedescribed = srcImage->redescribe();
+        multiDispatchInfo.pushRedescribedMemObj(std::unique_ptr<MemObj>(srcImageRedescribed)); // life range same as mdi's
+
+        // Calculate dstRowPitch and dstSlicePitch
+        auto bytesPerPixel = srcImage->getSurfaceFormatInfo().ImageElementSizeInBytes;
+
+        size_t region[] = {operationParams.size.x, operationParams.size.y, operationParams.size.z};
+
+        // The pitches describe the layout of the DESTINATION, so they must come from the dst* fields;
+        // reading the src* fields would silently ignore a pitch supplied for the destination.
+        auto dstRowPitch = static_cast<uint32_t>(operationParams.dstRowPitch ? operationParams.dstRowPitch : region[0] * bytesPerPixel);
+
+        // For 1D image arrays each "slice" is a single row, hence the factor of 1 instead of region[1].
+        auto dstSlicePitch = static_cast<uint32_t>(
+            operationParams.dstSlicePitch ? operationParams.dstSlicePitch : ((srcImage->getImageDesc().image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY ? 1 : region[1]) * dstRowPitch));
+
+        // Determine size of host ptr surface for residency purposes
+        size_t hostPtrSize = operationParams.dstPtr ? Image::calculateHostPtrSize(region, dstRowPitch, dstSlicePitch, bytesPerPixel, srcImage->getImageDesc().image_type) : 0;
+
+        // Set-up ISA: kernelBytes[] is indexed by log2 of the element size (1, 2, 4, 8, 16 bytes)
+        auto bytesExponent = Math::log2(bytesPerPixel);
+        DEBUG_BREAK_IF(bytesExponent >= 5);
+        kernelNoSplit3DBuilder.setKernel(kernelBytes[bytesExponent]);
+
+        // Set-up source image
+        kernelNoSplit3DBuilder.setArg(0, srcImageRedescribed);
+
+        // Set-up destination host ptr / buffer
+        if (operationParams.dstPtr) {
+            kernelNoSplit3DBuilder.setArgSvm(1, hostPtrSize, operationParams.dstPtr);
+        } else {
+            kernelNoSplit3DBuilder.setArg(1, operationParams.dstMemObj);
+        }
+
+        // Set-up srcOrigin (padded with a trailing 0 to four components)
+        {
+            uint32_t origin[] = {
+                static_cast<uint32_t>(operationParams.srcOffset.x),
+                static_cast<uint32_t>(operationParams.srcOffset.y),
+                static_cast<uint32_t>(operationParams.srcOffset.z),
+                0};
+            kernelNoSplit3DBuilder.setArg(2, sizeof(origin), origin);
+        }
+
+        // Set-up dstOffset
+        kernelNoSplit3DBuilder.setArg(3, static_cast<uint32_t>(operationParams.dstOffset.x));
+
+        // Set-up dstRowPitch / dstSlicePitch
+        {
+            uint32_t pitch[] = {
+                static_cast<uint32_t>(dstRowPitch),
+                static_cast<uint32_t>(dstSlicePitch)};
+            kernelNoSplit3DBuilder.setArg(4, sizeof(pitch), pitch);
+        }
+
+        // Set-up work sizes
+        kernelNoSplit3DBuilder.setDispatchGeometry(operationParams.size, Vec3<size_t>{0, 0, 0}, Vec3<size_t>{0, 0, 0});
+        kernelNoSplit3DBuilder.bake(multiDispatchInfo);
+
+        // Store source and destination surfaces for residency purposes
+        multiDispatchInfo.pushUsedSurface(std::unique_ptr<Surface>(new MemObjSurface(operationParams.srcMemObj)));
+        if (operationParams.dstMemObj) {
+            multiDispatchInfo.pushUsedSurface(std::unique_ptr<Surface>(new MemObjSurface(operationParams.dstMemObj)));
+        } else {
+            multiDispatchInfo.pushUsedSurface(std::unique_ptr<Surface>(new HostPtrSurface(operationParams.dstPtr, hostPtrSize)));
+        }
+
+        return true;
+    }
+
+  protected:
+    Kernel *kernelBytes[5];
+};
+
+// Dispatch builder for image-to-image copies using the "CopyImageToImage3d" kernel.
+template <typename HWFamily>
+class BuiltInOp<HWFamily, EBuiltInOps::CopyImageToImage3d> : public BuiltinDispatchInfoBuilder {
+  public:
+    // Requests the single kernel this operation needs through populate().
+    BuiltInOp(BuiltIns &kernelsLib, Context &context, Device &device)
+        : BuiltinDispatchInfoBuilder(kernelsLib), kernel(nullptr) {
+        populate(context, device, EBuiltInOps::CopyImageToImage3d, "", "CopyImageToImage3d", kernel);
+    }
+
+    // Appends to multiDispatchInfo one dispatch copying operationParams.size elements from
+    // srcMemObj (at srcOffset) to dstMemObj (at dstOffset). Always returns true.
+    bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo, const BuiltinOpParams &operationParams) const override {
+        DispatchInfoBuilder<SplitDispatch::Dim::d3D, SplitDispatch::SplitMode::NoSplit> copyBuilder;
+
+        // Neither side may be a host pointer for this operation.
+        DEBUG_BREAK_IF((operationParams.srcPtr != nullptr) || (operationParams.dstPtr != nullptr));
+
+        auto sourceImage = castToObjectOrAbort<Image>(operationParams.srcMemObj);
+        auto destinationImage = castToObjectOrAbort<Image>(operationParams.dstMemObj);
+
+        // Byte-copy views of both images; multiDispatchInfo takes ownership of them.
+        auto sourceView = sourceImage->redescribe();
+        auto destinationView = destinationImage->redescribe();
+        multiDispatchInfo.pushRedescribedMemObj(std::unique_ptr<MemObj>(sourceView));
+        multiDispatchInfo.pushRedescribedMemObj(std::unique_ptr<MemObj>(destinationView));
+
+        // Kernel and its image arguments (0 = source, 1 = destination)
+        copyBuilder.setKernel(kernel);
+        copyBuilder.setArg(0, sourceView);
+        copyBuilder.setArg(1, destinationView);
+
+        // Argument 2: source origin, padded to four components
+        uint32_t sourceOrigin[] = {
+            static_cast<uint32_t>(operationParams.srcOffset.x),
+            static_cast<uint32_t>(operationParams.srcOffset.y),
+            static_cast<uint32_t>(operationParams.srcOffset.z),
+            0};
+        copyBuilder.setArg(2, sizeof(sourceOrigin), sourceOrigin);
+
+        // Argument 3: destination origin, padded to four components
+        uint32_t destinationOrigin[] = {
+            static_cast<uint32_t>(operationParams.dstOffset.x),
+            static_cast<uint32_t>(operationParams.dstOffset.y),
+            static_cast<uint32_t>(operationParams.dstOffset.z),
+            0};
+        copyBuilder.setArg(3, sizeof(destinationOrigin), destinationOrigin);
+
+        // One work item per copied element; no explicit local size or offset
+        const Vec3<size_t> zero{0, 0, 0};
+        copyBuilder.setDispatchGeometry(operationParams.size, zero, zero);
+        copyBuilder.bake(multiDispatchInfo);
+
+        // Both original images have to be resident while the kernel runs
+        multiDispatchInfo.pushUsedSurface(std::unique_ptr<Surface>(new MemObjSurface(sourceImage)));
+        multiDispatchInfo.pushUsedSurface(std::unique_ptr<Surface>(new MemObjSurface(destinationImage)));
+
+        return true;
+    }
+
+  protected:
+    Kernel *kernel;
+};
+
+// Dispatch builder for image fills using the "FillImage3d" kernel.
+template <typename HWFamily>
+class BuiltInOp<HWFamily, EBuiltInOps::FillImage3d> : public BuiltinDispatchInfoBuilder {
+  public:
+    // Requests the single kernel this operation needs through populate().
+    BuiltInOp(BuiltIns &kernelsLib, Context &context, Device &device)
+        : BuiltinDispatchInfoBuilder(kernelsLib), kernel(nullptr) {
+        populate(context, device, EBuiltInOps::FillImage3d, "", "FillImage3d", kernel);
+    }
+
+    // Appends to multiDispatchInfo one dispatch filling operationParams.size elements of
+    // dstMemObj, starting at dstOffset, with the color pointed to by srcPtr. Always returns true.
+    bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo, const BuiltinOpParams &operationParams) const override {
+        DispatchInfoBuilder<SplitDispatch::Dim::d3D, SplitDispatch::SplitMode::NoSplit> fillBuilder;
+
+        // Expected inputs: fill color in srcPtr, no source mem object, no host destination.
+        DEBUG_BREAK_IF((operationParams.srcMemObj != nullptr) || (operationParams.srcPtr == nullptr) || (operationParams.dstPtr != nullptr));
+
+        auto targetImage = castToObjectOrAbort<Image>(operationParams.dstMemObj);
+
+        // Fill-specific redescription of the image; multiDispatchInfo takes ownership of it.
+        auto targetView = targetImage->redescribeFillImage();
+        multiDispatchInfo.pushRedescribedMemObj(std::unique_ptr<MemObj>(targetView));
+
+        // Kernel and destination image (argument 0)
+        fillBuilder.setKernel(kernel);
+        fillBuilder.setArg(0, targetView);
+
+        // Argument 1: the user color converted from the original image format to the redescribed one
+        int convertedColor[4] = {0};
+        const void *userColor = operationParams.srcPtr;
+        convertFillColor(userColor,
+                         convertedColor,
+                         targetImage->getSurfaceFormatInfo().OCLImageFormat,
+                         targetView->getSurfaceFormatInfo().OCLImageFormat);
+        fillBuilder.setArg(1, 4 * sizeof(int32_t), convertedColor);
+
+        // Argument 2: destination offset, padded to four components
+        uint32_t targetOffset[] = {
+            static_cast<uint32_t>(operationParams.dstOffset.x),
+            static_cast<uint32_t>(operationParams.dstOffset.y),
+            static_cast<uint32_t>(operationParams.dstOffset.z),
+            0};
+        fillBuilder.setArg(2, sizeof(targetOffset), targetOffset);
+
+        // One work item per filled element; no explicit local size or offset
+        const Vec3<size_t> zero{0, 0, 0};
+        fillBuilder.setDispatchGeometry(operationParams.size, zero, zero);
+        fillBuilder.bake(multiDispatchInfo);
+
+        // The original image has to be resident while the kernel runs
+        multiDispatchInfo.pushUsedSurface(std::unique_ptr<Surface>(new MemObjSurface(targetImage)));
+
+        return true;
+    }
+
+  protected:
+    Kernel *kernel;
+};
+
+// Returns the dispatch-info builder for `operation`, creating it on first use.
+// Creation is guarded by the std::once_flag stored next to each builder, so every
+// builder is constructed at most once. Operations without a case below make the
+// function throw std::runtime_error.
+BuiltinDispatchInfoBuilder &BuiltIns::getBuiltinDispatchInfoBuilder(EBuiltInOps operation, Context &context, Device &device) {
+    const uint32_t slotIndex = static_cast<uint32_t>(operation);
+    auto &slot = BuiltinOpsBuilders[slotIndex]; // pair of (builder, once_flag)
+    switch (operation) {
+    case EBuiltInOps::CopyBufferToBuffer:
+        std::call_once(slot.second, [&] {
+            slot.first.reset(new BuiltInOp<HWFamily, EBuiltInOps::CopyBufferToBuffer>(*this, context, device));
+        });
+        break;
+    case EBuiltInOps::CopyBufferRect:
+        std::call_once(slot.second, [&] {
+            slot.first.reset(new BuiltInOp<HWFamily, EBuiltInOps::CopyBufferRect>(*this, context, device));
+        });
+        break;
+    case EBuiltInOps::FillBuffer:
+        std::call_once(slot.second, [&] {
+            slot.first.reset(new BuiltInOp<HWFamily, EBuiltInOps::FillBuffer>(*this, context, device));
+        });
+        break;
+    case EBuiltInOps::CopyBufferToImage3d:
+        std::call_once(slot.second, [&] {
+            slot.first.reset(new BuiltInOp<HWFamily, EBuiltInOps::CopyBufferToImage3d>(*this, context, device));
+        });
+        break;
+    case EBuiltInOps::CopyImage3dToBuffer:
+        std::call_once(slot.second, [&] {
+            slot.first.reset(new BuiltInOp<HWFamily, EBuiltInOps::CopyImage3dToBuffer>(*this, context, device));
+        });
+        break;
+    case EBuiltInOps::CopyImageToImage3d:
+        std::call_once(slot.second, [&] {
+            slot.first.reset(new BuiltInOp<HWFamily, EBuiltInOps::CopyImageToImage3d>(*this, context, device));
+        });
+        break;
+    case EBuiltInOps::FillImage3d:
+        std::call_once(slot.second, [&] {
+            slot.first.reset(new BuiltInOp<HWFamily, EBuiltInOps::FillImage3d>(*this, context, device));
+        });
+        break;
+    case EBuiltInOps::VmeBlockMotionEstimateIntel:
+        std::call_once(slot.second, [&] {
+            slot.first.reset(new BuiltInOp<HWFamily, EBuiltInOps::VmeBlockMotionEstimateIntel>(*this, context, device));
+        });
+        break;
+    case EBuiltInOps::VmeBlockAdvancedMotionEstimateCheckIntel:
+        std::call_once(slot.second, [&] {
+            slot.first.reset(new BuiltInOp<HWFamily, EBuiltInOps::VmeBlockAdvancedMotionEstimateCheckIntel>(*this, context, device));
+        });
+        break;
+    case EBuiltInOps::VmeBlockAdvancedMotionEstimateBidirectionalCheckIntel:
+        std::call_once(slot.second, [&] {
+            slot.first.reset(new BuiltInOp<HWFamily, EBuiltInOps::VmeBlockAdvancedMotionEstimateBidirectionalCheckIntel>(*this, context, device));
+        });
+        break;
+    default:
+        throw std::runtime_error("getBuiltinDispatchInfoBuilder failed");
+    }
+    return *slot.first;
+}
+
+// Installs `builder` as the dispatch-info builder for `operation` and hands the
+// previously installed one (possibly null) back to the caller.
+// `context` and `device` are not used by this function.
+std::unique_ptr<BuiltinDispatchInfoBuilder> BuiltIns::setBuiltinDispatchInfoBuilder(EBuiltInOps operation, Context &context, Device &device, std::unique_ptr<BuiltinDispatchInfoBuilder> builder) {
+    auto &slot = BuiltinOpsBuilders[static_cast<uint32_t>(operation)];
+    // After the exchange `builder` owns whatever the slot held before.
+    builder.swap(slot.first);
+    return builder;
+}
+
+} // namespace OCLRT
--- a/runtime/built_ins/built_ins.h
+++ b/runtime/built_ins/built_ins.h
@@ -0,0 +1,301 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+#include "runtime/built_ins/sip.h"
+#include "runtime/scheduler/scheduler_kernel.h"
+#include "runtime/program/program.h"
+#include "runtime/utilities/vec.h"
+#include "runtime/os_interface/os_inc.h"
+
+#include <array>
+#include <cstdint>
+#include <fstream>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <tuple>
+#include <unordered_map>
+#include <vector>
+
+namespace OCLRT {
+typedef std::vector<char> BuiltinResourceT;
+
+extern const char *mediaKernelsBuildOptions;
+
+// Identifiers of the built-in operations.
+// The numeric values are used as array indices (BuiltIns::BuiltinOpsBuilders and
+// BuiltIns::builtinPrograms are sized by COUNT), so COUNT has to stay the last entry.
+enum class EBuiltInOps : uint32_t {
+    CopyBufferToBuffer = 0,
+    CopyBufferRect,
+    FillBuffer,
+    CopyBufferToImage3d,
+    CopyImage3dToBuffer,
+    CopyImageToImage1d,
+    CopyImageToImage2d,
+    CopyImageToImage3d,
+    FillImage1d,
+    FillImage2d,
+    FillImage3d,
+    VmeBlockMotionEstimateIntel,
+    VmeBlockAdvancedMotionEstimateCheckIntel,
+    VmeBlockAdvancedMotionEstimateBidirectionalCheckIntel,
+    Scheduler,
+    COUNT // number of operations, not an operation itself
+};
+
+// Creates a resource holding a copy of the `size` bytes starting at `ptr`.
+BuiltinResourceT createBuiltinResource(const char *ptr, size_t size);
+// Creates a resource that is a copy of `r`.
+BuiltinResourceT createBuiltinResource(const BuiltinResourceT &r);
+// Composes "[<platformName>_<deviceRevId>_]<built-in name><extension>".
+// The platform/revision prefix is emitted only when platformName is non-empty.
+std::string createBuiltinResourceName(EBuiltInOps builtin, const std::string &extension,
+                                      const std::string &platformName = "", uint32_t deviceRevId = 0);
+// Joins two path fragments, inserting PATH_SEPARATOR unless lhs already ends with it.
+// If either fragment is empty the other one is returned unchanged.
+std::string joinPath(const std::string &lhs, const std::string &rhs);
+// Maps an operation to its resource base name; returns "unknown" for unlisted values.
+const char *getBuiltinAsString(EBuiltInOps builtin);
+
+// Abstract source of built-in resources addressed by name relative to a root path.
+// Concrete storages implement loadImpl(); load() prepends the root path first.
+class Storage {
+  public:
+    // rootPath: prefix joined in front of every resource name passed to load().
+    Storage(const std::string &rootPath)
+        : rootPath(rootPath) {
+    }
+
+    // Storages are owned and destroyed through std::unique_ptr<Storage>
+    // (see BuiltinsLib::StoragesContainerT), so the destructor must be virtual.
+    virtual ~Storage() = default;
+
+    // Loads the resource `resourceName` located under rootPath.
+    BuiltinResourceT load(const std::string &resourceName);
+
+  protected:
+    // Loads the resource identified by the already-joined name.
+    virtual BuiltinResourceT loadImpl(const std::string &fullResourceName) = 0;
+
+    std::string rootPath;
+};
+
+// Storage that reads resources from files on disk.
+class FileStorage : public Storage {
+  public:
+    // rootPath: directory prefix for resource files; empty by default.
+    FileStorage(const std::string &rootPath = "")
+        : Storage(rootPath) {
+    }
+
+  protected:
+    // Reads the whole file named fullResourceName in binary mode.
+    BuiltinResourceT loadImpl(const std::string &fullResourceName) override;
+};
+
+// Process-wide, name-keyed table of resources registered through store().
+struct EmbeddedStorageRegistry {
+    // Returns the single shared registry (function-local static).
+    static EmbeddedStorageRegistry &getInstance() {
+        static EmbeddedStorageRegistry registry;
+        return registry;
+    }
+
+    // Registers `resource` under `name`, taking over its contents.
+    // emplace() keeps an already existing entry with the same name untouched.
+    void store(const std::string &name, BuiltinResourceT &&resource) {
+        resources.emplace(name, std::move(resource));
+    }
+
+    // Returns the resource registered under `name`, or nullptr if there is none.
+    const BuiltinResourceT *get(const std::string &name) const;
+
+  private:
+    using ResourcesContainer = std::unordered_map<std::string, BuiltinResourceT>;
+    ResourcesContainer resources;
+};
+
+// Storage that serves resources out of EmbeddedStorageRegistry.
+class EmbeddedStorage : public Storage {
+  public:
+    // rootPath: prefix joined in front of every resource name before the registry lookup.
+    EmbeddedStorage(const std::string &rootPath)
+        : Storage(rootPath) {
+    }
+
+  protected:
+    // Returns a copy of the registered resource, or an empty resource if the name is unknown.
+    BuiltinResourceT loadImpl(const std::string &fullResourceName) override;
+};
+
+// A built-in's code blob together with the form it is stored in and its target device.
+struct BuiltinCode {
+    // Form of the code. Binary..Source are ordered by lookup priority (lower value first).
+    enum class ECodeType {
+        Any = 0,          // request whichever form is available, tried in the order below
+        Binary = 1,       // ISA - highest priority
+        Intermediate = 2, // SPIR/LLVM - medium priority
+        Source = 3,       // OCL C - lowest priority
+        COUNT,
+        INVALID
+    };
+
+    // Returns the file extension used for a code type; "" for types without one.
+    static const char *getExtension(ECodeType ct) {
+        switch (ct) {
+        case ECodeType::Binary:
+            return ".bin";
+        case ECodeType::Intermediate:
+            return ".bc";
+        case ECodeType::Source:
+            return ".cl";
+        default:
+            return "";
+        }
+    }
+
+    ECodeType type;
+    BuiltinResourceT resource;
+    Device *targetDevice;
+};
+
+// Looks up built-in code in a prioritized list of storages.
+class BuiltinsLib {
+  public:
+    // Registers the embedded storage first and the file storage second.
+    BuiltinsLib();
+    // Returns the code of `builtin` in the requested form. With ECodeType::Any the forms
+    // are tried from Binary to Source and the first non-empty one is returned.
+    // The lookup is serialized by `mutex`.
+    BuiltinCode getBuiltinCode(EBuiltInOps builtin, BuiltinCode::ECodeType requestedCodeType, Device &device);
+
+    // Creates a Program from `bc`; the result is null for code types other than
+    // Source, Intermediate and Binary.
+    static std::unique_ptr<Program> createProgramFromCode(const BuiltinCode &bc, Context &context, Device &device);
+
+  protected:
+    // Searches all storages for the most specific resource name first
+    // (platform + stepping, then platform, then generic).
+    BuiltinResourceT getBuiltinResource(EBuiltInOps builtin, BuiltinCode::ECodeType requestedCodeType, Device &device);
+
+    using StoragesContainerT = std::vector<std::unique_ptr<Storage>>;
+    StoragesContainerT allStorages; // sorted by priority allStorages[0] will be checked before allStorages[1], etc.
+
+    std::mutex mutex;
+};
+
+class Context;
+class Device;
+class Kernel;
+class Program;
+
+// Bundles the pieces of one lazily built built-in kernel.
+// NOTE(review): ownership of pProgram/pKernel is not visible in this header - confirm
+// who releases them.
+struct BuiltInKernel {
+    const char *pSource = nullptr;
+    Program *pProgram = nullptr;
+    std::once_flag programIsInitialized; // guard for creating+building the program
+    Kernel *pKernel = nullptr;
+
+    BuiltInKernel() {
+    }
+};
+
+class BuiltinDispatchInfoBuilder;
+
+// Central access point for built-in kernels, dispatch-info builders, the scheduler
+// kernel and SIP kernels. Obtained through getInstance(); constructor and destructor
+// are protected.
+class BuiltIns {
+  public:
+    // Type passed as the HWFamily template argument when BuiltInOp builders are instantiated.
+    using HWFamily = int;
+    // One (builder, once_flag) slot per EBuiltInOps value; the flag guards lazy creation.
+    std::pair<std::unique_ptr<BuiltinDispatchInfoBuilder>, std::once_flag> BuiltinOpsBuilders[static_cast<uint32_t>(EBuiltInOps::COUNT)];
+
+    // Returns the builder for `op`, creating it on first use; throws std::runtime_error
+    // for operations that have no builder.
+    BuiltinDispatchInfoBuilder &getBuiltinDispatchInfoBuilder(EBuiltInOps op, Context &context, Device &device);
+    // Replaces the builder for `op` with newBuilder and returns the previous one.
+    std::unique_ptr<BuiltinDispatchInfoBuilder> setBuiltinDispatchInfoBuilder(EBuiltInOps op, Context &context, Device &device,
+                                                                              std::unique_ptr<BuiltinDispatchInfoBuilder> newBuilder);
+
+    static BuiltIns &getInstance();
+    static void shutDown();
+    Program *createBuiltInProgram(
+        Context &context,
+        Device &device,
+        const char *kernelNames,
+        int &errcodeRet);
+
+    SchedulerKernel &getSchedulerKernel(Context &context);
+
+    SipKernel &getSipKernel(SipKernelType kernel, Context &context);
+
+    // Accessor for the resource library; builtinsLib is expected to be set
+    // (debug builds break otherwise).
+    BuiltinsLib &getBuiltinsLib() {
+        DEBUG_BREAK_IF(!builtinsLib.get());
+        return *builtinsLib;
+    }
+
+    void setCacheingEnableState(bool enableCacheing) {
+        this->enableCacheing = enableCacheing;
+    }
+
+    bool isCacheingEnabled() const {
+        return this->enableCacheing;
+    }
+
+  protected:
+    BuiltIns();
+    ~BuiltIns();
+
+    // singleton
+    static BuiltIns *pInstance;
+
+    // scheduler kernel
+    BuiltInKernel schedulerBuiltIn;
+
+    // sip builtins
+    std::pair<std::unique_ptr<SipKernel>, std::once_flag> sipKernels[static_cast<uint32_t>(SipKernelType::COUNT)];
+
+    std::unique_ptr<BuiltinsLib> builtinsLib;
+
+    // One (program, once_flag) slot per EBuiltInOps value.
+    using ProgramsContainerT = std::array<std::pair<std::unique_ptr<Program>, std::once_flag>, static_cast<size_t>(EBuiltInOps::COUNT)>;
+    ProgramsContainerT builtinPrograms;
+    bool enableCacheing = true;
+};
+
+class MemObj;
+
+// Base class of the per-operation dispatch builders (BuiltInOp<...>).
+// It owns the built-in program and the kernels grabbed from it; derived classes
+// override buildDispatchInfos() to translate operation parameters into dispatches.
+class BuiltinDispatchInfoBuilder {
+  public:
+    // Parameters of one built-in operation. Which fields are meaningful depends on
+    // the operation; unused ones keep their zero/null defaults.
+    struct BuiltinOpParams {
+        void *srcPtr = nullptr;
+        void *dstPtr = nullptr;
+        MemObj *srcMemObj = nullptr;
+        MemObj *dstMemObj = nullptr;
+        GraphicsAllocation *srcSvmAlloc = nullptr;
+        GraphicsAllocation *dstSvmAlloc = nullptr;
+        Vec3<size_t> srcOffset = {0, 0, 0};
+        Vec3<size_t> dstOffset = {0, 0, 0};
+        Vec3<size_t> size = {0, 0, 0};
+        size_t srcRowPitch = 0;
+        size_t dstRowPitch = 0;
+        size_t srcSlicePitch = 0;
+        size_t dstSlicePitch = 0;
+    };
+
+    BuiltinDispatchInfoBuilder(BuiltIns &kernelLib) : kernelsLib(kernelLib) {}
+
+    // Builders are created as BuiltInOp<...> objects but owned and destroyed through
+    // std::unique_ptr<BuiltinDispatchInfoBuilder>, so the destructor must be virtual.
+    virtual ~BuiltinDispatchInfoBuilder() = default;
+
+    // Prepares the program for `operation` and fills the (kernel name, Kernel *&) pairs in `desc`.
+    template <typename... KernelsDescArgsT>
+    void populate(Context &context, Device &device, EBuiltInOps operation, const char *options, KernelsDescArgsT &&... desc);
+
+    // Builds dispatches for a parameter-described operation.
+    // The base implementation builds nothing and returns false.
+    virtual bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo, const BuiltinOpParams &operationParams) const {
+        return false;
+    }
+    // Builds dispatches for an explicit kernel and work sizes.
+    // The base implementation builds nothing and returns false.
+    virtual bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo, Kernel *kernel,
+                                    const uint32_t dim, const Vec3<size_t> &gws, const Vec3<size_t> &elws, const Vec3<size_t> &offset) const {
+        return false;
+    }
+
+    // Validates a dispatch request; the base implementation accepts everything.
+    virtual cl_int validateDispatch(Kernel *kernel, uint32_t inworkDim, const Vec3<size_t> &gws, const Vec3<size_t> &elws, const Vec3<size_t> &offset) const {
+        return CL_SUCCESS;
+    }
+
+    // returns true if argument should be updated in kernel exposed to user code
+    virtual bool setExplicitArg(uint32_t argIndex, size_t argSize, const void *argVal, cl_int &err) const {
+        return true;
+    }
+
+    void takeOwnership(Context *context);
+    void releaseOwnership();
+
+  protected:
+    // Consumes one (kernel name, destination pointer) pair: creates the kernel from
+    // `prog`, marks it as built-in, stores the owning pointer in usedKernels and then
+    // recurses on the remaining pairs.
+    // NOTE(review): neither the KernelInfo lookup nor Kernel::create is checked for
+    // failure here; a missing kernel name would dereference a null pointer - confirm
+    // that callers only pass names present in the program.
+    template <typename KernelNameT, typename... KernelsDescArgsT>
+    void grabKernels(KernelNameT &&kernelName, Kernel *&kernelDst, KernelsDescArgsT &&... kernelsDesc) {
+        const KernelInfo *ki = prog->getKernelInfo(kernelName);
+        cl_int err = 0;
+        kernelDst = Kernel::create(prog.get(), *ki, &err);
+        kernelDst->isBuiltIn = true;
+        usedKernels.push_back(std::unique_ptr<Kernel>(kernelDst));
+        grabKernels(std::forward<KernelsDescArgsT>(kernelsDesc)...);
+    }
+
+    // Terminates the recursion above once all pairs have been consumed.
+    cl_int grabKernels() { return CL_SUCCESS; }
+
+    std::unique_ptr<Program> prog;
+    std::vector<std::unique_ptr<Kernel>> usedKernels;
+    BuiltIns &kernelsLib;
+};
+
+template <typename HWFamily, EBuiltInOps OpCode>
+class BuiltInOp;
+
+} // namespace OCLRT
--- a/runtime/built_ins/built_ins_storage.cpp
+++ b/runtime/built_ins/built_ins_storage.cpp
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <cstdint>
+#include "runtime/built_ins/built_ins.h"
+
+namespace OCLRT {
+
+// Maps a built-in operation to the base name of its resource
+// ("<snake_case_name>.igdrcl_built_in"). Values without an entry, including
+// EBuiltInOps::COUNT, yield "unknown".
+const char *getBuiltinAsString(EBuiltInOps builtin) {
+    switch (builtin) {
+    default:
+        return "unknown";
+    case EBuiltInOps::CopyBufferToBuffer:
+        return "copy_buffer_to_buffer.igdrcl_built_in";
+    case EBuiltInOps::CopyBufferRect:
+        return "copy_buffer_rect.igdrcl_built_in";
+    case EBuiltInOps::FillBuffer:
+        return "fill_buffer.igdrcl_built_in";
+    case EBuiltInOps::CopyBufferToImage3d:
+        return "copy_buffer_to_image3d.igdrcl_built_in";
+    case EBuiltInOps::CopyImage3dToBuffer:
+        return "copy_image3d_to_buffer.igdrcl_built_in";
+    case EBuiltInOps::CopyImageToImage1d:
+        return "copy_image_to_image1d.igdrcl_built_in";
+    case EBuiltInOps::CopyImageToImage2d:
+        return "copy_image_to_image2d.igdrcl_built_in";
+    case EBuiltInOps::CopyImageToImage3d:
+        return "copy_image_to_image3d.igdrcl_built_in";
+    case EBuiltInOps::FillImage1d:
+        return "fill_image1d.igdrcl_built_in";
+    case EBuiltInOps::FillImage2d:
+        return "fill_image2d.igdrcl_built_in";
+    case EBuiltInOps::FillImage3d:
+        return "fill_image3d.igdrcl_built_in";
+    case EBuiltInOps::VmeBlockMotionEstimateIntel:
+        return "vme_block_motion_estimate_intel.igdrcl_built_in";
+    case EBuiltInOps::VmeBlockAdvancedMotionEstimateCheckIntel:
+        return "vme_block_advanced_motion_estimate_check_intel.igdrcl_built_in";
+    case EBuiltInOps::VmeBlockAdvancedMotionEstimateBidirectionalCheckIntel:
+        // Carries the same ".igdrcl_built_in" suffix as every other built-in so the
+        // composed resource name follows the common naming scheme.
+        return "vme_block_advanced_motion_estimate_bidirectional_check_intel.igdrcl_built_in";
+    case EBuiltInOps::Scheduler:
+        return "scheduler.igdrcl_built_in";
+    }
+}
+
+// Copies the byte range [ptr, ptr + size) into a freshly created resource.
+BuiltinResourceT createBuiltinResource(const char *ptr, size_t size) {
+    const char *const rangeEnd = ptr + size;
+    BuiltinResourceT resource(ptr, rangeEnd);
+    return resource;
+}
+
+// Returns an independent copy of an existing resource.
+BuiltinResourceT createBuiltinResource(const BuiltinResourceT &r) {
+    BuiltinResourceT duplicate = r;
+    return duplicate;
+}
+
+// Composes the lookup name of a built-in resource:
+//   "[<platformName>_<deviceRevId>_]<built-in base name><extension>"
+// The bracketed prefix is only present when platformName is non-empty.
+std::string createBuiltinResourceName(EBuiltInOps builtin, const std::string &extension,
+                                      const std::string &platformName, uint32_t deviceRevId) {
+    std::string resourceName;
+
+    if (!platformName.empty()) {
+        resourceName.append(platformName);
+        resourceName.append("_");
+        resourceName.append(std::to_string(deviceRevId));
+        resourceName.append("_");
+    }
+
+    resourceName.append(getBuiltinAsString(builtin));
+    resourceName.append(extension); // appending an empty extension changes nothing
+
+    return resourceName;
+}
+
+// Joins two path fragments with exactly one PATH_SEPARATOR between them when lhs
+// does not already end with one. An empty fragment yields the other one unchanged.
+std::string joinPath(const std::string &lhs, const std::string &rhs) {
+    if (lhs.empty()) {
+        return rhs;
+    }
+    if (rhs.empty()) {
+        return lhs;
+    }
+
+    const bool lhsEndsWithSeparator = (lhs.back() == PATH_SEPARATOR);
+    if (!lhsEndsWithSeparator) {
+        return lhs + PATH_SEPARATOR + rhs;
+    }
+    return lhs + rhs;
+}
+
+// Root directory used by the file storage; this implementation returns an empty path.
+std::string getDriverInstallationPath() {
+    return std::string();
+}
+
+// Resolves resourceName against rootPath and delegates to the storage-specific loader.
+BuiltinResourceT Storage::load(const std::string &resourceName) {
+    const std::string fullResourceName = joinPath(rootPath, resourceName);
+    return loadImpl(fullResourceName);
+}
+
+// Reads the whole file `fullResourceName` in binary mode.
+// Returns the file contents, or an empty resource when the file cannot be opened,
+// its size cannot be determined, it is empty, or it cannot be read completely.
+// Callers treat an empty resource as "not found" and continue with the next candidate.
+BuiltinResourceT FileStorage::loadImpl(const std::string &fullResourceName) {
+    BuiltinResourceT ret;
+
+    // Open positioned at the end so the first tellg() reports the file size.
+    std::ifstream f{fullResourceName, std::ios::in | std::ios::binary | std::ios::ate};
+    if (!f.is_open()) {
+        return ret;
+    }
+
+    const auto end = f.tellg();
+    f.seekg(0, std::ios::beg);
+    const auto beg = f.tellg();
+    if (!f) {
+        // tellg()/seekg() failed: the positions are not usable for a size computation
+        return ret;
+    }
+
+    const std::streamoff s = end - beg;
+    if (s <= 0) {
+        return ret;
+    }
+
+    ret.resize(static_cast<size_t>(s));
+    f.read(ret.data(), s);
+    if (f.gcount() != static_cast<std::streamsize>(s)) {
+        // Short read: do not hand out a partially filled buffer as valid content.
+        ret.clear();
+    }
+    return ret;
+}
+
+// Looks up a registered resource by name; nullptr means no such entry.
+const BuiltinResourceT *EmbeddedStorageRegistry::get(const std::string &name) const {
+    const auto found = resources.find(name);
+    return (found != resources.end()) ? &found->second : nullptr;
+}
+
+// Returns a copy of the resource registered under fullResourceName,
+// or an empty resource when the registry has no such entry.
+BuiltinResourceT EmbeddedStorage::loadImpl(const std::string &fullResourceName) {
+    const BuiltinResourceT *registered = EmbeddedStorageRegistry::getInstance().get(fullResourceName);
+    if (registered != nullptr) {
+        return createBuiltinResource(*registered);
+    }
+
+    return BuiltinResourceT();
+}
+
+// Registers the storages in priority order: embedded resources are consulted
+// before files under the driver installation path.
+BuiltinsLib::BuiltinsLib() {
+    std::unique_ptr<Storage> embeddedStorage(new EmbeddedStorage(""));
+    allStorages.push_back(std::move(embeddedStorage));
+
+    std::unique_ptr<Storage> fileStorage(new FileStorage(getDriverInstallationPath()));
+    allStorages.push_back(std::move(fileStorage));
+}
+
+// Fetches the code of `builtin` for `device`.
+// - A specific requestedCodeType is looked up as is; the result carries that type even
+//   when no resource was found (the resource is then empty).
+// - ECodeType::Any tries Binary, Intermediate and Source in that order and stops at the
+//   first non-empty resource; if none is found the type is INVALID.
+// The whole lookup runs under the library mutex.
+BuiltinCode BuiltinsLib::getBuiltinCode(EBuiltInOps builtin, BuiltinCode::ECodeType requestedCodeType, Device &device) {
+    std::lock_guard<std::mutex> guard{mutex};
+
+    BuiltinCode result;
+    result.type = BuiltinCode::ECodeType::INVALID;
+    result.targetDevice = &device;
+
+    if (requestedCodeType != BuiltinCode::ECodeType::Any) {
+        result.resource = getBuiltinResource(builtin, requestedCodeType, device);
+        result.type = requestedCodeType;
+        return result;
+    }
+
+    const uint32_t firstType = static_cast<uint32_t>(BuiltinCode::ECodeType::Binary);
+    const uint32_t endType = static_cast<uint32_t>(BuiltinCode::ECodeType::COUNT);
+    for (uint32_t rawType = firstType; rawType != endType; ++rawType) {
+        const auto candidateType = static_cast<BuiltinCode::ECodeType>(rawType);
+        result.resource = getBuiltinResource(builtin, candidateType, device);
+        if (!result.resource.empty()) {
+            result.type = candidateType;
+            break;
+        }
+    }
+
+    return result;
+}
+
+// Creates a Program from built-in code.
+// Source and Intermediate code go through Program::create, Binary code through
+// Program::createFromGenBinary; any other code type yields a null pointer.
+std::unique_ptr<Program> BuiltinsLib::createProgramFromCode(const BuiltinCode &bc, Context &context, Device &device) {
+    std::unique_ptr<Program> program;
+    const char *code = bc.resource.data();
+    size_t codeSize = bc.resource.size();
+    cl_int err = 0;
+
+    if (bc.type == BuiltinCode::ECodeType::Binary) {
+        program.reset(Program::createFromGenBinary(&context, code, codeSize, true, nullptr));
+    } else if ((bc.type == BuiltinCode::ECodeType::Source) || (bc.type == BuiltinCode::ECodeType::Intermediate)) {
+        program.reset(Program::create(code, &context, device, true, &err));
+    }
+
+    return program;
+}
+
+// Searches the storages for the resource of `builtin` in the requested code form.
+// Candidate names go from most to least specific (platform + stepping, platform,
+// generic); for each name the storages are queried in priority order. The first
+// non-empty resource wins; an empty resource is returned when nothing matches.
+BuiltinResourceT BuiltinsLib::getBuiltinResource(EBuiltInOps builtin, BuiltinCode::ECodeType requestedCodeType, Device &device) {
+    const char *extension = BuiltinCode::getExtension(requestedCodeType);
+
+    const std::string candidateNames[] = {
+        createBuiltinResourceName(builtin, extension, device.getProductAbbrev(), device.getHardwareInfo().pPlatform->usRevId),
+        createBuiltinResourceName(builtin, extension, device.getProductAbbrev()),
+        createBuiltinResourceName(builtin, extension)};
+
+    for (const auto &candidateName : candidateNames) {
+        for (auto &storage : allStorages) {
+            BuiltinResourceT loaded = storage->load(candidateName);
+            if (!loaded.empty()) {
+                return loaded;
+            }
+        }
+    }
+    return BuiltinResourceT();
+}
+
+} // namespace OCLRT
--- a/runtime/built_ins/kernels/CMakeLists.txt
+++ b/runtime/built_ins/kernels/CMakeLists.txt
@@ -0,0 +1,120 @@
+# Copyright (c) 2017, Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+add_custom_target(builtins) # umbrella target; per-platform targets are attached to it further down
+set_target_properties(builtins PROPERTIES FOLDER "built_ins")
+add_dependencies(${BUILTINS_BINARIES_LIB_NAME} builtins)
+set(BUILTINS_OUTDIR_WITH_ARCH "${TargetDir}/built_ins/${NEO_ARCH}")
+
+# Kernel source file behind each <NAME>_BUILTIN variable
+set(COPY_BUFFER_TO_BUFFER_BUILTIN "copy_buffer_to_buffer.igdrcl_built_in")
+set(COPY_BUFFER_RECT_BUILTIN "copy_buffer_rect.igdrcl_built_in")
+set(FILL_BUFFER_BUILTIN "fill_buffer.igdrcl_built_in")
+set(COPY_BUFFER_TO_IMAGE3D_BUILTIN "copy_buffer_to_image3d.igdrcl_built_in")
+set(COPY_IMAGE3D_TO_BUFFER_BUILTIN "copy_image3d_to_buffer.igdrcl_built_in")
+set(COPY_IMAGE_TO_IMAGE1D_BUILTIN "copy_image_to_image1d.igdrcl_built_in")
+set(COPY_IMAGE_TO_IMAGE2D_BUILTIN "copy_image_to_image2d.igdrcl_built_in")
+set(COPY_IMAGE_TO_IMAGE3D_BUILTIN "copy_image_to_image3d.igdrcl_built_in")
+set(FILL_IMAGE1D_BUILTIN "fill_image1d.igdrcl_built_in")
+set(FILL_IMAGE2D_BUILTIN "fill_image2d.igdrcl_built_in")
+set(FILL_IMAGE3D_BUILTIN "fill_image3d.igdrcl_built_in")
+
+# Extra cloc option, only set for the x32 architecture
+set(BUILTIN_OPTIONS "")
+if("${NEO_ARCH}" STREQUAL "x32")
+    set(BUILTIN_OPTIONS "-cl-intel-greater-than-4GB-buffer-required")
+endif()
+
+# Debug configurations additionally define DEBUG for the kernels
+set(BUILTIN_DEBUG_OPTION "")
+if("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
+    set(BUILTIN_DEBUG_OPTION "-D DEBUG")
+endif()
+
+set(BUILTINS_INCLUDE_DIR ${TargetDir} PARENT_SCOPE)
+set(BUILTIN_CPP "")
+
+# compile_builtin(<gen_name> <builtin>)
+# Adds the cloc command that compiles one built-in kernel file for one device
+# and hands the path of the generated .cpp back to the caller via BUILTIN_CPP.
+function(compile_builtin gen_name builtin)
+  # Split the kernel path into the pieces used below
+  get_filename_component(FILENAME ${builtin} NAME)    # file name with extension
+  get_filename_component(BASENAME ${builtin} NAME_WE) # file name without extension
+
+  # Files listed as outputs of the cloc command for this kernel/device pair
+  set(OUTPUTDIR "${BUILTINS_OUTDIR_WITH_ARCH}/${gen_name}")
+  set(OUTPUTPATH_BASE "${OUTPUTDIR}/${BASENAME}_${gen_name}")
+  set(OUTPUT_FILES
+    ${OUTPUTPATH_BASE}.bc
+    ${OUTPUTPATH_BASE}.bin
+    ${OUTPUTPATH_BASE}.cpp
+    ${OUTPUTPATH_BASE}.gen
+  )
+
+  # "Return value": clear the function-local BUILTIN_CPP, then publish the
+  # .cpp path in the caller's scope.
+  unset(BUILTIN_CPP)
+  set(BUILTIN_CPP built_ins/${NEO_ARCH}/${gen_name}/${BASENAME}_${gen_name}.cpp PARENT_SCOPE)
+
+  # With MSVC the cloc target is named directly; otherwise the cloc binary is
+  # started by path with LD_LIBRARY_PATH set to the directory it lives in.
+  if(MSVC)
+    set(CLOC_INVOCATION cloc)
+  else()
+    set(CLOC_INVOCATION LD_LIBRARY_PATH=$<TARGET_FILE_DIR:cloc> $<TARGET_FILE:cloc>)
+  endif()
+
+  # The command runs from this directory, so the kernel is passed by bare file name
+  add_custom_command(
+    OUTPUT ${OUTPUT_FILES}
+    COMMAND ${CLOC_INVOCATION} -q -file ${FILENAME} -device ${gen_name} ${BUILTIN_OPTIONS} -${NEO_BITS} -out_dir ${OUTPUTDIR} -cpp_file -options "-cl-kernel-arg-info ${BUILTIN_DEBUG_OPTION}"
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+    DEPENDS ${builtin} cloc copy_compiler_files
+  )
+endfunction()
+
+macro(compile_builtins GEN_NUM PLATFORM_IT)
+  # Compiles every kernel named in GENERATED_BUILTINS for one platform and groups the results under builtins_<platform>.
+  string(TOLOWER ${PLATFORM_IT} PLATFORM_LOWER)
+  set(GEN "_GEN${GEN_NUM}_${PLATFORM_IT}")
+  set(target_name builtins_${PLATFORM_LOWER})
+
+  set(BUILTINS_COMMANDS) # reset the accumulator before collecting this platform's generated files
+  foreach(GENERATED_BUILTIN ${GENERATED_BUILTINS})
+    compile_builtin(${PLATFORM_LOWER} ${${GENERATED_BUILTIN}_BUILTIN})
+    list(APPEND BUILTINS_COMMANDS ${TargetDir}/${BUILTIN_CPP})
+    set(RUNTIME_GENERATED_${GENERATED_BUILTIN}${GEN} ${BUILTIN_CPP} PARENT_SCOPE)
+  endforeach()
+  add_custom_target(${target_name} DEPENDS ${BUILTINS_COMMANDS})
+  add_dependencies(builtins ${target_name})
+  set_target_properties(${target_name} PROPERTIES FOLDER "built_ins/${PLATFORM_LOWER}")
+endmacro()
+
+# Walk the gens from MAX_GEN down to 0 and compile the built-ins for each platform returned for the "SUPPORTED" query
+foreach(GEN_NUM RANGE ${MAX_GEN} 0 -1)
+  gen_contains_platforms("SUPPORTED" ${GEN_NUM} GENX_HAS_PLATFORMS)
+  if(${GENX_HAS_PLATFORMS})
+    get_platforms_for_gen("SUPPORTED" ${GEN_NUM} SUPPORTED_GENX_PLATFORMS)
+    foreach(PLATFORM_IT ${SUPPORTED_GENX_PLATFORMS})
+      compile_builtins(${GEN_NUM} ${PLATFORM_IT})
+    endforeach()
+  endif()
+endforeach()
--- a/runtime/built_ins/kernels/copy_buffer_rect.igdrcl_built_in
+++ b/runtime/built_ins/kernels/copy_buffer_rect.igdrcl_built_in
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+R"===(
+//////////////////////////////////////////////////////////////////////////////
+__kernel void CopyBufferRectBytes2d(
+    __global const char* src,
+    __global char* dst,
+    uint4 SrcOrigin,
+    uint4 DstOrigin,
+    uint2 SrcPitch,
+    uint2 DstPitch )
+{
+    // One work-item copies one byte; Pitch.x scales the row term of each offset.
+    const int col = get_global_id(0);
+    const int row = get_global_id(1);
+
+    // offset = (origin.x + col) + (origin.y + row) * Pitch.x
+    const uint srcByte = col + SrcOrigin.x + ( row + SrcOrigin.y ) * SrcPitch.x;
+    const uint dstByte = col + DstOrigin.x + ( row + DstOrigin.y ) * DstPitch.x;
+
+    dst[ dstByte ] = src[ srcByte ];
+}
+//////////////////////////////////////////////////////////////////////////////
+__kernel void CopyBufferRectBytes3d(
+    __global const char* src,
+    __global char* dst,
+    uint4 SrcOrigin,
+    uint4 DstOrigin,
+    uint2 SrcPitch,
+    uint2 DstPitch )
+{
+    // One work-item copies one byte.
+    // Pitch.x scales the row term of each offset, Pitch.y the slice term.
+    const int col = get_global_id(0);
+    const int row = get_global_id(1);
+    const int slice = get_global_id(2);
+
+    const uint srcByte = col + SrcOrigin.x + ( row + SrcOrigin.y ) * SrcPitch.x + ( slice + SrcOrigin.z ) * SrcPitch.y;
+    const uint dstByte = col + DstOrigin.x + ( row + DstOrigin.y ) * DstPitch.x + ( slice + DstOrigin.z ) * DstPitch.y;
+
+    dst[ dstByte ] = src[ srcByte ];
+}
+)==="
--- a/runtime/built_ins/kernels/copy_buffer_to_buffer.igdrcl_built_in
+++ b/runtime/built_ins/kernels/copy_buffer_to_buffer.igdrcl_built_in
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+R"===(
+__kernel void CopyBufferToBufferBytes(
+    const __global uchar* pSrc,
+    __global uchar* pDst,
+    uint srcOffsetInBytes,
+    uint dstOffsetInBytes,
+    uint bytesToRead )
+{
+    // One byte per work-item; bytesToRead is part of the signature but is not read here.
+    const size_t gid = get_global_id(0);
+    pDst[ dstOffsetInBytes + gid ] = pSrc[ srcOffsetInBytes + gid ];
+}
+
+__kernel void CopyBufferToBufferLeftLeftover(
+    const __global uchar* pSrc,
+    __global uchar* pDst,
+    uint srcOffsetInBytes,
+    uint dstOffsetInBytes)
+{
+    const uint id = get_global_id(0); // one byte per work-item; index math stays in 32-bit uint
+    pDst[ dstOffsetInBytes + id ] = pSrc[ srcOffsetInBytes + id ];
+}
+
+__kernel void CopyBufferToBufferMiddle(
+    const __global uint* pSrc,
+    __global uint* pDst,
+    uint srcOffsetInBytes,
+    uint dstOffsetInBytes)
+{
+    const uint gid = get_global_id(0);
+    // Offsets arrive in bytes while the pointers are uint-typed, hence the shift by 2.
+    const __global uint* from = pSrc + ( srcOffsetInBytes >> 2 );
+    __global uint* to = pDst + ( dstOffsetInBytes >> 2 );
+    vstore4( vload4( gid, from ), gid, to ); // each work-item moves one uint4
+}
+
+__kernel void CopyBufferToBufferRightLeftover(
+    const __global uchar* pSrc,
+    __global uchar* pDst,
+    uint srcOffsetInBytes,
+    uint dstOffsetInBytes)
+{
+    const uint id = get_global_id(0); // one byte per work-item; index math stays in 32-bit uint
+    *( pDst + ( id + dstOffsetInBytes ) ) = *( pSrc + ( id + srcOffsetInBytes ) );
+}
+
+)==="
--- a/runtime/built_ins/kernels/copy_buffer_to_image3d.igdrcl_built_in
+++ b/runtime/built_ins/kernels/copy_buffer_to_image3d.igdrcl_built_in
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+R"===(
+#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
+
+__kernel void CopyBufferToImage3dBytes(__global uchar *src,
+                                       __write_only image3d_t output,
+                                       int srcOffset,
+                                       int4 dstOffset,
+                                       uint2 Pitch) {
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    const int z = get_global_id(2);
+    // One byte per work-item: texel (x, y, z) + dstOffset receives src[srcOffset + y * Pitch.x + z * Pitch.y + x].
+    int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
+    uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
+    // The colour is a plain value, so the vector literal carries no address space qualifier.
+    write_imageui(output, dstCoord, (uint4)(*(src + LOffset + x), 0, 0, 1));
+}
+
+__kernel void CopyBufferToImage3d2Bytes(__global uchar *src,
+                                        __write_only image3d_t output,
+                                        int srcOffset,
+                                        int4 dstOffset,
+                                        uint2 Pitch) {
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    const int z = get_global_id(2);
+
+    const int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
+    const uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
+    __global uchar *texel = src + LOffset + x * 2; // first byte of this work-item's 2-byte element
+
+    // Only the x channel carries data; y and z are 0 and w is 1.
+    uint4 c = (uint4)(0, 0, 0, 1);
+
+    // The alignment test looks at src + srcOffset (start of the region), not at texel itself.
+    if ((( ulong )(src + srcOffset) & 0x00000001) == 0) {
+        c.x = (uint)(*(__global ushort *)texel);
+    } else {
+        // Odd start address: build the value from two byte loads, low byte at the lower address.
+        c.x = (uint)(ushort)((texel[1] << 8) | texel[0]);
+    }
+    write_imageui(output, dstCoord, c);
+}
+
+__kernel void CopyBufferToImage3d4Bytes(__global uchar *src,
+                                        __write_only image3d_t output,
+                                        int srcOffset,
+                                        int4 dstOffset,
+                                        uint2 Pitch) {
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    const int z = get_global_id(2);
+
+    const int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
+    const uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
+    __global uchar *texel = src + LOffset + x * 4; // first byte of this work-item's 4-byte element
+
+    // Only the x channel carries data; y and z are 0 and w is 1.
+    uint4 c = (uint4)(0, 0, 0, 1);
+    // The alignment test looks at src + srcOffset (start of the region), not at texel itself.
+    if ((( ulong )(src + srcOffset) & 0x00000003) == 0) {
+        c.x = *(__global uint *)texel;
+    } else {
+        // Unaligned start: build the value from four byte loads, lowest byte at the lowest address.
+        c.x = ((uint)texel[3] << 24) |
+              ((uint)texel[2] << 16) |
+              ((uint)texel[1] << 8) |
+              (uint)texel[0];
+    }
+    write_imageui(output, dstCoord, c);
+}
+
+__kernel void CopyBufferToImage3d8Bytes(__global uchar *src,
+                                        __write_only image3d_t output,
+                                        int srcOffset,
+                                        int4 dstOffset,
+                                        uint2 Pitch) {
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    const int z = get_global_id(2);
+
+    const int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
+    const uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
+    __global uchar *texel = src + LOffset + x * 8; // first byte of this work-item's 8-byte element
+
+    // The two 32-bit halves of the element end up in the x and y channels.
+    uint2 c = (uint2)(0, 0);
+
+    // The alignment test looks at src + srcOffset (start of the region), not at texel itself.
+    if ((( ulong )(src + srcOffset) & 0x00000007) == 0) {
+        c = *((__global uint2 *)texel);
+    } else {
+        // Unaligned start: build each half from four byte loads, lowest byte at the lowest address.
+        c.x = ((uint)texel[3] << 24) |
+              ((uint)texel[2] << 16) |
+              ((uint)texel[1] << 8) |
+              (uint)texel[0];
+        c.y = ((uint)texel[7] << 24) |
+              ((uint)texel[6] << 16) |
+              ((uint)texel[5] << 8) |
+              (uint)texel[4];
+    }
+
+    // The colour is a plain value, so the vector literal carries no address space qualifier.
+    write_imageui(output, dstCoord, (uint4)(c.x, c.y, 0, 1));
+}
+
+__kernel void CopyBufferToImage3d16Bytes(__global uchar *src,
+                                         __write_only image3d_t output,
+                                         int srcOffset,
+                                         int4 dstOffset,
+                                         uint2 Pitch) {
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    const int z = get_global_id(2);
+
+    const int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
+    const uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);
+    __global uchar *texel = src + LOffset + x * 16; // first byte of this work-item's 16-byte element
+
+    // All four channels are filled from the buffer: 4 bytes per channel.
+    uint4 c = (uint4)(0, 0, 0, 0);
+
+    // The alignment test looks at src + srcOffset (start of the region), not at texel itself.
+    if ((( ulong )(src + srcOffset) & 0x0000000f) == 0) {
+        c = *((__global uint4 *)texel);
+    } else {
+        // Byte-wise path: each channel is rebuilt from four single-byte loads,
+        // with the least significant byte at the lowest address.
+        // bytes 0..3 -> c.x
+        c.x = ((uint)texel[3] << 24) |
+              ((uint)texel[2] << 16) |
+              ((uint)texel[1] << 8) |
+              (uint)texel[0];
+        // bytes 4..7 -> c.y
+        c.y = ((uint)texel[7] << 24) |
+              ((uint)texel[6] << 16) |
+              ((uint)texel[5] << 8) |
+              (uint)texel[4];
+        // bytes 8..11 -> c.z
+        c.z = ((uint)texel[11] << 24) |
+              ((uint)texel[10] << 16) |
+              ((uint)texel[9] << 8) |
+              (uint)texel[8];
+        // bytes 12..15 -> c.w
+        c.w = ((uint)texel[15] << 24) |
+              ((uint)texel[14] << 16) |
+              ((uint)texel[13] << 8) |
+              (uint)texel[12];
+    }
+
+    write_imageui(output, dstCoord, c);
+}
+)==="
--- a/runtime/built_ins/kernels/copy_image3d_to_buffer.igdrcl_built_in
+++ b/runtime/built_ins/kernels/copy_image3d_to_buffer.igdrcl_built_in
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+R"===(
+__kernel void CopyImage3dToBufferBytes(__read_only image3d_t input,
+                                       __global uchar *dst,
+                                       int4 srcOffset,
+                                       int dstOffset,
+                                       uint2 Pitch) {
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    const int z = get_global_id(2);
+
+    // One byte per work-item: the x channel of texel (x, y, z) + srcOffset, saturated to uchar.
+    const uint4 texel = read_imageui(input, (int4)(x, y, z, 0) + srcOffset);
+    const uint linear = dstOffset + (y * Pitch.x) + (z * Pitch.y);
+    __global uchar *out = dst + linear + x;
+    *out = convert_uchar_sat(texel.x);
+}
+
+__kernel void CopyImage3dToBuffer2Bytes(__read_only image3d_t input,
+                                        __global uchar *dst,
+                                        int4 srcOffset,
+                                        int dstOffset,
+                                        uint2 Pitch) {
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    const int z = get_global_id(2);
+
+    const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
+    const uint linear = dstOffset + (y * Pitch.x) + (z * Pitch.y);
+    __global uchar *out = dst + linear + x * 2; // first byte of this work-item's 2-byte slot
+
+    const uint4 c = read_imageui(input, srcCoord);
+    // The alignment test looks at dst + dstOffset (start of the region), not at out itself.
+    if ((( ulong )(dst + dstOffset) & 0x00000001) == 0) {
+        *((__global ushort *)out) = convert_ushort_sat(c.x);
+    } else {
+        out[1] = convert_uchar_sat((c.x >> 8 ) & 0xff);
+        out[0] = convert_uchar_sat(c.x & 0xff);
+    }
+}
+
+__kernel void CopyImage3dToBuffer4Bytes(__read_only image3d_t input,
+                                        __global uchar *dst,
+                                        int4 srcOffset,
+                                        int dstOffset,
+                                        uint2 Pitch) {
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    const int z = get_global_id(2);
+
+    const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
+    const uint linear = dstOffset + (y * Pitch.x) + (z * Pitch.y);
+    __global uchar *out = dst + linear + x * 4; // first byte of this work-item's 4-byte slot
+
+    const uint4 c = read_imageui(input, srcCoord);
+    // The alignment test looks at dst + dstOffset (start of the region), not at out itself.
+    if ((( ulong )(dst + dstOffset) & 0x00000003) == 0) {
+        *((__global uint *)out) = c.x;
+    } else {
+        out[0] = convert_uchar_sat(c.x & 0xff);
+        out[1] = convert_uchar_sat((c.x >> 8 ) & 0xff);
+        out[2] = convert_uchar_sat((c.x >> 16 ) & 0xff);
+        out[3] = convert_uchar_sat((c.x >> 24 ) & 0xff);
+    }
+}
+
+__kernel void CopyImage3dToBuffer8Bytes(__read_only image3d_t input,
+                                        __global uchar *dst,
+                                        int4 srcOffset,
+                                        int dstOffset,
+                                        uint2 Pitch) {
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    const int z = get_global_id(2);
+
+    const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
+    const uint linear = dstOffset + (y * Pitch.x) + (z * Pitch.y);
+    __global uchar *out = dst + linear + x * 8; // first byte of this work-item's 8-byte slot
+
+    const uint4 c = read_imageui(input, srcCoord);
+
+    // The alignment test looks at dst + dstOffset (start of the region), not at out itself.
+    if ((( ulong )(dst + dstOffset) & 0x00000007) == 0) {
+        *((__global uint2 *)out) = (uint2)(c.x, c.y);
+    } else {
+        out[0] = convert_uchar_sat(c.x & 0xff);
+        out[1] = convert_uchar_sat((c.x >> 8 ) & 0xff);
+        out[2] = convert_uchar_sat((c.x >> 16 ) & 0xff);
+        out[3] = convert_uchar_sat((c.x >> 24 ) & 0xff);
+        out[4] = convert_uchar_sat(c.y & 0xff);
+        out[5] = convert_uchar_sat((c.y >> 8 ) & 0xff);
+        out[6] = convert_uchar_sat((c.y >> 16 ) & 0xff);
+        out[7] = convert_uchar_sat((c.y >> 24 ) & 0xff);
+    }
+}
+
+__kernel void CopyImage3dToBuffer16Bytes(__read_only image3d_t input,
+                                         __global uchar *dst,
+                                         int4 srcOffset,
+                                         int dstOffset,
+                                         uint2 Pitch) {
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    const int z = get_global_id(2);
+
+    const int4 srcCoord = (int4)(x, y, z, 0) + srcOffset;
+    const uint linear = dstOffset + (y * Pitch.x) + (z * Pitch.y);
+    __global uchar *out = dst + linear + x * 16; // first byte of this work-item's 16-byte slot
+
+    const uint4 c = read_imageui(input, srcCoord);
+    // The alignment test looks at dst + dstOffset (start of the region), not at out itself.
+    if ((( ulong )(dst + dstOffset) & 0x0000000f) == 0) {
+        *(__global uint4 *)out = c;
+    } else {
+        out[0] = convert_uchar_sat(c.x & 0xff);
+        out[1] = convert_uchar_sat((c.x >> 8 ) & 0xff);
+        out[2] = convert_uchar_sat((c.x >> 16 ) & 0xff);
+        out[3] = convert_uchar_sat((c.x >> 24 ) & 0xff);
+        out[4] = convert_uchar_sat(c.y & 0xff);
+        out[5] = convert_uchar_sat((c.y >> 8 ) & 0xff);
+        out[6] = convert_uchar_sat((c.y >> 16 ) & 0xff);
+        out[7] = convert_uchar_sat((c.y >> 24 ) & 0xff);
+        out[8] = convert_uchar_sat(c.z & 0xff);
+        out[9] = convert_uchar_sat((c.z >> 8 ) & 0xff);
+        out[10] = convert_uchar_sat((c.z >> 16 ) & 0xff);
+        out[11] = convert_uchar_sat((c.z >> 24 ) & 0xff);
+        out[12] = convert_uchar_sat(c.w & 0xff);
+        out[13] = convert_uchar_sat((c.w >> 8 ) & 0xff);
+        out[14] = convert_uchar_sat((c.w >> 16 ) & 0xff);
+        out[15] = convert_uchar_sat((c.w >> 24 ) & 0xff);
+    }
+}
+)==="
--- a/runtime/built_ins/kernels/copy_image_to_image1d.igdrcl_built_in
+++ b/runtime/built_ins/kernels/copy_image_to_image1d.igdrcl_built_in
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+R"===(
+__kernel void CopyImageToImage1d(
+    __read_only image1d_t input,
+    __write_only image1d_t output,
+    int4 srcOffset,
+    int4 dstOffset) {
+    // One texel per work-item, moved as unsigned integer data.
+    // Only the x components of the two offsets are used.
+    const int gid = get_global_id(0);
+
+    const uint4 texel = read_imageui(input, gid + srcOffset.x);
+    write_imageui(output, gid + dstOffset.x, texel);
+}
+)==="
--- a/runtime/built_ins/kernels/copy_image_to_image2d.igdrcl_built_in
+++ b/runtime/built_ins/kernels/copy_image_to_image2d.igdrcl_built_in
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+R"===(
+__kernel void CopyImageToImage2d(
+    __read_only image2d_t input,
+    __write_only image2d_t output,
+    int4 srcOffset,
+    int4 dstOffset) {
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    // One texel per work-item; only the x and y components of the offsets are used.
+    const int2 from = (int2)(x + srcOffset.x, y + srcOffset.y);
+    const int2 to = (int2)(x + dstOffset.x, y + dstOffset.y);
+    write_imageui(output, to, read_imageui(input, from));
+}
+)==="
--- a/runtime/built_ins/kernels/copy_image_to_image3d.igdrcl_built_in
+++ b/runtime/built_ins/kernels/copy_image_to_image3d.igdrcl_built_in
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+R"===(
+#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
+
+__kernel void CopyImageToImage3d(
+    __read_only image3d_t input,
+    __write_only image3d_t output,
+    int4 srcOffset,
+    int4 dstOffset) {
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    const int z = get_global_id(2);
+
+    // One texel per work-item: the work-item id is shifted by each offset to get the two coordinates.
+    const int4 gid = (int4)(x, y, z, 0);
+    const uint4 texel = read_imageui(input, gid + srcOffset);
+    write_imageui(output, gid + dstOffset, texel);
+}
+)==="
--- a/runtime/built_ins/kernels/fill_buffer.igdrcl_built_in
+++ b/runtime/built_ins/kernels/fill_buffer.igdrcl_built_in
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+R"===(
+// Writes one pattern byte per work-item. The pattern byte is picked by local
+// id, so this assumes local work size == pattern size.
+__kernel void FillBufferBytes(
+    __global uchar* pDst,
+    uint dstOffsetInBytes,
+    const __global uchar* pPattern )
+{
+    const uint patternByte = get_local_id(0);
+    const uint target = get_global_id(0) + dstOffsetInBytes;
+    pDst[target] = pPattern[patternByte];
+}
+
+// Byte-wise fill: each work-item stores one pattern byte at gid + offset.
+// NOTE(review): the mask wraps correctly only for power-of-two sizes - confirm.
+__kernel void FillBufferLeftLeftover(
+    __global uchar* pDst, uint dstOffsetInBytes,
+    const __global uchar* pPattern, const uint patternSizeInEls ) {
+    const uint item = get_global_id(0);
+    const uint wrapMask = patternSizeInEls - 1;
+    pDst[item + dstOffsetInBytes] = pPattern[item & wrapMask];
+}
+
+// Dword-wise fill: the destination is viewed as uints starting at byte offset
+// dstOffsetInBytes, and each work-item stores one pattern element.
+__kernel void FillBufferMiddle(
+    __global uchar* pDst, uint dstOffsetInBytes,
+    const __global uint* pPattern, const uint patternSizeInEls ) {
+    const uint item = get_global_id(0);
+    __global uint* dwords = (__global uint*)(pDst + dstOffsetInBytes);
+    dwords[item] = pPattern[item & (patternSizeInEls - 1)];
+}
+
+// Byte-wise fill: work-item gid stores pPattern[gid masked by size - 1] at
+// pDst[gid + dstOffsetInBytes].
+__kernel void FillBufferRightLeftover(
+    __global uchar* pDst, uint dstOffsetInBytes,
+    const __global uchar* pPattern, const uint patternSizeInEls ) {
+    const uint item = get_global_id(0);
+    const uint patternIdx = item & (patternSizeInEls - 1);
+    pDst[item + dstOffsetInBytes] = pPattern[patternIdx];
+}
+)==="
--- a/runtime/built_ins/kernels/fill_image1d.igdrcl_built_in
+++ b/runtime/built_ins/kernels/fill_image1d.igdrcl_built_in
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+R"===(
+// Writes `color` to one texel per work-item; only dstOffset.x is used for
+// the 1D coordinate.
+__kernel void FillImage1d(
+    __write_only image1d_t output,
+    uint4 color,
+    int4 dstOffset) {
+    const int column = get_global_id(0);
+    write_imageui(output, dstOffset.x + column, color);
+}
+)==="
--- a/runtime/built_ins/kernels/fill_image2d.igdrcl_built_in
+++ b/runtime/built_ins/kernels/fill_image2d.igdrcl_built_in
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+R"===(
+// Writes `color` to one texel per work-item at (global id + dstOffset.xy).
+__kernel void FillImage2d(
+    __write_only image2d_t output,
+    uint4 color,
+    int4 dstOffset) {
+    const int col = get_global_id(0);
+    const int row = get_global_id(1);
+    const int2 texel = (int2)(col + dstOffset.x, row + dstOffset.y);
+    write_imageui(output, texel, color);
+}
+)==="
--- a/runtime/built_ins/kernels/fill_image3d.igdrcl_built_in
+++ b/runtime/built_ins/kernels/fill_image3d.igdrcl_built_in
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+R"===(
+#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
+
+// Writes `color` to one texel per work-item at (global id + dstOffset).
+__kernel void FillImage3d(
+    __write_only image3d_t output,
+    uint4 color,
+    int4 dstOffset) {
+    const int col = get_global_id(0);
+    const int row = get_global_id(1);
+    const int slice = get_global_id(2);
+    const int4 texel = dstOffset + (int4)(col, row, slice, 0);
+    write_imageui(output, texel, color);
+}
+)==="
--- a/runtime/built_ins/kernels/vebox_ve_dn_di_enhance_intel.igdrcl_built_in
+++ b/runtime/built_ins/kernels/vebox_ve_dn_di_enhance_intel.igdrcl_built_in
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+R"===(
+// Kernel with an empty body; only its name and argument list are defined here.
+__kernel void ve_dn_di_enhance_intel(
+    sampler_t accelerator, int flags,
+    __read_only image2d_t current_input,
+    __read_only image2d_t ref_input,
+    __write_only image2d_t current_output,
+    __write_only image2d_t ref_output,
+    __write_only image2d_t dndi_output) {}
+)==="
--- a/runtime/built_ins/kernels/vebox_ve_dn_enhance_intel.igdrcl_built_in
+++ b/runtime/built_ins/kernels/vebox_ve_dn_enhance_intel.igdrcl_built_in
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+R"===(
+// Kernel with an empty body; only its name and argument list are defined here.
+__kernel void ve_dn_enhance_intel(
+    sampler_t accelerator, int flags,
+    __read_only image2d_t ref_input,
+    __read_only image2d_t current_input,
+    __write_only image2d_t current_output) {}
+)==="
--- a/runtime/built_ins/kernels/vebox_ve_enhance_intel.igdrcl_built_in
+++ b/runtime/built_ins/kernels/vebox_ve_enhance_intel.igdrcl_built_in
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+R"===(
+// Kernel with an empty body; only its name and argument list are defined here.
+__kernel void ve_enhance_intel(
+    sampler_t accelerator, int flags,
+    __read_only image2d_t current_input,
+    __write_only image2d_t current_output) {}
+)==="
--- a/runtime/built_ins/kernels/vme_block_advanced_motion_estimate_bidirectional_check_intel.igdrcl_built_in
+++ b/runtime/built_ins/kernels/vme_block_advanced_motion_estimate_bidirectional_check_intel.igdrcl_built_in
@@ -0,0 +1,458 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+R"===(
+__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void // per-MB motion search, skip (bidirectional) check and intra search
+block_advanced_motion_estimate_bidirectional_check_intel(
+    sampler_t accelerator, __read_only image2d_t srcImg,
+    __read_only image2d_t refImg, __read_only image2d_t src_check_image,
+    __read_only image2d_t ref0_check_image,
+    __read_only image2d_t ref1_check_image, uint flags,
+    uint search_cost_penalty, uint search_cost_precision, short2 count_global,
+    uchar bidir_weight, __global short2 *count_motion_vector_buffer,
+    __global short2 *prediction_motion_vector_buffer,
+    __global char *skip_input_mode_buffer,
+    __global short2 *skip_motion_vector_buffer,
+    __global short2 *search_motion_vector_buffer,
+    __global char *intra_search_predictor_modes,
+    __global ushort *search_residuals, __global ushort *skip_residuals,
+    __global ushort *intra_residuals, __read_only image2d_t intraSrcImg,
+    int height, int width, int stride) { // height/width are counted in MBs; each WG handles up to stride MBs
+  __local uint dstSearch[64];         // 8 GRFs: search MVs and distortions
+  __local uint dstSkipIntra[32 + 24]; // 7 GRFs (4 for inter, 3 for intra)
+
+  // Search distortions start at dword 8 * 5 of dstSearch (the 6th GRF).
+  __local ushort *distSearch = (__local ushort *)&dstSearch[8 * 5];
+
+  // Initialize the MV cost table from search_cost_penalty (1/2/3, else none):
+  // MV Cost in U4U4 format:
+  // No cost    : 0,  0,  0,  0,  0,  0,  0,  0
+  // Low Cost   : 1,  4,  5,  9,  10, 12, 14, 15
+  // Normal Cost: 5,  26, 29, 43, 45, 47, 57, 57
+  // High Cost  : 29, 61, 72, 78, 88, 89, 91, 92
+
+  uint2 MVCostTable;
+  if (search_cost_penalty == 1) {
+    MVCostTable.s0 = 0x09050401;
+    MVCostTable.s1 = 0x0F0E0C0A;
+  } else if (search_cost_penalty == 2) {
+    MVCostTable.s0 = 0x2B1D1A05;
+    MVCostTable.s1 = 0x39392F2D;
+  } else if (search_cost_penalty == 3) {
+    MVCostTable.s0 = 0x4E483D1D;
+    MVCostTable.s1 = 0x5C5B5958;
+  } else {
+    MVCostTable.s0 = 0;
+    MVCostTable.s1 = 0;
+  }
+
+  uint MVCostPrecision = ((uint)search_cost_precision) << 16;
+
+  // Frame is divided into rows * columns of MBs.
+  // One h/w thread per WG.
+  // One WG processes "row" MBs - one row per iteration and one MB per row.
+  // Number of WGs (or h/w threads) is number of columns MBs. Each iteration
+  // processes the MB in a row - gid_0 is the MB id in a row and gid_1 is the
+  // row offset.
+
+  int sid_0 = stride * get_group_id(0); // first linear MB id handled by this WG
+  int gid_0 = sid_0 / height;
+  int gid_1 = sid_0 % height;
+  for (int sid = sid_0; sid < sid_0 + stride && gid_0 < width && gid_1 < height;
+       sid++, gid_0 = sid / height, gid_1 = sid % height) {
+    int2 srcCoord;
+
+    srcCoord.x = gid_0 * 16 +
+                 get_global_offset(0); // 16 pixels wide MBs (globally scalar)
+    srcCoord.y = gid_1 * 16 +
+                 get_global_offset(1);  // 16 pixels tall MBs (globally scalar)
+    uint curMB = gid_0 + gid_1 * width; // current MB id
+    short2 count; // .x = search predictor count, .y = skip check count
+
+    // If either the search or skip vector count is per-MB (-1), read this
+    // MB's counts from the count motion vector buffer.
+    // NOTE(review): a count_global value below -1 can leave count unset - confirm.
+
+    if ((count_global.s0 == -1) | (count_global.s1 == -1)) {
+      count = count_motion_vector_buffer[curMB];
+    }
+
+    // If either the search or skip vector counts are per-frame (>= 0), those
+    // values override whatever was read above.
+
+    if (count_global.s0 >= 0) {
+      count.s0 = count_global.s0;
+    }
+
+    if (count_global.s1 >= 0) {
+      count.s1 = count_global.s1;
+    }
+
+    int countPredMVs = count.x;
+    if (countPredMVs != 0) {
+      uint offset = curMB * 4;       // 4 predictors per MB
+      offset += get_local_id(0) % 4; // 16 work-items access 4 MVs for MB
+      // one predictor for MB per SIMD channel
+
+      // Reduce predictors from Q-pixel to integer precision.
+      int2 predMV = 0;
+
+      if (get_local_id(0) < countPredMVs) {
+        // one MV per work-item
+        predMV = convert_int2(prediction_motion_vector_buffer[offset]);
+        // Predictors are input in QP resolution. Convert that to integer
+        // resolution.
+        predMV.x /= 4;
+        predMV.y /= 4;
+        predMV.y &= 0xFFFFFFFE; // clear bit 0 (forces an even y)
+      }
+
+      // Do up to 4 IMEs, get the best MVs and their distortions, and
+      // optionally a FBR of the best MVs. Finally the results are written out
+      // to SLM (dstSearch).
+
+      intel_work_group_vme_mb_multi_query_4(
+          dstSearch,    // best search MV and its distortions into SLM
+          countPredMVs, // count of predictor MVs (globally scalar - value range
+                        // 1 to 4)
+          MVCostPrecision, // MV cost precision
+          MVCostTable,     // MV cost table
+          srcCoord,        // MB 2-D offset (globally scalar)
+          predMV, // predictor MVs (up to 4 distinct MVs for SIMD16 thread)
+          srcImg, // source
+          refImg, // reference
+          accelerator); // vme object
+    }
+
+    int doIntra = ((flags & 0x2) != 0); // flags bit 1: run the intra search
+    int intraEdges = 0;
+    if (doIntra) {
+      // Enable all edges by default.
+      intraEdges = 0x3C;
+      // If this is a left-edge MB, then disable left edges.
+      if ((gid_0 == 0) & (get_global_offset(0) == 0)) {
+        intraEdges &= 0x18;
+      }
+
+      // If this is a right edge MB then disable right edges.
+      if (gid_0 == width - 1) {
+        intraEdges &= 0x34;
+      }
+
+      // If this is a top-edge MB, then disable top edges.
+      if ((gid_1 == 0) & (get_global_offset(1) == 0)) {
+        intraEdges &= 0x20;
+      }
+
+      // Set bit6=bit5.
+      intraEdges |= ((intraEdges & 0x20) << 1);
+
+      intraEdges <<= 8;
+    }
+
+    int skip_block_type_8x8 = flags & 0x4; // flags bit 2: non-zero selects 8x8 skip checks
+
+    int countSkipMVs = count.y;
+    if (countSkipMVs != 0 || doIntra == true) {
+      // one set of skip MV per SIMD channel
+
+      // Do up to 4 skip checks and get the distortions for each of them.
+      // Finally the results are written out to SLM.
+
+      if ((skip_block_type_8x8 == 0) | ((doIntra) & (countSkipMVs == 0))) {
+        // 16x16 (also taken for intra-only work, i.e. no skip MVs):
+
+        uint offset = curMB * 4 * 2; // 4 sets of skip check MVs per MB
+        int skipMV = 0;
+        if (get_local_id(0) < countSkipMVs * 2) // need 2 values per MV
+        {
+          offset +=
+              (get_local_id(0)); // 16 work-items access 4 sets of MVs for MB
+          __global int *skip1_motion_vector_buffer =
+              (__global int *)skip_motion_vector_buffer;
+          skipMV = skip1_motion_vector_buffer[offset]; // one MV per work-item
+        }
+
+        uchar skipMode = 0;
+        if (get_local_id(0) < countSkipMVs) {
+          skipMode = skip_input_mode_buffer[curMB];
+
+          if (skipMode == 0) {
+            skipMode = 1;
+          }
+          if (skipMode > 3) {
+            skipMode = 3;
+          }
+        }
+
+        intel_work_group_vme_mb_multi_bidir_check_16x16(
+            dstSkipIntra, // distortions into SLM
+            countSkipMVs, // count of skip check MVs (globally scalar - value
+                          // range 1 to 4)
+            doIntra,      // compute intra modes
+            intraEdges,   // intra edges to use
+            srcCoord,     // MB 2-D offset (globally scalar)
+            bidir_weight, // bidirectional weight
+            skipMode,     // skip modes
+            skipMV, // skip check MVs (up to 4 distinct sets of skip check MVs
+                    // for SIMD16 thread)
+            src_check_image,  // source
+            ref0_check_image, // reference fwd
+            ref1_check_image, // reference bwd
+            intraSrcImg,      // intra source
+            accelerator);     // vme object
+      } else {
+        // 8x8:
+
+        uint offset =
+            curMB * 4 *
+            8; // 4 sets of skip check MVs, 16 shorts (8 ints) each per MB
+        int2 skipMVs = 0;
+        if (get_local_id(0) < countSkipMVs * 8) // need 8 values per MV
+        {
+          offset +=
+              (get_local_id(0)); // 16 work-items access 4 sets of MVs for MB
+          __global int *skip1_motion_vector_buffer =
+              (__global int *)(skip_motion_vector_buffer);
+          skipMVs.x = skip1_motion_vector_buffer[offset]; // four component MVs
+                                                          // per work-item
+          skipMVs.y = skip1_motion_vector_buffer[offset + 16];
+        }
+
+        uchar skipModes = 0;
+        if (get_local_id(0) < countSkipMVs) {
+          skipModes = skip_input_mode_buffer[curMB];
+        }
+
+        intel_work_group_vme_mb_multi_bidir_check_8x8(
+            dstSkipIntra, // distortions into SLM
+            countSkipMVs, // count of skip check MVs per MB (globally scalar -
+                          // value range 1 to 4)
+            doIntra,      // compute intra modes
+            intraEdges,   // intra edges to use
+            srcCoord,     // MB 2-D offset (globally scalar)
+            bidir_weight, // bidirectional weight
+            skipModes,    // skip modes
+            skipMVs, // skip check MVs (up to 4 distinct sets of skip check MVs
+                     // for SIMD16 thread)
+            src_check_image,  // source
+            ref0_check_image, // reference fwd
+            ref1_check_image, // reference bwd
+            intraSrcImg,      // intra source
+            accelerator);     // vme object
+      }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier before the SLM results are read back below
+
+    // Write Out motion estimation result:
+    // Result format
+    //     Hierarchical row-major layout
+    //     i.e. row-major of blocks MVs in MBs, and row-major of 4 sets of
+    //     MVs/distortion in blocks
+    if (countPredMVs != 0) {
+      // 4x4 (accelerator MB block type 0x2)
+      if (intel_get_accelerator_mb_block_type(accelerator) == 0x2) {
+        int index = (gid_0 * 16 + get_local_id(0)) + (gid_1 * 16 * width);
+
+        // 1. 16 work-items enabled.
+        // 2. Work-items gather fwd MVs in strided dword locations 0, 2, ..,
+        //    30 (interleaved fwd/bwd MVs) with constant offset 8 (control
+        //    data size) from SLM into contiguous short2 locations 0, 1, ..,
+        //    15 of global buffer search_motion_vector_buffer with offset
+        //    index.
+        // 3. Work-items gather contiguous ushort locations 0, 1, .., 15 from
+        //    distSearch into contiguous ushort locations 0, 1, .., 15 of
+        //    search_residuals with offset index.
+        //    The search_residuals store is guarded by a NULL test unless
+        //    HW_NULL_CHECK is defined, in which case the test is compiled
+        //    out.
+
+        short2 val = as_short2(dstSearch[8 + get_local_id(0) * 2]);
+        search_motion_vector_buffer[index] = val;
+
+#ifndef HW_NULL_CHECK
+        if (search_residuals != NULL)
+#endif
+        {
+          search_residuals[index] = distSearch[get_local_id(0)];
+        }
+      }
+
+      // 8x8 (accelerator MB block type 0x1)
+      else if (intel_get_accelerator_mb_block_type(accelerator) == 0x1) {
+        // Only the first 4 work-items are needed.
+        if (get_local_id(0) < 4) {
+          int index = (gid_0 * 4 + get_local_id(0)) + (gid_1 * 4 * width);
+
+          // 1. 4 work-items enabled.
+          // 2. Work-items gather fwd MVs in strided dword locations 0, 8, 16,
+          //    24 (interleaved fwd/bwd MVs) with constant offset 8 from SLM
+          //    into contiguous short2 locations 0, 1, 2, 3 of global buffer
+          //    search_motion_vector_buffer with offset index.
+          // 3. Work-items gather strided ushort locations 0, 4, 8, 12 from
+          //    distSearch into contiguous ushort locations 0, 1, 2, 3 of
+          //    search_residuals with offset index.
+          //
+          //    As in the 4x4 case, the search_residuals store is guarded by
+          //    a NULL test unless HW_NULL_CHECK is defined.
+
+          short2 val = as_short2(dstSearch[8 + get_local_id(0) * 4 * 2]);
+          search_motion_vector_buffer[index] = val;
+
+#ifndef HW_NULL_CHECK
+          if (search_residuals != NULL)
+#endif
+          {
+            search_residuals[index] = distSearch[get_local_id(0) * 4];
+          }
+        }
+      }
+
+      // 16x16 (accelerator MB block type 0x0)
+      else if (intel_get_accelerator_mb_block_type(accelerator) == 0x0) {
+        // Only the first work-item is needed.
+        if (get_local_id(0) == 0) {
+          int index = gid_0 + gid_1 * width;
+
+          // 1. 1 work-item enabled.
+          // 2. Work-item gathers fwd MV in dword location 0 with constant
+          //    offset 8 from
+          //    SLM into short2 locations 0 of global buffer
+          //    search_motion_vector_buffer.
+          // 3. Work-item gathers ushort location 0 from distSearch into ushort
+          //    location 0 of search_residuals with offset index.
+
+          short2 val = as_short2(dstSearch[8]);
+          search_motion_vector_buffer[index] = val;
+
+#ifndef HW_NULL_CHECK
+          if (search_residuals != NULL)
+#endif
+          {
+            search_residuals[index] = distSearch[0];
+          }
+        }
+      }
+    }
+
+    // Write out motion skip check result:
+    // Result format
+    //     Hierarchical row-major layout
+    //     i.e. row-major of blocks in MBs, and row-major of 8 sets of
+    //     distortions in blocks
+    if (countSkipMVs != 0) {
+      if (skip_block_type_8x8 == false) {
+        // Copy out 4 (1 component) sets of distortion values.
+
+        int index = (gid_0 * 4) + (get_local_id(0)) + (gid_1 * 4 * width);
+
+        if (get_local_id(0) < countSkipMVs) {
+          // 1. Up to 4 work-items are enabled (one per skip check MV).
+          // 2. Work-item i reads distSkip location 16*i and stores it to
+          //    skip_residuals location index (which already includes i), so
+          //    the outputs are contiguous.
+          __local ushort *distSkip = (__local ushort *)&dstSkipIntra[0];
+          skip_residuals[index] = distSkip[get_local_id(0) * 16];
+        }
+      } else {
+        // Copy out 4 (4 component) sets of distortion values.
+        int index =
+            (gid_0 * 4 * 4) + (get_local_id(0)) + (gid_1 * 4 * 4 * width);
+
+        if (get_local_id(0) < countSkipMVs * 4) {
+          // 1. Up to 16 work-items are enabled.
+          // 2. The work-item gathers distSkip locations 0, 4*1, .., 4*15 and
+          //    copies them to contiguous skip_residual locations 0, 1, 2, ..,
+          //    15.
+
+          __local ushort *distSkip = (__local ushort *)&dstSkipIntra[0];
+          skip_residuals[index] = distSkip[get_local_id(0) * 4];
+        }
+      }
+    }
+
+    // Write out intra search result (22 mode bytes per MB):
+    if (doIntra) {
+      // Write out the 4x4 intra modes (two 4-bit modes per SLM byte -> slots 5..20)
+      if (get_local_id(0) < 8) {
+        __local char *dstIntra_4x4 =
+            (__local char *)(&dstSkipIntra[32 + 16 + 4]);
+        char value = dstIntra_4x4[get_local_id(0)];
+        char value_low = (value)&0xf;
+        char value_high = (value >> 4) & 0xf;
+
+        int index_low =
+            (gid_0 * 22) + (get_local_id(0) * 2) + (gid_1 * 22 * width);
+
+        int index_high =
+            (gid_0 * 22) + (get_local_id(0) * 2) + 1 + (gid_1 * 22 * width);
+
+        intra_search_predictor_modes[index_low + 5] = value_low;
+        intra_search_predictor_modes[index_high + 5] = value_high;
+      }
+
+      // Write out the 8x8 intra modes (low nibble of every other SLM byte -> slots 1..4)
+      if (get_local_id(0) < 4) {
+        __local char *dstIntra_8x8 =
+            (__local char *)(&dstSkipIntra[32 + 8 + 4]);
+        char value = dstIntra_8x8[get_local_id(0) * 2];
+        char value_low = (value)&0xf;
+        int index = (gid_0 * 22) + (get_local_id(0)) + (gid_1 * 22 * width);
+        intra_search_predictor_modes[index + 1] = value_low;
+      }
+
+      // Write out the 16x16 intra mode (slot 0)
+      if (get_local_id(0) < 1) {
+        __local char *dstIntra_16x16 =
+            (__local char *)(&dstSkipIntra[32 + 0 + 4]);
+        char value = dstIntra_16x16[0];
+        char value_low = (value)&0xf;
+        int index = (gid_0 * 22) + (gid_1 * 22 * width);
+        intra_search_predictor_modes[index] = value_low;
+      }
+
+// Get the intra residuals (stride of 4 ushorts per MB: 0 = 16x16, 1 = 8x8, 2 = 4x4).
+#ifndef HW_NULL_CHECK
+      if (intra_residuals != NULL)
+#endif
+      {
+        int index = (gid_0 * 4) + (gid_1 * 4 * width);
+
+        if (get_local_id(0) < 1) {
+          __local ushort *distIntra_4x4 =
+              (__local ushort *)(&dstSkipIntra[32 + 16 + 3]);
+          __local ushort *distIntra_8x8 =
+              (__local ushort *)(&dstSkipIntra[32 + 8 + 3]);
+          __local ushort *distIntra_16x16 =
+              (__local ushort *)(&dstSkipIntra[32 + 0 + 3]);
+
+          intra_residuals[index + 2] = distIntra_4x4[0];
+          intra_residuals[index + 1] = distIntra_8x8[0];
+          intra_residuals[index + 0] = distIntra_16x16[0];
+        }
+      }
+    }
+  }
+}
+
+)==="
--- a/runtime/built_ins/kernels/vme_block_advanced_motion_estimate_bidirectional_check_intel_frontend.igdrcl_built_in
+++ b/runtime/built_ins/kernels/vme_block_advanced_motion_estimate_bidirectional_check_intel_frontend.igdrcl_built_in
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+R"===(
+// Empty-bodied kernel: defines only the name, work-group size and arguments.
+__kernel __attribute__((reqd_work_group_size(16, 1, 1)))
+void block_advanced_motion_estimate_bidirectional_check_intel(
+    sampler_t accelerator, __read_only image2d_t srcImg, __read_only image2d_t refImg,
+    __read_only image2d_t src_check_image,
+    __read_only image2d_t ref0_check_image, __read_only image2d_t ref1_check_image,
+    uint flags, uint search_cost_penalty, uint search_cost_precision,
+    short2 count_global, uchar bidir_weight,
+    __global short2 *count_motion_vector_buffer,
+    __global short2 *prediction_motion_vector_buffer,
+    __global char *skip_input_mode_buffer,
+    __global short2 *skip_motion_vector_buffer,
+    __global short2 *search_motion_vector_buffer,
+    __global char *intra_search_predictor_modes,
+    __global ushort *search_residuals, __global ushort *skip_residuals,
+    __global ushort *intra_residuals) {}
+
+)==="
--- a/runtime/built_ins/kernels/vme_block_advanced_motion_estimate_check_intel.igdrcl_built_in
+++ b/runtime/built_ins/kernels/vme_block_advanced_motion_estimate_check_intel.igdrcl_built_in
@@ -0,0 +1,390 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+R"===(
+__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
+block_advanced_motion_estimate_check_intel( // per-MB motion search, skip check and intra search; results are staged in SLM and copied to the output buffers
+    sampler_t accelerator, __read_only image2d_t srcImg, // VME accelerator object; source image
+    __read_only image2d_t refImg, uint flags, uint skip_block_type, // reference image; flags bit 1 enables intra search; skip_block_type 0x0 = 16x16 check, 0x1 = 8x8 check
+    uint search_cost_penalty, uint search_cost_precision, // penalty 1/2/3 selects the low/normal/high MV cost table (anything else: no cost); precision is shifted left by 16
+    __global short2 *count_motion_vector_buffer, // in, per MB: .x = predictor MV count, .y = skip check MV count
+    __global short2 *predictors_buffer, // in: 8 predictor MV slots per MB
+    __global short2 *skip_motion_vector_buffer, // in: 8 skip check MV sets per MB (read as int or int4 per set, see below)
+    __global short2 *motion_vector_buffer, // out: best search MVs
+    __global char *intra_search_predictor_modes, __global ushort *residuals, // out: 22 intra mode bytes per MB; out: search distortions (NULL-checked unless HW_NULL_CHECK)
+    __global ushort *skip_residuals, __global ushort *intra_residuals, // out: skip check distortions; out: 4 ushorts per MB, [0..2] written (NULL-checked unless HW_NULL_CHECK)
+    __read_only image2d_t intraSrcImg, int height, int width, int stride) { // intra source image; frame height/width in MBs; number of MBs handled per work-group
+  __local uint dstSearch[64];         // 8 GRFs
+  __local uint dstSkipIntra[64 + 24]; // 11 GRFs (8 for inter, 3 for intra)
+
+  __local ushort *distSearch =
+      (__local ushort *)&dstSearch[8 * 5]; // distortion in the 6th GRF
+
+  // Initialize the MV cost table (selected by search_cost_penalty):
+  // MV Cost in U4U4 format; bytes listed in decimal, packed low byte first:
+  // No cost    : 0,  0,  0,  0,  0,  0,  0,  0
+  // Low Cost   : 1,  4,  5,  9,  10, 12, 14, 15
+  // Normal Cost: 5,  26, 29, 43, 45, 47, 57, 57
+  // High Cost  : 29, 61, 72, 78, 88, 89, 91, 92
+
+  uint2 MVCostTable;
+  if (search_cost_penalty == 1) {
+    MVCostTable.s0 = 0x09050401;
+    MVCostTable.s1 = 0x0F0E0C0A;
+  } else if (search_cost_penalty == 2) {
+    MVCostTable.s0 = 0x2B1D1A05;
+    MVCostTable.s1 = 0x39392F2D;
+  } else if (search_cost_penalty == 3) {
+    MVCostTable.s0 = 0x4E483D1D;
+    MVCostTable.s1 = 0x5C5B5958;
+  } else {
+    MVCostTable.s0 = 0;
+    MVCostTable.s1 = 0;
+  }
+
+  uint MVCostPrecision = ((uint)search_cost_precision) << 16; // precision value placed in bits 16 and up
+  // Frame is divided into height rows * width columns of MBs.
+  // One h/w thread per WG.
+  // Each WG processes up to 'stride' consecutive MBs, one MB per iteration.
+  // MBs are enumerated column by column: linear id sid = gid_0 * height + gid_1.
+  // gid_0 is the MB column (bounded by width) and gid_1 is the MB row
+  // (bounded by height); the loop stops early once gid_0 reaches width.
+
+  int sid_0 = stride * get_group_id(0); // linear id of the first MB handled by this work-group
+  int gid_0 = sid_0 / height;
+  int gid_1 = sid_0 % height;
+  for (int sid = sid_0; sid < sid_0 + stride && gid_0 < width && gid_1 < height;
+       sid++, gid_0 = sid / height, gid_1 = sid % height) {
+    int2 srcCoord;
+
+    srcCoord.x = gid_0 * 16 +
+                 get_global_offset(0); // 16 pixels wide MBs (globally scalar)
+    srcCoord.y = gid_1 * 16 +
+                 get_global_offset(1); // 16 pixels tall MBs (globally scalar)
+
+    uint curMB = gid_0 + gid_1 * width; // current MB id (row-major)
+    short2 count = count_motion_vector_buffer[curMB]; // .x: predictor MV count, .y: skip check MV count
+
+    int countPredMVs = count.x;
+    if (countPredMVs != 0) {
+      uint offset = curMB * 8;       // 8 predictors per MB
+      offset += get_local_id(0) % 8; // 16 work-items access 8 MVs for MB
+                                     // one predictor for MB per SIMD channel
+
+      // Reduce predictors from Q-pixel to integer precision.
+
+      int2 predMV = 0;
+      if (get_local_id(0) < countPredMVs) {
+        predMV =
+            convert_int2(predictors_buffer[offset]); // one MV per work-item
+        predMV.x /= 4;
+        predMV.y /= 4;
+        predMV.y &= 0xFFFE; // keeps bits 1..15 only (clears bit 0 and the upper 16 bits)
+      }
+
+      // Do up to 8 IMEs, get the best MVs and their distortions, and optionally
+      // a FBR of the best MVs.
+      // Finally the results are written out to SLM.
+
+      intel_work_group_vme_mb_multi_query_8(
+          dstSearch,    // best search MV and its distortions into SLM
+          countPredMVs, // count of predictor MVs (globally scalar - value range
+                        // 1 to 8)
+          MVCostPrecision, // MV cost precision
+          MVCostTable,     // MV cost table
+          srcCoord,        // MB 2-D offset (globally scalar)
+          predMV, // predictor MVs (up to 8 distinct MVs for SIMD16 thread)
+          srcImg, // source
+          refImg, // reference
+          accelerator); // vme object
+    }
+
+    int doIntra = (flags & 0x2) != 0; // bit 1 of flags requests the intra search
+    int intraEdges = 0;
+    if (doIntra) {
+      // Enable all edges by default.
+      intraEdges = 0x3C;
+      // If this is a left-edge MB, then disable left edges.
+      if ((gid_0 == 0) & (get_global_offset(0) == 0)) {
+        intraEdges &= 0x18;
+      }
+      // If this is a right edge MB then disable right edges.
+      if (gid_0 == width - 1) {
+        intraEdges &= 0x34;
+      }
+      // If this is a top-edge MB, then disable top edges.
+      if ((gid_1 == 0) & (get_global_offset(1) == 0)) {
+        intraEdges &= 0x20;
+      }
+      // Set bit6=bit5.
+      intraEdges |= ((intraEdges & 0x20) << 1);
+      intraEdges <<= 8; // move the edge mask up by one byte
+    }
+    int countSkipMVs = count.y;
+    if (countSkipMVs != 0 || doIntra == true) {
+      uint offset = curMB * 8; // 8 sets of skip check MVs per MB
+      offset +=
+          (get_local_id(0) % 8); // 16 work-items access 8 sets of MVs for MB
+                                 // one set of skip MV per SIMD channel
+
+      // Do up to 8 skip checks and get the distortions for each of them.
+      // Finally the results are written out to SLM.
+
+      if ((skip_block_type == 0x0) | ((doIntra) & (countSkipMVs == 0))) { // 16x16 check; also used for an intra-only pass with no skip MVs
+        int skipMVs = 0;
+        if (get_local_id(0) < countSkipMVs) {
+          __global int *skip1_motion_vector_buffer =
+              (__global int *)skip_motion_vector_buffer;
+          skipMVs = skip1_motion_vector_buffer[offset]; // one packed MV for one
+                                                        // work-item
+        }
+        intel_work_group_vme_mb_multi_check_16x16(
+            dstSkipIntra, // distortions into SLM
+            countSkipMVs, // count of skip check MVs (value range 0 to 8)
+            doIntra,      // compute intra modes
+            intraEdges,   // intra edges to use
+            srcCoord,     // MB 2-D offset (globally scalar)
+            skipMVs,      // skip check MVs (up to 8 sets of skip check MVs for
+                          // SIMD16 thread)
+            srcImg,       // source
+            refImg,       // reference
+            intraSrcImg,  // intra source
+            accelerator);
+      }
+
+      if ((skip_block_type == 0x1) & (countSkipMVs > 0)) { // 8x8 check: four component MVs per set
+        int4 skipMVs = 0;
+        if (get_local_id(0) < countSkipMVs) {
+          __global int4 *skip4_motion_vector_buffer =
+              (__global int4 *)(skip_motion_vector_buffer);
+          skipMVs = skip4_motion_vector_buffer[offset]; // four component MVs
+                                                        // per work-item
+        }
+        intel_work_group_vme_mb_multi_check_8x8(
+            dstSkipIntra, // distortions into SLM
+            countSkipMVs, // count of skip check MVs per MB (value range 0 to 8)
+            doIntra,      // compute intra modes
+            intraEdges,   // intra edges to use
+            srcCoord,     // MB 2-D offset (globally scalar)
+            skipMVs, // skip check MVs (up to 8 sets of skip check MVs for SIMD16
+                     // thread)
+            srcImg,  // source
+            refImg,  // reference
+            intraSrcImg, // intra source
+            accelerator);
+      }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier with a local-memory fence before the SLM results are read back below
+
+    // Write Out motion estimation result:
+    // Result format
+    //     Hierarchical row-major layout
+    //     i.e. row-major of blocks MVs in MBs, and row-major of 8 sets of
+    //     MVs/distortion in blocks
+
+    if (countPredMVs != 0) {
+      // 4x4
+      if (intel_get_accelerator_mb_block_type(accelerator) == 0x2) {
+        int index = (gid_0 * 16 + get_local_id(0)) + (gid_1 * 16 * width);
+
+        // 1. 16 work-items enabled.
+        // 2. Work-items gather fwd MVs in strided dword locations 0, 2, .., 30
+        //    (interleaved fwd/bwd MVs) with constant offset 8 (control data
+        //    size) from SLM into contiguous short2 locations 0, 1, .., 15 of
+        //    the global buffer motion_vector_buffer with offset index.
+        // 3. Work-items gather contiguous ushort locations 0, 1, .., 15 from
+        //    distSearch into contiguous ushort locations 0, 1, .., 15 of
+        //    residuals with offset index.
+        //    The residuals store is skipped when residuals is NULL (the check
+        //    is compiled out when HW_NULL_CHECK is defined).
+        // index: 16 entries per MB, MBs laid out row-major (width MBs per
+        // row); the work-item id selects the entry within the MB.
+
+        short2 val = as_short2(dstSearch[8 + get_local_id(0) * 2]);
+        motion_vector_buffer[index] = val;
+
+#ifndef HW_NULL_CHECK
+        if (residuals != NULL)
+#endif
+        {
+          residuals[index] = distSearch[get_local_id(0)];
+        }
+      }
+
+      // 8x8
+      else if (intel_get_accelerator_mb_block_type(accelerator) == 0x1) {
+        // Only the first 4 work-items are needed.
+        if (get_local_id(0) < 4) {
+          int index = (gid_0 * 4 + get_local_id(0)) + (gid_1 * 4 * width);
+
+          // 1. 4 work-items enabled.
+          // 2. Work-items gather fwd MVs in strided dword locations 0, 8, 16,
+          //    24 (interleaved fwd/bwd MVs) with constant offset 8 from SLM
+          //    into contiguous short2 locations 0, 1, 2, 3 of the global
+          //    buffer motion_vector_buffer with offset index.
+          // 3. Work-items gather strided ushort locations 0, 4, 8, 12 from
+          //    distSearch into contiguous ushort locations 0, 1, 2, 3 of
+          //    residuals with offset index.
+          //    The residuals store is skipped when residuals is NULL (the
+          //    check is compiled out when HW_NULL_CHECK is defined).
+          // index: 4 entries per MB, MBs laid out row-major.
+
+          short2 val = as_short2(dstSearch[8 + get_local_id(0) * 4 * 2]);
+          motion_vector_buffer[index] = val;
+
+#ifndef HW_NULL_CHECK
+          if (residuals != NULL)
+#endif
+          {
+            residuals[index] = distSearch[get_local_id(0) * 4];
+          }
+        }
+      }
+
+      // 16x16
+      else if (intel_get_accelerator_mb_block_type(accelerator) == 0x0) {
+        // Only the first work-item is needed.
+        if (get_local_id(0) == 0) {
+          int index = gid_0 + gid_1 * width;
+
+          // 1. 1 work-item enabled.
+          // 2. Work-item gathers fwd MV in dword location 0 with constant
+          //    offset 8 from SLM into short2 location 0 of the global buffer
+          //    motion_vector_buffer with offset index.
+          // 3. Work-item gathers ushort location 0 from distSearch into ushort
+          //    location 0 of residuals with offset index.
+          //    (index is the row-major MB id: one entry per MB.)
+
+          short2 val = as_short2(dstSearch[8]);
+          motion_vector_buffer[index] = val;
+
+#ifndef HW_NULL_CHECK
+          if (residuals != NULL)
+#endif
+          {
+            residuals[index] = distSearch[0];
+          }
+        }
+      }
+    }
+
+    // Write out motion skip check result:
+    // Result format
+    //     Hierarchical row-major layout
+    //     i.e. row-major of blocks in MBs, and row-major of 8 sets of
+    //     distortions in blocks
+
+    if (countSkipMVs != 0) {
+      if (skip_block_type == 0x0) {
+        // Copy out 8 (1 component) sets of distortion values.
+
+        int index = (gid_0 * 8) + (get_local_id(0)) + (gid_1 * 8 * width);
+
+        if (get_local_id(0) < countSkipMVs) {
+          __local ushort *distSkip = (__local ushort *)&dstSkipIntra[0];
+
+          // 1. Up to 8 work-items are enabled.
+          // 2. The work-item gathers distSkip locations 0, 16*1, .., 16*7 and
+          //    copies them to contiguous skip_residuals locations 0, 1, 2, ..,
+          //    7.
+          skip_residuals[index] = distSkip[get_local_id(0) * 16];
+        }
+      } else {
+        // Copy out 8 (4 component) sets of distortion values.
+
+        int index =
+            (gid_0 * 8 * 4) + (get_local_id(0)) + (gid_1 * 8 * 4 * width);
+
+        __local ushort *distSkip = (__local ushort *)&dstSkipIntra[0];
+
+        if (get_local_id(0) < countSkipMVs * 4) {
+          // 1. Up to 16 work-items are enabled.
+          // 2. The work-item gathers distSkip locations 0, 4*1, .., 4*31 and
+          //    copies them to contiguous skip_residuals locations 0, 1, 2, ..,
+          //    31.
+
+          skip_residuals[index] = distSkip[get_local_id(0) * 4]; // entries 0..15 of this MB
+          skip_residuals[index + 16] = distSkip[(get_local_id(0) + 16) * 4]; // entries 16..31 of this MB
+        }
+      }
+    }
+
+    // Write out intra search result:
+
+    if (doIntra) { // per-MB output layout (22 bytes): [0] 16x16 mode, [1..4] 8x8 modes, [5..20] 4x4 modes
+
+      int index_low =
+          (gid_0 * 22) + (get_local_id(0) * 2) + (gid_1 * 22 * width);
+      int index_high =
+          (gid_0 * 22) + (get_local_id(0) * 2) + 1 + (gid_1 * 22 * width);
+
+      // Write out the 4x4 intra modes
+      if (get_local_id(0) < 8) {
+        __local char *dstIntra_4x4 =
+            (__local char *)(&dstSkipIntra[64 + 16 + 4]);
+        char value = dstIntra_4x4[get_local_id(0)]; // each SLM byte packs two modes, one per nibble
+        char value_low = (value)&0xf;
+        char value_high = (value >> 4) & 0xf;
+        intra_search_predictor_modes[index_low + 5] = value_low;
+        intra_search_predictor_modes[index_high + 5] = value_high;
+      }
+
+      // Write out the 8x8 intra modes
+      if (get_local_id(0) < 4) {
+        __local char *dstIntra_8x8 =
+            (__local char *)(&dstSkipIntra[64 + 8 + 4]);
+        char value = dstIntra_8x8[get_local_id(0) * 2]; // every second SLM byte is read; only its low nibble is kept
+        char value_low = (value)&0xf;
+        int index = (gid_0 * 22) + (get_local_id(0)) + (gid_1 * 22 * width);
+        intra_search_predictor_modes[index + 1] = value_low;
+      }
+
+      // Write out the 16x16 intra modes
+      if (get_local_id(0) < 1) {
+        __local char *dstIntra_16x16 =
+            (__local char *)(&dstSkipIntra[64 + 0 + 4]);
+        char value = dstIntra_16x16[get_local_id(0)];
+        char value_low = (value)&0xf;
+        intra_search_predictor_modes[index_low] = value_low; // index_low is the MB base here because the local id is 0
+      }
+
+// Get the intra residuals.
+#ifndef HW_NULL_CHECK
+      if (intra_residuals != NULL)
+#endif
+      {
+        int index = (gid_0 * 4) + (gid_1 * 4 * width); // 4 ushort slots per MB; slots 0..2 are written below
+
+        if (get_local_id(0) < 1) {
+          __local ushort *distIntra_4x4 = (__local ushort *)(&dstSkipIntra[64 + 16 + 3]);
+          __local ushort *distIntra_8x8 = (__local ushort *)(&dstSkipIntra[64 + 8 + 3]);
+          __local ushort *distIntra_16x16 = (__local ushort *)(&dstSkipIntra[64 + 0 + 3]);
+          intra_residuals[index + 2] = distIntra_4x4[0];
+          intra_residuals[index + 1] = distIntra_8x8[0];
+          intra_residuals[index + 0] = distIntra_16x16[0];
+        }
+      }
+    }
+  }
+}
+)==="
--- a/runtime/built_ins/kernels/vme_block_advanced_motion_estimate_check_intel_frontend.igdrcl_built_in
+++ b/runtime/built_ins/kernels/vme_block_advanced_motion_estimate_check_intel_frontend.igdrcl_built_in
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+R"===(
+__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
+block_advanced_motion_estimate_check_intel(
+    sampler_t accelerator, __read_only image2d_t srcImg,
+    __read_only image2d_t refImg, uint flags, uint skip_block_type,
+    uint search_cost_penalty, uint search_cost_precision,
+    __global short2 *count_motion_vector_buffer,
+    __global short2 *predictors_buffer,
+    __global short2 *skip_motion_vector_buffer,
+    __global short2 *motion_vector_buffer,
+    __global char *intra_search_predictor_modes, __global ushort *residuals,
+    __global ushort *skip_residuals, __global ushort *intra_residuals) { // empty body: signature only, without the intraSrcImg/height/width/stride arguments of the implementation file
+}
+)==="
--- a/runtime/built_ins/kernels/vme_block_motion_estimate_intel.igdrcl_built_in
+++ b/runtime/built_ins/kernels/vme_block_motion_estimate_intel.igdrcl_built_in
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+R"===(
+__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
+block_motion_estimate_intel(sampler_t accelerator, __read_only image2d_t srcImg, // per-MB VME query; results are staged in SLM and copied out
+                            __read_only image2d_t refImg,
+                            __global short2 *prediction_motion_vector_buffer, // in: one predictor MV per MB (NULL-checked unless HW_NULL_CHECK)
+                            __global short2 *motion_vector_buffer, // out: result MVs, 16 / 4 / 1 per MB depending on the block type
+                            __global ushort *residuals, int height, int width, // out: distortions (NULL-checked unless HW_NULL_CHECK); frame height/width in MBs
+                            int stride) { // number of MBs handled per work-group
+  __local uint dst[64]; // SLM staging buffer filled by intel_work_group_vme_mb_query
+  __local ushort *dist = (__local ushort *)&dst[8 * 5]; // distortions are read starting at dword 40
+
+  int sid_0 = stride * get_group_id(0); // linear id of the first MB handled by this work-group
+  int gid_0 = sid_0 / height; // MB column: ids run column by column (sid = gid_0 * height + gid_1)
+  int gid_1 = sid_0 % height; // MB row
+  for (int sid = sid_0; sid < sid_0 + stride && gid_0 < width && gid_1 < height;
+       sid++, gid_0 = sid / height, gid_1 = sid % height) {
+    int2 srcCoord = 0;
+    int2 refCoord = 0;
+
+    srcCoord.x = gid_0 * 16 + get_global_offset(0); // MBs are 16 pixels wide
+    srcCoord.y = gid_1 * 16 + get_global_offset(1); // MBs are 16 pixels tall
+
+    short2 predMV = 0;
+
+#ifndef HW_NULL_CHECK
+    if (prediction_motion_vector_buffer != NULL)
+#endif
+    {
+      predMV = prediction_motion_vector_buffer[gid_0 + gid_1 * width]; // predictors are indexed by row-major MB id
+      refCoord.x = predMV.x / 4; // NOTE(review): presumably quarter-pixel to integer precision - confirm
+      refCoord.y = predMV.y / 4;
+      refCoord.y = refCoord.y & 0xFFFE; // keeps bits 1..15 only (clears bit 0 and the upper 16 bits)
+    }
+
+    {
+      intel_work_group_vme_mb_query(dst, srcCoord, refCoord, srcImg, refImg,
+                                    accelerator);
+    }
+    barrier(CLK_LOCAL_MEM_FENCE); // work-group barrier with a local-memory fence before dst is read back below
+
+    // Write Out Result
+    // (4x4 and 8x8 results are stored as one frame-wide row-major grid of blocks)
+    // 4x4
+    if (intel_get_accelerator_mb_block_type(accelerator) == 0x2) {
+      int x = get_local_id(0) % 4;
+      int y = get_local_id(0) / 4;
+      int index = (gid_0 * 4 + x) + (gid_1 * 4 + y) * width * 4; // grid is width * 4 blocks wide
+
+      short2 val = as_short2(dst[8 + (y * 4 + x) * 2]); // MVs start at dword 8, every second dword
+      motion_vector_buffer[index] = val;
+
+#ifndef HW_NULL_CHECK
+      if (residuals != NULL)
+#endif
+      {
+        residuals[index] = dist[y * 4 + x];
+      }
+    }
+
+    // 8x8
+    if (intel_get_accelerator_mb_block_type(accelerator) == 0x1) {
+      if (get_local_id(0) < 4) { // only the first 4 work-items write
+        int x = get_local_id(0) % 2;
+        int y = get_local_id(0) / 2;
+        int index = (gid_0 * 2 + x) + (gid_1 * 2 + y) * width * 2; // grid is width * 2 blocks wide
+        short2 val = as_short2(dst[8 + (y * 2 + x) * 8]); // dwords 8, 16, 24, 32
+        motion_vector_buffer[index] = val;
+
+#ifndef HW_NULL_CHECK
+        if (residuals != NULL)
+#endif
+        {
+          residuals[index] = dist[(y * 2 + x) * 4]; // ushort locations 0, 4, 8, 12
+        }
+      }
+    }
+
+    // 16x16
+    if (intel_get_accelerator_mb_block_type(accelerator) == 0x0) {
+      if (get_local_id(0) == 0) { // only the first work-item writes
+        int index = gid_0 + gid_1 * width; // row-major MB id: one entry per MB
+
+        short2 val = as_short2(dst[8]);
+        motion_vector_buffer[index] = val;
+
+#ifndef HW_NULL_CHECK
+        if (residuals != NULL)
+#endif
+        {
+          residuals[index] = dist[0];
+        }
+      }
+    }
+  }
+}
+)==="
--- a/runtime/built_ins/kernels/vme_block_motion_estimate_intel_frontend.igdrcl_built_in
+++ b/runtime/built_ins/kernels/vme_block_motion_estimate_intel_frontend.igdrcl_built_in
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+R"===(
+__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
+block_motion_estimate_intel(sampler_t accelerator, __read_only image2d_t srcImg,
+                            __read_only image2d_t refImg,
+                            __global short2 *prediction_motion_vector_buffer,
+                            __global short2 *motion_vector_buffer,
+                            __global ushort *residuals) { // empty body: signature only, without the height/width/stride arguments of the implementation file
+}
+)==="
--- a/runtime/built_ins/registry/CMakeLists.txt
+++ b/runtime/built_ins/registry/CMakeLists.txt
@@ -0,0 +1,40 @@
+# Copyright (c) 2017, Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+set (NEO_REGISTERED_BUILTINS_SOURCES # files that make up the built-in source registry target
+  register_copy_kernels_source.cpp
+  register_ext_vme_source.cpp
+  CMakeLists.txt
+)
+# Compile the registry sources as an OBJECT library; the target name comes from BUILTINS_SOURCES_LIB_NAME, which is set outside this file.
+add_library(${BUILTINS_SOURCES_LIB_NAME} OBJECT
+  ${NEO_REGISTERED_BUILTINS_SOURCES}
+)
+# Build the objects as position-independent code.
+set_target_properties(${BUILTINS_SOURCES_LIB_NAME}
+  PROPERTIES POSITION_INDEPENDENT_CODE ON
+)
+# Include paths used only when compiling this target (PRIVATE); the variables are defined outside this file.
+target_include_directories(${BUILTINS_SOURCES_LIB_NAME} PRIVATE
+  ${KHRONOS_HEADERS_DIR}
+  ${UMKM_SHAREDDATA_INCLUDE_PATHS}
+  ${IGDRCL__IGC_INCLUDE_DIR}
+  ${THIRD_PARTY_DIR}
+)
--- a/runtime/built_ins/registry/built_ins_registry.h
+++ b/runtime/built_ins/registry/built_ins_registry.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include "runtime/built_ins/built_ins.h"
+
+namespace OCLRT {
+
+struct RegisterEmbeddedResource { // helper whose constructors store one named resource in EmbeddedStorageRegistry
+    RegisterEmbeddedResource(const char *name, const char *resource, size_t resourceLength) { // name: registry key; resource/resourceLength: the bytes to register
+        auto &storageRegistry = EmbeddedStorageRegistry::getInstance();
+        storageRegistry.store(name, createBuiltinResource(resource, resourceLength)); // NOTE(review): presumably createBuiltinResource copies the bytes, since callers pass temporaries - confirm
+    }
+    // Overload for std::string sources: size() + 1 also passes the terminating '\0' that data() guarantees.
+    RegisterEmbeddedResource(const char *name, std::string &&resource)
+        : RegisterEmbeddedResource(name, resource.data(), resource.size() + 1) {
+    }
+};
+
+} // namespace OCLRT
--- a/runtime/built_ins/registry/register_copy_kernels_source.cpp
+++ b/runtime/built_ins/registry/register_copy_kernels_source.cpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <string>
+#include "runtime/built_ins/registry/built_ins_registry.h"
+
+namespace OCLRT {
+
+static RegisterEmbeddedResource registerCopyBufferToBufferSrc( // file-local static object: its constructor registers the CopyBufferToBuffer kernel source during static initialization
+    createBuiltinResourceName(
+        EBuiltInOps::CopyBufferToBuffer,
+        BuiltinCode::getExtension(BuiltinCode::ECodeType::Source))
+        .c_str(), // the temporary name string lives until the end of this full-expression, so the pointer stays valid during the constructor call
+    std::string( // NOTE(review): the included file presumably expands to a raw string literal, as the vme_*.igdrcl_built_in files do - confirm
+#include "runtime/built_ins/kernels/copy_buffer_to_buffer.igdrcl_built_in"
+        ));
+
+static RegisterEmbeddedResource registerCopyBufferRectSrc( // registers the CopyBufferRect kernel source
+    createBuiltinResourceName(
+        EBuiltInOps::CopyBufferRect,
+        BuiltinCode::getExtension(BuiltinCode::ECodeType::Source))
+        .c_str(),
+    std::string(
+#include "runtime/built_ins/kernels/copy_buffer_rect.igdrcl_built_in"
+        ));
+
+static RegisterEmbeddedResource registerFillBufferSrc( // registers the FillBuffer kernel source
+    createBuiltinResourceName(
+        EBuiltInOps::FillBuffer,
+        BuiltinCode::getExtension(BuiltinCode::ECodeType::Source))
+        .c_str(),
+    std::string(
+#include "runtime/built_ins/kernels/fill_buffer.igdrcl_built_in"
+        ));
+
+static RegisterEmbeddedResource registerCopyBufferToImage3dSrc( // registers the CopyBufferToImage3d kernel source
+    createBuiltinResourceName(
+        EBuiltInOps::CopyBufferToImage3d,
+        BuiltinCode::getExtension(BuiltinCode::ECodeType::Source))
+        .c_str(),
+    std::string(
+#include "runtime/built_ins/kernels/copy_buffer_to_image3d.igdrcl_built_in"
+        ));
+
+static RegisterEmbeddedResource registerCopyImage3dToBufferSrc( // registers the CopyImage3dToBuffer kernel source
+    createBuiltinResourceName(
+        EBuiltInOps::CopyImage3dToBuffer,
+        BuiltinCode::getExtension(BuiltinCode::ECodeType::Source))
+        .c_str(),
+    std::string(
+#include "runtime/built_ins/kernels/copy_image3d_to_buffer.igdrcl_built_in"
+        ));
+
+static RegisterEmbeddedResource registerCopyImageToImage1dSrc( // registers the CopyImageToImage1d kernel source
+    createBuiltinResourceName(
+        EBuiltInOps::CopyImageToImage1d,
+        BuiltinCode::getExtension(BuiltinCode::ECodeType::Source))
+        .c_str(),
+    std::string(
+#include "runtime/built_ins/kernels/copy_image_to_image1d.igdrcl_built_in"
+        ));
+
+static RegisterEmbeddedResource registerCopyImageToImage2dSrc( // registers the CopyImageToImage2d kernel source
+    createBuiltinResourceName(
+        EBuiltInOps::CopyImageToImage2d,
+        BuiltinCode::getExtension(BuiltinCode::ECodeType::Source))
+        .c_str(),
+    std::string(
+#include "runtime/built_ins/kernels/copy_image_to_image2d.igdrcl_built_in"
+        ));
+
+static RegisterEmbeddedResource registerCopyImageToImage3dSrc( // registers the CopyImageToImage3d kernel source
+    createBuiltinResourceName(
+        EBuiltInOps::CopyImageToImage3d,
+        BuiltinCode::getExtension(BuiltinCode::ECodeType::Source))
+        .c_str(),
+    std::string(
+#include "runtime/built_ins/kernels/copy_image_to_image3d.igdrcl_built_in"
+        ));
+
+static RegisterEmbeddedResource registerFillImage1dSrc( // registers the FillImage1d kernel source
+    createBuiltinResourceName(
+        EBuiltInOps::FillImage1d,
+        BuiltinCode::getExtension(BuiltinCode::ECodeType::Source))
+        .c_str(),
+    std::string(
+#include "runtime/built_ins/kernels/fill_image1d.igdrcl_built_in"
+        ));
+
+static RegisterEmbeddedResource registerFillImage2dSrc( // registers the FillImage2d kernel source
+    createBuiltinResourceName(
+        EBuiltInOps::FillImage2d,
+        BuiltinCode::getExtension(BuiltinCode::ECodeType::Source))
+        .c_str(),
+    std::string(
+#include "runtime/built_ins/kernels/fill_image2d.igdrcl_built_in"
+        ));
+
+static RegisterEmbeddedResource registerFillImage3dSrc( // registers the FillImage3d kernel source
+    createBuiltinResourceName(
+        EBuiltInOps::FillImage3d,
+        BuiltinCode::getExtension(BuiltinCode::ECodeType::Source))
+        .c_str(),
+    std::string(
+#include "runtime/built_ins/kernels/fill_image3d.igdrcl_built_in"
+        ));
+
+} // namespace OCLRT
--- a/runtime/built_ins/registry/register_ext_vme_source.cpp
+++ b/runtime/built_ins/registry/register_ext_vme_source.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <string>
+#include "runtime/built_ins/registry/built_ins_registry.h"
+
+namespace OCLRT {
+// Static registrar for the VmeBlockMotionEstimateIntel built-in source; the #include expands inside std::string(...).
+static RegisterEmbeddedResource registerVmeSrc(
+    createBuiltinResourceName(
+        EBuiltInOps::VmeBlockMotionEstimateIntel,
+        BuiltinCode::getExtension(BuiltinCode::ECodeType::Source))
+        .c_str(),
+    std::string(
+#include "runtime/built_ins/kernels/vme_block_motion_estimate_intel.igdrcl_built_in"
+        ));
+// Static registrar for the VmeBlockAdvancedMotionEstimateCheckIntel built-in source (same #include-in-expression form).
+static RegisterEmbeddedResource registerVmeAdvancedSrc(
+    createBuiltinResourceName(
+        EBuiltInOps::VmeBlockAdvancedMotionEstimateCheckIntel,
+        BuiltinCode::getExtension(BuiltinCode::ECodeType::Source))
+        .c_str(),
+    std::string(
+#include "runtime/built_ins/kernels/vme_block_advanced_motion_estimate_check_intel.igdrcl_built_in"
+        ));
+// Static registrar for the VmeBlockAdvancedMotionEstimateBidirectionalCheckIntel built-in source.
+static RegisterEmbeddedResource registerVmeAdvancedBidirectionalSrc(
+    createBuiltinResourceName(
+        EBuiltInOps::VmeBlockAdvancedMotionEstimateBidirectionalCheckIntel,
+        BuiltinCode::getExtension(BuiltinCode::ECodeType::Source))
+        .c_str(),
+    std::string(
+#include "runtime/built_ins/kernels/vme_block_advanced_motion_estimate_bidirectional_check_intel.igdrcl_built_in"
+        ));
+
+} // namespace OCLRT
--- a/runtime/built_ins/sip.cpp
+++ b/runtime/built_ins/sip.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "runtime/built_ins/sip.h"
+#include "runtime/device/device.h"
+#include "runtime/helpers/debug_helpers.h"
+#include "runtime/helpers/string.h"
+
+namespace OCLRT {
+
+// Maps a SIP kernel type to its internal compiler option string.
+const char *getSipKernelCompilerInternalOptions(SipKernelType kernel) {
+    if (kernel == SipKernelType::Csr) {
+        return "-cl-include-sip-csr";
+    }
+    // Any other kernel type: trip the debug break, then fall back to no options.
+    DEBUG_BREAK_IF(true);
+    return "";
+}
+
+// Returns a dummy LLVM IR module (a single empty kernel @f) targeting spir or spir64.
+const char *getSipLlSrc(const Device &device) {
+#define SIP_DUMMY_LL_BODY \
+    "define void @f()  {                        \n" \
+    "    ret void                               \n" \
+    "}                                          \n" \
+    "!opencl.compiler.options = !{!0}           \n" \
+    "!opencl.kernels = !{!1}                    \n" \
+    "!0 = !{}                                   \n" \
+    "!1 = !{void()* @f, !2, !3, !4, !5, !6, !7} \n" \
+    "!2 = !{!\"kernel_arg_addr_space\"}         \n" \
+    "!3 = !{!\"kernel_arg_access_qual\"}        \n" \
+    "!4 = !{!\"kernel_arg_type\"}               \n" \
+    "!5 = !{!\"kernel_arg_type_qual\"}          \n" \
+    "!6 = !{!\"kernel_arg_base_type\"}          \n" \
+    "!7 = !{!\"kernel_arg_name\"}               \n"
+
+    constexpr const char *dummySrc32 =
+        "target datalayout = \"e-p:32:32:32\"       \n"
+        "target triple = \"spir\"                   \n" SIP_DUMMY_LL_BODY;
+
+    constexpr const char *dummySrc64 =
+        "target datalayout = \"e-p:64:64:64\"       \n"
+        "target triple = \"spir64\"                 \n" SIP_DUMMY_LL_BODY;
+#undef SIP_DUMMY_LL_BODY
+
+    // The 64-bit module is used only for 8-byte host pointers with 32-bit addressing not forced.
+    const bool use64BitSrc = !device.getDeviceInfo().force32BitAddressess && (sizeof(void *) == 8);
+    return use64BitSrc ? dummySrc64 : dummySrc32;
+}
+
+// Keeps a private copy of the SIP binary; a null or empty input is treated as unrecoverable.
+SipKernel::SipKernel(SipKernelType type, const void *binary, size_t binarySize)
+    : type(type) {
+    UNRECOVERABLE_IF(nullptr == binary);
+    UNRECOVERABLE_IF(0 == binarySize);
+    std::unique_ptr<char[]> storage(new char[binarySize]);
+    memcpy_s(storage.get(), binarySize, binary, binarySize);
+    this->binary.swap(storage);
+    this->binarySize = binarySize;
+}
+}
--- a/runtime/built_ins/sip.h
+++ b/runtime/built_ins/sip.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include <cinttypes>
+#include <memory>
+
+namespace OCLRT {
+
+class Device;
+
+enum class SipKernelType : std::uint32_t {
+    Csr = 0, // the only type with a dedicated option in getSipKernelCompilerInternalOptions()
+    COUNT    // one past the last kernel type; also the in-class default of SipKernel::type
+};
+// Returns the internal compiler option string for the given SIP kernel type ("" for unhandled types).
+const char *getSipKernelCompilerInternalOptions(SipKernelType kernel);
+// Returns dummy LLVM IR source text, in a 32-bit or 64-bit flavour chosen from the device info.
+const char *getSipLlSrc(const Device &device);
+// Move-only holder of a SIP kernel binary together with its kernel type.
+class SipKernel {
+  public:
+    SipKernel(SipKernelType type, const void *binary, size_t binarySize); // copies binarySize bytes from binary
+    SipKernel(const SipKernel &) = delete;
+    SipKernel &operator=(const SipKernel &) = delete;
+    SipKernel(SipKernel &&) = default;
+    SipKernel &operator=(SipKernel &&) = default;
+    // Accessors: the returned pointer refers to storage owned by this object.
+    const char *getBinary() const {
+        return binary.get();
+    }
+    size_t getBinarySize() const {
+        return binarySize;
+    }
+
+  protected:
+    SipKernelType type = SipKernelType::COUNT;
+    std::unique_ptr<char[]> binary = nullptr; // owned copy made by the constructor
+    size_t binarySize = 0;                    // size passed to the constructor
+};
+}
--- a/runtime/built_ins/vme_dispatch_builder.h
+++ b/runtime/built_ins/vme_dispatch_builder.h
@@ -0,0 +1,492 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include "runtime/accelerators/intel_accelerator.h"
+#include "runtime/accelerators/intel_motion_estimation.h"
+#include "runtime/built_ins/built_ins.h"
+#include "runtime/helpers/dispatch_info_builder.h"
+#include "runtime/mem_obj/buffer.h"
+#include "runtime/mem_obj/image.h"
+
+namespace OCLRT {
+template <typename HWFamily>
+class VmeBuiltinDispatchInfoBuilder : public BuiltinDispatchInfoBuilder {
+  public:
+    // Hands vmeKernel to populate(), then caches the indices of the arguments this builder works with.
+    VmeBuiltinDispatchInfoBuilder(BuiltIns &kernelsLib, Context &context, Device &device, EBuiltInOps builtinOp,
+                                  const char *kernelName)
+        : BuiltinDispatchInfoBuilder(kernelsLib) {
+        populate(context, device, builtinOp, mediaKernelsBuildOptions, kernelName, vmeKernel);
+        auto &&info = vmeKernel->getKernelInfo();
+        widthArgNum = info.getArgNumByName("width");
+        heightArgNum = info.getArgNumByName("height");
+        strideArgNum = info.getArgNumByName("stride");
+        acceleratorArgNum = info.getArgNumByName("accelerator");
+        srcImgArgNum = info.getArgNumByName("srcImg");
+        refImgArgNum = info.getArgNumByName("refImg");
+        motionVectorBufferArgNum = info.getArgNumByName("motion_vector_buffer");
+        predictionMotionVectorBufferArgNum = info.getArgNumByName("prediction_motion_vector_buffer");
+        residualsArgNum = info.getArgNumByName("residuals");
+    }
+
+    // Converts the x/y global work size into counts of 16x16 macro-blocks, rounding up.
+    void getBlkTraits(const Vec3<size_t> &inGws, size_t &gwWidthInBlk, size_t &gwHeightInBlk) const {
+        constexpr size_t macroBlockEdge = 16;
+        gwWidthInBlk = (inGws.x + (macroBlockEdge - 1)) / macroBlockEdge;
+        gwHeightInBlk = (inGws.y + (macroBlockEdge - 1)) / macroBlockEdge;
+    }
+
+    // Re-shapes the user's range into a 1D dispatch of the internal VME kernel; returns false only for a null kern.
+    bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo, Kernel *kern,
+                            const uint32_t inDim, const Vec3<size_t> &inGws, const Vec3<size_t> &inLws, const Vec3<size_t> &inOffset) const override {
+        if (kern == nullptr) {
+            return false;
+        }
+
+        size_t gwWidthInBlk = 0;
+        size_t gwHeightInBlk = 0;
+        getBlkTraits(inGws, gwWidthInBlk, gwHeightInBlk);
+
+        cl_int height = (cl_int)gwHeightInBlk;
+        cl_int width = (cl_int)gwWidthInBlk;
+        size_t numThreadsX = gwWidthInBlk;
+        const size_t simdWidth = vmeKernel->getKernelInfo().getMaxSimdSize();
+        // Macro-blocks per thread, rounded up. A zero-wide range has no threads, so the division
+        // (which would divide by zero) is skipped and the height is used instead.
+        cl_int stride = (numThreadsX == 0) ? height : (height * width + (cl_int)numThreadsX - 1) / (cl_int)numThreadsX;
+        // update implicit args
+        vmeKernel->setArg(heightArgNum, sizeof(height), &height);
+        vmeKernel->setArg(widthArgNum, sizeof(width), &width);
+        vmeKernel->setArg(strideArgNum, sizeof(stride), &stride);
+
+        // Update global work size to force macro-block to HW thread execution model
+        Vec3<size_t> gws = {numThreadsX * simdWidth, 1, 1};
+        Vec3<size_t> lws = {vmeKernel->getKernelInfo().reqdWorkGroupSize[0], 1, 1};
+        DispatchInfoBuilder<SplitDispatch::Dim::d2D, SplitDispatch::SplitMode::NoSplit> builder;
+        builder.setDispatchGeometry(gws, lws, inOffset, gws, lws);
+        builder.setKernel(vmeKernel);
+        builder.bake(multiDispatchInfo);
+        return true;
+    }
+
+    // Forwards an explicitly-set argument to the internal VME kernel and reports the status through err.
+    // Always returns false; a null accelerator yields CL_INVALID_ACCELERATOR_INTEL without calling setArg.
+    bool setExplicitArg(uint32_t argIndex, size_t argSize, const void *argVal, cl_int &err) const override {
+        // width, height and stride are not expected here; buildDispatchInfos() sets them
+        DEBUG_BREAK_IF((argIndex == widthArgNum) || (argIndex == heightArgNum) || (argIndex == strideArgNum));
+        const bool nullAccelerator = (argIndex == acceleratorArgNum) && (argVal == nullptr);
+        err = nullAccelerator ? CL_INVALID_ACCELERATOR_INTEL : vmeKernel->setArg(argIndex, argSize, argVal);
+        return false;
+    }
+
+    // Validates an enqueue of the VME built-in.
+    // The range must be two-dimensional and an accelerator must be bound to the kernel;
+    // all remaining checks are delegated to validateVmeDispatch().
+    cl_int validateDispatch(Kernel *kernel, uint32_t inworkDim, const Vec3<size_t> &inGws, const Vec3<size_t> &inLws, const Vec3<size_t> &inOffset) const override {
+        if (inworkDim != 2) {
+            return CL_INVALID_WORK_DIMENSION;
+        }
+
+        size_t blocksAcross = 0;
+        size_t blocksDown = 0;
+        getBlkTraits(inGws, blocksAcross, blocksDown);
+        const size_t blockCount = blocksAcross * blocksDown;
+
+        IntelAccelerator *accelerator = castToObject<IntelAccelerator>((cl_accelerator_intel)vmeKernel->getKernelArg(acceleratorArgNum));
+        if (nullptr == accelerator) {
+            return CL_INVALID_KERNEL_ARGS; // accelerator was not set
+        }
+        DEBUG_BREAK_IF(accelerator->getDescriptorSize() != sizeof(cl_motion_estimation_desc_intel));
+        const auto *desc = reinterpret_cast<const cl_motion_estimation_desc_intel *>(accelerator->getDescriptor());
+
+        // Multiplier handed to validateVmeDispatch(): 4 for 8x8 blocks, 16 for 4x4 blocks, otherwise 1.
+        size_t blockMultiplier = 1;
+        if (desc->mb_block_type == CL_ME_MB_TYPE_8x8_INTEL) {
+            blockMultiplier = 4;
+        } else if (desc->mb_block_type == CL_ME_MB_TYPE_4x4_INTEL) {
+            blockMultiplier = 16;
+        }
+
+        return validateVmeDispatch(inGws, inOffset, blockCount, blockMultiplier);
+    }
+
+    // Checks that the buffer bound to argument bufferArgNum is at least minimumSizeExpected big.
+    // notes on corner cases :
+    // * if arg not available in kernels (bufferArgNum == -1) - returns true
+    // * if arg set to nullptr, or castToObject<Buffer> otherwise yields nullptr - returns true
+    bool validateBufferSize(int32_t bufferArgNum, size_t minimumSizeExpected) const {
+        if (-1 == bufferArgNum) {
+            return true;
+        }
+
+        cl_mem handle = (cl_mem)vmeKernel->getKernelArg(bufferArgNum);
+        auto boundBuffer = castToObject<Buffer>(handle);
+        if (nullptr == boundBuffer) {
+            return true;
+        }
+
+        // Anything at least as large as the expected minimum passes.
+        const size_t actualSize = boundBuffer->getSize();
+        const bool bigEnough = !(actualSize < minimumSizeExpected);
+        return bigEnough;
+    }
+
+    template <typename EnumBaseType>
+    bool validateEnumVal(EnumBaseType val) const { // recursion terminator: no candidate values left, so no match
+        return false;
+    }
+    // Returns true when val equals any of the listed values; each candidate is cast to EnumBaseType before comparing.
+    template <typename EnumBaseType, typename ExpectedValType, typename... ExpectedValsTypes>
+    bool validateEnumVal(EnumBaseType val, ExpectedValType expectedVal, ExpectedValsTypes... expVals) const {
+        return (val == static_cast<EnumBaseType>(expectedVal)) || validateEnumVal<EnumBaseType, ExpectedValsTypes...>(val, expVals...);
+    }
+
+    // Checks that the by-value argument argNum currently holds one of the values in expVals.
+    // Corner case: if the arg is not available in the kernel (argNum == -1) the check passes.
+    template <typename EnumBaseType, typename... ExpectedValsTypes>
+    bool validateEnumArg(int32_t argNum, ExpectedValsTypes... expVals) const {
+        if (-1 == argNum) {
+            return true;
+        }
+        const auto argIndex = static_cast<uint32_t>(argNum);
+        const EnumBaseType current = this->getKernelArgByValValue<EnumBaseType>(argIndex);
+        return validateEnumVal<EnumBaseType, ExpectedValsTypes...>(current, expVals...);
+    }
+
+    // Reads the value of a by-value kernel argument from the kernel's cross-thread data at its patch offset.
+    template <typename RetType>
+    RetType getKernelArgByValValue(uint32_t argNum) const {
+        auto &patches = vmeKernel->getKernelInfo().kernelArgInfo[argNum].kernelArgPatchInfoVector;
+        DEBUG_BREAK_IF(patches.size() != 1);
+        DEBUG_BREAK_IF(sizeof(RetType) > patches[0].size);
+        return *(RetType *)(vmeKernel->getCrossThreadData() + patches[0].crossthreadOffset);
+    }
+
+    // Checks the source and reference images currently bound to the VME kernel.
+    // Both must be set, use the CL_R / CL_UNORM_INT8 format and be tiled; in addition
+    // inputRegion + offset must fit inside the source image.
+    cl_int validateImages(Vec3<size_t> inputRegion, Vec3<size_t> offset) const {
+        Image *srcImg = castToObject<Image>((cl_mem)vmeKernel->getKernelArg(srcImgArgNum));
+        Image *refImg = castToObject<Image>((cl_mem)vmeKernel->getKernelArg(refImgArgNum));
+        if (!srcImg || !refImg) {
+            return CL_INVALID_KERNEL_ARGS;
+        }
+
+        Image *const images[] = {srcImg, refImg};
+        for (Image *image : images) {
+            const cl_image_format &format = image->getImageFormat();
+            const bool formatOk = (format.image_channel_order == CL_R) && (format.image_channel_data_type == CL_UNORM_INT8);
+            if (!formatOk) {
+                return CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+            }
+            if (!image->isTiledImage) {
+                //VME only works with tiled images.
+                return CL_OUT_OF_RESOURCES;
+            }
+        }
+
+        // Only the extent of the source image is compared against the requested region.
+        const cl_image_desc &srcDesc = srcImg->getImageDesc();
+        const bool fitsInWidth = (inputRegion.x + offset.x) <= srcDesc.image_width;
+        const bool fitsInHeight = (inputRegion.y + offset.y) <= srcDesc.image_height;
+        if (!fitsInWidth || !fitsInHeight) {
+            return CL_INVALID_IMAGE_SIZE;
+        }
+
+        return CL_SUCCESS;
+    }
+
+    // Base VME validation: the images first, then the minimum sizes of the motion vector,
+    // prediction motion vector and residuals buffers. Returns the first failing status.
+    virtual cl_int validateVmeDispatch(Vec3<size_t> inputRegion, Vec3<size_t> offset, size_t blkNum, size_t blkMul) const {
+        const cl_int imageStatus = validateImages(inputRegion, offset);
+        if (imageStatus != CL_SUCCESS) {
+            return imageStatus;
+        }
+        const size_t numPredictors = 1;
+        const size_t motionVectorMin = blkNum * blkMul * 2 * sizeof(cl_short);
+        const size_t predictionMin = blkNum * numPredictors * 2 * sizeof(cl_short);
+        const size_t residualsMin = blkNum * blkMul * sizeof(cl_ushort);
+        const std::pair<int32_t, size_t> required[] = {{motionVectorBufferArgNum, motionVectorMin},
+                                                       {predictionMotionVectorBufferArgNum, predictionMin},
+                                                       {residualsArgNum, residualsMin}};
+        for (const auto &entry : required) {
+            if (!validateBufferSize(entry.first, entry.second)) {
+                return CL_INVALID_BUFFER_SIZE;
+            }
+        }
+        return CL_SUCCESS;
+    }
+
+  protected:
+    uint32_t heightArgNum; // kernel argument indices, looked up by name in the constructor
+    uint32_t widthArgNum;
+    uint32_t strideArgNum;
+    uint32_t acceleratorArgNum;
+    uint32_t srcImgArgNum;
+    uint32_t refImgArgNum;
+    int32_t motionVectorBufferArgNum; // buffer indices; validateBufferSize() treats -1 as "argument absent"
+    int32_t predictionMotionVectorBufferArgNum;
+    int32_t residualsArgNum;
+    Kernel *vmeKernel; // internal VME kernel, handed to populate() in the constructor
+};
+// Built-in op for "block_motion_estimate_intel"; adds nothing beyond VmeBuiltinDispatchInfoBuilder.
+template <typename HWFamily>
+class BuiltInOp<HWFamily, EBuiltInOps::VmeBlockMotionEstimateIntel> : public VmeBuiltinDispatchInfoBuilder<HWFamily> {
+  public:
+    BuiltInOp(BuiltIns &kernelsLib, Context &context, Device &device)
+        : VmeBuiltinDispatchInfoBuilder<HWFamily>(kernelsLib, context, device,
+                                                  EBuiltInOps::VmeBlockMotionEstimateIntel, "block_motion_estimate_intel") {
+    }
+};
+
+template <typename HWFamily>
+class AdvancedVmeBuiltinDispatchInfoBuilder : public VmeBuiltinDispatchInfoBuilder<HWFamily> {
+  public:
+    // Caches the indices of the additional arguments used by the advanced VME kernels.
+    AdvancedVmeBuiltinDispatchInfoBuilder(BuiltIns &kernelsLib, Context &context, Device &device, EBuiltInOps builtinOp, const char *kernelName)
+        : VmeBuiltinDispatchInfoBuilder<HWFamily>(kernelsLib, context, device, builtinOp, kernelName) {
+        auto &&info = this->vmeKernel->getKernelInfo();
+        flagsArgNum = info.getArgNumByName("flags");
+        intraSrcImgArgNum = info.getArgNumByName("intraSrcImg");
+        skipBlockTypeArgNum = info.getArgNumByName("skip_block_type");
+        searchCostPenaltyArgNum = info.getArgNumByName("search_cost_penalty");
+        searchCostPrecisionArgNum = info.getArgNumByName("search_cost_precision");
+        bidirWeightArgNum = info.getArgNumByName("bidir_weight");
+        predictorsBufferArgNum = info.getArgNumByName("predictors_buffer");
+        countMotionVectorBufferArgNum = info.getArgNumByName("count_motion_vector_buffer");
+        skipMotionVectorBufferArgNum = info.getArgNumByName("skip_motion_vector_buffer");
+        intraSearchPredictorModesArgNum = info.getArgNumByName("intra_search_predictor_modes");
+        skipResidualsArgNum = info.getArgNumByName("skip_residuals");
+        intraResidualsArgNum = info.getArgNumByName("intra_residuals");
+    }
+
+    // Mirrors the source image into the intraSrcImg argument, then defers to the base implementation.
+    bool setExplicitArg(uint32_t argIndex, size_t argSize, const void *argVal, cl_int &err) const override {
+        DEBUG_BREAK_IF(intraSrcImgArgNum == argIndex);
+        if (this->srcImgArgNum == argIndex) { // rebind also as media block image
+            this->vmeKernel->setArg(intraSrcImgArgNum, argSize, argVal);
+        }
+        return VmeBuiltinDispatchInfoBuilder<HWFamily>::setExplicitArg(argIndex, argSize, argVal, err);
+    }
+
+    virtual bool isBidirKernel() const { // selects 4 (true) vs 8 (false) skip checks per block in the size helpers
+        return false;
+    }
+
+    // Rejects the chroma-intra-predict flag and derives the skip block type from the "flags" argument.
+    // outSkipBlockType is left untouched when neither skip-block pattern matches.
+    bool validateFlags(uint32_t &outSkipBlockType) const {
+        const uint32_t flags = this->template getKernelArgByValValue<uint32_t>(flagsArgNum);
+        const bool chromaIntraRequested = (flags & CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL) == CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL;
+        if (chromaIntraRequested) {
+            return false;
+        }
+        if (flags == CL_ME_SKIP_BLOCK_TYPE_16x16_INTEL) {
+            outSkipBlockType = CL_ME_MB_TYPE_16x16_INTEL;
+        } else if ((flags & CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL) == CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL) {
+            outSkipBlockType = CL_ME_MB_TYPE_8x8_INTEL;
+        }
+        return true;
+    }
+
+    // Reads the optional "skip_block_type" argument into outSkipBlockType and accepts only the 16x16
+    // and 8x8 block types. Passes, leaving the output untouched, when the kernel has no such argument.
+    bool validateSkipBlockTypeArg(uint32_t &outSkipBlockType) const {
+        if (-1 == skipBlockTypeArgNum) {
+            return true;
+        }
+
+        const auto argIndex = static_cast<uint32_t>(skipBlockTypeArgNum);
+        outSkipBlockType = this->template getKernelArgByValValue<uint32_t>(argIndex);
+
+        switch (outSkipBlockType) {
+        case CL_ME_MB_TYPE_16x16_INTEL:
+        case CL_ME_MB_TYPE_8x8_INTEL:
+            return true;
+        default:
+            return false;
+        }
+    }
+
+    // Minimum size expected for the intra_search_predictor_modes buffer: 22 entries per macro-block.
+    size_t getIntraSearchPredictorModesBuffExpSize(size_t blkNum) const {
+        // 22 = 1 (16x16 luma block) + 4 (8x8 luma block) + 16 (4x4 luma block) + 1 (8x8 chroma block)
+        constexpr size_t entriesPerBlock = 1 + 4 + 16 + 1;
+        return blkNum * entriesPerBlock;
+    }
+
+    // Minimum size expected for the skip_motion_vector_buffer.
+    size_t getSkipMotionVectorBufferExpSize(uint32_t skipBlockType, size_t blkNum) const {
+        // Layout assumes 4 (for bidir) or 8 (otherwise) skip check MVs per MB, although 0 to 8 may be used.
+        // Each check is a vector of 1 (16x16 block) or 4 (8x8 block) MVs, two cl_short components per MV.
+        // The buffer may be null if all MBs in the frame have 0 skip check MVs, in which case VME skip
+        // checks are not performed. Row-major block layout; all MVs for a block are contiguous.
+        // The buffer size therefore depends on the block type and the frame size.
+        const size_t mvsPerCheck = (skipBlockType == CL_ME_MB_TYPE_16x16_INTEL) ? 1 : 4;
+        const size_t checksPerBlock = isBidirKernel() ? 4 : 8;
+        const size_t sizeOfMv = 2 * sizeof(cl_short);
+        return blkNum * checksPerBlock * mvsPerCheck * sizeOfMv;
+    }
+
+    size_t getSkipResidualsBuffExpSize(uint32_t skipBlockType, size_t blkNum) const {
+        /*  Expected minimum size of the output buffer of unsigned short SAD adjusted values
+            corresponding to the input skip check MVs (may be null if skip_motion_vector_buffer is null).
+            vector size is either 1 (16x16 block) or 4 (8x8 block); other block types are sized like 16x16
+            0 to 8 skip check residuals per MB
+            NOTE(review): the code sizes the layout for 4 (bidir kernel) or 8 (otherwise) residuals per MB
+            row major block layout; all MVs for a block are contiguous
+            buffer size depends on the block and frame size  */
+        int vectorSize = 1;
+        switch (skipBlockType) {
+        case CL_ME_MB_TYPE_16x16_INTEL:
+            vectorSize = 1;
+            break;
+        case CL_ME_MB_TYPE_8x8_INTEL:
+            vectorSize = 4;
+            break;
+        default:
+            break;
+        };
+
+        int numChecks = (isBidirKernel() ? 4 : 8);
+        size_t skipResidualsBuffExpSize = blkNum * vectorSize * numChecks * sizeof(cl_ushort);
+        return skipResidualsBuffExpSize;
+    }
+
+    // Minimum size expected for the intra_residuals buffer: one vector of 4 cl_ushort values per macro-block.
+    size_t getIntraResidualsBuffExpSize(size_t blkNum) const {
+        /*  output buffer of vectors of unsigned short SAD adjusted values
+            may be null, in which case the corresponding intra residuals are not returned
+            vector size is 4 = 1 (16x16 luma block) + 1 (8x8 luma block) + 1 (4x4 luma block) + 1 (8x8 chroma block)
+            1 vector per MB
+            buffer size depends on the frame size  */
+        constexpr size_t valuesPerBlock = 4;
+        return blkNum * sizeof(cl_ushort) * valuesPerBlock;
+    }
+
+    // Minimum size expected for the predictors_buffer: 8 predictors per macro-block, two cl_short each.
+    size_t getPredictorsBufferExpSize(size_t blkNum) const {
+        constexpr size_t numPredictors = 8;
+        return blkNum * numPredictors * 2 * sizeof(cl_short);
+    }
+    // Runs the base validation, then the flag, enum and buffer-size checks of the advanced kernels; returns the first failure.
+    cl_int validateVmeDispatch(Vec3<size_t> inputRegion, Vec3<size_t> offset, size_t blkNum, size_t blkMul) const override {
+        cl_int basicVmeValidationStatus = VmeBuiltinDispatchInfoBuilder<HWFamily>::validateVmeDispatch(inputRegion, offset, blkNum, blkMul);
+        if (basicVmeValidationStatus != CL_SUCCESS) {
+            return basicVmeValidationStatus;
+        }
+        // Starts as 16x16; validateFlags() and validateSkipBlockTypeArg() may overwrite it.
+        uint32_t skipBlockType = CL_ME_MB_TYPE_16x16_INTEL;
+        if (false == validateFlags(skipBlockType)) {
+            return CL_INVALID_KERNEL_ARGS;
+        }
+
+        if (false == validateSkipBlockTypeArg(skipBlockType)) {
+            return CL_OUT_OF_RESOURCES;
+        }
+        // Enum-valued arguments: each must currently hold one of the listed values (validateEnumArg passes for argNum == -1).
+        if (false == VmeBuiltinDispatchInfoBuilder<HWFamily>::template validateEnumArg<uint32_t>(searchCostPenaltyArgNum, CL_ME_COST_PENALTY_NONE_INTEL, CL_ME_COST_PENALTY_LOW_INTEL, CL_ME_COST_PENALTY_NORMAL_INTEL,
+                                                                                                 CL_ME_COST_PENALTY_HIGH_INTEL)) {
+            return CL_OUT_OF_RESOURCES;
+        }
+
+        if (false == VmeBuiltinDispatchInfoBuilder<HWFamily>::template validateEnumArg<uint32_t>(searchCostPrecisionArgNum, CL_ME_COST_PRECISION_QPEL_INTEL, CL_ME_COST_PRECISION_HPEL_INTEL, CL_ME_COST_PRECISION_PEL_INTEL,
+                                                                                                 CL_ME_COST_PRECISION_DPEL_INTEL)) {
+            return CL_OUT_OF_RESOURCES;
+        }
+
+        if (false == VmeBuiltinDispatchInfoBuilder<HWFamily>::template validateEnumArg<uint8_t>(bidirWeightArgNum, 0, CL_ME_BIDIR_WEIGHT_QUARTER_INTEL, CL_ME_BIDIR_WEIGHT_THIRD_INTEL, CL_ME_BIDIR_WEIGHT_HALF_INTEL,
+                                                                                                CL_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL, CL_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL)) {
+            return CL_INVALID_KERNEL_ARGS;
+        }
+        // Minimum sizes of the buffer arguments; validateBufferSize() passes trivially for absent or unset buffers.
+        std::pair<int32_t, size_t> bufferRequirements[] = {
+            std::make_pair(countMotionVectorBufferArgNum, (blkNum * 2 * sizeof(cl_short))),
+            std::make_pair(skipMotionVectorBufferArgNum, getSkipMotionVectorBufferExpSize(skipBlockType, blkNum)),
+            std::make_pair(intraSearchPredictorModesArgNum, getIntraSearchPredictorModesBuffExpSize(blkNum)),
+            std::make_pair(skipResidualsArgNum, getSkipResidualsBuffExpSize(skipBlockType, blkNum)),
+            std::make_pair(intraResidualsArgNum, getIntraResidualsBuffExpSize(blkNum)),
+            std::make_pair(predictorsBufferArgNum, getPredictorsBufferExpSize(blkNum))};
+        for (const auto &req : bufferRequirements) {
+            if (false == this->validateBufferSize(req.first, req.second)) {
+                return CL_INVALID_BUFFER_SIZE;
+            }
+        }
+
+        return CL_SUCCESS;
+    }
+
+  protected:
+    uint32_t flagsArgNum; // kernel argument indices, looked up by name in the constructor
+    int32_t skipBlockTypeArgNum; // compared against -1 (argument absent) in validateSkipBlockTypeArg()
+    uint32_t searchCostPenaltyArgNum;
+    uint32_t searchCostPrecisionArgNum;
+    int32_t bidirWeightArgNum;
+    int32_t predictorsBufferArgNum;
+    uint32_t countMotionVectorBufferArgNum; // NOTE(review): unsigned, yet passed to validateBufferSize(int32_t), which tests for -1 — confirm intended type
+    uint32_t skipMotionVectorBufferArgNum;
+    uint32_t intraSearchPredictorModesArgNum;
+    uint32_t skipResidualsArgNum;
+    uint32_t intraResidualsArgNum;
+    uint32_t intraSrcImgArgNum; // receives the source image again in setExplicitArg()
+};
+// Built-in op for "block_advanced_motion_estimate_check_intel"; additionally requires the count motion vector buffer.
+template <typename HWFamily>
+class BuiltInOp<HWFamily, EBuiltInOps::VmeBlockAdvancedMotionEstimateCheckIntel> : public AdvancedVmeBuiltinDispatchInfoBuilder<HWFamily> {
+  public:
+    BuiltInOp(BuiltIns &kernelsLib, Context &context, Device &device)
+        : AdvancedVmeBuiltinDispatchInfoBuilder<HWFamily>(kernelsLib, context, device, EBuiltInOps::VmeBlockAdvancedMotionEstimateCheckIntel,
+                                                          "block_advanced_motion_estimate_check_intel") {
+    }
+    // Parameter names follow the overridden signature: blkNum and blkMul are forwarded unchanged to the base class.
+    cl_int validateVmeDispatch(Vec3<size_t> inputRegion, Vec3<size_t> offset,
+                               size_t blkNum, size_t blkMul) const override {
+        cl_int basicAdvVmeValidationStatus = AdvancedVmeBuiltinDispatchInfoBuilder<HWFamily>::validateVmeDispatch(inputRegion, offset, blkNum, blkMul);
+        if (basicAdvVmeValidationStatus != CL_SUCCESS) {
+            return basicAdvVmeValidationStatus;
+        }
+        // The base size check passes for an unset buffer; this kernel requires the count buffer to be bound.
+        auto countMotionVectorBuff = castToObject<Buffer>((cl_mem)this->vmeKernel->getKernelArg(this->countMotionVectorBufferArgNum));
+        if (countMotionVectorBuff == nullptr) {
+            return CL_INVALID_BUFFER_SIZE;
+        }
+
+        return CL_SUCCESS;
+    }
+};
+// Built-in op for "block_advanced_motion_estimate_bidirectional_check_intel".
+template <typename HWFamily>
+class BuiltInOp<HWFamily, EBuiltInOps::VmeBlockAdvancedMotionEstimateBidirectionalCheckIntel> : public AdvancedVmeBuiltinDispatchInfoBuilder<HWFamily> {
+  public:
+    BuiltInOp(BuiltIns &kernelsLib, Context &context, Device &device)
+        : AdvancedVmeBuiltinDispatchInfoBuilder<HWFamily>(kernelsLib, context, device, EBuiltInOps::VmeBlockAdvancedMotionEstimateBidirectionalCheckIntel,
+                                                          "block_advanced_motion_estimate_bidirectional_check_intel") {
+    }
+    // Makes the skip-check size helpers of the base class use 4 checks per block instead of 8.
+    bool isBidirKernel() const override {
+        return true;
+    }
+};
+}
--- a/runtime/builtin_kernels_simulation/CMakeLists.txt
+++ b/runtime/builtin_kernels_simulation/CMakeLists.txt
@@ -0,0 +1,74 @@
+# Copyright (c) 2017, Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+# Sources that make up the built-in kernels simulation object library.
+set(BUILTIN_KERNELS_SIMULATION_SRCS
+    "${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt"
+    "${CMAKE_CURRENT_SOURCE_DIR}/opencl_c.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/opencl_c.h"
+    "${CMAKE_CURRENT_SOURCE_DIR}/scheduler_simulation.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/scheduler_simulation.inl"
+    "${CMAKE_CURRENT_SOURCE_DIR}/scheduler_simulation.h"
+    )
+
+# Strip warnings-as-errors and a few warning switches from the flags used in
+# this directory scope. The input is quoted on purpose: unquoted, an empty
+# CMAKE_CXX_FLAGS would expand to nothing, leaving string() one argument short
+# and aborting configuration.
+string(REPLACE "/WX" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+string(REGEX REPLACE "-Werror[^ \t\n]*" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+string(REPLACE "-Wsometimes-uninitialized" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+string(REPLACE "-Wsign-compare" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+string(REPLACE "-Wunused-variable" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+
+# Quoted so that an unset compiler id compares as an empty string instead of
+# producing a malformed if() expression.
+if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-everything")
+endif()
+
+if(COMPILER_SUPPORTS_CXX11)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+elseif(COMPILER_SUPPORTS_CXX0X)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++0x")
+endif()
+
+if(NOT MSVC)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fpermissive -fPIC")
+endif()
+
+ENABLE_WUD()
+
+list (APPEND HEADER_INCLUDES ${IGDRCL_SOURCE_DIR}/runtime ${UMKM_SHAREDDATA_INCLUDE_PATHS})
+
+
+# For every gen (MAX_GEN down to 0) that has supported platforms: record its
+# default platform as a compile definition, add its include directory and add
+# its gen-specific scheduler_simulation.cpp to the sources.
+foreach(GEN_NUM RANGE ${MAX_GEN} 0 -1)
+  GEN_CONTAINS_PLATFORMS("SUPPORTED" ${GEN_NUM} GENX_HAS_PLATFORMS)
+  if(${GENX_HAS_PLATFORMS})
+    list(APPEND DEFAULT_GEN_PLATFORMS_DEFITIONS DEFAULT_GEN${GEN_NUM}_PLATFORM=${DEFAULT_SUPPORTED_GEN${GEN_NUM}_PLATFORM})
+    list (APPEND HEADER_INCLUDES ${IGDRCL_SOURCE_DIR}/runtime/gen${GEN_NUM})
+    list (APPEND BUILTIN_KERNELS_SIMULATION_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/gen${GEN_NUM}/scheduler_simulation.cpp)
+  endif()
+endforeach()
+
+add_library(${BIKSIM_LIB_NAME} OBJECT ${BUILTIN_KERNELS_SIMULATION_SRCS})
+target_include_directories(${BIKSIM_LIB_NAME} BEFORE PRIVATE ${HEADER_INCLUDES})
+target_include_directories(${BIKSIM_LIB_NAME} PRIVATE
+	${KHRONOS_HEADERS_DIR}
+	${IGDRCL__IGC_INCLUDE_DIR}
+	${THIRD_PARTY_DIR}
+)
+set_target_properties(${BIKSIM_LIB_NAME} PROPERTIES FOLDER "built_ins")
+target_compile_definitions(${BIKSIM_LIB_NAME} PUBLIC ${SUPPORTED_GEN_FLAGS_DEFINITONS} ${DEFAULT_GEN_PLATFORMS_DEFITIONS})
--- a/runtime/builtin_kernels_simulation/gen8/scheduler_simulation.cpp
+++ b/runtime/builtin_kernels_simulation/gen8/scheduler_simulation.cpp
@@ -0,0 +1,105 @@
+/*
+* Copyright (c) 2017, Intel Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included
+* in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+* OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#include "CL/cl.h"
+#include "runtime/builtin_kernels_simulation/opencl_c.h"
+#include "runtime/builtin_kernels_simulation/scheduler_simulation.h"
+#include "runtime/builtin_kernels_simulation/scheduler_simulation.inl"
+#include "runtime/memory_manager/graphics_allocation.h"
+#include "runtime/gen8/hw_cmds.h"
+#include "runtime/execution_model/device_enqueue.h"
+
+using namespace OCLRT;
+using namespace BuiltinKernelsSimulation;
+
+// The scheduler sources below are textually included inside this namespace, so
+// everything they define ends up scoped to Gen8SchedulerSimulation.
+namespace Gen8SchedulerSimulation {
+
+// Defined before the scheduler sources are included further down.
+#define SCHEDULER_EMULATION
+
+// Forward declaration only; the definition is not in this file
+// (presumably provided by one of the sources included below - TODO confirm).
+uint GetNextPowerof2(uint number);
+
+// Returns the default profiling timer resolution from the capability table of
+// DEFAULT_GEN8_PLATFORM's hwInfo, converted to float.
+float __intel__getProfilingTimerResolution() {
+    return static_cast<float>(DEFAULT_GEN8_PLATFORM::hwInfo.capabilityTable.defaultProfilingTimerResolution);
+}
+
+#include "runtime/gen8/device_enqueue.h"
+#include "runtime/gen8/scheduler_definitions.h"
+#include "runtime/gen8/scheduler_igdrcl_built_in.inl"
+#include "runtime/scheduler/scheduler.cl"
+}
+
+namespace BuiltinKernelsSimulation {
+
+// BDW specialization of the simulated scheduler entry point.
+//
+// Records the calling thread's id -> index mapping in threadIDToLocalIDmap,
+// busy-waits until conditionReady becomes true and then runs
+// Gen8SchedulerSimulation::SchedulerParallel20 on the underlying buffers of the
+// given allocations. debugQueue may be nullptr, in which case a null debug
+// buffer is passed on; all other allocations are dereferenced unconditionally.
+template <>
+void SchedulerSimulation<BDWFamily>::startScheduler(uint32_t index,
+                                                    GraphicsAllocation *queue,
+                                                    GraphicsAllocation *commandsStack,
+                                                    GraphicsAllocation *eventsPool,
+                                                    GraphicsAllocation *secondaryBatchBuffer,
+                                                    GraphicsAllocation *dsh,
+                                                    GraphicsAllocation *reflectionSurface,
+                                                    GraphicsAllocation *queueStorageBuffer,
+                                                    GraphicsAllocation *ssh,
+                                                    GraphicsAllocation *debugQueue) {
+
+    // The map is shared and keyed by std::this_thread::get_id(), i.e. this
+    // function is meant to run on several threads at once. std::map does not
+    // tolerate concurrent inserts, so the registration is serialized.
+    {
+        std::lock_guard<std::mutex> registrationLock(gMutex);
+        threadIDToLocalIDmap.insert(std::make_pair(std::this_thread::get_id(), index));
+    }
+
+    // NOTE(review): conditionReady is a plain bool polled without
+    // synchronization; consider std::atomic<bool> at its definition.
+    while (!conditionReady) {
+    }
+
+    Gen8SchedulerSimulation::SchedulerParallel20((IGIL_CommandQueue *)queue->getUnderlyingBuffer(),
+                                                 (uint *)commandsStack->getUnderlyingBuffer(),
+                                                 (IGIL_EventPool *)eventsPool->getUnderlyingBuffer(),
+                                                 (uint *)secondaryBatchBuffer->getUnderlyingBuffer(),
+                                                 (char *)dsh->getUnderlyingBuffer(),
+                                                 (IGIL_KernelDataHeader *)reflectionSurface->getUnderlyingBuffer(),
+                                                 (uint *)queueStorageBuffer->getUnderlyingBuffer(),
+                                                 (char *)ssh->getUnderlyingBuffer(),
+                                                 debugQueue != nullptr ? (DebugDataBuffer *)debugQueue->getUnderlyingBuffer() : nullptr);
+}
+
+// BDW specialization: forwards every argument, unchanged and in the same order,
+// to Gen8SchedulerSimulation::patchGpGpuWalker.
+template <>
+void SchedulerSimulation<BDWFamily>::patchGpGpuWalker(uint secondLevelBatchOffset,
+                                                      __global uint *secondaryBatchBuffer,
+                                                      uint interfaceDescriptorOffset,
+                                                      uint simdSize,
+                                                      uint totalLocalWorkSize,
+                                                      uint3 dimSize,
+                                                      uint3 startPoint,
+                                                      uint numberOfHwThreadsPerWg,
+                                                      uint indirectPayloadSize,
+                                                      uint ioHoffset) {
+    Gen8SchedulerSimulation::patchGpGpuWalker(secondLevelBatchOffset,
+                                              secondaryBatchBuffer,
+                                              interfaceDescriptorOffset,
+                                              simdSize,
+                                              totalLocalWorkSize,
+                                              dimSize,
+                                              startPoint,
+                                              numberOfHwThreadsPerWg,
+                                              indirectPayloadSize,
+                                              ioHoffset);
+}
+
+template class SchedulerSimulation<BDWFamily>;
+
+} // namespace BuiltinKernelsSimulation
--- a/runtime/builtin_kernels_simulation/gen9/scheduler_simulation.cpp
+++ b/runtime/builtin_kernels_simulation/gen9/scheduler_simulation.cpp
@@ -0,0 +1,104 @@
+/*
+* Copyright (c) 2017, Intel Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included
+* in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+* OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#include "CL/cl.h"
+#include "runtime/builtin_kernels_simulation/opencl_c.h"
+#include "runtime/builtin_kernels_simulation/scheduler_simulation.h"
+#include "runtime/builtin_kernels_simulation/scheduler_simulation.inl"
+#include "runtime/memory_manager/graphics_allocation.h"
+#include "runtime/gen9/hw_cmds.h"
+#include "runtime/execution_model/device_enqueue.h"
+
+using namespace OCLRT;
+using namespace BuiltinKernelsSimulation;
+
+namespace OCLRT {
+struct SKLFamily;
+}
+
+// The scheduler sources below are textually included inside this namespace, so
+// everything they define ends up scoped to Gen9SchedulerSimulation.
+namespace Gen9SchedulerSimulation {
+
+// Defined before the scheduler sources are included further down.
+#define SCHEDULER_EMULATION
+
+// Returns the default profiling timer resolution from the capability table of
+// DEFAULT_GEN9_PLATFORM's hwInfo, converted to float.
+float __intel__getProfilingTimerResolution() {
+    return static_cast<float>(DEFAULT_GEN9_PLATFORM::hwInfo.capabilityTable.defaultProfilingTimerResolution);
+}
+
+#include "runtime/gen9/device_enqueue.h"
+#include "runtime/gen9/scheduler_definitions.h"
+#include "runtime/gen9/scheduler_igdrcl_built_in.inl"
+#include "runtime/scheduler/scheduler.cl"
+}
+
+namespace BuiltinKernelsSimulation {
+
+// SKL specialization of the simulated scheduler entry point.
+//
+// Records the calling thread's id -> index mapping in threadIDToLocalIDmap,
+// busy-waits until conditionReady becomes true and then runs
+// Gen9SchedulerSimulation::SchedulerParallel20 on the underlying buffers of the
+// given allocations. debugQueue may be nullptr, in which case a null debug
+// buffer is passed on; all other allocations are dereferenced unconditionally.
+template <>
+void SchedulerSimulation<SKLFamily>::startScheduler(uint32_t index,
+                                                    GraphicsAllocation *queue,
+                                                    GraphicsAllocation *commandsStack,
+                                                    GraphicsAllocation *eventsPool,
+                                                    GraphicsAllocation *secondaryBatchBuffer,
+                                                    GraphicsAllocation *dsh,
+                                                    GraphicsAllocation *reflectionSurface,
+                                                    GraphicsAllocation *queueStorageBuffer,
+                                                    GraphicsAllocation *ssh,
+                                                    GraphicsAllocation *debugQueue) {
+
+    // The map is shared and keyed by std::this_thread::get_id(), i.e. this
+    // function is meant to run on several threads at once. std::map does not
+    // tolerate concurrent inserts, so the registration is serialized.
+    {
+        std::lock_guard<std::mutex> registrationLock(gMutex);
+        threadIDToLocalIDmap.insert(std::make_pair(std::this_thread::get_id(), index));
+    }
+
+    // NOTE(review): conditionReady is a plain bool polled without
+    // synchronization; consider std::atomic<bool> at its definition.
+    while (!conditionReady) {
+    }
+
+    Gen9SchedulerSimulation::SchedulerParallel20((IGIL_CommandQueue *)queue->getUnderlyingBuffer(),
+                                                 (uint *)commandsStack->getUnderlyingBuffer(),
+                                                 (IGIL_EventPool *)eventsPool->getUnderlyingBuffer(),
+                                                 (uint *)secondaryBatchBuffer->getUnderlyingBuffer(),
+                                                 (char *)dsh->getUnderlyingBuffer(),
+                                                 (IGIL_KernelDataHeader *)reflectionSurface->getUnderlyingBuffer(),
+                                                 (uint *)queueStorageBuffer->getUnderlyingBuffer(),
+                                                 (char *)ssh->getUnderlyingBuffer(),
+                                                 debugQueue != nullptr ? (DebugDataBuffer *)debugQueue->getUnderlyingBuffer() : nullptr);
+}
+// SKL specialization: forwards every argument, unchanged and in the same order,
+// to Gen9SchedulerSimulation::patchGpGpuWalker.
+template <>
+void SchedulerSimulation<SKLFamily>::patchGpGpuWalker(uint secondLevelBatchOffset,
+                                                      __global uint *secondaryBatchBuffer,
+                                                      uint interfaceDescriptorOffset,
+                                                      uint simdSize,
+                                                      uint totalLocalWorkSize,
+                                                      uint3 dimSize,
+                                                      uint3 startPoint,
+                                                      uint numberOfHwThreadsPerWg,
+                                                      uint indirectPayloadSize,
+                                                      uint ioHoffset) {
+    Gen9SchedulerSimulation::patchGpGpuWalker(secondLevelBatchOffset,
+                                              secondaryBatchBuffer,
+                                              interfaceDescriptorOffset,
+                                              simdSize,
+                                              totalLocalWorkSize,
+                                              dimSize,
+                                              startPoint,
+                                              numberOfHwThreadsPerWg,
+                                              indirectPayloadSize,
+                                              ioHoffset);
+}
+template class SchedulerSimulation<SKLFamily>;
+} // namespace BuiltinKernelsSimulation
--- a/runtime/builtin_kernels_simulation/opencl_c.cpp
+++ b/runtime/builtin_kernels_simulation/opencl_c.cpp
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <cstdint>
+#include "runtime/helpers/string.h"
+#include "CL/cl.h"
+#include "opencl_c.h"
+
+namespace BuiltinKernelsSimulation {
+
+#define SCHEDULER_EMULATION 1
+
+// globals
+// Mutex taken by the atomic_* emulation templates declared in opencl_c.h.
+std::mutex gMutex;
+// Per-dimension ids/sizes used by get_global_id/get_local_id/get_local_size
+// when no thread mapping has been registered ("loop iteration" mode).
+unsigned int globalID[3];
+unsigned int localID[3];
+unsigned int localSize[3];
+
+// Maps a host thread id to the work-item index it simulates; when non-empty,
+// get_local_id/get_global_id derive their result from it instead of the arrays above.
+std::map<std::thread::id, uint32_t> threadIDToLocalIDmap;
+
+// Barrier object entered by barrier(); starts out null.
+SynchronizationBarrier *pGlobalBarrier = nullptr;
+
+// Component-wise sum of two uint4 vectors.
+uint4 operator+(uint4 const &a, uint4 const &b) {
+    return uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
+}
+
+// Component-wise sum of two int4 vectors.
+int4 operator+(int4 const &a, int4 const &b) {
+    return int4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
+}
+
+// Emulates OpenCL get_local_id().
+//
+// When thread mappings are registered, the result is the calling thread's
+// mapped index modulo 24 (dim is ignored in that mode); otherwise it is
+// localID[dim]. dim is not range-checked.
+uint get_local_id(int dim) {
+    // use thread id
+    if (!threadIDToLocalIDmap.empty()) {
+        // find() instead of operator[]: a lookup must not insert into the
+        // shared map. A thread that was never registered reads as index 0,
+        // which is the value operator[] would have default-inserted.
+        auto entry = threadIDToLocalIDmap.find(std::this_thread::get_id());
+        return entry != threadIDToLocalIDmap.end() ? entry->second % 24 : 0u;
+    }
+    // use id from loop iteration
+    return localID[dim];
+}
+
+// Emulates OpenCL get_global_id().
+//
+// When thread mappings are registered, the result is the calling thread's
+// mapped index (dim is ignored in that mode); otherwise it is globalID[dim].
+// dim is not range-checked.
+uint get_global_id(int dim) {
+    // use thread id
+    if (!threadIDToLocalIDmap.empty()) {
+        // find() instead of operator[]: a lookup must not insert into the
+        // shared map. A thread that was never registered reads as index 0,
+        // which is the value operator[] would have default-inserted.
+        auto entry = threadIDToLocalIDmap.find(std::this_thread::get_id());
+        return entry != threadIDToLocalIDmap.end() ? entry->second : 0u;
+    }
+    // use id from loop iteration
+    return globalID[dim];
+}
+
+// Emulates OpenCL get_local_size(): returns localSize[dim] (dim is not range-checked).
+uint get_local_size(int dim) {
+    return localSize[dim];
+}
+
+// Emulates OpenCL get_num_groups(): returns NUM_OF_THREADS / 24 regardless of dim.
+uint get_num_groups(int dim) {
+    return NUM_OF_THREADS / 24;
+}
+
+// Emulates OpenCL get_group_id(): the global id for dim divided by 24.
+uint get_group_id(int dim) {
+    return get_global_id(dim) / 24;
+}
+
+// Emulates OpenCL barrier(): enters the global SynchronizationBarrier.
+// The argument (fence flags in OpenCL) is ignored. pGlobalBarrier is
+// dereferenced without a null check, so it must have been created beforehand.
+void barrier(int x) {
+    pGlobalBarrier->enter();
+
+    // int LID = get_local_id(0);
+    // volatile so the increment below is not optimized away and stays a usable breakpoint location
+    volatile int BreakPointHere = 0;
+
+    // PUT BREAKPOINT HERE to stop after each barrier
+    BreakPointHere++;
+}
+
+// Emulates OpenCL read_imageui() on the simulation's image struct.
+//
+// The pixel is located at ((z * height + y) * width + x) * bytesPerChannel *
+// channels bytes into im->ptr. Each channel (bytesPerChannel bytes) is copied
+// into the low bytes of the next 32-bit component of the result; components
+// that are not read keep their initial value {0, 0, 0, 1}. Coordinates are not
+// range-checked against the image dimensions.
+uint4 read_imageui(image *im, int4 coord) {
+    uint4 color = {0, 0, 0, 1};
+
+    uint offset = ((coord.z * im->height + coord.y) * im->width + coord.x) * im->bytesPerChannel * im->channels;
+
+    const char *src = &im->ptr[offset];
+    char *colorDst = (char *)&color;
+    // Bytes still available in `color` from colorDst onwards. Tracking it keeps
+    // the memcpy_s bound meaningful as colorDst advances and stops the loop
+    // before a channel count above 4 could write past the result.
+    size_t remaining = sizeof(uint4);
+
+    for (uint i = 0; i < im->channels && remaining >= sizeof(uint); i++) {
+        memcpy_s(colorDst, remaining, src, im->bytesPerChannel);
+        src += im->bytesPerChannel;
+        colorDst += sizeof(uint);
+        remaining -= sizeof(uint);
+    }
+    return color;
+}
+
+// Emulates OpenCL write_imageui() on the simulation's image struct.
+//
+// The pixel is located at ((z * height + y) * width + x) * bytesPerChannel *
+// channels bytes into im->ptr. For each channel, the low bytesPerChannel bytes
+// of the matching 32-bit component of `color` are stored. Returns the color
+// that was passed in (the OpenCL built-in itself returns nothing).
+uint4 write_imageui(image *im, uint4 coord, uint4 color) {
+    uint offset = ((coord.z * im->height + coord.y) * im->width + coord.x) * im->bytesPerChannel * im->channels;
+
+    char *dst = &im->ptr[offset];
+    const char *colorSrc = (const char *)&color;
+
+    size_t size = im->width * im->height * im->depth * im->bytesPerChannel * im->channels;
+    // Bytes left in the image from dst onwards; zero when the pixel lies outside
+    // the image, which makes memcpy_s reject the copy instead of overrunning.
+    size_t remaining = offset < size ? size - offset : 0;
+
+    // At most four components exist in `color`, so never read beyond them.
+    for (uint i = 0; i < im->channels && i < 4; i++) {
+        memcpy_s(dst, remaining, colorSrc, im->bytesPerChannel);
+        dst += im->bytesPerChannel;
+        remaining = remaining > im->bytesPerChannel ? remaining - im->bytesPerChannel : 0;
+        colorSrc += sizeof(uint);
+    }
+    // Previously this reinterpreted the bytes following the written pixel as a
+    // uint4, reading past the end of the image for the last pixel.
+    return color;
+}
+
+// Emulates OpenCL convert_uchar_sat(): values above 255 saturate to 255
+// instead of wrapping around.
+uchar convert_uchar_sat(uint c) {
+    return (uchar)(c > 0xFFu ? 0xFFu : c);
+}
+
+// Emulates OpenCL convert_ushort_sat(): values above 65535 saturate to 65535
+// instead of wrapping around.
+ushort convert_ushort_sat(uint c) {
+    return (ushort)(c > 0xFFFFu ? 0xFFFFu : c);
+}
+
+} // namespace BuiltinKernelsSimulation
--- a/runtime/builtin_kernels_simulation/opencl_c.h
+++ b/runtime/builtin_kernels_simulation/opencl_c.h
@@ -0,0 +1,304 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+#include <mutex>
+#include <condition_variable>
+#include <map>
+#include <thread>
+#include <string.h>
+#include <cstdint>
+
+// OpenCL Types
+typedef uint32_t uint;
+typedef uint8_t uchar;
+typedef uint16_t ushort;
+typedef uint64_t ulong;
+
+namespace BuiltinKernelsSimulation {
+
+// number of threads in wkg
+#define NUM_OF_THREADS 24
+
+#define CLK_GLOBAL_MEM_FENCE 1
+#define CLK_LOCAL_MEM_FENCE 2
+
+// Reusable rendezvous point for a fixed number of threads.
+//
+// Each enter() call blocks until `count` threads (the constructor argument)
+// have entered; the last arrival re-arms the barrier for the next round and
+// wakes everyone up.
+class SynchronizationBarrier {
+  public:
+    SynchronizationBarrier(int count)
+        : m_Count(count), m_InitialCount(count), m_BarrierCounter(0) {
+    }
+
+    ~SynchronizationBarrier() {
+    }
+
+    void enter() {
+        std::unique_lock<std::mutex> guard(m_Mutex);
+
+        --m_Count;
+
+        // Round this thread arrived in; a change of m_BarrierCounter means the round completed.
+        const unsigned int arrivalRound = m_BarrierCounter;
+
+        if (m_Count <= 0) {
+            // Last thread of the round: re-arm and release the waiters.
+            m_Count = m_InitialCount;
+            m_BarrierCounter++;
+            m_AllHitBarrierCondition.notify_all();
+            return;
+        }
+
+        // The predicate form re-checks after every wake-up, so spurious wake-ups are harmless.
+        m_AllHitBarrierCondition.wait(guard, [this, arrivalRound] { return arrivalRound != m_BarrierCounter; });
+    }
+
+  private:
+    std::mutex m_Mutex;
+    std::condition_variable m_AllHitBarrierCondition;
+    int m_Count;                   // threads still missing in the current round
+    const int m_InitialCount;      // number of threads per round
+    unsigned int m_BarrierCounter; // number of completed rounds
+};
+
+// globals
+extern std::mutex gMutex;
+extern unsigned int globalID[3];
+extern unsigned int localID[3];
+extern unsigned int localSize[3];
+extern std::map<std::thread::id, uint32_t> threadIDToLocalIDmap;
+extern SynchronizationBarrier *pGlobalBarrier;
+
+// Two-component unsigned vector (stand-in for OpenCL uint2); zero by default.
+typedef struct taguint2 {
+    taguint2(uint x, uint y) : x(x), y(y) {}
+    taguint2() : x(0), y(0) {}
+    uint x;
+    uint y;
+} uint2;
+
+// Three-component unsigned vector (stand-in for OpenCL uint3); zero by default.
+typedef struct taguint3 {
+    taguint3(uint x, uint y, uint z) : x(x), y(y), z(z) {}
+    taguint3() : x(0), y(0), z(0) {}
+    uint x;
+    uint y;
+    uint z;
+} uint3;
+
+// Four-component unsigned vector (stand-in for OpenCL uint4); no default constructor.
+typedef struct taguint4 {
+    taguint4(uint x, uint y, uint z, uint w) : x(x), y(y), z(z), w(w) {}
+    uint x;
+    uint y;
+    uint z;
+    uint w;
+} uint4;
+
+// Two-component signed vector (stand-in for OpenCL int2); no default constructor.
+typedef struct tagint2 {
+    tagint2(int x, int y) : x(x), y(y) {}
+    int x;
+    int y;
+} int2;
+
+// Three-component signed vector (stand-in for OpenCL int3); no default constructor.
+typedef struct tagint3 {
+    tagint3(int x, int y, int z) : x(x), y(y), z(z) {}
+    int x;
+    int y;
+    int z;
+} int3;
+
+// Four-component signed vector (stand-in for OpenCL int4); no default constructor.
+typedef struct tagint4 {
+    tagint4(int x, int y, int z, int w) : x(x), y(y), z(z), w(w) {}
+    int x;
+    int y;
+    int z;
+    int w;
+} int4;
+
+// Two-component unsigned short vector (stand-in for OpenCL ushort2); no default constructor.
+typedef struct tagushort2 {
+    tagushort2(ushort x, ushort y) : x(x), y(y) {}
+    unsigned short x;
+    unsigned short y;
+} ushort2;
+
+// Plain storage for eight unsigned shorts (stand-in for OpenCL ushort8); no component accessors.
+typedef struct tagushort8 {
+    unsigned short xxx[8];
+} ushort8;
+
+// Plain storage for sixteen unsigned shorts (stand-in for OpenCL ushort16); no component accessors.
+typedef struct tagushort16 {
+    unsigned short xxx[16];
+} ushort16;
+
+uint4 operator+(uint4 const &a, uint4 const &b);
+int4 operator+(int4 const &a, int4 const &b);
+
+// Host-side image descriptor used by read_imageui/write_imageui, which address
+// a pixel at ((z * height + y) * width + x) * bytesPerChannel * channels bytes
+// from ptr.
+typedef struct tagimage {
+    char *ptr;            // base address of the pixel data
+    uint width;           // extent in x, in pixels
+    uint height;          // extent in y, in pixels
+    uint depth;           // extent in z, in pixels
+    uint bytesPerChannel; // bytes stored per channel
+    uint channels;        // channels per pixel
+} image;
+
+// images as pointer
+typedef image *image1d_t;
+typedef image *image2d_t;
+typedef image *image3d_t;
+
+// OpenCL keywords
+// The address-space, kernel and access qualifiers expand to nothing, and any
+// __attribute__((...)) is swallowed; queue_t becomes a plain void pointer.
+// Presumably this lets OpenCL C kernel sources be compiled as C++ - confirm
+// against the sources that include this header.
+#define __global
+#define __local
+#define __private
+#define __kernel
+#define __attribute__(...)
+#define __read_only
+#define __write_only
+#define queue_t void *
+
+// Stand-in for OpenCL clk_event_t: a 32-bit value that converts to and from
+// uint implicitly, and to void * explicitly. Construction from a pointer keeps
+// only the low 32 bits of the address.
+struct clk_event_t {
+    clk_event_t() : value(0) {}
+
+    clk_event_t(void *v) : value(static_cast<uint>(reinterpret_cast<uintptr_t>(v))) {}
+
+    explicit operator void *() const {
+        const uintptr_t widened = static_cast<uintptr_t>(value);
+        return reinterpret_cast<void *>(widened);
+    }
+
+    operator uint() {
+        return value;
+    }
+
+    void operator=(uint input) {
+        value = input;
+    }
+
+    uint value;
+};
+
+// OpenCL builtins
+// Emulated as a C-style value cast (not a bit reinterpretation). The operand is
+// parenthesized so the cast applies to the whole expression passed as `var`,
+// not just to its first operand.
+#define __builtin_astype(var, type) ((type)(var))
+
+// Scalar emulation of OpenCL select(a, b, c): yields b when c is non-zero, a
+// otherwise. Every argument is parenthesized so that expressions containing
+// low-precedence operators (e.g. a nested ?:) are not re-associated.
+#define select(a, b, c) ((c) ? (b) : (a))
+
+uint get_local_id(int dim);
+uint get_global_id(int dim);
+uint get_local_size(int dim);
+uint get_num_groups(int dim);
+uint get_group_id(int dim);
+void barrier(int x);
+uint4 read_imageui(image *im, int4 coord);
+uint4 write_imageui(image *im, uint4 coord, uint4 color);
+uchar convert_uchar_sat(uint c);
+ushort convert_ushort_sat(uint c);
+
+// Declares a local variable __LOCAL_ID__ in the enclosing scope and sets it to
+// get_local_id(0). Expands to two statements, so it must not be used as the
+// body of an unbraced if/for.
+#define EMULATION_ENTER_FUNCTION() \
+    uint __LOCAL_ID__ = 0;         \
+    __LOCAL_ID__ = get_local_id(0);
+
+// Emulated atomic exchange: stores val (converted to TYPE) into *dest while
+// holding gMutex. Unlike the OpenCL built-in, nothing is returned.
+template <class TYPE, class TYPE2>
+void atomic_xchg(TYPE *dest, TYPE2 val) {
+    std::lock_guard<std::mutex> guard(gMutex);
+    *dest = (TYPE)val;
+}
+
+// Emulated atomic add: adds second (converted to TYPE) to *first while holding
+// gMutex and returns the value *first had before the addition.
+template <class TYPE, class TYPE2>
+TYPE atomic_add(TYPE *first, TYPE2 second) {
+    std::lock_guard<std::mutex> guard(gMutex);
+    TYPE previous = *first;
+    *first = (TYPE)(previous + (TYPE)second);
+    return previous;
+}
+
+// Emulated atomic subtract: subtracts second from *first while holding gMutex
+// and returns the value *first had before the subtraction.
+template <class TYPE, class TYPE2>
+TYPE atomic_sub(TYPE *first, TYPE2 second) {
+    std::lock_guard<std::mutex> guard(gMutex);
+    TYPE previous = *first;
+    *first = previous - second;
+    return previous;
+}
+
+// Emulated atomic increment: adds 1 to *first while holding gMutex and returns
+// the value *first had before the increment.
+template <class TYPE>
+TYPE atomic_inc(TYPE *first) {
+    std::lock_guard<std::mutex> guard(gMutex);
+    TYPE previous = *first;
+    *first = previous + 1;
+    return previous;
+}
+
+// Emulated atomic decrement: subtracts 1 from *first while holding gMutex and
+// returns the value *first had before the decrement.
+template <class TYPE>
+TYPE atomic_dec(TYPE *first) {
+    std::lock_guard<std::mutex> guard(gMutex);
+    TYPE previous = *first;
+    *first = previous - 1;
+    return previous;
+}
+
+// Emulated atomic minimum: while holding gMutex, stores the smaller of *first
+// and second (converted to TYPE) into *first and returns the previous *first.
+template <class TYPE, class TYPE2>
+TYPE atomic_min(TYPE *first, TYPE2 second) {
+    std::lock_guard<std::mutex> guard(gMutex);
+    TYPE previous = *first;
+    *first = (TYPE)((TYPE)second < previous ? (TYPE)second : previous);
+    return previous;
+}
+}
--- a/runtime/builtin_kernels_simulation/scheduler_simulation.cpp
+++ b/runtime/builtin_kernels_simulation/scheduler_simulation.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "runtime/builtin_kernels_simulation/scheduler_simulation.h"
+#include "runtime/builtin_kernels_simulation/opencl_c.h"
+
+#include <thread>
+
+using namespace std;
+using namespace OCLRT;
+
+namespace BuiltinKernelsSimulation {
+
+// Start flag: SchedulerSimulation<...>::startScheduler busy-waits until it is true.
+// NOTE(review): a plain bool polled in a spin loop has no synchronization;
+// std::atomic<bool> looks like the intended type - confirm against the writer.
+bool conditionReady = false;
+// Storage for NUM_OF_THREADS thread objects (presumably the workers that run
+// startScheduler - confirm in runSchedulerSimulation).
+std::thread threads[NUM_OF_THREADS];
+
+} // namespace BuiltinKernelsSimulation
--- a/runtime/builtin_kernels_simulation/scheduler_simulation.h
+++ b/runtime/builtin_kernels_simulation/scheduler_simulation.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#pragma once
+#include <cstdint>
+#include <thread>
+
+#include "runtime/builtin_kernels_simulation/opencl_c.h"
+namespace OCLRT {
+class GraphicsAllocation;
+}
+
+namespace BuiltinKernelsSimulation {
+
+extern bool conditionReady;
+extern std::thread threads[];
+
+// Host-side simulation of the device scheduler kernel for one GPU family.
+// Only declarations live here: startScheduler and patchGpGpuWalker are
+// specialized per family in gen*/scheduler_simulation.cpp, and
+// cleanSchedulerSimulation/initializeSchedulerSimulation are defined in
+// scheduler_simulation.inl.
+template <typename GfxFamily>
+class SchedulerSimulation {
+  public:
+    // Definition not visible in this header.
+    void runSchedulerSimulation(OCLRT::GraphicsAllocation *queue,
+                                OCLRT::GraphicsAllocation *commandsStack,
+                                OCLRT::GraphicsAllocation *eventsPool,
+                                OCLRT::GraphicsAllocation *secondaryBatchBuffer,
+                                OCLRT::GraphicsAllocation *dsh,
+                                OCLRT::GraphicsAllocation *reflectionSurface,
+                                OCLRT::GraphicsAllocation *queueStorageBuffer,
+                                OCLRT::GraphicsAllocation *ssh,
+                                OCLRT::GraphicsAllocation *debugQueue);
+
+    // Clears the thread-id map and deletes the global barrier.
+    void cleanSchedulerSimulation();
+
+    // Per-thread entry point: registers the calling thread under `index`, waits
+    // for conditionReady and then runs the family's scheduler kernel on the
+    // underlying buffers of the allocations. debugQueue may be nullptr.
+    static void startScheduler(uint32_t index,
+                               OCLRT::GraphicsAllocation *queue,
+                               OCLRT::GraphicsAllocation *commandsStack,
+                               OCLRT::GraphicsAllocation *eventsPool,
+                               OCLRT::GraphicsAllocation *secondaryBatchBuffer,
+                               OCLRT::GraphicsAllocation *dsh,
+                               OCLRT::GraphicsAllocation *reflectionSurface,
+                               OCLRT::GraphicsAllocation *queueStorageBuffer,
+                               OCLRT::GraphicsAllocation *ssh,
+                               OCLRT::GraphicsAllocation *debugQueue);
+
+    void initializeSchedulerSimulation(OCLRT::GraphicsAllocation *queue,
+                                       OCLRT::GraphicsAllocation *commandsStack,
+                                       OCLRT::GraphicsAllocation *eventsPool,
+                                       OCLRT::GraphicsAllocation *secondaryBatchBuffer,
+                                       OCLRT::GraphicsAllocation *dsh,
+                                       OCLRT::GraphicsAllocation *reflectionSurface,
+                                       OCLRT::GraphicsAllocation *queueStorageBuffer,
+                                       OCLRT::GraphicsAllocation *ssh,
+                                       OCLRT::GraphicsAllocation *debugQueue);
+
+    // Forwards all arguments to the family-specific patchGpGpuWalker that comes
+    // from the included scheduler sources.
+    static void patchGpGpuWalker(uint secondLevelBatchOffset,
+                                 __global uint *secondaryBatchBuffer,
+                                 uint interfaceDescriptorOffset,
+                                 uint simdSize,
+                                 uint totalLocalWorkSize,
+                                 uint3 dimSize,
+                                 uint3 startPoint,
+                                 uint numberOfHwThreadsPerWg,
+                                 uint indirectPayloadSize,
+                                 uint ioHoffset);
+    // Per-family flags; initial values are set by the definitions that follow the class.
+    static bool enabled;
+    static bool simulationRun;
+};
+
+// `enabled` starts out true for every family.
+template <typename GfxFamily>
+bool SchedulerSimulation<GfxFamily>::enabled = true;
+
+// `simulationRun` starts out false for every family.
+template <typename GfxFamily>
+bool SchedulerSimulation<GfxFamily>::simulationRun = false;
+
+} // namespace BuiltinKernelsSimulation
--- a/runtime/builtin_kernels_simulation/scheduler_simulation.inl
+++ b/runtime/builtin_kernels_simulation/scheduler_simulation.inl
@@ -0,0 +1,112 @@
+/*
+* Copyright (c) 2017, Intel Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included
+* in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+* OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#include "runtime/memory_manager/graphics_allocation.h"
+#include "runtime/builtin_kernels_simulation/scheduler_simulation.h"
+
+#include <cstdint>
+#include <mutex>
+#include <thread>
+
+using namespace std;
+using namespace OCLRT;
+
+namespace BuiltinKernelsSimulation {
+
+// Tears down the per-run simulation state: forgets the thread-id to local-id
+// mapping and destroys the global barrier.
+template <typename GfxFamily>
+void SchedulerSimulation<GfxFamily>::cleanSchedulerSimulation() {
+    threadIDToLocalIDmap.clear();
+    delete pGlobalBarrier;
+    // Reset the pointer so a repeated clean-up is a harmless delete of nullptr
+    // instead of a double free, and stale readers cannot reach freed memory.
+    pGlobalBarrier = nullptr;
+}
+
+// Prepares the shared simulation state and launches the worker threads.
+// Threads with index 1..NUM_OF_THREADS-1 each run startScheduler() with their index
+// and the same set of allocations; index 0 is not started here.
+// NOTE(review): pGlobalBarrier is overwritten without releasing a previous barrier, so
+// this presumably relies on cleanSchedulerSimulation() having run in between - confirm.
+template <typename GfxFamily>
+void SchedulerSimulation<GfxFamily>::initializeSchedulerSimulation(GraphicsAllocation *queue,
+                                                                   GraphicsAllocation *commandsStack,
+                                                                   GraphicsAllocation *eventsPool,
+                                                                   GraphicsAllocation *secondaryBatchBuffer,
+                                                                   GraphicsAllocation *dsh,
+                                                                   GraphicsAllocation *reflectionSurface,
+                                                                   GraphicsAllocation *queueStorageBuffer,
+                                                                   GraphicsAllocation *ssh,
+                                                                   GraphicsAllocation *debugQueue) {
+
+    // The simulated local size is one-dimensional: NUM_OF_THREADS x 1 x 1.
+    localSize[0] = NUM_OF_THREADS;
+    localSize[1] = 1;
+    localSize[2] = 1;
+
+    // Start from an empty thread-id map and a new barrier constructed with NUM_OF_THREADS.
+    threadIDToLocalIDmap.clear();
+    pGlobalBarrier = new SynchronizationBarrier(NUM_OF_THREADS);
+
+    // Index 0 is skipped on purpose: runSchedulerSimulation() calls startScheduler(0, ...)
+    // directly on the calling thread.
+    for (uint32_t i = 1; i < NUM_OF_THREADS; i++) {
+        threads[i] = std::thread(startScheduler, i, queue, commandsStack, eventsPool, secondaryBatchBuffer, dsh, reflectionSurface, queueStorageBuffer, ssh, debugQueue);
+    }
+
+    // Raised only after every worker thread object has been created.
+    // NOTE(review): presumably the workers wait on this flag before proceeding - confirm in startScheduler().
+    conditionReady = true;
+}
+
+// Entry point of the scheduler simulation. Marks that a run was requested and, when
+// the simulation is enabled, initializes it, takes part as local ID 0 on the calling
+// thread, then joins the worker threads and cleans up.
+template <typename GfxFamily>
+void SchedulerSimulation<GfxFamily>::runSchedulerSimulation(GraphicsAllocation *queue,
+                                                            GraphicsAllocation *commandsStack,
+                                                            GraphicsAllocation *eventsPool,
+                                                            GraphicsAllocation *secondaryBatchBuffer,
+                                                            GraphicsAllocation *dsh,
+                                                            GraphicsAllocation *reflectionSurface,
+                                                            GraphicsAllocation *queueStorageBuffer,
+                                                            GraphicsAllocation *ssh,
+                                                            GraphicsAllocation *debugQueue) {
+    // The attempt is recorded even when the simulation is switched off.
+    simulationRun = true;
+    if (!enabled) {
+        return;
+    }
+
+    initializeSchedulerSimulation(queue, commandsStack, eventsPool, secondaryBatchBuffer, dsh,
+                                  reflectionSurface, queueStorageBuffer, ssh, debugQueue);
+
+    // The calling thread runs the scheduler itself with local ID 0.
+    startScheduler(0, queue, commandsStack, eventsPool, secondaryBatchBuffer, dsh,
+                   reflectionSurface, queueStorageBuffer, ssh, debugQueue);
+
+    // Only the thread mapped to local ID 0 joins the workers and tears the simulation down.
+    const bool callerIsLocalIdZero = (threadIDToLocalIDmap[std::this_thread::get_id()] == 0);
+    if (!callerIsLocalIdZero) {
+        return;
+    }
+
+    for (uint32_t worker = 1; worker < NUM_OF_THREADS; ++worker) {
+        threads[worker].join();
+    }
+
+    cleanSchedulerSimulation();
+}
+
+} // namespace BuiltinKernelsSimulation
--- a/runtime/command_queue/command_queue.cpp
+++ b/runtime/command_queue/command_queue.cpp
@@ -0,0 +1,456 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "runtime/command_queue/command_queue.h"
+#include "runtime/command_queue/command_queue_hw.h"
+#include "runtime/command_stream/command_stream_receiver.h"
+#include "runtime/context/context.h"
+#include "runtime/device/device.h"
+#include "runtime/device_queue/device_queue.h"
+#include "runtime/event/event.h"
+#include "runtime/helpers/aligned_memory.h"
+#include "runtime/helpers/array_count.h"
+#include "runtime/helpers/get_info.h"
+#include "hw_info.h"
+#include "runtime/helpers/options.h"
+#include "runtime/helpers/ptr_math.h"
+#include "runtime/mem_obj/buffer.h"
+#include "runtime/mem_obj/image.h"
+#include "runtime/helpers/surface_formats.h"
+#include "runtime/memory_manager/memory_manager.h"
+#include "runtime/helpers/string.h"
+#include "CL/cl_ext.h"
+#include "runtime/utilities/api_intercept.h"
+#include "runtime/helpers/convert_color.h"
+#include "runtime/helpers/queue_helpers.h"
+#include <map>
+
+namespace OCLRT {
+
+// Global table of command queue create functions, indexed by render core family
+// (see CommandQueue::create). All IGFX_MAX_CORE entries start out value-initialized.
+CommandQueueCreateFunc commandQueueFactory[IGFX_MAX_CORE] = {};
+
+// Factory entry point: forwards to the create function registered in
+// commandQueueFactory for the device's render core family.
+// retVal is always set to CL_SUCCESS before the lookup.
+CommandQueue *CommandQueue::create(Context *context,
+                                   Device *device,
+                                   const cl_queue_properties *properties,
+                                   cl_int &retVal) {
+    retVal = CL_SUCCESS;
+
+    // Pick the creator registered for this device's render core family.
+    const auto coreFamily = device->getRenderCoreFamily();
+    CommandQueueCreateFunc createForFamily = commandQueueFactory[coreFamily];
+    DEBUG_BREAK_IF(createForFamily == nullptr);
+
+    return createForFamily(context, device, properties);
+}
+
+// Default constructor: delegates to the main constructor with no context, no device
+// and no queue properties.
+CommandQueue::CommandQueue() : CommandQueue(nullptr, nullptr, 0) {
+}
+
+// Builds a queue bound to the given context and device (either may be null).
+// Counters and performance-counter state start zeroed/disabled, a non-null context
+// receives an internal reference, every indirect heap slot starts empty, the queue
+// properties are extracted from `properties` via getCmdQueueProperties(), and a new
+// FlushStampTracker is installed.
+CommandQueue::CommandQueue(Context *context,
+                           Device *deviceId,
+                           const cl_queue_properties *properties) : low_priority(false),
+                                                                    taskCount(0),
+                                                                    taskLevel(0),
+                                                                    virtualEvent(nullptr),
+                                                                    context(context),
+                                                                    device(deviceId),
+                                                                    perfCountersEnabled(false),
+                                                                    perfCountersConfig(UINT32_MAX),
+                                                                    perfCountersUserRegistersNumber(0),
+                                                                    perfConfigurationData(nullptr),
+                                                                    perfCountersRegsCfgHandle(0),
+                                                                    perfCountersRegsCfgPending(false),
+                                                                    commandStream(nullptr) {
+    // Take an internal reference on the context; the destructor drops it again
+    // unless the context reports this queue as its special queue.
+    if (context) {
+        context->incRefInternal();
+    }
+    // Heaps are created lazily by getIndirectHeap().
+    for (int i = 0; i < NUM_HEAPS; ++i) {
+        indirectHeap[i] = nullptr;
+    }
+    commandQueueProperties = getCmdQueueProperties<cl_command_queue_properties>(properties);
+    flushStamp.reset(new FlushStampTracker(true));
+}
+
+// Releases everything the queue holds: the virtual event reference, the command
+// stream and indirect heaps (their allocations are handed to the memory manager as
+// reusable), the performance-counter configuration, and the context reference.
+CommandQueue::~CommandQueue() {
+    if (virtualEvent != nullptr) {
+        UNRECOVERABLE_IF(this->virtualEvent->getCommandQueue() != this && this->virtualEvent->getCommandQueue() != nullptr);
+        virtualEvent->setCurrentCmdQVirtualEvent(false);
+        virtualEvent->decRefInternal();
+    }
+
+    // Device-backed resources can only be returned when a device is attached.
+    if (device != nullptr) {
+        auto manager = device->getMemoryManager();
+        DEBUG_BREAK_IF(manager == nullptr);
+
+        if (commandStream != nullptr) {
+            if (commandStream->getGraphicsAllocation() != nullptr) {
+                manager->storeAllocation(std::unique_ptr<GraphicsAllocation>(commandStream->getGraphicsAllocation()), REUSABLE_ALLOCATION);
+                commandStream->replaceGraphicsAllocation(nullptr);
+            }
+        }
+        delete commandStream;
+
+        for (int heapIndex = 0; heapIndex < NUM_HEAPS; ++heapIndex) {
+            auto *heap = indirectHeap[heapIndex];
+            if (heap == nullptr) {
+                continue;
+            }
+            // Recycle the backing memory before destroying the heap object itself.
+            auto heapAllocation = heap->getGraphicsAllocation();
+            if (heapAllocation != nullptr) {
+                manager->storeAllocation(std::unique_ptr<GraphicsAllocation>(heapAllocation), REUSABLE_ALLOCATION);
+            }
+            delete heap;
+        }
+
+        // Deleting a null pointer is a no-op, so no guard is needed.
+        delete perfConfigurationData;
+
+        if (this->perfCountersEnabled) {
+            device->getPerformanceCounters()->shutdown();
+        }
+    }
+
+    // The context's special queue does not give back a context reference.
+    const bool releaseContextReference = (context != nullptr) && !context->isSpecialQueue(this);
+    if (releaseContextReference) {
+        context->decRefInternal();
+    }
+}
+
+// Reads the current value stored at the hardware tag address.
+uint32_t CommandQueue::getHwTag() const {
+    return *getHwTagAddress();
+}
+
+// Returns the address of the command stream receiver's tag after asking the
+// receiver to make that memory coherent.
+volatile uint32_t *CommandQueue::getHwTagAddress() const {
+    DEBUG_BREAK_IF(!this->device);
+    auto &commandStreamReceiver = device->getCommandStreamReceiver();
+    auto tag_address = commandStreamReceiver.getTagAddress();
+    // Make exactly the tag itself coherent: sizeof(*tag_address) is the size of the
+    // tag value, whereas sizeof(tag_address) would be the size of the pointer.
+    commandStreamReceiver.makeCoherent(const_cast<uint32_t *>(tag_address), sizeof(*tag_address));
+    return tag_address;
+}
+
+// True once the hardware tag has reached (or passed) the given task count.
+bool CommandQueue::isCompleted(uint32_t taskCount) const {
+    const uint32_t hwTag = getHwTag();
+    DEBUG_BREAK_IF(hwTag == Event::eventNotReady);
+    return !(hwTag < taskCount);
+}
+
+// Waits for the given task count by delegating to the command stream receiver's
+// waitForTaskCountWithKmdNotifyFallback(), then records the count in
+// latestTaskCountWaited.
+//   taskCountToWait  - task count the hardware tag is expected to reach.
+//   flushStampToWait - flush stamp forwarded to the command stream receiver.
+void CommandQueue::waitUntilComplete(uint32_t taskCountToWait, FlushStamp flushStampToWait) {
+    WAIT_ENTER()
+
+    DBG_LOG(LogTaskCounts, __FUNCTION__, "Waiting for taskCount:", taskCountToWait);
+    DBG_LOG(LogTaskCounts, __FUNCTION__, "Line: ", __LINE__, "Current taskCount:", getHwTag());
+
+    device->getCommandStreamReceiver().waitForTaskCountWithKmdNotifyFallback(taskCountToWait, flushStampToWait);
+
+    // Debug-only sanity check: the tag must not be behind the awaited count after the wait.
+    DEBUG_BREAK_IF(getHwTag() < taskCountToWait);
+    latestTaskCountWaited = taskCountToWait;
+    WAIT_LEAVE()
+}
+
+// Reports whether the queue is still blocked by its virtual event.
+// Returns true while a virtual event exists and has not completed. Once the event
+// has completed, the queue adopts the event's task count, flush stamp and task
+// level (or resets them if the event completed by termination), drops its
+// reference to the event and returns false. Returns false when there is no
+// virtual event. The queue's ownership is held for the duration of the call.
+bool CommandQueue::isQueueBlocked() {
+    TakeOwnershipWrapper<CommandQueue> takeOwnershipWrapper(*this);
+    //check if we have user event and if so, if it is in blocked state.
+    if (this->virtualEvent) {
+        if (this->virtualEvent->peekIsCompleted()) {
+            UNRECOVERABLE_IF(this->virtualEvent == nullptr);
+
+            // Capture the level being left so the debug log below reports the real
+            // "from" value instead of the already-updated one.
+            const uint32_t previousTaskLevel = taskLevel;
+
+            if (this->virtualEvent->peekIsCompletedByTermination() == false) {
+                taskCount = this->virtualEvent->peekTaskCount();
+                flushStamp->setStamp(this->virtualEvent->flushStamp->peekStamp());
+                taskLevel = this->virtualEvent->taskLevel;
+                // If this isn't an OOQ, update the taskLevel for the queue
+                if (!isOOQEnabled()) {
+                    taskLevel++;
+                }
+            } else {
+                //at this point we may reset queue TaskCount, since all command previous to this were aborted
+                taskCount = 0;
+                flushStamp->setStamp(0);
+                taskLevel = getDevice().getCommandStreamReceiver().peekTaskLevel();
+            }
+
+            DebugManager.log(DebugManager.flags.EventsDebugEnable.get(), "isQueueBlocked taskLevel change from", previousTaskLevel, "to new from virtualEvent", this->virtualEvent, "new tasklevel", this->virtualEvent->taskLevel.load());
+
+            //close the access to virtual event, driver added only 1 ref count.
+            this->virtualEvent->decRefInternal();
+            this->virtualEvent = nullptr;
+            return false;
+        }
+        return true;
+    }
+    return false;
+}
+
+// Answers a command queue info query by forwarding all arguments, together with
+// this queue, to the shared getQueueInfo<CommandQueue>() helper and returning its result.
+cl_int CommandQueue::getCommandQueueInfo(cl_command_queue_info paramName,
+                                         size_t paramValueSize,
+                                         void *paramValue,
+                                         size_t *paramValueSizeRet) {
+    return getQueueInfo<CommandQueue>(this, paramName, paramValueSize, paramValue, paramValueSizeRet);
+}
+
+// Returns the highest task level among the given starting level and the task
+// levels of all events in the wait list.
+uint32_t CommandQueue::getTaskLevelFromWaitList(uint32_t taskLevel,
+                                                cl_uint numEventsInWaitList,
+                                                const cl_event *eventWaitList) {
+    uint32_t highestLevel = taskLevel;
+    for (cl_uint index = 0; index != numEventsInWaitList; ++index) {
+        auto waitedEvent = (Event *)(eventWaitList[index]);
+        const uint32_t waitedLevel = waitedEvent->taskLevel;
+        if (highestLevel < waitedLevel) {
+            highestLevel = waitedLevel;
+        }
+    }
+    return highestLevel;
+}
+
+// Returns the indirect heap of the requested type, creating it or swapping its
+// backing allocation when it has no allocation or less free space than minRequiredSize.
+//   heapType        - which heap slot of indirectHeap[] to use.
+//   minRequiredSize - free space the caller needs; 0 requests an empty heap.
+// NOTE(review): the allocation returned by allocateGraphicsMemory() is used without a
+// null check - confirm that it cannot fail here.
+IndirectHeap &CommandQueue::getIndirectHeap(IndirectHeap::Type heapType,
+                                            size_t minRequiredSize) {
+    DEBUG_BREAK_IF(static_cast<uint32_t>(heapType) >= ARRAY_COUNT(indirectHeap));
+    auto &heap = indirectHeap[heapType];
+    GraphicsAllocation *heapMemory = nullptr;
+
+    DEBUG_BREAK_IF(nullptr == device);
+    auto memoryManager = device->getMemoryManager();
+    DEBUG_BREAK_IF(nullptr == memoryManager);
+
+    if (heap)
+        heapMemory = heap->getGraphicsAllocation();
+
+    // Existing heap is too small: hand its allocation to the memory manager for reuse
+    // and fall through to the (re)allocation path below.
+    if (heap && heap->getAvailableSpace() < minRequiredSize && heapMemory) {
+        memoryManager->storeAllocation(std::unique_ptr<GraphicsAllocation>(heapMemory), REUSABLE_ALLOCATION);
+        heapMemory = nullptr;
+    }
+
+    if (!heapMemory) {
+        // Heap should be at least minHeapSize unless we're requesting an empty heap
+        size_t minHeapSize = 64 * KB;
+        // Surface state heaps get a minimum that is one page smaller.
+        if (IndirectHeap::SURFACE_STATE == heapType) {
+            minHeapSize -= MemoryConstants::pageSize;
+        }
+
+        // A request of 0 stays 0; anything else is raised to the minimum and then
+        // rounded up to a multiple of the cache line size.
+        minRequiredSize = minRequiredSize ? std::max(minRequiredSize, minHeapSize) : 0;
+        minRequiredSize = minRequiredSize > 0 ? alignUp(minRequiredSize, MemoryConstants::cacheLineSize) : 0;
+
+        // Prefer a previously stored reusable allocation; allocate page-aligned memory otherwise.
+        const size_t heapAlignment = MemoryConstants::pageSize;
+        heapMemory = memoryManager->obtainReusableAllocation(minRequiredSize).release();
+
+        if (!heapMemory) {
+            heapMemory = memoryManager->allocateGraphicsMemory(minRequiredSize, heapAlignment);
+        }
+
+        // Re-point an existing heap at the new memory, or create the heap on first use.
+        if (heap) {
+            heap->replaceBuffer(heapMemory->getUnderlyingBuffer(), minRequiredSize);
+            heap->replaceGraphicsAllocation(heapMemory);
+        } else {
+            heap = new IndirectHeap(heapMemory);
+            heap->overrideMaxSize(minRequiredSize);
+        }
+    }
+
+    return *heap;
+}
+
+// Detaches the backing memory from the heap of the given type: a non-null
+// allocation is handed to the memory manager as reusable, and the heap is left
+// with no buffer and no graphics allocation. Does nothing if the heap was never created.
+void CommandQueue::releaseIndirectHeap(IndirectHeap::Type heapType) {
+    DEBUG_BREAK_IF(static_cast<uint32_t>(heapType) >= ARRAY_COUNT(indirectHeap));
+    auto &targetHeap = indirectHeap[heapType];
+
+    DEBUG_BREAK_IF(nullptr == device);
+    auto memoryManager = device->getMemoryManager();
+    DEBUG_BREAK_IF(nullptr == memoryManager);
+
+    if (!targetHeap) {
+        return;
+    }
+
+    if (auto backingAllocation = targetHeap->getGraphicsAllocation()) {
+        memoryManager->storeAllocation(std::unique_ptr<GraphicsAllocation>(backingAllocation), REUSABLE_ALLOCATION);
+    }
+    targetHeap->replaceBuffer(nullptr, 0);
+    targetHeap->replaceGraphicsAllocation(nullptr);
+}
+
+// Returns the queue's command stream, creating it on first use and swapping in a
+// larger backing allocation when the free space is below the requested size plus
+// CSRequirements::minCommandQueueCommandStreamSize.
+//   minRequiredSize - bytes the caller intends to write.
+// NOTE(review): the allocation returned by allocateGraphicsMemory() is used without a
+// null check - confirm that it cannot fail here.
+LinearStream &CommandQueue::getCS(size_t minRequiredSize) {
+    DEBUG_BREAK_IF(nullptr == device);
+    auto &commandStreamReceiver = device->getCommandStreamReceiver();
+    auto memoryManager = commandStreamReceiver.getMemoryManager();
+    DEBUG_BREAK_IF(nullptr == memoryManager);
+
+    // Lazily create the stream object; it starts without any backing allocation.
+    if (!commandStream) {
+        commandStream = new LinearStream(nullptr);
+    }
+
+    // Make sure we have enough room for any CSR additions
+    minRequiredSize += CSRequirements::minCommandQueueCommandStreamSize;
+
+    if (commandStream->getAvailableSpace() < minRequiredSize) {
+        // If not, allocate a new block. allocate full pages
+        minRequiredSize = alignUp(minRequiredSize, MemoryConstants::pageSize);
+
+        // The allocation itself is larger than the usable size by csOverfetchSize.
+        auto requiredSize = minRequiredSize + CSRequirements::csOverfetchSize;
+
+        // Prefer a previously stored reusable allocation; allocate page-aligned memory otherwise.
+        GraphicsAllocation *allocation = memoryManager->obtainReusableAllocation(requiredSize).release();
+
+        if (!allocation) {
+            allocation = memoryManager->allocateGraphicsMemory(requiredSize, MemoryConstants::pageSize);
+        }
+
+        // Deallocate the old block, if not null
+        auto oldAllocation = commandStream->getGraphicsAllocation();
+
+        if (oldAllocation) {
+            memoryManager->storeAllocation(std::unique_ptr<GraphicsAllocation>(oldAllocation), REUSABLE_ALLOCATION);
+        }
+        // The stream is sized to exclude the space added above for CSR additions.
+        commandStream->replaceBuffer(allocation->getUnderlyingBuffer(), minRequiredSize - CSRequirements::minCommandQueueCommandStreamSize);
+        commandStream->replaceGraphicsAllocation(allocation);
+    }
+
+    return *commandStream;
+}
+
+// Acquires every listed shared memory object through its sharing handler, bumps
+// its acquire count, then enqueues a marker on the wait list. When an output
+// event is requested, its command type is overridden with cmdType.
+// Returns the status of the marker enqueue.
+cl_int CommandQueue::enqueueAcquireSharedObjects(cl_uint numObjects, const cl_mem *memObjects, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *oclEvent, cl_uint cmdType) {
+    for (cl_uint index = 0; index < numObjects; ++index) {
+        auto sharedObject = castToObjectOrAbort<MemObj>(memObjects[index]);
+        sharedObject->peekSharingHandler()->acquire(sharedObject);
+        sharedObject->acquireCount++;
+    }
+
+    const auto markerStatus = enqueueMarkerWithWaitList(numEventsInWaitList, eventWaitList, oclEvent);
+
+    if (oclEvent != nullptr) {
+        auto outputEvent = castToObjectOrAbort<Event>(*oclEvent);
+        outputEvent->setCmdType(cmdType);
+    }
+
+    return markerStatus;
+}
+
+// Releases every listed shared memory object through its sharing handler, lowers
+// its acquire count, then enqueues a marker on the wait list. When an output
+// event is requested, its command type is overridden with cmdType.
+// Returns the status of the marker enqueue.
+cl_int CommandQueue::enqueueReleaseSharedObjects(cl_uint numObjects, const cl_mem *memObjects, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *oclEvent, cl_uint cmdType) {
+    for (cl_uint index = 0; index < numObjects; ++index) {
+        auto sharedObject = castToObjectOrAbort<MemObj>(memObjects[index]);
+        sharedObject->peekSharingHandler()->release(sharedObject);
+        // Debug-only check that the object was acquired before being released.
+        DEBUG_BREAK_IF(sharedObject->acquireCount <= 0);
+        sharedObject->acquireCount--;
+    }
+
+    const auto markerStatus = enqueueMarkerWithWaitList(numEventsInWaitList, eventWaitList, oclEvent);
+
+    if (oclEvent != nullptr) {
+        auto outputEvent = castToObjectOrAbort<Event>(*oclEvent);
+        outputEvent->setCmdType(cmdType);
+    }
+
+    return markerStatus;
+}
+
+// Copies the task count, flush stamp and task level from the given completion
+// stamp into the queue's own bookkeeping.
+void CommandQueue::updateFromCompletionStamp(const CompletionStamp &completionStamp) {
+    taskCount = completionStamp.taskCount;
+    flushStamp->setStamp(completionStamp.flushStamp);
+    this->taskLevel = completionStamp.taskLevel;
+}
+
+// Flushes batched submissions and, if any event in the wait list has a task count
+// newer than the latest one sent by the command stream receiver, issues a flushTask
+// with an implicit flush.
+//   numEventsInWaitList / eventWaitList - events the caller is about to wait on.
+//   ndRangeKernel - forwarded as DispatchFlags::GSBA32BitRequired.
+void CommandQueue::flushWaitList(
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    bool ndRangeKernel) {
+
+    bool isQBlocked = false;
+
+    //as long as queue is blocked we need to stall.
+    // Busy-waits: the loop only exits once isQueueBlocked() has returned false.
+    if (!isOOQEnabled()) {
+        while ((isQBlocked = isQueueBlocked()))
+            ;
+    }
+
+    // Device ownership is held for the rest of the function.
+    TakeOwnershipWrapper<Device> deviceOwnership(*device);
+    device->getCommandStreamReceiver().flushBatchedSubmissions();
+
+    // NOTE(review): isQBlocked is always false here - it is either never assigned or
+    // last assigned by the loop's terminating iteration - so this branch is always taken.
+    if (!isQBlocked) {
+        auto taskLevel = getTaskLevelFromWaitList(this->taskLevel, numEventsInWaitList, eventWaitList);
+        auto &commandStream = getCS();
+        auto &commandStreamReceiver = device->getCommandStreamReceiver();
+
+        bool flushTask = false;
+
+        // A flush is needed as soon as one waited event is newer than what was last sent.
+        for (auto eventId = 0u; eventId < numEventsInWaitList; eventId++) {
+            Event *event = (Event *)eventWaitList[eventId];
+            if (event->peekTaskCount() > commandStreamReceiver.peekLatestSentTaskCount()) {
+                flushTask = true;
+                break;
+            }
+        }
+
+        if (flushTask) {
+            DispatchFlags dispatchFlags;
+            dispatchFlags.GSBA32BitRequired = ndRangeKernel;
+            dispatchFlags.low_priority = low_priority;
+            dispatchFlags.implicitFlush = true;
+            dispatchFlags.preemptionMode = PreemptionHelper::taskPreemptionMode(*device, nullptr);
+
+            DEBUG_BREAK_IF(taskLevel >= Event::eventNotReady);
+
+            // Heaps are requested with size 0, i.e. without asking for any free space.
+            // The task is submitted one level above the highest level found in the wait list.
+            commandStreamReceiver.flushTask(
+                commandStream,
+                commandStream.getUsed(),
+                getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 0),
+                getIndirectHeap(IndirectHeap::INSTRUCTION, 0),
+                getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0),
+                getIndirectHeap(IndirectHeap::SURFACE_STATE, 0),
+                taskLevel + 1,
+                dispatchFlags);
+        }
+    }
+}
+
+// Enables or disables performance counters for this queue.
+//   perfCountersEnabled - requested state; a request matching the current state is a no-op.
+//   configuration       - configuration id passed to getPmRegsCfg() and stored on success.
+// Returns true when the requested state is in effect, false when enabling failed
+// (counters unavailable or no register configuration); on failure the counters
+// are shut down again and the stored state is left unchanged.
+// NOTE(review): a configuration left over from an earlier enable is overwritten here
+// without being deleted; the destructor deletes perfConfigurationData, so this looks
+// like a leak on re-enable - confirm the ownership rules of getPmRegsCfg().
+bool CommandQueue::setPerfCountersEnabled(bool perfCountersEnabled, cl_uint configuration) {
+    DEBUG_BREAK_IF(device == nullptr);
+    if (perfCountersEnabled == this->perfCountersEnabled) {
+        return true;
+    }
+    auto perfCounters = device->getPerformanceCounters();
+    if (perfCountersEnabled) {
+        perfCounters->enable();
+        if (!perfCounters->isAvailable()) {
+            perfCounters->shutdown();
+            return false;
+        }
+        perfConfigurationData = perfCounters->getPmRegsCfg(configuration);
+        if (perfConfigurationData == nullptr) {
+            perfCounters->shutdown();
+            return false;
+        }
+        // Count from zero on every enable; otherwise an enable/disable/enable cycle
+        // would add the new registers on top of the previous total.
+        perfCountersUserRegistersNumber = 0;
+        InstrReadRegsCfg *pUserCounters = &perfConfigurationData->readRegs;
+        for (uint32_t i = 0; i < pUserCounters->regsCount; ++i) {
+            perfCountersUserRegistersNumber++;
+            // Registers wider than 32 bits are counted twice.
+            if (pUserCounters->reg[i].bitSize > 32) {
+                perfCountersUserRegistersNumber++;
+            }
+        }
+    } else {
+        if (perfCounters->isAvailable()) {
+            perfCounters->shutdown();
+        }
+    }
+    this->perfCountersConfig = configuration;
+    this->perfCountersEnabled = perfCountersEnabled;
+
+    return true;
+}
+
+// Returns the performance counters object of the queue's device.
+// The device pointer is dereferenced without a null check.
+PerformanceCounters *CommandQueue::getPerfCounters() {
+    return device->getPerformanceCounters();
+}
+
+// Passes the stored register configuration, together with the queue's config handle
+// and pending flag, to PerformanceCounters::sendPmRegsCfgCommands() and returns its result.
+bool CommandQueue::sendPerfCountersConfig() {
+    return getPerfCounters()->sendPmRegsCfgCommands(perfConfigurationData, &perfCountersRegsCfgHandle, &perfCountersRegsCfgPending);
+}
+
+} // namespace OCLRT
--- a/runtime/command_queue/command_queue.h
+++ b/runtime/command_queue/command_queue.h
@@ -0,0 +1,422 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+#include "runtime/api/cl_types.h"
+#include "runtime/indirect_heap/indirect_heap.h"
+#include "runtime/helpers/base_object.h"
+#include "runtime/helpers/completion_stamp.h"
+#include "runtime/helpers/flush_stamp.h"
+#include "runtime/event/user_event.h"
+#include "runtime/os_interface/performance_counters.h"
+#include <atomic>
+#include <cstdint>
+
+namespace OCLRT {
+class Buffer;
+class LinearStream;
+class Context;
+class Device;
+class Image;
+class IndirectHeap;
+class Kernel;
+class MemObj;
+
+// Maps the OpenCL API handle type _cl_command_queue to its implementation class,
+// CommandQueue, via the DerivedType member.
+template <>
+struct OpenCLObjectMapper<_cl_command_queue> {
+    typedef class CommandQueue DerivedType;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// CommandQueue - Core implementation
+////////////////////////////////////////////////////////////////////////////////
+class CommandQueue : public BaseObject<_cl_command_queue> {
+  public:
+    static const cl_ulong objectMagic = 0x1234567890987654LL;
+    enum { NUM_HEAPS = IndirectHeap::NUM_TYPES };
+
+    bool low_priority;
+
+    static CommandQueue *create(Context *context, Device *device,
+                                const cl_queue_properties *properties,
+                                cl_int &errcodeRet);
+
+    CommandQueue();
+
+    CommandQueue(Context *context, Device *device,
+                 const cl_queue_properties *properties);
+
+    CommandQueue &operator=(const CommandQueue &) = delete;
+    CommandQueue(const CommandQueue &) = delete;
+
+    ~CommandQueue() override;
+
+    // API entry points
+    virtual cl_int
+    enqueueCopyImage(Image *srcImage, Image *dstImage, const size_t srcOrigin[3],
+                     const size_t dstOrigin[3], const size_t region[3],
+                     cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+                     cl_event *event) {
+        return CL_SUCCESS;
+    }
+
+    virtual cl_int enqueueFillImage(Image *image, const void *fillColor,
+                                    const size_t *origin, const size_t *region,
+                                    cl_uint numEventsInWaitList,
+                                    const cl_event *eventWaitList,
+                                    cl_event *event) {
+        return CL_SUCCESS;
+    }
+
+    virtual cl_int enqueueFillBuffer(Buffer *buffer, const void *pattern,
+                                     size_t patternSize, size_t offset,
+                                     size_t size, cl_uint numEventsInWaitList,
+                                     const cl_event *eventWaitList,
+                                     cl_event *event) {
+        return CL_SUCCESS;
+    }
+
+    virtual cl_int enqueueKernel(cl_kernel kernel, cl_uint workDim,
+                                 const size_t *globalWorkOffset,
+                                 const size_t *globalWorkSize,
+                                 const size_t *localWorkSize,
+                                 cl_uint numEventsInWaitList,
+                                 const cl_event *eventWaitList, cl_event *event) {
+        return CL_SUCCESS;
+    }
+
+    virtual cl_int enqueueBarrierWithWaitList(cl_uint numEventsInWaitList,
+                                              const cl_event *eventWaitList,
+                                              cl_event *event) {
+        return CL_SUCCESS;
+    }
+
+    // Base-class stub for mapping a buffer: performs no mapping, reports CL_SUCCESS
+    // through errcodeRet and returns a null pointer.
+    virtual void *enqueueMapBuffer(Buffer *buffer, cl_bool blockingMap,
+                                   cl_map_flags mapFlags, size_t offset,
+                                   size_t size, cl_uint numEventsInWaitList,
+                                   const cl_event *eventWaitList, cl_event *event,
+                                   cl_int &errcodeRet) {
+        errcodeRet = CL_SUCCESS;
+        // The return type is a pointer, so spell the "no mapping" result as nullptr
+        // rather than the integer status constant.
+        return nullptr;
+    }
+
+    // Base-class stub for mapping an image: performs no mapping, leaves the pitch
+    // outputs untouched, reports CL_SUCCESS through errcodeRet and returns a null pointer.
+    virtual void *enqueueMapImage(cl_mem image, cl_bool blockingMap,
+                                  cl_map_flags mapFlags, const size_t *origin,
+                                  const size_t *region, size_t *imageRowPitch,
+                                  size_t *imageSlicePitch,
+                                  cl_uint numEventsInWaitList,
+                                  const cl_event *eventWaitList, cl_event *event,
+                                  cl_int &errcodeRet) {
+        errcodeRet = CL_SUCCESS;
+        // The return type is a pointer, so spell the "no mapping" result as nullptr
+        // rather than the integer status constant.
+        return nullptr;
+    }
+
+    virtual cl_int enqueueSVMMap(cl_bool blockingMap, cl_map_flags mapFlags,
+                                 void *svmPtr, size_t size,
+                                 cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+                                 cl_event *event) {
+        return CL_SUCCESS;
+    }
+
+    virtual cl_int enqueueSVMUnmap(void *svmPtr,
+                                   cl_uint numEventsInWaitList, const cl_event *eventWaitList,
+                                   cl_event *event) {
+        return CL_SUCCESS;
+    }
+
+    virtual cl_int enqueueSVMFree(cl_uint numSvmPointers,
+                                  void *svmPointers[],
+                                  void(CL_CALLBACK *pfnFreeFunc)(cl_command_queue queue,
+                                                                 cl_uint numSvmPointers,
+                                                                 void *svmPointers[],
+                                                                 void *userData),
+                                  void *userData,
+                                  cl_uint numEventsInWaitList,
+                                  const cl_event *eventWaitList,
+                                  cl_event *event) {
+        return CL_SUCCESS;
+    }
+
+    virtual cl_int enqueueSVMMemcpy(cl_bool blockingCopy,
+                                    void *dstPtr,
+                                    const void *srcPtr,
+                                    size_t size,
+                                    cl_uint numEventsInWaitList,
+                                    const cl_event *eventWaitList,
+                                    cl_event *event) {
+        return CL_SUCCESS;
+    }
+
+    virtual cl_int enqueueSVMMemFill(void *svmPtr,
+                                     const void *pattern,
+                                     size_t patternSize,
+                                     size_t size,
+                                     cl_uint numEventsInWaitList,
+                                     const cl_event *eventWaitList,
+                                     cl_event *event) {
+        return CL_SUCCESS;
+    }
+
+    // Base-class default for the marker command: does nothing with the wait
+    // list or the output event and reports CL_SUCCESS.
+    virtual cl_int enqueueMarkerWithWaitList(
+        cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) {
+        return CL_SUCCESS;
+    }
+
+    // Base-class default for the mem-object migration command: migrates
+    // nothing and reports CL_SUCCESS. Derived queues override it.
+    virtual cl_int enqueueMigrateMemObjects(
+        cl_uint numMemObjects, const cl_mem *memObjects,
+        cl_mem_migration_flags flags, cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList, cl_event *event) {
+        return CL_SUCCESS;
+    }
+
+    // Base-class default for the SVM migration command: migrates nothing
+    // and reports CL_SUCCESS. Derived queues override it.
+    virtual cl_int enqueueSVMMigrateMem(
+        cl_uint numSvmPointers, const void **svmPointers, const size_t *sizes,
+        const cl_mem_migration_flags flags, cl_uint numEventsInWaitList,
+        const cl_event *eventWaitList, cl_event *event) {
+        return CL_SUCCESS;
+    }
+
+    // Base-class default for buffer-to-buffer copy: copies nothing and
+    // reports CL_SUCCESS. Derived queues override it.
+    virtual cl_int enqueueCopyBuffer(Buffer *srcBuffer,
+                                     Buffer *dstBuffer,
+                                     size_t srcOffset,
+                                     size_t dstOffset,
+                                     size_t size,
+                                     cl_uint numEventsInWaitList,
+                                     const cl_event *eventWaitList,
+                                     cl_event *event) {
+        return CL_SUCCESS;
+    }
+
+    // Base-class default for buffer read: leaves `ptr` untouched and
+    // reports CL_SUCCESS. Derived queues override it.
+    virtual cl_int enqueueReadBuffer(Buffer *buffer,
+                                     cl_bool blockingRead,
+                                     size_t offset,
+                                     size_t size,
+                                     void *ptr,
+                                     cl_uint numEventsInWaitList,
+                                     const cl_event *eventWaitList,
+                                     cl_event *event) {
+        return CL_SUCCESS;
+    }
+
+    // Base-class default for image read: leaves `ptr` untouched and
+    // reports CL_SUCCESS. Derived queues override it.
+    virtual cl_int enqueueReadImage(Image *srcImage,
+                                    cl_bool blockingRead,
+                                    const size_t *origin,
+                                    const size_t *region,
+                                    size_t rowPitch,
+                                    size_t slicePitch,
+                                    void *ptr,
+                                    cl_uint numEventsInWaitList,
+                                    const cl_event *eventWaitList,
+                                    cl_event *event) {
+        return CL_SUCCESS;
+    }
+
+    // Base-class default for unmap: touches neither memObj nor mappedPtr
+    // and reports CL_SUCCESS. Derived queues override it.
+    virtual cl_int enqueueUnmapMemObject(MemObj *memObj,
+                                         void *mappedPtr,
+                                         cl_uint numEventsInWaitList,
+                                         const cl_event *eventWaitList,
+                                         cl_event *event) {
+        return CL_SUCCESS;
+    }
+
+    // Base-class default for buffer write: writes nothing and reports
+    // CL_SUCCESS. Derived queues override it.
+    virtual cl_int enqueueWriteBuffer(Buffer *buffer,
+                                      cl_bool blockingWrite,
+                                      size_t offset,
+                                      size_t cb,
+                                      const void *ptr,
+                                      cl_uint numEventsInWaitList,
+                                      const cl_event *eventWaitList,
+                                      cl_event *event) {
+        return CL_SUCCESS;
+    }
+
+    // Base-class default for image write: writes nothing and reports
+    // CL_SUCCESS. Derived queues override it.
+    virtual cl_int enqueueWriteImage(Image *dstImage,
+                                     cl_bool blockingWrite,
+                                     const size_t *origin,
+                                     const size_t *region,
+                                     size_t inputRowPitch,
+                                     size_t inputSlicePitch,
+                                     const void *ptr,
+                                     cl_uint numEventsInWaitList,
+                                     const cl_event *eventWaitList,
+                                     cl_event *event) {
+        return CL_SUCCESS;
+    }
+
+    // Base-class default for rectangular buffer copy: copies nothing and
+    // reports CL_SUCCESS. Derived queues override it.
+    virtual cl_int enqueueCopyBufferRect(Buffer *srcBuffer,
+                                         Buffer *dstBuffer,
+                                         const size_t *srcOrigin,
+                                         const size_t *dstOrigin,
+                                         const size_t *region,
+                                         size_t srcRowPitch,
+                                         size_t srcSlicePitch,
+                                         size_t dstRowPitch,
+                                         size_t dstSlicePitch,
+                                         cl_uint numEventsInWaitList,
+                                         const cl_event *eventWaitList,
+                                         cl_event *event) {
+        return CL_SUCCESS;
+    }
+
+    // Base-class default for rectangular buffer write: writes nothing and
+    // reports CL_SUCCESS. Derived queues override it.
+    virtual cl_int enqueueWriteBufferRect(Buffer *buffer,
+                                          cl_bool blockingWrite,
+                                          const size_t *bufferOrigin,
+                                          const size_t *hostOrigin,
+                                          const size_t *region,
+                                          size_t bufferRowPitch,
+                                          size_t bufferSlicePitch,
+                                          size_t hostRowPitch,
+                                          size_t hostSlicePitch,
+                                          const void *ptr,
+                                          cl_uint numEventsInWaitList,
+                                          const cl_event *eventWaitList,
+                                          cl_event *event) {
+        return CL_SUCCESS;
+    }
+
+    // Base-class default for rectangular buffer read: leaves `ptr`
+    // untouched and reports CL_SUCCESS. Derived queues override it.
+    virtual cl_int enqueueReadBufferRect(Buffer *buffer,
+                                         cl_bool blockingRead,
+                                         const size_t *bufferOrigin,
+                                         const size_t *hostOrigin,
+                                         const size_t *region,
+                                         size_t bufferRowPitch,
+                                         size_t bufferSlicePitch,
+                                         size_t hostRowPitch,
+                                         size_t hostSlicePitch,
+                                         void *ptr,
+                                         cl_uint numEventsInWaitList,
+                                         const cl_event *eventWaitList,
+                                         cl_event *event) {
+        return CL_SUCCESS;
+    }
+
+    // Base-class default for buffer-to-image copy: copies nothing and
+    // reports CL_SUCCESS. Derived queues override it.
+    virtual cl_int enqueueCopyBufferToImage(Buffer *srcBuffer,
+                                            Image *dstImage,
+                                            size_t srcOffset,
+                                            const size_t *dstOrigin,
+                                            const size_t *region,
+                                            cl_uint numEventsInWaitList,
+                                            const cl_event *eventWaitList,
+                                            cl_event *event) {
+        return CL_SUCCESS;
+    }
+
+    // Base-class default for image-to-buffer copy: copies nothing and
+    // reports CL_SUCCESS. Derived queues override it.
+    virtual cl_int enqueueCopyImageToBuffer(Image *srcImage,
+                                            Buffer *dstBuffer,
+                                            const size_t *srcOrigin,
+                                            const size_t *region,
+                                            size_t dstOffset,
+                                            cl_uint numEventsInWaitList,
+                                            const cl_event *eventWaitList,
+                                            cl_event *event) {
+        return CL_SUCCESS;
+    }
+
+    // Declaration only (non-virtual); the definition is not in this header.
+    // Takes an array of numObjects cl_mem handles plus the usual event wait
+    // list and output event. NOTE(review): judging by the name this enqueues
+    // acquisition of shared memory objects, with cmdType presumably being the
+    // CL command type recorded for the operation — confirm against the
+    // definition.
+    cl_int enqueueAcquireSharedObjects(cl_uint numObjects,
+                                       const cl_mem *memObjects,
+                                       cl_uint numEventsInWaitList,
+                                       const cl_event *eventWaitList,
+                                       cl_event *oclEvent,
+                                       cl_uint cmdType);
+
+    // Declaration only (non-virtual); same parameter list as
+    // enqueueAcquireSharedObjects. NOTE(review): presumably the matching
+    // release operation — confirm against the definition.
+    cl_int enqueueReleaseSharedObjects(cl_uint numObjects,
+                                       const cl_mem *memObjects,
+                                       cl_uint numEventsInWaitList,
+                                       const cl_event *eventWaitList,
+                                       cl_event *oclEvent,
+                                       cl_uint cmdType);
+
+    // Base-class default: ignores dcFlush, waits for nothing and reports
+    // CL_SUCCESS. Derived queues override it.
+    virtual cl_int finish(bool dcFlush) {
+        return CL_SUCCESS;
+    }
+
+    // Base-class default: submits nothing and reports CL_SUCCESS. Derived
+    // queues override it.
+    virtual cl_int flush() {
+        return CL_SUCCESS;
+    }
+
+    // Declaration only; defined out of line. Takes a CompletionStamp by const
+    // reference. NOTE(review): presumably refreshes this queue's task-tracking
+    // members from it — confirm against the definition.
+    void updateFromCompletionStamp(const CompletionStamp &completionStamp);
+
+    // Declaration only. The parameter list has the usual OpenCL "get info"
+    // shape (parameter name, buffer size, buffer, returned size).
+    // NOTE(review): looks like the backend of clGetCommandQueueInfo — confirm.
+    cl_int getCommandQueueInfo(cl_command_queue_info paramName,
+                               size_t paramValueSize, void *paramValue,
+                               size_t *paramValueSizeRet);
+
+    // Declaration only. NOTE(review): presumably the current value of the
+    // hardware tag — confirm against the definition.
+    uint32_t getHwTag() const;
+
+    // Declaration only. Returns a pointer to volatile uint32_t.
+    // NOTE(review): presumably the location getHwTag() reads — confirm.
+    volatile uint32_t *getHwTagAddress() const;
+
+    // Declaration only. NOTE(review): presumably compares taskCount against
+    // the hardware tag — confirm against the definition.
+    bool isCompleted(uint32_t taskCount) const;
+
+    // Declaration only. MOCKABLE_VIRTUAL is a macro defined elsewhere;
+    // presumably it expands to `virtual` in builds that mock this class — confirm.
+    MOCKABLE_VIRTUAL bool isQueueBlocked();
+
+    // Declaration only. NOTE(review): presumably blocks until the given task
+    // count / flush stamp has been reached — confirm against the definition.
+    void waitUntilComplete(uint32_t taskCountToWait, FlushStamp flushStampToWait);
+
+    // Declaration only; operates on the caller-supplied event wait list.
+    // The meaning of ndRangeKernel is not visible here — see the definition.
+    void flushWaitList(cl_uint numEventsInWaitList,
+                       const cl_event *eventWaitList,
+                       bool ndRangeKernel);
+
+    // Declaration only; static, so it does not use any queue instance state.
+    // NOTE(review): presumably combines the incoming taskLevel with the task
+    // levels of the events in the wait list — confirm against the definition.
+    static uint32_t getTaskLevelFromWaitList(uint32_t taskLevel,
+                                             cl_uint numEventsInWaitList,
+                                             const cl_event *eventWaitList);
+
+    // Returns the device this queue holds, by reference. Dereferences the
+    // `device` pointer without a null check.
+    Device &getDevice() {
+        return *device;
+    }
+    // Returns the context this queue holds, by reference. Dereferences the
+    // `context` pointer without a null check; use getContextPtr() when the
+    // pointer itself is wanted.
+    Context &getContext() {
+        return *context;
+    }
+    // Returns the stored `context` pointer as-is (no dereference).
+    Context *getContextPtr() {
+        return context;
+    }
+
+    // Declaration only. Returns a LinearStream by reference; minRequiredSize
+    // defaults to 1024u when omitted. NOTE(review): presumably guarantees at
+    // least that much space in the returned stream — confirm against the
+    // definition.
+    LinearStream &getCS(size_t minRequiredSize = 1024u);
+    // Declaration only. Returns the IndirectHeap for heapType by reference;
+    // minRequiredSize defaults to 0u when omitted.
+    IndirectHeap &getIndirectHeap(IndirectHeap::Type heapType,
+                                  size_t minRequiredSize = 0u);
+
+    // Declaration only. MOCKABLE_VIRTUAL is a macro defined elsewhere.
+    // NOTE(review): presumably gives back the heap obtained through
+    // getIndirectHeap() for the same heapType — confirm.
+    MOCKABLE_VIRTUAL void releaseIndirectHeap(IndirectHeap::Type heapType);
+
+    // Returns the property bitfield stored in commandQueueProperties.
+    cl_command_queue_properties getCommandQueueProperties() const { return commandQueueProperties; }
+
+    // True when CL_QUEUE_PROFILING_ENABLE is set in this queue's properties.
+    // Const-qualified: it only reads state through the const getter
+    // getCommandQueueProperties(), so it is callable on const queues too.
+    bool isProfilingEnabled() const {
+        return (getCommandQueueProperties() & CL_QUEUE_PROFILING_ENABLE) != 0;
+    }
+
+    // True when CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE is set in this queue's
+    // properties. Const-qualified: it only reads state through the const
+    // getter getCommandQueueProperties(), so it is callable on const queues too.
+    bool isOOQEnabled() const {
+        return (getCommandQueueProperties() & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) != 0;
+    }
+
+    // Returns the perfCountersEnabled flag. Const-qualified because it only
+    // reads a member, so it is callable on const queues too.
+    bool isPerfCountersEnabled() const {
+        return perfCountersEnabled;
+    }
+
+    // Returns the stored perfConfigurationData pointer as-is.
+    InstrPmRegsCfg *getPerfCountersConfigData() { return perfConfigurationData; }
+
+    // Declaration only; returns a PerformanceCounters pointer. Where the
+    // object comes from is not visible here — see the definition.
+    PerformanceCounters *getPerfCounters();
+
+    // Declaration only; reports a bool result. NOTE(review): presumably
+    // true on success — confirm against the definition.
+    bool sendPerfCountersConfig();
+
+    // Declaration only; reports a bool result. NOTE(review): presumably
+    // updates the perfCountersEnabled / perfCountersConfig members — confirm
+    // against the definition.
+    bool setPerfCountersEnabled(bool perfCountersEnabled, cl_uint configuration);
+
+    // Returns perfCountersUserRegistersNumber narrowed to 16 bits.
+    // The member is declared as uint32_t while this getter returns uint16_t,
+    // so values above 0xFFFF are truncated; the cast makes that pre-existing
+    // narrowing explicit instead of implicit. The return type is kept for
+    // caller compatibility. Const-qualified because it only reads a member.
+    uint16_t getPerfCountersUserRegistersNumber() const {
+        return static_cast<uint16_t>(perfCountersUserRegistersNumber);
+    }
+
+    // taskCount of last task
+    uint32_t taskCount;
+
+    // current taskLevel. Used for determining if a PIPE_CONTROL is needed.
+    uint32_t taskLevel;
+
+    // Owning pointer to this queue's FlushStampTracker.
+    std::unique_ptr<FlushStampTracker> flushStamp;
+
+    // Atomic counter that starts at (uint32_t)-1, i.e. the largest uint32_t
+    // value. NOTE(review): presumably a "nothing waited on yet" sentinel — confirm.
+    std::atomic<uint32_t> latestTaskCountWaited{(uint32_t)-1};
+
+    // virtual event that holds last Enqueue information
+    Event *virtualEvent;
+
+  protected:
+    // Raw pointers exposed through getContext()/getContextPtr() and
+    // getDevice(); no in-class initializer.
+    Context *context;
+    Device *device;
+
+    // Property bitfield returned by getCommandQueueProperties().
+    cl_command_queue_properties commandQueueProperties;
+
+    // Performance-counter state. None of these has an in-class initializer.
+    bool perfCountersEnabled;
+    cl_uint perfCountersConfig;
+    // Note: 32 bits wide here, but getPerfCountersUserRegistersNumber()
+    // returns uint16_t.
+    uint32_t perfCountersUserRegistersNumber;
+    InstrPmRegsCfg *perfConfigurationData;
+    uint32_t perfCountersRegsCfgHandle;
+    bool perfCountersRegsCfgPending;
+
+    // Command stream plus one IndirectHeap pointer slot per heap
+    // (NUM_HEAPS entries).
+    LinearStream *commandStream;
+    IndirectHeap *indirectHeap[NUM_HEAPS];
+
+    // Defaults to false. NOTE(review): presumably set when a map operation
+    // requires a data-cache flush — confirm against the code that writes it.
+    bool mapDcFlushRequired = false;
+};
+
+// Pointer to a factory function that builds a CommandQueue from a context,
+// a device and a property list (CommandQueueHw<GfxFamily>::create has this
+// signature).
+using CommandQueueCreateFunc = CommandQueue *(*)(Context *context,
+                                                 Device *device,
+                                                 const cl_queue_properties *properties);
+
+// Declaration only: free function template parameterized on the graphics
+// family and an integral eventType; returns a LinearStream by reference for
+// the given queue. NOTE(review): the two bool flags presumably request extra
+// space for profiling and perf-counter commands respectively — confirm
+// against the definition.
+template <typename GfxFamily, unsigned int eventType>
+LinearStream &getCommandStream(CommandQueue &commandQueue,
+                               bool reserveProfilingCmdsSpace,
+                               bool reservePerfCounterCmdsSpace,
+                               const Kernel *pKernel);
+
+// Declaration only: free function template parameterized on the graphics
+// family and the heap type; returns an IndirectHeap by reference for the
+// given queue and kernel. How the heap is sized is not visible here — see
+// the definition.
+template <typename GfxFamily, IndirectHeap::Type heapType>
+IndirectHeap &getIndirectHeap(CommandQueue &commandQueue, const Kernel &kernel);
+} // namespace OCLRT
--- a/runtime/command_queue/command_queue_hw.h
+++ b/runtime/command_queue/command_queue_hw.h
@@ -0,0 +1,391 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+#include "runtime/command_stream/command_stream_receiver.h"
+#include "runtime/command_queue/command_queue.h"
+#include "runtime/mem_obj/mem_obj.h"
+#include "runtime/memory_manager/graphics_allocation.h"
+#include "runtime/program/printf_handler.h"
+#include "runtime/helpers/dispatch_info.h"
+#include "runtime/command_stream/preemption.h"
+#include "runtime/helpers/queue_helpers.h"
+#include <memory>
+
+namespace OCLRT {
+
+class EventBuilder;
+
+// Command queue specialised per graphics family. Derives from CommandQueue
+// and overrides its virtual enqueue*/finish/flush entry points. Apart from
+// the constructor, create(), enqueueUnmapMemObject() and one forwarding
+// enqueueHandler() overload, members are only declared here and defined out
+// of line.
+template <typename GfxFamily>
+class CommandQueueHw : public CommandQueue {
+    typedef CommandQueue BaseClass;
+
+  public:
+    // Forwards all arguments to the CommandQueue constructor, then inspects
+    // the property list:
+    //  - CL_QUEUE_PRIORITY_KHR containing CL_QUEUE_PRIORITY_LOW_KHR sets
+    //    low_priority;
+    //  - CL_QUEUE_PROPERTIES containing CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE
+    //    switches the device's command stream receiver to BatchedDispatch.
+    // `device` is dereferenced without a null check in the second case.
+    CommandQueueHw(Context *context,
+                   Device *device,
+                   const cl_queue_properties *properties) : BaseClass(context, device, properties) {
+        if (getCmdQueueProperties<cl_queue_priority_khr>(properties, CL_QUEUE_PRIORITY_KHR) & static_cast<cl_queue_priority_khr>(CL_QUEUE_PRIORITY_LOW_KHR)) {
+            low_priority = true;
+        }
+        if (getCmdQueueProperties<cl_queue_properties>(properties, CL_QUEUE_PROPERTIES) & static_cast<cl_queue_properties>(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)) {
+            device->getCommandStreamReceiver().overrideDispatchPolicy(CommandStreamReceiver::BatchedDispatch);
+        }
+    }
+
+    // Factory with the CommandQueueCreateFunc signature. Returns a raw
+    // pointer to a queue allocated with `new`; nothing in this class releases
+    // it, so the lifetime is presumably managed by the caller — confirm.
+    static CommandQueue *create(Context *context,
+                                Device *device,
+                                const cl_queue_properties *properties) {
+        return new CommandQueueHw<GfxFamily>(context, device, properties);
+    }
+
+    // ---- Overrides of the CommandQueue enqueue entry points (defined out of line) ----
+
+    cl_int enqueueBarrierWithWaitList(cl_uint numEventsInWaitList,
+                                      const cl_event *eventWaitList,
+                                      cl_event *event) override;
+
+    cl_int enqueueCopyBuffer(Buffer *srcBuffer,
+                             Buffer *dstBuffer,
+                             size_t srcOffset,
+                             size_t dstOffset,
+                             size_t size,
+                             cl_uint numEventsInWaitList,
+                             const cl_event *eventWaitList,
+                             cl_event *event) override;
+
+    cl_int enqueueCopyBufferRect(Buffer *srcBuffer,
+                                 Buffer *dstBuffer,
+                                 const size_t *srcOrigin,
+                                 const size_t *dstOrigin,
+                                 const size_t *region,
+                                 size_t srcRowPitch,
+                                 size_t srcSlicePitch,
+                                 size_t dstRowPitch,
+                                 size_t dstSlicePitch,
+                                 cl_uint numEventsInWaitList,
+                                 const cl_event *eventWaitList,
+                                 cl_event *event) override;
+
+    cl_int enqueueCopyImage(Image *srcImage,
+                            Image *dstImage,
+                            const size_t srcOrigin[3],
+                            const size_t dstOrigin[3],
+                            const size_t region[3],
+                            cl_uint numEventsInWaitList,
+                            const cl_event *eventWaitList,
+                            cl_event *event) override;
+
+    cl_int enqueueFillBuffer(Buffer *buffer,
+                             const void *pattern,
+                             size_t patternSize,
+                             size_t offset,
+                             size_t size,
+                             cl_uint numEventsInWaitList,
+                             const cl_event *eventWaitList,
+                             cl_event *event) override;
+
+    cl_int enqueueFillImage(Image *image,
+                            const void *fillColor,
+                            const size_t *origin,
+                            const size_t *region,
+                            cl_uint numEventsInWaitList,
+                            const cl_event *eventWaitList,
+                            cl_event *event) override;
+
+    cl_int enqueueKernel(cl_kernel kernel,
+                         cl_uint workDim,
+                         const size_t *globalWorkOffset,
+                         const size_t *globalWorkSize,
+                         const size_t *localWorkSize,
+                         cl_uint numEventsInWaitList,
+                         const cl_event *eventWaitList,
+                         cl_event *event) override;
+
+    // Map entry points return the mapped pointer and report the status
+    // through the errcodeRet out-parameter.
+    void *enqueueMapBuffer(Buffer *buffer, cl_bool blockingMap, cl_map_flags mapFlags,
+                           size_t offset, size_t size, cl_uint numEventsInWaitList,
+                           const cl_event *eventWaitList, cl_event *event, cl_int &errcodeRet) override;
+
+    // Not an override: same parameter list as enqueueMapBuffer, declared only
+    // on this class.
+    void *enqueueMapSharedBuffer(Buffer *buffer, cl_bool blockingMap, cl_map_flags mapFlags,
+                                 size_t offset, size_t size, cl_uint numEventsInWaitList,
+                                 const cl_event *eventWaitList, cl_event *event, cl_int &errcodeRet);
+
+    void *enqueueMapImage(cl_mem image,
+                          cl_bool blockingMap,
+                          cl_map_flags mapFlags,
+                          const size_t *origin,
+                          const size_t *region,
+                          size_t *imageRowPitch,
+                          size_t *imageSlicePitch,
+                          cl_uint numEventsInWaitList,
+                          const cl_event *eventWaitList,
+                          cl_event *event,
+                          cl_int &errcodeRet) override;
+
+    cl_int enqueueSVMMap(cl_bool blockingMap,
+                         cl_map_flags mapFlags,
+                         void *svmPtr,
+                         size_t size,
+                         cl_uint numEventsInWaitList,
+                         const cl_event *eventWaitList,
+                         cl_event *event) override;
+
+    cl_int enqueueSVMUnmap(void *svmPtr,
+                           cl_uint numEventsInWaitList,
+                           const cl_event *eventWaitList,
+                           cl_event *event) override;
+
+    cl_int enqueueSVMFree(cl_uint numSvmPointers,
+                          void *svmPointers[],
+                          void(CL_CALLBACK *pfnFreeFunc)(cl_command_queue queue,
+                                                         cl_uint numSvmPointers,
+                                                         void *svmPointers[],
+                                                         void *userData),
+                          void *userData,
+                          cl_uint numEventsInWaitList,
+                          const cl_event *eventWaitList,
+                          cl_event *event) override;
+
+    cl_int enqueueSVMMemcpy(cl_bool blockingCopy,
+                            void *dstPtr,
+                            const void *srcPtr,
+                            size_t size,
+                            cl_uint numEventsInWaitList,
+                            const cl_event *eventWaitList,
+                            cl_event *event) override;
+
+    cl_int enqueueSVMMemFill(void *svmPtr,
+                             const void *pattern,
+                             size_t patternSize,
+                             size_t size,
+                             cl_uint numEventsInWaitList,
+                             const cl_event *eventWaitList,
+                             cl_event *event) override;
+
+    cl_int enqueueMarkerWithWaitList(cl_uint numEventsInWaitList,
+                                     const cl_event *eventWaitList,
+                                     cl_event *event) override;
+
+    cl_int enqueueMigrateMemObjects(cl_uint numMemObjects,
+                                    const cl_mem *memObjects,
+                                    cl_mem_migration_flags flags,
+                                    cl_uint numEventsInWaitList,
+                                    const cl_event *eventWaitList,
+                                    cl_event *event) override;
+
+    cl_int enqueueSVMMigrateMem(cl_uint numSvmPointers,
+                                const void **svmPointers,
+                                const size_t *sizes,
+                                const cl_mem_migration_flags flags,
+                                cl_uint numEventsInWaitList,
+                                const cl_event *eventWaitList,
+                                cl_event *event) override;
+
+    cl_int enqueueReadBuffer(Buffer *buffer,
+                             cl_bool blockingRead,
+                             size_t offset,
+                             size_t size,
+                             void *ptr,
+                             cl_uint numEventsInWaitList,
+                             const cl_event *eventWaitList,
+                             cl_event *event) override;
+
+    cl_int enqueueReadBufferRect(Buffer *buffer,
+                                 cl_bool blockingRead,
+                                 const size_t *bufferOrigin,
+                                 const size_t *hostOrigin,
+                                 const size_t *region,
+                                 size_t bufferRowPitch,
+                                 size_t bufferSlicePitch,
+                                 size_t hostRowPitch,
+                                 size_t hostSlicePitch,
+                                 void *ptr,
+                                 cl_uint numEventsInWaitList,
+                                 const cl_event *eventWaitList,
+                                 cl_event *event) override;
+
+    cl_int enqueueReadImage(Image *srcImage,
+                            cl_bool blockingRead,
+                            const size_t *origin,
+                            const size_t *region,
+                            size_t rowPitch,
+                            size_t slicePitch,
+                            void *ptr,
+                            cl_uint numEventsInWaitList,
+                            const cl_event *eventWaitList,
+                            cl_event *event) override;
+
+    // Unmaps mappedPtr from memObj and returns the resulting status.
+    // Objects that allow tiling or carry a sharing handler are unmapped
+    // through MemObj::unmapObj(); everything else goes through
+    // cpuDataTransferHandler() as a non-blocking CL_COMMAND_UNMAP_MEM_OBJECT
+    // with offset and size 0. The pointer returned by
+    // cpuDataTransferHandler() is ignored; only the status is used.
+    // memObj is dereferenced without a null check.
+    cl_int enqueueUnmapMemObject(MemObj *memObj,
+                                 void *mappedPtr,
+                                 cl_uint numEventsInWaitList,
+                                 const cl_event *eventWaitList,
+                                 cl_event *event) override {
+        // Start from a defined value so the returned status is never
+        // indeterminate, regardless of whether the callee writes its
+        // out-parameter on every path.
+        cl_int retVal = CL_SUCCESS;
+        if (memObj->allowTiling() || memObj->peekSharingHandler()) {
+            retVal = memObj->unmapObj(this, mappedPtr, numEventsInWaitList, eventWaitList, event);
+        } else {
+            cpuDataTransferHandler(memObj,
+                                   CL_COMMAND_UNMAP_MEM_OBJECT,
+                                   CL_FALSE,
+                                   0,
+                                   0,
+                                   mappedPtr,
+                                   numEventsInWaitList,
+                                   eventWaitList,
+                                   event,
+                                   retVal);
+        }
+        return retVal;
+    }
+
+    cl_int enqueueWriteBuffer(Buffer *buffer,
+                              cl_bool blockingWrite,
+                              size_t offset,
+                              size_t cb,
+                              const void *ptr,
+                              cl_uint numEventsInWaitList,
+                              const cl_event *eventWaitList,
+                              cl_event *event) override;
+
+    cl_int enqueueWriteBufferRect(Buffer *buffer,
+                                  cl_bool blockingWrite,
+                                  const size_t *bufferOrigin,
+                                  const size_t *hostOrigin,
+                                  const size_t *region,
+                                  size_t bufferRowPitch,
+                                  size_t bufferSlicePitch,
+                                  size_t hostRowPitch,
+                                  size_t hostSlicePitch,
+                                  const void *ptr,
+                                  cl_uint numEventsInWaitList,
+                                  const cl_event *eventWaitList,
+                                  cl_event *event) override;
+
+    cl_int enqueueWriteImage(Image *dstImage,
+                             cl_bool blockingWrite,
+                             const size_t *origin,
+                             const size_t *region,
+                             size_t inputRowPitch,
+                             size_t inputSlicePitch,
+                             const void *ptr,
+                             cl_uint numEventsInWaitList,
+                             const cl_event *eventWaitList,
+                             cl_event *event) override;
+
+    cl_int enqueueCopyBufferToImage(Buffer *srcBuffer,
+                                    Image *dstImage,
+                                    size_t srcOffset,
+                                    const size_t *dstOrigin,
+                                    const size_t *region,
+                                    cl_uint numEventsInWaitList,
+                                    const cl_event *eventWaitList,
+                                    cl_event *event) override;
+
+    cl_int enqueueCopyImageToBuffer(Image *srcImage,
+                                    Buffer *dstBuffer,
+                                    const size_t *srcOrigin,
+                                    const size_t *region,
+                                    size_t dstOffset,
+                                    cl_uint numEventsInWaitList,
+                                    const cl_event *eventWaitList,
+                                    cl_event *event) override;
+    cl_int finish(bool dcFlush) override;
+    cl_int flush() override;
+
+    // ---- Internal enqueue machinery (templates keyed on the command type) ----
+
+    // Pointer-plus-count form; declared here, defined out of line.
+    template <unsigned int enqueueType>
+    void enqueueHandler(Surface **surfacesForResidency,
+                        size_t numSurfaceForResidency,
+                        bool blocking,
+                        const MultiDispatchInfo &dispatchInfo,
+                        cl_uint numEventsInWaitList,
+                        const cl_event *eventWaitList,
+                        cl_event *event);
+
+    // Convenience form taking a fixed-size array of Surface pointers by
+    // reference: the element count is deduced as `size` and the call is
+    // forwarded to the pointer-plus-count overload above.
+    template <unsigned int enqueueType, size_t size>
+    void enqueueHandler(Surface *(&surfacesForResidency)[size],
+                        bool blocking,
+                        const MultiDispatchInfo &dispatchInfo,
+                        cl_uint numEventsInWaitList,
+                        const cl_event *eventWaitList,
+                        cl_event *event) {
+        enqueueHandler<enqueueType>(surfacesForResidency, size, blocking, dispatchInfo, numEventsInWaitList, eventWaitList, event);
+    }
+
+    // Array form that takes a single kernel with its work sizes instead of a
+    // MultiDispatchInfo; declared here, defined out of line.
+    template <unsigned int enqueueType, size_t size>
+    void enqueueHandler(Surface *(&surfacesForResidency)[size],
+                        bool blocking,
+                        Kernel *kernel,
+                        cl_uint workDim,
+                        const size_t globalOffsets[3],
+                        const size_t workItems[3],
+                        const size_t *localWorkSizesIn,
+                        cl_uint numEventsInWaitList,
+                        const cl_event *eventWaitList,
+                        cl_event *event);
+
+    // Declaration only. `blocking` is taken by non-const reference, so the
+    // definition is able to modify the caller's flag.
+    template <unsigned int commandType>
+    CompletionStamp enqueueNonBlocked(Surface **surfacesForResidency,
+                                      size_t surfaceCount,
+                                      LinearStream &commandStream,
+                                      size_t commandStreamStart,
+                                      bool &blocking,
+                                      const MultiDispatchInfo &multiDispatchInfo,
+                                      EventBuilder &eventBuilder,
+                                      uint32_t taskLevel,
+                                      bool slmUsed,
+                                      PrintfHandler *printfHandler);
+
+    // Declaration only. Takes printfHandler as std::unique_ptr by value, so
+    // ownership of the handler is transferred into the call.
+    template <unsigned int commandType>
+    void enqueueBlocked(Surface **surfacesForResidency,
+                        size_t surfacesCount,
+                        bool &blocking,
+                        const MultiDispatchInfo &multiDispatchInfo,
+                        KernelOperation *blockedCommandsData,
+                        cl_uint numEventsInWaitList,
+                        const cl_event *eventWaitList,
+                        bool slmUsed,
+                        EventBuilder &externalEventBuilder,
+                        std::unique_ptr<PrintfHandler> printfHandler);
+
+    // Declaration only; defined out of line.
+    void addMapUnmapToWaitlistEventsDependencies(const cl_event *eventWaitList,
+                                                 size_t numEventsInWaitlist,
+                                                 MapOperationType opType,
+                                                 MemObj *memObj,
+                                                 EventBuilder &externalEventBuilder);
+
+    // Declaration only; defined out of line. Returns a pointer and reports
+    // the status through the retVal out-parameter. Used by
+    // enqueueUnmapMemObject() above.
+    void *cpuDataTransferHandler(MemObj *memObj,
+                                 cl_command_type cmdType,
+                                 cl_bool blocking,
+                                 size_t offset,
+                                 size_t size,
+                                 void *ptr,
+                                 cl_uint numEventsInWaitList,
+                                 const cl_event *eventWaitList,
+                                 cl_event *event,
+                                 cl_int &retVal);
+
+  protected:
+    // Declaration only. MOCKABLE_VIRTUAL is a macro defined elsewhere;
+    // presumably it lets tests override this hook — confirm.
+    MOCKABLE_VIRTUAL void enqueueHandlerHook(const unsigned int commandType, const MultiDispatchInfo &dispatchInfo);
+
+  private:
+    // Declaration only; defined out of line.
+    bool isTaskLevelUpdateRequired(const uint32_t &taskLevel, const cl_event *eventWaitList, const cl_uint &numEventsInWaitList, unsigned int commandType);
+
+    // Declaration only; takes the MultiDispatchInfo by non-const reference,
+    // so the definition is able to modify it.
+    void forceDispatchScheduler(OCLRT::MultiDispatchInfo &multiDispatchInfo);
+};
+} // namespace OCLRT
--- a/runtime/command_queue/command_queue_hw.inl
+++ b/runtime/command_queue/command_queue_hw.inl
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "runtime/command_queue/dispatch_walker.h"
+#include "runtime/command_queue/enqueue_barrier.h"
+#include "runtime/command_queue/enqueue_copy_buffer.h"
+#include "runtime/command_queue/enqueue_copy_buffer_rect.h"
+#include "runtime/command_queue/enqueue_copy_buffer_to_image.h"
+#include "runtime/command_queue/enqueue_copy_image_to_buffer.h"
+#include "runtime/command_queue/enqueue_copy_image.h"
+#include "runtime/command_queue/enqueue_fill_buffer.h"
+#include "runtime/command_queue/enqueue_fill_image.h"
+#include "runtime/command_queue/enqueue_kernel.h"
+#include "runtime/command_queue/enqueue_map_buffer.h"
+#include "runtime/command_queue/enqueue_map_image.h"
+#include "runtime/command_queue/enqueue_svm.h"
+#include "runtime/command_queue/enqueue_marker.h"
+#include "runtime/command_queue/enqueue_migrate_mem_objects.h"
+#include "runtime/command_queue/enqueue_read_buffer.h"
+#include "runtime/command_queue/enqueue_read_buffer_rect.h"
+#include "runtime/command_queue/enqueue_read_image.h"
+#include "runtime/command_queue/enqueue_write_buffer.h"
+#include "runtime/command_queue/enqueue_write_buffer_rect.h"
+#include "runtime/command_queue/enqueue_write_image.h"
+#include "runtime/command_queue/cpu_data_transfer_handler.h"
+#include "runtime/command_queue/finish.h"
+#include "runtime/command_queue/flush.h"
--- a/runtime/command_queue/cpu_data_transfer_handler.h
+++ b/runtime/command_queue/cpu_data_transfer_handler.h
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+#include "runtime/command_queue/command_queue_hw.h"
+#include "runtime/device/device.h"
+#include "runtime/event/event_builder.h"
+
+namespace OCLRT {
+template <typename GfxFamily>
+void *CommandQueueHw<GfxFamily>::cpuDataTransferHandler(MemObj *memObj, // object whose CPU storage is mapped, unmapped, read or written
+                                                        cl_command_type cmdType, // MAP_BUFFER, UNMAP_MEM_OBJECT, READ_BUFFER or WRITE_BUFFER
+                                                        cl_bool blocking, // when set, the transfer runs here even if the queue is blocked
+                                                        size_t offset, // forwarded to ptrOffset() and setAndReturnMappedPtr()
+                                                        size_t size, // length handed to memcpy_s for read/write
+                                                        void *ptr, // host pointer for read/write; compared with the mapped pointer on unmap
+                                                        cl_uint numEventsInWaitList,
+                                                        const cl_event *eventWaitList,
+                                                        cl_event *event, // optional; receives a newly built event when non-null
+                                                        cl_int &retVal) { // handed to ErrorCodeHelper, which presumably writes the status back
+    EventBuilder eventBuilder;
+    bool blockQueue = false;
+    bool eventCompleted = false;
+    ErrorCodeHelper err(&retVal, CL_SUCCESS);
+    // Returns the mapped pointer for CL_COMMAND_MAP_BUFFER and nullptr for every other command type.
+    if (event) { // the out-event is built and queue-stamped before ownership is taken
+        eventBuilder.create<Event>(this, cmdType, Event::eventNotReady, Event::eventNotReady);
+        eventBuilder.getEvent()->setQueueTimeStamp();
+        eventBuilder.getEvent()->setCPUProfilingPath(true);
+        *event = eventBuilder.getEvent();
+    }
+    // Device and queue ownership is held only while task level and blocked map/unmap dependencies are updated.
+    TakeOwnershipWrapper<Device> deviceOwnership(*device);
+    TakeOwnershipWrapper<CommandQueueHw<GfxFamily>> queueOwnership(*this);
+
+    auto taskLevel = getTaskLevelFromWaitList(this->taskLevel, numEventsInWaitList, eventWaitList);
+    auto updateTaskLevel = isTaskLevelUpdateRequired(taskLevel, eventWaitList, numEventsInWaitList, cmdType);
+
+    DBG_LOG(LogTaskCounts, __FUNCTION__, "taskLevel", taskLevel);
+
+    if (updateTaskLevel) {
+        taskLevel++;
+        this->taskLevel = taskLevel;
+    }
+
+    if (event) {
+        eventBuilder.getEvent()->taskLevel = taskLevel;
+    }
+    // The queue counts as blocked when the wait list yields eventNotReady or isQueueBlocked() reports it.
+    blockQueue = ((taskLevel == Event::eventNotReady) || isQueueBlocked());
+
+    if (blockQueue &&
+        (cmdType == CL_COMMAND_MAP_BUFFER || cmdType == CL_COMMAND_UNMAP_MEM_OBJECT)) {
+
+        addMapUnmapToWaitlistEventsDependencies(eventWaitList,
+                                                static_cast<size_t>(numEventsInWaitList),
+                                                cmdType == CL_COMMAND_MAP_BUFFER ? MAP : UNMAP,
+                                                memObj,
+                                                eventBuilder);
+    }
+
+    queueOwnership.unlock();
+    deviceOwnership.unlock();
+
+    // read/write buffers are always blocking
+    if (!blockQueue || blocking) {
+        err.set(Event::waitForEvents(numEventsInWaitList, eventWaitList));
+
+        if (eventBuilder.getEvent()) {
+            eventBuilder.getEvent()->setSubmitTimeStamp();
+        }
+        // wait for the completion of previous commands (never for unmap; for zero-copy objects only when blocking)
+        if (cmdType != CL_COMMAND_UNMAP_MEM_OBJECT) {
+            if (!memObj->isMemObjZeroCopy() || blocking) {
+                finish(true);
+                eventCompleted = true;
+            }
+        }
+
+        auto bufferStorage = ptrOffset(memObj->getCpuAddressForMemoryTransfer(), offset);
+
+        if (eventBuilder.getEvent()) {
+            eventBuilder.getEvent()->setStartTimeStamp();
+        }
+        // eventCompleted is set whenever data was copied or finish() ran; otherwise the event status is refreshed below.
+        switch (cmdType) {
+        case CL_COMMAND_MAP_BUFFER:
+            if (!memObj->isMemObjZeroCopy()) {
+                if (context->isProvidingPerformanceHints()) {
+                    context->providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL, CL_ENQUEUE_MAP_BUFFER_REQUIRES_COPY_DATA, static_cast<cl_mem>(memObj));
+                }
+                memObj->transferDataToHostPtr();
+                eventCompleted = true;
+            } else {
+                if (context->isProvidingPerformanceHints()) {
+                    context->providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL, CL_ENQUEUE_MAP_BUFFER_DOESNT_REQUIRE_COPY_DATA, static_cast<cl_mem>(memObj));
+                }
+            }
+            break;
+        case CL_COMMAND_UNMAP_MEM_OBJECT:
+            if (!memObj->isMemObjZeroCopy()) {
+                if (context->isProvidingPerformanceHints()) {
+                    context->providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL, CL_ENQUEUE_UNMAP_MEM_OBJ_REQUIRES_COPY_DATA, ptr, static_cast<cl_mem>(memObj));
+                }
+                memObj->transferDataFromHostPtrToMemoryStorage();
+                eventCompleted = true;
+            } else {
+                if (context->isProvidingPerformanceHints()) {
+                    context->providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL, CL_ENQUEUE_UNMAP_MEM_OBJ_DOESNT_REQUIRE_COPY_DATA, ptr);
+                }
+            }
+            break;
+        case CL_COMMAND_READ_BUFFER:
+            if (context->isProvidingPerformanceHints()) {
+                context->providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL, CL_ENQUEUE_READ_BUFFER_REQUIRES_COPY_DATA, static_cast<cl_mem>(memObj), ptr);
+            }
+            memcpy_s(ptr, size, bufferStorage, size); // buffer storage -> host pointer
+            eventCompleted = true;
+            break;
+        case CL_COMMAND_WRITE_BUFFER:
+            if (context->isProvidingPerformanceHints()) {
+                context->providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL, CL_ENQUEUE_WRITE_BUFFER_REQUIRES_COPY_DATA, static_cast<cl_mem>(memObj), ptr);
+            }
+            memcpy_s(bufferStorage, size, ptr, size); // host pointer -> buffer storage
+            eventCompleted = true;
+            break;
+        default: // any other command type is rejected
+            err.set(CL_INVALID_OPERATION);
+        }
+
+        if (eventBuilder.getEvent()) {
+            eventBuilder.getEvent()->setEndTimeStamp();
+            eventBuilder.getEvent()->updateTaskCount(this->taskCount);
+            if (eventCompleted) {
+                eventBuilder.getEvent()->setStatus(CL_COMPLETE);
+            } else {
+                eventBuilder.getEvent()->updateExecutionStatus();
+            }
+        }
+    }
+    // The code below runs whether or not the transfer block above was executed.
+    if (cmdType == CL_COMMAND_MAP_BUFFER) {
+        return memObj->setAndReturnMappedPtr(offset);
+    }
+
+    if (cmdType == CL_COMMAND_UNMAP_MEM_OBJECT) {
+        err.set(ptr == memObj->getMappedPtr() ? CL_SUCCESS : CL_INVALID_VALUE); // ptr must equal memObj's recorded mapped pointer
+    }
+
+    return nullptr; // only map returns pointer
+}
+} // namespace OCLRT
--- a/runtime/command_queue/dispatch_walker.h
+++ b/runtime/command_queue/dispatch_walker.h
@@ -0,0 +1,937 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+#include "runtime/context/context.h"
+#include "runtime/gen9/gen9_cmd_def.h"
+#include "runtime/command_queue/local_id_gen.h"
+#include "runtime/command_queue/command_queue.h"
+#include "runtime/command_queue/dispatch_walker_helper.h"
+#include "runtime/command_stream/command_stream_receiver.h"
+#include "runtime/device/device_info.h"
+#include "runtime/device_queue/device_queue_hw.h"
+#include "runtime/event/perf_counter.h"
+#include "runtime/event/user_event.h"
+#include "runtime/indirect_heap/indirect_heap.h"
+#include "runtime/helpers/aligned_memory.h"
+#include "runtime/helpers/debug_helpers.h"
+#include "runtime/helpers/kernel_commands.h"
+#include "runtime/helpers/task_information.h"
+#include "runtime/helpers/validators.h"
+#include "runtime/helpers/dispatch_info.h"
+#include "runtime/kernel/kernel.h"
+#include "runtime/mem_obj/mem_obj.h"
+#include "runtime/memory_manager/graphics_allocation.h"
+#include <algorithm>
+#include <cmath>
+
+namespace OCLRT {
+
+void computeWorkgroupSize1D(
+    uint32_t maxWorkGroupSize,
+    size_t workGroupSize[3],
+    const size_t workItems[3],
+    size_t simdSize);
+
+void computeWorkgroupSizeND(
+    WorkSizeInfo wsInfo,
+    size_t workGroupSize[3],
+    const size_t workItems[3],
+    const uint32_t workDim);
+
+void computeWorkgroupSize2D(
+    uint32_t maxWorkGroupSize,
+    size_t workGroupSize[3],
+    const size_t workItems[3],
+    size_t simdSize);
+
+void computeWorkgroupSizeSquared(
+    uint32_t maxWorkGroupSize,
+    size_t workGroupSize[3],
+    const size_t workItems[3],
+    size_t simdSize,
+    const uint32_t workDim);
+
+Vec3<size_t> computeWorkgroupSize(
+    const DispatchInfo &dispatchInfo);
+
+Vec3<size_t> generateWorkgroupSize(
+    const DispatchInfo &dispatchInfo);
+
+Vec3<size_t> computeWorkgroupsNumber(
+    const Vec3<size_t> gws,
+    const Vec3<size_t> lws);
+
+Vec3<size_t> generateWorkgroupsNumber(
+    const Vec3<size_t> gws,
+    const Vec3<size_t> lws);
+
+Vec3<size_t> generateWorkgroupsNumber(
+    const DispatchInfo &dispatchInfo);
+
+Vec3<size_t> canonizeWorkgroup(
+    Vec3<size_t> workgroup);
+
+inline uint32_t calculateDispatchDim(Vec3<size_t> dispatchSize, Vec3<size_t> dispatchOffset) {
+    return std::max({1U, dispatchSize.getSimplifiedDim(), dispatchOffset.getSimplifiedDim()}); // larger simplified dim of size/offset, never below 1
+}
+
+template <typename GfxFamily>
+inline size_t setGpgpuWalkerThreadData(
+    typename GfxFamily::GPGPU_WALKER *pCmd,
+    const size_t globalOffsets[3],
+    const size_t startWorkGroups[3],
+    const size_t numWorkGroups[3],
+    const size_t localWorkSizesIn[3],
+    uint32_t simd) {
+    using WalkerCmd = typename GfxFamily::GPGPU_WALKER;
+    // Fills thread count, group ranges and execution masks of *pCmd; returns the work-group size. globalOffsets is not read here.
+    size_t itemsPerGroup = localWorkSizesIn[0] * localWorkSizesIn[1] * localWorkSizesIn[2];
+
+    const auto threadCount = getThreadsPerWG(simd, itemsPerGroup);
+    pCmd->setThreadWidthCounterMaximum(static_cast<uint32_t>(threadCount));
+
+    pCmd->setThreadGroupIdXDimension(static_cast<uint32_t>(numWorkGroups[0]));
+    pCmd->setThreadGroupIdYDimension(static_cast<uint32_t>(numWorkGroups[1]));
+    pCmd->setThreadGroupIdZDimension(static_cast<uint32_t>(numWorkGroups[2]));
+
+    // Right execution mask: the low (itemsPerGroup mod simd) bits; the & (simd - 1) form assumes simd is a power of two.
+    const auto tailLanes = itemsPerGroup & (simd - 1);
+    const uint64_t tailMask = (1ull << tailLanes) - 1;
+    // A zero remainder is replaced by an all-ones mask.
+    const uint64_t rightMask = (tailMask == 0) ? ~tailMask : tailMask;
+
+    pCmd->setRightExecutionMask(static_cast<uint32_t>(rightMask));
+
+    pCmd->setBottomExecutionMask(static_cast<uint32_t>(0xffffffff));
+    pCmd->setSimdSize(static_cast<typename WalkerCmd::SIMD_SIZE>(simd >> 4));
+
+    pCmd->setThreadGroupIdStartingX(static_cast<uint32_t>(startWorkGroups[0]));
+    pCmd->setThreadGroupIdStartingY(static_cast<uint32_t>(startWorkGroups[1]));
+    pCmd->setThreadGroupIdStartingResumeZ(static_cast<uint32_t>(startWorkGroups[2]));
+
+    return itemsPerGroup;
+}
+
+inline cl_uint computeDimensions(const size_t workItems[3]) {
+    return (workItems[2] <= 1) ? ((workItems[1] <= 1) ? 1 : 2) : 3; // 3 if z > 1, else 2 if y > 1, else 1
+}
+
+void provideLocalWorkGroupSizeHints(Context *context, uint32_t maxWorkGroupSize, DispatchInfo dispatchInfo);
+
+template <typename SizeAndAllocCalcT, typename... CalcArgsT>
+IndirectHeap *allocateIndirectHeap(SizeAndAllocCalcT &&calc, CalcArgsT &&... args) {
+    const size_t heapBytes = calc(std::forward<CalcArgsT>(args)...); // heap size is whatever the supplied calculator returns
+    auto backing = alignedMalloc(heapBytes, static_cast<size_t>(MemoryConstants::pageSize)); // page-size alignment requested
+    return new IndirectHeap(backing, heapBytes); // raw new: the caller takes over the returned heap
+}
+
+template <typename GfxFamily>
+void dispatchProfilingCommandsStart(
+    HwTimeStamps &hwTimeStamps,
+    OCLRT::LinearStream *commandStream) {
+    using StoreRegMem = typename GfxFamily::MI_STORE_REGISTER_MEM;
+    using PipeControl = typename GfxFamily::PIPE_CONTROL;
+
+    // Global start: stalling PIPE_CONTROL whose post-sync operation writes a timestamp to GlobalStartTS
+    const uint64_t globalStartAddr = reinterpret_cast<uint64_t>(&(hwTimeStamps.GlobalStartTS));
+
+    auto *pipeControl = reinterpret_cast<PipeControl *>(commandStream->getSpace(sizeof(PipeControl)));
+    *pipeControl = PipeControl::sInit();
+    pipeControl->setCommandStreamerStallEnable(true);
+    pipeControl->setPostSyncOperation(PipeControl::POST_SYNC_OPERATION_WRITE_TIMESTAMP);
+    pipeControl->setAddress(static_cast<uint32_t>(globalStartAddr & 0xFFFFFFFFull));
+    pipeControl->setAddressHigh(static_cast<uint32_t>(globalStartAddr >> 32));
+
+    // Context start: two MI_STORE_REGISTER_MEM commands store the low and high register into ContextStartTS
+    const uint64_t contextStartAddr = reinterpret_cast<uint64_t>(&(hwTimeStamps.ContextStartTS));
+
+    // low register -> first dword
+    auto *storeLow = reinterpret_cast<StoreRegMem *>(commandStream->getSpace(sizeof(StoreRegMem)));
+    *storeLow = StoreRegMem::sInit();
+    storeLow->setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW);
+    storeLow->setMemoryAddress(contextStartAddr);
+
+    // high register -> sizeof(uint32_t) bytes further on
+    const uint64_t contextStartAddrHigh = contextStartAddr + sizeof(uint32_t);
+    auto *storeHigh = reinterpret_cast<StoreRegMem *>(commandStream->getSpace(sizeof(StoreRegMem)));
+    *storeHigh = StoreRegMem::sInit();
+    storeHigh->setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_HIGH);
+    storeHigh->setMemoryAddress(contextStartAddrHigh);
+}
+
+template <typename GfxFamily>
+void dispatchProfilingCommandsEnd(
+    HwTimeStamps &hwTimeStamps,
+    OCLRT::LinearStream *commandStream) {
+
+    using StoreRegMem = typename GfxFamily::MI_STORE_REGISTER_MEM;
+    using PipeControl = typename GfxFamily::PIPE_CONTROL;
+
+    // Stalling PIPE_CONTROL only: no post-sync operation or address is programmed on it here
+    auto *pipeControl = reinterpret_cast<PipeControl *>(commandStream->getSpace(sizeof(PipeControl)));
+    *pipeControl = PipeControl::sInit();
+    pipeControl->setCommandStreamerStallEnable(true);
+
+    // Context end: two MI_STORE_REGISTER_MEM commands store the low and high register into ContextEndTS
+    const uint64_t contextEndAddr = reinterpret_cast<uint64_t>(&(hwTimeStamps.ContextEndTS));
+
+    // low register -> first dword
+    auto *storeLow = reinterpret_cast<StoreRegMem *>(commandStream->getSpace(sizeof(StoreRegMem)));
+    *storeLow = StoreRegMem::sInit();
+    storeLow->setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW);
+    storeLow->setMemoryAddress(contextEndAddr);
+
+    // high register -> sizeof(uint32_t) bytes further on
+    const uint64_t contextEndAddrHigh = contextEndAddr + sizeof(uint32_t);
+    auto *storeHigh = reinterpret_cast<StoreRegMem *>(commandStream->getSpace(sizeof(StoreRegMem)));
+    *storeHigh = StoreRegMem::sInit();
+    storeHigh->setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_HIGH);
+    storeHigh->setMemoryAddress(contextEndAddrHigh);
+}
+
+template <typename GfxFamily>
+void dispatchPerfCountersNoopidRegisterCommands(
+    CommandQueue &commandQueue,
+    OCLRT::HwPerfCounter &hwPerfCounter,
+    OCLRT::LinearStream *commandStream,
+    bool start) {
+    // One MI_STORE_REGISTER_MEM: INSTR_MMIO_NOOPID -> dmaFenceIdBegin (start) or dmaFenceIdEnd; commandQueue is not used.
+    using StoreRegMem = typename GfxFamily::MI_STORE_REGISTER_MEM;
+
+    auto &counters = hwPerfCounter.HWPerfCounters;
+    const uint64_t fenceIdAddr = start ? reinterpret_cast<uint64_t>(&(counters.dmaFenceIdBegin)) : reinterpret_cast<uint64_t>(&(counters.dmaFenceIdEnd));
+
+    auto *storeCmd = reinterpret_cast<StoreRegMem *>(commandStream->getSpace(sizeof(StoreRegMem)));
+    *storeCmd = StoreRegMem::sInit();
+    storeCmd->setRegisterAddress(OCLRT::INSTR_MMIO_NOOPID);
+    storeCmd->setMemoryAddress(fenceIdAddr);
+}
+
+template <typename GfxFamily>
+void dispatchPerfCountersReadFreqRegisterCommands(
+    CommandQueue &commandQueue,
+    OCLRT::HwPerfCounter &hwPerfCounter,
+    OCLRT::LinearStream *commandStream,
+    bool start) {
+    // One MI_STORE_REGISTER_MEM: INSTR_MMIO_RPSTAT1 -> coreFreqBegin (start) or coreFreqEnd; commandQueue is not used.
+    using StoreRegMem = typename GfxFamily::MI_STORE_REGISTER_MEM;
+
+    auto &counters = hwPerfCounter.HWPerfCounters;
+    const uint64_t coreFreqAddr = start ? reinterpret_cast<uint64_t>(&(counters.coreFreqBegin)) : reinterpret_cast<uint64_t>(&(counters.coreFreqEnd));
+
+    auto *storeCmd = reinterpret_cast<StoreRegMem *>(commandStream->getSpace(sizeof(StoreRegMem)));
+    *storeCmd = StoreRegMem::sInit();
+    storeCmd->setRegisterAddress(OCLRT::INSTR_MMIO_RPSTAT1);
+    storeCmd->setMemoryAddress(coreFreqAddr);
+}
+
+template <typename GfxFamily>
+void dispatchPerfCountersGeneralPurposeCounterCommands(
+    CommandQueue &commandQueue,
+    OCLRT::HwPerfCounter &hwPerfCounter,
+    OCLRT::LinearStream *commandStream,
+    bool start) {
+
+    using StoreRegMem = typename GfxFamily::MI_STORE_REGISTER_MEM;
+    // Destination is reportBegin.gp when start is set, reportEnd.gp otherwise; commandQueue is not used.
+    auto &counters = hwPerfCounter.HWPerfCounters;
+    const uint64_t gpBaseAddr = start ? reinterpret_cast<uint64_t>(&(counters.reportBegin.gp)) : reinterpret_cast<uint64_t>(&(counters.reportEnd.gp));
+
+    // One register store per general purpose counter dword
+    for (uint16_t counterIdx = 0; counterIdx < OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT; ++counterIdx) {
+        auto *storeCmd = reinterpret_cast<StoreRegMem *>(commandStream->getSpace(sizeof(StoreRegMem)));
+        *storeCmd = StoreRegMem::sInit();
+        const size_t byteOffset = counterIdx * sizeof(cl_uint); // same stride for register offset and memory
+        const uint32_t registerOffset = static_cast<uint32_t>(INSTR_GFX_OFFSETS::INSTR_PERF_CNT_1_DW0 + byteOffset);
+        storeCmd->setRegisterAddress(registerOffset);
+        //Gp field is 2*uint64 wide so it can hold 4 uint32
+        storeCmd->setMemoryAddress(gpBaseAddr + byteOffset);
+    }
+}
+
+template <typename GfxFamily>
+void dispatchPerfCountersUserCounterCommands(
+    CommandQueue &commandQueue,
+    OCLRT::HwPerfCounter &hwPerfCounter,
+    OCLRT::LinearStream *commandStream,
+    bool start) {
+
+    using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
+
+    // Stores every register listed in the queue's perf-counter config (readRegs) into
+    // reportBegin.user when start is set, reportEnd.user otherwise.
+    const uint64_t baseAddr = start ? reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.reportBegin.user))
+                                    : reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.reportEnd.user));
+    auto configData = commandQueue.getPerfCountersConfigData();
+    auto userRegs = &configData->readRegs;
+    // NOTE(review): configData is dereferenced unchecked - confirm getPerfCountersConfigData() cannot return null here.
+
+    for (uint32_t i = 0; i < userRegs->regsCount; i++) {
+        auto pRegister = reinterpret_cast<MI_STORE_REGISTER_MEM *>(commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM)));
+        *pRegister = MI_STORE_REGISTER_MEM::sInit();
+
+        uint32_t regAddr = userRegs->reg[i].offset;
+        pRegister->setRegisterAddress(regAddr);
+        //offset between base (low) registers is cl_ulong wide
+        uint64_t address = baseAddr + i * sizeof(cl_ulong);
+        pRegister->setMemoryAddress(address);
+
+        // Registers wider than 32 bits need a second store for their upper dword.
+        if (userRegs->reg[i].bitSize > 32) {
+            // upper dword: next register offset, next 4 bytes of the same cl_ulong slot
+            pRegister = reinterpret_cast<MI_STORE_REGISTER_MEM *>(commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM)));
+            *pRegister = MI_STORE_REGISTER_MEM::sInit();
+
+            regAddr += sizeof(cl_uint);
+            pRegister->setRegisterAddress(regAddr);
+            address += sizeof(cl_uint);
+            pRegister->setMemoryAddress(address);
+        }
+    }
+}
+
+template <typename GfxFamily>
+void dispatchPerfCountersOABufferStateCommands(
+    CommandQueue &commandQueue,
+    OCLRT::HwPerfCounter &hwPerfCounter,
+    OCLRT::LinearStream *commandStream) {
+    // Three register stores: OA status, head pointer and tail pointer; commandQueue is not used.
+    using StoreRegMem = typename GfxFamily::MI_STORE_REGISTER_MEM;
+
+    auto &counters = hwPerfCounter.HWPerfCounters;
+    // INSTR_OA_STATUS -> oaStatus
+    auto *statusCmd = reinterpret_cast<StoreRegMem *>(commandStream->getSpace(sizeof(StoreRegMem)));
+    *statusCmd = StoreRegMem::sInit();
+    statusCmd->setRegisterAddress(INSTR_GFX_OFFSETS::INSTR_OA_STATUS);
+    const uint64_t statusAddr = reinterpret_cast<uint64_t>(&(counters.oaStatus));
+    statusCmd->setMemoryAddress(statusAddr);
+
+    // INSTR_OA_HEAD_PTR -> oaHead
+    auto *headCmd = reinterpret_cast<StoreRegMem *>(commandStream->getSpace(sizeof(StoreRegMem)));
+    *headCmd = StoreRegMem::sInit();
+    headCmd->setRegisterAddress(INSTR_GFX_OFFSETS::INSTR_OA_HEAD_PTR);
+    const uint64_t headAddr = reinterpret_cast<uint64_t>(&(counters.oaHead));
+    headCmd->setMemoryAddress(headAddr);
+
+    // INSTR_OA_TAIL_PTR -> oaTail
+    auto *tailCmd = reinterpret_cast<StoreRegMem *>(commandStream->getSpace(sizeof(StoreRegMem)));
+    *tailCmd = StoreRegMem::sInit();
+    tailCmd->setRegisterAddress(INSTR_GFX_OFFSETS::INSTR_OA_TAIL_PTR);
+    const uint64_t tailAddr = reinterpret_cast<uint64_t>(&(counters.oaTail));
+    tailCmd->setMemoryAddress(tailAddr);
+}
+
+template <typename GfxFamily>
+void dispatchPerfCountersCommandsStart(
+    CommandQueue &commandQueue,
+    OCLRT::HwPerfCounter &hwPerfCounter,
+    OCLRT::LinearStream *commandStream) {
+    // Emits the begin-of-measurement perf counter commands into commandStream, then sends the counters config.
+    using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
+    using MI_REPORT_PERF_COUNT = typename GfxFamily::MI_REPORT_PERF_COUNT;
+
+    auto perfCounters = commandQueue.getPerfCounters();
+
+    uint32_t currentReportId = perfCounters->getCurrentReportId();
+    uint64_t address = 0;
+    //flush command streamer
+    auto pPipeControlCmd = reinterpret_cast<PIPE_CONTROL *>(commandStream->getSpace(sizeof(PIPE_CONTROL)));
+    *pPipeControlCmd = PIPE_CONTROL::sInit();
+    pPipeControlCmd->setCommandStreamerStallEnable(true);
+
+    //Store value of NOOPID register
+    dispatchPerfCountersNoopidRegisterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, true);
+
+    //Read Core Frequency
+    dispatchPerfCountersReadFreqRegisterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, true);
+
+    //Read General Purpose counters
+    dispatchPerfCountersGeneralPurposeCounterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, true);
+
+    auto pReportPerfCount = reinterpret_cast<MI_REPORT_PERF_COUNT *>(commandStream->getSpace(sizeof(MI_REPORT_PERF_COUNT)));
+    *pReportPerfCount = MI_REPORT_PERF_COUNT::sInit();
+    pReportPerfCount->setReportId(currentReportId);
+    address = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.reportBegin.oa)); // report lands in reportBegin.oa
+    pReportPerfCount->setMemoryAddress(address);
+
+    //Timestamp: Global Start
+    pPipeControlCmd = reinterpret_cast<PIPE_CONTROL *>(commandStream->getSpace(sizeof(PIPE_CONTROL)));
+    *pPipeControlCmd = PIPE_CONTROL::sInit();
+    pPipeControlCmd->setCommandStreamerStallEnable(true);
+    pPipeControlCmd->setPostSyncOperation(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP);
+    address = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWTimeStamp.GlobalStartTS));
+    pPipeControlCmd->setAddress(static_cast<uint32_t>(address & static_cast<uint64_t>(UINT32_MAX))); // low 32 bits
+    pPipeControlCmd->setAddressHigh(static_cast<uint32_t>(address >> 32));
+
+    dispatchPerfCountersUserCounterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, true);
+
+    commandQueue.sendPerfCountersConfig();
+}
+
+template <typename GfxFamily>
+void dispatchPerfCountersCommandsEnd(
+    CommandQueue &commandQueue,
+    OCLRT::HwPerfCounter &hwPerfCounter,
+    OCLRT::LinearStream *commandStream) {
+    // Emits the end-of-measurement perf counter commands into commandStream, then calls perfCounters->setCpuTimestamp().
+    using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
+    using MI_REPORT_PERF_COUNT = typename GfxFamily::MI_REPORT_PERF_COUNT;
+
+    auto perfCounters = commandQueue.getPerfCounters();
+
+    uint32_t currentReportId = perfCounters->getCurrentReportId();
+    uint64_t address = 0;
+
+    //flush command streamer
+    auto pPipeControlCmd = reinterpret_cast<PIPE_CONTROL *>(commandStream->getSpace(sizeof(PIPE_CONTROL)));
+    *pPipeControlCmd = PIPE_CONTROL::sInit();
+    pPipeControlCmd->setCommandStreamerStallEnable(true);
+
+    //Store OA status, head and tail registers
+    dispatchPerfCountersOABufferStateCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream);
+
+    //Timestamp: Global End
+    pPipeControlCmd = reinterpret_cast<PIPE_CONTROL *>(commandStream->getSpace(sizeof(PIPE_CONTROL)));
+    *pPipeControlCmd = PIPE_CONTROL::sInit();
+    pPipeControlCmd->setCommandStreamerStallEnable(true);
+    pPipeControlCmd->setPostSyncOperation(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP);
+    address = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWTimeStamp.GlobalEndTS));
+    pPipeControlCmd->setAddress(static_cast<uint32_t>(address & static_cast<uint64_t>(UINT32_MAX))); // low 32 bits
+    pPipeControlCmd->setAddressHigh(static_cast<uint32_t>(address >> 32));
+
+    auto pReportPerfCount = reinterpret_cast<MI_REPORT_PERF_COUNT *>(commandStream->getSpace(sizeof(MI_REPORT_PERF_COUNT)));
+    *pReportPerfCount = MI_REPORT_PERF_COUNT::sInit();
+    pReportPerfCount->setReportId(currentReportId);
+    address = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.reportEnd.oa)); // report lands in reportEnd.oa
+    pReportPerfCount->setMemoryAddress(address);
+
+    dispatchPerfCountersGeneralPurposeCounterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, false);
+
+    //Store value of NOOPID register
+    dispatchPerfCountersNoopidRegisterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, false);
+
+    //Read Core Frequency
+    dispatchPerfCountersReadFreqRegisterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, false);
+
+    dispatchPerfCountersUserCounterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, false);
+
+    perfCounters->setCpuTimestamp();
+}
+
+template <typename GfxFamily>
+void dispatchWalker(
+    CommandQueue &commandQueue,
+    const MultiDispatchInfo &multiDispatchInfo,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    KernelOperation **blockedCommandsData,
+    HwTimeStamps *hwTimeStamps,
+    OCLRT::HwPerfCounter *hwPerfCounter,
+    bool blockQueue = false,
+    unsigned int commandType = 0) {
+
+    OCLRT::LinearStream *commandStream = nullptr;
+    OCLRT::IndirectHeap *dsh = nullptr, *ish = nullptr, *ioh = nullptr, *ssh = nullptr;
+    bool executionModelKernel = multiDispatchInfo.begin()->getKernel()->isParentKernel;
+
+    // Allocate command stream and indirect heaps
+    if (blockQueue) {
+        using KCH = KernelCommandsHelper<GfxFamily>;
+        commandStream = new LinearStream(alignedMalloc(MemoryConstants::pageSize, MemoryConstants::pageSize), MemoryConstants::pageSize);
+        if (executionModelKernel) {
+            uint32_t offsetDsh = commandQueue.getContext().getDefaultDeviceQueue()->getDshOffset();
+            uint32_t colorCalcSize = commandQueue.getContext().getDefaultDeviceQueue()->colorCalcStateSize;
+
+            dsh = allocateIndirectHeap([&multiDispatchInfo, offsetDsh] { return KCH::getTotalSizeRequiredDSH(multiDispatchInfo) + KCH::getTotalSizeRequiredIOH(multiDispatchInfo) + offsetDsh; });
+            dsh->getSpace(colorCalcSize);
+            ioh = dsh;
+        } else {
+            dsh = allocateIndirectHeap([&multiDispatchInfo] { return KCH::getTotalSizeRequiredDSH(multiDispatchInfo); });
+            ioh = allocateIndirectHeap([&multiDispatchInfo] { return KCH::getTotalSizeRequiredIOH(multiDispatchInfo); });
+        }
+        ish = allocateIndirectHeap([&multiDispatchInfo] { return KCH::getTotalSizeRequiredIH(multiDispatchInfo); });
+        ssh = allocateIndirectHeap([&multiDispatchInfo] { return KCH::getTotalSizeRequiredSSH(multiDispatchInfo); });
+        using UniqueIH = std::unique_ptr<IndirectHeap>;
+        *blockedCommandsData = new KernelOperation(std::unique_ptr<LinearStream>(commandStream), UniqueIH(dsh),
+                                                   UniqueIH(ish), UniqueIH(ioh), UniqueIH(ssh));
+        if (executionModelKernel)
+            (*blockedCommandsData)->doNotFreeISH = true;
+    } else {
+        commandStream = &commandQueue.getCS(0);
+        if (executionModelKernel && (commandQueue.getIndirectHeap(IndirectHeap::SURFACE_STATE, 0).getUsed() > 0)) {
+            commandQueue.releaseIndirectHeap(IndirectHeap::SURFACE_STATE);
+        }
+        dsh = &getIndirectHeap<GfxFamily, IndirectHeap::DYNAMIC_STATE>(commandQueue, multiDispatchInfo);
+        ish = &getIndirectHeap<GfxFamily, IndirectHeap::INSTRUCTION>(commandQueue, multiDispatchInfo);
+        ioh = &getIndirectHeap<GfxFamily, IndirectHeap::INDIRECT_OBJECT>(commandQueue, multiDispatchInfo);
+        ssh = &getIndirectHeap<GfxFamily, IndirectHeap::SURFACE_STATE>(commandQueue, multiDispatchInfo);
+    }
+
+    using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
+
+    dsh->align(KernelCommandsHelper<GfxFamily>::alignInterfaceDescriptorData);
+
+    const size_t offsetInterfaceDescriptorTable = dsh->getUsed();
+    uint32_t interfaceDescriptorIndex = 0;
+    size_t totalInterfaceDescriptorTableSize = sizeof(INTERFACE_DESCRIPTOR_DATA);
+    size_t numDispatches = multiDispatchInfo.size();
+    totalInterfaceDescriptorTableSize *= numDispatches;
+
+    if (!executionModelKernel) {
+        dsh->getSpace(totalInterfaceDescriptorTableSize);
+    } else {
+        dsh->getSpace(commandQueue.getContext().getDefaultDeviceQueue()->getDshOffset() - dsh->getUsed());
+    }
+
+    // Program media interface descriptor load
+    KernelCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
+        *commandStream,
+        offsetInterfaceDescriptorTable,
+        totalInterfaceDescriptorTableSize);
+
+    DEBUG_BREAK_IF(offsetInterfaceDescriptorTable % 64 != 0);
+
+    for (auto &dispatchInfo : multiDispatchInfo) {
+        auto &kernel = *dispatchInfo.getKernel();
+
+        DEBUG_BREAK_IF(!(dispatchInfo.getDim() >= 1 && dispatchInfo.getDim() <= 3));
+        DEBUG_BREAK_IF(!(dispatchInfo.getGWS().z == 1 || dispatchInfo.getDim() == 3));
+        DEBUG_BREAK_IF(!(dispatchInfo.getGWS().y == 1 || dispatchInfo.getDim() >= 2));
+        DEBUG_BREAK_IF(!(dispatchInfo.getOffset().z == 0 || dispatchInfo.getDim() == 3));
+        DEBUG_BREAK_IF(!(dispatchInfo.getOffset().y == 0 || dispatchInfo.getDim() >= 2));
+
+        // Determine SIMD size
+        uint32_t simd = kernel.getKernelInfo().getMaxSimdSize();
+
+        // If we don't have a required WGS, compute one opportunistically
+        auto maxWorkGroupSize = static_cast<uint32_t>(commandQueue.getDevice().getDeviceInfo().maxWorkGroupSize);
+        if (commandType == CL_COMMAND_NDRANGE_KERNEL) {
+            provideLocalWorkGroupSizeHints(commandQueue.getContextPtr(), maxWorkGroupSize, dispatchInfo);
+        }
+
+        //Get dispatch geometry
+        uint32_t dim = dispatchInfo.getDim();
+        Vec3<size_t> gws = dispatchInfo.getGWS();
+        Vec3<size_t> offset = dispatchInfo.getOffset();
+        Vec3<size_t> swgs = dispatchInfo.getStartOfWorkgroups();
+
+        // Compute local workgroup sizes
+        Vec3<size_t> lws = (dispatchInfo.getLocalWorkgroupSize().x > 0) ? dispatchInfo.getLocalWorkgroupSize() : generateWorkgroupSize(dispatchInfo);
+        Vec3<size_t> elws = (dispatchInfo.getEnqueuedWorkgroupSize().x > 0) ? dispatchInfo.getEnqueuedWorkgroupSize() : lws;
+
+        // Compute number of work groups
+        Vec3<size_t> twgs = (dispatchInfo.getTotalNumberOfWorkgroups().x > 0) ? dispatchInfo.getTotalNumberOfWorkgroups() : generateWorkgroupsNumber(gws, lws);
+        Vec3<size_t> nwgs = (dispatchInfo.getNumberOfWorkgroups().x > 0) ? dispatchInfo.getNumberOfWorkgroups() : twgs;
+
+        // Patch our kernel constants
+        *kernel.globalWorkOffsetX = static_cast<uint32_t>(offset.x);
+        *kernel.globalWorkOffsetY = static_cast<uint32_t>(offset.y);
+        *kernel.globalWorkOffsetZ = static_cast<uint32_t>(offset.z);
+
+        *kernel.globalWorkSizeX = static_cast<uint32_t>(gws.x);
+        *kernel.globalWorkSizeY = static_cast<uint32_t>(gws.y);
+        *kernel.globalWorkSizeZ = static_cast<uint32_t>(gws.z);
+
+        if ((&dispatchInfo == &*multiDispatchInfo.begin()) || (kernel.localWorkSizeX2 == &Kernel::dummyPatchLocation)) {
+            *kernel.localWorkSizeX = static_cast<uint32_t>(lws.x);
+            *kernel.localWorkSizeY = static_cast<uint32_t>(lws.y);
+            *kernel.localWorkSizeZ = static_cast<uint32_t>(lws.z);
+        }
+
+        *kernel.localWorkSizeX2 = static_cast<uint32_t>(lws.x);
+        *kernel.localWorkSizeY2 = static_cast<uint32_t>(lws.y);
+        *kernel.localWorkSizeZ2 = static_cast<uint32_t>(lws.z);
+
+        *kernel.enqueuedLocalWorkSizeX = static_cast<uint32_t>(elws.x);
+        *kernel.enqueuedLocalWorkSizeY = static_cast<uint32_t>(elws.y);
+        *kernel.enqueuedLocalWorkSizeZ = static_cast<uint32_t>(elws.z);
+
+        if (&dispatchInfo == &*multiDispatchInfo.begin()) {
+            *kernel.numWorkGroupsX = static_cast<uint32_t>(twgs.x);
+            *kernel.numWorkGroupsY = static_cast<uint32_t>(twgs.y);
+            *kernel.numWorkGroupsZ = static_cast<uint32_t>(twgs.z);
+        }
+
+        *kernel.workDim = dim;
+
+        // Send our indirect object data
+        size_t localWorkSizes[3] = {lws.x, lws.y, lws.z};
+
+        auto offsetCrossThreadData = KernelCommandsHelper<GfxFamily>::sendIndirectState(
+            *commandStream,
+            *dsh,
+            *ish,
+            *ioh,
+            *ssh,
+            kernel,
+            simd,
+            localWorkSizes,
+            offsetInterfaceDescriptorTable,
+            interfaceDescriptorIndex);
+
+        if (&dispatchInfo == &*multiDispatchInfo.begin()) {
+            // If hwTimeStampAlloc is passed (not nullptr), then we know that profiling is enabled
+            if (hwTimeStamps != nullptr) {
+                dispatchProfilingCommandsStart<GfxFamily>(*hwTimeStamps, commandStream);
+            }
+            if (hwPerfCounter != nullptr) {
+                dispatchPerfCountersCommandsStart<GfxFamily>(commandQueue, *hwPerfCounter, commandStream);
+            }
+        }
+
+        // Implement enabling special WA DisableLSQCROPERFforOCL if needed
+        applyWADisableLSQCROPERFforOCL<GfxFamily>(commandStream, kernel, true);
+
+        // Program the walker.  Invokes execution so all state should already be programmed
+        typedef typename GfxFamily::GPGPU_WALKER GPGPU_WALKER;
+        auto pGpGpuWalkerCmd = (GPGPU_WALKER *)commandStream->getSpace(sizeof(GPGPU_WALKER));
+        *pGpGpuWalkerCmd = GfxFamily::cmdInitGpgpuWalker;
+
+        size_t globalOffsets[3] = {offset.x, offset.y, offset.z};
+        size_t startWorkGroups[3] = {swgs.x, swgs.y, swgs.z};
+        size_t numWorkGroups[3] = {nwgs.x, nwgs.y, nwgs.z};
+        auto localWorkSize = setGpgpuWalkerThreadData<GfxFamily>(pGpGpuWalkerCmd, globalOffsets, startWorkGroups, numWorkGroups, localWorkSizes, simd);
+
+        pGpGpuWalkerCmd->setIndirectDataStartAddress((uint32_t)offsetCrossThreadData);
+        DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0);
+        pGpGpuWalkerCmd->setInterfaceDescriptorOffset(interfaceDescriptorIndex++);
+
+        auto threadPayload = kernel.getKernelInfo().patchInfo.threadPayload;
+        DEBUG_BREAK_IF(nullptr == threadPayload);
+
+        auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload);
+        auto localIdSizePerThread = PerThreadDataHelper::getLocalIdSizePerThread(simd, numChannels);
+        localIdSizePerThread = std::max(localIdSizePerThread, sizeof(GRF));
+
+        auto sizePerThreadDataTotal = getThreadsPerWG(simd, localWorkSize) * localIdSizePerThread;
+        DEBUG_BREAK_IF(sizePerThreadDataTotal == 0); // Hardware requires at least 1 GRF of perThreadData for each thread in thread group
+
+        auto sizeCrossThreadData = kernel.getCrossThreadDataSize();
+        auto IndirectDataLength = alignUp((uint32_t)(sizeCrossThreadData + sizePerThreadDataTotal), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
+        pGpGpuWalkerCmd->setIndirectDataLength(IndirectDataLength);
+
+        // Implement disabling special WA DisableLSQCROPERFforOCL if needed
+        applyWADisableLSQCROPERFforOCL<GfxFamily>(commandStream, kernel, false);
+    }
+
+    // If hwTimeStamps is passed (not nullptr), then we know that profiling is enabled
+    if (hwTimeStamps != nullptr) {
+        dispatchProfilingCommandsEnd<GfxFamily>(*hwTimeStamps, commandStream);
+    }
+    if (hwPerfCounter != nullptr) {
+        dispatchPerfCountersCommandsEnd<GfxFamily>(commandQueue, *hwPerfCounter, commandStream);
+    }
+}
+
+// Single-kernel convenience overload: packs the kernel and its ND-range arguments into a DispatchInfo and forwards everything else to the dispatchWalker overload that accepts it.
+template <typename GfxFamily>
+void dispatchWalker(
+    CommandQueue &commandQueue,
+    const Kernel &kernel,
+    cl_uint workDim,
+    const size_t globalOffsets[3],
+    const size_t workItems[3],
+    const size_t *localWorkSizesIn,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    KernelOperation **blockedCommandsData,
+    HwTimeStamps *hwTimeStamps,
+    HwPerfCounter *hwPerfCounter,
+    bool blockQueue = false) {
+    Kernel *dispatchedKernel = const_cast<Kernel *>(&kernel);
+    DispatchInfo singleDispatch(dispatchedKernel, workDim, workItems, localWorkSizesIn, globalOffsets);
+    dispatchWalker<GfxFamily>(commandQueue, singleDispatch, numEventsInWaitList, eventWaitList, blockedCommandsData, hwTimeStamps, hwPerfCounter, blockQueue);
+}
+
+template <typename GfxFamily>
+void dispatchScheduler(
+    CommandQueue &commandQueue,
+    DeviceQueueHw<GfxFamily> &devQueueHw,
+    SchedulerKernel &scheduler) {
+    // Programs the scheduler kernel for execution: patches its work-size constants, sends its indirect state and emits one GPGPU_WALKER into commandQueue's command stream, using the DSH and descriptor offsets obtained from devQueueHw.
+    using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
+    using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
+    using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL; // NOTE(review): this alias is not referenced anywhere else in the function
+    using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;
+
+    OCLRT::LinearStream *commandStream = nullptr;
+    OCLRT::IndirectHeap *dsh = nullptr, *ish = nullptr, *ioh = nullptr, *ssh = nullptr;
+
+    commandStream = &commandQueue.getCS(0);
+    // Note: the code below assumes that the caller of dispatchScheduler has already reserved
+    //       ("preallocated") the memory the execution model requires in the heaps fetched below
+    dsh = devQueueHw.getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
+    ish = &commandQueue.getIndirectHeap(IndirectHeap::INSTRUCTION);
+    ssh = &commandQueue.getIndirectHeap(IndirectHeap::SURFACE_STATE);
+    // Start with a pipe control whose dcFlush argument is false
+    bool dcFlush = false;
+    commandQueue.getDevice().getCommandStreamReceiver().addPipeControl(*commandStream, dcFlush);
+    // Descriptor index and table geometry come from devQueueHw; the table offset equals devQueueHw.colorCalcStateSize
+    uint32_t interfaceDescriptorIndex = devQueueHw.schedulerIDIndex;
+    const size_t offsetInterfaceDescriptorTable = devQueueHw.colorCalcStateSize;
+    const size_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable;
+    const size_t totalInterfaceDescriptorTableSize = devQueueHw.interfaceDescriptorEntries * sizeof(INTERFACE_DESCRIPTOR_DATA);
+
+    // Program media interface descriptor load
+    KernelCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
+        *commandStream,
+        offsetInterfaceDescriptor,
+        totalInterfaceDescriptorTableSize);
+
+    DEBUG_BREAK_IF(offsetInterfaceDescriptorTable % 64 != 0);
+
+    // Determine SIMD size (debug builds break if it differs from PARALLEL_SCHEDULER_COMPILATION_SIZE_20)
+    uint32_t simd = scheduler.getKernelInfo().getMaxSimdSize();
+    DEBUG_BREAK_IF(simd != PARALLEL_SCHEDULER_COMPILATION_SIZE_20);
+
+    // Patch the scheduler kernel's constants for a one-dimensional dispatch with zero global offset
+    *scheduler.globalWorkOffsetX = 0;
+    *scheduler.globalWorkOffsetY = 0;
+    *scheduler.globalWorkOffsetZ = 0;
+
+    *scheduler.globalWorkSizeX = (uint32_t)scheduler.getGws();
+    *scheduler.globalWorkSizeY = 1;
+    *scheduler.globalWorkSizeZ = 1;
+
+    *scheduler.localWorkSizeX = (uint32_t)scheduler.getLws();
+    *scheduler.localWorkSizeY = 1;
+    *scheduler.localWorkSizeZ = 1;
+
+    *scheduler.localWorkSizeX2 = (uint32_t)scheduler.getLws();
+    *scheduler.localWorkSizeY2 = 1;
+    *scheduler.localWorkSizeZ2 = 1;
+
+    *scheduler.enqueuedLocalWorkSizeX = (uint32_t)scheduler.getLws();
+    *scheduler.enqueuedLocalWorkSizeY = 1;
+    *scheduler.enqueuedLocalWorkSizeZ = 1;
+
+    *scheduler.numWorkGroupsX = (uint32_t)(scheduler.getGws() / scheduler.getLws());
+    *scheduler.numWorkGroupsY = 0; // NOTE(review): Y/Z are patched as 0 although the walker below is given 1 work group in Y and Z - confirm this is intended
+    *scheduler.numWorkGroupsZ = 0;
+
+    *scheduler.workDim = 1;
+
+    // Send our indirect object data
+    size_t localWorkSizes[3] = {scheduler.getLws(), 1, 1};
+
+    // Create an IndirectHeap for the IOH that is located at the end of the device enqueue DSH:
+    // a local heap over the DSH's base and size, advanced by curbeOffset via getSpace
+    size_t curbeOffset = devQueueHw.setSchedulerCrossThreadData(scheduler);
+    IndirectHeap indirectObjectHeap(dsh->getBase(), dsh->getMaxAvailableSpace());
+    indirectObjectHeap.getSpace(curbeOffset);
+    ioh = &indirectObjectHeap;
+
+    auto offsetCrossThreadData = KernelCommandsHelper<GfxFamily>::sendIndirectState(
+        *commandStream,
+        *dsh,
+        *ish,
+        *ioh,
+        *ssh,
+        scheduler,
+        simd,
+        localWorkSizes,
+        offsetInterfaceDescriptorTable,
+        interfaceDescriptorIndex);
+
+    // Implement enabling special WA DisableLSQCROPERFforOCL if needed
+    applyWADisableLSQCROPERFforOCL<GfxFamily>(commandStream, scheduler, true);
+
+    // Program the walker.  Invokes execution so all state should already be programmed
+    auto pGpGpuWalkerCmd = (GPGPU_WALKER *)commandStream->getSpace(sizeof(GPGPU_WALKER));
+    *pGpGpuWalkerCmd = GfxFamily::cmdInitGpgpuWalker;
+    // One-dimensional walk of GWS / LWS work groups; the all-zero globalOffsets array is passed for both offset arguments
+    size_t globalOffsets[3] = {0, 0, 0};
+    size_t workGroups[3] = {(scheduler.getGws() / scheduler.getLws()), 1, 1};
+    auto localWorkSize = setGpgpuWalkerThreadData<GfxFamily>(pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes, simd);
+
+    pGpGpuWalkerCmd->setIndirectDataStartAddress((uint32_t)offsetCrossThreadData);
+    DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0);
+    pGpGpuWalkerCmd->setInterfaceDescriptorOffset(interfaceDescriptorIndex);
+
+    auto threadPayload = scheduler.getKernelInfo().patchInfo.threadPayload;
+    DEBUG_BREAK_IF(nullptr == threadPayload);
+
+    auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload);
+    auto localIdSizePerThread = PerThreadDataHelper::getLocalIdSizePerThread(simd, numChannels);
+    localIdSizePerThread = std::max(localIdSizePerThread, sizeof(GRF));
+
+    auto sizePerThreadDataTotal = getThreadsPerWG(simd, localWorkSize) * localIdSizePerThread;
+    DEBUG_BREAK_IF(sizePerThreadDataTotal == 0); // Hardware requires at least 1 GRF of perThreadData for each thread in thread group
+    // Indirect data length = cross-thread data + per-thread data, aligned up to INDIRECTDATASTARTADDRESS_ALIGN_SIZE
+    auto sizeCrossThreadData = scheduler.getCrossThreadDataSize();
+    auto IndirectDataLength = alignUp((uint32_t)(sizeCrossThreadData + sizePerThreadDataTotal), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
+    pGpGpuWalkerCmd->setIndirectDataLength(IndirectDataLength);
+
+    // Implement disabling special WA DisableLSQCROPERFforOCL if needed
+    applyWADisableLSQCROPERFforOCL<GfxFamily>(commandStream, scheduler, false);
+
+    // Emit the BB_START unless getSchedulerReturnInstance() == 1, i.e. skip it only when returning in the first scheduler run
+    if (devQueueHw.getSchedulerReturnInstance() != 1) {
+        // Pipe control with the second argument set to true (the one at the top of the function passes dcFlush = false)
+        commandQueue.getDevice().getCommandStreamReceiver().addPipeControl(*commandStream, true);
+
+        // Add a BB_START command to the primary batch buffer that targets the GPU address of the SLB
+        auto *bbStart = (MI_BATCH_BUFFER_START *)commandStream->getSpace(sizeof(MI_BATCH_BUFFER_START));
+        *bbStart = MI_BATCH_BUFFER_START::sInit();
+        bbStart->setSecondLevelBatchBuffer(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH);
+        uint64_t slbAddress = devQueueHw.getSlbBuffer()->getGpuAddress();
+        bbStart->setBatchBufferStartAddressGraphicsaddress472(slbAddress);
+    }
+}
+
+// Estimates the command stream (CS) space, in bytes, that an enqueue of type eventType has to reserve.
+// CL_COMMAND_NDRANGE_KERNEL, CL_COMMAND_MARKER and CL_COMMAND_MIGRATE_MEM_OBJECTS are rejected by the
+// static_asserts below; those event types are expected to use a specialization of this template.
+template <typename GfxFamily, unsigned int eventType>
+struct EnqueueOperation {
+    static_assert(eventType != CL_COMMAND_NDRANGE_KERNEL, "for eventType CL_COMMAND_NDRANGE_KERNEL use specialization class");
+    static_assert(eventType != CL_COMMAND_MARKER, "for eventType CL_COMMAND_MARKER use specialization class");
+    static_assert(eventType != CL_COMMAND_MIGRATE_MEM_OBJECTS, "for eventType CL_COMMAND_MIGRATE_MEM_OBJECTS use specialization class");
+
+    // CS size for all dispatches of multiDispatchInfo: the per-enqueue commands are counted once,
+    // then one GPGPU_WALKER plus the DisableLSQCROPERFforOCL workaround size is added per dispatch.
+    //   reserveProfilingCmdsSpace - include the profiling commands
+    //   reservePerfCounters       - include the perf counter start and end commands
+    //   commandQueue              - queried for the number of perf counter user registers
+    //   multiDispatchInfo         - dispatches whose walkers are accounted for
+    // Returns the accumulated size in bytes.
+    static size_t getTotalSizeRequiredCS(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) {
+        size_t size = getSizeOfPerEnqueueCmds(reserveProfilingCmdsSpace, reservePerfCounters, commandQueue);
+        for (auto &dispatchInfo : multiDispatchInfo) {
+            size += sizeof(typename GfxFamily::GPGPU_WALKER);
+            size += getSizeForWADisableLSQCROPERFforOCL<GfxFamily>(dispatchInfo.getKernel());
+        }
+        return size;
+    }
+
+    // CS size for a single dispatch of pKernel: one GPGPU_WALKER, the per-enqueue commands and the
+    // DisableLSQCROPERFforOCL workaround size for that kernel.
+    //   pKernel - only forwarded to getSizeForWADisableLSQCROPERFforOCL; the flags and commandQueue are used as in getTotalSizeRequiredCS
+    // Returns the accumulated size in bytes.
+    static size_t getSizeRequiredCS(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel) {
+        size_t size = sizeof(typename GfxFamily::GPGPU_WALKER);
+        size += getSizeOfPerEnqueueCmds(reserveProfilingCmdsSpace, reservePerfCounters, commandQueue);
+        size += getSizeForWADisableLSQCROPERFforOCL<GfxFamily>(pKernel);
+        return size;
+    }
+
+  private:
+    // Commands counted once per enqueue, independent of the number of walkers: the KernelCommandsHelper
+    // requirement, one PIPE_CONTROL (two when the pipe control WA is required) and the optional
+    // profiling / perf counter commands. Both public estimators share it so they cannot drift apart.
+    static size_t getSizeOfPerEnqueueCmds(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue) {
+        size_t size = KernelCommandsHelper<GfxFamily>::getSizeRequiredCS() +
+                      sizeof(typename GfxFamily::PIPE_CONTROL) * (KernelCommandsHelper<GfxFamily>::isPipeControlWArequired() ? 2 : 1);
+        if (reserveProfilingCmdsSpace) {
+            size += getSizeOfProfilingCmds();
+        }
+        if (reservePerfCounters) {
+            size += getSizeOfPerfCountersCmds(commandQueue);
+        }
+        return size;
+    }
+
+    // Space reserved for profiling: 2 PIPE_CONTROLs and 4 MI_STORE_REGISTER_MEMs.
+    static size_t getSizeOfProfilingCmds() {
+        return 2 * sizeof(typename GfxFamily::PIPE_CONTROL) + 4 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
+    }
+
+    // Space for the perf counter commands emitted at the start and at the end of the enqueue.
+    // The user register count is queried from commandQueue once and used for both halves.
+    static size_t getSizeOfPerfCountersCmds(CommandQueue &commandQueue) {
+        // Sizes of the individual commands involved
+        const size_t sizeOfPipeControl = sizeof(typename GfxFamily::PIPE_CONTROL);
+        const size_t sizeOfSrm = sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
+        const size_t sizeOfReportPerfCount = sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
+        const size_t sizeOfGpRegisters = OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeOfSrm;
+        const size_t sizeOfUserRegisters = commandQueue.getPerfCountersUserRegistersNumber() * sizeOfSrm;
+
+        //start cmds
+        size_t size = 2 * sizeOfPipeControl; //P_C: flush CS & TimeStamp BEGIN
+        size += 2 * sizeOfSrm;               //SRM NOOPID & Frequency
+        size += sizeOfGpRegisters;           //gp registers
+        size += sizeOfReportPerfCount;       //report perf count
+        size += sizeOfUserRegisters;         //user registers
+
+        //end cmds
+        size += 2 * sizeOfPipeControl; //P_C: flush CS & TimeStamp END
+        size += 3 * sizeOfSrm;         //OA buffer (status head, tail)
+        size += sizeOfReportPerfCount; //report perf count
+        size += sizeOfGpRegisters;     //gp registers
+        size += 2 * sizeOfSrm;         //SRM NOOPID & Frequency
+        size += sizeOfUserRegisters;   //user registers
+
+        return size;
+    }
+};
+
+// Single-kernel variant: passes the CS size estimated by EnqueueOperation<GfxFamily, eventType>::getSizeRequiredCS to commandQueue.getCS and returns the resulting stream.
+template <typename GfxFamily, unsigned int eventType>
+LinearStream &getCommandStream(CommandQueue &commandQueue, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, const Kernel *pKernel) {
+    return commandQueue.getCS(EnqueueOperation<GfxFamily, eventType>::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, pKernel));
+}
+
+// Multi-dispatch variant: sums the per-kernel CS estimates of every dispatch (plus the scheduler kernel's estimate when the first kernel is a parent kernel) and passes the total to commandQueue.getCS.
+template <typename GfxFamily, unsigned int eventType>
+LinearStream &getCommandStream(CommandQueue &commandQueue, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, const MultiDispatchInfo &multiDispatchInfo) {
+    size_t totalSizeCS = 0;
+    for (auto &dispatchInfo : multiDispatchInfo) {
+        totalSizeCS += EnqueueOperation<GfxFamily, eventType>::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, dispatchInfo.getKernel());
+    }
+    Kernel *firstKernel = (multiDispatchInfo.size() == 0) ? nullptr : multiDispatchInfo.begin()->getKernel();
+    if (firstKernel != nullptr && firstKernel->isParentKernel) {
+        totalSizeCS += EnqueueOperation<GfxFamily, eventType>::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, &BuiltIns::getInstance().getSchedulerKernel(firstKernel->getContext()));
+    }
+    return commandQueue.getCS(totalSizeCS);
+}
+
+// Returns the indirect heap of type heapType for the dispatches in multiDispatchInfo, requesting the size computed by KernelCommandsHelper from the command queue.
+// If the first kernel is a parent kernel, INSTRUCTION/SURFACE_STATE requests also include the execution model size, while DYNAMIC_STATE/INDIRECT_OBJECT return the default device queue's DSH.
+template <typename GfxFamily, IndirectHeap::Type heapType>
+IndirectHeap &getIndirectHeap(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) {
+    size_t expectedSize = 0;
+    IndirectHeap *ih = nullptr;
+
+    // clang-format off
+    switch(heapType) {
+    case IndirectHeap::DYNAMIC_STATE:   expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredDSH(multiDispatchInfo); break;
+    case IndirectHeap::INSTRUCTION:     expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredIH( multiDispatchInfo); break;
+    case IndirectHeap::INDIRECT_OBJECT: expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredIOH(multiDispatchInfo); break;
+    case IndirectHeap::SURFACE_STATE:   expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredSSH(multiDispatchInfo); break;
+    }
+    // clang-format on
+
+    // begin() of an empty MultiDispatchInfo must not be dereferenced, so the first kernel is looked up behind the same size() guard that getCommandStream uses.
+    const Kernel *firstKernel = multiDispatchInfo.size() > 0 ? multiDispatchInfo.begin()->getKernel() : nullptr;
+    if (firstKernel != nullptr && firstKernel->isParentKernel) {
+        if (heapType == IndirectHeap::INSTRUCTION || heapType == IndirectHeap::SURFACE_STATE) {
+            expectedSize += KernelCommandsHelper<GfxFamily>::template getSizeRequiredForExecutionModel<heapType>(*firstKernel);
+        } else { // heapType == IndirectHeap::DYNAMIC_STATE || heapType == IndirectHeap::INDIRECT_OBJECT
+            DeviceQueueHw<GfxFamily> *pDevQueue = castToObject<DeviceQueueHw<GfxFamily>>(commandQueue.getContext().getDefaultDeviceQueue());
+            DEBUG_BREAK_IF(pDevQueue == nullptr);
+            ih = pDevQueue->getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
+        }
+    }
+
+    return (ih != nullptr) ? *ih : commandQueue.getIndirectHeap(heapType, expectedSize);
+}
+} // namespace OCLRT
--- a/runtime/command_queue/dispatch_walker_helper.h
+++ b/runtime/command_queue/dispatch_walker_helper.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+
+namespace OCLRT {
+// Constants for the DisableLSQCROPERFforOCL workaround - presumably a register offset (L3SQC_REG4) and the bit toggled in it; TODO confirm against the hardware documentation.
+#define L3SQC_BIT_LQSC_RO_PERF_DIS 0x08000000
+#define L3SQC_REG4 0xB118
+// Workaround hook declared here and defined elsewhere; the dispatch code calls it with disablePerfMode = true before programming a walker and with false after it.
+template <typename GfxFamily>
+void applyWADisableLSQCROPERFforOCL(OCLRT::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode);
+// Size query for the same workaround; the dispatch code adds its result to the command stream size estimate for pKernel.
+template <typename GfxFamily>
+size_t getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel);
+} // namespace OCLRT
--- a/runtime/command_queue/dispatch_walker_helper.inl
+++ b/runtime/command_queue/dispatch_walker_helper.inl
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+namespace OCLRT {
+
+// Registers that the command sequence below loads, operates on and reads back
+#define CS_GPR_R0 0x2600
+#define CS_GPR_R1 0x2608
+
+// Number of ALU instructions in the MI_MATH command and the opcodes they may carry
+#define NUM_ALU_INST_FOR_READ_MODIFY_WRITE 4
+#define ALU_OPCODE_LOAD 0x080
+#define ALU_OPCODE_STORE 0x180
+#define ALU_OPCODE_OR 0x103
+#define ALU_OPCODE_AND 0x102
+
+// Operand encodings for the ALU instructions
+#define ALU_REGISTER_R_0 0x0
+#define ALU_REGISTER_R_1 0x1
+#define ALU_REGISTER_R_SRCA 0x20
+#define ALU_REGISTER_R_SRCB 0x21
+#define ALU_REGISTER_R_ACCU 0x31
+
+// Appends to pCommandStream the commands for a read-modify-write of a register: Register = Register <operation> mask.
+//   pCommandStream - stream the commands are written to; space is taken from it with getSpace
+//   aluRegister    - address of the register; source of the first load and destination of the final load
+//   operation      - ALU opcode written verbatim into the third ALU instruction (e.g. ALU_OPCODE_OR, ALU_OPCODE_AND)
+//   mask           - immediate value used as the second operand
+// Emitted, in order: MI_LOAD_REGISTER_REG, MI_LOAD_REGISTER_IMM, MI_MATH with 4 inline ALU instructions,
+// MI_LOAD_REGISTER_REG and a PIPE_CONTROL.
+template <typename GfxFamily>
+void addAluReadModifyWriteRegister(
+    OCLRT::LinearStream *pCommandStream,
+    uint32_t aluRegister,
+    uint32_t operation,
+    uint32_t mask) {
+    using MI_LOAD_REGISTER_REG = typename GfxFamily::MI_LOAD_REGISTER_REG;
+    using MI_LOAD_REGISTER_IMM = typename GfxFamily::MI_LOAD_REGISTER_IMM;
+    using MI_MATH = typename GfxFamily::MI_MATH;
+    using MI_MATH_ALU_INST_INLINE = typename GfxFamily::MI_MATH_ALU_INST_INLINE;
+    using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
+
+    // Copy the current value of "Register" into CS_GPR_R0
+    auto *loadRegisterIntoGpr = reinterpret_cast<MI_LOAD_REGISTER_REG *>(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_REG)));
+    *loadRegisterIntoGpr = MI_LOAD_REGISTER_REG::sInit();
+    loadRegisterIntoGpr->setSourceRegisterAddress(aluRegister);
+    loadRegisterIntoGpr->setDestinationRegisterAddress(CS_GPR_R0);
+
+    // Put "Mask" into CS_GPR_R1
+    auto *loadMaskIntoGpr = reinterpret_cast<MI_LOAD_REGISTER_IMM *>(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_IMM)));
+    *loadMaskIntoGpr = MI_LOAD_REGISTER_IMM::sInit();
+    loadMaskIntoGpr->setRegisterOffset(CS_GPR_R1);
+    loadMaskIntoGpr->setDataDword(mask);
+
+    // MI_MATH header followed by NUM_ALU_INST_FOR_READ_MODIFY_WRITE inline ALU instructions, taken as one allocation
+    const size_t mathCmdSize = sizeof(MI_MATH) + NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE);
+    auto *mathCmd = reinterpret_cast<MI_MATH *>(pCommandStream->getSpace(mathCmdSize));
+    mathCmd->DW0.Value = 0x0;
+    mathCmd->DW0.BitField.InstructionType = MI_MATH::COMMAND_TYPE_MI_COMMAND;
+    mathCmd->DW0.BitField.InstructionOpcode = MI_MATH::MI_COMMAND_OPCODE_MI_MATH;
+    // Length field is the total of 5 dwords (1 MI_MATH + 4 ALU instructions) minus 2, i.e. 0x3
+    mathCmd->DW0.BitField.DwordLength = NUM_ALU_INST_FOR_READ_MODIFY_WRITE - 1;
+
+    // The ALU instructions begin one dword past the start of the MI_MATH command
+    auto *aluInstructions = reinterpret_cast<MI_MATH_ALU_INST_INLINE *>(reinterpret_cast<uint32_t *>(mathCmd) + 1);
+    // Writes the three bit fields of a single ALU instruction
+    auto encodeAluInstruction = [](MI_MATH_ALU_INST_INLINE &instruction, uint32_t opcode, uint32_t operand1, uint32_t operand2) {
+        instruction.DW0.BitField.ALUOpcode = opcode;
+        instruction.DW0.BitField.Operand1 = operand1;
+        instruction.DW0.BitField.Operand2 = operand2;
+    };
+    // Load CS_GPR_R0 into register A, load CS_GPR_R1 into register B, apply "Operation", store the result into CS_GPR_R0
+    encodeAluInstruction(aluInstructions[0], ALU_OPCODE_LOAD, ALU_REGISTER_R_SRCA, ALU_REGISTER_R_0);
+    encodeAluInstruction(aluInstructions[1], ALU_OPCODE_LOAD, ALU_REGISTER_R_SRCB, ALU_REGISTER_R_1);
+    encodeAluInstruction(aluInstructions[2], operation, 0, 0);
+    encodeAluInstruction(aluInstructions[3], ALU_OPCODE_STORE, ALU_REGISTER_R_0, ALU_REGISTER_R_ACCU);
+
+    // Write the value of CS_GPR_R0 back into "Register"
+    auto *storeGprIntoRegister = reinterpret_cast<MI_LOAD_REGISTER_REG *>(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_REG)));
+    *storeGprIntoRegister = MI_LOAD_REGISTER_REG::sInit();
+    storeGprIntoRegister->setSourceRegisterAddress(CS_GPR_R0);
+    storeGprIntoRegister->setDestinationRegisterAddress(aluRegister);
+
+    // Finish with a PIPE_CONTROL that flushes caches
+    auto *pipeControl = reinterpret_cast<PIPE_CONTROL *>(pCommandStream->getSpace(sizeof(PIPE_CONTROL)));
+    *pipeControl = PIPE_CONTROL::sInit();
+    pipeControl->setCommandStreamerStallEnable(true);
+    pipeControl->setDcFlushEnable(true);
+    pipeControl->setTextureCacheInvalidationEnable(true);
+    pipeControl->setPipeControlFlushEnable(true);
+    pipeControl->setStateCacheInvalidationEnable(true);
+}
+} // namespace OCLRT
--- a/runtime/command_queue/enqueue_barrier.h
+++ b/runtime/command_queue/enqueue_barrier.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+#include "hw_cmds.h"
+#include "runtime/command_queue/command_queue_hw.h"
+#include "runtime/command_stream/command_stream_receiver.h"
+#include "runtime/device/device.h"
+#include "runtime/event/event.h"
+#include "runtime/memory_manager/surface.h"
+#include <new>
+
+namespace OCLRT {
+
+// Handles a barrier enqueue by routing a CL_COMMAND_BARRIER through enqueueHandler with a single NullSurface.
+// The wait list and the output event pointer are forwarded unchanged; CL_SUCCESS is returned unconditionally.
+template <typename GfxFamily>
+cl_int CommandQueueHw<GfxFamily>::enqueueBarrierWithWaitList(
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event) {
+    NullSurface nullSurface;
+    Surface *surfaceList[] = {&nullSurface};
+    cl_uint workDim = 1;
+
+    // NOTE(review): the false/nullptr arguments are presumably "non-blocking", "no kernel" and
+    // "no offsets/sizes" - confirm against enqueueHandler's declaration.
+    enqueueHandler<CL_COMMAND_BARRIER>(
+        surfaceList,
+        false,
+        nullptr, workDim, nullptr, nullptr, nullptr,
+        numEventsInWaitList, eventWaitList, event);
+
+    return CL_SUCCESS;
+}
+} // namespace OCLRT
--- a/runtime/command_queue/enqueue_common.h
+++ b/runtime/command_queue/enqueue_common.h
@@ -0,0 +1,687 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+#include "runtime/builtin_kernels_simulation/scheduler_simulation.h"
+#include "hw_cmds.h"
+#include "runtime/command_queue/command_queue_hw.h"
+#include "runtime/command_queue/dispatch_walker.h"
+#include "runtime/command_stream/command_stream_receiver.h"
+#include "runtime/event/event_builder.h"
+#include "runtime/helpers/kernel_commands.h"
+#include "runtime/helpers/dispatch_info_builder.h"
+#include "runtime/mem_obj/buffer.h"
+#include "runtime/memory_manager/memory_manager.h"
+#include "runtime/memory_manager/surface.h"
+#include "runtime/built_ins/built_ins.h"
+#include "runtime/helpers/array_count.h"
+#include "runtime/helpers/options.h"
+#include "runtime/helpers/task_information.h"
+#include "runtime/program/printf_handler.h"
+#include "runtime/program/block_kernel_manager.h"
+#include "runtime/utilities/range.h"
+#include <new>
+#include <memory>
+
+namespace OCLRT {
+
+// Tells whether the DC-flush flag should be requested for a command: true for the
+// buffer/image read commands and SVM map, and for any command that has a printf handler.
+inline bool shouldFlushDC(unsigned int commandType, PrintfHandler *printfHandler) {
+    switch (commandType) {
+    case CL_COMMAND_READ_BUFFER:
+    case CL_COMMAND_READ_BUFFER_RECT:
+    case CL_COMMAND_READ_IMAGE:
+    case CL_COMMAND_SVM_MAP:
+        return true;
+    default:
+        // every other command flushes only when a printf handler is attached
+        return printfHandler != nullptr;
+    }
+}
+
+// Returns true for the command types that are enqueued without any kernel:
+// barrier, marker, migrate-mem-objects and the SVM map / unmap / free commands.
+inline bool isCommandWithoutKernel(unsigned int commandType) {
+    switch (commandType) {
+    case CL_COMMAND_BARRIER:
+    case CL_COMMAND_MARKER:
+    case CL_COMMAND_MIGRATE_MEM_OBJECTS:
+    case CL_COMMAND_SVM_MAP:
+    case CL_COMMAND_SVM_UNMAP:
+    case CL_COMMAND_SVM_FREE:
+        return true;
+    default:
+        return false;
+    }
+}
+
+// Hook called by enqueueHandler with the command type and the dispatch info, after the
+// task level has been settled and before anything is dispatched. This generic definition
+// is intentionally empty.
+// NOTE(review): presumably a customization point (e.g. specialized per family or in tests) — confirm.
+template <typename GfxFamily>
+void CommandQueueHw<GfxFamily>::enqueueHandlerHook(const unsigned int commandType, const MultiDispatchInfo &dispatchInfo) {}
+
+// Kernel-level front end of the enqueue path: turns (kernel, geometry) into a
+// MultiDispatchInfo and forwards it to the MultiDispatchInfo-based enqueueHandler.
+//  - no kernel: an empty MultiDispatchInfo is forwarded;
+//  - ForceDispatchScheduler debug flag set: the scheduler kernel is dispatched instead;
+//  - kernel with a builtin dispatch builder: that builder produces the dispatch infos,
+//    and nothing at all is enqueued when it produces none;
+//  - otherwise a 3D walker-split DispatchInfoBuilder bakes the dispatch.
+template <typename GfxFamily>
+template <unsigned int commandType, size_t surfaceCount>
+void CommandQueueHw<GfxFamily>::enqueueHandler(Surface *(&surfaces)[surfaceCount],
+                                               bool blocking,
+                                               Kernel *kernel,
+                                               cl_uint workDim,
+                                               const size_t globalOffsets[3],
+                                               const size_t workItems[3],
+                                               const size_t *localWorkSizesIn,
+                                               cl_uint numEventsInWaitList,
+                                               const cl_event *eventWaitList,
+                                               cl_event *event) {
+    if (kernel == nullptr) {
+        enqueueHandler<commandType>(surfaces, blocking, MultiDispatchInfo(), numEventsInWaitList, eventWaitList, event);
+        return;
+    }
+
+    MultiDispatchInfo multiDispatchInfo;
+
+    if (DebugManager.flags.ForceDispatchScheduler.get()) {
+        forceDispatchScheduler(multiDispatchInfo);
+    } else {
+        auto builtinBuilder = kernel->getKernelInfo().builtinDispatchBuilder;
+        if (builtinBuilder != nullptr) {
+            builtinBuilder->buildDispatchInfos(multiDispatchInfo, kernel, workDim, workItems, localWorkSizesIn, globalOffsets);
+
+            // the builtin builder decided there is nothing to run
+            if (multiDispatchInfo.size() == 0) {
+                return;
+            }
+        } else {
+            DispatchInfoBuilder<SplitDispatch::Dim::d3D, SplitDispatch::SplitMode::WalkerSplit> walkerBuilder;
+            walkerBuilder.setDispatchGeometry(workDim, workItems, localWorkSizesIn, globalOffsets);
+            walkerBuilder.setKernel(kernel);
+            walkerBuilder.bake(multiDispatchInfo);
+        }
+    }
+
+    enqueueHandler<commandType>(surfaces, blocking, multiDispatchInfo, numEventsInWaitList, eventWaitList, event);
+}
+
+// Debug path (used when the ForceDispatchScheduler flag is set): appends a single
+// one-dimensional dispatch of the builtin scheduler kernel to multiDispatchInfo.
+// Before that it creates the scheduler's reflection surface, resets the context's
+// default device queue and binds the device-queue buffers as scheduler arguments.
+// NOTE(review): the castToObject result is used without a null check — confirm that a
+// default device queue always exists when this path is taken.
+template <typename GfxFamily>
+void CommandQueueHw<GfxFamily>::forceDispatchScheduler(OCLRT::MultiDispatchInfo &multiDispatchInfo) {
+    SchedulerKernel &schedulerKernel = BuiltIns::getInstance().getSchedulerKernel(this->getContext());
+
+    const Vec3<size_t> globalSize(schedulerKernel.getGws(), 1, 1);
+    const Vec3<size_t> localSize(schedulerKernel.getLws(), 1, 1);
+    const Vec3<size_t> noOffset(0, 0, 0);
+    DispatchInfo schedulerDispatch(&schedulerKernel, 1, globalSize, localSize, noOffset);
+
+    DeviceQueueHw<GfxFamily> *deviceQueue =
+        castToObject<DeviceQueueHw<GfxFamily>>(this->getContext().getDefaultDeviceQueue());
+
+    schedulerKernel.createReflectionSurface();
+    GraphicsAllocation *reflection = schedulerKernel.getKernelReflectionSurface();
+
+    deviceQueue->resetDeviceQueue();
+
+    schedulerKernel.setArgs(deviceQueue->getQueueBuffer(),
+                            deviceQueue->getStackBuffer(),
+                            deviceQueue->getEventPoolBuffer(),
+                            deviceQueue->getSlbBuffer(),
+                            deviceQueue->getDshBuffer(),
+                            reflection,
+                            deviceQueue->getQueueStorageBuffer(),
+                            this->getIndirectHeap(IndirectHeap::SURFACE_STATE).getGraphicsAllocation());
+
+    multiDispatchInfo.push(schedulerDispatch);
+}
+
+// Common enqueue path for a command of type `commandType`.
+//
+// Parameters:
+//   surfacesForResidency / numSurfaceForResidency - surfaces forwarded to enqueueNonBlocked or
+//       enqueueBlocked; on the blocking, non-blocked path they also receive the completion stamp.
+//   blocking            - when true (or when the MakeEachEnqueueBlocking debug flag is set) the call
+//                         waits for completion; enqueueNonBlocked takes it by reference.
+//   multiDispatchInfo   - kernels to dispatch; may be empty.
+//   numEventsInWaitList / eventWaitList - events this command depends on.
+//   event               - optional output; receives the event created for this command.
+//
+// Outline of the body:
+//   1. An empty multiDispatchInfo for a command type not listed in isCommandWithoutKernel() is
+//      re-routed to the CL_COMMAND_MARKER instantiation; the output event keeps the original type.
+//   2. The output event is created (if requested) and the queue timestamp recorded.
+//   3. The task level is derived from the wait list and the "queue is blocked" decision is made.
+//   4. A non-empty dispatch is programmed through dispatchWalker.
+//   5. Not blocked: kernel commands are submitted via enqueueNonBlocked, commands without kernel
+//      inherit a completion stamp from the queue and the wait list.
+//      Blocked: the work is handed over to enqueueBlocked.
+//   6. When blocking, waitUntilComplete is called before returning.
+template <typename GfxFamily>
+template <unsigned int commandType>
+void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
+                                               size_t numSurfaceForResidency,
+                                               bool blocking,
+                                               const MultiDispatchInfo &multiDispatchInfo,
+                                               cl_uint numEventsInWaitList,
+                                               const cl_event *eventWaitList,
+                                               cl_event *event) {
+    // Nothing to dispatch for a command that normally carries a kernel: enqueue it as a marker
+    // and restore the original command type on the returned event.
+    if (multiDispatchInfo.empty() && !isCommandWithoutKernel(commandType)) {
+        enqueueHandler<CL_COMMAND_MARKER>(surfacesForResidency, numSurfaceForResidency, blocking, multiDispatchInfo,
+                                          numEventsInWaitList, eventWaitList, event);
+        if (event) {
+            castToObjectOrAbort<Event>(*event)->setCmdType(commandType);
+        }
+        return;
+    }
+
+    // Only the first dispatch is inspected to decide whether this is a parent (execution model) kernel.
+    bool executionModelKernel = multiDispatchInfo.empty() ? false : multiDispatchInfo.begin()->getKernel()->isParentKernel;
+    Kernel *parentKernel = executionModelKernel ? multiDispatchInfo.begin()->getKernel() : nullptr;
+    auto devQueue = this->getContext().getDefaultDeviceQueue();
+    // NOTE(review): devQueueHw is only dereferenced on execution-model paths below, but never
+    // null-checked — confirm a default device queue always exists for parent kernels.
+    DeviceQueueHw<GfxFamily> *devQueueHw = castToObject<DeviceQueueHw<GfxFamily>>(devQueue);
+
+    HwTimeStamps *hwTimeStamps = nullptr;
+
+    // Device ownership is taken before queue ownership; both are released explicitly near the end.
+    TakeOwnershipWrapper<Device> deviceOwnership(*device);
+
+    TimeStampData queueTimeStamp;
+    if (isProfilingEnabled() && event) {
+        this->getDevice().getOSTime()->getCpuGpuTime(&queueTimeStamp);
+    }
+
+    EventBuilder eventBuilder;
+    if (event) {
+        eventBuilder.create<Event>(this, commandType, Event::eventNotReady, 0);
+        *event = eventBuilder.getEvent();
+        if (eventBuilder.getEvent()->isProfilingEnabled()) {
+            eventBuilder.getEvent()->setQueueTimeStamp(&queueTimeStamp);
+            // Commands without kernel are switched to the CPU profiling path.
+            // NOTE(review): the parameterless setQueueTimeStamp() presumably re-stamps with a CPU time — confirm.
+            if (isCommandWithoutKernel(commandType)) {
+                eventBuilder.getEvent()->setCPUProfilingPath(true);
+                eventBuilder.getEvent()->setQueueTimeStamp();
+            }
+        }
+        DBG_LOG(EventsDebugEnable, "enqueueHandler commandType", commandType, "output Event", eventBuilder.getEvent());
+    }
+
+    bool profilingRequired = (this->isProfilingEnabled() && event != nullptr);
+    bool perfCountersRequired = false;
+    perfCountersRequired = (this->isPerfCountersEnabled() && event != nullptr);
+    // Filled in by dispatchWalker (it receives the address of this pointer); consumed by enqueueBlocked.
+    KernelOperation *blockedCommandsData = nullptr;
+    std::unique_ptr<PrintfHandler> printfHandler;
+    bool slmUsed = false;
+    TakeOwnershipWrapper<CommandQueueHw<GfxFamily>> queueOwnership(*this);
+
+    // Note: the local taskLevel hides the queue member; the member is always spelled this->taskLevel here.
+    auto taskLevel = getTaskLevelFromWaitList(this->taskLevel, numEventsInWaitList, eventWaitList);
+    auto blockQueue = (taskLevel == Event::eventNotReady) || isQueueBlocked();
+
+    // isQueueBlocked() may use commandStream resolving events tree, get start offset after the call
+    auto &commandStream = getCommandStream<GfxFamily, commandType>(*this, profilingRequired, perfCountersRequired, multiDispatchInfo);
+    auto commandStreamStart = commandStream.getUsed();
+    auto &commandStreamReceiver = device->getCommandStreamReceiver();
+
+    // isQueueBlocked may unblock queue, get new taskLevel
+    taskLevel = getTaskLevelFromWaitList(this->taskLevel, numEventsInWaitList, eventWaitList);
+
+    DBG_LOG(EventsDebugEnable, "blockQueue", blockQueue, "virtualEvent", virtualEvent, "taskLevel", taskLevel);
+
+    if (DebugManager.flags.MakeEachEnqueueBlocking.get()) {
+        blocking = true;
+    }
+
+    // Busy-wait (no sleep/yield) until the device queue reports its EM critical section as free.
+    if (executionModelKernel && !blockQueue) {
+        while (!devQueueHw->isEMCriticalSectionFree())
+            ;
+    }
+
+    auto updateTaskLevel = isTaskLevelUpdateRequired(taskLevel, eventWaitList, numEventsInWaitList, commandType);
+
+    if (updateTaskLevel) {
+        taskLevel++;
+    }
+
+    enqueueHandlerHook(commandType, multiDispatchInfo);
+
+    if (multiDispatchInfo.empty() == false) {
+        HwPerfCounter *hwPerfCounter = nullptr;
+        DebugManager.dumpKernelArgs(&multiDispatchInfo);
+
+        // PrintfHandler::create may yield no handler; the unique_ptr then stays empty.
+        printfHandler.reset(PrintfHandler::create(multiDispatchInfo, *device));
+        if (printfHandler) {
+            printfHandler.get()->prepareDispatch(multiDispatchInfo);
+        }
+
+        if ((this->isProfilingEnabled() && (eventBuilder.getEvent() != nullptr))) {
+            // Get allocation for timestamps
+            hwTimeStamps = eventBuilder.getEvent()->getHwTimeStamp();
+            if (this->isPerfCountersEnabled()) {
+                hwPerfCounter = eventBuilder.getEvent()->getHwPerfCounter();
+                //PERF COUNTER: copy current configuration from queue to event
+                eventBuilder.getEvent()->copyPerfCounters(this->getPerfCountersConfigData());
+            }
+        }
+
+        // Parent kernel: create its reflection surface and patch in the default device queue,
+        // event pool and reflection surface before the walker is programmed.
+        if (executionModelKernel) {
+            parentKernel->createReflectionSurface();
+            parentKernel->patchDefaultDeviceQueue(context->getDefaultDeviceQueue());
+            parentKernel->patchEventPool(context->getDefaultDeviceQueue());
+            parentKernel->patchReflectionSurface(context->getDefaultDeviceQueue(), printfHandler.get());
+            if (!blockQueue) {
+                devQueueHw->resetDeviceQueue();
+                devQueueHw->acquireEMCriticalSection();
+            }
+        }
+
+        dispatchWalker<GfxFamily>(
+            *this,
+            multiDispatchInfo,
+            numEventsInWaitList,
+            eventWaitList,
+            &blockedCommandsData,
+            hwTimeStamps,
+            hwPerfCounter,
+            blockQueue,
+            commandType);
+
+        commandStreamReceiver.setRequiredScratchSize(multiDispatchInfo.getRequiredScratchSize());
+
+        slmUsed = multiDispatchInfo.usesSlm();
+    }
+
+    CompletionStamp completionStamp;
+    if (!blockQueue) {
+        if (executionModelKernel) {
+            // Minimum instruction / surface-state heap sizes required by the execution model for the parent kernel.
+            size_t minSizeISHForEM = KernelCommandsHelper<GfxFamily>::template getSizeRequiredForExecutionModel<IndirectHeap::INSTRUCTION>(const_cast<const Kernel &>(*(multiDispatchInfo.begin()->getKernel())));
+            size_t minSizeSSHForEM = KernelCommandsHelper<GfxFamily>::template getSizeRequiredForExecutionModel<IndirectHeap::SURFACE_STATE>(const_cast<const Kernel &>(*(multiDispatchInfo.begin()->getKernel())));
+
+            // The task count passed on is the receiver's current task count plus one.
+            uint32_t taskCount = commandStreamReceiver.peekTaskCount() + 1;
+            devQueueHw->setupExecutionModelDispatch(getIndirectHeap(IndirectHeap::INSTRUCTION, minSizeISHForEM),
+                                                    getIndirectHeap(IndirectHeap::SURFACE_STATE, minSizeSSHForEM),
+                                                    multiDispatchInfo.begin()->getKernel(),
+                                                    (uint32_t)multiDispatchInfo.size(),
+                                                    taskCount,
+                                                    hwTimeStamps);
+
+            BuiltIns &builtIns = BuiltIns::getInstance();
+            SchedulerKernel &scheduler = builtIns.getSchedulerKernel(this->getContext());
+
+            scheduler.setArgs(devQueueHw->getQueueBuffer(),
+                              devQueueHw->getStackBuffer(),
+                              devQueueHw->getEventPoolBuffer(),
+                              devQueueHw->getSlbBuffer(),
+                              devQueueHw->getDshBuffer(),
+                              multiDispatchInfo.begin()->getKernel()->getKernelReflectionSurface(),
+                              devQueueHw->getQueueStorageBuffer(),
+                              this->getIndirectHeap(IndirectHeap::SURFACE_STATE).getGraphicsAllocation(),
+                              devQueueHw->getDebugQueue());
+
+            dispatchScheduler<GfxFamily>(
+                *this,
+                *devQueueHw,
+                scheduler);
+
+            scheduler.makeResident(commandStreamReceiver);
+
+            // Update SLM usage
+            slmUsed |= scheduler.slmTotalSize > 0;
+
+            // Make every non-null private surface of the program's block kernels resident.
+            size_t count = parentKernel->getProgram()->getBlockKernelManager()->getCount();
+
+            for (uint32_t surfaceIndex = 0; surfaceIndex < count; surfaceIndex++) {
+                auto surface = parentKernel->getProgram()->getBlockKernelManager()->getPrivateSurface(surfaceIndex);
+                if (surface) {
+                    commandStreamReceiver.makeResident(*surface);
+                }
+            }
+        }
+
+        // Commands without kernel are not submitted through enqueueNonBlocked.
+        auto submissionRequired = isCommandWithoutKernel(commandType) ? false : true;
+
+        if (submissionRequired) {
+            completionStamp = enqueueNonBlocked<commandType>(
+                surfacesForResidency,
+                numSurfaceForResidency,
+                commandStream,
+                commandStreamStart,
+                blocking,
+                multiDispatchInfo,
+                eventBuilder,
+                taskLevel,
+                slmUsed,
+                printfHandler.get());
+
+            if (eventBuilder.getEvent()) {
+                eventBuilder.getEvent()->flushStamp->replaceStampObject(this->flushStamp->getStampReference());
+            }
+
+            // When the device queue reports a scheduler return instance, wait for the submission
+            // and then run the scheduler simulation on the same buffers the scheduler kernel was given.
+            if (executionModelKernel && devQueueHw->getSchedulerReturnInstance() > 0) {
+                waitUntilComplete(completionStamp.taskCount, completionStamp.flushStamp);
+
+                BuiltinKernelsSimulation::SchedulerSimulation<GfxFamily> simulation;
+                simulation.runSchedulerSimulation(devQueueHw->getQueueBuffer(),
+                                                  devQueueHw->getStackBuffer(),
+                                                  devQueueHw->getEventPoolBuffer(),
+                                                  devQueueHw->getSlbBuffer(),
+                                                  devQueueHw->getDshBuffer(),
+                                                  multiDispatchInfo.begin()->getKernel()->getKernelReflectionSurface(),
+                                                  devQueueHw->getQueueStorageBuffer(),
+                                                  this->getIndirectHeap(IndirectHeap::SURFACE_STATE).getGraphicsAllocation(),
+                                                  devQueueHw->getDebugQueue());
+            }
+        } else {
+            // Highest task count among the queue and all non-user events in the wait list.
+            // Note: the loop-local `event` hides the output-event parameter inside this loop.
+            // NOTE(review): the castToObject result is dereferenced without a null check — confirm
+            // the wait list is validated by the caller.
+            auto maxTaskCount = this->taskCount;
+            for (auto eventId = 0u; eventId < numEventsInWaitList; eventId++) {
+                auto event = castToObject<Event>(eventWaitList[eventId]);
+                if (!event->isUserEvent()) {
+                    maxTaskCount = std::max(maxTaskCount, event->peekTaskCount());
+                }
+            }
+
+            //inherit data from event_wait_list and previous packets
+            completionStamp.flushStamp = this->flushStamp->peekStamp();
+            completionStamp.taskCount = maxTaskCount;
+            completionStamp.taskLevel = taskLevel;
+
+            if (eventBuilder.getEvent() && isProfilingEnabled()) {
+                TimeStampData submitTimeStamp;
+                this->getDevice().getOSTime()->getCpuGpuTime(&submitTimeStamp);
+                eventBuilder.getEvent()->setSubmitTimeStamp(&submitTimeStamp);
+                eventBuilder.getEvent()->setSubmitTimeStamp();
+                eventBuilder.getEvent()->setStartTimeStamp();
+            }
+        }
+    } else {
+        // Blocked queue: nothing is submitted yet, so the stamp starts out as "not ready".
+        // NOTE(review): aggregate order is presumably {taskCount, taskLevel, flushStamp, engine} — confirm
+        // against the CompletionStamp definition.
+        CompletionStamp cmplStamp = {
+            Event::eventNotReady,
+            taskLevel,
+            0,
+            EngineType::ENGINE_RCS};
+        completionStamp = cmplStamp;
+    }
+    updateFromCompletionStamp(completionStamp);
+
+    if (eventBuilder.getEvent()) {
+        eventBuilder.getEvent()->updateCompletionStamp(completionStamp.taskCount, completionStamp.taskLevel, completionStamp.flushStamp);
+        DebugManager.log(DebugManager.flags.EventsDebugEnable.get(), "updateCompletionStamp Event", eventBuilder.getEvent(), "taskLevel", eventBuilder.getEvent()->taskLevel.load());
+    }
+
+    if (blockQueue) {
+
+        // Record the execution-model heap sizes in the blocked command data.
+        // NOTE(review): blockedCommandsData is expected to have been set by dispatchWalker above — confirm.
+        if (executionModelKernel) {
+            size_t minSizeISHForEM = KernelCommandsHelper<GfxFamily>::template getSizeRequiredForExecutionModel<IndirectHeap::INSTRUCTION>(const_cast<const Kernel &>(*(multiDispatchInfo.begin()->getKernel())));
+            size_t minSizeSSHForEM = KernelCommandsHelper<GfxFamily>::template getSizeRequiredForExecutionModel<IndirectHeap::SURFACE_STATE>(const_cast<const Kernel &>(*(multiDispatchInfo.begin()->getKernel())));
+            blockedCommandsData->instructionHeapSizeEM = minSizeISHForEM;
+            blockedCommandsData->surfaceStateHeapSizeEM = minSizeSSHForEM;
+        }
+
+        // Ownership of the printf handler moves to enqueueBlocked; printfHandler is empty afterwards.
+        enqueueBlocked<commandType>(
+            surfacesForResidency,
+            numSurfaceForResidency,
+            blocking,
+            multiDispatchInfo,
+            blockedCommandsData,
+            numEventsInWaitList,
+            eventWaitList,
+            slmUsed,
+            eventBuilder,
+            std::move(printfHandler));
+    }
+
+    queueOwnership.unlock();
+    deviceOwnership.unlock();
+
+    if (blockQueue) {
+        // Queue ownership is taken again (this wrapper hides the outer queueOwnership, which was
+        // unlocked above) and the event is registered only if it is the queue's current virtual event.
+        TakeOwnershipWrapper<CommandQueueHw<GfxFamily>> queueOwnership(*this);
+        if (this->virtualEvent == eventBuilder.getEvent()) {
+            eventBuilder.registerEvent();
+        }
+    }
+
+    if (blocking) {
+        if (blockQueue) {
+            // Busy-wait (no sleep/yield) until the queue is no longer blocked, then wait for completion.
+            while (isQueueBlocked())
+                ;
+            waitUntilComplete(taskCount, flushStamp->peekStamp());
+        } else {
+            waitUntilComplete(taskCount, flushStamp->peekStamp());
+            // After completion, stamp every residency surface (with null device and queue arguments).
+            for (auto sIt = surfacesForResidency, sE = surfacesForResidency + numSurfaceForResidency;
+                 sIt != sE; ++sIt) {
+                (*sIt)->setCompletionStamp(completionStamp, nullptr, nullptr);
+            }
+            if (printfHandler) {
+                printfHandler->printEnqueueOutput();
+            }
+            commandStreamReceiver.cleanAllocationList(taskCount, TEMPORARY_ALLOCATION);
+        }
+    }
+}
+
+// Decides whether the task level computed for a command has to be incremented.
+//
+// No increment when:
+//  - the command is blocked (taskLevel equals Event::eventNotReady), or
+//  - the command has no kernel and is not a barrier (it inherits the state of previous commands).
+// For out-of-order queues there are two further cases with no increment:
+//  - there is no wait list and the command is not a barrier;
+//  - there is a wait list, but the level deduced from it (plus one) does not exceed the
+//    queue's current task level.
+template <typename GfxFamily>
+bool CommandQueueHw<GfxFamily>::isTaskLevelUpdateRequired(const uint32_t &taskLevel, const cl_event *eventWaitList, const cl_uint &numEventsInWaitList, unsigned int commandType) {
+    const bool isBarrier = (commandType == CL_COMMAND_BARRIER);
+    const bool blockedByEvent = (taskLevel == Event::eventNotReady);
+    const bool inheritsPreviousState = isCommandWithoutKernel(commandType) && !isBarrier;
+
+    bool incrementNeeded = !blockedByEvent && !inheritsPreviousState;
+
+    // out-of-order queue special cases
+    if (this->isOOQEnabled()) {
+        if (eventWaitList == nullptr) {
+            // without a wait list only a barrier may raise the task level
+            if (!isBarrier) {
+                incrementNeeded = false;
+            }
+        } else {
+            // with a wait list: deduce the level from the events and compare it with the queue's level
+            auto levelAfterEvents = getTaskLevelFromWaitList(0, numEventsInWaitList, eventWaitList);
+            ++levelAfterEvents;
+            if (levelAfterEvents <= this->taskLevel) {
+                incrementNeeded = false;
+            }
+        }
+    }
+    return incrementNeeded;
+}
+
+// Submits an already programmed command stream for a non-blocked enqueue.
+//
+// Makes the printf handler, the passed surfaces and every dispatched kernel resident,
+// collects the dispatch flags and calls flushTask on the command stream receiver. The
+// resulting completion stamp is propagated to the surfaces and kernels and returned.
+//
+//   surfaces / surfaceCount - surfaces to make resident and to stamp after the flush
+//   commandStream / commandStreamStart - stream and start offset handed to flushTask
+//   blocking      - in/out: forced to true when a printf handler is present
+//   multiDispatchInfo - must not be empty (UNRECOVERABLE_IF otherwise)
+//   eventBuilder  - holds the output event, if any (used for profiling data)
+//   taskLevel     - task level passed to flushTask
+//   slmUsed       - forwarded as DispatchFlags::useSLM
+//   printfHandler - optional printf handler of this enqueue
+template <typename GfxFamily>
+template <unsigned int commandType>
+CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(
+    Surface **surfaces,
+    size_t surfaceCount,
+    LinearStream &commandStream,
+    size_t commandStreamStart,
+    bool &blocking,
+    const MultiDispatchInfo &multiDispatchInfo,
+    EventBuilder &eventBuilder,
+    uint32_t taskLevel,
+    bool slmUsed,
+    PrintfHandler *printfHandler) {
+
+    UNRECOVERABLE_IF(multiDispatchInfo.empty());
+
+    auto &csr = device->getCommandStreamReceiver();
+    bool flushImplicitly = false;
+
+    // an enqueue that prints is always treated as blocking
+    if (printfHandler) {
+        blocking = true;
+        printfHandler->makeResident(csr);
+    }
+
+    // residency + coherency / media sampler requirements
+    bool coherencyNeeded = false;
+    for (auto residencySurface : CreateRange(surfaces, surfaceCount)) {
+        residencySurface->makeResident(csr);
+        coherencyNeeded |= residencySurface->IsCoherent;
+    }
+
+    bool mediaSamplerNeeded = false;
+    for (auto &info : multiDispatchInfo) {
+        auto dispatchedKernel = info.getKernel();
+        dispatchedKernel->makeResident(csr);
+        coherencyNeeded |= dispatchedKernel->requiresCoherency();
+        mediaSamplerNeeded |= dispatchedKernel->isVmeKernel();
+    }
+
+    if (mediaSamplerNeeded) {
+        DEBUG_BREAK_IF(device->getDeviceInfo().preemptionSupported != false);
+    }
+
+    // profiling: record submit time and keep the timestamp / perf counter allocations resident
+    TimeStampData submitTime;
+    if (isProfilingEnabled() && eventBuilder.getEvent()) {
+        this->getDevice().getOSTime()->getCpuGpuTime(&submitTime);
+        eventBuilder.getEvent()->setSubmitTimeStamp(&submitTime);
+        this->getDevice().getCommandStreamReceiver().makeResident(*eventBuilder.getEvent()->getHwTimeStampAllocation());
+        if (isPerfCountersEnabled()) {
+            this->getDevice().getCommandStreamReceiver().makeResident(*eventBuilder.getEvent()->getHwPerfCounterAllocation());
+        }
+    }
+
+    // the first dispatched kernel decides about execution model and arbitration policy
+    auto firstKernel = multiDispatchInfo.begin()->getKernel();
+    const bool isParentKernelDispatch = firstKernel->isParentKernel;
+
+    IndirectHeap *dynamicStateHeap = nullptr;
+    IndirectHeap *indirectObjectHeap = nullptr;
+
+    if (!isParentKernelDispatch) {
+        dynamicStateHeap = &getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
+        indirectObjectHeap = &getIndirectHeap(IndirectHeap::INDIRECT_OBJECT);
+    } else {
+        DeviceQueueHw<GfxFamily> *deviceQueue = castToObject<DeviceQueueHw<GfxFamily>>(this->getContext().getDefaultDeviceQueue());
+        DEBUG_BREAK_IF(deviceQueue == nullptr);
+        dynamicStateHeap = deviceQueue->getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
+        // In ExecutionModel IOH is the same as DSH to eliminate StateBaseAddress reprogramming for scheduler kernel and blocks.
+        indirectObjectHeap = dynamicStateHeap;
+        flushImplicitly = true;
+    }
+
+    csr.requestThreadArbitrationPolicy(firstKernel->getThreadArbitrationPolicy());
+
+    DispatchFlags flags;
+    flags.blocking = blocking;
+    flags.dcFlush = shouldFlushDC(commandType, printfHandler);
+    flags.useSLM = slmUsed;
+    flags.guardCommandBufferWithPipeControl = true;
+    flags.GSBA32BitRequired = commandType == CL_COMMAND_NDRANGE_KERNEL;
+    flags.mediaSamplerRequired = mediaSamplerNeeded;
+    flags.requiresCoherency = coherencyNeeded;
+    flags.low_priority = low_priority;
+    flags.implicitFlush = flushImplicitly;
+    flags.flushStampReference = this->flushStamp->getStampReference();
+    flags.preemptionMode = PreemptionHelper::taskPreemptionMode(*device, multiDispatchInfo);
+    flags.outOfOrderExecutionAllowed = this->isOOQEnabled();
+
+    DEBUG_BREAK_IF(taskLevel >= Event::eventNotReady);
+
+    CompletionStamp flushResult = csr.flushTask(
+        commandStream,
+        commandStreamStart,
+        *dynamicStateHeap,
+        getIndirectHeap(IndirectHeap::INSTRUCTION),
+        *indirectObjectHeap,
+        getIndirectHeap(IndirectHeap::SURFACE_STATE),
+        taskLevel,
+        flags);
+
+    // propagate the stamp of this submission to everything that took part in it
+    for (auto residencySurface : CreateRange(surfaces, surfaceCount)) {
+        residencySurface->setCompletionStamp(flushResult, device, this);
+    }
+
+    for (auto &info : multiDispatchInfo) {
+        info.getKernel()->updateWithCompletionStamp(csr, &flushResult);
+    }
+
+    return flushResult;
+}
+
+// Parks a command that cannot be submitted yet on an event.
+//
+// The command is wrapped into a Command object (CommandMarker when there is nothing to
+// dispatch, CommandComputeKernel otherwise) and attached to either the caller-visible
+// output event or a freshly created internal VirtualEvent. That event gets the wait-list
+// events and the queue's previous virtual event as parents and then becomes the queue's
+// new virtual event.
+//
+//   surfaces / surfaceCount - caller surfaces; duplicates of them are stored with the command
+//   blocking            - not used by this function
+//   multiDispatchInfo   - kernels of the command; may be empty for commands without kernel
+//   blockedCommandsData - kernel operation data; ownership is taken when a kernel command is stored
+//   numEventsInWaitList / eventWaitList - events that become parents of the new virtual event
+//   slmUsed             - forwarded to CommandComputeKernel
+//   externalEventBuilder - builder holding the output event, if the caller requested one
+//   printfHandler       - ownership is passed on to CommandComputeKernel
+template <typename GfxFamily>
+template <unsigned int commandType>
+void CommandQueueHw<GfxFamily>::enqueueBlocked(
+    Surface **surfaces,
+    size_t surfaceCount,
+    bool &blocking,
+    const MultiDispatchInfo &multiDispatchInfo,
+    KernelOperation *blockedCommandsData,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    bool slmUsed,
+    EventBuilder &externalEventBuilder,
+    std::unique_ptr<PrintfHandler> printfHandler) {
+
+    auto &commandStreamReceiver = device->getCommandStreamReceiver();
+
+    TakeOwnershipWrapper<CommandQueueHw<GfxFamily>> queueOwnership(*this);
+
+    //store previous virtual event as it will add dependencies to new virtual event
+    if (this->virtualEvent) {
+        DBG_LOG(EventsDebugEnable, "enqueueBlocked", "previousVirtualEvent", this->virtualEvent);
+    }
+
+    EventBuilder internalEventBuilder;
+    EventBuilder *eventBuilder;
+    // check if event will be exposed externally
+    if (externalEventBuilder.getEvent()) {
+        // the queue keeps its own internal reference on the output event while it serves as virtual event
+        externalEventBuilder.getEvent()->incRefInternal();
+        eventBuilder = &externalEventBuilder;
+        DBG_LOG(EventsDebugEnable, "enqueueBlocked", "output event as virtualEvent", virtualEvent);
+    } else {
+        // it will be an internal event
+        internalEventBuilder.create<VirtualEvent>(this, context);
+        eventBuilder = &internalEventBuilder;
+        DBG_LOG(EventsDebugEnable, "enqueueBlocked", "new virtualEvent", eventBuilder->getEvent());
+    }
+    eventBuilder->getEvent()->setCurrentCmdQVirtualEvent(true);
+
+    //update queue taskCount
+    taskCount = eventBuilder->getEvent()->getCompletionStamp();
+
+    if (multiDispatchInfo.empty()) {
+        DEBUG_BREAK_IF(!isCommandWithoutKernel(commandType));
+        auto cmdSize = (unsigned int)EnqueueOperation<GfxFamily, commandType>::getSizeRequiredCS(isProfilingEnabled(),
+                                                                                                 isPerfCountersEnabled(),
+                                                                                                 *this,
+                                                                                                 nullptr);
+        auto cmd = std::unique_ptr<Command>(new CommandMarker(
+            *this, commandStreamReceiver, commandType, cmdSize));
+        eventBuilder->getEvent()->setCommand(std::move(cmd));
+    } else {
+        //store task data in event
+        std::vector<Surface *> allSurfaces;
+        for (auto &dispatchInfo : multiDispatchInfo) {
+            dispatchInfo.getKernel()->getResidency(allSurfaces);
+        }
+        // The caller's surfaces do not depend on the dispatch, so they are duplicated exactly
+        // once. Doing this inside the loop above would store one extra duplicate of every
+        // surface per additional dispatch info.
+        for (auto &surface : CreateRange(surfaces, surfaceCount)) {
+            allSurfaces.push_back(surface->duplicate());
+        }
+
+        auto kernelOperation = std::unique_ptr<KernelOperation>(blockedCommandsData); // marking ownership
+        auto cmd = std::unique_ptr<Command>(new CommandComputeKernel(
+            *this,
+            commandStreamReceiver,
+            std::move(kernelOperation),
+            allSurfaces,
+            shouldFlushDC(commandType, printfHandler.get()),
+            slmUsed,
+            commandType == CL_COMMAND_NDRANGE_KERNEL,
+            std::move(printfHandler),
+            multiDispatchInfo.begin()->getKernel(),
+            (uint32_t)multiDispatchInfo.size()));
+        eventBuilder->getEvent()->setCommand(std::move(cmd));
+    }
+
+    // the new event depends on the wait list and on the queue's previous virtual event
+    eventBuilder->addParentEvents(ArrayRef<const cl_event>(eventWaitList, numEventsInWaitList));
+    eventBuilder->addParentEvent(this->virtualEvent);
+    eventBuilder->finalize();
+
+    // retire the previous virtual event and drop the queue's internal reference on it
+    if (this->virtualEvent) {
+        this->virtualEvent->setCurrentCmdQVirtualEvent(false);
+        this->virtualEvent->decRefInternal();
+    }
+
+    this->virtualEvent = eventBuilder->getEvent();
+}
+
+// Defers a map/unmap operation on memObj until its dependencies are resolved.
+//
+// A CommandMapUnmap is attached to the output event held by externalEventBuilder or,
+// when the caller requested no output event, to a new internal VirtualEvent. The event
+// gets the wait-list events and the queue's current virtual event as parents and then
+// replaces the queue's virtual event.
+template <typename GfxFamily>
+void CommandQueueHw<GfxFamily>::addMapUnmapToWaitlistEventsDependencies(const cl_event *eventWaitList,
+                                                                        size_t numEventsInWaitlist,
+                                                                        MapOperationType opType,
+                                                                        MemObj *memObj,
+                                                                        EventBuilder &externalEventBuilder) {
+    auto &csr = device->getCommandStreamReceiver();
+
+    // use the externally visible event when there is one, otherwise fall back to an internal one
+    EventBuilder ownedBuilder;
+    EventBuilder *activeBuilder = &externalEventBuilder;
+    if (auto outputEvent = externalEventBuilder.getEvent()) {
+        outputEvent->incRefInternal();
+    } else {
+        ownedBuilder.create<VirtualEvent>(this, context);
+        activeBuilder = &ownedBuilder;
+    }
+
+    // the event carries the deferred map/unmap command
+    std::unique_ptr<Command> mapUnmapCommand(new CommandMapUnmap(opType, *memObj, csr, *this));
+    activeBuilder->getEvent()->setCommand(std::move(mapUnmapCommand));
+
+    // parents: wait-list events plus the queue's current virtual event
+    activeBuilder->addParentEvents(ArrayRef<const cl_event>(eventWaitList, numEventsInWaitlist));
+    activeBuilder->addParentEvent(this->virtualEvent);
+    activeBuilder->finalize();
+
+    // retire the previous virtual event before installing the new one
+    if (this->virtualEvent) {
+        this->virtualEvent->setCurrentCmdQVirtualEvent(false);
+        this->virtualEvent->decRefInternal();
+    }
+    this->virtualEvent = activeBuilder->getEvent();
+}
+} // namespace OCLRT
--- a/runtime/command_queue/enqueue_copy_buffer.h
+++ b/runtime/command_queue/enqueue_copy_buffer.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+#include "hw_cmds.h"
+#include "runtime/command_queue/command_queue_hw.h"
+#include "runtime/command_queue/enqueue_common.h"
+#include "runtime/command_stream/command_stream_receiver.h"
+#include "runtime/helpers/kernel_commands.h"
+#include "runtime/mem_obj/buffer.h"
+#include "runtime/memory_manager/surface.h"
+#include "runtime/built_ins/built_ins.h"
+#include <new>
+
+namespace OCLRT {
+
+// Enqueues a buffer-to-buffer copy through the CopyBufferToBuffer built-in.
+// The builder is bracketed by takeOwnership(this->context) / releaseOwnership() around
+// dispatch construction and the enqueueHandler call. Always returns CL_SUCCESS.
+template <typename GfxFamily>
+cl_int CommandQueueHw<GfxFamily>::enqueueCopyBuffer(
+    Buffer *srcBuffer,
+    Buffer *dstBuffer,
+    size_t srcOffset,
+    size_t dstOffset,
+    size_t size,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event) {
+
+    MultiDispatchInfo multiDispatch;
+
+    auto &copyBuilder = BuiltIns::getInstance().getBuiltinDispatchInfoBuilder(
+        EBuiltInOps::CopyBufferToBuffer, this->getContext(), this->getDevice());
+    copyBuilder.takeOwnership(this->context);
+
+    // Scalar offsets and size are widened to triples with the remaining components zeroed.
+    BuiltinDispatchInfoBuilder::BuiltinOpParams opParams;
+    opParams.srcMemObj = srcBuffer;
+    opParams.srcOffset = {srcOffset, 0, 0};
+    opParams.dstMemObj = dstBuffer;
+    opParams.dstOffset = {dstOffset, 0, 0};
+    opParams.size = {size, 0, 0};
+    copyBuilder.buildDispatchInfos(multiDispatch, opParams);
+
+    // Both buffers are wrapped as surfaces and passed to enqueueHandler as its surface list.
+    MemObjSurface srcSurface(srcBuffer);
+    MemObjSurface dstSurface(dstBuffer);
+    Surface *usedSurfaces[] = {&srcSurface, &dstSurface};
+
+    enqueueHandler<CL_COMMAND_COPY_BUFFER>(
+        usedSurfaces, false, multiDispatch,
+        numEventsInWaitList, eventWaitList, event);
+
+    copyBuilder.releaseOwnership();
+    return CL_SUCCESS;
+}
+}
--- a/runtime/command_queue/enqueue_copy_buffer_rect.h
+++ b/runtime/command_queue/enqueue_copy_buffer_rect.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+#include "runtime/command_queue/command_queue_hw.h"
+#include "runtime/command_stream/command_stream_receiver.h"
+#include "runtime/helpers/kernel_commands.h"
+#include "runtime/mem_obj/buffer.h"
+#include "runtime/memory_manager/surface.h"
+#include "runtime/built_ins/built_ins.h"
+#include <new>
+
+namespace OCLRT {
+
+// Enqueues a rectangular buffer-to-buffer copy through the CopyBufferRect built-in.
+// Origins, region and all four pitches are forwarded to the builder as given; the surfaces
+// passed to enqueueHandler are those the built dispatch reports via getUsedSurfaces().
+template <typename GfxFamily>
+cl_int CommandQueueHw<GfxFamily>::enqueueCopyBufferRect(
+    Buffer *srcBuffer,
+    Buffer *dstBuffer,
+    const size_t *srcOrigin,
+    const size_t *dstOrigin,
+    const size_t *region,
+    size_t srcRowPitch,
+    size_t srcSlicePitch,
+    size_t dstRowPitch,
+    size_t dstSlicePitch,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event) {
+
+    MultiDispatchInfo multiDispatch;
+
+    auto &rectBuilder = BuiltIns::getInstance().getBuiltinDispatchInfoBuilder(
+        EBuiltInOps::CopyBufferRect, this->getContext(), this->getDevice());
+    rectBuilder.takeOwnership(this->context);
+
+    // Source description first, then destination, then the shared region.
+    BuiltinDispatchInfoBuilder::BuiltinOpParams rectParams;
+    rectParams.srcMemObj = srcBuffer;
+    rectParams.srcOffset = srcOrigin;
+    rectParams.srcRowPitch = srcRowPitch;
+    rectParams.srcSlicePitch = srcSlicePitch;
+    rectParams.dstMemObj = dstBuffer;
+    rectParams.dstOffset = dstOrigin;
+    rectParams.dstRowPitch = dstRowPitch;
+    rectParams.dstSlicePitch = dstSlicePitch;
+    rectParams.size = region;
+    rectBuilder.buildDispatchInfos(multiDispatch, rectParams);
+
+    enqueueHandler<CL_COMMAND_COPY_BUFFER_RECT>(
+        multiDispatch.getUsedSurfaces().begin(),
+        multiDispatch.getUsedSurfaces().size(),
+        false, multiDispatch,
+        numEventsInWaitList, eventWaitList, event);
+
+    rectBuilder.releaseOwnership();
+    return CL_SUCCESS;
+}
+}
--- a/runtime/command_queue/enqueue_copy_buffer_to_image.h
+++ b/runtime/command_queue/enqueue_copy_buffer_to_image.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+#include "hw_cmds.h"
+#include "runtime/command_queue/command_queue_hw.h"
+#include "runtime/command_stream/command_stream_receiver.h"
+#include "runtime/helpers/surface_formats.h"
+#include "runtime/helpers/kernel_commands.h"
+#include "runtime/mem_obj/buffer.h"
+#include "runtime/mem_obj/image.h"
+#include "runtime/memory_manager/surface.h"
+#include "runtime/built_ins/built_ins.h"
+#include <new>
+
+namespace OCLRT {
+
+// Enqueues a buffer-to-image copy through the CopyBufferToImage3d built-in.
+// The builder is bracketed by takeOwnership(this->context) / releaseOwnership() around
+// dispatch construction and the enqueueHandler call. Always returns CL_SUCCESS.
+template <typename GfxFamily>
+cl_int CommandQueueHw<GfxFamily>::enqueueCopyBufferToImage(
+    Buffer *srcBuffer,
+    Image *dstImage,
+    size_t srcOffset,
+    const size_t *dstOrigin,
+    const size_t *region,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event) {
+
+    MultiDispatchInfo multiDispatch;
+
+    auto &copyBuilder = BuiltIns::getInstance().getBuiltinDispatchInfoBuilder(
+        EBuiltInOps::CopyBufferToImage3d, this->getContext(), this->getDevice());
+    copyBuilder.takeOwnership(this->context);
+
+    // The scalar buffer offset is widened to a triple; image origin and region pass through as given.
+    BuiltinDispatchInfoBuilder::BuiltinOpParams opParams;
+    opParams.srcMemObj = srcBuffer;
+    opParams.srcOffset = {srcOffset, 0, 0};
+    opParams.dstMemObj = dstImage;
+    opParams.dstOffset = dstOrigin;
+    opParams.size = region;
+    copyBuilder.buildDispatchInfos(multiDispatch, opParams);
+
+    enqueueHandler<CL_COMMAND_COPY_BUFFER_TO_IMAGE>(
+        multiDispatch.getUsedSurfaces().begin(),
+        multiDispatch.getUsedSurfaces().size(),
+        false, multiDispatch,
+        numEventsInWaitList, eventWaitList, event);
+
+    copyBuilder.releaseOwnership();
+    return CL_SUCCESS;
+}
+}
--- a/runtime/command_queue/enqueue_copy_image.h
+++ b/runtime/command_queue/enqueue_copy_image.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+#include "runtime/built_ins/built_ins.h"
+#include "hw_cmds.h"
+#include "runtime/command_queue/command_queue_hw.h"
+#include "runtime/command_stream/command_stream_receiver.h"
+#include "runtime/helpers/kernel_commands.h"
+#include "runtime/helpers/basic_math.h"
+#include "runtime/mem_obj/image.h"
+#include "runtime/memory_manager/surface.h"
+#include <algorithm>
+#include <new>
+
+namespace OCLRT {
+
+// Enqueues an image-to-image copy through the CopyImageToImage3d built-in.
+// The builder is bracketed by takeOwnership(this->context) / releaseOwnership() around
+// dispatch construction and the enqueueHandler call. Always returns CL_SUCCESS.
+template <typename GfxFamily>
+cl_int CommandQueueHw<GfxFamily>::enqueueCopyImage(
+    Image *srcImage,
+    Image *dstImage,
+    const size_t srcOrigin[3],
+    const size_t dstOrigin[3],
+    const size_t region[3],
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event) {
+
+    MultiDispatchInfo multiDispatch;
+
+    auto &copyBuilder = BuiltIns::getInstance().getBuiltinDispatchInfoBuilder(
+        EBuiltInOps::CopyImageToImage3d, this->getContext(), this->getDevice());
+    copyBuilder.takeOwnership(this->context);
+
+    // Both origins and the region are forwarded to the builder exactly as received.
+    BuiltinDispatchInfoBuilder::BuiltinOpParams opParams;
+    opParams.srcMemObj = srcImage;
+    opParams.srcOffset = srcOrigin;
+    opParams.dstMemObj = dstImage;
+    opParams.dstOffset = dstOrigin;
+    opParams.size = region;
+    copyBuilder.buildDispatchInfos(multiDispatch, opParams);
+
+    enqueueHandler<CL_COMMAND_COPY_IMAGE>(
+        multiDispatch.getUsedSurfaces().begin(),
+        multiDispatch.getUsedSurfaces().size(),
+        false, multiDispatch,
+        numEventsInWaitList, eventWaitList, event);
+
+    copyBuilder.releaseOwnership();
+    return CL_SUCCESS;
+}
+}
--- a/runtime/command_queue/enqueue_copy_image_to_buffer.h
+++ b/runtime/command_queue/enqueue_copy_image_to_buffer.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+#include "hw_cmds.h"
+#include "runtime/command_queue/command_queue_hw.h"
+#include "runtime/command_stream/command_stream_receiver.h"
+#include "runtime/helpers/surface_formats.h"
+#include "runtime/helpers/kernel_commands.h"
+#include "runtime/mem_obj/buffer.h"
+#include "runtime/mem_obj/image.h"
+#include "runtime/memory_manager/surface.h"
+#include "runtime/built_ins/built_ins.h"
+#include <new>
+
+namespace OCLRT {
+
+// Enqueues an image-to-buffer copy through the CopyImage3dToBuffer built-in.
+// The builder is bracketed by takeOwnership(this->context) / releaseOwnership() around
+// dispatch construction and the enqueueHandler call. Always returns CL_SUCCESS.
+template <typename GfxFamily>
+cl_int CommandQueueHw<GfxFamily>::enqueueCopyImageToBuffer(
+    Image *srcImage,
+    Buffer *dstBuffer,
+    const size_t *srcOrigin,
+    const size_t *region,
+    size_t dstOffset,
+    cl_uint numEventsInWaitList,
+    const cl_event *eventWaitList,
+    cl_event *event) {
+
+    MultiDispatchInfo multiDispatch;
+
+    auto &copyBuilder = BuiltIns::getInstance().getBuiltinDispatchInfoBuilder(
+        EBuiltInOps::CopyImage3dToBuffer, this->getContext(), this->getDevice());
+    copyBuilder.takeOwnership(this->context);
+
+    // Image origin and region pass through as given; the scalar buffer offset is widened to a triple.
+    BuiltinDispatchInfoBuilder::BuiltinOpParams opParams;
+    opParams.srcMemObj = srcImage;
+    opParams.srcOffset = srcOrigin;
+    opParams.dstMemObj = dstBuffer;
+    opParams.dstOffset = {dstOffset, 0, 0};
+    opParams.size = region;
+    copyBuilder.buildDispatchInfos(multiDispatch, opParams);
+
+    enqueueHandler<CL_COMMAND_COPY_IMAGE_TO_BUFFER>(
+        multiDispatch.getUsedSurfaces().begin(),
+        multiDispatch.getUsedSurfaces().size(),
+        false, multiDispatch,
+        numEventsInWaitList, eventWaitList, event);
+
+    copyBuilder.releaseOwnership();
+    return CL_SUCCESS;
+}
+}
--- a/Show More
+++ b/Show More
				`@@ -0,0 +1 @@`
				`manifests/manifest.yml filter=repo_converter`