Initial commit

Change-Id: I4bf1707bd3dfeadf2c17b0a7daff372b1925ebbd
This commit is contained in:
Brandon Fliflet
2017-12-21 00:45:38 +01:00
commit 7e9ad41290
1350 changed files with 233156 additions and 0 deletions

92
.clang-format Normal file
View File

@@ -0,0 +1,92 @@
---
Language: Cpp
# BasedOnStyle: LLVM
AccessModifierOffset: -2
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlinesLeft: false
AlignOperands: true
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: All
AllowShortIfStatementsOnASingleLine: false
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: false
AlwaysBreakTemplateDeclarations: false
BinPackArguments: true
BinPackParameters: true
BraceWrapping:
AfterClass: false
AfterControlStatement: false
AfterEnum: false
AfterFunction: false
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
BeforeCatch: false
BeforeElse: false
IndentBraces: false
BreakBeforeBinaryOperators: false
BreakBeforeBraces: Attach
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
# clang-format > v3.8.0: BreakAfterJavaFieldAnnotations: false
# clang-format > v3.8.0: BreakStringLiterals: true
ColumnLimit: 0
CommentPragmas: '^ IWYU pragma:'
ConstructorInitializerAllOnOneLineOrOnePerLine: false
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: false
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
IncludeCategories:
- Regex: '^"(llvm|llvm-c|clang|clang-c)/'
Priority: 2
- Regex: '^(<|"(gtest|isl|json)/)'
Priority: 3
- Regex: '.*'
Priority: 1
IndentCaseLabels: false
IndentWidth: 4
IndentWrappedFunctionNames: false
KeepEmptyLinesAtTheStartOfBlocks: true
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBlockIndentWidth: 2
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: true
PenaltyBreakBeforeFirstCallParameter: 19
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 60
PointerAlignment: Right
ReflowComments: true
SortIncludes: false
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Cpp11
TabWidth: 8
UseTab: Never
...

40
.clang-tidy Normal file
View File

@@ -0,0 +1,40 @@
---
Checks: 'clang-diagnostic-*,clang-analyzer-*,google-default-arguments,readability-identifier-naming,modernize-use-override,modernize-use-default-member-init,-clang-analyzer-alpha*,-clang-analyzer-core.StackAddressEscape,-clang-analyzer-optin.performance.Padding,-clang-analyzer-cplusplus.NewDeleteLeaks'
# WarningsAsErrors: '.*'
HeaderFilterRegex: 'runtime/'
AnalyzeTemporaryDtors: false
CheckOptions:
- key: google-readability-braces-around-statements.ShortStatementLines
value: '1'
- key: google-readability-function-size.StatementThreshold
value: '800'
- key: google-readability-namespace-comments.ShortNamespaceLines
value: '10'
- key: google-readability-namespace-comments.SpacesBeforeComments
value: '2'
- key: readability-identifier-naming.MethodCase
value: camelBack
- key: readability-identifier-naming.ParameterCase
value: camelBack
- key: readability-identifier-naming.StructMemberCase
value: camelBack
- key: readability-identifier-naming.ClassMemberCase
value: camelBack
- key: readability-identifier-naming.ClassMethodCase
value: camelBack
- key: modernize-loop-convert.MaxCopySize
value: '16'
- key: modernize-loop-convert.MinConfidence
value: reasonable
- key: modernize-loop-convert.NamingStyle
value: CamelCase
- key: modernize-pass-by-value.IncludeStyle
value: llvm
- key: modernize-replace-auto-ptr.IncludeStyle
value: llvm
- key: modernize-use-nullptr.NullMacros
value: 'NULL'
- key: modernize-use-default-member-init.UseAssignment
value: '1'
...

3
.ctags Normal file
View File

@@ -0,0 +1,3 @@
-R
-h .inl.h
--langmap=c++:.inl.cpp.h

1
.gitattributes vendored Normal file
View File

@@ -0,0 +1 @@
manifests/manifest.yml filter=repo_converter

2
.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
build/*
build_linux/*

550
CMakeLists.txt Normal file
View File

@@ -0,0 +1,550 @@
# Copyright (c) 2017, Intel Corporation
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
# We require cmake 3.2.0 or later
cmake_minimum_required(VERSION 3.2.0 FATAL_ERROR)
include(ExternalProject)
project(igdrcl)
if(TR_DEPRECATED)
add_definitions(-D_SILENCE_TR1_NAMESPACE_DEPRECATION_WARNING=1)
endif(TR_DEPRECATED)
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type: [Release, Release-Internal, Debug]")
endif()
set(CMAKE_C_FLAGS_RELEASE-INTERNAL ${CMAKE_C_FLAGS_RELEASE})
set(CMAKE_CXX_FLAGS_RELEASE-INTERNAL ${CMAKE_CXX_FLAGS_RELEASE})
set(CMAKE_SHARED_LINKER_FLAGS_RELEASE-INTERNAL ${CMAKE_SHARED_LINKER_FLAGS_RELEASE})
set(CMAKE_EXE_LINKER_FLAGS_RELEASE-INTERNAL ${CMAKE_EXE_LINKER_FLAGS_RELEASE})
string(TOLOWER "${CMAKE_BUILD_TYPE}" BUILD_TYPE_lower)
if("${BUILD_TYPE_lower}" STREQUAL "release-internal")
add_definitions(-D_RELEASE_INTERNAL)
endif("${BUILD_TYPE_lower}" STREQUAL "release-internal")
message(STATUS "${CMAKE_BUILD_TYPE} build configuration")
# Set the runtime source directory
if(NOT DEFINED IGDRCL_SOURCE_DIR)
set(IGDRCL_SOURCE_DIR ${CMAKE_SOURCE_DIR})
endif()
# Set our build directory
if(NOT DEFINED IGDRCL_BUILD_DIR)
set(IGDRCL_BUILD_DIR ${CMAKE_BINARY_DIR})
endif()
if(NOT IGDRCL_BINARY_DIR)
set(IGDRCL_BINARY_DIR ${CMAKE_BINARY_DIR})
endif()
# we use c++11
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
if(NOT GTEST_SRC_DIR)
set(GTEST_SRC_DIR_tmp "${CMAKE_SOURCE_DIR}/../gtest")
get_filename_component(GTEST_SRC_DIR ${GTEST_SRC_DIR_tmp} ABSOLUTE)
set(GMOCK_SRC_DIR_tmp "${CMAKE_SOURCE_DIR}/../gmock")
get_filename_component(GMOCK_SRC_DIR ${GMOCK_SRC_DIR_tmp} ABSOLUTE)
else(NOT GTEST_SRC_DIR)
get_filename_component(GTEST_SRC_DIR ${GTEST_SRC_DIR} ABSOLUTE)
set(GMOCK_SRC_DIR_tmp "${GTEST_SRC_DIR}/../gmock")
get_filename_component(GMOCK_SRC_DIR ${GMOCK_SRC_DIR_tmp} ABSOLUTE)
endif(NOT GTEST_SRC_DIR)
set(GTEST_INCLUDE_DIR "${GTEST_SRC_DIR}/include")
set(GMOCK_INCLUDE_DIR "${GMOCK_SRC_DIR}/include")
message(STATUS "Google Test source dir: ${GTEST_SRC_DIR}")
message(STATUS "Google Mock source dir: ${GMOCK_SRC_DIR}")
add_subdirectory(${GMOCK_SRC_DIR} ${IGDRCL_BINARY_DIR}/gmock)
set_target_properties(gtest PROPERTIES CXX_STANDARD 11 CXX_STANDARD_REQUIRED ON)
set_target_properties(gmock PROPERTIES CXX_STANDARD 11 CXX_STANDARD_REQUIRED ON)
set(gtest_lib gtest)
set(gmock_lib gmock)
if("${BUILD_TYPE_lower}" STREQUAL "release-internal")
set(gtest_lib ${IGDRCL_BINARY_DIR}/gmock/gtest/Release/gtest${CMAKE_STATIC_LIBRARY_SUFFIX})
set(gmock_lib ${IGDRCL_BINARY_DIR}/gmock/Release/gmock${CMAKE_STATIC_LIBRARY_SUFFIX})
endif("${BUILD_TYPE_lower}" STREQUAL "release-internal")
if(CMAKE_SIZEOF_VOID_P EQUAL 8)
set(NEO_BITS "64")
set(NEO_ARCH "x64")
else()
set(NEO_BITS "32")
set(NEO_ARCH "x86")
endif()
if(NOT ARTIFACTS_DIR)
get_filename_component(ARTIFACTS_DIR_tmp "${CMAKE_SOURCE_DIR}/../artifacts" ABSOLUTE)
if(IS_DIRECTORY "${ARTIFACTS_DIR_tmp}")
set(ARTIFACTS_DIR "${ARTIFACTS_DIR_tmp}")
endif()
endif(NOT ARTIFACTS_DIR)
if(ARTIFACTS_DIR)
message(STATUS "Artifact directory is ${ARTIFACTS_DIR}")
endif(ARTIFACTS_DIR)
if(NOT GTPIN_HEADERS_DIR)
if ((EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../internal/gtpin/gtpin_dx11_interface.h") AND (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../internal/gtpin/gtpin_driver_common.h"))
get_filename_component(GTPIN_HEADERS_DIR "../internal/gtpin/" ABSOLUTE)
message(STATUS "GT-Pin headers dir: ${GTPIN_HEADERS_DIR}")
endif()
endif(NOT GTPIN_HEADERS_DIR)
if(NOT LIBDRM_DIR)
get_filename_component(LIBDRM_DIR "../libdrm/" ABSOLUTE)
endif(NOT LIBDRM_DIR)
message(STATUS "libdrm dir: ${LIBDRM_DIR}")
if(NOT KHRONOS_HEADERS_DIR)
get_filename_component(KHRONOS_HEADERS_DIR "../khronos/opencl21/" ABSOLUTE)
endif(NOT KHRONOS_HEADERS_DIR)
message(STATUS "Khronos OpenCL headers dir: ${KHRONOS_HEADERS_DIR}")
set(OCL_HEADERS_DIR ${KHRONOS_HEADERS_DIR})
if(NOT THIRD_PARTY_DIR)
get_filename_component(THIRD_PARTY_DIR "../third_party/" ABSOLUTE)
endif(NOT THIRD_PARTY_DIR)
message(STATUS "Third party dir: ${THIRD_PARTY_DIR}")
if(ARTIFACTS_DIR)
if(NOT IGC_PATH)
if(WIN32)
file(GLOB_RECURSE IGC_FILE_tmp "${ARTIFACTS_DIR}/igc32.dll")
get_filename_component(IGC_PATH ${IGC_FILE_tmp} DIRECTORY)
else(WIN32 )
file(GLOB_RECURSE IGC_FILE_tmp "${ARTIFACTS_DIR}/libigdccl.so")
# exclude matches that live inside the igdrcl binary directory
foreach (TMP_PATH ${IGC_FILE_tmp})
string (FIND ${TMP_PATH} ${IGDRCL_BINARY_DIR} EXCLUDE_DIR_FOUND)
if (NOT ${EXCLUDE_DIR_FOUND} EQUAL -1)
list (REMOVE_ITEM IGC_FILE_tmp ${TMP_PATH})
endif ()
endforeach(TMP_PATH)
get_filename_component(IGC_PATH ${IGC_FILE_tmp} DIRECTORY)
endif(WIN32)
endif(NOT IGC_PATH)
if(NOT TARGET igc_dll)
add_library(igc_dll UNKNOWN IMPORTED)
if(WIN32)
set_property(TARGET igc_dll PROPERTY "IMPORTED_LOCATION" "${IGC_PATH}/${CMAKE_SHARED_LIBRARY_PREFIX}igc${IGDRCL_OPTION__BITS}${CMAKE_SHARED_LIBRARY_SUFFIX}")
else(WIN32)
set_property(TARGET igc_dll PROPERTY "IMPORTED_LOCATION" "${IGC_PATH}/${CMAKE_SHARED_LIBRARY_PREFIX}igdccl${IGDRCL_OPTION__BITS}${CMAKE_SHARED_LIBRARY_SUFFIX}")
endif(WIN32)
endif()
list(APPEND IGDRCL__IGC_TARGETS "igc_dll")
if(NOT TARGET fcl_dll)
add_library(fcl_dll UNKNOWN IMPORTED)
set_property(TARGET fcl_dll PROPERTY "IMPORTED_LOCATION" "${IGC_PATH}/${CMAKE_SHARED_LIBRARY_PREFIX}igdfcl${IGDRCL_OPTION__BITS}${CMAKE_SHARED_LIBRARY_SUFFIX}")
endif()
list(APPEND IGDRCL__IGC_TARGETS "fcl_dll")
if(NOT TARGET iga_dll)
add_library(iga_dll UNKNOWN IMPORTED)
set_property(TARGET iga_dll PROPERTY "IMPORTED_LOCATION" "${IGC_PATH}/${CMAKE_SHARED_LIBRARY_PREFIX}iga${NEO_BITS}${CMAKE_SHARED_LIBRARY_SUFFIX}")
endif()
list(APPEND IGDRCL__IGC_TARGETS "iga_dll")
if(NOT TARGET common_clang_dll)
add_library(common_clang_dll UNKNOWN IMPORTED)
set_property(TARGET common_clang_dll PROPERTY "IMPORTED_LOCATION" "${IGC_PATH}/${CMAKE_SHARED_LIBRARY_PREFIX}common_clang${IGDRCL_OPTION__BITS}${CMAKE_SHARED_LIBRARY_SUFFIX}")
endif()
list(APPEND IGDRCL__IGC_TARGETS "common_clang_dll")
# select proper gmm from artifacts
string(TOLOWER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_lower)
if(NOT GMM_LIB_PATHS)
if(WIN32)
set(GMM_LIB_PATHS "${ARTIFACTS_DIR}/windows/${CMAKE_BUILD_TYPE_lower}${IGDRCL_OPTION__BITS}/gmmocl")
else(WIN32)
set(GMM_LIB_PATHS "${ARTIFACTS_DIR}/linux/${CMAKE_BUILD_TYPE_lower}")
endif(WIN32)
endif()
message(STATUS "GmmLib binary path: ${GMM_LIB_PATHS}")
if(GMM_SOURCE_DIR)
get_filename_component(GMM_SOURCE_DIR "${GMM_SOURCE_DIR}" ABSOLUTE)
else(GMM_SOURCE_DIR)
get_filename_component(GMM_SOURCE_DIR "${CMAKE_SOURCE_DIR}/../gmmlib" ABSOLUTE)
endif(GMM_SOURCE_DIR)
if(NOT IS_DIRECTORY "${GMM_SOURCE_DIR}")
message(FATAL_ERROR "GmmLib public API not found!")
endif()
set(GMM_INCLUDE_PATHS
"${GMM_SOURCE_DIR}/GmmLib/inc"
)
set(UMKM_SHAREDDATA_INCLUDE_PATHS
"${GMM_SOURCE_DIR}/inc"
"${GMM_SOURCE_DIR}/inc/common"
)
set(IGDRCL__IGC_INCLUDE_DIR ${THIRD_PARTY_DIR})
else(ARTIFACTS_DIR)
if(GMM_SOURCE_DIR)
get_filename_component(GMM_SOURCE_DIR "${GMM_SOURCE_DIR}" ABSOLUTE)
else(GMM_SOURCE_DIR)
get_filename_component(GMM_SOURCE_DIR_tmp "${CMAKE_SOURCE_DIR}/../gmmlib" ABSOLUTE)
if(IS_DIRECTORY "${GMM_SOURCE_DIR_tmp}")
set(GMM_SOURCE_DIR "${GMM_SOURCE_DIR_tmp}")
endif()
endif()
if(NOT IS_DIRECTORY "${GMM_SOURCE_DIR}")
message(FATAL_ERROR "GmmLib source not found!")
endif()
message(STATUS "GmmLib source dir is: ${GMM_SOURCE_DIR}")
add_subdirectory("${GMM_SOURCE_DIR}" "${IGDRCL_BUILD_DIR}/gmmlib")
set(UMKM_SHAREDDATA_INCLUDE_PATHS $<TARGET_PROPERTY:gmm_umd,INTERFACE_INCLUDE_DIRECTORIES>)
if(IGC_DIR)
get_filename_component(IGC_DIR "${IGC_DIR}" ABSOLUTE)
else(IGC_DIR)
get_filename_component(IGC_DIR_tmp "${CMAKE_SOURCE_DIR}/../igc" ABSOLUTE)
if(IS_DIRECTORY "${IGC_DIR_tmp}")
set(IGC_DIR "${IGC_DIR_tmp}")
endif()
endif()
message(STATUS "IGC source dir is: ${IGC_DIR}")
get_filename_component(IGC_PATH "${IGDRCL_BUILD_DIR}/igc" ABSOLUTE)
if(IS_DIRECTORY ${IGC_DIR})
set(IGC_OPTION__LIBRARY_NAME "igdccl")
set(IGC_OPTION__OUTPUT_DIR "${IGC_PATH}")
set(IGC_OPTION__INCLUDE_IGC_COMPILER_TOOLS OFF)
add_subdirectory("${IGC_DIR}" "${IGDRCL_BUILD_DIR}/igc" EXCLUDE_FROM_ALL)
set(IGDRCL__IGC_TARGETS "${IGC__IGC_TARGETS}")
foreach(TARGET_tmp ${IGDRCL__IGC_TARGETS})
list(APPEND IGDRCL__IGC_INCLUDE_DIR $<TARGET_PROPERTY:${TARGET_tmp},INTERFACE_INCLUDE_DIRECTORIES>)
endforeach(TARGET_tmp)
message(STATUS "IGC Includes: ${IGDRCL__IGC_INCLUDE_DIR}")
endif()
endif(ARTIFACTS_DIR)
add_definitions(-DGMM_OCL)
if(IGC_PATH)
get_filename_component(IGC_PATH "${IGC_PATH}" ABSOLUTE)
message(STATUS "IGC binaries path: ${IGC_PATH}")
endif(IGC_PATH)
# We want to organize our IDE targets into folders
set_property(GLOBAL PROPERTY USE_FOLDERS ON)
# Get available platforms
include(platforms.cmake)
# Enable/Disable built-in kernels compilation during build
set(COMPILE_BUILT_INS TRUE CACHE BOOL "Enable built-in kernels compilation")
# Changing the default executable and library output directories
set(IGDRCL_OUTPUT_DIR "${IGDRCL_OPTION__OUTPUT_DIR}")
# set output paths
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${IGDRCL_BINARY_DIR}/bin)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${IGDRCL_BINARY_DIR}/bin)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${IGDRCL_BINARY_DIR}/lib)
# do not add rpath
set(CMAKE_SKIP_RPATH YES CACHE BOOL "" FORCE )
# Set the configuration type
set(CMAKE_CONFIGURATION_TYPES
${CMAKE_BUILD_TYPE}
)
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -D_DEBUG")
option(APPVERIFIER_ALLOWED "allow use of AppVerifier" TRUE)
option(CCACHE_ALLOWED "allow use of ccache" TRUE)
find_program(CCACHE_EXE_FOUND ccache)
if(CCACHE_EXE_FOUND AND CCACHE_ALLOWED)
message(STATUS "Found ccache: ${CCACHE_EXE_FOUND}")
set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache)
set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ccache)
endif()
if(MSVC)
if(NOT WDK_DIR)
get_filename_component(WDK_DIR "../wdk" ABSOLUTE)
endif(NOT WDK_DIR)
message(STATUS "WDK Directory: ${WDK_DIR}")
else()
find_package(PkgConfig)
pkg_check_modules(LIBVA QUIET libva)
if(LIBVA_FOUND)
add_definitions(-DLIBVA)
message(STATUS "Using libva")
endif()
endif()
# Support for WUD
# NOTE(review): WUD presumably stands for Windows Universal Driver - confirm.
#
# ENABLE_WUD()
#   Takes no arguments. On MSVC it:
#     * replaces the default C++ standard libraries with onecore.lib,
#     * appends /NODEFAULTLIB:<lib> for a fixed list of Win32 import libraries
#       to the exe, module and shared linker flags,
#     * defines UNICODE and _UNICODE,
#     * adds the WDK "um" library directory for the target architecture.
#   On other compilers it does nothing.
#   This is a macro, so every set() below changes the caller's variables.
#   WDK_DIR, WindowsTargetPlatformVersion and NEO_ARCH are expanded when the
#   macro is invoked, not when it is defined.
macro(ENABLE_WUD)
if(MSVC)
set(CMAKE_CXX_STANDARD_LIBRARIES "onecore.lib")
# Build one "/NODEFAULTLIB:<lib>" switch per library in the list.
set(LINKER_FLAGS "")
foreach(IT kernel32.lib;user32.lib;gdi32.lib;advapi32.lib;ole32.lib;)
set(LINKER_FLAGS "${LINKER_FLAGS} /NODEFAULTLIB:${IT}")
endforeach()
# Apply the same exclusions to executables, modules and shared libraries.
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${LINKER_FLAGS}")
set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} ${LINKER_FLAGS}")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${LINKER_FLAGS}")
add_definitions(-DUNICODE -D_UNICODE)
link_directories("${WDK_DIR}/Win15/Lib/${WindowsTargetPlatformVersion}/um/${NEO_ARCH}/")
endif(MSVC)
endmacro(ENABLE_WUD)
# Miscs options
option(IGDRCL_GCOV "generate gcov report" OFF)
option(HAVE_TBX_SERVER "Compile TBX server from TbxAccess library" OFF)
option(USE_CL_CACHE "Use OpenCL program binary cache" ON)
set(CL_CACHE_LOCATION "cl_cache" CACHE STRING "OpenCL program binary cache location")
if(NOT NEO_DRIVER_VERSION)
set(NEO_DRIVER_VERSION 1.0)
endif()
# Put profiling enable flag into define
if(OCL_RUNTIME_PROFILING)
add_definitions(-DOCL_RUNTIME_PROFILING=${OCL_RUNTIME_PROFILING})
endif()
# We want to build with the static, multithreaded runtime libraries (as opposed
# to the multithreaded runtime DLLs)
if(MSVC)
# Get WDK version from ${WDK_DIR}/Win15/WDKVersion.txt
# The file is expected to contain exactly three space-separated fields; the
# third field is the version used below to build the WDK include/lib paths.
file(READ "${WDK_DIR}/Win15/WDKVersion.txt" WindowsTargetPlatformVersion)
# Drop leading/trailing whitespace (e.g. a final newline) so it can neither
# change the field count nor end up inside the generated paths.
string(STRIP "${WindowsTargetPlatformVersion}" WindowsTargetPlatformVersion)
# Quote the input so an empty file still yields a valid string(REPLACE) call.
string(REPLACE " " ";" WindowsTargetPlatformVersion "${WindowsTargetPlatformVersion}")
list(LENGTH WindowsTargetPlatformVersion versionListLength)
if(NOT versionListLength EQUAL 3)
  # "ERROR" is not a message() mode; FATAL_ERROR actually stops configuration
  # instead of printing a notice and continuing with a bogus version.
  message(FATAL_ERROR "Error reading content of WDKVersion.txt file")
endif()
list(GET WindowsTargetPlatformVersion 2 WindowsTargetPlatformVersion)
message(STATUS "WDK Version is ${WindowsTargetPlatformVersion}")
set(WDK_INCLUDE_PATHS
"${WDK_DIR}/Win15/Include/${WindowsTargetPlatformVersion}/um"
"${WDK_DIR}/Win15/Include/${WindowsTargetPlatformVersion}/shared"
"${WDK_DIR}/Win15/Include/${WindowsTargetPlatformVersion}/km"
)
# Force to treat warnings as errors
if(NOT CMAKE_CXX_FLAGS MATCHES "/WX")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /WX")
endif()
MESSAGE(STATUS "WDK include paths: ${WDK_INCLUDE_PATHS}")
string(REPLACE "/MDd" "/MTd" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
string(REPLACE "/MD" "/MT" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
string(REPLACE "/MD" "/MT" CMAKE_CXX_FLAGS_RELEASE-INTERNAL "${CMAKE_CXX_FLAGS_RELEASE-INTERNAL}")
else()
if(IGDRCL_GCOV)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fprofile-arcs -ftest-coverage --coverage")
endif()
option(USE_ASAN "Link with address sanitization support" OFF)
if(USE_ASAN)
if(CMAKE_COMPILER_IS_GNUCC)
set(ASAN_FLAGS " -fsanitize=address -fno-omit-frame-pointer")
link_libraries(asan)
else()
message(STATUS "Address sanitization with clang not yet support")
endif()
endif()
if(USE_TSAN)
if(${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")
set(TSAN_FLAGS " -fsanitize=thread")
link_libraries(tsan)
else()
message(STATUS "Thread sanitization with gcc is not fully supported")
endif()
endif()
include(CheckLibraryExists)
CHECK_LIBRARY_EXISTS(rt clock_gettime "time.h" HAVE_CLOCK_GETTIME)
if(HAVE_CLOCK_GETTIME)
link_libraries(rt)
endif(HAVE_CLOCK_GETTIME)
endif(MSVC)
# setup variables needed for custom configuration type
# generate PDB files even for release build on MSVC
if(MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zi")
set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF")
endif()
if(NOT MSVC)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ftemplate-depth=1024")
endif(NOT MSVC)
# Compiler warning flags
if(NOT MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wempty-body")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wignored-qualifiers")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wtype-limits")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wuninitialized")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wextra -Wno-unused-parameter -Wno-missing-field-initializers")
if("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang" )
# clang only
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wshorten-64-to-32")
if(USE_SANITIZE_UB)
message(STATUS "Enabling undefined behavior sanitizer")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined -fno-sanitize-recover=undefined -fsanitize-recover=vptr -fno-rtti")
endif(USE_SANITIZE_UB)
if (NOT (CMAKE_C_COMPILER_VERSION VERSION_LESS 3.6))
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-local-typedefs")
endif()
if (NOT (CMAKE_C_COMPILER_VERSION VERSION_LESS 4.0))
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-register") # Added for htons()
endif()
else()
# gcc only
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-local-typedefs -Wno-unused-but-set-variable")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wclobbered")
if (CMAKE_C_COMPILER_VERSION VERSION_LESS 7.0)
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wimplicit-fallthrough=4")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-noexcept-type") # Added for gtest
endif()
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror")
endif()
# Compile code with defenses enabled (settings to be used for production release code)
if("${CMAKE_BUILD_TYPE}" STREQUAL "Release")
if(MSVC)
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /GS")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /sdl")
set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /NXCompat")
set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /DynamicBase")
# /SafeSEH is only meaningful for 32-bit x86 images. NEO_ARCH is set to either
# "x64" or "x86" earlier in this file, so the 32-bit case must match "x86".
if("${NEO_ARCH}" STREQUAL "x86")
  set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /SafeSEH")
endif()
else()
if(${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstack-protector-strong")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -D_FORTIFY_SOURCE=2")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wformat -Wformat-security")
else()
# gcc, g++ only
if (CMAKE_C_COMPILER_VERSION VERSION_LESS 4.9)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstack-protector")
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstack-protector-strong")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -D_FORTIFY_SOURCE=2")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wformat -Wformat-security")
set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} -Wl,-z,noexecstack")
set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} -Wl,-z,relro")
set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} -Wl,-z,now")
endif()
endif()
endif()
# Project-wide include paths
include_directories(${IGDRCL_SOURCE_DIR})
include_directories(${IGDRCL_BUILD_DIR})
# Define where to put binaries
if(MSVC)
if ("${CMAKE_GENERATOR}" STREQUAL "Ninja")
set(TargetDir ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
else()
set(TargetDir ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${CMAKE_BUILD_TYPE})
endif()
else()
set(TargetDir ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
endif(MSVC)
add_subdirectory(offline_compiler ${IGDRCL_BUILD_DIR}/offline_compiler)
target_compile_definitions(cloc PUBLIC MOCKABLE_VIRTUAL=)
# generate_runtime_lib(LIB_NAME MOCKABLE GENERATE_EXEC)
#   Adds the "runtime" subdirectory once per call, using a binary directory
#   named after LIB_NAME, and then sets the MOCKABLE_VIRTUAL compile definition
#   on the resulting targets.
#
#   LIB_NAME       name of the static runtime library target; also used to
#                  derive "<LIB_NAME>_sharings_enable".
#   MOCKABLE       boolean; when true MOCKABLE_VIRTUAL expands to "virtual" and
#                  the DEFAULT_TEST_* definitions are added, otherwise
#                  MOCKABLE_VIRTUAL expands to nothing.
#   GENERATE_EXEC  copied into GENERATE_EXECUTABLE before the subdirectory is
#                  added.
#
#   This is a macro, so the set() calls below change the caller's variables.
#   NOTE(review): NEO_STATIC_LIB_NAME, SHARINGS_ENABLE_LIB_NAME and
#   GENERATE_EXECUTABLE are presumably consumed by runtime/CMakeLists.txt -
#   confirm there.
macro(generate_runtime_lib LIB_NAME MOCKABLE GENERATE_EXEC)
set(NEO_STATIC_LIB_NAME ${LIB_NAME})
set(SHARINGS_ENABLE_LIB_NAME "${LIB_NAME}_sharings_enable")
set(GENERATE_EXECUTABLE ${GENERATE_EXEC})
add_subdirectory(runtime "${IGDRCL_BUILD_DIR}/${LIB_NAME}")
# The built-ins and scheduler libraries always get an empty MOCKABLE_VIRTUAL.
# Their *_LIB_NAME variables are set after this macro definition but before
# its first invocation.
target_compile_definitions(${BUILTINS_SOURCES_LIB_NAME} PUBLIC MOCKABLE_VIRTUAL=)
target_compile_definitions(${BUILTINS_BINARIES_LIB_NAME} PUBLIC MOCKABLE_VIRTUAL=)
target_compile_definitions(${SCHEDULER_BINARY_LIB_NAME} PUBLIC MOCKABLE_VIRTUAL=)
# MOCKABLE is substituted textually, so this evaluates the literal TRUE/FALSE
# passed by the caller.
if(${MOCKABLE})
target_compile_definitions(${LIB_NAME} PUBLIC MOCKABLE_VIRTUAL=virtual)
target_compile_definitions(${SHARINGS_ENABLE_LIB_NAME} PUBLIC MOCKABLE_VIRTUAL=virtual)
target_compile_definitions(${LIB_NAME} PUBLIC DEFAULT_TEST_PLATFORM=${DEFAULT_TESTED_PLATFORM} DEFAULT_TEST_FAMILY_NAME=${DEFAULT_TESTED_FAMILY_NAME})
else()
target_compile_definitions(${LIB_NAME} PUBLIC MOCKABLE_VIRTUAL=)
target_compile_definitions(${SHARINGS_ENABLE_LIB_NAME} PUBLIC MOCKABLE_VIRTUAL=)
endif()
endmacro(generate_runtime_lib)
set(NEO_MOCKABLE_LIB_NAME "igdrcl_lib_mockable") # Used by ULTS
set(NEO_RELEASE_LIB_NAME "igdrcl_lib_release") # Used by dll/so
set(NEO_DYNAMIC_LIB_NAME "igdrcl_dll") # single NEO dll (when WUD-crosscompilation is disabled)
set(NEO_DLL_NAME_BASE "igdrcl")
set(BIKSIM_LIB_NAME "biksim")
set(BUILTINS_SOURCES_LIB_NAME "builtins_sources")
set(BUILTINS_BINARIES_LIB_NAME "builtins_binaries")
set(SCHEDULER_BINARY_LIB_NAME "scheduler_binary")
add_subdirectory(elf)
generate_runtime_lib(${NEO_RELEASE_LIB_NAME} FALSE TRUE)
generate_runtime_lib(${NEO_MOCKABLE_LIB_NAME} TRUE FALSE)
# Build the sibling ../icd directory when it is present.
# if(EXISTS) is only well-defined for full paths, so anchor the check to the
# directory that add_subdirectory() resolves its relative path against.
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../icd")
  add_subdirectory(../icd ${IGDRCL_BINARY_DIR}/icd)
endif()
if(TARGET OpenCL)
target_include_directories(OpenCL PRIVATE ${KHRONOS_HEADERS_DIR})
endif()
if(DEFAULT_TESTED_PLATFORM)
add_subdirectory(unit_tests ${IGDRCL_BUILD_DIR}/unit_tests)
endif()
set(DONT_LINK_ELF_STATICALLY TRUE)
if(EXISTS ${IGDRCL_SOURCE_DIR}/../internal)
add_subdirectory(${IGDRCL_SOURCE_DIR}/../internal ${IGDRCL_BUILD_DIR}/internal)
endif(EXISTS ${IGDRCL_SOURCE_DIR}/../internal)
set(CL_USE_DEPRECATED_OPENCL_1_1_APIS 1)
set(CL_USE_DEPRECATED_OPENCL_1_2_APIS 1)
set(CL_USE_DEPRECATED_OPENCL_2_0_APIS 1)
set(_CRT_SECURE_NO_WARNINGS 1)
include(package.cmake)
configure_file(config.h.in ${IGDRCL_BUILD_DIR}/config.h)

5
Jenkinsfile vendored Normal file
View File

@@ -0,0 +1,5 @@
#!groovy
neoDependenciesRev='716918-671'
strategy='EQUAL'
allowedF=49
allowedCD=381

76
README.md Normal file
View File

@@ -0,0 +1,76 @@
# Intel(R) Graphics Compute Runtime for OpenCL(TM)
## Introduction
The Intel(R) Graphics Compute Runtime for OpenCL(TM) is an open source project to
converge Intel's development efforts on OpenCL(TM) compute stacks supporting the
GEN graphics hardware architecture.
Please refer to http://01.org/compute-runtime for additional details regarding Intel's
motivation and intentions with respect to OpenCL support in open source.
## License
The Intel(R) Graphics Compute Runtime for OpenCL(TM) is distributed under the MIT License.
You may obtain a copy of the License at:
https://opensource.org/licenses/MIT
## Building
<TODO:insert instructions here>
### Install
<TODO:insert instructions here>
## Supported Platforms
Intel Core Processors supporting Gen8 graphics devices - OpenCL 2.0
Intel Core Processors supporting Gen9 graphics devices - OpenCL 2.1
Intel Atom Processors supporting Gen9 graphics devices - OpenCL 1.2
## How to provide feedback
By default, please submit an issue using native github.com interface: https://github.com/intel/compute-runtime/issues.
## How to contribute
Create a pull request on github.com with your patch. Make sure your change builds cleanly and passes the unit-level tests (ULTs).
A maintainer will contact you if there are questions or concerns.
## Known Issues and Limitations
OpenCL compliance of a driver built from open-source components should not be
assumed by default. Intel will clearly designate / tag specific builds to
indicate production quality including formal compliance. Other builds should be
considered experimental.
The driver has the following functional delta compared to previously released drivers:
* Intel's closed source SRB5.0 driver (aka Classic)
https://software.intel.com/en-us/articles/opencl-drivers#latest_linux_driver
* Intel's former open-source Beignet driver
https://01.org/beignet
## Generic extensions
* cl_khr_mipmap
* cl_khr_mipmap_writes
* cl_khr_priority_hints
* cl_khr_throttle_hints
* cl_khr_fp64
## Preview extensions
* cl_intelx_video_enhancement
* cl_intelx_video_enhancement_camera_pipeline
* cl_intelx_video_enhancement_color_pipeline
* cl_intelx_hevc_pak
## Other capabilities
* OpenGL sharing with MESA driver
* CL_MEM_SVM_FINE_GRAIN_BUFFER (if using unpatched i915)
___(*) Other names and brands may be claimed as the property of others.___

40
config.h.in Normal file
View File

@@ -0,0 +1,40 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/*
 * Template for the generated config.h.
 * The cmakedefine lines below are rewritten when the build system configures
 * this file: each becomes a define when the CMake variable of the same name is
 * set to a true value, and a commented-out undef otherwise.
 */
#ifndef CONFIG_H
#define CONFIG_H
/* Opt-in switches for deprecated OpenCL APIs, plus the MSVC CRT warning switch. */
#cmakedefine CL_USE_DEPRECATED_OPENCL_1_1_APIS
#cmakedefine CL_USE_DEPRECATED_OPENCL_1_2_APIS
#cmakedefine CL_USE_DEPRECATED_OPENCL_2_0_APIS
#cmakedefine _CRT_SECURE_NO_WARNINGS
/* Program binary cache: clCacheEnabled mirrors whether USE_CL_CACHE is defined. */
#cmakedefine USE_CL_CACHE
#if defined(USE_CL_CACHE)
static const bool clCacheEnabled = true;
#else
static const bool clCacheEnabled = false;
#endif
/* Cache location string, substituted from the CMake variable of the same name. */
#cmakedefine CL_CACHE_LOCATION "${CL_CACHE_LOCATION}"
#endif /* CONFIG_H */

38
elf/CMakeLists.txt Normal file
View File

@@ -0,0 +1,38 @@
# Copyright (c) 2017, Intel Corporation
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
# We require cmake 3.2.0 or later
cmake_minimum_required(VERSION 3.2.0 FATAL_ERROR)
# elflib: static library built from the ELF reader/writer sources in this
# directory.
# NOTE(review): CMakeLists.txt is listed among the sources, presumably so that
# it shows up in IDE projects - confirm.
add_library(elflib STATIC
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
${CMAKE_CURRENT_SOURCE_DIR}/reader.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reader.h
${CMAKE_CURRENT_SOURCE_DIR}/types.h
${CMAKE_CURRENT_SOURCE_DIR}/writer.cpp
${CMAKE_CURRENT_SOURCE_DIR}/writer.h
)
# Consumers get this directory on their include path (PUBLIC); the runtime
# source root is added for building elflib itself only (PRIVATE).
target_include_directories(elflib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
target_include_directories(elflib PRIVATE ${IGDRCL_SOURCE_DIR})
# Group the target under an "elflib" IDE folder and build it as
# position-independent code.
set_target_properties(elflib PROPERTIES FOLDER "elflib")
set_target_properties(elflib PROPERTIES POSITION_INDEPENDENT_CODE ON)

254
elf/reader.cpp Normal file
View File

@@ -0,0 +1,254 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "reader.h"
#include <string.h>
namespace CLElfLib {
/******************************************************************************\
Constructor: CElfReader::CElfReader
Description: Wraps an in-memory ELF64 binary without copying it and caches the
             location and size of the section-name string table.
             elfBinarySize is accepted but not used by this constructor.
\******************************************************************************/
CElfReader::CElfReader(
    const char *pElfBinary,
    const size_t elfBinarySize)
    : m_pElfHeader((SElf64Header *)pElfBinary),
      m_pBinary(pElfBinary),
      m_pNameTable(NULL),
      m_nameTableSize(0) {
    // Without a binary there is no header to take the name-table index from.
    if (m_pElfHeader == NULL) {
        return;
    }
    // Look up the string table; if the lookup fails the members keep the
    // NULL / zero values set above.
    getSectionData(m_pElfHeader->SectionNameTableIndex, m_pNameTable, m_nameTableSize);
}
/******************************************************************************\
Destructor: CElfReader::~CElfReader
Description: Empty; the destructor body releases nothing, so the wrapped binary
passed to the constructor is left untouched.
\******************************************************************************/
CElfReader::~CElfReader() {
}
/******************************************************************************\
Member Function: CElfReader::create
Description: Factory. Returns a new reader for the given binary when it passes
             isValidElf64(), otherwise NULL. Release the result with destroy().
\******************************************************************************/
CElfReader *CElfReader::create(
    const char *pElfBinary,
    const size_t elfBinarySize) {
    // Refuse to wrap anything that fails validation.
    if (!isValidElf64(pElfBinary, elfBinarySize)) {
        return NULL;
    }
    return new CElfReader(pElfBinary, elfBinarySize);
}
/******************************************************************************\
Member Function: CElfReader::destroy
Description: Deletes a reader obtained from create() and resets the caller's
             pointer to NULL. A NULL pointer is accepted and left as is.
\******************************************************************************/
void CElfReader::destroy(
    CElfReader *&pElfReader) {
    if (pElfReader == NULL) {
        return;
    }
    delete pElfReader;
    pElfReader = NULL;
}
/******************************************************************************\
Member Function: IsValidElf64
Description: Determines if a binary is in the ELF64 format checks for
invalid offsets.
\******************************************************************************/
bool CElfReader::isValidElf64(
const void *pBinary,
const size_t binarySize) {
bool retVal = false;
SElf64Header *pElf64Header = NULL;
SElf64SectionHeader *pSectionHeader = NULL;
char *pNameTable = NULL;
char *pEnd = NULL;
size_t ourSize = 0;
size_t entrySize = 0;
size_t indexedSectionHeaderOffset = 0;
// validate header
if (pBinary && (binarySize >= sizeof(SElf64Header))) {
// calculate a pointer to the end
pEnd = (char *)pBinary + binarySize;
pElf64Header = (SElf64Header *)pBinary;
if ((pElf64Header->Identity[ID_IDX_MAGIC0] == ELF_MAG0) &&
(pElf64Header->Identity[ID_IDX_MAGIC1] == ELF_MAG1) &&
(pElf64Header->Identity[ID_IDX_MAGIC2] == ELF_MAG2) &&
(pElf64Header->Identity[ID_IDX_MAGIC3] == ELF_MAG3) &&
(pElf64Header->Identity[ID_IDX_CLASS] == EH_CLASS_64)) {
ourSize += pElf64Header->ElfHeaderSize;
retVal = true;
}
}
// validate sections
if (retVal == true) {
// get the section entry size
entrySize = pElf64Header->SectionHeaderEntrySize;
// get an offset to the name table
if (pElf64Header->SectionNameTableIndex <
pElf64Header->NumSectionHeaderEntries) {
indexedSectionHeaderOffset =
(size_t)pElf64Header->SectionHeadersOffset +
(pElf64Header->SectionNameTableIndex * entrySize);
if (((char *)pBinary + indexedSectionHeaderOffset) <= pEnd) {
pNameTable = (char *)pBinary + indexedSectionHeaderOffset;
}
}
for (unsigned int i = 0; i < pElf64Header->NumSectionHeaderEntries; i++) {
indexedSectionHeaderOffset = (size_t)pElf64Header->SectionHeadersOffset +
(i * entrySize);
// check section header offset
if (((char *)pBinary + indexedSectionHeaderOffset) > pEnd) {
retVal = false;
break;
}
pSectionHeader = (SElf64SectionHeader *)((char *)pBinary + indexedSectionHeaderOffset);
// check section data
if (((char *)pBinary + pSectionHeader->DataOffset + pSectionHeader->DataSize) > pEnd) {
retVal = false;
break;
}
// check section name index
if ((pNameTable + pSectionHeader->Name) > pEnd) {
retVal = false;
break;
}
// tally up the sizes
ourSize += (size_t)pSectionHeader->DataSize;
ourSize += (size_t)entrySize;
}
if (ourSize != binarySize) {
retVal = false;
}
}
return retVal;
}
/******************************************************************************\
Member Function: CElfReader::getElfHeader
Description: Returns the pointer to the ELF header that was set by the
constructor, i.e. the start of the wrapped binary.
\******************************************************************************/
const SElf64Header *CElfReader::getElfHeader() {
return m_pElfHeader;
}
/******************************************************************************\
Member Function: CElfReader::getSectionHeader
Description: Returns a pointer to the header of section `sectionIndex`, located
             at SectionHeadersOffset + sectionIndex * SectionHeaderEntrySize
             from the start of the binary, or NULL when the index is not below
             NumSectionHeaderEntries.
\******************************************************************************/
const SElf64SectionHeader *CElfReader::getSectionHeader(
    unsigned int sectionIndex) {
    // Indices at or beyond the section count have no header.
    if (sectionIndex >= m_pElfHeader->NumSectionHeaderEntries) {
        return NULL;
    }
    const size_t stride = m_pElfHeader->SectionHeaderEntrySize;
    const size_t byteOffset = (size_t)m_pElfHeader->SectionHeadersOffset +
                              (sectionIndex * stride);
    return (SElf64SectionHeader *)((char *)m_pElfHeader + byteOffset);
}
/******************************************************************************\
Member Function: GetSectionData
Description: Returns a pointer to and size of the requested section's
data
\******************************************************************************/
bool CElfReader::getSectionData(
const unsigned int sectionIndex,
char *&pData,
size_t &dataSize) {
const SElf64SectionHeader *pSectionHeader = getSectionHeader(sectionIndex);
if (pSectionHeader) {
pData = (char *)m_pBinary + pSectionHeader->DataOffset;
dataSize = (size_t)pSectionHeader->DataSize;
return true;
}
return false;
}
/******************************************************************************\
Member Function: CElfReader::getSectionData (by name)
Description: Finds the first section, starting at index 1 (index 0 is skipped),
             whose name equals pName and reports the location and size of its
             data.
Parameters:  pName    - NUL-terminated section name to look for; NULL matches
                        nothing
             pData    - receives the start of the section's data
             dataSize - receives the size of the section's data in bytes
Returns:     true when a matching section was found, false otherwise (outputs
             are then left unmodified)
\******************************************************************************/
bool CElfReader::getSectionData(
    const char *pName,
    char *&pData,
    size_t &dataSize) {
    // strcmp must never see a NULL search string.
    if (pName == NULL) {
        return false;
    }
    for (unsigned int i = 1; i < m_pElfHeader->NumSectionHeaderEntries; i++) {
        const char *pSectionName = getSectionName(i);
        if (pSectionName && (strcmp(pName, pSectionName) == 0)) {
            // Report the outcome of the actual lookup instead of assuming success.
            return getSectionData(i, pData, dataSize);
        }
    }
    return false;
}
/******************************************************************************\
Member Function: GetSectionName
Description: Returns a pointer to a NULL terminated string
\******************************************************************************/
const char *CElfReader::getSectionName(
unsigned int sectionIndex) {
char *pName = NULL;
const SElf64SectionHeader *pSectionHeader = getSectionHeader(sectionIndex);
if (pSectionHeader) {
pName = m_pNameTable + pSectionHeader->Name;
}
return pName;
}
} // namespace CLElfLib

85
elf/reader.h Normal file
View File

@@ -0,0 +1,85 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include "types.h"
#if defined(_WIN32)
#define ELF_CALL __stdcall
#else
#define ELF_CALL
#endif
namespace CLElfLib {
/******************************************************************************\
Class: CElfReader
Description: Class to provide simpler interaction with the ELF standard
binary object. SElf64Header defines the ELF header type and
SElf64SectionHeader defines the section header type.
Instances are obtained from create() and released with destroy(); the
constructor and destructor are protected. The reader keeps pointers into
the binary passed to create() and does not copy it.
\******************************************************************************/
class CElfReader {
public:
// Returns a new reader when the binary passes isValidElf64(), otherwise NULL.
static CElfReader *ELF_CALL create(
const char *pElfBinary,
const size_t elfBinarySize);
// Deletes the reader (if any) and sets the caller's pointer to NULL.
static void ELF_CALL destroy(
CElfReader *&pElfObject);
// Checks the ELF magic, the 64-bit class and the section offsets of a binary.
static bool ELF_CALL isValidElf64(
const void *pBinary,
const size_t binarySize);
// Returns the ELF header, i.e. the start of the wrapped binary.
const SElf64Header *ELF_CALL getElfHeader();
// Returns the header of the given section, or NULL if the index is out of range.
const SElf64SectionHeader *ELF_CALL getSectionHeader(
unsigned int sectionIndex);
// Returns the given section's name from the string table, or NULL if the
// index is out of range.
const char *ELF_CALL getSectionName(
unsigned int sectionIndex);
// Looks a section up by index; on success pData/dataSize describe its data.
bool ELF_CALL getSectionData(
const unsigned int sectionIndex,
char *&pData,
size_t &dataSize);
// Looks a section up by name; on success pData/dataSize describe its data.
bool ELF_CALL getSectionData(
const char *sectionName,
char *&pData,
size_t &dataSize);
protected:
// Construction goes through create(), which validates the binary first.
ELF_CALL CElfReader(
const char *pElfBinary,
const size_t elfBinarySize);
// Destruction goes through destroy().
ELF_CALL ~CElfReader();
SElf64Header *m_pElfHeader; // pointer to the ELF header
const char *m_pBinary; // portable ELF binary
char *m_pNameTable; // pointer to the string table
size_t m_nameTableSize; // size of string table in bytes
};
} // namespace CLElfLib

201
elf/types.h Normal file
View File

@@ -0,0 +1,201 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
// Abstract: Defines the types used for ELF headers/sections.
#pragma once
#include <inttypes.h>
#include <stddef.h>
namespace CLElfLib {
/******************************************************************************\
ELF Enumerates
\******************************************************************************/
// E_ID_IDX - Byte indices into the Identity array at the start of the ELF
// header (SElf64Header::Identity). The MAGIC0..MAGIC3 bytes identify a file
// as being ELF.
enum E_ID_IDX {
ID_IDX_MAGIC0 = 0,
ID_IDX_MAGIC1 = 1,
ID_IDX_MAGIC2 = 2,
ID_IDX_MAGIC3 = 3,
ID_IDX_CLASS = 4,
ID_IDX_VERSION = 5,
ID_IDX_OSABI = 6,
ID_IDX_ABI_VERSION = 7,
ID_IDX_PADDING = 8,
// Not an index: the length of the Identity array.
ID_IDX_NUM_BYTES = 16,
};
// E_EH_CLASS - Describes what data types the ELF structures will use.
// Stored in Identity[ID_IDX_CLASS] of the ELF header.
enum E_EH_CLASS {
EH_CLASS_NONE = 0,
EH_CLASS_32 = 1, // Use Elf32 data types
EH_CLASS_64 = 2, // Use Elf64 data types
};
// E_EH_TYPE - List of pre-defined ELF header types.
// OS-specific codes start at 0xfe00 and run to 0xfeff.
// Processor-specific codes start at 0xff00 and end at 0xffff.
// The OpenCL values below all lie in the 0xff00-0xffff range.
enum E_EH_TYPE {
EH_TYPE_NONE = 0,
EH_TYPE_RELOCATABLE = 1,
EH_TYPE_EXECUTABLE = 2,
EH_TYPE_DYNAMIC = 3,
EH_TYPE_CORE = 4,
EH_TYPE_OPENCL_SOURCE = 0xff01, // format used to pass CL text sections to FE
EH_TYPE_OPENCL_OBJECTS = 0xff02, // format used to pass LLVM objects / store LLVM binary output
EH_TYPE_OPENCL_LIBRARY = 0xff03, // format used to store LLVM archive output
EH_TYPE_OPENCL_EXECUTABLE = 0xff04, // format used to store executable output
EH_TYPE_OPENCL_DEBUG = 0xff05, // format used to store debug output
};
// E_EH_MACHINE - List of pre-defined machine types.
// For OpenCL, currently, we do not need this information, so this is not
// fully defined: EH_MACHINE_NONE is the only enumerator, the reserved-range
// markers below are commented out.
enum E_EH_MACHINE {
EH_MACHINE_NONE = 0,
//EHT_MACHINE_LO_RSVD = 1, // Beginning of range of reserved types.
//EHT_MACHINE_HI_RSVD = 200, // End of range of reserved types.
};
// E_EHT_VERSION - ELF header version options.
// CElfWriter::patchElfHeader stores EH_VERSION_CURRENT in
// Identity[ID_IDX_VERSION].
enum E_EHT_VERSION {
EH_VERSION_INVALID = 0,
EH_VERSION_CURRENT = 1,
};
// E_SH_TYPE - List of pre-defined section header types, stored in
// SElf64SectionHeader::Type.
// Processor-specific codes start at 0xff00 and end at 0xffff.
// NOTE(review): the OpenCL/SPIRV values below are 0xff0000xx, which is outside
// the 0xff00-0xffff range quoted above - the range comment looks copied from
// E_EH_TYPE; confirm the intended range.
enum E_SH_TYPE {
SH_TYPE_NULL = 0,
SH_TYPE_PROG_BITS = 1,
SH_TYPE_SYM_TBL = 2,
SH_TYPE_STR_TBL = 3,
SH_TYPE_RELO_ADDS = 4,
SH_TYPE_HASH = 5,
SH_TYPE_DYN = 6,
SH_TYPE_NOTE = 7,
SH_TYPE_NOBITS = 8,
SH_TYPE_RELO_NO_ADDS = 9,
SH_TYPE_SHLIB = 10,
SH_TYPE_DYN_SYM_TBL = 11,
SH_TYPE_INIT = 14,
SH_TYPE_FINI = 15,
SH_TYPE_PRE_INIT = 16,
SH_TYPE_GROUP = 17,
SH_TYPE_SYMTBL_SHNDX = 18,
SH_TYPE_OPENCL_SOURCE = 0xff000000, // CL source to link into LLVM binary
SH_TYPE_OPENCL_HEADER = 0xff000001, // CL header to link into LLVM binary
SH_TYPE_OPENCL_LLVM_TEXT = 0xff000002, // LLVM text
SH_TYPE_OPENCL_LLVM_BINARY = 0xff000003, // LLVM byte code
SH_TYPE_OPENCL_LLVM_ARCHIVE = 0xff000004, // LLVM archives(s)
SH_TYPE_OPENCL_DEV_BINARY = 0xff000005, // Device binary (coherent by default)
SH_TYPE_OPENCL_OPTIONS = 0xff000006, // CL Options
SH_TYPE_OPENCL_PCH = 0xff000007, // PCH (pre-compiled headers)
SH_TYPE_OPENCL_DEV_DEBUG = 0xff000008, // Device debug
SH_TYPE_SPIRV = 0xff000009, // SPIRV
SH_TYPE_NON_COHERENT_DEV_BINARY = 0xff00000a, // Non-coherent Device binary
};
// E_SH_FLAG - List of section header flags. Each enumerator is a distinct bit
// (or, for the two MASK entries, a group of bits) of the section header's
// Flags field.
enum E_SH_FLAG {
SH_FLAG_WRITE = 0x1,
SH_FLAG_ALLOC = 0x2,
SH_FLAG_EXEC_INSTR = 0x4,
SH_FLAG_MERGE = 0x8,
SH_FLAG_STRINGS = 0x10,
SH_FLAG_INFO_LINK = 0x20,
SH_FLAG_LINK_ORDER = 0x40,
SH_FLAG_OS_NONCONFORM = 0x100,
SH_FLAG_GROUP = 0x200,
SH_FLAG_TLS = 0x400,
SH_FLAG_MASK_OS = 0x0ff00000,
SH_FLAG_MASK_PROC = 0xf0000000,
};
/******************************************************************************\
ELF-64 Data Types
Fixed-width integer aliases used for the fields of SElf64Header and
SElf64SectionHeader.
\******************************************************************************/
#if defined(_MSC_VER) // && (_MSC_VER < 1700)
// MSVC: spell the aliases with the compiler's sized integer keywords.
typedef unsigned __int64 Elf64_Addr;
typedef unsigned __int64 Elf64_Off;
typedef unsigned __int16 Elf64_Short; // Renaming Elf64_Half to Elf64_Short to avoid a conflict with Android
typedef unsigned __int32 Elf64_Word;
typedef __int32 Elf64_Sword;
typedef unsigned __int64 Elf64_Xword;
#else
// These five aliases are skipped when _UAPI_LINUX_ELF_H is defined -
// presumably because the Linux UAPI ELF header already declares the same
// names (TODO confirm).
#if !defined(_UAPI_LINUX_ELF_H)
typedef uint64_t Elf64_Addr;
typedef uint64_t Elf64_Off;
typedef uint32_t Elf64_Word;
typedef int32_t Elf64_Sword;
typedef uint64_t Elf64_Xword;
#endif
typedef uint16_t Elf64_Short; // Renaming Elf64_Half to Elf64_Short to avoid a conflict with Android
#endif
/******************************************************************************\
ELF Constants
The four magic bytes expected at the start of every ELF binary, plus the
alignment constant.
\******************************************************************************/
static const unsigned char ELF_MAG0 = 0x7f; // SElf64Header::Identity[ID_IDX_MAGIC0]
static const unsigned char ELF_MAG1 = 'E'; // SElf64Header::Identity[ID_IDX_MAGIC1]
static const unsigned char ELF_MAG2 = 'L'; // SElf64Header::Identity[ID_IDX_MAGIC2]
static const unsigned char ELF_MAG3 = 'F'; // SElf64Header::Identity[ID_IDX_MAGIC3]
static const unsigned int ELF_ALIGN_BYTES = 16; // Alignment set to 16-bytes
/******************************************************************************\
ELF-64 Header
Layout of the header found at the very start of an ELF-64 binary. The reader
casts the start of the binary to this type and the writer fills it in place.
\******************************************************************************/
struct SElf64Header {
unsigned char Identity[ID_IDX_NUM_BYTES]; // magic, class, version, ... (indexed by E_ID_IDX)
Elf64_Short Type; // one of E_EH_TYPE
Elf64_Short Machine; // one of E_EH_MACHINE
Elf64_Word Version;
Elf64_Addr EntryAddress;
Elf64_Off ProgramHeadersOffset;
Elf64_Off SectionHeadersOffset; // byte offset of the first section header from the start of the binary
Elf64_Word Flags;
Elf64_Short ElfHeaderSize; // size of this header in bytes
Elf64_Short ProgramHeaderEntrySize;
Elf64_Short NumProgramHeaderEntries;
Elf64_Short SectionHeaderEntrySize; // distance in bytes between consecutive section headers
Elf64_Short NumSectionHeaderEntries; // number of section headers
Elf64_Short SectionNameTableIndex; // index of the section holding the section names
};
/******************************************************************************\
ELF-64 Section Header
Describes one section of the binary; the headers are stored back to back
starting at SElf64Header::SectionHeadersOffset.
\******************************************************************************/
struct SElf64SectionHeader {
Elf64_Word Name; // byte offset of the section's name inside the section-name string table
Elf64_Word Type; // one of E_SH_TYPE
Elf64_Xword Flags; // combination of E_SH_FLAG bits
Elf64_Addr Address;
Elf64_Off DataOffset; // byte offset of the section's data from the start of the binary
Elf64_Xword DataSize; // size of the section's data in bytes
Elf64_Word Link;
Elf64_Word Info;
Elf64_Xword Alignment;
Elf64_Xword EntrySize;
};
} // namespace CLElfLib

285
elf/writer.cpp Normal file
View File

@@ -0,0 +1,285 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "writer.h"
#include "runtime/helpers/string.h"
#include <cstring>
namespace CLElfLib {
/******************************************************************************\
Constructor: CElfWriter::CElfWriter
Description: Stores the ELF type, machine and flags that patchElfHeader() later
             writes into the header. No section is added here.
\******************************************************************************/
CElfWriter::CElfWriter(
    E_EH_TYPE type,
    E_EH_MACHINE machine,
    Elf64_Xword flags)
    : m_type(type),
      m_machine(machine),
      m_flags(flags) {
}
/******************************************************************************\
Destructor: CElfWriter::~CElfWriter
Description: Frees every section node still waiting in the queue, together with
             the data buffer each node owns.
\******************************************************************************/
CElfWriter::~CElfWriter() {
    // Drain the queue; the loop's step expression pops the entry just handled.
    for (; !m_nodeQueue.empty(); m_nodeQueue.pop()) {
        SSectionNode *pPending = m_nodeQueue.front();
        if (pPending == NULL) {
            continue;
        }
        // delete[] on a NULL data pointer is a no-op, so no separate check.
        delete[] pPending->pData;
        delete pPending;
    }
}
/******************************************************************************\
Member Function: CElfWriter::create
Description: Factory. Allocates a writer and runs initialize() on it. Returns
             the writer, or NULL when initialization fails (the writer is then
             destroyed again). Release the result with destroy().
\******************************************************************************/
CElfWriter *CElfWriter::create(
    E_EH_TYPE type,
    E_EH_MACHINE machine,
    Elf64_Xword flags) {
    CElfWriter *pCreated = new CElfWriter(type, machine, flags);
    if (pCreated->initialize()) {
        return pCreated;
    }
    // destroy() deletes the writer and resets pCreated to NULL.
    destroy(pCreated);
    return pCreated;
}
/******************************************************************************\
Member Function: CElfWriter::destroy
Description: Deletes a writer obtained from create() and resets the caller's
             pointer to NULL. A NULL pointer is accepted and left as is.
\******************************************************************************/
void CElfWriter::destroy(
    CElfWriter *&pWriter) {
    if (pWriter == NULL) {
        return;
    }
    delete pWriter;
    pWriter = NULL;
}
/******************************************************************************\
Member Function: CElfWriter::addSection
Description: Queues a copy of the described section for the next
             resolveBinary() call. Type, flags, name and (if DataSize is
             non-zero) the data bytes are copied, so the caller keeps ownership
             of pSectionNode and its buffer.
Parameters:  pSectionNode - section to copy; must be non-NULL, and pData must
                            be non-NULL whenever DataSize is non-zero
Returns:     true when the section was queued; false when the description is
             rejected, in which case the writer is left unchanged
\******************************************************************************/
bool CElfWriter::addSection(
    SSectionNode *pSectionNode) {
    // The section description must be non-NULL
    if (!pSectionNode) {
        return false;
    }
    const unsigned int dataSize = pSectionNode->DataSize;
    // A non-empty section needs bytes to copy; otherwise an uninitialized
    // buffer would be queued and later written into the binary.
    if ((dataSize > 0) && (pSectionNode->pData == NULL)) {
        return false;
    }

    SSectionNode *pNode = new SSectionNode();
    // Defensive only: a plain new-expression reports failure by throwing.
    if (!pNode) {
        return false;
    }
    pNode->Flags = pSectionNode->Flags;
    pNode->Type = pSectionNode->Type;
    pNode->Name = pSectionNode->Name;

    // ok to have no data
    if (dataSize > 0) {
        pNode->pData = new char[dataSize];
        if (!pNode->pData) {
            delete pNode;
            return false;
        }
        memcpy_s(pNode->pData, dataSize, pSectionNode->pData, dataSize);
        pNode->DataSize = dataSize;
    }

    // push the node onto the queue
    m_nodeQueue.push(pNode);
    // increment the sizes for each section; the name takes its characters
    // plus one terminating NUL in the string table
    m_dataSize += dataSize;
    m_stringTableSize += pSectionNode->Name.size() + 1;
    m_numSections++;
    return true;
}
/******************************************************************************\
Member Function: CElfWriter::resolveBinary
Description: Computes the size of the ELF binary for the queued sections and,
             when a destination buffer is given, writes the binary into it.
             Layout written: ELF header, one section header per queued section
             followed by the string-table section header, all section data,
             then the string table with the NUL-terminated section names.
             Writing consumes the queue: every node is freed, and the section
             count is increased by one for the string table.
Parameters:  pBinary    - destination buffer, or NULL to only query the size.
                          No buffer size is passed in, so the caller must
                          provide at least the number of bytes reported by a
                          size query.
             binarySize - receives the total binary size on success
Returns:     true on success; false when patchElfHeader() fails
\******************************************************************************/
bool CElfWriter::resolveBinary(
    char *const pBinary,
    size_t &binarySize) {
    bool retVal = true;
    SSectionNode *pNode = NULL;
    SElf64SectionHeader *pCurSectionHeader = NULL;
    char *pData = NULL;
    char *pStringTable = NULL;
    char *pCurString = NULL;

    m_totalBinarySize =
        sizeof(SElf64Header) +
        ((m_numSections + 1) * sizeof(SElf64SectionHeader)) + // +1 to account for string table entry
        m_dataSize +
        m_stringTableSize;

    if (pBinary) {
        // get a pointer to the first section header
        pCurSectionHeader = (SElf64SectionHeader *)(pBinary + sizeof(SElf64Header));
        // get a pointer to the data
        pData = pBinary +
                sizeof(SElf64Header) +
                ((m_numSections + 1) * sizeof(SElf64SectionHeader)); // +1 to account for string table entry
        // get a pointer to the string table
        pStringTable = pBinary + sizeof(SElf64Header) +
                       ((m_numSections + 1) * sizeof(SElf64SectionHeader)) + // +1 to account for string table entry
                       m_dataSize;
        pCurString = pStringTable;

        // Walk through the section nodes
        while (m_nodeQueue.empty() == false) {
            pNode = m_nodeQueue.front();
            // Pop before looking at the node (as the destructor does) so that a
            // NULL entry is dropped instead of stalling the loop forever.
            m_nodeQueue.pop();
            if (!pNode) {
                continue;
            }

            // Copy data into the section header
            memset(pCurSectionHeader, 0, sizeof(SElf64SectionHeader));
            pCurSectionHeader->Type = pNode->Type;
            pCurSectionHeader->Flags = pNode->Flags;
            pCurSectionHeader->DataSize = pNode->DataSize;
            pCurSectionHeader->DataOffset = pData - pBinary;
            pCurSectionHeader->Name = (Elf64_Word)(pCurString - pStringTable);
            pCurSectionHeader = (SElf64SectionHeader *)((unsigned char *)pCurSectionHeader + sizeof(SElf64SectionHeader));

            // copy the data, move the data pointer
            memcpy_s(pData, pNode->DataSize, pNode->pData, pNode->DataSize);
            pData += pNode->DataSize;

            // copy the name into the string table, move the string pointer
            if (pNode->Name.size() > 0) {
                memcpy_s(pCurString, pNode->Name.size(), pNode->Name.c_str(), pNode->Name.size());
                pCurString += pNode->Name.size();
            }
            *(pCurString++) = '\0';

            // delete the node and its data
            if (pNode->pData) {
                delete[] pNode->pData;
                pNode->pData = NULL;
            }
            delete pNode;
            pNode = nullptr;
        }

        // add the string table section header; Name 0 refers to the first
        // string in the table
        SElf64SectionHeader stringSectionHeader = {0};
        stringSectionHeader.Type = SH_TYPE_STR_TBL;
        stringSectionHeader.Flags = 0;
        stringSectionHeader.DataOffset = pStringTable - pBinary;
        stringSectionHeader.DataSize = m_stringTableSize;
        stringSectionHeader.Name = 0;

        // Copy into the last section header
        memcpy_s(pCurSectionHeader, sizeof(SElf64SectionHeader),
                 &stringSectionHeader, sizeof(SElf64SectionHeader));

        // Add to our section number
        m_numSections++;

        // patch up the ELF header
        retVal = patchElfHeader(pBinary);
    }

    if (retVal) {
        binarySize = m_totalBinarySize;
    }
    return retVal;
}
/******************************************************************************\
Member Function: CElfWriter::initialize
Description: Queues a default-constructed section (type SH_TYPE_NULL, no flags,
empty name, no data) so that it becomes section 0 of the binary.
Returns the result of addSection().
\******************************************************************************/
bool CElfWriter::initialize() {
SSectionNode emptySection;
// Add an empty section 0 (points to "no-bits")
return addSection(&emptySection);
}
/******************************************************************************\
Member Function: CElfWriter::PatchElfHeader
\******************************************************************************/
bool CElfWriter::patchElfHeader(char *const pBinary) {
SElf64Header *pElfHeader = (SElf64Header *)pBinary;
if (pElfHeader) {
// Setup the identity
memset(pElfHeader, 0x00, sizeof(SElf64Header));
pElfHeader->Identity[ID_IDX_MAGIC0] = ELF_MAG0;
pElfHeader->Identity[ID_IDX_MAGIC1] = ELF_MAG1;
pElfHeader->Identity[ID_IDX_MAGIC2] = ELF_MAG2;
pElfHeader->Identity[ID_IDX_MAGIC3] = ELF_MAG3;
pElfHeader->Identity[ID_IDX_CLASS] = EH_CLASS_64;
pElfHeader->Identity[ID_IDX_VERSION] = EH_VERSION_CURRENT;
// Add other non-zero info
pElfHeader->Type = m_type;
pElfHeader->Machine = m_machine;
pElfHeader->Flags = (unsigned int)m_flags;
pElfHeader->ElfHeaderSize = sizeof(SElf64Header);
pElfHeader->SectionHeaderEntrySize = sizeof(SElf64SectionHeader);
pElfHeader->NumSectionHeaderEntries = (Elf64_Short)m_numSections;
pElfHeader->SectionHeadersOffset = (unsigned int)(sizeof(SElf64Header));
pElfHeader->SectionNameTableIndex = m_numSections - 1; // last index
return true;
}
return false;
}
} // namespace CLElfLib

105
elf/writer.h Normal file
View File

@@ -0,0 +1,105 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include "types.h"
#include <queue>
#include <string>
#if defined(_WIN32)
#define ELF_CALL __stdcall
#else
#define ELF_CALL
#endif
using namespace std;
namespace CLElfLib {
// Sizing constants for the ELF writer.
// NOTE(review): none of the three is referenced by the writer code shown in
// writer.cpp - confirm whether they are still used elsewhere.
static const unsigned int g_scElfHeaderAlignment = 16; // allocation alignment restriction
static const unsigned int g_scInitialElfSize = 2048; // initial elf size (in bytes)
static const unsigned int g_scInitNumSectionHeaders = 8; // initial number of section headers
struct SSectionNode {
E_SH_TYPE Type;
unsigned int Flags;
string Name;
char *pData;
unsigned int DataSize;
SSectionNode() {
Type = SH_TYPE_NULL;
Flags = 0;
pData = NULL;
DataSize = 0;
}
~SSectionNode() {
}
};
/******************************************************************************\
Class: CElfWriter
Description: Class to provide simpler interaction with the ELF standard
binary object. SElf64Header defines the ELF header type and
SElf64SectionHeader defines the section header type.
Sections are collected with addSection() and written out in one go
by resolveBinary(). Instances are obtained from create() and released
with destroy(); the constructor and destructor are protected.
\******************************************************************************/
class CElfWriter {
public:
// Allocates and initializes a writer; returns NULL when initialize() fails.
static CElfWriter *ELF_CALL create(
E_EH_TYPE type,
E_EH_MACHINE machine,
Elf64_Xword flags);
// Deletes the writer (if any) and sets the caller's pointer to NULL.
static void ELF_CALL destroy(CElfWriter *&pElfWriter);
// Queues a copy of the given section (including its data bytes).
bool ELF_CALL addSection(
SSectionNode *pSectionNode);
// Reports the binary size in dataSize; when pBinary is non-NULL the binary is
// also written there and the queued sections are consumed.
bool ELF_CALL resolveBinary(
char *const pBinary,
size_t &dataSize);
// Queues the empty section 0; called by create().
bool ELF_CALL initialize();
// Writes the ELF header at the start of pBinary; called by resolveBinary().
bool ELF_CALL patchElfHeader(char *const pBinary);
protected:
// Construction goes through create().
ELF_CALL CElfWriter(
E_EH_TYPE type,
E_EH_MACHINE machine,
Elf64_Xword flags);
// Destruction goes through destroy(); frees any sections still queued.
ELF_CALL ~CElfWriter();
E_EH_TYPE m_type = EH_TYPE_NONE; // ELF type written to the header
E_EH_MACHINE m_machine = EH_MACHINE_NONE; // machine written to the header
Elf64_Xword m_flags = 0U; // flags written to the header
std::queue<SSectionNode *> m_nodeQueue; // owned section copies, in insertion order
unsigned int m_dataSize = 0U; // sum of the data sizes of all added sections
unsigned int m_numSections = 0U; // number of added sections
size_t m_stringTableSize = 0U; // bytes needed for all section names incl. terminators
size_t m_totalBinarySize = 0U; // size computed by the last resolveBinary() call
};
} // namespace ELFLib

61
manifests/manifest.yml Normal file
View File

@@ -0,0 +1,61 @@
components:
gmmlib:
branch: gmmlib
clean_on_sync: true
dest_dir: gmmlib
repository: https://github.com/intel/gmmlib.git
revision: 9a261a60bd990b237fe14138b7aaf5eaee342ff8
type: git
gmock:
branch: master
clean_on_sync: true
dest_dir: gmock
repository: https://github.com/google/googlemock.git
revision: c440c8fafc6f60301197720617ce64028e09c79d
type: git
gtest:
branch: master
clean_on_sync: true
dest_dir: gtest
repository: https://github.com/google/googletest.git
revision: c99458533a9b4c743ed51537e25989ea55944908
type: git
igc:
branch: igc
clean_on_sync: true
dest_dir: igc
repository: https://github.com/intel/intelgraphicscompiler
revision: d6379492df107094d0642f0ecf75a6f20ae573b2-2
type: git
infra:
branch: infra
clean_on_sync: true
dest_dir: infra
revision: c81cf66d7995e55cb8f11b24d3776e3fc013a809
type: git
internal:
branch: master
dest_dir: internal
revision: 93d1c17c98d8c051bcd4368686bc9cf2eddd8f8e
type: git
khronos:
branch: master
clean_on_sync: true
dest_dir: khronos
repository: https://github.com/KhronosGroup/OpenCL-Headers.git
revision: f039db6764d52388658ef15c30b2237bbda49803
type: git
libdrm:
branch: libdrm-2.4.84
clean_on_sync: true
dest_dir: libdrm
repository: https://anongit.freedesktop.org/git/mesa/drm.git
revision: 290d29d9794813a2fe0578dbb905ad09bc810516
type: git
wdk:
branch: wdk
clean_on_sync: true
dest_dir: wdk
revision: c67a2fa209d3ad3c3ab05f6f10e2234fd81fcebc
type: git
version: '1'

View File

@@ -0,0 +1,139 @@
# Copyright (c) 2017, Intel Corporation
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
# Build script for the "cloc" offline compiler executable.
# Variables such as IGDRCL_SOURCE_DIR, MAX_GEN, KHRONOS_HEADERS_DIR and the
# GEN_CONTAINS_PLATFORMS / GET_PLATFORMS_FOR_GEN commands are not defined in
# this file and are expected to be provided by the including project.
cmake_minimum_required(VERSION 3.2.0 FATAL_ERROR)
project(cloc)
# Sources shared by every platform. The list is also exported to the parent
# scope at the end of this file.
set(CLOC_SRCS_LIB
${IGDRCL_SOURCE_DIR}/offline_compiler/offline_compiler.cpp
${IGDRCL_SOURCE_DIR}/offline_compiler/offline_compiler.h
${IGDRCL_SOURCE_DIR}/offline_compiler/options.cpp
${IGDRCL_SOURCE_DIR}/offline_compiler/helper.cpp
${IGDRCL_SOURCE_DIR}/runtime/compiler_interface/create_main.cpp
${IGDRCL_SOURCE_DIR}/runtime/helpers/hw_info.cpp
${IGDRCL_SOURCE_DIR}/runtime/helpers/file_io.cpp
${IGDRCL_SOURCE_DIR}/runtime/helpers/abort.cpp
${IGDRCL_SOURCE_DIR}/runtime/helpers/debug_helpers.cpp
)
# OS-specific library loading and option sources.
if (WIN32)
list (APPEND CLOC_SRCS_LIB
${IGDRCL_SOURCE_DIR}/runtime/os_interface/windows/os_library.cpp
${IGDRCL_SOURCE_DIR}/runtime/os_interface/windows/options.cpp
)
endif (WIN32)
if (UNIX)
list (APPEND CLOC_SRCS_LIB
${IGDRCL_SOURCE_DIR}/runtime/os_interface/linux/os_library.cpp
${IGDRCL_SOURCE_DIR}/runtime/os_interface/linux/options.cpp
)
endif (UNIX)
list (APPEND HW_SRC_INCLUDES ${IGDRCL_SOURCE_DIR}/runtime/gen_common)
set(CLOC_LIB_FLAGS_DEFINITIONS
-DCIF_HEADERS_ONLY_BUILD
)
# Per-generation files that are added only if they exist on disk.
set(OPTIONAL_RUNTIME_GENX_FILES
hw_info.cpp
)
# For every generation number 0..MAX_GEN that reports supported platforms, add
# the optional generation files plus the hw_info_<platform>.cpp and
# enable_<platform>.cpp sources of each supported platform.
foreach(GEN_NUM RANGE 0 ${MAX_GEN} 1)
GEN_CONTAINS_PLATFORMS("SUPPORTED" ${GEN_NUM} GENX_HAS_PLATFORMS)
if(${GENX_HAS_PLATFORMS})
foreach(SRC_IT ${OPTIONAL_RUNTIME_GENX_FILES})
set(SRC_FILE ${IGDRCL_SOURCE_DIR}/runtime/gen${GEN_NUM}/${SRC_IT})
if(EXISTS ${SRC_FILE})
list(APPEND CLOC_SRCS_LIB ${SRC_FILE})
endif()
endforeach()
GET_PLATFORMS_FOR_GEN("SUPPORTED" ${GEN_NUM} SUPPORTED_GENX_PLATFORMS)
foreach(PLATFORM_IT ${SUPPORTED_GENX_PLATFORMS})
string(TOLOWER ${PLATFORM_IT} PLATFORM_IT_LOWER)
list (APPEND CLOC_SRCS_LIB
${IGDRCL_SOURCE_DIR}/runtime/gen${GEN_NUM}/hw_info_${PLATFORM_IT_LOWER}.cpp
${IGDRCL_SOURCE_DIR}/runtime/gen${GEN_NUM}/enable_${PLATFORM_IT_LOWER}.cpp
)
endforeach(PLATFORM_IT)
endif(${GENX_HAS_PLATFORMS})
endforeach(GEN_NUM)
# The executable consists of the shared sources, main.cpp and this script.
set(CLOC_SRCS
${CLOC_SRCS_LIB}
main.cpp
${IGDRCL_SOURCE_DIR}/offline_compiler/CMakeLists.txt
)
add_executable(cloc ${CLOC_SRCS})
# Optional include roots, added only when the corresponding variable is set.
if(IGC_OCL_ADAPTOR_DIR) # IGC/AdaptorOCL
target_include_directories(cloc PUBLIC "${IGC_OCL_ADAPTOR_DIR}")
endif(IGC_OCL_ADAPTOR_DIR)
if(CIF_BASE_DIR)
target_include_directories(cloc PUBLIC "${CIF_BASE_DIR}")
endif(CIF_BASE_DIR)
# Same include list as used for the target below, exported to the parent scope.
SET(CLOC_INCLUDES
"${HW_SRC_INCLUDES}"
"${UMKM_SHAREDDATA_INCLUDE_PATHS}"
"${KHRONOS_HEADERS_DIR}"
"${IGDRCL__IGC_INCLUDE_DIR}"
"${THIRD_PARTY_DIR}"
PARENT_SCOPE
)
# BEFORE: these directories are prepended to the target's include path.
target_include_directories(cloc BEFORE PRIVATE
"${HW_SRC_INCLUDES}"
"${UMKM_SHAREDDATA_INCLUDE_PATHS}"
"${KHRONOS_HEADERS_DIR}"
"${IGDRCL__IGC_INCLUDE_DIR}"
"${THIRD_PARTY_DIR}"
)
target_compile_definitions(cloc PUBLIC ${CLOC_LIB_FLAGS_DEFINITIONS} ${SUPPORTED_GEN_FLAGS_DEFINITONS} DEFAULT_PLATFORM=${DEFAULT_SUPPORTED_PLATFORM})
if(UNIX)
target_link_libraries(cloc dl pthread)
endif(UNIX)
target_link_libraries(cloc elflib)
source_group("source files" FILES ${CLOC_SRCS})
set_target_properties(cloc PROPERTIES FOLDER "offline_compiler")
# NOTE(review): APPEND_STRING receives two values here; verify the resulting
# COMPILE_FLAGS string when both ASAN_FLAGS and TSAN_FLAGS are non-empty.
set_property(TARGET cloc APPEND_STRING PROPERTY COMPILE_FLAGS ${ASAN_FLAGS} ${TSAN_FLAGS})
# Helper target that copies the output file of every IGC target next to the
# cloc executable.
add_custom_target(copy_compiler_files DEPENDS ${IGDRCL__IGC_TARGETS})
set_target_properties(copy_compiler_files PROPERTIES FOLDER "opencl runtime")
foreach(TARGET_tmp ${IGDRCL__IGC_TARGETS})
add_custom_command(
TARGET copy_compiler_files
PRE_BUILD
COMMAND echo copying $<TARGET_FILE:${TARGET_tmp}> to "$<TARGET_FILE_DIR:cloc>"
COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:${TARGET_tmp}> $<TARGET_FILE_DIR:cloc>
)
endforeach(TARGET_tmp)
# Make the source list and the definitions available to the parent directory.
SET(CLOC_SRCS_LIB ${CLOC_SRCS_LIB} PARENT_SCOPE)
SET(CLOC_LIB_FLAGS_DEFINITIONS ${CLOC_LIB_FLAGS_DEFINITIONS} PARENT_SCOPE)

View File

@@ -0,0 +1,42 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "runtime/command_stream/command_stream_receiver.h"
#include "hw_info.h"
#include "runtime/helpers/debug_helpers.h"
namespace OCLRT {

// Factory table defined in another translation unit. This file reads the slot
// at IGFX_MAX_CORE + <render core family>.
extern CommandStreamReceiverCreateFunc commandStreamReceiverFactory[2 * IGFX_MAX_CORE];

// Builds a command stream receiver for the render core family described by
// pHwInfo. Returns nullptr when the selected factory slot is empty.
CommandStreamReceiver *createCommandStream(const HardwareInfo *pHwInfo) {
    DEBUG_BREAK_IF(nullptr == pHwInfo->pPlatform);
    auto funcCreate = commandStreamReceiverFactory[IGFX_MAX_CORE + pHwInfo->pPlatform->eRenderCoreFamily];
    if (!funcCreate) {
        return nullptr;
    }
    return funcCreate(*pHwInfo);
}

// Device enumeration stub: always reports zero devices, clears the output
// pointer and returns true.
bool getDevices(HardwareInfo **hwInfo, size_t &numDevicesReturned) {
    numDevicesReturned = 0;
    *hwInfo = nullptr;
    return true;
}

} // namespace OCLRT

View File

@@ -0,0 +1,63 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "runtime/helpers/hw_info.h"
#include "runtime/os_interface/debug_settings_manager.h"
namespace OCLRT {

// Constructor with no work to do.
template <DebugFunctionalityLevel DebugLevel>
DebugSettingsManager<DebugLevel>::DebugSettingsManager() {
}

// Destructor with no work to do.
template <DebugFunctionalityLevel DebugLevel>
DebugSettingsManager<DebugLevel>::~DebugSettingsManager() {
}

// Writes `length` bytes starting at `str` into `filename`, opened with `mode`.
// Nothing is written when the file cannot be opened.
template <DebugFunctionalityLevel DebugLevel>
void DebugSettingsManager<DebugLevel>::writeToFile(std::string filename, const char *str, size_t length, std::ios_base::openmode mode) {
    std::ofstream outFile(filename, mode);
    if (!outFile.is_open()) {
        return;
    }
    outFile.write(str, length);
    outFile.close();
}

// Global Debug Settings Manager
DebugSettingsManager<globalDebugFunctionalityLevel> DebugManager;

// Global table of hardware prefixes, indexed by product id (all null here)
const char *hardwarePrefix[IGFX_MAX_PRODUCT] = {nullptr};

// Global table of family names, indexed by core family (all null here)
const char *familyName[IGFX_MAX_CORE] = {nullptr};

// Global table of per-family enabled flags (all false here)
bool familyEnabled[IGFX_MAX_CORE] = {false};

} // namespace OCLRT

54
offline_compiler/main.cpp Normal file
View File

@@ -0,0 +1,54 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "config.h"
#include "offline_compiler/offline_compiler.h"
#include "runtime/os_interface/os_library.h"
#include <CL/cl.h>
using namespace OCLRT;
// Entry point of the offline compiler: creates the compiler from the command
// line, runs the build, prints the build log and a status line, and returns
// the resulting error code (CL_SUCCESS on success).
int main(int numArgs, const char *argv[]) {
    int retVal = CL_SUCCESS;
    OfflineCompiler *pCompiler = OfflineCompiler::create(numArgs, argv, retVal);

    // Creation failed (or usage was printed): nothing to build.
    if (retVal != CL_SUCCESS) {
        delete pCompiler;
        return retVal;
    }

    retVal = pCompiler->build();

    const std::string &buildLog = pCompiler->getBuildLog();
    if (!buildLog.empty()) {
        printf("%s\n", buildLog.c_str());
    }

    if (retVal != CL_SUCCESS) {
        printf("Build failed with error code: %d\n", retVal);
    } else if (!pCompiler->isQuiet()) {
        printf("Build succeeded.\n");
    }

    delete pCompiler;
    return retVal;
}

View File

@@ -0,0 +1,781 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "cif/common/cif_main.h"
#include "cif/helpers/error.h"
#include "cif/import/library_api.h"
#include "ocl_igc_interface/code_type.h"
#include "ocl_igc_interface/fcl_ocl_device_ctx.h"
#include "ocl_igc_interface/igc_ocl_device_ctx.h"
#include "ocl_igc_interface/platform_helper.h"
#include "offline_compiler.h"
#include "igfxfmid.h"
#include "runtime/helpers/file_io.h"
#include "runtime/os_interface/debug_settings_manager.h"
#include "runtime/os_interface/os_inc.h"
#include "runtime/os_interface/os_library.h"
#include "runtime/helpers/string.h"
#include "runtime/helpers/debug_helpers.h"
#include "runtime/helpers/hw_info.h"
#include "runtime/helpers/validators.h"
#include "elf/writer.h"
#include <iomanip>
#include <list>
#include <algorithm>
#include <iostream>
#ifdef _WIN32
#include <direct.h>
#define MakeDirectory _mkdir
#define GetCurrentWorkingDirectory _getcwd
#else
#include <sys/stat.h>
#define MakeDirectory(dir) mkdir(dir, 0777)
#define GetCurrentWorkingDirectory getcwd
#endif
namespace OCLRT {
// Declared here, defined outside this file. initialize() passes it the
// CreateCIFMain entry point resolved from a loaded compiler library and wraps
// the returned CIFMain pointer.
// NOTE(review): the name suggests the call is excluded from sanitizer
// instrumentation - confirm against the definition.
CIF::CIFMain *createMainNoSanitize(CIF::CreateCIFMainFunc_t createFunc);
////////////////////////////////////////////////////////////////////////////////
// StringsAreEqual
////////////////////////////////////////////////////////////////////////////////
// Compares two C strings for exact equality.
// Returns false when either pointer is null (two null pointers are therefore
// NOT considered equal); otherwise returns true only when strcmp reports an
// exact match.
bool stringsAreEqual(const char *string1, const char *string2) {
    // strcmp on a null pointer is undefined behaviour, so reject both sides.
    if (string1 == nullptr || string2 == nullptr) {
        return false;
    }
    return strcmp(string1, string2) == 0;
}
////////////////////////////////////////////////////////////////////////////////
// convertToPascalCase
////////////////////////////////////////////////////////////////////////////////
// Converts an underscore-separated identifier to PascalCase.
// Underscores are dropped; the first alphabetic character of the string and
// the first alphabetic character after each underscore are upper-cased. All
// other characters are copied unchanged. A non-alphabetic character does not
// consume the pending capitalization ("ab_1c" becomes "Ab1C").
std::string convertToPascalCase(const std::string &inString) {
    std::string outString;
    outString.reserve(inString.size());
    bool capitalize = true;
    for (const char ch : inString) {
        // The <cctype> functions are only defined for values representable as
        // unsigned char (or EOF); a negative plain char would be undefined.
        const unsigned char uch = static_cast<unsigned char>(ch);
        if (capitalize && isalpha(uch)) {
            outString += static_cast<char>(toupper(uch));
            capitalize = false;
        } else if (ch == '_') {
            capitalize = true;
        } else {
            outString += ch;
        }
    }
    return outString;
}
////////////////////////////////////////////////////////////////////////////////
// ctor
////////////////////////////////////////////////////////////////////////////////
// Default construction only; the compiler is set up later by initialize().
OfflineCompiler::OfflineCompiler() = default;
////////////////////////////////////////////////////////////////////////////////
// dtor
////////////////////////////////////////////////////////////////////////////////
// Releases the three binary buffers owned by this object. They are allocated
// with new[] in storeBinary() / generateElfBinary(); delete[] on a null
// pointer is a no-op.
OfflineCompiler::~OfflineCompiler() {
delete[] llvmBinary;
delete[] genBinary;
delete[] elfBinary;
}
////////////////////////////////////////////////////////////////////////////////
// Create
////////////////////////////////////////////////////////////////////////////////
// Factory: allocates an OfflineCompiler and initializes it from the command
// line. On success returns the new object with retVal == CL_SUCCESS; on any
// failure the object is destroyed, nullptr is returned and retVal carries the
// error code from initialize(). Ownership of the result passes to the caller.
OfflineCompiler *OfflineCompiler::create(uint32_t numArgs, const char **argv, int &retVal) {
    retVal = CL_SUCCESS;
    OfflineCompiler *compiler = new OfflineCompiler();
    if (compiler != nullptr) {
        retVal = compiler->initialize(numArgs, argv);
    }
    if (retVal == CL_SUCCESS) {
        return compiler;
    }
    delete compiler;
    return nullptr;
}
////////////////////////////////////////////////////////////////////////////////
// buildSourceCode
////////////////////////////////////////////////////////////////////////////////
// Translates sourceCode in two stages: the front-end context (FCL) turns
// OpenCL C into an LLVM representation, then the back-end context (IGC) turns
// that into a gen binary. Results are stored in llvmBinary / genBinary and
// both build logs are appended to buildLog.
// Returns CL_SUCCESS, CL_INVALID_PROGRAM (empty source),
// CL_OUT_OF_HOST_MEMORY (a CIF object or translation output is null) or
// CL_BUILD_PROGRAM_FAILURE (either stage reports failure).
int OfflineCompiler::buildSourceCode() {
int retVal = CL_SUCCESS;
// do { ... } while (0) is used only so that `break` can leave on error.
do {
if (strcmp(sourceCode.c_str(), "") == 0) {
retVal = CL_INVALID_PROGRAM;
break;
}
UNRECOVERABLE_IF(fclDeviceCtx == nullptr);
UNRECOVERABLE_IF(igcDeviceCtx == nullptr);
// The intermediate form is LLVM text when -llvm_text was given, bitcode otherwise.
IGC::CodeType::CodeType_t intermediateRepresentation = useLlvmText ? IGC::CodeType::llvmLl : IGC::CodeType::llvmBc;
auto fclSrc = CIF::Builtins::CreateConstBuffer(fclMain.get(), sourceCode.c_str(), sourceCode.size());
auto fclOptions = CIF::Builtins::CreateConstBuffer(fclMain.get(), options.c_str(), options.size());
auto fclInternalOptions = CIF::Builtins::CreateConstBuffer(fclMain.get(), internalOptions.c_str(), internalOptions.size());
auto fclTranslationCtx = fclDeviceCtx->CreateTranslationCtx(IGC::CodeType::oclC, intermediateRepresentation);
auto igcTranslationCtx = igcDeviceCtx->CreateTranslationCtx(intermediateRepresentation, IGC::CodeType::oclGenBin);
if (false == OCLRT::areNotNullptr(fclSrc.get(), fclOptions.get(), fclInternalOptions.get(),
fclTranslationCtx.get(), igcTranslationCtx.get())) {
retVal = CL_OUT_OF_HOST_MEMORY;
break;
}
// Stage 1: OpenCL C -> LLVM.
auto fclOutput = fclTranslationCtx->Translate(fclSrc.get(), fclOptions.get(),
fclInternalOptions.get(), nullptr, 0);
if (fclOutput == nullptr) {
retVal = CL_OUT_OF_HOST_MEMORY;
break;
}
UNRECOVERABLE_IF(fclOutput->GetBuildLog() == nullptr);
UNRECOVERABLE_IF(fclOutput->GetOutput() == nullptr);
if (fclOutput->Successful() == false) {
updateBuildLog(fclOutput->GetBuildLog()->GetMemory<char>(), fclOutput->GetBuildLog()->GetSizeRaw());
retVal = CL_BUILD_PROGRAM_FAILURE;
break;
}
storeBinary(llvmBinary, llvmBinarySize, fclOutput->GetOutput()->GetMemory<char>(), fclOutput->GetOutput()->GetSizeRaw());
updateBuildLog(fclOutput->GetBuildLog()->GetMemory<char>(), fclOutput->GetBuildLog()->GetSizeRaw());
// Stage 2: LLVM -> gen binary. The same option buffers built through fclMain
// are reused as the IGC inputs.
auto igcOutput = igcTranslationCtx->Translate(fclOutput->GetOutput(), fclOptions.get(),
fclInternalOptions.get(),
nullptr, 0);
if (igcOutput == nullptr) {
retVal = CL_OUT_OF_HOST_MEMORY;
break;
}
UNRECOVERABLE_IF(igcOutput->GetBuildLog() == nullptr);
UNRECOVERABLE_IF(igcOutput->GetOutput() == nullptr);
// Note: the IGC output is stored before Successful() is consulted, so
// genBinary is overwritten even when this stage reports failure.
storeBinary(genBinary, genBinarySize, igcOutput->GetOutput()->GetMemory<char>(), igcOutput->GetOutput()->GetSizeRaw());
updateBuildLog(igcOutput->GetBuildLog()->GetMemory<char>(), igcOutput->GetBuildLog()->GetSizeRaw());
retVal = igcOutput->Successful() ? CL_SUCCESS : CL_BUILD_PROGRAM_FAILURE;
} while (0);
return retVal;
}
////////////////////////////////////////////////////////////////////////////////
// build
////////////////////////////////////////////////////////////////////////////////
// Runs the full build: compiles the source and, when that succeeds, packs the
// results into an ELF image and writes all output files. The return value is
// the status of the compilation step only; the result of generateElfBinary()
// is not propagated.
int OfflineCompiler::build() {
    const int buildStatus = buildSourceCode();
    if (buildStatus != CL_SUCCESS) {
        return buildStatus;
    }
    generateElfBinary();
    writeOutAllFiles();
    return buildStatus;
}
////////////////////////////////////////////////////////////////////////////////
// updateBuildLog
////////////////////////////////////////////////////////////////////////////////
// Appends a message of errorStringSize bytes to the accumulated build log,
// separating entries with a newline. A null pointer, a zero size, or a message
// whose first byte is '\0' leaves the log unchanged.
void OfflineCompiler::updateBuildLog(const char *pErrorString, const size_t errorStringSize) {
    std::string message;
    if (errorStringSize && pErrorString) {
        message.assign(pErrorString, pErrorString + errorStringSize);
    }
    if (message.empty() || message.front() == '\0') {
        return;
    }
    if (!buildLog.empty()) {
        buildLog.append("\n");
    }
    buildLog.append(message);
}
////////////////////////////////////////////////////////////////////////////////
// getBuildLog
////////////////////////////////////////////////////////////////////////////////
// Returns a mutable reference to the accumulated build log. The reference
// refers to a member, so it is only valid while this object is alive.
std::string &OfflineCompiler::getBuildLog() {
return buildLog;
}
////////////////////////////////////////////////////////////////////////////////
// getHardwareInfo
////////////////////////////////////////////////////////////////////////////////
// Looks up pDeviceName in the hardwarePrefix table and, for the first product
// whose prefix matches AND that has an entry in hardwareInfoTable, stores that
// entry in hwInfo. Returns CL_SUCCESS when such a product is found and
// CL_INVALID_DEVICE otherwise (hwInfo is then left untouched).
int OfflineCompiler::getHardwareInfo(const char *pDeviceName) {
    for (unsigned int productId = 0; productId < IGFX_MAX_PRODUCT; ++productId) {
        const bool nameMatches = stringsAreEqual(pDeviceName, hardwarePrefix[productId]);
        if (!nameMatches || !hardwareInfoTable[productId]) {
            continue;
        }
        hwInfo = hardwareInfoTable[productId];
        return CL_SUCCESS;
    }
    return CL_INVALID_DEVICE;
}
////////////////////////////////////////////////////////////////////////////////
// getStringWithinDelimiters
////////////////////////////////////////////////////////////////////////////////
// Extracts the text enclosed by the raw-string delimiters R"===( and )===".
// - Both delimiters present: returns the text between them.
// - Opening delimiter missing: returns src unchanged (debug builds break).
//   Previously the start offset wrapped around and produced garbage.
// - Closing delimiter missing: returns everything after the opening delimiter
//   (debug builds break).
// The closing delimiter is searched for after the opening one, so a stray
// closing sequence earlier in the text cannot produce a negative length.
std::string OfflineCompiler::getStringWithinDelimiters(const std::string &src) {
    static const char openDelimiter[] = "R\"===(";
    static const char closeDelimiter[] = ")===\"";

    const size_t open = src.find(openDelimiter);
    DEBUG_BREAK_IF(std::string::npos == open);
    if (open == std::string::npos) {
        return src;
    }

    const size_t start = open + strlen(openDelimiter);
    const size_t stop = src.find(closeDelimiter, start);
    DEBUG_BREAK_IF(std::string::npos == stop);

    // substr clamps an npos count to "rest of the string".
    const size_t size = (stop == std::string::npos) ? std::string::npos : stop - start;
    return src.substr(start, size);
}
////////////////////////////////////////////////////////////////////////////////
// Initialize
////////////////////////////////////////////////////////////////////////////////
int OfflineCompiler::initialize(uint32_t numArgs, const char **argv) {
int retVal = CL_SUCCESS;
const char *pSource = nullptr;
void *pSourceFromFile = nullptr;
size_t sourceFromFileSize = 0;
retVal = parseCommandLine(numArgs, argv);
if (retVal != CL_SUCCESS) {
return retVal;
}
parseDebugSettings();
if (options.empty()) {
// try to read options from file if not provided by commandline
std::string optionsFileName = inputFile;
size_t ext_start = optionsFileName.find(".cl");
if (ext_start != std::string::npos) {
optionsFileName.replace(ext_start, strlen(".cl"), "_options.txt");
void *pOptions = nullptr;
size_t optionsSize = loadDataFromFile(optionsFileName.c_str(), pOptions);
if (optionsSize > 0) {
options = (char *)pOptions;
// Remove comment containing copyright header
size_t commentBegin = options.find_first_of("/*");
size_t commentEnd = options.find_last_of("*/");
if (commentBegin != std::string::npos && commentEnd != std::string::npos) {
options = options.replace(commentBegin, commentEnd - commentBegin + 1, "");
size_t optionsBegin = options.find_first_not_of(" \t\n\r");
if (optionsBegin != std::string::npos) {
options = options.substr(optionsBegin, options.length());
}
}
auto trimPos = options.find_last_not_of(" \n\r");
options = options.substr(0, trimPos + 1);
if (!isQuiet())
printf("Building with options:\n%s\n", options.c_str());
}
deleteDataReadFromFile(pOptions);
}
}
// set up the device inside the program
sourceFromFileSize = loadDataFromFile(inputFile.c_str(), pSourceFromFile);
struct Helper {
static void deleter(void *ptr) { deleteDataReadFromFile(ptr); }
};
auto sourceRaii = std::unique_ptr<void, decltype(&Helper::deleter)>{pSourceFromFile, Helper::deleter};
if (sourceFromFileSize == 0) {
retVal = INVALID_FILE;
return retVal;
}
// we also accept files used as runtime builtins
pSource = strstr((const char *)pSourceFromFile, "R\"===(");
sourceCode = (pSource != nullptr) ? getStringWithinDelimiters((char *)pSourceFromFile) : (char *)pSourceFromFile;
this->fclLib.reset(OsLibrary::load(Os::frontEndDllName));
if (this->fclLib == nullptr) {
return CL_OUT_OF_HOST_MEMORY;
}
auto fclCreateMain = reinterpret_cast<CIF::CreateCIFMainFunc_t>(this->fclLib->getProcAddress(CIF::CreateCIFMainFuncName));
if (fclCreateMain == nullptr) {
return CL_OUT_OF_HOST_MEMORY;
}
this->fclMain = CIF::RAII::UPtr(createMainNoSanitize(fclCreateMain));
if (this->fclMain == nullptr) {
return CL_OUT_OF_HOST_MEMORY;
}
if (false == this->fclMain->IsCompatible<IGC::FclOclDeviceCtx>()) {
// given FCL is not compatible
DEBUG_BREAK_IF(true);
return CL_OUT_OF_HOST_MEMORY;
}
this->fclDeviceCtx = this->fclMain->CreateInterface<IGC::FclOclDeviceCtxTagOCL>();
if (this->fclDeviceCtx == nullptr) {
return CL_OUT_OF_HOST_MEMORY;
}
fclDeviceCtx->SetOclApiVersion(hwInfo->capabilityTable.clVersionSupport * 10);
this->igcLib.reset(OsLibrary::load(Os::igcDllName));
if (this->igcLib == nullptr) {
return CL_OUT_OF_HOST_MEMORY;
}
auto igcCreateMain = reinterpret_cast<CIF::CreateCIFMainFunc_t>(this->igcLib->getProcAddress(CIF::CreateCIFMainFuncName));
if (igcCreateMain == nullptr) {
return CL_OUT_OF_HOST_MEMORY;
}
this->igcMain = CIF::RAII::UPtr(createMainNoSanitize(igcCreateMain));
if (this->igcMain == nullptr) {
return CL_OUT_OF_HOST_MEMORY;
}
if (false == this->igcMain->IsCompatible<IGC::IgcOclDeviceCtx>()) {
// given IGC is not compatible
DEBUG_BREAK_IF(true);
return CL_OUT_OF_HOST_MEMORY;
}
this->igcDeviceCtx = this->igcMain->CreateInterface<IGC::IgcOclDeviceCtxTagOCL>();
if (this->igcDeviceCtx == nullptr) {
return CL_OUT_OF_HOST_MEMORY;
}
this->igcDeviceCtx->SetProfilingTimerResolution(static_cast<float>(hwInfo->capabilityTable.defaultProfilingTimerResolution));
auto igcPlatform = this->igcDeviceCtx->GetPlatformHandle();
auto igcGtSystemInfo = this->igcDeviceCtx->GetGTSystemInfoHandle();
auto igcFeWa = this->igcDeviceCtx->GetIgcFeaturesAndWorkaroundsHandle();
if ((igcPlatform == nullptr) || (igcGtSystemInfo == nullptr) || (igcFeWa == nullptr)) {
return CL_OUT_OF_HOST_MEMORY;
}
IGC::PlatformHelper::PopulateInterfaceWith(*igcPlatform.get(), *hwInfo->pPlatform);
IGC::GtSysInfoHelper::PopulateInterfaceWith(*igcGtSystemInfo.get(), *hwInfo->pSysInfo);
// populate with features
igcFeWa.get()->SetFtrDesktop(hwInfo->pSkuTable->ftrDesktop);
igcFeWa.get()->SetFtrChannelSwizzlingXOREnabled(hwInfo->pSkuTable->ftrChannelSwizzlingXOREnabled);
igcFeWa.get()->SetFtrGtBigDie(hwInfo->pSkuTable->ftrGtBigDie);
igcFeWa.get()->SetFtrGtMediumDie(hwInfo->pSkuTable->ftrGtMediumDie);
igcFeWa.get()->SetFtrGtSmallDie(hwInfo->pSkuTable->ftrGtSmallDie);
igcFeWa.get()->SetFtrGT1(hwInfo->pSkuTable->ftrGT1);
igcFeWa.get()->SetFtrGT1_5(hwInfo->pSkuTable->ftrGT1_5);
igcFeWa.get()->SetFtrGT2(hwInfo->pSkuTable->ftrGT2);
igcFeWa.get()->SetFtrGT3(hwInfo->pSkuTable->ftrGT3);
igcFeWa.get()->SetFtrGT4(hwInfo->pSkuTable->ftrGT4);
igcFeWa.get()->SetFtrIVBM0M1Platform(hwInfo->pSkuTable->ftrIVBM0M1Platform);
igcFeWa.get()->SetFtrGTL(hwInfo->pSkuTable->ftrGT1);
igcFeWa.get()->SetFtrGTM(hwInfo->pSkuTable->ftrGT2);
igcFeWa.get()->SetFtrGTH(hwInfo->pSkuTable->ftrGT3);
igcFeWa.get()->SetFtrSGTPVSKUStrapPresent(hwInfo->pSkuTable->ftrSGTPVSKUStrapPresent);
igcFeWa.get()->SetFtrGTA(hwInfo->pSkuTable->ftrGTA);
igcFeWa.get()->SetFtrGTC(hwInfo->pSkuTable->ftrGTC);
igcFeWa.get()->SetFtrGTX(hwInfo->pSkuTable->ftrGTX);
igcFeWa.get()->SetFtr5Slice(hwInfo->pSkuTable->ftr5Slice);
igcFeWa.get()->SetFtrGpGpuMidThreadLevelPreempt(hwInfo->pSkuTable->ftrGpGpuMidThreadLevelPreempt);
igcFeWa.get()->SetFtrIoMmuPageFaulting(hwInfo->pSkuTable->ftrIoMmuPageFaulting);
igcFeWa.get()->SetFtrWddm2Svm(hwInfo->pSkuTable->ftrWddm2Svm);
igcFeWa.get()->SetFtrPooledEuEnabled(hwInfo->pSkuTable->ftrPooledEuEnabled);
igcFeWa.get()->SetFtrResourceStreamer(hwInfo->pSkuTable->ftrResourceStreamer);
return retVal;
}
////////////////////////////////////////////////////////////////////////////////
// ParseCommandLine
////////////////////////////////////////////////////////////////////////////////
// Parses the command line into the compiler's settings and validates them.
// Options that take a value (-file, -device, -options, -out_dir) consume the
// following argument; when that argument is absent they fall through to the
// "Invalid option" branch. Returns CL_SUCCESS, PRINT_USAGE,
// INVALID_COMMAND_LINE, INVALID_FILE, or the error from getHardwareInfo().
int OfflineCompiler::parseCommandLine(uint32_t numArgs, const char **argv) {
    int status = CL_SUCCESS;
    bool want32Bit = false;
    bool want64Bit = false;

    if (numArgs < 2) {
        printUsage();
        status = PRINT_USAGE;
    }

    for (uint32_t argIndex = 1; argIndex < numArgs; argIndex++) {
        const char *currentArg = argv[argIndex];
        const bool hasValue = (argIndex + 1 < numArgs);

        if (stringsAreEqual(currentArg, "-file") && hasValue) {
            inputFile = argv[++argIndex];
        } else if (stringsAreEqual(currentArg, "-32")) {
            want32Bit = true;
            internalOptions.append(" -m32 ");
        } else if (stringsAreEqual(currentArg, "-64")) {
            want64Bit = true;
            internalOptions.append(" -m64 ");
        } else if (stringsAreEqual(currentArg, "-cl-intel-greater-than-4GB-buffer-required")) {
            internalOptions.append(" -cl-intel-greater-than-4GB-buffer-required ");
        } else if (stringsAreEqual(currentArg, "-device") && hasValue) {
            deviceName = argv[++argIndex];
        } else if (stringsAreEqual(currentArg, "-llvm_text")) {
            useLlvmText = true;
        } else if (stringsAreEqual(currentArg, "-cpp_file")) {
            useCppFile = true;
        } else if (stringsAreEqual(currentArg, "-options") && hasValue) {
            options = argv[++argIndex];
        } else if (stringsAreEqual(currentArg, "-options_name")) {
            useOptionsSuffix = true;
        } else if (stringsAreEqual(currentArg, "-out_dir") && hasValue) {
            outputDirectory = argv[++argIndex];
        } else if (stringsAreEqual(currentArg, "-q")) {
            quiet = true;
        } else if (stringsAreEqual(currentArg, "-?")) {
            // Usage was requested; keep scanning the remaining arguments.
            printUsage();
            status = PRINT_USAGE;
        } else {
            printf("Invalid option (arg %d): %s\n", argIndex, argv[argIndex]);
            status = INVALID_COMMAND_LINE;
            break;
        }
    }

    // Validation only runs when parsing itself raised no error.
    if (status != CL_SUCCESS) {
        return status;
    }
    if (want32Bit && want64Bit) {
        printf("Error: Cannot compile for 32-bit and 64-bit, please choose one.\n");
        return INVALID_COMMAND_LINE;
    }
    if (inputFile.empty()) {
        printf("Error: Input file name missing.\n");
        return INVALID_COMMAND_LINE;
    }
    if (deviceName.empty()) {
        printf("Error: Device name missing.\n");
        return INVALID_COMMAND_LINE;
    }
    if (!fileExists(inputFile)) {
        printf("Error: Input file %s missing.\n", inputFile.c_str());
        return INVALID_FILE;
    }

    status = getHardwareInfo(deviceName.c_str());
    if (status != CL_SUCCESS) {
        printf("Error: Cannot get HW Info for device %s.\n", deviceName.c_str());
    }
    return status;
}
////////////////////////////////////////////////////////////////////////////////
// ParseDebugSettings
////////////////////////////////////////////////////////////////////////////////
// Translates debug-manager flags into internal compiler options: when
// EnableStatelessToStatefulBufferOffsetOpt is set, the
// "-cl-intel-has-buffer-offset-arg " option is appended to internalOptions.
void OfflineCompiler::parseDebugSettings() {
if (DebugManager.flags.EnableStatelessToStatefulBufferOffsetOpt.get()) {
internalOptions += "-cl-intel-has-buffer-offset-arg ";
}
}
////////////////////////////////////////////////////////////////////////////////
// ParseBinAsCharArray
////////////////////////////////////////////////////////////////////////////////
// Renders a binary blob as C++ source text: a size_t holding the byte count,
// a uint32_t array holding the data (eight hex words per line), and a
// RegisterEmbeddedResource object that registers the array as a built-in
// binary. The PascalCase form of fileName and deviceName are used to build
// the emitted identifiers. Returns the generated source.
std::string OfflineCompiler::parseBinAsCharArray(uint8_t *binary, size_t size, std::string &deviceName, std::string &fileName) {
std::string builtinName = convertToPascalCase(fileName);
std::ostringstream out;
// Convert binary to cpp
out << "#include <cstddef>\n";
out << "#include <cstdint>\n\n";
out << "size_t " << builtinName << "BinarySize_" << deviceName << " = " << size << ";\n";
// The array length is the byte count rounded up to whole 32-bit words.
out << "uint32_t " << builtinName << "Binary_" << deviceName << "[" << (size + 3) / 4 << "] = {"
<< std::endl
<< " ";
// The input is read as 32-bit words in host byte order.
// NOTE(review): assumes `binary` is suitably aligned for uint32_t access - confirm.
uint32_t *binaryUint = (uint32_t *)binary;
for (size_t i = 0; i < (size + 3) / 4; i++) {
if (i != 0) {
out << ", ";
if (i % 8 == 0) {
out << std::endl
<< " ";
}
}
if (i < size / 4) {
out << "0x" << std::hex << std::setw(8) << std::setfill('0') << binaryUint[i];
} else {
// Trailing 1-3 bytes that do not fill a whole word.
// NOTE(review): byte j is stored at offset 3 - j of the word, i.e. in the
// reverse order of the full words above - confirm this is intended.
uint32_t lastBytes = size & 0x3;
uint32_t lastUint = 0;
uint8_t *pLastUint = (uint8_t *)&lastUint;
for (uint32_t j = 0; j < lastBytes; j++) {
pLastUint[sizeof(uint32_t) - 1 - j] = binary[i * 4 + j];
}
out << "0x" << std::hex << std::setw(8) << std::setfill('0') << lastUint;
}
}
out << "};" << std::endl;
// Emit the registration object that exposes the array as an embedded resource.
out << std::endl
<< "#include \"runtime/built_ins/registry/built_ins_registry.h\"\n"
<< std::endl;
out << "namespace OCLRT {" << std::endl;
out << "static RegisterEmbeddedResource register" << builtinName << "Bin(" << std::endl;
out << " createBuiltinResourceName(" << std::endl;
out << " EBuiltInOps::" << builtinName << "," << std::endl;
out << " BuiltinCode::getExtension(BuiltinCode::ECodeType::Binary), \"" << deviceName << "\", 0)" << std::endl;
out << " .c_str()," << std::endl;
out << " (const char *)" << builtinName << "Binary"
<< "_" << deviceName << "," << std::endl;
out << " " << builtinName << "BinarySize_" << deviceName << ");" << std::endl;
out << "}" << std::endl;
return out.str();
}
////////////////////////////////////////////////////////////////////////////////
// GetFileNameTrunk
////////////////////////////////////////////////////////////////////////////////
// Returns the file name without directory and without extension: the text
// after the last '/' or '\\' and before the last '.' in filePath. When the
// path has no '.', everything after the last separator is returned.
std::string OfflineCompiler::getFileNameTrunk(std::string &filePath) {
    const size_t lastSeparator = filePath.find_last_of("\\/");
    const size_t nameBegin = (lastSeparator == std::string::npos) ? 0 : lastSeparator + 1;

    size_t nameEnd = filePath.find_last_of(".");
    if (nameEnd == std::string::npos) {
        nameEnd = filePath.size();
    }

    // If the last '.' lies before the separator the unsigned difference wraps
    // to a huge count, which substr clamps to "rest of the string".
    return filePath.substr(nameBegin, nameEnd - nameBegin);
}
//
// Builds a comma-separated list of every non-null entry in the hardwarePrefix
// table, in table order. Returns an empty string when no prefix is set.
std::string getDevicesTypes() {
    // std:: is spelled out so the function does not depend on a
    // using-directive leaking in from some header.
    std::ostringstream os;
    bool first = true;
    for (unsigned int productId = 0; productId < IGFX_MAX_PRODUCT; ++productId) {
        const char *prefix = hardwarePrefix[productId];
        if (prefix == nullptr) {
            continue;
        }
        if (!first) {
            os << ",";
        }
        os << prefix;
        first = false;
    }
    return os.str();
}
////////////////////////////////////////////////////////////////////////////////
// PrintUsage
////////////////////////////////////////////////////////////////////////////////
// Prints the command-line help to stdout, including the list of device types
// obtained from getDevicesTypes().
void OfflineCompiler::printUsage() {
    printf("Compiles CL files into llvm (.bc or .ll), gen isa (.gen), and binary files (.bin)\n\n");
    // The option is spelled "-out_dir", matching what parseCommandLine accepts.
    printf("cloc -file <filename> -device <device_type> [-out_dir <output_dir>]\n\n");
    printf(" -file <filename> Indicates the CL kernel file to be compiled.\n");
    printf(" -device <device_type> Indicates which device for which we will compile.\n");
    printf(" <device_type> can be: %s\n", getDevicesTypes().c_str());
    printf(" -out_dir <output_dir> Indicates the directory into which the compiled files\n");
    printf(" will be placed.\n");
    printf(" -llvm_text Readable LLVM text will be output in a .ll file instead of\n");
    printf(" through the default llvm binary (.bc) file.\n");
    // Trailing newline added so the next option starts on its own line.
    printf(" -cpp_file Cpp file with scheduler program binary will be generated.\n");
    printf(" -options <options> Compiler options.\n");
    printf(" -options_name Add suffix with compile options to filename\n");
    printf(" -32 Force compile to 32-bit binary.\n");
    printf(" -64 Force compile to 64-bit binary.\n");
    printf(" -q Be more quiet. print only warnings and errors.\n");
    printf(" -? Print this usage message.\n");
}
////////////////////////////////////////////////////////////////////////////////
// StoreBinary
////////////////////////////////////////////////////////////////////////////////
// Replaces the buffer referenced by pDst with a freshly allocated copy of
// srcSize bytes from pSrc and records the size in dstSize.
// Any buffer previously held in pDst is released with delete[]; the caller
// owns the new buffer. Debug builds break when pSrc is null or srcSize is 0.
void OfflineCompiler::storeBinary(
    char *&pDst,
    size_t &dstSize,
    const void *pSrc,
    const size_t srcSize) {
    dstSize = 0;
    DEBUG_BREAK_IF(!(pSrc && srcSize > 0));

    delete[] pDst;
    // Clear the pointer first: if the allocation below throws, the caller must
    // not be left holding (and later freeing) the already released buffer.
    pDst = nullptr;
    pDst = new char[srcSize];

    // Keep the full size_t value; narrowing it to 32 bits would make the
    // recorded size disagree with the allocation for very large inputs.
    dstSize = srcSize;
    memcpy_s(pDst, dstSize, pSrc, srcSize);
}
////////////////////////////////////////////////////////////////////////////////
// GenerateElfBinary
////////////////////////////////////////////////////////////////////////////////
bool OfflineCompiler::generateElfBinary() {
bool retVal = true;
CLElfLib::CElfWriter *pElfWriter = nullptr;
if (!genBinary || !genBinarySize) {
retVal = false;
}
if (retVal) {
pElfWriter = CLElfLib::CElfWriter::create(CLElfLib::EH_TYPE_OPENCL_EXECUTABLE, CLElfLib::EH_MACHINE_NONE, 0);
if (pElfWriter) {
CLElfLib::SSectionNode sectionNode;
// Always add the options string
sectionNode.Name = "BuildOptions";
sectionNode.Type = CLElfLib::SH_TYPE_OPENCL_OPTIONS;
sectionNode.pData = (char *)options.c_str();
sectionNode.DataSize = (uint32_t)(strlen(options.c_str()) + 1);
retVal = pElfWriter->addSection(&sectionNode);
if (retVal) {
sectionNode.Name = "Intel(R) OpenCL LLVM Object";
sectionNode.Type = CLElfLib::SH_TYPE_OPENCL_LLVM_BINARY;
sectionNode.pData = llvmBinary;
sectionNode.DataSize = (uint32_t)llvmBinarySize;
retVal = pElfWriter->addSection(&sectionNode);
}
// Add the device binary if it exists
if (retVal && genBinary) {
sectionNode.Name = "Intel(R) OpenCL Device Binary";
sectionNode.Type = CLElfLib::SH_TYPE_OPENCL_DEV_BINARY;
sectionNode.pData = genBinary;
sectionNode.DataSize = (uint32_t)genBinarySize;
retVal = pElfWriter->addSection(&sectionNode);
}
if (retVal) {
// get the size
retVal = pElfWriter->resolveBinary(elfBinary, elfBinarySize);
}
if (retVal) {
// allocate the binary
elfBinary = new char[elfBinarySize];
retVal = pElfWriter->resolveBinary(elfBinary, elfBinarySize);
}
} else {
retVal = false;
}
CLElfLib::CElfWriter::destroy(pElfWriter);
}
return retVal;
}
////////////////////////////////////////////////////////////////////////////////
// WriteOutAllFiles
////////////////////////////////////////////////////////////////////////////////
void OfflineCompiler::writeOutAllFiles() {
std::string fileTrunk = getFileNameTrunk(inputFile);
std::string fileBase = fileTrunk + "_" + deviceName;
if (outputDirectory != "") {
std::list<std::string> dirList;
std::string tmp = outputDirectory;
size_t pos = outputDirectory.size() + 1;
do {
dirList.push_back(tmp);
pos = tmp.find_last_of("/\\", pos);
tmp = tmp.substr(0, pos);
} while (pos != std::string::npos);
while (!dirList.empty()) {
MakeDirectory(dirList.back().c_str());
dirList.pop_back();
}
}
if (llvmBinary) {
std::string llvmOutputFile = (outputDirectory == "") ? "" : outputDirectory + "/";
(useLlvmText == true) ? llvmOutputFile.append(fileBase + ".ll") : llvmOutputFile.append(fileBase + ".bc");
if (useOptionsSuffix) {
std::string opts(options.c_str());
std::replace(opts.begin(), opts.end(), ' ', '_');
llvmOutputFile.append(opts);
}
writeDataToFile(
llvmOutputFile.c_str(),
llvmBinary,
llvmBinarySize);
}
if (genBinary) {
std::string genOutputFile = (outputDirectory == "") ? "" : outputDirectory + "/";
genOutputFile.append(fileBase + ".gen");
if (useOptionsSuffix) {
std::string opts(options.c_str());
std::replace(opts.begin(), opts.end(), ' ', '_');
genOutputFile.append(opts);
}
writeDataToFile(
genOutputFile.c_str(),
genBinary,
genBinarySize);
if (useCppFile) {
std::string cppOutputFile = (outputDirectory == "") ? "" : outputDirectory + "/";
cppOutputFile.append(fileBase + ".cpp");
std::string cpp = parseBinAsCharArray((uint8_t *)genBinary, genBinarySize, deviceName, fileTrunk);
writeDataToFile(cppOutputFile.c_str(), cpp.c_str(), cpp.size());
}
}
if (elfBinary) {
std::string elfOutputFile = (outputDirectory == "") ? "" : outputDirectory + "/";
elfOutputFile.append(fileBase + ".bin");
if (useOptionsSuffix) {
std::string opts(options.c_str());
std::replace(opts.begin(), opts.end(), ' ', '_');
elfOutputFile.append(opts);
}
writeDataToFile(
elfOutputFile.c_str(),
elfBinary,
elfBinarySize);
}
}
} // namespace OCLRT

View File

@@ -0,0 +1,106 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include "cif/common/cif_main.h"
#include "ocl_igc_interface/igc_ocl_device_ctx.h"
#include "ocl_igc_interface/fcl_ocl_device_ctx.h"
#include <cstdint>
#include <string>
#include <memory>
namespace OCLRT {
// Forward declarations: this header only uses these types through a pointer
// (HardwareInfo) or a std::unique_ptr (OsLibrary).
struct HardwareInfo;
class OsLibrary;
// Declared here, defined elsewhere.
// NOTE(review): presumably returns inString converted to PascalCase - confirm
// against the implementation.
std::string convertToPascalCase(const std::string &inString);
// Error codes of the offline compiler. All values are negative and
// consecutive (-5150 .. -5152).
// NOTE(review): which functions return which code is not visible in this
// header - the per-value notes below are inferred from the names only.
enum ErrorCode {
INVALID_COMMAND_LINE = -5150, // presumably: command line could not be parsed
INVALID_FILE = -5151, // presumably: a file could not be read or used
PRINT_USAGE = -5152, // presumably: caller should print the usage text
};
// Holds the configuration, compiler-interface handles and output buffers of
// one offline compilation, and writes the produced binaries to disk.
// Instances cannot be copied, and the default constructor is protected, so
// outside code obtains an instance through create().
class OfflineCompiler {
public:
// Factory taking the raw command line. retVal is an output parameter.
// NOTE(review): presumably retVal receives 0 or an ErrorCode and the result
// is nullptr on failure - confirm against the implementation.
static OfflineCompiler *create(uint32_t numArgs, const char **argv, int &retVal);
// NOTE(review): presumably returns 0 on success - confirm.
int build();
// Returns a mutable reference; NOTE(review): presumably to the buildLog member - confirm.
std::string &getBuildLog();
void printUsage();
// Copying is disabled.
OfflineCompiler &operator=(const OfflineCompiler &) = delete;
OfflineCompiler(const OfflineCompiler &) = delete;
~OfflineCompiler();
// True when the `quiet` flag is set.
bool isQuiet() const {
return quiet;
}
// Returns text generated from `binary`; writeOutAllFiles() stores that text
// in the ".cpp" output file. Both string parameters are taken by non-const
// reference, so callers must pass lvalues.
std::string parseBinAsCharArray(uint8_t *binary, size_t size, std::string &deviceName, std::string &fileName);
protected:
OfflineCompiler();
int getHardwareInfo(const char *pDeviceName);
// Takes a non-const reference; writeOutAllFiles() passes inputFile.
std::string getFileNameTrunk(std::string &filePath);
std::string getStringWithinDelimiters(const std::string &src);
int initialize(uint32_t numArgs, const char **argv);
int parseCommandLine(uint32_t numArgs, const char **argv);
void parseDebugSettings();
// pDst and dstSize are in/out references.
// NOTE(review): presumably copies srcSize bytes from pSrc into a newly
// allocated pDst - confirm ownership rules in the implementation.
void storeBinary(char *&pDst, size_t &dstSize, const void *pSrc, const size_t srcSize);
int buildSourceCode();
void updateBuildLog(const char *pErrorString, const size_t errorStringSize);
bool generateElfBinary();
// Writes llvmBinary / genBinary / elfBinary to files; buffers that are null
// are skipped.
void writeOutAllFiles();
const HardwareInfo *hwInfo = nullptr;
// deviceName and the trunk of inputFile form the base name of every output file.
std::string deviceName;
std::string inputFile;
// Not used by writeOutAllFiles().
std::string outputFile;
// When non-empty, outputs are written below this directory, which
// writeOutAllFiles() creates together with its parent directories.
std::string outputDirectory;
// Appended to output file names (spaces replaced by '_') when
// useOptionsSuffix is set.
std::string options;
std::string internalOptions;
std::string sourceCode;
std::string buildLog;
// Selects the ".ll" extension instead of ".bc" for the LLVM output.
bool useLlvmText = false;
// Additionally emit a ".cpp" file generated from genBinary.
bool useCppFile = false;
bool useOptionsSuffix = false;
bool quiet = false;
// Raw output buffers with their sizes in bytes; a null pointer means no
// such output is written.
char *elfBinary = nullptr;
size_t elfBinarySize = 0;
char *genBinary = nullptr;
size_t genBinarySize = 0;
char *llvmBinary = nullptr;
size_t llvmBinarySize = 0;
// Members are destroyed in reverse declaration order: each device context
// goes before its CIFMain, which goes before its OsLibrary. Keep this order.
std::unique_ptr<OsLibrary> igcLib = nullptr;
CIF::RAII::UPtr_t<CIF::CIFMain> igcMain = nullptr;
CIF::RAII::UPtr_t<IGC::IgcOclDeviceCtxTagOCL> igcDeviceCtx = nullptr;
std::unique_ptr<OsLibrary> fclLib = nullptr;
CIF::RAII::UPtr_t<CIF::CIFMain> fclMain = nullptr;
CIF::RAII::UPtr_t<IGC::FclOclDeviceCtxTagOCL> fclDeviceCtx = nullptr;
};
} // namespace OCLRT

View File

@@ -0,0 +1,43 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "hw_cmds.h"
#include "runtime/helpers/options.h"
#include "runtime/helpers/array_count.h"
#include <cstddef>
namespace OCLRT {

// Name of the folder used for AUB files.
const char *folderAUB = "aub_out";

// Initial value for the HW tag: every bit set (0xFFFFFFFF).
uint32_t initialHardwareTag = static_cast<uint32_t>(-1);

// Devices exposed by the platform: only the hardware info of the default platform.
static const HardwareInfo *DefaultPlatformDevices[] = {&DEFAULT_PLATFORM::hwInfo};

// Number of devices in the platform, and the table they are read from.
size_t numPlatformDevices = ARRAY_COUNT(DefaultPlatformDevices);
const HardwareInfo **platformDevices = DefaultPlatformDevices;

} // namespace OCLRT

125
package.cmake Normal file
View File

@@ -0,0 +1,125 @@
# Copyright (c) 2017, Intel Corporation
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
if(UNIX)
# Packaging directories inside the build tree.
set(package_input_dir ${IGDRCL_BINARY_DIR}/packageinput)
set(package_output_dir ${IGDRCL_BINARY_DIR}/packages)
# Version defaults (1.0-0), used when the caller did not provide a value.
# if(NOT <var>) is also true for an explicit 0; for MINOR and BUILD the default
# is 0 anyway, but an explicit NEO_VERSION_MAJOR of 0 is replaced by 1.
if(NOT NEO_VERSION_MAJOR)
set(NEO_VERSION_MAJOR 1)
endif()
if(NOT NEO_VERSION_MINOR)
set(NEO_VERSION_MINOR 0)
endif()
if(NOT NEO_VERSION_BUILD)
set(NEO_VERSION_BUILD 0)
endif()
# Fixed install location. This overwrites any CMAKE_INSTALL_PREFIX chosen by
# the user.
set(NEO_BINARY_INSTALL_DIR /opt/intel/opencl)
set(CMAKE_INSTALL_PREFIX ${NEO_BINARY_INSTALL_DIR})
# Runtime and compiler shared libraries, taken from the build tree's bin directory.
install(FILES
${IGDRCL_BINARY_DIR}/bin/libigdrcl.so
${IGDRCL_BINARY_DIR}/bin/libigdccl.so
${IGDRCL_BINARY_DIR}/bin/libigdfcl.so
${IGDRCL_BINARY_DIR}/bin/libiga64.so
${IGDRCL_BINARY_DIR}/bin/libcommon_clang.so
DESTINATION ${NEO_BINARY_INSTALL_DIR}
COMPONENT igdrcl
)
set(OCL_ICD_RUNTIME_NAME libigdrcl.so)
# Generate the helper files at install time:
#   libintelopencl.conf - ld.so.conf.d entry containing the install directory
#   intel.icd           - ICD file containing the full path of the runtime library
#   postinst            - appends the install directory to /etc/ld.so.conf, runs ldconfig
#   postrm              - deletes matching lines from /etc/ld.so.conf, runs ldconfig
# The file contents hard-code /opt/intel/opencl instead of using
# NEO_BINARY_INSTALL_DIR; keep both in sync if the install directory changes.
install(
CODE "file( WRITE ${IGDRCL_BINARY_DIR}/libintelopencl.conf \"/opt/intel/opencl\n\" )"
CODE "file( WRITE ${IGDRCL_BINARY_DIR}/intel.icd \"/opt/intel/opencl/${OCL_ICD_RUNTIME_NAME}\n\" )"
CODE "file( WRITE ${IGDRCL_BINARY_DIR}/postinst \"echo /opt/intel/opencl >> /etc/ld.so.conf\n\" )"
CODE "file( APPEND ${IGDRCL_BINARY_DIR}/postinst \"/sbin/ldconfig\n\" )"
CODE "file( WRITE ${IGDRCL_BINARY_DIR}/postrm \"sed -i '/\\\\/opt\\\\/intel\\\\/opencl.*$/d' /etc/ld.so.conf\n\" )"
CODE "file( APPEND ${IGDRCL_BINARY_DIR}/postrm \"/sbin/ldconfig\n\" )"
COMPONENT igdrcl
)
# These rules are declared after the CODE rule above, which creates the two
# files they install.
install(FILES ${IGDRCL_BINARY_DIR}/libintelopencl.conf DESTINATION /etc/ld.so.conf.d COMPONENT igdrcl)
install(FILES ${IGDRCL_BINARY_DIR}/intel.icd DESTINATION /etc/OpenCL/vendors/ COMPONENT igdrcl)
# Add Khronos ICD loader - if available.
# ICD_LIB_DIR may be supplied by the caller. When it is not, look for a loader
# built in a sibling checkout (../OpenCL-ICD-Loader/build/lib); if none is
# found, only warn - the package is then generated without libOpenCL.so.
if(NOT ICD_LIB_DIR)
  # Try to find ICD in upper level directory
  if(EXISTS ${IGDRCL_SOURCE_DIR}/../OpenCL-ICD-Loader/build/lib/libOpenCL.so)
    set(ICD_LIB_DIR ${IGDRCL_SOURCE_DIR}/../OpenCL-ICD-Loader/build/lib)
    message(STATUS "Taking ICD library from ${ICD_LIB_DIR}")
  else()
    # Parent directory of the source tree, used only in the hint below.
    get_filename_component(IGDRCL_PARENT_DIR ${IGDRCL_SOURCE_DIR} DIRECTORY)
    message(WARNING "Missing Khronos ICD library. Generated package (.rpm, .deb, .tar.xz) may be incomplete.\nPlease download Khronos ICD loader to ${IGDRCL_PARENT_DIR} and build it, or point directory containing library libOpenCL.so using ICD_LIB_DIR.")
  endif()
endif()
# Package the Khronos ICD loader when a directory containing it is known.
if(ICD_LIB_DIR)
get_filename_component(ICD_LIB_DIR ${ICD_LIB_DIR} ABSOLUTE)
set(ICD_LIB_NAME "libOpenCL.so*")
# Install-time steps:
#  1. if ICD_LIB_DIR has no libOpenCL.so (neither a file nor a symlink), create
#     one there as a symlink to ${NEO_BINARY_INSTALL_DIR}/libOpenCL.so.1;
#  2. glob every libOpenCL.so* in ICD_LIB_DIR and abort the install when
#     nothing matches;
#  3. copy the matches into the install directory.
# \${_NeoIcdLibFiles} is escaped so that it is expanded when the install
# script runs rather than at configure time.
install(
CODE "if(NOT((EXISTS ${ICD_LIB_DIR}/libOpenCL.so) OR (IS_SYMLINK ${ICD_LIB_DIR}/libOpenCL.so)))\n execute_process( COMMAND ln -s ${NEO_BINARY_INSTALL_DIR}/libOpenCL.so.1 ${ICD_LIB_DIR}/libOpenCL.so)\n endif()\n"
CODE "file( GLOB _NeoIcdLibFiles \"${ICD_LIB_DIR}/${ICD_LIB_NAME}\" )"
CODE "if(NOT _NeoIcdLibFiles)\n message(FATAL_ERROR \"${ICD_LIB_NAME} cannot be found in ${ICD_LIB_DIR}\")\nendif()"
CODE "file( INSTALL \${_NeoIcdLibFiles} DESTINATION \"${NEO_BINARY_INSTALL_DIR}\" )"
COMPONENT igdrcl
)
endif()
# Select the CPack generator. An explicit NEO_CPACK_GENERATOR always wins;
# when no generator list was defined, build the native package format of the
# current distribution (DEB on Debian-like, RPM on Red Hat-like systems) and
# fall back to a .tar.xz archive everywhere else.
if(NOT NEO_CPACK_GENERATOR)
  if(EXISTS "/etc/debian_version")
    set(CPACK_GENERATOR "DEB")
  elseif(EXISTS "/etc/redhat-release")
    set(CPACK_GENERATOR "RPM")
  else()
    set(CPACK_GENERATOR "TXZ")
  endif()
else()
  set(CPACK_GENERATOR "${NEO_CPACK_GENERATOR}")
endif()
# CPack configuration for the single "igdrcl" component.
set(CPACK_SET_DESTDIR TRUE)
set(CPACK_PACKAGE_RELOCATABLE FALSE)
set(CPACK_PACKAGE_NAME "intel-opencl")
set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Intel OpenCL GPU driver")
set(CPACK_PACKAGE_VENDOR "Intel")
# Package version is MAJOR.MINOR.BUILD taken from the NEO_VERSION_* variables.
set(CPACK_PACKAGE_VERSION_MAJOR ${NEO_VERSION_MAJOR})
set(CPACK_PACKAGE_VERSION_MINOR ${NEO_VERSION_MINOR})
set(CPACK_PACKAGE_VERSION_PATCH ${NEO_VERSION_BUILD})
set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE "amd64")
# NOTE(review): the DEB maintainer scripts are given as bare file names while
# the RPM scripts below use absolute paths under IGDRCL_BINARY_DIR - confirm
# that CPack resolves "postinst"/"postrm" to the generated files.
set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "postinst;postrm")
set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64")
set(CPACK_RPM_COMPRESSION_TYPE "xz")
set(CPACK_RPM_PACKAGE_DESCRIPTION "Intel OpenCL GPU driver")
set(CPACK_RPM_PACKAGE_GROUP "System Environment/Libraries")
set(CPACK_RPM_PACKAGE_LICENSE "MIT")
set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${IGDRCL_BINARY_DIR}/postinst")
set(CPACK_RPM_POST_UNINSTALL_SCRIPT_FILE "${IGDRCL_BINARY_DIR}/postrm")
set(CPACK_PACKAGE_INSTALL_DIRECTORY "/opt/intel/opencl")
set(CPACK_PACKAGE_CONTACT "Intel Corporation")
# The file name uses the RPM architecture string (x86_64) for every generator,
# including DEB and TXZ.
set(CPACK_PACKAGE_FILE_NAME "intel-opencl-${NEO_VERSION_MAJOR}.${NEO_VERSION_MINOR}-${NEO_VERSION_BUILD}.${CPACK_RPM_PACKAGE_ARCHITECTURE}")
# Package per component, and only the igdrcl component.
set(CPACK_DEB_COMPONENT_INSTALL ON)
set(CPACK_RPM_COMPONENT_INSTALL ON)
set(CPACK_ARCHIVE_COMPONENT_INSTALL ON)
set(CPACK_COMPONENTS_ALL igdrcl)
include(CPack)
endif(UNIX)

255
platforms.cmake Normal file
View File

@@ -0,0 +1,255 @@
# Copyright (c) 2017, Intel Corporation
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
# We require cmake 3.2.0 or later
cmake_minimum_required(VERSION 3.2.0 FATAL_ERROR)
# Highest gen number supported by the per-gen lists below; every list has
# MAX_GEN + 1 slots, indexed directly by gen number.
set(MAX_GEN 64)
# Creates the per-gen list ALL_<ELEMENT_TYPE>_<LIST_TYPE> by appending
# MAX_GEN + 1 placeholder entries (a single space each), one slot for every
# gen number 0..MAX_GEN. The entries are appended, so the macro is meant to be
# called once per list.
macro(INIT_LIST LIST_TYPE ELEMENT_TYPE)
  foreach(IT RANGE ${MAX_GEN})
    list(APPEND ALL_${ELEMENT_TYPE}_${LIST_TYPE} " ")
  endforeach()
endmacro()
# Reads slot GEN_NUMBER of ALL_<ELEMENT_TYPE>_<LIST_TYPE> and stores it in
# OUT_LIST as a CMake list by turning the "_" separators into ";".
# An empty slot yields the placeholder " " (a single space), not an empty list.
# Side effect: this is a macro, so the temporary GEN_X_<LIST_TYPE> is left set
# in the caller's scope.
macro(GET_LIST_FOR_GEN LIST_TYPE ELEMENT_TYPE GEN_NUMBER OUT_LIST)
list(GET ALL_${ELEMENT_TYPE}_${LIST_TYPE} ${GEN_NUMBER} GEN_X_${LIST_TYPE})
string(REPLACE "_" ";" ${OUT_LIST} ${GEN_X_${LIST_TYPE}})
endmacro(GET_LIST_FOR_GEN)
# Adds ITEM to slot GEN_NUMBER of ALL_<ELEMENT_TYPE>_<LIST_TYPE>.
# A slot stores its items as one string joined with "_", because the outer
# list already uses ";". ITEM must therefore not contain "_" itself:
# GET_LIST_FOR_GEN would split it into several entries.
macro(ADD_ITEM_FOR_GEN LIST_TYPE ELEMENT_TYPE GEN_NUMBER ITEM)
list(GET ALL_${ELEMENT_TYPE}_${LIST_TYPE} ${GEN_NUMBER} GEN_X_LIST)
# Strip the " " placeholder so that an unused slot compares equal to "".
string(REPLACE " " "" GEN_X_LIST ${GEN_X_LIST})
if("${GEN_X_LIST}" STREQUAL "")
set(GEN_X_LIST "${ITEM}")
else("${GEN_X_LIST}" STREQUAL "")
set(GEN_X_LIST "${GEN_X_LIST}_${ITEM}")
endif("${GEN_X_LIST}" STREQUAL "")
# Replace the slot in place: remove the old entry, insert the new one at the
# same index.
list(REMOVE_AT ALL_${ELEMENT_TYPE}_${LIST_TYPE} ${GEN_NUMBER})
list(INSERT ALL_${ELEMENT_TYPE}_${LIST_TYPE} ${GEN_NUMBER} ${GEN_X_LIST})
endmacro(ADD_ITEM_FOR_GEN)
# Sets OUT_FLAG to TRUE when at least one platform is registered in the
# ALL_<TYPE>_PLATFORMS slot of gen GEN_NUMBER, FALSE when the slot still holds
# only the " " placeholder.
# Side effect: GEN_X_PLATFORMS is overwritten in the caller's scope.
macro(GEN_CONTAINS_PLATFORMS TYPE GEN_NUMBER OUT_FLAG)
GET_LIST_FOR_GEN("PLATFORMS" ${TYPE} ${GEN_NUMBER} GEN_X_PLATFORMS)
# Removing the spaces turns the placeholder into the empty string.
string(REPLACE " " "" GEN_X_PLATFORMS ${GEN_X_PLATFORMS})
if("${GEN_X_PLATFORMS}" STREQUAL "")
set(${OUT_FLAG} FALSE)
else("${GEN_X_PLATFORMS}" STREQUAL "")
set(${OUT_FLAG} TRUE)
endif("${GEN_X_PLATFORMS}" STREQUAL "")
endmacro(GEN_CONTAINS_PLATFORMS)
# Collects every platform registered for TYPE (e.g. "SUPPORTED", "TESTED")
# across all gens, walking from MAX_GEN down to 0.
#
# Results, all set in the caller's scope:
#   <TYPE>_PLATFORM_LIST              all platforms, highest gen first
#   <TYPE>_GEN_FLAGS_DEFINITONS (sic) <FLAG_NAME>_GEN<n> for every populated
#                                     gen, then <FLAG_NAME>_<platform> for
#                                     every platform
#   DEFAULT_<TYPE>_PLATFORM           first platform of the highest populated
#                                     gen, unless already set by the caller
#   DEFAULT_<TYPE>_GEN<n>_PLATFORM    first platform of gen n, unless already set
#   OUT_STR                           " <platform>" appended for each platform
macro(GET_AVAILABLE_PLATFORMS TYPE FLAG_NAME OUT_STR)
  set(${TYPE}_PLATFORM_LIST)
  set(${TYPE}_GEN_FLAGS_DEFINITONS)
  foreach(GEN_NUM RANGE ${MAX_GEN} 0 -1)
    GEN_CONTAINS_PLATFORMS(${TYPE} ${GEN_NUM} GENX_HAS_PLATFORMS)
    if(${GENX_HAS_PLATFORMS})
      list(APPEND ${TYPE}_GEN_FLAGS_DEFINITONS ${FLAG_NAME}_GEN${GEN_NUM})
      GET_LIST_FOR_GEN("PLATFORMS" ${TYPE} ${GEN_NUM} ${TYPE}_GENX_PLATFORMS)
      list(APPEND ${TYPE}_PLATFORM_LIST ${${TYPE}_GENX_PLATFORMS})
      if(NOT DEFAULT_${TYPE}_PLATFORM)
        # list(GET <list> <index>... <out-var>): the output variable must be
        # the last argument. The previous trailing ${PLATFORM_IT} only worked
        # because it expanded to nothing at this point.
        list(GET ${TYPE}_PLATFORM_LIST 0 DEFAULT_${TYPE}_PLATFORM)
      endif()
      if(NOT DEFAULT_${TYPE}_GEN${GEN_NUM}_PLATFORM)
        list(GET ${TYPE}_GENX_PLATFORMS 0 DEFAULT_${TYPE}_GEN${GEN_NUM}_PLATFORM)
      endif()
    endif()
  endforeach()
  foreach(PLATFORM_IT ${${TYPE}_PLATFORM_LIST})
    set(${OUT_STR} "${${OUT_STR}} ${PLATFORM_IT}")
    list(APPEND ${TYPE}_GEN_FLAGS_DEFINITONS ${FLAG_NAME}_${PLATFORM_IT})
  endforeach()
endmacro(GET_AVAILABLE_PLATFORMS)
# Convenience wrapper: stores the platforms registered for TYPE in gen
# GEN_NUMBER into OUT_LIST (the " " placeholder when there are none).
macro(GET_PLATFORMS_FOR_GEN TYPE GEN_NUMBER OUT_LIST)
GET_LIST_FOR_GEN("PLATFORMS" ${TYPE} ${GEN_NUMBER} ${OUT_LIST})
endmacro(GET_PLATFORMS_FOR_GEN)
# Stores in OUT_LIST every test configuration of TYPE (e.g. "UNIT_TESTS")
# registered for gen GEN_NUMBER that belongs to PLATFORM.
# A configuration is a "/"-separated string such as "skl/1/3/8"; its first
# field is the lower-case platform name, which is compared against the
# lower-cased PLATFORM.
macro(GET_TEST_CONFIGURATIONS_FOR_PLATFORM TYPE GEN_NUMBER PLATFORM OUT_LIST)
  set(${OUT_LIST})
  string(TOLOWER ${PLATFORM} PLATFORM_LOWER)
  GET_LIST_FOR_GEN("CONFIGURATIONS" ${TYPE} ${GEN_NUMBER} ALL_CONFIGURATIONS_FOR_GEN)
  foreach(CONFIGURATION ${ALL_CONFIGURATIONS_FOR_GEN})
    string(REPLACE "/" ";" CONFIGURATION_PARAMS ${CONFIGURATION})
    list(GET CONFIGURATION_PARAMS 0 CONFIGURATION_PLATFORM)
    # Both operands are quoted so they are compared as plain strings: a
    # variable that happens to be named like a platform (e.g. "skl") is not
    # dereferenced, and an empty value cannot produce a malformed if().
    if("${CONFIGURATION_PLATFORM}" STREQUAL "${PLATFORM_LOWER}")
      list(APPEND ${OUT_LIST} ${CONFIGURATION})
    endif()
  endforeach()
endmacro()
# Sets OUT_FLAG to TRUE when PLATFORM_NAME is registered in the "SUPPORTED_2_0"
# platform list of gen GEN_NUMBER, and to FALSE otherwise.
# Side effect: GEN_X_PLATFORMS and PLATFORM_EXISTS are overwritten in the
# caller's scope.
macro(PLATFORM_HAS_2_0 GEN_NUMBER PLATFORM_NAME OUT_FLAG)
  GET_LIST_FOR_GEN("PLATFORMS" "SUPPORTED_2_0" ${GEN_NUMBER} GEN_X_PLATFORMS)
  # list(FIND) yields -1 when the platform is not in the list.
  list(FIND GEN_X_PLATFORMS ${PLATFORM_NAME} PLATFORM_EXISTS)
  if(NOT "${PLATFORM_EXISTS}" LESS 0)
    set(${OUT_FLAG} TRUE)
  else()
    set(${OUT_FLAG} FALSE)
  endif()
endmacro()
# Sets OUT_FLAG to TRUE when PLATFORM_NAME is registered in the
# "TESTED_APPVERIFIER" platform list of gen GEN_NUMBER, and to FALSE otherwise.
# Side effect: GEN_X_PLATFORMS and PLATFORM_EXISTS are overwritten in the
# caller's scope.
macro(PLATFORM_TESTED_WITH_APPVERIFIER GEN_NUMBER PLATFORM_NAME OUT_FLAG)
  GET_LIST_FOR_GEN("PLATFORMS" "TESTED_APPVERIFIER" ${GEN_NUMBER} GEN_X_PLATFORMS)
  # list(FIND) yields -1 when the platform is not in the list.
  list(FIND GEN_X_PLATFORMS ${PLATFORM_NAME} PLATFORM_EXISTS)
  if(NOT "${PLATFORM_EXISTS}" LESS 0)
    set(${OUT_FLAG} TRUE)
  else()
    set(${OUT_FLAG} FALSE)
  endif()
endmacro()
# default flag for GenX devices support
set(SUPPORT_GEN_DEFAULT TRUE CACHE BOOL "default value for SUPPORT_GENx")
# default flag for platform support
set(SUPPORT_PLATFORM_DEFAULT TRUE CACHE BOOL "default value for support platform")
# Define the hardware configurations we support
# (each SUPPORT_GENx defaults to SUPPORT_GEN_DEFAULT)
set(SUPPORT_GEN8 ${SUPPORT_GEN_DEFAULT} CACHE BOOL "Support Gen8 devices")
set(SUPPORT_GEN9 ${SUPPORT_GEN_DEFAULT} CACHE BOOL "Support Gen9 devices")
# Define the hardware configurations we test
# (tests for a gen default to being built when that gen is supported)
set(TESTS_GEN8 ${SUPPORT_GEN8} CACHE BOOL "Build ULTs for Gen8 devices")
set(TESTS_GEN9 ${SUPPORT_GEN9} CACHE BOOL "Build ULTs for Gen9 devices")
# Per-platform switches for Gen9; they are only created when Gen9 is supported.
if(SUPPORT_GEN9)
set(SUPPORT_SKL ${SUPPORT_PLATFORM_DEFAULT} CACHE BOOL "Support SKL")
set(SUPPORT_KBL ${SUPPORT_PLATFORM_DEFAULT} CACHE BOOL "Support KBL")
set(SUPPORT_BXT ${SUPPORT_PLATFORM_DEFAULT} CACHE BOOL "Support BXT")
set(SUPPORT_GLK ${SUPPORT_PLATFORM_DEFAULT} CACHE BOOL "Support GLK")
endif()
# Per-platform test switches: created only for supported platforms and only
# when Gen9 tests are enabled; each defaults to the value of TESTS_GEN9.
if(TESTS_GEN9)
if(SUPPORT_SKL)
set(TESTS_SKL ${TESTS_GEN9} CACHE BOOL "Build ULTs for SKL")
endif()
if(SUPPORT_KBL)
set(TESTS_KBL ${TESTS_GEN9} CACHE BOOL "Build ULTs for KBL")
endif()
if(SUPPORT_GLK)
set(TESTS_GLK ${TESTS_GEN9} CACHE BOOL "Build ULTs for GLK")
endif()
if(SUPPORT_BXT)
set(TESTS_BXT ${TESTS_GEN9} CACHE BOOL "Build ULTs for BXT")
endif()
endif()
# Init lists
# Each call creates ALL_<second argument>_<first argument> with one empty
# slot per gen, e.g. ALL_TESTED_FAMILY_NAME or ALL_SUPPORTED_PLATFORMS.
INIT_LIST("FAMILY_NAME" "TESTED")
INIT_LIST("PLATFORMS" "SUPPORTED")
INIT_LIST("PLATFORMS" "SUPPORTED_2_0")
INIT_LIST("PLATFORMS" "TESTED")
INIT_LIST("PLATFORMS" "TESTED_APPVERIFIER")
INIT_LIST("CONFIGURATIONS" "UNIT_TESTS")
INIT_LIST("CONFIGURATIONS" "AUB_TESTS")
INIT_LIST("CONFIGURATIONS" "MT_TESTS")
# Add supported and tested platforms
# Gen8: BDW is the only platform. It is always registered as supported and as
# SUPPORTED_2_0; the test-related entries are added only when TESTS_GEN8 is on.
# Configuration strings start with the lower-case platform name.
# NOTE(review): the meaning of the numeric fields in "bdw/1/3/8" is not
# visible here - confirm where the configurations are consumed.
if(SUPPORT_GEN8)
ADD_ITEM_FOR_GEN("PLATFORMS" "SUPPORTED" 8 "BDW")
ADD_ITEM_FOR_GEN("PLATFORMS" "SUPPORTED_2_0" 8 "BDW")
if(TESTS_GEN8)
ADD_ITEM_FOR_GEN("FAMILY_NAME" "TESTED" 8 "BDWFamily")
ADD_ITEM_FOR_GEN("PLATFORMS" "TESTED" 8 "BDW")
ADD_ITEM_FOR_GEN("PLATFORMS" "TESTED_APPVERIFIER" 8 "BDW")
ADD_ITEM_FOR_GEN("CONFIGURATIONS" "AUB_TESTS" 8 "bdw/1/3/8")
ADD_ITEM_FOR_GEN("CONFIGURATIONS" "MT_TESTS" 8 "bdw/1/3/8")
ADD_ITEM_FOR_GEN("CONFIGURATIONS" "UNIT_TESTS" 8 "bdw/1/3/8")
endif()
endif(SUPPORT_GEN8)
# Gen9: SKL, KBL, GLK and BXT, each guarded by its own SUPPORT_/TESTS_ switch.
# SKL is registered first, so it is the first entry of the Gen9 platform list
# when it is enabled.
if(SUPPORT_GEN9)
# The family name is registered once per gen, independent of the platforms.
if(TESTS_GEN9)
ADD_ITEM_FOR_GEN("FAMILY_NAME" "TESTED" 9 "SKLFamily")
endif()
# SKL: supported, SUPPORTED_2_0; tests add APPVERIFIER plus AUB, MT and unit
# test configurations.
if(SUPPORT_SKL)
ADD_ITEM_FOR_GEN("PLATFORMS" "SUPPORTED" 9 "SKL")
ADD_ITEM_FOR_GEN("PLATFORMS" "SUPPORTED_2_0" 9 "SKL")
if(TESTS_SKL)
ADD_ITEM_FOR_GEN("PLATFORMS" "TESTED" 9 "SKL")
ADD_ITEM_FOR_GEN("PLATFORMS" "TESTED_APPVERIFIER" 9 "SKL")
ADD_ITEM_FOR_GEN("CONFIGURATIONS" "AUB_TESTS" 9 "skl/1/3/8")
ADD_ITEM_FOR_GEN("CONFIGURATIONS" "MT_TESTS" 9 "skl/1/3/8")
ADD_ITEM_FOR_GEN("CONFIGURATIONS" "UNIT_TESTS" 9 "skl/1/3/8")
endif()
endif()
# KBL: supported, SUPPORTED_2_0; tests add only a unit test configuration.
if(SUPPORT_KBL)
ADD_ITEM_FOR_GEN("PLATFORMS" "SUPPORTED" 9 "KBL")
ADD_ITEM_FOR_GEN("PLATFORMS" "SUPPORTED_2_0" 9 "KBL")
if(TESTS_KBL)
ADD_ITEM_FOR_GEN("PLATFORMS" "TESTED" 9 "KBL")
ADD_ITEM_FOR_GEN("CONFIGURATIONS" "UNIT_TESTS" 9 "kbl/1/3/6")
endif()
endif()
# GLK: supported but not in SUPPORTED_2_0; tests add only a unit test configuration.
if(SUPPORT_GLK)
ADD_ITEM_FOR_GEN("PLATFORMS" "SUPPORTED" 9 "GLK")
if(TESTS_GLK)
ADD_ITEM_FOR_GEN("PLATFORMS" "TESTED" 9 "GLK")
ADD_ITEM_FOR_GEN("CONFIGURATIONS" "UNIT_TESTS" 9 "glk/1/3/6")
endif()
endif()
# BXT: supported but not in SUPPORTED_2_0; tests add AUB and unit test configurations.
if(SUPPORT_BXT)
ADD_ITEM_FOR_GEN("PLATFORMS" "SUPPORTED" 9 "BXT")
if(TESTS_BXT)
ADD_ITEM_FOR_GEN("PLATFORMS" "TESTED" 9 "BXT")
ADD_ITEM_FOR_GEN("CONFIGURATIONS" "AUB_TESTS" 9 "bxt/1/3/6")
ADD_ITEM_FOR_GEN("CONFIGURATIONS" "UNIT_TESTS" 9 "bxt/1/3/6")
endif()
endif()
endif(SUPPORT_GEN9)
# Get platform lists, flag definitions and set default platforms.
# GET_AVAILABLE_PLATFORMS fills <TYPE>_PLATFORM_LIST and, unless the caller
# already provided one, DEFAULT_<TYPE>_PLATFORM.
GET_AVAILABLE_PLATFORMS("SUPPORTED" "SUPPORT" ALL_AVAILABLE_SUPPORTED_PLATFORMS)
GET_AVAILABLE_PLATFORMS("TESTED" "TESTS" ALL_AVAILABLE_TESTED_PLATFORMS)
message(STATUS "All supported platforms: ${ALL_AVAILABLE_SUPPORTED_PLATFORMS}")
message(STATUS "All tested platforms: ${ALL_AVAILABLE_TESTED_PLATFORMS}")

# A default supported platform is mandatory and must be one of the enabled
# platforms. The value is quoted so that an empty default (no platform enabled
# at all) reaches the FATAL_ERROR below instead of failing inside list(FIND)
# with a wrong-number-of-arguments error.
message(STATUS "Default supported platform: ${DEFAULT_SUPPORTED_PLATFORM}")
list(FIND SUPPORTED_PLATFORM_LIST "${DEFAULT_SUPPORTED_PLATFORM}" VALID_DEFAULT_SUPPORTED_PLATFORM)
if(VALID_DEFAULT_SUPPORTED_PLATFORM LESS 0)
  message(FATAL_ERROR "Not a valid supported platform: ${DEFAULT_SUPPORTED_PLATFORM}")
endif()

# A default tested platform is optional; it is validated only when set.
message(STATUS "Default tested platform: ${DEFAULT_TESTED_PLATFORM}")
if(DEFAULT_TESTED_PLATFORM)
  list(FIND TESTED_PLATFORM_LIST "${DEFAULT_TESTED_PLATFORM}" VALID_DEFAULT_TESTED_PLATFORM)
  if(VALID_DEFAULT_TESTED_PLATFORM LESS 0)
    message(FATAL_ERROR "Not a valid tested platform: ${DEFAULT_TESTED_PLATFORM}")
  endif()
endif()
# Unless the caller already chose one, the default tested family name is the
# entry of the highest gen that has one registered: the slots of
# ALL_TESTED_FAMILY_NAME are scanned from MAX_GEN down to 0 and the first one
# that is not the " " placeholder wins. The variable stays unset when no
# family name is registered.
if(NOT DEFAULT_TESTED_FAMILY_NAME)
foreach(GEN_NUM RANGE ${MAX_GEN} 0 -1)
list(GET ALL_TESTED_FAMILY_NAME ${GEN_NUM} GEN_FAMILY_NAME)
if(NOT GEN_FAMILY_NAME STREQUAL " ")
set(DEFAULT_TESTED_FAMILY_NAME ${GEN_FAMILY_NAME})
break()
endif()
endforeach()
endif()
message(STATUS "Default tested family name: ${DEFAULT_TESTED_FAMILY_NAME}")

44
public/cl_ext_private.h Normal file
View File

@@ -0,0 +1,44 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
/***************************************
* Internal-only definitions            *
* NOTE(review): the original banner said "queue properties", but the two
* macros below are named CL_DEVICE_* - confirm where they are used.
****************************************/
// Currently under Intel evaluation. Remove from this private header after
// approval for public release.
#define CL_DEVICE_DRIVER_VERSION_INTEL 0x10010
// The bytes of this value read "ENH1" in ASCII (0x45 'E', 0x4E 'N', 0x48 'H',
// 0x31 '1'), although the macro name ends in NEO1.
#define CL_DEVICE_DRIVER_VERSION_INTEL_NEO1 0x454E4831 // Driver version is ENH1
/***************************************
* cl_intel_debug_info extension        *
****************************************/
// Extension presence macro.
#define cl_intel_debug_info 1
// New queries for clGetProgramInfo:
#define CL_PROGRAM_DEBUG_INFO_INTEL 0x4100
#define CL_PROGRAM_DEBUG_INFO_SIZES_INTEL 0x4101
// New queries for clGetKernelInfo:
// (CL_KERNEL_BINARY_PROGRAM_INTEL lies outside the 0x4100-0x4103 range used
// by the other tokens of this extension)
#define CL_KERNEL_BINARY_PROGRAM_INTEL 0x407D
#define CL_KERNEL_BINARIES_INTEL 0x4102
#define CL_KERNEL_BINARY_SIZES_INTEL 0x4103

414
public/cl_vebox_intel.h Normal file
View File

@@ -0,0 +1,414 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef __CL_EXT_VEBOX_INTEL_H
#define __CL_EXT_VEBOX_INTEL_H
#ifdef __cplusplus
extern "C" {
#endif
#include <CL/cl.h>
/***************************************
* cl_intel_video_enhancement extension *
****************************************/
// Accelerator type token for the video enhancement (VE) accelerator.
#define CL_ACCELERATOR_TYPE_VE_INTEL 0x9
// Device-level tokens.
// NOTE(review): presumably queried through clGetDeviceInfo - confirm.
#define CL_DEVICE_VE_VERSION_INTEL 0x4160
#define CL_DEVICE_VE_ENGINE_COUNT_INTEL 0x4161
#define CL_DEVICE_VE_COLOR_PIPE_VERSION_INTEL 0x416A
#define CL_DEVICE_VE_CAMERA_PIPE_VERSION_INTEL 0x4177
// VE version values.
#define CL_VE_VERSION_VER_1_INTEL 0x1
#define CL_VE_VERSION_VER_2_INTEL 0x2
#define CL_VE_VERSION_VER_3_INTEL 0x3
// Queue-level token.
// NOTE(review): presumably a command-queue property enabling VE - confirm.
#define CL_QUEUE_VE_ENABLE_INTEL 0x4162
// VE Attributes
// Attribute identifiers; the values are not contiguous by feature (0x4163-
// 0x4165, then 0x416B-0x4176 and 0x4178-0x417B) because the statistics tokens
// and device queries occupy the gaps.
// NOTE(review): presumably each identifier selects one of the
// cl_ve_*_attrib_intel structures declared later in this header - confirm.
#define CL_VE_ACCELERATOR_ATTRIB_DENOISE_INTEL 0x4163
#define CL_VE_ACCELERATOR_ATTRIB_DEINTERLACE_INTEL 0x4164
#define CL_VE_ACCELERATOR_ATTRIB_HPC_INTEL 0x4165
#define CL_VE_ACCELERATOR_ATTRIB_STD_STE_INTEL 0x416B
#define CL_VE_ACCELERATOR_ATTRIB_GAMUT_COMP_INTEL 0x416C
#define CL_VE_ACCELERATOR_ATTRIB_GECC_INTEL 0x416D
#define CL_VE_ACCELERATOR_ATTRIB_ACE_INTEL 0x416E
#define CL_VE_ACCELERATOR_ATTRIB_ACE_ADVANCED_INTEL 0x416F
#define CL_VE_ACCELERATOR_ATTRIB_TCC_INTEL 0x4170
#define CL_VE_ACCELERATOR_ATTRIB_PROC_AMP_INTEL 0x4171
#define CL_VE_ACCELERATOR_ATTRIB_BACK_END_CSC_INTEL 0x4172
#define CL_VE_ACCELERATOR_ATTRIB_AOI_ALPHA_INTEL 0x4173
#define CL_VE_ACCELERATOR_ATTRIB_CCM_INTEL 0x4174
#define CL_VE_ACCELERATOR_ATTRIB_FWD_GAMMA_CORRECTION_INTEL 0x4175
#define CL_VE_ACCELERATOR_ATTRIB_FRONT_END_CSC_INTEL 0x4176
#define CL_VE_ACCELERATOR_ATTRIB_BLC_INTEL 0x4178
#define CL_VE_ACCELERATOR_ATTRIB_DEMOSAIC_INTEL 0x4179
#define CL_VE_ACCELERATOR_ATTRIB_WBC_INTEL 0x417A
#define CL_VE_ACCELERATOR_ATTRIB_VIGNETTE_INTEL 0x417B
// VE Statistics
// Tokens 0x4166-0x4169, filling the gap after the first three attribute ids.
#define CL_VE_ACCELERATOR_HISTOGRAMS_INTEL 0x4166
#define CL_VE_ACCELERATOR_STATISTICS_INTEL 0x4167
#define CL_VE_ACCELERATOR_STMM_INPUT_INTEL 0x4168
#define CL_VE_ACCELERATOR_STMM_OUTPUT_INTEL 0x4169
// Denoise Control
// Integer range [0, 64], default 32.
#define CL_VE_DENOISE_FACTOR_MAX_INTEL 64
#define CL_VE_DENOISE_FACTOR_MIN_INTEL 0
#define CL_VE_DENOISE_FACTOR_DEFAULT_INTEL 32
// Hot Pixel Correction ranges
// Threshold: integer range [0, 255], default 0.
#define CL_VE_HPC_THRESHOLD_MAX_INTEL 255
#define CL_VE_HPC_THRESHOLD_MIN_INTEL 0
#define CL_VE_HPC_THRESHOLD_DEFAULT_INTEL 0
// Pixel count: integer range [0, 8], default 0.
#define CL_VE_HPC_PIXEL_COUNT_MAX_INTEL 8
#define CL_VE_HPC_PIXEL_COUNT_MIN_INTEL 0
#define CL_VE_HPC_PIXEL_COUNT_DEFAULT_INTEL 0
// Skin tone detection/enhancement ranges
// Integer range [0, 10], default 3.
#define CL_VE_STE_FACTOR_MIN_INTEL 0
#define CL_VE_STE_FACTOR_MAX_INTEL 10
#define CL_VE_STE_FACTOR_DEFAULT_INTEL 3
// Constants for gamut compression scaling factors
// Float range [0.0, 4.0], default 0.0.
#define CL_VE_GAMUT_SCALING_FACTOR_MAX_INTEL 4.0f
#define CL_VE_GAMUT_SCALING_FACTOR_MIN_INTEL 0.0f
#define CL_VE_GAMUT_SCALING_FACTOR_DEFAULT_INTEL 0.0f
// Chromaticity controls: float range [0.0, 1.0].
// Note the spelling difference between the range macros ("CONTROLS") and the
// per-channel macros below ("CONTRL"); the names are part of the public API.
#define CL_VE_GAMUT_CHROMATICITY_CONTROLS_MAX_INTEL 1.0f
#define CL_VE_GAMUT_CHROMATICITY_CONTROLS_MIN_INTEL 0.0f
// Default x/y values per channel (R, G, B).
#define CL_VE_GAMUT_CHROMATICITY_CONTRL_RX_DEFAULT_INTEL 0.576f
#define CL_VE_GAMUT_CHROMATICITY_CONTRL_GX_DEFAULT_INTEL 0.331f
#define CL_VE_GAMUT_CHROMATICITY_CONTRL_BX_DEFAULT_INTEL 0.143f
#define CL_VE_GAMUT_CHROMATICITY_CONTRL_RY_DEFAULT_INTEL 0.343f
#define CL_VE_GAMUT_CHROMATICITY_CONTRL_GY_DEFAULT_INTEL 0.555f
#define CL_VE_GAMUT_CHROMATICITY_CONTRL_BY_DEFAULT_INTEL 0.104f
// x/y values per channel for sRGB.
#define CL_VE_GAMUT_CHROMATICITY_CONTRL_RX_SRGB_INTEL 0.640f
#define CL_VE_GAMUT_CHROMATICITY_CONTRL_GX_SRGB_INTEL 0.300f
#define CL_VE_GAMUT_CHROMATICITY_CONTRL_BX_SRGB_INTEL 0.150f
#define CL_VE_GAMUT_CHROMATICITY_CONTRL_RY_SRGB_INTEL 0.330f
#define CL_VE_GAMUT_CHROMATICITY_CONTRL_GY_SRGB_INTEL 0.600f
#define CL_VE_GAMUT_CHROMATICITY_CONTRL_BY_SRGB_INTEL 0.060f
// Constants for gamut expansion / color correction
// The piece count is the length of the four gamma correction arrays in
// cl_ve_gecc_attrib_intel.
#define CL_VE_GECC_PIECE_COUNT_INTEL 11
// Transform coefficients: float range [-4.0, 4.0]; C0..C8 are the defaults.
// NOTE(review): presumably C0..C8 fill the 3x3 matrix of
// cl_ve_gecc_attrib_intel in row-major order - confirm.
#define CL_VE_GECC_TX_COEFFICIENTS_MIN_INTEL -4.0f
#define CL_VE_GECC_TX_COEFFICIENTS_MAX_INTEL 4.0f
#define CL_VE_GECC_TX_COEFF_C0_DEFAULT_INTEL 0.681f
#define CL_VE_GECC_TX_COEFF_C1_DEFAULT_INTEL 0.278f
#define CL_VE_GECC_TX_COEFF_C2_DEFAULT_INTEL 0.008f
#define CL_VE_GECC_TX_COEFF_C3_DEFAULT_INTEL 0.017f
#define CL_VE_GECC_TX_COEFF_C4_DEFAULT_INTEL 0.894f
#define CL_VE_GECC_TX_COEFF_C5_DEFAULT_INTEL -0.012f
#define CL_VE_GECC_TX_COEFF_C6_DEFAULT_INTEL -0.002f
#define CL_VE_GECC_TX_COEFF_C7_DEFAULT_INTEL 0.041f
#define CL_VE_GECC_TX_COEFF_C8_DEFAULT_INTEL 0.838f
// Input offsets are integers in [-16384, 16383]; output offsets are floats
// in [-4.0, 4.0].
#define CL_VE_GECC_TX_OFFSET_IN_MIN_INTEL -16384
#define CL_VE_GECC_TX_OFFSET_IN_MAX_INTEL 16383
#define CL_VE_GECC_TX_OFFSET_OUT_MIN_INTEL -4.0f
#define CL_VE_GECC_TX_OFFSET_OUT_MAX_INTEL 4.0f
// AOI Parameter defaults
#define CL_VE_AOI_RANGE_DEFAULT_INTEL 0
#define CL_VE_AOI_ALPHA_DEFAULT_INTEL 0
// CCM Config Parameter Range
// Float range [-16.0, 16.0], default 0.0.
#define CL_VE_CCM_COEFFICIENTS_MIN_INTEL -16.0f
#define CL_VE_CCM_COEFFICIENTS_MAX_INTEL 16.0f
#define CL_VE_CCM_COEFFICIENTS_DEFAULT_INTEL 0.0f
// CSC Config Parameter Range
// Offsets: float range [-256.0, 256.0]; coefficients: float range [-4.0, 4.0].
#define CL_VE_CSC_OFFSET_MIN_INTEL -256.0f
#define CL_VE_CSC_OFFSET_MAX_INTEL 256.0f
#define CL_VE_CSC_COEFF_MIN_INTEL -4.0f
#define CL_VE_CSC_COEFF_MAX_INTEL 4.0f
// Constants for specific color spaces
// NOTE(review): presumably the values accepted by src_color_space in
// cl_ve_gamut_comp_attrib_intel - confirm.
#define CL_VE_GAMUT_CS_BT601_INTEL 0x0
#define CL_VE_GAMUT_CS_BT709_INTEL 0x1
#define CL_VE_GAMUT_CS_XVYCC601_INTEL 0x2
#define CL_VE_GAMUT_CS_XVYCC709_INTEL 0x3
// LACE/ACE Control
#define CL_VE_ACE_PIECE_COUNT_INTEL 10
// Level: integer range [0, 9], default 5.
#define CL_VE_ACE_LEVEL_MIN_INTEL 0
#define CL_VE_ACE_LEVEL_MAX_INTEL 9
#define CL_VE_ACE_LEVEL_DEFAULT_INTEL 5
// Strength: integer range [0, 6], default 1.
#define CL_VE_ACE_STRENGTH_MIN_INTEL 0
#define CL_VE_ACE_STRENGTH_MAX_INTEL 6
#define CL_VE_ACE_STRENGTH_DEFAULT_INTEL 1
// Skin threshold: integer range [1, 31], default 26.
#define CL_VE_ACE_SKIN_THRESHOLD_MIN_INTEL 1
#define CL_VE_ACE_SKIN_THRESHOLD_MAX_INTEL 31
#define CL_VE_ACE_SKIN_THRESHOLD_DEFAULT_INTEL 26
// TCC Parameter Range
// Integer range [0, 255], default 220.
#define CL_VE_TCC_MIN_INTEL 0
#define CL_VE_TCC_MAX_INTEL 255
#define CL_VE_TCC_DEFAULT_INTEL 220
// Proc-Amp Ranges
// Brightness: float range [-100.0, 100.0], default 0.0.
#define CL_VE_PROCAMP_BRIGHTNESS_MIN_INTEL -100.0f
#define CL_VE_PROCAMP_BRIGHTNESS_MAX_INTEL 100.0f
#define CL_VE_PROCAMP_BRIGHTNESS_DEFAULT_INTEL 0.0f
// Contrast: float range [0.0, 15.0], default 1.0.
#define CL_VE_PROCAMP_CONTRAST_MIN_INTEL 0.0f
#define CL_VE_PROCAMP_CONTRAST_MAX_INTEL 15.0f
#define CL_VE_PROCAMP_CONTRAST_DEFAULT_INTEL 1.0f
// Hue: float range [-180.0, 180.0], default 0.0.
#define CL_VE_PROCAMP_HUE_MIN_INTEL -180.0f
#define CL_VE_PROCAMP_HUE_MAX_INTEL 180.0f
#define CL_VE_PROCAMP_HUE_DEFAULT_INTEL 0.0f
// Saturation: float range [0.0, 8.0], default 1.0.
#define CL_VE_PROCAMP_SATURATION_MIN_INTEL 0.0f
#define CL_VE_PROCAMP_SATURATION_MAX_INTEL 8.0f
#define CL_VE_PROCAMP_SATURATION_DEFAULT_INTEL 1.0f
// BLC Parameter Range
// Integer range [-65536, 65535], default 0.
#define CL_VE_BLC_MIN_INTEL -65536
#define CL_VE_BLC_MAX_INTEL 65535
#define CL_VE_BLC_DEFAULT_INTEL 0
// WBC Parameter Range
// Float range [0.0, 16.0], default 0.0.
#define CL_VE_WBC_MIN_INTEL 0.0f
#define CL_VE_WBC_MAX_INTEL 16.0f
#define CL_VE_WBC_DEFAULT_INTEL 0.0f
// FGC Parameter Range
// Only a default is defined; no MIN/MAX macros exist for FGC here.
#define CL_VE_FGC_DEFAULT_INTEL 0
// Video enhancement kernel flags
// Each flag occupies its own bit (bits 0-6), so flags can be OR-ed together.
#define CL_VE_FIRST_FRAME_INTEL (1 << 0)
#define CL_VE_RESET_DN_HISTORY_INTEL (1 << 1)
#define CL_VE_RESET_DI_HISTORY_INTEL (1 << 2)
#define CL_VE_RESET_ACE_HISTORY_INTEL (1 << 3)
#define CL_VE_RESET_STE_HISTORY_INTEL (1 << 4)
#define CL_VE_GENERATE_LACE_HISTOGRAM_128_BINS_INTEL (1 << 5)
#define CL_VE_GENERATE_LACE_HISTOGRAM_256_BINS_INTEL (1 << 6)
// Bayer pattern controls
// Two independent enumerations that both start at 0: the sample format
// (8 or 16 bit) and the pattern offset (BG, RG, GR, GB).
#define CL_VE_BAYER_PATTERN_FORMAT_8BIT_INTEL 0x0
#define CL_VE_BAYER_PATTERN_FORMAT_16BIT_INTEL 0x1
#define CL_VE_BAYER_PATTERN_OFFSET_BG_INTEL 0x0
#define CL_VE_BAYER_PATTERN_OFFSET_RG_INTEL 0x1
#define CL_VE_BAYER_PATTERN_OFFSET_GR_INTEL 0x2
#define CL_VE_BAYER_PATTERN_OFFSET_GB_INTEL 0x3
// Default color-space conversion coefficients
// Two parameter sets (YUV_TO_RGB and RGB_TO_YUV), each made of three input
// offsets (OFFSET_IN_<i>), a 3x3 coefficient table (TX_COEFF_<a>_<b>) and three
// output offsets (OFFSET_OUT_<i>). All values are float literals.
// NOTE(review): presumably applied as out = TX_COEFF * (in + OFFSET_IN) + OFFSET_OUT
// with TX_COEFF_<row>_<column> indexing -- confirm against the consumer.
// NOTE(review): the numbers match the commonly published BT.601 limited-range
// (16..235 luma) YCbCr <-> RGB coefficients -- confirm the intended standard.
//
// YUV -> RGB defaults
#define CL_VE_CSC_DEFAULT_YUV_TO_RGB_OFFSET_IN_0 (-16.0f)
#define CL_VE_CSC_DEFAULT_YUV_TO_RGB_OFFSET_IN_1 (-128.0f)
#define CL_VE_CSC_DEFAULT_YUV_TO_RGB_OFFSET_IN_2 (-128.0f)
#define CL_VE_CSC_DEFAULT_YUV_TO_RGB_TX_COEFF_0_0 (1.164f)
#define CL_VE_CSC_DEFAULT_YUV_TO_RGB_TX_COEFF_0_1 (0.0f)
#define CL_VE_CSC_DEFAULT_YUV_TO_RGB_TX_COEFF_0_2 (1.596f)
#define CL_VE_CSC_DEFAULT_YUV_TO_RGB_TX_COEFF_1_0 (1.164f)
#define CL_VE_CSC_DEFAULT_YUV_TO_RGB_TX_COEFF_1_1 (-0.392f)
#define CL_VE_CSC_DEFAULT_YUV_TO_RGB_TX_COEFF_1_2 (-0.813f)
#define CL_VE_CSC_DEFAULT_YUV_TO_RGB_TX_COEFF_2_0 (1.164f)
#define CL_VE_CSC_DEFAULT_YUV_TO_RGB_TX_COEFF_2_1 (2.017f)
#define CL_VE_CSC_DEFAULT_YUV_TO_RGB_TX_COEFF_2_2 (0.0f)
#define CL_VE_CSC_DEFAULT_YUV_TO_RGB_OFFSET_OUT_0 (0.0f)
#define CL_VE_CSC_DEFAULT_YUV_TO_RGB_OFFSET_OUT_1 (0.0f)
#define CL_VE_CSC_DEFAULT_YUV_TO_RGB_OFFSET_OUT_2 (0.0f)
// RGB -> YUV defaults (input offsets are zero; the 16/128/128 offsets are
// applied on the output side instead)
#define CL_VE_CSC_DEFAULT_RGB_TO_YUV_OFFSET_IN_0 (0.0f)
#define CL_VE_CSC_DEFAULT_RGB_TO_YUV_OFFSET_IN_1 (0.0f)
#define CL_VE_CSC_DEFAULT_RGB_TO_YUV_OFFSET_IN_2 (0.0f)
#define CL_VE_CSC_DEFAULT_RGB_TO_YUV_TX_COEFF_0_0 (0.257f)
#define CL_VE_CSC_DEFAULT_RGB_TO_YUV_TX_COEFF_0_1 (0.504f)
#define CL_VE_CSC_DEFAULT_RGB_TO_YUV_TX_COEFF_0_2 (0.098f)
#define CL_VE_CSC_DEFAULT_RGB_TO_YUV_TX_COEFF_1_0 (-0.148f)
#define CL_VE_CSC_DEFAULT_RGB_TO_YUV_TX_COEFF_1_1 (-0.291f)
#define CL_VE_CSC_DEFAULT_RGB_TO_YUV_TX_COEFF_1_2 (0.439f)
#define CL_VE_CSC_DEFAULT_RGB_TO_YUV_TX_COEFF_2_0 (0.439f)
#define CL_VE_CSC_DEFAULT_RGB_TO_YUV_TX_COEFF_2_1 (-0.368f)
#define CL_VE_CSC_DEFAULT_RGB_TO_YUV_TX_COEFF_2_2 (-0.071f)
#define CL_VE_CSC_DEFAULT_RGB_TO_YUV_OFFSET_OUT_0 (16.0f)
#define CL_VE_CSC_DEFAULT_RGB_TO_YUV_OFFSET_OUT_1 (128.0f)
#define CL_VE_CSC_DEFAULT_RGB_TO_YUV_OFFSET_OUT_2 (128.0f)
// Forward Gamma Correction controls
// Length of each lookup array in cl_ve_fgc_attrib_intel.
// NOTE(review): unlike the other macros in this section this name carries no
// _INTEL suffix; it is public API, so it is left unchanged.
#define CL_VE_FWD_GAMMA_SEGMENT_COUNT 64
// Integer identifier of an attribute; used as the attrib_id member of
// cl_ve_attrib_desc_intel.
typedef cl_uint cl_ve_accelerator_attrib_id;
// Denoise ("dn") attribute data: three boolean switches plus a numeric
// denoise factor.
// NOTE(review): valid range of denoise_factor and its interaction with
// auto_detect are not visible here -- confirm against the extension spec.
typedef struct _cl_ve_dn_attrib_intel {
cl_bool enable_luma;
cl_bool enable_chroma;
cl_bool auto_detect;
cl_uint denoise_factor;
} cl_ve_dn_attrib_intel;
// "di" attribute data: an enable switch, a motion-compensation switch and a
// top_first flag.
// NOTE(review): "di" presumably stands for deinterlacing and top_first for the
// field order -- confirm.
typedef struct _cl_ve_di_attrib_intel {
cl_bool enabled;
cl_bool motion_compensation_enabled;
cl_bool top_first;
} cl_ve_di_attrib_intel;
// "std/ste" attribute data: enable switch, a numeric ste_factor and a flag
// selecting "write STD decisions only" operation.
// NOTE(review): STD/STE presumably mean skin-tone detection / enhancement --
// confirm; the range of ste_factor is not visible here.
typedef struct _cl_ve_std_ste_attrib_intel {
cl_bool enabled;
cl_uint ste_factor;
cl_bool write_std_decisions_only;
} cl_ve_std_ste_attrib_intel;
// Gamut "comp" attribute data: enable switch, an advanced-mode switch, a source
// color-space selector, a scaling factor for basic mode and two 3-element float
// arrays describing the display RGB x and y values.
// NOTE(review): display_rgb_x/y are presumably chromaticity coordinates of the
// R, G and B primaries, in that order -- confirm.
typedef struct _cl_ve_gamut_comp_attrib_intel {
cl_bool enabled;
cl_bool advanced_mode_enable;
cl_uint src_color_space;
cl_float basic_mode_scaling_factor;
cl_float display_rgb_x[3];
cl_float display_rgb_y[3];
} cl_ve_gamut_comp_attrib_intel;
// "gecc" attribute data: enable and advanced-mode switches, a 3x3 float matrix
// with input/output offsets, and four byte tables of
// CL_VE_GECC_PIECE_COUNT_INTEL entries each (gamma and inverse-gamma, in/out).
// NOTE(review): offset_in is cl_int here while offset_out is cl_float, and the
// becsc/fecsc structs use cl_float for both -- confirm this is intentional.
// The layout is public ABI, so it is documented rather than changed.
typedef struct _cl_ve_gecc_attrib_intel {
cl_bool enabled;
cl_bool use_advanced_mode;
cl_float matrix[3][3];
cl_int offset_in[3];
cl_float offset_out[3];
cl_uchar gamma_correction_in[CL_VE_GECC_PIECE_COUNT_INTEL];
cl_uchar gamma_correction_out[CL_VE_GECC_PIECE_COUNT_INTEL];
cl_uchar inv_gamma_correction_in[CL_VE_GECC_PIECE_COUNT_INTEL];
cl_uchar inv_gamma_correction_out[CL_VE_GECC_PIECE_COUNT_INTEL];
} cl_ve_gecc_attrib_intel;
// "ace" attribute data: enable switch, a byte-sized skin threshold and two
// numeric controls (level, strength).
// NOTE(review): ACE presumably means automatic contrast enhancement; value
// ranges are not visible here -- confirm.
typedef struct _cl_ve_ace_attrib_intel {
cl_bool enabled;
cl_uchar skin_threshold;
cl_uint level;
cl_uint strength;
} cl_ve_ace_attrib_intel;
// Advanced "ace" attribute data: enable switch, byte-sized luma minimum and
// maximum, and two byte tables (luma_in, luma_out) of
// CL_VE_ACE_PIECE_COUNT_INTEL entries each.
// NOTE(review): luma_in/luma_out presumably form a piecewise mapping of input
// to output luma -- confirm.
typedef struct _cl_ve_ace_advanced_attrib_intel {
cl_bool enabled;
cl_uchar luma_min;
cl_uchar luma_max;
cl_uchar luma_in[CL_VE_ACE_PIECE_COUNT_INTEL];
cl_uchar luma_out[CL_VE_ACE_PIECE_COUNT_INTEL];
} cl_ve_ace_advanced_attrib_intel;
// "tcc" attribute data: enable switch plus one byte-sized saturation value for
// each of six colors (red, green, blue, cyan, magenta, yellow).
// NOTE(review): TCC presumably means total color control; the neutral value of
// each saturation byte is not visible here -- confirm.
typedef struct _cl_ve_tcc_attrib_intel {
cl_bool enabled;
cl_uchar red_saturation;
cl_uchar green_saturation;
cl_uchar blue_saturation;
cl_uchar cyan_saturation;
cl_uchar magenta_saturation;
cl_uchar yellow_saturation;
} cl_ve_tcc_attrib_intel;
// "procamp" attribute data: enable switch and four float controls
// (brightness, contrast, hue, saturation).
// NOTE(review): units and valid ranges are not visible here -- confirm.
typedef struct _cl_ve_procamp_attrib_intel {
cl_bool enabled;
cl_float brightness;
cl_float contrast;
cl_float hue;
cl_float saturation;
} cl_ve_procamp_attrib_intel;
// "becsc" attribute data: enable switch, a YUV channel-swap switch, and a 3x3
// float matrix with 3-element float input and output offsets.
// Same offset/matrix/offset fields as cl_ve_fecsc_attrib_intel, plus
// yuv_channel_swap.
// NOTE(review): presumably "back-end color-space conversion", with defaults
// taken from the CL_VE_CSC_DEFAULT_* macros -- confirm.
typedef struct _cl_ve_becsc_attrib_intel {
cl_bool enabled;
cl_bool yuv_channel_swap;
cl_float offset_in[3];
cl_float matrix[3][3];
cl_float offset_out[3];
} cl_ve_becsc_attrib_intel;
// "aoi/alpha" attribute data: an AOI enable switch with a rectangle given by
// x/y minimum and maximum, and an alpha enable switch with a 16-bit alpha value.
// NOTE(review): AOI presumably means area of interest; whether the max bounds
// are inclusive is not visible here -- confirm.
typedef struct _cl_ve_aoi_alpha_attrib_intel {
cl_bool aoi_enabled;
cl_uint x_min;
cl_uint x_max;
cl_uint y_min;
cl_uint y_max;
cl_bool alpha_enable;
cl_ushort alpha_value;
} cl_ve_aoi_alpha_attrib_intel;
// "hpc" attribute data: enable switch and two byte-sized controls
// (threshold, count).
// NOTE(review): HPC presumably means hot pixel correction -- confirm.
typedef struct _cl_ve_hpc_attrib_intel {
cl_bool enabled;
cl_uchar threshold;
cl_uchar count;
} cl_ve_hpc_attrib_intel;
// "blc" attribute data: enable switch and four signed black-point offsets, one
// per channel (red, green-top, green-bottom, blue).
// NOTE(review): BLC presumably means black level correction -- confirm.
typedef struct _cl_ve_blc_attrib_intel {
cl_bool enabled;
cl_int black_point_offset_red;
cl_int black_point_offset_green_top;
cl_int black_point_offset_green_bottom;
cl_int black_point_offset_blue;
} cl_ve_blc_attrib_intel;
// Demosaic attribute data: input width, height and stride plus the Bayer
// pattern offset and format. Unlike most attribute structs in this header it
// has no enable switch.
// NOTE(review): bayer_pattern_offset / bayer_pattern_format presumably take the
// CL_VE_BAYER_PATTERN_OFFSET_* / CL_VE_BAYER_PATTERN_FORMAT_* values; whether
// input_stride is in bytes or pixels is not visible here -- confirm.
typedef struct _cl_ve_demosaic_attrib_intel {
cl_uint input_width;
cl_uint input_height;
cl_uint input_stride;
cl_uint bayer_pattern_offset;
cl_uint bayer_pattern_format;
} cl_ve_demosaic_attrib_intel;
// White-balance correction ("wbc") attribute data: enable switch and one float
// correction value per channel (red, green-top, green-bottom, blue).
// NOTE(review): presumably multiplicative gains -- confirm.
typedef struct _cl_ve_wbc_attrib_intel {
cl_bool enabled;
cl_float white_balance_red_correction;
cl_float white_balance_green_top_correction;
cl_float white_balance_green_bottom_correction;
cl_float white_balance_blue_correction;
} cl_ve_wbc_attrib_intel;
// Vignette attribute data: only an enable switch.
// NOTE(review): the per-channel data presumably travels separately as
// cl_vignette_format_intel entries -- confirm.
typedef struct _cl_ve_vignette_attrib_intel {
cl_bool enabled;
} cl_ve_vignette_attrib_intel;
// "ccm" attribute data: enable switch and a 3x3 float matrix.
// NOTE(review): CCM presumably means color correction matrix; row/column
// convention is not visible here -- confirm.
typedef struct _cl_ve_ccm_attrib_intel {
cl_bool enabled;
cl_float matrix[3][3];
} cl_ve_ccm_attrib_intel;
// Forward gamma correction ("fgc") attribute data: enable switch and four
// 16-bit tables of CL_VE_FWD_GAMMA_SEGMENT_COUNT (64) entries each -- one table
// of pixel values and one table of corrected values per color channel.
// NOTE(review): presumably entry i of pixel_value maps to entry i of each
// *_channel_corrected_value table -- confirm.
typedef struct _cl_ve_fgc_attrib_intel {
cl_bool enabled;
cl_ushort pixel_value[CL_VE_FWD_GAMMA_SEGMENT_COUNT];
cl_ushort red_channel_corrected_value[CL_VE_FWD_GAMMA_SEGMENT_COUNT];
cl_ushort green_channel_corrected_value[CL_VE_FWD_GAMMA_SEGMENT_COUNT];
cl_ushort blue_channel_corrected_value[CL_VE_FWD_GAMMA_SEGMENT_COUNT];
} cl_ve_fgc_attrib_intel;
// "fecsc" attribute data: enable switch and a 3x3 float matrix with 3-element
// float input and output offsets.
// Same offset/matrix/offset fields as cl_ve_becsc_attrib_intel, without the
// channel-swap switch.
// NOTE(review): presumably "front-end color-space conversion" -- confirm.
typedef struct _cl_ve_fecsc_attrib_intel {
cl_bool enabled;
cl_float offset_in[3];
cl_float matrix[3][3];
cl_float offset_out[3];
} cl_ve_fecsc_attrib_intel;
// One attribute entry: an attribute identifier paired with an untyped pointer
// to that attribute's data.
// NOTE(review): attrib_data presumably points at the cl_ve_*_attrib_intel
// struct selected by attrib_id; ownership and lifetime of the pointee are not
// visible here -- confirm.
typedef struct _cl_ve_attrib_desc_intel {
cl_ve_accelerator_attrib_id attrib_id;
void *attrib_data;
} cl_ve_attrib_desc_intel;
// Descriptor holding a count and a pointer to cl_ve_attrib_desc_intel entries.
// NOTE(review): attribs presumably points at an array of attrib_count entries
// -- confirm.
typedef struct _cl_ve_desc_intel {
cl_uint attrib_count;
cl_ve_attrib_desc_intel *attribs;
} cl_ve_desc_intel;
// Four 16-bit vignette values, stored in the order Red, GreenTop, Blue,
// GreenBottom. Note that the member names use CamelCase, unlike the snake_case
// members of the other structs in this header.
typedef struct _cl_vignette_format_intel {
cl_ushort Red;
cl_ushort GreenTop;
cl_ushort Blue;
cl_ushort GreenBottom;
} cl_vignette_format_intel;
#ifdef __cplusplus
}
#endif
#endif /* __CL_EXT_VEBOX_INTEL_H */

908
runtime/CMakeLists.txt Normal file
View File

@@ -0,0 +1,908 @@
# Copyright (c) 2017, Intel Corporation
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
# Minimum CMake version for this directory.
cmake_minimum_required (VERSION 3.0)
# Opt in to the NEW behavior of two policies, but only when the running CMake
# knows them:
#   CMP0042 - MACOSX_RPATH is enabled by default.
#   CMP0063 - visibility properties are honored for all target types.
if (POLICY CMP0042)
cmake_policy (SET CMP0042 NEW)
endif (POLICY CMP0042)
if (POLICY CMP0063)
cmake_policy (SET CMP0063 NEW)
endif (POLICY CMP0063)
project (neo)
# Visibility presets are currently disabled (left here commented out).
#set (CMAKE_CXX_VISIBILITY_PRESET default)
#set (CMAKE_VISIBILITY_INLINES_HIDDEN 1)
# Support for Windows Universal Drivers
# ENABLE_WUD is not defined in this file; it must be provided by an including
# or parent CMake script.
ENABLE_WUD()
# OS-specific part of the per-GEN file list (paths are relative to each
# gen<N>/ directory, see RUNTIME_SRCS_GENX below).
if(WIN32)
set(GEN_OS_SRC
windows/command_stream_receiver.cpp
windows/wddm_engine_mapper.cpp
windows/wddm.cpp
)
else(WIN32)
set(GEN_OS_SRC
linux/command_stream_receiver.cpp
linux/drm_engine_mapper.cpp
)
endif(WIN32)
# File names looked up in every enabled gen<N>/ directory. The GEN loop further
# down prefixes each entry with "gen<N>/" and only adds the files that actually
# exist, so not every GEN has to provide every file.
set(RUNTIME_SRCS_GENX
aub_command_stream_receiver.cpp
aub_mapper.h
aub_mem_dump.cpp
command_queue.cpp
device_enqueue.h
device_queue.cpp
command_stream_receiver_hw.cpp
hw_cmds.h
hw_cmds_generated.h
hw_helper.cpp
hw_info.cpp
hw_info.h
buffer.cpp
image.cpp
kernel_commands.cpp
preamble.cpp
preemption.cpp
reg_configs.h
sampler.cpp
scheduler_definitions.h
scheduler_igdrcl_built_in.inl
state_base_address.cpp
tbx_command_stream_receiver.cpp
${GEN_OS_SRC}
)
# Add the helper sub-projects only if their targets do not exist yet, so this
# file can be processed when another directory has already created them.
if(NOT (TARGET ${BIKSIM_LIB_NAME}))
add_subdirectory(builtin_kernels_simulation)
endif(NOT (TARGET ${BIKSIM_LIB_NAME}))
if(NOT (TARGET ${SCHEDULER_BINARY_LIB_NAME}))
add_subdirectory("scheduler")
endif(NOT (TARGET ${SCHEDULER_BINARY_LIB_NAME}))
if(NOT (TARGET ${BUILTINS_BINARIES_LIB_NAME}))
add_subdirectory("built_ins")
endif(NOT (TARGET ${BUILTINS_BINARIES_LIB_NAME}))
# These two are added unconditionally. RUNTIME_SRCS_API and
# RUNTIME_SRCS_ACCELERATORS are used later in this file but never set here, so
# they are presumably exported by these subdirectories via PARENT_SCOPE.
add_subdirectory(api)
add_subdirectory(accelerators)
# Per-component source lists. All paths are relative to this directory; the
# lists are merged into RUNTIME_SRCS further down.
# AUB memory-dump sources.
set (RUNTIME_SRCS_AUB_MEM_DUMP
aub_mem_dump/aub_mem_dump.cpp
aub_mem_dump/aub_mem_dump.h
aub_mem_dump/aub_mem_dump.inl
aub_mem_dump/aub_header.h
aub_mem_dump/aub_services.h
)
# Built-in kernel infrastructure (host-side code).
set (RUNTIME_SRCS_BUILT_INS
built_ins/built_ins_storage.cpp
built_ins/built_ins.cpp
built_ins/built_ins.h
built_ins/sip.cpp
built_ins/sip.h
built_ins/vme_dispatch_builder.h
)
# Built-in kernel source files (*.igdrcl_built_in).
set (RUNTIME_SRCS_BUILT_IN_KERNELS
built_ins/kernels/copy_buffer_rect.igdrcl_built_in
built_ins/kernels/copy_buffer_to_buffer.igdrcl_built_in
built_ins/kernels/copy_buffer_to_image3d.igdrcl_built_in
built_ins/kernels/copy_image3d_to_buffer.igdrcl_built_in
built_ins/kernels/copy_image_to_image1d.igdrcl_built_in
built_ins/kernels/copy_image_to_image2d.igdrcl_built_in
built_ins/kernels/copy_image_to_image3d.igdrcl_built_in
built_ins/kernels/fill_buffer.igdrcl_built_in
built_ins/kernels/fill_image1d.igdrcl_built_in
built_ins/kernels/fill_image2d.igdrcl_built_in
built_ins/kernels/fill_image3d.igdrcl_built_in
built_ins/kernels/vme_block_motion_estimate_intel.igdrcl_built_in
built_ins/kernels/vme_block_advanced_motion_estimate_check_intel.igdrcl_built_in
built_ins/kernels/vme_block_advanced_motion_estimate_bidirectional_check_intel.igdrcl_built_in
built_ins/kernels/vebox_ve_enhance_intel.igdrcl_built_in
built_ins/kernels/vebox_ve_dn_enhance_intel.igdrcl_built_in
built_ins/kernels/vebox_ve_dn_di_enhance_intel.igdrcl_built_in
)
# Command-definition glue header.
set (RUNTIME_SRCS_COMMANDS
commands/bxml_generator_glue.h
)
# Command queue sources. local_id_gen_avx2.cpp and local_id_gen_sse4.cpp get
# dedicated instruction-set compile flags later in this file.
set (RUNTIME_SRCS_COMMAND_QUEUE
command_queue/cpu_data_transfer_handler.h
command_queue/command_queue.cpp
command_queue/command_queue.h
command_queue/command_queue_hw.h
command_queue/command_queue_hw.inl
command_queue/dispatch_walker.h
command_queue/dispatch_walker_helper.h
command_queue/dispatch_walker_helper.inl
command_queue/enqueue_barrier.h
command_queue/enqueue_common.h
command_queue/enqueue_copy_buffer.h
command_queue/enqueue_copy_buffer_rect.h
command_queue/enqueue_copy_buffer_to_image.h
command_queue/enqueue_copy_image.h
command_queue/enqueue_copy_image_to_buffer.h
command_queue/enqueue_fill_buffer.h
command_queue/enqueue_fill_image.h
command_queue/enqueue_kernel.h
command_queue/enqueue_map_buffer.h
command_queue/enqueue_map_image.h
command_queue/enqueue_svm.h
command_queue/enqueue_marker.h
command_queue/enqueue_migrate_mem_objects.h
command_queue/enqueue_read_buffer.h
command_queue/enqueue_read_buffer_rect.h
command_queue/enqueue_read_image.h
command_queue/enqueue_write_buffer.h
command_queue/enqueue_write_buffer_rect.h
command_queue/enqueue_write_image.h
command_queue/finish.h
command_queue/flush.h
command_queue/local_id_gen.cpp
command_queue/local_id_gen_avx2.cpp
command_queue/local_id_gen_sse4.cpp
command_queue/local_id_gen.h
command_queue/local_id_gen.inl
command_queue/local_work_size.cpp
)
# Command stream sources: command stream receivers (AUB, TBX and the common
# base), linear stream, submissions aggregator and preemption.
set (RUNTIME_SRCS_COMMAND_STREAM
command_stream/aub_command_stream_receiver.cpp
command_stream/aub_command_stream_receiver.h
command_stream/aub_command_stream_receiver_hw.h
command_stream/aub_command_stream_receiver_hw.inl
command_stream/command_stream_receiver.cpp
command_stream/command_stream_receiver.h
command_stream/command_stream_receiver_hw.h
command_stream/command_stream_receiver_hw.inl
command_stream/csr_definitions.h
command_stream/device_command_stream.h
command_stream/linear_stream.cpp
command_stream/linear_stream.h
command_stream/submissions_aggregator.cpp
command_stream/submissions_aggregator.h
command_stream/tbx_command_stream_receiver.cpp
command_stream/tbx_command_stream_receiver.h
command_stream/tbx_command_stream_receiver_hw.h
command_stream/tbx_command_stream_receiver_hw.inl
command_stream/tbx_stream.cpp
command_stream/thread_arbitration_policy.h
command_stream/preemption.h
command_stream/preemption.cpp
)
# Compiler interface sources.
set (RUNTIME_SRCS_COMPILER_INTERFACE
compiler_interface/binary_cache.cpp
compiler_interface/compiler_interface.cpp
compiler_interface/compiler_interface.h
compiler_interface/compiler_interface.inl
compiler_interface/create_main.cpp
)
# Context sources.
set (RUNTIME_SRCS_CONTEXT
context/context.cpp
context/context.h
context/context.inl
context/driver_diagnostics.cpp
context/driver_diagnostics.h
)
# Device sources. device/device_caps.cpp receives the NEO_DRIVER_VERSION
# compile definition later in this file when that variable is set.
set (RUNTIME_SRCS_DEVICE
device/device.cpp
device/device.h
device/device_caps.cpp
device/device_info.cpp
device/device_info.h
device/device_info_map.h
device/device_vector.h
)
# Device queue sources.
set (RUNTIME_SRCS_DEVICE_QUEUE
device_queue/device_queue.cpp
device_queue/device_queue.h
device_queue/device_queue_hw.h
device_queue/device_queue_hw.inl
device_queue/device_queue_hw_profiling.inl
)
# Event sources.
set (RUNTIME_SRCS_EVENT
event/async_events_handler.h
event/async_events_handler.cpp
event/event.cpp
event/event.h
event/event_builder.cpp
event/event_builder.h
event/event_registry.cpp
event/event_registry.h
event/user_event.cpp
event/user_event.h
event/hw_timestamps.h
event/perf_counter.h
)
# Execution model header.
set (RUNTIME_SRCS_EXECUTION_MODEL
execution_model/device_enqueue.h
)
# GTPin sources are only listed when GTPIN_HEADERS_DIR is set; otherwise
# RUNTIME_SRCS_GTPIN stays undefined and expands to nothing in RUNTIME_SRCS.
if(GTPIN_HEADERS_DIR)
set (RUNTIME_SRCS_GTPIN
gtpin/gtpin_init.cpp
gtpin/gtpin_init.h
gtpin/gtpin_helpers.cpp
gtpin/gtpin_helpers.h
)
endif(GTPIN_HEADERS_DIR)
# Helper sources shared across the runtime. helpers/abort.cpp and
# helpers/debug_helpers.cpp are intentionally absent here: they are compiled
# into the shared library through RUNTIME_SRCS_DLL instead.
set (RUNTIME_SRCS_HELPERS
helpers/abort.h
helpers/aligned_memory.h
helpers/array_count.h
helpers/base_object.cpp
helpers/base_object.h
helpers/base_object_allocator.cpp
helpers/basic_math.h
helpers/cache_policy.cpp
helpers/cache_policy.h
helpers/dirty_state_helpers.h
helpers/dirty_state_helpers.cpp
helpers/dispatch_info.h
helpers/dispatch_info.cpp
helpers/dispatch_info_builder.h
helpers/completion_stamp.h
helpers/debug_helpers.h
helpers/engine_node.h
helpers/error_mappers.h
helpers/file_io.cpp
helpers/file_io.h
helpers/flush_stamp.h
helpers/flush_stamp.cpp
helpers/get_info.h
helpers/hash.h
helpers/hw_helper.cpp
helpers/hw_helper.h
helpers/hw_helper.inl
helpers/hw_info.cpp
helpers/hw_info.h
helpers/kernel_commands.h
helpers/kernel_commands.inl
helpers/options.cpp
helpers/options.h
helpers/per_thread_data.cpp
helpers/per_thread_data.h
helpers/preamble.h
helpers/preamble.inl
helpers/ptr_math.h
helpers/queue_helpers.h
helpers/sampler_helpers.h
helpers/selectors.h
helpers/state_base_address.h
helpers/state_base_address.inl
helpers/stdio.h
helpers/string.h
helpers/string_helpers.h
helpers/surface_formats.cpp
helpers/surface_formats.h
helpers/task_information.cpp
helpers/task_information.h
helpers/uint16_avx2.h
helpers/uint16_sse4.h
helpers/wddm_helper.h
helpers/validators.cpp
helpers/validators.h
)
# Windows-only helper header.
if (WIN32)
list (APPEND RUNTIME_SRCS_HELPERS
helpers/translationtable_callbacks.h
)
endif(WIN32)
# Indirect heap sources.
set (RUNTIME_SRCS_INDIRECT_HEAP
indirect_heap/indirect_heap.cpp
indirect_heap/indirect_heap.h
)
# Instrumentation sources.
set (RUNTIME_SRCS_INSTRUMENTATION
instrumentation/instrumentation.cpp
instrumentation/instrumentation.h
)
# Kernel object sources.
set (RUNTIME_SRCS_KERNEL
kernel/dynamic_kernel_info.h
kernel/kernel.cpp
kernel/kernel.h
kernel/kernel.inl
)
# OS-independent memory manager sources. The OS-specific memory managers
# (wddm_memory_manager, drm_memory_manager) are listed with the OS interface
# sources instead.
set (RUNTIME_SRCS_MEMORY_MANAGER
memory_manager/deferrable_deletion.h
memory_manager/deferred_deleter.cpp
memory_manager/deferred_deleter.h
memory_manager/graphics_allocation.h
memory_manager/graphics_allocation.cpp
memory_manager/host_ptr_defines.h
memory_manager/host_ptr_manager.h
memory_manager/host_ptr_manager.cpp
memory_manager/memory_manager.cpp
memory_manager/memory_manager.h
memory_manager/svm_memory_manager.cpp
memory_manager/svm_memory_manager.h
memory_manager/os_agnostic_memory_manager.cpp
memory_manager/os_agnostic_memory_manager.h
memory_manager/page_table.cpp
memory_manager/page_table.h
memory_manager/address_mapper.cpp
memory_manager/address_mapper.h
memory_manager/surface.h
)
# GMM helper sources. gmm_helper/resource_info.cpp is not listed here; it is
# compiled into the shared library through RUNTIME_SRCS_DLL.
set (RUNTIME_SRCS_GMM_HELPER
gmm_helper/gmm_helper.cpp
gmm_helper/gmm_helper.h
gmm_helper/gmm_lib.h
gmm_helper/resource_info.h
)
# Windows-only GMM headers; the matching .cpp files are added to
# RUNTIME_SRCS_DLL on Windows.
if (WIN32)
list (APPEND RUNTIME_SRCS_GMM_HELPER
gmm_helper/page_table_mngr.h
gmm_helper/gmm_memory.h
)
endif(WIN32)
# Memory object sources (buffer, image, pipe and their common base).
set (RUNTIME_SRCS_MEM_OBJ
mem_obj/buffer.cpp
mem_obj/buffer.h
mem_obj/buffer.inl
mem_obj/image.cpp
mem_obj/image.h
mem_obj/image.inl
mem_obj/mem_obj.cpp
mem_obj/mem_obj.h
mem_obj/buffer_factory_init.inl
mem_obj/image_factory_init.inl
mem_obj/pipe.h
mem_obj/pipe.cpp
)
# OS-independent part of the OS interface. The linux_inc.h and windows_inc.h
# headers are both listed regardless of the target OS; the OS-specific sources
# are appended further down under if (WIN32) / if (UNIX).
set (RUNTIME_SRCS_OS_INTERFACE
os_interface/32bit_memory.h
os_interface/os_library.h
os_interface/linux/linux_inc.h
os_interface/windows/windows_inc.h
os_interface/device_factory.h
os_interface/os_inc.h
os_interface/os_interface.h
os_interface/os_time.h
os_interface/os_time.cpp
os_interface/debug_settings_manager.cpp
os_interface/debug_settings_manager.h
os_interface/performance_counters.cpp
os_interface/performance_counters.h
os_interface/print.h
)
# Platform sources.
set (RUNTIME_SRCS_PLATFORM
platform/platform.cpp
platform/platform.h
platform/platform_info.h
)
# Program sources. program/evaluate_unhandled_token.cpp is not listed here; it
# is compiled into the shared library through RUNTIME_SRCS_DLL.
set (RUNTIME_SRCS_PROGRAM
program/block_kernel_manager.cpp
program/block_kernel_manager.h
program/build.cpp
program/compile.cpp
program/create.cpp
program/get_info.cpp
program/heap_info.h
program/kernel_arg_info.h
program/kernel_info.cpp
program/kernel_info.h
program/link.cpp
program/patch_info.h
program/process_elf_binary.cpp
program/process_spir_binary.cpp
program/process_gen_binary.cpp
program/program.cpp
program/program.h
program/printf_handler.h
program/printf_handler.cpp
program/print_formatter.h
program/print_formatter.cpp
)
# Sampler sources.
set (RUNTIME_SRCS_SAMPLER
sampler/sampler.cpp
sampler/sampler.h
sampler/sampler.inl
sampler/sampler_factory_init.inl
)
# Scheduler sources. This list is appended to rather than set, unlike its
# neighbours.
# NOTE(review): presumably the "scheduler" subdirectory may already have
# exported entries into RUNTIME_SRCS_SCHEDULER -- confirm before changing this
# to set().
list (APPEND RUNTIME_SRCS_SCHEDULER
scheduler/scheduler_kernel.cpp
scheduler/scheduler_kernel.h
scheduler/CMakeLists.txt
)
# Base sharing sources. add_subdirectory("sharings") is called later in this
# file, after this list has been set.
set (RUNTIME_SRCS_SHARINGS
sharings/sharing.h
sharings/sharing.cpp
)
# TBX sources.
set (RUNTIME_SRCS_TBX
tbx/tbx_proto.h
tbx/tbx_sockets.cpp
tbx/tbx_sockets.h
)
# OS-independent utility sources; the OS-specific implementations of directory,
# timer_util and cpu_info are appended right below.
set (RUNTIME_SRCS_UTILITIES
utilities/api_intercept.h
utilities/arrayref.h
utilities/cpu_info.h
utilities/debug_file_reader.cpp
utilities/debug_file_reader.h
utilities/debug_settings_reader.cpp
utilities/debug_settings_reader.h
utilities/directory.h
utilities/heap_allocator.cpp
utilities/heap_allocator.h
utilities/iflist.h
utilities/idlist.h
utilities/stackvec.h
utilities/perf_profiler.cpp
utilities/perf_profiler.h
utilities/reference_tracked_object.h
utilities/tag_allocator.h
utilities/timer_util.h
utilities/vec.h
)
# Headers shared by all GENs; the gen_common directory is also put on the
# include path through HW_SRC_INCLUDES.
set (RUNTIME_SRCS_GEN_COMMON
gen_common/aub_mapper.h
gen_common/aub_mapper_base.h
gen_common/hw_cmds.h
gen_common/reg_configs.h
)
# OS-specific implementations of the utilities declared in utilities/*.h.
# The same three files exist for both branches; anything that is not WIN32
# gets the linux/ variants.
if (WIN32)
list (APPEND RUNTIME_SRCS_UTILITIES
utilities/windows/directory.cpp
utilities/windows/timer_util.cpp
utilities/windows/cpu_info.cpp
)
else(WIN32)
list (APPEND RUNTIME_SRCS_UTILITIES
utilities/linux/directory.cpp
utilities/linux/timer_util.cpp
utilities/linux/cpu_info.cpp
)
endif (WIN32)
# Windows implementation of the OS interface (WDDM based).
# os_interface/windows/wddm_create.cpp is not listed here; it is compiled into
# the shared library through RUNTIME_SRCS_DLL.
if (WIN32)
list (APPEND RUNTIME_SRCS_OS_INTERFACE
os_interface/windows/api.cpp
os_interface/windows/d3d_sharing_functions.h
os_interface/windows/d3d9_sharing_functions.cpp
os_interface/windows/d3d10_11_sharing_functions.cpp
os_interface/windows/debug_registry_reader.cpp
os_interface/windows/deferrable_deletion_win.cpp
os_interface/windows/deferrable_deletion_win.h
os_interface/windows/device_command_stream.inl
os_interface/windows/device_factory.cpp
os_interface/windows/gdi_interface.cpp
os_interface/windows/gdi_interface.h
os_interface/windows/options.cpp
os_interface/windows/os_interface.cpp
os_interface/windows/os_interface.h
os_interface/windows/os_library.cpp
os_interface/windows/os_library.h
os_interface/windows/os_time.cpp
os_interface/windows/os_time.h
os_interface/windows/registry_reader.h
os_interface/windows/thk_wrapper.h
os_interface/windows/wddm.cpp
os_interface/windows/wddm.h
os_interface/windows/wddm.inl
os_interface/windows/wddm_32bit_memory.cpp
os_interface/windows/wddm_allocation.h
os_interface/windows/wddm_device_command_stream.inl
os_interface/windows/wddm_device_command_stream.h
os_interface/windows/wddm_engine_mapper.h
os_interface/windows/wddm_memory_manager.cpp
os_interface/windows/wddm_memory_manager.h
os_interface/windows/windows_inc.cpp
os_interface/windows/windows_wrapper.h
os_interface/windows/performance_counters_win.cpp
os_interface/windows/performance_counters_win.h
os_interface/windows/print.cpp
os_interface/windows/driver_info.h
os_interface/windows/driver_info.cpp
)
# 32-bit Windows builds link shared libraries with /SAFESEH:NO. This changes
# the directory-wide CMAKE_SHARED_LINKER_FLAGS, so it affects every shared
# library created from this directory downwards.
if ("${IGDRCL_OPTION__BITS}" STREQUAL "32" )
set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /SAFESEH:NO")
endif ("${IGDRCL_OPTION__BITS}" STREQUAL "32")
endif (WIN32)
# UNIX implementation of the OS interface (DRM based).
# Note that this branch tests UNIX while the utilities list above uses
# else(WIN32); the two conditions are not strictly complementary.
if (UNIX)
list (APPEND RUNTIME_SRCS_OS_INTERFACE
os_interface/linux/api.cpp
os_interface/linux/d3d_sharing_functions.h
os_interface/linux/debug_env_reader.cpp
os_interface/linux/device_command_stream.inl
os_interface/linux/device_factory.cpp
os_interface/linux/drm_32bit_memory.cpp
os_interface/linux/drm_allocation.h
os_interface/linux/drm_buffer_object.cpp
os_interface/linux/drm_buffer_object.h
os_interface/linux/drm_command_stream.inl
os_interface/linux/drm_command_stream.h
os_interface/linux/drm_engine_mapper.h
os_interface/linux/drm_null_device.h
os_interface/linux/drm_gem_close_worker.cpp
os_interface/linux/drm_gem_close_worker.h
os_interface/linux/drm_memory_manager.cpp
os_interface/linux/drm_memory_manager.h
os_interface/linux/drm_neo.cpp
os_interface/linux/drm_neo.h
os_interface/linux/drm_neo_create.cpp
os_interface/linux/hw_info_config.cpp
os_interface/linux/hw_info_config.h
os_interface/linux/linux_inc.cpp
os_interface/linux/options.cpp
os_interface/linux/os_interface.cpp
os_interface/linux/os_interface.h
os_interface/linux/os_library.cpp
os_interface/linux/os_library.h
os_interface/linux/os_time.cpp
os_interface/linux/os_time.h
os_interface/linux/performance_counters_linux.cpp
os_interface/linux/performance_counters_linux.h
os_interface/linux/print.cpp
os_interface/linux/driver_info.cpp
)
endif (UNIX)
# Process the sharings subdirectory before the aggregate list is built.
# NOTE(review): presumably it extends RUNTIME_SRCS_SHARINGS via PARENT_SCOPE
# and creates the SHARINGS_ENABLE_LIB_NAME target used by the shared library
# below -- confirm.
add_subdirectory("sharings")
# Aggregate of all per-component lists; this is the source set of the static
# runtime library. Lists that were never set (e.g. RUNTIME_SRCS_GTPIN without
# GTPIN_HEADERS_DIR) expand to nothing. The per-GEN sources are appended to
# this list by the GEN loop further down.
set (RUNTIME_SRCS
${RUNTIME_SRCS_API}
${RUNTIME_SRCS_ACCELERATORS}
${RUNTIME_SRCS_AUB_MEM_DUMP}
${RUNTIME_SRCS_BUILT_INS}
${RUNTIME_SRCS_BUILT_IN_KERNELS}
${RUNTIME_SRCS_COMMANDS}
${RUNTIME_SRCS_COMMAND_QUEUE}
${RUNTIME_SRCS_COMMAND_STREAM}
${RUNTIME_SRCS_COMPILER_INTERFACE}
${RUNTIME_SRCS_CONTEXT}
${RUNTIME_SRCS_DEVICE}
${RUNTIME_SRCS_DEVICE_QUEUE}
${RUNTIME_SRCS_EVENT}
${RUNTIME_SRCS_EXECUTION_MODEL}
${RUNTIME_SRCS_GEN_COMMON}
${RUNTIME_SRCS_GTPIN}
${RUNTIME_SRCS_HELPERS}
${RUNTIME_SRCS_INDIRECT_HEAP}
${RUNTIME_SRCS_INSTRUMENTATION}
${RUNTIME_SRCS_KERNEL}
${RUNTIME_SRCS_MEMORY_MANAGER}
${RUNTIME_SRCS_GMM_HELPER}
${RUNTIME_SRCS_MEM_OBJ}
${RUNTIME_SRCS_OS_INTERFACE}
${RUNTIME_SRCS_PLATFORM}
${RUNTIME_SRCS_PROGRAM}
${RUNTIME_SRCS_SAMPLER}
${RUNTIME_SRCS_SCHEDULER}
${RUNTIME_SRCS_SHARINGS}
${RUNTIME_SRCS_TBX}
${RUNTIME_SRCS_UTILITIES}
CMakeLists.txt
)
# Enable SSE4/AVX2 options for files that need them
# Only these translation units are built with the wider instruction sets; the
# rest of the runtime keeps the default flags. Under MSVC only the AVX2 file
# gets an extra flag; the SSE4 file is compiled with the default options.
if(MSVC)
set_source_files_properties(command_queue/local_id_gen_avx2.cpp PROPERTIES COMPILE_FLAGS /arch:AVX2)
else()
set_source_files_properties(command_queue/local_id_gen_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
set_source_files_properties(command_queue/local_id_gen_sse4.cpp PROPERTIES COMPILE_FLAGS -msse4.2)
endif (MSVC)
# Put Driver version into define
# The value is wrapped in double quotes so the macro expands to a string
# literal; only device_caps.cpp sees the definition.
if(NEO_DRIVER_VERSION)
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/device/device_caps.cpp PROPERTIES COMPILE_DEFINITIONS NEO_DRIVER_VERSION="${NEO_DRIVER_VERSION}")
endif(NEO_DRIVER_VERSION)
# gen_common goes first on the hardware include list; each enabled gen<N>
# directory is appended by the loop below.
list (APPEND HW_SRC_INCLUDES ${CMAKE_CURRENT_SOURCE_DIR}/gen_common)
# Include/enable each GEN
# Reverse order so that GEN N+1 includes GEN N
# The loop counts down from MAX_GEN to 0. GEN_CONTAINS_PLATFORMS and
# GET_PLATFORMS_FOR_GEN are not defined in this file; they must be provided by
# an including or parent CMake script.
# For every GEN that has at least one supported platform the loop builds two
# lists:
#   RUNTIME_SRCS_GEN<N> - sources for the static library; entries are only
#                         added to RUNTIME_SRCS when the file exists on disk.
#   GEN<N>_SRC_LINK     - "enable" sources collected into HW_SRC_LINK for the
#                         shared library; these are NOT existence-checked.
foreach(GEN_NUM RANGE ${MAX_GEN} 0 -1)
GEN_CONTAINS_PLATFORMS("SUPPORTED" ${GEN_NUM} GENX_HAS_PLATFORMS)
if(${GENX_HAS_PLATFORMS})
# Add GEN-specific files
# RUNTIME_SRCS_GEN<N>_SPECIFIC is optional; it is not set in this file.
set(RUNTIME_SRCS_GEN${GEN_NUM} ${RUNTIME_SRCS_GEN${GEN_NUM}_SPECIFIC})
# Add default GEN files
foreach(SRC_IT ${RUNTIME_SRCS_GENX})
list (APPEND RUNTIME_SRCS_GEN${GEN_NUM} gen${GEN_NUM}/${SRC_IT})
endforeach(SRC_IT)
# Get all supported platforms for this GEN
GET_PLATFORMS_FOR_GEN("SUPPORTED" ${GEN_NUM} SUPPORTED_GENX_PLATFORMS)
# Add platform-specific files
# File names use the lower-cased platform name; the optional
# RUNTIME_SRCS_GEN<N>_<PLATFORM> variable uses the name as returned.
foreach(PLATFORM_IT ${SUPPORTED_GENX_PLATFORMS})
string(TOLOWER ${PLATFORM_IT} PLATFORM_IT_LOWER)
list(APPEND RUNTIME_SRCS_GEN${GEN_NUM} gen${GEN_NUM}/hw_cmds_${PLATFORM_IT_LOWER}.h)
list(APPEND RUNTIME_SRCS_GEN${GEN_NUM} gen${GEN_NUM}/hw_info_${PLATFORM_IT_LOWER}.cpp)
list(APPEND RUNTIME_SRCS_GEN${GEN_NUM} ${RUNTIME_SRCS_GEN${GEN_NUM}_${PLATFORM_IT}})
if(UNIX)
list(APPEND RUNTIME_SRCS_GEN${GEN_NUM} gen${GEN_NUM}/linux/hw_info_config_${PLATFORM_IT_LOWER}.cpp)
endif(UNIX)
# Enable platform
list(APPEND GEN${GEN_NUM}_SRC_LINK gen${GEN_NUM}/enable_${PLATFORM_IT_LOWER}.cpp)
if(UNIX)
list(APPEND GEN${GEN_NUM}_SRC_LINK gen${GEN_NUM}/linux/enable_${PLATFORM_IT_LOWER}.cpp)
endif(UNIX)
endforeach(PLATFORM_IT)
list(APPEND GEN${GEN_NUM}_SRC_LINK gen${GEN_NUM}/enable_family_full.cpp)
# Append this GEN's sources to the list of all sources
# Files that do not exist are skipped silently, so a misspelled entry in
# RUNTIME_SRCS_GENX produces no configure-time error.
foreach(SRC_IT ${RUNTIME_SRCS_GEN${GEN_NUM}})
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/${SRC_IT}")
list(APPEND RUNTIME_SRCS ${SRC_IT})
endif(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/${SRC_IT}")
endforeach(SRC_IT)
# Set-up gen include dir and sources for the dll
list(APPEND HW_SRC_INCLUDES ${CMAKE_CURRENT_SOURCE_DIR}/gen${GEN_NUM})
list(APPEND HW_SRC_LINK ${GEN${GEN_NUM}_SRC_LINK})
source_group("gen${GEN_NUM}" FILES ${RUNTIME_SRCS_GEN${GEN_NUM}} ${GEN${GEN_NUM}_SRC_LINK} )
endif(${GENX_HAS_PLATFORMS})
endforeach(GEN_NUM)
# Static runtime library: all runtime sources plus the object files of the
# built-in kernel simulation object library.
add_library(${NEO_STATIC_LIB_NAME} STATIC $<TARGET_OBJECTS:${BIKSIM_LIB_NAME}>
${RUNTIME_SRCS}
)
target_link_libraries(${NEO_STATIC_LIB_NAME} elflib)
# PRIVATE include paths are used only while compiling this library.
target_include_directories(${NEO_STATIC_LIB_NAME} PRIVATE
${CMAKE_CURRENT_BINARY_DIR}
${GMM_INCLUDE_PATHS}
${UMKM_SHAREDDATA_INCLUDE_PATHS}
)
# PUBLIC include paths are also propagated to targets that link this library.
target_include_directories(${NEO_STATIC_LIB_NAME} PUBLIC
${KHRONOS_HEADERS_DIR}
${IGDRCL__IGC_INCLUDE_DIR}
${THIRD_PARTY_DIR}
)
if(GTPIN_HEADERS_DIR)
target_include_directories(${NEO_STATIC_LIB_NAME} PUBLIC
${GTPIN_HEADERS_DIR}
)
endif(GTPIN_HEADERS_DIR)
# OS-specific include paths and definitions.
if (WIN32)
target_include_directories(${NEO_STATIC_LIB_NAME} PUBLIC
${WDK_INCLUDE_PATHS}
os_interface/windows
)
target_compile_definitions(${NEO_STATIC_LIB_NAME} PUBLIC OGL=1)
target_compile_definitions(${NEO_STATIC_LIB_NAME} PUBLIC INSTR_WIN_UMD=1)
endif (WIN32)
if (UNIX)
target_compile_definitions(${NEO_STATIC_LIB_NAME} PUBLIC OGL_GEM=1)
target_include_directories(${NEO_STATIC_LIB_NAME} PUBLIC
os_interface/linux
"${LIBDRM_DIR}/include"
)
endif (UNIX)
#cl_khr_priority support
# Defined for every non-MSVC compiler (not only on UNIX).
if(NOT MSVC)
target_compile_definitions(${NEO_STATIC_LIB_NAME} PUBLIC -DSUPPORT_PRIORITY_HINTS)
message(STATUS "Supporting priority hints")
endif()
target_compile_definitions(${NEO_STATIC_LIB_NAME} PUBLIC DEFAULT_PLATFORM=${DEFAULT_SUPPORTED_PLATFORM})
# link_directories only affects targets created after this call, i.e. the
# shared library below, not the static library created above.
link_directories(${GMM_LIB_PATHS})
# Default name of the GMM library; a cache entry so it can be overridden.
if(NOT GMMUMD_LIB_NAME)
set(GMMUMD_LIB_NAME "gmm_umd" CACHE STRING "name of gmm static library")
endif()
target_link_libraries(${NEO_STATIC_LIB_NAME} ${GMMUMD_LIB_NAME})
# Pick the Windows module-definition file matching the target bitness.
# This runs on every platform, and DEF_FILE is later added to RUNTIME_SRCS_DLL
# unconditionally, so the file is listed as a source on non-Windows builds too.
if( "${IGDRCL_OPTION__BITS}" STREQUAL "32" )
set( DEF_FILE "${CMAKE_CURRENT_SOURCE_DIR}/dll/windows/OpenCLExports32.def" )
else( "${IGDRCL_OPTION__BITS}" STREQUAL "32" )
set( DEF_FILE "${CMAKE_CURRENT_SOURCE_DIR}/dll/windows/OpenCLExports64.def" )
endif( "${IGDRCL_OPTION__BITS}" STREQUAL "32" )
# Compile definitions shared with dependants of the static library.
# NOTE(review): SUPPORTED_GEN_FLAGS_DEFINITONS is spelled without the second
# "I"; the name must match wherever the variable is set, so confirm before
# "fixing" the spelling here.
list(APPEND LIB_FLAGS_DEFINITIONS -DCIF_HEADERS_ONLY_BUILD ${SUPPORTED_GEN_FLAGS_DEFINITONS})
target_compile_definitions(${NEO_STATIC_LIB_NAME} PUBLIC ${LIB_FLAGS_DEFINITIONS})
# Optional extra include directories, added only when the variable is set.
if(IGC_OCL_ADAPTOR_DIR) # IGC/AdaptorOCL
target_include_directories("${NEO_STATIC_LIB_NAME}" PUBLIC "${IGC_OCL_ADAPTOR_DIR}")
endif(IGC_OCL_ADAPTOR_DIR)
if(CIF_BASE_DIR)
target_include_directories("${NEO_STATIC_LIB_NAME}" PUBLIC "${CIF_BASE_DIR}")
endif(CIF_BASE_DIR)
# Publish the definitions list in the parent scope under the IGDRCL_ name.
set(IGDRCL_LIB_FLAGS_DEFINITIONS ${LIB_FLAGS_DEFINITIONS} PARENT_SCOPE)
set_target_properties(${NEO_STATIC_LIB_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
# COMPILE_FLAGS is a single string and APPEND_STRING inserts no separator.
# The sanitizer flags are therefore passed as ONE quoted, space-separated
# value: two separate arguments would be joined with ";" and anything already
# stored in COMPILE_FLAGS would be glued to the first flag.
set_property(TARGET ${NEO_STATIC_LIB_NAME} APPEND_STRING PROPERTY COMPILE_FLAGS " ${ASAN_FLAGS} ${TSAN_FLAGS}")
# IDE folder used to group the target.
set_target_properties(${NEO_STATIC_LIB_NAME} PROPERTIES FOLDER "opencl runtime")
# gen_common and gen<N> directories go in front of the other include paths.
target_include_directories(${NEO_STATIC_LIB_NAME} BEFORE PRIVATE ${HW_SRC_INCLUDES})
# Shared runtime library, built only when GENERATE_EXECUTABLE is true.
# It consists of a small set of sources that are deliberately kept out of the
# static library, the per-GEN "enable" sources, several object libraries, and
# it links the static library for everything else.
if(${GENERATE_EXECUTABLE})
set (RUNTIME_SRCS_DLL
dll/options.cpp
dll/create_command_stream.cpp
dll/create_deferred_deleter.cpp
helpers/abort.cpp
helpers/debug_helpers.cpp
gmm_helper/resource_info.cpp
program/evaluate_unhandled_token.cpp
"${DEF_FILE}"
)
# Per-GEN enable_*.cpp files collected by the GEN loop.
list (APPEND RUNTIME_SRCS_DLL ${HW_SRC_LINK})
if (UNIX)
list (APPEND RUNTIME_SRCS_DLL dll/linux/drm_neo_create.cpp)
endif (UNIX)
if (WIN32)
list (APPEND RUNTIME_SRCS_DLL os_interface/windows/wddm_create.cpp)
list (APPEND RUNTIME_SRCS_DLL gmm_helper/page_table_mngr.cpp)
list (APPEND RUNTIME_SRCS_DLL gmm_helper/gmm_memory.cpp)
endif (WIN32)
list (APPEND RUNTIME_SRCS_DLL api/api.cpp)
# NOTE(review): gtpin/gtpin_init.cpp is also part of RUNTIME_SRCS_GTPIN, so with
# GTPIN_HEADERS_DIR set it is compiled into both the static and the shared
# library -- confirm this duplication is intended.
if(GTPIN_HEADERS_DIR)
list (APPEND RUNTIME_SRCS_DLL gtpin/gtpin_init.cpp)
endif(GTPIN_HEADERS_DIR)
add_library(${NEO_DYNAMIC_LIB_NAME} SHARED
${RUNTIME_SRCS_DLL}
$<TARGET_OBJECTS:${SHARINGS_ENABLE_LIB_NAME}>
$<TARGET_OBJECTS:${BUILTINS_SOURCES_LIB_NAME}>
$<TARGET_OBJECTS:${BUILTINS_BINARIES_LIB_NAME}>
$<TARGET_OBJECTS:${SCHEDULER_BINARY_LIB_NAME}>
)
target_include_directories(${NEO_DYNAMIC_LIB_NAME} BEFORE PRIVATE
${CMAKE_CURRENT_BINARY_DIR}
${HW_SRC_INCLUDES}
)
target_link_libraries(${NEO_DYNAMIC_LIB_NAME} ${NEO_STATIC_LIB_NAME})
# The WIN32 branch links the static library a second time together with the
# Windows system libraries; the include paths of the two branches differ only
# by WDK_INCLUDE_PATHS.
if (WIN32)
target_include_directories(${NEO_DYNAMIC_LIB_NAME} PUBLIC
${WDK_INCLUDE_PATHS}
${GMM_INCLUDE_PATHS}
${UMKM_SHAREDDATA_INCLUDE_PATHS}
${INSTRUMENTATION_INCLUDE_PATH}
)
target_link_libraries(${NEO_DYNAMIC_LIB_NAME} ${NEO_STATIC_LIB_NAME} dxgi Ws2_32.lib)
else(WIN32)
target_include_directories(${NEO_DYNAMIC_LIB_NAME} PUBLIC
${GMM_INCLUDE_PATHS}
${UMKM_SHAREDDATA_INCLUDE_PATHS}
${INSTRUMENTATION_INCLUDE_PATH}
)
endif (WIN32)
# On UNIX the exported symbols are restricted by a linker version script.
# The LINK_FLAGS value starts with a space because APPEND_STRING adds no
# separator of its own.
if (UNIX)
target_link_libraries(${NEO_DYNAMIC_LIB_NAME} dl pthread)
set_property(TARGET ${NEO_DYNAMIC_LIB_NAME}
APPEND_STRING PROPERTY LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/dll/linux/ocl.exports"
)
endif (UNIX)
# Same output file name for every configuration:
# <base name><postfix><bits>.
set_target_properties(${NEO_DYNAMIC_LIB_NAME} PROPERTIES
DEBUG_OUTPUT_NAME "${NEO_DLL_NAME_BASE}${IGDRCL_NAME_POSTFIX}${IGDRCL_OPTION__BITS}"
RELEASE_OUTPUT_NAME "${NEO_DLL_NAME_BASE}${IGDRCL_NAME_POSTFIX}${IGDRCL_OPTION__BITS}"
RELEASE-INTERNAL_OUTPUT_NAME "${NEO_DLL_NAME_BASE}${IGDRCL_NAME_POSTFIX}${IGDRCL_OPTION__BITS}"
OUTPUT_NAME "${NEO_DLL_NAME_BASE}${IGDRCL_NAME_POSTFIX}${IGDRCL_OPTION__BITS}"
)
# NOTE(review): only ASAN_FLAGS is applied here, while the static library also
# receives TSAN_FLAGS -- confirm whether that difference is intended.
set_property(TARGET ${NEO_DYNAMIC_LIB_NAME} APPEND_STRING PROPERTY COMPILE_FLAGS ${ASAN_FLAGS})
set_target_properties(${NEO_DYNAMIC_LIB_NAME} PROPERTIES FOLDER "opencl runtime")
endif(${GENERATE_EXECUTABLE})
# IDE source groups (Windows only): one group per component list, mirroring the
# directory layout. This affects only how files are shown in the IDE, not what
# is compiled.
if (WIN32)
# NOTE(review): IGDRCL_SRCS_DLL is never set in this file (the shared library
# sources are collected in RUNTIME_SRCS_DLL). Unless a parent scope defines it,
# this group is empty -- confirm which variable was intended.
source_group("source files" FILES ${IGDRCL_SRCS_DLL})
source_group("source files\\api" FILES ${RUNTIME_SRCS_API})
source_group("source files\\accelerators" FILES ${RUNTIME_SRCS_ACCELERATORS})
source_group("source files\\aub_mem_dump" FILES ${RUNTIME_SRCS_AUB_MEM_DUMP})
source_group("source files\\built_ins" FILES ${RUNTIME_SRCS_BUILT_INS})
source_group("source files\\built_ins\\kernels" FILES ${RUNTIME_SRCS_BUILT_IN_KERNELS})
source_group("source files\\commands" FILES ${RUNTIME_SRCS_COMMANDS})
source_group("source files\\command_queue" FILES ${RUNTIME_SRCS_COMMAND_QUEUE})
source_group("source files\\command_stream" FILES ${RUNTIME_SRCS_COMMAND_STREAM})
source_group("source files\\compiler_interface" FILES ${RUNTIME_SRCS_COMPILER_INTERFACE})
source_group("source files\\context" FILES ${RUNTIME_SRCS_CONTEXT})
source_group("source files\\device" FILES ${RUNTIME_SRCS_DEVICE})
source_group("source files\\device_queue" FILES ${RUNTIME_SRCS_DEVICE_QUEUE})
source_group("source files\\event" FILES ${RUNTIME_SRCS_EVENT})
source_group("source files\\execution_model" FILES ${RUNTIME_SRCS_EXECUTION_MODEL})
source_group("source files\\gen_common" FILES ${RUNTIME_SRCS_GEN_COMMON})
source_group("source files\\helpers" FILES ${RUNTIME_SRCS_HELPERS})
source_group("source files\\indirect_heap" FILES ${RUNTIME_SRCS_INDIRECT_HEAP})
source_group("source files\\instrumentation" FILES ${RUNTIME_SRCS_INSTRUMENTATION})
source_group("source files\\kernel" FILES ${RUNTIME_SRCS_KERNEL})
source_group("source files\\memory_manager" FILES ${RUNTIME_SRCS_MEMORY_MANAGER})
source_group("source files\\gmm_helper" FILES ${RUNTIME_SRCS_GMM_HELPER})
if(GTPIN_HEADERS_DIR)
source_group("source files\\gtpin" FILES ${RUNTIME_SRCS_GTPIN})
endif(GTPIN_HEADERS_DIR)
source_group("source files\\mem_obj" FILES ${RUNTIME_SRCS_MEM_OBJ})
source_group("source files\\os_interface" FILES ${RUNTIME_SRCS_OS_INTERFACE})
source_group("source files\\platform" FILES ${RUNTIME_SRCS_PLATFORM})
source_group("source files\\program" FILES ${RUNTIME_SRCS_PROGRAM})
source_group("source files\\sampler" FILES ${RUNTIME_SRCS_SAMPLER})
source_group("source files\\scheduler" FILES ${RUNTIME_SRCS_SCHEDULER})
source_group("source files\\sharings" FILES ${RUNTIME_SRCS_SHARINGS})
source_group("source files\\tbx" FILES ${RUNTIME_SRCS_TBX})
source_group("source files\\utilities" FILES ${RUNTIME_SRCS_UTILITIES})
endif (WIN32)
# Optional "clang-tidy" convenience target (UNIX only), created once even if
# this file is processed more than once. Building it runs clang-tidy over every
# .cpp file below this directory, in parallel (one job per CPU reported by
# nproc), using the compilation database in IGDRCL_BINARY_DIR, and copies the
# output to clang-tidy.log in that directory.
if (UNIX)
if(NOT (TARGET clang-tidy))
add_custom_target(clang-tidy
DEPENDS scheduler
)
# NOTE(review): the *.cpp pattern is passed to the shell unquoted, so the shell
# may expand it against WORKING_DIRECTORY before find sees it if that directory
# contains .cpp files -- confirm and quote if needed. The pipeline and the
# backticks also rely on the command being run through a POSIX shell.
add_custom_command(
TARGET clang-tidy
POST_BUILD
COMMAND echo clang-tidy...
COMMAND find ${CMAKE_CURRENT_SOURCE_DIR} -name *.cpp -print0 | xargs -0 -I{} -P`nproc` clang-tidy -p ${IGDRCL_BINARY_DIR} {} | tee ${IGDRCL_BINARY_DIR}/clang-tidy.log
WORKING_DIRECTORY ${IGDRCL_SOURCE_DIR}
)
endif(NOT (TARGET clang-tidy))
endif(UNIX)

View File

@@ -0,0 +1,31 @@
# Copyright (c) 2017, Intel Corporation
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
# We require cmake 3.2.0 or later
cmake_minimum_required(VERSION 3.2.0 FATAL_ERROR)
# Source and header files of the accelerators component, each given as a path
# rooted at this directory. PARENT_SCOPE makes the variable visible in the
# scope above this one instead of the current scope.
set (RUNTIME_SRCS_ACCELERATORS
${CMAKE_CURRENT_SOURCE_DIR}/intel_accelerator.cpp
${CMAKE_CURRENT_SOURCE_DIR}/intel_accelerator.h
${CMAKE_CURRENT_SOURCE_DIR}/intel_motion_estimation.cpp
${CMAKE_CURRENT_SOURCE_DIR}/intel_motion_estimation.h
${CMAKE_CURRENT_SOURCE_DIR}/vebox_accelerator.h
PARENT_SCOPE
)

View File

@@ -0,0 +1,81 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "runtime/accelerators/intel_accelerator.h"
#include "runtime/context/context.h"
#include "runtime/helpers/string.h"
#include "runtime/helpers/get_info.h"
namespace OCLRT {
// Copies the accelerator property selected by paramName into the caller's
// buffer by delegating to the global ::getInfo helper.
//
// paramName         - one of the CL_ACCELERATOR_*_INTEL queries handled below
// paramValueSize    - size in bytes of the buffer behind paramValue
// paramValue        - destination buffer, forwarded to ::getInfo
// paramValueSizeRet - optional; when non-null it receives the size of the
//                     queried property (0 for an unrecognised paramName)
//
// Returns the status reported by ::getInfo for a recognised query, or
// CL_INVALID_VALUE when paramName is not one of the handled queries.
cl_int IntelAccelerator::getInfo(cl_accelerator_info_intel paramName,
                                 size_t paramValueSize,
                                 void *paramValue,
                                 size_t *paramValueSizeRet) const {
    // Unknown queries fall through every branch and keep these defaults.
    cl_int status = CL_INVALID_VALUE;
    size_t valueSize = 0;

    if (paramName == CL_ACCELERATOR_DESCRIPTOR_INTEL) {
        valueSize = getDescriptorSize();
        status = ::getInfo(paramValue, paramValueSize, getDescriptor(), valueSize);
    } else if (paramName == CL_ACCELERATOR_REFERENCE_COUNT_INTEL) {
        auto refCount = getReference();
        valueSize = sizeof(cl_uint);
        status = ::getInfo(paramValue, paramValueSize, &refCount, valueSize);
    } else if (paramName == CL_ACCELERATOR_CONTEXT_INTEL) {
        valueSize = sizeof(cl_context);
        cl_context contextHandle = static_cast<cl_context>(pContext);
        status = ::getInfo(paramValue, paramValueSize, &contextHandle, valueSize);
    } else if (paramName == CL_ACCELERATOR_TYPE_INTEL) {
        auto acceleratorType = getTypeId();
        valueSize = sizeof(cl_accelerator_type_intel);
        status = ::getInfo(paramValue, paramValueSize, &acceleratorType, valueSize);
    }

    // The size is reported regardless of the status computed above.
    if (paramValueSizeRet != nullptr) {
        *paramValueSizeRet = valueSize;
    }
    return status;
}
}

View File

@@ -0,0 +1,79 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include "runtime/api/cl_types.h"
#include "runtime/helpers/base_object.h"
//------------------------------------------------------------------------------
// cl_intel_accelerator Class Stuff
//------------------------------------------------------------------------------
namespace OCLRT {
class Context;
// Plain aggregate holding an accelerator type value and a flags value,
// both stored as cl_uint.
struct TagAcceleratorObjParams {
    cl_uint AcceleratorType;
    cl_uint AcceleratorFlags;
};
// Value and pointer aliases for TagAcceleratorObjParams.
using OCLRT_ACCELERATOR_OBJECT_PARAMS = TagAcceleratorObjParams;
using POCLRT_ACCELERATOR_OBJECT_PARAMS = TagAcceleratorObjParams *;
// Associates the API handle type _cl_accelerator_intel with the class that
// implements it: DerivedType names IntelAccelerator.
template <>
struct OpenCLObjectMapper<_cl_accelerator_intel> {
    using DerivedType = class IntelAccelerator;
};
// Base class for accelerator objects behind the _cl_accelerator_intel handle
// type. It records the context, the accelerator type id and the descriptor
// supplied at construction and exposes them through getters and getInfo().
class IntelAccelerator : public BaseObject<_cl_accelerator_intel> {
public:
// Stores the arguments as given. Only the descriptor pointer is kept; the
// descriptor bytes are not copied here.
// NOTE(review): this implies the memory behind `descriptor` has to stay valid
// for as long as getDescriptor()/getInfo() may be used — confirm with callers.
IntelAccelerator(Context *context,
cl_accelerator_type_intel typeId,
size_t descriptorSize,
const void *descriptor) : pContext(context),
typeId(typeId),
descriptorSize(descriptorSize),
pDescriptor(descriptor) {}
// Leaves every member at its in-class default (see the member list below).
IntelAccelerator() {}
// Per-class magic constant; presumably consumed by BaseObject — TODO confirm.
static const cl_ulong objectMagic = 0xC6D72FA2E81EA569ULL;
// Accelerator type id passed to the constructor (-1 converted to the id type
// if default-constructed).
cl_accelerator_type_intel getTypeId() const { return typeId; }
// Size in bytes that was passed alongside the descriptor.
size_t getDescriptorSize() const { return descriptorSize; }
// Descriptor pointer exactly as passed to the constructor; may be nullptr.
const void *getDescriptor() const { return pDescriptor; }
// Answers a property query; defined out of line.
cl_int getInfo(cl_accelerator_info_intel paramName,
size_t paramValueSize,
void *paramValue,
size_t *paramValueSizeRet) const;
protected:
// Context this accelerator was created with; not owned as far as this class shows.
Context *pContext = nullptr;
// Type id, fixed at construction.
const cl_accelerator_type_intel typeId = -1;
// Descriptor size in bytes, fixed at construction.
const size_t descriptorSize = 0;
// Non-owning pointer to the descriptor supplied at construction.
const void *pDescriptor = nullptr;
private:
};
} // namespace OCLRT

View File

@@ -0,0 +1,79 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "runtime/accelerators/intel_motion_estimation.h"
namespace OCLRT {
// Checks the arguments used to create a motion-estimation accelerator.
//
// context and typeId are only checked through DEBUG_BREAK_IF. The descriptor
// must be non-null and exactly sizeof(cl_motion_estimation_desc_intel) bytes,
// and its mb_block_type, subpixel_mode, sad_adjust_mode and search_path_type
// fields must each hold one of the values accepted below.
//
// Returns CL_SUCCESS when every check passes, otherwise
// CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL.
cl_int VmeAccelerator::validateVmeArgs(Context *context,
                                       cl_accelerator_type_intel typeId,
                                       size_t descriptorSize,
                                       const void *descriptor) {
    DEBUG_BREAK_IF(!context);
    DEBUG_BREAK_IF(typeId != CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL);

    // The descriptor is only interpreted once its presence and size are known.
    const bool descriptorUsable = (descriptor != nullptr) &&
                                  (descriptorSize == sizeof(cl_motion_estimation_desc_intel));
    if (!descriptorUsable) {
        return CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL;
    }

    const auto *meDesc = static_cast<const cl_motion_estimation_desc_intel *>(descriptor);

    const auto blockType = meDesc->mb_block_type;
    if (blockType != CL_ME_MB_TYPE_16x16_INTEL &&
        blockType != CL_ME_MB_TYPE_8x8_INTEL &&
        blockType != CL_ME_MB_TYPE_4x4_INTEL) {
        return CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL;
    }

    const auto subpixelMode = meDesc->subpixel_mode;
    if (subpixelMode != CL_ME_SUBPIXEL_MODE_INTEGER_INTEL &&
        subpixelMode != CL_ME_SUBPIXEL_MODE_HPEL_INTEL &&
        subpixelMode != CL_ME_SUBPIXEL_MODE_QPEL_INTEL) {
        return CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL;
    }

    const auto sadAdjustMode = meDesc->sad_adjust_mode;
    if (sadAdjustMode != CL_ME_SAD_ADJUST_MODE_NONE_INTEL &&
        sadAdjustMode != CL_ME_SAD_ADJUST_MODE_HAAR_INTEL) {
        return CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL;
    }

    const auto searchPath = meDesc->search_path_type;
    if (searchPath != CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL &&
        searchPath != CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL &&
        searchPath != CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL) {
        return CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL;
    }

    return CL_SUCCESS;
}
}

View File

@@ -0,0 +1,72 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include "runtime/accelerators/intel_accelerator.h"
//------------------------------------------------------------------------------
// VmeAccelerator Class Stuff
//------------------------------------------------------------------------------
namespace OCLRT {
class Context;
// IntelAccelerator subclass whose instances can only be obtained through
// create(), which validates the creation arguments first.
class VmeAccelerator : public IntelAccelerator {
  public:
    // Validates the arguments with validateVmeArgs() and, when that succeeds,
    // allocates a new VmeAccelerator from them.
    //
    // result receives the validation status. The return value is nullptr
    // whenever result is not CL_SUCCESS; otherwise it is the new object.
    static VmeAccelerator *create(Context *context,
                                  cl_accelerator_type_intel typeId,
                                  size_t descriptorSize,
                                  const void *descriptor,
                                  cl_int &result) {
        result = validateVmeArgs(context, typeId, descriptorSize, descriptor);
        if (result != CL_SUCCESS) {
            return nullptr;
        }
        return new VmeAccelerator(context, typeId, descriptorSize, descriptor);
    }

  private:
    // Forwards all arguments to the IntelAccelerator constructor. Private so
    // that create() is the only construction path.
    VmeAccelerator(Context *context,
                   cl_accelerator_type_intel typeId,
                   size_t descriptorSize,
                   const void *descriptor)
        : IntelAccelerator(context, typeId, descriptorSize, descriptor) {}

    // Argument validation used by create(); defined out of line.
    static cl_int validateVmeArgs(Context *context,
                                  cl_accelerator_type_intel typeId,
                                  size_t descriptorSize,
                                  const void *descriptor);
};
}

View File

@@ -0,0 +1,66 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include "runtime/accelerators/intel_accelerator.h"
//------------------------------------------------------------------------------
// VeboxAccelerator Class Stuff
//------------------------------------------------------------------------------
namespace OCLRT {
class Context;
// IntelAccelerator subclass whose instances can only be obtained through
// create().
class VeboxAccelerator : public IntelAccelerator {
  public:
    // Allocates a new VeboxAccelerator from the given arguments and then sets
    // result to CL_SUCCESS.
    //
    // context and descriptor are only checked through DEBUG_BREAK_IF; no other
    // validation of the arguments happens here.
    static VeboxAccelerator *create(Context *context,
                                    cl_accelerator_type_intel typeId,
                                    size_t descriptorSize,
                                    const void *descriptor,
                                    cl_int &result) {
        DEBUG_BREAK_IF(!context);
        DEBUG_BREAK_IF(!descriptor);

        auto *accelerator = new VeboxAccelerator(context, typeId, descriptorSize, descriptor);
        result = CL_SUCCESS;
        return accelerator;
    }

  private:
    // Forwards all arguments to the IntelAccelerator constructor. Private so
    // that create() is the only construction path.
    VeboxAccelerator(Context *context,
                     cl_accelerator_type_intel typeId,
                     size_t descriptorSize,
                     const void *descriptor)
        : IntelAccelerator(context, typeId, descriptorSize, descriptor) {}
};
}

View File

@@ -0,0 +1,31 @@
# Copyright (c) 2017, Intel Corporation
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
# We require cmake 3.2.0 or later
cmake_minimum_required(VERSION 3.2.0 FATAL_ERROR)
# Source and header files of the API component, each given as a path rooted at
# this directory. PARENT_SCOPE makes the variable visible in the scope above
# this one instead of the current scope.
set (RUNTIME_SRCS_API
${CMAKE_CURRENT_SOURCE_DIR}/api.cpp
${CMAKE_CURRENT_SOURCE_DIR}/api.h
${CMAKE_CURRENT_SOURCE_DIR}/cl_types.h
${CMAKE_CURRENT_SOURCE_DIR}/dispatch.cpp
${CMAKE_CURRENT_SOURCE_DIR}/dispatch.h
PARENT_SCOPE
)

3760
runtime/api/api.cpp Normal file

File diff suppressed because it is too large Load Diff

887
runtime/api/api.h Normal file
View File

@@ -0,0 +1,887 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "config.h"
#include "CL/cl.h"
#include "CL/cl_gl.h"
#include "runtime/api/dispatch.h"
#ifdef __cplusplus
extern "C" {
#endif
// Platform and device entry points. Only prototypes are declared in this
// header; the definitions are not part of it.
// The *Info queries share one parameter pattern: a selector (paramName), the
// caller's buffer (paramValue / paramValueSize) and an out-parameter for the
// size (paramValueSizeRet).
cl_int CL_API_CALL clGetPlatformIDs(
cl_uint numEntries,
cl_platform_id *platforms,
cl_uint *numPlatforms);
cl_int CL_API_CALL clGetPlatformInfo(
cl_platform_id platform,
cl_platform_info paramName,
size_t paramValueSize,
void *paramValue,
size_t *paramValueSizeRet);
cl_int CL_API_CALL clGetDeviceIDs(
cl_platform_id platform,
cl_device_type deviceType,
cl_uint numEntries,
cl_device_id *devices,
cl_uint *numDevices);
cl_int CL_API_CALL clGetDeviceInfo(
cl_device_id device,
cl_device_info paramName,
size_t paramValueSize,
void *paramValue,
size_t *paramValueSizeRet);
cl_int CL_API_CALL clCreateSubDevices(
cl_device_id inDevice,
const cl_device_partition_property *properties,
cl_uint numDevices,
cl_device_id *outDevices,
cl_uint *numDevicesRet);
cl_int CL_API_CALL clRetainDevice(
cl_device_id device);
cl_int CL_API_CALL clReleaseDevice(
cl_device_id device);
// Context entry points. The create functions return the new handle and report
// their status through the errcodeRet out-parameter; funcNotify/userData are a
// caller-supplied callback and its opaque argument.
cl_context CL_API_CALL clCreateContext(
const cl_context_properties *properties,
cl_uint numDevices,
const cl_device_id *devices,
void(CL_CALLBACK *funcNotify)(const char *, const void *, size_t, void *),
void *userData,
cl_int *errcodeRet);
cl_context CL_API_CALL clCreateContextFromType(
const cl_context_properties *properties,
cl_device_type deviceType,
void(CL_CALLBACK *funcNotify)(const char *, const void *, size_t, void *),
void *userData,
cl_int *errcodeRet);
cl_int CL_API_CALL clRetainContext(
cl_context context);
cl_int CL_API_CALL clReleaseContext(
cl_context context);
cl_int CL_API_CALL clGetContextInfo(
cl_context context,
cl_context_info paramName,
size_t paramValueSize,
void *paramValue,
size_t *paramValueSizeRet);
// GL-related context query; takes a property list rather than a context handle.
cl_int CL_API_CALL clGetGLContextInfoKHR(
const cl_context_properties *properties,
cl_gl_context_info paramName,
size_t paramValueSize,
void *paramValue,
size_t *paramValueSizeRet);
// Command-queue entry points: creation, reference counting and queries.
cl_command_queue CL_API_CALL clCreateCommandQueue(
cl_context context,
cl_device_id device,
cl_command_queue_properties properties,
cl_int *errcodeRet);
cl_int CL_API_CALL clRetainCommandQueue(
cl_command_queue commandQueue);
cl_int CL_API_CALL clReleaseCommandQueue(
cl_command_queue commandQueue);
cl_int CL_API_CALL clGetCommandQueueInfo(
cl_command_queue commandQueue,
cl_command_queue_info paramName,
size_t paramValueSize,
void *paramValue,
size_t *paramValueSizeRet);
// deprecated OpenCL 1.0
cl_int CL_API_CALL clSetCommandQueueProperty(
cl_command_queue commandQueue,
cl_command_queue_properties properties,
cl_bool enable,
cl_command_queue_properties *oldProperties);
// Memory-object entry points: buffer and image creation, reference counting
// and queries. The create functions return a cl_mem handle and report status
// through errcodeRet.
cl_mem CL_API_CALL clCreateBuffer(
cl_context context,
cl_mem_flags flags,
size_t size,
void *hostPtr,
cl_int *errcodeRet);
cl_mem CL_API_CALL clCreateSubBuffer(
cl_mem buffer,
cl_mem_flags flags,
cl_buffer_create_type bufferCreateType,
const void *bufferCreateInfo,
cl_int *errcodeRet);
cl_mem CL_API_CALL clCreateImage(
cl_context context,
cl_mem_flags flags,
const cl_image_format *imageFormat,
const cl_image_desc *imageDesc,
void *hostPtr,
cl_int *errcodeRet);
// deprecated OpenCL 1.1
cl_mem CL_API_CALL clCreateImage2D(
cl_context context,
cl_mem_flags flags,
const cl_image_format *imageFormat,
size_t imageWidth,
size_t imageHeight,
size_t imageRowPitch,
void *hostPtr,
cl_int *errcodeRet);
// deprecated OpenCL 1.1
cl_mem CL_API_CALL clCreateImage3D(
cl_context context,
cl_mem_flags flags,
const cl_image_format *imageFormat,
size_t imageWidth,
size_t imageHeight,
size_t imageDepth,
size_t imageRowPitch,
size_t imageSlicePitch,
void *hostPtr,
cl_int *errcodeRet);
cl_int CL_API_CALL clRetainMemObject(
cl_mem memobj);
cl_int CL_API_CALL clReleaseMemObject(
cl_mem memobj);
cl_int CL_API_CALL clGetSupportedImageFormats(
cl_context context,
cl_mem_flags flags,
cl_mem_object_type imageType,
cl_uint numEntries,
cl_image_format *imageFormats,
cl_uint *numImageFormats);
cl_int CL_API_CALL clGetMemObjectInfo(
cl_mem memobj,
cl_mem_info paramName,
size_t paramValueSize,
void *paramValue,
size_t *paramValueSizeRet);
cl_int CL_API_CALL clGetImageInfo(
cl_mem image,
cl_image_info paramName,
size_t paramValueSize,
void *paramValue,
size_t *paramValueSizeRet);
// INTEL-suffixed entry point; writes row and slice pitch through the two
// size_t out-parameters for the given format/descriptor.
cl_int CL_API_CALL clGetImageParamsINTEL(
cl_context context,
const cl_image_format *imageFormat,
const cl_image_desc *imageDesc,
size_t *imageRowPitch,
size_t *imageSlicePitch);
cl_int CL_API_CALL clSetMemObjectDestructorCallback(
cl_mem memobj,
void(CL_CALLBACK *funcNotify)(cl_mem, void *),
void *userData);
// Sampler entry points: creation, reference counting and queries.
cl_sampler CL_API_CALL clCreateSampler(
cl_context context,
cl_bool normalizedCoords,
cl_addressing_mode addressingMode,
cl_filter_mode filterMode,
cl_int *errcodeRet);
cl_int CL_API_CALL clRetainSampler(
cl_sampler sampler);
cl_int CL_API_CALL clReleaseSampler(
cl_sampler sampler);
cl_int CL_API_CALL clGetSamplerInfo(
cl_sampler sampler,
cl_sampler_info paramName,
size_t paramValueSize,
void *paramValue,
size_t *paramValueSizeRet);
// Program entry points: creation from source, binaries or built-in kernel
// names, build/compile/link, compiler unloading and queries.
// funcNotify/userData on the build functions are a caller-supplied callback
// and its opaque argument.
cl_program CL_API_CALL clCreateProgramWithSource(
cl_context context,
cl_uint count,
const char **strings,
const size_t *lengths,
cl_int *errcodeRet);
cl_program CL_API_CALL clCreateProgramWithBinary(
cl_context context,
cl_uint numDevices,
const cl_device_id *deviceList,
const size_t *lengths,
const unsigned char **binaries,
cl_int *binaryStatus,
cl_int *errcodeRet);
cl_program CL_API_CALL clCreateProgramWithBuiltInKernels(
cl_context context,
cl_uint numDevices,
const cl_device_id *deviceList,
const char *kernelNames,
cl_int *errcodeRet);
cl_int CL_API_CALL clRetainProgram(
cl_program program);
cl_int CL_API_CALL clReleaseProgram(
cl_program program);
cl_int CL_API_CALL clBuildProgram(
cl_program program,
cl_uint numDevices,
const cl_device_id *deviceList,
const char *options,
void(CL_CALLBACK *funcNotify)(cl_program program, void *userData),
void *userData);
cl_int CL_API_CALL clCompileProgram(
cl_program program,
cl_uint numDevices,
const cl_device_id *deviceList,
const char *options,
cl_uint numInputHeaders,
const cl_program *inputHeaders,
const char **headerIncludeNames,
void(CL_CALLBACK *funcNotify)(cl_program program, void *userData),
void *userData);
cl_program CL_API_CALL clLinkProgram(
cl_context context,
cl_uint numDevices,
const cl_device_id *deviceList,
const char *options,
cl_uint numInputPrograms,
const cl_program *inputPrograms,
void(CL_CALLBACK *funcNotify)(cl_program program, void *userData),
void *userData,
cl_int *errcodeRet);
cl_int CL_API_CALL clUnloadPlatformCompiler(
cl_platform_id platform);
// deprecated OpenCL 1.1
cl_int CL_API_CALL clUnloadCompiler(void);
cl_int CL_API_CALL clGetProgramInfo(
cl_program program,
cl_program_info paramName,
size_t paramValueSize,
void *paramValue,
size_t *paramValueSizeRet);
cl_int CL_API_CALL clGetProgramBuildInfo(
cl_program program,
cl_device_id device,
cl_program_build_info paramName,
size_t paramValueSize,
void *paramValue,
size_t *paramValueSizeRet);
// Kernel entry points: creation, reference counting, argument setting and
// queries.
cl_kernel CL_API_CALL clCreateKernel(
cl_program program,
const char *kernelName,
cl_int *errcodeRet);
cl_int CL_API_CALL clCreateKernelsInProgram(
cl_program program,
cl_uint numKernels,
cl_kernel *kernels,
cl_uint *numKernelsRet);
cl_int CL_API_CALL clRetainKernel(
cl_kernel kernel);
cl_int CL_API_CALL clReleaseKernel(
cl_kernel kernel);
cl_int CL_API_CALL clSetKernelArg(
cl_kernel kernel,
cl_uint argIndex,
size_t argSize,
const void *argValue);
cl_int CL_API_CALL clGetKernelInfo(
cl_kernel kernel,
cl_kernel_info paramName,
size_t paramValueSize,
void *paramValue,
size_t *paramValueSizeRet);
cl_int CL_API_CALL clGetKernelArgInfo(
cl_kernel kernel,
cl_uint argIndx,
cl_kernel_arg_info paramName,
size_t paramValueSize,
void *paramValue,
size_t *paramValueSizeRet);
cl_int CL_API_CALL clGetKernelWorkGroupInfo(
cl_kernel kernel,
cl_device_id device,
cl_kernel_work_group_info paramName,
size_t paramValueSize,
void *paramValue,
size_t *paramValueSizeRet);
// Event entry points: waiting, user events, reference counting, callbacks and
// queries (including profiling information).
cl_int CL_API_CALL clWaitForEvents(
cl_uint numEvents,
const cl_event *eventList);
cl_int CL_API_CALL clGetEventInfo(
cl_event event,
cl_event_info paramName,
size_t paramValueSize,
void *paramValue,
size_t *paramValueSizeRet);
cl_event CL_API_CALL clCreateUserEvent(
cl_context context,
cl_int *errcodeRet);
cl_int CL_API_CALL clRetainEvent(
cl_event event);
cl_int CL_API_CALL clReleaseEvent(
cl_event event);
cl_int CL_API_CALL clSetUserEventStatus(
cl_event event,
cl_int executionStatus);
cl_int CL_API_CALL clSetEventCallback(
cl_event event,
cl_int commandExecCallbackType,
void(CL_CALLBACK *funcNotify)(cl_event, cl_int, void *),
void *userData);
cl_int CL_API_CALL clGetEventProfilingInfo(
cl_event event,
cl_profiling_info paramName,
size_t paramValueSize,
void *paramValue,
size_t *paramValueSizeRet);
// Queue-wide flush and finish entry points; each takes only the queue handle.
cl_int CL_API_CALL clFlush(
cl_command_queue commandQueue);
cl_int CL_API_CALL clFinish(
cl_command_queue commandQueue);
// Buffer transfer commands: read, write, fill and copy, plus their rectangular
// (*Rect) variants. Every enqueue prototype ends with the same trio: a wait
// list (numEventsInWaitList / eventWaitList) and an output event handle.
cl_int CL_API_CALL clEnqueueReadBuffer(
cl_command_queue commandQueue,
cl_mem buffer,
cl_bool blockingRead,
size_t offset,
size_t cb,
void *ptr,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event);
cl_int CL_API_CALL clEnqueueReadBufferRect(
cl_command_queue commandQueue,
cl_mem buffer,
cl_bool blockingRead,
const size_t *bufferOrigin,
const size_t *hostOrigin,
const size_t *region,
size_t bufferRowPitch,
size_t bufferSlicePitch,
size_t hostRowPitch,
size_t hostSlicePitch,
void *ptr,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event);
cl_int CL_API_CALL clEnqueueWriteBuffer(
cl_command_queue commandQueue,
cl_mem buffer,
cl_bool blockingWrite,
size_t offset,
size_t cb,
const void *ptr,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event);
cl_int CL_API_CALL clEnqueueWriteBufferRect(
cl_command_queue commandQueue,
cl_mem buffer,
cl_bool blockingWrite,
const size_t *bufferOrigin,
const size_t *hostOrigin,
const size_t *region,
size_t bufferRowPitch,
size_t bufferSlicePitch,
size_t hostRowPitch,
size_t hostSlicePitch,
const void *ptr,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event);
cl_int CL_API_CALL clEnqueueFillBuffer(
cl_command_queue commandQueue,
cl_mem buffer,
const void *pattern,
size_t patternSize,
size_t offset,
size_t size,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event);
cl_int CL_API_CALL clEnqueueCopyBuffer(
cl_command_queue commandQueue,
cl_mem srcBuffer,
cl_mem dstBuffer,
size_t srcOffset,
size_t dstOffset,
size_t cb,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event);
cl_int CL_API_CALL clEnqueueCopyBufferRect(
cl_command_queue commandQueue,
cl_mem srcBuffer,
cl_mem dstBuffer,
const size_t *srcOrigin,
const size_t *dstOrigin,
const size_t *region,
size_t srcRowPitch,
size_t srcSlicePitch,
size_t dstRowPitch,
size_t dstSlicePitch,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event);
// Image transfer commands: read, write, fill and copy, including copies
// between images and buffers. origin/region are passed as size_t arrays.
cl_int CL_API_CALL clEnqueueReadImage(
cl_command_queue commandQueue,
cl_mem image,
cl_bool blockingRead,
const size_t *origin,
const size_t *region,
size_t rowPitch,
size_t slicePitch,
void *ptr,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event);
cl_int CL_API_CALL clEnqueueWriteImage(
cl_command_queue commandQueue,
cl_mem image,
cl_bool blockingWrite,
const size_t *origin,
const size_t *region,
size_t inputRowPitch,
size_t inputSlicePitch,
const void *ptr,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event);
cl_int CL_API_CALL clEnqueueFillImage(
cl_command_queue commandQueue,
cl_mem image,
const void *fillColor,
const size_t *origin,
const size_t *region,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event);
cl_int CL_API_CALL clEnqueueCopyImage(
cl_command_queue commandQueue,
cl_mem srcImage,
cl_mem dstImage,
const size_t *srcOrigin,
const size_t *dstOrigin,
const size_t *region,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event);
cl_int CL_API_CALL clEnqueueCopyImageToBuffer(
cl_command_queue commandQueue,
cl_mem srcImage,
cl_mem dstBuffer,
const size_t *srcOrigin,
const size_t *region,
size_t dstOffset,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event);
cl_int CL_API_CALL clEnqueueCopyBufferToImage(
cl_command_queue commandQueue,
cl_mem srcBuffer,
cl_mem dstImage,
size_t srcOffset,
const size_t *dstOrigin,
const size_t *region,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event);
// Map, unmap and migrate commands. The two map functions return a pointer and
// report their status through errcodeRet instead of the return value.
void *CL_API_CALL clEnqueueMapBuffer(
cl_command_queue commandQueue,
cl_mem buffer,
cl_bool blockingMap,
cl_map_flags mapFlags,
size_t offset,
size_t cb,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event,
cl_int *errcodeRet);
void *CL_API_CALL clEnqueueMapImage(
cl_command_queue commandQueue,
cl_mem image,
cl_bool blockingMap,
cl_map_flags mapFlags,
const size_t *origin,
const size_t *region,
size_t *imageRowPitch,
size_t *imageSlicePitch,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event,
cl_int *errcodeRet);
cl_int CL_API_CALL clEnqueueUnmapMemObject(
cl_command_queue commandQueue,
cl_mem memobj,
void *mappedPtr,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event);
cl_int CL_API_CALL clEnqueueMigrateMemObjects(
cl_command_queue commandQueue,
cl_uint numMemObjects,
const cl_mem *memObjects,
cl_mem_migration_flags flags,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event);
// Kernel execution commands: ND-range launch, single task and native
// (host function) kernels.
cl_int CL_API_CALL clEnqueueNDRangeKernel(
cl_command_queue commandQueue,
cl_kernel kernel,
cl_uint workDim,
const size_t *globalWorkOffset,
const size_t *globalWorkSize,
const size_t *localWorkSize,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event);
cl_int CL_API_CALL clEnqueueTask(
cl_command_queue commandQueue,
cl_kernel kernel,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event);
// userFunc is a caller-supplied host callback taking a single void * argument.
cl_int CL_API_CALL clEnqueueNativeKernel(
cl_command_queue commandQueue,
void(CL_CALLBACK *userFunc)(void *),
void *args,
size_t cbArgs,
cl_uint numMemObjects,
const cl_mem *memList,
const void **argsMemLoc,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event);
// Marker, barrier and wait commands. The first three are flagged as deprecated
// by the comments below; the *WithWaitList variants follow them.
// deprecated OpenCL 1.1
cl_int CL_API_CALL clEnqueueMarker(
cl_command_queue commandQueue,
cl_event *event);
// deprecated OpenCL 1.1
cl_int CL_API_CALL clEnqueueWaitForEvents(
cl_command_queue commandQueue,
cl_uint numEvents,
const cl_event *eventList);
// deprecated OpenCL 1.1
cl_int CL_API_CALL clEnqueueBarrier(
cl_command_queue commandQueue);
cl_int CL_API_CALL clEnqueueMarkerWithWaitList(
cl_command_queue commandQueue,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event);
cl_int CL_API_CALL clEnqueueBarrierWithWaitList(
cl_command_queue commandQueue,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event);
// Extension lookup by function name; both variants return an untyped pointer.
// deprecated OpenCL 1.1
void *CL_API_CALL clGetExtensionFunctionAddress(
const char *funcName);
void *CL_API_CALL clGetExtensionFunctionAddressForPlatform(
cl_platform_id platform,
const char *funcName);
// CL-GL Sharing
// Creation of cl_mem objects from GL buffers, textures and renderbuffers,
// queries on such objects, and the acquire/release commands.
// NOTE(review): clCreateFromGLBuffer takes `int *errcodeRet` where its siblings
// take `cl_int *`; presumably this mirrors the prototype in CL/cl_gl.h —
// confirm before changing it.
cl_mem CL_API_CALL clCreateFromGLBuffer(
cl_context context,
cl_mem_flags flags,
cl_GLuint bufobj,
int *errcodeRet);
// OpenCL 1.2
cl_mem CL_API_CALL clCreateFromGLTexture(
cl_context context,
cl_mem_flags flags,
cl_GLenum target,
cl_GLint miplevel,
cl_GLuint texture,
cl_int *errcodeRet);
// deprecated OpenCL 1.1
cl_mem CL_API_CALL clCreateFromGLTexture2D(
cl_context context,
cl_mem_flags flags,
cl_GLenum target,
cl_GLint miplevel,
cl_GLuint texture,
cl_int *errcodeRet);
// deprecated OpenCL 1.1
cl_mem CL_API_CALL clCreateFromGLTexture3D(
cl_context context,
cl_mem_flags flags,
cl_GLenum target,
cl_GLint miplevel,
cl_GLuint texture,
cl_int *errcodeRet);
cl_mem CL_API_CALL clCreateFromGLRenderbuffer(
cl_context context,
cl_mem_flags flags,
cl_GLuint renderbuffer,
cl_int *errcodeRet);
cl_int CL_API_CALL clGetGLObjectInfo(
cl_mem memobj,
cl_gl_object_type *glObjectType,
cl_GLuint *glObjectName);
cl_int CL_API_CALL clGetGLTextureInfo(
cl_mem memobj,
cl_gl_texture_info paramName,
size_t paramValueSize,
void *paramValue,
size_t *paramValueSizeRet);
cl_int CL_API_CALL clEnqueueAcquireGLObjects(
cl_command_queue commandQueue,
cl_uint numObjects,
const cl_mem *memObjects,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event);
cl_int CL_API_CALL clEnqueueReleaseGLObjects(
cl_command_queue commandQueue,
cl_uint numObjects,
const cl_mem *memObjects,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event);
// OpenCL 2.0
void *CL_API_CALL clSVMAlloc(
cl_context context,
cl_svm_mem_flags flags,
size_t size,
cl_uint alignment);
void CL_API_CALL clSVMFree(
cl_context context,
void *svmPointer);
cl_int CL_API_CALL clEnqueueSVMFree(
cl_command_queue commandQueue,
cl_uint numSvmPointers,
void *svmPointers[],
void(CL_CALLBACK *pfnFreeFunc)(
cl_command_queue queue,
cl_uint numSvmPointers,
void *svmPointers[],
void *userData),
void *userData,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event);
cl_int CL_API_CALL clEnqueueSVMMemcpy(
cl_command_queue commandQueue,
cl_bool blockingCopy,
void *dstPtr,
const void *srcPtr,
size_t size,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event);
cl_int CL_API_CALL clEnqueueSVMMemFill(
cl_command_queue commandQueue,
void *svmPtr,
const void *pattern,
size_t patternSize,
size_t size,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event);
cl_int CL_API_CALL clEnqueueSVMMap(
cl_command_queue commandQueue,
cl_bool blockingMap,
cl_map_flags mapFlags,
void *svmPtr,
size_t size,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event);
cl_int CL_API_CALL clEnqueueSVMUnmap(
cl_command_queue commandQueue,
void *svmPtr,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event);
cl_int CL_API_CALL clSetKernelArgSVMPointer(
cl_kernel kernel,
cl_uint argIndex,
const void *argValue);
cl_int CL_API_CALL clSetKernelExecInfo(
cl_kernel kernel,
cl_kernel_exec_info paramName,
size_t paramValueSize,
const void *paramValue);
cl_mem CL_API_CALL clCreatePipe(
cl_context context,
cl_mem_flags flags,
cl_uint pipePacketSize,
cl_uint pipeMaxPackets,
const cl_pipe_properties *properties,
cl_int *errcodeRet);
cl_int CL_API_CALL clGetPipeInfo(
cl_mem pipe,
cl_pipe_info paramName,
size_t paramValueSize,
void *paramValue,
size_t *paramValueSizeRet);
cl_command_queue CL_API_CALL clCreateCommandQueueWithProperties(
cl_context context,
cl_device_id device,
const cl_queue_properties *properties,
cl_int *errcodeRet);
cl_command_queue CL_API_CALL clCreateCommandQueueWithPropertiesINTEL(
cl_context context,
cl_device_id device,
const cl_queue_properties_intel *properties,
cl_int *errcodeRet);
cl_sampler CL_API_CALL clCreateSamplerWithProperties(
cl_context context,
const cl_sampler_properties *samplerProperties,
cl_int *errcodeRet);
// OpenCL 2.1
cl_int CL_API_CALL clGetDeviceAndHostTimer(cl_device_id device,
cl_ulong *deviceTimestamp,
cl_ulong *hostTimestamp);
cl_int CL_API_CALL clGetHostTimer(cl_device_id device,
cl_ulong *hostTimestamp);
extern CL_API_ENTRY cl_command_queue CL_API_CALL
clCreatePerfCountersCommandQueueINTEL(
cl_context context,
cl_device_id device,
cl_command_queue_properties properties,
cl_uint configuration,
cl_int *errcodeRet);
extern CL_API_ENTRY cl_int CL_API_CALL
clSetPerformanceConfigurationINTEL(
cl_device_id device,
cl_uint count,
cl_uint *offsets,
cl_uint *values);
extern CL_API_ENTRY cl_event CL_API_CALL
clCreateEventFromGLsyncKHR(
cl_context context,
cl_GLsync sync,
cl_int *errcodeRet) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_program CL_API_CALL clCreateProgramWithILKHR(
cl_context context,
const void *il,
size_t length,
cl_int *errcodeRet) CL_API_SUFFIX__VERSION_1_2;
}

75
runtime/api/cl_types.h Normal file
View File

@@ -0,0 +1,75 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include "config.h"
#include "CL/cl.h"
#include "runtime/api/dispatch.h"
#include <cstdint>
// Base of the API object types declared in this header. Every object carries
// its own SEntryPointsTable, which isValidObject() inspects.
struct ClDispatch {
SEntryPointsTable dispatch;
// Copy-initializes the per-object table from the global globalDispatchTable.
ClDispatch() : dispatch(globalDispatchTable) {
}
};
// Object types named after the OpenCL handle types. Each one derives from
// ClDispatch and therefore embeds a `dispatch` member; only _cl_context adds
// state of its own.
struct _cl_accelerator_intel : public ClDispatch {
};
struct _cl_command_queue : public ClDispatch {
};
// device_queue is a type used internally
struct _device_queue : public _cl_command_queue {
};
typedef _device_queue *device_queue;
struct _cl_context : public ClDispatch {
// Defaults to false; this header never sets it to true.
bool isSharedContext = false;
};
struct _cl_device_id : public ClDispatch {
};
struct _cl_event : public ClDispatch {
};
struct _cl_kernel : public ClDispatch {
};
struct _cl_mem : public ClDispatch {
};
struct _cl_platform_id : public ClDispatch {
};
struct _cl_program : public ClDispatch {
};
struct _cl_sampler : public ClDispatch {
};
// Returns true when `object` is non-null and its embedded dispatch entry
// points at icdGlobalDispatchTable; returns false otherwise. The object is
// only dereferenced after the null check.
template <typename Type>
inline bool isValidObject(Type object) {
    if (!object) {
        return false;
    }
    return object->dispatch.icdDispatch == &icdGlobalDispatchTable;
}

226
runtime/api/dispatch.cpp Normal file
View File

@@ -0,0 +1,226 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "dispatch.h"
#include "api.h"
// Global ICD dispatch table. The initializers are positional, so their order
// has to follow the member order of SDispatchTable (declared in dispatch.h,
// which is not visible here). Slots initialized with nullptr have no entry
// point assigned in this table; where the original author named the intended
// function, the name is kept in the trailing comment.
SDispatchTable icdGlobalDispatchTable =
{
clGetPlatformIDs,
clGetPlatformInfo,
clGetDeviceIDs,
clGetDeviceInfo,
clCreateContext,
clCreateContextFromType,
clRetainContext,
clReleaseContext,
clGetContextInfo,
clCreateCommandQueue,
clRetainCommandQueue,
clReleaseCommandQueue,
clGetCommandQueueInfo,
clSetCommandQueueProperty,
clCreateBuffer,
clCreateImage2D,
clCreateImage3D,
clRetainMemObject,
clReleaseMemObject,
clGetSupportedImageFormats,
clGetMemObjectInfo,
clGetImageInfo,
clCreateSampler,
clRetainSampler,
clReleaseSampler,
clGetSamplerInfo,
clCreateProgramWithSource,
clCreateProgramWithBinary,
clRetainProgram,
clReleaseProgram,
clBuildProgram,
clUnloadCompiler,
clGetProgramInfo,
clGetProgramBuildInfo,
clCreateKernel,
clCreateKernelsInProgram,
clRetainKernel,
clReleaseKernel,
clSetKernelArg,
clGetKernelInfo,
clGetKernelWorkGroupInfo,
clWaitForEvents,
clGetEventInfo,
clRetainEvent,
clReleaseEvent,
clGetEventProfilingInfo,
clFlush,
clFinish,
clEnqueueReadBuffer,
clEnqueueWriteBuffer,
clEnqueueCopyBuffer,
clEnqueueReadImage,
clEnqueueWriteImage,
clEnqueueCopyImage,
clEnqueueCopyImageToBuffer,
clEnqueueCopyBufferToImage,
clEnqueueMapBuffer,
clEnqueueMapImage,
clEnqueueUnmapMemObject,
clEnqueueNDRangeKernel,
clEnqueueTask,
clEnqueueNativeKernel,
clEnqueueMarker,
clEnqueueWaitForEvents,
clEnqueueBarrier,
clGetExtensionFunctionAddress,
/* cl_khr_gl_sharing */
// Nine unassigned slots (no function names were recorded for them).
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
/* cl_khr_d3d10_sharing */
nullptr, // clGetDeviceIDsFromD3D10KHR,
nullptr, // clCreateFromD3D10BufferKHR,
nullptr, // clCreateFromD3D10Texture2DKHR,
nullptr, // clCreateFromD3D10Texture3DKHR,
nullptr, // clEnqueueAcquireD3D10ObjectsKHR,
nullptr, // clEnqueueReleaseD3D10ObjectsKHR,
/* OpenCL 1.1 */
clSetEventCallback,
clCreateSubBuffer,
clSetMemObjectDestructorCallback,
clCreateUserEvent,
clSetUserEventStatus,
clEnqueueReadBufferRect,
clEnqueueWriteBufferRect,
clEnqueueCopyBufferRect,
/* cl_ext_device_fission */
nullptr, //clCreateSubDevicesEXT,
nullptr, //clRetainDeviceEXT,
nullptr, //clReleaseDeviceEXT,
/* cl_khr_gl_event */
nullptr,
/* OpenCL 1.2 */
clCreateSubDevices,
clRetainDevice,
clReleaseDevice,
clCreateImage,
clCreateProgramWithBuiltInKernels,
clCompileProgram,
clLinkProgram,
clUnloadPlatformCompiler,
clGetKernelArgInfo,
clEnqueueFillBuffer,
clEnqueueFillImage,
clEnqueueMigrateMemObjects,
clEnqueueMarkerWithWaitList,
clEnqueueBarrierWithWaitList,
clGetExtensionFunctionAddressForPlatform,
// NOTE(review): unnamed, unassigned slot closing the OpenCL 1.2 group —
// confirm which SDispatchTable member it corresponds to.
nullptr,
/* cl_khr_d3d11_sharing */
nullptr, // clGetDeviceIDsFromD3D11KHR,
nullptr, // clCreateFromD3D11BufferKHR,
nullptr, // clCreateFromD3D11Texture2DKHR,
nullptr, // clCreateFromD3D11Texture3DKHR,
nullptr, // clCreateFromDX9MediaSurfaceKHR,
nullptr, // clEnqueueAcquireD3D11ObjectsKHR,
nullptr, // clEnqueueReleaseD3D11ObjectsKHR,
/* cl_khr_dx9_media_sharing */
nullptr, // clGetDeviceIDsFromDX9MediaAdapterKHR,
nullptr, // clEnqueueAcquireDX9MediaSurfacesKHR,
nullptr, // clEnqueueReleaseDX9MediaSurfacesKHR,
/* cl_khr_egl_image */
nullptr, //clCreateFromEGLImageKHR,
nullptr, //clEnqueueAcquireEGLObjectsKHR,
nullptr, //clEnqueueReleaseEGLObjectsKHR,
/* cl_khr_egl_event */
nullptr, //clCreateEventFromEGLSyncKHR,
/* OpenCL 2.0 */
clCreateCommandQueueWithProperties,
clCreatePipe,
clGetPipeInfo,
clSVMAlloc,
clSVMFree,
clEnqueueSVMFree,
clEnqueueSVMMemcpy,
clEnqueueSVMMemFill,
clEnqueueSVMMap,
clEnqueueSVMUnmap,
clCreateSamplerWithProperties,
clSetKernelArgSVMPointer,
clSetKernelExecInfo,
clGetKernelSubGroupInfoKHR,
/* OpenCL 2.1 */
clCloneKernel,
clCreateProgramWithIL,
clEnqueueSVMMigrateMem,
clGetDeviceAndHostTimer,
clGetHostTimer,
clGetKernelSubGroupInfo,
clSetDefaultDeviceCommandQueue,
};
// Second global dispatch table, holding clGetKernelArgInfo plus the *INTEL
// extension entry points. Like icdGlobalDispatchTable it is initialized
// positionally, so the order must follow SCRTDispatchTable's member order
// (declared in dispatch.h, not visible here). nullptr marks unassigned slots.
SCRTDispatchTable crtGlobalDispatchTable = {
clGetKernelArgInfo,
nullptr, // clGetDeviceIDsFromDX9INTEL,
nullptr, // clCreateFromDX9MediaSurfaceINTEL,
nullptr, // clEnqueueAcquireDX9ObjectsINTEL,
nullptr, // clEnqueueReleaseDX9ObjectsINTEL,
clGetImageParamsINTEL,
clCreatePerfCountersCommandQueueINTEL,
clCreateAcceleratorINTEL,
clGetAcceleratorInfoINTEL,
clRetainAcceleratorINTEL,
clReleaseAcceleratorINTEL,
// Ten unassigned slots precede the final entry.
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
clSetPerformanceConfigurationINTEL};
// Pairs the addresses of the two tables defined above; ClDispatch copies this
// object into every API object it constructs.
SEntryPointsTable globalDispatchTable = {&icdGlobalDispatchTable, &crtGlobalDispatchTable};

1304
runtime/api/dispatch.h Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,92 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include <cstdint>
#include <type_traits>
// The AUB command structures below rely on 4-byte packing: several of them
// place a uint64_t member at an offset that is not a multiple of 8, and the
// static_asserts pin the resulting sizes.
//
// push/pop is used on every compiler so that the packing state of whoever
// includes this header is restored afterwards. The previous non-Windows path
// ended with a bare "#pragma pack()", which resets packing to the compiler
// default rather than to the includer's setting, and the selection depended
// on WIN32, a macro that compilers do not define by themselves.
#pragma pack(push, 4)

// Single-dword header that starts every AUB command (16 + 7 + 6 + 3 = 32 bits).
struct AubCmdHdr {
    uint32_t DwordLength : 16,
        SubOp : 7,
        Opcode : 6,
        Type : 3;
};
static_assert(4 == sizeof(AubCmdHdr), "Invalid size for AubCmdHdr");

// Bitmap dump command: header, surface geometry/format, 64-bit base address
// and a dword of flag bits.
struct AubCmdDumpBmpHd {
    AubCmdHdr Header;
    uint32_t Xmin;
    uint32_t Ymin;
    uint32_t BufferPitch;
    uint32_t BitsPerPixel : 8,
        Format : 8,
        Reserved_0 : 16;
    uint32_t Xsize;
    uint32_t Ysize;
    uint64_t BaseAddr; // sits at byte offset 28, hence the 4-byte packing
    uint32_t Secure : 1,
        UseFence : 1,
        TileOn : 1,
        WalkY : 1,
        UsePPGTT : 1,
        Use32BitDump : 1,
        UseFullFormat : 1,
        Reserved_1 : 25;
    uint32_t DirectoryHandle;
};
static_assert(44 == sizeof(AubCmdDumpBmpHd), "Invalid size for AubCmdDumpBmpHd");

// PPGTT context creation command: header, context handle, mode flags and
// four 64-bit page-directory pointers.
struct AubPpgttContextCreate {
    AubCmdHdr Header;
    uint32_t Handle;
    uint32_t AdvancedContext : 1,
        SixtyFourBit : 1,
        Reserved_31_2 : 30;
    uint64_t PageDirPointer[4]; // starts at byte offset 12
};
static_assert(44 == sizeof(AubPpgttContextCreate), "Invalid size for AubPpgttContextCreate");

// Binary dump command: header, 40-character output file name, surface
// dimensions, 64-bit base address and surface/GTT descriptors.
struct AubBinaryDump {
    AubCmdHdr Header;
    char OutputFile[40];
    uint32_t Height;
    uint32_t Width;
    uint64_t BaseAddr; // starts at byte offset 52
    uint32_t SurfaceType : 4,
        Pitch : 28;
    uint32_t GttType : 2,
        Reserved_31_2 : 30;
    uint32_t DirectoryHandle;
};
static_assert(72 == sizeof(AubBinaryDump), "Invalid size for AubBinaryDump");

#pragma pack(pop)

View File

@@ -0,0 +1,190 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "config.h"
#include "aub_mem_dump.h"
#include "runtime/helpers/ptr_math.h"
#include "runtime/helpers/debug_helpers.h"
namespace AubMemDump {
// Mask that clears the low 12 bits of an address (4096-byte granularity).
const uint64_t g_pageMask = ~(4096ull - 1);
// 65536 == 2^16.
// NOTE(review): presumably bounded by the 16-bit DwordLength header field — confirm.
const size_t g_dwordCountMax = 65536;
// Some page table constants used in virtualizing the page tables.
// Each num* constant is a power of two derived from the bit counts in the
// PageTableTraits enum; each size* constant is the matching entry count
// multiplied by sizeof(uint64_t) (one 64-bit entry per slot).
// clang-format off
// 32 bit page table traits
const uint64_t PageTableTraits<32>::physicalMemory = 0; // 1ull <<addressingBits;
const uint64_t PageTableTraits<32>::numPTEntries = BIT(PageTableTraits<32>::addressingBits - PageTableTraits<32>::NUM_OFFSET_BITS);
const uint64_t PageTableTraits<32>::sizePT = BIT(PageTableTraits<32>::addressingBits - PageTableTraits<32>::NUM_OFFSET_BITS) * sizeof(uint64_t);
const uint64_t PageTableTraits<32>::ptBaseAddress = BIT(38);
const uint64_t PageTableTraits<32>::numPDEntries = BIT(PageTableTraits<32>::addressingBits - PageTableTraits<32>::NUM_OFFSET_BITS - PageTableTraits<32>::NUM_PTE_BITS);
const uint64_t PageTableTraits<32>::sizePD = BIT(PageTableTraits<32>::addressingBits - PageTableTraits<32>::NUM_OFFSET_BITS - PageTableTraits<32>::NUM_PTE_BITS) * sizeof(uint64_t);
const uint64_t PageTableTraits<32>::pdBaseAddress = BIT(37);
const uint64_t PageTableTraits<32>::numPDPEntries = BIT(PageTableTraits<32>::addressingBits - PageTableTraits<32>::NUM_OFFSET_BITS - PageTableTraits<32>::NUM_PTE_BITS - PageTableTraits<32>::NUM_PDE_BITS);
const uint64_t PageTableTraits<32>::sizePDP = BIT(PageTableTraits<32>::addressingBits - PageTableTraits<32>::NUM_OFFSET_BITS - PageTableTraits<32>::NUM_PTE_BITS - PageTableTraits<32>::NUM_PDE_BITS) * sizeof(uint64_t);
const uint64_t PageTableTraits<32>::pdpBaseAddress = BIT(36);
// 48 bit page table traits
// The PT/PD/PDP base addresses are the same values as in the 32 bit traits.
const uint64_t PageTableTraits<48>::physicalMemory = 0; // 1ull <<addressingBits;
const uint64_t PageTableTraits<48>::numPTEntries = BIT(PageTableTraits<48>::addressingBits - PageTableTraits<48>::NUM_OFFSET_BITS);
const uint64_t PageTableTraits<48>::sizePT = BIT(PageTableTraits<48>::addressingBits - PageTableTraits<48>::NUM_OFFSET_BITS) * sizeof(uint64_t);
const uint64_t PageTableTraits<48>::ptBaseAddress = BIT(38);
const uint64_t PageTableTraits<48>::numPDEntries = BIT(PageTableTraits<48>::addressingBits - PageTableTraits<48>::NUM_OFFSET_BITS - PageTableTraits<48>::NUM_PTE_BITS);
const uint64_t PageTableTraits<48>::sizePD = BIT(PageTableTraits<48>::addressingBits - PageTableTraits<48>::NUM_OFFSET_BITS - PageTableTraits<48>::NUM_PTE_BITS) * sizeof(uint64_t);
const uint64_t PageTableTraits<48>::pdBaseAddress = BIT(37);
const uint64_t PageTableTraits<48>::numPDPEntries = BIT(PageTableTraits<48>::addressingBits - PageTableTraits<48>::NUM_OFFSET_BITS - PageTableTraits<48>::NUM_PTE_BITS - PageTableTraits<48>::NUM_PDE_BITS);
const uint64_t PageTableTraits<48>::sizePDP = BIT(PageTableTraits<48>::addressingBits - PageTableTraits<48>::NUM_OFFSET_BITS - PageTableTraits<48>::NUM_PTE_BITS - PageTableTraits<48>::NUM_PDE_BITS) * sizeof(uint64_t);
const uint64_t PageTableTraits<48>::pdpBaseAddress = BIT(36);
// NUM_PML4_BITS is found unqualified because these are member definitions of PageTableTraits<48>.
const uint64_t PageTableTraits<48>::numPML4Entries = BIT(NUM_PML4_BITS);
const uint64_t PageTableTraits<48>::sizePML4 = BIT(NUM_PML4_BITS) * sizeof(uint64_t);
const uint64_t PageTableTraits<48>::pml4BaseAddress = BIT(35);
// clang-format on
// Stores the ring-tail register pair in the context image at pLRCIn:
// first the register address (mmioBase + 0x2030), then the ringTail value.
void LrcaHelper::setRingTail(void *pLRCIn, uint32_t ringTail) const {
    auto slot = ptrOffset(reinterpret_cast<uint32_t *>(pLRCIn),
                          offsetContext + offsetRingRegisters + offsetRingTail);
    slot[0] = mmioBase + 0x2030;
    slot[1] = ringTail;
}
// Stores the ring-head register pair in the context image at pLRCIn:
// first the register address (mmioBase + 0x2034), then the ringHead value.
void LrcaHelper::setRingHead(void *pLRCIn, uint32_t ringHead) const {
    auto slot = ptrOffset(reinterpret_cast<uint32_t *>(pLRCIn),
                          offsetContext + offsetRingRegisters + offsetRingHead);
    slot[0] = mmioBase + 0x2034;
    slot[1] = ringHead;
}
// Stores the ring-base register pair in the context image at pLRCIn:
// first the register address (mmioBase + 0x2038), then the ringBase value.
void LrcaHelper::setRingBase(void *pLRCIn, uint32_t ringBase) const {
    auto slot = ptrOffset(reinterpret_cast<uint32_t *>(pLRCIn),
                          offsetContext + offsetRingRegisters + offsetRingBase);
    slot[0] = mmioBase + 0x2038;
    slot[1] = ringBase;
}
// Stores the ring-control register pair in the context image at pLRCIn:
// first the register address (mmioBase + 0x203c), then the ringCtrl value.
void LrcaHelper::setRingCtrl(void *pLRCIn, uint32_t ringCtrl) const {
    auto slot = ptrOffset(reinterpret_cast<uint32_t *>(pLRCIn),
                          offsetContext + offsetRingRegisters + offsetRingCtrl);
    slot[0] = mmioBase + 0x203c;
    slot[1] = ringCtrl;
}
// Stores the PDP0 entry in the context image at pLRCIn as two register pairs:
// (mmioBase + 0x2274, upper 32 bits of address) followed by
// (mmioBase + 0x2270, lower 32 bits of address).
void LrcaHelper::setPDP0(void *pLRCIn, uint64_t address) const {
    auto slot = ptrOffset(reinterpret_cast<uint32_t *>(pLRCIn),
                          offsetContext + offsetPageTableRegisters + offsetPDP0);
    slot[0] = mmioBase + 0x2274;
    slot[1] = address >> 32;
    slot[2] = mmioBase + 0x2270;
    slot[3] = address & 0xffffffff;
}
// Stores the PDP1 entry in the context image at pLRCIn as two register pairs:
// (mmioBase + 0x227c, upper 32 bits of address) followed by
// (mmioBase + 0x2278, lower 32 bits of address).
void LrcaHelper::setPDP1(void *pLRCIn, uint64_t address) const {
    auto slot = ptrOffset(reinterpret_cast<uint32_t *>(pLRCIn),
                          offsetContext + offsetPageTableRegisters + offsetPDP1);
    slot[0] = mmioBase + 0x227c;
    slot[1] = address >> 32;
    slot[2] = mmioBase + 0x2278;
    slot[3] = address & 0xffffffff;
}
// Stores the PDP2 entry in the context image at pLRCIn as two register pairs:
// (mmioBase + 0x2284, upper 32 bits of address) followed by
// (mmioBase + 0x2280, lower 32 bits of address).
void LrcaHelper::setPDP2(void *pLRCIn, uint64_t address) const {
    auto slot = ptrOffset(reinterpret_cast<uint32_t *>(pLRCIn),
                          offsetContext + offsetPageTableRegisters + offsetPDP2);
    slot[0] = mmioBase + 0x2284;
    slot[1] = address >> 32;
    slot[2] = mmioBase + 0x2280;
    slot[3] = address & 0xffffffff;
}
// Stores the PDP3 entry in the context image at pLRCIn as two register pairs:
// (mmioBase + 0x228c, upper 32 bits of address) followed by
// (mmioBase + 0x2288, lower 32 bits of address).
void LrcaHelper::setPDP3(void *pLRCIn, uint64_t address) const {
    auto slot = ptrOffset(reinterpret_cast<uint32_t *>(pLRCIn),
                          offsetContext + offsetPageTableRegisters + offsetPDP3);
    slot[0] = mmioBase + 0x228c;
    slot[1] = address >> 32;
    slot[2] = mmioBase + 0x2288;
    slot[3] = address & 0xffffffff;
}
// Sets the PML4 address by forwarding to setPDP0: the PML4 pointer is stored
// in the same context-image slots as PDP0.
void LrcaHelper::setPML4(void *pLRCIn, uint64_t address) const {
setPDP0(pLRCIn, address);
}
void LrcaHelper::initialize(void *pLRCIn) const {
auto pLRCABase = reinterpret_cast<uint32_t *>(pLRCIn);
// Initialize to known but benign garbage
for (size_t i = 0; i < sizeLRCA / sizeof(uint32_t); i++) {
pLRCABase[i] = 0x1;
}
auto pLRCA = ptrOffset(pLRCABase, offsetContext);
// Initialize the ring context of the LRCA
auto pLRI = ptrOffset(pLRCA, offsetLRI0);
auto numRegs = numRegsLRI0;
*pLRI++ = 0x11001000 | (2 * numRegs - 1);
while (numRegs-- > 0) {
*pLRI++ = mmioBase + 0x2244; // CTXT_SR_CTL
*pLRI++ = 0x00010001; // Inhibit context-restore
}
// Initialize the other LRI
DEBUG_BREAK_IF(offsetLRI1 != 0x21 * sizeof(uint32_t));
pLRI = ptrOffset(pLRCA, offsetLRI1);
numRegs = numRegsLRI1;
*pLRI++ = 0x11001000 | (2 * numRegs - 1);
while (numRegs-- > 0) {
*pLRI++ = mmioBase + 0x2094; // NOP ID
*pLRI++ = 0x00000000;
}
DEBUG_BREAK_IF(offsetLRI2 != 0x41 * sizeof(uint32_t));
pLRI = ptrOffset(pLRCA, offsetLRI2);
numRegs = numRegsLRI2;
*pLRI++ = 0x11000000 | (2 * numRegs - 1);
while (numRegs-- > 0) {
*pLRI++ = mmioBase + 0x2094; // NOP ID
*pLRI++ = 0x00000000;
}
setRingHead(pLRCIn, 0);
setRingTail(pLRCIn, 0);
setRingBase(pLRCIn, 0);
setRingCtrl(pLRCIn, 0);
setPDP0(pLRCIn, 0);
setPDP1(pLRCIn, 0);
setPDP2(pLRCIn, 0);
setPDP3(pLRCIn, 0);
}
}

View File

@@ -0,0 +1,382 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include <cstdio>
#include <cstdint>
#include <fstream>
#ifndef BIT
#define BIT(x) (((uint64_t)1) << (x))
#endif
namespace AubMemDump {
#include "aub_services.h"
// Generic form: the command has a single `address` member that receives the
// 64-bit value as-is.
template <typename Cmd>
inline void setAddress(Cmd &cmd, uint64_t address) {
    cmd.address = address;
}

// CmdServicesMemTraceMemoryCompare splits the address into two 32-bit
// members: `address` receives the low dword, `addressHigh` the high dword.
template <>
inline void setAddress(CmdServicesMemTraceMemoryCompare &cmd, uint64_t address) {
    const auto lowDword = static_cast<uint32_t>(address);
    const auto highDword = static_cast<uint32_t>(address >> 32);
    cmd.address = lowDword;
    cmd.addressHigh = highDword;
}
// Compile-time type switch: TypeSelector<A, B, flag>::type is A when flag is
// true and B when flag is false.
template <typename TypeTrue, typename TypeFalse, bool is32Bits>
struct TypeSelector {
    using type = TypeTrue;
};

// Partial specialization chosen when the flag is false.
template <typename TypeTrue, typename TypeFalse>
struct TypeSelector<TypeTrue, TypeFalse, false> {
    using type = TypeFalse;
};
// One 64-bit page table entry, viewable as individual bit fields
// (pageConfig), as two dwords (dwordData) or as a single qword (uiData).
// The bracketed ranges on each field give its bit position; the field widths
// add up to 64.
union IAPageTableEntry {
struct
{
uint64_t Present : 1; //[0]
uint64_t Writable : 1; //[1]
uint64_t UserSupervisor : 1; //[2]
uint64_t PWT : 1; //[3]
uint64_t PCD : 1; //[4]
uint64_t Accessed : 1; //[5]
uint64_t Dirty : 1; //[6]
uint64_t PAT : 1; //[7]
uint64_t Global : 1; //[8]
uint64_t Reserved_11_9 : 3; //[11:9]
uint64_t PhysicalAddress : 27; //[38:12]
uint64_t Reserved_51_39 : 13; //[51:39]
uint64_t Ignored : 11; //[62:52]
uint64_t ExecuteDisable : 1; //[63]
} pageConfig;
uint32_t dwordData[2];
uint64_t uiData;
};
// GTT entries use the same layout.
typedef IAPageTableEntry MiGttEntry;
// Resets `entry` to zero and then fills it for the page containing `address`:
// the page frame number (address / 4096) goes into PhysicalAddress and the
// Present, Writable and UserSupervisor bits are set. PhysicalAddress is a
// 27-bit field, so higher bits of the frame number are not stored.
static inline void setGttEntry(IAPageTableEntry &entry, uint64_t address) {
    entry.uiData = 0;

    auto &config = entry.pageConfig;
    config.Present = true;
    config.Writable = true;
    config.UserSupervisor = true;
    config.PhysicalAddress = address / 4096;
}
// Namespace-level shorthands for the value enumerations nested in the
// command structures (always taken from the latest command definitions).
using DeviceValues = CmdServicesMemTraceVersion::DeviceValues;
using SteppingValues = CmdServicesMemTraceVersion::SteppingValues;

using AddressSpaceValues = CmdServicesMemTraceMemoryWrite::AddressSpaceValues;
using DataTypeHintValues = CmdServicesMemTraceMemoryWrite::DataTypeHintValues;
using RepeatMemoryValues = CmdServicesMemTraceMemoryWrite::RepeatMemoryValues;

using TilingValues = CmdServicesMemTraceMemoryDump::TilingValues;

using MessageSourceIdValues = CmdServicesMemTraceRegisterWrite::MessageSourceIdValues;
using RegisterSizeValues = CmdServicesMemTraceRegisterWrite::RegisterSizeValues;
using RegisterSpaceValues = CmdServicesMemTraceRegisterWrite::RegisterSpaceValues;

using DataSizeValues = CmdServicesMemTraceMemoryPoll::DataSizeValues;
// Compile-time description of a dump target: exposes the two template
// arguments as the enumerators `device` and `addressingBits`, and names the
// stream type used for output.
template <int deviceIn, int addressingBitsIn>
struct Traits {
// `struct AubStream` is an elaborated type specifier: AubStream is only
// defined further down in this header.
typedef struct AubStream Stream;
enum {
addressingBits = addressingBitsIn,
device = deviceIn
};
};
// Abstract interface for emitting AUB trace commands. Concrete streams
// (AubFileStream in this header) implement the pure virtual operations.
struct AubStream {
    // Virtual so that destroying a concrete stream through an AubStream
    // pointer runs the derived destructor; AubFileStream, for example, owns
    // a std::ofstream that must be destroyed.
    virtual ~AubStream() = default;

    virtual void open(const char *filePath) = 0;
    virtual void close() = 0;
    virtual bool init(uint32_t stepping, uint32_t device) = 0;

    // Optional hook: the default implementation ignores the command.
    virtual void createContext(const AubPpgttContextCreate &cmd) {}

    virtual void writeMemory(uint64_t physAddress, const void *memory, size_t sizeToDumpThisIteration, uint32_t addressSpace, uint32_t hint) = 0;
    virtual void writeMemoryWriteHeader(uint64_t physAddress, size_t size, uint32_t addressSpace, uint32_t hint) = 0;

    // Convenience overload: forwards to the four-argument version with the
    // TraceNotype data-type hint.
    virtual void writeMemoryWriteHeader(uint64_t physAddress, size_t size, uint32_t addressSpace) {
        return writeMemoryWriteHeader(physAddress, size, addressSpace, CmdServicesMemTraceMemoryWrite::DataTypeHintValues::TraceNotype);
    }

    virtual void writePTE(uint64_t physAddress, uint64_t entry) = 0;
    virtual void writeGTT(uint32_t offset, uint64_t entry) = 0;
    virtual void writeMMIO(uint32_t offset, uint32_t value) = 0;
    virtual void registerPoll(uint32_t registerOffset, uint32_t mask, uint32_t value, bool pollNotEqual, uint32_t timeoutAction) = 0;
};
// AubStream implementation backed by a std::ofstream. It overrides every
// AubStream operation except the three-argument writeMemoryWriteHeader
// convenience overload, and adds expectMemory() and addComment(). The member
// functions are defined outside this header.
struct AubFileStream : public AubStream {
void open(const char *filePath) override;
void close() override;
bool init(uint32_t stepping, uint32_t device) override;
void createContext(const AubPpgttContextCreate &cmd) override;
void writeMemory(uint64_t physAddress, const void *memory, size_t size, uint32_t addressSpace, uint32_t hint) override;
void writeMemoryWriteHeader(uint64_t physAddress, size_t size, uint32_t addressSpace, uint32_t hint) override;
void writePTE(uint64_t physAddress, uint64_t entry) override;
void writeGTT(uint32_t offset, uint64_t entry) override;
void writeMMIO(uint32_t offset, uint32_t value) override;
void registerPoll(uint32_t registerOffset, uint32_t mask, uint32_t value, bool pollNotEqual, uint32_t timeoutAction) override;
// Additional operations not declared in AubStream.
void expectMemory(uint64_t physAddress, const void *memory, size_t size);
void addComment(const char *message);
// Output stream owned by this object.
// NOTE(review): presumably opened by open() — the definitions are not visible here.
std::ofstream fileHandle;
};
// Primary template: intentionally empty. Only the explicit specializations
// for 32 and 48 addressing bits below provide constants.
template <int addressingBits>
struct PageTableTraits {
};
// Page table constants for 32-bit addressing. The enum splits the 32 address
// bits into 12 offset bits, 9 PTE bits, 9 PDE bits and the remaining
// 32 - 9 - 9 - 12 = 2 PDP bits. The static constants are only declared here;
// they are defined in a separate source file.
template <>
struct PageTableTraits<32> {
// clang-format off
enum {
addressingBits = 32,
NUM_OFFSET_BITS = 12,
NUM_PTE_BITS = 9,
NUM_PDE_BITS = 9,
NUM_PDP_BITS = addressingBits - NUM_PDE_BITS - NUM_PTE_BITS - NUM_OFFSET_BITS,
};
static const uint64_t physicalMemory;
static const uint64_t numPTEntries;
static const uint64_t sizePT;
static const uint64_t ptBaseAddress;
static const uint64_t numPDEntries;
static const uint64_t sizePD;
static const uint64_t pdBaseAddress;
static const uint64_t numPDPEntries;
static const uint64_t sizePDP;
static const uint64_t pdpBaseAddress;
// clang-format on
};
// Page table constants for 48-bit addressing. The offset/PTE/PDE/PDP bit
// counts are reused from PageTableTraits<32> (12/9/9/2), which leaves
// 48 - 2 - 9 - 9 - 12 = 16 bits for NUM_PML4_BITS. Compared with the 32-bit
// traits this adds the three PML4 constants. The static constants are only
// declared here; they are defined in a separate source file.
template <>
struct PageTableTraits<48> {
// clang-format off
enum {
addressingBits = 48,
NUM_OFFSET_BITS = PageTableTraits<32>::NUM_OFFSET_BITS,
NUM_PTE_BITS = PageTableTraits<32>::NUM_PTE_BITS,
NUM_PDE_BITS = PageTableTraits<32>::NUM_PDE_BITS,
NUM_PDP_BITS = PageTableTraits<32>::NUM_PDP_BITS,
NUM_PML4_BITS = addressingBits - NUM_PDP_BITS - NUM_PDE_BITS - NUM_PTE_BITS - NUM_OFFSET_BITS
};
static const uint64_t physicalMemory;
static const uint64_t numPTEntries;
static const uint64_t sizePT;
static const uint64_t ptBaseAddress;
static const uint64_t numPDEntries;
static const uint64_t sizePD;
static const uint64_t pdBaseAddress;
static const uint64_t numPDPEntries;
static const uint64_t sizePDP;
static const uint64_t pdpBaseAddress;
static const uint64_t numPML4Entries;
static const uint64_t sizePML4;
static const uint64_t pml4BaseAddress;
// clang-format on
};
// Address helpers shared by the 32- and 64-bit page table helpers. The
// PageTableTraits specialization is selected by Traits::addressingBits.
template <typename Traits>
struct AubPageTableHelper {
    typedef AubMemDump::PageTableTraits<Traits::addressingBits> PageTableTraits;

    enum {
        addressingBits = Traits::addressingBits
    };

    // Converts a host pointer to a GGTT address by keeping only the low
    // 32 bits of its integer value.
    static inline uint32_t ptrToGGTT(const void *memory) {
        const auto pointerValue = reinterpret_cast<uintptr_t>(memory);
        return static_cast<uint32_t>(pointerValue);
    }

    // Converts a host pointer to a PPGTT address: the full integer value of
    // the pointer.
    static inline uintptr_t ptrToPPGTT(const void *memory) {
        const auto pointerValue = reinterpret_cast<uintptr_t>(memory);
        return pointerValue;
    }

    // Address of page table entry number ptIndex (8 bytes per entry).
    static inline uint64_t getPTEAddress(uint64_t ptIndex) {
        const uint64_t byteOffset = ptIndex * sizeof(uint64_t);
        return PageTableTraits::ptBaseAddress + byteOffset;
    }

    // Address of page directory entry number pdIndex (8 bytes per entry).
    static inline uint64_t getPDEAddress(uint64_t pdIndex) {
        const uint64_t byteOffset = pdIndex * sizeof(uint64_t);
        return PageTableTraits::pdBaseAddress + byteOffset;
    }

    // Address of page directory pointer entry number pdpIndex (8 bytes per entry).
    static inline uint64_t getPDPAddress(uint64_t pdpIndex) {
        const uint64_t byteOffset = pdpIndex * sizeof(uint64_t);
        return PageTableTraits::pdpBaseAddress + byteOffset;
    }
};
// Page table helper for 32-bit addressing: combines the shared address
// helpers with the PageTableTraits<32> constants. The three static functions
// are only declared here; their definitions are not part of this excerpt.
template <typename Traits>
struct AubPageTableHelper32 : public AubPageTableHelper<Traits>, PageTableTraits<32> {
typedef AubPageTableHelper<Traits> BaseClass;
static void createContext(typename Traits::Stream &stream, uint32_t context);
static uint64_t reserveAddressPPGTT(typename Traits::Stream &stream, uintptr_t gfxAddress, size_t blockSize, uint64_t physAddress);
static void fixupLRC(uint8_t *pLrc);
};
// Page table helper for 48-bit addressing: combines the shared address
// helpers with the PageTableTraits<48> constants and adds PML4 addressing.
// createContext, reserveAddressPPGTT and fixupLRC are only declared here.
template <typename Traits>
struct AubPageTableHelper64 : public AubPageTableHelper<Traits>, PageTableTraits<48> {
    typedef AubPageTableHelper<Traits> BaseClass;

    // Address of PML4 entry number pml4Index (8 bytes per entry).
    static inline uint64_t getPML4Address(uint64_t pml4Index) {
        const uint64_t byteOffset = pml4Index * sizeof(uint64_t);
        return pml4BaseAddress + byteOffset;
    }

    static void createContext(typename Traits::Stream &stream, uint32_t context);
    static uint64_t reserveAddressPPGTT(typename Traits::Stream &stream, uintptr_t gfxAddress, size_t blockSize, uint64_t physAddress);
    static void fixupLRC(uint8_t *pLrc);
};
// Dump front end for one Traits configuration. TypeSelector picks the base
// class: AubPageTableHelper32 when TraitsIn::addressingBits == 32, otherwise
// AubPageTableHelper64. The static functions are only declared here.
template <typename TraitsIn>
struct AubDump : public TypeSelector<AubPageTableHelper32<TraitsIn>, AubPageTableHelper64<TraitsIn>, TraitsIn::addressingBits == 32>::type {
typedef TraitsIn Traits;
// uint32_t for 32-bit addressing, uint64_t otherwise.
typedef typename TypeSelector<uint32_t, uint64_t, Traits::addressingBits == 32>::type AddressType;
// Same selection as the base class list above.
typedef typename TypeSelector<AubPageTableHelper32<Traits>, AubPageTableHelper64<Traits>, Traits::addressingBits == 32>::type BaseHelper;
typedef typename Traits::Stream Stream;
// 64-bit context descriptor, viewable as bit fields (sData), two dwords
// (ulData) or one qword (qwordData). Bit positions are given per field.
typedef union _MiContextDescriptorReg_ {
struct {
uint64_t Valid : 1; //[0]
uint64_t ForcePageDirRestore : 1; //[1]
uint64_t ForceRestore : 1; //[2]
uint64_t Legacy : 1; //[3]
uint64_t ADor64bitSupport : 1; //[4] Selects 64-bit PPGTT in Legacy mode
uint64_t LlcCoherencySupport : 1; //[5]
uint64_t FaultSupport : 2; //[7:6]
uint64_t PrivilegeAccessOrPPGTT : 1; //[8] Selects PPGTT in Legacy mode
uint64_t FunctionType : 3; //[11:9]
uint64_t LogicalRingCtxAddress : 20; //[31:12]
uint64_t ContextID : 32; //[63:32]
} sData;
uint32_t ulData[2];
uint64_t qwordData[2 / 2];
} MiContextDescriptorReg, *pMiContextDescriptorReg;
// Write a block of memory to a given address space using an optional hint
static void addMemoryWrite(Stream &stream, uint64_t addr, const void *memory, size_t blockSize, int addressSpace, int hint = DataTypeHintValues::TraceNotype);
// Two overloads: the target is given either as a 32-bit address or as a host pointer.
static uint64_t reserveAddressGGTT(Stream &stream, uint32_t addr, size_t size, uint64_t physStart);
static uint64_t reserveAddressGGTT(Stream &stream, const void *memory, size_t size, uint64_t physStart);
private:
// NOTE(review): presumably the shared implementation behind the public reserveAddressGGTT overloads — confirm in the definition.
static uint64_t reserveAddress(Stream &stream, uint32_t addr, size_t size, unsigned int addressSpace /* = AddressSpaceValues::TraceGttEntry*/, uint64_t physStart);
};
// Describes the layout of a logical ring context (LRC) image for one engine:
// AUB data-type hints, image size/alignment and the byte offsets of the fields that
// the set*() members patch. The defaults below carry the name "XCS"; the derived
// LrcaHelper* structs override individual fields in their constructors.
// Several initialisers depend on earlier members, so declaration order matters.
struct LrcaHelper {
// Stores the given MMIO base; every other member keeps its default initialiser.
LrcaHelper(uint32_t base) : mmioBase(base) {
}
// AUB data-type hint values for the context image, command buffers and batch buffers.
int aubHintLRCA = DataTypeHintValues::TraceNotype;
int aubHintCommandBuffer = DataTypeHintValues::TraceCommandBuffer;
int aubHintBatchBuffer = DataTypeHintValues::TraceBatchBuffer;
const char *name = "XCS";
uint32_t mmioBase = 0;
// Size and alignment of the context image, in bytes.
size_t sizeLRCA = 0x2000;
uint32_t alignLRCA = 0x1000;
uint32_t offsetContext = 0x1000;
// Three consecutive LRI groups. Each following offset is computed as: previous offset
// + (1 header dword + 2 dwords per register + trailing noops) * sizeof(uint32_t).
uint32_t offsetLRI0 = 0x01 * sizeof(uint32_t);
uint32_t numRegsLRI0 = 14;
uint32_t numNoops0 = 3;
uint32_t offsetLRI1 = offsetLRI0 + (1 + numRegsLRI0 * 2 + numNoops0) * sizeof(uint32_t); //offsetLRI1 == 0x21 * sizeof(uint32_t);
uint32_t numRegsLRI1 = 9;
uint32_t numNoops1 = 13;
uint32_t offsetLRI2 = offsetLRI1 + (1 + numRegsLRI1 * 2 + numNoops1) * sizeof(uint32_t); //offsetLRI2 == 0x41 * sizeof(uint32_t);
uint32_t numRegsLRI2 = 1;
// Ring register block: starts 3 dwords past offsetLRI0; the offsetRing* values are
// byte offsets (presumably relative to offsetRingRegisters - confirm in the setters).
uint32_t offsetRingRegisters = offsetLRI0 + (3 * sizeof(uint32_t));
uint32_t offsetRingHead = 0x0 * sizeof(uint32_t);
uint32_t offsetRingTail = 0x2 * sizeof(uint32_t);
uint32_t offsetRingBase = 0x4 * sizeof(uint32_t);
uint32_t offsetRingCtrl = 0x6 * sizeof(uint32_t);
// Page-table register block: starts 3 dwords past offsetLRI1. Note the PDP offsets
// are in descending order (PDP3 first, PDP0 last).
uint32_t offsetPageTableRegisters = offsetLRI1 + (3 * sizeof(uint32_t));
uint32_t offsetPDP0 = 0xc * sizeof(uint32_t);
uint32_t offsetPDP1 = 0x8 * sizeof(uint32_t);
uint32_t offsetPDP2 = 0x4 * sizeof(uint32_t);
uint32_t offsetPDP3 = 0x0 * sizeof(uint32_t);
// Helpers operating on a caller-provided context image (pLRCIn). They are only
// declared here; the definitions are elsewhere in the project.
void initialize(void *pLRCIn) const;
void setRingHead(void *pLRCIn, uint32_t ringHead) const;
void setRingTail(void *pLRCIn, uint32_t ringTail) const;
void setRingBase(void *pLRCIn, uint32_t ringBase) const;
void setRingCtrl(void *pLRCIn, uint32_t ringCtrl) const;
void setPDP0(void *pLRCIn, uint64_t address) const;
void setPDP1(void *pLRCIn, uint64_t address) const;
void setPDP2(void *pLRCIn, uint64_t address) const;
void setPDP3(void *pLRCIn, uint64_t address) const;
void setPML4(void *pLRCIn, uint64_t address) const;
};
// "RCS" variant of LrcaHelper: overrides the name, the three AUB hints and uses a
// larger context image (0x11000 bytes) than the LrcaHelper default.
struct LrcaHelperRcs : public LrcaHelper {
    LrcaHelperRcs(uint32_t base) : LrcaHelper(base) {
        name = "RCS";
        sizeLRCA = 0x11000;
        aubHintLRCA = DataTypeHintValues::TraceLogicalRingContextRcs;
        aubHintBatchBuffer = DataTypeHintValues::TraceBatchBufferPrimary;
        aubHintCommandBuffer = DataTypeHintValues::TraceCommandBufferPrimary;
    }
};
// "BCS" variant of LrcaHelper: overrides the name and the three AUB hints; the
// context image size stays at the LrcaHelper default.
struct LrcaHelperBcs : public LrcaHelper {
    LrcaHelperBcs(uint32_t base) : LrcaHelper(base) {
        name = "BCS";
        aubHintLRCA = DataTypeHintValues::TraceLogicalRingContextBcs;
        aubHintBatchBuffer = DataTypeHintValues::TraceBatchBufferBlt;
        aubHintCommandBuffer = DataTypeHintValues::TraceCommandBufferBlt;
    }
};
// "VCS" variant of LrcaHelper: overrides the name and the three AUB hints; the
// context image size stays at the LrcaHelper default.
struct LrcaHelperVcs : public LrcaHelper {
    LrcaHelperVcs(uint32_t base) : LrcaHelper(base) {
        name = "VCS";
        aubHintLRCA = DataTypeHintValues::TraceLogicalRingContextVcs;
        aubHintBatchBuffer = DataTypeHintValues::TraceBatchBufferMfx;
        aubHintCommandBuffer = DataTypeHintValues::TraceCommandBufferMfx;
    }
};
// "VECS" variant of LrcaHelper: only the name and the context-image hint differ from
// the defaults; command/batch buffer hints are inherited unchanged.
struct LrcaHelperVecs : public LrcaHelper {
    LrcaHelperVecs(uint32_t base) : LrcaHelper(base) {
        name = "VECS";
        aubHintLRCA = DataTypeHintValues::TraceLogicalRingContextVecs;
    }
};
extern const uint64_t g_pageMask;
extern const size_t g_dwordCountMax;
}

View File

@@ -0,0 +1,328 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "config.h"
#include "aub_mem_dump.h"
#include "runtime/helpers/debug_helpers.h"
#include "runtime/helpers/ptr_math.h"
#include <algorithm>
#include <cstring>
namespace AubMemDump {
// Patches the four page-directory addresses into a context image in place.
// For PDE indices 0x600, 0x400, 0x200 and 0 (in that order) the upper 32 bits of
// BaseClass::getPDEAddress() are stored first and the lower 32 bits 8 bytes later,
// covering byte offsets 0x1094 through 0x10cc of pLRC.
template <typename Traits>
void AubPageTableHelper32<Traits>::fixupLRC(uint8_t *pLRC) {
    // Stores one dword at the given byte offset inside the image.
    auto storeDword = [pLRC](size_t byteOffset, uint32_t value) {
        *reinterpret_cast<uint32_t *>(pLRC + byteOffset) = value;
    };

    storeDword(0x1094, static_cast<uint32_t>(BaseClass::getPDEAddress(0x600) >> 32));
    storeDword(0x109c, static_cast<uint32_t>(BaseClass::getPDEAddress(0x600) & 0xffffffff));

    storeDword(0x10a4, static_cast<uint32_t>(BaseClass::getPDEAddress(0x400) >> 32));
    storeDword(0x10ac, static_cast<uint32_t>(BaseClass::getPDEAddress(0x400) & 0xffffffff));

    storeDword(0x10b4, static_cast<uint32_t>(BaseClass::getPDEAddress(0x200) >> 32));
    storeDword(0x10bc, static_cast<uint32_t>(BaseClass::getPDEAddress(0x200) & 0xffffffff));

    storeDword(0x10c4, static_cast<uint32_t>(BaseClass::getPDEAddress(0) >> 32));
    storeDword(0x10cc, static_cast<uint32_t>(BaseClass::getPDEAddress(0) & 0xffffffff));
}
// Patches the PML4 table address into a context image in place: the upper 32 bits of
// getPML4Address(0) go to byte offset 0x10c4 and the lower 32 bits to 0x10cc.
template <typename Traits>
void AubPageTableHelper64<Traits>::fixupLRC(uint8_t *pLRC) {
    auto *const upperDword = reinterpret_cast<uint32_t *>(pLRC + 0x10c4);
    auto *const lowerDword = reinterpret_cast<uint32_t *>(pLRC + 0x10cc);

    *upperDword = static_cast<uint32_t>(getPML4Address(0) >> 32);
    *lowerDword = static_cast<uint32_t>(getPML4Address(0) & 0xffffffff);
}
// Write a block of memory to a given address space using an optional hint.
//
// The block is split into chunks no larger than what one
// CmdServicesMemTraceMemoryWrite can carry and each chunk is handed to
// stream.writeMemory() with a correspondingly advanced address.
//
// When hint is one of the TraceLogicalRingContext* values, the buffer is treated as a
// context image and BaseHelper::fixupLRC() patches page-table addresses into it
// BEFORE it is written. Note that this modifies the caller's buffer even though it is
// passed as `const void *`.
template <typename Traits>
void AubDump<Traits>::addMemoryWrite(typename Traits::Stream &stream, uint64_t addr, const void *memory, size_t sizeRemaining, int addressSpace, int hint) {
    // We can only dump a relatively small amount per CmdServicesMemTraceMemoryWrite
    auto sizeMemoryWriteHeader = sizeof(CmdServicesMemTraceMemoryWrite) - sizeof(CmdServicesMemTraceMemoryWrite::data);
    auto blockSizeMax = g_dwordCountMax * sizeof(uint32_t) - sizeMemoryWriteHeader;

    if (hint == CmdServicesMemTraceMemoryWrite::DataTypeHintValues::TraceLogicalRingContextRcs ||
        hint == CmdServicesMemTraceMemoryWrite::DataTypeHintValues::TraceLogicalRingContextBcs ||
        hint == CmdServicesMemTraceMemoryWrite::DataTypeHintValues::TraceLogicalRingContextVcs ||
        hint == CmdServicesMemTraceMemoryWrite::DataTypeHintValues::TraceLogicalRingContextVecs) {
        // fixupLRC() stores a full 32-bit value at byte offset 0x10cc, so the image
        // has to extend at least to 0x10cc + sizeof(uint32_t).
        DEBUG_BREAK_IF(sizeRemaining < 0x10cc + sizeof(uint32_t));

        uint8_t *pLRC = reinterpret_cast<uint8_t *>(const_cast<void *>(memory));
        BaseHelper::fixupLRC(pLRC);
    }

    // loop to dump all of the blocks
    while (sizeRemaining > 0) {
        auto sizeThisIteration = std::min(blockSizeMax, sizeRemaining);
        stream.writeMemory(addr, memory, sizeThisIteration, addressSpace, hint);

        sizeRemaining -= sizeThisIteration;
        memory = static_cast<const uint8_t *>(memory) + sizeThisIteration;
        addr += sizeThisIteration;
    }
}
// Reserve memory in the GGTT.
// Emits one memory-write header followed by one GTT entry per 4096-byte page touched
// by [addr, addr + size). Consecutive entries point at consecutive physical pages
// starting at physStart. Returns physStart.
template <typename Traits>
uint64_t AubDump<Traits>::reserveAddress(typename Traits::Stream &stream, uint32_t addr, size_t size, unsigned int addressSpace, uint64_t physStart) {
    auto firstPage = addr & g_pageMask;
    auto lastPage = (addr + size - 1) & g_pageMask;
    auto numPages = static_cast<uint32_t>(((lastPage - firstPage) / 4096) + 1);

    // Can only handle 16 bits of dwordCount.
    DEBUG_BREAK_IF(!(numPages > 0 && (numPages + 4) < 65536));

    // Byte offset of the first entry inside the GTT.
    auto gttTableOffset = static_cast<uint32_t>((static_cast<uint32_t>(firstPage) / 4096) * sizeof(MiGttEntry));

    // Write header
    stream.writeMemoryWriteHeader(gttTableOffset,
                                  numPages * sizeof(AubMemDump::MiGttEntry),
                                  addressSpace,
                                  AubMemDump::CmdServicesMemTraceMemoryWrite::DataTypeHintValues::TraceNotype);

    // Write one entry per page
    uint64_t physPage = physStart;
    for (auto page = firstPage; page <= lastPage; page += 4096) {
        MiGttEntry entry;
        setGttEntry(entry, physPage);
        stream.writeGTT(gttTableOffset, entry.uiData);

        gttTableOffset += sizeof(entry);
        physPage += 4096;
    }

    return physStart;
}
// Reserves a GGTT range given directly as a 32-bit graphics address; forwards to
// reserveAddress() with the TraceGttEntry address space.
template <typename Traits>
uint64_t AubDump<Traits>::reserveAddressGGTT(typename Traits::Stream &stream, uint32_t addr, size_t size, uint64_t physStart) {
    const auto addressSpace = AddressSpaceValues::TraceGttEntry;
    return reserveAddress(stream, addr, size, addressSpace, physStart);
}
// Reserves a GGTT range for a host pointer: the pointer is translated with
// BaseHelper::ptrToGGTT() and the result is forwarded to reserveAddress() with the
// TraceGttEntry address space.
template <typename Traits>
uint64_t AubDump<Traits>::reserveAddressGGTT(typename Traits::Stream &stream, const void *memory, size_t size, uint64_t physStart) {
    return reserveAddress(stream, BaseHelper::ptrToGGTT(memory), size, AddressSpaceValues::TraceGttEntry, physStart);
}
// Emits the PPGTT entries covering [gfxAddress, gfxAddress + blockSize):
// first the page-directory entries, then the page-table entries. Each level is
// written as one memory-write header followed by one entry per index, and every entry
// value is (4096-byte aligned address | 7). Returns physAddress unchanged.
template <typename Traits>
uint64_t AubPageTableHelper32<Traits>::reserveAddressPPGTT(typename Traits::Stream &stream, uintptr_t gfxAddress, size_t blockSize, uint64_t physAddress) {
    // Index ranges: bits above 12 select the PTE, a further 9-bit shift selects the PDE.
    auto firstPTE = gfxAddress >> 12;
    auto lastPTE = (gfxAddress + blockSize - 1) >> 12;
    auto firstPDE = firstPTE >> 9;
    auto lastPDE = lastPTE >> 9;

    // Process the PD entries
    {
        auto entryAddress = BaseClass::getPDEAddress(firstPDE);
        stream.writeMemoryWriteHeader(entryAddress, (lastPDE - firstPDE + 1) * sizeof(uint64_t), AddressSpaceValues::TracePpgttPdEntry);

        auto targetPage = BaseClass::getPTEAddress(firstPTE) & g_pageMask;
        for (auto pdeIndex = firstPDE; pdeIndex <= lastPDE; pdeIndex++) {
            auto pde = targetPage | 7;
            stream.writePTE(entryAddress, pde);
            entryAddress += sizeof(pde);
            targetPage += 4096;
        }
    }

    // Process the PT entries
    {
        auto entryAddress = BaseClass::getPTEAddress(firstPTE);
        stream.writeMemoryWriteHeader(entryAddress, (lastPTE - firstPTE + 1) * sizeof(uint64_t), AddressSpaceValues::TracePpgttEntry);

        auto targetPage = physAddress & g_pageMask;
        for (auto pteIndex = firstPTE; pteIndex <= lastPTE; pteIndex++) {
            auto pte = targetPage | 7;
            stream.writePTE(entryAddress, pte);
            entryAddress += sizeof(pte);
            targetPage += 4096;
        }
    }

    return physAddress;
}
// Emits the four levels of PPGTT entries covering [gfxAddress, gfxAddress + blockSize):
// PML4, PDP, PD and PT, in that order. Each level is written as one memory-write
// header followed by one entry per index; every entry value is
// (4096-byte aligned address of the next level / physical page | 7).
// Returns physAddress unchanged.
template <typename Traits>
uint64_t AubPageTableHelper64<Traits>::reserveAddressPPGTT(typename Traits::Stream &stream, uintptr_t gfxAddress, size_t blockSize, uint64_t physAddress) {
    // Index ranges: bits above 12 select the PTE; every further 9-bit shift moves one
    // level up (PDE, PDP, PML4).
    auto firstPTE = gfxAddress >> 12;
    auto lastPTE = (gfxAddress + blockSize - 1) >> 12;
    auto firstPDE = firstPTE >> 9;
    auto lastPDE = lastPTE >> 9;
    auto firstPDP = firstPDE >> 9;
    auto lastPDP = lastPDE >> 9;
    auto firstPML4 = firstPDP >> 9;
    auto lastPML4 = lastPDP >> 9;

    // Process the PML4 entries
    {
        auto entryAddress = getPML4Address(firstPML4);
        stream.writeMemoryWriteHeader(entryAddress, (lastPML4 - firstPML4 + 1) * sizeof(uint64_t), AddressSpaceValues::TracePml4Entry);

        auto targetPage = BaseClass::getPDPAddress(firstPDP) & g_pageMask;
        for (auto pml4Index = firstPML4; pml4Index <= lastPML4; pml4Index++) {
            auto pml4 = targetPage | 7;
            stream.writePTE(entryAddress, pml4);
            entryAddress += sizeof(pml4);
            targetPage += 4096;
        }
    }

    // Process the PDP entries
    {
        auto entryAddress = BaseClass::getPDPAddress(firstPDP);
        stream.writeMemoryWriteHeader(entryAddress, (lastPDP - firstPDP + 1) * sizeof(uint64_t), AddressSpaceValues::TracePhysicalPdpEntry);

        auto targetPage = BaseClass::getPDEAddress(firstPDE) & g_pageMask;
        for (auto pdpIndex = firstPDP; pdpIndex <= lastPDP; pdpIndex++) {
            auto pdp = targetPage | 7;
            stream.writePTE(entryAddress, pdp);
            entryAddress += sizeof(pdp);
            targetPage += 4096;
        }
    }

    // Process the PD entries
    {
        auto entryAddress = BaseClass::getPDEAddress(firstPDE);
        stream.writeMemoryWriteHeader(entryAddress, (lastPDE - firstPDE + 1) * sizeof(uint64_t), AddressSpaceValues::TracePpgttPdEntry);

        auto targetPage = BaseClass::getPTEAddress(firstPTE) & g_pageMask;
        for (auto pdeIndex = firstPDE; pdeIndex <= lastPDE; pdeIndex++) {
            auto pde = targetPage | 7;
            stream.writePTE(entryAddress, pde);
            entryAddress += sizeof(pde);
            targetPage += 4096;
        }
    }

    // Process the PT entries
    {
        auto entryAddress = BaseClass::getPTEAddress(firstPTE);
        stream.writeMemoryWriteHeader(entryAddress, (lastPTE - firstPTE + 1) * sizeof(uint64_t), AddressSpaceValues::TracePpgttEntry);

        auto targetPage = physAddress & g_pageMask;
        for (auto pteIndex = firstPTE; pteIndex <= lastPTE; pteIndex++) {
            auto pte = targetPage | 7;
            stream.writePTE(entryAddress, pte);
            entryAddress += sizeof(pte);
            targetPage += 4096;
        }
    }

    return physAddress;
}
// Builds a zero-initialised AubPpgttContextCreate command for the given context
// handle with SixtyFourBit cleared and the four PageDirPointer slots set to the PDE
// addresses for indices 0x000, 0x200, 0x400 and 0x600, then hands it to the stream.
template <typename Traits>
void AubPageTableHelper32<Traits>::createContext(typename Traits::Stream &stream, uint32_t context) {
    AubPpgttContextCreate cmd;
    memset(&cmd, 0, sizeof(cmd));

    // Command header; DwordLength is the payload size in dwords minus one.
    const auto payloadDwords = (sizeof(cmd) - sizeof(cmd.Header)) / sizeof(uint32_t);
    cmd.Header.Type = 0x7;
    cmd.Header.Opcode = 0x1;
    cmd.Header.SubOp = 0x14;
    cmd.Header.DwordLength = payloadDwords - 1;

    cmd.Handle = context;
    cmd.AdvancedContext = false;
    cmd.SixtyFourBit = 0;

    // One page-directory pointer per 0x200 PDE indices.
    for (int slot = 0; slot < 4; ++slot) {
        cmd.PageDirPointer[slot] = BaseClass::getPDEAddress(slot * 0x200);
    }

    stream.createContext(cmd);
}
// Builds a zero-initialised AubPpgttContextCreate command for the given context
// handle with SixtyFourBit set and PageDirPointer[0] pointing at the PML4 table
// (getPML4Address(0)), then hands it to the stream.
template <typename Traits>
void AubPageTableHelper64<Traits>::createContext(typename Traits::Stream &stream, uint32_t context) {
    AubPpgttContextCreate cmd;
    memset(&cmd, 0, sizeof(cmd));

    // Command header; DwordLength is the payload size in dwords minus one.
    const auto payloadDwords = (sizeof(cmd) - sizeof(cmd.Header)) / sizeof(uint32_t);
    cmd.Header.Type = 0x7;
    cmd.Header.Opcode = 0x1;
    cmd.Header.SubOp = 0x14;
    cmd.Header.DwordLength = payloadDwords - 1;

    cmd.Handle = context;
    cmd.AdvancedContext = false;
    cmd.SixtyFourBit = 1;
    cmd.PageDirPointer[0] = getPML4Address(0);

    stream.createContext(cmd);
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,79 @@
# Copyright (c) 2017, Intel Corporation
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
# We require cmake 3.2.0 or later
cmake_minimum_required(VERSION 3.2.0 FATAL_ERROR)
# Object library holding the generated built-in kernel sources. It is created with
# this CMakeLists.txt as its only source; the generated .cpp files are attached further
# down through target_sources() when COMPILE_BUILT_INS is set.
add_library(${BUILTINS_BINARIES_LIB_NAME} OBJECT CMakeLists.txt)
# Add builtins sources
add_subdirectory(registry)
# Names of the built-ins for which a generated source file is expected per platform.
# Each name is used below to form a RUNTIME_GENERATED_<name>_GEN<n>_<platform> variable.
set(GENERATED_BUILTINS "COPY_BUFFER_TO_BUFFER"
"COPY_BUFFER_RECT"
"FILL_BUFFER"
"COPY_BUFFER_TO_IMAGE3D"
"COPY_IMAGE3D_TO_BUFFER"
"COPY_IMAGE_TO_IMAGE1D"
"COPY_IMAGE_TO_IMAGE2D"
"COPY_IMAGE_TO_IMAGE3D"
"FILL_IMAGE1D"
"FILL_IMAGE2D"
"FILL_IMAGE3D"
)
# Generate builtins cpps
# (presumably the kernels subdirectory defines the RUNTIME_GENERATED_* variables read
# below - confirm there)
if(COMPILE_BUILT_INS)
add_subdirectory(kernels)
endif()
# Reverse order so that GEN N+1 includes GEN N
# GEN_CONTAINS_PLATFORMS and GET_PLATFORMS_FOR_GEN are not defined in this file.
foreach(GEN_NUM RANGE ${MAX_GEN} 0 -1)
GEN_CONTAINS_PLATFORMS("SUPPORTED" ${GEN_NUM} GENX_HAS_PLATFORMS)
if(${GENX_HAS_PLATFORMS})
# Get all supported platforms for this GEN
GET_PLATFORMS_FOR_GEN("SUPPORTED" ${GEN_NUM} SUPPORTED_GENX_PLATFORMS)
# Add platform-specific files
foreach(PLATFORM_IT ${SUPPORTED_GENX_PLATFORMS})
foreach(GENERATED_BUILTIN ${GENERATED_BUILTINS})
list(APPEND GENERATED_BUILTINS_CPPS ${BUILTINS_INCLUDE_DIR}/${RUNTIME_GENERATED_${GENERATED_BUILTIN}_GEN${GEN_NUM}_${PLATFORM_IT}})
endforeach(GENERATED_BUILTIN)
endforeach(PLATFORM_IT)
# NOTE(review): GENERATED_BUILTINS_CPPS is never reset inside this loop, so the list
# passed here also contains the files appended for previously processed GENs - confirm
# that listing them in more than one source group is intended.
source_group("generated files\\gen${GEN_NUM}" FILES ${GENERATED_BUILTINS_CPPS})
endif(${GENX_HAS_PLATFORMS})
endforeach(GEN_NUM)
# Attach the collected generated sources and mark them GENERATED so CMake does not
# require them to exist at configure time.
if(COMPILE_BUILT_INS)
target_sources(${BUILTINS_BINARIES_LIB_NAME} PUBLIC ${GENERATED_BUILTINS_CPPS})
set_source_files_properties(${GENERATED_BUILTINS_CPPS} PROPERTIES GENERATED TRUE)
endif(COMPILE_BUILT_INS)
# The linker language is set explicitly - presumably because the target may contain no
# C++ sources when COMPILE_BUILT_INS is off; confirm.
set_target_properties(${BUILTINS_BINARIES_LIB_NAME} PROPERTIES LINKER_LANGUAGE CXX)
set_target_properties(${BUILTINS_BINARIES_LIB_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(${BUILTINS_BINARIES_LIB_NAME} PRIVATE
${KHRONOS_HEADERS_DIR}
${UMKM_SHAREDDATA_INCLUDE_PATHS}
${IGDRCL__IGC_INCLUDE_DIR}
${THIRD_PARTY_DIR}
)

View File

@@ -0,0 +1,890 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include <cstdint>
#include "built_ins.h"
#include "runtime/built_ins/vme_dispatch_builder.h"
#include "runtime/built_ins/sip.h"
#include "runtime/compiler_interface/compiler_interface.h"
#include "runtime/program/program.h"
#include "runtime/mem_obj/image.h"
#include "runtime/kernel/kernel.h"
#include "runtime/helpers/basic_math.h"
#include "runtime/helpers/convert_color.h"
#include "runtime/helpers/dispatch_info_builder.h"
#include "runtime/helpers/debug_helpers.h"
#include <sstream>
namespace OCLRT {
// Singleton instance pointer: allocated lazily by BuiltIns::getInstance() and
// destroyed/reset by BuiltIns::shutDown().
BuiltIns *BuiltIns::pInstance = nullptr;
// Build options passed to Program::build() when BuiltIns::createBuiltInProgram()
// compiles the media built-in kernels. The adjacent literals are concatenated into a
// single space-separated option string.
const char *mediaKernelsBuildOptions = {
"-D cl_intel_device_side_advanced_vme_enable "
"-D cl_intel_device_side_avc_vme_enable "
"-D cl_intel_device_side_vme_enable "
"-D cl_intel_media_block_io "
"-cl-fast-relaxed-math "};
// Creates the BuiltinsLib instance owned by this object.
BuiltIns::BuiltIns() {
    this->builtinsLib.reset(new BuiltinsLib());
}
// Destroys the scheduler kernel (deleted through its SchedulerKernel type) and then
// the scheduler program, and clears both pointers.
BuiltIns::~BuiltIns() {
    auto &scheduler = schedulerBuiltIn;

    delete static_cast<SchedulerKernel *>(scheduler.pKernel);
    delete scheduler.pProgram;

    scheduler.pProgram = nullptr;
    scheduler.pKernel = nullptr;
}
// Returns the singleton, allocating it on the first call. Creation is serialised by
// a function-local mutex that is taken on every call.
BuiltIns &BuiltIns::getInstance() {
    static std::mutex initMutex;
    std::lock_guard<std::mutex> autolock(initMutex);

    if (nullptr == pInstance) {
        pInstance = new BuiltIns();
    }
    return *pInstance;
}
// Destroys the singleton if it exists. pInstance is cleared before the object is
// deleted, so the destructor never observes a pInstance that points at itself.
void BuiltIns::shutDown() {
    if (!pInstance) {
        return;
    }

    auto doomed = pInstance;
    pInstance = nullptr;
    delete doomed;
}
// Returns the scheduler kernel, creating the scheduler program and kernel on first
// use. If the kernel pointer is already set it is returned directly; otherwise the
// one-time initialisation runs under std::call_once: the Scheduler built-in binary is
// fetched for device 0 of the context, turned into a Program, processed, and the
// kernel is created from it. Failures only trigger DEBUG_BREAK_IF.
SchedulerKernel &BuiltIns::getSchedulerKernel(Context &context) {
    if (schedulerBuiltIn.pKernel != nullptr) {
        return *static_cast<SchedulerKernel *>(schedulerBuiltIn.pKernel);
    }

    std::call_once(schedulerBuiltIn.programIsInitialized, [&] {
        cl_int retVal = CL_SUCCESS;

        // Build the program from the stored scheduler binary.
        auto schedulerCode = getInstance().builtinsLib->getBuiltinCode(EBuiltInOps::Scheduler, BuiltinCode::ECodeType::Any, *context.getDevice(0));
        auto program = Program::createFromGenBinary(&context,
                                                    schedulerCode.resource.data(),
                                                    schedulerCode.resource.size(),
                                                    true,
                                                    &retVal);
        DEBUG_BREAK_IF(retVal != CL_SUCCESS);
        DEBUG_BREAK_IF(!program);

        retVal = program->processGenBinary();
        DEBUG_BREAK_IF(retVal != CL_SUCCESS);
        schedulerBuiltIn.pProgram = program;

        // Create the kernel object from the program.
        auto schedulerInfo = schedulerBuiltIn.pProgram->getKernelInfo(SchedulerKernel::schedulerName);
        DEBUG_BREAK_IF(!schedulerInfo);

        schedulerBuiltIn.pKernel = Kernel::create<SchedulerKernel>(schedulerBuiltIn.pProgram, *schedulerInfo, &retVal);
        DEBUG_BREAK_IF(retVal != CL_SUCCESS);
    });

    return *static_cast<SchedulerKernel *>(schedulerBuiltIn.pKernel);
}
// Returns the SIP kernel of the requested type, creating it on first use.
//
// type must be below SipKernelType::COUNT (checked with UNRECOVERABLE_IF). The first
// call for a given type runs the initializer under std::call_once: the SIP binary is
// obtained from the CompilerInterface for device 0 of the context, wrapped in a
// temporary Program, and the part of kernel 0's heap starting at systemKernelOffset is
// handed to a new SipKernel. The temporary program is released afterwards.
SipKernel &BuiltIns::getSipKernel(SipKernelType type, Context &context) {
    uint32_t kernelId = static_cast<uint32_t>(type);
    UNRECOVERABLE_IF(kernelId >= static_cast<uint32_t>(SipKernelType::COUNT));
    auto &sipBuiltIn = this->sipKernels[kernelId];

    auto initializer = [&] {
        cl_int retVal = CL_SUCCESS;

        // Fetch the SIP binary from the compiler.
        std::vector<char> sipBinary;
        auto compilerInterface = CompilerInterface::getInstance();
        UNRECOVERABLE_IF(compilerInterface == nullptr);

        auto ret = compilerInterface->getSipKernelBinary(type, *context.getDevice(0), sipBinary);
        UNRECOVERABLE_IF(ret != CL_SUCCESS);
        UNRECOVERABLE_IF(sipBinary.size() == 0);

        // Wrap it in a program so the kernel heap can be located.
        auto program = Program::createFromGenBinary(&context,
                                                    sipBinary.data(),
                                                    sipBinary.size(),
                                                    true,
                                                    &retVal);
        DEBUG_BREAK_IF(retVal != CL_SUCCESS);
        UNRECOVERABLE_IF(program == nullptr);

        retVal = program->processGenBinary();
        DEBUG_BREAK_IF(retVal != CL_SUCCESS);

        auto kernelInfo = program->getKernelInfo(size_t{0});
        UNRECOVERABLE_IF(kernelInfo == nullptr);

        // The SIP code starts at systemKernelOffset inside the kernel heap.
        uint32_t sipOffset = kernelInfo->systemKernelOffset;
        UNRECOVERABLE_IF(sipOffset >= kernelInfo->heapInfo.pKernelHeader->KernelHeapSize);
        sipBuiltIn.first.reset(new SipKernel(type, ptrOffset(kernelInfo->heapInfo.pKernelHeap, sipOffset),
                                             kernelInfo->heapInfo.pKernelHeader->KernelHeapSize - sipOffset));

        program->release();
    };
    std::call_once(sipBuiltIn.second, initializer);

    UNRECOVERABLE_IF(sipBuiltIn.first == nullptr);
    return *sipBuiltIn.first;
}
// Source text of the media built-in kernels. Each pointer is initialised from the
// contents of the #include'd .igdrcl_built_in file, which is pulled in textually as
// the initialiser expression.
// VME:
static const char *blockMotionEstimateIntelSrc = {
#include "kernels/vme_block_motion_estimate_intel_frontend.igdrcl_built_in"
};
static const char *blockAdvancedMotionEstimateCheckIntelSrc = {
#include "kernels/vme_block_advanced_motion_estimate_check_intel_frontend.igdrcl_built_in"
};
static const char *blockAdvancedMotionEstimateBidirectionalCheckIntelSrc = {
#include "kernels/vme_block_advanced_motion_estimate_bidirectional_check_intel_frontend.igdrcl_built_in"
};
// VEBOX:
static const char *veEnhanceIntelSrc = {
#include "kernels/vebox_ve_enhance_intel.igdrcl_built_in"
};
static const char *veDnEnhanceIntelSrc = {
#include "kernels/vebox_ve_dn_enhance_intel.igdrcl_built_in"
};
static const char *veDnDiEnhanceIntelSrc = {
#include "kernels/vebox_ve_dn_di_enhance_intel.igdrcl_built_in"
};
// Lookup table of (kernel name, kernel source) pairs. BuiltIns::createBuiltInProgram()
// matches each requested kernel name against element 0 and appends element 1 to the
// program source.
static const std::tuple<const char *, const char *> mediaBuiltIns[] = {
std::make_tuple("block_motion_estimate_intel", blockMotionEstimateIntelSrc),
std::make_tuple("block_advanced_motion_estimate_check_intel", blockAdvancedMotionEstimateCheckIntelSrc),
std::make_tuple("block_advanced_motion_estimate_bidirectional_check_intel", blockAdvancedMotionEstimateBidirectionalCheckIntelSrc),
std::make_tuple("ve_enhance_intel", veEnhanceIntelSrc),
std::make_tuple("ve_dn_enhance_intel", veDnEnhanceIntelSrc),
std::make_tuple("ve_dn_di_enhance_intel", veDnDiEnhanceIntelSrc),
};
// Unlike other built-ins media kernels are not stored in BuiltIns object.
// Pointer to program with built in kernels is returned to the user through API
// call and user is responsible for releasing it by calling clReleaseProgram.
//
// kernelNames is a ';'-separated list of media built-in kernel names. The sources of
// all requested kernels are concatenated into one program, which is then built with
// mediaKernelsBuildOptions.
//
// Returns nullptr and sets errcodeRet to CL_INVALID_VALUE when kernelNames is null,
// contains no names, or contains a name that is not in mediaBuiltIns; errcodeRet is
// also CL_INVALID_VALUE when the program object cannot be created. Otherwise the
// program is returned and errcodeRet holds the result of Program::build() (the
// program is returned even if the build reports an error).
Program *BuiltIns::createBuiltInProgram(
    Context &context,
    Device &device,
    const char *kernelNames,
    int &errcodeRet) {
    // Constructing a std::string / std::istringstream from a null pointer is
    // undefined behaviour, so reject it the same way as an empty list.
    if (kernelNames == nullptr) {
        errcodeRet = CL_INVALID_VALUE;
        return nullptr;
    }

    std::string programSourceStr = "";
    std::istringstream ss(kernelNames);
    std::string currentKernelName;

    // Collect the source of every requested kernel; any unknown name is an error.
    while (std::getline(ss, currentKernelName, ';')) {
        bool found = false;
        for (auto &builtInTuple : mediaBuiltIns) {
            if (currentKernelName == std::get<0>(builtInTuple)) {
                programSourceStr += std::get<1>(builtInTuple);
                found = true;
                break;
            }
        }
        if (!found) {
            errcodeRet = CL_INVALID_VALUE;
            return nullptr;
        }
    }
    if (programSourceStr.empty()) {
        errcodeRet = CL_INVALID_VALUE;
        return nullptr;
    }

    Program *pBuiltInProgram = Program::create(programSourceStr.c_str(), &context, device, true, nullptr);
    if (pBuiltInProgram) {
        // Dispatch-info builders for the VME kernels, passed to build() keyed by kernel name.
        std::unordered_map<std::string, BuiltinDispatchInfoBuilder *> builtinsBuilders;
        builtinsBuilders["block_motion_estimate_intel"] =
            &BuiltIns::getInstance().getBuiltinDispatchInfoBuilder(EBuiltInOps::VmeBlockMotionEstimateIntel, context, device);
        builtinsBuilders["block_advanced_motion_estimate_check_intel"] =
            &BuiltIns::getInstance().getBuiltinDispatchInfoBuilder(EBuiltInOps::VmeBlockAdvancedMotionEstimateCheckIntel, context, device);
        builtinsBuilders["block_advanced_motion_estimate_bidirectional_check_intel"] =
            &BuiltIns::getInstance().getBuiltinDispatchInfoBuilder(EBuiltInOps::VmeBlockAdvancedMotionEstimateBidirectionalCheckIntel, context, device);

        const cl_device_id clDevice = &device;
        errcodeRet = pBuiltInProgram->build(
            clDevice,
            mediaKernelsBuildOptions,
            enableCacheing,
            builtinsBuilders);
    } else {
        errcodeRet = CL_INVALID_VALUE;
    }
    return pBuiltInProgram;
}
// For every kernel used by this builder: take ownership of it first, then attach the
// given context.
void BuiltinDispatchInfoBuilder::takeOwnership(Context *context) {
    for (auto &kernel : usedKernels) {
        kernel->takeOwnership(true);
        kernel->setContext(context);
    }
}
// Mirror of takeOwnership(): for every used kernel, detach the context first and then
// release ownership.
void BuiltinDispatchInfoBuilder::releaseOwnership() {
    for (auto &kernel : usedKernels) {
        kernel->setContext(nullptr);
        kernel->releaseOwnership();
    }
}
// Loads the code of built-in `op` for `device`, creates and builds a program from it
// with the given options, and then forwards `desc...` to grabKernels().
// The result of build() is not checked.
template <typename... KernelsDescArgsT>
void BuiltinDispatchInfoBuilder::populate(Context &context, Device &device, EBuiltInOps op, const char *options, KernelsDescArgsT &&... desc) {
    auto builtinCode = kernelsLib.getBuiltinsLib().getBuiltinCode(op, BuiltinCode::ECodeType::Any, device);

    // Move the freshly created program into the member that owns it.
    auto createdProgram = BuiltinsLib::createProgramFromCode(builtinCode, context, device);
    prog.reset(createdProgram.release());

    prog->build(0, nullptr, options, nullptr, nullptr, kernelsLib.isCacheingEnabled());
    grabKernels(std::forward<KernelsDescArgsT>(desc)...);
}
template <typename HWFamily>
class BuiltInOp<HWFamily, EBuiltInOps::CopyBufferToBuffer> : public BuiltinDispatchInfoBuilder {
public:
BuiltInOp(BuiltIns &kernelsLib, Context &context, Device &device)
: BuiltinDispatchInfoBuilder(kernelsLib), kernLeftLeftover(nullptr), kernMiddle(nullptr), kernRightLeftover(nullptr) {
populate(context, device,
EBuiltInOps::CopyBufferToBuffer,
"",
"CopyBufferToBufferLeftLeftover", kernLeftLeftover,
"CopyBufferToBufferMiddle", kernMiddle,
"CopyBufferToBufferRightLeftover", kernRightLeftover);
}
bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo, const BuiltinOpParams &operationParams) const override {
DispatchInfoBuilder<SplitDispatch::Dim::d1D, SplitDispatch::SplitMode::KernelSplit> kernelSplit1DBuilder;
uintptr_t start = reinterpret_cast<uintptr_t>(operationParams.dstPtr) + operationParams.dstOffset.x;
size_t middleAlignment = MemoryConstants::cacheLineSize;
size_t middleElSize = sizeof(uint32_t) * 4;
uintptr_t leftSize = start % middleAlignment;
leftSize = (leftSize > 0) ? (middleAlignment - leftSize) : 0; // calc left leftover size
leftSize = std::min(leftSize, operationParams.size.x); // clamp left leftover size to requested size
uintptr_t rightSize = (start + operationParams.size.x) % middleAlignment; // calc right leftover size
rightSize = std::min(rightSize, operationParams.size.x - leftSize); // clamp
uintptr_t middleSizeBytes = operationParams.size.x - leftSize - rightSize; // calc middle size
if (!isAligned<4>(reinterpret_cast<uintptr_t>(operationParams.srcPtr) + operationParams.srcOffset.x + leftSize)) {
//corner case - src relative to dst does not have DWORD alignment
leftSize += middleSizeBytes;
middleSizeBytes = 0;
}
auto middleSizeEls = middleSizeBytes / middleElSize; // num work items in middle walker
// Set-up ISA
kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Left, kernLeftLeftover);
kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Middle, kernMiddle);
kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Right, kernRightLeftover);
// Set-up common kernel args
if (operationParams.srcSvmAlloc) {
kernelSplit1DBuilder.setArgSvmAlloc(0, operationParams.srcPtr, operationParams.srcSvmAlloc);
} else if (operationParams.srcMemObj) {
kernelSplit1DBuilder.setArg(0, operationParams.srcMemObj);
} else {
kernelSplit1DBuilder.setArgSvm(0, operationParams.size.x, operationParams.srcPtr, nullptr, CL_MEM_READ_ONLY);
}
if (operationParams.dstSvmAlloc) {
kernelSplit1DBuilder.setArgSvmAlloc(1, operationParams.dstPtr, operationParams.dstSvmAlloc);
} else if (operationParams.dstMemObj) {
kernelSplit1DBuilder.setArg(1, operationParams.dstMemObj);
} else {
kernelSplit1DBuilder.setArgSvm(1, operationParams.size.x, operationParams.dstPtr);
}
// Set-up srcOffset
kernelSplit1DBuilder.setArg(SplitDispatch::RegionCoordX::Left, 2, static_cast<uint32_t>(operationParams.srcOffset.x));
kernelSplit1DBuilder.setArg(SplitDispatch::RegionCoordX::Middle, 2, static_cast<uint32_t>(operationParams.srcOffset.x + leftSize));
kernelSplit1DBuilder.setArg(SplitDispatch::RegionCoordX::Right, 2, static_cast<uint32_t>(operationParams.srcOffset.x + leftSize + middleSizeBytes));
// Set-up dstOffset
kernelSplit1DBuilder.setArg(SplitDispatch::RegionCoordX::Left, 3, static_cast<uint32_t>(operationParams.dstOffset.x));
kernelSplit1DBuilder.setArg(SplitDispatch::RegionCoordX::Middle, 3, static_cast<uint32_t>(operationParams.dstOffset.x + leftSize));
kernelSplit1DBuilder.setArg(SplitDispatch::RegionCoordX::Right, 3, static_cast<uint32_t>(operationParams.dstOffset.x + leftSize + middleSizeBytes));
// Set-up work sizes
// Note for split walker, it would be just builder.SetDipatchGeometry(GWS, ELWS, OFFSET)
kernelSplit1DBuilder.setDispatchGeometry(SplitDispatch::RegionCoordX::Left, Vec3<size_t>{leftSize, 0, 0}, Vec3<size_t>{0, 0, 0}, Vec3<size_t>{0, 0, 0});
kernelSplit1DBuilder.setDispatchGeometry(SplitDispatch::RegionCoordX::Middle, Vec3<size_t>{middleSizeEls, 0, 0}, Vec3<size_t>{0, 0, 0}, Vec3<size_t>{0, 0, 0});
kernelSplit1DBuilder.setDispatchGeometry(SplitDispatch::RegionCoordX::Right, Vec3<size_t>{rightSize, 0, 0}, Vec3<size_t>{0, 0, 0}, Vec3<size_t>{0, 0, 0});
kernelSplit1DBuilder.bake(multiDispatchInfo);
return true;
}
protected:
Kernel *kernLeftLeftover;
Kernel *kernMiddle;
Kernel *kernRightLeftover;
};
template <typename HWFamily>
class BuiltInOp<HWFamily, EBuiltInOps::CopyBufferRect> : public BuiltinDispatchInfoBuilder {
public:
BuiltInOp(BuiltIns &kernelsLib, Context &context, Device &device)
: BuiltinDispatchInfoBuilder(kernelsLib), kernelBytes{nullptr} {
populate(context, device,
EBuiltInOps::CopyBufferRect,
"",
"CopyBufferRectBytes2d", kernelBytes[0],
"CopyBufferRectBytes2d", kernelBytes[1],
"CopyBufferRectBytes3d", kernelBytes[2]);
}
bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo, const BuiltinOpParams &operationParams) const override {
DispatchInfoBuilder<SplitDispatch::Dim::d3D, SplitDispatch::SplitMode::NoSplit> kernelNoSplit3DBuilder;
size_t hostPtrSize = 0;
bool is3D = false;
if (operationParams.srcMemObj && operationParams.dstMemObj) {
DEBUG_BREAK_IF(!((operationParams.srcPtr == nullptr) && (operationParams.dstPtr == nullptr)));
is3D = (operationParams.size.z > 1) || (operationParams.srcOffset.z > 0) || (operationParams.dstOffset.z > 0);
} else {
if (operationParams.srcPtr) {
size_t origin[] = {operationParams.srcOffset.x, operationParams.srcOffset.y, operationParams.srcOffset.z};
size_t region[] = {operationParams.size.x, operationParams.size.y, operationParams.size.z};
hostPtrSize = Buffer::calculateHostPtrSize(origin, region, operationParams.srcRowPitch, operationParams.srcSlicePitch);
is3D = (operationParams.size.z > 1) || (operationParams.dstOffset.z > 0);
} else if (operationParams.dstPtr) {
size_t origin[] = {operationParams.dstOffset.x, operationParams.dstOffset.y, operationParams.dstOffset.z};
size_t region[] = {operationParams.size.x, operationParams.size.y, operationParams.size.z};
hostPtrSize = Buffer::calculateHostPtrSize(origin, region, operationParams.dstRowPitch, operationParams.dstSlicePitch);
is3D = (operationParams.size.z > 1) || (operationParams.srcOffset.z > 0);
} else {
DEBUG_BREAK_IF(!false);
}
}
// Set-up ISA
int dimensions = is3D ? 3 : 2;
kernelNoSplit3DBuilder.setKernel(kernelBytes[dimensions - 1]);
// arg0 = src
if (operationParams.srcMemObj) {
kernelNoSplit3DBuilder.setArg(0, operationParams.srcMemObj);
} else {
kernelNoSplit3DBuilder.setArgSvm(0, hostPtrSize, is3D ? operationParams.srcPtr : ptrOffset(operationParams.srcPtr, operationParams.srcOffset.z * operationParams.srcSlicePitch));
}
// arg1 = dst
if (operationParams.dstMemObj) {
kernelNoSplit3DBuilder.setArg(1, operationParams.dstMemObj);
} else {
kernelNoSplit3DBuilder.setArgSvm(1, hostPtrSize, is3D ? operationParams.dstPtr : ptrOffset(operationParams.dstPtr, operationParams.dstOffset.z * operationParams.dstSlicePitch));
}
// arg2 = srcOrigin
uint32_t kSrcOrigin[4] = {(uint32_t)operationParams.srcOffset.x, (uint32_t)operationParams.srcOffset.y, (uint32_t)operationParams.srcOffset.z, 0};
kernelNoSplit3DBuilder.setArg(2, sizeof(uint32_t) * 4, kSrcOrigin);
// arg3 = dstOrigin
uint32_t kDstOrigin[4] = {(uint32_t)operationParams.dstOffset.x, (uint32_t)operationParams.dstOffset.y, (uint32_t)operationParams.dstOffset.z, 0};
kernelNoSplit3DBuilder.setArg(3, sizeof(uint32_t) * 4, kDstOrigin);
// arg4 = srcPitch
uint32_t kSrcPitch[2] = {(uint32_t)operationParams.srcRowPitch, (uint32_t)operationParams.srcSlicePitch};
kernelNoSplit3DBuilder.setArg(4, sizeof(uint32_t) * 2, kSrcPitch);
// arg5 = dstPitch
uint32_t kDstPitch[2] = {(uint32_t)operationParams.dstRowPitch, (uint32_t)operationParams.dstSlicePitch};
kernelNoSplit3DBuilder.setArg(5, sizeof(uint32_t) * 2, kDstPitch);
// Set-up work sizes
kernelNoSplit3DBuilder.setDispatchGeometry(operationParams.size, Vec3<size_t>{0, 0, 0}, Vec3<size_t>{0, 0, 0});
kernelNoSplit3DBuilder.bake(multiDispatchInfo);
// Store source and destination surfaces for residency purposes
if (operationParams.srcMemObj) {
multiDispatchInfo.pushUsedSurface(std::unique_ptr<Surface>(new MemObjSurface(operationParams.srcMemObj)));
} else {
multiDispatchInfo.pushUsedSurface(std::unique_ptr<Surface>(new HostPtrSurface(operationParams.srcPtr, hostPtrSize)));
}
if (operationParams.dstMemObj) {
multiDispatchInfo.pushUsedSurface(std::unique_ptr<Surface>(new MemObjSurface(operationParams.dstMemObj)));
} else {
multiDispatchInfo.pushUsedSurface(std::unique_ptr<Surface>(new HostPtrSurface(operationParams.dstPtr, hostPtrSize)));
}
return true;
}
protected:
Kernel *kernelBytes[3];
};
template <typename HWFamily>
class BuiltInOp<HWFamily, EBuiltInOps::FillBuffer> : public BuiltinDispatchInfoBuilder {
public:
BuiltInOp(BuiltIns &kernelsLib, Context &context, Device &device)
: BuiltinDispatchInfoBuilder(kernelsLib), kernLeftLeftover(nullptr), kernMiddle(nullptr), kernRightLeftover(nullptr) {
populate(context, device,
EBuiltInOps::FillBuffer,
"",
"FillBufferLeftLeftover", kernLeftLeftover,
"FillBufferMiddle", kernMiddle,
"FillBufferRightLeftover", kernRightLeftover);
}
bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo, const BuiltinOpParams &operationParams) const override {
DispatchInfoBuilder<SplitDispatch::Dim::d1D, SplitDispatch::SplitMode::KernelSplit> kernelSplit1DBuilder;
uintptr_t start = reinterpret_cast<uintptr_t>(operationParams.dstPtr) + operationParams.dstOffset.x;
size_t middleAlignment = MemoryConstants::cacheLineSize;
size_t middleElSize = sizeof(uint32_t);
uintptr_t leftSize = start % middleAlignment;
leftSize = (leftSize > 0) ? (middleAlignment - leftSize) : 0; // calc left leftover size
leftSize = std::min(leftSize, operationParams.size.x); // clamp left leftover size to requested size
uintptr_t rightSize = (start + operationParams.size.x) % middleAlignment; // calc right leftover size
rightSize = std::min(rightSize, operationParams.size.x - leftSize); // clamp
uintptr_t middleSizeBytes = operationParams.size.x - leftSize - rightSize; // calc middle size
auto middleSizeEls = middleSizeBytes / middleElSize; // num work items in middle walker
// Set-up ISA
kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Left, kernLeftLeftover);
kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Middle, kernMiddle);
kernelSplit1DBuilder.setKernel(SplitDispatch::RegionCoordX::Right, kernRightLeftover);
DEBUG_BREAK_IF((operationParams.srcMemObj == nullptr) || (operationParams.srcOffset != 0));
DEBUG_BREAK_IF((operationParams.dstMemObj == nullptr) && (operationParams.dstSvmAlloc == nullptr));
// Set-up dstMemObj with buffer
if (operationParams.dstSvmAlloc) {
kernelSplit1DBuilder.setArgSvmAlloc(0, operationParams.dstPtr, operationParams.dstSvmAlloc);
} else {
kernelSplit1DBuilder.setArg(0, operationParams.dstMemObj);
}
// Set-up dstOffset
kernelSplit1DBuilder.setArg(SplitDispatch::RegionCoordX::Left, 1, static_cast<uint32_t>(operationParams.dstOffset.x));
kernelSplit1DBuilder.setArg(SplitDispatch::RegionCoordX::Middle, 1, static_cast<uint32_t>(operationParams.dstOffset.x + leftSize));
kernelSplit1DBuilder.setArg(SplitDispatch::RegionCoordX::Right, 1, static_cast<uint32_t>(operationParams.dstOffset.x + leftSize + middleSizeBytes));
// Set-up srcMemObj with pattern
kernelSplit1DBuilder.setArgSvm(2, operationParams.srcMemObj->getSize(), operationParams.srcMemObj->getGraphicsAllocation()->getUnderlyingBuffer(), operationParams.srcMemObj->getGraphicsAllocation());
// Set-up patternSizeInEls
kernelSplit1DBuilder.setArg(SplitDispatch::RegionCoordX::Left, 3, static_cast<uint32_t>(operationParams.srcMemObj->getSize()));
kernelSplit1DBuilder.setArg(SplitDispatch::RegionCoordX::Middle, 3, static_cast<uint32_t>(operationParams.srcMemObj->getSize() / middleElSize));
kernelSplit1DBuilder.setArg(SplitDispatch::RegionCoordX::Right, 3, static_cast<uint32_t>(operationParams.srcMemObj->getSize()));
// Set-up work sizes
// Note for split walker, it would be just builder.SetDipatchGeomtry(GWS, ELWS, OFFSET)
kernelSplit1DBuilder.setDispatchGeometry(SplitDispatch::RegionCoordX::Left, Vec3<size_t>{leftSize, 0, 0}, Vec3<size_t>{0, 0, 0}, Vec3<size_t>{0, 0, 0});
kernelSplit1DBuilder.setDispatchGeometry(SplitDispatch::RegionCoordX::Middle, Vec3<size_t>{middleSizeEls, 0, 0}, Vec3<size_t>{0, 0, 0}, Vec3<size_t>{0, 0, 0});
kernelSplit1DBuilder.setDispatchGeometry(SplitDispatch::RegionCoordX::Right, Vec3<size_t>{rightSize, 0, 0}, Vec3<size_t>{0, 0, 0}, Vec3<size_t>{0, 0, 0});
kernelSplit1DBuilder.bake(multiDispatchInfo);
return true;
}
protected:
Kernel *kernLeftLeftover;
Kernel *kernMiddle;
Kernel *kernRightLeftover;
};
template <typename HWFamily>
class BuiltInOp<HWFamily, EBuiltInOps::CopyBufferToImage3d> : public BuiltinDispatchInfoBuilder {
public:
BuiltInOp(BuiltIns &kernelsLib, Context &context, Device &device)
: BuiltinDispatchInfoBuilder(kernelsLib), kernelBytes{nullptr} {
populate(context, device,
EBuiltInOps::CopyBufferToImage3d,
"",
"CopyBufferToImage3dBytes", kernelBytes[0],
"CopyBufferToImage3d2Bytes", kernelBytes[1],
"CopyBufferToImage3d4Bytes", kernelBytes[2],
"CopyBufferToImage3d8Bytes", kernelBytes[3],
"CopyBufferToImage3d16Bytes", kernelBytes[4]);
}
bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo, const BuiltinOpParams &operationParams) const override {
DispatchInfoBuilder<SplitDispatch::Dim::d3D, SplitDispatch::SplitMode::NoSplit> kernelNoSplit3DBuilder;
DEBUG_BREAK_IF(!(((operationParams.srcPtr != nullptr) || (operationParams.srcMemObj != nullptr)) && (operationParams.dstPtr == nullptr)));
auto dstImage = castToObjectOrAbort<Image>(operationParams.dstMemObj);
// Redescribe image to be byte-copy
auto dstImageRedescribed = dstImage->redescribe();
multiDispatchInfo.pushRedescribedMemObj(std::unique_ptr<MemObj>(dstImageRedescribed)); // life range same as mdi's
// Calculate srcRowPitch and srcSlicePitch
auto bytesPerPixel = dstImage->getSurfaceFormatInfo().ImageElementSizeInBytes;
size_t region[] = {operationParams.size.x, operationParams.size.y, operationParams.size.z};
auto srcRowPitch = static_cast<uint32_t>(operationParams.dstRowPitch ? operationParams.dstRowPitch : region[0] * bytesPerPixel);
auto srcSlicePitch = static_cast<uint32_t>(
operationParams.dstSlicePitch ? operationParams.dstSlicePitch : ((dstImage->getImageDesc().image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY ? 1 : region[1]) * srcRowPitch));
// Determine size of host ptr surface for residency purposes
size_t hostPtrSize = operationParams.srcPtr ? Image::calculateHostPtrSize(region, srcRowPitch, srcSlicePitch, bytesPerPixel, dstImage->getImageDesc().image_type) : 0;
// Set-up kernel
auto bytesExponent = Math::log2(bytesPerPixel);
DEBUG_BREAK_IF(bytesExponent >= 5);
kernelNoSplit3DBuilder.setKernel(kernelBytes[bytesExponent]);
// Set-up source host ptr / buffer
if (operationParams.srcPtr) {
kernelNoSplit3DBuilder.setArgSvm(0, hostPtrSize, operationParams.srcPtr);
} else {
kernelNoSplit3DBuilder.setArg(0, operationParams.srcMemObj);
}
// Set-up destination image
kernelNoSplit3DBuilder.setArg(1, dstImageRedescribed);
// Set-up srcOffset
kernelNoSplit3DBuilder.setArg(2, static_cast<uint32_t>(operationParams.srcOffset.x));
// Set-up dstOrigin
{
uint32_t origin[] = {
static_cast<uint32_t>(operationParams.dstOffset.x),
static_cast<uint32_t>(operationParams.dstOffset.y),
static_cast<uint32_t>(operationParams.dstOffset.z),
0};
kernelNoSplit3DBuilder.setArg(3, sizeof(origin), origin);
}
// Set-up srcRowPitch
{
uint32_t pitch[] = {
static_cast<uint32_t>(srcRowPitch),
static_cast<uint32_t>(srcSlicePitch)};
kernelNoSplit3DBuilder.setArg(4, sizeof(pitch), pitch);
}
// Set-up work sizes
kernelNoSplit3DBuilder.setDispatchGeometry(operationParams.size, Vec3<size_t>{0, 0, 0}, Vec3<size_t>{0, 0, 0});
kernelNoSplit3DBuilder.bake(multiDispatchInfo);
// Store source and destination surfaces for residency purposes
if (operationParams.srcMemObj) {
multiDispatchInfo.pushUsedSurface(std::unique_ptr<Surface>(new MemObjSurface(operationParams.srcMemObj)));
} else {
multiDispatchInfo.pushUsedSurface(std::unique_ptr<HostPtrSurface>(new HostPtrSurface(operationParams.srcPtr, hostPtrSize)));
}
multiDispatchInfo.pushUsedSurface(std::unique_ptr<Surface>(new MemObjSurface(operationParams.dstMemObj)));
return true;
}
protected:
Kernel *kernelBytes[5];
};
template <typename HWFamily>
class BuiltInOp<HWFamily, EBuiltInOps::CopyImage3dToBuffer> : public BuiltinDispatchInfoBuilder {
public:
BuiltInOp(BuiltIns &kernelsLib, Context &context, Device &device)
: BuiltinDispatchInfoBuilder(kernelsLib), kernelBytes{nullptr} {
populate(context, device,
EBuiltInOps::CopyImage3dToBuffer,
"",
"CopyImage3dToBufferBytes", kernelBytes[0],
"CopyImage3dToBuffer2Bytes", kernelBytes[1],
"CopyImage3dToBuffer4Bytes", kernelBytes[2],
"CopyImage3dToBuffer8Bytes", kernelBytes[3],
"CopyImage3dToBuffer16Bytes", kernelBytes[4]);
}
bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo, const BuiltinOpParams &operationParams) const override {
DispatchInfoBuilder<SplitDispatch::Dim::d3D, SplitDispatch::SplitMode::NoSplit> kernelNoSplit3DBuilder;
DEBUG_BREAK_IF(!((operationParams.srcPtr == nullptr) && ((operationParams.dstPtr != nullptr) || (operationParams.dstMemObj != nullptr))));
auto srcImage = castToObjectOrAbort<Image>(operationParams.srcMemObj);
// Redescribe image to be byte-copy
auto srcImageRedescribed = srcImage->redescribe();
multiDispatchInfo.pushRedescribedMemObj(std::unique_ptr<MemObj>(srcImageRedescribed)); // life range same as mdi's
// Calculate dstRowPitch and dstSlicePitch
auto bytesPerPixel = srcImage->getSurfaceFormatInfo().ImageElementSizeInBytes;
size_t region[] = {operationParams.size.x, operationParams.size.y, operationParams.size.z};
auto dstRowPitch = static_cast<uint32_t>(operationParams.srcRowPitch ? operationParams.srcRowPitch : region[0] * bytesPerPixel);
auto dstSlicePitch = static_cast<uint32_t>(
operationParams.srcSlicePitch ? operationParams.srcSlicePitch : ((srcImage->getImageDesc().image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY ? 1 : region[1]) * dstRowPitch));
// Determine size of host ptr surface for residency purposes
size_t hostPtrSize = operationParams.dstPtr ? Image::calculateHostPtrSize(region, dstRowPitch, dstSlicePitch, bytesPerPixel, srcImage->getImageDesc().image_type) : 0;
// Set-up ISA
auto bytesExponent = Math::log2(bytesPerPixel);
DEBUG_BREAK_IF(bytesExponent >= 5);
kernelNoSplit3DBuilder.setKernel(kernelBytes[bytesExponent]);
// Set-up source image
kernelNoSplit3DBuilder.setArg(0, srcImageRedescribed);
// Set-up destination host ptr / buffer
if (operationParams.dstPtr) {
kernelNoSplit3DBuilder.setArgSvm(1, hostPtrSize, operationParams.dstPtr);
} else {
kernelNoSplit3DBuilder.setArg(1, operationParams.dstMemObj);
}
// Set-up srcOrigin
{
uint32_t origin[] = {
static_cast<uint32_t>(operationParams.srcOffset.x),
static_cast<uint32_t>(operationParams.srcOffset.y),
static_cast<uint32_t>(operationParams.srcOffset.z),
0};
kernelNoSplit3DBuilder.setArg(2, sizeof(origin), origin);
}
// Set-up dstOffset
kernelNoSplit3DBuilder.setArg(3, static_cast<uint32_t>(operationParams.dstOffset.x));
// Set-up dstRowPitch
{
uint32_t pitch[] = {
static_cast<uint32_t>(dstRowPitch),
static_cast<uint32_t>(dstSlicePitch)};
kernelNoSplit3DBuilder.setArg(4, sizeof(pitch), pitch);
}
// Set-up work sizes
kernelNoSplit3DBuilder.setDispatchGeometry(operationParams.size, Vec3<size_t>{0, 0, 0}, Vec3<size_t>{0, 0, 0});
kernelNoSplit3DBuilder.bake(multiDispatchInfo);
// Store source and destination surfaces for residency purposes
multiDispatchInfo.pushUsedSurface(std::unique_ptr<Surface>(new MemObjSurface(operationParams.srcMemObj)));
if (operationParams.dstMemObj) {
multiDispatchInfo.pushUsedSurface(std::unique_ptr<Surface>(new MemObjSurface(operationParams.dstMemObj)));
} else {
multiDispatchInfo.pushUsedSurface(std::unique_ptr<Surface>(new HostPtrSurface(operationParams.dstPtr, hostPtrSize)));
}
return true;
}
protected:
Kernel *kernelBytes[5];
};
template <typename HWFamily>
class BuiltInOp<HWFamily, EBuiltInOps::CopyImageToImage3d> : public BuiltinDispatchInfoBuilder {
public:
BuiltInOp(BuiltIns &kernelsLib, Context &context, Device &device)
: BuiltinDispatchInfoBuilder(kernelsLib), kernel(nullptr) {
populate(context, device,
EBuiltInOps::CopyImageToImage3d,
"",
"CopyImageToImage3d", kernel);
}
bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo, const BuiltinOpParams &operationParams) const override {
DispatchInfoBuilder<SplitDispatch::Dim::d3D, SplitDispatch::SplitMode::NoSplit> kernelNoSplit3DBuilder;
DEBUG_BREAK_IF(!((operationParams.srcPtr == nullptr) && (operationParams.dstPtr == nullptr)));
auto srcImage = castToObjectOrAbort<Image>(operationParams.srcMemObj);
auto dstImage = castToObjectOrAbort<Image>(operationParams.dstMemObj);
// Redescribe images to be byte-copies
auto srcImageRedescribed = srcImage->redescribe();
auto dstImageRedescribed = dstImage->redescribe();
multiDispatchInfo.pushRedescribedMemObj(std::unique_ptr<MemObj>(srcImageRedescribed)); // life range same as mdi's
multiDispatchInfo.pushRedescribedMemObj(std::unique_ptr<MemObj>(dstImageRedescribed)); // life range same as mdi's
// Set-up kernel
kernelNoSplit3DBuilder.setKernel(kernel);
// Set-up source image
kernelNoSplit3DBuilder.setArg(0, srcImageRedescribed);
// Set-up destination image
kernelNoSplit3DBuilder.setArg(1, dstImageRedescribed);
// Set-up srcOrigin
{
uint32_t origin[] = {
static_cast<uint32_t>(operationParams.srcOffset.x),
static_cast<uint32_t>(operationParams.srcOffset.y),
static_cast<uint32_t>(operationParams.srcOffset.z),
0};
kernelNoSplit3DBuilder.setArg(2, sizeof(origin), origin);
}
// Set-up dstOrigin
{
uint32_t origin[] = {
static_cast<uint32_t>(operationParams.dstOffset.x),
static_cast<uint32_t>(operationParams.dstOffset.y),
static_cast<uint32_t>(operationParams.dstOffset.z),
0};
kernelNoSplit3DBuilder.setArg(3, sizeof(origin), origin);
}
// Set-up work sizes
kernelNoSplit3DBuilder.setDispatchGeometry(operationParams.size, Vec3<size_t>{0, 0, 0}, Vec3<size_t>{0, 0, 0});
kernelNoSplit3DBuilder.bake(multiDispatchInfo);
// Store source and destination surfaces for residency purposes
multiDispatchInfo.pushUsedSurface(std::unique_ptr<Surface>(new MemObjSurface(srcImage)));
multiDispatchInfo.pushUsedSurface(std::unique_ptr<Surface>(new MemObjSurface(dstImage)));
return true;
}
protected:
Kernel *kernel;
};
template <typename HWFamily>
class BuiltInOp<HWFamily, EBuiltInOps::FillImage3d> : public BuiltinDispatchInfoBuilder {
public:
BuiltInOp(BuiltIns &kernelsLib, Context &context, Device &device)
: BuiltinDispatchInfoBuilder(kernelsLib), kernel(nullptr) {
populate(context, device,
EBuiltInOps::FillImage3d,
"",
"FillImage3d", kernel);
}
bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo, const BuiltinOpParams &operationParams) const override {
DispatchInfoBuilder<SplitDispatch::Dim::d3D, SplitDispatch::SplitMode::NoSplit> kernelNoSplit3DBuilder;
DEBUG_BREAK_IF(!((operationParams.srcMemObj == nullptr) && (operationParams.srcPtr != nullptr) && (operationParams.dstPtr == nullptr)));
auto image = castToObjectOrAbort<Image>(operationParams.dstMemObj);
// Redescribe image to be byte-copy
auto imageRedescribed = image->redescribeFillImage();
multiDispatchInfo.pushRedescribedMemObj(std::unique_ptr<MemObj>(imageRedescribed));
// Set-up kernel
kernelNoSplit3DBuilder.setKernel(kernel);
// Set-up destination image
kernelNoSplit3DBuilder.setArg(0, imageRedescribed);
// Set-up fill color
int iFillColor[4] = {0};
const void *fillColor = operationParams.srcPtr;
convertFillColor(fillColor,
iFillColor,
image->getSurfaceFormatInfo().OCLImageFormat,
imageRedescribed->getSurfaceFormatInfo().OCLImageFormat);
kernelNoSplit3DBuilder.setArg(1, 4 * sizeof(int32_t), iFillColor);
// Set-up dstOffset
{
uint32_t offset[] = {
static_cast<uint32_t>(operationParams.dstOffset.x),
static_cast<uint32_t>(operationParams.dstOffset.y),
static_cast<uint32_t>(operationParams.dstOffset.z),
0};
kernelNoSplit3DBuilder.setArg(2, sizeof(offset), offset);
}
// Set-up work sizes
kernelNoSplit3DBuilder.setDispatchGeometry(operationParams.size, Vec3<size_t>{0, 0, 0}, Vec3<size_t>{0, 0, 0});
kernelNoSplit3DBuilder.bake(multiDispatchInfo);
// Store destination surface for residency purposes
multiDispatchInfo.pushUsedSurface(std::unique_ptr<Surface>(new MemObjSurface(image)));
return true;
}
protected:
Kernel *kernel;
};
// Returns the dispatch-info builder for `operation`, creating it on first use.
// Creation is guarded by the std::once_flag stored next to the builder.
// Throws std::runtime_error for operations that have no builder specialization
// listed in the switch below.
BuiltinDispatchInfoBuilder &BuiltIns::getBuiltinDispatchInfoBuilder(EBuiltInOps operation, Context &context, Device &device) {
    auto &slot = BuiltinOpsBuilders[static_cast<uint32_t>(operation)];
    auto &builder = slot.first;
    auto &createdOnce = slot.second;

    switch (operation) {
    case EBuiltInOps::CopyBufferToBuffer:
        std::call_once(createdOnce, [&] { builder.reset(new BuiltInOp<HWFamily, EBuiltInOps::CopyBufferToBuffer>(*this, context, device)); });
        break;
    case EBuiltInOps::CopyBufferRect:
        std::call_once(createdOnce, [&] { builder.reset(new BuiltInOp<HWFamily, EBuiltInOps::CopyBufferRect>(*this, context, device)); });
        break;
    case EBuiltInOps::FillBuffer:
        std::call_once(createdOnce, [&] { builder.reset(new BuiltInOp<HWFamily, EBuiltInOps::FillBuffer>(*this, context, device)); });
        break;
    case EBuiltInOps::CopyBufferToImage3d:
        std::call_once(createdOnce, [&] { builder.reset(new BuiltInOp<HWFamily, EBuiltInOps::CopyBufferToImage3d>(*this, context, device)); });
        break;
    case EBuiltInOps::CopyImage3dToBuffer:
        std::call_once(createdOnce, [&] { builder.reset(new BuiltInOp<HWFamily, EBuiltInOps::CopyImage3dToBuffer>(*this, context, device)); });
        break;
    case EBuiltInOps::CopyImageToImage3d:
        std::call_once(createdOnce, [&] { builder.reset(new BuiltInOp<HWFamily, EBuiltInOps::CopyImageToImage3d>(*this, context, device)); });
        break;
    case EBuiltInOps::FillImage3d:
        std::call_once(createdOnce, [&] { builder.reset(new BuiltInOp<HWFamily, EBuiltInOps::FillImage3d>(*this, context, device)); });
        break;
    case EBuiltInOps::VmeBlockMotionEstimateIntel:
        std::call_once(createdOnce, [&] { builder.reset(new BuiltInOp<HWFamily, EBuiltInOps::VmeBlockMotionEstimateIntel>(*this, context, device)); });
        break;
    case EBuiltInOps::VmeBlockAdvancedMotionEstimateCheckIntel:
        std::call_once(createdOnce, [&] { builder.reset(new BuiltInOp<HWFamily, EBuiltInOps::VmeBlockAdvancedMotionEstimateCheckIntel>(*this, context, device)); });
        break;
    case EBuiltInOps::VmeBlockAdvancedMotionEstimateBidirectionalCheckIntel:
        std::call_once(createdOnce, [&] { builder.reset(new BuiltInOp<HWFamily, EBuiltInOps::VmeBlockAdvancedMotionEstimateBidirectionalCheckIntel>(*this, context, device)); });
        break;
    default:
        throw std::runtime_error("getBuiltinDispatchInfoBuilder failed");
    }
    return *builder;
}
// Installs `builder` as the dispatch-info builder for `operation` and hands the
// previously installed one (possibly null) back to the caller. The context and
// device arguments are not used here.
std::unique_ptr<BuiltinDispatchInfoBuilder> BuiltIns::setBuiltinDispatchInfoBuilder(EBuiltInOps operation, Context &context, Device &device, std::unique_ptr<BuiltinDispatchInfoBuilder> builder) {
    auto &slot = BuiltinOpsBuilders[static_cast<uint32_t>(operation)];
    // After the exchange `builder` owns whatever was stored in the slot before.
    builder.swap(slot.first);
    return builder;
}
} // namespace OCLRT

View File

@@ -0,0 +1,301 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include "runtime/built_ins/sip.h"
#include "runtime/scheduler/scheduler_kernel.h"
#include "runtime/program/program.h"
#include "runtime/utilities/vec.h"
#include "runtime/os_interface/os_inc.h"
#include <array>
#include <cstdint>
#include <fstream>
#include <memory>
#include <mutex>
#include <string>
#include <tuple>
#include <unordered_map>
#include <vector>
namespace OCLRT {
// Raw bytes of a built-in resource (kernel source, intermediate code or binary).
using BuiltinResourceT = std::vector<char>;
// Build options for the media kernels; defined in another translation unit.
extern const char *mediaKernelsBuildOptions;
// Identifiers of the built-in operations. The numeric values are used as array
// indices (see BuiltIns::BuiltinOpsBuilders), so enumerators must stay
// contiguous starting at 0 and COUNT must remain the last entry.
enum class EBuiltInOps : uint32_t {
CopyBufferToBuffer = 0,
CopyBufferRect,
FillBuffer,
CopyBufferToImage3d,
CopyImage3dToBuffer,
CopyImageToImage1d,
CopyImageToImage2d,
CopyImageToImage3d,
FillImage1d,
FillImage2d,
FillImage3d,
VmeBlockMotionEstimateIntel,
VmeBlockAdvancedMotionEstimateCheckIntel,
VmeBlockAdvancedMotionEstimateBidirectionalCheckIntel,
Scheduler,
COUNT // number of operations; used to size per-operation arrays
};
// Returns a resource holding a copy of the `size` bytes starting at `ptr`.
BuiltinResourceT createBuiltinResource(const char *ptr, size_t size);
// Returns a copy of the resource `r`.
BuiltinResourceT createBuiltinResource(const BuiltinResourceT &r);
// Builds a resource name of the form "[<platformName>_<deviceRevId>_]<builtin name><extension>";
// the bracketed prefix is emitted only when platformName is non-empty.
std::string createBuiltinResourceName(EBuiltInOps builtin, const std::string &extension,
const std::string &platformName = "", uint32_t deviceRevId = 0);
// Concatenates two path fragments, inserting PATH_SEPARATOR between them when
// needed; if either side is empty the other one is returned unchanged.
std::string joinPath(const std::string &lhs, const std::string &rhs);
// Maps a built-in operation to its resource base name ("unknown" for unmapped values).
const char *getBuiltinAsString(EBuiltInOps builtin);
// Abstract source of built-in resources. A storage is rooted at `rootPath`;
// load() prefixes the requested resource name with that root and delegates the
// actual lookup to the concrete loadImpl().
class Storage {
  public:
    Storage(const std::string &rootPath)
        : rootPath(rootPath) {
    }

    // Concrete storages are owned and destroyed through std::unique_ptr<Storage>
    // (see BuiltinsLib::allStorages), so the destructor has to be virtual:
    // deleting a derived object through a base pointer is undefined otherwise.
    virtual ~Storage() = default;

    // Loads the resource called `resourceName`, resolved relative to rootPath.
    BuiltinResourceT load(const std::string &resourceName);

  protected:
    // Loads the resource identified by the already-joined name `fullResourceName`.
    virtual BuiltinResourceT loadImpl(const std::string &fullResourceName) = 0;

    // Prefix joined in front of every resource name passed to load().
    std::string rootPath;
};
// Storage whose resources are files on disk.
class FileStorage : public Storage {
public:
// rootPath defaults to the empty string, in which case resource names are used as given.
FileStorage(const std::string &rootPath = "")
: Storage(rootPath) {
}
protected:
// Reads the file named by fullResourceName and returns its bytes.
BuiltinResourceT loadImpl(const std::string &fullResourceName) override;
};
struct EmbeddedStorageRegistry {
static EmbeddedStorageRegistry &getInstance() {
static EmbeddedStorageRegistry gsr;
return gsr;
}
void store(const std::string &name, BuiltinResourceT &&resource) {
resources.emplace(name, BuiltinResourceT(std::move(resource)));
}
const BuiltinResourceT *get(const std::string &name) const;
private:
using ResourcesContainer = std::unordered_map<std::string, BuiltinResourceT>;
ResourcesContainer resources;
};
// Storage whose resources come from the EmbeddedStorageRegistry.
class EmbeddedStorage : public Storage {
public:
EmbeddedStorage(const std::string &rootPath)
: Storage(rootPath) {
}
protected:
// Returns a copy of the registered resource, or an empty resource when the
// name is not present in the registry.
BuiltinResourceT loadImpl(const std::string &fullResourceName) override;
};
// A built-in's code together with the form it is in and the device it targets.
struct BuiltinCode {
    enum class ECodeType {
        Any = 0,          // request whichever form is available - priorities as below
        Binary = 1,       // ISA - highest priority
        Intermediate = 2, // SPIR/LLVM - medium priority
        Source = 3,       // OCL C - lowest priority
        COUNT,
        INVALID
    };

    // Returns the file extension used for code of type `ct`; types without a
    // dedicated extension (Any, COUNT, INVALID, ...) map to the empty string.
    static const char *getExtension(ECodeType ct) {
        if (ct == ECodeType::Binary) {
            return ".bin";
        }
        if (ct == ECodeType::Intermediate) {
            return ".bc";
        }
        if (ct == ECodeType::Source) {
            return ".cl";
        }
        return "";
    }

    ECodeType type;            // form of the code held in `resource`
    BuiltinResourceT resource; // the code bytes
    Device *targetDevice;      // device this code is meant for
};
// Library giving access to the code of built-in operations, backed by a list of
// Storage objects.
class BuiltinsLib {
public:
// Registers the available storages (embedded storage first, then file storage).
BuiltinsLib();
// Returns the code of `builtin` in the requested form; access is serialized
// with `mutex`. NOTE(review): the exact fallback rules for ECodeType::Any live
// in the implementation - confirm there.
BuiltinCode getBuiltinCode(EBuiltInOps builtin, BuiltinCode::ECodeType requestedCodeType, Device &device);
// Creates a Program from previously obtained built-in code.
static std::unique_ptr<Program> createProgramFromCode(const BuiltinCode &bc, Context &context, Device &device);
protected:
BuiltinResourceT getBuiltinResource(EBuiltInOps builtin, BuiltinCode::ECodeType requestedCodeType, Device &device);
using StoragesContainerT = std::vector<std::unique_ptr<Storage>>;
StoragesContainerT allStorages; // sorted by priority allStorages[0] will be checked before allStorages[1], etc.
std::mutex mutex;
};
class Context;
class Device;
class Kernel;
class Program;
// Bookkeeping for one built-in kernel: its source, the program built from it
// and the resulting kernel object. All pointers start out null.
struct BuiltInKernel {
const char *pSource = nullptr;
Program *pProgram = nullptr;
std::once_flag programIsInitialized; // guard for creating+building the program
Kernel *pKernel = nullptr;
BuiltInKernel() {
}
};
class BuiltinDispatchInfoBuilder;
// Singleton holding the runtime's built-in kernels and the per-operation
// dispatch-info builders.
class BuiltIns {
public:
using HWFamily = int;
// One (builder, once_flag) pair per EBuiltInOps value; the flag guards the
// lazy creation done by getBuiltinDispatchInfoBuilder().
std::pair<std::unique_ptr<BuiltinDispatchInfoBuilder>, std::once_flag> BuiltinOpsBuilders[static_cast<uint32_t>(EBuiltInOps::COUNT)];
// Returns the builder for `op`, creating it on first use; throws
// std::runtime_error for operations without a builder.
BuiltinDispatchInfoBuilder &getBuiltinDispatchInfoBuilder(EBuiltInOps op, Context &context, Device &device);
// Replaces the builder for `op` with `newBuilder` and returns the previous one.
std::unique_ptr<BuiltinDispatchInfoBuilder> setBuiltinDispatchInfoBuilder(EBuiltInOps op, Context &context, Device &device,
std::unique_ptr<BuiltinDispatchInfoBuilder> newBuilder);
// Singleton access / teardown.
static BuiltIns &getInstance();
static void shutDown();
Program *createBuiltInProgram(
Context &context,
Device &device,
const char *kernelNames,
int &errcodeRet);
SchedulerKernel &getSchedulerKernel(Context &context);
SipKernel &getSipKernel(SipKernelType kernel, Context &context);
// Returns the built-ins library; it is expected to have been created already
// (debug builds break if it has not).
BuiltinsLib &getBuiltinsLib() {
DEBUG_BREAK_IF(!builtinsLib.get());
return *builtinsLib;
}
// Accessors for the caching flag (enabled by default).
void setCacheingEnableState(bool enableCacheing) {
this->enableCacheing = enableCacheing;
}
bool isCacheingEnabled() const {
return this->enableCacheing;
}
protected:
// Construction and destruction are restricted: instances are managed through
// getInstance() / shutDown().
BuiltIns();
~BuiltIns();
// singleton
static BuiltIns *pInstance;
// scheduler kernel
BuiltInKernel schedulerBuiltIn;
// sip builtins
std::pair<std::unique_ptr<SipKernel>, std::once_flag> sipKernels[static_cast<uint32_t>(SipKernelType::COUNT)];
std::unique_ptr<BuiltinsLib> builtinsLib;
// One (program, once_flag) pair per EBuiltInOps value.
using ProgramsContainerT = std::array<std::pair<std::unique_ptr<Program>, std::once_flag>, static_cast<size_t>(EBuiltInOps::COUNT)>;
ProgramsContainerT builtinPrograms;
bool enableCacheing = true;
};
class MemObj;
// Base class of the per-operation dispatch-info builders. A builder owns the
// program and kernels of one built-in operation and translates operation
// parameters into entries of a MultiDispatchInfo.
class BuiltinDispatchInfoBuilder {
  public:
    // Parameters describing a single built-in operation. Each side (source /
    // destination) may be given as a raw pointer, a MemObj or an SVM allocation;
    // unused fields keep their null / zero defaults.
    struct BuiltinOpParams {
        void *srcPtr = nullptr;
        void *dstPtr = nullptr;
        MemObj *srcMemObj = nullptr;
        MemObj *dstMemObj = nullptr;
        GraphicsAllocation *srcSvmAlloc = nullptr;
        GraphicsAllocation *dstSvmAlloc = nullptr;
        Vec3<size_t> srcOffset = {0, 0, 0};
        Vec3<size_t> dstOffset = {0, 0, 0};
        Vec3<size_t> size = {0, 0, 0};
        size_t srcRowPitch = 0;
        size_t dstRowPitch = 0;
        size_t srcSlicePitch = 0;
        size_t dstSlicePitch = 0;
    };

    BuiltinDispatchInfoBuilder(BuiltIns &kernelLib) : kernelsLib(kernelLib) {}

    // Derived builders are owned and destroyed through
    // std::unique_ptr<BuiltinDispatchInfoBuilder> (see BuiltIns::BuiltinOpsBuilders),
    // so the destructor has to be virtual: deleting a derived object through a
    // base pointer is undefined otherwise.
    virtual ~BuiltinDispatchInfoBuilder() = default;

    // Prepares the program and kernels for `operation`; `desc` is a flat list of
    // (kernel name, Kernel *& destination) pairs. Defined outside this class.
    template <typename... KernelsDescArgsT>
    void populate(Context &context, Device &device, EBuiltInOps operation, const char *options, KernelsDescArgsT &&... desc);

    // Builds dispatch infos from operation parameters. The base implementation
    // builds nothing and returns false.
    virtual bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo, const BuiltinOpParams &operationParams) const {
        return false;
    }

    // Builds dispatch infos for an explicit kernel and work geometry. The base
    // implementation builds nothing and returns false.
    virtual bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo, Kernel *kernel,
                                    const uint32_t dim, const Vec3<size_t> &gws, const Vec3<size_t> &elws, const Vec3<size_t> &offset) const {
        return false;
    }

    // Validates a dispatch request. The base implementation accepts everything.
    virtual cl_int validateDispatch(Kernel *kernel, uint32_t inworkDim, const Vec3<size_t> &gws, const Vec3<size_t> &elws, const Vec3<size_t> &offset) const {
        return CL_SUCCESS;
    }

    // returns true if argument should be updated in kernel exposed to user code
    virtual bool setExplicitArg(uint32_t argIndex, size_t argSize, const void *argVal, cl_int &err) const {
        return true;
    }

    void takeOwnership(Context *context);
    void releaseOwnership();

  protected:
    // Consumes one (name, destination) pair per recursion step: creates the
    // kernel `kernelName` from `prog`, marks it as built-in, stores the raw
    // pointer in `kernelDst` and keeps ownership in `usedKernels`.
    template <typename KernelNameT, typename... KernelsDescArgsT>
    void grabKernels(KernelNameT &&kernelName, Kernel *&kernelDst, KernelsDescArgsT &&... kernelsDesc) {
        const KernelInfo *ki = prog->getKernelInfo(kernelName);
        cl_int err = 0;
        kernelDst = Kernel::create(prog.get(), *ki, &err);
        kernelDst->isBuiltIn = true;
        usedKernels.push_back(std::unique_ptr<Kernel>(kernelDst));
        grabKernels(std::forward<KernelsDescArgsT>(kernelsDesc)...);
    }

    // Recursion terminator for the variadic overload above.
    cl_int grabKernels() { return CL_SUCCESS; }

    std::unique_ptr<Program> prog;                    // program the kernels are created from
    std::vector<std::unique_ptr<Kernel>> usedKernels; // owns every kernel created by grabKernels()
    BuiltIns &kernelsLib;
};
// Primary template of the per-operation builders. Only declared here; the
// specializations for the individual operations are defined in the
// implementation file.
template <typename HWFamily, EBuiltInOps OpCode>
class BuiltInOp;
} // namespace OCLRT

View File

@@ -0,0 +1,216 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include <cstdint>
#include "runtime/built_ins/built_ins.h"
namespace OCLRT {
// Maps a built-in operation to the base name of its resource. Every known
// operation yields "<snake_case_name>.igdrcl_built_in"; any other value
// (including EBuiltInOps::COUNT) yields "unknown".
const char *getBuiltinAsString(EBuiltInOps builtin) {
    switch (builtin) {
    default:
        return "unknown";
    case EBuiltInOps::CopyBufferToBuffer:
        return "copy_buffer_to_buffer.igdrcl_built_in";
    case EBuiltInOps::CopyBufferRect:
        return "copy_buffer_rect.igdrcl_built_in";
    case EBuiltInOps::FillBuffer:
        return "fill_buffer.igdrcl_built_in";
    case EBuiltInOps::CopyBufferToImage3d:
        return "copy_buffer_to_image3d.igdrcl_built_in";
    case EBuiltInOps::CopyImage3dToBuffer:
        return "copy_image3d_to_buffer.igdrcl_built_in";
    case EBuiltInOps::CopyImageToImage1d:
        return "copy_image_to_image1d.igdrcl_built_in";
    case EBuiltInOps::CopyImageToImage2d:
        return "copy_image_to_image2d.igdrcl_built_in";
    case EBuiltInOps::CopyImageToImage3d:
        return "copy_image_to_image3d.igdrcl_built_in";
    case EBuiltInOps::FillImage1d:
        return "fill_image1d.igdrcl_built_in";
    case EBuiltInOps::FillImage2d:
        return "fill_image2d.igdrcl_built_in";
    case EBuiltInOps::FillImage3d:
        return "fill_image3d.igdrcl_built_in";
    case EBuiltInOps::VmeBlockMotionEstimateIntel:
        return "vme_block_motion_estimate_intel.igdrcl_built_in";
    case EBuiltInOps::VmeBlockAdvancedMotionEstimateCheckIntel:
        return "vme_block_advanced_motion_estimate_check_intel.igdrcl_built_in";
    case EBuiltInOps::VmeBlockAdvancedMotionEstimateBidirectionalCheckIntel:
        // Carries the same ".igdrcl_built_in" suffix as every other operation,
        // so the resource name derived from it follows the common scheme.
        return "vme_block_advanced_motion_estimate_bidirectional_check_intel.igdrcl_built_in";
    case EBuiltInOps::Scheduler:
        return "scheduler.igdrcl_built_in";
    }
}
// Returns a resource holding a copy of the `size` bytes starting at `ptr`.
BuiltinResourceT createBuiltinResource(const char *ptr, size_t size) {
    BuiltinResourceT resource;
    resource.assign(ptr, ptr + size);
    return resource;
}
// Returns an independent copy of the resource `r`.
BuiltinResourceT createBuiltinResource(const BuiltinResourceT &r) {
    BuiltinResourceT duplicate = r;
    return duplicate;
}
// Builds the resource name "[<platformName>_<deviceRevId>_]<builtin name><extension>".
// The bracketed prefix is emitted only when platformName is non-empty.
std::string createBuiltinResourceName(EBuiltInOps builtin, const std::string &extension,
                                      const std::string &platformName, uint32_t deviceRevId) {
    std::string name;
    if (!platformName.empty()) {
        name.append(platformName).append("_").append(std::to_string(deviceRevId)).append("_");
    }
    name.append(getBuiltinAsString(builtin));
    // Appending an empty extension leaves the name unchanged.
    name.append(extension);
    return name;
}
// Concatenates two path fragments, inserting PATH_SEPARATOR between them
// unless lhs already ends with it. If either fragment is empty the other one
// is returned unchanged.
std::string joinPath(const std::string &lhs, const std::string &rhs) {
    if (lhs.empty()) {
        return rhs;
    }
    if (rhs.empty()) {
        return lhs;
    }

    std::string joined(lhs);
    if (lhs.back() != PATH_SEPARATOR) {
        joined += PATH_SEPARATOR;
    }
    joined += rhs;
    return joined;
}
// Returns the driver installation directory; this implementation always
// reports an empty path.
std::string getDriverInstallationPath() {
    return std::string();
}
// Resolves resourceName against this storage's rootPath and delegates the
// actual loading to loadImpl().
BuiltinResourceT Storage::load(const std::string &resourceName) {
    const std::string fullResourceName = joinPath(rootPath, resourceName);
    return loadImpl(fullResourceName);
}
// Reads the whole file named fullResourceName into a resource buffer.
//
// Returns an empty resource when the file cannot be opened, its size cannot
// be determined, it is empty, or its contents cannot be read completely.
BuiltinResourceT FileStorage::loadImpl(const std::string &fullResourceName) {
    BuiltinResourceT ret;

    // Open positioned at the end so the first tellg() reports the file size.
    std::ifstream f{fullResourceName, std::ios::in | std::ios::binary | std::ios::ate};
    if (!f.is_open()) {
        return ret;
    }

    const auto end = f.tellg();
    f.seekg(0, std::ios::beg);
    const auto beg = f.tellg();
    const auto s = end - beg;
    // A failed tellg() reports -1, which can make the difference negative;
    // never feed such a value (cast to size_t) into resize().
    if (s <= 0) {
        return ret;
    }

    ret.resize(static_cast<size_t>(s));
    if (!f.read(ret.data(), s)) {
        // Do not hand out a partially filled buffer as if it were valid.
        ret.clear();
    }
    return ret;
}
// Looks up a registered resource by name; returns nullptr when no resource
// is stored under that name.
const BuiltinResourceT *EmbeddedStorageRegistry::get(const std::string &name) const {
    const auto found = resources.find(name);
    return (found != resources.end()) ? &found->second : nullptr;
}
// Fetches a copy of the resource registered under fullResourceName in the
// embedded-storage registry; yields an empty resource when none is registered.
BuiltinResourceT EmbeddedStorage::loadImpl(const std::string &fullResourceName) {
    const BuiltinResourceT *registered = EmbeddedStorageRegistry::getInstance().get(fullResourceName);
    if (registered != nullptr) {
        return createBuiltinResource(*registered);
    }
    return BuiltinResourceT();
}
// Registers the storages in lookup order: the embedded storage first, then
// the file storage rooted at the driver installation path.
BuiltinsLib::BuiltinsLib() {
    std::unique_ptr<Storage> embedded(new EmbeddedStorage(""));
    allStorages.push_back(std::move(embedded));

    std::unique_ptr<Storage> onDisk(new FileStorage(getDriverInstallationPath()));
    allStorages.push_back(std::move(onDisk));
}
// Retrieves the code of a built-in for the given device.
//
// With ECodeType::Any, code types are probed in enum order starting at Binary
// and stopping before COUNT; the first type that yields a non-empty resource
// wins, and the type stays INVALID when none does. With any other requested
// type, that type is looked up directly and reported back even if the
// resource turns out to be empty. The whole lookup runs under the library mutex.
BuiltinCode BuiltinsLib::getBuiltinCode(EBuiltInOps builtin, BuiltinCode::ECodeType requestedCodeType, Device &device) {
    std::lock_guard<std::mutex> guard{mutex};

    BuiltinResourceT resource;
    BuiltinCode::ECodeType resolvedType = BuiltinCode::ECodeType::INVALID;

    if (requestedCodeType != BuiltinCode::ECodeType::Any) {
        resource = getBuiltinResource(builtin, requestedCodeType, device);
        resolvedType = requestedCodeType;
    } else {
        const uint32_t typeEnd = static_cast<uint32_t>(BuiltinCode::ECodeType::COUNT);
        uint32_t candidate = static_cast<uint32_t>(BuiltinCode::ECodeType::Binary);
        while (candidate != typeEnd) {
            const auto candidateType = static_cast<BuiltinCode::ECodeType>(candidate);
            resource = getBuiltinResource(builtin, candidateType, device);
            if (resource.size() > 0) {
                resolvedType = candidateType;
                break;
            }
            ++candidate;
        }
    }

    BuiltinCode code;
    std::swap(code.resource, resource);
    code.type = resolvedType;
    code.targetDevice = &device;
    return code;
}
std::unique_ptr<Program> BuiltinsLib::createProgramFromCode(const BuiltinCode &bc, Context &context, Device &device) {
std::unique_ptr<Program> ret;
const char *data = bc.resource.data();
size_t dataLen = bc.resource.size();
cl_int err = 0;
switch (bc.type) {
default:
break;
case BuiltinCode::ECodeType::Source:
case BuiltinCode::ECodeType::Intermediate:
ret.reset(Program::create(data, &context, device, true, &err));
break;
case BuiltinCode::ECodeType::Binary:
ret.reset(Program::createFromGenBinary(&context, data, dataLen, true, nullptr));
break;
}
return ret;
}
// Searches all storages for a built-in resource of the requested code type.
//
// Candidate names are tried from most to least specific: platform + stepping,
// then platform only, then the generic name. For each candidate every storage
// is queried in registration order, and the first non-empty resource is
// returned. An empty resource is returned when nothing matches.
BuiltinResourceT BuiltinsLib::getBuiltinResource(EBuiltInOps builtin, BuiltinCode::ECodeType requestedCodeType, Device &device) {
    std::string genericName = createBuiltinResourceName(builtin, BuiltinCode::getExtension(requestedCodeType));
    std::string platformName = createBuiltinResourceName(builtin, BuiltinCode::getExtension(requestedCodeType), device.getProductAbbrev());
    std::string steppingName = createBuiltinResourceName(builtin, BuiltinCode::getExtension(requestedCodeType), device.getProductAbbrev(),
                                                         device.getHardwareInfo().pPlatform->usRevId);

    // Dedicated versions come first; the generic one is only a fallback.
    const std::string *const candidates[] = {&steppingName, &platformName, &genericName};

    BuiltinResourceT found;
    for (const std::string *candidate : candidates) {
        for (const auto &storage : allStorages) {
            found = storage->load(*candidate);
            if (found.size() != 0) {
                return found;
            }
        }
    }
    return found;
}
} // namespace OCLRT

View File

@@ -0,0 +1,120 @@
# Copyright (c) 2017, Intel Corporation
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
# Top-level "builtins" target; the per-platform targets created by
# compile_builtins() are attached to it as dependencies.
add_custom_target(builtins)
set_target_properties(builtins PROPERTIES FOLDER "built_ins")
# Root directory for generated built-in files, split per architecture.
set (BUILTINS_OUTDIR_WITH_ARCH "${TargetDir}/built_ins/${NEO_ARCH}")
# Make the built-ins binaries library depend on the "builtins" target.
add_dependencies(${BUILTINS_BINARIES_LIB_NAME} builtins)
# Set variables needed to compile built-in kernels
# Each <NAME>_BUILTIN variable holds the kernel source file name; compile_builtins()
# resolves it through ${<NAME>_BUILTIN} for every <NAME> listed in GENERATED_BUILTINS.
set (COPY_BUFFER_TO_BUFFER_BUILTIN copy_buffer_to_buffer.igdrcl_built_in)
set (COPY_BUFFER_RECT_BUILTIN copy_buffer_rect.igdrcl_built_in)
set (FILL_BUFFER_BUILTIN fill_buffer.igdrcl_built_in)
set (COPY_BUFFER_TO_IMAGE3D_BUILTIN copy_buffer_to_image3d.igdrcl_built_in)
set (COPY_IMAGE3D_TO_BUFFER_BUILTIN copy_image3d_to_buffer.igdrcl_built_in)
set (COPY_IMAGE_TO_IMAGE1D_BUILTIN copy_image_to_image1d.igdrcl_built_in)
set (COPY_IMAGE_TO_IMAGE2D_BUILTIN copy_image_to_image2d.igdrcl_built_in)
set (COPY_IMAGE_TO_IMAGE3D_BUILTIN copy_image_to_image3d.igdrcl_built_in)
set (FILL_IMAGE1D_BUILTIN fill_image1d.igdrcl_built_in)
set (FILL_IMAGE2D_BUILTIN fill_image2d.igdrcl_built_in)
set (FILL_IMAGE3D_BUILTIN fill_image3d.igdrcl_built_in)
# Extra cloc option, passed on the command line in compile_builtin().
# NOTE(review): the greater-than-4GB option is enabled only for the x32
# architecture - confirm this pairing is intended.
if("${NEO_ARCH}" STREQUAL "x32")
set(BUILTIN_OPTIONS "-cl-intel-greater-than-4GB-buffer-required")
else()
set(BUILTIN_OPTIONS "")
endif()
# Debug builds define DEBUG for the kernels (appended to cloc's -options string).
if("${CMAKE_BUILD_TYPE}" STREQUAL "Debug" )
set(BUILTIN_DEBUG_OPTION "-D DEBUG")
else()
set(BUILTIN_DEBUG_OPTION "")
endif()
# Publish TargetDir to the parent scope as the built-ins include directory.
set(BUILTINS_INCLUDE_DIR ${TargetDir} PARENT_SCOPE)
# Pre-declare BUILTIN_CPP; compile_builtin() overwrites it on every call.
set(BUILTIN_CPP "")
# Define function for compiling built-ins (with cloc)
# compile_builtin(<gen_name> <builtin>)
#   gen_name - device name handed to cloc via -device; also used as the output
#              sub-directory and as the file-name suffix of the outputs
#   builtin  - built-in kernel source file to compile
# Registers a custom command that runs cloc on the file and declares the
# resulting .bc/.bin/.cpp/.gen files as outputs. Sets BUILTIN_CPP in the
# caller's scope to the generated .cpp path (relative to TargetDir).
function(compile_builtin gen_name builtin)
set(OUTPUTDIR "${BUILTINS_OUTDIR_WITH_ARCH}/${gen_name}")
# get filename
get_filename_component(FILENAME ${builtin} NAME)
# get name of the file w/o extension
get_filename_component(BASENAME ${builtin} NAME_WE)
set(OUTPUTPATH_BASE "${OUTPUTDIR}/${BASENAME}_${gen_name}")
set(OUTPUT_FILES
${OUTPUTPATH_BASE}.bc
${OUTPUTPATH_BASE}.bin
${OUTPUTPATH_BASE}.cpp
${OUTPUTPATH_BASE}.gen
)
# function returns builtin cpp filename
unset(BUILTIN_CPP)
# set variable outside function
set(BUILTIN_CPP built_ins/${NEO_ARCH}/${gen_name}/${BASENAME}_${gen_name}.cpp PARENT_SCOPE)
# The two branches differ only in how cloc is launched: MSVC names the cloc
# target directly, the other branch uses the built cloc binary with
# LD_LIBRARY_PATH pointing at its directory.
if(MSVC)
add_custom_command(
OUTPUT ${OUTPUT_FILES}
COMMAND cloc -q -file ${FILENAME} -device ${gen_name} ${BUILTIN_OPTIONS} -${NEO_BITS} -out_dir ${OUTPUTDIR} -cpp_file -options "-cl-kernel-arg-info ${BUILTIN_DEBUG_OPTION}"
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
DEPENDS ${builtin} cloc copy_compiler_files
)
else()
add_custom_command(
OUTPUT ${OUTPUT_FILES}
COMMAND LD_LIBRARY_PATH=$<TARGET_FILE_DIR:cloc> $<TARGET_FILE:cloc> -q -file ${FILENAME} -device ${gen_name} ${BUILTIN_OPTIONS} -${NEO_BITS} -out_dir ${OUTPUTDIR} -cpp_file -options "-cl-kernel-arg-info ${BUILTIN_DEBUG_OPTION}"
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
DEPENDS ${builtin} cloc copy_compiler_files
)
endif()
endfunction()
# compile_builtins(<GEN_NUM> <PLATFORM_IT>)
# Compiles every built-in named in GENERATED_BUILTINS for one platform and
# groups the generated .cpp files under a "builtins_<platform lower-case>"
# target, which is then added as a dependency of "builtins".
# For each built-in, RUNTIME_GENERATED_<NAME>_GEN<GEN_NUM>_<PLATFORM_IT> is set
# to the generated .cpp path. Because this is a macro, PARENT_SCOPE is relative
# to the scope that invokes the macro.
macro(compile_builtins GEN_NUM PLATFORM_IT)
string(TOLOWER ${PLATFORM_IT} PLATFORM_LOWER)
# Suffix appended to the RUNTIME_GENERATED_* variable names below.
string(CONCAT GEN "_GEN" ${GEN_NUM} "_" ${PLATFORM_IT})
# Start with an empty list of generated files for this platform.
set (BUILTINS_COMMANDS)
foreach(GENERATED_BUILTIN ${GENERATED_BUILTINS})
# ${${GENERATED_BUILTIN}_BUILTIN} expands to the kernel file name for this entry.
compile_builtin(${PLATFORM_LOWER} ${${GENERATED_BUILTIN}_BUILTIN})
# BUILTIN_CPP was just set by compile_builtin().
list(APPEND BUILTINS_COMMANDS ${TargetDir}/${BUILTIN_CPP})
set (RUNTIME_GENERATED_${GENERATED_BUILTIN}${GEN} ${BUILTIN_CPP} PARENT_SCOPE)
endforeach(GENERATED_BUILTIN)
set(target_name builtins_${PLATFORM_LOWER})
add_custom_target(${target_name} DEPENDS ${BUILTINS_COMMANDS})
add_dependencies(builtins ${target_name})
set_target_properties(${target_name} PROPERTIES FOLDER "built_ins/${PLATFORM_LOWER}")
endmacro()
# Compile built-in kernels for all GENs
# Walk GEN numbers from MAX_GEN down to 0 and invoke compile_builtins() once
# per platform reported for that GEN.
# NOTE(review): GEN_CONTAINS_PLATFORMS and GET_PLATFORMS_FOR_GEN are defined
# outside this file; presumably they report whether / which "SUPPORTED"
# platforms exist for the GEN - verify against their definitions.
foreach(GEN_NUM RANGE ${MAX_GEN} 0 -1)
GEN_CONTAINS_PLATFORMS("SUPPORTED" ${GEN_NUM} GENX_HAS_PLATFORMS)
if(${GENX_HAS_PLATFORMS})
GET_PLATFORMS_FOR_GEN("SUPPORTED" ${GEN_NUM} SUPPORTED_GENX_PLATFORMS)
foreach(PLATFORM_IT ${SUPPORTED_GENX_PLATFORMS})
compile_builtins(${GEN_NUM} ${PLATFORM_IT})
endforeach()
endif()
endforeach()

View File

@@ -0,0 +1,63 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
R"===(
//////////////////////////////////////////////////////////////////////////////
// Copies one byte per work-item between two 2D regions.
// Global id 0 selects the column, global id 1 the row; SrcPitch.x and
// DstPitch.x are the strides between consecutive rows.
__kernel void CopyBufferRectBytes2d(
    __global const char* src,
    __global char* dst,
    uint4 SrcOrigin,
    uint4 DstOrigin,
    uint2 SrcPitch,
    uint2 DstPitch )
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    const uint srcIndex = col + SrcOrigin.x + ( ( row + SrcOrigin.y ) * SrcPitch.x );
    const uint dstIndex = col + DstOrigin.x + ( ( row + DstOrigin.y ) * DstPitch.x );

    dst[ dstIndex ] = src[ srcIndex ];
}
//////////////////////////////////////////////////////////////////////////////
// Copies one byte per work-item between two 3D regions.
// Global ids 0/1/2 select column/row/slice; Pitch.x is the stride between
// rows and Pitch.y the stride between slices.
__kernel void CopyBufferRectBytes3d(
    __global const char* src,
    __global char* dst,
    uint4 SrcOrigin,
    uint4 DstOrigin,
    uint2 SrcPitch,
    uint2 DstPitch )
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);
    const int slice = get_global_id(2);

    const uint srcIndex = col + SrcOrigin.x + ( ( row + SrcOrigin.y ) * SrcPitch.x ) + ( ( slice + SrcOrigin.z ) * SrcPitch.y );
    const uint dstIndex = col + DstOrigin.x + ( ( row + DstOrigin.y ) * DstPitch.x ) + ( ( slice + DstOrigin.z ) * DstPitch.y );

    dst[ dstIndex ] = src[ srcIndex ];
}
)==="

View File

@@ -0,0 +1,69 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
R"===(
// Copies a single byte per work-item from pSrc to pDst, each displaced by its
// own byte offset plus the global id. bytesToRead is not used by the body.
__kernel void CopyBufferToBufferBytes(
    const __global uchar* pSrc,
    __global uchar* pDst,
    uint srcOffsetInBytes,
    uint dstOffsetInBytes,
    uint bytesToRead )
{
    const size_t gid = get_global_id(0);
    pDst[ dstOffsetInBytes + gid ] = pSrc[ srcOffsetInBytes + gid ];
}
// Byte-wise copy: work-item i moves the byte at (i + srcOffsetInBytes) to
// (i + dstOffsetInBytes).
__kernel void CopyBufferToBufferLeftLeftover(
    const __global uchar* pSrc,
    __global uchar* pDst,
    uint srcOffsetInBytes,
    uint dstOffsetInBytes)
{
    const unsigned int workItem = get_global_id(0);
    const unsigned int srcIndex = workItem + srcOffsetInBytes;
    const unsigned int dstIndex = workItem + dstOffsetInBytes;
    pDst[ dstIndex ] = pSrc[ srcIndex ];
}
// Copies four uints per work-item with vload4/vstore4.
// The byte offsets are converted to uint-element offsets (>> 2) before being
// applied to the uint pointers.
__kernel void CopyBufferToBufferMiddle(
    const __global uint* pSrc,
    __global uint* pDst,
    uint srcOffsetInBytes,
    uint dstOffsetInBytes)
{
    const unsigned int workItem = get_global_id(0);
    const __global uint* srcBase = pSrc + ( srcOffsetInBytes >> 2 );
    __global uint* dstBase = pDst + ( dstOffsetInBytes >> 2 );
    vstore4(vload4(workItem, srcBase), workItem, dstBase);
}
// Byte-wise copy: work-item i moves the byte at (i + srcOffsetInBytes) to
// (i + dstOffsetInBytes).
__kernel void CopyBufferToBufferRightLeftover(
    const __global uchar* pSrc,
    __global uchar* pDst,
    uint srcOffsetInBytes,
    uint dstOffsetInBytes)
{
    const unsigned int workItem = get_global_id(0);
    const unsigned int srcIndex = workItem + srcOffsetInBytes;
    const unsigned int dstIndex = workItem + dstOffsetInBytes;
    pDst[ dstIndex ] = pSrc[ srcIndex ];
}
)==="

View File

@@ -0,0 +1,176 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
R"===(
#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
// Writes one 1-byte buffer element per work-item into a 3D image.
// The source byte is read at srcOffset + y * Pitch.x + z * Pitch.y + x and
// stored in the x channel; the texel written is (value, 0, 0, 1).
__kernel void CopyBufferToImage3dBytes(__global uchar *src,
                                       __write_only image3d_t output,
                                       int srcOffset,
                                       int4 dstOffset,
                                       uint2 Pitch) {
    const int x = get_global_id(0);
    const int y = get_global_id(1);
    const int z = get_global_id(2);

    int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
    uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);

    // Plain uint4 vector literal: an address-space qualifier has no meaning on
    // a temporary value, so none is spelled here.
    write_imageui(output, dstCoord, (uint4)(*(src + LOffset + x), 0, 0, 1));
}
// Writes one 2-byte buffer element per work-item into a 3D image as
// (value, 0, 0, 1).
// When (src + srcOffset) is not 2-byte aligned the element is assembled from
// individual bytes (low byte first in memory); otherwise it is loaded as one
// ushort.
__kernel void CopyBufferToImage3d2Bytes(__global uchar *src,
                                        __write_only image3d_t output,
                                        int srcOffset,
                                        int4 dstOffset,
                                        uint2 Pitch) {
    const int x = get_global_id(0);
    const int y = get_global_id(1);
    const int z = get_global_id(2);

    const int4 texelCoord = (int4)(x, y, z, 0) + dstOffset;
    const uint rowBase = srcOffset + (y * Pitch.x) + (z * Pitch.y);
    __global uchar *element = src + rowBase + x * 2;

    uint4 color = (uint4)(0, 0, 0, 1);

    const bool misaligned = ((( ulong )(src + srcOffset) & 0x00000001) != 0);
    if (misaligned) {
        const ushort high = element[1];
        const ushort low = element[0];
        const ushort merged = (high << 8) | low;
        color.x = (uint)merged;
    } else {
        color.x = (uint)(*(__global ushort*)element);
    }

    write_imageui(output, texelCoord, color);
}
// Writes one 4-byte buffer element per work-item into a 3D image as
// (value, 0, 0, 1).
// When (src + srcOffset) is not 4-byte aligned the element is assembled from
// individual bytes (lowest byte first in memory); otherwise it is loaded as
// one uint.
__kernel void CopyBufferToImage3d4Bytes(__global uchar *src,
                                        __write_only image3d_t output,
                                        int srcOffset,
                                        int4 dstOffset,
                                        uint2 Pitch) {
    const int x = get_global_id(0);
    const int y = get_global_id(1);
    const int z = get_global_id(2);

    const int4 texelCoord = (int4)(x, y, z, 0) + dstOffset;
    const uint rowBase = srcOffset + (y * Pitch.x) + (z * Pitch.y);
    __global uchar *element = src + rowBase + x * 4;

    uint4 color = (uint4)(0, 0, 0, 1);

    const bool misaligned = ((( ulong )(src + srcOffset) & 0x00000003) != 0);
    if (misaligned) {
        color.x = ((uint)element[3] << 24) | ((uint)element[2] << 16) | ((uint)element[1] << 8) | (uint)element[0];
    } else {
        color.x = (*(__global uint*)element);
    }

    write_imageui(output, texelCoord, color);
}
// Writes one 8-byte buffer element per work-item into a 3D image as
// (low word, high word, 0, 1).
// When (src + srcOffset) is not 8-byte aligned both 32-bit words are
// assembled from individual bytes (lowest byte first in memory); otherwise
// the element is loaded as one uint2.
__kernel void CopyBufferToImage3d8Bytes(__global uchar *src,
                                        __write_only image3d_t output,
                                        int srcOffset,
                                        int4 dstOffset,
                                        uint2 Pitch) {
    const int x = get_global_id(0);
    const int y = get_global_id(1);
    const int z = get_global_id(2);

    int4 dstCoord = (int4)(x, y, z, 0) + dstOffset;
    uint LOffset = srcOffset + (y * Pitch.x) + (z * Pitch.y);

    uint2 c = (uint2)(0, 0);

    if(( ulong )(src + srcOffset) & 0x00000007){
        uint upper2 = *((__global uchar*)(src + LOffset + x * 8 + 3));
        uint upper  = *((__global uchar*)(src + LOffset + x * 8 + 2));
        uint lower2 = *((__global uchar*)(src + LOffset + x * 8 + 1));
        uint lower  = *((__global uchar*)(src + LOffset + x * 8));
        uint combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
        c.x = combined;
        upper2 = *((__global uchar*)(src + LOffset + x * 8 + 7));
        upper  = *((__global uchar*)(src + LOffset + x * 8 + 6));
        lower2 = *((__global uchar*)(src + LOffset + x * 8 + 5));
        lower  = *((__global uchar*)(src + LOffset + x * 8 + 4));
        combined = (upper2 << 24) | (upper << 16) | (lower2 << 8) | lower;
        c.y = combined;
    }
    else{
        c = *((__global uint2*)(src + LOffset + x * 8));
    }

    // Plain uint4 vector literal: an address-space qualifier has no meaning on
    // a temporary value, so none is spelled here.
    write_imageui(output, dstCoord, (uint4)(c.x, c.y, 0, 1));
}
// Writes one 16-byte buffer element per work-item into a 3D image; the four
// 32-bit words become the x, y, z and w channels.
// When (src + srcOffset) is not 16-byte aligned every word is assembled from
// individual bytes (lowest byte first in memory); otherwise the element is
// loaded as one uint4.
__kernel void CopyBufferToImage3d16Bytes(__global uchar *src,
                                         __write_only image3d_t output,
                                         int srcOffset,
                                         int4 dstOffset,
                                         uint2 Pitch) {
    const int x = get_global_id(0);
    const int y = get_global_id(1);
    const int z = get_global_id(2);

    const int4 texelCoord = (int4)(x, y, z, 0) + dstOffset;
    const uint rowBase = srcOffset + (y * Pitch.x) + (z * Pitch.y);
    __global uchar *element = src + rowBase + x * 16;

    uint4 color = (uint4)(0, 0, 0, 0);

    const bool misaligned = ((( ulong )(src + srcOffset) & 0x0000000f) != 0);
    if (misaligned) {
        color.x = ((uint)element[3] << 24) | ((uint)element[2] << 16) | ((uint)element[1] << 8) | (uint)element[0];
        color.y = ((uint)element[7] << 24) | ((uint)element[6] << 16) | ((uint)element[5] << 8) | (uint)element[4];
        color.z = ((uint)element[11] << 24) | ((uint)element[10] << 16) | ((uint)element[9] << 8) | (uint)element[8];
        color.w = ((uint)element[15] << 24) | ((uint)element[14] << 16) | ((uint)element[13] << 8) | (uint)element[12];
    } else {
        color = *((__global uint4 *)element);
    }

    write_imageui(output, texelCoord, color);
}
)==="

View File

@@ -0,0 +1,154 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
R"===(
// Reads one texel per work-item from a 3D image and stores its x channel,
// saturated to uchar, at dstOffset + y * Pitch.x + z * Pitch.y + x.
__kernel void CopyImage3dToBufferBytes(__read_only image3d_t input,
                                       __global uchar *dst,
                                       int4 srcOffset,
                                       int dstOffset,
                                       uint2 Pitch) {
    const int x = get_global_id(0);
    const int y = get_global_id(1);
    const int z = get_global_id(2);

    const int4 texelCoord = (int4)(x, y, z, 0) + srcOffset;
    const uint rowBase = dstOffset + (y * Pitch.x) + (z * Pitch.y);
    __global uchar *out = dst + rowBase + x;

    const uint4 color = read_imageui(input, texelCoord);
    *out = convert_uchar_sat(color.x);
}
// Reads one texel per work-item from a 3D image and stores its x channel as a
// 2-byte buffer element.
// When (dst + dstOffset) is not 2-byte aligned the value is written byte by
// byte (low byte first in memory); otherwise it is stored as one ushort.
__kernel void CopyImage3dToBuffer2Bytes(__read_only image3d_t input,
                                        __global uchar *dst,
                                        int4 srcOffset,
                                        int dstOffset,
                                        uint2 Pitch) {
    const int x = get_global_id(0);
    const int y = get_global_id(1);
    const int z = get_global_id(2);

    const int4 texelCoord = (int4)(x, y, z, 0) + srcOffset;
    const uint rowBase = dstOffset + (y * Pitch.x) + (z * Pitch.y);
    __global uchar *out = dst + rowBase + x * 2;

    const uint4 color = read_imageui(input, texelCoord);

    const bool misaligned = ((( ulong )(dst + dstOffset) & 0x00000001) != 0);
    if (misaligned) {
        out[0] = convert_uchar_sat(color.x & 0xff);
        out[1] = convert_uchar_sat((color.x >> 8 ) & 0xff);
    } else {
        *((__global ushort*)out) = convert_ushort_sat(color.x);
    }
}
// Reads one texel per work-item from a 3D image and stores its x channel as a
// 4-byte buffer element.
// When (dst + dstOffset) is not 4-byte aligned the value is written byte by
// byte (lowest byte first in memory); otherwise it is stored as one uint.
__kernel void CopyImage3dToBuffer4Bytes(__read_only image3d_t input,
                                        __global uchar *dst,
                                        int4 srcOffset,
                                        int dstOffset,
                                        uint2 Pitch) {
    const int x = get_global_id(0);
    const int y = get_global_id(1);
    const int z = get_global_id(2);

    const int4 texelCoord = (int4)(x, y, z, 0) + srcOffset;
    const uint rowBase = dstOffset + (y * Pitch.x) + (z * Pitch.y);
    __global uchar *out = dst + rowBase + x * 4;

    const uint4 color = read_imageui(input, texelCoord);

    const bool misaligned = ((( ulong )(dst + dstOffset) & 0x00000003) != 0);
    if (misaligned) {
        out[0] = convert_uchar_sat(color.x & 0xff);
        out[1] = convert_uchar_sat((color.x >> 8 ) & 0xff);
        out[2] = convert_uchar_sat((color.x >> 16 ) & 0xff);
        out[3] = convert_uchar_sat((color.x >> 24 ) & 0xff);
    } else {
        *((__global uint*)out) = color.x;
    }
}
// Reads one texel per work-item from a 3D image and stores its x and y
// channels as an 8-byte buffer element (x word first).
// When (dst + dstOffset) is not 8-byte aligned the words are written byte by
// byte (lowest byte first in memory); otherwise they are stored as one uint2.
__kernel void CopyImage3dToBuffer8Bytes(__read_only image3d_t input,
                                        __global uchar *dst,
                                        int4 srcOffset,
                                        int dstOffset,
                                        uint2 Pitch) {
    const int x = get_global_id(0);
    const int y = get_global_id(1);
    const int z = get_global_id(2);

    const int4 texelCoord = (int4)(x, y, z, 0) + srcOffset;
    const uint rowBase = dstOffset + (y * Pitch.x) + (z * Pitch.y);
    __global uchar *out = dst + rowBase + x * 8;

    const uint4 color = read_imageui(input, texelCoord);

    const bool misaligned = ((( ulong )(dst + dstOffset) & 0x00000007) != 0);
    if (misaligned) {
        out[0] = convert_uchar_sat(color.x & 0xff);
        out[1] = convert_uchar_sat((color.x >> 8 ) & 0xff);
        out[2] = convert_uchar_sat((color.x >> 16 ) & 0xff);
        out[3] = convert_uchar_sat((color.x >> 24 ) & 0xff);
        out[4] = convert_uchar_sat(color.y & 0xff);
        out[5] = convert_uchar_sat((color.y >> 8 ) & 0xff);
        out[6] = convert_uchar_sat((color.y >> 16 ) & 0xff);
        out[7] = convert_uchar_sat((color.y >> 24 ) & 0xff);
    } else {
        const uint2 words = (uint2)(color.x, color.y);
        *((__global uint2*)out) = words;
    }
}
// Reads one texel per work-item from a 3D image and stores all four channels
// as a 16-byte buffer element (x, y, z, w word order).
// When (dst + dstOffset) is not 16-byte aligned the words are written byte by
// byte (lowest byte first in memory); otherwise they are stored as one uint4.
__kernel void CopyImage3dToBuffer16Bytes(__read_only image3d_t input,
                                         __global uchar *dst,
                                         int4 srcOffset,
                                         int dstOffset,
                                         uint2 Pitch) {
    const int x = get_global_id(0);
    const int y = get_global_id(1);
    const int z = get_global_id(2);

    const int4 texelCoord = (int4)(x, y, z, 0) + srcOffset;
    const uint rowBase = dstOffset + (y * Pitch.x) + (z * Pitch.y);
    __global uchar *out = dst + rowBase + x * 16;

    const uint4 color = read_imageui(input, texelCoord);

    const bool misaligned = ((( ulong )(dst + dstOffset) & 0x0000000f) != 0);
    if (misaligned) {
        out[0] = convert_uchar_sat(color.x & 0xff);
        out[1] = convert_uchar_sat((color.x >> 8 ) & 0xff);
        out[2] = convert_uchar_sat((color.x >> 16 ) & 0xff);
        out[3] = convert_uchar_sat((color.x >> 24 ) & 0xff);
        out[4] = convert_uchar_sat(color.y & 0xff);
        out[5] = convert_uchar_sat((color.y >> 8 ) & 0xff);
        out[6] = convert_uchar_sat((color.y >> 16 ) & 0xff);
        out[7] = convert_uchar_sat((color.y >> 24 ) & 0xff);
        out[8] = convert_uchar_sat(color.z & 0xff);
        out[9] = convert_uchar_sat((color.z >> 8 ) & 0xff);
        out[10] = convert_uchar_sat((color.z >> 16 ) & 0xff);
        out[11] = convert_uchar_sat((color.z >> 24 ) & 0xff);
        out[12] = convert_uchar_sat(color.w & 0xff);
        out[13] = convert_uchar_sat((color.w >> 8 ) & 0xff);
        out[14] = convert_uchar_sat((color.w >> 16 ) & 0xff);
        out[15] = convert_uchar_sat((color.w >> 24 ) & 0xff);
    } else {
        *(__global uint4*)out = color;
    }
}
)==="

View File

@@ -0,0 +1,36 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
R"===(
// Copies one texel per work-item between two 1D images; only the x component
// of each offset is applied.
__kernel void CopyImageToImage1d(
    __read_only image1d_t input,
    __write_only image1d_t output,
    int4 srcOffset,
    int4 dstOffset) {
    const int texel = get_global_id(0);

    const int readAt = texel + srcOffset.x;
    const int writeAt = texel + dstOffset.x;

    write_imageui(output, writeAt, read_imageui(input, readAt));
}
)==="

View File

@@ -0,0 +1,37 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
R"===(
// Copies one texel per work-item between two 2D images; only the x and y
// components of each offset are applied.
__kernel void CopyImageToImage2d(
    __read_only image2d_t input,
    __write_only image2d_t output,
    int4 srcOffset,
    int4 dstOffset) {
    const int2 texel = (int2)(get_global_id(0), get_global_id(1));

    const int2 readAt = texel + (int2)(srcOffset.x, srcOffset.y);
    const int2 writeAt = texel + (int2)(dstOffset.x, dstOffset.y);

    write_imageui(output, writeAt, read_imageui(input, readAt));
}
)==="

View File

@@ -0,0 +1,40 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
R"===(
#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
// Copies one texel per work-item between two 3D images; the full int4 offsets
// are added to the (x, y, z, 0) work-item coordinate.
__kernel void CopyImageToImage3d(
    __read_only image3d_t input,
    __write_only image3d_t output,
    int4 srcOffset,
    int4 dstOffset) {
    const int4 texel = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);

    const int4 readAt = texel + srcOffset;
    const int4 writeAt = texel + dstOffset;

    write_imageui(output, writeAt, read_imageui(input, readAt));
}
)==="

View File

@@ -0,0 +1,64 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
R"===(
// assumption is local work size = pattern size
// Writes one pattern byte per work-item: the byte is selected by the local id
// and stored at (global id + dstOffsetInBytes).
__kernel void FillBufferBytes(
    __global uchar* pDst,
    uint dstOffsetInBytes,
    const __global uchar* pPattern )
{
    const uint patternIndex = get_local_id(0);
    const uint targetIndex = get_global_id(0) + dstOffsetInBytes;
    pDst[targetIndex] = pPattern[patternIndex];
}
// Writes one pattern byte per work-item at (global id + dstOffsetInBytes).
// The pattern index is the global id masked with (patternSizeInEls - 1).
__kernel void FillBufferLeftLeftover(
    __global uchar* pDst,
    uint dstOffsetInBytes,
    const __global uchar* pPattern,
    const uint patternSizeInEls )
{
    const uint workItem = get_global_id(0);
    const uint patternMask = patternSizeInEls - 1;
    const uint targetIndex = workItem + dstOffsetInBytes;
    pDst[ targetIndex ] = pPattern[ workItem & patternMask ];
}
// Writes one uint pattern element per work-item into pDst.
// dstOffsetInBytes is applied to the byte pointer first; the resulting
// address is then indexed in uint elements by the global id. The pattern
// index is the global id masked with (patternSizeInEls - 1).
// NOTE(review): the mask only behaves like a modulo when patternSizeInEls is
// a power of two - confirm against the caller.
__kernel void FillBufferMiddle(
    __global uchar* pDst,
    uint dstOffsetInBytes,
    const __global uint* pPattern,
    const uint patternSizeInEls )
{
    const uint workItem = get_global_id(0);
    __global uint* pDstWords = (__global uint*)(pDst + dstOffsetInBytes);
    const uint patternPos = workItem & (patternSizeInEls - 1);
    pDstWords[workItem] = pPattern[ patternPos ];
}
// Writes one pattern byte per work-item into pDst, starting at
// dstOffsetInBytes. The pattern index is the global id masked with
// (patternSizeInEls - 1).
// NOTE(review): the mask only behaves like a modulo when patternSizeInEls is
// a power of two - confirm against the caller.
__kernel void FillBufferRightLeftover(
    __global uchar* pDst,
    uint dstOffsetInBytes,
    const __global uchar* pPattern,
    const uint patternSizeInEls )
{
    const uint workItem = get_global_id(0);
    const uint patternMask = patternSizeInEls - 1;
    const uint patternPos = workItem & patternMask;
    pDst[ dstOffsetInBytes + workItem ] = pPattern[ patternPos ];
}
)==="

View File

@@ -0,0 +1,33 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
R"===(
// Writes `color` to one texel of a 1D image per work-item.
// The texel coordinate is the work-item's global id in dimension 0 plus
// dstOffset.x; the other components of dstOffset are not used.
__kernel void FillImage1d(
    __write_only image1d_t output,
    uint4 color,
    int4 dstOffset) {
    const int texel = (int)get_global_id(0) + dstOffset.x;
    write_imageui(output, texel, color);
}
)==="

View File

@@ -0,0 +1,34 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
R"===(
// Writes `color` to one texel of a 2D image per work-item.
// The texel coordinate is the work-item's (x, y) global id plus the x and y
// components of dstOffset; the remaining components of dstOffset are not used.
__kernel void FillImage2d(
    __write_only image2d_t output,
    uint4 color,
    int4 dstOffset) {
    const int2 gid = (int2)((int)get_global_id(0), (int)get_global_id(1));
    write_imageui(output, gid + dstOffset.xy, color);
}
)==="

View File

@@ -0,0 +1,37 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
R"===(
#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
// Writes `color` to one texel of a 3D image per work-item.
// The texel coordinate is the work-item's (x, y, z) global id plus dstOffset.
__kernel void FillImage3d(
    __write_only image3d_t output,
    uint4 color,
    int4 dstOffset) {
    // Global id packed as an int4; the 4th component is 0 before the offset
    // is added.
    const int4 gid = (int4)((int)get_global_id(0),
                            (int)get_global_id(1),
                            (int)get_global_id(2),
                            0);
    write_imageui(output, gid + dstOffset, color);
}
)==="

View File

@@ -0,0 +1,32 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
R"===(
// Signature of the ve_dn_di_enhance_intel kernel.
// The body is empty in this source: the kernel declares two read-only input
// images and three write-only output images but performs no work here.
// NOTE(review): presumably a placeholder whose real implementation is
// provided outside this file - confirm.
__kernel void ve_dn_di_enhance_intel(
    sampler_t accelerator,
    int flags,
    __read_only image2d_t current_input,
    __read_only image2d_t ref_input,
    __write_only image2d_t current_output,
    __write_only image2d_t ref_output,
    __write_only image2d_t dndi_output)
{
}
)==="

View File

@@ -0,0 +1,30 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
R"===(
// Signature of the ve_dn_enhance_intel kernel.
// The body is empty in this source: the kernel declares two read-only input
// images and one write-only output image but performs no work here.
// NOTE(review): presumably a placeholder whose real implementation is
// provided outside this file - confirm.
__kernel void ve_dn_enhance_intel(
    sampler_t accelerator,
    int flags,
    __read_only image2d_t ref_input,
    __read_only image2d_t current_input,
    __write_only image2d_t current_output)
{
}
)==="

View File

@@ -0,0 +1,29 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
R"===(
// Signature of the ve_enhance_intel kernel.
// The body is empty in this source: the kernel declares one read-only input
// image and one write-only output image but performs no work here.
// NOTE(review): presumably a placeholder whose real implementation is
// provided outside this file - confirm.
__kernel void ve_enhance_intel(
    sampler_t accelerator,
    int flags,
    __read_only image2d_t current_input,
    __write_only image2d_t current_output)
{
}
)==="

View File

@@ -0,0 +1,458 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
R"===(
// block_advanced_motion_estimate_bidirectional_check_intel
//
// Requires a work-group size of 16x1x1. Each work-group walks up to `stride`
// consecutive macro-block (MB) slots starting at stride * get_group_id(0);
// slot `sid` maps to gid_0 = sid / height and gid_1 = sid % height. For every
// MB the kernel optionally:
//   1. runs a motion search over the predictor MVs (when count.x != 0),
//   2. runs skip checks (when count.y != 0) and/or intra estimation
//      (when flags & 0x2 is set),
//   3. copies the results from local memory into the output buffers.
//
// Parameters, as used by the code below:
//   accelerator            - handed to the intel_work_group_vme_* built-ins
//                            and queried for the MB block type.
//   srcImg, refImg         - source / reference images for the motion search.
//   src_check_image, ref0_check_image, ref1_check_image
//                          - source and fwd/bwd reference images for the
//                            skip checks.
//   flags                  - bit 0x2 enables intra estimation; bit 0x4
//                            selects the 8x8 skip-check path.
//   search_cost_penalty    - 1, 2 or 3 selects a MV cost table; any other
//                            value selects the all-zero table.
//   search_cost_precision  - shifted left by 16 and passed as the MV cost
//                            precision.
//   count_global           - per-frame counts (s0: predictor MVs, s1: skip
//                            MVs); a component >= 0 overrides the per-MB
//                            count, -1 in either component makes the kernel
//                            read count_motion_vector_buffer.
//   bidir_weight           - forwarded to the bidirectional check built-ins.
//   count_motion_vector_buffer      - per-MB counts, indexed by curMB.
//   prediction_motion_vector_buffer - 4 predictor MVs per MB.
//   skip_input_mode_buffer          - one skip mode byte per MB.
//   skip_motion_vector_buffer       - skip-check MVs, read as ints.
//   search_motion_vector_buffer     - output: best search MVs.
//   intra_search_predictor_modes    - output: 22 bytes per MB of intra modes.
//   search_residuals       - output: search distortions; NULL-checked unless
//                            HW_NULL_CHECK is defined.
//   skip_residuals         - output: skip-check distortions (no NULL check).
//   intra_residuals        - output: intra distortions; NULL-checked unless
//                            HW_NULL_CHECK is defined.
//   intraSrcImg            - source image for intra estimation.
//   height, width          - used to map slots to MBs and to compute output
//                            indices (presumably frame size in MBs - TODO
//                            confirm).
//   stride                 - number of MB slots handled per work-group.
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
block_advanced_motion_estimate_bidirectional_check_intel(
sampler_t accelerator, __read_only image2d_t srcImg,
__read_only image2d_t refImg, __read_only image2d_t src_check_image,
__read_only image2d_t ref0_check_image,
__read_only image2d_t ref1_check_image, uint flags,
uint search_cost_penalty, uint search_cost_precision, short2 count_global,
uchar bidir_weight, __global short2 *count_motion_vector_buffer,
__global short2 *prediction_motion_vector_buffer,
__global char *skip_input_mode_buffer,
__global short2 *skip_motion_vector_buffer,
__global short2 *search_motion_vector_buffer,
__global char *intra_search_predictor_modes,
__global ushort *search_residuals, __global ushort *skip_residuals,
__global ushort *intra_residuals, __read_only image2d_t intraSrcImg,
int height, int width, int stride) {
__local uint dstSearch[64]; // 8 GRFs
__local uint dstSkipIntra[32 + 24]; // 7 GRFs (4 for inter, 3 for intra)
// distortion in the 6th GRF
__local ushort *distSearch = (__local ushort *)&dstSearch[8 * 5];
// Initialize the MV cost table:
// MV Cost in U4U4 format:
// No cost : 0, 0, 0, 0, 0, 0, 0, 0
// Low Cost : 1, 4, 5, 9, 10, 12, 14, 15
// Normal Cost: 5, 26, 29, 43, 45, 47, 57, 57
// High Cost : 29, 61, 72, 78, 88, 89, 91, 92
uint2 MVCostTable;
if (search_cost_penalty == 1) {
MVCostTable.s0 = 0x09050401;
MVCostTable.s1 = 0x0F0E0C0A;
} else if (search_cost_penalty == 2) {
MVCostTable.s0 = 0x2B1D1A05;
MVCostTable.s1 = 0x39392F2D;
} else if (search_cost_penalty == 3) {
MVCostTable.s0 = 0x4E483D1D;
MVCostTable.s1 = 0x5C5B5958;
} else {
MVCostTable.s0 = 0;
MVCostTable.s1 = 0;
}
uint MVCostPrecision = ((uint)search_cost_precision) << 16;
// Frame is divided into rows * columns of MBs.
// One h/w thread per WG.
// One WG processes "row" MBs - one row per iteration and one MB per row.
// Number of WGs (or h/w threads) is number of columns MBs.Each iteration
// processes the MB in a row - gid_0 is the MB id in a row and gid_1 is the
// row offset.
int sid_0 = stride * get_group_id(0);
int gid_0 = sid_0 / height;
int gid_1 = sid_0 % height;
// The loop condition depends only on group-level values, so every work-item
// of the group runs the same number of iterations (and reaches the barrier
// below the same number of times).
for (int sid = sid_0; sid < sid_0 + stride && gid_0 < width && gid_1 < height;
sid++, gid_0 = sid / height, gid_1 = sid % height) {
int2 srcCoord;
srcCoord.x = gid_0 * 16 +
get_global_offset(0); // 16 pixels wide MBs (globally scalar)
srcCoord.y = gid_1 * 16 +
get_global_offset(1); // 16 pixels tall MBs (globally scalar)
uint curMB = gid_0 + gid_1 * width; // current MB id
short2 count;
// NOTE(review): if one component of count_global is below -1 while the other
// is not -1, neither branch below assigns the matching component of `count`
// and it is read uninitialized - confirm callers only pass -1 or >= 0.
// If either the search or skip vector counts are per-MB, then we need to
// read in
// the count motion vector buffer.
if ((count_global.s0 == -1) | (count_global.s1 == -1)) {
count = count_motion_vector_buffer[curMB];
}
// If either the search or skip vector counts are per-frame, we need to use
// those.
if (count_global.s0 >= 0) {
count.s0 = count_global.s0;
}
if (count_global.s1 >= 0) {
count.s1 = count_global.s1;
}
int countPredMVs = count.x;
if (countPredMVs != 0) {
uint offset = curMB * 4; // 4 predictors per MB
offset += get_local_id(0) % 4; // 16 work-items access 4 MVs for MB
// one predictor for MB per SIMD channel
// Reduce predictors from Q-pixel to integer precision.
int2 predMV = 0;
if (get_local_id(0) < countPredMVs) {
// one MV per work-item
predMV = convert_int2(prediction_motion_vector_buffer[offset]);
// Predictors are input in QP resolution. Convert that to integer
// resolution.
predMV.x /= 4;
predMV.y /= 4;
// Clear bit 0 so the vertical component is even.
predMV.y &= 0xFFFFFFFE;
}
// Do up to 4 IMEs, get the best MVs and their distortions, and optionally
// a FBR of
// the best MVs. Finally the results are written out to SLM.
intel_work_group_vme_mb_multi_query_4(
dstSearch, // best search MV and its distortions into SLM
countPredMVs, // count of predictor MVs (globally scalar - value range
// 1 to 4)
MVCostPrecision, // MV cost precision
MVCostTable, // MV cost table
srcCoord, // MB 2-D offset (globally scalar)
predMV, // predictor MVs (up to 4 distinct MVs for SIMD16 thread)
srcImg, // source
refImg, // reference
accelerator); // vme object
}
int doIntra = ((flags & 0x2) != 0);
int intraEdges = 0;
if (doIntra) {
// Enable all edges by default.
intraEdges = 0x3C;
// If this is a left-edge MB, then disable left edges.
if ((gid_0 == 0) & (get_global_offset(0) == 0)) {
intraEdges &= 0x18;
}
// If this is a right edge MB then disable right edges.
if (gid_0 == width - 1) {
intraEdges &= 0x34;
}
// If this is a top-edge MB, then disable top edges.
if ((gid_1 == 0) & (get_global_offset(1) == 0)) {
intraEdges &= 0x20;
}
// Set bit6=bit5.
intraEdges |= ((intraEdges & 0x20) << 1);
intraEdges <<= 8;
}
int skip_block_type_8x8 = flags & 0x4;
int countSkipMVs = count.y;
if (countSkipMVs != 0 || doIntra == true) {
// one set of skip MV per SIMD channel
// Do up to 4 skip checks and get the distortions for each of them.
// Finally the results are written out to SLM.
// The 16x16 path is also taken when only intra estimation is requested
// (doIntra set and no skip MVs), regardless of the 8x8 flag.
if ((skip_block_type_8x8 == 0) | ((doIntra) & (countSkipMVs == 0))) {
// 16x16:
uint offset = curMB * 4 * 2; // 4 sets of skip check MVs per MB
int skipMV = 0;
if (get_local_id(0) < countSkipMVs * 2) // need 2 values per MV
{
offset +=
(get_local_id(0)); // 16 work-items access 4 sets of MVs for MB
__global int *skip1_motion_vector_buffer =
(__global int *)skip_motion_vector_buffer;
skipMV = skip1_motion_vector_buffer[offset]; // one MV per work-item
}
uchar skipMode = 0;
if (get_local_id(0) < countSkipMVs) {
skipMode = skip_input_mode_buffer[curMB];
// Clamp the mode read from the buffer into the range 1..3.
if (skipMode == 0) {
skipMode = 1;
}
if (skipMode > 3) {
skipMode = 3;
}
}
intel_work_group_vme_mb_multi_bidir_check_16x16(
dstSkipIntra, // distortions into SLM
countSkipMVs, // count of skip check MVs (globally scalar - value
// range 1 to 4)
doIntra, // compute intra modes
intraEdges, // intra edges to use
srcCoord, // MB 2-D offset (globally scalar)
bidir_weight, // bidirectional weight
skipMode, // skip modes
skipMV, // skip check MVs (up to 4 distinct sets of skip check MVs
// for SIMD16 thread)
src_check_image, // source
ref0_check_image, // reference fwd
ref1_check_image, // reference bwd
intraSrcImg, // intra source
accelerator); // vme object
} else {
// 8x8:
uint offset =
curMB * 4 *
8; // 4 sets of skip check MVs, 16 shorts (8 ints) each per MB
int2 skipMVs = 0;
if (get_local_id(0) < countSkipMVs * 8) // need 8 values per MV
{
offset +=
(get_local_id(0)); // 16 work-items access 4 sets of MVs for MB
__global int *skip1_motion_vector_buffer =
(__global int *)(skip_motion_vector_buffer);
skipMVs.x = skip1_motion_vector_buffer[offset]; // four component MVs
// per work-item
skipMVs.y = skip1_motion_vector_buffer[offset + 16];
}
uchar skipModes = 0;
if (get_local_id(0) < countSkipMVs) {
skipModes = skip_input_mode_buffer[curMB];
}
intel_work_group_vme_mb_multi_bidir_check_8x8(
dstSkipIntra, // distortions into SLM
countSkipMVs, // count of skip check MVs per MB (globally scalar -
// value range 1 to 4)
doIntra, // compute intra modes
intraEdges, // intra edges to use
srcCoord, // MB 2-D offset (globally scalar)
bidir_weight, // bidirectional weight
skipModes, // skip modes
skipMVs, // skip check MVs (up to 4 distinct sets of skip check MVs
// for SIMD16 thread)
src_check_image, // source
ref0_check_image, // reference fwd
ref1_check_image, // reference bwd
intraSrcImg, // intra source
accelerator); // vme object
}
}
// Synchronize the work-group on local memory before dstSearch and
// dstSkipIntra are read back below.
barrier(CLK_LOCAL_MEM_FENCE);
// Write Out motion estimation result:
// Result format
// Hierarchical row-major layout
// i.e. row-major of blocks MVs in MBs, and row-major of 4 sets of
// MVs/distortion in blocks
if (countPredMVs != 0) {
// 4x4
if (intel_get_accelerator_mb_block_type(accelerator) == 0x2) {
int index = (gid_0 * 16 + get_local_id(0)) + (gid_1 * 16 * width);
// 1. 16 work-items enabled.
// 2. Work-items gather fwd MVs in strided dword locations 0, 2, .., 30
// (interleaved
// fwd/bwd MVs) with constant offset 8 (control data size) from SLM
// into contiguous
// short2 locations 0, 1, .., 15 of global buffer
// search_motion_vector_buffer with
// offset index.
// 3. Work-items gather contiguous ushort locations 0, 1, .., 15 from
// distSearch into
// contiguous ushort locations 0, 1, .., 15 of search_residuals with
// offset index.
short2 val = as_short2(dstSearch[8 + get_local_id(0) * 2]);
search_motion_vector_buffer[index] = val;
#ifndef HW_NULL_CHECK
if (search_residuals != NULL)
#endif
{
search_residuals[index] = distSearch[get_local_id(0)];
}
}
// 8x8
else if (intel_get_accelerator_mb_block_type(accelerator) == 0x1) {
// Only 1st 4 work-item are needed.
if (get_local_id(0) < 4) {
int index = (gid_0 * 4 + get_local_id(0)) + (gid_1 * 4 * width);
// 1. 4 work-items enabled.
// 2. Work-items gather fw MVs in strided dword locations 0, 8, 16, 24
// (interleaved
// fwd/bwd MVs) with constant offset 8 from SLM into contiguous
// short2 locations
// 0, 1, .., 15 of global buffer search_motion_vector_buffer with
// offset index.
// 3. Work-items gather strided ushort locations 0, 4, 8, 12 from
// distSearch into
// contiguous ushort locations 0, 1, .., 15 of search_residuals
// with offset index.
short2 val = as_short2(dstSearch[8 + get_local_id(0) * 4 * 2]);
search_motion_vector_buffer[index] = val;
#ifndef HW_NULL_CHECK
if (search_residuals != NULL)
#endif
{
search_residuals[index] = distSearch[get_local_id(0) * 4];
}
}
}
// 16x16
else if (intel_get_accelerator_mb_block_type(accelerator) == 0x0) {
// Only the 1st work-item is needed.
if (get_local_id(0) == 0) {
int index = gid_0 + gid_1 * width;
// 1. 1 work-item enabled.
// 2. Work-item gathers fwd MV in dword location 0 with constant
// offset 8 from
// SLM into short2 locations 0 of global buffer
// search_motion_vector_buffer.
// 3. Work-item gathers ushort location 0 from distSearch into ushort
// location 0 of search_residuals with offset index.
short2 val = as_short2(dstSearch[8]);
search_motion_vector_buffer[index] = val;
#ifndef HW_NULL_CHECK
if (search_residuals != NULL)
#endif
{
search_residuals[index] = distSearch[0];
}
}
}
}
// Write out motion skip check result:
// Result format
// Hierarchical row-major layout
// i.e. row-major of blocks in MBs, and row-major of 8 sets of
// distortions in blocks
// Note: unlike search_residuals and intra_residuals, skip_residuals is
// written without a NULL check.
if (countSkipMVs != 0) {
if (skip_block_type_8x8 == false) {
// Copy out 4 (1 component) sets of distortion values.
int index = (gid_0 * 4) + (get_local_id(0)) + (gid_1 * 4 * width);
if (get_local_id(0) < countSkipMVs) {
// 1. Up to 4 work-items are enabled.
// 2. The work-item gathers distSkip locations 0, 16*1, .., 16*7 and
// copies them to contiguous skip_residual locations 0, 1, 2, ..,
// 7.
__local ushort *distSkip = (__local ushort *)&dstSkipIntra[0];
skip_residuals[index] = distSkip[get_local_id(0) * 16];
}
} else {
// Copy out 4 (4 component) sets of distortion values.
int index =
(gid_0 * 4 * 4) + (get_local_id(0)) + (gid_1 * 4 * 4 * width);
if (get_local_id(0) < countSkipMVs * 4) {
// 1. Up to 16 work-items are enabled.
// 2. The work-item gathers distSkip locations 0, 4*1, .., 4*15 and
// copies them to contiguous skip_residual locations 0, 1, 2, ..,
// 15.
__local ushort *distSkip = (__local ushort *)&dstSkipIntra[0];
skip_residuals[index] = distSkip[get_local_id(0) * 4];
}
}
}
// Write out intra search result:
// Each MB owns 22 bytes of intra_search_predictor_modes: byte 0 holds the
// 16x16 mode, bytes 1..4 the 8x8 modes and bytes 5..20 the 4x4 modes.
if (doIntra) {
// Write out the 4x4 intra modes
if (get_local_id(0) < 8) {
__local char *dstIntra_4x4 =
(__local char *)(&dstSkipIntra[32 + 16 + 4]);
char value = dstIntra_4x4[get_local_id(0)];
// Each source byte packs two 4-bit modes; split it into low/high nibble.
char value_low = (value)&0xf;
char value_high = (value >> 4) & 0xf;
int index_low =
(gid_0 * 22) + (get_local_id(0) * 2) + (gid_1 * 22 * width);
int index_high =
(gid_0 * 22) + (get_local_id(0) * 2) + 1 + (gid_1 * 22 * width);
intra_search_predictor_modes[index_low + 5] = value_low;
intra_search_predictor_modes[index_high + 5] = value_high;
}
// Write out the 8x8 intra modes
if (get_local_id(0) < 4) {
__local char *dstIntra_8x8 =
(__local char *)(&dstSkipIntra[32 + 8 + 4]);
char value = dstIntra_8x8[get_local_id(0) * 2];
char value_low = (value)&0xf;
int index = (gid_0 * 22) + (get_local_id(0)) + (gid_1 * 22 * width);
intra_search_predictor_modes[index + 1] = value_low;
}
// Write out the 16x16 intra modes
if (get_local_id(0) < 1) {
__local char *dstIntra_16x16 =
(__local char *)(&dstSkipIntra[32 + 0 + 4]);
char value = dstIntra_16x16[0];
char value_low = (value)&0xf;
int index = (gid_0 * 22) + (gid_1 * 22 * width);
intra_search_predictor_modes[index] = value_low;
}
// Get the intra residuals.
#ifndef HW_NULL_CHECK
if (intra_residuals != NULL)
#endif
{
// 4 ushort slots per MB; slots 0, 1 and 2 receive the 16x16, 8x8 and 4x4
// distortions, slot 3 is not written here.
int index = (gid_0 * 4) + (gid_1 * 4 * width);
if (get_local_id(0) < 1) {
__local ushort *distIntra_4x4 =
(__local ushort *)(&dstSkipIntra[32 + 16 + 3]);
__local ushort *distIntra_8x8 =
(__local ushort *)(&dstSkipIntra[32 + 8 + 3]);
__local ushort *distIntra_16x16 =
(__local ushort *)(&dstSkipIntra[32 + 0 + 3]);
intra_residuals[index + 2] = distIntra_4x4[0];
intra_residuals[index + 1] = distIntra_8x8[0];
intra_residuals[index + 0] = distIntra_16x16[0];
}
}
}
}
}
)==="

View File

@@ -0,0 +1,41 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
R"===(
// Signature of block_advanced_motion_estimate_bidirectional_check_intel with
// a required work-group size of 16x1x1.
// The body is empty in this source, so the kernel performs no work here.
// NOTE(review): presumably a placeholder whose real implementation is
// provided outside this file - confirm.
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
block_advanced_motion_estimate_bidirectional_check_intel(
    sampler_t accelerator,
    __read_only image2d_t srcImg,
    __read_only image2d_t refImg,
    __read_only image2d_t src_check_image,
    __read_only image2d_t ref0_check_image,
    __read_only image2d_t ref1_check_image,
    uint flags,
    uint search_cost_penalty,
    uint search_cost_precision,
    short2 count_global,
    uchar bidir_weight,
    __global short2 *count_motion_vector_buffer,
    __global short2 *prediction_motion_vector_buffer,
    __global char *skip_input_mode_buffer,
    __global short2 *skip_motion_vector_buffer,
    __global short2 *search_motion_vector_buffer,
    __global char *intra_search_predictor_modes,
    __global ushort *search_residuals,
    __global ushort *skip_residuals,
    __global ushort *intra_residuals)
{
}
)==="

View File

@@ -0,0 +1,390 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
R"===(
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
block_advanced_motion_estimate_check_intel(
sampler_t accelerator, __read_only image2d_t srcImg,
__read_only image2d_t refImg, uint flags, uint skip_block_type,
uint search_cost_penalty, uint search_cost_precision,
__global short2 *count_motion_vector_buffer,
__global short2 *predictors_buffer,
__global short2 *skip_motion_vector_buffer,
__global short2 *motion_vector_buffer,
__global char *intra_search_predictor_modes, __global ushort *residuals,
__global ushort *skip_residuals, __global ushort *intra_residuals,
__read_only image2d_t intraSrcImg, int height, int width, int stride) {
__local uint dstSearch[64]; // 8 GRFs
__local uint dstSkipIntra[64 + 24]; // 11 GRFs (8 for inter, 3 for intra)
__local ushort *distSearch =
(__local ushort *)&dstSearch[8 * 5]; // distortion in the 6th GRF
// Initialize the MV cost table:
// MV Cost in U4U4 format:
// No cost : 0, 0, 0, 0, 0, 0, 0, 0
// Low Cost : 1, 4, 5, 9, 10, 12, 14, 15
// Normal Cost: 5, 26, 29, 43, 45, 47, 57, 57
// High Cost : 29, 61, 72, 78, 88, 89, 91, 92
uint2 MVCostTable;
if (search_cost_penalty == 1) {
MVCostTable.s0 = 0x09050401;
MVCostTable.s1 = 0x0F0E0C0A;
} else if (search_cost_penalty == 2) {
MVCostTable.s0 = 0x2B1D1A05;
MVCostTable.s1 = 0x39392F2D;
} else if (search_cost_penalty == 3) {
MVCostTable.s0 = 0x4E483D1D;
MVCostTable.s1 = 0x5C5B5958;
} else {
MVCostTable.s0 = 0;
MVCostTable.s1 = 0;
}
uint MVCostPrecision = ((uint)search_cost_precision) << 16;
// Frame is divided into rows * columns of MBs.
// One h/w thread per WG.
// One WG processes 'row' MBs - one row per iteration and one MB per row.
// Number of WGs (or h/w threads) is number of columns MBs
// Each iteration processes the MB in a row - gid_0 is the MB id in a row and
// gid_1 is the row offset.
int sid_0 = stride * get_group_id(0);
int gid_0 = sid_0 / height;
int gid_1 = sid_0 % height;
for (int sid = sid_0; sid < sid_0 + stride && gid_0 < width && gid_1 < height;
sid++, gid_0 = sid / height, gid_1 = sid % height) {
int2 srcCoord;
srcCoord.x = gid_0 * 16 +
get_global_offset(0); // 16 pixels wide MBs (globally scalar)
srcCoord.y = gid_1 * 16 +
get_global_offset(1); // 16 pixels tall MBs (globally scalar)
uint curMB = gid_0 + gid_1 * width; // current MB id
short2 count = count_motion_vector_buffer[curMB];
int countPredMVs = count.x;
if (countPredMVs != 0) {
uint offset = curMB * 8; // 8 predictors per MB
offset += get_local_id(0) % 8; // 16 work-items access 8 MVs for MB
// one predictor for MB per SIMD channel
// Reduce predictors from Q-pixel to integer precision.
int2 predMV = 0;
if (get_local_id(0) < countPredMVs) {
predMV =
convert_int2(predictors_buffer[offset]); // one MV per work-item
predMV.x /= 4;
predMV.y /= 4;
predMV.y &= 0xFFFE;
}
// Do up to 8 IMEs, get the best MVs and their distortions, and optionally
// a FBR of the best MVs.
// Finally the results are written out to SLM.
intel_work_group_vme_mb_multi_query_8(
dstSearch, // best search MV and its distortions into SLM
countPredMVs, // count of predictor MVs (globally scalar - value range
// 1 to 8)
MVCostPrecision, // MV cost precision
MVCostTable, // MV cost table
srcCoord, // MB 2-D offset (globally scalar)
predMV, // predictor MVs (up to 8 distinct MVs for SIMD16 thread)
srcImg, // source
refImg, // reference
accelerator); // vme object
}
int doIntra = (flags & 0x2) != 0;
int intraEdges = 0;
if (doIntra) {
// Enable all edges by default.
intraEdges = 0x3C;
// If this is a left-edge MB, then disable left edges.
if ((gid_0 == 0) & (get_global_offset(0) == 0)) {
intraEdges &= 0x18;
}
// If this is a right edge MB then disable right edges.
if (gid_0 == width - 1) {
intraEdges &= 0x34;
}
// If this is a top-edge MB, then disable top edges.
if ((gid_1 == 0) & (get_global_offset(1) == 0)) {
intraEdges &= 0x20;
}
// Set bit6=bit5.
intraEdges |= ((intraEdges & 0x20) << 1);
intraEdges <<= 8;
}
int countSkipMVs = count.y;
if (countSkipMVs != 0 || doIntra == true) {
uint offset = curMB * 8; // 8 sets of skip check MVs per MB
offset +=
(get_local_id(0) % 8); // 16 work-items access 8 sets of MVs for MB
// one set of skip MV per SIMD channel
// Do up to 8 skip checks and get the distortions for each of them.
// Finally the results are written out to SLM.
if ((skip_block_type == 0x0) | ((doIntra) & (countSkipMVs == 0))) {
int skipMVs = 0;
if (get_local_id(0) < countSkipMVs) {
__global int *skip1_motion_vector_buffer =
(__global int *)skip_motion_vector_buffer;
skipMVs = skip1_motion_vector_buffer[offset]; // one packed MV for one
// work-item
}
intel_work_group_vme_mb_multi_check_16x16(
dstSkipIntra, // distortions into SLM
countSkipMVs, // count of skip check MVs (value range 0 to 8)
doIntra, // compute intra modes
intraEdges, // intra edges to use
srcCoord, // MB 2-D offset (globally scalar)
skipMVs, // skip check MVs (up to 8 sets of skip check MVs for
// SIMD16 thread)
srcImg, // source
refImg, // reference
intraSrcImg, // intra source
accelerator);
}
if ((skip_block_type == 0x1) & (countSkipMVs > 0)) {
int4 skipMVs = 0;
if (get_local_id(0) < countSkipMVs) {
__global int4 *skip4_motion_vector_buffer =
(__global int4 *)(skip_motion_vector_buffer);
skipMVs = skip4_motion_vector_buffer[offset]; // four component MVs
// per work-item
}
intel_work_group_vme_mb_multi_check_8x8(
dstSkipIntra, // distortions into SLM
countSkipMVs, // count of skip check MVs per MB (value range 0 to 8)
doIntra, // compute intra modes
intraEdges, // intra edges to use
srcCoord, // MB 2-D offset (globally scalar)
skipMVs, // skip check MVs (up to 8 ets of skip check MVs for SIMD16
// thread)
srcImg, // source
refImg, // reference
intraSrcImg, // intra source
accelerator);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Write Out motion estimation result:
// Result format
// Hierarchical row-major layout
// i.e. row-major of blocks MVs in MBs, and row-major of 8 sets of
// MVs/distortion in blocks
if (countPredMVs != 0) {
// 4x4
if (intel_get_accelerator_mb_block_type(accelerator) == 0x2) {
int index = (gid_0 * 16 + get_local_id(0)) + (gid_1 * 16 * width);
// 1. 16 work-items enabled.
// 2. Work-items gather fwd MVs in strided dword locations 0, 2, .., 30
// (interleaved
// fwd/bdw MVs) with constant offset 8 (control data size) from SLM
// into contiguous
// short2 locations 0, 1, .., 15 of global buffer
// search_motion_vector_buffer with
// offset index.
// 3. Work-items gather contiguous ushort locations 0, 1, .., 15 from
// distSearch into
// contiguous ushort locations 0, 1, .., 15 of search_residuals with
// offset index.
short2 val = as_short2(dstSearch[8 + get_local_id(0) * 2]);
motion_vector_buffer[index] = val;
#ifndef HW_NULL_CHECK
if (residuals != NULL)
#endif
{
residuals[index] = distSearch[get_local_id(0)];
}
}
// 8x8
else if (intel_get_accelerator_mb_block_type(accelerator) == 0x1) {
// Only 1st 4 work-item are needed.
if (get_local_id(0) < 4) {
int index = (gid_0 * 4 + get_local_id(0)) + (gid_1 * 4 * width);
// 1. 4 work-items enabled.
// 2. Work-items gather fw MVs in strided dword locations 0, 8, 16, 24
// (interleaved
// fwd/bdw MVs) with constant offset 8 from SLM into contiguous
// short2 locations
// 0, 1, .., 15 of global buffer search_motion_vector_buffer with
// offset index.
// 3. Work-items gather strided ushort locations 0, 4, 8, 12 from
// distSearch into
// contiguous ushort locations 0, 1, .., 15 of search_residuals
// with offset index.
short2 val = as_short2(dstSearch[8 + get_local_id(0) * 4 * 2]);
motion_vector_buffer[index] = val;
#ifndef HW_NULL_CHECK
if (residuals != NULL)
#endif
{
residuals[index] = distSearch[get_local_id(0) * 4];
}
}
}
// 16x16
else if (intel_get_accelerator_mb_block_type(accelerator) == 0x0) {
// One 1st work is needed.
if (get_local_id(0) == 0) {
int index = gid_0 + gid_1 * width;
// 1. 1 work-item enabled.
// 2. Work-item gathers fwd MV in dword location 0 with constant
// offset 8 from
// SLM into short2 locations 0 of global buffer
// search_motion_vector_buffer.
// 3. Work-item gathers ushort location 0 from distSearch into ushort
// location 0 of search_residuals with offset index.
short2 val = as_short2(dstSearch[8]);
motion_vector_buffer[index] = val;
#ifndef HW_NULL_CHECK
if (residuals != NULL)
#endif
{
residuals[index] = distSearch[0];
}
}
}
}
// Write out motion skip check result:
// Result format
// Hierarchical row-major layout
// i.e. row-major of blocks in MBs, and row-major of 8 sets of
// distortions in blocks
if (countSkipMVs != 0) {
if (skip_block_type == 0x0) {
// Copy out 8 (1 component) sets of distortion values.
int index = (gid_0 * 8) + (get_local_id(0)) + (gid_1 * 8 * width);
if (get_local_id(0) < countSkipMVs) {
__local ushort *distSkip = (__local ushort *)&dstSkipIntra[0];
// 1. Up to 8 work-items are enabled.
// 2. The work-item gathers distSkip locations 0, 16*1, .., 16*7 and
// copies them to contiguous skip_residual locations 0, 1, 2, ..,
// 7.
skip_residuals[index] = distSkip[get_local_id(0) * 16];
}
} else {
// Copy out 8 (4 component) sets of distortion values.
int index =
(gid_0 * 8 * 4) + (get_local_id(0)) + (gid_1 * 8 * 4 * width);
__local ushort *distSkip = (__local ushort *)&dstSkipIntra[0];
if (get_local_id(0) < countSkipMVs * 4) {
// 1. Up to 16 work-items are enabled.
// 2. The work-item gathers distSkip locations 0, 4*1, .., 4*31 and
// copies them to contiguous skip_residual locations 0, 1, 2, ..,
// 31.
skip_residuals[index] = distSkip[get_local_id(0) * 4];
skip_residuals[index + 16] = distSkip[(get_local_id(0) + 16) * 4];
}
}
}
// Write out intra search result:
if (doIntra) {
int index_low =
(gid_0 * 22) + (get_local_id(0) * 2) + (gid_1 * 22 * width);
int index_high =
(gid_0 * 22) + (get_local_id(0) * 2) + 1 + (gid_1 * 22 * width);
// Write out the 4x4 intra modes
if (get_local_id(0) < 8) {
__local char *dstIntra_4x4 =
(__local char *)(&dstSkipIntra[64 + 16 + 4]);
char value = dstIntra_4x4[get_local_id(0)];
char value_low = (value)&0xf;
char value_high = (value >> 4) & 0xf;
intra_search_predictor_modes[index_low + 5] = value_low;
intra_search_predictor_modes[index_high + 5] = value_high;
}
// Write out the 8x8 intra modes
if (get_local_id(0) < 4) {
__local char *dstIntra_8x8 =
(__local char *)(&dstSkipIntra[64 + 8 + 4]);
char value = dstIntra_8x8[get_local_id(0) * 2];
char value_low = (value)&0xf;
int index = (gid_0 * 22) + (get_local_id(0)) + (gid_1 * 22 * width);
intra_search_predictor_modes[index + 1] = value_low;
}
// Write out the 16x16 intra modes
if (get_local_id(0) < 1) {
__local char *dstIntra_16x16 =
(__local char *)(&dstSkipIntra[64 + 0 + 4]);
char value = dstIntra_16x16[get_local_id(0)];
char value_low = (value)&0xf;
intra_search_predictor_modes[index_low] = value_low;
}
// Get the intra residuals.
#ifndef HW_NULL_CHECK
if (intra_residuals != NULL)
#endif
{
int index = (gid_0 * 4) + (gid_1 * 4 * width);
if (get_local_id(0) < 1) {
__local ushort *distIntra_4x4 = (__local ushort *)(&dstSkipIntra[64 + 16 + 3]);
__local ushort *distIntra_8x8 = (__local ushort *)(&dstSkipIntra[64 + 8 + 3]);
__local ushort *distIntra_16x16 = (__local ushort *)(&dstSkipIntra[64 + 0 + 3]);
intra_residuals[index + 2] = distIntra_4x4[0];
intra_residuals[index + 1] = distIntra_8x8[0];
intra_residuals[index + 0] = distIntra_16x16[0];
}
}
}
}
}
)==="

View File

@@ -0,0 +1,36 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
R"===(
// Empty-bodied definition of block_advanced_motion_estimate_check_intel.
// It only declares the kernel name, the required 16x1x1 work-group size and
// the argument list; it performs no work when executed.
// NOTE(review): presumably this variant exists so the kernel signature can be
// compiled/queried without the VME implementation - confirm against the
// build that selects this file.
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
block_advanced_motion_estimate_check_intel(
sampler_t accelerator, __read_only image2d_t srcImg,
__read_only image2d_t refImg, uint flags, uint skip_block_type,
uint search_cost_penalty, uint search_cost_precision,
__global short2 *count_motion_vector_buffer,
__global short2 *predictors_buffer,
__global short2 *skip_motion_vector_buffer,
__global short2 *motion_vector_buffer,
__global char *intra_search_predictor_modes, __global ushort *residuals,
__global ushort *skip_residuals, __global ushort *intra_residuals) {
}
)==="

View File

@@ -0,0 +1,118 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
R"===(
// Block motion estimation kernel.
//
// Each work-group (16 work-items) handles `stride` consecutive macro-blocks
// (MBs). A linear MB id `sid` is split into gid_0 = sid / height and
// gid_1 = sid % height, so consecutive ids advance gid_1 first. For every MB
// the work-group runs intel_work_group_vme_mb_query, which fills the local
// array `dst`, and then copies motion vectors and (optionally) residuals to
// the global output buffers.
//
// Arguments:
//   accelerator                     - accelerator object; also queried for the MB block type
//   srcImg / refImg                 - source and reference images
//   prediction_motion_vector_buffer - optional per-MB predicted MV, indexed gid_0 + gid_1 * width
//   motion_vector_buffer            - output motion vectors
//   residuals                       - optional output residuals (same indexing as the MVs)
//   height / width                  - bounds for gid_1 / gid_0 (MB grid size)
//   stride                          - number of consecutive MB ids handled per work-group
//
// When HW_NULL_CHECK is not defined, the optional buffers are tested against
// NULL in the kernel; when it is defined, the guarded blocks run unconditionally.
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
block_motion_estimate_intel(sampler_t accelerator, __read_only image2d_t srcImg,
__read_only image2d_t refImg,
__global short2 *prediction_motion_vector_buffer,
__global short2 *motion_vector_buffer,
__global ushort *residuals, int height, int width,
int stride) {
// Local result area written by intel_work_group_vme_mb_query.
__local uint dst[64];
// ushort view of dst starting at dword 40; read below as the residual values.
__local ushort *dist = (__local ushort *)&dst[8 * 5];
// First linear MB id owned by this work-group.
int sid_0 = stride * get_group_id(0);
int gid_0 = sid_0 / height;
int gid_1 = sid_0 % height;
// Walk this work-group's MB ids; stop early once the id leaves the MB grid.
for (int sid = sid_0; sid < sid_0 + stride && gid_0 < width && gid_1 < height;
sid++, gid_0 = sid / height, gid_1 = sid % height) {
int2 srcCoord = 0;
int2 refCoord = 0;
// MB position scaled by 16 (one MB per 16 units), shifted by the global offset.
srcCoord.x = gid_0 * 16 + get_global_offset(0);
srcCoord.y = gid_1 * 16 + get_global_offset(1);
short2 predMV = 0;
#ifndef HW_NULL_CHECK
if (prediction_motion_vector_buffer != NULL)
#endif
{
predMV = prediction_motion_vector_buffer[gid_0 + gid_1 * width];
// presumably the predicted MV is in quarter-pel units - TODO confirm
refCoord.x = predMV.x / 4;
refCoord.y = predMV.y / 4;
// Keeps only bits 1..15 of y (clears bit 0 and everything above bit 15).
// NOTE(review): for a negative y this also drops the sign bits - confirm
// this is what the VME built-in expects.
refCoord.y = refCoord.y & 0xFFFE;
}
{
intel_work_group_vme_mb_query(dst, srcCoord, refCoord, srcImg, refImg,
accelerator);
}
// Wait for the local results before any work-item reads dst/dist.
barrier(CLK_LOCAL_MEM_FENCE);
// Write Out Result
// 4x4
// All 16 work-items take part: work-item (x, y) in a 4x4 arrangement reads
// the MV at dst[8 + i * 2] and the residual at dist[i], i = y * 4 + x, and
// stores them row-major into an output grid that is width * 4 entries wide.
if (intel_get_accelerator_mb_block_type(accelerator) == 0x2) {
int x = get_local_id(0) % 4;
int y = get_local_id(0) / 4;
int index = (gid_0 * 4 + x) + (gid_1 * 4 + y) * width * 4;
short2 val = as_short2(dst[8 + (y * 4 + x) * 2]);
motion_vector_buffer[index] = val;
#ifndef HW_NULL_CHECK
if (residuals != NULL)
#endif
{
residuals[index] = dist[y * 4 + x];
}
}
// 8x8
// Only the first 4 work-items take part: work-item (x, y) in a 2x2
// arrangement reads the MV at dst[8 + i * 8] and the residual at dist[i * 4],
// i = y * 2 + x, into an output grid that is width * 2 entries wide.
if (intel_get_accelerator_mb_block_type(accelerator) == 0x1) {
if (get_local_id(0) < 4) {
int x = get_local_id(0) % 2;
int y = get_local_id(0) / 2;
int index = (gid_0 * 2 + x) + (gid_1 * 2 + y) * width * 2;
short2 val = as_short2(dst[8 + (y * 2 + x) * 8]);
motion_vector_buffer[index] = val;
#ifndef HW_NULL_CHECK
if (residuals != NULL)
#endif
{
residuals[index] = dist[(y * 2 + x) * 4];
}
}
}
// 16x16
// Only work-item 0 takes part: one MV (dst[8]) and one residual (dist[0])
// per MB, stored at gid_0 + gid_1 * width.
if (intel_get_accelerator_mb_block_type(accelerator) == 0x0) {
if (get_local_id(0) == 0) {
int index = gid_0 + gid_1 * width;
short2 val = as_short2(dst[8]);
motion_vector_buffer[index] = val;
#ifndef HW_NULL_CHECK
if (residuals != NULL)
#endif
{
residuals[index] = dist[0];
}
}
}
}
}
)==="

View File

@@ -0,0 +1,31 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
R"===(
// Empty-bodied definition of block_motion_estimate_intel.
// It only declares the kernel name, the required 16x1x1 work-group size and
// six arguments (accelerator, two images, predicted MVs, output MVs and
// residuals); it performs no work when executed.
// NOTE(review): presumably a placeholder exposing just the kernel signature -
// confirm against the build that selects this file.
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
block_motion_estimate_intel(sampler_t accelerator, __read_only image2d_t srcImg,
__read_only image2d_t refImg,
__global short2 *prediction_motion_vector_buffer,
__global short2 *motion_vector_buffer,
__global ushort *residuals) {
}
)==="

View File

@@ -0,0 +1,40 @@
# Copyright (c) 2017, Intel Corporation
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
# Files that make up the built-ins source registry. CMakeLists.txt is listed
# alongside the two translation units as part of the same source list.
set (NEO_REGISTERED_BUILTINS_SOURCES
register_copy_kernels_source.cpp
register_ext_vme_source.cpp
CMakeLists.txt
)

# Compile the registry as an OBJECT library (object files only, no archive);
# the target name is supplied by the including scope via
# BUILTINS_SOURCES_LIB_NAME.
add_library(${BUILTINS_SOURCES_LIB_NAME} OBJECT
${NEO_REGISTERED_BUILTINS_SOURCES}
)

# Build position-independent code for these objects.
set_target_properties(${BUILTINS_SOURCES_LIB_NAME}
PROPERTIES POSITION_INDEPENDENT_CODE ON
)

# Header search paths used only when compiling this target (PRIVATE).
target_include_directories(${BUILTINS_SOURCES_LIB_NAME} PRIVATE
${KHRONOS_HEADERS_DIR}
${UMKM_SHAREDDATA_INCLUDE_PATHS}
${IGDRCL__IGC_INCLUDE_DIR}
${THIRD_PARTY_DIR}
)

View File

@@ -0,0 +1,42 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include <string>
#include <unordered_map>
#include "runtime/built_ins/built_ins.h"
namespace OCLRT {
// Helper whose constructors publish a named resource in the process-wide
// EmbeddedStorageRegistry. Constructing an instance is the registration;
// the object itself holds no state.
struct RegisterEmbeddedResource {
    // Stores `resourceLength` bytes starting at `resource` under `name`.
    RegisterEmbeddedResource(const char *name, const char *resource, size_t resourceLength) {
        auto &registry = EmbeddedStorageRegistry::getInstance();
        registry.store(name, createBuiltinResource(resource, resourceLength));
    }

    // Convenience overload for string payloads. The length passed on is
    // size() + 1, i.e. the terminating '\0' is registered as part of the data.
    RegisterEmbeddedResource(const char *name, std::string &&resource)
        : RegisterEmbeddedResource(name, resource.c_str(), resource.length() + 1) {
    }
};
} // namespace OCLRT

View File

@@ -0,0 +1,127 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include <string>
#include "runtime/built_ins/registry/built_ins_registry.h"
namespace OCLRT {
// Source registrations for the copy/fill built-in kernels.
//
// Every definition below follows the same pattern: a file-local
// RegisterEmbeddedResource object is constructed with
//   1. a resource name built by createBuiltinResourceName() from the built-in
//      operation id and the file extension for BuiltinCode::ECodeType::Source,
//   2. a std::string whose initializer is pulled in with #include.
// The included *.igdrcl_built_in file is therefore expected to expand to a
// single string literal holding the kernel source.
// The objects are never referenced again; constructing them is what performs
// the registration.
static RegisterEmbeddedResource registerCopyBufferToBufferSrc(
createBuiltinResourceName(
EBuiltInOps::CopyBufferToBuffer,
BuiltinCode::getExtension(BuiltinCode::ECodeType::Source))
.c_str(),
std::string(
#include "runtime/built_ins/kernels/copy_buffer_to_buffer.igdrcl_built_in"
));
static RegisterEmbeddedResource registerCopyBufferRectSrc(
createBuiltinResourceName(
EBuiltInOps::CopyBufferRect,
BuiltinCode::getExtension(BuiltinCode::ECodeType::Source))
.c_str(),
std::string(
#include "runtime/built_ins/kernels/copy_buffer_rect.igdrcl_built_in"
));
static RegisterEmbeddedResource registerFillBufferSrc(
createBuiltinResourceName(
EBuiltInOps::FillBuffer,
BuiltinCode::getExtension(BuiltinCode::ECodeType::Source))
.c_str(),
std::string(
#include "runtime/built_ins/kernels/fill_buffer.igdrcl_built_in"
));
static RegisterEmbeddedResource registerCopyBufferToImage3dSrc(
createBuiltinResourceName(
EBuiltInOps::CopyBufferToImage3d,
BuiltinCode::getExtension(BuiltinCode::ECodeType::Source))
.c_str(),
std::string(
#include "runtime/built_ins/kernels/copy_buffer_to_image3d.igdrcl_built_in"
));
static RegisterEmbeddedResource registerCopyImage3dToBufferSrc(
createBuiltinResourceName(
EBuiltInOps::CopyImage3dToBuffer,
BuiltinCode::getExtension(BuiltinCode::ECodeType::Source))
.c_str(),
std::string(
#include "runtime/built_ins/kernels/copy_image3d_to_buffer.igdrcl_built_in"
));
static RegisterEmbeddedResource registerCopyImageToImage1dSrc(
createBuiltinResourceName(
EBuiltInOps::CopyImageToImage1d,
BuiltinCode::getExtension(BuiltinCode::ECodeType::Source))
.c_str(),
std::string(
#include "runtime/built_ins/kernels/copy_image_to_image1d.igdrcl_built_in"
));
static RegisterEmbeddedResource registerCopyImageToImage2dSrc(
createBuiltinResourceName(
EBuiltInOps::CopyImageToImage2d,
BuiltinCode::getExtension(BuiltinCode::ECodeType::Source))
.c_str(),
std::string(
#include "runtime/built_ins/kernels/copy_image_to_image2d.igdrcl_built_in"
));
static RegisterEmbeddedResource registerCopyImageToImage3dSrc(
createBuiltinResourceName(
EBuiltInOps::CopyImageToImage3d,
BuiltinCode::getExtension(BuiltinCode::ECodeType::Source))
.c_str(),
std::string(
#include "runtime/built_ins/kernels/copy_image_to_image3d.igdrcl_built_in"
));
static RegisterEmbeddedResource registerFillImage1dSrc(
createBuiltinResourceName(
EBuiltInOps::FillImage1d,
BuiltinCode::getExtension(BuiltinCode::ECodeType::Source))
.c_str(),
std::string(
#include "runtime/built_ins/kernels/fill_image1d.igdrcl_built_in"
));
static RegisterEmbeddedResource registerFillImage2dSrc(
createBuiltinResourceName(
EBuiltInOps::FillImage2d,
BuiltinCode::getExtension(BuiltinCode::ECodeType::Source))
.c_str(),
std::string(
#include "runtime/built_ins/kernels/fill_image2d.igdrcl_built_in"
));
static RegisterEmbeddedResource registerFillImage3dSrc(
createBuiltinResourceName(
EBuiltInOps::FillImage3d,
BuiltinCode::getExtension(BuiltinCode::ECodeType::Source))
.c_str(),
std::string(
#include "runtime/built_ins/kernels/fill_image3d.igdrcl_built_in"
));
} // namespace OCLRT

View File

@@ -0,0 +1,55 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include <string>
#include "runtime/built_ins/registry/built_ins_registry.h"
namespace OCLRT {
// Source registrations for the three VME (motion estimation) built-in kernels.
//
// Each file-local RegisterEmbeddedResource object is constructed with a
// resource name derived from the built-in operation id plus the extension for
// BuiltinCode::ECodeType::Source, and a std::string whose initializer comes
// from the #include'd *.igdrcl_built_in file (expected to expand to a single
// string literal containing the kernel source). Constructing the object is
// what performs the registration; the objects are not used afterwards.
static RegisterEmbeddedResource registerVmeSrc(
createBuiltinResourceName(
EBuiltInOps::VmeBlockMotionEstimateIntel,
BuiltinCode::getExtension(BuiltinCode::ECodeType::Source))
.c_str(),
std::string(
#include "runtime/built_ins/kernels/vme_block_motion_estimate_intel.igdrcl_built_in"
));
static RegisterEmbeddedResource registerVmeAdvancedSrc(
createBuiltinResourceName(
EBuiltInOps::VmeBlockAdvancedMotionEstimateCheckIntel,
BuiltinCode::getExtension(BuiltinCode::ECodeType::Source))
.c_str(),
std::string(
#include "runtime/built_ins/kernels/vme_block_advanced_motion_estimate_check_intel.igdrcl_built_in"
));
static RegisterEmbeddedResource registerVmeAdvancedBidirectionalSrc(
createBuiltinResourceName(
EBuiltInOps::VmeBlockAdvancedMotionEstimateBidirectionalCheckIntel,
BuiltinCode::getExtension(BuiltinCode::ECodeType::Source))
.c_str(),
std::string(
#include "runtime/built_ins/kernels/vme_block_advanced_motion_estimate_bidirectional_check_intel.igdrcl_built_in"
));
} // namespace OCLRT

81
runtime/built_ins/sip.cpp Normal file
View File

@@ -0,0 +1,81 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "runtime/built_ins/sip.h"
#include "runtime/device/device.h"
#include "runtime/helpers/debug_helpers.h"
#include "runtime/helpers/string.h"
namespace OCLRT {
// Maps a SIP kernel type to the internal compiler option string used to
// build it. Unknown types trigger a debug break and yield an empty string.
const char *getSipKernelCompilerInternalOptions(SipKernelType kernel) {
    if (kernel == SipKernelType::Csr) {
        return "-cl-include-sip-csr";
    }

    // Every other value is unexpected here.
    DEBUG_BREAK_IF(true);
    return "";
}
// Returns a minimal LLVM IR module (one empty function `f` plus the OpenCL
// kernel metadata nodes) whose target triple/datalayout matches the pointer
// size in use: the 64-bit variant only when the host pointer is 8 bytes wide
// and the device does not force 32-bit addressing, the 32-bit variant
// otherwise. The returned pointer refers to a string literal.
const char *getSipLlSrc(const Device &device) {
// Common tail shared by both variants; kept as a macro so it can be pasted
// onto the target-specific header through string-literal concatenation.
#define M_DUMMY_LL_SRC \
    "define void @f() { \n" \
    " ret void \n" \
    "} \n" \
    "!opencl.compiler.options = !{!0} \n" \
    "!opencl.kernels = !{!1} \n" \
    "!0 = !{} \n" \
    "!1 = !{void()* @f, !2, !3, !4, !5, !6, !7} \n" \
    "!2 = !{!\"kernel_arg_addr_space\"} \n" \
    "!3 = !{!\"kernel_arg_access_qual\"} \n" \
    "!4 = !{!\"kernel_arg_type\"} \n" \
    "!5 = !{!\"kernel_arg_type_qual\"} \n" \
    "!6 = !{!\"kernel_arg_base_type\"} \n" \
    "!7 = !{!\"kernel_arg_name\"} \n"

    constexpr const char *dummySrcFor32Bit =
        "target datalayout = \"e-p:32:32:32\" \n"
        "target triple = \"spir\" \n" M_DUMMY_LL_SRC;

    constexpr const char *dummySrcFor64Bit =
        "target datalayout = \"e-p:64:64:64\" \n"
        "target triple = \"spir64\" \n" M_DUMMY_LL_SRC;
#undef M_DUMMY_LL_SRC

    // Forced 32-bit addressing always selects the 32-bit module.
    if (device.getDeviceInfo().force32BitAddressess) {
        return dummySrcFor32Bit;
    }
    return (sizeof(void *) == 8) ? dummySrcFor64Bit : dummySrcFor32Bit;
}
// Creates a SIP kernel of the given type holding its own copy of
// `binarySize` bytes read from `binary`. A null `binary` or a zero
// `binarySize` is treated as unrecoverable.
SipKernel::SipKernel(SipKernelType type, const void *binary, size_t binarySize)
    : type(type) {
    UNRECOVERABLE_IF(binary == nullptr);
    UNRECOVERABLE_IF(binarySize == 0);

    // Fill a freshly allocated buffer first, then hand it to the member.
    std::unique_ptr<char[]> ownedCopy(new char[binarySize]);
    memcpy_s(ownedCopy.get(), binarySize, binary, binarySize);

    this->binary.reset(ownedCopy.release());
    this->binarySize = binarySize;
}
}

61
runtime/built_ins/sip.h Normal file
View File

@@ -0,0 +1,61 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include <cinttypes>
#include <memory>
namespace OCLRT {
class Device;
// Identifies a SIP kernel variant. Stored as a 32-bit unsigned value.
enum class SipKernelType : std::uint32_t {
Csr = 0,
// Not a kernel type: follows the last real enumerator, so its value equals
// the number of kernel types. Also used as the default for SipKernel::type.
COUNT
};
// Returns the internal compiler option string associated with `kernel`.
const char *getSipKernelCompilerInternalOptions(SipKernelType kernel);
// Returns the LLVM IR source text to use for SIP kernels on `device`.
const char *getSipLlSrc(const Device &device);
// Holds a SIP kernel binary together with its type.
// The class is move-only. The move operations are compiler-generated, so a
// moved-from object ends up with a null binary pointer while binarySize
// keeps its previous value.
class SipKernel {
  public:
    // Defined out of line.
    SipKernel(SipKernelType type, const void *binary, size_t binarySize);

    // Movable ...
    SipKernel(SipKernel &&) = default;
    SipKernel &operator=(SipKernel &&) = default;

    // ... but not copyable.
    SipKernel(const SipKernel &) = delete;
    SipKernel &operator=(const SipKernel &) = delete;

    // Pointer to the owned binary; remains owned by this object.
    const char *getBinary() const { return binary.get(); }

    // Size in bytes recorded for the binary.
    size_t getBinarySize() const { return binarySize; }

  protected:
    SipKernelType type = SipKernelType::COUNT;
    std::unique_ptr<char[]> binary;
    size_t binarySize = 0;
};
}

View File

@@ -0,0 +1,492 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include "runtime/accelerators/intel_accelerator.h"
#include "runtime/accelerators/intel_motion_estimation.h"
#include "runtime/built_ins/built_ins.h"
#include "runtime/helpers/dispatch_info_builder.h"
#include "runtime/mem_obj/buffer.h"
#include "runtime/mem_obj/image.h"
namespace OCLRT {
template <typename HWFamily>
class VmeBuiltinDispatchInfoBuilder : public BuiltinDispatchInfoBuilder {
public:
// Populates `vmeKernel` for the given built-in operation / kernel name using
// the media kernel build options, then caches the argument indices of every
// kernel argument this builder looks up by name.
VmeBuiltinDispatchInfoBuilder(BuiltIns &kernelsLib, Context &context, Device &device, EBuiltInOps builtinOp,
                              const char *kernelName)
    : BuiltinDispatchInfoBuilder(kernelsLib) {
    populate(context, device, builtinOp, mediaKernelsBuildOptions, kernelName, vmeKernel);

    auto &kernelInfo = vmeKernel->getKernelInfo();

    // Arguments filled in by buildDispatchInfos().
    widthArgNum = kernelInfo.getArgNumByName("width");
    heightArgNum = kernelInfo.getArgNumByName("height");
    strideArgNum = kernelInfo.getArgNumByName("stride");

    // Arguments inspected during validation.
    acceleratorArgNum = kernelInfo.getArgNumByName("accelerator");
    srcImgArgNum = kernelInfo.getArgNumByName("srcImg");
    refImgArgNum = kernelInfo.getArgNumByName("refImg");
    motionVectorBufferArgNum = kernelInfo.getArgNumByName("motion_vector_buffer");
    predictionMotionVectorBufferArgNum = kernelInfo.getArgNumByName("prediction_motion_vector_buffer");
    residualsArgNum = kernelInfo.getArgNumByName("residuals");
}
// Converts a global work size into a count of 16x16 macro-blocks per
// dimension, rounding partial blocks up. Results are returned through
// `gwWidthInBlk` (from inGws.x) and `gwHeightInBlk` (from inGws.y).
void getBlkTraits(const Vec3<size_t> &inGws, size_t &gwWidthInBlk, size_t &gwHeightInBlk) const {
    const size_t macroBlockWidth = 16;
    const size_t macroBlockHeight = 16;

    // Ceiling division: add (divisor - 1) before dividing.
    const size_t paddedWidth = inGws.x + macroBlockWidth - 1;
    const size_t paddedHeight = inGws.y + macroBlockHeight - 1;

    gwWidthInBlk = paddedWidth / macroBlockWidth;
    gwHeightInBlk = paddedHeight / macroBlockHeight;
}
// Builds the dispatch for the internal VME kernel.
//
// The requested global work size is converted to a macro-block grid
// (width x height blocks). The kernel's "height", "width" and "stride"
// arguments are set from that grid, and the kernel is dispatched with one
// SIMD-wide group of work-items per block column:
//   gws = { widthInBlocks * maxSimdSize, 1, 1 },  lws = { reqdWorkGroupSize[0], 1, 1 }.
//
// Returns false without touching `multiDispatchInfo` when `kern` is null or
// when the work size yields no block columns; returns true otherwise.
// `inDim` and `inLws` are not used.
bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo, Kernel *kern,
                        const uint32_t inDim, const Vec3<size_t> &inGws, const Vec3<size_t> &inLws, const Vec3<size_t> &inOffset) const override {
    if (kern == nullptr) {
        return false;
    }

    size_t gwWidthInBlk = 0;
    size_t gwHeightInBlk = 0;
    getBlkTraits(inGws, gwWidthInBlk, gwHeightInBlk);

    // The stride computation below divides by the number of block columns.
    // Nothing visible in this method guarantees inGws.x > 0, so reject an
    // empty width here instead of dividing by zero.
    if (gwWidthInBlk == 0) {
        return false;
    }

    const cl_int height = static_cast<cl_int>(gwHeightInBlk);
    const cl_int width = static_cast<cl_int>(gwWidthInBlk);
    const size_t numThreadsX = gwWidthInBlk;
    const size_t simdWidth = vmeKernel->getKernelInfo().getMaxSimdSize();

    // Macro-blocks handled per work-group: total blocks divided by the number
    // of work-groups, rounded up.
    const cl_int stride = (height * width + static_cast<cl_int>(numThreadsX) - 1) / static_cast<cl_int>(numThreadsX);

    // update implicit args
    vmeKernel->setArg(heightArgNum, sizeof(height), &height);
    vmeKernel->setArg(widthArgNum, sizeof(width), &width);
    vmeKernel->setArg(strideArgNum, sizeof(stride), &stride);

    // Update global work size to force macro-block to HW thread execution model
    Vec3<size_t> gws = {numThreadsX * simdWidth, 1, 1};
    Vec3<size_t> lws = {vmeKernel->getKernelInfo().reqdWorkGroupSize[0], 1, 1};

    DispatchInfoBuilder<SplitDispatch::Dim::d2D, SplitDispatch::SplitMode::NoSplit> builder;
    builder.setDispatchGeometry(gws, lws, inOffset, gws, lws);
    builder.setKernel(vmeKernel);
    builder.bake(multiDispatchInfo);
    return true;
}
// Forwards an explicitly set argument to the internal VME kernel.
//
// `err` receives CL_INVALID_ACCELERATOR_INTEL when the accelerator argument
// is given a null value (nothing is forwarded in that case); otherwise it
// receives the result of vmeKernel->setArg(). The function returns false on
// every path.
// NOTE(review): presumably `false` tells the caller not to apply the argument
// to the user-visible kernel as well - confirm against the base class contract.
bool setExplicitArg(uint32_t argIndex, size_t argSize, const void *argVal, cl_int &err) const override {
    // width/height/stride are set by this builder, never explicitly.
    DEBUG_BREAK_IF((argIndex == widthArgNum) || (argIndex == heightArgNum) || (argIndex == strideArgNum));

    const bool nullAccelerator = (argIndex == acceleratorArgNum) && (argVal == nullptr);
    if (nullAccelerator) {
        err = CL_INVALID_ACCELERATOR_INTEL;
    } else {
        err = vmeKernel->setArg(argIndex, argSize, argVal);
    }
    return false;
}
// Validates a VME dispatch request.
//
// Returns CL_INVALID_WORK_DIMENSION unless the work dimension is 2, and
// CL_INVALID_KERNEL_ARGS when no accelerator has been set on the internal
// kernel. Otherwise the result of validateVmeDispatch() is returned, called
// with the macro-block count of the work size and a multiplier chosen from
// the accelerator's block type (8x8 -> 4, 4x4 -> 16, anything else -> 1).
// `kernel` and `inLws` are not used.
cl_int validateDispatch(Kernel *kernel, uint32_t inworkDim, const Vec3<size_t> &inGws, const Vec3<size_t> &inLws, const Vec3<size_t> &inOffset) const override {
    if (inworkDim != 2) {
        return CL_INVALID_WORK_DIMENSION;
    }

    size_t widthInBlocks = 0;
    size_t heightInBlocks = 0;
    getBlkTraits(inGws, widthInBlocks, heightInBlocks);
    size_t blockCount = widthInBlocks * heightInBlocks;

    IntelAccelerator *accelerator = castToObject<IntelAccelerator>((cl_accelerator_intel)vmeKernel->getKernelArg(acceleratorArgNum));
    if (accelerator == nullptr) {
        return CL_INVALID_KERNEL_ARGS; // accelerator was not set
    }
    DEBUG_BREAK_IF(accelerator->getDescriptorSize() != sizeof(cl_motion_estimation_desc_intel));
    const cl_motion_estimation_desc_intel *desc = reinterpret_cast<const cl_motion_estimation_desc_intel *>(accelerator->getDescriptor());

    // presumably the number of sub-block results per macro-block - TODO confirm
    size_t blockMultiplier = 1;
    if (desc->mb_block_type == CL_ME_MB_TYPE_8x8_INTEL) {
        blockMultiplier = 4;
    } else if (desc->mb_block_type == CL_ME_MB_TYPE_4x4_INTEL) {
        blockMultiplier = 16;
    }

    return validateVmeDispatch(inGws, inOffset, blockCount, blockMultiplier);
}
// notes on corner cases :
// * if arg not available in kernels - returns true
// * if arg set to nullptr - returns true
bool validateBufferSize(int32_t bufferArgNum, size_t minimumSizeExpected) const {
if (bufferArgNum == -1) {
return true;
}
auto buff = castToObject<Buffer>((cl_mem)vmeKernel->getKernelArg(bufferArgNum));
if (buff == nullptr) {
return true;
}
size_t bufferSize = buff->getSize();
if (bufferSize < minimumSizeExpected) {
return false;
}
return true;
}
// End of the recursion: no expected values are left, so nothing matched.
template <typename EnumBaseType>
bool validateEnumVal(EnumBaseType val) const {
    const bool matched = false;
    return matched;
}
// Returns true when val equals any of the expected values (each one is converted to
// EnumBaseType before the comparison). The list is walked front to back and the walk
// stops at the first match.
template <typename EnumBaseType, typename ExpectedValType, typename... ExpectedValsTypes>
bool validateEnumVal(EnumBaseType val, ExpectedValType expectedVal, ExpectedValsTypes... expVals) const {
    if (val == static_cast<EnumBaseType>(expectedVal)) {
        return true;
    }
    return validateEnumVal<EnumBaseType, ExpectedValsTypes...>(val, expVals...);
}
// Reads the by-value kernel argument argNum as EnumBaseType and checks that it equals one of
// the expected values.
// Corner case:
// * argument not available in the kernel (argNum == -1) - returns true
template <typename EnumBaseType, typename... ExpectedValsTypes>
bool validateEnumArg(int32_t argNum, ExpectedValsTypes... expVals) const {
    if (argNum != -1) {
        EnumBaseType storedVal = this->getKernelArgByValValue<EnumBaseType>(static_cast<uint32_t>(argNum));
        return validateEnumVal<EnumBaseType, ExpectedValsTypes...>(storedVal, expVals...);
    }
    return true;
}
template <typename RetType>
RetType getKernelArgByValValue(uint32_t argNum) const {
auto &kai = vmeKernel->getKernelInfo().kernelArgInfo[argNum];
DEBUG_BREAK_IF(kai.kernelArgPatchInfoVector.size() != 1);
const KernelArgPatchInfo &patchInfo = kai.kernelArgPatchInfoVector[0];
DEBUG_BREAK_IF(sizeof(RetType) > patchInfo.size);
return *(RetType *)(vmeKernel->getCrossThreadData() + patchInfo.crossthreadOffset);
}
// Validates the source and reference images bound to the VME kernel.
// Returns
//  CL_INVALID_KERNEL_ARGS             - either argument does not resolve to an Image
//  CL_INVALID_IMAGE_FORMAT_DESCRIPTOR - an image is not CL_R / CL_UNORM_INT8
//  CL_OUT_OF_RESOURCES                - an image is not tiled
//  CL_INVALID_IMAGE_SIZE              - inputRegion + offset exceeds the source image in x or y
//  CL_SUCCESS                         - otherwise
cl_int validateImages(Vec3<size_t> inputRegion, Vec3<size_t> offset) const {
    Image *srcImg = castToObject<Image>((cl_mem)vmeKernel->getKernelArg(srcImgArgNum));
    Image *refImg = castToObject<Image>((cl_mem)vmeKernel->getKernelArg(refImgArgNum));
    if (!srcImg || !refImg) {
        return CL_INVALID_KERNEL_ARGS;
    }

    Image *const imagesToCheck[] = {srcImg, refImg};
    for (Image *img : imagesToCheck) {
        const cl_image_format &fmt = img->getImageFormat();
        const bool isR8Unorm = (fmt.image_channel_order == CL_R) && (fmt.image_channel_data_type == CL_UNORM_INT8);
        if (!isR8Unorm) {
            return CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
        }
        if (!img->isTiledImage) {
            // VME only works with tiled images.
            return CL_OUT_OF_RESOURCES;
        }
    }

    const cl_image_desc &srcDesc = srcImg->getImageDesc();
    const size_t availableWidth = srcDesc.image_width;
    const size_t availableHeight = srcDesc.image_height;
    const size_t requiredWidth = inputRegion.x + offset.x;
    const size_t requiredHeight = inputRegion.y + offset.y;
    if ((requiredWidth > availableWidth) || (requiredHeight > availableHeight)) {
        return CL_INVALID_IMAGE_SIZE;
    }
    return CL_SUCCESS;
}
// Validates the images and the output/input buffers of a basic VME dispatch.
//  inputRegion / offset - dispatched region and its offset, checked against the source image
//  blkNum               - number of blocks in the dispatch
//  blkMul               - per-block multiplier applied to the motion-vector and residual buffers
// Returns the image validation status if it is not CL_SUCCESS, CL_INVALID_BUFFER_SIZE if a
// bound buffer is smaller than required, CL_SUCCESS otherwise.
virtual cl_int validateVmeDispatch(Vec3<size_t> inputRegion, Vec3<size_t> offset, size_t blkNum, size_t blkMul) const {
    const cl_int imagesStatus = validateImages(inputRegion, offset);
    if (imagesStatus != CL_SUCCESS) {
        return imagesStatus;
    }

    const size_t predictorCount = 1;
    // minimum size (in bytes) expected for each buffer argument
    std::pair<int32_t, size_t> minimumSizes[] = {
        std::make_pair(motionVectorBufferArgNum, (blkNum * blkMul * 2 * sizeof(cl_short))),
        std::make_pair(predictionMotionVectorBufferArgNum, (blkNum * predictorCount * 2 * sizeof(cl_short))),
        std::make_pair(residualsArgNum, (blkNum * blkMul * sizeof(cl_ushort)))};

    for (const auto &entry : minimumSizes) {
        if (!validateBufferSize(entry.first, entry.second)) {
            return CL_INVALID_BUFFER_SIZE;
        }
    }
    return CL_SUCCESS;
}
protected:
// Argument indices of the wrapped VME kernel; they are assigned outside this excerpt.
// setExplicitArg() debug-breaks when one of these three is set explicitly.
uint32_t heightArgNum;
uint32_t widthArgNum;
uint32_t strideArgNum;
// Index of the accelerator argument; validateDispatch() reads its motion-estimation descriptor.
uint32_t acceleratorArgNum;
// Indices of the source and reference image arguments checked by validateImages().
uint32_t srcImgArgNum;
uint32_t refImgArgNum;
// Buffer argument indices passed to validateBufferSize(), which treats -1 as "argument not present".
int32_t motionVectorBufferArgNum;
int32_t predictionMotionVectorBufferArgNum;
int32_t residualsArgNum;
// Kernel whose arguments are read and set by this builder.
// NOTE(review): ownership and lifetime are not visible in this excerpt - confirm in the constructor.
Kernel *vmeKernel;
};
// Built-in operation backed by the basic VME kernel "block_motion_estimate_intel".
// All behaviour comes from VmeBuiltinDispatchInfoBuilder; this specialization only selects
// the operation id and the kernel name.
template <typename HWFamily>
class BuiltInOp<HWFamily, EBuiltInOps::VmeBlockMotionEstimateIntel> : public VmeBuiltinDispatchInfoBuilder<HWFamily> {
  public:
    BuiltInOp(BuiltIns &kernelsLib, Context &context, Device &device)
        : VmeBuiltinDispatchInfoBuilder<HWFamily>(kernelsLib,
                                                  context,
                                                  device,
                                                  EBuiltInOps::VmeBlockMotionEstimateIntel,
                                                  "block_motion_estimate_intel") {}
};
// Dispatch-info builder shared by the advanced VME built-in kernels.
// It extends the basic VME builder with the extra kernel arguments resolved by name in the
// constructor and with the validation of the flags, the enum-like arguments and the additional
// buffers these kernels take.
template <typename HWFamily>
class AdvancedVmeBuiltinDispatchInfoBuilder : public VmeBuiltinDispatchInfoBuilder<HWFamily> {
  public:
    // Constructs the base VME builder, then looks up the index of every advanced argument by name.
    AdvancedVmeBuiltinDispatchInfoBuilder(BuiltIns &kernelsLib, Context &context, Device &device, EBuiltInOps builtinOp,
                                          const char *kernelName)
        : VmeBuiltinDispatchInfoBuilder<HWFamily>(kernelsLib, context, device, builtinOp, kernelName) {
        auto &&kernelInfo = this->vmeKernel->getKernelInfo();
        flagsArgNum = kernelInfo.getArgNumByName("flags");
        intraSrcImgArgNum = kernelInfo.getArgNumByName("intraSrcImg");
        skipBlockTypeArgNum = kernelInfo.getArgNumByName("skip_block_type");
        searchCostPenaltyArgNum = kernelInfo.getArgNumByName("search_cost_penalty");
        searchCostPrecisionArgNum = kernelInfo.getArgNumByName("search_cost_precision");
        bidirWeightArgNum = kernelInfo.getArgNumByName("bidir_weight");
        predictorsBufferArgNum = kernelInfo.getArgNumByName("predictors_buffer");
        countMotionVectorBufferArgNum = kernelInfo.getArgNumByName("count_motion_vector_buffer");
        skipMotionVectorBufferArgNum = kernelInfo.getArgNumByName("skip_motion_vector_buffer");
        intraSearchPredictorModesArgNum = kernelInfo.getArgNumByName("intra_search_predictor_modes");
        skipResidualsArgNum = kernelInfo.getArgNumByName("skip_residuals");
        intraResidualsArgNum = kernelInfo.getArgNumByName("intra_residuals");
    }

    // Same as the base implementation, except that setting the source image also binds the very
    // same value to the "intraSrcImg" argument. The status of that extra setArg call is not
    // inspected. Setting "intraSrcImg" directly is not expected (debug-break only).
    bool setExplicitArg(uint32_t argIndex, size_t argSize, const void *argVal, cl_int &err) const override {
        using Base = VmeBuiltinDispatchInfoBuilder<HWFamily>;
        DEBUG_BREAK_IF(argIndex == intraSrcImgArgNum);

        const bool isSourceImage = (argIndex == this->srcImgArgNum);
        if (isSourceImage) {
            // rebind also as media block image
            this->vmeKernel->setArg(intraSrcImgArgNum, argSize, argVal);
        }
        return Base::setExplicitArg(argIndex, argSize, argVal, err);
    }

    // Tells whether this builder serves the bidirectional kernel; it selects the number of
    // skip checks per block used by the buffer size computations below.
    virtual bool isBidirKernel() const {
        return false;
    }

    // Validates the "flags" argument.
    // Returns false when chroma intra prediction is requested. Otherwise returns true and,
    // when the flags select a skip block type, stores the matching macro-block type in
    // outSkipBlockType (it is left untouched when no branch matches).
    bool validateFlags(uint32_t &outSkipBlockType) const {
        using Base = VmeBuiltinDispatchInfoBuilder<HWFamily>;
        const uint32_t flags = Base::template getKernelArgByValValue<uint32_t>(flagsArgNum);

        const bool chromaIntraRequested =
            (flags & CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL) == CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL;
        if (chromaIntraRequested) {
            return false;
        }

        if (flags == CL_ME_SKIP_BLOCK_TYPE_16x16_INTEL) {
            outSkipBlockType = CL_ME_MB_TYPE_16x16_INTEL;
        } else if ((flags & CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL) == CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL) {
            outSkipBlockType = CL_ME_MB_TYPE_8x8_INTEL;
        }
        return true;
    }

    // Validates the optional "skip_block_type" argument.
    // Returns true when the kernel has no such argument (outSkipBlockType untouched); otherwise
    // stores the argument value in outSkipBlockType and returns whether it is 16x16 or 8x8.
    bool validateSkipBlockTypeArg(uint32_t &outSkipBlockType) const {
        using Base = VmeBuiltinDispatchInfoBuilder<HWFamily>;
        if (skipBlockTypeArgNum == -1) {
            return true;
        }

        outSkipBlockType = Base::template getKernelArgByValValue<uint32_t>(static_cast<uint32_t>(skipBlockTypeArgNum));
        const bool isSupportedType =
            (outSkipBlockType == CL_ME_MB_TYPE_16x16_INTEL) || (outSkipBlockType == CL_ME_MB_TYPE_8x8_INTEL);
        return isSupportedType;
    }

    // Expected size of the intra search predictor modes buffer for blkNum blocks.
    size_t getIntraSearchPredictorModesBuffExpSize(size_t blkNum) const {
        // vector size is 22 - 1 (16x16 luma block) + 4 (8x8 luma block) + 16 (4x4 luma block) + 1 (8x8 chroma block)
        const int entriesPerBlock = 22;
        return blkNum * entriesPerBlock;
    }

    // Expected size, in bytes, of the skip motion vector buffer for blkNum blocks.
    size_t getSkipMotionVectorBufferExpSize(uint32_t skipBlockType, size_t blkNum) const {
        // vector size is either 1 (16x16 block) or 4 (8x8 block)
        // 0 to 8 skip MVs per MB
        // may be null if all MBs in frame have 0 skip check MVs in which case VME skip checks are not performed
        // layout assumes 4 (for bidir) or 8 (otherwise) skip check MVs per MB
        // row-major block layout; all MVs for a block are contiguous
        // buffer size depends on the block and frame size.
        int vectorsPerCheck = 4;
        if (skipBlockType == CL_ME_MB_TYPE_16x16_INTEL) {
            vectorsPerCheck = 1;
        }
        int checksPerBlock = 8;
        if (isBidirKernel()) {
            checksPerBlock = 4;
        }
        return blkNum * checksPerBlock * vectorsPerCheck * 2 * sizeof(cl_short);
    }

    // Expected size, in bytes, of the skip residuals buffer for blkNum blocks.
    size_t getSkipResidualsBuffExpSize(uint32_t skipBlockType, size_t blkNum) const {
        // output buffer of vectors of unsigned short SAD adjusted values corresponding to the input skip check MVs
        // may be null if skip_motion_vector_buffer is null
        // vector size is either 1 (16x16 block) or 4 (8x8 block); any other type is sized like 16x16
        // 0 to 8 skip check residuals per MB
        // the size below assumes 4 (for bidir) or 8 (otherwise) skip check residuals per MB
        // row major block layout; all MVs for a block are contiguous
        // buffer size depends on the block and frame size
        const int residualsPerCheck = (skipBlockType == CL_ME_MB_TYPE_8x8_INTEL) ? 4 : 1;
        const int checksPerBlock = (isBidirKernel() ? 4 : 8);
        return blkNum * residualsPerCheck * checksPerBlock * sizeof(cl_ushort);
    }

    // Expected size, in bytes, of the intra residuals buffer for blkNum blocks.
    size_t getIntraResidualsBuffExpSize(size_t blkNum) const {
        // output buffer of vectors of unsigned short SAD adjusted values
        // may be null, in which case the corresponding intra residuals are not returned
        // vector size is 4 - 1 (16x16 luma block) + 1 (8x8 luma block) + 1 (4x4 luma block) + 1 (8x8 chroma block)
        // 1 vector per MB
        // buffer size depends on the frame size
        const int residualsPerBlock = 4;
        return (blkNum * sizeof(cl_ushort) * residualsPerBlock);
    }

    // Expected size, in bytes, of the predictors buffer: 8 predictors of two cl_short each per block.
    size_t getPredictorsBufferExpSize(size_t blkNum) const {
        const size_t predictorsPerBlock = 8;
        return (blkNum * predictorsPerBlock * 2 * sizeof(cl_short));
    }

    // Runs the basic VME validation first, then validates the advanced arguments:
    //  flags                                     -> CL_INVALID_KERNEL_ARGS on failure
    //  skip block type, search cost penalty and
    //  search cost precision                     -> CL_OUT_OF_RESOURCES on failure
    //  bidir weight                              -> CL_INVALID_KERNEL_ARGS on failure
    //  additional buffers too small              -> CL_INVALID_BUFFER_SIZE
    cl_int validateVmeDispatch(Vec3<size_t> inputRegion, Vec3<size_t> offset, size_t blkNum, size_t blkMul) const override {
        using Base = VmeBuiltinDispatchInfoBuilder<HWFamily>;

        const cl_int baseStatus = Base::validateVmeDispatch(inputRegion, offset, blkNum, blkMul);
        if (baseStatus != CL_SUCCESS) {
            return baseStatus;
        }

        // 16x16 is assumed unless the flags or the skip_block_type argument say otherwise
        uint32_t skipBlockType = CL_ME_MB_TYPE_16x16_INTEL;
        if (!validateFlags(skipBlockType)) {
            return CL_INVALID_KERNEL_ARGS;
        }
        if (!validateSkipBlockTypeArg(skipBlockType)) {
            return CL_OUT_OF_RESOURCES;
        }

        const bool penaltyIsValid = Base::template validateEnumArg<uint32_t>(
            searchCostPenaltyArgNum, CL_ME_COST_PENALTY_NONE_INTEL, CL_ME_COST_PENALTY_LOW_INTEL,
            CL_ME_COST_PENALTY_NORMAL_INTEL, CL_ME_COST_PENALTY_HIGH_INTEL);
        if (!penaltyIsValid) {
            return CL_OUT_OF_RESOURCES;
        }

        const bool precisionIsValid = Base::template validateEnumArg<uint32_t>(
            searchCostPrecisionArgNum, CL_ME_COST_PRECISION_QPEL_INTEL, CL_ME_COST_PRECISION_HPEL_INTEL,
            CL_ME_COST_PRECISION_PEL_INTEL, CL_ME_COST_PRECISION_DPEL_INTEL);
        if (!precisionIsValid) {
            return CL_OUT_OF_RESOURCES;
        }

        const bool bidirWeightIsValid = Base::template validateEnumArg<uint8_t>(
            bidirWeightArgNum, 0, CL_ME_BIDIR_WEIGHT_QUARTER_INTEL, CL_ME_BIDIR_WEIGHT_THIRD_INTEL,
            CL_ME_BIDIR_WEIGHT_HALF_INTEL, CL_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL, CL_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL);
        if (!bidirWeightIsValid) {
            return CL_INVALID_KERNEL_ARGS;
        }

        // minimum size expected for each additional buffer argument
        std::pair<int32_t, size_t> minimumSizes[] = {
            std::make_pair(countMotionVectorBufferArgNum, (blkNum * 2 * sizeof(cl_short))),
            std::make_pair(skipMotionVectorBufferArgNum, getSkipMotionVectorBufferExpSize(skipBlockType, blkNum)),
            std::make_pair(intraSearchPredictorModesArgNum, getIntraSearchPredictorModesBuffExpSize(blkNum)),
            std::make_pair(skipResidualsArgNum, getSkipResidualsBuffExpSize(skipBlockType, blkNum)),
            std::make_pair(intraResidualsArgNum, getIntraResidualsBuffExpSize(blkNum)),
            std::make_pair(predictorsBufferArgNum, getPredictorsBufferExpSize(blkNum))};
        for (const auto &entry : minimumSizes) {
            if (!this->validateBufferSize(entry.first, entry.second)) {
                return CL_INVALID_BUFFER_SIZE;
            }
        }
        return CL_SUCCESS;
    }

  protected:
    // Kernel argument indices resolved by name in the constructor.
    uint32_t flagsArgNum;
    int32_t skipBlockTypeArgNum;
    uint32_t searchCostPenaltyArgNum;
    uint32_t searchCostPrecisionArgNum;
    int32_t bidirWeightArgNum;
    int32_t predictorsBufferArgNum;
    uint32_t countMotionVectorBufferArgNum;
    uint32_t skipMotionVectorBufferArgNum;
    uint32_t intraSearchPredictorModesArgNum;
    uint32_t skipResidualsArgNum;
    uint32_t intraResidualsArgNum;
    uint32_t intraSrcImgArgNum;
};
// Built-in operation backed by the kernel "block_advanced_motion_estimate_check_intel".
// On top of the advanced VME validation it requires the count-motion-vector buffer argument
// to be bound to a Buffer.
template <typename HWFamily>
class BuiltInOp<HWFamily, EBuiltInOps::VmeBlockAdvancedMotionEstimateCheckIntel> : public AdvancedVmeBuiltinDispatchInfoBuilder<HWFamily> {
  public:
    BuiltInOp(BuiltIns &kernelsLib, Context &context, Device &device)
        : AdvancedVmeBuiltinDispatchInfoBuilder<HWFamily>(kernelsLib, context, device, EBuiltInOps::VmeBlockAdvancedMotionEstimateCheckIntel,
                                                          "block_advanced_motion_estimate_check_intel") {
    }

    // Parameters are named like in the overridden base method (blkNum = number of blocks,
    // blkMul = per-block multiplier); they are forwarded unchanged.
    // Returns the base validation status if it is not CL_SUCCESS, CL_INVALID_BUFFER_SIZE when
    // the count-motion-vector buffer argument does not resolve to a Buffer, CL_SUCCESS otherwise.
    cl_int validateVmeDispatch(Vec3<size_t> inputRegion, Vec3<size_t> offset,
                               size_t blkNum, size_t blkMul) const override {
        cl_int basicAdvVmeValidationStatus = AdvancedVmeBuiltinDispatchInfoBuilder<HWFamily>::validateVmeDispatch(inputRegion, offset, blkNum, blkMul);
        if (basicAdvVmeValidationStatus != CL_SUCCESS) {
            return basicAdvVmeValidationStatus;
        }

        // unlike in the generic size check, a missing buffer is an error for this kernel
        auto countMotionVectorBuff = castToObject<Buffer>((cl_mem)this->vmeKernel->getKernelArg(this->countMotionVectorBufferArgNum));
        if (countMotionVectorBuff == nullptr) {
            return CL_INVALID_BUFFER_SIZE;
        }
        return CL_SUCCESS;
    }
};
// Built-in operation backed by the kernel
// "block_advanced_motion_estimate_bidirectional_check_intel".
// It only differs from the generic advanced builder by reporting itself as bidirectional.
template <typename HWFamily>
class BuiltInOp<HWFamily, EBuiltInOps::VmeBlockAdvancedMotionEstimateBidirectionalCheckIntel> : public AdvancedVmeBuiltinDispatchInfoBuilder<HWFamily> {
  public:
    BuiltInOp(BuiltIns &kernelsLib, Context &context, Device &device)
        : AdvancedVmeBuiltinDispatchInfoBuilder<HWFamily>(kernelsLib,
                                                          context,
                                                          device,
                                                          EBuiltInOps::VmeBlockAdvancedMotionEstimateBidirectionalCheckIntel,
                                                          "block_advanced_motion_estimate_bidirectional_check_intel") {}

    // Always true for this kernel.
    bool isBidirKernel() const override {
        const bool bidirectional = true;
        return bidirectional;
    }
};
}

View File

@@ -0,0 +1,74 @@
# Copyright (c) 2017, Intel Corporation
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
# Generation-independent sources of the built-in kernels simulation library.
# Per-generation sources are appended in the loop further below.
set(BUILTIN_KERNELS_SIMULATION_SRCS
  "${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt"
  "${CMAKE_CURRENT_SOURCE_DIR}/opencl_c.cpp"
  "${CMAKE_CURRENT_SOURCE_DIR}/opencl_c.h"
  "${CMAKE_CURRENT_SOURCE_DIR}/scheduler_simulation.cpp"
  "${CMAKE_CURRENT_SOURCE_DIR}/scheduler_simulation.inl"
  "${CMAKE_CURRENT_SOURCE_DIR}/scheduler_simulation.h"
)

# Relax the compiler flags: drop warnings-as-errors and a few warning switches.
# CMAKE_CXX_FLAGS is always passed quoted, so an empty value still reaches string() as one
# (empty) input argument and a value containing ';' is not split into several arguments.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REPLACE "/WX" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REGEX REPLACE "-Werror[^ \t\n]*" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REPLACE "-Wsometimes-uninitialized" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REPLACE "-Wsign-compare" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REPLACE "-Wunused-variable" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")

# Referencing the variable by name keeps the condition well-formed even if the id is empty.
if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-everything")
endif()

if(COMPILER_SUPPORTS_CXX11)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
elseif(COMPILER_SUPPORTS_CXX0X)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++0x")
endif()

if(NOT MSVC)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fpermissive -fPIC")
endif()

ENABLE_WUD()

list (APPEND HEADER_INCLUDES ${IGDRCL_SOURCE_DIR}/runtime ${UMKM_SHAREDDATA_INCLUDE_PATHS})

# For every generation with supported platforms (highest first): publish its default platform
# as a compile definition and add its include directory and scheduler simulation source.
foreach(GEN_NUM RANGE ${MAX_GEN} 0 -1)
  GEN_CONTAINS_PLATFORMS("SUPPORTED" ${GEN_NUM} GENX_HAS_PLATFORMS)
  if(${GENX_HAS_PLATFORMS})
    list(APPEND DEFAULT_GEN_PLATFORMS_DEFITIONS DEFAULT_GEN${GEN_NUM}_PLATFORM=${DEFAULT_SUPPORTED_GEN${GEN_NUM}_PLATFORM})
    list (APPEND HEADER_INCLUDES ${IGDRCL_SOURCE_DIR}/runtime/gen${GEN_NUM})
    list (APPEND BUILTIN_KERNELS_SIMULATION_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/gen${GEN_NUM}/scheduler_simulation.cpp)
  endif()
endforeach()

# The simulation is built as an object library.
add_library(${BIKSIM_LIB_NAME} OBJECT ${BUILTIN_KERNELS_SIMULATION_SRCS})
target_include_directories(${BIKSIM_LIB_NAME} BEFORE PRIVATE ${HEADER_INCLUDES})
target_include_directories(${BIKSIM_LIB_NAME} PRIVATE
  ${KHRONOS_HEADERS_DIR}
  ${IGDRCL__IGC_INCLUDE_DIR}
  ${THIRD_PARTY_DIR}
)
set_target_properties(${BIKSIM_LIB_NAME} PROPERTIES FOLDER "built_ins")
target_compile_definitions(${BIKSIM_LIB_NAME} PUBLIC ${SUPPORTED_GEN_FLAGS_DEFINITONS} ${DEFAULT_GEN_PLATFORMS_DEFITIONS})

View File

@@ -0,0 +1,105 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "CL/cl.h"
#include "runtime/builtin_kernels_simulation/opencl_c.h"
#include "runtime/builtin_kernels_simulation/scheduler_simulation.h"
#include "runtime/builtin_kernels_simulation/scheduler_simulation.inl"
#include "runtime/memory_manager/graphics_allocation.h"
#include "runtime/gen8/hw_cmds.h"
#include "runtime/execution_model/device_enqueue.h"
using namespace OCLRT;
using namespace BuiltinKernelsSimulation;
// Gen8 flavour of the scheduler. The sources below are included inside this namespace, so
// what they define is reached as Gen8SchedulerSimulation::<name> by the specializations
// further down in this file.
// NOTE(review): the order of the define, declarations and includes is presumably significant
// (later includes relying on earlier ones) - confirm before reordering.
namespace Gen8SchedulerSimulation {
// NOTE(review): presumably switches the included scheduler sources to host emulation - confirm.
#define SCHEDULER_EMULATION
// Declaration only; the definition is not part of this excerpt.
uint GetNextPowerof2(uint number);
// Returns the default profiling timer resolution of the default Gen8 platform, converted to float.
float __intel__getProfilingTimerResolution() {
return static_cast<float>(DEFAULT_GEN8_PLATFORM::hwInfo.capabilityTable.defaultProfilingTimerResolution);
}
#include "runtime/gen8/device_enqueue.h"
#include "runtime/gen8/scheduler_definitions.h"
#include "runtime/gen8/scheduler_igdrcl_built_in.inl"
#include "runtime/scheduler/scheduler.cl"
}
namespace BuiltinKernelsSimulation {
// Entry point of one Gen8 simulation thread.
// Registers the calling thread under `index` in threadIDToLocalIDmap, spins until
// conditionReady becomes true and then runs the Gen8 scheduler on the CPU pointers of the
// given allocations. debugQueue may be nullptr, in which case nullptr is passed on.
// NOTE(review): the map insertion is not guarded by a lock here - confirm how the caller
// serializes thread start-up.
template <>
void SchedulerSimulation<BDWFamily>::startScheduler(uint32_t index,
                                                    GraphicsAllocation *queue,
                                                    GraphicsAllocation *commandsStack,
                                                    GraphicsAllocation *eventsPool,
                                                    GraphicsAllocation *secondaryBatchBuffer,
                                                    GraphicsAllocation *dsh,
                                                    GraphicsAllocation *reflectionSurface,
                                                    GraphicsAllocation *queueStorageBuffer,
                                                    GraphicsAllocation *ssh,
                                                    GraphicsAllocation *debugQueue) {
    threadIDToLocalIDmap.insert(std::make_pair(std::this_thread::get_id(), index));

    // busy-wait for the start signal
    while (!conditionReady) {
    }

    auto *commandQueuePtr = (IGIL_CommandQueue *)queue->getUnderlyingBuffer();
    auto *commandsStackPtr = (uint *)commandsStack->getUnderlyingBuffer();
    auto *eventsPoolPtr = (IGIL_EventPool *)eventsPool->getUnderlyingBuffer();
    auto *secondaryBatchPtr = (uint *)secondaryBatchBuffer->getUnderlyingBuffer();
    auto *dshPtr = (char *)dsh->getUnderlyingBuffer();
    auto *reflectionPtr = (IGIL_KernelDataHeader *)reflectionSurface->getUnderlyingBuffer();
    auto *queueStoragePtr = (uint *)queueStorageBuffer->getUnderlyingBuffer();
    auto *sshPtr = (char *)ssh->getUnderlyingBuffer();

    DebugDataBuffer *debugDataPtr = nullptr;
    if (debugQueue != nullptr) {
        debugDataPtr = (DebugDataBuffer *)debugQueue->getUnderlyingBuffer();
    }

    Gen8SchedulerSimulation::SchedulerParallel20(commandQueuePtr,
                                                 commandsStackPtr,
                                                 eventsPoolPtr,
                                                 secondaryBatchPtr,
                                                 dshPtr,
                                                 reflectionPtr,
                                                 queueStoragePtr,
                                                 sshPtr,
                                                 debugDataPtr);
}
// Gen8 specialization: hands every argument, unchanged and in the same order, to the
// patchGpGpuWalker() compiled into the Gen8SchedulerSimulation namespace.
template <>
void SchedulerSimulation<BDWFamily>::patchGpGpuWalker(uint secondLevelBatchOffset,
                                                      __global uint *secondaryBatchBuffer,
                                                      uint interfaceDescriptorOffset,
                                                      uint simdSize,
                                                      uint totalLocalWorkSize,
                                                      uint3 dimSize,
                                                      uint3 startPoint,
                                                      uint numberOfHwThreadsPerWg,
                                                      uint indirectPayloadSize,
                                                      uint ioHoffset) {
    Gen8SchedulerSimulation::patchGpGpuWalker(
        secondLevelBatchOffset, secondaryBatchBuffer, interfaceDescriptorOffset,
        simdSize, totalLocalWorkSize,
        dimSize, startPoint,
        numberOfHwThreadsPerWg, indirectPayloadSize, ioHoffset);
}
template class SchedulerSimulation<BDWFamily>;
} // namespace BuiltinKernelsSimulation

View File

@@ -0,0 +1,104 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "CL/cl.h"
#include "runtime/builtin_kernels_simulation/opencl_c.h"
#include "runtime/builtin_kernels_simulation/scheduler_simulation.h"
#include "runtime/builtin_kernels_simulation/scheduler_simulation.inl"
#include "runtime/memory_manager/graphics_allocation.h"
#include "runtime/gen9/hw_cmds.h"
#include "runtime/execution_model/device_enqueue.h"
using namespace OCLRT;
using namespace BuiltinKernelsSimulation;
// Forward declaration of the hardware family type used by the SchedulerSimulation<SKLFamily>
// specializations below.
namespace OCLRT {
struct SKLFamily;
}
// Gen9 flavour of the scheduler. The sources below are included inside this namespace, so
// what they define is reached as Gen9SchedulerSimulation::<name> by the specializations
// further down in this file.
// NOTE(review): the order of the define and the includes is presumably significant
// (later includes relying on earlier ones) - confirm before reordering.
namespace Gen9SchedulerSimulation {
// NOTE(review): presumably switches the included scheduler sources to host emulation - confirm.
#define SCHEDULER_EMULATION
// Returns the default profiling timer resolution of the default Gen9 platform, converted to float.
float __intel__getProfilingTimerResolution() {
return static_cast<float>(DEFAULT_GEN9_PLATFORM::hwInfo.capabilityTable.defaultProfilingTimerResolution);
}
#include "runtime/gen9/device_enqueue.h"
#include "runtime/gen9/scheduler_definitions.h"
#include "runtime/gen9/scheduler_igdrcl_built_in.inl"
#include "runtime/scheduler/scheduler.cl"
}
namespace BuiltinKernelsSimulation {
// Entry point of one Gen9 simulation thread.
// Registers the calling thread under `index` in threadIDToLocalIDmap, spins until
// conditionReady becomes true and then runs the Gen9 scheduler on the CPU pointers of the
// given allocations. debugQueue may be nullptr, in which case nullptr is passed on.
// NOTE(review): the map insertion is not guarded by a lock here - confirm how the caller
// serializes thread start-up.
template <>
void SchedulerSimulation<SKLFamily>::startScheduler(uint32_t index,
                                                    GraphicsAllocation *queue,
                                                    GraphicsAllocation *commandsStack,
                                                    GraphicsAllocation *eventsPool,
                                                    GraphicsAllocation *secondaryBatchBuffer,
                                                    GraphicsAllocation *dsh,
                                                    GraphicsAllocation *reflectionSurface,
                                                    GraphicsAllocation *queueStorageBuffer,
                                                    GraphicsAllocation *ssh,
                                                    GraphicsAllocation *debugQueue) {
    threadIDToLocalIDmap.insert(std::make_pair(std::this_thread::get_id(), index));

    // busy-wait for the start signal
    while (!conditionReady) {
    }

    auto *commandQueuePtr = (IGIL_CommandQueue *)queue->getUnderlyingBuffer();
    auto *commandsStackPtr = (uint *)commandsStack->getUnderlyingBuffer();
    auto *eventsPoolPtr = (IGIL_EventPool *)eventsPool->getUnderlyingBuffer();
    auto *secondaryBatchPtr = (uint *)secondaryBatchBuffer->getUnderlyingBuffer();
    auto *dshPtr = (char *)dsh->getUnderlyingBuffer();
    auto *reflectionPtr = (IGIL_KernelDataHeader *)reflectionSurface->getUnderlyingBuffer();
    auto *queueStoragePtr = (uint *)queueStorageBuffer->getUnderlyingBuffer();
    auto *sshPtr = (char *)ssh->getUnderlyingBuffer();

    DebugDataBuffer *debugDataPtr = nullptr;
    if (debugQueue != nullptr) {
        debugDataPtr = (DebugDataBuffer *)debugQueue->getUnderlyingBuffer();
    }

    Gen9SchedulerSimulation::SchedulerParallel20(commandQueuePtr,
                                                 commandsStackPtr,
                                                 eventsPoolPtr,
                                                 secondaryBatchPtr,
                                                 dshPtr,
                                                 reflectionPtr,
                                                 queueStoragePtr,
                                                 sshPtr,
                                                 debugDataPtr);
}
// Gen9 specialization: hands every argument, unchanged and in the same order, to the
// patchGpGpuWalker() compiled into the Gen9SchedulerSimulation namespace.
template <>
void SchedulerSimulation<SKLFamily>::patchGpGpuWalker(uint secondLevelBatchOffset,
                                                      __global uint *secondaryBatchBuffer,
                                                      uint interfaceDescriptorOffset,
                                                      uint simdSize,
                                                      uint totalLocalWorkSize,
                                                      uint3 dimSize,
                                                      uint3 startPoint,
                                                      uint numberOfHwThreadsPerWg,
                                                      uint indirectPayloadSize,
                                                      uint ioHoffset) {
    Gen9SchedulerSimulation::patchGpGpuWalker(
        secondLevelBatchOffset, secondaryBatchBuffer, interfaceDescriptorOffset,
        simdSize, totalLocalWorkSize,
        dimSize, startPoint,
        numberOfHwThreadsPerWg, indirectPayloadSize, ioHoffset);
}
template class SchedulerSimulation<SKLFamily>;
} // namespace BuiltinKernelsSimulation

View File

@@ -0,0 +1,152 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include <cstdint>
#include "runtime/helpers/string.h"
#include "CL/cl.h"
#include "opencl_c.h"
namespace BuiltinKernelsSimulation {
#define SCHEDULER_EMULATION 1
// globals
// Mutex locked by the atomic_* helper templates of opencl_c.h.
std::mutex gMutex;
// Ids and sizes returned by get_global_id() / get_local_id() / get_local_size(); the id arrays
// are only consulted while threadIDToLocalIDmap is empty. They are written outside this file.
unsigned int globalID[3];
unsigned int localID[3];
unsigned int localSize[3];
// Index registered for each simulation thread; when non-empty it takes precedence over
// globalID / localID in get_global_id() / get_local_id().
std::map<std::thread::id, uint32_t> threadIDToLocalIDmap;
// Barrier entered by barrier(); it is dereferenced unchecked there, so it has to be set first.
SynchronizationBarrier *pGlobalBarrier = nullptr;
// Component-wise sum of two uint4 values.
uint4 operator+(uint4 const &a, uint4 const &b) {
    return uint4(a.x + b.x,
                 a.y + b.y,
                 a.z + b.z,
                 a.w + b.w);
}
// Component-wise sum of two int4 values.
int4 operator+(int4 const &a, int4 const &b) {
    return int4(a.x + b.x,
                a.y + b.y,
                a.z + b.z,
                a.w + b.w);
}
// Simulated get_local_id().
// While simulation threads are registered in threadIDToLocalIDmap, the result is the index
// registered for the calling thread modulo 24 (dim is ignored); a thread that was never
// registered is treated as index 0. Otherwise the value of localID[dim], set by the
// loop-based simulation, is returned.
// The map is only looked up here: std::map::operator[] would insert a new entry for an
// unknown thread and thereby modify the shared map from a read-only query.
uint get_local_id(int dim) {
    // use thread id
    if (!threadIDToLocalIDmap.empty()) {
        const auto entry = threadIDToLocalIDmap.find(std::this_thread::get_id());
        const uint registeredIndex = (entry != threadIDToLocalIDmap.end()) ? entry->second : 0;
        return registeredIndex % 24;
    }
    // use id from loop iteration
    return localID[dim];
}
// Simulated get_global_id().
// While simulation threads are registered in threadIDToLocalIDmap, the result is the index
// registered for the calling thread (dim is ignored); a thread that was never registered is
// treated as index 0. Otherwise the value of globalID[dim], set by the loop-based simulation,
// is returned.
// The map is only looked up here: std::map::operator[] would insert a new entry for an
// unknown thread and thereby modify the shared map from a read-only query.
uint get_global_id(int dim) {
    // use thread id
    if (!threadIDToLocalIDmap.empty()) {
        const auto entry = threadIDToLocalIDmap.find(std::this_thread::get_id());
        return (entry != threadIDToLocalIDmap.end()) ? entry->second : 0;
    }
    // use id from loop iteration
    return globalID[dim];
}
// Simulated get_local_size(): the value stored in localSize for dimension dim.
// dim is used as an index without a range check.
uint get_local_size(int dim) {
    const uint sizeInDim = localSize[dim];
    return sizeInDim;
}
// Simulated get_num_groups(): NUM_OF_THREADS divided by 24, independent of dim.
uint get_num_groups(int dim) {
    const uint groupCount = NUM_OF_THREADS / 24;
    return groupCount;
}
// Simulated get_group_id(): the global id in dimension dim divided by 24.
uint get_group_id(int dim) {
    const uint globalIdInDim = get_global_id(dim);
    return globalIdInDim / 24;
}
// Simulated barrier(): enters the global SynchronizationBarrier. The argument is ignored.
// pGlobalBarrier is dereferenced without a null check.
void barrier(int x) {
    pGlobalBarrier->enter();

    // The volatile counter exists only to give a debugger a line to stop on once the
    // barrier has been passed.
    // PUT BREAKPOINT HERE to stop after each barrier
    volatile int BreakPointHere = 0;
    BreakPointHere = BreakPointHere + 1;
}
// Simulated read_imageui(): reads the pixel at coord from the linear image im.
// The result starts as (0, 0, 0, 1); each stored channel overwrites the low
// im->bytesPerChannel bytes of the matching component, in x, y, z, w order.
// At most four channels are unpacked, and the capacity handed to memcpy_s is the number of
// bytes really left in the result, so the copy can never run past the end of `color`.
// coord is not checked against the image dimensions.
uint4 read_imageui(image *im, int4 coord) {
    uint4 color(0, 0, 0, 1);
    uint offset = ((coord.z * im->height + coord.y) * im->width + coord.x) * im->bytesPerChannel * im->channels;
    char *temp = &im->ptr[offset];
    char *colorDst = (char *)&color;
    size_t dstBytesLeft = sizeof(uint4);
    for (uint i = 0; (i < im->channels) && (dstBytesLeft >= sizeof(uint)); i++) {
        memcpy_s(colorDst, dstBytesLeft, temp, im->bytesPerChannel);
        temp += im->bytesPerChannel;
        // each channel lands in its own 32-bit component
        colorDst += sizeof(uint);
        dstBytesLeft -= sizeof(uint);
    }
    return color;
}
// Simulated write_imageui(): stores color at coord in the linear image im.
// For every channel the low im->bytesPerChannel bytes of the matching component (x, y, z, w
// order) are written. At most four channels are written, and writing stops as soon as the
// next channel would no longer fit into the image storage.
// Returns the color that was passed in. The previous code dereferenced the write cursor after
// it had been advanced past the pixel, i.e. it returned whatever bytes followed the pixel and
// read beyond the image when the last pixel was written.
uint4 write_imageui(image *im, uint4 coord, uint4 color) {
    uint offset = ((coord.z * im->height + coord.y) * im->width + coord.x) * im->bytesPerChannel * im->channels;
    size_t size = im->width * im->height * im->depth * im->bytesPerChannel * im->channels;
    // bytes available from the pixel start to the end of the image (0 if the pixel is outside)
    size_t dstBytesLeft = (offset < size) ? (size - offset) : 0;
    char *temp = &im->ptr[offset];
    char *colorSrc = (char *)&color;
    for (uint i = 0; (i < im->channels) && (i < 4); i++) {
        if (dstBytesLeft < im->bytesPerChannel) {
            break;
        }
        memcpy_s(temp, dstBytesLeft, colorSrc, im->bytesPerChannel);
        temp += im->bytesPerChannel;
        dstBytesLeft -= im->bytesPerChannel;
        // each channel comes from its own 32-bit component
        colorSrc += sizeof(uint);
    }
    return color;
}
// Simulated convert_uchar_sat(): converts c to uchar with saturation - values above 255 yield
// 255 instead of being truncated to their low 8 bits.
uchar convert_uchar_sat(uint c) {
    const uint maxUchar = 0xFFu;
    return (c > maxUchar) ? (uchar)maxUchar : (uchar)c;
}
// Simulated convert_ushort_sat(): converts c to ushort with saturation - values above 65535
// yield 65535 instead of being truncated to their low 16 bits.
ushort convert_ushort_sat(uint c) {
    const uint maxUshort = 0xFFFFu;
    return (c > maxUshort) ? (ushort)maxUshort : (ushort)c;
}
} // namespace BuiltinKernelsSimulation

View File

@@ -0,0 +1,304 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include <mutex>
#include <condition_variable>
#include <map>
#include <thread>
#include <string.h>
#include <cstdint>
// OpenCL Types
typedef uint32_t uint;
typedef uint8_t uchar;
typedef uint16_t ushort;
typedef uint64_t ulong;
namespace BuiltinKernelsSimulation {
// number of threads in wkg
#define NUM_OF_THREADS 24
#define CLK_GLOBAL_MEM_FENCE 1
#define CLK_LOCAL_MEM_FENCE 2
// Reusable barrier for a fixed number of threads.
// Every participant calls enter(); the call blocks until `count` threads have entered, then
// all of them are released and the barrier is armed again for the next round.
class SynchronizationBarrier {
  public:
    // count - number of threads that have to call enter() before any of them is released
    SynchronizationBarrier(int count) : m_Count(count), m_InitialCount(count), m_BarrierCounter(0) {
    }

    ~SynchronizationBarrier() {
    }

    void enter() {
        std::unique_lock<std::mutex> lck(m_Mutex);

        m_Count--;
        // round this thread arrived in; it changes once the last participant shows up
        const unsigned int arrivalRound = m_BarrierCounter;

        if (m_Count <= 0) {
            // last thread of the round: re-arm the barrier and release everybody
            m_Count = m_InitialCount;
            m_BarrierCounter++;
            m_AllHitBarrierCondition.notify_all();
            return;
        }

        // the loop protects against spurious wake-ups
        while (arrivalRound == m_BarrierCounter) {
            m_AllHitBarrierCondition.wait(lck);
        }
    }

  private:
    std::mutex m_Mutex;
    std::condition_variable m_AllHitBarrierCondition;
    int m_Count;                   // threads still missing in the current round
    const int m_InitialCount;      // number of participants
    unsigned int m_BarrierCounter; // number of completed rounds
};
// globals
extern std::mutex gMutex;
extern unsigned int globalID[3];
extern unsigned int localID[3];
extern unsigned int localSize[3];
extern std::map<std::thread::id, uint32_t> threadIDToLocalIDmap;
extern SynchronizationBarrier *pGlobalBarrier;
typedef struct taguint2 {
taguint2(uint x, uint y) {
this->x = x;
this->y = y;
}
taguint2() {
this->x = 0;
this->y = 0;
}
uint x;
uint y;
} uint2;
typedef struct taguint3 {
taguint3(uint x, uint y, uint z) {
this->x = x;
this->y = y;
this->z = z;
}
taguint3() {
this->x = 0;
this->y = 0;
this->z = 0;
}
uint x;
uint y;
uint z;
} uint3;
// Host stand-in for the OpenCL uint4 vector type. There is no default constructor: all four
// components have to be given.
typedef struct taguint4 {
    taguint4(uint x, uint y, uint z, uint w) : x(x), y(y), z(z), w(w) {}

    uint x;
    uint y;
    uint z;
    uint w;
} uint4;
// Host stand-in for the OpenCL int2 vector type. There is no default constructor.
typedef struct tagint2 {
    tagint2(int x, int y) : x(x), y(y) {}

    int x;
    int y;
} int2;
// Host stand-in for the OpenCL int3 vector type. There is no default constructor.
typedef struct tagint3 {
    tagint3(int x, int y, int z) : x(x), y(y), z(z) {}

    int x;
    int y;
    int z;
} int3;
// Host stand-in for the OpenCL int4 vector type. There is no default constructor.
typedef struct tagint4 {
    tagint4(int x, int y, int z, int w) : x(x), y(y), z(z), w(w) {}

    int x;
    int y;
    int z;
    int w;
} int4;
// Host stand-in for the OpenCL ushort2 vector type. There is no default constructor.
typedef struct tagushort2 {
    tagushort2(ushort x, ushort y) : x(x), y(y) {}

    unsigned short x;
    unsigned short y;
} ushort2;
// Host stand-in for the OpenCL ushort8 vector type; the lanes are only exposed as a raw array.
struct tagushort8 {
    unsigned short xxx[8];
};
typedef tagushort8 ushort8;
// Host stand-in for the OpenCL ushort16 vector type; the lanes are only exposed as a raw array.
struct tagushort16 {
    unsigned short xxx[16];
};
typedef tagushort16 ushort16;
uint4 operator+(uint4 const &a, uint4 const &b);
int4 operator+(int4 const &a, int4 const &b);
// Description of a linear image used by the simulated image built-ins.
struct tagimage {
    char *ptr;            // start of the pixel storage
    uint width;           // dimensions, in pixels
    uint height;
    uint depth;
    uint bytesPerChannel; // bytes stored per channel of a pixel
    uint channels;        // number of channels per pixel
};
typedef tagimage image;
// images as pointer
typedef image *image1d_t;
typedef image *image2d_t;
typedef image *image3d_t;
// OpenCL keywords
#define __global
#define __local
#define __private
#define __kernel
#define __attribute__(...)
#define __read_only
#define __write_only
#define queue_t void *
// Emulated event handle: a single uint that converts to and from void* and uint.
struct clk_event_t {
    // A default-constructed event holds the value 0.
    clk_event_t() : value(0) {}

    // Implicit construction from a pointer; the pointer bits are narrowed to uint.
    clk_event_t(void *v) : value(static_cast<uint>(reinterpret_cast<uintptr_t>(v))) {}

    // Explicit conversion back to a pointer (widening the stored uint).
    explicit operator void *() const {
        const uintptr_t widened = static_cast<uintptr_t>(value);
        return reinterpret_cast<void *>(widened);
    }

    // Implicit conversion to the stored value.
    operator uint() {
        return static_cast<uint>(value);
    }

    // Assigning a uint overwrites the stored value; nothing is returned.
    void operator=(uint input) {
        value = input;
    }

    uint value;
};
// OpenCL builtins
// Every macro argument is parenthesized so that an argument which is itself an
// expression (e.g. `a + b` or `x = 0`) keeps its own precedence after expansion.

// Converts `var` to `type` with a C-style cast.
// NOTE(review): this is a value conversion; OpenCL C's as_type reinterprets the bits —
// confirm that callers only use it where the two agree.
#define __builtin_astype(var, type) \
    (                               \
        (type)(var))

// Scalar select: yields `b` when `c` is non-zero, otherwise `a`.
#define select(a, b, c) ((c) ? (b) : (a))
// Host-side declarations of the work-item, barrier, image and conversion builtins used
// by the emulated kernels. Only declared here; the definitions live elsewhere.
// `dim` selects the dimension (the global arrays in this header have 3 entries).
uint get_local_id(int dim);
uint get_global_id(int dim);
uint get_local_size(int dim);
uint get_num_groups(int dim);
uint get_group_id(int dim);
void barrier(int x);
uint4 read_imageui(image *im, int4 coord);
// NOTE(review): declared as returning uint4, whereas OpenCL C's write_imageui returns void.
uint4 write_imageui(image *im, uint4 coord, uint4 color);
uchar convert_uchar_sat(uint c);
ushort convert_ushort_sat(uint c);
// Declares a local `__LOCAL_ID__` in the enclosing scope and sets it to get_local_id(0).
// The expansion already ends in a semicolon and introduces a declaration, so it can be
// used only once per scope.
#define EMULATION_ENTER_FUNCTION() \
uint __LOCAL_ID__ = 0; \
__LOCAL_ID__ = get_local_id(0);
// Emulated atomic exchange: stores `val` (converted to TYPE) into *dest while holding
// the global gMutex. The lock_guard releases the mutex on every exit path, including
// when the conversion or assignment throws.
// NOTE(review): OpenCL C's atomic_xchg returns the previous value; this emulation
// returns nothing — confirm no kernel relies on the result.
template <class TYPE, class TYPE2>
void atomic_xchg(TYPE *dest, TYPE2 val) {
    std::lock_guard<std::mutex> lock(gMutex);
    dest[0] = (TYPE)val;
}
// Emulated atomic add: adds `second` (converted to TYPE) to *first while holding the
// global gMutex and returns the value *first held before the addition.
// The lock_guard releases the mutex on every exit path, including exceptions.
template <class TYPE, class TYPE2>
TYPE atomic_add(TYPE *first, TYPE2 second) {
    std::lock_guard<std::mutex> lock(gMutex);
    TYPE previous = first[0];
    first[0] = (TYPE)(previous + (TYPE)second);
    return previous;
}
// Emulated atomic subtract: subtracts `second` from *first while holding the global
// gMutex and returns the value *first held before the subtraction.
// The lock_guard releases the mutex on every exit path, including exceptions.
template <class TYPE, class TYPE2>
TYPE atomic_sub(TYPE *first, TYPE2 second) {
    std::lock_guard<std::mutex> lock(gMutex);
    TYPE previous = first[0];
    first[0] = previous - second;
    return previous;
}
// Emulated atomic increment: adds 1 to *first while holding the global gMutex and
// returns the value *first held before the increment.
// The lock_guard releases the mutex on every exit path, including exceptions.
template <class TYPE>
TYPE atomic_inc(TYPE *first) {
    std::lock_guard<std::mutex> lock(gMutex);
    TYPE previous = first[0];
    first[0] = previous + 1;
    return previous;
}
// Emulated atomic decrement: subtracts 1 from *first while holding the global gMutex
// and returns the value *first held before the decrement.
// The lock_guard releases the mutex on every exit path, including exceptions.
template <class TYPE>
TYPE atomic_dec(TYPE *first) {
    std::lock_guard<std::mutex> lock(gMutex);
    TYPE previous = first[0];
    first[0] = previous - 1;
    return previous;
}
// Emulated atomic minimum: replaces *first with the smaller of *first and `second`
// (converted to TYPE) while holding the global gMutex, and returns the value *first
// held before the update.
// The lock_guard releases the mutex on every exit path, including exceptions.
template <class TYPE, class TYPE2>
TYPE atomic_min(TYPE *first, TYPE2 second) {
    std::lock_guard<std::mutex> lock(gMutex);
    TYPE previous = first[0];
    first[0] = (TYPE)((TYPE)second < previous ? (TYPE)second : previous);
    return previous;
}
}

View File

@@ -0,0 +1,36 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "runtime/builtin_kernels_simulation/scheduler_simulation.h"
#include "runtime/builtin_kernels_simulation/opencl_c.h"
#include <thread>
using namespace std;
using namespace OCLRT;
// Definitions of the scheduler-simulation globals declared extern in scheduler_simulation.h.
namespace BuiltinKernelsSimulation {
// Starts out false; SchedulerSimulation::initializeSchedulerSimulation sets it to true
// once the worker threads have been created.
bool conditionReady = false;
// One slot per emulated work-item. The simulation assigns and joins slots
// 1..NUM_OF_THREADS-1; local id 0 runs on the calling thread, so slot 0 stays unused.
std::thread threads[NUM_OF_THREADS];
} // namespace BuiltinKernelsSimulation

View File

@@ -0,0 +1,92 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include <cstdint>
#include <thread>
#include "runtime/builtin_kernels_simulation/opencl_c.h"
namespace OCLRT {
class GraphicsAllocation;
}
namespace BuiltinKernelsSimulation {
// Simulation-wide state shared by all SchedulerSimulation<GfxFamily> instantiations.
// Set to true by initializeSchedulerSimulation after the worker threads are created.
extern bool conditionReady;
// Worker thread slots; declared with unknown bound here, the size is fixed at the definition.
extern std::thread threads[];
// Runs the scheduler builtin kernel on host threads instead of on the device.
// Each entry point takes the same nine graphics allocations and passes them unchanged
// to every emulated work-item.
template <typename GfxFamily>
class SchedulerSimulation {
public:
// Entry point: records that a simulation was requested (simulationRun) and, when
// `enabled` is true, initializes the emulation state, runs local id 0 on the calling
// thread, joins the workers and cleans up.
void runSchedulerSimulation(OCLRT::GraphicsAllocation *queue,
OCLRT::GraphicsAllocation *commandsStack,
OCLRT::GraphicsAllocation *eventsPool,
OCLRT::GraphicsAllocation *secondaryBatchBuffer,
OCLRT::GraphicsAllocation *dsh,
OCLRT::GraphicsAllocation *reflectionSurface,
OCLRT::GraphicsAllocation *queueStorageBuffer,
OCLRT::GraphicsAllocation *ssh,
OCLRT::GraphicsAllocation *debugQueue);
// Clears the thread-id map and deletes the global barrier.
void cleanSchedulerSimulation();
// Body executed by one emulated work-item; `index` is the id handed out by the caller
// (0 for the calling thread, 1..NUM_OF_THREADS-1 for the spawned workers).
// The definition is not part of this header.
static void startScheduler(uint32_t index,
OCLRT::GraphicsAllocation *queue,
OCLRT::GraphicsAllocation *commandsStack,
OCLRT::GraphicsAllocation *eventsPool,
OCLRT::GraphicsAllocation *secondaryBatchBuffer,
OCLRT::GraphicsAllocation *dsh,
OCLRT::GraphicsAllocation *reflectionSurface,
OCLRT::GraphicsAllocation *queueStorageBuffer,
OCLRT::GraphicsAllocation *ssh,
OCLRT::GraphicsAllocation *debugQueue);
// Sets the local size to NUM_OF_THREADS x 1 x 1, resets the thread-id map, creates the
// global barrier and spawns worker threads 1..NUM_OF_THREADS-1.
void initializeSchedulerSimulation(OCLRT::GraphicsAllocation *queue,
OCLRT::GraphicsAllocation *commandsStack,
OCLRT::GraphicsAllocation *eventsPool,
OCLRT::GraphicsAllocation *secondaryBatchBuffer,
OCLRT::GraphicsAllocation *dsh,
OCLRT::GraphicsAllocation *reflectionSurface,
OCLRT::GraphicsAllocation *queueStorageBuffer,
OCLRT::GraphicsAllocation *ssh,
OCLRT::GraphicsAllocation *debugQueue);
// NOTE(review): definition not visible here; judging by the name it patches a GPGPU
// walker command inside the secondary batch buffer — confirm against the implementation.
static void patchGpGpuWalker(uint secondLevelBatchOffset,
__global uint *secondaryBatchBuffer,
uint interfaceDescriptorOffset,
uint simdSize,
uint totalLocalWorkSize,
uint3 dimSize,
uint3 startPoint,
uint numberOfHwThreadsPerWg,
uint indirectPayloadSize,
uint ioHoffset);
// When false, runSchedulerSimulation only sets simulationRun and does no work.
static bool enabled;
// Set to true by every call to runSchedulerSimulation, whether enabled or not.
static bool simulationRun;
};
// Static member definitions; each GfxFamily instantiation gets its own pair of flags.
// The simulation is enabled by default.
template <typename GfxFamily>
bool SchedulerSimulation<GfxFamily>::enabled = true;
// No simulation has been requested until runSchedulerSimulation is called.
template <typename GfxFamily>
bool SchedulerSimulation<GfxFamily>::simulationRun = false;
} // namespace BuiltinKernelsSimulation

View File

@@ -0,0 +1,112 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "runtime/memory_manager/graphics_allocation.h"
#include "runtime/builtin_kernels_simulation/scheduler_simulation.h"
#include <cstdint>
#include <mutex>
#include <thread>
using namespace std;
using namespace OCLRT;
namespace BuiltinKernelsSimulation {
// Releases the per-run emulation state: forgets all thread-id mappings and destroys
// the global barrier.
template <typename GfxFamily>
void SchedulerSimulation<GfxFamily>::cleanSchedulerSimulation() {
    threadIDToLocalIDmap.clear();
    delete pGlobalBarrier;
    // Reset the pointer so that a repeated clean does not delete the same barrier twice
    // and no stale pointer survives the run.
    pGlobalBarrier = nullptr;
}
// Prepares the global emulation state for one scheduler run and launches the workers.
// All nine allocations are passed unchanged to every worker's startScheduler call.
template <typename GfxFamily>
void SchedulerSimulation<GfxFamily>::initializeSchedulerSimulation(GraphicsAllocation *queue,
                                                                   GraphicsAllocation *commandsStack,
                                                                   GraphicsAllocation *eventsPool,
                                                                   GraphicsAllocation *secondaryBatchBuffer,
                                                                   GraphicsAllocation *dsh,
                                                                   GraphicsAllocation *reflectionSurface,
                                                                   GraphicsAllocation *queueStorageBuffer,
                                                                   GraphicsAllocation *ssh,
                                                                   GraphicsAllocation *debugQueue) {
    // The emulated work-group is one-dimensional: NUM_OF_THREADS x 1 x 1.
    localSize[0] = NUM_OF_THREADS;
    localSize[1] = 1;
    localSize[2] = 1;

    threadIDToLocalIDmap.clear();
    pGlobalBarrier = new SynchronizationBarrier(NUM_OF_THREADS);

    // Local id 0 is executed by the calling thread, so workers start at index 1.
    for (uint32_t workerIndex = 1; workerIndex < NUM_OF_THREADS; ++workerIndex) {
        threads[workerIndex] = std::thread(startScheduler,
                                           workerIndex,
                                           queue,
                                           commandsStack,
                                           eventsPool,
                                           secondaryBatchBuffer,
                                           dsh,
                                           reflectionSurface,
                                           queueStorageBuffer,
                                           ssh,
                                           debugQueue);
    }

    conditionReady = true;
}
// Runs one complete scheduler simulation on host threads.
// simulationRun is set on every call; the remaining work happens only when `enabled`
// is true. The calling thread acts as local id 0, then joins the workers and releases
// the emulation state.
template <typename GfxFamily>
void SchedulerSimulation<GfxFamily>::runSchedulerSimulation(GraphicsAllocation *queue,
                                                            GraphicsAllocation *commandsStack,
                                                            GraphicsAllocation *eventsPool,
                                                            GraphicsAllocation *secondaryBatchBuffer,
                                                            GraphicsAllocation *dsh,
                                                            GraphicsAllocation *reflectionSurface,
                                                            GraphicsAllocation *queueStorageBuffer,
                                                            GraphicsAllocation *ssh,
                                                            GraphicsAllocation *debugQueue) {
    simulationRun = true;
    if (!enabled) {
        return;
    }

    initializeSchedulerSimulation(queue,
                                  commandsStack,
                                  eventsPool,
                                  secondaryBatchBuffer,
                                  dsh,
                                  reflectionSurface,
                                  queueStorageBuffer,
                                  ssh,
                                  debugQueue);

    // start main thread with LID == 0
    startScheduler(0,
                   queue,
                   commandsStack,
                   eventsPool,
                   secondaryBatchBuffer,
                   dsh,
                   reflectionSurface,
                   queueStorageBuffer,
                   ssh,
                   debugQueue);

    // Wait for all threads on main thread
    // NOTE(review): operator[] inserts a zero entry when this thread is not in the map,
    // and the map is read here without taking gMutex while workers may still be running
    // — confirm how startScheduler synchronizes its map updates.
    if (threadIDToLocalIDmap[std::this_thread::get_id()] == 0) {
        for (uint32_t i = 1; i < NUM_OF_THREADS; i++) {
            threads[i].join();
        }
        cleanSchedulerSimulation();
    }
}
} // namespace BuiltinKernelsSimulation

View File

@@ -0,0 +1,456 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "runtime/command_queue/command_queue.h"
#include "runtime/command_queue/command_queue_hw.h"
#include "runtime/command_stream/command_stream_receiver.h"
#include "runtime/context/context.h"
#include "runtime/device/device.h"
#include "runtime/device_queue/device_queue.h"
#include "runtime/event/event.h"
#include "runtime/helpers/aligned_memory.h"
#include "runtime/helpers/array_count.h"
#include "runtime/helpers/get_info.h"
#include "hw_info.h"
#include "runtime/helpers/options.h"
#include "runtime/helpers/ptr_math.h"
#include "runtime/mem_obj/buffer.h"
#include "runtime/mem_obj/image.h"
#include "runtime/helpers/surface_formats.h"
#include "runtime/memory_manager/memory_manager.h"
#include "runtime/helpers/string.h"
#include "CL/cl_ext.h"
#include "runtime/utilities/api_intercept.h"
#include "runtime/helpers/convert_color.h"
#include "runtime/helpers/queue_helpers.h"
#include <map>
namespace OCLRT {
// Global table of create functions
// Indexed by the device's render core family (see CommandQueue::create). `= {}`
// zero-initializes every slot; the entries are filled in elsewhere (not visible in this file).
CommandQueueCreateFunc commandQueueFactory[IGFX_MAX_CORE] = {};
// Factory: builds the command queue through the creator registered for the device's
// render core family.
// retVal is always set to CL_SUCCESS; a missing creator only trips DEBUG_BREAK_IF.
CommandQueue *CommandQueue::create(Context *context,
                                   Device *device,
                                   const cl_queue_properties *properties,
                                   cl_int &retVal) {
    retVal = CL_SUCCESS;

    const auto coreFamily = device->getRenderCoreFamily();
    auto createQueue = commandQueueFactory[coreFamily];
    DEBUG_BREAK_IF(nullptr == createQueue);

    return createQueue(context, device, properties);
}
// Default constructor: delegates to the main constructor with no context, no device
// and no properties.
CommandQueue::CommandQueue() : CommandQueue(nullptr, nullptr, nullptr) {}
// Main constructor: zero-initializes the queue state, takes an internal reference on
// the context (when one is given), clears all indirect heap slots and derives the
// queue properties from `properties`.
CommandQueue::CommandQueue(Context *context,
                           Device *deviceId,
                           const cl_queue_properties *properties) : low_priority(false),
                                                                    taskCount(0),
                                                                    taskLevel(0),
                                                                    virtualEvent(nullptr),
                                                                    context(context),
                                                                    device(deviceId),
                                                                    perfCountersEnabled(false),
                                                                    perfCountersConfig(UINT32_MAX),
                                                                    perfCountersUserRegistersNumber(0),
                                                                    perfConfigurationData(nullptr),
                                                                    perfCountersRegsCfgHandle(0),
                                                                    perfCountersRegsCfgPending(false),
                                                                    commandStream(nullptr) {
    // The queue holds an internal reference on its context; the destructor releases it.
    if (context != nullptr) {
        context->incRefInternal();
    }

    // Heaps are created on demand by getIndirectHeap().
    for (int heapIndex = 0; heapIndex < NUM_HEAPS; ++heapIndex) {
        indirectHeap[heapIndex] = nullptr;
    }

    commandQueueProperties = getCmdQueueProperties<cl_command_queue_properties>(properties);
    flushStamp.reset(new FlushStampTracker(true));
}
// Destructor: detaches the virtual event, hands the command stream and indirect heap
// allocations to the memory manager for reuse, frees the queue-owned objects and
// releases the context reference taken in the constructor.
CommandQueue::~CommandQueue() {
    if (virtualEvent != nullptr) {
        // The virtual event must belong to this queue or to no queue at all.
        UNRECOVERABLE_IF(this->virtualEvent->getCommandQueue() != this && this->virtualEvent->getCommandQueue() != nullptr);
        virtualEvent->setCurrentCmdQVirtualEvent(false);
        virtualEvent->decRefInternal();
    }

    if (device != nullptr) {
        auto memoryManager = device->getMemoryManager();
        DEBUG_BREAK_IF(nullptr == memoryManager);

        // Recycle the command stream's backing allocation, then detach it from the
        // stream before the stream itself is deleted.
        if (commandStream != nullptr && commandStream->getGraphicsAllocation() != nullptr) {
            memoryManager->storeAllocation(std::unique_ptr<GraphicsAllocation>(commandStream->getGraphicsAllocation()), REUSABLE_ALLOCATION);
            commandStream->replaceGraphicsAllocation(nullptr);
        }
        delete commandStream;

        for (int heapIndex = 0; heapIndex < NUM_HEAPS; ++heapIndex) {
            auto heap = indirectHeap[heapIndex];
            if (heap == nullptr) {
                continue;
            }
            auto heapAllocation = heap->getGraphicsAllocation();
            if (heapAllocation != nullptr) {
                memoryManager->storeAllocation(std::unique_ptr<GraphicsAllocation>(heapAllocation), REUSABLE_ALLOCATION);
            }
            delete heap;
        }

        // Deleting a null pointer is a no-op, so no guard is needed.
        delete perfConfigurationData;

        if (this->perfCountersEnabled) {
            device->getPerformanceCounters()->shutdown();
        }
    }

    // Special queues do not release the context reference.
    if (context != nullptr && !context->isSpecialQueue(this)) {
        context->decRefInternal();
    }
}
// Returns the current value of the hardware tag by reading through the address
// supplied by getHwTagAddress().
uint32_t CommandQueue::getHwTag() const {
    return *getHwTagAddress();
}
// Returns the address of the device's hardware tag after asking the command stream
// receiver to make that memory coherent.
volatile uint32_t *CommandQueue::getHwTagAddress() const {
    DEBUG_BREAK_IF(!this->device);
    auto &commandStreamReceiver = device->getCommandStreamReceiver();
    auto tag_address = commandStreamReceiver.getTagAddress();
    // Pass the size of the tag itself; sizeof(tag_address) would be the size of the
    // pointer (8 bytes on 64-bit builds) and would cover memory past the 32-bit tag.
    commandStreamReceiver.makeCoherent((void *)tag_address, sizeof(*tag_address));
    return tag_address;
}
// Reports whether the hardware tag has reached `taskCount`.
bool CommandQueue::isCompleted(uint32_t taskCount) const {
    const uint32_t currentTag = getHwTag();
    DEBUG_BREAK_IF(currentTag == Event::eventNotReady);
    return taskCount <= currentTag;
}
// Blocks until the device's command stream receiver reports that `taskCountToWait` has
// been reached, then records that count in latestTaskCountWaited.
// `flushStampToWait` is forwarded to the receiver's wait call unchanged.
void CommandQueue::waitUntilComplete(uint32_t taskCountToWait, FlushStamp flushStampToWait) {
// WAIT_ENTER / WAIT_LEAVE bracket the wait; their expansion is defined elsewhere.
WAIT_ENTER()
DBG_LOG(LogTaskCounts, __FUNCTION__, "Waiting for taskCount:", taskCountToWait);
DBG_LOG(LogTaskCounts, __FUNCTION__, "Line: ", __LINE__, "Current taskCount:", getHwTag());
device->getCommandStreamReceiver().waitForTaskCountWithKmdNotifyFallback(taskCountToWait, flushStampToWait);
// After the wait returns the hardware tag is expected to have reached the requested count.
DEBUG_BREAK_IF(getHwTag() < taskCountToWait);
latestTaskCountWaited = taskCountToWait;
WAIT_LEAVE()
}
// Reports whether the queue is still blocked on its virtual event.
// Returns false when there is no virtual event or when it has completed; in the latter
// case the queue adopts the event's task count, flush stamp and task level (or resets
// them if the event was terminated) and drops its reference to the event.
bool CommandQueue::isQueueBlocked() {
    TakeOwnershipWrapper<CommandQueue> takeOwnershipWrapper(*this);

    // Without a virtual event nothing can block the queue.
    if (!this->virtualEvent) {
        return false;
    }
    // The event exists but has not completed yet: still blocked.
    if (!this->virtualEvent->peekIsCompleted()) {
        return true;
    }

    UNRECOVERABLE_IF(this->virtualEvent == nullptr);

    if (this->virtualEvent->peekIsCompletedByTermination()) {
        // All commands before this point were aborted, so the queue counters may be reset.
        taskCount = 0;
        flushStamp->setStamp(0);
        taskLevel = getDevice().getCommandStreamReceiver().peekTaskLevel();
    } else {
        taskCount = this->virtualEvent->peekTaskCount();
        flushStamp->setStamp(this->virtualEvent->flushStamp->peekStamp());
        taskLevel = this->virtualEvent->taskLevel;
        // An in-order queue moves on to the next task level.
        if (!isOOQEnabled()) {
            taskLevel++;
        }
    }

    DebugManager.log(DebugManager.flags.EventsDebugEnable.get(), "isQueueBlocked taskLevel change from", taskLevel, "to new from virtualEvent", this->virtualEvent, "new tasklevel", this->virtualEvent->taskLevel.load());

    // The driver added exactly one reference for the virtual event; release it and
    // close access to the event.
    this->virtualEvent->decRefInternal();
    this->virtualEvent = nullptr;
    return false;
}
// Answers a command-queue info query by forwarding all arguments, together with this
// queue, to the shared getQueueInfo<CommandQueue> helper and returning its status.
cl_int CommandQueue::getCommandQueueInfo(cl_command_queue_info paramName,
size_t paramValueSize,
void *paramValue,
size_t *paramValueSizeRet) {
return getQueueInfo<CommandQueue>(this, paramName, paramValueSize, paramValue, paramValueSizeRet);
}
uint32_t CommandQueue::getTaskLevelFromWaitList(uint32_t taskLevel,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList) {
for (auto iEvent = 0u; iEvent < numEventsInWaitList; ++iEvent) {
auto pEvent = (Event *)(eventWaitList[iEvent]);
uint32_t eventTaskLevel = pEvent->taskLevel;
taskLevel = std::max(taskLevel, eventTaskLevel);
}
return taskLevel;
}
// Returns the indirect heap of the requested type, creating it or replacing its backing
// allocation when the heap has no memory or has less than `minRequiredSize` bytes free.
// A `minRequiredSize` of 0 requests an empty heap (size 0).
IndirectHeap &CommandQueue::getIndirectHeap(IndirectHeap::Type heapType,
size_t minRequiredSize) {
DEBUG_BREAK_IF(static_cast<uint32_t>(heapType) >= ARRAY_COUNT(indirectHeap));
auto &heap = indirectHeap[heapType];
GraphicsAllocation *heapMemory = nullptr;
DEBUG_BREAK_IF(nullptr == device);
auto memoryManager = device->getMemoryManager();
DEBUG_BREAK_IF(nullptr == memoryManager);
if (heap)
heapMemory = heap->getGraphicsAllocation();
// Not enough room left: hand the current allocation to the memory manager for reuse.
// This happens before obtainReusableAllocation below, so the order matters.
if (heap && heap->getAvailableSpace() < minRequiredSize && heapMemory) {
memoryManager->storeAllocation(std::unique_ptr<GraphicsAllocation>(heapMemory), REUSABLE_ALLOCATION);
heapMemory = nullptr;
}
if (!heapMemory) {
// Heap should be at least minHeapSize unless we're requesting an empty heap
size_t minHeapSize = 64 * KB;
// Surface state heaps get one page less than the 64 KB default.
if (IndirectHeap::SURFACE_STATE == heapType) {
minHeapSize -= MemoryConstants::pageSize;
}
minRequiredSize = minRequiredSize ? std::max(minRequiredSize, minHeapSize) : 0;
// Round the size up to a whole number of cache lines.
minRequiredSize = minRequiredSize > 0 ? alignUp(minRequiredSize, MemoryConstants::cacheLineSize) : 0;
const size_t heapAlignment = MemoryConstants::pageSize;
// Prefer a recycled allocation; fall back to a fresh page-aligned one.
heapMemory = memoryManager->obtainReusableAllocation(minRequiredSize).release();
if (!heapMemory) {
heapMemory = memoryManager->allocateGraphicsMemory(minRequiredSize, heapAlignment);
}
// NOTE(review): heapMemory is dereferenced below without a null check — confirm that
// allocateGraphicsMemory cannot return nullptr here.
if (heap) {
heap->replaceBuffer(heapMemory->getUnderlyingBuffer(), minRequiredSize);
heap->replaceGraphicsAllocation(heapMemory);
} else {
heap = new IndirectHeap(heapMemory);
heap->overrideMaxSize(minRequiredSize);
}
}
return *heap;
}
// Detaches the backing allocation from the heap of the given type and hands it to the
// memory manager for reuse. The heap object itself is kept, left with no buffer.
void CommandQueue::releaseIndirectHeap(IndirectHeap::Type heapType) {
    DEBUG_BREAK_IF(static_cast<uint32_t>(heapType) >= ARRAY_COUNT(indirectHeap));
    auto &heap = indirectHeap[heapType];

    DEBUG_BREAK_IF(nullptr == device);
    auto memoryManager = device->getMemoryManager();
    DEBUG_BREAK_IF(nullptr == memoryManager);

    // Nothing to release when the heap was never created.
    if (!heap) {
        return;
    }

    auto backingAllocation = heap->getGraphicsAllocation();
    if (backingAllocation != nullptr) {
        memoryManager->storeAllocation(std::unique_ptr<GraphicsAllocation>(backingAllocation), REUSABLE_ALLOCATION);
    }
    heap->replaceBuffer(nullptr, 0);
    heap->replaceGraphicsAllocation(nullptr);
}
// Returns the queue's command stream, creating it on first use and replacing its backing
// allocation when fewer than `minRequiredSize` plus the reserved CSR space bytes are free.
LinearStream &CommandQueue::getCS(size_t minRequiredSize) {
DEBUG_BREAK_IF(nullptr == device);
auto &commandStreamReceiver = device->getCommandStreamReceiver();
auto memoryManager = commandStreamReceiver.getMemoryManager();
DEBUG_BREAK_IF(nullptr == memoryManager);
// Created lazily with no buffer; the size check below then triggers the first allocation.
if (!commandStream) {
commandStream = new LinearStream(nullptr);
}
// Make sure we have enough room for any CSR additions
minRequiredSize += CSRequirements::minCommandQueueCommandStreamSize;
if (commandStream->getAvailableSpace() < minRequiredSize) {
// If not, allocate a new block. allocate full pages
minRequiredSize = alignUp(minRequiredSize, MemoryConstants::pageSize);
// The allocation is larger than the usable size by csOverfetchSize.
auto requiredSize = minRequiredSize + CSRequirements::csOverfetchSize;
// The new block is obtained before the old one is stored for reuse, so the old
// allocation cannot be handed straight back.
GraphicsAllocation *allocation = memoryManager->obtainReusableAllocation(requiredSize).release();
if (!allocation) {
allocation = memoryManager->allocateGraphicsMemory(requiredSize, MemoryConstants::pageSize);
}
// Deallocate the old block, if not null
auto oldAllocation = commandStream->getGraphicsAllocation();
if (oldAllocation) {
memoryManager->storeAllocation(std::unique_ptr<GraphicsAllocation>(oldAllocation), REUSABLE_ALLOCATION);
}
// NOTE(review): `allocation` is dereferenced without a null check — confirm that
// allocateGraphicsMemory cannot return nullptr here.
// The stream is given the page-aligned size minus the space reserved for CSR additions.
commandStream->replaceBuffer(allocation->getUnderlyingBuffer(), minRequiredSize - CSRequirements::minCommandQueueCommandStreamSize);
commandStream->replaceGraphicsAllocation(allocation);
}
return *commandStream;
}
// Acquires every listed shared memory object through its sharing handler, counts the
// acquisition on the object, and then enqueues a marker carrying the wait list.
// When an output event is requested, its command type is overridden with `cmdType`.
// Returns the status of the marker enqueue.
cl_int CommandQueue::enqueueAcquireSharedObjects(cl_uint numObjects, const cl_mem *memObjects, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *oclEvent, cl_uint cmdType) {
    for (unsigned int index = 0; index < numObjects; ++index) {
        auto sharedObject = castToObjectOrAbort<MemObj>(memObjects[index]);
        sharedObject->peekSharingHandler()->acquire(sharedObject);
        sharedObject->acquireCount++;
    }

    auto markerStatus = enqueueMarkerWithWaitList(numEventsInWaitList, eventWaitList, oclEvent);

    if (oclEvent != nullptr) {
        castToObjectOrAbort<Event>(*oclEvent)->setCmdType(cmdType);
    }
    return markerStatus;
}
// Releases every listed shared memory object through its sharing handler, decrements
// the object's acquire count, and then enqueues a marker carrying the wait list.
// When an output event is requested, its command type is overridden with `cmdType`.
// Returns the status of the marker enqueue.
cl_int CommandQueue::enqueueReleaseSharedObjects(cl_uint numObjects, const cl_mem *memObjects, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *oclEvent, cl_uint cmdType) {
    for (unsigned int index = 0; index < numObjects; ++index) {
        auto sharedObject = castToObjectOrAbort<MemObj>(memObjects[index]);
        sharedObject->peekSharingHandler()->release(sharedObject);
        // A release is expected to be matched by an earlier acquire.
        DEBUG_BREAK_IF(sharedObject->acquireCount <= 0);
        sharedObject->acquireCount--;
    }

    auto markerStatus = enqueueMarkerWithWaitList(numEventsInWaitList, eventWaitList, oclEvent);

    if (oclEvent != nullptr) {
        castToObjectOrAbort<Event>(*oclEvent)->setCmdType(cmdType);
    }
    return markerStatus;
}
// Copies the task count, flush stamp and task level of `completionStamp` into the queue.
void CommandQueue::updateFromCompletionStamp(const CompletionStamp &completionStamp) {
    this->taskCount = completionStamp.taskCount;
    this->flushStamp->setStamp(completionStamp.flushStamp);
    this->taskLevel = completionStamp.taskLevel;
}
// Makes sure the work the wait-list events depend on has been sent to the device.
// Flushes batched submissions and, if any event's task count is newer than the latest
// count sent by the command stream receiver, issues a flushTask with implicit flush.
// `ndRangeKernel` is forwarded as dispatchFlags.GSBA32BitRequired.
void CommandQueue::flushWaitList(
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
bool ndRangeKernel) {
bool isQBlocked = false;
//as long as queue is blocked we need to stall.
// Busy-waits (in-order queues only) until isQueueBlocked() returns false.
if (!isOOQEnabled()) {
while ((isQBlocked = isQueueBlocked()))
;
}
// Device ownership is taken after the stall and held until the function returns.
TakeOwnershipWrapper<Device> deviceOwnership(*device);
device->getCommandStreamReceiver().flushBatchedSubmissions();
// NOTE(review): isQBlocked is false on every path that reaches this point (it is only
// assigned by the loop above, which exits when it is false), so this test always passes.
if (!isQBlocked) {
auto taskLevel = getTaskLevelFromWaitList(this->taskLevel, numEventsInWaitList, eventWaitList);
auto &commandStream = getCS();
auto &commandStreamReceiver = device->getCommandStreamReceiver();
bool flushTask = false;
// A flush is needed as soon as one event is newer than what has been sent so far.
for (auto eventId = 0u; eventId < numEventsInWaitList; eventId++) {
Event *event = (Event *)eventWaitList[eventId];
if (event->peekTaskCount() > commandStreamReceiver.peekLatestSentTaskCount()) {
flushTask = true;
break;
}
}
if (flushTask) {
DispatchFlags dispatchFlags;
dispatchFlags.GSBA32BitRequired = ndRangeKernel;
dispatchFlags.low_priority = low_priority;
dispatchFlags.implicitFlush = true;
dispatchFlags.preemptionMode = PreemptionHelper::taskPreemptionMode(*device, nullptr);
DEBUG_BREAK_IF(taskLevel >= Event::eventNotReady);
// Heaps are requested with size 0, i.e. without forcing any allocation size.
commandStreamReceiver.flushTask(
commandStream,
commandStream.getUsed(),
getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 0),
getIndirectHeap(IndirectHeap::INSTRUCTION, 0),
getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0),
getIndirectHeap(IndirectHeap::SURFACE_STATE, 0),
taskLevel + 1,
dispatchFlags);
}
}
}
// Enables or disables performance counters for this queue with the given configuration.
// Returns true when the requested state is already active or was applied; returns false
// when counters are unavailable or no register configuration exists for `configuration`
// (in both failure cases the counters are shut down again and the queue state is unchanged).
bool CommandQueue::setPerfCountersEnabled(bool perfCountersEnabled, cl_uint configuration) {
DEBUG_BREAK_IF(device == nullptr);
// Already in the requested state: nothing to do (the configuration is not compared).
if (perfCountersEnabled == this->perfCountersEnabled) {
return true;
}
auto perfCounters = device->getPerformanceCounters();
if (perfCountersEnabled) {
perfCounters->enable();
if (!perfCounters->isAvailable()) {
perfCounters->shutdown();
return false;
}
// NOTE(review): a previous perfConfigurationData (from an earlier enable) is overwritten
// here without being freed — confirm who owns the object returned by getPmRegsCfg.
perfConfigurationData = perfCounters->getPmRegsCfg(configuration);
if (perfConfigurationData == nullptr) {
perfCounters->shutdown();
return false;
}
// Count the user registers: one slot per register, two for registers wider than 32 bits.
// NOTE(review): the counter is never reset, so enable -> disable -> enable accumulates
// on top of the previous total — confirm whether that is intended.
InstrReadRegsCfg *pUserCounters = &perfConfigurationData->readRegs;
for (uint32_t i = 0; i < pUserCounters->regsCount; ++i) {
perfCountersUserRegistersNumber++;
if (pUserCounters->reg[i].bitSize > 32) {
perfCountersUserRegistersNumber++;
}
}
} else {
if (perfCounters->isAvailable()) {
perfCounters->shutdown();
}
}
this->perfCountersConfig = configuration;
this->perfCountersEnabled = perfCountersEnabled;
return true;
}
// Returns the performance counters object of the queue's device.
// `device` is dereferenced without a check, so the queue must have a device.
PerformanceCounters *CommandQueue::getPerfCounters() {
return device->getPerformanceCounters();
}
// Sends the stored register configuration through the device's performance counters,
// passing the queue's config handle and pending flag by address so the callee can
// update them. Returns the callee's result unchanged.
bool CommandQueue::sendPerfCountersConfig() {
return getPerfCounters()->sendPmRegsCfgCommands(perfConfigurationData, &perfCountersRegsCfgHandle, &perfCountersRegsCfgPending);
}
} // namespace OCLRT

View File

@@ -0,0 +1,422 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include "runtime/api/cl_types.h"
#include "runtime/indirect_heap/indirect_heap.h"
#include "runtime/helpers/base_object.h"
#include "runtime/helpers/completion_stamp.h"
#include "runtime/helpers/flush_stamp.h"
#include "runtime/event/user_event.h"
#include "runtime/os_interface/performance_counters.h"
#include <atomic>
#include <cstdint>
namespace OCLRT {
class Buffer;
class LinearStream;
class Context;
class Device;
class Image;
class IndirectHeap;
class Kernel;
class MemObj;
// Maps the OpenCL API handle type _cl_command_queue to its implementation class,
// CommandQueue, via the DerivedType alias.
template <>
struct OpenCLObjectMapper<_cl_command_queue> {
typedef class CommandQueue DerivedType;
};
////////////////////////////////////////////////////////////////////////////////
// CommandQueue - Core implementation
////////////////////////////////////////////////////////////////////////////////
class CommandQueue : public BaseObject<_cl_command_queue> {
public:
static const cl_ulong objectMagic = 0x1234567890987654LL;
enum { NUM_HEAPS = IndirectHeap::NUM_TYPES };
bool low_priority;
static CommandQueue *create(Context *context, Device *device,
const cl_queue_properties *properties,
cl_int &errcodeRet);
CommandQueue();
CommandQueue(Context *context, Device *device,
const cl_queue_properties *properties);
CommandQueue &operator=(const CommandQueue &) = delete;
CommandQueue(const CommandQueue &) = delete;
~CommandQueue() override;
// API entry points
virtual cl_int
enqueueCopyImage(Image *srcImage, Image *dstImage, const size_t srcOrigin[3],
const size_t dstOrigin[3], const size_t region[3],
cl_uint numEventsInWaitList, const cl_event *eventWaitList,
cl_event *event) {
return CL_SUCCESS;
}
virtual cl_int enqueueFillImage(Image *image, const void *fillColor,
const size_t *origin, const size_t *region,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) {
return CL_SUCCESS;
}
virtual cl_int enqueueFillBuffer(Buffer *buffer, const void *pattern,
size_t patternSize, size_t offset,
size_t size, cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) {
return CL_SUCCESS;
}
virtual cl_int enqueueKernel(cl_kernel kernel, cl_uint workDim,
const size_t *globalWorkOffset,
const size_t *globalWorkSize,
const size_t *localWorkSize,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList, cl_event *event) {
return CL_SUCCESS;
}
virtual cl_int enqueueBarrierWithWaitList(cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) {
return CL_SUCCESS;
}
virtual void *enqueueMapBuffer(Buffer *buffer, cl_bool blockingMap,
cl_map_flags mapFlags, size_t offset,
size_t size, cl_uint numEventsInWaitList,
const cl_event *eventWaitList, cl_event *event,
cl_int &errcodeRet) {
errcodeRet = CL_SUCCESS;
return CL_SUCCESS;
}
virtual void *enqueueMapImage(cl_mem image, cl_bool blockingMap,
cl_map_flags mapFlags, const size_t *origin,
const size_t *region, size_t *imageRowPitch,
size_t *imageSlicePitch,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList, cl_event *event,
cl_int &errcodeRet) {
errcodeRet = CL_SUCCESS;
return CL_SUCCESS;
}
virtual cl_int enqueueSVMMap(cl_bool blockingMap, cl_map_flags mapFlags,
void *svmPtr, size_t size,
cl_uint numEventsInWaitList, const cl_event *eventWaitList,
cl_event *event) {
return CL_SUCCESS;
}
virtual cl_int enqueueSVMUnmap(void *svmPtr,
cl_uint numEventsInWaitList, const cl_event *eventWaitList,
cl_event *event) {
return CL_SUCCESS;
}
virtual cl_int enqueueSVMFree(cl_uint numSvmPointers,
void *svmPointers[],
void(CL_CALLBACK *pfnFreeFunc)(cl_command_queue queue,
cl_uint numSvmPointers,
void *svmPointers[],
void *userData),
void *userData,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) {
return CL_SUCCESS;
}
virtual cl_int enqueueSVMMemcpy(cl_bool blockingCopy,
void *dstPtr,
const void *srcPtr,
size_t size,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) {
return CL_SUCCESS;
}
virtual cl_int enqueueSVMMemFill(void *svmPtr,
const void *pattern,
size_t patternSize,
size_t size,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) {
return CL_SUCCESS;
}
virtual cl_int enqueueMarkerWithWaitList(cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) {
return CL_SUCCESS;
}
virtual cl_int enqueueMigrateMemObjects(cl_uint numMemObjects,
const cl_mem *memObjects,
cl_mem_migration_flags flags,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) {
return CL_SUCCESS;
}
virtual cl_int enqueueSVMMigrateMem(cl_uint numSvmPointers,
const void **svmPointers,
const size_t *sizes,
const cl_mem_migration_flags flags,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) {
return CL_SUCCESS;
}
virtual cl_int enqueueCopyBuffer(Buffer *srcBuffer, Buffer *dstBuffer,
size_t srcOffset, size_t dstOffset,
size_t size, cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) {
return CL_SUCCESS;
}
virtual cl_int enqueueReadBuffer(Buffer *buffer, cl_bool blockingRead,
size_t offset, size_t size, void *ptr,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) {
return CL_SUCCESS;
}
virtual cl_int enqueueReadImage(Image *srcImage, cl_bool blockingRead,
const size_t *origin, const size_t *region,
size_t rowPitch, size_t slicePitch, void *ptr,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) {
return CL_SUCCESS;
}
virtual cl_int enqueueUnmapMemObject(MemObj *memObj, void *mappedPtr,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) {
return CL_SUCCESS;
}
virtual cl_int enqueueWriteBuffer(Buffer *buffer, cl_bool blockingWrite,
size_t offset, size_t cb, const void *ptr,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) {
return CL_SUCCESS;
}
virtual cl_int enqueueWriteImage(Image *dstImage, cl_bool blockingWrite,
const size_t *origin, const size_t *region,
size_t inputRowPitch, size_t inputSlicePitch,
const void *ptr, cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) {
return CL_SUCCESS;
}
// Default implementation of enqueueCopyBufferRect: performs no work, ignores
// all of its arguments and reports CL_SUCCESS. CommandQueueHw<GfxFamily>
// declares an override of this method.
virtual cl_int enqueueCopyBufferRect(
    Buffer *srcBuffer,
    Buffer *dstBuffer,
    const size_t *srcOrigin,
    const size_t *dstOrigin,
    const size_t *region,
    size_t srcRowPitch,
    size_t srcSlicePitch,
    size_t dstRowPitch,
    size_t dstSlicePitch,
    cl_uint numEventsInWaitList,
    const cl_event *eventWaitList,
    cl_event *event) {
    return CL_SUCCESS;
}
// Default implementation of enqueueWriteBufferRect: performs no work, ignores
// all of its arguments (nothing is read from `ptr`) and reports CL_SUCCESS.
// CommandQueueHw<GfxFamily> declares an override of this method.
virtual cl_int enqueueWriteBufferRect(
    Buffer *buffer,
    cl_bool blockingWrite,
    const size_t *bufferOrigin,
    const size_t *hostOrigin,
    const size_t *region,
    size_t bufferRowPitch,
    size_t bufferSlicePitch,
    size_t hostRowPitch,
    size_t hostSlicePitch,
    const void *ptr,
    cl_uint numEventsInWaitList,
    const cl_event *eventWaitList,
    cl_event *event) {
    return CL_SUCCESS;
}
// Default implementation of enqueueReadBufferRect: performs no work, ignores
// all of its arguments (nothing is written to `ptr`) and reports CL_SUCCESS.
// CommandQueueHw<GfxFamily> declares an override of this method.
virtual cl_int enqueueReadBufferRect(
    Buffer *buffer,
    cl_bool blockingRead,
    const size_t *bufferOrigin,
    const size_t *hostOrigin,
    const size_t *region,
    size_t bufferRowPitch,
    size_t bufferSlicePitch,
    size_t hostRowPitch,
    size_t hostSlicePitch,
    void *ptr,
    cl_uint numEventsInWaitList,
    const cl_event *eventWaitList,
    cl_event *event) {
    return CL_SUCCESS;
}
// Default implementation of enqueueCopyBufferToImage: performs no work,
// ignores all of its arguments and reports CL_SUCCESS.
// CommandQueueHw<GfxFamily> declares an override of this method.
virtual cl_int enqueueCopyBufferToImage(
    Buffer *srcBuffer,
    Image *dstImage,
    size_t srcOffset,
    const size_t *dstOrigin,
    const size_t *region,
    cl_uint numEventsInWaitList,
    const cl_event *eventWaitList,
    cl_event *event) {
    return CL_SUCCESS;
}
// Default implementation of enqueueCopyImageToBuffer: performs no work,
// ignores all of its arguments and reports CL_SUCCESS.
// CommandQueueHw<GfxFamily> declares an override of this method.
virtual cl_int enqueueCopyImageToBuffer(
    Image *srcImage,
    Buffer *dstBuffer,
    const size_t *srcOrigin,
    const size_t *region,
    size_t dstOffset,
    cl_uint numEventsInWaitList,
    const cl_event *eventWaitList,
    cl_event *event) {
    return CL_SUCCESS;
}
// Declaration only; the definition is not part of this header section.
// Unlike the enqueue* members above, this one is not virtual.
// NOTE(review): presumably enqueues an acquire of `numObjects` shared mem
// objects, with `cmdType` being the command type attached to the resulting
// event - confirm against the definition.
cl_int enqueueAcquireSharedObjects(cl_uint numObjects,
const cl_mem *memObjects,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *oclEvent,
cl_uint cmdType);
// Declaration only; same parameter list as enqueueAcquireSharedObjects and
// likewise non-virtual.
// NOTE(review): presumably the release counterpart of the acquire call -
// confirm against the definition.
cl_int enqueueReleaseSharedObjects(cl_uint numObjects,
const cl_mem *memObjects,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *oclEvent,
cl_uint cmdType);
// Default implementation of finish: ignores `dcFlush`, performs no work and
// reports CL_SUCCESS. CommandQueueHw<GfxFamily> declares an override.
virtual cl_int finish(bool dcFlush) {
    return CL_SUCCESS;
}
// Default implementation of flush: performs no work and reports CL_SUCCESS.
// CommandQueueHw<GfxFamily> declares an override.
virtual cl_int flush() {
    return CL_SUCCESS;
}
// The members below are declarations only; their definitions are not part of
// this header section.

// Takes a CompletionStamp by const reference.
// NOTE(review): presumably refreshes the queue's task bookkeeping from it - confirm.
void updateFromCompletionStamp(const CompletionStamp &completionStamp);
// Query entry point using the usual (name, size, value, size-ret) parameter layout.
cl_int getCommandQueueInfo(cl_command_queue_info paramName,
size_t paramValueSize, void *paramValue,
size_t *paramValueSizeRet);
// Returns a 32-bit hardware tag value; const, so it does not modify the queue.
uint32_t getHwTag() const;
// Returns a pointer to volatile 32-bit storage for the hardware tag.
volatile uint32_t *getHwTagAddress() const;
// NOTE(review): presumably compares `taskCount` against the hardware tag - confirm.
bool isCompleted(uint32_t taskCount) const;
// MOCKABLE_VIRTUAL is a project macro (its expansion is not visible here).
// cpuDataTransferHandler() uses this result to decide whether the queue is blocked.
MOCKABLE_VIRTUAL bool isQueueBlocked();
void waitUntilComplete(uint32_t taskCountToWait, FlushStamp flushStampToWait);
void flushWaitList(cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
bool ndRangeKernel);
// Static helper: derives a task level from the supplied starting level and an
// event wait list. cpuDataTransferHandler() compares the result against
// Event::eventNotReady to detect a blocked queue.
static uint32_t getTaskLevelFromWaitList(uint32_t taskLevel,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList);
// Returns a reference to the object the `device` member points to.
// The pointer is dereferenced without a null check.
Device &getDevice() {
    Device &deviceRef = *device;
    return deviceRef;
}
// Returns a reference to the object the `context` member points to.
// The pointer is dereferenced without a null check; use getContextPtr()
// when the raw pointer itself is wanted.
Context &getContext() {
    Context &contextRef = *context;
    return contextRef;
}
// Returns the raw `context` member pointer as stored (no dereference).
Context *getContextPtr() {
    return this->context;
}
// Declarations only; definitions are not part of this header section.

// Returns a reference to a LinearStream; `minRequiredSize` defaults to 1024.
// NOTE(review): presumably guarantees at least that much free space - confirm.
LinearStream &getCS(size_t minRequiredSize = 1024u);
// Returns a reference to the IndirectHeap of the requested type;
// `minRequiredSize` defaults to 0.
IndirectHeap &getIndirectHeap(IndirectHeap::Type heapType,
size_t minRequiredSize = 0u);
// MOCKABLE_VIRTUAL is a project macro (its expansion is not visible here).
MOCKABLE_VIRTUAL void releaseIndirectHeap(IndirectHeap::Type heapType);
// Returns a copy of the `commandQueueProperties` bitfield stored in this queue.
cl_command_queue_properties getCommandQueueProperties() const {
    return this->commandQueueProperties;
}
// True when the CL_QUEUE_PROFILING_ENABLE bit is set in the queue properties.
bool isProfilingEnabled() {
    const auto profilingBits = this->getCommandQueueProperties() & CL_QUEUE_PROFILING_ENABLE;
    return profilingBits != 0;
}
// True when the CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE bit is set in the
// queue properties.
bool isOOQEnabled() {
    const auto outOfOrderBits = this->getCommandQueueProperties() & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
    return outOfOrderBits != 0;
}
bool isPerfCountersEnabled() {
return perfCountersEnabled;
}
// Returns the `perfConfigurationData` member pointer as stored; the value is
// not checked, so callers may receive whatever the member currently holds.
InstrPmRegsCfg *getPerfCountersConfigData() {
    return this->perfConfigurationData;
}
// Declarations only; definitions are not part of this header section.

// Returns a pointer to a PerformanceCounters object.
PerformanceCounters *getPerfCounters();
// NOTE(review): presumably submits the pending perf-counter register
// configuration and returns whether that succeeded - confirm.
bool sendPerfCountersConfig();
// Takes the desired enabled state and a configuration id; returns bool.
// NOTE(review): meaning of the return value is not visible here - confirm.
bool setPerfCountersEnabled(bool perfCountersEnabled, cl_uint configuration);
// Returns the `perfCountersUserRegistersNumber` member.
// NOTE(review): the member is declared as uint32_t while this accessor
// returns uint16_t, so the value is narrowed (upper 16 bits dropped). The
// cast below only makes that existing conversion explicit.
uint16_t getPerfCountersUserRegistersNumber() {
    return static_cast<uint16_t>(perfCountersUserRegistersNumber);
}
// ---- Data members still in the section that precedes `protected:` ----
// taskCount of last task
uint32_t taskCount;
// current taskLevel. Used for determining if a PIPE_CONTROL is needed.
uint32_t taskLevel;
// Owning pointer to a FlushStampTracker.
std::unique_ptr<FlushStampTracker> flushStamp;
// Atomic counter, initialised to (uint32_t)-1 (all bits set).
// NOTE(review): presumably that value means "nothing waited on yet" - confirm.
std::atomic<uint32_t> latestTaskCountWaited{(uint32_t)-1};
// virtual event that holds last Enqueue information
Event *virtualEvent;
protected:
// Raw pointers returned by getContextPtr()/getContext() and getDevice().
Context *context;
Device *device;
// Property bitfield returned by getCommandQueueProperties().
cl_command_queue_properties commandQueueProperties;
// Flag returned by isPerfCountersEnabled().
bool perfCountersEnabled;
cl_uint perfCountersConfig;
// Stored as 32 bits; getPerfCountersUserRegistersNumber() returns it as uint16_t.
uint32_t perfCountersUserRegistersNumber;
// Pointer returned by getPerfCountersConfigData().
InstrPmRegsCfg *perfConfigurationData;
uint32_t perfCountersRegsCfgHandle;
bool perfCountersRegsCfgPending;
LinearStream *commandStream;
// One heap pointer per heap type; NUM_HEAPS is defined outside this section.
IndirectHeap *indirectHeap[NUM_HEAPS];
// Defaults to false via the in-class initialiser.
bool mapDcFlushRequired = false;
};
// Pointer-to-function type for queue factories: takes (context, device,
// properties) and returns a CommandQueue*. CommandQueueHw<GfxFamily>::create
// has a matching signature.
typedef CommandQueue *(*CommandQueueCreateFunc)(
Context *context, Device *device, const cl_queue_properties *properties);
// Declaration only. Templated on the hardware family and an event type value;
// returns a LinearStream for the given queue.
// NOTE(review): the two bool flags presumably request extra space for
// profiling / perf-counter commands - confirm against the definition.
template <typename GfxFamily, unsigned int eventType>
LinearStream &getCommandStream(CommandQueue &commandQueue,
bool reserveProfilingCmdsSpace,
bool reservePerfCounterCmdsSpace,
const Kernel *pKernel);
// Declaration only. Templated on the hardware family and the heap type;
// returns an IndirectHeap for the given queue and kernel.
template <typename GfxFamily, IndirectHeap::Type heapType>
IndirectHeap &getIndirectHeap(CommandQueue &commandQueue, const Kernel &kernel);
} // namespace OCLRT

View File

@@ -0,0 +1,391 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include "runtime/command_stream/command_stream_receiver.h"
#include "runtime/command_queue/command_queue.h"
#include "runtime/mem_obj/mem_obj.h"
#include "runtime/memory_manager/graphics_allocation.h"
#include "runtime/program/printf_handler.h"
#include "runtime/helpers/dispatch_info.h"
#include "runtime/command_stream/preemption.h"
#include "runtime/helpers/queue_helpers.h"
#include <memory>
namespace OCLRT {
class EventBuilder;
template <typename GfxFamily>
class CommandQueueHw : public CommandQueue {
typedef CommandQueue BaseClass;
public:
// Builds a hardware command queue on top of CommandQueue (BaseClass).
// After base construction it inspects `properties`:
//  - if the CL_QUEUE_PRIORITY_KHR value has the CL_QUEUE_PRIORITY_LOW_KHR bit
//    set, `low_priority` is set to true;
//  - if the CL_QUEUE_PROPERTIES value has CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE
//    set, the device's command stream receiver is switched to BatchedDispatch.
// `device` is dereferenced without a null check in the second case.
CommandQueueHw(Context *context,
               Device *device,
               const cl_queue_properties *properties) : BaseClass(context, device, properties) {
    const auto requestedPriority = getCmdQueueProperties<cl_queue_priority_khr>(properties, CL_QUEUE_PRIORITY_KHR);
    const auto lowPriorityBit = static_cast<cl_queue_priority_khr>(CL_QUEUE_PRIORITY_LOW_KHR);
    if (requestedPriority & lowPriorityBit) {
        low_priority = true;
    }

    const auto requestedProperties = getCmdQueueProperties<cl_queue_properties>(properties, CL_QUEUE_PROPERTIES);
    const auto outOfOrderBit = static_cast<cl_queue_properties>(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE);
    if (requestedProperties & outOfOrderBit) {
        auto &commandStreamReceiver = device->getCommandStreamReceiver();
        commandStreamReceiver.overrideDispatchPolicy(CommandStreamReceiver::BatchedDispatch);
    }
}
// Factory with the CommandQueueCreateFunc signature: heap-allocates a
// CommandQueueHw<GfxFamily> and returns it as a CommandQueue*. The caller
// receives the raw pointer from `new`.
static CommandQueue *create(Context *context,
                            Device *device,
                            const cl_queue_properties *properties) {
    CommandQueue *queue = new CommandQueueHw<GfxFamily>(context, device, properties);
    return queue;
}
// The members below are declared `override` without a body, so each replaces
// a virtual of the base class and is defined out of line.
// NOTE(review): the definitions presumably live in the matching
// runtime/command_queue/enqueue_*.h headers - confirm.
// Every one of them takes the (numEventsInWaitList, eventWaitList, event)
// triple as its last three parameters and returns a cl_int status.

// Barrier command: only the wait list / output event triple.
cl_int enqueueBarrierWithWaitList(cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) override;
// Buffer-to-buffer copy of `size` with independent source/destination offsets.
cl_int enqueueCopyBuffer(Buffer *srcBuffer,
Buffer *dstBuffer,
size_t srcOffset,
size_t dstOffset,
size_t size,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) override;
// Rectangular buffer-to-buffer copy; origins/region are passed as pointers and
// row/slice pitches are given separately for source and destination.
cl_int enqueueCopyBufferRect(Buffer *srcBuffer,
Buffer *dstBuffer,
const size_t *srcOrigin,
const size_t *dstOrigin,
const size_t *region,
size_t srcRowPitch,
size_t srcSlicePitch,
size_t dstRowPitch,
size_t dstSlicePitch,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) override;
// Image-to-image copy; origins and region are declared as 3-element arrays.
cl_int enqueueCopyImage(Image *srcImage,
Image *dstImage,
const size_t srcOrigin[3],
const size_t dstOrigin[3],
const size_t region[3],
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) override;
// Buffer fill with a pattern of `patternSize` bytes over [offset, offset + size).
// NOTE(review): range interpretation taken from parameter names - confirm.
cl_int enqueueFillBuffer(Buffer *buffer,
const void *pattern,
size_t patternSize,
size_t offset,
size_t size,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) override;
// Image fill with `fillColor` over the given origin/region.
cl_int enqueueFillImage(Image *image,
const void *fillColor,
const size_t *origin,
const size_t *region,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) override;
// Kernel launch taking the API-level cl_kernel handle plus the ND-range
// description (workDim, offsets, global and local sizes).
cl_int enqueueKernel(cl_kernel kernel,
cl_uint workDim,
const size_t *globalWorkOffset,
const size_t *globalWorkSize,
const size_t *localWorkSize,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) override;
// Map entry points. Each returns a void* and reports its status through the
// `errcodeRet` reference parameter instead of the return value.
// Declarations only; definitions are out of line.

// Override of the base-class buffer map.
void *enqueueMapBuffer(Buffer *buffer, cl_bool blockingMap, cl_map_flags mapFlags,
size_t offset, size_t size, cl_uint numEventsInWaitList,
const cl_event *eventWaitList, cl_event *event, cl_int &errcodeRet) override;
// Same parameter list as enqueueMapBuffer but NOT marked `override`, i.e. it
// is introduced by this class rather than replacing a base virtual.
void *enqueueMapSharedBuffer(Buffer *buffer, cl_bool blockingMap, cl_map_flags mapFlags,
size_t offset, size_t size, cl_uint numEventsInWaitList,
const cl_event *eventWaitList, cl_event *event, cl_int &errcodeRet);
// Override of the base-class image map. Takes the API-level cl_mem handle;
// imageRowPitch / imageSlicePitch are pointer (output-style) parameters.
void *enqueueMapImage(cl_mem image,
cl_bool blockingMap,
cl_map_flags mapFlags,
const size_t *origin,
const size_t *region,
size_t *imageRowPitch,
size_t *imageSlicePitch,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event,
cl_int &errcodeRet) override;
// SVM (shared virtual memory) entry points. All are `override` declarations
// without a body (defined out of line) and return a cl_int status.

// Maps `size` bytes starting at `svmPtr`.
cl_int enqueueSVMMap(cl_bool blockingMap,
cl_map_flags mapFlags,
void *svmPtr,
size_t size,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) override;
// Unmaps a region previously identified by `svmPtr`.
cl_int enqueueSVMUnmap(void *svmPtr,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) override;
// Frees `numSvmPointers` SVM pointers. `pfnFreeFunc` is a CL_CALLBACK function
// pointer receiving (queue, count, pointers, userData).
// NOTE(review): whether a null callback is accepted is not visible here - confirm.
cl_int enqueueSVMFree(cl_uint numSvmPointers,
void *svmPointers[],
void(CL_CALLBACK *pfnFreeFunc)(cl_command_queue queue,
cl_uint numSvmPointers,
void *svmPointers[],
void *userData),
void *userData,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) override;
// Copies `size` bytes from `srcPtr` to `dstPtr`.
cl_int enqueueSVMMemcpy(cl_bool blockingCopy,
void *dstPtr,
const void *srcPtr,
size_t size,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) override;
// Fills `size` bytes at `svmPtr` with a pattern of `patternSize` bytes.
cl_int enqueueSVMMemFill(void *svmPtr,
const void *pattern,
size_t patternSize,
size_t size,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) override;
// `override` declarations without a body (defined out of line); each returns
// a cl_int status and ends with the usual wait-list / output-event triple.

// Marker command: only the wait list / output event triple.
cl_int enqueueMarkerWithWaitList(cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) override;
// Migration request for `numMemObjects` mem objects with the given flags.
cl_int enqueueMigrateMemObjects(cl_uint numMemObjects,
const cl_mem *memObjects,
cl_mem_migration_flags flags,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) override;
// Migration request for SVM ranges given as parallel pointer/size arrays.
cl_int enqueueSVMMigrateMem(cl_uint numSvmPointers,
const void **svmPointers,
const size_t *sizes,
const cl_mem_migration_flags flags,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) override;
// Reads `size` bytes at `offset` of `buffer` into host memory `ptr`.
// NOTE(review): direction inferred from the non-const `ptr` - confirm.
cl_int enqueueReadBuffer(Buffer *buffer,
cl_bool blockingRead,
size_t offset,
size_t size,
void *ptr,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) override;
// Rectangular read; buffer and host sides each have their own origin and
// row/slice pitches.
cl_int enqueueReadBufferRect(Buffer *buffer,
cl_bool blockingRead,
const size_t *bufferOrigin,
const size_t *hostOrigin,
const size_t *region,
size_t bufferRowPitch,
size_t bufferSlicePitch,
size_t hostRowPitch,
size_t hostSlicePitch,
void *ptr,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) override;
// Image read into host memory `ptr` with explicit row/slice pitches.
cl_int enqueueReadImage(Image *srcImage,
cl_bool blockingRead,
const size_t *origin,
const size_t *region,
size_t rowPitch,
size_t slicePitch,
void *ptr,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) override;
// Unmaps `mappedPtr` for `memObj` and returns the resulting status.
//
// Two paths are taken depending on the mem object:
//  - if memObj->allowTiling() or memObj->peekSharingHandler() is truthy, the
//    request is delegated to memObj->unmapObj(), whose return value is the
//    status;
//  - otherwise the unmap is handled by cpuDataTransferHandler() with
//    CL_COMMAND_UNMAP_MEM_OBJECT (non-blocking, offset 0, size 0), which
//    reports its status through the `retVal` reference argument.
//
// `memObj` is dereferenced without a null check.
cl_int enqueueUnmapMemObject(MemObj *memObj,
                             void *mappedPtr,
                             cl_uint numEventsInWaitList,
                             const cl_event *eventWaitList,
                             cl_event *event) override {
    // Start from a defined value: on the CPU path the status is only written
    // through an out-parameter, so an uninitialised local would be returned
    // as-is should the callee ever leave it untouched.
    cl_int retVal = CL_SUCCESS;
    if (memObj->allowTiling() || memObj->peekSharingHandler()) {
        retVal = memObj->unmapObj(this, mappedPtr, numEventsInWaitList, eventWaitList, event);
    } else {
        cpuDataTransferHandler(memObj,
                               CL_COMMAND_UNMAP_MEM_OBJECT,
                               CL_FALSE,
                               0,
                               0,
                               mappedPtr,
                               numEventsInWaitList,
                               eventWaitList,
                               event,
                               retVal);
    }
    return retVal;
}
// `override` declarations without a body (defined out of line); each returns
// a cl_int status.

// Writes `cb` bytes from host memory `ptr` to `buffer` at `offset`.
// NOTE(review): direction inferred from the const `ptr` - confirm.
cl_int enqueueWriteBuffer(Buffer *buffer,
cl_bool blockingWrite,
size_t offset,
size_t cb,
const void *ptr,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) override;
// Rectangular write; buffer and host sides each have their own origin and
// row/slice pitches.
cl_int enqueueWriteBufferRect(Buffer *buffer,
cl_bool blockingWrite,
const size_t *bufferOrigin,
const size_t *hostOrigin,
const size_t *region,
size_t bufferRowPitch,
size_t bufferSlicePitch,
size_t hostRowPitch,
size_t hostSlicePitch,
const void *ptr,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) override;
// Image write from host memory `ptr` with explicit input row/slice pitches.
cl_int enqueueWriteImage(Image *dstImage,
cl_bool blockingWrite,
const size_t *origin,
const size_t *region,
size_t inputRowPitch,
size_t inputSlicePitch,
const void *ptr,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) override;
// Copy from a buffer (starting at `srcOffset`) into an image region.
cl_int enqueueCopyBufferToImage(Buffer *srcBuffer,
Image *dstImage,
size_t srcOffset,
const size_t *dstOrigin,
const size_t *region,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) override;
// Copy from an image region into a buffer (starting at `dstOffset`).
cl_int enqueueCopyImageToBuffer(Image *srcImage,
Buffer *dstBuffer,
const size_t *srcOrigin,
const size_t *region,
size_t dstOffset,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) override;
// Overrides of the base-class finish()/flush().
// cpuDataTransferHandler() calls finish(true) before touching buffer storage
// on some paths.
cl_int finish(bool dcFlush) override;
cl_int flush() override;
// Primary enqueueHandler overload (declaration only, defined out of line).
// Templated on the enqueue/command type value. Takes the surfaces as a
// pointer + count pair plus a MultiDispatchInfo; the fixed-size-array
// overload of enqueueHandler forwards to this one.
// NOTE(review): `surfacesForResidency` presumably lists surfaces that must be
// made resident for the command - confirm against the definition.
template <unsigned int enqueueType>
void enqueueHandler(Surface **surfacesForResidency,
size_t numSurfaceForResidency,
bool blocking,
const MultiDispatchInfo &dispatchInfo,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event);
// Convenience overload for a fixed-size array of surfaces: the array length
// is deduced as the `size` template argument and the call is forwarded to the
// pointer + count overload with every other argument passed through unchanged.
template <unsigned int enqueueType, size_t size>
void enqueueHandler(Surface *(&surfacesForResidency)[size],
                    bool blocking,
                    const MultiDispatchInfo &dispatchInfo,
                    cl_uint numEventsInWaitList,
                    const cl_event *eventWaitList,
                    cl_event *event) {
    Surface **surfaces = surfacesForResidency; // array decays to pointer to first element
    const size_t surfaceCount = size;
    enqueueHandler<enqueueType>(surfaces, surfaceCount, blocking, dispatchInfo,
                                numEventsInWaitList, eventWaitList, event);
}
// Declarations only; definitions are out of line.

// enqueueHandler overload taking a fixed-size surface array plus a single
// kernel with its ND-range description instead of a MultiDispatchInfo.
template <unsigned int enqueueType, size_t size>
void enqueueHandler(Surface *(&surfacesForResidency)[size],
bool blocking,
Kernel *kernel,
cl_uint workDim,
const size_t globalOffsets[3],
const size_t workItems[3],
const size_t *localWorkSizesIn,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event);
// Returns a CompletionStamp. `blocking` is taken by non-const reference, so
// the callee is able to modify the caller's flag.
// NOTE(review): presumably the submission path used when the queue is not
// blocked - confirm against the definition.
template <unsigned int commandType>
CompletionStamp enqueueNonBlocked(Surface **surfacesForResidency,
size_t surfaceCount,
LinearStream &commandStream,
size_t commandStreamStart,
bool &blocking,
const MultiDispatchInfo &multiDispatchInfo,
EventBuilder &eventBuilder,
uint32_t taskLevel,
bool slmUsed,
PrintfHandler *printfHandler);
// Counterpart taking a KernelOperation* and the wait list. `printfHandler` is
// a std::unique_ptr passed by value, so ownership moves into the call.
// NOTE(review): presumably the path used when the queue is blocked - confirm.
template <unsigned int commandType>
void enqueueBlocked(Surface **surfacesForResidency,
size_t surfacesCount,
bool &blocking,
const MultiDispatchInfo &multiDispatchInfo,
KernelOperation *blockedCommandsData,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
bool slmUsed,
EventBuilder &externalEventBuilder,
std::unique_ptr<PrintfHandler> printfHandler);
// Called by cpuDataTransferHandler() for map/unmap commands when the queue is
// blocked, with opType MAP or UNMAP.
void addMapUnmapToWaitlistEventsDependencies(const cl_event *eventWaitList,
size_t numEventsInWaitlist,
MapOperationType opType,
MemObj *memObj,
EventBuilder &externalEventBuilder);
// Host-side handler for map / unmap / read / write commands. Returns a
// pointer for map commands and nullptr otherwise; the status is reported
// through `retVal` (see the definition in cpu_data_transfer_handler.h).
void *cpuDataTransferHandler(MemObj *memObj,
cl_command_type cmdType,
cl_bool blocking,
size_t offset,
size_t size,
void *ptr,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event,
cl_int &retVal);
protected:
// Declaration only. MOCKABLE_VIRTUAL is a project macro whose expansion is
// not visible here.
// NOTE(review): presumably a hook invoked from enqueueHandler so tests can
// observe the command type and dispatch info - confirm.
MOCKABLE_VIRTUAL void enqueueHandlerHook(const unsigned int commandType, const MultiDispatchInfo &dispatchInfo);
private:
// Declaration only. Used by cpuDataTransferHandler() to decide whether
// this->taskLevel must be incremented for the command being enqueued.
bool isTaskLevelUpdateRequired(const uint32_t &taskLevel, const cl_event *eventWaitList, const cl_uint &numEventsInWaitList, unsigned int commandType);
// Declaration only; takes the MultiDispatchInfo by non-const reference.
void forceDispatchScheduler(OCLRT::MultiDispatchInfo &multiDispatchInfo);
};
} // namespace OCLRT

View File

@@ -0,0 +1,46 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "runtime/command_queue/dispatch_walker.h"
#include "runtime/command_queue/enqueue_barrier.h"
#include "runtime/command_queue/enqueue_copy_buffer.h"
#include "runtime/command_queue/enqueue_copy_buffer_rect.h"
#include "runtime/command_queue/enqueue_copy_buffer_to_image.h"
#include "runtime/command_queue/enqueue_copy_image_to_buffer.h"
#include "runtime/command_queue/enqueue_copy_image.h"
#include "runtime/command_queue/enqueue_fill_buffer.h"
#include "runtime/command_queue/enqueue_fill_image.h"
#include "runtime/command_queue/enqueue_kernel.h"
#include "runtime/command_queue/enqueue_map_buffer.h"
#include "runtime/command_queue/enqueue_map_image.h"
#include "runtime/command_queue/enqueue_svm.h"
#include "runtime/command_queue/enqueue_marker.h"
#include "runtime/command_queue/enqueue_migrate_mem_objects.h"
#include "runtime/command_queue/enqueue_read_buffer.h"
#include "runtime/command_queue/enqueue_read_buffer_rect.h"
#include "runtime/command_queue/enqueue_read_image.h"
#include "runtime/command_queue/enqueue_write_buffer.h"
#include "runtime/command_queue/enqueue_write_buffer_rect.h"
#include "runtime/command_queue/enqueue_write_image.h"
#include "runtime/command_queue/cpu_data_transfer_handler.h"
#include "runtime/command_queue/finish.h"
#include "runtime/command_queue/flush.h"

View File

@@ -0,0 +1,171 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include "runtime/command_queue/command_queue_hw.h"
#include "runtime/device/device.h"
#include "runtime/event/event_builder.h"
namespace OCLRT {
// Services a map / unmap / read / write request for `memObj` on the host CPU.
//
// Parameters:
//   memObj     - memory object being accessed (dereferenced without a null check).
//   cmdType    - CL_COMMAND_MAP_BUFFER, CL_COMMAND_UNMAP_MEM_OBJECT,
//                CL_COMMAND_READ_BUFFER or CL_COMMAND_WRITE_BUFFER; any other
//                value reaching the switch sets CL_INVALID_OPERATION.
//   blocking   - when true the transfer is carried out even if the queue is blocked.
//   offset     - byte offset applied to the mem object's CPU address.
//   size       - number of bytes copied for read/write.
//   ptr        - host pointer (read destination / write source / pointer being unmapped).
//   numEventsInWaitList, eventWaitList - events to wait for before the transfer.
//   event      - optional output; when non-null a new Event is created and returned.
//   retVal     - status output, written through the ErrorCodeHelper `err`.
//
// Returns memObj->setAndReturnMappedPtr(offset) for CL_COMMAND_MAP_BUFFER and
// nullptr for every other command type.
template <typename GfxFamily>
void *CommandQueueHw<GfxFamily>::cpuDataTransferHandler(MemObj *memObj,
cl_command_type cmdType,
cl_bool blocking,
size_t offset,
size_t size,
void *ptr,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event,
cl_int &retVal) {
EventBuilder eventBuilder;
bool blockQueue = false;
bool eventCompleted = false;
// `err` is bound to retVal with CL_SUCCESS as its initial code; later
// err.set() calls are how this function reports status.
ErrorCodeHelper err(&retVal, CL_SUCCESS);
// Create the output event up front (only when the caller asked for one) so
// the queue timestamp is taken before any waiting happens.
if (event) {
eventBuilder.create<Event>(this, cmdType, Event::eventNotReady, Event::eventNotReady);
eventBuilder.getEvent()->setQueueTimeStamp();
eventBuilder.getEvent()->setCPUProfilingPath(true);
*event = eventBuilder.getEvent();
}
// Device ownership is taken first, then queue ownership; both are released
// explicitly below before any waiting or copying.
TakeOwnershipWrapper<Device> deviceOwnership(*device);
TakeOwnershipWrapper<CommandQueueHw<GfxFamily>> queueOwnership(*this);
// Local `taskLevel` shadows the member; the member is only updated below
// when isTaskLevelUpdateRequired() says so.
auto taskLevel = getTaskLevelFromWaitList(this->taskLevel, numEventsInWaitList, eventWaitList);
auto updateTaskLevel = isTaskLevelUpdateRequired(taskLevel, eventWaitList, numEventsInWaitList, cmdType);
DBG_LOG(LogTaskCounts, __FUNCTION__, "taskLevel", taskLevel);
if (updateTaskLevel) {
taskLevel++;
this->taskLevel = taskLevel;
}
if (event) {
eventBuilder.getEvent()->taskLevel = taskLevel;
}
// The queue counts as blocked when the computed level is Event::eventNotReady
// or isQueueBlocked() reports true.
blockQueue = ((taskLevel == Event::eventNotReady) || isQueueBlocked());
// For map/unmap on a blocked queue, register dependencies on the wait-list
// events (MAP for map commands, UNMAP for unmap commands).
if (blockQueue &&
(cmdType == CL_COMMAND_MAP_BUFFER || cmdType == CL_COMMAND_UNMAP_MEM_OBJECT)) {
addMapUnmapToWaitlistEventsDependencies(eventWaitList,
static_cast<size_t>(numEventsInWaitList),
cmdType == CL_COMMAND_MAP_BUFFER ? MAP : UNMAP,
memObj,
eventBuilder);
}
// Release in reverse order of acquisition before doing any blocking work.
queueOwnership.unlock();
deviceOwnership.unlock();
// read/write buffers are always blocking
// The transfer runs when the queue is not blocked, or when the caller asked
// for blocking behaviour. Otherwise nothing below this `if` body is executed
// except the final return handling.
if (!blockQueue || blocking) {
// NOTE(review): the wait status is recorded in `err`, but execution continues
// regardless of its value - confirm this is intended.
err.set(Event::waitForEvents(numEventsInWaitList, eventWaitList));
if (eventBuilder.getEvent()) {
eventBuilder.getEvent()->setSubmitTimeStamp();
}
// wait for the completion of previous commands
// (skipped for unmap; for other commands only when the object is not
// zero-copy or the call is blocking)
if (cmdType != CL_COMMAND_UNMAP_MEM_OBJECT) {
if (!memObj->isMemObjZeroCopy() || blocking) {
finish(true);
eventCompleted = true;
}
}
// CPU address of the mem object advanced by `offset`; used by read/write.
auto bufferStorage = ptrOffset(memObj->getCpuAddressForMemoryTransfer(), offset);
if (eventBuilder.getEvent()) {
eventBuilder.getEvent()->setStartTimeStamp();
}
switch (cmdType) {
case CL_COMMAND_MAP_BUFFER:
// Non-zero-copy objects need their contents copied to the host pointer;
// zero-copy objects need no data movement. A performance hint is emitted
// for either case when the context requests hints.
if (!memObj->isMemObjZeroCopy()) {
if (context->isProvidingPerformanceHints()) {
context->providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL, CL_ENQUEUE_MAP_BUFFER_REQUIRES_COPY_DATA, static_cast<cl_mem>(memObj));
}
memObj->transferDataToHostPtr();
eventCompleted = true;
} else {
if (context->isProvidingPerformanceHints()) {
context->providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL, CL_ENQUEUE_MAP_BUFFER_DOESNT_REQUIRE_COPY_DATA, static_cast<cl_mem>(memObj));
}
}
break;
case CL_COMMAND_UNMAP_MEM_OBJECT:
// Mirror of the map case: non-zero-copy objects copy the host pointer
// contents back into the object's memory storage.
if (!memObj->isMemObjZeroCopy()) {
if (context->isProvidingPerformanceHints()) {
context->providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL, CL_ENQUEUE_UNMAP_MEM_OBJ_REQUIRES_COPY_DATA, ptr, static_cast<cl_mem>(memObj));
}
memObj->transferDataFromHostPtrToMemoryStorage();
eventCompleted = true;
} else {
if (context->isProvidingPerformanceHints()) {
context->providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL, CL_ENQUEUE_UNMAP_MEM_OBJ_DOESNT_REQUIRE_COPY_DATA, ptr);
}
}
break;
case CL_COMMAND_READ_BUFFER:
// Copy `size` bytes from the object's storage into the caller's `ptr`.
if (context->isProvidingPerformanceHints()) {
context->providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL, CL_ENQUEUE_READ_BUFFER_REQUIRES_COPY_DATA, static_cast<cl_mem>(memObj), ptr);
}
memcpy_s(ptr, size, bufferStorage, size);
eventCompleted = true;
break;
case CL_COMMAND_WRITE_BUFFER:
// Copy `size` bytes from the caller's `ptr` into the object's storage.
if (context->isProvidingPerformanceHints()) {
context->providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL, CL_ENQUEUE_WRITE_BUFFER_REQUIRES_COPY_DATA, static_cast<cl_mem>(memObj), ptr);
}
memcpy_s(bufferStorage, size, ptr, size);
eventCompleted = true;
break;
default:
err.set(CL_INVALID_OPERATION);
}
// Finalise the event: mark it CL_COMPLETE when work was actually finished
// above, otherwise let it refresh its own execution status.
if (eventBuilder.getEvent()) {
eventBuilder.getEvent()->setEndTimeStamp();
eventBuilder.getEvent()->updateTaskCount(this->taskCount);
if (eventCompleted) {
eventBuilder.getEvent()->setStatus(CL_COMPLETE);
} else {
eventBuilder.getEvent()->updateExecutionStatus();
}
}
}
// Map always returns the mapped pointer, including when the transfer block
// above was skipped because the queue is blocked.
if (cmdType == CL_COMMAND_MAP_BUFFER) {
return memObj->setAndReturnMappedPtr(offset);
}
// Unmap validates `ptr` against the recorded mapped pointer after any data
// transfer has already happened.
// NOTE(review): this err.set() may replace a status recorded earlier (e.g.
// from waitForEvents), depending on ErrorCodeHelper::set semantics - confirm.
if (cmdType == CL_COMMAND_UNMAP_MEM_OBJECT) {
err.set(ptr == memObj->getMappedPtr() ? CL_SUCCESS : CL_INVALID_VALUE);
}
return nullptr; // only map returns pointer
}
} // namespace OCLRT

View File

@@ -0,0 +1,937 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include "runtime/context/context.h"
#include "runtime/gen9/gen9_cmd_def.h"
#include "runtime/command_queue/local_id_gen.h"
#include "runtime/command_queue/command_queue.h"
#include "runtime/command_queue/dispatch_walker_helper.h"
#include "runtime/command_stream/command_stream_receiver.h"
#include "runtime/device/device_info.h"
#include "runtime/device_queue/device_queue_hw.h"
#include "runtime/event/perf_counter.h"
#include "runtime/event/user_event.h"
#include "runtime/indirect_heap/indirect_heap.h"
#include "runtime/helpers/aligned_memory.h"
#include "runtime/helpers/debug_helpers.h"
#include "runtime/helpers/kernel_commands.h"
#include "runtime/helpers/task_information.h"
#include "runtime/helpers/validators.h"
#include "runtime/helpers/dispatch_info.h"
#include "runtime/kernel/kernel.h"
#include "runtime/mem_obj/mem_obj.h"
#include "runtime/memory_manager/graphics_allocation.h"
#include <algorithm>
#include <cmath>
namespace OCLRT {
// Work-group size helpers. All of the following are declarations only; the
// definitions are not part of this header section, so the notes below are
// limited to what the signatures show.

// NOTE(review): presumably writes the chosen local size into `workGroupSize`
// (the only non-const array parameter) - confirm against the definition.
void computeWorkgroupSize1D(
uint32_t maxWorkGroupSize,
size_t workGroupSize[3],
const size_t workItems[3],
size_t simdSize);
// N-dimensional variant; takes a WorkSizeInfo by value plus the dimension count.
void computeWorkgroupSizeND(
WorkSizeInfo wsInfo,
size_t workGroupSize[3],
const size_t workItems[3],
const uint32_t workDim);
// Same parameter list as computeWorkgroupSize1D.
void computeWorkgroupSize2D(
uint32_t maxWorkGroupSize,
size_t workGroupSize[3],
const size_t workItems[3],
size_t simdSize);
// Like the 1D/2D variants with an additional dimension count.
void computeWorkgroupSizeSquared(
uint32_t maxWorkGroupSize,
size_t workGroupSize[3],
const size_t workItems[3],
size_t simdSize,
const uint32_t workDim);
// Returns a 3-component size vector derived from a DispatchInfo.
Vec3<size_t> computeWorkgroupSize(
const DispatchInfo &dispatchInfo);
// Returns a 3-component size vector derived from a DispatchInfo.
// NOTE(review): how this differs from computeWorkgroupSize is not visible here.
Vec3<size_t> generateWorkgroupSize(
const DispatchInfo &dispatchInfo);
// Returns a 3-component count from global (gws) and local (lws) sizes, both
// passed by value.
Vec3<size_t> computeWorkgroupsNumber(
const Vec3<size_t> gws,
const Vec3<size_t> lws);
// Overload pair: one takes gws/lws by value, the other a DispatchInfo.
Vec3<size_t> generateWorkgroupsNumber(
const Vec3<size_t> gws,
const Vec3<size_t> lws);
Vec3<size_t> generateWorkgroupsNumber(
const DispatchInfo &dispatchInfo);
// Takes and returns a Vec3<size_t> by value.
// NOTE(review): presumably normalises the work-group vector - confirm.
Vec3<size_t> canonizeWorkgroup(
Vec3<size_t> workgroup);
// Returns the dispatch dimensionality: the larger of the simplified
// dimensions of `dispatchSize` and `dispatchOffset`, but never less than 1.
inline uint32_t calculateDispatchDim(Vec3<size_t> dispatchSize, Vec3<size_t> dispatchOffset) {
    const auto sizeDim = dispatchSize.getSimplifiedDim();
    const auto offsetDim = dispatchOffset.getSimplifiedDim();
    const auto widestDim = std::max(sizeDim, offsetDim);
    return std::max(1U, widestDim);
}
// Programs the thread-related fields of a GPGPU_WALKER command.
//
//   pCmd             - command to fill in.
//   globalOffsets    - not read by this function.
//   startWorkGroups  - starting work-group id per dimension.
//   numWorkGroups    - work-group count per dimension.
//   localWorkSizesIn - local work size per dimension.
//   simd             - SIMD width; the mask arithmetic below relies on
//                      `simd - 1` being usable as a bit mask.
//
// Returns the total local work size (product of the three local sizes).
template <typename GfxFamily>
inline size_t setGpgpuWalkerThreadData(
    typename GfxFamily::GPGPU_WALKER *pCmd,
    const size_t globalOffsets[3],
    const size_t startWorkGroups[3],
    const size_t numWorkGroups[3],
    const size_t localWorkSizesIn[3],
    uint32_t simd) {
    typedef typename GfxFamily::GPGPU_WALKER GPGPU_WALKER;

    auto totalLocalSize = localWorkSizesIn[0] * localWorkSizesIn[1] * localWorkSizesIn[2];
    auto threadsPerGroup = getThreadsPerWG(simd, totalLocalSize);

    pCmd->setThreadWidthCounterMaximum(static_cast<uint32_t>(threadsPerGroup));
    pCmd->setThreadGroupIdXDimension(static_cast<uint32_t>(numWorkGroups[0]));
    pCmd->setThreadGroupIdYDimension(static_cast<uint32_t>(numWorkGroups[1]));
    pCmd->setThreadGroupIdZDimension(static_cast<uint32_t>(numWorkGroups[2]));

    // Right execution mask: one bit per lane left over after dividing the
    // local size by the SIMD width. When there is no remainder the mask is
    // inverted to all ones.
    auto leftoverLanes = totalLocalSize & (simd - 1);
    uint64_t rightMask = (1ull << leftoverLanes) - 1;
    if (rightMask == 0) {
        rightMask = ~rightMask;
    }
    pCmd->setRightExecutionMask(static_cast<uint32_t>(rightMask));
    pCmd->setBottomExecutionMask(static_cast<uint32_t>(0xffffffff));

    // The SIMD size field is encoded as simd >> 4.
    pCmd->setSimdSize(static_cast<typename GPGPU_WALKER::SIMD_SIZE>(simd >> 4));

    pCmd->setThreadGroupIdStartingX(static_cast<uint32_t>(startWorkGroups[0]));
    pCmd->setThreadGroupIdStartingY(static_cast<uint32_t>(startWorkGroups[1]));
    pCmd->setThreadGroupIdStartingResumeZ(static_cast<uint32_t>(startWorkGroups[2]));

    return totalLocalSize;
}
// Returns the highest dimension (1..3) whose work-item count exceeds 1:
// 3 if workItems[2] > 1, otherwise 2 if workItems[1] > 1, otherwise 1.
// workItems[0] is never inspected.
inline cl_uint computeDimensions(const size_t workItems[3]) {
    if (workItems[2] > 1) {
        return 3;
    }
    if (workItems[1] > 1) {
        return 2;
    }
    return 1;
}
// Declaration only; the definition is not part of this header section.
// Note that `dispatchInfo` is taken by value (copied at each call).
// NOTE(review): presumably emits performance hints about the local work-group
// size through `context` - confirm against the definition.
void provideLocalWorkGroupSizeHints(Context *context, uint32_t maxWorkGroupSize, DispatchInfo dispatchInfo);
// Creates a new IndirectHeap whose backing storage is page-aligned.
//
// `calc` is invoked with the perfectly-forwarded `args` and must yield the
// heap size in bytes. The storage comes from alignedMalloc() with
// MemoryConstants::pageSize alignment and the heap object from `new`; the raw
// pointer is returned to the caller. The alignedMalloc() result is passed on
// without being checked.
template <typename SizeAndAllocCalcT, typename... CalcArgsT>
IndirectHeap *allocateIndirectHeap(SizeAndAllocCalcT &&calc, CalcArgsT &&... args) {
    const size_t pageAlignment = MemoryConstants::pageSize;
    const size_t heapSize = calc(std::forward<CalcArgsT>(args)...);
    void *heapStorage = alignedMalloc(heapSize, pageAlignment);
    return new IndirectHeap(heapStorage, heapSize);
}
// Emits the "start" profiling commands into `commandStream`, in this order:
//   1. a PIPE_CONTROL with CS stall and a post-sync timestamp write targeting
//      hwTimeStamps.GlobalStartTS;
//   2. an MI_STORE_REGISTER_MEM storing GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW
//      at the address of hwTimeStamps.ContextStartTS;
//   3. an MI_STORE_REGISTER_MEM storing GP_THREAD_TIME_REG_ADDRESS_OFFSET_HIGH
//      four bytes (sizeof(uint32_t)) after that address.
// Space for each command is taken from the stream via getSpace().
template <typename GfxFamily>
void dispatchProfilingCommandsStart(
    HwTimeStamps &hwTimeStamps,
    OCLRT::LinearStream *commandStream) {
    using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
    using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;

    // Global timestamp: written by the PIPE_CONTROL post-sync operation.
    // The destination address is split into low and high 32-bit halves.
    const uint64_t globalStartAddress = reinterpret_cast<uint64_t>(&(hwTimeStamps.GlobalStartTS));
    auto *pipeControl = reinterpret_cast<PIPE_CONTROL *>(commandStream->getSpace(sizeof(PIPE_CONTROL)));
    *pipeControl = PIPE_CONTROL::sInit();
    pipeControl->setCommandStreamerStallEnable(true);
    pipeControl->setPostSyncOperation(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP);
    pipeControl->setAddress(static_cast<uint32_t>(globalStartAddress & 0x0000FFFFFFFFULL));
    pipeControl->setAddressHigh(static_cast<uint32_t>(globalStartAddress >> 32));

    // Context-local timestamp: read from two registers, low dword first.
    const uint64_t contextStartLowAddress = reinterpret_cast<uint64_t>(&(hwTimeStamps.ContextStartTS));
    const uint64_t contextStartHighAddress = contextStartLowAddress + sizeof(uint32_t);

    auto *storeLow = reinterpret_cast<MI_STORE_REGISTER_MEM *>(commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM)));
    *storeLow = MI_STORE_REGISTER_MEM::sInit();
    storeLow->setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW);
    storeLow->setMemoryAddress(contextStartLowAddress);

    auto *storeHigh = reinterpret_cast<MI_STORE_REGISTER_MEM *>(commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM)));
    *storeHigh = MI_STORE_REGISTER_MEM::sInit();
    storeHigh->setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_HIGH);
    storeHigh->setMemoryAddress(contextStartHighAddress);
}
// Emits the "end" profiling commands into `commandStream`, in this order:
//   1. a PIPE_CONTROL with only the CS stall bit enabled - unlike the start
//      variant, no post-sync timestamp write or address is programmed here;
//   2. an MI_STORE_REGISTER_MEM storing GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW
//      at the address of hwTimeStamps.ContextEndTS;
//   3. an MI_STORE_REGISTER_MEM storing GP_THREAD_TIME_REG_ADDRESS_OFFSET_HIGH
//      four bytes (sizeof(uint32_t)) after that address.
// Space for each command is taken from the stream via getSpace().
template <typename GfxFamily>
void dispatchProfilingCommandsEnd(
    HwTimeStamps &hwTimeStamps,
    OCLRT::LinearStream *commandStream) {
    using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
    using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;

    // Stalling PIPE_CONTROL ahead of the register reads.
    auto *pipeControl = reinterpret_cast<PIPE_CONTROL *>(commandStream->getSpace(sizeof(PIPE_CONTROL)));
    *pipeControl = PIPE_CONTROL::sInit();
    pipeControl->setCommandStreamerStallEnable(true);

    // Context-local timestamp: read from two registers, low dword first.
    const uint64_t contextEndLowAddress = reinterpret_cast<uint64_t>(&(hwTimeStamps.ContextEndTS));
    const uint64_t contextEndHighAddress = contextEndLowAddress + sizeof(uint32_t);

    auto *storeLow = reinterpret_cast<MI_STORE_REGISTER_MEM *>(commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM)));
    *storeLow = MI_STORE_REGISTER_MEM::sInit();
    storeLow->setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW);
    storeLow->setMemoryAddress(contextEndLowAddress);

    auto *storeHigh = reinterpret_cast<MI_STORE_REGISTER_MEM *>(commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM)));
    *storeHigh = MI_STORE_REGISTER_MEM::sInit();
    storeHigh->setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_HIGH);
    storeHigh->setMemoryAddress(contextEndHighAddress);
}
// Emits one MI_STORE_REGISTER_MEM that stores the OCLRT::INSTR_MMIO_NOOPID
// register into the perf-counter record: into dmaFenceIdBegin when `start`
// is true, into dmaFenceIdEnd otherwise.
// `commandQueue` is not used by this function.
template <typename GfxFamily>
void dispatchPerfCountersNoopidRegisterCommands(
    CommandQueue &commandQueue,
    OCLRT::HwPerfCounter &hwPerfCounter,
    OCLRT::LinearStream *commandStream,
    bool start) {
    using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;

    uint64_t destination;
    if (start) {
        destination = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.dmaFenceIdBegin));
    } else {
        destination = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.dmaFenceIdEnd));
    }

    auto *storeNoopId = reinterpret_cast<MI_STORE_REGISTER_MEM *>(commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM)));
    *storeNoopId = MI_STORE_REGISTER_MEM::sInit();
    storeNoopId->setRegisterAddress(OCLRT::INSTR_MMIO_NOOPID);
    storeNoopId->setMemoryAddress(destination);
}
// Appends one MI_STORE_REGISTER_MEM that copies the register at OCLRT::INSTR_MMIO_RPSTAT1
// into the perf-counter record (the coreFreqBegin / coreFreqEnd fields).
//
// commandQueue  - not referenced by this function.
// hwPerfCounter - record that receives the register value.
// commandStream - stream the command is appended to.
// start         - true stores into coreFreqBegin, false into coreFreqEnd.
template <typename GfxFamily>
void dispatchPerfCountersReadFreqRegisterCommands(
    CommandQueue &commandQueue,
    OCLRT::HwPerfCounter &hwPerfCounter,
    OCLRT::LinearStream *commandStream,
    bool start) {
    using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;

    uint64_t destination = 0;
    if (start) {
        destination = reinterpret_cast<uint64_t>(&hwPerfCounter.HWPerfCounters.coreFreqBegin);
    } else {
        destination = reinterpret_cast<uint64_t>(&hwPerfCounter.HWPerfCounters.coreFreqEnd);
    }

    auto *storeCmd = reinterpret_cast<MI_STORE_REGISTER_MEM *>(commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM)));
    *storeCmd = MI_STORE_REGISTER_MEM::sInit();
    storeCmd->setRegisterAddress(OCLRT::INSTR_MMIO_RPSTAT1);
    storeCmd->setMemoryAddress(destination);
}
// Appends OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT MI_STORE_REGISTER_MEM commands, one per
// general purpose counter register.
//
// Register i is read from INSTR_PERF_CNT_1_DW0 + i * sizeof(cl_uint) and stored at the same
// byte offset inside the selected report's `gp` field.
//
// commandQueue  - not referenced by this function.
// hwPerfCounter - record whose reportBegin.gp / reportEnd.gp field receives the values.
// commandStream - stream the commands are appended to.
// start         - true targets reportBegin.gp, false targets reportEnd.gp.
template <typename GfxFamily>
void dispatchPerfCountersGeneralPurposeCounterCommands(
    CommandQueue &commandQueue,
    OCLRT::HwPerfCounter &hwPerfCounter,
    OCLRT::LinearStream *commandStream,
    bool start) {
    using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;

    uint64_t gpFieldAddress = 0;
    if (start) {
        gpFieldAddress = reinterpret_cast<uint64_t>(&hwPerfCounter.HWPerfCounters.reportBegin.gp);
    } else {
        gpFieldAddress = reinterpret_cast<uint64_t>(&hwPerfCounter.HWPerfCounters.reportEnd.gp);
    }

    for (uint16_t counterIdx = 0; counterIdx < OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT; ++counterIdx) {
        // Source register and destination slot advance in lock-step, one cl_uint at a time.
        const size_t byteOffset = counterIdx * sizeof(cl_uint);

        auto *storeCmd = reinterpret_cast<MI_STORE_REGISTER_MEM *>(commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM)));
        *storeCmd = MI_STORE_REGISTER_MEM::sInit();

        const uint32_t sourceRegister = static_cast<uint32_t>(INSTR_GFX_OFFSETS::INSTR_PERF_CNT_1_DW0 + byteOffset);
        storeCmd->setRegisterAddress(sourceRegister);

        const uint64_t destination = gpFieldAddress + byteOffset;
        storeCmd->setMemoryAddress(destination);
    }
}
// Appends MI_STORE_REGISTER_MEM commands that snapshot the user-configured counter registers.
//
// The register list comes from commandQueue.getPerfCountersConfigData()->readRegs. Register i
// is stored at `user` field base + i * sizeof(cl_ulong). When a register's bitSize is greater
// than 32, a second store copies the next register (offset + sizeof(cl_uint)) to the
// following cl_uint in memory.
//
// commandQueue  - supplies the perf-counter configuration (list of registers to read).
// hwPerfCounter - record whose reportBegin.user / reportEnd.user field receives the values.
// commandStream - stream the commands are appended to.
// start         - true targets reportBegin.user, false targets reportEnd.user.
template <typename GfxFamily>
void dispatchPerfCountersUserCounterCommands(
    CommandQueue &commandQueue,
    OCLRT::HwPerfCounter &hwPerfCounter,
    OCLRT::LinearStream *commandStream,
    bool start) {
    using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;

    const uint64_t baseAddr = start ? reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.reportBegin.user))
                                    : reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.reportEnd.user));

    auto configData = commandQueue.getPerfCountersConfigData();
    auto userRegs = &configData->readRegs;

    for (uint32_t i = 0; i < userRegs->regsCount; i++) {
        // Low (or only) 32 bits of the counter.
        auto pRegister = reinterpret_cast<MI_STORE_REGISTER_MEM *>(commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM)));
        *pRegister = MI_STORE_REGISTER_MEM::sInit();
        uint32_t regAddr = userRegs->reg[i].offset;
        pRegister->setRegisterAddress(regAddr);
        // Slots for consecutive counters are cl_ulong apart.
        uint64_t address = baseAddr + i * sizeof(cl_ulong);
        pRegister->setMemoryAddress(address);

        if (userRegs->reg[i].bitSize > 32) {
            // Counter is wider than 32 bits: store the adjacent register into the upper half of the slot.
            pRegister = reinterpret_cast<MI_STORE_REGISTER_MEM *>(commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM)));
            *pRegister = MI_STORE_REGISTER_MEM::sInit();
            regAddr += sizeof(cl_uint);
            pRegister->setRegisterAddress(regAddr);
            address += sizeof(cl_uint);
            pRegister->setMemoryAddress(address);
        }
    }
}
// Appends three MI_STORE_REGISTER_MEM commands that copy the OA status, head pointer and
// tail pointer registers (INSTR_OA_STATUS, INSTR_OA_HEAD_PTR, INSTR_OA_TAIL_PTR) into the
// oaStatus, oaHead and oaTail fields of the perf-counter record, in that order.
//
// commandQueue  - not referenced by this function.
// hwPerfCounter - record that receives the three register values.
// commandStream - stream the commands are appended to.
template <typename GfxFamily>
void dispatchPerfCountersOABufferStateCommands(
    CommandQueue &commandQueue,
    OCLRT::HwPerfCounter &hwPerfCounter,
    OCLRT::LinearStream *commandStream) {
    using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;

    // OA status
    auto *storeStatus = reinterpret_cast<MI_STORE_REGISTER_MEM *>(commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM)));
    *storeStatus = MI_STORE_REGISTER_MEM::sInit();
    storeStatus->setRegisterAddress(INSTR_GFX_OFFSETS::INSTR_OA_STATUS);
    const uint64_t statusAddress = reinterpret_cast<uint64_t>(&hwPerfCounter.HWPerfCounters.oaStatus);
    storeStatus->setMemoryAddress(statusAddress);

    // OA head pointer
    auto *storeHead = reinterpret_cast<MI_STORE_REGISTER_MEM *>(commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM)));
    *storeHead = MI_STORE_REGISTER_MEM::sInit();
    storeHead->setRegisterAddress(INSTR_GFX_OFFSETS::INSTR_OA_HEAD_PTR);
    const uint64_t headAddress = reinterpret_cast<uint64_t>(&hwPerfCounter.HWPerfCounters.oaHead);
    storeHead->setMemoryAddress(headAddress);

    // OA tail pointer
    auto *storeTail = reinterpret_cast<MI_STORE_REGISTER_MEM *>(commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM)));
    *storeTail = MI_STORE_REGISTER_MEM::sInit();
    storeTail->setRegisterAddress(INSTR_GFX_OFFSETS::INSTR_OA_TAIL_PTR);
    const uint64_t tailAddress = reinterpret_cast<uint64_t>(&hwPerfCounter.HWPerfCounters.oaTail);
    storeTail->setMemoryAddress(tailAddress);
}
// Appends the "begin" half of a perf-counter measurement to the command stream.
//
// Emitted in order: a stalling PIPE_CONTROL, the NOOPID store, the core-frequency store,
// the general purpose counter stores, an MI_REPORT_PERF_COUNT targeting reportBegin.oa,
// a PIPE_CONTROL that writes a timestamp to HWTimeStamp.GlobalStartTS, and the user counter
// stores. Finally commandQueue.sendPerfCountersConfig() is called.
//
// commandQueue  - supplies the perf-counter object (report id) and configuration.
// hwPerfCounter - record that receives all begin-side values.
// commandStream - stream the commands are appended to.
template <typename GfxFamily>
void dispatchPerfCountersCommandsStart(
    CommandQueue &commandQueue,
    OCLRT::HwPerfCounter &hwPerfCounter,
    OCLRT::LinearStream *commandStream) {
    using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
    using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
    using MI_REPORT_PERF_COUNT = typename GfxFamily::MI_REPORT_PERF_COUNT;

    auto perfCounters = commandQueue.getPerfCounters();
    const uint32_t reportId = perfCounters->getCurrentReportId();

    // Flush the command streamer.
    auto *flushCmd = reinterpret_cast<PIPE_CONTROL *>(commandStream->getSpace(sizeof(PIPE_CONTROL)));
    *flushCmd = PIPE_CONTROL::sInit();
    flushCmd->setCommandStreamerStallEnable(true);

    // NOOPID register, core frequency, then the general purpose counters.
    dispatchPerfCountersNoopidRegisterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, true);
    dispatchPerfCountersReadFreqRegisterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, true);
    dispatchPerfCountersGeneralPurposeCounterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, true);

    // Report written to the begin-side OA slot.
    auto *reportCmd = reinterpret_cast<MI_REPORT_PERF_COUNT *>(commandStream->getSpace(sizeof(MI_REPORT_PERF_COUNT)));
    *reportCmd = MI_REPORT_PERF_COUNT::sInit();
    reportCmd->setReportId(reportId);
    const uint64_t oaReportAddress = reinterpret_cast<uint64_t>(&hwPerfCounter.HWPerfCounters.reportBegin.oa);
    reportCmd->setMemoryAddress(oaReportAddress);

    // Timestamp: Global Start. The 64-bit destination is programmed as low/high 32-bit halves.
    auto *timestampCmd = reinterpret_cast<PIPE_CONTROL *>(commandStream->getSpace(sizeof(PIPE_CONTROL)));
    *timestampCmd = PIPE_CONTROL::sInit();
    timestampCmd->setCommandStreamerStallEnable(true);
    timestampCmd->setPostSyncOperation(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP);
    const uint64_t globalStartAddress = reinterpret_cast<uint64_t>(&hwPerfCounter.HWTimeStamp.GlobalStartTS);
    const uint32_t globalStartLow = static_cast<uint32_t>(globalStartAddress & static_cast<uint64_t>(UINT32_MAX));
    const uint32_t globalStartHigh = static_cast<uint32_t>(globalStartAddress >> 32);
    timestampCmd->setAddress(globalStartLow);
    timestampCmd->setAddressHigh(globalStartHigh);

    dispatchPerfCountersUserCounterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, true);

    commandQueue.sendPerfCountersConfig();
}
// Appends the "end" half of a perf-counter measurement to the command stream.
//
// Emitted in order: a stalling PIPE_CONTROL, the OA buffer state stores, a PIPE_CONTROL that
// writes a timestamp to HWTimeStamp.GlobalEndTS, an MI_REPORT_PERF_COUNT targeting
// reportEnd.oa, the general purpose counter stores, the NOOPID store, the core-frequency
// store and the user counter stores. Finally setCpuTimestamp() is called on the queue's
// perf-counter object.
//
// commandQueue  - supplies the perf-counter object (report id, CPU timestamp).
// hwPerfCounter - record that receives all end-side values.
// commandStream - stream the commands are appended to.
template <typename GfxFamily>
void dispatchPerfCountersCommandsEnd(
    CommandQueue &commandQueue,
    OCLRT::HwPerfCounter &hwPerfCounter,
    OCLRT::LinearStream *commandStream) {
    using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
    using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
    using MI_REPORT_PERF_COUNT = typename GfxFamily::MI_REPORT_PERF_COUNT;

    auto perfCounters = commandQueue.getPerfCounters();
    const uint32_t reportId = perfCounters->getCurrentReportId();

    // Flush the command streamer.
    auto *flushCmd = reinterpret_cast<PIPE_CONTROL *>(commandStream->getSpace(sizeof(PIPE_CONTROL)));
    *flushCmd = PIPE_CONTROL::sInit();
    flushCmd->setCommandStreamerStallEnable(true);

    dispatchPerfCountersOABufferStateCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream);

    // Timestamp: Global End. The 64-bit destination is programmed as low/high 32-bit halves.
    auto *timestampCmd = reinterpret_cast<PIPE_CONTROL *>(commandStream->getSpace(sizeof(PIPE_CONTROL)));
    *timestampCmd = PIPE_CONTROL::sInit();
    timestampCmd->setCommandStreamerStallEnable(true);
    timestampCmd->setPostSyncOperation(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP);
    const uint64_t globalEndAddress = reinterpret_cast<uint64_t>(&hwPerfCounter.HWTimeStamp.GlobalEndTS);
    const uint32_t globalEndLow = static_cast<uint32_t>(globalEndAddress & static_cast<uint64_t>(UINT32_MAX));
    const uint32_t globalEndHigh = static_cast<uint32_t>(globalEndAddress >> 32);
    timestampCmd->setAddress(globalEndLow);
    timestampCmd->setAddressHigh(globalEndHigh);

    // Report written to the end-side OA slot.
    auto *reportCmd = reinterpret_cast<MI_REPORT_PERF_COUNT *>(commandStream->getSpace(sizeof(MI_REPORT_PERF_COUNT)));
    *reportCmd = MI_REPORT_PERF_COUNT::sInit();
    reportCmd->setReportId(reportId);
    const uint64_t oaReportAddress = reinterpret_cast<uint64_t>(&hwPerfCounter.HWPerfCounters.reportEnd.oa);
    reportCmd->setMemoryAddress(oaReportAddress);

    // General purpose counters, NOOPID register, core frequency, then the user counters.
    dispatchPerfCountersGeneralPurposeCounterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, false);
    dispatchPerfCountersNoopidRegisterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, false);
    dispatchPerfCountersReadFreqRegisterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, false);
    dispatchPerfCountersUserCounterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, false);

    perfCounters->setCpuTimestamp();
}
// Emits every command needed to run the kernels described by multiDispatchInfo.
//
// For each DispatchInfo the kernel's patchable work-size / offset constants are written,
// indirect state is sent through KernelCommandsHelper::sendIndirectState and a GPGPU_WALKER
// is appended to the command stream. Profiling and perf-counter "start" commands are emitted
// just before the first walker; the matching "end" commands are emitted after the last one.
//
// commandQueue        - queue supplying the command stream and heaps when blockQueue is false.
// multiDispatchInfo   - dispatches to program; begin() is dereferenced unconditionally, so the
//                       caller must not pass an empty container.
// numEventsInWaitList - not referenced in this function body.
// eventWaitList       - not referenced in this function body.
// blockedCommandsData - when blockQueue is true, receives a newly allocated KernelOperation
//                       constructed from the stream and heaps created here.
// hwTimeStamps        - if not nullptr, profiling commands are emitted.
// hwPerfCounter       - if not nullptr, perf-counter commands are emitted.
// blockQueue          - true: program into a freshly allocated stream and heaps;
//                       false: program into the queue's own stream and heaps.
// commandType         - local work-group size hints are requested only when this equals
//                       CL_COMMAND_NDRANGE_KERNEL.
template <typename GfxFamily>
void dispatchWalker(
CommandQueue &commandQueue,
const MultiDispatchInfo &multiDispatchInfo,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
KernelOperation **blockedCommandsData,
HwTimeStamps *hwTimeStamps,
OCLRT::HwPerfCounter *hwPerfCounter,
bool blockQueue = false,
unsigned int commandType = 0) {
OCLRT::LinearStream *commandStream = nullptr;
OCLRT::IndirectHeap *dsh = nullptr, *ish = nullptr, *ioh = nullptr, *ssh = nullptr;
// The first dispatch decides whether the whole submission uses the execution model (parent kernel) path.
bool executionModelKernel = multiDispatchInfo.begin()->getKernel()->isParentKernel;
// Allocate command stream and indirect heaps
if (blockQueue) {
using KCH = KernelCommandsHelper<GfxFamily>;
// Blocked path: the command stream is backed by one page-aligned page of memory.
commandStream = new LinearStream(alignedMalloc(MemoryConstants::pageSize, MemoryConstants::pageSize), MemoryConstants::pageSize);
if (executionModelKernel) {
uint32_t offsetDsh = commandQueue.getContext().getDefaultDeviceQueue()->getDshOffset();
uint32_t colorCalcSize = commandQueue.getContext().getDefaultDeviceQueue()->colorCalcStateSize;
// Parent kernel: one heap sized for DSH + IOH + the device queue's DSH offset serves as both DSH and IOH.
dsh = allocateIndirectHeap([&multiDispatchInfo, offsetDsh] { return KCH::getTotalSizeRequiredDSH(multiDispatchInfo) + KCH::getTotalSizeRequiredIOH(multiDispatchInfo) + offsetDsh; });
dsh->getSpace(colorCalcSize);
ioh = dsh;
} else {
dsh = allocateIndirectHeap([&multiDispatchInfo] { return KCH::getTotalSizeRequiredDSH(multiDispatchInfo); });
ioh = allocateIndirectHeap([&multiDispatchInfo] { return KCH::getTotalSizeRequiredIOH(multiDispatchInfo); });
}
ish = allocateIndirectHeap([&multiDispatchInfo] { return KCH::getTotalSizeRequiredIH(multiDispatchInfo); });
ssh = allocateIndirectHeap([&multiDispatchInfo] { return KCH::getTotalSizeRequiredSSH(multiDispatchInfo); });
using UniqueIH = std::unique_ptr<IndirectHeap>;
// NOTE(review): on the parent-kernel path dsh and ioh are the same pointer and both are wrapped in a
// unique_ptr here - presumably KernelOperation accounts for that on destruction; confirm.
*blockedCommandsData = new KernelOperation(std::unique_ptr<LinearStream>(commandStream), UniqueIH(dsh),
UniqueIH(ish), UniqueIH(ioh), UniqueIH(ssh));
if (executionModelKernel)
(*blockedCommandsData)->doNotFreeISH = true;
} else {
// Non-blocked path: program directly into the queue's stream and heaps.
commandStream = &commandQueue.getCS(0);
// For a parent kernel, a surface state heap that already has data in it is released before a heap is requested below.
if (executionModelKernel && (commandQueue.getIndirectHeap(IndirectHeap::SURFACE_STATE, 0).getUsed() > 0)) {
commandQueue.releaseIndirectHeap(IndirectHeap::SURFACE_STATE);
}
dsh = &getIndirectHeap<GfxFamily, IndirectHeap::DYNAMIC_STATE>(commandQueue, multiDispatchInfo);
ish = &getIndirectHeap<GfxFamily, IndirectHeap::INSTRUCTION>(commandQueue, multiDispatchInfo);
ioh = &getIndirectHeap<GfxFamily, IndirectHeap::INDIRECT_OBJECT>(commandQueue, multiDispatchInfo);
ssh = &getIndirectHeap<GfxFamily, IndirectHeap::SURFACE_STATE>(commandQueue, multiDispatchInfo);
}
using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
// Reserve the interface descriptor table at the current (aligned) position of the DSH.
dsh->align(KernelCommandsHelper<GfxFamily>::alignInterfaceDescriptorData);
const size_t offsetInterfaceDescriptorTable = dsh->getUsed();
uint32_t interfaceDescriptorIndex = 0;
// One INTERFACE_DESCRIPTOR_DATA entry per dispatch.
size_t totalInterfaceDescriptorTableSize = sizeof(INTERFACE_DESCRIPTOR_DATA);
size_t numDispatches = multiDispatchInfo.size();
totalInterfaceDescriptorTableSize *= numDispatches;
if (!executionModelKernel) {
dsh->getSpace(totalInterfaceDescriptorTableSize);
} else {
// Parent kernel: advance the DSH so that its used size equals the device queue's DSH offset.
dsh->getSpace(commandQueue.getContext().getDefaultDeviceQueue()->getDshOffset() - dsh->getUsed());
}
// Program media interface descriptor load
KernelCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
*commandStream,
offsetInterfaceDescriptorTable,
totalInterfaceDescriptorTableSize);
DEBUG_BREAK_IF(offsetInterfaceDescriptorTable % 64 != 0);
for (auto &dispatchInfo : multiDispatchInfo) {
auto &kernel = *dispatchInfo.getKernel();
// Sanity checks: dim is 1..3 and unused dimensions have size 1 / offset 0.
DEBUG_BREAK_IF(!(dispatchInfo.getDim() >= 1 && dispatchInfo.getDim() <= 3));
DEBUG_BREAK_IF(!(dispatchInfo.getGWS().z == 1 || dispatchInfo.getDim() == 3));
DEBUG_BREAK_IF(!(dispatchInfo.getGWS().y == 1 || dispatchInfo.getDim() >= 2));
DEBUG_BREAK_IF(!(dispatchInfo.getOffset().z == 0 || dispatchInfo.getDim() == 3));
DEBUG_BREAK_IF(!(dispatchInfo.getOffset().y == 0 || dispatchInfo.getDim() >= 2));
// Determine SIMD size
uint32_t simd = kernel.getKernelInfo().getMaxSimdSize();
// If we don't have a required WGS, compute one opportunistically
auto maxWorkGroupSize = static_cast<uint32_t>(commandQueue.getDevice().getDeviceInfo().maxWorkGroupSize);
if (commandType == CL_COMMAND_NDRANGE_KERNEL) {
provideLocalWorkGroupSizeHints(commandQueue.getContextPtr(), maxWorkGroupSize, dispatchInfo);
}
//Get dispatch geometry
uint32_t dim = dispatchInfo.getDim();
Vec3<size_t> gws = dispatchInfo.getGWS();
Vec3<size_t> offset = dispatchInfo.getOffset();
Vec3<size_t> swgs = dispatchInfo.getStartOfWorkgroups();
// Compute local workgroup sizes
// In each selection below an x component of 0 is treated as "not provided" and a fallback is used.
Vec3<size_t> lws = (dispatchInfo.getLocalWorkgroupSize().x > 0) ? dispatchInfo.getLocalWorkgroupSize() : generateWorkgroupSize(dispatchInfo);
Vec3<size_t> elws = (dispatchInfo.getEnqueuedWorkgroupSize().x > 0) ? dispatchInfo.getEnqueuedWorkgroupSize() : lws;
// Compute number of work groups
Vec3<size_t> twgs = (dispatchInfo.getTotalNumberOfWorkgroups().x > 0) ? dispatchInfo.getTotalNumberOfWorkgroups() : generateWorkgroupsNumber(gws, lws);
Vec3<size_t> nwgs = (dispatchInfo.getNumberOfWorkgroups().x > 0) ? dispatchInfo.getNumberOfWorkgroups() : twgs;
// Patch our kernel constants
*kernel.globalWorkOffsetX = static_cast<uint32_t>(offset.x);
*kernel.globalWorkOffsetY = static_cast<uint32_t>(offset.y);
*kernel.globalWorkOffsetZ = static_cast<uint32_t>(offset.z);
*kernel.globalWorkSizeX = static_cast<uint32_t>(gws.x);
*kernel.globalWorkSizeY = static_cast<uint32_t>(gws.y);
*kernel.globalWorkSizeZ = static_cast<uint32_t>(gws.z);
// The primary local size is patched for the first dispatch, or for any dispatch whose
// secondary local size location is the dummy patch location.
if ((&dispatchInfo == &*multiDispatchInfo.begin()) || (kernel.localWorkSizeX2 == &Kernel::dummyPatchLocation)) {
*kernel.localWorkSizeX = static_cast<uint32_t>(lws.x);
*kernel.localWorkSizeY = static_cast<uint32_t>(lws.y);
*kernel.localWorkSizeZ = static_cast<uint32_t>(lws.z);
}
*kernel.localWorkSizeX2 = static_cast<uint32_t>(lws.x);
*kernel.localWorkSizeY2 = static_cast<uint32_t>(lws.y);
*kernel.localWorkSizeZ2 = static_cast<uint32_t>(lws.z);
*kernel.enqueuedLocalWorkSizeX = static_cast<uint32_t>(elws.x);
*kernel.enqueuedLocalWorkSizeY = static_cast<uint32_t>(elws.y);
*kernel.enqueuedLocalWorkSizeZ = static_cast<uint32_t>(elws.z);
// The total work-group count is patched only for the first dispatch.
if (&dispatchInfo == &*multiDispatchInfo.begin()) {
*kernel.numWorkGroupsX = static_cast<uint32_t>(twgs.x);
*kernel.numWorkGroupsY = static_cast<uint32_t>(twgs.y);
*kernel.numWorkGroupsZ = static_cast<uint32_t>(twgs.z);
}
*kernel.workDim = dim;
// Send our indirect object data
size_t localWorkSizes[3] = {lws.x, lws.y, lws.z};
auto offsetCrossThreadData = KernelCommandsHelper<GfxFamily>::sendIndirectState(
*commandStream,
*dsh,
*ish,
*ioh,
*ssh,
kernel,
simd,
localWorkSizes,
offsetInterfaceDescriptorTable,
interfaceDescriptorIndex);
// Start-of-measurement commands go in once, ahead of the first walker.
if (&dispatchInfo == &*multiDispatchInfo.begin()) {
// If hwTimeStamps is passed (not nullptr), then we know that profiling is enabled
if (hwTimeStamps != nullptr) {
dispatchProfilingCommandsStart<GfxFamily>(*hwTimeStamps, commandStream);
}
if (hwPerfCounter != nullptr) {
dispatchPerfCountersCommandsStart<GfxFamily>(commandQueue, *hwPerfCounter, commandStream);
}
}
// Implement enabling special WA DisableLSQCROPERFforOCL if needed
applyWADisableLSQCROPERFforOCL<GfxFamily>(commandStream, kernel, true);
// Program the walker. Invokes execution so all state should already be programmed
typedef typename GfxFamily::GPGPU_WALKER GPGPU_WALKER;
auto pGpGpuWalkerCmd = (GPGPU_WALKER *)commandStream->getSpace(sizeof(GPGPU_WALKER));
*pGpGpuWalkerCmd = GfxFamily::cmdInitGpgpuWalker;
size_t globalOffsets[3] = {offset.x, offset.y, offset.z};
size_t startWorkGroups[3] = {swgs.x, swgs.y, swgs.z};
size_t numWorkGroups[3] = {nwgs.x, nwgs.y, nwgs.z};
auto localWorkSize = setGpgpuWalkerThreadData<GfxFamily>(pGpGpuWalkerCmd, globalOffsets, startWorkGroups, numWorkGroups, localWorkSizes, simd);
pGpGpuWalkerCmd->setIndirectDataStartAddress((uint32_t)offsetCrossThreadData);
DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0);
// Post-increment: this walker uses the current descriptor index, the next dispatch gets the following one.
pGpGpuWalkerCmd->setInterfaceDescriptorOffset(interfaceDescriptorIndex++);
auto threadPayload = kernel.getKernelInfo().patchInfo.threadPayload;
DEBUG_BREAK_IF(nullptr == threadPayload);
// Indirect data length = cross-thread data + per-thread data (at least one GRF per thread), aligned up.
auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload);
auto localIdSizePerThread = PerThreadDataHelper::getLocalIdSizePerThread(simd, numChannels);
localIdSizePerThread = std::max(localIdSizePerThread, sizeof(GRF));
auto sizePerThreadDataTotal = getThreadsPerWG(simd, localWorkSize) * localIdSizePerThread;
DEBUG_BREAK_IF(sizePerThreadDataTotal == 0); // Hardware requires at least 1 GRF of perThreadData for each thread in thread group
auto sizeCrossThreadData = kernel.getCrossThreadDataSize();
auto IndirectDataLength = alignUp((uint32_t)(sizeCrossThreadData + sizePerThreadDataTotal), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
pGpGpuWalkerCmd->setIndirectDataLength(IndirectDataLength);
// Implement disabling special WA DisableLSQCROPERFforOCL if needed
applyWADisableLSQCROPERFforOCL<GfxFamily>(commandStream, kernel, false);
}
// End-of-measurement commands go in once, after the last walker.
// If hwTimeStamps is passed (not nullptr), then we know that profiling is enabled
if (hwTimeStamps != nullptr) {
dispatchProfilingCommandsEnd<GfxFamily>(*hwTimeStamps, commandStream);
}
if (hwPerfCounter != nullptr) {
dispatchPerfCountersCommandsEnd<GfxFamily>(commandQueue, *hwPerfCounter, commandStream);
}
}
// Convenience overload for a single kernel: builds one DispatchInfo from the raw NDRange
// description and forwards to the MultiDispatchInfo-based dispatchWalker.
//
// kernel            - kernel to run; constness is cast away because DispatchInfo is built
//                     from a non-const Kernel pointer.
// workDim           - number of dimensions of the dispatch.
// globalOffsets     - global work offsets.
// workItems         - global work sizes.
// localWorkSizesIn  - requested local work sizes, forwarded to DispatchInfo as-is.
// Remaining parameters are forwarded unchanged; commandType is left at the callee's default.
template <typename GfxFamily>
void dispatchWalker(
    CommandQueue &commandQueue,
    const Kernel &kernel,
    cl_uint workDim,
    const size_t globalOffsets[3],
    const size_t workItems[3],
    const size_t *localWorkSizesIn,
    cl_uint numEventsInWaitList,
    const cl_event *eventWaitList,
    KernelOperation **blockedCommandsData,
    HwTimeStamps *hwTimeStamps,
    HwPerfCounter *hwPerfCounter,
    bool blockQueue = false) {
    Kernel *mutableKernel = const_cast<Kernel *>(&kernel);
    DispatchInfo singleDispatch(mutableKernel, workDim, workItems, localWorkSizesIn, globalOffsets);

    dispatchWalker<GfxFamily>(
        commandQueue,
        singleDispatch,
        numEventsInWaitList,
        eventWaitList,
        blockedCommandsData,
        hwTimeStamps,
        hwPerfCounter,
        blockQueue);
}
// Programs one run of the scheduler kernel into the command queue's command stream.
//
// The sequence is: pipe control, media interface descriptor load, patching of the scheduler's
// work-size constants, indirect state, a GPGPU_WALKER, and - unless
// devQueueHw.getSchedulerReturnInstance() equals 1 - a pipe control followed by an
// MI_BATCH_BUFFER_START that jumps to the device queue's SLB buffer.
//
// commandQueue - supplies the command stream, the instruction and surface state heaps,
//                and the command stream receiver used to add pipe controls.
// devQueueHw   - device queue supplying the dynamic state heap, interface descriptor layout,
//                scheduler cross-thread data and the SLB buffer address.
// scheduler    - scheduler kernel to dispatch; its patch locations are written here.
template <typename GfxFamily>
void dispatchScheduler(
CommandQueue &commandQueue,
DeviceQueueHw<GfxFamily> &devQueueHw,
SchedulerKernel &scheduler) {
using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;
OCLRT::LinearStream *commandStream = nullptr;
OCLRT::IndirectHeap *dsh = nullptr, *ish = nullptr, *ioh = nullptr, *ssh = nullptr;
commandStream = &commandQueue.getCS(0);
// note : below code assumes that caller to dispatchScheduler "preallocated" memory
// required for execution model in below heap managers
dsh = devQueueHw.getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
ish = &commandQueue.getIndirectHeap(IndirectHeap::INSTRUCTION);
ssh = &commandQueue.getIndirectHeap(IndirectHeap::SURFACE_STATE);
// Leading pipe control is added with dcFlush disabled.
bool dcFlush = false;
commandQueue.getDevice().getCommandStreamReceiver().addPipeControl(*commandStream, dcFlush);
// The scheduler uses a fixed interface descriptor slot; the table starts at colorCalcStateSize in the DSH.
uint32_t interfaceDescriptorIndex = devQueueHw.schedulerIDIndex;
const size_t offsetInterfaceDescriptorTable = devQueueHw.colorCalcStateSize;
const size_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable;
const size_t totalInterfaceDescriptorTableSize = devQueueHw.interfaceDescriptorEntries * sizeof(INTERFACE_DESCRIPTOR_DATA);
// Program media interface descriptor load
KernelCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
*commandStream,
offsetInterfaceDescriptor,
totalInterfaceDescriptorTableSize);
DEBUG_BREAK_IF(offsetInterfaceDescriptorTable % 64 != 0);
// Determine SIMD size
uint32_t simd = scheduler.getKernelInfo().getMaxSimdSize();
DEBUG_BREAK_IF(simd != PARALLEL_SCHEDULER_COMPILATION_SIZE_20);
// Patch our kernel constants
// The scheduler is dispatched as a 1-D range: getGws() global size, getLws() local size.
*scheduler.globalWorkOffsetX = 0;
*scheduler.globalWorkOffsetY = 0;
*scheduler.globalWorkOffsetZ = 0;
*scheduler.globalWorkSizeX = (uint32_t)scheduler.getGws();
*scheduler.globalWorkSizeY = 1;
*scheduler.globalWorkSizeZ = 1;
*scheduler.localWorkSizeX = (uint32_t)scheduler.getLws();
*scheduler.localWorkSizeY = 1;
*scheduler.localWorkSizeZ = 1;
*scheduler.localWorkSizeX2 = (uint32_t)scheduler.getLws();
*scheduler.localWorkSizeY2 = 1;
*scheduler.localWorkSizeZ2 = 1;
*scheduler.enqueuedLocalWorkSizeX = (uint32_t)scheduler.getLws();
*scheduler.enqueuedLocalWorkSizeY = 1;
*scheduler.enqueuedLocalWorkSizeZ = 1;
*scheduler.numWorkGroupsX = (uint32_t)(scheduler.getGws() / scheduler.getLws());
// NOTE(review): Y/Z work-group counts are patched to 0 here while the walker below is
// programmed with 1 in those dimensions - confirm this is intended.
*scheduler.numWorkGroupsY = 0;
*scheduler.numWorkGroupsZ = 0;
*scheduler.workDim = 1;
// Send our indirect object data
size_t localWorkSizes[3] = {scheduler.getLws(), 1, 1};
// Create indirectHeap for IOH that is located at the end of device enqueue DSH
size_t curbeOffset = devQueueHw.setSchedulerCrossThreadData(scheduler);
// Temporary heap over the same memory as the DSH; space up to curbeOffset is consumed
// before it is handed to sendIndirectState as the IOH.
IndirectHeap indirectObjectHeap(dsh->getBase(), dsh->getMaxAvailableSpace());
indirectObjectHeap.getSpace(curbeOffset);
ioh = &indirectObjectHeap;
auto offsetCrossThreadData = KernelCommandsHelper<GfxFamily>::sendIndirectState(
*commandStream,
*dsh,
*ish,
*ioh,
*ssh,
scheduler,
simd,
localWorkSizes,
offsetInterfaceDescriptorTable,
interfaceDescriptorIndex);
// Implement enabling special WA DisableLSQCROPERFforOCL if needed
applyWADisableLSQCROPERFforOCL<GfxFamily>(commandStream, scheduler, true);
// Program the walker. Invokes execution so all state should already be programmed
auto pGpGpuWalkerCmd = (GPGPU_WALKER *)commandStream->getSpace(sizeof(GPGPU_WALKER));
*pGpGpuWalkerCmd = GfxFamily::cmdInitGpgpuWalker;
// globalOffsets (all zeros) is passed for both the global offsets and the start work-groups.
size_t globalOffsets[3] = {0, 0, 0};
size_t workGroups[3] = {(scheduler.getGws() / scheduler.getLws()), 1, 1};
auto localWorkSize = setGpgpuWalkerThreadData<GfxFamily>(pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes, simd);
pGpGpuWalkerCmd->setIndirectDataStartAddress((uint32_t)offsetCrossThreadData);
DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0);
pGpGpuWalkerCmd->setInterfaceDescriptorOffset(interfaceDescriptorIndex);
auto threadPayload = scheduler.getKernelInfo().patchInfo.threadPayload;
DEBUG_BREAK_IF(nullptr == threadPayload);
// Indirect data length = cross-thread data + per-thread data (at least one GRF per thread), aligned up.
auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload);
auto localIdSizePerThread = PerThreadDataHelper::getLocalIdSizePerThread(simd, numChannels);
localIdSizePerThread = std::max(localIdSizePerThread, sizeof(GRF));
auto sizePerThreadDataTotal = getThreadsPerWG(simd, localWorkSize) * localIdSizePerThread;
DEBUG_BREAK_IF(sizePerThreadDataTotal == 0); // Hardware requires at least 1 GRF of perThreadData for each thread in thread group
auto sizeCrossThreadData = scheduler.getCrossThreadDataSize();
auto IndirectDataLength = alignUp((uint32_t)(sizeCrossThreadData + sizePerThreadDataTotal), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
pGpGpuWalkerCmd->setIndirectDataLength(IndirectDataLength);
// Implement disabling special WA DisableLSQCROPERFforOCL if needed
applyWADisableLSQCROPERFforOCL<GfxFamily>(commandStream, scheduler, false);
// Do not put BB_START only when returning in first Scheduler run
if (devQueueHw.getSchedulerReturnInstance() != 1) {
// This pipe control, unlike the leading one, is added with dcFlush enabled.
commandQueue.getDevice().getCommandStreamReceiver().addPipeControl(*commandStream, true);
// Add BB Start Cmd to the SLB in the Primary Batch Buffer
auto *bbStart = (MI_BATCH_BUFFER_START *)commandStream->getSpace(sizeof(MI_BATCH_BUFFER_START));
*bbStart = MI_BATCH_BUFFER_START::sInit();
bbStart->setSecondLevelBatchBuffer(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH);
uint64_t slbAddress = devQueueHw.getSlbBuffer()->getGpuAddress();
bbStart->setBatchBufferStartAddressGraphicsaddress472(slbAddress);
}
}
// Computes how much command stream space an enqueue of the given event type needs.
//
// This primary template must not be used for CL_COMMAND_NDRANGE_KERNEL, CL_COMMAND_MARKER or
// CL_COMMAND_MIGRATE_MEM_OBJECTS; the static_asserts below direct those to a specialization.
template <typename GfxFamily, unsigned int eventType>
struct EnqueueOperation {
    static_assert(eventType != CL_COMMAND_NDRANGE_KERNEL, "for eventType CL_COMMAND_NDRANGE_KERNEL use specialization class");
    static_assert(eventType != CL_COMMAND_MARKER, "for eventType CL_COMMAND_MARKER use specialization class");
    static_assert(eventType != CL_COMMAND_MIGRATE_MEM_OBJECTS, "for eventType CL_COMMAND_MIGRATE_MEM_OBJECTS use specialization class");

    // Size for a whole MultiDispatchInfo: the shared (per-enqueue) part is counted once, then
    // one GPGPU_WALKER plus the per-kernel workaround space is added for every dispatch.
    //
    // reserveProfilingCmdsSpace - add room for the profiling start/end commands.
    // reservePerfCounters       - add room for the perf-counter start/end commands.
    // commandQueue              - queried for the number of user perf-counter registers.
    // multiDispatchInfo         - dispatches that will be programmed.
    static size_t getTotalSizeRequiredCS(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) {
        size_t size = getSizeRequiredCommon(reserveProfilingCmdsSpace, reservePerfCounters, commandQueue);
        for (auto &dispatchInfo : multiDispatchInfo) {
            auto &kernel = *dispatchInfo.getKernel();
            size += sizeof(typename GfxFamily::GPGPU_WALKER);
            size += getSizeForWADisableLSQCROPERFforOCL<GfxFamily>(&kernel);
        }
        return size;
    }

    // Size for a single kernel: one GPGPU_WALKER, the shared (per-enqueue) part and the
    // workaround space for pKernel.
    //
    // Parameters have the same meaning as in getTotalSizeRequiredCS; pKernel is the kernel
    // that will be dispatched.
    static size_t getSizeRequiredCS(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel) {
        size_t size = sizeof(typename GfxFamily::GPGPU_WALKER) +
                      getSizeRequiredCommon(reserveProfilingCmdsSpace, reservePerfCounters, commandQueue);
        size += getSizeForWADisableLSQCROPERFforOCL<GfxFamily>(pKernel);
        return size;
    }

  private:
    using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
    using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
    using MI_REPORT_PERF_COUNT = typename GfxFamily::MI_REPORT_PERF_COUNT;

    // Space that does not depend on the number of dispatched kernels: the helper's base size,
    // one PIPE_CONTROL (two when the pipe control workaround is required) and the optional
    // profiling / perf-counter commands.
    static size_t getSizeRequiredCommon(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue) {
        size_t size = KernelCommandsHelper<GfxFamily>::getSizeRequiredCS() +
                      sizeof(PIPE_CONTROL) * (KernelCommandsHelper<GfxFamily>::isPipeControlWArequired() ? 2 : 1);
        if (reserveProfilingCmdsSpace) {
            // start + end: one PIPE_CONTROL and two register stores each
            size += 2 * sizeof(PIPE_CONTROL) + 4 * sizeof(MI_STORE_REGISTER_MEM);
        }
        if (reservePerfCounters) {
            size += getSizeRequiredForPerfCounters(commandQueue);
        }
        return size;
    }

    // Space for the perf-counter start and end command sequences.
    static size_t getSizeRequiredForPerfCounters(CommandQueue &commandQueue) {
        // The same number of user register stores is emitted at start and at end.
        const size_t userRegistersSize = commandQueue.getPerfCountersUserRegistersNumber() * sizeof(MI_STORE_REGISTER_MEM);
        const size_t gpRegistersSize = OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(MI_STORE_REGISTER_MEM);

        //start cmds
        size_t startSize = 0;
        //P_C: flush CS & TimeStamp BEGIN
        startSize += 2 * sizeof(PIPE_CONTROL);
        //SRM NOOPID & Frequency
        startSize += 2 * sizeof(MI_STORE_REGISTER_MEM);
        //gp registers
        startSize += gpRegistersSize;
        //report perf count
        startSize += sizeof(MI_REPORT_PERF_COUNT);
        //user registers
        startSize += userRegistersSize;

        //end cmds
        size_t endSize = 0;
        //P_C: flush CS & TimeStamp END
        endSize += 2 * sizeof(PIPE_CONTROL);
        //OA buffer (status, head, tail)
        endSize += 3 * sizeof(MI_STORE_REGISTER_MEM);
        //report perf count
        endSize += sizeof(MI_REPORT_PERF_COUNT);
        //gp registers
        endSize += gpRegistersSize;
        //SRM NOOPID & Frequency
        endSize += 2 * sizeof(MI_STORE_REGISTER_MEM);
        //user registers
        endSize += userRegistersSize;

        return startSize + endSize;
    }
};
// Returns the queue's command stream, requesting the space that
// EnqueueOperation<GfxFamily, eventType>::getSizeRequiredCS reports for a single kernel.
//
// reserveProfilingCmdsSpace / reservePerfCounterCmdsSpace - forwarded to the size computation.
// pKernel - kernel that is going to be dispatched.
template <typename GfxFamily, unsigned int eventType>
LinearStream &getCommandStream(CommandQueue &commandQueue, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, const Kernel *pKernel) {
    using Operation = EnqueueOperation<GfxFamily, eventType>;

    const auto requiredSize = Operation::getSizeRequiredCS(
        reserveProfilingCmdsSpace,
        reservePerfCounterCmdsSpace,
        commandQueue,
        pKernel);

    return commandQueue.getCS(requiredSize);
}
// Returns the queue's command stream, requesting enough space for every dispatch in
// multiDispatchInfo.
//
// The per-kernel size from EnqueueOperation<GfxFamily, eventType>::getSizeRequiredCS is summed
// over all dispatches. When the first dispatch's kernel is a parent kernel, the size for the
// scheduler kernel (obtained from BuiltIns) is added as well.
template <typename GfxFamily, unsigned int eventType>
LinearStream &getCommandStream(CommandQueue &commandQueue, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, const MultiDispatchInfo &multiDispatchInfo) {
    using Operation = EnqueueOperation<GfxFamily, eventType>;

    // An empty container has no first kernel to inspect.
    Kernel *firstKernel = nullptr;
    if (multiDispatchInfo.size() > 0) {
        firstKernel = multiDispatchInfo.begin()->getKernel();
    }

    size_t totalSize = 0;
    for (auto &dispatchInfo : multiDispatchInfo) {
        totalSize += Operation::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, dispatchInfo.getKernel());
    }

    const bool needsScheduler = (firstKernel != nullptr) && firstKernel->isParentKernel;
    if (needsScheduler) {
        SchedulerKernel &schedulerKernel = BuiltIns::getInstance().getSchedulerKernel(firstKernel->getContext());
        totalSize += Operation::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, &schedulerKernel);
    }

    return commandQueue.getCS(totalSize);
}
// Returns the indirect heap of type heapType to use for the given dispatches.
//
// The required size comes from the matching KernelCommandsHelper::getTotalSizeRequired* call.
// When the first dispatch's kernel is a parent kernel:
//   - for INSTRUCTION and SURFACE_STATE the execution-model size is added to the request;
//   - for the other heap types the default device queue's DYNAMIC_STATE heap is used instead.
// If no device-queue heap was selected (or it is nullptr), the heap is requested from the
// command queue with the computed size.
//
// multiDispatchInfo.begin() is dereferenced unconditionally, so it must not be empty.
template <typename GfxFamily, IndirectHeap::Type heapType>
IndirectHeap &getIndirectHeap(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) {
    using KCH = KernelCommandsHelper<GfxFamily>;

    size_t requiredSize = 0;
    switch (heapType) {
    case IndirectHeap::DYNAMIC_STATE:
        requiredSize = KCH::getTotalSizeRequiredDSH(multiDispatchInfo);
        break;
    case IndirectHeap::INSTRUCTION:
        requiredSize = KCH::getTotalSizeRequiredIH(multiDispatchInfo);
        break;
    case IndirectHeap::INDIRECT_OBJECT:
        requiredSize = KCH::getTotalSizeRequiredIOH(multiDispatchInfo);
        break;
    case IndirectHeap::SURFACE_STATE:
        requiredSize = KCH::getTotalSizeRequiredSSH(multiDispatchInfo);
        break;
    }

    IndirectHeap *deviceQueueHeap = nullptr;
    const Kernel &firstKernel = *multiDispatchInfo.begin()->getKernel();
    if (firstKernel.isParentKernel) {
        const bool sizedForExecutionModel = (heapType == IndirectHeap::INSTRUCTION) || (heapType == IndirectHeap::SURFACE_STATE);
        if (sizedForExecutionModel) {
            requiredSize += KCH::template getSizeRequiredForExecutionModel<heapType>(firstKernel);
        } else {
            // DYNAMIC_STATE and INDIRECT_OBJECT both live in the device queue's dynamic state heap.
            DeviceQueueHw<GfxFamily> *pDevQueue = castToObject<DeviceQueueHw<GfxFamily>>(commandQueue.getContext().getDefaultDeviceQueue());
            DEBUG_BREAK_IF(pDevQueue == nullptr);
            deviceQueueHeap = pDevQueue->getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
        }
    }

    if (deviceQueueHeap != nullptr) {
        return *deviceQueueHeap;
    }
    return commandQueue.getIndirectHeap(heapType, requiredSize);
}
} // namespace OCLRT

View File

@@ -0,0 +1,35 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
// Declarations of the helpers for the "WADisableLSQCROPERFforOCL" workaround.
// NOTE(review): this header contains no #include lines of its own, so
// LinearStream, Kernel and size_t must already be declared by the including file.
namespace OCLRT {
// presumably the LQSC_RO_PERF_DIS bit mask within the L3SQC_REG4 register - TODO confirm against the hardware documentation
#define L3SQC_BIT_LQSC_RO_PERF_DIS 0x08000000
// presumably the address/offset of the L3SQC_REG4 register - TODO confirm against the hardware documentation
#define L3SQC_REG4 0xB118
// Declaration only; the definition is provided elsewhere.
// NOTE(review): judging by the name and parameters this emits the workaround
// commands for `kernel` into `pCommandStream`, with `disablePerfMode` selecting
// whether the perf mode is disabled or restored - confirm against the definition.
template <typename GfxFamily>
void applyWADisableLSQCROPERFforOCL(OCLRT::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode);
// Declaration only; the definition is provided elsewhere.
// NOTE(review): presumably returns the command stream space needed by
// applyWADisableLSQCROPERFforOCL for `pKernel` - confirm against the definition.
template <typename GfxFamily>
size_t getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel);
}

View File

@@ -0,0 +1,112 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
namespace OCLRT {
// Constants used by addAluReadModifyWriteRegister() when building its command sequence.

// Register addresses of the two scratch registers used by addAluReadModifyWriteRegister():
// CS_GPR_R0 receives the original register value and the result, CS_GPR_R1 receives the mask.
#define CS_GPR_R0 0x2600
#define CS_GPR_R1 0x2608
// Number of MI_MATH_ALU_INST_INLINE entries that follow the MI_MATH header (load, load, operation, store).
#define NUM_ALU_INST_FOR_READ_MODIFY_WRITE 4
// Values written to the ALUOpcode field of an ALU instruction.
// LOAD and STORE are used directly by addAluReadModifyWriteRegister();
// OR and AND are presumably meant to be passed as its `operation` argument - confirm with callers.
#define ALU_OPCODE_LOAD 0x080
#define ALU_OPCODE_STORE 0x180
#define ALU_OPCODE_OR 0x103
#define ALU_OPCODE_AND 0x102
// Values written to the Operand1/Operand2 fields of an ALU instruction.
// R_0 / R_1 are used together with CS_GPR_R0 / CS_GPR_R1, SRCA / SRCB are the
// load targets for the two inputs and ACCU is the source of the stored result.
#define ALU_REGISTER_R_0 0x0
#define ALU_REGISTER_R_1 0x1
#define ALU_REGISTER_R_SRCA 0x20
#define ALU_REGISTER_R_SRCB 0x21
#define ALU_REGISTER_R_ACCU 0x31
// Appends to the command stream a sequence that performs a read-modify-write
// of a register:
//     Register = Register <operation> mask
//
// Emitted commands, in order:
//   1. MI_LOAD_REGISTER_REG  - Register  -> CS_GPR_R0
//   2. MI_LOAD_REGISTER_IMM  - mask      -> CS_GPR_R1
//   3. MI_MATH with NUM_ALU_INST_FOR_READ_MODIFY_WRITE ALU instructions
//   4. MI_LOAD_REGISTER_REG  - CS_GPR_R0 -> Register
//   5. PIPE_CONTROL with stall, flush and cache invalidation bits set
//
// pCommandStream - stream from which space for the commands is taken
// aluRegister    - address of the register being modified
// operation      - value placed in the ALUOpcode field of the third ALU instruction
// mask           - immediate value used as the second input of the operation
template <typename GfxFamily>
void addAluReadModifyWriteRegister(
    OCLRT::LinearStream *pCommandStream,
    uint32_t aluRegister,
    uint32_t operation,
    uint32_t mask) {
    using MI_LOAD_REGISTER_REG = typename GfxFamily::MI_LOAD_REGISTER_REG;
    using MI_LOAD_REGISTER_IMM = typename GfxFamily::MI_LOAD_REGISTER_IMM;
    using MI_MATH = typename GfxFamily::MI_MATH;
    using MI_MATH_ALU_INST_INLINE = typename GfxFamily::MI_MATH_ALU_INST_INLINE;
    using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;

    // 1. Copy the current register value into CS_GPR_R0
    auto loadRegisterToGpr = reinterpret_cast<MI_LOAD_REGISTER_REG *>(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_REG)));
    *loadRegisterToGpr = MI_LOAD_REGISTER_REG::sInit();
    loadRegisterToGpr->setSourceRegisterAddress(aluRegister);
    loadRegisterToGpr->setDestinationRegisterAddress(CS_GPR_R0);

    // 2. Place the mask in CS_GPR_R1
    auto loadMaskToGpr = reinterpret_cast<MI_LOAD_REGISTER_IMM *>(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_IMM)));
    *loadMaskToGpr = MI_LOAD_REGISTER_IMM::sInit();
    loadMaskToGpr->setRegisterOffset(CS_GPR_R1);
    loadMaskToGpr->setDataDword(mask);

    // 3. MI_MATH header followed by the ALU instructions
    auto mathDwords = reinterpret_cast<uint32_t *>(pCommandStream->getSpace(sizeof(MI_MATH) + NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE)));
    auto mathHeader = reinterpret_cast<MI_MATH *>(mathDwords);
    mathHeader->DW0.Value = 0x0;
    mathHeader->DW0.BitField.InstructionType = MI_MATH::COMMAND_TYPE_MI_COMMAND;
    mathHeader->DW0.BitField.InstructionOpcode = MI_MATH::MI_COMMAND_OPCODE_MI_MATH;
    // The command is 5 dwords long (1 header + 4 ALU instructions); the length field holds that minus 2
    mathHeader->DW0.BitField.DwordLength = NUM_ALU_INST_FOR_READ_MODIFY_WRITE - 1;

    // The ALU instructions start one dword past the beginning of the MI_MATH command
    auto aluInstructions = reinterpret_cast<MI_MATH_ALU_INST_INLINE *>(mathDwords + 1);
    auto programAlu = [](MI_MATH_ALU_INST_INLINE &instruction, uint32_t opcode, uint32_t operand1, uint32_t operand2) {
        instruction.DW0.BitField.ALUOpcode = opcode;
        instruction.DW0.BitField.Operand1 = operand1;
        instruction.DW0.BitField.Operand2 = operand2;
    };
    // source A <- CS_GPR_R0
    programAlu(aluInstructions[0], ALU_OPCODE_LOAD, ALU_REGISTER_R_SRCA, ALU_REGISTER_R_0);
    // source B <- CS_GPR_R1
    programAlu(aluInstructions[1], ALU_OPCODE_LOAD, ALU_REGISTER_R_SRCB, ALU_REGISTER_R_1);
    // requested operation on sources A and B
    programAlu(aluInstructions[2], operation, 0, 0);
    // CS_GPR_R0 <- accumulator
    programAlu(aluInstructions[3], ALU_OPCODE_STORE, ALU_REGISTER_R_0, ALU_REGISTER_R_ACCU);

    // 4. Write the result held in CS_GPR_R0 back to the register
    auto storeGprToRegister = reinterpret_cast<MI_LOAD_REGISTER_REG *>(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_REG)));
    *storeGprToRegister = MI_LOAD_REGISTER_REG::sInit();
    storeGprToRegister->setSourceRegisterAddress(CS_GPR_R0);
    storeGprToRegister->setDestinationRegisterAddress(aluRegister);

    // 5. PIPE_CONTROL that flushes/invalidates caches
    auto pipeControl = reinterpret_cast<PIPE_CONTROL *>(pCommandStream->getSpace(sizeof(PIPE_CONTROL)));
    *pipeControl = PIPE_CONTROL::sInit();
    pipeControl->setCommandStreamerStallEnable(true);
    pipeControl->setDcFlushEnable(true);
    pipeControl->setTextureCacheInvalidationEnable(true);
    pipeControl->setPipeControlFlushEnable(true);
    pipeControl->setStateCacheInvalidationEnable(true);
}
}

View File

@@ -0,0 +1,55 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include "hw_cmds.h"
#include "runtime/command_queue/command_queue_hw.h"
#include "runtime/command_stream/command_stream_receiver.h"
#include "runtime/device/device.h"
#include "runtime/event/event.h"
#include "runtime/memory_manager/surface.h"
#include <new>
namespace OCLRT {
// Enqueues a CL_COMMAND_BARRIER through the common enqueueHandler path.
//
// numEventsInWaitList / eventWaitList - events the barrier waits for
// event                               - optional output event, forwarded to enqueueHandler
//
// No kernel and no work sizes are passed; the call is non-blocking.
// Always returns CL_SUCCESS.
template <typename GfxFamily>
cl_int CommandQueueHw<GfxFamily>::enqueueBarrierWithWaitList(
    cl_uint numEventsInWaitList,
    const cl_event *eventWaitList,
    cl_event *event) {
    // enqueueHandler takes an array of surfaces; a single NullSurface is supplied
    NullSurface nullSurface;
    Surface *surfaces[] = {&nullSurface};
    cl_uint workDim = 1;

    enqueueHandler<CL_COMMAND_BARRIER>(surfaces, false, nullptr, workDim, nullptr, nullptr, nullptr,
                                       numEventsInWaitList, eventWaitList, event);
    return CL_SUCCESS;
}
}

View File

@@ -0,0 +1,687 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include "runtime/builtin_kernels_simulation/scheduler_simulation.h"
#include "hw_cmds.h"
#include "runtime/command_queue/command_queue_hw.h"
#include "runtime/command_queue/dispatch_walker.h"
#include "runtime/command_stream/command_stream_receiver.h"
#include "runtime/event/event_builder.h"
#include "runtime/helpers/kernel_commands.h"
#include "runtime/helpers/dispatch_info_builder.h"
#include "runtime/mem_obj/buffer.h"
#include "runtime/memory_manager/memory_manager.h"
#include "runtime/memory_manager/surface.h"
#include "runtime/built_ins/built_ins.h"
#include "runtime/helpers/array_count.h"
#include "runtime/helpers/options.h"
#include "runtime/helpers/task_information.h"
#include "runtime/program/printf_handler.h"
#include "runtime/program/block_kernel_manager.h"
#include "runtime/utilities/range.h"
#include <new>
#include <memory>
namespace OCLRT {
// Tells whether a DC flush should be requested for the given command.
// True for buffer/image read commands and SVM map, and for any command that
// has a printf handler attached.
inline bool shouldFlushDC(unsigned int commandType, PrintfHandler *printfHandler) {
    switch (commandType) {
    case CL_COMMAND_READ_BUFFER:
    case CL_COMMAND_READ_BUFFER_RECT:
    case CL_COMMAND_READ_IMAGE:
    case CL_COMMAND_SVM_MAP:
        return true;
    default:
        return printfHandler != nullptr;
    }
}
// Tells whether the command type is one of those enqueued without a kernel:
// barrier, marker, migrate mem objects, SVM map, SVM unmap and SVM free.
inline bool isCommandWithoutKernel(unsigned int commandType) {
    switch (commandType) {
    case CL_COMMAND_BARRIER:
    case CL_COMMAND_MARKER:
    case CL_COMMAND_MIGRATE_MEM_OBJECTS:
    case CL_COMMAND_SVM_MAP:
    case CL_COMMAND_SVM_UNMAP:
    case CL_COMMAND_SVM_FREE:
        return true;
    default:
        return false;
    }
}
// Hook invoked by enqueueHandler before any dispatch is processed.
// This generic implementation intentionally does nothing.
template <typename GfxFamily>
void CommandQueueHw<GfxFamily>::enqueueHandlerHook(const unsigned int commandType, const MultiDispatchInfo &dispatchInfo) {
}
// enqueueHandler overload for a single, optional kernel.
// Builds the MultiDispatchInfo describing the work and forwards it to the
// MultiDispatchInfo-based enqueueHandler overload:
//  - kernel == nullptr: an empty MultiDispatchInfo is forwarded,
//  - ForceDispatchScheduler debug flag set: forceDispatchScheduler() fills it,
//  - kernel provides a builtinDispatchBuilder: that builder fills it; when it
//    produces no dispatches the function returns without enqueuing anything,
//  - otherwise a DispatchInfoBuilder is set up from workDim, workItems,
//    localWorkSizesIn and globalOffsets.
template <typename GfxFamily>
template <unsigned int commandType, size_t surfaceCount>
void CommandQueueHw<GfxFamily>::enqueueHandler(Surface *(&surfaces)[surfaceCount],
                                               bool blocking,
                                               Kernel *kernel,
                                               cl_uint workDim,
                                               const size_t globalOffsets[3],
                                               const size_t workItems[3],
                                               const size_t *localWorkSizesIn,
                                               cl_uint numEventsInWaitList,
                                               const cl_event *eventWaitList,
                                               cl_event *event) {
    // Nothing to dispatch: hand over an empty dispatch list
    if (kernel == nullptr) {
        enqueueHandler<commandType>(surfaces, blocking, MultiDispatchInfo(), numEventsInWaitList, eventWaitList, event);
        return;
    }

    MultiDispatchInfo multiDispatchInfo;
    if (DebugManager.flags.ForceDispatchScheduler.get()) {
        forceDispatchScheduler(multiDispatchInfo);
    } else if (kernel->getKernelInfo().builtinDispatchBuilder != nullptr) {
        // Built-in kernels describe their own dispatches
        auto builtinBuilder = kernel->getKernelInfo().builtinDispatchBuilder;
        builtinBuilder->buildDispatchInfos(multiDispatchInfo, kernel, workDim, workItems, localWorkSizesIn, globalOffsets);
        if (multiDispatchInfo.size() == 0) {
            return;
        }
    } else {
        DispatchInfoBuilder<SplitDispatch::Dim::d3D, SplitDispatch::SplitMode::WalkerSplit> walkerSplitBuilder;
        walkerSplitBuilder.setDispatchGeometry(workDim, workItems, localWorkSizesIn, globalOffsets);
        walkerSplitBuilder.setKernel(kernel);
        walkerSplitBuilder.bake(multiDispatchInfo);
    }

    enqueueHandler<commandType>(surfaces, blocking, multiDispatchInfo, numEventsInWaitList, eventWaitList, event);
}
// Replaces the regular dispatch with a dispatch of the built-in scheduler kernel.
// The scheduler's reflection surface is created, the default device queue is
// reset, the scheduler arguments are bound to the device queue buffers and a
// one-dimensional DispatchInfo for the scheduler is pushed to `multiDispatchInfo`.
template <typename GfxFamily>
void CommandQueueHw<GfxFamily>::forceDispatchScheduler(OCLRT::MultiDispatchInfo &multiDispatchInfo) {
    SchedulerKernel &schedulerKernel = BuiltIns::getInstance().getSchedulerKernel(this->getContext());

    // 1D dispatch using the scheduler's own global/local work sizes and a zero offset
    DispatchInfo schedulerDispatch(&schedulerKernel,
                                   1,
                                   Vec3<size_t>(schedulerKernel.getGws(), 1, 1),
                                   Vec3<size_t>(schedulerKernel.getLws(), 1, 1),
                                   Vec3<size_t>(0, 0, 0));

    DeviceQueueHw<GfxFamily> *deviceQueueHw = castToObject<DeviceQueueHw<GfxFamily>>(this->getContext().getDefaultDeviceQueue());

    schedulerKernel.createReflectionSurface();
    GraphicsAllocation *schedulerReflectionSurface = schedulerKernel.getKernelReflectionSurface();

    deviceQueueHw->resetDeviceQueue();

    schedulerKernel.setArgs(deviceQueueHw->getQueueBuffer(),
                            deviceQueueHw->getStackBuffer(),
                            deviceQueueHw->getEventPoolBuffer(),
                            deviceQueueHw->getSlbBuffer(),
                            deviceQueueHw->getDshBuffer(),
                            schedulerReflectionSurface,
                            deviceQueueHw->getQueueStorageBuffer(),
                            this->getIndirectHeap(IndirectHeap::SURFACE_STATE).getGraphicsAllocation());

    multiDispatchInfo.push(schedulerDispatch);
}
// Common enqueue path: records the commands for `multiDispatchInfo` and either
// submits them right away or, when the queue is blocked, stores them for a
// later submission through enqueueBlocked().
//
// surfacesForResidency / numSurfaceForResidency - surfaces forwarded to
//     enqueueNonBlocked() / enqueueBlocked(); on a blocking enqueue of a
//     non-blocked queue they also receive the completion stamp after the wait.
// blocking            - when true the call waits (waitUntilComplete) before
//     returning; forced to true by the MakeEachEnqueueBlocking debug flag and
//     passed by reference to enqueueNonBlocked()/enqueueBlocked().
// multiDispatchInfo   - kernel dispatches to record; may be empty.
// numEventsInWaitList / eventWaitList - events this command depends on.
// event               - optional output; when non-null a new Event is created
//     and stored in *event.
template <typename GfxFamily>
template <unsigned int commandType>
void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
size_t numSurfaceForResidency,
bool blocking,
const MultiDispatchInfo &multiDispatchInfo,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
cl_event *event) {
// A command type that normally carries a kernel but has nothing to dispatch is
// enqueued as a marker; the output event (if any) is then relabelled with the
// original command type.
if (multiDispatchInfo.empty() && !isCommandWithoutKernel(commandType)) {
enqueueHandler<CL_COMMAND_MARKER>(surfacesForResidency, numSurfaceForResidency, blocking, multiDispatchInfo,
numEventsInWaitList, eventWaitList, event);
if (event) {
castToObjectOrAbort<Event>(*event)->setCmdType(commandType);
}
return;
}
// Execution-model handling is keyed off the first dispatch's kernel being a parent kernel.
bool executionModelKernel = multiDispatchInfo.empty() ? false : multiDispatchInfo.begin()->getKernel()->isParentKernel;
Kernel *parentKernel = executionModelKernel ? multiDispatchInfo.begin()->getKernel() : nullptr;
auto devQueue = this->getContext().getDefaultDeviceQueue();
DeviceQueueHw<GfxFamily> *devQueueHw = castToObject<DeviceQueueHw<GfxFamily>>(devQueue);
HwTimeStamps *hwTimeStamps = nullptr;
// Device ownership is held until the explicit deviceOwnership.unlock() near the end.
TakeOwnershipWrapper<Device> deviceOwnership(*device);
// Sample the queue timestamp up front; it is attached to the event below when
// the event has profiling enabled.
TimeStampData queueTimeStamp;
if (isProfilingEnabled() && event) {
this->getDevice().getOSTime()->getCpuGpuTime(&queueTimeStamp);
}
// Create the output event only when the caller asked for one.
EventBuilder eventBuilder;
if (event) {
eventBuilder.create<Event>(this, commandType, Event::eventNotReady, 0);
*event = eventBuilder.getEvent();
if (eventBuilder.getEvent()->isProfilingEnabled()) {
eventBuilder.getEvent()->setQueueTimeStamp(&queueTimeStamp);
// Kernel-less commands are switched to the CPU profiling path.
if (isCommandWithoutKernel(commandType)) {
eventBuilder.getEvent()->setCPUProfilingPath(true);
eventBuilder.getEvent()->setQueueTimeStamp();
}
}
DBG_LOG(EventsDebugEnable, "enqueueHandler commandType", commandType, "output Event", eventBuilder.getEvent());
}
bool profilingRequired = (this->isProfilingEnabled() && event != nullptr);
bool perfCountersRequired = false;
perfCountersRequired = (this->isPerfCountersEnabled() && event != nullptr);
KernelOperation *blockedCommandsData = nullptr;
std::unique_ptr<PrintfHandler> printfHandler;
bool slmUsed = false;
// Queue ownership is taken after device ownership and released before it
// (see the unlock() calls below).
TakeOwnershipWrapper<CommandQueueHw<GfxFamily>> queueOwnership(*this);
// The queue counts as blocked when the wait list yields eventNotReady or
// isQueueBlocked() reports it.
auto taskLevel = getTaskLevelFromWaitList(this->taskLevel, numEventsInWaitList, eventWaitList);
auto blockQueue = (taskLevel == Event::eventNotReady) || isQueueBlocked();
// isQueueBlocked() may use commandStream resolving events tree, get start offset after the call
auto &commandStream = getCommandStream<GfxFamily, commandType>(*this, profilingRequired, perfCountersRequired, multiDispatchInfo);
auto commandStreamStart = commandStream.getUsed();
auto &commandStreamReceiver = device->getCommandStreamReceiver();
// isQueueBlocked may unblock queue, get new taskLevel
taskLevel = getTaskLevelFromWaitList(this->taskLevel, numEventsInWaitList, eventWaitList);
DBG_LOG(EventsDebugEnable, "blockQueue", blockQueue, "virtualEvent", virtualEvent, "taskLevel", taskLevel);
if (DebugManager.flags.MakeEachEnqueueBlocking.get()) {
blocking = true;
}
// Busy-wait until the device queue reports its execution-model critical section free.
// NOTE(review): this spins while both ownership wrappers above are held.
if (executionModelKernel && !blockQueue) {
while (!devQueueHw->isEMCriticalSectionFree())
;
}
auto updateTaskLevel = isTaskLevelUpdateRequired(taskLevel, eventWaitList, numEventsInWaitList, commandType);
if (updateTaskLevel) {
taskLevel++;
}
enqueueHandlerHook(commandType, multiDispatchInfo);
// Record the kernel dispatches (if any) into the command stream.
if (multiDispatchInfo.empty() == false) {
HwPerfCounter *hwPerfCounter = nullptr;
DebugManager.dumpKernelArgs(&multiDispatchInfo);
// PrintfHandler::create() may return null; the handler is only prepared when one was created.
printfHandler.reset(PrintfHandler::create(multiDispatchInfo, *device));
if (printfHandler) {
printfHandler.get()->prepareDispatch(multiDispatchInfo);
}
if ((this->isProfilingEnabled() && (eventBuilder.getEvent() != nullptr))) {
// Get allocation for timestamps
hwTimeStamps = eventBuilder.getEvent()->getHwTimeStamp();
if (this->isPerfCountersEnabled()) {
hwPerfCounter = eventBuilder.getEvent()->getHwPerfCounter();
//PERF COUNTER: copy current configuration from queue to event
eventBuilder.getEvent()->copyPerfCounters(this->getPerfCountersConfigData());
}
}
// Parent kernel: create its reflection surface and patch it with the default
// device queue, event pool and reflection surface before the walker is dispatched.
if (executionModelKernel) {
parentKernel->createReflectionSurface();
parentKernel->patchDefaultDeviceQueue(context->getDefaultDeviceQueue());
parentKernel->patchEventPool(context->getDefaultDeviceQueue());
parentKernel->patchReflectionSurface(context->getDefaultDeviceQueue(), printfHandler.get());
if (!blockQueue) {
devQueueHw->resetDeviceQueue();
devQueueHw->acquireEMCriticalSection();
}
}
// blockedCommandsData is an output of dispatchWalker; it is consumed by
// enqueueBlocked() further down when the queue is blocked.
dispatchWalker<GfxFamily>(
*this,
multiDispatchInfo,
numEventsInWaitList,
eventWaitList,
&blockedCommandsData,
hwTimeStamps,
hwPerfCounter,
blockQueue,
commandType);
commandStreamReceiver.setRequiredScratchSize(multiDispatchInfo.getRequiredScratchSize());
slmUsed = multiDispatchInfo.usesSlm();
}
CompletionStamp completionStamp;
// Non-blocked queue: submit now, or for kernel-less commands only derive a completion stamp.
if (!blockQueue) {
if (executionModelKernel) {
// Set up the execution-model dispatch and dispatch the scheduler kernel
// right after the parent kernel.
size_t minSizeISHForEM = KernelCommandsHelper<GfxFamily>::template getSizeRequiredForExecutionModel<IndirectHeap::INSTRUCTION>(const_cast<const Kernel &>(*(multiDispatchInfo.begin()->getKernel())));
size_t minSizeSSHForEM = KernelCommandsHelper<GfxFamily>::template getSizeRequiredForExecutionModel<IndirectHeap::SURFACE_STATE>(const_cast<const Kernel &>(*(multiDispatchInfo.begin()->getKernel())));
uint32_t taskCount = commandStreamReceiver.peekTaskCount() + 1;
devQueueHw->setupExecutionModelDispatch(getIndirectHeap(IndirectHeap::INSTRUCTION, minSizeISHForEM),
getIndirectHeap(IndirectHeap::SURFACE_STATE, minSizeSSHForEM),
multiDispatchInfo.begin()->getKernel(),
(uint32_t)multiDispatchInfo.size(),
taskCount,
hwTimeStamps);
BuiltIns &builtIns = BuiltIns::getInstance();
SchedulerKernel &scheduler = builtIns.getSchedulerKernel(this->getContext());
scheduler.setArgs(devQueueHw->getQueueBuffer(),
devQueueHw->getStackBuffer(),
devQueueHw->getEventPoolBuffer(),
devQueueHw->getSlbBuffer(),
devQueueHw->getDshBuffer(),
multiDispatchInfo.begin()->getKernel()->getKernelReflectionSurface(),
devQueueHw->getQueueStorageBuffer(),
this->getIndirectHeap(IndirectHeap::SURFACE_STATE).getGraphicsAllocation(),
devQueueHw->getDebugQueue());
dispatchScheduler<GfxFamily>(
*this,
*devQueueHw,
scheduler);
scheduler.makeResident(commandStreamReceiver);
// Update SLM usage
slmUsed |= scheduler.slmTotalSize > 0;
// Make the private surfaces of the program's block kernels resident as well.
size_t count = parentKernel->getProgram()->getBlockKernelManager()->getCount();
for (uint32_t surfaceIndex = 0; surfaceIndex < count; surfaceIndex++) {
auto surface = parentKernel->getProgram()->getBlockKernelManager()->getPrivateSurface(surfaceIndex);
if (surface) {
commandStreamReceiver.makeResident(*surface);
}
}
}
// Kernel-less commands are not submitted.
auto submissionRequired = isCommandWithoutKernel(commandType) ? false : true;
if (submissionRequired) {
completionStamp = enqueueNonBlocked<commandType>(
surfacesForResidency,
numSurfaceForResidency,
commandStream,
commandStreamStart,
blocking,
multiDispatchInfo,
eventBuilder,
taskLevel,
slmUsed,
printfHandler.get());
if (eventBuilder.getEvent()) {
eventBuilder.getEvent()->flushStamp->replaceStampObject(this->flushStamp->getStampReference());
}
// When the device queue reports a scheduler return instance, wait for the
// submission and run the scheduler through BuiltinKernelsSimulation.
// NOTE(review): presumably a debug/validation path - confirm.
if (executionModelKernel && devQueueHw->getSchedulerReturnInstance() > 0) {
waitUntilComplete(completionStamp.taskCount, completionStamp.flushStamp);
BuiltinKernelsSimulation::SchedulerSimulation<GfxFamily> simulation;
simulation.runSchedulerSimulation(devQueueHw->getQueueBuffer(),
devQueueHw->getStackBuffer(),
devQueueHw->getEventPoolBuffer(),
devQueueHw->getSlbBuffer(),
devQueueHw->getDshBuffer(),
multiDispatchInfo.begin()->getKernel()->getKernelReflectionSurface(),
devQueueHw->getQueueStorageBuffer(),
this->getIndirectHeap(IndirectHeap::SURFACE_STATE).getGraphicsAllocation(),
devQueueHw->getDebugQueue());
}
} else {
// No submission: use the highest task count among the queue itself and the
// non-user events of the wait list.
// NOTE(review): the loop-local `event` shadows the `event` output parameter,
// and the castToObject result is dereferenced without a null check.
auto maxTaskCount = this->taskCount;
for (auto eventId = 0u; eventId < numEventsInWaitList; eventId++) {
auto event = castToObject<Event>(eventWaitList[eventId]);
if (!event->isUserEvent()) {
maxTaskCount = std::max(maxTaskCount, event->peekTaskCount());
}
}
//inherit data from event_wait_list and previous packets
completionStamp.flushStamp = this->flushStamp->peekStamp();
completionStamp.taskCount = maxTaskCount;
completionStamp.taskLevel = taskLevel;
if (eventBuilder.getEvent() && isProfilingEnabled()) {
TimeStampData submitTimeStamp;
this->getDevice().getOSTime()->getCpuGpuTime(&submitTimeStamp);
eventBuilder.getEvent()->setSubmitTimeStamp(&submitTimeStamp);
eventBuilder.getEvent()->setSubmitTimeStamp();
eventBuilder.getEvent()->setStartTimeStamp();
}
}
} else {
// Blocked queue: nothing was submitted, so the stamp starts from Event::eventNotReady.
// NOTE(review): initializer order presumably is taskCount, taskLevel, flushStamp,
// engine type - confirm against the CompletionStamp definition.
CompletionStamp cmplStamp = {
Event::eventNotReady,
taskLevel,
0,
EngineType::ENGINE_RCS};
completionStamp = cmplStamp;
}
// Propagate the completion stamp to the queue and to the output event.
updateFromCompletionStamp(completionStamp);
if (eventBuilder.getEvent()) {
eventBuilder.getEvent()->updateCompletionStamp(completionStamp.taskCount, completionStamp.taskLevel, completionStamp.flushStamp);
DebugManager.log(DebugManager.flags.EventsDebugEnable.get(), "updateCompletionStamp Event", eventBuilder.getEvent(), "taskLevel", eventBuilder.getEvent()->taskLevel.load());
}
// Blocked queue: store the recorded commands for later submission.
// Ownership of printfHandler is moved into enqueueBlocked().
if (blockQueue) {
if (executionModelKernel) {
size_t minSizeISHForEM = KernelCommandsHelper<GfxFamily>::template getSizeRequiredForExecutionModel<IndirectHeap::INSTRUCTION>(const_cast<const Kernel &>(*(multiDispatchInfo.begin()->getKernel())));
size_t minSizeSSHForEM = KernelCommandsHelper<GfxFamily>::template getSizeRequiredForExecutionModel<IndirectHeap::SURFACE_STATE>(const_cast<const Kernel &>(*(multiDispatchInfo.begin()->getKernel())));
blockedCommandsData->instructionHeapSizeEM = minSizeISHForEM;
blockedCommandsData->surfaceStateHeapSizeEM = minSizeSSHForEM;
}
enqueueBlocked<commandType>(
surfacesForResidency,
numSurfaceForResidency,
blocking,
multiDispatchInfo,
blockedCommandsData,
numEventsInWaitList,
eventWaitList,
slmUsed,
eventBuilder,
std::move(printfHandler));
}
queueOwnership.unlock();
deviceOwnership.unlock();
// The output event is registered only if it is still the queue's virtual event.
// This wrapper shadows the outer queueOwnership, which was unlocked above.
if (blockQueue) {
TakeOwnershipWrapper<CommandQueueHw<GfxFamily>> queueOwnership(*this);
if (this->virtualEvent == eventBuilder.getEvent()) {
eventBuilder.registerEvent();
}
}
// Blocking enqueue: wait for completion before returning.
if (blocking) {
if (blockQueue) {
// Spin until the queue is no longer blocked, then wait for the submitted work.
while (isQueueBlocked())
;
waitUntilComplete(taskCount, flushStamp->peekStamp());
} else {
waitUntilComplete(taskCount, flushStamp->peekStamp());
// After the wait: stamp the surfaces, print printf output and clean temporary allocations.
for (auto sIt = surfacesForResidency, sE = surfacesForResidency + numSurfaceForResidency;
sIt != sE; ++sIt) {
(*sIt)->setCompletionStamp(completionStamp, nullptr, nullptr);
}
if (printfHandler) {
printfHandler->printEnqueueOutput();
}
commandStreamReceiver.cleanAllocationList(taskCount, TEMPORARY_ALLOCATION);
}
}
}
// Decides whether the task level should be incremented for the command being enqueued.
//
// taskLevel           - task level derived for this enqueue
// eventWaitList       - wait list of the command (may be nullptr)
// numEventsInWaitList - number of entries in eventWaitList
// commandType         - OpenCL command type
//
// Returns false when:
//  - the task level is Event::eventNotReady,
//  - the command has no kernel and is not a barrier,
//  - the queue is out-of-order and either
//      * there is no wait list and the command is not a barrier, or
//      * the task level derived from the wait list, incremented by one, does
//        not exceed the queue's current task level.
// Returns true otherwise.
template <typename GfxFamily>
bool CommandQueueHw<GfxFamily>::isTaskLevelUpdateRequired(const uint32_t &taskLevel, const cl_event *eventWaitList, const cl_uint &numEventsInWaitList, unsigned int commandType) {
    const bool isBarrier = (commandType == CL_COMMAND_BARRIER);

    // blocked by a not-ready event: no update
    bool updateRequired = (taskLevel != Event::eventNotReady);

    // kernel-less commands inherit the state of previous commands, except for barriers
    if (isCommandWithoutKernel(commandType) && !isBarrier) {
        updateRequired = false;
    }

    // additional rules for out-of-order queues
    if (this->isOOQEnabled()) {
        if (eventWaitList == nullptr) {
            // without a wait list only a barrier may update the task level
            if (!isBarrier) {
                updateRequired = false;
            }
        } else {
            // with a wait list, update only if the level deduced from the events is above the queue's level
            auto levelFromWaitList = getTaskLevelFromWaitList(0, numEventsInWaitList, eventWaitList);
            levelFromWaitList++;
            if (levelFromWaitList <= this->taskLevel) {
                updateRequired = false;
            }
        }
    }
    return updateRequired;
}
// Submits already recorded commands to the command stream receiver.
//
// surfaces / surfaceCount - surfaces made resident before the flush and stamped afterwards
// commandStream           - stream holding the recorded commands
// commandStreamStart      - offset in commandStream at which this enqueue starts
// blocking                - in/out; set to true when a printf handler is present
// multiDispatchInfo       - dispatches being submitted; must not be empty
// eventBuilder            - holder of the (optional) output event
// taskLevel               - task level passed to flushTask
// slmUsed                 - forwarded to the dispatch flags
// printfHandler           - optional printf handler of the dispatch
//
// Returns the CompletionStamp produced by flushTask.
template <typename GfxFamily>
template <unsigned int commandType>
CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(
    Surface **surfaces,
    size_t surfaceCount,
    LinearStream &commandStream,
    size_t commandStreamStart,
    bool &blocking,
    const MultiDispatchInfo &multiDispatchInfo,
    EventBuilder &eventBuilder,
    uint32_t taskLevel,
    bool slmUsed,
    PrintfHandler *printfHandler) {
    UNRECOVERABLE_IF(multiDispatchInfo.empty());

    auto &csr = device->getCommandStreamReceiver();

    // printf output is collected after completion, so the enqueue becomes blocking
    if (printfHandler) {
        blocking = true;
        printfHandler->makeResident(csr);
    }

    // Residency of caller surfaces and kernels; gather coherency / VME requirements on the way
    bool coherencyNeeded = false;
    for (auto surface : CreateRange(surfaces, surfaceCount)) {
        surface->makeResident(csr);
        coherencyNeeded |= surface->IsCoherent;
    }

    bool vmeKernelPresent = false;
    for (auto &dispatchInfo : multiDispatchInfo) {
        dispatchInfo.getKernel()->makeResident(csr);
        coherencyNeeded |= dispatchInfo.getKernel()->requiresCoherency();
        vmeKernelPresent |= dispatchInfo.getKernel()->isVmeKernel();
    }

    if (vmeKernelPresent) {
        DEBUG_BREAK_IF(device->getDeviceInfo().preemptionSupported != false);
    }

    // Profiling: record the submit timestamp and keep the timestamp / perf counter allocations resident
    TimeStampData submitTimeStamp;
    if (isProfilingEnabled() && eventBuilder.getEvent()) {
        this->getDevice().getOSTime()->getCpuGpuTime(&submitTimeStamp);
        eventBuilder.getEvent()->setSubmitTimeStamp(&submitTimeStamp);
        this->getDevice().getCommandStreamReceiver().makeResident(*eventBuilder.getEvent()->getHwTimeStampAllocation());
        if (isPerfCountersEnabled()) {
            this->getDevice().getCommandStreamReceiver().makeResident(*eventBuilder.getEvent()->getHwPerfCounterAllocation());
        }
    }

    // Pick the dynamic state and indirect object heaps
    IndirectHeap *dynamicStateHeap = nullptr;
    IndirectHeap *indirectObjectHeap = nullptr;
    bool flushImplicitly = false;

    const bool parentKernelDispatch = multiDispatchInfo.begin()->getKernel()->isParentKernel;
    if (parentKernelDispatch) {
        DeviceQueueHw<GfxFamily> *deviceQueue = castToObject<DeviceQueueHw<GfxFamily>>(this->getContext().getDefaultDeviceQueue());
        DEBUG_BREAK_IF(deviceQueue == nullptr);

        dynamicStateHeap = deviceQueue->getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
        // In ExecutionModel IOH is the same as DSH to eliminate StateBaseAddress reprogramming for scheduler kernel and blocks.
        indirectObjectHeap = dynamicStateHeap;
        flushImplicitly = true;
    } else {
        dynamicStateHeap = &getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
        indirectObjectHeap = &getIndirectHeap(IndirectHeap::INDIRECT_OBJECT);
    }

    csr.requestThreadArbitrationPolicy(multiDispatchInfo.begin()->getKernel()->getThreadArbitrationPolicy());

    DispatchFlags flags;
    flags.blocking = blocking;
    flags.dcFlush = shouldFlushDC(commandType, printfHandler);
    flags.useSLM = slmUsed;
    flags.guardCommandBufferWithPipeControl = true;
    flags.GSBA32BitRequired = commandType == CL_COMMAND_NDRANGE_KERNEL;
    flags.mediaSamplerRequired = vmeKernelPresent;
    flags.requiresCoherency = coherencyNeeded;
    flags.low_priority = low_priority;
    flags.implicitFlush = flushImplicitly;
    flags.flushStampReference = this->flushStamp->getStampReference();
    flags.preemptionMode = PreemptionHelper::taskPreemptionMode(*device, multiDispatchInfo);
    flags.outOfOrderExecutionAllowed = this->isOOQEnabled();

    DEBUG_BREAK_IF(taskLevel >= Event::eventNotReady);

    CompletionStamp stamp = csr.flushTask(
        commandStream,
        commandStreamStart,
        *dynamicStateHeap,
        getIndirectHeap(IndirectHeap::INSTRUCTION),
        *indirectObjectHeap,
        getIndirectHeap(IndirectHeap::SURFACE_STATE),
        taskLevel,
        flags);

    // Let surfaces and kernels know which submission they belong to
    for (auto surface : CreateRange(surfaces, surfaceCount)) {
        surface->setCompletionStamp(stamp, device, this);
    }
    for (auto &dispatchInfo : multiDispatchInfo) {
        dispatchInfo.getKernel()->updateWithCompletionStamp(csr, &stamp);
    }

    return stamp;
}
// Stores a command that cannot be submitted yet (the queue is blocked) in an
// event, so that it can be submitted later.
//
// surfaces / surfaceCount - caller surfaces; duplicates of them are stored with the command
// blocking                - not modified here; kept for signature compatibility
// multiDispatchInfo       - dispatches of the command; empty for kernel-less commands
// blockedCommandsData     - recorded kernel operation; ownership is taken over
//                           when multiDispatchInfo is not empty
// numEventsInWaitList / eventWaitList - events the command depends on
// slmUsed                 - forwarded to the stored command
// externalEventBuilder    - builder of the user-visible output event; when it
//                           holds no event an internal VirtualEvent is created
// printfHandler           - optional printf handler; ownership moves to the stored command
//
// The event that carries the command becomes the queue's new virtualEvent and
// gets the previous virtualEvent and the wait list events as parents.
template <typename GfxFamily>
template <unsigned int commandType>
void CommandQueueHw<GfxFamily>::enqueueBlocked(
    Surface **surfaces,
    size_t surfaceCount,
    bool &blocking,
    const MultiDispatchInfo &multiDispatchInfo,
    KernelOperation *blockedCommandsData,
    cl_uint numEventsInWaitList,
    const cl_event *eventWaitList,
    bool slmUsed,
    EventBuilder &externalEventBuilder,
    std::unique_ptr<PrintfHandler> printfHandler) {

    auto &commandStreamReceiver = device->getCommandStreamReceiver();

    TakeOwnershipWrapper<CommandQueueHw<GfxFamily>> queueOwnership(*this);

    //store previous virtual event as it will add dependencies to new virtual event
    if (this->virtualEvent) {
        DBG_LOG(EventsDebugEnable, "enqueueBlocked", "previousVirtualEvent", this->virtualEvent);
    }

    EventBuilder internalEventBuilder;
    EventBuilder *eventBuilder;
    // check if event will be exposed externally
    if (externalEventBuilder.getEvent()) {
        externalEventBuilder.getEvent()->incRefInternal();
        eventBuilder = &externalEventBuilder;
        DBG_LOG(EventsDebugEnable, "enqueueBlocked", "output event as virtualEvent", virtualEvent);
    } else {
        // it will be an internal event
        internalEventBuilder.create<VirtualEvent>(this, context);
        eventBuilder = &internalEventBuilder;
        DBG_LOG(EventsDebugEnable, "enqueueBlocked", "new virtualEvent", eventBuilder->getEvent());
    }

    eventBuilder->getEvent()->setCurrentCmdQVirtualEvent(true);

    //update queue taskCount
    taskCount = eventBuilder->getEvent()->getCompletionStamp();

    if (multiDispatchInfo.empty()) {
        DEBUG_BREAK_IF(!isCommandWithoutKernel(commandType));
        auto cmdSize = static_cast<unsigned int>(EnqueueOperation<GfxFamily, commandType>::getSizeRequiredCS(isProfilingEnabled(),
                                                                                                             isPerfCountersEnabled(),
                                                                                                             *this,
                                                                                                             nullptr));
        auto cmd = std::unique_ptr<Command>(new CommandMarker(
            *this, commandStreamReceiver, commandType, cmdSize));

        eventBuilder->getEvent()->setCommand(std::move(cmd));
    } else {
        //store task data in event
        std::vector<Surface *> allSurfaces;
        for (auto &dispatchInfo : multiDispatchInfo) {
            dispatchInfo.getKernel()->getResidency(allSurfaces);
        }
        // The caller's surfaces are duplicated exactly once, independent of the
        // number of dispatches (the duplication used to be nested in the loop
        // above, producing one redundant copy per additional dispatch).
        for (auto &surface : CreateRange(surfaces, surfaceCount)) {
            allSurfaces.push_back(surface->duplicate());
        }

        // Evaluate the flush requirement before printfHandler is moved below:
        // the evaluation order of function arguments is unspecified, so reading
        // printfHandler.get() in the same argument list as std::move(printfHandler)
        // could observe an already moved-from (null) pointer.
        const bool flushDC = shouldFlushDC(commandType, printfHandler.get());

        auto kernelOperation = std::unique_ptr<KernelOperation>(blockedCommandsData); // marking ownership
        auto cmd = std::unique_ptr<Command>(new CommandComputeKernel(
            *this,
            commandStreamReceiver,
            std::move(kernelOperation),
            allSurfaces,
            flushDC,
            slmUsed,
            commandType == CL_COMMAND_NDRANGE_KERNEL,
            std::move(printfHandler),
            multiDispatchInfo.begin()->getKernel(),
            static_cast<uint32_t>(multiDispatchInfo.size())));
        eventBuilder->getEvent()->setCommand(std::move(cmd));
    }

    // bind the new event to the wait list events and to the previous virtual event
    eventBuilder->addParentEvents(ArrayRef<const cl_event>(eventWaitList, numEventsInWaitList));
    eventBuilder->addParentEvent(this->virtualEvent);
    eventBuilder->finalize();

    // the previous virtual event is no longer the queue's current one
    if (this->virtualEvent) {
        this->virtualEvent->setCurrentCmdQVirtualEvent(false);
        this->virtualEvent->decRefInternal();
    }

    this->virtualEvent = eventBuilder->getEvent();
}
// Stores a map/unmap operation on `memObj` as a command in an event that
// depends on the given wait list and on the queue's current virtual event.
//
// eventWaitList / numEventsInWaitlist - events the operation depends on
// opType               - kind of map/unmap operation stored in the command
// memObj               - memory object the operation refers to
// externalEventBuilder - builder of the user-visible output event; when it
//                        holds no event an internal VirtualEvent is created
//
// The event carrying the command becomes the queue's new virtualEvent.
template <typename GfxFamily>
void CommandQueueHw<GfxFamily>::addMapUnmapToWaitlistEventsDependencies(const cl_event *eventWaitList,
                                                                        size_t numEventsInWaitlist,
                                                                        MapOperationType opType,
                                                                        MemObj *memObj,
                                                                        EventBuilder &externalEventBuilder) {
    auto &csr = device->getCommandStreamReceiver();

    // Use the caller's event if there is one, otherwise create an internal virtual event
    EventBuilder internalEventBuilder;
    EventBuilder *activeBuilder = nullptr;
    if (externalEventBuilder.getEvent()) {
        externalEventBuilder.getEvent()->incRefInternal();
        activeBuilder = &externalEventBuilder;
    } else {
        internalEventBuilder.create<VirtualEvent>(this, context);
        activeBuilder = &internalEventBuilder;
    }

    // Attach the map/unmap command to the event
    std::unique_ptr<Command> mapUnmapCommand(new CommandMapUnmap(opType, *memObj, csr, *this));
    activeBuilder->getEvent()->setCommand(std::move(mapUnmapCommand));

    // Make the output event depend on the input events and on the previous virtual event
    activeBuilder->addParentEvents(ArrayRef<const cl_event>(eventWaitList, numEventsInWaitlist));
    activeBuilder->addParentEvent(this->virtualEvent);
    activeBuilder->finalize();

    // Retire the previous virtual event and install the new one
    if (this->virtualEvent) {
        this->virtualEvent->setCurrentCmdQVirtualEvent(false);
        this->virtualEvent->decRefInternal();
    }
    this->virtualEvent = activeBuilder->getEvent();
}
} // namespace OCLRT

View File

@@ -0,0 +1,77 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include "hw_cmds.h"
#include "runtime/command_queue/command_queue_hw.h"
#include "runtime/command_queue/enqueue_common.h"
#include "runtime/command_stream/command_stream_receiver.h"
#include "runtime/helpers/kernel_commands.h"
#include "runtime/mem_obj/buffer.h"
#include "runtime/memory_manager/surface.h"
#include "runtime/built_ins/built_ins.h"
#include <new>
namespace OCLRT {
// Enqueues a buffer-to-buffer copy through the CopyBufferToBuffer built-in.
//
// srcBuffer / dstBuffer   - source and destination memory objects.
// srcOffset / dstOffset   - placed in the first component of the built-in's offsets
//                           (remaining components are 0).
// size                    - placed in the first component of the built-in's size.
// numEventsInWaitList,
// eventWaitList, event    - forwarded unchanged to enqueueHandler.
//
// Always returns CL_SUCCESS.
template <typename GfxFamily>
cl_int CommandQueueHw<GfxFamily>::enqueueCopyBuffer(
    Buffer *srcBuffer,
    Buffer *dstBuffer,
    size_t srcOffset,
    size_t dstOffset,
    size_t size,
    cl_uint numEventsInWaitList,
    const cl_event *eventWaitList,
    cl_event *event) {
    MultiDispatchInfo copyDispatch;

    auto &copyBuilder = BuiltIns::getInstance().getBuiltinDispatchInfoBuilder(
        EBuiltInOps::CopyBufferToBuffer, this->getContext(), this->getDevice());
    // NOTE(review): ownership is held from here until releaseOwnership() below;
    // presumably this guards the shared builder - confirm against BuiltIns.
    copyBuilder.takeOwnership(this->context);

    // Describe the copy for the built-in kernel.
    BuiltinDispatchInfoBuilder::BuiltinOpParams opParams;
    opParams.srcMemObj = srcBuffer;
    opParams.srcOffset = {srcOffset, 0, 0};
    opParams.dstMemObj = dstBuffer;
    opParams.dstOffset = {dstOffset, 0, 0};
    opParams.size = {size, 0, 0};
    copyBuilder.buildDispatchInfos(copyDispatch, opParams);

    // Surfaces handed to the handler: the source and the destination buffer.
    MemObjSurface srcSurface(srcBuffer);
    MemObjSurface dstSurface(dstBuffer);
    Surface *copySurfaces[] = {&srcSurface, &dstSurface};

    enqueueHandler<CL_COMMAND_COPY_BUFFER>(
        copySurfaces,
        false,
        copyDispatch,
        numEventsInWaitList,
        eventWaitList,
        event);

    copyBuilder.releaseOwnership();
    return CL_SUCCESS;
}
}

View File

@@ -0,0 +1,80 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include "runtime/command_queue/command_queue_hw.h"
#include "runtime/command_stream/command_stream_receiver.h"
#include "runtime/helpers/kernel_commands.h"
#include "runtime/mem_obj/buffer.h"
#include "runtime/memory_manager/surface.h"
#include "runtime/built_ins/built_ins.h"
#include <new>
namespace OCLRT {
// Enqueues a rectangular buffer-to-buffer copy through the CopyBufferRect built-in.
//
// srcBuffer / dstBuffer          - source and destination memory objects.
// srcOrigin / dstOrigin / region - forwarded as the built-in's source offset,
//                                  destination offset and size.
// src/dst RowPitch, SlicePitch   - forwarded unchanged to the built-in parameters.
// numEventsInWaitList,
// eventWaitList, event           - forwarded unchanged to enqueueHandler.
//
// Always returns CL_SUCCESS.
template <typename GfxFamily>
cl_int CommandQueueHw<GfxFamily>::enqueueCopyBufferRect(
    Buffer *srcBuffer,
    Buffer *dstBuffer,
    const size_t *srcOrigin,
    const size_t *dstOrigin,
    const size_t *region,
    size_t srcRowPitch,
    size_t srcSlicePitch,
    size_t dstRowPitch,
    size_t dstSlicePitch,
    cl_uint numEventsInWaitList,
    const cl_event *eventWaitList,
    cl_event *event) {
    MultiDispatchInfo rectDispatch;

    auto &rectBuilder = BuiltIns::getInstance().getBuiltinDispatchInfoBuilder(
        EBuiltInOps::CopyBufferRect, this->getContext(), this->getDevice());
    // NOTE(review): ownership is held until releaseOwnership() below;
    // presumably this guards the shared builder - confirm against BuiltIns.
    rectBuilder.takeOwnership(this->context);

    BuiltinDispatchInfoBuilder::BuiltinOpParams opParams;
    // Source side of the copy.
    opParams.srcMemObj = srcBuffer;
    opParams.srcOffset = srcOrigin;
    opParams.srcRowPitch = srcRowPitch;
    opParams.srcSlicePitch = srcSlicePitch;
    // Destination side of the copy.
    opParams.dstMemObj = dstBuffer;
    opParams.dstOffset = dstOrigin;
    opParams.dstRowPitch = dstRowPitch;
    opParams.dstSlicePitch = dstSlicePitch;
    // Extent of the copied region.
    opParams.size = region;
    rectBuilder.buildDispatchInfos(rectDispatch, opParams);

    // The surfaces come from the dispatch info that the builder just filled in.
    enqueueHandler<CL_COMMAND_COPY_BUFFER_RECT>(
        rectDispatch.getUsedSurfaces().begin(),
        rectDispatch.getUsedSurfaces().size(),
        false,
        rectDispatch,
        numEventsInWaitList,
        eventWaitList,
        event);

    rectBuilder.releaseOwnership();
    return CL_SUCCESS;
}
}

View File

@@ -0,0 +1,75 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include "hw_cmds.h"
#include "runtime/command_queue/command_queue_hw.h"
#include "runtime/command_stream/command_stream_receiver.h"
#include "runtime/helpers/surface_formats.h"
#include "runtime/helpers/kernel_commands.h"
#include "runtime/mem_obj/buffer.h"
#include "runtime/mem_obj/image.h"
#include "runtime/memory_manager/surface.h"
#include "runtime/built_ins/built_ins.h"
#include <new>
namespace OCLRT {
// Enqueues a buffer-to-image copy through the CopyBufferToImage3d built-in.
//
// srcBuffer            - source buffer.
// dstImage             - destination image.
// srcOffset            - placed in the first component of the built-in's source
//                        offset (remaining components are 0).
// dstOrigin / region   - forwarded as the built-in's destination offset and size.
// numEventsInWaitList,
// eventWaitList, event - forwarded unchanged to enqueueHandler.
//
// Always returns CL_SUCCESS.
template <typename GfxFamily>
cl_int CommandQueueHw<GfxFamily>::enqueueCopyBufferToImage(
    Buffer *srcBuffer,
    Image *dstImage,
    size_t srcOffset,
    const size_t *dstOrigin,
    const size_t *region,
    cl_uint numEventsInWaitList,
    const cl_event *eventWaitList,
    cl_event *event) {
    MultiDispatchInfo copyDispatch;

    auto &copyBuilder = BuiltIns::getInstance().getBuiltinDispatchInfoBuilder(
        EBuiltInOps::CopyBufferToImage3d, this->getContext(), this->getDevice());
    // NOTE(review): ownership is held until releaseOwnership() below;
    // presumably this guards the shared builder - confirm against BuiltIns.
    copyBuilder.takeOwnership(this->context);

    // Describe the copy for the built-in kernel.
    BuiltinDispatchInfoBuilder::BuiltinOpParams opParams;
    opParams.srcMemObj = srcBuffer;
    opParams.srcOffset = {srcOffset, 0, 0};
    opParams.dstMemObj = dstImage;
    opParams.dstOffset = dstOrigin;
    opParams.size = region;
    copyBuilder.buildDispatchInfos(copyDispatch, opParams);

    // The surfaces come from the dispatch info that the builder just filled in.
    enqueueHandler<CL_COMMAND_COPY_BUFFER_TO_IMAGE>(
        copyDispatch.getUsedSurfaces().begin(),
        copyDispatch.getUsedSurfaces().size(),
        false,
        copyDispatch,
        numEventsInWaitList,
        eventWaitList,
        event);

    copyBuilder.releaseOwnership();
    return CL_SUCCESS;
}
}

View File

@@ -0,0 +1,75 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include "runtime/built_ins/built_ins.h"
#include "hw_cmds.h"
#include "runtime/command_queue/command_queue_hw.h"
#include "runtime/command_stream/command_stream_receiver.h"
#include "runtime/helpers/kernel_commands.h"
#include "runtime/helpers/basic_math.h"
#include "runtime/mem_obj/image.h"
#include "runtime/memory_manager/surface.h"
#include <algorithm>
#include <new>
namespace OCLRT {
// Enqueues an image-to-image copy through the CopyImageToImage3d built-in.
//
// srcImage / dstImage            - source and destination images.
// srcOrigin / dstOrigin / region - forwarded as the built-in's source offset,
//                                  destination offset and size.
// numEventsInWaitList,
// eventWaitList, event           - forwarded unchanged to enqueueHandler.
//
// Always returns CL_SUCCESS.
template <typename GfxFamily>
cl_int CommandQueueHw<GfxFamily>::enqueueCopyImage(
    Image *srcImage,
    Image *dstImage,
    const size_t srcOrigin[3],
    const size_t dstOrigin[3],
    const size_t region[3],
    cl_uint numEventsInWaitList,
    const cl_event *eventWaitList,
    cl_event *event) {
    MultiDispatchInfo copyDispatch;

    auto &copyBuilder = BuiltIns::getInstance().getBuiltinDispatchInfoBuilder(
        EBuiltInOps::CopyImageToImage3d, this->getContext(), this->getDevice());
    // NOTE(review): ownership is held until releaseOwnership() below;
    // presumably this guards the shared builder - confirm against BuiltIns.
    copyBuilder.takeOwnership(this->context);

    // Describe the copy for the built-in kernel.
    BuiltinDispatchInfoBuilder::BuiltinOpParams opParams;
    opParams.srcMemObj = srcImage;
    opParams.srcOffset = srcOrigin;
    opParams.dstMemObj = dstImage;
    opParams.dstOffset = dstOrigin;
    opParams.size = region;
    copyBuilder.buildDispatchInfos(copyDispatch, opParams);

    // The surfaces come from the dispatch info that the builder just filled in.
    enqueueHandler<CL_COMMAND_COPY_IMAGE>(
        copyDispatch.getUsedSurfaces().begin(),
        copyDispatch.getUsedSurfaces().size(),
        false,
        copyDispatch,
        numEventsInWaitList,
        eventWaitList,
        event);

    copyBuilder.releaseOwnership();
    return CL_SUCCESS;
}
}

View File

@@ -0,0 +1,75 @@
/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include "hw_cmds.h"
#include "runtime/command_queue/command_queue_hw.h"
#include "runtime/command_stream/command_stream_receiver.h"
#include "runtime/helpers/surface_formats.h"
#include "runtime/helpers/kernel_commands.h"
#include "runtime/mem_obj/buffer.h"
#include "runtime/mem_obj/image.h"
#include "runtime/memory_manager/surface.h"
#include "runtime/built_ins/built_ins.h"
#include <new>
namespace OCLRT {
// Enqueues an image-to-buffer copy through the CopyImage3dToBuffer built-in.
//
// srcImage             - source image.
// dstBuffer            - destination buffer.
// srcOrigin / region   - forwarded as the built-in's source offset and size.
// dstOffset            - placed in the first component of the built-in's
//                        destination offset (remaining components are 0).
// numEventsInWaitList,
// eventWaitList, event - forwarded unchanged to enqueueHandler.
//
// Always returns CL_SUCCESS.
template <typename GfxFamily>
cl_int CommandQueueHw<GfxFamily>::enqueueCopyImageToBuffer(
    Image *srcImage,
    Buffer *dstBuffer,
    const size_t *srcOrigin,
    const size_t *region,
    size_t dstOffset,
    cl_uint numEventsInWaitList,
    const cl_event *eventWaitList,
    cl_event *event) {
    MultiDispatchInfo copyDispatch;

    auto &copyBuilder = BuiltIns::getInstance().getBuiltinDispatchInfoBuilder(
        EBuiltInOps::CopyImage3dToBuffer, this->getContext(), this->getDevice());
    // NOTE(review): ownership is held until releaseOwnership() below;
    // presumably this guards the shared builder - confirm against BuiltIns.
    copyBuilder.takeOwnership(this->context);

    // Describe the copy for the built-in kernel.
    BuiltinDispatchInfoBuilder::BuiltinOpParams opParams;
    opParams.srcMemObj = srcImage;
    opParams.srcOffset = srcOrigin;
    opParams.dstMemObj = dstBuffer;
    opParams.dstOffset = {dstOffset, 0, 0};
    opParams.size = region;
    copyBuilder.buildDispatchInfos(copyDispatch, opParams);

    // The surfaces come from the dispatch info that the builder just filled in.
    enqueueHandler<CL_COMMAND_COPY_IMAGE_TO_BUFFER>(
        copyDispatch.getUsedSurfaces().begin(),
        copyDispatch.getUsedSurfaces().size(),
        false,
        copyDispatch,
        numEventsInWaitList,
        eventWaitList,
        event);

    copyBuilder.releaseOwnership();
    return CL_SUCCESS;
}
}

Some files were not shown because too many files have changed in this diff Show More