refactor: remove unneeded VME builtin-related code

Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
Mateusz Jablonski 2024-09-16 14:21:29 +00:00 committed by Compute-Runtime-Automation
parent 8170599b17
commit d5812f49d7
33 changed files with 9 additions and 3973 deletions

View File

@@ -982,7 +982,6 @@ set(BUILTINS_BINARIES_STATELESS_HEAPLESS_LIB_NAME "builtins_binaries_stateless_h
set(BUILTINS_BINARIES_BINDFUL_LIB_NAME "builtins_binaries_bindful")
set(BUILTINS_BINARIES_BINDLESS_LIB_NAME "builtins_binaries_bindless")
set(BUILTINS_SPIRV_LIB_NAME "builtins_spirv")
set(BUILTINS_VME_LIB_NAME "builtins_vme")
if(WIN32)
set(NEO_EXTRA_LIBS Ws2_32)

View File

@@ -14,12 +14,6 @@
#include "level_zero/core/source/device/device.h"
#include "level_zero/core/source/kernel/kernel.h"
namespace NEO {
const char *getAdditionalBuiltinAsString(EBuiltInOps::Type builtin) {
return nullptr;
}
} // namespace NEO
namespace L0 {
BuiltinFunctionsLibImpl::BuiltinData::~BuiltinData() {

View File

@@ -42,7 +42,7 @@ components:
dest_dir: kernels_bin
type: git
branch: kernels_bin
revision: 3043-2372
revision: 3043-2375
kmdaf:
branch: kmdaf
dest_dir: kmdaf

View File

@@ -1,5 +1,5 @@
#
# Copyright (C) 2021-2023 Intel Corporation
# Copyright (C) 2021-2024 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
@@ -20,7 +20,6 @@ macro(generate_runtime_lib LIB_NAME MOCKABLE GENERATE_EXEC)
add_subdirectory(source "${NEO_BUILD_DIR}/${LIB_NAME}" EXCLUDE_FROM_ALL)
endif()
target_compile_definitions(${BUILTINS_SOURCES_LIB_NAME} PUBLIC MOCKABLE_VIRTUAL=)
target_compile_definitions(${BUILTINS_VME_LIB_NAME} PUBLIC MOCKABLE_VIRTUAL=)
if(${MOCKABLE})
target_compile_definitions(${LIB_NAME} PUBLIC MOCKABLE_VIRTUAL=virtual)

View File

@@ -106,7 +106,6 @@ if(${GENERATE_EXECUTABLE})
list(APPEND NEO_DYNAMIC_LIB__TARGET_OBJECTS
$<TARGET_OBJECTS:${SHARINGS_ENABLE_LIB_NAME}>
$<TARGET_OBJECTS:${BUILTINS_SOURCES_LIB_NAME}>
$<TARGET_OBJECTS:${BUILTINS_VME_LIB_NAME}>
$<TARGET_OBJECTS:${BUILTINS_BINARIES_STATELESS_LIB_NAME}>
$<TARGET_OBJECTS:${BUILTINS_BINARIES_STATELESS_HEAPLESS_LIB_NAME}>
$<TARGET_OBJECTS:${BUILTINS_BINARIES_BINDFUL_LIB_NAME}>

View File

@@ -25,7 +25,6 @@
#include "opencl/source/accelerators/intel_motion_estimation.h"
#include "opencl/source/api/additional_extensions.h"
#include "opencl/source/api/api_enter.h"
#include "opencl/source/built_ins/vme_builtin.h"
#include "opencl/source/cl_device/cl_device.h"
#include "opencl/source/command_queue/command_queue.h"
#include "opencl/source/context/context.h"
@@ -1506,36 +1505,12 @@ cl_program CL_API_CALL clCreateProgramWithBuiltInKernels(cl_context context,
TRACING_ENTER(ClCreateProgramWithBuiltInKernels, &context, &numDevices, &deviceList, &kernelNames, &errcodeRet);
cl_int retVal = CL_SUCCESS;
API_ENTER(&retVal);
cl_program program = nullptr;
DBG_LOG_INPUTS("context", context,
"numDevices", numDevices,
"deviceList", deviceList,
"kernelNames", kernelNames);
cl_program program = nullptr;
Context *pContext = nullptr;
retVal = validateObjects(withCastToInternal(context, &pContext), numDevices,
deviceList, kernelNames, errcodeRet);
if (retVal == CL_SUCCESS) {
ClDeviceVector deviceVector;
for (auto i = 0u; i < numDevices; i++) {
auto device = castToObject<ClDevice>(deviceList[i]);
if (!device || !pContext->isDeviceAssociated(*device)) {
retVal = CL_INVALID_DEVICE;
break;
}
deviceVector.push_back(device);
}
if (retVal == CL_SUCCESS) {
program = Vme::createBuiltInProgram(
*pContext,
deviceVector,
kernelNames,
retVal);
}
}
retVal = CL_INVALID_VALUE;
if (errcodeRet) {
*errcodeRet = retVal;
}
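
For context, a minimal host-side sketch (hypothetical variable names) of the behavior after this change: with VME built-in program creation removed, the entry point now reports failure unconditionally.

cl_int err = CL_SUCCESS;
cl_program prog = clCreateProgramWithBuiltInKernels(
    context, 1, &device, "block_motion_estimate_intel", &err);
// After this change: prog == NULL and err == CL_INVALID_VALUE.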

View File

@@ -1,5 +1,5 @@
#
# Copyright (C) 2018-2022 Intel Corporation
# Copyright (C) 2018-2024 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
@@ -9,32 +9,11 @@ set(RUNTIME_SRCS_BUILT_INS
${CMAKE_CURRENT_SOURCE_DIR}/aux_translation_builtin.h
${CMAKE_CURRENT_SOURCE_DIR}/builtins_dispatch_builder.cpp
${CMAKE_CURRENT_SOURCE_DIR}/builtins_dispatch_builder.h
${CMAKE_CURRENT_SOURCE_DIR}/built_in_ops_vme.h
${CMAKE_CURRENT_SOURCE_DIR}/built_ins.inl
${CMAKE_CURRENT_SOURCE_DIR}/vme_builtin.cpp
${CMAKE_CURRENT_SOURCE_DIR}/vme_builtin.h
${CMAKE_CURRENT_SOURCE_DIR}/vme_dispatch_builder.h
)
target_sources(${NEO_STATIC_LIB_NAME} PRIVATE ${RUNTIME_SRCS_BUILT_INS})
set_property(GLOBAL PROPERTY RUNTIME_SRCS_BUILT_INS ${RUNTIME_SRCS_BUILT_INS})
set(RUNTIME_SRCS_BUILT_IN_KERNELS
${CMAKE_CURRENT_SOURCE_DIR}/kernels/vme_block_advanced_motion_estimate_bidirectional_check_intel.builtin_kernel
${CMAKE_CURRENT_SOURCE_DIR}/kernels/vme_block_advanced_motion_estimate_check_intel.builtin_kernel
${CMAKE_CURRENT_SOURCE_DIR}/kernels/vme_block_motion_estimate_intel.builtin_kernel
)
target_sources(${NEO_STATIC_LIB_NAME} PRIVATE ${RUNTIME_SRCS_BUILT_IN_KERNELS})
hide_subdir(registry)
hide_subdir(kernels)
add_subdirectories()
if(NOT (TARGET ${BUILTINS_VME_LIB_NAME}))
add_subdirectory(registry)
if(COMPILE_BUILT_INS)
add_subdirectory(kernels)
endif()
endif()

View File

@@ -1,20 +0,0 @@
/*
* Copyright (C) 2020-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#pragma once
#include "shared/source/built_ins/builtinops/built_in_ops.h"
namespace NEO {
namespace EBuiltInOps {
using Type = uint32_t;
inline constexpr Type vmeBlockMotionEstimateIntel{maxCoreValue + 1};
inline constexpr Type vmeBlockAdvancedMotionEstimateCheckIntel{maxCoreValue + 2};
inline constexpr Type vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel{maxCoreValue + 3};
} // namespace EBuiltInOps
} // namespace NEO

View File

@@ -14,7 +14,6 @@
#include "opencl/source/built_ins/aux_translation_builtin.h"
#include "opencl/source/built_ins/built_ins.inl"
#include "opencl/source/built_ins/vme_dispatch_builder.h"
#include "opencl/source/cl_device/cl_device.h"
#include "opencl/source/execution_environment/cl_execution_environment.h"
#include "opencl/source/helpers/convert_color.h"

View File

@@ -1,11 +0,0 @@
#
# Copyright (C) 2018-2023 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
add_custom_target(builtins_vme_sources)
set_target_properties(builtins_vme_sources PROPERTIES FOLDER "${OPENCL_RUNTIME_PROJECTS_FOLDER}/${OPENCL_BUILTINS_PROJECTS_FOLDER}")
set(BUILTINS_OUTDIR_WITH_ARCH "${TargetDir}/built_ins/${NEO_ARCH}")
add_dependencies(${BUILTINS_BINARIES_BINDFUL_LIB_NAME} builtins_vme_sources)
add_subdirectories()

View File

@@ -1,460 +0,0 @@
/*
* Copyright (C) 2018-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
R"===(
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
block_advanced_motion_estimate_bidirectional_check_intel(
sampler_t accelerator,
__read_only image2d_t srcImg,
__read_only image2d_t refImg,
__read_only image2d_t src_check_image,
__read_only image2d_t ref0_check_image,
__read_only image2d_t ref1_check_image,
uint flags,
uint search_cost_penalty,
uint search_cost_precision,
short2 count_global,
uchar bidir_weight,
__global short2 *count_motion_vector_buffer,
__global short2 *prediction_motion_vector_buffer,
__global char *skip_input_mode_buffer,
__global short2 *skip_motion_vector_buffer,
__global short2 *search_motion_vector_buffer,
__global char *intra_search_predictor_modes,
__global ushort *search_residuals,
__global ushort *skip_residuals,
__global ushort *intra_residuals,
__read_only image2d_t intraSrcImg,
int height,
int width,
int stride) {
__local uint dstSearch[64]; // 8 GRFs
__local uint dstSkipIntra[32 + 24]; // 7 GRFs (4 for inter, 3 for intra)
// distortion in the 6th GRF
__local ushort *distSearch = (__local ushort *)&dstSearch[8 * 5];
// Initialize the MV cost table:
// MV Cost in U4U4 format:
// No cost : 0, 0, 0, 0, 0, 0, 0, 0
// Low Cost : 1, 4, 5, 9, 10, 12, 14, 15
// Normal Cost: 5, 26, 29, 43, 45, 47, 57, 57
// High Cost : 29, 61, 72, 78, 88, 89, 91, 92
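// Illustration: one cost per byte, little-endian - e.g. the normal-cost row
// 0x2B1D1A05 / 0x39392F2D unpacks to 5, 26, 29, 43 / 45, 47, 57, 57.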
uint2 MVCostTable;
if (search_cost_penalty == 1) {
MVCostTable.s0 = 0x09050401;
MVCostTable.s1 = 0x0F0E0C0A;
} else if (search_cost_penalty == 2) {
MVCostTable.s0 = 0x2B1D1A05;
MVCostTable.s1 = 0x39392F2D;
} else if (search_cost_penalty == 3) {
MVCostTable.s0 = 0x4E483D1D;
MVCostTable.s1 = 0x5C5B5958;
} else {
MVCostTable.s0 = 0;
MVCostTable.s1 = 0;
}
uint MVCostPrecision = ((uint)search_cost_precision) << 16;
// Frame is divided into rows * columns of MBs.
// One h/w thread per WG.
// One WG processes "row" MBs - one row per iteration and one MB per row.
// Number of WGs (or h/w threads) is the number of MB columns. Each iteration
// processes one MB in a row - gid_0 is the MB id in a row and gid_1 is the
// row offset.
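// Illustration, assuming height = 4 and stride = 4: group 1 starts at
// sid_0 = 4 and visits sid 4..7, i.e. MBs (gid_0, gid_1) = (1,0), (1,1),
// (1,2), (1,3) - one column of 4 MBs.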
int sid_0 = stride * get_group_id(0);
int gid_0 = sid_0 / height;
int gid_1 = sid_0 % height;
for (int sid = sid_0; sid < sid_0 + stride && gid_0 < width && gid_1 < height;
sid++, gid_0 = sid / height, gid_1 = sid % height) {
int2 srcCoord;
srcCoord.x = gid_0 * 16 +
get_global_offset(0); // 16 pixels wide MBs (globally scalar)
srcCoord.y = gid_1 * 16 +
get_global_offset(1); // 16 pixels tall MBs (globally scalar)
uint curMB = gid_0 + gid_1 * width; // current MB id
short2 count;
// If either the search or skip vector counts are per-MB, then we need to
// read in
// the count motion vector buffer.
if ((count_global.s0 == -1) | (count_global.s1 == -1)) {
count = count_motion_vector_buffer[curMB];
}
// If either the search or skip vector counts are per-frame, we need to use
// those.
if (count_global.s0 >= 0) {
count.s0 = count_global.s0;
}
if (count_global.s1 >= 0) {
count.s1 = count_global.s1;
}
int countPredMVs = count.x;
if (countPredMVs != 0) {
uint offset = curMB * 4; // 4 predictors per MB
offset += get_local_id(0) % 4; // 16 work-items access 4 MVs for MB
// one predictor for MB per SIMD channel
// Reduce predictors from Q-pixel to integer precision.
int2 predMV = 0;
if (get_local_id(0) < countPredMVs) {
// one MV per work-item
if(prediction_motion_vector_buffer != NULL)
{
predMV = convert_int2(prediction_motion_vector_buffer[offset]);
}
// Predictors are input in QP resolution. Convert that to integer
// resolution.
predMV.x /= 4;
predMV.y /= 4;
predMV.y &= 0xFFFFFFFE;
}
// Do up to 4 IMEs, get the best MVs and their distortions, and optionally
// a FBR of
// the best MVs. Finally the results are written out to SLM.
intel_work_group_vme_mb_multi_query_4(
dstSearch, // best search MV and its distortions into SLM
countPredMVs, // count of predictor MVs (globally scalar - value range
// 1 to 4)
MVCostPrecision, // MV cost precision
MVCostTable, // MV cost table
srcCoord, // MB 2-D offset (globally scalar)
predMV, // predictor MVs (up to 4 distinct MVs for SIMD16 thread)
srcImg, // source
refImg, // reference
accelerator); // vme object
}
int doIntra = ((flags & 0x2) != 0);
int intraEdges = 0;
if (doIntra) {
// Enable all edges by default.
intraEdges = 0x3C;
// If this is a left-edge MB, then disable left edges.
if ((gid_0 == 0) & (get_global_offset(0) == 0)) {
intraEdges &= 0x18;
}
// If this is a right edge MB then disable right edges.
if (gid_0 == width - 1) {
intraEdges &= 0x34;
}
// If this is a top-edge MB, then disable top edges.
if ((gid_1 == 0) & (get_global_offset(1) == 0)) {
intraEdges &= 0x20;
}
// Set bit6=bit5.
intraEdges |= ((intraEdges & 0x20) << 1);
intraEdges <<= 8;
}
int skip_block_type_8x8 = flags & 0x4;
int countSkipMVs = count.y;
if (countSkipMVs != 0 || doIntra == true) {
// one set of skip MV per SIMD channel
// Do up to 4 skip checks and get the distortions for each of them.
// Finally the results are written out to SLM.
if ((skip_block_type_8x8 == 0) | ((doIntra) & (countSkipMVs == 0))) {
// 16x16:
uint offset = curMB * 4 * 2; // 4 sets of skip check MVs per MB
int skipMV = 0;
if (get_local_id(0) < countSkipMVs * 2) // need 2 values per MV
{
offset +=
(get_local_id(0)); // 16 work-items access 4 sets of MVs for MB
if(skip_motion_vector_buffer != NULL){
__global int *skip1_motion_vector_buffer =
(__global int *)skip_motion_vector_buffer;
skipMV = skip1_motion_vector_buffer[offset]; // one MV per work-item
}
}
uchar skipMode = 0;
if (get_local_id(0) < countSkipMVs) {
if(skip_input_mode_buffer != NULL)
skipMode = skip_input_mode_buffer[curMB];
if (skipMode == 0) {
skipMode = 1;
}
if (skipMode > 3) {
skipMode = 3;
}
}
intel_work_group_vme_mb_multi_bidir_check_16x16(
dstSkipIntra, // distortions into SLM
countSkipMVs, // count of skip check MVs (globally scalar - value
// range 1 to 4)
doIntra, // compute intra modes
intraEdges, // intra edges to use
srcCoord, // MB 2-D offset (globally scalar)
bidir_weight, // bidirectional weight
skipMode, // skip modes
skipMV, // skip check MVs (up to 4 distinct sets of skip check MVs
// for SIMD16 thread)
src_check_image, // source
ref0_check_image, // reference fwd
ref1_check_image, // reference bwd
intraSrcImg, // intra source
accelerator); // vme object
} else {
// 8x8:
uint offset =
curMB * 4 *
8; // 4 sets of skip check MVs, 16 shorts (8 ints) each per MB
int2 skipMVs = 0;
if (get_local_id(0) < countSkipMVs * 8) // need 8 values per MV
{
offset +=
(get_local_id(0)); // 16 work-items access 4 sets of MVs for MB
if(skip_motion_vector_buffer != NULL){
__global int *skip1_motion_vector_buffer =
(__global int *)(skip_motion_vector_buffer);
skipMVs.x = skip1_motion_vector_buffer[offset]; // four component MVs
// per work-item
skipMVs.y = skip1_motion_vector_buffer[offset + 16];}
}
uchar skipModes = 0;
if (get_local_id(0) < countSkipMVs) {
if(skip_input_mode_buffer != NULL)
skipModes = skip_input_mode_buffer[curMB];
}
intel_work_group_vme_mb_multi_bidir_check_8x8(
dstSkipIntra, // distortions into SLM
countSkipMVs, // count of skip check MVs per MB (globally scalar -
// value range 1 to 4)
doIntra, // compute intra modes
intraEdges, // intra edges to use
srcCoord, // MB 2-D offset (globally scalar)
bidir_weight, // bidirectional weight
skipModes, // skip modes
skipMVs, // skip check MVs (up to 4 distinct sets of skip check MVs
// for SIMD16 thread)
src_check_image, // source
ref0_check_image, // reference fwd
ref1_check_image, // reference bwd
intraSrcImg, // intra source
accelerator); // vme object
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Write Out motion estimation result:
// Result format
// Hierarchical row-major layout
// i.e. row-major of blocks MVs in MBs, and row-major of 4 sets of
// MVs/distortion in blocks
if (countPredMVs != 0) {
// 4x4
if (intel_get_accelerator_mb_block_type(accelerator) == 0x2) {
int index = (gid_0 * 16 + get_local_id(0)) + (gid_1 * 16 * width);
// 1. 16 work-items enabled.
// 2. Work-items gather fwd MVs in strided dword locations 0, 2, .., 30
// (interleaved
// fwd/bwd MVs) with constant offset 8 (control data size) from SLM
// into contiguous
// short2 locations 0, 1, .., 15 of global buffer
// search_motion_vector_buffer with
// offset index.
// 3. Work-items gather contiguous ushort locations 0, 1, .., 15 from
// distSearch into
// contiguous ushort locations 0, 1, .., 15 of search_residuals with
// offset index.
short2 val = as_short2(dstSearch[8 + get_local_id(0) * 2]);
if(search_motion_vector_buffer != NULL)
search_motion_vector_buffer[index] = val;
if (search_residuals != NULL)
{
search_residuals[index] = distSearch[get_local_id(0)];
}
}
// 8x8
else if (intel_get_accelerator_mb_block_type(accelerator) == 0x1) {
// Only the first 4 work-items are needed.
if (get_local_id(0) < 4) {
int index = (gid_0 * 4 + get_local_id(0)) + (gid_1 * 4 * width);
// 1. 4 work-items enabled.
// 2. Work-items gather fwd MVs in strided dword locations 0, 8, 16, 24
// (interleaved
// fwd/bwd MVs) with constant offset 8 from SLM into contiguous
// short2 locations
// 0, 1, .., 15 of global buffer search_motion_vector_buffer with
// offset index.
// 3. Work-items gather strided ushort locations 0, 4, 8, 12 from
// distSearch into
// contiguous ushort locations 0, 1, .., 15 of search_residuals
// with offset index.
short2 val = as_short2(dstSearch[8 + get_local_id(0) * 4 * 2]);
if(search_motion_vector_buffer != NULL)
search_motion_vector_buffer[index] = val;
if (search_residuals != NULL)
{
search_residuals[index] = distSearch[get_local_id(0) * 4];
}
}
}
// 16x16
else if (intel_get_accelerator_mb_block_type(accelerator) == 0x0) {
// Only the 1st work-item is needed.
if (get_local_id(0) == 0) {
int index = gid_0 + gid_1 * width;
// 1. 1 work-item enabled.
// 2. Work-item gathers fwd MV in dword location 0 with constant
// offset 8 from
// SLM into short2 locations 0 of global buffer
// search_motion_vector_buffer.
// 3. Work-item gathers ushort location 0 from distSearch into ushort
// location 0 of search_residuals with offset index.
short2 val = as_short2(dstSearch[8]);
if(search_motion_vector_buffer != NULL)
search_motion_vector_buffer[index] = val;
if (search_residuals != NULL)
{
search_residuals[index] = distSearch[0];
}
}
}
}
// Write out motion skip check result:
// Result format
// Hierarchical row-major layout
// i.e. row-major of blocks in MBs, and row-major of 8 sets of
// distortions in blocks
if (countSkipMVs != 0) {
if (skip_block_type_8x8 == false) {
// Copy out 4 (1 component) sets of distortion values.
int index = (gid_0 * 4) + (get_local_id(0)) + (gid_1 * 4 * width);
if (get_local_id(0) < countSkipMVs) {
// 1. Up to 4 work-items are enabled.
// 2. The work-item gathers distSkip locations 0, 16*1, .., 16*7 and
// copies them to contiguous skip_residual locations 0, 1, 2, ..,
// 7.
__local ushort *distSkip = (__local ushort *)&dstSkipIntra[0];
if(skip_residuals != NULL)
skip_residuals[index] = distSkip[get_local_id(0) * 16];
}
} else {
// Copy out 4 (4 component) sets of distortion values.
int index =
(gid_0 * 4 * 4) + (get_local_id(0)) + (gid_1 * 4 * 4 * width);
if (get_local_id(0) < countSkipMVs * 4) {
// 1. Up to 16 work-items are enabled.
// 2. The work-item gathers distSkip locations 0, 4*1, .., 4*15 and
// copies them to contiguous skip_residual locations 0, 1, 2, ..,
// 15.
__local ushort *distSkip = (__local ushort *)&dstSkipIntra[0];
if(skip_residuals != NULL)
skip_residuals[index] = distSkip[get_local_id(0) * 4];
}
}
}
// Write out intra search result:
if (doIntra) {
// Write out the 4x4 intra modes
if (get_local_id(0) < 8) {
__local char *dstIntra_4x4 =
(__local char *)(&dstSkipIntra[32 + 16 + 4]);
char value = dstIntra_4x4[get_local_id(0)];
char value_low = (value)&0xf;
char value_high = (value >> 4) & 0xf;
int index_low =
(gid_0 * 22) + (get_local_id(0) * 2) + (gid_1 * 22 * width);
int index_high =
(gid_0 * 22) + (get_local_id(0) * 2) + 1 + (gid_1 * 22 * width);
if(intra_search_predictor_modes != NULL) {
intra_search_predictor_modes[index_low + 5] = value_low;
intra_search_predictor_modes[index_high + 5] = value_high;
}
}
// Write out the 8x8 intra modes
if (get_local_id(0) < 4) {
__local char *dstIntra_8x8 =
(__local char *)(&dstSkipIntra[32 + 8 + 4]);
char value = dstIntra_8x8[get_local_id(0) * 2];
char value_low = (value)&0xf;
int index = (gid_0 * 22) + (get_local_id(0)) + (gid_1 * 22 * width);
if(intra_search_predictor_modes != NULL)
intra_search_predictor_modes[index + 1] = value_low;
}
// Write out the 16x16 intra modes
if (get_local_id(0) < 1) {
__local char *dstIntra_16x16 =
(__local char *)(&dstSkipIntra[32 + 0 + 4]);
char value = dstIntra_16x16[0];
char value_low = (value)&0xf;
int index = (gid_0 * 22) + (gid_1 * 22 * width);
if(intra_search_predictor_modes != NULL)
intra_search_predictor_modes[index] = value_low;
}
// Get the intra residuals.
if (intra_residuals != NULL)
{
int index = (gid_0 * 4) + (gid_1 * 4 * width);
if (get_local_id(0) < 1) {
__local ushort *distIntra_4x4 =
(__local ushort *)(&dstSkipIntra[32 + 16 + 3]);
__local ushort *distIntra_8x8 =
(__local ushort *)(&dstSkipIntra[32 + 8 + 3]);
__local ushort *distIntra_16x16 =
(__local ushort *)(&dstSkipIntra[32 + 0 + 3]);
intra_residuals[index + 2] = distIntra_4x4[0];
intra_residuals[index + 1] = distIntra_8x8[0];
intra_residuals[index + 0] = distIntra_16x16[0];
}
}
}
}
}
)==="

View File

@@ -1,26 +0,0 @@
/*
* Copyright (C) 2018-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
R"===(
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
block_advanced_motion_estimate_bidirectional_check_intel(
sampler_t accelerator, __read_only image2d_t srcImg,
__read_only image2d_t refImg, __read_only image2d_t src_check_image,
__read_only image2d_t ref0_check_image,
__read_only image2d_t ref1_check_image, uint flags,
uint search_cost_penalty, uint search_cost_precision, short2 count_global,
uchar bidir_weight, __global short2 *count_motion_vector_buffer,
__global short2 *prediction_motion_vector_buffer,
__global char *skip_input_mode_buffer,
__global short2 *skip_motion_vector_buffer,
__global short2 *search_motion_vector_buffer,
__global char *intra_search_predictor_modes,
__global ushort *search_residuals, __global ushort *skip_residuals,
__global ushort *intra_residuals) {
}
)==="

View File

@@ -1,379 +0,0 @@
/*
* Copyright (C) 2018-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
R"===(
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
block_advanced_motion_estimate_check_intel(
sampler_t accelerator, __read_only image2d_t srcImg,
__read_only image2d_t refImg, uint flags, uint skip_block_type,
uint search_cost_penalty, uint search_cost_precision,
__global short2 *count_motion_vector_buffer,
__global short2 *predictors_buffer,
__global short2 *skip_motion_vector_buffer,
__global short2 *motion_vector_buffer,
__global char *intra_search_predictor_modes, __global ushort *residuals,
__global ushort *skip_residuals, __global ushort *intra_residuals,
__read_only image2d_t intraSrcImg, int height, int width, int stride) {
__local uint dstSearch[64]; // 8 GRFs
__local uint dstSkipIntra[64 + 24]; // 11 GRFs (8 for inter, 3 for intra)
__local ushort *distSearch =
(__local ushort *)&dstSearch[8 * 5]; // distortion in the 6th GRF
// Initialize the MV cost table:
// MV Cost in U4U4 format:
// No cost : 0, 0, 0, 0, 0, 0, 0, 0
// Low Cost : 1, 4, 5, 9, 10, 12, 14, 15
// Normal Cost: 5, 26, 29, 43, 45, 47, 57, 57
// High Cost : 29, 61, 72, 78, 88, 89, 91, 92
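// Illustration: one cost per byte, little-endian - e.g. the high-cost row
// 0x4E483D1D / 0x5C5B5958 unpacks to 29, 61, 72, 78 / 88, 89, 91, 92.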
uint2 MVCostTable;
if (search_cost_penalty == 1) {
MVCostTable.s0 = 0x09050401;
MVCostTable.s1 = 0x0F0E0C0A;
} else if (search_cost_penalty == 2) {
MVCostTable.s0 = 0x2B1D1A05;
MVCostTable.s1 = 0x39392F2D;
} else if (search_cost_penalty == 3) {
MVCostTable.s0 = 0x4E483D1D;
MVCostTable.s1 = 0x5C5B5958;
} else {
MVCostTable.s0 = 0;
MVCostTable.s1 = 0;
}
uint MVCostPrecision = ((uint)search_cost_precision) << 16;
// Frame is divided into rows * columns of MBs.
// One h/w thread per WG.
// One WG processes 'row' MBs - one row per iteration and one MB per row.
// Number of WGs (or h/w threads) is the number of MB columns.
// Each iteration processes one MB in a row - gid_0 is the MB id in a row and
// gid_1 is the row offset.
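// Equivalently (illustration): MBs are linearized column-major as
// sid = gid_0 * height + gid_1, and each WG consumes 'stride' consecutive sids.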
int sid_0 = stride * get_group_id(0);
int gid_0 = sid_0 / height;
int gid_1 = sid_0 % height;
for (int sid = sid_0; sid < sid_0 + stride && gid_0 < width && gid_1 < height;
sid++, gid_0 = sid / height, gid_1 = sid % height) {
int2 srcCoord;
srcCoord.x = gid_0 * 16 +
get_global_offset(0); // 16 pixels wide MBs (globally scalar)
srcCoord.y = gid_1 * 16 +
get_global_offset(1); // 16 pixels tall MBs (globally scalar)
uint curMB = gid_0 + gid_1 * width; // current MB id
short2 count = 0;
if(count_motion_vector_buffer != NULL)
count = count_motion_vector_buffer[curMB];
int countPredMVs = count.x;
if (countPredMVs != 0) {
uint offset = curMB * 8; // 8 predictors per MB
offset += get_local_id(0) % 8; // 16 work-items access 8 MVs for MB
// one predictor for MB per SIMD channel
// Reduce predictors from Q-pixel to integer precision.
int2 predMV = 0;
if (get_local_id(0) < countPredMVs) {
if(predictors_buffer != NULL){
predMV =
convert_int2(predictors_buffer[offset]); // one MV per work-item
predMV.x /= 4;
predMV.y /= 4;
predMV.y &= 0xFFFE;}
}
// Do up to 8 IMEs, get the best MVs and their distortions, and optionally
// a FBR of the best MVs.
// Finally the results are written out to SLM.
intel_work_group_vme_mb_multi_query_8(
dstSearch, // best search MV and its distortions into SLM
countPredMVs, // count of predictor MVs (globally scalar - value range
// 1 to 8)
MVCostPrecision, // MV cost precision
MVCostTable, // MV cost table
srcCoord, // MB 2-D offset (globally scalar)
predMV, // predictor MVs (up to 8 distinct MVs for SIMD16 thread)
srcImg, // source
refImg, // reference
accelerator); // vme object
}
int doIntra = (flags & 0x2) != 0;
int intraEdges = 0;
if (doIntra) {
// Enable all edges by default.
intraEdges = 0x3C;
// If this is a left-edge MB, then disable left edges.
if ((gid_0 == 0) & (get_global_offset(0) == 0)) {
intraEdges &= 0x18;
}
// If this is a right edge MB then disable right edges.
if (gid_0 == width - 1) {
intraEdges &= 0x34;
}
// If this is a top-edge MB, then disable top edges.
if ((gid_1 == 0) & (get_global_offset(1) == 0)) {
intraEdges &= 0x20;
}
// Set bit6=bit5.
intraEdges |= ((intraEdges & 0x20) << 1);
intraEdges <<= 8;
}
int countSkipMVs = count.y;
if (countSkipMVs != 0 || doIntra == true) {
uint offset = curMB * 8; // 8 sets of skip check MVs per MB
offset +=
(get_local_id(0) % 8); // 16 work-items access 8 sets of MVs for MB
// one set of skip MV per SIMD channel
// Do up to 8 skip checks and get the distortions for each of them.
// Finally the results are written out to SLM.
if ((skip_block_type == 0x0) | ((doIntra) & (countSkipMVs == 0))) {
int skipMVs = 0;
if (get_local_id(0) < countSkipMVs) {
if(skip_motion_vector_buffer != NULL ) {
__global int *skip1_motion_vector_buffer =
(__global int *)skip_motion_vector_buffer;
skipMVs = skip1_motion_vector_buffer[offset]; } // one packed MV for one
// work-item
}
intel_work_group_vme_mb_multi_check_16x16(
dstSkipIntra, // distortions into SLM
countSkipMVs, // count of skip check MVs (value range 0 to 8)
doIntra, // compute intra modes
intraEdges, // intra edges to use
srcCoord, // MB 2-D offset (globally scalar)
skipMVs, // skip check MVs (up to 8 sets of skip check MVs for
// SIMD16 thread)
srcImg, // source
refImg, // reference
intraSrcImg, // intra source
accelerator);
}
if ((skip_block_type == 0x1) & (countSkipMVs > 0)) {
int4 skipMVs = 0;
if (get_local_id(0) < countSkipMVs) {
if(skip_motion_vector_buffer != NULL){
__global int4 *skip4_motion_vector_buffer =
(__global int4 *)(skip_motion_vector_buffer);
skipMVs = skip4_motion_vector_buffer[offset]; } // four component MVs
// per work-item
}
intel_work_group_vme_mb_multi_check_8x8(
dstSkipIntra, // distortions into SLM
countSkipMVs, // count of skip check MVs per MB (value range 0 to 8)
doIntra, // compute intra modes
intraEdges, // intra edges to use
srcCoord, // MB 2-D offset (globally scalar)
skipMVs, // skip check MVs (up to 8 sets of skip check MVs for SIMD16
// thread)
srcImg, // source
refImg, // reference
intraSrcImg, // intra source
accelerator);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Write Out motion estimation result:
// Result format
// Hierarchical row-major layout
// i.e. row-major of blocks MVs in MBs, and row-major of 8 sets of
// MVs/distortion in blocks
if (countPredMVs != 0) {
// 4x4
if (intel_get_accelerator_mb_block_type(accelerator) == 0x2) {
int index = (gid_0 * 16 + get_local_id(0)) + (gid_1 * 16 * width);
// 1. 16 work-items enabled.
// 2. Work-items gather fwd MVs in strided dword locations 0, 2, .., 30
// (interleaved
// fwd/bwd MVs) with constant offset 8 (control data size) from SLM
// into contiguous
// short2 locations 0, 1, .., 15 of global buffer
// motion_vector_buffer with
// offset index.
// 3. Work-items gather contiguous ushort locations 0, 1, .., 15 from
// distSearch into
// contiguous ushort locations 0, 1, .., 15 of residuals with
// offset index.
short2 val = as_short2(dstSearch[8 + get_local_id(0) * 2]);
if(motion_vector_buffer != NULL)
motion_vector_buffer[index] = val;
if (residuals != NULL)
{
residuals[index] = distSearch[get_local_id(0)];
}
}
// 8x8
else if (intel_get_accelerator_mb_block_type(accelerator) == 0x1) {
// Only the first 4 work-items are needed.
if (get_local_id(0) < 4) {
int index = (gid_0 * 4 + get_local_id(0)) + (gid_1 * 4 * width);
// 1. 4 work-items enabled.
// 2. Work-items gather fwd MVs in strided dword locations 0, 8, 16, 24
// (interleaved
// fwd/bwd MVs) with constant offset 8 from SLM into contiguous
// short2 locations
// 0, 1, .., 15 of global buffer motion_vector_buffer with
// offset index.
// 3. Work-items gather strided ushort locations 0, 4, 8, 12 from
// distSearch into
// contiguous ushort locations 0, 1, .., 15 of residuals
// with offset index.
short2 val = as_short2(dstSearch[8 + get_local_id(0) * 4 * 2]);
if(motion_vector_buffer != NULL)
motion_vector_buffer[index] = val;
if (residuals != NULL)
{
residuals[index] = distSearch[get_local_id(0) * 4];
}
}
}
// 16x16
else if (intel_get_accelerator_mb_block_type(accelerator) == 0x0) {
// Only the 1st work-item is needed.
if (get_local_id(0) == 0) {
int index = gid_0 + gid_1 * width;
// 1. 1 work-item enabled.
// 2. Work-item gathers fwd MV in dword location 0 with constant
// offset 8 from
// SLM into short2 location 0 of global buffer
// motion_vector_buffer.
// 3. Work-item gathers ushort location 0 from distSearch into ushort
// location 0 of residuals with offset index.
short2 val = as_short2(dstSearch[8]);
if(motion_vector_buffer != NULL)
motion_vector_buffer[index] = val;
if (residuals != NULL)
{
residuals[index] = distSearch[0];
}
}
}
}
// Write out motion skip check result:
// Result format
// Hierarchical row-major layout
// i.e. row-major of blocks in MBs, and row-major of 8 sets of
// distortions in blocks
if (countSkipMVs != 0) {
if (skip_block_type == 0x0) {
// Copy out 8 (1 component) sets of distortion values.
int index = (gid_0 * 8) + (get_local_id(0)) + (gid_1 * 8 * width);
if (get_local_id(0) < countSkipMVs) {
__local ushort *distSkip = (__local ushort *)&dstSkipIntra[0];
// 1. Up to 8 work-items are enabled.
// 2. The work-item gathers distSkip locations 0, 16*1, .., 16*7 and
// copies them to contiguous skip_residual locations 0, 1, 2, ..,
// 7.
if(skip_residuals != NULL)
skip_residuals[index] = distSkip[get_local_id(0) * 16];
}
} else {
// Copy out 8 (4 component) sets of distortion values.
int index =
(gid_0 * 8 * 4) + (get_local_id(0)) + (gid_1 * 8 * 4 * width);
__local ushort *distSkip = (__local ushort *)&dstSkipIntra[0];
if (get_local_id(0) < countSkipMVs * 4) {
// 1. Up to 16 work-items are enabled.
// 2. The work-item gathers distSkip locations 0, 4*1, .., 4*31 and
// copies them to contiguous skip_residual locations 0, 1, 2, ..,
// 31.
if(skip_residuals != NULL){
skip_residuals[index] = distSkip[get_local_id(0) * 4];
skip_residuals[index + 16] = distSkip[(get_local_id(0) + 16) * 4];}
}
}
}
// Write out intra search result:
if (doIntra) {
int index_low =
(gid_0 * 22) + (get_local_id(0) * 2) + (gid_1 * 22 * width);
int index_high =
(gid_0 * 22) + (get_local_id(0) * 2) + 1 + (gid_1 * 22 * width);
// Write out the 4x4 intra modes
if (get_local_id(0) < 8) {
__local char *dstIntra_4x4 =
(__local char *)(&dstSkipIntra[64 + 16 + 4]);
char value = dstIntra_4x4[get_local_id(0)];
char value_low = (value)&0xf;
char value_high = (value >> 4) & 0xf;
if(intra_search_predictor_modes != NULL){
intra_search_predictor_modes[index_low + 5] = value_low;
intra_search_predictor_modes[index_high + 5] = value_high;}
}
// Write out the 8x8 intra modes
if (get_local_id(0) < 4) {
__local char *dstIntra_8x8 =
(__local char *)(&dstSkipIntra[64 + 8 + 4]);
char value = dstIntra_8x8[get_local_id(0) * 2];
char value_low = (value)&0xf;
int index = (gid_0 * 22) + (get_local_id(0)) + (gid_1 * 22 * width);
if(intra_search_predictor_modes != NULL)
intra_search_predictor_modes[index + 1] = value_low;
}
// Write out the 16x16 intra modes
if (get_local_id(0) < 1) {
__local char *dstIntra_16x16 =
(__local char *)(&dstSkipIntra[64 + 0 + 4]);
char value = dstIntra_16x16[get_local_id(0)];
char value_low = (value)&0xf;
if(intra_search_predictor_modes != NULL)
intra_search_predictor_modes[index_low] = value_low;
}
// Get the intra residuals.
if (intra_residuals != NULL)
{
int index = (gid_0 * 4) + (gid_1 * 4 * width);
if (get_local_id(0) < 1) {
__local ushort *distIntra_4x4 = (__local ushort *)(&dstSkipIntra[64 + 16 + 3]);
__local ushort *distIntra_8x8 = (__local ushort *)(&dstSkipIntra[64 + 8 + 3]);
__local ushort *distIntra_16x16 = (__local ushort *)(&dstSkipIntra[64 + 0 + 3]);
intra_residuals[index + 2] = distIntra_4x4[0];
intra_residuals[index + 1] = distIntra_8x8[0];
intra_residuals[index + 0] = distIntra_16x16[0];
}
}
}
}
}
)==="

View File

@@ -1,21 +0,0 @@
/*
* Copyright (C) 2018-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
R"===(
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
block_advanced_motion_estimate_check_intel(
sampler_t accelerator, __read_only image2d_t srcImg,
__read_only image2d_t refImg, uint flags, uint skip_block_type,
uint search_cost_penalty, uint search_cost_precision,
__global short2 *count_motion_vector_buffer,
__global short2 *predictors_buffer,
__global short2 *skip_motion_vector_buffer,
__global short2 *motion_vector_buffer,
__global char *intra_search_predictor_modes, __global ushort *residuals,
__global ushort *skip_residuals, __global ushort *intra_residuals) {
}
)==="

View File

@@ -1,103 +0,0 @@
/*
* Copyright (C) 2018-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
R"===(
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
block_motion_estimate_intel(sampler_t accelerator, __read_only image2d_t srcImg,
__read_only image2d_t refImg,
__global short2 *prediction_motion_vector_buffer,
__global short2 *motion_vector_buffer,
__global ushort *residuals, int height, int width,
int stride) {
__local uint dst[64];
__local ushort *dist = (__local ushort *)&dst[8 * 5];
int sid_0 = stride * get_group_id(0);
int gid_0 = sid_0 / height;
int gid_1 = sid_0 % height;
for (int sid = sid_0; sid < sid_0 + stride && gid_0 < width && gid_1 < height;
sid++, gid_0 = sid / height, gid_1 = sid % height) {
int2 srcCoord = 0;
int2 refCoord = 0;
srcCoord.x = gid_0 * 16 + get_global_offset(0);
srcCoord.y = gid_1 * 16 + get_global_offset(1);
short2 predMV = 0;
#ifndef HW_NULL_CHECK
if (prediction_motion_vector_buffer != NULL)
#endif
{
predMV = prediction_motion_vector_buffer[gid_0 + gid_1 * width];
refCoord.x = predMV.x / 4;
refCoord.y = predMV.y / 4;
refCoord.y = refCoord.y & 0xFFFE;
}
{
intel_work_group_vme_mb_query(dst, srcCoord, refCoord, srcImg, refImg,
accelerator);
}
barrier(CLK_LOCAL_MEM_FENCE);
// Write Out Result
// 4x4
if (intel_get_accelerator_mb_block_type(accelerator) == 0x2) {
int x = get_local_id(0) % 4;
int y = get_local_id(0) / 4;
int index = (gid_0 * 4 + x) + (gid_1 * 4 + y) * width * 4;
short2 val = as_short2(dst[8 + (y * 4 + x) * 2]);
motion_vector_buffer[index] = val;
#ifndef HW_NULL_CHECK
if (residuals != NULL)
#endif
{
residuals[index] = dist[y * 4 + x];
}
}
// 8x8
if (intel_get_accelerator_mb_block_type(accelerator) == 0x1) {
if (get_local_id(0) < 4) {
int x = get_local_id(0) % 2;
int y = get_local_id(0) / 2;
int index = (gid_0 * 2 + x) + (gid_1 * 2 + y) * width * 2;
short2 val = as_short2(dst[8 + (y * 2 + x) * 8]);
motion_vector_buffer[index] = val;
#ifndef HW_NULL_CHECK
if (residuals != NULL)
#endif
{
residuals[index] = dist[(y * 2 + x) * 4];
}
}
}
// 16x16
if (intel_get_accelerator_mb_block_type(accelerator) == 0x0) {
if (get_local_id(0) == 0) {
int index = gid_0 + gid_1 * width;
short2 val = as_short2(dst[8]);
motion_vector_buffer[index] = val;
#ifndef HW_NULL_CHECK
if (residuals != NULL)
#endif
{
residuals[index] = dist[0];
}
}
}
}
}
)==="

View File

@@ -1,16 +0,0 @@
/*
* Copyright (C) 2018-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
R"===(
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
block_motion_estimate_intel(sampler_t accelerator, __read_only image2d_t srcImg,
__read_only image2d_t refImg,
__global short2 *prediction_motion_vector_buffer,
__global short2 *motion_vector_buffer,
__global ushort *residuals) {
}
)==="

View File

@@ -1,16 +0,0 @@
#
# Copyright (C) 2018-2021 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
add_library(${BUILTINS_VME_LIB_NAME} OBJECT EXCLUDE_FROM_ALL
CMakeLists.txt
register_ext_vme_source.cpp
)
set_target_properties(${BUILTINS_VME_LIB_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(${BUILTINS_VME_LIB_NAME} PROPERTIES FOLDER "${OPENCL_RUNTIME_PROJECTS_FOLDER}/${OPENCL_BUILTINS_PROJECTS_FOLDER}")
target_include_directories(${BUILTINS_VME_LIB_NAME} PRIVATE
${KHRONOS_HEADERS_DIR}
)

View File

@@ -1,44 +0,0 @@
/*
* Copyright (C) 2018-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/built_ins/built_ins.h"
#include "shared/source/built_ins/registry/built_ins_registry.h"
#include "opencl/source/built_ins/built_in_ops_vme.h"
#include <string>
namespace NEO {
static RegisterEmbeddedResource registerVmeSrc(
createBuiltinResourceName(
EBuiltInOps::vmeBlockMotionEstimateIntel,
BuiltinCode::getExtension(BuiltinCode::ECodeType::source))
.c_str(),
std::string(
#include "opencl/source/built_ins/kernels/vme_block_motion_estimate_intel.builtin_kernel"
));
static RegisterEmbeddedResource registerVmeAdvancedSrc(
createBuiltinResourceName(
EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel,
BuiltinCode::getExtension(BuiltinCode::ECodeType::source))
.c_str(),
std::string(
#include "opencl/source/built_ins/kernels/vme_block_advanced_motion_estimate_check_intel.builtin_kernel"
));
static RegisterEmbeddedResource registerVmeAdvancedBidirectionalSrc(
createBuiltinResourceName(
EBuiltInOps::vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel,
BuiltinCode::getExtension(BuiltinCode::ECodeType::source))
.c_str(),
std::string(
#include "opencl/source/built_ins/kernels/vme_block_advanced_motion_estimate_bidirectional_check_intel.builtin_kernel"
));
} // namespace NEO

View File

@@ -1,126 +0,0 @@
/*
* Copyright (C) 2020-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "opencl/source/built_ins/vme_builtin.h"
#include "shared/source/built_ins/built_ins.h"
#include "shared/source/device/device.h"
#include "opencl/source/built_ins/built_in_ops_vme.h"
#include "opencl/source/built_ins/builtins_dispatch_builder.h"
#include "opencl/source/built_ins/populate_built_ins.inl"
#include "opencl/source/built_ins/vme_dispatch_builder.h"
#include "opencl/source/execution_environment/cl_execution_environment.h"
#include "opencl/source/program/program.h"
#include <sstream>
namespace NEO {
static const char *blockMotionEstimateIntelSrc = {
#include "kernels/vme_block_motion_estimate_intel_frontend.builtin_kernel"
};
static const char *blockAdvancedMotionEstimateCheckIntelSrc = {
#include "kernels/vme_block_advanced_motion_estimate_check_intel_frontend.builtin_kernel"
};
static const char *blockAdvancedMotionEstimateBidirectionalCheckIntelSrc = {
#include "kernels/vme_block_advanced_motion_estimate_bidirectional_check_intel_frontend.builtin_kernel"
};
static const std::tuple<const char *, const char *> mediaBuiltIns[] = {
{"block_motion_estimate_intel", blockMotionEstimateIntelSrc},
{"block_advanced_motion_estimate_check_intel", blockAdvancedMotionEstimateCheckIntelSrc},
{"block_advanced_motion_estimate_bidirectional_check_intel", blockAdvancedMotionEstimateBidirectionalCheckIntelSrc}};
// Unlike other built-ins, media kernels are not stored in the BuiltIns object.
// A pointer to the program with built-in kernels is returned to the user through
// the API call, and the user is responsible for releasing it by calling clReleaseProgram.
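// Worked example (illustration): kernelNames =
// "block_motion_estimate_intel;block_advanced_motion_estimate_check_intel"
// concatenates those two frontend sources into one program below; any name
// not present in mediaBuiltIns makes the call fail with CL_INVALID_VALUE.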
Program *Vme::createBuiltInProgram(
Context &context,
const ClDeviceVector &deviceVector,
const char *kernelNames,
int &errcodeRet) {
std::string programSourceStr = "";
std::istringstream ss(kernelNames);
std::string currentKernelName;
while (std::getline(ss, currentKernelName, ';')) {
bool found = false;
for (auto &builtInTuple : mediaBuiltIns) {
if (currentKernelName == std::get<0>(builtInTuple)) {
programSourceStr += std::get<1>(builtInTuple);
found = true;
break;
}
}
if (!found) {
errcodeRet = CL_INVALID_VALUE;
return nullptr;
}
}
if (programSourceStr.empty() == true) {
errcodeRet = CL_INVALID_VALUE;
return nullptr;
}
Program *pBuiltInProgram = nullptr;
pBuiltInProgram = Program::createBuiltInFromSource(programSourceStr.c_str(), &context, deviceVector, nullptr);
auto &device = *deviceVector[0];
if (pBuiltInProgram) {
std::unordered_map<std::string, BuiltinDispatchInfoBuilder *> builtinsBuilders;
builtinsBuilders["block_motion_estimate_intel"] =
&Vme::getBuiltinDispatchInfoBuilder(EBuiltInOps::vmeBlockMotionEstimateIntel, device);
builtinsBuilders["block_advanced_motion_estimate_check_intel"] =
&Vme::getBuiltinDispatchInfoBuilder(EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel, device);
builtinsBuilders["block_advanced_motion_estimate_bidirectional_check_intel"] =
&Vme::getBuiltinDispatchInfoBuilder(EBuiltInOps::vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel, device);
errcodeRet = pBuiltInProgram->build(deviceVector, mediaKernelsBuildOptions, builtinsBuilders);
} else {
errcodeRet = CL_INVALID_VALUE;
}
return pBuiltInProgram;
}
const char *getAdditionalBuiltinAsString(EBuiltInOps::Type builtin) {
switch (builtin) {
default:
return nullptr;
case EBuiltInOps::vmeBlockMotionEstimateIntel:
return "vme_block_motion_estimate_intel.builtin_kernel";
case EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel:
return "vme_block_advanced_motion_estimate_check_intel.builtin_kernel";
case EBuiltInOps::vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel:
return "vme_block_advanced_motion_estimate_bidirectional_check_intel";
}
}
BuiltinDispatchInfoBuilder &Vme::getBuiltinDispatchInfoBuilder(EBuiltInOps::Type operation, ClDevice &device) {
auto &builtins = *device.getDevice().getBuiltIns();
uint32_t operationId = static_cast<uint32_t>(operation);
auto clExecutionEnvironment = static_cast<ClExecutionEnvironment *>(device.getExecutionEnvironment());
auto &operationBuilder = clExecutionEnvironment->peekBuilders(device.getRootDeviceIndex())[operationId];
switch (operation) {
default:
return BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(operation, device);
case EBuiltInOps::vmeBlockMotionEstimateIntel:
std::call_once(operationBuilder.second, [&] { operationBuilder.first = std::make_unique<BuiltInOp<EBuiltInOps::vmeBlockMotionEstimateIntel>>(builtins, device); });
break;
case EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel:
std::call_once(operationBuilder.second, [&] { operationBuilder.first = std::make_unique<BuiltInOp<EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel>>(builtins, device); });
break;
case EBuiltInOps::vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel:
std::call_once(operationBuilder.second, [&] { operationBuilder.first = std::make_unique<BuiltInOp<EBuiltInOps::vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel>>(builtins, device); });
break;
}
return *operationBuilder.first;
}
} // namespace NEO

View File

@@ -1,29 +0,0 @@
/*
* Copyright (C) 2020 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#pragma once
#include "opencl/source/built_ins/built_in_ops_vme.h"
namespace NEO {
class Program;
class ClDevice;
class ClDeviceVector;
class Context;
class BuiltIns;
class BuiltinDispatchInfoBuilder;
namespace Vme {
Program *createBuiltInProgram(
Context &context,
const ClDeviceVector &deviceVector,
const char *kernelNames,
int &errcodeRet);
BuiltinDispatchInfoBuilder &getBuiltinDispatchInfoBuilder(EBuiltInOps::Type operation, ClDevice &device);
} // namespace Vme
} // namespace NEO

View File

@@ -1,483 +0,0 @@
/*
* Copyright (C) 2018-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#pragma once
#include "shared/source/built_ins/built_ins.h"
#include "shared/source/helpers/basic_math.h"
#include "opencl/source/accelerators/intel_accelerator.h"
#include "opencl/source/built_ins/built_in_ops_vme.h"
#include "opencl/source/built_ins/builtins_dispatch_builder.h"
#include "opencl/source/cl_device/cl_device.h"
#include "opencl/source/helpers/dispatch_info_builder.h"
#include "opencl/source/mem_obj/buffer.h"
#include "opencl/source/mem_obj/image.h"
namespace NEO {
class VmeBuiltinDispatchInfoBuilder : public BuiltinDispatchInfoBuilder {
public:
VmeBuiltinDispatchInfoBuilder(BuiltIns &kernelsLib, ClDevice &device, EBuiltInOps::Type builtinOp,
const char *kernelName)
: BuiltinDispatchInfoBuilder(kernelsLib, device) {
populate(builtinOp,
mediaKernelsBuildOptions,
kernelName, multiDeviceVmeKernel);
auto rootDeviceIndex = device.getRootDeviceIndex();
vmeKernel = multiDeviceVmeKernel->getKernel(rootDeviceIndex);
widthArgNum = vmeKernel->getKernelInfo().getArgNumByName("width");
heightArgNum = vmeKernel->getKernelInfo().getArgNumByName("height");
strideArgNum = vmeKernel->getKernelInfo().getArgNumByName("stride");
acceleratorArgNum = vmeKernel->getKernelInfo().getArgNumByName("accelerator");
srcImgArgNum = vmeKernel->getKernelInfo().getArgNumByName("srcImg");
refImgArgNum = vmeKernel->getKernelInfo().getArgNumByName("refImg");
motionVectorBufferArgNum = vmeKernel->getKernelInfo().getArgNumByName("motion_vector_buffer");
predictionMotionVectorBufferArgNum = vmeKernel->getKernelInfo().getArgNumByName("prediction_motion_vector_buffer");
residualsArgNum = vmeKernel->getKernelInfo().getArgNumByName("residuals");
}
void getBlkTraits(const Vec3<size_t> &inGws, size_t &gwWidthInBlk, size_t &gwHeightInBlk) const {
const size_t vmeMacroBlockWidth = 16;
const size_t vmeMacroBlockHeight = 16;
gwWidthInBlk = Math::divideAndRoundUp(inGws.x, vmeMacroBlockWidth);
gwHeightInBlk = Math::divideAndRoundUp(inGws.y, vmeMacroBlockHeight);
}
bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo, Kernel *kern,
const uint32_t inDim, const Vec3<size_t> &inGws, const Vec3<size_t> &inLws, const Vec3<size_t> &inOffset) const override {
if (kern == nullptr) {
return false;
}
size_t gwWidthInBlk = 0;
size_t gwHeightInBlk = 0;
getBlkTraits(inGws, gwWidthInBlk, gwHeightInBlk);
cl_int height = (cl_int)gwHeightInBlk;
cl_int width = (cl_int)gwWidthInBlk;
cl_int stride = height;
size_t numThreadsX = gwWidthInBlk;
const size_t simdWidth = vmeKernel->getKernelInfo().getMaxSimdSize();
stride = static_cast<cl_int>(Math::divideAndRoundUp(height * width, numThreadsX));
// update implicit args
vmeKernel->setArg(heightArgNum, sizeof(height), &height);
vmeKernel->setArg(widthArgNum, sizeof(width), &width);
vmeKernel->setArg(strideArgNum, sizeof(stride), &stride);
// Update global work size to force macro-block to HW thread execution model
Vec3<size_t> gws = {numThreadsX * simdWidth, 1, 1};
Vec3<size_t> lws = {vmeKernel->getKernelInfo().kernelDescriptor.kernelAttributes.requiredWorkgroupSize[0], 1, 1};
DispatchInfoBuilder<SplitDispatch::Dim::d2D, SplitDispatch::SplitMode::noSplit> builder(clDevice);
builder.setDispatchGeometry(gws, lws, inOffset, gws, lws);
builder.setKernel(vmeKernel);
builder.bake(multiDispatchInfo);
return true;
}
bool setExplicitArg(uint32_t argIndex, size_t argSize, const void *argVal, cl_int &err) const override {
DEBUG_BREAK_IF(!((argIndex != widthArgNum) && (argIndex != heightArgNum) && (argIndex != strideArgNum)));
if ((argIndex == acceleratorArgNum) && (argVal == nullptr)) {
err = CL_INVALID_ACCELERATOR_INTEL;
return false;
}
err = vmeKernel->setArg(argIndex, argSize, argVal);
return false;
}
cl_int validateDispatch(Kernel *kernel, uint32_t inworkDim, const Vec3<size_t> &inGws, const Vec3<size_t> &inLws, const Vec3<size_t> &inOffset) const override {
if (inworkDim != 2) {
return CL_INVALID_WORK_DIMENSION;
}
size_t gwWidthInBlk = 0;
size_t gwHeightInBlk = 0;
getBlkTraits(inGws, gwWidthInBlk, gwHeightInBlk);
size_t blkNum = gwWidthInBlk * gwHeightInBlk;
size_t blkMul = 1;
IntelAccelerator *accelerator = castToObject<IntelAccelerator>((cl_accelerator_intel)vmeKernel->getKernelArg(acceleratorArgNum));
if (accelerator == nullptr) {
return CL_INVALID_KERNEL_ARGS; // accelerator was not set
}
DEBUG_BREAK_IF(accelerator->getDescriptorSize() != sizeof(cl_motion_estimation_desc_intel));
const cl_motion_estimation_desc_intel *acceleratorDesc = reinterpret_cast<const cl_motion_estimation_desc_intel *>(accelerator->getDescriptor());
switch (acceleratorDesc->mb_block_type) {
case CL_ME_MB_TYPE_8x8_INTEL:
blkMul = 4;
break;
case CL_ME_MB_TYPE_4x4_INTEL:
blkMul = 16;
break;
default:
break;
}
return validateVmeDispatch(inGws, inOffset, blkNum, blkMul);
}
// notes on corner cases :
// * if arg not available in kernels - returns true
// * if arg set to nullptr - returns true
bool validateBufferSize(int32_t bufferArgNum, size_t minimumSizeExpected) const {
if (bufferArgNum == -1) {
return true;
}
auto buff = castToObject<Buffer>((cl_mem)vmeKernel->getKernelArg(bufferArgNum));
if (buff == nullptr) {
return true;
}
size_t bufferSize = buff->getSize();
if (bufferSize < minimumSizeExpected) {
return false;
}
return true;
}
template <typename EnumBaseType>
bool validateEnumVal(EnumBaseType val) const {
return false;
}
template <typename EnumBaseType, typename ExpectedValType, typename... ExpectedValsTypes>
bool validateEnumVal(EnumBaseType val, ExpectedValType expectedVal, ExpectedValsTypes... expVals) const {
return (val == static_cast<EnumBaseType>(expectedVal)) || validateEnumVal<EnumBaseType, ExpectedValsTypes...>(val, expVals...);
}
// notes on corner cases :
// * if arg not available in kernels - returns true
template <typename EnumBaseType, typename... ExpectedValsTypes>
bool validateEnumArg(int32_t argNum, ExpectedValsTypes... expVals) const {
if (argNum == -1) {
return true;
}
EnumBaseType val = this->getKernelArgByValValue<EnumBaseType>(static_cast<uint32_t>(argNum));
return validateEnumVal<EnumBaseType, ExpectedValsTypes...>(val, expVals...);
}
template <typename RetType>
RetType getKernelArgByValValue(uint32_t argNum) const {
const auto &argAsVal = vmeKernel->getKernelInfo().kernelDescriptor.payloadMappings.explicitArgs[argNum].as<ArgDescValue>();
DEBUG_BREAK_IF(argAsVal.elements.size() != 1);
const auto &element = argAsVal.elements[0];
DEBUG_BREAK_IF(sizeof(RetType) > element.size);
return *(RetType *)(vmeKernel->getCrossThreadData() + element.offset);
}
cl_int validateImages(const Vec3<size_t> &inputRegion, const Vec3<size_t> &offset) const {
Image *srcImg = castToObject<Image>((cl_mem)vmeKernel->getKernelArg(srcImgArgNum));
Image *refImg = castToObject<Image>((cl_mem)vmeKernel->getKernelArg(refImgArgNum));
if ((srcImg == nullptr) || (refImg == nullptr)) {
return CL_INVALID_KERNEL_ARGS;
}
for (Image *img : {srcImg, refImg}) {
const cl_image_format &imgFormat = img->getImageFormat();
if ((imgFormat.image_channel_order != CL_R) || (imgFormat.image_channel_data_type != CL_UNORM_INT8)) {
return CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
}
if (false == img->isTiledAllocation()) {
// VME only works with tiled images.
return CL_OUT_OF_RESOURCES;
}
}
{
const cl_image_desc &srcImgDesc = srcImg->getImageDesc();
size_t srcImageWidth = srcImgDesc.image_width;
size_t srcImageHeight = srcImgDesc.image_height;
if (((inputRegion.x + offset.x) > srcImageWidth) ||
((inputRegion.y + offset.y) > srcImageHeight)) {
return CL_INVALID_IMAGE_SIZE;
}
}
return CL_SUCCESS;
}
virtual cl_int validateVmeDispatch(const Vec3<size_t> &inputRegion, const Vec3<size_t> &offset, size_t blkNum, size_t blkMul) const {
{
cl_int imageValidationStatus = validateImages(inputRegion, offset);
if (imageValidationStatus != CL_SUCCESS) {
return imageValidationStatus;
}
}
size_t numPredictors = 1;
std::pair<int32_t, size_t> bufferRequirements[] = {
std::make_pair(motionVectorBufferArgNum, (blkNum * blkMul * 2 * sizeof(cl_short))),
std::make_pair(predictionMotionVectorBufferArgNum, (blkNum * numPredictors * 2 * sizeof(cl_short))),
std::make_pair(residualsArgNum, (blkNum * blkMul * sizeof(cl_ushort)))};
for (const auto &req : bufferRequirements) {
if (false == validateBufferSize(req.first, req.second)) {
return CL_INVALID_BUFFER_SIZE;
}
}
return CL_SUCCESS;
}
protected:
uint32_t heightArgNum;
uint32_t widthArgNum;
uint32_t strideArgNum;
uint32_t acceleratorArgNum;
uint32_t srcImgArgNum;
uint32_t refImgArgNum;
int32_t motionVectorBufferArgNum;
int32_t predictionMotionVectorBufferArgNum;
int32_t residualsArgNum;
MultiDeviceKernel *multiDeviceVmeKernel;
Kernel *vmeKernel;
};
template <>
class BuiltInOp<EBuiltInOps::vmeBlockMotionEstimateIntel> : public VmeBuiltinDispatchInfoBuilder {
public:
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device)
: VmeBuiltinDispatchInfoBuilder(kernelsLib, device,
EBuiltInOps::vmeBlockMotionEstimateIntel, "block_motion_estimate_intel") {
}
};
class AdvancedVmeBuiltinDispatchInfoBuilder : public VmeBuiltinDispatchInfoBuilder {
public:
AdvancedVmeBuiltinDispatchInfoBuilder(BuiltIns &kernelsLib, ClDevice &device, EBuiltInOps::Type builtinOp,
const char *kernelName)
: VmeBuiltinDispatchInfoBuilder(kernelsLib, device, builtinOp,
kernelName) {
flagsArgNum = this->vmeKernel->getKernelInfo().getArgNumByName("flags");
intraSrcImgArgNum = this->vmeKernel->getKernelInfo().getArgNumByName("intraSrcImg");
skipBlockTypeArgNum = this->vmeKernel->getKernelInfo().getArgNumByName("skip_block_type");
searchCostPenaltyArgNum = this->vmeKernel->getKernelInfo().getArgNumByName("search_cost_penalty");
searchCostPrecisionArgNum = this->vmeKernel->getKernelInfo().getArgNumByName("search_cost_precision");
bidirWeightArgNum = this->vmeKernel->getKernelInfo().getArgNumByName("bidir_weight");
predictorsBufferArgNum = this->vmeKernel->getKernelInfo().getArgNumByName("predictors_buffer");
countMotionVectorBufferArgNum = this->vmeKernel->getKernelInfo().getArgNumByName("count_motion_vector_buffer");
skipMotionVectorBufferArgNum = this->vmeKernel->getKernelInfo().getArgNumByName("skip_motion_vector_buffer");
intraSearchPredictorModesArgNum = this->vmeKernel->getKernelInfo().getArgNumByName("intra_search_predictor_modes");
skipResidualsArgNum = this->vmeKernel->getKernelInfo().getArgNumByName("skip_residuals");
intraResidualsArgNum = this->vmeKernel->getKernelInfo().getArgNumByName("intra_residuals");
}
bool setExplicitArg(uint32_t argIndex, size_t argSize, const void *argVal, cl_int &err) const override {
DEBUG_BREAK_IF(argIndex == intraSrcImgArgNum);
if (argIndex == this->srcImgArgNum) {
// rebind also as media block image
this->vmeKernel->setArg(intraSrcImgArgNum, argSize, argVal);
}
return VmeBuiltinDispatchInfoBuilder::setExplicitArg(argIndex, argSize, argVal, err);
}
virtual bool isBidirKernel() const {
return false;
}
bool validateFlags(uint32_t &outSkipBlockType) const {
uint32_t flagsVal = VmeBuiltinDispatchInfoBuilder::template getKernelArgByValValue<uint32_t>(flagsArgNum);
if ((flagsVal & CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL) == CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL) {
return false;
}
if (flagsVal == CL_ME_SKIP_BLOCK_TYPE_16x16_INTEL) {
outSkipBlockType = CL_ME_MB_TYPE_16x16_INTEL;
} else if ((flagsVal & CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL) == CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL) {
outSkipBlockType = CL_ME_MB_TYPE_8x8_INTEL;
}
return true;
}
bool validateSkipBlockTypeArg(uint32_t &outSkipBlockType) const {
if (skipBlockTypeArgNum == -1) {
return true;
}
outSkipBlockType = VmeBuiltinDispatchInfoBuilder::template getKernelArgByValValue<uint32_t>(static_cast<uint32_t>(skipBlockTypeArgNum));
switch (outSkipBlockType) {
case CL_ME_MB_TYPE_16x16_INTEL:
break;
case CL_ME_MB_TYPE_8x8_INTEL:
break;
default:
return false;
}
return true;
}
size_t getIntraSearchPredictorModesBuffExpSize(size_t blkNum) const {
// vector size is 22 = 1 (16x16 luma block) + 4 (8x8 luma blocks) + 16 (4x4 luma blocks) + 1 (8x8 chroma block)
int vectorSize = 22;
size_t intraSearchPredictorModesBuffExpSize = blkNum * vectorSize;
return intraSearchPredictorModesBuffExpSize;
}
size_t getSkipMotionVectorBufferExpSize(uint32_t skipBlockType, size_t blkNum) const {
// vector size is either 1 (16x16 block) or 4 (8x8 block)
// 0 to 8 skip MVs per MB
// may be null if all MBs in frame have 0 skip check MVs in which case VME skip checks are not performed
// layout assumes 4 (for bidir) or 8 (otherwise) skip check MVs per MB
// row-major block layout; all MVs for a block are contiguous
// buffer size depends on the block and frame size.
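// worked example (illustration): 8x8 blocks in a non-bidir kernel give
// vectorSize = 4 and numChecks = 8, i.e. 8 * 4 MVs * 4 bytes = 128 bytes per MB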
int vectorSize = (skipBlockType == CL_ME_MB_TYPE_16x16_INTEL) ? 1 : 4;
int numChecks = (isBidirKernel() ? 4 : 8);
size_t skipMotionVectorBufferExpSize = blkNum * numChecks * vectorSize * 2 * sizeof(cl_short);
return skipMotionVectorBufferExpSize;
}
size_t getSkipResidualsBuffExpSize(uint32_t skipBlockType, size_t blkNum) const {
/* output buffer of vectors of unsigned short SAD adjusted values corresponding to the input skip check MVs
may be null if skip_motion_vector_buffer is null
vector size is either 1 (16x16 block) or 4 (8x8 block)
0 to 8 skip check residuals per MB
layout always assumes 8 skip check residuals per MB
row major block layout; all MVs for a block are contiguous
buffer size depends on the block and frame size */
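// worked example (illustration): 16x16 blocks give vectorSize = 1 and a
// non-bidir kernel uses numChecks = 8, i.e. 8 ushorts = 16 bytes per MB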
int vectorSize = 1;
switch (skipBlockType) {
case CL_ME_MB_TYPE_16x16_INTEL:
vectorSize = 1;
break;
case CL_ME_MB_TYPE_8x8_INTEL:
vectorSize = 4;
break;
default:
break;
};
int numChecks = (isBidirKernel() ? 4 : 8);
size_t skipResidualsBuffExpSize = blkNum * vectorSize * numChecks * sizeof(cl_ushort);
return skipResidualsBuffExpSize;
}
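// Worked example (illustrative): for the same hypothetical 396-MB frame and
// skipBlockType == CL_ME_MB_TYPE_8x8_INTEL, the expected size is
// 396 * 4 * 8 * sizeof(cl_ushort) = 25344 bytes.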
size_t getIntraResidualsBuffExpSize(size_t blkNum) const {
/* output buffer of vectors of unsigned short SAD adjusted values
may be null, in which case the corresponding intra residuals are not returned
vector size is 4: 1 (16x16 luma block) + 1 (8x8 luma block) + 1 (4x4 luma block) + 1 (8x8 chroma block)
1 vector per MB
buffer size depends on the frame size */
int vectorSize = 4;
size_t intraResidualsBuffExpSize = (blkNum * sizeof(cl_ushort) * vectorSize);
return intraResidualsBuffExpSize;
}
size_t getPredictorsBufferExpSize(size_t blkNum) const {
size_t numPredictors = 8;
size_t predictorsBufferExpSize = (blkNum * numPredictors * 2 * sizeof(cl_short));
return predictorsBufferExpSize;
}
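// Worked example (illustrative): with blkNum = 396, the predictors buffer is
// expected to hold 396 * 8 * 2 * sizeof(cl_short) = 12672 bytes
// (8 predictor MVs per MB, 2 cl_short components per MV).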
cl_int validateVmeDispatch(const Vec3<size_t> &inputRegion, const Vec3<size_t> &offset, size_t blkNum, size_t blkMul) const override {
cl_int basicVmeValidationStatus = VmeBuiltinDispatchInfoBuilder::validateVmeDispatch(inputRegion, offset, blkNum, blkMul);
if (basicVmeValidationStatus != CL_SUCCESS) {
return basicVmeValidationStatus;
}
uint32_t skipBlockType = CL_ME_MB_TYPE_16x16_INTEL;
if (false == validateFlags(skipBlockType)) {
return CL_INVALID_KERNEL_ARGS;
}
if (false == validateSkipBlockTypeArg(skipBlockType)) {
return CL_OUT_OF_RESOURCES;
}
if (false == VmeBuiltinDispatchInfoBuilder::template validateEnumArg<uint32_t>(searchCostPenaltyArgNum, CL_ME_COST_PENALTY_NONE_INTEL, CL_ME_COST_PENALTY_LOW_INTEL, CL_ME_COST_PENALTY_NORMAL_INTEL,
CL_ME_COST_PENALTY_HIGH_INTEL)) {
return CL_OUT_OF_RESOURCES;
}
if (false == VmeBuiltinDispatchInfoBuilder::template validateEnumArg<uint32_t>(searchCostPrecisionArgNum, CL_ME_COST_PRECISION_QPEL_INTEL, CL_ME_COST_PRECISION_HPEL_INTEL, CL_ME_COST_PRECISION_PEL_INTEL,
CL_ME_COST_PRECISION_DPEL_INTEL)) {
return CL_OUT_OF_RESOURCES;
}
if (false == VmeBuiltinDispatchInfoBuilder::template validateEnumArg<uint8_t>(bidirWeightArgNum, 0, CL_ME_BIDIR_WEIGHT_QUARTER_INTEL, CL_ME_BIDIR_WEIGHT_THIRD_INTEL, CL_ME_BIDIR_WEIGHT_HALF_INTEL,
CL_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL, CL_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL)) {
return CL_INVALID_KERNEL_ARGS;
}
std::pair<int32_t, size_t> bufferRequirements[] = {
std::make_pair(countMotionVectorBufferArgNum, (blkNum * 2 * sizeof(cl_short))),
std::make_pair(skipMotionVectorBufferArgNum, getSkipMotionVectorBufferExpSize(skipBlockType, blkNum)),
std::make_pair(intraSearchPredictorModesArgNum, getIntraSearchPredictorModesBuffExpSize(blkNum)),
std::make_pair(skipResidualsArgNum, getSkipResidualsBuffExpSize(skipBlockType, blkNum)),
std::make_pair(intraResidualsArgNum, getIntraResidualsBuffExpSize(blkNum)),
std::make_pair(predictorsBufferArgNum, getPredictorsBufferExpSize(blkNum))};
for (const auto &req : bufferRequirements) {
if (false == this->validateBufferSize(req.first, req.second)) {
return CL_INVALID_BUFFER_SIZE;
}
}
return CL_SUCCESS;
}
protected:
uint32_t flagsArgNum;
int32_t skipBlockTypeArgNum;
uint32_t searchCostPenaltyArgNum;
uint32_t searchCostPrecisionArgNum;
int32_t bidirWeightArgNum;
int32_t predictorsBufferArgNum;
uint32_t countMotionVectorBufferArgNum;
uint32_t skipMotionVectorBufferArgNum;
uint32_t intraSearchPredictorModesArgNum;
uint32_t skipResidualsArgNum;
uint32_t intraResidualsArgNum;
uint32_t intraSrcImgArgNum;
};
template <>
class BuiltInOp<EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel> : public AdvancedVmeBuiltinDispatchInfoBuilder {
public:
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device)
: AdvancedVmeBuiltinDispatchInfoBuilder(kernelsLib, device, EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel,
"block_advanced_motion_estimate_check_intel") {
}
cl_int validateVmeDispatch(const Vec3<size_t> &inputRegion, const Vec3<size_t> &offset,
size_t gwWidthInBlk, size_t gwHeightInBlk) const override {
cl_int basicAdvVmeValidationStatus = AdvancedVmeBuiltinDispatchInfoBuilder::validateVmeDispatch(inputRegion, offset, gwWidthInBlk, gwHeightInBlk);
if (basicAdvVmeValidationStatus != CL_SUCCESS) {
return basicAdvVmeValidationStatus;
}
auto countMotionVectorBuff = castToObject<Buffer>((cl_mem)this->vmeKernel->getKernelArg(this->countMotionVectorBufferArgNum));
if (countMotionVectorBuff == nullptr) {
return CL_INVALID_BUFFER_SIZE;
}
return CL_SUCCESS;
}
};
template <>
class BuiltInOp<EBuiltInOps::vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel> : public AdvancedVmeBuiltinDispatchInfoBuilder {
public:
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device)
: AdvancedVmeBuiltinDispatchInfoBuilder(kernelsLib, device, EBuiltInOps::vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel,
"block_advanced_motion_estimate_bidirectional_check_intel") {
}
bool isBidirKernel() const override {
return true;
}
};
} // namespace NEO

View File

@ -59,7 +59,6 @@ set(NEO_IGDRCL_TESTS__TARGET_OBJECTS
$<TARGET_OBJECTS:mock_gmm>
$<TARGET_OBJECTS:${SHARINGS_ENABLE_LIB_NAME}>
$<TARGET_OBJECTS:${BUILTINS_SOURCES_LIB_NAME}>
$<TARGET_OBJECTS:${BUILTINS_VME_LIB_NAME}>
$<TARGET_OBJECTS:${BUILTINS_BINARIES_STATELESS_LIB_NAME}>
$<TARGET_OBJECTS:${BUILTINS_BINARIES_STATELESS_HEAPLESS_LIB_NAME}>
$<TARGET_OBJECTS:${BUILTINS_BINARIES_BINDFUL_LIB_NAME}>
@ -311,12 +310,6 @@ set(TEST_KERNEL_STATELESS
test_files/stateless_kernel.cl
)
set(TEST_KERNEL_VME
${CMAKE_CURRENT_SOURCE_DIR}/test_files/vme_kernels.cl
${CMAKE_CURRENT_SOURCE_DIR}/test_files/media_kernels_backend.cl
${CMAKE_CURRENT_SOURCE_DIR}/test_files/media_kernels_frontend.cl
)
set(TEST_KERNEL_BINDLESS_internal_options
"-cl-intel-use-bindless-mode -cl-intel-use-bindless-advanced-mode"
)
@ -336,12 +329,10 @@ set(TEST_KERNEL_PRINTF_internal_options_gen9lp
file(GLOB_RECURSE TEST_KERNELS test_files/*.cl)
list(REMOVE_ITEM TEST_KERNELS "${CMAKE_CURRENT_SOURCE_DIR}/test_files/simple_nonuniform.cl")
list(REMOVE_ITEM TEST_KERNELS "${CMAKE_CURRENT_SOURCE_DIR}/test_files/stateless_kernel.cl")
list(REMOVE_ITEM TEST_KERNELS ${TEST_KERNEL_VME})
list(REMOVE_ITEM TEST_KERNELS "${CMAKE_CURRENT_SOURCE_DIR}/${TEST_KERNEL_PRINTF}")
macro(macro_for_each_platform)
set(PLATFORM_2_0_LOWER ${DEFAULT_SUPPORTED_2_0_${CORE_TYPE}_${PLATFORM_IT}_PLATFORM})
set(PLATFORM_VME_LOWER ${DEFAULT_SUPPORTED_VME_${CORE_TYPE}_${PLATFORM_IT}_PLATFORM})
set(PLATFORM_TEST_KERNELS ${TEST_KERNELS})
set(IMAGE_SUPPORT FALSE)
@ -398,12 +389,6 @@ macro(macro_for_each_platform)
neo_gen_kernels_with_options(${PLATFORM_IT_LOWER} ${DEVICE_ID} ${REVISION_ID} "${TEST_KERNEL_2_0}" ${TEST_KERNEL_2_0_options})
endforeach()
endif()
if(PLATFORM_VME_LOWER)
foreach(REVISION_CONFIG ${${PLATFORM_IT}_${CORE_TYPE}_REVISIONS})
parse_revision_config(${REVISION_CONFIG} ${PLATFORM_IT_LOWER} DEVICE_ID REVISION_ID)
neo_gen_kernels(${PLATFORM_IT_LOWER} ${DEVICE_ID} ${REVISION_ID} TRUE ${TEST_KERNEL_VME})
endforeach()
endif()
endif()
set(PREVIOUS_TARGET)

View File

@ -1,140 +1,26 @@
/*
* Copyright (C) 2018-2023 Intel Corporation
* Copyright (C) 2018-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/built_ins/built_ins.h"
#include "shared/source/compiler_interface/compiler_interface.h"
#include "shared/source/device/device.h"
#include "opencl/source/built_ins/built_in_ops_vme.h"
#include "opencl/source/built_ins/vme_builtin.h"
#include "opencl/source/context/context.h"
#include "opencl/source/helpers/base_object.h"
#include "opencl/source/kernel/kernel.h"
#include "opencl/source/program/program.h"
#include "opencl/test/unit_test/fixtures/run_kernel_fixture.h"
#include "opencl/test/unit_test/mocks/mock_cl_device.h"
#include "cl_api_tests.h"
using namespace NEO;
using ClCreateProgramWithBuiltInKernelsTests = ApiTests;
struct ClCreateProgramWithBuiltInVmeKernelsTests : ClCreateProgramWithBuiltInKernelsTests {
void SetUp() override {
ClCreateProgramWithBuiltInKernelsTests::SetUp();
if (!castToObject<ClDevice>(testedClDevice)->getHardwareInfo().capabilityTable.supportsVme) {
GTEST_SKIP();
}
pClDevice = pContext->getDevice(0);
}
ClDevice *pClDevice;
};
namespace ULT {
TEST_F(ClCreateProgramWithBuiltInKernelsTests, GivenInvalidContextWhenCreatingProgramWithBuiltInKernelsThenInvalidContextErrorIsReturned) {
TEST_F(ClCreateProgramWithBuiltInKernelsTests, GivenMediaKernelsWhenCreatingProgramWithBuiltInKernelsThenProgramIsNotCreated) {
cl_int retVal = CL_SUCCESS;
auto program = clCreateProgramWithBuiltInKernels(
nullptr, // context
1, // num_devices
nullptr, // device_list
nullptr, // kernel_names
&retVal);
EXPECT_EQ(nullptr, program);
EXPECT_EQ(CL_INVALID_CONTEXT, retVal);
}
TEST_F(ClCreateProgramWithBuiltInKernelsTests, GivenNoKernelsWhenCreatingProgramWithBuiltInKernelsThenInvalidValueErrorIsReturned) {
cl_int retVal = CL_SUCCESS;
auto program = clCreateProgramWithBuiltInKernels(
pContext, // context
1, // num_devices
&testedClDevice, // device_list
"", // kernel_names
&retVal);
EXPECT_EQ(nullptr, program);
EXPECT_EQ(CL_INVALID_VALUE, retVal);
}
TEST_F(ClCreateProgramWithBuiltInKernelsTests, GivenNoDeviceWhenCreatingProgramWithBuiltInKernelsThenInvalidValueErrorIsReturned) {
cl_int retVal = CL_SUCCESS;
auto program = clCreateProgramWithBuiltInKernels(
pContext, // context
0, // num_devices
&testedClDevice, // device_list
"", // kernel_names
&retVal);
EXPECT_EQ(nullptr, program);
EXPECT_EQ(CL_INVALID_VALUE, retVal);
}
TEST_F(ClCreateProgramWithBuiltInKernelsTests, GivenNoKernelsAndNoReturnWhenCreatingProgramWithBuiltInKernelsThenProgramIsNotCreated) {
auto program = clCreateProgramWithBuiltInKernels(
pContext, // context
1, // num_devices
&testedClDevice, // device_list
"", // kernel_names
nullptr);
EXPECT_EQ(nullptr, program);
}
TEST_F(ClCreateProgramWithBuiltInVmeKernelsTests, GivenDeviceNotAssociatedWithContextWhenCreatingProgramWithBuiltInThenInvalidDeviceErrorIsReturned) {
cl_program pProgram = nullptr;
const char *kernelNamesString = {
"block_advanced_motion_estimate_bidirectional_check_intel;"
"block_motion_estimate_intel;"
"block_advanced_motion_estimate_check_intel;"};
MockClDevice invalidDevice(new MockDevice());
cl_device_id devicesForProgram[] = {&invalidDevice};
pProgram = clCreateProgramWithBuiltInKernels(
pContext,
1,
devicesForProgram,
kernelNamesString,
&retVal);
EXPECT_EQ(CL_INVALID_DEVICE, retVal);
EXPECT_EQ(nullptr, pProgram);
retVal = CL_INVALID_PROGRAM;
devicesForProgram[0] = nullptr;
pProgram = clCreateProgramWithBuiltInKernels(
pContext,
1,
devicesForProgram,
kernelNamesString,
&retVal);
EXPECT_EQ(CL_INVALID_DEVICE, retVal);
EXPECT_EQ(nullptr, pProgram);
}
TEST_F(ClCreateProgramWithBuiltInVmeKernelsTests, GivenValidMediaKernelsWhenCreatingProgramWithBuiltInKernelsThenProgramIsSuccessfullyCreated) {
cl_int retVal = CL_SUCCESS;
overwriteBuiltInBinaryName("media_kernels_frontend");
const char *kernelNamesString = {
"block_advanced_motion_estimate_bidirectional_check_intel;"
"block_motion_estimate_intel;"
"block_advanced_motion_estimate_check_intel;"};
const char *kernelNames[] = {
"block_motion_estimate_intel",
"block_advanced_motion_estimate_check_intel",
"block_advanced_motion_estimate_bidirectional_check_intel",
};
cl_program program = clCreateProgramWithBuiltInKernels(
pContext, // context
1, // num_devices
@ -142,160 +28,7 @@ TEST_F(ClCreateProgramWithBuiltInVmeKernelsTests, GivenValidMediaKernelsWhenCrea
kernelNamesString, // kernel_names
&retVal);
restoreBuiltInBinaryName();
EXPECT_NE(nullptr, program);
EXPECT_EQ(CL_SUCCESS, retVal);
for (auto &kernelName : kernelNames) {
cl_kernel kernel = clCreateKernel(
program,
kernelName,
&retVal);
ASSERT_EQ(CL_SUCCESS, retVal);
ASSERT_NE(nullptr, kernel);
retVal = clReleaseKernel(kernel);
EXPECT_EQ(CL_SUCCESS, retVal);
}
retVal = clReleaseProgram(program);
EXPECT_EQ(CL_SUCCESS, retVal);
}
TEST_F(ClCreateProgramWithBuiltInVmeKernelsTests, GivenValidMediaKernelsWithOptionsWhenCreatingProgramWithBuiltInKernelsThenProgramIsSuccessfullyCreatedWithThoseOptions) {
cl_int retVal = CL_SUCCESS;
overwriteBuiltInBinaryName("media_kernels_frontend");
const char *kernelNamesString = {
"block_motion_estimate_intel;"};
cl_program program = clCreateProgramWithBuiltInKernels(
pContext, // context
1, // num_devices
&testedClDevice, // device_list
kernelNamesString, // kernel_names
&retVal);
restoreBuiltInBinaryName();
auto neoProgram = castToObject<Program>(program);
auto builtinOptions = neoProgram->getOptions();
auto it = builtinOptions.find("HW_NULL_CHECK");
EXPECT_EQ(std::string::npos, it);
clReleaseProgram(program);
}
TEST_F(ClCreateProgramWithBuiltInVmeKernelsTests, GivenVmeBlockMotionEstimateKernelWhenCreatingProgramWithBuiltInKernelsThenCorrectDispatchBuilderAndFrontendKernelIsCreated) {
cl_int retVal = CL_SUCCESS;
overwriteBuiltInBinaryName("media_kernels_backend");
Vme::getBuiltinDispatchInfoBuilder(EBuiltInOps::vmeBlockMotionEstimateIntel, *pClDevice);
restoreBuiltInBinaryName();
overwriteBuiltInBinaryName("media_kernels_frontend");
const char *kernelNamesString = {
"block_motion_estimate_intel;"};
cl_program program = clCreateProgramWithBuiltInKernels(
pContext, // context
1, // num_devices
&testedClDevice, // device_list
kernelNamesString, // kernel_names
&retVal);
restoreBuiltInBinaryName();
cl_kernel kernel = clCreateKernel(
program,
"block_motion_estimate_intel",
&retVal);
auto pMultiDeviceKernel = castToObject<MultiDeviceKernel>(kernel);
auto kernNeo = pMultiDeviceKernel->getKernel(testedRootDeviceIndex);
EXPECT_NE(nullptr, kernNeo->getKernelInfo().builtinDispatchBuilder);
EXPECT_EQ(6U, kernNeo->getKernelArgsNumber());
auto &vmeBuilder = Vme::getBuiltinDispatchInfoBuilder(EBuiltInOps::vmeBlockMotionEstimateIntel, *pClDevice);
EXPECT_EQ(&vmeBuilder, kernNeo->getKernelInfo().builtinDispatchBuilder);
clReleaseKernel(kernel);
clReleaseProgram(program);
}
TEST_F(ClCreateProgramWithBuiltInVmeKernelsTests, GivenVmeBlockAdvancedMotionEstimateKernelWhenCreatingProgramWithBuiltInKernelsThenCorrectDispatchBuilderAndFrontendKernelIsCreated) {
cl_int retVal = CL_SUCCESS;
overwriteBuiltInBinaryName("media_kernels_backend");
Vme::getBuiltinDispatchInfoBuilder(EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel, *pClDevice);
restoreBuiltInBinaryName();
overwriteBuiltInBinaryName("media_kernels_frontend");
const char *kernelNamesString = {
"block_advanced_motion_estimate_check_intel;"};
cl_program program = clCreateProgramWithBuiltInKernels(
pContext, // context
1, // num_devices
&testedClDevice, // device_list
kernelNamesString, // kernel_names
&retVal);
restoreBuiltInBinaryName();
cl_kernel kernel = clCreateKernel(
program,
"block_advanced_motion_estimate_check_intel",
&retVal);
auto pMultiDeviceKernel = castToObject<MultiDeviceKernel>(kernel);
auto kernNeo = pMultiDeviceKernel->getKernel(testedRootDeviceIndex);
EXPECT_NE(nullptr, kernNeo->getKernelInfo().builtinDispatchBuilder);
EXPECT_EQ(15U, kernNeo->getKernelArgsNumber());
auto &vmeBuilder = Vme::getBuiltinDispatchInfoBuilder(EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel, *pClDevice);
EXPECT_EQ(&vmeBuilder, kernNeo->getKernelInfo().builtinDispatchBuilder);
clReleaseKernel(kernel);
clReleaseProgram(program);
}
TEST_F(ClCreateProgramWithBuiltInVmeKernelsTests, GivenVmeBlockAdvancedMotionEstimateBidirectionalCheckKernelWhenCreatingProgramWithBuiltInKernelsThenCorrectDispatchBuilderAndFrontendKernelIsCreated) {
cl_int retVal = CL_SUCCESS;
overwriteBuiltInBinaryName("media_kernels_backend");
Vme::getBuiltinDispatchInfoBuilder(EBuiltInOps::vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel, *pClDevice);
restoreBuiltInBinaryName();
overwriteBuiltInBinaryName("media_kernels_frontend");
const char *kernelNamesString = {
"block_advanced_motion_estimate_bidirectional_check_intel;"};
cl_program program = clCreateProgramWithBuiltInKernels(
pContext, // context
1, // num_devices
&testedClDevice, // device_list
kernelNamesString, // kernel_names
&retVal);
restoreBuiltInBinaryName();
cl_kernel kernel = clCreateKernel(
program,
"block_advanced_motion_estimate_bidirectional_check_intel",
&retVal);
auto pMultiDeviceKernel = castToObject<MultiDeviceKernel>(kernel);
auto kernNeo = pMultiDeviceKernel->getKernel(testedRootDeviceIndex);
EXPECT_NE(nullptr, kernNeo->getKernelInfo().builtinDispatchBuilder);
EXPECT_EQ(20U, kernNeo->getKernelArgsNumber());
auto ctxNeo = castToObject<Context>(pContext);
auto &vmeBuilder = Vme::getBuiltinDispatchInfoBuilder(EBuiltInOps::vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel, *ctxNeo->getDevice(0));
EXPECT_EQ(&vmeBuilder, kernNeo->getKernelInfo().builtinDispatchBuilder);
clReleaseKernel(kernel);
clReleaseProgram(program);
EXPECT_EQ(nullptr, program);
EXPECT_EQ(CL_INVALID_VALUE, retVal);
}
} // namespace ULT

View File

@ -30,8 +30,6 @@
#include "opencl/source/accelerators/intel_motion_estimation.h"
#include "opencl/source/built_ins/aux_translation_builtin.h"
#include "opencl/source/built_ins/builtins_dispatch_builder.h"
#include "opencl/source/built_ins/vme_builtin.h"
#include "opencl/source/built_ins/vme_dispatch_builder.h"
#include "opencl/source/helpers/dispatch_info_builder.h"
#include "opencl/source/kernel/kernel.h"
#include "opencl/test/unit_test/built_ins/built_ins_file_names.h"
@ -122,15 +120,6 @@ class BuiltInTests
std::string allBuiltIns;
};
struct VmeBuiltInTests : BuiltInTests {
void SetUp() override {
BuiltInTests::SetUp();
if (!pDevice->getHardwareInfo().capabilityTable.supportsVme) {
GTEST_SKIP();
}
}
};
struct AuxBuiltInTests : BuiltInTests, public ::testing::WithParamInterface<KernelObjForAuxTranslation::Type> {
void SetUp() override {
BuiltInTests::SetUp();
@ -1350,167 +1339,6 @@ TEST_F(BuiltInTests, WhenSettingExplictArgThenTrueIsReturned) {
EXPECT_TRUE(ret);
}
TEST_F(VmeBuiltInTests, GivenVmeBuilderWhenGettingDispatchInfoThenValidPointerIsReturned) {
overwriteBuiltInBinaryName("media_kernels_backend");
EBuiltInOps::Type vmeOps[] = {EBuiltInOps::vmeBlockMotionEstimateIntel, EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel, EBuiltInOps::vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel};
for (auto op : vmeOps) {
BuiltinDispatchInfoBuilder &builder = Vme::getBuiltinDispatchInfoBuilder(op, *pClDevice);
EXPECT_NE(nullptr, &builder);
}
restoreBuiltInBinaryName();
}
TEST_F(VmeBuiltInTests, givenInvalidBuiltInOpWhenGetVmeBuilderInfoThenExceptionIsThrown) {
EXPECT_THROW(Vme::getBuiltinDispatchInfoBuilder(EBuiltInOps::count, *pClDevice), std::exception);
}
TEST_F(VmeBuiltInTests, GivenVmeBuilderAndInvalidParamsWhenGettingDispatchInfoThenEmptyKernelIsReturned) {
overwriteBuiltInBinaryName("media_kernels_backend");
EBuiltInOps::Type vmeOps[] = {EBuiltInOps::vmeBlockMotionEstimateIntel, EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel, EBuiltInOps::vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel};
for (auto op : vmeOps) {
BuiltinDispatchInfoBuilder &builder = Vme::getBuiltinDispatchInfoBuilder(op, *pClDevice);
MultiDispatchInfo outMdi;
Vec3<size_t> gws{352, 288, 0};
Vec3<size_t> elws{0, 0, 0};
Vec3<size_t> offset{0, 0, 0};
auto ret = builder.buildDispatchInfos(outMdi, nullptr, 0, gws, elws, offset);
EXPECT_FALSE(ret);
EXPECT_EQ(0U, outMdi.size());
}
restoreBuiltInBinaryName();
}
TEST_F(VmeBuiltInTests, GivenVmeBuilderWhenGettingDispatchInfoThenParamsAreCorrect) {
MockKernelWithInternals mockKernel{*pClDevice};
mockKernel.kernelInfo.kernelDescriptor.kernelAttributes.simdSize = 16;
mockKernel.kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[0] = 16;
mockKernel.kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[1] = 0;
mockKernel.kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[2] = 0;
overwriteBuiltInBinaryName("media_kernels_backend");
BuiltinDispatchInfoBuilder &builder = Vme::getBuiltinDispatchInfoBuilder(EBuiltInOps::vmeBlockMotionEstimateIntel, *pClDevice);
restoreBuiltInBinaryName();
MultiDispatchInfo outMdi;
Vec3<size_t> gws{352, 288, 0};
Vec3<size_t> elws{0, 0, 0};
Vec3<size_t> offset{16, 0, 0};
MockBuffer mb;
cl_mem bufferArg = static_cast<cl_mem>(&mb);
cl_int err;
constexpr uint32_t bufferArgNum = 3;
bool ret = builder.setExplicitArg(bufferArgNum, sizeof(cl_mem), &bufferArg, err);
EXPECT_FALSE(ret);
EXPECT_EQ(CL_SUCCESS, err);
ret = builder.buildDispatchInfos(outMdi, mockKernel.mockKernel, 0, gws, elws, offset);
EXPECT_TRUE(ret);
EXPECT_EQ(1U, outMdi.size());
auto outDi = outMdi.begin();
EXPECT_EQ(Vec3<size_t>(352, 1, 1), outDi->getGWS());
EXPECT_EQ(Vec3<size_t>(16, 1, 1), outDi->getEnqueuedWorkgroupSize());
EXPECT_EQ(Vec3<size_t>(16, 0, 0), outDi->getOffset());
EXPECT_NE(mockKernel.mockKernel, outDi->getKernel());
EXPECT_EQ(bufferArg, outDi->getKernel()->getKernelArg(bufferArgNum));
constexpr uint32_t vmeImplicitArgsBase = 6;
constexpr uint32_t vmeImplicitArgs = 3;
ASSERT_EQ(vmeImplicitArgsBase + vmeImplicitArgs, outDi->getKernel()->getKernelInfo().kernelDescriptor.payloadMappings.explicitArgs.size());
uint32_t vmeExtraArgsExpectedVals[] = {18, 22, 18}; // height, width, stride
for (uint32_t i = 0; i < vmeImplicitArgs; ++i) {
auto &argAsVal = outDi->getKernel()->getKernelInfo().getArgDescriptorAt(vmeImplicitArgsBase + i).as<ArgDescValue>();
EXPECT_EQ(vmeExtraArgsExpectedVals[i], *((uint32_t *)(outDi->getKernel()->getCrossThreadData() + argAsVal.elements[0].offset)));
}
}
TEST_F(VmeBuiltInTests, GivenAdvancedVmeBuilderWhenGettingDispatchInfoThenParamsAreCorrect) {
MockKernelWithInternals mockKernel{*pClDevice};
mockKernel.kernelInfo.kernelDescriptor.kernelAttributes.simdSize = 16;
mockKernel.kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[0] = 16;
mockKernel.kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[1] = 0;
mockKernel.kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[2] = 0;
Vec3<size_t> gws{352, 288, 0};
Vec3<size_t> elws{0, 0, 0};
Vec3<size_t> offset{0, 0, 0};
cl_int err;
constexpr uint32_t bufferArgNum = 7;
MockBuffer mb;
cl_mem bufferArg = static_cast<cl_mem>(&mb);
constexpr uint32_t srcImageArgNum = 1;
auto image = std::unique_ptr<Image>(Image2dHelper<>::create(pContext));
cl_mem srcImageArg = static_cast<cl_mem>(image.get());
EBuiltInOps::Type vmeOps[] = {EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel, EBuiltInOps::vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel};
for (auto op : vmeOps) {
MultiDispatchInfo outMdi;
overwriteBuiltInBinaryName("media_kernels_backend");
BuiltinDispatchInfoBuilder &builder = Vme::getBuiltinDispatchInfoBuilder(op, *pClDevice);
restoreBuiltInBinaryName();
bool ret = builder.setExplicitArg(srcImageArgNum, sizeof(cl_mem), &srcImageArg, err);
EXPECT_FALSE(ret);
EXPECT_EQ(CL_SUCCESS, err);
ret = builder.setExplicitArg(bufferArgNum, sizeof(cl_mem), &bufferArg, err);
EXPECT_FALSE(ret);
EXPECT_EQ(CL_SUCCESS, err);
ret = builder.buildDispatchInfos(outMdi, mockKernel.mockKernel, 0, gws, elws, offset);
EXPECT_TRUE(ret);
EXPECT_EQ(1U, outMdi.size());
auto outDi = outMdi.begin();
EXPECT_EQ(Vec3<size_t>(352, 1, 1), outDi->getGWS());
EXPECT_EQ(Vec3<size_t>(16, 1, 1), outDi->getEnqueuedWorkgroupSize());
EXPECT_NE(mockKernel.mockKernel, outDi->getKernel());
EXPECT_EQ(srcImageArg, outDi->getKernel()->getKernelArg(srcImageArgNum));
uint32_t vmeImplicitArgsBase = outDi->getKernel()->getKernelInfo().getArgNumByName("intraSrcImg");
uint32_t vmeImplicitArgs = 4;
ASSERT_EQ(vmeImplicitArgsBase + vmeImplicitArgs, outDi->getKernel()->getKernelInfo().getExplicitArgs().size());
EXPECT_EQ(srcImageArg, outDi->getKernel()->getKernelArg(vmeImplicitArgsBase));
++vmeImplicitArgsBase;
--vmeImplicitArgs;
uint32_t vmeExtraArgsExpectedVals[] = {18, 22, 18}; // height, width, stride
for (uint32_t i = 0; i < vmeImplicitArgs; ++i) {
auto &argAsVal = outDi->getKernel()->getKernelInfo().getArgDescriptorAt(vmeImplicitArgsBase + i).as<ArgDescValue>();
EXPECT_EQ(vmeExtraArgsExpectedVals[i], *((uint32_t *)(outDi->getKernel()->getCrossThreadData() + argAsVal.elements[0].offset)));
}
}
}
TEST_F(VmeBuiltInTests, WhenGettingBuiltinAsStringThenCorrectStringIsReturned) {
EXPECT_EQ(0, strcmp("aux_translation.builtin_kernel", getBuiltinAsString(EBuiltInOps::auxTranslation)));
EXPECT_EQ(0, strcmp("copy_buffer_to_buffer.builtin_kernel", getBuiltinAsString(EBuiltInOps::copyBufferToBuffer)));
EXPECT_EQ(0, strcmp("copy_buffer_rect.builtin_kernel", getBuiltinAsString(EBuiltInOps::copyBufferRect)));
EXPECT_EQ(0, strcmp("fill_buffer.builtin_kernel", getBuiltinAsString(EBuiltInOps::fillBuffer)));
EXPECT_EQ(0, strcmp("copy_buffer_to_image3d.builtin_kernel", getBuiltinAsString(EBuiltInOps::copyBufferToImage3d)));
EXPECT_EQ(0, strcmp("copy_image3d_to_buffer.builtin_kernel", getBuiltinAsString(EBuiltInOps::copyImage3dToBuffer)));
EXPECT_EQ(0, strcmp("copy_image_to_image1d.builtin_kernel", getBuiltinAsString(EBuiltInOps::copyImageToImage1d)));
EXPECT_EQ(0, strcmp("copy_image_to_image2d.builtin_kernel", getBuiltinAsString(EBuiltInOps::copyImageToImage2d)));
EXPECT_EQ(0, strcmp("copy_image_to_image3d.builtin_kernel", getBuiltinAsString(EBuiltInOps::copyImageToImage3d)));
EXPECT_EQ(0, strcmp("copy_kernel_timestamps.builtin_kernel", getBuiltinAsString(EBuiltInOps::queryKernelTimestamps)));
EXPECT_EQ(0, strcmp("fill_image1d.builtin_kernel", getBuiltinAsString(EBuiltInOps::fillImage1d)));
EXPECT_EQ(0, strcmp("fill_image2d.builtin_kernel", getBuiltinAsString(EBuiltInOps::fillImage2d)));
EXPECT_EQ(0, strcmp("fill_image3d.builtin_kernel", getBuiltinAsString(EBuiltInOps::fillImage3d)));
EXPECT_EQ(0, strcmp("vme_block_motion_estimate_intel.builtin_kernel", getBuiltinAsString(EBuiltInOps::vmeBlockMotionEstimateIntel)));
EXPECT_EQ(0, strcmp("vme_block_advanced_motion_estimate_check_intel.builtin_kernel", getBuiltinAsString(EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel)));
EXPECT_EQ(0, strcmp("vme_block_advanced_motion_estimate_bidirectional_check_intel", getBuiltinAsString(EBuiltInOps::vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel)));
EXPECT_EQ(0, strcmp("unknown", getBuiltinAsString(EBuiltInOps::count)));
}
TEST_F(BuiltInTests, GivenEncodeTypeWhenGettingExtensionThenCorrectStringIsReturned) {
EXPECT_EQ(0, strcmp("", BuiltinCode::getExtension(BuiltinCode::ECodeType::any)));
EXPECT_EQ(0, strcmp(".bin", BuiltinCode::getExtension(BuiltinCode::ECodeType::binary)));
@ -1670,9 +1498,6 @@ TEST_F(BuiltInTests, GivenBuiltinTypeSourceWhenGettingBuiltinResourceThenResourc
EXPECT_NE(0u, mockBuiltinsLib->getBuiltinResource(EBuiltInOps::fillImage1d, BuiltinCode::ECodeType::source, *pDevice).size());
EXPECT_NE(0u, mockBuiltinsLib->getBuiltinResource(EBuiltInOps::fillImage2d, BuiltinCode::ECodeType::source, *pDevice).size());
EXPECT_NE(0u, mockBuiltinsLib->getBuiltinResource(EBuiltInOps::fillImage3d, BuiltinCode::ECodeType::source, *pDevice).size());
EXPECT_NE(0u, mockBuiltinsLib->getBuiltinResource(EBuiltInOps::vmeBlockMotionEstimateIntel, BuiltinCode::ECodeType::source, *pDevice).size());
EXPECT_NE(0u, mockBuiltinsLib->getBuiltinResource(EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel, BuiltinCode::ECodeType::source, *pDevice).size());
EXPECT_NE(0u, mockBuiltinsLib->getBuiltinResource(EBuiltInOps::vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel, BuiltinCode::ECodeType::source, *pDevice).size());
EXPECT_EQ(0u, mockBuiltinsLib->getBuiltinResource(EBuiltInOps::count, BuiltinCode::ECodeType::source, *pDevice).size());
}
@ -1690,9 +1515,6 @@ HWCMDTEST_F(IGFX_GEN8_CORE, BuiltInTests, GivenBuiltinTypeBinaryWhenGettingBuilt
EXPECT_NE(0u, mockBuiltinsLib->getBuiltinResource(EBuiltInOps::fillImage1d, BuiltinCode::ECodeType::binary, *pDevice).size());
EXPECT_NE(0u, mockBuiltinsLib->getBuiltinResource(EBuiltInOps::fillImage2d, BuiltinCode::ECodeType::binary, *pDevice).size());
EXPECT_NE(0u, mockBuiltinsLib->getBuiltinResource(EBuiltInOps::fillImage3d, BuiltinCode::ECodeType::binary, *pDevice).size());
EXPECT_EQ(0u, mockBuiltinsLib->getBuiltinResource(EBuiltInOps::vmeBlockMotionEstimateIntel, BuiltinCode::ECodeType::binary, *pDevice).size());
EXPECT_EQ(0u, mockBuiltinsLib->getBuiltinResource(EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel, BuiltinCode::ECodeType::binary, *pDevice).size());
EXPECT_EQ(0u, mockBuiltinsLib->getBuiltinResource(EBuiltInOps::vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel, BuiltinCode::ECodeType::binary, *pDevice).size());
EXPECT_EQ(0u, mockBuiltinsLib->getBuiltinResource(EBuiltInOps::count, BuiltinCode::ECodeType::binary, *pDevice).size());
}
@ -1804,554 +1626,6 @@ TEST_F(BuiltInTests, GivenForce32bitWhenCreatingProgramThenCorrectKernelIsCreate
const_cast<DeviceInfo *>(&pDevice->getDeviceInfo())->force32BitAddressess = force32BitAddressess;
}
TEST_F(BuiltInTests, GivenVmeKernelWhenGettingDeviceInfoThenCorrectVmeVersionIsReturned) {
if (!pDevice->getHardwareInfo().capabilityTable.supportsVme) {
GTEST_SKIP();
}
cl_uint param;
auto ret = pClDevice->getDeviceInfo(CL_DEVICE_ME_VERSION_INTEL, sizeof(param), &param, nullptr);
EXPECT_EQ(CL_SUCCESS, ret);
EXPECT_EQ(static_cast<cl_uint>(CL_ME_VERSION_ADVANCED_VER_2_INTEL), param);
}
TEST_F(VmeBuiltInTests, WhenVmeKernelIsCreatedThenParamsAreCorrect) {
overwriteBuiltInBinaryName("media_kernels_backend");
BuiltInOp<EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel> vmeBuilder(*this->pBuiltIns, *pClDevice);
restoreBuiltInBinaryName();
cl_int err;
{
int32_t bufArgNum = 7;
cl_mem mem = 0;
vmeBuilder.setExplicitArg(bufArgNum, sizeof(cl_mem), &mem, err);
EXPECT_TRUE(vmeBuilder.validateBufferSize(-1, 16));
EXPECT_TRUE(vmeBuilder.validateBufferSize(bufArgNum, 16));
MockBuffer mb;
mem = &mb;
vmeBuilder.setExplicitArg(bufArgNum, sizeof(cl_mem), &mem, err);
EXPECT_TRUE(vmeBuilder.validateBufferSize(bufArgNum, mb.getSize()));
EXPECT_TRUE(vmeBuilder.validateBufferSize(bufArgNum, mb.getSize() / 2));
EXPECT_FALSE(vmeBuilder.validateBufferSize(bufArgNum, mb.getSize() * 2));
mem = 0;
vmeBuilder.setExplicitArg(bufArgNum, sizeof(cl_mem), &mem, err);
}
{
EXPECT_TRUE(vmeBuilder.validateEnumVal(1, 1, 2, 3, 4));
EXPECT_TRUE(vmeBuilder.validateEnumVal(1, 1));
EXPECT_TRUE(vmeBuilder.validateEnumVal(3, 1, 2, 3));
EXPECT_FALSE(vmeBuilder.validateEnumVal(1, 3, 4));
EXPECT_FALSE(vmeBuilder.validateEnumVal(1));
EXPECT_FALSE(vmeBuilder.validateEnumVal(1, 2));
int32_t valArgNum = 3;
uint32_t val = 7;
vmeBuilder.setExplicitArg(valArgNum, sizeof(val), &val, err);
EXPECT_FALSE(vmeBuilder.validateEnumArg<uint32_t>(valArgNum, 3));
EXPECT_TRUE(vmeBuilder.validateEnumArg<uint32_t>(valArgNum, 7));
val = 0;
vmeBuilder.setExplicitArg(valArgNum, sizeof(val), &val, err);
}
{
int32_t valArgNum = 3;
uint32_t val = 7;
vmeBuilder.setExplicitArg(valArgNum, sizeof(val), &val, err);
EXPECT_EQ(val, vmeBuilder.getKernelArgByValValue<uint32_t>(valArgNum));
val = 11;
vmeBuilder.setExplicitArg(valArgNum, sizeof(val), &val, err);
EXPECT_EQ(val, vmeBuilder.getKernelArgByValValue<uint32_t>(valArgNum));
val = 0;
vmeBuilder.setExplicitArg(valArgNum, sizeof(val), &val, err);
}
}
TEST_F(VmeBuiltInTests, WhenVmeKernelIsCreatedThenDispatchIsBidirectional) {
overwriteBuiltInBinaryName("media_kernels_backend");
BuiltInOp<EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel> avmeBuilder(*this->pBuiltIns, *pClDevice);
BuiltInOp<EBuiltInOps::vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel> avmeBidirBuilder(*this->pBuiltIns, *pClDevice);
restoreBuiltInBinaryName();
EXPECT_FALSE(avmeBuilder.isBidirKernel());
EXPECT_TRUE(avmeBidirBuilder.isBidirKernel());
}
struct ImageVmeValidFormat : Image2dDefaults {
static const cl_image_format imageFormat;
static const cl_image_desc imageDesc;
};
const cl_image_format ImageVmeValidFormat::imageFormat = {
CL_R,
CL_UNORM_INT8};
const cl_image_desc ImageVmeValidFormat::imageDesc = {
CL_MEM_OBJECT_IMAGE1D,
8192,
16,
1,
1,
0,
0,
0,
0,
{nullptr}};
struct ImageVmeInvalidDataType : Image2dDefaults {
static const cl_image_format imageFormat;
};
const cl_image_format ImageVmeInvalidDataType::imageFormat = {
CL_R,
CL_FLOAT};
struct ImageVmeInvalidChannelOrder : Image2dDefaults {
static const cl_image_format imageFormat;
};
const cl_image_format ImageVmeInvalidChannelOrder::imageFormat = {
CL_RGBA,
CL_UNORM_INT8};
TEST_F(VmeBuiltInTests, WhenValidatingImagesThenCorrectResponses) {
overwriteBuiltInBinaryName("media_kernels_backend");
BuiltInOp<EBuiltInOps::vmeBlockMotionEstimateIntel> vmeBuilder(*this->pBuiltIns, *pClDevice);
restoreBuiltInBinaryName();
uint32_t srcImgArgNum = 1;
uint32_t refImgArgNum = 2;
cl_int err;
{ // validate images are not null
std::unique_ptr<Image> image1(ImageHelper<ImageVmeValidFormat>::create(pContext));
cl_mem srcImgMem = 0;
EXPECT_EQ(CL_INVALID_KERNEL_ARGS, vmeBuilder.validateImages(Vec3<size_t>{3, 3, 0}, Vec3<size_t>{0, 0, 0}));
srcImgMem = image1.get();
vmeBuilder.setExplicitArg(srcImgArgNum, sizeof(srcImgMem), &srcImgMem, err);
EXPECT_EQ(CL_INVALID_KERNEL_ARGS, vmeBuilder.validateImages(Vec3<size_t>{3, 3, 0}, Vec3<size_t>{0, 0, 0}));
}
{ // validate image formats
std::unique_ptr<Image> imageValid(ImageHelper<ImageVmeValidFormat>::create(pContext));
std::unique_ptr<Image> imageInvalidDataType(ImageHelper<ImageVmeInvalidDataType>::create(pContext));
std::unique_ptr<Image> imageChannelOrder(ImageHelper<ImageVmeInvalidChannelOrder>::create(pContext));
Image *images[] = {imageValid.get(), imageInvalidDataType.get(), imageChannelOrder.get()};
for (Image *srcImg : images) {
for (Image *dstImg : images) {
cl_mem srcImgMem = srcImg;
cl_mem refImgMem = dstImg;
vmeBuilder.setExplicitArg(srcImgArgNum, sizeof(srcImgMem), &srcImgMem, err);
vmeBuilder.setExplicitArg(refImgArgNum, sizeof(refImgMem), &refImgMem, err);
bool shouldSucceed = (srcImg == imageValid.get()) && (dstImg == imageValid.get());
if (shouldSucceed) {
EXPECT_EQ(CL_SUCCESS, vmeBuilder.validateImages(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}));
} else {
EXPECT_EQ(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR, vmeBuilder.validateImages(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}));
}
}
}
}
{ // validate image tiling
std::unique_ptr<Image> imageValid(ImageHelper<ImageVmeValidFormat>::create(pContext));
DebugManagerStateRestore restorer;
debugManager.flags.ForceLinearImages.set(true);
std::unique_ptr<Image> imageLinear(ImageHelper<ImageVmeValidFormat>::create(pContext));
Image *images[] = {imageValid.get(), imageLinear.get()};
for (Image *srcImg : images) {
for (Image *dstImg : images) {
cl_mem srcImgMem = srcImg;
cl_mem refImgMem = dstImg;
vmeBuilder.setExplicitArg(srcImgArgNum, sizeof(srcImgMem), &srcImgMem, err);
vmeBuilder.setExplicitArg(refImgArgNum, sizeof(refImgMem), &refImgMem, err);
bool shouldSucceed = (srcImg == imageValid.get()) && (dstImg == imageValid.get());
if (shouldSucceed) {
EXPECT_EQ(CL_SUCCESS, vmeBuilder.validateImages(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}));
} else {
EXPECT_EQ(CL_OUT_OF_RESOURCES, vmeBuilder.validateImages(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}));
}
}
}
}
{ // validate region size
std::unique_ptr<Image> imageValid(ImageHelper<ImageVmeValidFormat>::create(pContext));
cl_mem imgValidMem = imageValid.get();
vmeBuilder.setExplicitArg(srcImgArgNum, sizeof(imgValidMem), &imgValidMem, err);
vmeBuilder.setExplicitArg(refImgArgNum, sizeof(imgValidMem), &imgValidMem, err);
EXPECT_EQ(CL_INVALID_IMAGE_SIZE, vmeBuilder.validateImages(Vec3<size_t>{imageValid->getImageDesc().image_width + 1, 1, 0}, Vec3<size_t>{0, 0, 0}));
EXPECT_EQ(CL_INVALID_IMAGE_SIZE, vmeBuilder.validateImages(Vec3<size_t>{1, imageValid->getImageDesc().image_height + 1, 0}, Vec3<size_t>{0, 0, 0}));
}
}
TEST_F(VmeBuiltInTests, WhenValidatingFlagsThenValidFlagCombinationsReturnTrue) {
overwriteBuiltInBinaryName("media_kernels_backend");
BuiltInOp<EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel> vmeBuilder(*this->pBuiltIns, *pClDevice);
restoreBuiltInBinaryName();
uint32_t defaultSkipBlockVal = 8192;
uint32_t flagsArgNum = 3;
std::tuple<uint32_t, bool, uint32_t> flagsToTest[] = {
std::make_tuple(CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL, false, defaultSkipBlockVal),
std::make_tuple(CL_ME_SKIP_BLOCK_TYPE_16x16_INTEL, true, CL_ME_MB_TYPE_16x16_INTEL),
std::make_tuple(CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL, true, CL_ME_MB_TYPE_8x8_INTEL),
std::make_tuple(defaultSkipBlockVal, true, defaultSkipBlockVal),
};
cl_int err;
for (auto &conf : flagsToTest) {
uint32_t skipBlock = defaultSkipBlockVal;
vmeBuilder.setExplicitArg(flagsArgNum, sizeof(uint32_t), &std::get<0>(conf), err);
bool validationResult = vmeBuilder.validateFlags(skipBlock);
if (std::get<1>(conf)) {
EXPECT_TRUE(validationResult);
} else {
EXPECT_FALSE(validationResult);
}
EXPECT_EQ(std::get<2>(conf), skipBlock);
}
}
TEST_F(VmeBuiltInTests, WhenValidatingSkipBlockTypeThenCorrectResponses) {
overwriteBuiltInBinaryName("media_kernels_backend");
BuiltInOp<EBuiltInOps::vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel> avmeBidirectionalBuilder(*this->pBuiltIns, *pClDevice);
BuiltInOp<EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel> avmeBuilder(*this->pBuiltIns, *pClDevice);
restoreBuiltInBinaryName();
cl_int err;
uint32_t skipBlockTypeArgNum = 4;
uint32_t skipBlockType = 8192;
bool ret = avmeBidirectionalBuilder.validateSkipBlockTypeArg(skipBlockType);
EXPECT_TRUE(ret);
EXPECT_EQ(8192U, skipBlockType);
skipBlockType = 8192U;
avmeBuilder.setExplicitArg(skipBlockTypeArgNum, sizeof(uint32_t), &skipBlockType, err);
ret = avmeBuilder.validateSkipBlockTypeArg(skipBlockType);
EXPECT_FALSE(ret);
skipBlockType = CL_ME_MB_TYPE_16x16_INTEL;
avmeBuilder.setExplicitArg(skipBlockTypeArgNum, sizeof(uint32_t), &skipBlockType, err);
skipBlockType = 8192U;
ret = avmeBuilder.validateSkipBlockTypeArg(skipBlockType);
EXPECT_TRUE(ret);
EXPECT_EQ(static_cast<uint32_t>(CL_ME_MB_TYPE_16x16_INTEL), skipBlockType);
skipBlockType = CL_ME_MB_TYPE_8x8_INTEL;
avmeBuilder.setExplicitArg(skipBlockTypeArgNum, sizeof(uint32_t), &skipBlockType, err);
skipBlockType = 8192U;
ret = avmeBuilder.validateSkipBlockTypeArg(skipBlockType);
EXPECT_TRUE(ret);
EXPECT_EQ(static_cast<uint32_t>(CL_ME_MB_TYPE_8x8_INTEL), skipBlockType);
}
TEST_F(VmeBuiltInTests, GivenAcceleratorWhenExplicitlySettingArgThenFalseIsReturned) {
overwriteBuiltInBinaryName("media_kernels_backend");
BuiltInOp<EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel> vmeBuilder(*this->pBuiltIns, *pClDevice);
restoreBuiltInBinaryName();
cl_int err;
uint32_t acceleratorArgNum = 0;
bool ret = vmeBuilder.setExplicitArg(acceleratorArgNum, sizeof(cl_accelerator_intel), nullptr, err);
EXPECT_FALSE(ret);
EXPECT_EQ(CL_INVALID_ACCELERATOR_INTEL, err);
cl_motion_estimation_desc_intel acceleratorDesc;
acceleratorDesc.subpixel_mode = CL_ME_SUBPIXEL_MODE_INTEGER_INTEL;
acceleratorDesc.sad_adjust_mode = CL_ME_SAD_ADJUST_MODE_NONE_INTEL;
acceleratorDesc.search_path_type = CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL;
acceleratorDesc.mb_block_type = CL_ME_MB_TYPE_16x16_INTEL;
auto neoAccelerator = std::unique_ptr<VmeAccelerator>(VmeAccelerator::create(pContext, CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL, sizeof(acceleratorDesc), &acceleratorDesc, err));
ASSERT_NE(nullptr, neoAccelerator.get());
cl_accelerator_intel clAccel = neoAccelerator.get();
ret = vmeBuilder.setExplicitArg(aceleratorArgNum, sizeof(cl_accelerator_intel), &clAccel, err);
EXPECT_FALSE(ret);
EXPECT_EQ(CL_SUCCESS, err);
}
TEST_F(VmeBuiltInTests, WhenValidatingDispatchThenCorrectReturns) {
overwriteBuiltInBinaryName("media_kernels_backend");
struct MockVmeBuilder : BuiltInOp<EBuiltInOps::vmeBlockMotionEstimateIntel> {
using BuiltInOp<EBuiltInOps::vmeBlockMotionEstimateIntel>::BuiltInOp;
cl_int validateVmeDispatch(const Vec3<size_t> &inputRegion, const Vec3<size_t> &offset, size_t blkNum, size_t blkMul) const override {
receivedInputRegion = inputRegion;
receivedOffset = offset;
receivedBlkNum = blkNum;
receivedBlkMul = blkMul;
wasValidateVmeDispatchCalled = true;
return valueToReturn;
}
mutable bool wasValidateVmeDispatchCalled = false;
mutable Vec3<size_t> receivedInputRegion = {0, 0, 0};
mutable Vec3<size_t> receivedOffset = {0, 0, 0};
mutable size_t receivedBlkNum = 0;
mutable size_t receivedBlkMul = 0;
mutable cl_int valueToReturn = CL_SUCCESS;
};
uint32_t acceleratorArgNum = 0;
MockVmeBuilder vmeBuilder(*this->pBuiltIns, *pClDevice);
restoreBuiltInBinaryName();
cl_int ret = vmeBuilder.validateDispatch(nullptr, 1, Vec3<size_t>{16, 16, 0}, Vec3<size_t>{16, 1, 0}, Vec3<size_t>{0, 0, 0});
EXPECT_EQ(CL_INVALID_WORK_DIMENSION, ret);
ret = vmeBuilder.validateDispatch(nullptr, 3, Vec3<size_t>{16, 16, 0}, Vec3<size_t>{16, 1, 0}, Vec3<size_t>{0, 0, 0});
EXPECT_EQ(CL_INVALID_WORK_DIMENSION, ret);
ret = vmeBuilder.validateDispatch(nullptr, 2, Vec3<size_t>{16, 16, 0}, Vec3<size_t>{16, 1, 0}, Vec3<size_t>{0, 0, 0});
EXPECT_EQ(CL_INVALID_KERNEL_ARGS, ret); // accelerator not set
EXPECT_FALSE(vmeBuilder.wasValidateVmeDispatchCalled);
cl_int err;
cl_motion_estimation_desc_intel acceleratorDesc;
acceleratorDesc.subpixel_mode = CL_ME_SUBPIXEL_MODE_INTEGER_INTEL;
acceleratorDesc.sad_adjust_mode = CL_ME_SAD_ADJUST_MODE_NONE_INTEL;
acceleratorDesc.search_path_type = CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL;
Vec3<size_t> gws{16, 16, 0};
Vec3<size_t> lws{16, 1, 0};
Vec3<size_t> off{0, 0, 0};
size_t gwWidthInBlk = 0;
size_t gwHeightInBlk = 0;
vmeBuilder.getBlkTraits(gws, gwWidthInBlk, gwHeightInBlk);
{
acceleratorDesc.mb_block_type = CL_ME_MB_TYPE_16x16_INTEL;
auto neoAccelerator = std::unique_ptr<VmeAccelerator>(VmeAccelerator::create(pContext, CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL, sizeof(acceleratorDesc), &acceleratorDesc, err));
ASSERT_NE(nullptr, neoAccelerator.get());
cl_accelerator_intel clAccel = neoAccelerator.get();
vmeBuilder.setExplicitArg(acceleratorArgNum, sizeof(clAccel), &clAccel, err);
vmeBuilder.wasValidateVmeDispatchCalled = false;
auto ret = vmeBuilder.validateDispatch(nullptr, 2, gws, lws, off);
EXPECT_EQ(CL_SUCCESS, ret);
EXPECT_TRUE(vmeBuilder.wasValidateVmeDispatchCalled);
EXPECT_EQ(gws, vmeBuilder.receivedInputRegion);
EXPECT_EQ(off, vmeBuilder.receivedOffset);
EXPECT_EQ(gwWidthInBlk * gwHeightInBlk, vmeBuilder.receivedBlkNum);
EXPECT_EQ(1U, vmeBuilder.receivedBlkMul);
}
{
acceleratorDesc.mb_block_type = CL_ME_MB_TYPE_4x4_INTEL;
auto neoAccelerator = std::unique_ptr<VmeAccelerator>(VmeAccelerator::create(pContext, CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL, sizeof(acceleratorDesc), &acceleratorDesc, err));
ASSERT_NE(nullptr, neoAccelerator.get());
cl_accelerator_intel clAccel = neoAccelerator.get();
vmeBuilder.setExplicitArg(acceleratorArgNum, sizeof(clAccel), &clAccel, err);
vmeBuilder.wasValidateVmeDispatchCalled = false;
auto ret = vmeBuilder.validateDispatch(nullptr, 2, gws, lws, off);
EXPECT_EQ(CL_SUCCESS, ret);
EXPECT_TRUE(vmeBuilder.wasValidateVmeDispatchCalled);
EXPECT_EQ(gws, vmeBuilder.receivedInputRegion);
EXPECT_EQ(off, vmeBuilder.receivedOffset);
EXPECT_EQ(gwWidthInBlk * gwHeightInBlk, vmeBuilder.receivedBlkNum);
EXPECT_EQ(16U, vmeBuilder.receivedBlkMul);
}
{
acceleratorDesc.mb_block_type = CL_ME_MB_TYPE_8x8_INTEL;
auto neoAccelerator = std::unique_ptr<VmeAccelerator>(VmeAccelerator::create(pContext, CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL, sizeof(acceleratorDesc), &acceleratorDesc, err));
ASSERT_NE(nullptr, neoAccelerator.get());
cl_accelerator_intel clAccel = neoAccelerator.get();
vmeBuilder.setExplicitArg(acceleratorArgNum, sizeof(clAccel), &clAccel, err);
vmeBuilder.wasValidateVmeDispatchCalled = false;
vmeBuilder.valueToReturn = 37;
auto ret = vmeBuilder.validateDispatch(nullptr, 2, gws, lws, off);
EXPECT_EQ(37, ret);
EXPECT_TRUE(vmeBuilder.wasValidateVmeDispatchCalled);
EXPECT_EQ(gws, vmeBuilder.receivedInputRegion);
EXPECT_EQ(off, vmeBuilder.receivedOffset);
EXPECT_EQ(gwWidthInBlk * gwHeightInBlk, vmeBuilder.receivedBlkNum);
EXPECT_EQ(4U, vmeBuilder.receivedBlkMul);
}
}
TEST_F(VmeBuiltInTests, WhenValidatingVmeDispatchThenCorrectReturns) {
overwriteBuiltInBinaryName("media_kernels_backend");
BuiltInOp<EBuiltInOps::vmeBlockMotionEstimateIntel> vmeBuilder(*this->pBuiltIns, *pClDevice);
restoreBuiltInBinaryName();
cl_int err;
// images not set
EXPECT_EQ(CL_INVALID_KERNEL_ARGS, vmeBuilder.validateVmeDispatch(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}, 64, 1));
uint32_t srcImgArgNum = 1;
uint32_t refImgArgNum = 2;
std::unique_ptr<Image> imageValid(ImageHelper<ImageVmeValidFormat>::create(pContext));
cl_mem srcImgMem = imageValid.get();
vmeBuilder.setExplicitArg(srcImgArgNum, sizeof(srcImgMem), &srcImgMem, err);
vmeBuilder.setExplicitArg(refImgArgNum, sizeof(srcImgMem), &srcImgMem, err);
// null buffers are valid
EXPECT_EQ(CL_SUCCESS, vmeBuilder.validateVmeDispatch(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}, 64, 1));
// too small buffers should fail
MockBuffer mb;
cl_mem mem = &mb;
uint32_t predictionMotionVectorBufferArgNum = 3;
uint32_t motionVectorBufferArgNum = 4;
uint32_t residualsBufferArgNum = 5;
for (uint32_t argNum : {predictionMotionVectorBufferArgNum, motionVectorBufferArgNum, residualsBufferArgNum}) {
EXPECT_EQ(CL_SUCCESS, vmeBuilder.validateVmeDispatch(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}, mb.getSize() * 2, 1));
vmeBuilder.setExplicitArg(argNum, sizeof(cl_mem), &mem, err);
EXPECT_EQ(CL_INVALID_BUFFER_SIZE, vmeBuilder.validateVmeDispatch(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}, mb.getSize() * 2, 1));
vmeBuilder.setExplicitArg(argNum, sizeof(cl_mem), nullptr, err);
}
}
TEST_F(VmeBuiltInTests, GivenAdvancedVmeWhenValidatingVmeDispatchThenCorrectReturns) {
overwriteBuiltInBinaryName("media_kernels_backend");
BuiltInOp<EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel> avmeBuilder(*this->pBuiltIns, *pClDevice);
restoreBuiltInBinaryName();
cl_int err;
// images not set
ASSERT_EQ(CL_INVALID_KERNEL_ARGS, avmeBuilder.VmeBuiltinDispatchInfoBuilder::validateVmeDispatch(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}, 64, 1));
EXPECT_EQ(CL_INVALID_KERNEL_ARGS, avmeBuilder.validateVmeDispatch(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}, 64, 1));
uint32_t srcImgArgNum = 1;
uint32_t refImgArgNum = 2;
std::unique_ptr<Image> imageValid(ImageHelper<ImageVmeValidFormat>::create(pContext));
cl_mem srcImgMem = imageValid.get();
avmeBuilder.setExplicitArg(srcImgArgNum, sizeof(srcImgMem), &srcImgMem, err);
avmeBuilder.setExplicitArg(refImgArgNum, sizeof(srcImgMem), &srcImgMem, err);
ASSERT_EQ(CL_SUCCESS, avmeBuilder.VmeBuiltinDispatchInfoBuilder::validateVmeDispatch(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}, 64, 1));
uint32_t flagsArgNum = 3;
uint32_t val = CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL;
avmeBuilder.setExplicitArg(flagsArgNum, sizeof(val), &val, err);
EXPECT_EQ(CL_INVALID_KERNEL_ARGS, avmeBuilder.validateVmeDispatch(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}, 64, 1));
val = CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL;
avmeBuilder.setExplicitArg(flagsArgNum, sizeof(val), &val, err);
uint32_t skipBlockTypeArgNum = 4;
val = 8192;
avmeBuilder.setExplicitArg(skipBlockTypeArgNum, sizeof(uint32_t), &val, err);
EXPECT_EQ(CL_OUT_OF_RESOURCES, avmeBuilder.validateVmeDispatch(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}, 64, 1));
val = CL_ME_MB_TYPE_16x16_INTEL;
avmeBuilder.setExplicitArg(skipBlockTypeArgNum, sizeof(uint32_t), &val, err);
uint32_t searchCostPenaltyArgNum = 5;
val = 8192;
avmeBuilder.setExplicitArg(searchCostPenaltyArgNum, sizeof(uint32_t), &val, err);
EXPECT_EQ(CL_OUT_OF_RESOURCES, avmeBuilder.validateVmeDispatch(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}, 64, 1));
val = CL_ME_COST_PENALTY_NONE_INTEL;
avmeBuilder.setExplicitArg(searchCostPenaltyArgNum, sizeof(uint32_t), &val, err);
uint32_t searchCostPrecisionArgNum = 6;
val = 8192;
avmeBuilder.setExplicitArg(searchCostPrecisionArgNum, sizeof(uint32_t), &val, err);
EXPECT_EQ(CL_OUT_OF_RESOURCES, avmeBuilder.validateVmeDispatch(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}, 64, 1));
val = CL_ME_COST_PRECISION_QPEL_INTEL;
avmeBuilder.setExplicitArg(searchCostPrecisionArgNum, sizeof(uint32_t), &val, err);
// for non-bidirectional avme kernel, countMotionVectorBuffer must be set
uint32_t countMotionVectorBufferArgNum = 7;
EXPECT_EQ(CL_INVALID_BUFFER_SIZE, avmeBuilder.validateVmeDispatch(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}, 64, 1));
MockBuffer mb;
cl_mem mem = &mb;
avmeBuilder.setExplicitArg(countMotionVectorBufferArgNum, sizeof(cl_mem), &mem, err);
EXPECT_EQ(CL_SUCCESS, avmeBuilder.validateVmeDispatch(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}, 1, 1));
}
TEST_F(VmeBuiltInTests, GivenAdvancedBidirectionalVmeWhenValidatingVmeDispatchThenCorrectReturns) {
overwriteBuiltInBinaryName("media_kernels_backend");
BuiltInOp<EBuiltInOps::vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel> avmeBuilder(*this->pBuiltIns, *pClDevice);
restoreBuiltInBinaryName();
cl_int err;
uint32_t srcImgArgNum = 1;
uint32_t refImgArgNum = 2;
std::unique_ptr<Image> imageValid(ImageHelper<ImageVmeValidFormat>::create(pContext));
cl_mem srcImgMem = imageValid.get();
avmeBuilder.setExplicitArg(srcImgArgNum, sizeof(srcImgMem), &srcImgMem, err);
avmeBuilder.setExplicitArg(refImgArgNum, sizeof(srcImgMem), &srcImgMem, err);
ASSERT_EQ(CL_SUCCESS, avmeBuilder.VmeBuiltinDispatchInfoBuilder::validateVmeDispatch(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}, 64, 1));
uint32_t flagsArgNum = 6;
uint32_t val = CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL;
avmeBuilder.setExplicitArg(flagsArgNum, sizeof(val), &val, err);
uint32_t searchCostPenaltyArgNum = 7;
val = CL_ME_COST_PENALTY_NONE_INTEL;
avmeBuilder.setExplicitArg(searchCostPenaltyArgNum, sizeof(uint32_t), &val, err);
uint32_t searchCostPrecisionArgNum = 8;
val = CL_ME_COST_PRECISION_QPEL_INTEL;
avmeBuilder.setExplicitArg(searchCostPrecisionArgNum, sizeof(uint32_t), &val, err);
uint32_t bidirWeightArgNum = 10;
val = 255;
avmeBuilder.setExplicitArg(bidirWeightArgNum, sizeof(uint8_t), &val, err);
EXPECT_EQ(CL_INVALID_KERNEL_ARGS, avmeBuilder.validateVmeDispatch(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}, 64, 1));
val = CL_ME_BIDIR_WEIGHT_QUARTER_INTEL;
avmeBuilder.setExplicitArg(bidirWeightArgNum, sizeof(uint8_t), &val, err);
EXPECT_EQ(CL_SUCCESS, avmeBuilder.validateVmeDispatch(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}, 64, 1));
// test bufferSize checking
uint32_t countMotionVectorBufferArgNum = 11;
MockBuffer mb;
cl_mem mem = &mb;
avmeBuilder.setExplicitArg(countMotionVectorBufferArgNum, sizeof(cl_mem), &mem, err);
EXPECT_EQ(CL_SUCCESS, avmeBuilder.validateVmeDispatch(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}, 1, 1));
EXPECT_EQ(CL_INVALID_BUFFER_SIZE, avmeBuilder.validateVmeDispatch(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}, mb.getSize() * 2, 1));
}
TEST_F(VmeBuiltInTests, GivenAdvancedVmeWhenGettingSkipResidualsBuffExpSizeThenDefaultSizeIsReturned) {
overwriteBuiltInBinaryName("media_kernels_backend");
BuiltInOp<EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel> vmeBuilder(*this->pBuiltIns, *pClDevice);
restoreBuiltInBinaryName();
auto size16x16 = vmeBuilder.getSkipResidualsBuffExpSize(CL_ME_MB_TYPE_16x16_INTEL, 4);
auto sizeDefault = vmeBuilder.getSkipResidualsBuffExpSize(8192, 4);
EXPECT_EQ(size16x16, sizeDefault);
}
TEST_F(BuiltInTests, GivenInvalidBuiltinKernelNameWhenCreatingBuiltInProgramThenInvalidValueErrorIsReturned) {
const char *kernelNames = "invalid_kernel";
cl_int retVal = CL_SUCCESS;
cl_program program = Vme::createBuiltInProgram(
*pContext,
pContext->getDevices(),
kernelNames,
retVal);
EXPECT_EQ(CL_INVALID_VALUE, retVal);
EXPECT_EQ(nullptr, program);
}
TEST_F(BuiltInTests, WhenGettingSipKernelThenReturnProgramCreatedFromIsaAcquiredThroughCompilerInterface) {
auto mockCompilerInterface = new MockCompilerInterface();
pDevice->getExecutionEnvironment()->rootDeviceEnvironments[rootDeviceIndex]->compilerInterface.reset(mockCompilerInterface);

View File

@ -1,962 +0,0 @@
/*
* Copyright (C) 2018-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
// VME KERNELS
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
block_motion_estimate_intel(sampler_t accelerator, __read_only image2d_t srcImg,
__read_only image2d_t refImg,
__global short2 *prediction_motion_vector_buffer,
__global short2 *motion_vector_buffer,
__global ushort *residuals, int height, int width,
int stride) {
__local uint dst[64];
__local ushort *dist = (__local ushort *)&dst[8 * 5];
int sid_0 = stride * get_group_id(0);
int gid_0 = sid_0 / height;
int gid_1 = sid_0 % height;
for (int sid = sid_0; sid < sid_0 + stride && gid_0 < width && gid_1 < height;
sid++, gid_0 = sid / height, gid_1 = sid % height) {
int2 srcCoord = 0;
int2 refCoord = 0;
srcCoord.x = gid_0 * 16 + get_global_offset(0);
srcCoord.y = gid_1 * 16 + get_global_offset(1);
short2 predMV = 0;
#ifndef HW_NULL_CHECK
if (prediction_motion_vector_buffer != NULL)
#endif
{
predMV = prediction_motion_vector_buffer[gid_0 + gid_1 * width];
refCoord.x = predMV.x / 4;
refCoord.y = predMV.y / 4;
refCoord.y = refCoord.y & 0xFFFE;
}
{
intel_work_group_vme_mb_query(dst, srcCoord, refCoord, srcImg, refImg,
accelerator);
}
barrier(CLK_LOCAL_MEM_FENCE);
// Write Out Result
// 4x4
if (intel_get_accelerator_mb_block_type(accelerator) == 0x2) {
int x = get_local_id(0) % 4;
int y = get_local_id(0) / 4;
int index = (gid_0 * 4 + x) + (gid_1 * 4 + y) * width * 4;
short2 val = as_short2(dst[8 + (y * 4 + x) * 2]);
motion_vector_buffer[index] = val;
#ifndef HW_NULL_CHECK
if (residuals != NULL)
#endif
{
residuals[index] = dist[y * 4 + x];
}
}
// 8x8
if (intel_get_accelerator_mb_block_type(accelerator) == 0x1) {
if (get_local_id(0) < 4) {
int x = get_local_id(0) % 2;
int y = get_local_id(0) / 2;
int index = (gid_0 * 2 + x) + (gid_1 * 2 + y) * width * 2;
short2 val = as_short2(dst[8 + (y * 2 + x) * 8]);
motion_vector_buffer[index] = val;
#ifndef HW_NULL_CHECK
if (residuals != NULL)
#endif
{
residuals[index] = dist[(y * 2 + x) * 4];
}
}
}
// 16x16
if (intel_get_accelerator_mb_block_type(accelerator) == 0x0) {
if (get_local_id(0) == 0) {
int index = gid_0 + gid_1 * width;
short2 val = as_short2(dst[8]);
motion_vector_buffer[index] = val;
#ifndef HW_NULL_CHECK
if (residuals != NULL)
#endif
{
residuals[index] = dist[0];
}
}
}
}
}
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
block_advanced_motion_estimate_check_intel(
sampler_t accelerator, __read_only image2d_t srcImg,
__read_only image2d_t refImg, uint flags, uint skip_block_type,
uint search_cost_penalty, uint search_cost_precision,
__global short2 *count_motion_vector_buffer,
__global short2 *predictors_buffer,
__global short2 *skip_motion_vector_buffer,
__global short2 *motion_vector_buffer,
__global char *intra_search_predictor_modes, __global ushort *residuals,
__global ushort *skip_residuals, __global ushort *intra_residuals,
__read_only image2d_t intraSrcImg, int height, int width, int stride) {
__local uint dstSearch[64]; // 8 GRFs
__local uint dstSkipIntra[64 + 24]; // 11 GRFs (8 for inter, 3 for intra)
__local ushort *distSearch =
(__local ushort *)&dstSearch[8 * 5]; // distortion in the 6th GRF
// Initialize the MV cost table:
// MV Cost in U4U4 format:
// No cost : 0, 0, 0, 0, 0, 0, 0, 0
// Low Cost : 1, 4, 5, 9, 10, 12, 14, 15
// Normal Cost: 5, 26, 29, 43, 45, 47, 57, 57
// High Cost : 29, 61, 72, 78, 88, 89, 91, 92
uint2 MVCostTable;
if (search_cost_penalty == 1) {
MVCostTable.s0 = 0x09050401;
MVCostTable.s1 = 0x0F0E0C0A;
} else if (search_cost_penalty == 2) {
MVCostTable.s0 = 0x2B1D1A05;
MVCostTable.s1 = 0x39392F2D;
} else if (search_cost_penalty == 3) {
MVCostTable.s0 = 0x4E483D1D;
MVCostTable.s1 = 0x5C5B5958;
} else {
MVCostTable.s0 = 0;
MVCostTable.s1 = 0;
}
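// Example decoding of the packed table above: for search_cost_penalty == 1,
// reading the least significant byte first, 0x09050401 holds the low-cost
// entries 1, 4, 5, 9 and 0x0F0E0C0A holds 10, 12, 14, 15.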
uint MVCostPrecision = ((uint)search_cost_precision) << 16;
// Frame is divided into rows * columns of MBs.
// One h/w thread per WG.
// One WG processes 'row' MBs - one row per iteration and one MB per row.
// Number of WGs (or h/w threads) is the number of MB columns.
// Each iteration processes the MB in a row - gid_0 is the MB id in a row and
// gid_1 is the row offset.
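// Illustrative numbers (matching the driver unit tests for a 352x288 frame):
// height = 288 / 16 = 18, width = 352 / 16 = 22, stride = 18, so each of the
// 22 work-groups keeps gid_0 fixed and steps gid_1 through the 18 rows of
// its column, one MB per iteration.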
int sid_0 = stride * get_group_id(0);
int gid_0 = sid_0 / height;
int gid_1 = sid_0 % height;
for (int sid = sid_0; sid < sid_0 + stride && gid_0 < width && gid_1 < height;
sid++, gid_0 = sid / height, gid_1 = sid % height) {
int2 srcCoord;
srcCoord.x = gid_0 * 16 +
get_global_offset(0); // 16 pixels wide MBs (globally scalar)
srcCoord.y = gid_1 * 16 +
get_global_offset(1); // 16 pixels tall MBs (globally scalar)
uint curMB = gid_0 + gid_1 * width; // current MB id
short2 count = count_motion_vector_buffer[curMB];
int countPredMVs = count.x;
if (countPredMVs != 0) {
uint offset = curMB * 8; // 8 predictors per MB
offset += get_local_id(0) % 8; // 16 work-items access 8 MVs for MB
// one predictor for MB per SIMD channel
// Reduce predictors from Q-pixel to integer precision.
int2 predMV = 0;
if (get_local_id(0) < countPredMVs) {
predMV =
convert_int2(predictors_buffer[offset]); // one MV per work-item
predMV.x /= 4;
predMV.y /= 4;
predMV.y &= 0xFFFE;
}
// Do up to 8 IMEs, get the best MVs and their distortions, and optionally
// a FBR of the best MVs.
// Finally the results are written out to SLM.
intel_work_group_vme_mb_multi_query_8(
dstSearch, // best search MV and its distortions into SLM
countPredMVs, // count of predictor MVs (globally scalar - value range
// 1 to 8)
MVCostPrecision, // MV cost precision
MVCostTable, // MV cost table
srcCoord, // MB 2-D offset (globally scalar)
predMV, // predictor MVs (up to 8 distinct MVs for SIMD16 thread)
srcImg, // source
refImg, // reference
accelerator); // vme object
}
int doIntra = (flags & 0x2) != 0;
int intraEdges = 0;
if (doIntra) {
// Enable all edges by default.
intraEdges = 0x3C;
// If this is a left-edge MB, then disable left edges.
if ((gid_0 == 0) & (get_global_offset(0) == 0)) {
intraEdges &= 0x14;
}
// If this is a right edge MB then disable right edges.
if (gid_0 == width - 1) {
intraEdges &= 0x38;
}
// If this is a top-edge MB, then disable top edges.
if ((gid_1 == 0) & (get_global_offset(1) == 0)) {
intraEdges &= 0x20;
}
// Set bit6=bit5.
intraEdges |= ((intraEdges & 0x20) << 1);
intraEdges <<= 8;
}
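// Illustrative mask (assumed values): a left-edge MB that is neither a top
// nor a right edge keeps 0x3C & 0x14 = 0x14; bit 6 stays clear because bit 5
// (0x20) is unset, and the final << 8 leaves intraEdges == 0x1400.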
int countSkipMVs = count.y;
if (countSkipMVs != 0 || doIntra == true) {
uint offset = curMB * 8; // 8 sets of skip check MVs per MB
offset +=
(get_local_id(0) % 8); // 16 work-items access 8 sets of MVs for MB
// one set of skip MV per SIMD channel
// Do up to 8 skip checks and get the distortions for each of them.
// Finally the results are written out to SLM.
if ((skip_block_type == 0x0) | ((doIntra) & (countSkipMVs == 0))) {
int skipMVs = 0;
if (get_local_id(0) < countSkipMVs) {
__global int *skip1_motion_vector_buffer =
(__global int *)skip_motion_vector_buffer;
skipMVs = skip1_motion_vector_buffer[offset]; // one packed MV for one
// work-item
}
intel_work_group_vme_mb_multi_check_16x16(
dstSkipIntra, // distortions into SLM
countSkipMVs, // count of skip check MVs (value range 0 to 8)
doIntra, // compute intra modes
intraEdges, // intra edges to use
srcCoord, // MB 2-D offset (globally scalar)
skipMVs, // skip check MVs (up to 8 sets of skip check MVs for
// SIMD16 thread)
srcImg, // source
refImg, // reference
intraSrcImg, // intra source
accelerator);
}
if ((skip_block_type == 0x1) & (countSkipMVs > 0)) {
int4 skipMVs = 0;
if (get_local_id(0) < countSkipMVs) {
__global int4 *skip4_motion_vector_buffer =
(__global int4 *)(skip_motion_vector_buffer);
skipMVs = skip4_motion_vector_buffer[offset]; // four component MVs
// per work-item
}
intel_work_group_vme_mb_multi_check_8x8(
dstSkipIntra, // distortions into SLM
countSkipMVs, // count of skip check MVs per MB (value range 0 to 8)
doIntra, // compute intra modes
intraEdges, // intra edges to use
srcCoord, // MB 2-D offset (globally scalar)
skipMVs, // skip check MVs (up to 8 sets of skip check MVs for SIMD16
// thread)
srcImg, // source
refImg, // reference
intraSrcImg, // intra source
accelerator);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Write Out motion estimation result:
// Result format
// Hierarchical row-major layout
// i.e. row-major of block MVs in MBs, and row-major of 8 sets of
// MVs/distortion in blocks
if (countPredMVs != 0) {
// 4x4
if (intel_get_accelerator_mb_block_type(accelerator) == 0x2) {
int index = (gid_0 * 16 + get_local_id(0)) + (gid_1 * 16 * width);
// 1. 16 work-items enabled.
// 2. Work-items gather fwd MVs in strided dword locations 0, 2, .., 30
// (interleaved fwd/bwd MVs) with constant offset 8 (control data size)
// from SLM into contiguous short2 locations 0, 1, .., 15 of global
// buffer motion_vector_buffer with offset index.
// 3. Work-items gather contiguous ushort locations 0, 1, .., 15 from
// distSearch into contiguous ushort locations 0, 1, .., 15 of
// residuals with offset index.
short2 val = as_short2(dstSearch[8 + get_local_id(0) * 2]);
motion_vector_buffer[index] = val;
#ifndef HW_NULL_CHECK
if (residuals != NULL)
#endif
{
residuals[index] = distSearch[get_local_id(0)];
}
}
// 8x8
else if (intel_get_accelerator_mb_block_type(accelerator) == 0x1) {
// Only the 1st 4 work-items are needed.
if (get_local_id(0) < 4) {
int index = (gid_0 * 4 + get_local_id(0)) + (gid_1 * 4 * width);
// 1. 4 work-items enabled.
// 2. Work-items gather fwd MVs in strided dword locations 0, 8, 16, 24
// (interleaved fwd/bwd MVs) with constant offset 8 from SLM into
// contiguous short2 locations 0, 1, .., 3 of global buffer
// motion_vector_buffer with offset index.
// 3. Work-items gather strided ushort locations 0, 4, 8, 12 from
// distSearch into contiguous ushort locations 0, 1, .., 3 of
// residuals with offset index.
short2 val = as_short2(dstSearch[8 + get_local_id(0) * 4 * 2]);
motion_vector_buffer[index] = val;
#ifndef HW_NULL_CHECK
if (residuals != NULL)
#endif
{
residuals[index] = distSearch[get_local_id(0) * 4];
}
}
}
// 16x16
else if (intel_get_accelerator_mb_block_type(accelerator) == 0x0) {
// Only the 1st work-item is needed.
if (get_local_id(0) == 0) {
int index = gid_0 + gid_1 * width;
// 1. 1 work-item enabled.
// 2. Work-item gathers the fwd MV in dword location 0 with constant
// offset 8 from SLM into short2 location 0 of global buffer
// motion_vector_buffer.
// 3. Work-item gathers ushort location 0 from distSearch into ushort
// location 0 of residuals with offset index.
short2 val = as_short2(dstSearch[8]);
motion_vector_buffer[index] = val;
#ifndef HW_NULL_CHECK
if (residuals != NULL)
#endif
{
residuals[index] = distSearch[0];
}
}
}
}
// Write out motion skip check result:
// Result format
// Hierarchical row-major layout
// i.e. row-major of blocks in MBs, and row-major of 8 sets of
// distortions in blocks
if (countSkipMVs != 0) {
if (skip_block_type == 0x0) {
// Copy out 8 (1 component) sets of distortion values.
int index = (gid_0 * 8) + (get_local_id(0)) + (gid_1 * 8 * width);
if (get_local_id(0) < countSkipMVs) {
__local ushort *distSkip = (__local ushort *)&dstSkipIntra[0];
// 1. Up to 8 work-items are enabled.
// 2. The work-item gathers distSkip locations 0, 16*1, .., 16*7 and
// copies them to contiguous skip_residual locations 0, 1, 2, ..,
// 7.
skip_residuals[index] = distSkip[get_local_id(0) * 16];
}
} else {
// Copy out 8 (4 component) sets of distortion values.
int index =
(gid_0 * 8 * 4) + (get_local_id(0)) + (gid_1 * 8 * 4 * width);
__local ushort *distSkip = (__local ushort *)&dstSkipIntra[0];
if (get_local_id(0) < countSkipMVs * 4) {
// 1. Up to 16 work-items are enabled.
// 2. The work-item gathers distSkip locations 0, 4*1, .., 4*31 and
// copies them to contiguous skip_residual locations 0, 1, 2, ..,
// 31.
skip_residuals[index] = distSkip[get_local_id(0) * 4];
skip_residuals[index + 16] = distSkip[(get_local_id(0) + 16) * 4];
}
}
}
// Write out intra search result:
if (doIntra) {
int index_low =
(gid_0 * 22) + (get_local_id(0) * 2) + (gid_1 * 22 * width);
int index_high =
(gid_0 * 22) + (get_local_id(0) * 2) + 1 + (gid_1 * 22 * width);
// Write out the 4x4 intra modes
if (get_local_id(0) < 8) {
__local char *dstIntra_4x4 =
(__local char *)(&dstSkipIntra[64 + 16 + 4]);
char value = dstIntra_4x4[get_local_id(0)];
char value_low = (value)&0xf;
char value_high = (value >> 4) & 0xf;
intra_search_predictor_modes[index_low + 5] = value_low;
intra_search_predictor_modes[index_high + 5] = value_high;
}
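// Illustrative unpack (assumed value): a dstIntra_4x4 byte of 0x21 yields
// value_low == 1 and value_high == 2, i.e. two 4x4 intra modes per byte.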
// Write out the 8x8 intra modes
if (get_local_id(0) < 4) {
__local char *dstIntra_8x8 =
(__local char *)(&dstSkipIntra[64 + 8 + 4]);
char value = dstIntra_8x8[get_local_id(0) * 2];
char value_low = (value)&0xf;
int index = (gid_0 * 22) + (get_local_id(0)) + (gid_1 * 22 * width);
intra_search_predictor_modes[index + 1] = value_low;
}
// Write out the 16x16 intra modes
if (get_local_id(0) < 1) {
__local char *dstIntra_16x16 =
(__local char *)(&dstSkipIntra[64 + 0 + 4]);
char value = dstIntra_16x16[get_local_id(0)];
char value_low = (value)&0xf;
intra_search_predictor_modes[index_low] = value_low;
}
// Get the intra residuals.
#ifndef HW_NULL_CHECK
if (intra_residuals != NULL)
#endif
{
int index = (gid_0 * 4) + (gid_1 * 4 * width);
if (get_local_id(0) < 1) {
__local ushort *distIntra_4x4 =
(__local ushort *)(&dstSkipIntra[64 + 16 + 3]);
__local ushort *distIntra_8x8 =
(__local ushort *)(&dstSkipIntra[64 + 8 + 3]);
__local ushort *distIntra_16x16 =
(__local ushort *)(&dstSkipIntra[64 + 0 + 3]);
intra_residuals[index + 2] = distIntra_4x4[0];
intra_residuals[index + 1] = distIntra_8x8[0];
intra_residuals[index + 0] = distIntra_16x16[0];
}
}
}
}
}
/*************************************************************************************************
Built-in kernel:
block_advanced_motion_estimate_bidirectional_check_intel
Description:
1. Do motion estimation with 0 to 4 predictor MVs using 0 to 4 IMEs (integer
motion estimations) per macro-block, calculating the best search MVs per
specified (16x16, 8x8, 4x4) luma block with the lowest distortion from
amongst the 0 to 4 IME results, and optionally do FBR (fractional
bi-directional refinement) on the best IME search results to refine them.
The best search (FBR if done, or IME) MVs and their distortions are
returned.
2. Do unidirectional or bidirectional skip (zero search) checks with 0 to 4
sets of skip check MVs for (16x16, 8x8) luma blocks using 0 to 4 SICs (skip
and intra checks), and return the distortions associated with the input sets
of skip check MVs per specified luma block. 4x4 blocks are not supported by
h/w for skip checks.
3. Do intra-prediction for (16x16, 8x8, 4x4) luma blocks and (8x8) chroma
blocks using 3 SICs, returning the predictor modes and their associated
distortions. Intra-prediction is done for all block sizes. Support for 8x8
chroma blocks cannot be enabled until NV image formats are supported in OCL.
**************************************************************************************************/
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
block_advanced_motion_estimate_bidirectional_check_intel(
sampler_t accelerator, __read_only image2d_t srcImg,
__read_only image2d_t refImg, __read_only image2d_t src_check_image,
__read_only image2d_t ref0_check_image,
__read_only image2d_t ref1_check_image, uint flags,
uint search_cost_penalty, uint search_cost_precision, short2 count_global,
uchar bidir_weight, __global short2 *count_motion_vector_buffer,
__global short2 *prediction_motion_vector_buffer,
__global char *skip_input_mode_buffer,
__global short2 *skip_motion_vector_buffer,
__global short2 *search_motion_vector_buffer,
__global char *intra_search_predictor_modes,
__global ushort *search_residuals, __global ushort *skip_residuals,
__global ushort *intra_residuals, __read_only image2d_t intraSrcImg,
int height, int width, int stride) {
__local uint dstSearch[64]; // 8 GRFs
__local uint dstSkipIntra[32 + 24]; // 7 GRFs (4 for inter, 3 for intra)
// distortion in the 6th GRF
__local ushort *distSearch = (__local ushort *)&dstSearch[8 * 5];
// Initialize the MV cost table:
// MV Cost in U4U4 format:
// No cost : 0, 0, 0, 0, 0, 0, 0, 0
// Low Cost : 1, 4, 5, 9, 10, 12, 14, 15
// Normal Cost: 5, 26, 29, 43, 45, 47, 57, 57
// High Cost : 29, 61, 72, 78, 88, 89, 91, 92
uint2 MVCostTable;
if (search_cost_penalty == 1) {
MVCostTable.s0 = 0x09050401;
MVCostTable.s1 = 0x0F0E0C0A;
} else if (search_cost_penalty == 2) {
MVCostTable.s0 = 0x2B1D1A05;
MVCostTable.s1 = 0x39392F2D;
} else if (search_cost_penalty == 3) {
MVCostTable.s0 = 0x4E483D1D;
MVCostTable.s1 = 0x5C5B5958;
} else {
MVCostTable.s0 = 0;
MVCostTable.s1 = 0;
}
uint MVCostPrecision = ((uint)search_cost_precision) << 16;
// Frame is divided into rows * columns of MBs.
// One h/w thread per WG.
// One WG processes "row" MBs - one row per iteration and one MB per row.
// Number of WGs (or h/w threads) is the number of MB columns. Each iteration
// processes the MB in a row - gid_0 is the MB id in a row and gid_1 is the
// row offset.
int sid_0 = stride * get_group_id(0);
int gid_0 = sid_0 / height;
int gid_1 = sid_0 % height;
for (int sid = sid_0; sid < sid_0 + stride && gid_0 < width && gid_1 < height;
sid++, gid_0 = sid / height, gid_1 = sid % height) {
int2 srcCoord;
srcCoord.x = gid_0 * 16 +
get_global_offset(0); // 16 pixels wide MBs (globally scalar)
srcCoord.y = gid_1 * 16 +
get_global_offset(1); // 16 pixels tall MBs (globally scalar)
uint curMB = gid_0 + gid_1 * width; // current MB id
short2 count;
// If either the search or skip vector counts are per-MB, then we need to
// read in
// the count motion vector buffer.
if ((count_global.s0 == -1) | (count_global.s1 == -1)) {
count = count_motion_vector_buffer[curMB];
}
// If either the search or skip vector counts are per-frame, we need to use
// those.
if (count_global.s0 >= 0) {
count.s0 = count_global.s0;
}
if (count_global.s1 >= 0) {
count.s1 = count_global.s1;
}
int countPredMVs = count.x;
if (countPredMVs != 0) {
uint offset = curMB * 4; // 4 predictors per MB
offset += get_local_id(0) % 4; // 16 work-items access 4 MVs for MB
// one predictor for MB per SIMD channel
// Reduce predictors from Q-pixel to integer precision.
int2 predMV = 0;
if (get_local_id(0) < countPredMVs) {
// one MV per work-item
predMV = convert_int2(prediction_motion_vector_buffer[offset]);
// Predictors are input in QP resolution. Convert that to integer
// resolution.
predMV.x /= 4;
predMV.y /= 4;
predMV.y &= 0xFFFFFFFE;
}
// Do up to 4 IMEs, get the best MVs and their distortions, and optionally
// a FBR of
// the best MVs. Finally the results are written out to SLM.
intel_work_group_vme_mb_multi_query_4(
dstSearch, // best search MV and its distortions into SLM
countPredMVs, // count of predictor MVs (globally scalar - value range
// 1 to 4)
MVCostPrecision, // MV cost precision
MVCostTable, // MV cost table
srcCoord, // MB 2-D offset (globally scalar)
predMV, // predictor MVs (up to 4 distinct MVs for SIMD16 thread)
srcImg, // source
refImg, // reference
accelerator); // vme object
}
int doIntra = ((flags & 0x2) != 0);
int intraEdges = 0;
if (doIntra) {
// Enable all edges by default.
intraEdges = 0x3C;
// If this is a left-edge MB, then disable left edges.
if ((gid_0 == 0) & (get_global_offset(0) == 0)) {
intraEdges &= 0x14;
}
// If this is a right edge MB then disable right edges.
if (gid_0 == width - 1) {
intraEdges &= 0x38;
}
// If this is a top-edge MB, then disable top edges.
if ((gid_1 == 0) & (get_global_offset(1) == 0)) {
intraEdges &= 0x20;
}
// Set bit6=bit5.
intraEdges |= ((intraEdges & 0x20) << 1);
intraEdges <<= 8;
}
int skip_block_type_8x8 = flags & 0x4;
int countSkipMVs = count.y;
if (countSkipMVs != 0 || doIntra == true) {
// one set of skip MV per SIMD channel
// Do up to 4 skip checks and get the distortions for each of them.
// Finally the results are written out to SLM.
if ((skip_block_type_8x8 == 0) | ((doIntra) & (countSkipMVs == 0))) {
// 16x16:
uint offset = curMB * 4 * 2; // 4 sets of skip check MVs per MB
int skipMV = 0;
if (get_local_id(0) < countSkipMVs * 2) // need 2 values per MV
{
offset +=
(get_local_id(0)); // 16 work-items access 4 sets of MVs for MB
__global int *skip1_motion_vector_buffer =
(__global int *)skip_motion_vector_buffer;
skipMV = skip1_motion_vector_buffer[offset]; // one MV per work-item
}
uchar skipMode = 0;
if (get_local_id(0) < countSkipMVs) {
skipMode = skip_input_mode_buffer[curMB];
if (skipMode == 0) {
skipMode = 1;
}
if (skipMode > 3) {
skipMode = 3;
}
}
intel_work_group_vme_mb_multi_bidir_check_16x16(
dstSkipIntra, // distortions into SLM
countSkipMVs, // count of skip check MVs (globally scalar - value
// range 1 to 4)
doIntra, // compute intra modes
intraEdges, // intra edges to use
srcCoord, // MB 2-D offset (globally scalar)
bidir_weight, // bidirectional weight
skipMode, // skip modes
skipMV, // skip check MVs (up to 4 distinct sets of skip check MVs
// for SIMD16 thread)
src_check_image, // source
ref0_check_image, // reference fwd
ref1_check_image, // reference bwd
intraSrcImg, // intra source
accelerator); // vme object
} else {
// 8x8:
uint offset =
curMB * 4 *
8; // 4 sets of skip check MVs, 16 shorts (8 ints) each per MB
int2 skipMVs = 0;
if (get_local_id(0) < countSkipMVs * 8) // need 8 values per MV
{
offset +=
(get_local_id(0)); // 16 work-items access 4 sets of MVs for MB
__global int *skip1_motion_vector_buffer =
(__global int *)(skip_motion_vector_buffer);
skipMVs.x = skip1_motion_vector_buffer[offset]; // four component MVs
// per work-item
skipMVs.y = skip1_motion_vector_buffer[offset + 16];
}
uchar skipModes = 0;
if (get_local_id(0) < countSkipMVs) {
skipModes = skip_input_mode_buffer[curMB];
}
intel_work_group_vme_mb_multi_bidir_check_8x8(
dstSkipIntra, // distortions into SLM
countSkipMVs, // count of skip check MVs per MB (globally scalar -
// value range 1 to 4)
doIntra, // compute intra modes
intraEdges, // intra edges to use
srcCoord, // MB 2-D offset (globally scalar)
bidir_weight, // bidirectional weight
skipModes, // skip modes
skipMVs, // skip check MVs (up to 4 distinct sets of skip check MVs
// for SIMD16 thread)
src_check_image, // source
ref0_check_image, // reference fwd
ref1_check_image, // reference bwd
intraSrcImg, // intra source
accelerator); // vme object
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Write Out motion estimation result:
// Result format
// Hierarchical row-major layout
// i.e. row-major of block MVs in MBs, and row-major of 4 sets of
// MVs/distortion in blocks
if (countPredMVs != 0) {
// 4x4
if (intel_get_accelerator_mb_block_type(accelerator) == 0x2) {
int index = (gid_0 * 16 + get_local_id(0)) + (gid_1 * 16 * width);
// 1. 16 work-items enabled.
// 2. Work-items gather fwd MVs in strided dword locations 0, 2, .., 30
// (interleaved fwd/bwd MVs) with constant offset 8 (control data size)
// from SLM into contiguous short2 locations 0, 1, .., 15 of global
// buffer search_motion_vector_buffer with offset index.
// 3. Work-items gather contiguous ushort locations 0, 1, .., 15 from
// distSearch into contiguous ushort locations 0, 1, .., 15 of
// search_residuals with offset index.
short2 val = as_short2(dstSearch[8 + get_local_id(0) * 2]);
search_motion_vector_buffer[index] = val;
#ifndef HW_NULL_CHECK
if (search_residuals != NULL)
#endif
{
search_residuals[index] = distSearch[get_local_id(0)];
}
}
// 8x8
else if (intel_get_accelerator_mb_block_type(accelerator) == 0x1) {
// Only the 1st 4 work-items are needed.
if (get_local_id(0) < 4) {
int index = (gid_0 * 4 + get_local_id(0)) + (gid_1 * 4 * width);
// 1. 4 work-items enabled.
// 2. Work-items gather fwd MVs in strided dword locations 0, 8, 16, 24
// (interleaved fwd/bwd MVs) with constant offset 8 from SLM into
// contiguous short2 locations 0, 1, .., 3 of global buffer
// search_motion_vector_buffer with offset index.
// 3. Work-items gather strided ushort locations 0, 4, 8, 12 from
// distSearch into contiguous ushort locations 0, 1, .., 3 of
// search_residuals with offset index.
short2 val = as_short2(dstSearch[8 + get_local_id(0) * 4 * 2]);
search_motion_vector_buffer[index] = val;
#ifndef HW_NULL_CHECK
if (search_residuals != NULL)
#endif
{
search_residuals[index] = distSearch[get_local_id(0) * 4];
}
}
}
// 16x16
else if (intel_get_accelerator_mb_block_type(accelerator) == 0x0) {
// Only the 1st work-item is needed.
if (get_local_id(0) == 0) {
int index = gid_0 + gid_1 * width;
// 1. 1 work-item enabled.
// 2. Work-item gathers the fwd MV in dword location 0 with constant
// offset 8 from SLM into short2 location 0 of global buffer
// search_motion_vector_buffer.
// 3. Work-item gathers ushort location 0 from distSearch into ushort
// location 0 of search_residuals with offset index.
short2 val = as_short2(dstSearch[8]);
search_motion_vector_buffer[index] = val;
#ifndef HW_NULL_CHECK
if (search_residuals != NULL)
#endif
{
search_residuals[index] = distSearch[0];
}
}
}
}
// Write out motion skip check result:
// Result format
// Hierarchical row-major layout
// i.e. row-major of blocks in MBs, and row-major of 4 sets of
// distortions in blocks
if (countSkipMVs != 0) {
if (skip_block_type_8x8 == 0) {
// Copy out 4 (1 component) sets of distortion values.
int index = (gid_0 * 4) + (get_local_id(0)) + (gid_1 * 4 * width);
if (get_local_id(0) < countSkipMVs) {
// 1. Up to 4 work-items are enabled.
// 2. The work-item gathers distSkip locations 0, 16*1, .., 16*3 and
// copies them to contiguous skip_residuals locations 0, 1, .., 3.
__local ushort *distSkip = (__local ushort *)&dstSkipIntra[0];
skip_residuals[index] = distSkip[get_local_id(0) * 16];
}
} else {
// Copy out 4 (4 component) sets of distortion values.
int index =
(gid_0 * 4 * 4) + (get_local_id(0)) + (gid_1 * 4 * 4 * width);
if (get_local_id(0) < countSkipMVs * 4) {
// 1. Up to 16 work-items are enabled.
// 2. The work-item gathers distSkip locations 0, 4*1, .., 4*15 and
// copies them to contiguous skip_residual locations 0, 1, 2, ..,
// 15.
__local ushort *distSkip = (__local ushort *)&dstSkipIntra[0];
skip_residuals[index] = distSkip[get_local_id(0) * 4];
}
}
}
// Write out intra search result:
if (doIntra) {
// Write out the 4x4 intra modes
if (get_local_id(0) < 8) {
__local char *dstIntra_4x4 =
(__local char *)(&dstSkipIntra[32 + 16 + 4]);
char value = dstIntra_4x4[get_local_id(0)];
char value_low = (value)&0xf;
char value_high = (value >> 4) & 0xf;
int index_low =
(gid_0 * 22) + (get_local_id(0) * 2) + (gid_1 * 22 * width);
int index_high =
(gid_0 * 22) + (get_local_id(0) * 2) + 1 + (gid_1 * 22 * width);
intra_search_predictor_modes[index_low + 5] = value_low;
intra_search_predictor_modes[index_high + 5] = value_high;
}
// Write out the 8x8 intra modes
if (get_local_id(0) < 4) {
__local char *dstIntra_8x8 =
(__local char *)(&dstSkipIntra[32 + 8 + 4]);
char value = dstIntra_8x8[get_local_id(0) * 2];
char value_low = (value)&0xf;
int index = (gid_0 * 22) + (get_local_id(0)) + (gid_1 * 22 * width);
intra_search_predictor_modes[index + 1] = value_low;
}
// Write out the 16x16 intra modes
if (get_local_id(0) < 1) {
__local char *dstIntra_16x16 =
(__local char *)(&dstSkipIntra[32 + 0 + 4]);
char value = dstIntra_16x16[0];
char value_low = (value)&0xf;
int index = (gid_0 * 22) + (gid_1 * 22 * width);
intra_search_predictor_modes[index] = value_low;
}
// Get the intra residuals.
#ifndef HW_NULL_CHECK
if (intra_residuals != NULL)
#endif
{
int index = (gid_0 * 4) + (gid_1 * 4 * width);
if (get_local_id(0) < 1) {
__local ushort *distIntra_4x4 =
(__local ushort *)(&dstSkipIntra[32 + 16 + 3]);
__local ushort *distIntra_8x8 =
(__local ushort *)(&dstSkipIntra[32 + 8 + 3]);
__local ushort *distIntra_16x16 =
(__local ushort *)(&dstSkipIntra[32 + 0 + 3]);
intra_residuals[index + 2] = distIntra_4x4[0];
intra_residuals[index + 1] = distIntra_8x8[0];
intra_residuals[index + 0] = distIntra_16x16[0];
}
}
}
}
}
// VEBOX KERNELS:
__kernel void ve_enhance_intel(sampler_t accelerator,
int flags,
__read_only image2d_t current_input,
__write_only image2d_t current_output) {
}
__kernel void ve_dn_enhance_intel(sampler_t accelerator,
int flags,
__read_only image2d_t ref_input,
__read_only image2d_t current_input,
__write_only image2d_t current_output) {
}
__kernel void ve_dn_di_enhance_intel(sampler_t accelerator,
int flags,
__read_only image2d_t current_input,
__read_only image2d_t ref_input,
__write_only image2d_t current_output,
__write_only image2d_t ref_output,
__write_only image2d_t dndi_output) {
}
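For context, this is roughly how an application drove the removed built-in VME path from the host side. A minimal sketch, assuming the cl_intel_accelerator and cl_intel_motion_estimation extensions; the helper name create_vme_kernel and the descriptor values are illustrative, error handling is elided, and in portable code clCreateAcceleratorINTEL would be resolved via clGetExtensionFunctionAddressForPlatform:
#include <CL/cl.h>
#include <CL/cl_ext.h>
/* Sketch: obtain the built-in VME kernel and its accelerator (hypothetical
 * helper; context and device are assumed to be created elsewhere). */
static cl_kernel create_vme_kernel(cl_context ctx, cl_device_id dev,
                                   cl_accelerator_intel *accel_out) {
    cl_int err = CL_SUCCESS;
    cl_program prog = clCreateProgramWithBuiltInKernels(
        ctx, 1, &dev, "block_motion_estimate_intel", &err);
    cl_kernel kernel = clCreateKernel(prog, "block_motion_estimate_intel", &err);
    /* Accelerator selecting 16x16 blocks, integer pel, no SAD adjustment. */
    cl_motion_estimation_desc_intel desc = {
        CL_ME_MB_TYPE_16x16_INTEL,
        CL_ME_SUBPIXEL_MODE_INTEGER_INTEL,
        CL_ME_SAD_ADJUST_MODE_NONE_INTEL,
        CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL};
    *accel_out = clCreateAcceleratorINTEL(
        ctx, CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL,
        sizeof(desc), &desc, &err);
    /* The accelerator binds to the kernel's sampler_t argument (arg 0). */
    clSetKernelArg(kernel, 0, sizeof(cl_accelerator_intel), accel_out);
    return kernel;
}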

@ -1,8 +0,0 @@
/*
* Copyright (C) 2018-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
-D cl_intel_device_side_advanced_vme_enable -D cl_intel_device_side_avc_vme_enable -D cl_intel_device_side_vme_enable -D cl_intel_media_block_io -cl-unsafe-math-optimizations -cl-mad-enable -cl-fast-relaxed-math

@ -1,68 +0,0 @@
/*
* Copyright (C) 2018-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
// VME KERNELS
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
block_motion_estimate_intel(sampler_t accelerator, __read_only image2d_t srcImg,
__read_only image2d_t refImg,
__global short2 *prediction_motion_vector_buffer,
__global short2 *motion_vector_buffer,
__global ushort *residuals) {
}
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
block_advanced_motion_estimate_check_intel(
sampler_t accelerator, __read_only image2d_t srcImg,
__read_only image2d_t refImg, uint flags, uint skip_block_type,
uint search_cost_penalty, uint search_cost_precision,
__global short2 *count_motion_vector_buffer,
__global short2 *predictors_buffer,
__global short2 *skip_motion_vector_buffer,
__global short2 *motion_vector_buffer,
__global char *intra_search_predictor_modes, __global ushort *residuals,
__global ushort *skip_residuals, __global ushort *intra_residuals) {
}
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
block_advanced_motion_estimate_bidirectional_check_intel(
sampler_t accelerator, __read_only image2d_t srcImg,
__read_only image2d_t refImg, __read_only image2d_t src_check_image,
__read_only image2d_t ref0_check_image,
__read_only image2d_t ref1_check_image, uint flags,
uint search_cost_penalty, uint search_cost_precision, short2 count_global,
uchar bidir_weight, __global short2 *count_motion_vector_buffer,
__global short2 *prediction_motion_vector_buffer,
__global char *skip_input_mode_buffer,
__global short2 *skip_motion_vector_buffer,
__global short2 *search_motion_vector_buffer,
__global char *intra_search_predictor_modes,
__global ushort *search_residuals, __global ushort *skip_residuals,
__global ushort *intra_residuals) {
}
// VEBOX KERNELS:
__kernel void ve_enhance_intel(sampler_t accelerator,
int flags,
__read_only image2d_t current_input,
__write_only image2d_t current_output) {
}
__kernel void ve_dn_enhance_intel(sampler_t accelerator,
int flags,
__read_only image2d_t ref_input,
__read_only image2d_t current_input,
__write_only image2d_t current_output) {
}
__kernel void ve_dn_di_enhance_intel(sampler_t accelerator,
int flags,
__read_only image2d_t current_input,
__read_only image2d_t ref_input,
__write_only image2d_t current_output,
__write_only image2d_t ref_output,
__write_only image2d_t dndi_output) {
}

@ -1,8 +0,0 @@
/*
* Copyright (C) 2018-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
-D cl_intel_device_side_advanced_vme_enable -D cl_intel_device_side_avc_vme_enable -D cl_intel_device_side_vme_enable -D cl_intel_media_block_io -cl-unsafe-math-optimizations -cl-mad-enable -cl-fast-relaxed-math

View File

@ -1,106 +0,0 @@
/*
* Copyright (C) 2018-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
/*************************************************************************************************\
Device-side basic vme kernel:
device_side_block_motion_estimate_intel
Description:
Computes motion vectors by comparing a 2d image source with a 2d reference image, producing a
vector field of motion vectors. The algorithm searches for the best match of each macroblock pixel
block in the source image by searching an image region in the reference image, centered on the
coordinates of that pixel macroblock in the source image (optionally offset by the prediction
motion vectors).
This kernel optionally takes a vector field of motion vector predictors via the
prediction_motion_vector_buffer kernel argument. The kernel also optionally returns a vector
field of per-macroblock pixel-block information records. Each record contains the best-match
distortion (SAD) value and additional search result information.
This kernel needs to be compiled with the following compiler option:
" -D cl_intel_device_side_avc_vme_enable "
\*************************************************************************************************/
__kernel __attribute__((reqd_work_group_size(16, 1, 1)))
void device_side_block_motion_estimate_intel(__read_only image2d_t srcImg,
__read_only image2d_t refImg,
__global short2 *prediction_motion_vector_buffer,
__global short2 *motion_vector_buffer,
__global ushort *residuals_buffer,
__global uchar2 *shapes_buffer,
int iterations,
int partitionMask) {
int gid_0 = get_group_id(0);
int gid_1 = 0;
sampler_t vme_samp = 0;
for (int i = 0; i < iterations; i++, gid_1++) {
ushort2 srcCoord = 0;
short2 refCoord = 0;
short2 predMV = 0;
srcCoord.x = gid_0 * 16 + get_global_offset(0);
srcCoord.y = gid_1 * 16 + get_global_offset(1);
if (prediction_motion_vector_buffer != NULL) {
predMV = prediction_motion_vector_buffer[gid_0 + gid_1 * get_num_groups(0)];
refCoord.x = predMV.x / 4;
refCoord.y = predMV.y / 4;
refCoord.y = refCoord.y & 0xFFFE;
}
uchar partition_mask = (uchar)partitionMask;
uchar sad_adjustment = CLK_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL;
uchar pixel_mode = CLK_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL;
intel_sub_group_avc_ime_payload_t payload = intel_sub_group_avc_ime_initialize(srcCoord, partition_mask, sad_adjustment);
payload = intel_sub_group_avc_ime_set_single_reference(refCoord, CLK_AVC_ME_SEARCH_WINDOW_16x12_RADIUS_INTEL, payload);
intel_sub_group_avc_ime_result_t result = intel_sub_group_avc_ime_evaluate_with_single_reference(srcImg, refImg, vme_samp, payload);
// Process Results
long mvs = intel_sub_group_avc_ime_get_motion_vectors(result);
ushort sads = intel_sub_group_avc_ime_get_inter_distortions(result);
uchar major_shape = intel_sub_group_avc_ime_get_inter_major_shape(result);
uchar minor_shapes = intel_sub_group_avc_ime_get_inter_minor_shapes(result);
uchar2 shapes = {major_shape, minor_shapes};
uchar directions = intel_sub_group_avc_ime_get_inter_directions(result);
// Perform FME for non-Integer Pixel mode
if (pixel_mode != CLK_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL) {
intel_sub_group_avc_ref_payload_t payload = intel_sub_group_avc_fme_initialize(srcCoord, mvs, major_shape, minor_shapes, directions, pixel_mode, sad_adjustment);
intel_sub_group_avc_ref_result_t result = intel_sub_group_avc_ref_evaluate_with_single_reference(srcImg, refImg, vme_samp, payload);
mvs = intel_sub_group_avc_ref_get_motion_vectors(result);
sads = intel_sub_group_avc_ref_get_inter_distortions(result);
}
// Write Out Result
if ((get_local_id(0) % 4) == 0) {
int x = get_local_id(0) % 4;
int y = get_local_id(0) / 4;
int width = get_image_width(srcImg);
int index = (gid_0 * 4 + x) + (gid_1 * width / 4 + y);
int2 bi_mvs = as_int2(mvs);
motion_vector_buffer[index] = as_short2(bi_mvs.s0);
if (residuals_buffer != NULL) {
residuals_buffer[index] = sads;
}
shapes_buffer[gid_0 + gid_1 * get_num_groups(0)] = shapes;
}
}
}
__kernel void non_vme_kernel(__global unsigned int *src, __global unsigned int *dst) {
int id = (int)get_global_id(0);
dst[id] = lgamma((float)src[id]); // note: this result is immediately overwritten by the next store
dst[id] = src[id];
}
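As the header comment above states, building a program that uses these device-side built-ins requires the VME define; a minimal sketch of the host-side build call (the program and device handles are assumed to exist):
/* Sketch: compile with the device-side AVC VME extension enabled, matching
 * the option documented in the kernel header above. */
cl_int err = clBuildProgram(program, 1, &device,
                            "-D cl_intel_device_side_avc_vme_enable",
                            NULL, NULL);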

@ -1,8 +0,0 @@
/*
* Copyright (C) 2018-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
-D cl_intel_device_side_vme_enable -D HW_NULL_CHECK

@ -69,7 +69,6 @@ BuiltinResourceT createBuiltinResource(const BuiltinResourceT &r);
std::string createBuiltinResourceName(EBuiltInOps::Type builtin, const std::string &extension);
StackVec<std::string, 3> getBuiltinResourceNames(EBuiltInOps::Type builtin, BuiltinCode::ECodeType type, const Device &device);
const char *getBuiltinAsString(EBuiltInOps::Type builtin);
const char *getAdditionalBuiltinAsString(EBuiltInOps::Type builtin);
class Storage {
public:

@ -23,10 +23,6 @@
namespace NEO {
const char *getBuiltinAsString(EBuiltInOps::Type builtin) {
const char *builtinString = getAdditionalBuiltinAsString(builtin);
if (builtinString) {
return builtinString;
}
switch (builtin) {
default:
return "unknown";

@ -35,7 +35,6 @@ void PageFaultManager::transferToGpu(void *ptr, void *cmdQ) {
}
void PageFaultManager::allowCPUMemoryEviction(void *ptr, PageFaultData &pageFaultData) {
}
const char *getAdditionalBuiltinAsString(EBuiltInOps::Type builtin) { return nullptr; }
void RootDeviceEnvironment::initApiGfxCoreHelper() {
}