refactor: remove unneeded VME builtin-related code
Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
commit d5812f49d7
parent 8170599b17
@@ -982,7 +982,6 @@ set(BUILTINS_BINARIES_STATELESS_HEAPLESS_LIB_NAME "builtins_binaries_stateless_h
 set(BUILTINS_BINARIES_BINDFUL_LIB_NAME "builtins_binaries_bindful")
 set(BUILTINS_BINARIES_BINDLESS_LIB_NAME "builtins_binaries_bindless")
 set(BUILTINS_SPIRV_LIB_NAME "builtins_spirv")
-set(BUILTINS_VME_LIB_NAME "builtins_vme")
 
 if(WIN32)
   set(NEO_EXTRA_LIBS Ws2_32)
@@ -14,12 +14,6 @@
 #include "level_zero/core/source/device/device.h"
 #include "level_zero/core/source/kernel/kernel.h"
 
-namespace NEO {
-const char *getAdditionalBuiltinAsString(EBuiltInOps::Type builtin) {
-    return nullptr;
-}
-} // namespace NEO
-
 namespace L0 {
 
 BuiltinFunctionsLibImpl::BuiltinData::~BuiltinData() {
@@ -42,7 +42,7 @@ components:
     dest_dir: kernels_bin
     type: git
     branch: kernels_bin
-    revision: 3043-2372
+    revision: 3043-2375
   kmdaf:
     branch: kmdaf
     dest_dir: kmdaf
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2021-2023 Intel Corporation
+# Copyright (C) 2021-2024 Intel Corporation
 #
 # SPDX-License-Identifier: MIT
 #
@@ -20,7 +20,6 @@ macro(generate_runtime_lib LIB_NAME MOCKABLE GENERATE_EXEC)
     add_subdirectory(source "${NEO_BUILD_DIR}/${LIB_NAME}" EXCLUDE_FROM_ALL)
   endif()
   target_compile_definitions(${BUILTINS_SOURCES_LIB_NAME} PUBLIC MOCKABLE_VIRTUAL=)
-  target_compile_definitions(${BUILTINS_VME_LIB_NAME} PUBLIC MOCKABLE_VIRTUAL=)
 
   if(${MOCKABLE})
     target_compile_definitions(${LIB_NAME} PUBLIC MOCKABLE_VIRTUAL=virtual)
@@ -106,7 +106,6 @@ if(${GENERATE_EXECUTABLE})
   list(APPEND NEO_DYNAMIC_LIB__TARGET_OBJECTS
        $<TARGET_OBJECTS:${SHARINGS_ENABLE_LIB_NAME}>
        $<TARGET_OBJECTS:${BUILTINS_SOURCES_LIB_NAME}>
-       $<TARGET_OBJECTS:${BUILTINS_VME_LIB_NAME}>
        $<TARGET_OBJECTS:${BUILTINS_BINARIES_STATELESS_LIB_NAME}>
        $<TARGET_OBJECTS:${BUILTINS_BINARIES_STATELESS_HEAPLESS_LIB_NAME}>
        $<TARGET_OBJECTS:${BUILTINS_BINARIES_BINDFUL_LIB_NAME}>
@@ -25,7 +25,6 @@
 #include "opencl/source/accelerators/intel_motion_estimation.h"
 #include "opencl/source/api/additional_extensions.h"
 #include "opencl/source/api/api_enter.h"
-#include "opencl/source/built_ins/vme_builtin.h"
 #include "opencl/source/cl_device/cl_device.h"
 #include "opencl/source/command_queue/command_queue.h"
 #include "opencl/source/context/context.h"
@@ -1506,36 +1505,12 @@ cl_program CL_API_CALL clCreateProgramWithBuiltInKernels(cl_context context,
     TRACING_ENTER(ClCreateProgramWithBuiltInKernels, &context, &numDevices, &deviceList, &kernelNames, &errcodeRet);
     cl_int retVal = CL_SUCCESS;
     API_ENTER(&retVal);
+    cl_program program = nullptr;
     DBG_LOG_INPUTS("context", context,
                    "numDevices", numDevices,
                    "deviceList", deviceList,
                    "kernelNames", kernelNames);
-    cl_program program = nullptr;
-    Context *pContext = nullptr;
-
-    retVal = validateObjects(withCastToInternal(context, &pContext), numDevices,
-                             deviceList, kernelNames, errcodeRet);
-
-    if (retVal == CL_SUCCESS) {
-        ClDeviceVector deviceVector;
-        for (auto i = 0u; i < numDevices; i++) {
-            auto device = castToObject<ClDevice>(deviceList[i]);
-            if (!device || !pContext->isDeviceAssociated(*device)) {
-                retVal = CL_INVALID_DEVICE;
-                break;
-            }
-            deviceVector.push_back(device);
-        }
-        if (retVal == CL_SUCCESS) {
-
-            program = Vme::createBuiltInProgram(
-                *pContext,
-                deviceVector,
-                kernelNames,
-                retVal);
-        }
-    }
-
+    retVal = CL_INVALID_VALUE;
     if (errcodeRet) {
         *errcodeRet = retVal;
     }
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2018-2022 Intel Corporation
+# Copyright (C) 2018-2024 Intel Corporation
 #
 # SPDX-License-Identifier: MIT
 #
@@ -9,32 +9,11 @@ set(RUNTIME_SRCS_BUILT_INS
     ${CMAKE_CURRENT_SOURCE_DIR}/aux_translation_builtin.h
     ${CMAKE_CURRENT_SOURCE_DIR}/builtins_dispatch_builder.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/builtins_dispatch_builder.h
-    ${CMAKE_CURRENT_SOURCE_DIR}/built_in_ops_vme.h
     ${CMAKE_CURRENT_SOURCE_DIR}/built_ins.inl
-    ${CMAKE_CURRENT_SOURCE_DIR}/vme_builtin.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/vme_builtin.h
-    ${CMAKE_CURRENT_SOURCE_DIR}/vme_dispatch_builder.h
 )
 
 target_sources(${NEO_STATIC_LIB_NAME} PRIVATE ${RUNTIME_SRCS_BUILT_INS})
 set_property(GLOBAL PROPERTY RUNTIME_SRCS_BUILT_INS ${RUNTIME_SRCS_BUILT_INS})
 
-set(RUNTIME_SRCS_BUILT_IN_KERNELS
-    ${CMAKE_CURRENT_SOURCE_DIR}/kernels/vme_block_advanced_motion_estimate_bidirectional_check_intel.builtin_kernel
-    ${CMAKE_CURRENT_SOURCE_DIR}/kernels/vme_block_advanced_motion_estimate_check_intel.builtin_kernel
-    ${CMAKE_CURRENT_SOURCE_DIR}/kernels/vme_block_motion_estimate_intel.builtin_kernel
-)
-
-target_sources(${NEO_STATIC_LIB_NAME} PRIVATE ${RUNTIME_SRCS_BUILT_IN_KERNELS})
-
-hide_subdir(registry)
-hide_subdir(kernels)
 
 add_subdirectories()
-
-if(NOT (TARGET ${BUILTINS_VME_LIB_NAME}))
-  add_subdirectory(registry)
-  if(COMPILE_BUILT_INS)
-    add_subdirectory(kernels)
-  endif()
-endif()
@@ -1,20 +0,0 @@
-/*
- * Copyright (C) 2020-2023 Intel Corporation
- *
- * SPDX-License-Identifier: MIT
- *
- */
-
-#pragma once
-#include "shared/source/built_ins/builtinops/built_in_ops.h"
-
-namespace NEO {
-namespace EBuiltInOps {
-
-using Type = uint32_t;
-
-inline constexpr Type vmeBlockMotionEstimateIntel{maxCoreValue + 1};
-inline constexpr Type vmeBlockAdvancedMotionEstimateCheckIntel{maxCoreValue + 2};
-inline constexpr Type vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel{maxCoreValue + 3};
-} // namespace EBuiltInOps
-} // namespace NEO
@@ -14,7 +14,6 @@
 
 #include "opencl/source/built_ins/aux_translation_builtin.h"
 #include "opencl/source/built_ins/built_ins.inl"
-#include "opencl/source/built_ins/vme_dispatch_builder.h"
 #include "opencl/source/cl_device/cl_device.h"
 #include "opencl/source/execution_environment/cl_execution_environment.h"
 #include "opencl/source/helpers/convert_color.h"
@@ -1,11 +0,0 @@
-#
-# Copyright (C) 2018-2023 Intel Corporation
-#
-# SPDX-License-Identifier: MIT
-#
-
-add_custom_target(builtins_vme_sources)
-set_target_properties(builtins_vme_sources PROPERTIES FOLDER "${OPENCL_RUNTIME_PROJECTS_FOLDER}/${OPENCL_BUILTINS_PROJECTS_FOLDER}")
-set(BUILTINS_OUTDIR_WITH_ARCH "${TargetDir}/built_ins/${NEO_ARCH}")
-add_dependencies(${BUILTINS_BINARIES_BINDFUL_LIB_NAME} builtins_vme_sources)
-add_subdirectories()
@@ -1,460 +0,0 @@
-/*
- * Copyright (C) 2018-2021 Intel Corporation
- *
- * SPDX-License-Identifier: MIT
- *
- */
-
-R"===(
-__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
-block_advanced_motion_estimate_bidirectional_check_intel(
-    sampler_t accelerator,
-    __read_only image2d_t srcImg,
-    __read_only image2d_t refImg,
-    __read_only image2d_t src_check_image,
-    __read_only image2d_t ref0_check_image,
-    __read_only image2d_t ref1_check_image,
-    uint flags,
-    uint search_cost_penalty,
-    uint search_cost_precision,
-    short2 count_global,
-    uchar bidir_weight,
-    __global short2 *count_motion_vector_buffer,
-    __global short2 *prediction_motion_vector_buffer,
-    __global char *skip_input_mode_buffer,
-    __global short2 *skip_motion_vector_buffer,
-    __global short2 *search_motion_vector_buffer,
-    __global char *intra_search_predictor_modes,
-    __global ushort *search_residuals,
-    __global ushort *skip_residuals,
-    __global ushort *intra_residuals,
-    __read_only image2d_t intraSrcImg,
-    int height,
-    int width,
-    int stride) {
-  __local uint dstSearch[64]; // 8 GRFs
-  __local uint dstSkipIntra[32 + 24]; // 7 GRFs (4 for inter, 3 for intra)
-
-  // distortion in the 6th GRF
-  __local ushort *distSearch = (__local ushort *)&dstSearch[8 * 5];
-
-  // Initialize the MV cost table:
-  // MV Cost in U4U4 format:
-  // No cost : 0, 0, 0, 0, 0, 0, 0, 0
-  // Low Cost : 1, 4, 5, 9, 10, 12, 14, 15
-  // Normal Cost: 5, 26, 29, 43, 45, 47, 57, 57
-  // High Cost : 29, 61, 72, 78, 88, 89, 91, 92
-
-  uint2 MVCostTable;
-  if (search_cost_penalty == 1) {
-    MVCostTable.s0 = 0x09050401;
-    MVCostTable.s1 = 0x0F0E0C0A;
-  } else if (search_cost_penalty == 2) {
-    MVCostTable.s0 = 0x2B1D1A05;
-    MVCostTable.s1 = 0x39392F2D;
-  } else if (search_cost_penalty == 3) {
-    MVCostTable.s0 = 0x4E483D1D;
-    MVCostTable.s1 = 0x5C5B5958;
-  } else {
-    MVCostTable.s0 = 0;
-    MVCostTable.s1 = 0;
-  }
-
-  uint MVCostPrecision = ((uint)search_cost_precision) << 16;
-
-  // Frame is divided into rows * columns of MBs.
-  // One h/w thread per WG.
-  // One WG processes "row" MBs - one row per iteration and one MB per row.
-  // Number of WGs (or h/w threads) is number of columns MBs.Each iteration
-  // processes the MB in a row - gid_0 is the MB id in a row and gid_1 is the
-  // row offset.
-
-  int sid_0 = stride * get_group_id(0);
-  int gid_0 = sid_0 / height;
-  int gid_1 = sid_0 % height;
-  for (int sid = sid_0; sid < sid_0 + stride && gid_0 < width && gid_1 < height;
-       sid++, gid_0 = sid / height, gid_1 = sid % height) {
-    int2 srcCoord;
-
-    srcCoord.x = gid_0 * 16 +
-                 get_global_offset(0); // 16 pixels wide MBs (globally scalar)
-    srcCoord.y = gid_1 * 16 +
-                 get_global_offset(1); // 16 pixels tall MBs (globally scalar)
-    uint curMB = gid_0 + gid_1 * width; // current MB id
-    short2 count;
-
-    // If either the search or skip vector counts are per-MB, then we need to
-    // read in
-    // the count motion vector buffer.
-
-    if ((count_global.s0 == -1) | (count_global.s1 == -1)) {
-      count = count_motion_vector_buffer[curMB];
-    }
-
-    // If either the search or skip vector counts are per-frame, we need to use
-    // those.
-
-    if (count_global.s0 >= 0) {
-      count.s0 = count_global.s0;
-    }
-
-    if (count_global.s1 >= 0) {
-      count.s1 = count_global.s1;
-    }
-
-    int countPredMVs = count.x;
-    if (countPredMVs != 0) {
-      uint offset = curMB * 4; // 4 predictors per MB
-      offset += get_local_id(0) % 4; // 16 work-items access 4 MVs for MB
-                                     // one predictor for MB per SIMD channel
-
-      // Reduce predictors from Q-pixel to integer precision.
-      int2 predMV = 0;
-
-      if (get_local_id(0) < countPredMVs) {
-        // one MV per work-item
-        if(prediction_motion_vector_buffer != NULL)
-        {
-          predMV = convert_int2(prediction_motion_vector_buffer[offset]);
-        }
-        // Predictors are input in QP resolution. Convert that to integer
-        // resolution.
-        predMV.x /= 4;
-        predMV.y /= 4;
-        predMV.y &= 0xFFFFFFFE;
-      }
-
-      // Do up to 4 IMEs, get the best MVs and their distortions, and optionally
-      // a FBR of
-      // the best MVs. Finally the results are written out to SLM.
-
-      intel_work_group_vme_mb_multi_query_4(
-          dstSearch, // best search MV and its distortions into SLM
-          countPredMVs, // count of predictor MVs (globally scalar - value range
-                        // 1 to 4)
-          MVCostPrecision, // MV cost precision
-          MVCostTable, // MV cost table
-          srcCoord, // MB 2-D offset (globally scalar)
-          predMV, // predictor MVs (up to 4 distinct MVs for SIMD16 thread)
-          srcImg, // source
-          refImg, // reference
-          accelerator); // vme object
-    }
-
-    int doIntra = ((flags & 0x2) != 0);
-    int intraEdges = 0;
-    if (doIntra) {
-      // Enable all edges by default.
-      intraEdges = 0x3C;
-      // If this is a left-edge MB, then disable left edges.
-      if ((gid_0 == 0) & (get_global_offset(0) == 0)) {
-        intraEdges &= 0x18;
-      }
-
-      // If this is a right edge MB then disable right edges.
-      if (gid_0 == width - 1) {
-        intraEdges &= 0x34;
-      }
-
-      // If this is a top-edge MB, then disable top edges.
-      if ((gid_1 == 0) & (get_global_offset(1) == 0)) {
-        intraEdges &= 0x20;
-      }
-
-      // Set bit6=bit5.
-      intraEdges |= ((intraEdges & 0x20) << 1);
-
-      intraEdges <<= 8;
-    }
-
-    int skip_block_type_8x8 = flags & 0x4;
-
-    int countSkipMVs = count.y;
-    if (countSkipMVs != 0 || doIntra == true) {
-      // one set of skip MV per SIMD channel
-
-      // Do up to 4 skip checks and get the distortions for each of them.
-      // Finally the results are written out to SLM.
-
-      if ((skip_block_type_8x8 == 0) | ((doIntra) & (countSkipMVs == 0))) {
-        // 16x16:
-
-        uint offset = curMB * 4 * 2; // 4 sets of skip check MVs per MB
-        int skipMV = 0;
-        if (get_local_id(0) < countSkipMVs * 2) // need 2 values per MV
-        {
-          offset +=
-              (get_local_id(0)); // 16 work-items access 4 sets of MVs for MB
-          if(skip_motion_vector_buffer != NULL){
-            __global int *skip1_motion_vector_buffer =
-                (__global int *)skip_motion_vector_buffer;
-            skipMV = skip1_motion_vector_buffer[offset]; // one MV per work-item
-          }
-        }
-
-        uchar skipMode = 0;
-        if (get_local_id(0) < countSkipMVs) {
-          if(skip_input_mode_buffer != NULL)
-            skipMode = skip_input_mode_buffer[curMB];
-
-          if (skipMode == 0) {
-            skipMode = 1;
-          }
-          if (skipMode > 3) {
-            skipMode = 3;
-          }
-        }
-
-        intel_work_group_vme_mb_multi_bidir_check_16x16(
-            dstSkipIntra, // distortions into SLM
-            countSkipMVs, // count of skip check MVs (globally scalar - value
-                          // range 1 to 4)
-            doIntra, // compute intra modes
-            intraEdges, // intra edges to use
-            srcCoord, // MB 2-D offset (globally scalar)
-            bidir_weight, // bidirectional weight
-            skipMode, // skip modes
-            skipMV, // skip check MVs (up to 4 distinct sets of skip check MVs
-                    // for SIMD16 thread)
-            src_check_image, // source
-            ref0_check_image, // reference fwd
-            ref1_check_image, // reference bwd
-            intraSrcImg, // intra source
-            accelerator); // vme object
-      } else {
-        // 8x8:
-
-        uint offset =
-            curMB * 4 *
-            8; // 4 sets of skip check MVs, 16 shorts (8 ints) each per MB
-        int2 skipMVs = 0;
-        if (get_local_id(0) < countSkipMVs * 8) // need 8 values per MV
-        {
-          offset +=
-              (get_local_id(0)); // 16 work-items access 4 sets of MVs for MB
-          if(skip_motion_vector_buffer != NULL){
-            __global int *skip1_motion_vector_buffer =
-                (__global int *)(skip_motion_vector_buffer);
-            skipMVs.x = skip1_motion_vector_buffer[offset]; // four component MVs
-                                                            // per work-item
-            skipMVs.y = skip1_motion_vector_buffer[offset + 16];}
-        }
-
-        uchar skipModes = 0;
-        if (get_local_id(0) < countSkipMVs) {
-          if(skip_input_mode_buffer != NULL)
-            skipModes = skip_input_mode_buffer[curMB];
-        }
-
-        intel_work_group_vme_mb_multi_bidir_check_8x8(
-            dstSkipIntra, // distortions into SLM
-            countSkipMVs, // count of skip check MVs per MB (globally scalar -
-                          // value range 1 to 4)
-            doIntra, // compute intra modes
-            intraEdges, // intra edges to use
-            srcCoord, // MB 2-D offset (globally scalar)
-            bidir_weight, // bidirectional weight
-            skipModes, // skip modes
-            skipMVs, // skip check MVs (up to 4 distinct sets of skip check MVs
-                     // for SIMD16 thread)
-            src_check_image, // source
-            ref0_check_image, // reference fwd
-            ref1_check_image, // reference bwd
-            intraSrcImg, // intra source
-            accelerator); // vme object
-      }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // Write Out motion estimation result:
-    // Result format
-    // Hierarchical row-major layout
-    // i.e. row-major of blocks MVs in MBs, and row-major of 4 sets of
-    // MVs/distortion in blocks
-    if (countPredMVs != 0) {
-      // 4x4
-      if (intel_get_accelerator_mb_block_type(accelerator) == 0x2) {
-        int index = (gid_0 * 16 + get_local_id(0)) + (gid_1 * 16 * width);
-
-        // 1. 16 work-items enabled.
-        // 2. Work-items gather fwd MVs in strided dword locations 0, 2, .., 30
-        // (interleaved
-        // fwd/bdw MVs) with constant offset 8 (control data size) from SLM
-        // into contiguous
-        // short2 locations 0, 1, .., 15 of global buffer
-        // search_motion_vector_buffer with
-        // offset index.
-        // 3. Work-items gather contiguous ushort locations 0, 1, .., 15 from
-        // distSearch into
-        // contiguous ushort locations 0, 1, .., 15 of search_residuals with
-        // offset index.
-
-        short2 val = as_short2(dstSearch[8 + get_local_id(0) * 2]);
-        if(search_motion_vector_buffer != NULL)
-          search_motion_vector_buffer[index] = val;
-        if (search_residuals != NULL)
-        {
-          search_residuals[index] = distSearch[get_local_id(0)];
-        }
-      }
-
-      // 8x8
-      else if (intel_get_accelerator_mb_block_type(accelerator) == 0x1) {
-        // Only 1st 4 work-item are needed.
-        if (get_local_id(0) < 4) {
-          int index = (gid_0 * 4 + get_local_id(0)) + (gid_1 * 4 * width);
-
-          // 1. 4 work-items enabled.
-          // 2. Work-items gather fw MVs in strided dword locations 0, 8, 16, 24
-          // (interleaved
-          // fwd/bdw MVs) with constant offset 8 from SLM into contiguous
-          // short2 locations
-          // 0, 1, .., 15 of global buffer search_motion_vector_buffer with
-          // offset index.
-          // 3. Work-items gather strided ushort locations 0, 4, 8, 12 from
-          // distSearch into
-          // contiguous ushort locations 0, 1, .., 15 of search_residuals
-          // with offset index.
-
-          short2 val = as_short2(dstSearch[8 + get_local_id(0) * 4 * 2]);
-          if(search_motion_vector_buffer != NULL)
-            search_motion_vector_buffer[index] = val;
-
-          if (search_residuals != NULL)
-          {
-            search_residuals[index] = distSearch[get_local_id(0) * 4];
-          }
-        }
-      }
-
-      // 16x16
-      else if (intel_get_accelerator_mb_block_type(accelerator) == 0x0) {
-        // One 1st work is needed.
-        if (get_local_id(0) == 0) {
-          int index = gid_0 + gid_1 * width;
-
-          // 1. 1 work-item enabled.
-          // 2. Work-item gathers fwd MV in dword location 0 with constant
-          // offset 8 from
-          // SLM into short2 locations 0 of global buffer
-          // search_motion_vector_buffer.
-          // 3. Work-item gathers ushort location 0 from distSearch into ushort
-          // location 0 of search_residuals with offset index.
-
-          short2 val = as_short2(dstSearch[8]);
-          if(search_motion_vector_buffer != NULL)
-            search_motion_vector_buffer[index] = val;
-          if (search_residuals != NULL)
-          {
-            search_residuals[index] = distSearch[0];
-          }
-        }
-      }
-    }
-
-    // Write out motion skip check result:
-    // Result format
-    // Hierarchical row-major layout
-    // i.e. row-major of blocks in MBs, and row-major of 8 sets of
-    // distortions in blocks
-    if (countSkipMVs != 0) {
-      if (skip_block_type_8x8 == false) {
-        // Copy out 4 (1 component) sets of distortion values.
-
-        int index = (gid_0 * 4) + (get_local_id(0)) + (gid_1 * 4 * width);
-
-        if (get_local_id(0) < countSkipMVs) {
-          // 1. Up to 4 work-items are enabled.
-          // 2. The work-item gathers distSkip locations 0, 16*1, .., 16*7 and
-          // copies them to contiguous skip_residual locations 0, 1, 2, ..,
-          // 7.
-          __local ushort *distSkip = (__local ushort *)&dstSkipIntra[0];
-          if(skip_residuals != NULL)
-            skip_residuals[index] = distSkip[get_local_id(0) * 16];
-        }
-      } else {
-        // Copy out 4 (4 component) sets of distortion values.
-        int index =
-            (gid_0 * 4 * 4) + (get_local_id(0)) + (gid_1 * 4 * 4 * width);
-
-        if (get_local_id(0) < countSkipMVs * 4) {
-          // 1. Up to 16 work-items are enabled.
-          // 2. The work-item gathers distSkip locations 0, 4*1, .., 4*15 and
-          // copies them to contiguous skip_residual locations 0, 1, 2, ..,
-          // 15.
-
-          __local ushort *distSkip = (__local ushort *)&dstSkipIntra[0];
-          if(skip_residuals != NULL)
-            skip_residuals[index] = distSkip[get_local_id(0) * 4];
-        }
-      }
-    }
-
-    // Write out intra search result:
-    if (doIntra) {
-      // Write out the 4x4 intra modes
-      if (get_local_id(0) < 8) {
-        __local char *dstIntra_4x4 =
-            (__local char *)(&dstSkipIntra[32 + 16 + 4]);
-        char value = dstIntra_4x4[get_local_id(0)];
-        char value_low = (value)&0xf;
-        char value_high = (value >> 4) & 0xf;
-
-        int index_low =
-            (gid_0 * 22) + (get_local_id(0) * 2) + (gid_1 * 22 * width);
-
-        int index_high =
-            (gid_0 * 22) + (get_local_id(0) * 2) + 1 + (gid_1 * 22 * width);
-
-        if(intra_search_predictor_modes != NULL) {
-          intra_search_predictor_modes[index_low + 5] = value_low;
-          intra_search_predictor_modes[index_high + 5] = value_high;
-        }
-      }
-
-      // Write out the 8x8 intra modes
-      if (get_local_id(0) < 4) {
-        __local char *dstIntra_8x8 =
-            (__local char *)(&dstSkipIntra[32 + 8 + 4]);
-        char value = dstIntra_8x8[get_local_id(0) * 2];
-        char value_low = (value)&0xf;
-        int index = (gid_0 * 22) + (get_local_id(0)) + (gid_1 * 22 * width);
-        if(intra_search_predictor_modes != NULL)
-          intra_search_predictor_modes[index + 1] = value_low;
-      }
-
-      // Write out the 16x16 intra modes
-      if (get_local_id(0) < 1) {
-        __local char *dstIntra_16x16 =
-            (__local char *)(&dstSkipIntra[32 + 0 + 4]);
-        char value = dstIntra_16x16[0];
-        char value_low = (value)&0xf;
-        int index = (gid_0 * 22) + (gid_1 * 22 * width);
-        if(intra_search_predictor_modes != NULL)
-          intra_search_predictor_modes[index] = value_low;
-      }
-
-      // Get the intra residuals.
-      if (intra_residuals != NULL)
-      {
-        int index = (gid_0 * 4) + (gid_1 * 4 * width);
-
-        if (get_local_id(0) < 1) {
-          __local ushort *distIntra_4x4 =
-              (__local ushort *)(&dstSkipIntra[32 + 16 + 3]);
-          __local ushort *distIntra_8x8 =
-              (__local ushort *)(&dstSkipIntra[32 + 8 + 3]);
-          __local ushort *distIntra_16x16 =
-              (__local ushort *)(&dstSkipIntra[32 + 0 + 3]);
-
-          intra_residuals[index + 2] = distIntra_4x4[0];
-          intra_residuals[index + 1] = distIntra_8x8[0];
-          intra_residuals[index + 0] = distIntra_16x16[0];
-        }
-      }
-    }
-  }
-}
-
-)==="
@@ -1,26 +0,0 @@
-/*
- * Copyright (C) 2018-2021 Intel Corporation
- *
- * SPDX-License-Identifier: MIT
- *
- */
-
-R"===(
-__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
-block_advanced_motion_estimate_bidirectional_check_intel(
-    sampler_t accelerator, __read_only image2d_t srcImg,
-    __read_only image2d_t refImg, __read_only image2d_t src_check_image,
-    __read_only image2d_t ref0_check_image,
-    __read_only image2d_t ref1_check_image, uint flags,
-    uint search_cost_penalty, uint search_cost_precision, short2 count_global,
-    uchar bidir_weight, __global short2 *count_motion_vector_buffer,
-    __global short2 *prediction_motion_vector_buffer,
-    __global char *skip_input_mode_buffer,
-    __global short2 *skip_motion_vector_buffer,
-    __global short2 *search_motion_vector_buffer,
-    __global char *intra_search_predictor_modes,
-    __global ushort *search_residuals, __global ushort *skip_residuals,
-    __global ushort *intra_residuals) {
-}
-
-)==="
@@ -1,379 +0,0 @@
-/*
- * Copyright (C) 2018-2021 Intel Corporation
- *
- * SPDX-License-Identifier: MIT
- *
- */
-
-R"===(
-__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
-block_advanced_motion_estimate_check_intel(
-    sampler_t accelerator, __read_only image2d_t srcImg,
-    __read_only image2d_t refImg, uint flags, uint skip_block_type,
-    uint search_cost_penalty, uint search_cost_precision,
-    __global short2 *count_motion_vector_buffer,
-    __global short2 *predictors_buffer,
-    __global short2 *skip_motion_vector_buffer,
-    __global short2 *motion_vector_buffer,
-    __global char *intra_search_predictor_modes, __global ushort *residuals,
-    __global ushort *skip_residuals, __global ushort *intra_residuals,
-    __read_only image2d_t intraSrcImg, int height, int width, int stride) {
-  __local uint dstSearch[64]; // 8 GRFs
-  __local uint dstSkipIntra[64 + 24]; // 11 GRFs (8 for inter, 3 for intra)
-
-  __local ushort *distSearch =
-      (__local ushort *)&dstSearch[8 * 5]; // distortion in the 6th GRF
-
-  // Initialize the MV cost table:
-  // MV Cost in U4U4 format:
-  // No cost : 0, 0, 0, 0, 0, 0, 0, 0
-  // Low Cost : 1, 4, 5, 9, 10, 12, 14, 15
-  // Normal Cost: 5, 26, 29, 43, 45, 47, 57, 57
-  // High Cost : 29, 61, 72, 78, 88, 89, 91, 92
-
-  uint2 MVCostTable;
-  if (search_cost_penalty == 1) {
-    MVCostTable.s0 = 0x09050401;
-    MVCostTable.s1 = 0x0F0E0C0A;
-  } else if (search_cost_penalty == 2) {
-    MVCostTable.s0 = 0x2B1D1A05;
-    MVCostTable.s1 = 0x39392F2D;
-  } else if (search_cost_penalty == 3) {
-    MVCostTable.s0 = 0x4E483D1D;
-    MVCostTable.s1 = 0x5C5B5958;
-  } else {
-    MVCostTable.s0 = 0;
-    MVCostTable.s1 = 0;
-  }
-
-  uint MVCostPrecision = ((uint)search_cost_precision) << 16;
-  // Frame is divided into rows * columns of MBs.
-  // One h/w thread per WG.
-  // One WG processes 'row' MBs - one row per iteration and one MB per row.
-  // Number of WGs (or h/w threads) is number of columns MBs
-  // Each iteration processes the MB in a row - gid_0 is the MB id in a row and
-  // gid_1 is the row offset.
-
-  int sid_0 = stride * get_group_id(0);
-  int gid_0 = sid_0 / height;
-  int gid_1 = sid_0 % height;
-  for (int sid = sid_0; sid < sid_0 + stride && gid_0 < width && gid_1 < height;
-       sid++, gid_0 = sid / height, gid_1 = sid % height) {
-    int2 srcCoord;
-
-    srcCoord.x = gid_0 * 16 +
-                 get_global_offset(0); // 16 pixels wide MBs (globally scalar)
-    srcCoord.y = gid_1 * 16 +
-                 get_global_offset(1); // 16 pixels tall MBs (globally scalar)
-
-    uint curMB = gid_0 + gid_1 * width; // current MB id
-    short2 count = 0;
-    if(count_motion_vector_buffer != NULL)
-      count = count_motion_vector_buffer[curMB];
-
-    int countPredMVs = count.x;
-    if (countPredMVs != 0) {
-      uint offset = curMB * 8; // 8 predictors per MB
-      offset += get_local_id(0) % 8; // 16 work-items access 8 MVs for MB
-                                     // one predictor for MB per SIMD channel
-
-      // Reduce predictors from Q-pixel to integer precision.
-
-      int2 predMV = 0;
-      if (get_local_id(0) < countPredMVs) {
-        if(predictors_buffer != NULL){
-          predMV =
-              convert_int2(predictors_buffer[offset]); // one MV per work-item
-          predMV.x /= 4;
-          predMV.y /= 4;
-          predMV.y &= 0xFFFE;}
-      }
-
-      // Do up to 8 IMEs, get the best MVs and their distortions, and optionally
-      // a FBR of the best MVs.
-      // Finally the results are written out to SLM.
-
-      intel_work_group_vme_mb_multi_query_8(
-          dstSearch, // best search MV and its distortions into SLM
-          countPredMVs, // count of predictor MVs (globally scalar - value range
-                        // 1 to 8)
-          MVCostPrecision, // MV cost precision
-          MVCostTable, // MV cost table
-          srcCoord, // MB 2-D offset (globally scalar)
-          predMV, // predictor MVs (up to 8 distinct MVs for SIMD16 thread)
-          srcImg, // source
-          refImg, // reference
-          accelerator); // vme object
-    }
-
-    int doIntra = (flags & 0x2) != 0;
-    int intraEdges = 0;
-    if (doIntra) {
-      // Enable all edges by default.
-      intraEdges = 0x3C;
-      // If this is a left-edge MB, then disable left edges.
-      if ((gid_0 == 0) & (get_global_offset(0) == 0)) {
-        intraEdges &= 0x18;
-      }
-      // If this is a right edge MB then disable right edges.
-      if (gid_0 == width - 1) {
-        intraEdges &= 0x34;
-      }
-      // If this is a top-edge MB, then disable top edges.
-      if ((gid_1 == 0) & (get_global_offset(1) == 0)) {
-        intraEdges &= 0x20;
-      }
-      // Set bit6=bit5.
-      intraEdges |= ((intraEdges & 0x20) << 1);
-      intraEdges <<= 8;
-    }
-    int countSkipMVs = count.y;
-    if (countSkipMVs != 0 || doIntra == true) {
-      uint offset = curMB * 8; // 8 sets of skip check MVs per MB
-      offset +=
-          (get_local_id(0) % 8); // 16 work-items access 8 sets of MVs for MB
-                                 // one set of skip MV per SIMD channel
-
-      // Do up to 8 skip checks and get the distortions for each of them.
-      // Finally the results are written out to SLM.
-
-      if ((skip_block_type == 0x0) | ((doIntra) & (countSkipMVs == 0))) {
-        int skipMVs = 0;
-        if (get_local_id(0) < countSkipMVs) {
-          if(skip_motion_vector_buffer != NULL ) {
-            __global int *skip1_motion_vector_buffer =
-                (__global int *)skip_motion_vector_buffer;
-            skipMVs = skip1_motion_vector_buffer[offset]; } // one packed MV for one
-                                                            // work-item
-        }
-        intel_work_group_vme_mb_multi_check_16x16(
-            dstSkipIntra, // distortions into SLM
-            countSkipMVs, // count of skip check MVs (value range 0 to 8)
-            doIntra, // compute intra modes
-            intraEdges, // intra edges to use
-            srcCoord, // MB 2-D offset (globally scalar)
-            skipMVs, // skip check MVs (up to 8 sets of skip check MVs for
-                     // SIMD16 thread)
-            srcImg, // source
-            refImg, // reference
-            intraSrcImg, // intra source
-            accelerator);
-      }
-
-      if ((skip_block_type == 0x1) & (countSkipMVs > 0)) {
-        int4 skipMVs = 0;
-        if (get_local_id(0) < countSkipMVs) {
-          if(skip_motion_vector_buffer != NULL){
-            __global int4 *skip4_motion_vector_buffer =
-                (__global int4 *)(skip_motion_vector_buffer);
-            skipMVs = skip4_motion_vector_buffer[offset]; } // four component MVs
-                                                            // per work-item
-        }
-        intel_work_group_vme_mb_multi_check_8x8(
-            dstSkipIntra, // distortions into SLM
-            countSkipMVs, // count of skip check MVs per MB (value range 0 to 8)
-            doIntra, // compute intra modes
-            intraEdges, // intra edges to use
-            srcCoord, // MB 2-D offset (globally scalar)
-            skipMVs, // skip check MVs (up to 8 ets of skip check MVs for SIMD16
-                     // thread)
-            srcImg, // source
-            refImg, // reference
-            intraSrcImg, // intra source
-            accelerator);
-      }
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // Write Out motion estimation result:
-    // Result format
-    // Hierarchical row-major layout
-    // i.e. row-major of blocks MVs in MBs, and row-major of 8 sets of
-    // MVs/distortion in blocks
-
-    if (countPredMVs != 0) {
-      // 4x4
-      if (intel_get_accelerator_mb_block_type(accelerator) == 0x2) {
-        int index = (gid_0 * 16 + get_local_id(0)) + (gid_1 * 16 * width);
-
-        // 1. 16 work-items enabled.
-        // 2. Work-items gather fwd MVs in strided dword locations 0, 2, .., 30
-        // (interleaved
-        // fwd/bdw MVs) with constant offset 8 (control data size) from SLM
-        // into contiguous
-        // short2 locations 0, 1, .., 15 of global buffer
-        // search_motion_vector_buffer with
-        // offset index.
-        // 3. Work-items gather contiguous ushort locations 0, 1, .., 15 from
-        // distSearch into
-        // contiguous ushort locations 0, 1, .., 15 of search_residuals with
-        // offset index.
-
-        short2 val = as_short2(dstSearch[8 + get_local_id(0) * 2]);
-        if(motion_vector_buffer != NULL)
-          motion_vector_buffer[index] = val;
-
-        if (residuals != NULL)
-        {
-          residuals[index] = distSearch[get_local_id(0)];
-        }
-      }
-
-      // 8x8
-      else if (intel_get_accelerator_mb_block_type(accelerator) == 0x1) {
-        // Only 1st 4 work-item are needed.
-        if (get_local_id(0) < 4) {
-          int index = (gid_0 * 4 + get_local_id(0)) + (gid_1 * 4 * width);
-
-          // 1. 4 work-items enabled.
-          // 2. Work-items gather fw MVs in strided dword locations 0, 8, 16, 24
-          // (interleaved
-          // fwd/bdw MVs) with constant offset 8 from SLM into contiguous
-          // short2 locations
-          // 0, 1, .., 15 of global buffer search_motion_vector_buffer with
-          // offset index.
-          // 3. Work-items gather strided ushort locations 0, 4, 8, 12 from
-          // distSearch into
-          // contiguous ushort locations 0, 1, .., 15 of search_residuals
-          // with offset index.
-
-          short2 val = as_short2(dstSearch[8 + get_local_id(0) * 4 * 2]);
-          if(motion_vector_buffer != NULL)
-            motion_vector_buffer[index] = val;
-
-          if (residuals != NULL)
-          {
-            residuals[index] = distSearch[get_local_id(0) * 4];
-          }
-        }
-      }
-
-      // 16x16
-      else if (intel_get_accelerator_mb_block_type(accelerator) == 0x0) {
-        // One 1st work is needed.
-        if (get_local_id(0) == 0) {
-          int index = gid_0 + gid_1 * width;
-
-          // 1. 1 work-item enabled.
-          // 2. Work-item gathers fwd MV in dword location 0 with constant
-          // offset 8 from
-          // SLM into short2 locations 0 of global buffer
-          // search_motion_vector_buffer.
-          // 3. Work-item gathers ushort location 0 from distSearch into ushort
-          // location 0 of search_residuals with offset index.
-
-          short2 val = as_short2(dstSearch[8]);
-          if(motion_vector_buffer != NULL)
-            motion_vector_buffer[index] = val;
-
-          if (residuals != NULL)
-          {
-            residuals[index] = distSearch[0];
-          }
-        }
-      }
-    }
-
-    // Write out motion skip check result:
-    // Result format
-    // Hierarchical row-major layout
-    // i.e. row-major of blocks in MBs, and row-major of 8 sets of
-    // distortions in blocks
-
-    if (countSkipMVs != 0) {
-      if (skip_block_type == 0x0) {
-        // Copy out 8 (1 component) sets of distortion values.
-
-        int index = (gid_0 * 8) + (get_local_id(0)) + (gid_1 * 8 * width);
-
-        if (get_local_id(0) < countSkipMVs) {
-          __local ushort *distSkip = (__local ushort *)&dstSkipIntra[0];
-
-          // 1. Up to 8 work-items are enabled.
-          // 2. The work-item gathers distSkip locations 0, 16*1, .., 16*7 and
-          // copies them to contiguous skip_residual locations 0, 1, 2, ..,
-          // 7.
-          if(skip_residuals != NULL)
-            skip_residuals[index] = distSkip[get_local_id(0) * 16];
-        }
-      } else {
-        // Copy out 8 (4 component) sets of distortion values.
-
-        int index =
-            (gid_0 * 8 * 4) + (get_local_id(0)) + (gid_1 * 8 * 4 * width);
-
-        __local ushort *distSkip = (__local ushort *)&dstSkipIntra[0];
-
-        if (get_local_id(0) < countSkipMVs * 4) {
-          // 1. Up to 16 work-items are enabled.
-          // 2. The work-item gathers distSkip locations 0, 4*1, .., 4*31 and
-          // copies them to contiguous skip_residual locations 0, 1, 2, ..,
-          // 31.
-          if(skip_residuals != NULL){
-            skip_residuals[index] = distSkip[get_local_id(0) * 4];
-            skip_residuals[index + 16] = distSkip[(get_local_id(0) + 16) * 4];}
-        }
-      }
-    }
-
-    // Write out intra search result:
-
-    if (doIntra) {
-
-      int index_low =
-          (gid_0 * 22) + (get_local_id(0) * 2) + (gid_1 * 22 * width);
-      int index_high =
-          (gid_0 * 22) + (get_local_id(0) * 2) + 1 + (gid_1 * 22 * width);
-
-      // Write out the 4x4 intra modes
-      if (get_local_id(0) < 8) {
-        __local char *dstIntra_4x4 =
-            (__local char *)(&dstSkipIntra[64 + 16 + 4]);
-        char value = dstIntra_4x4[get_local_id(0)];
-        char value_low = (value)&0xf;
-        char value_high = (value >> 4) & 0xf;
-        if(intra_search_predictor_modes != NULL){
-          intra_search_predictor_modes[index_low + 5] = value_low;
-          intra_search_predictor_modes[index_high + 5] = value_high;}
-      }
-
-      // Write out the 8x8 intra modes
-      if (get_local_id(0) < 4) {
-        __local char *dstIntra_8x8 =
-            (__local char *)(&dstSkipIntra[64 + 8 + 4]);
-        char value = dstIntra_8x8[get_local_id(0) * 2];
-        char value_low = (value)&0xf;
-        int index = (gid_0 * 22) + (get_local_id(0)) + (gid_1 * 22 * width);
-        if(intra_search_predictor_modes != NULL)
-          intra_search_predictor_modes[index + 1] = value_low;
-      }
-
-      // Write out the 16x16 intra modes
-      if (get_local_id(0) < 1) {
-        __local char *dstIntra_16x16 =
-            (__local char *)(&dstSkipIntra[64 + 0 + 4]);
-        char value = dstIntra_16x16[get_local_id(0)];
-        char value_low = (value)&0xf;
-        if(intra_search_predictor_modes != NULL)
-          intra_search_predictor_modes[index_low] = value_low;
-      }
-
-      // Get the intra residuals.
-      if (intra_residuals != NULL)
-      {
-        int index = (gid_0 * 4) + (gid_1 * 4 * width);
-
-        if (get_local_id(0) < 1) {
-          __local ushort *distIntra_4x4 = (__local ushort *)(&dstSkipIntra[64 + 16 + 3]);
-          __local ushort *distIntra_8x8 = (__local ushort *)(&dstSkipIntra[64 + 8 + 3]);
-          __local ushort *distIntra_16x16 = (__local ushort *)(&dstSkipIntra[64 + 0 + 3]);
-          intra_residuals[index + 2] = distIntra_4x4[0];
-          intra_residuals[index + 1] = distIntra_8x8[0];
-          intra_residuals[index + 0] = distIntra_16x16[0];
-        }
-      }
-    }
-  }
-}
-)==="
@@ -1,21 +0,0 @@
-/*
- * Copyright (C) 2018-2021 Intel Corporation
- *
- * SPDX-License-Identifier: MIT
- *
- */
-
-R"===(
-__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
-block_advanced_motion_estimate_check_intel(
-    sampler_t accelerator, __read_only image2d_t srcImg,
-    __read_only image2d_t refImg, uint flags, uint skip_block_type,
-    uint search_cost_penalty, uint search_cost_precision,
-    __global short2 *count_motion_vector_buffer,
-    __global short2 *predictors_buffer,
-    __global short2 *skip_motion_vector_buffer,
-    __global short2 *motion_vector_buffer,
-    __global char *intra_search_predictor_modes, __global ushort *residuals,
-    __global ushort *skip_residuals, __global ushort *intra_residuals) {
-}
-)==="
@@ -1,103 +0,0 @@
-/*
- * Copyright (C) 2018-2021 Intel Corporation
- *
- * SPDX-License-Identifier: MIT
- *
- */
-
-R"===(
-__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
-block_motion_estimate_intel(sampler_t accelerator, __read_only image2d_t srcImg,
-                            __read_only image2d_t refImg,
-                            __global short2 *prediction_motion_vector_buffer,
-                            __global short2 *motion_vector_buffer,
-                            __global ushort *residuals, int height, int width,
-                            int stride) {
-  __local uint dst[64];
-  __local ushort *dist = (__local ushort *)&dst[8 * 5];
-
-  int sid_0 = stride * get_group_id(0);
-  int gid_0 = sid_0 / height;
-  int gid_1 = sid_0 % height;
-  for (int sid = sid_0; sid < sid_0 + stride && gid_0 < width && gid_1 < height;
-       sid++, gid_0 = sid / height, gid_1 = sid % height) {
-    int2 srcCoord = 0;
-    int2 refCoord = 0;
-
-    srcCoord.x = gid_0 * 16 + get_global_offset(0);
-    srcCoord.y = gid_1 * 16 + get_global_offset(1);
-
-    short2 predMV = 0;
-
-#ifndef HW_NULL_CHECK
-    if (prediction_motion_vector_buffer != NULL)
-#endif
-    {
-      predMV = prediction_motion_vector_buffer[gid_0 + gid_1 * width];
-      refCoord.x = predMV.x / 4;
-      refCoord.y = predMV.y / 4;
-      refCoord.y = refCoord.y & 0xFFFE;
-    }
-
-    {
-      intel_work_group_vme_mb_query(dst, srcCoord, refCoord, srcImg, refImg,
-                                    accelerator);
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // Write Out Result
-
-    // 4x4
-    if (intel_get_accelerator_mb_block_type(accelerator) == 0x2) {
-      int x = get_local_id(0) % 4;
-      int y = get_local_id(0) / 4;
-      int index = (gid_0 * 4 + x) + (gid_1 * 4 + y) * width * 4;
-
-      short2 val = as_short2(dst[8 + (y * 4 + x) * 2]);
-      motion_vector_buffer[index] = val;
-
-#ifndef HW_NULL_CHECK
-      if (residuals != NULL)
-#endif
-      {
-        residuals[index] = dist[y * 4 + x];
-      }
-    }
-
-    // 8x8
-    if (intel_get_accelerator_mb_block_type(accelerator) == 0x1) {
-      if (get_local_id(0) < 4) {
-        int x = get_local_id(0) % 2;
-        int y = get_local_id(0) / 2;
-        int index = (gid_0 * 2 + x) + (gid_1 * 2 + y) * width * 2;
-        short2 val = as_short2(dst[8 + (y * 2 + x) * 8]);
-        motion_vector_buffer[index] = val;
-
-#ifndef HW_NULL_CHECK
-        if (residuals != NULL)
-#endif
-        {
-          residuals[index] = dist[(y * 2 + x) * 4];
-        }
-      }
-    }
-
-    // 16x16
-    if (intel_get_accelerator_mb_block_type(accelerator) == 0x0) {
-      if (get_local_id(0) == 0) {
-        int index = gid_0 + gid_1 * width;
-
-        short2 val = as_short2(dst[8]);
-        motion_vector_buffer[index] = val;
-
-#ifndef HW_NULL_CHECK
-        if (residuals != NULL)
-#endif
-        {
-          residuals[index] = dist[0];
-        }
-      }
-    }
-  }
-}
-)==="
@@ -1,16 +0,0 @@
-/*
- * Copyright (C) 2018-2021 Intel Corporation
- *
- * SPDX-License-Identifier: MIT
- *
- */
-
-R"===(
-__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
-block_motion_estimate_intel(sampler_t accelerator, __read_only image2d_t srcImg,
-                            __read_only image2d_t refImg,
-                            __global short2 *prediction_motion_vector_buffer,
-                            __global short2 *motion_vector_buffer,
-                            __global ushort *residuals) {
-}
-)==="
@@ -1,16 +0,0 @@
-#
-# Copyright (C) 2018-2021 Intel Corporation
-#
-# SPDX-License-Identifier: MIT
-#
-
-add_library(${BUILTINS_VME_LIB_NAME} OBJECT EXCLUDE_FROM_ALL
-            CMakeLists.txt
-            register_ext_vme_source.cpp
-)
-set_target_properties(${BUILTINS_VME_LIB_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-set_target_properties(${BUILTINS_VME_LIB_NAME} PROPERTIES FOLDER "${OPENCL_RUNTIME_PROJECTS_FOLDER}/${OPENCL_BUILTINS_PROJECTS_FOLDER}")
-
-target_include_directories(${BUILTINS_VME_LIB_NAME} PRIVATE
-                           ${KHRONOS_HEADERS_DIR}
-)
@@ -1,44 +0,0 @@
-/*
- * Copyright (C) 2018-2023 Intel Corporation
- *
- * SPDX-License-Identifier: MIT
- *
- */
-
-#include "shared/source/built_ins/built_ins.h"
-#include "shared/source/built_ins/registry/built_ins_registry.h"
-
-#include "opencl/source/built_ins/built_in_ops_vme.h"
-
-#include <string>
-
-namespace NEO {
-
-static RegisterEmbeddedResource registerVmeSrc(
-    createBuiltinResourceName(
-        EBuiltInOps::vmeBlockMotionEstimateIntel,
-        BuiltinCode::getExtension(BuiltinCode::ECodeType::source))
-        .c_str(),
-    std::string(
-#include "opencl/source/built_ins/kernels/vme_block_motion_estimate_intel.builtin_kernel"
-        ));
-
-static RegisterEmbeddedResource registerVmeAdvancedSrc(
-    createBuiltinResourceName(
-        EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel,
-        BuiltinCode::getExtension(BuiltinCode::ECodeType::source))
-        .c_str(),
-    std::string(
-#include "opencl/source/built_ins/kernels/vme_block_advanced_motion_estimate_check_intel.builtin_kernel"
-        ));
-
-static RegisterEmbeddedResource registerVmeAdvancedBidirectionalSrc(
-    createBuiltinResourceName(
-        EBuiltInOps::vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel,
-        BuiltinCode::getExtension(BuiltinCode::ECodeType::source))
-        .c_str(),
-    std::string(
-#include "opencl/source/built_ins/kernels/vme_block_advanced_motion_estimate_bidirectional_check_intel.builtin_kernel"
-        ));
-
-} // namespace NEO
@@ -1,126 +0,0 @@
-/*
- * Copyright (C) 2020-2023 Intel Corporation
- *
- * SPDX-License-Identifier: MIT
- *
- */
-
-#include "opencl/source/built_ins/vme_builtin.h"
-
-#include "shared/source/built_ins/built_ins.h"
-#include "shared/source/device/device.h"
-
-#include "opencl/source/built_ins/built_in_ops_vme.h"
-#include "opencl/source/built_ins/builtins_dispatch_builder.h"
-#include "opencl/source/built_ins/populate_built_ins.inl"
-#include "opencl/source/built_ins/vme_dispatch_builder.h"
-#include "opencl/source/execution_environment/cl_execution_environment.h"
-#include "opencl/source/program/program.h"
-
-#include <sstream>
-
-namespace NEO {
-
-static const char *blockMotionEstimateIntelSrc = {
-#include "kernels/vme_block_motion_estimate_intel_frontend.builtin_kernel"
-};
-
-static const char *blockAdvancedMotionEstimateCheckIntelSrc = {
-#include "kernels/vme_block_advanced_motion_estimate_check_intel_frontend.builtin_kernel"
-};
-
-static const char *blockAdvancedMotionEstimateBidirectionalCheckIntelSrc = {
-#include "kernels/vme_block_advanced_motion_estimate_bidirectional_check_intel_frontend.builtin_kernel"
-};
-
-static const std::tuple<const char *, const char *> mediaBuiltIns[] = {
-    {"block_motion_estimate_intel", blockMotionEstimateIntelSrc},
-    {"block_advanced_motion_estimate_check_intel", blockAdvancedMotionEstimateCheckIntelSrc},
-    {"block_advanced_motion_estimate_bidirectional_check_intel", blockAdvancedMotionEstimateBidirectionalCheckIntelSrc}};
-
-// Unlike other built-ins media kernels are not stored in BuiltIns object.
-// Pointer to program with built in kernels is returned to the user through API
-// call and user is responsible for releasing it by calling clReleaseProgram.
-Program *Vme::createBuiltInProgram(
-    Context &context,
-    const ClDeviceVector &deviceVector,
-    const char *kernelNames,
-    int &errcodeRet) {
-    std::string programSourceStr = "";
-    std::istringstream ss(kernelNames);
-    std::string currentKernelName;
-
-    while (std::getline(ss, currentKernelName, ';')) {
-        bool found = false;
-        for (auto &builtInTuple : mediaBuiltIns) {
-            if (currentKernelName == std::get<0>(builtInTuple)) {
-                programSourceStr += std::get<1>(builtInTuple);
-                found = true;
-                break;
-            }
-        }
-        if (!found) {
-            errcodeRet = CL_INVALID_VALUE;
-            return nullptr;
-        }
-    }
-    if (programSourceStr.empty() == true) {
-        errcodeRet = CL_INVALID_VALUE;
-        return nullptr;
-    }
-
-    Program *pBuiltInProgram = nullptr;
-    pBuiltInProgram = Program::createBuiltInFromSource(programSourceStr.c_str(), &context, deviceVector, nullptr);
-
-    auto &device = *deviceVector[0];
-
-    if (pBuiltInProgram) {
-        std::unordered_map<std::string, BuiltinDispatchInfoBuilder *> builtinsBuilders;
-        builtinsBuilders["block_motion_estimate_intel"] =
-            &Vme::getBuiltinDispatchInfoBuilder(EBuiltInOps::vmeBlockMotionEstimateIntel, device);
-        builtinsBuilders["block_advanced_motion_estimate_check_intel"] =
-            &Vme::getBuiltinDispatchInfoBuilder(EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel, device);
-        builtinsBuilders["block_advanced_motion_estimate_bidirectional_check_intel"] =
-            &Vme::getBuiltinDispatchInfoBuilder(EBuiltInOps::vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel, device);
-
-        errcodeRet = pBuiltInProgram->build(deviceVector, mediaKernelsBuildOptions, builtinsBuilders);
-    } else {
-        errcodeRet = CL_INVALID_VALUE;
-    }
-    return pBuiltInProgram;
-}
-
-const char *getAdditionalBuiltinAsString(EBuiltInOps::Type builtin) {
-    switch (builtin) {
-    default:
-        return nullptr;
-    case EBuiltInOps::vmeBlockMotionEstimateIntel:
-        return "vme_block_motion_estimate_intel.builtin_kernel";
-    case EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel:
-        return "vme_block_advanced_motion_estimate_check_intel.builtin_kernel";
-    case EBuiltInOps::vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel:
-        return "vme_block_advanced_motion_estimate_bidirectional_check_intel";
-    }
-}
-
-BuiltinDispatchInfoBuilder &Vme::getBuiltinDispatchInfoBuilder(EBuiltInOps::Type operation, ClDevice &device) {
-    auto &builtins = *device.getDevice().getBuiltIns();
-    uint32_t operationId = static_cast<uint32_t>(operation);
-    auto clExecutionEnvironment = static_cast<ClExecutionEnvironment *>(device.getExecutionEnvironment());
-    auto &operationBuilder = clExecutionEnvironment->peekBuilders(device.getRootDeviceIndex())[operationId];
-    switch (operation) {
-    default:
-        return BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(operation, device);
-    case EBuiltInOps::vmeBlockMotionEstimateIntel:
-        std::call_once(operationBuilder.second, [&] { operationBuilder.first = std::make_unique<BuiltInOp<EBuiltInOps::vmeBlockMotionEstimateIntel>>(builtins, device); });
-        break;
-    case EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel:
-        std::call_once(operationBuilder.second, [&] { operationBuilder.first = std::make_unique<BuiltInOp<EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel>>(builtins, device); });
-        break;
-    case EBuiltInOps::vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel:
-        std::call_once(operationBuilder.second, [&] { operationBuilder.first = std::make_unique<BuiltInOp<EBuiltInOps::vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel>>(builtins, device); });
-        break;
-    }
-    return *operationBuilder.first;
-}
-} // namespace NEO
@@ -1,29 +0,0 @@
-/*
- * Copyright (C) 2020 Intel Corporation
- *
- * SPDX-License-Identifier: MIT
- *
- */
-
-#pragma once
-#include "opencl/source/built_ins/built_in_ops_vme.h"
-
-namespace NEO {
-class Program;
-class ClDevice;
-class ClDeviceVector;
-class Context;
-class BuiltIns;
-class BuiltinDispatchInfoBuilder;
-namespace Vme {
-
-Program *createBuiltInProgram(
-    Context &context,
-    const ClDeviceVector &deviceVector,
-    const char *kernelNames,
-    int &errcodeRet);
-
-BuiltinDispatchInfoBuilder &getBuiltinDispatchInfoBuilder(EBuiltInOps::Type operation, ClDevice &device);
-
-} // namespace Vme
-} // namespace NEO
@ -1,483 +0,0 @@
|
|||
/*
|
||||
* Copyright (C) 2018-2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "shared/source/built_ins/built_ins.h"
|
||||
#include "shared/source/helpers/basic_math.h"
|
||||
|
||||
#include "opencl/source/accelerators/intel_accelerator.h"
|
||||
#include "opencl/source/built_ins/built_in_ops_vme.h"
|
||||
#include "opencl/source/built_ins/builtins_dispatch_builder.h"
|
||||
#include "opencl/source/cl_device/cl_device.h"
|
||||
#include "opencl/source/helpers/dispatch_info_builder.h"
|
||||
#include "opencl/source/mem_obj/buffer.h"
|
||||
#include "opencl/source/mem_obj/image.h"
|
||||
|
||||
namespace NEO {
|
||||
class VmeBuiltinDispatchInfoBuilder : public BuiltinDispatchInfoBuilder {
|
||||
public:
|
||||
VmeBuiltinDispatchInfoBuilder(BuiltIns &kernelsLib, ClDevice &device, EBuiltInOps::Type builtinOp,
|
||||
const char *kernelName)
|
||||
: BuiltinDispatchInfoBuilder(kernelsLib, device) {
|
||||
populate(builtinOp,
|
||||
mediaKernelsBuildOptions,
|
||||
kernelName, multiDeviceVmeKernel);
|
||||
auto rootDeviceIndex = device.getRootDeviceIndex();
|
||||
vmeKernel = multiDeviceVmeKernel->getKernel(rootDeviceIndex);
|
||||
widthArgNum = vmeKernel->getKernelInfo().getArgNumByName("width");
|
||||
heightArgNum = vmeKernel->getKernelInfo().getArgNumByName("height");
|
||||
strideArgNum = vmeKernel->getKernelInfo().getArgNumByName("stride");
|
||||
acceleratorArgNum = vmeKernel->getKernelInfo().getArgNumByName("accelerator");
|
||||
srcImgArgNum = vmeKernel->getKernelInfo().getArgNumByName("srcImg");
|
||||
refImgArgNum = vmeKernel->getKernelInfo().getArgNumByName("refImg");
|
||||
motionVectorBufferArgNum = vmeKernel->getKernelInfo().getArgNumByName("motion_vector_buffer");
|
||||
predictionMotionVectorBufferArgNum = vmeKernel->getKernelInfo().getArgNumByName("prediction_motion_vector_buffer");
|
||||
residualsArgNum = vmeKernel->getKernelInfo().getArgNumByName("residuals");
|
||||
}
|
||||
|
||||
void getBlkTraits(const Vec3<size_t> &inGws, size_t &gwWidthInBlk, size_t &gwHeightInBlk) const {
|
||||
const size_t vmeMacroBlockWidth = 16;
|
||||
const size_t vmeMacroBlockHeight = 16;
|
||||
gwWidthInBlk = Math::divideAndRoundUp(inGws.x, vmeMacroBlockWidth);
|
||||
gwHeightInBlk = Math::divideAndRoundUp(inGws.y, vmeMacroBlockHeight);
|
||||
}
|
||||
|
||||
bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo, Kernel *kern,
|
||||
const uint32_t inDim, const Vec3<size_t> &inGws, const Vec3<size_t> &inLws, const Vec3<size_t> &inOffset) const override {
|
||||
if (kern == nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
size_t gwWidthInBlk = 0;
|
||||
size_t gwHeightInBlk = 0;
|
||||
getBlkTraits(inGws, gwWidthInBlk, gwHeightInBlk);
|
||||
|
||||
cl_int height = (cl_int)gwHeightInBlk;
|
||||
cl_int width = (cl_int)gwWidthInBlk;
|
||||
cl_int stride = height;
|
||||
size_t numThreadsX = gwWidthInBlk;
|
||||
const size_t simdWidth = vmeKernel->getKernelInfo().getMaxSimdSize();
|
||||
stride = static_cast<cl_int>(Math::divideAndRoundUp(height * width, numThreadsX));
|
||||
|
||||
// update implicit args
|
||||
vmeKernel->setArg(heightArgNum, sizeof(height), &height);
|
||||
vmeKernel->setArg(widthArgNum, sizeof(width), &width);
|
||||
vmeKernel->setArg(strideArgNum, sizeof(stride), &stride);
|
||||
|
||||
// Update global work size to force macro-block to HW thread execution model
|
||||
Vec3<size_t> gws = {numThreadsX * simdWidth, 1, 1};
|
||||
Vec3<size_t> lws = {vmeKernel->getKernelInfo().kernelDescriptor.kernelAttributes.requiredWorkgroupSize[0], 1, 1};
|
||||
|
||||
DispatchInfoBuilder<SplitDispatch::Dim::d2D, SplitDispatch::SplitMode::noSplit> builder(clDevice);
|
||||
builder.setDispatchGeometry(gws, lws, inOffset, gws, lws);
|
||||
builder.setKernel(vmeKernel);
|
||||
builder.bake(multiDispatchInfo);
|
||||
return true;
|
||||
}

    bool setExplicitArg(uint32_t argIndex, size_t argSize, const void *argVal, cl_int &err) const override {
        DEBUG_BREAK_IF(!((argIndex != widthArgNum) && (argIndex != heightArgNum) && (argIndex != strideArgNum)));
        if ((argIndex == acceleratorArgNum) && (argVal == nullptr)) {
            err = CL_INVALID_ACCELERATOR_INTEL;
            return false;
        }
        err = vmeKernel->setArg(argIndex, argSize, argVal);
        return false;
    }

    cl_int validateDispatch(Kernel *kernel, uint32_t inworkDim, const Vec3<size_t> &inGws, const Vec3<size_t> &inLws, const Vec3<size_t> &inOffset) const override {
        if (inworkDim != 2) {
            return CL_INVALID_WORK_DIMENSION;
        }

        size_t gwWidthInBlk = 0;
        size_t gwHeightInBlk = 0;
        getBlkTraits(inGws, gwWidthInBlk, gwHeightInBlk);

        size_t blkNum = gwWidthInBlk * gwHeightInBlk;
        size_t blkMul = 1;
        IntelAccelerator *accelerator = castToObject<IntelAccelerator>((cl_accelerator_intel)vmeKernel->getKernelArg(acceleratorArgNum));
        if (accelerator == nullptr) {
            return CL_INVALID_KERNEL_ARGS; // accelerator was not set
        }
        DEBUG_BREAK_IF(accelerator->getDescriptorSize() != sizeof(cl_motion_estimation_desc_intel));
        const cl_motion_estimation_desc_intel *acceleratorDesc = reinterpret_cast<const cl_motion_estimation_desc_intel *>(accelerator->getDescriptor());
        switch (acceleratorDesc->mb_block_type) {
        case CL_ME_MB_TYPE_8x8_INTEL:
            blkMul = 4;
            break;
        case CL_ME_MB_TYPE_4x4_INTEL:
            blkMul = 16;
            break;
        default:
            break;
        }

        return validateVmeDispatch(inGws, inOffset, blkNum, blkMul);
    }

    // notes on corner cases:
    // * if the arg is not available in the kernel - returns true
    // * if the arg is set to nullptr - returns true
    bool validateBufferSize(int32_t bufferArgNum, size_t minimumSizeExpected) const {
        if (bufferArgNum == -1) {
            return true;
        }

        auto buff = castToObject<Buffer>((cl_mem)vmeKernel->getKernelArg(bufferArgNum));
        if (buff == nullptr) {
            return true;
        }

        size_t bufferSize = buff->getSize();
        if (bufferSize < minimumSizeExpected) {
            return false;
        }

        return true;
    }

    template <typename EnumBaseType>
    bool validateEnumVal(EnumBaseType val) const {
        return false;
    }

    template <typename EnumBaseType, typename ExpectedValType, typename... ExpectedValsTypes>
    bool validateEnumVal(EnumBaseType val, ExpectedValType expectedVal, ExpectedValsTypes... expVals) const {
        return (val == static_cast<EnumBaseType>(expectedVal)) || validateEnumVal<EnumBaseType, ExpectedValsTypes...>(val, expVals...);
    }

    // notes on corner cases:
    // * if the arg is not available in the kernel - returns true
    template <typename EnumBaseType, typename... ExpectedValsTypes>
    bool validateEnumArg(int32_t argNum, ExpectedValsTypes... expVals) const {
        if (argNum == -1) {
            return true;
        }

        EnumBaseType val = this->getKernelArgByValValue<EnumBaseType>(static_cast<uint32_t>(argNum));
        return validateEnumVal<EnumBaseType, ExpectedValsTypes...>(val, expVals...);
    }

    template <typename RetType>
    RetType getKernelArgByValValue(uint32_t argNum) const {
        const auto &argAsVal = vmeKernel->getKernelInfo().kernelDescriptor.payloadMappings.explicitArgs[argNum].as<ArgDescValue>();
        DEBUG_BREAK_IF(argAsVal.elements.size() != 1);

        const auto &element = argAsVal.elements[0];
        DEBUG_BREAK_IF(sizeof(RetType) > element.size);
        return *(RetType *)(vmeKernel->getCrossThreadData() + element.offset);
    }
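
    // validateEnumVal recurses over the expected values: each step peels off one
    // candidate and ORs in the comparison, and the single-argument overload above
    // is the base case, reached only when no candidate matched (hence false).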

    cl_int validateImages(const Vec3<size_t> &inputRegion, const Vec3<size_t> &offset) const {
        Image *srcImg = castToObject<Image>((cl_mem)vmeKernel->getKernelArg(srcImgArgNum));
        Image *refImg = castToObject<Image>((cl_mem)vmeKernel->getKernelArg(refImgArgNum));

        if ((srcImg == nullptr) || (refImg == nullptr)) {
            return CL_INVALID_KERNEL_ARGS;
        }

        for (Image *img : {srcImg, refImg}) {
            const cl_image_format &imgFormat = img->getImageFormat();
            if ((imgFormat.image_channel_order != CL_R) || (imgFormat.image_channel_data_type != CL_UNORM_INT8)) {
                return CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
            }

            if (false == img->isTiledAllocation()) {
                // VME only works with tiled images.
                return CL_OUT_OF_RESOURCES;
            }
        }

        {
            const cl_image_desc &srcImgDesc = srcImg->getImageDesc();

            size_t srcImageWidth = srcImgDesc.image_width;
            size_t srcImageHeight = srcImgDesc.image_height;
            if (((inputRegion.x + offset.x) > srcImageWidth) ||
                ((inputRegion.y + offset.y) > srcImageHeight)) {
                return CL_INVALID_IMAGE_SIZE;
            }
        }

        return CL_SUCCESS;
    }

    virtual cl_int validateVmeDispatch(const Vec3<size_t> &inputRegion, const Vec3<size_t> &offset, size_t blkNum, size_t blkMul) const {
        {
            cl_int imageValidationStatus = validateImages(inputRegion, offset);
            if (imageValidationStatus != CL_SUCCESS) {
                return imageValidationStatus;
            }
        }

        size_t numPredictors = 1;
        std::pair<int32_t, size_t> bufferRequirements[] = {
            std::make_pair(motionVectorBufferArgNum, (blkNum * blkMul * 2 * sizeof(cl_short))),
            std::make_pair(predictionMotionVectorBufferArgNum, (blkNum * numPredictors * 2 * sizeof(cl_short))),
            std::make_pair(residualsArgNum, (blkNum * blkMul * sizeof(cl_ushort)))};
        for (const auto &req : bufferRequirements) {
            if (false == validateBufferSize(req.first, req.second)) {
                return CL_INVALID_BUFFER_SIZE;
            }
        }

        return CL_SUCCESS;
    }

  protected:
    uint32_t heightArgNum;
    uint32_t widthArgNum;
    uint32_t strideArgNum;
    uint32_t acceleratorArgNum;
    uint32_t srcImgArgNum;
    uint32_t refImgArgNum;
    int32_t motionVectorBufferArgNum;
    int32_t predictionMotionVectorBufferArgNum;
    int32_t residualsArgNum;
    MultiDeviceKernel *multiDeviceVmeKernel;
    Kernel *vmeKernel;
};

template <>
class BuiltInOp<EBuiltInOps::vmeBlockMotionEstimateIntel> : public VmeBuiltinDispatchInfoBuilder {
  public:
    BuiltInOp(BuiltIns &kernelsLib, ClDevice &device)
        : VmeBuiltinDispatchInfoBuilder(kernelsLib, device,
                                        EBuiltInOps::vmeBlockMotionEstimateIntel, "block_motion_estimate_intel") {
    }
};

class AdvancedVmeBuiltinDispatchInfoBuilder : public VmeBuiltinDispatchInfoBuilder {
  public:
    AdvancedVmeBuiltinDispatchInfoBuilder(BuiltIns &kernelsLib, ClDevice &device, EBuiltInOps::Type builtinOp,
                                          const char *kernelName)
        : VmeBuiltinDispatchInfoBuilder(kernelsLib, device, builtinOp,
                                        kernelName) {
        flagsArgNum = this->vmeKernel->getKernelInfo().getArgNumByName("flags");
        intraSrcImgArgNum = this->vmeKernel->getKernelInfo().getArgNumByName("intraSrcImg");
        skipBlockTypeArgNum = this->vmeKernel->getKernelInfo().getArgNumByName("skip_block_type");
        searchCostPenaltyArgNum = this->vmeKernel->getKernelInfo().getArgNumByName("search_cost_penalty");
        searchCostPrecisionArgNum = this->vmeKernel->getKernelInfo().getArgNumByName("search_cost_precision");
        bidirWeightArgNum = this->vmeKernel->getKernelInfo().getArgNumByName("bidir_weight");
        predictorsBufferArgNum = this->vmeKernel->getKernelInfo().getArgNumByName("predictors_buffer");
        countMotionVectorBufferArgNum = this->vmeKernel->getKernelInfo().getArgNumByName("count_motion_vector_buffer");
        skipMotionVectorBufferArgNum = this->vmeKernel->getKernelInfo().getArgNumByName("skip_motion_vector_buffer");
        intraSearchPredictorModesArgNum = this->vmeKernel->getKernelInfo().getArgNumByName("intra_search_predictor_modes");
        skipResidualsArgNum = this->vmeKernel->getKernelInfo().getArgNumByName("skip_residuals");
        intraResidualsArgNum = this->vmeKernel->getKernelInfo().getArgNumByName("intra_residuals");
    }

    bool setExplicitArg(uint32_t argIndex, size_t argSize, const void *argVal, cl_int &err) const override {
        DEBUG_BREAK_IF(argIndex == intraSrcImgArgNum);
        if (argIndex == this->srcImgArgNum) {
            // rebind also as media block image
            this->vmeKernel->setArg(intraSrcImgArgNum, argSize, argVal);
        }
        return VmeBuiltinDispatchInfoBuilder::setExplicitArg(argIndex, argSize, argVal, err);
    }

    virtual bool isBidirKernel() const {
        return false;
    }

    bool validateFlags(uint32_t &outSkipBlockType) const {
        uint32_t flagsVal = VmeBuiltinDispatchInfoBuilder::template getKernelArgByValValue<uint32_t>(flagsArgNum);

        if ((flagsVal & CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL) == CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL) {
            return false;
        }

        if (flagsVal == CL_ME_SKIP_BLOCK_TYPE_16x16_INTEL) {
            outSkipBlockType = CL_ME_MB_TYPE_16x16_INTEL;
        } else if ((flagsVal & CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL) == CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL) {
            outSkipBlockType = CL_ME_MB_TYPE_8x8_INTEL;
        }

        return true;
    }

    bool validateSkipBlockTypeArg(uint32_t &outSkipBlockType) const {
        if (skipBlockTypeArgNum == -1) {
            return true;
        }

        outSkipBlockType = VmeBuiltinDispatchInfoBuilder::template getKernelArgByValValue<uint32_t>(static_cast<uint32_t>(skipBlockTypeArgNum));

        switch (outSkipBlockType) {
        case CL_ME_MB_TYPE_16x16_INTEL:
            break;
        case CL_ME_MB_TYPE_8x8_INTEL:
            break;
        default:
            return false;
        }
        return true;
    }

    size_t getIntraSearchPredictorModesBuffExpSize(size_t blkNum) const {
        // vector size is 22 = 1 (16x16 luma block) + 4 (8x8 luma block) + 16 (4x4 luma block) + 1 (8x8 chroma block)
        int vectorSize = 22;
        size_t intraSearchPredictorModesBuffExpSize = blkNum * vectorSize;
        return intraSearchPredictorModesBuffExpSize;
    }

    size_t getSkipMotionVectorBufferExpSize(uint32_t skipBlockType, size_t blkNum) const {
        // vector size is either 1 (16x16 block) or 4 (8x8 block)
        // 0 to 8 skip MVs per MB
        // may be null if all MBs in the frame have 0 skip check MVs, in which case VME skip checks are not performed
        // layout assumes 4 (for bidir) or 8 (otherwise) skip check MVs per MB
        // row-major block layout; all MVs for a block are contiguous
        // buffer size depends on the block and frame size
        int vectorSize = (skipBlockType == CL_ME_MB_TYPE_16x16_INTEL) ? 1 : 4;
        int numChecks = (isBidirKernel() ? 4 : 8);
        size_t skipMotionVectorBufferExpSize = blkNum * numChecks * vectorSize * 2 * sizeof(cl_short);
        return skipMotionVectorBufferExpSize;
    }
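
    // Example: a 352x288 frame gives 22 * 18 = 396 blocks; with 16x16 skip blocks
    // and a non-bidirectional kernel this is 396 * 8 * 1 * 2 * sizeof(cl_short)
    // = 12672 bytes.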

    size_t getSkipResidualsBuffExpSize(uint32_t skipBlockType, size_t blkNum) const {
        /* output buffer of vectors of unsigned short SAD adjusted values corresponding to the input skip check MVs
           may be null if skip_motion_vector_buffer is null
           vector size is either 1 (16x16 block) or 4 (8x8 block)
           0 to 8 skip check residuals per MB
           layout always assumes 8 skip check residuals per MB
           row-major block layout; all MVs for a block are contiguous
           buffer size depends on the block and frame size */
        int vectorSize = 1;
        switch (skipBlockType) {
        case CL_ME_MB_TYPE_16x16_INTEL:
            vectorSize = 1;
            break;
        case CL_ME_MB_TYPE_8x8_INTEL:
            vectorSize = 4;
            break;
        default:
            break;
        }

        int numChecks = (isBidirKernel() ? 4 : 8);
        size_t skipResidualsBuffExpSize = blkNum * vectorSize * numChecks * sizeof(cl_ushort);
        return skipResidualsBuffExpSize;
    }

    size_t getIntraResidualsBuffExpSize(size_t blkNum) const {
        /* output buffer of vectors of unsigned short SAD adjusted values
           may be null, in which case the corresponding intra residuals are not returned
           vector size is 4 = 1 (16x16 luma block) + 1 (8x8 luma block) + 1 (4x4 luma block) + 1 (8x8 chroma block)
           1 vector per MB
           buffer size depends on the frame size */
        int vectorSize = 4;
        size_t intraResidualsBuffExpSize = (blkNum * sizeof(cl_ushort) * vectorSize);
        return intraResidualsBuffExpSize;
    }

    size_t getPredictorsBufferExpSize(size_t blkNum) const {
        size_t numPredictors = 8;
        size_t predictorsBufferExpSize = (blkNum * numPredictors * 2 * sizeof(cl_short));
        return predictorsBufferExpSize;
    }

    cl_int validateVmeDispatch(const Vec3<size_t> &inputRegion, const Vec3<size_t> &offset, size_t blkNum, size_t blkMul) const override {
        cl_int basicVmeValidationStatus = VmeBuiltinDispatchInfoBuilder::validateVmeDispatch(inputRegion, offset, blkNum, blkMul);
        if (basicVmeValidationStatus != CL_SUCCESS) {
            return basicVmeValidationStatus;
        }

        uint32_t skipBlockType = CL_ME_MB_TYPE_16x16_INTEL;
        if (false == validateFlags(skipBlockType)) {
            return CL_INVALID_KERNEL_ARGS;
        }

        if (false == validateSkipBlockTypeArg(skipBlockType)) {
            return CL_OUT_OF_RESOURCES;
        }

        if (false == VmeBuiltinDispatchInfoBuilder::template validateEnumArg<uint32_t>(searchCostPenaltyArgNum, CL_ME_COST_PENALTY_NONE_INTEL, CL_ME_COST_PENALTY_LOW_INTEL, CL_ME_COST_PENALTY_NORMAL_INTEL,
                                                                                       CL_ME_COST_PENALTY_HIGH_INTEL)) {
            return CL_OUT_OF_RESOURCES;
        }

        if (false == VmeBuiltinDispatchInfoBuilder::template validateEnumArg<uint32_t>(searchCostPrecisionArgNum, CL_ME_COST_PRECISION_QPEL_INTEL, CL_ME_COST_PRECISION_HPEL_INTEL, CL_ME_COST_PRECISION_PEL_INTEL,
                                                                                       CL_ME_COST_PRECISION_DPEL_INTEL)) {
            return CL_OUT_OF_RESOURCES;
        }

        if (false == VmeBuiltinDispatchInfoBuilder::template validateEnumArg<uint8_t>(bidirWeightArgNum, 0, CL_ME_BIDIR_WEIGHT_QUARTER_INTEL, CL_ME_BIDIR_WEIGHT_THIRD_INTEL, CL_ME_BIDIR_WEIGHT_HALF_INTEL,
                                                                                      CL_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL, CL_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL)) {
            return CL_INVALID_KERNEL_ARGS;
        }

        std::pair<int32_t, size_t> bufferRequirements[] = {
            std::make_pair(countMotionVectorBufferArgNum, (blkNum * 2 * sizeof(cl_short))),
            std::make_pair(skipMotionVectorBufferArgNum, getSkipMotionVectorBufferExpSize(skipBlockType, blkNum)),
            std::make_pair(intraSearchPredictorModesArgNum, getIntraSearchPredictorModesBuffExpSize(blkNum)),
            std::make_pair(skipResidualsArgNum, getSkipResidualsBuffExpSize(skipBlockType, blkNum)),
            std::make_pair(intraResidualsArgNum, getIntraResidualsBuffExpSize(blkNum)),
            std::make_pair(predictorsBufferArgNum, getPredictorsBufferExpSize(blkNum))};
        for (const auto &req : bufferRequirements) {
            if (false == this->validateBufferSize(req.first, req.second)) {
                return CL_INVALID_BUFFER_SIZE;
            }
        }

        return CL_SUCCESS;
    }

  protected:
    uint32_t flagsArgNum;
    int32_t skipBlockTypeArgNum;
    uint32_t searchCostPenaltyArgNum;
    uint32_t searchCostPrecisionArgNum;
    int32_t bidirWeightArgNum;
    int32_t predictorsBufferArgNum;
    uint32_t countMotionVectorBufferArgNum;
    uint32_t skipMotionVectorBufferArgNum;
    uint32_t intraSearchPredictorModesArgNum;
    uint32_t skipResidualsArgNum;
    uint32_t intraResidualsArgNum;
    uint32_t intraSrcImgArgNum;
};

template <>
class BuiltInOp<EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel> : public AdvancedVmeBuiltinDispatchInfoBuilder {
  public:
    BuiltInOp(BuiltIns &kernelsLib, ClDevice &device)
        : AdvancedVmeBuiltinDispatchInfoBuilder(kernelsLib, device, EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel,
                                                "block_advanced_motion_estimate_check_intel") {
    }

    cl_int validateVmeDispatch(const Vec3<size_t> &inputRegion, const Vec3<size_t> &offset,
                               size_t gwWidthInBlk, size_t gwHeightInBlk) const override {
        cl_int basicAdvVmeValidationStatus = AdvancedVmeBuiltinDispatchInfoBuilder::validateVmeDispatch(inputRegion, offset, gwWidthInBlk, gwHeightInBlk);
        if (basicAdvVmeValidationStatus != CL_SUCCESS) {
            return basicAdvVmeValidationStatus;
        }

        auto countMotionVectorBuff = castToObject<Buffer>((cl_mem)this->vmeKernel->getKernelArg(this->countMotionVectorBufferArgNum));
        if (countMotionVectorBuff == nullptr) {
            return CL_INVALID_BUFFER_SIZE;
        }

        return CL_SUCCESS;
    }
};

template <>
class BuiltInOp<EBuiltInOps::vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel> : public AdvancedVmeBuiltinDispatchInfoBuilder {
  public:
    BuiltInOp(BuiltIns &kernelsLib, ClDevice &device)
        : AdvancedVmeBuiltinDispatchInfoBuilder(kernelsLib, device, EBuiltInOps::vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel,
                                                "block_advanced_motion_estimate_bidirectional_check_intel") {
    }

    bool isBidirKernel() const override {
        return true;
    }
};
} // namespace NEO
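
For orientation, the buffer checks above reduce to a simple sizing rule. The following standalone sketch mirrors it under stated assumptions: the helper name is hypothetical, while the 16-pixel macro-block edge and the motion_vector_buffer formula come from the deleted code above.

#include <cstddef>
#include <cstdint>

// Hypothetical helper mirroring VmeBuiltinDispatchInfoBuilder::validateVmeDispatch:
// the motion vector buffer must hold blkNum * blkMul vectors of two cl_short each.
static size_t expectedMotionVectorBufferSize(size_t frameWidth, size_t frameHeight, size_t blkMul) {
    const size_t mb = 16; // VME macro-block edge, as in getBlkTraits
    const size_t blkNum = ((frameWidth + mb - 1) / mb) * ((frameHeight + mb - 1) / mb);
    return blkNum * blkMul * 2 * sizeof(int16_t); // cl_short is 16 bits wide
}

int main() {
    // 352x288 frame, 16x16 blocks (blkMul = 1): 396 blocks -> 1584 bytes minimum.
    return expectedMotionVectorBufferSize(352, 288, 1) == 1584 ? 0 : 1;
}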

@ -59,7 +59,6 @@ set(NEO_IGDRCL_TESTS__TARGET_OBJECTS
    $<TARGET_OBJECTS:mock_gmm>
    $<TARGET_OBJECTS:${SHARINGS_ENABLE_LIB_NAME}>
    $<TARGET_OBJECTS:${BUILTINS_SOURCES_LIB_NAME}>
    $<TARGET_OBJECTS:${BUILTINS_VME_LIB_NAME}>
    $<TARGET_OBJECTS:${BUILTINS_BINARIES_STATELESS_LIB_NAME}>
    $<TARGET_OBJECTS:${BUILTINS_BINARIES_STATELESS_HEAPLESS_LIB_NAME}>
    $<TARGET_OBJECTS:${BUILTINS_BINARIES_BINDFUL_LIB_NAME}>

@ -311,12 +310,6 @@ set(TEST_KERNEL_STATELESS
    test_files/stateless_kernel.cl
)

set(TEST_KERNEL_VME
    ${CMAKE_CURRENT_SOURCE_DIR}/test_files/vme_kernels.cl
    ${CMAKE_CURRENT_SOURCE_DIR}/test_files/media_kernels_backend.cl
    ${CMAKE_CURRENT_SOURCE_DIR}/test_files/media_kernels_frontend.cl
)

set(TEST_KERNEL_BINDLESS_internal_options
    "-cl-intel-use-bindless-mode -cl-intel-use-bindless-advanced-mode"
)

@ -336,12 +329,10 @@ set(TEST_KERNEL_PRINTF_internal_options_gen9lp
file(GLOB_RECURSE TEST_KERNELS test_files/*.cl)
list(REMOVE_ITEM TEST_KERNELS "${CMAKE_CURRENT_SOURCE_DIR}/test_files/simple_nonuniform.cl")
list(REMOVE_ITEM TEST_KERNELS "${CMAKE_CURRENT_SOURCE_DIR}/test_files/stateless_kernel.cl")
list(REMOVE_ITEM TEST_KERNELS ${TEST_KERNEL_VME})
list(REMOVE_ITEM TEST_KERNELS "${CMAKE_CURRENT_SOURCE_DIR}/${TEST_KERNEL_PRINTF}")

macro(macro_for_each_platform)
  set(PLATFORM_2_0_LOWER ${DEFAULT_SUPPORTED_2_0_${CORE_TYPE}_${PLATFORM_IT}_PLATFORM})
  set(PLATFORM_VME_LOWER ${DEFAULT_SUPPORTED_VME_${CORE_TYPE}_${PLATFORM_IT}_PLATFORM})

  set(PLATFORM_TEST_KERNELS ${TEST_KERNELS})
  set(IMAGE_SUPPORT FALSE)

@ -398,12 +389,6 @@ macro(macro_for_each_platform)
        neo_gen_kernels_with_options(${PLATFORM_IT_LOWER} ${DEVICE_ID} ${REVISION_ID} "${TEST_KERNEL_2_0}" ${TEST_KERNEL_2_0_options})
      endforeach()
    endif()
    if(PLATFORM_VME_LOWER)
      foreach(REVISION_CONFIG ${${PLATFORM_IT}_${CORE_TYPE}_REVISIONS})
        parse_revision_config(${REVISION_CONFIG} ${PLATFORM_IT_LOWER} DEVICE_ID REVISION_ID)
        neo_gen_kernels(${PLATFORM_IT_LOWER} ${DEVICE_ID} ${REVISION_ID} TRUE ${TEST_KERNEL_VME})
      endforeach()
    endif()
  endif()

  set(PREVIOUS_TARGET)

@ -1,140 +1,26 @@
/*
 * Copyright (C) 2018-2023 Intel Corporation
 * Copyright (C) 2018-2024 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "shared/source/built_ins/built_ins.h"
#include "shared/source/compiler_interface/compiler_interface.h"
#include "shared/source/device/device.h"

#include "opencl/source/built_ins/built_in_ops_vme.h"
#include "opencl/source/built_ins/vme_builtin.h"
#include "opencl/source/context/context.h"
#include "opencl/source/helpers/base_object.h"
#include "opencl/source/kernel/kernel.h"
#include "opencl/source/program/program.h"
#include "opencl/test/unit_test/fixtures/run_kernel_fixture.h"
#include "opencl/test/unit_test/mocks/mock_cl_device.h"

#include "cl_api_tests.h"

using namespace NEO;

using ClCreateProgramWithBuiltInKernelsTests = ApiTests;

struct ClCreateProgramWithBuiltInVmeKernelsTests : ClCreateProgramWithBuiltInKernelsTests {
    void SetUp() override {
        ClCreateProgramWithBuiltInKernelsTests::SetUp();
        if (!castToObject<ClDevice>(testedClDevice)->getHardwareInfo().capabilityTable.supportsVme) {
            GTEST_SKIP();
        }

        pClDevice = pContext->getDevice(0);
    }

    ClDevice *pClDevice;
};

namespace ULT {

TEST_F(ClCreateProgramWithBuiltInKernelsTests, GivenInvalidContextWhenCreatingProgramWithBuiltInKernelsThenInvalidContextErrorIsReturned) {
TEST_F(ClCreateProgramWithBuiltInKernelsTests, GivenMediaKernelsWhenCreatingProgramWithBuiltInKernelsThenProgramIsNotCreated) {
    cl_int retVal = CL_SUCCESS;
    auto program = clCreateProgramWithBuiltInKernels(
        nullptr, // context
        1,       // num_devices
        nullptr, // device_list
        nullptr, // kernel_names
        &retVal);
    EXPECT_EQ(nullptr, program);
    EXPECT_EQ(CL_INVALID_CONTEXT, retVal);
}

TEST_F(ClCreateProgramWithBuiltInKernelsTests, GivenNoKernelsWhenCreatingProgramWithBuiltInKernelsThenInvalidValueErrorIsReturned) {
    cl_int retVal = CL_SUCCESS;
    auto program = clCreateProgramWithBuiltInKernels(
        pContext,        // context
        1,               // num_devices
        &testedClDevice, // device_list
        "",              // kernel_names
        &retVal);
    EXPECT_EQ(nullptr, program);
    EXPECT_EQ(CL_INVALID_VALUE, retVal);
}

TEST_F(ClCreateProgramWithBuiltInKernelsTests, GivenNoDeviceWhenCreatingProgramWithBuiltInKernelsThenInvalidValueErrorIsReturned) {
    cl_int retVal = CL_SUCCESS;
    auto program = clCreateProgramWithBuiltInKernels(
        pContext,        // context
        0,               // num_devices
        &testedClDevice, // device_list
        "",              // kernel_names
        &retVal);
    EXPECT_EQ(nullptr, program);
    EXPECT_EQ(CL_INVALID_VALUE, retVal);
}

TEST_F(ClCreateProgramWithBuiltInKernelsTests, GivenNoKernelsAndNoReturnWhenCreatingProgramWithBuiltInKernelsThenProgramIsNotCreated) {
    auto program = clCreateProgramWithBuiltInKernels(
        pContext,        // context
        1,               // num_devices
        &testedClDevice, // device_list
        "",              // kernel_names
        nullptr);
    EXPECT_EQ(nullptr, program);
}

TEST_F(ClCreateProgramWithBuiltInVmeKernelsTests, GivenDeviceNotAssociatedWithContextWhenCreatingProgramWithBuiltInThenInvalidDeviceErrorIsReturned) {
    cl_program pProgram = nullptr;

    const char *kernelNamesString = {
        "block_advanced_motion_estimate_bidirectional_check_intel;"
        "block_motion_estimate_intel;"
        "block_advanced_motion_estimate_check_intel;"};

    MockClDevice invalidDevice(new MockDevice());

    cl_device_id devicesForProgram[] = {&invalidDevice};

    pProgram = clCreateProgramWithBuiltInKernels(
        pContext,
        1,
        devicesForProgram,
        kernelNamesString,
        &retVal);
    EXPECT_EQ(CL_INVALID_DEVICE, retVal);
    EXPECT_EQ(nullptr, pProgram);

    retVal = CL_INVALID_PROGRAM;
    devicesForProgram[0] = nullptr;

    pProgram = clCreateProgramWithBuiltInKernels(
        pContext,
        1,
        devicesForProgram,
        kernelNamesString,
        &retVal);
    EXPECT_EQ(CL_INVALID_DEVICE, retVal);
    EXPECT_EQ(nullptr, pProgram);
}

TEST_F(ClCreateProgramWithBuiltInVmeKernelsTests, GivenValidMediaKernelsWhenCreatingProgramWithBuiltInKernelsThenProgramIsSuccessfullyCreated) {
    cl_int retVal = CL_SUCCESS;

    overwriteBuiltInBinaryName("media_kernels_frontend");

    const char *kernelNamesString = {
        "block_advanced_motion_estimate_bidirectional_check_intel;"
        "block_motion_estimate_intel;"
        "block_advanced_motion_estimate_check_intel;"};

    const char *kernelNames[] = {
        "block_motion_estimate_intel",
        "block_advanced_motion_estimate_check_intel",
        "block_advanced_motion_estimate_bidirectional_check_intel",
    };

    cl_program program = clCreateProgramWithBuiltInKernels(
        pContext, // context
        1,        // num_devices

@ -142,160 +28,7 @@ TEST_F(ClCreateProgramWithBuiltInVmeKernelsTests, GivenValidMediaKernelsWhenCrea
        kernelNamesString, // kernel_names
        &retVal);

    restoreBuiltInBinaryName();
    EXPECT_NE(nullptr, program);
    EXPECT_EQ(CL_SUCCESS, retVal);

    for (auto &kernelName : kernelNames) {
        cl_kernel kernel = clCreateKernel(
            program,
            kernelName,
            &retVal);
        ASSERT_EQ(CL_SUCCESS, retVal);
        ASSERT_NE(nullptr, kernel);

        retVal = clReleaseKernel(kernel);
        EXPECT_EQ(CL_SUCCESS, retVal);
    }

    retVal = clReleaseProgram(program);
    EXPECT_EQ(CL_SUCCESS, retVal);
}

TEST_F(ClCreateProgramWithBuiltInVmeKernelsTests, GivenValidMediaKernelsWithOptionsWhenCreatingProgramWithBuiltInKernelsThenProgramIsSuccessfullyCreatedWithThoseOptions) {
    cl_int retVal = CL_SUCCESS;

    overwriteBuiltInBinaryName("media_kernels_frontend");

    const char *kernelNamesString = {
        "block_motion_estimate_intel;"};

    cl_program program = clCreateProgramWithBuiltInKernels(
        pContext,          // context
        1,                 // num_devices
        &testedClDevice,   // device_list
        kernelNamesString, // kernel_names
        &retVal);

    restoreBuiltInBinaryName();

    auto neoProgram = castToObject<Program>(program);
    auto builtinOptions = neoProgram->getOptions();
    auto it = builtinOptions.find("HW_NULL_CHECK");
    EXPECT_EQ(std::string::npos, it);

    clReleaseProgram(program);
}

TEST_F(ClCreateProgramWithBuiltInVmeKernelsTests, GivenVmeBlockMotionEstimateKernelWhenCreatingProgramWithBuiltInKernelsThenCorrectDispatchBuilderAndFrontendKernelIsCreated) {
    cl_int retVal = CL_SUCCESS;

    overwriteBuiltInBinaryName("media_kernels_backend");
    Vme::getBuiltinDispatchInfoBuilder(EBuiltInOps::vmeBlockMotionEstimateIntel, *pClDevice);
    restoreBuiltInBinaryName();

    overwriteBuiltInBinaryName("media_kernels_frontend");

    const char *kernelNamesString = {
        "block_motion_estimate_intel;"};

    cl_program program = clCreateProgramWithBuiltInKernels(
        pContext,          // context
        1,                 // num_devices
        &testedClDevice,   // device_list
        kernelNamesString, // kernel_names
        &retVal);
    restoreBuiltInBinaryName();

    cl_kernel kernel = clCreateKernel(
        program,
        "block_motion_estimate_intel",
        &retVal);

    auto pMultiDeviceKernel = castToObject<MultiDeviceKernel>(kernel);
    auto kernNeo = pMultiDeviceKernel->getKernel(testedRootDeviceIndex);
    EXPECT_NE(nullptr, kernNeo->getKernelInfo().builtinDispatchBuilder);
    EXPECT_EQ(6U, kernNeo->getKernelArgsNumber());

    auto &vmeBuilder = Vme::getBuiltinDispatchInfoBuilder(EBuiltInOps::vmeBlockMotionEstimateIntel, *pClDevice);
    EXPECT_EQ(&vmeBuilder, kernNeo->getKernelInfo().builtinDispatchBuilder);

    clReleaseKernel(kernel);
    clReleaseProgram(program);
}

TEST_F(ClCreateProgramWithBuiltInVmeKernelsTests, GivenVmeBlockAdvancedMotionEstimateKernelWhenCreatingProgramWithBuiltInKernelsThenCorrectDispatchBuilderAndFrontendKernelIsCreated) {
    cl_int retVal = CL_SUCCESS;

    overwriteBuiltInBinaryName("media_kernels_backend");
    Vme::getBuiltinDispatchInfoBuilder(EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel, *pClDevice);
    restoreBuiltInBinaryName();

    overwriteBuiltInBinaryName("media_kernels_frontend");

    const char *kernelNamesString = {
        "block_advanced_motion_estimate_check_intel;"};

    cl_program program = clCreateProgramWithBuiltInKernels(
        pContext,          // context
        1,                 // num_devices
        &testedClDevice,   // device_list
        kernelNamesString, // kernel_names
        &retVal);
    restoreBuiltInBinaryName();

    cl_kernel kernel = clCreateKernel(
        program,
        "block_advanced_motion_estimate_check_intel",
        &retVal);

    auto pMultiDeviceKernel = castToObject<MultiDeviceKernel>(kernel);
    auto kernNeo = pMultiDeviceKernel->getKernel(testedRootDeviceIndex);
    EXPECT_NE(nullptr, kernNeo->getKernelInfo().builtinDispatchBuilder);
    EXPECT_EQ(15U, kernNeo->getKernelArgsNumber());

    auto &vmeBuilder = Vme::getBuiltinDispatchInfoBuilder(EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel, *pClDevice);
    EXPECT_EQ(&vmeBuilder, kernNeo->getKernelInfo().builtinDispatchBuilder);

    clReleaseKernel(kernel);
    clReleaseProgram(program);
}

TEST_F(ClCreateProgramWithBuiltInVmeKernelsTests, GivenVmeBlockAdvancedMotionEstimateBidirectionalCheckKernelWhenCreatingProgramWithBuiltInKernelsThenCorrectDispatchBuilderAndFrontendKernelIsCreated) {
    cl_int retVal = CL_SUCCESS;

    overwriteBuiltInBinaryName("media_kernels_backend");
    Vme::getBuiltinDispatchInfoBuilder(EBuiltInOps::vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel, *pClDevice);
    restoreBuiltInBinaryName();

    overwriteBuiltInBinaryName("media_kernels_frontend");

    const char *kernelNamesString = {
        "block_advanced_motion_estimate_bidirectional_check_intel;"};

    cl_program program = clCreateProgramWithBuiltInKernels(
        pContext,          // context
        1,                 // num_devices
        &testedClDevice,   // device_list
        kernelNamesString, // kernel_names
        &retVal);
    restoreBuiltInBinaryName();

    cl_kernel kernel = clCreateKernel(
        program,
        "block_advanced_motion_estimate_bidirectional_check_intel",
        &retVal);

    auto pMultiDeviceKernel = castToObject<MultiDeviceKernel>(kernel);
    auto kernNeo = pMultiDeviceKernel->getKernel(testedRootDeviceIndex);
    EXPECT_NE(nullptr, kernNeo->getKernelInfo().builtinDispatchBuilder);
    EXPECT_EQ(20U, kernNeo->getKernelArgsNumber());

    auto ctxNeo = castToObject<Context>(pContext);
    auto &vmeBuilder = Vme::getBuiltinDispatchInfoBuilder(EBuiltInOps::vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel, *ctxNeo->getDevice(0));
    EXPECT_EQ(&vmeBuilder, kernNeo->getKernelInfo().builtinDispatchBuilder);

    clReleaseKernel(kernel);
    clReleaseProgram(program);
    EXPECT_EQ(nullptr, program);
    EXPECT_EQ(CL_INVALID_VALUE, retVal);
}
} // namespace ULT
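
For reference, a minimal host-side sketch of the call pattern these tests exercise, using only standard OpenCL entry points. The context and device are assumed to be created elsewhere; on drivers without VME support (including NEO after this commit) the call is expected to fail with an error such as CL_INVALID_VALUE.

#include <CL/cl.h>
#include <cstdio>

cl_program createMotionEstimateProgram(cl_context context, cl_device_id device) {
    cl_int retVal = CL_SUCCESS;
    cl_program program = clCreateProgramWithBuiltInKernels(
        context, 1, &device, "block_motion_estimate_intel", &retVal);
    if (retVal != CL_SUCCESS) {
        std::fprintf(stderr, "clCreateProgramWithBuiltInKernels failed: %d\n", retVal);
        return nullptr;
    }
    return program; // release with clReleaseProgram when done
}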

@ -30,8 +30,6 @@
#include "opencl/source/accelerators/intel_motion_estimation.h"
#include "opencl/source/built_ins/aux_translation_builtin.h"
#include "opencl/source/built_ins/builtins_dispatch_builder.h"
#include "opencl/source/built_ins/vme_builtin.h"
#include "opencl/source/built_ins/vme_dispatch_builder.h"
#include "opencl/source/helpers/dispatch_info_builder.h"
#include "opencl/source/kernel/kernel.h"
#include "opencl/test/unit_test/built_ins/built_ins_file_names.h"

@ -122,15 +120,6 @@ class BuiltInTests
    std::string allBuiltIns;
};

struct VmeBuiltInTests : BuiltInTests {
    void SetUp() override {
        BuiltInTests::SetUp();
        if (!pDevice->getHardwareInfo().capabilityTable.supportsVme) {
            GTEST_SKIP();
        }
    }
};

struct AuxBuiltInTests : BuiltInTests, public ::testing::WithParamInterface<KernelObjForAuxTranslation::Type> {
    void SetUp() override {
        BuiltInTests::SetUp();

@ -1350,167 +1339,6 @@ TEST_F(BuiltInTests, WhenSettingExplictArgThenTrueIsReturned) {
    EXPECT_TRUE(ret);
}

TEST_F(VmeBuiltInTests, GivenVmeBuilderWhenGettingDispatchInfoThenValidPointerIsReturned) {
    overwriteBuiltInBinaryName("media_kernels_backend");

    EBuiltInOps::Type vmeOps[] = {EBuiltInOps::vmeBlockMotionEstimateIntel, EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel, EBuiltInOps::vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel};
    for (auto op : vmeOps) {
        BuiltinDispatchInfoBuilder &builder = Vme::getBuiltinDispatchInfoBuilder(op, *pClDevice);
        EXPECT_NE(nullptr, &builder);
    }

    restoreBuiltInBinaryName();
}

TEST_F(VmeBuiltInTests, givenInvalidBuiltInOpWhenGetVmeBuilderInfoThenExceptionIsThrown) {
    EXPECT_THROW(Vme::getBuiltinDispatchInfoBuilder(EBuiltInOps::count, *pClDevice), std::exception);
}

TEST_F(VmeBuiltInTests, GivenVmeBuilderAndInvalidParamsWhenGettingDispatchInfoThenEmptyKernelIsReturned) {
    overwriteBuiltInBinaryName("media_kernels_backend");
    EBuiltInOps::Type vmeOps[] = {EBuiltInOps::vmeBlockMotionEstimateIntel, EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel, EBuiltInOps::vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel};
    for (auto op : vmeOps) {
        BuiltinDispatchInfoBuilder &builder = Vme::getBuiltinDispatchInfoBuilder(op, *pClDevice);

        MultiDispatchInfo outMdi;
        Vec3<size_t> gws{352, 288, 0};
        Vec3<size_t> elws{0, 0, 0};
        Vec3<size_t> offset{0, 0, 0};
        auto ret = builder.buildDispatchInfos(outMdi, nullptr, 0, gws, elws, offset);
        EXPECT_FALSE(ret);
        EXPECT_EQ(0U, outMdi.size());
    }
    restoreBuiltInBinaryName();
}

TEST_F(VmeBuiltInTests, GivenVmeBuilderWhenGettingDispatchInfoThenParamsAreCorrect) {
    MockKernelWithInternals mockKernel{*pClDevice};
    mockKernel.kernelInfo.kernelDescriptor.kernelAttributes.simdSize = 16;
    mockKernel.kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[0] = 16;
    mockKernel.kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[1] = 0;
    mockKernel.kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[2] = 0;

    overwriteBuiltInBinaryName("media_kernels_backend");
    BuiltinDispatchInfoBuilder &builder = Vme::getBuiltinDispatchInfoBuilder(EBuiltInOps::vmeBlockMotionEstimateIntel, *pClDevice);
    restoreBuiltInBinaryName();

    MultiDispatchInfo outMdi;
    Vec3<size_t> gws{352, 288, 0};
    Vec3<size_t> elws{0, 0, 0};
    Vec3<size_t> offset{16, 0, 0};

    MockBuffer mb;
    cl_mem bufferArg = static_cast<cl_mem>(&mb);

    cl_int err;
    constexpr uint32_t bufferArgNum = 3;
    bool ret = builder.setExplicitArg(bufferArgNum, sizeof(cl_mem), &bufferArg, err);
    EXPECT_FALSE(ret);
    EXPECT_EQ(CL_SUCCESS, err);

    ret = builder.buildDispatchInfos(outMdi, mockKernel.mockKernel, 0, gws, elws, offset);
    EXPECT_TRUE(ret);
    EXPECT_EQ(1U, outMdi.size());

    auto outDi = outMdi.begin();
    EXPECT_EQ(Vec3<size_t>(352, 1, 1), outDi->getGWS());
    EXPECT_EQ(Vec3<size_t>(16, 1, 1), outDi->getEnqueuedWorkgroupSize());
    EXPECT_EQ(Vec3<size_t>(16, 0, 0), outDi->getOffset());
    EXPECT_NE(mockKernel.mockKernel, outDi->getKernel());

    EXPECT_EQ(bufferArg, outDi->getKernel()->getKernelArg(bufferArgNum));

    constexpr uint32_t vmeImplicitArgsBase = 6;
    constexpr uint32_t vmeImplicitArgs = 3;
    ASSERT_EQ(vmeImplicitArgsBase + vmeImplicitArgs, outDi->getKernel()->getKernelInfo().kernelDescriptor.payloadMappings.explicitArgs.size());
    uint32_t vmeExtraArgsExpectedVals[] = {18, 22, 18}; // height, width, stride
    for (uint32_t i = 0; i < vmeImplicitArgs; ++i) {
        auto &argAsVal = outDi->getKernel()->getKernelInfo().getArgDescriptorAt(vmeImplicitArgsBase + i).as<ArgDescValue>();
        EXPECT_EQ(vmeExtraArgsExpectedVals[i], *((uint32_t *)(outDi->getKernel()->getCrossThreadData() + argAsVal.elements[0].offset)));
    }
}

TEST_F(VmeBuiltInTests, GivenAdvancedVmeBuilderWhenGettingDispatchInfoThenParamsAreCorrect) {
    MockKernelWithInternals mockKernel{*pClDevice};
    mockKernel.kernelInfo.kernelDescriptor.kernelAttributes.simdSize = 16;
    mockKernel.kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[0] = 16;
    mockKernel.kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[1] = 0;
    mockKernel.kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[2] = 0;

    Vec3<size_t> gws{352, 288, 0};
    Vec3<size_t> elws{0, 0, 0};
    Vec3<size_t> offset{0, 0, 0};

    cl_int err;

    constexpr uint32_t bufferArgNum = 7;
    MockBuffer mb;
    cl_mem bufferArg = static_cast<cl_mem>(&mb);

    constexpr uint32_t srcImageArgNum = 1;
    auto image = std::unique_ptr<Image>(Image2dHelper<>::create(pContext));
    cl_mem srcImageArg = static_cast<cl_mem>(image.get());

    EBuiltInOps::Type vmeOps[] = {EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel, EBuiltInOps::vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel};
    for (auto op : vmeOps) {
        MultiDispatchInfo outMdi;
        overwriteBuiltInBinaryName("media_kernels_backend");
        BuiltinDispatchInfoBuilder &builder = Vme::getBuiltinDispatchInfoBuilder(op, *pClDevice);
        restoreBuiltInBinaryName();

        bool ret = builder.setExplicitArg(srcImageArgNum, sizeof(cl_mem), &srcImageArg, err);
        EXPECT_FALSE(ret);
        EXPECT_EQ(CL_SUCCESS, err);

        ret = builder.setExplicitArg(bufferArgNum, sizeof(cl_mem), &bufferArg, err);
        EXPECT_FALSE(ret);
        EXPECT_EQ(CL_SUCCESS, err);

        ret = builder.buildDispatchInfos(outMdi, mockKernel.mockKernel, 0, gws, elws, offset);
        EXPECT_TRUE(ret);
        EXPECT_EQ(1U, outMdi.size());

        auto outDi = outMdi.begin();
        EXPECT_EQ(Vec3<size_t>(352, 1, 1), outDi->getGWS());
        EXPECT_EQ(Vec3<size_t>(16, 1, 1), outDi->getEnqueuedWorkgroupSize());
        EXPECT_NE(mockKernel.mockKernel, outDi->getKernel());

        EXPECT_EQ(srcImageArg, outDi->getKernel()->getKernelArg(srcImageArgNum));

        uint32_t vmeImplicitArgsBase = outDi->getKernel()->getKernelInfo().getArgNumByName("intraSrcImg");
        uint32_t vmeImplicitArgs = 4;
        ASSERT_EQ(vmeImplicitArgsBase + vmeImplicitArgs, outDi->getKernel()->getKernelInfo().getExplicitArgs().size());
        EXPECT_EQ(srcImageArg, outDi->getKernel()->getKernelArg(vmeImplicitArgsBase));
        ++vmeImplicitArgsBase;
        --vmeImplicitArgs;
        uint32_t vmeExtraArgsExpectedVals[] = {18, 22, 18}; // height, width, stride
        for (uint32_t i = 0; i < vmeImplicitArgs; ++i) {
            auto &argAsVal = outDi->getKernel()->getKernelInfo().getArgDescriptorAt(vmeImplicitArgsBase + i).as<ArgDescValue>();
            EXPECT_EQ(vmeExtraArgsExpectedVals[i], *((uint32_t *)(outDi->getKernel()->getCrossThreadData() + argAsVal.elements[0].offset)));
        }
    }
}

TEST_F(VmeBuiltInTests, WhenGettingBuiltinAsStringThenCorrectStringIsReturned) {
    EXPECT_EQ(0, strcmp("aux_translation.builtin_kernel", getBuiltinAsString(EBuiltInOps::auxTranslation)));
    EXPECT_EQ(0, strcmp("copy_buffer_to_buffer.builtin_kernel", getBuiltinAsString(EBuiltInOps::copyBufferToBuffer)));
    EXPECT_EQ(0, strcmp("copy_buffer_rect.builtin_kernel", getBuiltinAsString(EBuiltInOps::copyBufferRect)));
    EXPECT_EQ(0, strcmp("fill_buffer.builtin_kernel", getBuiltinAsString(EBuiltInOps::fillBuffer)));
    EXPECT_EQ(0, strcmp("copy_buffer_to_image3d.builtin_kernel", getBuiltinAsString(EBuiltInOps::copyBufferToImage3d)));
    EXPECT_EQ(0, strcmp("copy_image3d_to_buffer.builtin_kernel", getBuiltinAsString(EBuiltInOps::copyImage3dToBuffer)));
    EXPECT_EQ(0, strcmp("copy_image_to_image1d.builtin_kernel", getBuiltinAsString(EBuiltInOps::copyImageToImage1d)));
    EXPECT_EQ(0, strcmp("copy_image_to_image2d.builtin_kernel", getBuiltinAsString(EBuiltInOps::copyImageToImage2d)));
    EXPECT_EQ(0, strcmp("copy_image_to_image3d.builtin_kernel", getBuiltinAsString(EBuiltInOps::copyImageToImage3d)));
    EXPECT_EQ(0, strcmp("copy_kernel_timestamps.builtin_kernel", getBuiltinAsString(EBuiltInOps::queryKernelTimestamps)));
    EXPECT_EQ(0, strcmp("fill_image1d.builtin_kernel", getBuiltinAsString(EBuiltInOps::fillImage1d)));
    EXPECT_EQ(0, strcmp("fill_image2d.builtin_kernel", getBuiltinAsString(EBuiltInOps::fillImage2d)));
    EXPECT_EQ(0, strcmp("fill_image3d.builtin_kernel", getBuiltinAsString(EBuiltInOps::fillImage3d)));
    EXPECT_EQ(0, strcmp("vme_block_motion_estimate_intel.builtin_kernel", getBuiltinAsString(EBuiltInOps::vmeBlockMotionEstimateIntel)));
    EXPECT_EQ(0, strcmp("vme_block_advanced_motion_estimate_check_intel.builtin_kernel", getBuiltinAsString(EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel)));
    EXPECT_EQ(0, strcmp("vme_block_advanced_motion_estimate_bidirectional_check_intel", getBuiltinAsString(EBuiltInOps::vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel)));
    EXPECT_EQ(0, strcmp("unknown", getBuiltinAsString(EBuiltInOps::count)));
}

TEST_F(BuiltInTests, GivenEncodeTypeWhenGettingExtensionThenCorrectStringIsReturned) {
    EXPECT_EQ(0, strcmp("", BuiltinCode::getExtension(BuiltinCode::ECodeType::any)));
    EXPECT_EQ(0, strcmp(".bin", BuiltinCode::getExtension(BuiltinCode::ECodeType::binary)));

@ -1670,9 +1498,6 @@ TEST_F(BuiltInTests, GivenBuiltinTypeSourceWhenGettingBuiltinResourceThenResourc
    EXPECT_NE(0u, mockBuiltinsLib->getBuiltinResource(EBuiltInOps::fillImage1d, BuiltinCode::ECodeType::source, *pDevice).size());
    EXPECT_NE(0u, mockBuiltinsLib->getBuiltinResource(EBuiltInOps::fillImage2d, BuiltinCode::ECodeType::source, *pDevice).size());
    EXPECT_NE(0u, mockBuiltinsLib->getBuiltinResource(EBuiltInOps::fillImage3d, BuiltinCode::ECodeType::source, *pDevice).size());
    EXPECT_NE(0u, mockBuiltinsLib->getBuiltinResource(EBuiltInOps::vmeBlockMotionEstimateIntel, BuiltinCode::ECodeType::source, *pDevice).size());
    EXPECT_NE(0u, mockBuiltinsLib->getBuiltinResource(EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel, BuiltinCode::ECodeType::source, *pDevice).size());
    EXPECT_NE(0u, mockBuiltinsLib->getBuiltinResource(EBuiltInOps::vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel, BuiltinCode::ECodeType::source, *pDevice).size());
    EXPECT_EQ(0u, mockBuiltinsLib->getBuiltinResource(EBuiltInOps::count, BuiltinCode::ECodeType::source, *pDevice).size());
}

@ -1690,9 +1515,6 @@ HWCMDTEST_F(IGFX_GEN8_CORE, BuiltInTests, GivenBuiltinTypeBinaryWhenGettingBuilt
    EXPECT_NE(0u, mockBuiltinsLib->getBuiltinResource(EBuiltInOps::fillImage1d, BuiltinCode::ECodeType::binary, *pDevice).size());
    EXPECT_NE(0u, mockBuiltinsLib->getBuiltinResource(EBuiltInOps::fillImage2d, BuiltinCode::ECodeType::binary, *pDevice).size());
    EXPECT_NE(0u, mockBuiltinsLib->getBuiltinResource(EBuiltInOps::fillImage3d, BuiltinCode::ECodeType::binary, *pDevice).size());
    EXPECT_EQ(0u, mockBuiltinsLib->getBuiltinResource(EBuiltInOps::vmeBlockMotionEstimateIntel, BuiltinCode::ECodeType::binary, *pDevice).size());
    EXPECT_EQ(0u, mockBuiltinsLib->getBuiltinResource(EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel, BuiltinCode::ECodeType::binary, *pDevice).size());
    EXPECT_EQ(0u, mockBuiltinsLib->getBuiltinResource(EBuiltInOps::vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel, BuiltinCode::ECodeType::binary, *pDevice).size());
    EXPECT_EQ(0u, mockBuiltinsLib->getBuiltinResource(EBuiltInOps::count, BuiltinCode::ECodeType::binary, *pDevice).size());
}

@ -1804,554 +1626,6 @@ TEST_F(BuiltInTests, GivenForce32bitWhenCreatingProgramThenCorrectKernelIsCreate
    const_cast<DeviceInfo *>(&pDevice->getDeviceInfo())->force32BitAddressess = force32BitAddressess;
}

TEST_F(BuiltInTests, GivenVmeKernelWhenGettingDeviceInfoThenCorrectVmeVersionIsReturned) {
    if (!pDevice->getHardwareInfo().capabilityTable.supportsVme) {
        GTEST_SKIP();
    }
    cl_uint param;
    auto ret = pClDevice->getDeviceInfo(CL_DEVICE_ME_VERSION_INTEL, sizeof(param), &param, nullptr);
    EXPECT_EQ(CL_SUCCESS, ret);
    EXPECT_EQ(static_cast<cl_uint>(CL_ME_VERSION_ADVANCED_VER_2_INTEL), param);
}

TEST_F(VmeBuiltInTests, WhenVmeKernelIsCreatedThenParamsAreCorrect) {
    overwriteBuiltInBinaryName("media_kernels_backend");
    BuiltInOp<EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel> vmeBuilder(*this->pBuiltIns, *pClDevice);
    restoreBuiltInBinaryName();

    cl_int err;

    {
        int32_t bufArgNum = 7;

        cl_mem mem = 0;
        vmeBuilder.setExplicitArg(bufArgNum, sizeof(cl_mem), &mem, err);
        EXPECT_TRUE(vmeBuilder.validateBufferSize(-1, 16));
        EXPECT_TRUE(vmeBuilder.validateBufferSize(bufArgNum, 16));

        MockBuffer mb;
        mem = &mb;
        vmeBuilder.setExplicitArg(bufArgNum, sizeof(cl_mem), &mem, err);
        EXPECT_TRUE(vmeBuilder.validateBufferSize(bufArgNum, mb.getSize()));
        EXPECT_TRUE(vmeBuilder.validateBufferSize(bufArgNum, mb.getSize() / 2));
        EXPECT_FALSE(vmeBuilder.validateBufferSize(bufArgNum, mb.getSize() * 2));

        mem = 0;
        vmeBuilder.setExplicitArg(bufArgNum, sizeof(cl_mem), &mem, err);
    }

    {
        EXPECT_TRUE(vmeBuilder.validateEnumVal(1, 1, 2, 3, 4));
        EXPECT_TRUE(vmeBuilder.validateEnumVal(1, 1));
        EXPECT_TRUE(vmeBuilder.validateEnumVal(3, 1, 2, 3));

        EXPECT_FALSE(vmeBuilder.validateEnumVal(1, 3, 4));
        EXPECT_FALSE(vmeBuilder.validateEnumVal(1));
        EXPECT_FALSE(vmeBuilder.validateEnumVal(1, 2));

        int32_t valArgNum = 3;
        uint32_t val = 7;
        vmeBuilder.setExplicitArg(valArgNum, sizeof(val), &val, err);
        EXPECT_FALSE(vmeBuilder.validateEnumArg<uint32_t>(valArgNum, 3));
        EXPECT_TRUE(vmeBuilder.validateEnumArg<uint32_t>(valArgNum, 7));

        val = 0;
        vmeBuilder.setExplicitArg(valArgNum, sizeof(val), &val, err);
    }

    {
        int32_t valArgNum = 3;
        uint32_t val = 7;
        vmeBuilder.setExplicitArg(valArgNum, sizeof(val), &val, err);
        EXPECT_EQ(val, vmeBuilder.getKernelArgByValValue<uint32_t>(valArgNum));
        val = 11;
        vmeBuilder.setExplicitArg(valArgNum, sizeof(val), &val, err);
        EXPECT_EQ(val, vmeBuilder.getKernelArgByValValue<uint32_t>(valArgNum));

        val = 0;
        vmeBuilder.setExplicitArg(valArgNum, sizeof(val), &val, err);
    }
}

TEST_F(VmeBuiltInTests, WhenVmeKernelIsCreatedThenDispatchIsBidirectional) {
    overwriteBuiltInBinaryName("media_kernels_backend");
    BuiltInOp<EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel> avmeBuilder(*this->pBuiltIns, *pClDevice);
    BuiltInOp<EBuiltInOps::vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel> avmeBidirBuilder(*this->pBuiltIns, *pClDevice);
    restoreBuiltInBinaryName();

    EXPECT_FALSE(avmeBuilder.isBidirKernel());
    EXPECT_TRUE(avmeBidirBuilder.isBidirKernel());
}

struct ImageVmeValidFormat : Image2dDefaults {
    static const cl_image_format imageFormat;
    static const cl_image_desc iamgeDesc;
};

const cl_image_format ImageVmeValidFormat::imageFormat = {
    CL_R,
    CL_UNORM_INT8};

const cl_image_desc ImageVmeValidFormat::iamgeDesc = {
    CL_MEM_OBJECT_IMAGE1D,
    8192,
    16,
    1,
    1,
    0,
    0,
    0,
    0,
    {nullptr}};

struct ImageVmeInvalidDataType : Image2dDefaults {
    static const cl_image_format imageFormat;
};

const cl_image_format ImageVmeInvalidDataType::imageFormat = {
    CL_R,
    CL_FLOAT};

struct ImageVmeInvalidChannelOrder : Image2dDefaults {
    static const cl_image_format imageFormat;
};

const cl_image_format ImageVmeInvalidChannelOrder::imageFormat = {
    CL_RGBA,
    CL_UNORM_INT8};

TEST_F(VmeBuiltInTests, WhenValidatingImagesThenCorrectResponses) {
    overwriteBuiltInBinaryName("media_kernels_backend");
    BuiltInOp<EBuiltInOps::vmeBlockMotionEstimateIntel> vmeBuilder(*this->pBuiltIns, *pClDevice);
    restoreBuiltInBinaryName();

    uint32_t srcImgArgNum = 1;
    uint32_t refImgArgNum = 2;

    cl_int err;
    { // validate images are not null
        std::unique_ptr<Image> image1(ImageHelper<ImageVmeValidFormat>::create(pContext));

        cl_mem srcImgMem = 0;
        EXPECT_EQ(CL_INVALID_KERNEL_ARGS, vmeBuilder.validateImages(Vec3<size_t>{3, 3, 0}, Vec3<size_t>{0, 0, 0}));

        srcImgMem = image1.get();
        vmeBuilder.setExplicitArg(srcImgArgNum, sizeof(srcImgMem), &srcImgMem, err);
        EXPECT_EQ(CL_INVALID_KERNEL_ARGS, vmeBuilder.validateImages(Vec3<size_t>{3, 3, 0}, Vec3<size_t>{0, 0, 0}));
    }

    { // validate image formats
        std::unique_ptr<Image> imageValid(ImageHelper<ImageVmeValidFormat>::create(pContext));
        std::unique_ptr<Image> imageInvalidDataType(ImageHelper<ImageVmeInvalidDataType>::create(pContext));
        std::unique_ptr<Image> imageChannelOrder(ImageHelper<ImageVmeInvalidChannelOrder>::create(pContext));

        Image *images[] = {imageValid.get(), imageInvalidDataType.get(), imageChannelOrder.get()};
        for (Image *srcImg : images) {
            for (Image *dstImg : images) {
                cl_mem srcImgMem = srcImg;
                cl_mem refImgMem = dstImg;
                vmeBuilder.setExplicitArg(srcImgArgNum, sizeof(srcImgMem), &srcImgMem, err);
                vmeBuilder.setExplicitArg(refImgArgNum, sizeof(refImgMem), &refImgMem, err);
                bool shouldSucceed = (srcImg == imageValid.get()) && (dstImg == imageValid.get());
                if (shouldSucceed) {
                    EXPECT_EQ(CL_SUCCESS, vmeBuilder.validateImages(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}));
                } else {
                    EXPECT_EQ(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR, vmeBuilder.validateImages(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}));
                }
            }
        }
    }

    { // validate image tiling
        std::unique_ptr<Image> imageValid(ImageHelper<ImageVmeValidFormat>::create(pContext));
        DebugManagerStateRestore restorer;
        debugManager.flags.ForceLinearImages.set(true);
        std::unique_ptr<Image> imageLinear(ImageHelper<ImageVmeValidFormat>::create(pContext));
        Image *images[] = {imageValid.get(), imageLinear.get()};
        for (Image *srcImg : images) {
            for (Image *dstImg : images) {
                cl_mem srcImgMem = srcImg;
                cl_mem refImgMem = dstImg;
                vmeBuilder.setExplicitArg(srcImgArgNum, sizeof(srcImgMem), &srcImgMem, err);
                vmeBuilder.setExplicitArg(refImgArgNum, sizeof(refImgMem), &refImgMem, err);
                bool shouldSucceed = (srcImg == imageValid.get()) && (dstImg == imageValid.get());
                if (shouldSucceed) {
                    EXPECT_EQ(CL_SUCCESS, vmeBuilder.validateImages(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}));
                } else {
                    EXPECT_EQ(CL_OUT_OF_RESOURCES, vmeBuilder.validateImages(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}));
                }
            }
        }
    }

    { // validate region size
        std::unique_ptr<Image> imageValid(ImageHelper<ImageVmeValidFormat>::create(pContext));
        cl_mem imgValidMem = imageValid.get();
        vmeBuilder.setExplicitArg(srcImgArgNum, sizeof(imgValidMem), &imgValidMem, err);
        vmeBuilder.setExplicitArg(refImgArgNum, sizeof(imgValidMem), &imgValidMem, err);

        EXPECT_EQ(CL_INVALID_IMAGE_SIZE, vmeBuilder.validateImages(Vec3<size_t>{imageValid->getImageDesc().image_width + 1, 1, 0}, Vec3<size_t>{0, 0, 0}));
        EXPECT_EQ(CL_INVALID_IMAGE_SIZE, vmeBuilder.validateImages(Vec3<size_t>{1, imageValid->getImageDesc().image_height + 1, 0}, Vec3<size_t>{0, 0, 0}));
    }
}

TEST_F(VmeBuiltInTests, WhenValidatingFlagsThenValidFlagCombinationsReturnTrue) {
    overwriteBuiltInBinaryName("media_kernels_backend");
    BuiltInOp<EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel> vmeBuilder(*this->pBuiltIns, *pClDevice);
    restoreBuiltInBinaryName();

    uint32_t defaultSkipBlockVal = 8192;
    uint32_t flagsArgNum = 3;

    std::tuple<uint32_t, bool, uint32_t> flagsToTest[] = {
        std::make_tuple(CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL, false, defaultSkipBlockVal),
        std::make_tuple(CL_ME_SKIP_BLOCK_TYPE_16x16_INTEL, true, CL_ME_MB_TYPE_16x16_INTEL),
        std::make_tuple(CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL, true, CL_ME_MB_TYPE_8x8_INTEL),
        std::make_tuple(defaultSkipBlockVal, true, defaultSkipBlockVal),
    };

    cl_int err;
    for (auto &conf : flagsToTest) {
        uint32_t skipBlock = defaultSkipBlockVal;
        vmeBuilder.setExplicitArg(flagsArgNum, sizeof(uint32_t), &std::get<0>(conf), err);
        bool validationResult = vmeBuilder.validateFlags(skipBlock);
        if (std::get<1>(conf)) {
            EXPECT_TRUE(validationResult);
        } else {
            EXPECT_FALSE(validationResult);
        }
        EXPECT_EQ(std::get<2>(conf), skipBlock);
    }
}

TEST_F(VmeBuiltInTests, WhenValidatingSkipBlockTypeThenCorrectResponses) {
    overwriteBuiltInBinaryName("media_kernels_backend");
    BuiltInOp<EBuiltInOps::vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel> avmeBidirectionalBuilder(*this->pBuiltIns, *pClDevice);
    BuiltInOp<EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel> avmeBuilder(*this->pBuiltIns, *pClDevice);
    restoreBuiltInBinaryName();

    cl_int err;
    uint32_t skipBlockTypeArgNum = 4;

    uint32_t skipBlockType = 8192;
    bool ret = avmeBidirectionalBuilder.validateSkipBlockTypeArg(skipBlockType);
    EXPECT_TRUE(ret);
    EXPECT_EQ(8192U, skipBlockType);

    skipBlockType = 8192U;
    avmeBuilder.setExplicitArg(skipBlockTypeArgNum, sizeof(uint32_t), &skipBlockType, err);
    ret = avmeBuilder.validateSkipBlockTypeArg(skipBlockType);
    EXPECT_FALSE(ret);

    skipBlockType = CL_ME_MB_TYPE_16x16_INTEL;
    avmeBuilder.setExplicitArg(skipBlockTypeArgNum, sizeof(uint32_t), &skipBlockType, err);
    skipBlockType = 8192U;
    ret = avmeBuilder.validateSkipBlockTypeArg(skipBlockType);
    EXPECT_TRUE(ret);
    EXPECT_EQ(static_cast<uint32_t>(CL_ME_MB_TYPE_16x16_INTEL), skipBlockType);

    skipBlockType = CL_ME_MB_TYPE_8x8_INTEL;
    avmeBuilder.setExplicitArg(skipBlockTypeArgNum, sizeof(uint32_t), &skipBlockType, err);
    skipBlockType = 8192U;
    ret = avmeBuilder.validateSkipBlockTypeArg(skipBlockType);
    EXPECT_TRUE(ret);
    EXPECT_EQ(static_cast<uint32_t>(CL_ME_MB_TYPE_8x8_INTEL), skipBlockType);
}

TEST_F(VmeBuiltInTests, GivenAcceleratorWhenExplicitlySettingArgThenFalseIsReturned) {
    overwriteBuiltInBinaryName("media_kernels_backend");

    BuiltInOp<EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel> vmeBuilder(*this->pBuiltIns, *pClDevice);
    restoreBuiltInBinaryName();

    cl_int err;
    uint32_t aceleratorArgNum = 0;
    bool ret = vmeBuilder.setExplicitArg(aceleratorArgNum, sizeof(cl_accelerator_intel), nullptr, err);
    EXPECT_FALSE(ret);
    EXPECT_EQ(CL_INVALID_ACCELERATOR_INTEL, err);

    cl_motion_estimation_desc_intel acceleratorDesc;
    acceleratorDesc.subpixel_mode = CL_ME_SUBPIXEL_MODE_INTEGER_INTEL;
    acceleratorDesc.sad_adjust_mode = CL_ME_SAD_ADJUST_MODE_NONE_INTEL;
    acceleratorDesc.search_path_type = CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL;
    acceleratorDesc.mb_block_type = CL_ME_MB_TYPE_16x16_INTEL;
    auto neoAccelerator = std::unique_ptr<VmeAccelerator>(VmeAccelerator::create(pContext, CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL, sizeof(acceleratorDesc), &acceleratorDesc, err));
    ASSERT_NE(nullptr, neoAccelerator.get());
    cl_accelerator_intel clAccel = neoAccelerator.get();
    ret = vmeBuilder.setExplicitArg(aceleratorArgNum, sizeof(cl_accelerator_intel), &clAccel, err);
    EXPECT_FALSE(ret);
    EXPECT_EQ(CL_SUCCESS, err);
}

TEST_F(VmeBuiltInTests, WhenValidatingDispatchThenCorrectReturns) {
    overwriteBuiltInBinaryName("media_kernels_backend");
    struct MockVmeBuilder : BuiltInOp<EBuiltInOps::vmeBlockMotionEstimateIntel> {
        using BuiltInOp<EBuiltInOps::vmeBlockMotionEstimateIntel>::BuiltInOp;

        cl_int validateVmeDispatch(const Vec3<size_t> &inputRegion, const Vec3<size_t> &offset, size_t blkNum, size_t blkMul) const override {
            receivedInputRegion = inputRegion;
            receivedOffset = offset;
            receivedBlkNum = blkNum;
            receivedBlkMul = blkMul;
            wasValidateVmeDispatchCalled = true;
            return valueToReturn;
        }

        mutable bool wasValidateVmeDispatchCalled = false;
        mutable Vec3<size_t> receivedInputRegion = {0, 0, 0};
        mutable Vec3<size_t> receivedOffset = {0, 0, 0};
        mutable size_t receivedBlkNum = 0;
        mutable size_t receivedBlkMul = 0;
        mutable cl_int valueToReturn = CL_SUCCESS;
    };
|
||||
|
||||
uint32_t aaceleratorArgNum = 0;
|
||||
MockVmeBuilder vmeBuilder(*this->pBuiltIns, *pClDevice);
|
||||
restoreBuiltInBinaryName();
|
||||
|
||||
cl_int ret = vmeBuilder.validateDispatch(nullptr, 1, Vec3<size_t>{16, 16, 0}, Vec3<size_t>{16, 1, 0}, Vec3<size_t>{0, 0, 0});
|
||||
EXPECT_EQ(CL_INVALID_WORK_DIMENSION, ret);
|
||||
|
||||
ret = vmeBuilder.validateDispatch(nullptr, 3, Vec3<size_t>{16, 16, 0}, Vec3<size_t>{16, 1, 0}, Vec3<size_t>{0, 0, 0});
|
||||
EXPECT_EQ(CL_INVALID_WORK_DIMENSION, ret);
|
||||
|
||||
ret = vmeBuilder.validateDispatch(nullptr, 2, Vec3<size_t>{16, 16, 0}, Vec3<size_t>{16, 1, 0}, Vec3<size_t>{0, 0, 0});
|
||||
EXPECT_EQ(CL_INVALID_KERNEL_ARGS, ret); // accelerator not set
|
||||
EXPECT_FALSE(vmeBuilder.wasValidateVmeDispatchCalled);
|
||||
|
||||
cl_int err;
|
||||
cl_motion_estimation_desc_intel acceleratorDesc;
|
||||
acceleratorDesc.subpixel_mode = CL_ME_SUBPIXEL_MODE_INTEGER_INTEL;
|
||||
acceleratorDesc.sad_adjust_mode = CL_ME_SAD_ADJUST_MODE_NONE_INTEL;
|
||||
acceleratorDesc.search_path_type = CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL;
|
||||
|
||||
Vec3<size_t> gws{16, 16, 0};
|
||||
Vec3<size_t> lws{16, 1, 0};
|
||||
Vec3<size_t> off{0, 0, 0};
|
||||
size_t gwWidthInBlk = 0;
|
||||
size_t gwHeightInBlk = 0;
|
||||
vmeBuilder.getBlkTraits(gws, gwWidthInBlk, gwHeightInBlk);
|
||||
|
||||
{
|
||||
acceleratorDesc.mb_block_type = CL_ME_MB_TYPE_16x16_INTEL;
|
||||
auto neoAccelerator = std::unique_ptr<VmeAccelerator>(VmeAccelerator::create(pContext, CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL, sizeof(acceleratorDesc), &acceleratorDesc, err));
|
||||
ASSERT_NE(nullptr, neoAccelerator.get());
|
||||
cl_accelerator_intel clAccel = neoAccelerator.get();
|
||||
vmeBuilder.setExplicitArg(aaceleratorArgNum, sizeof(clAccel), &clAccel, err);
|
||||
vmeBuilder.wasValidateVmeDispatchCalled = false;
|
||||
auto ret = vmeBuilder.validateDispatch(nullptr, 2, gws, lws, off);
|
||||
EXPECT_EQ(CL_SUCCESS, ret);
|
||||
EXPECT_TRUE(vmeBuilder.wasValidateVmeDispatchCalled);
|
||||
EXPECT_EQ(gws, vmeBuilder.receivedInputRegion);
|
||||
EXPECT_EQ(off, vmeBuilder.receivedOffset);
|
||||
|
||||
EXPECT_EQ(gwWidthInBlk * gwHeightInBlk, vmeBuilder.receivedBlkNum);
|
||||
EXPECT_EQ(1U, vmeBuilder.receivedBlkMul);
|
||||
}
|
||||
|
||||
{
|
||||
acceleratorDesc.mb_block_type = CL_ME_MB_TYPE_4x4_INTEL;
|
||||
auto neoAccelerator = std::unique_ptr<VmeAccelerator>(VmeAccelerator::create(pContext, CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL, sizeof(acceleratorDesc), &acceleratorDesc, err));
|
||||
ASSERT_NE(nullptr, neoAccelerator.get());
|
||||
cl_accelerator_intel clAccel = neoAccelerator.get();
|
||||
vmeBuilder.setExplicitArg(aaceleratorArgNum, sizeof(clAccel), &clAccel, err);
|
||||
vmeBuilder.wasValidateVmeDispatchCalled = false;
|
||||
auto ret = vmeBuilder.validateDispatch(nullptr, 2, gws, lws, off);
|
||||
EXPECT_EQ(CL_SUCCESS, ret);
|
||||
EXPECT_TRUE(vmeBuilder.wasValidateVmeDispatchCalled);
|
||||
EXPECT_EQ(gws, vmeBuilder.receivedInputRegion);
|
||||
EXPECT_EQ(off, vmeBuilder.receivedOffset);
|
||||
|
||||
EXPECT_EQ(gwWidthInBlk * gwHeightInBlk, vmeBuilder.receivedBlkNum);
|
||||
EXPECT_EQ(16U, vmeBuilder.receivedBlkMul);
|
||||
}
|
||||
|
||||
{
|
||||
acceleratorDesc.mb_block_type = CL_ME_MB_TYPE_8x8_INTEL;
|
||||
auto neoAccelerator = std::unique_ptr<VmeAccelerator>(VmeAccelerator::create(pContext, CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL, sizeof(acceleratorDesc), &acceleratorDesc, err));
|
||||
ASSERT_NE(nullptr, neoAccelerator.get());
|
||||
cl_accelerator_intel clAccel = neoAccelerator.get();
|
||||
vmeBuilder.setExplicitArg(aaceleratorArgNum, sizeof(clAccel), &clAccel, err);
|
||||
vmeBuilder.wasValidateVmeDispatchCalled = false;
|
||||
vmeBuilder.valueToReturn = 37;
|
||||
auto ret = vmeBuilder.validateDispatch(nullptr, 2, gws, lws, off);
|
||||
EXPECT_EQ(37, ret);
|
||||
EXPECT_TRUE(vmeBuilder.wasValidateVmeDispatchCalled);
|
||||
EXPECT_EQ(gws, vmeBuilder.receivedInputRegion);
|
||||
EXPECT_EQ(off, vmeBuilder.receivedOffset);
|
||||
|
||||
EXPECT_EQ(gwWidthInBlk * gwHeightInBlk, vmeBuilder.receivedBlkNum);
|
||||
EXPECT_EQ(4U, vmeBuilder.receivedBlkMul);
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(VmeBuiltInTests, WhenValidatingVmeDispatchThenCorrectReturns) {
|
||||
overwriteBuiltInBinaryName("media_kernels_backend");
|
||||
BuiltInOp<EBuiltInOps::vmeBlockMotionEstimateIntel> vmeBuilder(*this->pBuiltIns, *pClDevice);
|
||||
restoreBuiltInBinaryName();
|
||||
|
||||
cl_int err;
|
||||
|
||||
// images not set
|
||||
EXPECT_EQ(CL_INVALID_KERNEL_ARGS, vmeBuilder.validateVmeDispatch(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}, 64, 1));
|
||||
|
||||
uint32_t srcImgArgNum = 1;
|
||||
uint32_t refImgArgNum = 2;
|
||||
|
||||
std::unique_ptr<Image> imageValid(ImageHelper<ImageVmeValidFormat>::create(pContext));
|
||||
cl_mem srcImgMem = imageValid.get();
|
||||
|
||||
vmeBuilder.setExplicitArg(srcImgArgNum, sizeof(srcImgMem), &srcImgMem, err);
|
||||
vmeBuilder.setExplicitArg(refImgArgNum, sizeof(srcImgMem), &srcImgMem, err);
|
||||
|
||||
// null buffers are valid
|
||||
EXPECT_EQ(CL_SUCCESS, vmeBuilder.validateVmeDispatch(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}, 64, 1));
|
||||
|
||||
// too small buffers should fail
|
||||
MockBuffer mb;
|
||||
cl_mem mem = &mb;
|
||||
|
||||
uint32_t predictionMotionVectorBufferArgNum = 3;
|
||||
uint32_t motionVectorBufferArgNum = 4;
|
||||
uint32_t residualsBufferArgNum = 5;
|
||||
for (uint32_t argNum : {predictionMotionVectorBufferArgNum, motionVectorBufferArgNum, residualsBufferArgNum}) {
|
||||
EXPECT_EQ(CL_SUCCESS, vmeBuilder.validateVmeDispatch(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}, mb.getSize() * 2, 1));
|
||||
vmeBuilder.setExplicitArg(argNum, sizeof(cl_mem), &mem, err);
|
||||
EXPECT_EQ(CL_INVALID_BUFFER_SIZE, vmeBuilder.validateVmeDispatch(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}, mb.getSize() * 2, 1));
|
||||
vmeBuilder.setExplicitArg(argNum, sizeof(cl_mem), nullptr, err);
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(VmeBuiltInTests, GivenAdvancedVmeWhenValidatingVmeDispatchThenCorrectReturns) {
|
||||
overwriteBuiltInBinaryName("media_kernels_backend");
|
||||
BuiltInOp<EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel> avmeBuilder(*this->pBuiltIns, *pClDevice);
|
||||
restoreBuiltInBinaryName();
|
||||
|
||||
cl_int err;
|
||||
// images not set
|
||||
ASSERT_EQ(CL_INVALID_KERNEL_ARGS, avmeBuilder.VmeBuiltinDispatchInfoBuilder::validateVmeDispatch(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}, 64, 1));
|
||||
EXPECT_EQ(CL_INVALID_KERNEL_ARGS, avmeBuilder.validateVmeDispatch(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}, 64, 1));
|
||||
|
||||
uint32_t srcImgArgNum = 1;
|
||||
uint32_t refImgArgNum = 2;
|
||||
|
||||
std::unique_ptr<Image> imageValid(ImageHelper<ImageVmeValidFormat>::create(pContext));
|
||||
cl_mem srcImgMem = imageValid.get();
|
||||
|
||||
avmeBuilder.setExplicitArg(srcImgArgNum, sizeof(srcImgMem), &srcImgMem, err);
|
||||
avmeBuilder.setExplicitArg(refImgArgNum, sizeof(srcImgMem), &srcImgMem, err);
|
||||
|
||||
ASSERT_EQ(CL_SUCCESS, avmeBuilder.VmeBuiltinDispatchInfoBuilder::validateVmeDispatch(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}, 64, 1));
|
||||
|
||||
uint32_t flagsArgNum = 3;
|
||||
uint32_t val = CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL;
|
||||
avmeBuilder.setExplicitArg(flagsArgNum, sizeof(val), &val, err);
|
||||
EXPECT_EQ(CL_INVALID_KERNEL_ARGS, avmeBuilder.validateVmeDispatch(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}, 64, 1));
|
||||
val = CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL;
|
||||
avmeBuilder.setExplicitArg(flagsArgNum, sizeof(val), &val, err);
|
||||
|
||||
uint32_t skipBlockTypeArgNum = 4;
|
||||
val = 8192;
|
||||
avmeBuilder.setExplicitArg(skipBlockTypeArgNum, sizeof(uint32_t), &val, err);
|
||||
EXPECT_EQ(CL_OUT_OF_RESOURCES, avmeBuilder.validateVmeDispatch(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}, 64, 1));
|
||||
val = CL_ME_MB_TYPE_16x16_INTEL;
|
||||
avmeBuilder.setExplicitArg(skipBlockTypeArgNum, sizeof(uint32_t), &val, err);
|
||||
|
||||
uint32_t searchCostPenaltyArgNum = 5;
|
||||
val = 8192;
|
||||
avmeBuilder.setExplicitArg(searchCostPenaltyArgNum, sizeof(uint32_t), &val, err);
|
||||
EXPECT_EQ(CL_OUT_OF_RESOURCES, avmeBuilder.validateVmeDispatch(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}, 64, 1));
|
||||
val = CL_ME_COST_PENALTY_NONE_INTEL;
|
||||
avmeBuilder.setExplicitArg(searchCostPenaltyArgNum, sizeof(uint32_t), &val, err);
|
||||
|
||||
uint32_t searchCostPrecisionArgNum = 6;
|
||||
val = 8192;
|
||||
avmeBuilder.setExplicitArg(searchCostPrecisionArgNum, sizeof(uint32_t), &val, err);
|
||||
EXPECT_EQ(CL_OUT_OF_RESOURCES, avmeBuilder.validateVmeDispatch(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}, 64, 1));
|
||||
val = CL_ME_COST_PRECISION_QPEL_INTEL;
|
||||
avmeBuilder.setExplicitArg(searchCostPrecisionArgNum, sizeof(uint32_t), &val, err);
|
||||
|
||||
// for non-bidirectional avme kernel, countMotionVectorBuffer must be set
|
||||
uint32_t countMotionVectorBufferArgNum = 7;
|
||||
EXPECT_EQ(CL_INVALID_BUFFER_SIZE, avmeBuilder.validateVmeDispatch(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}, 64, 1));
|
||||
|
||||
MockBuffer mb;
|
||||
cl_mem mem = &mb;
|
||||
avmeBuilder.setExplicitArg(countMotionVectorBufferArgNum, sizeof(cl_mem), &mem, err);
|
||||
EXPECT_EQ(CL_SUCCESS, avmeBuilder.validateVmeDispatch(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}, 1, 1));
|
||||
}
|
||||
|
||||
TEST_F(VmeBuiltInTests, GivenAdvancedBidirectionalVmeWhenValidatingVmeDispatchThenCorrectReturns) {
|
||||
overwriteBuiltInBinaryName("media_kernels_backend");
|
||||
BuiltInOp<EBuiltInOps::vmeBlockAdvancedMotionEstimateBidirectionalCheckIntel> avmeBuilder(*this->pBuiltIns, *pClDevice);
|
||||
restoreBuiltInBinaryName();
|
||||
|
||||
cl_int err;
|
||||
uint32_t srcImgArgNum = 1;
|
||||
uint32_t refImgArgNum = 2;
|
||||
|
||||
std::unique_ptr<Image> imageValid(ImageHelper<ImageVmeValidFormat>::create(pContext));
|
||||
cl_mem srcImgMem = imageValid.get();
|
||||
|
||||
avmeBuilder.setExplicitArg(srcImgArgNum, sizeof(srcImgMem), &srcImgMem, err);
|
||||
avmeBuilder.setExplicitArg(refImgArgNum, sizeof(srcImgMem), &srcImgMem, err);
|
||||
|
||||
ASSERT_EQ(CL_SUCCESS, avmeBuilder.VmeBuiltinDispatchInfoBuilder::validateVmeDispatch(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}, 64, 1));
|
||||
|
||||
uint32_t flagsArgNum = 6;
|
||||
uint32_t val = CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL;
|
||||
avmeBuilder.setExplicitArg(flagsArgNum, sizeof(val), &val, err);
|
||||
|
||||
uint32_t searchCostPenaltyArgNum = 7;
|
||||
val = CL_ME_COST_PENALTY_NONE_INTEL;
|
||||
avmeBuilder.setExplicitArg(searchCostPenaltyArgNum, sizeof(uint32_t), &val, err);
|
||||
|
||||
uint32_t searchCostPrecisionArgNum = 8;
|
||||
val = CL_ME_COST_PRECISION_QPEL_INTEL;
|
||||
avmeBuilder.setExplicitArg(searchCostPrecisionArgNum, sizeof(uint32_t), &val, err);
|
||||
|
||||
uint32_t bidirWeightArgNum = 10;
|
||||
val = 255;
|
||||
avmeBuilder.setExplicitArg(bidirWeightArgNum, sizeof(uint8_t), &val, err);
|
||||
EXPECT_EQ(CL_INVALID_KERNEL_ARGS, avmeBuilder.validateVmeDispatch(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}, 64, 1));
|
||||
val = CL_ME_BIDIR_WEIGHT_QUARTER_INTEL;
|
||||
avmeBuilder.setExplicitArg(bidirWeightArgNum, sizeof(uint8_t), &val, err);
|
||||
|
||||
EXPECT_EQ(CL_SUCCESS, avmeBuilder.validateVmeDispatch(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}, 64, 1));
|
||||
|
||||
// test bufferSize checking
|
||||
uint32_t countMotionVectorBufferArgNum = 11;
|
||||
MockBuffer mb;
|
||||
cl_mem mem = &mb;
|
||||
avmeBuilder.setExplicitArg(countMotionVectorBufferArgNum, sizeof(cl_mem), &mem, err);
|
||||
EXPECT_EQ(CL_SUCCESS, avmeBuilder.validateVmeDispatch(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}, 1, 1));
|
||||
EXPECT_EQ(CL_INVALID_BUFFER_SIZE, avmeBuilder.validateVmeDispatch(Vec3<size_t>{1, 1, 0}, Vec3<size_t>{0, 0, 0}, mb.getSize() * 2, 1));
|
||||
}
|
||||
|
||||
TEST_F(VmeBuiltInTests, GivenAdvancedVmeWhenGettingSkipResidualsBuffExpSizeThenDefaultSizeIsReturned) {
|
||||
overwriteBuiltInBinaryName("media_kernels_backend");
|
||||
BuiltInOp<EBuiltInOps::vmeBlockAdvancedMotionEstimateCheckIntel> vmeBuilder(*this->pBuiltIns, *pClDevice);
|
||||
restoreBuiltInBinaryName();
|
||||
|
||||
auto size16x16 = vmeBuilder.getSkipResidualsBuffExpSize(CL_ME_MB_TYPE_16x16_INTEL, 4);
|
||||
auto sizeDefault = vmeBuilder.getSkipResidualsBuffExpSize(8192, 4);
|
||||
EXPECT_EQ(size16x16, sizeDefault);
|
||||
}
|
||||
|
||||
TEST_F(BuiltInTests, GivenInvalidBuiltinKernelNameWhenCreatingBuiltInProgramThenInvalidValueErrorIsReturned) {
|
||||
const char *kernelNames = "invalid_kernel";
|
||||
cl_int retVal = CL_SUCCESS;
|
||||
|
||||
cl_program program = Vme::createBuiltInProgram(
|
||||
*pContext,
|
||||
pContext->getDevices(),
|
||||
kernelNames,
|
||||
retVal);
|
||||
|
||||
EXPECT_EQ(CL_INVALID_VALUE, retVal);
|
||||
EXPECT_EQ(nullptr, program);
|
||||
}
|
||||
|
||||
TEST_F(BuiltInTests, WhenGettingSipKernelThenReturnProgramCreatedFromIsaAcquiredThroughCompilerInterface) {
|
||||
auto mockCompilerInterface = new MockCompilerInterface();
|
||||
pDevice->getExecutionEnvironment()->rootDeviceEnvironments[rootDeviceIndex]->compilerInterface.reset(mockCompilerInterface);
|
||||
|
|
|
@ -1,962 +0,0 @@
/*
 * Copyright (C) 2018-2021 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

// VME KERNELS
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
block_motion_estimate_intel(sampler_t accelerator, __read_only image2d_t srcImg,
                            __read_only image2d_t refImg,
                            __global short2 *prediction_motion_vector_buffer,
                            __global short2 *motion_vector_buffer,
                            __global ushort *residuals, int height, int width,
                            int stride) {
  __local uint dst[64];
  __local ushort *dist = (__local ushort *)&dst[8 * 5];

  int sid_0 = stride * get_group_id(0);
  int gid_0 = sid_0 / height;
  int gid_1 = sid_0 % height;
  for (int sid = sid_0; sid < sid_0 + stride && gid_0 < width && gid_1 < height;
       sid++, gid_0 = sid / height, gid_1 = sid % height) {
    int2 srcCoord = 0;
    int2 refCoord = 0;

    srcCoord.x = gid_0 * 16 + get_global_offset(0);
    srcCoord.y = gid_1 * 16 + get_global_offset(1);

    short2 predMV = 0;

#ifndef HW_NULL_CHECK
    if (prediction_motion_vector_buffer != NULL)
#endif
    {
      predMV = prediction_motion_vector_buffer[gid_0 + gid_1 * width];
      refCoord.x = predMV.x / 4;
      refCoord.y = predMV.y / 4;
      refCoord.y = refCoord.y & 0xFFFE;
    }

    {
      intel_work_group_vme_mb_query(dst, srcCoord, refCoord, srcImg, refImg,
                                    accelerator);
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Write Out Result

    // 4x4
    if (intel_get_accelerator_mb_block_type(accelerator) == 0x2) {
      int x = get_local_id(0) % 4;
      int y = get_local_id(0) / 4;
      int index = (gid_0 * 4 + x) + (gid_1 * 4 + y) * width * 4;

      short2 val = as_short2(dst[8 + (y * 4 + x) * 2]);
      motion_vector_buffer[index] = val;

#ifndef HW_NULL_CHECK
      if (residuals != NULL)
#endif
      {
        residuals[index] = dist[y * 4 + x];
      }
    }

    // 8x8
    if (intel_get_accelerator_mb_block_type(accelerator) == 0x1) {
      if (get_local_id(0) < 4) {
        int x = get_local_id(0) % 2;
        int y = get_local_id(0) / 2;
        int index = (gid_0 * 2 + x) + (gid_1 * 2 + y) * width * 2;
        short2 val = as_short2(dst[8 + (y * 2 + x) * 8]);
        motion_vector_buffer[index] = val;

#ifndef HW_NULL_CHECK
        if (residuals != NULL)
#endif
        {
          residuals[index] = dist[(y * 2 + x) * 4];
        }
      }
    }

    // 16x16
    if (intel_get_accelerator_mb_block_type(accelerator) == 0x0) {
      if (get_local_id(0) == 0) {
        int index = gid_0 + gid_1 * width;

        short2 val = as_short2(dst[8]);
        motion_vector_buffer[index] = val;

#ifndef HW_NULL_CHECK
        if (residuals != NULL)
#endif
        {
          residuals[index] = dist[0];
        }
      }
    }
  }
}

__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
block_advanced_motion_estimate_check_intel(
    sampler_t accelerator, __read_only image2d_t srcImg,
    __read_only image2d_t refImg, uint flags, uint skip_block_type,
    uint search_cost_penalty, uint search_cost_precision,
    __global short2 *count_motion_vector_buffer,
    __global short2 *predictors_buffer,
    __global short2 *skip_motion_vector_buffer,
    __global short2 *motion_vector_buffer,
    __global char *intra_search_predictor_modes, __global ushort *residuals,
    __global ushort *skip_residuals, __global ushort *intra_residuals,
    __read_only image2d_t intraSrcImg, int height, int width, int stride) {
  __local uint dstSearch[64];         // 8 GRFs
  __local uint dstSkipIntra[64 + 24]; // 11 GRFs (8 for inter, 3 for intra)

  __local ushort *distSearch =
      (__local ushort *)&dstSearch[8 * 5]; // distortion in the 6th GRF

  // Initialize the MV cost table:
  // MV Cost in U4U4 format:
  // No cost : 0, 0, 0, 0, 0, 0, 0, 0
  // Low Cost : 1, 4, 5, 9, 10, 12, 14, 15
  // Normal Cost: 5, 26, 29, 43, 45, 47, 57, 57
  // High Cost : 29, 61, 72, 78, 88, 89, 91, 92

  uint2 MVCostTable;
  if (search_cost_penalty == 1) {
    MVCostTable.s0 = 0x09050401;
    MVCostTable.s1 = 0x0F0E0C0A;
  } else if (search_cost_penalty == 2) {
    MVCostTable.s0 = 0x2B1D1A05;
    MVCostTable.s1 = 0x39392F2D;
  } else if (search_cost_penalty == 3) {
    MVCostTable.s0 = 0x4E483D1D;
    MVCostTable.s1 = 0x5C5B5958;
  } else {
    MVCostTable.s0 = 0;
    MVCostTable.s1 = 0;
  }

  uint MVCostPrecision = ((uint)search_cost_precision) << 16;
  // Frame is divided into rows * columns of MBs.
  // One h/w thread per WG.
  // One WG processes 'row' MBs - one row per iteration and one MB per row.
  // Number of WGs (or h/w threads) is number of columns MBs
  // Each iteration processes the MB in a row - gid_0 is the MB id in a row and
  // gid_1 is the row offset.

  int sid_0 = stride * get_group_id(0);
  int gid_0 = sid_0 / height;
  int gid_1 = sid_0 % height;
  for (int sid = sid_0; sid < sid_0 + stride && gid_0 < width && gid_1 < height;
       sid++, gid_0 = sid / height, gid_1 = sid % height) {
    int2 srcCoord;

    srcCoord.x = gid_0 * 16 +
                 get_global_offset(0); // 16 pixels wide MBs (globally scalar)
    srcCoord.y = gid_1 * 16 +
                 get_global_offset(1); // 16 pixels tall MBs (globally scalar)

    uint curMB = gid_0 + gid_1 * width; // current MB id
    short2 count = count_motion_vector_buffer[curMB];

    int countPredMVs = count.x;
    if (countPredMVs != 0) {
      uint offset = curMB * 8;       // 8 predictors per MB
      offset += get_local_id(0) % 8; // 16 work-items access 8 MVs for MB
                                     // one predictor for MB per SIMD channel

      // Reduce predictors from Q-pixel to integer precision.

      int2 predMV = 0;
      if (get_local_id(0) < countPredMVs) {
        predMV =
            convert_int2(predictors_buffer[offset]); // one MV per work-item
        predMV.x /= 4;
        predMV.y /= 4;
        predMV.y &= 0xFFFE;
      }

      // Do up to 8 IMEs, get the best MVs and their distortions, and optionally
      // a FBR of the best MVs.
      // Finally the results are written out to SLM.

      intel_work_group_vme_mb_multi_query_8(
          dstSearch,       // best search MV and its distortions into SLM
          countPredMVs,    // count of predictor MVs (globally scalar - value range
                           // 1 to 8)
          MVCostPrecision, // MV cost precision
          MVCostTable,     // MV cost table
          srcCoord,        // MB 2-D offset (globally scalar)
          predMV,          // predictor MVs (up to 8 distinct MVs for SIMD16 thread)
          srcImg,          // source
          refImg,          // reference
          accelerator);    // vme object
    }

    int doIntra = (flags & 0x2) != 0;
    int intraEdges = 0;
    if (doIntra) {
      // Enable all edges by default.
      intraEdges = 0x3C;
      // If this is a left-edge MB, then disable left edges.
      if ((gid_0 == 0) & (get_global_offset(0) == 0)) {
        intraEdges &= 0x14;
      }
      // If this is a right edge MB then disable right edges.
      if (gid_0 == width - 1) {
        intraEdges &= 0x38;
      }
      // If this is a top-edge MB, then disable top edges.
      if ((gid_1 == 0) & (get_global_offset(1) == 0)) {
        intraEdges &= 0x20;
      }
      // Set bit6=bit5.
      intraEdges |= ((intraEdges & 0x20) << 1);
      intraEdges <<= 8;
    }
    int countSkipMVs = count.y;
    if (countSkipMVs != 0 || doIntra == true) {
      uint offset = curMB * 8; // 8 sets of skip check MVs per MB
      offset +=
          (get_local_id(0) % 8); // 16 work-items access 8 sets of MVs for MB
                                 // one set of skip MV per SIMD channel

      // Do up to 8 skip checks and get the distortions for each of them.
      // Finally the results are written out to SLM.

      if ((skip_block_type == 0x0) | ((doIntra) & (countSkipMVs == 0))) {
        int skipMVs = 0;
        if (get_local_id(0) < countSkipMVs) {
          __global int *skip1_motion_vector_buffer =
              (__global int *)skip_motion_vector_buffer;
          skipMVs = skip1_motion_vector_buffer[offset]; // one packed MV for one
                                                        // work-item
        }
        intel_work_group_vme_mb_multi_check_16x16(
            dstSkipIntra, // distortions into SLM
            countSkipMVs, // count of skip check MVs (value range 0 to 8)
            doIntra,      // compute intra modes
            intraEdges,   // intra edges to use
            srcCoord,     // MB 2-D offset (globally scalar)
            skipMVs,      // skip check MVs (up to 8 sets of skip check MVs for
                          // SIMD16 thread)
            srcImg,       // source
            refImg,       // reference
            intraSrcImg,  // intra source
            accelerator);
      }

      if ((skip_block_type == 0x1) & (countSkipMVs > 0)) {
        int4 skipMVs = 0;
        if (get_local_id(0) < countSkipMVs) {
          __global int4 *skip4_motion_vector_buffer =
              (__global int4 *)(skip_motion_vector_buffer);
          skipMVs = skip4_motion_vector_buffer[offset]; // four component MVs
                                                        // per work-item
        }
        intel_work_group_vme_mb_multi_check_8x8(
            dstSkipIntra, // distortions into SLM
            countSkipMVs, // count of skip check MVs per MB (value range 0 to 8)
            doIntra,      // compute intra modes
            intraEdges,   // intra edges to use
            srcCoord,     // MB 2-D offset (globally scalar)
            skipMVs,      // skip check MVs (up to 8 sets of skip check MVs for
                          // SIMD16 thread)
            srcImg,       // source
            refImg,       // reference
            intraSrcImg,  // intra source
            accelerator);
      }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // Write Out motion estimation result:
    // Result format
    // Hierarchical row-major layout
    // i.e. row-major of blocks MVs in MBs, and row-major of 8 sets of
    // MVs/distortion in blocks

    if (countPredMVs != 0) {
      // 4x4
      if (intel_get_accelerator_mb_block_type(accelerator) == 0x2) {
        int index = (gid_0 * 16 + get_local_id(0)) + (gid_1 * 16 * width);

        // 1. 16 work-items enabled.
        // 2. Work-items gather fwd MVs in strided dword locations 0, 2, .., 30
        //    (interleaved fwd/bdw MVs) with constant offset 8 (control data
        //    size) from SLM into contiguous short2 locations 0, 1, .., 15 of
        //    global buffer search_motion_vector_buffer with offset index.
        // 3. Work-items gather contiguous ushort locations 0, 1, .., 15 from
        //    distSearch into contiguous ushort locations 0, 1, .., 15 of
        //    search_residuals with offset index.

        short2 val = as_short2(dstSearch[8 + get_local_id(0) * 2]);
        motion_vector_buffer[index] = val;

#ifndef HW_NULL_CHECK
        if (residuals != NULL)
#endif
        {
          residuals[index] = distSearch[get_local_id(0)];
        }
      }

      // 8x8
      else if (intel_get_accelerator_mb_block_type(accelerator) == 0x1) {
        // Only the 1st 4 work-items are needed.
        if (get_local_id(0) < 4) {
          int index = (gid_0 * 4 + get_local_id(0)) + (gid_1 * 4 * width);

          // 1. 4 work-items enabled.
          // 2. Work-items gather fwd MVs in strided dword locations 0, 8, 16, 24
          //    (interleaved fwd/bdw MVs) with constant offset 8 from SLM into
          //    contiguous short2 locations 0, 1, .., 15 of global buffer
          //    search_motion_vector_buffer with offset index.
          // 3. Work-items gather strided ushort locations 0, 4, 8, 12 from
          //    distSearch into contiguous ushort locations 0, 1, .., 15 of
          //    search_residuals with offset index.

          short2 val = as_short2(dstSearch[8 + get_local_id(0) * 4 * 2]);
          motion_vector_buffer[index] = val;

#ifndef HW_NULL_CHECK
          if (residuals != NULL)
#endif
          {
            residuals[index] = distSearch[get_local_id(0) * 4];
          }
        }
      }

      // 16x16
      else if (intel_get_accelerator_mb_block_type(accelerator) == 0x0) {
        // Only the 1st work-item is needed.
        if (get_local_id(0) == 0) {
          int index = gid_0 + gid_1 * width;

          // 1. 1 work-item enabled.
          // 2. Work-item gathers fwd MV in dword location 0 with constant
          //    offset 8 from SLM into short2 location 0 of global buffer
          //    search_motion_vector_buffer.
          // 3. Work-item gathers ushort location 0 from distSearch into ushort
          //    location 0 of search_residuals with offset index.

          short2 val = as_short2(dstSearch[8]);
          motion_vector_buffer[index] = val;

#ifndef HW_NULL_CHECK
          if (residuals != NULL)
#endif
          {
            residuals[index] = distSearch[0];
          }
        }
      }
    }

    // Write out motion skip check result:
    // Result format
    // Hierarchical row-major layout
    // i.e. row-major of blocks in MBs, and row-major of 8 sets of
    // distortions in blocks

    if (countSkipMVs != 0) {
      if (skip_block_type == 0x0) {
        // Copy out 8 (1 component) sets of distortion values.

        int index = (gid_0 * 8) + (get_local_id(0)) + (gid_1 * 8 * width);

        if (get_local_id(0) < countSkipMVs) {
          __local ushort *distSkip = (__local ushort *)&dstSkipIntra[0];

          // 1. Up to 8 work-items are enabled.
          // 2. The work-item gathers distSkip locations 0, 16*1, .., 16*7 and
          //    copies them to contiguous skip_residual locations 0, 1, 2, .., 7.
          skip_residuals[index] = distSkip[get_local_id(0) * 16];
        }
      } else {
        // Copy out 8 (4 component) sets of distortion values.

        int index =
            (gid_0 * 8 * 4) + (get_local_id(0)) + (gid_1 * 8 * 4 * width);

        __local ushort *distSkip = (__local ushort *)&dstSkipIntra[0];

        if (get_local_id(0) < countSkipMVs * 4) {
          // 1. Up to 16 work-items are enabled.
          // 2. The work-item gathers distSkip locations 0, 4*1, .., 4*31 and
          //    copies them to contiguous skip_residual locations 0, 1, 2, .., 31.

          skip_residuals[index] = distSkip[get_local_id(0) * 4];
          skip_residuals[index + 16] = distSkip[(get_local_id(0) + 16) * 4];
        }
      }
    }

    // Write out intra search result:

    if (doIntra) {

      int index_low =
          (gid_0 * 22) + (get_local_id(0) * 2) + (gid_1 * 22 * width);
      int index_high =
          (gid_0 * 22) + (get_local_id(0) * 2) + 1 + (gid_1 * 22 * width);

      // Write out the 4x4 intra modes
      if (get_local_id(0) < 8) {
        __local char *dstIntra_4x4 =
            (__local char *)(&dstSkipIntra[64 + 16 + 4]);
        char value = dstIntra_4x4[get_local_id(0)];
        char value_low = (value)&0xf;
        char value_high = (value >> 4) & 0xf;
        intra_search_predictor_modes[index_low + 5] = value_low;
        intra_search_predictor_modes[index_high + 5] = value_high;
      }

      // Write out the 8x8 intra modes
      if (get_local_id(0) < 4) {
        __local char *dstIntra_8x8 =
            (__local char *)(&dstSkipIntra[64 + 8 + 4]);
        char value = dstIntra_8x8[get_local_id(0) * 2];
        char value_low = (value)&0xf;
        int index = (gid_0 * 22) + (get_local_id(0)) + (gid_1 * 22 * width);
        intra_search_predictor_modes[index + 1] = value_low;
      }

      // Write out the 16x16 intra modes
      if (get_local_id(0) < 1) {
        __local char *dstIntra_16x16 =
            (__local char *)(&dstSkipIntra[64 + 0 + 4]);
        char value = dstIntra_16x16[get_local_id(0)];
        char value_low = (value)&0xf;
        intra_search_predictor_modes[index_low] = value_low;
      }

      // Get the intra residuals.
#ifndef HW_NULL_CHECK
      if (intra_residuals != NULL)
#endif
      {
        int index = (gid_0 * 4) + (gid_1 * 4 * width);

        if (get_local_id(0) < 1) {
          __local ushort *distIntra_4x4 =
              (__local ushort *)(&dstSkipIntra[64 + 16 + 3]);
          __local ushort *distIntra_8x8 =
              (__local ushort *)(&dstSkipIntra[64 + 8 + 3]);
          __local ushort *distIntra_16x16 =
              (__local ushort *)(&dstSkipIntra[64 + 0 + 3]);
          intra_residuals[index + 2] = distIntra_4x4[0];
          intra_residuals[index + 1] = distIntra_8x8[0];
          intra_residuals[index + 0] = distIntra_16x16[0];
        }
      }
    }
  }
}
/*************************************************************************************************
Built-in kernel:
block_advanced_motion_estimate_bidirectional_check_intel

Description:

1. Do motion estimation with 0 to 4 predictor MVs using 0 to 4 (integer motion estimation) IMEs
per macro-block, calculating the best search MVs per specified (16x16, 8x8, 4x4) luma block with
lowest distortion from amongst the 0 to 4 IME results, and optionally do (fractional
bi-directional refinement) FBR on the best IME search results to refine the best search results.
The best search (FBR if done, or IME) MVs and their distortions are returned.

2. Do unidirectional or bidirectional skip (zero search) checks with 0 to 4 sets of skip check
MVs for (16x16, 8x8) luma blocks using 0 to 4 (skip and intra check) SICs and return the
distortions associated with the input sets of skip check MVs per specified luma block. 4x4
blocks are not supported by h/w for skip checks.

3. Do intra-prediction for (16x16, 8x8, 4x4) luma blocks and (8x8) chroma blocks using 3 SICs
and returning the predictor modes and their associated distortions. Intra-prediction is done
for all block sizes. Support for 8x8 chroma blocks cannot be enabled until NV image formats
are supported in OCL.

**************************************************************************************************/
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
block_advanced_motion_estimate_bidirectional_check_intel(
    sampler_t accelerator, __read_only image2d_t srcImg,
    __read_only image2d_t refImg, __read_only image2d_t src_check_image,
    __read_only image2d_t ref0_check_image,
    __read_only image2d_t ref1_check_image, uint flags,
    uint search_cost_penalty, uint search_cost_precision, short2 count_global,
    uchar bidir_weight, __global short2 *count_motion_vector_buffer,
    __global short2 *prediction_motion_vector_buffer,
    __global char *skip_input_mode_buffer,
    __global short2 *skip_motion_vector_buffer,
    __global short2 *search_motion_vector_buffer,
    __global char *intra_search_predictor_modes,
    __global ushort *search_residuals, __global ushort *skip_residuals,
    __global ushort *intra_residuals, __read_only image2d_t intraSrcImg,
    int height, int width, int stride) {
  __local uint dstSearch[64];         // 8 GRFs
  __local uint dstSkipIntra[32 + 24]; // 7 GRFs (4 for inter, 3 for intra)

  // distortion in the 6th GRF
  __local ushort *distSearch = (__local ushort *)&dstSearch[8 * 5];

  // Initialize the MV cost table:
  // MV Cost in U4U4 format:
  // No cost : 0, 0, 0, 0, 0, 0, 0, 0
  // Low Cost : 1, 4, 5, 9, 10, 12, 14, 15
  // Normal Cost: 5, 26, 29, 43, 45, 47, 57, 57
  // High Cost : 29, 61, 72, 78, 88, 89, 91, 92

  uint2 MVCostTable;
  if (search_cost_penalty == 1) {
    MVCostTable.s0 = 0x09050401;
    MVCostTable.s1 = 0x0F0E0C0A;
  } else if (search_cost_penalty == 2) {
    MVCostTable.s0 = 0x2B1D1A05;
    MVCostTable.s1 = 0x39392F2D;
  } else if (search_cost_penalty == 3) {
    MVCostTable.s0 = 0x4E483D1D;
    MVCostTable.s1 = 0x5C5B5958;
  } else {
    MVCostTable.s0 = 0;
    MVCostTable.s1 = 0;
  }

  uint MVCostPrecision = ((uint)search_cost_precision) << 16;

  // Frame is divided into rows * columns of MBs.
  // One h/w thread per WG.
  // One WG processes "row" MBs - one row per iteration and one MB per row.
  // Number of WGs (or h/w threads) is number of columns MBs. Each iteration
  // processes the MB in a row - gid_0 is the MB id in a row and gid_1 is the
  // row offset.

  int sid_0 = stride * get_group_id(0);
  int gid_0 = sid_0 / height;
  int gid_1 = sid_0 % height;
  for (int sid = sid_0; sid < sid_0 + stride && gid_0 < width && gid_1 < height;
       sid++, gid_0 = sid / height, gid_1 = sid % height) {
    int2 srcCoord;

    srcCoord.x = gid_0 * 16 +
                 get_global_offset(0); // 16 pixels wide MBs (globally scalar)
    srcCoord.y = gid_1 * 16 +
                 get_global_offset(1); // 16 pixels tall MBs (globally scalar)
    uint curMB = gid_0 + gid_1 * width; // current MB id
    short2 count;

    // If either the search or skip vector counts are per-MB, then we need to
    // read in the count motion vector buffer.

    if ((count_global.s0 == -1) | (count_global.s1 == -1)) {
      count = count_motion_vector_buffer[curMB];
    }

    // If either the search or skip vector counts are per-frame, we need to use
    // those.

    if (count_global.s0 >= 0) {
      count.s0 = count_global.s0;
    }

    if (count_global.s1 >= 0) {
      count.s1 = count_global.s1;
    }

    int countPredMVs = count.x;
    if (countPredMVs != 0) {
      uint offset = curMB * 4;       // 4 predictors per MB
      offset += get_local_id(0) % 4; // 16 work-items access 4 MVs for MB
                                     // one predictor for MB per SIMD channel

      // Reduce predictors from Q-pixel to integer precision.
      int2 predMV = 0;

      if (get_local_id(0) < countPredMVs) {
        // one MV per work-item
        predMV = convert_int2(prediction_motion_vector_buffer[offset]);
        // Predictors are input in QP resolution. Convert that to integer
        // resolution.
        predMV.x /= 4;
        predMV.y /= 4;
        predMV.y &= 0xFFFFFFFE;
      }

      // Do up to 4 IMEs, get the best MVs and their distortions, and optionally
      // a FBR of the best MVs. Finally the results are written out to SLM.

      intel_work_group_vme_mb_multi_query_4(
          dstSearch,       // best search MV and its distortions into SLM
          countPredMVs,    // count of predictor MVs (globally scalar - value range
                           // 1 to 4)
          MVCostPrecision, // MV cost precision
          MVCostTable,     // MV cost table
          srcCoord,        // MB 2-D offset (globally scalar)
          predMV,          // predictor MVs (up to 4 distinct MVs for SIMD16 thread)
          srcImg,          // source
          refImg,          // reference
          accelerator);    // vme object
    }

    int doIntra = ((flags & 0x2) != 0);
    int intraEdges = 0;
    if (doIntra) {
      // Enable all edges by default.
      intraEdges = 0x3C;
      // If this is a left-edge MB, then disable left edges.
      if ((gid_0 == 0) & (get_global_offset(0) == 0)) {
        intraEdges &= 0x14;
      }

      // If this is a right edge MB then disable right edges.
      if (gid_0 == width - 1) {
        intraEdges &= 0x38;
      }

      // If this is a top-edge MB, then disable top edges.
      if ((gid_1 == 0) & (get_global_offset(1) == 0)) {
        intraEdges &= 0x20;
      }

      // Set bit6=bit5.
      intraEdges |= ((intraEdges & 0x20) << 1);

      intraEdges <<= 8;
    }

    int skip_block_type_8x8 = flags & 0x4;

    int countSkipMVs = count.y;
    if (countSkipMVs != 0 || doIntra == true) {
      // one set of skip MV per SIMD channel

      // Do up to 4 skip checks and get the distortions for each of them.
      // Finally the results are written out to SLM.

      if ((skip_block_type_8x8 == 0) | ((doIntra) & (countSkipMVs == 0))) {
        // 16x16:

        uint offset = curMB * 4 * 2; // 4 sets of skip check MVs per MB
        int skipMV = 0;
        if (get_local_id(0) < countSkipMVs * 2) // need 2 values per MV
        {
          offset +=
              (get_local_id(0)); // 16 work-items access 4 sets of MVs for MB
          __global int *skip1_motion_vector_buffer =
              (__global int *)skip_motion_vector_buffer;
          skipMV = skip1_motion_vector_buffer[offset]; // one MV per work-item
        }

        uchar skipMode = 0;
        if (get_local_id(0) < countSkipMVs) {
          skipMode = skip_input_mode_buffer[curMB];

          if (skipMode == 0) {
            skipMode = 1;
          }
          if (skipMode > 3) {
            skipMode = 3;
          }
        }

        intel_work_group_vme_mb_multi_bidir_check_16x16(
            dstSkipIntra,     // distortions into SLM
            countSkipMVs,     // count of skip check MVs (globally scalar - value
                              // range 1 to 4)
            doIntra,          // compute intra modes
            intraEdges,       // intra edges to use
            srcCoord,         // MB 2-D offset (globally scalar)
            bidir_weight,     // bidirectional weight
            skipMode,         // skip modes
            skipMV,           // skip check MVs (up to 4 distinct sets of skip check MVs
                              // for SIMD16 thread)
            src_check_image,  // source
            ref0_check_image, // reference fwd
            ref1_check_image, // reference bwd
            intraSrcImg,      // intra source
            accelerator);     // vme object
      } else {
        // 8x8:

        uint offset =
            curMB * 4 *
            8; // 4 sets of skip check MVs, 16 shorts (8 ints) each per MB
        int2 skipMVs = 0;
        if (get_local_id(0) < countSkipMVs * 8) // need 8 values per MV
        {
          offset +=
              (get_local_id(0)); // 16 work-items access 4 sets of MVs for MB
          __global int *skip1_motion_vector_buffer =
              (__global int *)(skip_motion_vector_buffer);
          skipMVs.x = skip1_motion_vector_buffer[offset]; // four component MVs
                                                          // per work-item
          skipMVs.y = skip1_motion_vector_buffer[offset + 16];
        }

        uchar skipModes = 0;
        if (get_local_id(0) < countSkipMVs) {
          skipModes = skip_input_mode_buffer[curMB];
        }

        intel_work_group_vme_mb_multi_bidir_check_8x8(
            dstSkipIntra,     // distortions into SLM
            countSkipMVs,     // count of skip check MVs per MB (globally scalar -
                              // value range 1 to 4)
            doIntra,          // compute intra modes
            intraEdges,       // intra edges to use
            srcCoord,         // MB 2-D offset (globally scalar)
            bidir_weight,     // bidirectional weight
            skipModes,        // skip modes
            skipMVs,          // skip check MVs (up to 4 distinct sets of skip check MVs
                              // for SIMD16 thread)
            src_check_image,  // source
            ref0_check_image, // reference fwd
            ref1_check_image, // reference bwd
            intraSrcImg,      // intra source
            accelerator);     // vme object
      }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // Write Out motion estimation result:
    // Result format
    // Hierarchical row-major layout
    // i.e. row-major of blocks MVs in MBs, and row-major of 4 sets of
    // MVs/distortion in blocks
    if (countPredMVs != 0) {
      // 4x4
      if (intel_get_accelerator_mb_block_type(accelerator) == 0x2) {
        int index = (gid_0 * 16 + get_local_id(0)) + (gid_1 * 16 * width);

        // 1. 16 work-items enabled.
        // 2. Work-items gather fwd MVs in strided dword locations 0, 2, .., 30
        //    (interleaved fwd/bdw MVs) with constant offset 8 (control data
        //    size) from SLM into contiguous short2 locations 0, 1, .., 15 of
        //    global buffer search_motion_vector_buffer with offset index.
        // 3. Work-items gather contiguous ushort locations 0, 1, .., 15 from
        //    distSearch into contiguous ushort locations 0, 1, .., 15 of
        //    search_residuals with offset index.

        short2 val = as_short2(dstSearch[8 + get_local_id(0) * 2]);
        search_motion_vector_buffer[index] = val;

#ifndef HW_NULL_CHECK
        if (search_residuals != NULL)
#endif
        {
          search_residuals[index] = distSearch[get_local_id(0)];
        }
      }

      // 8x8
      else if (intel_get_accelerator_mb_block_type(accelerator) == 0x1) {
        // Only the 1st 4 work-items are needed.
        if (get_local_id(0) < 4) {
          int index = (gid_0 * 4 + get_local_id(0)) + (gid_1 * 4 * width);

          // 1. 4 work-items enabled.
          // 2. Work-items gather fwd MVs in strided dword locations 0, 8, 16, 24
          //    (interleaved fwd/bdw MVs) with constant offset 8 from SLM into
          //    contiguous short2 locations 0, 1, .., 15 of global buffer
          //    search_motion_vector_buffer with offset index.
          // 3. Work-items gather strided ushort locations 0, 4, 8, 12 from
          //    distSearch into contiguous ushort locations 0, 1, .., 15 of
          //    search_residuals with offset index.

          short2 val = as_short2(dstSearch[8 + get_local_id(0) * 4 * 2]);
          search_motion_vector_buffer[index] = val;

#ifndef HW_NULL_CHECK
          if (search_residuals != NULL)
#endif
          {
            search_residuals[index] = distSearch[get_local_id(0) * 4];
          }
        }
      }

      // 16x16
      else if (intel_get_accelerator_mb_block_type(accelerator) == 0x0) {
        // Only the 1st work-item is needed.
        if (get_local_id(0) == 0) {
          int index = gid_0 + gid_1 * width;

          // 1. 1 work-item enabled.
          // 2. Work-item gathers fwd MV in dword location 0 with constant
          //    offset 8 from SLM into short2 location 0 of global buffer
          //    search_motion_vector_buffer.
          // 3. Work-item gathers ushort location 0 from distSearch into ushort
          //    location 0 of search_residuals with offset index.

          short2 val = as_short2(dstSearch[8]);
          search_motion_vector_buffer[index] = val;

#ifndef HW_NULL_CHECK
          if (search_residuals != NULL)
#endif
          {
            search_residuals[index] = distSearch[0];
          }
        }
      }
    }

    // Write out motion skip check result:
    // Result format
    // Hierarchical row-major layout
    // i.e. row-major of blocks in MBs, and row-major of 8 sets of
    // distortions in blocks
    if (countSkipMVs != 0) {
      if (skip_block_type_8x8 == false) {
        // Copy out 4 (1 component) sets of distortion values.

        int index = (gid_0 * 4) + (get_local_id(0)) + (gid_1 * 4 * width);

        if (get_local_id(0) < countSkipMVs) {
          // 1. Up to 4 work-items are enabled.
          // 2. The work-item gathers distSkip locations 0, 16*1, .., 16*7 and
          //    copies them to contiguous skip_residual locations 0, 1, 2, .., 7.
          __local ushort *distSkip = (__local ushort *)&dstSkipIntra[0];
          skip_residuals[index] = distSkip[get_local_id(0) * 16];
        }
      } else {
        // Copy out 4 (4 component) sets of distortion values.
        int index =
            (gid_0 * 4 * 4) + (get_local_id(0)) + (gid_1 * 4 * 4 * width);

        if (get_local_id(0) < countSkipMVs * 4) {
          // 1. Up to 16 work-items are enabled.
          // 2. The work-item gathers distSkip locations 0, 4*1, .., 4*15 and
          //    copies them to contiguous skip_residual locations 0, 1, 2, .., 15.

          __local ushort *distSkip = (__local ushort *)&dstSkipIntra[0];
          skip_residuals[index] = distSkip[get_local_id(0) * 4];
        }
      }
    }

    // Write out intra search result:
    if (doIntra) {
      // Write out the 4x4 intra modes
      if (get_local_id(0) < 8) {
        __local char *dstIntra_4x4 =
            (__local char *)(&dstSkipIntra[32 + 16 + 4]);
        char value = dstIntra_4x4[get_local_id(0)];
        char value_low = (value)&0xf;
        char value_high = (value >> 4) & 0xf;

        int index_low =
            (gid_0 * 22) + (get_local_id(0) * 2) + (gid_1 * 22 * width);

        int index_high =
            (gid_0 * 22) + (get_local_id(0) * 2) + 1 + (gid_1 * 22 * width);

        intra_search_predictor_modes[index_low + 5] = value_low;
        intra_search_predictor_modes[index_high + 5] = value_high;
      }

      // Write out the 8x8 intra modes
      if (get_local_id(0) < 4) {
        __local char *dstIntra_8x8 =
            (__local char *)(&dstSkipIntra[32 + 8 + 4]);
        char value = dstIntra_8x8[get_local_id(0) * 2];
        char value_low = (value)&0xf;
        int index = (gid_0 * 22) + (get_local_id(0)) + (gid_1 * 22 * width);
        intra_search_predictor_modes[index + 1] = value_low;
      }

      // Write out the 16x16 intra modes
      if (get_local_id(0) < 1) {
        __local char *dstIntra_16x16 =
            (__local char *)(&dstSkipIntra[32 + 0 + 4]);
        char value = dstIntra_16x16[0];
        char value_low = (value)&0xf;
        int index = (gid_0 * 22) + (gid_1 * 22 * width);
        intra_search_predictor_modes[index] = value_low;
      }

      // Get the intra residuals.
#ifndef HW_NULL_CHECK
      if (intra_residuals != NULL)
#endif
      {
        int index = (gid_0 * 4) + (gid_1 * 4 * width);

        if (get_local_id(0) < 1) {
          __local ushort *distIntra_4x4 =
              (__local ushort *)(&dstSkipIntra[32 + 16 + 3]);
          __local ushort *distIntra_8x8 =
              (__local ushort *)(&dstSkipIntra[32 + 8 + 3]);
          __local ushort *distIntra_16x16 =
              (__local ushort *)(&dstSkipIntra[32 + 0 + 3]);

          intra_residuals[index + 2] = distIntra_4x4[0];
          intra_residuals[index + 1] = distIntra_8x8[0];
          intra_residuals[index + 0] = distIntra_16x16[0];
        }
      }
    }
  }
}

// VEBOX KERNELS:
__kernel void ve_enhance_intel(sampler_t accelerator,
                               int flags,
                               __read_only image2d_t current_input,
                               __write_only image2d_t current_output) {
}

__kernel void ve_dn_enhance_intel(sampler_t accelerator,
                                  int flags,
                                  __read_only image2d_t ref_input,
                                  __read_only image2d_t current_input,
                                  __write_only image2d_t current_output) {
}

__kernel void ve_dn_di_enhance_intel(sampler_t accelerator,
                                     int flags,
                                     __read_only image2d_t current_input,
                                     __read_only image2d_t ref_input,
                                     __write_only image2d_t current_output,
                                     __write_only image2d_t ref_output,
                                     __write_only image2d_t dndi_output) {
}
@ -1,8 +0,0 @@
/*
 * Copyright (C) 2018-2021 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

-D cl_intel_device_side_advanced_vme_enable -D cl_intel_device_side_avc_vme_enable -D cl_intel_device_side_vme_enable -D cl_intel_media_block_io -cl-unsafe-math-optimizations -cl-mad-enable -cl-fast-relaxed-math
@ -1,68 +0,0 @@
/*
 * Copyright (C) 2018-2021 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

// VME KERNELS
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
block_motion_estimate_intel(sampler_t accelerator, __read_only image2d_t srcImg,
                            __read_only image2d_t refImg,
                            __global short2 *prediction_motion_vector_buffer,
                            __global short2 *motion_vector_buffer,
                            __global ushort *residuals) {
}

__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
block_advanced_motion_estimate_check_intel(
    sampler_t accelerator, __read_only image2d_t srcImg,
    __read_only image2d_t refImg, uint flags, uint skip_block_type,
    uint search_cost_penalty, uint search_cost_precision,
    __global short2 *count_motion_vector_buffer,
    __global short2 *predictors_buffer,
    __global short2 *skip_motion_vector_buffer,
    __global short2 *motion_vector_buffer,
    __global char *intra_search_predictor_modes, __global ushort *residuals,
    __global ushort *skip_residuals, __global ushort *intra_residuals) {
}

__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
block_advanced_motion_estimate_bidirectional_check_intel(
    sampler_t accelerator, __read_only image2d_t srcImg,
    __read_only image2d_t refImg, __read_only image2d_t src_check_image,
    __read_only image2d_t ref0_check_image,
    __read_only image2d_t ref1_check_image, uint flags,
    uint search_cost_penalty, uint search_cost_precision, short2 count_global,
    uchar bidir_weight, __global short2 *count_motion_vector_buffer,
    __global short2 *prediction_motion_vector_buffer,
    __global char *skip_input_mode_buffer,
    __global short2 *skip_motion_vector_buffer,
    __global short2 *search_motion_vector_buffer,
    __global char *intra_search_predictor_modes,
    __global ushort *search_residuals, __global ushort *skip_residuals,
    __global ushort *intra_residuals) {
}

// VEBOX KERNELS:
__kernel void ve_enhance_intel(sampler_t accelerator,
                               int flags,
                               __read_only image2d_t current_input,
                               __write_only image2d_t current_output) {
}

__kernel void ve_dn_enhance_intel(sampler_t accelerator,
                                  int flags,
                                  __read_only image2d_t ref_input,
                                  __read_only image2d_t current_input,
                                  __write_only image2d_t current_output) {
}

__kernel void ve_dn_di_enhance_intel(sampler_t accelerator,
                                     int flags,
                                     __read_only image2d_t current_input,
                                     __read_only image2d_t ref_input,
                                     __write_only image2d_t current_output,
                                     __write_only image2d_t ref_output,
                                     __write_only image2d_t dndi_output) {
}
@ -1,8 +0,0 @@
/*
 * Copyright (C) 2018-2021 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

-D cl_intel_device_side_advanced_vme_enable -D cl_intel_device_side_avc_vme_enable -D cl_intel_device_side_vme_enable -D cl_intel_media_block_io -cl-unsafe-math-optimizations -cl-mad-enable -cl-fast-relaxed-math
@ -1,106 +0,0 @@
/*
 * Copyright (C) 2018-2021 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

/*************************************************************************************************\
Device-side basic vme kernel:
device_side_block_motion_estimate_intel

Description:
Computes motion vectors by comparing a 2d image source with a 2d reference image, producing a
vector field of motion vectors. The algorithm searches for the best match of each macroblock pixel
block in the source image by searching an image region in the reference image, centered on the
coordinates of that pixel macroblock in the source image (optionally offset by the prediction
motion vectors).

This kernel optionally takes a vector field of motion vector predictors via the
prediction_motion_vector_image kernel argument. The kernel also optionally returns a vector
field of per-macroblock pixel-block information records. Each record contains the best-match
distortion (SAD) value and additional search result information.

This kernel needs to be compiled with the following compiler option:
" -D cl_intel_device_side_avc_vme_enable "

\*************************************************************************************************/

__kernel __attribute__((reqd_work_group_size(16, 1, 1)))
void device_side_block_motion_estimate_intel(__read_only image2d_t srcImg,
                                             __read_only image2d_t refImg,
                                             __global short2 *prediction_motion_vector_buffer,
                                             __global short2 *motion_vector_buffer,
                                             __global ushort *residuals_buffer,
                                             __global uchar2 *shapes_buffer,
                                             int iterations,
                                             int partitionMask) {
  int gid_0 = get_group_id(0);
  int gid_1 = 0;
  sampler_t vme_samp = 0;

  for (int i = 0; i < iterations; i++, gid_1++) {
    ushort2 srcCoord = 0;
    short2 refCoord = 0;
    short2 predMV = 0;

    srcCoord.x = gid_0 * 16 + get_global_offset(0);
    srcCoord.y = gid_1 * 16 + get_global_offset(1);

    if (prediction_motion_vector_buffer != NULL) {
      predMV = prediction_motion_vector_buffer[gid_0 + gid_1 * get_num_groups(0)];
      refCoord.x = predMV.x / 4;
      refCoord.y = predMV.y / 4;
      refCoord.y = refCoord.y & 0xFFFE;
    }

    uchar partition_mask = (uchar)partitionMask;
    uchar sad_adjustment = CLK_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL;
    uchar pixel_mode = CLK_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL;

    intel_sub_group_avc_ime_payload_t payload = intel_sub_group_avc_ime_initialize(srcCoord, partition_mask, sad_adjustment);
    payload = intel_sub_group_avc_ime_set_single_reference(refCoord, CLK_AVC_ME_SEARCH_WINDOW_16x12_RADIUS_INTEL, payload);

    intel_sub_group_avc_ime_result_t result = intel_sub_group_avc_ime_evaluate_with_single_reference(srcImg, refImg, vme_samp, payload);

    // Process Results
    long mvs = intel_sub_group_avc_ime_get_motion_vectors(result);
    ushort sads = intel_sub_group_avc_ime_get_inter_distortions(result);
    uchar major_shape = intel_sub_group_avc_ime_get_inter_major_shape(result);
    uchar minor_shapes = intel_sub_group_avc_ime_get_inter_minor_shapes(result);
    uchar2 shapes = {major_shape, minor_shapes};
    uchar directions = intel_sub_group_avc_ime_get_inter_directions(result);

    // Perform FME for non-Integer Pixel mode
    if (pixel_mode != CLK_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL) {
      intel_sub_group_avc_ref_payload_t payload = intel_sub_group_avc_fme_initialize(srcCoord, mvs, major_shape, minor_shapes, directions, pixel_mode, sad_adjustment);
      intel_sub_group_avc_ref_result_t result = intel_sub_group_avc_ref_evaluate_with_single_reference(srcImg, refImg, vme_samp, payload);
      mvs = intel_sub_group_avc_ref_get_motion_vectors(result);
      sads = intel_sub_group_avc_ref_get_inter_distortions(result);
    }

    // Write Out Result

    if ((get_local_id(0) % 4) == 0) {
      int x = get_local_id(0) % 4;
      int y = get_local_id(0) / 4;
      int width = get_image_width(srcImg);

      int index = (gid_0 * 4 + x) + (gid_1 * width / 4 + y);
      int2 bi_mvs = as_int2(mvs);

      motion_vector_buffer[index] = as_short2(bi_mvs.s0);

      if (residuals_buffer != NULL) {
        residuals_buffer[index] = sads;
      }
      shapes_buffer[gid_0 + gid_1 * get_num_groups(0)] = shapes;
    }
  }
}

__kernel void non_vme_kernel(__global unsigned int *src, __global unsigned int *dst) {
  int id = (int)get_global_id(0);
  dst[id] = lgamma((float)src[id]);
  dst[id] = src[id];
}
@ -1,8 +0,0 @@
/*
 * Copyright (C) 2018-2021 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

-D cl_intel_device_side_vme_enable -D HW_NULL_CHECK
@ -69,7 +69,6 @@ BuiltinResourceT createBuiltinResource(const BuiltinResourceT &r);
std::string createBuiltinResourceName(EBuiltInOps::Type builtin, const std::string &extension);
StackVec<std::string, 3> getBuiltinResourceNames(EBuiltInOps::Type builtin, BuiltinCode::ECodeType type, const Device &device);
const char *getBuiltinAsString(EBuiltInOps::Type builtin);
const char *getAdditionalBuiltinAsString(EBuiltInOps::Type builtin);

class Storage {
  public:
@ -23,10 +23,6 @@
namespace NEO {

const char *getBuiltinAsString(EBuiltInOps::Type builtin) {
    const char *builtinString = getAdditionalBuiltinAsString(builtin);
    if (builtinString) {
        return builtinString;
    }
    switch (builtin) {
    default:
        return "unknown";
@ -35,7 +35,6 @@ void PageFaultManager::transferToGpu(void *ptr, void *cmdQ) {
}
void PageFaultManager::allowCPUMemoryEviction(void *ptr, PageFaultData &pageFaultData) {
}
const char *getAdditionalBuiltinAsString(EBuiltInOps::Type builtin) { return nullptr; }

void RootDeviceEnvironment::initApiGfxCoreHelper() {
}