484 lines
21 KiB
C++
484 lines
21 KiB
C++
/*
 * Copyright (C) 2018-2021 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */
|
|
|
|
#pragma once
|
|
|
|
#include "shared/source/built_ins/built_ins.h"
|
|
|
|
#include "opencl/source/accelerators/intel_accelerator.h"
|
|
#include "opencl/source/accelerators/intel_motion_estimation.h"
|
|
#include "opencl/source/built_ins/built_in_ops_vme.h"
|
|
#include "opencl/source/built_ins/builtins_dispatch_builder.h"
|
|
#include "opencl/source/cl_device/cl_device.h"
|
|
#include "opencl/source/helpers/dispatch_info_builder.h"
|
|
#include "opencl/source/mem_obj/buffer.h"
|
|
#include "opencl/source/mem_obj/image.h"
|
|
|
|
namespace NEO {
|
|
class VmeBuiltinDispatchInfoBuilder : public BuiltinDispatchInfoBuilder {
|
|
public:
|
|
// Builds the VME kernel identified by kernelName via populate() and caches
// the positions of the kernel arguments this builder reads or writes later.
// Argument positions are looked up by name with getArgNumByName().
// NOTE(review): the -1 checks elsewhere in this class suggest that
// getArgNumByName() yields -1 for a missing argument - confirm.
VmeBuiltinDispatchInfoBuilder(BuiltIns &kernelsLib, ClDevice &device, EBuiltInOps::Type builtinOp,
                              const char *kernelName)
    : BuiltinDispatchInfoBuilder(kernelsLib, device) {
    populate(builtinOp, mediaKernelsBuildOptions, kernelName, multiDeviceVmeKernel);
    vmeKernel = multiDeviceVmeKernel->getKernel(device.getRootDeviceIndex());

    const auto &kernelInfo = vmeKernel->getKernelInfo();

    // arguments written by buildDispatchInfos()
    widthArgNum = kernelInfo.getArgNumByName("width");
    heightArgNum = kernelInfo.getArgNumByName("height");
    strideArgNum = kernelInfo.getArgNumByName("stride");

    // arguments inspected during validation
    acceleratorArgNum = kernelInfo.getArgNumByName("accelerator");
    srcImgArgNum = kernelInfo.getArgNumByName("srcImg");
    refImgArgNum = kernelInfo.getArgNumByName("refImg");
    motionVectorBufferArgNum = kernelInfo.getArgNumByName("motion_vector_buffer");
    predictionMotionVectorBufferArgNum = kernelInfo.getArgNumByName("prediction_motion_vector_buffer");
    residualsArgNum = kernelInfo.getArgNumByName("residuals");
}
|
|
|
|
// Converts a global work size into a count of 16x16 macro-blocks.
// inGws.x / inGws.y are each divided by 16 and rounded up; the results are
// returned through gwWidthInBlk and gwHeightInBlk. inGws.z is not used.
void getBlkTraits(const Vec3<size_t> &inGws, size_t &gwWidthInBlk, size_t &gwHeightInBlk) const {
    constexpr size_t macroBlockWidth = 16;
    constexpr size_t macroBlockHeight = 16;

    gwHeightInBlk = Math::divideAndRoundUp(inGws.y, macroBlockHeight);
    gwWidthInBlk = Math::divideAndRoundUp(inGws.x, macroBlockWidth);
}
|
|
|
|
// Prepares a single dispatch of the VME kernel.
//
// The requested global size is converted to macro-block counts, which are
// written to the kernel's height / width / stride arguments. The dispatch
// itself is one-dimensional: (blocks across * max SIMD size) x 1 x 1, with the
// local size taken from the kernel's required work-group size.
//
// kern   - only checked for nullptr; the dispatch always uses vmeKernel.
// inDim, inLws - not used.
// Returns false when kern is nullptr, true otherwise.
bool buildDispatchInfos(MultiDispatchInfo &multiDispatchInfo, Kernel *kern,
                        const uint32_t inDim, const Vec3<size_t> &inGws, const Vec3<size_t> &inLws, const Vec3<size_t> &inOffset) const override {
    if (nullptr == kern) {
        return false;
    }

    size_t blocksAcross = 0;
    size_t blocksDown = 0;
    getBlkTraits(inGws, blocksAcross, blocksDown);

    const auto &kernelInfo = vmeKernel->getKernelInfo();
    const size_t simdWidth = kernelInfo.getMaxSimdSize();
    size_t numThreadsX = blocksAcross;

    cl_int height = static_cast<cl_int>(blocksDown);
    cl_int width = static_cast<cl_int>(blocksAcross);
    // number of macro-blocks divided (rounding up) by the number of threads in X
    cl_int stride = static_cast<cl_int>(Math::divideAndRoundUp(height * width, numThreadsX));

    // update implicit args
    vmeKernel->setArg(heightArgNum, sizeof(height), &height);
    vmeKernel->setArg(widthArgNum, sizeof(width), &width);
    vmeKernel->setArg(strideArgNum, sizeof(stride), &stride);

    // Update global work size to force macro-block to HW thread execution model
    Vec3<size_t> gws = {numThreadsX * simdWidth, 1, 1};
    Vec3<size_t> lws = {kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[0], 1, 1};

    DispatchInfoBuilder<SplitDispatch::Dim::d2D, SplitDispatch::SplitMode::NoSplit> builder(clDevice);
    builder.setDispatchGeometry(gws, lws, inOffset, gws, lws);
    builder.setKernel(vmeKernel);
    builder.bake(multiDispatchInfo);
    return true;
}
|
|
|
|
bool setExplicitArg(uint32_t argIndex, size_t argSize, const void *argVal, cl_int &err) const override {
|
|
DEBUG_BREAK_IF(!((argIndex != widthArgNum) && (argIndex != heightArgNum) && (argIndex != strideArgNum)));
|
|
if ((argIndex == acceleratorArgNum) && (argVal == nullptr)) {
|
|
err = CL_INVALID_ACCELERATOR_INTEL;
|
|
return false;
|
|
}
|
|
err = vmeKernel->setArg(argIndex, argSize, argVal);
|
|
return false;
|
|
}
|
|
|
|
// Validates an enqueue request before it is dispatched.
//
// Only 2D dispatches are accepted (CL_INVALID_WORK_DIMENSION otherwise) and
// the accelerator argument must have been set (CL_INVALID_KERNEL_ARGS
// otherwise). The number of macro-blocks and a per-block multiplier derived
// from the accelerator's mb_block_type (8x8 -> 4, 4x4 -> 16, anything else
// -> 1) are then handed to validateVmeDispatch(), whose result is returned.
//
// kernel and inLws are not used.
cl_int validateDispatch(Kernel *kernel, uint32_t inworkDim, const Vec3<size_t> &inGws, const Vec3<size_t> &inLws, const Vec3<size_t> &inOffset) const override {
    if (2 != inworkDim) {
        return CL_INVALID_WORK_DIMENSION;
    }

    size_t blocksAcross = 0;
    size_t blocksDown = 0;
    getBlkTraits(inGws, blocksAcross, blocksDown);
    const size_t blkNum = blocksAcross * blocksDown;

    auto *accelerator = castToObject<IntelAccelerator>((cl_accelerator_intel)vmeKernel->getKernelArg(acceleratorArgNum));
    if (nullptr == accelerator) {
        return CL_INVALID_KERNEL_ARGS; // accelerator was not set
    }
    DEBUG_BREAK_IF(accelerator->getDescriptorSize() != sizeof(cl_motion_estimation_desc_intel));
    const auto *descriptor = reinterpret_cast<const cl_motion_estimation_desc_intel *>(accelerator->getDescriptor());

    size_t blkMul = 1;
    if (descriptor->mb_block_type == CL_ME_MB_TYPE_8x8_INTEL) {
        blkMul = 4;
    } else if (descriptor->mb_block_type == CL_ME_MB_TYPE_4x4_INTEL) {
        blkMul = 16;
    }

    return validateVmeDispatch(inGws, inOffset, blkNum, blkMul);
}
|
|
|
|
// notes on corner cases :
|
|
// * if arg not available in kernels - returns true
|
|
// * if arg set to nullptr - returns true
|
|
bool validateBufferSize(int32_t bufferArgNum, size_t minimumSizeExpected) const {
|
|
if (bufferArgNum == -1) {
|
|
return true;
|
|
}
|
|
|
|
auto buff = castToObject<Buffer>((cl_mem)vmeKernel->getKernelArg(bufferArgNum));
|
|
if (buff == nullptr) {
|
|
return true;
|
|
}
|
|
|
|
size_t bufferSize = buff->getSize();
|
|
if (bufferSize < minimumSizeExpected) {
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
template <typename EnumBaseType>
|
|
bool validateEnumVal(EnumBaseType val) const {
|
|
return false;
|
|
}
|
|
|
|
template <typename EnumBaseType, typename ExpectedValType, typename... ExpectedValsTypes>
|
|
bool validateEnumVal(EnumBaseType val, ExpectedValType expectedVal, ExpectedValsTypes... expVals) const {
|
|
return (val == static_cast<EnumBaseType>(expectedVal)) || validateEnumVal<EnumBaseType, ExpectedValsTypes...>(val, expVals...);
|
|
}
|
|
|
|
// notes on corner cases :
|
|
// * if arg not available in kernels - returns true
|
|
template <typename EnumBaseType, typename... ExpectedValsTypes>
|
|
bool validateEnumArg(int32_t argNum, ExpectedValsTypes... expVals) const {
|
|
if (argNum == -1) {
|
|
return true;
|
|
}
|
|
|
|
EnumBaseType val = this->getKernelArgByValValue<EnumBaseType>(static_cast<uint32_t>(argNum));
|
|
return validateEnumVal<EnumBaseType, ExpectedValsTypes...>(val, expVals...);
|
|
}
|
|
|
|
template <typename RetType>
|
|
RetType getKernelArgByValValue(uint32_t argNum) const {
|
|
const auto &argAsVal = vmeKernel->getKernelInfo().kernelDescriptor.payloadMappings.explicitArgs[argNum].as<ArgDescValue>();
|
|
DEBUG_BREAK_IF(argAsVal.elements.size() != 1);
|
|
|
|
const auto &element = argAsVal.elements[0];
|
|
DEBUG_BREAK_IF(sizeof(RetType) > element.size);
|
|
return *(RetType *)(vmeKernel->getCrossThreadData() + element.offset);
|
|
}
|
|
|
|
// Validates the source and reference images bound to the VME kernel.
//
// Returns
//   CL_INVALID_KERNEL_ARGS             - either image argument is not an Image
//   CL_INVALID_IMAGE_FORMAT_DESCRIPTOR - an image is not CL_R / CL_UNORM_INT8
//   CL_OUT_OF_RESOURCES                - an image is not a tiled allocation
//   CL_INVALID_IMAGE_SIZE              - inputRegion + offset exceeds the
//                                        source image width or height
//   CL_SUCCESS                         - otherwise
// Only the source image dimensions are compared against the region.
cl_int validateImages(const Vec3<size_t> &inputRegion, const Vec3<size_t> &offset) const {
    Image *srcImg = castToObject<Image>((cl_mem)vmeKernel->getKernelArg(srcImgArgNum));
    Image *refImg = castToObject<Image>((cl_mem)vmeKernel->getKernelArg(refImgArgNum));
    if (!srcImg || !refImg) {
        return CL_INVALID_KERNEL_ARGS;
    }

    Image *const frames[] = {srcImg, refImg};
    for (Image *frame : frames) {
        const cl_image_format &format = frame->getImageFormat();
        const bool isSingleChannelUnorm8 = (format.image_channel_order == CL_R) &&
                                           (format.image_channel_data_type == CL_UNORM_INT8);
        if (!isSingleChannelUnorm8) {
            return CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
        }

        // VME only works with tiled images.
        if (false == frame->isTiledAllocation()) {
            return CL_OUT_OF_RESOURCES;
        }
    }

    const cl_image_desc &srcDesc = srcImg->getImageDesc();
    const size_t srcWidth = srcDesc.image_width;
    const size_t srcHeight = srcDesc.image_height;

    const bool fitsHorizontally = !((inputRegion.x + offset.x) > srcWidth);
    const bool fitsVertically = !((inputRegion.y + offset.y) > srcHeight);
    if (!(fitsHorizontally && fitsVertically)) {
        return CL_INVALID_IMAGE_SIZE;
    }

    return CL_SUCCESS;
}
|
|
|
|
// Validates the images and the output/input buffers of the basic VME kernel.
//
// inputRegion / offset - forwarded to validateImages()
// blkNum               - number of macro-blocks covered by the dispatch
// blkMul               - multiplier applied per macro-block for the motion
//                        vector and residuals buffers
//
// Returns the first image validation error, CL_INVALID_BUFFER_SIZE when a
// bound buffer is too small, and CL_SUCCESS otherwise.
virtual cl_int validateVmeDispatch(const Vec3<size_t> &inputRegion, const Vec3<size_t> &offset, size_t blkNum, size_t blkMul) const {
    const cl_int imageStatus = validateImages(inputRegion, offset);
    if (CL_SUCCESS != imageStatus) {
        return imageStatus;
    }

    const size_t numPredictors = 1;
    const size_t motionVectorBytes = blkNum * blkMul * 2 * sizeof(cl_short);
    const size_t predictionMotionVectorBytes = blkNum * numPredictors * 2 * sizeof(cl_short);
    const size_t residualsBytes = blkNum * blkMul * sizeof(cl_ushort);

    // (argument index, minimum size in bytes)
    const std::pair<int32_t, size_t> bufferRequirements[] = {
        {motionVectorBufferArgNum, motionVectorBytes},
        {predictionMotionVectorBufferArgNum, predictionMotionVectorBytes},
        {residualsArgNum, residualsBytes}};

    for (const auto &requirement : bufferRequirements) {
        if (!validateBufferSize(requirement.first, requirement.second)) {
            return CL_INVALID_BUFFER_SIZE;
        }
    }

    return CL_SUCCESS;
}
|
|
|
|
protected:
|
|
uint32_t heightArgNum;
|
|
uint32_t widthArgNum;
|
|
uint32_t strideArgNum;
|
|
uint32_t acceleratorArgNum;
|
|
uint32_t srcImgArgNum;
|
|
uint32_t refImgArgNum;
|
|
int32_t motionVectorBufferArgNum;
|
|
int32_t predictionMotionVectorBufferArgNum;
|
|
int32_t residualsArgNum;
|
|
MultiDeviceKernel *multiDeviceVmeKernel;
|
|
Kernel *vmeKernel;
|
|
};
|
|
|
|
template <>
|
|
class BuiltInOp<EBuiltInOps::VmeBlockMotionEstimateIntel> : public VmeBuiltinDispatchInfoBuilder {
|
|
public:
|
|
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device)
|
|
: VmeBuiltinDispatchInfoBuilder(kernelsLib, device,
|
|
EBuiltInOps::VmeBlockMotionEstimateIntel, "block_motion_estimate_intel") {
|
|
}
|
|
};
|
|
|
|
class AdvancedVmeBuiltinDispatchInfoBuilder : public VmeBuiltinDispatchInfoBuilder {
|
|
public:
|
|
// Builds the advanced VME kernel through the base class and additionally
// caches the positions of the arguments specific to the advanced kernels.
// Argument positions are looked up by name with getArgNumByName().
AdvancedVmeBuiltinDispatchInfoBuilder(BuiltIns &kernelsLib, ClDevice &device, EBuiltInOps::Type builtinOp,
                                      const char *kernelName)
    : VmeBuiltinDispatchInfoBuilder(kernelsLib, device, builtinOp, kernelName) {
    const auto &kernelInfo = this->vmeKernel->getKernelInfo();

    // by-value configuration arguments
    flagsArgNum = kernelInfo.getArgNumByName("flags");
    skipBlockTypeArgNum = kernelInfo.getArgNumByName("skip_block_type");
    searchCostPenaltyArgNum = kernelInfo.getArgNumByName("search_cost_penalty");
    searchCostPrecisionArgNum = kernelInfo.getArgNumByName("search_cost_precision");
    bidirWeightArgNum = kernelInfo.getArgNumByName("bidir_weight");

    // image argument that mirrors srcImg (see setExplicitArg)
    intraSrcImgArgNum = kernelInfo.getArgNumByName("intraSrcImg");

    // buffer arguments checked by validateVmeDispatch()
    predictorsBufferArgNum = kernelInfo.getArgNumByName("predictors_buffer");
    countMotionVectorBufferArgNum = kernelInfo.getArgNumByName("count_motion_vector_buffer");
    skipMotionVectorBufferArgNum = kernelInfo.getArgNumByName("skip_motion_vector_buffer");
    intraSearchPredictorModesArgNum = kernelInfo.getArgNumByName("intra_search_predictor_modes");
    skipResidualsArgNum = kernelInfo.getArgNumByName("skip_residuals");
    intraResidualsArgNum = kernelInfo.getArgNumByName("intra_residuals");
}
|
|
|
|
bool setExplicitArg(uint32_t argIndex, size_t argSize, const void *argVal, cl_int &err) const override {
|
|
DEBUG_BREAK_IF(argIndex == intraSrcImgArgNum);
|
|
if (argIndex == this->srcImgArgNum) {
|
|
// rebind also as media block image
|
|
this->vmeKernel->setArg(intraSrcImgArgNum, argSize, argVal);
|
|
}
|
|
return VmeBuiltinDispatchInfoBuilder::setExplicitArg(argIndex, argSize, argVal, err);
|
|
}
|
|
|
|
// Whether this builder drives a bidirectional kernel. The answer selects the
// number of skip checks per macro-block used by the skip buffer size
// helpers (4 when true, 8 when false). Defaults to false.
virtual bool isBidirKernel() const { return false; }
|
|
|
|
bool validateFlags(uint32_t &outSkipBlockType) const {
|
|
uint32_t flagsVal = VmeBuiltinDispatchInfoBuilder::template getKernelArgByValValue<uint32_t>(flagsArgNum);
|
|
|
|
if ((flagsVal & CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL) == CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL) {
|
|
return false;
|
|
}
|
|
|
|
if (flagsVal == CL_ME_SKIP_BLOCK_TYPE_16x16_INTEL) {
|
|
outSkipBlockType = CL_ME_MB_TYPE_16x16_INTEL;
|
|
} else if ((flagsVal & CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL) == CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL) {
|
|
outSkipBlockType = CL_ME_MB_TYPE_8x8_INTEL;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool validateSkipBlockTypeArg(uint32_t &outSkipBlockType) const {
|
|
if (skipBlockTypeArgNum == -1) {
|
|
return true;
|
|
}
|
|
|
|
outSkipBlockType = VmeBuiltinDispatchInfoBuilder::template getKernelArgByValValue<uint32_t>(static_cast<uint32_t>(skipBlockTypeArgNum));
|
|
|
|
switch (outSkipBlockType) {
|
|
case CL_ME_MB_TYPE_16x16_INTEL:
|
|
break;
|
|
case CL_ME_MB_TYPE_8x8_INTEL:
|
|
break;
|
|
default:
|
|
return false;
|
|
;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Minimum size of the intra search predictor modes buffer for blkNum
// macro-blocks: 22 entries per block, made up of
// 1 (16x16 luma block) + 4 (8x8 luma block) + 16 (4x4 luma block) + 1 (8x8 chroma block).
size_t getIntraSearchPredictorModesBuffExpSize(size_t blkNum) const {
    constexpr size_t modesPerBlock = 22;
    return blkNum * modesPerBlock;
}
|
|
|
|
size_t getSkipMotionVectorBufferExpSize(uint32_t skipBlockType, size_t blkNum) const {
|
|
// vector size is either 1 (16x16 block) or 4 (8x8 block)
|
|
// 0 to 8 skip MVs per MB
|
|
// may be null if all MBs in frame have 0 skip check MVs in which case VME skip checks are not performed
|
|
// layout assumes 4 (for bidir) or 8 (otherwise) skip check MVs per MB
|
|
// row-major block layout; all MVs for a block are contiguous
|
|
// buffer size depends on the block and frame size .
|
|
int vectorSize = (skipBlockType == CL_ME_MB_TYPE_16x16_INTEL) ? 1 : 4;
|
|
int numChecks = (isBidirKernel() ? 4 : 8);
|
|
size_t skipMotionVectorBufferExpSize = blkNum * numChecks * vectorSize * 2 * sizeof(cl_short);
|
|
return skipMotionVectorBufferExpSize;
|
|
}
|
|
|
|
size_t getSkipResidualsBuffExpSize(uint32_t skipBlockType, size_t blkNum) const {
|
|
/* output buffer of vectors of unsigned short SAD adjusted values corresponding to the input skip check MVs
|
|
may be null if skip_motion_vector_buffer is null
|
|
vector size is either 1 (16x16 block) or 4 (8x8 block)
|
|
0 to 8 skip check residuals per MB
|
|
layout always assumes 8 skip check residuals per MB
|
|
row major block layout; all MVs for a block are contiguous
|
|
buffer size depends on the block and frame size */
|
|
int vectorSize = 1;
|
|
switch (skipBlockType) {
|
|
case CL_ME_MB_TYPE_16x16_INTEL:
|
|
vectorSize = 1;
|
|
break;
|
|
case CL_ME_MB_TYPE_8x8_INTEL:
|
|
vectorSize = 4;
|
|
break;
|
|
default:
|
|
break;
|
|
};
|
|
|
|
int numChecks = (isBidirKernel() ? 4 : 8);
|
|
size_t skipResidualsBuffExpSize = blkNum * vectorSize * numChecks * sizeof(cl_ushort);
|
|
return skipResidualsBuffExpSize;
|
|
}
|
|
|
|
size_t getIntraResidualsBuffExpSize(size_t blkNum) const {
|
|
/* output buffer of vectors of unsigned short SAD adjusted values
|
|
may be null in which case the intra residuals corresponding not returned
|
|
vector size is 4 - 1 (16x16 luma block) + 1 (8x8 luma block) + 1 (4x4 luma block) + 1 (8x8 chroma block)
|
|
1 vector per MB
|
|
buffer size depends on the frame size */
|
|
int vectorSize = 4;
|
|
size_t intraResidualsBuffExpSize = (blkNum * sizeof(cl_ushort) * vectorSize);
|
|
return intraResidualsBuffExpSize;
|
|
}
|
|
|
|
size_t getPredictorsBufferExpSize(size_t blkNum) const {
|
|
size_t numPredictors = 8;
|
|
size_t predictorsBufferExpSize = (blkNum * numPredictors * 2 * sizeof(cl_short));
|
|
return predictorsBufferExpSize;
|
|
}
|
|
|
|
// Validation for the advanced VME kernels. Checks run in this order and the
// first failure is returned:
//   1. base class validation (images and basic buffers)  -> its error code
//   2. flags argument                                    -> CL_INVALID_KERNEL_ARGS
//   3. skip_block_type argument                          -> CL_OUT_OF_RESOURCES
//   4. search_cost_penalty argument                      -> CL_OUT_OF_RESOURCES
//   5. search_cost_precision argument                    -> CL_OUT_OF_RESOURCES
//   6. bidir_weight argument                             -> CL_INVALID_KERNEL_ARGS
//   7. sizes of the advanced-kernel buffers              -> CL_INVALID_BUFFER_SIZE
// Returns CL_SUCCESS when every check passes.
cl_int validateVmeDispatch(const Vec3<size_t> &inputRegion, const Vec3<size_t> &offset, size_t blkNum, size_t blkMul) const override {
    const cl_int baseStatus = VmeBuiltinDispatchInfoBuilder::validateVmeDispatch(inputRegion, offset, blkNum, blkMul);
    if (CL_SUCCESS != baseStatus) {
        return baseStatus;
    }

    // 16x16 unless the flags or the skip_block_type argument say otherwise
    uint32_t skipBlockType = CL_ME_MB_TYPE_16x16_INTEL;
    if (!validateFlags(skipBlockType)) {
        return CL_INVALID_KERNEL_ARGS;
    }
    if (!validateSkipBlockTypeArg(skipBlockType)) {
        return CL_OUT_OF_RESOURCES;
    }

    const bool costPenaltyOk = this->validateEnumArg<uint32_t>(searchCostPenaltyArgNum,
                                                               CL_ME_COST_PENALTY_NONE_INTEL, CL_ME_COST_PENALTY_LOW_INTEL,
                                                               CL_ME_COST_PENALTY_NORMAL_INTEL, CL_ME_COST_PENALTY_HIGH_INTEL);
    if (!costPenaltyOk) {
        return CL_OUT_OF_RESOURCES;
    }

    const bool costPrecisionOk = this->validateEnumArg<uint32_t>(searchCostPrecisionArgNum,
                                                                 CL_ME_COST_PRECISION_QPEL_INTEL, CL_ME_COST_PRECISION_HPEL_INTEL,
                                                                 CL_ME_COST_PRECISION_PEL_INTEL, CL_ME_COST_PRECISION_DPEL_INTEL);
    if (!costPrecisionOk) {
        return CL_OUT_OF_RESOURCES;
    }

    const bool bidirWeightOk = this->validateEnumArg<uint8_t>(bidirWeightArgNum, 0,
                                                              CL_ME_BIDIR_WEIGHT_QUARTER_INTEL, CL_ME_BIDIR_WEIGHT_THIRD_INTEL,
                                                              CL_ME_BIDIR_WEIGHT_HALF_INTEL, CL_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL,
                                                              CL_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL);
    if (!bidirWeightOk) {
        return CL_INVALID_KERNEL_ARGS;
    }

    // (argument index, minimum size in bytes)
    const size_t countMotionVectorBytes = blkNum * 2 * sizeof(cl_short);
    std::pair<int32_t, size_t> bufferRequirements[] = {
        std::make_pair(countMotionVectorBufferArgNum, countMotionVectorBytes),
        std::make_pair(skipMotionVectorBufferArgNum, getSkipMotionVectorBufferExpSize(skipBlockType, blkNum)),
        std::make_pair(intraSearchPredictorModesArgNum, getIntraSearchPredictorModesBuffExpSize(blkNum)),
        std::make_pair(skipResidualsArgNum, getSkipResidualsBuffExpSize(skipBlockType, blkNum)),
        std::make_pair(intraResidualsArgNum, getIntraResidualsBuffExpSize(blkNum)),
        std::make_pair(predictorsBufferArgNum, getPredictorsBufferExpSize(blkNum))};

    for (const auto &requirement : bufferRequirements) {
        if (!this->validateBufferSize(requirement.first, requirement.second)) {
            return CL_INVALID_BUFFER_SIZE;
        }
    }

    return CL_SUCCESS;
}
|
|
|
|
protected:
    // Kernel argument positions specific to the advanced VME kernels,
    // resolved by name in the constructor.

    // By-value configuration arguments.
    uint32_t flagsArgNum;
    int32_t skipBlockTypeArgNum; // compared against -1 in validateSkipBlockTypeArg()
    uint32_t searchCostPenaltyArgNum;
    uint32_t searchCostPrecisionArgNum;
    int32_t bidirWeightArgNum;

    // Buffer arguments whose sizes are checked in validateVmeDispatch().
    int32_t predictorsBufferArgNum;
    uint32_t countMotionVectorBufferArgNum;
    uint32_t skipMotionVectorBufferArgNum;
    uint32_t intraSearchPredictorModesArgNum;
    uint32_t skipResidualsArgNum;
    uint32_t intraResidualsArgNum;

    // Image argument rebound together with srcImg in setExplicitArg().
    uint32_t intraSrcImgArgNum;
|
|
};
|
|
|
|
template <>
|
|
class BuiltInOp<EBuiltInOps::VmeBlockAdvancedMotionEstimateCheckIntel> : public AdvancedVmeBuiltinDispatchInfoBuilder {
|
|
public:
|
|
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device)
|
|
: AdvancedVmeBuiltinDispatchInfoBuilder(kernelsLib, device, EBuiltInOps::VmeBlockAdvancedMotionEstimateCheckIntel,
|
|
"block_advanced_motion_estimate_check_intel") {
|
|
}
|
|
|
|
cl_int validateVmeDispatch(const Vec3<size_t> &inputRegion, const Vec3<size_t> &offset,
|
|
size_t gwWidthInBlk, size_t gwHeightInBlk) const override {
|
|
cl_int basicAdvVmeValidationStatus = AdvancedVmeBuiltinDispatchInfoBuilder::validateVmeDispatch(inputRegion, offset, gwWidthInBlk, gwHeightInBlk);
|
|
if (basicAdvVmeValidationStatus != CL_SUCCESS) {
|
|
return basicAdvVmeValidationStatus;
|
|
}
|
|
|
|
auto countMotionVectorBuff = castToObject<Buffer>((cl_mem)this->vmeKernel->getKernelArg(this->countMotionVectorBufferArgNum));
|
|
if (countMotionVectorBuff == nullptr) {
|
|
return CL_INVALID_BUFFER_SIZE;
|
|
}
|
|
|
|
return CL_SUCCESS;
|
|
}
|
|
};
|
|
|
|
template <>
|
|
class BuiltInOp<EBuiltInOps::VmeBlockAdvancedMotionEstimateBidirectionalCheckIntel> : public AdvancedVmeBuiltinDispatchInfoBuilder {
|
|
public:
|
|
BuiltInOp(BuiltIns &kernelsLib, ClDevice &device)
|
|
: AdvancedVmeBuiltinDispatchInfoBuilder(kernelsLib, device, EBuiltInOps::VmeBlockAdvancedMotionEstimateBidirectionalCheckIntel,
|
|
"block_advanced_motion_estimate_bidirectional_check_intel") {
|
|
}
|
|
|
|
bool isBidirKernel() const override {
|
|
return true;
|
|
}
|
|
};
|
|
} // namespace NEO
|