/*
 * Copyright (c) 2017, Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

// VME KERNELS

__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
block_motion_estimate_intel(sampler_t accelerator,
                            __read_only image2d_t srcImg,
                            __read_only image2d_t refImg,
                            __global short2 *prediction_motion_vector_buffer,
                            __global short2 *motion_vector_buffer,
                            __global ushort *residuals, int height, int width,
                            int stride) {
  __local uint dst[64];
  __local ushort *dist = (__local ushort *)&dst[8 * 5];

  int sid_0 = stride * get_group_id(0);
  int gid_0 = sid_0 / height;
  int gid_1 = sid_0 % height;

  for (int sid = sid_0;
       sid < sid_0 + stride && gid_0 < width && gid_1 < height;
       sid++, gid_0 = sid / height, gid_1 = sid % height) {
    int2 srcCoord = 0;
    int2 refCoord = 0;

    srcCoord.x = gid_0 * 16 + get_global_offset(0);
    srcCoord.y = gid_1 * 16 + get_global_offset(1);

    short2 predMV = 0;

#ifndef HW_NULL_CHECK
    if (prediction_motion_vector_buffer != NULL)
#endif
    {
      predMV = prediction_motion_vector_buffer[gid_0 + gid_1 * width];
      refCoord.x = predMV.x / 4;
      refCoord.y = predMV.y / 4;
      refCoord.y = refCoord.y & 0xFFFE;
    }

    {
      intel_work_group_vme_mb_query(dst, srcCoord, refCoord, srcImg, refImg,
                                    accelerator);
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // Write Out Result

    // 4x4
    if (intel_get_accelerator_mb_block_type(accelerator) == 0x2) {
      int x = get_local_id(0) % 4;
      int y = get_local_id(0) / 4;
      int index = (gid_0 * 4 + x) + (gid_1 * 4 + y) * width * 4;

      short2 val = as_short2(dst[8 + (y * 4 + x) * 2]);
      motion_vector_buffer[index] = val;

#ifndef HW_NULL_CHECK
      if (residuals != NULL)
#endif
      {
        residuals[index] = dist[y * 4 + x];
      }
    }

    // 8x8
    if (intel_get_accelerator_mb_block_type(accelerator) == 0x1) {
      if (get_local_id(0) < 4) {
        int x = get_local_id(0) % 2;
        int y = get_local_id(0) / 2;
        int index = (gid_0 * 2 + x) + (gid_1 * 2 + y) * width * 2;

        short2 val = as_short2(dst[8 + (y * 2 + x) * 8]);
        motion_vector_buffer[index] = val;

#ifndef HW_NULL_CHECK
        if (residuals != NULL)
#endif
        {
          residuals[index] = dist[(y * 2 + x) * 4];
        }
      }
    }

    // 16x16
    if (intel_get_accelerator_mb_block_type(accelerator) == 0x0) {
      if (get_local_id(0) == 0) {
        int index = gid_0 + gid_1 * width;

        short2 val = as_short2(dst[8]);
        motion_vector_buffer[index] = val;

#ifndef HW_NULL_CHECK
        if (residuals != NULL)
#endif
        {
          residuals[index] = dist[0];
        }
      }
    }
  }
}

__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
block_advanced_motion_estimate_check_intel(
    sampler_t accelerator, __read_only image2d_t srcImg,
    __read_only image2d_t refImg, uint flags, uint skip_block_type,
    uint search_cost_penalty, uint search_cost_precision,
    __global short2 *count_motion_vector_buffer,
    __global short2 *predictors_buffer,
    __global short2 *skip_motion_vector_buffer,
    __global short2 *motion_vector_buffer,
    __global char *intra_search_predictor_modes, __global ushort *residuals,
    __global ushort *skip_residuals, __global ushort *intra_residuals,
    __read_only image2d_t intraSrcImg, int height, int width, int stride) {
  __local uint dstSearch[64];         // 8 GRFs
  __local uint dstSkipIntra[64 + 24]; // 11 GRFs (8 for inter, 3 for intra)

  // Distortion is in the 6th GRF.
  __local ushort *distSearch = (__local ushort *)&dstSearch[8 * 5];

  // Initialize the MV cost table:
  // MV Cost in U4U4 format:
  // No cost    : 0, 0, 0, 0, 0, 0, 0, 0
  // Low Cost   : 1, 4, 5, 9, 10, 12, 14, 15
  // Normal Cost: 5, 26, 29, 43, 45, 47, 57, 57
  // High Cost  : 29, 61, 72, 78, 88, 89, 91, 92
  uint2 MVCostTable;
  if (search_cost_penalty == 1) {
    MVCostTable.s0 = 0x09050401;
    MVCostTable.s1 = 0x0F0E0C0A;
  } else if (search_cost_penalty == 2) {
    MVCostTable.s0 = 0x2B1D1A05;
    MVCostTable.s1 = 0x39392F2D;
  } else if (search_cost_penalty == 3) {
    MVCostTable.s0 = 0x4E483D1D;
    MVCostTable.s1 = 0x5C5B5958;
  } else {
    MVCostTable.s0 = 0;
    MVCostTable.s1 = 0;
  }

  uint MVCostPrecision = ((uint)search_cost_precision) << 16;

  // The frame is divided into rows * columns of MBs.
  // One h/w thread per WG.
  // One WG processes 'stride' MBs - one MB per iteration.
  // The number of WGs (i.e. h/w threads) is the number of MB columns.
  // Each iteration processes one MB - gid_0 is the MB column index and gid_1
  // is the MB row index.

  int sid_0 = stride * get_group_id(0);
  int gid_0 = sid_0 / height;
  int gid_1 = sid_0 % height;

  for (int sid = sid_0;
       sid < sid_0 + stride && gid_0 < width && gid_1 < height;
       sid++, gid_0 = sid / height, gid_1 = sid % height) {
    int2 srcCoord;

    srcCoord.x = gid_0 * 16 + get_global_offset(0); // 16 pixels wide MBs (globally scalar)
    srcCoord.y = gid_1 * 16 + get_global_offset(1); // 16 pixels tall MBs (globally scalar)

    uint curMB = gid_0 + gid_1 * width; // current MB id

    short2 count = count_motion_vector_buffer[curMB];

    int countPredMVs = count.x;
    if (countPredMVs != 0) {
      uint offset = curMB * 8;       // 8 predictors per MB
      offset += get_local_id(0) % 8; // 16 work-items access 8 MVs for the MB
                                     // (one predictor MV per SIMD channel)

      // Reduce predictors from Q-pixel to integer precision.
      int2 predMV = 0;
      if (get_local_id(0) < countPredMVs) {
        predMV = convert_int2(predictors_buffer[offset]); // one MV per work-item
        predMV.x /= 4;
        predMV.y /= 4;
        predMV.y &= 0xFFFE;
      }

      // Do up to 8 IMEs, get the best MVs and their distortions, and
      // optionally an FBR of the best MVs. Finally the results are written
      // out to SLM.
      intel_work_group_vme_mb_multi_query_8(
          dstSearch,       // best search MV and its distortions into SLM
          countPredMVs,    // count of predictor MVs (globally scalar - value
                           // range 1 to 8)
          MVCostPrecision, // MV cost precision
          MVCostTable,     // MV cost table
          srcCoord,        // MB 2-D offset (globally scalar)
          predMV,          // predictor MVs (up to 8 distinct MVs for SIMD16 thread)
          srcImg,          // source
          refImg,          // reference
          accelerator);    // vme object
    }

    int doIntra = (flags & 0x2) != 0;
    int intraEdges = 0;
    if (doIntra) {
      // Enable all edges by default.
      intraEdges = 0x3C;
      // If this is a left-edge MB, then disable left edges.
      if ((gid_0 == 0) & (get_global_offset(0) == 0)) {
        intraEdges &= 0x14;
      }
      // If this is a right-edge MB, then disable right edges.
      if (gid_0 == width - 1) {
        intraEdges &= 0x38;
      }
      // If this is a top-edge MB, then disable top edges.
      if ((gid_1 == 0) & (get_global_offset(1) == 0)) {
        intraEdges &= 0x20;
      }
      // Set bit6=bit5.
      intraEdges |= ((intraEdges & 0x20) << 1);
      intraEdges <<= 8;
    }

    int countSkipMVs = count.y;
    if (countSkipMVs != 0 || doIntra == true) {
      uint offset = curMB * 8;         // 8 sets of skip check MVs per MB
      offset += (get_local_id(0) % 8); // 16 work-items access 8 sets of MVs
                                       // for the MB (one set of skip MVs per
                                       // SIMD channel)

      // Do up to 8 skip checks and get the distortions for each of them.
      // Finally the results are written out to SLM.

      if ((skip_block_type == 0x0) | ((doIntra) & (countSkipMVs == 0))) {
        int skipMVs = 0;
        if (get_local_id(0) < countSkipMVs) {
          __global int *skip1_motion_vector_buffer =
              (__global int *)skip_motion_vector_buffer;
          skipMVs = skip1_motion_vector_buffer[offset]; // one packed MV per
                                                        // work-item
        }

        intel_work_group_vme_mb_multi_check_16x16(
            dstSkipIntra, // distortions into SLM
            countSkipMVs, // count of skip check MVs (value range 0 to 8)
            doIntra,      // compute intra modes
            intraEdges,   // intra edges to use
            srcCoord,     // MB 2-D offset (globally scalar)
            skipMVs,      // skip check MVs (up to 8 sets of skip check MVs
                          // for SIMD16 thread)
            srcImg,       // source
            refImg,       // reference
            intraSrcImg,  // intra source
            accelerator);
      }

      if ((skip_block_type == 0x1) & (countSkipMVs > 0)) {
        int4 skipMVs = 0;
        if (get_local_id(0) < countSkipMVs) {
          __global int4 *skip4_motion_vector_buffer =
              (__global int4 *)(skip_motion_vector_buffer);
          skipMVs = skip4_motion_vector_buffer[offset]; // four-component MVs
                                                        // per work-item
        }

        intel_work_group_vme_mb_multi_check_8x8(
            dstSkipIntra, // distortions into SLM
            countSkipMVs, // count of skip check MVs per MB (value range 0 to 8)
            doIntra,      // compute intra modes
            intraEdges,   // intra edges to use
            srcCoord,     // MB 2-D offset (globally scalar)
            skipMVs,      // skip check MVs (up to 8 sets of skip check MVs
                          // for SIMD16 thread)
            srcImg,       // source
            refImg,       // reference
            intraSrcImg,  // intra source
            accelerator);
      }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // Write out the motion estimation result:
    // Result format
    //   Hierarchical row-major layout
    //   i.e. row-major of block MVs in MBs, and row-major of 8 sets of
    //   MVs/distortions in blocks

    if (countPredMVs != 0) {
      // 4x4
      if (intel_get_accelerator_mb_block_type(accelerator) == 0x2) {
        int index = (gid_0 * 16 + get_local_id(0)) + (gid_1 * 16 * width);

        // 1. 16 work-items enabled.
        // 2. Work-items gather fwd MVs in strided dword locations 0, 2, .., 30
        //    (interleaved fwd/bwd MVs) with constant offset 8 (control data
        //    size) from SLM into contiguous short2 locations 0, 1, .., 15 of
        //    the global buffer motion_vector_buffer with offset index.
        // 3. Work-items gather contiguous ushort locations 0, 1, .., 15 from
        //    distSearch into contiguous ushort locations 0, 1, .., 15 of
        //    residuals with offset index.

        short2 val = as_short2(dstSearch[8 + get_local_id(0) * 2]);
        motion_vector_buffer[index] = val;

#ifndef HW_NULL_CHECK
        if (residuals != NULL)
#endif
        {
          residuals[index] = distSearch[get_local_id(0)];
        }
      }
      // 8x8
      else if (intel_get_accelerator_mb_block_type(accelerator) == 0x1) {
        // Only the first 4 work-items are needed.
        if (get_local_id(0) < 4) {
          int index = (gid_0 * 4 + get_local_id(0)) + (gid_1 * 4 * width);

          // 1. 4 work-items enabled.
          // 2. Work-items gather fwd MVs in strided dword locations 0, 8, 16,
          //    24 (interleaved fwd/bwd MVs) with constant offset 8 from SLM
          //    into contiguous short2 locations 0, 1, .., 15 of the global
          //    buffer motion_vector_buffer with offset index.
          // 3.
          //    Work-items gather strided ushort locations 0, 4, 8, 12 from
          //    distSearch into contiguous ushort locations 0, 1, .., 15 of
          //    residuals with offset index.

          short2 val = as_short2(dstSearch[8 + get_local_id(0) * 4 * 2]);
          motion_vector_buffer[index] = val;

#ifndef HW_NULL_CHECK
          if (residuals != NULL)
#endif
          {
            residuals[index] = distSearch[get_local_id(0) * 4];
          }
        }
      }
      // 16x16
      else if (intel_get_accelerator_mb_block_type(accelerator) == 0x0) {
        // Only the 1st work-item is needed.
        if (get_local_id(0) == 0) {
          int index = gid_0 + gid_1 * width;

          // 1. 1 work-item enabled.
          // 2. Work-item gathers the fwd MV in dword location 0 with constant
          //    offset 8 from SLM into short2 location 0 of the global buffer
          //    motion_vector_buffer with offset index.
          // 3. Work-item gathers ushort location 0 from distSearch into
          //    ushort location 0 of residuals with offset index.

          short2 val = as_short2(dstSearch[8]);
          motion_vector_buffer[index] = val;

#ifndef HW_NULL_CHECK
          if (residuals != NULL)
#endif
          {
            residuals[index] = distSearch[0];
          }
        }
      }
    }

    // Write out the motion skip check result:
    // Result format
    //   Hierarchical row-major layout
    //   i.e. row-major of blocks in MBs, and row-major of 8 sets of
    //   distortions in blocks

    if (countSkipMVs != 0) {
      if (skip_block_type == 0x0) {
        // Copy out 8 (1 component) sets of distortion values.
        int index = (gid_0 * 8) + (get_local_id(0)) + (gid_1 * 8 * width);
        if (get_local_id(0) < countSkipMVs) {
          __local ushort *distSkip = (__local ushort *)&dstSkipIntra[0];
          // 1. Up to 8 work-items are enabled.
          // 2. The work-items gather distSkip locations 0, 16*1, .., 16*7 and
          //    copy them to contiguous skip_residuals locations 0, 1, 2, .., 7.
          skip_residuals[index] = distSkip[get_local_id(0) * 16];
        }
      } else {
        // Copy out 8 (4 component) sets of distortion values.
        int index =
            (gid_0 * 8 * 4) + (get_local_id(0)) + (gid_1 * 8 * 4 * width);
        __local ushort *distSkip = (__local ushort *)&dstSkipIntra[0];
        if (get_local_id(0) < countSkipMVs * 4) {
          // 1. Up to 16 work-items are enabled.
          // 2. The work-items gather distSkip locations 0, 4*1, .., 4*31 and
          //    copy them to contiguous skip_residuals locations 0, 1, 2, .., 31.
          skip_residuals[index] = distSkip[get_local_id(0) * 4];
          skip_residuals[index + 16] = distSkip[(get_local_id(0) + 16) * 4];
        }
      }
    }

    // Write out the intra search result:
    if (doIntra) {
      int index_low =
          (gid_0 * 22) + (get_local_id(0) * 2) + (gid_1 * 22 * width);
      int index_high =
          (gid_0 * 22) + (get_local_id(0) * 2) + 1 + (gid_1 * 22 * width);

      // Write out the 4x4 intra modes.
      if (get_local_id(0) < 8) {
        __local char *dstIntra_4x4 =
            (__local char *)(&dstSkipIntra[64 + 16 + 4]);
        char value = dstIntra_4x4[get_local_id(0)];
        char value_low = (value) & 0xf;
        char value_high = (value >> 4) & 0xf;
        intra_search_predictor_modes[index_low + 5] = value_low;
        intra_search_predictor_modes[index_high + 5] = value_high;
      }

      // Write out the 8x8 intra modes.
      if (get_local_id(0) < 4) {
        __local char *dstIntra_8x8 =
            (__local char *)(&dstSkipIntra[64 + 8 + 4]);
        char value = dstIntra_8x8[get_local_id(0) * 2];
        char value_low = (value) & 0xf;
        int index = (gid_0 * 22) + (get_local_id(0)) + (gid_1 * 22 * width);
        intra_search_predictor_modes[index + 1] = value_low;
      }

      // Write out the 16x16 intra modes.
      if (get_local_id(0) < 1) {
        __local char *dstIntra_16x16 =
            (__local char *)(&dstSkipIntra[64 + 0 + 4]);
        char value = dstIntra_16x16[get_local_id(0)];
        char value_low = (value) & 0xf;
        intra_search_predictor_modes[index_low] = value_low;
      }

      // Get the intra residuals.
#ifndef HW_NULL_CHECK
      if (intra_residuals != NULL)
#endif
      {
        int index = (gid_0 * 4) + (gid_1 * 4 * width);

        if (get_local_id(0) < 1) {
          __local ushort *distIntra_4x4 =
              (__local ushort *)(&dstSkipIntra[64 + 16 + 3]);
          __local ushort *distIntra_8x8 =
              (__local ushort *)(&dstSkipIntra[64 + 8 + 3]);
          __local ushort *distIntra_16x16 =
              (__local ushort *)(&dstSkipIntra[64 + 0 + 3]);
          intra_residuals[index + 2] = distIntra_4x4[0];
          intra_residuals[index + 1] = distIntra_8x8[0];
          intra_residuals[index + 0] = distIntra_16x16[0];
        }
      }
    }
  }
}

/*************************************************************************************************
 Built-in kernel: block_advanced_motion_estimate_bidirectional_check_intel

 Description:

 1. Do motion estimation with 0 to 4 predictor MVs using 0 to 4 IMEs (integer
    motion estimations) per macro-block, calculating the best search MVs per
    specified (16x16, 8x8, 4x4) luma block with the lowest distortion from
    amongst the 0 to 4 IME results, and optionally do FBR (fractional
    bi-directional refinement) on the best IME search results to refine them.
    The best search (FBR if done, or IME) MVs and their distortions are
    returned.

 2. Do unidirectional or bidirectional skip (zero search) checks with 0 to 4
    sets of skip check MVs for (16x16, 8x8) luma blocks using 0 to 4 SICs
    (skip and intra checks), and return the distortions associated with the
    input sets of skip check MVs per specified luma block. 4x4 blocks are not
    supported by h/w for skip checks.

 3. Do intra-prediction for (16x16, 8x8, 4x4) luma blocks and (8x8) chroma
    blocks using 3 SICs, and return the predictor modes and their associated
    distortions. Intra-prediction is done for all block sizes. Support for
    8x8 chroma blocks cannot be enabled until NV image formats are supported
    in OCL.
*************************************************************************************************/

__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void
block_advanced_motion_estimate_bidirectional_check_intel(
    sampler_t accelerator, __read_only image2d_t srcImg,
    __read_only image2d_t refImg, __read_only image2d_t src_check_image,
    __read_only image2d_t ref0_check_image,
    __read_only image2d_t ref1_check_image, uint flags,
    uint search_cost_penalty, uint search_cost_precision, short2 count_global,
    uchar bidir_weight, __global short2 *count_motion_vector_buffer,
    __global short2 *prediction_motion_vector_buffer,
    __global char *skip_input_mode_buffer,
    __global short2 *skip_motion_vector_buffer,
    __global short2 *search_motion_vector_buffer,
    __global char *intra_search_predictor_modes,
    __global ushort *search_residuals, __global ushort *skip_residuals,
    __global ushort *intra_residuals, __read_only image2d_t intraSrcImg,
    int height, int width, int stride) {
  __local uint dstSearch[64];         // 8 GRFs
  __local uint dstSkipIntra[32 + 24]; // 7 GRFs (4 for inter, 3 for intra)

  // Distortion is in the 6th GRF.
  __local ushort *distSearch = (__local ushort *)&dstSearch[8 * 5];

  // Initialize the MV cost table:
  // MV Cost in U4U4 format:
  // No cost    : 0, 0, 0, 0, 0, 0, 0, 0
  // Low Cost   : 1, 4, 5, 9, 10, 12, 14, 15
  // Normal Cost: 5, 26, 29, 43, 45, 47, 57, 57
  // High Cost  : 29, 61, 72, 78, 88, 89, 91, 92
  uint2 MVCostTable;
  if (search_cost_penalty == 1) {
    MVCostTable.s0 = 0x09050401;
    MVCostTable.s1 = 0x0F0E0C0A;
  } else if (search_cost_penalty == 2) {
    MVCostTable.s0 = 0x2B1D1A05;
    MVCostTable.s1 = 0x39392F2D;
  } else if (search_cost_penalty == 3) {
    MVCostTable.s0 = 0x4E483D1D;
    MVCostTable.s1 = 0x5C5B5958;
  } else {
    MVCostTable.s0 = 0;
    MVCostTable.s1 = 0;
  }

  uint MVCostPrecision = ((uint)search_cost_precision) << 16;

  // The frame is divided into rows * columns of MBs.
  // One h/w thread per WG.
  // One WG processes 'stride' MBs - one MB per iteration.
  // The number of WGs (i.e. h/w threads) is the number of MB columns.
  // Each iteration processes one MB - gid_0 is the MB column index and gid_1
  // is the MB row index.

  int sid_0 = stride * get_group_id(0);
  int gid_0 = sid_0 / height;
  int gid_1 = sid_0 % height;

  for (int sid = sid_0;
       sid < sid_0 + stride && gid_0 < width && gid_1 < height;
       sid++, gid_0 = sid / height, gid_1 = sid % height) {
    int2 srcCoord;

    srcCoord.x = gid_0 * 16 + get_global_offset(0); // 16 pixels wide MBs (globally scalar)
    srcCoord.y = gid_1 * 16 + get_global_offset(1); // 16 pixels tall MBs (globally scalar)

    uint curMB = gid_0 + gid_1 * width; // current MB id

    short2 count;
    // If either the search or skip vector count is per-MB, then we need to
    // read the count motion vector buffer.
    if ((count_global.s0 == -1) | (count_global.s1 == -1)) {
      count = count_motion_vector_buffer[curMB];
    }
    // If either the search or skip vector count is per-frame, use that value.
    if (count_global.s0 >= 0) {
      count.s0 = count_global.s0;
    }
    if (count_global.s1 >= 0) {
      count.s1 = count_global.s1;
    }

    int countPredMVs = count.x;
    if (countPredMVs != 0) {
      uint offset = curMB * 4;       // 4 predictors per MB
      offset += get_local_id(0) % 4; // 16 work-items access 4 MVs for the MB
                                     // (one predictor MV per SIMD channel)

      // Reduce predictors from Q-pixel to integer precision.
      int2 predMV = 0;
      if (get_local_id(0) < countPredMVs) {
        // one MV per work-item
        predMV = convert_int2(prediction_motion_vector_buffer[offset]);
        // Predictors are input in Q-pixel resolution. Convert that to
        // integer resolution.
        predMV.x /= 4;
        predMV.y /= 4;
        predMV.y &= 0xFFFFFFFE;
      }

      // Do up to 4 IMEs, get the best MVs and their distortions, and
      // optionally an FBR of the best MVs. Finally the results are written
      // out to SLM.
      intel_work_group_vme_mb_multi_query_4(
          dstSearch,       // best search MV and its distortions into SLM
          countPredMVs,    // count of predictor MVs (globally scalar - value
                           // range 1 to 4)
          MVCostPrecision, // MV cost precision
          MVCostTable,     // MV cost table
          srcCoord,        // MB 2-D offset (globally scalar)
          predMV,          // predictor MVs (up to 4 distinct MVs for SIMD16 thread)
          srcImg,          // source
          refImg,          // reference
          accelerator);    // vme object
    }

    int doIntra = ((flags & 0x2) != 0);
    int intraEdges = 0;
    if (doIntra) {
      // Enable all edges by default.
      intraEdges = 0x3C;
      // If this is a left-edge MB, then disable left edges.
      if ((gid_0 == 0) & (get_global_offset(0) == 0)) {
        intraEdges &= 0x14;
      }
      // If this is a right-edge MB, then disable right edges.
      if (gid_0 == width - 1) {
        intraEdges &= 0x38;
      }
      // If this is a top-edge MB, then disable top edges.
      if ((gid_1 == 0) & (get_global_offset(1) == 0)) {
        intraEdges &= 0x20;
      }
      // Set bit6=bit5.
      intraEdges |= ((intraEdges & 0x20) << 1);
      intraEdges <<= 8;
    }

    int skip_block_type_8x8 = flags & 0x4;

    int countSkipMVs = count.y;
    if (countSkipMVs != 0 || doIntra == true) {
      // One set of skip MVs per SIMD channel.
      // Do up to 4 skip checks and get the distortions for each of them.
      // Finally the results are written out to SLM.

      if ((skip_block_type_8x8 == 0) | ((doIntra) & (countSkipMVs == 0))) {
        // 16x16:
        uint offset = curMB * 4 * 2; // 4 sets of skip check MVs per MB

        int skipMV = 0;
        if (get_local_id(0) < countSkipMVs * 2) // need 2 values per MV
        {
          offset += (get_local_id(0)); // 16 work-items access 4 sets of MVs
                                       // for the MB
          __global int *skip1_motion_vector_buffer =
              (__global int *)skip_motion_vector_buffer;
          skipMV = skip1_motion_vector_buffer[offset]; // one MV per work-item
        }

        uchar skipMode = 0;
        if (get_local_id(0) < countSkipMVs) {
          skipMode = skip_input_mode_buffer[curMB];
          if (skipMode == 0) {
            skipMode = 1;
          }
          if (skipMode > 3) {
            skipMode = 3;
          }
        }

        intel_work_group_vme_mb_multi_bidir_check_16x16(
            dstSkipIntra,     // distortions into SLM
            countSkipMVs,     // count of skip check MVs (globally scalar -
                              // value range 1 to 4)
            doIntra,          // compute intra modes
            intraEdges,       // intra edges to use
            srcCoord,         // MB 2-D offset (globally scalar)
            bidir_weight,     // bidirectional weight
            skipMode,         // skip modes
            skipMV,           // skip check MVs (up to 4 distinct sets of skip
                              // check MVs for SIMD16 thread)
            src_check_image,  // source
            ref0_check_image, // reference fwd
            ref1_check_image, // reference bwd
            intraSrcImg,      // intra source
            accelerator);     // vme object
      } else {
        // 8x8:
        uint offset = curMB * 4 * 8; // 4 sets of skip check MVs, 16 shorts
                                     // (8 ints) each per MB

        int2 skipMVs = 0;
        if (get_local_id(0) < countSkipMVs * 8) // need 8 values per MV
        {
          offset += (get_local_id(0)); // 16 work-items access 4 sets of MVs
                                       // for the MB
          __global int *skip1_motion_vector_buffer =
              (__global int *)(skip_motion_vector_buffer);
          skipMVs.x = skip1_motion_vector_buffer[offset]; // two packed MVs
                                                          // per work-item
          skipMVs.y = skip1_motion_vector_buffer[offset + 16];
        }

        uchar skipModes = 0;
        if (get_local_id(0) < countSkipMVs) {
          skipModes = skip_input_mode_buffer[curMB];
        }

        intel_work_group_vme_mb_multi_bidir_check_8x8(
            dstSkipIntra,     // distortions into SLM
            countSkipMVs,     // count of skip check MVs per MB (globally
                              // scalar - value range 1 to 4)
            doIntra,          // compute intra modes
            intraEdges,       // intra edges to use
            srcCoord,         // MB 2-D offset (globally scalar)
            bidir_weight,     // bidirectional weight
            skipModes,        // skip modes
            skipMVs,          // skip check MVs (up to 4 distinct sets of skip
                              // check MVs for SIMD16 thread)
            src_check_image,  // source
            ref0_check_image, // reference fwd
            ref1_check_image, // reference bwd
            intraSrcImg,      // intra source
            accelerator);     // vme object
      }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // Write out the motion estimation result:
    // Result format
    //   Hierarchical row-major layout
    //   i.e. row-major of block MVs in MBs, and row-major of 4 sets of
    //   MVs/distortions in blocks

    if (countPredMVs != 0) {
      // 4x4
      if (intel_get_accelerator_mb_block_type(accelerator) == 0x2) {
        int index = (gid_0 * 16 + get_local_id(0)) + (gid_1 * 16 * width);

        // 1. 16 work-items enabled.
        // 2. Work-items gather fwd MVs in strided dword locations 0, 2, .., 30
        //    (interleaved fwd/bwd MVs) with constant offset 8 (control data
        //    size) from SLM into contiguous short2 locations 0, 1, .., 15 of
        //    the global buffer search_motion_vector_buffer with offset index.
        // 3. Work-items gather contiguous ushort locations 0, 1, .., 15 from
        //    distSearch into contiguous ushort locations 0, 1, .., 15 of
        //    search_residuals with offset index.

        short2 val = as_short2(dstSearch[8 + get_local_id(0) * 2]);
        search_motion_vector_buffer[index] = val;

#ifndef HW_NULL_CHECK
        if (search_residuals != NULL)
#endif
        {
          search_residuals[index] = distSearch[get_local_id(0)];
        }
      }
      // 8x8
      else if (intel_get_accelerator_mb_block_type(accelerator) == 0x1) {
        // Only the first 4 work-items are needed.
        if (get_local_id(0) < 4) {
          int index = (gid_0 * 4 + get_local_id(0)) + (gid_1 * 4 * width);

          // 1. 4 work-items enabled.
          // 2. Work-items gather fwd MVs in strided dword locations 0, 8, 16,
          //    24 (interleaved fwd/bwd MVs) with constant offset 8 from SLM
          //    into contiguous short2 locations 0, 1, .., 15 of the global
          //    buffer search_motion_vector_buffer with offset index.
          // 3. Work-items gather strided ushort locations 0, 4, 8, 12 from
          //    distSearch into contiguous ushort locations 0, 1, .., 15 of
          //    search_residuals with offset index.

          short2 val = as_short2(dstSearch[8 + get_local_id(0) * 4 * 2]);
          search_motion_vector_buffer[index] = val;

#ifndef HW_NULL_CHECK
          if (search_residuals != NULL)
#endif
          {
            search_residuals[index] = distSearch[get_local_id(0) * 4];
          }
        }
      }
      // 16x16
      else if (intel_get_accelerator_mb_block_type(accelerator) == 0x0) {
        // Only the 1st work-item is needed.
        if (get_local_id(0) == 0) {
          int index = gid_0 + gid_1 * width;

          // 1. 1 work-item enabled.
          // 2. Work-item gathers the fwd MV in dword location 0 with constant
          //    offset 8 from SLM into short2 location 0 of the global buffer
          //    search_motion_vector_buffer with offset index.
          // 3. Work-item gathers ushort location 0 from distSearch into
          //    ushort location 0 of search_residuals with offset index.

          short2 val = as_short2(dstSearch[8]);
          search_motion_vector_buffer[index] = val;

#ifndef HW_NULL_CHECK
          if (search_residuals != NULL)
#endif
          {
            search_residuals[index] = distSearch[0];
          }
        }
      }
    }

    // Write out the motion skip check result:
    // Result format
    //   Hierarchical row-major layout
    //   i.e. row-major of blocks in MBs, and row-major of 4 sets of
    //   distortions in blocks

    if (countSkipMVs != 0) {
      if (skip_block_type_8x8 == false) {
        // Copy out 4 (1 component) sets of distortion values.
        int index = (gid_0 * 4) + (get_local_id(0)) + (gid_1 * 4 * width);
        if (get_local_id(0) < countSkipMVs) {
          // 1. Up to 4 work-items are enabled.
          // 2. The work-items gather distSkip locations 0, 16*1, .., 16*3 and
          //    copy them to contiguous skip_residuals locations 0, 1, .., 3.
          __local ushort *distSkip = (__local ushort *)&dstSkipIntra[0];
          skip_residuals[index] = distSkip[get_local_id(0) * 16];
        }
      } else {
        // Copy out 4 (4 component) sets of distortion values.
        int index =
            (gid_0 * 4 * 4) + (get_local_id(0)) + (gid_1 * 4 * 4 * width);
        if (get_local_id(0) < countSkipMVs * 4) {
          // 1. Up to 16 work-items are enabled.
          // 2. The work-items gather distSkip locations 0, 4*1, .., 4*15 and
          //    copy them to contiguous skip_residuals locations 0, 1, .., 15.
          __local ushort *distSkip = (__local ushort *)&dstSkipIntra[0];
          skip_residuals[index] = distSkip[get_local_id(0) * 4];
        }
      }
    }

    // Write out the intra search result:
    if (doIntra) {
      // Write out the 4x4 intra modes.
      if (get_local_id(0) < 8) {
        __local char *dstIntra_4x4 =
            (__local char *)(&dstSkipIntra[32 + 16 + 4]);
        char value = dstIntra_4x4[get_local_id(0)];
        char value_low = (value) & 0xf;
        char value_high = (value >> 4) & 0xf;
        int index_low =
            (gid_0 * 22) + (get_local_id(0) * 2) + (gid_1 * 22 * width);
        int index_high =
            (gid_0 * 22) + (get_local_id(0) * 2) + 1 + (gid_1 * 22 * width);
        intra_search_predictor_modes[index_low + 5] = value_low;
        intra_search_predictor_modes[index_high + 5] = value_high;
      }

      // Write out the 8x8 intra modes.
      if (get_local_id(0) < 4) {
        __local char *dstIntra_8x8 =
            (__local char *)(&dstSkipIntra[32 + 8 + 4]);
        char value = dstIntra_8x8[get_local_id(0) * 2];
        char value_low = (value) & 0xf;
        int index = (gid_0 * 22) + (get_local_id(0)) + (gid_1 * 22 * width);
        intra_search_predictor_modes[index + 1] = value_low;
      }

      // Write out the 16x16 intra modes.
      if (get_local_id(0) < 1) {
        __local char *dstIntra_16x16 =
            (__local char *)(&dstSkipIntra[32 + 0 + 4]);
        char value = dstIntra_16x16[0];
        char value_low = (value) & 0xf;
        int index = (gid_0 * 22) + (gid_1 * 22 * width);
        intra_search_predictor_modes[index] = value_low;
      }

      // Get the intra residuals.
#ifndef HW_NULL_CHECK
      if (intra_residuals != NULL)
#endif
      {
        int index = (gid_0 * 4) + (gid_1 * 4 * width);

        if (get_local_id(0) < 1) {
          __local ushort *distIntra_4x4 =
              (__local ushort *)(&dstSkipIntra[32 + 16 + 3]);
          __local ushort *distIntra_8x8 =
              (__local ushort *)(&dstSkipIntra[32 + 8 + 3]);
          __local ushort *distIntra_16x16 =
              (__local ushort *)(&dstSkipIntra[32 + 0 + 3]);
          intra_residuals[index + 2] = distIntra_4x4[0];
          intra_residuals[index + 1] = distIntra_8x8[0];
          intra_residuals[index + 0] = distIntra_16x16[0];
        }
      }
    }
  }
}

// VEBOX KERNELS:

__kernel void ve_enhance_intel(sampler_t accelerator, int flags,
                               __read_only image2d_t current_input,
                               __write_only image2d_t current_output) {}

__kernel void ve_dn_enhance_intel(sampler_t accelerator, int flags,
                                  __read_only image2d_t ref_input,
                                  __read_only image2d_t current_input,
                                  __write_only image2d_t current_output) {}

__kernel void ve_dn_di_enhance_intel(sampler_t accelerator, int flags,
                                     __read_only image2d_t current_input,
                                     __read_only image2d_t ref_input,
                                     __write_only image2d_t current_output,
                                     __write_only image2d_t ref_output,
                                     __write_only image2d_t dndi_output) {}
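
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the built-in kernel interface): the helper
// below shows how the hierarchical row-major output layout used by
// block_motion_estimate_intel maps a work-group's linear sub-id and an NxN
// sub-block position to an index in motion_vector_buffer / residuals. The
// function name mb_output_index and its blocks_per_dim / block parameters are
// hypothetical and exist only to illustrate the indexing; the kernels above
// compute the same expressions inline.
int mb_output_index(int sid, int height, int width,
                    int blocks_per_dim, // 4 for 4x4, 2 for 8x8, 1 for 16x16
                    int block)          // 0 .. blocks_per_dim^2 - 1
{
  int gid_0 = sid / height;        // MB column, as computed in the kernels above
  int gid_1 = sid % height;        // MB row
  int bx = block % blocks_per_dim; // sub-block x within the MB
  int by = block / blocks_per_dim; // sub-block y within the MB
  // Row-major index into the frame-wide grid of NxN sub-blocks.
  return (gid_0 * blocks_per_dim + bx) +
         (gid_1 * blocks_per_dim + by) * width * blocks_per_dim;
}
// For example, with blocks_per_dim == 4 (4x4 blocks) the indices span
// 0 .. width * height * 16 - 1, which is the required size (in short2
// elements) of motion_vector_buffer for that block type.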