/*===================== begin_copyright_notice ==================================
Copyright (c) 2017 Intel Corporation
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
======================= end_copyright_notice ==================================*/
#include <cmath>
#include "HWConformity.h"
#include "Optimizer.h"
#include "visa_wa.h"
#include "DebugInfo.h"
#include "G4Verifier.h"
using namespace vISA;
static G4_CondModifier getReverseCondMod( G4_CondModifier mod )
{
switch(mod)
{
case Mod_z:
return Mod_z;
case Mod_e:
return Mod_e;
case Mod_nz:
return Mod_nz;
case Mod_ne:
return Mod_ne;
case Mod_g:
return Mod_l;
case Mod_ge:
return Mod_le;
case Mod_l:
return Mod_g;
case Mod_le:
return Mod_ge;
default:
MUST_BE_TRUE( 0, "Invalid conditional modifier input for reversed conditional modifier." );
}
return Mod_cond_undef;
}
static bool isCompressedInst( G4_INST *inst ){
return inst->isComprInst();
}
#define isUnitRegionRow( opnd, exec_size ) \
( opnd->isImm() || \
opnd->isSrcRegRegion() && opnd->asSrcRegRegion()->getRegion()->width == exec_size || \
opnd->isSrcRegRegion() && opnd->asSrcRegRegion()->getRegion()->vertStride == 0 )
G4_Align HWConformity::getDclAlignment( int opndBytes, G4_INST *inst, bool isScalar, G4_SubReg_Align &subAlign )
{
G4_Align align = Either;
subAlign = Get_G4_SubRegAlign_From_Size( (uint16_t) opndBytes );
bool hasAccSrc = inst->hasACCSrc();
if( hasAccSrc && subAlign < Sixteen_Word )
{
subAlign = Sixteen_Word;
}
if (!isScalar)
{
// certain instructions have additional alignment requirements for non-scalar sources
if (!builder.hasAlign1Ternary() && inst->getNumSrc() == 3 && !inst->isSend() && subAlign < Eight_Word)
{
subAlign = Eight_Word;
}
if (inst->isMath())
{
subAlign = Sixteen_Word;
}
}
return align;
}
/*
 * create a new mov instruction
 * mov (esize) dst tmp:type
 * where esize is "inst"'s execution size, and insert it after "inst"
 * the return value is the new temp variable as a dst operand
 * If dstAlign is specified, the new temp will be at least aligned to that size
 */
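// Illustrative example (hypothetical operands, in the style of the other
// examples in this file): a caller that retypes or realigns the dst of
//   add (8) r10.0<1>:d r20.0<8;8,1>:d r30.0<8;8,1>:d
// via insertMovAfter plus setDest ends up with
//   add (8) TMP(0,0)<1>:d r20.0<8;8,1>:d r30.0<8;8,1>:d
//   mov (8) r10.0<1>:d TMP(0,0)<1;1,0>:d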
G4_DstRegRegion* HWConformity::insertMovAfter( INST_LIST_ITER& it, G4_DstRegRegion* dst, G4_Type type, G4_BB *bb, G4_SubReg_Align dstAlign )
{
G4_INST* inst = *it;
if( !dst )
{
return dst;
}
if (inst->hasNULLDst() )
{
return builder.createDstRegRegion(Direct,
dst->getBase(),
0,
0,
1,
type);
}
INST_LIST_ITER iter = it;
iter++;
unsigned char exec_size = inst->getExecSize();
G4_Type execType = inst->isRawMov() ? dst->getType() : inst->getExecType();
bool scalarSrc = true;
for (int i = 0, numSrc = inst->getNumSrc(); i < numSrc; i++)
{
G4_Operand *src = inst->getSrc(i);
if( !src->isImm() )
{
if (!(inst->isMath() && i == 1 && src->isNullReg()) &&
(src->isSrcRegRegion() && !src->asSrcRegRegion()->isScalar()))
{
scalarSrc = false;
}
}
else if( IS_VINTTYPE(src->getType()) || IS_VFTYPE(src->getType()) )
{
scalarSrc = false;
}
}
uint8_t newExecSize = ((inst->opcode() == G4_sel || inst->getImplAccSrc() || !scalarSrc) ? exec_size : 1);
uint32_t opExecWidthBytes = newExecSize * G4_Type_Table[execType].byteSize;
if( execType == Type_DF && IS_BTYPE( type ) )
{
type = ( type == Type_UB ? Type_UW : Type_W );
}
uint16_t dstWidthBytes = newExecSize * G4_Type_Table[type].byteSize;
uint16_t scale = G4_Type_Table[execType].byteSize / G4_Type_Table[type].byteSize;
/* According to the comments in the callers, MAD needs to have a packed format,
   but it ends up with hStride 2 due to DefHoisting, so here we try to undo that.
   For every other type, if srcType > dstType we need to adjust the regions;
   this is not necessary for HF since it is already packed. Without this check
   the src region of the mov below would be wrong, because HF is packed unlike
   the other data types:
   mad (8) r56.0.xyzw:hf -r37.0.xyzw:f r59.0.xyzw:hf r58.0.xyzw:hf {Align16, NoMask}
   mov (16) r44.0<2>:hf r56.0<16;8,2>:hf {Align1, H1} // #??:$39:%66
*/
if( scale == 0 || (getGenxPlatform() >= GENX_CHV && execType == Type_F && type == Type_HF))
{
scale = 1;
}
G4_SubReg_Align subAlign; // set by getDclAlignment
G4_Align align = getDclAlignment( opExecWidthBytes > dstWidthBytes ? opExecWidthBytes : dstWidthBytes,
inst, newExecSize == 1, subAlign );
if (subAlign < dstAlign)
{
subAlign = dstAlign;
}
RegionDesc* region = newExecSize > 1 ? builder.createRegionDesc(scale, 1, 0) : builder.getRegionScalar();
G4_Declare* dcl = builder.createTempVar( newExecSize == 1 ? 1 : newExecSize * scale, type, align, subAlign );
G4_SrcRegRegion *srcRegion = builder.Create_Src_Opnd_From_Dcl( dcl, region );
G4_Predicate *pred = NULL;
if (inst->opcode() != G4_sel)
{
pred = inst->getPredicate();
inst->setPredicate( NULL );
// maintainDU4TempMov will update def-use
}
unsigned int new_option = inst->getMaskOption();
G4_INST* newInst = builder.createInternalInst( pred, G4_mov, NULL, inst->getSaturate(),
exec_size, dst, srcRegion, NULL, new_option, inst->getLineNo(), inst->getCISAOff(),
inst->getSrcFilename() );
bb->instList.insert( iter, newInst );
// update propagation info
maintainDU4TempMov( inst, newInst );
if( type == dst->getType() )
{
newInst->setSaturate( false );
}
else if (type == Type_F || type == Type_DF)
{
inst->setSaturate(false);
}
inst->setExecSize( newExecSize );
if (newExecSize == 1)
{
inst->setOptions((inst->getOption() & ~InstOpt_Masks ) | InstOpt_WriteEnable);
}
return builder.Create_Dst_Opnd_From_Dcl( dcl, scale);
}
//
// replace instruction (*it)'s source srcPos, which must be a scalar/immediate,
// with a temp variable after inserting
// mov (esize) tmp<1>:type imm/scalar {options}
// before the instruction
// This is like insertMovBefore(), except that the latter will always use
// simd1 move for scalar/imm values, which may not be what we want
// NOTE: This does not check for redundant moves. We are counting on a later LVN pass
// to clean them up
//
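// Illustrative example (hypothetical operands): broadcasting the immediate
// src1 of
//   add (8) r10.0<1>:d r20.0<8;8,1>:d 0x2:d
// produces
//   mov (8) TMP(0,0)<1>:d 0x2:d
//   add (8) r10.0<1>:d r20.0<8;8,1>:d TMP(0,0)<1;1,0>:d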
void HWConformity::broadcast(
G4_BB* bb, INST_LIST_ITER it, int srcPos, G4_SubReg_Align align)
{
G4_INST* inst = *it;
G4_Operand* src = inst->getSrc(srcPos);
MUST_BE_TRUE(src->isImm() ||
(src->isSrcRegRegion() && src->asSrcRegRegion()->isScalar()),
"source must be an immediate or scalar");
G4_Type type = src->getType();
uint8_t execSize = inst->getExecSize();
uint32_t instMask = inst->getMaskOption();
// avoid simd16 Qword moves
MUST_BE_TRUE(execSize * G4_Type_Table[type].byteSize <= 2 * GENX_GRF_REG_SIZ,
"move can't exceed 2 GRFs");
G4_Declare* dcl = builder.createTempVar( execSize, type, Either, align );
G4_DstRegRegion* dst = builder.createDstRegRegion(
Direct,
dcl->getRegVar(),
0,
0,
1,
type);
G4_INST* newInst = builder.createInternalInst( NULL, G4_mov, NULL, false,
execSize, dst, src, NULL, instMask,
inst->getLineNo(), inst->getCISAOff(), inst->getSrcFilename() );
bb->instList.insert(it, newInst);
RegionDesc* srcRegion = builder.getRegionStride1();
G4_SrcRegRegion* newSrc = builder.Create_Src_Opnd_From_Dcl(dcl, srcRegion);
inst->setSrc(newSrc, srcPos);
newInst->addDefUse(inst, inst->getSrcOperandNum(srcPos));
}
//
// A simplified version of insertMovBefore(); this copies raw bytes from the source to a temp
// and replaces the original source with tmp. It is primarily used to satisfy operand alignment and region restrictions
// op (esize) ... (mod) src<region>:type
// -->
// mov (esize) tmp<1>:type src<region>:type
// op (esize) ... (mod) tmp<1;1,0>:type
//
// source must be a G4_SrcRegRegion (direct or indirect), immediates are not supported
// note that modifier is propagated from source to tmp, but region is not
//
//
G4_SrcRegRegion* HWConformity::insertCopyBefore(INST_LIST_ITER it, uint32_t srcNum,
G4_SubReg_Align tmpAlign, G4_BB *bb)
{
G4_INST* inst = *it;
G4_Operand* src = inst->getSrc(srcNum);
MUST_BE_TRUE(src != nullptr && src->isSrcRegRegion(), "source must be a SrcRegRegion");
G4_SrcRegRegion* origSrc = src->asSrcRegRegion();
uint8_t newExecSize = origSrc->isScalar() ? 1 : inst->getExecSize();
G4_Declare* dcl = builder.createTempVar(newExecSize, origSrc->getType(), Either, tmpAlign);
G4_SrcModifier modifier = origSrc->getModifier();
origSrc->setModifier(Mod_src_undef);
G4_DstRegRegion* dst = builder.Create_Dst_Opnd_From_Dcl(dcl, 1);
G4_INST* movInst = builder.createInternalInst(nullptr, G4_mov, nullptr, false,
newExecSize, dst, origSrc, nullptr, InstOpt_WriteEnable,
inst->getLineNo(), inst->getCISAOff(), inst->getSrcFilename());
bb->instList.insert(it, movInst);
G4_SrcRegRegion* newSrc = builder.createSrcRegRegion(modifier, Direct, dcl->getRegVar(),
0, 0, newExecSize == 1 ? builder.getRegionScalar() : builder.getRegionStride1(),
dcl->getElemType());
return newSrc;
}
/*
* create a new mov instruction
* mov (esize) tmp<1>:type src
* where esize is "inst"'s execution size and insert it before "inst"
* return value is the new temp variable as a source operand
*/
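// Illustrative example (hypothetical operands): widening the :w src1 of
//   add (8) r10.0<1>:d r20.0<8;8,1>:d r30.0<8;8,1>:w
// to :d via insertMovBefore plus setSrc yields
//   mov (8) TMP(0,0)<1>:d r30.0<8;8,1>:w
//   add (8) r10.0<1>:d r20.0<8;8,1>:d TMP(0,0)<8;8,1>:d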
G4_Operand* HWConformity::insertMovBefore(
INST_LIST_ITER it, uint32_t srcNum, G4_Type type, G4_BB *bb,
G4_SubReg_Align tmpAlign )
{
G4_INST* inst = *it;
G4_Align align = Either;
G4_SubReg_Align subAlign;
RegionDesc* region = NULL;
unsigned short vs = 0, hs = 0, wd = 1;
unsigned char exec_size = inst->getExecSize();
G4_Operand *src = inst->getSrc( srcNum );
unsigned short scale = IS_BTYPE( src->getType() ) && src->getType() == type ? 2 : 1;
uint8_t newExecSize = ((src->isImm() && !IS_VTYPE(src->getType())) ||
(src->isSrcRegRegion() && src->asSrcRegRegion()->isScalar()))
? 1 : exec_size;
if( newExecSize > 1 )
{
if (scale == 1 && !IS_VTYPE(src->getType()))
{
scale = (unsigned short) (G4_Type_Table[src->getType()].byteSize / G4_Type_Table[type].byteSize);
}
if( scale == 0 )
{
scale = 1;
}
hs = scale;
if( isCompressedInst(inst) || G4_Type_Table[type].byteSize * exec_size * hs > G4_GRF_REG_NBYTES )
{
wd = exec_size / 2;
}
else
{
wd = exec_size;
}
vs = wd * hs;
}
else
{
vs = 0;
wd = 1;
hs = 0;
scale = (unsigned short)(G4_Type_Table[src->getType()].byteSize / G4_Type_Table[type].byteSize);
if (scale == 0)
{
scale = 1;
}
}
region = builder.createRegionDesc(vs, wd, hs);
int opExecWidthBytes = IS_VINTTYPE(src->getType()) ?
G4_GRF_REG_NBYTES/2 * ( exec_size > 8 ? exec_size/8 : 1 ) :
( src->getType() == Type_VF ?
G4_GRF_REG_NBYTES/2 * ( exec_size > 4 ? exec_size/4 : 1 ) :
newExecSize * G4_Type_Table[type].byteSize * scale );
align = getDclAlignment( opExecWidthBytes, inst, newExecSize == 1, subAlign );
if (subAlign < tmpAlign)
{
subAlign = tmpAlign;
}
uint32_t newInstEMask = newExecSize == 1 ? InstOpt_WriteEnable : inst->getMaskOption();
// due to the old BDW regioning rule we need a NoMask inst here so it can be split
if (builder.getOptions()->isTargetCM() && getGenxPlatform() == GENX_BDW)
{
if (bb->isInSimdFlow())
{
newInstEMask = InstOpt_WriteEnable;
}
}
G4_Declare* dcl = builder.createTempVar( newExecSize == 1 ? 1 : newExecSize * scale, type, align, subAlign );
G4_DstRegRegion *dstRegion = builder.Create_Dst_Opnd_From_Dcl(dcl, scale);
G4_INST* newInst = builder.createInternalInst(nullptr, G4_mov, nullptr, false,
newExecSize, dstRegion, builder.duplicateOperand(src), nullptr, newInstEMask,
inst->getLineNo(), inst->getCISAOff(), inst->getSrcFilename() );
bb->instList.insert( it, newInst );
inst->transferDef( newInst, Gen4_Operand_Number(srcNum + 1), Opnd_src0 );
newInst->addDefUse(inst, Gen4_Operand_Number(srcNum + 1));
G4_SrcModifier modifier = Mod_src_undef;
if (src->isSrcRegRegion() && src->asSrcRegRegion()->getModifier() == Mod_Not)
{
// mov doesn't support logic modifiers, so we keep it on the new source
modifier = Mod_Not;
newInst->getSrc(0)->asSrcRegRegion()->setModifier(Mod_src_undef);
}
return builder.createSrcRegRegion(
modifier,
Direct,
dcl->getRegVar(),
0,
0,
region,
dcl->getElemType());
}
void HWConformity::fixPackedSource(INST_LIST_ITER it, G4_BB *bb, G4_Type extype)
{
G4_INST* inst = *it;
bool nonTypeWFound = false, nonTypeFFound = false, incompatibleTypeFound = false;
for( int i = 0; i < G4_Inst_Table[inst->opcode()].n_srcs; i++ )
{
G4_Operand *src = inst->getSrc(i);
if( !src || !(IS_VTYPE(src->getType())))
{
// Make sure other src operands are of word type only as this is a HW requirement
if( src &&
( src->getType() != Type_W &&
src->getType() != Type_UW ) )
{
nonTypeWFound = true;
}
if( src &&
( src->getType() != Type_F ) )
{
nonTypeFFound = true;
}
continue;
}
G4_Type target_type = Type_W;
if( src->getType() == Type_VF )
{
target_type = Type_F;
}
if( target_type == Type_W && nonTypeWFound == true )
{
// non-word type src is not allowed to co-exist with :v src
incompatibleTypeFound = true;
}
else if( target_type == Type_F && nonTypeFFound == true )
{
// non-float type src is not allowed to co-exist with :vf src
incompatibleTypeFound = true;
}
// Insert a move only if immediate operand is not
// last src operand
if( i != G4_Inst_Table[inst->opcode()].n_srcs - 1 ||
incompatibleTypeFound == true )
{
inst->setSrc( insertMovBefore( it, i, target_type, bb), i );
}
}
}
/*
 * fixMathInst() checks the following:
 * The math instruction can only use GRF registers as source(s) and destination.
 * The math instruction does not support indirect addressing modes.
 * When Align1 mode is used, source horizontal stride must be 1 (with the exception of scalar sources), and destination horizontal stride must always be 1.
 * Source and destination offsets must be the same, except in the case of a scalar source.
 * DW and UD are the only source formats supported for INT DIV; float32 is the only source format supported for all the other functions.
 * Mixed DW and UD sources are not allowed for the INT DIV function.
 * For a single source math function, <src1> must be programmed as the ARF-NULL register.
 * The FDIV function is not supported in ALT_MODE.
 */
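// Illustrative example (hypothetical operands): since INT DIV supports only
// DW/UD sources, a :w source gets a widening mov:
//   math.intdiv (8) r10.0<1>:d r20.0<8;8,1>:w r30.0<8;8,1>:d
// -->
//   mov (8) TMP(0,0)<1>:d r20.0<8;8,1>:w
//   math.intdiv (8) r10.0<1>:d TMP(0,0)<8;8,1>:d r30.0<8;8,1>:d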
bool HWConformity::fixMathInst(INST_LIST_ITER it, G4_BB *bb)
{
G4_INST* inst = *it;
G4_DstRegRegion *dst = inst->getDst();
G4_Operand *src0 = inst->getSrc(0), *src1 = inst->getSrc(1);
bool mov_dst = false;
MUST_BE_TRUE(inst->isMath(), "Expect math instruction");
if (inst->asMathInst()->getMathCtrl() == MATH_INVM ||
inst->asMathInst()->getMathCtrl() == MATH_RSQRTM)
{
return false;
}
// SKIP mixed mode instructions which are already handled by fixMixedHFInst.
// covers MATH_INT_DIV, MATH_INT_DIV_QUOT, MATH_INT_DIV_REM
bool isIntDivide = inst->asMathInst()->isMathIntDiv();
bool hasSameOffset = hasSameSubregOffset(inst);
// check if the source needs a move and if so the new move type
auto needsMove = [this, inst, isIntDivide, hasSameOffset](int srcID, G4_Type& newType)
{
assert((srcID == 0 || srcID == 1) && "math can have at most two sources");
G4_Operand* src = inst->getSrc(srcID);
newType = src->getType();
if (isIntDivide)
{
G4_Type divType = IS_UNSIGNED_INT(inst->getSrc(0)->getType()) && IS_UNSIGNED_INT(inst->getSrc(1)->getType()) ?
Type_UD : Type_D;
if (newType != divType)
{
newType = divType;
return true;
}
}
else if ((src->getType() != Type_F && src->getType() != Type_VF) &&
(getGenxPlatform() == GENX_BDW || src->getType() != Type_HF))
{
// CHV+ supports F/HF math, while BDW only supports F math
// mix mode math is handled in fixMixedHFInst()
newType = Type_F;
return true;
}
if (src->isImm())
{
if (srcID == 0 && inst->asMathInst()->getMathCtrl() >= MATH_FDIV)
{
return true;
}
}
else if (src->isSrcRegRegion())
{
G4_SrcRegRegion *srcRegion = src->asSrcRegRegion();
RegionDesc *rd = srcRegion->getRegion();
if (srcRegion->getModifier() != Mod_src_undef && isIntDivide)
{
// no source modifier for int divide
return true;
}
else if (srcRegion->getRegAccess() != Direct)
{
return true;
}
else if (!srcRegion->isScalar())
{
if (!hasSameOffset && !builder.isOpndAligned(srcRegion, GENX_GRF_REG_SIZ))
{
return true;
}
else if (!rd->isContiguous(inst->getExecSize()))
{
return true;
}
}
}
else
{
ASSERT_USER(false, "Unexpected math source!");
}
return false;
};
if (src0)
{
G4_Type src0_type = src0->getType();
bool needsSrc0Mov = needsMove(0, src0_type);
if (needsSrc0Mov)
{
inst->setSrc(insertMovBefore(it, 0, src0->isImm() ? getNonVectorType(src0_type) : src0_type, bb), 0);
src0 = inst->getSrc(0);
}
}
bool nullSrc1 = src1 && src1->isNullReg();
if (!nullSrc1 && src1)
{
G4_Type src1_type = src1->getType();
bool needsSrc1Move = needsMove(1, src1_type);
if (needsSrc1Move)
{
if (isIntDivide && src1->isImm() && !IS_VINTTYPE(src1->getType()))
{
// just change the immediate's type
uint32_t immVal = (uint32_t)src1->asImm()->getImm();
inst->setSrc(builder.createImm(immVal, src1_type), 1);
}
else
{
inst->setSrc(insertMovBefore(it, 1, src1->isImm() ? getNonVectorType(src1_type) : src1_type, bb), 1);
}
src1 = inst->getSrc(1);
}
}
if (nullSrc1 && src0 && src1->getType() != src0->getType())
{
G4_SrcRegRegion *src1_opnd = builder.createNullSrc(inst->getSrc(0)->getType());
inst->setSrc(src1_opnd, 1);
}
// recompute as src0 and src1 may have been modified
hasSameOffset = hasSameSubregOffset(inst);
G4_Type extype = inst->getExecType2();
bool cond1 = (dst->getType() != extype && !(dst->getType() == Type_UD && extype == Type_D));
if (dst->getRegAccess() != Direct || dst->getHorzStride() != 1 || cond1 ||
(!hasSameOffset && inst->getExecSize() != 1 && !builder.isOpndAligned(dst, GENX_GRF_REG_SIZ)))
{
mov_dst = true;
G4_DstRegRegion *new_dst = insertMovAfter(it, dst, extype, bb);
inst->setDest(new_dst);
}
return mov_dst;
}
// find a common (integer) type for constant folding. The rules are:
// -- both types must be int
// -- Q and UQ are not folded
// -- UD if one of the types is UD
// -- D otherwise
//
// returns Type_UNDEF if no appropriate type can be found
//
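// e.g., (Type_D, Type_W) -> Type_D, (Type_UD, Type_W) -> Type_UD,
// and (Type_Q, Type_D) -> Type_UNDEF since 64-bit types are not folded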
static G4_Type findConstFoldCommonType( G4_Type type1, G4_Type type2 )
{
if (IS_TYPE_INT(type1) && IS_TYPE_INT(type2))
{
if (G4_Type_Table[type1].byteSize == 8 || G4_Type_Table[type2].byteSize == 8)
{
return Type_UNDEF;
}
if (type1 == Type_UD || type2 == Type_UD)
{
return Type_UD;
}
else
{
return Type_D;
}
}
return Type_UNDEF;
}
//
// returns true if all sources and dst in this inst have the same fixed subreg offset
// null src/dst, scalar sources and immediates are excluded from the check
//
bool HWConformity::hasSameSubregOffset(G4_INST* inst) const
{
bool anyOffset = true; // true means offset is not fixed yet
uint32_t byteOffset = 0;
if (inst->getDst())
{
G4_DstRegRegion* dst = inst->getDst();
if (dst->isNullReg())
{
// do nothing
}
else if (dst->hasFixedSubregOffset(byteOffset))
{
anyOffset = false;
}
else
{
return false;
}
}
for (int i = 0; i < inst->getNumSrc(); ++i)
{
G4_Operand* src = inst->getSrc(i);
if (src->isSrcRegRegion())
{
uint32_t srcOffset = 0;
G4_SrcRegRegion* srcRegion = src->asSrcRegRegion();
if (srcRegion->isNullReg() || srcRegion->isScalar())
{
continue;
}
else if (srcRegion->hasFixedSubregOffset(srcOffset))
{
if (anyOffset)
{
byteOffset = srcOffset;
anyOffset = false;
}
else if (srcOffset != byteOffset)
{
return false;
}
}
else
{
return false;
}
}
}
return true;
}
// Check the following rules
// -- src0 in 2-source instructions may not be immediate. We try to swap src0 and src1 for
//    commutative instructions in such cases
// -- ARF may not be in src1
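// Illustrative example (hypothetical operands): since src0 of a two-source
// instruction may not be immediate, a commutative op gets its sources swapped:
//   add (8) r10.0<1>:d 0x4:d r20.0<8;8,1>:d
// -->
//   add (8) r10.0<1>:d r20.0<8;8,1>:d 0x4:d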
void HWConformity::fixImmAndARFSrc(INST_LIST_ITER it, G4_BB *bb)
{
G4_INST* inst = *it;
if (inst->isSend())
{
return;
}
G4_Operand *src0, *src1, *src2;
src0 = inst->getSrc(0);
src1 = inst->getSrc(1);
src2 = inst->getSrc(2);
/* Check for usage of two constants in binary operations */
if (src0 != NULL && (src0->isImm() || src0->isAddrExp()) && G4_Inst_Table[inst->opcode()].n_srcs == 2)
{
if (INST_COMMUTATIVE(inst->opcode()) && !src1->isImm())
{
//all commutative inst must have 2 sources
if (inst->opcode() == G4_mul)
{
bool needConstMov;
//for DW and W mul, src0 must be DW and src1 W
needConstMov = IS_DTYPE(src0->getType()) && !IS_DTYPE(src1->getType());
if (needConstMov)
{
G4_Type tmpType = getNonVectorType(src0->getType());
G4_Operand* newSrc0 = insertMovBefore(it, 0, tmpType, bb);
inst->setSrc(newSrc0, 0);
}
else
{
// swap operands
inst->setSrc(src1, 0);
inst->setSrc(src0, 1);
inst->swapDefUse();
}
}
else
{
// swap operands
inst->setSrc(src1, 0);
inst->setSrc(src0, 1);
inst->swapDefUse();
}
}
/*
* A select operation isn't commutative, but we may commute the
* operands provided we perform a predicate inversion as well.
* (v0) sel ... const V1
* =>
* (-v0) sel ... V1 const
*/
else if (inst->opcode() == G4_sel && !src1->isImm())
{
bool SwapOpnd = false;
G4_CondMod *cond = inst->getCondMod();
if (cond != NULL)
{
switch (cond->getMod())
{
case Mod_ne:
{
inst->setCondMod(builder.createCondMod(Mod_e, cond->getBase(), 0));
SwapOpnd = true;
break;
}
case Mod_e:
{
inst->setCondMod(builder.createCondMod(Mod_ne, cond->getBase(), 0));
SwapOpnd = true;
break;
}
default:
break; // Prevent gcc warning
}
}
else
{
G4_Predicate* pred = inst->getPredicate();
MUST_BE_TRUE(pred != NULL, "predicate must not be null");
G4_PredState reverse = pred->getState() == PredState_Minus ? PredState_Plus : PredState_Minus;
inst->setPredicate(builder.createPredicate(
reverse, pred->getBase(), pred->getSubRegOff(), pred->getControl()));
SwapOpnd = true;
}
if (SwapOpnd)
{
inst->setSrc(src1, 0);
inst->setSrc(src0, 1);
inst->swapDefUse();
}
else
{
G4_Type tmpType = getNonVectorType(src0->getType());
G4_Operand* newSrc0 = insertMovBefore(it, 0, tmpType, bb);
inst->setSrc(newSrc0, 0);
}
}
else if (!inst->isMath())
{
// math immediate src0 is handled separately in fixMathInst()
if ((inst->opcode() == G4_add || inst->opcode() == G4_mul) &&
src0->isImm() && src1->isImm() &&
IS_TYPE_INT(src0->getType()) && IS_TYPE_INT(src1->getType()) &&
inst->getSaturate() == false)
{
// FIXME: this is duplicating the functionality of Optimizer::doConsFolding.
G4_Type src0T = src0->getType(), src1T = src1->getType(), resultType = src0T;
resultType = findConstFoldCommonType(src0T, src1T);
if (resultType != Type_UNDEF)
{
G4_Imm *newSrc = NULL;
int64_t res = inst->opcode() == G4_add ?
((int64_t)(src0->asImm()->getInt()) + (int64_t)(src1->asImm()->getInt())) :
((int64_t)(src0->asImm()->getInt()) * (int64_t)(src1->asImm()->getInt()));
// don't fold if the value overflows D/UD
if (G4_Imm::isInTypeRange(res, resultType))
{
newSrc = builder.createImmWithLowerType(res, resultType);
// change instruction into a MOV
inst->setOpcode(G4_mov);
inst->setSrc(newSrc, 0);
inst->setSrc(NULL, 1);
return;
}
}
}
// If src0 is not 64-bit, src1 is 64-bit, swap them to save one move.
if (INST_COMMUTATIVE(inst->opcode()) && src0->isImm() && src1->isImm() &&
G4_Type_Table[src0->getType()].byteSize != 8 &&
G4_Type_Table[src1->getType()].byteSize == 8)
{
inst->setSrc(src1, 0);
inst->setSrc(src0, 1);
inst->swapDefUse();
src0 = inst->getSrc(0);
src1 = inst->getSrc(1);
}
if (INST_COMMUTATIVE(inst->opcode()) && src0->isAddrExp() && src1->isImm())
{
// The original IR has both addr expr and immediate
// add(8) A0(0, 0)<1>:uw &V36 + 0 0xeca86420 : uv{ Align1, Q1 }
// We insert a move for src1 which is an immediate
// mov(8) TV0(0, 0)<1> : uw 0xeca86420 : uv{ Align1 }
// add(8) A0(0, 0)<1> : uw &V36 + 0 TV0(0, 0)<8; 8, 1> : uw{ Align1, Q1 }
G4_Type type = src1->getType();
inst->setSrc(insertMovBefore(it, 1, getNonVectorType(type), bb), 1);
// And we swap addr expr and the new variable
// add(8) A0(0, 0)<1> : uw TV0(0, 0)<8; 8, 1> : uw &V36 + 0 {Align1, Q1}
// The final code sequence is
// mov(8) r13.0<1>:uw 0xeca86420 : uv{ Align1 } // #26:$9:%79
// add(8) a0.0<1> : uw r13.0<8; 8, 1> : uw 0x60 : uw{ Align1, Q1 }
inst->setSrc(inst->getSrc(1), 0);
inst->setSrc(src0, 1);
inst->swapDefUse();
}
else
{
G4_Type newSrcType = inst->needsDWType() ? (IS_UNSIGNED_INT(src0->getType()) ? Type_UD : Type_D) : src0->getType();
inst->setSrc(insertMovBefore(it, 0, newSrcType, bb), 0);
}
}
}
src0 = inst->getSrc(0);
src1 = inst->getSrc(1);
src2 = inst->getSrc(2);
// check for non-mad 3src inst
if (G4_Inst_Table[inst->opcode()].n_srcs == 3 && src1->isImm())
{
inst->setSrc(insertMovBefore(it, 1, INST_FLOAT_SRC_ONLY(inst->opcode()) ? Type_F : src1->getType(), bb), 1);
}
// Architecture registers may not appear as src1.
auto isARF = [](G4_Operand* opnd) { return opnd->isAreg() || opnd->isFlag(); };
if (src1 != nullptr && isARF(src1) && !src1->isNullReg())
{
/* See if we can swap the src1 */
if (INST_COMMUTATIVE(inst->opcode()) && !isARF(src0))
{
inst->setSrc(src1, 0);
inst->setSrc(src0, 1);
inst->swapDefUse();
}
/* Otherwise introduce a tmp */
inst->setSrc(insertMovBefore(it, 1, INST_FLOAT_SRC_ONLY(inst->opcode()) ? Type_F : src1->getType(), bb), 1);
}
src2 = inst->getSrc(2);
/* 3 src instructions can't have any constants */
if (!builder.hasAlign1Ternary() && src2 != nullptr && src2->isImm())
{
inst->setSrc(insertMovBefore(it, 2, src2->getType(), bb), 2);
}
}
bool HWConformity::fixLine(INST_LIST_ITER it, G4_BB *bb)
{
G4_INST* inst = *it;
if (inst->opcode() == G4_line)
{
bool badRegion = false;
G4_Operand* src0 = inst->getSrc(0);
// assumption: there are 4 elements in src0
if (src0->isSrcRegRegion())
{
RegionDesc *rd = src0->asSrcRegRegion()->getRegion();
badRegion = (rd->vertStride != 0 || rd->width != 4 || rd->horzStride != 1);
}
if (!IS_FTYPE(src0->getType()) || src0->isImm() || badRegion ||
!builder.isOpndAligned(src0, G4_GRF_REG_NBYTES / 2))
{
// insertMovBefore() is not used here
// due to the special region <0;4,1> of src0 of line
G4_Declare *src0_dcl;
G4_DstRegRegion *new_dst_opnd;
G4_SrcRegRegion *new_src0_opnd;
unsigned char mov_size = 4;
src0_dcl = builder.createTempVar(mov_size, Type_F, Either, Eight_Word);
/* Create temporary variable */
// Actually, we set the region to <0;4,1> directly here.
RegionDesc *rd = builder.createRegionDesc(0, 4, 1);
new_src0_opnd = builder.Create_Src_Opnd_From_Dcl(src0_dcl, rd);
new_dst_opnd = builder.Create_Dst_Opnd_From_Dcl(src0_dcl, 1);
G4_INST* newInst = builder.createInternalInst(NULL, G4_mov, NULL, false,
mov_size, new_dst_opnd, src0, NULL, InstOpt_NoOpt, inst->getLineNo(),
inst->getCISAOff(), inst->getSrcFilename());
if (bb->isInSimdFlow())
{
newInst->setOptions((newInst->getOption() & ~InstOpt_Masks) | InstOpt_WriteEnable);
}
bb->instList.insert(it, newInst);
inst->setSrc(new_src0_opnd, 0);
return true;
}
}
return false;
}
bool HWConformity::fixOpndType(INST_LIST_ITER it, G4_BB *bb)
{
/*
 * Check for instructions that only accept float/int operands, as well as
 * instructions with mixed operand types. Even though CISA itself forbids
 * mixed-type instructions, optimizations such as copy propagation
 * may reintroduce them, so we do the checks here
 */
G4_INST* inst = *it;
bool changed = false;
int numSrc = inst->getNumSrc();
bool has_float = false;
bool has_int = false;
if (inst->isSend())
{
return false;
}
for (int i = 0; i < numSrc; i++)
{
if (!inst->getSrc(i))
{
continue;
}
if (IS_FTYPE(inst->getSrc(i)->getType()) ||
IS_VFTYPE(inst->getSrc(i)->getType()))
{
has_float = true;
}
else if (!IS_DFTYPE(inst->getSrc(i)->getType()) && !IS_HFTYPE(inst->getSrc(i)->getType()))
{
has_int = true;
}
}
if (has_float && has_int)
{
for (int i = 0; i < numSrc; i++)
{
if (inst->getSrc(i) && !IS_FTYPE(inst->getSrc(i)->getType()) && !IS_DFTYPE(inst->getSrc(i)->getType()))
{
if (!((inst->opcode() == G4_smov) && (i == 1)))
{
inst->setSrc(insertMovBefore(it, i, Type_F, bb), i);
changed = true;
}
}
}
}
if (builder.noSrc1Byte())
{
if (numSrc > 1)
{
G4_Operand* src0 = inst->getSrc(0);
G4_Operand* src1 = inst->getSrc(1);
if (src0 != nullptr && src1 != nullptr && IS_BTYPE(src1->getType()))
{
if (!IS_BTYPE(src0->getType()) && inst->canSwapSource())
{
inst->setSrc(src1, 0);
inst->setSrc(src0, 1);
}
else
{
inst->setSrc(insertMovBefore(it, 1, Type_W, bb), 1);
changed = true;
}
}
}
}
return changed;
}
/*
 * fixOpnds() checks operand conformity:
 * 1. checks whether an operand may be a constant.
 * 2. checks whether an operand's type conforms to the operation.
 * 3. checks that only src0 uses VxH.
 * 4. checks whether an indirect scalar is used in a compressed inst.
 * It tries to fix these cases by changing the operand order if possible,
 * or by inserting a temporary location with the appropriate conversion.
 */
void HWConformity::fixOpnds( INST_LIST_ITER it, G4_BB *bb, G4_Type& exType )
{
G4_INST* inst = *it;
if (inst->isSend())
{
return;
}
G4_Operand *src0, *src1, *src2;
src0 = inst->getSrc(0);
src1 = inst->getSrc(1);
src2 = inst->getSrc(2);
if( inst->opcode() == G4_mul )
{
if (IS_DTYPE(src1->getType()) &&
!(IS_DTYPE(src0->getType()) || IS_FTYPE(src0->getType())))
{
// check if src0 uses VxH
bool src0_use_VxH = false;
if( src0->isSrcRegRegion() && src0->asSrcRegRegion()->getRegAccess() != Direct &&
src0->asSrcRegRegion()->getRegion()->isRegionWH() ) // is this safe?
{
src0_use_VxH = true;
}
if( src0_use_VxH )
{
src0 = insertMovBefore( it, 0, src0->getType(), bb );
}
inst->setSrc( src1, 0 );
inst->setSrc( src0, 1 );
inst->swapDefUse();
src0 = inst->getSrc(0);
src1 = inst->getSrc(1);
}
if( src1->isSrcRegRegion() && src1->asSrcRegRegion()->getRegAccess() != Direct &&
src1->asSrcRegRegion()->getRegion()->isRegionWH() )
{
if (IS_DTYPE(src0->getType()) &&
!(IS_DTYPE(src1->getType()) || IS_FTYPE(src1->getType()) ) )
{
inst->setSrc( insertMovBefore( it, 1, src1->getType(), bb ), 1 );
}
else
{
inst->setSrc( src1, 0 );
inst->setSrc( src0, 1 );
inst->swapDefUse();
}
src0 = inst->getSrc(0);
src1 = inst->getSrc(1);
}
}
fixImmAndARFSrc(it, bb);
src0 = inst->getSrc(0);
src1 = inst->getSrc(1);
src2 = inst->getSrc(2);
// Vx1 and VxH can only be used for src0
bool src0_use_VxH = false, src1_use_VxH = false;
if( src2 &&
src2->isSrcRegRegion() &&
src2->asSrcRegRegion()->getRegion()->isRegionWH() ){
inst->setSrc(insertMovBefore(it, 2, exType, bb), 2);
}
if( src0 != NULL &&
src0->isSrcRegRegion() &&
src0->asSrcRegRegion()->getRegion()->isRegionWH() ){
src0_use_VxH = true;
}
if( src1 != NULL && !( inst->isMath() && src1->isNullReg() ) &&
src1->isSrcRegRegion() &&
src1->asSrcRegRegion()->getRegion()->isRegionWH() ){
src1_use_VxH = true;
}
if( src1_use_VxH )
{
if( ( INST_COMMUTATIVE(inst->opcode()) || inst->opcode() == G4_cmp )
&& !src0_use_VxH &&
! ( inst->opcode() == G4_mul &&
( IS_DTYPE( src0->getType() ) ) ) )
{
inst->setSrc( src1, 0 );
inst->setSrc( src0, 1 );
if( inst->opcode() == G4_cmp )
{
// change condMod
G4_CondMod *condMod = inst->getCondMod();
if( condMod )
{
G4_CondMod *newCondModOpnd = builder.createCondMod(
getReverseCondMod(condMod->getMod()), condMod->getBase(), condMod->getSubRegOff());
inst->setCondMod( newCondModOpnd );
}
}
}
else
{
inst->setSrc(insertMovBefore(it, 1, exType, bb), 1);
}
}
if( inst->isComprInst() )
{
// check if there is indirect scalar or repeat region
for( int i = 0; i < G4_Inst_Table[inst->opcode()].n_srcs; i++ )
{
G4_Operand *src = inst->getSrc(i);
if( src && src->isSrcRegRegion() &&
src->asSrcRegRegion()->getRegAccess() != Direct &&
( src->asSrcRegRegion()->isScalar() || src->asSrcRegRegion()->getRegion()->isRepeatRegion( inst->getExecSize() ) ) )
{
inst->setSrc(insertMovBefore(it, i, src->getType(), bb), i);
}
}
}
}
void HWConformity::fixAlign13SrcInst(INST_LIST_ITER iter, G4_BB* bb)
{
// again mad should already conform by construction
G4_INST* inst = *iter;
MUST_BE_TRUE(inst->getNumSrc() == 3 && !inst->isSend(), "expect 3src inst");
if (inst->opcode() != G4_mad)
{
G4_DstRegRegion* dst = inst->getDst();
if (!isGoodAlign1TernaryDst(inst))
{
auto alignment = builder.noSrc2Regioning() ? Sixteen_Word : Four_Word;
G4_DstRegRegion* tmpDst = insertMovAfter(iter, dst, dst->getType(), bb, alignment);
inst->setDest(tmpDst);
}
bool canBeImm = true;
for (int i = 0; i < inst->getNumSrc(); ++i)
{
if (!isGoodAlign1TernarySrc(inst, i, canBeImm))
{
G4_SubReg_Align subalign = Any;
if (i == 2)
{
// when there's no src2 regioning,
// src2 has to have the same offset as dst; we enforce it by making it GRF-aligned
subalign = builder.noSrc2Regioning() ? Sixteen_Word : Four_Word;
}
inst->setSrc(insertMovBefore(iter, i, inst->getSrc(i)->getType(), bb, subalign), i);
}
else
{
if (inst->getSrc(i)->isImm())
{
canBeImm = false;
}
}
}
}
}
void HWConformity::fix3SrcInst(INST_LIST_ITER iter, G4_BB* bb)
{
G4_INST* inst = *iter;
if (inst->getNumSrc() != 3 || inst->isSend() || inst->opcode() == G4_madm)
{
return;
}
if (builder.hasAlign1Ternary())
{
fixAlign13SrcInst(iter, bb);
return;
}
if (inst->opcode() != G4_mad)
{
// check that dst and srcs are legal for 3src. We do not check
// mad since they should already conform by construction
uint8_t execSize = inst->getExecSize();
G4_DstRegRegion* dst = inst->getDst();
if (dst->getRegAccess() != Direct || dst->getHorzStride() != 1 ||
!builder.isOpndAligned(dst, (execSize >= 8) ? 32 : execSize * 4))
{
G4_DstRegRegion* tmpDst = insertMovAfter(iter, dst, dst->getType(), bb);
inst->setDest(tmpDst);
}
for (int i = 0; i < 3; i++)
{
if (!isGoodAlign16Src(inst, i))
{
inst->setSrc(
insertMovBefore(iter, i, inst->getSrc(i)->getType(), bb),
i);
}
}
}
// When it is set (Align16), the instruction uses 16-byte-aligned addressing for source and destination operands.
if ((inst->getExecSize() == 1))
{
if (inst->getDst() &&
inst->getDst()->getBase()->isRegVar())
{
if (!builder.isOpndAligned(inst->getDst(), 16))
{
G4_DstRegRegion *new_dst = insertMovAfter(iter, inst->getDst(), inst->getDst()->getType(), bb);
G4_Declare* tmpDstDcl = new_dst->getTopDcl();
tmpDstDcl->setSubRegAlign(Eight_Word);
inst->setDest( new_dst );
}
}
}
if (inst->getExecSize() == 16)
{
/*
 According to Krishna/Narsim, the WA only applies if the instruction is not contained in one GRF
 */
bool wa3rc = (VISA_WA_CHECK(builder.getPWaTable(), WaDisableSIMD16On3SrcInstr) &&
!(inst->getExecType() == Type_HF &&
inst->getOperand(Opnd_src1)->isSrcRegRegion() &&
inst->getOperand(Opnd_src1)->getType() == Type_HF &&
!inst->getOperand(Opnd_src1)->asSrcRegRegion()->crossGRF()));
if (wa3rc)
{
evenlySplitInst(iter, bb);
}
}
}
// returns true if src is a packed word region
bool HWConformity::isPackedWord( G4_Operand *src )
{
if( !src || src->isSrcRegRegion() == false ||
src->asSrcRegRegion()->getBase()->isNullReg() ){
return false;
}
RegionDesc *rd = src->asSrcRegRegion()->getRegion();
if( src->asSrcRegRegion()->getRegAccess() != Direct ||
( src->asSrcRegRegion()->getType() != Type_W && src->asSrcRegRegion()->getType() != Type_UW ) ||
src->asSrcRegRegion()->getSubRegOff() != 0 ||
rd->horzStride != 1 ||
( !( rd->width == 8 && rd->vertStride == 8 ) &&
!( rd->width == 16 && rd->vertStride == 16 ) ) ){
return false;
}
return true;
}
void HWConformity::fixCompareInst(
INST_LIST_ITER i,
G4_BB *bb,
G4_Type exType,
int dst_elsize )
{
G4_INST *inst = *i;
G4_Operand *dst = inst->getDst();
if( dst && dst->isNullReg() )
{
// change dst hstride if necessary
if( G4_Type_Table[exType].byteSize > G4_Type_Table[dst->getType()].byteSize )
{
// create a new dst with new stride
G4_DstRegRegion *new_null = builder.createNullDst( exType );
inst->setDest( new_null );
}
return;
}
}
// For integer packing moves, we can replace the src type with the dst type instead of inserting
// a new move to satisfy dst alignment, since integer down conversion is based on truncation.
// An inst has to satisfy the following properties:
// -- is a move (duh) and does not have conditional modifiers or saturation
// -- dst must be a direct DstRegRegion that is GRF-aligned
// -- src must be a direct SrcRegRegion with GRF base, no modifiers, and packed/scalar region
// -- both dst and src have integer type, with source stride > dst stride
// returns true if we have successfully down-cast the src type
static bool canReplaceMovSrcType(IR_Builder& builder, G4_INST* inst, uint32_t extypesize)
{
if (inst->opcode() != G4_mov || inst->getCondMod() != NULL || inst->getSaturate())
{
return false;
}
if (!inst->getSrc(0)->isSrcRegRegion())
{
return false;
}
G4_DstRegRegion* dst = inst->getDst();
G4_SrcRegRegion* src0 = inst->getSrc(0)->asSrcRegRegion();
int dstByteOffset = dst->getByteOffset();
if (dstByteOffset % extypesize != 0 ||
dst->getRegAccess() != Direct)
{
// don't do this if dst is not GRF aligned, since we have to fix it later anyway
return false;
}
if (src0->getRegAccess() != Direct || src0->getModifier() != Mod_src_undef ||
(src0->getTopDcl() == NULL || src0->getTopDcl()->getRegFile() != G4_GRF))
{
return false;
}
bool isIntPackingMove = false;
if (IS_TYPE_INT(dst->getType()) && IS_TYPE_INT(src0->getType()))
{
uint32_t dstAlign = G4_Type_Table[dst->getType()].byteSize * dst->getHorzStride();
if (dstAlign < G4_Type_Table[src0->getType()].byteSize)
{
isIntPackingMove = true;
}
}
if (!isIntPackingMove)
{
return false;
}
// we only handle direct contiguous and scalar source region for now,
// as VxH and strided regions are a bit harder to update
if (src0->getRegion()->isContiguous(inst->getExecSize()))
{
uint16_t newHS = extypesize / G4_Type_Table[dst->getType()].byteSize;
if (newHS > 4)
{
// rule out Q -> B moves if Q is not scalar
return false;
}
}
else if (!src0->isScalar())
{
// only handle scalar and contiguous regions for now
return false;
}
// instead of inserting a move, we change src's type to be same as dst type
// e.g.,
// mov (8) r1.0<1>:b r2.4<8;8,1>:d
// becomes
// mov (8) r1.0<1>:b r2.16<32;8,4>:b
// This is safe since integer down conversion is based on truncation
uint32_t typeSizeRatio = extypesize / G4_Type_Table[dst->getType()].byteSize;
uint32_t numElt = src0->isScalar() ? 1 : inst->getExecSize() * typeSizeRatio;
G4_Declare* newDcl = builder.createTempVar(numElt, dst->getType(), Either, Any);
newDcl->setAliasDeclare(src0->getBase()->asRegVar()->getDeclare(), 0);
RegionDesc* region = src0->isScalar() ? builder.getRegionScalar() :
builder.createRegionDesc((uint16_t)inst->getExecSize() * typeSizeRatio,
inst->getExecSize(),
(uint16_t)typeSizeRatio);
G4_SrcRegRegion* newSrc = builder.createSrcRegRegion(
Mod_src_undef,
Direct,
newDcl->getRegVar(),
src0->getRegOff(),
src0->getSubRegOff() * typeSizeRatio,
region,
dst->getType());
inst->setSrc(newSrc, 0);
return true;
}
// implement HW restrictions on mov
// -- There is no direct conversion from B/UB to DF or DF to B/UB.
// Use two instructions and a word or DWord intermediate type.
// -- There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB.
// Use two instructions and a word or DWord intermediate integer type.
// -- There is no direct conversion from HF to DF or DF to HF.
// Use two instructions and F (Float) as an intermediate type.
// -- There is no direct conversion from HF to Q/UQ or Q/UQ to HF.
// Use two instructions and F (Float) or a word integer type or a DWord integer type as an intermediate type.
// returns true if a move is inserted
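// Illustrative example (hypothetical operands and strides): a B-to-DF mov is
// split in two with a word intermediate:
//   mov (8) r10.0<1>:df r20.0<8;8,1>:b
// -->
//   mov (8) TMP(0,0)<4>:w r20.0<8;8,1>:b
//   mov (8) r10.0<1>:df TMP(0,0)<4;1,0>:w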
bool HWConformity::fixMov(INST_LIST_ITER i, G4_BB* bb)
{
G4_INST* inst = *i;
if (inst->opcode() != G4_mov)
{
return false;
}
G4_Type dstType = inst->getDst()->getType();
G4_Type srcType = inst->getSrc(0)->getType();
if (IS_BTYPE(dstType) && (IS_DFTYPE(srcType) || IS_QTYPE(srcType)))
{
// mov B Q/DF
inst->setDest(insertMovAfter(i, inst->getDst(), Type_W, bb));
return true;
}
else if (IS_BTYPE(srcType) && (IS_DFTYPE(dstType) || IS_QTYPE(dstType)))
{
// mov Q/DF B
inst->setDest(insertMovAfter(i, inst->getDst(), Type_W, bb));
return true;
}
else if (IS_HFTYPE(dstType) && (IS_DFTYPE(srcType) || IS_QTYPE(srcType)))
{
// mov HF Q/DF
inst->setDest(insertMovAfter(i, inst->getDst(), Type_F, bb));
return true;
}
else if (IS_HFTYPE(srcType) && (IS_DFTYPE(dstType) || IS_QTYPE(dstType)))
{
// mov Q/DF HF
inst->setDest(insertMovAfter(i, inst->getDst(), Type_F, bb));
return true;
}
return false;
}
bool HWConformity::fixDstAlignment( INST_LIST_ITER i, G4_BB* bb, G4_Type extype, unsigned int dst_elsize )
{
G4_INST *inst = *i;
bool insertMOV = false;
unsigned char exec_size = inst->getExecSize();
G4_DstRegRegion *dst = inst->getDst();
G4_Operand *src0 = inst->getSrc(0);
unsigned h_stride = dst->getHorzStride();
unsigned int extypesize = G4_Type_Table[extype].byteSize;
if (inst->hasNULLDst())
{
if (dst_elsize * h_stride < extypesize)
{
uint16_t newHStride = extypesize / dst_elsize;
if (newHStride == 8)
{
// dst is a null byte, which can be produced by logical optimization;
// we change the type to W here. This should be safe since the conditional modifier
// is either .ez or .nz
MUST_BE_TRUE(dst_elsize == 1, "expect B/UB dst");
dst->setType(dst->getType() == Type_B ? Type_W : Type_UW);
dst->setHorzStride(4);
}
else
{
MUST_BE_TRUE(newHStride <= 4, "horizontal stride must be <=4");
dst->setHorzStride(newHStride);
}
}
return insertMOV;
}
// optimize initialization instructions
if( inst->opcode() == G4_mov && src0->isImm() &&
( !bb->isInSimdFlow() || inst->isWriteEnableInst() ) &&
!inst->getPredicate() &&
dst->getRegAccess() == Direct &&
dst->getHorzStride() == 1 &&
inst->getSaturate() == false &&
IS_BTYPE(dst->getType()) &&
!IS_TYPE_F32_F64(src0->getType()) &&
builder.isOpndAligned( dst, getTypeSize(src0->getType()) ) )
{
// inst is a mov with packed byte dst and int imm source
int64_t value = src0->asImm()->getInt();
uint64_t new_value = ( value & 0xFF ) | ( value << 0x8 );
int scale = 2;
if (IS_DTYPE(src0->getType()))
{
scale = 4;
new_value = ( new_value & 0xFFFF ) | ( new_value << 0x10 );
}
if (exec_size >= scale)
{
G4_Type new_type = ( scale == 2 ) ? Type_UW : Type_UD;
dst->setHorzStride( 1 );
dst->setSubRegOff( (short) (dst->getSubRegOff() / scale) );
dst->setType( new_type );
inst->setSrc( builder.createImm( new_value, new_type ), 0 );
inst->setExecSize( (unsigned char) (exec_size / scale) );
return insertMOV;
}
}
bool byteDst = IS_BTYPE(dst->getType());
// Byte cannot be used as the destination of INT*INT
if ((byteDst && inst->opcode() == G4_mul &&
IS_TYPE_INT(inst->getSrc(0)->getType()) && IS_TYPE_INT(inst->getSrc(1)->getType())))
{
// change dst type to W
inst->setDest( insertMovAfter( i, dst, Type_W, bb ) );
return true;
}
if (byteDst && extypesize == 8)
{
// Gen doesn't support hstride 8, so we add a W move here
inst->setDest(insertMovAfter(i, dst, Type_W, bb));
return true;
}
bool dstHFMixModeInst = inst->getDst()->getType() == Type_HF && extype == Type_F;
bool dstNotAlignedToExecType = exec_size > 1 && (dst_elsize * h_stride) < extypesize &&
!(builder.hasMixMode() && dstHFMixModeInst);
unsigned short dst_byte_offset;
builder.isOpndAligned(dst, dst_byte_offset, extypesize);
if (!((dst_byte_offset % extypesize == 0) ||
(byteDst &&
!VISA_WA_CHECK(builder.getPWaTable(), WaByteDstAlignRelaxedRule) &&
(dst_byte_offset % extypesize == 1))
) ||
/*
* Dynamic offset can be odd for serialized instructions
* or when horizontal offset is dynamic.
* Probably we need the same for any dst with dynamic offsets.
*/
( dst_elsize < extypesize &&
dst->getRegAccess() != Direct &&
!( byteDst && extypesize == 2 && exec_size == 1 )
) ||
dstNotAlignedToExecType)
{
/*
 * 10.3
 * For byte dst types:
 * 1. no horizontal stride of 1
 * 2. no odd start subreg
 * The only exception is the raw mov op
 * (raw means the src operand has no attribute).
 *
 * Note: all these cases are now actually controlled
 * by the extypesize value.
 */
if (inst->isRawMov() &&
( dst_byte_offset % extypesize == 0 ||
( byteDst && dst_byte_offset % extypesize == 1 ) ) )
{
return insertMOV;
}
if (canReplaceMovSrcType(builder, inst, extypesize))
{
return false;
}
if (inst->opcode() == G4_mov)
{
bool intHFConversion = false;
G4_Operand* src0 = inst->getSrc(0);
if (IS_HFTYPE(dst->getType()) && IS_TYPE_INT(src0->getType()))
{
intHFConversion = true;
}
else if (IS_HFTYPE(src0->getType()) && IS_TYPE_INT(dst->getType()))
{
intHFConversion = true;
}
// we allow packed destination for F to HF.
if (getGenxPlatform() >= GENX_CHV && !intHFConversion && inst->isMixedMode())
{
return insertMOV;
}
}
if( !VISA_WA_CHECK(builder.getPWaTable(), WaByteDstAlignRelaxedRule) )
{
if( splitInstListForByteDst( i, bb, (uint16_t) extypesize ) )
{
return true;
}
}
inst->setDest(insertMovAfter(i, dst, dst->getType(), bb));
insertMOV = true;
}
return insertMOV;
}
/*
 * This function checks whether the instruction's indirect operands
 * potentially require more than 8 distinct addr reg sub-registers in total, and
 * then determines which of the operands to spill into temporary GRFs so
 * as to limit the total number of distinct sub-registers used by the instruction
 * to 8. This is a requirement imposed by the CM register allocator.
 *
 * NOTES:
 * 1. 3-src instructions do not support indirect operands.
 * 2. SIMD16 is not allowed when indirect operands are present.
 */
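// Illustrative example (hypothetical operands): if the indirect operands
// together would require too many address subregisters, one source is first
// copied into a temp GRF:
//   add (8) r[a0.0]<1>:d r[a0.2]<8;8,1>:d r[a1.0]<8;8,1>:d
// -->
//   mov (8) TMP(0,0)<1>:d r[a1.0]<8;8,1>:d
//   add (8) r[a0.0]<1>:d r[a0.2]<8;8,1>:d TMP(0,0)<8;8,1>:d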
bool HWConformity::fixIndirectOpnd( INST_LIST_ITER i, G4_BB *bb )
{
G4_INST *inst = *i;
G4_Operand *src0 = inst->getSrc(0), *src1 = inst->getSrc(1);
G4_DstRegRegion *dst = inst->getDst();
bool null_dst = ( !dst || inst->hasNULLDst() );
bool null_src0 = !src0;
bool null_src1 = !src1 || ( inst->isMath() && src1->isNullReg() );
const int addr_reg_max_count = 16;
const int addr_reg_size = G4_Type_Table[Type_UW].byteSize;
int src_uniq_count = 0;
int src1_count = 0;
int src0_count = 0;
int dst_uniq_count = 0;
int dst_count = 0;
bool nospill_src1 = false;
bool nospill_src0 = false;
bool nospill_dst = false;
bool spill_src1 = false;
bool spill_src0 = false;
bool spill_dst = false;
G4_Declare *addr_dcl0 = NULL, *addr_dcl1 = NULL, *addr_dcl2 = NULL;
if( !null_src0 && src0->isSrcRegRegion() &&
src0->getRegAccess() != Direct && src0->asSrcRegRegion()->getBase()->isRegVar() ){
addr_dcl0 = src0->asSrcRegRegion()->getBase()->asRegVar()->getDeclare();
while( addr_dcl0->getAliasDeclare() != NULL ){
addr_dcl0 = addr_dcl0->getAliasDeclare();
}
// is the following precise?
src0_count = addr_dcl0->getNumElems() * addr_dcl0->getNumRows() * addr_dcl0->getElemSize() / addr_reg_size;
MUST_BE_TRUE( src0_count <= addr_reg_max_count, "More than 8 address subregisters required for one operand." );
src_uniq_count += src0_count;
}
if( !null_src1 && src1->isSrcRegRegion() &&
src1->getRegAccess() != Direct && src1->asSrcRegRegion()->getBase()->isRegVar() ){
addr_dcl1 = src1->asSrcRegRegion()->getBase()->asRegVar()->getDeclare();
while( addr_dcl1->getAliasDeclare() != NULL ){
addr_dcl1 = addr_dcl1->getAliasDeclare();
}
src1_count = addr_dcl1->getNumElems() * addr_dcl1->getNumRows() * addr_dcl1->getElemSize() / addr_reg_size;
MUST_BE_TRUE( src1_count <= addr_reg_max_count, "More than 8 address subregisters required for one operand." );
if (addr_dcl1 != addr_dcl0) {
// should we use top level dcl here?
src_uniq_count += src1_count;
}
else {
nospill_src1 = true;
nospill_src0 = true;
}
}
if( !null_dst &&
dst->getRegAccess() != Direct && dst->getBase()->isRegVar() )
{
addr_dcl2 = dst->getBase()->asRegVar()->getDeclare();
while( addr_dcl2->getAliasDeclare() != NULL ){
addr_dcl2 = addr_dcl2->getAliasDeclare();
}
dst_count = addr_dcl2->getNumElems() * addr_dcl2->getNumRows() * addr_dcl2->getElemSize() / addr_reg_size;
MUST_BE_TRUE( dst_count <= addr_reg_max_count, "More than 8 address subregisters required for one operand." );
if (addr_dcl2 != addr_dcl0 && addr_dcl2 != addr_dcl1) {
dst_uniq_count += dst_count;
}
else if( addr_dcl2 != addr_dcl0 ){
nospill_dst = true;
nospill_src0 = true;
}else{
nospill_dst = true;
nospill_src1 = true;
}
}
if (src_uniq_count > addr_reg_max_count) {
if (src0_count > src1_count || nospill_src1) {
MUST_BE_TRUE(nospill_src0 == false, "Address of source0 should be spilled." );
spill_src0 = true;
src_uniq_count -= src0_count;
}
else {
MUST_BE_TRUE(nospill_src1 == false, "Address of source1 should be spilled.");
spill_src1 = true;
src_uniq_count -= src1_count;
}
}
if (src_uniq_count + dst_uniq_count > addr_reg_max_count) {
MUST_BE_TRUE(nospill_dst == false, "Address of dst should be spilled." );
if (nospill_src1 && nospill_src0) {
spill_dst = true;
dst_uniq_count = 0;
}
else if (dst_uniq_count > src0_count && dst_uniq_count > src1_count) {
spill_dst = true;
dst_uniq_count = 0;
}
else if (spill_src0 ) {
spill_src1 = true;
src_uniq_count -= src1_count;
}
else if (spill_src1 ) {
spill_src0 = true;
src_uniq_count -= src0_count;
}
else if (src0_count > src1_count) {
spill_src0 = true;
src_uniq_count -= src0_count;
}
else {
spill_src1 = true;
src_uniq_count -= src1_count;
}
}
MUST_BE_TRUE (src_uniq_count + dst_uniq_count <= addr_reg_max_count,
"Remianed number of address registers should be no more than 8 after spill.");
// Is this only for iselect?
// What if a scalar with indirect addressing is used?
if (spill_src0) {
G4_Operand *new_src0 = insertMovBefore(i, 0, src0->getType(), bb);
inst->setSrc( new_src0, 0 );
}
if (spill_src1 && src1) {
G4_Operand *new_src1 = insertMovBefore(i, 1, src1->getType(), bb);
inst->setSrc( new_src1, 1 );
}
if (spill_dst && dst)
{
G4_DstRegRegion *new_dst = insertMovAfter( i, dst, dst->getType(), bb );
inst->setDest( new_dst );
if( dst != new_dst &&
( IS_FTYPE(dst->getType()) || IS_DFTYPE(dst->getType()) ) )
{
inst->setSaturate( false );
}
}
return spill_dst;
}
/*
 * Two rules are checked here:
 * (1) When the source(s) is/are of float type, the destination must be of float
 * type also. The exception is the MOV instruction, which can be used
 * for explicit type conversion between float and integer.
 *
 * (2) For Gen6, only the following instructions can have
 * integer sources and a float destination:
 * MOV, ADD, MUL, MAC, MAD, LINE
 */
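// Illustrative example (hypothetical operands): a non-mov op with float
// sources and an integer dst gets a float temp dst plus a converting mov:
//   add (8) r10.0<1>:d r20.0<8;8,1>:f r30.0<8;8,1>:f
// -->
//   add (8) TMP(0,0)<1>:f r20.0<8;8,1>:f r30.0<8;8,1>:f
//   mov (8) r10.0<1>:d TMP(0,0)<1;1,0>:f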
bool HWConformity::fixDstType( INST_LIST_ITER i, G4_BB *bb, G4_Type extype )
{
G4_INST *inst = *i;
G4_DstRegRegion *dst = inst->getDst();
if (!dst)
{
return false;
}
if( inst->hasNULLDst() ){
if( inst->opcode() != G4_mov && IS_FTYPE( extype ) && !IS_FTYPE( dst->getType() ) ){
// change type and stride of NULL dst
G4_DstRegRegion *null_dst_opnd = builder.createNullDst( Type_F );
inst->setDest( null_dst_opnd );
}
}
else if( (
(inst->opcode() != G4_mov && !inst->isSend()) &&
(IS_FTYPE( extype ) || IS_HFTYPE(extype)) &&
!(IS_FTYPE(dst->getType()) || IS_HFTYPE(dst->getType()))
) ||
(
IS_FTYPE(dst->getType()) &&
//assumes checks for platform were already done for HF
!(IS_FTYPE(extype) || IS_HFTYPE(extype)) &&
!Opcode_int_src_float_dst_OK( inst->opcode() )
)
)
{
G4_DstRegRegion *new_dst = insertMovAfter(i, dst, extype, bb);
// TODO: since cmp and cmpn have no dst, we do not handle cmp/cmpn dst during MOV inst insertion.
inst->setDest( new_dst );
if( dst != new_dst &&
( IS_FTYPE(dst->getType()) || IS_DFTYPE(dst->getType()) ) )
{
inst->setSaturate( false );
}
return true;
}
return false;
}
// If an accumulator is an explicit source operand, its register region must match that of the
// destination register with the exception(s) described below.
// When OWords of accumulators are accessed, the source and destination OWords may be different
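// Illustrative example (hypothetical operands): with acc0 as an explicit
// source, a dst that is not 16-byte aligned is replaced by an aligned temp:
//   add (8) r10.2<1>:d acc0.0<8;8,1>:d r20.0<8;8,1>:d
// -->
//   add (8) TMP(0,0)<1>:d acc0.0<8;8,1>:d r20.0<8;8,1>:d
//   mov (8) r10.2<1>:d TMP(0,0)<1;1,0>:d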
bool HWConformity::fixAccSrc(INST_LIST_ITER iter, G4_BB* bb)
{
G4_INST *inst = *iter;
bool AccExplictUse = false;
for (int i = 0; i < inst->getNumSrc(); ++i)
{
G4_Operand* src = inst->getSrc(i);
if (src && src->isAccReg())
{
AccExplictUse = true;
break;
}
}
if (AccExplictUse &&
inst->getDst() &&
inst->getDst()->getBase() &&
inst->getDst()->getBase()->isRegVar())
{
int alignByte = 16;
if (!builder.isOpndAligned(inst->getDst(), alignByte))
{
G4_DstRegRegion *new_dst = insertMovAfter(iter, inst->getDst(), inst->getDst()->getType(), bb);
G4_Declare* tmpDstDcl = new_dst->getTopDcl();
G4_SubReg_Align subAlign = Eight_Word;
tmpDstDcl->setSubRegAlign(subAlign);
inst->setDest(new_dst);
return true;
}
}
return false;
}
// check for implicit acc region rules
// -- implicit acc should have the same subreg offset as dst (currently always 0).
// This should be done when creating the instruction with the implicit acc source,
// but the code for generating mac is just way too complicated..
bool HWConformity::fixImplicitAcc(INST_LIST_ITER i, G4_BB* bb)
{
G4_INST *inst = *i;
if (inst->hasImplicitAccSrc())
{
G4_DstRegRegion* dst = inst->getDst();
if (!builder.isOpndAligned(dst, GENX_GRF_REG_SIZ))
{
inst->setDest(insertMovAfter(i, dst, dst->getType(), bb, Sixteen_Word));
return true;
}
}
return false;
}
bool HWConformity::fixAccDst( INST_LIST_ITER i, G4_BB* bb )
{
G4_INST *inst = *i;
G4_DstRegRegion *dst = inst->getDst();
int exec_size = inst->getExecSize( );
bool compressed = isCompressedInst( inst );
int uncompressed_exec_size = compressed ? exec_size / 2: exec_size;
bool insertMov = false;
unsigned short dst_hs = dst->getHorzStride();
bool covers_whole_acc = ( dst_hs == 1 ) &&
( ( inst->getExecSize() * dst->getExecTypeSize() ) % G4_GRF_REG_NBYTES == 0 );
bool non_null_dst = ( inst->opcode() == G4_mach && inst->getDst() && !inst->getDst()->isNullReg() );
// Locate the only use corresponding to the ACC definition in
// the same basic block.
bool found_acc_use = false;
INST_LIST_ITER iter = i;
G4_INST *acc_use_op = NULL;
iter++;
for (auto useIter = inst->use_begin(); useIter != inst->use_end(); useIter++)
{
if( (*useIter).second == Opnd_pred || (*useIter).second == Opnd_dst )
{
continue;
}
if( (*useIter).second == Opnd_implAccSrc )
{
acc_use_op = (*useIter).first;
found_acc_use = true;
break;
}
G4_Operand *use = (*useIter).first->getSrc( (*useIter).second - 1 );
if( use && use->isAccReg() )
{
acc_use_op = (*useIter).first;
found_acc_use = true;
break;
}
}
MUST_BE_TRUE( found_acc_use || non_null_dst, "Defined ACC is not used in the same BB." );
if( !found_acc_use )
{
return insertMov;
}
// get iterator of acc use inst
while( (*iter) != acc_use_op )
{
iter++;
}
G4_DstRegRegion *acc_use_op_dst = acc_use_op->getDst();
bool null_use_op_dst = acc_use_op->hasNULLDst();
if( acc_use_op->opcode() == G4_mach && null_use_op_dst )
{
return insertMov;
}
// If the entire ACC, either acc0 or both acc0 and acc1, is not
// covered by the ACC definition then we need to ensure the channel
// corresponding to the only use's destination sub-register will
// contain the value.
if ( !covers_whole_acc )
{
bool need_replication;
bool can_replicate;
if( found_acc_use )
{
G4_Operand *src0 = inst->getSrc(0), *src1 = inst->getSrc(1), *src2 = inst->getSrc(2);
// Decision making - start
// If the acc_def_op has a unit exec_size while its use is not, then we
// can and should replicate.
if ( uncompressed_exec_size == 1 && acc_use_op->getExecSize() != 1 )
{
need_replication = true;
can_replicate = true;
}
// If the acc def use destination has a dynamic offset then
// it is not GenX conformant and we cannot guarantee safe
// replication of ACC channels.
else if ( acc_use_op_dst && acc_use_op_dst->getRegAccess() != Direct )
{
need_replication = true;
can_replicate = false;
}
// If the destination is guaranteed to be GRF aligned then we
// still have a chance to attempt replication if the sub-register
// offset is not zero. If the offset is zero then we are already
// GenX conformant and there is no need to replicate.
else
{
unsigned short offset;
bool opndGRFAligned = builder.isOpndAligned( acc_use_op_dst, offset, G4_GRF_REG_NBYTES );
if ( !acc_use_op_dst || null_use_op_dst )
{
opndGRFAligned = true;
}
if( opndGRFAligned )
{
// Replicate if the offset of the acc def use is not zero;
// replication is possible only if the offset is
// a multiple of the acc def exec size.
if (offset % G4_GRF_REG_NBYTES != 0)
{
need_replication = true;
can_replicate = (offset % uncompressed_exec_size == 0);
}
else
{
need_replication = false;
can_replicate = false;
}
}
// The offset is at least guaranteed to be aligned w.r.t the
// destination type. This implies that if the execution size
// is one then we can always replicate.
else if (uncompressed_exec_size == 1)
{
need_replication = true;
can_replicate = true;
}
// Destination is not guaranteed to be GRF aligned and execution
// size is not one either.
else
{
need_replication = true;
can_replicate = false;
}
}
// Check if we can safely set the vertical stride of the
// destination to zero.
if (need_replication && can_replicate)
{
can_replicate =
( (!src0 || isUnitRegionRow( src0, exec_size ) ) &&
(!src1 || isUnitRegionRow( src1, exec_size ) ) &&
(!src2 || isUnitRegionRow( src2, exec_size ) ) ) ;
// If we replicated a mac/mach we need to match the implicit
// ACC source region with its definition's region as
// well. Make sure we can do that for a chain of mac/mach.
INST_LIST_ITER def_inst_iter = i;
while ( (*def_inst_iter)->hasImplicitAccSrc() )
{
INST_LIST_ITER mac_acc_def_iter = def_inst_iter;
// Find the definition for the implicit ACC source which
// must be in the same basic block.
bool found = false;
for (auto def_iter = (*def_inst_iter)->def_begin();
def_iter != (*def_inst_iter)->def_end();
def_iter++ )
{
if( (*def_iter).second == Opnd_implAccSrc )
{
found = true;
break;
}
}
MUST_BE_TRUE( found, "Acc is not defined in the same BB." );
if ( ( *mac_acc_def_iter )->getExecSize() == exec_size ) {
break;
}
else {
G4_Operand *src0 = ( *mac_acc_def_iter )->getSrc(0), *src1 = ( *mac_acc_def_iter )->getSrc(1),
*src2 = ( *mac_acc_def_iter )->getSrc(2);
// Check if we can replicate this mac(h) in the chain.
if ( ( !src0 || isUnitRegionRow( src0, exec_size ) ) &&
( !src1 || isUnitRegionRow( src1, exec_size ) ) &&
( !src2 || isUnitRegionRow( src2, exec_size ) ) ){
def_inst_iter = mac_acc_def_iter;
}
else {
can_replicate = false;
break;
}
}
}
}
// Decision making - end
// Handle case (1) - perform replication across ACC channels.
// all sources are SrcRegRegion now
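// Rough example (assumed values): for a SIMD1 def of a :d value,
// acc_replication_factor = 32 / (1 * 4) = 8, so the def's exec size is
// widened to 8 and each source region gets vertStride 0 to replicate the
// scalar result across all 8 ACC channels.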
if (need_replication && can_replicate)
{
int acc_replication_factor =
G4_GRF_REG_NBYTES /
(uncompressed_exec_size * dst->getExecTypeSize() );
exec_size *= acc_replication_factor;
inst->setExecSize( (unsigned char) exec_size );
if( src0 && src0->isSrcRegRegion() )
{
RegionDesc *rd = builder.createRegionDesc( 0,
src0->asSrcRegRegion()->getRegion()->width,
src0->asSrcRegRegion()->getRegion()->horzStride );
src0->asSrcRegRegion()->setRegion( rd );
}
if( src1 && src1->isSrcRegRegion() )
{
RegionDesc *rd = builder.createRegionDesc( 0,
src1->asSrcRegRegion()->getRegion()->width,
src1->asSrcRegRegion()->getRegion()->horzStride );
src1->asSrcRegRegion()->setRegion( rd );
}
if( src2 && src2->isSrcRegRegion() )
{
RegionDesc *rd = builder.createRegionDesc( 0,
src2->asSrcRegRegion()->getRegion()->width,
src2->asSrcRegRegion()->getRegion()->horzStride );
src2->asSrcRegRegion()->setRegion( rd );
}
// If we replicated a mac/mach we need to match the implicit
// ACC source region with its definition's region as
// well.
INST_LIST_ITER def_inst_iter = i;
while ( (*def_inst_iter)->hasImplicitAccSrc() )
{
INST_LIST_ITER mac_acc_def_iter = def_inst_iter;
// Find the definition for the implicit ACC source which
// must be in the same basic block.
bool found = false;
while( mac_acc_def_iter != bb->instList.begin() )
{
mac_acc_def_iter--;
if ( ( *mac_acc_def_iter )->getDst() &&
( *mac_acc_def_iter )->getDst()->isAccReg() )
{
found = true;
break;
}
}
MUST_BE_TRUE( found, "Acc is not defined in the same BB." );
if ( ( *mac_acc_def_iter )->getExecSize() != exec_size)
{
( *mac_acc_def_iter )->setExecSize( (unsigned char) exec_size );
G4_Operand *src0 = ( *mac_acc_def_iter )->getSrc(0), *src1 = ( *mac_acc_def_iter )->getSrc(1),
*src2 = ( *mac_acc_def_iter )->getSrc(2);
if( src0 && src0->isSrcRegRegion() )
{
RegionDesc *rd = builder.createRegionDesc( 0,
src0->asSrcRegRegion()->getRegion()->width,
src0->asSrcRegRegion()->getRegion()->horzStride );
src0->asSrcRegRegion()->setRegion( rd );
}
if( src1 && src1->isSrcRegRegion() )
{
RegionDesc *rd = builder.createRegionDesc( 0,
src1->asSrcRegRegion()->getRegion()->width,
src1->asSrcRegRegion()->getRegion()->horzStride );
src1->asSrcRegRegion()->setRegion( rd );
}
if( src2 && src2->isSrcRegRegion() )
{
RegionDesc *rd = builder.createRegionDesc( 0,
src2->asSrcRegRegion()->getRegion()->width,
src2->asSrcRegRegion()->getRegion()->horzStride );
src2->asSrcRegRegion()->setRegion( rd );
}
}
def_inst_iter = mac_acc_def_iter;
}
}
// Handle case (2) - replace the destination with a GRF-boundary
// aligned temporary.
else if (need_replication && !can_replicate && acc_use_op_dst != NULL)
{
// Replace the destination of the acc use to be a temporary
// GRF that is aligned to GRF boundary
uint32_t inst_opt = acc_use_op->getOption();
G4_Declare *aligned_grf_dcl = builder.createTempVar(
(unsigned short) (exec_size * acc_use_op_dst->getHorzStride()),
acc_use_op_dst->getType(),
Either,
Sixteen_Word );
G4_DstRegRegion *aligned_grf_dst_opnd = builder.createDstRegRegion(
Direct,
aligned_grf_dcl->getRegVar(),
0,
0,
acc_use_op_dst->getHorzStride(),
acc_use_op_dst->getType());
MUST_BE_TRUE( acc_use_op->getExecSize() == exec_size, "ACC def and use instructions have different execution size." );
acc_use_op->setDest( aligned_grf_dst_opnd );
// Insert a mov instruction to the original destination.
unsigned short vs = aligned_grf_dst_opnd->getHorzStride();
RegionDesc *rd = builder.createRegionDesc((uint16_t)exec_size, vs, 1, 0);
G4_SrcRegRegion *mov_src_opnd = builder.createSrcRegRegion(
Mod_src_undef,
Direct,
aligned_grf_dcl->getRegVar(),
0,
0,
rd,
aligned_grf_dcl->getElemType());
G4_INST *new_mov_inst = builder.createInternalInst(
acc_use_op->getPredicate(),
G4_mov,
acc_use_op->getCondMod(),
acc_use_op->getSaturate(),
(unsigned char) exec_size,
acc_use_op_dst,
mov_src_opnd,
NULL,
inst_opt,
acc_use_op->getLineNo(),
acc_use_op->getCISAOff(),
acc_use_op->getSrcFilename() );
iter++;
bb->instList.insert( iter, new_mov_inst );
if( acc_use_op_dst->getType() == Type_F )
{
acc_use_op->setSaturate(false);
}
acc_use_op->setPredicate( NULL );
insertMov = true;
}
}
}
else
{
// it is possible that the def covers the whole acc, but the dst of the
// use inst is not GRF-aligned; insert a MOV for this case
if( !builder.isOpndAligned( acc_use_op_dst, G4_GRF_REG_NBYTES ) )
{
while( *iter != acc_use_op )
{
iter++;
}
insertMov = true;
acc_use_op->setDest( insertMovAfter( iter, acc_use_op_dst, acc_use_op_dst->getType(), bb ) );
}
}
return insertMov;
}
/*
* When operation execution size is 1, destination horizontal stride is set
* according to rule 10.2:
*
* 10.1.2. If ExecSize is greater than 1, dst.HorzStride*sizeof(dst.Type) must
* be equal to or greater than the size of the execution data type.
* 10.2. If ExecSize is 1, dst.HorzStride must not be 0. Note that this is
* relaxed from rule 10.1.2. Also note that this rule for destination
* horizontal stride is different from that for source as stated
* in rule #7.
*
* There are some instructions which work unpredictably if both ExecSize
* and dst.HorzStride are 1. But they work fine if dst.HorzStride is set
* according to rule 10.1.2. So we have to correct all such cases.
*
* This is supposed to be the last operation before emitting the final assembly code.
*/
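// Illustrative example (hypothetical operands): with a D execution type
// (extypesize == 4) and a :w destination,
//     mov (1) r10.0<1>:w r11.0<0;1,0>:d
// is corrected to
//     mov (1) r10.0<2>:w r11.0<0;1,0>:d
// so that dst.HorzStride * sizeof(dst.Type) matches the execution type size.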
void HWConformity::fixDstHstride( INST_LIST_ITER i, int extypesize )
{
G4_INST *inst = *i;
G4_DstRegRegion *dst = inst->getDst();
if (dst)
{
int dst_elsize = G4_Type_Table[dst->getType()].byteSize;
unsigned short hs = dst->getHorzStride();
if( hs * dst_elsize < extypesize )
{
dst->setHorzStride( (unsigned short) (extypesize/dst_elsize) );
}
}
}
template<class T>
bool isPreAssignedRegOffsetNonZero(T* region)
{
// T is non-NULL and either
// G4_SrcRegRegion or G4_DstRegRegion
bool ret = false;
if ((region->isSrcRegRegion() || region->isDstRegRegion()) &&
region->getBase() &&
region->getBase()->isRegVar() &&
region->getBase()->asRegVar()->isPhyRegAssigned() &&
region->getBase()->asRegVar()->getPhyRegOff() != 0)
{
ret = true;
}
return ret;
}
void HWConformity::generateMacl(INST_LIST_ITER it, G4_BB* bb)
{
G4_INST* mulInst = *it;
MUST_BE_TRUE(mulInst->opcode() == G4_mul, "expect mul instruction");
if (mulInst->getExecSize() == 16)
{
auto startIter = it;
bool isFirstInst = startIter == bb->instList.begin();
if (!isFirstInst)
{
--startIter;
}
evenlySplitInst(it, bb);
if (!isFirstInst)
{
++startIter;
}
// startIter now points to first mul created by split
auto endIter = it;
++endIter;
// endIter points to the first inst after the original mul
for (auto iter = startIter; iter != endIter;)
{
auto nextIter = iter;
++nextIter;
G4_INST* currInst = *iter;
if (currInst->opcode() == G4_mul)
{
doGenerateMacl(iter, bb);
}
iter = nextIter;
}
}
else
{
doGenerateMacl(it, bb);
}
}
// convert vISA mul (8) dst src0 src1 into
// mul (8) acc0.0<1>:d src0:d src1:w
// mach (8) dst:d src0:d src1:d
//
void HWConformity::doGenerateMacl(INST_LIST_ITER it, G4_BB *bb)
{
G4_INST* mulInst = *it;
MUST_BE_TRUE(mulInst->opcode() == G4_mul, "expect mul instruction");
assert(mulInst->getExecSize() <= 8 && "expect simd8 or less inst");
G4_Operand* src0 = mulInst->getSrc(0);
G4_Operand* src1 = mulInst->getSrc(1);
MUST_BE_TRUE(IS_DTYPE(src0->getType()) && IS_DTYPE(src1->getType()), "both sources must have dword type");
if (src1->isSrcRegRegion())
{
G4_SrcRegRegion* src1Region = src1->asSrcRegRegion();
if (src1Region->getModifier() != Mod_src_undef)
{
// need extra move for the modifier
src1 = insertMovBefore(it, 1, src1->getType(), bb);
mulInst->setSrc(src1, 1);
}
}
// sat cannot be used at all in the macro sequence
// this effectively means sat is broken for mul D D D
mulInst->setSaturate(false);
G4_DstRegRegion* origDst = mulInst->getDst();
G4_Type accType = (IS_UNSIGNED_INT(src0->getType()) && IS_UNSIGNED_INT(src1->getType())) ? Type_UD : Type_D;
G4_DstRegRegion *accDstOpnd = builder.createDstRegRegion(Direct, builder.phyregpool.getAcc0Reg(), 0, 0, 1, accType);
mulInst->setDest(accDstOpnd);
uint32_t origOptions = mulInst->getOption();
fixMulSrc1(it, bb);
mulInst->setOptionOn(InstOpt_WriteEnable);
G4_Predicate* predicate = mulInst->getPredicate();
if (predicate != nullptr)
{
// move pred to mach
mulInst->setPredicate(nullptr);
}
if (mulInst->getCondMod() != nullptr)
{
// conditional modifier cannot be used
// when the MUL source operand is of dword type.
MUST_BE_TRUE(false, "Dw multiply does not support conditional modifiers");
mulInst->setCondMod(nullptr);
}
// create a mach inst
G4_INST* machInst = builder.createInternalInst(predicate, G4_mach, nullptr, false, mulInst->getExecSize(),
origDst, builder.duplicateOperand(src0), builder.duplicateOperand(src1), origOptions,
mulInst->getLineNo(), mulInst->getCISAOff(), mulInst->getSrcFilename());
// maintain du chain as fixAccDst uses it later
G4_SrcRegRegion *accSrcOpnd = builder.createSrcRegRegion(Mod_src_undef, Direct,
builder.phyregpool.getAcc0Reg(), 0, 0, builder.getRegionStride1(), accType);
machInst->setImplAccSrc(accSrcOpnd);
mulInst->addDefUse(machInst, Opnd_implAccSrc);
INST_LIST_ITER machIter = it;
machIter = bb->instList.insert(++machIter, machInst);
if (!IS_DTYPE(origDst->getType()) || origDst->getHorzStride() != 1 ||
!builder.isOpndAligned(origDst, 32))
{
// mach dst must be grf-aligned, packed D/UD as it is also used for the implicit acc source's region
G4_DstRegRegion* tmpDst = insertMovAfter(machIter, origDst, accType, bb);
machInst->setDest(tmpDst);
}
}
// If both source operands of a MUL instruction are of dword integer type,
// only the lower 16 bits of the data elements in src1 are used.
// The full precision multiplication results can only be produced together
// with the mach and mov instructions.
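// A sketch of the expansion performed below (hypothetical registers):
//     mul (8) r10.0<1>:d r2.0<8;8,1>:d r3.0<8;8,1>:d
// becomes
//     mul (8) acc0.0<1>:d r2.0<8;8,1>:d r3.0<16;8,2>:uw
//     mach (8) null<1>:d r2.0<8;8,1>:d r3.0<8;8,1>:d
//     mov (8) r10.0<1>:d acc0.0<8;8,1>:d
// A following mulh on the same sources can reuse the mach (with a real dst)
// to obtain the high 32 bits.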
bool HWConformity::fixMULInst( INST_LIST_ITER &i, G4_BB *bb )
{
bool insertedInst = false;
G4_INST *inst = *i;
G4_DstRegRegion *dst = inst->getDst();
uint8_t exec_size = inst->getExecSize();
bool srcExchanged = false;
if (dst && dst->isAccReg())
{
return false;
}
uint32_t inst_opt = inst->getOption();
G4_Operand *src0 = inst->getSrc(0), *src1 = inst->getSrc(1);
// MUL is commutative and only
// allows src1 to be a constant.
// If src0 is a constant and src1
// is not, they are swapped here.
// If both are constants, the case
// is fixed later in fixOpnd during
// the HW conformity check.
if (src0->isImm() && !src1->isImm())
{
inst->setSrc( src1, 0 );
inst->setSrc( src0, 1 );
srcExchanged = true;
}
src0 = inst->getSrc(0);
src1 = inst->getSrc(1);
// Q dst needs 64-bit support regardless of src type
bool isDMul = IS_QTYPE(dst->getType()) || (IS_DTYPE(src0->getType()) && IS_DTYPE(src1->getType()));
if (!isDMul)
{
return false;
}
if (builder.hasMacl() && !IS_QTYPE(dst->getType()) &&
(builder.no64bitType() || inst->getExecSize() > 1))
{
// use macl for D = D x D. We use macl when possible
// except on scalar inst on platforms that support native DMul
generateMacl(i, bb);
return true;
}
bool doNativeMul = false;
if (!builder.no64bitRegioning())
{
// platform natively supports DW-DW multiply, no need to generate mul/mach/mov sequence
doNativeMul = true;
}
else
{
if ((getGenxPlatform() == GENX_CHV || getGenxPlatform() == GENX_BXT))
{
if (inst->getExecSize() == 1)
{
// scalar insts are a-ok
return false;
}
// ok if source is scalar or qword-aligned
doNativeMul = (getTypeSize(dst->getType()) * dst->getHorzStride() == 8);
auto isQWordStride = [inst, this](G4_SrcRegRegion* src)
{
RegionDesc* region = src->getRegion();
if (!region->isScalar())
{
uint16_t stride = 0;
(void) region->isSingleNonUnitStride(inst->getExecSize(), stride);
if (stride != 2)
{
return false;
}
// check that source is GRF-aligned to ensure that every element is qword-aligned
return builder.isOpndAligned(src, 32);
}
return true;
};
if (doNativeMul && src0->isSrcRegRegion())
{
doNativeMul = isQWordStride(src0->asSrcRegRegion());
}
if (doNativeMul && src1->isSrcRegRegion())
{
doNativeMul = isQWordStride(src1->asSrcRegRegion());
}
}
}
if (doNativeMul)
{
// promote source to D type if necessary
if (IS_QTYPE(dst->getType()))
{
G4_Type newTy;
G4_Operand* newOpnd;
if (!IS_DTYPE(src0->getType()))
{
newTy = IS_SIGNED_INT(src0->getType()) ? Type_D : Type_UD;
newOpnd = insertMovBefore(i, 0, newTy, bb);
inst->setSrc(newOpnd, 0);
insertedInst = true;
}
if (!IS_DTYPE(src1->getType()))
{
newTy = IS_SIGNED_INT(src1->getType()) ? Type_D : Type_UD;
if (src1->isImm())
{
newOpnd = builder.createImm(src1->asImm()->getImm(), newTy);
}
else
{
newOpnd = insertMovBefore(i, 1, newTy, bb);
}
inst->setSrc(newOpnd, 1);
insertedInst = true;
}
}
return insertedInst;
}
// both sources are dword, replace with mul/mach/mov sequence
// At this point, src0 and src1 are both DW, so we simply make
// acc's type (i.e. dst_type) be DW/UD
G4_CondMod *condmod = (G4_CondMod *)builder.duplicateOperand(inst->getCondMod());
G4_Predicate *pred = (G4_Predicate *)builder.duplicateOperand(inst->getPredicate());
// check if the following inst is mulh and uses the same srcs as this mul.
// if true, translate them into
// mul acc src0 src1
// mach dst_mulh src0 src1
// mov mul_dst acc0
INST_LIST_ITER next_i = i;
next_i++;
G4_Type tmp_type = (IS_UNSIGNED_INT(src0->getType()) && IS_UNSIGNED_INT(src1->getType())) ? Type_UD : Type_D;
bool isCompressed = isCompressedInst(inst);
if (src1->isSrcRegRegion())
{
G4_SrcRegRegion* src1Region = src1->asSrcRegRegion();
if (src1Region->getModifier() != Mod_src_undef)
{
// need extra move for the modifier
src1 = insertMovBefore(i, 1, src1->getType(), bb);
inst->setSrc(src1, 1);
}
}
bool sat_mod = inst->getSaturate();
inst->setSaturate(false);
// see if we can combine this mul with a mulh following it
if (next_i != bb->instList.end())
{
G4_INST *next_inst = *next_i;
if (next_inst->opcode() == G4_mulh &&
next_inst->getExecSize() == exec_size &&
inst->getPredicate() == next_inst->getPredicate() &&
((srcExchanged &&
src0->getType() == next_inst->getSrc(1)->getType() &&
src0->compareOperand(next_inst->getSrc(1)) == Rel_eq &&
src1->getType() == next_inst->getSrc(0)->getType() &&
src1->compareOperand(next_inst->getSrc(0)) == Rel_eq) ||
(!srcExchanged &&
src0->getType() == next_inst->getSrc(0)->getType() &&
src0->compareOperand(next_inst->getSrc(0)) == Rel_eq &&
src1->getType() == next_inst->getSrc(1)->getType() &&
src1->compareOperand(next_inst->getSrc(1)) == Rel_eq)))
{
// change current mul inst
G4_DstRegRegion *acc_dst_opnd = builder.createDstRegRegion(
Direct,
builder.phyregpool.getAcc0Reg(),
0,
0,
1,
tmp_type);
inst->setDest(acc_dst_opnd);
fixMulSrc1(i, bb);
inst->transferUse(next_inst, true);
inst->addDefUse(next_inst, Opnd_implAccSrc);
// change mulh inst
next_inst->setOpcode(G4_mach);
G4_DstRegRegion *next_dst = next_inst->getDst();
if (next_dst != NULL &&
(next_inst->getSaturate() ||
next_dst->getByteOffset() % GENX_GRF_REG_SIZ != 0 ||
(bb->isInSimdFlow() && next_inst->isWriteEnableInst() == false) ||
(next_dst &&
((next_dst->getExecTypeSize() > G4_Type_Table[Type_D].byteSize) ||
isPreAssignedRegOffsetNonZero<G4_DstRegRegion>(next_dst)))))
{
// add a tmp mov
G4_DstRegRegion *new_next_dst = insertMovAfter(next_i, next_dst, next_dst->getType(), bb);
next_inst->setDest(new_next_dst);
}
// set implicit source/dst for MACH
RegionDesc *rd = NULL;
unsigned short vs = 0, wd = exec_size, hs = 0;
if (exec_size > 1){
if (exec_size == 16){
wd = wd / 2;
}
hs = 1;
vs = wd;
}
rd = builder.createRegionDesc(vs, wd, hs);
G4_SrcRegRegion *acc_src_opnd = builder.createSrcRegRegion(Mod_src_undef, Direct, builder.phyregpool.getAcc0Reg(), 0, 0, rd, tmp_type);
next_inst->setImplAccSrc(acc_src_opnd);
next_inst->setImplAccDst(builder.createDstRegRegion(*acc_dst_opnd));
// create mov inst
G4_SrcRegRegion* movAccSrc = builder.createSrcRegRegion(Mod_src_undef, Direct, builder.phyregpool.getAcc0Reg(), 0, 0, rd, tmp_type);
G4_INST* newMov = builder.createInternalInst(pred, G4_mov, condmod, false, exec_size, dst, movAccSrc, NULL, inst_opt,
inst->getLineNo(), inst->getCISAOff(), inst->getSrcFilename());
INST_LIST_ITER iter = next_i;
iter++;
bb->instList.insert(iter, newMov);
next_inst->addDefUse(newMov, Opnd_src0);
INST_LIST_ITER last_iter = iter;
last_iter--;
if (dst != NULL &&
(sat_mod ||
(dst &&
((dst->getExecTypeSize() > G4_Type_Table[Type_D].byteSize) ||
(isPreAssignedRegOffsetNonZero<G4_DstRegRegion>(dst))))))
{
// add a tmp mov
iter--;
G4_DstRegRegion *new_next_dst = insertMovAfter(iter, dst, dst->getType(), bb);
newMov->setDest(new_next_dst);
if (new_next_dst != dst && sat_mod)
{
MUST_BE_TRUE(iter != bb->instList.end() && (*iter)->opcode() == G4_mov,
"Next instruciton should be the MOV generated for consistent Dst and ACC source region.");
(*iter)->setSaturate(false);
}
}
next_inst->setOptionOn(InstOpt_AccWrCtrl);
if (exec_size == 16)
{
splitDWMULInst(i, last_iter, bb);
}
return true;
}
}
G4_DstRegRegion *acc_dst_opnd = builder.createDstRegRegion(Direct, builder.phyregpool.getAcc0Reg(), 0, 0, 1, tmp_type);
inst->setDest(acc_dst_opnd);
fixMulSrc1(i, bb);
if (bb->isInSimdFlow())
{
inst->setOptions((inst->getOption() & ~InstOpt_Masks) | InstOpt_WriteEnable);
}
if (pred != NULL) {
// conditional modifier cannot be used
// when the MUL source operand is of dword type.
inst->setCondMod(NULL);
}
// Dst is either null, or a temp D if the original dst is Q/UQ
G4_DstRegRegion *machDst = NULL;
G4_Declare* high32BitDcl = NULL;
if (IS_QTYPE(dst->getType()))
{
high32BitDcl = builder.createTempVar(exec_size, Type_D, Either, Any);
machDst = builder.Create_Dst_Opnd_From_Dcl(high32BitDcl, 1);
}
else
{
machDst = builder.createNullDst(Type_D);
}
// create a mach inst
G4_INST* newInst = builder.createInternalInst(
NULL,
G4_mach,
NULL,
false,
exec_size,
machDst,
builder.duplicateOperand(src0),
builder.duplicateOperand(src1),
inst_opt,
inst->getLineNo(),
inst->getCISAOff(),
inst->getSrcFilename());
newInst->setOptionOn(InstOpt_AccWrCtrl);
INST_LIST_ITER iter = i;
iter++;
bb->instList.insert(iter, newInst);
inst->setPredicate(NULL);
inst->copyDef(newInst, Opnd_src0, Opnd_src0);
inst->copyDef(newInst, Opnd_src1, Opnd_src1);
inst->transferUse(newInst);
inst->addDefUse(newInst, Opnd_implAccSrc);
// create an implicit source for MACH
RegionDesc *rd = NULL;
unsigned short vs = 0, wd = exec_size, hs = 0;
if (exec_size > 1){
if (isCompressed){
wd = wd / 2;
}
hs = 1;
vs = wd;
}
rd = builder.createRegionDesc(vs, wd, hs);
G4_SrcRegRegion *acc_src_opnd = builder.createSrcRegRegion(Mod_src_undef, Direct,
builder.phyregpool.getAcc0Reg(), 0, 0, rd, tmp_type);
newInst->setImplAccSrc(acc_src_opnd);
// set an implicit dst for MACH
newInst->setImplAccDst(builder.createDstRegRegion(*acc_dst_opnd));
insertedInst = true;
if (IS_QTYPE(dst->getType()))
{
// we have to produce two additional moves to form the Q/UQ:
// mul (8) acc0:d r2.0<8;8,1>:d r3.0<16;8,2>:uw
// mach (8) r5.0<1>:d r2.0<8;8,1>:d r3.0<8;8,1>:d
// mov (8) r6.0<1>:d acc0:d // Low 32 bits.
// mov (8) dst.0<2>:d r6.0<1>:d
// mov (8) dst.1<2>:d r5.0<1>:d
// Note that we don't try to combine the moves because of the HW restriction that
// "If an accumulator is an explicit source operand, its register region must match that of the destination register"
G4_Declare* low32BitDcl = builder.createTempVar(exec_size, Type_D, Either, Any);
G4_INST* movInst = builder.createInternalInst(NULL, G4_mov, NULL, false, exec_size,
builder.Create_Dst_Opnd_From_Dcl(low32BitDcl, 1),
builder.createSrcRegRegion(*acc_src_opnd), NULL, inst_opt,
inst->getLineNo(), inst->getCISAOff(), inst->getSrcFilename());
bb->instList.insert(iter, movInst);
G4_DstRegRegion* origDst = dst;
bool needsExtraMov = origDst->getHorzStride() > 1 || condmod != NULL || sat_mod;
G4_Declare* dstAlias = builder.createTempVar(exec_size * 2, Type_D, Either, Any);
if (!needsExtraMov)
{
uint32_t aliasOffset = origDst->getRegOff() * GENX_GRF_REG_SIZ + origDst->getSubRegOff() * 8;
dstAlias->setAliasDeclare(origDst->getBase()->asRegVar()->getDeclare(), aliasOffset);
}
G4_INST* lowMove = builder.createInternalInst(pred, G4_mov, NULL, false, exec_size,
builder.Create_Dst_Opnd_From_Dcl(dstAlias, 2),
builder.Create_Src_Opnd_From_Dcl(low32BitDcl, builder.getRegionStride1()),
NULL, inst_opt);
bb->instList.insert(iter, lowMove);
MUST_BE_TRUE(high32BitDcl != NULL, "mach dst must not be null");
G4_INST* highMove = builder.createInternalInst(pred, G4_mov, NULL, false, exec_size,
builder.createDstRegRegion(Direct, dstAlias->getRegVar(), 0, 1, 2, dstAlias->getElemType()),
builder.Create_Src_Opnd_From_Dcl(high32BitDcl, builder.getRegionStride1()),
NULL, inst_opt);
bb->instList.insert(iter, highMove);
if (needsExtraMov)
{
// this will take care of non-packed dst/cond mod/saturate
G4_Declare* dstAliasAsQ = builder.createTempVar(exec_size, Type_Q, Either, Any);
dstAliasAsQ->setAliasDeclare(dstAlias, 0);
G4_INST* moveInst = builder.createInternalInst(NULL, G4_mov, condmod, sat_mod, exec_size,
dst,
builder.Create_Src_Opnd_From_Dcl(dstAliasAsQ, builder.getRegionStride1()),
NULL, inst_opt);
bb->instList.insert(iter, moveInst);
}
return true;
}
INST_LIST_ITER last_iter;
// create a mov inst
if (sat_mod == false)
{
bool extra_mov = dst &&
dst->getExecTypeSize() > G4_Type_Table[Type_D].byteSize;
extra_mov |= (isPreAssignedRegOffsetNonZero<G4_DstRegRegion>(dst));
G4_INST* movInst = builder.createInternalInst(pred, G4_mov, condmod, false, exec_size, dst,
builder.createSrcRegRegion(*acc_src_opnd), NULL, inst_opt,
inst->getLineNo(), inst->getCISAOff(), inst->getSrcFilename());
newInst->transferUse(movInst);
newInst->addDefUse(movInst, Opnd_src0);
bb->instList.insert(iter, movInst);
last_iter = iter;
last_iter--;
if (extra_mov)
{
// add a tmp mov
iter--;
G4_DstRegRegion *new_next_dst = insertMovAfter(iter, dst, dst->getType(), bb);
movInst->setDest(new_next_dst);
movInst->setPredicate(NULL);
}
}
else
{
// create an extra mov inst
G4_Declare *dcl = builder.createTempVar(
exec_size,
tmp_type,
Either,
Sixteen_Word);
G4_DstRegRegion *tmp_dst_opnd = builder.createDstRegRegion(
Direct,
dcl->getRegVar(),
0,
0,
1,
tmp_type);
G4_INST* movInst = builder.createInternalInst(NULL, G4_mov, condmod, false, exec_size, tmp_dst_opnd,
builder.createSrcRegRegion(*acc_src_opnd), NULL, InstOpt_NoOpt,
inst->getLineNo(), inst->getCISAOff(), inst->getSrcFilename());
bb->instList.insert(iter, movInst);
last_iter = iter;
last_iter--;
G4_SrcRegRegion *tmp_src_opnd = builder.createSrcRegRegion(Mod_src_undef, Direct, dcl->getRegVar(), 0, 0, rd, tmp_type);
G4_INST *newInst2 = builder.createInternalInst(pred, G4_mov, condmod, sat_mod, exec_size, dst, tmp_src_opnd, NULL, inst_opt,
inst->getLineNo(), inst->getCISAOff(), inst->getSrcFilename());
newInst->transferUse(newInst2);
newInst->addDefUse(movInst, Opnd_src0);
movInst->addDefUse(newInst2, Opnd_src0);
bb->instList.insert(iter, newInst2);
iter++;
}
if (exec_size == 16)
{
splitDWMULInst(i, last_iter, bb);
}
return insertedInst;
}
// Translate MULH into
// MUL acc src0 src1
// MACH dst src0 src1
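// On platforms with 64b regioning and exec size <= 8, we instead use a
// native Q-typed mul and extract the high dword, roughly (hypothetical
// registers):
//     mulh (8) r10.0<1>:ud r2.0<8;8,1>:ud r3.0<8;8,1>:ud
// becomes
//     mul (8) tmp.0<1>:uq r2.0<8;8,1>:ud r3.0<8;8,1>:ud
//     mov (8) r10.0<1>:ud tmp.1<2;1,0>:ud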
void HWConformity::fixMULHInst( INST_LIST_ITER &i, G4_BB *bb )
{
G4_INST *inst = *i;
INST_LIST_ITER iter = i;
uint8_t exec_size = inst->getExecSize( );
int inst_opt = inst->getOption();
G4_Operand *src0 = inst->getSrc(0), *src1 = inst->getSrc(1);
if ( src0->isImm() && !src1->isImm() ){
inst->setSrc( src1, 0 );
inst->setSrc( src0, 1 );
src0 = inst->getSrc(0);
src1 = inst->getSrc(1);
}
unsigned int instExecSize = inst->getExecSize();
if (instExecSize <= 8 && !builder.no64bitRegioning())
{
// use mul Q D D to get the upper 32-bit
// note that we don't do this for CHV/BXT due to the 64-bit type restrictions
inst->setOpcode(G4_mul);
G4_DstRegRegion *dst = inst->getDst();
G4_Type dstType = dst->getType();
if (dstType == Type_UD)
dstType = Type_UQ;
else
dstType = Type_Q;
G4_Declare *dstDcl = dst->getBase()->asRegVar()->getDeclare();
G4_Declare *tmpDcl = builder.createTempVar(
dstDcl->getNumElems(),
dstType,
dstDcl->getAlign(),
dstDcl->getSubRegAlign(),
"TV");
G4_DstRegRegion* tmpDst = builder.Create_Dst_Opnd_From_Dcl(tmpDcl, 1);
inst->setDest(tmpDst);
//need move to cast back to D/UD type
G4_SrcRegRegion *tmpSrc = builder.createSrcRegRegion(
Mod_src_undef,
Direct,
tmpDcl->getRegVar(),
0,
1,
instExecSize > 1 ? builder.getRegionStride2() : builder.getRegionScalar(),
dst->getType());
++iter;
G4_INST *tmpMov = builder.createInternalInst(
builder.duplicateOperand(inst->getPredicate()),
G4_mov,
NULL,
false,
(unsigned char) instExecSize,
dst,
tmpSrc,
NULL,
NULL,
inst->getOption(),
inst->getLineNo(),
inst->getCISAOff(),
inst->getSrcFilename());
bb->instList.insert(iter, tmpMov);
//it will decrement back to mov
i = iter;
/*
Need to remove dst from the use list of mulh and add it to movInst's use list,
add movInst to the use list of mulh,
and add mulh to the def list of movInst.
*/
inst->transferUse(tmpMov);
inst->addDefUse(tmpMov, Opnd_src0);
return;
}
if(src1->isSrcRegRegion() && src1->asSrcRegRegion()->getModifier() != Mod_src_undef)
{
// WaAdditionalMovWhenSrc1ModOnMulMach
G4_Declare *src1Dcl = src1->asSrcRegRegion()->getBase()->asRegVar()->getDeclare();
G4_Declare *tmpDcl = builder.createTempVar(
src1Dcl->getNumElems(),
src1Dcl->getElemType(),
src1Dcl->getAlign(),
src1Dcl->getSubRegAlign(),
"TV");
G4_DstRegRegion* tmpDst = builder.createDstRegRegion(
Direct,
tmpDcl->getRegVar(),
0,
0,
1,
src1->asSrcRegRegion()->getType());
RegionDesc * src1Desc = src1->asSrcRegRegion()->getRegion();
G4_INST *tmpMov = builder.createInternalInst(
NULL,
G4_mov,
NULL,
false,
(uint8_t) src1Desc->width,
tmpDst,
src1,
NULL,
NULL,
InstOpt_WriteEnable,
inst->getLineNo(),
inst->getCISAOff(),
inst->getSrcFilename());
bb->instList.insert(iter, tmpMov);
RegionDesc *tmpSrcDesc = NULL;
if(src1Desc->width == 1)
tmpSrcDesc = builder.getRegionScalar();
else
tmpSrcDesc = builder.createRegionDesc( src1Desc->vertStride, src1Desc->width, src1Desc->horzStride );
G4_SrcRegRegion *srcTmp = builder.createSrcRegRegion(Mod_src_undef, Direct, tmpDcl->getRegVar(), 0, 0, tmpSrcDesc, src1->asSrcRegRegion()->getType());
src1 = srcTmp;
inst->setSrc(src1, 1);
//Remove def instruction(s) from mulh
//add them to the tmpMov
//remove mulh from def instructions, add tmpMov to them
inst->transferDef(tmpMov, Opnd_src1, Opnd_src0);
tmpMov->addDefUse(inst, Opnd_src1);
}
G4_Type tmp_type = ( IS_UNSIGNED_INT(src0->getType()) && IS_UNSIGNED_INT(src1->getType()) ) ? Type_UD : Type_D;
assert(IS_DTYPE(src0->getType()) && "src0 must be DW type");
G4_DstRegRegion* acc_dst_opnd = builder.createDstRegRegion(
Direct,
builder.phyregpool.getAcc0Reg(),
0,
0,
1,
tmp_type);
G4_INST* newMul = builder.createInternalInst(nullptr, G4_mul, NULL, false, exec_size,
acc_dst_opnd, builder.duplicateOperand(src0), builder.duplicateOperand(src1), inst_opt,
inst->getLineNo(), inst->getCISAOff(), inst->getSrcFilename());
bb->instList.insert(iter, newMul);
inst->copyDefsTo(newMul, false);
newMul->addDefUse(inst, Opnd_implAccSrc);
iter = i;
iter--;
fixMulSrc1(iter, bb);
if( bb->isInSimdFlow() )
{
newMul->setOptions( ( inst_opt & ~InstOpt_Masks ) | InstOpt_WriteEnable );
}
inst->setOpcode( G4_mach );
if (src1->isImm() && src0->getType() != src1->getType()) {
G4_Imm *oldImm = src1->asImm();
// Ensure src1 has the same type as src0.
G4_Imm *newImm = builder.createImm(oldImm->getInt(), src0->getType());
inst->setSrc(newImm, 1);
}
//set implicit src/dst for mach
RegionDesc *rd = NULL;
unsigned short vs = 0, wd = exec_size, hs = 0;
if( exec_size > 1 ){
if( exec_size == 16 ){
wd = wd/2;
}
hs = 1;
vs = wd;
}
rd = builder.createRegionDesc( vs, wd, hs );
G4_SrcRegRegion *acc_src_opnd = builder.createSrcRegRegion(Mod_src_undef, Direct, builder.phyregpool.getAcc0Reg(), 0, 0, rd, tmp_type);
inst->setImplAccSrc( acc_src_opnd );
inst->setImplAccDst( builder.createDstRegRegion( *acc_dst_opnd ) );
INST_LIST_ITER end_iter = i;
// check if the ACC source is aligned to mach dst
G4_DstRegRegion *dst = inst->getDst();
if ((inst->getSaturate()) ||
(dst &&
((dst->getExecTypeSize() > G4_Type_Table[Type_D].byteSize) ||
(isPreAssignedRegOffsetNonZero<G4_DstRegRegion>(dst)))))
{
// add a tmp mov
inst->setDest( insertMovAfter( i, dst, dst->getType(), bb ) );
end_iter++;
}
inst->setOptionOn(InstOpt_AccWrCtrl);
if (exec_size == 16)
{
INST_LIST_ITER start_iter = i;
start_iter--;
splitDWMULInst( start_iter, end_iter, bb );
i = end_iter;
}
}
//
// insert move instructions to copy numDwords dwords from src to dst at the specified location
// a NoMask UD move is used.
// dst and src must be dword-aligned.
// srcOffset and dstOffset are in bytes
// numDwords must be one of {1,2,4,8,16}
// ToDo: may want to generalize this into a copyBytes function that selects the appropriate move type
// based on dst and src type
//
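// For example (illustrative), copying 8 dwords at offset 0 emits
//     mov (8) dst.0<1>:ud src.0<1;1,0>:ud {NoMask}
// with dst/src viewed through UD aliases when their declared type differs.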
void HWConformity::copyDwords(G4_Declare* dst,
int dstOffset,
G4_Declare* src,
int srcOffset,
int numDwords,
G4_BB* bb,
INST_LIST_ITER iter)
{
MUST_BE_TRUE(numDwords == 1 || numDwords == 2 || numDwords == 4 ||
numDwords == 8 || numDwords == 16, "invalid number of dwords to copy");
G4_Declare* newDst = dst;
if (dst->getElemType() != Type_UD)
{
// create an alias with type UD
newDst = builder.createTempVar(numDwords, Type_UD, Either, Any);
newDst->setAliasDeclare(dst, 0);
}
G4_Declare* newSrc = src;
if (src->getElemType() != Type_UD)
{
// create an alias with type UD
newSrc = builder.createTempVar(numDwords, Type_UD, Either, Any);
newSrc->setAliasDeclare(src, 0);
}
G4_SrcRegRegion* srcOpnd = builder.createSrcRegRegion(Mod_src_undef, Direct,
newSrc->getRegVar(), srcOffset / GENX_GRF_REG_SIZ,
(srcOffset % GENX_GRF_REG_SIZ) / G4_Type_Table[Type_UD].byteSize,
builder.getRegionStride1(), Type_UD);
G4_DstRegRegion* dstOpnd = builder.createDstRegRegion(Direct, newDst->getRegVar(),
dstOffset / GENX_GRF_REG_SIZ,
(dstOffset % GENX_GRF_REG_SIZ) / G4_Type_Table[Type_UD].byteSize, 1, Type_UD);
G4_INST* movInst = builder.createInternalInst(NULL, G4_mov, NULL, false, (uint8_t) numDwords, dstOpnd, srcOpnd,
NULL, InstOpt_WriteEnable);
INST_LIST_ITER movPos = bb->instList.insert(iter, movInst);
if (numDwords == 16 &&
((dstOffset % GENX_GRF_REG_SIZ) != 0 || (srcOffset % GENX_GRF_REG_SIZ) != 0))
{
// move crosses 2 GRF boundary, needs splitting
evenlySplitInst(movPos, bb);
}
}
// like the above, but source is an indirect 64-bit source and dst offset is always 0
// If source is Indirect 1x1, we generate
// mov (esize*2) tmp<1>:ud r[A0]<1;1,0>:ud
// ... tmpSrc<region>:q
// If source is VxH indirect, we have to generate instead
// mov (esize*2) tmp<1>:ud r[A0]<2,1>:ud
// ... tmpSrc<1;1,0>:q
// as we can't have the indirect region on the 64-bit type operand
// A0 is not changed otherwise
void HWConformity::copyDwordsIndirect(G4_Declare* dst,
G4_SrcRegRegion* src,
int numDwords,
G4_BB* bb,
INST_LIST_ITER iter)
{
MUST_BE_TRUE(G4_Type_Table[dst->getElemType()].byteSize >= 4 &&
G4_Type_Table[src->getType()].byteSize >= 4, "dst and src must have dword or qword type");
MUST_BE_TRUE(numDwords == 1 || numDwords == 2 || numDwords == 4 ||
numDwords == 8 || numDwords == 16, "invalid number of dwords to copy");
MUST_BE_TRUE(src->getRegAccess() == IndirGRF, "source must be indirect GRF");
G4_Declare* newDst = dst;
if (dst->getElemType() != Type_UD)
{
// create an alias with type UD
newDst = builder.createTempVar(numDwords, Type_UD, Either, Any);
newDst->setAliasDeclare(dst, 0);
}
G4_SrcRegRegion* newSrc = builder.duplicateOperand(src);
MUST_BE_TRUE(G4_Type_Table[newSrc->getType()].byteSize == 8, "only support 64-bit type source so far");
newSrc->setType(Type_UD);
if (newSrc->getRegion()->isRegionWH())
{
MUST_BE_TRUE(newSrc->getRegion()->width == 1, "only handle <1,0> region for now");
newSrc->setRegion(builder.createRegionDesc(UNDEFINED_SHORT, 2, 1));
}
else
{
newSrc->setRegion(builder.getRegionStride1());
}
G4_DstRegRegion* dstOpnd = builder.createDstRegRegion(Direct, newDst->getRegVar(), 0, 0, 1, Type_UD);
G4_INST* movInst = builder.createInternalInst(NULL, G4_mov, NULL, false, (uint8_t)numDwords, dstOpnd, newSrc,
NULL, InstOpt_WriteEnable);
bb->instList.insert(iter, movInst);
}
// copy numRegs GRFs from src[srcOffset] to dst[dstOffset]
// dst[dstOffset] and src[srcOffset] are both GRF-aligned
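// e.g. (illustrative) numRegs == 3 is lowered to one SIMD16 UD copy for
// the first two GRFs followed by one SIMD8 UD copy for the remaining GRF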
void HWConformity::copyRegs(G4_Declare* dst,
int dstOffset,
G4_Declare* src,
int srcOffset,
int numRegs,
G4_BB* bb,
INST_LIST_ITER iter)
{
int numByteCopied = 0;
for (; numRegs >= 2; numRegs -= 2, numByteCopied += 64)
{
copyDwords(dst, dstOffset + numByteCopied, src, srcOffset + numByteCopied, 16, bb, iter);
}
if (numRegs != 0)
{
copyDwords(dst, dstOffset + numByteCopied, src, srcOffset + numByteCopied, 8, bb, iter);
}
}
void HWConformity::fix64bInst( INST_LIST_ITER iter, G4_BB* bb )
{
// HW restrictions:
// [DevCHV, DevBXT]: When source or destination datatype is 64b, indirect addressing must not be used.
// the region rules are:
// Source and Destination horizontal stride must be aligned to the execution datatype.
// Example:
// mov (4) r10.0:df r11.0<16;8,2>:f // Source stride must be 2 since datatype is smaller
// mov (4) r10.0<2>:f r11.0<4;4,1>:df // Destination stride must be 2 since datatype is smaller.
// as this would require splitting in some cases
// Regioning must ensure Src.Vstride = Src.Width * Src.Hstride
// Source and Destination offset must be the same, except the case of scalar source
// [DevCHV, DevBXT]: ARF registers must never be used with 64b datatype.
if (!builder.no64bitRegioning())
{
return;
}
G4_INST* inst = *iter;
bool uses64BitType = false;
bool isDWMultiply = false;
uint8_t execSize = inst->getExecSize();
if (inst->isSend())
{
return;
}
if (inst->getDst() != NULL && G4_Type_Table[inst->getDst()->getType()].byteSize == 8)
{
uses64BitType = true;
}
for (int i = 0; !uses64BitType && i < G4_Inst_Table[inst->opcode()].n_srcs; i++)
{
G4_Operand* src = inst->getSrc(i);
if (src != NULL && G4_Type_Table[src->getType()].byteSize == 8)
{
uses64BitType = true;
}
}
if (inst->opcode() == G4_mul && IS_DTYPE(inst->getSrc(0)->getType()) &&
IS_DTYPE(inst->getSrc(1)->getType()))
{
//WA: dw*dw multiply is considered to use 64bit data type since the result is 64-bit
uses64BitType = true;
isDWMultiply = true;
}
if (uses64BitType)
{
#if 0
//#ifdef DEBUG_VERBOSE_ON
std::cout << "CHV 64b fix for:\n";
inst->emit(std::cout);
std::cout << "\n";
#endif
int numSrc = G4_Inst_Table[inst->opcode()].n_srcs;
// handle indirect sources first
for (int i = 0; i < numSrc; ++i)
{
G4_Operand* src = inst->getSrc(i);
if (src != nullptr && src->isSrcRegRegion() && src->asSrcRegRegion()->getRegAccess() == IndirGRF)
{
G4_SrcRegRegion* srcAsRegion = src->asSrcRegRegion();
RegionDesc* region = srcAsRegion->getRegion();
int byteSize = G4_Type_Table[srcAsRegion->getType()].byteSize;
if (byteSize == 8)
{
// right bound is not available for indirect operands
// FIXME: this code should be moved to getRightBound()
int rightBound = 0;
// we must change move type to UD
if (region->isScalar())
{
rightBound = byteSize;
}
else if (region->isRegionWH())
{
rightBound = inst->getExecSize() * byteSize;
}
else
{
int num_rows = inst->getExecSize() / region->width;
rightBound = (num_rows - 1) * region->vertStride * byteSize +
region->horzStride * (region->width - 1) * byteSize +
byteSize;
}
int numDwords = rightBound / G4_Type_Table[Type_UD].byteSize;
numDwords = Round_Up_Pow2(numDwords);
G4_Declare* tmpSrc = builder.createTempVar(numDwords / 2, src->getType(), Either, Sixteen_Word);
// new source's region varies depending on whether it's VxH or 1x1
RegionDesc* newRegion = region->isRegionWH() ? builder.getRegionStride1() : region;
copyDwordsIndirect(tmpSrc, srcAsRegion, numDwords, bb, iter);
G4_SrcRegRegion* tmpSrcOpnd = builder.createSrcRegRegion(srcAsRegion->getModifier(),
Direct, tmpSrc->getRegVar(), 0, 0, newRegion, tmpSrc->getElemType());
inst->setSrc(tmpSrcOpnd, i);
}
else
{
// use the good ol' insertMovBefore
G4_Operand* tmpSrc = insertMovBefore(iter, i, src->getType(), bb);
G4_Declare* tmpSrcDcl = tmpSrc->getTopDcl();
tmpSrcDcl->setSubRegAlign(Sixteen_Word);
inst->setSrc(tmpSrc, i);
}
}
}
// now handle direct sources with bad region/alignment
bool hasSameOffset = hasSameSubregOffset(inst);
for (int i = 0; i < numSrc; i++)
{
G4_Operand* src = inst->getSrc(i);
if (src != NULL && src->isSrcRegRegion())
{
G4_SrcRegRegion* srcAsRegion = src->asSrcRegRegion();
RegionDesc* region = srcAsRegion->getRegion();
int byteSize = G4_Type_Table[srcAsRegion->getType()].byteSize;
if (!isDWMultiply && !region->isScalar() &&
(byteSize != 8 && (byteSize * region->horzStride) < 8))
{
// source is not 8 byte aligned
// this can happen e.g. for
// mov (8) r1.0<1>:df (mod)r3<8;8,1>:f
// which we'd need to change to
// mov (8) r10.0<2>:f (mod)r3.0<8;8,1>:f
// mov (8) r1.0<1>:df r10.0<8;4,2>:f
// to satisfy rule 1
uint8_t exSize = inst->getExecSize();
uint16_t multFactor = (uint16_t)(8 / byteSize);
G4_Type tmpType = srcAsRegion->getType();
if (multFactor == 8)
{
// byte type needs special handling since we can't have stride 8
tmpType = (tmpType == Type_B) ? Type_W : Type_UW;
multFactor = 4;
}
MUST_BE_TRUE(multFactor != 8, "does not support 64b operation with byte source");
G4_Declare* tmp = builder.createTempVar(exSize * multFactor,
tmpType, Either, Sixteen_Word);
G4_DstRegRegion* tmpDst = builder.Create_Dst_Opnd_From_Dcl(tmp, multFactor);
G4_INST* movInst = builder.createInternalInst(NULL, G4_mov, NULL, false,
inst->getExecSize(), tmpDst, src, NULL, inst->getOption());
bb->instList.insert(iter, movInst);
uint16_t width = exSize;
if (width * 8 > GENX_GRF_REG_SIZ)
{
// can't have width cross GRF
width = 4;
}
G4_SrcRegRegion* newSrc = builder.Create_Src_Opnd_From_Dcl(tmp,
builder.createRegionDesc((uint16_t)multFactor * width, width, multFactor));
inst->setSrc(newSrc, i);
}
else if (region->isScalar())
{
#if 0
// scalar region still must be aligned to qword, though it can be any qword
if (byteSize < 8 && !builder.isOpndAligned(srcAsRegion, 8))
{
G4_Operand* tmpSrc = insertCopyBefore(iter, i, Four_Word, bb);
inst->setSrc(tmpSrc, i);
}
#endif
}
else if (!hasSameOffset)
{
// we need a temp src that is GRF-aligned
if (byteSize == 8)
{
// the same src/dst offset restriction applies to move as well, so we have to generate
// a packed move with UD type to work around the restriction
// e.g., for
// add (2) ... r1.1<4;2,2>:q
// we turn it into
// mov (8) r10.0<1>:ud r1.2<1;1,0>:ud {NoMask}
// add (2) ... r10.0<4;2,2>:q
int numDwords = (src->getRightBound() - src->getLeftBound() + 1) / G4_Type_Table[Type_UD].byteSize;
numDwords = Round_Up_Pow2(numDwords);
G4_Declare* tmpSrc = builder.createTempVar(numDwords / 2, src->getType(), Either, Sixteen_Word);
copyDwords(tmpSrc, 0, src->getTopDcl(), src->getLeftBound(), numDwords, bb, iter);
G4_SrcRegRegion* tmpSrcOpnd = builder.createSrcRegRegion(srcAsRegion->getModifier(),
Direct, tmpSrc->getRegVar(), 0, 0, srcAsRegion->getRegion(), tmpSrc->getElemType());
inst->setSrc(tmpSrcOpnd, i);
}
else
{
// use the good ol' insertMovBefore
G4_Operand* tmpSrc = insertMovBefore(iter, i, src->getType(), bb);
G4_Declare* tmpSrcDcl = tmpSrc->getTopDcl();
tmpSrcDcl->setSubRegAlign(Sixteen_Word);
inst->setSrc(tmpSrc, i);
}
}
}
}
for (int i = 0; i < numSrc; i++)
{
// rewrite <1;1,0> to <2;2,1> since HW does not like the former
G4_Operand* src = inst->getSrc(i);
if (src != nullptr && src->isSrcRegRegion())
{
G4_SrcRegRegion* srcAsRegion = src->asSrcRegRegion();
RegionDesc* region = srcAsRegion->getRegion();
if (!region->isRegionWH() && region->vertStride != region->horzStride * region->width)
{
// see if we can fix the region to satisfy VS = W * HS
if (region->width == inst->getExecSize())
{
// vs is a don't care, change to <w*hs, w, hz>
srcAsRegion->setRegion(builder.createRegionDesc(region->width * region->horzStride, region->width, region->horzStride));
}
else if (region->width == 1)
{
// hs is a don't care, change it to <esize*vs, esize, vs>
MUST_BE_TRUE(region->vertStride <= 4, "illegal vertical stride");
uint16_t wd = inst->getExecSize();
uint16_t hs = region->vertStride;
if (src->crossGRF())
{
// Make sure the new hs does not cross GRF
uint32_t nbytesIn1stGRF = GENX_GRF_REG_SIZ - (src->getLeftBound() % GENX_GRF_REG_SIZ);
uint32_t eltBytes = G4_Type_Table[srcAsRegion->getType()].byteSize;
uint32_t neltsIn1stGRF = nbytesIn1stGRF / eltBytes;
MUST_BE_TRUE((nbytesIn1stGRF % eltBytes) == 0, "Bad region with element crossing GRF");
MUST_BE_TRUE((neltsIn1stGRF % hs) == 0, "hs cannot cross GRF");
wd = neltsIn1stGRF / hs;
// Get the largest powOfTwo that can divide wd
wd = wd & (-wd);
//MUST_BE_TRUE( wd > 1, "Cannot select non-1 width w/o crossing GRF");
}
srcAsRegion->setRegion(builder.createRegionDesc(wd * hs, wd, hs));
}
else
{
// FIXME: Both VS and HS are used by the region, so we have to either split inst or insert multiple moves to pack the source
// both are painful, so we assert for now and fix later if we encounter such a case
MUST_BE_TRUE(false, "Unhandled bad 64b region on CHV/BXT");
}
}
}
}
G4_DstRegRegion* dst = inst->getDst();
if (dst != NULL && !dst->isNullReg())
{
bool needsTmpDst = dst->getRegAccess() != Direct ||
(execSize > 1 && !hasSameOffset) ||
dst->isAreg();
if (needsTmpDst)
{
// we need to have a temp dst that is direct and GRF-aligned
if (dst->getRegAccess() == Direct && G4_Type_Table[dst->getType()].byteSize == 8)
{
// the same src/dst offset restriction applies to move as well, so we have to generate
// a move with UD type to work around the restriction
// e.g., for
// add (2) r1.2<1>:q ...
// we generate
// add (2) r3.0<1>:q ...
// mov (4) r1.4<1>:ud r3.0<1;1,0>:ud {NoMask}
// If dst is not contiguous, we additionally add a move to pre-load the old values:
// add (2) r1.2<2>:q ...
// becomes
// mov (8) r3.0<1>:ud r1.4<1;1,0>:ud {NoMask}
// add (2) r3.0<2>:q ...
// mov (8) r1.4<1>:ud r3.0<1;1,0>:ud {NoMask}
int numDwords = (dst->getRightBound() - dst->getLeftBound() + 1) / G4_Type_Table[Type_UD].byteSize;
numDwords = Round_Up_Pow2(numDwords);
G4_Declare* tmpDst = builder.createTempVar(numDwords / 2, dst->getType(), Either, Sixteen_Word);
if (numDwords > execSize * 2)
{
// dst is not packed, need a move to pre-load the dst value into tmp
copyDwords(tmpDst, 0, dst->getTopDcl(), dst->getLeftBound(), numDwords, bb, iter);
}
INST_LIST_ITER next = iter;
++next;
copyDwords(dst->getTopDcl(), dst->getLeftBound(), tmpDst, 0, numDwords, bb, next);
inst->setDest(builder.Create_Dst_Opnd_From_Dcl(tmpDst, dst->getHorzStride()));
}
else
{
// use the good ol' insertMoveAfter
G4_DstRegRegion* tmpDst = insertMovAfter(iter, dst, dst->getType(), bb);
G4_Declare* tmpDstDcl = tmpDst->getTopDcl();
tmpDstDcl->setSubRegAlign(Sixteen_Word);
inst->setDest(tmpDst);
if (G4_Type_Table[dst->getType()].byteSize == 8)
{
// tmpDst is indirect and thus still does not conform
// we rewrite
// mov (e) r[a0.0]<1>:q src<1;1,0>:q
// into
// mov (e*2) r[a0.0]<1>:ud src<1;1,0>:ud {NoMask}
INST_LIST_ITER movIter = iter;
++iter;
G4_INST* movInst = *iter;
MUST_BE_TRUE(movInst->opcode() == G4_mov && movInst->getDst() == dst &&
movInst->getSrc(0)->isSrcRegRegion(),
"unexpected instruction created by insertMovAfter");
MUST_BE_TRUE(dst->getHorzStride() == 1, "only stride 1 is supported for now");
dst->setType(Type_UD);
G4_SrcRegRegion* src = movInst->getSrc(0)->asSrcRegRegion();
G4_Declare* tmpAsUD = builder.createTempVar(tmpDstDcl->getNumElems() * 2, Type_UD, Either, Any);
tmpAsUD->setAliasDeclare(tmpDstDcl, 0);
RegionDesc* newRegion = src->getRegion()->isScalar() ?
builder.createRegionDesc(0, 2, 1) : builder.getRegionStride1();
G4_SrcRegRegion* srcAsUD = builder.createSrcRegRegion(src->getModifier(),
src->getRegAccess(), tmpAsUD->getRegVar(), src->getRegOff(),
src->getSubRegOff() * 2, newRegion, tmpAsUD->getElemType());
movInst->setSrc(srcAsUD, 0);
movInst->setExecSize(inst->getExecSize() * 2);
// mov saturate/pred to the original inst
movInst->setOptionOn(InstOpt_WriteEnable);
if (movInst->getSaturate())
{
movInst->setSaturate(false);
inst->setSaturate(true);
}
G4_Predicate* pred = movInst->getPredicate();
if (pred)
{
MUST_BE_TRUE(inst->getPredicate() == nullptr, "both inst and movInst have predicates");
movInst->setPredicate(nullptr);
inst->setPredicate(pred);
}
}
}
}
}
}
}
//------------------------------------------------------------------------------
//
// For BDW, 32 bits integer multiply is implemented as the following macro
//
// mul (8) acc0:d r2.0<8;8,1>d r3.0<16;8,2>:uw
// mach (8) rTemp<1>:d r2.0<8;8,1>d r3.0<8;8,1>:d
// mov (8) r5.0<1>:d rTemp:d // hi-32bits
// mov (8) r6.0<1>:d acc0:d // lo-32bits
//
// Note that this only changes the mul instruction's src1, mach and mov is generated elsewhere
//------------------------------------------------------------------------------
void HWConformity::fixMulSrc1( INST_LIST_ITER i, G4_BB* bb )
{
G4_INST *inst = *i;
G4_Operand *src1 = inst->getSrc(1);
if (!IS_DTYPE(src1->getType()))
{
// this could happen if dst is Q
return;
}
if (src1->isImm())
{
uint64_t truncVal = src1->asImm()->getImm() & 0xFFFF;
G4_Imm *new_src1 = builder.createImm(truncVal, Type_UW);
inst->setSrc(new_src1, 1);
return;
}
assert(src1->isSrcRegRegion() && "region expected");
G4_SrcRegRegion *srcRegion = src1->asSrcRegRegion();
RegionDesc *rd = srcRegion->getRegion();
if (rd->horzStride >= 4)
{
G4_Operand* new_src1 = insertMovBefore(i, 1, Type_UW, bb);
inst->setSrc(new_src1, 1);
}
else
{
// create a new opnd with type UW
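// e.g. (illustrative) r3.0<8;8,1>:d becomes r3.0<16;8,2>:uw; strides and,
// for direct operands, the subregister offset are scaled by 2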
unsigned short scale = G4_Type_Table[Type_D].byteSize / G4_Type_Table[Type_UW].byteSize;
unsigned short newHS = rd->horzStride * scale;
unsigned short newVS = rd->vertStride * scale;
RegionDesc *new_rd = builder.createRegionDesc(newVS, rd->width, newHS);
short subRegOff = srcRegion->getSubRegOff();
if (srcRegion->getRegAccess() == Direct)
{
subRegOff *= scale;
}
auto new_src1 = builder.createSrcRegRegion(
srcRegion->getModifier(), srcRegion->getRegAccess(),
srcRegion->getBase(), srcRegion->getRegOff(), subRegOff, new_rd,
Type_UW);
inst->setSrc(new_src1, 1);
if (srcRegion->getRegAccess() != Direct)
{
new_src1->setImmAddrOff(srcRegion->getAddrImm());
}
}
}
/*
* only acc0 may be used in DWord operations, so we have to break a
* SIMD16 DWord multiply into two mul-mach-mov sequences.
*
* Input:
* (f0) mul (16) dst:d src0:d src1:d
*
* Output:
* mul (8) acc0:d src0:d src1:d
* mach (8) null:d src0:d src1:d
* (f0) mov (8) dst:d acc0:d
* mul (8) acc0:d src0+1:d src1+1:d
* mach (8) null:d src0+1:d src1+1:d
* (f1) mov (8) dst+1:d acc0:d
*
*/
void HWConformity::splitDWMULInst( INST_LIST_ITER &start, INST_LIST_ITER &end, G4_BB *bb )
{
// split simd16 inst into SIMD8 ones, since D is not supported for acc1
INST_LIST_ITER iter = start, last_iter = end;
//iter--;
last_iter++;
INST_LIST_ITER curr_iter;
while( iter != end )
{
curr_iter = iter;
evenlySplitInst( curr_iter, bb );
// curr_iter points to the second half after instruction splitting
G4_INST *expand_sec_half_op = *curr_iter;
iter++;
bb->instList.insert( last_iter, expand_sec_half_op );
if( curr_iter == start )
{
start--;
}
bb->instList.erase( curr_iter );
}
// handle the last inst
if( iter == end )
{
evenlySplitInst( iter, bb );
G4_INST *expand_sec_half_op = *iter;
bb->instList.insert( last_iter, expand_sec_half_op );
end--;
bb->instList.erase( iter );
}
}
static bool isGoodMadType(G4_Type type)
{
switch (type)
{
case Type_F:
case Type_HF:
return true;
case Type_DF:
return true;
default:
return false;
}
}
bool HWConformity::isGoodAlign1TernaryDst(G4_INST* inst) const
{
// Align1 MAD requirements:
// -- dst must be direct GRF/acc with horizontal stride 1 or 2
G4_Type execType = inst->getExecType();
G4_DstRegRegion* dst = inst->getDst();
MUST_BE_TRUE(!IS_QTYPE(dst->getType()) && !IS_BTYPE(dst->getType()), "3Src insts don't support Q and B dst types");
if (!builder.hasMixMode() &&
(dst->getType() == Type_HF && execType != Type_HF))
{
return false;
}
int alignInBytes = 8;
// if src2 is not a scalar, then align it to 32 bytes.
if (builder.noSrc2Regioning())
{
unsigned src2Pos = inst->opcode() == G4_pseudo_mad ? 0 : 2;
auto src2 = inst->getSrc(src2Pos);
if (src2->isSrcRegRegion() && !src2->asSrcRegRegion()->isScalar())
{
alignInBytes = 32;
}
}
if (!builder.isOpndAligned(dst, alignInBytes))
{
// dst must be 8 byte aligned due to encoding issues
return false;
}
uint32_t effectiveStride = dst->getHorzStride();
if (G4_Type_Table[dst->getType()].byteSize < G4_Type_Table[execType].byteSize)
{
if (IS_TYPE_INT(dst->getType()))
{
effectiveStride *= G4_Type_Table[execType].byteSize / G4_Type_Table[dst->getType()].byteSize;
}
else
{
// we have mixed HF and F inst
// dst can be packed HF, but then it must be oword aligned
// this should be checked later for mixed mode inst
}
}
return dst->getRegAccess() == Direct && effectiveStride <= 2;
}
//
// check for legal align1 ternary inst sources
//
bool HWConformity::isGoodAlign1TernarySrc(G4_INST* inst, int srcPos, bool canBeImm)
{
MUST_BE_TRUE(srcPos >= 0 && srcPos < 3, "illegal source pos");
uint8_t execSize = inst->getExecSize();
G4_Operand* src = inst->getSrc(srcPos);
// for pseudo_mad we have to swap src0 and src2
bool isSrc2 = inst->opcode() == G4_pseudo_mad ? srcPos == 0 : srcPos == 2;
if (!builder.hasMixMode())
{
G4_Type execType = inst->getExecType();
if (src->getType() == Type_HF && execType != Type_HF)
{
return false;
}
}
if (IS_QTYPE(src->getType()))
{
return false;
}
if (inst->opcode() == G4_pseudo_mad && isSrc2)
{
if (IS_DTYPE(src->getType()))
{
return false;
}
if (builder.noSrc2Regioning() && IS_BTYPE(src->getType()))
{
return false;
}
}
if (src->isImm())
{
// either src0 or src2 can be 16b imm, but not both
// permanent WA: simd16 inst can't have src0 imm.
// Instead of splitting, we just add a move
if (canBeImm && (srcPos == 0 || srcPos == 2) && G4_Type_Table[src->getType()].byteSize <= 2)
{
if (VISA_WA_CHECK(builder.getPWaTable(), WaNoSimd16TernarySrc0Imm))
{
return !isSrc2 && inst->getExecSize() == 16 ? false : true;
}
return true;
}
return false;
}
else if (src->isSrcRegRegion())
{
if (src->asSrcRegRegion()->getRegAccess() != Direct)
{
return false;
}
auto checkSingleStrideRegion = [](G4_SrcRegRegion* src, int stride, IR_Builder& builder)
{
if (stride > 4)
{
return false;
}
else if ((src->getLeftBound() % GENX_GRF_REG_SIZ != 0) &&
(src->getRightBound() - src->getLeftBound() >= GENX_GRF_REG_SIZ - 1))
{
// we have to make sure width is not being used to cross GRF, as <1;1,0>
// is not a legal region for align1 ternary source (vs 1 not supported)
int minAlignment = G4_Type_Table[src->getType()].byteSize * stride * 2;
return builder.isOpndAligned(src, minAlignment);
}
return true;
};
// the following regions are supported:
// <N;N,0>
// <0;1,0>
// <W*H;W,H>
RegionDesc* srcRegion = src->asSrcRegRegion()->getRegion();
if (srcRegion->isScalar())
{
return true;
}
// src0 and src1 (for pseudo-mad, it's src1 and src2) may use the <N;N,0> region
// as they come with a vStride in encoding
// TODO: we may consider swapping src1 and src2 to catch more regions
if (!isSrc2)
{
uint16_t stride = 0;
if (srcRegion->isSingleStride(execSize, stride))
{
return checkSingleStrideRegion(src->asSrcRegRegion(), stride, builder);
}
// <4;4,0> and <8;8,0> are ok
return srcRegion->vertStride == srcRegion->width &&
srcRegion->horzStride == 0 && srcRegion->width < 8 && srcRegion->width != 2;
}
else
{
if (!builder.noSrc2Regioning())
{
// src2 (src0 for pseudo-mad) is without vstride, and its region must be
// <esize*H;esize,H>, with vstride derived from exSize and hstride
uint16_t stride = 0;
if (srcRegion->isSingleStride(execSize, stride))
{
return checkSingleStrideRegion(src->asSrcRegRegion(), stride, builder);
}
}
else
{
// not a scalar, src2 must be GRF aligned.
if (!builder.isOpndAligned(src, G4_GRF_REG_NBYTES))
{
return false;
}
uint16_t stride = 0;
if (srcRegion->isSingleStride(execSize, stride))
{
unsigned short dstExecSize = inst->getDst()->getExecTypeSize();
unsigned short srcExecSize = stride * src->asSrcRegRegion()->getElemSize();
// Source 2 and destination stride must be aligned to the same execution type.
// E.g. mad (4) r10.0<1>:hf src0 src1 r13.0<1>:hf
// mad (4) r10.0<2>:hf src0 src1 r13.0<1>:f
// mad (4) r10.0<1>:f src0 src1 r13.0<2>:hf
// this rule is relaxed if mix mode is enabled (packed HF ok)
if (builder.hasPartialMixMode() || dstExecSize == srcExecSize)
{
return true;
}
}
}
return false;
}
}
return true;
}
//
// a source is good for align16 if:
// -- it is a direct srcRegRegion
// -- it has contiguous region and can be made either GRF-aligned (for exec size >= 8)
// or oword aligned (for exec size == 4)
// -- or it has scalar region and is not non-simd1 double
bool HWConformity::isGoodAlign16Src(G4_INST* inst, int srcPos)
{
MUST_BE_TRUE(srcPos >= 0 && srcPos < 3, "illegal source pos");
uint8_t execSize = inst->getExecSize();
G4_Operand* src = inst->getSrc(srcPos);
G4_Type opnd_type = src->getType();
// Constants are not allowed as MAD opnds.
if (src->isSrcRegRegion())
{
RegionDesc* region = src->asSrcRegRegion()->getRegion();
G4_RegAccess regAcc = src->asSrcRegRegion()->getRegAccess();
if (regAcc != Direct)
{
return false;
}
if (region->isContiguous(execSize))
{
if (getGenxPlatform() == GENX_BDW && getTypeSize(opnd_type) < 4)
{
// BDW HF has to be 32-byte aligned
if (!builder.isOpndAligned(src, 32))
{
return false;
}
}
else
{
if (execSize >= 8)
{
// operand must be GRF aligned, or oword aligned for HF/W
uint32_t align = std::min<uint32_t>(execSize * getTypeSize(src->getType()), 32);
if (!builder.isOpndAligned(src, align))
{
return false;
}
}
else if (execSize == 4 || execSize == 2)
{
// operand must be oword-aligned
if (!builder.isOpndAligned(src, 16))
{
return false;
}
}
}
}
else if (src->asSrcRegRegion()->isScalar())
{
if (opnd_type == Type_DF && execSize != 1)
{
// scalar region is illegal for DF since replicate is not supported
return false;
}
if (opnd_type == Type_HF && getGenxPlatform() == GENX_BDW) {
return false;
}
}
else
{
// all other regions are illegal
return false;
}
return true;
}
else
{
return false;
}
}
//
// Move modifiers of src2 in pseudo_mad to its defining instruction.
//
// mul (16) V66(0,0)<1>:d V46(23,0)<16;16,1>:w 0x39db:w {Align1, H1}
// pseudo_mad (16) V67(0,0)<1>:d V46(9,0)<8;8,1>:w 0x1b5d:w -V66(0,0)<16;16,1>:d
//
// becomes
//
// mul (16) V66(0,0)<1>:d -V46(23,0)<16;16,1>:w 0x39db:w {Align1, H1}
// pseudo_mad (16) V67(0,0)<1>:d V46(9,0)<8;8,1>:w 0x1b5d:w V66(0,0)<16;16,1>:d
//
static void tryTransferSrcModifier(IR_Builder &builder, G4_INST *def,
G4_Operand *src)
{
// Only when def has no other users.
if (!def->hasOneUse())
return;
// Only transfer for integer types.
if (!IS_SIGNED_INT(src->getType()))
return;
// In case the use type is different from the def type.
if (!def->getDst() || (def->getDst()->getType() != src->getType()))
return;
switch (def->opcode()) {
default:
break;
// Probably this is the only interesting op, since G4_math will not be
// used to generate mac.
case G4_mul:
{
// Chances are src1 is an immediate.
G4_Operand *defSrc1 = def->getSrc(1);
if (!IS_SIGNED_INT(defSrc1->getType()))
return;
if (defSrc1->isImm())
{
G4_Imm *val = defSrc1->asImm();
// Mod_Minus is assumed.
G4_Imm *newVal = builder.createImm(-val->getInt(), val->getType());
def->setSrc(newVal, 1);
src->asSrcRegRegion()->setModifier(Mod_src_undef);
}
else if (defSrc1->isSrcRegRegion())
{
G4_SrcRegRegion *reg = defSrc1->asSrcRegRegion();
if (reg->getModifier() == Mod_src_undef)
{
reg->setModifier(src->asSrcRegRegion()->getModifier());
src->asSrcRegRegion()->setModifier(Mod_src_undef);
}
else if (reg->getModifier() == Mod_Minus)
{
reg->setModifier(Mod_src_undef);
src->asSrcRegRegion()->setModifier(Mod_src_undef);
}
}
} break;
}
}
// Try to move source modifiers on MAD's src2 into its defintion. This allows
// pseudo_mad ops to be translated into mac ops.
void HWConformity::tryEliminateMadSrcModifier(IR_Builder &builder, G4_INST *inst)
{
ASSERT_USER(inst->opcode() == G4_pseudo_mad, "not a speudo-mad");
// For pseudo_mad, src2 is the major source operand to be examined later.
// If there is no modifier on src2, then nothing to do.
G4_Operand *src2 = inst->getSrc(2);
if (!src2->isSrcRegRegion())
return;
// Currently, only handle modifier minus. To handle others, we may need
// to insert extra instructions.
if (src2->asSrcRegRegion()->getModifier() != Mod_Minus)
return;
// Only when src2 has a single definition.
if (G4_INST *def = inst->getSingleDef(Opnd_src2, true))
{
tryTransferSrcModifier(builder, def, src2);
}
}
/// Heuristic to decide whether this fp pseudo-mad should be lowered into a
/// GEN mad or not. Returns true if mad is preferred, false otherwise.
///
/// We favor generating non-mad when this vISA mad is part of b2b mads that
/// share the same dst.
///
bool HWConformity::isFpMadPreferred(G4_BB *bb, INST_LIST_ITER iter)
{
G4_INST *inst = *iter;
G4_Operand *dst = inst->getDst();
MUST_BE_TRUE(inst->opcode() == G4_pseudo_mad, "expect pseudo mad");
// Check whether test_inst is sharing the same dst.
auto equal_mad_dst = [](G4_INST *test_inst, G4_Operand *dst)
{
if (test_inst->opcode() == G4_pseudo_mad)
{
G4_Operand *test_dst = test_inst->getDst();
if (test_dst->compareOperand(dst) == Rel_eq)
return true;
}
return false;
};
auto next_iter = std::next(iter);
if (next_iter != bb->instList.end())
{
G4_INST *next_inst = *next_iter;
if (equal_mad_dst(next_inst, dst))
return false;
}
if (iter != bb->instList.begin())
{
auto prev_iter = std::prev(iter);
G4_INST *prev_inst = *prev_iter;
if (equal_mad_dst(prev_inst, dst))
return false;
}
// FIXME: remove possible duplicate calls to isGoodAlign16Src, Cm only.
// This will go away if we use an extra opcode to represent muladd.
unsigned extraMov = 0;
for (int k = 0; k < inst->getNumSrc(); k++)
{
if (!isGoodAlign16Src(inst, k))
{
// If need to insert >1 number of moves, then do not use mad.
if (++extraMov > 1)
return false;
}
}
return true;
}
// generate align1 mad, inserting moves if necessary
// returns true if conversion is successful
// for floating point mad this must succeed due to precision requirements
bool HWConformity::generateAlign1Mad(G4_BB* bb, INST_LIST_ITER iter)
{
G4_INST* inst = *iter;
MUST_BE_TRUE(inst->opcode() == G4_pseudo_mad, "expect pseudo mad");
bool mustDoMad = IS_TYPE_FLOAT_ALL(inst->getDst()->getType());
if (!isGoodAlign1TernaryDst(inst))
{
if (mustDoMad)
{
auto alignment = builder.noSrc2Regioning() ? Sixteen_Word : Four_Word;
inst->setDest(insertMovAfter(iter, inst->getDst(), inst->getDst()->getType(), bb, alignment));
}
else
{
return false;
}
}
// try swapping src0 and src1 if src0 is D, as MAD only supports D + D * W,
// and pseudo-mad src0 becomes mad src2
{
G4_Operand* src0 = inst->getSrc(0);
G4_Operand* src1 = inst->getSrc(1);
if (IS_DTYPE(src0->getType()) && src0->isSrcRegRegion())
{
if (!IS_DTYPE(src1->getType()))
{
inst->setSrc(src1, 0);
inst->setSrc(src0, 1);
}
}
else if (src1->isImm() && getTypeSize(src1->getType()) == 2)
{
//swap src0 and src1 as src0 supports imm
inst->setSrc(src1, 0);
inst->setSrc(src0, 1);
} else if (builder.noSrc2Regioning() &&
src0->isSrcRegRegion() && src1->isSrcRegRegion() &&
!src0->asSrcRegRegion()->isScalar() &&
src1->asSrcRegRegion()->isScalar()) {
// Swap src0 and src1 if src1 is scalar but src0 is not, since src2
// regioning support is quite limited.
inst->setSrc(src1, 0);
inst->setSrc(src0, 1);
}
}
// check src
bool canBeImm = true;
for (int k = 0; k < inst->getNumSrc(); k++)
{
G4_Operand* src = inst->getSrc(k);
if (!isGoodAlign1TernarySrc(inst, k, canBeImm))
{
if (mustDoMad)
{
bool isSrc2 = (k == 0);
if (builder.noSrc2Regioning() && isSrc2)
{
// Promote src2's type to f if necessary.
//
// mad (4) r10.0<1>:f src0 src1 r12.0<1>:hf --> f
// mad (4) r10.0<2>:hf src0 src1 r12.0<1>:hf --> f
// mad (4) r10.0<1>:hf src0 src1 r12.0<2>:hf --> hf
// mad (4) r10.0<2>:hf src0 src1 r12.1<2>:hf --> f
G4_Type srcTy = src->getType();
unsigned short dstEltSz = inst->getDst()->getExecTypeSize();
if (dstEltSz >= 4 && IS_HFTYPE(src->getType()))
{
srcTy = Type_F;
}
inst->setSrc(insertMovBefore(iter, k, srcTy, bb, Sixteen_Word), k);
// Check if dst stride aligns with src2.
if (dstEltSz != G4_Type_Table[srcTy].byteSize)
{
inst->setDest(insertMovAfter(iter, inst->getDst(), inst->getDst()->getType(), bb, Sixteen_Word));
}
}
else
{
inst->setSrc(insertMovBefore(iter, k, src->getType(), bb), k);
}
}
else
{
return false;
}
}
else
{
if (src->isImm())
{
canBeImm = false;
}
}
}
inst->setOpcode(G4_mad);
//swap src0 and src2 (vISA MAD is src0*src1+src2, while GEN MAD is src1*src2+src0)
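// e.g. (illustrative operands):
//   pseudo_mad (8) V1 A B C   // V1 = A*B + C
// becomes
//   mad        (8) V1 C B A   // V1 = B*A + C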
G4_Operand* src0 = inst->getSrc(0);
G4_Operand* src2 = inst->getSrc(2);
inst->setSrc(src2, 0);
inst->setSrc(src0, 2);
return true;
}
// convert a FP (HF/F/DF) pseudo-mad into a GEN mad,
// inserting moves if necessary
// returns true if conversion is successful
// note that this must return true for IGC due to precision requirements
bool HWConformity::generateFPMad(G4_BB* bb, INST_LIST_ITER iter)
{
G4_INST* inst = *iter;
MUST_BE_TRUE(inst->opcode() == G4_pseudo_mad, "expect pseudo mad");
uint8_t execSize = inst->getExecSize();
G4_DstRegRegion *dst = inst->getDst();
MUST_BE_TRUE(dst->getType() == Type_HF || dst->getType() == Type_F ||
dst->getType() == Type_DF, "inst must have FP type");
// Align16 MAD requirements:
// -- dst and all 3 srcs have the same F/HF/DF type (mixed F/HF is allowed on CHV+)
// -- dst and all 3 srcs have direct access
// -- execution size is 16/8/4/1
// -- dst and src must be packed
// -- if src region is not scalar, its subregister must be 16 byte aligned
// do not force fma for CM since it doesn't have precision requirements
bool preferFpMad = builder.getOption(vISA_forceFPMAD) && builder.getOptions()->getTarget() != VISA_CM;
if (!preferFpMad)
{
preferFpMad = isFpMadPreferred(bb, iter);
}
auto alignMent = execSize * G4_Type_Table[dst->getType()].byteSize;
alignMent = (alignMent > 32) ? 32 : alignMent;
alignMent = (alignMent < 16) ? 16 : alignMent;
if (dst->getRegAccess() != Direct || dst->getHorzStride() != 1 ||
!builder.isOpndAligned(dst, alignMent))
{
if (preferFpMad)
{
G4_DstRegRegion* tmpDst = insertMovAfter(iter, dst, dst->getType(), bb);
inst->setDest(tmpDst);
}
else
{
return false;
}
}
// check src
for (int k = 0; k < inst->getNumSrc(); k++)
{
G4_Type type = inst->getSrc(k)->getType();
MUST_BE_TRUE(type == Type_HF || type == Type_F || type == Type_DF,
"expect FP type");
bool goodSrc = isGoodAlign16Src(inst, k);
if (!goodSrc && preferFpMad)
{
// insert moves if type is legal mad type
if (isGoodMadType(type))
{
G4_Operand* src = inst->getSrc(k);
if ((type == Type_DF ||
(type == Type_HF && getGenxPlatform() == GENX_BDW)) &&
execSize > 1 &&
(src->isImm() || src->asSrcRegRegion()->isScalar()))
{
// MAD DF does not support .r, so we have to broadcast the value
// '.r' on MAD HF on BDW is not a replication of that
// scalar element but a pair of half.
auto align = type == Type_HF ? Sixteen_Word : Eight_Word;
broadcast(bb, iter, k, align);
}
else
{
inst->setSrc(insertMovBefore(iter, k, type, bb), k);
}
goodSrc = true;
}
}
if (!goodSrc)
{
return false;
}
}
inst->setOpcode(G4_mad);
//swap src0 and src2 (vISA MAD is src0*src1+src2, while GEN MAD is src1*src2+src0)
G4_Operand* src0 = inst->getSrc(0);
G4_Operand* src2 = inst->getSrc(2);
inst->setSrc(src2, 0);
inst->setSrc(src0, 2);
return true;
}
// If the LF MAD does not conform to Genx ISA semantics, then translate
// it into a valid GenX sequence - either an equivalent MUL/ADD sequence
// or an equivalent MAC.
// ASSUMPTION:
// This phase must be called at the end of all other optimization
// phases and just prior to testing for ACC spilling.
void HWConformity::fixMADInst( BB_LIST_ITER it )
{
G4_BB* bb = *it;
INST_LIST expand_list;
// trace the MAD instructions that may be converted into MAC later
std::vector<G4_INST*> madList;
bool doAlign1Mad = builder.hasAlign1Ternary();
bb->resetLocalId();
INST_LIST_ITER i = bb->instList.begin();
for (auto iterEnd = bb->instList.end(); i != iterEnd; ++i )
{
G4_INST *inst = *i;
// predicated mad is not allowed?
if( inst->opcode() != G4_pseudo_mad )
{
continue;
}
tryEliminateMadSrcModifier(builder, inst);
G4_DstRegRegion *dst = inst->getDst();
int exec_size = inst->getExecSize( );
G4_Operand *src0 = inst->getSrc(0), *src1 = inst->getSrc(1), *src2 = inst->getSrc(2);
bool conforming_genx_mad = true;
bool generate_genx_mac;
if (exec_size == 32)
{
conforming_genx_mad = false;
}
else
{
// since copy prop and def-hoisting are not allowed to Align16 instructions,
// sources of pseudo mad should use the same data type as in CISA input
// so we only check dst type
switch (dst->getType())
{
case Type_F:
case Type_HF:
break;
case Type_DF:
break;
case Type_W:
case Type_UW:
case Type_D:
case Type_UD:
if (!doAlign1Mad)
{
conforming_genx_mad = false;
}
break;
default:
conforming_genx_mad = false;
}
}
if (conforming_genx_mad)
{
bool doMad = doAlign1Mad ?
generateAlign1Mad(bb, i) : generateFPMad(bb, i);
if (doMad)
{
// done with this pseudo-mad
continue;
}
}
// Translate the LF MAD to an equivalent GenX sequence.
if (builder.getOption(vISA_LocalMACopt))
{
generate_genx_mac = true;
}
else
{
generate_genx_mac = false;
}
bool dstPackedHF = false;
bool dstIsFloat = false;
checkHFMixModeRule4_11(*i, dstPackedHF, dstIsFloat);
//not dealing with that mess. Shouldn't be a common code sequence.
if(dstPackedHF || dstIsFloat)
generate_genx_mac = false;
if( generate_genx_mac )
{
int emask = inst->getMaskOption();
if (emask != InstOpt_WriteEnable && inst->getMaskOffset() != 0)
{
generate_genx_mac = false;
}
// If either src1 or src0 are DWORD then we cannot generate a MAC.
// ACC does not support B type
if (generate_genx_mac &&
(IS_BTYPE(src2->getType()) ||
IS_DTYPE(src0->getType()) ||
IS_DTYPE(src1->getType())))
{
generate_genx_mac = false;
}
// If there is a modifier for src2, or src2 is accessed somewhere indirectly, then we will
// not generate a MAC.
if (generate_genx_mac)
{
if (src2->isImm() ||
(src2->isSrcRegRegion() &&
(src2->asSrcRegRegion()->getModifier() != Mod_src_undef ||
src2->asSrcRegRegion()->getRegAccess() != Direct ||
(src2->getTopDcl() && src2->getTopDcl()->getAddressed()))) ||
src2->getType() == Type_DF)
{
generate_genx_mac = false;
}
}
}
// we can't do mac if src2 is global or it has >1 def or its single def is global
if( generate_genx_mac )
{
G4_INST *mad_src2_def_inst = inst->getSingleDef(Opnd_src2);
if (!mad_src2_def_inst || kernel.fg.globalOpndHT.isOpndGlobal(src2) ||
kernel.fg.globalOpndHT.isOpndGlobal(mad_src2_def_inst->getDst()))
{
generate_genx_mac = false;
}
if( madList.size() > 0 && mad_src2_def_inst != madList.back())
{
// terminate the last mad list as this mad has a different definition
int32_t lastMadId = madList.back()->getLocalId();
bool macGenerated = convertMAD2MAC( i, madList, bb );
madList.clear();
if (generate_genx_mac && macGenerated &&
mad_src2_def_inst->getLocalId() < lastMadId)
{
// mad's definition is before the last use of acc
generate_genx_mac = false;
}
}
if( generate_genx_mac &&
( mad_src2_def_inst->getPredicate() ||
mad_src2_def_inst->getSaturate() ||
mad_src2_def_inst->isMath() ||
mad_src2_def_inst->opcode() == G4_shl ||
mad_src2_def_inst->opcode() == G4_mad ||
!mad_src2_def_inst->hasOneUse() ||
( isCompressedInst(mad_src2_def_inst) ^ isCompressedInst(inst) )) )
{
generate_genx_mac = false;
}
if( generate_genx_mac &&
madList.size() == 0 &&
IS_DTYPE(mad_src2_def_inst->getExecType()) )
{
// We don't generate mac in this case since by default we use w type for acc,
// and it would violate dst alignment restriction
// if mad_src2_def_inst is itself a pseudo_mad, however, then it's ok
// since both sources for mac must have word type.
generate_genx_mac = false;
}
if( generate_genx_mac )
{
// We will try to generate a MAC if it is possible to hoist
// the definition for src2 into ACC, otherwise we will need to
// generate a MOV/MAC, in which case we might as well
// generate a MUL/ADD sequence anyway.
// If the src2_def_op does not immediately precede the
// MAD then we will attempt to schedule the op backward to
// immediately after src2_def_op. This will increase
// the MAC reduction opportunities as it has the
// effect of keeping ACC live ranges to very
// short intervals.
// NOTE: We do not attempt to schedule the src2_def_op
// to just before op, as src2_def_op may be a
// previously scheduled MAD.
INST_LIST_ITER mov_iter = i;
mov_iter--;
uint16_t movDist = 0;
if ((*mov_iter) != mad_src2_def_inst) {
// check if src and dst of MAD are re-defined in between and
// if dst is used in between
if (!findHoistLocation(i, mov_iter, movDist, mad_src2_def_inst))
{
generate_genx_mac = false;
}
else
{
if (movDist > 0)
{
mov_iter++;
bb->instList.insert(mov_iter, inst);
INST_LIST_ITER tmpIter = i;
i--;
bb->instList.erase(tmpIter);
}
}
}
// if instruction moving is blocked by some re-def, we need to check if it is possible that the ACC def instruction
// will be split later. If yes, we do not use ACC and MAC here.
// push this decision to convertMAD2MAC
if (generate_genx_mac)
{
if (madList.size() == 0)
{
// push src2 def into list
madList.push_back(mad_src2_def_inst);
}
madList.push_back(inst);
}
}
}
// translate MAD into MUL/ADD
if( !generate_genx_mac )
{
convertMAD2MulAdd(i, bb);
i++;
}
}
if( madList.size() > 0 )
{
i--;
convertMAD2MAC( i, madList, bb );
}
}
struct AccInterval
{
G4_INST* inst;
int lastUse;
bool mustBeAcc0 = false;
bool isPreAssigned = false;
int assignedAcc = -1;
AccInterval(G4_INST* inst_, int lastUse_, bool preAssigned = false) :
inst(inst_), lastUse(lastUse_), isPreAssigned(preAssigned)
{
if (isPreAssigned)
{
mustBeAcc0 = true;
assignedAcc = 0;
}
}
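// crude spill-cost heuristic: favor intervals with many uses over a
// short distance (uses^3 / distance); pre-assigned intervals are
// effectively unspillable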
double getSpillCost()
{
if (isPreAssigned)
{
// don't spill pre-assigned
return (double) 1000000;
}
int dist = lastUse - inst->getLocalId();
return std::pow((double) inst->use_size(), 3) / dist;
}
};
// returns true if the inst is a candidate for acc substitution
// lastUse is also updated to point to the last use id of the inst
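// A candidate's dst must be local and acc-compatible, and every local use
// must be src0 of a two-source inst, or src1 (src0 for mad) of a
// three-source inst, with matching type and region.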
static bool isAccCandidate(G4_INST* inst, G4_Kernel& kernel, int& lastUse, bool& mustBeAcc0)
{
mustBeAcc0 = false;
G4_DstRegRegion* dst = inst->getDst();
if (!dst || kernel.fg.globalOpndHT.isOpndGlobal(dst) || !inst->canDstBeAcc(*kernel.fg.builder))
{
return false;
}
// check that every use may be replaced with acc
int lastUseId = 0;
std::vector<G4_INST*> madSrc0Use;
for (auto I = inst->use_begin(), E = inst->use_end(); I != E; ++I)
{
auto&& use = *I;
G4_INST* useInst = use.first;
Gen4_Operand_Number opndNum = use.second;
lastUseId = std::max(lastUseId, useInst->getLocalId());
// acc may be src0 of two-source inst or src1 of three-source inst
// ToDo: may swap source here
if (useInst->getNumSrc() == 3)
{
if (opndNum != Opnd_src1)
{
mustBeAcc0 = true;
bool goodMadSrc0 = useInst->opcode() == G4_mad && opndNum == Opnd_src0;
if (!goodMadSrc0)
{
return false;
}
if (useInst->getSrc(0)->getType() == Type_HF && useInst->getMaskOffset() == 16)
{
// we must use acc1, and need to check that inst does not have an acc0 source
// so that dst and src won't have different acc source
if (inst->isAccSrcInst())
{
bool hasAcc0Src = false;
auto isAcc0 = [](G4_SrcRegRegion* src)
{
return src->getBase()->asAreg()->getArchRegType() == AREG_ACC0;
};
if (inst->getSrc(0)->isSrcRegRegion() &&
inst->getSrc(0)->asSrcRegRegion()->getBase()->isAccReg())
{
hasAcc0Src = isAcc0(inst->getSrc(0)->asSrcRegRegion());
}
else if (inst->getSrc(1)->isSrcRegRegion() &&
inst->getSrc(1)->asSrcRegRegion()->getBase()->isAccReg())
{
hasAcc0Src = isAcc0(inst->getSrc(1)->asSrcRegRegion());
}
if (hasAcc0Src)
{
return false;
}
}
}
madSrc0Use.push_back(useInst);
}
}
else if (opndNum != Opnd_src0)
{
return false;
}
if (useInst->getSingleDef(opndNum) == nullptr)
{
// def must be the only define for this use
return false;
}
int srcId = opndNum == Opnd_src0 ? 0 : 1;
G4_Operand* src = useInst->getSrc(srcId);
if (dst->getType() != src->getType() || kernel.fg.globalOpndHT.isOpndGlobal(src) ||
dst->compareOperand(src) != Rel_eq)
{
return false;
}
if (!useInst->canSrcBeAcc(srcId, *kernel.fg.builder))
{
return false;
}
}
// we have to avoid the case where the dst is used as both src0 and src1 of a mad
for (auto madUse : madSrc0Use)
{
for (auto I = inst->use_begin(), E = inst->use_end(); I != E; ++I)
{
auto&& use = *I;
G4_INST* useInst = use.first;
Gen4_Operand_Number opndNum = use.second;
if (madUse == useInst && opndNum == Opnd_src1)
{
return false;
}
}
}
lastUse = lastUseId;
return true;
}
// replace an inst's dst and all of its (local) uses with acc
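// e.g. (illustrative):
//   mul (8) V10<1>:f V11 V12        =>  mul (8) acc0.0<1>:f V11 V12
//   add (8) V13 V10<8;8,1>:f V14    =>  add (8) V13 acc0.0<8;8,1>:f V14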
static void replaceDstWithAcc(G4_INST* inst, int accNum, IR_Builder& builder)
{
G4_DstRegRegion* dst = inst->getDst();
bool useAcc1 = false;
for (auto I = inst->use_begin(), E = inst->use_end(); I != E; ++I)
{
auto&& use = *I;
G4_INST* useInst = use.first;
if (useInst->opcode() == G4_mad && use.second == Opnd_src0)
{
// if we are replacing mad with mac, additionally check if acc1 needs to be used
if (useInst->getMaskOffset() == 16 && dst->getType() == Type_HF)
{
useAcc1 = true;
break;
}
}
}
G4_Areg* accReg = useAcc1 ? builder.phyregpool.getAcc1Reg() : builder.phyregpool.getAcc0Reg();
G4_DstRegRegion* accDst = builder.createDstRegRegion(Direct, accReg,
(short)accNum, 0, 1, dst->getType());
inst->setDest(accDst);
for (auto I = inst->use_begin(), E = inst->use_end(); I != E; ++I)
{
auto&& use = *I;
G4_INST* useInst = use.first;
int srcId = use.second == Opnd_src0 ? 0 : 1;
G4_SrcRegRegion* oldSrc = useInst->getSrc(srcId)->asSrcRegRegion();
G4_SrcRegRegion* accSrc = builder.createSrcRegRegion(oldSrc->getModifier(), Direct,
accReg, (short)accNum, 0, builder.getRegionStride1(), dst->getType());
if (useInst->opcode() == G4_mad && srcId == 0)
{
// change mad to mac as src0 of 3-src does not support acc
auto updateDefSrcPos = [](G4_INST* useInst, Gen4_Operand_Number origPos)
{
for (auto DI = useInst->def_begin(), DE = useInst->def_end(); DI != DE; ++DI)
{
auto&& def = *DI;
if (def.second == origPos)
{
for (auto UI = def.first->use_begin(), UE = def.first->use_end(); UI != UE; ++UI)
{
auto& use = *UI;
if (use.first == useInst && use.second == origPos)
{
switch (use.second)
{
case Opnd_src1:
use.second = Opnd_src0;
break;
case Opnd_src2:
use.second = Opnd_src1;
break;
default:
assert(false && "unexpected src pos");
}
}
}
}
}
};
assert(accNum == 0 && "mad src0 may only use acc0");
G4_Operand* macSrc0 = useInst->getSrc(1);
updateDefSrcPos(useInst, Opnd_src1);
G4_Operand* macSrc1 = useInst->getSrc(2);
updateDefSrcPos(useInst, Opnd_src2);
useInst->setSrc(macSrc0, 0);
useInst->setSrc(macSrc1, 1);
useInst->setOpcode(G4_mac);
useInst->setImplAccSrc(accSrc);
}
else
{
useInst->setSrc(accSrc, srcId);
}
}
}
static uint32_t getNumACC(IR_Builder& builder)
{
uint32_t numUserAcc = builder.getOptions()->getuInt32Option(vISA_numGeneralAcc);
if (numUserAcc != 0)
{
// use user-provided value
return numUserAcc;
}
return builder.getNumACC();
}
void HWConformity::multiAccSubstitution(G4_BB* bb)
{
int numGeneralAcc = getNumACC(builder);
std::vector<AccInterval*> intervals;
//build intervals for potential acc candidates as well as pre-existing acc uses from mac/mach/addc/etc
for (auto instIter = bb->instList.begin(), instEnd = bb->instList.end(); instIter != instEnd; ++instIter)
{
G4_INST* inst = *instIter;
if (inst->defAcc())
{
// we should only have single def/use acc at this point, so any use would kill the def
auto iter = instIter;
auto useIter = std::find_if(++iter, instEnd, [](G4_INST* inst) { return inst->useAcc(); });
int lastUseId = useIter == instEnd ? bb->instList.back()->getLocalId() : (*useIter)->getLocalId();
AccInterval *newInterval = new AccInterval(inst, lastUseId, true);
intervals.push_back(newInterval);
}
else
{
int lastUseId = 0;
bool mustBeAcc0 = false;
if (isAccCandidate(inst, kernel, lastUseId, mustBeAcc0))
{
// this is a potential candidate for acc substitution
AccInterval *newInterval = new AccInterval(inst, lastUseId);
newInterval->mustBeAcc0 = mustBeAcc0;
intervals.push_back(newInterval);
}
}
}
//modified linear scan to assign free accs to intervals
std::vector<bool> freeAccs;
freeAccs.resize(numGeneralAcc, true);
std::list<AccInterval*> activeIntervals;
for (auto interval : intervals)
{
// expire intervals
for (auto iter = activeIntervals.begin(), iterEnd = activeIntervals.end(); iter != iterEnd;)
{
AccInterval* active = *iter;
if (active->lastUse <= interval->inst->getLocalId())
{
assert(!freeAccs[active->assignedAcc] && "active interval's acc should not be free");
freeAccs[active->assignedAcc] = true;
iter = activeIntervals.erase(iter);
}
else
{
++iter;
}
}
// assign interval/spill acc0 interval
if (interval->isPreAssigned)
{
if (!freeAccs[0])
{
//spill active interval that is using acc0, if it exists
auto acc0Iter = std::find_if(activeIntervals.begin(), activeIntervals.end(),
[](AccInterval* interval) { return interval->assignedAcc == 0; });
assert(acc0Iter != activeIntervals.end() && "expect to find interval with acc0");
assert(!(*acc0Iter)->isPreAssigned && "overlapping pre-assigned acc0");
(*acc0Iter)->assignedAcc = -1;
activeIntervals.erase(acc0Iter);
}
freeAccs[0] = false;
activeIntervals.push_back(interval);
}
else
{
bool foundFreeAcc = false;
for (int i = 0, end = interval->mustBeAcc0 ? 1 : (int)freeAccs.size(); i < end; ++i)
{
if (freeAccs[i])
{
interval->assignedAcc = i;
freeAccs[i] = false;
activeIntervals.push_back(interval);
foundFreeAcc = true;
break;
}
}
if (!foundFreeAcc)
{
// check if we should spill one of the active intervals
auto spillCostCmp = [interval](AccInterval* intv1, AccInterval* intv2)
{
if (!interval->mustBeAcc0)
{
return intv1->getSpillCost() < intv2->getSpillCost();
}
// different comparison function if interval must use acc0
if (intv1->assignedAcc == 0 && intv2->assignedAcc == 0)
{
return intv1->getSpillCost() < intv2->getSpillCost();
}
else if (intv1->assignedAcc == 0)
{
return true;
}
return false;
};
auto spillIter = std::min_element(activeIntervals.begin(), activeIntervals.end(),
spillCostCmp);
auto spillCandidate = *spillIter;
if (interval->getSpillCost() > spillCandidate->getSpillCost() &&
!spillCandidate->isPreAssigned &&
!(interval->mustBeAcc0 && spillCandidate->assignedAcc != 0))
{
interval->assignedAcc = spillCandidate->assignedAcc;
spillCandidate->assignedAcc = -1;
activeIntervals.erase(spillIter);
activeIntervals.push_back(interval);
}
}
}
}
for (auto interval : intervals)
{
if (!interval->isPreAssigned && interval->assignedAcc != -1)
{
G4_INST* inst = interval->inst;
replaceDstWithAcc(inst, interval->assignedAcc * 2, builder);
numAccSubDef++;
numAccSubUse += (int)inst->use_size();
#if 0
std::cout << "Acc sub def inst: \n";
inst->emit(std::cout);
std::cout << "[" << inst->getLocalId() << "]\n";
std::cout << "Uses:\n";
for (auto&& use : inst->useInstList)
{
std::cout << "\t";
use.first->emit(std::cout);
std::cout << "[" << use.first->getLocalId() << "]\n";
}
#endif
}
}
for (int i = 0, end = (int)intervals.size(); i < end; ++i)
{
delete intervals[i];
}
}
// substitute local operands with acc when possible
void HWConformity::accSubstitution(G4_BB* bb)
{
bb->resetLocalId();
if (getNumACC(builder) > 1)
{
multiAccSubstitution(bb);
return;
}
for (auto instIter = bb->instList.begin(), instEnd = bb->instList.end(); instIter != instEnd; ++instIter)
{
bool canDoAccSub = true;
G4_INST* inst = *instIter;
if (inst->defAcc())
{
// skip ahead till its single use
// we should only have single def/use acc at this point, so any use would
// kill the def
auto iter = instIter;
auto useIter = std::find_if(++iter, instEnd, [](G4_INST* inst) { return inst->useAcc(); });
if (useIter == instEnd)
{
return;
}
instIter = --useIter; // start at the use inst next time
continue;
}
int lastUseId = 0;
bool mustBeAcc0 = false; //ignored
if (!isAccCandidate(inst, kernel, lastUseId, mustBeAcc0))
{
continue;
}
// don't attempt acc sub if def and last use are too far apart
// this is a crude way to avoid a long-running live range from blocking
// other acc sub opportunities
const int accWindow = 25;
if (lastUseId == 0 || lastUseId - inst->getLocalId() > accWindow)
{
continue;
}
// check for intervening acc usage between inst and its last use
auto subIter = instIter;
++subIter;
for (int instId = inst->getLocalId() + 1; instId != lastUseId; ++subIter, ++instId)
{
G4_INST* anInst = *subIter;
if (anInst->useAcc() || anInst->mayExpandToAccMacro(builder))
{
canDoAccSub = false;
break;
}
}
if (!canDoAccSub)
{
continue;
}
else
{
replaceDstWithAcc(inst, 0, builder);
// advance iter to the last use of the acc
instIter = subIter;
--instIter;
numAccSubDef++;
numAccSubUse += (int)inst->use_size();
#if 0
std::cout << "Acc sub def inst: \n";
inst->emit(std::cout);
std::cout << "[" << inst->getLocalId() << "]\n";
std::cout << "Uses:\n";
for (auto&& use : inst->useInstList)
{
std::cout << "\t";
use.first->emit(std::cout);
std::cout << "[" << use.first->getLocalId() << "]\n";
}
#endif
}
}
}
// find the location for hoisting the inst pointed to by start
// boundary is the upper limit for hoisting
// if there is any ACC def/use between start and end, return false;
// otherwise, return true.
bool HWConformity::findHoistLocation(
INST_LIST_ITER start, INST_LIST_ITER &end, uint16_t &movDist, G4_INST *boundary )
{
bool canMov = true;
G4_INST *inst = *start;
end = start;
end--;
movDist = 0;
if ((*end) != boundary)
{
// check if src and dst of MAD are re-defined in between and
// if dst is used in between
while ((*end) != boundary)
{
G4_INST *curInst = *end;
if (curInst->hasACCOpnd() || curInst->mayExpandToAccMacro(builder))
{
canMov = false;
break;
}
if (inst->isRAWdep(curInst) ||
inst->isWAWdep(curInst) ||
inst->isWARdep(curInst))
{
break;
}
movDist++;
--end;
}
// check if acc is possibly updated between the new location and boundary
if (canMov && ((*end) != boundary))
{
INST_LIST_ITER in_between_iter = end;
++in_between_iter;
for (; (*in_between_iter) != boundary; --in_between_iter)
{
G4_INST *curInst = *in_between_iter;
if (curInst->hasACCOpnd() || curInst->mayExpandToAccMacro(builder))
{
canMov = false;
break;
}
}
}
}
return canMov;
}
// for mac code gen we use W as acc type for int since it has enough precision for int
G4_Type HWConformity::getAccType( G4_Type ty )
{
if( ty == Type_D )
{
return Type_W;
}
else if( ty == Type_UD )
{
return Type_UW;
}
else
{
return ty;
}
}
// convert MAD in madList to MAC instructions
// iter is either the next pseudo-mad that does not belong to this list, or the last inst in the BB
// return true if the mad list is converted to mac
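// e.g. (illustrative; int operands may additionally have their acc type
// demoted to W):
//   mul        (8) V1 A B       =>  mul (8) acc0 A B
//   pseudo_mad (8) V2 C D V1    =>  mac (8) acc0 C D   // acc0 = C*D + acc0
//   pseudo_mad (8) V3 E F V2    =>  mac (8) V3   E F   // V3 = E*F + acc0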
bool HWConformity::convertMAD2MAC( INST_LIST_ITER iter, std::vector<G4_INST*> &madList, G4_BB *bb )
{
if( madList.size() == 1 )
{
// there is only one inst in list, it is not a MAD
return false;
}
// find the iterator of the last mad in list
G4_INST *lastMad = madList.back();
INST_LIST_ITER movTarget, lastMadIter = iter;
while ((*lastMadIter) != lastMad)
{
lastMadIter--;
}
movTarget = lastMadIter;
bool changeType = false;
bool dwDst = IS_TYPE_INT(lastMad->getDst()->getType());
bool twoGRFDst = lastMad->hasNULLDst() ? false :
((lastMad->getDst()->getRightBound() - lastMad->getDst()->getLeftBound() + 1) > GENX_GRF_REG_SIZ );
G4_Type newType = lastMad->getDst()->getType();
// check if we can convert the type of MAC dst from DW to W,
// such that we can avoid instruction splitting and improve code quality
if (dwDst && lastMad->hasNULLDst())
{
// is this possible?
changeType = true;
lastMad->getDst()->setType( IS_SIGNED_INT( lastMad->getDst()->getType() ) ? Type_W : Type_UW );
}
else if( dwDst && twoGRFDst &&
lastMad->hasOneUse() &&
!kernel.fg.globalOpndHT.isOpndGlobal(lastMad->getDst()) )
{
// last mad has single use, see if we can replace the def-use pair with acc
G4_INST *useInst = lastMad->use_front().first;
if( useInst->getDst() &&
( IS_BTYPE( useInst->getDst()->getType() ) || IS_WTYPE( useInst->getDst()->getType() ) ) )
{
// check the use of last MAD dst
INST_LIST_ITER useIter = lastMadIter;
useIter++;
while( (*useIter) != useInst )
{
useIter++;
}
uint16_t movDist, hs;
if( lastMad->canUseACCOpt( false, true, hs, true, true, true ) && hs == 1 &&
findHoistLocation( useIter, movTarget, movDist, lastMad ) &&
(*movTarget) == lastMad )
{
changeType = true;
if( movDist > 0 )
{
movTarget++;
bb->instList.insert( movTarget, useInst );
bb->instList.erase( useIter );
}
uint32_t dstStrideSize = G4_Type_Table[useInst->getDst()->getType()].byteSize * useInst->getDst()->getHorzStride();
uint32_t useTypeSize = G4_Type_Table[Type_UW].byteSize;
// insert a temp mov
if( dstStrideSize > useTypeSize )
{
movTarget--;
insertMovAfter( movTarget,
(uint16_t)( useTypeSize / G4_Type_Table[useInst->getDst()->getType()].byteSize ),
bb->instList );
}
newType = getAccType( newType );
// change src of useInst to ACC
Gen4_Operand_Number srcNum = lastMad->use_front().second;
ASSERT_USER(useInst->getSrc((uint32_t)srcNum - 1)->isSrcRegRegion(),
"Unexpected src to be changed!");
G4_SrcRegRegion *accSrcOpnd = builder.createSrcRegRegion(
useInst->getSrc( (uint32_t)srcNum - 1 )->asSrcRegRegion()->getModifier(),
Direct,
builder.phyregpool.getAcc0Reg(),
0,
0,
builder.getRegionStride1(),
newType );
useInst->setSrc( accSrcOpnd, (uint32_t)srcNum - 1 );
// change dst of the last MAD
G4_DstRegRegion *accDstOpnd = builder.createDstRegRegion(
Direct,
builder.phyregpool.getAcc0Reg(),
0,
0,
1,
newType);
lastMad->setDest( accDstOpnd );
}
}
}
// if we can do type demotion or dst fits in 1GRF, we do not have to worry about inst splitting.
if( !twoGRFDst || changeType )
{
// generate MAC directly
auto madIter = madList.end();
madIter--;
G4_INST *curInst = (*madIter);
G4_Type accType = getAccType(curInst->getSrc(2)->getType());
uint32_t accTypeSize = getTypeSize(accType);
// mac dst region has to match that of acc, which is always GRF-aligned
// we also cannot have acc dst hstride > 4
if (!builder.isOpndAligned(curInst->getDst(), GENX_GRF_REG_SIZ) ||
(curInst->getDst()->getExecTypeSize() / accTypeSize) > 4)
{
// ToDo: store the iter in madInst?
auto instIter = std::find(bb->instList.begin(), bb->instList.end(), curInst);
auto newDst = insertMovAfter(instIter, curInst->getDst(), curInst->getDst()->getType(), bb, Sixteen_Word);
curInst->setDest(newDst);
}
uint32_t dstByteStride = curInst->getDst()->getExecTypeSize();
uint16_t stride = (uint16_t) (dstByteStride > accTypeSize ? dstByteStride / accTypeSize : 1);
RegionDesc* region = builder.createRegionDesc(stride, 1, 0);
G4_SrcRegRegion *accSrcOpnd = builder.createSrcRegRegion(
Mod_src_undef, Direct, builder.phyregpool.getAcc0Reg(),
0, 0, region, accType);
curInst->setImplAccSrc(accSrcOpnd);
curInst->setSrc(nullptr, 2);
curInst->setOpcode( G4_mac );
curInst->fixMACSrc2DefUse();
do
{
// change all intermediate macs to use acc dst and src
madIter--;
curInst = (*madIter);
bool changeSrc = curInst->opcode() == G4_pseudo_mad;
addACCOpnd(curInst, changeSrc, stride, accType);
}
while( madIter != madList.begin() );
return true;
}
// just split them into mul/add
// assumption: all pseudo_mads from lastMadIter back to the first inst should be on madList
auto madIter = lastMadIter;
for (G4_INST* inst = *madIter; inst != madList.front(); inst = *(--madIter))
{
if (inst->opcode() == G4_pseudo_mad)
{
convertMAD2MulAdd(madIter, bb);
}
}
return false;
}
void HWConformity::convertComprInstSrcRegion( G4_INST *inst )
{
for( int k = 0; k < 2; k++ )
{
G4_Operand *src = inst->getSrc( k );
if (!src || src->isImm() || (inst->isMath() && k == 1 && src->isNullReg()))
{
continue;
}
if (!src->isSrcRegRegion()) {
continue;
}
int w = src->asSrcRegRegion()->getRegion()->width;
int hs = src->asSrcRegRegion()->getRegion()->horzStride;
int vs = src->asSrcRegRegion()->getRegion()->vertStride;
if( w == 1 && hs == 0 && vs == 0 )
{
continue;
}
if( inst->getExecSize() < w )
{
RegionDesc *rd = builder.createRegionDesc( (uint16_t) (vs/2), (uint16_t) (w/2), (uint16_t) (hs/2) );
src->asSrcRegRegion()->setRegion( rd );
}
}
}
// replace src/dst with ACC
void HWConformity::addACCOpnd(G4_INST *curInst, bool needACCSrc, int dstStride, G4_Type accTy)
{
if (needACCSrc)
{
// change src2 to implicit ACC src.
RegionDesc* region = nullptr;
switch (dstStride)
{
case 1:
region = builder.getRegionStride1();
break;
case 2:
region = builder.getRegionStride2();
break;
case 4:
region = builder.getRegionStride4();
break;
default:
MUST_BE_TRUE(false, "unexpected stride value");
break;
}
G4_SrcRegRegion *accSrcOpnd = builder.createSrcRegRegion(
Mod_src_undef, Direct, builder.phyregpool.getAcc0Reg(),
0, 0, region, accTy);
curInst->setImplAccSrc( accSrcOpnd );
curInst->setSrc( NULL, 2 );
curInst->setOpcode( G4_mac );
curInst->fixMACSrc2DefUse();
}
// change dst for all in between MAD
G4_DstRegRegion *accDstOpnd = builder.createDstRegRegion(
Direct, builder.phyregpool.getAcc0Reg(), 0,
0, (unsigned short)dstStride, accTy);
curInst->setDest(accDstOpnd);
}
// convert a pseudo-mad inst into mul/add
// the mad inst is reused as the mul; an add is inserted right after it
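// e.g. (illustrative):
//   pseudo_mad (8) V1 A B C   // V1 = A*B + C
// ==>
//   mul (8) TMP A B
//   add (8) V1  TMP C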
void HWConformity::convertMAD2MulAdd( INST_LIST_ITER iter, G4_BB *bb )
{
G4_INST *inst = *iter;
assert(inst->opcode() == G4_pseudo_mad && "expect pseudo-mad");
G4_DstRegRegion *addOpDst = inst->getDst();
G4_Operand *addOpnd2 = inst->getSrc(2);
G4_Type mulOpDstType = addOpDst->getType();
G4_Type mulOpExecType = inst->getExecType();
// pick the widest type of mad's src and dst as the intermediate type
if (G4_Type_Table[mulOpDstType].byteSize > G4_Type_Table[mulOpExecType].byteSize)
{
mulOpExecType = mulOpDstType;
}
mulOpDstType = mulOpExecType;
G4_SubReg_Align subAlign = Get_G4_SubRegAlign_From_Type( mulOpDstType );
// Reuse the MAD op for MUL.
inst->setOpcode(G4_mul);
inst->setSrc(nullptr, 2);
G4_Declare* mulDefDcl = builder.createTempVar(inst->getExecSize(), mulOpDstType,
G4_Align::Either, subAlign);
G4_DstRegRegion* mulOpDst = builder.Create_Dst_Opnd_From_Dcl(mulDefDcl, 1);
inst->setDest(mulOpDst);
// Follow with an ADD.
INST_LIST_ITER tIter = iter;
tIter++;
auto addOpnd1 = builder.Create_Src_Opnd_From_Dcl(mulDefDcl, builder.getRegionStride1());
G4_INST* addOp = builder.createInternalInst(
inst->getPredicate(),
G4_add,
inst->getCondMod(),
inst->getSaturate(),
inst->getExecSize(),
addOpDst,
addOpnd1,
addOpnd2,
nullptr,
inst->getOption(),
inst->getLineNo(),
inst->getCISAOff(),
inst->getSrcFilename() );
auto addIter = bb->instList.insert( tIter, addOp );
// predicate/condmod/saturate, if they exist, are propagated to the add instruction
inst->setSaturate( false );
inst->setPredicate( NULL );
inst->setCondMod(nullptr);
{
inst->transferDef( addOp, Opnd_src2, Opnd_src1 );
if( addOp->getPredicate() )
{
inst->transferDef( addOp, Opnd_pred, Opnd_pred );
}
inst->transferUse( addOp );
inst->addDefUse(addOp, Opnd_src0);
}
}
// See if we can convert the pseudo_sada2 instruction into an actual Gen sada2
// This can be done if the following conditions are met:
// -- We can find the definition of the pseudo sada2 instruction's source 2 in
// the same basic block, and that
// -- it may be replaced by an acc (i.e., the src2 is its only use, the dst and
// the src have identical regions, and there are no intervening instructions
// that update acc)
//
// We additionally attempt to schedule up the sada2 instruction to be as close
// as possible to the src2 defining instruction (subject to the constraints of
// def-use chains for def, src0 and src1), so that more opportunities may be
// exposed for later sada2 instructions
void HWConformity::fixSADA2Inst( BB_LIST_ITER it )
{
G4_BB* bb = *it;
INST_LIST_ITER i = bb->instList.begin();
while (i != bb->instList.end())
{
G4_INST *inst = *i;
if( inst->opcode() != G4_pseudo_sada2 )
{
++i;
continue;
}
G4_Operand *src2 = inst->getSrc(2);
bool canDoSada2 = true;
G4_INST* src2Dst = NULL;
int emask = inst->getMaskOption();
if (bb->isInSimdFlow() &&
emask != InstOpt_WriteEnable &&
inst->getMaskOffset() != 0)
{
canDoSada2 = false;
}
G4_DstRegRegion *dst = inst->getDst();
if( canDoSada2 )
{
if( src2->isSrcRegRegion() && src2->asSrcRegRegion()->getRegAccess() == Direct )
{
// check Src2
if( kernel.fg.globalOpndHT.isOpndGlobal(src2 ) )
{
// no sada2 if operand is global
canDoSada2 = false;
}
else if( src2->asSrcRegRegion()->getModifier() != Mod_src_undef )
{
// no sada2 if src2 has a modifier
canDoSada2 = false;
}
else
{
for (auto defIter = inst->def_begin(); defIter != inst->def_end(); ++defIter)
{
if((*defIter).second == Opnd_src2 )
{
if( src2Dst != NULL )
{
// no sada2 if src2 has >1 definition
canDoSada2 = false;
break;
}
src2Dst = (*defIter).first;
}
}
if( !src2Dst )
{
canDoSada2 = false;
}
else
{
if( !src2Dst->hasOneUse() )
{
// no sad2 if def has more than one use
canDoSada2 = false;
}
else
{
G4_DstRegRegion *src2DstOpnd = src2Dst->getDst();
G4_Type src2DstType = src2DstOpnd->getType();
if( src2DstOpnd->getRegAccess() != Direct
|| (src2DstType != Type_W && src2DstType != Type_UW) )
{
// no sada2 if def's dst is indirect, or its type is not W or UW
canDoSada2 = false;
}
else if( src2DstOpnd->compareOperand( src2 ) !=
Rel_eq )
{
// no sada2 if src2Dst and src2 are not equal
canDoSada2 = false;
}
}
}
}
}
else
{
canDoSada2 = false;
}
}
// The new location of the sada2 after the conversion
INST_LIST_ITER newSada2Iter = i;
--newSada2Iter;
if( canDoSada2 )
{
// try to schedule up the sada2 to be as close to the src2-defining instruction
// as possible to expose more optimization opportunities
for(; *newSada2Iter != src2Dst; --newSada2Iter )
{
if( inst->isRAWdep( *newSada2Iter ) ||
inst->isWAWdep( *newSada2Iter ) ||
inst->isWARdep( *newSada2Iter ) )
{
break;
}
}
// make sure there are no instructions between the sada2's new location
// and the src2-defining instruction that updates acc
for( INST_LIST_ITER iter = newSada2Iter; *iter != src2Dst; --iter )
{
G4_INST* aInst = *iter;
if( aInst->isAccDstInst() || aInst->isAccWrCtrlInst() ||
( aInst->opcode() == G4_mulh &&
IS_DTYPE(aInst->getSrc(0)->getType()) && IS_DTYPE(aInst->getSrc(1)->getType()) ) )
{
canDoSada2 = false;
break;
}
}
}
if( canDoSada2 )
{
// We have verified all conditions and can convert this instruction to sada2.
// replace the destination for src2Dst to be acc0.
// The actual acc0 offset will be fixed in a later pass
G4_DstRegRegion *accDstOpnd = builder.createDstRegRegion(
Direct,
builder.phyregpool.getAcc0Reg(),
0,
0,
1,
src2->getType());
src2Dst->setDest( accDstOpnd );
// create an implicit acc parameter for sada2
inst->setOpcode( G4_sada2 );
inst->setSrc( NULL, 2 );
G4_SrcRegRegion *accSrcOpnd = builder.createSrcRegRegion(
Mod_src_undef,
Direct,
builder.phyregpool.getAcc0Reg(),
0,
0,
builder.getRegionStride1(),
src2->getType());
inst->setImplAccSrc( accSrcOpnd );
++newSada2Iter;
bb->instList.insert( newSada2Iter, inst );
i = bb->instList.erase(i);
// maintain def-use
for (auto tmpIter = src2Dst->use_begin(); tmpIter != src2Dst->use_end(); ++tmpIter)
{
if( (*tmpIter).first == inst && (*tmpIter).second == Opnd_src2 )
{
(*tmpIter).second = Opnd_implAccSrc;
break;
}
}
for (auto tmpIter = inst->def_begin(); tmpIter != inst->def_end(); ++tmpIter)
{
if( (*tmpIter).first == src2Dst && (*tmpIter).second == Opnd_src2 )
{
(*tmpIter).second = Opnd_implAccSrc;
break;
}
}
}
else
{
// pseudo_sada2 (N) dst src0 src1 src2
// becomes
// sad2 (n) tmp<1>:w src0 src1
// add (n) dst tmp<n;n,1>:w src2
inst->setOpcode( G4_sad2 );
inst->setSrc( NULL, 2 );
G4_Align sad2TmpAlign = Either;
G4_SubReg_Align sad2TmpSubAlign = Get_G4_SubRegAlign_From_Type( dst->getType() );
if( inst->getExecSize() * G4_Type_Table[dst->getType()].byteSize > GENX_GRF_REG_SIZ )
{
// align to GRF
sad2TmpSubAlign = Sixteen_Word;
}
// create a new temp variable as sad2's destination
G4_Declare* sad2Tmp = builder.createTempVar( inst->getExecSize(), dst->getType(),
sad2TmpAlign, sad2TmpSubAlign );
G4_DstRegRegion* sad2Dst = builder.Create_Dst_Opnd_From_Dcl(sad2Tmp, 1);
inst->setDest( sad2Dst );
uint16_t srcVertStride, srcWidth, srcHorzStride;
srcWidth = inst->getExecSize() > 8 ? 8 : inst->getExecSize();
srcHorzStride = 1;
srcVertStride = srcWidth;
// opnd 0 for add is the new temp we've just created
RegionDesc *rd = builder.createRegionDesc( srcVertStride, srcWidth, srcHorzStride );
G4_Operand* addSrc0Opnd = builder.createSrcRegRegion(Mod_src_undef, Direct, sad2Dst->getBase(),
0, 0, rd, sad2Dst->getType() );
// opnd 1 is src2 of the pseudo_sada2
// dst is the same as the pseudo_sada2
G4_INST* addInst = builder.createInternalInst(
inst->getPredicate(),
G4_add,
inst->getCondMod(),
inst->getSaturate(),
inst->getExecSize(),
dst,
addSrc0Opnd,
src2,
NULL,
inst->getOption(),
inst->getLineNo(),
inst->getCISAOff(),
inst->getSrcFilename() );
INST_LIST_ITER addLoc = i;
++addLoc;
bb->instList.insert( addLoc, addInst );
// FIXME: redundant?
inst->addDefUse(addInst, Opnd_src0);
// The sad2 op should not have the SAT attribute set,
// as this is intended only for the final result of the
// SADA2 (and thus the add op will keep the SAT attribute).
inst->setSaturate( false );
inst->setPredicate( NULL );
{
inst->transferDef( addInst, Opnd_src2, Opnd_src1 );
if( addInst->getPredicate() )
{
inst->transferDef( addInst, Opnd_pred, Opnd_pred );
}
inst->transferUse( addInst );
inst->addDefUse(addInst, Opnd_src0);
}
++i;
}
}
}
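// Legalize send instructions: GRF-align dst/src0/src1, copy sources out of
// registers that violate EOT/preemption restrictions, and (under
// WaDisableSendSrcDstOverlap) copy dst or the sources when they overlap.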
void HWConformity::fixSendInst(BB_LIST_ITER it)
{
G4_BB* bb = *it;
for (INST_LIST_ITER i = bb->instList.begin(); i != bb->instList.end(); i++)
{
G4_INST *inst = *i;
if (!inst->isSend())
{
continue;
}
if (inst->getExecSize() < 8)
{
// A64 messages require a minimum msg len of two for address (src0), which is inconsistent
// with our input IR as it allows <2 GRF address variables (e.g., simd1 A64 scatter r/w).
// To avoid this causing overlap between send dst/src0/src1 (it is known to cause HW hang),
// we have to ensure they are all 2GRF-aligned
G4_Declare* src0Dcl = inst->getSrc(0)->getTopDcl();
// ToDo: check if dst/src1 may also exhibit such size mismatch
bool sizeMismatch = inst->getMsgDesc()->MessageLength() == 2 &&
(src0Dcl && src0Dcl->getRootDeclare()->getByteSize() < 2 * GENX_GRF_REG_SIZ);
auto doEvenAlign = [](G4_Declare* dcl)
{
if (dcl)
{
dcl = dcl->getRootDeclare();
// variables >= 2 GRF don't need even alignment since they can't possibly overlap
if (dcl->getByteSize() < 2 * GENX_GRF_REG_SIZ)
{
dcl->setAlign(G4_Align::Even);
}
}
};
if (sizeMismatch)
{
doEvenAlign(inst->getSrc(0)->getTopDcl());
if (inst->isSplitSend())
{
doEvenAlign(inst->getSrc(1)->getTopDcl());
}
if (VISA_WA_CHECK(builder.getPWaTable(), WaDisableSendSrcDstOverlap))
{
doEvenAlign(inst->getDst()->getTopDcl());
}
}
}
uint16_t offset = 0;
if (!builder.isOpndAligned(inst->getDst(), offset, GENX_GRF_REG_SIZ))
{
inst->setDest(insertMovAfter(i, inst->getDst(), inst->getDst()->getType(), bb, Sixteen_Word));
}
G4_Operand *src0 = inst->getSrc(0);
G4_Declare *src0TopDcl = src0->getTopDcl();
// if src0 and src1 are hard-wired GRF, check that
// they satisfy EOT and preemption restrictions
auto needsTempSrc = [this](G4_INST* inst, G4_Declare* dcl)
{
return dcl->getRegVar() && dcl->getRegVar()->getPhyReg() &&
((inst->isEOT() && builder.hasEOTGRFBinding() &&
dcl->getRegVar()->getPhyReg()->asGreg()->getRegNum() < 112) ||
(builder.getOption(vISA_enablePreemption) &&
dcl->getRegVar()->getPhyReg()->asGreg()->getRegNum() < 2));
};
if (needsTempSrc(inst, src0TopDcl))
{
uint16_t rows = inst->getMsgDesc()->MessageLength();
G4_Type type = src0->getType();
G4_Declare* dcl = builder.createTempVar(rows * 8, type, Either, Sixteen_Word);
MUST_BE_TRUE(G4_Type_Table[type].byteSize == 4, "Invalid src0 opnd type for send.");
RegionDesc* region = builder.getRegionStride1();
G4_VarBase *base = src0->asSrcRegRegion()->getBase();
short baseOff = src0->asSrcRegRegion()->getRegOff();
short baseSubOff = src0->asSrcRegRegion()->getSubRegOff();
for (uint16_t idx = 0; idx != rows; ++idx) {
G4_SrcRegRegion *src = builder.createSrcRegRegion(Mod_src_undef, Direct, base, baseOff + idx, baseSubOff + 0, region, type);
G4_DstRegRegion* dst = builder.createDstRegRegion(Direct, dcl->getRegVar(), idx, 0, 1, type);
G4_INST* newInst = builder.createInternalInst(NULL, G4_mov, NULL, false,
8, dst, src, NULL, InstOpt_WriteEnable,
inst->getLineNo(), inst->getCISAOff(), inst->getSrcFilename());
bb->instList.insert(i, newInst);
inst->transferDef(newInst, Opnd_src0, Opnd_src0);
newInst->addDefUse(inst, Opnd_src0);
}
G4_Operand *newSrc = builder.Create_Src_Opnd_From_Dcl(dcl, builder.getRegionStride1());
inst->setSrc(newSrc, 0);
}
if (inst->isSplitSend() && !inst->getSrc(1)->isNullReg())
{
// src1 may be null because some messages (e.g., CPS) require split send
if (!builder.isOpndAligned(inst->getSrc(1), GENX_GRF_REG_SIZ))
{
inst->setSrc(insertMovBefore(i, 1, inst->getSrc(1)->getType(), bb, Sixteen_Word), 1);
}
G4_Operand *src1 = inst->getSrc(1);
G4_Declare *src1TopDcl = src1->getTopDcl();
if (needsTempSrc(inst, src1TopDcl))
{
uint16_t rows = inst->getMsgDesc()->extMessageLength();
G4_Type type = src1->getType();
G4_Declare* dcl = builder.createTempVar(rows * 8, type, Either, Sixteen_Word);
MUST_BE_TRUE(G4_Type_Table[type].byteSize == 4, "Invalid src1 opnd type for send.");
RegionDesc* region = builder.getRegionStride1();
G4_VarBase *base = src1->asSrcRegRegion()->getBase();
short baseOff = src1->asSrcRegRegion()->getRegOff();
short baseSubOff = src1->asSrcRegRegion()->getSubRegOff();
for (uint16_t idx = 0; idx != rows; ++idx)
{
G4_SrcRegRegion *src = builder.createSrcRegRegion(Mod_src_undef, Direct, base, baseOff + idx, baseSubOff + 0, region, type);
G4_DstRegRegion* dst = builder.createDstRegRegion(Direct, dcl->getRegVar(), idx, 0, 1, type);
G4_INST* newInst = builder.createInternalInst(NULL, G4_mov, NULL, false,
8, dst, src, NULL, InstOpt_WriteEnable,
inst->getLineNo(), inst->getCISAOff(), inst->getSrcFilename());
bb->instList.insert(i, newInst);
inst->transferDef(newInst, Opnd_src1, Opnd_src1);
newInst->addDefUse(inst, Opnd_src1);
}
G4_Operand *newSrc = builder.Create_Src_Opnd_From_Dcl(dcl, region);
inst->setSrc(newSrc, 1);
}
}
if (builder.getOption(vISA_enablePreemption))
{
G4_DstRegRegion *dst = inst->getDst();
if (!dst->isNullReg())
{
G4_Declare *dstTopDcl = dst->getTopDcl();
if (dstTopDcl != NULL &&
dstTopDcl->getRegVar() &&
dstTopDcl->getRegVar()->getPhyReg())
{
MUST_BE_TRUE((dstTopDcl->getRegVar()->getPhyReg()->asGreg()->getRegNum() > 2), "Unexpected preg used for send destination.");
}
}
}
if (VISA_WA_CHECK(builder.getPWaTable(), WaDisableSendSrcDstOverlap))
{
// create copy if dst and src0/src1 overlap due to being the same variable
bool src0Overlap = inst->getDst()->compareOperand(inst->getSrc(0)) != Rel_disjoint;
bool src1Overlap = inst->isSplitSend() && inst->getDst()->compareOperand(inst->getSrc(1)) != Rel_disjoint;
if (src0Overlap || src1Overlap)
{
int dstSize = inst->getMsgDesc()->ResponseLength();
int src0Size = src0Overlap ? inst->getMsgDesc()->MessageLength() : 0;
int src1Size = src1Overlap ? inst->getMsgDesc()->extMessageLength() : 0;
if (dstSize > src0Size + src1Size)
{
//copy src0/src1
if (src0Overlap)
{
G4_Declare* copyDst = builder.createTempVar(src0Size * 8, Type_UD, Either, Any);
copyRegs(copyDst, 0, inst->getSrc(0)->getBase()->asRegVar()->getDeclare(),
inst->getSrc(0)->asSrcRegRegion()->getRegOff() * 32, src0Size, bb, i);
inst->setSrc(builder.Create_Src_Opnd_From_Dcl(copyDst, builder.getRegionStride1()), 0);
}
if (src1Overlap)
{
G4_Declare* copyDst = builder.createTempVar(src1Size * 8, Type_UD, Either, Any);
copyRegs(copyDst, 0, inst->getSrc(1)->getBase()->asRegVar()->getDeclare(),
inst->getSrc(1)->asSrcRegRegion()->getRegOff() * 32, src1Size, bb, i);
inst->setSrc(builder.Create_Src_Opnd_From_Dcl(copyDst, builder.getRegionStride1()), 1);
}
}
else
{
// copy dst
auto copyIter = i;
++copyIter;
G4_Declare* copySrc = builder.createTempVar(dstSize * 8, Type_UD, Either, Any);
copyRegs(inst->getDst()->getBase()->asRegVar()->getDeclare(), inst->getDst()->getRegOff() * 32,
copySrc, 0, dstSize, bb, copyIter);
inst->setDest(builder.Create_Dst_Opnd_From_Dcl(copySrc, 1));
}
}
}
}
}
//
// Fix sel and csel instructions:
// -- set their cond mod to null as they don't modify it. They will be hard-coded to f0.0 in Gen asm
void HWConformity::fixSelCsel(INST_LIST_ITER it, G4_BB* bb)
{
G4_INST* inst = *it;
if (inst->opcode() == G4_sel || inst->opcode() == G4_csel)
{
G4_CondMod *condMod = inst->getCondMod();
if (condMod)
{
condMod->setBase(nullptr);
}
}
}
void HWConformity::conformBB( BB_LIST_ITER it)
{
G4_BB *bb = *it;
INST_LIST_ITER i = bb->instList.begin(), iEnd = bb->instList.end();
INST_LIST_ITER next_iter = i;
for ( ; i != iEnd; i = next_iter )
{
// by default we skip the newly inserted instructions as we assume they are already HW conformed
// if a check may produce new instructions that violate HW rules, it must adjust the next_iter
// to point to them
++next_iter;
G4_INST *inst = *i;
G4_opcode opcode = inst->opcode();
if (opcode == G4_nop || opcode == G4_label)
{
continue;
}
// do this early since otherwise the moves inserted by other passes may still
// inherit bad regions from the original inst
fixSrcRegion(inst);
bool changed = fixMov(i, bb);
if (changed)
{
next_iter = i;
next_iter++;
}
fixOpndType(i, bb);
fixSelCsel(i, bb);
if (inst->getExecSize() == 16)
{
if (inst->opcode() == G4_math &&
inst->getDst()->getType() == Type_HF &&
inst->getSrc(0)->getType() == Type_HF &&
(!inst->getSrc(1) || inst->getSrc(1)->getType() == Type_HF))
{
// split pure HF math to simd8
evenlySplitInst(i, bb);
}
}
fix3SrcInst(i, bb);
G4_Operand *dst = inst->getDst();
#ifdef _DEBUG
verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif
/* HW Check #2
* First check sources for math instructions.
* math only uses GRFs as operands and sub register number should be the same
*/
if (inst->isMath())
{
if( fixMathInst( i, bb ) )
{
// check the newly added insts later
next_iter = i;
next_iter++;
}
}
inst = *i;
#ifdef _DEBUG
verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif
/* HW Check #3 */
if( inst->opcode() == G4_mul )
{
if( fixMULInst( i, bb ) )
{
// inserted mach and mov
// check the newly added insts later ( MUL, MACH, MOV )
next_iter = i;
next_iter++;
}
}
#ifdef _DEBUG
verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif
/* HW Check #3a */
if( inst->opcode() == G4_mulh )
{
fixMULHInst( i, bb );
// inserted mul before
// check the newly added MUL inst
i--;
next_iter = i;
continue;
}
#ifdef _DEBUG
verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif
// HW check #6: indirect operand spilling
fixIndirectOpnd( i, bb );
#ifdef _DEBUG
verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif
// HW check #8: unsigned dst with execution type F
/* If the execution type is F and the destination type is either UD, UW
* or UB and the destination is not saturated, then we need to add an
* intermediate type conversion to D.
*/
*/
inst = *i;
opcode = inst->opcode();
if (opcode == G4_cmp || opcode == G4_cmpn)
{
dst = inst->getDst();
int dst_elsize = 0;
bool null_dst = !dst || inst->hasNULLDst();
if (!null_dst)
{
dst_elsize = dst->isPredicate() ? G4_Type_Table[Type_UW].byteSize : G4_Type_Table[dst->getType()].byteSize;
}
int extypesize;
G4_Type extype = inst->getOpExecType( extypesize );
fixCompareInst( i, bb, extype, dst_elsize );
}
dst = inst->getDst();
#ifdef _DEBUG
verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif
{
int extypesize;
G4_Type extype = inst->getOpExecType( extypesize );
/*
* HW check #11: check destination type.
*
* (*) When source(s) is/are of float type, destination must be of float
* type also. The exception is MOV instruction which can be used
* for explicit type conversion between float and integer.
*/
if (dst != NULL &&
( ( opcode != G4_mov && IS_FTYPE( extype ) && !IS_FTYPE( dst->getType() ) ) ||
( IS_FTYPE(dst->getType()) && !IS_FTYPE(extype) && !Opcode_int_src_float_dst_OK( opcode ) ) ) )
{
if(fixDstType( i, bb, extype ))
{
next_iter = i;
next_iter++;
}
}
}
#ifdef _DEBUG
verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif
if (fixImplicitAcc(i, bb))
{
next_iter = i;
next_iter++;
}
if (fixAccSrc(i, bb))
{
next_iter = i;
next_iter++;
}
/* HW check #13: check acc source */
if ( (dst != NULL && dst->isAccReg()) || opcode == G4_mach )
{
if( fixAccDst( i, bb ) )
{
// TODO: we should fix inst with ACC src separately?
next_iter = i;
next_iter++;
}
}
{
dst = inst->getDst();
G4_Type extype = inst->getExecType2();
int extypesize = G4_Type_Table[extype].byteSize;
int dst_elsize = 0;
if (dst)
{
dst_elsize = G4_Type_Table[dst->getType()].byteSize;
}
#ifdef _DEBUG
verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif
/* HW check #15 : DST HS */
if (dst &&
inst->getExecSize() == 1 &&
dst_elsize < extypesize &&
!IS_VTYPE(extype) &&
!inst->isMixedMode())
{
fixDstHstride( i, extypesize );
}
}
#ifdef _DEBUG
verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif
bool planeDeleted = fixPlaneInst(i, bb);
if (planeDeleted)
{
continue;
}
fixLine(i, bb);
// CHV/BXT specific checks for 64b datatypes
fix64bInst( i, bb);
#ifdef _DEBUG
verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif
fixImm64( i, bb ); // fixed immediates for DF4 in fixImm64()
// FIXME: may be better to call fixDstAlign instead
if (getGenxPlatform() == GENX_BDW)
{
fixPackedHFConversions(i, bb);
}
}
}
//
// SIMD16 addc/subb are illegal on GEN, since they write to acc and there are only 8 acc
// channels for D/UD type. In vISA IR we should get something like
// addc (16) V0 V2 V3
// mov (16) V1 acc0<8;8,1>:ud
// which needs to be translated to
// addc (8) V0(0) V2(0) V3(0) {Q1}
// mov (8) V1(0) acc0<8;8,1>:ud {Q1}
// addc (8) V0(1) V2(1) V3(1) {Q2}
// mov (8) V1(1) acc0<8;8,1>:ud {Q2}
//
// We do this first thing in HW conformity to avoid REXES from splitting addc/subb incorrectly
// We also count on previous opts to preserve the inst pair by not inserting any acc-using inst in between;
// it should hopefully be the case since we generally don't optimize instructions with acc src/dst
//
// If exec size of addc is < 8, we also have to make sure both the addc's dst and the carry move's dst are
// GRF-aligned, since acc's channel is dependent on the dst's subreg offset. In other words, we fix
// addc (1) r1.0 ...
// mov (1) r1.1 acc0.0<0;1,0>
// into
// addc (1) r1.0 ...
// mov (1) r2.0 acc0.0<0;1,0>
// mov (1) r1.1 r2.0
//
//
bool HWConformity::fixAddcSubb(G4_BB* bb)
{
bool changed = false;
for (auto iter = bb->instList.begin(), iterEnd = bb->instList.end();
iter != iterEnd; ++iter)
{
G4_INST* inst = *iter;
if ((inst->opcode() == G4_addc || inst->opcode() == G4_subb) &&
inst->getExecSize() != 8)
{
// find the matching carry move
G4_INST* carryMov = NULL;
auto movIter = iter;
for (++movIter; movIter != iterEnd; ++movIter)
{
G4_INST* inst2 = *movIter;
if (inst2->opcode() == G4_mov && inst2->getExecSize() == inst->getExecSize() &&
inst2->getSrc(0)->isAccReg() && inst2->getSrc(0)->getType() == Type_UD)
{
carryMov = inst2;
break;
}
else if (inst2->useAcc())
{
break;
}
}
if (carryMov == NULL)
{
// can't find the move using acc, skip this addc/subb
continue;
}
if (inst->getExecSize() == 16)
{
evenlySplitInst(iter, bb);
evenlySplitInst(movIter, bb);
// movIter now points to the second half of move, and we want to move the first move to be
// before the second half of the addc/subb, which is pointed by iter
--movIter;
G4_INST* mov1 = *movIter;
bb->instList.erase(movIter);
bb->instList.insert(iter, mov1);
changed = true;
}
else
{
// we will need to GRF-align addc's dst as well as the move dst,
// so that the acc will have the correct offset
// note that insertMovAfter will align the tmp since addc/subb has implicit acc use
if (!builder.isOpndAligned(inst->getDst(), 32))
{
inst->setDest(
insertMovAfter(iter, inst->getDst(), inst->getDst()->getType(), bb));
changed = true;
}
if (!builder.isOpndAligned(carryMov->getDst(), 32))
{
carryMov->setDest(
insertMovAfter(movIter, carryMov->getDst(), carryMov->getDst()->getType(), bb));
changed = true;
}
}
}
}
return changed;
}
void HWConformity::chkHWConformity()
{
fixDataLayout();
for (BB_LIST_ITER it = kernel.fg.BBs.begin(); it != kernel.fg.BBs.end();it++)
{
// hw conformity #1
#ifdef _DEBUG
verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif
fixAddcSubb(*it);
#ifdef _DEBUG
verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif
fixMADInst( it );
#ifdef _DEBUG
verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif
// fix source operand first to avoid redundant MOVs if this fix is done after
// reducing execution size.
// used by 3d. Mainly to fix sel with two imm sources
fixOpndTypeAlign( *it );
#ifdef _DEBUG
verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif
if (builder.getOption(vISA_accSubstitution) &&
!builder.getOption(vISA_doAccSubAfterSchedule))
{
accSubstitution(*it);
}
#ifdef _DEBUG
verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif
fixInstExecSize( it );
#ifdef _DEBUG
verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif
fixMixedHFInst( it );
#ifdef _DEBUG
verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif
fixSADA2Inst( it );
#ifdef _DEBUG
verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif
fixSendInst( it );
#ifdef _DEBUG
verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif
conformBB(it);
#ifdef _DEBUG
verifyG4Kernel(kernel, Optimizer::PI_HWConformityChk, false);
#endif
}
}
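// returns true if any src region is of Width/Height (VxH) form, has the max
// horizontal stride with width > 1, or is an indirect region whose second
// address immediate would exceed G4_MAX_ADDR_IMM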
bool HWConformity::hasBadRegion( G4_INST *inst )
{
if( inst->getImplAccDst() || inst->getImplAccSrc() )
return false;
bool badRegion = false;
for( unsigned int srcNum = 0; srcNum < G4_Inst_Table[inst->opcode()].n_srcs; srcNum++ )
{
if( !(inst->getSrc(srcNum)->isSrcRegRegion()) )
{
continue;
}
RegionDesc *rd = inst->getSrc(srcNum)->asSrcRegRegion()->getRegion();
if( rd->isRegionWH() )
{
badRegion = true;
break;
}
if( rd->horzStride == GENX_MAX_H_STRIDE && rd->width > 1 )
{
badRegion = true;
break;
}
G4_SrcRegRegion *expandSrcRegion = inst->getSrc(srcNum)->asSrcRegRegion();
if( expandSrcRegion->getRegAccess() != Direct )
{
RegionDesc* origRegion = expandSrcRegion->getRegion();
short secondSubRegOffDiff = 0, secondAddrImmedDiff = 0;
if( origRegion->width == 1 )
{
secondSubRegOffDiff = origRegion->vertStride;
}
else
{
secondSubRegOffDiff = origRegion->horzStride;
}
secondAddrImmedDiff = (short) (secondSubRegOffDiff * G4_Type_Table[expandSrcRegion->getType()].byteSize);
if( (expandSrcRegion->getAddrImm() + secondAddrImmedDiff) > G4_MAX_ADDR_IMM )
{
badRegion = true;
break;
}
}
}
return badRegion;
}
// check if we can split an inst
bool HWConformity::canSplitInst( G4_INST *inst, G4_INST *use_op )
{
if( ( inst->getPredicate() && inst->getExecSize() < 16 ) || hasBadRegion( inst ) )
return false;
bool condModIsUsed = false;
// make sure cond mod is only used by use_op
G4_CondMod *condMod = inst->getCondMod();
if( condMod )
{
for (auto use_iter = inst->use_begin(); use_iter != inst->use_end(); use_iter++)
{
if( (*use_iter).first == use_op )
{
if( (*use_iter).second != Opnd_pred )
{
condModIsUsed = true;
break;
}
continue;
}
G4_CmpRelation rel = Rel_disjoint;
if( (*use_iter).second == Opnd_pred )
{
rel = condMod->compareOperand( (*use_iter).first->getPredicate() );
}
else if( (*use_iter).second == Opnd_dst )
{
G4_Operand *use = (*use_iter).first->getDst();
if( use->isFlag() )
rel = condMod->compareOperand( use );
}
else
{
G4_Operand *use = (*use_iter).first->getSrc( (*use_iter).second - 1 );
if( use->isFlag() )
rel = condMod->compareOperand( use );
}
if( rel != Rel_disjoint )
{
condModIsUsed = true;
break;
}
}
}
if( condModIsUsed )
{
return false;
}
for (int i = 0; i < inst->getNumSrc(); i++)
{
G4_Operand *src = inst->getSrc(i);
if (src->isAccReg())
{
// don't split inst with explicit acc
return false;
}
}
return true;
}
bool HWConformity::canSplitByteDst( G4_opcode op )
{
switch( op )
{
case G4_mac:
case G4_mach:
case G4_cmp:
case G4_mad:
case G4_sad2:
case G4_sada2:
case G4_line:
case G4_send:
case G4_sendc:
return false;
default:
return true;
}
}
// split one instruction into 2 if its destination is packed byte and execution type is W.
// for example:
// add (16) V1(0,0)<1>:b V1(0,0)<16;16,1>:w V2(0,0)<16;16,1>:w
// ==>
// add (8) V1(0,0)<2>:b V1(0,0)<16;8,2>:w V2(0,0)<16;8,2>:w
// add (8) V1(0,1)<2>:b V1(0,1)<16;8,2>:w V2(0,1)<16;8,2>:w
// if a predicate is used by the instruction, the definition of this predicate is tracked and the
// corresponding instruction is checked to see if it can be split the same way.
bool HWConformity::splitInstListForByteDst( INST_LIST_ITER it, G4_BB *bb, uint16_t extypesize )
{
G4_INST *inst = *it;
G4_opcode inst_op = inst->opcode();
G4_DstRegRegion *dst = inst->getDst();
// check if we can split the inst
if( !canSplitByteDst( inst_op ) ||
inst->getExecSize() == 1 ||
( bb->isInSimdFlow() && !inst->isWriteEnableInst() ) ||
dst->getByteOffset() % extypesize != 0 ||
dst->getHorzStride() != 1 ||
extypesize != G4_Type_Table[Type_W].byteSize)
{
return false;
}
if (inst->getPredicate() || inst->getCondMod())
{
return false;
}
// recursively check whether the inst that defines its predicate can be split as well
INST_LIST expandOpList;
bool canSplit = canSplitInst( inst, NULL );
if( canSplit )
{
expandOpList.push_back( inst );
}
G4_INST *currInst = inst;
while( canSplit && currInst->getPredicate() )
{
// look for predicate def inst
uint16_t defNum = 0;
G4_INST *defInst = NULL;
// FIXME: should be currInst->defInstList.begin()?
for (auto def_iter = inst->def_begin(); def_iter != inst->def_end(); def_iter++)
{
if( (*def_iter).second == Opnd_pred )
{
defNum++;
defInst = (*def_iter).first;
}
}
if( defNum != 1 || !defInst->getCondMod() )
{
canSplit = false;
break;
}
if( canSplit )
{
if( bb->isInSimdFlow() && !defInst->isWriteEnableInst() )
{
canSplit = false;
}
else
{
canSplit = canSplitInst( defInst, currInst );
}
}
// check if def inst can be split
if( !canSplit )
{
break;
}
else
{
expandOpList.push_back( defInst );
currInst = defInst;
}
}
// split inst into two
INST_LIST_ITER new_iter = it;
new_iter++;
if( canSplit )
{
while( !expandOpList.empty() )
{
G4_INST *expand_op = expandOpList.front();
expandOpList.pop_front();
// find location of expand_op in instruction list
do
{
new_iter--;
if( (*new_iter) == expand_op )
{
break;
}
}while( new_iter != bb->instList.begin() );
MUST_BE_TRUE( new_iter != bb->instList.end(), "Cannot find predicate definition instruction in BB." );
new_iter++;
G4_INST *secondHalfOp = splitInstWithByteDst( expand_op );
MUST_BE_TRUE( secondHalfOp, "Error in splitting instruction." );
bb->instList.insert( new_iter, secondHalfOp );
}
}
return canSplit;
}
G4_INST* HWConformity::splitInstWithByteDst( G4_INST *expand_op )
{
unsigned char newExecSize = expand_op->getExecSize()/2;
if( expand_op->getPredicate() )
{
expand_op->getPredicate()->splitPred();
}
if( expand_op->getCondMod() )
{
expand_op->getCondMod()->splitCondMod();
}
G4_INST *expand_sec_half_op = builder.createInternalInst(
(G4_Predicate *)builder.duplicateOperand( expand_op->getPredicate() ),
expand_op->opcode(),
(G4_CondMod *)builder.duplicateOperand( expand_op->getCondMod() ),
expand_op->getSaturate(),
newExecSize,
NULL,
NULL,
NULL,
NULL,
expand_op->getOption(),
expand_op->getLineNo(),
expand_op->getCISAOff(),
expand_op->getSrcFilename() );
MUST_BE_TRUE( expand_sec_half_op != NULL, ERROR_MEM_ALLOC );
expand_op->setExecSize( newExecSize );
if( expand_op->getDst() && !expand_op->hasNULLDst() )
{
G4_DstRegRegion *old_dst = expand_op->getDst();
short secondSubRegOff = old_dst->getSubRegOff() + 1;
G4_DstRegRegion *newDstOpnd = builder.createDstRegRegion(
old_dst->getRegAccess(),
old_dst->getBase(),
old_dst->getRegOff(),
old_dst->getSubRegOff(),
old_dst->getHorzStride() * 2,
old_dst->getType() );
if( old_dst->getRegAccess() != Direct )
{
newDstOpnd->setImmAddrOff(old_dst->getAddrImm() );
secondSubRegOff -= 1;
}
expand_op->setDest( newDstOpnd );
G4_DstRegRegion *secondDstOpnd = builder.createDstRegRegion(
old_dst->getRegAccess(),
old_dst->getBase(),
old_dst->getRegOff(),
secondSubRegOff,
old_dst->getHorzStride() * 2,
old_dst->getType());
if( old_dst->getRegAccess() != Direct )
{
secondDstOpnd->setImmAddrOff( old_dst->getAddrImm() + 1 );
}
expand_sec_half_op->setDest( secondDstOpnd );
}
else
{
expand_sec_half_op->setDest( expand_op->getDst() );
}
for( int k = 0; k < G4_Inst_Table[expand_op->opcode()].n_srcs; k++ )
{
G4_Operand *expand_src = expand_op->getSrc(k);
if (!expand_src)
continue;
if ((expand_op->isMath() && k == 1 && expand_src->isNullReg()) ||
expand_src->isImm()) {
expand_sec_half_op->setSrc(expand_src, k);
} else if (expand_src->isSrcRegRegion()) {
G4_SrcRegRegion *expandSrcRegion = expand_src->asSrcRegRegion();
if (expandSrcRegion->isScalar()) {
expand_sec_half_op->setSrc(builder.duplicateOperand(expand_src), k);
} else {
short secondSubRegOffDiff = 0, secondAddrImmedDiff = 0;
RegionDesc* origRegion = expandSrcRegion->getRegion();
RegionDesc* newRegion = NULL;
if( origRegion->width == 1 )
{
newRegion = builder.createRegionDesc( origRegion->vertStride * 2, origRegion->width, origRegion->horzStride );
secondSubRegOffDiff = origRegion->vertStride;
}
else
{
unsigned short newWD = origRegion->width/2;
secondSubRegOffDiff = origRegion->horzStride;
newRegion = builder.createRegionDesc( (newWD == 1 && newExecSize == 1) ? 0 : origRegion->vertStride,
newWD, (newWD== 1) ? 0 : origRegion->horzStride * 2 );
}
secondAddrImmedDiff = (short) (secondSubRegOffDiff * G4_Type_Table[expand_src->getType()].byteSize);
expandSrcRegion->setRegion( newRegion );
bool directSrc = ( expandSrcRegion->getRegAccess() == Direct );
if( secondAddrImmedDiff >= GENX_GRF_REG_SIZ )
{
secondSubRegOffDiff = (short) (( secondAddrImmedDiff - GENX_GRF_REG_SIZ ) / G4_Type_Table[expand_src->getType()].byteSize);
}
G4_SrcRegRegion *secondSrcOpnd = builder.createSrcRegRegion(
expandSrcRegion->getModifier(),
expandSrcRegion->getRegAccess(),
expandSrcRegion->getBase(),
expandSrcRegion->getRegOff() + ( ( directSrc && secondAddrImmedDiff >= GENX_GRF_REG_SIZ ) ? 1 : 0 ),
expandSrcRegion->getSubRegOff() + ( directSrc ? secondSubRegOffDiff : 0 ),
newRegion,
expandSrcRegion->getType());
if (expandSrcRegion->getRegAccess() != Direct)
{
secondSrcOpnd->setImmAddrOff( expandSrcRegion->getAddrImm() + secondAddrImmedDiff );
}
expand_sec_half_op->setSrc( secondSrcOpnd, k );
}
}
}
expand_sec_half_op->setLineNo(expand_op->getLineNo());
if (expand_op->getPredicate() || expand_op->getCondMod())
{
if (expand_op->getMaskOffset() == 0)
{
expand_sec_half_op->setMaskOption(InstOpt_M8);
}
else if(expand_op->getMaskOffset() == 16)
{
expand_sec_half_op->setMaskOption(InstOpt_M24);
}
else if(!( expand_op->opcode() == G4_sel && !(expand_op->getPredicate()) && expand_op->getCondMod()))
{
expand_sec_half_op->setMaskOption( newExecSize > 8 ? InstOpt_M16 : InstOpt_M8 );
}
}
return expand_sec_half_op;
}
// Fix up src regions in instructions:
// In the component uncompressed instruction uop,
// if exec size(uop) == width(src(uop)) && hstride(src(uop)) != 0
// vstride(src(uop)) = width(src(uop)) * hstride(src(uop))
// In the compressed instruction op,
// if exec size(op) == width(src(op)) == vstride(src(op)) &&
// hstride(src(op)) == 1
// width(src(op)) = vstride(src(op)) = exec size(op) / 2
// e.g.
// mul (32) r60.0<1>:uw r76.0<32;16,1>:ub r78.0<0;1,0>:ub
// =>
// mul (32) r60.0<1>:uw r76.0<16;16,1>:ub r78.0<0;1,0>:ub
// add (16) r60.0<1>:d r76.0<16;16,1>:d r78.0<0;1,0>:d
// =>
// add (16) r60.0<1>:d r76.0<8;8,1>:d r78.0<0;1,0>:d
// in addition, fix the source region to follow the region restriction:
// 1. ExecSize must be greater than or equal to Width. -- no check for this one
// 2. If ExecSize = Width and HorzStride != 0, VertStride must be set to Width * HorzStride.
// 3. If ExecSize = Width and HorzStride = 0, there is no restriction on VertStride.
// 4. If Width = 1, HorzStride must be 0 regardless of the values of ExecSize and VertStride.
// 5. If ExecSize = Width = 1, both VertStride and HorzStride must be 0. This defines a scalar.
// 6. If VertStride = HorzStride = 0, Width must be 1 regardless of the value of ExecSize.
// 7. Dst.HorzStride must not be 0. -- this need not be checked here.
// 8. VertStride must be used to cross GRF register boundaries. This rule implies that
// elements within a 'Width' cannot cross GRF boundaries.
void HWConformity::fixSrcRegion( G4_INST *inst )
{
bool comprInst = isCompressedInst( inst );
for (int i = 0; i < G4_MAX_SRCS; i++)
{
if (inst->getSrc(i) && inst->getSrc(i)->isSrcRegRegion() && !inst->getSrc(i)->isNullReg())
{
G4_SrcRegRegion *src = inst->getSrc(i)->asSrcRegRegion();
RegionDesc* srcRegion = src->getRegion();
if( srcRegion->isRegionWH() || srcRegion->isRegionV() || srcRegion->isRegionSW() )
continue;
uint16_t vs = srcRegion->vertStride, wd = srcRegion->width, hs = srcRegion->horzStride;
uint8_t exSize = inst->getExecSize();
MUST_BE_TRUE( inst->isSend() || exSize >= wd, " Bad source region: Width is greater than execution size." );
if ( comprInst )
{
if (G4_Type_Table[inst->getSrc(i)->getType()].byteSize > G4_WSIZE &&
wd == exSize &&
vs == wd && hs == 1)
{
vs = wd = exSize / 2;
}
}
if( wd == exSize && hs != 0 && vs != wd * hs )
{
vs = wd * hs;
}
if( wd == 1 )
{
hs = 0;
if( 1 == exSize )
vs = 0;
}
if( vs == 0 && hs == 0 )
{
wd = 1;
}
if( hs == 0 &&
((G4_Type_Table[inst->getSrc(i)->getType()].byteSize == G4_WSIZE &&
exSize == 32 && vs == 32 && wd == 32) ||
(G4_Type_Table[inst->getSrc(i)->getType()].byteSize == G4_DSIZE &&
exSize == 16 && vs == 16 && wd == 16)) )
{
vs = 0;
wd = 1;
}
// check cross GRF (rule 2H)
// TODO! for the following two cases, split the instruction:
// source region is like<8;4,1>
// source region is like<2;4,1>
if( src->getRegAccess() == Direct && src->crossGRF() && hs != 0)
{
// TODO: this is a temp fix
if( (getGenxPlatform() == GENX_BDW || getGenxPlatform() == GENX_CHV) && vs < wd * hs )
continue;
// check number of elements in first GRF.
uint16_t execTypeSize = hs * src->getElemSize();
uint16_t sizeInFirstGRF = GENX_GRF_REG_SIZ - src->getLeftBound() % GENX_GRF_REG_SIZ;
uint16_t vertSize = vs * G4_Type_Table[src->getType()].byteSize;
uint16_t numEle = ( sizeInFirstGRF + execTypeSize - 1 ) / execTypeSize;
uint16_t rowSize = wd * execTypeSize;
if( sizeInFirstGRF <= vertSize )
{
if( numEle >= wd )
{
numEle = wd;
}
}
else if( vs > wd )
{
numEle = sizeInFirstGRF/vertSize * wd +
(( sizeInFirstGRF%vertSize > rowSize ) ? wd : ( sizeInFirstGRF%vertSize + execTypeSize - 1 ) / execTypeSize );
}
// wd is used to cross GRF, change to <hs;1,0>
if( numEle < wd || ( wd >= vs && numEle % wd != 0 ) )
{
wd = 1;
if( hs == 0 )
{
vs = 1;
}
else
{
vs = hs;
}
hs = 0;
}
}
if( vs != srcRegion->vertStride || wd != srcRegion->width || hs != srcRegion->horzStride )
{
G4_SrcRegRegion *origSrc = inst->getSrc(i)->asSrcRegRegion();
origSrc->setRegion( builder.createRegionDesc( vs, wd, hs ) );
}
}
}
if( inst->getDst() && !inst->hasNULLDst() )
{
MUST_BE_TRUE( inst->getDst()->getHorzStride() != 0,
"Bad destination region: HorzStride must not be 0." );
}
}
//
//single entry point for HW conformity checks
//
void HWConformityChk(IR_Builder& builder, G4_Kernel& kernel, Mem_Manager& mem )
{
HWConformity conformity( builder, kernel, mem );
conformity.chkHWConformity();
}
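// The packed-byte widening below (markPackedByteReference, fixPackedByteReference,
// and fixDataLayout) doubles the size of byte declares that are only referenced
// as packed bytes under W execution type, so that each byte element can instead
// be accessed with a stride of 2. An illustrative effect on one instruction,
// assuming V1 is a local, non-aliased byte declare:
// add (8) V1(0,0)<1>:b V2(0,0)<8;8,1>:w V3(0,0)<8;8,1>:w
// =>
// add (8) V1(0,0)<2>:b V2(0,0)<8;8,1>:w V3(0,0)<8;8,1>:w (V1 now twice as large)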
bool HWConformity::markPackedByteReference(G4_Kernel& kernel, G4_Operand* opnd, G4_INST* inst)
{
G4_Declare *dcl = NULL, *topdcl = NULL;
bool foundOptCandidate = false;
if ((opnd->isSrcRegRegion() || opnd->isDstRegRegion()))
{
if (opnd->getBase() && opnd->getBase()->isRegVar())
{
dcl = opnd->getBase()->asRegVar()->getDeclare();
topdcl = dcl->getRootDeclare();
}
}
if (topdcl != NULL &&
topdcl->getRegFile() == G4_GRF &&
!(topdcl->getAddressed()))
{
if (topdcl->doNotWiden() || inst->isSend())
{
//send has no regioning so it is certainly illegal to change data layout
setAccessPattern(topdcl, ACCESS_PATTERN_INVALID);
return false;
}
if (opnd->isDstRegRegion() &&
// check if the opnd has a pre-assigned physical register
!(opnd->asDstRegRegion()->getBase()->asRegVar()->isPhyRegAssigned()) &&
// check if the opnd is global
!(kernel.fg.globalOpndHT.isOpndGlobal(opnd)) &&
// check if the opnd is used as packed byte
G4_Type_Table[opnd->getType()].byteSize == 1 &&
dcl->getElemSize() == 1 &&
opnd->asDstRegRegion()->getHorzStride() == 1 &&
// check if the instruction is a raw mov
!inst->isRawMov() &&
// check if the instruction execution type is word
// (This should be the most common case that can benefit
// from this optimization. It could be extended to other
// cases like D execution type).
G4_Type_Table[inst->getExecType()].byteSize == 2 )
{
unsigned int leftBound = opnd->asDstRegRegion()->getLeftBound();
unsigned int rightBound = opnd->asDstRegRegion()->getRightBound();
if (((rightBound*2/G4_GRF_REG_NBYTES - leftBound*2/G4_GRF_REG_NBYTES) > 1) ||
(getGenxPlatform() == GENX_BDW &&
(rightBound*2/G4_GRF_REG_NBYTES != leftBound*2/G4_GRF_REG_NBYTES)))
{
setAccessPattern(topdcl, ACCESS_PATTERN_INVALID);
}
else if (getAccessPattern(topdcl) == ACCESS_PATTERN_UNDEF)
{
setAccessPattern(topdcl, ACCESS_PATTERN_PACKED_BYTE);
foundOptCandidate = true;
}
}
else if (opnd->isSrcRegRegion() &&
// check if the opnd has a pre-assigned physical register
!(opnd->asSrcRegRegion()->getBase()->asRegVar()->isPhyRegAssigned()) &&
// check if the opnd is global
!(kernel.fg.globalOpndHT.isOpndGlobal(opnd)) &&
// check if the opnd is used as packed byte
G4_Type_Table[opnd->getType()].byteSize == 1 &&
dcl->getElemSize() == 1 &&
opnd->asSrcRegRegion()->getRegion()->isContiguous(inst->getExecSize()))
{
unsigned int leftBound = opnd->asSrcRegRegion()->getLeftBound();
unsigned int rightBound = opnd->asSrcRegRegion()->getRightBound();
if (((rightBound*2/G4_GRF_REG_NBYTES - leftBound*2/G4_GRF_REG_NBYTES) > 1) ||
(getGenxPlatform() == GENX_BDW &&
(rightBound*2/G4_GRF_REG_NBYTES != leftBound*2/G4_GRF_REG_NBYTES)))
{
setAccessPattern(topdcl, ACCESS_PATTERN_INVALID);
}
}
else
{
setAccessPattern(topdcl, ACCESS_PATTERN_INVALID);
}
}
return foundOptCandidate;
}
G4_Operand* HWConformity::fixPackedByteReference(IR_Builder& builder, G4_Operand* opnd)
{
G4_Operand* newOpnd = NULL;
G4_Declare* topdcl = NULL;
if (opnd->isDstRegRegion() ||
opnd->isSrcRegRegion())
{
topdcl = GetTopDclFromRegRegion(opnd);
}
if (topdcl != NULL &&
getAccessPattern(topdcl) == ACCESS_PATTERN_PACKED_BYTE)
{
if (opnd->isDstRegRegion())
{
short dst_regoff = opnd->asDstRegRegion()->getRegOff();
short dst_subregoff = opnd->asDstRegRegion()->getSubRegOff();
short off = (dst_regoff * G4_GRF_REG_NBYTES + dst_subregoff) * 2;
dst_regoff = off / G4_GRF_REG_NBYTES;
dst_subregoff = off % G4_GRF_REG_NBYTES;
G4_DstRegRegion* newDstOpnd = builder.createDstRegRegion(
Direct,
opnd->getBase()->asRegVar(),
dst_regoff,
dst_subregoff,
2,
opnd->getType());
newOpnd = newDstOpnd;
}
else if (opnd->isSrcRegRegion())
{
short src_regoff = opnd->asSrcRegRegion()->getRegOff();
short src_subregoff = opnd->asSrcRegRegion()->getSubRegOff();
short off = (src_regoff * G4_GRF_REG_NBYTES + src_subregoff) * 2;
src_regoff = off / G4_GRF_REG_NBYTES;
src_subregoff = off % G4_GRF_REG_NBYTES;
RegionDesc *rd = builder.getRegionStride2();
G4_SrcRegRegion* newSrcOpnd = builder.createSrcRegRegion(opnd->asSrcRegRegion()->getModifier(),
Direct,
opnd->getBase()->asRegVar(),
src_regoff,
src_subregoff,
rd,
opnd->getType());
newOpnd = newSrcOpnd;
}
}
return newOpnd;
}
void HWConformity::fixDataLayout( )
{
bool changeDataLayout = false;
for (auto &bb : kernel.fg.BBs)
{
for (auto &inst : bb->instList)
{
if (G4_Inst_Table[inst->opcode()].n_dst == 1)
{
G4_Operand* dst = inst->getDst();
if (dst)
{
bool foundOptCandidate = markPackedByteReference(kernel, dst, inst);
if (changeDataLayout == false && foundOptCandidate)
{
changeDataLayout = true;
}
}
}
for (int i = 0; i < G4_Inst_Table[inst->opcode()].n_srcs; i++)
{
G4_Operand* src = inst->getSrc(i);
if (src)
{
markPackedByteReference(kernel, src, inst);
}
}
}
}
if (changeDataLayout)
{
for (auto &dcl : kernel.Declares)
{
G4_Declare* topdcl = dcl->getRootDeclare();
if (getAccessPattern(topdcl) == ACCESS_PATTERN_PACKED_BYTE)
{
dcl->setTotalElems(dcl->getTotalElems() * 2);
if (dcl != topdcl)
{
G4_Declare* aliasDcl = dcl->getAliasDeclare();
unsigned int aliasOffset = dcl->getAliasOffset();
dcl->setAliasDeclare(aliasDcl, aliasOffset * 2);
}
}
}
for (auto &bb : kernel.fg.BBs)
{
for (auto &inst : bb->instList)
{
if (G4_Inst_Table[inst->opcode()].n_dst == 1)
{
G4_Operand* dst = inst->getDst();
G4_Operand* newDst = NULL;
if (dst)
{
newDst = fixPackedByteReference(builder, dst);
if (newDst)
{
inst->setDest(newDst->asDstRegRegion());
}
}
}
for (int i = 0; i < inst->getNumSrc(); i++)
{
G4_Operand* src = inst->getSrc(i);
G4_Operand* newSrc = NULL;
if (src)
{
newSrc = fixPackedByteReference(builder, src);
if (newSrc)
{
inst->setSrc(newSrc, i);
}
}
}
}
}
}
}
// maintain def-use chain for current inst and the MOV inst generated for its dst
void HWConformity::maintainDU4TempMov( G4_INST *inst, G4_INST *newInst )
{
if (newInst->getPredicate())
{
inst->transferDef(newInst, Opnd_pred, Opnd_pred);
}
inst->transferUse(newInst);
inst->addDefUse(newInst, Opnd_src0);
}
static void expandPlaneMacro(IR_Builder& builder, INST_LIST_ITER it, G4_BB* bb, bool secondHalf)
{
G4_INST* inst = *it;
G4_DstRegRegion* dst = inst->getDst();
G4_SrcRegRegion* src0 = inst->getSrc(0)->asSrcRegRegion();
G4_SrcRegRegion* src1 = inst->getSrc(1)->asSrcRegRegion();
G4_SrcRegRegion* srcP = builder.createSrcRegRegion(src0->getModifier(), Direct, src0->getBase(),
src0->getRegOff(), src0->getSubRegOff(), builder.getRegionScalar(), src0->getType());
G4_SrcRegRegion* srcQ = builder.createSrcRegRegion(src0->getModifier(), Direct, src0->getBase(),
src0->getRegOff(), src0->getSubRegOff() + 1, builder.getRegionScalar(), src0->getType());
G4_SrcRegRegion* srcR = builder.createSrcRegRegion(src0->getModifier(), Direct, src0->getBase(),
src0->getRegOff(), src0->getSubRegOff() + 3, builder.getRegionScalar(), src0->getType());
G4_SrcRegRegion* u = builder.duplicateOperand(src1);
u->setRegOff(u->getRegOff() + (secondHalf ? 2 : 0));
G4_SrcRegRegion* v = builder.duplicateOperand(src1);
v->setRegOff(v->getRegOff() + (secondHalf ? 3 : 1));
uint32_t options = inst->getOption();
if (inst->getExecSize() == 16)
{
options &= ~InstOpt_QuarterMasks;
int maskOffset = inst->getMaskOffset() + (secondHalf ? 8 : 0);
switch (maskOffset)
{
case 0:
options |= InstOpt_M0;
break;
case 8:
options |= InstOpt_M8;
break;
case 16:
options |= InstOpt_M16;
break;
case 24:
options |= InstOpt_M24;
break;
default:
MUST_BE_TRUE(false, "unexpected offset value");
}
}
G4_Declare* tmpVal = builder.hasNFType() ? nullptr : builder.createTempVar(8, Type_F, Either, Any);
G4_DstRegRegion* accDst = builder.hasNFType() ?
builder.createDstRegRegion(Direct, builder.phyregpool.getAcc0Reg(), 0, 0, 1, Type_NF) :
builder.Create_Dst_Opnd_From_Dcl(tmpVal, 1);
G4_INST* madInst = builder.createInternalInst(nullptr, G4_mad, nullptr, false, 8, accDst,
srcR, u, srcP, options);
bb->instList.insert(it, madInst);
G4_Predicate* pred = inst->getPredicate() ? builder.duplicateOperand(inst->getPredicate()) : nullptr;
G4_CondMod* condMod = inst->getCondMod() ? builder.duplicateOperand(inst->getCondMod()) : nullptr;
G4_SrcRegRegion* accSrc = builder.hasNFType() ?
builder.createSrcRegRegion(Mod_src_undef, Direct, builder.phyregpool.getAcc0Reg(), 0, 0, builder.getRegionStride1(), Type_NF) :
builder.Create_Src_Opnd_From_Dcl(tmpVal, builder.getRegionStride1());
G4_DstRegRegion* newDst = builder.createDstRegRegion(Direct, dst->getBase(),
dst->getRegOff() + (secondHalf ? 1 : 0), dst->getSubRegOff(), dst->getHorzStride(), dst->getType());
G4_INST* secondMadInst = builder.createInternalInst(pred, G4_mad, condMod, inst->getSaturate(), 8, newDst,
accSrc, v, srcQ, options);
bb->instList.insert(it, secondMadInst);
}
// Replace plane with a macro sequence:
// pln dest:f src0:f src1:f
// -->
// mad acc0:nf src0.3:f src1:f src0.0:f
// mad dest:f acc0:nf src1+1:f src0.1:f
// simd16 pln also needs to be split as the macro is simd8 only
void HWConformity::expandPlaneInst(INST_LIST_ITER it, G4_BB* bb)
{
G4_INST* inst = *it;
MUST_BE_TRUE(inst->opcode() == G4_pln, "expect a plane inst");
MUST_BE_TRUE(inst->getSrc(0)->isSrcRegRegion(), "src0 must be source reg region");
MUST_BE_TRUE(inst->getExecSize() == 8 || inst->getExecSize() == 16, " only size 8 and 16 are supported");
G4_DstRegRegion* dst = inst->getDst();
if (dst->getRegAccess() == IndirGRF || dst->getHorzStride() > 1)
{
inst->setDest(insertMovAfter(it, dst, dst->getType(), bb));
}
G4_SrcRegRegion* src0 = inst->getSrc(0)->asSrcRegRegion();
if (src0->getRegAccess() == IndirGRF)
{
// insert move to make src0 direct
inst->setSrc(insertMovBefore(it, 0, src0->getType(), bb), 0);
}
G4_SrcRegRegion* src1 = inst->getSrc(1)->asSrcRegRegion();
if (src1->getRegAccess() == IndirGRF)
{
// insert move to make src1 direct
inst->setSrc(insertMovBefore(it, 1, src1->getType(), bb), 1);
}
expandPlaneMacro(builder, it, bb, false);
if (inst->getExecSize() == 16)
{
expandPlaneMacro(builder, it, bb, true);
}
it = bb->instList.erase(it);
}
// HW does not support pln with a non-packed dst.
// also fix up plane sources, which don't support modifiers
// returns true if the original plane is deleted
bool HWConformity::fixPlaneInst(INST_LIST_ITER it, G4_BB* bb)
{
G4_INST* inst = *it;
if (inst->opcode() == G4_pln)
{
if (!builder.doPlane())
{
expandPlaneInst(it, bb);
return true;
}
G4_DstRegRegion* dst = inst->getDst();
if (dst->getHorzStride() != 1)
{
G4_DstRegRegion *newDst = insertMovAfter(it, dst, dst->getType(), bb);
inst->setDest(newDst);
}
G4_Operand* src0 = inst->getSrc(0);
G4_Operand* src1 = inst->getSrc(1);
// Source modifiers are not supported for pln instruction
if (src0 &&
((src0->isSrcRegRegion() &&
src0->asSrcRegRegion()->getModifier() != Mod_src_undef) ||
!builder.isOpndAligned(src0, 16)))
{
// src0 needs a temp
G4_Declare* tmpDcl = builder.createTempVar(4, Type_F,
Either, Sixteen_Word);
// Before:
// pln (16) dst, (mod)src0, src1
//
// After:
// mov (4) tmp(0,0):f (mod)src0(r)<4;4,1>:f
// pln (16) dst, tmp(0,0)<0;1,0>, src1
G4_DstRegRegion* dstRgn = builder.createDstRegRegion(
Direct,
tmpDcl->getRegVar(),
0,
0,
1,
Type_F);
RegionDesc* rd = builder.createRegionDesc(4, 4, 1);
G4_SrcRegRegion* srcRgn = builder.createSrcRegRegion(
src0->asSrcRegRegion()->getModifier(),
Direct,
src0->asSrcRegRegion()->getBase(),
src0->asSrcRegRegion()->getRegOff(),
src0->asSrcRegRegion()->getSubRegOff(),
rd,
Type_F);
G4_INST* newInst = builder.createInternalInst(NULL, G4_mov,
NULL, false, 4, dstRgn, srcRgn, NULL, 0);
bb->instList.insert(it, newInst);
rd = builder.getRegionScalar();
G4_SrcRegRegion* newSrcRgn = builder.createSrcRegRegion(
Mod_src_undef,
Direct,
tmpDcl->getRegVar(),
0,
0,
rd,
Type_F);
inst->setSrc(newSrcRgn, 0);
inst->transferDef(newInst, Opnd_src0, Opnd_src0);
newInst->addDefUse(inst, Opnd_src0);
}
if (src1 && src1->isSrcRegRegion() && src1->asSrcRegRegion()->getModifier() != Mod_src_undef)
{
// src1 needs a temp
// For pln instruction src2 is implied from src1 and exec_size
// When exec_size = 8, src2 is 1 GRF after src1 with size = 1 GRF
// When exec_size = 16, src2 is 2 GRFs after src1 with size = 2 GRFs
unsigned short numGRFsToCopy = inst->getExecSize() == 8 ? 2 : 4;
G4_Declare* tmpDcl = builder.createTempVar((unsigned short)(G4_GRF_REG_NBYTES / G4_Type_Table[Type_F].byteSize * numGRFsToCopy), Type_F,
Either, Any);
// Before:
// pln (16) dst, src0, (mod)src1
//
// After:
// mov (16) tmp(0,0):f (mod)src1(r)<8;8,1>:f
// mov (16) tmp(2,0):f (mod)src1(r+2)<8;8,1>:f <-- only if exec_size = 16
// pln (16) dst, src0, tmp(0,0)
for (int i = 0; i < numGRFsToCopy; i += 2)
{
G4_DstRegRegion* dstRgn = builder.createDstRegRegion(
Direct,
tmpDcl->getRegVar(),
(short)i,
0,
1,
Type_F);
RegionDesc* rd = builder.createRegionDesc(8, 8, 1);
G4_SrcRegRegion* srcRgn = builder.createSrcRegRegion(
src1->asSrcRegRegion()->getModifier(),
Direct,
src1->asSrcRegRegion()->getBase(),
src1->asSrcRegRegion()->getRegOff() + i,
0,
rd,
Type_F);
G4_INST* newInst = builder.createInternalInst(NULL, G4_mov,
NULL, false, 16, dstRgn, srcRgn, NULL, 0);
bb->instList.insert(it, newInst);
if (i == 0)
{
G4_SrcRegRegion* newSrcRgn = builder.createSrcRegRegion(
Mod_src_undef,
Direct,
tmpDcl->getRegVar(),
0,
0,
rd,
Type_F);
inst->setSrc(newSrcRgn, 1);
inst->transferDef(newInst, Opnd_src1, Opnd_src0);
}
newInst->addDefUse(inst, Opnd_src1);
}
}
}
return false;
}
void HWConformity::fixImm64 ( INST_LIST_ITER i,
G4_BB* bb )
{
G4_INST *inst = *i;
for( int j = 0; j < G4_Inst_Table[inst->opcode()].n_srcs; j++ )
{
G4_Operand *src = inst->getSrc(j);
if( !src ||
!(src->isImm() ) ||
G4_Type_Table[src->getType()].byteSize != 8 )
{
continue;
}
// a 64bit immediate is supported ONLY for a MOV operation
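// An illustrative split under WaDisallow64BitImmMov, assuming TMP is a new
// 2 x :ud temp aliased to a :df temp:
// mov (1) dst<1>:df 0x3FF0000000000000:df
// =>
// mov (1) TMP(0,0)<1>:ud 0x0:ud // low 32 bits
// mov (1) TMP(0,1)<1>:ud 0x3FF00000:ud // high 32 bits
// mov (1) dst<1>:df TMP(0,0)<0;1,0>:df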
bool needsSplit = false;
if( VISA_WA_CHECK(builder.getPWaTable(), WaDisallow64BitImmMov) )
{
needsSplit = true;
}
if (needsSplit)
{
char* immPtr = NULL;
double dfValue = 0.0;
int64_t qValue = 0;
if (IS_DFTYPE(src->getType()))
{
dfValue = src->asImm()->getDouble();
immPtr = (char*) &dfValue;
}
else
{
qValue = src->asImm()->getInt();
immPtr = (char*) &qValue;
}
unsigned int lowValue = *((unsigned int*)(immPtr));
unsigned int highValue = *((unsigned int*)(immPtr+4));
G4_Imm *lowImm = builder.createImm( (int64_t)lowValue, Type_UD);
G4_Imm *highImm = builder.createImm( (int64_t)highValue, Type_UD);
G4_Declare *defDcl = NULL;
defDcl = builder.createTempVar(1, src->getType(), Either, Eight_Word);
G4_Declare* dcl = builder.createTempVar( 2, Type_UD, Either, Eight_Word );
dcl->setAliasDeclare(defDcl, 0);
G4_DstRegRegion *dstRegion = builder.Create_Dst_Opnd_From_Dcl(dcl, 1);
G4_INST* lowMovInst = builder.createInternalInst(NULL, G4_mov, NULL, false,
1, dstRegion, lowImm, NULL, InstOpt_WriteEnable,
inst->getLineNo(), inst->getCISAOff(), inst->getSrcFilename() );
bb->instList.insert(i, lowMovInst);
G4_DstRegRegion *dstRegionNext = builder.Create_Dst_Opnd_From_Dcl(dcl, 1);
G4_INST *highMovInst = builder.createInternalInst( NULL, G4_mov, NULL, false,
1, dstRegionNext, highImm, NULL, InstOpt_WriteEnable,
inst->getLineNo(), inst->getCISAOff(), inst->getSrcFilename() );
dstRegionNext->setSubRegOff(1);
bb->instList.insert(i, highMovInst);
inst->transferDef(lowMovInst, Gen4_Operand_Number(j + 1), Opnd_src0);
lowMovInst->addDefUse(inst, Gen4_Operand_Number(j + 1));
inst->transferDef(highMovInst, Gen4_Operand_Number(j + 1), Opnd_src0);
highMovInst->addDefUse(inst, Gen4_Operand_Number(j + 1));
unsigned short vs = 0, hs = 0, wd = 1; // gen7_5: always 0;1,0
G4_SrcRegRegion *new_src = builder.Create_Src_Opnd_From_Dcl(defDcl,
builder.createRegionDesc(vs, wd, hs));
inst->setSrc( new_src, j );
}
else
{
if ( inst->opcode() != G4_mov )
{
inst->setSrc(insertMovBefore(i, j, src->getType(), bb), j);
}
}
}
}
// Check if the source of def_inst is redefined before inst
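// An illustrative rejection sketch (B is overwritten between the copy and its
// use, so def_inst's copy no longer matches the value inst reads):
// mov (8) A<1>:w B<8;8,1>:w <-- def_inst
// mov (8) B<1>:w C<8;8,1>:w <-- B redefined here
// add (8) D<1>:w B<8;8,1>:w E<8;8,1>:w <-- inst; NULL is returned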
G4_INST* HWConformity::checkSrcDefInst( G4_INST *inst,
G4_INST *def_inst,
uint32_t srcNum )
{
G4_INST* valid_inst = def_inst;
if( def_inst != NULL )
{
MUST_BE_TRUE( def_inst->opcode() == G4_mov, "def inst must be a mov instruction" );
G4_INST* def_inst1 = NULL;
for (auto def_it1 = inst->def_begin(); def_it1 != inst->def_end(); def_it1++ )
{
if((*def_it1).second == srcNum + 1 )
{
def_inst1 = (*def_it1).first;
}
}
if( def_inst1 != NULL )
{
G4_INST* def_inst2 = NULL;
for (auto def_it2 = def_inst->def_begin(); def_it2 != def_inst->def_end(); def_it2++ )
{
if((*def_it2).second == Opnd_src0 )
{
def_inst2 = (*def_it2).first;
}
}
if ( def_inst1 != def_inst2 )
{
valid_inst = NULL;
}
}
}
return valid_inst;
}
/*
Helper function for fixMixedHFInst
It assumes dst is not null and is of type DstRegRegion.
This check must be done before this method is called.
*/
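// An illustrative rewrite, assuming hStride == 1 and tempDstType == Type_F
// (TMP stands for the temp dcl created below):
// math (8) r3<1>:hf r4<8;8,1>:f r6<8;8,1>:hf
// =>
// math (8) TMP<1>:f r4<8;8,1>:f r6<8;8,1>:hf
// mov (8) r3<1>:hf TMP<8;8,1>:f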
void HWConformity::helperGenerateTempDst(
G4_BB* bb,
INST_LIST_ITER instIter,
G4_INST *inst,
uint8_t hStride,
G4_Type tempDstType,
G4_SubReg_Align subAlign)
{
G4_DstRegRegion *dst = inst->getDst();
uint8_t execSize = inst->getExecSize();
uint8_t dstSize = execSize * G4_Type_Table[tempDstType].byteSize;
//create a new temp with horizontal stride of 1 (packed)
//create a move to dst.
uint32_t numElt = execSize == 1 ? 1 : execSize * hStride;
if (numElt > 1 && tempDstType == Type_HF && hStride == 1 && subAlign < Eight_Word)
subAlign = Eight_Word;
G4_Align align = getDclAlignment( dstSize, inst, execSize == 1, subAlign );
G4_Declare* dcl = builder.createTempVar( numElt, tempDstType, align , subAlign );
G4_DstRegRegion *dstRegion = builder.Create_Dst_Opnd_From_Dcl(dcl, hStride);
inst->setDest(dstRegion);
RegionDesc* region = builder.createRegionDesc(execSize*hStride, execSize, hStride);
G4_SrcRegRegion *srcRegion = builder.Create_Src_Opnd_From_Dcl(dcl, region);
//creating a mov from temp dst to final destination using original options of fixed instruction
G4_INST* movInst = builder.createInst( NULL, G4_mov, NULL, false, execSize, dst, srcRegion, NULL, inst->getMaskOption() );
++instIter;
//inserting mov after fixed instruction
bb->instList.insert( instIter, movInst );
/*
Transfer the uses of the fixed instruction to movInst, make movInst a use
of the fixed instruction (through src0), and record the fixed instruction
as a def of movInst.
*/
inst->transferUse(movInst);
inst->addDefUse(movInst, Opnd_src0);
}
/*
Not Implemented rules:
3: (Does this mean align1 doesn't support replication?)
In Align16 mode, replicate is supported and is coissueable.
4: (handled in reduce execution size)
No simd16 in mixed mode when destination is packed f16 for both Align1 and Align16.
mad(8) r3.xyzw:hf r4.xyzw:f r6.xyzw:hf r7.xyzw:hf
add(8) r20.0<1>:hf r3<8;8,1>:f r6.0<8;8,1>:hf {Q1}
5: (we are not producing this type of code)
No accumulator read access for align16 mixed float
6: (we do not generate code like this)
[DevCHV, DevSKL+]: When source is float from accumulator register and destination is half float with a stride of 1, the source must be register aligned, i.e., the source must have offset zero.
7: (doesn't seem like it is applicable to our code)
In Align16, vertical stride can never be zero for f16
8.a: (handled by another check)
Math operations for mixed mode,
- In Align16, only packed format is supported
11. (handled in reduce execution size)
[DevCHV, DevSKL, DevBXT]: No simd16 in mixed mode when destination is f32. Instruction Execution size must be no more than 8.
*/
void HWConformity::fixMixedHFInst( BB_LIST_ITER it )
{
G4_BB* bb = *it;
for (auto instIter = bb->instList.begin(); instIter != bb->instList.end(); ++instIter)
{
G4_INST *inst = *instIter;
if (inst->isSend())
{
continue;
}
// Invalid ISA: mixed-mode math (or any HF math when HF math is disabled) is not supported; promote HF operands and the dst to F.
if (inst->isMath() && (inst->isMixedMode() || builder.getOption(vISA_DisableHFMath)))
{
auto src0 = inst->getSrc(0);
auto src1 = inst->getSrc(1);
auto dst = inst->getDst();
if (src0 && src0->getType() == Type_HF)
{
inst->setSrc(insertMovBefore(instIter, 0, Type_F, bb), 0);
}
if (src1 && src1->getType() == Type_HF)
{
inst->setSrc(insertMovBefore(instIter, 1, Type_F, bb), 1);
}
if (dst && dst->getType() == Type_HF)
{
inst->setDest(insertMovAfter(instIter, dst, inst->getExecType2(), bb));
}
continue;
}
if (VISA_WA_CHECK(builder.getPWaTable(), WaSrc1ImmHfNotAllowed) && !inst->isSend())
{
G4_Operand *tSrc1 = inst->getSrc(1);
if (tSrc1 && tSrc1->isImm() && tSrc1->getType() == Type_HF)
{
inst->setSrc(insertMovBefore(instIter, 1, Type_HF, bb), 1);
}
}
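// An illustrative fix for WaSrc1ImmHfNotAllowed (the HF immediate is first
// loaded into a temp, which then replaces src1; TMP is the temp's name here):
// mul (8) r10<1>:hf r11<8;8,1>:hf 0x4200:hf
// =>
// mov (8) TMP<1>:hf 0x4200:hf
// mul (8) r10<1>:hf r11<8;8,1>:hf TMP<8;8,1>:hf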
// Restriction :
// The execution size must be no more than 8 when half-floats are used in source or destination operand.
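// An illustrative split (evenlySplitInst produces two simd8 halves):
// math (16) r10.0<1>:hf r20.0<8;8,1>:hf
// =>
// math (8) r10.0<1>:hf r20.0<8;8,1>:hf {Q1}
// math (8) r10.8<1>:hf r20.8<8;8,1>:hf {Q2}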
if (inst->getExecSize() == 16)
{
if (inst->opcode() == G4_math &&
inst->getDst()->getType() == Type_HF &&
inst->getSrc(0)->getType() == Type_HF &&
(!inst->getSrc(1) || inst->getSrc(1)->getType() == Type_HF))
{
evenlySplitInst(instIter, bb);
}
}
if (inst->isMath() &&
VISA_WA_CHECK(builder.getPWaTable(), WaDstSubRegNumNotAllowedWithLowPrecPacked))
{
G4_DstRegRegion* dst = inst->getDst();
if (dst &&
dst->getType() == Type_HF &&
dst->getSubRegOff() == 8)
{
helperGenerateTempDst(bb, instIter, inst, 1, Type_HF, Sixteen_Word);
}
}
if (inst->isMath() && inst->isMixedMode())
{
// For `math`, additional GRF alignment checking for non-scalar
// destination.
G4_DstRegRegion* dst = inst->getDst();
if (dst->getType() == Type_F &&
inst->getExecSize() != 1 &&
!builder.isOpndAligned(dst, G4_GRF_REG_NBYTES))
{
helperGenerateTempDst(bb, instIter, inst, 1, Type_F, Sixteen_Word);
}
}
G4_DstRegRegion *dst = inst->getDst();
if (INST_FLOAT_SRC_ONLY(inst->opcode()) && dst && !dst->isNullReg() && dst->getType() == Type_HF)
{
helperGenerateTempDst(bb, instIter, inst, 1, Type_F);
}
if (!inst->isMixedMode())
continue;
if (inst->getDst() && !inst->getDst()->isNullReg())
dst = inst->getDst();
if ((VISA_WA_CHECK(builder.getPWaTable(), WaMixModeSelInstDstNotPacked) ||
VISA_WA_CHECK(builder.getPWaTable(), WaFloatMixedModeSelNotAllowedWithPackedDestination)) &&
inst->opcode() == G4_sel &&
dst &&
(VISA_WA_CHECK(builder.getPWaTable(), WaMixModeSelInstDstNotPacked) || dst->getHorzStride() == 1) &&
dst->getType() == Type_HF)
{
helperGenerateTempDst(bb, instIter, inst, 1, Type_F);
}
if (!inst->isMixedMode())
continue;
/*
Checks for mix mode HW conformity violations.
*/
if (getGenxPlatform() >= GENX_CHV)
{
if(checkMixMode(instIter, bb))
{
// instruction was split, and a new instruction was inserted before it;
// go back to the previous instruction to double-check that it still conforms.
--instIter;
inst = *instIter;
}
}
if (VISA_WA_CHECK(builder.getPWaTable(), WaDstSubRegNumNotAllowedWithLowPrecPacked) &&
dst &&
dst->getType() == Type_HF &&
dst->getSubRegOff() == 8 &&
inst->getExecSize() == 8)
{
helperGenerateTempDst(bb, instIter, inst, 1, dst->getType());
}
if( inst->isMath() &&
((VISA_WA_CHECK(builder.getPWaTable(), WaDisableMixedModeLog) && inst->asMathInst()->getMathCtrl() == MATH_LOG) ||
(VISA_WA_CHECK(builder.getPWaTable(), WaDisableMixedModeFdiv) && inst->asMathInst()->getMathCtrl() == MATH_FDIV) ||
(VISA_WA_CHECK(builder.getPWaTable(), WaDisableMixedModePow) && inst->asMathInst()->getMathCtrl() == MATH_POW)))
{
if (dst && dst->getType() == Type_HF)
{
helperGenerateTempDst(bb, instIter, inst, 1, Type_F);
}
for (uint8_t i = 0; i < inst->getNumSrc(); ++i)
{
G4_Operand *tOpnd = inst->getSrc(i);
if (tOpnd == NULL || !tOpnd->isSrcRegRegion() ||
tOpnd->asSrcRegRegion()->getType() != Type_HF)
{
continue;
}
inst->setSrc(insertMovBefore(instIter, i, Type_F, bb), i);
}
}
// - In Align1, f16 inputs need to be strided
// math(8) r3<1>:hf r4.0<8;8,1>:f r6.0<8;4,2>:hf
if (inst->isMath())
{
for (uint8_t i = 0; i < inst->getNumSrc(); ++i)
{
G4_Operand *tOpnd = inst->getSrc(i);
if (tOpnd == NULL ||
!tOpnd->isSrcRegRegion() ||
tOpnd->asSrcRegRegion()->getType() != Type_HF ||
!tOpnd->asSrcRegRegion()->isNativePackedSrcRegion())
{
continue;
}
inst->setSrc(insertMovBefore(instIter, i, Type_F, bb), i);
}
}
if (inst->isMath() && inst->getSrc(0)->isImm())
{
bool nullSrc1 = inst->getSrc(1) == nullptr || inst->getSrc(1)->isNullReg();
if (!nullSrc1)
{
inst->setSrc(insertMovBefore(instIter, 0, inst->getSrc(0)->getType(), bb), 0);
}
}
for (uint8_t i = 0; i < inst->getNumSrc(); ++i)
{
G4_Operand *tOpnd = inst->getSrc(i);
if (tOpnd == NULL || !tOpnd->isSrcRegRegion())
continue;
G4_SrcRegRegion *srcOpnd = tOpnd->asSrcRegRegion();
// `math` instruction requires non-scalar float operand to be
// GRF aligned.
if (inst->isMath() &&
srcOpnd->getType() == Type_F &&
!srcOpnd->isScalar() &&
!builder.isOpndAligned(tOpnd, G4_GRF_REG_NBYTES)) {
inst->setSrc(insertMovBefore(instIter, i, Type_F, bb), i);
}
/*
8: Math operations for mixed mode,
- In Align1, f16 inputs need to be strided
math(8) r3<1>:hf r4.0<8;8,1>:f r6.0<8;4,2>:hf
If type is hf, and stride is 1, assume it is packed, generate move with stride 2.
*/
if (inst->isMath() &&
srcOpnd->getType() == Type_HF &&
srcOpnd->getRegion()->horzStride == 1)
{
inst->setSrc(insertMovBefore(instIter, i, Type_F, bb), i);
}
}
/*
10. [DevCHV:A]: When packed f16 is used as destination datatype, the subregister MUST be 0.
*/
if(getGenxPlatform() == GENX_CHV &&
GetStepping() == Step_A &&
dst &&
dst->getHorzStride() ==1 &&
dst->getSubRegOff() != 0)
{
helperGenerateTempDst(bb, instIter, inst, 1, dst->getType());
}
/*
12: [DevCHV, DevSKL]: Indirect Addressing on source is not supported when source and destination data types are mixed float.
*/
if (getGenxPlatform() == GENX_CHV || getGenxPlatform() == GENX_SKL)
{
for (uint8_t i = 0; i < inst->getNumSrc(); ++i)
{
G4_Operand* src = inst->getSrc(i);
if (src == nullptr || !src->isSrcRegRegion() || !src->asSrcRegRegion()->isIndirect())
{
continue;
}
inst->setSrc(insertMovBefore(instIter, i, src->getType(), bb), i);
}
}
if (inst->getDst()->getBase()->isRegVar() &&
inst->getDst()->getType() == Type_HF &&
inst->getDst()->getHorzStride() == 1)
{
if (VISA_WA_CHECK(builder.getPWaTable(), WaDstSubRegNumNotAllowedWithLowPrecPacked))
inst->getDst()->getBase()->asRegVar()->getDeclare()->setSubRegAlign(Sixteen_Word);
else
inst->getDst()->getBase()->asRegVar()->getDeclare()->setSubRegAlign(Eight_Word);
}
}
}
// Fix for packed half types on BDW.
// Conversions from F to packed HF are not supported on this platform,
// only unpacked HF is supported on destination.
// When we encounter an instruction with HF type on destination with <1> stride
// and float on source, add an additional mov that handles unpacking.
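// An illustrative rewrite (TMP stands for the new unpacked HF temp):
// mov (8) r3.0<1>:hf r4.0<8;8,1>:f
// =>
// mov (8) TMP<2>:hf r4.0<8;8,1>:f
// mov (8) r3.0<1>:hf TMP<16;8,2>:hf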
void HWConformity::fixPackedHFConversions(INST_LIST_ITER it, G4_BB* bb)
{
G4_INST *inst = *it;
G4_DstRegRegion* dst = inst->getDst();
if (dst && dst->getType() == Type_HF && dst->getHorzStride() == 1 &&
getTypeSize(inst->getExecType()) > 2)
{
helperGenerateTempDst(bb, it, inst, 2, Type_HF);
}
}