Files
intel-graphics-compiler/IGC/Compiler/CISACodeGen/PixelShaderLowering.cpp
Krystian Andrzejewski 24f3f1d042 Fix lowering pixel output instructions
Pixel output instructions must be lowered after any actions which can potentially change their operands. Otherwise, there is no guarantee the compilation process finishes successfully.
2021-10-27 10:48:22 +02:00

1672 lines
56 KiB
C++

/*========================== begin_copyright_notice ============================
Copyright (C) 2017-2021 Intel Corporation
SPDX-License-Identifier: MIT
============================= end_copyright_notice ===========================*/
#include "common/LLVMUtils.h"
#include "common/IGCIRBuilder.h"
#include "PixelShaderLowering.hpp"
#include "GenISAIntrinsics/GenIntrinsics.h"
#include "Compiler/IGCPassSupport.h"
#include "Probe/Assertion.h"
using namespace llvm;
//#define DEBUG_BLEND_TO_DISCARD
namespace IGC
{
// Registration of PixelShaderAddMask with the pass registry.
// The PASS_* macros below are scratch definitions consumed by the
// IGC_INITIALIZE_PASS_* macros and are #undef'd right after, so the same
// names can be reused by the next pass registered in this file.
// NOTE(review): the description string is generic ("Pixel shader lowering
// pass") although this registers the add-mask pass - confirm this is intended.
#define PASS_FLAG "igc-pixel-shader-addmask"
#define PASS_DESCRIPTION "Pixel shader lowering pass"
#define PASS_CFG_ONLY false
// NOTE(review): PASS_ANALYSIS is true although runOnFunction rewrites IR;
// presumably this mirrors LLVM's "is_analysis" flag - confirm.
#define PASS_ANALYSIS true
IGC_INITIALIZE_PASS_BEGIN(PixelShaderAddMask, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
// Passes listed as dependencies of PixelShaderAddMask.
IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenContextWrapper)
IGC_INITIALIZE_PASS_DEPENDENCY(MetaDataUtilsWrapper)
IGC_INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
IGC_INITIALIZE_PASS_END(PixelShaderAddMask, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
#undef PASS_FLAG
#undef PASS_DESCRIPTION
#undef PASS_CFG_ONLY
#undef PASS_ANALYSIS
// Storage for the pass identifier handed to FunctionPass in the constructor.
char PixelShaderAddMask::ID = 0;
// Builds the pass object and makes sure the pass is initialized in the
// process-wide LLVM pass registry.
PixelShaderAddMask::PixelShaderAddMask()
    : FunctionPass(ID)
{
    PassRegistry& registry = *PassRegistry::getPassRegistry();
    initializePixelShaderAddMaskPass(registry);
}
/// Replaces the pixel-mask operand of every RTWrite / RTDualBlendSource
/// intrinsic in a function that uses discard.
///
/// The function is left untouched unless the module has "KillPixel" named
/// metadata, the function has a functions-info entry, and it contains a
/// GenISA_InitDiscardMask call.
///
/// Two strategies are used:
///  - one basic block and exactly one GenISA_UpdateDiscardMask: the discard
///    condition is negated once and used directly as the pixel mask; the
///    init/update mask intrinsics are erased.
///  - otherwise: a GenISA_GetPixelMask call on the init-mask value is created
///    once per basic block (before the first write in that block) and used as
///    the pixel mask of all writes in that block.
///
/// @param F  Function to process.
/// @return   true when the IR was modified, false when nothing was changed.
bool PixelShaderAddMask::runOnFunction(llvm::Function& F)
{
    m_cgCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
    Module* mod = F.getParent();
    const bool hasDiscard = (mod->getNamedMetadata("KillPixel") != nullptr);
    m_modMD = getAnalysis<MetaDataUtilsWrapper>().getModuleMetaData();
    IGCMD::MetaDataUtils* pMdUtils =
        getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
    if (!hasDiscard || pMdUtils->findFunctionsInfoItem(&F) == pMdUtils->end_FunctionsInfo())
    {
        return false;
    }

    // Locate the discard-mask intrinsics. When several are present the last
    // one encountered is remembered.
    Instruction* globalMask = nullptr;
    Instruction* updateMask = nullptr;
    unsigned numUpdateMask = 0;
    for (auto BI = F.begin(), BE = F.end(); BI != BE; BI++)
    {
        for (auto II = BI->begin(), IE = BI->end(); II != IE; II++)
        {
            if (isa<GenIntrinsicInst>(II, GenISAIntrinsic::GenISA_InitDiscardMask))
            {
                globalMask = &(*II);
            }
            else if (isa<GenIntrinsicInst>(II, GenISAIntrinsic::GenISA_UpdateDiscardMask))
            {
                numUpdateMask++;
                updateMask = &(*II);
            }
        }
    }
    if (!globalMask)
    {
        return false;
    }

    if (F.size() == 1 && numUpdateMask == 1)
    {
        // handle special case function has 1 BB and 1 discard, then we
        // can directly use the discard condition for RTWrite, no need to
        // generate GetPixelMask.
        Value* discardCond = updateMask->getOperand(1);
        updateMask->eraseFromParent();
        globalMask->eraseFromParent();

        // The negated condition is created lazily, right before the first
        // write that needs it, and then shared by all later writes.
        Value* mask = nullptr;
        for (auto BI = F.begin(), BE = F.end(); BI != BE; BI++)
        {
            for (auto II = BI->begin(), IE = BI->end(); II != IE; II++)
            {
                if (RTWritIntrinsic* rtw = dyn_cast<RTWritIntrinsic>(II))
                {
                    IGC_ASSERT(isa<ConstantInt>(rtw->getPMask()));
                    if (!mask)
                    {
                        mask = BinaryOperator::CreateNot(discardCond, "", rtw);
                    }
                    rtw->setPMask(mask);
                }
                else if (RTDualBlendSourceIntrinsic* drt = dyn_cast<RTDualBlendSourceIntrinsic>(II))
                {
                    IGC_ASSERT(isa<ConstantInt>(drt->getPMask()));
                    if (!mask)
                    {
                        mask = BinaryOperator::CreateNot(discardCond, "", drt);
                    }
                    drt->setPMask(mask);
                }
            }
        }
    }
    else
    {
        // Hoist the init-mask call to the top of its block before it is used
        // as the operand of the GetPixelMask calls created below.
        globalMask->moveBefore(globalMask->getParent()->getFirstNonPHI());
        Function* getMaskF = GenISAIntrinsic::getDeclaration(mod,
            GenISAIntrinsic::GenISA_GetPixelMask);
        for (auto BI = F.begin(), BE = F.end(); BI != BE; BI++)
        {
            // One GetPixelMask per basic block, created on first need.
            Value* mask = nullptr;
            for (auto II = BI->begin(), IE = BI->end(); II != IE; II++)
            {
                if (RTWritIntrinsic* rtw = dyn_cast<RTWritIntrinsic>(II))
                {
                    if (!mask)
                    {
                        mask = CallInst::Create(getMaskF, { globalMask }, "", rtw);
                    }
                    IGC_ASSERT(isa<ConstantInt>(rtw->getPMask()));
                    rtw->setPMask(mask);
                }
                else if (RTDualBlendSourceIntrinsic* drt = dyn_cast<RTDualBlendSourceIntrinsic>(II))
                {
                    if (!mask)
                    {
                        mask = CallInst::Create(getMaskF, { globalMask }, "", drt);
                    }
                    IGC_ASSERT(isa<ConstantInt>(drt->getPMask()));
                    drt->setPMask(mask);
                }
            }
        }
    }

    // Both branches above erase or move instructions, so the function has
    // been modified and the pass manager must be told.
    return true;
}
// Storage for the pass identifier handed to FunctionPass in the constructor.
char PixelShaderLowering::ID = 0;
// Register pass to igc-opt
// The PASS_* macros are scratch definitions consumed by the
// IGC_INITIALIZE_PASS_* macros below and are #undef'd right afterwards.
#define PASS_FLAG "igc-pixel-shader-lowering"
#define PASS_DESCRIPTION "This is the pixel shader lowering pass "
#define PASS_CFG_ONLY false
// NOTE(review): PASS_ANALYSIS is true although runOnFunction rewrites IR;
// presumably this mirrors LLVM's "is_analysis" flag - confirm.
#define PASS_ANALYSIS true
IGC_INITIALIZE_PASS_BEGIN(PixelShaderLowering, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
// Passes listed as dependencies of PixelShaderLowering.
IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenContextWrapper)
IGC_INITIALIZE_PASS_DEPENDENCY(MetaDataUtilsWrapper)
IGC_INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
IGC_INITIALIZE_PASS_END(PixelShaderLowering, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
#undef PASS_FLAG
#undef PASS_DESCRIPTION
#undef PASS_CFG_ONLY
#undef PASS_ANALYSIS
// Builds the pass object with all cached pointers cleared and all option
// flags off, then makes sure the pass is initialized in the process-wide
// LLVM pass registry.
PixelShaderLowering::PixelShaderLowering()
    : FunctionPass(ID)
    , m_module(nullptr)
    , PDT(nullptr)
    , m_ReturnBlock(nullptr)
    , SkipSrc0Alpha(false)
    , m_dualSrcBlendEnabled(false)
    , uavPixelSync(false)
{
    PassRegistry& registry = *PassRegistry::getPassRegistry();
    initializePixelShaderLoweringPass(registry);
}
/// Lowers the pixel shader outputs of an entry function.
///
/// Steps, in order:
///  1. pick the block that receives end-of-shader code (first block ending in
///     a `ret`; if there is none, the entry block's terminator is replaced by
///     a `ret`),
///  2. read the per-module pixel shader options,
///  3. collect and remove the GenISA_OUTPUT intrinsics (FindIntrinsicOutput),
///  4. optionally emit a memory fence, then emit the render target writes,
///  5. for the coarse phase of a multi-phase shader, re-emit the output mask.
///
/// @param F  Function to process; non-entry functions are ignored.
/// @return   false for non-entry functions, true otherwise (the IR is
///           rewritten).
bool PixelShaderLowering::runOnFunction(llvm::Function& F)
{
    m_cgCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
    IGCMD::MetaDataUtils* pMdUtils = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
    if (!isEntryFunc(pMdUtils, &F))
    {
        return false;
    }
    m_modMD = getAnalysis<MetaDataUtilsWrapper>().getModuleMetaData();

    // The pass object can be run on more than one function; forget the block
    // remembered for a previous function so it is never reused here.
    // NOTE(review): uavPixelSync is likewise never cleared between functions -
    // confirm whether that is intended.
    m_ReturnBlock = nullptr;
    for (llvm::BasicBlock& bb : F)
    {
        if (llvm::isa<llvm::ReturnInst>(bb.getTerminator()))
        {
            m_ReturnBlock = &bb;
            break;
        }
    }
    if (m_ReturnBlock == nullptr)
    {
        // No returning block: make the entry block return.
        llvm::BasicBlock& entry = *F.begin();
        entry.getTerminator()->eraseFromParent();
        ReturnInst::Create(F.getContext(), &entry);
        m_ReturnBlock = &entry;
    }

    m_outputBlock = nullptr;
    m_module = F.getParent();
    ColorOutputArray colors;
    DebugLocArray debugLocs;
    Value* depth = nullptr;
    Value* mask = nullptr;
    Value* src0Alpha = nullptr;
    Value* stencil = nullptr;

    // src0Alphas need not be sent when renderTargetBlending metadata is disabled
    // this means alpha to coverage and alpha test is disabled
    // this also means the render target blending is disabled
    SkipSrc0Alpha = m_modMD->psInfo.SkipSrc0Alpha || IGC_IS_FLAG_ENABLED(ForceDisableSrc0Alpha);

    // Check whether metadata indicates that dual source blending should be disabled
    const bool dualSourceBlendingDisabled =
        IGC_IS_FLAG_ENABLED(DisableDualBlendSource) ||
        m_modMD->psInfo.DualSourceBlendingDisabled;
    m_dualSrcBlendEnabled = !dualSourceBlendingDisabled;
    m_isPerSample = false;
    m_hasDiscard = (m_module->getNamedMetadata("KillPixel") != nullptr);

    // In case we are using intrinsic retrieve the output
    FindIntrinsicOutput(colors, depth, stencil, mask, src0Alpha, debugLocs);

    if (uavPixelSync)
    {
        // Emitting a fence to ensure that the uav write is completed before an EOT is issued
        IRBuilder<> builder(F.getContext());
        const bool fenceFlushNone = false;
        EmitMemoryFence(builder, fenceFlushNone);
    }

    // EmitRender target write intrinsic
    EmitRTWrite(colors, depth, stencil, mask, src0Alpha, debugLocs);

    // Multi-phase shaders name their coarse and pixel phase functions in
    // module-level named metadata.
    Function* pixelPhase = nullptr;
    Function* coarsePhase = nullptr;
    NamedMDNode* coarseNode = F.getParent()->getNamedMetadata(NAMED_METADATA_COARSE_PHASE);
    NamedMDNode* pixelNode = F.getParent()->getNamedMetadata(NAMED_METADATA_PIXEL_PHASE);
    if (coarseNode)
    {
        coarsePhase = mdconst::dyn_extract<Function>(coarseNode->getOperand(0)->getOperand(0));
    }
    if (pixelNode)
    {
        pixelPhase = mdconst::dyn_extract<Function>(pixelNode->getOperand(0)->getOperand(0));
    }
    if (&F == coarsePhase && pixelPhase != nullptr && mask != nullptr)
    {
        EmitCoarseMask(mask);
    }

    // Output intrinsics were removed and/or new instructions were emitted
    // above, so report the function as modified.
    return true;
}
/// Scans the function owning m_ReturnBlock, collects the shader outputs and
/// lowers a few system-value inputs.
///
/// Side effects visible in this function:
///  - sets uavPixelSync when a uavSerialize* intrinsic is found,
///  - sets m_isPerSample for SAMPLEINDEX or per-sample interpolated inputs,
///  - remembers the block of the last GenISA_OUTPUT in m_outputBlock,
///  - replaces PRIMITIVEID / POINT_COORD_X / POINT_COORD_Y system values by
///    GenISA_DCL_inputVec reads at a free location and records that location
///    in the "PrimIdLocation" / "PointCoordLocation" named metadata,
///  - lowers POSITION_X / POSITION_Y through LowerPositionInput,
///  - fills psInfo.colorOutputMask,
///  - erases all GenISA_OUTPUT calls and the replaced point-coord inputs.
///
/// @param colors     Receives one entry per colour (DEFAULT) output.
/// @param depth      Receives the depth output value, if any.
/// @param stencil    Receives the stencil output value, if any.
/// @param mask       Receives the output-mask value, if any.
/// @param src0Alpha  Receives the alpha channel of render target 0, if any.
/// @param debugLocs  Receives the debug location of each colour output;
///                   debugLocs[i] belongs to colors[i].
void PixelShaderLowering::FindIntrinsicOutput(
    ColorOutputArray& colors,
    Value*& depth,
    Value*& stencil,
    Value*& mask,
    Value*& src0Alpha,
    DebugLocArray& debugLocs)
{
    constexpr uint cMaxInputs = 32;
    constexpr uint cMaxInputComponents = cMaxInputs * 4;
    std::bitset<cMaxInputComponents> inputComponentsUsed;
    std::bitset<cMaxInputs> isLinearInterpolation;
    llvm::Instruction* primId = nullptr;
    llvm::Instruction* pointCoordX = nullptr;
    llvm::Instruction* pointCoordY = nullptr;
    SmallVector<GenIntrinsicInst*, 4> outputInstructions;
    SmallVector<Instruction*, 4> instructionToRemove;
    Function& F = *m_ReturnBlock->getParent();
    LLVMContext& ctx = m_module->getContext();
    Type* const i32Ty = Type::getInt32Ty(ctx);
    Value* btrue = llvm::ConstantInt::get(Type::getInt1Ty(ctx), true);
    m_modMD->psInfo.colorOutputMask.resize(USC::NUM_PSHADER_OUTPUT_REGISTERS);

    // Pass 1: classify every GenISA intrinsic call in the function.
    for (auto BI = F.begin(), BE = F.end(); BI != BE; BI++)
    {
        for (auto II = BI->begin(), IE = BI->end(); II != IE; II++)
        {
            GenIntrinsicInst* inst = dyn_cast<GenIntrinsicInst>(II);
            if (!inst)
            {
                continue;
            }
            const GenISAIntrinsic::ID IID = inst->getIntrinsicID();
            if (IID == GenISAIntrinsic::GenISA_uavSerializeAll ||
                IID == GenISAIntrinsic::GenISA_uavSerializeOnResID)
            {
                uavPixelSync = true;
            }
            else if (IID == GenISAIntrinsic::GenISA_OUTPUT)
            {
                m_outputBlock = inst->getParent();
                outputInstructions.push_back(inst);
                uint outputType = (uint)llvm::cast<llvm::ConstantInt>(inst->getOperand(4))->getZExtValue();
                IGC_ASSERT(outputType == SHADER_OUTPUT_TYPE_DEFAULT ||
                    outputType == SHADER_OUTPUT_TYPE_DEPTHOUT ||
                    outputType == SHADER_OUTPUT_TYPE_STENCIL ||
                    outputType == SHADER_OUTPUT_TYPE_OMASK);
                // The debug location is saved in pass 2, together with the
                // colour entry it belongs to. The instruction itself is only
                // erased at the very end, so it is still valid there.
                instructionToRemove.push_back(inst);
            }
            else if (IID == GenISAIntrinsic::GenISA_DCL_SystemValue)
            {
                SGVUsage usage = (SGVUsage)
                    llvm::cast<llvm::ConstantInt>(inst->getOperand(0))->getZExtValue();
                if (usage == PRIMITIVEID)
                {
                    primId = inst;
                }
                else if (usage == POINT_COORD_X)
                {
                    pointCoordX = inst;
                }
                else if (usage == POINT_COORD_Y)
                {
                    pointCoordY = inst;
                }
                else if (usage == POSITION_X || usage == POSITION_Y)
                {
                    LowerPositionInput(inst, usage);
                }
                else if (usage == SAMPLEINDEX)
                {
                    m_isPerSample = true;
                }
            }
            else if (IID == GenISAIntrinsic::GenISA_DCL_inputVec)
            {
                uint setupIndex =
                    (uint)llvm::cast<llvm::ConstantInt>(inst->getOperand(0))->getZExtValue();
                IGC_ASSERT_MESSAGE(setupIndex < cMaxInputComponents, "Max inputs cannot be greater than 32 x 4");
                inputComponentsUsed.set(setupIndex);
                e_interpolation mode = (e_interpolation)
                    llvm::cast<llvm::ConstantInt>(inst->getOperand(1))->getZExtValue();
                switch (mode)
                {
                case EINTERPOLATION_CONSTANT:
                    IGC_ASSERT(!isLinearInterpolation.test(setupIndex / 4));
                    break;
                case EINTERPOLATION_LINEARSAMPLE:
                case EINTERPOLATION_LINEARNOPERSPECTIVESAMPLE:
                    m_isPerSample = true;
                    // fall through
                case EINTERPOLATION_LINEAR:
                case EINTERPOLATION_LINEARCENTROID:
                case EINTERPOLATION_LINEARNOPERSPECTIVE:
                case EINTERPOLATION_LINEARNOPERSPECTIVECENTROID:
                    isLinearInterpolation.set(setupIndex / 4);
                    break;
                case EINTERPOLATION_UNDEFINED:
                case EINTERPOLATION_VERTEX:
                default:
                    IGC_ASSERT_MESSAGE(0, "Unexpected Pixel Shader input interpolation mode.");
                }
            }
        }
    }

    if (primId)
    {
        // When PrimitiveId input is present in shader IGC allocates an additional input and returns
        // information about the PrimitiveID input to UMD (to program SBE). This new input component
        // is created with constant interpolation and cannot be placed in a (4-dword) location that
        // has linearly interpolated components. Alternatively code in MarkConstantInterpolation()
        // could be modified to ignore the additional input created for PrimitiveID.
        unsigned int location;
        for (location = 0; location < cMaxInputComponents; location++)
        {
            if (inputComponentsUsed.test(location) == false &&
                isLinearInterpolation.test(location / 4) == false)
            {
                break;
            }
        }
        // Same sanity check as for the point-coord search below: the loop
        // must have stopped on a free component.
        IGC_ASSERT(location < cMaxInputComponents);
        Value* arguments[] =
        {
            ConstantInt::get(i32Ty, location),
            ConstantInt::get(i32Ty, EINTERPOLATION_CONSTANT),
        };
        CallInst* in = GenIntrinsicInst::Create(
            GenISAIntrinsic::getDeclaration(
                m_module,
                GenISAIntrinsic::GenISA_DCL_inputVec,
                Type::getFloatTy(ctx)),
            arguments,
            "",
            primId);
        in->setDebugLoc(primId->getDebugLoc());
        primId->replaceAllUsesWith(in);
        NamedMDNode* primIdMD = m_module->getOrInsertNamedMetadata("PrimIdLocation");
        Constant* cval = ConstantInt::get(i32Ty, location);
        llvm::MDNode* locationNd = llvm::MDNode::get(
            ctx,
            ConstantAsMetadata::get(cval));
        primIdMD->addOperand(locationNd);
    }

    if (pointCoordX || pointCoordY)
    {
        // Although PointCoords needs only 2 DWORDs, IGC must allocate 4 additional input and returns
        // information about the PointCoord input to UMD (to program SBE). These new input components
        // are created with linear interpolation and must be placed in an empty attribute index (4 DWORDs).
        unsigned int location;
        for (location = 0; location < cMaxInputComponents; location += 4)
        {
            bool isAttributeIndexEmpty =
                inputComponentsUsed.test(location) == false &&
                inputComponentsUsed.test(location + 1) == false &&
                inputComponentsUsed.test(location + 2) == false &&
                inputComponentsUsed.test(location + 3) == false;
            if (isAttributeIndexEmpty)
            {
                isLinearInterpolation.set(location / 4);
                break;
            }
        }
        IGC_ASSERT(location < cMaxInputComponents);
        llvm::Instruction* inputPointCoords[] = { pointCoordX, pointCoordY };
        for (unsigned int i = 0; i < sizeof(inputPointCoords) / sizeof(inputPointCoords[0]); i++)
        {
            if (inputPointCoords[i] == nullptr)
            {
                continue;
            }
            // X goes to the first component of the slot, Y to the second.
            Value* arguments[] =
            {
                ConstantInt::get(i32Ty, location + i),
                ConstantInt::get(i32Ty, EINTERPOLATION_LINEAR),
            };
            CallInst* in = GenIntrinsicInst::Create(
                GenISAIntrinsic::getDeclaration(
                    m_module,
                    GenISAIntrinsic::GenISA_DCL_inputVec,
                    Type::getFloatTy(ctx)),
                arguments,
                "",
                inputPointCoords[i]);
            in->setDebugLoc(inputPointCoords[i]->getDebugLoc());
            inputPointCoords[i]->replaceAllUsesWith(in);
            instructionToRemove.push_back(inputPointCoords[i]);
        }
        NamedMDNode* PointCoordMD = m_module->getOrInsertNamedMetadata("PointCoordLocation");
        Constant* cval = ConstantInt::get(i32Ty, location);
        llvm::MDNode* locationNd = llvm::MDNode::get(
            ctx,
            ConstantAsMetadata::get(cval));
        PointCoordMD->addOperand(locationNd);
    }

    // Pass 2: turn the collected output intrinsics into the out-parameters.
    for (GenIntrinsicInst* pInst : outputInstructions)
    {
        uint outputType = (uint)llvm::cast<llvm::ConstantInt>(pInst->getOperand(4))->getZExtValue();
        if (outputType == SHADER_OUTPUT_TYPE_DEFAULT)
        {
            uint RTIndex = (uint)llvm::cast<llvm::ConstantInt>(pInst->getOperand(5))->getZExtValue();
            // Bit i is set when colour channel i is actually written.
            unsigned writtenChannels = 0;
            // if any of the color channel is undef, initialize it
            // to 0 for color compression perf.
            for (int i = 0; i < 4; i++)
            {
                if (isa<UndefValue>(pInst->getOperand(i)))
                {
                    if (i == 3 &&
                        IGC_IS_FLAG_ENABLED(EnableUndefAlphaOutputAsRed))
                    {
                        // if it's alpha, then set default value to
                        // color.r, see IGC-959.
                        pInst->setOperand(i, pInst->getOperand(0));
                    }
                    else
                    {
                        pInst->setOperand(i,
                            ConstantFP::get(pInst->getOperand(i)->getType(), 0.0f));
                    }
                }
                else
                {
                    writtenChannels |= 1u << i;
                }
            }
            if (RTIndex == 0)
            {
                src0Alpha = pInst->getOperand(3);
            }
            IGC_ASSERT(RTIndex < m_modMD->psInfo.colorOutputMask.size());
            m_modMD->psInfo.colorOutputMask[RTIndex] = writtenChannels;
            ColorOutput data;
            data.RTindex = RTIndex;
            data.color[0] = pInst->getOperand(0);
            data.color[1] = pInst->getOperand(1);
            data.color[2] = pInst->getOperand(2);
            data.color[3] = pInst->getOperand(3);
            data.mask = btrue;
            data.blendStateIndex = nullptr;
            data.bb = pInst->getParent();
            colors.push_back(data);
            // Keep debugLocs parallel to colors: the consumer indexes both
            // arrays with the same colour index, so depth/stencil/oMask
            // outputs must not contribute entries.
            debugLocs.push_back(pInst->getDebugLoc());
        }
        else if (outputType == SHADER_OUTPUT_TYPE_DEPTHOUT)
        {
            depth = pInst->getOperand(0);
        }
        else if (outputType == SHADER_OUTPUT_TYPE_STENCIL)
        {
            stencil = pInst->getOperand(0);
        }
        else if (outputType == SHADER_OUTPUT_TYPE_OMASK)
        {
            mask = pInst->getOperand(0);
        }
    }

    // Everything has been read out of the dead instructions; drop them now.
    for (Instruction* dead : instructionToRemove)
    {
        dead->eraseFromParent();
    }
}
// Inserts a GenISA_memoryfence call right before the terminator of the
// return block.
//   builder        - only used to materialise the i1 constants.
//   forceFlushNone - not read by this function.
// The seven i1 operands are passed positionally; the first and the sixth are
// true, all others false.
void PixelShaderLowering::EmitMemoryFence(IRBuilder<>& builder, bool forceFlushNone)
{
    Value* const on = builder.getInt1(true);
    Value* const off = builder.getInt1(false);
    Value* fenceArgs[] = { on, off, off, off, off, on, off };

    Function* fenceDecl = GenISAIntrinsic::getDeclaration(
        m_module, GenISAIntrinsic::GenISA_memoryfence);
    GenIntrinsicInst::Create(
        fenceDecl,
        fenceArgs,
        "",
        m_ReturnBlock->getTerminator());
}
// Creates a GenISA_RTWrite call for one colour output and inserts it before
// the terminator of bbToAdd.
//   bbToAdd   - block receiving the write (and any conversion it needs).
//   src0Alpha - alpha of render target 0, may be null.
//   oMask     - output mask value, may be null.
//   color     - colour channels, pixel mask, RT index and optional blend
//               state index of this output.
//   depth     - depth output, may be null.
//   stencil   - stencil output, may be null.
// Returns the newly created call.
//
// The write is emitted in half precision when the platform supports FP16 and
// all four channels are FPExt results (the extension is then peeled off), or
// when the red channel already has half type. src0Alpha is converted to match
// the precision chosen for the colours.
CallInst* PixelShaderLowering::addRTWrite(
    BasicBlock* bbToAdd, Value* src0Alpha,
    Value* oMask, ColorOutput& color,
    Value* depth, Value* stencil)
{
    LLVMContext& ctx = m_module->getContext();
    Value* red = color.color[0];
    Value* green = color.color[1];
    Value* blue = color.color[2];
    Value* alpha = color.color[3];

    // src0Alpha is sent only for RT index > 0, when it exists, is not
    // skipped, and differs from this output's own alpha.
    const bool needsSrc0Alpha =
        src0Alpha && color.RTindex > 0 && !SkipSrc0Alpha && src0Alpha != color.color[3];
    // An unused src0Alpha never blocks the half-float path.
    const bool src0AlphaIsHF = !needsSrc0Alpha || isa<FPExtInst>(src0Alpha);
    const bool allChannelsExtended =
        llvm::isa<llvm::FPExtInst>(red) &&
        llvm::isa<llvm::FPExtInst>(green) &&
        llvm::isa<llvm::FPExtInst>(blue) &&
        llvm::isa<llvm::FPExtInst>(alpha);

    bool isHF = false;
    if (m_cgCtx->platform.supportFP16() &&
        allChannelsExtended &&
        src0AlphaIsHF &&
        !SkipSrc0Alpha)
    {
        // Use the narrow sources of the extensions directly.
        red = llvm::cast<llvm::FPExtInst>(red)->getOperand(0);
        green = llvm::cast<llvm::FPExtInst>(green)->getOperand(0);
        blue = llvm::cast<llvm::FPExtInst>(blue)->getOperand(0);
        alpha = llvm::cast<llvm::FPExtInst>(alpha)->getOperand(0);
        if (needsSrc0Alpha && llvm::isa<llvm::FPExtInst>(src0Alpha))
        {
            src0Alpha = llvm::cast<llvm::FPExtInst>(src0Alpha)->getOperand(0);
        }
        isHF = true;
    }
    if (red->getType()->isHalfTy())
    {
        isHF = true;
    }

    IRBuilder<> builder(bbToAdd->getTerminator());
    if (needsSrc0Alpha)
    {
        if (!isHF && src0Alpha->getType()->isHalfTy())
        {
            // In case src0Alpha comes from a HF RT Write
            if (FPTruncInst* narrowed = llvm::dyn_cast<llvm::FPTruncInst>(src0Alpha))
            {
                src0Alpha = narrowed->getOperand(0);
            }
            else
            {
                src0Alpha = builder.CreateFPExt(src0Alpha, builder.getFloatTy());
            }
        }
        else if (isHF && src0Alpha->getType()->isFloatTy())
        {
            // reverse, src0Alpha comes from half float in to float RT Write
            if (FPExtInst* widened = llvm::dyn_cast<llvm::FPExtInst>(src0Alpha))
            {
                src0Alpha = widened->getOperand(0);
            }
            else
            {
                src0Alpha = builder.CreateFPTrunc(src0Alpha, llvm::Type::getHalfTy(ctx));
            }
        }
    }

    // Element type of the write: half or float.
    Type* const writeTy = isHF ? Type::getHalfTy(ctx) : Type::getFloatTy(ctx);
    Type* const i32t = Type::getInt32Ty(ctx);
    Type* const i1t = Type::getInt1Ty(ctx);
    Value* const undefSrc0Alpha = llvm::UndefValue::get(writeTy);
    Value* const undef = llvm::UndefValue::get(Type::getFloatTy(ctx));
    Value* const iundef = llvm::UndefValue::get(i32t);
    Value* const i1true = ConstantInt::get(i1t, 1);
    Value* const i1false = ConstantInt::get(i1t, 0);
    Value* const vrtIdx = ConstantInt::get(i32t, color.RTindex);

    Value* vblendIdx = vrtIdx;
    if (color.blendStateIndex)
    {
        vblendIdx = color.blendStateIndex;
    }
    const bool writesOmask = oMask || m_modMD->psInfo.outputMask;
    const bool writesDepth = depth || m_modMD->psInfo.outputDepth;
    const bool writesStencil = stencil || m_modMD->psInfo.outputStencil;

    Value* arguments[] = {
        needsSrc0Alpha ? src0Alpha : undefSrc0Alpha,    // 0
        oMask ? oMask : undef,                          // 1 - oMask
        color.mask,                                     // 2 - pMask
        red, green, blue, alpha,                        // 3,4,5,6
        depth ? depth : undef,                          // 7
        stencil ? stencil : undef,                      // 8
        vrtIdx,                                         // 9 - RT index
        vblendIdx,                                      // 10 - blend state index
        writesOmask ? i1true : i1false,                 // 11
        writesDepth ? i1true : i1false,                 // 12
        writesStencil ? i1true : i1false,               // 13
        i1false,                                        // 14 - per sample
        iundef                                          // 15 - sample idx
    };

    Function* frtw = GenISAIntrinsic::getDeclaration(
        m_module,
        GenISAIntrinsic::GenISA_RTWrite,
        writeTy);
    return GenIntrinsicInst::Create(frtw, arguments, "",
        bbToAdd->getTerminator());
}
#ifdef DEBUG_BLEND_TO_DISCARD
// Debug helper: prints the blend optimization mode of each colour output to
// stdout, but only when at least one of them is not BLEND_OPTIMIZATION_NONE.
//   hash     - shader hash, printed as 16 hex digits.
//   blendOpt - blend optimization mode per colour output.
//   ncolors  - number of colour outputs; entries beyond blendOpt.size() are
//              not read.
static void dbgPrintBlendOptMode(uint64_t hash,
    std::vector<int>& blendOpt, unsigned ncolors)
{
    static const char* const blendOptName[] =
    {
        "BLEND_OPTIMIZATION_NONE",
        "BLEND_OPTIMIZATION_SRC_ALPHA",
        "BLEND_OPTIMIZATION_INV_SRC_ALPHA",
        "BLEND_OPTIMIZATION_SRC_ALPHA_DISCARD_ONLY",
        "BLEND_OPTIMIZATION_SRC_ALPHA_FILL_ONLY",
        "BLEND_OPTIMIZATION_SRC_COLOR_ZERO",
        "BLEND_OPTIMIZATION_SRC_COLOR_ONE",
        "BLEND_OPTIMIZATION_SRC_BOTH_ZERO",
        "BLEND_OPTIMIZATION_SRC_BOTH_ONE",
        "BLEND_OPTIMIZATION_SRC_ALPHA_OR_COLOR_ZERO",
        "BLEND_OPTIMIZATION_SRC_COLOR_ZERO_ALPHA_ONE",
        "BLEND_OPTIMIZATION_SRC_COLOR_ZERO_ALPHA_IGNORE"
    };
    const size_t numNames = sizeof(blendOptName) / sizeof(blendOptName[0]);

    // The caller passes the number of colour outputs, which need not match
    // the number of entries in blendOpt; never index past the vector.
    const size_t count = (ncolors < blendOpt.size()) ? ncolors : blendOpt.size();

    bool doprint = false;
    for (size_t i = 0; i < count; i++)
    {
        if (blendOpt[i] != USC::BLEND_OPTIMIZATION_NONE)
        {
            doprint = true;
            break;
        }
    }
    if (!doprint)
    {
        return;
    }

    // Cast so the argument matches %llx on every platform, whatever the
    // underlying type of uint64_t is.
    printf("%016llx blend opt[%u]:\n", static_cast<unsigned long long>(hash), ncolors);
    for (size_t i = 0; i < count; i++)
    {
        const int mode = blendOpt[i];
        const bool known = mode >= 0 && static_cast<size_t>(mode) < numNames;
        printf(" %s\n", known ? blendOptName[mode] : "<unknown blend optimization mode>");
    }
}
#endif
/// Emits the render target write intrinsics for all collected colour outputs.
///
/// When the shader has no discard, the blend optimization modes from the
/// module metadata are applied first (optBlendState); if any of them yields a
/// per-output mask, early-Z is forced and "KillPixel" named metadata is
/// created. Then either a dual-source write followed by a single-source write
/// is emitted, or one RTWrite per colour output. Finally, for shaders with
/// discard the writes are moved to the return block, and
/// checkAndCreateNullRTWrite is invoked.
///
/// @param colors     Colour outputs; the `inst` member of each is filled in.
/// @param depth      Depth output or null.
/// @param stencil    Stencil output or null.
/// @param oMask      Output mask or null.
/// @param src0Alpha  Alpha of render target 0 or null.
/// @param debugLocs  Debug locations, indexed like `colors`.
void PixelShaderLowering::EmitRTWrite(
    ColorOutputArray& colors, Value* depth, Value* stencil,
    Value* oMask, Value* src0Alpha, DebugLocArray& debugLocs)
{
    // Every colour output needs a debug location entry at the same index.
    IGC_ASSERT(debugLocs.size() >= colors.size());

    if (!m_hasDiscard)
    {
        // no discard found
        //IGC_ASSERT(m_module->getNamedMetadata("KillPixel") == nullptr);
        // check blend to discard optimization and generate mask for each
        // render target output
        std::vector<int>& blendOpt = m_modMD->psInfo.blendOptimizationMode;
#ifdef DEBUG_BLEND_TO_DISCARD
        dbgPrintBlendOptMode(m_cgCtx->hash.getAsmHash(), blendOpt, colors.size());
#endif
        if (blendOpt.size() && !useDualSrcBlend(colors))
        {
            bool hasDiscard = false;
            unsigned maxRTIndex = 0;
            for (unsigned i = 0; i < colors.size(); i++)
            {
                if (maxRTIndex < colors[i].RTindex)
                {
                    maxRTIndex = colors[i].RTindex;
                }
            }
            // blendOpt comes from metadata and is not guaranteed to have an
            // entry for every colour output; outputs without an entry are
            // left unoptimized instead of reading past the vector.
            for (unsigned i = 0; i < colors.size() && i < blendOpt.size(); i++)
            {
                USC::BLEND_OPTIMIZATION_MODE blendOptMode =
                    static_cast<USC::BLEND_OPTIMIZATION_MODE>(blendOpt[i]);
                // Only do blend to fill if the shader is persample, hardware
                // already does blend to fill for other cases.
                bool enableBlendToFill =
                    m_cgCtx->m_DriverInfo.SupportBlendToFillOpt() &&
                    maxRTIndex <= 4 && m_isPerSample;
                if (optBlendState(blendOptMode, colors[i], enableBlendToFill))
                {
                    // for blend to discard opt, we need to force earlyz
                    hasDiscard = true;
                    m_modMD->psInfo.forceEarlyZ = true;
                }
            }
            if (hasDiscard)
            {
                m_module->getOrInsertNamedMetadata("KillPixel");
            }
        }
    }

    // Index (0 or 1) within `colors` of the output that is written with the
    // dual-source message, or cNoDualSource when dual-source is not used.
    constexpr uint32_t cNoDualSource = 0xFFFFFFFFu;
    uint32_t dualIdx = cNoDualSource;
    //According to Spec, the RT Write instruction must follow this order : dual source followed by single source
    if (useDualSrcBlend(colors))
    {
        //If RT0 is executed first when size is 2
        if (colors[0].RTindex == 0 && colors[1].RTindex == 1)
        {
            dualIdx = 0;
        }
        else if (colors[0].RTindex == 1 && colors[1].RTindex == 0)
        {
            dualIdx = 1;
        }
    }

    if (dualIdx != cNoDualSource)
    {
        const uint32_t singleIdx = 1 - dualIdx;
        //dual source RTWrite first
        colors[dualIdx].inst = addDualBlendWrite(
            colors[dualIdx].bb,
            oMask,
            colors[dualIdx],
            colors[singleIdx],
            depth, stencil, 0);
        colors[dualIdx].inst->setDebugLoc(debugLocs[dualIdx]);
        //Single source RTWrite
        colors[singleIdx].inst = addRTWrite(
            colors[singleIdx].bb,
            src0Alpha,
            oMask, colors[singleIdx],
            depth,
            stencil);
        colors[singleIdx].inst->setDebugLoc(debugLocs[singleIdx]);
    }
    else
    {
        for (unsigned int i = 0; i < colors.size(); i++)
        {
            colors[i].inst = addRTWrite(
                colors[i].bb,
                src0Alpha,
                oMask, colors[i],
                depth,
                stencil);
            colors[i].inst->setDebugLoc(debugLocs[i]);
        }
    }

    // pick up 1 RTWrite and move it to return block, so we don't need to
    // generate an additional null surface write for EOT.
    if (m_hasDiscard)
    {
        moveRTWritesToReturnBlock(colors);
    }
    checkAndCreateNullRTWrite(oMask, depth, stencil);
}
// Returns a float-typed equivalent of `val`.
//   - float values are returned unchanged;
//   - a value produced by an FPTrunc whose source is float yields that source,
//     so no new instruction is needed;
//   - anything else is extended to float through `builder` (inserted at the
//     builder's current insertion point).
inline Value* fixHFSource(IRBuilder<>& builder, Value* val)
{
    if (val->getType()->isFloatTy())
        return val;
    if (llvm::FPTruncInst* trunc = llvm::dyn_cast<llvm::FPTruncInst>(val))
    {
        // Only reuse the truncation source when it really is float: an
        // FPTrunc can also narrow from a wider type such as double, and
        // returning that would hand the caller a non-float value.
        Value* wide = trunc->getOperand(0);
        if (wide->getType()->isFloatTy())
            return wide;
    }
    return builder.CreateFPExt(val, builder.getFloatTy());
}
// Creates a GenISA_RTDualBlendSource call carrying two colour outputs and
// inserts it before the terminator of bbToAdd.
//   bbToAdd - block receiving the write (and any conversion it needs).
//   oMask   - output mask or null.
//   color0  - first colour; its pixel mask is used for the write.
//   color1  - second colour; expected to carry the same pixel mask.
//   depth   - depth output or null.
//   stencil - stencil output or null.
//   index   - render target index operand.
// Returns the newly created call. The intrinsic is overloaded on the type of
// the first colour's red channel after any promotion.
CallInst* PixelShaderLowering::addDualBlendWrite(
    BasicBlock* bbToAdd, Value* oMask,
    ColorOutput& color0, ColorOutput& color1,
    Value* depth, Value* stencil, uint index)
{
    IGC_ASSERT(color0.mask == color1.mask);
    Value* const pMask = color0.mask;

    // Channels in operand order: r0 g0 b0 a0 r1 g1 b1 a1.
    Value* channels[8] = {
        color0.color[0], color0.color[1], color0.color[2], color0.color[3],
        color1.color[0], color1.color[1], color1.color[2], color1.color[3],
    };

    //assuming types are consistent
    // Only the red channel of each colour is inspected.
    Type* const red0Ty = channels[0]->getType();
    Type* const red1Ty = channels[4]->getType();
    const bool isFP16 = red0Ty->isHalfTy() || red1Ty->isHalfTy();
    const bool isFP32 = red0Ty->isFloatTy() || red1Ty->isFloatTy();

    /*
    if we are combining FP32 and FP16 RT writes
    promote everything to FP32
    Three Cases:
    Case 1) Immediate, extend to FP32 Immediate.
    Case 2) FP16 Not Immediate. Not result to FPTrunc. Add FPExt Instruction
    Case 3) FP16 Not Immediate. Result of FPTrunc. Use src of FPTrunc
    */
    if (isFP16 && isFP32)
    {
        IRBuilder<> builder(bbToAdd->getTerminator());
        for (Value*& channel : channels)
        {
            channel = fixHFSource(builder, channel);
        }
    }

    LLVMContext& ctx = m_module->getContext();
    Type* const i32t = Type::getInt32Ty(ctx);
    Type* const i1t = Type::getInt1Ty(ctx);
    Value* const undef = llvm::UndefValue::get(Type::getFloatTy(ctx));
    Value* const iundef = llvm::UndefValue::get(i32t);
    Value* const i1true = ConstantInt::get(i1t, 1);
    Value* const i1false = ConstantInt::get(i1t, 0);

    Value* arguments[] = {
        oMask ? oMask : undef,                                  // 0 - oMask
        pMask,                                                  // 1 - pMask
        channels[0], channels[1], channels[2], channels[3],     // 2, 3, 4, 5
        channels[4], channels[5], channels[6], channels[7],     // 6, 7, 8, 9
        depth ? depth : undef,                                  // 10
        stencil ? stencil : undef,                              // 11
        ConstantInt::get(i32t, index),                          // 12 - RT index
        oMask ? i1true : i1false,                               // 13
        depth ? i1true : i1false,                               // 14
        stencil ? i1true : i1false,                             // 15
        i1false,                                                // 16 - per sample
        iundef,                                                 // 17 - sample index
    };

    Function* dualWriteDecl = GenISAIntrinsic::getDeclaration(
        m_module, GenISAIntrinsic::GenISA_RTDualBlendSource, channels[0]->getType());
    return GenIntrinsicInst::Create(
        dualWriteDecl,
        arguments,
        "",
        bbToAdd->getTerminator());
}
// Emits a GenISA_OUTPUT call of type SHADER_OUTPUT_TYPE_OMASK carrying `mask`
// in its first operand, right before the terminator of the return block. The
// three remaining value operands are undef floats and the last operand is 0.
void PixelShaderLowering::EmitCoarseMask(llvm::Value* mask)
{
    LLVMContext& ctx = m_module->getContext();
    Type* const f32Ty = Type::getFloatTy(ctx);
    Type* const i32Ty = Type::getInt32Ty(ctx);
    Value* const unused = llvm::UndefValue::get(f32Ty);

    Value* outputArgs[] =
    {
        mask,
        unused,
        unused,
        unused,
        ConstantInt::get(i32Ty, SHADER_OUTPUT_TYPE_OMASK),
        ConstantInt::get(i32Ty, 0),
    };

    Function* outputDecl = GenISAIntrinsic::getDeclaration(
        m_module, GenISAIntrinsic::GenISA_OUTPUT, f32Ty);
    GenIntrinsicInst::Create(
        outputDecl,
        outputArgs,
        "",
        m_ReturnBlock->getTerminator());
}
/// Replaces a POSITION_X / POSITION_Y system-value read by the integer
/// GenISA_PixelPositionX / GenISA_PixelPositionY intrinsic.
///
/// When the float position's only use chain is
///     position [ + c, 0 <= c < 1 ]  ->  fptoui/fptosi to i32
/// the float round trip is skipped and the integer position is used directly
/// (truncating users get a zext-or-trunc of it, other users a zext).
/// Otherwise all uses of the original read are replaced by a uitofp of the
/// integer position.
///
/// @param positionInstr  The GenISA_DCL_SystemValue call to lower. It is not
///                       erased here.
/// @param usage          POSITION_X selects the X intrinsic, anything else Y.
void PixelShaderLowering::LowerPositionInput(GenIntrinsicInst* positionInstr, uint usage)
{
    IRBuilder<> builder(positionInstr);
    Function* positionIntr = GenISAIntrinsic::getDeclaration(m_module,
        usage == POSITION_X ? GenISAIntrinsic::GenISA_PixelPositionX : GenISAIntrinsic::GenISA_PixelPositionY);
    Value* intPosition = builder.CreateCall(positionIntr);
    Value* floatPosition = positionInstr;

    if (floatPosition->hasOneUse())
    {
        BinaryOperator* fadd = dyn_cast<BinaryOperator>(*floatPosition->user_begin());
        // Only an addition may be looked through: adding a constant in
        // [0, 1) does not change the value obtained after conversion to
        // integer, whereas e.g. a subtraction or multiplication by such a
        // constant does.
        if (fadd && fadd->getOpcode() == Instruction::FAdd)
        {
            if (ConstantFP* cst = dyn_cast<ConstantFP>(fadd->getOperand(1)))
            {
                float constant = cst->getValueAPF().convertToFloat();
                if (constant >= 0.0f && constant < 1.f)
                {
                    floatPosition = fadd;
                }
            }
        }
    }

    if (floatPosition->hasOneUse())
    {
        Value* v = *floatPosition->user_begin();
        if (v->getType()->isIntegerTy(32) && (isa<FPToUIInst>(v) || isa<FPToSIInst>(v)))
        {
            // Advance the iterator before touching the user so the walk over
            // the use list stays valid.
            for (auto UI = v->user_begin(), UE = v->user_end(); UI != UE;)
            {
                Value* use = *UI++;
                if (TruncInst* truncI = dyn_cast<TruncInst>(use))
                {
                    truncI->replaceAllUsesWith(builder.CreateZExtOrTrunc(intPosition, truncI->getType()));
                }
            }
            if (!v->user_empty())
            {
                v->replaceAllUsesWith(builder.CreateZExt(intPosition, v->getType()));
            }
            return;
        }
    }

    positionInstr->replaceAllUsesWith(builder.CreateUIToFP(intPosition, positionInstr->getType()));
}
// Based on blend state, check color output and discard them if possible.
bool PixelShaderLowering::optBlendState(
USC::BLEND_OPTIMIZATION_MODE blendOpt,
ColorOutput& colorOut,
bool enableBlendToFill)
{
Function* fBallot = GenISAIntrinsic::getDeclaration(m_module,
GenISAIntrinsic::GenISA_WaveBallot);
bool enableBlendToDiscard =
IGC_IS_FLAG_ENABLED(EnableBlendToDiscard) &&
m_cgCtx->platform.enableBlendToDiscardAndFill();
enableBlendToFill = enableBlendToFill &&
IGC_IS_FLAG_ENABLED(EnableBlendToFill) &&
m_cgCtx->platform.enableBlendToDiscardAndFill();
bool hasDiscard = false;
if (m_modMD->psInfo.outputDepth || m_modMD->psInfo.outputStencil)
{
enableBlendToDiscard = false;
}
IGCIRBuilder<> irb(m_ReturnBlock->getTerminator());
switch (blendOpt)
{
case USC::BLEND_OPTIMIZATION_SRC_ALPHA:
{
// discard: src.a == 0, fill: src.a == 1
if (enableBlendToDiscard)
{
Constant* f0 = ConstantFP::get(colorOut.color[3]->getType(), 0.0);
Value* ane0 = irb.CreateFCmpUNE(colorOut.color[3], f0);
colorOut.mask = ane0;
hasDiscard = true;
}
if (enableBlendToFill)
{
// ifany(src.a != 1.0) ? RTIndex : RTIndex + 4
Constant* f1 = ConstantFP::get(colorOut.color[3]->getType(), 1.0);
Value* ane1 = irb.CreateFCmpUNE(colorOut.color[3], f1);
Value* ane1_ballot = irb.CreateCall(fBallot, { ane1 });
Value* any = irb.CreateICmpNE(ane1_ballot, irb.getInt32(0));
colorOut.blendStateIndex = irb.CreateSelect(any,
irb.getInt32(colorOut.RTindex),
irb.getInt32(colorOut.RTindex + 4));
m_modMD->psInfo.blendToFillEnabled = true;
}
return hasDiscard;
}
case USC::BLEND_OPTIMIZATION_INV_SRC_ALPHA:
{
// discard: src.a == 1, fill: src.a == 0
Constant* f1 = ConstantFP::get(colorOut.color[0]->getType(), 1.0);
if (enableBlendToDiscard)
{
Value* ane1 = irb.CreateFCmpUNE(colorOut.color[3], f1);
colorOut.mask = ane1;
hasDiscard = true;
}
if (enableBlendToFill)
{
// ifall(src.a == 0) ? RTIndex + 4 : RTIndex
// ifany(src.a != 0) ? RTIndex : RTIndex + 4
Value* ai = irb.CreateBitCast(colorOut.color[3], irb.getInt32Ty());
Value* ane0 = irb.CreateICmpNE(ai, irb.getInt32(0));
Value* ane0_ballot = irb.CreateCall(fBallot, { ane0 });
Value* any = irb.CreateICmpNE(ane0_ballot, irb.getInt32(0));
colorOut.blendStateIndex = irb.CreateSelect(any,
irb.getInt32(colorOut.RTindex),
irb.getInt32(colorOut.RTindex + 4));
m_modMD->psInfo.blendToFillEnabled = true;
}
return hasDiscard;
}
case USC::BLEND_OPTIMIZATION_SRC_ALPHA_DISCARD_ONLY:
{
// discard: src.a == 0
if (enableBlendToDiscard)
{
Constant* f0 = ConstantFP::get(colorOut.color[3]->getType(), 0.0);
Value* ane0 = irb.CreateFCmpUNE(colorOut.color[3], f0);
colorOut.mask = ane0;
hasDiscard = true;
}
return hasDiscard;
}
case USC::BLEND_OPTIMIZATION_SRC_ALPHA_FILL_ONLY:
{
// fill: src.a == 1
if (enableBlendToFill)
{
// ifall(src.a == 1.0) ? RTIndex + 4 : RTIndex
// ifany(src.a != 1.0) ? RTIndex : RTIndex + 4
Constant* f1 = ConstantFP::get(colorOut.color[3]->getType(), 1.0);
Value* ane1 = irb.CreateFCmpUNE(colorOut.color[3], f1);
Value* ane1_ballot = irb.CreateCall(fBallot, { ane1 });
Value* any = irb.CreateICmpNE(ane1_ballot, irb.getInt32(0));
colorOut.blendStateIndex = irb.CreateSelect(any,
irb.getInt32(colorOut.RTindex),
irb.getInt32(colorOut.RTindex + 4));
m_modMD->psInfo.blendToFillEnabled = true;
}
return false;
}
case USC::BLEND_OPTIMIZATION_SRC_COLOR_ZERO:
{
// discard: src.rgb == 0
if (enableBlendToDiscard)
{
colorOut.mask = irb.CreateAnyValuesNotZero(colorOut.color, 3);
hasDiscard = true;
}
return hasDiscard;
}
case USC::BLEND_OPTIMIZATION_SRC_COLOR_ONE:
{
// discard if src.rgb == 1
if (enableBlendToDiscard)
{
ConstantFP* f1 = cast<ConstantFP>(
ConstantFP::get(colorOut.color[0]->getType(), 1.0));
Value* rne1 = fcmpUNEConst(irb, colorOut.color[0], f1);
Value* gne1 = fcmpUNEConst(irb, colorOut.color[1], f1);
Value* bne1 = fcmpUNEConst(irb, colorOut.color[2], f1);
colorOut.mask = createOr(irb, bne1, createOr(irb, rne1, gne1));
hasDiscard = true;
}
return hasDiscard;
}
case USC::BLEND_OPTIMIZATION_SRC_BOTH_ZERO:
{
// discard: src.rgba == 0
if (enableBlendToDiscard)
{
colorOut.mask = irb.CreateAnyValuesNotZero(colorOut.color, 4);
hasDiscard = true;
}
return hasDiscard;
}
case USC::BLEND_OPTIMIZATION_SRC_BOTH_ONE:
{
// discard if src.rgba == 1
if (enableBlendToDiscard)
{
Constant* f1 = ConstantFP::get(colorOut.color[0]->getType(), 1.0);
Value* rne1 = irb.CreateFCmpUNE(colorOut.color[0], f1);
Value* gne1 = irb.CreateFCmpUNE(colorOut.color[1], f1);
Value* bne1 = irb.CreateFCmpUNE(colorOut.color[2], f1);
Value* ane1 = irb.CreateFCmpUNE(colorOut.color[3], f1);
colorOut.mask = irb.CreateOr(ane1, irb.CreateOr(bne1, irb.CreateOr(rne1, gne1)));
hasDiscard = true;
}
return hasDiscard;
}
case USC::BLEND_OPTIMIZATION_SRC_ALPHA_OR_COLOR_ZERO:
{
// discard: src.a == 0 || src.rgb == 0
if (enableBlendToDiscard)
{
Value* a = colorOut.color[3];
Constant* f0 = ConstantFP::get(a->getType(), 0.0);
Value* ane0 = irb.CreateFCmpUNE(a, f0);
Value* cne0 = irb.CreateAnyValuesNotZero(colorOut.color, 3);
colorOut.mask = irb.CreateAnd(ane0, cne0);
hasDiscard = true;
}
return hasDiscard;
}
case USC::BLEND_OPTIMIZATION_SRC_COLOR_ZERO_ALPHA_ONE:
{
// discard: src.rgb == 0 && src.a == 1
// equivalently mask = (r|g|b != 0) || (a != 1)
if (enableBlendToDiscard)
{
Value* cne0 = irb.CreateAnyValuesNotZero(colorOut.color, 3);
Value* a = colorOut.color[3];
Constant* f1 = ConstantFP::get(a->getType(), 1.0);
Value* ane1 = irb.CreateFCmpUNE(a, f1);
colorOut.mask = irb.CreateOr(cne0, ane1);
hasDiscard = true;
}
return hasDiscard;
}
case USC::BLEND_OPTIMIZATION_SRC_COLOR_ZERO_ALPHA_IGNORE:
{
// Discard: src.rgb == 0 and don't compute src.a
// equivalently mask = (r|g|b != 0)
if (enableBlendToDiscard)
{
colorOut.mask = irb.CreateAnyValuesNotZero(colorOut.color, 3);
hasDiscard = true;
}
// set output alpha as output.r, see IGC-959
if (IGC_IS_FLAG_ENABLED(EnableUndefAlphaOutputAsRed))
{
colorOut.color[3] = colorOut.color[0];
}
else
{
colorOut.color[3] = ConstantFP::get(
colorOut.color[3]->getType(), 0.0);
}
return hasDiscard;
}
default:
return false;
}
}
// Relocates the RT write `call` so that it sits immediately before the
// terminator of `toBB`.
//
// When `toBB` has more than one predecessor (as listed in `predBB`), every
// argument of the call that is an Instruction is replaced by a PHI node
// inserted at the top of `toBB`. The PHI carries the original instruction for
// the predecessor that currently contains the call and undef for all other
// predecessors. `valueToPhiMap` remembers the PHI built for each instruction
// so a value shared by several arguments (or several calls) gets one PHI only.
void PixelShaderLowering::moveRTWriteToBlock(
    CallInst* call, SmallVector<BasicBlock*, 8> & predBB, BasicBlock* toBB,
    llvm::DenseMap<llvm::Value*, llvm::PHINode*>& valueToPhiMap)
{
    const unsigned predCount = predBB.size();
    if (predCount > 1)
    {
        // The call has not been moved yet, so this is still its source block.
        BasicBlock* const homeBB = call->getParent();
        const unsigned argCount = call->getNumArgOperands();

        for (unsigned argIdx = 0; argIdx < argCount; ++argIdx)
        {
            Instruction* def = dyn_cast<Instruction>(call->getArgOperand(argIdx));
            if (def == nullptr)
            {
                // Non-instruction operands need no PHI.
                continue;
            }

            // Reuse the PHI if this instruction was already handled.
            auto cached = valueToPhiMap.find(def);
            if (cached != valueToPhiMap.end())
            {
                call->setArgOperand(argIdx, cached->second);
                continue;
            }

            PHINode* merged = PHINode::Create(
                def->getType(), predCount, "", &(*toBB->begin()));
            valueToPhiMap[def] = merged;

            for (BasicBlock* pred : predBB)
            {
                Value* incoming = (pred == homeBB)
                    ? static_cast<Value*>(def)
                    : UndefValue::get(def->getType());
                merged->addIncoming(incoming, pred);
            }
            call->setArgOperand(argIdx, merged);
        }
    }

    // Detach the call from its current block and re-insert it at the end of
    // the destination block.
    call->removeFromParent();
    call->insertBefore(toBB->getTerminator());
}
void PixelShaderLowering::moveRTWritesToReturnBlock(
const ColorOutputArray& colors)
{
if (colors.size())
{
IGC_ASSERT(colors[0].inst != nullptr);
SmallVector<BasicBlock*, 8> predBB;
DenseMap<Value*, PHINode*> valueToPhiMap;
for (auto PI = pred_begin(m_ReturnBlock), PE = pred_end(m_ReturnBlock);
PI != PE; ++PI)
{
predBB.push_back(*PI);
}
if (useDualSrcBlend(colors))
{
// For SIMD16 PS thread with two output colors must send
// messages in the following sequence for each RT: SIMD8 dual
// source RTW message (low); SIMD8 dual source RTW message
// (high); SIMD16 single src RTW message with second color.
CallInst* const dualSourceRTW =
isa<RTDualBlendSourceIntrinsic>(colors[0].inst) ? colors[0].inst : colors[1].inst;
CallInst* const singleSourceRTW =
isa<RTDualBlendSourceIntrinsic>(colors[0].inst) ? colors[1].inst : colors[0].inst;
IGC_ASSERT(isa<RTWritIntrinsic>(singleSourceRTW));
IGC_ASSERT(isa<RTDualBlendSourceIntrinsic>(dualSourceRTW));
moveRTWriteToBlock(dualSourceRTW, predBB, m_ReturnBlock, valueToPhiMap);
moveRTWriteToBlock(singleSourceRTW, predBB, m_ReturnBlock, valueToPhiMap);
}
else
{
moveRTWriteToBlock(colors[0].inst, predBB, m_ReturnBlock, valueToPhiMap);
}
}
}
// Builds a PHI node at the top of `toBB` with one incoming entry per block in
// `predBB`: `val` for m_outputBlock, undef (of val's type) for every other
// predecessor. Returns the new PHI.
PHINode* PixelShaderLowering::createPhiForRTWrite(Value* val,
    smallvector<BasicBlock*, 8> & predBB, BasicBlock* toBB)
{
    Type* const valTy = val->getType();
    PHINode* merged = PHINode::Create(
        valTy, predBB.size(), "", &(*toBB->begin()));

    for (BasicBlock* pred : predBB)
    {
        Value* incoming = (pred == m_outputBlock) ? val : UndefValue::get(valTy);
        merged->addIncoming(incoming, pred);
    }
    return merged;
}
// Makes sure the return block contains a render target write.
//
// If m_ReturnBlock already holds an RTWrite or dual-source RTWrite intrinsic
// nothing is done. Otherwise a write is added with undef color channels, an
// always-true mask, RTindex -1 and no blend state index. When the output
// block differs from the return block and the return block has several
// predecessors, the non-null oMask/depth/stencil values are first routed
// through PHI nodes in the return block.
void PixelShaderLowering::checkAndCreateNullRTWrite(
    Value* oMask, Value* depth, Value* stencil)
{
    // Bail out as soon as an existing RT write is found.
    for (auto& I : *m_ReturnBlock)
    {
        if (isa<RTWritIntrinsic>(&I) ||
            isa<RTDualBlendSourceIntrinsic>(&I))
        {
            return;
        }
    }

    auto& ctx = m_module->getContext();
    Value* undef = UndefValue::get(Type::getFloatTy(ctx));

    ColorOutput color;
    color.color[0] = undef;
    color.color[1] = undef;
    color.color[2] = undef;
    color.color[3] = undef;
    color.mask = ConstantInt::get(Type::getInt1Ty(ctx), true);
    color.RTindex = -1;
    color.blendStateIndex = nullptr;

    if (m_outputBlock != m_ReturnBlock)
    {
        smallvector<BasicBlock*, 8> predBB;
        auto predIt = pred_begin(m_ReturnBlock);
        auto predEnd = pred_end(m_ReturnBlock);
        while (predIt != predEnd)
        {
            predBB.push_back(*predIt);
            ++predIt;
        }

        if (predBB.size() > 1)
        {
            // Replace a present value by a PHI merging it into the return block.
            auto mergeThroughPhi = [&](Value*& v)
            {
                if (v)
                {
                    v = createPhiForRTWrite(v, predBB, m_ReturnBlock);
                }
            };
            mergeThroughPhi(oMask);
            mergeThroughPhi(depth);
            mergeThroughPhi(stencil);
        }
    }

    addRTWrite(
        m_ReturnBlock,
        undef,
        oMask, color,
        depth, stencil);
}
///////////////////////////////////////////////////////////////////////
// Lower discard intrinsics
///////////////////////////////////////////////////////////////////////
// Registration of the DiscardLowering pass. The four PASS_* macros below are
// temporary: they only feed the IGC_INITIALIZE_PASS_* macros and are
// undefined again right after use.
#define PASS_FLAG "igc-lower-discard"
#define PASS_DESCRIPTION "Lower discard intrinsics"
#define PASS_CFG_ONLY false
#define PASS_ANALYSIS false
IGC_INITIALIZE_PASS_BEGIN(DiscardLowering, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
// Wrapper passes that DiscardLowering::runOnFunction queries via getAnalysis.
IGC_INITIALIZE_PASS_DEPENDENCY(MetaDataUtilsWrapper)
IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenContextWrapper)
IGC_INITIALIZE_PASS_END(DiscardLowering, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
#undef PASS_FLAG
#undef PASS_DESCRIPTION
#undef PASS_CFG_ONLY
#undef PASS_ANALYSIS
// Pass identifier object handed to the FunctionPass base class.
char DiscardLowering::ID = 0;

// Constructs the pass and runs its initialization routine against the global
// pass registry.
DiscardLowering::DiscardLowering() : FunctionPass(ID)
{
    PassRegistry& registry = *PassRegistry::getPassRegistry();
    initializeDiscardLoweringPass(registry);
}
// Lowers the discard and IsHelperInvocation intrinsics previously collected
// in m_discards and m_isHelperInvocationCalls.
//
// A new "DiscardRet" block containing only `ret void` becomes the function's
// return block; the former return block (if any) branches to it. A discard
// mask is initialized at the top of the entry block. Each discard is replaced
// by an UpdateDiscardMask call whose result selects between jumping to the
// new return block and continuing with the code after the discard. Each
// IsHelperInvocation call is replaced by the negation of GetPixelMask.
//
// Returns false when there was nothing to lower, true otherwise.
bool DiscardLowering::lowerDiscards(Function& F)
{
    const bool nothingToLower =
        m_discards.empty() && m_isHelperInvocationCalls.empty();
    if (nothingToLower)
    {
        return false;
    }

    // Dedicated early-return block: just `ret void`.
    m_earlyRet = BasicBlock::Create(m_module->getContext(), "DiscardRet", &F);
    IRBuilder<> retBuilder(m_earlyRet);
    retBuilder.CreateRetVoid();

    // Redirect the existing return block into the new one.
    if (m_retBB != nullptr)
    {
        m_retBB->getTerminator()->eraseFromParent();
        BranchInst::Create(m_earlyRet, m_retBB);
    }
    m_retBB = m_earlyRet;

    Function* fInitMask = GenISAIntrinsic::getDeclaration(
        m_module, GenISAIntrinsic::GenISA_InitDiscardMask);
    Function* fSetMask = GenISAIntrinsic::getDeclaration(
        m_module, GenISAIntrinsic::GenISA_UpdateDiscardMask);

    // The mask is created once, ahead of all non-PHI code of the entry block.
    Value* discardMask = CallInst::Create(
        fInitMask, llvm::None, "", m_entryBB->getFirstNonPHI());

    for (auto discardCall : m_discards)
    {
        IGC_ASSERT(discardCall->isGenIntrinsic(GenISAIntrinsic::GenISA_discard));

        // Split right after the discard so it ends its block.
        BasicBlock* const discardBB = discardCall->getParent();
        BasicBlock::iterator splitPoint = discardCall->getIterator();
        ++splitPoint;
        BasicBlock* const continueBB =
            discardBB->splitBasicBlock(splitPoint, "postDiscard");

        // Drop the unconditional branch that splitBasicBlock inserted; a
        // conditional one is emitted below instead.
        discardBB->getTerminator()->eraseFromParent();

        // call discard(%dcond)
        // -->
        // %v = UpdateDiscardMask(%discardMask, %dcond)
        // br %v, DiscardRet, postDiscard
        IRBuilder<> builder(discardCall);
        Value* const cond = discardCall->getOperand(0);
        Value* const takeEarlyRet =
            builder.CreateCall(fSetMask, { discardMask, cond });
        builder.CreateCondBr(takeEarlyRet, m_earlyRet, continueBB);
    }

    // Record in module metadata that this shader can kill pixels.
    if (!m_discards.empty())
    {
        m_module->getOrInsertNamedMetadata("KillPixel");
    }

    for (auto helperCall : m_isHelperInvocationCalls)
    {
        IRBuilder<> builder(helperCall);
        Function* fGetMask = GenISAIntrinsic::getDeclaration(
            m_module, GenISAIntrinsic::GenISA_GetPixelMask);
        llvm::Value* const pixelMask =
            builder.CreateCall(fGetMask, { discardMask });
        llvm::Value* const isHelper = builder.CreateNot(pixelMask);
        helperCall->replaceAllUsesWith(isHelper);
        helperCall->eraseFromParent();
    }

    // The original discard calls are no longer needed.
    for (auto discardCall : m_discards)
    {
        discardCall->eraseFromParent();
    }
    return true;
}
// Entry point of the discard lowering pass for one function.
//
// Functions without an entry in the functions-info metadata are ignored.
// Otherwise the function is scanned once to:
//   - collect GenISA_discard calls (calls with a constant-false condition are
//     simply deleted),
//   - collect GenISA_IsHelperInvocation calls,
//   - record in psInfo whether depth / stencil / omask outputs exist.
// The collected calls are then lowered by lowerDiscards(), except for the
// multirate case described further down.
//
// Returns true when the function was modified, i.e. when discards were
// lowered or when at least one discard(false) call was removed.
bool DiscardLowering::runOnFunction(Function& F)
{
    IGCMD::MetaDataUtils* const pMdUtils =
        getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
    if (pMdUtils->findFunctionsInfoItem(&F) == pMdUtils->end_FunctionsInfo())
    {
        return false;
    }

    m_cgCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
    m_modMD = getAnalysis<MetaDataUtilsWrapper>().getModuleMetaData();
    m_entryBB = &F.getEntryBlock();
    m_module = F.getParent();

    // Find the return block. Reset first so that a block belonging to a
    // previously processed function is never reused when F has no `ret`.
    m_retBB = nullptr;
    for (auto& bb : F)
    {
        if (llvm::isa<llvm::ReturnInst>(bb.getTerminator()))
        {
            m_retBB = &bb;
            break;
        }
    }

    // Deletion is deferred until after the scan so iteration stays valid.
    SmallVector<GenIntrinsicInst*, 4> discardToDel;
    for (auto& bb : F)
    {
        for (auto& I : bb)
        {
            GenIntrinsicInst* inst = dyn_cast<GenIntrinsicInst>(&I);
            if (!inst)
            {
                continue;
            }

            if (inst->isGenIntrinsic(GenISAIntrinsic::GenISA_discard))
            {
                // get rid of discard(false)
                ConstantInt* cval = dyn_cast<ConstantInt>(inst->getOperand(0));
                if (cval && cval->isZero())
                {
                    discardToDel.push_back(inst);
                }
                else
                {
                    m_discards.push_back(inst);
                }
            }
            else if (inst->isGenIntrinsic(GenISAIntrinsic::GenISA_IsHelperInvocation))
            {
                m_isHelperInvocationCalls.push_back(inst);
            }
            else if (inst->isGenIntrinsic(GenISAIntrinsic::GenISA_OUTPUT))
            {
                // Check whether PS output omask/depth/stencil and save to
                // metadata, since after discard lowering, the OUTPUT
                // could become dead code and get cleaned. While we need to
                // know it when creating null surface write.
                uint outputType = (uint)llvm::cast<llvm::ConstantInt>(
                    inst->getOperand(4))->getZExtValue();
                IGC_ASSERT(outputType == SHADER_OUTPUT_TYPE_DEFAULT ||
                    outputType == SHADER_OUTPUT_TYPE_DEPTHOUT ||
                    outputType == SHADER_OUTPUT_TYPE_STENCIL ||
                    outputType == SHADER_OUTPUT_TYPE_OMASK);
                switch (outputType)
                {
                case SHADER_OUTPUT_TYPE_DEPTHOUT:
                    m_modMD->psInfo.outputDepth = true;
                    break;
                case SHADER_OUTPUT_TYPE_STENCIL:
                    m_modMD->psInfo.outputStencil = true;
                    break;
                case SHADER_OUTPUT_TYPE_OMASK:
                    m_modMD->psInfo.outputMask = true;
                    break;
                default:
                    break;
                }
            }
        }
    }

    // Removing discard(false) calls already counts as a modification of F.
    bool changed = !discardToDel.empty();
    for (auto I : discardToDel)
    {
        I->eraseFromParent();
    }

    Function* samplePhaseEntry = nullptr;
    Function* pixelPhaseEntry = nullptr;
    NamedMDNode* pixelNode = F.getParent()->getNamedMetadata("pixel_phase");
    NamedMDNode* sampleNode = F.getParent()->getNamedMetadata("sample_phase");
    if (sampleNode)
    {
        samplePhaseEntry = mdconst::dyn_extract<Function>(
            sampleNode->getOperand(0)->getOperand(0));
    }
    if (pixelNode)
    {
        pixelPhaseEntry = mdconst::dyn_extract<Function>(
            pixelNode->getOperand(0)->getOperand(0));
    }

    // For multirate PS, we will run discard lowering twice, first on sample
    // phase entry before link multi rate pass, second on pixel entry after
    // link multi rate pass. The check is to make sure only lower discards on
    // sample phase entry before link multi rate pass.
    if (samplePhaseEntry == nullptr || pixelPhaseEntry != &F)
    {
        if (lowerDiscards(F))
        {
            changed = true;
        }
    }

    // Both work lists are per-function state. lowerDiscards() erases the
    // instructions they point to, so neither list may survive into the next
    // invocation of this pass object.
    m_discards.clear();
    m_isHelperInvocationCalls.clear();

#ifdef DEBUG_DISCARD_OPT
    DumpLLVMIR(getAnalysis<CodeGenContextWrapper>().getCodeGenContext(), "discard");
#endif
    return changed;
}
}//namespace IGC