mirror of
				https://github.com/intel/intel-graphics-compiler.git
				synced 2025-10-30 08:18:26 +08:00 
			
		
		
		
	
		
			
				
	
	
		
			2062 lines
		
	
	
		
			79 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			2062 lines
		
	
	
		
			79 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| /*========================== begin_copyright_notice ============================
 | |
| 
 | |
| Copyright (C) 2017-2024 Intel Corporation
 | |
| 
 | |
| SPDX-License-Identifier: MIT
 | |
| 
 | |
| ============================= end_copyright_notice ===========================*/
 | |
| 
 | |
| #include "AdaptorCommon/ImplicitArgs.hpp"
 | |
| #include "Compiler/CISACodeGen/WIAnalysis.hpp"
 | |
| #include "Compiler/CISACodeGen/helper.h"
 | |
| #include "Compiler/CodeGenContextWrapper.hpp"
 | |
| #include "Compiler/CodeGenPublic.h"
 | |
| #include "Compiler/IGCPassSupport.h"
 | |
| #include "common/debug/Debug.hpp"
 | |
| #include "common/igc_regkeys.hpp"
 | |
| #include "GenISAIntrinsics/GenIntrinsicInst.h"
 | |
| #include "common/LLVMWarningsPush.hpp"
 | |
| #include <llvm/IR/Function.h>
 | |
| #include <llvm/IR/CFG.h>
 | |
| #include <llvm/Support/CommandLine.h>
 | |
| #include <llvm/Support/Debug.h>
 | |
| #include <llvm/IR/Constants.h>
 | |
| #include "common/LLVMWarningsPop.hpp"
 | |
| 
 | |
| #include <string>
 | |
| #include <sstream>
 | |
| #include "Probe/Assertion.h"
 | |
| 
 | |
| using namespace llvm;
 | |
| using namespace IGC;
 | |
| using namespace IGC::IGCMD;
 | |
| using namespace IGC::Debug;
 | |
| 
 | |
// LLVM command-line toggle: print the analysis result to stdout after each
// run (used together with the PrintToConsole igc key; see run()).
static cl::opt<bool> PrintWiaCheck("print-wia-check", cl::init(false), cl::Hidden,
                                   cl::desc("Debug wia-check analysis"));

// Register pass to igc-opt
#define PASS_FLAG "igc-wi-analysis"
#define PASS_DESCRIPTION "WIAnalysis provides work item dependency info"
#define PASS_CFG_ONLY true
#define PASS_ANALYSIS true
IGC_INITIALIZE_PASS_BEGIN(WIAnalysis, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
IGC_INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
IGC_INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
IGC_INITIALIZE_PASS_DEPENDENCY(MetaDataUtilsWrapper)
IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenContextWrapper)
IGC_INITIALIZE_PASS_DEPENDENCY(TranslationTable)
IGC_INITIALIZE_PASS_END(WIAnalysis, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)

// Pass identification token used by LLVM's pass infrastructure.
char WIAnalysis::ID = 0;
 | |
| 
 | |
| WIAnalysis::WIAnalysis() : FunctionPass(ID) { initializeWIAnalysisPass(*PassRegistry::getPassRegistry()); }
 | |
| 
 | |
// Minimum bitwidth an index value must have to be preserved by the
// analysis — NOTE(review): exact use is at call sites outside this chunk.
const unsigned int WIAnalysisRunner::MinIndexBitwidthToPreserve = 16;

// For dumping WIA info per each invocation: counts how many times the
// analysis ran on a given function so each dump file name is unique
// (see lock_print()).
DenseMap<const Function *, int> WIAnalysisRunner::m_funcInvocationId;
 | |
| 
 | |
/// Define shorter names for dependencies, for clarity of the conversion maps
/// Note that only UGL/UWG/UTH/RND are supported.
#define UGL WIAnalysis::UNIFORM_GLOBAL
#define UWG WIAnalysis::UNIFORM_WORKGROUP
#define UTH WIAnalysis::UNIFORM_THREAD
#define SEQ WIAnalysis::CONSECUTIVE
#define PTR WIAnalysis::PTR_CONSECUTIVE
#define STR WIAnalysis::STRIDED
#define RND WIAnalysis::RANDOM

// Printable name for each WIDependancy, indexed by enum value. The last
// four entries are abbreviated/padded so dump columns stay aligned.
static const char *const dep_str[] = {"uniform_global", "uniform_workgroup", "uniform_thread", "consecu",
                                      "p_conse",        "strided",           "random "};
 | |
| 
 | |
/// Dependency maps (define output dependency according to 2 input deps)

// Result dependency of an addition: row = dep of one operand, column = dep
// of the other. E.g. uniform + consecutive stays consecutive, while two
// consecutive/strided operands combine into strided.
static const WIAnalysis::WIDependancy add_conversion[WIAnalysis::NumDeps][WIAnalysis::NumDeps] = {
    /*          UGL, UWG, UTH, SEQ, PTR, STR, RND */
    // clang-format off
    /* UGL */  {UGL, UWG, UTH, SEQ, PTR, STR, RND},
    /* UWG */  {UWG, UWG, UTH, SEQ, PTR, STR, RND},
    /* UTH */  {UTH, UTH, UTH, SEQ, PTR, STR, RND},
    /* SEQ */  {SEQ, SEQ, SEQ, STR, STR, STR, RND},
    /* PTR */  {PTR, PTR, PTR, STR, STR, STR, RND},
    /* STR */  {STR, STR, STR, STR, STR, STR, RND},
    /* RND */  {RND, RND, RND, RND, RND, RND, RND}
    // clang-format on
};
 | |
| 
 | |
// Result dependency of a subtraction: row = dep of the left operand,
// column = dep of the right. Uniform minus consecutive becomes strided
// (negative stride); most mixed non-uniform combinations degrade to random.
static const WIAnalysis::WIDependancy
    // clang-format off
sub_conversion[WIAnalysis::NumDeps][WIAnalysis::NumDeps] = {
    /*          UGL, UWG, UTH, SEQ, PTR, STR, RND */
    /* UGL */  {UGL, UWG, UTH, STR, RND, RND, RND},
    /* UWG */  {UWG, UWG, UTH, STR, RND, RND, RND},
    /* UTH */  {UTH, UTH, UTH, STR, RND, RND, RND},
    /* SEQ */  {SEQ, SEQ, SEQ, RND, RND, RND, RND},
    /* PTR */  {PTR, PTR, PTR, RND, RND, RND, RND},
    /* STR */  {STR, STR, STR, RND, RND, RND, RND},
    /* RND */  {RND, RND, RND, RND, RND, RND, RND}
        // clang-format on
};
 | |
| 
 | |
// Result dependency of a multiplication: a uniform factor scales a
// consecutive/strided operand into strided; products of two non-uniform
// operands are random.
static const WIAnalysis::WIDependancy mul_conversion[WIAnalysis::NumDeps][WIAnalysis::NumDeps] = {
    // clang-format off
    /*          UGL, UWG, UTH, SEQ, PTR, STR, RND */
    /* UGL */  {UGL, UWG, UTH, STR, STR, STR, RND},
    /* UWG */  {UWG, UWG, UTH, STR, STR, STR, RND},
    /* UTH */  {UTH, UTH, UTH, STR, STR, STR, RND},
    /* SEQ */  {STR, STR, STR, RND, RND, RND, RND},
    /* PTR */  {STR, STR, STR, RND, RND, RND, RND},
    /* STR */  {STR, STR, STR, RND, RND, RND, RND},
    /* RND */  {RND, RND, RND, RND, RND, RND, RND}
    // clang-format on
};
 | |
| 
 | |
// select is to have a weaker dep of two
// Row/column are the deps of the two select inputs; the table yields the
// weaker of the two, except that mixing a uniform with a per-lane pattern
// (SEQ/PTR) degrades to strided, since each lane may pick either source.
static const WIAnalysis::WIDependancy select_conversion[WIAnalysis::NumDeps][WIAnalysis::NumDeps] = {
    // clang-format off
    /*          UGL, UWG, UTH, SEQ, PTR, STR, RND */
    /* UGL */  {UGL, UWG, UTH, STR, STR, STR, RND},
    /* UWG */  {UWG, UWG, UTH, STR, STR, STR, RND},
    /* UTH */  {UTH, UTH, UTH, STR, STR, STR, RND},
    /* SEQ */  {STR, STR, STR, SEQ, STR, STR, RND},
    /* PTR */  {STR, STR, STR, STR, PTR, STR, RND},
    /* STR */  {STR, STR, STR, STR, STR, STR, RND},
    /* RND */  {RND, RND, RND, RND, RND, RND, RND}
    // clang-format on
};
 | |
| 
 | |
// Result dependency of a GEP: row = dep of the base pointer, column = dep
// of the index. A uniform base with a consecutive index yields a
// pointer-consecutive result; a pointer-consecutive base stays so only
// with a uniform index.
static const WIAnalysis::WIDependancy gep_conversion[WIAnalysis::NumDeps][WIAnalysis::NumDeps] = {
    // clang-format off
    /* ptr\index UGL, UWG, UTH, SEQ, PTR, STR, RND */
    /* UGL */  {UGL, UWG, UTH, PTR, RND, RND, RND},
    /* UWG */  {UWG, UWG, UTH, PTR, RND, RND, RND},
    /* UTH */  {UTH, UTH, UTH, PTR, RND, RND, RND},
    /* SEQ */  {RND, RND, RND, RND, RND, RND, RND},
    /* PTR */  {PTR, PTR, PTR, RND, RND, RND, RND},
    /* STR */  {RND, RND, RND, RND, RND, RND, RND},
    /* RND */  {RND, RND, RND, RND, RND, RND, RND}
    // clang-format on
};
 | |
| 
 | |
// For better readability, the rank of a dependency is used to compare two dependencies
// to see which of them is weaker or stronger.
//
//   Dependancy rank : an integer value for each Dependancy, starting from 0.
//   Property of rank: the lower (smaller) the rank, the stronger the dependancy.
//
// Currently, enum value of each dependency is used exactly as its rank.
// Returns the rank of dependency D (its raw enum value).
inline int depRank(WIAnalysis::WIDependancy D) { return (int)D; }
 | |
| 
 | |
namespace IGC {
/// @Brief, given a conditional branch and its immediate post dominator,
/// find its influence-region and partial joins within the influence region
class BranchInfo {
public:
  /// @param inst conditional branch terminator under consideration
  /// @param ipd  immediate post-dominator block of that branch
  BranchInfo(const IGCLLVM::TerminatorInst *inst, const llvm::BasicBlock *ipd);

  void print(llvm::raw_ostream &OS) const;

  // The conditional branch this record describes.
  const IGCLLVM::TerminatorInst *cbr;
  // Immediate post-dominator: the block where all paths re-join.
  const llvm::BasicBlock *full_join;
  // Blocks whose execution depends on the branch outcome.
  llvm::DenseSet<llvm::BasicBlock *> influence_region;
  // Blocks inside the influence region where only a subset of paths merge.
  llvm::SmallPtrSet<llvm::BasicBlock *, 4> partial_joins;
};
} // namespace IGC
 | |
| 
 | |
// Pretty-print the analysis result for m_func to OS: argument dependencies
// first, then each basic block with its predecessors, its control-flow
// dependency, and the dependency computed for every instruction.
void WIAnalysisRunner::print(raw_ostream &OS, const Module *) const {
  // Assign a sequential id to every block so the dump can reference
  // predecessor/successor blocks compactly.
  DenseMap<BasicBlock *, int> BBIDs;
  int id = 0;
  for (Function::iterator I = m_func->begin(), E = m_func->end(); I != E; ++I, ++id) {
    BasicBlock *BB = &*I;
    BBIDs[BB] = id;
  }

  std::stringstream ss;
  ss << "WIAnalysis: " << m_func->getName().str();
  Banner(OS, ss.str());

  // Function arguments and their dependencies.
  OS << "Args: \n";
  for (Function::arg_iterator I = m_func->arg_begin(), E = m_func->arg_end(); I != E; ++I) {
    Value *AVal = &*I;

    if (m_depMap.GetAttributeWithoutCreating(AVal) != m_depMap.end())
      OS << "    " << dep_str[m_depMap.GetAttributeWithoutCreating(AVal)] << " " << *AVal << "\n";
    else
      OS << "  unknown " << *AVal << "\n";
  }
  OS << "\n";

  for (Function::iterator I = m_func->begin(), E = m_func->end(); I != E; ++I) {
    BasicBlock *BB = &*I;
    OS << "BB:" << BBIDs[BB];
    if (BB->hasName())
      OS << " " << BB->getName();
    OS << "       ; preds =";
    bool isFirst = true;
    for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) {
      BasicBlock *pred = *PI;
      OS << ((isFirst) ? " " : ", ") << "BB:" << BBIDs[pred] << "  ";
      if (pred->hasName())
        OS << pred->getName();
      isFirst = false;
    }
    {
      // Control-flow dependency of the block itself.
      auto dep = getCFDependency(BB);
      OS << "[ " << dep_str[dep] << " ]";
    }
    OS << "\n";
    for (BasicBlock::iterator it = BB->begin(), ie = BB->end(); it != ie; ++it) {
      Instruction *I = &*it;
      if (m_depMap.GetAttributeWithoutCreating(I) != m_depMap.end()) {
        OS << "  " << dep_str[m_depMap.GetAttributeWithoutCreating(I)] << " " << *I;
      } else {
        OS << "  unknown " << *I;
      }
      if (I->isTerminator()) {
        // Terminators additionally list their successor block ids.
        IGCLLVM::TerminatorInst *TI = dyn_cast<IGCLLVM::TerminatorInst>(I);
        OS << " [";
        for (unsigned i = 0, e = TI->getNumSuccessors(); i < e; ++i) {
          BasicBlock *succ = TI->getSuccessor(i);
          OS << " BB:" << BBIDs[succ];
        }
        OS << " ]";
      }
      OS << "\n";
    }
    OS << "\n";
  }
}
 | |
| 
 | |
| void WIAnalysisRunner::lock_print() {
 | |
|   IGC::Debug::DumpLock();
 | |
|   {
 | |
|     int id = m_funcInvocationId[m_func]++;
 | |
|     std::stringstream ss;
 | |
|     ss << m_func->getName().str() << "_WIAnalysis_" << id;
 | |
|     auto name = DumpName(IGC::Debug::GetShaderOutputName())
 | |
|                     .Hash(m_CGCtx->hash)
 | |
|                     .Type(m_CGCtx->type)
 | |
|                     .Pass(ss.str().c_str())
 | |
|                     .Extension("txt");
 | |
|     print(Dump(name, DumpType::DBG_MSG_TEXT).stream());
 | |
|   }
 | |
|   IGC::Debug::DumpUnlock();
 | |
| }
 | |
| 
 | |
| // Used for dumpping into a file with a fixed name while running in debugger
 | |
| void WIAnalysisRunner::dump() const {
 | |
|   auto name = DumpName(IGC::Debug::GetShaderOutputName())
 | |
|                   .Hash(m_CGCtx->hash)
 | |
|                   .Type(m_CGCtx->type)
 | |
|                   .Pass("WIAnalysis")
 | |
|                   .Extension("txt");
 | |
|   print(Dump(name, DumpType::DBG_MSG_TEXT).stream());
 | |
| }
 | |
| 
 | |
| void WIAnalysisRunner::init(llvm::Function *F, llvm::LoopInfo *LI, llvm::DominatorTree *DT,
 | |
|                             llvm::PostDominatorTree *PDT, IGC::IGCMD::MetaDataUtils *MDUtils,
 | |
|                             IGC::CodeGenContext *CGCtx, IGC::ModuleMetaData *ModMD, IGC::TranslationTable *TransTable,
 | |
|                             bool ForCodegen) {
 | |
|   m_func = F;
 | |
|   this->LI = LI;
 | |
|   this->DT = DT;
 | |
|   this->PDT = PDT;
 | |
|   this->ForCodegen = ForCodegen;
 | |
|   m_pMdUtils = MDUtils;
 | |
|   m_CGCtx = CGCtx;
 | |
|   m_ModMD = ModMD;
 | |
|   m_TT = TransTable;
 | |
| 
 | |
|   // CS uniformness
 | |
|   m_localIDxUniform = false;
 | |
|   m_localIDyUniform = false;
 | |
|   m_localIDzUniform = false;
 | |
| }
 | |
| 
 | |
// Execute the work-item dependency analysis on the function bound by init().
// Always returns false: this is an analysis; the IR is never modified.
bool WIAnalysisRunner::run() {
  auto &F = *m_func;
  // Skip functions that have no metadata entry, and the dummy
  // symbol-table program.
  if (m_pMdUtils->findFunctionsInfoItem(&F) == m_pMdUtils->end_FunctionsInfo())
    return false;
  if (IGC::isIntelSymbolTableVoidProgram(&F))
    return false;

  // Register the dependency map with the translation table so value
  // replacements elsewhere keep the map consistent.
  m_depMap.Initialize(m_TT);
  m_TT->RegisterListener(&m_depMap);

  // Reset worklists and all per-run state.
  m_changed1.clear();
  m_changed2.clear();
  m_pChangedNew = &m_changed1;
  m_pChangedOld = &m_changed2;
  m_ctrlBranches.clear();

  m_storeDepMap.clear();
  m_allocaDepMap.clear();
  m_forcedUniforms.clear();

  // Seed dependencies for the function's arguments.
  updateArgsDependency(&F);

  if (!IGC_IS_FLAG_ENABLED(DisableUniformAnalysis)) {
    // Compute the  first iteration of the WI-dep according to ordering
    // instructions this ordering is generally good (as it usually correlates
    // well with dominance).
    inst_iterator it = inst_begin(F);
    inst_iterator e = inst_end(F);
    for (; it != e; ++it) {
      calculate_dep(&*it);
    }

    // Recursively check if WI-dep changes and if so recalculates
    // the WI-dep and marks the users for re-checking.
    // This procedure is guaranteed to converge since WI-dep can only
    // become less uniform (uniform->consecutive->ptr->stride->random).
    updateDeps();

    // sweep the dataflow started from those GenISA_vectorUniform,
    // force all the insert-elements and phi-nodes to uniform
    std::set<const Value *> visited;
    while (!m_forcedUniforms.empty()) {
      const Value *V = m_forcedUniforms.back();
      m_forcedUniforms.pop_back();
      visited.insert(V);
      // Propagate forced uniformity only through same-typed
      // insert-element and phi users.
      for (auto UI = V->user_begin(), UE = V->user_end(); UI != UE; ++UI) {
        const Value *use = (*UI);
        if (!visited.count(use) && use->getType() == V->getType()) {
          if (auto INS = dyn_cast<InsertElementInst>(use)) {
            if (!isUniform(use))
              m_depMap.SetAttribute(INS, WIAnalysis::UNIFORM_THREAD);
            m_forcedUniforms.push_back(use);
          } else if (auto PHI = dyn_cast<PHINode>(use)) {
            if (!isUniform(use))
              m_depMap.SetAttribute(PHI, WIAnalysis::UNIFORM_THREAD);
            m_forcedUniforms.push_back(use);
          }
        }
      }
    }
  }

  if (IGC_IS_FLAG_ENABLED(DumpWIA)) {
    // Dump into a unique file under dump dir, per each function invocation.
    lock_print();
  }

  // Original print to stdout
  //   Need igc key PrintToConsole and llvm flag -print-wia-check
  if (PrintWiaCheck) {
    print(ods());
  }
  return false;
}
 | |
| 
 | |
// Pass entry point: collect the required analyses and metadata, bind the
// embedded runner to F, and execute the analysis. Returns false (IR is
// never modified).
bool WIAnalysis::runOnFunction(Function &F) {
  auto *MDUtils = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  auto *PDT = &getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
  auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  auto *CGCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
  auto *ModMD = getAnalysis<MetaDataUtilsWrapper>().getModuleMetaData();
  auto *pTT = &getAnalysis<TranslationTable>();

  Runner.init(&F, LI, DT, PDT, MDUtils, CGCtx, ModMD, pTT);
  return Runner.run();
}
 | |
| 
 | |
| void WIAnalysisRunner::updateDeps() {
 | |
|   // As lonst as we have values to update
 | |
|   while (!m_pChangedNew->empty()) {
 | |
|     // swap between changedSet pointers - recheck the newChanged(now old)
 | |
|     std::swap(m_pChangedNew, m_pChangedOld);
 | |
|     // clear the newChanged set so it will be filled with the users of
 | |
|     // instruction which their WI-dep canged during the current iteration
 | |
|     m_pChangedNew->clear();
 | |
| 
 | |
|     // update all changed values
 | |
|     std::vector<const Value *>::iterator it = m_pChangedOld->begin();
 | |
|     std::vector<const Value *>::iterator e = m_pChangedOld->end();
 | |
|     for (; it != e; ++it) {
 | |
|       // remove first instruction
 | |
|       // calculate its new dependencey value
 | |
|       calculate_dep(*it);
 | |
|     }
 | |
|   }
 | |
| }
 | |
| 
 | |
| bool WIAnalysisRunner::isInstructionSimple(const Instruction *inst) {
 | |
|   // avoid changing cb load to sampler load, since sampler load
 | |
|   // has longer latency.
 | |
|   if (isa<LoadInst>(inst)) {
 | |
|     return false;
 | |
|   }
 | |
| 
 | |
|   if (isa<UnaryInstruction>(inst) || isa<BinaryOperator>(inst) || isa<CmpInst>(inst) || isa<SelectInst>(inst)) {
 | |
|     return true;
 | |
|   }
 | |
|   if (IsMathIntrinsic(GetOpCode((Instruction *)inst))) {
 | |
|     return true;
 | |
|   }
 | |
| 
 | |
|   return false;
 | |
| }
 | |
| 
 | |
| bool WIAnalysisRunner::needToBeUniform(const Value *val) {
 | |
|   for (auto UI = val->user_begin(), E = val->user_end(); UI != E; ++UI) {
 | |
|     if (const RTWriteIntrinsic *use = dyn_cast<RTWriteIntrinsic>(*UI)) {
 | |
|       if (use->getSampleIndex() == val || use->getBlendStateIndex() == val) {
 | |
|         return true;
 | |
|       }
 | |
|     }
 | |
|     // TODO add sampler cases
 | |
|   }
 | |
|   return false;
 | |
| }
 | |
| 
 | |
| bool WIAnalysisRunner::allUsesRandom(const Value *val) {
 | |
|   for (auto UI = val->user_begin(), E = val->user_end(); UI != E; ++UI) {
 | |
|     const Value *use = (*UI);
 | |
|     if (getDependency(use) != WIAnalysis::RANDOM) {
 | |
|       return false;
 | |
|     }
 | |
|   }
 | |
|   return true;
 | |
| }
 | |
| 
 | |
// Seed the dependency of every function argument: user args (uniform for
// kernels, random for subroutines), then implicit args, then push-analysis
// args. Argument order must match the metadata (see comment below).
void WIAnalysisRunner::updateArgsDependency(llvm::Function *pF) {
  /*
  Function Signature: define void @kernel(
      [OCL function args...],
      [implicit args...],
      [push analysis args...])

  Example push analysis args:
      float %urb_read_0, float %urb_read_1, float %urb_read_2, float %urb_read_3, float %urb_read_4

  Metadata Generated:
  !igc.pushanalysis.wi.info = !{!3, !4, !5, !6, !7}

  !3 = metadata !{metadata !"urb_read_0", i32 0, i32 4}
  !4 = metadata !{metadata !"urb_read_1", i32 1, i32 4}
  !5 = metadata !{metadata !"urb_read_2", i32 2, i32 4}
  !6 = metadata !{metadata !"urb_read_3", i32 3, i32 4}
  !7 = metadata !{metadata !"urb_read_4", i32 4, i32 4}

  Assumption is that the order of metadata matches the order of arguments in function.
  */

  // For a subroutine, conservatively assume that all user provided arguments
  // are random. Note that all other functions are treated as kernels.
  // To enable subroutine for other FEs, we need to update this check.
  bool IsSubroutine = !isEntryFunc(m_pMdUtils, pF) || isNonEntryMultirateShader(pF);

  ImplicitArgs implicitArgs(*pF, m_pMdUtils);
  // User args occupy the range [0, implicitArgStart); subroutines have no
  // push-analysis args appended.
  int implicitArgStart = (unsigned)(pF->arg_size() - implicitArgs.size() -
                                    (IsSubroutine ? 0 : m_ModMD->pushInfo.pushAnalysisWIInfos.size()));
  IGC_ASSERT_MESSAGE(implicitArgStart >= 0, "Function arg size does not match meta data and push args.");

  llvm::Function::arg_iterator ai, ae;
  ai = pF->arg_begin();
  ae = pF->arg_end();

  // 1. add all kernel function args as uniform, or
  //    add all subroutine function args as random
  for (int i = 0; i < implicitArgStart; ++i, ++ai) {
    IGC_ASSERT(ai != ae);
    incUpdateDepend(&(*ai), IsSubroutine ? WIAnalysis::RANDOM : WIAnalysis::UNIFORM_GLOBAL);
  }

  // 2. add implicit args
  //    By default, local IDs are not uniform. But if we know that the runtime dispatchs
  //    order (intel_reqd_workgroup_walk_order()) and work group size (reqd_work_group_size()),
  //    we may derive that some of local IDs are uniform.
  bool localX_uniform = false, localY_uniform = false, localZ_uniform = false;

  if (!IsSubroutine) {
    checkLocalIdUniform(pF, localX_uniform, localY_uniform, localZ_uniform);
  }

  for (unsigned i = 0; i < implicitArgs.size(); ++i, ++ai) {
    IGC_ASSERT(ai != ae);
    // Index into the implicit-arg table by position past the user args.
    const ImplicitArg &iArg = implicitArgs[ai->getArgNo() - implicitArgStart];
    WIAnalysis::WIDependancy dependency = iArg.getDependency();
    if ((localX_uniform && iArg.getArgType() == ImplicitArg::ArgType::LOCAL_ID_X) ||
        (localY_uniform && iArg.getArgType() == ImplicitArg::ArgType::LOCAL_ID_Y) ||
        (localZ_uniform && iArg.getArgType() == ImplicitArg::ArgType::LOCAL_ID_Z)) {
      // todo: may improve it to have UNIFORM_WORKGROUP
      dependency = WIAnalysis::UNIFORM_THREAD;
    }

    incUpdateDepend(&(*ai), dependency);
  }

  // 3. add push analysis args
  if (!IsSubroutine) {
    for (unsigned i = 0; i < m_ModMD->pushInfo.pushAnalysisWIInfos.size(); ++i, ++ai) {
      IGC_ASSERT(ai != ae);
      // Dependency recorded by push analysis for each pushed value.
      WIAnalysis::WIDependancy dependency =
          static_cast<WIAnalysis::WIDependancy>(m_ModMD->pushInfo.pushAnalysisWIInfos[i].argDependency);
      incUpdateDepend(&(*ai), dependency);
    }
  }
}
 | |
| 
 | |
// Thin pass-level wrappers: WIAnalysis delegates all queries to its
// embedded WIAnalysisRunner instance.
void WIAnalysis::print(llvm::raw_ostream &OS, const llvm::Module *M) const { Runner.print(OS, M); }

void WIAnalysis::dump() const { Runner.dump(); }

void WIAnalysis::incUpdateDepend(const llvm::Value *val, WIDependancy dep) { Runner.incUpdateDepend(val, dep); }

WIAnalysis::WIDependancy WIAnalysis::whichDepend(const llvm::Value *val) { return Runner.whichDepend(val); }

bool WIAnalysis::isUniform(const Value *val) const { return Runner.isUniform(val); }

bool WIAnalysis::isGlobalUniform(const Value *val) { return Runner.isGlobalUniform(val); }

bool WIAnalysis::isWorkGroupOrGlobalUniform(const Value *val) { return Runner.isWorkGroupOrGlobalUniform(val); }

bool WIAnalysis::insideDivergentCF(const Value *val) const { return Runner.insideDivergentCF(val); }

bool WIAnalysis::insideWorkgroupDivergentCF(const Value *val) const { return Runner.insideWorkgroupDivergentCF(val); }
 | |
| 
 | |
| WIAnalysis::WIDependancy WIAnalysisRunner::whichDepend(const Value *val) const {
 | |
|   IGC_ASSERT_MESSAGE(m_pChangedNew->empty(), "set should be empty before query");
 | |
|   IGC_ASSERT_MESSAGE(nullptr != val, "Bad value");
 | |
|   if (isa<Constant>(val)) {
 | |
|     return WIAnalysis::UNIFORM_GLOBAL;
 | |
|   } else if (isa<StaticConstantPatchIntrinsic>(val)) {
 | |
|     return WIAnalysis::UNIFORM_GLOBAL;
 | |
|   }
 | |
|   auto EL = m_depMap.GetAttributeWithoutCreating(val);
 | |
|   if (IGC_IS_FLAG_ENABLED(DisableUniformAnalysis)) {
 | |
|     if (EL == m_depMap.end()) {
 | |
|       return WIAnalysis::RANDOM;
 | |
|     }
 | |
|   }
 | |
|   IGC_ASSERT(EL != m_depMap.end());
 | |
|   return EL;
 | |
| }
 | |
| 
 | |
| bool WIAnalysisRunner::isUniform(const Value *val) const {
 | |
|   if (!hasDependency(val))
 | |
|     return false;
 | |
| 
 | |
|   return WIAnalysis::isDepUniform(whichDepend(val));
 | |
| }
 | |
| 
 | |
| bool WIAnalysisRunner::isWorkGroupOrGlobalUniform(const Value *val) const {
 | |
|   if (!hasDependency(val))
 | |
|     return false;
 | |
|   WIAnalysis::WIDependancy dep = whichDepend(val);
 | |
|   return dep == WIAnalysis::UNIFORM_GLOBAL || dep == WIAnalysis::UNIFORM_WORKGROUP;
 | |
| }
 | |
| 
 | |
| bool WIAnalysisRunner::isGlobalUniform(const Value *val) const {
 | |
|   if (!hasDependency(val))
 | |
|     return false;
 | |
|   WIAnalysis::WIDependancy dep = whichDepend(val);
 | |
|   return dep == WIAnalysis::UNIFORM_GLOBAL;
 | |
| }
 | |
| 
 | |
| WIAnalysis::WIDependancy WIAnalysisRunner::getCFDependency(const BasicBlock *BB) const {
 | |
|   auto II = m_ctrlBranches.find(BB);
 | |
|   if (II == m_ctrlBranches.end())
 | |
|     return WIAnalysis::UNIFORM_GLOBAL;
 | |
| 
 | |
|   WIAnalysis::WIDependancy dep = WIAnalysis::UNIFORM_GLOBAL;
 | |
|   for (auto *BI : II->second) {
 | |
|     auto newDep = whichDepend(BI);
 | |
|     if (depRank(dep) < depRank(newDep))
 | |
|       dep = newDep;
 | |
|   }
 | |
| 
 | |
|   return dep;
 | |
| }
 | |
| 
 | |
| bool WIAnalysisRunner::insideWorkgroupDivergentCF(const Value *val) const {
 | |
|   if (auto *I = dyn_cast<Instruction>(val)) {
 | |
|     auto dep = getCFDependency(I->getParent());
 | |
|     return depRank(dep) > WIAnalysis::UNIFORM_WORKGROUP;
 | |
|   }
 | |
| 
 | |
|   return false;
 | |
| }
 | |
| 
 | |
| WIAnalysis::WIDependancy WIAnalysisRunner::getDependency(const Value *val) {
 | |
|   if (m_depMap.GetAttributeWithoutCreating(val) == m_depMap.end()) {
 | |
|     // Make sure that constants are not added in the map.
 | |
|     if (!isa<Instruction>(val) && !isa<Argument>(val)) {
 | |
|       return WIAnalysis::UNIFORM_GLOBAL;
 | |
|     }
 | |
|     // Don't expect this happens, let's assertion fail
 | |
|     IGC_ASSERT_MESSAGE(0, "Dependence for 'val' should bave been set already!");
 | |
|   }
 | |
|   IGC_ASSERT(m_depMap.GetAttributeWithoutCreating(val) != m_depMap.end());
 | |
|   return m_depMap.GetAttributeWithoutCreating(val);
 | |
| }
 | |
| 
 | |
| bool WIAnalysisRunner::hasDependency(const Value *val) const {
 | |
| 
 | |
|   if (!isa<Instruction>(val) && !isa<Argument>(val)) {
 | |
|     return true;
 | |
|   }
 | |
|   return (m_depMap.GetAttributeWithoutCreating(val) != m_depMap.end());
 | |
| }
 | |
| 
 | |
| static bool HasPhiUse(const llvm::Value *inst) {
 | |
|   for (auto UI = inst->user_begin(), E = inst->user_end(); UI != E; ++UI) {
 | |
|     if (llvm::isa<llvm::PHINode>(*UI)) {
 | |
|       return true;
 | |
|     }
 | |
|   }
 | |
|   return false;
 | |
| }
 | |
| 
 | |
| void WIAnalysisRunner::calculate_dep(const Value *val) {
 | |
|   IGC_ASSERT_MESSAGE(nullptr != val, "Bad value");
 | |
| 
 | |
|   // Not an instruction, must be a constant or an argument
 | |
|   // Could this vector type be of a constant which
 | |
|   // is not uniform ?
 | |
|   IGC_ASSERT_MESSAGE(isa<Instruction>(val), "Could we reach here with non instruction value?");
 | |
| 
 | |
|   const Instruction *const inst = dyn_cast<Instruction>(val);
 | |
|   IGC_ASSERT_MESSAGE(nullptr != inst, "This Value is not an Instruction");
 | |
|   if (inst) {
 | |
|     bool hasOriginal = hasDependency(inst);
 | |
|     WIAnalysis::WIDependancy orig;
 | |
|     // We only calculate dependency on unset instructions if all their operands
 | |
|     // were already given dependency. This is good for compile time since these
 | |
|     // instructions will be visited again after the operands dependency is set.
 | |
|     // An exception are phi nodes since they can be the ancestor of themselves in
 | |
|     // the def-use chain. Note that in this case we force the phi to have the
 | |
|     // pre-header value already calculated.
 | |
|     //
 | |
|     // Another case is that an inst might be set under control dependence (for example, phi)
 | |
|     // before any of its operands have been set. In this case, we will skip here. Here
 | |
|     // is the example (derived from ocl scheduler):
 | |
|     //      B0:  (p) goto Bt
 | |
|     //      B1:  goto Bf
 | |
|     //  L   B2:  x.lcssa = phi (x.0, Bn)      // B2: partial join
 | |
|     //      ...
 | |
|     //      Bt: ...
 | |
|     //      ...
 | |
|     //      Bf:
 | |
|     //      ...
 | |
|     //          goto Bm (out of loop)
 | |
|     //      Bn:
 | |
|     //          x.0 = ...
 | |
|     //          goto  B2
 | |
|     //      Bm:  ...
 | |
|     //      ...
 | |
|     //      B_ipd  ( iPDOM(B0) = B_ipd)
 | |
|     //
 | |
|     // B0's branch instruction has random dependency, which triggers control dependence calculation.
 | |
|     // B2 is a partial join in InfluenceRegion. Thus its phi is marked as random, but its operand
 | |
|     // x.0 is still not set yet.
 | |
|     unsigned int unsetOpNum = 0;
 | |
|     for (unsigned i = 0; i < inst->getNumOperands(); ++i) {
 | |
|       if (!hasDependency(inst->getOperand(i)))
 | |
|         unsetOpNum++;
 | |
|     }
 | |
|     if (isa<PHINode>(inst)) {
 | |
|       // We do not calculate PhiNode with all incoming values unset.
 | |
|       //
 | |
|       // This seems right as we don't expect a phi that only depends upon other
 | |
|       // phi's (if it happens, those phis form a cycle dependency) so any phi's
 | |
|       // calculation will eventually be triggered from calculating a non-phi one
 | |
|       // which the phi depends upon.
 | |
|       if (unsetOpNum == inst->getNumOperands())
 | |
|         return;
 | |
|     } else {
 | |
|       // We do not calculate non-PhiNode instruction that have unset operands
 | |
|       if (unsetOpNum > 0)
 | |
|         return;
 | |
| 
 | |
|       // We have all operands set. Check a special case from calculate_dep for
 | |
|       // binary ops (see the details below). It checks for ASHR+ADD and ASHR+SHL
 | |
|       // cases, and in particular it accesses dependency for ADD operands. It
 | |
|       // could happen these operands are not processed yet and in such case
 | |
|       // getDependency raises the assertion. Thus check if dependency is set.
 | |
|       // Currently we need to check dependency for ASHR->ADD operands only.
 | |
|       // For SHR, its operands are checked to be constant so skip this case.
 | |
|       // This code could be extended further depending on requirements.
 | |
|       if (inst->getOpcode() == Instruction::AShr) {
 | |
|         BinaryOperator *op0 = dyn_cast<BinaryOperator>(inst->getOperand(0));
 | |
|         if (op0 && op0->getOpcode() == Instruction::Add && !hasDependency(op0->getOperand(1))) {
 | |
|           return;
 | |
|         }
 | |
|       }
 | |
|     }
 | |
| 
 | |
|     if (!hasOriginal) {
 | |
|       orig = WIAnalysis::UNIFORM_GLOBAL;
 | |
|     } else {
 | |
|       orig = m_depMap.GetAttributeWithoutCreating(inst);
 | |
| 
 | |
|       // if inst is already marked random, it cannot get better
 | |
|       if (orig == WIAnalysis::RANDOM) {
 | |
|         return;
 | |
|       }
 | |
|     }
 | |
| 
 | |
|     WIAnalysis::WIDependancy dep = orig;
 | |
| 
 | |
|     // LLVM does not have compile time polymorphisms
 | |
|     // TODO: to make things faster we may want to sort the list below according
 | |
|     // to the order of their probability of appearance.
 | |
|     if (const BinaryOperator *BI = dyn_cast<BinaryOperator>(inst))
 | |
|       dep = calculate_dep(BI);
 | |
|     else if (const CallInst *CI = dyn_cast<CallInst>(inst))
 | |
|       dep = calculate_dep(CI);
 | |
|     else if (isa<CmpInst>(inst))
 | |
|       dep = calculate_dep_simple(inst);
 | |
|     else if (isa<ExtractElementInst>(inst))
 | |
|       dep = calculate_dep_simple(inst);
 | |
|     else if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(inst))
 | |
|       dep = calculate_dep(GEP);
 | |
|     else if (isa<InsertElementInst>(inst))
 | |
|       dep = calculate_dep_simple(inst);
 | |
|     else if (isa<InsertValueInst>(inst))
 | |
|       dep = calculate_dep_simple(inst);
 | |
|     else if (const PHINode *Phi = dyn_cast<PHINode>(inst))
 | |
|       dep = calculate_dep(Phi);
 | |
|     else if (isa<ShuffleVectorInst>(inst))
 | |
|       dep = calculate_dep_simple(inst);
 | |
|     else if (isa<StoreInst>(inst))
 | |
|       dep = calculate_dep_simple(inst);
 | |
|     else if (inst->isTerminator())
 | |
|       dep = calculate_dep_terminator(dyn_cast<IGCLLVM::TerminatorInst>(inst));
 | |
|     else if (const SelectInst *SI = dyn_cast<SelectInst>(inst))
 | |
|       dep = calculate_dep(SI);
 | |
|     else if (const AllocaInst *AI = dyn_cast<AllocaInst>(inst))
 | |
|       dep = calculate_dep(AI);
 | |
|     else if (const CastInst *CI = dyn_cast<CastInst>(inst))
 | |
|       dep = calculate_dep(CI);
 | |
|     else if (isa<ExtractValueInst>(inst))
 | |
|       dep = calculate_dep_simple(inst);
 | |
|     else if (const LoadInst *LI = dyn_cast<LoadInst>(inst))
 | |
|       dep = calculate_dep(LI);
 | |
|     else if (const VAArgInst *VAI = dyn_cast<VAArgInst>(inst))
 | |
|       dep = calculate_dep(VAI);
 | |
|     else if (inst->getOpcode() == Instruction::FNeg)
 | |
|       dep = calculate_dep_simple(inst);
 | |
|     else if (inst->getOpcode() == Instruction::Freeze)
 | |
|       dep = calculate_dep_simple(inst);
 | |
| 
 | |
|     if (m_func->hasFnAttribute("KMPLOCK")) {
 | |
|       dep = WIAnalysis::UNIFORM_THREAD;
 | |
|     }
 | |
| 
 | |
|     // Spec enforces subgroup broadcast to use thread-uniform local ID.
 | |
|     if (isUsedByWaveBroadcastAsLocalID(inst)) {
 | |
| #ifdef _DEBUG
 | |
|       // Print warning exactly once per kernel.
 | |
|       static SmallPtrSet<Function *, 8> detectedKernels;
 | |
|       if (dep > WIAnalysis::UNIFORM_THREAD && !detectedKernels.count(m_func)) {
 | |
|         detectedKernels.insert(m_func);
 | |
|         std::string msg =
 | |
|             "Detected llvm.genx.GenISA.WaveBroadcast with potentially non-uniform LocalID in kernel " +
 | |
|             m_func->getName().str() +
 | |
|             "; such operation doesn't meet specification of OpGroupBroadcast and can lead to unexpected results.";
 | |
|         m_CGCtx->EmitWarning(msg.c_str());
 | |
|       }
 | |
| #endif // DEBUG
 | |
|     }
 | |
| 
 | |
|     // If the value was changed in this calculation
 | |
|     if (!hasOriginal || dep != orig) {
 | |
|       // i1 instructions used in phi cannot be uniform as it may prevent us from removing the phi of 1
 | |
|       if (ForCodegen && inst->getType()->isIntegerTy(1) && WIAnalysis::isDepUniform(dep) && HasPhiUse(inst)) {
 | |
|         dep = WIAnalysis::RANDOM;
 | |
|       }
 | |
|       // Update dependence of this instruction if dep is weaker than orig.
 | |
|       // Note depRank(orig) could be higher than depRank(dep) for phi.
 | |
|       // (Algo will never decrease the rank of a value.)
 | |
|       WIAnalysis::WIDependancy newDep = depRank(orig) < depRank(dep) ? dep : orig;
 | |
|       if (!hasOriginal || newDep != orig) {
 | |
|         // update only if it is a new dep
 | |
|         updateDepMap(inst, newDep);
 | |
|       }
 | |
|       // divergent branch, trigger updates due to control-dependence
 | |
|       if (inst->isTerminator() && dep != WIAnalysis::UNIFORM_GLOBAL) {
 | |
|         update_cf_dep(dyn_cast<IGCLLVM::TerminatorInst>(inst));
 | |
|       }
 | |
|     }
 | |
|   }
 | |
| }
 | |
| 
 | |
| bool WIAnalysisRunner::isRegionInvariant(const llvm::Instruction *defi, BranchInfo *brInfo) {
 | |
|   constexpr uint8_t MAX_DEPTH = 4;
 | |
|   struct RegionOperand {
 | |
|     const llvm::Instruction *inst;
 | |
|     uint8_t operandNum;
 | |
|   };
 | |
| 
 | |
|   llvm::SmallVector<RegionOperand, MAX_DEPTH> operands;
 | |
|   operands.push_back({defi, 0});
 | |
| 
 | |
|   while (!operands.empty()) {
 | |
|     auto &rop = operands.back();
 | |
|     if (isa<PHINode>(rop.inst)) {
 | |
|       return false;
 | |
|     }
 | |
| 
 | |
|     if (rop.operandNum < rop.inst->getNumOperands()) {
 | |
|       Value *op = rop.inst->getOperand(rop.operandNum);
 | |
|       rop.operandNum++;
 | |
|       auto *srci = dyn_cast<Instruction>(op);
 | |
|       if (srci) {
 | |
|         if (!brInfo->influence_region.count(srci->getParent())) {
 | |
|           // go on to check the next operand
 | |
|           continue;
 | |
|         }
 | |
|         if (operands.size() + 1 > MAX_DEPTH) {
 | |
|           return false;
 | |
|         }
 | |
|         operands.push_back({srci, 0});
 | |
|       }
 | |
|     } else {
 | |
|       operands.pop_back();
 | |
|     }
 | |
|   }
 | |
|   return true;
 | |
| }
 | |
| 
 | |
| bool WIAnalysisRunner::isUsedByWaveBroadcastAsLocalID(const llvm::Instruction *inst) {
 | |
|   for (auto it = inst->users().begin(); it != inst->users().end(); ++it) {
 | |
|     const GenIntrinsicInst *GII = dyn_cast<GenIntrinsicInst>(*it);
 | |
|     if (GII && GII->getIntrinsicID() == GenISAIntrinsic::GenISA_WaveBroadcast && GII->getOperand(1) == inst)
 | |
|       return true;
 | |
|   }
 | |
| 
 | |
|   return false;
 | |
| }
 | |
| 
 | |
// Propagate the divergence of branch 'inst' to everything control-dependent
// on it: phi-nodes at the full join and at partial joins, and instructions
// defined inside the branch's influence region whose uses escape the region.
// Called only for branches whose dependency is not UNIFORM_GLOBAL.
void WIAnalysisRunner::update_cf_dep(const IGCLLVM::TerminatorInst *inst) {
  IGC_ASSERT(hasDependency(inst));
  WIBaseClass::WIDependancy instDep = getDependency(inst);

  BasicBlock *blk = (BasicBlock *)(inst->getParent());
  BasicBlock *ipd = PDT->getNode(blk)->getIDom()->getBlock();
  // a branch can have NULL immediate post-dominator when a function
  // has multiple exits in llvm-ir
  // compute influence region and the partial-joins
  BranchInfo br_info(inst, ipd);
  // debug: dump influence region and partial-joins
  // br_info.print(ods());
  auto *CbrLoop = LI->getLoopFor(blk);
  // Loop* IPDLoop = nullptr;
  // check dep-type for every phi in the full join
  if (ipd) {
    updatePHIDepAtJoin(ipd, &br_info);
    // IPDLoop = LI->getLoopFor(ipd);
  }
  // check dep-type for every phi in the partial-joins
  for (SmallPtrSet<BasicBlock *, 4>::iterator join_it = br_info.partial_joins.begin(),
                                              join_e = br_info.partial_joins.end();
       join_it != join_e; ++join_it) {
    auto *PJ = *join_it;
    // skip the special loop-entry case: a partial-join that dominates the
    // branch block only counts when it is reached by more than one edge
    // from inside the influence region
    if (DT->dominates(PJ, blk)) {
      int NumPreds = 0;
      for (auto *pred : predecessors(PJ)) {
        if (br_info.influence_region.count(pred)) {
          NumPreds++;
        }
      }
      if (NumPreds <= 1)
        continue;
    }
    auto *PJNode = DT->getNode(PJ);
    if (PJNode == nullptr) {
      // no dom-tree node: the partial-join must be unreachable; skip it
      IGC_ASSERT(!DT->isReachableFromEntry(PJ));
      continue;
    }
    auto PJDom = PJNode->getIDom()->getBlock();

    // If both partial-join and it IDom are in partial-join region
    // there are cases in which phi-nodes in partial-joins are not
    // relevant to the cbr under the investigation
    auto LoopA = LI->getLoopFor(PJDom);
    auto LoopB = LI->getLoopFor(PJ);
    if (br_info.partial_joins.count(PJDom)) {
      // both PJ and its IDom are outside the CBR loop
      if (!CbrLoop || !CbrLoop->contains(LoopA))
        continue;
      // CbrLoop contains both PJDom and PJ
      if (CbrLoop->contains(LoopB)) {
        // Either Cbr-block strongly dominates the PJDom
        // Or PJ dominates Cbr-block
        if ((blk != PJDom && DT->dominates(blk, PJDom)) || DT->dominates(PJ, blk))
          continue;
      }
    }
    updatePHIDepAtJoin(PJ, &br_info);
  }

  // walk through all the instructions in the influence-region
  // update the dep-type based upon its uses
  DenseSet<BasicBlock *>::iterator blk_it = br_info.influence_region.begin();
  DenseSet<BasicBlock *>::iterator blk_e = br_info.influence_region.end();
  for (; blk_it != blk_e; ++blk_it) {
    BasicBlock *def_blk = *blk_it;
    // add the branch into the controlling-branch set of the block
    // if the block is in the influence-region
    IGC_ASSERT(def_blk != br_info.full_join);
    m_ctrlBranches[def_blk].insert(inst);

    for (BasicBlock::iterator I = def_blk->begin(), E = def_blk->end(); I != E; ++I) {
      Instruction *defi = &(*I);
      if (hasDependency(defi) && depRank(getDependency(defi)) >= depRank(instDep)) {
        // defi is already weaker than or equal to inst (br), do nothing.
        continue;
      }

      if (const auto *st = dyn_cast<StoreInst>(defi)) {
        // If we encounter a store in divergent control flow,
        // we need to process the associated alloca (if any) again
        // because it might need to be RANDOM.
        auto it = m_storeDepMap.find(st);
        if (it != m_storeDepMap.end())
          m_pChangedNew->push_back(it->second);
      }

      // This is an optimization that tries to detect instruction
      // not really affected by control-flow divergency because
      // all the sources are outside the region.
      // However this is only as good as we can get because we
      // only search limited depth
      if (isRegionInvariant(defi, &br_info)) {
        continue;
      }
      // We need to look at where the use is in order to decide
      // we should make def to be "random" when loop is not in
      // LCSSA form because we do not have LCSSA phi-nodes.
      // 1) if use is in the full-join
      // 2) if use is even outside the full-join
      // 3) if use is in partial-join but def is not in partial-join
      // 4) if def and use are in partial-join but def inside loop
      Value::use_iterator use_it = defi->use_begin();
      Value::use_iterator use_e = defi->use_end();
      for (; use_it != use_e; ++use_it) {
        Instruction *user = dyn_cast<Instruction>((*use_it).getUser());
        IGC_ASSERT(user);
        BasicBlock *user_blk = user->getParent();
        PHINode *phi = dyn_cast<PHINode>(user);
        if (phi) {
          // another place we assume all critical edges have been
          // split and phi-move will be placed on those splitters
          user_blk = phi->getIncomingBlock(*use_it);
        }
        if (user_blk == def_blk) {
          // local def-use, not related to control-dependence
          continue; // skip
        }
        auto DefLoop = LI->getLoopFor(def_blk);
        auto UseLoop = LI->getLoopFor(user_blk);
        if (user_blk == br_info.full_join || !br_info.influence_region.count(user_blk) ||
            (br_info.partial_joins.count(user_blk) &&
             (!br_info.partial_joins.count(def_blk) || (DefLoop && !DefLoop->contains(UseLoop))))) {
          // the use escapes: demote the def to the branch's dependency
          updateDepMap(defi, instDep);
          // break out of the use loop
          // since def is changed to RANDOM, all uses will be changed later
          break;
        }
      } // end of usei loop
    } // end of defi loop within a block
  } // end of influence-region block loop
}
 | |
| 
 | |
| void WIAnalysisRunner::updatePHIDepAtJoin(BasicBlock *blk, BranchInfo *brInfo) {
 | |
|   // This is to bring down PHI's dep to br's dep.
 | |
|   // If PHI's dep is already weaker than br's dep, do nothing.
 | |
|   IGC_ASSERT(hasDependency(brInfo->cbr));
 | |
|   WIAnalysis::WIDependancy brDep = getDependency(brInfo->cbr);
 | |
| 
 | |
|   for (BasicBlock::iterator I = blk->begin(), E = blk->end(); I != E; ++I) {
 | |
|     Instruction *defi = &(*I);
 | |
|     PHINode *phi = dyn_cast<PHINode>(defi);
 | |
|     if (!phi) {
 | |
|       break;
 | |
|     }
 | |
|     if (hasDependency(phi) && depRank(getDependency(phi)) >= depRank(brDep)) {
 | |
|       // phi's dep is already the same or weaker, do nothing.
 | |
|       continue;
 | |
|     }
 | |
|     Value *trickySrc = nullptr;
 | |
|     for (unsigned predIdx = 0; predIdx < phi->getNumOperands(); ++predIdx) {
 | |
|       Value *srcVal = phi->getOperand(predIdx);
 | |
|       Instruction *defi = dyn_cast<Instruction>(srcVal);
 | |
|       if (defi && brInfo->influence_region.count(defi->getParent())) {
 | |
|         updateDepMap(phi, brDep);
 | |
|         break;
 | |
|       } else {
 | |
|         // if the src is an immed, or an argument, or defined outside,
 | |
|         // think about the phi-move that can be placed in the incoming block.
 | |
|         // this phi should be random if we have two different src-values like that.
 | |
|         // this is one place where we assume all critical edges have been split
 | |
|         BasicBlock *predBlk = phi->getIncomingBlock(predIdx);
 | |
|         if (brInfo->influence_region.count(predBlk)) {
 | |
|           if (!trickySrc) {
 | |
|             trickySrc = srcVal;
 | |
|           } else if (trickySrc != srcVal) {
 | |
|             updateDepMap(phi, brDep);
 | |
|             break;
 | |
|           }
 | |
|         }
 | |
|       }
 | |
|     }
 | |
|   }
 | |
| }
 | |
| 
 | |
| void WIAnalysisRunner::updateDepMap(const Instruction *inst, WIAnalysis::WIDependancy dep) {
 | |
|   // Save the new value of this instruction
 | |
|   m_depMap.SetAttribute(inst, dep);
 | |
|   // Register for update all of the dependent values of this updated
 | |
|   // instruction.
 | |
|   Value::const_user_iterator it = inst->user_begin();
 | |
|   Value::const_user_iterator e = inst->user_end();
 | |
|   for (; it != e; ++it) {
 | |
|     m_pChangedNew->push_back(*it);
 | |
|   }
 | |
|   if (const StoreInst *st = dyn_cast<StoreInst>(inst)) {
 | |
|     auto it = m_storeDepMap.find(st);
 | |
|     if (it != m_storeDepMap.end()) {
 | |
|       m_pChangedNew->push_back(it->second);
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   if (dep == WIAnalysis::RANDOM) {
 | |
|     EOPCODE eopcode = GetOpCode((Instruction *)inst);
 | |
|     if (eopcode == llvm_insert) {
 | |
|       updateInsertElements((const InsertElementInst *)inst);
 | |
|     } else if (const InsertValueInst *IVI = dyn_cast<const InsertValueInst>(inst)) {
 | |
|       updateInsertValues(IVI);
 | |
|     }
 | |
|   }
 | |
| }
 | |
| 
 | |
| /// if one of insert-element is random, turn all the insert-elements into random
 | |
| void WIAnalysisRunner::updateInsertElements(const InsertElementInst *inst) {
 | |
|   /// find the first one in the sequence
 | |
|   InsertElementInst *curInst = (InsertElementInst *)inst;
 | |
|   InsertElementInst *srcInst = dyn_cast<InsertElementInst>(curInst->getOperand(0));
 | |
|   while (srcInst) {
 | |
|     if (hasDependency(srcInst) && getDependency(srcInst) == WIAnalysis::RANDOM)
 | |
|       return;
 | |
|     curInst = srcInst;
 | |
|     srcInst = dyn_cast<InsertElementInst>(curInst->getOperand(0));
 | |
|   }
 | |
|   if (curInst != inst) {
 | |
|     m_depMap.SetAttribute(curInst, WIAnalysis::RANDOM);
 | |
|     Value::user_iterator it = curInst->user_begin();
 | |
|     Value::user_iterator e = curInst->user_end();
 | |
|     for (; it != e; ++it) {
 | |
|       m_pChangedNew->push_back(*it);
 | |
|     }
 | |
|   }
 | |
| }
 | |
| 
 | |
| /// If one of an insertvalue chain is random, turn all into random
 | |
| ///   The change of insertvalue is like the following:
 | |
| ///       st0 = insertvalue %INIT, %e0, 0
 | |
| ///       st1 = insertvalue %st0,  %e1, 1
 | |
| ///       st2 = insertvalue %st1,  %e2, 2
 | |
| ///       st3 = insertvalue %st2,  %e3, 3
 | |
| ///    here, {st0, st1, st2, st3} is one insertvalue chain, in which st0, st1
 | |
| ///    and st2 are all single use, st3 is either not a single use or used
 | |
| ///    in a non-insertvalue instruction.
 | |
| void WIAnalysisRunner::updateInsertValues(const InsertValueInst *Inst) {
 | |
|   /// find the first one in the sequence
 | |
|   const InsertValueInst *pI = Inst;
 | |
|   const InsertValueInst *aI = dyn_cast<const InsertValueInst>(pI->getOperand(0));
 | |
|   while (aI && aI->hasOneUse()) {
 | |
|     if (hasDependency(aI) && getDependency(aI) == WIAnalysis::RANDOM)
 | |
|       return;
 | |
|     pI = aI;
 | |
|     aI = dyn_cast<const InsertValueInst>(aI->getOperand(0));
 | |
|   }
 | |
|   if (pI != Inst) {
 | |
|     m_depMap.SetAttribute(pI, WIAnalysis::RANDOM);
 | |
|     auto it = pI->user_begin();
 | |
|     auto e = pI->user_end();
 | |
|     for (; it != e; ++it) {
 | |
|       m_pChangedNew->push_back(*it);
 | |
|     }
 | |
|   }
 | |
| }
 | |
| 
 | |
| WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep_simple(const Instruction *I) {
 | |
|   // simply check that all operands are uniform, if so return uniform, else random
 | |
|   const unsigned nOps = I->getNumOperands();
 | |
|   WIAnalysis::WIDependancy dep = WIAnalysis::UNIFORM_GLOBAL;
 | |
|   for (unsigned i = 0; i < nOps; ++i) {
 | |
|     const Value *op = I->getOperand(i);
 | |
|     WIAnalysis::WIDependancy D = getDependency(op);
 | |
|     dep = add_conversion[dep][D];
 | |
|     if (dep == WIAnalysis::RANDOM) {
 | |
|       break;
 | |
|     }
 | |
|   }
 | |
|   return dep;
 | |
| }
 | |
| 
 | |
// Loads get no special treatment: their dependency is the plain merge of the
// operands' dependencies (i.e. the address), via calculate_dep_simple.
WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep(const LoadInst *inst) { return calculate_dep_simple(inst); }
 | |
| 
 | |
| WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep(const BinaryOperator *inst) {
 | |
|   // Calculate the dependency type for each of the operands
 | |
|   Value *op0 = inst->getOperand(0);
 | |
|   Value *op1 = inst->getOperand(1);
 | |
| 
 | |
|   WIAnalysis::WIDependancy dep0 = getDependency(op0);
 | |
|   IGC_ASSERT(dep0 < WIAnalysis::NumDeps);
 | |
|   WIAnalysis::WIDependancy dep1 = getDependency(op1);
 | |
|   IGC_ASSERT(dep1 < WIAnalysis::NumDeps);
 | |
| 
 | |
|   // For whatever binary operation,
 | |
|   // uniform returns uniform
 | |
|   WIAnalysis::WIDependancy dep = select_conversion[dep0][dep1];
 | |
|   if (WIAnalysis::isDepUniform(dep)) {
 | |
|     return dep;
 | |
|   }
 | |
| 
 | |
|   // FIXME:: assumes that the X value does not cross the +/- border - risky !!!
 | |
|   // The pattern (and (X, C)), where C preserves the lower k bits of the value,
 | |
|   // is often used for truncating of numbers in 64bit. We assume that the index
 | |
|   // properties are not hurt by this.
 | |
|   if (inst->getOpcode() == Instruction::And) {
 | |
|     ConstantInt *C0 = dyn_cast<ConstantInt>(inst->getOperand(0));
 | |
|     ConstantInt *C1 = dyn_cast<ConstantInt>(inst->getOperand(1));
 | |
|     // Use any of the constants. Instcombine places constants on Op1
 | |
|     // so try Op1 first.
 | |
|     if (C1 || C0) {
 | |
|       ConstantInt *C = C1 ? C1 : C0;
 | |
|       dep = C1 ? dep0 : dep1;
 | |
|       // Cannot look at bit pattern of huge integers.
 | |
|       if (C->getBitWidth() < 65) {
 | |
|         uint64_t val = C->getZExtValue();
 | |
|         uint64_t ptr_mask = (1 << MinIndexBitwidthToPreserve) - 1;
 | |
|         // Zero all bits above the lower k bits that we are interested in
 | |
|         val &= (ptr_mask);
 | |
|         // Make sure that all of the remaining bits are active
 | |
|         if (val == ptr_mask) {
 | |
|           return dep;
 | |
|         }
 | |
|       }
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   // FIXME:: assumes that the X value does not cross the +/- border - risky !!!
 | |
|   // The pattern (ashr (shl X, C)C) is used for truncating of numbers in 64bit
 | |
|   // The constant C must leave at least 32bits of the original number
 | |
|   if (inst->getOpcode() == Instruction::AShr) {
 | |
|     BinaryOperator *SHL = dyn_cast<BinaryOperator>(inst->getOperand(0));
 | |
|     // We also allow add of uniform value between the ashr and shl instructions
 | |
|     // since instcombine creates this pattern when adding a constant.
 | |
|     // The shl forces all low bits to be zero, so there can be no carry to the
 | |
|     // high bits due to the addition. Addition with uniform preservs WI-dep.
 | |
|     if (SHL && SHL->getOpcode() == Instruction::Add) {
 | |
|       Value *addedVal = SHL->getOperand(1);
 | |
|       if (WIAnalysis::isDepUniform(getDependency(addedVal))) {
 | |
|         SHL = dyn_cast<BinaryOperator>(SHL->getOperand(0));
 | |
|       }
 | |
|     }
 | |
| 
 | |
|     if (SHL && SHL->getOpcode() == Instruction::Shl) {
 | |
|       ConstantInt *c_ashr = dyn_cast<ConstantInt>(inst->getOperand(1));
 | |
|       ConstantInt *c_shl = dyn_cast<ConstantInt>(SHL->getOperand(1));
 | |
|       const IntegerType *AshrTy = cast<IntegerType>(inst->getType());
 | |
|       if (c_ashr && c_shl && c_ashr->getZExtValue() == c_shl->getZExtValue()) {
 | |
|         // If wordWidth - shift_width >= 32 bits
 | |
|         if ((AshrTy->getBitWidth() - c_shl->getZExtValue()) >= MinIndexBitwidthToPreserve) {
 | |
|           // return the dep of the original X
 | |
|           return getDependency(SHL->getOperand(0));
 | |
|         }
 | |
|       }
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   switch (inst->getOpcode()) {
 | |
|     // Addition simply adds the stride value, except for ptr_consecutive
 | |
|     // which is promoted to strided.
 | |
|     // Another exception is when we subtract the tid: 1 - X which turns the
 | |
|     // tid order to random.
 | |
|   case Instruction::Add:
 | |
|   case Instruction::FAdd:
 | |
|     return add_conversion[dep0][dep1];
 | |
|   case Instruction::Sub:
 | |
|   case Instruction::FSub:
 | |
|     return sub_conversion[dep0][dep1];
 | |
| 
 | |
|   case Instruction::Mul:
 | |
|   case Instruction::FMul:
 | |
|   case Instruction::Shl:
 | |
|     if (WIAnalysis::isDepUniform(dep0) || WIAnalysis::isDepUniform(dep1)) {
 | |
|       // If one of the sides is uniform, then we can adopt
 | |
|       // the other side (stride*uniform is still stride).
 | |
|       // stride size is K, where K is the uniform input.
 | |
|       // An exception to this is ptr_consecutive, which is
 | |
|       // promoted to strided.
 | |
|       return mul_conversion[dep0][dep1];
 | |
|     }
 | |
|   default:
 | |
|     // TODO: Support more arithmetic if needed
 | |
|     return WIAnalysis::RANDOM;
 | |
|   }
 | |
|   return WIAnalysis::RANDOM;
 | |
| }
 | |
| 
 | |
| WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep(const CallInst *inst) {
 | |
|   // handle 3D specific intrinsics
 | |
|   EOPCODE intrinsic_name = GetOpCode((Instruction *)(inst));
 | |
|   GenISAIntrinsic::ID GII_id = GenISAIntrinsic::no_intrinsic;
 | |
|   if (const GenIntrinsicInst *GII = dyn_cast<GenIntrinsicInst>(inst)) {
 | |
|     GII_id = GII->getIntrinsicID();
 | |
|   }
 | |
| 
 | |
|   const llvm::IntrinsicInst *llvmintrin = dyn_cast<llvm::IntrinsicInst>(inst);
 | |
|   if (llvmintrin != nullptr && (llvmintrin->getIntrinsicID() == llvm::Intrinsic::stacksave ||
 | |
|                                 llvmintrin->getIntrinsicID() == llvm::Intrinsic::stackrestore)) {
 | |
|     return WIAnalysis::UNIFORM_THREAD;
 | |
|   }
 | |
| 
 | |
|   if (IsMathIntrinsic(intrinsic_name) || intrinsic_name == llvm_input || intrinsic_name == llvm_sgv ||
 | |
|       intrinsic_name == llvm_shaderinputvec || intrinsic_name == llvm_getbufferptr ||
 | |
|       intrinsic_name == llvm_runtimeValue || intrinsic_name == llvm_getMessagePhaseX ||
 | |
|       intrinsic_name == llvm_getMessagePhaseXV || intrinsic_name == llvm_readsurfacetypeandformat ||
 | |
|       intrinsic_name == llvm_simdSize || intrinsic_name == llvm_resinfoptr || intrinsic_name == llvm_sampleinfoptr ||
 | |
|       intrinsic_name == llvm_ldrawvector_indexed || intrinsic_name == llvm_ldraw_indexed ||
 | |
|       intrinsic_name == llvm_cycleCounter || intrinsic_name == llvm_waveShuffleIndex ||
 | |
|       intrinsic_name == llvm_waveBroadcast || intrinsic_name == llvm_waveClusteredBroadcast ||
 | |
|       intrinsic_name == llvm_waveBallot || intrinsic_name == llvm_waveClusteredBallot ||
 | |
|       intrinsic_name == llvm_waveAll || intrinsic_name == llvm_waveClustered || intrinsic_name == llvm_waveInterleave ||
 | |
|       intrinsic_name == llvm_waveClusteredInterleave || intrinsic_name == llvm_ld_ptr ||
 | |
|       intrinsic_name == llvm_ldlptr ||
 | |
|       (IGC_IS_FLAG_DISABLED(DisableUniformTypedAccess) && intrinsic_name == llvm_typed_read) ||
 | |
|       intrinsic_name == llvm_add_pair || intrinsic_name == llvm_sub_pair || intrinsic_name == llvm_mul_pair ||
 | |
|       intrinsic_name == llvm_ptr_to_pair || intrinsic_name == llvm_pair_to_ptr || intrinsic_name == llvm_fma ||
 | |
|       intrinsic_name == llvm_canonicalize || GII_id == GenISAIntrinsic::GenISA_uitof_rtz ||
 | |
|       GII_id == GenISAIntrinsic::GenISA_ftobf || GII_id == GenISAIntrinsic::GenISA_bftof ||
 | |
|       GII_id == GenISAIntrinsic::GenISA_2fto2bf || GII_id == GenISAIntrinsic::GenISA_dual_subslice_id ||
 | |
|       GII_id == GenISAIntrinsic::GenISA_hftobf8 || GII_id == GenISAIntrinsic::GenISA_bf8tohf ||
 | |
|       GII_id == GenISAIntrinsic::GenISA_srnd_ftohf || GII_id == GenISAIntrinsic::GenISA_srnd_hftobf8 ||
 | |
|       GII_id == GenISAIntrinsic::GenISA_hftohf8 || GII_id == GenISAIntrinsic::GenISA_hf8tohf ||
 | |
|       GII_id == GenISAIntrinsic::GenISA_ftotf32 || GII_id == GenISAIntrinsic::GenISA_GlobalBufferPointer ||
 | |
|       GII_id == GenISAIntrinsic::GenISA_LocalBufferPointer ||
 | |
|       GII_id == GenISAIntrinsic::GenISA_InlinedData ||
 | |
|       GII_id == GenISAIntrinsic::GenISA_GetShaderRecordPtr || GII_id == GenISAIntrinsic::GenISA_URBWrite ||
 | |
|       GII_id == GenISAIntrinsic::GenISA_URBRead || GII_id == GenISAIntrinsic::GenISA_URBReadOutput ||
 | |
|       GII_id == GenISAIntrinsic::GenISA_getSR0 || GII_id == GenISAIntrinsic::GenISA_getSR0_0 ||
 | |
|       GII_id == GenISAIntrinsic::GenISA_mul_rtz || GII_id == GenISAIntrinsic::GenISA_fma_rtz ||
 | |
|       GII_id == GenISAIntrinsic::GenISA_fma_rtp || GII_id == GenISAIntrinsic::GenISA_fma_rtn ||
 | |
|       GII_id == GenISAIntrinsic::GenISA_add_rtz || GII_id == GenISAIntrinsic::GenISA_slice_id ||
 | |
|       GII_id == GenISAIntrinsic::GenISA_subslice_id || GII_id == GenISAIntrinsic::GenISA_logical_subslice_id ||
 | |
|       GII_id == GenISAIntrinsic::GenISA_dual_subslice_id || GII_id == GenISAIntrinsic::GenISA_eu_id ||
 | |
|       GII_id == GenISAIntrinsic::GenISA_eu_thread_id || GII_id == GenISAIntrinsic::GenISA_movcr ||
 | |
|       GII_id == GenISAIntrinsic::GenISA_hw_thread_id || GII_id == GenISAIntrinsic::GenISA_hw_thread_id_alloca ||
 | |
|       GII_id == GenISAIntrinsic::GenISA_StackAlloca || GII_id == GenISAIntrinsic::GenISA_vectorUniform ||
 | |
|       GII_id == GenISAIntrinsic::GenISA_getR0 || GII_id == GenISAIntrinsic::GenISA_getPayloadHeader ||
 | |
|       GII_id == GenISAIntrinsic::GenISA_getGlobalOffset || GII_id == GenISAIntrinsic::GenISA_getWorkDim ||
 | |
|       GII_id == GenISAIntrinsic::GenISA_getNumWorkGroups || GII_id == GenISAIntrinsic::GenISA_getLocalSize ||
 | |
|       GII_id == GenISAIntrinsic::GenISA_getGlobalSize || GII_id == GenISAIntrinsic::GenISA_getEnqueuedLocalSize ||
 | |
|       GII_id == GenISAIntrinsic::GenISA_getLocalID_X || GII_id == GenISAIntrinsic::GenISA_getLocalID_Y ||
 | |
|       GII_id == GenISAIntrinsic::GenISA_getLocalID_Z || GII_id == GenISAIntrinsic::GenISA_getPrivateBase ||
 | |
|       GII_id == GenISAIntrinsic::GenISA_getPrintfBuffer || GII_id == GenISAIntrinsic::GenISA_getStageInGridOrigin ||
 | |
|       GII_id == GenISAIntrinsic::GenISA_getStageInGridSize || GII_id == GenISAIntrinsic::GenISA_getSyncBuffer ||
 | |
|       GII_id == GenISAIntrinsic::GenISA_getRtGlobalBufferPtr || GII_id == GenISAIntrinsic::GenISA_GetGlobalBufferArg ||
 | |
|       GII_id == GenISAIntrinsic::GenISA_GetImplicitBufferPtr || GII_id == GenISAIntrinsic::GenISA_GetLocalIdBufferPtr ||
 | |
|       GII_id == GenISAIntrinsic::GenISA_ReadFromReservedArgSpace ||
 | |
|       GII_id == GenISAIntrinsic::GenISA_getAssertBufferPtr ||
 | |
|       GII_id == GenISAIntrinsic::GenISA_staticConstantPatchValue ||
 | |
|       GII_id == GenISAIntrinsic::GenISA_bitcastfromstruct || GII_id == GenISAIntrinsic::GenISA_bitcasttostruct ||
 | |
|       GII_id == GenISAIntrinsic::GenISA_LSC2DBlockCreateAddrPayload ||
 | |
|       GII_id == GenISAIntrinsic::GenISA_LSC2DBlockCopyAddrPayload || GII_id == GenISAIntrinsic::GenISA_PredicatedLoad ||
 | |
|       GII_id == GenISAIntrinsic::GenISA_PredicatedStore || GII_id == GenISAIntrinsic::GenISA_bfn) {
 | |
|     switch (GII_id) {
 | |
|     default:
 | |
|       break;
 | |
|     case GenISAIntrinsic::GenISA_vectorUniform:
 | |
|       // collect the seeds for forcing uniform vectors
 | |
|       m_forcedUniforms.push_back(inst);
 | |
|       return WIAnalysis::UNIFORM_THREAD;
 | |
|     case GenISAIntrinsic::GenISA_getSR0:
 | |
|     case GenISAIntrinsic::GenISA_getSR0_0:
 | |
|     case GenISAIntrinsic::GenISA_eu_id:
 | |
|     case GenISAIntrinsic::GenISA_hw_thread_id:
 | |
|       return WIAnalysis::UNIFORM_THREAD;
 | |
|     case GenISAIntrinsic::GenISA_slice_id:
 | |
|     case GenISAIntrinsic::GenISA_subslice_id:
 | |
|     case GenISAIntrinsic::GenISA_logical_subslice_id:
 | |
|     case GenISAIntrinsic::GenISA_dual_subslice_id:
 | |
|       // Make sure they are UNIFORM_WORKGROUP
 | |
|       // return WIAnalysis::UNIFORM_WORKGROUP;
 | |
|       return WIAnalysis::UNIFORM_THREAD;
 | |
|     case GenISAIntrinsic::GenISA_GetImplicitBufferPtr:
 | |
|     case GenISAIntrinsic::GenISA_GetLocalIdBufferPtr:
 | |
|     case GenISAIntrinsic::GenISA_GetGlobalBufferArg:
 | |
|     case GenISAIntrinsic::GenISA_ReadFromReservedArgSpace:
 | |
|     case GenISAIntrinsic::GenISA_LSC2DBlockCreateAddrPayload:
 | |
|     case GenISAIntrinsic::GenISA_LSC2DBlockCopyAddrPayload:
 | |
|       return WIAnalysis::UNIFORM_THREAD;
 | |
|     case GenISAIntrinsic::GenISA_getR0:
 | |
|     case GenISAIntrinsic::GenISA_getPayloadHeader:
 | |
|     case GenISAIntrinsic::GenISA_getGlobalOffset:
 | |
|     case GenISAIntrinsic::GenISA_getWorkDim:
 | |
|     case GenISAIntrinsic::GenISA_getNumWorkGroups:
 | |
|     case GenISAIntrinsic::GenISA_getLocalSize:
 | |
|     case GenISAIntrinsic::GenISA_getGlobalSize:
 | |
|     case GenISAIntrinsic::GenISA_getEnqueuedLocalSize:
 | |
|     case GenISAIntrinsic::GenISA_getLocalID_X:
 | |
|     case GenISAIntrinsic::GenISA_getLocalID_Y:
 | |
|     case GenISAIntrinsic::GenISA_getLocalID_Z:
 | |
|     case GenISAIntrinsic::GenISA_getPrivateBase:
 | |
|     case GenISAIntrinsic::GenISA_getPrintfBuffer:
 | |
|     case GenISAIntrinsic::GenISA_getStageInGridOrigin:
 | |
|     case GenISAIntrinsic::GenISA_getStageInGridSize:
 | |
|     case GenISAIntrinsic::GenISA_getSyncBuffer:
 | |
|     case GenISAIntrinsic::GenISA_getRtGlobalBufferPtr:
 | |
|     case GenISAIntrinsic::GenISA_getAssertBufferPtr:
 | |
|       return ImplicitArgs::getArgDep(GII_id);
 | |
|     }
 | |
| 
 | |
|     if (intrinsic_name == llvm_input || intrinsic_name == llvm_shaderinputvec) {
 | |
|       if (auto *CI = dyn_cast<ConstantInt>(inst->getOperand(1))) {
 | |
|         e_interpolation mode = (e_interpolation)CI->getZExtValue();
 | |
|         if (mode != EINTERPOLATION_CONSTANT
 | |
|         ) {
 | |
|           return WIAnalysis::RANDOM;
 | |
|         }
 | |
|       } else {
 | |
|         return WIAnalysis::RANDOM;
 | |
|       }
 | |
|     }
 | |
| 
 | |
| 
 | |
|     if (intrinsic_name == llvm_sgv) {
 | |
|       IGC_ASSERT(isa<SGVIntrinsic>(inst));
 | |
|       const SGVIntrinsic *systemValueIntr = cast<SGVIntrinsic>(inst);
 | |
|       switch (systemValueIntr->getUsage()) {
 | |
|       case VFACE:                     // palygon front/back facing from PS payload
 | |
|       case RENDER_TARGET_ARRAY_INDEX: // render target array index from PS payload
 | |
|       case VIEWPORT_INDEX:            // viewport index from PS payload
 | |
|       {
 | |
|         IGC_ASSERT(m_CGCtx->type == ShaderType::PIXEL_SHADER);
 | |
|         const bool hasMultipolyDispatch = m_CGCtx->platform.hasDualKSPPS() || m_CGCtx->platform.supportDualSimd8PS();
 | |
|         return hasMultipolyDispatch ? WIAnalysis::RANDOM : WIAnalysis::UNIFORM_THREAD;
 | |
|       }
 | |
|       case POSITION_X:              // position from VUE header in GS or pixel position X in PS
 | |
|       case POSITION_Y:              // position from VUE header in GS or pixel position Y in PS
 | |
|       case POSITION_Z:              // position from VUE header in GS or source depth in PS
 | |
|       case POSITION_W:              // position from VUE header in GS or source W in PS
 | |
|       case PRIMITIVEID:             // primitive id payload phase in GS
 | |
|       case GS_INSTANCEID:           // GS instance id, calculated from URB handles
 | |
|       case POINT_WIDTH:             // point width in VUE header in GS
 | |
|       case INPUT_COVERAGE_MASK:     // pixel coverage mask payload phase in PS
 | |
|       case SAMPLEINDEX:             // sample index from PS payload
 | |
|       case CLIP_DISTANCE:           // unused
 | |
|       case THREAD_ID_X:             // global invocation id X in CS or OCL Kernel
 | |
|       case THREAD_ID_Y:             // global invocation id Y in CS or OCL Kernel
 | |
|       case THREAD_ID_Z:             // global invocation id Z in CS or OCL Kernel
 | |
|       case OUTPUT_CONTROL_POINT_ID: // unused
 | |
|       case DOMAIN_POINT_ID_X:       // domain point U from DS payload
 | |
|       case DOMAIN_POINT_ID_Y:       // domain point V from DS payload
 | |
|       case DOMAIN_POINT_ID_Z:       // domain point W from DS payload
 | |
|       case VERTEX_ID:               // vertex id in VS, delivered as an attribute
 | |
|       case REQUESTED_COARSE_SIZE_X: // requested per-subspan coarse pixel size X from PS payload
 | |
|       case REQUESTED_COARSE_SIZE_Y: // requested per-subspan coarse pixel size Y from PS payload
 | |
|       case CLIP_DISTANCE_X:         // DX10 clip distance X from VUE header in GS
 | |
|       case CLIP_DISTANCE_Y:         // DX10 clip distance Y from VUE header in GS
 | |
|       case CLIP_DISTANCE_Z:         // DX10 clip distance Z from VUE header in GS
 | |
|       case CLIP_DISTANCE_W:         // DX10 clip distance W from VUE header in GS
 | |
|       case CLIP_DISTANCE_HI_X:
 | |
|       case CLIP_DISTANCE_HI_Y:
 | |
|       case CLIP_DISTANCE_HI_Z:
 | |
|       case CLIP_DISTANCE_HI_W:
 | |
|       case POSITION_X_OFFSET: // pixel position offset X in PS
 | |
|       case POSITION_Y_OFFSET: // pixel position offset Y in PS
 | |
|       case POINT_COORD_X: // point-sprite coordinate X from PS attributes
 | |
|       case POINT_COORD_Y: // point-sprite coordinate Y from PS attributes
 | |
|       {
 | |
|         return WIAnalysis::RANDOM;
 | |
|       }
 | |
|       case THREAD_ID_IN_GROUP_X: // local invocation id X in CS or OCL Kernel
 | |
|       {
 | |
|         return m_localIDxUniform ? WIAnalysis::UNIFORM_THREAD : WIAnalysis::RANDOM;
 | |
|       }
 | |
|       case THREAD_ID_IN_GROUP_Y: // local invocation id Y in CS or OCL Kernel
 | |
|       {
 | |
|         return m_localIDyUniform ? WIAnalysis::UNIFORM_THREAD : WIAnalysis::RANDOM;
 | |
|       }
 | |
|       case THREAD_ID_IN_GROUP_Z: // local invocation id Z in CS or OCL Kernel
 | |
|       {
 | |
|         return m_localIDzUniform ? WIAnalysis::UNIFORM_THREAD : WIAnalysis::RANDOM;
 | |
|       }
 | |
|       case MSAA_RATE: // multisample rate from PS payload
 | |
|       case DISPATCH_DIMENSION_X:  // dispatch size X from MS payload
 | |
|       case DISPATCH_DIMENSION_Y:  // dispatch size Y from MS payload
 | |
|       case DISPATCH_DIMENSION_Z:  // dispatch size Z from MS payload
 | |
|       case INDIRECT_DATA_ADDRESS: // indirect data address from MS payload
 | |
|       case SHADER_TYPE: {
 | |
|         return WIAnalysis::UNIFORM_GLOBAL;
 | |
|       }
 | |
|       case THREAD_GROUP_ID_X: // workgroup id X in CS or OCL Kernel
 | |
|       case THREAD_GROUP_ID_Y: // workgroup id Y in CS or OCL Kernel
 | |
|       case THREAD_GROUP_ID_Z: // workgroup id Z in CS or OCL Kernel
 | |
|       {
 | |
|         return WIAnalysis::UNIFORM_WORKGROUP;
 | |
|       }
 | |
|       case ACTUAL_COARSE_SIZE_X:          // actual coarse pixel size X from PS payload
 | |
|       case ACTUAL_COARSE_SIZE_Y:          // actual coarse pixel size Y from PS payload
 | |
|       case THREAD_ID_WITHIN_THREAD_GROUP: // (physical) thread id in thread group from CS payload
 | |
|       {
 | |
|         return WIAnalysis::UNIFORM_THREAD;
 | |
|       }
 | |
|       case XP0: // base vertex from VS attributes
 | |
|       case XP1: // base instance from VS attributes
 | |
|       case XP2: // draw index from VS attributes
 | |
|       {
 | |
|         if (m_CGCtx->type == ShaderType::TASK_SHADER || m_CGCtx->type == ShaderType::MESH_SHADER) {
 | |
|           // XP0 is used for draw index in mesh and task
 | |
|           return WIAnalysis::UNIFORM_GLOBAL;
 | |
|         }
 | |
|         // Extended parameters are delivered in VS as attributes. Values
 | |
|         // are uniform but delivered per-vertex, frontends can use
 | |
|         // subgroup operations to get the uniform value.
 | |
|         return WIAnalysis::RANDOM;
 | |
|       }
 | |
|       case NUM_SGV:
 | |
|         IGC_ASSERT_MESSAGE(0, "Unexpected value");
 | |
|         break;
 | |
|         // This switch intentionally has no `default:` case. Whenever a new
 | |
|         // SGV type is added this code must be updated.
 | |
|       }
 | |
|     }
 | |
|     if (intrinsic_name == llvm_getMessagePhaseX || intrinsic_name == llvm_getMessagePhaseXV) {
 | |
|       return WIAnalysis::UNIFORM_THREAD;
 | |
|     }
 | |
| 
 | |
|     if (intrinsic_name == llvm_waveShuffleIndex || intrinsic_name == llvm_waveBroadcast) {
 | |
|       Value *op0 = inst->getArgOperand(0);
 | |
|       Value *op1 = inst->getArgOperand(1);
 | |
|       WIAnalysis::WIDependancy dep0 = getDependency(op0);
 | |
|       IGC_ASSERT(dep0 < WIAnalysis::NumDeps);
 | |
|       WIAnalysis::WIDependancy dep1 = getDependency(op1);
 | |
|       IGC_ASSERT(dep1 < WIAnalysis::NumDeps);
 | |
|       bool isUniform0 = WIAnalysis::isDepUniform(dep0);
 | |
|       bool isUniform1 = WIAnalysis::isDepUniform(dep1);
 | |
|       if ((isUniform0 && isUniform1) || (!isUniform0 && !isUniform1)) {
 | |
|         // Select worse one
 | |
|         return select_conversion[dep0][dep1];
 | |
|       } else {
 | |
|         // Select uniform one if only one is uniform
 | |
|         return isUniform0 ? dep0 : dep1;
 | |
|       }
 | |
|     }
 | |
| 
 | |
|     if (GII_id == GenISAIntrinsic::GenISA_StackAlloca) {
 | |
|       WIAnalysis::WIDependancy dep0 = WIAnalysis::UNIFORM_THREAD;
 | |
|       Value *op0 = inst->getArgOperand(0);
 | |
|       WIAnalysis::WIDependancy dep1 = getDependency(op0);
 | |
|       // Select worse one
 | |
|       return select_conversion[dep0][dep1];
 | |
|     }
 | |
| 
 | |
|     if (GII_id == GenISAIntrinsic::GenISA_staticConstantPatchValue) {
 | |
|       return WIAnalysis::UNIFORM_GLOBAL;
 | |
|     }
 | |
| 
 | |
|     if (intrinsic_name == llvm_waveBallot || intrinsic_name == llvm_waveAll) {
 | |
|       return WIAnalysis::UNIFORM_THREAD;
 | |
|     }
 | |
| 
 | |
|     if (intrinsic_name == llvm_waveClustered) {
 | |
|       const unsigned clusterSize =
 | |
|           static_cast<unsigned>(cast<llvm::ConstantInt>(inst->getArgOperand(2))->getZExtValue());
 | |
| 
 | |
|       constexpr unsigned maxSimdSize = 32;
 | |
|       if (clusterSize == maxSimdSize) {
 | |
|         // TODO: do the same for SIMD8 and SIMD16 if possible.
 | |
|         return WIAnalysis::UNIFORM_THREAD;
 | |
|       } else {
 | |
|         return WIAnalysis::RANDOM;
 | |
|       }
 | |
|     }
 | |
| 
 | |
|     if (intrinsic_name == llvm_waveInterleave || intrinsic_name == llvm_waveClusteredInterleave) {
 | |
|       return WIAnalysis::RANDOM;
 | |
|     }
 | |
| 
 | |
|     if (intrinsic_name == llvm_URBRead || intrinsic_name == llvm_URBReadOutput) {
 | |
|       if (!m_CGCtx->platform.isProductChildOf(IGFX_DG2)) {
 | |
|         return WIAnalysis::RANDOM;
 | |
|       }
 | |
|       if (m_CGCtx->type != ShaderType::TASK_SHADER && m_CGCtx->type != ShaderType::MESH_SHADER) {
 | |
|         return WIAnalysis::RANDOM;
 | |
|       }
 | |
|     }
 | |
| 
 | |
|     if (intrinsic_name == llvm_URBWrite) {
 | |
|       // TODO: enable this for other platforms/shader types if needed
 | |
|       if (!m_CGCtx->platform.isProductChildOf(IGFX_DG2)) {
 | |
|         return WIAnalysis::RANDOM;
 | |
|       }
 | |
|       if (m_CGCtx->type != ShaderType::TASK_SHADER && m_CGCtx->type != ShaderType::MESH_SHADER) {
 | |
|         return WIAnalysis::RANDOM;
 | |
|       }
 | |
|     }
 | |
| 
 | |
|     // Iterate over all input dependencies. If all are uniform - propagate it.
 | |
|     // otherwise - return RANDOM
 | |
|     unsigned numParams = IGCLLVM::getNumArgOperands(inst);
 | |
|     WIAnalysis::WIDependancy dep = WIAnalysis::UNIFORM_GLOBAL;
 | |
|     for (unsigned i = 0; i < numParams; ++i) {
 | |
|       Value *op = inst->getArgOperand(i);
 | |
|       WIAnalysis::WIDependancy tdep = getDependency(op);
 | |
|       dep = select_conversion[dep][tdep];
 | |
|       if (dep == WIAnalysis::RANDOM) {
 | |
|         break; // Uniformity check failed. no need to continue
 | |
|       }
 | |
|     }
 | |
|     return dep;
 | |
|   }
 | |
|   return WIAnalysis::RANDOM;
 | |
| }
 | |
| 
 | |
// Computes the work-item dependency of a GEP address. All indices except
// the last must combine (with the base pointer) to a uniform dependency;
// the final index is then merged through gep_conversion, which can produce
// consecutive/strided pointer patterns usable by load/store vectorization.
WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep(const GetElementPtrInst *inst) {
  const Value *opPtr = inst->getOperand(0);
  WIAnalysis::WIDependancy dep = getDependency(opPtr);
  // running over the all indices arguments except for the last
  // here we assume the pointer is the first operand
  unsigned num = inst->getNumIndices();
  for (unsigned i = 1; i < num; ++i) {
    const Value *op = inst->getOperand(i);
    WIAnalysis::WIDependancy tdep = getDependency(op);
    dep = select_conversion[dep][tdep];
    if (!WIAnalysis::isDepUniform(dep)) {
      // A divergent non-final index makes the whole address divergent.
      return WIAnalysis::RANDOM;
    }
  }
  // The last index (operand `num`, since operand 0 is the pointer) is
  // allowed to be consecutive/strided, not just uniform.
  const Value *lastInd = inst->getOperand(num);
  WIAnalysis::WIDependancy lastIndDep = getDependency(lastInd);
  return gep_conversion[dep][lastIndDep];
}
 | |
| 
 | |
| WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep(const PHINode *inst) {
 | |
|   unsigned num = inst->getNumIncomingValues();
 | |
|   bool foundFirst = 0;
 | |
|   WIAnalysis::WIDependancy totalDep = WIAnalysis::WIDependancy::INVALID;
 | |
| 
 | |
|   for (unsigned i = 0; i < num; ++i) {
 | |
|     Value *op = inst->getIncomingValue(i);
 | |
|     if (hasDependency(op)) {
 | |
|       if (!foundFirst) {
 | |
|         totalDep = getDependency(op);
 | |
|       } else {
 | |
|         totalDep = select_conversion[totalDep][getDependency(op)];
 | |
|       }
 | |
|       foundFirst = 1;
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   IGC_ASSERT_MESSAGE(foundFirst, "We should not reach here with All incoming values are unset");
 | |
| 
 | |
|   return totalDep;
 | |
| }
 | |
| 
 | |
| WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep_terminator(const IGCLLVM::TerminatorInst *inst) {
 | |
|   // Instruction has no return value
 | |
|   // Just need to know if this inst is uniform or not
 | |
|   // because we may want to avoid predication if the control flows
 | |
|   // in the function are uniform...
 | |
|   switch (inst->getOpcode()) {
 | |
|   case Instruction::Br: {
 | |
|     const BranchInst *brInst = cast<BranchInst>(inst);
 | |
|     if (brInst->isConditional()) {
 | |
|       // Conditional branch is uniform, if its condition is uniform
 | |
|       Value *op = brInst->getCondition();
 | |
|       WIAnalysis::WIDependancy dep = getDependency(op);
 | |
|       if (WIAnalysis::isDepUniform(dep)) {
 | |
|         return dep;
 | |
|       }
 | |
|       return WIAnalysis::RANDOM;
 | |
|     }
 | |
|     // Unconditional branch is non TID-dependent
 | |
|     return WIAnalysis::UNIFORM_GLOBAL;
 | |
|   }
 | |
|   // Return instructions are unconditional
 | |
|   case Instruction::Ret:
 | |
|     return WIAnalysis::UNIFORM_GLOBAL;
 | |
|   case Instruction::Unreachable:
 | |
|     return WIAnalysis::UNIFORM_GLOBAL;
 | |
|   case Instruction::IndirectBr:
 | |
|     return WIAnalysis::RANDOM;
 | |
|     // TODO: Define the dependency requirements of indirectBr
 | |
|   case Instruction::Switch: {
 | |
|     // Same as conditional br
 | |
|     const SwitchInst *swInst = cast<SwitchInst>(inst);
 | |
|     Value *op = swInst->getCondition();
 | |
|     WIAnalysis::WIDependancy dep = getDependency(op);
 | |
|     if (WIAnalysis::isDepUniform(dep)) {
 | |
|       return dep;
 | |
|     }
 | |
|     return WIAnalysis::RANDOM;
 | |
|   }
 | |
|   default:
 | |
|     return WIAnalysis::RANDOM;
 | |
|   }
 | |
| }
 | |
| 
 | |
// Computes the work-item dependency of a select. With a uniform mask the
// result is the weaker of the two value dependencies (or the chosen side's
// dependency when the mask is a compile-time constant); with a divergent
// mask the result is RANDOM since lanes may pick different sources.
WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep(const SelectInst *inst) {
  Value *op0 = inst->getOperand(0); // mask
  WIAnalysis::WIDependancy dep0 = getDependency(op0);
  if (WIAnalysis::isDepUniform(dep0)) {
    Value *op1 = inst->getOperand(1);
    Value *op2 = inst->getOperand(2);
    WIAnalysis::WIDependancy dep1 = getDependency(op1);
    WIAnalysis::WIDependancy dep2 = getDependency(op2);
    // In case of constant scalar select we can choose according to the mask.
    if (ConstantInt *C = dyn_cast<ConstantInt>(op0)) {
      uint64_t val = C->getZExtValue();
      if (val)
        return dep1;
      else
        return dep2;
    }
    // Select the "weaker" dep, but if only one dep is ptr_consecutive,
    // it must be promoted to strided ( as this data may
    // propagate to Load/Store instructions.
    WIAnalysis::WIDependancy tDep = select_conversion[dep1][dep2];
    return select_conversion[dep0][tDep];
  }
  // In case the mask is non-uniform the select outcome can be a combination
  // so we don't know nothing about it.
  return WIAnalysis::RANDOM;
}
 | |
| 
 | |
// Recursively walks all users of an alloca-derived value (through GEPs,
// bitcasts and addrspace casts) and records in `dep` the stores and
// lifetime.start intrinsics that touch it. Returns true iff every use is
// one the analysis understands (simple loads/stores, casts, lifetime
// markers, or GenISA_assume_uniform); any other use makes the alloca
// untrackable for uniformity purposes.
bool WIAnalysisRunner::TrackAllocaDep(const Value *I, AllocaDep &dep) {
  bool trackable = true;
  for (Value::const_user_iterator use_it = I->user_begin(), use_e = I->user_end(); use_it != use_e; ++use_it) {
    if (const GetElementPtrInst *gep = dyn_cast<GetElementPtrInst>(*use_it)) {
      // Follow address computations derived from the alloca.
      trackable &= TrackAllocaDep(gep, dep);
    } else if (const llvm::LoadInst *pLoad = llvm::dyn_cast<llvm::LoadInst>(*use_it)) {
      // Only simple (non-volatile, non-atomic) loads are supported.
      trackable &= (pLoad->isSimple());
    } else if (const llvm::StoreInst *pStore = llvm::dyn_cast<llvm::StoreInst>(*use_it)) {
      // Only simple stores are supported; remember them so the caller can
      // later check each store's uniformity.
      trackable &= (pStore->isSimple());
      // Not supported case: GEP instruction is the stored value of the StoreInst
      trackable &= (pStore->getValueOperand() != I);
      dep.stores.push_back(pStore);
    } else if (const llvm::BitCastInst *pBitCast = llvm::dyn_cast<llvm::BitCastInst>(*use_it)) {
      trackable &= TrackAllocaDep(pBitCast, dep);
    } else if (const llvm::AddrSpaceCastInst *pAddrCast = llvm::dyn_cast<llvm::AddrSpaceCastInst>(*use_it)) {
      trackable &= TrackAllocaDep(pAddrCast, dep);
    } else if (const GenIntrinsicInst *intr = dyn_cast<GenIntrinsicInst>(*use_it)) {
      GenISAIntrinsic::ID IID = intr->getIntrinsicID();
      if (IID == GenISAIntrinsic::GenISA_assume_uniform) {
        // Frontend asserts uniformity; recorded independently of trackable.
        dep.assume_uniform = true;
      } else
        trackable = false;
    } else if (const IntrinsicInst *intr = dyn_cast<IntrinsicInst>(*use_it)) {
      llvm::Intrinsic::ID IID = intr->getIntrinsicID();
      if (IID != llvm::Intrinsic::lifetime_start && IID != llvm::Intrinsic::lifetime_end) {
        trackable = false;
      } else if (IID == llvm::Intrinsic::lifetime_start)
        // Lifetime starts mark the alloca's logical placement; used later
        // to find a common dominator for control-dependence checks.
        dep.lifetimes.push_back(intr);
    } else {
      // This is some other instruction. Right now we don't want to handle these
      trackable = false;
    }
  }
  return trackable;
}
 | |
| 
 | |
// Computes the work-item dependency of an alloca. The alloca is treated as
// UNIFORM_THREAD only when all of its uses are trackable, every store into
// it is uniform, and no store is control-dependent on a divergent branch
// that does not dominate the alloca's logical location (derived from its
// lifetime.start markers); otherwise it is RANDOM.
WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep(const AllocaInst *inst) {
  if (m_CGCtx->platform.getWATable().WaNoA32ByteScatteredStatelessMessages) {
    // avoid generating A32 byte scatter on platforms not supporting it
    return WIAnalysis::RANDOM;
  }
  if (!hasDependency(inst)) {
    // First visit: collect stores/lifetimes and cache them only if all
    // uses are understood (or uniformity was explicitly assumed).
    AllocaDep dep;
    dep.assume_uniform = false;
    bool trackable = TrackAllocaDep(inst, dep);

    if (trackable || dep.assume_uniform) {
      m_allocaDepMap.insert(std::make_pair(inst, dep));
      // Reverse map so that a store's dependency change can re-trigger
      // analysis of the owning alloca.
      for (auto it : dep.stores) {
        m_storeDepMap.insert(std::make_pair(&(*it), inst));
      }
    }
  }
  auto depIt = m_allocaDepMap.find(inst);
  if (depIt == m_allocaDepMap.end()) {
    // If we haven't been able to track the dependency of the alloca make it random
    return WIAnalysis::RANDOM;
  }
  // find assume-uniform
  if (depIt->second.assume_uniform) {
    return WIAnalysis::UNIFORM_THREAD;
  }
  // find the common dominator block among all the life-time starts
  // that can be considered as the nearest logical location for alloca.
  const BasicBlock *CommonDomBB = nullptr;
  for (auto *SI : depIt->second.lifetimes) {
    auto BB = SI->getParent();
    IGC_ASSERT(BB);
    if (!CommonDomBB)
      CommonDomBB = BB;
    else
      CommonDomBB = DT->findNearestCommonDominator(CommonDomBB, BB);
  }
  if (!CommonDomBB) {
    // No lifetime markers: fall back to the alloca's own block.
    CommonDomBB = inst->getParent();
  }
  // if any store is not uniform, then alloca is not uniform
  // if any store is affected by a divergent branch after alloca,
  // then alloca is also not uniform
  for (auto *SI : depIt->second.stores) {
    if (hasDependency(SI)) {
      if (!WIAnalysis::isDepUniform(getDependency(SI))) {
        return WIAnalysis::RANDOM;
      }

      // m_ctrlBranches maps a block to the branches it is control-
      // dependent on; combine their dependencies.
      if (auto I = m_ctrlBranches.find(SI->getParent()); I != m_ctrlBranches.end()) {
        auto &Branches = I->second;
        WIAnalysis::WIDependancy cntrDep = WIAnalysis::UNIFORM_GLOBAL;
        for (auto *BrI : Branches) {
          // exclude those branches that dominates alloca
          if (!DT->dominates(BrI, CommonDomBB)) {
            // select a weaker one
            IGC_ASSERT(hasDependency(BrI));
            cntrDep = select_conversion[cntrDep][getDependency(BrI)];
            if (cntrDep == WIAnalysis::RANDOM)
              break;
          }
        }
        if (cntrDep == WIAnalysis::RANDOM)
          return WIAnalysis::RANDOM;
      }
    }
  }

  return WIAnalysis::UNIFORM_THREAD;
}
 | |
| 
 | |
| WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep(const CastInst *inst) {
 | |
|   Value *op0 = inst->getOperand(0);
 | |
|   WIAnalysis::WIDependancy dep0 = getDependency(op0);
 | |
| 
 | |
|   // independent remains independent
 | |
|   if (WIAnalysis::isDepUniform(dep0))
 | |
|     return dep0;
 | |
| 
 | |
|   switch (inst->getOpcode()) {
 | |
|   case Instruction::SExt:
 | |
|   case Instruction::FPTrunc:
 | |
|   case Instruction::FPExt:
 | |
|   case Instruction::PtrToInt:
 | |
|   case Instruction::IntToPtr:
 | |
|   case Instruction::AddrSpaceCast:
 | |
|   case Instruction::UIToFP:
 | |
|   case Instruction::FPToUI:
 | |
|   case Instruction::FPToSI:
 | |
|   case Instruction::SIToFP:
 | |
|     return dep0;
 | |
|   case Instruction::BitCast:
 | |
|   case Instruction::ZExt:
 | |
|     return WIAnalysis::RANDOM;
 | |
|     // FIXME:: assumes that the value does not cross the +/- border - risky !!!!
 | |
|   case Instruction::Trunc: {
 | |
|     const Type *destType = inst->getDestTy();
 | |
|     const IntegerType *intType = dyn_cast<IntegerType>(destType);
 | |
|     if (intType && (intType->getBitWidth() >= MinIndexBitwidthToPreserve)) {
 | |
|       return dep0;
 | |
|     }
 | |
|     return WIAnalysis::RANDOM;
 | |
|   }
 | |
|   default:
 | |
|     IGC_ASSERT_MESSAGE(0, "no such opcode");
 | |
|     // never get here
 | |
|     return WIAnalysis::RANDOM;
 | |
|   }
 | |
| }
 | |
| 
 | |
// va_arg is not expected in this compilation flow; assert in debug builds
// and treat it conservatively as divergent.
WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep(const VAArgInst *inst) {
  IGC_ASSERT_MESSAGE(0, "Are we supporting this ??");
  return WIAnalysis::RANDOM;
}
 | |
| 
 | |
// For compute shaders: sets m_localID{x,y,z}Uniform for local-ID dimensions
// that are provably uniform within a HW thread, based on the overridden
// walk order / thread-ID layout flags and the compile-time thread-group
// sizes stored as module globals. Only runs when both override flags are
// enabled.
void WIAnalysisRunner::CS_checkLocalIDs(Function *F) {
  CS_WALK_ORDER walkOrder = CS_WALK_ORDER::WO_XYZ;
  ThreadIDLayout idLayout = ThreadIDLayout::X;

  {
    // Without explicit overrides the actual walk order / layout chosen by
    // codegen is not known here, so do nothing.
    if (IGC_IS_FLAG_DISABLED(OverrideCsWalkOrderEnable) || IGC_IS_FLAG_DISABLED(OverrideCsTileLayoutEnable))
      return;

    walkOrder = (CS_WALK_ORDER)IGC_GET_FLAG_VALUE(OverrideCsWalkOrder);
    idLayout = (ThreadIDLayout)IGC_GET_FLAG_VALUE(OverrideCsTileLayout);
  }

  if (idLayout == ThreadIDLayout::TileY || idLayout == ThreadIDLayout::QuadTile) {
    // Need clarification on semantics. Skip for now.
    return;
  }

  // Thread-group dimensions are published as module-level constants.
  Module *M = F->getParent();
  uint32_t X = 0, Y = 0, Z = 0;
  if (GlobalVariable *pX = M->getGlobalVariable("ThreadGroupSize_X")) {
    X = int_cast<unsigned>(cast<ConstantInt>(pX->getInitializer())->getZExtValue());
  }
  if (GlobalVariable *pY = M->getGlobalVariable("ThreadGroupSize_Y")) {
    Y = int_cast<unsigned>(cast<ConstantInt>(pY->getInitializer())->getZExtValue());
  }
  if (GlobalVariable *pZ = M->getGlobalVariable("ThreadGroupSize_Z")) {
    Z = int_cast<unsigned>(cast<ConstantInt>(pZ->getInitializer())->getZExtValue());
  }

  // sanity
  if (X == 0 || Y == 0 || Z == 0) {
    return;
  }

  // A dimension of extent 1 is trivially uniform.
  if (X == 1) {
    m_localIDxUniform = true;
  }
  if (Y == 1) {
    m_localIDyUniform = true;
  }
  if (Z == 1) {
    m_localIDzUniform = true;
  }

  // NOTE(review): assumes SIMD32 dispatch when checking divisibility below
  // — TODO confirm this matches the actual dispatch width.
  constexpr uint32_t simdSize = 32;
  if (idLayout == ThreadIDLayout::X) {
    auto setUniform = [this](int dim) {
      switch (dim) {
      case 0:
        m_localIDxUniform = true;
        break;
      case 1:
        m_localIDyUniform = true;
        break;
      case 2:
        m_localIDzUniform = true;
        break;
      default:
        IGC_ASSERT_UNREACHABLE();
      }
    };

    // Map the walk order onto dimension indices: `first` varies fastest.
    const uint32_t dims[3] = {X, Y, Z};
    struct {
      char first;
      char second;
      char third;
    } orders;

    switch (walkOrder) {
    case CS_WALK_ORDER::WO_XYZ:
      orders = {0, 1, 2};
      break;
    case CS_WALK_ORDER::WO_XZY:
      orders = {0, 2, 1};
      break;
    case CS_WALK_ORDER::WO_YXZ:
      orders = {1, 0, 2};
      break;
    case CS_WALK_ORDER::WO_ZXY:
      orders = {2, 0, 1};
      break;
    case CS_WALK_ORDER::WO_YZX:
      orders = {1, 2, 0};
      break;
    case CS_WALK_ORDER::WO_ZYX:
      orders = {2, 1, 0};
      break;
    default:
      return;
    }

    if ((dims[orders.first] % simdSize) == 0) {
      // first dim is multiple of simd size
      setUniform(orders.second);
      setUniform(orders.third);
    } else if (((dims[orders.first] * dims[orders.second]) % simdSize) == 0) {
      // combination of first and second dims are multiple of simd size
      setUniform(orders.third);
    }
    return;
  }
}
 | |
| 
 | |
// Set IsLxUniform/IsLyUniform/IsLxUniform to true if they are uniform;
// do nothing otherwise.
// For compute shaders this delegates to CS_checkLocalIDs (which sets the
// member flags instead of the out-params); for OpenCL kernels it derives
// uniformity from the work-group walk order, SIMD size and thread-group
// size metadata.
void WIAnalysisRunner::checkLocalIdUniform(Function *F, bool &IsLxUniform, bool &IsLyUniform, bool &IsLzUniform) {
  if (m_CGCtx->type == ShaderType::COMPUTE_SHADER) {
    CS_checkLocalIDs(F);
    return;
  }
  if (m_CGCtx->type != ShaderType::OPENCL_SHADER) {
    return;
  }

  FunctionInfoMetaDataHandle funcInfoMD = m_pMdUtils->getFunctionsInfoItem(F);
  ModuleMetaData *modMD = m_CGCtx->getModuleMetaData();
  auto funcMD = modMD->FuncMD.find(F);

  int32_t WO_0 = -1, WO_1 = -1, WO_2 = -1;
  if (funcMD == modMD->FuncMD.end()) {
    return;
  }

  // All-zero walk order means "not specified" — nothing can be concluded.
  WorkGroupWalkOrderMD workGroupWalkOrder = funcMD->second.workGroupWalkOrder;
  if (!workGroupWalkOrder.dim0 && !workGroupWalkOrder.dim1 && !workGroupWalkOrder.dim2) {
    return;
  }

  WO_0 = workGroupWalkOrder.dim0;
  WO_1 = workGroupWalkOrder.dim1;
  WO_2 = workGroupWalkOrder.dim2;

  // We expect that the work group walk order is always fixed.
  IGC_ASSERT(WO_0 == 0 && WO_1 == 1 && WO_2 == 2);

  // Use the required subgroup size when present; otherwise conservatively
  // assume the widest SIMD (32).
  uint32_t simdSize = 0;
  SubGroupSizeMetaDataHandle subGroupSize = funcInfoMD->getSubGroupSize();
  if (subGroupSize->hasValue()) {
    simdSize = (uint32_t)subGroupSize->getSIMDSize();
  }
  simdSize = simdSize >= 8 ? simdSize : 32;

  // -1 means the thread-group size is not known at compile time.
  int32_t X = -1, Y = -1, Z = -1;
  ThreadGroupSizeMetaDataHandle threadGroupSize = funcInfoMD->getThreadGroupSize();
  if (threadGroupSize->hasValue()) {
    X = (int32_t)threadGroupSize->getXDim();
    Y = (int32_t)threadGroupSize->getYDim();
    Z = (int32_t)threadGroupSize->getZDim();
  }

  // If the fastest-walking dimension is a multiple of the SIMD size, every
  // lane in a thread shares the other two local IDs.
  if (WO_0 == 0 && ((X / simdSize) * simdSize) == X) {
    // each thread will have Y and Z unchanged.
    IsLyUniform = true;
    IsLzUniform = true;
  } else if (WO_0 == 1 && ((Y / simdSize) * simdSize) == Y) {
    // each thread will have X and Z unchanged.
    IsLxUniform = true;
    IsLzUniform = true;
  } else if (WO_0 == 2 && ((Z / simdSize) * simdSize) == Z) {
    // each thread will have X and Y unchanged.
    IsLxUniform = true;
    IsLyUniform = true;
  }

  // A dimension of extent 1 is trivially uniform.
  if (X == 1) {
    IsLxUniform = true;
  }
  if (Y == 1) {
    IsLyUniform = true;
  }
  if (Z == 1) {
    IsLzUniform = true;
  }

  // linear order dispatch
  uint32_t XxY = X * Y;
  if (X > 0 && (X % simdSize) == 0) {
    // X is multiple of simdSize
    IsLyUniform = true;
    IsLzUniform = true;
  } else if (X > 0 && Y > 0 && (XxY % simdSize) == 0) {
    // X*Y is multiple of simdSize
    IsLzUniform = true;
  }
}
 | |
| 
 | |
| BranchInfo::BranchInfo(const IGCLLVM::TerminatorInst *inst, const BasicBlock *ipd) : cbr(inst), full_join(ipd) {
 | |
|   auto *fork_blk = const_cast<BasicBlock *>(inst->getParent());
 | |
|   IGC_ASSERT_MESSAGE(cbr == fork_blk->getTerminator(), "block terminator mismatch");
 | |
| 
 | |
|   llvm::SmallVector<BasicBlock *, 4> WorkSet;
 | |
|   if (cbr->getNumSuccessors() != 2) {
 | |
|     llvm::DenseSet<BasicBlock *> Reached;
 | |
|     llvm::DenseSet<BasicBlock *> Visited;
 | |
|     for (auto *Succ : successors(fork_blk)) {
 | |
|       if (Succ == full_join)
 | |
|         continue;
 | |
|       Visited.clear();
 | |
|       WorkSet.push_back(Succ);
 | |
|       while (!WorkSet.empty()) {
 | |
|         BasicBlock *BB = WorkSet.pop_back_val();
 | |
|         Visited.insert(BB);
 | |
|         influence_region.insert(const_cast<BasicBlock *>(BB));
 | |
|         if (Reached.count(BB))
 | |
|           partial_joins.insert(const_cast<BasicBlock *>(BB));
 | |
|         for (auto *SBB : successors(BB)) {
 | |
|           if (SBB != full_join && !Visited.count(SBB))
 | |
|             WorkSet.push_back(SBB);
 | |
|         }
 | |
|       }
 | |
|       // Merge Visited into Reached.
 | |
|       Reached.insert(Visited.begin(), Visited.end());
 | |
|     }
 | |
|     return;
 | |
|   }
 | |
|   llvm::DenseSet<BasicBlock *> f_set, t_set;
 | |
|   if (cbr->getSuccessor(0) != full_join) {
 | |
|     WorkSet.push_back(cbr->getSuccessor(0));
 | |
|     while (!WorkSet.empty()) {
 | |
|       BasicBlock *cur_blk = WorkSet.pop_back_val();
 | |
|       f_set.insert(cur_blk);
 | |
|       influence_region.insert(cur_blk);
 | |
|       for (auto *succ_blk : successors(cur_blk)) {
 | |
|         if (succ_blk != full_join && !f_set.count(succ_blk)) {
 | |
|           WorkSet.push_back(succ_blk);
 | |
|         }
 | |
|       }
 | |
|     }
 | |
|   }
 | |
|   if (cbr->getSuccessor(1) != full_join) {
 | |
|     WorkSet.push_back(cbr->getSuccessor(1));
 | |
|     while (!WorkSet.empty()) {
 | |
|       BasicBlock *cur_blk = WorkSet.pop_back_val();
 | |
|       t_set.insert(cur_blk);
 | |
|       influence_region.insert(cur_blk);
 | |
|       if (f_set.count(cur_blk)) {
 | |
|         partial_joins.insert(cur_blk);
 | |
|       }
 | |
|       for (auto *succ_blk : successors(cur_blk)) {
 | |
|         if (succ_blk != full_join && !t_set.count(succ_blk)) {
 | |
|           WorkSet.push_back(succ_blk);
 | |
|         }
 | |
|       }
 | |
|     }
 | |
|   }
 | |
| }
 | |
| 
 | |
| void BranchInfo::print(raw_ostream &OS) const {
 | |
|   OS << "\nCBR: " << *cbr;
 | |
|   OS << "\nIPD: ";
 | |
|   if (full_join) {
 | |
|     full_join->print(IGC::Debug::ods());
 | |
|   }
 | |
|   OS << "\nPartial Joins:";
 | |
|   SmallPtrSet<BasicBlock *, 4>::iterator join_it = partial_joins.begin();
 | |
|   SmallPtrSet<BasicBlock *, 4>::iterator join_e = partial_joins.end();
 | |
|   for (; join_it != join_e; ++join_it) {
 | |
|     BasicBlock *cur_blk = *join_it;
 | |
|     OS << "\n    ";
 | |
|     cur_blk->print(IGC::Debug::ods());
 | |
|   }
 | |
|   OS << "\nInfluence Region:";
 | |
|   DenseSet<BasicBlock *>::const_iterator blk_it = influence_region.begin();
 | |
|   DenseSet<BasicBlock *>::const_iterator blk_e = influence_region.end();
 | |
|   for (; blk_it != blk_e; ++blk_it) {
 | |
|     BasicBlock *cur_blk = *blk_it;
 | |
|     OS << "\n    ";
 | |
|     cur_blk->print(IGC::Debug::ods());
 | |
|   }
 | |
|   OS << "\n";
 | |
| }
 | |
| 
 | |
// C-linkage factory so the pass can be created through a plain function
// pointer (e.g. from dynamically loaded or non-C++ clients).
extern "C" {
void *createWIAnalysisPass() { return new WIAnalysis(); }
}
 | 
