mirror of
https://github.com/intel/intel-graphics-compiler.git
synced 2025-11-04 08:21:06 +08:00
Implement cross block load vectorization for inline raytracing
The new implementation does not yet have performance parity with the old one. One of the reasons is suboptimal loading from the rtstack. This change should coalesce loads for trivial rayquery usages.
This commit is contained in:
@ -522,6 +522,7 @@ void InlineRaytracing::LowerIntrinsics(Function &F) {
|
||||
RQI->replaceAllUsesWith(getPackedData(IRB, rqObject).CandidateType);
|
||||
break;
|
||||
case GenISAIntrinsic::GenISA_TraceRayInlineRayInfo: {
|
||||
|
||||
auto *I = cast<RayQueryInfoIntrinsic>(RQI);
|
||||
auto data = getPackedData(IRB, rqObject);
|
||||
auto *loadCommittedFromPotential = IRB.CreateICmpEQ(data.CommittedDataLocation, IRB.getInt32(PotentialHit),
|
||||
@ -532,7 +533,7 @@ void InlineRaytracing::LowerIntrinsics(Function &F) {
|
||||
|
||||
switch (I->getInfoKind()) {
|
||||
default:
|
||||
I->replaceAllUsesWith(IRB.lowerRayInfo(getStackPtr(IRB, rqObject), I, shaderTy, std::nullopt));
|
||||
I->replaceAllUsesWith(IRB.lowerRayInfo(getStackPtr(IRB, rqObject, true), I, shaderTy, std::nullopt));
|
||||
break;
|
||||
// leave this in for now, until we prove we don't need the hack anymore
|
||||
case GEOMETRY_INDEX: {
|
||||
@ -541,9 +542,9 @@ void InlineRaytracing::LowerIntrinsics(Function &F) {
|
||||
specialPattern = forceShortCurcuitingOR_CommittedGeomIdx(IRB, I);
|
||||
}
|
||||
|
||||
Value *leafType = IRB.getLeafType(getStackPtr(IRB, rqObject), IRB.getInt1(I->isCommitted()));
|
||||
Value *leafType = IRB.getLeafType(getStackPtr(IRB, rqObject, true), IRB.getInt1(I->isCommitted()));
|
||||
Value *geoIndex = IRB.getGeometryIndex(
|
||||
getStackPtr(IRB, rqObject), I, leafType,
|
||||
getStackPtr(IRB, rqObject, true), I, leafType,
|
||||
IRB.getInt32(I->isCommitted() ? CallableShaderTypeMD::ClosestHit : CallableShaderTypeMD::AnyHit),
|
||||
!specialPattern);
|
||||
IGC_ASSERT_MESSAGE(I->getType()->isIntegerTy(), "Invalid geometryIndex type!");
|
||||
|
||||
@ -38,6 +38,7 @@ private:
|
||||
CodeGenContext *m_pCGCtx = nullptr;
|
||||
llvm::StructType *m_RQObjectType = nullptr;
|
||||
uint32_t m_numSlotsUsed = 0;
|
||||
llvm::DenseMap<std::pair<llvm::BasicBlock *, llvm::Value *>, llvm::AllocaInst *> m_CrossBlockVectorizationStacks;
|
||||
|
||||
void LowerIntrinsics(llvm::Function &F);
|
||||
bool LowerAllocations(llvm::Function &F);
|
||||
@ -142,7 +143,37 @@ private:
|
||||
IRB.CreateStore(packedData, getAtIndexFromRayQueryObject(IRB, rqObject, 1));
|
||||
}
|
||||
|
||||
llvm::RTBuilder::SyncStackPointerVal *getStackPtr(llvm::RTBuilder &IRB, llvm::Value *rqObject) {
|
||||
/// Returns the value through which the RTStack of `rqObject` should be accessed at the
/// builder's current insert position.
///
/// Default path: emits a call to m_Functions[GET_STACK_POINTER_FROM_RQ_OBJECT] on
/// `rqObject` and returns its result.
///
/// Cross-block path: taken only when `allowXBlockVectorize` is true, the
/// UseCrossBlockLoadVectorizationForInlineRaytracing regkey is enabled, and the current
/// insert block contains no ContinuationHLIntrinsic. In that case the function returns an
/// alloca of type IRB.getRTStack2Ty() that is filled by a memcpy from the real stack
/// pointer. One such alloca is created per (insert block, rqObject) pair and cached in
/// m_CrossBlockVectorizationStacks, so repeated requests from the same block reuse it.
///
/// \param IRB                  Builder used to emit IR; its insert point is unchanged on return.
/// \param rqObject             The rayquery object whose stack is requested.
/// \param allowXBlockVectorize Opt-in for the cached-copy path described above.
/// \return The stack pointer call result, or the cached alloca, as a SyncStackPointerVal.
///
/// NOTE(review): the memcpy is emitted at the first non-PHI instruction of the block, so the
/// copy is a snapshot taken at block entry. This presumably relies on nothing in the block
/// writing to the stack before the lowered reads, and on `rqObject` being available at that
/// position - confirm against the callers.
llvm::RTBuilder::SyncStackPointerVal *getStackPtr(llvm::RTBuilder &IRB, llvm::Value *rqObject,
                                                  bool allowXBlockVectorize = false) {

  bool doXBlockVectorize =
      allowXBlockVectorize && IGC_IS_FLAG_ENABLED(UseCrossBlockLoadVectorizationForInlineRaytracing);

  // scan the basic block for continuation intrinsics. we don't want to contribute to raytracing swstack
  if (doXBlockVectorize) {
    for (auto &I : *IRB.GetInsertBlock()) {
      if (llvm::isa<llvm::ContinuationHLIntrinsic>(&I)) {
        doXBlockVectorize = false;
        break; // a single continuation intrinsic is enough to disable the optimization
      }
    }
  }

  if (doXBlockVectorize) {
    auto key = std::make_pair(IRB.GetInsertBlock(), rqObject);

    // lookup() yields nullptr when the key is absent, so one probe answers both
    // "is it cached?" and "what is the cached value?".
    llvm::AllocaInst *SMStack = m_CrossBlockVectorizationStacks.lookup(key);
    if (!SMStack) {
      // Restores the caller's insert point when this scope ends.
      llvm::RTBuilder::InsertPointGuard g(IRB);

      // The alloca goes to the function's entry block.
      IRB.SetInsertPoint(key.first->getParent()->getEntryBlock().getFirstNonPHI());
      SMStack = IRB.CreateAlloca(IRB.getRTStack2Ty(), nullptr, VALUE_NAME("CrossBlockLoadSMStackForBlock"));

      // The copy goes to the top of the requesting block. The source is obtained through the
      // default (non-vectorized) path, so this nested call never touches the cache.
      IRB.SetInsertPoint(key.first->getFirstNonPHI());
      IRB.CreateMemCpy(SMStack, getStackPtr(IRB, rqObject), IRB.getSyncRTStackSize(),
                       RayDispatchGlobalData::StackChunkSize);

      m_CrossBlockVectorizationStacks[key] = SMStack;
    }

    return static_cast<llvm::RTBuilder::SyncStackPointerVal *>(llvm::cast<llvm::Value>(SMStack));
  }

  return static_cast<llvm::RTBuilder::SyncStackPointerVal *>(
      llvm::cast<llvm::Value>(IRB.CreateCall(m_Functions[GET_STACK_POINTER_FROM_RQ_OBJECT], rqObject)));
}
|
||||
|
||||
@ -1837,6 +1837,8 @@ DECLARE_IGC_REGKEY_BITMASK(UseNewInlineRaytracing, 4, "Use the new rayquery impl
|
||||
NEW_INLINE_RAYTRACING_MASK, true)
|
||||
DECLARE_IGC_REGKEY(DWORD, AddDummySlotsForNewInlineRaytracing, 0,
|
||||
"Add dummy rayquery slots when doing new inline raytracing", true)
|
||||
DECLARE_IGC_REGKEY(bool, UseCrossBlockLoadVectorizationForInlineRaytracing, false,
|
||||
"If enabled, will try to vectorize loads that are not adjacent to each other. May increase GRF pressure", true)
|
||||
DECLARE_IGC_REGKEY(bool, OverrideRayQueryThrottling, false,
|
||||
"Force rayquery throttling (dynamic ray management) to be enabled or disabled. Default value of "
|
||||
"this key is ignored",
|
||||
|
||||
Reference in New Issue
Block a user