Implement cross block load vectorization for inline raytracing

We do not yet have performance parity with the old implementation. One of the reasons is suboptimal loading from the RTStack.
This change should coalesce loads for trivial rayquery usages.
This commit is contained in:
Jakub Jakacki
2025-08-28 16:41:00 +00:00
committed by igcbot
parent 3275c8a2a4
commit 74c3bcce15
3 changed files with 38 additions and 4 deletions

View File

@ -522,6 +522,7 @@ void InlineRaytracing::LowerIntrinsics(Function &F) {
RQI->replaceAllUsesWith(getPackedData(IRB, rqObject).CandidateType);
break;
case GenISAIntrinsic::GenISA_TraceRayInlineRayInfo: {
auto *I = cast<RayQueryInfoIntrinsic>(RQI);
auto data = getPackedData(IRB, rqObject);
auto *loadCommittedFromPotential = IRB.CreateICmpEQ(data.CommittedDataLocation, IRB.getInt32(PotentialHit),
@ -532,7 +533,7 @@ void InlineRaytracing::LowerIntrinsics(Function &F) {
switch (I->getInfoKind()) {
default:
I->replaceAllUsesWith(IRB.lowerRayInfo(getStackPtr(IRB, rqObject), I, shaderTy, std::nullopt));
I->replaceAllUsesWith(IRB.lowerRayInfo(getStackPtr(IRB, rqObject, true), I, shaderTy, std::nullopt));
break;
// leave this in for now, until we prove we don't need the hack anymore
case GEOMETRY_INDEX: {
@ -541,9 +542,9 @@ void InlineRaytracing::LowerIntrinsics(Function &F) {
specialPattern = forceShortCurcuitingOR_CommittedGeomIdx(IRB, I);
}
Value *leafType = IRB.getLeafType(getStackPtr(IRB, rqObject), IRB.getInt1(I->isCommitted()));
Value *leafType = IRB.getLeafType(getStackPtr(IRB, rqObject, true), IRB.getInt1(I->isCommitted()));
Value *geoIndex = IRB.getGeometryIndex(
getStackPtr(IRB, rqObject), I, leafType,
getStackPtr(IRB, rqObject, true), I, leafType,
IRB.getInt32(I->isCommitted() ? CallableShaderTypeMD::ClosestHit : CallableShaderTypeMD::AnyHit),
!specialPattern);
IGC_ASSERT_MESSAGE(I->getType()->isIntegerTy(), "Invalid geometryIndex type!");

View File

@ -38,6 +38,7 @@ private:
CodeGenContext *m_pCGCtx = nullptr;
llvm::StructType *m_RQObjectType = nullptr;
uint32_t m_numSlotsUsed = 0;
llvm::DenseMap<std::pair<llvm::BasicBlock *, llvm::Value *>, llvm::AllocaInst *> m_CrossBlockVectorizationStacks;
void LowerIntrinsics(llvm::Function &F);
bool LowerAllocations(llvm::Function &F);
@ -142,7 +143,37 @@ private:
IRB.CreateStore(packedData, getAtIndexFromRayQueryObject(IRB, rqObject, 1));
}
llvm::RTBuilder::SyncStackPointerVal *getStackPtr(llvm::RTBuilder &IRB, llvm::Value *rqObject) {
// Returns the stack pointer value to use for `rqObject` at IRB's current insert point.
//
// Default path: emits a call to m_Functions[GET_STACK_POINTER_FROM_RQ_OBJECT] on `rqObject`
// and returns its result.
//
// Cross-block vectorization path: taken only when the caller passes
// `allowXBlockVectorize == true`, the UseCrossBlockLoadVectorizationForInlineRaytracing flag
// is enabled, and the current insert block contains no ContinuationHLIntrinsic. In that case
// the returned pointer is an alloca of RTStack2 type that is filled by a memcpy from the real
// stack pointer. One such alloca is created per (basic block, rqObject) pair and cached in
// m_CrossBlockVectorizationStacks, so repeated requests from the same block reuse it.
//
// NOTE(review): the memcpy is emitted at the first non-PHI instruction of the block, so the
// local copy presumably reflects the stack contents at block entry only - confirm that no
// caller relies on observing writes made to the stack later in the same block.
// NOTE(review): the cache is keyed by raw BasicBlock/Value pointers; verify it is reset
// wherever those pointers can become stale.
llvm::RTBuilder::SyncStackPointerVal *getStackPtr(llvm::RTBuilder &IRB, llvm::Value *rqObject,
                                                  bool allowXBlockVectorize = false) {
  bool doXBlockVectorize =
      allowXBlockVectorize && IGC_IS_FLAG_ENABLED(UseCrossBlockLoadVectorizationForInlineRaytracing);

  // Scan the basic block for continuation intrinsics: we don't want to contribute to the
  // raytracing SW stack. One hit is enough to decide, so stop at the first match.
  if (doXBlockVectorize) {
    for (auto &I : *IRB.GetInsertBlock()) {
      if (llvm::isa<llvm::ContinuationHLIntrinsic>(&I)) {
        doXBlockVectorize = false;
        break;
      }
    }
  }

  if (doXBlockVectorize) {
    auto key = std::make_pair(IRB.GetInsertBlock(), rqObject);

    // Single lookup on the hot (cache hit) path; the map only ever stores non-null allocas.
    auto cached = m_CrossBlockVectorizationStacks.find(key);
    llvm::AllocaInst *SMStack = (cached != m_CrossBlockVectorizationStacks.end()) ? cached->second : nullptr;

    if (!SMStack) {
      // The guard restores the caller's insert point once the local copy has been set up.
      llvm::RTBuilder::InsertPointGuard g(IRB);

      // The alloca goes to the first non-PHI instruction of the function's entry block.
      IRB.SetInsertPoint(key.first->getParent()->getEntryBlock().getFirstNonPHI());
      SMStack = IRB.CreateAlloca(IRB.getRTStack2Ty(), nullptr, VALUE_NAME("CrossBlockLoadSMStackForBlock"));

      // The copy from the real stack pointer (obtained through the default, non-vectorized
      // path) is emitted at the first non-PHI instruction of the requesting block.
      IRB.SetInsertPoint(key.first->getFirstNonPHI());
      IRB.CreateMemCpy(SMStack, getStackPtr(IRB, rqObject), IRB.getSyncRTStackSize(),
                       RayDispatchGlobalData::StackChunkSize);

      m_CrossBlockVectorizationStacks[key] = SMStack;
    }

    return static_cast<llvm::RTBuilder::SyncStackPointerVal *>(llvm::cast<llvm::Value>(SMStack));
  }

  return static_cast<llvm::RTBuilder::SyncStackPointerVal *>(
      llvm::cast<llvm::Value>(IRB.CreateCall(m_Functions[GET_STACK_POINTER_FROM_RQ_OBJECT], rqObject)));
}

View File

@ -1837,6 +1837,8 @@ DECLARE_IGC_REGKEY_BITMASK(UseNewInlineRaytracing, 4, "Use the new rayquery impl
NEW_INLINE_RAYTRACING_MASK, true)
DECLARE_IGC_REGKEY(DWORD, AddDummySlotsForNewInlineRaytracing, 0,
"Add dummy rayquery slots when doing new inline raytracing", true)
DECLARE_IGC_REGKEY(bool, UseCrossBlockLoadVectorizationForInlineRaytracing, false,
"If enabled, will try to vectorize loads that are not adjacent to each other. May increase GRF pressure", true)
DECLARE_IGC_REGKEY(bool, OverrideRayQueryThrottling, false,
"Force rayquery throttling (dynamic ray management) to be enabled or disabled. Default value of "
"this key is ignored",