Fix i8/opaque pointer byte offset GEP scalarization in PrivateMemoryResolution

When LLVM IR uses opaque pointers or inserts a bitcast to i8*, a subsequent GEP is expressed in bytes. The legacy handleGEPInst always scalarized indices by starting from pGEP->getSourceElementType(). After the i8* cast, that type is i8, so the algorithm mistakenly treated the byte index as a count of elements, producing a mis-scaled (too large) scalarized index.

Example:

  %a = alloca [16 x [16 x float]], align 4
  %b = bitcast [16 x [16 x float]]* %a to i8*
  %c = getelementptr inbounds i8, i8* %b, i64 64

Here, 64 is a byte offset into the original aggregate. The old implementation, seeing i8, scaled the index as if it were 64 elements rather than 64 bytes. Yet the meaningful base of the GEP is the alloca's aggregate type [16 x [16 x float]], and the element calculations should be based on this type.

This change:
1. Introduces getFirstNonScalarSourceElementType(GEP), which walks back from the GEP base through pointer casts to find a root aggregate element type.
2. Adds additional handling in handleGEPInst, so that an i8 GEP byte offset is converted to an element index of the underlying base type.

This way the algorithm avoids basing element index scalarization on an incidental i8* and keeps the index calculation aligned with the underlying allocation layout.

For reference, in typed pointer mode (or without the bitcast), the GEP would look like this:

  %a = alloca [16 x [16 x float]], align 4
  %c = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* %a, i64 0, i64 1

Here, %c is the pointer to the 2nd inner array, of type [16 x float]*.
2025-10-30 08:18:26 +08:00 · 2025-08-12 05:09:43 +00:00
parent bdd9b15ad7
commit e8906d0679
3 changed files with 76 additions and 4 deletions
--- a/IGC/Compiler/CISACodeGen/LowerGEPForPrivMem.cpp
+++ b/IGC/Compiler/CISACodeGen/LowerGEPForPrivMem.cpp
@ -1,6 +1,6 @@
 /*========================== begin_copyright_notice ============================

-Copyright (C) 2017-2024 Intel Corporation
+Copyright (C) 2017-2025 Intel Corporation

 SPDX-License-Identifier: MIT

@ -824,6 +824,25 @@ std::pair<unsigned int, Type *> TransposeHelper::getArrSizeAndEltType(Type *T) {
  return std::make_pair(arr_sz, retTy);
 }

+Type *TransposeHelper::getFirstNonScalarSourceElementType(const GetElementPtrInst &GEP) { // Picks the type to index by
+  Type *currTy = GEP.getSourceElementType();
+  if (getArrSizeAndEltType(currTy).first > 1) // the GEP's own source type already reports a size above 1: use it
+    return currTy;
+
+  const Value *base = GEP.getPointerOperand()->stripPointerCasts(); // look through pointer casts on the GEP base
+
+  if (const auto *AI = dyn_cast<AllocaInst>(base))
+    return AI->getAllocatedType();
+  if (const auto *GV = dyn_cast<GlobalVariable>(base))
+    return GV->getValueType();
+  if (const auto *LI = dyn_cast<LoadInst>(base))
+    return LI->getType(); // NOTE(review): this is the loaded pointer's own type, not a pointee type - confirm intent
+  if (const auto *SI = dyn_cast<StoreInst>(base))
+    return SI->getValueOperand()->getType(); // NOTE(review): a store yields no value; branch looks unreachable
+
+  return currTy; // base is none of the handled kinds: fall back to the GEP's source element type
+}
+
 void TransposeHelper::handleGEPInst(llvm::GetElementPtrInst *pGEP, llvm::Value *idx) {
  // TODO: Add support for GEP attributes: nsw, nuw, inbounds. Currently, neigher the old nor the new algorithm handles
  // them.
@ -841,13 +860,38 @@ void TransposeHelper::handleGEPInst(llvm::GetElementPtrInst *pGEP, llvm::Value *
    return;
  }

+  IRBuilder<> IRB(pGEP);
+  Value *pScalarizedIdx = IRB.getInt32(0);
+
+  // If the GEP is on i8, its index is a byte offset and must be converted to an element index of the underlying base
+  // type.
+  if (pGEP->getSourceElementType()->isIntegerTy(8)) {
+    // Get the non-scalar/aggregate GEP source element type.
+    Type *baseAggregateTy = getFirstNonScalarSourceElementType(*pGEP);
+    // Find the scalar element type at the bottom of the aggregate.
+    Type *elementTy = baseAggregateTy;
+    while (elementTy->isStructTy() || elementTy->isArrayTy() || elementTy->isVectorTy()) {
+      elementTy = getArrSizeAndEltType(elementTy).second;
+    }
+    elementTy = elementTy->getScalarType();
+    uint32_t elementBytes = (uint32_t)m_DL.getTypeAllocSize(elementTy);
+
+    // The 1st operand is the byte offset, convert bytes to element count.
+    Value *byteIndex = IRB.CreateZExtOrTrunc(pGEP->getOperand(1), IRB.getInt32Ty());
+    if (elementBytes > 1)
+      byteIndex = IRB.CreateUDiv(byteIndex, IRB.getInt32(elementBytes));
+
+    pScalarizedIdx = IRB.CreateAdd(pScalarizedIdx, byteIndex);
+    pScalarizedIdx = IRB.CreateAdd(pScalarizedIdx, idx);
+    HandleAllocaSources(pGEP, pScalarizedIdx);
+    return;
+  }
+
  // Given %p = getelementptr [4 x [3 x <2 x float>]]* %v, i64 0, i64 %1, i64 %2
  // compute the scalarized index with an auxiliary array [4, 3, 2]:
  //
  // Formula: index = (%1 x 3 + %2) x 2
  //
-  IRBuilder<> IRB(pGEP);
-  Value *pScalarizedIdx = IRB.getInt32(0);
  Type *T = pGEP->getSourceElementType();
  for (unsigned i = 0, e = pGEP->getNumIndices(); i < e; ++i) {
    // If T is VectorType we should be at the last loop iteration. This will break things only if m_vectorIndex == true.
--- a/IGC/Compiler/CISACodeGen/LowerGEPForPrivMem.hpp
+++ b/IGC/Compiler/CISACodeGen/LowerGEPForPrivMem.hpp
@ -1,6 +1,6 @@
 /*========================== begin_copyright_notice ============================

-Copyright (C) 2017-2024 Intel Corporation
+Copyright (C) 2017-2025 Intel Corporation

 SPDX-License-Identifier: MIT

@ -155,5 +155,6 @@ protected:
 private:
  bool m_vectorIndex;
  std::pair<unsigned int, llvm::Type *> getArrSizeAndEltType(llvm::Type *T);
+  llvm::Type *getFirstNonScalarSourceElementType(const llvm::GetElementPtrInst &GEP);
 };
 } // namespace IGC
--- a/IGC/Compiler/tests/PrivateMemoryResolution/i8_gep_byte_offset.ll
+++ b/IGC/Compiler/tests/PrivateMemoryResolution/i8_gep_byte_offset.ll
@ -0,0 +1,27 @@
+;=========================== begin_copyright_notice ============================
+;
+; Copyright (C) 2025 Intel Corporation
+;
+; SPDX-License-Identifier: MIT
+;
+;============================ end_copyright_notice =============================
+
+; RUN: igc_opt --opaque-pointers --igc-private-mem-resolution --platformlnl -S %s | FileCheck %s
+
+; This test ensures GEP scalarization on i8*/opaque ptr offsets treats the index as bytes and converts to element index via recovered base type size.
+
+; CHECK-NOT: mul i32 64
+; CHECK: mul i32 16
+
+define spir_kernel void @test() {
+  %a = alloca [16 x [16 x float]], align 4 ; aggregate whose innermost element type is float
+  %b = getelementptr inbounds i8, ptr %a, i64 64 ; i8 GEP: the index 64 is a byte offset into %a
+  %c = getelementptr <8 x i32>, ptr %b, i32 0 ; zero-index GEP that re-types the address as <8 x i32>
+  %d = load <8 x i32>, ptr %c, align 4 ; load through both GEPs; its result is otherwise unused
+  ret void
+}
+
+!igc.functions = !{!1}
+!1 = !{ptr @test, !2}
+!2 = !{!3}
+!3 = !{!"function_type", i32 0}