[flang] Added alternative inlining code for hlfir.cshift. (#129176)

Flang generates slower code for `CSHIFT(CSHIFT(PTR(:,:,I),sh1,1),sh2,2)` pattern in facerec than other compilers. The first CSHIFT can be done as two memcpy's wrapped in a loop for the second dimension. This does require creating a temporary array, but it seems to be faster, than the current hlfir.elemental inlining. I started with modifying the new index computation in hlfir.elemental inlining: the new arith.select approach does enable some vectorization in LLVM, but on x86 it is using gathers/scatters and does not give much speed-up. I also experimented with LoopBoundSplitPass and InductiveRangeCheckElimination for a simple (not chained) CSHIFT case, but I could not adjust them to split the loop with a condition on the value of the IV into two loops with disjoint iteration spaces. I thought if I could do it, I would be able to keep the hlfir.elemental inlining mostly untouched, and then adjust the hlfir.elemental inlining heuristics for the facerec case. Since I was not able to make these pass work for me, I added a special case inlining for CSHIFT(ARRAY,SH,DIM=1) via hlfir.eval_in_mem. If ARRAY is not statically known to have the contiguous leading dimension, there is a dynamic check for contiguity, which allows exposing it to LLVM and enabling the rewrite of the copy loops into memcpys. This approach is stepping on the toes of LoopVersioning, but it is helpful in facerec case. I measured ~6% speed-up on grace, and ~4% on zen4.
2026-02-08 17:28:30 +08:00 · 2025-03-03 09:58:20 -08:00
parent d2c4d1ec48
commit a704e6587b
4 changed files with 657 additions and 226 deletions
--- a/flang/include/flang/Optimizer/Builder/HLFIRTools.h
+++ b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
@@ -517,6 +517,19 @@ Entity loadElementAt(mlir::Location loc, fir::FirOpBuilder &builder,
 llvm::SmallVector<mlir::Value, Fortran::common::maxRank>
 genExtentsVector(mlir::Location loc, fir::FirOpBuilder &builder, Entity entity);

+/// Generate an hlfir.designate that produces an 1D section
+/// of \p array using \p oneBasedIndices and \p dim:
+///   i = oneBasedIndices
+///   result => array(i(1), ..., i(dim-1), :, i(dim+1), ..., i(n))
+///
+/// The caller provides the pre-computed \p lbounds, \p extents
+/// and \p typeParams of the array.
+Entity gen1DSection(mlir::Location loc, fir::FirOpBuilder &builder,
+                    Entity array, int64_t dim,
+                    mlir::ArrayRef<mlir::Value> lbounds,
+                    mlir::ArrayRef<mlir::Value> extents,
+                    mlir::ValueRange oneBasedIndices,
+                    mlir::ArrayRef<mlir::Value> typeParams);
 } // namespace hlfir

 #endif // FORTRAN_OPTIMIZER_BUILDER_HLFIRTOOLS_H
--- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp
+++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
@@ -1535,3 +1535,52 @@ hlfir::genExtentsVector(mlir::Location loc, fir::FirOpBuilder &builder,
    shape.getDefiningOp()->erase();
  return extents;
 }
+
+hlfir::Entity hlfir::gen1DSection(mlir::Location loc,
+                                  fir::FirOpBuilder &builder,
+                                  hlfir::Entity array, int64_t dim,
+                                  mlir::ArrayRef<mlir::Value> lbounds,
+                                  mlir::ArrayRef<mlir::Value> extents,
+                                  mlir::ValueRange oneBasedIndices,
+                                  mlir::ArrayRef<mlir::Value> typeParams) {
+  assert(array.isVariable() && "array must be a variable");
+  assert(dim > 0 && dim <= array.getRank() && "invalid dim number");
+  mlir::Value one =
+      builder.createIntegerConstant(loc, builder.getIndexType(), 1);
+  hlfir::DesignateOp::Subscripts subscripts;
+  unsigned indexId = 0;
+  for (int i = 0; i < array.getRank(); ++i) {
+    if (i == dim - 1) {
+      mlir::Value ubound = genUBound(loc, builder, lbounds[i], extents[i], one);
+      subscripts.emplace_back(
+          hlfir::DesignateOp::Triplet{lbounds[i], ubound, one});
+    } else {
+      mlir::Value index =
+          genUBound(loc, builder, lbounds[i], oneBasedIndices[indexId++], one);
+      subscripts.emplace_back(index);
+    }
+  }
+  mlir::Value sectionShape =
+      builder.create<fir::ShapeOp>(loc, extents[dim - 1]);
+
+  // The result type is one of:
+  //   !fir.box/class<!fir.array<NxT>>
+  //   !fir.box/class<!fir.array<?xT>>
+  //
+  // We could use !fir.ref<!fir.array<NxT>> when the whole dimension's
+  // size is known and it is the leading dimension, but let it be simple
+  // for the time being.
+  auto seqType =
+      mlir::cast<fir::SequenceType>(array.getElementOrSequenceType());
+  int64_t dimExtent = seqType.getShape()[dim - 1];
+  mlir::Type sectionType =
+      fir::SequenceType::get({dimExtent}, seqType.getEleTy());
+  sectionType = fir::wrapInClassOrBoxType(sectionType, array.isPolymorphic());
+
+  auto designate = builder.create<hlfir::DesignateOp>(
+      loc, sectionType, array, /*component=*/"", /*componentShape=*/nullptr,
+      subscripts,
+      /*substring=*/mlir::ValueRange{}, /*complexPartAttr=*/std::nullopt,
+      sectionShape, typeParams);
+  return hlfir::Entity{designate.getResult()};
+}
--- a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
@@ -415,15 +415,13 @@ private:
  }
 };

-class CShiftAsElementalConversion
-    : public mlir::OpRewritePattern<hlfir::CShiftOp> {
+class CShiftConversion : public mlir::OpRewritePattern<hlfir::CShiftOp> {
 public:
  using mlir::OpRewritePattern<hlfir::CShiftOp>::OpRewritePattern;

  llvm::LogicalResult
  matchAndRewrite(hlfir::CShiftOp cshift,
                  mlir::PatternRewriter &rewriter) const override {
-    using Fortran::common::maxRank;

    hlfir::ExprType expr = mlir::dyn_cast<hlfir::ExprType>(cshift.getType());
    assert(expr &&
@@ -445,31 +443,88 @@ public:
    if (dimVal <= 0 || dimVal > arrayRank)
      return rewriter.notifyMatchFailure(cshift, "Invalid DIM for CSHIFT");

+    // When DIM==1 and the contiguity of the input array is not statically
+    // known, try to exploit the fact that the leading dimension might be
+    // contiguous. We can do this now using hlfir.eval_in_mem with
+    // a dynamic check for the leading dimension contiguity.
+    // Otherwise, convert hlfir.cshift to hlfir.elemental.
+    //
+    // Note that the hlfir.elemental can be inlined into other hlfir.elemental,
+    // while hlfir.eval_in_mem prevents this, and we will end up creating
+    // a temporary array for the result. We may need to come up with
+    // a more sophisticated logic for picking the most efficient
+    // representation.
+    hlfir::Entity array = hlfir::Entity{cshift.getArray()};
+    mlir::Type elementType = array.getFortranElementType();
+    if (dimVal == 1 && fir::isa_trivial(elementType) &&
+        // genInMemCShift() only works for variables currently.
+        array.isVariable())
+      rewriter.replaceOp(cshift, genInMemCShift(rewriter, cshift, dimVal));
+    else
+      rewriter.replaceOp(cshift, genElementalCShift(rewriter, cshift, dimVal));
+    return mlir::success();
+  }
+
+private:
+  /// Generate MODULO(\p shiftVal, \p extent).
+  static mlir::Value normalizeShiftValue(mlir::Location loc,
+                                         fir::FirOpBuilder &builder,
+                                         mlir::Value shiftVal,
+                                         mlir::Value extent,
+                                         mlir::Type calcType) {
+    shiftVal = builder.createConvert(loc, calcType, shiftVal);
+    extent = builder.createConvert(loc, calcType, extent);
+    // Make sure that we do not divide by zero. When the dimension
+    // has zero size, turn the extent into 1. Note that the computed
+    // MODULO value won't be used in this case, so it does not matter
+    // which extent value we use.
+    mlir::Value zero = builder.createIntegerConstant(loc, calcType, 0);
+    mlir::Value one = builder.createIntegerConstant(loc, calcType, 1);
+    mlir::Value isZero = builder.create<mlir::arith::CmpIOp>(
+        loc, mlir::arith::CmpIPredicate::eq, extent, zero);
+    extent = builder.create<mlir::arith::SelectOp>(loc, isZero, one, extent);
+    shiftVal = fir::IntrinsicLibrary{builder, loc}.genModulo(
+        calcType, {shiftVal, extent});
+    return builder.createConvert(loc, calcType, shiftVal);
+  }
+
+  /// Convert \p cshift into an hlfir.elemental using
+  /// the pre-computed constant \p dimVal.
+  static mlir::Operation *genElementalCShift(mlir::PatternRewriter &rewriter,
+                                             hlfir::CShiftOp cshift,
+                                             int64_t dimVal) {
+    using Fortran::common::maxRank;
+    hlfir::Entity shift = hlfir::Entity{cshift.getShift()};
+    hlfir::Entity array = hlfir::Entity{cshift.getArray()};
+
    mlir::Location loc = cshift.getLoc();
    fir::FirOpBuilder builder{rewriter, cshift.getOperation()};
-    mlir::Type elementType = expr.getElementType();
-    hlfir::Entity array = hlfir::Entity{cshift.getArray()};
-    mlir::Value arrayShape = hlfir::genShape(loc, builder, array);
-    llvm::SmallVector<mlir::Value> arrayExtents =
-        hlfir::getExplicitExtentsFromShape(arrayShape, builder);
-    llvm::SmallVector<mlir::Value, 1> typeParams;
-    hlfir::genLengthParameters(loc, builder, array, typeParams);
-    hlfir::Entity shift = hlfir::Entity{cshift.getShift()};
    // The new index computation involves MODULO, which is not implemented
    // for IndexType, so use I64 instead.
    mlir::Type calcType = builder.getI64Type();
+    // All the indices arithmetic used below does not overflow
+    // signed and unsigned I64.
+    builder.setIntegerOverflowFlags(mlir::arith::IntegerOverflowFlags::nsw |
+                                    mlir::arith::IntegerOverflowFlags::nuw);

-    mlir::Value one = builder.createIntegerConstant(loc, calcType, 1);
+    mlir::Value arrayShape = hlfir::genShape(loc, builder, array);
+    llvm::SmallVector<mlir::Value, maxRank> arrayExtents =
+        hlfir::getExplicitExtentsFromShape(arrayShape, builder);
+    llvm::SmallVector<mlir::Value, 1> typeParams;
+    hlfir::genLengthParameters(loc, builder, array, typeParams);
+    mlir::Value shiftDimExtent =
+        builder.createConvert(loc, calcType, arrayExtents[dimVal - 1]);
    mlir::Value shiftVal;
    if (shift.isScalar()) {
      shiftVal = hlfir::loadTrivialScalar(loc, builder, shift);
-      shiftVal = builder.createConvert(loc, calcType, shiftVal);
+      shiftVal =
+          normalizeShiftValue(loc, builder, shiftVal, shiftDimExtent, calcType);
    }

    auto genKernel = [&](mlir::Location loc, fir::FirOpBuilder &builder,
                         mlir::ValueRange inputIndices) -> hlfir::Entity {
      llvm::SmallVector<mlir::Value, maxRank> indices{inputIndices};
-      if (!shift.isScalar()) {
+      if (!shiftVal) {
        // When the array is not a vector, section
        // (s(1), s(2), ..., s(dim-1), :, s(dim+1), ..., s(n)
        // of the result has a value equal to:
@@ -482,35 +537,281 @@ public:
        hlfir::Entity shiftElement =
            hlfir::getElementAt(loc, builder, shift, shiftIndices);
        shiftVal = hlfir::loadTrivialScalar(loc, builder, shiftElement);
-        shiftVal = builder.createConvert(loc, calcType, shiftVal);
+        shiftVal = normalizeShiftValue(loc, builder, shiftVal, shiftDimExtent,
+                                       calcType);
      }

      // Element i of the result (1-based) is element
-      // 'MODULO(i + SH - 1, SIZE(ARRAY)) + 1' (1-based) of the original
+      // 'MODULO(i + SH - 1, SIZE(ARRAY,DIM)) + 1' (1-based) of the original
      // ARRAY (or its section, when ARRAY is not a vector).
+
+      // Compute the index into the original array using the normalized
+      // shift value, which satisfies (SH >= 0 && SH < SIZE(ARRAY,DIM)):
+      //   newIndex =
+      //     i + ((i <= SIZE(ARRAY,DIM) - SH) ? SH : SH - SIZE(ARRAY,DIM))
+      //
+      // Such index computation allows for further loop vectorization
+      // in LLVM.
+      mlir::Value wrapBound =
+          builder.create<mlir::arith::SubIOp>(loc, shiftDimExtent, shiftVal);
+      mlir::Value adjustedShiftVal =
+          builder.create<mlir::arith::SubIOp>(loc, shiftVal, shiftDimExtent);
      mlir::Value index =
          builder.createConvert(loc, calcType, inputIndices[dimVal - 1]);
-      mlir::Value extent = arrayExtents[dimVal - 1];
+      mlir::Value wrapCheck = builder.create<mlir::arith::CmpIOp>(
+          loc, mlir::arith::CmpIPredicate::sle, index, wrapBound);
+      mlir::Value actualShift = builder.create<mlir::arith::SelectOp>(
+          loc, wrapCheck, shiftVal, adjustedShiftVal);
      mlir::Value newIndex =
-          builder.create<mlir::arith::AddIOp>(loc, index, shiftVal);
-      newIndex = builder.create<mlir::arith::SubIOp>(loc, newIndex, one);
-      newIndex = fir::IntrinsicLibrary{builder, loc}.genModulo(
-          calcType, {newIndex, builder.createConvert(loc, calcType, extent)});
-      newIndex = builder.create<mlir::arith::AddIOp>(loc, newIndex, one);
+          builder.create<mlir::arith::AddIOp>(loc, index, actualShift);
      newIndex = builder.createConvert(loc, builder.getIndexType(), newIndex);
-
      indices[dimVal - 1] = newIndex;
      hlfir::Entity element = hlfir::getElementAt(loc, builder, array, indices);
      return hlfir::loadTrivialScalar(loc, builder, element);
    };

+    mlir::Type elementType = array.getFortranElementType();
    hlfir::ElementalOp elementalOp = hlfir::genElementalOp(
        loc, builder, elementType, arrayShape, typeParams, genKernel,
        /*isUnordered=*/true,
        array.isPolymorphic() ? static_cast<mlir::Value>(array) : nullptr,
        cshift.getResult().getType());
-    rewriter.replaceOp(cshift, elementalOp);
-    return mlir::success();
+    return elementalOp.getOperation();
+  }
+
+  /// Convert \p cshift into an hlfir.eval_in_mem using the pre-computed
+  /// constant \p dimVal.
+  /// The converted code looks like this:
+  ///   do i=1,SH
+  ///     result(i + (SIZE(ARRAY,DIM) - SH)) = array(i)
+  ///   end
+  ///   do i=1,SIZE(ARRAY,DIM) - SH
+  ///     result(i) = array(i + SH)
+  ///   end
+  ///
+  /// When \p dimVal is 1, we generate the same code twice
+  /// under a dynamic check for the contiguity of the leading
+  /// dimension. In the code corresponding to the contiguous
+  /// leading dimension, the shift dimension is represented
+  /// as a contiguous slice of the original array.
+  /// This allows recognizing the above two loops as memcpy
+  /// loop idioms in LLVM.
+  static mlir::Operation *genInMemCShift(mlir::PatternRewriter &rewriter,
+                                         hlfir::CShiftOp cshift,
+                                         int64_t dimVal) {
+    using Fortran::common::maxRank;
+    hlfir::Entity shift = hlfir::Entity{cshift.getShift()};
+    hlfir::Entity array = hlfir::Entity{cshift.getArray()};
+    assert(array.isVariable() && "array must be a variable");
+    assert(!array.isPolymorphic() &&
+           "genInMemCShift does not support polymorphic types");
+    mlir::Location loc = cshift.getLoc();
+    fir::FirOpBuilder builder{rewriter, cshift.getOperation()};
+    // The new index computation involves MODULO, which is not implemented
+    // for IndexType, so use I64 instead.
+    mlir::Type calcType = builder.getI64Type();
+    // All the indices arithmetic used below does not overflow
+    // signed and unsigned I64.
+    builder.setIntegerOverflowFlags(mlir::arith::IntegerOverflowFlags::nsw |
+                                    mlir::arith::IntegerOverflowFlags::nuw);
+
+    mlir::Value arrayShape = hlfir::genShape(loc, builder, array);
+    llvm::SmallVector<mlir::Value, maxRank> arrayExtents =
+        hlfir::getExplicitExtentsFromShape(arrayShape, builder);
+    llvm::SmallVector<mlir::Value, 1> typeParams;
+    hlfir::genLengthParameters(loc, builder, array, typeParams);
+    mlir::Value shiftDimExtent =
+        builder.createConvert(loc, calcType, arrayExtents[dimVal - 1]);
+    mlir::Value shiftVal;
+    if (shift.isScalar()) {
+      shiftVal = hlfir::loadTrivialScalar(loc, builder, shift);
+      shiftVal =
+          normalizeShiftValue(loc, builder, shiftVal, shiftDimExtent, calcType);
+    }
+
+    hlfir::EvaluateInMemoryOp evalOp =
+        builder.create<hlfir::EvaluateInMemoryOp>(
+            loc, mlir::cast<hlfir::ExprType>(cshift.getType()), arrayShape);
+    builder.setInsertionPointToStart(&evalOp.getBody().front());
+
+    mlir::Value resultArray = evalOp.getMemory();
+    mlir::Type arrayType = fir::dyn_cast_ptrEleTy(resultArray.getType());
+    resultArray = builder.createBox(loc, fir::BoxType::get(arrayType),
+                                    resultArray, arrayShape, /*slice=*/nullptr,
+                                    typeParams, /*tdesc=*/nullptr);
+
+    // This is a generator of the dimension shift code.
+    // The code is inserted inside a loop nest over the other dimensions
+    // (if any). If exposeContiguity is true, the array's section
+    // array(s(1), ..., s(dim-1), :, s(dim+1), ..., s(n)) is represented
+    // as a contiguous 1D array.
+    // shiftVal is the normalized shift value that satisfies (SH >= 0 && SH <
+    // SIZE(ARRAY,DIM)).
+    //
+    auto genDimensionShift = [&](mlir::Location loc, fir::FirOpBuilder &builder,
+                                 mlir::Value shiftVal, bool exposeContiguity,
+                                 mlir::ValueRange oneBasedIndices)
+        -> llvm::SmallVector<mlir::Value, 0> {
+      // Create a vector of indices (s(1), ..., s(dim-1), nullptr, s(dim+1),
+      // ..., s(n)) so that we can update the dimVal index as needed.
+      llvm::SmallVector<mlir::Value, maxRank> srcIndices(
+          oneBasedIndices.begin(), oneBasedIndices.begin() + (dimVal - 1));
+      srcIndices.push_back(nullptr);
+      srcIndices.append(oneBasedIndices.begin() + (dimVal - 1),
+                        oneBasedIndices.end());
+      llvm::SmallVector<mlir::Value, maxRank> dstIndices(srcIndices);
+
+      hlfir::Entity srcArray = array;
+      if (exposeContiguity && mlir::isa<fir::BaseBoxType>(srcArray.getType())) {
+        assert(dimVal == 1 && "can expose contiguity only for dim 1");
+        llvm::SmallVector<mlir::Value, maxRank> arrayLbounds =
+            hlfir::genLowerbounds(loc, builder, arrayShape, array.getRank());
+        hlfir::Entity section =
+            hlfir::gen1DSection(loc, builder, srcArray, dimVal, arrayLbounds,
+                                arrayExtents, oneBasedIndices, typeParams);
+        mlir::Value addr = hlfir::genVariableRawAddress(loc, builder, section);
+        mlir::Value shape = hlfir::genShape(loc, builder, section);
+        mlir::Type boxType = fir::wrapInClassOrBoxType(
+            hlfir::getFortranElementOrSequenceType(section.getType()),
+            section.isPolymorphic());
+        srcArray = hlfir::Entity{
+            builder.createBox(loc, boxType, addr, shape, /*slice=*/nullptr,
+                              /*lengths=*/{}, /*tdesc=*/nullptr)};
+        // When shifting the dimension as a 1D section of the original
+        // array, we only need one index for addressing.
+        srcIndices.resize(1);
+      }
+
+      // Copy first portion of the array:
+      // do i=1,SH
+      //   result(i + (SIZE(ARRAY,DIM) - SH)) = array(i)
+      // end
+      auto genAssign1 = [&](mlir::Location loc, fir::FirOpBuilder &builder,
+                            mlir::ValueRange index,
+                            mlir::ValueRange reductionArgs)
+          -> llvm::SmallVector<mlir::Value, 0> {
+        assert(index.size() == 1 && "expected single loop");
+        mlir::Value srcIndex = builder.createConvert(loc, calcType, index[0]);
+        srcIndices[dimVal - 1] = srcIndex;
+        hlfir::Entity srcElementValue =
+            hlfir::loadElementAt(loc, builder, srcArray, srcIndices);
+        mlir::Value dstIndex = builder.create<mlir::arith::AddIOp>(
+            loc, srcIndex,
+            builder.create<mlir::arith::SubIOp>(loc, shiftDimExtent, shiftVal));
+        dstIndices[dimVal - 1] = dstIndex;
+        hlfir::Entity dstElement = hlfir::getElementAt(
+            loc, builder, hlfir::Entity{resultArray}, dstIndices);
+        builder.create<hlfir::AssignOp>(loc, srcElementValue, dstElement);
+        return {};
+      };
+
+      // Generate the first loop.
+      hlfir::genLoopNestWithReductions(loc, builder, {shiftVal},
+                                       /*reductionInits=*/{}, genAssign1,
+                                       /*isUnordered=*/true);
+
+      // Copy second portion of the array:
+      // do i=1,SIZE(ARRAY,DIM)-SH
+      //   result(i) = array(i + SH)
+      // end
+      auto genAssign2 = [&](mlir::Location loc, fir::FirOpBuilder &builder,
+                            mlir::ValueRange index,
+                            mlir::ValueRange reductionArgs)
+          -> llvm::SmallVector<mlir::Value, 0> {
+        assert(index.size() == 1 && "expected single loop");
+        mlir::Value dstIndex = builder.createConvert(loc, calcType, index[0]);
+        mlir::Value srcIndex =
+            builder.create<mlir::arith::AddIOp>(loc, dstIndex, shiftVal);
+        srcIndices[dimVal - 1] = srcIndex;
+        hlfir::Entity srcElementValue =
+            hlfir::loadElementAt(loc, builder, srcArray, srcIndices);
+        dstIndices[dimVal - 1] = dstIndex;
+        hlfir::Entity dstElement = hlfir::getElementAt(
+            loc, builder, hlfir::Entity{resultArray}, dstIndices);
+        builder.create<hlfir::AssignOp>(loc, srcElementValue, dstElement);
+        return {};
+      };
+
+      // Generate the second loop.
+      mlir::Value bound =
+          builder.create<mlir::arith::SubIOp>(loc, shiftDimExtent, shiftVal);
+      hlfir::genLoopNestWithReductions(loc, builder, {bound},
+                                       /*reductionInits=*/{}, genAssign2,
+                                       /*isUnordered=*/true);
+      return {};
+    };
+
+    // A wrapper around genDimensionShift that computes the normalized
+    // shift value and manages the insertion of the multiple versions
+    // of the shift based on the dynamic check of the leading dimension's
+    // contiguity (when dimVal == 1).
+    auto genShiftBody = [&](mlir::Location loc, fir::FirOpBuilder &builder,
+                            mlir::ValueRange oneBasedIndices,
+                            mlir::ValueRange reductionArgs)
+        -> llvm::SmallVector<mlir::Value, 0> {
+      // Copy the dimension with a shift:
+      // SH is either SHIFT (if scalar) or SHIFT(oneBasedIndices).
+      if (!shiftVal) {
+        assert(!oneBasedIndices.empty() && "scalar shift must be precomputed");
+        hlfir::Entity shiftElement =
+            hlfir::getElementAt(loc, builder, shift, oneBasedIndices);
+        shiftVal = hlfir::loadTrivialScalar(loc, builder, shiftElement);
+        shiftVal = normalizeShiftValue(loc, builder, shiftVal, shiftDimExtent,
+                                       calcType);
+      }
+
+      // If we can fetch the byte stride of the leading dimension,
+      // and the byte size of the element, then we can generate
+      // a dynamic contiguity check and expose the leading dimension's
+      // contiguity in FIR, making memcpy loop idiom recognition
+      // possible.
+      mlir::Value elemSize;
+      mlir::Value stride;
+      if (dimVal == 1 && mlir::isa<fir::BaseBoxType>(array.getType())) {
+        mlir::Type indexType = builder.getIndexType();
+        elemSize =
+            builder.create<fir::BoxEleSizeOp>(loc, indexType, array.getBase());
+        mlir::Value dimIdx =
+            builder.createIntegerConstant(loc, indexType, dimVal - 1);
+        auto boxDim = builder.create<fir::BoxDimsOp>(
+            loc, indexType, indexType, indexType, array.getBase(), dimIdx);
+        stride = boxDim.getByteStride();
+      }
+
+      if (array.isSimplyContiguous() || !elemSize || !stride) {
+        genDimensionShift(loc, builder, shiftVal, /*exposeContiguity=*/false,
+                          oneBasedIndices);
+        return {};
+      }
+
+      mlir::Value isContiguous = builder.create<mlir::arith::CmpIOp>(
+          loc, mlir::arith::CmpIPredicate::eq, elemSize, stride);
+      builder.genIfOp(loc, {}, isContiguous, /*withElseRegion=*/true)
+          .genThen([&]() {
+            genDimensionShift(loc, builder, shiftVal, /*exposeContiguity=*/true,
+                              oneBasedIndices);
+          })
+          .genElse([&]() {
+            genDimensionShift(loc, builder, shiftVal,
+                              /*exposeContiguity=*/false, oneBasedIndices);
+          });
+
+      return {};
+    };
+
+    // For 1D case, generate a single loop.
+    // For ND case, generate a loop nest over the other dimensions
+    // with a single loop inside (generated separately).
+    llvm::SmallVector<mlir::Value, maxRank> newExtents(arrayExtents);
+    newExtents.erase(newExtents.begin() + (dimVal - 1));
+    if (!newExtents.empty())
+      hlfir::genLoopNestWithReductions(loc, builder, newExtents,
+                                       /*reductionInits=*/{}, genShiftBody,
+                                       /*isUnordered=*/true);
+    else
+      genShiftBody(loc, builder, {}, {});
+
+    return evalOp.getOperation();
  }
 };

@@ -1181,7 +1482,7 @@ public:
    mlir::RewritePatternSet patterns(context);
    patterns.insert<TransposeAsElementalConversion>(context);
    patterns.insert<SumAsElementalConversion>(context);
-    patterns.insert<CShiftAsElementalConversion>(context);
+    patterns.insert<CShiftConversion>(context);
    patterns.insert<MatmulConversion<hlfir::MatmulTransposeOp>>(context);

    // If forceMatmulAsElemental is false, then hlfir.matmul inlining
--- a/flang/test/HLFIR/simplify-hlfir-intrinsics-cshift.fir
+++ b/flang/test/HLFIR/simplify-hlfir-intrinsics-cshift.fir
@@ -1,4 +1,4 @@
-// Test hlfir.cshift simplification to hlfir.elemental:
+// Test hlfir.cshift simplification to hlfir.elemental and hlfir.eval_in_mem:
 // RUN: fir-opt --simplify-hlfir-intrinsics %s | FileCheck %s

 func.func @cshift_vector(%arg0: !fir.box<!fir.array<?xi32>>, %arg1: !fir.ref<i32>) -> !hlfir.expr<?xi32>{
@@ -6,39 +6,114 @@ func.func @cshift_vector(%arg0: !fir.box<!fir.array<?xi32>>, %arg1: !fir.ref<i32
  return %res : !hlfir.expr<?xi32>
 }
 // CHECK-LABEL:   func.func @cshift_vector(
-// CHECK-SAME:                             %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>>,
-// CHECK-SAME:                             %[[VAL_1:.*]]: !fir.ref<i32>) -> !hlfir.expr<?xi32> {
-// CHECK:           %[[VAL_26:.*]] = arith.constant 1 : index
-// CHECK:           %[[VAL_16:.*]] = arith.constant 0 : i64
-// CHECK:           %[[VAL_5:.*]] = arith.constant 1 : i64
-// CHECK:           %[[VAL_2:.*]] = arith.constant 0 : index
-// CHECK:           %[[VAL_3:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_2]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
-// CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_3]]#1 : (index) -> !fir.shape<1>
-// CHECK:           %[[VAL_6:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32>
-// CHECK:           %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (i32) -> i64
-// CHECK:           %[[VAL_8:.*]] = hlfir.elemental %[[VAL_4]] unordered : (!fir.shape<1>) -> !hlfir.expr<?xi32> {
-// CHECK:           ^bb0(%[[VAL_9:.*]]: index):
-// CHECK:             %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (index) -> i64
-// CHECK:             %[[VAL_11:.*]] = arith.addi %[[VAL_10]], %[[VAL_7]] : i64
-// CHECK:             %[[VAL_12:.*]] = arith.subi %[[VAL_11]], %[[VAL_5]] : i64
-// CHECK:             %[[VAL_13:.*]] = fir.convert %[[VAL_3]]#1 : (index) -> i64
-// CHECK:             %[[VAL_14:.*]] = arith.remsi %[[VAL_12]], %[[VAL_13]] : i64
-// CHECK:             %[[VAL_15:.*]] = arith.xori %[[VAL_12]], %[[VAL_13]] : i64
-// CHECK:             %[[VAL_17:.*]] = arith.cmpi slt, %[[VAL_15]], %[[VAL_16]] : i64
-// CHECK:             %[[VAL_18:.*]] = arith.cmpi ne, %[[VAL_14]], %[[VAL_16]] : i64
-// CHECK:             %[[VAL_19:.*]] = arith.andi %[[VAL_18]], %[[VAL_17]] : i1
-// CHECK:             %[[VAL_20:.*]] = arith.addi %[[VAL_14]], %[[VAL_13]] : i64
-// CHECK:             %[[VAL_21:.*]] = arith.select %[[VAL_19]], %[[VAL_20]], %[[VAL_14]] : i64
-// CHECK:             %[[VAL_22:.*]] = arith.addi %[[VAL_21]], %[[VAL_5]] : i64
-// CHECK:             %[[VAL_23:.*]] = fir.convert %[[VAL_22]] : (i64) -> index
-// CHECK:             %[[VAL_25:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_2]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
-// CHECK:             %[[VAL_27:.*]] = arith.subi %[[VAL_25]]#0, %[[VAL_26]] : index
-// CHECK:             %[[VAL_28:.*]] = arith.addi %[[VAL_23]], %[[VAL_27]] : index
-// CHECK:             %[[VAL_29:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_28]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
-// CHECK:             %[[VAL_30:.*]] = fir.load %[[VAL_29]] : !fir.ref<i32>
-// CHECK:             hlfir.yield_element %[[VAL_30]] : i32
+// CHECK-SAME:                             %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box<!fir.array<?xi32>>,
+// CHECK-SAME:                             %[[VAL_1:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.ref<i32>) -> !hlfir.expr<?xi32> {
+// CHECK:           %[[VAL_2:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_3:.*]] = arith.constant 1 : i64
+// CHECK:           %[[VAL_4:.*]] = arith.constant 0 : i64
+// CHECK:           %[[VAL_5:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_6:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+// CHECK:           %[[VAL_7:.*]] = fir.shape %[[VAL_6]]#1 : (index) -> !fir.shape<1>
+// CHECK:           %[[VAL_8:.*]] = fir.convert %[[VAL_6]]#1 : (index) -> i64
+// CHECK:           %[[VAL_9:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32>
+// CHECK:           %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i32) -> i64
+// CHECK:           %[[ISZERO:.*]] = arith.cmpi eq, %[[VAL_8]], %[[VAL_4]] : i64
+// CHECK:           %[[EXTENT:.*]] = arith.select %[[ISZERO]], %[[VAL_3]], %[[VAL_8]] : i64
+// CHECK:           %[[VAL_11:.*]] = arith.remsi %[[VAL_10]], %[[EXTENT]] : i64
+// CHECK:           %[[VAL_12:.*]] = arith.xori %[[VAL_10]], %[[EXTENT]] : i64
+// CHECK:           %[[VAL_13:.*]] = arith.cmpi slt, %[[VAL_12]], %[[VAL_4]] : i64
+// CHECK:           %[[VAL_14:.*]] = arith.cmpi ne, %[[VAL_11]], %[[VAL_4]] : i64
+// CHECK:           %[[VAL_15:.*]] = arith.andi %[[VAL_14]], %[[VAL_13]] : i1
+// CHECK:           %[[VAL_16:.*]] = arith.addi %[[VAL_11]], %[[EXTENT]] overflow<nsw, nuw> : i64
+// CHECK:           %[[VAL_17:.*]] = arith.select %[[VAL_15]], %[[VAL_16]], %[[VAL_11]] : i64
+// CHECK:           %[[VAL_18:.*]] = hlfir.eval_in_mem shape %[[VAL_7]] : (!fir.shape<1>) -> !hlfir.expr<?xi32> {
+// CHECK:           ^bb0(%[[VAL_19:.*]]: !fir.ref<!fir.array<?xi32>>):
+// CHECK:             %[[VAL_20:.*]] = fir.embox %[[VAL_19]](%[[VAL_7]]) : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
+// CHECK:             %[[VAL_3:.*]] = fir.box_elesize %[[VAL_0]] : (!fir.box<!fir.array<?xi32>>) -> index
+// CHECK:             %[[VAL_21:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+
+// CHECK:             %[[VAL_22:.*]] = arith.cmpi eq, %[[VAL_3]], %[[VAL_21]]#2 : index
+// CHECK:             fir.if %[[VAL_22]] {
+// CHECK:               %[[VAL_23:.*]] = fir.shape %[[VAL_6]]#1 : (index) -> !fir.shape<1>
+// CHECK:               %[[VAL_24:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_2]]:%[[VAL_6]]#1:%[[VAL_2]])  shape %[[VAL_23]] : (!fir.box<!fir.array<?xi32>>, index, index, index, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
+// CHECK:               %[[VAL_25:.*]] = fir.box_addr %[[VAL_24]] : (!fir.box<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>>
+// CHECK:               %[[VAL_26:.*]] = fir.embox %[[VAL_25]](%[[VAL_23]]) : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
+// CHECK:               %[[VAL_27:.*]] = fir.convert %[[VAL_17]] : (i64) -> index
+// CHECK:               fir.do_loop %[[VAL_28:.*]] = %[[VAL_2]] to %[[VAL_27]] step %[[VAL_2]] unordered {
+// CHECK:                 %[[VAL_29:.*]] = fir.convert %[[VAL_28]] : (index) -> i64
+// CHECK:                 %[[VAL_30:.*]]:3 = fir.box_dims %[[VAL_26]], %[[VAL_5]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+// CHECK:                 %[[VAL_31:.*]] = fir.convert %[[VAL_29]] : (i64) -> index
+// CHECK:                 %[[VAL_32:.*]] = arith.subi %[[VAL_30]]#0, %[[VAL_2]] overflow<nsw, nuw> : index
+// CHECK:                 %[[VAL_33:.*]] = arith.addi %[[VAL_31]], %[[VAL_32]] overflow<nsw, nuw> : index
+// CHECK:                 %[[VAL_34:.*]] = hlfir.designate %[[VAL_26]] (%[[VAL_33]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:                 %[[VAL_35:.*]] = fir.load %[[VAL_34]] : !fir.ref<i32>
+// CHECK:                 %[[VAL_36:.*]] = arith.subi %[[VAL_8]], %[[VAL_17]] overflow<nsw, nuw> : i64
+// CHECK:                 %[[VAL_37:.*]] = arith.addi %[[VAL_29]], %[[VAL_36]] overflow<nsw, nuw> : i64
+// CHECK:                 %[[VAL_38:.*]]:3 = fir.box_dims %[[VAL_20]], %[[VAL_5]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+// CHECK:                 %[[VAL_39:.*]] = fir.convert %[[VAL_37]] : (i64) -> index
+// CHECK:                 %[[VAL_40:.*]] = arith.subi %[[VAL_38]]#0, %[[VAL_2]] overflow<nsw, nuw> : index
+// CHECK:                 %[[VAL_41:.*]] = arith.addi %[[VAL_39]], %[[VAL_40]] overflow<nsw, nuw> : index
+// CHECK:                 %[[VAL_42:.*]] = hlfir.designate %[[VAL_20]] (%[[VAL_41]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:                 hlfir.assign %[[VAL_35]] to %[[VAL_42]] : i32, !fir.ref<i32>
+// CHECK:               }
+// CHECK:               %[[VAL_43:.*]] = arith.subi %[[VAL_8]], %[[VAL_17]] overflow<nsw, nuw> : i64
+// CHECK:               %[[VAL_44:.*]] = fir.convert %[[VAL_43]] : (i64) -> index
+// CHECK:               fir.do_loop %[[VAL_45:.*]] = %[[VAL_2]] to %[[VAL_44]] step %[[VAL_2]] unordered {
+// CHECK:                 %[[VAL_46:.*]] = fir.convert %[[VAL_45]] : (index) -> i64
+// CHECK:                 %[[VAL_47:.*]] = arith.addi %[[VAL_46]], %[[VAL_17]] overflow<nsw, nuw> : i64
+// CHECK:                 %[[VAL_48:.*]]:3 = fir.box_dims %[[VAL_26]], %[[VAL_5]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+// CHECK:                 %[[VAL_49:.*]] = fir.convert %[[VAL_47]] : (i64) -> index
+// CHECK:                 %[[VAL_50:.*]] = arith.subi %[[VAL_48]]#0, %[[VAL_2]] overflow<nsw, nuw> : index
+// CHECK:                 %[[VAL_51:.*]] = arith.addi %[[VAL_49]], %[[VAL_50]] overflow<nsw, nuw> : index
+// CHECK:                 %[[VAL_52:.*]] = hlfir.designate %[[VAL_26]] (%[[VAL_51]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:                 %[[VAL_53:.*]] = fir.load %[[VAL_52]] : !fir.ref<i32>
+// CHECK:                 %[[VAL_54:.*]]:3 = fir.box_dims %[[VAL_20]], %[[VAL_5]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+// CHECK:                 %[[VAL_55:.*]] = fir.convert %[[VAL_46]] : (i64) -> index
+// CHECK:                 %[[VAL_56:.*]] = arith.subi %[[VAL_54]]#0, %[[VAL_2]] overflow<nsw, nuw> : index
+// CHECK:                 %[[VAL_57:.*]] = arith.addi %[[VAL_55]], %[[VAL_56]] overflow<nsw, nuw> : index
+// CHECK:                 %[[VAL_58:.*]] = hlfir.designate %[[VAL_20]] (%[[VAL_57]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:                 hlfir.assign %[[VAL_53]] to %[[VAL_58]] : i32, !fir.ref<i32>
+// CHECK:               }
+// CHECK:             } else {
+// CHECK:               %[[VAL_59:.*]] = fir.convert %[[VAL_17]] : (i64) -> index
+// CHECK:               fir.do_loop %[[VAL_60:.*]] = %[[VAL_2]] to %[[VAL_59]] step %[[VAL_2]] unordered {
+// CHECK:                 %[[VAL_61:.*]] = fir.convert %[[VAL_60]] : (index) -> i64
+// CHECK:                 %[[VAL_62:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+// CHECK:                 %[[VAL_63:.*]] = fir.convert %[[VAL_61]] : (i64) -> index
+// CHECK:                 %[[VAL_64:.*]] = arith.subi %[[VAL_62]]#0, %[[VAL_2]] overflow<nsw, nuw> : index
+// CHECK:                 %[[VAL_65:.*]] = arith.addi %[[VAL_63]], %[[VAL_64]] overflow<nsw, nuw> : index
+// CHECK:                 %[[VAL_66:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_65]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:                 %[[VAL_67:.*]] = fir.load %[[VAL_66]] : !fir.ref<i32>
+// CHECK:                 %[[VAL_68:.*]] = arith.subi %[[VAL_8]], %[[VAL_17]] overflow<nsw, nuw> : i64
+// CHECK:                 %[[VAL_69:.*]] = arith.addi %[[VAL_61]], %[[VAL_68]] overflow<nsw, nuw> : i64
+// CHECK:                 %[[VAL_70:.*]]:3 = fir.box_dims %[[VAL_20]], %[[VAL_5]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+// CHECK:                 %[[VAL_71:.*]] = fir.convert %[[VAL_69]] : (i64) -> index
+// CHECK:                 %[[VAL_72:.*]] = arith.subi %[[VAL_70]]#0, %[[VAL_2]] overflow<nsw, nuw> : index
+// CHECK:                 %[[VAL_73:.*]] = arith.addi %[[VAL_71]], %[[VAL_72]] overflow<nsw, nuw> : index
+// CHECK:                 %[[VAL_74:.*]] = hlfir.designate %[[VAL_20]] (%[[VAL_73]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:                 hlfir.assign %[[VAL_67]] to %[[VAL_74]] : i32, !fir.ref<i32>
+// CHECK:               }
+// CHECK:               %[[VAL_75:.*]] = arith.subi %[[VAL_8]], %[[VAL_17]] overflow<nsw, nuw> : i64
+// CHECK:               %[[VAL_76:.*]] = fir.convert %[[VAL_75]] : (i64) -> index
+// CHECK:               fir.do_loop %[[VAL_77:.*]] = %[[VAL_2]] to %[[VAL_76]] step %[[VAL_2]] unordered {
+// CHECK:                 %[[VAL_78:.*]] = fir.convert %[[VAL_77]] : (index) -> i64
+// CHECK:                 %[[VAL_79:.*]] = arith.addi %[[VAL_78]], %[[VAL_17]] overflow<nsw, nuw> : i64
+// CHECK:                 %[[VAL_80:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+// CHECK:                 %[[VAL_81:.*]] = fir.convert %[[VAL_79]] : (i64) -> index
+// CHECK:                 %[[VAL_82:.*]] = arith.subi %[[VAL_80]]#0, %[[VAL_2]] overflow<nsw, nuw> : index
+// CHECK:                 %[[VAL_83:.*]] = arith.addi %[[VAL_81]], %[[VAL_82]] overflow<nsw, nuw> : index
+// CHECK:                 %[[VAL_84:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_83]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:                 %[[VAL_85:.*]] = fir.load %[[VAL_84]] : !fir.ref<i32>
+// CHECK:                 %[[VAL_86:.*]]:3 = fir.box_dims %[[VAL_20]], %[[VAL_5]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+// CHECK:                 %[[VAL_87:.*]] = fir.convert %[[VAL_78]] : (i64) -> index
+// CHECK:                 %[[VAL_88:.*]] = arith.subi %[[VAL_86]]#0, %[[VAL_2]] overflow<nsw, nuw> : index
+// CHECK:                 %[[VAL_89:.*]] = arith.addi %[[VAL_87]], %[[VAL_88]] overflow<nsw, nuw> : index
+// CHECK:                 %[[VAL_90:.*]] = hlfir.designate %[[VAL_20]] (%[[VAL_89]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:                 hlfir.assign %[[VAL_85]] to %[[VAL_90]] : i32, !fir.ref<i32>
+// CHECK:               }
+// CHECK:             }
 // CHECK:           }
-// CHECK:           return
+// CHECK:           return %[[VAL_18]] : !hlfir.expr<?xi32>
 // CHECK:         }

 func.func @cshift_2d_by_scalar(%arg0: !fir.box<!fir.array<?x?xi32>>, %arg1: !fir.ref<i32>) -> !hlfir.expr<?x?xi32> {
@@ -47,43 +122,47 @@ func.func @cshift_2d_by_scalar(%arg0: !fir.box<!fir.array<?x?xi32>>, %arg1: !fir
  return %res : !hlfir.expr<?x?xi32>
 }
 // CHECK-LABEL:   func.func @cshift_2d_by_scalar(
-// CHECK-SAME:                                   %[[VAL_0:.*]]: !fir.box<!fir.array<?x?xi32>>,
-// CHECK-SAME:                                   %[[VAL_1:.*]]: !fir.ref<i32>) -> !hlfir.expr<?x?xi32> {
-// CHECK:           %[[VAL_20:.*]] = arith.constant 0 : i64
-// CHECK:           %[[VAL_8:.*]] = arith.constant 1 : i64
-// CHECK:           %[[VAL_5:.*]] = arith.constant 1 : index
-// CHECK:           %[[VAL_3:.*]] = arith.constant 0 : index
-// CHECK:           %[[VAL_4:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box<!fir.array<?x?xi32>>, index) -> (index, index, index)
-// CHECK:           %[[VAL_6:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.box<!fir.array<?x?xi32>>, index) -> (index, index, index)
-// CHECK:           %[[VAL_7:.*]] = fir.shape %[[VAL_4]]#1, %[[VAL_6]]#1 : (index, index) -> !fir.shape<2>
+// CHECK-SAME:                                   %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box<!fir.array<?x?xi32>>,
+// CHECK-SAME:                                   %[[VAL_1:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.ref<i32>) -> !hlfir.expr<?x?xi32> {
+// CHECK:           %[[ONE:.*]] = arith.constant 1 : i64
+// CHECK:           %[[VAL_2:.*]] = arith.constant 0 : i64
+// CHECK:           %[[VAL_3:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_4:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.box<!fir.array<?x?xi32>>, index) -> (index, index, index)
+// CHECK:           %[[VAL_6:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box<!fir.array<?x?xi32>>, index) -> (index, index, index)
+// CHECK:           %[[VAL_7:.*]] = fir.shape %[[VAL_5]]#1, %[[VAL_6]]#1 : (index, index) -> !fir.shape<2>
+// CHECK:           %[[VAL_8:.*]] = fir.convert %[[VAL_6]]#1 : (index) -> i64
 // CHECK:           %[[VAL_9:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32>
 // CHECK:           %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i32) -> i64
-// CHECK:           %[[VAL_11:.*]] = hlfir.elemental %[[VAL_7]] unordered : (!fir.shape<2>) -> !hlfir.expr<?x?xi32> {
-// CHECK:           ^bb0(%[[VAL_12:.*]]: index, %[[VAL_13:.*]]: index):
-// CHECK:             %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (index) -> i64
-// CHECK:             %[[VAL_15:.*]] = arith.addi %[[VAL_14]], %[[VAL_10]] : i64
-// CHECK:             %[[VAL_16:.*]] = arith.subi %[[VAL_15]], %[[VAL_8]] : i64
-// CHECK:             %[[VAL_17:.*]] = fir.convert %[[VAL_6]]#1 : (index) -> i64
-// CHECK:             %[[VAL_18:.*]] = arith.remsi %[[VAL_16]], %[[VAL_17]] : i64
-// CHECK:             %[[VAL_19:.*]] = arith.xori %[[VAL_16]], %[[VAL_17]] : i64
-// CHECK:             %[[VAL_21:.*]] = arith.cmpi slt, %[[VAL_19]], %[[VAL_20]] : i64
-// CHECK:             %[[VAL_22:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_20]] : i64
-// CHECK:             %[[VAL_23:.*]] = arith.andi %[[VAL_22]], %[[VAL_21]] : i1
-// CHECK:             %[[VAL_24:.*]] = arith.addi %[[VAL_18]], %[[VAL_17]] : i64
-// CHECK:             %[[VAL_25:.*]] = arith.select %[[VAL_23]], %[[VAL_24]], %[[VAL_18]] : i64
-// CHECK:             %[[VAL_26:.*]] = arith.addi %[[VAL_25]], %[[VAL_8]] : i64
+// CHECK:           %[[ISZERO:.*]] = arith.cmpi eq, %[[VAL_8]], %[[VAL_2]] : i64
+// CHECK:           %[[EXTENT:.*]] = arith.select %[[ISZERO]], %[[ONE]], %[[VAL_8]] : i64
+// CHECK:           %[[VAL_11:.*]] = arith.remsi %[[VAL_10]], %[[EXTENT]] : i64
+// CHECK:           %[[VAL_12:.*]] = arith.xori %[[VAL_10]], %[[EXTENT]] : i64
+// CHECK:           %[[VAL_13:.*]] = arith.cmpi slt, %[[VAL_12]], %[[VAL_2]] : i64
+// CHECK:           %[[VAL_14:.*]] = arith.cmpi ne, %[[VAL_11]], %[[VAL_2]] : i64
+// CHECK:           %[[VAL_15:.*]] = arith.andi %[[VAL_14]], %[[VAL_13]] : i1
+// CHECK:           %[[VAL_16:.*]] = arith.addi %[[VAL_11]], %[[EXTENT]] overflow<nsw, nuw> : i64
+// CHECK:           %[[VAL_17:.*]] = arith.select %[[VAL_15]], %[[VAL_16]], %[[VAL_11]] : i64
+// CHECK:           %[[VAL_18:.*]] = hlfir.elemental %[[VAL_7]] unordered : (!fir.shape<2>) -> !hlfir.expr<?x?xi32> {
+// CHECK:           ^bb0(%[[VAL_19:.*]]: index, %[[VAL_20:.*]]: index):
+// CHECK:             %[[VAL_21:.*]] = arith.subi %[[VAL_8]], %[[VAL_17]] overflow<nsw, nuw> : i64
+// CHECK:             %[[VAL_22:.*]] = arith.subi %[[VAL_17]], %[[VAL_8]] overflow<nsw, nuw> : i64
+// CHECK:             %[[VAL_23:.*]] = fir.convert %[[VAL_20]] : (index) -> i64
+// CHECK:             %[[VAL_24:.*]] = arith.cmpi sle, %[[VAL_23]], %[[VAL_21]] : i64
+// CHECK:             %[[VAL_25:.*]] = arith.select %[[VAL_24]], %[[VAL_17]], %[[VAL_22]] : i64
+// CHECK:             %[[VAL_26:.*]] = arith.addi %[[VAL_23]], %[[VAL_25]] overflow<nsw, nuw> : i64
 // CHECK:             %[[VAL_27:.*]] = fir.convert %[[VAL_26]] : (i64) -> index
+// CHECK:             %[[VAL_28:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.box<!fir.array<?x?xi32>>, index) -> (index, index, index)
 // CHECK:             %[[VAL_29:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box<!fir.array<?x?xi32>>, index) -> (index, index, index)
-// CHECK:             %[[VAL_31:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.box<!fir.array<?x?xi32>>, index) -> (index, index, index)
-// CHECK:             %[[VAL_33:.*]] = arith.subi %[[VAL_29]]#0, %[[VAL_5]] : index
-// CHECK:             %[[VAL_34:.*]] = arith.addi %[[VAL_12]], %[[VAL_33]] : index
-// CHECK:             %[[VAL_35:.*]] = arith.subi %[[VAL_31]]#0, %[[VAL_5]] : index
-// CHECK:             %[[VAL_36:.*]] = arith.addi %[[VAL_27]], %[[VAL_35]] : index
-// CHECK:             %[[VAL_37:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_34]], %[[VAL_36]])  : (!fir.box<!fir.array<?x?xi32>>, index, index) -> !fir.ref<i32>
-// CHECK:             %[[VAL_38:.*]] = fir.load %[[VAL_37]] : !fir.ref<i32>
-// CHECK:             hlfir.yield_element %[[VAL_38]] : i32
+// CHECK:             %[[VAL_30:.*]] = arith.subi %[[VAL_28]]#0, %[[VAL_3]] overflow<nsw, nuw> : index
+// CHECK:             %[[VAL_31:.*]] = arith.addi %[[VAL_19]], %[[VAL_30]] overflow<nsw, nuw> : index
+// CHECK:             %[[VAL_32:.*]] = arith.subi %[[VAL_29]]#0, %[[VAL_3]] overflow<nsw, nuw> : index
+// CHECK:             %[[VAL_33:.*]] = arith.addi %[[VAL_27]], %[[VAL_32]] overflow<nsw, nuw> : index
+// CHECK:             %[[VAL_34:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_31]], %[[VAL_33]])  : (!fir.box<!fir.array<?x?xi32>>, index, index) -> !fir.ref<i32>
+// CHECK:             %[[VAL_35:.*]] = fir.load %[[VAL_34]] : !fir.ref<i32>
+// CHECK:             hlfir.yield_element %[[VAL_35]] : i32
 // CHECK:           }
-// CHECK:           return
+// CHECK:           return %[[VAL_18]] : !hlfir.expr<?x?xi32>
 // CHECK:         }

 func.func @cshift_2d_by_vector(%arg0: !fir.box<!fir.array<?x?xi32>>, %arg1: !fir.box<!fir.array<?xi32>>) -> !hlfir.expr<?x?xi32> {
@@ -92,47 +171,51 @@ func.func @cshift_2d_by_vector(%arg0: !fir.box<!fir.array<?x?xi32>>, %arg1: !fir
  return %res : !hlfir.expr<?x?xi32>
 }
 // CHECK-LABEL:   func.func @cshift_2d_by_vector(
-// CHECK-SAME:                                   %[[VAL_0:.*]]: !fir.box<!fir.array<?x?xi32>>,
-// CHECK-SAME:                                   %[[VAL_1:.*]]: !fir.box<!fir.array<?xi32>>) -> !hlfir.expr<?x?xi32> {
-// CHECK:           %[[VAL_26:.*]] = arith.constant 0 : i64
-// CHECK:           %[[VAL_8:.*]] = arith.constant 1 : i64
-// CHECK:           %[[VAL_5:.*]] = arith.constant 1 : index
-// CHECK:           %[[VAL_3:.*]] = arith.constant 0 : index
-// CHECK:           %[[VAL_4:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box<!fir.array<?x?xi32>>, index) -> (index, index, index)
-// CHECK:           %[[VAL_6:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.box<!fir.array<?x?xi32>>, index) -> (index, index, index)
-// CHECK:           %[[VAL_7:.*]] = fir.shape %[[VAL_4]]#1, %[[VAL_6]]#1 : (index, index) -> !fir.shape<2>
+// CHECK-SAME:                                   %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box<!fir.array<?x?xi32>>,
+// CHECK-SAME:                                   %[[VAL_1:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box<!fir.array<?xi32>>) -> !hlfir.expr<?x?xi32> {
+// CHECK:           %[[ONE:.*]] = arith.constant 1 : i64
+// CHECK:           %[[VAL_2:.*]] = arith.constant 0 : i64
+// CHECK:           %[[VAL_3:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_4:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.box<!fir.array<?x?xi32>>, index) -> (index, index, index)
+// CHECK:           %[[VAL_6:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box<!fir.array<?x?xi32>>, index) -> (index, index, index)
+// CHECK:           %[[VAL_7:.*]] = fir.shape %[[VAL_5]]#1, %[[VAL_6]]#1 : (index, index) -> !fir.shape<2>
+// CHECK:           %[[VAL_8:.*]] = fir.convert %[[VAL_6]]#1 : (index) -> i64
 // CHECK:           %[[VAL_9:.*]] = hlfir.elemental %[[VAL_7]] unordered : (!fir.shape<2>) -> !hlfir.expr<?x?xi32> {
 // CHECK:           ^bb0(%[[VAL_10:.*]]: index, %[[VAL_11:.*]]: index):
-// CHECK:             %[[VAL_13:.*]]:3 = fir.box_dims %[[VAL_1]], %[[VAL_3]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
-// CHECK:             %[[VAL_15:.*]] = arith.subi %[[VAL_13]]#0, %[[VAL_5]] : index
-// CHECK:             %[[VAL_16:.*]] = arith.addi %[[VAL_10]], %[[VAL_15]] : index
-// CHECK:             %[[VAL_17:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_16]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
-// CHECK:             %[[VAL_18:.*]] = fir.load %[[VAL_17]] : !fir.ref<i32>
-// CHECK:             %[[VAL_19:.*]] = fir.convert %[[VAL_18]] : (i32) -> i64
-// CHECK:             %[[VAL_20:.*]] = fir.convert %[[VAL_11]] : (index) -> i64
-// CHECK:             %[[VAL_21:.*]] = arith.addi %[[VAL_20]], %[[VAL_19]] : i64
-// CHECK:             %[[VAL_22:.*]] = arith.subi %[[VAL_21]], %[[VAL_8]] : i64
-// CHECK:             %[[VAL_23:.*]] = fir.convert %[[VAL_6]]#1 : (index) -> i64
-// CHECK:             %[[VAL_24:.*]] = arith.remsi %[[VAL_22]], %[[VAL_23]] : i64
-// CHECK:             %[[VAL_25:.*]] = arith.xori %[[VAL_22]], %[[VAL_23]] : i64
-// CHECK:             %[[VAL_27:.*]] = arith.cmpi slt, %[[VAL_25]], %[[VAL_26]] : i64
-// CHECK:             %[[VAL_28:.*]] = arith.cmpi ne, %[[VAL_24]], %[[VAL_26]] : i64
-// CHECK:             %[[VAL_29:.*]] = arith.andi %[[VAL_28]], %[[VAL_27]] : i1
-// CHECK:             %[[VAL_30:.*]] = arith.addi %[[VAL_24]], %[[VAL_23]] : i64
-// CHECK:             %[[VAL_31:.*]] = arith.select %[[VAL_29]], %[[VAL_30]], %[[VAL_24]] : i64
-// CHECK:             %[[VAL_32:.*]] = arith.addi %[[VAL_31]], %[[VAL_8]] : i64
-// CHECK:             %[[VAL_33:.*]] = fir.convert %[[VAL_32]] : (i64) -> index
-// CHECK:             %[[VAL_35:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box<!fir.array<?x?xi32>>, index) -> (index, index, index)
-// CHECK:             %[[VAL_37:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.box<!fir.array<?x?xi32>>, index) -> (index, index, index)
-// CHECK:             %[[VAL_39:.*]] = arith.subi %[[VAL_35]]#0, %[[VAL_5]] : index
-// CHECK:             %[[VAL_40:.*]] = arith.addi %[[VAL_10]], %[[VAL_39]] : index
-// CHECK:             %[[VAL_41:.*]] = arith.subi %[[VAL_37]]#0, %[[VAL_5]] : index
-// CHECK:             %[[VAL_42:.*]] = arith.addi %[[VAL_33]], %[[VAL_41]] : index
-// CHECK:             %[[VAL_43:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_40]], %[[VAL_42]])  : (!fir.box<!fir.array<?x?xi32>>, index, index) -> !fir.ref<i32>
-// CHECK:             %[[VAL_44:.*]] = fir.load %[[VAL_43]] : !fir.ref<i32>
-// CHECK:             hlfir.yield_element %[[VAL_44]] : i32
+// CHECK:             %[[VAL_12:.*]]:3 = fir.box_dims %[[VAL_1]], %[[VAL_4]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+// CHECK:             %[[VAL_13:.*]] = arith.subi %[[VAL_12]]#0, %[[VAL_3]] overflow<nsw, nuw> : index
+// CHECK:             %[[VAL_14:.*]] = arith.addi %[[VAL_10]], %[[VAL_13]] overflow<nsw, nuw> : index
+// CHECK:             %[[VAL_15:.*]] = hlfir.designate %[[VAL_1]] (%[[VAL_14]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:             %[[VAL_16:.*]] = fir.load %[[VAL_15]] : !fir.ref<i32>
+// CHECK:             %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (i32) -> i64
+// CHECK:             %[[ISZERO:.*]] = arith.cmpi eq, %[[VAL_8]], %[[VAL_2]] : i64
+// CHECK:             %[[EXTENT:.*]] = arith.select %[[ISZERO]], %[[ONE]], %[[VAL_8]] : i64
+// CHECK:             %[[VAL_18:.*]] = arith.remsi %[[VAL_17]], %[[EXTENT]] : i64
+// CHECK:             %[[VAL_19:.*]] = arith.xori %[[VAL_17]], %[[EXTENT]] : i64
+// CHECK:             %[[VAL_20:.*]] = arith.cmpi slt, %[[VAL_19]], %[[VAL_2]] : i64
+// CHECK:             %[[VAL_21:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_2]] : i64
+// CHECK:             %[[VAL_22:.*]] = arith.andi %[[VAL_21]], %[[VAL_20]] : i1
+// CHECK:             %[[VAL_23:.*]] = arith.addi %[[VAL_18]], %[[EXTENT]] overflow<nsw, nuw> : i64
+// CHECK:             %[[VAL_24:.*]] = arith.select %[[VAL_22]], %[[VAL_23]], %[[VAL_18]] : i64
+// CHECK:             %[[VAL_25:.*]] = arith.subi %[[VAL_8]], %[[VAL_24]] overflow<nsw, nuw> : i64
+// CHECK:             %[[VAL_26:.*]] = arith.subi %[[VAL_24]], %[[VAL_8]] overflow<nsw, nuw> : i64
+// CHECK:             %[[VAL_27:.*]] = fir.convert %[[VAL_11]] : (index) -> i64
+// CHECK:             %[[VAL_28:.*]] = arith.cmpi sle, %[[VAL_27]], %[[VAL_25]] : i64
+// CHECK:             %[[VAL_29:.*]] = arith.select %[[VAL_28]], %[[VAL_24]], %[[VAL_26]] : i64
+// CHECK:             %[[VAL_30:.*]] = arith.addi %[[VAL_27]], %[[VAL_29]] overflow<nsw, nuw> : i64
+// CHECK:             %[[VAL_31:.*]] = fir.convert %[[VAL_30]] : (i64) -> index
+// CHECK:             %[[VAL_32:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.box<!fir.array<?x?xi32>>, index) -> (index, index, index)
+// CHECK:             %[[VAL_33:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box<!fir.array<?x?xi32>>, index) -> (index, index, index)
+// CHECK:             %[[VAL_34:.*]] = arith.subi %[[VAL_32]]#0, %[[VAL_3]] overflow<nsw, nuw> : index
+// CHECK:             %[[VAL_35:.*]] = arith.addi %[[VAL_10]], %[[VAL_34]] overflow<nsw, nuw> : index
+// CHECK:             %[[VAL_36:.*]] = arith.subi %[[VAL_33]]#0, %[[VAL_3]] overflow<nsw, nuw> : index
+// CHECK:             %[[VAL_37:.*]] = arith.addi %[[VAL_31]], %[[VAL_36]] overflow<nsw, nuw> : index
+// CHECK:             %[[VAL_38:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_35]], %[[VAL_37]])  : (!fir.box<!fir.array<?x?xi32>>, index, index) -> !fir.ref<i32>
+// CHECK:             %[[VAL_39:.*]] = fir.load %[[VAL_38]] : !fir.ref<i32>
+// CHECK:             hlfir.yield_element %[[VAL_39]] : i32
 // CHECK:           }
-// CHECK:           return
+// CHECK:           return %[[VAL_9]] : !hlfir.expr<?x?xi32>
 // CHECK:         }

 func.func @cshift_vector_char(%arg0: !fir.box<!fir.array<?x!fir.char<2,?>>>, %arg1: !fir.ref<i32>) -> !hlfir.expr<?x!fir.char<2,?>> {
@@ -140,43 +223,47 @@ func.func @cshift_vector_char(%arg0: !fir.box<!fir.array<?x!fir.char<2,?>>>, %ar
  return %res : !hlfir.expr<?x!fir.char<2,?>>
 }
 // CHECK-LABEL:   func.func @cshift_vector_char(
-// CHECK-SAME:                                  %[[VAL_0:.*]]: !fir.box<!fir.array<?x!fir.char<2,?>>>,
-// CHECK-SAME:                                  %[[VAL_1:.*]]: !fir.ref<i32>) -> !hlfir.expr<?x!fir.char<2,?>> {
-// CHECK:           %[[VAL_32:.*]] = arith.constant 1 : index
-// CHECK:           %[[VAL_19:.*]] = arith.constant 0 : i64
-// CHECK:           %[[VAL_8:.*]] = arith.constant 1 : i64
-// CHECK:           %[[VAL_6:.*]] = arith.constant 2 : index
-// CHECK:           %[[VAL_2:.*]] = arith.constant 0 : index
-// CHECK:           %[[VAL_3:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_2]] : (!fir.box<!fir.array<?x!fir.char<2,?>>>, index) -> (index, index, index)
-// CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_3]]#1 : (index) -> !fir.shape<1>
-// CHECK:           %[[VAL_5:.*]] = fir.box_elesize %[[VAL_0]] : (!fir.box<!fir.array<?x!fir.char<2,?>>>) -> index
-// CHECK:           %[[VAL_7:.*]] = arith.divsi %[[VAL_5]], %[[VAL_6]] : index
-// CHECK:           %[[VAL_9:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32>
-// CHECK:           %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i32) -> i64
-// CHECK:           %[[VAL_11:.*]] = hlfir.elemental %[[VAL_4]] typeparams %[[VAL_7]] unordered : (!fir.shape<1>, index) -> !hlfir.expr<?x!fir.char<2,?>> {
-// CHECK:           ^bb0(%[[VAL_12:.*]]: index):
-// CHECK:             %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (index) -> i64
-// CHECK:             %[[VAL_14:.*]] = arith.addi %[[VAL_13]], %[[VAL_10]] : i64
-// CHECK:             %[[VAL_15:.*]] = arith.subi %[[VAL_14]], %[[VAL_8]] : i64
-// CHECK:             %[[VAL_16:.*]] = fir.convert %[[VAL_3]]#1 : (index) -> i64
-// CHECK:             %[[VAL_17:.*]] = arith.remsi %[[VAL_15]], %[[VAL_16]] : i64
-// CHECK:             %[[VAL_18:.*]] = arith.xori %[[VAL_15]], %[[VAL_16]] : i64
-// CHECK:             %[[VAL_20:.*]] = arith.cmpi slt, %[[VAL_18]], %[[VAL_19]] : i64
-// CHECK:             %[[VAL_21:.*]] = arith.cmpi ne, %[[VAL_17]], %[[VAL_19]] : i64
-// CHECK:             %[[VAL_22:.*]] = arith.andi %[[VAL_21]], %[[VAL_20]] : i1
-// CHECK:             %[[VAL_23:.*]] = arith.addi %[[VAL_17]], %[[VAL_16]] : i64
-// CHECK:             %[[VAL_24:.*]] = arith.select %[[VAL_22]], %[[VAL_23]], %[[VAL_17]] : i64
-// CHECK:             %[[VAL_25:.*]] = arith.addi %[[VAL_24]], %[[VAL_8]] : i64
-// CHECK:             %[[VAL_26:.*]] = fir.convert %[[VAL_25]] : (i64) -> index
-// CHECK:             %[[VAL_27:.*]] = fir.box_elesize %[[VAL_0]] : (!fir.box<!fir.array<?x!fir.char<2,?>>>) -> index
-// CHECK:             %[[VAL_29:.*]] = arith.divsi %[[VAL_27]], %[[VAL_6]] : index
-// CHECK:             %[[VAL_31:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_2]] : (!fir.box<!fir.array<?x!fir.char<2,?>>>, index) -> (index, index, index)
-// CHECK:             %[[VAL_33:.*]] = arith.subi %[[VAL_31]]#0, %[[VAL_32]] : index
-// CHECK:             %[[VAL_34:.*]] = arith.addi %[[VAL_26]], %[[VAL_33]] : index
-// CHECK:             %[[VAL_35:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_34]])  typeparams %[[VAL_29]] : (!fir.box<!fir.array<?x!fir.char<2,?>>>, index, index) -> !fir.boxchar<2>
-// CHECK:             hlfir.yield_element %[[VAL_35]] : !fir.boxchar<2>
+// CHECK-SAME:                                  %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box<!fir.array<?x!fir.char<2,?>>>,
+// CHECK-SAME:                                  %[[VAL_1:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.ref<i32>) -> !hlfir.expr<?x!fir.char<2,?>> {
+// CHECK:           %[[VAL_2:.*]] = arith.constant 1 : index
+// CHECK:           %[[ONE:.*]] = arith.constant 1 : i64
+// CHECK:           %[[VAL_3:.*]] = arith.constant 0 : i64
+// CHECK:           %[[VAL_4:.*]] = arith.constant 2 : index
+// CHECK:           %[[VAL_5:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_6:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.box<!fir.array<?x!fir.char<2,?>>>, index) -> (index, index, index)
+// CHECK:           %[[VAL_7:.*]] = fir.shape %[[VAL_6]]#1 : (index) -> !fir.shape<1>
+// CHECK:           %[[VAL_8:.*]] = fir.box_elesize %[[VAL_0]] : (!fir.box<!fir.array<?x!fir.char<2,?>>>) -> index
+// CHECK:           %[[VAL_9:.*]] = arith.divsi %[[VAL_8]], %[[VAL_4]] : index
+// CHECK:           %[[VAL_10:.*]] = fir.convert %[[VAL_6]]#1 : (index) -> i64
+// CHECK:           %[[VAL_11:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32>
+// CHECK:           %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (i32) -> i64
+// CHECK:           %[[ISZERO:.*]] = arith.cmpi eq, %[[VAL_10]], %[[VAL_3]] : i64
+// CHECK:           %[[EXTENT:.*]] = arith.select %[[ISZERO]], %[[ONE]], %[[VAL_10]] : i64
+// CHECK:           %[[VAL_13:.*]] = arith.remsi %[[VAL_12]], %[[EXTENT]] : i64
+// CHECK:           %[[VAL_14:.*]] = arith.xori %[[VAL_12]], %[[EXTENT]] : i64
+// CHECK:           %[[VAL_15:.*]] = arith.cmpi slt, %[[VAL_14]], %[[VAL_3]] : i64
+// CHECK:           %[[VAL_16:.*]] = arith.cmpi ne, %[[VAL_13]], %[[VAL_3]] : i64
+// CHECK:           %[[VAL_17:.*]] = arith.andi %[[VAL_16]], %[[VAL_15]] : i1
+// CHECK:           %[[VAL_18:.*]] = arith.addi %[[VAL_13]], %[[EXTENT]] overflow<nsw, nuw> : i64
+// CHECK:           %[[VAL_19:.*]] = arith.select %[[VAL_17]], %[[VAL_18]], %[[VAL_13]] : i64
+// CHECK:           %[[VAL_20:.*]] = hlfir.elemental %[[VAL_7]] typeparams %[[VAL_9]] unordered : (!fir.shape<1>, index) -> !hlfir.expr<?x!fir.char<2,?>> {
+// CHECK:           ^bb0(%[[VAL_21:.*]]: index):
+// CHECK:             %[[VAL_22:.*]] = arith.subi %[[VAL_10]], %[[VAL_19]] overflow<nsw, nuw> : i64
+// CHECK:             %[[VAL_23:.*]] = arith.subi %[[VAL_19]], %[[VAL_10]] overflow<nsw, nuw> : i64
+// CHECK:             %[[VAL_24:.*]] = fir.convert %[[VAL_21]] : (index) -> i64
+// CHECK:             %[[VAL_25:.*]] = arith.cmpi sle, %[[VAL_24]], %[[VAL_22]] : i64
+// CHECK:             %[[VAL_26:.*]] = arith.select %[[VAL_25]], %[[VAL_19]], %[[VAL_23]] : i64
+// CHECK:             %[[VAL_27:.*]] = arith.addi %[[VAL_24]], %[[VAL_26]] overflow<nsw, nuw> : i64
+// CHECK:             %[[VAL_28:.*]] = fir.convert %[[VAL_27]] : (i64) -> index
+// CHECK:             %[[VAL_29:.*]] = fir.box_elesize %[[VAL_0]] : (!fir.box<!fir.array<?x!fir.char<2,?>>>) -> index
+// CHECK:             %[[VAL_30:.*]] = arith.divsi %[[VAL_29]], %[[VAL_4]] : index
+// CHECK:             %[[VAL_31:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.box<!fir.array<?x!fir.char<2,?>>>, index) -> (index, index, index)
+// CHECK:             %[[VAL_32:.*]] = arith.subi %[[VAL_31]]#0, %[[VAL_2]] overflow<nsw, nuw> : index
+// CHECK:             %[[VAL_33:.*]] = arith.addi %[[VAL_28]], %[[VAL_32]] overflow<nsw, nuw> : index
+// CHECK:             %[[VAL_34:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_33]])  typeparams %[[VAL_30]] : (!fir.box<!fir.array<?x!fir.char<2,?>>>, index, index) -> !fir.boxchar<2>
+// CHECK:             hlfir.yield_element %[[VAL_34]] : !fir.boxchar<2>
 // CHECK:           }
-// CHECK:           return
+// CHECK:           return %[[VAL_20]] : !hlfir.expr<?x!fir.char<2,?>>
 // CHECK:         }

 func.func @cshift_vector_poly(%arg0: !fir.class<!fir.array<?x!fir.type<_QFFtestTt>>>, %arg1: i32) -> !hlfir.expr<?x!fir.type<_QFFtestTt>?> {
@@ -184,37 +271,41 @@ func.func @cshift_vector_poly(%arg0: !fir.class<!fir.array<?x!fir.type<_QFFtestT
  return %res : !hlfir.expr<?x!fir.type<_QFFtestTt>?>
 }
 // CHECK-LABEL:   func.func @cshift_vector_poly(
-// CHECK-SAME:                                  %[[VAL_0:.*]]: !fir.class<!fir.array<?x!fir.type<_QFFtestTt>>>,
-// CHECK-SAME:                                  %[[VAL_1:.*]]: i32) -> !hlfir.expr<?x!fir.type<_QFFtestTt>?> {
-// CHECK:           %[[VAL_25:.*]] = arith.constant 1 : index
-// CHECK:           %[[VAL_15:.*]] = arith.constant 0 : i64
-// CHECK:           %[[VAL_5:.*]] = arith.constant 1 : i64
-// CHECK:           %[[VAL_2:.*]] = arith.constant 0 : index
-// CHECK:           %[[VAL_3:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_2]] : (!fir.class<!fir.array<?x!fir.type<_QFFtestTt>>>, index) -> (index, index, index)
-// CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_3]]#1 : (index) -> !fir.shape<1>
-// CHECK:           %[[VAL_6:.*]] = fir.convert %[[VAL_1]] : (i32) -> i64
-// CHECK:           %[[VAL_7:.*]] = hlfir.elemental %[[VAL_4]] mold %[[VAL_0]] unordered : (!fir.shape<1>, !fir.class<!fir.array<?x!fir.type<_QFFtestTt>>>) -> !hlfir.expr<?x!fir.type<_QFFtestTt>?> {
-// CHECK:           ^bb0(%[[VAL_8:.*]]: index):
-// CHECK:             %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (index) -> i64
-// CHECK:             %[[VAL_10:.*]] = arith.addi %[[VAL_9]], %[[VAL_6]] : i64
-// CHECK:             %[[VAL_11:.*]] = arith.subi %[[VAL_10]], %[[VAL_5]] : i64
-// CHECK:             %[[VAL_12:.*]] = fir.convert %[[VAL_3]]#1 : (index) -> i64
-// CHECK:             %[[VAL_13:.*]] = arith.remsi %[[VAL_11]], %[[VAL_12]] : i64
-// CHECK:             %[[VAL_14:.*]] = arith.xori %[[VAL_11]], %[[VAL_12]] : i64
-// CHECK:             %[[VAL_16:.*]] = arith.cmpi slt, %[[VAL_14]], %[[VAL_15]] : i64
-// CHECK:             %[[VAL_17:.*]] = arith.cmpi ne, %[[VAL_13]], %[[VAL_15]] : i64
-// CHECK:             %[[VAL_18:.*]] = arith.andi %[[VAL_17]], %[[VAL_16]] : i1
-// CHECK:             %[[VAL_19:.*]] = arith.addi %[[VAL_13]], %[[VAL_12]] : i64
-// CHECK:             %[[VAL_20:.*]] = arith.select %[[VAL_18]], %[[VAL_19]], %[[VAL_13]] : i64
-// CHECK:             %[[VAL_21:.*]] = arith.addi %[[VAL_20]], %[[VAL_5]] : i64
-// CHECK:             %[[VAL_22:.*]] = fir.convert %[[VAL_21]] : (i64) -> index
-// CHECK:             %[[VAL_24:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_2]] : (!fir.class<!fir.array<?x!fir.type<_QFFtestTt>>>, index) -> (index, index, index)
-// CHECK:             %[[VAL_26:.*]] = arith.subi %[[VAL_24]]#0, %[[VAL_25]] : index
-// CHECK:             %[[VAL_27:.*]] = arith.addi %[[VAL_22]], %[[VAL_26]] : index
+// CHECK-SAME:                                  %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.class<!fir.array<?x!fir.type<_QFFtestTt>>>,
+// CHECK-SAME:                                  %[[VAL_1:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: i32) -> !hlfir.expr<?x!fir.type<_QFFtestTt>?> {
+// CHECK:           %[[VAL_2:.*]] = arith.constant 1 : index
+// CHECK:           %[[ONE:.*]] = arith.constant 1 : i64
+// CHECK:           %[[VAL_3:.*]] = arith.constant 0 : i64
+// CHECK:           %[[VAL_4:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_5:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.class<!fir.array<?x!fir.type<_QFFtestTt>>>, index) -> (index, index, index)
+// CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]]#1 : (index) -> !fir.shape<1>
+// CHECK:           %[[VAL_7:.*]] = fir.convert %[[VAL_5]]#1 : (index) -> i64
+// CHECK:           %[[VAL_8:.*]] = fir.convert %[[VAL_1]] : (i32) -> i64
+// CHECK:           %[[ISZERO:.*]] = arith.cmpi eq, %[[VAL_7]], %[[VAL_3]] : i64
+// CHECK:           %[[EXTENT:.*]] = arith.select %[[ISZERO]], %[[ONE]], %[[VAL_7]] : i64
+// CHECK:           %[[VAL_9:.*]] = arith.remsi %[[VAL_8]], %[[EXTENT]] : i64
+// CHECK:           %[[VAL_10:.*]] = arith.xori %[[VAL_8]], %[[EXTENT]] : i64
+// CHECK:           %[[VAL_11:.*]] = arith.cmpi slt, %[[VAL_10]], %[[VAL_3]] : i64
+// CHECK:           %[[VAL_12:.*]] = arith.cmpi ne, %[[VAL_9]], %[[VAL_3]] : i64
+// CHECK:           %[[VAL_13:.*]] = arith.andi %[[VAL_12]], %[[VAL_11]] : i1
+// CHECK:           %[[VAL_14:.*]] = arith.addi %[[VAL_9]], %[[EXTENT]] overflow<nsw, nuw> : i64
+// CHECK:           %[[VAL_15:.*]] = arith.select %[[VAL_13]], %[[VAL_14]], %[[VAL_9]] : i64
+// CHECK:           %[[VAL_16:.*]] = hlfir.elemental %[[VAL_6]] mold %[[VAL_0]] unordered : (!fir.shape<1>, !fir.class<!fir.array<?x!fir.type<_QFFtestTt>>>) -> !hlfir.expr<?x!fir.type<_QFFtestTt>?> {
+// CHECK:           ^bb0(%[[VAL_17:.*]]: index):
+// CHECK:             %[[VAL_18:.*]] = arith.subi %[[VAL_7]], %[[VAL_15]] overflow<nsw, nuw> : i64
+// CHECK:             %[[VAL_19:.*]] = arith.subi %[[VAL_15]], %[[VAL_7]] overflow<nsw, nuw> : i64
+// CHECK:             %[[VAL_20:.*]] = fir.convert %[[VAL_17]] : (index) -> i64
+// CHECK:             %[[VAL_21:.*]] = arith.cmpi sle, %[[VAL_20]], %[[VAL_18]] : i64
+// CHECK:             %[[VAL_22:.*]] = arith.select %[[VAL_21]], %[[VAL_15]], %[[VAL_19]] : i64
+// CHECK:             %[[VAL_23:.*]] = arith.addi %[[VAL_20]], %[[VAL_22]] overflow<nsw, nuw> : i64
+// CHECK:             %[[VAL_24:.*]] = fir.convert %[[VAL_23]] : (i64) -> index
+// CHECK:             %[[VAL_25:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.class<!fir.array<?x!fir.type<_QFFtestTt>>>, index) -> (index, index, index)
+// CHECK:             %[[VAL_26:.*]] = arith.subi %[[VAL_25]]#0, %[[VAL_2]] overflow<nsw, nuw> : index
+// CHECK:             %[[VAL_27:.*]] = arith.addi %[[VAL_24]], %[[VAL_26]] overflow<nsw, nuw> : index
 // CHECK:             %[[VAL_28:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_27]])  : (!fir.class<!fir.array<?x!fir.type<_QFFtestTt>>>, index) -> !fir.class<!fir.type<_QFFtestTt>>
 // CHECK:             hlfir.yield_element %[[VAL_28]] : !fir.class<!fir.type<_QFFtestTt>>
 // CHECK:           }
-// CHECK:           return
+// CHECK:           return %[[VAL_16]] : !hlfir.expr<?x!fir.type<_QFFtestTt>?>
 // CHECK:         }

 // negative: non-constant dim argument
@@ -243,36 +334,13 @@ func.func @cshift_vector_assumed_dim_1(%arg0: !fir.box<!fir.array<?xi32>>, %arg1
  return %res : !hlfir.expr<?xi32>
 }
 // CHECK-LABEL:   func.func @cshift_vector_assumed_dim_1(
-// CHECK-SAME:                                           %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>>,
-// CHECK-SAME:                                           %[[VAL_1:.*]]: i32) -> !hlfir.expr<?xi32> {
-// CHECK:           %[[VAL_26:.*]] = arith.constant 1 : index
-// CHECK:           %[[VAL_16:.*]] = arith.constant 0 : i64
-// CHECK:           %[[VAL_6:.*]] = arith.constant 1 : i64
-// CHECK:           %[[VAL_3:.*]] = arith.constant 0 : index
-// CHECK:           %[[VAL_4:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
-// CHECK:           %[[VAL_5:.*]] = fir.shape %[[VAL_4]]#1 : (index) -> !fir.shape<1>
-// CHECK:           %[[VAL_7:.*]] = fir.convert %[[VAL_1]] : (i32) -> i64
-// CHECK:           %[[VAL_8:.*]] = hlfir.elemental %[[VAL_5]] unordered : (!fir.shape<1>) -> !hlfir.expr<?xi32> {
-// CHECK:           ^bb0(%[[VAL_9:.*]]: index):
-// CHECK:             %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (index) -> i64
-// CHECK:             %[[VAL_11:.*]] = arith.addi %[[VAL_10]], %[[VAL_7]] : i64
-// CHECK:             %[[VAL_12:.*]] = arith.subi %[[VAL_11]], %[[VAL_6]] : i64
-// CHECK:             %[[VAL_13:.*]] = fir.convert %[[VAL_4]]#1 : (index) -> i64
-// CHECK:             %[[VAL_14:.*]] = arith.remsi %[[VAL_12]], %[[VAL_13]] : i64
-// CHECK:             %[[VAL_15:.*]] = arith.xori %[[VAL_12]], %[[VAL_13]] : i64
-// CHECK:             %[[VAL_17:.*]] = arith.cmpi slt, %[[VAL_15]], %[[VAL_16]] : i64
-// CHECK:             %[[VAL_18:.*]] = arith.cmpi ne, %[[VAL_14]], %[[VAL_16]] : i64
-// CHECK:             %[[VAL_19:.*]] = arith.andi %[[VAL_18]], %[[VAL_17]] : i1
-// CHECK:             %[[VAL_20:.*]] = arith.addi %[[VAL_14]], %[[VAL_13]] : i64
-// CHECK:             %[[VAL_21:.*]] = arith.select %[[VAL_19]], %[[VAL_20]], %[[VAL_14]] : i64
-// CHECK:             %[[VAL_22:.*]] = arith.addi %[[VAL_21]], %[[VAL_6]] : i64
-// CHECK:             %[[VAL_23:.*]] = fir.convert %[[VAL_22]] : (i64) -> index
-// CHECK:             %[[VAL_25:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
-// CHECK:             %[[VAL_27:.*]] = arith.subi %[[VAL_25]]#0, %[[VAL_26]] : index
-// CHECK:             %[[VAL_28:.*]] = arith.addi %[[VAL_23]], %[[VAL_27]] : index
-// CHECK:             %[[VAL_29:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_28]])  : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
-// CHECK:             %[[VAL_30:.*]] = fir.load %[[VAL_29]] : !fir.ref<i32>
-// CHECK:             hlfir.yield_element %[[VAL_30]] : i32
-// CHECK:           }
-// CHECK:           return
-// CHECK:         }
+// CHECK-NOT: hlfir.cshift
+
+// Check that hlfir.cshift is converted to hlfir.elemental
+// when the argument is an array expression:
+func.func @cshift_vector_expr(%arg0: !hlfir.expr<?xi32>, %arg1: !fir.ref<i32>) -> !hlfir.expr<?xi32>{
+  %res = hlfir.cshift %arg0 %arg1 : (!hlfir.expr<?xi32>, !fir.ref<i32>) -> !hlfir.expr<?xi32>
+  return %res : !hlfir.expr<?xi32>
+}
+// CHECK-LABEL:   func.func @cshift_vector_expr(
+// CHECK: hlfir.elemental