Backport LLVM instcombine fix in fold extract element to trunc

This is a backport for LLVM fix: a3f2e01c95
2025-11-04 08:21:06 +08:00 · 2025-06-20 09:04:48 +00:00
parent e7ec97343f
commit 110147b100
1 changed files with 99 additions and 0 deletions
--- a/external/llvm/releases/16.0.0/patches_external/InstCombine-Only-fold-extract-element-to-trunc-if-ve.patch
+++ b/external/llvm/releases/16.0.0/patches_external/InstCombine-Only-fold-extract-element-to-trunc-if-ve.patch
@ -0,0 +1,99 @@
+/*========================== begin_copyright_notice ============================
+
+Copyright (C) 2025 Intel Corporation
+
+SPDX-License-Identifier: MIT
+
+============================= end_copyright_notice ===========================*/
+
+/*========================== begin_copyright_notice ============================
+
+Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+See https://llvm.org/LICENSE.txt for license information.
+SPDX-License-Identifier: Apache-2.0 with LLVM-exception
+
+============================= end_copyright_notice ===========================*/
+
+From c5c679933c462f28fac7358841a23ee32c292a47 Mon Sep 17 00:00:00 2001
+From: peterbell10 <peterbell10@openai.com>
+Date: Wed, 20 Nov 2024 21:06:57 +0000
+Subject: [PATCH] [InstCombine] Only fold extract element to trunc if vector
+ `hasOneUse` (#115627)
+
+This fixes a missed optimization caused by the `foldBitcastExtElt`
+pattern interfering with other combine patterns. In the case I was
+hitting, we have IR that combines two vectors into a new larger vector
+by extracting elements and inserting them into the new vector.
+
+```llvm
+define <4 x half> @bitcast_extract_insert_to_shuffle(i32 %a, i32 %b) {
+  %avec = bitcast i32 %a to <2 x half>
+  %a0 = extractelement <2 x half> %avec, i32 0
+  %a1 = extractelement <2 x half> %avec, i32 1
+  %bvec = bitcast i32 %b to <2 x half>
+  %b0 = extractelement <2 x half> %bvec, i32 0
+  %b1 = extractelement <2 x half> %bvec, i32 1
+  %ins0 = insertelement <4 x half> undef, half %a0, i32 0
+  %ins1 = insertelement <4 x half> %ins0, half %a1, i32 1
+  %ins2 = insertelement <4 x half> %ins1, half %b0, i32 2
+  %ins3 = insertelement <4 x half> %ins2, half %b1, i32 3
+  ret <4 x half> %ins3
+}
+```
+
+With the current behavior, `InstCombine` converts each vector extract
+sequence to
+
+```llvm
+  %tmp = trunc i32 %a to i16
+  %a0 = bitcast i16 %tmp to half
+  %a1 = extractelement <2 x half> %avec, i32 1
+```
+
+where the extraction of `%a0` is now done by truncating the original
+integer. While on it's own this is fairly reasonable, in this case it
+also blocks the pattern which converts `extractelement` -
+`insertelement` into shuffles which gives the overall simpler result:
+
+```llvm
+define <4 x half> @bitcast_extract_insert_to_shuffle(i32 %a, i32 %b) {
+  %avec = bitcast i32 %a to <2 x half>
+  %bvec = bitcast i32 %b to <2 x half>
+  %ins3 = shufflevector <2 x half> %avec, <2 x half> %bvec, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x half> %ins3
+}
+```
+
+In this PR I fix the conflict by obeying the `hasOneUse` check even if
+there is no shift instruction required. In these cases we can't remove
+the vector completely, so the pattern has less benefit anyway.
+
+Also fwiw, I think dropping the `hasOneUse` check for the 0th element
+might have been a mistake in the first place. Looking at
+https://github.com/llvm/llvm-project/commit/535c5d56a7bc9966036a11362d8984983a4bf090
+the commit message only mentions loosening the `isDesirableIntType`
+requirement and doesn't mention changing the `hasOneUse` check at all.
+---
+ llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+index 61e62adbe327..d3b30848ab8b 100644
+--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+@@ -202,9 +202,9 @@ Instruction *InstCombinerImpl::foldBitcastExtElt(ExtractElementInst &Ext) {
+     if (IsBigEndian)
+       ExtIndexC = NumElts.getKnownMinValue() - 1 - ExtIndexC;
+     unsigned ShiftAmountC = ExtIndexC * DestWidth;
+-    if (!ShiftAmountC ||
+-        (isDesirableIntType(X->getType()->getPrimitiveSizeInBits()) &&
+-        Ext.getVectorOperand()->hasOneUse())) {
+    if ((!ShiftAmountC ||
+         isDesirableIntType(X->getType()->getPrimitiveSizeInBits())) &&
+        Ext.getVectorOperand()->hasOneUse()) {
+       if (ShiftAmountC)
+         X = Builder.CreateLShr(X, ShiftAmountC, "extelt.offset");
+       if (DestTy->isFloatingPointTy()) {
+-- 
+2.43.0
+