From 7dae3f64c069b942b9abd768fd94af4b2cd0b781 Mon Sep 17 00:00:00 2001
From: Richard Sandiford
Date: Thu, 13 Mar 2025 12:03:04 +0000
Subject: [PATCH] match.pd: Fold ((X >> C1) & C2) * (1 << C1)

Using a combination of rules, we were able to fold

  ((X >> C1) & C2) * (1 << C1)  -->  X & (C2 << C1)

if everything was done at the same precision, but we couldn't fold
it if the AND was done at a different precision.  The optimisation
is often (but not always) valid for that case too.

This patch adds a dedicated rule for the case where different
precisions are involved.

An alternative would be to extend the individual folds that together
handle the same-precision case so that those rules handle differing
precisions.  But the risk is that that could replace narrow operations
with wide operations, which would be especially harmful on targets
like avr.  It's also not obviously free of cycles.

I also wondered whether the converts should be non-optional.

gcc/
	* match.pd: Fold ((X >> C1) & C2) * (1 << C1) to X & (C2 << C1).

gcc/testsuite/
	* gcc.dg/fold-mul-and-lshift-1.c: New test.
	* gcc.dg/fold-mul-and-lshift-2.c: Likewise.
---
 gcc/match.pd                                 | 28 ++++++++++
 gcc/testsuite/gcc.dg/fold-mul-and-lshift-1.c | 59 ++++++++++++++++++++
 gcc/testsuite/gcc.dg/fold-mul-and-lshift-2.c | 15 +++++
 3 files changed, 102 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/fold-mul-and-lshift-1.c
 create mode 100644 gcc/testsuite/gcc.dg/fold-mul-and-lshift-2.c

diff --git a/gcc/match.pd b/gcc/match.pd
index 5c679848bdf..7017fd15277 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -5231,6 +5231,34 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
       (if (mask)
        (bit_op (shift (convert @0) @1) { mask; })))))))
 
+/* Fold ((X >> C1) & C2) * (1 << C1) into X & (C2 << C1), including cases where
+   the & happens in a different type.  It is the conversion case that isn't
+   a composition of other folds.
+
+   Let the type of the * and >> be T1 and the type of the & be T2.
+   The fold is valid if the conversion to T2 preserves all information;
+   that is, if T2 is wider than T1 or drops no more than C1 bits from T1.
+   In that case, the & might operate on bits that are dropped by the
+   later conversion to T1 and the multiplication by (1 << C1), but those
+   bits are also dropped by ANDing with C2 << C1 (converted to T1).
+
+   If the conversion to T2 is not information-preserving, we have to be
+   careful about the later conversion to T1 acting as a sign extension.
+   We need either T2 to be unsigned or the top (sign) bit of C2 to be clear.
+   That is equivalent to testing whether C2 is nonnegative.  */
+(simplify
+ (mult (convert? (bit_and (convert? (rshift @0 INTEGER_CST@1)) INTEGER_CST@2))
+       INTEGER_CST@3)
+ (if (tree_nop_conversion_p (type, TREE_TYPE (@0))
+      && wi::ltu_p (wi::to_widest (@1), TYPE_PRECISION (type)))
+  (with { unsigned int shift = tree_to_uhwi (@1);
+	  unsigned int prec = TYPE_PRECISION (type); }
+   (if ((prec <= TYPE_PRECISION (TREE_TYPE (@2)) + shift
+	 || tree_int_cst_sgn (@2) >= 0)
+	&& wi::to_wide (@3) == wi::set_bit_in_zero (shift, prec))
+    (with { auto mask = wide_int::from (wi::to_wide (@2), prec, UNSIGNED); }
+     (bit_and @0 { wide_int_to_tree (type, mask << shift); }))))))
+
 /* ~(~X >> Y) -> X >> Y (for arithmetic shift).  */
 (simplify
  (bit_not (convert1?:s (rshift:s (convert2?@0 (bit_not @1)) @2)))
diff --git a/gcc/testsuite/gcc.dg/fold-mul-and-lshift-1.c b/gcc/testsuite/gcc.dg/fold-mul-and-lshift-1.c
new file mode 100644
index 00000000000..b1ce10495e3
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/fold-mul-and-lshift-1.c
@@ -0,0 +1,59 @@
+/* { dg-do compile { target lp64 } } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump-not { >> } "optimized" } } */
+/* { dg-final { scan-tree-dump-not { \* } "optimized" } } */
+
+unsigned int
+f1 (unsigned int x)
+{
+  x >>= 1;
+  unsigned long y = x;
+  y &= 255;
+  x = y;
+  x *= 2;
+  return x;
+}
+
+unsigned int
+f2 (unsigned int x)
+{
+  x >>= 1;
+  unsigned long y = x;
+  y &= -2UL;
+  x = y;
+  x *= 2;
+  return x;
+}
+
+unsigned int
+f3 (unsigned int x)
+{
+  x >>= 1;
+  unsigned short y = x;
+  y &= 255;
+  x = y;
+  x *= 2;
+  return x;
+}
+
+unsigned int
+f4 (unsigned int x)
+{
+  x >>= 1;
+  short y = x;
+  y &= (unsigned short) ~0U >> 1;
+  x = y;
+  x *= 2;
+  return x;
+}
+
+unsigned int
+f5 (unsigned int x)
+{
+  x >>= 16;
+  short y = x;
+  y &= -2;
+  x = y;
+  x *= 1 << 16;
+  return x;
+}
diff --git a/gcc/testsuite/gcc.dg/fold-mul-and-lshift-2.c b/gcc/testsuite/gcc.dg/fold-mul-and-lshift-2.c
new file mode 100644
index 00000000000..86eabef0fef
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/fold-mul-and-lshift-2.c
@@ -0,0 +1,15 @@
+/* { dg-do compile { target int32 } } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump { >> 15;} "optimized" } } */
+/* { dg-final { scan-tree-dump { \* 32768;} "optimized" } } */
+
+unsigned int
+f1 (unsigned int x)
+{
+  x >>= 15;
+  short y = x;
+  y &= -2;
+  x = y;
+  x *= 1 << 15;
+  return x;
+}
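
Aside (not part of the patch): a small standalone C sketch that spot-checks
the arithmetic behind the fold for the mixed-precision shape matched by the
new rule, mirroring f1 in fold-mul-and-lshift-1.c.  It assumes a 32-bit
unsigned int; the helper names unfolded and folded are illustrative only.

#include <assert.h>
#include <limits.h>

/* The shape matched by the new rule: shift right by C1 (= 1), AND with
   C2 (= 255) in a wider type, convert back, multiply by 1 << C1.  */
static unsigned int
unfolded (unsigned int x)
{
  x >>= 1;
  unsigned long y = x;
  y &= 255;
  x = y;
  x *= 2;
  return x;
}

/* What the rule produces: a single AND with C2 << C1 (255 << 1 = 510).  */
static unsigned int
folded (unsigned int x)
{
  return x & 510;
}

int
main (void)
{
  /* Spot-check a few values, including ones with the high bits set.  */
  unsigned int tests[] = { 0u, 1u, 2u, 3u, 510u, 511u, 512u,
			   0x80000001u, UINT_MAX };
  for (unsigned int i = 0; i < sizeof tests / sizeof tests[0]; i++)
    assert (unfolded (tests[i]) == folded (tests[i]));
  return 0;
}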