match.pd: Fold ((X >> C1) & C2) * (1 << C1)

Using a combination of rules, we were able to fold

  ((X >> C1) & C2) * (1 << C1)  -->  X & (C2 << C1)

if everything was done at the same precision, but we couldn't fold
it if the AND was done at a different precision.  The optimisation is
often (but not always) valid for that case too.

This patch adds a dedicated rule for the case where different precisions
are involved.
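
For illustration, the kind of mixed-precision case the new rule is
intended to catch looks like this (a standalone sketch that mirrors f1
in the new fold-mul-and-lshift-1.c test; the function name is just
illustrative, not part of the patch):

  unsigned int
  widen_and (unsigned int x)
  {
    x >>= 1;              /* (X >> C1), C1 == 1, in unsigned int */
    unsigned long y = x;  /* conversion to a wider type */
    y &= 255;             /* ... & C2, done at the wider precision */
    x = y;                /* conversion back */
    x *= 2;               /* * (1 << C1) */
    return x;
  }

With the new rule the whole sequence folds to x & 510, i.e.
x & (255 << 1).  When the & is instead done in a type that drops more
than C1 bits and C2 is negative, the rule does not fire;
fold-mul-and-lshift-2.c covers that case.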

An alternative would be to extend the individual folds that together
handle the same-precision case so that those rules handle differing
precisions.  But the risk is that doing so could replace narrow
operations with wide operations, which would be especially harmful on
targets like avr.  It's also not obviously free of cycles.

I also wondered whether the converts should be non-optional.

gcc/
	* match.pd: Fold ((X >> C1) & C2) * (1 << C1) to X & (C2 << C1).

gcc/testsuite/
	* gcc.dg/fold-mul-and-lshift-1.c: New test.
	* gcc.dg/fold-mul-and-lshift-2.c: Likewise.
Author: Richard Sandiford
Date:   2025-03-13 12:03:04 +00:00
Commit: 7dae3f64c0
Parent: a68e32b8e4
3 changed files with 102 additions and 0 deletions

gcc/match.pd
@@ -5231,6 +5231,34 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
     (if (mask)
      (bit_op (shift (convert @0) @1) { mask; })))))))

/* Fold ((X >> C1) & C2) * (1 << C1) into X & (C2 << C1), including cases where
   the & happens in a different type.  It is the conversion case that isn't
   a composition of other folds.

   Let the type of the * and >> be T1 and the type of the & be T2.
   The fold is valid if the conversion to T2 preserves all information;
   that is, if T2 is wider than T1 or drops no more than C1 bits from T1.
   In that case, the & might operate on bits that are dropped by the
   later conversion to T1 and the multiplication by (1 << C1), but those
   bits are also dropped by ANDing with C2 << C1 (converted to T1).

   If the conversion to T2 is not information-preserving, we have to be
   careful about the later conversion to T1 acting as a sign extension.
   We need either T2 to be unsigned or the top (sign) bit of C2 to be clear.
   That is equivalent to testing whether C2 is nonnegative.  */
(simplify
 (mult (convert? (bit_and (convert? (rshift @0 INTEGER_CST@1)) INTEGER_CST@2))
       INTEGER_CST@3)
 (if (tree_nop_conversion_p (type, TREE_TYPE (@0))
      && wi::ltu_p (wi::to_widest (@1), TYPE_PRECISION (type)))
  (with { unsigned int shift = tree_to_uhwi (@1);
          unsigned int prec = TYPE_PRECISION (type); }
   (if ((prec <= TYPE_PRECISION (TREE_TYPE (@2)) + shift
         || tree_int_cst_sgn (@2) >= 0)
        && wi::to_wide (@3) == wi::set_bit_in_zero (shift, prec))
    (with { auto mask = wide_int::from (wi::to_wide (@2), prec, UNSIGNED); }
     (bit_and @0 { wide_int_to_tree (type, mask << shift); }))))))

/* ~(~X >> Y) -> X >> Y (for arithmetic shift).  */
(simplify
 (bit_not (convert1?:s (rshift:s (convert2?@0 (bit_not @1)) @2)))

gcc/testsuite/gcc.dg/fold-mul-and-lshift-1.c
@@ -0,0 +1,59 @@
/* { dg-do compile { target lp64 } } */
/* { dg-options "-O2 -fdump-tree-optimized" } */
/* { dg-final { scan-tree-dump-not { >> } "optimized" } } */
/* { dg-final { scan-tree-dump-not { \* } "optimized" } } */

unsigned int
f1 (unsigned int x)
{
  x >>= 1;
  unsigned long y = x;
  y &= 255;
  x = y;
  x *= 2;
  return x;
}

unsigned int
f2 (unsigned int x)
{
  x >>= 1;
  unsigned long y = x;
  y &= -2UL;
  x = y;
  x *= 2;
  return x;
}

unsigned int
f3 (unsigned int x)
{
  x >>= 1;
  unsigned short y = x;
  y &= 255;
  x = y;
  x *= 2;
  return x;
}

unsigned int
f4 (unsigned int x)
{
  x >>= 1;
  short y = x;
  y &= (unsigned short) ~0U >> 1;
  x = y;
  x *= 2;
  return x;
}

unsigned int
f5 (unsigned int x)
{
  x >>= 16;
  short y = x;
  y &= -2;
  x = y;
  x *= 1 << 16;
  return x;
}

gcc/testsuite/gcc.dg/fold-mul-and-lshift-2.c
@@ -0,0 +1,15 @@
/* { dg-do compile { target int32 } } */
/* { dg-options "-O2 -fdump-tree-optimized" } */
/* { dg-final { scan-tree-dump { >> 15;} "optimized" } } */
/* { dg-final { scan-tree-dump { \* 32768;} "optimized" } } */

unsigned int
f1 (unsigned int x)
{
  x >>= 15;
  short y = x;
  y &= -2;
  x = y;
  x *= 1 << 15;
  return x;
}
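
Why this last case must not be folded: the conversion to short drops
more than C1 bits and C2 (-2) is negative, so the conversion back to
unsigned int acts as a sign extension.  A standalone sketch (not part
of the patch; it assumes 32-bit int, as the test's int32 target does)
that shows the difference for one concrete input:

  #include <stdio.h>

  int
  main (void)
  {
    unsigned int x = 0x40000000;

    /* What f1 in fold-mul-and-lshift-2.c computes.  */
    unsigned int a = x >> 15;
    short y = a;          /* truncated to 16 bits (GCC wraps modulo 2^16) */
    y &= -2;
    a = y;                /* sign-extended back to 32 bits */
    a *= 1 << 15;

    /* What a blanket X & (C2 << C1) fold would compute,
       with C2 zero-extended to 32 bits as the new rule does.  */
    unsigned int b = x & (0xFFFEu << 15);

    printf ("%#x vs %#x\n", a, b);  /* prints 0xc0000000 vs 0x40000000 */
    return 0;
  }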