AArch64: support encoding integer immediates using floating point moves

This patch extends our SIMD immediate generation to support creating integer
immediates with a floating point move when the integer bit pattern maps to an
exact FP value.

As an example:

uint32x4_t f1() {
    return vdupq_n_u32(0x3f800000);
}

currently generates:

f1:
        adrp    x0, .LC0
        ldr     q0, [x0, #:lo12:.LC0]
        ret

i.e. a load, but with this change:

f1:
        fmov    v0.4s, 1.0e+0
        ret

Such immediates are common in e.g. the math routines in glibc, where they are
used as masks to extract or mark part of an FP value.
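As a minimal illustration of why the rewrite is sound (a hypothetical demo,
not part of the patch): the vector register holds only a bit pattern, so
reinterpreting the integer constant as a float is free whenever that pattern
happens to be an exactly encodable FP value.

#include <stdint.h>
#include <string.h>

int main (void)
{
  uint32_t mask = 0x3f800000;    /* integer mask whose bits equal 1.0f */
  float f;
  memcpy (&f, &mask, sizeof f);  /* reinterpret the bits, no conversion */
  return f == 1.0f ? 0 : 1;      /* exits 0: the bit patterns coincide */
}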

gcc/ChangeLog:

	* config/aarch64/aarch64.cc (aarch64_sve_valid_immediate,
	aarch64_simd_valid_immediate): Refactor accepting modes and values.
	(aarch64_float_const_representable_p): Refactor and extract FP checks
	into ...
	(aarch64_real_float_const_representable_p): ...This and fix fail
	fallback from real_to_integer.
	(aarch64_advsimd_valid_immediate): Use it.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/const_create_using_fmov.c: New test.
commit 87dc6b1992 (parent fc35079277)
Author: Tamar Christina
Date:   2024-10-18 09:43:45 +01:00
2 changed files with 241 additions and 128 deletions

gcc/config/aarch64/aarch64.cc

@@ -22899,19 +22899,19 @@ aarch64_advsimd_valid_immediate_hs (unsigned int val32,
return false;
}
/* Return true if replicating VAL64 is a valid immediate for the
/* Return true if replicating VAL64 with mode MODE is a valid immediate for the
Advanced SIMD operation described by WHICH. If INFO is nonnull,
use it to describe valid immediates. */
static bool
aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
scalar_int_mode mode,
simd_immediate_info *info,
enum simd_immediate_check which)
{
unsigned int val32 = val64 & 0xffffffff;
unsigned int val16 = val64 & 0xffff;
unsigned int val8 = val64 & 0xff;
if (val32 == (val64 >> 32))
if (mode != DImode)
{
if ((which & AARCH64_CHECK_ORR) != 0
&& aarch64_advsimd_valid_immediate_hs (val32, info, which,
@@ -22924,9 +22924,7 @@ aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
return true;
/* Try using a replicated byte. */
if (which == AARCH64_CHECK_MOV
&& val16 == (val32 >> 16)
&& val8 == (val16 >> 8))
if (which == AARCH64_CHECK_MOV && mode == QImode)
{
if (info)
*info = simd_immediate_info (QImode, val8);
@@ -22954,28 +22952,15 @@ aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
return false;
}
/* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
instruction. If INFO is nonnull, use it to describe valid immediates. */
/* Return true if replicating IVAL with MODE gives a valid immediate for an SVE
MOV instruction. If INFO is nonnull, use it to describe valid
immediates. */
static bool
aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT ival, scalar_int_mode mode,
simd_immediate_info *info)
{
scalar_int_mode mode = DImode;
unsigned int val32 = val64 & 0xffffffff;
if (val32 == (val64 >> 32))
{
mode = SImode;
unsigned int val16 = val32 & 0xffff;
if (val16 == (val32 >> 16))
{
mode = HImode;
unsigned int val8 = val16 & 0xff;
if (val8 == (val16 >> 8))
mode = QImode;
}
}
HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
HOST_WIDE_INT val = trunc_int_for_mode (ival, mode);
if (IN_RANGE (val, -0x80, 0x7f))
{
/* DUP with no shift. */
@@ -22990,7 +22975,7 @@ aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
*info = simd_immediate_info (mode, val);
return true;
}
if (aarch64_bitmask_imm (val64, mode))
if (aarch64_bitmask_imm (ival, mode))
{
/* DUPM. */
if (info)
@@ -23071,6 +23056,91 @@ aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
return false;
}
/* We can only represent floating point constants which will fit in
"quarter-precision" values. These values are characterised by
a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
by:
(-1)^s * (n/16) * 2^r
Where:
's' is the sign bit.
'n' is an integer in the range 16 <= n <= 31.
'r' is an integer in the range -3 <= r <= 4.
Return true iff R represents a value encodable into an AArch64 floating point
move instruction as an immediate.  Otherwise false. */
static bool
aarch64_real_float_const_representable_p (REAL_VALUE_TYPE r)
{
/* This represents our current view of how many bits
make up the mantissa. */
int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
int exponent;
unsigned HOST_WIDE_INT mantissa, mask;
REAL_VALUE_TYPE m;
bool fail = false;
/* We cannot represent infinities, NaNs or +/-zero. We won't
know if we have +zero until we analyse the mantissa, but we
can reject the other invalid values. */
if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
|| REAL_VALUE_MINUS_ZERO (r))
return false;
/* Extract exponent. */
r = real_value_abs (&r);
exponent = REAL_EXP (&r);
/* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
highest (sign) bit, with a fixed binary point at bit point_pos.
m1 holds the low part of the mantissa, m2 the high part.
WARNING: If we ever have a representation using more than 2 * H_W_I - 1
bits for the mantissa, this can fail (low bits will be lost). */
real_ldexp (&m, &r, point_pos - exponent);
wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
/* If the low part of the mantissa has bits set we cannot represent
the value. */
if (fail || w.ulow () != 0)
return false;
/* We have rejected the lower HOST_WIDE_INT, so update our
understanding of how many bits lie in the mantissa and
look only at the high HOST_WIDE_INT. */
mantissa = w.elt (1);
point_pos -= HOST_BITS_PER_WIDE_INT;
/* We can only represent values with a mantissa of the form 1.xxxx. */
mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
if ((mantissa & mask) != 0)
return false;
/* Having filtered unrepresentable values, we may now remove all
but the highest 5 bits. */
mantissa >>= point_pos - 5;
/* We cannot represent the value 0.0, so reject it. This is handled
elsewhere. */
if (mantissa == 0)
return false;
/* Then, as bit 4 is always set, we can mask it off, leaving
the mantissa in the range [0, 15]. */
mantissa &= ~(1 << 4);
gcc_assert (mantissa <= 15);
/* GCC internally does not use IEEE754-like encoding (where normalized
significands are in the range [1, 2). GCC uses [0.5, 1) (see real.cc).
Our mantissa values are shifted 4 places to the left relative to
normalized IEEE754 so we must modify the exponent returned by REAL_EXP
by 5 places to correct for GCC's representation. */
exponent = 5 - exponent;
return (exponent >= 0 && exponent <= 7);
}
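As a cross-check of the (-1)^s * (n/16) * 2^r condition implemented above,
here is a minimal standalone sketch of the same predicate, written against a
plain double instead of GCC's REAL_VALUE_TYPE machinery (quarter_precision_p
is a hypothetical name, not a GCC function):

#include <math.h>
#include <stdbool.h>

/* True iff D equals (-1)^s * (n/16) * 2^r with 16 <= n <= 31 and
   -3 <= r <= 4, i.e. a value an AArch64 FMOV immediate can encode.  */
static bool
quarter_precision_p (double d)
{
  if (d == 0.0 || !isfinite (d))
    return false;                      /* reject 0.0, Inf and NaN */
  d = fabs (d);
  for (int r = -3; r <= 4; r++)
    {
      double n = d * 16.0 / ldexp (1.0, r);   /* solve d = (n/16) * 2^r */
      if (n == floor (n) && n >= 16.0 && n <= 31.0)
        return true;
    }
  return false;
}

quarter_precision_p (1.0) and quarter_precision_p (1.25) hold (n = 16 and
n = 20 with r = 0), while quarter_precision_p (4294967296.0) does not, which
matches f1, f3 and f6 in the new test below.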
/* Return true if OP is a valid SIMD immediate for the operation
described by WHICH. If INFO is nonnull, use it to describe valid
immediates. */
@@ -23124,20 +23194,6 @@ aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
else
return false;
scalar_float_mode elt_float_mode;
if (n_elts == 1
&& is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
{
rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
if (aarch64_float_const_zero_rtx_p (elt)
|| aarch64_float_const_representable_p (elt))
{
if (info)
*info = simd_immediate_info (elt_float_mode, elt);
return true;
}
}
/* If all elements in an SVE vector have the same value, we have a free
choice between using the element mode and using the container mode.
Using the element mode means that unused parts of the vector are
@@ -23199,10 +23255,57 @@ aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
<< (i * BITS_PER_UNIT));
/* Try encoding the integer immediate as a floating point value if it's an
exact value. */
scalar_float_mode fmode = DFmode;
scalar_int_mode imode = DImode;
unsigned HOST_WIDE_INT ival = val64;
unsigned int val32 = val64 & 0xffffffff;
if (val32 == (val64 >> 32))
{
fmode = SFmode;
imode = SImode;
ival = val32;
unsigned int val16 = val32 & 0xffff;
if (val16 == (val32 >> 16))
{
fmode = HFmode;
imode = HImode;
ival = val16;
unsigned int val8 = val16 & 0xff;
if (val8 == (val16 >> 8))
{
imode = QImode;
ival = val8;
}
}
}
if (which == AARCH64_CHECK_MOV
&& imode != QImode
&& (imode != HImode || TARGET_FP_F16INST))
{
long int as_long_ints[2];
as_long_ints[0] = ival & 0xFFFFFFFF;
as_long_ints[1] = (ival >> 32) & 0xFFFFFFFF;
REAL_VALUE_TYPE r;
real_from_target (&r, as_long_ints, fmode);
if (aarch64_real_float_const_representable_p (r))
{
if (info)
{
rtx float_val = const_double_from_real_value (r, fmode);
*info = simd_immediate_info (fmode, float_val);
}
return true;
}
}
if (vec_flags & VEC_SVE_DATA)
return aarch64_sve_valid_immediate (val64, info);
return aarch64_sve_valid_immediate (ival, imode, info);
else
return aarch64_advsimd_valid_immediate (val64, info, which);
return aarch64_advsimd_valid_immediate (val64, imode, info, which);
}
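The fmode/imode/ival cascade added above selects the narrowest element whose
replication reproduces val64; a minimal extraction of that selection logic
(a hypothetical helper for illustration, not GCC code):

#include <stdint.h>

/* Return the narrowest chunk width in bits (8, 16, 32 or 64) whose
   replication reproduces VAL64.  */
static int
replicated_width (uint64_t val64)
{
  if ((uint32_t) val64 != val64 >> 32)
    return 64;                         /* DImode / DFmode */
  uint32_t val32 = (uint32_t) val64;
  if ((uint16_t) val32 != val32 >> 16)
    return 32;                         /* SImode / SFmode */
  uint16_t val16 = (uint16_t) val32;
  if ((uint8_t) val16 != val16 >> 8)
    return 16;                         /* HImode / HFmode */
  return 8;                            /* QImode */
}

For 0x3f8000003f800000 this yields 32, so the constant is retried as an
SFmode 1.0f and the FMOV encoding succeeds.  Note the HImode case is
additionally gated on TARGET_FP_F16INST above, since the HFmode fmov
immediate requires FP16 support.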
/* Check whether X is a VEC_SERIES-like constant that starts at 0 and
@@ -25205,106 +25308,29 @@ aarch64_c_mode_for_suffix (char suffix)
return VOIDmode;
}
/* We can only represent floating point constants which will fit in
"quarter-precision" values. These values are characterised by
a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
by:
(-1)^s * (n/16) * 2^r
Where:
's' is the sign bit.
'n' is an integer in the range 16 <= n <= 31.
'r' is an integer in the range -3 <= r <= 4. */
/* Return true iff X can be represented by a quarter-precision
/* Return true iff X with mode MODE can be represented by a quarter-precision
floating point immediate operand.  Note, we cannot represent 0.0. */
bool
aarch64_float_const_representable_p (rtx x)
{
/* This represents our current view of how many bits
make up the mantissa. */
int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
int exponent;
unsigned HOST_WIDE_INT mantissa, mask;
REAL_VALUE_TYPE r, m;
bool fail;
x = unwrap_const_vec_duplicate (x);
machine_mode mode = GET_MODE (x);
if (!CONST_DOUBLE_P (x))
return false;
if (GET_MODE (x) == VOIDmode
|| (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
if ((mode == HFmode && !TARGET_FP_F16INST)
|| mode == BFmode)
return false;
r = *CONST_DOUBLE_REAL_VALUE (x);
REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (x);
/* We cannot represent infinities, NaNs or +/-zero. We won't
know if we have +zero until we analyse the mantissa, but we
can reject the other invalid values. */
if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
|| REAL_VALUE_MINUS_ZERO (r))
return false;
/* For BFmode, only handle 0.0. */
if (GET_MODE (x) == BFmode)
return real_iszero (&r, false);
/* Extract exponent. */
r = real_value_abs (&r);
exponent = REAL_EXP (&r);
/* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
highest (sign) bit, with a fixed binary point at bit point_pos.
m1 holds the low part of the mantissa, m2 the high part.
WARNING: If we ever have a representation using more than 2 * H_W_I - 1
bits for the mantissa, this can fail (low bits will be lost). */
real_ldexp (&m, &r, point_pos - exponent);
wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
/* If the low part of the mantissa has bits set we cannot represent
the value. */
if (w.ulow () != 0)
return false;
/* We have rejected the lower HOST_WIDE_INT, so update our
understanding of how many bits lie in the mantissa and
look only at the high HOST_WIDE_INT. */
mantissa = w.elt (1);
point_pos -= HOST_BITS_PER_WIDE_INT;
/* We can only represent values with a mantissa of the form 1.xxxx. */
mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
if ((mantissa & mask) != 0)
return false;
/* Having filtered unrepresentable values, we may now remove all
but the highest 5 bits. */
mantissa >>= point_pos - 5;
/* We cannot represent the value 0.0, so reject it. This is handled
elsewhere. */
if (mantissa == 0)
return false;
/* Then, as bit 4 is always set, we can mask it off, leaving
the mantissa in the range [0, 15]. */
mantissa &= ~(1 << 4);
gcc_assert (mantissa <= 15);
/* GCC internally does not use IEEE754-like encoding (where normalized
significands are in the range [1, 2). GCC uses [0.5, 1) (see real.cc).
Our mantissa values are shifted 4 places to the left relative to
normalized IEEE754 so we must modify the exponent returned by REAL_EXP
by 5 places to correct for GCC's representation. */
exponent = 5 - exponent;
return (exponent >= 0 && exponent <= 7);
return aarch64_real_float_const_representable_p (r);
}
/* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
output MOVI/MVNI, ORR or BIC immediate. */
/* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR, BIC or
FMOV immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether
to output MOVI/MVNI, ORR or BIC immediate. */
char*
aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
enum simd_immediate_check which)

gcc/testsuite/gcc.target/aarch64/const_create_using_fmov.c

@@ -0,0 +1,87 @@
/* { dg-do compile } */
/* { dg-additional-options "-march=armv9-a -Ofast" } */
/* { dg-final { check-function-bodies "**" "" "" } } */
#include <arm_neon.h>
/*
** g:
** fmov v0\.4s, 1\.0e\+0
** ret
*/
float32x4_t g(){
return vdupq_n_f32(1);
}
/*
** h:
** fmov v0\.4s, 1\.0e\+0
** ret
*/
uint32x4_t h() {
return vreinterpretq_u32_f32(g());
}
/*
** f1:
** fmov v0\.4s, 1\.0e\+0
** ret
*/
uint32x4_t f1() {
return vdupq_n_u32(0x3f800000);
}
/*
** f2:
** fmov v0\.4s, 1\.5e\+0
** ret
*/
uint32x4_t f2() {
return vdupq_n_u32(0x3FC00000);
}
/*
** f3:
** fmov v0\.4s, 1\.25e\+0
** ret
*/
uint32x4_t f3() {
return vdupq_n_u32(0x3FA00000);
}
/*
** f4:
** fmov v0\.2d, 1\.0e\+0
** ret
*/
uint64x2_t f4() {
return vdupq_n_u64(0x3FF0000000000000);
}
/*
** fn4:
** fmov v0\.2d, -1\.0e\+0
** ret
*/
uint64x2_t fn4() {
return vdupq_n_u64(0xBFF0000000000000);
}
/*
** f5:
** fmov v0\.8h, 1\.5e\+0
** ret
*/
uint16x8_t f5() {
return vdupq_n_u16(0x3E00);
}
/*
** f6:
** adrp x0, \.LC0
** ldr q0, \[x0, #:lo12:\.LC0\]
** ret
*/
uint32x4_t f6() {
return vdupq_n_u32(0x4f800000);
}
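In f6, 0x4f800000 is 2^32 as a float: quarter-precision immediates top out
at (31/16) * 2^4 = 31.0, so no fmov encoding exists and the literal pool
load is the expected code.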