AArch64: support encoding integer immediates using floating point moves

This patch extends our SIMD immediate generation to support creating integer
immediates with a floating point move when the integer bit pattern maps to an
exact FP value.

As an example:

uint32x4_t f1() {
    return vdupq_n_u32(0x3f800000);
}

currently generates:

f1:
        adrp    x0, .LC0
        ldr     q0, [x0, #:lo12:.LC0]
        ret

i.e. a load, but with this change:

f1:
        fmov    v0.4s, 1.0e+0
        ret

Such immediates are common in e.g. the math routines in glibc, where they are
used as masks to extract or mark part of an FP value.
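As a minimal illustration of why the rewrite is sound (a hypothetical demo,
not part of the patch): the vector register holds only a bit pattern, so
reinterpreting the integer constant as a float is free whenever that pattern
happens to be an exactly encodable FP value.

#include <stdint.h>
#include <string.h>

int main (void)
{
  uint32_t mask = 0x3f800000;    /* integer mask whose bits equal 1.0f */
  float f;
  memcpy (&f, &mask, sizeof f);  /* reinterpret the bits, no conversion */
  return f == 1.0f ? 0 : 1;      /* exits 0: the bit patterns coincide */
}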

gcc/ChangeLog:

	* config/aarch64/aarch64.cc (aarch64_sve_valid_immediate,
	aarch64_simd_valid_immediate): Refactor accepting modes and values.
	(aarch64_float_const_representable_p): Refactor and extract FP checks
	into ...
	(aarch64_real_float_const_representable_p): ...This and fix fail
	fallback from real_to_integer.
	(aarch64_advsimd_valid_immediate): Use it.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/const_create_using_fmov.c: New test.
commit 87dc6b1992 (parent fc35079277)
Author: Tamar Christina
Date:   2024-10-18 09:43:45 +01:00
2 changed files with 241 additions and 128 deletions

gcc/config/aarch64/aarch64.cc

@@ -22899,19 +22899,19 @@ aarch64_advsimd_valid_immediate_hs (unsigned int val32,
return false;
}
/* Return true if replicating VAL64 is a valid immediate for the
/* Return true if replicating VAL64 with mode MODE is a valid immediate for the
Advanced SIMD operation described by WHICH. If INFO is nonnull,
use it to describe valid immediates. */
static bool
aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
scalar_int_mode mode,
simd_immediate_info *info,
enum simd_immediate_check which)
{
unsigned int val32 = val64 & 0xffffffff;
unsigned int val16 = val64 & 0xffff;
unsigned int val8 = val64 & 0xff;
if (val32 == (val64 >> 32))
if (mode != DImode)
{
if ((which & AARCH64_CHECK_ORR) != 0
&& aarch64_advsimd_valid_immediate_hs (val32, info, which,
@@ -22924,9 +22924,7 @@ aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
return true;
/* Try using a replicated byte. */
if (which == AARCH64_CHECK_MOV
&& val16 == (val32 >> 16)
&& val8 == (val16 >> 8))
if (which == AARCH64_CHECK_MOV && mode == QImode)
{
if (info)
*info = simd_immediate_info (QImode, val8);
@@ -22954,28 +22952,15 @@ aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
return false;
}
/* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
instruction. If INFO is nonnull, use it to describe valid immediates. */
/* Return true if replicating IVAL with MODE gives a valid immediate for an SVE
MOV instruction. If INFO is nonnull, use it to describe valid
immediates. */
static bool
aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT ival, scalar_int_mode mode,
simd_immediate_info *info)
{
scalar_int_mode mode = DImode;
unsigned int val32 = val64 & 0xffffffff;
if (val32 == (val64 >> 32))
{
mode = SImode;
unsigned int val16 = val32 & 0xffff;
if (val16 == (val32 >> 16))
{
mode = HImode;
unsigned int val8 = val16 & 0xff;
if (val8 == (val16 >> 8))
mode = QImode;
}
}
HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
HOST_WIDE_INT val = trunc_int_for_mode (ival, mode);
if (IN_RANGE (val, -0x80, 0x7f))
{
/* DUP with no shift. */
@@ -22990,7 +22975,7 @@ aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
*info = simd_immediate_info (mode, val);
return true;
}
if (aarch64_bitmask_imm (val64, mode))
if (aarch64_bitmask_imm (ival, mode))
{
/* DUPM. */
if (info)
@@ -23071,6 +23056,91 @@ aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
return false;
}
/* We can only represent floating point constants which will fit in
"quarter-precision" values. These values are characterised by
a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
by:
(-1)^s * (n/16) * 2^r
Where:
's' is the sign bit.
'n' is an integer in the range 16 <= n <= 31.
'r' is an integer in the range -3 <= r <= 4.
Return true iff R represents a value encodable into an AArch64 floating point
move instruction as an immediate.  Otherwise false. */
static bool
aarch64_real_float_const_representable_p (REAL_VALUE_TYPE r)
{
/* This represents our current view of how many bits
make up the mantissa. */
int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
int exponent;
unsigned HOST_WIDE_INT mantissa, mask;
REAL_VALUE_TYPE m;
bool fail = false;
/* We cannot represent infinities, NaNs or +/-zero. We won't
know if we have +zero until we analyse the mantissa, but we
can reject the other invalid values. */
if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
|| REAL_VALUE_MINUS_ZERO (r))
return false;
/* Extract exponent. */
r = real_value_abs (&r);
exponent = REAL_EXP (&r);
/* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
highest (sign) bit, with a fixed binary point at bit point_pos.
m1 holds the low part of the mantissa, m2 the high part.
WARNING: If we ever have a representation using more than 2 * H_W_I - 1
bits for the mantissa, this can fail (low bits will be lost). */
real_ldexp (&m, &r, point_pos - exponent);
wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
/* If the low part of the mantissa has bits set we cannot represent
the value. */
if (fail || w.ulow () != 0)
return false;
/* We have rejected the lower HOST_WIDE_INT, so update our
understanding of how many bits lie in the mantissa and
look only at the high HOST_WIDE_INT. */
mantissa = w.elt (1);
point_pos -= HOST_BITS_PER_WIDE_INT;
/* We can only represent values with a mantissa of the form 1.xxxx. */
mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
if ((mantissa & mask) != 0)
return false;
/* Having filtered unrepresentable values, we may now remove all
but the highest 5 bits. */
mantissa >>= point_pos - 5;
/* We cannot represent the value 0.0, so reject it. This is handled
elsewhere. */
if (mantissa == 0)
return false;
/* Then, as bit 4 is always set, we can mask it off, leaving
the mantissa in the range [0, 15]. */
mantissa &= ~(1 << 4);
gcc_assert (mantissa <= 15);
/* GCC internally does not use IEEE754-like encoding (where normalized
significands are in the range [1, 2). GCC uses [0.5, 1) (see real.cc).
Our mantissa values are shifted 4 places to the left relative to
normalized IEEE754 so we must modify the exponent returned by REAL_EXP
by 5 places to correct for GCC's representation. */
exponent = 5 - exponent;
return (exponent >= 0 && exponent <= 7);
}
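As a cross-check of the (-1)^s * (n/16) * 2^r condition implemented above,
here is a minimal standalone sketch of the same predicate, written against a
plain double instead of GCC's REAL_VALUE_TYPE machinery (quarter_precision_p
is a hypothetical name, not a GCC function):

#include <math.h>
#include <stdbool.h>

/* True iff D equals (-1)^s * (n/16) * 2^r with 16 <= n <= 31 and
   -3 <= r <= 4, i.e. a value an AArch64 FMOV immediate can encode.  */
static bool
quarter_precision_p (double d)
{
  if (d == 0.0 || !isfinite (d))
    return false;                      /* reject 0.0, Inf and NaN */
  d = fabs (d);
  for (int r = -3; r <= 4; r++)
    {
      double n = d * 16.0 / ldexp (1.0, r);   /* solve d = (n/16) * 2^r */
      if (n == floor (n) && n >= 16.0 && n <= 31.0)
        return true;
    }
  return false;
}

quarter_precision_p (1.0) and quarter_precision_p (1.25) hold (n = 16 and
n = 20 with r = 0), while quarter_precision_p (4294967296.0) does not, which
matches f1, f3 and f6 in the new test below.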
/* Return true if OP is a valid SIMD immediate for the operation
described by WHICH. If INFO is nonnull, use it to describe valid
immediates. */
@@ -23124,20 +23194,6 @@ aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
else
return false;
scalar_float_mode elt_float_mode;
if (n_elts == 1
&& is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
{
rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
if (aarch64_float_const_zero_rtx_p (elt)
|| aarch64_float_const_representable_p (elt))
{
if (info)
*info = simd_immediate_info (elt_float_mode, elt);
return true;
}
}
/* If all elements in an SVE vector have the same value, we have a free
choice between using the element mode and using the container mode.
Using the element mode means that unused parts of the vector are
@@ -23199,10 +23255,57 @@ aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
<< (i * BITS_PER_UNIT));
/* Try encoding the integer immediate as a floating point value if it's an
exact value. */
scalar_float_mode fmode = DFmode;
scalar_int_mode imode = DImode;
unsigned HOST_WIDE_INT ival = val64;
unsigned int val32 = val64 & 0xffffffff;
if (val32 == (val64 >> 32))
{
fmode = SFmode;
imode = SImode;
ival = val32;
unsigned int val16 = val32 & 0xffff;
if (val16 == (val32 >> 16))
{
fmode = HFmode;
imode = HImode;
ival = val16;
unsigned int val8 = val16 & 0xff;
if (val8 == (val16 >> 8))
{
imode = QImode;
ival = val8;
}
}
}
if (which == AARCH64_CHECK_MOV
&& imode != QImode
&& (imode != HImode || TARGET_FP_F16INST))
{
long int as_long_ints[2];
as_long_ints[0] = ival & 0xFFFFFFFF;
as_long_ints[1] = (ival >> 32) & 0xFFFFFFFF;
REAL_VALUE_TYPE r;
real_from_target (&r, as_long_ints, fmode);
if (aarch64_real_float_const_representable_p (r))
{
if (info)
{
rtx float_val = const_double_from_real_value (r, fmode);
*info = simd_immediate_info (fmode, float_val);
}
return true;
}
}
if (vec_flags & VEC_SVE_DATA)
return aarch64_sve_valid_immediate (val64, info);
return aarch64_sve_valid_immediate (ival, imode, info);
else
return aarch64_advsimd_valid_immediate (val64, info, which);
return aarch64_advsimd_valid_immediate (val64, imode, info, which);
}
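The fmode/imode/ival cascade added above selects the narrowest element whose
replication reproduces val64; a minimal extraction of that selection logic
(a hypothetical helper for illustration, not GCC code):

#include <stdint.h>

/* Return the narrowest chunk width in bits (8, 16, 32 or 64) whose
   replication reproduces VAL64.  */
static int
replicated_width (uint64_t val64)
{
  if ((uint32_t) val64 != val64 >> 32)
    return 64;                         /* DImode / DFmode */
  uint32_t val32 = (uint32_t) val64;
  if ((uint16_t) val32 != val32 >> 16)
    return 32;                         /* SImode / SFmode */
  uint16_t val16 = (uint16_t) val32;
  if ((uint8_t) val16 != val16 >> 8)
    return 16;                         /* HImode / HFmode */
  return 8;                            /* QImode */
}

For 0x3f8000003f800000 this yields 32, so the constant is retried as an
SFmode 1.0f and the FMOV encoding succeeds.  Note the HImode case is
additionally gated on TARGET_FP_F16INST above, since the HFmode fmov
immediate requires FP16 support.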
/* Check whether X is a VEC_SERIES-like constant that starts at 0 and
@@ -25205,106 +25308,29 @@ aarch64_c_mode_for_suffix (char suffix)
return VOIDmode;
}
/* We can only represent floating point constants which will fit in
"quarter-precision" values. These values are characterised by
a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
by:
(-1)^s * (n/16) * 2^r
Where:
's' is the sign bit.
'n' is an integer in the range 16 <= n <= 31.
'r' is an integer in the range -3 <= r <= 4. */
/* Return true iff X can be represented by a quarter-precision
/* Return true iff X with mode MODE can be represented by a quarter-precision
floating point immediate operand.  Note, we cannot represent 0.0. */
bool
aarch64_float_const_representable_p (rtx x)
{
/* This represents our current view of how many bits
make up the mantissa. */
int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
int exponent;
unsigned HOST_WIDE_INT mantissa, mask;
REAL_VALUE_TYPE r, m;
bool fail;
x = unwrap_const_vec_duplicate (x);
machine_mode mode = GET_MODE (x);
if (!CONST_DOUBLE_P (x))
return false;
if (GET_MODE (x) == VOIDmode
|| (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
if ((mode == HFmode && !TARGET_FP_F16INST)
|| mode == BFmode)
return false;
r = *CONST_DOUBLE_REAL_VALUE (x);
REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (x);
/* We cannot represent infinities, NaNs or +/-zero. We won't
know if we have +zero until we analyse the mantissa, but we
can reject the other invalid values. */
if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
|| REAL_VALUE_MINUS_ZERO (r))
return false;
/* For BFmode, only handle 0.0. */
if (GET_MODE (x) == BFmode)
return real_iszero (&r, false);
/* Extract exponent. */
r = real_value_abs (&r);
exponent = REAL_EXP (&r);
/* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
highest (sign) bit, with a fixed binary point at bit point_pos.
m1 holds the low part of the mantissa, m2 the high part.
WARNING: If we ever have a representation using more than 2 * H_W_I - 1
bits for the mantissa, this can fail (low bits will be lost). */
real_ldexp (&m, &r, point_pos - exponent);
wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
/* If the low part of the mantissa has bits set we cannot represent
the value. */
if (w.ulow () != 0)
return false;
/* We have rejected the lower HOST_WIDE_INT, so update our
understanding of how many bits lie in the mantissa and
look only at the high HOST_WIDE_INT. */
mantissa = w.elt (1);
point_pos -= HOST_BITS_PER_WIDE_INT;
/* We can only represent values with a mantissa of the form 1.xxxx. */
mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
if ((mantissa & mask) != 0)
return false;
/* Having filtered unrepresentable values, we may now remove all
but the highest 5 bits. */
mantissa >>= point_pos - 5;
/* We cannot represent the value 0.0, so reject it. This is handled
elsewhere. */
if (mantissa == 0)
return false;
/* Then, as bit 4 is always set, we can mask it off, leaving
the mantissa in the range [0, 15]. */
mantissa &= ~(1 << 4);
gcc_assert (mantissa <= 15);
/* GCC internally does not use IEEE754-like encoding (where normalized
significands are in the range [1, 2). GCC uses [0.5, 1) (see real.cc).
Our mantissa values are shifted 4 places to the left relative to
normalized IEEE754 so we must modify the exponent returned by REAL_EXP
by 5 places to correct for GCC's representation. */
exponent = 5 - exponent;
return (exponent >= 0 && exponent <= 7);
return aarch64_real_float_const_representable_p (r);
}
/* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
output MOVI/MVNI, ORR or BIC immediate. */
/* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR, BIC or
FMOV immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether
to output MOVI/MVNI, ORR or BIC immediate. */
char*
aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
enum simd_immediate_check which)

gcc/testsuite/gcc.target/aarch64/const_create_using_fmov.c

@@ -0,0 +1,87 @@
/* { dg-do compile } */
/* { dg-additional-options "-march=armv9-a -Ofast" } */
/* { dg-final { check-function-bodies "**" "" "" } } */
#include <arm_neon.h>
/*
** g:
** fmov v0\.4s, 1\.0e\+0
** ret
*/
float32x4_t g(){
return vdupq_n_f32(1);
}
/*
** h:
** fmov v0\.4s, 1\.0e\+0
** ret
*/
uint32x4_t h() {
return vreinterpretq_u32_f32(g());
}
/*
** f1:
** fmov v0\.4s, 1\.0e\+0
** ret
*/
uint32x4_t f1() {
return vdupq_n_u32(0x3f800000);
}
/*
** f2:
** fmov v0\.4s, 1\.5e\+0
** ret
*/
uint32x4_t f2() {
return vdupq_n_u32(0x3FC00000);
}
/*
** f3:
** fmov v0\.4s, 1\.25e\+0
** ret
*/
uint32x4_t f3() {
return vdupq_n_u32(0x3FA00000);
}
/*
** f4:
** fmov v0\.2d, 1\.0e\+0
** ret
*/
uint64x2_t f4() {
return vdupq_n_u64(0x3FF0000000000000);
}
/*
** fn4:
** fmov v0\.2d, -1\.0e\+0
** ret
*/
uint64x2_t fn4() {
return vdupq_n_u64(0xBFF0000000000000);
}
/*
** f5:
** fmov v0\.8h, 1\.5e\+0
** ret
*/
uint16x8_t f5() {
return vdupq_n_u16(0x3E00);
}
/*
** f6:
** adrp x0, \.LC0
** ldr q0, \[x0, #:lo12:\.LC0\]
** ret
*/
uint32x4_t f6() {
return vdupq_n_u32(0x4f800000);
}
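In f6, 0x4f800000 is 2^32 as a float: quarter-precision immediates top out
at (31/16) * 2^4 = 31.0, so no fmov encoding exists and the literal pool
load is the expected code.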