aarch64: Add SVE support for -mlow-precision-sqrt
SVE was missing support for -mlow-precision-sqrt, which meant that
-march=armv8.2-a+sve -mlow-precision-sqrt could cause a performance
regression compared to -march=armv8.2-a -mlow-precision-sqrt.

2020-02-21 Richard Sandiford <richard.sandiford@arm.com>

gcc/
	* config/aarch64/aarch64.c (aarch64_emit_approx_sqrt): Add SVE support.
	Use aarch64_emit_mult instead of emitting multiplication instructions
	directly.
	* config/aarch64/aarch64-sve.md (sqrt<mode>2, rsqrt<mode>2)
	(@aarch64_rsqrte<mode>, @aarch64_rsqrts<mode>): New expanders.

gcc/testsuite/
	* gcc.target/aarch64/sve/rsqrt_1.c: New test.
	* gcc.target/aarch64/sve/rsqrt_1_run.c: Likewise.
	* gcc.target/aarch64/sve/sqrt_1.c: Likewise.
	* gcc.target/aarch64/sve/sqrt_1_run.c: Likewise.
This commit is contained in:
parent
04f307cbb9
commit
a0ee8352df
9 changed files with 233 additions and 18 deletions
|
@ -1,3 +1,11 @@
|
|||
2020-02-21 Richard Sandiford <richard.sandiford@arm.com>
|
||||
|
||||
* config/aarch64/aarch64.c (aarch64_emit_approx_sqrt): Add SVE
|
||||
support. Use aarch64_emit_mult instead of emitting multiplication
|
||||
instructions directly.
|
||||
* config/aarch64/aarch64-sve.md (sqrt<mode>2, rsqrt<mode>2)
|
||||
(@aarch64_rsqrte<mode>, @aarch64_rsqrts<mode>): New expanders.
|
||||
|
||||
2020-02-21 Richard Sandiford <richard.sandiford@arm.com>
|
||||
|
||||
* config/aarch64/aarch64.c (aarch64_emit_mult): New function.
|
||||
|
|
|
@ -76,6 +76,8 @@
|
|||
;; ---- [INT] Logical inverse
|
||||
;; ---- [FP<-INT] General unary arithmetic that maps to unspecs
|
||||
;; ---- [FP] General unary arithmetic corresponding to unspecs
|
||||
;; ---- [FP] Square root
|
||||
;; ---- [FP] Reciprocal square root
|
||||
;; ---- [PRED] Inverse
|
||||
|
||||
;; == Binary arithmetic
|
||||
|
@ -3246,7 +3248,7 @@
|
|||
;; - FRINTP
|
||||
;; - FRINTX
|
||||
;; - FRINTZ
|
||||
;; - FRSQRT
|
||||
;; - FRSQRTE
|
||||
;; - FSQRT
|
||||
;; -------------------------------------------------------------------------
|
||||
|
||||
|
@ -3267,7 +3269,7 @@
|
|||
[(match_dup 2)
|
||||
(const_int SVE_RELAXED_GP)
|
||||
(match_operand:SVE_FULL_F 1 "register_operand")]
|
||||
SVE_COND_FP_UNARY))]
|
||||
SVE_COND_FP_UNARY_OPTAB))]
|
||||
"TARGET_SVE"
|
||||
{
|
||||
operands[2] = aarch64_ptrue_reg (<VPRED>mode);
|
||||
|
@ -3357,6 +3359,56 @@
|
|||
[(set_attr "movprfx" "*,yes,yes")]
|
||||
)
|
||||
|
||||
;; -------------------------------------------------------------------------
|
||||
;; ---- [FP] Square root
|
||||
;; -------------------------------------------------------------------------
|
||||
|
||||
(define_expand "sqrt<mode>2"
|
||||
[(set (match_operand:SVE_FULL_F 0 "register_operand")
|
||||
(unspec:SVE_FULL_F
|
||||
[(match_dup 2)
|
||||
(const_int SVE_RELAXED_GP)
|
||||
(match_operand:SVE_FULL_F 1 "register_operand")]
|
||||
UNSPEC_COND_FSQRT))]
|
||||
"TARGET_SVE"
|
||||
{
|
||||
if (aarch64_emit_approx_sqrt (operands[0], operands[1], false))
|
||||
DONE;
|
||||
operands[2] = aarch64_ptrue_reg (<VPRED>mode);
|
||||
})
|
||||
|
||||
;; -------------------------------------------------------------------------
|
||||
;; ---- [FP] Reciprocal square root
|
||||
;; -------------------------------------------------------------------------
|
||||
|
||||
(define_expand "rsqrt<mode>2"
|
||||
[(set (match_operand:SVE_FULL_SDF 0 "register_operand")
|
||||
(unspec:SVE_FULL_SDF
|
||||
[(match_operand:SVE_FULL_SDF 1 "register_operand")]
|
||||
UNSPEC_RSQRT))]
|
||||
"TARGET_SVE"
|
||||
{
|
||||
aarch64_emit_approx_sqrt (operands[0], operands[1], true);
|
||||
DONE;
|
||||
})
|
||||
|
||||
(define_expand "@aarch64_rsqrte<mode>"
|
||||
[(set (match_operand:SVE_FULL_SDF 0 "register_operand")
|
||||
(unspec:SVE_FULL_SDF
|
||||
[(match_operand:SVE_FULL_SDF 1 "register_operand")]
|
||||
UNSPEC_RSQRTE))]
|
||||
"TARGET_SVE"
|
||||
)
|
||||
|
||||
(define_expand "@aarch64_rsqrts<mode>"
|
||||
[(set (match_operand:SVE_FULL_SDF 0 "register_operand")
|
||||
(unspec:SVE_FULL_SDF
|
||||
[(match_operand:SVE_FULL_SDF 1 "register_operand")
|
||||
(match_operand:SVE_FULL_SDF 2 "register_operand")]
|
||||
UNSPEC_RSQRTS))]
|
||||
"TARGET_SVE"
|
||||
)
|
||||
|
||||
;; -------------------------------------------------------------------------
|
||||
;; ---- [PRED] Inverse
|
||||
;; -------------------------------------------------------------------------
|
||||
|
|
|
@ -12790,6 +12790,9 @@ aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
|
|||
/* Caller assumes we cannot fail. */
|
||||
gcc_assert (use_rsqrt_p (mode));
|
||||
|
||||
rtx pg = NULL_RTX;
|
||||
if (aarch64_sve_mode_p (mode))
|
||||
pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
|
||||
machine_mode mmsk = (VECTOR_MODE_P (mode)
|
||||
? related_int_vector_mode (mode).require ()
|
||||
: int_mode_for_mode (mode).require ());
|
||||
|
@ -12798,11 +12801,21 @@ aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
|
|||
{
|
||||
/* When calculating the approximate square root, compare the
|
||||
argument with 0.0 and create a mask. */
|
||||
xmsk = gen_reg_rtx (mmsk);
|
||||
emit_insn (gen_rtx_SET (xmsk,
|
||||
gen_rtx_NEG (mmsk,
|
||||
gen_rtx_EQ (mmsk, src,
|
||||
CONST0_RTX (mode)))));
|
||||
rtx zero = CONST0_RTX (mode);
|
||||
if (pg)
|
||||
{
|
||||
xmsk = gen_reg_rtx (GET_MODE (pg));
|
||||
rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
|
||||
emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
|
||||
xmsk, pg, hint, src, zero));
|
||||
}
|
||||
else
|
||||
{
|
||||
xmsk = gen_reg_rtx (mmsk);
|
||||
emit_insn (gen_rtx_SET (xmsk,
|
||||
gen_rtx_NEG (mmsk,
|
||||
gen_rtx_EQ (mmsk, src, zero))));
|
||||
}
|
||||
}
|
||||
|
||||
/* Estimate the approximate reciprocal square root. */
|
||||
|
@ -12824,29 +12837,40 @@ aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
|
|||
while (iterations--)
|
||||
{
|
||||
rtx x2 = gen_reg_rtx (mode);
|
||||
emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
|
||||
aarch64_emit_mult (x2, pg, xdst, xdst);
|
||||
|
||||
emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
|
||||
|
||||
if (iterations > 0)
|
||||
emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
|
||||
aarch64_emit_mult (xdst, pg, xdst, x1);
|
||||
}
|
||||
|
||||
if (!recp)
|
||||
{
|
||||
/* Qualify the approximate reciprocal square root when the argument is
|
||||
0.0 by squashing the intermediary result to 0.0. */
|
||||
rtx xtmp = gen_reg_rtx (mmsk);
|
||||
emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
|
||||
gen_rtx_SUBREG (mmsk, xdst, 0)));
|
||||
emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
|
||||
if (pg)
|
||||
/* Multiply nonzero source values by the corresponding intermediate
|
||||
result elements, so that the final calculation is the approximate
|
||||
square root rather than its reciprocal. Select a zero result for
|
||||
zero source values, to avoid the Inf * 0 -> NaN that we'd get
|
||||
otherwise. */
|
||||
emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
|
||||
xdst, xmsk, xdst, src, CONST0_RTX (mode)));
|
||||
else
|
||||
{
|
||||
/* Qualify the approximate reciprocal square root when the
|
||||
argument is 0.0 by squashing the intermediary result to 0.0. */
|
||||
rtx xtmp = gen_reg_rtx (mmsk);
|
||||
emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
|
||||
gen_rtx_SUBREG (mmsk, xdst, 0)));
|
||||
emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
|
||||
|
||||
/* Calculate the approximate square root. */
|
||||
emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
|
||||
/* Calculate the approximate square root. */
|
||||
aarch64_emit_mult (xdst, pg, xdst, src);
|
||||
}
|
||||
}
|
||||
|
||||
/* Finalize the approximation. */
|
||||
emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
|
||||
aarch64_emit_mult (dst, pg, xdst, x1);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -2277,6 +2277,19 @@
|
|||
UNSPEC_COND_FRINTZ
|
||||
UNSPEC_COND_FSQRT])
|
||||
|
||||
;; Same as SVE_COND_FP_UNARY, but without codes that have a dedicated
|
||||
;; <optab><mode>2 expander.
|
||||
(define_int_iterator SVE_COND_FP_UNARY_OPTAB [UNSPEC_COND_FABS
|
||||
UNSPEC_COND_FNEG
|
||||
UNSPEC_COND_FRECPX
|
||||
UNSPEC_COND_FRINTA
|
||||
UNSPEC_COND_FRINTI
|
||||
UNSPEC_COND_FRINTM
|
||||
UNSPEC_COND_FRINTN
|
||||
UNSPEC_COND_FRINTP
|
||||
UNSPEC_COND_FRINTX
|
||||
UNSPEC_COND_FRINTZ])
|
||||
|
||||
(define_int_iterator SVE_COND_FCVT [UNSPEC_COND_FCVT])
|
||||
(define_int_iterator SVE_COND_FCVTI [UNSPEC_COND_FCVTZS UNSPEC_COND_FCVTZU])
|
||||
(define_int_iterator SVE_COND_ICVTF [UNSPEC_COND_SCVTF UNSPEC_COND_UCVTF])
|
||||
|
|
|
@ -1,3 +1,10 @@
|
|||
2020-02-21 Richard Sandiford <richard.sandiford@arm.com>
|
||||
|
||||
* gcc.target/aarch64/sve/rsqrt_1.c: New test.
|
||||
* gcc.target/aarch64/sve/rsqrt_1_run.c: Likewise.
|
||||
* gcc.target/aarch64/sve/sqrt_1.c: Likewise.
|
||||
* gcc.target/aarch64/sve/sqrt_1_run.c: Likewise.
|
||||
|
||||
2020-02-21 Richard Sandiford <richard.sandiford@arm.com>
|
||||
|
||||
* gcc.target/aarch64/sve/recip_1.c: New test.
|
||||
|
|
27
gcc/testsuite/gcc.target/aarch64/sve/rsqrt_1.c
Normal file
27
gcc/testsuite/gcc.target/aarch64/sve/rsqrt_1.c
Normal file
|
@ -0,0 +1,27 @@
|
|||
/* { dg-options "-Ofast -mlow-precision-sqrt" } */
|
||||
|
||||
#define DEF_LOOP(TYPE, FN) \
|
||||
void \
|
||||
test_##TYPE (TYPE *x, int n) \
|
||||
{ \
|
||||
for (int i = 0; i < n; ++i) \
|
||||
x[i] = (TYPE) 1 / FN (x[i]); \
|
||||
}
|
||||
|
||||
#define TEST_ALL(T) \
|
||||
T (_Float16, __builtin_sqrtf16) \
|
||||
T (float, __builtin_sqrtf) \
|
||||
T (double, __builtin_sqrt)
|
||||
|
||||
TEST_ALL (DEF_LOOP)
|
||||
|
||||
/* { dg-final { scan-assembler-not {\tfrsqrte\tz[0-9]+\.h} } } */
|
||||
/* { dg-final { scan-assembler-not {\tfrsqrts\tz[0-9]+\.h} } } */
|
||||
|
||||
/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s} 2 } } */
|
||||
/* { dg-final { scan-assembler-times {\tfrsqrte\tz[0-9]+\.s} 1 } } */
|
||||
/* { dg-final { scan-assembler-times {\tfrsqrts\tz[0-9]+\.s} 1 } } */
|
||||
|
||||
/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d} 4 } } */
|
||||
/* { dg-final { scan-assembler-times {\tfrsqrte\tz[0-9]+\.d} 1 } } */
|
||||
/* { dg-final { scan-assembler-times {\tfrsqrts\tz[0-9]+\.d} 2 } } */
|
27
gcc/testsuite/gcc.target/aarch64/sve/rsqrt_1_run.c
Normal file
27
gcc/testsuite/gcc.target/aarch64/sve/rsqrt_1_run.c
Normal file
|
@ -0,0 +1,27 @@
|
|||
/* { dg-do run { target aarch64_sve_hw } } */
|
||||
/* { dg-options "-Ofast -mlow-precision-sqrt" } */
|
||||
|
||||
#include "rsqrt_1.c"
|
||||
|
||||
#define N 77
|
||||
|
||||
#define TEST_LOOP(TYPE, FN) \
|
||||
{ \
|
||||
TYPE a[N]; \
|
||||
for (int i = 0; i < N; ++i) \
|
||||
a[i] = i + 1; \
|
||||
test_##TYPE (a, N); \
|
||||
for (int i = 0; i < N; ++i) \
|
||||
{ \
|
||||
double diff = a[i] - 1.0 / __builtin_sqrt (i + 1); \
|
||||
if (__builtin_fabs (diff) > 0x1.0p-8) \
|
||||
__builtin_abort (); \
|
||||
} \
|
||||
}
|
||||
|
||||
int
|
||||
main (void)
|
||||
{
|
||||
TEST_ALL (TEST_LOOP);
|
||||
return 0;
|
||||
}
|
30
gcc/testsuite/gcc.target/aarch64/sve/sqrt_1.c
Normal file
30
gcc/testsuite/gcc.target/aarch64/sve/sqrt_1.c
Normal file
|
@ -0,0 +1,30 @@
|
|||
/* { dg-options "-Ofast -mlow-precision-sqrt" } */
|
||||
|
||||
#define DEF_LOOP(TYPE, FN) \
|
||||
void \
|
||||
test_##TYPE (TYPE *x, int n) \
|
||||
{ \
|
||||
for (int i = 0; i < n; ++i) \
|
||||
x[i] = FN (x[i]); \
|
||||
}
|
||||
|
||||
#define TEST_ALL(T) \
|
||||
T (_Float16, __builtin_sqrtf16) \
|
||||
T (float, __builtin_sqrtf) \
|
||||
T (double, __builtin_sqrt)
|
||||
|
||||
TEST_ALL (DEF_LOOP)
|
||||
|
||||
/* { dg-final { scan-assembler {\tfsqrt\tz[0-9]+\.h} } } */
|
||||
/* { dg-final { scan-assembler-not {\tfrsqrte\tz[0-9]+\.h} } } */
|
||||
/* { dg-final { scan-assembler-not {\tfrsqrts\tz[0-9]+\.h} } } */
|
||||
|
||||
/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/z} 1 } } */
|
||||
/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s} 3 } } */
|
||||
/* { dg-final { scan-assembler-times {\tfrsqrte\tz[0-9]+\.s} 1 } } */
|
||||
/* { dg-final { scan-assembler-times {\tfrsqrts\tz[0-9]+\.s} 1 } } */
|
||||
|
||||
/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.d, p[0-7]/z} 1 } } */
|
||||
/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d} 5 } } */
|
||||
/* { dg-final { scan-assembler-times {\tfrsqrte\tz[0-9]+\.d} 1 } } */
|
||||
/* { dg-final { scan-assembler-times {\tfrsqrts\tz[0-9]+\.d} 2 } } */
|
27
gcc/testsuite/gcc.target/aarch64/sve/sqrt_1_run.c
Normal file
27
gcc/testsuite/gcc.target/aarch64/sve/sqrt_1_run.c
Normal file
|
@ -0,0 +1,27 @@
|
|||
/* { dg-do run { target aarch64_sve_hw } } */
|
||||
/* { dg-options "-Ofast -mlow-precision-sqrt" } */
|
||||
|
||||
#include "sqrt_1.c"
|
||||
|
||||
#define N 77
|
||||
|
||||
#define TEST_LOOP(TYPE, FN) \
|
||||
{ \
|
||||
TYPE a[N]; \
|
||||
for (int i = 0; i < N; ++i) \
|
||||
a[i] = i; \
|
||||
test_##TYPE (a, N); \
|
||||
for (int i = 0; i < N; ++i) \
|
||||
{ \
|
||||
double diff = a[i] - __builtin_sqrt (i); \
|
||||
if (__builtin_fabs (diff) > 0x1.0p-8) \
|
||||
__builtin_abort (); \
|
||||
} \
|
||||
}
|
||||
|
||||
int
|
||||
main (void)
|
||||
{
|
||||
TEST_ALL (TEST_LOOP);
|
||||
return 0;
|
||||
}
|
Loading…
Add table
Reference in a new issue