aarch64: Add SVE support for -mlow-precision-div
SVE was missing support for -mlow-precision-div, which meant that
-march=armv8.2-a+sve -mlow-precision-div could cause a performance
regression compared to -march=armv8.2-a -mlow-precision-div.

I ended up doing this much later than originally intended, sorry...

2020-02-21  Richard Sandiford  <richard.sandiford@arm.com>

gcc/
	* config/aarch64/aarch64.c (aarch64_emit_mult): New function.
	(aarch64_emit_approx_div): Add SVE support.  Use aarch64_emit_mult
	instead of emitting multiplication instructions directly.
	* config/aarch64/iterators.md (SVE_COND_FP_BINARY_OPTAB): New iterator.
	* config/aarch64/aarch64-sve.md (div<mode>3, @aarch64_frecpe<mode>)
	(@aarch64_frecps<mode>): New expanders.

gcc/testsuite/
	* gcc.target/aarch64/sve/recip_1.c: New test.
	* gcc.target/aarch64/sve/recip_1_run.c: Likewise.
	* gcc.target/aarch64/sve/recip_2.c: Likewise.
	* gcc.target/aarch64/sve/recip_2_run.c: Likewise.
This commit is contained in:
parent
d87778ed09
commit
04f307cbb9
9 changed files with 207 additions and 4 deletions
|
@ -1,3 +1,12 @@
|
|||
2020-02-21 Richard Sandiford <richard.sandiford@arm.com>
|
||||
|
||||
* config/aarch64/aarch64.c (aarch64_emit_mult): New function.
|
||||
(aarch64_emit_approx_div): Add SVE support. Use aarch64_emit_mult
|
||||
instead of emitting multiplication instructions directly.
|
||||
* config/aarch64/iterators.md (SVE_COND_FP_BINARY_OPTAB): New iterator.
|
||||
* config/aarch64/aarch64-sve.md (div<mode>3, @aarch64_frecpe<mode>)
|
||||
(@aarch64_frecps<mode>): New expanders.
|
||||
|
||||
2020-02-21 Richard Sandiford <richard.sandiford@arm.com>
|
||||
|
||||
* config/aarch64/aarch64-protos.h (AARCH64_APPROX_MODE): Operate
|
||||
|
|
|
@ -99,6 +99,7 @@
|
|||
;; ---- [FP] Subtraction
|
||||
;; ---- [FP] Absolute difference
|
||||
;; ---- [FP] Multiplication
|
||||
;; ---- [FP] Division
|
||||
;; ---- [FP] Binary logical operations
|
||||
;; ---- [FP] Sign copying
|
||||
;; ---- [FP] Maximum and minimum
|
||||
|
@ -4719,7 +4720,7 @@
|
|||
(const_int SVE_RELAXED_GP)
|
||||
(match_operand:SVE_FULL_F 1 "<sve_pred_fp_rhs1_operand>")
|
||||
(match_operand:SVE_FULL_F 2 "<sve_pred_fp_rhs2_operand>")]
|
||||
SVE_COND_FP_BINARY))]
|
||||
SVE_COND_FP_BINARY_OPTAB))]
|
||||
"TARGET_SVE"
|
||||
{
|
||||
operands[3] = aarch64_ptrue_reg (<VPRED>mode);
|
||||
|
@ -5455,6 +5456,47 @@
|
|||
"fmul\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>[%3]"
|
||||
)
|
||||
|
||||
;; -------------------------------------------------------------------------
|
||||
;; ---- [FP] Division
|
||||
;; -------------------------------------------------------------------------
|
||||
;; The patterns in this section are synthetic.
|
||||
;; -------------------------------------------------------------------------
|
||||
|
||||
;; Floating-point division for SVE_FULL_F modes.  The expander first
;; offers the operation to aarch64_emit_approx_div; if that returns true,
;; the sequence it emitted is the whole expansion (DONE).  Otherwise
;; operand 1, which may be any non-memory operand, is forced into a
;; register and operand 3 is set to a PTRUE register of the matching
;; predicate mode, so that the UNSPEC_COND_FDIV template below is emitted.
(define_expand "div<mode>3"
|
||||
[(set (match_operand:SVE_FULL_F 0 "register_operand")
|
||||
(unspec:SVE_FULL_F
|
||||
[(match_dup 3)
|
||||
(const_int SVE_RELAXED_GP)
|
||||
(match_operand:SVE_FULL_F 1 "nonmemory_operand")
|
||||
(match_operand:SVE_FULL_F 2 "register_operand")]
|
||||
UNSPEC_COND_FDIV))]
|
||||
"TARGET_SVE"
|
||||
{
|
||||
if (aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
|
||||
DONE;
|
||||

||||
operands[1] = force_reg (<MODE>mode, operands[1]);
|
||||
operands[3] = aarch64_ptrue_reg (<VPRED>mode);
|
||||
}
|
||||
)
|
||||
|
||||
;; Set operand 0 to the UNSPEC_FRECPE of operand 1; both are SVE_FULL_F
;; registers.  aarch64_emit_approx_div reaches this expander through
;; gen_aarch64_frecpe (mode, ...) to obtain its reciprocal estimate.
(define_expand "@aarch64_frecpe<mode>"
|
||||
[(set (match_operand:SVE_FULL_F 0 "register_operand")
|
||||
(unspec:SVE_FULL_F
|
||||
[(match_operand:SVE_FULL_F 1 "register_operand")]
|
||||
UNSPEC_FRECPE))]
|
||||
"TARGET_SVE"
|
||||
)
|
||||
|
||||
;; Set operand 0 to the UNSPEC_FRECPS of operands 1 and 2; all three are
;; SVE_FULL_F registers.  aarch64_emit_approx_div reaches this expander
;; through gen_aarch64_frecps (mode, ...), passing the current reciprocal
;; estimate and the denominator as the two inputs.
(define_expand "@aarch64_frecps<mode>"
|
||||
[(set (match_operand:SVE_FULL_F 0 "register_operand")
|
||||
(unspec:SVE_FULL_F
|
||||
[(match_operand:SVE_FULL_F 1 "register_operand")
|
||||
(match_operand:SVE_FULL_F 2 "register_operand")]
|
||||
UNSPEC_FRECPS))]
|
||||
"TARGET_SVE"
|
||||
)
|
||||
|
||||
;; -------------------------------------------------------------------------
|
||||
;; ---- [FP] Binary logical operations
|
||||
;; -------------------------------------------------------------------------
|
||||
|
|
|
@ -12739,6 +12739,25 @@ aarch64_builtin_reciprocal (tree fndecl)
|
|||
gcc_unreachable ();
|
||||
}
|
||||
|
||||
/* Emit code to perform the floating-point operation:
|
||||
|
||||
DST = SRC1 * SRC2
|
||||
|
||||
where all three operands are already known to be registers.
|
||||
If the operation is an SVE one, PTRUE is a suitable all-true
|
||||
predicate. */
|
||||
|
||||
static void
|
||||
aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
|
||||
{
|
||||
/* A non-null PTRUE selects the predicated UNSPEC_COND_FMUL pattern for
   DST's mode, with SVE_RELAXED_GP passed as the final operand.
   aarch64_emit_approx_div only supplies a predicate for SVE modes and
   passes NULL_RTX otherwise.  */
if (ptrue)
|
||||
emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
|
||||
dst, ptrue, src1, src2,
|
||||
gen_int_mode (SVE_RELAXED_GP, SImode)));
|
||||
/* Without a predicate, emit an ordinary MULT set in DST's mode.  */
else
|
||||
emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
|
||||
}
|
||||
|
||||
/* Emit instruction sequence to compute either the approximate square root
|
||||
or its approximate reciprocal, depending on the flag RECP, and return
|
||||
whether the sequence was emitted or not. */
|
||||
|
@ -12857,6 +12876,10 @@ aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
|
|||
if (!TARGET_SIMD && VECTOR_MODE_P (mode))
|
||||
return false;
|
||||
|
||||
rtx pg = NULL_RTX;
|
||||
if (aarch64_sve_mode_p (mode))
|
||||
pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
|
||||
|
||||
/* Estimate the approximate reciprocal. */
|
||||
rtx xrcp = gen_reg_rtx (mode);
|
||||
emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
|
||||
|
@ -12876,7 +12899,7 @@ aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
|
|||
emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
|
||||
|
||||
if (iterations > 0)
|
||||
emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
|
||||
aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
|
||||
}
|
||||
|
||||
if (num != CONST1_RTX (mode))
|
||||
|
@ -12884,11 +12907,11 @@ aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
|
|||
/* As the approximate reciprocal of DEN is already calculated, only
|
||||
calculate the approximate division when NUM is not 1.0. */
|
||||
rtx xnum = force_reg (mode, num);
|
||||
emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
|
||||
aarch64_emit_mult (xrcp, pg, xrcp, xnum);
|
||||
}
|
||||
|
||||
/* Finalize the approximation. */
|
||||
emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
|
||||
aarch64_emit_mult (quo, pg, xrcp, xtmp);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -2291,6 +2291,17 @@
|
|||
UNSPEC_COND_FMULX
|
||||
UNSPEC_COND_FSUB])
|
||||
|
||||
;; Same as SVE_COND_FP_BINARY, but without codes that have a dedicated
|
||||
;; <optab><mode>3 expander.
|
||||
(define_int_iterator SVE_COND_FP_BINARY_OPTAB [UNSPEC_COND_FADD
|
||||
UNSPEC_COND_FMAX
|
||||
UNSPEC_COND_FMAXNM
|
||||
UNSPEC_COND_FMIN
|
||||
UNSPEC_COND_FMINNM
|
||||
UNSPEC_COND_FMUL
|
||||
UNSPEC_COND_FMULX
|
||||
UNSPEC_COND_FSUB])
|
||||
|
||||
(define_int_iterator SVE_COND_FP_BINARY_INT [UNSPEC_COND_FSCALE])
|
||||
|
||||
(define_int_iterator SVE_COND_FP_ADD [UNSPEC_COND_FADD])
|
||||
|
|
|
@ -1,3 +1,10 @@
|
|||
2020-02-21 Richard Sandiford <richard.sandiford@arm.com>
|
||||
|
||||
* gcc.target/aarch64/sve/recip_1.c: New test.
|
||||
* gcc.target/aarch64/sve/recip_1_run.c: Likewise.
|
||||
* gcc.target/aarch64/sve/recip_2.c: Likewise.
|
||||
* gcc.target/aarch64/sve/recip_2_run.c: Likewise.
|
||||
|
||||
2020-02-20 Martin Sebor <msebor@redhat.com>
|
||||
|
||||
PR c++/93801
|
||||
|
|
27
gcc/testsuite/gcc.target/aarch64/sve/recip_1.c
Normal file
27
gcc/testsuite/gcc.target/aarch64/sve/recip_1.c
Normal file
|
@ -0,0 +1,27 @@
|
|||
/* { dg-options "-Ofast -mlow-precision-div" } */
|
||||
|
||||
/* Define test_TYPE (x, n), which replaces each of the first N elements
   of X with its reciprocal, computed as (TYPE) 1 / x[i].  */
#define DEF_LOOP(TYPE) \
|
||||
void \
|
||||
test_##TYPE (TYPE *x, int n) \
|
||||
{ \
|
||||
for (int i = 0; i < n; ++i) \
|
||||
x[i] = (TYPE) 1 / x[i]; \
|
||||
}
|
||||
|
||||
#define TEST_ALL(T) \
|
||||
T (_Float16) \
|
||||
T (float) \
|
||||
T (double)
|
||||
|
||||
TEST_ALL (DEF_LOOP)
|
||||
|
||||
/* { dg-final { scan-assembler-not {\tfrecpe\tz[0-9]+\.h} } } */
|
||||
/* { dg-final { scan-assembler-not {\tfrecps\tz[0-9]+\.h} } } */
|
||||
|
||||
/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s} 1 } } */
|
||||
/* { dg-final { scan-assembler-times {\tfrecpe\tz[0-9]+\.s} 1 } } */
|
||||
/* { dg-final { scan-assembler-times {\tfrecps\tz[0-9]+\.s} 1 } } */
|
||||
|
||||
/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d} 2 } } */
|
||||
/* { dg-final { scan-assembler-times {\tfrecpe\tz[0-9]+\.d} 1 } } */
|
||||
/* { dg-final { scan-assembler-times {\tfrecps\tz[0-9]+\.d} 2 } } */
|
27
gcc/testsuite/gcc.target/aarch64/sve/recip_1_run.c
Normal file
27
gcc/testsuite/gcc.target/aarch64/sve/recip_1_run.c
Normal file
|
@ -0,0 +1,27 @@
|
|||
/* { dg-do run { target aarch64_sve_hw } } */
|
||||
/* { dg-options "-Ofast -mlow-precision-div" } */
|
||||
|
||||
#include "recip_1.c"
|
||||
|
||||
#define N 77
|
||||
|
||||
/* Fill an N-element array of TYPE with 1, 2, ..., N, run test_TYPE on it,
   and abort if any element differs from the double-precision value
   1.0 / (i + 1) by more than 0x1.0p-8.  */
#define TEST_LOOP(TYPE) \
|
||||
{ \
|
||||
TYPE a[N]; \
|
||||
for (int i = 0; i < N; ++i) \
|
||||
a[i] = i + 1; \
|
||||
test_##TYPE (a, N); \
|
||||
for (int i = 0; i < N; ++i) \
|
||||
{ \
|
||||
double diff = a[i] - 1.0 / (i + 1); \
|
||||
if (__builtin_fabs (diff) > 0x1.0p-8) \
|
||||
__builtin_abort (); \
|
||||
} \
|
||||
}
|
||||
|
||||
/* Run TEST_LOOP once for every type listed by TEST_ALL (defined in the
   included recip_1.c).  A failing check aborts inside TEST_LOOP, so
   reaching the return means every type passed.  */
int
|
||||
main (void)
|
||||
{
|
||||
TEST_ALL (TEST_LOOP);
|
||||
return 0;
|
||||
}
|
27
gcc/testsuite/gcc.target/aarch64/sve/recip_2.c
Normal file
27
gcc/testsuite/gcc.target/aarch64/sve/recip_2.c
Normal file
|
@ -0,0 +1,27 @@
|
|||
/* { dg-options "-Ofast -mlow-precision-div" } */
|
||||
|
||||
/* Define test_TYPE (x, y, n), which divides each of the first N elements
   of X by the corresponding element of Y, storing the quotient back in X.
   Both pointers are restrict-qualified.  */
#define DEF_LOOP(TYPE) \
|
||||
void \
|
||||
test_##TYPE (TYPE *restrict x, TYPE *restrict y, int n) \
|
||||
{ \
|
||||
for (int i = 0; i < n; ++i) \
|
||||
x[i] /= y[i]; \
|
||||
}
|
||||
|
||||
#define TEST_ALL(T) \
|
||||
T (_Float16) \
|
||||
T (float) \
|
||||
T (double)
|
||||
|
||||
TEST_ALL (DEF_LOOP)
|
||||
|
||||
/* { dg-final { scan-assembler-not {\tfrecpe\tz[0-9]+\.h} } } */
|
||||
/* { dg-final { scan-assembler-not {\tfrecps\tz[0-9]+\.h} } } */
|
||||
|
||||
/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s} 2 } } */
|
||||
/* { dg-final { scan-assembler-times {\tfrecpe\tz[0-9]+\.s} 1 } } */
|
||||
/* { dg-final { scan-assembler-times {\tfrecps\tz[0-9]+\.s} 1 } } */
|
||||
|
||||
/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d} 3 } } */
|
||||
/* { dg-final { scan-assembler-times {\tfrecpe\tz[0-9]+\.d} 1 } } */
|
||||
/* { dg-final { scan-assembler-times {\tfrecps\tz[0-9]+\.d} 2 } } */
|
30
gcc/testsuite/gcc.target/aarch64/sve/recip_2_run.c
Normal file
30
gcc/testsuite/gcc.target/aarch64/sve/recip_2_run.c
Normal file
|
@ -0,0 +1,30 @@
|
|||
/* { dg-do run { target aarch64_sve_hw } } */
|
||||
/* { dg-options "-Ofast -mlow-precision-div" } */
|
||||
|
||||
#include "recip_2.c"
|
||||
|
||||
#define N 77
|
||||
|
||||
/* Fill N-element arrays with numerators i + 11 and denominators i + 1,
   run test_TYPE on them, and abort if any quotient differs from the
   double-precision value (i + 11.0) / (i + 1) by more than 0x1.0p-8.  */
#define TEST_LOOP(TYPE) \
|
||||
{ \
|
||||
TYPE a[N], b[N]; \
|
||||
for (int i = 0; i < N; ++i) \
|
||||
{ \
|
||||
a[i] = i + 11; \
|
||||
b[i] = i + 1; \
|
||||
} \
|
||||
test_##TYPE (a, b, N); \
|
||||
for (int i = 0; i < N; ++i) \
|
||||
{ \
|
||||
double diff = a[i] - (i + 11.0) / (i + 1); \
|
||||
if (__builtin_fabs (diff) > 0x1.0p-8) \
|
||||
__builtin_abort (); \
|
||||
} \
|
||||
}
|
||||
|
||||
/* Run TEST_LOOP once for every type listed by TEST_ALL (defined in the
   included recip_2.c).  A failing check aborts inside TEST_LOOP, so
   reaching the return means every type passed.  */
int
|
||||
main (void)
|
||||
{
|
||||
TEST_ALL (TEST_LOOP);
|
||||
return 0;
|
||||
}
|
Loading…
Add table
Reference in a new issue