re PR target/56843 (PowerPC Newton-Raphson reciprocal estimates can be improved)
gcc: 2013-04-05 Bill Schmidt <wschmidt@linux.vnet.ibm.com> PR target/56843 * config/rs6000/rs6000.c (rs6000_emit_swdiv_high_precision): Remove. (rs6000_emit_swdiv_low_precision): Remove. (rs6000_emit_swdiv): Rewrite to handle between one and four iterations of Newton-Raphson generally; modify required number of iterations for some cases. * config/rs6000/rs6000.h (RS6000_RECIP_HIGH_PRECISION_P): Remove. gcc/testsuite: 2013-04-05 Bill Schmidt <wschmidt@linux.vnet.ibm.com> PR target/56843 * gcc.target/powerpc/recip-1.c: Modify expected output. * gcc.target/powerpc/recip-3.c: Likewise. * gcc.target/powerpc/recip-4.c: Likewise. * gcc.target/powerpc/recip-5.c: Add expected output for iterations. From-SVN: r197534
This commit is contained in:
parent
7bca81dc6a
commit
4902aa64ce
8 changed files with 86 additions and 84 deletions
|
@ -1,3 +1,13 @@
|
|||
2013-04-05 Bill Schmidt <wschmidt@linux.vnet.ibm.com>
|
||||
|
||||
PR target/56843
|
||||
* config/rs6000/rs6000.c (rs6000_emit_swdiv_high_precision): Remove.
|
||||
(rs6000_emit_swdiv_low_precision): Remove.
|
||||
(rs6000_emit_swdiv): Rewrite to handle between one and four
|
||||
iterations of Newton-Raphson generally; modify required number of
|
||||
iterations for some cases.
|
||||
* config/rs6000/rs6000.h (RS6000_RECIP_HIGH_PRECISION_P): Remove.
|
||||
|
||||
2013-04-05 Steven Bosscher <steven@gcc.gnu.org>
|
||||
|
||||
* bb-reorder.c (fix_crossing_unconditional_branches): Remove a
|
||||
|
|
|
@ -26913,54 +26913,26 @@ rs6000_emit_nmsub (rtx dst, rtx m1, rtx m2, rtx a)
|
|||
emit_insn (gen_rtx_SET (VOIDmode, dst, r));
|
||||
}
|
||||
|
||||
/* Newton-Raphson approximation of floating point divide with just 2 passes
|
||||
(either single precision floating point, or newer machines with higher
|
||||
accuracy estimates). Support both scalar and vector divide. Assumes no
|
||||
trapping math and finite arguments. */
|
||||
/* Newton-Raphson approximation of floating point divide DST = N/D. If NOTE_P,
|
||||
add a reg_note saying that this was a division. Support both scalar and
|
||||
vector divide. Assumes no trapping math and finite arguments. */
|
||||
|
||||
static void
|
||||
rs6000_emit_swdiv_high_precision (rtx dst, rtx n, rtx d)
|
||||
void
|
||||
rs6000_emit_swdiv (rtx dst, rtx n, rtx d, bool note_p)
|
||||
{
|
||||
enum machine_mode mode = GET_MODE (dst);
|
||||
rtx x0, e0, e1, y1, u0, v0;
|
||||
enum insn_code code = optab_handler (smul_optab, mode);
|
||||
gen_2arg_fn_t gen_mul = (gen_2arg_fn_t) GEN_FCN (code);
|
||||
rtx one = rs6000_load_constant_and_splat (mode, dconst1);
|
||||
rtx one, x0, e0, x1, xprev, eprev, xnext, enext, u, v;
|
||||
int i;
|
||||
|
||||
gcc_assert (code != CODE_FOR_nothing);
|
||||
/* Low precision estimates guarantee 5 bits of accuracy. High
|
||||
precision estimates guarantee 14 bits of accuracy. SFmode
|
||||
requires 23 bits of accuracy. DFmode requires 52 bits of
|
||||
accuracy. Each pass at least doubles the accuracy, leading
|
||||
to the following. */
|
||||
int passes = (TARGET_RECIP_PRECISION) ? 1 : 3;
|
||||
if (mode == DFmode || mode == V2DFmode)
|
||||
passes++;
|
||||
|
||||
/* x0 = 1./d estimate */
|
||||
x0 = gen_reg_rtx (mode);
|
||||
emit_insn (gen_rtx_SET (VOIDmode, x0,
|
||||
gen_rtx_UNSPEC (mode, gen_rtvec (1, d),
|
||||
UNSPEC_FRES)));
|
||||
|
||||
e0 = gen_reg_rtx (mode);
|
||||
rs6000_emit_nmsub (e0, d, x0, one); /* e0 = 1. - (d * x0) */
|
||||
|
||||
e1 = gen_reg_rtx (mode);
|
||||
rs6000_emit_madd (e1, e0, e0, e0); /* e1 = (e0 * e0) + e0 */
|
||||
|
||||
y1 = gen_reg_rtx (mode);
|
||||
rs6000_emit_madd (y1, e1, x0, x0); /* y1 = (e1 * x0) + x0 */
|
||||
|
||||
u0 = gen_reg_rtx (mode);
|
||||
emit_insn (gen_mul (u0, n, y1)); /* u0 = n * y1 */
|
||||
|
||||
v0 = gen_reg_rtx (mode);
|
||||
rs6000_emit_nmsub (v0, d, u0, n); /* v0 = n - (d * u0) */
|
||||
|
||||
rs6000_emit_madd (dst, v0, y1, u0); /* dst = (v0 * y1) + u0 */
|
||||
}
|
||||
|
||||
/* Newton-Raphson approximation of floating point divide that has a low
|
||||
precision estimate. Assumes no trapping math and finite arguments. */
|
||||
|
||||
static void
|
||||
rs6000_emit_swdiv_low_precision (rtx dst, rtx n, rtx d)
|
||||
{
|
||||
enum machine_mode mode = GET_MODE (dst);
|
||||
rtx x0, e0, e1, e2, y1, y2, y3, u0, v0, one;
|
||||
enum insn_code code = optab_handler (smul_optab, mode);
|
||||
gen_2arg_fn_t gen_mul = (gen_2arg_fn_t) GEN_FCN (code);
|
||||
|
||||
|
@ -26974,46 +26946,44 @@ rs6000_emit_swdiv_low_precision (rtx dst, rtx n, rtx d)
|
|||
gen_rtx_UNSPEC (mode, gen_rtvec (1, d),
|
||||
UNSPEC_FRES)));
|
||||
|
||||
e0 = gen_reg_rtx (mode);
|
||||
rs6000_emit_nmsub (e0, d, x0, one); /* e0 = 1. - d * x0 */
|
||||
/* Each iteration but the last calculates x_(i+1) = x_i * (2 - d * x_i). */
|
||||
if (passes > 1) {
|
||||
|
||||
y1 = gen_reg_rtx (mode);
|
||||
rs6000_emit_madd (y1, e0, x0, x0); /* y1 = x0 + e0 * x0 */
|
||||
/* e0 = 1. - d * x0 */
|
||||
e0 = gen_reg_rtx (mode);
|
||||
rs6000_emit_nmsub (e0, d, x0, one);
|
||||
|
||||
e1 = gen_reg_rtx (mode);
|
||||
emit_insn (gen_mul (e1, e0, e0)); /* e1 = e0 * e0 */
|
||||
/* x1 = x0 + e0 * x0 */
|
||||
x1 = gen_reg_rtx (mode);
|
||||
rs6000_emit_madd (x1, e0, x0, x0);
|
||||
|
||||
y2 = gen_reg_rtx (mode);
|
||||
rs6000_emit_madd (y2, e1, y1, y1); /* y2 = y1 + e1 * y1 */
|
||||
for (i = 0, xprev = x1, eprev = e0; i < passes - 2;
|
||||
++i, xprev = xnext, eprev = enext) {
|
||||
|
||||
/* enext = eprev * eprev */
|
||||
enext = gen_reg_rtx (mode);
|
||||
emit_insn (gen_mul (enext, eprev, eprev));
|
||||
|
||||
e2 = gen_reg_rtx (mode);
|
||||
emit_insn (gen_mul (e2, e1, e1)); /* e2 = e1 * e1 */
|
||||
/* xnext = xprev + enext * xprev */
|
||||
xnext = gen_reg_rtx (mode);
|
||||
rs6000_emit_madd (xnext, enext, xprev, xprev);
|
||||
}
|
||||
|
||||
y3 = gen_reg_rtx (mode);
|
||||
rs6000_emit_madd (y3, e2, y2, y2); /* y3 = y2 + e2 * y2 */
|
||||
} else
|
||||
xprev = x0;
|
||||
|
||||
u0 = gen_reg_rtx (mode);
|
||||
emit_insn (gen_mul (u0, n, y3)); /* u0 = n * y3 */
|
||||
/* The last iteration calculates x_(i+1) = n * x_i * (2 - d * x_i). */
|
||||
|
||||
v0 = gen_reg_rtx (mode);
|
||||
rs6000_emit_nmsub (v0, d, u0, n); /* v0 = n - d * u0 */
|
||||
/* u = n * xprev */
|
||||
u = gen_reg_rtx (mode);
|
||||
emit_insn (gen_mul (u, n, xprev));
|
||||
|
||||
rs6000_emit_madd (dst, v0, y3, u0); /* dst = u0 + v0 * y3 */
|
||||
}
|
||||
/* v = n - (d * u) */
|
||||
v = gen_reg_rtx (mode);
|
||||
rs6000_emit_nmsub (v, d, u, n);
|
||||
|
||||
/* Newton-Raphson approximation of floating point divide DST = N/D. If NOTE_P,
|
||||
add a reg_note saying that this was a division. Support both scalar and
|
||||
vector divide. Assumes no trapping math and finite arguments. */
|
||||
|
||||
void
|
||||
rs6000_emit_swdiv (rtx dst, rtx n, rtx d, bool note_p)
|
||||
{
|
||||
enum machine_mode mode = GET_MODE (dst);
|
||||
|
||||
if (RS6000_RECIP_HIGH_PRECISION_P (mode))
|
||||
rs6000_emit_swdiv_high_precision (dst, n, d);
|
||||
else
|
||||
rs6000_emit_swdiv_low_precision (dst, n, d);
|
||||
/* dst = (v * xprev) + u */
|
||||
rs6000_emit_madd (dst, v, xprev, u);
|
||||
|
||||
if (note_p)
|
||||
add_reg_note (get_last_insn (), REG_EQUAL, gen_rtx_DIV (mode, n, d));
|
||||
|
@ -27028,7 +26998,16 @@ rs6000_emit_swrsqrt (rtx dst, rtx src)
|
|||
enum machine_mode mode = GET_MODE (src);
|
||||
rtx x0 = gen_reg_rtx (mode);
|
||||
rtx y = gen_reg_rtx (mode);
|
||||
int passes = (TARGET_RECIP_PRECISION) ? 2 : 3;
|
||||
|
||||
/* Low precision estimates guarantee 5 bits of accuracy. High
|
||||
precision estimates guarantee 14 bits of accuracy. SFmode
|
||||
requires 23 bits of accuracy. DFmode requires 52 bits of
|
||||
accuracy. Each pass at least doubles the accuracy, leading
|
||||
to the following. */
|
||||
int passes = (TARGET_RECIP_PRECISION) ? 1 : 3;
|
||||
if (mode == DFmode || mode == V2DFmode)
|
||||
passes++;
|
||||
|
||||
REAL_VALUE_TYPE dconst3_2;
|
||||
int i;
|
||||
rtx halfthree;
|
||||
|
|
|
@ -601,9 +601,6 @@ extern unsigned char rs6000_recip_bits[];
|
|||
#define RS6000_RECIP_AUTO_RSQRTE_P(MODE) \
|
||||
(rs6000_recip_bits[(int)(MODE)] & RS6000_RECIP_MASK_AUTO_RSQRTE)
|
||||
|
||||
#define RS6000_RECIP_HIGH_PRECISION_P(MODE) \
|
||||
((MODE) == SFmode || (MODE) == V4SFmode || TARGET_RECIP_PRECISION)
|
||||
|
||||
/* The default CPU for TARGET_OPTION_OVERRIDE. */
|
||||
#define OPTION_TARGET_CPU_DEFAULT TARGET_CPU_DEFAULT
|
||||
|
||||
|
|
|
@ -1,3 +1,11 @@
|
|||
2013-04-05 Bill Schmidt <wschmidt@linux.vnet.ibm.com>
|
||||
|
||||
PR target/56843
|
||||
* gcc.target/powerpc/recip-1.c: Modify expected output.
|
||||
* gcc.target/powerpc/recip-3.c: Likewise.
|
||||
* gcc.target/powerpc/recip-4.c: Likewise.
|
||||
* gcc.target/powerpc/recip-5.c: Add expected output for iterations.
|
||||
|
||||
2013-04-05 Greta Yorsh <Greta.Yorsh@arm.com>
|
||||
|
||||
* gcc.target/arm/peep-ldrd-1.c: New test.
|
||||
|
|
|
@ -3,8 +3,8 @@
|
|||
/* { dg-options "-O2 -mrecip -ffast-math -mcpu=power6" } */
|
||||
/* { dg-final { scan-assembler-times "frsqrte" 2 } } */
|
||||
/* { dg-final { scan-assembler-times "fmsub" 2 } } */
|
||||
/* { dg-final { scan-assembler-times "fmul" 8 } } */
|
||||
/* { dg-final { scan-assembler-times "fnmsub" 4 } } */
|
||||
/* { dg-final { scan-assembler-times "fmul" 6 } } */
|
||||
/* { dg-final { scan-assembler-times "fnmsub" 3 } } */
|
||||
|
||||
double
|
||||
rsqrt_d (double a)
|
||||
|
|
|
@ -7,8 +7,8 @@
|
|||
/* { dg-final { scan-assembler-times "xsnmsub.dp\|fnmsub\ " 2 } } */
|
||||
/* { dg-final { scan-assembler-times "frsqrtes" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "fmsubs" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "fmuls" 4 } } */
|
||||
/* { dg-final { scan-assembler-times "fnmsubs" 2 } } */
|
||||
/* { dg-final { scan-assembler-times "fmuls" 2 } } */
|
||||
/* { dg-final { scan-assembler-times "fnmsubs" 1 } } */
|
||||
|
||||
double
|
||||
rsqrt_d (double a)
|
||||
|
|
|
@ -7,8 +7,8 @@
|
|||
/* { dg-final { scan-assembler-times "xvnmsub.dp" 2 } } */
|
||||
/* { dg-final { scan-assembler-times "xvrsqrtesp" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "xvmsub.sp" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "xvmulsp" 4 } } */
|
||||
/* { dg-final { scan-assembler-times "xvnmsub.sp" 2 } } */
|
||||
/* { dg-final { scan-assembler-times "xvmulsp" 2 } } */
|
||||
/* { dg-final { scan-assembler-times "xvnmsub.sp" 1 } } */
|
||||
|
||||
#define SIZE 1024
|
||||
|
||||
|
|
|
@ -6,6 +6,14 @@
|
|||
/* { dg-final { scan-assembler-times "xvresp" 5 } } */
|
||||
/* { dg-final { scan-assembler-times "xsredp" 2 } } */
|
||||
/* { dg-final { scan-assembler-times "fres" 2 } } */
|
||||
/* { dg-final { scan-assembler-times "fmuls" 2 } } */
|
||||
/* { dg-final { scan-assembler-times "fnmsubs" 2 } } */
|
||||
/* { dg-final { scan-assembler-times "xsmuldp" 2 } } */
|
||||
/* { dg-final { scan-assembler-times "xsnmsub.dp" 4 } } */
|
||||
/* { dg-final { scan-assembler-times "xvmulsp" 7 } } */
|
||||
/* { dg-final { scan-assembler-times "xvnmsub.sp" 5 } } */
|
||||
/* { dg-final { scan-assembler-times "xvmuldp" 6 } } */
|
||||
/* { dg-final { scan-assembler-times "xvnmsub.dp" 8 } } */
|
||||
|
||||
#include <altivec.h>
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue