[expmed] Properly account for the cost and latency of shift+add ops when synthesizing mults
* expmed.c (synth_mult): Only assume overlapping shift with previous steps in alg_sub_t_m2 case. * gcc.target/aarch64/mult-synth_1.c: New test. * gcc.target/aarch64/mult-synth_2.c: Likewise. * gcc.target/aarch64/mult-synth_3.c: Likewise. * gcc.target/aarch64/mult-synth_4.c: Likewise. * gcc.target/aarch64/mult-synth_5.c: Likewise. * gcc.target/aarch64/mult-synth_6.c: Likewise. From-SVN: r222268
This commit is contained in:
parent
0ff093d855
commit
35430ca0c6
9 changed files with 106 additions and 29 deletions
|
@ -1,3 +1,8 @@
|
|||
2015-04-21 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
|
||||
|
||||
* expmed.c (synth_mult): Only assume overlapping
|
||||
shift with previous steps in alg_sub_t_m2 case.
|
||||
|
||||
2015-04-21 Richard Biener <rguenther@suse.de>
|
||||
|
||||
PR tree-optimization/65650
|
||||
|
|
55
gcc/expmed.c
55
gcc/expmed.c
|
@ -2664,14 +2664,28 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t,
|
|||
m = exact_log2 (-orig_t + 1);
|
||||
if (m >= 0 && m < maxm)
|
||||
{
|
||||
op_cost = shiftsub1_cost (speed, mode, m);
|
||||
op_cost = add_cost (speed, mode) + shift_cost (speed, mode, m);
|
||||
/* If the target has a cheap shift-and-subtract insn use
|
||||
that in preference to a shift insn followed by a sub insn.
|
||||
Assume that the shift-and-sub is "atomic" with a latency
|
||||
equal to its cost, otherwise assume that on superscalar
|
||||
hardware the shift may be executed concurrently with the
|
||||
earlier steps in the algorithm. */
|
||||
if (shiftsub1_cost (speed, mode, m) <= op_cost)
|
||||
{
|
||||
op_cost = shiftsub1_cost (speed, mode, m);
|
||||
op_latency = op_cost;
|
||||
}
|
||||
else
|
||||
op_latency = add_cost (speed, mode);
|
||||
|
||||
new_limit.cost = best_cost.cost - op_cost;
|
||||
new_limit.latency = best_cost.latency - op_cost;
|
||||
new_limit.latency = best_cost.latency - op_latency;
|
||||
synth_mult (alg_in, (unsigned HOST_WIDE_INT) (-orig_t + 1) >> m,
|
||||
&new_limit, mode);
|
||||
|
||||
alg_in->cost.cost += op_cost;
|
||||
alg_in->cost.latency += op_cost;
|
||||
alg_in->cost.latency += op_latency;
|
||||
if (CHEAPER_MULT_COST (&alg_in->cost, &best_cost))
|
||||
{
|
||||
best_cost = alg_in->cost;
|
||||
|
@ -2704,20 +2718,12 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t,
|
|||
if (t % d == 0 && t > d && m < maxm
|
||||
&& (!cache_hit || cache_alg == alg_add_factor))
|
||||
{
|
||||
/* If the target has a cheap shift-and-add instruction use
|
||||
that in preference to a shift insn followed by an add insn.
|
||||
Assume that the shift-and-add is "atomic" with a latency
|
||||
equal to its cost, otherwise assume that on superscalar
|
||||
hardware the shift may be executed concurrently with the
|
||||
earlier steps in the algorithm. */
|
||||
op_cost = add_cost (speed, mode) + shift_cost (speed, mode, m);
|
||||
if (shiftadd_cost (speed, mode, m) < op_cost)
|
||||
{
|
||||
op_cost = shiftadd_cost (speed, mode, m);
|
||||
op_latency = op_cost;
|
||||
}
|
||||
else
|
||||
op_latency = add_cost (speed, mode);
|
||||
if (shiftadd_cost (speed, mode, m) <= op_cost)
|
||||
op_cost = shiftadd_cost (speed, mode, m);
|
||||
|
||||
op_latency = op_cost;
|
||||
|
||||
|
||||
new_limit.cost = best_cost.cost - op_cost;
|
||||
new_limit.latency = best_cost.latency - op_latency;
|
||||
|
@ -2742,20 +2748,11 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t,
|
|||
if (t % d == 0 && t > d && m < maxm
|
||||
&& (!cache_hit || cache_alg == alg_sub_factor))
|
||||
{
|
||||
/* If the target has a cheap shift-and-subtract insn use
|
||||
that in preference to a shift insn followed by a sub insn.
|
||||
Assume that the shift-and-sub is "atomic" with a latency
|
||||
equal to its cost, otherwise assume that on superscalar
|
||||
hardware the shift may be executed concurrently with the
|
||||
earlier steps in the algorithm. */
|
||||
op_cost = add_cost (speed, mode) + shift_cost (speed, mode, m);
|
||||
if (shiftsub0_cost (speed, mode, m) < op_cost)
|
||||
{
|
||||
op_cost = shiftsub0_cost (speed, mode, m);
|
||||
op_latency = op_cost;
|
||||
}
|
||||
else
|
||||
op_latency = add_cost (speed, mode);
|
||||
if (shiftsub0_cost (speed, mode, m) <= op_cost)
|
||||
op_cost = shiftsub0_cost (speed, mode, m);
|
||||
|
||||
op_latency = op_cost;
|
||||
|
||||
new_limit.cost = best_cost.cost - op_cost;
|
||||
new_limit.latency = best_cost.latency - op_latency;
|
||||
|
|
|
@ -1,3 +1,12 @@
|
|||
2015-04-21 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
|
||||
|
||||
* gcc.target/aarch64/mult-synth_1.c: New test.
|
||||
* gcc.target/aarch64/mult-synth_2.c: Likewise.
|
||||
* gcc.target/aarch64/mult-synth_3.c: Likewise.
|
||||
* gcc.target/aarch64/mult-synth_4.c: Likewise.
|
||||
* gcc.target/aarch64/mult-synth_5.c: Likewise.
|
||||
* gcc.target/aarch64/mult-synth_6.c: Likewise.
|
||||
|
||||
2015-04-21 Richard Biener <rguenther@suse.de>
|
||||
|
||||
PR tree-optimization/65650
|
||||
|
|
11
gcc/testsuite/gcc.target/aarch64/mult-synth_1.c
Normal file
11
gcc/testsuite/gcc.target/aarch64/mult-synth_1.c
Normal file
|
@ -0,0 +1,11 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -mcpu=cortex-a57 -save-temps" } */
|
||||
|
||||
int
|
||||
foo (int x)
|
||||
{
|
||||
return x * 100;
|
||||
}
|
||||
|
||||
/* { dg-final { scan-assembler-times "mul\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */
|
||||
/* { dg-final { cleanup-saved-temps } } */
|
11
gcc/testsuite/gcc.target/aarch64/mult-synth_2.c
Normal file
11
gcc/testsuite/gcc.target/aarch64/mult-synth_2.c
Normal file
|
@ -0,0 +1,11 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -mcpu=cortex-a57 -save-temps" } */
|
||||
|
||||
int
|
||||
foo (int x)
|
||||
{
|
||||
return x * 25;
|
||||
}
|
||||
|
||||
/* { dg-final { scan-assembler-times "mul\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */
|
||||
/* { dg-final { cleanup-saved-temps } } */
|
11
gcc/testsuite/gcc.target/aarch64/mult-synth_3.c
Normal file
11
gcc/testsuite/gcc.target/aarch64/mult-synth_3.c
Normal file
|
@ -0,0 +1,11 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -mcpu=cortex-a57 -save-temps" } */
|
||||
|
||||
int
|
||||
foo (int x)
|
||||
{
|
||||
return x * 11;
|
||||
}
|
||||
|
||||
/* { dg-final { scan-assembler-times "mul\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */
|
||||
/* { dg-final { cleanup-saved-temps } } */
|
11
gcc/testsuite/gcc.target/aarch64/mult-synth_4.c
Normal file
11
gcc/testsuite/gcc.target/aarch64/mult-synth_4.c
Normal file
|
@ -0,0 +1,11 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -mcpu=cortex-a57 -save-temps" } */
|
||||
|
||||
long
|
||||
foo (int x, int y)
|
||||
{
|
||||
return (long)x * 6L;
|
||||
}
|
||||
|
||||
/* { dg-final { scan-assembler-times "smull\tx\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */
|
||||
/* { dg-final { cleanup-saved-temps } } */
|
11
gcc/testsuite/gcc.target/aarch64/mult-synth_5.c
Normal file
11
gcc/testsuite/gcc.target/aarch64/mult-synth_5.c
Normal file
|
@ -0,0 +1,11 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -mcpu=cortex-a57 -save-temps" } */
|
||||
|
||||
int
|
||||
foo (int x)
|
||||
{
|
||||
return x * 10;
|
||||
}
|
||||
|
||||
/* { dg-final { scan-assembler-not "\tw1" } } */
|
||||
/* { dg-final { cleanup-saved-temps } } */
|
11
gcc/testsuite/gcc.target/aarch64/mult-synth_6.c
Normal file
11
gcc/testsuite/gcc.target/aarch64/mult-synth_6.c
Normal file
|
@ -0,0 +1,11 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -mcpu=cortex-a57 -save-temps" } */
|
||||
|
||||
int
|
||||
foo (int x)
|
||||
{
|
||||
return x * 20;
|
||||
}
|
||||
|
||||
/* { dg-final { scan-assembler-not "\tw1" } } */
|
||||
/* { dg-final { cleanup-saved-temps } } */
|
Loading…
Add table
Reference in a new issue