rs6000: Enable limited unrolling at -O2
In PR88760, there is some discussion about improving or tuning the unroller for targets, and we agreed to enable the unroller for small loops at -O2 first. We can see a performance improvement (~10%) for the code below: ``` subroutine foo (i, i1, block) integer :: i, i1 integer :: block(9, 9, 9) block(i:9,1,i1) = block(i:9,1,i1) - 10 end subroutine foo ``` This kind of code occurs a few times in the exchange2 benchmark. Similar C code: ``` for (i = 0; i < n; i++) arr[i] = arr[i] - 10; ``` On powerpcle at -O2, enabling -funroll-loops and limiting PARAM_MAX_UNROLL_TIMES=2 and PARAM_MAX_UNROLLED_INSNS=20 gives a >2% overall improvement for SPEC2017. This patch is only for rs6000, where we see a visible performance improvement. gcc/ 2019-10-25 Jiufu Guo <guojiufu@linux.ibm.com> PR tree-optimization/88760 * config/rs6000/rs6000-common.c (rs6000_option_optimization_table): Enable -funroll-loops for -O2 and above. * config/rs6000/rs6000.c (rs6000_option_override_internal): Set PARAM_MAX_UNROLL_TIMES to 2 and PARAM_MAX_UNROLLED_INSNS to 20, and do not turn on web and rnreg implicitly, if the unroller is not explicitly enabled. gcc/testsuite/ 2019-10-25 Jiufu Guo <guojiufu@linux.ibm.com> PR tree-optimization/88760 * gcc.target/powerpc/small-loop-unroll.c: New test. * c-c++-common/tsan/thread_leak2.c: Update test. * gcc.dg/pr59643.c: Update test. * gcc.target/powerpc/loop_align.c: Update test. * gcc.target/powerpc/ppc-fma-1.c: Update test. * gcc.target/powerpc/ppc-fma-2.c: Update test. * gcc.target/powerpc/ppc-fma-3.c: Update test. * gcc.target/powerpc/ppc-fma-4.c: Update test. * gcc.target/powerpc/pr78604.c: Update test. From-SVN: r277501
This commit is contained in:
parent
cf20d00ca1
commit
6d099a76a0
13 changed files with 70 additions and 6 deletions
|
@ -1,3 +1,13 @@
|
|||
2019-10-25 Jiufu Guo <guojiufu@linux.ibm.com>
|
||||
|
||||
PR tree-optimization/88760
|
||||
* config/rs6000/rs6000-common.c (rs6000_option_optimization_table):
|
||||
Enable -funroll-loops for -O2 and above.
|
||||
* config/rs6000/rs6000.c (rs6000_option_override_internal): Set
|
||||
PARAM_MAX_UNROLL_TIMES to 2 and PARAM_MAX_UNROLLED_INSNS to 20, and
|
||||
do not turn on web and rnreg implicitly, if the unroller is not
|
||||
explicitly enabled.
|
||||
|
||||
2019-10-27 Jan Hubicka <hubicka@ucw.cz>
|
||||
|
||||
* ipa-prop.c (ipa_propagate_indirect_call_infos): Do not remove
|
||||
|
|
|
@ -35,6 +35,7 @@ static const struct default_options rs6000_option_optimization_table[] =
|
|||
{ OPT_LEVELS_ALL, OPT_fsplit_wide_types_early, NULL, 1 },
|
||||
/* Enable -fsched-pressure for first pass instruction scheduling. */
|
||||
{ OPT_LEVELS_1_PLUS, OPT_fsched_pressure, NULL, 1 },
|
||||
{ OPT_LEVELS_2_PLUS, OPT_funroll_loops, NULL, 1 },
|
||||
{ OPT_LEVELS_NONE, 0, NULL, 0 }
|
||||
};
|
||||
|
||||
|
|
|
@ -4540,6 +4540,26 @@ rs6000_option_override_internal (bool global_init_p)
|
|||
global_options.x_param_values,
|
||||
global_options_set.x_param_values);
|
||||
|
||||
/* Unroll very small loops 2 times if -funroll-loops is not given.  */
|
||||
if (!global_options_set.x_flag_unroll_loops
|
||||
&& !global_options_set.x_flag_unroll_all_loops)
|
||||
{
|
||||
maybe_set_param_value (PARAM_MAX_UNROLL_TIMES, 2,
|
||||
global_options.x_param_values,
|
||||
global_options_set.x_param_values);
|
||||
|
||||
maybe_set_param_value (PARAM_MAX_UNROLLED_INSNS, 20,
|
||||
global_options.x_param_values,
|
||||
global_options_set.x_param_values);
|
||||
|
||||
/* If -fweb or -frename-registers are not specified on the command line,
|
||||
do not turn them on implicitly. */
|
||||
if (!global_options_set.x_flag_web)
|
||||
global_options.x_flag_web = 0;
|
||||
if (!global_options_set.x_flag_rename_registers)
|
||||
global_options.x_flag_rename_registers = 0;
|
||||
}
|
||||
|
||||
/* If using typedef char *va_list, signal that
|
||||
__builtin_va_start (&ap, 0) can be optimized to
|
||||
ap = __builtin_next_arg (0). */
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
2019-10-25 Jiufu Guo <guojiufu@linux.ibm.com>
|
||||
|
||||
PR tree-optimization/88760
|
||||
* gcc.target/powerpc/small-loop-unroll.c: New test.
|
||||
* c-c++-common/tsan/thread_leak2.c: Update test.
|
||||
* gcc.dg/pr59643.c: Update test.
|
||||
* gcc.target/powerpc/loop_align.c: Update test.
|
||||
* gcc.target/powerpc/ppc-fma-1.c: Update test.
|
||||
* gcc.target/powerpc/ppc-fma-2.c: Update test.
|
||||
* gcc.target/powerpc/ppc-fma-3.c: Update test.
|
||||
* gcc.target/powerpc/ppc-fma-4.c: Update test.
|
||||
* gcc.target/powerpc/pr78604.c: Update test.
|
||||
|
||||
2019-10-27 Andreas Tobler <andreast@gcc.gnu.org>
|
||||
|
||||
* gcc.c-torture/execute/fprintf-2.c: Silence a Free/NetBSD libc warning.
|
||||
|
|
|
@ -1,5 +1,9 @@
|
|||
/* { dg-shouldfail "tsan" } */
|
||||
|
||||
/* { dg-additional-options "-fno-unroll-loops" { target { powerpc*-*-* } } } */
|
||||
/* -fno-unroll-loops helps to avoid ThreadSanitizer reporting the message
|
||||
multiple times for pthread_create at different calling addresses. */
|
||||
|
||||
#include <pthread.h>
|
||||
#include <unistd.h>
|
||||
|
||||
|
|
|
@ -1,6 +1,9 @@
|
|||
/* PR tree-optimization/59643 */
|
||||
/* { dg-do compile } */
|
||||
/* { dg-options "-O3 -fdump-tree-pcom-details" } */
|
||||
/* { dg-additional-options "--param max-unrolled-insns=400" { target { powerpc*-*-* } } } */
|
||||
/* The implicit threshold of max-unrolled-insns on ppc at -O3 is too small for the
|
||||
loop of this case. */
|
||||
|
||||
void
|
||||
foo (double *a, double *b, double *c, double d, double e, int n)
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/* { dg-do compile { target { powerpc*-*-* } } } */
|
||||
/* { dg-skip-if "" { powerpc*-*-darwin* powerpc-ibm-aix* } } */
|
||||
/* { dg-options "-O2 -mdejagnu-cpu=power7 -falign-functions=16" } */
|
||||
/* { dg-options "-O2 -mdejagnu-cpu=power7 -falign-functions=16 -fno-unroll-loops" } */
|
||||
/* { dg-final { scan-assembler ".p2align 5" } } */
|
||||
|
||||
void f(double *a, double *b, double *c, unsigned long n) {
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/* { dg-do compile { target { powerpc*-*-* } } } */
|
||||
/* { dg-skip-if "" { powerpc*-*-darwin* } } */
|
||||
/* { dg-require-effective-target powerpc_vsx_ok } */
|
||||
/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math" } */
|
||||
/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math -fno-unroll-loops" } */
|
||||
/* { dg-final { scan-assembler-times "xvmadd" 4 } } */
|
||||
/* { dg-final { scan-assembler-times "xsmadd\|fmadd\ " 2 } } */
|
||||
/* { dg-final { scan-assembler-times "fmadds" 2 } } */
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/* { dg-do compile { target { powerpc*-*-* } } } */
|
||||
/* { dg-skip-if "" { powerpc*-*-darwin* } } */
|
||||
/* { dg-require-effective-target powerpc_vsx_ok } */
|
||||
/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math -ffp-contract=off" } */
|
||||
/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math -ffp-contract=off -fno-unroll-loops" } */
|
||||
/* { dg-final { scan-assembler-times "xvmadd" 2 } } */
|
||||
/* { dg-final { scan-assembler-times "xsmadd\|fmadd\ " 1 } } */
|
||||
/* { dg-final { scan-assembler-times "fmadds" 1 } } */
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
/* { dg-skip-if "" { powerpc*-*-darwin* } } */
|
||||
/* { dg-require-effective-target powerpc_altivec_ok } */
|
||||
/* { dg-require-effective-target powerpc_fprs } */
|
||||
/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec -ffast-math" } */
|
||||
/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec -ffast-math -fno-unroll-loops" } */
|
||||
/* { dg-final { scan-assembler-times "vmaddfp" 2 } } */
|
||||
/* { dg-final { scan-assembler-times "fmadd " 2 } } */
|
||||
/* { dg-final { scan-assembler-times "fmadds" 2 } } */
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
/* { dg-skip-if "" { powerpc*-*-darwin* } } */
|
||||
/* { dg-require-effective-target powerpc_altivec_ok } */
|
||||
/* { dg-require-effective-target powerpc_fprs } */
|
||||
/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec -ffast-math -ffp-contract=off" } */
|
||||
/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec -ffast-math -ffp-contract=off -fno-unroll-loops" } */
|
||||
/* { dg-final { scan-assembler-times "vmaddfp" 1 } } */
|
||||
/* { dg-final { scan-assembler-times "fmadd " 1 } } */
|
||||
/* { dg-final { scan-assembler-times "fmadds" 1 } } */
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/* { dg-do compile { target { powerpc*-*-* } } } */
|
||||
/* { dg-skip-if "" { powerpc*-*-darwin* } } */
|
||||
/* { dg-require-effective-target powerpc_p8vector_ok } */
|
||||
/* { dg-options "-mdejagnu-cpu=power8 -O2 -ftree-vectorize -fdump-tree-vect-details" } */
|
||||
/* { dg-options "-mdejagnu-cpu=power8 -O2 -ftree-vectorize -fdump-tree-vect-details -fno-unroll-loops" } */
|
||||
|
||||
#ifndef SIZE
|
||||
#define SIZE 1024
|
||||
|
|
13
gcc/testsuite/gcc.target/powerpc/small-loop-unroll.c
Normal file
13
gcc/testsuite/gcc.target/powerpc/small-loop-unroll.c
Normal file
|
@ -0,0 +1,13 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-O2 -fdump-rtl-loop2_unroll" } */
|
||||
|
||||
void __attribute__ ((noinline)) foo(int n, int *arr)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < n; i++)
|
||||
arr[i] = arr[i] - 10;
|
||||
}
|
||||
/* { dg-final { scan-rtl-dump-times "Unrolled loop 1 times" 1 "loop2_unroll" } } */
|
||||
/* { dg-final { scan-assembler-times {\mlwz\M} 3 } } */
|
||||
/* { dg-final { scan-assembler-times {\mstw\M} 3 } } */
|
||||
|
Loading…
Add table
Reference in a new issue