AArch64: Add fma_reassoc_width [PR107413]
Add a reassociation width for FMA in per-CPU tuning structures. Keep the existing setting of 1 for cores with 2 FMA pipes (this disables reassociation), and use 4 for cores with 4 FMA pipes. This improves SPECFP2017 on Neoverse V1 by ~1.5%. gcc/ PR tree-optimization/107413 * config/aarch64/aarch64.cc (struct tune_params): Add fma_reassoc_width to all CPU tuning structures. (aarch64_reassociation_width): Use fma_reassoc_width. * config/aarch64/aarch64-protos.h (struct tune_params): Add fma_reassoc_width.
This commit is contained in:
parent
138ee8f745
commit
0c1b0a23f1
2 changed files with 34 additions and 3 deletions
|
@ -540,6 +540,7 @@ struct tune_params
|
|||
const char *loop_align;
|
||||
int int_reassoc_width;
|
||||
int fp_reassoc_width;
|
||||
int fma_reassoc_width;
|
||||
int vec_reassoc_width;
|
||||
int min_div_recip_mul_sf;
|
||||
int min_div_recip_mul_df;
|
||||
|
|
|
@ -1346,6 +1346,7 @@ static const struct tune_params generic_tunings =
|
|||
"8", /* loop_align. */
|
||||
2, /* int_reassoc_width. */
|
||||
4, /* fp_reassoc_width. */
|
||||
1, /* fma_reassoc_width. */
|
||||
1, /* vec_reassoc_width. */
|
||||
2, /* min_div_recip_mul_sf. */
|
||||
2, /* min_div_recip_mul_df. */
|
||||
|
@ -1382,6 +1383,7 @@ static const struct tune_params cortexa35_tunings =
|
|||
"8", /* loop_align. */
|
||||
2, /* int_reassoc_width. */
|
||||
4, /* fp_reassoc_width. */
|
||||
1, /* fma_reassoc_width. */
|
||||
1, /* vec_reassoc_width. */
|
||||
2, /* min_div_recip_mul_sf. */
|
||||
2, /* min_div_recip_mul_df. */
|
||||
|
@ -1415,6 +1417,7 @@ static const struct tune_params cortexa53_tunings =
|
|||
"8", /* loop_align. */
|
||||
2, /* int_reassoc_width. */
|
||||
4, /* fp_reassoc_width. */
|
||||
1, /* fma_reassoc_width. */
|
||||
1, /* vec_reassoc_width. */
|
||||
2, /* min_div_recip_mul_sf. */
|
||||
2, /* min_div_recip_mul_df. */
|
||||
|
@ -1448,6 +1451,7 @@ static const struct tune_params cortexa57_tunings =
|
|||
"8", /* loop_align. */
|
||||
2, /* int_reassoc_width. */
|
||||
4, /* fp_reassoc_width. */
|
||||
1, /* fma_reassoc_width. */
|
||||
1, /* vec_reassoc_width. */
|
||||
2, /* min_div_recip_mul_sf. */
|
||||
2, /* min_div_recip_mul_df. */
|
||||
|
@ -1481,6 +1485,7 @@ static const struct tune_params cortexa72_tunings =
|
|||
"8", /* loop_align. */
|
||||
2, /* int_reassoc_width. */
|
||||
4, /* fp_reassoc_width. */
|
||||
1, /* fma_reassoc_width. */
|
||||
1, /* vec_reassoc_width. */
|
||||
2, /* min_div_recip_mul_sf. */
|
||||
2, /* min_div_recip_mul_df. */
|
||||
|
@ -1514,6 +1519,7 @@ static const struct tune_params cortexa73_tunings =
|
|||
"8", /* loop_align. */
|
||||
2, /* int_reassoc_width. */
|
||||
4, /* fp_reassoc_width. */
|
||||
1, /* fma_reassoc_width. */
|
||||
1, /* vec_reassoc_width. */
|
||||
2, /* min_div_recip_mul_sf. */
|
||||
2, /* min_div_recip_mul_df. */
|
||||
|
@ -1548,6 +1554,7 @@ static const struct tune_params exynosm1_tunings =
|
|||
"4", /* loop_align. */
|
||||
2, /* int_reassoc_width. */
|
||||
4, /* fp_reassoc_width. */
|
||||
1, /* fma_reassoc_width. */
|
||||
1, /* vec_reassoc_width. */
|
||||
2, /* min_div_recip_mul_sf. */
|
||||
2, /* min_div_recip_mul_df. */
|
||||
|
@ -1580,6 +1587,7 @@ static const struct tune_params thunderxt88_tunings =
|
|||
"8", /* loop_align. */
|
||||
2, /* int_reassoc_width. */
|
||||
4, /* fp_reassoc_width. */
|
||||
1, /* fma_reassoc_width. */
|
||||
1, /* vec_reassoc_width. */
|
||||
2, /* min_div_recip_mul_sf. */
|
||||
2, /* min_div_recip_mul_df. */
|
||||
|
@ -1612,6 +1620,7 @@ static const struct tune_params thunderx_tunings =
|
|||
"8", /* loop_align. */
|
||||
2, /* int_reassoc_width. */
|
||||
4, /* fp_reassoc_width. */
|
||||
1, /* fma_reassoc_width. */
|
||||
1, /* vec_reassoc_width. */
|
||||
2, /* min_div_recip_mul_sf. */
|
||||
2, /* min_div_recip_mul_df. */
|
||||
|
@ -1646,6 +1655,7 @@ static const struct tune_params tsv110_tunings =
|
|||
"8", /* loop_align. */
|
||||
2, /* int_reassoc_width. */
|
||||
4, /* fp_reassoc_width. */
|
||||
1, /* fma_reassoc_width. */
|
||||
1, /* vec_reassoc_width. */
|
||||
2, /* min_div_recip_mul_sf. */
|
||||
2, /* min_div_recip_mul_df. */
|
||||
|
@ -1678,6 +1688,7 @@ static const struct tune_params xgene1_tunings =
|
|||
"16", /* loop_align. */
|
||||
2, /* int_reassoc_width. */
|
||||
4, /* fp_reassoc_width. */
|
||||
1, /* fma_reassoc_width. */
|
||||
1, /* vec_reassoc_width. */
|
||||
2, /* min_div_recip_mul_sf. */
|
||||
2, /* min_div_recip_mul_df. */
|
||||
|
@ -1710,6 +1721,7 @@ static const struct tune_params emag_tunings =
|
|||
"16", /* loop_align. */
|
||||
2, /* int_reassoc_width. */
|
||||
4, /* fp_reassoc_width. */
|
||||
1, /* fma_reassoc_width. */
|
||||
1, /* vec_reassoc_width. */
|
||||
2, /* min_div_recip_mul_sf. */
|
||||
2, /* min_div_recip_mul_df. */
|
||||
|
@ -1743,6 +1755,7 @@ static const struct tune_params qdf24xx_tunings =
|
|||
"16", /* loop_align. */
|
||||
2, /* int_reassoc_width. */
|
||||
4, /* fp_reassoc_width. */
|
||||
1, /* fma_reassoc_width. */
|
||||
1, /* vec_reassoc_width. */
|
||||
2, /* min_div_recip_mul_sf. */
|
||||
2, /* min_div_recip_mul_df. */
|
||||
|
@ -1778,6 +1791,7 @@ static const struct tune_params saphira_tunings =
|
|||
"16", /* loop_align. */
|
||||
2, /* int_reassoc_width. */
|
||||
4, /* fp_reassoc_width. */
|
||||
1, /* fma_reassoc_width. */
|
||||
1, /* vec_reassoc_width. */
|
||||
2, /* min_div_recip_mul_sf. */
|
||||
2, /* min_div_recip_mul_df. */
|
||||
|
@ -1811,6 +1825,7 @@ static const struct tune_params thunderx2t99_tunings =
|
|||
"16", /* loop_align. */
|
||||
3, /* int_reassoc_width. */
|
||||
2, /* fp_reassoc_width. */
|
||||
1, /* fma_reassoc_width. */
|
||||
2, /* vec_reassoc_width. */
|
||||
2, /* min_div_recip_mul_sf. */
|
||||
2, /* min_div_recip_mul_df. */
|
||||
|
@ -1844,6 +1859,7 @@ static const struct tune_params thunderx3t110_tunings =
|
|||
"16", /* loop_align. */
|
||||
3, /* int_reassoc_width. */
|
||||
2, /* fp_reassoc_width. */
|
||||
1, /* fma_reassoc_width. */
|
||||
2, /* vec_reassoc_width. */
|
||||
2, /* min_div_recip_mul_sf. */
|
||||
2, /* min_div_recip_mul_df. */
|
||||
|
@ -1876,6 +1892,7 @@ static const struct tune_params neoversen1_tunings =
|
|||
"32:16", /* loop_align. */
|
||||
2, /* int_reassoc_width. */
|
||||
4, /* fp_reassoc_width. */
|
||||
1, /* fma_reassoc_width. */
|
||||
2, /* vec_reassoc_width. */
|
||||
2, /* min_div_recip_mul_sf. */
|
||||
2, /* min_div_recip_mul_df. */
|
||||
|
@ -1912,6 +1929,7 @@ static const struct tune_params ampere1_tunings =
|
|||
"32:16", /* loop_align. */
|
||||
2, /* int_reassoc_width. */
|
||||
4, /* fp_reassoc_width. */
|
||||
1, /* fma_reassoc_width. */
|
||||
2, /* vec_reassoc_width. */
|
||||
2, /* min_div_recip_mul_sf. */
|
||||
2, /* min_div_recip_mul_df. */
|
||||
|
@ -1949,6 +1967,7 @@ static const struct tune_params ampere1a_tunings =
|
|||
"32:16", /* loop_align. */
|
||||
2, /* int_reassoc_width. */
|
||||
4, /* fp_reassoc_width. */
|
||||
1, /* fma_reassoc_width. */
|
||||
2, /* vec_reassoc_width. */
|
||||
2, /* min_div_recip_mul_sf. */
|
||||
2, /* min_div_recip_mul_df. */
|
||||
|
@ -2126,6 +2145,7 @@ static const struct tune_params neoversev1_tunings =
|
|||
"32:16", /* loop_align. */
|
||||
2, /* int_reassoc_width. */
|
||||
4, /* fp_reassoc_width. */
|
||||
4, /* fma_reassoc_width. */
|
||||
2, /* vec_reassoc_width. */
|
||||
2, /* min_div_recip_mul_sf. */
|
||||
2, /* min_div_recip_mul_df. */
|
||||
|
@ -2263,6 +2283,7 @@ static const struct tune_params neoverse512tvb_tunings =
|
|||
"32:16", /* loop_align. */
|
||||
2, /* int_reassoc_width. */
|
||||
4, /* fp_reassoc_width. */
|
||||
4, /* fma_reassoc_width. */
|
||||
2, /* vec_reassoc_width. */
|
||||
2, /* min_div_recip_mul_sf. */
|
||||
2, /* min_div_recip_mul_df. */
|
||||
|
@ -2451,6 +2472,7 @@ static const struct tune_params neoversen2_tunings =
|
|||
"32:16", /* loop_align. */
|
||||
2, /* int_reassoc_width. */
|
||||
4, /* fp_reassoc_width. */
|
||||
1, /* fma_reassoc_width. */
|
||||
2, /* vec_reassoc_width. */
|
||||
2, /* min_div_recip_mul_sf. */
|
||||
2, /* min_div_recip_mul_df. */
|
||||
|
@ -2640,6 +2662,7 @@ static const struct tune_params neoversev2_tunings =
|
|||
"32:16", /* loop_align. */
|
||||
3, /* int_reassoc_width. */
|
||||
6, /* fp_reassoc_width. */
|
||||
4, /* fma_reassoc_width. */
|
||||
3, /* vec_reassoc_width. */
|
||||
2, /* min_div_recip_mul_sf. */
|
||||
2, /* min_div_recip_mul_df. */
|
||||
|
@ -2675,6 +2698,7 @@ static const struct tune_params a64fx_tunings =
|
|||
"32", /* loop_align. */
|
||||
4, /* int_reassoc_width. */
|
||||
2, /* fp_reassoc_width. */
|
||||
1, /* fma_reassoc_width. */
|
||||
2, /* vec_reassoc_width. */
|
||||
2, /* min_div_recip_mul_sf. */
|
||||
2, /* min_div_recip_mul_df. */
|
||||
|
@ -3387,9 +3411,15 @@ aarch64_reassociation_width (unsigned opc, machine_mode mode)
|
|||
return aarch64_tune_params.vec_reassoc_width;
|
||||
if (INTEGRAL_MODE_P (mode))
|
||||
return aarch64_tune_params.int_reassoc_width;
|
||||
/* Avoid reassociating floating point addition so we emit more FMAs. */
|
||||
if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
|
||||
return aarch64_tune_params.fp_reassoc_width;
|
||||
/* Reassociation reduces the number of FMAs which may result in worse
|
||||
performance. Use a per-CPU setting for FMA reassociation which allows
|
||||
narrow CPUs with few FP pipes to switch it off (value of 1), and wider
|
||||
CPUs with many FP pipes to enable reassociation.
|
||||
Since the reassociation pass doesn't understand FMA at all, assume
|
||||
that any FP addition might turn into FMA. */
|
||||
if (FLOAT_MODE_P (mode))
|
||||
return opc == PLUS_EXPR ? aarch64_tune_params.fma_reassoc_width
|
||||
: aarch64_tune_params.fp_reassoc_width;
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue