x86-tune-costs.h (bdver?_memcpy, [...]): Unify to ...
2018-10-11  Richard Biener  <rguenther@suse.de>

	* config/i386/x86-tune-costs.h (bdver?_memcpy, bdver?_memset,
	bdver?_cost): Unify to ...
	(bdver_memcpy, bdver_memset, bdver_cost): ... this.
	* config/i386/i386.c (processor_cost_table): Adjust.

From-SVN: r265038
commit c727b83513
parent c1b48b2929
3 changed files with 17 additions and 316 deletions
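The four Bulldozer entries in processor_cost_table pointed at cost tables that had become line-for-line identical, so the change keeps a single definition and makes all four PROCESSOR_BDVER* slots reference it. A minimal C sketch of that idea (the struct and the names my_costs, bdver_like_cost and cost_table below are illustrative stand-ins, not the real processor_costs declarations):

/* Illustrative sketch only: collapsing several identical per-processor
   cost tables into one shared definition, as this commit does for
   bdver1-4.  The types and field names are invented for the example.  */
struct my_costs { int add; int lea; int branch; };

/* One table, referenced four times from the per-processor lookup array
   (before the change there were four byte-identical tables).  */
static const struct my_costs bdver_like_cost = { 1, 1, 2 };

static const struct my_costs *cost_table[] = {
  &bdver_like_cost,   /* PROCESSOR_BDVER1 */
  &bdver_like_cost,   /* PROCESSOR_BDVER2 */
  &bdver_like_cost,   /* PROCESSOR_BDVER3 */
  &bdver_like_cost,   /* PROCESSOR_BDVER4 */
};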
@@ -1,3 +1,10 @@
+2018-10-11  Richard Biener  <rguenther@suse.de>
+
+	* config/i386/x86-tune-costs.h (bdver?_memcpy, bdver?_memset,
+	bdver?_cost): Unify to ...
+	(bdver_memcpy, bdver_memset, bdver_cost): ... this.
+	* config/i386/i386.c (processor_cost_table): Adjust.
+
 2018-10-10  Eric Botcazou  <ebotcazou@adacore.com>
 
 	PR middle-end/87574
@@ -861,10 +861,10 @@ static const struct processor_costs *processor_cost_table[PROCESSOR_max] =
   &athlon_cost,
   &k8_cost,
   &amdfam10_cost,
-  &bdver1_cost,
-  &bdver2_cost,
-  &bdver3_cost,
-  &bdver4_cost,
+  &bdver_cost,
+  &bdver_cost,
+  &bdver_cost,
+  &bdver_cost,
   &btver1_cost,
   &btver2_cost,
   &znver1_cost,
@@ -1047,21 +1047,21 @@ struct processor_costs amdfam10_cost = {
   "32",                /* Func alignment. */
 };
 
-/* BDVER1 has optimized REP instruction for medium sized blocks, but for
+/* BDVER has optimized REP instruction for medium sized blocks, but for
    very small blocks it is better to use loop. For large blocks, libcall
    can do nontemporary accesses and beat inline considerably. */
-static stringop_algs bdver1_memcpy[2] = {
+static stringop_algs bdver_memcpy[2] = {
   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
              {-1, rep_prefix_4_byte, false}}},
   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
              {-1, libcall, false}}}};
-static stringop_algs bdver1_memset[2] = {
+static stringop_algs bdver_memset[2] = {
   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
              {-1, libcall, false}}}};
 
-const struct processor_costs bdver1_cost = {
+const struct processor_costs bdver_cost = {
   COSTS_N_INSNS (1),   /* cost of an add instruction */
   COSTS_N_INSNS (1),   /* cost of a lea instruction */
   COSTS_N_INSNS (1),   /* variable shift costs */
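For reference, the stringop_algs tables renamed above encode size thresholds for expanding block moves and stores: as I read them, the leading libcall is used when the size is unknown at compile time, and each {max, alg, noalign} entry applies to known sizes up to max bytes, with max == -1 acting as a catch-all for anything larger. A standalone sketch of how such a threshold table is consulted (illustrative only, not GCC's actual expander code; the enum values merely mirror the names used in bdver_memcpy):

/* Illustrative sketch: reading a threshold table like bdver_memcpy.
   Sizes up to 6 bytes -> simple loop, up to 14 -> unrolled loop,
   anything larger -> rep-prefixed move.  Not GCC's real data types.  */
#include <stdio.h>

enum alg { ALG_LOOP, ALG_UNROLLED_LOOP, ALG_REP_PREFIX_4_BYTE };

/* max == -1 means "no upper limit".  */
struct entry { long max; enum alg alg; };

static const struct entry memcpy_table[] = {
  { 6, ALG_LOOP },                 /* up to 6 bytes: simple loop */
  { 14, ALG_UNROLLED_LOOP },       /* up to 14 bytes: unrolled loop */
  { -1, ALG_REP_PREFIX_4_BYTE },   /* anything larger: rep movsd */
};

static enum alg pick_alg (long size)
{
  /* Walk the entries in order; the -1 catch-all guarantees a match.  */
  for (unsigned i = 0; ; i++)
    if (memcpy_table[i].max == -1 || size <= memcpy_table[i].max)
      return memcpy_table[i].alg;
}

int main (void)
{
  /* 4 -> loop, 10 -> unrolled loop, 100 -> rep prefix.  */
  printf ("%d %d %d\n", pick_alg (4), pick_alg (10), pick_alg (100));
  return 0;
}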
@@ -1139,314 +1139,8 @@ const struct processor_costs bdver1_cost = {
   COSTS_N_INSNS (15),  /* cost of SQRTSS instruction. */
   COSTS_N_INSNS (26),  /* cost of SQRTSD instruction. */
   1, 2, 1, 1,          /* reassoc int, fp, vec_int, vec_fp. */
-  bdver1_memcpy,
-  bdver1_memset,
-  COSTS_N_INSNS (4),   /* cond_taken_branch_cost. */
-  COSTS_N_INSNS (2),   /* cond_not_taken_branch_cost. */
-  "16:11:8",           /* Loop alignment. */
-  "16:8:8",            /* Jump alignment. */
-  "0:0:8",             /* Label alignment. */
-  "11",                /* Func alignment. */
-};
-
-/* BDVER2 has optimized REP instruction for medium sized blocks, but for
-   very small blocks it is better to use loop. For large blocks, libcall
-   can do nontemporary accesses and beat inline considerably. */
-
-static stringop_algs bdver2_memcpy[2] = {
-  {libcall, {{6, loop, false}, {14, unrolled_loop, false},
-             {-1, rep_prefix_4_byte, false}}},
-  {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
-             {-1, libcall, false}}}};
-static stringop_algs bdver2_memset[2] = {
-  {libcall, {{8, loop, false}, {24, unrolled_loop, false},
-             {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
-  {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
-             {-1, libcall, false}}}};
-
-const struct processor_costs bdver2_cost = {
-  COSTS_N_INSNS (1),   /* cost of an add instruction */
-  COSTS_N_INSNS (1),   /* cost of a lea instruction */
-  COSTS_N_INSNS (1),   /* variable shift costs */
-  COSTS_N_INSNS (1),   /* constant shift costs */
-  {COSTS_N_INSNS (4),  /* cost of starting multiply for QI */
-   COSTS_N_INSNS (4),  /* HI */
-   COSTS_N_INSNS (4),  /* SI */
-   COSTS_N_INSNS (6),  /* DI */
-   COSTS_N_INSNS (6)}, /* other */
-  0,                   /* cost of multiply per each bit set */
-  {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
-   COSTS_N_INSNS (35), /* HI */
-   COSTS_N_INSNS (51), /* SI */
-   COSTS_N_INSNS (83), /* DI */
-   COSTS_N_INSNS (83)}, /* other */
-  COSTS_N_INSNS (1),   /* cost of movsx */
-  COSTS_N_INSNS (1),   /* cost of movzx */
-  8,                   /* "large" insn */
-  9,                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  8,                   /* cost for loading QImode using movzbl */
-  {8, 8, 8},           /* cost of loading integer registers
-                          in QImode, HImode and SImode.
-                          Relative to reg-reg move (2). */
-  {8, 8, 8},           /* cost of storing integer registers */
-  4,                   /* cost of reg,reg fld/fst */
-  {12, 12, 28},        /* cost of loading fp registers
-                          in SFmode, DFmode and XFmode */
-  {10, 10, 18},        /* cost of storing fp registers
-                          in SFmode, DFmode and XFmode */
-  4,                   /* cost of moving MMX register */
-  {12, 12},            /* cost of loading MMX registers
-                          in SImode and DImode */
-  {10, 10},            /* cost of storing MMX registers
-                          in SImode and DImode */
-  2, 4, 8,             /* cost of moving XMM,YMM,ZMM register */
-  {12, 12, 10, 20, 30}, /* cost of loading SSE registers
-                          in 32,64,128,256 and 512-bit */
-  {12, 12, 10, 20, 30}, /* cost of unaligned loads. */
-  {10, 10, 10, 20, 30}, /* cost of storing SSE registers
-                          in 32,64,128,256 and 512-bit */
-  {10, 10, 10, 20, 30}, /* cost of unaligned stores. */
-  16, 20,              /* SSE->integer and integer->SSE moves */
-  12, 12,              /* Gather load static, per_elt. */
-  10, 10,              /* Gather store static, per_elt. */
-  16,                  /* size of l1 cache. */
-  2048,                /* size of l2 cache. */
-  64,                  /* size of prefetch block */
-  /* New AMD processors never drop prefetches; if they cannot be performed
-     immediately, they are queued. We set number of simultaneous prefetches
-     to a large constant to reflect this (it probably is not a good idea not
-     to limit number of prefetches at all, as their execution also takes some
-     time). */
-  100,                 /* number of parallel prefetches */
-  2,                   /* Branch cost */
-  COSTS_N_INSNS (6),   /* cost of FADD and FSUB insns. */
-  COSTS_N_INSNS (6),   /* cost of FMUL instruction. */
-  COSTS_N_INSNS (42),  /* cost of FDIV instruction. */
-  COSTS_N_INSNS (2),   /* cost of FABS instruction. */
-  COSTS_N_INSNS (2),   /* cost of FCHS instruction. */
-  COSTS_N_INSNS (52),  /* cost of FSQRT instruction. */
-
-  COSTS_N_INSNS (2),   /* cost of cheap SSE instruction. */
-  COSTS_N_INSNS (6),   /* cost of ADDSS/SD SUBSS/SD insns. */
-  COSTS_N_INSNS (6),   /* cost of MULSS instruction. */
-  COSTS_N_INSNS (6),   /* cost of MULSD instruction. */
-  COSTS_N_INSNS (6),   /* cost of FMA SS instruction. */
-  COSTS_N_INSNS (6),   /* cost of FMA SD instruction. */
-  /* 9-24 */
-  COSTS_N_INSNS (24),  /* cost of DIVSS instruction. */
-  /* 9-27 */
-  COSTS_N_INSNS (27),  /* cost of DIVSD instruction. */
-  COSTS_N_INSNS (15),  /* cost of SQRTSS instruction. */
-  COSTS_N_INSNS (26),  /* cost of SQRTSD instruction. */
-  1, 2, 1, 1,          /* reassoc int, fp, vec_int, vec_fp. */
-  bdver2_memcpy,
-  bdver2_memset,
-  COSTS_N_INSNS (4),   /* cond_taken_branch_cost. */
-  COSTS_N_INSNS (2),   /* cond_not_taken_branch_cost. */
-  "16:11:8",           /* Loop alignment. */
-  "16:8:8",            /* Jump alignment. */
-  "0:0:8",             /* Label alignment. */
-  "11",                /* Func alignment. */
-};
-
-
-/* BDVER3 has optimized REP instruction for medium sized blocks, but for
-   very small blocks it is better to use loop. For large blocks, libcall
-   can do nontemporary accesses and beat inline considerably. */
-static stringop_algs bdver3_memcpy[2] = {
-  {libcall, {{6, loop, false}, {14, unrolled_loop, false},
-             {-1, rep_prefix_4_byte, false}}},
-  {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
-             {-1, libcall, false}}}};
-static stringop_algs bdver3_memset[2] = {
-  {libcall, {{8, loop, false}, {24, unrolled_loop, false},
-             {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
-  {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
-             {-1, libcall, false}}}};
-struct processor_costs bdver3_cost = {
-  COSTS_N_INSNS (1),   /* cost of an add instruction */
-  COSTS_N_INSNS (1),   /* cost of a lea instruction */
-  COSTS_N_INSNS (1),   /* variable shift costs */
-  COSTS_N_INSNS (1),   /* constant shift costs */
-  {COSTS_N_INSNS (4),  /* cost of starting multiply for QI */
-   COSTS_N_INSNS (4),  /* HI */
-   COSTS_N_INSNS (4),  /* SI */
-   COSTS_N_INSNS (6),  /* DI */
-   COSTS_N_INSNS (6)}, /* other */
-  0,                   /* cost of multiply per each bit set */
-  {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
-   COSTS_N_INSNS (35), /* HI */
-   COSTS_N_INSNS (51), /* SI */
-   COSTS_N_INSNS (83), /* DI */
-   COSTS_N_INSNS (83)}, /* other */
-  COSTS_N_INSNS (1),   /* cost of movsx */
-  COSTS_N_INSNS (1),   /* cost of movzx */
-  8,                   /* "large" insn */
-  9,                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  8,                   /* cost for loading QImode using movzbl */
-  {8, 8, 8},           /* cost of loading integer registers
-                          in QImode, HImode and SImode.
-                          Relative to reg-reg move (2). */
-  {8, 8, 8},           /* cost of storing integer registers */
-  4,                   /* cost of reg,reg fld/fst */
-  {12, 12, 28},        /* cost of loading fp registers
-                          in SFmode, DFmode and XFmode */
-  {10, 10, 18},        /* cost of storing fp registers
-                          in SFmode, DFmode and XFmode */
-  4,                   /* cost of moving MMX register */
-  {12, 12},            /* cost of loading MMX registers
-                          in SImode and DImode */
-  {10, 10},            /* cost of storing MMX registers
-                          in SImode and DImode */
-  2, 4, 8,             /* cost of moving XMM,YMM,ZMM register */
-  {12, 12, 10, 20, 30}, /* cost of loading SSE registers
-                          in 32,64,128,256 and 512-bit */
-  {12, 12, 10, 20, 30}, /* cost of unaligned loads. */
-  {10, 10, 10, 20, 30}, /* cost of storing SSE registers
-                          in 32,64,128,256 and 512-bit */
-  {10, 10, 10, 20, 30}, /* cost of unaligned stores. */
-  16, 20,              /* SSE->integer and integer->SSE moves */
-  12, 12,              /* Gather load static, per_elt. */
-  10, 10,              /* Gather store static, per_elt. */
-  16,                  /* size of l1 cache. */
-  2048,                /* size of l2 cache. */
-  64,                  /* size of prefetch block */
-  /* New AMD processors never drop prefetches; if they cannot be performed
-     immediately, they are queued. We set number of simultaneous prefetches
-     to a large constant to reflect this (it probably is not a good idea not
-     to limit number of prefetches at all, as their execution also takes some
-     time). */
-  100,                 /* number of parallel prefetches */
-  2,                   /* Branch cost */
-  COSTS_N_INSNS (6),   /* cost of FADD and FSUB insns. */
-  COSTS_N_INSNS (6),   /* cost of FMUL instruction. */
-  COSTS_N_INSNS (42),  /* cost of FDIV instruction. */
-  COSTS_N_INSNS (2),   /* cost of FABS instruction. */
-  COSTS_N_INSNS (2),   /* cost of FCHS instruction. */
-  COSTS_N_INSNS (52),  /* cost of FSQRT instruction. */
-
-  COSTS_N_INSNS (2),   /* cost of cheap SSE instruction. */
-  COSTS_N_INSNS (6),   /* cost of ADDSS/SD SUBSS/SD insns. */
-  COSTS_N_INSNS (6),   /* cost of MULSS instruction. */
-  COSTS_N_INSNS (6),   /* cost of MULSD instruction. */
-  COSTS_N_INSNS (6),   /* cost of FMA SS instruction. */
-  COSTS_N_INSNS (6),   /* cost of FMA SD instruction. */
-  /* 9-24 */
-  COSTS_N_INSNS (24),  /* cost of DIVSS instruction. */
-  /* 9-27 */
-  COSTS_N_INSNS (27),  /* cost of DIVSD instruction. */
-  COSTS_N_INSNS (15),  /* cost of SQRTSS instruction. */
-  COSTS_N_INSNS (26),  /* cost of SQRTSD instruction. */
-  1, 2, 1, 1,          /* reassoc int, fp, vec_int, vec_fp. */
-  bdver3_memcpy,
-  bdver3_memset,
-  COSTS_N_INSNS (4),   /* cond_taken_branch_cost. */
-  COSTS_N_INSNS (2),   /* cond_not_taken_branch_cost. */
-  "16:11:8",           /* Loop alignment. */
-  "16:8:8",            /* Jump alignment. */
-  "0:0:8",             /* Label alignment. */
-  "11",                /* Func alignment. */
-};
-
-/* BDVER4 has optimized REP instruction for medium sized blocks, but for
-   very small blocks it is better to use loop. For large blocks, libcall
-   can do nontemporary accesses and beat inline considerably. */
-static stringop_algs bdver4_memcpy[2] = {
-  {libcall, {{6, loop, false}, {14, unrolled_loop, false},
-             {-1, rep_prefix_4_byte, false}}},
-  {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
-             {-1, libcall, false}}}};
-static stringop_algs bdver4_memset[2] = {
-  {libcall, {{8, loop, false}, {24, unrolled_loop, false},
-             {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
-  {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
-             {-1, libcall, false}}}};
-struct processor_costs bdver4_cost = {
-  COSTS_N_INSNS (1),   /* cost of an add instruction */
-  COSTS_N_INSNS (1),   /* cost of a lea instruction */
-  COSTS_N_INSNS (1),   /* variable shift costs */
-  COSTS_N_INSNS (1),   /* constant shift costs */
-  {COSTS_N_INSNS (4),  /* cost of starting multiply for QI */
-   COSTS_N_INSNS (4),  /* HI */
-   COSTS_N_INSNS (4),  /* SI */
-   COSTS_N_INSNS (6),  /* DI */
-   COSTS_N_INSNS (6)}, /* other */
-  0,                   /* cost of multiply per each bit set */
-  {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
-   COSTS_N_INSNS (35), /* HI */
-   COSTS_N_INSNS (51), /* SI */
-   COSTS_N_INSNS (83), /* DI */
-   COSTS_N_INSNS (83)}, /* other */
-  COSTS_N_INSNS (1),   /* cost of movsx */
-  COSTS_N_INSNS (1),   /* cost of movzx */
-  8,                   /* "large" insn */
-  9,                   /* MOVE_RATIO */
-
-  /* All move costs are relative to integer->integer move times 2 and thus
-     they are latency*2. */
-  8,                   /* cost for loading QImode using movzbl */
-  {8, 8, 8},           /* cost of loading integer registers
-                          in QImode, HImode and SImode.
-                          Relative to reg-reg move (2). */
-  {8, 8, 8},           /* cost of storing integer registers */
-  4,                   /* cost of reg,reg fld/fst */
-  {12, 12, 28},        /* cost of loading fp registers
-                          in SFmode, DFmode and XFmode */
-  {10, 10, 18},        /* cost of storing fp registers
-                          in SFmode, DFmode and XFmode */
-  4,                   /* cost of moving MMX register */
-  {12, 12},            /* cost of loading MMX registers
-                          in SImode and DImode */
-  {10, 10},            /* cost of storing MMX registers
-                          in SImode and DImode */
-  2, 4, 8,             /* cost of moving XMM,YMM,ZMM register */
-  {12, 12, 10, 20, 30}, /* cost of loading SSE registers
-                          in 32,64,128,256 and 512-bit */
-  {12, 12, 10, 20, 30}, /* cost of unaligned loads. */
-  {10, 10, 10, 20, 30}, /* cost of storing SSE registers
-                          in 32,64,128,256 and 512-bit */
-  {10, 10, 10, 20, 30}, /* cost of unaligned stores. */
-  16, 20,              /* SSE->integer and integer->SSE moves */
-  12, 12,              /* Gather load static, per_elt. */
-  10, 10,              /* Gather store static, per_elt. */
-  16,                  /* size of l1 cache. */
-  2048,                /* size of l2 cache. */
-  64,                  /* size of prefetch block */
-  /* New AMD processors never drop prefetches; if they cannot be performed
-     immediately, they are queued. We set number of simultaneous prefetches
-     to a large constant to reflect this (it probably is not a good idea not
-     to limit number of prefetches at all, as their execution also takes some
-     time). */
-  100,                 /* number of parallel prefetches */
-  2,                   /* Branch cost */
-  COSTS_N_INSNS (6),   /* cost of FADD and FSUB insns. */
-  COSTS_N_INSNS (6),   /* cost of FMUL instruction. */
-  COSTS_N_INSNS (42),  /* cost of FDIV instruction. */
-  COSTS_N_INSNS (2),   /* cost of FABS instruction. */
-  COSTS_N_INSNS (2),   /* cost of FCHS instruction. */
-  COSTS_N_INSNS (52),  /* cost of FSQRT instruction. */
-
-  COSTS_N_INSNS (2),   /* cost of cheap SSE instruction. */
-  COSTS_N_INSNS (6),   /* cost of ADDSS/SD SUBSS/SD insns. */
-  COSTS_N_INSNS (6),   /* cost of MULSS instruction. */
-  COSTS_N_INSNS (6),   /* cost of MULSD instruction. */
-  COSTS_N_INSNS (6),   /* cost of FMA SS instruction. */
-  COSTS_N_INSNS (6),   /* cost of FMA SD instruction. */
-  /* 9-24 */
-  COSTS_N_INSNS (24),  /* cost of DIVSS instruction. */
-  /* 9-27 */
-  COSTS_N_INSNS (27),  /* cost of DIVSD instruction. */
-  COSTS_N_INSNS (15),  /* cost of SQRTSS instruction. */
-  COSTS_N_INSNS (26),  /* cost of SQRTSD instruction. */
-  1, 2, 1, 1,          /* reassoc int, fp, vec_int, vec_fp. */
-  bdver4_memcpy,
-  bdver4_memset,
+  bdver_memcpy,
+  bdver_memset,
   COSTS_N_INSNS (4),   /* cond_taken_branch_cost. */
   COSTS_N_INSNS (2),   /* cond_not_taken_branch_cost. */
   "16:11:8",           /* Loop alignment. */