i386-c.c (ix86_target_macros_internal): Update handling of core chips.

* i386-c.c (ix86_target_macros_internal): Update handling of core
	chips.
	* i386.c (DUMMY_STRINGOP_ALGS): Update.
	(*_cost): Update.
	(core_cost): Copy from generic64_cost; fix stringop descriptors.
	(m_CORE2_32, m_CORE2_64, m_COREI7_32, m_COREI7_64, m_CORE2I7_32,
	m_CORE2I7_64): Remove.
	(m_CORE2, m_COREI7, m_CORE2I7): Update.
	(initial_ix86_tune_features): Update.
	(processor_target_table): Update.
	(ix86_option_override_internal): Update.
	(ix86_option_override_internal): Remove PROCESSOR_CORE2_64,
	PROCESSOR_COREI7_64 special cases.
	(decide_alg): Add noalign parameter; set it.
	(ix86_expand_movmem, ix86_expand_setmem): Update noalign handling.
	(ix86_issue_rate): Update.
	(ia32_multipass_dfa_lookahead): Update.
	(ix86_sched_init_global): Update.
	(get_builtin_code_for_version): Update.
	* i386.h (stringop_strategy): Add noalign flag.
	(TARGET_CORE2_32, TARGET_CORE2_64, TARGET_COREI7_32, TARGET_COREI7_64):
	Remove.
	(TARGET_CORE2, TARGET_COREI7): New.
	(enum processor_type): Remove PROCESSOR_CORE2_32, PROCESSOR_CORE2_64,
	PROCESSOR_COREI7_32, PROCESSOR_COREI7_64; add PROCESSOR_CORE2,
	PROCESSOR_COREI7.

From-SVN: r193761

Author: Jan Hubicka <jh@suse.cz>
Date:   2012-11-23 17:02:09 +01:00
Commit: 340ef734b8 (parent ee88e690a2)

4 changed files with 265 additions and 153 deletions

--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,32 @@
+2012-11-23  Jan Hubicka  <jh@suse.cz>
+
+	* i386-c.c (ix86_target_macros_internal): Update handling of core
+	chips.
+	* i386.c (DUMMY_STRINGOP_ALGS): Update.
+	(*_cost): Update.
+	(core_cost): Copy from generic64_cost; fix stringop descriptors.
+	(m_CORE2_32, m_CORE2_64, m_COREI7_32, m_COREI7_64, m_CORE2I7_32,
+	m_CORE2I7_64): Remove.
+	(m_CORE2, m_COREI7, m_CORE2I7): Update.
+	(initial_ix86_tune_features): Update.
+	(processor_target_table): Update.
+	(ix86_option_override_internal): Update.
+	(ix86_option_override_internal): Remove PROCESSOR_CORE2_64,
+	PROCESSOR_COREI7_64 special cases.
+	(decide_alg): Add noalign parameter; set it.
+	(ix86_expand_movmem, ix86_expand_setmem): Update noalign handling.
+	(ix86_issue_rate): Update.
+	(ia32_multipass_dfa_lookahead): Update.
+	(ix86_sched_init_global): Update.
+	(get_builtin_code_for_version): Update.
+	* i386.h (stringop_strategy): Add noalign flag.
+	(TARGET_CORE2_32, TARGET_CORE2_64, TARGET_COREI7_32, TARGET_COREI7_64):
+	Remove.
+	(TARGET_CORE2, TARGET_COREI7): New.
+	(enum processor_type): Remove PROCESSOR_CORE2_32, PROCESSOR_CORE2_64,
+	PROCESSOR_COREI7_32, PROCESSOR_COREI7_64; add PROCESSOR_CORE2,
+	PROCESSOR_COREI7.
+
 2012-11-23  Eric Botcazou  <ebotcazou@adacore.com>
 
 	PR rtl-optimization/55388

--- a/gcc/config/i386/i386-c.c
+++ b/gcc/config/i386/i386-c.c
@@ -134,13 +134,11 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
 def_or_undef (parse_in, "__nocona");
 def_or_undef (parse_in, "__nocona__");
 break;
-case PROCESSOR_CORE2_32:
-case PROCESSOR_CORE2_64:
+case PROCESSOR_CORE2:
 def_or_undef (parse_in, "__core2");
 def_or_undef (parse_in, "__core2__");
 break;
-case PROCESSOR_COREI7_32:
-case PROCESSOR_COREI7_64:
+case PROCESSOR_COREI7:
 def_or_undef (parse_in, "__corei7");
 def_or_undef (parse_in, "__corei7__");
 break;
@@ -228,12 +226,10 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
 case PROCESSOR_NOCONA:
 def_or_undef (parse_in, "__tune_nocona__");
 break;
-case PROCESSOR_CORE2_32:
-case PROCESSOR_CORE2_64:
+case PROCESSOR_CORE2:
 def_or_undef (parse_in, "__tune_core2__");
 break;
-case PROCESSOR_COREI7_32:
-case PROCESSOR_COREI7_64:
+case PROCESSOR_COREI7:
 def_or_undef (parse_in, "__tune_corei7__");
 break;
 case PROCESSOR_ATOM:
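As an aside, a sketch of how these predefines are consumed by user code (illustrative only, not part of the patch); with the merged enum, -march=core2 now reaches the same PROCESSOR_CORE2 case under both -m32 and -m64, so such tests behave identically in the two modes:

    /* Hypothetical consumer of the predefines toggled above.  */
    #ifdef __tune_core2__
      /* Code path tuned for -mtune=core2 scheduling.  */
    #endif
    #ifdef __corei7
      /* -march selected a Core i7 class CPU.  */
    #endif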

--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -83,7 +83,7 @@ static rtx legitimize_dllimport_symbol (rtx, bool);
 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
 #define COSTS_N_BYTES(N) ((N) * 2)
-#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
+#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
 const
 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
@@ -138,10 +138,10 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
 COSTS_N_BYTES (2), /* cost of FABS instruction. */
 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
-{{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
-{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
-{{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
-{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
+{{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
+{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}},
+{{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
+{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}},
 1, /* scalar_stmt_cost. */
 1, /* scalar load_cost. */
 1, /* scalar_store_cost. */
@@ -209,9 +209,9 @@ struct processor_costs i386_cost = { /* 386 specific costs */
 COSTS_N_INSNS (22), /* cost of FABS instruction. */
 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
-{{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
+{{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
 DUMMY_STRINGOP_ALGS},
-{{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
+{{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
 DUMMY_STRINGOP_ALGS},
 1, /* scalar_stmt_cost. */
 1, /* scalar load_cost. */
@@ -281,9 +281,9 @@ struct processor_costs i486_cost = { /* 486 specific costs */
 COSTS_N_INSNS (3), /* cost of FABS instruction. */
 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
-{{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
+{{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
 DUMMY_STRINGOP_ALGS},
-{{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
+{{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
 DUMMY_STRINGOP_ALGS},
 1, /* scalar_stmt_cost. */
 1, /* scalar load_cost. */
@@ -351,9 +351,9 @@ struct processor_costs pentium_cost = {
 COSTS_N_INSNS (1), /* cost of FABS instruction. */
 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
-{{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
+{{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 DUMMY_STRINGOP_ALGS},
-{{libcall, {{-1, rep_prefix_4_byte}}},
+{{libcall, {{-1, rep_prefix_4_byte, false}}},
 DUMMY_STRINGOP_ALGS},
 1, /* scalar_stmt_cost. */
 1, /* scalar load_cost. */
@@ -426,11 +426,13 @@ struct processor_costs pentiumpro_cost = {
 noticeable win, for bigger blocks either rep movsl or rep movsb is
 way to go. Rep movsb has apparently more expensive startup time in CPU,
 but after 4K the difference is down in the noise. */
-{{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
-{8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
+{{rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
+{8192, rep_prefix_4_byte, false},
+{-1, rep_prefix_1_byte, false}}},
 DUMMY_STRINGOP_ALGS},
-{{rep_prefix_4_byte, {{1024, unrolled_loop},
-{8192, rep_prefix_4_byte}, {-1, libcall}}},
+{{rep_prefix_4_byte, {{1024, unrolled_loop, false},
+{8192, rep_prefix_4_byte, false},
+{-1, libcall, false}}},
 DUMMY_STRINGOP_ALGS},
 1, /* scalar_stmt_cost. */
 1, /* scalar load_cost. */
@@ -499,9 +501,9 @@ struct processor_costs geode_cost = {
 COSTS_N_INSNS (1), /* cost of FABS instruction. */
 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
-{{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
+{{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 DUMMY_STRINGOP_ALGS},
-{{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
+{{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 DUMMY_STRINGOP_ALGS},
 1, /* scalar_stmt_cost. */
 1, /* scalar load_cost. */
@@ -572,9 +574,9 @@ struct processor_costs k6_cost = {
 COSTS_N_INSNS (2), /* cost of FABS instruction. */
 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
-{{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
+{{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 DUMMY_STRINGOP_ALGS},
-{{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
+{{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 DUMMY_STRINGOP_ALGS},
 1, /* scalar_stmt_cost. */
 1, /* scalar load_cost. */
@@ -645,9 +647,9 @@ struct processor_costs athlon_cost = {
 /* For some reason, Athlon deals better with REP prefix (relative to loops)
 compared to K8. Alignment becomes important after 8 bytes for memcpy and
 128 bytes for memset. */
-{{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
+{{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 DUMMY_STRINGOP_ALGS},
-{{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
+{{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 DUMMY_STRINGOP_ALGS},
 1, /* scalar_stmt_cost. */
 1, /* scalar load_cost. */
@@ -723,11 +725,14 @@ struct processor_costs k8_cost = {
 /* K8 has optimized REP instruction for medium sized blocks, but for very
 small blocks it is better to use loop. For large blocks, libcall can
 do nontemporary accesses and beat inline considerably. */
-{{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
-{libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
-{{libcall, {{8, loop}, {24, unrolled_loop},
-{2048, rep_prefix_4_byte}, {-1, libcall}}},
-{libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+{{libcall, {{6, loop, false}, {14, unrolled_loop, false},
+{-1, rep_prefix_4_byte, false}}},
+{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+{-1, libcall, false}}}},
+{{libcall, {{8, loop, false}, {24, unrolled_loop, false},
+{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+{libcall, {{48, unrolled_loop, false},
+{8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
 4, /* scalar_stmt_cost. */
 2, /* scalar load_cost. */
 2, /* scalar_store_cost. */
@@ -810,11 +815,14 @@ struct processor_costs amdfam10_cost = {
 /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
 very small blocks it is better to use loop. For large blocks, libcall can
 do nontemporary accesses and beat inline considerably. */
-{{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
-{libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
-{{libcall, {{8, loop}, {24, unrolled_loop},
-{2048, rep_prefix_4_byte}, {-1, libcall}}},
-{libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+{{libcall, {{6, loop, false}, {14, unrolled_loop, false},
+{-1, rep_prefix_4_byte, false}}},
+{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+{-1, libcall, false}}}},
+{{libcall, {{8, loop, false}, {24, unrolled_loop, false},
+{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+{-1, libcall, false}}}},
 4, /* scalar_stmt_cost. */
 2, /* scalar load_cost. */
 2, /* scalar_store_cost. */
@@ -897,11 +905,14 @@ struct processor_costs bdver1_cost = {
 /* BDVER1 has optimized REP instruction for medium sized blocks, but for
 very small blocks it is better to use loop. For large blocks, libcall
 can do nontemporary accesses and beat inline considerably. */
-{{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
-{libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
-{{libcall, {{8, loop}, {24, unrolled_loop},
-{2048, rep_prefix_4_byte}, {-1, libcall}}},
-{libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+{{libcall, {{6, loop, false}, {14, unrolled_loop, false},
+{-1, rep_prefix_4_byte, false}}},
+{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+{-1, libcall, false}}}},
+{{libcall, {{8, loop, false}, {24, unrolled_loop, false},
+{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+{-1, libcall, false}}}},
 6, /* scalar_stmt_cost. */
 4, /* scalar load_cost. */
 4, /* scalar_store_cost. */
@@ -984,11 +995,14 @@ struct processor_costs bdver2_cost = {
 /* BDVER2 has optimized REP instruction for medium sized blocks, but for
 very small blocks it is better to use loop. For large blocks, libcall
 can do nontemporary accesses and beat inline considerably. */
-{{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
-{libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
-{{libcall, {{8, loop}, {24, unrolled_loop},
-{2048, rep_prefix_4_byte}, {-1, libcall}}},
-{libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+{{libcall, {{6, loop, false}, {14, unrolled_loop, false},
+{-1, rep_prefix_4_byte, false}}},
+{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+{-1, libcall, false}}}},
+{{libcall, {{8, loop, false}, {24, unrolled_loop, false},
+{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+{-1, libcall, false}}}},
 6, /* scalar_stmt_cost. */
 4, /* scalar load_cost. */
 4, /* scalar_store_cost. */
@@ -1063,11 +1077,14 @@ struct processor_costs bdver3_cost = {
 /* BDVER3 has optimized REP instruction for medium sized blocks, but for
 very small blocks it is better to use loop. For large blocks, libcall
 can do nontemporary accesses and beat inline considerably. */
-{{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
-{libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
-{{libcall, {{8, loop}, {24, unrolled_loop},
-{2048, rep_prefix_4_byte}, {-1, libcall}}},
-{libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+{{libcall, {{6, loop, false}, {14, unrolled_loop, false},
+{-1, rep_prefix_4_byte, false}}},
+{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+{-1, libcall, false}}}},
+{{libcall, {{8, loop, false}, {24, unrolled_loop, false},
+{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+{-1, libcall, false}}}},
 6, /* scalar_stmt_cost. */
 4, /* scalar load_cost. */
 4, /* scalar_store_cost. */
@@ -1145,11 +1162,14 @@ struct processor_costs btver1_cost = {
 /* BTVER1 has optimized REP instruction for medium sized blocks, but for
 very small blocks it is better to use loop. For large blocks, libcall can
 do nontemporary accesses and beat inline considerably. */
-{{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
-{libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
-{{libcall, {{8, loop}, {24, unrolled_loop},
-{2048, rep_prefix_4_byte}, {-1, libcall}}},
-{libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+{{libcall, {{6, loop, false}, {14, unrolled_loop, false},
+{-1, rep_prefix_4_byte, false}}},
+{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+{-1, libcall, false}}}},
+{{libcall, {{8, loop, false}, {24, unrolled_loop, false},
+{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+{-1, libcall, false}}}},
 4, /* scalar_stmt_cost. */
 2, /* scalar load_cost. */
 2, /* scalar_store_cost. */
@@ -1224,11 +1244,14 @@ struct processor_costs btver2_cost = {
 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
-{{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
-{libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
-{{libcall, {{8, loop}, {24, unrolled_loop},
-{2048, rep_prefix_4_byte}, {-1, libcall}}},
-{libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+{{libcall, {{6, loop, false}, {14, unrolled_loop, false},
+{-1, rep_prefix_4_byte, false}}},
+{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+{-1, libcall, false}}}},
+{{libcall, {{8, loop, false}, {24, unrolled_loop, false},
+{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+{-1, libcall, false}}}},
 4, /* scalar_stmt_cost. */
 2, /* scalar load_cost. */
 2, /* scalar_store_cost. */
@@ -1295,10 +1318,10 @@ struct processor_costs pentium4_cost = {
 COSTS_N_INSNS (2), /* cost of FABS instruction. */
 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
-{{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
+{{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
 DUMMY_STRINGOP_ALGS},
-{{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
-{-1, libcall}}},
+{{libcall, {{6, loop_1_byte, false}, {48, loop, false},
+{20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 DUMMY_STRINGOP_ALGS},
 1, /* scalar_stmt_cost. */
 1, /* scalar load_cost. */
@@ -1366,13 +1389,13 @@ struct processor_costs nocona_cost = {
 COSTS_N_INSNS (3), /* cost of FABS instruction. */
 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
-{{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
-{libcall, {{32, loop}, {20000, rep_prefix_8_byte},
-{100000, unrolled_loop}, {-1, libcall}}}},
-{{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
-{-1, libcall}}},
-{libcall, {{24, loop}, {64, unrolled_loop},
-{8192, rep_prefix_8_byte}, {-1, libcall}}}},
+{{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
+{libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
+{100000, unrolled_loop, false}, {-1, libcall, false}}}},
+{{libcall, {{6, loop_1_byte, false}, {48, loop, false},
+{20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+{libcall, {{24, loop, false}, {64, unrolled_loop, false},
+{8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
 1, /* scalar_stmt_cost. */
 1, /* scalar load_cost. */
 1, /* scalar_store_cost. */
@@ -1439,13 +1462,13 @@ struct processor_costs atom_cost = {
 COSTS_N_INSNS (8), /* cost of FABS instruction. */
 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
-{{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
-{libcall, {{32, loop}, {64, rep_prefix_4_byte},
-{8192, rep_prefix_8_byte}, {-1, libcall}}}},
-{{libcall, {{8, loop}, {15, unrolled_loop},
-{2048, rep_prefix_4_byte}, {-1, libcall}}},
-{libcall, {{24, loop}, {32, unrolled_loop},
-{8192, rep_prefix_8_byte}, {-1, libcall}}}},
+{{libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
+{libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
+{8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
+{{libcall, {{8, loop, false}, {15, unrolled_loop, false},
+{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+{libcall, {{24, loop, false}, {32, unrolled_loop, false},
+{8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
 1, /* scalar_stmt_cost. */
 1, /* scalar load_cost. */
 1, /* scalar_store_cost. */
@@ -1520,9 +1543,92 @@ struct processor_costs generic64_cost = {
 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
 {DUMMY_STRINGOP_ALGS,
-{libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+{libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
+{-1, libcall, false}}}},
 {DUMMY_STRINGOP_ALGS,
-{libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+{libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
+{-1, libcall, false}}}},
 1, /* scalar_stmt_cost. */
 1, /* scalar load_cost. */
 1, /* scalar_store_cost. */
 1, /* vec_stmt_cost. */
 1, /* vec_to_scalar_cost. */
 1, /* scalar_to_vec_cost. */
 1, /* vec_align_load_cost. */
 2, /* vec_unalign_load_cost. */
 1, /* vec_store_cost. */
 3, /* cond_taken_branch_cost. */
 1, /* cond_not_taken_branch_cost. */
 };
+
+/* core_cost should produce code tuned for Core family of CPUs. */
+static const
+struct processor_costs core_cost = {
+COSTS_N_INSNS (1), /* cost of an add instruction */
+/* On all chips taken into consideration lea is 2 cycles and more. With
+this cost however our current implementation of synth_mult results in
+use of unnecessary temporary registers causing regression on several
+SPECfp benchmarks. */
+COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
+COSTS_N_INSNS (1), /* variable shift costs */
+COSTS_N_INSNS (1), /* constant shift costs */
+{COSTS_N_INSNS (3), /* cost of starting multiply for QI */
+COSTS_N_INSNS (4), /* HI */
+COSTS_N_INSNS (3), /* SI */
+COSTS_N_INSNS (4), /* DI */
+COSTS_N_INSNS (2)}, /* other */
+0, /* cost of multiply per each bit set */
+{COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
+COSTS_N_INSNS (26), /* HI */
+COSTS_N_INSNS (42), /* SI */
+COSTS_N_INSNS (74), /* DI */
+COSTS_N_INSNS (74)}, /* other */
+COSTS_N_INSNS (1), /* cost of movsx */
+COSTS_N_INSNS (1), /* cost of movzx */
+8, /* "large" insn */
+17, /* MOVE_RATIO */
+4, /* cost for loading QImode using movzbl */
+{4, 4, 4}, /* cost of loading integer registers
+in QImode, HImode and SImode.
+Relative to reg-reg move (2). */
+{4, 4, 4}, /* cost of storing integer registers */
+4, /* cost of reg,reg fld/fst */
+{12, 12, 12}, /* cost of loading fp registers
+in SFmode, DFmode and XFmode */
+{6, 6, 8}, /* cost of storing fp registers
+in SFmode, DFmode and XFmode */
+2, /* cost of moving MMX register */
+{8, 8}, /* cost of loading MMX registers
+in SImode and DImode */
+{8, 8}, /* cost of storing MMX registers
+in SImode and DImode */
+2, /* cost of moving SSE register */
+{8, 8, 8}, /* cost of loading SSE registers
+in SImode, DImode and TImode */
+{8, 8, 8}, /* cost of storing SSE registers
+in SImode, DImode and TImode */
+5, /* MMX or SSE register to integer */
+64, /* size of l1 cache. */
+512, /* size of l2 cache. */
+64, /* size of prefetch block */
+6, /* number of parallel prefetches */
+/* FIXME perhaps more appropriate value is 5. */
+3, /* Branch cost */
+COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
+COSTS_N_INSNS (8), /* cost of FMUL instruction. */
+COSTS_N_INSNS (20), /* cost of FDIV instruction. */
+COSTS_N_INSNS (8), /* cost of FABS instruction. */
+COSTS_N_INSNS (8), /* cost of FCHS instruction. */
+COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
+{{libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
+{libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
+{-1, libcall, false}}}},
+{{libcall, {{6, loop_1_byte, true},
+{24, loop, true},
+{8192, rep_prefix_4_byte, true},
+{-1, libcall, false}}},
+{libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
+{-1, libcall, false}}}},
+1, /* scalar_stmt_cost. */
+1, /* scalar load_cost. */
+1, /* scalar_store_cost. */
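To see what the new core_cost stringop descriptors buy, here is a small stand-alone illustration (plain C, not GCC code; enum names simplified) replaying the known-size 32-bit memset table from above: a 100-byte memset lands in the {8192, rep_prefix_4_byte, true} slot, so rep stosl is chosen and the alignment prologue may be skipped.

    #include <stdbool.h>
    #include <stdio.h>

    enum alg { alg_loop_1_byte, alg_loop, alg_rep_prefix_4_byte, alg_libcall };
    struct entry { int max; enum alg alg; bool noalign; };

    /* Mirrors core_cost's memset table: {{6, loop_1_byte, true},
       {24, loop, true}, {8192, rep_prefix_4_byte, true}, {-1, libcall, false}}.  */
    static const struct entry memset_table[] = {
      {6, alg_loop_1_byte, true}, {24, alg_loop, true},
      {8192, alg_rep_prefix_4_byte, true}, {-1, alg_libcall, false}
    };

    int main (void)
    {
      int size = 100;
      for (unsigned i = 0; i < sizeof memset_table / sizeof *memset_table; i++)
        if (memset_table[i].max == -1 || size <= memset_table[i].max)
          {
            /* For size 100 this prints "entry 2, noalign=1".  */
            printf ("size %d -> entry %u, noalign=%d\n",
                    size, i, (int) memset_table[i].noalign);
            break;
          }
      return 0;
    }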
@@ -1591,9 +1697,11 @@ struct processor_costs generic32_cost = {
 COSTS_N_INSNS (8), /* cost of FABS instruction. */
 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
-{{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
+{{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
+{-1, libcall, false}}},
 DUMMY_STRINGOP_ALGS},
-{{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
+{{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
+{-1, libcall, false}}},
 DUMMY_STRINGOP_ALGS},
 1, /* scalar_stmt_cost. */
 1, /* scalar load_cost. */
@@ -1622,14 +1730,9 @@ const struct processor_costs *ix86_cost = &pentium_cost;
 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
 #define m_NOCONA (1<<PROCESSOR_NOCONA)
 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
-#define m_CORE2_32 (1<<PROCESSOR_CORE2_32)
-#define m_CORE2_64 (1<<PROCESSOR_CORE2_64)
-#define m_CORE2 (m_CORE2_32 | m_CORE2_64)
-#define m_COREI7_32 (1<<PROCESSOR_COREI7_32)
-#define m_COREI7_64 (1<<PROCESSOR_COREI7_64)
-#define m_COREI7 (m_COREI7_32 | m_COREI7_64)
-#define m_CORE2I7_32 (m_CORE2_32 | m_COREI7_32)
-#define m_CORE2I7_64 (m_CORE2_64 | m_COREI7_64)
-#define m_CORE2I7 (m_CORE2I7_32 | m_CORE2I7_64)
+#define m_CORE2 (1<<PROCESSOR_CORE2)
+#define m_COREI7 (1<<PROCESSOR_COREI7)
+#define m_CORE2I7 (m_CORE2 | m_COREI7)
 #define m_ATOM (1<<PROCESSOR_ATOM)
 #define m_GEODE (1<<PROCESSOR_GEODE)
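The collapsed bits are tested the same way as every other m_* mask; a minimal sketch (assuming the mask of the active tuning model is formed from ix86_tune, as ix86_option_override_internal does elsewhere in i386.c):

    /* One bit per tuning model: m_CORE2I7 now covers Core 2 and Core i7
       with two enum values instead of four.  */
    unsigned int ix86_tune_mask = 1u << ix86_tune;
    /* A tuning knob applies when its bitmap contains the active bit.  */
    bool push_memory_p
      = (initial_ix86_tune_features[X86_TUNE_PUSH_MEMORY] & ix86_tune_mask) != 0;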
@@ -1665,7 +1768,7 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
 negatively, so enabling for Generic64 seems like good code size
 tradeoff. We can't enable it for 32bit generic because it does not
 work well with PPro base chips. */
-m_386 | m_CORE2I7_64 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
+m_386 | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
 /* X86_TUNE_PUSH_MEMORY */
 m_386 | m_P4_NOCONA | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
@@ -1856,7 +1959,7 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
 /* X86_TUNE_AVOID_VECTOR_DECODE */
-m_CORE2I7_64 | m_K8 | m_GENERIC64,
+m_CORE2I7 | m_K8 | m_GENERIC64,
 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode
 and SImode multiply, but 386 and 486 do HImode multiply faster. */
@@ -1864,11 +1967,11 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
 vector path on AMD machines. */
-m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
+m_CORE2I7 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
 machines. */
-m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
+m_CORE2I7 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
 than a MOV. */
@@ -2329,14 +2432,10 @@ static const struct ptt processor_target_table[PROCESSOR_max] =
 {&pentium4_cost, 0, 0, 0, 0, 0},
 {&k8_cost, 16, 7, 16, 7, 16},
 {&nocona_cost, 0, 0, 0, 0, 0},
-/* Core 2 32-bit. */
-{&generic32_cost, 16, 10, 16, 10, 16},
-/* Core 2 64-bit. */
-{&generic64_cost, 16, 10, 16, 10, 16},
-/* Core i7 32-bit. */
-{&generic32_cost, 16, 10, 16, 10, 16},
-/* Core i7 64-bit. */
-{&generic64_cost, 16, 10, 16, 10, 16},
+/* Core 2 */
+{&core_cost, 16, 10, 16, 10, 16},
+/* Core i7 */
+{&core_cost, 16, 10, 16, 10, 16},
 {&generic32_cost, 16, 7, 16, 7, 16},
 {&generic64_cost, 16, 10, 16, 10, 16},
 {&amdfam10_cost, 32, 24, 32, 7, 32},
@@ -2799,23 +2898,23 @@ ix86_option_override_internal (bool main_args_p)
 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
-{"core2", PROCESSOR_CORE2_64, CPU_CORE2,
+{"core2", PROCESSOR_CORE2, CPU_CORE2,
 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR},
-{"corei7", PROCESSOR_COREI7_64, CPU_COREI7,
+{"corei7", PROCESSOR_COREI7, CPU_COREI7,
 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_FXSR},
-{"corei7-avx", PROCESSOR_COREI7_64, CPU_COREI7,
+{"corei7-avx", PROCESSOR_COREI7, CPU_COREI7,
 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL
 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
-{"core-avx-i", PROCESSOR_COREI7_64, CPU_COREI7,
+{"core-avx-i", PROCESSOR_COREI7, CPU_COREI7,
 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
 | PTA_RDRND | PTA_F16C | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
-{"core-avx2", PROCESSOR_COREI7_64, CPU_COREI7,
+{"core-avx2", PROCESSOR_COREI7, CPU_COREI7,
 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
@@ -3327,14 +3426,6 @@ ix86_option_override_internal (bool main_args_p)
 ix86_schedule = CPU_PENTIUMPRO;
 break;
-case PROCESSOR_CORE2_64:
-ix86_tune = PROCESSOR_CORE2_32;
-break;
-case PROCESSOR_COREI7_64:
-ix86_tune = PROCESSOR_COREI7_32;
-break;
 default:
 break;
 }
@@ -22282,7 +22373,7 @@ expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
 static enum stringop_alg
 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
-int *dynamic_check)
+int *dynamic_check, bool *noalign)
 {
 const struct stringop_algs * algs;
 bool optimize_for_speed;
@@ -22293,6 +22384,7 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
 || (memset
 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
+*noalign = false;
 #define ALG_USABLE_P(alg) (rep_prefix_usable \
 || (alg != rep_prefix_1_byte \
@@ -22360,7 +22452,10 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
 break;
 }
 else if (ALG_USABLE_P (candidate))
-return candidate;
+{
+*noalign = algs->size[i].noalign;
+return candidate;
+}
 }
 }
 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
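In outline, the selection logic now works as below (a simplified sketch of decide_alg's table walk; the real function additionally handles rep-prefix usability, optimize-for-size, and the dynamic-check path): the first entry whose MAX covers the expected size supplies both the algorithm and the new noalign flag.

    static enum stringop_alg
    pick_alg (const struct stringop_algs *algs, HOST_WIDE_INT expected_size,
              bool *noalign)
    {
      int i;
      *noalign = false;
      for (i = 0; i < MAX_STRINGOP_ALGS; i++)
        /* max == -1 terminates the table and stands for "any size".  */
        if (algs->size[i].max == -1 || algs->size[i].max >= expected_size)
          {
            *noalign = algs->size[i].noalign;
            return algs->size[i].alg;
          }
      return libcall;
    }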
@@ -22401,7 +22496,7 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
 }
 if (max == -1)
 max = 4096;
-alg = decide_alg (count, max / 2, memset, dynamic_check);
+alg = decide_alg (count, max / 2, memset, dynamic_check, noalign);
 gcc_assert (*dynamic_check == -1);
 gcc_assert (alg != libcall);
 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
@@ -22515,6 +22610,7 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
 enum stringop_alg alg;
 int dynamic_check;
 bool need_zero_guard = false;
+bool noalign;
 if (CONST_INT_P (align_exp))
 align = INTVAL (align_exp);
@@ -22539,10 +22635,10 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
 /* Step 0: Decide on preferred algorithm, desired alignment and
 size of chunks to be copied by main loop. */
-alg = decide_alg (count, expected_size, false, &dynamic_check);
+alg = decide_alg (count, expected_size, false, &dynamic_check, &noalign);
 desired_align = decide_alignment (align, alg, expected_size);
-if (!TARGET_ALIGN_STRINGOPS)
+if (!TARGET_ALIGN_STRINGOPS || noalign)
 align = desired_align;
 if (alg == libcall)
@@ -22910,6 +23006,7 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
 bool force_loopy_epilogue = false;
 int dynamic_check;
 bool need_zero_guard = false;
+bool noalign;
 if (CONST_INT_P (align_exp))
 align = INTVAL (align_exp);
@@ -22929,10 +23026,10 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
 /* Step 0: Decide on preferred algorithm, desired alignment and
 size of chunks to be copied by main loop. */
-alg = decide_alg (count, expected_size, true, &dynamic_check);
+alg = decide_alg (count, expected_size, true, &dynamic_check, &noalign);
 desired_align = decide_alignment (align, alg, expected_size);
-if (!TARGET_ALIGN_STRINGOPS)
+if (!TARGET_ALIGN_STRINGOPS || noalign)
 align = desired_align;
 if (alg == libcall)
@@ -23935,10 +24032,8 @@ ix86_issue_rate (void)
 case PROCESSOR_PENTIUMPRO:
 case PROCESSOR_PENTIUM4:
-case PROCESSOR_CORE2_32:
-case PROCESSOR_CORE2_64:
-case PROCESSOR_COREI7_32:
-case PROCESSOR_COREI7_64:
+case PROCESSOR_CORE2:
+case PROCESSOR_COREI7:
 case PROCESSOR_ATHLON:
 case PROCESSOR_K8:
 case PROCESSOR_AMDFAM10:
@@ -24193,10 +24288,8 @@ ia32_multipass_dfa_lookahead (void)
 case PROCESSOR_K6:
 return 1;
-case PROCESSOR_CORE2_32:
-case PROCESSOR_CORE2_64:
-case PROCESSOR_COREI7_32:
-case PROCESSOR_COREI7_64:
+case PROCESSOR_CORE2:
+case PROCESSOR_COREI7:
 case PROCESSOR_ATOM:
 /* Generally, we want haifa-sched:max_issue() to look ahead as far
 as many instructions can be executed on a cycle, i.e.,
@@ -24739,10 +24832,8 @@ ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
 they are actually used. */
 switch (ix86_tune)
 {
-case PROCESSOR_CORE2_32:
-case PROCESSOR_CORE2_64:
-case PROCESSOR_COREI7_32:
-case PROCESSOR_COREI7_64:
+case PROCESSOR_CORE2:
+case PROCESSOR_COREI7:
 /* Do not perform multipass scheduling for pre-reload schedule
 to save compile time. */
 if (reload_completed)
@@ -28572,13 +28663,11 @@ get_builtin_code_for_version (tree decl, tree *predicate_list)
 {
 switch (new_target->arch)
 {
-case PROCESSOR_CORE2_32:
-case PROCESSOR_CORE2_64:
+case PROCESSOR_CORE2:
 arg_str = "core2";
 priority = P_PROC_SSSE3;
 break;
-case PROCESSOR_COREI7_32:
-case PROCESSOR_COREI7_64:
+case PROCESSOR_COREI7:
 arg_str = "corei7";
 priority = P_PROC_SSE4_2;
 break;
@@ -32868,7 +32957,7 @@ avx_vpermilp_parallel (rtx par, enum machine_mode mode)
 {
 unsigned i, nelt = GET_MODE_NUNITS (mode);
 unsigned mask = 0;
-unsigned char ipar[8];
+unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
 if (XVECLEN (par, 0) != (int) nelt)
 return 0;
@@ -32943,7 +33032,7 @@ avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
 {
 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
 unsigned mask = 0;
-unsigned char ipar[8];
+unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
 if (XVECLEN (par, 0) != (int) nelt)
 return 0;
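An aside on the two ipar changes: in GNU C an empty brace initializer zero-fills the entire aggregate (standard in C++, and later standardized in C23), so every element gets a determinate 0 and -Wuninitialized stays quiet on the later reads:

    unsigned char ipar[8] = {};   /* equivalent to {0, 0, 0, 0, 0, 0, 0, 0} */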

--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -101,6 +101,9 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
 known at compile time or estimated via feedback, the SIZE array
 is walked in order until MAX is greater than the estimate (or -1
 means infinity). Corresponding ALG is used then.
+
+When NOALIGN is true the code guaranteeing the alignment of the memory
+block is skipped.
 For example initializer:
 {{256, loop}, {-1, rep_prefix_4_byte}}
 will use loop for blocks smaller or equal to 256 bytes, rep prefix will
@@ -111,6 +114,7 @@ struct stringop_algs
 const struct stringop_strategy {
 const int max;
 const enum stringop_alg alg;
+int noalign;
 } size [MAX_STRINGOP_ALGS];
 };
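With the new field, each size class can also request that the alignment prologue be skipped. A hypothetical initializer in the style of the core_cost entries added in i386.c (the values here are invented for illustration):

    static const struct stringop_algs example_memcpy = {
      libcall,                            /* block size unknown at compile time */
      {{1024, rep_prefix_4_byte, true},   /* <= 1024 bytes: rep movsl, skip
                                             the alignment code */
       {-1, libcall, false}}              /* anything larger: call memcpy */
    };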
@@ -242,12 +246,8 @@ extern const struct processor_costs ix86_size_cost;
 #define TARGET_K8 (ix86_tune == PROCESSOR_K8)
 #define TARGET_ATHLON_K8 (TARGET_K8 || TARGET_ATHLON)
 #define TARGET_NOCONA (ix86_tune == PROCESSOR_NOCONA)
-#define TARGET_CORE2_32 (ix86_tune == PROCESSOR_CORE2_32)
-#define TARGET_CORE2_64 (ix86_tune == PROCESSOR_CORE2_64)
-#define TARGET_CORE2 (TARGET_CORE2_32 || TARGET_CORE2_64)
-#define TARGET_COREI7_32 (ix86_tune == PROCESSOR_COREI7_32)
-#define TARGET_COREI7_64 (ix86_tune == PROCESSOR_COREI7_64)
-#define TARGET_COREI7 (TARGET_COREI7_32 || TARGET_COREI7_64)
+#define TARGET_CORE2 (ix86_tune == PROCESSOR_CORE2)
+#define TARGET_COREI7 (ix86_tune == PROCESSOR_COREI7)
 #define TARGET_GENERIC32 (ix86_tune == PROCESSOR_GENERIC32)
 #define TARGET_GENERIC64 (ix86_tune == PROCESSOR_GENERIC64)
 #define TARGET_GENERIC (TARGET_GENERIC32 || TARGET_GENERIC64)
@@ -2092,10 +2092,8 @@ enum processor_type
 PROCESSOR_PENTIUM4,
 PROCESSOR_K8,
 PROCESSOR_NOCONA,
-PROCESSOR_CORE2_32,
-PROCESSOR_CORE2_64,
-PROCESSOR_COREI7_32,
-PROCESSOR_COREI7_64,
+PROCESSOR_CORE2,
+PROCESSOR_COREI7,
 PROCESSOR_GENERIC32,
 PROCESSOR_GENERIC64,
 PROCESSOR_AMDFAM10,