From 340ef734b8072d2b58f946bb69170f2e535813d7 Mon Sep 17 00:00:00 2001
From: Jan Hubicka
Date: Fri, 23 Nov 2012 17:02:09 +0100
Subject: [PATCH] i386-c.c (ix86_target_macros_internal): Update handling of
 core chips.

	* i386-c.c (ix86_target_macros_internal): Update handling of core
	chips.
	* i386.c (DUMMY_STRINGOP_ALGS): Update.
	(*_cost): Update.
	(core_cost): Copy from generic64_cost; fix stringop descriptors.
	(m_CORE2_32, m_CORE2_64, m_COREI7_32, m_COREI7_64, m_CORE2I7_32,
	m_CORE2I7_64): Remove.
	(m_CORE2, m_COREI7, m_CORE2I7): Update.
	(initial_ix86_tune_features): Update.
	(processor_target): Update.
	(ix86_option_override_internal): Update.
	(ix86_option_override_internal): Remove PROCESSOR_CORE2_64,
	PROCESSOR_COREI7_64 special cases.
	(decide_alg): Add noalign parameter; set it.
	(ix86_expand_movmem, ix86_expand_setmem): Update noalign handling.
	(ix86_issue_rate): Update.
	(ia32_multipass_dfa_lookahead): Update.
	(ix86_sched_init_global): Update.
	(get_builtin_code_for_version): Update.
	* i386.h (stringop_strategy): Add noalign flag.
	(TARGET_CORE2_32, TARGET_CORE2_64, TARGET_COREI7_32, TARGET_COREI7_64):
	Remove.
	(TARGET_CORE2, TARGET_COREI7): New.
	(enum processor_type): Remove PROCESSOR_CORE2_32, PROCESSOR_CORE2_64,
	PROCESSOR_COREI7_32, PROCESSOR_COREI7_64; add PROCESSOR_CORE2,
	PROCESSOR_COREI7.

From-SVN: r193761
---
 gcc/ChangeLog            |  29 ++++
 gcc/config/i386/i386-c.c |  12 +-
 gcc/config/i386/i386.c   | 359 ++++++++++++++++++++++++---------------
 gcc/config/i386/i386.h   |  18 +-
 4 files changed, 265 insertions(+), 153 deletions(-)

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index b1a9d32a3e0..46760a06e4a 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,32 @@
+2012-11-23  Jan Hubicka
+
+	* i386-c.c (ix86_target_macros_internal): Update handling of core
+	chips.
+	* i386.c (DUMMY_STRINGOP_ALGS): Update.
+	(*_cost): Update.
+	(core_cost): Copy from generic64_cost; fix stringop descriptors.
+	(m_CORE2_32, m_CORE2_64, m_COREI7_32, m_COREI7_64, m_CORE2I7_32,
+	m_CORE2I7_64): Remove.
+	(m_CORE2, m_COREI7, m_CORE2I7): Update.
+	(initial_ix86_tune_features): Update.
+	(processor_target): Update.
+	(ix86_option_override_internal): Update.
+	(ix86_option_override_internal): Remove PROCESSOR_CORE2_64,
+	PROCESSOR_COREI7_64 special cases.
+	(decide_alg): Add noalign parameter; set it.
+	(ix86_expand_movmem, ix86_expand_setmem): Update noalign handling.
+	(ix86_issue_rate): Update.
+	(ia32_multipass_dfa_lookahead): Update.
+	(ix86_sched_init_global): Update.
+	(get_builtin_code_for_version): Update.
+	* i386.h (stringop_strategy): Add noalign flag.
+	(TARGET_CORE2_32, TARGET_CORE2_64, TARGET_COREI7_32, TARGET_COREI7_64):
+	Remove.
+	(TARGET_CORE2, TARGET_COREI7): New.
+	(enum processor_type): Remove PROCESSOR_CORE2_32, PROCESSOR_CORE2_64,
+	PROCESSOR_COREI7_32, PROCESSOR_COREI7_64; add PROCESSOR_CORE2,
+	PROCESSOR_COREI7.
+
 2012-11-23  Eric Botcazou

	PR rtl-optimization/55388

diff --git a/gcc/config/i386/i386-c.c b/gcc/config/i386/i386-c.c
index d1677cf25d2..22e5e9b45e3 100644
--- a/gcc/config/i386/i386-c.c
+++ b/gcc/config/i386/i386-c.c
@@ -134,13 +134,11 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
       def_or_undef (parse_in, "__nocona");
       def_or_undef (parse_in, "__nocona__");
       break;
-    case PROCESSOR_CORE2_32:
-    case PROCESSOR_CORE2_64:
+    case PROCESSOR_CORE2:
       def_or_undef (parse_in, "__core2");
       def_or_undef (parse_in, "__core2__");
       break;
-    case PROCESSOR_COREI7_32:
-    case PROCESSOR_COREI7_64:
+    case PROCESSOR_COREI7:
       def_or_undef (parse_in, "__corei7");
       def_or_undef (parse_in, "__corei7__");
       break;
@@ -228,12 +226,10 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
     case PROCESSOR_NOCONA:
       def_or_undef (parse_in, "__tune_nocona__");
       break;
-    case PROCESSOR_CORE2_32:
-    case PROCESSOR_CORE2_64:
+    case PROCESSOR_CORE2:
       def_or_undef (parse_in, "__tune_core2__");
       break;
-    case PROCESSOR_COREI7_32:
-    case PROCESSOR_COREI7_64:
+    case PROCESSOR_COREI7:
       def_or_undef (parse_in, "__tune_corei7__");
       break;
     case PROCESSOR_ATOM:
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index c9f027a00ce..eeb5ac8a881 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -83,7 +83,7 @@ static rtx legitimize_dllimport_symbol (rtx, bool);
 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes.  */
 #define COSTS_N_BYTES(N) ((N) * 2)

-#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
+#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}

 const
 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
@@ -138,10 +138,10 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
   COSTS_N_BYTES (2),			/* cost of FABS instruction.  */
   COSTS_N_BYTES (2),			/* cost of FCHS instruction.  */
   COSTS_N_BYTES (2),			/* cost of FSQRT instruction.  */
-  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
-   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
-  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
-   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
+  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
+   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}},
+  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
+   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}},
   1,					/* scalar_stmt_cost.  */
   1,					/* scalar load_cost.  */
   1,					/* scalar_store_cost.  */
@@ -209,9 +209,9 @@ struct processor_costs i386_cost = {	/* 386 specific costs */
   COSTS_N_INSNS (22),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (24),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (122),			/* cost of FSQRT instruction.  */
-  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
+  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
    DUMMY_STRINGOP_ALGS},
-  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
+  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
    DUMMY_STRINGOP_ALGS},
   1,					/* scalar_stmt_cost.  */
   1,					/* scalar load_cost.  */
@@ -281,9 +281,9 @@ struct processor_costs i486_cost = {	/* 486 specific costs */
   COSTS_N_INSNS (3),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (3),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (83),			/* cost of FSQRT instruction.  */
-  {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
+  {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
    DUMMY_STRINGOP_ALGS},
-  {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
+  {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
    DUMMY_STRINGOP_ALGS},
   1,					/* scalar_stmt_cost.  */
   1,					/* scalar load_cost.  */
@@ -351,9 +351,9 @@ struct processor_costs pentium_cost = {
   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (70),			/* cost of FSQRT instruction.  */
-  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
+  {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
    DUMMY_STRINGOP_ALGS},
-  {{libcall, {{-1, rep_prefix_4_byte}}},
+  {{libcall, {{-1, rep_prefix_4_byte, false}}},
    DUMMY_STRINGOP_ALGS},
   1,					/* scalar_stmt_cost.  */
   1,					/* scalar load_cost.  */
@@ -426,11 +426,13 @@ struct processor_costs pentiumpro_cost = {
      noticeable win, for bigger blocks either rep movsl or rep movsb is
      way to go.  Rep movsb has apparently more expensive startup time in CPU,
      but after 4K the difference is down in the noise.  */
-  {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
-			{8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
+  {{rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
+			{8192, rep_prefix_4_byte, false},
+			{-1, rep_prefix_1_byte, false}}},
    DUMMY_STRINGOP_ALGS},
-  {{rep_prefix_4_byte, {{1024, unrolled_loop},
-			{8192, rep_prefix_4_byte}, {-1, libcall}}},
+  {{rep_prefix_4_byte, {{1024, unrolled_loop, false},
+			{8192, rep_prefix_4_byte, false},
+			{-1, libcall, false}}},
    DUMMY_STRINGOP_ALGS},
   1,					/* scalar_stmt_cost.  */
   1,					/* scalar load_cost.  */
@@ -499,9 +501,9 @@ struct processor_costs geode_cost = {
   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (54),			/* cost of FSQRT instruction.  */
-  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
+  {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
    DUMMY_STRINGOP_ALGS},
-  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
+  {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
    DUMMY_STRINGOP_ALGS},
   1,					/* scalar_stmt_cost.  */
   1,					/* scalar load_cost.  */
@@ -572,9 +574,9 @@ struct processor_costs k6_cost = {
   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (56),			/* cost of FSQRT instruction.  */
-  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
+  {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
    DUMMY_STRINGOP_ALGS},
-  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
+  {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
    DUMMY_STRINGOP_ALGS},
   1,					/* scalar_stmt_cost.  */
   1,					/* scalar load_cost.  */
@@ -645,9 +647,9 @@ struct processor_costs athlon_cost = {
   /* For some reason, Athlon deals better with REP prefix (relative to loops)
      compared to K8. Alignment becomes important after 8 bytes for memcpy and
      128 bytes for memset.  */
-  {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
+  {{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
    DUMMY_STRINGOP_ALGS},
-  {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
+  {{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
    DUMMY_STRINGOP_ALGS},
   1,					/* scalar_stmt_cost.  */
   1,					/* scalar load_cost.  */
@@ -723,11 +725,14 @@ struct processor_costs k8_cost = {
   /* K8 has optimized REP instruction for medium sized blocks, but for very
      small blocks it is better to use loop. For large blocks, libcall can
      do nontemporary accesses and beat inline considerably.  */
-  {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
-   {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
-  {{libcall, {{8, loop}, {24, unrolled_loop},
-	      {2048, rep_prefix_4_byte}, {-1, libcall}}},
-   {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+  {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
+	      {-1, rep_prefix_4_byte, false}}},
+   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+	      {-1, libcall, false}}}},
+  {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
+	      {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+   {libcall, {{48, unrolled_loop, false},
+	      {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
   4,					/* scalar_stmt_cost.  */
   2,					/* scalar load_cost.  */
   2,					/* scalar_store_cost.  */
@@ -810,11 +815,14 @@ struct processor_costs amdfam10_cost = {
   /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
      very small blocks it is better to use loop. For large blocks, libcall
      can do nontemporary accesses and beat inline considerably.  */
-  {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
-   {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
-  {{libcall, {{8, loop}, {24, unrolled_loop},
-	      {2048, rep_prefix_4_byte}, {-1, libcall}}},
-   {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+  {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
+	      {-1, rep_prefix_4_byte, false}}},
+   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+	      {-1, libcall, false}}}},
+  {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
+	      {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+	      {-1, libcall, false}}}},
   4,					/* scalar_stmt_cost.  */
   2,					/* scalar load_cost.  */
   2,					/* scalar_store_cost.  */
@@ -897,11 +905,14 @@ struct processor_costs bdver1_cost = {
   /* BDVER1 has optimized REP instruction for medium sized blocks, but for
      very small blocks it is better to use loop. For large blocks, libcall
      can do nontemporary accesses and beat inline considerably.  */
-  {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
-   {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
-  {{libcall, {{8, loop}, {24, unrolled_loop},
-	      {2048, rep_prefix_4_byte}, {-1, libcall}}},
-   {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+  {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
+	      {-1, rep_prefix_4_byte, false}}},
+   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+	      {-1, libcall, false}}}},
+  {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
+	      {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+	      {-1, libcall, false}}}},
   6,					/* scalar_stmt_cost.  */
   4,					/* scalar load_cost.  */
   4,					/* scalar_store_cost.  */
@@ -984,11 +995,14 @@ struct processor_costs bdver2_cost = {
   /* BDVER2 has optimized REP instruction for medium sized blocks, but for
      very small blocks it is better to use loop. For large blocks, libcall
      can do nontemporary accesses and beat inline considerably.  */
-  {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
-   {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
-  {{libcall, {{8, loop}, {24, unrolled_loop},
-	      {2048, rep_prefix_4_byte}, {-1, libcall}}},
-   {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+  {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
+	      {-1, rep_prefix_4_byte, false}}},
+   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+	      {-1, libcall, false}}}},
+  {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
+	      {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+	      {-1, libcall, false}}}},
   6,					/* scalar_stmt_cost.  */
   4,					/* scalar load_cost.  */
   4,					/* scalar_store_cost.  */
@@ -1063,11 +1077,14 @@ struct processor_costs bdver3_cost = {
   /* BDVER3 has optimized REP instruction for medium sized blocks, but for
      very small blocks it is better to use loop. For large blocks, libcall
      can do nontemporary accesses and beat inline considerably.  */
-  {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
-   {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
-  {{libcall, {{8, loop}, {24, unrolled_loop},
-	      {2048, rep_prefix_4_byte}, {-1, libcall}}},
-   {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+  {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
+	      {-1, rep_prefix_4_byte, false}}},
+   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+	      {-1, libcall, false}}}},
+  {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
+	      {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+	      {-1, libcall, false}}}},
   6,					/* scalar_stmt_cost.  */
   4,					/* scalar load_cost.  */
   4,					/* scalar_store_cost.  */
@@ -1145,11 +1162,14 @@ struct processor_costs btver1_cost = {
   /* BTVER1 has optimized REP instruction for medium sized blocks, but for
      very small blocks it is better to use loop. For large blocks, libcall
      can do nontemporary accesses and beat inline considerably.  */
-  {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
-   {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
-  {{libcall, {{8, loop}, {24, unrolled_loop},
-	      {2048, rep_prefix_4_byte}, {-1, libcall}}},
-   {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+  {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
+	      {-1, rep_prefix_4_byte, false}}},
+   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+	      {-1, libcall, false}}}},
+  {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
+	      {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+	      {-1, libcall, false}}}},
   4,					/* scalar_stmt_cost.  */
   2,					/* scalar load_cost.  */
   2,					/* scalar_store_cost.  */
@@ -1224,11 +1244,14 @@ struct processor_costs btver2_cost = {
   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
-  {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
-   {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
-  {{libcall, {{8, loop}, {24, unrolled_loop},
-	      {2048, rep_prefix_4_byte}, {-1, libcall}}},
-   {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+  {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
+	      {-1, rep_prefix_4_byte, false}}},
+   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+	      {-1, libcall, false}}}},
+  {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
+	      {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+	      {-1, libcall, false}}}},
   4,					/* scalar_stmt_cost.  */
   2,					/* scalar load_cost.  */
   2,					/* scalar_store_cost.  */
@@ -1295,10 +1318,10 @@ struct processor_costs pentium4_cost = {
   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (43),			/* cost of FSQRT instruction.  */
-  {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
+  {{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
    DUMMY_STRINGOP_ALGS},
-  {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
-   {-1, libcall}}},
+  {{libcall, {{6, loop_1_byte, false}, {48, loop, false},
+	      {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
    DUMMY_STRINGOP_ALGS},
   1,					/* scalar_stmt_cost.  */
   1,					/* scalar load_cost.  */
@@ -1366,13 +1389,13 @@ struct processor_costs nocona_cost = {
   COSTS_N_INSNS (3),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (3),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (44),			/* cost of FSQRT instruction.  */
-  {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
-   {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
-	      {100000, unrolled_loop}, {-1, libcall}}}},
-  {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
-   {-1, libcall}}},
-   {libcall, {{24, loop}, {64, unrolled_loop},
-	      {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+  {{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
+   {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
+	      {100000, unrolled_loop, false}, {-1, libcall, false}}}},
+  {{libcall, {{6, loop_1_byte, false}, {48, loop, false},
+	      {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+   {libcall, {{24, loop, false}, {64, unrolled_loop, false},
+	      {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
   1,					/* scalar_stmt_cost.  */
   1,					/* scalar load_cost.  */
   1,					/* scalar_store_cost.  */
@@ -1439,13 +1462,13 @@ struct processor_costs atom_cost = {
   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
-  {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
-   {libcall, {{32, loop}, {64, rep_prefix_4_byte},
-	      {8192, rep_prefix_8_byte}, {-1, libcall}}}},
-  {{libcall, {{8, loop}, {15, unrolled_loop},
-	      {2048, rep_prefix_4_byte}, {-1, libcall}}},
-   {libcall, {{24, loop}, {32, unrolled_loop},
-	      {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+  {{libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
+   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
+	      {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
+  {{libcall, {{8, loop, false}, {15, unrolled_loop, false},
+	      {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
+   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
+	      {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
   1,					/* scalar_stmt_cost.  */
   1,					/* scalar load_cost.  */
   1,					/* scalar_store_cost.  */
@@ -1520,9 +1543,92 @@ struct processor_costs generic64_cost = {
   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
   {DUMMY_STRINGOP_ALGS,
-   {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+   {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
+	      {-1, libcall, false}}}},
   {DUMMY_STRINGOP_ALGS,
-   {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+   {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
+	      {-1, libcall, false}}}},
+  1,					/* scalar_stmt_cost.  */
+  1,					/* scalar load_cost.  */
+  1,					/* scalar_store_cost.  */
+  1,					/* vec_stmt_cost.  */
+  1,					/* vec_to_scalar_cost.  */
+  1,					/* scalar_to_vec_cost.  */
+  1,					/* vec_align_load_cost.  */
+  2,					/* vec_unalign_load_cost.  */
+  1,					/* vec_store_cost.  */
+  3,					/* cond_taken_branch_cost.  */
+  1,					/* cond_not_taken_branch_cost.  */
+};
+
+/* core_cost should produce code tuned for Core family of CPUs.  */
+static const
+struct processor_costs core_cost = {
+  COSTS_N_INSNS (1),			/* cost of an add instruction */
+  /* On all chips taken into consideration lea is 2 cycles and more.  With
+     this cost however our current implementation of synth_mult results in
+     use of unnecessary temporary registers causing regression on several
+     SPECfp benchmarks.  */
+  COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
+  COSTS_N_INSNS (1),			/* variable shift costs */
+  COSTS_N_INSNS (1),			/* constant shift costs */
+  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
+   COSTS_N_INSNS (4),			/*				 HI */
+   COSTS_N_INSNS (3),			/*				 SI */
+   COSTS_N_INSNS (4),			/*				 DI */
+   COSTS_N_INSNS (2)},			/*			      other */
+  0,					/* cost of multiply per each bit set */
+  {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
+   COSTS_N_INSNS (26),			/*			    HI */
+   COSTS_N_INSNS (42),			/*			    SI */
+   COSTS_N_INSNS (74),			/*			    DI */
+   COSTS_N_INSNS (74)},			/*			    other */
+  COSTS_N_INSNS (1),			/* cost of movsx */
+  COSTS_N_INSNS (1),			/* cost of movzx */
+  8,					/* "large" insn */
+  17,					/* MOVE_RATIO */
+  4,					/* cost for loading QImode using movzbl */
+  {4, 4, 4},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {4, 4, 4},				/* cost of storing integer registers */
+  4,					/* cost of reg,reg fld/fst */
+  {12, 12, 12},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {6, 6, 8},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {8, 8},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {8, 8},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2,					/* cost of moving SSE register */
+  {8, 8, 8},				/* cost of loading SSE registers
+					   in SImode, DImode and TImode */
+  {8, 8, 8},				/* cost of storing SSE registers
+					   in SImode, DImode and TImode */
+  5,					/* MMX or SSE register to integer */
+  64,					/* size of l1 cache.  */
+  512,					/* size of l2 cache.  */
+  64,					/* size of prefetch block */
+  6,					/* number of parallel prefetches */
+  /* FIXME perhaps more appropriate value is 5.  */
+  3,					/* Branch cost */
+  COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
+  COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
+  COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
+  COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
+  COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
+  {{libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
+   {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
+	      {-1, libcall, false}}}},
+  {{libcall, {{6, loop_1_byte, true},
+	      {24, loop, true},
+	      {8192, rep_prefix_4_byte, true},
+	      {-1, libcall, false}}},
+   {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
+	      {-1, libcall, false}}}},
   1,					/* scalar_stmt_cost.  */
   1,					/* scalar load_cost.  */
   1,					/* scalar_store_cost.  */
@@ -1591,9 +1697,11 @@ struct processor_costs generic32_cost = {
   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
-  {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
+  {{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
+	      {-1, libcall, false}}},
    DUMMY_STRINGOP_ALGS},
-  {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
+  {{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
+	      {-1, libcall, false}}},
    DUMMY_STRINGOP_ALGS},
   1,					/* scalar_stmt_cost.  */
   1,					/* scalar load_cost.  */
@@ -1622,14 +1730,9 @@ const struct processor_costs *ix86_cost = &pentium_cost;
 #define m_PENT4 (1<
[...]
+	      *noalign = algs->size[i].noalign;
+	      return candidate;
+	    }
 	}
     }
   gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
@@ -22401,7 +22496,7 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
     }
   if (max == -1)
     max = 4096;
-  alg = decide_alg (count, max / 2, memset, dynamic_check);
+  alg = decide_alg (count, max / 2, memset, dynamic_check, noalign);
   gcc_assert (*dynamic_check == -1);
   gcc_assert (alg != libcall);
   if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
@@ -22515,6 +22610,7 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
   enum stringop_alg alg;
   int dynamic_check;
   bool need_zero_guard = false;
+  bool noalign;

   if (CONST_INT_P (align_exp))
     align = INTVAL (align_exp);
@@ -22539,10 +22635,10 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,

   /* Step 0: Decide on preferred algorithm, desired alignment and
      size of chunks to be copied by main loop.  */
-  alg = decide_alg (count, expected_size, false, &dynamic_check);
+  alg = decide_alg (count, expected_size, false, &dynamic_check, &noalign);
   desired_align = decide_alignment (align, alg, expected_size);

-  if (!TARGET_ALIGN_STRINGOPS)
+  if (!TARGET_ALIGN_STRINGOPS || noalign)
     align = desired_align;

   if (alg == libcall)
@@ -22910,6 +23006,7 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
   bool force_loopy_epilogue = false;
   int dynamic_check;
   bool need_zero_guard = false;
+  bool noalign;

   if (CONST_INT_P (align_exp))
     align = INTVAL (align_exp);
@@ -22929,10 +23026,10 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,

   /* Step 0: Decide on preferred algorithm, desired alignment and
      size of chunks to be copied by main loop.  */
-  alg = decide_alg (count, expected_size, true, &dynamic_check);
+  alg = decide_alg (count, expected_size, true, &dynamic_check, &noalign);
   desired_align = decide_alignment (align, alg, expected_size);

-  if (!TARGET_ALIGN_STRINGOPS)
+  if (!TARGET_ALIGN_STRINGOPS || noalign)
     align = desired_align;

   if (alg == libcall)
@@ -23935,10 +24032,8 @@ ix86_issue_rate (void)

     case PROCESSOR_PENTIUMPRO:
     case PROCESSOR_PENTIUM4:
-    case PROCESSOR_CORE2_32:
-    case PROCESSOR_CORE2_64:
-    case PROCESSOR_COREI7_32:
-    case PROCESSOR_COREI7_64:
+    case PROCESSOR_CORE2:
+    case PROCESSOR_COREI7:
     case PROCESSOR_ATHLON:
     case PROCESSOR_K8:
     case PROCESSOR_AMDFAM10:
@@ -24193,10 +24288,8 @@ ia32_multipass_dfa_lookahead (void)
     case PROCESSOR_K6:
       return 1;

-    case PROCESSOR_CORE2_32:
-    case PROCESSOR_CORE2_64:
-    case PROCESSOR_COREI7_32:
-    case PROCESSOR_COREI7_64:
+    case PROCESSOR_CORE2:
+    case PROCESSOR_COREI7:
     case PROCESSOR_ATOM:
       /* Generally, we want haifa-sched:max_issue() to look ahead as far
	  as many instructions can be executed on a cycle, i.e.,
@@ -24739,10 +24832,8 @@ ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
      they are actually used.  */
   switch (ix86_tune)
     {
-    case PROCESSOR_CORE2_32:
-    case PROCESSOR_CORE2_64:
-    case PROCESSOR_COREI7_32:
-    case PROCESSOR_COREI7_64:
+    case PROCESSOR_CORE2:
+    case PROCESSOR_COREI7:
       /* Do not perform multipass scheduling for pre-reload schedule
	  to save compile time.  */
       if (reload_completed)
@@ -28572,13 +28663,11 @@ get_builtin_code_for_version (tree decl, tree *predicate_list)
     {
       switch (new_target->arch)
	{
-	case PROCESSOR_CORE2_32:
-	case PROCESSOR_CORE2_64:
+	case PROCESSOR_CORE2:
	  arg_str = "core2";
	  priority = P_PROC_SSSE3;
	  break;
-	case PROCESSOR_COREI7_32:
-	case PROCESSOR_COREI7_64:
+	case PROCESSOR_COREI7:
	  arg_str = "corei7";
	  priority = P_PROC_SSE4_2;
	  break;
@@ -32868,7 +32957,7 @@ avx_vpermilp_parallel (rtx par, enum machine_mode mode)
 {
   unsigned i, nelt = GET_MODE_NUNITS (mode);
   unsigned mask = 0;
-  unsigned char ipar[8];
+  unsigned char ipar[8] = {};  /* Silence -Wuninitialized warning.  */

   if (XVECLEN (par, 0) != (int) nelt)
     return 0;
@@ -32943,7 +33032,7 @@ avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
 {
   unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
   unsigned mask = 0;
-  unsigned char ipar[8];
+  unsigned char ipar[8] = {};  /* Silence -Wuninitialized warning.  */

   if (XVECLEN (par, 0) != (int) nelt)
     return 0;
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index ef626835f4a..443d92eadce 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -101,6 +101,9 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
    known at compile time or estimated via feedback, the SIZE array
    is walked in order until MAX is greater then the estimate (or -1
    means infinity).  Corresponding ALG is used then.
+   When NOALIGN is true, the code guaranteeing the alignment of the memory
+   block is skipped.
+
    For example initializer:
     {{256, loop}, {-1, rep_prefix_4_byte}}
    will use loop for blocks smaller or equal to 256 bytes, rep prefix will
@@ -111,6 +114,7 @@ struct stringop_algs
   const struct stringop_strategy {
     const int max;
     const enum stringop_alg alg;
+    int noalign;
   } size [MAX_STRINGOP_ALGS];
 };

@@ -242,12 +246,8 @@ extern const struct processor_costs ix86_size_cost;
 #define TARGET_K8 (ix86_tune == PROCESSOR_K8)
 #define TARGET_ATHLON_K8 (TARGET_K8 || TARGET_ATHLON)
 #define TARGET_NOCONA (ix86_tune == PROCESSOR_NOCONA)
-#define TARGET_CORE2_32 (ix86_tune == PROCESSOR_CORE2_32)
-#define TARGET_CORE2_64 (ix86_tune == PROCESSOR_CORE2_64)
-#define TARGET_CORE2 (TARGET_CORE2_32 || TARGET_CORE2_64)
-#define TARGET_COREI7_32 (ix86_tune == PROCESSOR_COREI7_32)
-#define TARGET_COREI7_64 (ix86_tune == PROCESSOR_COREI7_64)
-#define TARGET_COREI7 (TARGET_COREI7_32 || TARGET_COREI7_64)
+#define TARGET_CORE2 (ix86_tune == PROCESSOR_CORE2)
+#define TARGET_COREI7 (ix86_tune == PROCESSOR_COREI7)
 #define TARGET_GENERIC32 (ix86_tune == PROCESSOR_GENERIC32)
 #define TARGET_GENERIC64 (ix86_tune == PROCESSOR_GENERIC64)
 #define TARGET_GENERIC (TARGET_GENERIC32 || TARGET_GENERIC64)
@@ -2092,10 +2092,8 @@ enum processor_type
   PROCESSOR_PENTIUM4,
   PROCESSOR_K8,
   PROCESSOR_NOCONA,
-  PROCESSOR_CORE2_32,
-  PROCESSOR_CORE2_64,
-  PROCESSOR_COREI7_32,
-  PROCESSOR_COREI7_64,
+  PROCESSOR_CORE2,
+  PROCESSOR_COREI7,
   PROCESSOR_GENERIC32,
   PROCESSOR_GENERIC64,
   PROCESSOR_AMDFAM10,
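
For reference, with the new noalign field each size bucket in a strategy
table now carries three values: {max, alg, noalign}.  The entry below is a
hypothetical illustration only (the array name and the numbers are made up
for this note; the types and field layout are the ones declared in i386.h
above, where the leading libcall is the algorithm used when the block size
is not known at compile time):

  /* Hypothetical memcpy strategy: blocks up to 256 bytes use the inline
     loop and, because noalign is true, skip the alignment prologue;
     larger blocks (max == -1 means no upper bound) go to the library
     call.  Index 0 is the 32-bit table, index 1 the 64-bit one, as for
     the other tables in i386.c.  */
  static const struct stringop_algs example_memcpy[2] = {
    {libcall, {{256, loop, true}, {-1, libcall, false}}},
    {libcall, {{256, loop, true}, {-1, libcall, false}}}};

As the hunks above show, decide_alg picks the first entry whose max covers
the expected size and copies that entry's noalign into its new out
parameter; ix86_expand_movmem and ix86_expand_setmem then set align to
desired_align up front, skipping the alignment code exactly as when
TARGET_ALIGN_STRINGOPS is unset.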