diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 3acc890ba0c..5e0f269d68b 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,15 @@ +2013-08-09 Xinliang David Li + + * config/i386/stringop.def: New file. + * config/i386/stringop.opt: New file. + * config/i386/i386-opts.h: Include stringopt.def. + * config/i386/i386.opt: Include stringopt.opt. + * config/i386/i386.c (ix86_option_override_internal): + Override default size based stringop inline strategies + with options. + * config/i386/i386.c (ix86_parse_stringop_strategy_string): + New function. + 2013-08-09 Jan Hubicka * ipa-ref.c (ipa_clear_stmts_in_references): Clear lto_stmt_uid, too. diff --git a/gcc/config/i386/i386-opts.h b/gcc/config/i386/i386-opts.h index bea1c257830..5fcbd6b5776 100644 --- a/gcc/config/i386/i386-opts.h +++ b/gcc/config/i386/i386-opts.h @@ -28,15 +28,17 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see /* Algorithm to expand string function with. */ enum stringop_alg { - no_stringop, - libcall, - rep_prefix_1_byte, - rep_prefix_4_byte, - rep_prefix_8_byte, - loop_1_byte, - loop, - unrolled_loop, - vector_loop +#undef DEF_ENUM +#define DEF_ENUM + +#undef DEF_ALG +#define DEF_ALG(alg, name) alg, + +#include "stringop.def" +last_alg + +#undef DEF_ENUM +#undef DEF_ALG }; /* Available call abi. */ diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 9812b5e764c..509cb6e8f83 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -87,6 +87,13 @@ static rtx legitimize_pe_coff_symbol (rtx, bool); #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}} +static stringop_algs ix86_size_memcpy[2] = { + {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, + {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}}; +static stringop_algs ix86_size_memset[2] = { + {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, + {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}}; + const struct processor_costs ix86_size_cost = {/* costs for tuning for size */ COSTS_N_BYTES (2), /* cost of an add instruction */ @@ -140,10 +147,8 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */ COSTS_N_BYTES (2), /* cost of FABS instruction. */ COSTS_N_BYTES (2), /* cost of FCHS instruction. */ COSTS_N_BYTES (2), /* cost of FSQRT instruction. */ - {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, - {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}}, - {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, - {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}}, + ix86_size_memcpy, + ix86_size_memset, 1, /* scalar_stmt_cost. */ 1, /* scalar load_cost. */ 1, /* scalar_store_cost. */ @@ -158,6 +163,13 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */ }; /* Processor costs (relative to an add) */ +static stringop_algs i386_memcpy[2] = { + {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, + DUMMY_STRINGOP_ALGS}; +static stringop_algs i386_memset[2] = { + {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, + DUMMY_STRINGOP_ALGS}; + static const struct processor_costs i386_cost = { /* 386 specific costs */ COSTS_N_INSNS (1), /* cost of an add instruction */ @@ -211,10 +223,8 @@ struct processor_costs i386_cost = { /* 386 specific costs */ COSTS_N_INSNS (22), /* cost of FABS instruction. */ COSTS_N_INSNS (24), /* cost of FCHS instruction. */ COSTS_N_INSNS (122), /* cost of FSQRT instruction. 
*/ - {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, - DUMMY_STRINGOP_ALGS}, - {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, - DUMMY_STRINGOP_ALGS}, + i386_memcpy, + i386_memset, 1, /* scalar_stmt_cost. */ 1, /* scalar load_cost. */ 1, /* scalar_store_cost. */ @@ -228,6 +238,13 @@ struct processor_costs i386_cost = { /* 386 specific costs */ 1, /* cond_not_taken_branch_cost. */ }; +static stringop_algs i486_memcpy[2] = { + {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}}, + DUMMY_STRINGOP_ALGS}; +static stringop_algs i486_memset[2] = { + {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}}, + DUMMY_STRINGOP_ALGS}; + static const struct processor_costs i486_cost = { /* 486 specific costs */ COSTS_N_INSNS (1), /* cost of an add instruction */ @@ -283,10 +300,8 @@ struct processor_costs i486_cost = { /* 486 specific costs */ COSTS_N_INSNS (3), /* cost of FABS instruction. */ COSTS_N_INSNS (3), /* cost of FCHS instruction. */ COSTS_N_INSNS (83), /* cost of FSQRT instruction. */ - {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}}, - DUMMY_STRINGOP_ALGS}, - {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}}, - DUMMY_STRINGOP_ALGS}, + i486_memcpy, + i486_memset, 1, /* scalar_stmt_cost. */ 1, /* scalar load_cost. */ 1, /* scalar_store_cost. */ @@ -300,6 +315,13 @@ struct processor_costs i486_cost = { /* 486 specific costs */ 1, /* cond_not_taken_branch_cost. */ }; +static stringop_algs pentium_memcpy[2] = { + {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + DUMMY_STRINGOP_ALGS}; +static stringop_algs pentium_memset[2] = { + {libcall, {{-1, rep_prefix_4_byte, false}}}, + DUMMY_STRINGOP_ALGS}; + static const struct processor_costs pentium_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ @@ -353,10 +375,8 @@ struct processor_costs pentium_cost = { COSTS_N_INSNS (1), /* cost of FABS instruction. */ COSTS_N_INSNS (1), /* cost of FCHS instruction. */ COSTS_N_INSNS (70), /* cost of FSQRT instruction. */ - {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, - DUMMY_STRINGOP_ALGS}, - {{libcall, {{-1, rep_prefix_4_byte, false}}}, - DUMMY_STRINGOP_ALGS}, + pentium_memcpy, + pentium_memset, 1, /* scalar_stmt_cost. */ 1, /* scalar load_cost. */ 1, /* scalar_store_cost. */ @@ -370,6 +390,21 @@ struct processor_costs pentium_cost = { 1, /* cond_not_taken_branch_cost. */ }; +/* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes + (we ensure the alignment). For small blocks inline loop is still a + noticeable win, for bigger blocks either rep movsl or rep movsb is + way to go. Rep movsb has apparently more expensive startup time in CPU, + but after 4K the difference is down in the noise. */ +static stringop_algs pentiumpro_memcpy[2] = { + {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false}, + {8192, rep_prefix_4_byte, false}, + {-1, rep_prefix_1_byte, false}}}, + DUMMY_STRINGOP_ALGS}; +static stringop_algs pentiumpro_memset[2] = { + {rep_prefix_4_byte, {{1024, unrolled_loop, false}, + {8192, rep_prefix_4_byte, false}, + {-1, libcall, false}}}, + DUMMY_STRINGOP_ALGS}; static const struct processor_costs pentiumpro_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ @@ -423,19 +458,8 @@ struct processor_costs pentiumpro_cost = { COSTS_N_INSNS (2), /* cost of FABS instruction. */ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ COSTS_N_INSNS (56), /* cost of FSQRT instruction. */ - /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes - (we ensure the alignment). 
For small blocks inline loop is still a - noticeable win, for bigger blocks either rep movsl or rep movsb is - way to go. Rep movsb has apparently more expensive startup time in CPU, - but after 4K the difference is down in the noise. */ - {{rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false}, - {8192, rep_prefix_4_byte, false}, - {-1, rep_prefix_1_byte, false}}}, - DUMMY_STRINGOP_ALGS}, - {{rep_prefix_4_byte, {{1024, unrolled_loop, false}, - {8192, rep_prefix_4_byte, false}, - {-1, libcall, false}}}, - DUMMY_STRINGOP_ALGS}, + pentiumpro_memcpy, + pentiumpro_memset, 1, /* scalar_stmt_cost. */ 1, /* scalar load_cost. */ 1, /* scalar_store_cost. */ @@ -449,6 +473,12 @@ struct processor_costs pentiumpro_cost = { 1, /* cond_not_taken_branch_cost. */ }; +static stringop_algs geode_memcpy[2] = { + {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + DUMMY_STRINGOP_ALGS}; +static stringop_algs geode_memset[2] = { + {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + DUMMY_STRINGOP_ALGS}; static const struct processor_costs geode_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ @@ -503,10 +533,8 @@ struct processor_costs geode_cost = { COSTS_N_INSNS (1), /* cost of FABS instruction. */ COSTS_N_INSNS (1), /* cost of FCHS instruction. */ COSTS_N_INSNS (54), /* cost of FSQRT instruction. */ - {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, - DUMMY_STRINGOP_ALGS}, - {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, - DUMMY_STRINGOP_ALGS}, + geode_memcpy, + geode_memset, 1, /* scalar_stmt_cost. */ 1, /* scalar load_cost. */ 1, /* scalar_store_cost. */ @@ -520,6 +548,12 @@ struct processor_costs geode_cost = { 1, /* cond_not_taken_branch_cost. */ }; +static stringop_algs k6_memcpy[2] = { + {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + DUMMY_STRINGOP_ALGS}; +static stringop_algs k6_memset[2] = { + {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + DUMMY_STRINGOP_ALGS}; static const struct processor_costs k6_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ @@ -576,10 +610,8 @@ struct processor_costs k6_cost = { COSTS_N_INSNS (2), /* cost of FABS instruction. */ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ COSTS_N_INSNS (56), /* cost of FSQRT instruction. */ - {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, - DUMMY_STRINGOP_ALGS}, - {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, - DUMMY_STRINGOP_ALGS}, + k6_memcpy, + k6_memset, 1, /* scalar_stmt_cost. */ 1, /* scalar load_cost. */ 1, /* scalar_store_cost. */ @@ -593,6 +625,15 @@ struct processor_costs k6_cost = { 1, /* cond_not_taken_branch_cost. */ }; +/* For some reason, Athlon deals better with REP prefix (relative to loops) + compared to K8. Alignment becomes important after 8 bytes for memcpy and + 128 bytes for memset. */ +static stringop_algs athlon_memcpy[2] = { + {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + DUMMY_STRINGOP_ALGS}; +static stringop_algs athlon_memset[2] = { + {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + DUMMY_STRINGOP_ALGS}; static const struct processor_costs athlon_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ @@ -646,13 +687,8 @@ struct processor_costs athlon_cost = { COSTS_N_INSNS (2), /* cost of FABS instruction. */ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ COSTS_N_INSNS (35), /* cost of FSQRT instruction. 
*/ - /* For some reason, Athlon deals better with REP prefix (relative to loops) - compared to K8. Alignment becomes important after 8 bytes for memcpy and - 128 bytes for memset. */ - {{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, - DUMMY_STRINGOP_ALGS}, - {{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, - DUMMY_STRINGOP_ALGS}, + athlon_memcpy, + athlon_memset, 1, /* scalar_stmt_cost. */ 1, /* scalar load_cost. */ 1, /* scalar_store_cost. */ @@ -666,6 +702,19 @@ struct processor_costs athlon_cost = { 1, /* cond_not_taken_branch_cost. */ }; +/* K8 has optimized REP instruction for medium sized blocks, but for very + small blocks it is better to use loop. For large blocks, libcall can + do nontemporary accesses and beat inline considerably. */ +static stringop_algs k8_memcpy[2] = { + {libcall, {{6, loop, false}, {14, unrolled_loop, false}, + {-1, rep_prefix_4_byte, false}}}, + {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, + {-1, libcall, false}}}}; +static stringop_algs k8_memset[2] = { + {libcall, {{8, loop, false}, {24, unrolled_loop, false}, + {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + {libcall, {{48, unrolled_loop, false}, + {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; static const struct processor_costs k8_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ @@ -724,17 +773,9 @@ struct processor_costs k8_cost = { COSTS_N_INSNS (2), /* cost of FABS instruction. */ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ - /* K8 has optimized REP instruction for medium sized blocks, but for very - small blocks it is better to use loop. For large blocks, libcall can - do nontemporary accesses and beat inline considerably. */ - {{libcall, {{6, loop, false}, {14, unrolled_loop, false}, - {-1, rep_prefix_4_byte, false}}}, - {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, - {-1, libcall, false}}}}, - {{libcall, {{8, loop, false}, {24, unrolled_loop, false}, - {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, - {libcall, {{48, unrolled_loop, false}, - {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}, + + k8_memcpy, + k8_memset, 4, /* scalar_stmt_cost. */ 2, /* scalar load_cost. */ 2, /* scalar_store_cost. */ @@ -748,6 +789,19 @@ struct processor_costs k8_cost = { 2, /* cond_not_taken_branch_cost. */ }; +/* AMDFAM10 has optimized REP instruction for medium sized blocks, but for + very small blocks it is better to use loop. For large blocks, libcall can + do nontemporary accesses and beat inline considerably. */ +static stringop_algs amdfam10_memcpy[2] = { + {libcall, {{6, loop, false}, {14, unrolled_loop, false}, + {-1, rep_prefix_4_byte, false}}}, + {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, + {-1, libcall, false}}}}; +static stringop_algs amdfam10_memset[2] = { + {libcall, {{8, loop, false}, {24, unrolled_loop, false}, + {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, + {-1, libcall, false}}}}; struct processor_costs amdfam10_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (2), /* cost of a lea instruction */ @@ -814,17 +868,8 @@ struct processor_costs amdfam10_cost = { COSTS_N_INSNS (2), /* cost of FCHS instruction. */ COSTS_N_INSNS (35), /* cost of FSQRT instruction. 
*/ - /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for - very small blocks it is better to use loop. For large blocks, libcall can - do nontemporary accesses and beat inline considerably. */ - {{libcall, {{6, loop, false}, {14, unrolled_loop, false}, - {-1, rep_prefix_4_byte, false}}}, - {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, - {-1, libcall, false}}}}, - {{libcall, {{8, loop, false}, {24, unrolled_loop, false}, - {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, - {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, - {-1, libcall, false}}}}, + amdfam10_memcpy, + amdfam10_memset, 4, /* scalar_stmt_cost. */ 2, /* scalar load_cost. */ 2, /* scalar_store_cost. */ @@ -838,7 +883,21 @@ struct processor_costs amdfam10_cost = { 1, /* cond_not_taken_branch_cost. */ }; -struct processor_costs bdver1_cost = { +/* BDVER1 has optimized REP instruction for medium sized blocks, but for + very small blocks it is better to use loop. For large blocks, libcall + can do nontemporary accesses and beat inline considerably. */ +static stringop_algs bdver1_memcpy[2] = { + {libcall, {{6, loop, false}, {14, unrolled_loop, false}, + {-1, rep_prefix_4_byte, false}}}, + {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, + {-1, libcall, false}}}}; +static stringop_algs bdver1_memset[2] = { + {libcall, {{8, loop, false}, {24, unrolled_loop, false}, + {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, + {-1, libcall, false}}}}; + +const struct processor_costs bdver1_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ COSTS_N_INSNS (1), /* variable shift costs */ @@ -904,17 +963,8 @@ struct processor_costs bdver1_cost = { COSTS_N_INSNS (2), /* cost of FCHS instruction. */ COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ - /* BDVER1 has optimized REP instruction for medium sized blocks, but for - very small blocks it is better to use loop. For large blocks, libcall - can do nontemporary accesses and beat inline considerably. */ - {{libcall, {{6, loop, false}, {14, unrolled_loop, false}, - {-1, rep_prefix_4_byte, false}}}, - {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, - {-1, libcall, false}}}}, - {{libcall, {{8, loop, false}, {24, unrolled_loop, false}, - {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, - {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, - {-1, libcall, false}}}}, + bdver1_memcpy, + bdver1_memset, 6, /* scalar_stmt_cost. */ 4, /* scalar load_cost. */ 4, /* scalar_store_cost. */ @@ -928,7 +978,22 @@ struct processor_costs bdver1_cost = { 1, /* cond_not_taken_branch_cost. */ }; -struct processor_costs bdver2_cost = { +/* BDVER2 has optimized REP instruction for medium sized blocks, but for + very small blocks it is better to use loop. For large blocks, libcall + can do nontemporary accesses and beat inline considerably. 
*/ + +static stringop_algs bdver2_memcpy[2] = { + {libcall, {{6, loop, false}, {14, unrolled_loop, false}, + {-1, rep_prefix_4_byte, false}}}, + {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, + {-1, libcall, false}}}}; +static stringop_algs bdver2_memset[2] = { + {libcall, {{8, loop, false}, {24, unrolled_loop, false}, + {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, + {-1, libcall, false}}}}; + +const struct processor_costs bdver2_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ COSTS_N_INSNS (1), /* variable shift costs */ @@ -994,17 +1059,8 @@ struct processor_costs bdver2_cost = { COSTS_N_INSNS (2), /* cost of FCHS instruction. */ COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ - /* BDVER2 has optimized REP instruction for medium sized blocks, but for - very small blocks it is better to use loop. For large blocks, libcall - can do nontemporary accesses and beat inline considerably. */ - {{libcall, {{6, loop, false}, {14, unrolled_loop, false}, - {-1, rep_prefix_4_byte, false}}}, - {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, - {-1, libcall, false}}}}, - {{libcall, {{8, loop, false}, {24, unrolled_loop, false}, - {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, - {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, - {-1, libcall, false}}}}, + bdver2_memcpy, + bdver2_memset, 6, /* scalar_stmt_cost. */ 4, /* scalar load_cost. */ 4, /* scalar_store_cost. */ @@ -1018,6 +1074,20 @@ struct processor_costs bdver2_cost = { 1, /* cond_not_taken_branch_cost. */ }; + + /* BDVER3 has optimized REP instruction for medium sized blocks, but for + very small blocks it is better to use loop. For large blocks, libcall + can do nontemporary accesses and beat inline considerably. */ +static stringop_algs bdver3_memcpy[2] = { + {libcall, {{6, loop, false}, {14, unrolled_loop, false}, + {-1, rep_prefix_4_byte, false}}}, + {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, + {-1, libcall, false}}}}; +static stringop_algs bdver3_memset[2] = { + {libcall, {{8, loop, false}, {24, unrolled_loop, false}, + {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, + {-1, libcall, false}}}}; struct processor_costs bdver3_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -1076,17 +1146,8 @@ struct processor_costs bdver3_cost = { COSTS_N_INSNS (2), /* cost of FCHS instruction. */ COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ - /* BDVER3 has optimized REP instruction for medium sized blocks, but for - very small blocks it is better to use loop. For large blocks, libcall - can do nontemporary accesses and beat inline considerably. */ - {{libcall, {{6, loop, false}, {14, unrolled_loop, false}, - {-1, rep_prefix_4_byte, false}}}, - {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, - {-1, libcall, false}}}}, - {{libcall, {{8, loop, false}, {24, unrolled_loop, false}, - {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, - {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, - {-1, libcall, false}}}}, + bdver3_memcpy, + bdver3_memset, 6, /* scalar_stmt_cost. */ 4, /* scalar load_cost. */ 4, /* scalar_store_cost. */ @@ -1100,7 +1161,20 @@ struct processor_costs bdver3_cost = { 1, /* cond_not_taken_branch_cost. 
*/ }; -struct processor_costs btver1_cost = { + /* BTVER1 has optimized REP instruction for medium sized blocks, but for + very small blocks it is better to use loop. For large blocks, libcall can + do nontemporary accesses and beat inline considerably. */ +static stringop_algs btver1_memcpy[2] = { + {libcall, {{6, loop, false}, {14, unrolled_loop, false}, + {-1, rep_prefix_4_byte, false}}}, + {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, + {-1, libcall, false}}}}; +static stringop_algs btver1_memset[2] = { + {libcall, {{8, loop, false}, {24, unrolled_loop, false}, + {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, + {-1, libcall, false}}}}; +const struct processor_costs btver1_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (2), /* cost of a lea instruction */ COSTS_N_INSNS (1), /* variable shift costs */ @@ -1161,17 +1235,8 @@ struct processor_costs btver1_cost = { COSTS_N_INSNS (2), /* cost of FCHS instruction. */ COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ - /* BTVER1 has optimized REP instruction for medium sized blocks, but for - very small blocks it is better to use loop. For large blocks, libcall can - do nontemporary accesses and beat inline considerably. */ - {{libcall, {{6, loop, false}, {14, unrolled_loop, false}, - {-1, rep_prefix_4_byte, false}}}, - {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, - {-1, libcall, false}}}}, - {{libcall, {{8, loop, false}, {24, unrolled_loop, false}, - {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, - {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, - {-1, libcall, false}}}}, + btver1_memcpy, + btver1_memset, 4, /* scalar_stmt_cost. */ 2, /* scalar load_cost. */ 2, /* scalar_store_cost. */ @@ -1185,7 +1250,17 @@ struct processor_costs btver1_cost = { 1, /* cond_not_taken_branch_cost. */ }; -struct processor_costs btver2_cost = { +static stringop_algs btver2_memcpy[2] = { + {libcall, {{6, loop, false}, {14, unrolled_loop, false}, + {-1, rep_prefix_4_byte, false}}}, + {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, + {-1, libcall, false}}}}; +static stringop_algs btver2_memset[2] = { + {libcall, {{8, loop, false}, {24, unrolled_loop, false}, + {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, + {-1, libcall, false}}}}; +const struct processor_costs btver2_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (2), /* cost of a lea instruction */ COSTS_N_INSNS (1), /* variable shift costs */ @@ -1245,15 +1320,8 @@ struct processor_costs btver2_cost = { COSTS_N_INSNS (2), /* cost of FABS instruction. */ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ - - {{libcall, {{6, loop, false}, {14, unrolled_loop, false}, - {-1, rep_prefix_4_byte, false}}}, - {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, - {-1, libcall, false}}}}, - {{libcall, {{8, loop, false}, {24, unrolled_loop, false}, - {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, - {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, - {-1, libcall, false}}}}, + btver2_memcpy, + btver2_memset, 4, /* scalar_stmt_cost. */ 2, /* scalar load_cost. */ 2, /* scalar_store_cost. */ @@ -1267,6 +1335,14 @@ struct processor_costs btver2_cost = { 1, /* cond_not_taken_branch_cost. 
*/ }; +static stringop_algs pentium4_memcpy[2] = { + {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}}, + DUMMY_STRINGOP_ALGS}; +static stringop_algs pentium4_memset[2] = { + {libcall, {{6, loop_1_byte, false}, {48, loop, false}, + {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + DUMMY_STRINGOP_ALGS}; + static const struct processor_costs pentium4_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ @@ -1320,11 +1396,8 @@ struct processor_costs pentium4_cost = { COSTS_N_INSNS (2), /* cost of FABS instruction. */ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ COSTS_N_INSNS (43), /* cost of FSQRT instruction. */ - {{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}}, - DUMMY_STRINGOP_ALGS}, - {{libcall, {{6, loop_1_byte, false}, {48, loop, false}, - {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}}, - DUMMY_STRINGOP_ALGS}, + pentium4_memcpy, + pentium4_memset, 1, /* scalar_stmt_cost. */ 1, /* scalar load_cost. */ 1, /* scalar_store_cost. */ @@ -1338,6 +1411,17 @@ struct processor_costs pentium4_cost = { 1, /* cond_not_taken_branch_cost. */ }; +static stringop_algs nocona_memcpy[2] = { + {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}}, + {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false}, + {100000, unrolled_loop, false}, {-1, libcall, false}}}}; + +static stringop_algs nocona_memset[2] = { + {libcall, {{6, loop_1_byte, false}, {48, loop, false}, + {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + {libcall, {{24, loop, false}, {64, unrolled_loop, false}, + {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; + static const struct processor_costs nocona_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ @@ -1391,13 +1475,8 @@ struct processor_costs nocona_cost = { COSTS_N_INSNS (3), /* cost of FABS instruction. */ COSTS_N_INSNS (3), /* cost of FCHS instruction. */ COSTS_N_INSNS (44), /* cost of FSQRT instruction. */ - {{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}}, - {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false}, - {100000, unrolled_loop, false}, {-1, libcall, false}}}}, - {{libcall, {{6, loop_1_byte, false}, {48, loop, false}, - {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}}, - {libcall, {{24, loop, false}, {64, unrolled_loop, false}, - {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}, + nocona_memcpy, + nocona_memset, 1, /* scalar_stmt_cost. */ 1, /* scalar load_cost. */ 1, /* scalar_store_cost. */ @@ -1411,6 +1490,15 @@ struct processor_costs nocona_cost = { 1, /* cond_not_taken_branch_cost. */ }; +static stringop_algs atom_memcpy[2] = { + {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, + {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, + {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; +static stringop_algs atom_memset[2] = { + {libcall, {{8, loop, false}, {15, unrolled_loop, false}, + {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + {libcall, {{24, loop, false}, {32, unrolled_loop, false}, + {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; static const struct processor_costs atom_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ @@ -1464,13 +1552,8 @@ struct processor_costs atom_cost = { COSTS_N_INSNS (8), /* cost of FABS instruction. */ COSTS_N_INSNS (8), /* cost of FCHS instruction. */ COSTS_N_INSNS (40), /* cost of FSQRT instruction. 
*/ - {{libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, - {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, - {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}, - {{libcall, {{8, loop, false}, {15, unrolled_loop, false}, - {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, - {libcall, {{24, loop, false}, {32, unrolled_loop, false}, - {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}, + atom_memcpy, + atom_memset, 1, /* scalar_stmt_cost. */ 1, /* scalar load_cost. */ 1, /* scalar_store_cost. */ @@ -1484,6 +1567,15 @@ struct processor_costs atom_cost = { 1, /* cond_not_taken_branch_cost. */ }; +static stringop_algs slm_memcpy[2] = { + {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, + {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, + {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; +static stringop_algs slm_memset[2] = { + {libcall, {{8, loop, false}, {15, unrolled_loop, false}, + {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, + {libcall, {{24, loop, false}, {32, unrolled_loop, false}, + {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; static const struct processor_costs slm_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ @@ -1537,13 +1629,8 @@ struct processor_costs slm_cost = { COSTS_N_INSNS (8), /* cost of FABS instruction. */ COSTS_N_INSNS (8), /* cost of FCHS instruction. */ COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ - {{libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, - {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, - {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}, - {{libcall, {{8, loop, false}, {15, unrolled_loop, false}, - {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, - {libcall, {{24, loop, false}, {32, unrolled_loop, false}, - {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}, + slm_memcpy, + slm_memset, 1, /* scalar_stmt_cost. */ 1, /* scalar load_cost. */ 1, /* scalar_store_cost. */ @@ -1558,6 +1645,15 @@ struct processor_costs slm_cost = { }; /* Generic64 should produce code tuned for Nocona and K8. */ + +static stringop_algs generic64_memcpy[2] = { + DUMMY_STRINGOP_ALGS, + {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false}, + {-1, libcall, false}}}}; +static stringop_algs generic64_memset[2] = { + DUMMY_STRINGOP_ALGS, + {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false}, + {-1, libcall, false}}}}; static const struct processor_costs generic64_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ @@ -1617,12 +1713,8 @@ struct processor_costs generic64_cost = { COSTS_N_INSNS (8), /* cost of FABS instruction. */ COSTS_N_INSNS (8), /* cost of FCHS instruction. */ COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ - {DUMMY_STRINGOP_ALGS, - {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false}, - {-1, libcall, false}}}}, - {DUMMY_STRINGOP_ALGS, - {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false}, - {-1, libcall, false}}}}, + generic64_memcpy, + generic64_memset, 1, /* scalar_stmt_cost. */ 1, /* scalar load_cost. */ 1, /* scalar_store_cost. */ @@ -1637,6 +1729,18 @@ struct processor_costs generic64_cost = { }; /* core_cost should produce code tuned for Core familly of CPUs. 
*/ +static stringop_algs core_memcpy[2] = { + {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}}, + {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true}, + {-1, libcall, false}}}}; +static stringop_algs core_memset[2] = { + {libcall, {{6, loop_1_byte, true}, + {24, loop, true}, + {8192, rep_prefix_4_byte, true}, + {-1, libcall, false}}}, + {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true}, + {-1, libcall, false}}}}; + static const struct processor_costs core_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ @@ -1695,15 +1799,8 @@ struct processor_costs core_cost = { COSTS_N_INSNS (8), /* cost of FABS instruction. */ COSTS_N_INSNS (8), /* cost of FCHS instruction. */ COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ - {{libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}}, - {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true}, - {-1, libcall, false}}}}, - {{libcall, {{6, loop_1_byte, true}, - {24, loop, true}, - {8192, rep_prefix_4_byte, true}, - {-1, libcall, false}}}, - {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true}, - {-1, libcall, false}}}}, + core_memcpy, + core_memset, 1, /* scalar_stmt_cost. */ 1, /* scalar load_cost. */ 1, /* scalar_store_cost. */ @@ -1719,6 +1816,14 @@ struct processor_costs core_cost = { /* Generic32 should produce code tuned for PPro, Pentium4, Nocona, Athlon and K8. */ +static stringop_algs generic32_memcpy[2] = { + {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false}, + {-1, libcall, false}}}, + DUMMY_STRINGOP_ALGS}; +static stringop_algs generic32_memset[2] = { + {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false}, + {-1, libcall, false}}}, + DUMMY_STRINGOP_ALGS}; static const struct processor_costs generic32_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ @@ -1772,12 +1877,8 @@ struct processor_costs generic32_cost = { COSTS_N_INSNS (8), /* cost of FABS instruction. */ COSTS_N_INSNS (8), /* cost of FCHS instruction. */ COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ - {{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false}, - {-1, libcall, false}}}, - DUMMY_STRINGOP_ALGS}, - {{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false}, - {-1, libcall, false}}}, - DUMMY_STRINGOP_ALGS}, + generic32_memcpy, + generic32_memset, 1, /* scalar_stmt_cost. */ 1, /* scalar load_cost. */ 1, /* scalar_store_cost. */ @@ -2926,6 +3027,149 @@ ix86_debug_options (void) return; } + +static const char *stringop_alg_names[] = { +#define DEF_ENUM +#define DEF_ALG(alg, name) #name, +#include "stringop.def" +#undef DEF_ENUM +#undef DEF_ALG +}; + +/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=. + The string is of the following form (or comma separated list of it): + + strategy_alg:max_size:[align|noalign] + + where the full size range for the strategy is either [0, max_size] or + [min_size, max_size], in which min_size is the max_size + 1 of the + preceding range. The last size range must have max_size == -1. + + Examples: + + 1. + -mmemcpy-strategy=libcall:-1:noalign + + this is equivalent to (for known size memcpy) -mstringop-strategy=libcall + + + 2. + -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign + + This is to tell the compiler to use the following strategy for memset + 1) when the expected size is between [1, 16], use rep_8byte strategy; + 2) when the size is between [17, 2048], use vector_loop; + 3) when the size is > 2048, use libcall. 
*/ + +struct stringop_size_range +{ + int min; + int max; + stringop_alg alg; + bool noalign; +}; + +static void +ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset) +{ + const struct stringop_algs *default_algs; + stringop_size_range input_ranges[MAX_STRINGOP_ALGS]; + char *curr_range_str, *next_range_str; + int i = 0, n = 0; + + if (is_memset) + default_algs = &ix86_cost->memset[TARGET_64BIT != 0]; + else + default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0]; + + curr_range_str = strategy_str; + + do + { + int mins, maxs; + stringop_alg alg; + char alg_name[128]; + char align[16]; + next_range_str = strchr (curr_range_str, ','); + if (next_range_str) + *next_range_str++ = '\0'; + + if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s", + alg_name, &maxs, align)) + { + error ("wrong arg %s to option %s", curr_range_str, + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + + if (n > 0 && (maxs < (mins = input_ranges[n - 1].max + 1) && maxs != -1)) + { + error ("size ranges of option %s should be increasing", + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + + for (i = 0; i < last_alg; i++) + { + if (!strcmp (alg_name, stringop_alg_names[i])) + { + alg = (stringop_alg) i; + break; + } + } + + if (i == last_alg) + { + error ("wrong stringop strategy name %s specified for option %s", + alg_name, + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + + input_ranges[n].min = mins; + input_ranges[n].max = maxs; + input_ranges[n].alg = alg; + if (!strcmp (align, "align")) + input_ranges[n].noalign = false; + else if (!strcmp (align, "noalign")) + input_ranges[n].noalign = true; + else + { + error ("unknown alignment %s specified for option %s", + align, is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + n++; + curr_range_str = next_range_str; + } + while (curr_range_str); + + if (input_ranges[n - 1].max != -1) + { + error ("the max value for the last size range should be -1" + " for option %s", + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + + if (n > MAX_STRINGOP_ALGS) + { + error ("too many size ranges specified in option %s", + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + + /* Now override the default algs array. */ + for (i = 0; i < n; i++) + { + *const_cast(&default_algs->size[i].max) = input_ranges[i].max; + *const_cast(&default_algs->size[i].alg) + = input_ranges[i].alg; + *const_cast(&default_algs->size[i].noalign) + = input_ranges[i].noalign; + } +} + /* Override various settings based on options. If MAIN_ARGS_P, the options are from the command line, otherwise they are from @@ -4081,6 +4325,21 @@ ix86_option_override_internal (bool main_args_p) /* Handle stack protector */ if (!global_options_set.x_ix86_stack_protector_guard) ix86_stack_protector_guard = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS; + + /* Handle -mmemcpy-strategy= and -mmemset-strategy= */ + if (ix86_tune_memcpy_strategy) + { + char *str = xstrdup (ix86_tune_memcpy_strategy); + ix86_parse_stringop_strategy_string (str, false); + free (str); + } + + if (ix86_tune_memset_strategy) + { + char *str = xstrdup (ix86_tune_memset_strategy); + ix86_parse_stringop_strategy_string (str, true); + free (str); + } } /* Implement the TARGET_OPTION_OVERRIDE hook. 
*/ @@ -22964,6 +23223,7 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, { case libcall: case no_stringop: + case last_alg: gcc_unreachable (); case loop_1_byte: need_zero_guard = true; @@ -23154,6 +23414,7 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, { case libcall: case no_stringop: + case last_alg: gcc_unreachable (); case loop_1_byte: case loop: @@ -23365,6 +23626,7 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp, { case libcall: case no_stringop: + case last_alg: gcc_unreachable (); case loop: need_zero_guard = true; @@ -23542,6 +23804,7 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp, { case libcall: case no_stringop: + case last_alg: gcc_unreachable (); case loop_1_byte: case loop: diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 97ddf0846a5..aa133538461 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -170,7 +170,7 @@ struct processor_costs { const int fsqrt; /* cost of FSQRT instruction. */ /* Specify what algorithm to use for stringops on unknown size. */ - struct stringop_algs memcpy[2], memset[2]; + struct stringop_algs *memcpy, *memset; const int scalar_stmt_cost; /* Cost of any scalar operation, excluding load and store. */ const int scalar_load_cost; /* Cost of scalar load. */ diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index 4b0c4a630df..37e62f5bce8 100644 --- a/gcc/config/i386/i386.opt +++ b/gcc/config/i386/i386.opt @@ -316,6 +316,14 @@ mstack-arg-probe Target Report Mask(STACK_PROBE) Save Enable stack probing +mmemcpy-strategy= +Target RejectNegative Joined Var(ix86_tune_memcpy_strategy) +Specify memcpy expansion strategy when expected size is known + +mmemset-strategy= +Target RejectNegative Joined Var(ix86_tune_memset_strategy) +Specify memset expansion strategy when expected size is known + mstringop-strategy= Target RejectNegative Joined Enum(stringop_alg) Var(ix86_stringop_alg) Init(no_stringop) Chose strategy to generate stringop using diff --git a/gcc/config/i386/stringop.def b/gcc/config/i386/stringop.def new file mode 100644 index 00000000000..1a7d1e88f65 --- /dev/null +++ b/gcc/config/i386/stringop.def @@ -0,0 +1,37 @@ +/* Definitions for stringop strategy for IA-32. + Copyright (C) 2013 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the files COPYING3. If not, +see . 
*/ + +DEF_ENUM +DEF_ALG (no_stringop, no_stringop) +DEF_ENUM +DEF_ALG (libcall, libcall) +DEF_ENUM +DEF_ALG (rep_prefix_1_byte, rep_byte) +DEF_ENUM +DEF_ALG (rep_prefix_4_byte, rep_4byte) +DEF_ENUM +DEF_ALG (rep_prefix_8_byte, rep_8byte) +DEF_ENUM +DEF_ALG (loop_1_byte, byte_loop) +DEF_ENUM +DEF_ALG (loop, loop) +DEF_ENUM +DEF_ALG (unrolled_loop, unrolled_loop) +DEF_ENUM +DEF_ALG (vector_loop, vector_loop) diff --git a/gcc/config/i386/stringop.opt b/gcc/config/i386/stringop.opt new file mode 100644 index 00000000000..5c5fc906a33 --- /dev/null +++ b/gcc/config/i386/stringop.opt @@ -0,0 +1,31 @@ +/* Definitions for stringop option handling for IA-32. + Copyright (C) 2013 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the files COPYING3. If not, +see . */ + +Enum(stringop_alg) String(rep_byte) Value(rep_prefix_1_byte) + +#undef DEF_ENUM +#define DEF_ENUM EnumValue + +#undef DEF_ALG +#define DEF_ALG(alg, name) Enum(stringop_alg) String(name) Value(alg) + +#include "stringop.def" + +#undef DEF_ENUM +#undef DEF_ALG diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 14955dd1be0..70f09e72773 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -652,6 +652,7 @@ Objective-C and Objective-C++ Dialects}. -mbmi2 -mrtm -mlwp -mthreads @gol -mno-align-stringops -minline-all-stringops @gol -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol +-mmemcpy-strategy=@var{strategy} -mmemset-strategy=@var{strategy} -mpush-args -maccumulate-outgoing-args -m128bit-long-double @gol -m96bit-long-double -mlong-double-64 -mlong-double-80 @gol -mregparm=@var{num} -msseregparm @gol @@ -14651,6 +14652,24 @@ Expand into an inline loop. Always use a library call. @end table +@item -mmemcpy-strategy=@var{strategy} +@opindex mmemcpy-strategy=@var{strategy} +Override the internal decision heuristic to decide if @code{__builtin_memcpy} +should be inlined and what inline algorithm to use when the expected size +of the copy operation is known. @var{strategy} +is a comma-separated list of @var{alg}:@var{max_size}:@var{dest_align} triplets. +@var{alg} is specified in @option{-mstringop-strategy}, @var{max_size} specifies +the max byte size with which inline algorithm @var{alg} is allowed. For the last +triplet, the @var{max_size} must be @code{-1}. The @var{max_size} of the triplets +in the list must be specified in increasing order. The minimal byte size for +@var{alg} is @code{0} for the first triplet and @code{@var{max_size} + 1} of the +preceding range. + +@item -mmemset-strategy=@var{strategy} +@opindex mmemset-strategy=@var{strategy} +The option is similar to @option{-mmemcpy-strategy=} except that it is to control +@code{__builtin_memset} expansion. + @item -momit-leaf-frame-pointer @opindex momit-leaf-frame-pointer Don't keep the frame pointer in a register for leaf functions. 
This diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 39fee31b1e6..6ecfe7ea01b 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,10 @@ +2013-08-09 Xinliang David Li + + * gcc.target/i386/memcpy-strategy-1.c: New test. + * gcc.target/i386/memcpy-strategy-2.c: Ditto. + * gcc.target/i386/memset-strategy-1.c: Ditto. + * gcc.target/i386/memcpy-strategy-3.c: Ditto. + 2013-08-09 Jan Hubicka * gcc.dg/tree-prof/crossmodule-indircall-1.c: New testcase. diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-1.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-1.c new file mode 100644 index 00000000000..a2b66d966d0 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-1.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:-1:align" } */ +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */ +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */ + +char a[2048]; +char b[2048]; +void t (void) +{ + __builtin_memcpy (a, b, 2048); +} + diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-2.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-2.c new file mode 100644 index 00000000000..c2f49f0cc5f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-2.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */ +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */ +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */ + +char a[2048]; +char b[2048]; +void t (void) +{ + __builtin_memcpy (a, b, 2048); +} + diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-3.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-3.c new file mode 100644 index 00000000000..ddd1ef7c0b2 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-3.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:2000:align,libcall:-1:align" } */ +/* { dg-final { scan-assembler-times "memcpy" 2 } } */ + +char a[2048]; +char b[2048]; +void t (void) +{ + __builtin_memcpy (a, b, 2048); +} diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-1.c b/gcc/testsuite/gcc.target/i386/memset-strategy-1.c new file mode 100644 index 00000000000..d1b97c5df10 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-strategy-1.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=atom -mmemset-strategy=libcall:-1:align" } */ +/* { dg-final { scan-assembler-times "memset" 2 } } */ + +char a[2048]; +void t (void) +{ + __builtin_memset (a, 1, 2048); +} +
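
For reference, a sketch of what the stringop.def X-macro machinery expands to after preprocessing, assuming the DEF_ENUM/DEF_ALG definitions shown above in i386-opts.h and i386.c (illustration only, not part of the patch):

/* With DEF_ENUM defined to nothing and DEF_ALG(alg, name) defined as
   "alg," (as in i386-opts.h), including stringop.def yields:  */

enum stringop_alg
{
  no_stringop,
  libcall,
  rep_prefix_1_byte,
  rep_prefix_4_byte,
  rep_prefix_8_byte,
  loop_1_byte,
  loop,
  unrolled_loop,
  vector_loop,
  last_alg
};

/* With DEF_ALG(alg, name) defined as "#name," (as in i386.c), the same
   include produces the name table that ix86_parse_stringop_strategy_string
   matches -mmemcpy-strategy=/-mmemset-strategy= algorithm names against:  */

static const char *stringop_alg_names[] =
{
  "no_stringop", "libcall", "rep_byte", "rep_4byte", "rep_8byte",
  "byte_loop", "loop", "unrolled_loop", "vector_loop"
};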
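
A usage sketch of the new option syntax, taking the strategy string straight from the comment above ix86_parse_stringop_strategy_string; the per-call comments describe the intended range selection as documented, not verified code generation, and a target with SSE support (the tests above use -march=atom) is assumed for vector_loop:

/* Compile with, e.g.:

     gcc -O2 -march=atom \
       -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign \
       example.c

   so that known-size memset expansions use rep_8byte for sizes [1, 16],
   vector_loop for [17, 2048], and a library call beyond that.  */

#include <string.h>

char buf[4096];

void
set_small (void)
{
  memset (buf, 0, 16);          /* falls in the [1, 16] rep_8byte range */
}

void
set_medium (void)
{
  memset (buf, 0, 1024);        /* falls in the [17, 2048] vector_loop range */
}

void
set_large (void)
{
  memset (buf, 0, 4096);        /* size > 2048, expanded as a libcall */
}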