Implement -mmemcpy-strategy= and -mmemset-strategy= options

From-SVN: r201645
This commit is contained in:
Xinliang David Li 2013-08-10 03:48:01 +00:00 committed by Xinliang David Li
parent 7d1d05602c
commit ad83025ea6
13 changed files with 603 additions and 180 deletions

View file

@ -1,3 +1,15 @@
2013-08-09 Xinliang David Li <davidxl@google.com>
* config/i386/stringop.def: New file.
* config/i386/stringop.opt: New file.
* config/i386/i386-opts.h: Include stringopt.def.
* config/i386/i386.opt: Include stringopt.opt.
* config/i386/i386.c (ix86_option_override_internal):
Override default size based stringop inline strategies
with options.
* config/i386/i386.c (ix86_parse_stringop_strategy_string):
New function.
2013-08-09 Jan Hubicka <jh@suse.cz>
* ipa-ref.c (ipa_clear_stmts_in_references): Clear lto_stmt_uid, too.

View file

@ -28,15 +28,17 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
/* Algorithm to expand string function with. */
enum stringop_alg
{
no_stringop,
libcall,
rep_prefix_1_byte,
rep_prefix_4_byte,
rep_prefix_8_byte,
loop_1_byte,
loop,
unrolled_loop,
vector_loop
#undef DEF_ENUM
#define DEF_ENUM
#undef DEF_ALG
#define DEF_ALG(alg, name) alg,
#include "stringop.def"
last_alg
#undef DEF_ENUM
#undef DEF_ALG
};
/* Available call abi. */

View file

@ -87,6 +87,13 @@ static rtx legitimize_pe_coff_symbol (rtx, bool);
#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
static stringop_algs ix86_size_memcpy[2] = {
{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
static stringop_algs ix86_size_memset[2] = {
{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
const
struct processor_costs ix86_size_cost = {/* costs for tuning for size */
COSTS_N_BYTES (2), /* cost of an add instruction */
@ -140,10 +147,8 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
COSTS_N_BYTES (2), /* cost of FABS instruction. */
COSTS_N_BYTES (2), /* cost of FCHS instruction. */
COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
{{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}},
{{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}},
ix86_size_memcpy,
ix86_size_memset,
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
@ -158,6 +163,13 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
};
/* Processor costs (relative to an add) */
static stringop_algs i386_memcpy[2] = {
{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
DUMMY_STRINGOP_ALGS};
static stringop_algs i386_memset[2] = {
{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
DUMMY_STRINGOP_ALGS};
static const
struct processor_costs i386_cost = { /* 386 specific costs */
COSTS_N_INSNS (1), /* cost of an add instruction */
@ -211,10 +223,8 @@ struct processor_costs i386_cost = { /* 386 specific costs */
COSTS_N_INSNS (22), /* cost of FABS instruction. */
COSTS_N_INSNS (24), /* cost of FCHS instruction. */
COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
{{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
DUMMY_STRINGOP_ALGS},
{{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
DUMMY_STRINGOP_ALGS},
i386_memcpy,
i386_memset,
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
@ -228,6 +238,13 @@ struct processor_costs i386_cost = { /* 386 specific costs */
1, /* cond_not_taken_branch_cost. */
};
static stringop_algs i486_memcpy[2] = {
{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
DUMMY_STRINGOP_ALGS};
static stringop_algs i486_memset[2] = {
{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
DUMMY_STRINGOP_ALGS};
static const
struct processor_costs i486_cost = { /* 486 specific costs */
COSTS_N_INSNS (1), /* cost of an add instruction */
@ -283,10 +300,8 @@ struct processor_costs i486_cost = { /* 486 specific costs */
COSTS_N_INSNS (3), /* cost of FABS instruction. */
COSTS_N_INSNS (3), /* cost of FCHS instruction. */
COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
{{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
DUMMY_STRINGOP_ALGS},
{{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
DUMMY_STRINGOP_ALGS},
i486_memcpy,
i486_memset,
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
@ -300,6 +315,13 @@ struct processor_costs i486_cost = { /* 486 specific costs */
1, /* cond_not_taken_branch_cost. */
};
static stringop_algs pentium_memcpy[2] = {
{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
static stringop_algs pentium_memset[2] = {
{libcall, {{-1, rep_prefix_4_byte, false}}},
DUMMY_STRINGOP_ALGS};
static const
struct processor_costs pentium_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
@ -353,10 +375,8 @@ struct processor_costs pentium_cost = {
COSTS_N_INSNS (1), /* cost of FABS instruction. */
COSTS_N_INSNS (1), /* cost of FCHS instruction. */
COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
{{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS},
{{libcall, {{-1, rep_prefix_4_byte, false}}},
DUMMY_STRINGOP_ALGS},
pentium_memcpy,
pentium_memset,
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
@ -370,6 +390,21 @@ struct processor_costs pentium_cost = {
1, /* cond_not_taken_branch_cost. */
};
/* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
(we ensure the alignment). For small blocks inline loop is still a
noticeable win, for bigger blocks either rep movsl or rep movsb is
way to go. Rep movsb has apparently more expensive startup time in CPU,
but after 4K the difference is down in the noise. */
static stringop_algs pentiumpro_memcpy[2] = {
{rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
{8192, rep_prefix_4_byte, false},
{-1, rep_prefix_1_byte, false}}},
DUMMY_STRINGOP_ALGS};
static stringop_algs pentiumpro_memset[2] = {
{rep_prefix_4_byte, {{1024, unrolled_loop, false},
{8192, rep_prefix_4_byte, false},
{-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
static const
struct processor_costs pentiumpro_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
@ -423,19 +458,8 @@ struct processor_costs pentiumpro_cost = {
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
/* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
(we ensure the alignment). For small blocks inline loop is still a
noticeable win, for bigger blocks either rep movsl or rep movsb is
way to go. Rep movsb has apparently more expensive startup time in CPU,
but after 4K the difference is down in the noise. */
{{rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
{8192, rep_prefix_4_byte, false},
{-1, rep_prefix_1_byte, false}}},
DUMMY_STRINGOP_ALGS},
{{rep_prefix_4_byte, {{1024, unrolled_loop, false},
{8192, rep_prefix_4_byte, false},
{-1, libcall, false}}},
DUMMY_STRINGOP_ALGS},
pentiumpro_memcpy,
pentiumpro_memset,
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
@ -449,6 +473,12 @@ struct processor_costs pentiumpro_cost = {
1, /* cond_not_taken_branch_cost. */
};
static stringop_algs geode_memcpy[2] = {
{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
static stringop_algs geode_memset[2] = {
{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
static const
struct processor_costs geode_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
@ -503,10 +533,8 @@ struct processor_costs geode_cost = {
COSTS_N_INSNS (1), /* cost of FABS instruction. */
COSTS_N_INSNS (1), /* cost of FCHS instruction. */
COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
{{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS},
{{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS},
geode_memcpy,
geode_memset,
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
@ -520,6 +548,12 @@ struct processor_costs geode_cost = {
1, /* cond_not_taken_branch_cost. */
};
static stringop_algs k6_memcpy[2] = {
{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
static stringop_algs k6_memset[2] = {
{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
static const
struct processor_costs k6_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
@ -576,10 +610,8 @@ struct processor_costs k6_cost = {
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
{{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS},
{{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS},
k6_memcpy,
k6_memset,
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
@ -593,6 +625,15 @@ struct processor_costs k6_cost = {
1, /* cond_not_taken_branch_cost. */
};
/* For some reason, Athlon deals better with REP prefix (relative to loops)
compared to K8. Alignment becomes important after 8 bytes for memcpy and
128 bytes for memset. */
static stringop_algs athlon_memcpy[2] = {
{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
static stringop_algs athlon_memset[2] = {
{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
static const
struct processor_costs athlon_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
@ -646,13 +687,8 @@ struct processor_costs athlon_cost = {
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
/* For some reason, Athlon deals better with REP prefix (relative to loops)
compared to K8. Alignment becomes important after 8 bytes for memcpy and
128 bytes for memset. */
{{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS},
{{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS},
athlon_memcpy,
athlon_memset,
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
@ -666,6 +702,19 @@ struct processor_costs athlon_cost = {
1, /* cond_not_taken_branch_cost. */
};
/* K8 has optimized REP instruction for medium sized blocks, but for very
small blocks it is better to use loop. For large blocks, libcall can
do nontemporary accesses and beat inline considerably. */
static stringop_algs k8_memcpy[2] = {
{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
static stringop_algs k8_memset[2] = {
{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false},
{8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
static const
struct processor_costs k8_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
@ -724,17 +773,9 @@ struct processor_costs k8_cost = {
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
/* K8 has optimized REP instruction for medium sized blocks, but for very
small blocks it is better to use loop. For large blocks, libcall can
do nontemporary accesses and beat inline considerably. */
{{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}},
{{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false},
{8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
k8_memcpy,
k8_memset,
4, /* scalar_stmt_cost. */
2, /* scalar load_cost. */
2, /* scalar_store_cost. */
@ -748,6 +789,19 @@ struct processor_costs k8_cost = {
2, /* cond_not_taken_branch_cost. */
};
/* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
very small blocks it is better to use loop. For large blocks, libcall can
do nontemporary accesses and beat inline considerably. */
static stringop_algs amdfam10_memcpy[2] = {
{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
static stringop_algs amdfam10_memset[2] = {
{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
struct processor_costs amdfam10_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (2), /* cost of a lea instruction */
@ -814,17 +868,8 @@ struct processor_costs amdfam10_cost = {
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
/* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
very small blocks it is better to use loop. For large blocks, libcall can
do nontemporary accesses and beat inline considerably. */
{{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}},
{{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}},
amdfam10_memcpy,
amdfam10_memset,
4, /* scalar_stmt_cost. */
2, /* scalar load_cost. */
2, /* scalar_store_cost. */
@ -838,7 +883,21 @@ struct processor_costs amdfam10_cost = {
1, /* cond_not_taken_branch_cost. */
};
struct processor_costs bdver1_cost = {
/* BDVER1 has optimized REP instruction for medium sized blocks, but for
very small blocks it is better to use loop. For large blocks, libcall
can do nontemporary accesses and beat inline considerably. */
static stringop_algs bdver1_memcpy[2] = {
{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
static stringop_algs bdver1_memset[2] = {
{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
const struct processor_costs bdver1_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
@ -904,17 +963,8 @@ struct processor_costs bdver1_cost = {
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
/* BDVER1 has optimized REP instruction for medium sized blocks, but for
very small blocks it is better to use loop. For large blocks, libcall
can do nontemporary accesses and beat inline considerably. */
{{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}},
{{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}},
bdver1_memcpy,
bdver1_memset,
6, /* scalar_stmt_cost. */
4, /* scalar load_cost. */
4, /* scalar_store_cost. */
@ -928,7 +978,22 @@ struct processor_costs bdver1_cost = {
1, /* cond_not_taken_branch_cost. */
};
struct processor_costs bdver2_cost = {
/* BDVER2 has optimized REP instruction for medium sized blocks, but for
very small blocks it is better to use loop. For large blocks, libcall
can do nontemporary accesses and beat inline considerably. */
static stringop_algs bdver2_memcpy[2] = {
{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
static stringop_algs bdver2_memset[2] = {
{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
const struct processor_costs bdver2_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
@ -994,17 +1059,8 @@ struct processor_costs bdver2_cost = {
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
/* BDVER2 has optimized REP instruction for medium sized blocks, but for
very small blocks it is better to use loop. For large blocks, libcall
can do nontemporary accesses and beat inline considerably. */
{{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}},
{{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}},
bdver2_memcpy,
bdver2_memset,
6, /* scalar_stmt_cost. */
4, /* scalar load_cost. */
4, /* scalar_store_cost. */
@ -1018,6 +1074,20 @@ struct processor_costs bdver2_cost = {
1, /* cond_not_taken_branch_cost. */
};
/* BDVER3 has optimized REP instruction for medium sized blocks, but for
very small blocks it is better to use loop. For large blocks, libcall
can do nontemporary accesses and beat inline considerably. */
static stringop_algs bdver3_memcpy[2] = {
{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
static stringop_algs bdver3_memset[2] = {
{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
struct processor_costs bdver3_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1), /* cost of a lea instruction */
@ -1076,17 +1146,8 @@ struct processor_costs bdver3_cost = {
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
/* BDVER3 has optimized REP instruction for medium sized blocks, but for
very small blocks it is better to use loop. For large blocks, libcall
can do nontemporary accesses and beat inline considerably. */
{{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}},
{{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}},
bdver3_memcpy,
bdver3_memset,
6, /* scalar_stmt_cost. */
4, /* scalar load_cost. */
4, /* scalar_store_cost. */
@ -1100,7 +1161,20 @@ struct processor_costs bdver3_cost = {
1, /* cond_not_taken_branch_cost. */
};
struct processor_costs btver1_cost = {
/* BTVER1 has optimized REP instruction for medium sized blocks, but for
very small blocks it is better to use loop. For large blocks, libcall can
do nontemporary accesses and beat inline considerably. */
static stringop_algs btver1_memcpy[2] = {
{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
static stringop_algs btver1_memset[2] = {
{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
const struct processor_costs btver1_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (2), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
@ -1161,17 +1235,8 @@ struct processor_costs btver1_cost = {
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
/* BTVER1 has optimized REP instruction for medium sized blocks, but for
very small blocks it is better to use loop. For large blocks, libcall can
do nontemporary accesses and beat inline considerably. */
{{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}},
{{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}},
btver1_memcpy,
btver1_memset,
4, /* scalar_stmt_cost. */
2, /* scalar load_cost. */
2, /* scalar_store_cost. */
@ -1185,7 +1250,17 @@ struct processor_costs btver1_cost = {
1, /* cond_not_taken_branch_cost. */
};
struct processor_costs btver2_cost = {
static stringop_algs btver2_memcpy[2] = {
{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
static stringop_algs btver2_memset[2] = {
{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
const struct processor_costs btver2_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (2), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
@ -1245,15 +1320,8 @@ struct processor_costs btver2_cost = {
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
{{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}},
{{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}},
btver2_memcpy,
btver2_memset,
4, /* scalar_stmt_cost. */
2, /* scalar load_cost. */
2, /* scalar_store_cost. */
@ -1267,6 +1335,14 @@ struct processor_costs btver2_cost = {
1, /* cond_not_taken_branch_cost. */
};
static stringop_algs pentium4_memcpy[2] = {
{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
DUMMY_STRINGOP_ALGS};
static stringop_algs pentium4_memset[2] = {
{libcall, {{6, loop_1_byte, false}, {48, loop, false},
{20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
static const
struct processor_costs pentium4_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
@ -1320,11 +1396,8 @@ struct processor_costs pentium4_cost = {
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
{{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
DUMMY_STRINGOP_ALGS},
{{libcall, {{6, loop_1_byte, false}, {48, loop, false},
{20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS},
pentium4_memcpy,
pentium4_memset,
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
@ -1338,6 +1411,17 @@ struct processor_costs pentium4_cost = {
1, /* cond_not_taken_branch_cost. */
};
static stringop_algs nocona_memcpy[2] = {
{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
{libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
{100000, unrolled_loop, false}, {-1, libcall, false}}}};
static stringop_algs nocona_memset[2] = {
{libcall, {{6, loop_1_byte, false}, {48, loop, false},
{20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{24, loop, false}, {64, unrolled_loop, false},
{8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
static const
struct processor_costs nocona_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
@ -1391,13 +1475,8 @@ struct processor_costs nocona_cost = {
COSTS_N_INSNS (3), /* cost of FABS instruction. */
COSTS_N_INSNS (3), /* cost of FCHS instruction. */
COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
{{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
{libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
{100000, unrolled_loop, false}, {-1, libcall, false}}}},
{{libcall, {{6, loop_1_byte, false}, {48, loop, false},
{20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{24, loop, false}, {64, unrolled_loop, false},
{8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
nocona_memcpy,
nocona_memset,
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
@ -1411,6 +1490,15 @@ struct processor_costs nocona_cost = {
1, /* cond_not_taken_branch_cost. */
};
static stringop_algs atom_memcpy[2] = {
{libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
{libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
{8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
static stringop_algs atom_memset[2] = {
{libcall, {{8, loop, false}, {15, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{24, loop, false}, {32, unrolled_loop, false},
{8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
static const
struct processor_costs atom_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
@ -1464,13 +1552,8 @@ struct processor_costs atom_cost = {
COSTS_N_INSNS (8), /* cost of FABS instruction. */
COSTS_N_INSNS (8), /* cost of FCHS instruction. */
COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
{{libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
{libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
{8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
{{libcall, {{8, loop, false}, {15, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{24, loop, false}, {32, unrolled_loop, false},
{8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
atom_memcpy,
atom_memset,
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
@ -1484,6 +1567,15 @@ struct processor_costs atom_cost = {
1, /* cond_not_taken_branch_cost. */
};
static stringop_algs slm_memcpy[2] = {
{libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
{libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
{8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
static stringop_algs slm_memset[2] = {
{libcall, {{8, loop, false}, {15, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{24, loop, false}, {32, unrolled_loop, false},
{8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
static const
struct processor_costs slm_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
@ -1537,13 +1629,8 @@ struct processor_costs slm_cost = {
COSTS_N_INSNS (8), /* cost of FABS instruction. */
COSTS_N_INSNS (8), /* cost of FCHS instruction. */
COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
{{libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
{libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
{8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
{{libcall, {{8, loop, false}, {15, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{24, loop, false}, {32, unrolled_loop, false},
{8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
slm_memcpy,
slm_memset,
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
@ -1558,6 +1645,15 @@ struct processor_costs slm_cost = {
};
/* Generic64 should produce code tuned for Nocona and K8. */
static stringop_algs generic64_memcpy[2] = {
DUMMY_STRINGOP_ALGS,
{libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
static stringop_algs generic64_memset[2] = {
DUMMY_STRINGOP_ALGS,
{libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
static const
struct processor_costs generic64_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
@ -1617,12 +1713,8 @@ struct processor_costs generic64_cost = {
COSTS_N_INSNS (8), /* cost of FABS instruction. */
COSTS_N_INSNS (8), /* cost of FCHS instruction. */
COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
{DUMMY_STRINGOP_ALGS,
{libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}},
{DUMMY_STRINGOP_ALGS,
{libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}},
generic64_memcpy,
generic64_memset,
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
@ -1637,6 +1729,18 @@ struct processor_costs generic64_cost = {
};
/* core_cost should produce code tuned for Core familly of CPUs. */
static stringop_algs core_memcpy[2] = {
{libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
{libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
{-1, libcall, false}}}};
static stringop_algs core_memset[2] = {
{libcall, {{6, loop_1_byte, true},
{24, loop, true},
{8192, rep_prefix_4_byte, true},
{-1, libcall, false}}},
{libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
{-1, libcall, false}}}};
static const
struct processor_costs core_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
@ -1695,15 +1799,8 @@ struct processor_costs core_cost = {
COSTS_N_INSNS (8), /* cost of FABS instruction. */
COSTS_N_INSNS (8), /* cost of FCHS instruction. */
COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
{{libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
{libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
{-1, libcall, false}}}},
{{libcall, {{6, loop_1_byte, true},
{24, loop, true},
{8192, rep_prefix_4_byte, true},
{-1, libcall, false}}},
{libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
{-1, libcall, false}}}},
core_memcpy,
core_memset,
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
@ -1719,6 +1816,14 @@ struct processor_costs core_cost = {
/* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
Athlon and K8. */
static stringop_algs generic32_memcpy[2] = {
{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
{-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
static stringop_algs generic32_memset[2] = {
{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
{-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
static const
struct processor_costs generic32_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
@ -1772,12 +1877,8 @@ struct processor_costs generic32_cost = {
COSTS_N_INSNS (8), /* cost of FABS instruction. */
COSTS_N_INSNS (8), /* cost of FCHS instruction. */
COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
{{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
{-1, libcall, false}}},
DUMMY_STRINGOP_ALGS},
{{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
{-1, libcall, false}}},
DUMMY_STRINGOP_ALGS},
generic32_memcpy,
generic32_memset,
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
@ -2926,6 +3027,149 @@ ix86_debug_options (void)
return;
}
static const char *stringop_alg_names[] = {
#define DEF_ENUM
#define DEF_ALG(alg, name) #name,
#include "stringop.def"
#undef DEF_ENUM
#undef DEF_ALG
};
/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
The string is of the following form (or comma separated list of it):
strategy_alg:max_size:[align|noalign]
where the full size range for the strategy is either [0, max_size] or
[min_size, max_size], in which min_size is the max_size + 1 of the
preceding range. The last size range must have max_size == -1.
Examples:
1.
-mmemcpy-strategy=libcall:-1:noalign
this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
2.
-mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
This is to tell the compiler to use the following strategy for memset
1) when the expected size is between [1, 16], use rep_8byte strategy;
2) when the size is between [17, 2048], use vector_loop;
3) when the size is > 2048, use libcall. */
struct stringop_size_range
{
int min;
int max;
stringop_alg alg;
bool noalign;
};
static void
ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
{
const struct stringop_algs *default_algs;
stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
char *curr_range_str, *next_range_str;
int i = 0, n = 0;
if (is_memset)
default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
else
default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
curr_range_str = strategy_str;
do
{
int mins, maxs;
stringop_alg alg;
char alg_name[128];
char align[16];
next_range_str = strchr (curr_range_str, ',');
if (next_range_str)
*next_range_str++ = '\0';
if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s",
alg_name, &maxs, align))
{
error ("wrong arg %s to option %s", curr_range_str,
is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
return;
}
if (n > 0 && (maxs < (mins = input_ranges[n - 1].max + 1) && maxs != -1))
{
error ("size ranges of option %s should be increasing",
is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
return;
}
for (i = 0; i < last_alg; i++)
{
if (!strcmp (alg_name, stringop_alg_names[i]))
{
alg = (stringop_alg) i;
break;
}
}
if (i == last_alg)
{
error ("wrong stringop strategy name %s specified for option %s",
alg_name,
is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
return;
}
input_ranges[n].min = mins;
input_ranges[n].max = maxs;
input_ranges[n].alg = alg;
if (!strcmp (align, "align"))
input_ranges[n].noalign = false;
else if (!strcmp (align, "noalign"))
input_ranges[n].noalign = true;
else
{
error ("unknown alignment %s specified for option %s",
align, is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
return;
}
n++;
curr_range_str = next_range_str;
}
while (curr_range_str);
if (input_ranges[n - 1].max != -1)
{
error ("the max value for the last size range should be -1"
" for option %s",
is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
return;
}
if (n > MAX_STRINGOP_ALGS)
{
error ("too many size ranges specified in option %s",
is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
return;
}
/* Now override the default algs array. */
for (i = 0; i < n; i++)
{
*const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
*const_cast<stringop_alg *>(&default_algs->size[i].alg)
= input_ranges[i].alg;
*const_cast<int *>(&default_algs->size[i].noalign)
= input_ranges[i].noalign;
}
}
/* Override various settings based on options. If MAIN_ARGS_P, the
options are from the command line, otherwise they are from
@ -4081,6 +4325,21 @@ ix86_option_override_internal (bool main_args_p)
/* Handle stack protector */
if (!global_options_set.x_ix86_stack_protector_guard)
ix86_stack_protector_guard = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
/* Handle -mmemcpy-strategy= and -mmemset-strategy= */
if (ix86_tune_memcpy_strategy)
{
char *str = xstrdup (ix86_tune_memcpy_strategy);
ix86_parse_stringop_strategy_string (str, false);
free (str);
}
if (ix86_tune_memset_strategy)
{
char *str = xstrdup (ix86_tune_memset_strategy);
ix86_parse_stringop_strategy_string (str, true);
free (str);
}
}
/* Implement the TARGET_OPTION_OVERRIDE hook. */
@ -22964,6 +23223,7 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
{
case libcall:
case no_stringop:
case last_alg:
gcc_unreachable ();
case loop_1_byte:
need_zero_guard = true;
@ -23154,6 +23414,7 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
{
case libcall:
case no_stringop:
case last_alg:
gcc_unreachable ();
case loop_1_byte:
case loop:
@ -23365,6 +23626,7 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
{
case libcall:
case no_stringop:
case last_alg:
gcc_unreachable ();
case loop:
need_zero_guard = true;
@ -23542,6 +23804,7 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
{
case libcall:
case no_stringop:
case last_alg:
gcc_unreachable ();
case loop_1_byte:
case loop:

View file

@ -170,7 +170,7 @@ struct processor_costs {
const int fsqrt; /* cost of FSQRT instruction. */
/* Specify what algorithm
to use for stringops on unknown size. */
struct stringop_algs memcpy[2], memset[2];
struct stringop_algs *memcpy, *memset;
const int scalar_stmt_cost; /* Cost of any scalar operation, excluding
load and store. */
const int scalar_load_cost; /* Cost of scalar load. */

View file

@ -316,6 +316,14 @@ mstack-arg-probe
Target Report Mask(STACK_PROBE) Save
Enable stack probing
mmemcpy-strategy=
Target RejectNegative Joined Var(ix86_tune_memcpy_strategy)
Specify memcpy expansion strategy when expected size is known
mmemset-strategy=
Target RejectNegative Joined Var(ix86_tune_memset_strategy)
Specify memset expansion strategy when expected size is known
mstringop-strategy=
Target RejectNegative Joined Enum(stringop_alg) Var(ix86_stringop_alg) Init(no_stringop)
Chose strategy to generate stringop using

View file

@ -0,0 +1,37 @@
/* Definitions for stringop strategy for IA-32.
Copyright (C) 2013 Free Software Foundation, Inc.
This file is part of GCC.
GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with GCC; see the files COPYING3. If not,
see <http://www.gnu.org/licenses/>. */
DEF_ENUM
DEF_ALG (no_stringop, no_stringop)
DEF_ENUM
DEF_ALG (libcall, libcall)
DEF_ENUM
DEF_ALG (rep_prefix_1_byte, rep_byte)
DEF_ENUM
DEF_ALG (rep_prefix_4_byte, rep_4byte)
DEF_ENUM
DEF_ALG (rep_prefix_8_byte, rep_8byte)
DEF_ENUM
DEF_ALG (loop_1_byte, byte_loop)
DEF_ENUM
DEF_ALG (loop, loop)
DEF_ENUM
DEF_ALG (unrolled_loop, unrolled_loop)
DEF_ENUM
DEF_ALG (vector_loop, vector_loop)

View file

@ -0,0 +1,31 @@
/* Definitions for stringop option handling for IA-32.
Copyright (C) 2013 Free Software Foundation, Inc.
This file is part of GCC.
GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with GCC; see the files COPYING3. If not,
see <http://www.gnu.org/licenses/>. */
Enum(stringop_alg) String(rep_byte) Value(rep_prefix_1_byte)
#undef DEF_ENUM
#define DEF_ENUM EnumValue
#undef DEF_ALG
#define DEF_ALG(alg, name) Enum(stringop_alg) String(name) Value(alg)
#include "stringop.def"
#undef DEF_ENUM
#undef DEF_ALG

View file

@ -652,6 +652,7 @@ Objective-C and Objective-C++ Dialects}.
-mbmi2 -mrtm -mlwp -mthreads @gol
-mno-align-stringops -minline-all-stringops @gol
-minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol
-mmemcpy-strategy=@var{strategy} -mmemset-strategy=@var{strategy}
-mpush-args -maccumulate-outgoing-args -m128bit-long-double @gol
-m96bit-long-double -mlong-double-64 -mlong-double-80 @gol
-mregparm=@var{num} -msseregparm @gol
@ -14651,6 +14652,24 @@ Expand into an inline loop.
Always use a library call.
@end table
@item -mmemcpy-strategy=@var{strategy}
@opindex mmemcpy-strategy=@var{strategy}
Override the internal decision heuristic to decide if @code{__builtin_memcpy}
should be inlined and what inline algorithm to use when the expected size
of the copy operation is known. @var{strategy}
is a comma-separated list of @var{alg}:@var{max_size}:@var{dest_align} triplets.
@var{alg} is specified in @option{-mstringop-strategy}, @var{max_size} specifies
the max byte size with which inline algorithm @var{alg} is allowed. For the last
triplet, the @var{max_size} must be @code{-1}. The @var{max_size} of the triplets
in the list must be specified in increasing order. The minimal byte size for
@var{alg} is @code{0} for the first triplet and @code{@var{max_size} + 1} of the
preceding range.
@item -mmemset-strategy=@var{strategy}
@opindex mmemset-strategy=@var{strategy}
The option is similar to @option{-mmemcpy-strategy=} except that it is to control
@code{__builtin_memset} expansion.
@item -momit-leaf-frame-pointer
@opindex momit-leaf-frame-pointer
Don't keep the frame pointer in a register for leaf functions. This

View file

@ -1,3 +1,10 @@
2013-08-09 Xinliang David Li <davidxl@google.com>
* gcc.target/i386/memcpy-strategy-1.c: New test.
* gcc.target/i386/memcpy-strategy-2.c: Ditto.
* gcc.target/i386/memset-strategy-1.c: Ditto.
* gcc.target/i386/memcpy-strategy-3.c: Ditto.
2013-08-09 Jan Hubicka <jh@suse.cz>
* gcc.dg/tree-prof/crossmodule-indircall-1.c: New testcase.

View file

@ -0,0 +1,12 @@
/* { dg-do compile } */
/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:-1:align" } */
/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */
/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */
char a[2048];
char b[2048];
void t (void)
{
__builtin_memcpy (a, b, 2048);
}

View file

@ -0,0 +1,12 @@
/* { dg-do compile } */
/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */
/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */
/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */
char a[2048];
char b[2048];
void t (void)
{
__builtin_memcpy (a, b, 2048);
}

View file

@ -0,0 +1,10 @@
/* { dg-do compile } */
/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:2000:align,libcall:-1:align" } */
/* { dg-final { scan-assembler-times "memcpy" 2 } } */
char a[2048];
char b[2048];
void t (void)
{
__builtin_memcpy (a, b, 2048);
}

View file

@ -0,0 +1,10 @@
/* { dg-do compile } */
/* { dg-options "-O2 -march=atom -mmemset-strategy=libcall:-1:align" } */
/* { dg-final { scan-assembler-times "memset" 2 } } */
char a[2048];
void t (void)
{
__builtin_memset (a, 1, 2048);
}