aarch64-builtins.c: Builtins for rsqrt and rsqrtf.
2015-11-06 Benedikt Huber <benedikt.huber@theobroma-systems.com> Philipp Tomsich <philipp.tomsich@theobroma-systems.com> * config/aarch64/aarch64-builtins.c: Builtins for rsqrt and rsqrtf. * config/aarch64/aarch64-protos.h: Declare. * config/aarch64/aarch64-simd.md: Matching expressions for frsqrte and frsqrts. * config/aarch64/aarch64-tuning-flags.def: Added recip_sqrt. * config/aarch64/aarch64.c: New functions. Emit rsqrt estimation code when applicable. * config/aarch64/aarch64.md: Added enum entries. * config/aarch64/aarch64.opt: Added option -mlow-precision-recip-sqrt. * testsuite/gcc.target/aarch64/rsqrt_asm_check_common.h: Common macros for assembly checks. * testsuite/gcc.target/aarch64/rsqrt_asm_check_negative_1.c: Make sure frsqrts and frsqrte are not emitted. * testsuite/gcc.target/aarch64/rsqrt_asm_check_1.c: Make sure frsqrts and frsqrte are emitted. * testsuite/gcc.target/aarch64/rsqrt_1.c: Functional tests for rsqrt. Signed-off-by: Philipp Tomsich <philipp.tomsich@theobroma-systems.com> Co-Authored-By: Philipp Tomsich <philipp.tomsich@theobroma-systems.com> From-SVN: r229866
This commit is contained in:
parent
7ad72a979b
commit
a6fc00da76
9 changed files with 292 additions and 2 deletions
|
@ -1,3 +1,23 @@
|
|||
2015-11-06 Benedikt Huber <benedikt.huber@theobroma-systems.com>
|
||||
Philipp Tomsich <philipp.tomsich@theobroma-systems.com>
|
||||
|
||||
* config/aarch64/aarch64-builtins.c: Builtins for rsqrt and rsqrtf.
|
||||
* config/aarch64/aarch64-protos.h: Declare.
|
||||
* config/aarch64/aarch64-simd.md: Matching expressions for frsqrte and
|
||||
frsqrts.
|
||||
* config/aarch64/aarch64-tuning-flags.def: Added recip_sqrt.
|
||||
* config/aarch64/aarch64.c: New functions. Emit rsqrt estimation code when
|
||||
applicable.
|
||||
* config/aarch64/aarch64.md: Added enum entries.
|
||||
* config/aarch64/aarch64.opt: Added option -mlow-precision-recip-sqrt.
|
||||
* testsuite/gcc.target/aarch64/rsqrt_asm_check_common.h: Common macros for
|
||||
assembly checks.
|
||||
* testsuite/gcc.target/aarch64/rsqrt_asm_check_negative_1.c: Make sure
|
||||
frsqrts and frsqrte are not emitted.
|
||||
* testsuite/gcc.target/aarch64/rsqrt_asm_check_1.c: Make sure frsqrts and
|
||||
frsqrte are emitted.
|
||||
* testsuite/gcc.target/aarch64/rsqrt_1.c: Functional tests for rsqrt.
|
||||
|
||||
2015-11-07 Jan Hubicka <hubicka@ucw.cz>
|
||||
|
||||
PR ipa/68057
|
||||
|
|
|
@ -324,6 +324,11 @@ enum aarch64_builtins
|
|||
AARCH64_BUILTIN_GET_FPSR,
|
||||
AARCH64_BUILTIN_SET_FPSR,
|
||||
|
||||
AARCH64_BUILTIN_RSQRT_DF,
|
||||
AARCH64_BUILTIN_RSQRT_SF,
|
||||
AARCH64_BUILTIN_RSQRT_V2DF,
|
||||
AARCH64_BUILTIN_RSQRT_V2SF,
|
||||
AARCH64_BUILTIN_RSQRT_V4SF,
|
||||
AARCH64_SIMD_BUILTIN_BASE,
|
||||
AARCH64_SIMD_BUILTIN_LANE_CHECK,
|
||||
#include "aarch64-simd-builtins.def"
|
||||
|
@ -822,6 +827,46 @@ aarch64_init_crc32_builtins ()
|
|||
}
|
||||
}
|
||||
|
||||
/* Add builtins for reciprocal square root. */
|
||||
|
||||
void
|
||||
aarch64_init_builtin_rsqrt (void)
|
||||
{
|
||||
tree fndecl = NULL;
|
||||
tree ftype = NULL;
|
||||
|
||||
tree V2SF_type_node = build_vector_type (float_type_node, 2);
|
||||
tree V2DF_type_node = build_vector_type (double_type_node, 2);
|
||||
tree V4SF_type_node = build_vector_type (float_type_node, 4);
|
||||
|
||||
struct builtin_decls_data
|
||||
{
|
||||
tree type_node;
|
||||
const char *builtin_name;
|
||||
int function_code;
|
||||
};
|
||||
|
||||
builtin_decls_data bdda[] =
|
||||
{
|
||||
{ double_type_node, "__builtin_aarch64_rsqrt_df", AARCH64_BUILTIN_RSQRT_DF },
|
||||
{ float_type_node, "__builtin_aarch64_rsqrt_sf", AARCH64_BUILTIN_RSQRT_SF },
|
||||
{ V2DF_type_node, "__builtin_aarch64_rsqrt_v2df", AARCH64_BUILTIN_RSQRT_V2DF },
|
||||
{ V2SF_type_node, "__builtin_aarch64_rsqrt_v2sf", AARCH64_BUILTIN_RSQRT_V2SF },
|
||||
{ V4SF_type_node, "__builtin_aarch64_rsqrt_v4sf", AARCH64_BUILTIN_RSQRT_V4SF }
|
||||
};
|
||||
|
||||
builtin_decls_data *bdd = bdda;
|
||||
builtin_decls_data *bdd_end = bdd + (sizeof (bdda) / sizeof (builtin_decls_data));
|
||||
|
||||
for (; bdd < bdd_end; bdd++)
|
||||
{
|
||||
ftype = build_function_type_list (bdd->type_node, bdd->type_node, NULL_TREE);
|
||||
fndecl = add_builtin_function (bdd->builtin_name,
|
||||
ftype, bdd->function_code, BUILT_IN_MD, NULL, NULL_TREE);
|
||||
aarch64_builtin_decls[bdd->function_code] = fndecl;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
aarch64_init_builtins (void)
|
||||
{
|
||||
|
@ -853,6 +898,7 @@ aarch64_init_builtins (void)
|
|||
aarch64_init_simd_builtins ();
|
||||
|
||||
aarch64_init_crc32_builtins ();
|
||||
aarch64_init_builtin_rsqrt ();
|
||||
}
|
||||
|
||||
tree
|
||||
|
@ -1116,6 +1162,44 @@ aarch64_crc32_expand_builtin (int fcode, tree exp, rtx target)
|
|||
return target;
|
||||
}
|
||||
|
||||
/* Function to expand reciprocal square root builtins. */
|
||||
|
||||
static rtx
|
||||
aarch64_expand_builtin_rsqrt (int fcode, tree exp, rtx target)
|
||||
{
|
||||
tree arg0 = CALL_EXPR_ARG (exp, 0);
|
||||
rtx op0 = expand_normal (arg0);
|
||||
|
||||
rtx (*gen) (rtx, rtx);
|
||||
|
||||
switch (fcode)
|
||||
{
|
||||
case AARCH64_BUILTIN_RSQRT_DF:
|
||||
gen = gen_aarch64_rsqrt_df2;
|
||||
break;
|
||||
case AARCH64_BUILTIN_RSQRT_SF:
|
||||
gen = gen_aarch64_rsqrt_sf2;
|
||||
break;
|
||||
case AARCH64_BUILTIN_RSQRT_V2DF:
|
||||
gen = gen_aarch64_rsqrt_v2df2;
|
||||
break;
|
||||
case AARCH64_BUILTIN_RSQRT_V2SF:
|
||||
gen = gen_aarch64_rsqrt_v2sf2;
|
||||
break;
|
||||
case AARCH64_BUILTIN_RSQRT_V4SF:
|
||||
gen = gen_aarch64_rsqrt_v4sf2;
|
||||
break;
|
||||
default: gcc_unreachable ();
|
||||
}
|
||||
|
||||
if (!target)
|
||||
target = gen_reg_rtx (GET_MODE (op0));
|
||||
|
||||
emit_insn (gen (target, op0));
|
||||
|
||||
return target;
|
||||
}
|
||||
|
||||
/* Expand an expression EXP that calls a built-in function,
|
||||
with result going to TARGET if that's convenient. */
|
||||
rtx
|
||||
|
@ -1163,6 +1247,13 @@ aarch64_expand_builtin (tree exp,
|
|||
else if (fcode >= AARCH64_CRC32_BUILTIN_BASE && fcode <= AARCH64_CRC32_BUILTIN_MAX)
|
||||
return aarch64_crc32_expand_builtin (fcode, exp, target);
|
||||
|
||||
if (fcode == AARCH64_BUILTIN_RSQRT_DF
|
||||
|| fcode == AARCH64_BUILTIN_RSQRT_SF
|
||||
|| fcode == AARCH64_BUILTIN_RSQRT_V2DF
|
||||
|| fcode == AARCH64_BUILTIN_RSQRT_V2SF
|
||||
|| fcode == AARCH64_BUILTIN_RSQRT_V4SF)
|
||||
return aarch64_expand_builtin_rsqrt (fcode, exp, target);
|
||||
|
||||
gcc_unreachable ();
|
||||
}
|
||||
|
||||
|
@ -1320,6 +1411,30 @@ aarch64_builtin_vectorized_function (tree fndecl, tree type_out, tree type_in)
|
|||
return NULL_TREE;
|
||||
}
|
||||
|
||||
/* Return builtin for reciprocal square root. */
|
||||
|
||||
tree
|
||||
aarch64_builtin_rsqrt (unsigned int fn, bool md_fn)
|
||||
{
|
||||
if (md_fn)
|
||||
{
|
||||
if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2df)
|
||||
return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF];
|
||||
if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2sf)
|
||||
return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2SF];
|
||||
if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv4sf)
|
||||
return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V4SF];
|
||||
}
|
||||
else
|
||||
{
|
||||
if (fn == BUILT_IN_SQRT)
|
||||
return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_DF];
|
||||
if (fn == BUILT_IN_SQRTF)
|
||||
return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_SF];
|
||||
}
|
||||
return NULL_TREE;
|
||||
}
|
||||
|
||||
#undef VAR1
|
||||
#define VAR1(T, N, MAP, A) \
|
||||
case AARCH64_SIMD_BUILTIN_##T##_##N##A:
|
||||
|
|
|
@ -352,6 +352,8 @@ void aarch64_register_pragmas (void);
|
|||
void aarch64_relayout_simd_types (void);
|
||||
void aarch64_reset_previous_fndecl (void);
|
||||
|
||||
void aarch64_emit_swrsqrt (rtx, rtx);
|
||||
|
||||
/* Initialize builtins for SIMD intrinsics. */
|
||||
void init_aarch64_simd_builtins (void);
|
||||
|
||||
|
@ -403,6 +405,8 @@ rtx aarch64_expand_builtin (tree exp,
|
|||
int ignore ATTRIBUTE_UNUSED);
|
||||
tree aarch64_builtin_decl (unsigned, bool ATTRIBUTE_UNUSED);
|
||||
|
||||
tree aarch64_builtin_rsqrt (unsigned int, bool);
|
||||
|
||||
tree
|
||||
aarch64_builtin_vectorized_function (tree fndecl,
|
||||
tree type_out,
|
||||
|
|
|
@ -382,6 +382,33 @@
|
|||
[(set_attr "type" "neon_fp_mul_d_scalar_q")]
|
||||
)
|
||||
|
||||
;; Reciprocal square root estimate.  Matches an UNSPEC_RSQRTE of one
;; VALLF register operand and outputs a single frsqrte instruction.
;; Only available when TARGET_SIMD holds.
(define_insn "aarch64_rsqrte_<mode>2"
|
||||
[(set (match_operand:VALLF 0 "register_operand" "=w")
|
||||
(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
|
||||
UNSPEC_RSQRTE))]
|
||||
"TARGET_SIMD"
|
||||
"frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>"
|
||||
[(set_attr "type" "neon_fp_rsqrte_<Vetype><q>")])
|
||||
|
||||
;; Reciprocal square root step.  Matches an UNSPEC_RSQRTS of two VALLF
;; register operands and outputs a single frsqrts instruction.
;; Only available when TARGET_SIMD holds.
(define_insn "aarch64_rsqrts_<mode>3"
|
||||
[(set (match_operand:VALLF 0 "register_operand" "=w")
|
||||
(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")
|
||||
(match_operand:VALLF 2 "register_operand" "w")]
|
||||
UNSPEC_RSQRTS))]
|
||||
"TARGET_SIMD"
|
||||
"frsqrts\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
|
||||
[(set_attr "type" "neon_fp_rsqrts_<Vetype><q>")])
|
||||
|
||||
;; Full reciprocal square root on a VALLF operand.  No instruction is
;; matched directly: the preparation code hands operands 0 and 1 to
;; aarch64_emit_swrsqrt, which emits the whole sequence, and then
;; finishes the expansion with DONE.
(define_expand "aarch64_rsqrt_<mode>2"
|
||||
[(set (match_operand:VALLF 0 "register_operand" "=w")
|
||||
(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
|
||||
UNSPEC_RSQRT))]
|
||||
"TARGET_SIMD"
|
||||
{
|
||||
aarch64_emit_swrsqrt (operands[0], operands[1]);
|
||||
DONE;
|
||||
})
|
||||
|
||||
(define_insn "*aarch64_mul3_elt_to_64v2df"
|
||||
[(set (match_operand:DF 0 "register_operand" "=w")
|
||||
(mult:DF
|
||||
|
|
|
@ -29,4 +29,5 @@
|
|||
AARCH64_TUNE_ to give an enum name. */
|
||||
|
||||
AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
|
||||
AARCH64_EXTRA_TUNING_OPTION ("recip_sqrt", RECIP_SQRT)
|
||||
|
||||
|
|
|
@ -403,7 +403,8 @@ static const struct tune_params cortexa57_tunings =
|
|||
2, /* min_div_recip_mul_sf. */
|
||||
2, /* min_div_recip_mul_df. */
|
||||
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
|
||||
(AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags. */
|
||||
(AARCH64_EXTRA_TUNE_RENAME_FMA_REGS
|
||||
| AARCH64_EXTRA_TUNE_RECIP_SQRT) /* tune_flags. */
|
||||
};
|
||||
|
||||
static const struct tune_params cortexa72_tunings =
|
||||
|
@ -470,7 +471,7 @@ static const struct tune_params xgene1_tunings =
|
|||
2, /* min_div_recip_mul_sf. */
|
||||
2, /* min_div_recip_mul_df. */
|
||||
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
|
||||
(AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
|
||||
(AARCH64_EXTRA_TUNE_RECIP_SQRT) /* tune_flags. */
|
||||
};
|
||||
|
||||
/* Support for fine-grained override of the tuning structures. */
|
||||
|
@ -7031,6 +7032,105 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
|
|||
return aarch64_tune_params.memmov_cost;
|
||||
}
|
||||
|
||||
/* Function to decide when to use
|
||||
reciprocal square root builtins. */
|
||||
|
||||
static tree
|
||||
aarch64_builtin_reciprocal (unsigned int fn,
|
||||
bool md_fn,
|
||||
bool)
|
||||
{
|
||||
if (flag_trapping_math
|
||||
|| !flag_unsafe_math_optimizations
|
||||
|| optimize_size
|
||||
|| ! (aarch64_tune_params.extra_tuning_flags
|
||||
& AARCH64_EXTRA_TUNE_RECIP_SQRT))
|
||||
{
|
||||
return NULL_TREE;
|
||||
}
|
||||
|
||||
return aarch64_builtin_rsqrt (fn, md_fn);
|
||||
}
|
||||
|
||||
typedef rtx (*rsqrte_type) (rtx, rtx);
|
||||
|
||||
/* Select reciprocal square root initial estimate
|
||||
insn depending on machine mode. */
|
||||
|
||||
rsqrte_type
|
||||
get_rsqrte_type (machine_mode mode)
|
||||
{
|
||||
switch (mode)
|
||||
{
|
||||
case DFmode: return gen_aarch64_rsqrte_df2;
|
||||
case SFmode: return gen_aarch64_rsqrte_sf2;
|
||||
case V2DFmode: return gen_aarch64_rsqrte_v2df2;
|
||||
case V2SFmode: return gen_aarch64_rsqrte_v2sf2;
|
||||
case V4SFmode: return gen_aarch64_rsqrte_v4sf2;
|
||||
default: gcc_unreachable ();
|
||||
}
|
||||
}
|
||||
|
||||
typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
|
||||
|
||||
/* Select reciprocal square root Newton-Raphson step
|
||||
insn depending on machine mode. */
|
||||
|
||||
rsqrts_type
|
||||
get_rsqrts_type (machine_mode mode)
|
||||
{
|
||||
switch (mode)
|
||||
{
|
||||
case DFmode: return gen_aarch64_rsqrts_df3;
|
||||
case SFmode: return gen_aarch64_rsqrts_sf3;
|
||||
case V2DFmode: return gen_aarch64_rsqrts_v2df3;
|
||||
case V2SFmode: return gen_aarch64_rsqrts_v2sf3;
|
||||
case V4SFmode: return gen_aarch64_rsqrts_v4sf3;
|
||||
default: gcc_unreachable ();
|
||||
}
|
||||
}
|
||||
|
||||
/* Emit instruction sequence to compute
|
||||
reciprocal square root. Use two Newton-Raphson steps
|
||||
for single precision and three for double precision. */
|
||||
|
||||
void
|
||||
aarch64_emit_swrsqrt (rtx dst, rtx src)
|
||||
{
|
||||
machine_mode mode = GET_MODE (src);
|
||||
gcc_assert (
|
||||
mode == SFmode || mode == V2SFmode || mode == V4SFmode
|
||||
|| mode == DFmode || mode == V2DFmode);
|
||||
|
||||
rtx xsrc = gen_reg_rtx (mode);
|
||||
emit_move_insn (xsrc, src);
|
||||
rtx x0 = gen_reg_rtx (mode);
|
||||
|
||||
emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
|
||||
|
||||
bool double_mode = (mode == DFmode || mode == V2DFmode);
|
||||
|
||||
int iterations = double_mode ? 3 : 2;
|
||||
|
||||
if (flag_mrecip_low_precision_sqrt)
|
||||
iterations--;
|
||||
|
||||
for (int i = 0; i < iterations; ++i)
|
||||
{
|
||||
rtx x1 = gen_reg_rtx (mode);
|
||||
rtx x2 = gen_reg_rtx (mode);
|
||||
rtx x3 = gen_reg_rtx (mode);
|
||||
emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
|
||||
|
||||
emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
|
||||
|
||||
emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
|
||||
x0 = x1;
|
||||
}
|
||||
|
||||
emit_move_insn (dst, x0);
|
||||
}
|
||||
|
||||
/* Return the number of instructions that can be issued per cycle. */
|
||||
static int
|
||||
aarch64_sched_issue_rate (void)
|
||||
|
@ -13455,6 +13555,9 @@ aarch64_promoted_type (const_tree t)
|
|||
#undef TARGET_BUILTIN_DECL
|
||||
#define TARGET_BUILTIN_DECL aarch64_builtin_decl
|
||||
|
||||
#undef TARGET_BUILTIN_RECIPROCAL
|
||||
#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
|
||||
|
||||
#undef TARGET_EXPAND_BUILTIN
|
||||
#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
|
||||
|
||||
|
|
|
@ -126,6 +126,9 @@
|
|||
UNSPEC_VSTRUCTDUMMY
|
||||
UNSPEC_SP_SET
|
||||
UNSPEC_SP_TEST
|
||||
UNSPEC_RSQRT
|
||||
UNSPEC_RSQRTE
|
||||
UNSPEC_RSQRTS
|
||||
])
|
||||
|
||||
(define_c_enum "unspecv" [
|
||||
|
|
|
@ -148,3 +148,8 @@ Enum(aarch64_abi) String(lp64) Value(AARCH64_ABI_LP64)
|
|||
mpc-relative-literal-loads
|
||||
Target Report Save Var(nopcrelative_literal_loads) Init(2) Save
|
||||
PC relative literal loads.
|
||||
|
||||
mlow-precision-recip-sqrt
|
||||
Common Var(flag_mrecip_low_precision_sqrt) Optimization
|
||||
When calculating a reciprocal square root approximation, run fewer steps.
|
||||
This reduces precision, but can result in faster computation.
|
||||
|
|
|
@ -521,6 +521,7 @@ Objective-C and Objective-C++ Dialects}.
|
|||
-mtls-size=@var{size} @gol
|
||||
-mfix-cortex-a53-835769 -mno-fix-cortex-a53-835769 @gol
|
||||
-mfix-cortex-a53-843419 -mno-fix-cortex-a53-843419 @gol
|
||||
-mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt @gol
|
||||
-march=@var{name} -mcpu=@var{name} -mtune=@var{name}}
|
||||
|
||||
@emph{Adapteva Epiphany Options}
|
||||
|
@ -12519,6 +12520,17 @@ Enable or disable the workaround for the ARM Cortex-A53 erratum number 843419.
|
|||
This erratum workaround is made at link time and this will only pass the
|
||||
corresponding flag to the linker.
|
||||
|
||||
@item -mlow-precision-recip-sqrt
|
||||
@item -mno-low-precision-recip-sqrt
|
||||
@opindex mlow-precision-recip-sqrt
|
||||
@opindex mno-low-precision-recip-sqrt
|
||||
The reciprocal square root estimate uses two steps instead of three for double-precision,
|
||||
and one step instead of two for single-precision,
|
||||
thus reducing both latency and precision.
|
||||
This is only relevant if @option{-ffast-math} activates
|
||||
reciprocal square root estimate instructions,
|
||||
which in turn depends on the target processor.
|
||||
|
||||
@item -march=@var{name}
|
||||
@opindex march
|
||||
Specify the name of the target architecture, optionally suffixed by one or
|
||||
|
|
Loading…
Add table
Reference in a new issue