AArch64: Cleanup memset expansion
Clean up the memset implementation.  Similar to memcpy/memmove, use an offset and bytes throughout.  Simplify the complex calculations when optimizing for size by using a fixed limit.

gcc/ChangeLog:
	* config/aarch64/aarch64.cc (MAX_SET_SIZE): New define.
	(aarch64_progress_pointer): Remove function.
	(aarch64_set_one_block_and_progress_pointer): Simplify and clean up.
	(aarch64_expand_setmem): Clean up implementation, use byte offsets,
	simplify size calculation.
This commit is contained in:
parent 768fbb56b3
commit 6b86f71165

1 changed file with 35 additions and 122 deletions
gcc/config/aarch64/aarch64.cc

@@ -103,6 +103,10 @@
 /* Defined for convenience.  */
 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
 
+/* Maximum bytes set for an inline memset expansion.  With -Os use 3 STP
+   and 1 MOVI/DUP (same size as a call).  */
+#define MAX_SET_SIZE(speed) (speed ? 256 : 96)
+
 /* Flags that describe how a function shares certain architectural state
    with its callers.
 
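As a sanity check on the numbers in that comment, here is a back-of-envelope model (a hypothetical sketch, not GCC code; the instruction counts come from the comment above and from the old cost model removed later in this patch) showing that the -Os limit of 96 bytes keeps the inline expansion no larger than a call:

#include <cassert>

int main ()
{
  const unsigned stp_q_bytes = 32;  // one STP of two 16-byte Q registers
  const unsigned os_limit = 96;     // MAX_SET_SIZE (false), i.e. -Os

  // 1 MOVI/DUP to materialize the value + 96/32 = 3 STP stores.
  unsigned inline_insns = 1 + os_limit / stp_q_bytes;

  // Worst-case libcall per the old cost model: 3 argument moves + 1 bl.
  unsigned libcall_insns = 3 + 1;

  assert (inline_insns == libcall_insns);  // both come to 4 instructions
}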
@@ -26565,15 +26569,6 @@ aarch64_move_pointer (rtx pointer, poly_int64 amount)
 					    next, amount);
 }
 
-/* Return a new RTX holding the result of moving POINTER forward by the
-   size of the mode it points to.  */
-
-static rtx
-aarch64_progress_pointer (rtx pointer)
-{
-  return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
-}
-
 /* Expand a cpymem/movmem using the MOPS extension.  OPERANDS are taken
    from the cpymem/movmem pattern.  IS_MEMMOVE is true if this is a memmove
    rather than memcpy.  Return true iff we succeeded.  */
@@ -26699,48 +26694,6 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
   return true;
 }
 
-/* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
-   SRC is a register we have created with the duplicated value to be set.  */
-static void
-aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
-					    machine_mode mode)
-{
-  /* If we are copying 128bits or 256bits, we can do that straight from
-     the SIMD register we prepared.  */
-  if (known_eq (GET_MODE_BITSIZE (mode), 256))
-    {
-      mode = GET_MODE (src);
-      /* "Cast" the *dst to the correct mode.  */
-      *dst = adjust_address (*dst, mode, 0);
-      /* Emit the memset.  */
-      emit_move_insn (*dst, src);
-      emit_move_insn (aarch64_move_pointer (*dst, 16), src);
-
-      /* Move the pointers forward.  */
-      *dst = aarch64_move_pointer (*dst, 32);
-      return;
-    }
-  if (known_eq (GET_MODE_BITSIZE (mode), 128))
-    {
-      /* "Cast" the *dst to the correct mode.  */
-      *dst = adjust_address (*dst, GET_MODE (src), 0);
-      /* Emit the memset.  */
-      emit_move_insn (*dst, src);
-      /* Move the pointers forward.  */
-      *dst = aarch64_move_pointer (*dst, 16);
-      return;
-    }
-  /* For copying less, we have to extract the right amount from src.  */
-  rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
-
-  /* "Cast" the *dst to the correct mode.  */
-  *dst = adjust_address (*dst, mode, 0);
-  /* Emit the memset.  */
-  emit_move_insn (*dst, reg);
-  /* Move the pointer forward.  */
-  *dst = aarch64_progress_pointer (*dst);
-}
-
 /* Expand a setmem using the MOPS instructions.  OPERANDS are the same
    as for the setmem pattern.  Return true iff we succeed.  */
 static bool
@@ -26767,24 +26720,21 @@ aarch64_expand_setmem_mops (rtx *operands)
 bool
 aarch64_expand_setmem (rtx *operands)
 {
-  int n, mode_bits;
+  int mode_bytes;
   unsigned HOST_WIDE_INT len;
   rtx dst = operands[0];
   rtx val = operands[2], src;
   unsigned align = UINTVAL (operands[3]);
   rtx base;
-  machine_mode cur_mode = BLKmode, next_mode;
+  machine_mode mode = BLKmode, next_mode;
 
   /* Variable-sized or strict-align memset may use the MOPS expansion.  */
   if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
       || (STRICT_ALIGNMENT && align < 16))
     return aarch64_expand_setmem_mops (operands);
 
-  bool size_p = optimize_function_for_size_p (cfun);
-
-  /* Default the maximum to 256-bytes when considering only libcall vs
-     SIMD broadcast sequence.  */
-  unsigned max_set_size = 256;
+  /* Set inline limits for memset.  MOPS has a separate threshold.  */
+  unsigned max_set_size = MAX_SET_SIZE (optimize_function_for_speed_p (cfun));
   unsigned mops_threshold = aarch64_mops_memset_size_threshold;
 
   len = UINTVAL (operands[1]);
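The effect of the new early exits can be summarized as a small decision model. The sketch below is illustrative, not code from aarch64.cc: the names are invented, and in the real function aarch64_expand_setmem_mops itself falls back to a libcall (by returning false) when MOPS is unavailable.

#include <cassert>

enum class Strategy { Inline, Mops, Libcall };

// Mirrors the dispatch in the rewritten aarch64_expand_setmem for the
// non-strict-alignment case; names and types here are illustrative.
static Strategy
choose_setmem (bool const_size, unsigned len, bool speed,
               bool have_mops, unsigned mops_threshold)
{
  // Variable-sized memset goes to the MOPS expansion (or a libcall).
  if (!const_size)
    return have_mops ? Strategy::Mops : Strategy::Libcall;

  unsigned max_set_size = speed ? 256 : 96;  // MAX_SET_SIZE (speed)
  if (len > max_set_size || (have_mops && len > mops_threshold))
    return have_mops ? Strategy::Mops : Strategy::Libcall;

  return Strategy::Inline;  // SIMD broadcast + stores
}

int main ()
{
  // At -Os (speed = false) a 100-byte memset now exceeds the 96-byte limit.
  assert (choose_setmem (true, 100, false, false, 256) == Strategy::Libcall);
  // At -O2 (speed = true) the same memset is still expanded inline.
  assert (choose_setmem (true, 100, true, false, 256) == Strategy::Inline);
}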
@@ -26793,88 +26743,51 @@
   if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
     return aarch64_expand_setmem_mops (operands);
 
-  int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
-  /* The MOPS sequence takes:
-     3 instructions for the memory storing
-     + 1 to move the constant size into a reg
-     + 1 if VAL is a non-zero constant to move into a reg
-     (zero constants can use XZR directly).  */
-  unsigned mops_cost = 3 + 1 + cst_val;
-  /* A libcall to memset in the worst case takes 3 instructions to prepare
-     the arguments + 1 for the call.  */
-  unsigned libcall_cost = 4;
-
-  /* Attempt a sequence with a vector broadcast followed by stores.
-     Count the number of operations involved to see if it's worth it
-     against the alternatives.  A simple counter simd_ops on the
-     algorithmically-relevant operations is used rather than an rtx_insn count
-     as all the pointer adjustments and mode reinterprets will be optimized
-     away later.  */
-  start_sequence ();
-  unsigned simd_ops = 0;
-
   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
 
   /* Prepare the val using a DUP/MOVI v0.16B, val.  */
-  src = expand_vector_broadcast (V16QImode, val);
-  src = force_reg (V16QImode, src);
-  simd_ops++;
-  /* Convert len to bits to make the rest of the code simpler.  */
-  n = len * BITS_PER_UNIT;
+  val = expand_vector_broadcast (V16QImode, val);
+  val = force_reg (V16QImode, val);
 
-  /* Maximum amount to copy in one go.  We allow 256-bit chunks.  */
-  const int copy_limit = 256;
-
-  while (n > 0)
+  int offset = 0;
+  while (len > 0)
     {
       /* Find the largest mode in which to do the copy without
	 overwriting.  */
       opt_scalar_int_mode mode_iter;
       FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
-	if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
-	  cur_mode = mode_iter.require ();
+	if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (len, 16))
+	  mode = mode_iter.require ();
 
-      gcc_assert (cur_mode != BLKmode);
+      gcc_assert (mode != BLKmode);
 
-      mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
-      aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
-      simd_ops++;
-      n -= mode_bits;
+      mode_bytes = GET_MODE_SIZE (mode).to_constant ();
+
+      src = val;
+
+      /* Prefer Q-register accesses.  */
+      if (mode_bytes == 16)
+	mode = V16QImode;
+      else
+	src = lowpart_subreg (mode, src, GET_MODE (val));
+
+      emit_move_insn (adjust_address (dst, mode, offset), src);
+      len -= mode_bytes;
+      offset += mode_bytes;
 
       /* Emit trailing writes using overlapping unaligned accesses
	 (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
-      if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
+      if (len > 0 && len < 16 && !STRICT_ALIGNMENT)
	{
-	  next_mode = smallest_mode_for_size (n, MODE_INT);
-	  int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
-	  gcc_assert (n_bits <= mode_bits);
-	  dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
-	  n = n_bits;
+	  next_mode = smallest_mode_for_size (len * BITS_PER_UNIT, MODE_INT);
+	  int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
+	  gcc_assert (n_bytes <= mode_bytes);
+	  offset -= n_bytes - len;
+	  len = n_bytes;
	}
     }
-  rtx_insn *seq = get_insns ();
-  end_sequence ();
-
-  if (size_p)
-    {
-      /* When optimizing for size we have 3 options: the SIMD broadcast sequence,
-	 call to memset or the MOPS expansion.  */
-      if (TARGET_MOPS
-	  && mops_cost <= libcall_cost
-	  && mops_cost <= simd_ops)
-	return aarch64_expand_setmem_mops (operands);
-      /* If MOPS is not available or not shorter pick a libcall if the SIMD
-	 sequence is too long.  */
-      else if (libcall_cost < simd_ops)
-	return false;
-      emit_insn (seq);
-      return true;
-    }
 
-  /* At this point the SIMD broadcast sequence is the best choice when
-     optimizing for speed.  */
-  emit_insn (seq);
   return true;
 }
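To see what the rewritten byte-based loop emits, here is a standalone host-side sketch (hypothetical code, not part of the patch) that replays the chunk selection and the overlapping trailing store on plain integers:

#include <cstdio>

// Replay the store-splitting of the new loop: pick the largest
// power-of-two chunk up to 16 bytes, then turn the remainder into one
// overlapping store by backing up the offset (the !STRICT_ALIGNMENT case).
static void
sketch_setmem_stores (unsigned len)
{
  unsigned offset = 0;
  while (len > 0)
    {
      // Largest power-of-two size <= MIN (len, 16), as the
      // FOR_EACH_MODE_IN_CLASS scan over the integer modes computes.
      unsigned bytes = 1;
      while (bytes * 2 <= len && bytes * 2 <= 16)
        bytes *= 2;

      printf ("store %2u bytes at offset %u\n", bytes, offset);
      len -= bytes;
      offset += bytes;

      // Trailing write: round the remainder up to a mode size and let it
      // overlap the bytes just written, like offset -= n_bytes - len above.
      if (len > 0 && len < 16)
        {
          unsigned n_bytes = 1;
          while (n_bytes < len)
            n_bytes *= 2;
          offset -= n_bytes - len;
          len = n_bytes;
        }
    }
}

int main ()
{
  // A 35-byte memset: two 16-byte stores, then one 4-byte store at
  // offset 31 that overlaps the previous store by one byte.
  sketch_setmem_stores (35);
}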