AArch64: Cleanup memset expansion

Clean up the memset implementation.  As with memcpy/memmove, track a byte
offset and byte counts throughout.  Simplify the complex size calculations
when optimizing for size by using a fixed limit.
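
For illustration only (not part of the patch): the fixed -Os limit follows from
simple instruction counting, using the costs quoted in the patch comments (3 STP
of Q registers plus 1 MOVI/DUP for the inline sequence, and 3 argument moves plus
1 branch for a memset call).  The helper names below are hypothetical.

    /* Sketch only: rough counts behind MAX_SET_SIZE (false) == 96 at -Os.  */
    const int movi_dup = 1;        /* materialize the value in a Q register */
    const int stp_bytes = 32;      /* one STP of two Q registers stores 32 bytes */
    const int call_insns = 3 + 1;  /* worst-case argument setup + the call itself */
    /* 96 bytes -> movi_dup + 96 / stp_bytes == 1 + 3 == call_insns, so up to
       96 bytes the inline expansion is never larger than a memset call; larger
       sizes fall back to MOPS or a libcall.  */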

gcc/ChangeLog:
	* config/aarch64/aarch64.cc (MAX_SET_SIZE): New define.
	(aarch64_progress_pointer): Remove function.
	(aarch64_set_one_block_and_progress_pointer): Simplify and clean up.
	(aarch64_expand_setmem): Clean up implementation, use byte offsets,
	simplify size calculation.
Wilco Dijkstra 2024-02-21 23:34:37 +00:00
parent 768fbb56b3
commit 6b86f71165


@@ -103,6 +103,10 @@
 /* Defined for convenience.  */
 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
 
+/* Maximum bytes set for an inline memset expansion.  With -Os use 3 STP
+   and 1 MOVI/DUP (same size as a call).  */
+#define MAX_SET_SIZE(speed) (speed ? 256 : 96)
+
 /* Flags that describe how a function shares certain architectural state
    with its callers.
@@ -26565,15 +26569,6 @@ aarch64_move_pointer (rtx pointer, poly_int64 amount)
 				    next, amount);
 }
 
-/* Return a new RTX holding the result of moving POINTER forward by the
-   size of the mode it points to.  */
-
-static rtx
-aarch64_progress_pointer (rtx pointer)
-{
-  return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
-}
-
 /* Expand a cpymem/movmem using the MOPS extension.  OPERANDS are taken
    from the cpymem/movmem pattern.  IS_MEMMOVE is true if this is a memmove
    rather than memcpy.  Return true iff we succeeded.  */
@@ -26699,48 +26694,6 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
   return true;
 }
 
-/* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
-   SRC is a register we have created with the duplicated value to be set.  */
-static void
-aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
-                                            machine_mode mode)
-{
-  /* If we are copying 128bits or 256bits, we can do that straight from
-     the SIMD register we prepared.  */
-  if (known_eq (GET_MODE_BITSIZE (mode), 256))
-    {
-      mode = GET_MODE (src);
-      /* "Cast" the *dst to the correct mode.  */
-      *dst = adjust_address (*dst, mode, 0);
-      /* Emit the memset.  */
-      emit_move_insn (*dst, src);
-      emit_move_insn (aarch64_move_pointer (*dst, 16), src);
-
-      /* Move the pointers forward.  */
-      *dst = aarch64_move_pointer (*dst, 32);
-      return;
-    }
-  if (known_eq (GET_MODE_BITSIZE (mode), 128))
-    {
-      /* "Cast" the *dst to the correct mode.  */
-      *dst = adjust_address (*dst, GET_MODE (src), 0);
-      /* Emit the memset.  */
-      emit_move_insn (*dst, src);
-      /* Move the pointers forward.  */
-      *dst = aarch64_move_pointer (*dst, 16);
-      return;
-    }
-  /* For copying less, we have to extract the right amount from src.  */
-  rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
-
-  /* "Cast" the *dst to the correct mode.  */
-  *dst = adjust_address (*dst, mode, 0);
-  /* Emit the memset.  */
-  emit_move_insn (*dst, reg);
-  /* Move the pointer forward.  */
-  *dst = aarch64_progress_pointer (*dst);
-}
-
 /* Expand a setmem using the MOPS instructions.  OPERANDS are the same
    as for the setmem pattern.  Return true iff we succeed.  */
 static bool
@@ -26767,24 +26720,21 @@ aarch64_expand_setmem_mops (rtx *operands)
 bool
 aarch64_expand_setmem (rtx *operands)
 {
-  int n, mode_bits;
+  int mode_bytes;
   unsigned HOST_WIDE_INT len;
   rtx dst = operands[0];
   rtx val = operands[2], src;
   unsigned align = UINTVAL (operands[3]);
   rtx base;
-  machine_mode cur_mode = BLKmode, next_mode;
+  machine_mode mode = BLKmode, next_mode;
 
   /* Variable-sized or strict-align memset may use the MOPS expansion.  */
   if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
       || (STRICT_ALIGNMENT && align < 16))
     return aarch64_expand_setmem_mops (operands);
 
-  bool size_p = optimize_function_for_size_p (cfun);
-
-  /* Default the maximum to 256-bytes when considering only libcall vs
-     SIMD broadcast sequence.  */
-  unsigned max_set_size = 256;
+  /* Set inline limits for memset.  MOPS has a separate threshold.  */
+  unsigned max_set_size = MAX_SET_SIZE (optimize_function_for_speed_p (cfun));
   unsigned mops_threshold = aarch64_mops_memset_size_threshold;
 
   len = UINTVAL (operands[1]);
@@ -26793,88 +26743,51 @@ aarch64_expand_setmem (rtx *operands)
   if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
     return aarch64_expand_setmem_mops (operands);
 
-  int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
-  /* The MOPS sequence takes:
-     3 instructions for the memory storing
-     + 1 to move the constant size into a reg
-     + 1 if VAL is a non-zero constant to move into a reg
-     (zero constants can use XZR directly).  */
-  unsigned mops_cost = 3 + 1 + cst_val;
-  /* A libcall to memset in the worst case takes 3 instructions to prepare
-     the arguments + 1 for the call.  */
-  unsigned libcall_cost = 4;
-
-  /* Attempt a sequence with a vector broadcast followed by stores.
-     Count the number of operations involved to see if it's worth it
-     against the alternatives.  A simple counter simd_ops on the
-     algorithmically-relevant operations is used rather than an rtx_insn count
-     as all the pointer adjusmtents and mode reinterprets will be optimized
-     away later.  */
-  start_sequence ();
-  unsigned simd_ops = 0;
-
   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
 
   /* Prepare the val using a DUP/MOVI v0.16B, val.  */
-  src = expand_vector_broadcast (V16QImode, val);
-  src = force_reg (V16QImode, src);
-  simd_ops++;
-  /* Convert len to bits to make the rest of the code simpler.  */
-  n = len * BITS_PER_UNIT;
+  val = expand_vector_broadcast (V16QImode, val);
+  val = force_reg (V16QImode, val);
 
-  /* Maximum amount to copy in one go.  We allow 256-bit chunks.  */
-  const int copy_limit = 256;
-
-  while (n > 0)
+  int offset = 0;
+  while (len > 0)
     {
       /* Find the largest mode in which to do the copy without
         over writing.  */
       opt_scalar_int_mode mode_iter;
       FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
-        if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
-          cur_mode = mode_iter.require ();
+        if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (len, 16))
+          mode = mode_iter.require ();
 
-      gcc_assert (cur_mode != BLKmode);
+      gcc_assert (mode != BLKmode);
 
-      mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
-      aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
-      simd_ops++;
-      n -= mode_bits;
+      mode_bytes = GET_MODE_SIZE (mode).to_constant ();
+      src = val;
+
+      /* Prefer Q-register accesses.  */
+      if (mode_bytes == 16)
+        mode = V16QImode;
+      else
+        src = lowpart_subreg (mode, src, GET_MODE (val));
+
+      emit_move_insn (adjust_address (dst, mode, offset), src);
+      len -= mode_bytes;
+      offset += mode_bytes;
 
       /* Emit trailing writes using overlapping unaligned accesses
-        (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
-      if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
+         (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
+      if (len > 0 && len < 16 && !STRICT_ALIGNMENT)
        {
-          next_mode = smallest_mode_for_size (n, MODE_INT);
-          int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
-          gcc_assert (n_bits <= mode_bits);
-          dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
-          n = n_bits;
+          next_mode = smallest_mode_for_size (len * BITS_PER_UNIT, MODE_INT);
+          int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
+          gcc_assert (n_bytes <= mode_bytes);
+          offset -= n_bytes - len;
+          len = n_bytes;
        }
     }
 
-  rtx_insn *seq = get_insns ();
-  end_sequence ();
-
-  if (size_p)
-    {
-      /* When optimizing for size we have 3 options: the SIMD broadcast sequence,
-         call to memset or the MOPS expansion.  */
-      if (TARGET_MOPS
-          && mops_cost <= libcall_cost
-          && mops_cost <= simd_ops)
-        return aarch64_expand_setmem_mops (operands);
-      /* If MOPS is not available or not shorter pick a libcall if the SIMD
-         sequence is too long.  */
-      else if (libcall_cost < simd_ops)
-        return false;
-      emit_insn (seq);
-      return true;
-    }
-
-  /* At this point the SIMD broadcast sequence is the best choice when
-     optimizing for speed.  */
-  emit_insn (seq);
   return true;
 }
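
To see the effect of the byte/offset rewrite in isolation, here is a small
standalone sketch (hypothetical code, not part of GCC): it mimics the new
loop's chunking, reducing mode selection to power-of-two store sizes up to
16 bytes and ignoring STRICT_ALIGNMENT, and prints the (offset, size) pairs
the expansion would use, including the overlapping trailing store.

    /* Standalone sketch: how the new expansion walks the buffer with byte
       offsets and covers the tail with one larger overlapping store.  */
    #include <stdio.h>

    static int store_size (unsigned len)
    {
      /* Largest power-of-two store size <= min (len, 16), as the mode loop does.  */
      int size = 1;
      while (size * 2 <= (int) (len < 16 ? len : 16))
        size *= 2;
      return size;
    }

    static int round_up_pow2 (unsigned len)
    {
      /* Smallest power-of-two store size >= len, like smallest_mode_for_size.  */
      int size = 1;
      while (size < (int) len)
        size *= 2;
      return size;
    }

    int main (void)
    {
      unsigned len = 41;   /* example length */
      int offset = 0;
      while (len > 0)
        {
          int bytes = store_size (len);
          printf ("store %d bytes at offset %d\n", bytes, offset);
          len -= bytes;
          offset += bytes;
          if (len > 0 && len < 16)
            {
              /* Cover the tail with one larger store pulled back so it overlaps.  */
              int n_bytes = round_up_pow2 (len);
              offset -= n_bytes - len;
              len = n_bytes;
            }
        }
      /* For len = 41 this prints 16-byte stores at offsets 0, 16 and 25
         (the last overlapping the previous store by 7 bytes).  */
      return 0;
    }

Tracking a running byte offset rather than repeatedly advancing the destination
pointer RTX is what lets the trailing store simply pull the offset back by
n_bytes - len instead of recomputing pointers in bit units.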