bpf: add inline memmove and memcpy expansion

BPF programs are not typically linked, which means we cannot fall back
on library calls to implement __builtin_{memmove,memcpy} and should
always expand them inline if possible.

GCC already successfully expands these builtins inline in many cases,
but failed to do so for a few simple cases involving overlapping
memmove in the kernel BPF selftests, instead emitting a libcall.

This patch implements a simple inline expansion of memcpy and memmove in
the BPF backend in a verifier-friendly way, with the caveat that the
size must be an integer constant, which is also required by clang.
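
For illustration (an example in the spirit of the new tests below, not
code from the patch), a constant-size overlapping move such as:

    char buf[16];

    void
    shift_down (void)
    {
      /* Overlapping regions with a constant size: now expanded
	 inline on BPF instead of emitting a call to memmove.  */
      __builtin_memmove (buf, buf + 4, 12);
    }

is now compiled to an unrolled sequence of loads and stores rather
than a libcall.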

gcc/

	* config/bpf/bpf-protos.h (bpf_expand_cpymem): New.
	* config/bpf/bpf.cc (emit_move_loop, bpf_expand_cpymem): New.
	* config/bpf/bpf.md (cpymemdi, movmemdi): New define_expands.

gcc/testsuite/

	* gcc.target/bpf/memcpy-1.c: New test.
	* gcc.target/bpf/memmove-1.c: New test.
	* gcc.target/bpf/memmove-2.c: New test.
Author: David Faust
Date: 2024-02-20 14:48:33 -08:00
Parent: 161a67b2be
Commit: 5772ea772d
6 changed files with 248 additions and 0 deletions

gcc/config/bpf/bpf-protos.h

@@ -35,4 +35,6 @@ const char *bpf_add_core_reloc (rtx *operands, const char *templ);
class gimple_opt_pass;
gimple_opt_pass *make_pass_lower_bpf_core (gcc::context *ctxt);

bool bpf_expand_cpymem (rtx *, bool);

#endif /* ! GCC_BPF_PROTOS_H */

gcc/config/bpf/bpf.cc

@@ -1184,6 +1184,121 @@ bpf_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
#define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
  bpf_use_by_pieces_infrastructure_p

/* Helper for bpf_expand_cpymem.  Emit an unrolled loop moving the bytes
   from SRC to DST.  */

static void
emit_move_loop (rtx src, rtx dst, machine_mode mode, int offset, int inc,
		unsigned iters, unsigned remainder)
{
  rtx reg = gen_reg_rtx (mode);

  /* First copy in chunks as large as alignment permits.  */
  for (unsigned int i = 0; i < iters; i++)
    {
      emit_move_insn (reg, adjust_address (src, mode, offset));
      emit_move_insn (adjust_address (dst, mode, offset), reg);
      offset += inc;
    }

  /* Handle remaining bytes which might be smaller than the chunks
     used above.  */
  if (remainder & 4)
    {
      emit_move_insn (reg, adjust_address (src, SImode, offset));
      emit_move_insn (adjust_address (dst, SImode, offset), reg);
      offset += (inc < 0 ? -4 : 4);
      remainder -= 4;
    }
  if (remainder & 2)
    {
      emit_move_insn (reg, adjust_address (src, HImode, offset));
      emit_move_insn (adjust_address (dst, HImode, offset), reg);
      offset += (inc < 0 ? -2 : 2);
      remainder -= 2;
    }
  if (remainder & 1)
    {
      emit_move_insn (reg, adjust_address (src, QImode, offset));
      emit_move_insn (adjust_address (dst, QImode, offset), reg);
    }
}

/* Expand cpymem/movmem, as from __builtin_memcpy/memmove.
   OPERANDS are the same as the cpymem/movmem patterns.
   IS_MOVE is true if this is a memmove, false for memcpy.
   Return true if we successfully expanded, or false if we cannot
   and must punt to a libcall.  */

bool
bpf_expand_cpymem (rtx *operands, bool is_move)
{
  /* Size must be constant for this expansion to work.  */
  if (!CONST_INT_P (operands[2]))
    {
      const char *name = is_move ? "memmove" : "memcpy";
      if (flag_building_libgcc)
	warning (0, "could not inline call to %<__builtin_%s%>: "
		 "size must be constant", name);
      else
	error ("could not inline call to %<__builtin_%s%>: "
	       "size must be constant", name);
      return false;
    }

  /* Alignment is a CONST_INT.  */
  gcc_assert (CONST_INT_P (operands[3]));

  rtx dst = operands[0];
  rtx src = operands[1];
  rtx size = operands[2];
  unsigned HOST_WIDE_INT size_bytes = UINTVAL (size);
  unsigned align = UINTVAL (operands[3]);
  enum machine_mode mode;
  switch (align)
    {
    case 1: mode = QImode; break;
    case 2: mode = HImode; break;
    case 4: mode = SImode; break;
    case 8: mode = DImode; break;
    default:
      gcc_unreachable ();
    }

  unsigned iters = size_bytes >> ceil_log2 (align);
  unsigned remainder = size_bytes & (align - 1);

  int inc = GET_MODE_SIZE (mode);
  rtx_code_label *fwd_label, *done_label;
  if (is_move)
    {
      /* For memmove, be careful of overlap.  It is not a concern for memcpy.
	 To handle overlap, we check (at runtime) if SRC < DST, and if so do
	 the move "backwards" starting from SRC + SIZE.  */
      fwd_label = gen_label_rtx ();
      done_label = gen_label_rtx ();

      rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (dst, 0));
      rtx src_addr = copy_to_mode_reg (Pmode, XEXP (src, 0));
      emit_cmp_and_jump_insns (src_addr, dst_addr, GEU, NULL_RTX, Pmode,
			       true, fwd_label, profile_probability::even ());

      /* Emit the "backwards" unrolled loop.  */
      emit_move_loop (src, dst, mode, size_bytes, -inc, iters, remainder);
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();

      emit_label (fwd_label);
    }

  emit_move_loop (src, dst, mode, 0, inc, iters, remainder);

  if (is_move)
    emit_label (done_label);

  return true;
}

/* Finally, build the GCC target.  */

struct gcc_target targetm = TARGET_INITIALIZER;
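
To make the overlap handling concrete, here is a minimal C model (a
sketch, not code from the patch) of the strategy bpf_expand_cpymem
unrolls at compile time for memmove:

    #include <stddef.h>

    /* Forward copy is safe when SRC is at or above DST; otherwise copy
       backwards so overlapping bytes are read before being overwritten.  */
    static void
    move_bytes (char *dst, const char *src, size_t n)
    {
      if (src >= dst)
	for (size_t i = 0; i < n; i++)
	  dst[i] = src[i];
      else
	for (size_t i = n; i-- > 0; )
	  dst[i] = src[i];
    }

The expander emits the SRC/DST comparison as a real runtime branch,
but both copy loops are fully unrolled since the size is a
compile-time constant.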

gcc/config/bpf/bpf.md

@@ -627,4 +627,40 @@
"{ldabs<ldop>\t%0|r0 = *(<pldop> *) skb[%0]}"
[(set_attr "type" "ld")])
;;; memmove and memcopy
;; 0 is dst
;; 1 is src
;; 2 is size of copy in bytes
;; 3 is alignment
(define_expand "cpymemdi"
[(match_operand:BLK 0 "memory_operand")
(match_operand:BLK 1 "memory_operand")
(match_operand:DI 2 "general_operand")
(match_operand:DI 3 "immediate_operand")]
""
{
if (bpf_expand_cpymem (operands, false))
DONE;
FAIL;
})
;; 0 is dst
;; 1 is src
;; 2 is size of copy in bytes
;; 3 is alignment
(define_expand "movmemdi"
[(match_operand:BLK 0 "memory_operand")
(match_operand:BLK 1 "memory_operand")
(match_operand:DI 2 "general_operand")
(match_operand:DI 3 "immediate_operand")]
""
{
if (bpf_expand_cpymem (operands, true))
DONE;
FAIL;
})
(include "atomic.md")
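
As a usage note (illustrative, not part of the commit): cpymemdi and
movmemdi are standard pattern names, so the middle end routes
constant-size builtin calls through them automatically. For example:

    /* Expanding this call goes through the movmemdi pattern above.
       With a non-constant size, bpf_expand_cpymem reports an error
       instead, as the memmove-2.c test below checks.  */
    void
    route_example (char *p)
    {
      __builtin_memmove (p + 2, p, 10);
    }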

gcc/testsuite/gcc.target/bpf/memcpy-1.c

@@ -0,0 +1,26 @@
/* Ensure memcpy is expanded inline rather than emitting a libcall.  */

/* { dg-do compile } */
/* { dg-options "-O2" } */

struct context {
  unsigned int data;
  unsigned int data_end;
  unsigned int data_meta;
  unsigned int ingress;
  unsigned int queue_index;
  unsigned int egress;
};

void
cpy_1 (struct context *ctx)
{
  void *data = (void *)(long)ctx->data;
  char *dest;

  dest = data;
  dest += 16;

  __builtin_memcpy (dest, data, 8);
}

/* { dg-final { scan-assembler-times "call" 0 } } */

gcc/testsuite/gcc.target/bpf/memmove-1.c

@@ -0,0 +1,46 @@
/* Ensure memmove is expanded inline rather than emitting a libcall.  */

/* { dg-do compile } */
/* { dg-options "-O2" } */

struct context {
  unsigned int data;
  unsigned int data_end;
  unsigned int data_meta;
  unsigned int ingress;
  unsigned int queue_index;
  unsigned int egress;
};

void
mov_1_nooverlap (struct context *ctx)
{
  void *data = (void *)(long)ctx->data;
  char *dest;

  dest = data;
  dest += 16;

  __builtin_memmove (dest, data, 12);
}

void
mov_1_overlap (struct context *ctx)
{
  void *data = (void *)(long)ctx->data;
  char *dest;

  dest = data;
  dest += 4;

  __builtin_memmove (dest, data, 12);
}

void
mov_1_arbitrary (struct context *ctx_a, struct context *ctx_b)
{
  void *src = (void *)(long)ctx_a->data;
  void *dst = (void *)(long)ctx_b->data;

  __builtin_memmove (dst, src, 12);
}

/* { dg-final { scan-assembler-times "call" 0 } } */

gcc/testsuite/gcc.target/bpf/memmove-2.c

@@ -0,0 +1,23 @@
/* Test that we error if memmove cannot be expanded inline.  */

/* { dg-do compile } */
/* { dg-options "-O2" } */

typedef unsigned int __u32;

struct context {
  unsigned int data;
  unsigned int data_end;
  unsigned int data_meta;
};

void
mov_2_unsupported (struct context *ctx)
{
  void *data = (void *)(long)ctx->data;
  char *dest;

  dest = data;
  dest += 4;

  __builtin_memmove (dest, data, ctx->data_meta); /* { dg-error "could not inline call" } */
}