IBM Z: Provide rawmemchr{qi,hi,si} expander

gcc/ChangeLog:

	* config/s390/s390-protos.h (s390_rawmemchr): Add prototype.
	* config/s390/s390.c (s390_rawmemchr): New function.
	* config/s390/s390.md (rawmemchr<SINT:mode>): New expander.
	* config/s390/vector.md (@vec_vfees<mode>): Basically a copy of
	the pattern vfees<mode> from vx-builtins.md.
	* config/s390/vx-builtins.md (*vfees<mode>): Remove.

gcc/testsuite/ChangeLog:

	* gcc.target/s390/rawmemchr-1.c: New test.
This commit is contained in:
Stefan Schulze Frielinghaus 2021-10-11 09:59:32 +02:00
parent 6f966f0614
commit 6e3c4bfd0c
6 changed files with 203 additions and 26 deletions

View file

@ -66,6 +66,8 @@ s390_asm_declare_function_size (FILE *asm_out_file,
const char *fnname ATTRIBUTE_UNUSED, tree decl);
#endif
/* Emit the instruction sequence for a rawmemchr search: ELT_MODE is the
   mode of a single element, DST receives the resulting address, SRC is
   the memory to search and PAT the constant to look for.  */
extern void s390_rawmemchr (machine_mode elt_mode, rtx dst, rtx src, rtx pat);
#ifdef RTX_CODE
extern int s390_extra_constraint_str (rtx, int, const char *);
extern int s390_const_ok_for_constraint_p (HOST_WIDE_INT, int, const char *);

View file

@ -16569,6 +16569,75 @@ s390_excess_precision (enum excess_precision_type type)
}
#endif
/* Emit RTL that searches the memory referenced by SRC for the first
   element of mode ELT_MODE that equals the integer constant PAT, and
   stores the address of that element in DST.  No length is passed in;
   the emitted loop only terminates through its condition-code jump.

   The sequence has three parts: a first probe starting at the original
   (possibly unaligned) address, a single check of the next 16-byte
   aligned vector, and a loop that walks further aligned vectors in
   steps of 16 bytes.  All three parts meet at LOOP_END, where the
   final address is computed.  */
void
s390_rawmemchr (machine_mode elt_mode, rtx dst, rtx src, rtx pat)
{
/* A 16-byte vector mode whose elements have mode ELT_MODE.  */
machine_mode vec_mode = mode_for_vector (as_a <scalar_int_mode> (elt_mode),
16 / GET_MODE_SIZE (elt_mode)).require();
rtx lens = gen_reg_rtx (V16QImode);
rtx pattern = gen_reg_rtx (vec_mode);
rtx loop_start = gen_label_rtx ();
rtx loop_end = gen_label_rtx ();
rtx addr = gen_reg_rtx (Pmode);
rtx offset = gen_reg_rtx (Pmode);
rtx loadlen = gen_reg_rtx (SImode);
rtx matchlen = gen_reg_rtx (SImode);
rtx mem;
/* Truncate the constant to ELT_MODE and replicate it into every element
   of PATTERN.  */
pat = GEN_INT (trunc_int_for_mode (INTVAL (pat), elt_mode));
emit_insn (gen_rtx_SET (pattern, gen_rtx_VEC_DUPLICATE (vec_mode, pat)));
/* ADDR starts out as the address of SRC.  */
emit_move_insn (addr, XEXP (src, 0));
// alignment
/* First probe at the original address.  LENS is loaded via vlbb and
   LOADLEN is set via lcbb, both with the constant operand 6.
   NOTE(review): presumably vlbb loads only up to a block boundary and
   lcbb yields the number of bytes that load covered, with 6 selecting
   the boundary size -- confirm against the pattern definitions.  */
emit_insn (gen_vlbb (lens, gen_rtx_MEM (BLKmode, addr), GEN_INT (6)));
emit_insn (gen_lcbb (loadlen, addr, GEN_INT (6)));
/* Search LENS for PATTERN in place (flags operand 0) and copy V4SI
   element 1 of the result into MATCHLEN.
   NOTE(review): presumably that word holds the byte index of the first
   matching element -- confirm.  */
lens = convert_to_mode (vec_mode, lens, 1);
emit_insn (gen_vec_vfees (vec_mode, lens, lens, pattern, GEN_INT (0)));
lens = convert_to_mode (V4SImode, lens, 1);
emit_insn (gen_vec_extractv4sisi (matchlen, lens, GEN_INT (1)));
lens = convert_to_mode (vec_mode, lens, 1);
/* Unsigned MATCHLEN < LOADLEN: jump straight to LOOP_END with ADDR still
   being the original address.  */
emit_cmp_and_jump_insns (matchlen, loadlen, LT, NULL_RTX, SImode, 1, loop_end);
/* Otherwise advance ADDR by 16 and clear its low four bits.  */
force_expand_binop (Pmode, add_optab, addr, GEN_INT(16), addr, 1, OPTAB_DIRECT);
force_expand_binop (Pmode, and_optab, addr, GEN_INT(~HOST_WIDE_INT_UC(0xf)), addr, 1, OPTAB_DIRECT);
// now, addr is 16-byte aligned
/* Check the first aligned vector before entering the loop.  The jump to
   LOOP_END is annotated as very unlikely.
   NOTE(review): mask 4 presumably selects the "match found" condition
   code set by vfees -- confirm.  */
mem = gen_rtx_MEM (vec_mode, addr);
set_mem_align (mem, 128);
emit_move_insn (lens, mem);
emit_insn (gen_vec_vfees (vec_mode, lens, lens, pattern, GEN_INT (VSTRING_FLAG_CS)));
add_int_reg_note (s390_emit_ccraw_jump (4, EQ, loop_end),
REG_BR_PROB,
profile_probability::very_unlikely ().to_reg_br_prob_note ());
/* Loop: step ADDR by 16, load the aligned vector, search it, and jump
   back to LOOP_START (annotated as very likely) on the NE form of the
   same mask-4 condition.  */
emit_label (loop_start);
LABEL_NUSES (loop_start) = 1;
force_expand_binop (Pmode, add_optab, addr, GEN_INT (16), addr, 1, OPTAB_DIRECT);
mem = gen_rtx_MEM (vec_mode, addr);
set_mem_align (mem, 128);
emit_move_insn (lens, mem);
emit_insn (gen_vec_vfees (vec_mode, lens, lens, pattern, GEN_INT (VSTRING_FLAG_CS)));
add_int_reg_note (s390_emit_ccraw_jump (4, NE, loop_start),
REG_BR_PROB,
profile_probability::very_likely ().to_reg_br_prob_note ());
/* NOTE(review): two jumps above target LOOP_END (the compare-and-jump of
   the first probe and the first ccraw jump) while the use count is set
   to 1 -- confirm that this is intended or recomputed later.  */
emit_label (loop_end);
LABEL_NUSES (loop_end) = 1;
/* Read the vfees result from LENS as a Pmode-sized value: V2DI element 0
   for 64-bit targets, V4SI element 1 otherwise.  */
if (TARGET_64BIT)
{
lens = convert_to_mode (V2DImode, lens, 1);
emit_insn (gen_vec_extractv2didi (offset, lens, GEN_INT (0)));
}
else
{
lens = convert_to_mode (V4SImode, lens, 1);
emit_insn (gen_vec_extractv4sisi (offset, lens, GEN_INT (1)));
}
/* DST = address of the vector searched last + extracted offset.  */
force_expand_binop (Pmode, add_optab, addr, offset, dst, 1, OPTAB_DIRECT);
}
/* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
static unsigned HOST_WIDE_INT

View file

@ -12258,3 +12258,10 @@
UNSPECV_PPA)]
"TARGET_ZEC12"
"")
; Expander for rawmemchr over the modes of the SINT iterator.
; Operand 0: register operand (no mode given in the pattern).
; Operand 1: memory operand (no mode given in the pattern).
; Operand 2: constant integer in the element mode.
; Only enabled for TARGET_VX.  All code generation is delegated to
; s390_rawmemchr, which is passed the element mode and the three operands.
(define_expand "rawmemchr<SINT:mode>"
[(match_operand 0 "register_operand")
(match_operand 1 "memory_operand")
(match_operand:SINT 2 "const_int_operand")]
"TARGET_VX"
"s390_rawmemchr(<SINT:MODE>mode, operands[0], operands[1], operands[2]); DONE;")

View file

@ -1988,6 +1988,32 @@
"vll\t%v0,%1,%2"
[(set_attr "op_type" "VRS")])
; Find element equal on two vector registers, with a second set that
; writes the condition code register (CCRAW mode).
; Operand 0: result vector register.
; Operands 1, 2: the vector registers that are compared.
; Operand 3: constant flag mask; only VSTRING_FLAG_ZS and VSTRING_FLAG_CS
; may be set (asserted in the output code below).
; NOTE(review): the leading '@' presumably makes this a parameterized
; name, i.e. a gen_vec_vfees generator taking the mode as an argument --
; confirm against the GCC internals documentation.
; vfeebs, vfeehs, vfeefs
; vfeezbs, vfeezhs, vfeezfs
(define_insn "@vec_vfees<mode>"
[(set (match_operand:VI_HW_QHS 0 "register_operand" "=v")
(unspec:VI_HW_QHS [(match_operand:VI_HW_QHS 1 "register_operand" "v")
(match_operand:VI_HW_QHS 2 "register_operand" "v")
(match_operand:QI 3 "const_mask_operand" "C")]
UNSPEC_VEC_VFEE))
(set (reg:CCRAW CC_REGNUM)
(unspec:CCRAW [(match_dup 1)
(match_dup 2)
(match_dup 3)]
UNSPEC_VEC_VFEECC))]
"TARGET_VX"
{
unsigned HOST_WIDE_INT flags = UINTVAL (operands[3]);
/* Reject any flag other than ZS and CS.  */
gcc_assert (!(flags & ~(VSTRING_FLAG_ZS | VSTRING_FLAG_CS)));
/* CS does not influence the template choice: both templates below use
   the mnemonic with the trailing "s".  */
flags &= ~VSTRING_FLAG_CS;
/* With ZS set, emit the "z" form of the mnemonic.  */
if (flags == VSTRING_FLAG_ZS)
return "vfeez<bhfgq>s\t%v0,%v1,%v2";
return "vfee<bhfgq>s\t%v0,%v1,%v2";
}
[(set_attr "op_type" "VRR")])
; vfenebs, vfenehs, vfenefs
; vfenezbs, vfenezhs, vfenezfs
(define_insn "vec_vfenes<mode>"

View file

@ -1366,32 +1366,6 @@
; Vector find element equal
; Variant that also writes the condition code register (CCRAW mode).
; Operand 0: result vector register.
; Operands 1, 2: the vector registers that are compared.
; Operand 3: constant flag mask; only VSTRING_FLAG_ZS and VSTRING_FLAG_CS
; may be set (asserted in the output code below).
; vfeebs, vfeehs, vfeefs
; vfeezbs, vfeezhs, vfeezfs
(define_insn "*vfees<mode>"
[(set (match_operand:VI_HW_QHS 0 "register_operand" "=v")
(unspec:VI_HW_QHS [(match_operand:VI_HW_QHS 1 "register_operand" "v")
(match_operand:VI_HW_QHS 2 "register_operand" "v")
(match_operand:QI 3 "const_mask_operand" "C")]
UNSPEC_VEC_VFEE))
(set (reg:CCRAW CC_REGNUM)
(unspec:CCRAW [(match_dup 1)
(match_dup 2)
(match_dup 3)]
UNSPEC_VEC_VFEECC))]
"TARGET_VX"
{
unsigned HOST_WIDE_INT flags = UINTVAL (operands[3]);
/* Reject any flag other than ZS and CS.  */
gcc_assert (!(flags & ~(VSTRING_FLAG_ZS | VSTRING_FLAG_CS)));
/* CS is dropped before the template is chosen.  */
flags &= ~VSTRING_FLAG_CS;
/* With ZS set, emit the "z" form; otherwise emit the plain form, which
   additionally prints operand 3 through %b3.  */
if (flags == VSTRING_FLAG_ZS)
return "vfeez<bhfgq>s\t%v0,%v1,%v2";
return "vfee<bhfgq>s\t%v0,%v1,%v2,%b3";
}
[(set_attr "op_type" "VRR")])
; vfeeb, vfeeh, vfeef
(define_insn "vfee<mode>"
[(set (match_operand:VI_HW_QHS 0 "register_operand" "=v")

View file

@ -0,0 +1,99 @@
/* { dg-do run } */
/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-details -mzarch -march=z13" } */
/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" } } */
/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" } } */
/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" } } */
#include <string.h>
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
/* Define rawmemchr_T (T *s): return the address of the first element at
   or after S that equals PATTERN.  The scan has no bound, so the caller
   must guarantee that a match exists.  The functions are marked
   noinline/noclone; presumably this keeps exactly one loop per function
   for the "generated rawmemchr..." counts checked by the dg-final
   directives above.  Do not reshape the loop: those counts depend on the
   loop distribution pass recognizing it.  */
#define rawmemchrT(T, pattern) \
__attribute__((noinline,noclone)) \
T* rawmemchr_##T (T *s) \
{ \
while (*s != pattern) \
++s; \
return s; \
}
/* One signed and one unsigned search function per element width
   (8, 16 and 32 bit).  */
rawmemchrT(int8_t, (int8_t)0xde)
rawmemchrT(uint8_t, 0xde)
rawmemchrT(int16_t, (int16_t)0xdead)
rawmemchrT(uint16_t, 0xdead)
rawmemchrT(int32_t, (int32_t)0xdeadbeef)
rawmemchrT(uint32_t, 0xdeadbeef)
/* Define run_T (): runtime check of rawmemchr_T.
   A buffer of 4096 * 2 elements is allocated and every byte is set to
   0x0a.  Q is the next 4096-byte aligned address strictly above the
   start of the buffer, and P is Q minus 8 bytes.  Each case stores
   PATTERN at one index, asserts that rawmemchr_T returns exactly that
   element when started from the given element, and then overwrites the
   slot with (T) 0xaaaaaaaa so that the following case starts without a
   stale match.  The cases cover starts at P, at &q[2] and at &q[0],
   with the match placed at increasing distances from the start.  */
#define runT(T, pattern) \
void run_##T () \
{ \
T *buf = malloc (4096 * 2 * sizeof(T)); \
assert (buf != NULL); \
memset (buf, 0xa, 4096 * 2 * sizeof(T)); \
/* ensure q is 4096-byte aligned */ \
T *q = (T*)((unsigned char *)buf \
+ (4096 - ((uintptr_t)buf & 4095))); \
T *p; \
/* unaligned + block boundary + 1st load */ \
p = (T *) ((uintptr_t)q - 8); \
p[2] = pattern; \
assert ((rawmemchr_##T (&p[0]) == &p[2])); \
p[2] = (T) 0xaaaaaaaa; \
/* unaligned + block boundary + 2nd load */ \
p = (T *) ((uintptr_t)q - 8); \
p[6] = pattern; \
assert ((rawmemchr_##T (&p[0]) == &p[6])); \
p[6] = (T) 0xaaaaaaaa; \
/* unaligned + 1st load */ \
q[5] = pattern; \
assert ((rawmemchr_##T (&q[2]) == &q[5])); \
q[5] = (T) 0xaaaaaaaa; \
/* unaligned + 2nd load */ \
q[14] = pattern; \
assert ((rawmemchr_##T (&q[2]) == &q[14])); \
q[14] = (T) 0xaaaaaaaa; \
/* unaligned + 3rd load */ \
q[19] = pattern; \
assert ((rawmemchr_##T (&q[2]) == &q[19])); \
q[19] = (T) 0xaaaaaaaa; \
/* unaligned + 4th load */ \
q[25] = pattern; \
assert ((rawmemchr_##T (&q[2]) == &q[25])); \
q[25] = (T) 0xaaaaaaaa; \
/* aligned + 1st load */ \
q[5] = pattern; \
assert ((rawmemchr_##T (&q[0]) == &q[5])); \
q[5] = (T) 0xaaaaaaaa; \
/* aligned + 2nd load */ \
q[14] = pattern; \
assert ((rawmemchr_##T (&q[0]) == &q[14])); \
q[14] = (T) 0xaaaaaaaa; \
/* aligned + 3rd load */ \
q[19] = pattern; \
assert ((rawmemchr_##T (&q[0]) == &q[19])); \
q[19] = (T) 0xaaaaaaaa; \
/* aligned + 4th load */ \
q[25] = pattern; \
assert ((rawmemchr_##T (&q[0]) == &q[25])); \
q[25] = (T) 0xaaaaaaaa; \
free (buf); \
}
/* One driver per search function, using the same patterns as the
   rawmemchrT instantiations.  */
runT(int8_t, (int8_t)0xde)
runT(uint8_t, 0xde)
runT(int16_t, (int16_t)0xdead)
runT(uint16_t, 0xdead)
runT(int32_t, (int32_t)0xdeadbeef)
runT(uint32_t, 0xdeadbeef)
/* Run the driver of every element type.  A wrong result aborts through
   assert inside the driver, so reaching the end means success.  */
int
main (void)
{
  /* 8-bit elements.  */
  run_uint8_t ();
  run_int8_t ();

  /* 16-bit elements.  */
  run_uint16_t ();
  run_int16_t ();

  /* 32-bit elements.  */
  run_uint32_t ();
  run_int32_t ();

  return EXIT_SUCCESS;
}