i386: Add cbranchti4 pattern to i386.md (for -m32 compare_by_pieces).
This patch fixes some very odd (unanticipated) code generation by
compare_by_pieces with -m32 -mavx, since the recent addition of the
cbranchoi4 pattern.  The issue is that cbranchoi4 is available with
TARGET_AVX, but cbranchti4 is currently conditional on TARGET_64BIT,
which results in the odd behaviour (thanks to OPTAB_WIDEN) that with
-m32 -mavx, compare_by_pieces ends up (inefficiently) widening 128-bit
comparisons to 256 bits before performing PTEST.

This patch fixes this by providing a cbranchti4 pattern that's available
with either TARGET_64BIT or TARGET_SSE4_1.

For the test case below (again from PR 104610):

int foo(char *a)
{
  static const char t[] = "0123456789012345678901234567890";
  return __builtin_memcmp(a, &t[0], sizeof(t)) == 0;
}

GCC with -m32 -O2 -mavx currently produces the bonkers:

foo:    pushl   %ebp
        movl    %esp, %ebp
        andl    $-32, %esp
        subl    $64, %esp
        movl    8(%ebp), %eax
        vmovdqa .LC0, %xmm4
        movl    $0, 48(%esp)
        vmovdqu (%eax), %xmm2
        movl    $0, 52(%esp)
        movl    $0, 56(%esp)
        movl    $0, 60(%esp)
        movl    $0, 16(%esp)
        movl    $0, 20(%esp)
        movl    $0, 24(%esp)
        movl    $0, 28(%esp)
        vmovdqa %xmm2, 32(%esp)
        vmovdqa %xmm4, (%esp)
        vmovdqa (%esp), %ymm5
        vpxor   32(%esp), %ymm5, %ymm0
        vptest  %ymm0, %ymm0
        jne     .L2
        vmovdqu 16(%eax), %xmm7
        movl    $0, 48(%esp)
        movl    $0, 52(%esp)
        vmovdqa %xmm7, 32(%esp)
        vmovdqa .LC1, %xmm7
        movl    $0, 56(%esp)
        movl    $0, 60(%esp)
        movl    $0, 16(%esp)
        movl    $0, 20(%esp)
        movl    $0, 24(%esp)
        movl    $0, 28(%esp)
        vmovdqa %xmm7, (%esp)
        vmovdqa (%esp), %ymm1
        vpxor   32(%esp), %ymm1, %ymm0
        vptest  %ymm0, %ymm0
        je      .L6
.L2:    movl    $1, %eax
        xorl    $1, %eax
        vzeroupper
        leave
        ret
.L6:    xorl    %eax, %eax
        xorl    $1, %eax
        vzeroupper
        leave
        ret

With this patch, we now generate the (slightly) more sensible:

foo:    vmovdqa .LC0, %xmm0
        movl    4(%esp), %eax
        vpxor   (%eax), %xmm0, %xmm0
        vptest  %xmm0, %xmm0
        jne     .L2
        vmovdqa .LC1, %xmm0
        vpxor   16(%eax), %xmm0, %xmm0
        vptest  %xmm0, %xmm0
        je      .L5
.L2:    movl    $1, %eax
        xorl    $1, %eax
        ret
.L5:    xorl    %eax, %eax
        xorl    $1, %eax
        ret

2023-06-28 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
        * config/i386/i386-expand.cc (ix86_expand_branch): Also use ptest
        for TImode comparisons on 32-bit architectures.
        * config/i386/i386.md (cbranch<mode>4): Change from SDWIM to
        SWIM1248x to exclude/avoid TImode being conditional on -m64.
        (cbranchti4): New define_expand for TImode on both TARGET_64BIT
        and/or with TARGET_SSE4_1.
        * config/i386/predicates.md (ix86_timode_comparison_operator):
        New predicate that depends upon TARGET_64BIT.
        (ix86_timode_comparison_operand): Likewise.

gcc/testsuite/ChangeLog
        * gcc.target/i386/pieces-memcmp-2.c: New test case.
This commit is contained in:
parent
c027592d39
commit
4afbebcdc5
4 changed files with 45 additions and 3 deletions
|
@ -2365,6 +2365,7 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
|
|||
/* Handle special case - vector comparison with boolean result, transform
|
||||
it using ptest instruction. */
|
||||
if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
|
||||
|| (mode == TImode && !TARGET_64BIT)
|
||||
|| mode == OImode)
|
||||
{
|
||||
rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
|
||||
|
@ -2372,7 +2373,7 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
|
|||
|
||||
gcc_assert (code == EQ || code == NE);
|
||||
|
||||
if (mode == OImode)
|
||||
if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
|
||||
{
|
||||
op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
|
||||
op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
|
||||
|
|
|
@ -1352,8 +1352,8 @@
|
|||
|
||||
(define_expand "cbranch<mode>4"
|
||||
[(set (reg:CC FLAGS_REG)
|
||||
(compare:CC (match_operand:SDWIM 1 "nonimmediate_operand")
|
||||
(match_operand:SDWIM 2 "<general_operand>")))
|
||||
(compare:CC (match_operand:SWIM1248x 1 "nonimmediate_operand")
|
||||
(match_operand:SWIM1248x 2 "<general_operand>")))
|
||||
(set (pc) (if_then_else
|
||||
(match_operator 0 "ordered_comparison_operator"
|
||||
[(reg:CC FLAGS_REG) (const_int 0)])
|
||||
|
@ -1368,6 +1368,22 @@
|
|||
DONE;
|
||||
})
|
||||
|
||||
;; Expand a conditional branch on a TImode comparison.  The pattern is
;; enabled when either TARGET_64BIT or TARGET_SSE4_1 holds.  Operand 2
;; is constrained by ix86_timode_comparison_operand and the comparison
;; operator (operand 0) by ix86_timode_comparison_operator.  The
;; expansion itself is delegated entirely to ix86_expand_branch, which
;; receives the comparison code, both operands and the target label.
(define_expand "cbranchti4"
|
||||
[(set (reg:CC FLAGS_REG)
|
||||
(compare:CC (match_operand:TI 1 "nonimmediate_operand")
|
||||
(match_operand:TI 2 "ix86_timode_comparison_operand")))
|
||||
(set (pc) (if_then_else
|
||||
(match_operator 0 "ix86_timode_comparison_operator"
|
||||
[(reg:CC FLAGS_REG) (const_int 0)])
|
||||
(label_ref (match_operand 3))
|
||||
(pc)))]
|
||||
"TARGET_64BIT || TARGET_SSE4_1"
|
||||
{
|
||||
ix86_expand_branch (GET_CODE (operands[0]),
|
||||
operands[1], operands[2], operands[3]);
|
||||
DONE;
|
||||
})
|
||||
|
||||
(define_expand "cbranchoi4"
|
||||
[(set (reg:CC FLAGS_REG)
|
||||
(compare:CC (match_operand:OI 1 "nonimmediate_operand")
|
||||
|
|
|
@ -1641,6 +1641,18 @@
|
|||
(match_operand 0 "comparison_operator")
|
||||
(match_operand 0 "ix86_trivial_fp_comparison_operator")))
|
||||
|
||||
;; Return true if we can perform this comparison on TImode operands.
;; With TARGET_64BIT, anything matching ordered_comparison_operator is
;; accepted; otherwise the operator must match bt_comparison_operator.
|
||||
(define_predicate "ix86_timode_comparison_operator"
|
||||
(if_then_else (match_test "TARGET_64BIT")
|
||||
(match_operand 0 "ordered_comparison_operator")
|
||||
(match_operand 0 "bt_comparison_operator")))
|
||||
|
||||
;; Return true if this is a valid second operand for a TImode comparison.
;; With TARGET_64BIT, the operand must match x86_64_general_operand;
;; otherwise it must match nonimmediate_operand.
|
||||
(define_predicate "ix86_timode_comparison_operand"
|
||||
(if_then_else (match_test "TARGET_64BIT")
|
||||
(match_operand 0 "x86_64_general_operand")
|
||||
(match_operand 0 "nonimmediate_operand")))
|
||||
|
||||
;; Nearly general operand, but accept any const_double, since we wish
|
||||
;; to be able to drop them into memory rather than have them get pulled
|
||||
;; into registers.
|
||||
|
|
13
gcc/testsuite/gcc.target/i386/pieces-memcmp-2.c
Normal file
13
gcc/testsuite/gcc.target/i386/pieces-memcmp-2.c
Normal file
|
@ -0,0 +1,13 @@
|
|||
/* { dg-do compile { target ia32 } } */
|
||||
/* { dg-options "-O2 -mavx2" } */
|
||||
|
||||
/* Test subject: compare the buffer at A with the static string T using
   __builtin_memcmp over sizeof(t) bytes (31 characters plus the
   terminating NUL, i.e. 32 bytes).  Returns 1 when they are equal and
   0 otherwise.  */
int foo(char *a)
|
||||
{
|
||||
static const char t[] = "0123456789012345678901234567890";
|
||||
return __builtin_memcmp(a, &t[0], sizeof(t)) == 0;
|
||||
}
|
||||
|
||||
/* { dg-final { scan-assembler-not "movl\[ \\t]*\\\$0," } } */
|
||||
/* { dg-final { scan-assembler-not "vptest\[ \\t]*%ymm" } } */
|
||||
/* { dg-final { scan-assembler-times "vptest\[ \\t]*%xmm" 2 } } */
|
||||
|
Loading…
Add table
Reference in a new issue