match.pd: Fold vec_perm with view_convert

This patch improves the codegen for the following test case:
uint64x2_t foo (uint64x2_t r) {
    uint32x4_t a = vreinterpretq_u32_u64 (r);
    uint32_t t;
    t = a[0]; a[0] = a[1]; a[1] = t;
    t = a[2]; a[2] = a[3]; a[3] = t;
    return vreinterpretq_u64_u32 (a);
}
from (-O1):
foo:
        mov     v31.16b, v0.16b
        ins     v0.s[0], v0.s[1]
        ins     v0.s[1], v31.s[0]
        ins     v0.s[2], v31.s[3]
        ins     v0.s[3], v31.s[2]
        ret
to:
foo:
        rev64   v0.4s, v0.4s
        ret

This is achieved by extending the following match.pd pattern to account
for type differences between @0 and @1 due to view converts.
/* Simplify vector inserts of other vector extracts to a permute.  */
(simplify
 (bit_insert @0 (BIT_FIELD_REF@2 @1 @rsize @rpos) @ipos)

The patch was bootstrapped and regtested on aarch64-linux-gnu and
x86_64-linux-gnu with no regressions.
OK for mainline?

Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>
Co-authored-by: Richard Biener <rguenther@suse.de>

gcc/
	PR tree-optimization/117093
	* match.pd: Extend
	(bit_insert @0 (BIT_FIELD_REF@2 @1 @rsize @rpos) @ipos) to allow
	type differences between @0 and @1 due to view converts.

gcc/testsuite/
	PR tree-optimization/117093
	* gcc.dg/tree-ssa/pr117093.c: New test.
This commit is contained in:
Jennifer Schmitz 2024-11-04 07:56:09 -08:00
parent 029c16c15f
commit c83e2d4757
2 changed files with 25 additions and 5 deletions

View file

@ -9583,7 +9583,8 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
(if (VECTOR_TYPE_P (type)
&& (VECTOR_MODE_P (TYPE_MODE (type))
|| optimize_vectors_before_lowering_p ())
&& types_match (@0, @1)
&& operand_equal_p (TYPE_SIZE (TREE_TYPE (@0)),
TYPE_SIZE (TREE_TYPE (@1)), 0)
&& types_match (TREE_TYPE (TREE_TYPE (@0)), TREE_TYPE (@2))
&& TYPE_VECTOR_SUBPARTS (type).is_constant ()
&& multiple_p (wi::to_poly_offset (@rpos),
@ -9591,7 +9592,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
(with
{
unsigned HOST_WIDE_INT elsz
= tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (@1))));
= tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (@0))));
poly_uint64 relt = exact_div (tree_to_poly_uint64 (@rpos), elsz);
poly_uint64 ielt = exact_div (tree_to_poly_uint64 (@ipos), elsz);
unsigned nunits = TYPE_VECTOR_SUBPARTS (type).to_constant ();
@ -9602,9 +9603,11 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
vec_perm_indices sel (builder, 2, nunits);
}
(if (!VECTOR_MODE_P (TYPE_MODE (type))
|| can_vec_perm_const_p (TYPE_MODE (type), TYPE_MODE (type), sel, false))
(vec_perm @0 @1 { vec_perm_indices_to_tree
(build_vector_type (ssizetype, nunits), sel); })))))
|| can_vec_perm_const_p (TYPE_MODE (type),
TYPE_MODE (type), sel, false))
(vec_perm @0 (view_convert @1)
{ vec_perm_indices_to_tree (build_vector_type (ssizetype, nunits),
sel); })))))
(if (canonicalize_math_after_vectorization_p ())
(for fmas (FMA)

View file

@ -0,0 +1,17 @@
/* { dg-final { check-function-bodies "**" "" } } */
/* { dg-options "-O1" } */
#include <arm_neon.h>
/*
** foo:
** rev64 v0\.4s, v0\.4s
** ret
*/
/* Test input for PR tree-optimization/117093: view R as four 32-bit
   lanes, exchange lanes 0/1 and lanes 2/3 one element at a time, and
   view the result as two 64-bit lanes again.  The expected-body comment
   above requires the whole function to compile to a single rev64.  */
uint64x2_t foo (uint64x2_t r) {
/* Same bits as R, accessed as a vector of four uint32_t lanes.  */
uint32x4_t a = vreinterpretq_u32_u64 (r);
/* Scratch scalar used for the two swaps below.  */
uint32_t t;
/* Swap lanes 0 and 1 through T.  */
t = a[0]; a[0] = a[1]; a[1] = t;
/* Swap lanes 2 and 3 through T.  */
t = a[2]; a[2] = a[3]; a[3] = t;
/* Hand the permuted lanes back as a two-lane 64-bit vector.  */
return vreinterpretq_u64_u32 (a);
}