Rearrange SLP nodes with duplicate statements [PR98138]

This change checks whether a two_operators SLP node has multiple occurrences of
the same statement (e.g. {A, B, A, B, ...}) and, if so, tries to rearrange the
operands so that there are no duplicates. Two vec_perm expressions are then
introduced to recreate the original ordering. These duplicates can appear due
to how two_operators nodes are handled, and they prevent vectorization in some
cases.

This targets the vectorization of the SPEC2017 x264 pixel_satd functions.
On some processors, an improvement of more than 10% on x264 has been observed.

	PR tree-optimization/98138

gcc/ChangeLog:

	* tree-vect-slp.cc: Avoid duplicates in two_operators nodes.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/vect-slp-two-operator.c: New test.
This commit is contained in:
Manolis Tsamis 2024-06-25 08:00:04 -07:00 committed by Christoph Müllner
parent 71aebb3617
commit ab18785840
2 changed files with 150 additions and 0 deletions

View file

@ -0,0 +1,36 @@
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect -fdump-tree-vect-details" } */
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
/* 4-point Hadamard butterfly: combines the four inputs S0..S3 into the four
   outputs D0..D3 using two stages of add/subtract pairs.  D0..D3 must be
   assignable lvalues.  Every parameter use is parenthesized so that
   expression arguments (e.g. "x << 1") keep their own precedence when
   expanded next to the '+' / '-' operators of the body.  */
#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) {\
    int t0 = (s0) + (s1);\
    int t1 = (s0) - (s1);\
    int t2 = (s2) + (s3);\
    int t3 = (s2) - (s3);\
    (d0) = t0 + t2;\
    (d1) = t1 + t3;\
    (d2) = t0 - t2;\
    (d3) = t1 - t3;\
}
/* Only declared here; TMP is handed to it after the loop (presumably so the
   stores into TMP cannot be optimized away -- confirm).  */
void sink(uint32_t tmp[4][4]);
/* Reduced form of an x264 pixel_satd kernel used as a vectorizer test.

   For each of 4 rows, the differences of bytes 0..3 of PIX1 and PIX2 are
   combined with the differences of bytes 4..7 shifted left by 16, and the
   four packed values are run through HADAMARD4 into row I of TMP.  PIX1 and
   PIX2 advance by I_PIX1 and I_PIX2 after every row.

   Returns SUM, which is never modified and is therefore always 0.  */
int x264_pixel_satd_8x4( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
{
    uint32_t tmp[4][4];
    int sum = 0;
    for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )
    {
        uint32_t a0 = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16);
        uint32_t a1 = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16);
        uint32_t a2 = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16);
        uint32_t a3 = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16);
        HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0,a1,a2,a3 );
    }
    sink(tmp);
    /* Return explicitly so control does not fall off the end of a non-void
       function.  */
    return sum;
}
/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */

View file

@ -2437,6 +2437,95 @@ out:
} }
swap = NULL; swap = NULL;
/* State for rewriting a two_operators node whose two operands contain
   duplicate statements (e.g. {A, B, A, B, ...}) into a single operand without
   duplicates:
     has_two_operators_perm   - set once the rewrite has been applied.
     two_op_perm_indices[i]   - for each lane J of original operand I, the
				 lane of the merged operand holding the same
				 statement.
     two_op_scalar_stmts[i]   - copy of original operand I's def_stmts, taken
				 before the original operand info is freed.  */
bool has_two_operators_perm = false;
auto_vec<unsigned> two_op_perm_indices[2];
vec<stmt_vec_info> two_op_scalar_stmts[2] = {vNULL, vNULL};
if (two_operators && oprnds_info.length () == 2 && group_size > 2)
  {
    /* Number of distinct statements seen so far; also the next free lane of
       the merged operand.  */
    unsigned idx = 0;
    /* Maps each distinct defining statement to its lane in the merged
       operand.  */
    hash_map<gimple *, unsigned> seen;
    vec<slp_oprnd_info> new_oprnds_info
      = vect_create_oprnd_info (1, group_size);
    bool success = true;

    /* Every definition in both operands must be an assignment with the same
       rhs code as the first definition of operand 0.  CODE stays ERROR_MARK
       when that first definition is missing or not an assignment; the
       per-lane check below then fails on the first lane.  */
    enum tree_code code = ERROR_MARK;
    if (oprnds_info[0]->def_stmts[0]
	&& is_a<gassign *> (oprnds_info[0]->def_stmts[0]->stmt))
      code = gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt);

    for (unsigned j = 0; j < group_size; ++j)
      {
	FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
	  {
	    stmt_vec_info stmt_info = oprnd_info->def_stmts[j];
	    if (!stmt_info || !stmt_info->stmt
		|| !is_a<gassign *> (stmt_info->stmt)
		|| gimple_assign_rhs_code (stmt_info->stmt) != code
		|| skip_args[i])
	      {
		success = false;
		break;
	      }

	    bool exists;
	    unsigned &stmt_idx
	      = seen.get_or_insert (stmt_info->stmt, &exists);

	    /* First occurrence: give the statement the next lane of the
	       merged operand.  */
	    if (!exists)
	      {
		new_oprnds_info[0]->def_stmts.safe_push (stmt_info);
		new_oprnds_info[0]->ops.safe_push (oprnd_info->ops[j]);
		stmt_idx = idx;
		idx++;
	      }

	    two_op_perm_indices[i].safe_push (stmt_idx);
	  }

	if (!success)
	  break;
      }

    /* Only rewrite when the distinct statements fill exactly GROUP_SIZE
       lanes, i.e. the merged operand has the same number of lanes as each
       of the operands it replaces.  */
    if (success && idx == group_size)
      {
	if (dump_enabled_p ())
	  {
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Replace two_operators operands:\n");

	    FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
	      {
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Operand %u:\n", i);
		for (unsigned j = 0; j < group_size; j++)
		  dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
				   j, oprnd_info->def_stmts[j]->stmt);
	      }

	    dump_printf_loc (MSG_NOTE, vect_location,
			     "With a single operand:\n");
	    for (unsigned j = 0; j < group_size; j++)
	      dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
			       j, new_oprnds_info[0]->def_stmts[j]->stmt);
	  }

	/* Keep the original per-operand statements; the original operand
	   info is freed just below.  */
	two_op_scalar_stmts[0].safe_splice (oprnds_info[0]->def_stmts);
	two_op_scalar_stmts[1].safe_splice (oprnds_info[1]->def_stmts);

	/* The merged operand takes its remaining properties from operand 0.  */
	new_oprnds_info[0]->first_op_type = oprnds_info[0]->first_op_type;
	new_oprnds_info[0]->first_dt = oprnds_info[0]->first_dt;
	new_oprnds_info[0]->any_pattern = oprnds_info[0]->any_pattern;
	new_oprnds_info[0]->first_gs_p = oprnds_info[0]->first_gs_p;
	new_oprnds_info[0]->first_gs_info = oprnds_info[0]->first_gs_info;

	vect_free_oprnd_info (oprnds_info);
	oprnds_info = new_oprnds_info;
	nops = 1;
	has_two_operators_perm = true;
      }
    else
      /* The rewrite was not applied, so nothing else refers to the scratch
	 operand info; release it here instead of leaking it.  */
      vect_free_oprnd_info (new_oprnds_info);
  }
auto_vec<slp_tree, 4> children; auto_vec<slp_tree, 4> children;
stmt_info = stmts[0]; stmt_info = stmts[0];
@ -2707,6 +2796,29 @@ fail:
the true { a+b, a+b, a+b, a+b } ... but there we don't have the true { a+b, a+b, a+b, a+b } ... but there we don't have
explicit stmts to put in so the keying on 'stmts' doesn't explicit stmts to put in so the keying on 'stmts' doesn't
work (but we have the same issue with nodes that use 'ops'). */ work (but we have the same issue with nodes that use 'ops'). */
/* The two original operands were replaced by one merged operand.  Recreate
   each original operand as a VEC_PERM_EXPR node that selects lanes out of
   children[0] (presumably the node built for the merged operand -- confirm),
   and make those two permute nodes the new children.  */
if (has_two_operators_perm)
{
slp_tree child = children[0];
children.truncate (0);
for (i = 0; i < 2; i++)
{
/* Node for original operand I, built from its saved scalar statements.
   NOTE(review): the literal 2 is presumably the operand/child count; it
   matches the two pushes of CHILD below.  */
slp_tree pnode
= vect_create_new_slp_node (two_op_scalar_stmts[i], 2);
SLP_TREE_CODE (pnode) = VEC_PERM_EXPR;
SLP_TREE_VECTYPE (pnode) = vectype;
SLP_TREE_CHILDREN (pnode).quick_push (child);
SLP_TREE_CHILDREN (pnode).quick_push (child);
lane_permutation_t& perm = SLP_TREE_LANE_PERMUTATION (pnode);
children.safe_push (pnode);
/* Lane J of the permute node takes lane two_op_perm_indices[i][j]; the
   first pair element is always 0 (presumably the index of the child to
   select from -- confirm).  */
for (unsigned j = 0; j < stmts.length (); j++)
perm.safe_push (std::make_pair (0, two_op_perm_indices[i][j]));
}
/* CHILD was pushed four times above (twice per permute node).
   NOTE(review): the slot formerly held in CHILDREN was dropped by the
   truncate without a matching decrement, so += 4 may leave one surplus
   reference -- confirm against the node's initial reference count.  */
SLP_TREE_REF_COUNT (child) += 4;
}
slp_tree one = new _slp_tree; slp_tree one = new _slp_tree;
slp_tree two = new _slp_tree; slp_tree two = new _slp_tree;
SLP_TREE_DEF_TYPE (one) = vect_internal_def; SLP_TREE_DEF_TYPE (one) = vect_internal_def;
@ -2743,12 +2855,14 @@ fail:
else else
SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i)); SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
} }
SLP_TREE_CODE (one) = code0; SLP_TREE_CODE (one) = code0;
SLP_TREE_CODE (two) = ocode; SLP_TREE_CODE (two) = ocode;
SLP_TREE_LANES (one) = stmts.length (); SLP_TREE_LANES (one) = stmts.length ();
SLP_TREE_LANES (two) = stmts.length (); SLP_TREE_LANES (two) = stmts.length ();
SLP_TREE_REPRESENTATIVE (one) = stmts[0]; SLP_TREE_REPRESENTATIVE (one) = stmts[0];
SLP_TREE_REPRESENTATIVE (two) = stmts[j]; SLP_TREE_REPRESENTATIVE (two) = stmts[j];
return node; return node;
} }