re PR target/86753 (gcc.target/aarch64/sve/vcond_[45].c fail after recent combine patch)

2019-10-18  Prathamesh Kulkarni  <prathamesh.kulkarni@linaro.org>
	    Richard Sandiford  <richard.sandiford@arm.com>

	PR target/86753
	* tree-vectorizer.h (scalar_cond_masked_key): New struct,
	and define hashmap traits for it.
	(loop_vec_info::scalar_cond_masked_set): New member.
	(vect_record_loop_mask): Adjust prototype.
	* tree-vectorizer.c (scalar_cond_masked_key::get_cond_ops_from_tree):
	Implement method.
	* tree-vect-loop.c (vectorizable_reduction): Pass NULL as last arg to
	vect_record_loop_mask.
	(vectorizable_live_operation): Likewise.
	(vect_record_loop_mask): New param scalar_mask. Add entry
	cond, loop_mask	to scalar_cond_masked_set if scalar_mask is non NULL.
	* tree-vect-stmts.c (check_load_store_masking): New param scalar_mask.
	Pass it as last arg to vect_record_loop_mask.
	(vectorizable_call): Pass scalar_mask as last arg to
	vect_record_loop_mask.
	(vectorizable_store): Likewise.
	(vectorizable_load): Likewise.
	(vectorizable_condition): Check if another part of vectorized code
	applies loop_mask to condition or to its inverse, and if yes,
	apply loop_mask to result of vector comparison.

testsuite/
	* gcc.target/aarch64/sve/cond_cnot_2.c: Remove XFAIL
	from { scan-assembler-not {\tsel\t}.
	* gcc.target/aarch64/sve/cond_convert_1.c: Adjust to make
	only one load conditional.
	* gcc.target/aarch64/sve/cond_convert_4.c: Likewise.
	* gcc.target/aarch64/sve/cond_unary_2.c: Likewise.
	* gcc.target/aarch64/sve/vcond_4.c: Remove XFAILs.
	* gcc.target/aarch64/sve/vcond_5.c: Likewise.

Co-Authored-By: Richard Sandiford <richard.sandiford@arm.com>

From-SVN: r277141
This commit is contained in:
Prathamesh Kulkarni 2019-10-18 05:13:26 +00:00 committed by Prathamesh Kulkarni
parent 4aa255f525
commit cc1facefe3
12 changed files with 281 additions and 43 deletions

View file

@ -1,3 +1,28 @@
2019-10-18 Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>
Richard Sandiford <richard.sandiford@arm.com>
PR target/86753
* tree-vectorizer.h (scalar_cond_masked_key): New struct,
and define hashmap traits for it.
(loop_vec_info::scalar_cond_masked_set): New member.
(vect_record_loop_mask): Adjust prototype.
* tree-vectorizer.c (scalar_cond_masked_key::get_cond_ops_from_tree):
Implement method.
* tree-vect-loop.c (vectorizable_reduction): Pass NULL as last arg to
vect_record_loop_mask.
(vectorizable_live_operation): Likewise.
(vect_record_loop_mask): New param scalar_mask. Add entry
cond, loop_mask to scalar_cond_masked_set if scalar_mask is non NULL.
* tree-vect-stmts.c (check_load_store_masking): New param scalar_mask.
Pass it as last arg to vect_record_loop_mask.
(vectorizable_call): Pass scalar_mask as last arg to
vect_record_loop_mask.
(vectorizable_store): Likewise.
(vectorizable_load): Likewise.
(vectorizable_condition): Check if another part of vectorized code
applies loop_mask to condition or to its inverse, and if yes,
apply loop_mask to result of vector comparison.
2019-10-17 John David Anglin <danglin@gcc.gnu.org>
* config/pa/pa.c (pa_output_indirect_call): Fix typos in last change.

View file

@ -1,3 +1,16 @@
2019-10-18 Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>
Richard Sandiford <richard.sandiford@arm.com>
PR target/86753
* gcc.target/aarch64/sve/cond_cnot_2.c: Remove XFAIL
from { scan-assembler-not {\tsel\t}.
* gcc.target/aarch64/sve/cond_convert_1.c: Adjust to make
only one load conditional.
* gcc.target/aarch64/sve/cond_convert_4.c: Likewise.
* gcc.target/aarch64/sve/cond_unary_2.c: Likewise.
* gcc.target/aarch64/sve/vcond_4.c: Remove XFAILs.
* gcc.target/aarch64/sve/vcond_5.c: Likewise.
2019-10-18 Jakub Jelinek <jakub@redhat.com>
PR tree-optimization/92056

View file

@ -32,4 +32,4 @@ TEST_ALL (DEF_LOOP)
/* { dg-final { scan-assembler-not {\tmov\tz} } } */
/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
/* Currently we canonicalize the ?: so that !b[i] is the "false" value. */
/* { dg-final { scan-assembler-not {\tsel\t} { xfail *-*-* } } } */
/* { dg-final { scan-assembler-not {\tsel\t} } } */

View file

@ -11,7 +11,10 @@
INT_TYPE *__restrict pred, int n) \
{ \
for (int i = 0; i < n; ++i) \
r[i] = pred[i] ? (FLOAT_TYPE) a[i] : b[i]; \
{ \
FLOAT_TYPE bi = b[i]; \
r[i] = pred[i] ? (FLOAT_TYPE) a[i] : bi; \
} \
}
#define TEST_ALL(T) \

View file

@ -11,7 +11,10 @@
INT_TYPE *__restrict pred, int n) \
{ \
for (int i = 0; i < n; ++i) \
r[i] = pred[i] ? (INT_TYPE) a[i] : b[i]; \
{ \
INT_TYPE bi = b[i]; \
r[i] = pred[i] ? (INT_TYPE) a[i] : bi; \
} \
}
#define TEST_ALL(T) \

View file

@ -13,7 +13,10 @@
TYPE *__restrict pred, int n) \
{ \
for (int i = 0; i < n; ++i) \
r[i] = pred[i] ? OP (a[i]) : b[i]; \
{ \
TYPE bi = b[i]; \
r[i] = pred[i] ? OP (a[i]) : bi; \
} \
}
#define TEST_INT_TYPE(T, TYPE) \

View file

@ -98,24 +98,24 @@ TEST_CMP (nugt)
/* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */
/* 5 for lt, 5 for ult and 5 for nult. */
/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 } } */
/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 } } */
/* 5 for le, 5 for ule and 5 for nule. */
/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 } } */
/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 } } */
/* 5 for gt, 5 for ugt and 5 for nugt. */
/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 } } */
/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 } } */
/* 5 for ge, 5 for uge and 5 for nuge. */
/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 } } */
/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 } } */
/* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} } } */
/* 3 loops * 5 invocations for all 12 unordered comparisons. */
/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 180 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 180 } } */
/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 7 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 14 { xfail *-*-* } } } */
@ -123,19 +123,19 @@ TEST_CMP (nugt)
/* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 } } */
/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 } } */
/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 } } */
/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 } } */
/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 } } */
/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 } } */
/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 } } */
/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 } } */
/* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} } } */
/* 3 loops * 5 invocations, with 2 invocations having ncopies == 2,
for all 12 unordered comparisons. */
/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 252 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 252 } } */

View file

@ -19,16 +19,16 @@
/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 40 { xfail *-*-* } } } */
/* 5 for le, 5 for ule and 5 for nule. */
/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 15 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 30 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 15 } } */
/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 30 } } */
/* 5 for gt, 5 for ugt, 5 for nueq and 5 for nugt. */
/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 20 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 40 { xfail *-*-* } } } */
/* 5 for ge, 5 for uge and 5 for nuge. */
/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 15 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 30 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 15 } } */
/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 30 } } */
/* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} } } */
/* 3 loops * 5 invocations for ordered, unordered and ueq. */
@ -43,14 +43,14 @@
/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 28 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 56 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 21 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 42 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 21 } } */
/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 42 } } */
/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 28 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 56 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 21 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 42 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 21 } } */
/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 42 } } */
/* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} } } */
/* 3 loops * 5 invocations, with 2 invocations having ncopies == 2,

View file

@ -6330,7 +6330,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
}
else
vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
vectype_in);
vectype_in, NULL);
}
if (dump_enabled_p ()
&& reduction_type == FOLD_LEFT_REDUCTION)
@ -7561,7 +7561,7 @@ vectorizable_live_operation (stmt_vec_info stmt_info,
gcc_assert (ncopies == 1 && !slp_node);
vect_record_loop_mask (loop_vinfo,
&LOOP_VINFO_MASKS (loop_vinfo),
1, vectype);
1, vectype, NULL);
}
}
return true;
@ -7760,11 +7760,12 @@ vect_double_mask_nunits (tree type)
/* Record that a fully-masked version of LOOP_VINFO would need MASKS to
contain a sequence of NVECTORS masks that each control a vector of type
VECTYPE. */
VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
these vector masks with the vector version of SCALAR_MASK. */
void
vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
unsigned int nvectors, tree vectype)
unsigned int nvectors, tree vectype, tree scalar_mask)
{
gcc_assert (nvectors != 0);
if (masks->length () < nvectors)
@ -7775,6 +7776,13 @@ vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
unsigned int nscalars_per_iter
= exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
if (scalar_mask)
{
scalar_cond_masked_key cond (scalar_mask, nvectors);
loop_vinfo->scalar_cond_masked_set.add (cond);
}
if (rgm->max_nscalars_per_iter < nscalars_per_iter)
{
rgm->max_nscalars_per_iter = nscalars_per_iter;

View file

@ -1879,7 +1879,8 @@ static tree permute_vec_elements (tree, tree, tree, stmt_vec_info,
says how the load or store is going to be implemented and GROUP_SIZE
is the number of load or store statements in the containing group.
If the access is a gather load or scatter store, GS_INFO describes
its arguments.
its arguments. If the load or store is conditional, SCALAR_MASK is the
condition under which it occurs.
Clear LOOP_VINFO_CAN_FULLY_MASK_P if a fully-masked loop is not
supported, otherwise record the required mask types. */
@ -1888,7 +1889,7 @@ static void
check_load_store_masking (loop_vec_info loop_vinfo, tree vectype,
vec_load_store_type vls_type, int group_size,
vect_memory_access_type memory_access_type,
gather_scatter_info *gs_info)
gather_scatter_info *gs_info, tree scalar_mask)
{
/* Invariant loads need no special support. */
if (memory_access_type == VMAT_INVARIANT)
@ -1912,7 +1913,7 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype,
return;
}
unsigned int ncopies = vect_get_num_copies (loop_vinfo, vectype);
vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype);
vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, scalar_mask);
return;
}
@ -1936,7 +1937,7 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype,
return;
}
unsigned int ncopies = vect_get_num_copies (loop_vinfo, vectype);
vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype);
vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, scalar_mask);
return;
}
@ -1974,7 +1975,7 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype,
poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
unsigned int nvectors;
if (can_div_away_from_zero_p (group_size * vf, nunits, &nvectors))
vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype);
vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype, scalar_mask);
else
gcc_unreachable ();
}
@ -3436,7 +3437,9 @@ vectorizable_call (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
unsigned int nvectors = (slp_node
? SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node)
: ncopies);
vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_out);
tree scalar_mask = gimple_call_arg (stmt_info->stmt, mask_opno);
vect_record_loop_mask (loop_vinfo, masks, nvectors,
vectype_out, scalar_mask);
}
return true;
}
@ -7390,7 +7393,7 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
if (loop_vinfo
&& LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
check_load_store_masking (loop_vinfo, vectype, vls_type, group_size,
memory_access_type, &gs_info);
memory_access_type, &gs_info, mask);
STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
vect_model_store_cost (stmt_info, ncopies, rhs_dt, memory_access_type,
@ -8637,7 +8640,7 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
if (loop_vinfo
&& LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
check_load_store_masking (loop_vinfo, vectype, VLS_LOAD, group_size,
memory_access_type, &gs_info);
memory_access_type, &gs_info, mask);
STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
vect_model_load_cost (stmt_info, ncopies, memory_access_type,
@ -10007,6 +10010,35 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
/* Handle cond expr. */
for (j = 0; j < ncopies; j++)
{
tree loop_mask = NULL_TREE;
bool swap_cond_operands = false;
/* See whether another part of the vectorized code applies a loop
mask to the condition, or to its inverse. */
if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
{
scalar_cond_masked_key cond (cond_expr, ncopies);
if (loop_vinfo->scalar_cond_masked_set.contains (cond))
{
vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
loop_mask = vect_get_loop_mask (gsi, masks, ncopies, vectype, j);
}
else
{
bool honor_nans = HONOR_NANS (TREE_TYPE (cond.op0));
cond.code = invert_tree_comparison (cond.code, honor_nans);
if (loop_vinfo->scalar_cond_masked_set.contains (cond))
{
vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
loop_mask = vect_get_loop_mask (gsi, masks, ncopies,
vectype, j);
cond_code = cond.code;
swap_cond_operands = true;
}
}
}
stmt_vec_info new_stmt_info = NULL;
if (j == 0)
{
@ -10084,6 +10116,9 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
vec_then_clause = vec_oprnds2[i];
vec_else_clause = vec_oprnds3[i];
if (swap_cond_operands)
std::swap (vec_then_clause, vec_else_clause);
if (masked)
vec_compare = vec_cond_lhs;
else
@ -10122,6 +10157,50 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
}
}
}
/* If we decided to apply a loop mask to the result of the vector
comparison, AND the comparison with the mask now. Later passes
should then be able to reuse the AND results between multiple
vector statements.
For example:
for (int i = 0; i < 100; ++i)
x[i] = y[i] ? z[i] : 10;
results in following optimized GIMPLE:
mask__35.8_43 = vect__4.7_41 != { 0, ... };
vec_mask_and_46 = loop_mask_40 & mask__35.8_43;
_19 = &MEM[base: z_12(D), index: ivtmp_56, step: 4, offset: 0B];
vect_iftmp.11_47 = .MASK_LOAD (_19, 4B, vec_mask_and_46);
vect_iftmp.12_52 = VEC_COND_EXPR <vec_mask_and_46,
vect_iftmp.11_47, { 10, ... }>;
instead of using masked and unmasked forms of
vec != { 0, ... } (masked in the MASK_LOAD,
unmasked in the VEC_COND_EXPR). */
if (loop_mask)
{
if (COMPARISON_CLASS_P (vec_compare))
{
tree tmp = make_ssa_name (vec_cmp_type);
tree op0 = TREE_OPERAND (vec_compare, 0);
tree op1 = TREE_OPERAND (vec_compare, 1);
gassign *g = gimple_build_assign (tmp,
TREE_CODE (vec_compare),
op0, op1);
vect_finish_stmt_generation (stmt_info, g, gsi);
vec_compare = tmp;
}
tree tmp2 = make_ssa_name (vec_cmp_type);
gassign *g = gimple_build_assign (tmp2, BIT_AND_EXPR,
vec_compare, loop_mask);
vect_finish_stmt_generation (stmt_info, g, gsi);
vec_compare = tmp2;
}
if (reduction_type == EXTRACT_LAST_REDUCTION)
{
if (!is_gimple_val (vec_compare))

View file

@ -1515,3 +1515,36 @@ make_pass_ipa_increase_alignment (gcc::context *ctxt)
{
return new pass_ipa_increase_alignment (ctxt);
}
/* Break the scalar condition T down into a comparison code and two
   operands, and store them in CODE, OP0 and OP1.  If T is a comparison
   expression, or an SSA name defined by an assignment whose right-hand
   side is a comparison, record that comparison.  Otherwise record T
   as the comparison T != 0.  */

void
scalar_cond_masked_key::get_cond_ops_from_tree (tree t)
{
  /* T is a comparison expression in its own right.  */
  if (TREE_CODE_CLASS (TREE_CODE (t)) == tcc_comparison)
    {
      code = TREE_CODE (t);
      op0 = TREE_OPERAND (t, 0);
      op1 = TREE_OPERAND (t, 1);
      return;
    }

  /* Otherwise look through an SSA name to the assignment defining it.  */
  gassign *def_stmt = NULL;
  if (TREE_CODE (t) == SSA_NAME)
    def_stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (t));

  if (def_stmt)
    {
      tree_code rhs_code = gimple_assign_rhs_code (def_stmt);
      if (TREE_CODE_CLASS (rhs_code) == tcc_comparison)
	{
	  code = rhs_code;
	  op0 = gimple_assign_rhs1 (def_stmt);
	  op1 = gimple_assign_rhs2 (def_stmt);
	  return;
	}
    }

  /* No comparison was found, so fall back to T != 0.  */
  code = NE_EXPR;
  op0 = t;
  op1 = build_zero_cst (TREE_TYPE (t));
}

View file

@ -177,7 +177,75 @@ public:
#define SLP_TREE_TWO_OPERATORS(S) (S)->two_operators
#define SLP_TREE_DEF_TYPE(S) (S)->def_type
/* Key for the hash set (scalar_cond_masked_set_type) that records which
   scalar conditions have a loop mask applied; the set is populated by
   vect_record_loop_mask.  A key consists of a scalar condition, broken
   down into a comparison code and two operands, plus a copy count.  */
struct scalar_cond_masked_key
{
/* Build the key for scalar condition T and copy count NCOPIES_.
   CODE, OP0 and OP1 are filled in by get_cond_ops_from_tree.  */
scalar_cond_masked_key (tree t, unsigned ncopies_)
: ncopies (ncopies_)
{
get_cond_ops_from_tree (t);
}
/* Set CODE, OP0 and OP1 from the scalar condition passed in.  */
void get_cond_ops_from_tree (tree);
/* Copy count supplied when the key is created.  The hash traits for
   this type use the value zero to mark an empty slot.  */
unsigned ncopies;
/* Comparison code of the condition.  */
tree_code code;
/* First operand of the comparison.  */
tree op0;
/* Second operand of the comparison.  */
tree op1;
};
/* Hash traits that allow scalar_cond_masked_key to be used as the
   element type of a hash_set.  */
template<>
struct default_hash_traits<scalar_cond_masked_key>
{
typedef scalar_cond_masked_key compare_type;
typedef scalar_cond_masked_key value_type;
/* Hash V by combining its comparison code, both operands and its
   copy count.  */
static inline hashval_t
hash (value_type v)
{
inchash::hash h;
h.add_int (v.code);
inchash::add_expr (v.op0, h, 0);
inchash::add_expr (v.op1, h, 0);
h.add_int (v.ncopies);
return h.end ();
}
/* Return true if EXISTING and CANDIDATE have the same copy count and
   comparison code and if their operands match according to
   operand_equal_p with no flags set.  */
static inline bool
equal (value_type existing, value_type candidate)
{
return (existing.ncopies == candidate.ncopies
&& existing.code == candidate.code
&& operand_equal_p (existing.op0, candidate.op0, 0)
&& operand_equal_p (existing.op1, candidate.op1, 0));
}
/* Mark V as an empty slot.  Only NCOPIES is written; the other fields
   are left untouched.  */
static inline void
mark_empty (value_type &v)
{
v.ncopies = 0;
}
/* Return true if V is an empty slot, i.e. its copy count is zero.  */
static inline bool
is_empty (value_type v)
{
return v.ncopies == 0;
}
/* Deleted slots are not represented: marking is a no-op and no entry
   is ever reported as deleted.  NOTE(review): presumably elements are
   never removed from the set -- confirm against its users.  */
static inline void mark_deleted (value_type &) {}
static inline bool is_deleted (const value_type &)
{
return false;
}
/* Nothing needs to be done when an element is removed.  */
static inline void remove (value_type &) {}
};
/* Set type whose elements are scalar_cond_masked_key.  */
typedef hash_set<scalar_cond_masked_key> scalar_cond_masked_set_type;
/* Describes two objects whose addresses must be unequal for the vectorized
loop to be valid. */
@ -426,6 +494,9 @@ public:
on inactive scalars. */
vec_loop_masks masks;
/* Set of scalar conditions that have loop mask applied. */
scalar_cond_masked_set_type scalar_cond_masked_set;
/* If we are using a loop mask to align memory addresses, this variable
contains the number of vector elements that we should skip in the
first iteration of the vector loop (i.e. the number of leading
@ -1637,7 +1708,7 @@ extern void vect_gen_vector_loop_niters (loop_vec_info, tree, tree *,
extern tree vect_halve_mask_nunits (tree);
extern tree vect_double_mask_nunits (tree);
extern void vect_record_loop_mask (loop_vec_info, vec_loop_masks *,
unsigned int, tree);
unsigned int, tree, tree);
extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
unsigned int, tree, unsigned int);
extern stmt_vec_info info_for_reduction (stmt_vec_info);