amdgcn: multi-size vector reductions

Add support for vector reductions for any vector width by switching iterators
and generalising the code slightly.  There's no one-instruction way to move an
item from lane 31 to lane 0 (63, 15, 7, 3, and 1 are all fine though), and
vec_extract is probably fewer cycles anyway, so now we always reduce to an
SGPR.

gcc/ChangeLog:

	* config/gcn/gcn-valu.md (V64_SI): Delete iterator.
	(V64_DI): Likewise.
	(V64_1REG): Likewise.
	(V64_INT_1REG): Likewise.
	(V64_2REG): Likewise.
	(V64_ALL): Likewise.
	(V64_FP): Likewise.
	(reduc_<reduc_op>_scal_<mode>): Use V_ALL. Use gen_vec_extract.
	(fold_left_plus_<mode>): Use V_FP.
	(*<reduc_op>_dpp_shr_<mode>): Use V_1REG.
	(*<reduc_op>_dpp_shr_<mode>): Use V_DI.
	(*plus_carry_dpp_shr_<mode>): Use V_INT_1REG.
	(*plus_carry_in_dpp_shr_<mode>): Use V_SI.
	(*plus_carry_dpp_shr_<mode>): Use V_DI.
	(mov_from_lane63_<mode>): Delete.
	(mov_from_lane63_<mode>): Delete.
	* config/gcn/gcn.cc (gcn_expand_reduc_scalar): Support partial vectors.
	* config/gcn/gcn.md (unspec): Remove UNSPEC_MOV_FROM_LANE63.
This commit is contained in:
Andrew Stubbs 2022-10-28 12:38:43 +01:00
parent 12a1085644
commit f539029c1c
3 changed files with 45 additions and 94 deletions

View file

@@ -32,11 +32,6 @@
(define_mode_iterator V_DF
[V2DF V4DF V8DF V16DF V32DF V64DF])
(define_mode_iterator V64_SI
[V64SI])
(define_mode_iterator V64_DI
[V64DI])
; Vector modes for sub-dword modes
(define_mode_iterator V_QIHI
[V2QI V2HI
@@ -77,13 +72,6 @@
V32HF V32SF
V64HF V64SF])
; V64_* modes are for where more general support is unimplemented
; (e.g. reductions)
(define_mode_iterator V64_1REG
[V64QI V64HI V64SI V64HF V64SF])
(define_mode_iterator V64_INT_1REG
[V64QI V64HI V64SI])
; Vector modes for two vector registers
(define_mode_iterator V_2REG
[V2DI V2DF
@@ -93,9 +81,6 @@
V32DI V32DF
V64DI V64DF])
(define_mode_iterator V64_2REG
[V64DI V64DF])
; Vector modes with native support
(define_mode_iterator V_noQI
[V2HI V2HF V2SI V2SF V2DI V2DF
@@ -158,11 +143,6 @@
V32HF V32SF V32DF
V64HF V64SF V64DF])
(define_mode_iterator V64_ALL
[V64QI V64HI V64HF V64SI V64SF V64DI V64DF])
(define_mode_iterator V64_FP
[V64HF V64SF V64DF])
(define_mode_attr scalar_mode
[(V2QI "qi") (V2HI "hi") (V2SI "si")
(V2HF "hf") (V2SF "sf") (V2DI "di") (V2DF "df")
@@ -3528,15 +3508,16 @@
(define_expand "reduc_<reduc_op>_scal_<mode>"
[(set (match_operand:<SCALAR_MODE> 0 "register_operand")
(unspec:<SCALAR_MODE>
[(match_operand:V64_ALL 1 "register_operand")]
[(match_operand:V_ALL 1 "register_operand")]
REDUC_UNSPEC))]
""
{
rtx tmp = gcn_expand_reduc_scalar (<MODE>mode, operands[1],
<reduc_unspec>);
/* The result of the reduction is in lane 63 of tmp. */
emit_insn (gen_mov_from_lane63_<mode> (operands[0], tmp));
rtx last_lane = GEN_INT (GET_MODE_NUNITS (<MODE>mode) - 1);
emit_insn (gen_vec_extract<mode><scalar_mode> (operands[0], tmp,
last_lane));
DONE;
})
@@ -3547,7 +3528,7 @@
(define_expand "fold_left_plus_<mode>"
[(match_operand:<SCALAR_MODE> 0 "register_operand")
(match_operand:<SCALAR_MODE> 1 "gcn_alu_operand")
(match_operand:V64_FP 2 "gcn_alu_operand")]
(match_operand:V_FP 2 "gcn_alu_operand")]
"can_create_pseudo_p ()
&& (flag_openacc || flag_openmp
|| flag_associative_math)"
@@ -3563,11 +3544,11 @@
})
(define_insn "*<reduc_op>_dpp_shr_<mode>"
[(set (match_operand:V64_1REG 0 "register_operand" "=v")
(unspec:V64_1REG
[(match_operand:V64_1REG 1 "register_operand" "v")
(match_operand:V64_1REG 2 "register_operand" "v")
(match_operand:SI 3 "const_int_operand" "n")]
[(set (match_operand:V_1REG 0 "register_operand" "=v")
(unspec:V_1REG
[(match_operand:V_1REG 1 "register_operand" "v")
(match_operand:V_1REG 2 "register_operand" "v")
(match_operand:SI 3 "const_int_operand" "n")]
REDUC_UNSPEC))]
; GCN3 requires a carry out, GCN5 not
"!(TARGET_GCN3 && SCALAR_INT_MODE_P (<SCALAR_MODE>mode)
@@ -3580,11 +3561,11 @@
(set_attr "length" "8")])
(define_insn_and_split "*<reduc_op>_dpp_shr_<mode>"
[(set (match_operand:V64_DI 0 "register_operand" "=v")
(unspec:V64_DI
[(match_operand:V64_DI 1 "register_operand" "v")
(match_operand:V64_DI 2 "register_operand" "v")
(match_operand:SI 3 "const_int_operand" "n")]
[(set (match_operand:V_DI 0 "register_operand" "=v")
(unspec:V_DI
[(match_operand:V_DI 1 "register_operand" "v")
(match_operand:V_DI 2 "register_operand" "v")
(match_operand:SI 3 "const_int_operand" "n")]
REDUC_2REG_UNSPEC))]
""
"#"
@@ -3609,10 +3590,10 @@
; Special cases for addition.
(define_insn "*plus_carry_dpp_shr_<mode>"
[(set (match_operand:V64_INT_1REG 0 "register_operand" "=v")
(unspec:V64_INT_1REG
[(match_operand:V64_INT_1REG 1 "register_operand" "v")
(match_operand:V64_INT_1REG 2 "register_operand" "v")
[(set (match_operand:V_INT_1REG 0 "register_operand" "=v")
(unspec:V_INT_1REG
[(match_operand:V_INT_1REG 1 "register_operand" "v")
(match_operand:V_INT_1REG 2 "register_operand" "v")
(match_operand:SI 3 "const_int_operand" "n")]
UNSPEC_PLUS_CARRY_DPP_SHR))
(clobber (reg:DI VCC_REG))]
@@ -3626,12 +3607,12 @@
(set_attr "length" "8")])
(define_insn "*plus_carry_in_dpp_shr_<mode>"
[(set (match_operand:V64_SI 0 "register_operand" "=v")
(unspec:V64_SI
[(match_operand:V64_SI 1 "register_operand" "v")
(match_operand:V64_SI 2 "register_operand" "v")
(match_operand:SI 3 "const_int_operand" "n")
(match_operand:DI 4 "register_operand" "cV")]
[(set (match_operand:V_SI 0 "register_operand" "=v")
(unspec:V_SI
[(match_operand:V_SI 1 "register_operand" "v")
(match_operand:V_SI 2 "register_operand" "v")
(match_operand:SI 3 "const_int_operand" "n")
(match_operand:DI 4 "register_operand" "cV")]
UNSPEC_PLUS_CARRY_IN_DPP_SHR))
(clobber (reg:DI VCC_REG))]
""
@@ -3644,11 +3625,11 @@
(set_attr "length" "8")])
(define_insn_and_split "*plus_carry_dpp_shr_<mode>"
[(set (match_operand:V64_DI 0 "register_operand" "=v")
(unspec:V64_DI
[(match_operand:V64_DI 1 "register_operand" "v")
(match_operand:V64_DI 2 "register_operand" "v")
(match_operand:SI 3 "const_int_operand" "n")]
[(set (match_operand:V_DI 0 "register_operand" "=v")
(unspec:V_DI
[(match_operand:V_DI 1 "register_operand" "v")
(match_operand:V_DI 2 "register_operand" "v")
(match_operand:SI 3 "const_int_operand" "n")]
UNSPEC_PLUS_CARRY_DPP_SHR))
(clobber (reg:DI VCC_REG))]
""
@@ -3675,38 +3656,6 @@
[(set_attr "type" "vmult")
(set_attr "length" "16")])
; Instructions to move a scalar value from lane 63 of a vector register.
(define_insn "mov_from_lane63_<mode>"
[(set (match_operand:<SCALAR_MODE> 0 "register_operand" "=Sg,v")
(unspec:<SCALAR_MODE>
[(match_operand:V64_1REG 1 "register_operand" " v,v")]
UNSPEC_MOV_FROM_LANE63))]
""
"@
v_readlane_b32\t%0, %1, 63
v_mov_b32\t%0, %1 wave_ror:1"
[(set_attr "type" "vop3a,vop_dpp")
(set_attr "exec" "none,*")
(set_attr "length" "8")])
(define_insn "mov_from_lane63_<mode>"
[(set (match_operand:<SCALAR_MODE> 0 "register_operand" "=Sg,v")
(unspec:<SCALAR_MODE>
[(match_operand:V64_2REG 1 "register_operand" " v,v")]
UNSPEC_MOV_FROM_LANE63))]
""
"@
v_readlane_b32\t%L0, %L1, 63\;v_readlane_b32\t%H0, %H1, 63
* if (REGNO (operands[0]) <= REGNO (operands[1])) \
return \"v_mov_b32\t%L0, %L1 wave_ror:1\;\" \
\"v_mov_b32\t%H0, %H1 wave_ror:1\"; \
else \
return \"v_mov_b32\t%H0, %H1 wave_ror:1\;\" \
\"v_mov_b32\t%L0, %L1 wave_ror:1\";"
[(set_attr "type" "vop3a,vop_dpp")
(set_attr "exec" "none,*")
(set_attr "length" "8")])
;; }}}
;; {{{ Miscellaneous

View file

@@ -4918,23 +4918,25 @@ gcn_expand_dpp_shr_insn (machine_mode mode, const char *insn,
The vector register SRC of mode MODE is reduced using the operation given
by UNSPEC, and the scalar result is returned in lane 63 of a vector
register. */
/* FIXME: Implement reductions for sizes other than V64.
(They're currently disabled in the machine description.) */
register (or lane 31, 15, 7, 3, 1 for partial vectors). */
rtx
gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec)
{
machine_mode orig_mode = mode;
machine_mode scalar_mode = GET_MODE_INNER (mode);
int vf = GET_MODE_NUNITS (mode);
bool use_moves = (((unspec == UNSPEC_SMIN_DPP_SHR
|| unspec == UNSPEC_SMIN_DPP_SHR
|| unspec == UNSPEC_SMAX_DPP_SHR
|| unspec == UNSPEC_UMIN_DPP_SHR
|| unspec == UNSPEC_UMAX_DPP_SHR)
&& (mode == V64DImode
|| mode == V64DFmode))
&& (scalar_mode == DImode
|| scalar_mode == DFmode))
|| (unspec == UNSPEC_PLUS_DPP_SHR
&& mode == V64DFmode));
&& scalar_mode == DFmode));
rtx_code code = (unspec == UNSPEC_SMIN_DPP_SHR ? SMIN
: unspec == UNSPEC_SMIN_DPP_SHR ? SMIN
: unspec == UNSPEC_SMAX_DPP_SHR ? SMAX
: unspec == UNSPEC_UMIN_DPP_SHR ? UMIN
: unspec == UNSPEC_UMAX_DPP_SHR ? UMAX
@@ -4944,23 +4946,23 @@ gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec)
|| unspec == UNSPEC_SMAX_DPP_SHR
|| unspec == UNSPEC_UMIN_DPP_SHR
|| unspec == UNSPEC_UMAX_DPP_SHR)
&& (mode == V64QImode
|| mode == V64HImode));
&& (scalar_mode == QImode
|| scalar_mode == HImode));
bool unsignedp = (unspec == UNSPEC_UMIN_DPP_SHR
|| unspec == UNSPEC_UMAX_DPP_SHR);
bool use_plus_carry = unspec == UNSPEC_PLUS_DPP_SHR
&& GET_MODE_CLASS (mode) == MODE_VECTOR_INT
&& (TARGET_GCN3 || mode == V64DImode);
&& (TARGET_GCN3 || scalar_mode == DImode);
if (use_plus_carry)
unspec = UNSPEC_PLUS_CARRY_DPP_SHR;
if (use_extends)
{
rtx tmp = gen_reg_rtx (V64SImode);
mode = VnMODE (vf, SImode);
rtx tmp = gen_reg_rtx (mode);
convert_move (tmp, src, unsignedp);
src = tmp;
mode = V64SImode;
}
/* Perform reduction by first performing the reduction operation on every
@@ -4968,7 +4970,8 @@ gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec)
iteration (thereby effectively reducing every 4 lanes) and so on until
all lanes are reduced. */
rtx in, out = force_reg (mode, src);
for (int i = 0, shift = 1; i < 6; i++, shift <<= 1)
int iterations = exact_log2 (vf);
for (int i = 0, shift = 1; i < iterations; i++, shift <<= 1)
{
rtx shift_val = gen_rtx_CONST_INT (VOIDmode, shift);
in = out;

View file

@@ -78,7 +78,6 @@
UNSPEC_PLUS_CARRY_DPP_SHR UNSPEC_PLUS_CARRY_IN_DPP_SHR
UNSPEC_AND_DPP_SHR UNSPEC_IOR_DPP_SHR UNSPEC_XOR_DPP_SHR
UNSPEC_MOV_DPP_SHR
UNSPEC_MOV_FROM_LANE63
UNSPEC_GATHER
UNSPEC_SCATTER
UNSPEC_RCP