amdgcn: multi-size vector reductions
Add support for vector reductions for any vector width by switching iterators and generalising the code slightly. There's no one-instruction way to move an item from lane 31 to lane 0 (63, 15, 7, 3, and 1 are all fine though), and vec_extract is probably fewer cycles anyway, so now we always reduce to an SGPR. gcc/ChangeLog: * config/gcn/gcn-valu.md (V64_SI): Delete iterator. (V64_DI): Likewise. (V64_1REG): Likewise. (V64_INT_1REG): Likewise. (V64_2REG): Likewise. (V64_ALL): Likewise. (V64_FP): Likewise. (reduc_<reduc_op>_scal_<mode>): Use V_ALL. Use gen_vec_extract. (fold_left_plus_<mode>): Use V_FP. (*<reduc_op>_dpp_shr_<mode>): Use V_1REG. (*<reduc_op>_dpp_shr_<mode>): Use V_DI. (*plus_carry_dpp_shr_<mode>): Use V_INT_1REG. (*plus_carry_in_dpp_shr_<mode>): Use V_SI. (*plus_carry_dpp_shr_<mode>): Use V_DI. (mov_from_lane63_<mode>): Delete. (mov_from_lane63_<mode>): Delete. * config/gcn/gcn.cc (gcn_expand_reduc_scalar): Support partial vectors. * config/gcn/gcn.md (unspec): Remove UNSPEC_MOV_FROM_LANE63.
This commit is contained in:
parent
12a1085644
commit
f539029c1c
3 changed files with 45 additions and 94 deletions
|
@ -32,11 +32,6 @@
|
|||
(define_mode_iterator V_DF
|
||||
[V2DF V4DF V8DF V16DF V32DF V64DF])
|
||||
|
||||
(define_mode_iterator V64_SI
|
||||
[V64SI])
|
||||
(define_mode_iterator V64_DI
|
||||
[V64DI])
|
||||
|
||||
; Vector modes for sub-dword modes
|
||||
(define_mode_iterator V_QIHI
|
||||
[V2QI V2HI
|
||||
|
@ -77,13 +72,6 @@
|
|||
V32HF V32SF
|
||||
V64HF V64SF])
|
||||
|
||||
; V64_* modes are for where more general support is unimplemented
|
||||
; (e.g. reductions)
|
||||
(define_mode_iterator V64_1REG
|
||||
[V64QI V64HI V64SI V64HF V64SF])
|
||||
(define_mode_iterator V64_INT_1REG
|
||||
[V64QI V64HI V64SI])
|
||||
|
||||
; Vector modes for two vector registers
|
||||
(define_mode_iterator V_2REG
|
||||
[V2DI V2DF
|
||||
|
@ -93,9 +81,6 @@
|
|||
V32DI V32DF
|
||||
V64DI V64DF])
|
||||
|
||||
(define_mode_iterator V64_2REG
|
||||
[V64DI V64DF])
|
||||
|
||||
; Vector modes with native support
|
||||
(define_mode_iterator V_noQI
|
||||
[V2HI V2HF V2SI V2SF V2DI V2DF
|
||||
|
@ -158,11 +143,6 @@
|
|||
V32HF V32SF V32DF
|
||||
V64HF V64SF V64DF])
|
||||
|
||||
(define_mode_iterator V64_ALL
|
||||
[V64QI V64HI V64HF V64SI V64SF V64DI V64DF])
|
||||
(define_mode_iterator V64_FP
|
||||
[V64HF V64SF V64DF])
|
||||
|
||||
(define_mode_attr scalar_mode
|
||||
[(V2QI "qi") (V2HI "hi") (V2SI "si")
|
||||
(V2HF "hf") (V2SF "sf") (V2DI "di") (V2DF "df")
|
||||
|
@ -3528,15 +3508,16 @@
|
|||
(define_expand "reduc_<reduc_op>_scal_<mode>"
|
||||
[(set (match_operand:<SCALAR_MODE> 0 "register_operand")
|
||||
(unspec:<SCALAR_MODE>
|
||||
[(match_operand:V64_ALL 1 "register_operand")]
|
||||
[(match_operand:V_ALL 1 "register_operand")]
|
||||
REDUC_UNSPEC))]
|
||||
""
|
||||
{
|
||||
rtx tmp = gcn_expand_reduc_scalar (<MODE>mode, operands[1],
|
||||
<reduc_unspec>);
|
||||
|
||||
/* The result of the reduction is in lane 63 of tmp. */
|
||||
emit_insn (gen_mov_from_lane63_<mode> (operands[0], tmp));
|
||||
rtx last_lane = GEN_INT (GET_MODE_NUNITS (<MODE>mode) - 1);
|
||||
emit_insn (gen_vec_extract<mode><scalar_mode> (operands[0], tmp,
|
||||
last_lane));
|
||||
|
||||
DONE;
|
||||
})
|
||||
|
@ -3547,7 +3528,7 @@
|
|||
(define_expand "fold_left_plus_<mode>"
|
||||
[(match_operand:<SCALAR_MODE> 0 "register_operand")
|
||||
(match_operand:<SCALAR_MODE> 1 "gcn_alu_operand")
|
||||
(match_operand:V64_FP 2 "gcn_alu_operand")]
|
||||
(match_operand:V_FP 2 "gcn_alu_operand")]
|
||||
"can_create_pseudo_p ()
|
||||
&& (flag_openacc || flag_openmp
|
||||
|| flag_associative_math)"
|
||||
|
@ -3563,11 +3544,11 @@
|
|||
})
|
||||
|
||||
(define_insn "*<reduc_op>_dpp_shr_<mode>"
|
||||
[(set (match_operand:V64_1REG 0 "register_operand" "=v")
|
||||
(unspec:V64_1REG
|
||||
[(match_operand:V64_1REG 1 "register_operand" "v")
|
||||
(match_operand:V64_1REG 2 "register_operand" "v")
|
||||
(match_operand:SI 3 "const_int_operand" "n")]
|
||||
[(set (match_operand:V_1REG 0 "register_operand" "=v")
|
||||
(unspec:V_1REG
|
||||
[(match_operand:V_1REG 1 "register_operand" "v")
|
||||
(match_operand:V_1REG 2 "register_operand" "v")
|
||||
(match_operand:SI 3 "const_int_operand" "n")]
|
||||
REDUC_UNSPEC))]
|
||||
; GCN3 requires a carry out, GCN5 not
|
||||
"!(TARGET_GCN3 && SCALAR_INT_MODE_P (<SCALAR_MODE>mode)
|
||||
|
@ -3580,11 +3561,11 @@
|
|||
(set_attr "length" "8")])
|
||||
|
||||
(define_insn_and_split "*<reduc_op>_dpp_shr_<mode>"
|
||||
[(set (match_operand:V64_DI 0 "register_operand" "=v")
|
||||
(unspec:V64_DI
|
||||
[(match_operand:V64_DI 1 "register_operand" "v")
|
||||
(match_operand:V64_DI 2 "register_operand" "v")
|
||||
(match_operand:SI 3 "const_int_operand" "n")]
|
||||
[(set (match_operand:V_DI 0 "register_operand" "=v")
|
||||
(unspec:V_DI
|
||||
[(match_operand:V_DI 1 "register_operand" "v")
|
||||
(match_operand:V_DI 2 "register_operand" "v")
|
||||
(match_operand:SI 3 "const_int_operand" "n")]
|
||||
REDUC_2REG_UNSPEC))]
|
||||
""
|
||||
"#"
|
||||
|
@ -3609,10 +3590,10 @@
|
|||
; Special cases for addition.
|
||||
|
||||
(define_insn "*plus_carry_dpp_shr_<mode>"
|
||||
[(set (match_operand:V64_INT_1REG 0 "register_operand" "=v")
|
||||
(unspec:V64_INT_1REG
|
||||
[(match_operand:V64_INT_1REG 1 "register_operand" "v")
|
||||
(match_operand:V64_INT_1REG 2 "register_operand" "v")
|
||||
[(set (match_operand:V_INT_1REG 0 "register_operand" "=v")
|
||||
(unspec:V_INT_1REG
|
||||
[(match_operand:V_INT_1REG 1 "register_operand" "v")
|
||||
(match_operand:V_INT_1REG 2 "register_operand" "v")
|
||||
(match_operand:SI 3 "const_int_operand" "n")]
|
||||
UNSPEC_PLUS_CARRY_DPP_SHR))
|
||||
(clobber (reg:DI VCC_REG))]
|
||||
|
@ -3626,12 +3607,12 @@
|
|||
(set_attr "length" "8")])
|
||||
|
||||
(define_insn "*plus_carry_in_dpp_shr_<mode>"
|
||||
[(set (match_operand:V64_SI 0 "register_operand" "=v")
|
||||
(unspec:V64_SI
|
||||
[(match_operand:V64_SI 1 "register_operand" "v")
|
||||
(match_operand:V64_SI 2 "register_operand" "v")
|
||||
(match_operand:SI 3 "const_int_operand" "n")
|
||||
(match_operand:DI 4 "register_operand" "cV")]
|
||||
[(set (match_operand:V_SI 0 "register_operand" "=v")
|
||||
(unspec:V_SI
|
||||
[(match_operand:V_SI 1 "register_operand" "v")
|
||||
(match_operand:V_SI 2 "register_operand" "v")
|
||||
(match_operand:SI 3 "const_int_operand" "n")
|
||||
(match_operand:DI 4 "register_operand" "cV")]
|
||||
UNSPEC_PLUS_CARRY_IN_DPP_SHR))
|
||||
(clobber (reg:DI VCC_REG))]
|
||||
""
|
||||
|
@ -3644,11 +3625,11 @@
|
|||
(set_attr "length" "8")])
|
||||
|
||||
(define_insn_and_split "*plus_carry_dpp_shr_<mode>"
|
||||
[(set (match_operand:V64_DI 0 "register_operand" "=v")
|
||||
(unspec:V64_DI
|
||||
[(match_operand:V64_DI 1 "register_operand" "v")
|
||||
(match_operand:V64_DI 2 "register_operand" "v")
|
||||
(match_operand:SI 3 "const_int_operand" "n")]
|
||||
[(set (match_operand:V_DI 0 "register_operand" "=v")
|
||||
(unspec:V_DI
|
||||
[(match_operand:V_DI 1 "register_operand" "v")
|
||||
(match_operand:V_DI 2 "register_operand" "v")
|
||||
(match_operand:SI 3 "const_int_operand" "n")]
|
||||
UNSPEC_PLUS_CARRY_DPP_SHR))
|
||||
(clobber (reg:DI VCC_REG))]
|
||||
""
|
||||
|
@ -3675,38 +3656,6 @@
|
|||
[(set_attr "type" "vmult")
|
||||
(set_attr "length" "16")])
|
||||
|
||||
; Instructions to move a scalar value from lane 63 of a vector register.
|
||||
(define_insn "mov_from_lane63_<mode>"
|
||||
[(set (match_operand:<SCALAR_MODE> 0 "register_operand" "=Sg,v")
|
||||
(unspec:<SCALAR_MODE>
|
||||
[(match_operand:V64_1REG 1 "register_operand" " v,v")]
|
||||
UNSPEC_MOV_FROM_LANE63))]
|
||||
""
|
||||
"@
|
||||
v_readlane_b32\t%0, %1, 63
|
||||
v_mov_b32\t%0, %1 wave_ror:1"
|
||||
[(set_attr "type" "vop3a,vop_dpp")
|
||||
(set_attr "exec" "none,*")
|
||||
(set_attr "length" "8")])
|
||||
|
||||
(define_insn "mov_from_lane63_<mode>"
|
||||
[(set (match_operand:<SCALAR_MODE> 0 "register_operand" "=Sg,v")
|
||||
(unspec:<SCALAR_MODE>
|
||||
[(match_operand:V64_2REG 1 "register_operand" " v,v")]
|
||||
UNSPEC_MOV_FROM_LANE63))]
|
||||
""
|
||||
"@
|
||||
v_readlane_b32\t%L0, %L1, 63\;v_readlane_b32\t%H0, %H1, 63
|
||||
* if (REGNO (operands[0]) <= REGNO (operands[1])) \
|
||||
return \"v_mov_b32\t%L0, %L1 wave_ror:1\;\" \
|
||||
\"v_mov_b32\t%H0, %H1 wave_ror:1\"; \
|
||||
else \
|
||||
return \"v_mov_b32\t%H0, %H1 wave_ror:1\;\" \
|
||||
\"v_mov_b32\t%L0, %L1 wave_ror:1\";"
|
||||
[(set_attr "type" "vop3a,vop_dpp")
|
||||
(set_attr "exec" "none,*")
|
||||
(set_attr "length" "8")])
|
||||
|
||||
;; }}}
|
||||
;; {{{ Miscellaneous
|
||||
|
||||
|
|
|
@ -4918,23 +4918,25 @@ gcn_expand_dpp_shr_insn (machine_mode mode, const char *insn,
|
|||
|
||||
The vector register SRC of mode MODE is reduced using the operation given
|
||||
by UNSPEC, and the scalar result is returned in lane 63 of a vector
|
||||
register. */
|
||||
/* FIXME: Implement reductions for sizes other than V64.
|
||||
(They're currently disabled in the machine description.) */
|
||||
register (or lane 31, 15, 7, 3, 1 for partial vectors). */
|
||||
|
||||
rtx
|
||||
gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec)
|
||||
{
|
||||
machine_mode orig_mode = mode;
|
||||
machine_mode scalar_mode = GET_MODE_INNER (mode);
|
||||
int vf = GET_MODE_NUNITS (mode);
|
||||
bool use_moves = (((unspec == UNSPEC_SMIN_DPP_SHR
|
||||
|| unspec == UNSPEC_SMAX_DPP_SHR
|
||||
|| unspec == UNSPEC_UMIN_DPP_SHR
|
||||
|| unspec == UNSPEC_UMAX_DPP_SHR)
|
||||
&& (mode == V64DImode
|
||||
|| mode == V64DFmode))
|
||||
&& (scalar_mode == DImode
|
||||
|| scalar_mode == DFmode))
|
||||
|| (unspec == UNSPEC_PLUS_DPP_SHR
|
||||
&& mode == V64DFmode));
|
||||
&& scalar_mode == DFmode));
|
||||
rtx_code code = (unspec == UNSPEC_SMIN_DPP_SHR ? SMIN
|
||||
: unspec == UNSPEC_SMAX_DPP_SHR ? SMAX
|
||||
: unspec == UNSPEC_UMIN_DPP_SHR ? UMIN
|
||||
: unspec == UNSPEC_UMAX_DPP_SHR ? UMAX
|
||||
|
@ -4944,23 +4946,23 @@ gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec)
|
|||
|| unspec == UNSPEC_SMAX_DPP_SHR
|
||||
|| unspec == UNSPEC_UMIN_DPP_SHR
|
||||
|| unspec == UNSPEC_UMAX_DPP_SHR)
|
||||
&& (mode == V64QImode
|
||||
|| mode == V64HImode));
|
||||
&& (scalar_mode == QImode
|
||||
|| scalar_mode == HImode));
|
||||
bool unsignedp = (unspec == UNSPEC_UMIN_DPP_SHR
|
||||
|| unspec == UNSPEC_UMAX_DPP_SHR);
|
||||
bool use_plus_carry = unspec == UNSPEC_PLUS_DPP_SHR
|
||||
&& GET_MODE_CLASS (mode) == MODE_VECTOR_INT
|
||||
&& (TARGET_GCN3 || mode == V64DImode);
|
||||
&& (TARGET_GCN3 || scalar_mode == DImode);
|
||||
|
||||
if (use_plus_carry)
|
||||
unspec = UNSPEC_PLUS_CARRY_DPP_SHR;
|
||||
|
||||
if (use_extends)
|
||||
{
|
||||
rtx tmp = gen_reg_rtx (V64SImode);
|
||||
mode = VnMODE (vf, SImode);
|
||||
rtx tmp = gen_reg_rtx (mode);
|
||||
convert_move (tmp, src, unsignedp);
|
||||
src = tmp;
|
||||
mode = V64SImode;
|
||||
}
|
||||
|
||||
/* Perform reduction by first performing the reduction operation on every
|
||||
|
@ -4968,7 +4970,8 @@ gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec)
|
|||
iteration (thereby effectively reducing every 4 lanes) and so on until
|
||||
all lanes are reduced. */
|
||||
rtx in, out = force_reg (mode, src);
|
||||
for (int i = 0, shift = 1; i < 6; i++, shift <<= 1)
|
||||
int iterations = exact_log2 (vf);
|
||||
for (int i = 0, shift = 1; i < iterations; i++, shift <<= 1)
|
||||
{
|
||||
rtx shift_val = gen_rtx_CONST_INT (VOIDmode, shift);
|
||||
in = out;
|
||||
|
|
|
@ -78,7 +78,6 @@
|
|||
UNSPEC_PLUS_CARRY_DPP_SHR UNSPEC_PLUS_CARRY_IN_DPP_SHR
|
||||
UNSPEC_AND_DPP_SHR UNSPEC_IOR_DPP_SHR UNSPEC_XOR_DPP_SHR
|
||||
UNSPEC_MOV_DPP_SHR
|
||||
UNSPEC_MOV_FROM_LANE63
|
||||
UNSPEC_GATHER
|
||||
UNSPEC_SCATTER
|
||||
UNSPEC_RCP
|
||||
|
|
Loading…
Add table
Reference in a new issue