amdgcn: multi-size vector reductions

Add support for vector reductions for any vector width by switching iterators
and generalising the code slightly.  There's no one-instruction way to move an
item from lane 31 to lane 0 (63, 15, 7, 3, and 1 are all fine though), and
vec_extract is probably fewer cycles anyway, so now we always reduce to an
SGPR.

gcc/ChangeLog:

	* config/gcn/gcn-valu.md (V64_SI): Delete iterator.
	(V64_DI): Likewise.
	(V64_1REG): Likewise.
	(V64_INT_1REG): Likewise.
	(V64_2REG): Likewise.
	(V64_ALL): Likewise.
	(V64_FP): Likewise.
	(reduc_<reduc_op>_scal_<mode>): Use V_ALL. Use gen_vec_extract.
	(fold_left_plus_<mode>): Use V_FP.
	(*<reduc_op>_dpp_shr_<mode>): Use V_1REG.
	(*<reduc_op>_dpp_shr_<mode>): Use V_DI.
	(*plus_carry_dpp_shr_<mode>): Use V_INT_1REG.
	(*plus_carry_in_dpp_shr_<mode>): Use V_SI.
	(*plus_carry_dpp_shr_<mode>): Use V_DI.
	(mov_from_lane63_<mode>): Delete.
	(mov_from_lane63_<mode>): Delete.
	* config/gcn/gcn.cc (gcn_expand_reduc_scalar): Support partial vectors.
	* config/gcn/gcn.md (unspec): Remove UNSPEC_MOV_FROM_LANE63.
This commit is contained in:
Andrew Stubbs 2022-10-28 12:38:43 +01:00
parent 12a1085644
commit f539029c1c
3 changed files with 45 additions and 94 deletions

View file

@@ -32,11 +32,6 @@
(define_mode_iterator V_DF
[V2DF V4DF V8DF V16DF V32DF V64DF])
(define_mode_iterator V64_SI
[V64SI])
(define_mode_iterator V64_DI
[V64DI])
; Vector modes for sub-dword modes
(define_mode_iterator V_QIHI
[V2QI V2HI
@@ -77,13 +72,6 @@
V32HF V32SF
V64HF V64SF])
; V64_* modes are for where more general support is unimplemented
; (e.g. reductions)
(define_mode_iterator V64_1REG
[V64QI V64HI V64SI V64HF V64SF])
(define_mode_iterator V64_INT_1REG
[V64QI V64HI V64SI])
; Vector modes for two vector registers
(define_mode_iterator V_2REG
[V2DI V2DF
@@ -93,9 +81,6 @@
V32DI V32DF
V64DI V64DF])
(define_mode_iterator V64_2REG
[V64DI V64DF])
; Vector modes with native support
(define_mode_iterator V_noQI
[V2HI V2HF V2SI V2SF V2DI V2DF
@@ -158,11 +143,6 @@
V32HF V32SF V32DF
V64HF V64SF V64DF])
(define_mode_iterator V64_ALL
[V64QI V64HI V64HF V64SI V64SF V64DI V64DF])
(define_mode_iterator V64_FP
[V64HF V64SF V64DF])
(define_mode_attr scalar_mode
[(V2QI "qi") (V2HI "hi") (V2SI "si")
(V2HF "hf") (V2SF "sf") (V2DI "di") (V2DF "df")
@@ -3528,15 +3508,16 @@
(define_expand "reduc_<reduc_op>_scal_<mode>"
[(set (match_operand:<SCALAR_MODE> 0 "register_operand")
(unspec:<SCALAR_MODE>
[(match_operand:V64_ALL 1 "register_operand")]
[(match_operand:V_ALL 1 "register_operand")]
REDUC_UNSPEC))]
""
{
rtx tmp = gcn_expand_reduc_scalar (<MODE>mode, operands[1],
<reduc_unspec>);
/* The result of the reduction is in lane 63 of tmp. */
emit_insn (gen_mov_from_lane63_<mode> (operands[0], tmp));
rtx last_lane = GEN_INT (GET_MODE_NUNITS (<MODE>mode) - 1);
emit_insn (gen_vec_extract<mode><scalar_mode> (operands[0], tmp,
last_lane));
DONE;
})
@@ -3547,7 +3528,7 @@
(define_expand "fold_left_plus_<mode>"
[(match_operand:<SCALAR_MODE> 0 "register_operand")
(match_operand:<SCALAR_MODE> 1 "gcn_alu_operand")
(match_operand:V64_FP 2 "gcn_alu_operand")]
(match_operand:V_FP 2 "gcn_alu_operand")]
"can_create_pseudo_p ()
&& (flag_openacc || flag_openmp
|| flag_associative_math)"
@@ -3563,11 +3544,11 @@
})
(define_insn "*<reduc_op>_dpp_shr_<mode>"
[(set (match_operand:V64_1REG 0 "register_operand" "=v")
(unspec:V64_1REG
[(match_operand:V64_1REG 1 "register_operand" "v")
(match_operand:V64_1REG 2 "register_operand" "v")
(match_operand:SI 3 "const_int_operand" "n")]
[(set (match_operand:V_1REG 0 "register_operand" "=v")
(unspec:V_1REG
[(match_operand:V_1REG 1 "register_operand" "v")
(match_operand:V_1REG 2 "register_operand" "v")
(match_operand:SI 3 "const_int_operand" "n")]
REDUC_UNSPEC))]
; GCN3 requires a carry out, GCN5 not
"!(TARGET_GCN3 && SCALAR_INT_MODE_P (<SCALAR_MODE>mode)
@@ -3580,11 +3561,11 @@
(set_attr "length" "8")])
(define_insn_and_split "*<reduc_op>_dpp_shr_<mode>"
[(set (match_operand:V64_DI 0 "register_operand" "=v")
(unspec:V64_DI
[(match_operand:V64_DI 1 "register_operand" "v")
(match_operand:V64_DI 2 "register_operand" "v")
(match_operand:SI 3 "const_int_operand" "n")]
[(set (match_operand:V_DI 0 "register_operand" "=v")
(unspec:V_DI
[(match_operand:V_DI 1 "register_operand" "v")
(match_operand:V_DI 2 "register_operand" "v")
(match_operand:SI 3 "const_int_operand" "n")]
REDUC_2REG_UNSPEC))]
""
"#"
@@ -3609,10 +3590,10 @@
; Special cases for addition.
(define_insn "*plus_carry_dpp_shr_<mode>"
[(set (match_operand:V64_INT_1REG 0 "register_operand" "=v")
(unspec:V64_INT_1REG
[(match_operand:V64_INT_1REG 1 "register_operand" "v")
(match_operand:V64_INT_1REG 2 "register_operand" "v")
[(set (match_operand:V_INT_1REG 0 "register_operand" "=v")
(unspec:V_INT_1REG
[(match_operand:V_INT_1REG 1 "register_operand" "v")
(match_operand:V_INT_1REG 2 "register_operand" "v")
(match_operand:SI 3 "const_int_operand" "n")]
UNSPEC_PLUS_CARRY_DPP_SHR))
(clobber (reg:DI VCC_REG))]
@@ -3626,12 +3607,12 @@
(set_attr "length" "8")])
(define_insn "*plus_carry_in_dpp_shr_<mode>"
[(set (match_operand:V64_SI 0 "register_operand" "=v")
(unspec:V64_SI
[(match_operand:V64_SI 1 "register_operand" "v")
(match_operand:V64_SI 2 "register_operand" "v")
(match_operand:SI 3 "const_int_operand" "n")
(match_operand:DI 4 "register_operand" "cV")]
[(set (match_operand:V_SI 0 "register_operand" "=v")
(unspec:V_SI
[(match_operand:V_SI 1 "register_operand" "v")
(match_operand:V_SI 2 "register_operand" "v")
(match_operand:SI 3 "const_int_operand" "n")
(match_operand:DI 4 "register_operand" "cV")]
UNSPEC_PLUS_CARRY_IN_DPP_SHR))
(clobber (reg:DI VCC_REG))]
""
@@ -3644,11 +3625,11 @@
(set_attr "length" "8")])
(define_insn_and_split "*plus_carry_dpp_shr_<mode>"
[(set (match_operand:V64_DI 0 "register_operand" "=v")
(unspec:V64_DI
[(match_operand:V64_DI 1 "register_operand" "v")
(match_operand:V64_DI 2 "register_operand" "v")
(match_operand:SI 3 "const_int_operand" "n")]
[(set (match_operand:V_DI 0 "register_operand" "=v")
(unspec:V_DI
[(match_operand:V_DI 1 "register_operand" "v")
(match_operand:V_DI 2 "register_operand" "v")
(match_operand:SI 3 "const_int_operand" "n")]
UNSPEC_PLUS_CARRY_DPP_SHR))
(clobber (reg:DI VCC_REG))]
""
@@ -3675,38 +3656,6 @@
[(set_attr "type" "vmult")
(set_attr "length" "16")])
; Instructions to move a scalar value from lane 63 of a vector register.
(define_insn "mov_from_lane63_<mode>"
[(set (match_operand:<SCALAR_MODE> 0 "register_operand" "=Sg,v")
(unspec:<SCALAR_MODE>
[(match_operand:V64_1REG 1 "register_operand" " v,v")]
UNSPEC_MOV_FROM_LANE63))]
""
"@
v_readlane_b32\t%0, %1, 63
v_mov_b32\t%0, %1 wave_ror:1"
[(set_attr "type" "vop3a,vop_dpp")
(set_attr "exec" "none,*")
(set_attr "length" "8")])
(define_insn "mov_from_lane63_<mode>"
[(set (match_operand:<SCALAR_MODE> 0 "register_operand" "=Sg,v")
(unspec:<SCALAR_MODE>
[(match_operand:V64_2REG 1 "register_operand" " v,v")]
UNSPEC_MOV_FROM_LANE63))]
""
"@
v_readlane_b32\t%L0, %L1, 63\;v_readlane_b32\t%H0, %H1, 63
* if (REGNO (operands[0]) <= REGNO (operands[1])) \
return \"v_mov_b32\t%L0, %L1 wave_ror:1\;\" \
\"v_mov_b32\t%H0, %H1 wave_ror:1\"; \
else \
return \"v_mov_b32\t%H0, %H1 wave_ror:1\;\" \
\"v_mov_b32\t%L0, %L1 wave_ror:1\";"
[(set_attr "type" "vop3a,vop_dpp")
(set_attr "exec" "none,*")
(set_attr "length" "8")])
;; }}}
;; {{{ Miscellaneous

View file

@@ -4918,23 +4918,25 @@ gcn_expand_dpp_shr_insn (machine_mode mode, const char *insn,
The vector register SRC of mode MODE is reduced using the operation given
by UNSPEC, and the scalar result is returned in lane 63 of a vector
register. */
/* FIXME: Implement reductions for sizes other than V64.
(They're currently disabled in the machine description.) */
register (or lane 31, 15, 7, 3, 1 for partial vectors). */
rtx
gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec)
{
machine_mode orig_mode = mode;
machine_mode scalar_mode = GET_MODE_INNER (mode);
int vf = GET_MODE_NUNITS (mode);
bool use_moves = (((unspec == UNSPEC_SMIN_DPP_SHR
|| unspec == UNSPEC_SMIN_DPP_SHR
|| unspec == UNSPEC_SMAX_DPP_SHR
|| unspec == UNSPEC_UMIN_DPP_SHR
|| unspec == UNSPEC_UMAX_DPP_SHR)
&& (mode == V64DImode
|| mode == V64DFmode))
&& (scalar_mode == DImode
|| scalar_mode == DFmode))
|| (unspec == UNSPEC_PLUS_DPP_SHR
&& mode == V64DFmode));
&& scalar_mode == DFmode));
rtx_code code = (unspec == UNSPEC_SMIN_DPP_SHR ? SMIN
: unspec == UNSPEC_SMIN_DPP_SHR ? SMIN
: unspec == UNSPEC_SMAX_DPP_SHR ? SMAX
: unspec == UNSPEC_UMIN_DPP_SHR ? UMIN
: unspec == UNSPEC_UMAX_DPP_SHR ? UMAX
@@ -4944,23 +4946,23 @@ gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec)
|| unspec == UNSPEC_SMAX_DPP_SHR
|| unspec == UNSPEC_UMIN_DPP_SHR
|| unspec == UNSPEC_UMAX_DPP_SHR)
&& (mode == V64QImode
|| mode == V64HImode));
&& (scalar_mode == QImode
|| scalar_mode == HImode));
bool unsignedp = (unspec == UNSPEC_UMIN_DPP_SHR
|| unspec == UNSPEC_UMAX_DPP_SHR);
bool use_plus_carry = unspec == UNSPEC_PLUS_DPP_SHR
&& GET_MODE_CLASS (mode) == MODE_VECTOR_INT
&& (TARGET_GCN3 || mode == V64DImode);
&& (TARGET_GCN3 || scalar_mode == DImode);
if (use_plus_carry)
unspec = UNSPEC_PLUS_CARRY_DPP_SHR;
if (use_extends)
{
rtx tmp = gen_reg_rtx (V64SImode);
mode = VnMODE (vf, SImode);
rtx tmp = gen_reg_rtx (mode);
convert_move (tmp, src, unsignedp);
src = tmp;
mode = V64SImode;
}
/* Perform reduction by first performing the reduction operation on every
@@ -4968,7 +4970,8 @@ gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec)
iteration (thereby effectively reducing every 4 lanes) and so on until
all lanes are reduced. */
rtx in, out = force_reg (mode, src);
for (int i = 0, shift = 1; i < 6; i++, shift <<= 1)
int iterations = exact_log2 (vf);
for (int i = 0, shift = 1; i < iterations; i++, shift <<= 1)
{
rtx shift_val = gen_rtx_CONST_INT (VOIDmode, shift);
in = out;

View file

@@ -78,7 +78,6 @@
UNSPEC_PLUS_CARRY_DPP_SHR UNSPEC_PLUS_CARRY_IN_DPP_SHR
UNSPEC_AND_DPP_SHR UNSPEC_IOR_DPP_SHR UNSPEC_XOR_DPP_SHR
UNSPEC_MOV_DPP_SHR
UNSPEC_MOV_FROM_LANE63
UNSPEC_GATHER
UNSPEC_SCATTER
UNSPEC_RCP