i386: Auto vectorize sdot_prod, usdot_prod, udot_prod with AVX10.2 instructions

gcc/ChangeLog:

	* config/i386/sse.md (VI1_AVX512VNNIBW): New.
	(VI2_AVX10_2): Ditto.
	(sdot_prod<mode>): Add AVX10.2 support for auto-vectorization
	and fold in the 512-bit (V64QI) variant.
	(udot_prod<mode>): Ditto.
	(sdot_prodv64qi): Removed.
	(udot_prodv64qi): Ditto.
	(usdot_prod<mode>): Add AVX10.2 support for auto-vectorization
	of 16-bit element vectors (VI2_AVX10_2).
	(udot_prod<mode>): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/vnniint16-auto-vectorize-2.c: Only define
	TEST when not defined.
	* gcc.target/i386/vnniint8-auto-vectorize-2.c: Ditto.
	* gcc.target/i386/vnniint16-auto-vectorize-3.c: New test.
	* gcc.target/i386/vnniint16-auto-vectorize-4.c: Ditto.
	* gcc.target/i386/vnniint8-auto-vectorize-3.c: Ditto.
	* gcc.target/i386/vnniint8-auto-vectorize-4.c: Ditto.
This commit is contained in:
Haochen Jiang 2024-09-02 10:24:29 +08:00
parent 5239902210
commit b1f9fbb6da
7 changed files with 88 additions and 80 deletions

View file

@ -610,6 +610,10 @@
;; Byte-element vector modes.  V64QI is enabled only with AVX512VNNI and
;; EVEX512, V32QI only with AVX2; V16QI carries no iterator condition.
(define_mode_iterator VI1_AVX512VNNI
[(V64QI "TARGET_AVX512VNNI && TARGET_EVEX512") (V32QI "TARGET_AVX2") V16QI])
;; Byte-element vector modes where V64QI is enabled by either AVX512BW or
;; AVX512VNNI (together with EVEX512).  V32QI needs AVX2; V16QI carries no
;; iterator condition.  Used for operands 1 and 2 of the byte-element
;; sdot_prod<mode> and udot_prod<mode> expanders.
(define_mode_iterator VI1_AVX512VNNIBW
[(V64QI "(TARGET_AVX512BW || TARGET_AVX512VNNI) && TARGET_EVEX512")
(V32QI "TARGET_AVX2") V16QI])
;; 256-bit and 512-bit vector modes with byte and 16-bit elements.  The
;; 512-bit modes require EVEX512, the 256-bit modes require AVX512VL.
(define_mode_iterator VI12_256_512_AVX512VL
[(V64QI "TARGET_EVEX512") (V32QI "TARGET_AVX512VL")
(V32HI "TARGET_EVEX512") (V16HI "TARGET_AVX512VL")])
@ -627,6 +631,9 @@
[(V32HI "(TARGET_AVX512BW || TARGET_AVX512VNNI) && TARGET_EVEX512")
(V16HI "TARGET_AVX2") V8HI])
;; 16-bit-element vector modes.  V32HI is enabled only with
;; TARGET_AVX10_2_512; V16HI and V8HI carry no iterator condition, so the
;; ISA check for them comes from the pattern condition of each user (the
;; 16-bit usdot_prod<mode> and udot_prod<mode> expanders test
;; TARGET_AVXVNNIINT16 || TARGET_AVX10_2_256).
(define_mode_iterator VI2_AVX10_2
[(V32HI "TARGET_AVX10_2_512") V16HI V8HI])
;; 32-bit-element vector modes: V8SI requires AVX, V4SI carries no
;; iterator condition.
(define_mode_iterator VI4_AVX
[(V8SI "TARGET_AVX") V4SI])
@ -31232,12 +31239,13 @@
(define_expand "sdot_prod<mode>"
[(match_operand:<ssedvecmode> 0 "register_operand")
(match_operand:VI1_AVX2 1 "register_operand")
(match_operand:VI1_AVX2 2 "register_operand")
(match_operand:VI1_AVX512VNNIBW 1 "register_operand")
(match_operand:VI1_AVX512VNNIBW 2 "register_operand")
(match_operand:<ssedvecmode> 3 "register_operand")]
"TARGET_SSE2"
{
if (TARGET_AVXVNNIINT8)
if ((<MODE_SIZE> == 64 && TARGET_AVX10_2_512)
|| (<MODE_SIZE> < 64 && (TARGET_AVXVNNIINT8 || TARGET_AVX10_2_256)))
{
operands[1] = lowpart_subreg (<ssedvecmode>mode,
force_reg (<MODE>mode, operands[1]),
@ -31276,44 +31284,15 @@
DONE;
})
;; Signed dot product of two V64QI vectors accumulated into V16SI.
;; Operand 0 is the result, operands 1 and 2 are the byte inputs and
;; operand 3 is the incoming accumulator.
(define_expand "sdot_prodv64qi"
  [(match_operand:V16SI 0 "register_operand")
   (match_operand:V64QI 1 "register_operand")
   (match_operand:V64QI 2 "register_operand")
   (match_operand:V16SI 3 "register_operand")]
  "(TARGET_AVX512VNNI || TARGET_AVX512BW) && TARGET_EVEX512"
{
  /* Emulate with vpdpwssd: widen each input into two V32HI halves with
     the signed unpack patterns and run the 16-bit dot product on each
     half.  */
  rtx lhs_lo = gen_reg_rtx (V32HImode);
  rtx lhs_hi = gen_reg_rtx (V32HImode);
  rtx rhs_lo = gen_reg_rtx (V32HImode);
  rtx rhs_hi = gen_reg_rtx (V32HImode);

  emit_insn (gen_vec_unpacks_lo_v64qi (lhs_lo, operands[1]));
  emit_insn (gen_vec_unpacks_lo_v64qi (rhs_lo, operands[2]));
  emit_insn (gen_vec_unpacks_hi_v64qi (lhs_hi, operands[1]));
  emit_insn (gen_vec_unpacks_hi_v64qi (rhs_hi, operands[2]));

  rtx dot_lo = gen_reg_rtx (V16SImode);
  rtx dot_hi = gen_reg_rtx (V16SImode);
  rtx zero_acc = gen_reg_rtx (V16SImode);

  /* The low half accumulates onto zero and the high half onto operand 3,
     so the incoming accumulator is counted exactly once in the sum.  */
  emit_move_insn (zero_acc, CONST0_RTX (V16SImode));
  emit_insn (gen_sdot_prodv32hi (dot_lo, lhs_lo, rhs_lo, zero_acc));
  emit_insn (gen_sdot_prodv32hi (dot_hi, lhs_hi, rhs_hi, operands[3]));

  emit_insn (gen_addv16si3 (operands[0], dot_lo, dot_hi));
  DONE;
})
(define_expand "udot_prod<mode>"
[(match_operand:<ssedvecmode> 0 "register_operand")
(match_operand:VI1_AVX2 1 "register_operand")
(match_operand:VI1_AVX2 2 "register_operand")
(match_operand:VI1_AVX512VNNIBW 1 "register_operand")
(match_operand:VI1_AVX512VNNIBW 2 "register_operand")
(match_operand:<ssedvecmode> 3 "register_operand")]
"TARGET_SSE2"
{
if (TARGET_AVXVNNIINT8)
if ((<MODE_SIZE> == 64 && TARGET_AVX10_2_512)
|| (<MODE_SIZE> < 64 && (TARGET_AVXVNNIINT8 || TARGET_AVX10_2_256)))
{
operands[1] = lowpart_subreg (<ssedvecmode>mode,
force_reg (<MODE>mode, operands[1]),
@ -31352,36 +31331,6 @@
DONE;
})
;; Unsigned dot product of two V64QI vectors accumulated into V16SI.
;; Operand 0 is the result, operands 1 and 2 are the byte inputs and
;; operand 3 is the incoming accumulator.
(define_expand "udot_prodv64qi"
  [(match_operand:V16SI 0 "register_operand")
   (match_operand:V64QI 1 "register_operand")
   (match_operand:V64QI 2 "register_operand")
   (match_operand:V16SI 3 "register_operand")]
  "(TARGET_AVX512VNNI || TARGET_AVX512BW) && TARGET_EVEX512"
{
  /* Emulate with vpdpwssd: widen each input into two V32HI halves with
     the unsigned unpack patterns, then reuse the signed 16-bit dot
     product on each half.  */
  rtx lhs_lo = gen_reg_rtx (V32HImode);
  rtx lhs_hi = gen_reg_rtx (V32HImode);
  rtx rhs_lo = gen_reg_rtx (V32HImode);
  rtx rhs_hi = gen_reg_rtx (V32HImode);

  emit_insn (gen_vec_unpacku_lo_v64qi (lhs_lo, operands[1]));
  emit_insn (gen_vec_unpacku_lo_v64qi (rhs_lo, operands[2]));
  emit_insn (gen_vec_unpacku_hi_v64qi (lhs_hi, operands[1]));
  emit_insn (gen_vec_unpacku_hi_v64qi (rhs_hi, operands[2]));

  rtx dot_lo = gen_reg_rtx (V16SImode);
  rtx dot_hi = gen_reg_rtx (V16SImode);
  rtx zero_acc = gen_reg_rtx (V16SImode);

  /* The low half accumulates onto zero and the high half onto operand 3,
     so the incoming accumulator is counted exactly once in the sum.  */
  emit_move_insn (zero_acc, CONST0_RTX (V16SImode));
  emit_insn (gen_sdot_prodv32hi (dot_lo, lhs_lo, rhs_lo, zero_acc));
  emit_insn (gen_sdot_prodv32hi (dot_hi, lhs_hi, rhs_hi, operands[3]));

  emit_insn (gen_addv16si3 (operands[0], dot_lo, dot_hi));
  DONE;
})
(define_insn "vpdp<vpdotprodtype>_<mode>"
[(set (match_operand:VI4_AVX 0 "register_operand" "=v")
(unspec:VI4_AVX
@ -31757,10 +31706,10 @@
(define_expand "usdot_prod<mode>"
[(match_operand:<sseunpackmode> 0 "register_operand")
(match_operand:VI2_AVX2 1 "register_operand")
(match_operand:VI2_AVX2 2 "register_operand")
(match_operand:VI2_AVX10_2 1 "register_operand")
(match_operand:VI2_AVX10_2 2 "register_operand")
(match_operand:<sseunpackmode> 3 "register_operand")]
"TARGET_AVXVNNIINT16"
"TARGET_AVXVNNIINT16 || TARGET_AVX10_2_256"
{
operands[1] = lowpart_subreg (<sseunpackmode>mode,
force_reg (<MODE>mode, operands[1]),
@ -31775,10 +31724,10 @@
(define_expand "udot_prod<mode>"
[(match_operand:<sseunpackmode> 0 "register_operand")
(match_operand:VI2_AVX2 1 "register_operand")
(match_operand:VI2_AVX2 2 "register_operand")
(match_operand:VI2_AVX10_2 1 "register_operand")
(match_operand:VI2_AVX10_2 2 "register_operand")
(match_operand:<sseunpackmode> 3 "register_operand")]
"TARGET_AVXVNNIINT16"
"TARGET_AVXVNNIINT16 || TARGET_AVX10_2_256"
{
operands[1] = lowpart_subreg (<sseunpackmode>mode,
force_reg (<MODE>mode, operands[1]),

View file

@ -2,19 +2,24 @@
/* { dg-options "-O2 -mavxvnniint16" } */
/* { dg-require-effective-target avxvnniint16 } */
#ifndef AVX10_2
#define AVXVNNIINT16
#ifndef CHECK
#define CHECK "avx-check.h"
#endif
#ifndef TEST
#define TEST avx_test
#ifndef CHECK
#define CHECK "avx-check.h"
#endif
#include CHECK
#include "vnniint16-auto-vectorize-1.c"
#ifndef TEST
#define TEST avx_test
#endif
#ifndef N
#define N 256
#endif
short a_i16[N];
unsigned short b_u16[N], c_u16[N], d_u16[N];

View file

@ -0,0 +1,6 @@
/* { dg-do compile } */
/* { dg-options "-mavx10.2 -O2" } */
/* { dg-final { scan-assembler "vpdpwusd\t" } } */
/* { dg-final { scan-assembler "vpdpwuud\t" } } */
/* Compile-only test: the code in the included file is built with
   -mavx10.2 at -O2 and the generated assembly is scanned for the
   vpdpwusd and vpdpwuud mnemonics.  */
#include "vnniint16-auto-vectorize-1.c"

View file

@ -0,0 +1,18 @@
/* { dg-do run } */
/* { dg-options "-O2 -mavx10.2-512" } */
/* { dg-require-effective-target avx10_2_512 } */
/* Runtime variant built with -mavx10.2-512: preset the macros below, then
   reuse the body of vnniint16-auto-vectorize-2.c.  */
#define N 512
/* NOTE(review): AVX10_2, AVX10_2_512, AVX10_512BIT and AVX512F_LEN are
   presumably consumed by the check header and the included test; confirm
   against avx10-check.h.  */
#define AVX10_2
#define AVX10_2_512
#define AVX10_512BIT
#define AVX512F_LEN 512
#define TEST test_512
/* Only choose the check header if the includer has not already done so.  */
#ifndef CHECK
#define CHECK "avx10-check.h"
#endif
#include "vnniint16-auto-vectorize-2.c"

View file

@ -2,19 +2,25 @@
/* { dg-options "-O2 -mavxvnniint8" } */
/* { dg-require-effective-target avxvnniint8 } */
#ifndef AVX10_2
#define AVXVNNIINT8
#ifndef CHECK
#define CHECK "avx-check.h"
#endif
#ifndef TEST
#define TEST avx_test
#ifndef CHECK
#define CHECK "avx-check.h"
#endif
#include CHECK
#include "vnniint8-auto-vectorize-1.c"
#ifndef TEST
#define TEST avx_test
#endif
#ifndef N
#define N 256
#endif
char a_i8[N], b_i8[N];
unsigned char c_u8[N], d_u8[N];
int i8_exp, i8_ref;

View file

@ -0,0 +1,6 @@
/* { dg-do compile } */
/* { dg-options "-mavx10.2 -O2" } */
/* { dg-final { scan-assembler "vpdpbssd\t" } } */
/* { dg-final { scan-assembler "vpdpbuud\t" } } */
/* Compile-only test: the code in the included file is built with
   -mavx10.2 at -O2 and the generated assembly is scanned for the
   vpdpbssd and vpdpbuud mnemonics.  */
#include "vnniint8-auto-vectorize-1.c"

View file

@ -0,0 +1,18 @@
/* { dg-do run } */
/* { dg-options "-O2 -mavx10.2-512" } */
/* { dg-require-effective-target avx10_2_512 } */
/* Runtime variant built with -mavx10.2-512: preset the macros below, then
   reuse the body of vnniint8-auto-vectorize-2.c.  */
#define N 512
/* NOTE(review): AVX10_2, AVX10_2_512, AVX10_512BIT and AVX512F_LEN are
   presumably consumed by the check header and the included test; confirm
   against avx10-check.h.  */
#define AVX10_2
#define AVX10_2_512
#define AVX10_512BIT
#define AVX512F_LEN 512
#define TEST test_512
/* Only choose the check header if the includer has not already done so.  */
#ifndef CHECK
#define CHECK "avx10-check.h"
#endif
#include "vnniint8-auto-vectorize-2.c"