From faa2202ee7fcf039b2016ce5766a2927526c5f78 Mon Sep 17 00:00:00 2001 From: Roger Sayle Date: Fri, 4 Aug 2023 16:23:38 +0100 Subject: [PATCH] i386: Split SUBREGs of SSE vector registers into vec_select insns. This patch is the final piece in the series to improve the ABI issues affecting PR 88873. The previous patches tackled inserting DFmode values into V2DFmode registers, by introducing insvti_{low,high}part patterns. This patch improves the extraction of DFmode values from V2DFmode registers via TImode intermediates. I'd initially thought this would require new extvti_{low,high}part patterns to be defined, but all that's required is to recognize that the SUBREG idioms produced by combine are equivalent to (forms of) vec_select patterns. The target-independent middle-end can't be sure that the appropriate vec_select instruction exists on the target, hence doesn't canonicalize a SUBREG of a vector mode as a vec_select, but the backend can provide a define_split stating where and when this is useful, for example, considering whether the operand is in memory, or whether !TARGET_SSE_MATH and the destination is i387. For pr88873.c, gcc -O2 -march=cascadelake currently generates: foo: vpunpcklqdq %xmm3, %xmm2, %xmm7 vpunpcklqdq %xmm1, %xmm0, %xmm6 vpunpcklqdq %xmm5, %xmm4, %xmm2 vmovdqa %xmm7, -24(%rsp) vmovdqa %xmm6, %xmm1 movq -16(%rsp), %rax vpinsrq $1, %rax, %xmm7, %xmm4 vmovapd %xmm4, %xmm6 vfmadd132pd %xmm1, %xmm2, %xmm6 vmovapd %xmm6, -24(%rsp) vmovsd -16(%rsp), %xmm1 vmovsd -24(%rsp), %xmm0 ret with this patch, we now generate: foo: vpunpcklqdq %xmm1, %xmm0, %xmm6 vpunpcklqdq %xmm3, %xmm2, %xmm7 vpunpcklqdq %xmm5, %xmm4, %xmm2 vmovdqa %xmm6, %xmm1 vfmadd132pd %xmm7, %xmm2, %xmm1 vmovsd %xmm1, %xmm1, %xmm0 vunpckhpd %xmm1, %xmm1, %xmm1 ret The improvement is even more dramatic when compared to the original 29 instructions shown in comment #8. GCC 13, for example, required 12 transfers to/from memory. 2023-08-04 Roger Sayle gcc/ChangeLog * config/i386/sse.md (define_split): Convert highpart:DF extract from V2DFmode register into a sse2_storehpd instruction. (define_split): Likewise, convert lowpart:DF extract from V2DF register into a sse2_storelpd instruction. gcc/testsuite/ChangeLog * gcc.target/i386/pr88873.c: Tweak to check for improved code. --- gcc/config/i386/sse.md | 16 ++++++++++++++++ gcc/testsuite/gcc.target/i386/pr88873.c | 2 ++ 2 files changed, 18 insertions(+) diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index ab455c3e297..f1712b001f9 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -13554,6 +13554,14 @@ [(set_attr "type" "ssemov") (set_attr "mode" "V2SF,V4SF,V2SF")]) +;; Convert highpart SUBREG in sse2_storehpd or *vec_extractv2df_1_sse. +(define_split + [(set (match_operand:DF 0 "register_operand") + (subreg:DF (match_operand:V2DF 1 "register_operand") 8))] + "TARGET_SSE" + [(set (match_dup 0) + (vec_select:DF (match_dup 1) (parallel [(const_int 1)])))]) + ;; Avoid combining registers from different units in a single alternative, ;; see comment above inline_secondary_memory_needed function in i386.cc (define_insn "sse2_storelpd" @@ -13599,6 +13607,14 @@ [(set_attr "type" "ssemov") (set_attr "mode" "V2SF,V4SF,V2SF")]) +;; Convert lowpart SUBREG into sse2_storelpd or *vec_extractv2df_0_sse. +(define_split + [(set (match_operand:DF 0 "register_operand") + (subreg:DF (match_operand:V2DF 1 "register_operand") 0))] + "TARGET_SSE" + [(set (match_dup 0) + (vec_select:DF (match_dup 1) (parallel [(const_int 0)])))]) + (define_expand "sse2_loadhpd_exp" [(set (match_operand:V2DF 0 "nonimmediate_operand") (vec_concat:V2DF diff --git a/gcc/testsuite/gcc.target/i386/pr88873.c b/gcc/testsuite/gcc.target/i386/pr88873.c index d893aac714e..a3a7ef242e2 100644 --- a/gcc/testsuite/gcc.target/i386/pr88873.c +++ b/gcc/testsuite/gcc.target/i386/pr88873.c @@ -9,3 +9,5 @@ s_t foo (s_t a, s_t b, s_t c) } /* { dg-final { scan-assembler-times "vpunpcklqdq" 3 } } */ +/* { dg-final { scan-assembler "vunpckhpd" } } */ +/* { dg-final { scan-assembler-not "rsp" } } */