[PR115458][LRA]: Run split sub-pass more times
In this PR's case, LRA needs to provide too many hard regs for insn reloads, where
some reload pseudos require 8 aligned regs for themselves.  As a last resort, LRA
tries to split live ranges of hard regs for the insn reload pseudos.  This is a
very rare case.  An inheritance pseudo involving a reload pseudo of the insn can
be spilled in the assignment sub-pass run right after the splitting, and then the
split sub-pass needs to be run again for the inheritance pseudo.

gcc/ChangeLog:

	PR target/115458
	* lra-int.h (LRA_MAX_FAILED_SPLITS): Define and check its value.
	(lra_split_hard_reg_for): Change prototype.
	* lra.cc (lra): Try to split hard reg range several times after a
	failure.
	* lra-assigns.cc (lra_split_hard_reg_for): Add an arg, a flag of
	giving up.  Report asm error and nullify the asm insn depending
	on the arg value.

gcc/testsuite/ChangeLog:

	PR target/115458
	* g++.target/riscv/pr115458.C: New.
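Before reading the diffs, a minimal standalone sketch of the retry logic this
patch adds to the lra () driver loop may help.  The try_assign and try_split
stubs below are hypothetical stand-ins for lra_assign and
lra_split_hard_reg_for; this only illustrates the bounded-retry control flow,
it is not GCC code.

    // Sketch of the bounded retry of the split sub-pass (assumption:
    // try_assign/try_split stand in for lra_assign/lra_split_hard_reg_for).
    #include <cstdio>

    static const int LRA_MAX_FAILED_SPLITS = 10;  // bound from lra-int.h

    // Stub: pretend assignment keeps failing, so the retry path is taken.
    static bool try_assign () { return false; }

    // Stub: pretend splitting keeps failing too.  With fail_p set, the real
    // lra_split_hard_reg_for also reports errors and nullifies offending asm
    // insns before returning false.
    static bool try_split (bool fail_p) { (void) fail_p; return false; }

    int main ()
    {
      int split_fails_num = 0;
      for (;;)
	{
	  if (try_assign ())
	    break;		// every reload pseudo got a hard reg
	  // Report errors only once the split sub-pass has already been
	  // retried more than LRA_MAX_FAILED_SPLITS times.
	  bool last_failed_split_p = split_fails_num > LRA_MAX_FAILED_SPLITS;
	  if (!try_split (last_failed_split_p))
	    {
	      if (last_failed_split_p)
		break;		// errors were reported; give up
	      split_fails_num++; // count the failure and retry the sub-pass
	    }
	}
      std::printf ("gave up after %d failed splits\n", split_fails_num);
      return 0;
    }

The point of the new flag is that error reporting and asm nullification now
happen only on the final attempt, after the retry budget is exhausted.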
gcc/lra-assigns.cc
@@ -1763,12 +1763,13 @@ find_reload_regno_insns (int regno, rtx_insn * &start, rtx_insn * &finish)
   return true;
 }
 
-/* Process reload pseudos which did not get a hard reg, split a hard
-   reg live range in live range of a reload pseudo, and then return
-   TRUE.  If we did not split a hard reg live range, report an error,
-   and return FALSE.  */
+/* Process reload pseudos which did not get a hard reg, split a hard reg live
+   range in live range of a reload pseudo, and then return TRUE.  Otherwise,
+   return FALSE.  When FAIL_P is TRUE and if we did not split a hard reg live
+   range for failed reload pseudos, report an error and modify related asm
+   insns.  */
 bool
-lra_split_hard_reg_for (void)
+lra_split_hard_reg_for (bool fail_p)
 {
   int i, regno;
   rtx_insn *insn, *first, *last;
@@ -1843,23 +1844,30 @@ lra_split_hard_reg_for (void)
 	  regno = u;
 	  bitmap_ior_into (&failed_reload_insns,
 			   &lra_reg_info[regno].insn_bitmap);
-	  lra_setup_reg_renumber
-	    (regno, ira_class_hard_regs[lra_get_allocno_class (regno)][0], false);
-	}
-      EXECUTE_IF_SET_IN_BITMAP (&failed_reload_insns, 0, u, bi)
-	{
-	  insn = lra_insn_recog_data[u]->insn;
-	  if (asm_noperands (PATTERN (insn)) >= 0)
-	    {
-	      asm_p = true;
-	      lra_asm_insn_error (insn);
-	    }
-	  else if (!asm_p)
-	    {
-	      error ("unable to find a register to spill");
-	      fatal_insn ("this is the insn:", insn);
-	    }
-	}
+	  if (fail_p)
+	    lra_setup_reg_renumber
+	      (regno, ira_class_hard_regs[lra_get_allocno_class (regno)][0], false);
+	}
+      if (fail_p)
+	EXECUTE_IF_SET_IN_BITMAP (&failed_reload_insns, 0, u, bi)
+	  {
+	    insn = lra_insn_recog_data[u]->insn;
+	    if (asm_noperands (PATTERN (insn)) >= 0)
+	      {
+		asm_p = true;
+		lra_asm_insn_error (insn);
+		if (JUMP_P (insn))
+		  ira_nullify_asm_goto (insn);
+		else
+		  PATTERN (insn) = gen_rtx_USE (VOIDmode, const0_rtx);
+		lra_invalidate_insn_data (insn);
+	      }
+	    else if (!asm_p)
+	      {
+		error ("unable to find a register to spill");
+		fatal_insn ("this is the insn:", insn);
+	      }
+	  }
       bitmap_clear (&failed_reload_pseudos);
       bitmap_clear (&failed_reload_insns);
       return false;
gcc/lra-int.h
@@ -252,6 +252,18 @@ typedef class lra_insn_recog_data *lra_insn_recog_data_t;
    for preventing LRA cycling in a bug case.  */
 #define LRA_MAX_ASSIGNMENT_ITERATION_NUMBER 30
 
+/* Maximum allowed number of tries to split hard reg live ranges after failure
+   in assignment of reload pseudos.  Theoretical bound for the value is the
+   number of the insn reload pseudos plus the number of inheritance pseudos
+   generated from the reload pseudos.  This bound can be achieved when all the
+   reload pseudos and the inheritance pseudos require hard reg splitting for
+   their assignment.  This is an extremely unlikely event.  */
+#define LRA_MAX_FAILED_SPLITS 10
+
+#if LRA_MAX_FAILED_SPLITS >= LRA_MAX_ASSIGNMENT_ITERATION_NUMBER
+#error wrong LRA_MAX_FAILED_SPLITS value
+#endif
+
 /* The maximal number of inheritance/split passes in LRA.  It should
    be more 1 in order to perform caller saves transformations and much
    less MAX_CONSTRAINT_ITERATION_NUMBER to prevent LRA to do as many
@@ -392,7 +404,7 @@ extern int lra_assignment_iter;
 extern int lra_assignment_iter_after_spill;
 extern void lra_setup_reg_renumber (int, int, bool);
 extern bool lra_assign (bool &);
-extern bool lra_split_hard_reg_for (void);
+extern bool lra_split_hard_reg_for (bool fail_p);
 
 /* lra-coalesce.cc:  */
 
gcc/lra.cc
@@ -2480,6 +2480,7 @@ lra (FILE *f, int verbose)
       lra_clear_live_ranges ();
       bool fails_p;
       lra_hard_reg_split_p = false;
+      int split_fails_num = 0;
       do
 	{
 	  /* We need live ranges for lra_assign -- so build them.
@@ -2493,7 +2494,7 @@ lra (FILE *f, int verbose)
 	     coalescing.  If inheritance pseudos were spilled, the
 	     memory-memory moves involving them will be removed by
 	     pass undoing inheritance.  */
-	  if (lra_simple_p)
+	  if (lra_simple_p || lra_hard_reg_split_p)
 	    lra_assign (fails_p);
 	  else
 	    {
@@ -2522,8 +2523,15 @@ lra (FILE *f, int verbose)
 	      if (live_p)
 		lra_clear_live_ranges ();
 	      live_p = false;
-	      if (! lra_split_hard_reg_for ())
-		break;
+	      /* See a comment for LRA_MAX_FAILED_SPLITS definition.  */
+	      bool last_failed_split_p
+		= split_fails_num > LRA_MAX_FAILED_SPLITS;
+	      if (! lra_split_hard_reg_for (last_failed_split_p))
+		{
+		  if (last_failed_split_p)
+		    break;
+		  split_fails_num++;
+		}
 	      lra_hard_reg_split_p = true;
 	    }
 	}
gcc/testsuite/g++.target/riscv/pr115458.C (new file)
@@ -0,0 +1,357 @@
/* { dg-do compile } */
/* { dg-options "-march=rv64gcv1p0 -mabi=lp64d -misa-spec=20191213 -mtls-dialect=trad -march=rv64imafdc_v1p0_zmmul_zca_zcd_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b -O2 -std=c++17 -fno-exceptions -w" } */

typedef signed char int8_t;
typedef unsigned char uint8_t;
void Abort(...);
template <bool> struct EnableIfT;
template <> struct EnableIfT<true> {
  using type = void;
};
template <bool Condition> using EnableIf = typename EnableIfT<Condition>::type;
template <typename> using MakeUnsigned = unsigned char;
template <typename> using MakeSigned = signed char;
template <int __v> struct integral_constant {
  static constexpr int value = __v;
};
template <bool __v> using __bool_constant = integral_constant<__v>;
template <bool, typename _Tp> using __enable_if_t = _Tp;
char *TargetName();
template <typename _Tp> struct __uniq_ptr_impl {
  template <typename _Up> struct _Ptr {
    using type = _Up *;
  };
  using pointer = typename _Ptr<_Tp>::type;
};
template <typename _Tp, typename = _Tp> class unique_ptr;
template <typename _Tp, typename _Dp> struct unique_ptr<_Tp[], _Dp> {
  template <typename _Up, typename _Del>
  unique_ptr(_Up, __enable_if_t<__bool_constant<false>::value, _Del>);
  typename __uniq_ptr_impl<_Tp>::pointer get();
  operator bool();
};
using AllocPtr = void *;
using FreePtr = void();
template <typename T> T AllocateAlignedItems(int, AllocPtr, void *);
struct AlignedFreer {
  AlignedFreer(FreePtr, void *);
};
template <typename T> using AlignedFreeUniquePtr = unique_ptr<T>;
AllocPtr AllocateAligned_alloc;
template <typename T>
AlignedFreeUniquePtr<T[]> AllocateAligned(int items, void *opaque) {
  FreePtr free;
  return AlignedFreeUniquePtr<T[]>(
      AllocateAlignedItems<T>(items, AllocateAligned_alloc, opaque),
      AlignedFreer(free, opaque));
}
template <typename T> AlignedFreeUniquePtr<T[]> AllocateAligned(int items) {
  return AllocateAligned<T>(items, nullptr);
}
template <typename> void MakeTypeInfo();
void AssertArrayEqual(void *, void *, char *, int);
#pragma riscv intrinsic "vector"
template <typename Lane, int, int kPow2> struct Simd {
  using T = Lane;
  constexpr int Pow2() { return kPow2; }
  template <typename> static constexpr int RebindPow2() { return kPow2; }
  template <typename NewT> using Rebind = Simd<NewT, 0, RebindPow2<NewT>()>;
};
template <typename T, int kPow2> struct ClampNAndPow2 {
  using type = Simd<T, 6, kPow2>;
};
template <typename T, int kPow2> struct ScalableTagChecker {
  using type = typename ClampNAndPow2<T, kPow2>::type;
};
template <typename T, int kPow2>
using ScalableTag = typename ScalableTagChecker<T, kPow2>::type;
template <class D> using TFromD = typename D::T;
template <class T, class D> using Rebind = typename D::Rebind<T>;
template <class D> using RebindToSigned = Rebind<MakeSigned<D>, D>;
template <class D> using RebindToUnsigned = Rebind<MakeUnsigned<D>, D>;
template <class> struct DFromV_t;
template <class V> using DFromV = typename DFromV_t<V>::type;
template <> struct DFromV_t<vint8mf8_t> {
  using Lane = int8_t;
  using type = ScalableTag<Lane, -3>;
};
template <> struct DFromV_t<vint8mf4_t> {
  using Lane = int8_t;
  using type = ScalableTag<Lane, -2>;
};
template <> struct DFromV_t<vint8mf2_t> {
  using Lane = int8_t;
  using type = ScalableTag<Lane, -1>;
};
template <> struct DFromV_t<vint8m1_t> {
  using Lane = int8_t;
  using type = ScalableTag<Lane, 0>;
};
template <> struct DFromV_t<vint8m2_t> {
  using Lane = int8_t;
  using type = ScalableTag<Lane, 1>;
};
template <> struct DFromV_t<vint8m4_t> {
  using Lane = int8_t;
  using type = ScalableTag<Lane, 2>;
};
template <> struct DFromV_t<vint8m8_t> {
  using Lane = int8_t;
  using type = ScalableTag<Lane, 3>;
};
template <int N> int Lanes(Simd<int8_t, N, -3>);
template <int N> int Lanes(Simd<int8_t, N, -2>);
template <int N> int Lanes(Simd<int8_t, N, -1>);
template <int N> int Lanes(Simd<int8_t, N, 0>);
template <int N> int Lanes(Simd<int8_t, N, 1>);
template <int N> int Lanes(Simd<int8_t, N, 2>);
template <int N> int Lanes(Simd<int8_t, N, 3>);
template <int N> vuint8mf8_t Set(Simd<uint8_t, N, -3>, uint8_t);
template <int N> vuint8mf4_t Set(Simd<uint8_t, N, -2>, uint8_t);
template <int N> vuint8mf2_t Set(Simd<uint8_t, N, -1>, uint8_t);
template <int N> vuint8m1_t Set(Simd<uint8_t, N, 0>, uint8_t);
template <int N> vuint8m2_t Set(Simd<uint8_t, N, 1>, uint8_t);
template <int N> vuint8m4_t Set(Simd<uint8_t, N, 2>, uint8_t);
template <int N> vuint8m8_t Set(Simd<uint8_t, N, 3>, uint8_t arg) {
  return __riscv_vmv_v_x_u8m8(arg, 0);
}
template <int N> vint8mf8_t Set(Simd<int8_t, N, -3>, int8_t);
template <int N> vint8mf4_t Set(Simd<int8_t, N, -2>, int8_t);
template <int N> vint8mf2_t Set(Simd<int8_t, N, -1>, int8_t);
template <int N> vint8m1_t Set(Simd<int8_t, N, 0>, int8_t);
template <int N> vint8m2_t Set(Simd<int8_t, N, 1>, int8_t);
template <int N> vint8m4_t Set(Simd<int8_t, N, 2>, int8_t);
template <int N> vint8m8_t Set(Simd<int8_t, N, 3>, int8_t);
template <class D> using VFromD = decltype(Set(D(), TFromD<D>()));
template <class D> VFromD<D> Zero(D d) {
  RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Set(du, 0));
}
template <typename T, int N>
vuint8mf8_t BitCastToByte(Simd<T, N, -3>, vuint8mf8_t);
template <typename T, int N>
vuint8mf4_t BitCastToByte(Simd<T, N, -2>, vuint8mf4_t);
template <typename T, int N>
vuint8mf2_t BitCastToByte(Simd<T, N, -1>, vuint8mf2_t);
template <typename T, int N>
vuint8m1_t BitCastToByte(Simd<T, N, 0>, vuint8m1_t);
template <typename T, int N>
vuint8m2_t BitCastToByte(Simd<T, N, 1>, vuint8m2_t);
template <typename T, int N>
vuint8m4_t BitCastToByte(Simd<T, N, 2>, vuint8m4_t);
template <typename T, int N>
vuint8m8_t BitCastToByte(Simd<T, N, 3>, vuint8m8_t v) {
  return v;
}
template <typename T, int N>
vuint8mf8_t BitCastToByte(Simd<T, N, -3>, vint8mf8_t);
template <int N> vint8mf8_t BitCastFromByte(Simd<int8_t, N, -3>, vuint8mf8_t);
template <typename T, int N>
vuint8mf4_t BitCastToByte(Simd<T, N, -2>, vint8mf4_t);
template <int N> vint8mf4_t BitCastFromByte(Simd<int8_t, N, -2>, vuint8mf4_t);
template <typename T, int N>
vuint8mf2_t BitCastToByte(Simd<T, N, -1>, vint8mf2_t);
template <int N> vint8mf2_t BitCastFromByte(Simd<int8_t, N, -1>, vuint8mf2_t);
template <typename T, int N> vuint8m1_t BitCastToByte(Simd<T, N, 0>, vint8m1_t);
template <int N> vint8m1_t BitCastFromByte(Simd<int8_t, N, 0>, vuint8m1_t);
template <typename T, int N> vuint8m2_t BitCastToByte(Simd<T, N, 1>, vint8m2_t);
template <int N> vint8m2_t BitCastFromByte(Simd<int8_t, N, 1>, vuint8m2_t);
template <typename T, int N> vuint8m4_t BitCastToByte(Simd<T, N, 2>, vint8m4_t);
template <int N> vint8m4_t BitCastFromByte(Simd<int8_t, N, 2>, vuint8m4_t);
template <typename T, int N> vuint8m8_t BitCastToByte(Simd<T, N, 3>, vint8m8_t);
template <int N> vint8m8_t BitCastFromByte(Simd<int8_t, N, 3>, vuint8m8_t v) {
  return __riscv_vreinterpret_v_u8m8_i8m8(v);
}
template <class D, class FromV> VFromD<D> BitCast(D d, FromV v) {
  return BitCastFromByte(d, BitCastToByte(d, v));
}
vint8mf8_t And(vint8mf8_t, vint8mf8_t);
vint8mf4_t And(vint8mf4_t, vint8mf4_t);
vint8mf2_t And(vint8mf2_t, vint8mf2_t);
vint8m1_t And(vint8m1_t, vint8m1_t);
vint8m2_t And(vint8m2_t, vint8m2_t);
vint8m4_t And(vint8m4_t, vint8m4_t);
vint8m8_t And(vint8m8_t, vint8m8_t);
vint8mf8_t Xor(vint8mf8_t, vint8mf8_t);
vint8mf4_t Xor(vint8mf4_t, vint8mf4_t);
vint8mf2_t Xor(vint8mf2_t, vint8mf2_t);
vint8m1_t Xor(vint8m1_t, vint8m1_t);
vint8m2_t Xor(vint8m2_t, vint8m2_t);
vint8m4_t Xor(vint8m4_t, vint8m4_t);
vint8m8_t Xor(vint8m8_t, vint8m8_t);
template <class V> V AndNot(V);
template <class V> V Xor3(V);
template <class V> V Neg(V);
template <int> vuint8mf8_t ShiftLeft(vuint8mf8_t);
template <int> vuint8mf4_t ShiftLeft(vuint8mf4_t);
template <int> vuint8mf2_t ShiftLeft(vuint8mf2_t);
template <int> vuint8m1_t ShiftLeft(vuint8m1_t);
template <int> vuint8m2_t ShiftLeft(vuint8m2_t);
template <int> vuint8m4_t ShiftLeft(vuint8m4_t);
template <int> vuint8m8_t ShiftLeft(vuint8m8_t);
vint8mf8_t MaskedSubOr(vint8mf8_t, vbool64_t, vint8mf8_t, vint8mf8_t);
vint8mf4_t MaskedSubOr(vint8mf4_t, vbool32_t, vint8mf4_t, vint8mf4_t);
vint8mf2_t MaskedSubOr(vint8mf2_t, vbool16_t, vint8mf2_t, vint8mf2_t);
vint8m1_t MaskedSubOr(vint8m1_t, vbool8_t, vint8m1_t, vint8m1_t);
vint8m2_t MaskedSubOr(vint8m2_t, vbool4_t, vint8m2_t, vint8m2_t);
vint8m4_t MaskedSubOr(vint8m4_t, vbool2_t, vint8m4_t, vint8m4_t);
vint8m8_t MaskedSubOr(vint8m8_t no, vbool1_t m, vint8m8_t a, vint8m8_t b) {
  return __riscv_vsub_vv_i8m8_mu(m, no, a, b, 0);
}
vbool64_t Lt(vint8mf8_t, vint8mf8_t);
vbool32_t Lt(vint8mf4_t, vint8mf4_t);
vbool16_t Lt(vint8mf2_t, vint8mf2_t);
vbool8_t Lt(vint8m1_t, vint8m1_t);
vbool4_t Lt(vint8m2_t, vint8m2_t);
vbool2_t Lt(vint8m4_t, vint8m4_t);
vbool1_t Lt(vint8m8_t a, vint8m8_t b) {
  return __riscv_vmslt_vv_i8m8_b1(a, b, 0);
}
template <class V> V BroadcastSignBit(V);
template <class V> V IfNegativeThenElse(V);
template <int N> void Store(vint8mf8_t, Simd<int8_t, N, -3>, int8_t *);
template <int N> void Store(vint8mf4_t, Simd<int8_t, N, -2>, int8_t *);
template <int N> void Store(vint8mf2_t, Simd<int8_t, N, -1>, int8_t *);
template <int N> void Store(vint8m1_t, Simd<int8_t, N, 0>, int8_t *);
template <int N> void Store(vint8m2_t, Simd<int8_t, N, 1>, int8_t *);
template <int N> void Store(vint8m4_t, Simd<int8_t, N, 2>, int8_t *);
template <int N> void Store(vint8m8_t, Simd<int8_t, N, 3>, int8_t *);
template <class D, class V, EnableIf<D().Pow2() <= 2> * = nullptr>
V InterleaveUpperBlocks(D, V, V) {}
template <class D, class V, EnableIf<(D().Pow2() > 2)> * = nullptr>
V InterleaveUpperBlocks(D, V, V);
template <typename T, int N, int kPow2>
constexpr bool IsGE128(Simd<T, N, kPow2>) {
  return kPow2 >= 0;
}
template <class D, class V, EnableIf<IsGE128(D())> * = nullptr>
V InterleaveLower(D, V, V);
template <class D, class V, EnableIf<!IsGE128(D())> * = nullptr>
V InterleaveLower(D, V, V);
template <class D, class V, EnableIf<IsGE128(D())> * = nullptr>
V InterleaveUpper(D d, V a, V b) {
  return InterleaveUpperBlocks(d, a, b);
}
template <class D, class V, EnableIf<!IsGE128(D())> * = nullptr>
V InterleaveUpper(D, V, V);
template <class D, typename T2> VFromD<D> Iota(D, T2);
template <class D> using Vec = decltype(Zero(D()));
template <class D> Vec<D> SignBit(D);
template <class V> V IfNegativeThenElseZero(V);
template <class V> V IfNegativeThenZeroElse(V);
template <class V> V BitwiseIfThenElse(V, V, V);
template <class V> inline V IfNegativeThenNegOrUndefIfZero(V mask, V v) {
  auto zero = Zero(DFromV<V>());
  return MaskedSubOr(v, Lt(mask, zero), zero, v);
}
template <class D> Vec<D> PositiveIota(D);
int AssertVecEqual_line;
template <class D, typename T = TFromD<D>>
inline void AssertVecEqual(D d, Vec<D> expected, Vec<D> actual, char *) {
  int N = Lanes(d);
  auto expected_lanes = AllocateAligned<T>(N),
       actual_lanes = AllocateAligned<T>(N);
  if (expected_lanes && actual_lanes)
    Abort("", "");
  Store(expected, d, expected_lanes.get());
  Store(actual, d, actual_lanes.get());
  MakeTypeInfo<T>();
  char *target_name = TargetName();
  AssertArrayEqual(expected_lanes.get(), actual_lanes.get(), target_name,
                   AssertVecEqual_line);
}
template <typename> constexpr int MinPow2() { return sizeof(int) ? -3 : 0; }
template <typename T, int kPow2, int kMaxPow2, int, class Test>
struct ForeachPow2 {
  static void Do(int min_lanes) {
    ScalableTag<T, kPow2> d;
    Lanes(d);
    Test()(T(), d);
    ForeachPow2<T, kPow2 + 1, kMaxPow2, kPow2 + 1 <= kMaxPow2, Test>::Do(
        min_lanes);
  }
};
template <typename T, int kPow2, int kMaxPow2, class Test>
struct ForeachPow2<T, kPow2, kMaxPow2, false, Test> {
  static void Do(int);
};
template <typename T, int kAddMin, int kSubMax, class Test>
using ForeachPow2Trim =
    ForeachPow2<T, MinPow2<T>(), 3, kAddMin <= kSubMax, Test>;
template <class Test, int kPow2> struct ForExtendableVectors {
  template <typename T> void operator()(T) {
    ForeachPow2Trim<T, 0, kPow2, Test>::Do(1);
  }
};
template <class Test> struct ForPartialVectors {
  template <typename T> void operator()(T t) {
    ForExtendableVectors<Test, 0>()(t);
  }
};
template <class Func> void ForSignedTypes(Func func) { func(int8_t()); }
struct TestIfNegative {
  template <class T, class D> void operator()(T, D d) {
    auto vp = Iota(d, 1), vsignbit = SignBit(d);
    RebindToSigned<decltype(d)> di;
    RebindToUnsigned<decltype(d)> du;
    BitCast(d, ShiftLeft<sizeof(TFromD<decltype(d)>)>(Iota(du, 1)));
    auto m1 = Xor3(BitCast(d, Set(du, {})));
    auto x1 = Xor(vp, BitCast(d, Set(d, {})));
    auto x2 = Xor(vp, BitCast(d, Set(d, {})));
    Xor(m1, vsignbit);
    auto m1_s = BitCast(d, BroadcastSignBit(BitCast(di, m1)));
    auto expected_2 = BitwiseIfThenElse(m1_s, x2, x1);
    AssertVecEqual(d, expected_2, IfNegativeThenElse(x2), "");
    auto expected_3 = And(m1_s, x1);
    auto expected_4 = AndNot(x2);
    AssertVecEqual(d, expected_3, IfNegativeThenElseZero(x1), "");
    AssertVecEqual(d, expected_3, IfNegativeThenZeroElse(x1), "");
    AssertVecEqual(d, expected_4, IfNegativeThenZeroElse(x2), "");
    AssertVecEqual(d, expected_4, IfNegativeThenElseZero(x2), "");
  }
};
void TestAllIfNegative() {
  ForSignedTypes(ForPartialVectors<TestIfNegative>());
}
template <class D>
void TestMoreThan1LaneIfNegativeThenNegOrUndefIfZero(D d, Vec<D> v1) {
  Vec<D> v2, v3 = InterleaveLower(d, v1, v1), v5 = InterleaveLower(d, v1, v2);
  if (Lanes(d) < 2)
    return;
  Vec<D> v4 = InterleaveUpper(d, v1, v1);
  Vec<D> v6 = InterleaveUpper(d, v1, v2);
  Vec<D> v7 = InterleaveLower(d, v2, v1);
  Vec<D> v8 = InterleaveUpper(d, v2, v1);
  AssertVecEqual(d, v3, IfNegativeThenNegOrUndefIfZero(v3, v3), "");
  AssertVecEqual(d, v4, IfNegativeThenNegOrUndefIfZero(v4, v4), "");
  AssertVecEqual(d, v4, IfNegativeThenNegOrUndefIfZero(v8, v8), "");
  AssertVecEqual(d, v6, IfNegativeThenNegOrUndefIfZero(v4, v6), "");
  AssertVecEqual(d, v7, IfNegativeThenNegOrUndefIfZero(v3, v7), "");
  AssertVecEqual(d, v8, IfNegativeThenNegOrUndefIfZero(v4, v8), "");
  Vec<D> zero = Zero(d);
  AssertVecEqual(d, zero, IfNegativeThenNegOrUndefIfZero(v3, zero), "");
  AssertVecEqual(d, zero, IfNegativeThenNegOrUndefIfZero(v4, zero), "");
  AssertVecEqual(d, zero, IfNegativeThenNegOrUndefIfZero(v5, zero), "");
  AssertVecEqual(d, zero, IfNegativeThenNegOrUndefIfZero(v6, zero), "");
  AssertVecEqual(d, zero, IfNegativeThenNegOrUndefIfZero(v7, zero), "");
  AssertVecEqual(d, zero, IfNegativeThenNegOrUndefIfZero(v8, zero), "");
}
struct TestIfNegativeThenNegOrUndefIfZero {
  template <typename T, class D> void operator()(T, D d) {
    auto v1 = PositiveIota(d), v2 = Neg(v1), zero = Zero(d), vmin = Set(d, 0),
         vmax = Set(d, 0);
    AssertVecEqual(d, v2, IfNegativeThenNegOrUndefIfZero(v1, v2), "");
    AssertVecEqual(d, v2, IfNegativeThenNegOrUndefIfZero(v2, v1), "");
    AssertVecEqual(d, v1, IfNegativeThenNegOrUndefIfZero(v2, v2), "");
    AssertVecEqual(d, zero, IfNegativeThenNegOrUndefIfZero(zero, zero), "");
    AssertVecEqual(d, zero, IfNegativeThenNegOrUndefIfZero(v1, zero), "");
    AssertVecEqual(d, zero, IfNegativeThenNegOrUndefIfZero(v2, zero), "");
    AssertVecEqual(d, v1, IfNegativeThenNegOrUndefIfZero(vmin, v2), "");
    AssertVecEqual(d, v1, IfNegativeThenNegOrUndefIfZero(vmax, v1), "");
    AssertVecEqual(d, v2, IfNegativeThenNegOrUndefIfZero(vmax, v2), "");
    TestMoreThan1LaneIfNegativeThenNegOrUndefIfZero(d, v1);
  }
};
void TestAllIfNegativeThenNegOrUndefIfZero() {
  ForSignedTypes(ForPartialVectors<TestIfNegativeThenNegOrUndefIfZero>());
}