diff --git a/libatomic/config/linux/aarch64/atomic_16.S b/libatomic/config/linux/aarch64/atomic_16.S
index 05439ce394b..a099037179b 100644
--- a/libatomic/config/linux/aarch64/atomic_16.S
+++ b/libatomic/config/linux/aarch64/atomic_16.S
@@ -22,6 +22,22 @@
 <http://www.gnu.org/licenses/>. */
+/* AArch64 128-bit lock-free atomic implementation.
+
+ 128-bit atomics are now lock-free for all AArch64 architecture versions.
+ This is backwards compatible with existing binaries (as we swap all uses
+ of 128-bit atomics via an ifunc) and gives better performance than locking
+ atomics.
+
+ 128-bit atomic loads use an exclusive loop if LSE2 is not supported.
+ This results in an implicit store which is invisible to software as long
+ as the given address is writeable. Since all other atomics have explicit
+ writes, this will be true when using atomics in actual code.
+
+ The libat_<op>_16 entry points are ARMv8.0.
+ The libat_<op>_16_i1 entry points are used when LSE2 is available. */
+
+
.arch armv8-a+lse
#define ENTRY(name) \
@@ -37,6 +53,10 @@ name: \
.cfi_endproc; \
.size name, .-name;
+#define ALIAS(alias,name) \
+ .global alias; \
+ .set alias, name;
+
#define res0 x0
#define res1 x1
#define in0 x2
@@ -70,6 +90,24 @@ name: \
#define SEQ_CST 5
+ENTRY (libat_load_16)
+ mov x5, x0
+ cbnz w1, 2f
+
+ /* RELAXED. */
+1: ldxp res0, res1, [x5]
+ stxp w4, res0, res1, [x5]
+ cbnz w4, 1b
+ ret
+
+ /* ACQUIRE/CONSUME/SEQ_CST. */
+2: ldaxp res0, res1, [x5]
+ stxp w4, res0, res1, [x5]
+ cbnz w4, 2b
+ ret
+END (libat_load_16)
+
+
ENTRY (libat_load_16_i1)
cbnz w1, 1f
@@ -93,6 +131,23 @@ ENTRY (libat_load_16_i1)
END (libat_load_16_i1)
+ENTRY (libat_store_16)
+ cbnz w4, 2f
+
+ /* RELAXED. */
+1: ldxp xzr, tmp0, [x0]
+ stxp w4, in0, in1, [x0]
+ cbnz w4, 1b
+ ret
+
+ /* RELEASE/SEQ_CST. */
+2: ldxp xzr, tmp0, [x0]
+ stlxp w4, in0, in1, [x0]
+ cbnz w4, 2b
+ ret
+END (libat_store_16)
+
+
ENTRY (libat_store_16_i1)
cbnz w4, 1f
@@ -101,14 +156,14 @@ ENTRY (libat_store_16_i1)
ret
/* RELEASE/SEQ_CST. */
-1: ldaxp xzr, tmp0, [x0]
+1: ldxp xzr, tmp0, [x0]
stlxp w4, in0, in1, [x0]
cbnz w4, 1b
ret
END (libat_store_16_i1)
-ENTRY (libat_exchange_16_i1)
+ENTRY (libat_exchange_16)
mov x5, x0
cbnz w4, 2f
@@ -126,22 +181,60 @@ ENTRY (libat_exchange_16_i1)
stxp w4, in0, in1, [x5]
cbnz w4, 3b
ret
-4:
+
+ /* RELEASE/ACQ_REL/SEQ_CST. */
+4: ldaxp res0, res1, [x5]
+ stlxp w4, in0, in1, [x5]
+ cbnz w4, 4b
+ ret
+END (libat_exchange_16)
+
+
+ENTRY (libat_compare_exchange_16)
+ ldp exp0, exp1, [x1]
+ cbz w4, 3f
cmp w4, RELEASE
- b.ne 6f
+ b.hs 5f
- /* RELEASE. */
-5: ldxp res0, res1, [x5]
- stlxp w4, in0, in1, [x5]
+ /* ACQUIRE/CONSUME. */
+1: ldaxp tmp0, tmp1, [x0]
+ cmp tmp0, exp0
+ ccmp tmp1, exp1, 0, eq
+ csel tmp0, in0, tmp0, eq
+ csel tmp1, in1, tmp1, eq
+ stxp w4, tmp0, tmp1, [x0]
+ cbnz w4, 1b
+ beq 2f
+ stp tmp0, tmp1, [x1]
+2: cset x0, eq
+ ret
+
+ /* RELAXED. */
+3: ldxp tmp0, tmp1, [x0]
+ cmp tmp0, exp0
+ ccmp tmp1, exp1, 0, eq
+ csel tmp0, in0, tmp0, eq
+ csel tmp1, in1, tmp1, eq
+ stxp w4, tmp0, tmp1, [x0]
+ cbnz w4, 3b
+ beq 4f
+ stp tmp0, tmp1, [x1]
+4: cset x0, eq
+ ret
+
+ /* RELEASE/ACQ_REL/SEQ_CST. */
+5: ldaxp tmp0, tmp1, [x0]
+ cmp tmp0, exp0
+ ccmp tmp1, exp1, 0, eq
+ csel tmp0, in0, tmp0, eq
+ csel tmp1, in1, tmp1, eq
+ stlxp w4, tmp0, tmp1, [x0]
cbnz w4, 5b
+ beq 6f
+ stp tmp0, tmp1, [x1]
+6: cset x0, eq
ret
-
- /* ACQ_REL/SEQ_CST. */
-6: ldaxp res0, res1, [x5]
- stlxp w4, in0, in1, [x5]
- cbnz w4, 6b
- ret
-END (libat_exchange_16_i1)
+END (libat_compare_exchange_16)
ENTRY (libat_compare_exchange_16_i1)
@@ -180,7 +273,7 @@ ENTRY (libat_compare_exchange_16_i1)
END (libat_compare_exchange_16_i1)
-ENTRY (libat_fetch_add_16_i1)
+ENTRY (libat_fetch_add_16)
mov x5, x0
cbnz w4, 2f
@@ -199,10 +292,10 @@ ENTRY (libat_fetch_add_16_i1)
stlxp w4, tmp0, tmp1, [x5]
cbnz w4, 2b
ret
-END (libat_fetch_add_16_i1)
+END (libat_fetch_add_16)
-ENTRY (libat_add_fetch_16_i1)
+ENTRY (libat_add_fetch_16)
mov x5, x0
cbnz w4, 2f
@@ -221,10 +314,10 @@ ENTRY (libat_add_fetch_16_i1)
stlxp w4, res0, res1, [x5]
cbnz w4, 2b
ret
-END (libat_add_fetch_16_i1)
+END (libat_add_fetch_16)
-ENTRY (libat_fetch_sub_16_i1)
+ENTRY (libat_fetch_sub_16)
mov x5, x0
cbnz w4, 2f
@@ -243,10 +336,10 @@ ENTRY (libat_fetch_sub_16_i1)
stlxp w4, tmp0, tmp1, [x5]
cbnz w4, 2b
ret
-END (libat_fetch_sub_16_i1)
+END (libat_fetch_sub_16)
-ENTRY (libat_sub_fetch_16_i1)
+ENTRY (libat_sub_fetch_16)
mov x5, x0
cbnz w4, 2f
@@ -265,10 +358,10 @@ ENTRY (libat_sub_fetch_16_i1)
stlxp w4, res0, res1, [x5]
cbnz w4, 2b
ret
-END (libat_sub_fetch_16_i1)
+END (libat_sub_fetch_16)
-ENTRY (libat_fetch_or_16_i1)
+ENTRY (libat_fetch_or_16)
mov x5, x0
cbnz w4, 2f
@@ -287,10 +380,10 @@ ENTRY (libat_fetch_or_16_i1)
stlxp w4, tmp0, tmp1, [x5]
cbnz w4, 2b
ret
-END (libat_fetch_or_16_i1)
+END (libat_fetch_or_16)
-ENTRY (libat_or_fetch_16_i1)
+ENTRY (libat_or_fetch_16)
mov x5, x0
cbnz w4, 2f
@@ -309,10 +402,10 @@ ENTRY (libat_or_fetch_16_i1)
stlxp w4, res0, res1, [x5]
cbnz w4, 2b
ret
-END (libat_or_fetch_16_i1)
+END (libat_or_fetch_16)
-ENTRY (libat_fetch_and_16_i1)
+ENTRY (libat_fetch_and_16)
mov x5, x0
cbnz w4, 2f
@@ -331,10 +424,10 @@ ENTRY (libat_fetch_and_16_i1)
stlxp w4, tmp0, tmp1, [x5]
cbnz w4, 2b
ret
-END (libat_fetch_and_16_i1)
+END (libat_fetch_and_16)
-ENTRY (libat_and_fetch_16_i1)
+ENTRY (libat_and_fetch_16)
mov x5, x0
cbnz w4, 2f
@@ -353,10 +446,10 @@ ENTRY (libat_and_fetch_16_i1)
stlxp w4, res0, res1, [x5]
cbnz w4, 2b
ret
-END (libat_and_fetch_16_i1)
+END (libat_and_fetch_16)
-ENTRY (libat_fetch_xor_16_i1)
+ENTRY (libat_fetch_xor_16)
mov x5, x0
cbnz w4, 2f
@@ -375,10 +468,10 @@ ENTRY (libat_fetch_xor_16_i1)
stlxp w4, tmp0, tmp1, [x5]
cbnz w4, 2b
ret
-END (libat_fetch_xor_16_i1)
+END (libat_fetch_xor_16)
-ENTRY (libat_xor_fetch_16_i1)
+ENTRY (libat_xor_fetch_16)
mov x5, x0
cbnz w4, 2f
@@ -397,10 +490,10 @@ ENTRY (libat_xor_fetch_16_i1)
stlxp w4, res0, res1, [x5]
cbnz w4, 2b
ret
-END (libat_xor_fetch_16_i1)
+END (libat_xor_fetch_16)
-ENTRY (libat_fetch_nand_16_i1)
+ENTRY (libat_fetch_nand_16)
mov x5, x0
mvn in0, in0
mvn in1, in1
@@ -421,10 +514,10 @@ ENTRY (libat_fetch_nand_16_i1)
stlxp w4, tmp0, tmp1, [x5]
cbnz w4, 2b
ret
-END (libat_fetch_nand_16_i1)
+END (libat_fetch_nand_16)
-ENTRY (libat_nand_fetch_16_i1)
+ENTRY (libat_nand_fetch_16)
mov x5, x0
mvn in0, in0
mvn in1, in1
@@ -445,21 +538,38 @@ ENTRY (libat_nand_fetch_16_i1)
stlxp w4, res0, res1, [x5]
cbnz w4, 2b
ret
-END (libat_nand_fetch_16_i1)
+END (libat_nand_fetch_16)
-ENTRY (libat_test_and_set_16_i1)
- mov w2, 1
- cbnz w1, 2f
+/* __atomic_test_and_set is always inlined, so this entry is unused and
+ only required for completeness. */
+ENTRY (libat_test_and_set_16)
- /* RELAXED. */
- swpb w0, w2, [x0]
+ /* RELAXED/ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST. */
+ mov x5, x0
+1: ldaxrb w0, [x5]
+ stlxrb w4, w2, [x5]
+ cbnz w4, 1b
ret
+END (libat_test_and_set_16)
- /* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST. */
-2: swpalb w0, w2, [x0]
- ret
-END (libat_test_and_set_16_i1)
+
+/* Alias entry points which are the same in baseline and LSE2. */
+
+ALIAS (libat_exchange_16_i1, libat_exchange_16)
+ALIAS (libat_fetch_add_16_i1, libat_fetch_add_16)
+ALIAS (libat_add_fetch_16_i1, libat_add_fetch_16)
+ALIAS (libat_fetch_sub_16_i1, libat_fetch_sub_16)
+ALIAS (libat_sub_fetch_16_i1, libat_sub_fetch_16)
+ALIAS (libat_fetch_or_16_i1, libat_fetch_or_16)
+ALIAS (libat_or_fetch_16_i1, libat_or_fetch_16)
+ALIAS (libat_fetch_and_16_i1, libat_fetch_and_16)
+ALIAS (libat_and_fetch_16_i1, libat_and_fetch_16)
+ALIAS (libat_fetch_xor_16_i1, libat_fetch_xor_16)
+ALIAS (libat_xor_fetch_16_i1, libat_xor_fetch_16)
+ALIAS (libat_fetch_nand_16_i1, libat_fetch_nand_16)
+ALIAS (libat_nand_fetch_16_i1, libat_nand_fetch_16)
+ALIAS (libat_test_and_set_16_i1, libat_test_and_set_16)
/* GNU_PROPERTY_AARCH64_* macros from elf.h for use in asm code. */
diff --git a/libatomic/config/linux/aarch64/host-config.h b/libatomic/config/linux/aarch64/host-config.h
index 9747accd88f..ac4d922ca5c 100644
--- a/libatomic/config/linux/aarch64/host-config.h
+++ b/libatomic/config/linux/aarch64/host-config.h
@@ -35,12 +35,13 @@
#endif
#define IFUNC_NCOND(N) (1)
-#if N == 16 && IFUNC_ALT != 0
+#endif /* HAVE_IFUNC */
+
+/* All 128-bit atomic functions are defined in aarch64/atomic_16.S. */
+#if N == 16
# define DONE 1
#endif
-#endif /* HAVE_IFUNC */
-
#ifdef HWCAP_USCAT
#define MIDR_IMPLEMENTOR(midr) (((midr) >> 24) & 255)