diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 8baae4003fc..88e59426a04 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -770,7 +770,7 @@ tree aarch64_vector_load_decl (tree);
 rtx aarch64_gen_callee_cookie (aarch64_feature_flags, arm_pcs);
 void aarch64_expand_call (rtx, rtx, rtx, bool);
 bool aarch64_expand_cpymem_mops (rtx *, bool);
-bool aarch64_expand_cpymem (rtx *);
+bool aarch64_expand_cpymem (rtx *, bool);
 bool aarch64_expand_setmem (rtx *);
 bool aarch64_float_const_zero_rtx_p (rtx);
 bool aarch64_float_const_rtx_p (rtx);
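Illustration (not part of the patch): the new bool parameter selects memmove semantics in the shared inline expander, so small constant-size __builtin_memmove calls no longer have to go through MOPS or a libcall. A rough sketch of the intended effect, with the expected code shape shown as a comment; exact registers, offsets and scheduling depend on target and tuning.

/* A 12-byte memmove such as this ...  */
void
move12 (int *x, int *y)
{
  __builtin_memmove (x, y, 12);
}

/* ... should now expand inline, all loads first, then all stores, roughly:

	ldr	x2, [x1]	// load every source byte first
	ldr	w3, [x1, 8]
	str	x2, [x0]	// only then overwrite the destination
	str	w3, [x0, 8]
	ret
*/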
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 51673e9a847..190608d1381 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25428,52 +25428,40 @@ aarch64_progress_pointer (rtx pointer)
   return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
 }
 
-/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
-   MODE bytes.  */
+typedef auto_vec<std::pair<rtx, rtx>, 12> copy_ops;
+/* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
 static void
-aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
-                                              machine_mode mode)
+aarch64_copy_one_block (copy_ops &ops, rtx src, rtx dst,
+                        int offset, machine_mode mode)
 {
-  /* Handle 256-bit memcpy separately.  We do this by making 2 adjacent memory
-     address copies using V4SImode so that we can use Q registers.  */
-  if (known_eq (GET_MODE_BITSIZE (mode), 256))
+  /* Emit explicit load/store pair instructions for 32-byte copies.  */
+  if (known_eq (GET_MODE_SIZE (mode), 32))
     {
       mode = V4SImode;
+      rtx src1 = adjust_address (src, mode, offset);
+      rtx src2 = adjust_address (src, mode, offset + 16);
+      rtx dst1 = adjust_address (dst, mode, offset);
+      rtx dst2 = adjust_address (dst, mode, offset + 16);
       rtx reg1 = gen_reg_rtx (mode);
       rtx reg2 = gen_reg_rtx (mode);
-      /* "Cast" the pointers to the correct mode.  */
-      *src = adjust_address (*src, mode, 0);
-      *dst = adjust_address (*dst, mode, 0);
-      /* Emit the memcpy.  */
-      emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
-                                        aarch64_progress_pointer (*src)));
-      emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
-                                         aarch64_progress_pointer (*dst), reg2));
-      /* Move the pointers forward.  */
-      *src = aarch64_move_pointer (*src, 32);
-      *dst = aarch64_move_pointer (*dst, 32);
+      rtx load = aarch64_gen_load_pair (mode, reg1, src1, reg2, src2);
+      rtx store = aarch64_gen_store_pair (mode, dst1, reg1, dst2, reg2);
+      ops.safe_push ({ load, store });
      return;
    }
 
   rtx reg = gen_reg_rtx (mode);
-
-  /* "Cast" the pointers to the correct mode.  */
-  *src = adjust_address (*src, mode, 0);
-  *dst = adjust_address (*dst, mode, 0);
-  /* Emit the memcpy.  */
-  emit_move_insn (reg, *src);
-  emit_move_insn (*dst, reg);
-  /* Move the pointers forward.  */
-  *src = aarch64_progress_pointer (*src);
-  *dst = aarch64_progress_pointer (*dst);
+  rtx load = gen_move_insn (reg, adjust_address (src, mode, offset));
+  rtx store = gen_move_insn (adjust_address (dst, mode, offset), reg);
+  ops.safe_push ({ load, store });
 }
 
 /* Expand a cpymem/movmem using the MOPS extension.  OPERANDS are taken
    from the cpymem/movmem pattern.  IS_MEMMOVE is true if this is a memmove
    rather than memcpy.  Return true iff we succeeded.  */
 bool
-aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove = false)
+aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove)
 {
   if (!TARGET_MOPS)
     return false;
@@ -25492,55 +25480,51 @@ aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove = false)
   return true;
 }
 
-/* Expand cpymem, as if from a __builtin_memcpy.  Return true if
-   we succeed, otherwise return false, indicating that a libcall to
-   memcpy should be emitted.  */
-
+/* Expand cpymem/movmem, as if from a __builtin_memcpy/memmove.
+   OPERANDS are taken from the cpymem/movmem pattern.  IS_MEMMOVE is true
+   if this is a memmove rather than memcpy.  Return true if we succeed,
+   otherwise return false, indicating that a libcall should be emitted.  */
 bool
-aarch64_expand_cpymem (rtx *operands)
+aarch64_expand_cpymem (rtx *operands, bool is_memmove)
 {
-  int mode_bits;
+  int mode_bytes;
   rtx dst = operands[0];
   rtx src = operands[1];
   unsigned align = UINTVAL (operands[3]);
   rtx base;
-  machine_mode cur_mode = BLKmode;
-  bool size_p = optimize_function_for_size_p (cfun);
+  machine_mode cur_mode = BLKmode, next_mode;
 
   /* Variable-sized or strict-align copies may use the MOPS expansion.  */
   if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
-    return aarch64_expand_cpymem_mops (operands);
+    return aarch64_expand_cpymem_mops (operands, is_memmove);
 
   unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);
+  bool use_ldpq = TARGET_SIMD && !(aarch64_tune_params.extra_tuning_flags
+                                   & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS);
 
-  /* Try to inline up to 256 bytes.  */
-  unsigned max_copy_size = 256;
-  unsigned mops_threshold = aarch64_mops_memcpy_size_threshold;
+  /* Set inline limits for memmove/memcpy.  MOPS has a separate threshold.  */
+  unsigned max_copy_size = use_ldpq ? 256 : 128;
+  unsigned mops_threshold = is_memmove ? aarch64_mops_memmove_size_threshold
+                                       : aarch64_mops_memcpy_size_threshold;
+
+  /* Reduce the maximum size with -Os.  */
+  if (optimize_function_for_size_p (cfun))
+    max_copy_size /= 4;
 
   /* Large copies use MOPS when available or a library call.  */
   if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
-    return aarch64_expand_cpymem_mops (operands);
+    return aarch64_expand_cpymem_mops (operands, is_memmove);
 
-  int copy_bits = 256;
+  unsigned copy_max = 32;
 
-  /* Default to 256-bit LDP/STP on large copies, however small copies, no SIMD
-     support or slow 256-bit LDP/STP fall back to 128-bit chunks.
+  /* Default to 32-byte LDP/STP on large copies, however small copies, no SIMD
+     support or slow LDP/STP fall back to 16-byte chunks.
 
      ??? Although it would be possible to use LDP/STP Qn in streaming mode
      (so using TARGET_BASE_SIMD instead of TARGET_SIMD), it isn't clear
      whether that would improve performance.  */
-  if (size <= 24
-      || !TARGET_SIMD
-      || (aarch64_tune_params.extra_tuning_flags
-          & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
-    copy_bits = 128;
-
-  /* Emit an inline load+store sequence and count the number of operations
-     involved.  We use a simple count of just the loads and stores emitted
-     rather than rtx_insn count as all the pointer adjustments and reg copying
-     in this function will get optimized away later in the pipeline.  */
-  start_sequence ();
-  unsigned nops = 0;
+  if (size <= 24 || !use_ldpq)
+    copy_max = 16;
 
   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
@@ -25548,69 +25532,55 @@ aarch64_expand_cpymem (rtx *operands)
   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
   src = adjust_automodify_address (src, VOIDmode, base, 0);
 
-  /* Convert size to bits to make the rest of the code simpler.  */
-  int n = size * BITS_PER_UNIT;
+  copy_ops ops;
+  int offset = 0;
 
-  while (n > 0)
+  while (size > 0)
     {
       /* Find the largest mode in which to do the copy in without over reading
         or writing.  */
       opt_scalar_int_mode mode_iter;
       FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
-       if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_bits))
+       if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, copy_max))
          cur_mode = mode_iter.require ();
 
       gcc_assert (cur_mode != BLKmode);
 
-      mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
+      mode_bytes = GET_MODE_SIZE (cur_mode).to_constant ();
 
       /* Prefer Q-register accesses for the last bytes.  */
-      if (mode_bits == 128 && copy_bits == 256)
+      if (mode_bytes == 16 && copy_max == 32)
        cur_mode = V4SImode;
-
-      aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
-      /* A single block copy is 1 load + 1 store.  */
-      nops += 2;
-      n -= mode_bits;
+      aarch64_copy_one_block (ops, src, dst, offset, cur_mode);
+      size -= mode_bytes;
+      offset += mode_bytes;
 
       /* Emit trailing copies using overlapping unaligned accesses
-        (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
-      if (n > 0 && n < copy_bits / 2 && !STRICT_ALIGNMENT)
+        (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
+      if (size > 0 && size < copy_max / 2 && !STRICT_ALIGNMENT)
        {
-         machine_mode next_mode = smallest_mode_for_size (n, MODE_INT);
-         int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
-         gcc_assert (n_bits <= mode_bits);
-         src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
-         dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
-         n = n_bits;
+         next_mode = smallest_mode_for_size (size * BITS_PER_UNIT, MODE_INT);
+         int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
+         gcc_assert (n_bytes <= mode_bytes);
+         offset -= n_bytes - size;
+         size = n_bytes;
       }
    }
 
-  rtx_insn *seq = get_insns ();
-  end_sequence ();
-  /* MOPS sequence requires 3 instructions for the memory copying + 1 to move
-     the constant size into a register.  */
-  unsigned mops_cost = 3 + 1;
-
-  /* If MOPS is available at this point we don't consider the libcall as it's
-     not a win even on code size.  At this point only consider MOPS if
-     optimizing for size.  For speed optimizations we will have chosen between
-     the two based on copy size already.  */
-  if (TARGET_MOPS)
+  /* Memcpy interleaves loads with stores, memmove emits all loads first.  */
+  int nops = ops.length ();
+  int inc = is_memmove ? nops : nops == 4 ? 2 : 3;
+
+  for (int i = 0; i < nops; i += inc)
     {
-      if (size_p && mops_cost < nops)
-       return aarch64_expand_cpymem_mops (operands);
-      emit_insn (seq);
-      return true;
+      int m = MIN (nops, i + inc);
+      /* Emit loads.  */
+      for (int j = i; j < m; j++)
+       emit_insn (ops[j].first);
+      /* Emit stores.  */
+      for (int j = i; j < m; j++)
+       emit_insn (ops[j].second);
    }
-
-  /* A memcpy libcall in the worst case takes 3 instructions to prepare the
-     arguments + 1 for the call.  When MOPS is not available and we're
-     optimizing for size a libcall may be preferable.  */
-  unsigned libcall_cost = 4;
-  if (size_p && libcall_cost < nops)
-    return false;
-
-  emit_insn (seq);
   return true;
 }
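Illustrative sketch (not part of the patch): the loads-first schedule above is what makes the inline expansion safe for memmove. The standalone program below simulates the two schedules on an overlapping copy; the helper names, the 8-byte chunk size, and the 256-byte temporary (mirroring the 256-byte inline limit) are assumptions chosen for the demonstration, not GCC code.

#include <stdio.h>
#include <string.h>

/* Memcpy-style schedule: each chunk is stored right after it is loaded.  */
static void
copy_interleaved (char *dst, const char *src, size_t n)
{
  for (size_t off = 0; off < n; off += 8)
    {
      char tmp[8];
      memcpy (tmp, src + off, 8);       /* load one chunk */
      memcpy (dst + off, tmp, 8);       /* store it immediately */
    }
}

/* Memmove-style schedule: all loads happen before any store.  */
static void
copy_loads_first (char *dst, const char *src, size_t n)
{
  char tmp[256];                        /* the whole copy fits in temporaries */
  for (size_t off = 0; off < n; off += 8)
    memcpy (tmp + off, src + off, 8);   /* all loads */
  for (size_t off = 0; off < n; off += 8)
    memcpy (dst + off, tmp + off, 8);   /* all stores */
}

int
main (void)
{
  char a[24] = "abcdefghijklmnop";
  char b[24] = "abcdefghijklmnop";

  copy_interleaved (a + 4, a, 16);      /* clobbers bytes the second chunk still needs */
  copy_loads_first (b + 4, b, 16);      /* safe: everything was read first */

  printf ("%s\n%s\n", a + 4, b + 4);    /* only the second prints abcdefghijklmnop */
  return 0;
}

Note on the `inc` computation above: memmove uses a single group covering all operations, while memcpy still interleaves in groups of three load/store pairs (or two pairs when there are exactly four), trading some scheduling freedom for shorter live ranges.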
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index d70535e87be..228c98ab06d 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1768,7 +1768,7 @@
    (match_operand:DI 3 "immediate_operand")]
    ""
 {
-  if (aarch64_expand_cpymem (operands))
+  if (aarch64_expand_cpymem (operands, false))
     DONE;
   FAIL;
 }
@@ -1812,17 +1812,9 @@
    (match_operand:BLK 1 "memory_operand")
    (match_operand:DI 2 "general_operand")
    (match_operand:DI 3 "immediate_operand")]
-  "TARGET_MOPS"
+  ""
 {
-  rtx sz_reg = operands[2];
-  /* For constant-sized memmoves check the threshold.
-     FIXME: We should add a non-MOPS memmove expansion for smaller,
-     constant-sized memmove to avoid going to a libcall.  */
-  if (CONST_INT_P (sz_reg)
-      && INTVAL (sz_reg) < aarch64_mops_memmove_size_threshold)
-    FAIL;
-
-  if (aarch64_expand_cpymem_mops (operands, true))
+  if (aarch64_expand_cpymem (operands, true))
     DONE;
   FAIL;
 }
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index df84c662d24..218e6c86db4 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -345,7 +345,7 @@ Target Joined UInteger Var(aarch64_mops_memcpy_size_threshold) Init(256) Param
 Constant memcpy size in bytes above which to start using MOPS sequence.
 
 -param=aarch64-mops-memmove-size-threshold=
-Target Joined UInteger Var(aarch64_mops_memmove_size_threshold) Init(0) Param
+Target Joined UInteger Var(aarch64_mops_memmove_size_threshold) Init(256) Param
 Constant memmove size in bytes above which to start using MOPS sequence.
 
 -param=aarch64-mops-memset-size-threshold=
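Sketch of the resulting dispatch (not part of the patch; behaviour as implied by the expander logic above, assuming default parameters):

/* Constant size within the inline limit: expanded to inline loads/stores,
   even without MOPS (the movmemdi condition is now "" not "TARGET_MOPS").  */
void
move_small (char *d, char *s)
{
  __builtin_memmove (d, s, 64);
}

/* Variable size: a MOPS sequence when compiling for a target with +mops,
   otherwise a memmove libcall.  */
void
move_var (char *d, char *s, unsigned long n)
{
  __builtin_memmove (d, s, n);
}

/* With MOPS enabled, constant sizes above the new 256-byte default of
   -param=aarch64-mops-memmove-size-threshold= use MOPS instead.  */
void
move_big (char *d, char *s)
{
  __builtin_memmove (d, s, 1024);
}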
diff --git a/gcc/testsuite/gcc.target/aarch64/memmove.c b/gcc/testsuite/gcc.target/aarch64/memmove.c
new file mode 100644
index 00000000000..d2dd65b5190
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/memmove.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#pragma GCC target "+nomops"
+
+void
+copy1 (int *x, int *y)
+{
+  __builtin_memmove (x, y, 12);
+}
+
+void
+copy2 (int *x, int *y)
+{
+  __builtin_memmove (x, y, 128);
+}
+
+void
+copy3 (int *x, int *y)
+{
+  __builtin_memmove (x, y, 255);
+}
+
+/* { dg-final { scan-assembler-not {\tb\tmemmove} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/memmove2.c b/gcc/testsuite/gcc.target/aarch64/memmove2.c
new file mode 100644
index 00000000000..4c590a32214
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/memmove2.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mstrict-align" } */
+
+#pragma GCC target "+nomops"
+
+void
+copy1 (int *x, int *y)
+{
+  __builtin_memmove (x, y, 12);
+}
+
+void
+copy2 (int *x, int *y)
+{
+  __builtin_memmove (x, y, 128);
+}
+
+void
+copy3 (int *x, int *y)
+{
+  __builtin_memmove (x, y, 255);
+}
+
+/* { dg-final { scan-assembler-times {\tb\tmemmove} 3 } } */
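A hypothetical companion test (not part of the patch) could also verify the loads-first schedule at runtime on an overlapping move; the file name, helper, and contents below are an assumption sketched in the same dejagnu style as the tests above:

/* { dg-do run } */
/* { dg-options "-O2" } */

__attribute__ ((noinline)) void
move16 (char *d, char *s)
{
  /* Overlapping 16-byte move: exercises the inline expansion.  */
  __builtin_memmove (d, s, 16);
}

int
main (void)
{
  char buf[32] = "0123456789abcdefghij";
  move16 (buf + 4, buf);
  if (__builtin_memcmp (buf + 4, "0123456789abcdef", 16) != 0)
    __builtin_abort ();
  return 0;
}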