Speed up string-to-unibyte

* src/character.h (str_to_unibyte):
* src/character.c (str_to_unibyte): Remove.
* src/fns.c (Fstring_to_unibyte): Ditch the call to str_to_unibyte and
the unnecessary heap allocation.  Write new, faster code.
* test/src/fns-tests.el (fns--string-to-unibyte): New test.
2022-07-10 18:02:08 +02:00 · 2022-07-10 18:02:08 +02:00 · cfda663282
commit cfda663282
parent 4bab499ed0
4 changed files with 30 additions and 37 deletions
--- a/src/character.c
+++ b/src/character.c
@@ -734,31 +734,6 @@ str_as_unibyte (unsigned char *str, ptrdiff_t bytes)
  return (to - str);
 }

-/* Convert eight-bit chars in SRC (in multibyte form) to the
-   corresponding byte and store in DST.  CHARS is the number of
-   characters in SRC.  The value is the number of bytes stored in DST.
-   Usually, the value is the same as CHARS, but is less than it if SRC
-   contains a non-ASCII, non-eight-bit character.  */
-
-ptrdiff_t
-str_to_unibyte (const unsigned char *src, unsigned char *dst, ptrdiff_t chars)
-{
-  ptrdiff_t i;
-
-  for (i = 0; i < chars; i++)
-    {
-      int c = string_char_advance (&src);
-
-      if (CHAR_BYTE8_P (c))
-	c = CHAR_TO_BYTE8 (c);
-      else if (! ASCII_CHAR_P (c))
-	return i;
-      *dst++ = c;
-    }
-  return i;
-}
-
-
 static ptrdiff_t
 string_count_byte8 (Lisp_Object string)
 {
--- a/src/character.h
+++ b/src/character.h
@@ -569,8 +569,6 @@ extern ptrdiff_t str_as_multibyte (unsigned char *, ptrdiff_t, ptrdiff_t,
 				   ptrdiff_t *);
 extern ptrdiff_t str_to_multibyte (unsigned char *, ptrdiff_t, ptrdiff_t);
 extern ptrdiff_t str_as_unibyte (unsigned char *, ptrdiff_t);
-extern ptrdiff_t str_to_unibyte (const unsigned char *, unsigned char *,
-                                 ptrdiff_t);
 extern ptrdiff_t strwidth (const char *, ptrdiff_t);
 extern ptrdiff_t c_string_width (const unsigned char *, ptrdiff_t, int,
 				 ptrdiff_t *, ptrdiff_t *);
--- a/src/fns.c
+++ b/src/fns.c
@@ -1413,19 +1413,24 @@ an error is signaled.  */)
  (Lisp_Object string)
 {
  CHECK_STRING (string);
+  if (!STRING_MULTIBYTE (string))
+    return string;

-  if (STRING_MULTIBYTE (string))
+  ptrdiff_t chars = SCHARS (string);
+  Lisp_Object ret = make_uninit_string (chars);
+  unsigned char *src = SDATA (string);
+  unsigned char *dst = SDATA (ret);
+  for (ptrdiff_t i = 0; i < chars; i++)
    {
-      ptrdiff_t chars = SCHARS (string);
-      unsigned char *str = xmalloc (chars);
-      ptrdiff_t converted = str_to_unibyte (SDATA (string), str, chars);
-
-      if (converted < chars)
-	error ("Can't convert the %"pD"dth character to unibyte", converted);
-      string = make_unibyte_string ((char *) str, chars);
-      xfree (str);
+      unsigned char b = *src++;
+      if (b <= 0x7f)
+	*dst++ = b;					 /* ASCII */
+      else if (CHAR_BYTE8_HEAD_P (b))
+	*dst++ = 0x80 | (b & 1) << 6 | (*src++ & 0x3f);	 /* raw byte */
+      else
+	error ("Cannot convert character at index %"pD"d to unibyte", i);
    }
-  return string;
+  return ret;
 }


--- a/test/src/fns-tests.el
+++ b/test/src/fns-tests.el
@@ -1344,4 +1344,19 @@
    (should (equal (plist-member plist (copy-sequence "a") #'equal)
                   '("a" "c")))))

+(ert-deftest fns--string-to-unibyte ()
+  (dolist (str '("" "a" "abc" "a\x00\x7fz" "a\xaa\xbbz ""\x80\xdd\xff"))
+    (ert-info ((prin1-to-string str) :prefix "str: ")
+      (should-not (multibyte-string-p str))
+      (let* ((u (string-to-unibyte str))   ; should be identity
+             (m (string-to-multibyte u))   ; lossless conversion
+             (uu (string-to-unibyte m)))   ; also lossless
+        (should-not (multibyte-string-p u))
+        (should (multibyte-string-p m))
+        (should-not (multibyte-string-p uu))
+        (should (equal str u))
+        (should (equal str uu)))))
+  (should-error (string-to-unibyte "å"))
+  (should-error (string-to-unibyte "ABC∀BC")))
+
 ;;; fns-tests.el ends here