Simplify and speed up string-to-multibyte

* src/character.h (str_to_multibyte):
* src/character.c (str_to_multibyte): Change signature and simplify;
the conversion is no longer done in-place.
* src/fns.c (string_to_multibyte): Drop temporary buffer and memcpy;
adapt to new str_to_multibyte signature.
* src/print.c (print_string): Drop memcpy; adapt call to str_to_multibyte.
* test/src/fns-tests.el (fns--string-to-unibyte): Rename to...
(fns--string-to-unibyte-multibyte): ... this and strengthen, so that
the test covers string-to-multibyte reasonably well.
This commit is contained in:
Mattias Engdegård 2022-07-11 10:34:40 +02:00
parent 9684687793
commit 69b68099ec
5 changed files with 37 additions and 49 deletions

View file

@ -666,35 +666,26 @@ count_size_as_multibyte (const unsigned char *str, ptrdiff_t len)
}
/* Convert unibyte text at STR of BYTES bytes to a multibyte text
that contains the same single-byte characters. It actually
converts all 8-bit characters to multibyte forms. It is assured
that we can use LEN bytes at STR as a work area and that is
enough. */
ptrdiff_t
str_to_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t bytes)
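/* Convert the unibyte text at SRC, which is NCHARS bytes long, to
multibyte text at DST, which is NBYTES bytes long, containing the
same single-byte characters.  Each byte above 0x7f is written as a
two-byte sequence, so NBYTES must be the exact converted size.  */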
void
str_to_multibyte (unsigned char *dst, const unsigned char *src,
ptrdiff_t nchars, ptrdiff_t nbytes)
{
unsigned char *p = str, *endp = str + bytes;
unsigned char *to;
while (p < endp && *p < 0x80) p++;
if (p == endp)
return bytes;
to = p;
bytes = endp - p;
endp = str + len;
memmove (endp - bytes, p, bytes);
p = endp - bytes;
while (p < endp)
const unsigned char *s = src + nchars;
unsigned char *d = dst + nbytes;
for (ptrdiff_t i = 0; i < nchars; i++)
{
int c = *p++;
if (c >= 0x80)
c = BYTE8_TO_CHAR (c);
to += CHAR_STRING (c, to);
unsigned char c = *--s;
if (c <= 0x7f)
*--d = c;
else
{
*--d = 0x80 + (c & 0x3f);
*--d = 0xc0 + ((c >> 6) & 1);
}
}
return (to - str);
eassert (d == dst && s == src);
}
/* Arrange multibyte text at STR of LEN bytes as a unibyte text. It

View file

@ -567,7 +567,8 @@ extern int translate_char (Lisp_Object, int c);
extern ptrdiff_t count_size_as_multibyte (const unsigned char *, ptrdiff_t);
extern ptrdiff_t str_as_multibyte (unsigned char *, ptrdiff_t, ptrdiff_t,
ptrdiff_t *);
extern ptrdiff_t str_to_multibyte (unsigned char *, ptrdiff_t, ptrdiff_t);
extern void str_to_multibyte (unsigned char *dst, const unsigned char *src,
ptrdiff_t nchars, ptrdiff_t nbytes);
extern ptrdiff_t str_as_unibyte (unsigned char *, ptrdiff_t);
extern ptrdiff_t strwidth (const char *, ptrdiff_t);
extern ptrdiff_t c_string_width (const unsigned char *, ptrdiff_t, int,

View file

@ -1237,33 +1237,24 @@ string_make_multibyte (Lisp_Object string)
/* Convert STRING (if unibyte) to a multibyte string without changing
the number of characters. Characters 0200 through 0237 are
converted to eight-bit characters. */
the number of characters. Characters 0x80..0xff are interpreted as
raw bytes. */
Lisp_Object
string_to_multibyte (Lisp_Object string)
{
unsigned char *buf;
ptrdiff_t nbytes;
Lisp_Object ret;
USE_SAFE_ALLOCA;
if (STRING_MULTIBYTE (string))
return string;
nbytes = count_size_as_multibyte (SDATA (string), SBYTES (string));
ptrdiff_t nchars = SCHARS (string);
ptrdiff_t nbytes = count_size_as_multibyte (SDATA (string), nchars);
/* If all the chars are ASCII, they won't need any more bytes once
converted. */
if (nbytes == SBYTES (string))
if (nbytes == nchars)
return make_multibyte_string (SSDATA (string), nbytes, nbytes);
buf = SAFE_ALLOCA (nbytes);
memcpy (buf, SDATA (string), SBYTES (string));
str_to_multibyte (buf, nbytes, SBYTES (string));
ret = make_multibyte_string ((char *) buf, SCHARS (string), nbytes);
SAFE_FREE ();
Lisp_Object ret = make_uninit_multibyte_string (nchars, nbytes);
str_to_multibyte (SDATA (ret), SDATA (string), nchars, nbytes);
return ret;
}

View file

@ -467,8 +467,7 @@ print_string (Lisp_Object string, Lisp_Object printcharfun)
if (chars < bytes)
{
newstr = make_uninit_multibyte_string (chars, bytes);
memcpy (SDATA (newstr), SDATA (string), chars);
str_to_multibyte (SDATA (newstr), bytes, chars);
str_to_multibyte (SDATA (newstr), SDATA (string), chars, bytes);
string = newstr;
}
}

View file

@ -1344,18 +1344,24 @@
(should (equal (plist-member plist (copy-sequence "a") #'equal)
'("a" "c")))))
(ert-deftest fns--string-to-unibyte ()
(dolist (str '("" "a" "abc" "a\x00\x7fz" "a\xaa\xbbz ""\x80\xdd\xff"))
(ert-deftest fns--string-to-unibyte-multibyte ()
(dolist (str (list "" "a" "abc" "a\x00\x7fz" "a\xaa\xbbz" "\x80\xdd\xff"
(apply #'unibyte-string (number-sequence 0 255))))
(ert-info ((prin1-to-string str) :prefix "str: ")
(should-not (multibyte-string-p str))
(let* ((u (string-to-unibyte str)) ; should be identity
(m (string-to-multibyte u)) ; lossless conversion
(uu (string-to-unibyte m))) ; also lossless
(mm (string-to-multibyte m)) ; should be identity
(uu (string-to-unibyte m)) ; also lossless
(ml (mapcar (lambda (c) (if (<= c #x7f) c (+ c #x3fff00))) u)))
(should-not (multibyte-string-p u))
(should (multibyte-string-p m))
(should (multibyte-string-p mm))
(should-not (multibyte-string-p uu))
(should (equal str u))
(should (equal str uu)))))
(should (equal m mm))
(should (equal str uu))
(should (equal (append m nil) ml)))))
(should-error (string-to-unibyte "å"))
(should-error (string-to-unibyte "ABC∀BC")))