Add Unicode-safe UTF-8 converter

* src/coding.c (encode_string_utf_8, decode_string_utf_8): New functions.
* src/coding.h (encode_string_utf_8, decode_string_utf_8): Extern them.
This commit is contained in:
K. Handa 2019-08-04 21:14:26 +09:00
parent 5ec3f70527
commit a8026dfde9
2 changed files with 734 additions and 0 deletions

View file

@ -9515,6 +9515,732 @@ code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
}
/* Return the gap address of BUFFER. If the gap size is less than
NBYTES, enlarge the gap in advance. */
static unsigned char *
get_buffer_gap_address (Lisp_Object buffer, int nbytes)
{
struct buffer *buf = XBUFFER (buffer);
if (BUF_GPT (buf) != BUF_PT (buf))
{
struct buffer *oldb = current_buffer;
current_buffer = buf;
move_gap_both (PT, PT_BYTE);
current_buffer = oldb;
}
if (BUF_GAP_SIZE (buf) < nbytes)
make_gap_1 (buf, nbytes);
return BUF_GPT_ADDR (buf);
}
/* Return a pointer to the byte sequence for C, and set the length in
LEN. This function is used to get a byte sequence for HANDLE_8_BIT
and HANDLE_OVER_UNI arguments of encode_string_utf_8 and
decode_string_utf_8 when those arguments are given by
characters. */
static unsigned char *
get_char_bytes (int c, int *len)
{
/* We uses two chaches considering the situation that
encode/decode_string_utf_8 are called repeatedly with the same
values for HANDLE_8_BIT and HANDLE_OVER_UNI arguments. */
static int chars[2];
static unsigned char bytes[2][6];
static int nbytes[2];
static int last_index;
if (chars[last_index] == c)
{
*len = nbytes[last_index];
return bytes[last_index];
}
if (chars[1 - last_index] == c)
{
*len = nbytes[1 - last_index];
return bytes[1 - last_index];
}
last_index = 1 - last_index;
chars[last_index] = c;
*len = nbytes[last_index] = CHAR_STRING (c, bytes[last_index]);
return bytes[last_index];
}
/* Encode STRING by the coding system utf-8-unix.
Even if :pre-write-conversion and :encode-translation-table
properties are put to that coding system, they are ignored.
It ignores :pre-write-conversion and :encode-translation-table
propeties of that coding system.
This function assumes that arguments have values as described
below. The validity must be assured by callers.
STRING is a multibyte string or an ASCII-only unibyte string.
BUFFER is a unibyte buffer or Qnil.
If BUFFER is a unibyte buffer, the encoding result of UTF-8
sequence is inserted after point of the buffer, and the number of
inserted characters is returned. Note that a caller should have
made BUFFER ready for modifying in advance (e.g. by calling
invalidate_buffer_caches).
If BUFFER is Qnil, a unibyte string is made from the encodnig
result of UTF-8 sequence, and it is returned. If NOCOPY and STRING
contains only Unicode characters (i.e. the encoding does not change
the byte sequence), STRING is returned even if it is multibyte.
HANDLE-8-BIT and HANDE-OVER-UNI specify how to handle a non-Unicode
character. The former is for an eight-bit character (represented
by 2-byte overlong sequence in multibyte STRING). The latter is
for an over-unicode character (a character whose code is greater
than the maximum Unicode character 0x10FFFF, and is represented by
4 or 5-byte sequence in multibyte STRING).
If they are unibyte strings (typically "\357\277\275"; UTF-8
sequence for the Unicode REPLACEMENT CHARACTER #xFFFD), a
non-Unicode character is encoded into that sequence.
If they are characters, a non-Unicode chracters is encoded into the
corresponding UTF-8 sequences.
If they are Qignored, a non-Unicode character is skipped on
encoding.
If HANDLE-8-BIT is Qt, an eight-bit character is encoded into one
byte of the same value.
If HANDLE-OVER-UNI is Qt, an over-unicode character is encoded
into the the same 4 or 5-byte sequence.
If they are Qnil, Qnil is returned if STRING has a non-Unicode
character. */
Lisp_Object
encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
bool nocopy, Lisp_Object handle_8_bit,
Lisp_Object handle_over_uni)
{
ptrdiff_t nchars = SCHARS (string), nbytes = SBYTES (string);
if (NILP (buffer) && nchars == nbytes)
/* STRING contains only ASCII characters. */
return string;
ptrdiff_t num_8_bit = 0; /* number of eight-bit chars in STRING */
/* The following two vars are counted only if handle_over_uni is not Qt */
ptrdiff_t num_over_4 = 0; /* number of 4-byte non-Unicode chars in STRING */
ptrdiff_t num_over_5 = 0; /* number of 5-byte non-Unicode chars in STRING */
ptrdiff_t outbytes; /* number of bytes of decoding result. */
unsigned char *p = SDATA (string);
unsigned char *pend = p + nbytes;
unsigned char *src = NULL, *dst = NULL;
unsigned char *replace_8_bit = NULL, *replace_over_uni = NULL;
int replace_8_bit_len = 0, replace_over_uni_len = 0;
Lisp_Object val; /* the return value */
/* Scan bytes in STRING twice. The first scan is to count non-Unicode
characters, and the second scan is to encode STRING. If the
encoding is trivial (no need of changing the byte sequence),
the second scan is avoided. */
for (int scan_count = 0; scan_count < 2; scan_count++)
{
while (p < pend)
{
if (nchars == pend - p)
/* There is no multibyte character remaining. */
break;
int c = *p;
int len = BYTES_BY_CHAR_HEAD (c);
nchars--;
if (len == 1
|| len == 3
|| (len == 2 ? ! CHAR_BYTE8_HEAD_P (c)
: (EQ (handle_over_uni, Qt)
|| (len == 4
&& string_char (p, NULL, NULL) <= MAX_UNICODE_CHAR))))
{
p += len;
continue;
}
/* A character to change the byte sequence on encoding was
found. A rare case. */
if (len == 2)
{
/* Handle an eight-bit character by handle_8_bit. */
if (scan_count == 0)
{
if (NILP (handle_8_bit))
return Qnil;
num_8_bit++;
}
else
{
if (src < p)
{
memcpy (dst, src, p - src);
dst += p - src;
}
if (replace_8_bit_len > 0)
{
memcpy (dst, replace_8_bit, replace_8_bit_len);
dst += replace_8_bit_len;
}
else if (EQ (handle_8_bit, Qt))
{
int char8 = STRING_CHAR (p);
*dst++ = CHAR_TO_BYTE8 (char8);
}
}
}
else /* len == 4 or 5 */
{
/* Handle an over-unicode character by handle_over_uni. */
if (scan_count == 0)
{
if (NILP (handle_over_uni))
return Qnil;
if (len == 4)
num_over_4++;
else
num_over_5++;
}
else
{
if (src < p)
{
memcpy (dst, src, p - src);
dst += p - src;
}
if (replace_over_uni_len > 0)
{
memcpy (dst, replace_over_uni, replace_over_uni_len);
dst += replace_over_uni_len;
}
}
}
p += len;
src = p;
}
if (scan_count == 0)
{
/* End of the first scane */
outbytes = nbytes;
if (num_8_bit == 0
&& (num_over_4 + num_over_5 == 0 || EQ (handle_over_uni, Qt)))
{
/* We can break the loop because there is no need of
changing the byte sequence. This is the typical
case. */
scan_count = 1;
}
else
{
/* Prepare for the next scan to handle non-Unicode characters. */
if (num_8_bit > 0)
{
if (CHARACTERP (handle_8_bit))
replace_8_bit = get_char_bytes (XFIXNUM (handle_8_bit),
&replace_8_bit_len);
else if (STRINGP (handle_8_bit))
{
replace_8_bit = SDATA (handle_8_bit);
replace_8_bit_len = SBYTES (handle_8_bit);
}
if (replace_8_bit)
outbytes += (replace_8_bit_len - 2) * num_8_bit;
else if (EQ (handle_8_bit, Qignored))
outbytes -= 2 * num_8_bit;
else if (EQ (handle_8_bit, Qt))
outbytes -= num_8_bit;
else
return Qnil;
}
if (num_over_4 + num_over_5 > 0)
{
if (CHARACTERP (handle_over_uni))
replace_over_uni = get_char_bytes (XFIXNUM (handle_over_uni),
&replace_over_uni_len);
else if (STRINGP (handle_over_uni))
{
replace_over_uni = SDATA (handle_over_uni);
replace_over_uni_len = SBYTES (handle_over_uni);
}
if (num_over_4 > 0)
{
if (replace_over_uni)
outbytes += (replace_over_uni_len - 4) * num_over_4;
else if (EQ (handle_over_uni, Qignored))
outbytes -= 4 * num_over_4;
else if (! EQ (handle_over_uni, Qt))
return Qnil;
}
if (num_over_5 > 0)
{
if (replace_over_uni)
outbytes += (replace_over_uni_len - 5) * num_over_5;
else if (EQ (handle_over_uni, Qignored))
outbytes -= 5 * num_over_5;
else if (! EQ (handle_over_uni, Qt))
return Qnil;
}
}
}
/* Prepare a return value and a space to store the encoded bytes. */
if (BUFFERP (buffer))
{
val = make_fixnum (outbytes);
dst = get_buffer_gap_address (buffer, nbytes);
}
else
{
if (nocopy && (num_8_bit + num_over_4 + num_over_5) == 0)
return string;
val = make_uninit_string (outbytes);
dst = SDATA (val);
}
p = src = SDATA (string);
}
}
if (src < pend)
memcpy (dst, src, pend - src);
if (BUFFERP (buffer))
{
struct buffer *oldb = current_buffer;
current_buffer = XBUFFER (buffer);
insert_from_gap (outbytes, outbytes, false);
current_buffer = oldb;
}
return val;
}
/* Decode STRING by the coding system utf-8-unix.
Even if :post-read-conversion and :decode-translation-table
properties are put to that coding system, they are ignored.
This function assumes that arguments have values as described
below. The validity must be assured by callers.
STRING is a unibyte string or an ASCII-only multibyte string.
BUFFER is a multibyte buffer or Qnil.
If BUFFER is a multibyte buffer, the decoding result of Unicode
characters are inserted after point of the buffer, and the number
of inserted characters is returned. Note that a caller should have
made BUFFER ready for modifying in advance (e.g. by calling
invalidate_buffer_caches).
If BUFFER is Qnil, a multibyte string is made from the decoding
result of Unicode characters, and it is returned. As a special
case, STRING itself is returned in the following cases:
1. STRING contains only ASCII characters.
2. NOCOPY, and STRING contains only valid UTF-8 sequences.
HANDLE-8-BIT and HANDLE-OVER-UNI specify how to handle a invalid
byte sequence. The former is for an 1-byte invalid sequence that
violates the fundamental UTF-8 encoding rule. The latter is for a
4 or 5-byte invalid sequence that Emacs internally uses to
represent an over-unicode character (a character of code greater
than #x10FFFF). Note that this function does not treat an overlong
UTF-8 sequence as invalid.
If they are strings (typically 1-char string of the Unicode
REPLACEMENT CHARACTER #xFFFD), an invalid sequence is decoded into
that string. They must be multibyte strings if they contain a
non-ASCII character.
If they are characters, an invalid sequence is decoded into the
corresponding multibyte representation of the characters.
If they are Qignored, an invalid sequence is skipped on decoding.
If HANDLE-8-BIT is Qt, an 1-byte invalid sequence is deoded into
the corresponding eight-bit character.
If HANDLE-OVER-UNI is Qt, a 4 or 5-byte invalid sequence that
follows Emacs' representation for an over-unicode character is
decoded into the corresponding character.
If they are Qnil, Qnil is returned if STRING has an invalid sequence. */
Lisp_Object
decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
bool nocopy, Lisp_Object handle_8_bit,
Lisp_Object handle_over_uni)
{
/* This is like BYTES_BY_CHAR_HEAD, but it is assured that C >= 0x80
and it returns 0 for invalid sequence. */
#define UTF_8_SEQUENCE_LENGTH(c) \
((c) < 0xC2 ? 0 \
: (c) < 0xE0 ? 2 \
: (c) < 0xF0 ? 3 \
: (c) < 0xF8 ? 4 \
: (c) == 0xF8 ? 5 \
: 0)
ptrdiff_t nbytes = SBYTES (string);
unsigned char *p = SDATA (string), *pend = p + nbytes;
ptrdiff_t num_8_bit = 0; /* number of invalid 1-byte sequences. */
ptrdiff_t num_over_4 = 0; /* number of invalid 4-byte sequences. */
ptrdiff_t num_over_5 = 0; /* number of invalid 5-byte sequences. */
ptrdiff_t outbytes = nbytes; /* number of decoded bytes. */
ptrdiff_t outchars = 0; /* number of decoded characters. */
unsigned char *src = NULL, *dst = NULL;
bool change_byte_sequence = false;
/* Scan bytes in STRING twice. The first scan is to count invalid
sequences, and the second scan is to decode STRING. If the
decoding is trivial (no need of changing the byte sequence),
the second scan is avoided. */
while (p < pend)
{
src = p;
/* Try short cut for an ASCII-only case. */
while (p < pend && *p < 0x80) p++;
outchars += (p - src);
if (p == pend)
break;
int c = *p;
outchars++;
int len = UTF_8_SEQUENCE_LENGTH (c);
/* len == 0, 2, 3, 4, 5 */
if (UTF_8_EXTRA_OCTET_P (p[1])
&& (len == 2
|| (UTF_8_EXTRA_OCTET_P (p[2])
&& (len == 3
|| (UTF_8_EXTRA_OCTET_P (p[3])
&& len == 4
&& string_char (p, NULL, NULL) <= MAX_UNICODE_CHAR)))))
{
p += len;
continue;
}
/* A sequence to change on decoding was found. A rare case. */
if (len == 0)
{
if (NILP (handle_8_bit))
return Qnil;
num_8_bit++;
len = 1;
}
else /* len == 4 or 5 */
{
if (NILP (handle_over_uni))
return Qnil;
if (len == 4)
num_over_4++;
else
num_over_5++;
}
change_byte_sequence = true;
p += len;
}
Lisp_Object val; /* the return value. */
if (! change_byte_sequence
&& NILP (buffer))
{
if (nocopy)
return string;
val = make_uninit_multibyte_string (outchars, outbytes);
memcpy (SDATA (val), SDATA (string), pend - SDATA (string));
return val;
}
/* Count the number of resulting chars and bytes. */
unsigned char *replace_8_bit = NULL, *replace_over_uni = NULL;
int replace_8_bit_len = 0, replace_over_uni_len = 0;
if (change_byte_sequence)
{
if (num_8_bit > 0)
{
if (CHARACTERP (handle_8_bit))
replace_8_bit = get_char_bytes (XFIXNUM (handle_8_bit),
&replace_8_bit_len);
else if (STRINGP (handle_8_bit))
{
replace_8_bit = SDATA (handle_8_bit);
replace_8_bit_len = SBYTES (handle_8_bit);
}
if (replace_8_bit)
outbytes += (replace_8_bit_len - 1) * num_8_bit;
else if (EQ (handle_8_bit, Qignored))
{
outbytes -= num_8_bit;
outchars -= num_8_bit;
}
else /* EQ (handle_8_bit, Qt)) */
outbytes += num_8_bit;
}
else if (num_over_4 + num_over_5 > 0)
{
if (CHARACTERP (handle_over_uni))
replace_over_uni = get_char_bytes (XFIXNUM (handle_over_uni),
&replace_over_uni_len);
else if (STRINGP (handle_over_uni))
{
replace_over_uni = SDATA (handle_over_uni);
replace_over_uni_len = SBYTES (handle_over_uni);
}
if (num_over_4 > 0)
{
if (replace_over_uni)
outbytes += (replace_over_uni_len - 4) * num_over_4;
else if (EQ (handle_over_uni, Qignored))
{
outbytes -= 4 * num_over_4;
outchars -= num_over_4;
}
}
if (num_over_5 > 0)
{
if (replace_over_uni)
outbytes += (replace_over_uni_len - 5) * num_over_5;
else if (EQ (handle_over_uni, Qignored))
{
outbytes -= 5 * num_over_5;
outchars -= num_over_5;
}
}
}
}
/* Prepare a return value and a space to store the decoded bytes. */
if (BUFFERP (buffer))
{
val = make_fixnum (outchars);
dst = get_buffer_gap_address (buffer, outbytes);
}
else
{
if (nocopy && (num_8_bit + num_over_4 + num_over_5) == 0)
return string;
val = make_uninit_multibyte_string (outchars, outbytes);
dst = SDATA (val);
}
src = SDATA (string);
if (change_byte_sequence)
{
p = src;
while (p < pend)
{
/* Try short cut for an ASCII-only case. */
/* while (p < pend && *p < 0x80) p++; */
/* if (p == pend) */
/* break; */
int c = *p;
if (c < 0x80)
{
p++;
continue;
}
int len = UTF_8_SEQUENCE_LENGTH (c);
if (len > 1)
{
int mlen;
for (mlen = 1; mlen < len && UTF_8_EXTRA_OCTET_P (p[mlen]);
mlen++);
if (mlen == len
&& (len <= 3
|| (len == 4
&& string_char (p, NULL, NULL) <= MAX_UNICODE_CHAR)
|| EQ (handle_over_uni, Qt)))
{
p += len;
continue;
}
}
if (src < p)
{
memcpy (dst, src, p - src);
dst += p - src;
}
if (len == 0)
{
if (replace_8_bit)
{
memcpy (dst, replace_8_bit, replace_8_bit_len);
dst += replace_8_bit_len;
}
else if (EQ (handle_8_bit, Qt))
{
dst += BYTE8_STRING (c, dst);
}
len = 1;
}
else /* len == 4 or 5 */
{
/* Handle p[0]... by handle_over_uni */
if (replace_over_uni)
{
memcpy (dst, replace_over_uni, replace_over_uni_len);
dst += replace_over_uni_len;
}
}
p += len;
src = p;
}
}
if (src < pend)
memcpy (dst, src, pend - src);
if (BUFFERP (buffer))
{
struct buffer *oldb = current_buffer;
current_buffer = XBUFFER (buffer);
insert_from_gap (outchars, outbytes, false);
current_buffer = oldb;
}
return val;
}
/* #define ENABLE_UTF_8_CONVERTER_TEST */
#ifdef ENABLE_UTF_8_CONVERTER_TEST
/* These functions are useful for testing and benchmarking
encode_string_utf_8 and decode_string_utf_8. */
/* ENCODE_METHOD specifies which internal decoder to use.
If it is Qnil, use encode_string_utf_8.
Otherwise, use code_convert_string.
COUNT, if integer, specifies how many times to call those functions
with the same arguments (for benchmarking). */
DEFUN ("internal-encode-string-utf-8", Finternal_encode_string_utf_8,
Sinternal_encode_string_utf_8, 7, 7, 0,
doc: /* Internal use only.*/)
(Lisp_Object string, Lisp_Object buffer, Lisp_Object nocopy,
Lisp_Object handle_8_bit, Lisp_Object handle_over_uni,
Lisp_Object encode_method, Lisp_Object count)
{
int repeat_count;
Lisp_Object val;
/* Check arguments. Return Qnil when an argmement is invalid. */
if (! STRINGP (string))
return Qnil;
if (! NILP (buffer)
&& (! BUFFERP (buffer)
|| ! NILP (BVAR (XBUFFER (buffer), enable_multibyte_characters))))
return Qnil;
if (! NILP (handle_8_bit) && ! EQ (handle_8_bit, Qt)
&& ! EQ (handle_8_bit, Qignored)
&& ! CHARACTERP (handle_8_bit)
&& (! STRINGP (handle_8_bit) || STRING_MULTIBYTE (handle_8_bit)))
return Qnil;
if (! NILP (handle_over_uni) && ! EQ (handle_over_uni, Qt)
&& ! EQ (handle_over_uni, Qignored)
&& ! CHARACTERP (handle_over_uni)
&& (! STRINGP (handle_over_uni) || STRING_MULTIBYTE (handle_over_uni)))
return Qnil;
CHECK_FIXNUM (count);
repeat_count = XFIXNUM (count);
val = Qnil;
/* Run an encoder according to ENCODE_METHOD. */
if (NILP (encode_method))
{
for (int i = 0; i < repeat_count; i++)
val = encode_string_utf_8 (string, buffer, ! NILP (nocopy),
handle_8_bit, handle_over_uni);
}
else
{
for (int i = 0; i < repeat_count; i++)
val = code_convert_string (string, Qutf_8_unix, Qnil, true,
! NILP (nocopy), true);
}
return val;
}
/* DECODE_METHOD specifies which internal decoder to use.
If it is Qnil, use decode_string_utf_8.
If it is Qt, use code_convert_string.
Otherwise, use make_string_from_utf8.
COUNT, if integer, specifies how many times to call those functions
with the same arguments (for benchmarking). */
DEFUN ("internal-decode-string-utf-8", Finternal_decode_string_utf_8,
Sinternal_decode_string_utf_8, 7, 7, 0,
doc: /* Internal use only.*/)
(Lisp_Object string, Lisp_Object buffer, Lisp_Object nocopy,
Lisp_Object handle_8_bit, Lisp_Object handle_over_uni,
Lisp_Object decode_method, Lisp_Object count)
{
int repeat_count;
Lisp_Object val;
/* Check arguments. Return Qnil when an argmement is invalid. */
if (! STRINGP (string))
return Qnil;
if (! NILP (buffer)
&& (! BUFFERP (buffer)
|| NILP (BVAR (XBUFFER (buffer), enable_multibyte_characters))))
return Qnil;
if (! NILP (handle_8_bit) && ! EQ (handle_8_bit, Qt)
&& ! EQ (handle_8_bit, Qignored)
&& ! CHARACTERP (handle_8_bit)
&& (! STRINGP (handle_8_bit) || ! STRING_MULTIBYTE (handle_8_bit)))
return Qnil;
if (! NILP (handle_over_uni) && ! EQ (handle_over_uni, Qt)
&& ! EQ (handle_over_uni, Qignored)
&& ! CHARACTERP (handle_over_uni)
&& (! STRINGP (handle_over_uni) || ! STRING_MULTIBYTE (handle_over_uni)))
return Qnil;
CHECK_FIXNUM (count);
repeat_count = XFIXNUM (count);
val = Qnil;
/* Run a decoder according to DECODE_METHOD. */
if (NILP (decode_method))
{
for (int i = 0; i < repeat_count; i++)
val = decode_string_utf_8 (string, buffer, ! NILP (nocopy),
handle_8_bit, handle_over_uni);
}
else if (EQ (decode_method, Qt))
{
if (! BUFFERP (buffer))
buffer = Qt;
for (int i = 0; i < repeat_count; i++)
val = code_convert_string (string, Qutf_8_unix, buffer, false,
! NILP (nocopy), true);
}
else if (! NILP (decode_method))
{
for (int i = 0; i < repeat_count; i++)
val = make_string_from_utf8 ((char *) SDATA (string), SBYTES (string));
}
return val;
}
#endif /* ENABLE_UTF_8_CONVERTER_TEST */
/* Encode or decode a file name, to or from a unibyte string suitable
for passing to C library functions. */
Lisp_Object
@ -10974,6 +11700,10 @@ syms_of_coding (void)
defsubr (&Sencode_coding_region);
defsubr (&Sdecode_coding_string);
defsubr (&Sencode_coding_string);
#ifdef ENABLE_UTF_8_CONVERTER_TEST
defsubr (&Sinternal_encode_string_utf_8);
defsubr (&Sinternal_decode_string_utf_8);
#endif /* ENABLE_UTF_8_CONVERTER_TEST */
defsubr (&Sdecode_sjis_char);
defsubr (&Sencode_sjis_char);
defsubr (&Sdecode_big5_char);

View file

@ -689,6 +689,10 @@ extern Lisp_Object code_convert_string (Lisp_Object, Lisp_Object,
Lisp_Object, bool, bool, bool);
extern Lisp_Object code_convert_string_norecord (Lisp_Object, Lisp_Object,
bool);
extern Lisp_Object encode_string_utf_8 (Lisp_Object, Lisp_Object, bool,
Lisp_Object, Lisp_Object);
extern Lisp_Object decode_string_utf_8 (Lisp_Object, Lisp_Object, bool,
Lisp_Object, Lisp_Object);
extern Lisp_Object encode_file_name (Lisp_Object);
extern Lisp_Object decode_file_name (Lisp_Object);
extern Lisp_Object raw_text_coding_system (Lisp_Object);