Add Unicode-safe UTF-8 converter
* src/coding.c (encode_string_utf_8, decode_string_utf_8): New functions. * src/coding.h (encode_string_utf_8, decode_string_utf_8): Extern them.
This commit is contained in:
parent
5ec3f70527
commit
a8026dfde9
2 changed files with 734 additions and 0 deletions
730
src/coding.c
730
src/coding.c
|
@ -9515,6 +9515,732 @@ code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
|
|||
return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
|
||||
}
|
||||
|
||||
|
||||
/* Return the gap address of BUFFER. If the gap size is less than
|
||||
NBYTES, enlarge the gap in advance. */
|
||||
|
||||
static unsigned char *
|
||||
get_buffer_gap_address (Lisp_Object buffer, int nbytes)
|
||||
{
|
||||
struct buffer *buf = XBUFFER (buffer);
|
||||
|
||||
if (BUF_GPT (buf) != BUF_PT (buf))
|
||||
{
|
||||
struct buffer *oldb = current_buffer;
|
||||
|
||||
current_buffer = buf;
|
||||
move_gap_both (PT, PT_BYTE);
|
||||
current_buffer = oldb;
|
||||
}
|
||||
if (BUF_GAP_SIZE (buf) < nbytes)
|
||||
make_gap_1 (buf, nbytes);
|
||||
return BUF_GPT_ADDR (buf);
|
||||
}
|
||||
|
||||
/* Return a pointer to the byte sequence for C, and set the length in
   LEN.  This function is used to get a byte sequence for HANDLE_8_BIT
   and HANDLE_OVER_UNI arguments of encode_string_utf_8 and
   decode_string_utf_8 when those arguments are given by
   characters.

   The returned pointer refers to static storage.  It stays valid
   across exactly one further call to this function: the slot handed
   out most recently is never the one evicted by the next miss, so a
   caller may hold the results of two consecutive calls at once.  */

static unsigned char *
get_char_bytes (int c, int *len)
{
  /* We use two caches considering the situation that
     encode/decode_string_utf_8 are called repeatedly with the same
     values for HANDLE_8_BIT and HANDLE_OVER_UNI arguments.  */
  static int chars[2];
  static unsigned char bytes[2][6];
  static int nbytes[2];
  /* Index of the slot returned by the most recent call.  */
  static int last_index;

  /* A slot is valid only once it has been filled (nbytes > 0).
     Without this check the zero-initialized CHARS would produce a
     false hit for C == 0 and report a length of 0.
     NOTE(review): this assumes CHAR_STRING never returns 0 for the
     characters passed here -- confirm.  */
  if (nbytes[last_index] > 0 && chars[last_index] == c)
    {
      *len = nbytes[last_index];
      return bytes[last_index];
    }

  int other = 1 - last_index;

  if (nbytes[other] > 0 && chars[other] == c)
    {
      /* Remember that this slot is now the most recently returned
         one, so that a following miss does not overwrite the bytes
         the caller has just been given.  */
      last_index = other;
      *len = nbytes[other];
      return bytes[other];
    }

  /* Miss: reuse the slot that was not returned most recently.  */
  last_index = other;
  chars[last_index] = c;
  *len = nbytes[last_index] = CHAR_STRING (c, bytes[last_index]);
  return bytes[last_index];
}
|
||||
|
||||
/* Encode STRING by the coding system utf-8-unix.
|
||||
|
||||
Even if :pre-write-conversion and :encode-translation-table
|
||||
properties are put to that coding system, they are ignored.
|
||||
|
||||
It ignores :pre-write-conversion and :encode-translation-table
|
||||
propeties of that coding system.
|
||||
|
||||
This function assumes that arguments have values as described
|
||||
below. The validity must be assured by callers.
|
||||
|
||||
STRING is a multibyte string or an ASCII-only unibyte string.
|
||||
|
||||
BUFFER is a unibyte buffer or Qnil.
|
||||
|
||||
If BUFFER is a unibyte buffer, the encoding result of UTF-8
|
||||
sequence is inserted after point of the buffer, and the number of
|
||||
inserted characters is returned. Note that a caller should have
|
||||
made BUFFER ready for modifying in advance (e.g. by calling
|
||||
invalidate_buffer_caches).
|
||||
|
||||
If BUFFER is Qnil, a unibyte string is made from the encodnig
|
||||
result of UTF-8 sequence, and it is returned. If NOCOPY and STRING
|
||||
contains only Unicode characters (i.e. the encoding does not change
|
||||
the byte sequence), STRING is returned even if it is multibyte.
|
||||
|
||||
HANDLE-8-BIT and HANDE-OVER-UNI specify how to handle a non-Unicode
|
||||
character. The former is for an eight-bit character (represented
|
||||
by 2-byte overlong sequence in multibyte STRING). The latter is
|
||||
for an over-unicode character (a character whose code is greater
|
||||
than the maximum Unicode character 0x10FFFF, and is represented by
|
||||
4 or 5-byte sequence in multibyte STRING).
|
||||
|
||||
If they are unibyte strings (typically "\357\277\275"; UTF-8
|
||||
sequence for the Unicode REPLACEMENT CHARACTER #xFFFD), a
|
||||
non-Unicode character is encoded into that sequence.
|
||||
|
||||
If they are characters, a non-Unicode chracters is encoded into the
|
||||
corresponding UTF-8 sequences.
|
||||
|
||||
If they are Qignored, a non-Unicode character is skipped on
|
||||
encoding.
|
||||
|
||||
If HANDLE-8-BIT is Qt, an eight-bit character is encoded into one
|
||||
byte of the same value.
|
||||
|
||||
If HANDLE-OVER-UNI is Qt, an over-unicode character is encoded
|
||||
into the the same 4 or 5-byte sequence.
|
||||
|
||||
If they are Qnil, Qnil is returned if STRING has a non-Unicode
|
||||
character. */
|
||||
|
||||
Lisp_Object
|
||||
encode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
|
||||
bool nocopy, Lisp_Object handle_8_bit,
|
||||
Lisp_Object handle_over_uni)
|
||||
{
|
||||
ptrdiff_t nchars = SCHARS (string), nbytes = SBYTES (string);
|
||||
if (NILP (buffer) && nchars == nbytes)
|
||||
/* STRING contains only ASCII characters. */
|
||||
return string;
|
||||
|
||||
ptrdiff_t num_8_bit = 0; /* number of eight-bit chars in STRING */
|
||||
/* The following two vars are counted only if handle_over_uni is not Qt */
|
||||
ptrdiff_t num_over_4 = 0; /* number of 4-byte non-Unicode chars in STRING */
|
||||
ptrdiff_t num_over_5 = 0; /* number of 5-byte non-Unicode chars in STRING */
|
||||
ptrdiff_t outbytes; /* number of bytes of decoding result. */
|
||||
unsigned char *p = SDATA (string);
|
||||
unsigned char *pend = p + nbytes;
|
||||
unsigned char *src = NULL, *dst = NULL;
|
||||
unsigned char *replace_8_bit = NULL, *replace_over_uni = NULL;
|
||||
int replace_8_bit_len = 0, replace_over_uni_len = 0;
|
||||
Lisp_Object val; /* the return value */
|
||||
|
||||
/* Scan bytes in STRING twice. The first scan is to count non-Unicode
|
||||
characters, and the second scan is to encode STRING. If the
|
||||
encoding is trivial (no need of changing the byte sequence),
|
||||
the second scan is avoided. */
|
||||
for (int scan_count = 0; scan_count < 2; scan_count++)
|
||||
{
|
||||
while (p < pend)
|
||||
{
|
||||
if (nchars == pend - p)
|
||||
/* There is no multibyte character remaining. */
|
||||
break;
|
||||
|
||||
int c = *p;
|
||||
int len = BYTES_BY_CHAR_HEAD (c);
|
||||
|
||||
nchars--;
|
||||
if (len == 1
|
||||
|| len == 3
|
||||
|| (len == 2 ? ! CHAR_BYTE8_HEAD_P (c)
|
||||
: (EQ (handle_over_uni, Qt)
|
||||
|| (len == 4
|
||||
&& string_char (p, NULL, NULL) <= MAX_UNICODE_CHAR))))
|
||||
{
|
||||
p += len;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* A character to change the byte sequence on encoding was
|
||||
found. A rare case. */
|
||||
if (len == 2)
|
||||
{
|
||||
/* Handle an eight-bit character by handle_8_bit. */
|
||||
if (scan_count == 0)
|
||||
{
|
||||
if (NILP (handle_8_bit))
|
||||
return Qnil;
|
||||
num_8_bit++;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (src < p)
|
||||
{
|
||||
memcpy (dst, src, p - src);
|
||||
dst += p - src;
|
||||
}
|
||||
if (replace_8_bit_len > 0)
|
||||
{
|
||||
memcpy (dst, replace_8_bit, replace_8_bit_len);
|
||||
dst += replace_8_bit_len;
|
||||
}
|
||||
else if (EQ (handle_8_bit, Qt))
|
||||
{
|
||||
int char8 = STRING_CHAR (p);
|
||||
*dst++ = CHAR_TO_BYTE8 (char8);
|
||||
}
|
||||
}
|
||||
}
|
||||
else /* len == 4 or 5 */
|
||||
{
|
||||
/* Handle an over-unicode character by handle_over_uni. */
|
||||
if (scan_count == 0)
|
||||
{
|
||||
if (NILP (handle_over_uni))
|
||||
return Qnil;
|
||||
if (len == 4)
|
||||
num_over_4++;
|
||||
else
|
||||
num_over_5++;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (src < p)
|
||||
{
|
||||
memcpy (dst, src, p - src);
|
||||
dst += p - src;
|
||||
}
|
||||
if (replace_over_uni_len > 0)
|
||||
{
|
||||
memcpy (dst, replace_over_uni, replace_over_uni_len);
|
||||
dst += replace_over_uni_len;
|
||||
}
|
||||
}
|
||||
}
|
||||
p += len;
|
||||
src = p;
|
||||
}
|
||||
|
||||
if (scan_count == 0)
|
||||
{
|
||||
/* End of the first scane */
|
||||
outbytes = nbytes;
|
||||
if (num_8_bit == 0
|
||||
&& (num_over_4 + num_over_5 == 0 || EQ (handle_over_uni, Qt)))
|
||||
{
|
||||
/* We can break the loop because there is no need of
|
||||
changing the byte sequence. This is the typical
|
||||
case. */
|
||||
scan_count = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Prepare for the next scan to handle non-Unicode characters. */
|
||||
if (num_8_bit > 0)
|
||||
{
|
||||
if (CHARACTERP (handle_8_bit))
|
||||
replace_8_bit = get_char_bytes (XFIXNUM (handle_8_bit),
|
||||
&replace_8_bit_len);
|
||||
else if (STRINGP (handle_8_bit))
|
||||
{
|
||||
replace_8_bit = SDATA (handle_8_bit);
|
||||
replace_8_bit_len = SBYTES (handle_8_bit);
|
||||
}
|
||||
if (replace_8_bit)
|
||||
outbytes += (replace_8_bit_len - 2) * num_8_bit;
|
||||
else if (EQ (handle_8_bit, Qignored))
|
||||
outbytes -= 2 * num_8_bit;
|
||||
else if (EQ (handle_8_bit, Qt))
|
||||
outbytes -= num_8_bit;
|
||||
else
|
||||
return Qnil;
|
||||
}
|
||||
if (num_over_4 + num_over_5 > 0)
|
||||
{
|
||||
if (CHARACTERP (handle_over_uni))
|
||||
replace_over_uni = get_char_bytes (XFIXNUM (handle_over_uni),
|
||||
&replace_over_uni_len);
|
||||
else if (STRINGP (handle_over_uni))
|
||||
{
|
||||
replace_over_uni = SDATA (handle_over_uni);
|
||||
replace_over_uni_len = SBYTES (handle_over_uni);
|
||||
}
|
||||
if (num_over_4 > 0)
|
||||
{
|
||||
if (replace_over_uni)
|
||||
outbytes += (replace_over_uni_len - 4) * num_over_4;
|
||||
else if (EQ (handle_over_uni, Qignored))
|
||||
outbytes -= 4 * num_over_4;
|
||||
else if (! EQ (handle_over_uni, Qt))
|
||||
return Qnil;
|
||||
}
|
||||
if (num_over_5 > 0)
|
||||
{
|
||||
if (replace_over_uni)
|
||||
outbytes += (replace_over_uni_len - 5) * num_over_5;
|
||||
else if (EQ (handle_over_uni, Qignored))
|
||||
outbytes -= 5 * num_over_5;
|
||||
else if (! EQ (handle_over_uni, Qt))
|
||||
return Qnil;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Prepare a return value and a space to store the encoded bytes. */
|
||||
if (BUFFERP (buffer))
|
||||
{
|
||||
val = make_fixnum (outbytes);
|
||||
dst = get_buffer_gap_address (buffer, nbytes);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (nocopy && (num_8_bit + num_over_4 + num_over_5) == 0)
|
||||
return string;
|
||||
val = make_uninit_string (outbytes);
|
||||
dst = SDATA (val);
|
||||
}
|
||||
p = src = SDATA (string);
|
||||
}
|
||||
}
|
||||
|
||||
if (src < pend)
|
||||
memcpy (dst, src, pend - src);
|
||||
if (BUFFERP (buffer))
|
||||
{
|
||||
struct buffer *oldb = current_buffer;
|
||||
|
||||
current_buffer = XBUFFER (buffer);
|
||||
insert_from_gap (outbytes, outbytes, false);
|
||||
current_buffer = oldb;
|
||||
}
|
||||
return val;
|
||||
}
|
||||
|
||||
/* Decode STRING by the coding system utf-8-unix.
|
||||
|
||||
Even if :post-read-conversion and :decode-translation-table
|
||||
properties are put to that coding system, they are ignored.
|
||||
|
||||
This function assumes that arguments have values as described
|
||||
below. The validity must be assured by callers.
|
||||
|
||||
STRING is a unibyte string or an ASCII-only multibyte string.
|
||||
|
||||
BUFFER is a multibyte buffer or Qnil.
|
||||
|
||||
If BUFFER is a multibyte buffer, the decoding result of Unicode
|
||||
characters are inserted after point of the buffer, and the number
|
||||
of inserted characters is returned. Note that a caller should have
|
||||
made BUFFER ready for modifying in advance (e.g. by calling
|
||||
invalidate_buffer_caches).
|
||||
|
||||
If BUFFER is Qnil, a multibyte string is made from the decoding
|
||||
result of Unicode characters, and it is returned. As a special
|
||||
case, STRING itself is returned in the following cases:
|
||||
1. STRING contains only ASCII characters.
|
||||
2. NOCOPY, and STRING contains only valid UTF-8 sequences.
|
||||
|
||||
HANDLE-8-BIT and HANDLE-OVER-UNI specify how to handle a invalid
|
||||
byte sequence. The former is for an 1-byte invalid sequence that
|
||||
violates the fundamental UTF-8 encoding rule. The latter is for a
|
||||
4 or 5-byte invalid sequence that Emacs internally uses to
|
||||
represent an over-unicode character (a character of code greater
|
||||
than #x10FFFF). Note that this function does not treat an overlong
|
||||
UTF-8 sequence as invalid.
|
||||
|
||||
If they are strings (typically 1-char string of the Unicode
|
||||
REPLACEMENT CHARACTER #xFFFD), an invalid sequence is decoded into
|
||||
that string. They must be multibyte strings if they contain a
|
||||
non-ASCII character.
|
||||
|
||||
If they are characters, an invalid sequence is decoded into the
|
||||
corresponding multibyte representation of the characters.
|
||||
|
||||
If they are Qignored, an invalid sequence is skipped on decoding.
|
||||
|
||||
If HANDLE-8-BIT is Qt, an 1-byte invalid sequence is deoded into
|
||||
the corresponding eight-bit character.
|
||||
|
||||
If HANDLE-OVER-UNI is Qt, a 4 or 5-byte invalid sequence that
|
||||
follows Emacs' representation for an over-unicode character is
|
||||
decoded into the corresponding character.
|
||||
|
||||
If they are Qnil, Qnil is returned if STRING has an invalid sequence. */
|
||||
|
||||
Lisp_Object
|
||||
decode_string_utf_8 (Lisp_Object string, Lisp_Object buffer,
|
||||
bool nocopy, Lisp_Object handle_8_bit,
|
||||
Lisp_Object handle_over_uni)
|
||||
{
|
||||
/* This is like BYTES_BY_CHAR_HEAD, but it is assured that C >= 0x80
|
||||
and it returns 0 for invalid sequence. */
|
||||
#define UTF_8_SEQUENCE_LENGTH(c) \
|
||||
((c) < 0xC2 ? 0 \
|
||||
: (c) < 0xE0 ? 2 \
|
||||
: (c) < 0xF0 ? 3 \
|
||||
: (c) < 0xF8 ? 4 \
|
||||
: (c) == 0xF8 ? 5 \
|
||||
: 0)
|
||||
|
||||
ptrdiff_t nbytes = SBYTES (string);
|
||||
unsigned char *p = SDATA (string), *pend = p + nbytes;
|
||||
ptrdiff_t num_8_bit = 0; /* number of invalid 1-byte sequences. */
|
||||
ptrdiff_t num_over_4 = 0; /* number of invalid 4-byte sequences. */
|
||||
ptrdiff_t num_over_5 = 0; /* number of invalid 5-byte sequences. */
|
||||
ptrdiff_t outbytes = nbytes; /* number of decoded bytes. */
|
||||
ptrdiff_t outchars = 0; /* number of decoded characters. */
|
||||
unsigned char *src = NULL, *dst = NULL;
|
||||
bool change_byte_sequence = false;
|
||||
|
||||
/* Scan bytes in STRING twice. The first scan is to count invalid
|
||||
sequences, and the second scan is to decode STRING. If the
|
||||
decoding is trivial (no need of changing the byte sequence),
|
||||
the second scan is avoided. */
|
||||
while (p < pend)
|
||||
{
|
||||
src = p;
|
||||
/* Try short cut for an ASCII-only case. */
|
||||
while (p < pend && *p < 0x80) p++;
|
||||
outchars += (p - src);
|
||||
if (p == pend)
|
||||
break;
|
||||
int c = *p;
|
||||
outchars++;
|
||||
int len = UTF_8_SEQUENCE_LENGTH (c);
|
||||
/* len == 0, 2, 3, 4, 5 */
|
||||
if (UTF_8_EXTRA_OCTET_P (p[1])
|
||||
&& (len == 2
|
||||
|| (UTF_8_EXTRA_OCTET_P (p[2])
|
||||
&& (len == 3
|
||||
|| (UTF_8_EXTRA_OCTET_P (p[3])
|
||||
&& len == 4
|
||||
&& string_char (p, NULL, NULL) <= MAX_UNICODE_CHAR)))))
|
||||
{
|
||||
p += len;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* A sequence to change on decoding was found. A rare case. */
|
||||
if (len == 0)
|
||||
{
|
||||
if (NILP (handle_8_bit))
|
||||
return Qnil;
|
||||
num_8_bit++;
|
||||
len = 1;
|
||||
}
|
||||
else /* len == 4 or 5 */
|
||||
{
|
||||
if (NILP (handle_over_uni))
|
||||
return Qnil;
|
||||
if (len == 4)
|
||||
num_over_4++;
|
||||
else
|
||||
num_over_5++;
|
||||
}
|
||||
change_byte_sequence = true;
|
||||
p += len;
|
||||
}
|
||||
|
||||
Lisp_Object val; /* the return value. */
|
||||
|
||||
if (! change_byte_sequence
|
||||
&& NILP (buffer))
|
||||
{
|
||||
if (nocopy)
|
||||
return string;
|
||||
val = make_uninit_multibyte_string (outchars, outbytes);
|
||||
memcpy (SDATA (val), SDATA (string), pend - SDATA (string));
|
||||
return val;
|
||||
}
|
||||
|
||||
/* Count the number of resulting chars and bytes. */
|
||||
unsigned char *replace_8_bit = NULL, *replace_over_uni = NULL;
|
||||
int replace_8_bit_len = 0, replace_over_uni_len = 0;
|
||||
|
||||
if (change_byte_sequence)
|
||||
{
|
||||
if (num_8_bit > 0)
|
||||
{
|
||||
if (CHARACTERP (handle_8_bit))
|
||||
replace_8_bit = get_char_bytes (XFIXNUM (handle_8_bit),
|
||||
&replace_8_bit_len);
|
||||
else if (STRINGP (handle_8_bit))
|
||||
{
|
||||
replace_8_bit = SDATA (handle_8_bit);
|
||||
replace_8_bit_len = SBYTES (handle_8_bit);
|
||||
}
|
||||
if (replace_8_bit)
|
||||
outbytes += (replace_8_bit_len - 1) * num_8_bit;
|
||||
else if (EQ (handle_8_bit, Qignored))
|
||||
{
|
||||
outbytes -= num_8_bit;
|
||||
outchars -= num_8_bit;
|
||||
}
|
||||
else /* EQ (handle_8_bit, Qt)) */
|
||||
outbytes += num_8_bit;
|
||||
}
|
||||
else if (num_over_4 + num_over_5 > 0)
|
||||
{
|
||||
if (CHARACTERP (handle_over_uni))
|
||||
replace_over_uni = get_char_bytes (XFIXNUM (handle_over_uni),
|
||||
&replace_over_uni_len);
|
||||
else if (STRINGP (handle_over_uni))
|
||||
{
|
||||
replace_over_uni = SDATA (handle_over_uni);
|
||||
replace_over_uni_len = SBYTES (handle_over_uni);
|
||||
}
|
||||
if (num_over_4 > 0)
|
||||
{
|
||||
if (replace_over_uni)
|
||||
outbytes += (replace_over_uni_len - 4) * num_over_4;
|
||||
else if (EQ (handle_over_uni, Qignored))
|
||||
{
|
||||
outbytes -= 4 * num_over_4;
|
||||
outchars -= num_over_4;
|
||||
}
|
||||
}
|
||||
if (num_over_5 > 0)
|
||||
{
|
||||
if (replace_over_uni)
|
||||
outbytes += (replace_over_uni_len - 5) * num_over_5;
|
||||
else if (EQ (handle_over_uni, Qignored))
|
||||
{
|
||||
outbytes -= 5 * num_over_5;
|
||||
outchars -= num_over_5;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Prepare a return value and a space to store the decoded bytes. */
|
||||
if (BUFFERP (buffer))
|
||||
{
|
||||
val = make_fixnum (outchars);
|
||||
dst = get_buffer_gap_address (buffer, outbytes);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (nocopy && (num_8_bit + num_over_4 + num_over_5) == 0)
|
||||
return string;
|
||||
val = make_uninit_multibyte_string (outchars, outbytes);
|
||||
dst = SDATA (val);
|
||||
}
|
||||
|
||||
src = SDATA (string);
|
||||
if (change_byte_sequence)
|
||||
{
|
||||
p = src;
|
||||
while (p < pend)
|
||||
{
|
||||
/* Try short cut for an ASCII-only case. */
|
||||
/* while (p < pend && *p < 0x80) p++; */
|
||||
/* if (p == pend) */
|
||||
/* break; */
|
||||
int c = *p;
|
||||
if (c < 0x80)
|
||||
{
|
||||
p++;
|
||||
continue;
|
||||
}
|
||||
int len = UTF_8_SEQUENCE_LENGTH (c);
|
||||
if (len > 1)
|
||||
{
|
||||
int mlen;
|
||||
for (mlen = 1; mlen < len && UTF_8_EXTRA_OCTET_P (p[mlen]);
|
||||
mlen++);
|
||||
if (mlen == len
|
||||
&& (len <= 3
|
||||
|| (len == 4
|
||||
&& string_char (p, NULL, NULL) <= MAX_UNICODE_CHAR)
|
||||
|| EQ (handle_over_uni, Qt)))
|
||||
{
|
||||
p += len;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (src < p)
|
||||
{
|
||||
memcpy (dst, src, p - src);
|
||||
dst += p - src;
|
||||
}
|
||||
if (len == 0)
|
||||
{
|
||||
if (replace_8_bit)
|
||||
{
|
||||
memcpy (dst, replace_8_bit, replace_8_bit_len);
|
||||
dst += replace_8_bit_len;
|
||||
}
|
||||
else if (EQ (handle_8_bit, Qt))
|
||||
{
|
||||
dst += BYTE8_STRING (c, dst);
|
||||
}
|
||||
len = 1;
|
||||
}
|
||||
else /* len == 4 or 5 */
|
||||
{
|
||||
/* Handle p[0]... by handle_over_uni */
|
||||
if (replace_over_uni)
|
||||
{
|
||||
memcpy (dst, replace_over_uni, replace_over_uni_len);
|
||||
dst += replace_over_uni_len;
|
||||
}
|
||||
}
|
||||
p += len;
|
||||
src = p;
|
||||
}
|
||||
}
|
||||
|
||||
if (src < pend)
|
||||
memcpy (dst, src, pend - src);
|
||||
if (BUFFERP (buffer))
|
||||
{
|
||||
struct buffer *oldb = current_buffer;
|
||||
|
||||
current_buffer = XBUFFER (buffer);
|
||||
insert_from_gap (outchars, outbytes, false);
|
||||
current_buffer = oldb;
|
||||
}
|
||||
return val;
|
||||
}
|
||||
|
||||
/* #define ENABLE_UTF_8_CONVERTER_TEST */
|
||||
|
||||
#ifdef ENABLE_UTF_8_CONVERTER_TEST
|
||||
|
||||
/* These functions are useful for testing and benchmarking
|
||||
encode_string_utf_8 and decode_string_utf_8. */
|
||||
|
||||
/* ENCODE_METHOD specifies which internal decoder to use.
|
||||
If it is Qnil, use encode_string_utf_8.
|
||||
Otherwise, use code_convert_string.
|
||||
|
||||
COUNT, if integer, specifies how many times to call those functions
|
||||
with the same arguments (for benchmarking). */
|
||||
|
||||
DEFUN ("internal-encode-string-utf-8", Finternal_encode_string_utf_8,
|
||||
Sinternal_encode_string_utf_8, 7, 7, 0,
|
||||
doc: /* Internal use only.*/)
|
||||
(Lisp_Object string, Lisp_Object buffer, Lisp_Object nocopy,
|
||||
Lisp_Object handle_8_bit, Lisp_Object handle_over_uni,
|
||||
Lisp_Object encode_method, Lisp_Object count)
|
||||
{
|
||||
int repeat_count;
|
||||
Lisp_Object val;
|
||||
|
||||
/* Check arguments. Return Qnil when an argmement is invalid. */
|
||||
if (! STRINGP (string))
|
||||
return Qnil;
|
||||
if (! NILP (buffer)
|
||||
&& (! BUFFERP (buffer)
|
||||
|| ! NILP (BVAR (XBUFFER (buffer), enable_multibyte_characters))))
|
||||
return Qnil;
|
||||
if (! NILP (handle_8_bit) && ! EQ (handle_8_bit, Qt)
|
||||
&& ! EQ (handle_8_bit, Qignored)
|
||||
&& ! CHARACTERP (handle_8_bit)
|
||||
&& (! STRINGP (handle_8_bit) || STRING_MULTIBYTE (handle_8_bit)))
|
||||
return Qnil;
|
||||
if (! NILP (handle_over_uni) && ! EQ (handle_over_uni, Qt)
|
||||
&& ! EQ (handle_over_uni, Qignored)
|
||||
&& ! CHARACTERP (handle_over_uni)
|
||||
&& (! STRINGP (handle_over_uni) || STRING_MULTIBYTE (handle_over_uni)))
|
||||
return Qnil;
|
||||
|
||||
CHECK_FIXNUM (count);
|
||||
repeat_count = XFIXNUM (count);
|
||||
|
||||
val = Qnil;
|
||||
/* Run an encoder according to ENCODE_METHOD. */
|
||||
if (NILP (encode_method))
|
||||
{
|
||||
for (int i = 0; i < repeat_count; i++)
|
||||
val = encode_string_utf_8 (string, buffer, ! NILP (nocopy),
|
||||
handle_8_bit, handle_over_uni);
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int i = 0; i < repeat_count; i++)
|
||||
val = code_convert_string (string, Qutf_8_unix, Qnil, true,
|
||||
! NILP (nocopy), true);
|
||||
}
|
||||
return val;
|
||||
}
|
||||
|
||||
/* DECODE_METHOD specifies which internal decoder to use.
|
||||
If it is Qnil, use decode_string_utf_8.
|
||||
If it is Qt, use code_convert_string.
|
||||
Otherwise, use make_string_from_utf8.
|
||||
|
||||
COUNT, if integer, specifies how many times to call those functions
|
||||
with the same arguments (for benchmarking). */
|
||||
|
||||
DEFUN ("internal-decode-string-utf-8", Finternal_decode_string_utf_8,
|
||||
Sinternal_decode_string_utf_8, 7, 7, 0,
|
||||
doc: /* Internal use only.*/)
|
||||
(Lisp_Object string, Lisp_Object buffer, Lisp_Object nocopy,
|
||||
Lisp_Object handle_8_bit, Lisp_Object handle_over_uni,
|
||||
Lisp_Object decode_method, Lisp_Object count)
|
||||
{
|
||||
int repeat_count;
|
||||
Lisp_Object val;
|
||||
|
||||
/* Check arguments. Return Qnil when an argmement is invalid. */
|
||||
if (! STRINGP (string))
|
||||
return Qnil;
|
||||
if (! NILP (buffer)
|
||||
&& (! BUFFERP (buffer)
|
||||
|| NILP (BVAR (XBUFFER (buffer), enable_multibyte_characters))))
|
||||
return Qnil;
|
||||
if (! NILP (handle_8_bit) && ! EQ (handle_8_bit, Qt)
|
||||
&& ! EQ (handle_8_bit, Qignored)
|
||||
&& ! CHARACTERP (handle_8_bit)
|
||||
&& (! STRINGP (handle_8_bit) || ! STRING_MULTIBYTE (handle_8_bit)))
|
||||
return Qnil;
|
||||
if (! NILP (handle_over_uni) && ! EQ (handle_over_uni, Qt)
|
||||
&& ! EQ (handle_over_uni, Qignored)
|
||||
&& ! CHARACTERP (handle_over_uni)
|
||||
&& (! STRINGP (handle_over_uni) || ! STRING_MULTIBYTE (handle_over_uni)))
|
||||
return Qnil;
|
||||
|
||||
CHECK_FIXNUM (count);
|
||||
repeat_count = XFIXNUM (count);
|
||||
|
||||
val = Qnil;
|
||||
/* Run a decoder according to DECODE_METHOD. */
|
||||
if (NILP (decode_method))
|
||||
{
|
||||
for (int i = 0; i < repeat_count; i++)
|
||||
val = decode_string_utf_8 (string, buffer, ! NILP (nocopy),
|
||||
handle_8_bit, handle_over_uni);
|
||||
}
|
||||
else if (EQ (decode_method, Qt))
|
||||
{
|
||||
if (! BUFFERP (buffer))
|
||||
buffer = Qt;
|
||||
for (int i = 0; i < repeat_count; i++)
|
||||
val = code_convert_string (string, Qutf_8_unix, buffer, false,
|
||||
! NILP (nocopy), true);
|
||||
}
|
||||
else if (! NILP (decode_method))
|
||||
{
|
||||
for (int i = 0; i < repeat_count; i++)
|
||||
val = make_string_from_utf8 ((char *) SDATA (string), SBYTES (string));
|
||||
}
|
||||
return val;
|
||||
}
|
||||
|
||||
#endif /* ENABLE_UTF_8_CONVERTER_TEST */
|
||||
|
||||
/* Encode or decode a file name, to or from a unibyte string suitable
|
||||
for passing to C library functions. */
|
||||
Lisp_Object
|
||||
|
@ -10974,6 +11700,10 @@ syms_of_coding (void)
|
|||
defsubr (&Sencode_coding_region);
|
||||
defsubr (&Sdecode_coding_string);
|
||||
defsubr (&Sencode_coding_string);
|
||||
#ifdef ENABLE_UTF_8_CONVERTER_TEST
|
||||
defsubr (&Sinternal_encode_string_utf_8);
|
||||
defsubr (&Sinternal_decode_string_utf_8);
|
||||
#endif /* ENABLE_UTF_8_CONVERTER_TEST */
|
||||
defsubr (&Sdecode_sjis_char);
|
||||
defsubr (&Sencode_sjis_char);
|
||||
defsubr (&Sdecode_big5_char);
|
||||
|
|
|
@ -689,6 +689,10 @@ extern Lisp_Object code_convert_string (Lisp_Object, Lisp_Object,
|
|||
Lisp_Object, bool, bool, bool);
|
||||
extern Lisp_Object code_convert_string_norecord (Lisp_Object, Lisp_Object,
|
||||
bool);
|
||||
extern Lisp_Object encode_string_utf_8 (Lisp_Object, Lisp_Object, bool,
|
||||
Lisp_Object, Lisp_Object);
|
||||
extern Lisp_Object decode_string_utf_8 (Lisp_Object, Lisp_Object, bool,
|
||||
Lisp_Object, Lisp_Object);
|
||||
extern Lisp_Object encode_file_name (Lisp_Object);
|
||||
extern Lisp_Object decode_file_name (Lisp_Object);
|
||||
extern Lisp_Object raw_text_coding_system (Lisp_Object);
|
||||
|
|
Loading…
Add table
Reference in a new issue