Fix string-to-multibyte overlong sequence bug
* src/character.h (MULTIBYTE_LENGTH, MULTIBYTE_LENGTH_NO_CHECK): Remove, replacing with ... (multibyte_length): ... this new function. All callers changed. The new function rejects overlong multibyte forms. * test/src/buffer-tests.el (buffer-multibyte-overlong-sequences): New test.
This commit is contained in:
parent
856d9378a4
commit
c88a3be808
5 changed files with 99 additions and 66 deletions
|
@ -2634,8 +2634,7 @@ current buffer is cleared. */)
|
|||
if (ASCII_CHAR_P (*p))
|
||||
p++, pos++;
|
||||
else if (EQ (flag, Qt)
|
||||
&& ! CHAR_BYTE8_HEAD_P (*p)
|
||||
&& (bytes = MULTIBYTE_LENGTH (p, pend)) > 0)
|
||||
&& 0 < (bytes = multibyte_length (p, pend, true, false)))
|
||||
p += bytes, pos += bytes;
|
||||
else
|
||||
{
|
||||
|
|
|
@ -486,7 +486,7 @@ multibyte_chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
|
|||
|
||||
while (ptr < endp)
|
||||
{
|
||||
int len = MULTIBYTE_LENGTH (ptr, endp);
|
||||
int len = multibyte_length (ptr, endp, true, true);
|
||||
|
||||
if (len == 0)
|
||||
emacs_abort ();
|
||||
|
@ -508,7 +508,6 @@ parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len,
|
|||
ptrdiff_t *nchars, ptrdiff_t *nbytes)
|
||||
{
|
||||
const unsigned char *endp = str + len;
|
||||
int n;
|
||||
ptrdiff_t chars = 0, bytes = 0;
|
||||
|
||||
if (len >= MAX_MULTIBYTE_LENGTH)
|
||||
|
@ -516,8 +515,8 @@ parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len,
|
|||
const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
|
||||
while (str < adjusted_endp)
|
||||
{
|
||||
if (! CHAR_BYTE8_HEAD_P (*str)
|
||||
&& (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
|
||||
int n = multibyte_length (str, NULL, false, false);
|
||||
if (0 < n)
|
||||
str += n, bytes += n;
|
||||
else
|
||||
str++, bytes += 2;
|
||||
|
@ -526,8 +525,8 @@ parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len,
|
|||
}
|
||||
while (str < endp)
|
||||
{
|
||||
if (! CHAR_BYTE8_HEAD_P (*str)
|
||||
&& (n = MULTIBYTE_LENGTH (str, endp)) > 0)
|
||||
int n = multibyte_length (str, endp, true, false);
|
||||
if (0 < n)
|
||||
str += n, bytes += n;
|
||||
else
|
||||
str++, bytes += 2;
|
||||
|
@ -554,20 +553,25 @@ str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes,
|
|||
unsigned char *p = str, *endp = str + nbytes;
|
||||
unsigned char *to;
|
||||
ptrdiff_t chars = 0;
|
||||
int n;
|
||||
|
||||
if (nbytes >= MAX_MULTIBYTE_LENGTH)
|
||||
{
|
||||
unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
|
||||
while (p < adjusted_endp
|
||||
&& ! CHAR_BYTE8_HEAD_P (*p)
|
||||
&& (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
|
||||
p += n, chars++;
|
||||
while (p < adjusted_endp)
|
||||
{
|
||||
int n = multibyte_length (p, NULL, false, false);
|
||||
if (n <= 0)
|
||||
break;
|
||||
p += n, chars++;
|
||||
}
|
||||
}
|
||||
while (true)
|
||||
{
|
||||
int n = multibyte_length (p, endp, true, false);
|
||||
if (n <= 0)
|
||||
break;
|
||||
p += n, chars++;
|
||||
}
|
||||
while (p < endp
|
||||
&& ! CHAR_BYTE8_HEAD_P (*p)
|
||||
&& (n = MULTIBYTE_LENGTH (p, endp)) > 0)
|
||||
p += n, chars++;
|
||||
if (nchars)
|
||||
*nchars = chars;
|
||||
if (p == endp)
|
||||
|
@ -584,8 +588,8 @@ str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes,
|
|||
unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
|
||||
while (p < adjusted_endp)
|
||||
{
|
||||
if (! CHAR_BYTE8_HEAD_P (*p)
|
||||
&& (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
|
||||
int n = multibyte_length (p, NULL, false, false);
|
||||
if (0 < n)
|
||||
{
|
||||
while (n--)
|
||||
*to++ = *p++;
|
||||
|
@ -601,8 +605,8 @@ str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes,
|
|||
}
|
||||
while (p < endp)
|
||||
{
|
||||
if (! CHAR_BYTE8_HEAD_P (*p)
|
||||
&& (n = MULTIBYTE_LENGTH (p, endp)) > 0)
|
||||
int n = multibyte_length (p, endp, true, false);
|
||||
if (0 < n)
|
||||
{
|
||||
while (n--)
|
||||
*to++ = *p++;
|
||||
|
|
|
@ -31,15 +31,19 @@ INLINE_HEADER_BEGIN
|
|||
/* character code 1st byte byte sequence
|
||||
-------------- -------- -------------
|
||||
0-7F 00..7F 0xxxxxxx
|
||||
80-7FF C2..DF 110xxxxx 10xxxxxx
|
||||
800-FFFF E0..EF 1110xxxx 10xxxxxx 10xxxxxx
|
||||
10000-1FFFFF F0..F7 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
200000-3FFF7F F8 11111000 1000xxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
80-7FF C2..DF 110yyyyx 10xxxxxx
|
||||
800-FFFF E0..EF 1110yyyy 10yxxxxx 10xxxxxx
|
||||
10000-1FFFFF F0..F7 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
|
||||
200000-3FFF7F F8 11111000 1000yxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
3FFF80-3FFFFF C0..C1 1100000x 10xxxxxx (for eight-bit-char)
|
||||
400000-... invalid
|
||||
|
||||
invalid 1st byte 80..BF 10xxxxxx
|
||||
F9..FF 11111xxx (xxx != 000)
|
||||
F9..FF 11111yyy
|
||||
|
||||
In each bit pattern, 'x' and 'y' each represent a single bit of the
|
||||
character code payload, and at least one 'y' must be a 1 bit.
|
||||
In the 5-byte sequence, the 22-bit payload cannot exceed 3FFF7F.
|
||||
*/
|
||||
|
||||
/* Maximum character code ((1 << CHARACTERBITS) - 1). */
|
||||
|
@ -284,7 +288,7 @@ CHAR_HEAD_P (int byte)
|
|||
}
|
||||
|
||||
/* How many bytes a character that starts with BYTE occupies in a
|
||||
multibyte form. Unlike MULTIBYTE_LENGTH below, this function does not
|
||||
multibyte form. Unlike multibyte_length, this function does not
|
||||
validate the multibyte form, but looks only at its first byte. */
|
||||
INLINE int
|
||||
BYTES_BY_CHAR_HEAD (int byte)
|
||||
|
@ -297,44 +301,54 @@ BYTES_BY_CHAR_HEAD (int byte)
|
|||
}
|
||||
|
||||
|
||||
/* The byte length of multibyte form at unibyte string P ending at
|
||||
PEND. If the string doesn't point to a valid multibyte form,
|
||||
return 0. Unlike BYTES_BY_CHAR_HEAD, this macro validates the
|
||||
multibyte form. */
|
||||
/* The byte length of the multibyte form at the unibyte string P,
|
||||
ending at PEND if CHECK, and without a length check if !CHECK.
|
||||
If ALLOW_8BIT, allow multibyte forms of eight-bit characters.
|
||||
If the string doesn't point to a valid multibyte form, return 0.
|
||||
Unlike BYTES_BY_CHAR_HEAD, this function validates the multibyte form. */
|
||||
|
||||
INLINE int
|
||||
MULTIBYTE_LENGTH (unsigned char const *p, unsigned char const *pend)
|
||||
multibyte_length (unsigned char const *p, unsigned char const *pend,
|
||||
bool check, bool allow_8bit)
|
||||
{
|
||||
return (! (p < pend) ? 0
|
||||
: ! (p[0] & 0x80) ? 1
|
||||
: ! (p + 1 < pend && (p[1] & 0xC0) == 0x80) ? 0
|
||||
: (p[0] & 0xE0) == 0xC0 ? 2
|
||||
: ! (p + 2 < pend && (p[2] & 0xC0) == 0x80) ? 0
|
||||
: (p[0] & 0xF0) == 0xE0 ? 3
|
||||
: ! (p + 3 < pend && (p[3] & 0xC0) == 0x80) ? 0
|
||||
: (p[0] & 0xF8) == 0xF0 ? 4
|
||||
: ! (p + 4 < pend && (p[4] & 0xC0) == 0x80) ? 0
|
||||
: p[0] == 0xF8 && (p[1] & 0xF0) == 0x80 ? 5
|
||||
: 0);
|
||||
}
|
||||
if (!check || p < pend)
|
||||
{
|
||||
unsigned char c = p[0];
|
||||
if (c < 0x80)
|
||||
return 1;
|
||||
if (!check || p + 1 < pend)
|
||||
{
|
||||
/* The 'unsigned int' avoids int overflow in the 5-byte case. */
|
||||
unsigned int d = p[1];
|
||||
|
||||
if (TRAILING_CODE_P (d))
|
||||
{
|
||||
if (allow_8bit ? (c & 0xE0) == 0xC0 : 0xC2 <= c && c <= 0xDF)
|
||||
return 2;
|
||||
if ((!check || p + 2 < pend)
|
||||
&& TRAILING_CODE_P (p[2]))
|
||||
{
|
||||
if ((c & 0xF0) == 0xE0 && ((c & 0x0F) | (d & 0x20)))
|
||||
return 3;
|
||||
if ((!check || p + 3 < pend) && TRAILING_CODE_P (p[3]))
|
||||
{
|
||||
if ((c & 0xF8) == 0xF0 && ((c & 0x07) | (d & 0x30)))
|
||||
return 4;
|
||||
if (c == 0xF8 && (!check || p + 4 < pend)
|
||||
&& TRAILING_CODE_P (p[4]))
|
||||
{
|
||||
unsigned int w = ((d << 24) + (p[2] << 16)
|
||||
+ (p[3] << 8) + p[4]);
|
||||
if (0x88808080 <= w && w <= 0x8FBFBDBF)
|
||||
return 5;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Like MULTIBYTE_LENGTH, but don't check the ending address. The
|
||||
multibyte form is still validated, unlike BYTES_BY_CHAR_HEAD. */
|
||||
|
||||
INLINE int
|
||||
MULTIBYTE_LENGTH_NO_CHECK (unsigned char const *p)
|
||||
{
|
||||
return (!(p[0] & 0x80) ? 1
|
||||
: (p[1] & 0xC0) != 0x80 ? 0
|
||||
: (p[0] & 0xE0) == 0xC0 ? 2
|
||||
: (p[2] & 0xC0) != 0x80 ? 0
|
||||
: (p[0] & 0xF0) == 0xE0 ? 3
|
||||
: (p[3] & 0xC0) != 0x80 ? 0
|
||||
: (p[0] & 0xF8) == 0xF0 ? 4
|
||||
: (p[4] & 0xC0) != 0x80 ? 0
|
||||
: p[0] == 0xF8 && (p[1] & 0xF0) == 0x80 ? 5
|
||||
: 0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
|
14
src/coding.c
14
src/coding.c
|
@ -7670,15 +7670,17 @@ consume_chars (struct coding_system *coding, Lisp_Object translation_table,
|
|||
|
||||
if (! multibytep)
|
||||
{
|
||||
int bytes;
|
||||
|
||||
if (coding->encoder == encode_coding_raw_text
|
||||
|| coding->encoder == encode_coding_ccl)
|
||||
c = *src++, pos++;
|
||||
else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
|
||||
c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
|
||||
else
|
||||
c = BYTE8_TO_CHAR (*src), src++, pos++;
|
||||
{
|
||||
int bytes = multibyte_length (src, src_end, true, true);
|
||||
if (0 < bytes)
|
||||
c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
|
||||
else
|
||||
c = BYTE8_TO_CHAR (*src), src++, pos++;
|
||||
}
|
||||
}
|
||||
else
|
||||
c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
|
||||
|
@ -7727,7 +7729,7 @@ consume_chars (struct coding_system *coding, Lisp_Object translation_table,
|
|||
for (i = 1; i < to_nchars; i++)
|
||||
*buf++ = XFIXNUM (AREF (trans, i));
|
||||
for (i = 1; i < from_nchars; i++, pos++)
|
||||
src += MULTIBYTE_LENGTH_NO_CHECK (src);
|
||||
src += multibyte_length (src, NULL, false, true);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1313,4 +1313,18 @@ with parameters from the *Messages* buffer modification."
|
|||
(ovshould nonempty-eob-end 4 5)
|
||||
(ovshould empty-eob 5 5)))))
|
||||
|
||||
(ert-deftest buffer-multibyte-overlong-sequences ()
|
||||
(dolist (uni '("\xE0\x80\x80"
|
||||
"\xF0\x80\x80\x80"
|
||||
"\xF8\x8F\xBF\xBF\x80"))
|
||||
(let ((multi (string-to-multibyte uni)))
|
||||
(should
|
||||
(string-equal
|
||||
multi
|
||||
(with-temp-buffer
|
||||
(set-buffer-multibyte nil)
|
||||
(insert uni)
|
||||
(set-buffer-multibyte t)
|
||||
(buffer-string)))))))
|
||||
|
||||
;;; buffer-tests.el ends here
|
||||
|
|
Loading…
Add table
Reference in a new issue