(detect_coding_iso2022): New arg latin_extra_code_state. Allow Latin

extra codes only when *latin_extra_code_state is nonzero.
(detect_coding_mask): If there is a NULL byte, detect the encoding as
UTF-16 or binary.  If there is a Latin extra code, detect the encoding
as ISO-2022 only when no other proper encoding is found.
This commit is contained in:
Kenichi Handa 2008-01-09 06:05:23 +00:00
parent ca8dfeda7c
commit 36a04480a5

View file

@ -1406,12 +1406,17 @@ enum iso_code_class_type iso_code_class[256];
CODING_CATEGORY_MASK_ISO_7_ELSE
CODING_CATEGORY_MASK_ISO_8_ELSE
are set. If a code which should never appear in ISO2022 is found,
returns 0. */
returns 0.
If *latin_extra_code_state is zero and Latin extra codes are found,
set *latin_extra_code_state to 1 and return 0. If it is nonzero,
accept Latin extra codes. */
static int
detect_coding_iso2022 (src, src_end, multibytep)
detect_coding_iso2022 (src, src_end, multibytep, latin_extra_code_state)
unsigned char *src, *src_end;
int multibytep;
int *latin_extra_code_state;
{
int mask = CODING_CATEGORY_MASK_ISO;
int mask_found = 0;
@ -1574,6 +1579,11 @@ detect_coding_iso2022 (src, src_end, multibytep)
if (VECTORP (Vlatin_extra_code_table)
&& !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
{
if (! *latin_extra_code_state)
{
*latin_extra_code_state = 1;
return 0;
}
if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
& CODING_FLAG_ISO_LATIN_EXTRA)
newmask |= CODING_CATEGORY_MASK_ISO_8_1;
@ -1600,6 +1610,11 @@ detect_coding_iso2022 (src, src_end, multibytep)
{
int newmask = 0;
if (! *latin_extra_code_state)
{
*latin_extra_code_state = 1;
return 0;
}
if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
& CODING_FLAG_ISO_LATIN_EXTRA)
newmask |= CODING_CATEGORY_MASK_ISO_8_1;
@ -4127,6 +4142,8 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
unsigned char *src = source, *src_end = source + src_bytes;
unsigned int mask, utf16_examined_p, iso2022_examined_p;
int i;
int null_byte_found;
int latin_extra_code_state = 1;
/* At first, skip all ASCII characters and control characters except
for three ISO2022 specific control characters. */
@ -4135,21 +4152,32 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
ascii_skip_code[ISO_CODE_ESC] = 0;
label_loop_detect_coding:
while (src < src_end && ascii_skip_code[*src]) src++;
null_byte_found = 0;
while (src < src_end && ascii_skip_code[*src])
null_byte_found |= (! *src++);
if (! null_byte_found)
{
unsigned char *p = src + 1;
while (p < src_end)
null_byte_found |= (! *p++);
}
*skip = src - source;
if (src >= src_end)
/* We found nothing other than ASCII. There's nothing to do. */
/* We found nothing other than ASCII (and NULL byte). There's
nothing to do. */
return 0;
c = *src;
/* The text seems to be encoded in some multilingual coding system.
Now, try to find in which coding system the text is encoded. */
if (c < 0x80)
if (! null_byte_found && c < 0x80)
{
/* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
/* C is an ISO2022 specific control code of C0. */
mask = detect_coding_iso2022 (src, src_end, multibytep);
latin_extra_code_state = 1;
mask = detect_coding_iso2022 (src, src_end, multibytep,
&latin_extra_code_state);
if (mask == 0)
{
/* No valid ISO2022 code follows C. Try again. */
@ -4177,21 +4205,27 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
c = src[1] - 0x20;
if (c < 0xA0)
if (null_byte_found)
{
try = (CODING_CATEGORY_MASK_UTF_16_BE
| CODING_CATEGORY_MASK_UTF_16_LE);
}
else if (c < 0xA0)
{
/* C is the first byte of SJIS character code,
or a leading-code of Emacs' internal format (emacs-mule),
or the first byte of UTF-16. */
try = (CODING_CATEGORY_MASK_SJIS
| CODING_CATEGORY_MASK_EMACS_MULE
| CODING_CATEGORY_MASK_UTF_16_BE
| CODING_CATEGORY_MASK_UTF_16_LE);
| CODING_CATEGORY_MASK_EMACS_MULE
| CODING_CATEGORY_MASK_UTF_16_BE
| CODING_CATEGORY_MASK_UTF_16_LE);
/* Or, if C is a special latin extra code,
or is an ISO2022 specific control code of C1 (SS2 or SS3),
or is an ISO2022 control-sequence-introducer (CSI),
we should also consider the possibility of ISO2022 codings. */
if ((VECTORP (Vlatin_extra_code_table)
if ((latin_extra_code_state
&& VECTORP (Vlatin_extra_code_table)
&& !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
|| (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
|| (c == ISO_CODE_CSI
@ -4201,7 +4235,7 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
&& src + 1 < src_end
&& src[1] == ']')))))
try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
| CODING_CATEGORY_MASK_ISO_8BIT);
| CODING_CATEGORY_MASK_ISO_8BIT);
}
else
/* C is a character of ISO2022 in graphic plane right,
@ -4209,29 +4243,36 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
or the first byte of BIG5's 2-byte code,
or the first byte of UTF-8/16. */
try = (CODING_CATEGORY_MASK_ISO_8_ELSE
| CODING_CATEGORY_MASK_ISO_8BIT
| CODING_CATEGORY_MASK_SJIS
| CODING_CATEGORY_MASK_BIG5
| CODING_CATEGORY_MASK_UTF_8
| CODING_CATEGORY_MASK_UTF_16_BE
| CODING_CATEGORY_MASK_UTF_16_LE);
| CODING_CATEGORY_MASK_ISO_8BIT
| CODING_CATEGORY_MASK_SJIS
| CODING_CATEGORY_MASK_BIG5
| CODING_CATEGORY_MASK_UTF_8
| CODING_CATEGORY_MASK_UTF_16_BE
| CODING_CATEGORY_MASK_UTF_16_LE);
/* Or, we may have to consider the possibility of CCL. */
if (coding_system_table[CODING_CATEGORY_IDX_CCL]
if (! null_byte_found
&& coding_system_table[CODING_CATEGORY_IDX_CCL]
&& (coding_system_table[CODING_CATEGORY_IDX_CCL]
->spec.ccl.valid_codes)[c])
try |= CODING_CATEGORY_MASK_CCL;
mask = 0;
utf16_examined_p = iso2022_examined_p = 0;
if (priorities)
{
/* At first try detection with Latin extra codes not-allowed.
If no proper coding system is found because of Latin extra
codes, try detection with Latin extra codes allowed. */
latin_extra_code_state = 0;
label_retry:
utf16_examined_p = iso2022_examined_p = 0;
for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
{
if (!iso2022_examined_p
&& (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
{
mask |= detect_coding_iso2022 (src, src_end, multibytep);
mask |= detect_coding_iso2022 (src, src_end, multibytep,
&latin_extra_code_state);
iso2022_examined_p = 1;
}
else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
@ -4252,16 +4293,40 @@ detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
mask |= detect_coding_ccl (src, src_end, multibytep);
else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
mask |= CODING_CATEGORY_MASK_RAW_TEXT;
{
if (latin_extra_code_state == 1)
{
/* Detection of ISO-2022 based coding system
failed because of Latin extra codes. Before
falling back to raw-text, try again with
Latin extra codes allowed. */
latin_extra_code_state = 2;
try = (mask | CODING_CATEGORY_MASK_ISO_8_ELSE
| CODING_CATEGORY_MASK_ISO_8BIT);
goto label_retry;
}
mask |= CODING_CATEGORY_MASK_RAW_TEXT;
}
else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
mask |= CODING_CATEGORY_MASK_BINARY;
{
if (latin_extra_code_state == 1)
{
/* See the above comment. */
latin_extra_code_state = 2;
try = (mask | CODING_CATEGORY_MASK_ISO_8_ELSE
| CODING_CATEGORY_MASK_ISO_8BIT);
goto label_retry;
}
mask |= CODING_CATEGORY_MASK_BINARY;
}
if (mask & priorities[i])
return priorities[i];
}
return CODING_CATEGORY_MASK_RAW_TEXT;
}
if (try & CODING_CATEGORY_MASK_ISO)
mask |= detect_coding_iso2022 (src, src_end, multibytep);
mask |= detect_coding_iso2022 (src, src_end, multibytep,
&latin_extra_code_state);
if (try & CODING_CATEGORY_MASK_SJIS)
mask |= detect_coding_sjis (src, src_end, multibytep);
if (try & CODING_CATEGORY_MASK_BIG5)