(encode_coding_utf_8): Initialize produced_chars to 0.
(decode_coding_utf_16): Fix converting high and low bytes to code-point. (encode_coding_utf_16): Substitute coding->default_char for non-Unicode characters. (decode_coding): Don't call record_insert here. (setup_coding_system): Initialize `surrogate' of coding->spec.utf_16 to 0. (EMIT_ONE_BYTE): Fix for multibyte case.
This commit is contained in:
parent
ed9d8bdadc
commit
e19c3639af
1 changed files with 57 additions and 50 deletions
107
src/coding.c
107
src/coding.c
|
@ -46,31 +46,23 @@ Boston, MA 02111-1307, USA. */
|
|||
|
||||
CODING SYSTEM
|
||||
|
||||
Coding system is an encoding mechanism of one or more character
|
||||
sets. Here's a list of coding system types supported by Emacs.
|
||||
When we say "decode", it means converting a text encoded by some
|
||||
coding system into Emacs' internal format (emacs-utf-8), and when we
|
||||
say "encode", it means converting a text of emacs-utf-8 to some
|
||||
other coding system.
|
||||
Coding system is an object for a encoding mechanism that contains
|
||||
information about how to convert byte sequence to character
|
||||
sequences and vice versa. When we say "decode", it means converting
|
||||
a byte sequence of a specific coding system into a character
|
||||
sequence that is represented by Emacs' internal coding system
|
||||
`emacs-utf-8', and when we say "encode", it means converting a
|
||||
character sequence of emacs-utf-8 to a byte sequence of a specific
|
||||
coding system.
|
||||
|
||||
Emacs represents a coding system by a Lisp symbol. Each symbol is a
|
||||
key to the hash table Vcharset_hash_table. This hash table
|
||||
associates the symbol to the corresponding detailed specifications.
|
||||
In Emacs Lisp, a coding system is represented by a Lisp symbol. In
|
||||
C level, a coding system is represented by a vector of attributes
|
||||
stored in the hash table Vcharset_hash_table. The conversion from a
|
||||
coding system symbol to attributes vector is done by looking up
|
||||
Vcharset_hash_table by the symbol.
|
||||
|
||||
Before using a coding system for decoding and encoding, we setup a
|
||||
structure of type `struct coding_system'. This structure keeps
|
||||
various information about a specific code conversion (e.g. the
|
||||
location of source and destination data).
|
||||
|
||||
Coding systems are classified into the following types by how to
|
||||
represent a character in a byte sequence. Here's a brief descrition
|
||||
about type.
|
||||
|
||||
o Emacs' internal format (emacs-utf-8)
|
||||
|
||||
The extended UTF-8 which allows eight-bit raw bytes mixed with
|
||||
character codes. Emacs holds characters in buffers and strings by
|
||||
this format.
|
||||
Coding systems are classified into the following types depending on
|
||||
the mechanism of encoding. Here's a brief descrition about type.
|
||||
|
||||
o UTF-8
|
||||
|
||||
|
@ -137,6 +129,13 @@ END-OF-LINE FORMAT
|
|||
independent, any coding system described above can take any format
|
||||
of end-of-line (except for no-conversion).
|
||||
|
||||
STRUCT CODING_SYSTEM
|
||||
|
||||
Before using a coding system for code conversion (i.e. decoding and
|
||||
encoding), we setup a structure of type `struct coding_system'.
|
||||
This structure keeps various information about a specific code
|
||||
conversion (e.g. the location of source and destination data).
|
||||
|
||||
*/
|
||||
|
||||
/* COMMON MACROS */
|
||||
|
@ -818,19 +817,27 @@ static int detected_mask[coding_category_raw_text] =
|
|||
|
||||
/* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
|
||||
|
||||
#define EMIT_TWO_BYTES(c1, c2) \
|
||||
do { \
|
||||
produced_chars += 2; \
|
||||
if (multibytep) \
|
||||
{ \
|
||||
CHAR_STRING_ADVANCE ((int) (c1), dst); \
|
||||
CHAR_STRING_ADVANCE ((int) (c2), dst); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
*dst++ = (c1); \
|
||||
*dst++ = (c2); \
|
||||
} \
|
||||
#define EMIT_TWO_BYTES(c1, c2) \
|
||||
do { \
|
||||
produced_chars += 2; \
|
||||
if (multibytep) \
|
||||
{ \
|
||||
int ch; \
|
||||
\
|
||||
ch = (c1); \
|
||||
if (ch >= 0x80) \
|
||||
ch = BYTE8_TO_CHAR (ch); \
|
||||
CHAR_STRING_ADVANCE (ch, dst); \
|
||||
ch = (c2); \
|
||||
if (ch >= 0x80) \
|
||||
ch = BYTE8_TO_CHAR (ch); \
|
||||
CHAR_STRING_ADVANCE (ch, dst); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
*dst++ = (c1); \
|
||||
*dst++ = (c2); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
|
||||
|
@ -889,10 +896,14 @@ coding_set_source (coding)
|
|||
coding->source = GAP_END_ADDR + coding->src_pos_byte;
|
||||
else
|
||||
{
|
||||
if (coding->src_pos < GPT
|
||||
&& coding->src_pos + coding->src_chars >= GPT)
|
||||
move_gap_both (coding->src_pos, coding->src_pos_byte);
|
||||
coding->source = BYTE_POS_ADDR (coding->src_pos_byte);
|
||||
struct buffer *buf = XBUFFER (coding->src_object);
|
||||
EMACS_INT beg_byte = BUF_BEG_BYTE (buf);
|
||||
EMACS_INT gpt_byte = BUF_GPT_BYTE (buf);
|
||||
unsigned char *beg_addr = BUF_BEG_ADDR (buf);
|
||||
|
||||
coding->source = beg_addr + coding->src_pos_byte - 1;
|
||||
if (coding->src_pos_byte >= gpt_byte)
|
||||
coding->source += BUF_GAP_SIZE (buf);
|
||||
}
|
||||
}
|
||||
else if (STRINGP (coding->src_object))
|
||||
|
@ -1182,7 +1193,7 @@ encode_coding_utf_8 (coding)
|
|||
int *charbuf_end = charbuf + coding->charbuf_used;
|
||||
unsigned char *dst = coding->destination + coding->produced;
|
||||
unsigned char *dst_end = coding->destination + coding->dst_bytes;
|
||||
int produced_chars;
|
||||
int produced_chars = 0;
|
||||
int c;
|
||||
|
||||
if (multibytep)
|
||||
|
@ -1290,7 +1301,7 @@ decode_coding_utf_16 (coding)
|
|||
src_base = src;
|
||||
ONE_MORE_BYTE (c1);
|
||||
ONE_MORE_BYTE (c2);
|
||||
c = (c1 << 16) | c2;
|
||||
c = (c1 << 8) | c2;
|
||||
if (bom == utf_16_with_bom)
|
||||
{
|
||||
if (endian == utf_16_big_endian
|
||||
|
@ -1333,7 +1344,7 @@ decode_coding_utf_16 (coding)
|
|||
ONE_MORE_BYTE (c1);
|
||||
ONE_MORE_BYTE (c2);
|
||||
c = (endian == utf_16_big_endian
|
||||
? ((c1 << 16) | c2) : ((c2 << 16) | c1));
|
||||
? ((c1 << 8) | c2) : ((c2 << 8) | c1));
|
||||
if (surrogate)
|
||||
{
|
||||
if (! UTF_16_LOW_SURROGATE_P (c))
|
||||
|
@ -1404,8 +1415,8 @@ encode_coding_utf_16 (coding)
|
|||
{
|
||||
ASSURE_DESTINATION (safe_room);
|
||||
c = *charbuf++;
|
||||
if (c >= 0x110000)
|
||||
c = 0xFFFF;
|
||||
if (c >= MAX_UNICODE_CHAR)
|
||||
c = coding->default_char;
|
||||
|
||||
if (c < 0x10000)
|
||||
{
|
||||
|
@ -4504,6 +4515,7 @@ setup_coding_system (coding_system, coding)
|
|||
val = AREF (attrs, coding_attr_utf_16_endian);
|
||||
CODING_UTF_16_ENDIAN (coding) = (NILP (val) ? utf_16_big_endian
|
||||
: utf_16_little_endian);
|
||||
CODING_UTF_16_SURROGATE (coding) = 0;
|
||||
coding->detector = detect_coding_utf_16;
|
||||
coding->decoder = decode_coding_utf_16;
|
||||
coding->encoder = encode_coding_utf_16;
|
||||
|
@ -5458,11 +5470,6 @@ decode_coding (coding)
|
|||
coding->consumed = coding->src_bytes;
|
||||
}
|
||||
|
||||
if (BUFFERP (coding->dst_object))
|
||||
{
|
||||
record_insert (coding->dst_pos, coding->produced_char);
|
||||
}
|
||||
|
||||
return coding->result;
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue