Optimize ASCII file reading with EOL format detection and decoding.
This commit is contained in:
parent
9b59398006
commit
8a44e6d176
5 changed files with 210 additions and 60 deletions
|
@ -1,3 +1,31 @@
|
|||
2013-03-15 handa <handa@gnu.org>
|
||||
|
||||
* insdel.c (insert_from_gap): New arg text_at_gap_tail.
|
||||
(adjust_after_replace): Make it back to static. Delete the third
|
||||
arg text_at_gap_tail. Cancel the code for handling it.
|
||||
|
||||
* coding.h (struct coding_system): New member eol_seen.
|
||||
|
||||
* coding.c (detect_ascii): New function.
|
||||
(detect_coding): Set coding->head_ascii and coding->eol_seen only
|
||||
when the source bytes are actually scanned. On detecting for
|
||||
coding_category_utf_8_auto, call detect_ascii instead of scanning
|
||||
source bytes directly.
|
||||
(produce_chars): Call insert_from_gap with the new arg 0.
|
||||
(encode_coding): Likewise.
|
||||
(decode_coding_gap): Control ASCII optimization by the variable
|
||||
disable_ascii_optimization instead of #ifndef .. #endif.
|
||||
Decode EOL format according to coding->eol_seen.
|
||||
(syms_of_coding): Declare disable-ascii-optimization as a Lisp
|
||||
variable.
|
||||
|
||||
* global.h (struct emacs_globals): New member
|
||||
f_disable_ascii_optimization.
|
||||
(disable_ascii_optimization): New macro.
|
||||
|
||||
* lisp.h (adjust_after_replace): Cancel externing it.
|
||||
(insert_from_gap): Adjust prototype.
|
||||
|
||||
2013-03-11 Paul Eggert <eggert@cs.ucla.edu>
|
||||
|
||||
* insdel.c (adjust_after_replace): Use bool for boolean.
|
||||
|
|
199
src/coding.c
199
src/coding.c
|
@ -6071,6 +6071,93 @@ complement_process_encoding_system (Lisp_Object coding_system)
|
|||
#define EOL_SEEN_CR 2
|
||||
#define EOL_SEEN_CRLF 4
|
||||
|
||||
|
||||
static Lisp_Object adjust_coding_eol_type (struct coding_system *coding, int eol_seen);
|
||||
|
||||
|
||||
/* Return 1 if all the source bytes are ASCII, and return 0 otherwise.
|
||||
By side effects, set coding->head_ascii and coding->eol_seen. The
|
||||
value of coding->eol_seen is "logical or" of EOL_SEEN_LF,
|
||||
EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is reliable only when
|
||||
all the source bytes are ASCII. */
|
||||
|
||||
static bool
|
||||
detect_ascii (struct coding_system *coding)
|
||||
{
|
||||
const unsigned char *src, *end;
|
||||
Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
|
||||
int eol_seen;
|
||||
|
||||
eol_seen = (VECTORP (eol_type) ? EOL_SEEN_NONE
|
||||
: EQ (eol_type, Qunix) ? EOL_SEEN_LF
|
||||
: EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
|
||||
: EOL_SEEN_CR);
|
||||
coding_set_source (coding);
|
||||
src = coding->source;
|
||||
end = src + coding->src_bytes;
|
||||
|
||||
if (inhibit_eol_conversion)
|
||||
{
|
||||
/* We don't have to check EOL format. */
|
||||
while (src < end && !( *src & 0x80)) src++;
|
||||
eol_seen = EOL_SEEN_LF;
|
||||
adjust_coding_eol_type (coding, eol_seen);
|
||||
}
|
||||
else if (eol_seen != EOL_SEEN_NONE)
|
||||
{
|
||||
/* We don't have to check EOL format either. */
|
||||
while (src < end && !(*src & 0x80)) src++;
|
||||
}
|
||||
else
|
||||
{
|
||||
end--; /* We look ahead one byte. */
|
||||
while (src < end)
|
||||
{
|
||||
int c = *src;
|
||||
|
||||
if (c & 0x80)
|
||||
break;
|
||||
src++;
|
||||
if (c < 0x20)
|
||||
{
|
||||
if (c == '\r')
|
||||
{
|
||||
if (*src == '\n')
|
||||
{
|
||||
eol_seen |= EOL_SEEN_CRLF;
|
||||
src++;
|
||||
}
|
||||
else
|
||||
eol_seen |= EOL_SEEN_CR;
|
||||
}
|
||||
else if (c == '\n')
|
||||
eol_seen |= EOL_SEEN_LF;
|
||||
}
|
||||
}
|
||||
if (src > end)
|
||||
/* The last two bytes are CR LF, which means that we have
|
||||
scanned all bytes. */
|
||||
end++;
|
||||
else if (src == end)
|
||||
{
|
||||
end++;
|
||||
if (! (*src & 0x80))
|
||||
{
|
||||
if (*src == '\r')
|
||||
eol_seen |= EOL_SEEN_CR;
|
||||
else if (*src == '\n')
|
||||
eol_seen |= EOL_SEEN_LF;
|
||||
src++;
|
||||
}
|
||||
}
|
||||
adjust_coding_eol_type (coding, eol_seen);
|
||||
}
|
||||
coding->head_ascii = src - coding->source;
|
||||
coding->eol_seen = eol_seen;
|
||||
return (src == end);
|
||||
}
|
||||
|
||||
|
||||
/* Detect how end-of-line of a text of length SRC_BYTES pointed by
|
||||
SOURCE is encoded. If CATEGORY is one of
|
||||
coding_category_utf_16_XXXX, assume that CR and LF are encoded by
|
||||
|
@ -6215,7 +6302,6 @@ detect_coding (struct coding_system *coding)
|
|||
coding_set_source (coding);
|
||||
|
||||
src_end = coding->source + coding->src_bytes;
|
||||
coding->head_ascii = 0;
|
||||
|
||||
/* If we have not yet decided the text encoding type, detect it
|
||||
now. */
|
||||
|
@ -6225,6 +6311,8 @@ detect_coding (struct coding_system *coding)
|
|||
struct coding_detection_info detect_info;
|
||||
bool null_byte_found = 0, eight_bit_found = 0;
|
||||
|
||||
coding->head_ascii = 0;
|
||||
coding->eol_seen = EOL_SEEN_NONE;
|
||||
detect_info.checked = detect_info.found = detect_info.rejected = 0;
|
||||
for (src = coding->source; src < src_end; src++)
|
||||
{
|
||||
|
@ -6263,6 +6351,26 @@ detect_coding (struct coding_system *coding)
|
|||
if (eight_bit_found)
|
||||
break;
|
||||
}
|
||||
else if (! disable_ascii_optimization
|
||||
&& ! inhibit_eol_conversion)
|
||||
{
|
||||
if (c == '\r')
|
||||
{
|
||||
if (src < src_end && src[1] == '\n')
|
||||
{
|
||||
coding->eol_seen |= EOL_SEEN_CRLF;
|
||||
src++;
|
||||
coding->head_ascii++;
|
||||
}
|
||||
else
|
||||
coding->eol_seen |= EOL_SEEN_CR;
|
||||
}
|
||||
else if (c == '\n')
|
||||
{
|
||||
coding->eol_seen |= EOL_SEEN_LF;
|
||||
}
|
||||
}
|
||||
|
||||
if (! eight_bit_found)
|
||||
coding->head_ascii++;
|
||||
}
|
||||
|
@ -6353,19 +6461,20 @@ detect_coding (struct coding_system *coding)
|
|||
coding_systems
|
||||
= AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
|
||||
detect_info.found = detect_info.rejected = 0;
|
||||
for (src = coding->source; src < src_end; src++)
|
||||
if (detect_ascii (coding))
|
||||
{
|
||||
if (*src & 0x80)
|
||||
break;
|
||||
setup_coding_system (XCDR (coding_systems), coding);
|
||||
}
|
||||
coding->head_ascii = src - coding->source;
|
||||
if (CONSP (coding_systems)
|
||||
&& detect_coding_utf_8 (coding, &detect_info))
|
||||
else
|
||||
{
|
||||
if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
|
||||
setup_coding_system (XCAR (coding_systems), coding);
|
||||
else
|
||||
setup_coding_system (XCDR (coding_systems), coding);
|
||||
if (CONSP (coding_systems)
|
||||
&& detect_coding_utf_8 (coding, &detect_info))
|
||||
{
|
||||
if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
|
||||
setup_coding_system (XCAR (coding_systems), coding);
|
||||
else
|
||||
setup_coding_system (XCDR (coding_systems), coding);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
|
||||
|
@ -6378,6 +6487,7 @@ detect_coding (struct coding_system *coding)
|
|||
= AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
|
||||
detect_info.found = detect_info.rejected = 0;
|
||||
coding->head_ascii = 0;
|
||||
coding->eol_seen = EOL_SEEN_NONE;
|
||||
if (CONSP (coding_systems)
|
||||
&& detect_coding_utf_16 (coding, &detect_info))
|
||||
{
|
||||
|
@ -6815,7 +6925,7 @@ produce_chars (struct coding_system *coding, Lisp_Object translation_table,
|
|||
|
||||
produced = dst - (coding->destination + coding->produced);
|
||||
if (BUFFERP (coding->dst_object) && produced_chars > 0)
|
||||
insert_from_gap (produced_chars, produced);
|
||||
insert_from_gap (produced_chars, produced, 0);
|
||||
coding->produced += produced;
|
||||
coding->produced_char += produced_chars;
|
||||
return carryover;
|
||||
|
@ -7400,7 +7510,7 @@ encode_coding (struct coding_system *coding)
|
|||
} while (coding->consumed_char < coding->src_chars);
|
||||
|
||||
if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
|
||||
insert_from_gap (coding->produced_char, coding->produced);
|
||||
insert_from_gap (coding->produced_char, coding->produced, 0);
|
||||
|
||||
SAFE_FREE ();
|
||||
}
|
||||
|
@ -7510,39 +7620,45 @@ decode_coding_gap (struct coding_system *coding,
|
|||
if (CODING_REQUIRE_DETECTION (coding))
|
||||
detect_coding (coding);
|
||||
attrs = CODING_ID_ATTRS (coding->id);
|
||||
#ifndef CODING_DISABLE_ASCII_OPTIMIZATION
|
||||
if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
|
||||
&& NILP (CODING_ATTR_POST_READ (attrs))
|
||||
&& NILP (get_translation_table (attrs, 0, NULL))
|
||||
&& (inhibit_eol_conversion
|
||||
|| EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)))
|
||||
if (! disable_ascii_optimization)
|
||||
{
|
||||
/* We can skip the conversion if all source bytes are ASCII. */
|
||||
if (coding->head_ascii < 0)
|
||||
if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
|
||||
&& NILP (CODING_ATTR_POST_READ (attrs))
|
||||
&& NILP (get_translation_table (attrs, 0, NULL))
|
||||
&& (coding->head_ascii >= 0 /* We've already called detect_coding */
|
||||
? coding->head_ascii == bytes
|
||||
: detect_ascii (coding)))
|
||||
{
|
||||
/* We have not yet counted the number of ASCII bytes at the
|
||||
head of the source. Do it now. */
|
||||
const unsigned char *src, *src_end;
|
||||
|
||||
coding_set_source (coding);
|
||||
src_end = coding->source + coding->src_bytes;
|
||||
for (src = coding->source; src < src_end; src++)
|
||||
if (coding->eol_seen == EOL_SEEN_CR)
|
||||
{
|
||||
if (*src & 0x80)
|
||||
break;
|
||||
unsigned char *src_end = GAP_END_ADDR;
|
||||
unsigned char *src = src - coding->src_bytes;
|
||||
|
||||
while (src < src_end)
|
||||
{
|
||||
if (*src++ == '\r')
|
||||
src[-1] = '\n';
|
||||
}
|
||||
}
|
||||
coding->head_ascii = src - coding->source;
|
||||
}
|
||||
if (coding->src_bytes == coding->head_ascii)
|
||||
{
|
||||
/* No need of conversion. Use the data in the gap as is. */
|
||||
coding->produced_char = chars;
|
||||
coding->produced = bytes;
|
||||
adjust_after_replace (PT, PT_BYTE, Qnil, chars, bytes, 1);
|
||||
else if (coding->eol_seen == EOL_SEEN_CRLF)
|
||||
{
|
||||
unsigned char *src = GAP_END_ADDR;
|
||||
unsigned char *src_beg = src - coding->src_bytes;
|
||||
unsigned char *dst = src;
|
||||
|
||||
while (src_beg < src)
|
||||
{
|
||||
*--dst = *--src;
|
||||
if (*src == '\n')
|
||||
src--;
|
||||
}
|
||||
bytes -= dst - src;
|
||||
}
|
||||
coding->produced_char = coding->produced = bytes;
|
||||
insert_from_gap (bytes, bytes, 1);
|
||||
return;
|
||||
}
|
||||
}
|
||||
#endif /* not CODING_DISABLE_ASCII_OPTIMIZATION */
|
||||
code_conversion_save (0, 0);
|
||||
|
||||
coding->mode |= CODING_MODE_LAST_BLOCK;
|
||||
|
@ -10758,6 +10874,11 @@ from GNU Find and GNU Grep. Emacs will then ignore the null bytes and
|
|||
decode text as usual. */);
|
||||
inhibit_null_byte_detection = 0;
|
||||
|
||||
DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
|
||||
doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
|
||||
Internal use only. Removed after the experimental optimizer gets stable. */);
|
||||
disable_ascii_optimization = 0;
|
||||
|
||||
DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
|
||||
doc: /* Char table for translating self-inserting characters.
|
||||
This is applied to the result of input methods, not their input.
|
||||
|
|
|
@ -440,9 +440,13 @@ struct coding_system
|
|||
/* How many heading bytes we can skip for decoding. This is set to
|
||||
-1 in setup_coding_system, and updated by detect_coding. So,
|
||||
when this is equal to the byte length of the text being
|
||||
converted, we can skip the actual conversion process. */
|
||||
converted, we can skip the actual conversion process except for
|
||||
the eol format. */
|
||||
ptrdiff_t head_ascii;
|
||||
|
||||
/* Used internally in coding.c. See the comment of detect_ascii. */
|
||||
int eol_seen;
|
||||
|
||||
/* The following members are set by encoding/decoding routine. */
|
||||
ptrdiff_t produced, produced_char, consumed, consumed_char;
|
||||
|
||||
|
|
33
src/insdel.c
33
src/insdel.c
|
@ -977,10 +977,11 @@ insert_from_string_1 (Lisp_Object string, ptrdiff_t pos, ptrdiff_t pos_byte,
|
|||
}
|
||||
|
||||
/* Insert a sequence of NCHARS chars which occupy NBYTES bytes
|
||||
starting at GPT_ADDR. */
|
||||
starting at GAP_END_ADDR - NBYTES (if text_at_gap_tail) and at
|
||||
GPT_ADDR (if not text_at_gap_tail). */
|
||||
|
||||
void
|
||||
insert_from_gap (ptrdiff_t nchars, ptrdiff_t nbytes)
|
||||
insert_from_gap (ptrdiff_t nchars, ptrdiff_t nbytes, bool text_at_gap_tail)
|
||||
{
|
||||
if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
|
||||
nchars = nbytes;
|
||||
|
@ -989,10 +990,13 @@ insert_from_gap (ptrdiff_t nchars, ptrdiff_t nbytes)
|
|||
MODIFF++;
|
||||
|
||||
GAP_SIZE -= nbytes;
|
||||
GPT += nchars;
|
||||
if (! text_at_gap_tail)
|
||||
{
|
||||
GPT += nchars;
|
||||
GPT_BYTE += nbytes;
|
||||
}
|
||||
ZV += nchars;
|
||||
Z += nchars;
|
||||
GPT_BYTE += nbytes;
|
||||
ZV_BYTE += nbytes;
|
||||
Z_BYTE += nbytes;
|
||||
if (GAP_SIZE > 0) *(GPT_ADDR) = 0; /* Put an anchor. */
|
||||
|
@ -1010,7 +1014,7 @@ insert_from_gap (ptrdiff_t nchars, ptrdiff_t nbytes)
|
|||
current_buffer, 0);
|
||||
}
|
||||
|
||||
if (GPT - nchars < PT)
|
||||
if (! text_at_gap_tail && GPT - nchars < PT)
|
||||
adjust_point (nchars, nbytes);
|
||||
|
||||
check_markers ();
|
||||
|
@ -1162,16 +1166,14 @@ insert_from_buffer_1 (struct buffer *buf,
|
|||
|
||||
/* Record undo information and adjust markers and position keepers for
|
||||
a replacement of a text PREV_TEXT at FROM to a new text of LEN
|
||||
chars (LEN_BYTE bytes). If TEXT_AT_GAP_TAIL, the new text
|
||||
resides at the gap tail; i.e. at (GAP_END_ADDR - LEN_BYTE)
|
||||
Otherwise, the text resides in the gap just after GPT_BYTE.
|
||||
chars (LEN_BYTE bytes) which resides in the gap just after
|
||||
GPT_ADDR.
|
||||
|
||||
PREV_TEXT nil means the new text was just inserted. */
|
||||
|
||||
void
|
||||
static void
|
||||
adjust_after_replace (ptrdiff_t from, ptrdiff_t from_byte,
|
||||
Lisp_Object prev_text, ptrdiff_t len, ptrdiff_t len_byte,
|
||||
bool text_at_gap_tail)
|
||||
Lisp_Object prev_text, ptrdiff_t len, ptrdiff_t len_byte)
|
||||
{
|
||||
ptrdiff_t nchars_del = 0, nbytes_del = 0;
|
||||
|
||||
|
@ -1191,11 +1193,8 @@ adjust_after_replace (ptrdiff_t from, ptrdiff_t from_byte,
|
|||
GAP_SIZE -= len_byte;
|
||||
ZV += len; Z+= len;
|
||||
ZV_BYTE += len_byte; Z_BYTE += len_byte;
|
||||
if (! text_at_gap_tail)
|
||||
{
|
||||
GPT += len; GPT_BYTE += len_byte;
|
||||
if (GAP_SIZE > 0) *(GPT_ADDR) = 0; /* Put an anchor. */
|
||||
}
|
||||
GPT += len; GPT_BYTE += len_byte;
|
||||
if (GAP_SIZE > 0) *(GPT_ADDR) = 0; /* Put an anchor. */
|
||||
|
||||
if (nchars_del > 0)
|
||||
adjust_markers_for_replace (from, from_byte, nchars_del, nbytes_del,
|
||||
|
@ -1250,7 +1249,7 @@ adjust_after_insert (ptrdiff_t from, ptrdiff_t from_byte,
|
|||
GPT -= len; GPT_BYTE -= len_byte;
|
||||
ZV -= len; ZV_BYTE -= len_byte;
|
||||
Z -= len; Z_BYTE -= len_byte;
|
||||
adjust_after_replace (from, from_byte, Qnil, newlen, len_byte, 0);
|
||||
adjust_after_replace (from, from_byte, Qnil, newlen, len_byte);
|
||||
}
|
||||
|
||||
/* Replace the text from character positions FROM to TO with NEW,
|
||||
|
|
|
@ -2880,7 +2880,7 @@ extern void insert (const char *, ptrdiff_t);
|
|||
extern void insert_and_inherit (const char *, ptrdiff_t);
|
||||
extern void insert_1_both (const char *, ptrdiff_t, ptrdiff_t,
|
||||
bool, bool, bool);
|
||||
extern void insert_from_gap (ptrdiff_t, ptrdiff_t);
|
||||
extern void insert_from_gap (ptrdiff_t, ptrdiff_t, bool text_at_gap_tail);
|
||||
extern void insert_from_string (Lisp_Object, ptrdiff_t, ptrdiff_t,
|
||||
ptrdiff_t, ptrdiff_t, bool);
|
||||
extern void insert_from_buffer (struct buffer *, ptrdiff_t, ptrdiff_t, bool);
|
||||
|
@ -2900,8 +2900,6 @@ extern Lisp_Object del_range_2 (ptrdiff_t, ptrdiff_t,
|
|||
extern void modify_region_1 (ptrdiff_t, ptrdiff_t, bool);
|
||||
extern void prepare_to_modify_buffer (ptrdiff_t, ptrdiff_t, ptrdiff_t *);
|
||||
extern void signal_after_change (ptrdiff_t, ptrdiff_t, ptrdiff_t);
|
||||
extern void adjust_after_replace (ptrdiff_t, ptrdiff_t, Lisp_Object,
|
||||
ptrdiff_t, ptrdiff_t, bool);
|
||||
extern void adjust_after_insert (ptrdiff_t, ptrdiff_t, ptrdiff_t,
|
||||
ptrdiff_t, ptrdiff_t);
|
||||
extern void adjust_markers_for_delete (ptrdiff_t, ptrdiff_t,
|
||||
|
|
Loading…
Add table
Reference in a new issue