Optimize ASCII file reading with EOL format detection and decoding.

This commit is contained in:
Kenichi Handa 2013-03-16 01:03:54 +09:00
parent 9b59398006
commit 8a44e6d176
5 changed files with 210 additions and 60 deletions

View file

@ -1,3 +1,31 @@
2013-03-15 handa <handa@gnu.org>
* insdel.c (insert_from_gap): New arg text_at_gap_tail.
(adjust_after_replace): Make it static again. Delete the last
arg text_at_gap_tail. Remove the code for handling it.
* coding.h (struct coding_system): New member eol_seen.
* coding.c (detect_ascii): New function.
(detect_coding): Set coding->head_ascii and coding->eol_seen only
when the source bytes are actually scanned. On detecting for
coding_category_utf_8_auto, call detect_ascii instead of scanning
source bytes directly.
(produce_chars): Call insert_from_gap with the new arg 0.
(encode_coding): Likewise.
(decode_coding_gap): Control ASCII optimization by the variable
disable_ascii_optimization instead of #ifndef .. #endif.
Decode EOL format according to coding->eol_seen.
(syms_of_coding): Declare disable-ascii-optimization as a Lisp
variable.
* global.h (struct emacs_globals): New member
f_disable_ascii_optimization.
(disable_ascii_optimization): New macro.
* lisp.h (adjust_after_replace): Cancel externing it.
(insert_from_gap): Adjust prototype.
2013-03-11 Paul Eggert <eggert@cs.ucla.edu>
* insdel.c (adjust_after_replace): Use bool for boolean.

View file

@ -6071,6 +6071,93 @@ complement_process_encoding_system (Lisp_Object coding_system)
#define EOL_SEEN_CR 2
#define EOL_SEEN_CRLF 4
static Lisp_Object adjust_coding_eol_type (struct coding_system *coding, int eol_seen);
/* Return true if all the source bytes are ASCII, and false otherwise.
   As side effects, set coding->head_ascii to the number of leading
   source bytes that were accepted as ASCII, and set coding->eol_seen.
   The value of coding->eol_seen is the "logical or" of EOL_SEEN_LF,
   EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is reliable only when
   all the source bytes are ASCII.  */
static bool
detect_ascii (struct coding_system *coding)
{
  const unsigned char *src, *end;
  Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
  int eol_seen;

  /* When the EOL type is a vector nothing is assumed yet
     (EOL_SEEN_NONE) and the bytes are examined below; otherwise start
     from the format named by the coding system.  */
  eol_seen = (VECTORP (eol_type) ? EOL_SEEN_NONE
              : EQ (eol_type, Qunix) ? EOL_SEEN_LF
              : EQ (eol_type, Qdos) ? EOL_SEEN_CRLF
              : EOL_SEEN_CR);

  /* coding->source must be refreshed before it is read.  */
  coding_set_source (coding);
  src = coding->source;
  end = src + coding->src_bytes;

  if (inhibit_eol_conversion)
    {
      /* We don't have to check EOL format.  */
      while (src < end && ! (*src & 0x80))
        src++;
      eol_seen = EOL_SEEN_LF;
      adjust_coding_eol_type (coding, eol_seen);
    }
  else if (eol_seen != EOL_SEEN_NONE)
    {
      /* The EOL format is already fixed; only look for a non-ASCII
         byte.  */
      while (src < end && ! (*src & 0x80))
        src++;
    }
  else
    {
      /* Scan only while one look-ahead byte remains after *SRC, so a
         CR can be paired with a following LF without reading past
         END.  Testing the remaining distance (instead of decrementing
         END) never forms a pointer before the start of an empty
         source.  */
      while (end - src > 1)
        {
          int c = *src;

          if (c & 0x80)
            break;
          src++;
          /* Cheap filter: CR and LF are both below 0x20.  */
          if (c < 0x20)
            {
              if (c == '\r')
                {
                  if (*src == '\n')
                    {
                      eol_seen |= EOL_SEEN_CRLF;
                      src++;
                    }
                  else
                    eol_seen |= EOL_SEEN_CR;
                }
              else if (c == '\n')
                eol_seen |= EOL_SEEN_LF;
            }
        }

      /* Exactly one byte is left when the loop ran out of look-ahead
         (as opposed to stopping at a non-ASCII byte or consuming a
         trailing CR LF pair).  It has no successor, so a CR here is a
         lone CR.  */
      if (end - src == 1 && ! (*src & 0x80))
        {
          if (*src == '\r')
            eol_seen |= EOL_SEEN_CR;
          else if (*src == '\n')
            eol_seen |= EOL_SEEN_LF;
          src++;
        }
      adjust_coding_eol_type (coding, eol_seen);
    }

  coding->head_ascii = src - coding->source;
  coding->eol_seen = eol_seen;
  /* SRC reaches END only if no byte with the high bit set was met.  */
  return src == end;
}
/* Detect how end-of-line of a text of length SRC_BYTES pointed by
SOURCE is encoded. If CATEGORY is one of
coding_category_utf_16_XXXX, assume that CR and LF are encoded by
@ -6215,7 +6302,6 @@ detect_coding (struct coding_system *coding)
coding_set_source (coding);
src_end = coding->source + coding->src_bytes;
coding->head_ascii = 0;
/* If we have not yet decided the text encoding type, detect it
now. */
@ -6225,6 +6311,8 @@ detect_coding (struct coding_system *coding)
struct coding_detection_info detect_info;
bool null_byte_found = 0, eight_bit_found = 0;
coding->head_ascii = 0;
coding->eol_seen = EOL_SEEN_NONE;
detect_info.checked = detect_info.found = detect_info.rejected = 0;
for (src = coding->source; src < src_end; src++)
{
@ -6263,6 +6351,26 @@ detect_coding (struct coding_system *coding)
if (eight_bit_found)
break;
}
else if (! disable_ascii_optimization
&& ! inhibit_eol_conversion)
{
if (c == '\r')
{
if (src < src_end && src[1] == '\n')
{
coding->eol_seen |= EOL_SEEN_CRLF;
src++;
coding->head_ascii++;
}
else
coding->eol_seen |= EOL_SEEN_CR;
}
else if (c == '\n')
{
coding->eol_seen |= EOL_SEEN_LF;
}
}
if (! eight_bit_found)
coding->head_ascii++;
}
@ -6353,19 +6461,20 @@ detect_coding (struct coding_system *coding)
coding_systems
= AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
detect_info.found = detect_info.rejected = 0;
for (src = coding->source; src < src_end; src++)
if (detect_ascii (coding))
{
if (*src & 0x80)
break;
setup_coding_system (XCDR (coding_systems), coding);
}
coding->head_ascii = src - coding->source;
if (CONSP (coding_systems)
&& detect_coding_utf_8 (coding, &detect_info))
else
{
if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
setup_coding_system (XCAR (coding_systems), coding);
else
setup_coding_system (XCDR (coding_systems), coding);
if (CONSP (coding_systems)
&& detect_coding_utf_8 (coding, &detect_info))
{
if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
setup_coding_system (XCAR (coding_systems), coding);
else
setup_coding_system (XCDR (coding_systems), coding);
}
}
}
else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
@ -6378,6 +6487,7 @@ detect_coding (struct coding_system *coding)
= AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
detect_info.found = detect_info.rejected = 0;
coding->head_ascii = 0;
coding->eol_seen = EOL_SEEN_NONE;
if (CONSP (coding_systems)
&& detect_coding_utf_16 (coding, &detect_info))
{
@ -6815,7 +6925,7 @@ produce_chars (struct coding_system *coding, Lisp_Object translation_table,
produced = dst - (coding->destination + coding->produced);
if (BUFFERP (coding->dst_object) && produced_chars > 0)
insert_from_gap (produced_chars, produced);
insert_from_gap (produced_chars, produced, 0);
coding->produced += produced;
coding->produced_char += produced_chars;
return carryover;
@ -7400,7 +7510,7 @@ encode_coding (struct coding_system *coding)
} while (coding->consumed_char < coding->src_chars);
if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
insert_from_gap (coding->produced_char, coding->produced);
insert_from_gap (coding->produced_char, coding->produced, 0);
SAFE_FREE ();
}
@ -7510,39 +7620,45 @@ decode_coding_gap (struct coding_system *coding,
if (CODING_REQUIRE_DETECTION (coding))
detect_coding (coding);
attrs = CODING_ID_ATTRS (coding->id);
#ifndef CODING_DISABLE_ASCII_OPTIMIZATION
if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
&& NILP (CODING_ATTR_POST_READ (attrs))
&& NILP (get_translation_table (attrs, 0, NULL))
&& (inhibit_eol_conversion
|| EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)))
if (! disable_ascii_optimization)
{
/* We can skip the conversion if all source bytes are ASCII. */
if (coding->head_ascii < 0)
if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
&& NILP (CODING_ATTR_POST_READ (attrs))
&& NILP (get_translation_table (attrs, 0, NULL))
&& (coding->head_ascii >= 0 /* We've already called detect_coding */
? coding->head_ascii == bytes
: detect_ascii (coding)))
{
/* We have not yet counted the number of ASCII bytes at the
head of the source. Do it now. */
const unsigned char *src, *src_end;
coding_set_source (coding);
src_end = coding->source + coding->src_bytes;
for (src = coding->source; src < src_end; src++)
if (coding->eol_seen == EOL_SEEN_CR)
{
if (*src & 0x80)
break;
unsigned char *src_end = GAP_END_ADDR;
unsigned char *src = src - coding->src_bytes;
while (src < src_end)
{
if (*src++ == '\r')
src[-1] = '\n';
}
}
coding->head_ascii = src - coding->source;
}
if (coding->src_bytes == coding->head_ascii)
{
/* No need of conversion. Use the data in the gap as is. */
coding->produced_char = chars;
coding->produced = bytes;
adjust_after_replace (PT, PT_BYTE, Qnil, chars, bytes, 1);
else if (coding->eol_seen == EOL_SEEN_CRLF)
{
unsigned char *src = GAP_END_ADDR;
unsigned char *src_beg = src - coding->src_bytes;
unsigned char *dst = src;
while (src_beg < src)
{
*--dst = *--src;
if (*src == '\n')
src--;
}
bytes -= dst - src;
}
coding->produced_char = coding->produced = bytes;
insert_from_gap (bytes, bytes, 1);
return;
}
}
#endif /* not CODING_DISABLE_ASCII_OPTIMIZATION */
code_conversion_save (0, 0);
coding->mode |= CODING_MODE_LAST_BLOCK;
@ -10758,6 +10874,11 @@ from GNU Find and GNU Grep. Emacs will then ignore the null bytes and
decode text as usual. */);
inhibit_null_byte_detection = 0;
DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
Internal use only. Removed after the experimental optimizer gets stable. */);
disable_ascii_optimization = 0;
DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
doc: /* Char table for translating self-inserting characters.
This is applied to the result of input methods, not their input.

View file

@ -440,9 +440,13 @@ struct coding_system
/* How many heading bytes we can skip for decoding. This is set to
-1 in setup_coding_system, and updated by detect_coding. So,
when this is equal to the byte length of the text being
converted, we can skip the actual conversion process. */
converted, we can skip the actual conversion process except for
the eol format. */
ptrdiff_t head_ascii;
/* Used internally in coding.c. See the comment of detect_ascii. */
int eol_seen;
/* The following members are set by encoding/decoding routine. */
ptrdiff_t produced, produced_char, consumed, consumed_char;

View file

@ -977,10 +977,11 @@ insert_from_string_1 (Lisp_Object string, ptrdiff_t pos, ptrdiff_t pos_byte,
}
/* Insert a sequence of NCHARS chars which occupy NBYTES bytes
starting at GPT_ADDR. */
starting at GAP_END_ADDR - NBYTES (if text_at_gap_tail) and at
GPT_ADDR (if not text_at_gap_tail). */
void
insert_from_gap (ptrdiff_t nchars, ptrdiff_t nbytes)
insert_from_gap (ptrdiff_t nchars, ptrdiff_t nbytes, bool text_at_gap_tail)
{
if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
nchars = nbytes;
@ -989,10 +990,13 @@ insert_from_gap (ptrdiff_t nchars, ptrdiff_t nbytes)
MODIFF++;
GAP_SIZE -= nbytes;
GPT += nchars;
if (! text_at_gap_tail)
{
GPT += nchars;
GPT_BYTE += nbytes;
}
ZV += nchars;
Z += nchars;
GPT_BYTE += nbytes;
ZV_BYTE += nbytes;
Z_BYTE += nbytes;
if (GAP_SIZE > 0) *(GPT_ADDR) = 0; /* Put an anchor. */
@ -1010,7 +1014,7 @@ insert_from_gap (ptrdiff_t nchars, ptrdiff_t nbytes)
current_buffer, 0);
}
if (GPT - nchars < PT)
if (! text_at_gap_tail && GPT - nchars < PT)
adjust_point (nchars, nbytes);
check_markers ();
@ -1162,16 +1166,14 @@ insert_from_buffer_1 (struct buffer *buf,
/* Record undo information and adjust markers and position keepers for
a replacement of a text PREV_TEXT at FROM to a new text of LEN
chars (LEN_BYTE bytes). If TEXT_AT_GAP_TAIL, the new text
resides at the gap tail; i.e. at (GAP_END_ADDR - LEN_BYTE)
Otherwise, the text resides in the gap just after GPT_BYTE.
chars (LEN_BYTE bytes) which resides in the gap just after
GPT_ADDR.
PREV_TEXT nil means the new text was just inserted. */
void
static void
adjust_after_replace (ptrdiff_t from, ptrdiff_t from_byte,
Lisp_Object prev_text, ptrdiff_t len, ptrdiff_t len_byte,
bool text_at_gap_tail)
Lisp_Object prev_text, ptrdiff_t len, ptrdiff_t len_byte)
{
ptrdiff_t nchars_del = 0, nbytes_del = 0;
@ -1191,11 +1193,8 @@ adjust_after_replace (ptrdiff_t from, ptrdiff_t from_byte,
GAP_SIZE -= len_byte;
ZV += len; Z+= len;
ZV_BYTE += len_byte; Z_BYTE += len_byte;
if (! text_at_gap_tail)
{
GPT += len; GPT_BYTE += len_byte;
if (GAP_SIZE > 0) *(GPT_ADDR) = 0; /* Put an anchor. */
}
GPT += len; GPT_BYTE += len_byte;
if (GAP_SIZE > 0) *(GPT_ADDR) = 0; /* Put an anchor. */
if (nchars_del > 0)
adjust_markers_for_replace (from, from_byte, nchars_del, nbytes_del,
@ -1250,7 +1249,7 @@ adjust_after_insert (ptrdiff_t from, ptrdiff_t from_byte,
GPT -= len; GPT_BYTE -= len_byte;
ZV -= len; ZV_BYTE -= len_byte;
Z -= len; Z_BYTE -= len_byte;
adjust_after_replace (from, from_byte, Qnil, newlen, len_byte, 0);
adjust_after_replace (from, from_byte, Qnil, newlen, len_byte);
}
/* Replace the text from character positions FROM to TO with NEW,

View file

@ -2880,7 +2880,7 @@ extern void insert (const char *, ptrdiff_t);
extern void insert_and_inherit (const char *, ptrdiff_t);
extern void insert_1_both (const char *, ptrdiff_t, ptrdiff_t,
bool, bool, bool);
extern void insert_from_gap (ptrdiff_t, ptrdiff_t);
extern void insert_from_gap (ptrdiff_t, ptrdiff_t, bool text_at_gap_tail);
extern void insert_from_string (Lisp_Object, ptrdiff_t, ptrdiff_t,
ptrdiff_t, ptrdiff_t, bool);
extern void insert_from_buffer (struct buffer *, ptrdiff_t, ptrdiff_t, bool);
@ -2900,8 +2900,6 @@ extern Lisp_Object del_range_2 (ptrdiff_t, ptrdiff_t,
extern void modify_region_1 (ptrdiff_t, ptrdiff_t, bool);
extern void prepare_to_modify_buffer (ptrdiff_t, ptrdiff_t, ptrdiff_t *);
extern void signal_after_change (ptrdiff_t, ptrdiff_t, ptrdiff_t);
extern void adjust_after_replace (ptrdiff_t, ptrdiff_t, Lisp_Object,
ptrdiff_t, ptrdiff_t, bool);
extern void adjust_after_insert (ptrdiff_t, ptrdiff_t, ptrdiff_t,
ptrdiff_t, ptrdiff_t);
extern void adjust_markers_for_delete (ptrdiff_t, ptrdiff_t,