Make regex matching reentrant; update syntax during match

* src/lisp.h (compile_pattern): Remove prototype of
now-internal function.

* src/regex.c (POS_AS_IN_BUFFER): Consult gl_state instead of
re_match_object: the latter can change in Lisp.
(re_match_2_internal): Switch back to UPDATE_SYNTAX_TABLE and
UPDATE_SYNTAX_TABLE_FORWARD from their _FAST variants, allowing
calls into Lisp.

* src/regex.h (re_match_object): Uncomment declaration.

* src/search.c (struct regexp_cache): Add `busy' field.
(thaw_buffer_relocation): Delete; rely on unbind.
(compile_pattern_1): Assert pattern isn't busy.
(shrink_regexp_cache): Don't shrink busy patterns.
(clear_regexp_cache): Don't nuke busy patterns.
(unfreeze_pattern, freeze_pattern): New functions.
(compile_pattern): Return a regexp_cache pointer instead of the
re_pattern_buffer, allowing callers to use `freeze_pattern' if
needed.  Do not consider busy patterns as cache hit candidates;
error if we run out of non-busy cache entries.
(looking_at_1, fast_looking_at): Snapshot
Vinhibit_changing_match_data; mark pattern busy while we're
matching it; unbind.
(string_match_1, fast_string_match_internal)
(fast_c_string_match_ignore_case): Adjust for compile_pattern
return type.
(search_buffer_re): Regex code from old search_buffer moved here;
snapshot Vinhibit_changing_match_data; mark pattern busy while
we're matching it; unbind.
(search_buffer_non_re): Non-regex code from old search_buffer
moved here.
(search_buffer): Split into search_buffer_re,
search_buffer_non_re.
(syms_of_search): Staticpro re_match_object, even though we really
shouldn't have to.

* src/syntax.h (UPDATE_SYNTAX_TABLE_FORWARD_FAST)
(UPDATE_SYNTAX_TABLE_FAST): Remove.

* src/thread.h (struct thread_state): Remove m_re_match_object,
which is global again.  (It never needs to be preserved across
thread switch.)
This commit is contained in:
Daniel Colascione 2018-06-16 13:46:10 -07:00
parent 1502b377d3
commit 938d252d1c
6 changed files with 453 additions and 424 deletions

View file

@ -4029,10 +4029,6 @@ extern void restore_search_regs (void);
extern void update_search_regs (ptrdiff_t oldstart,
ptrdiff_t oldend, ptrdiff_t newend);
extern void record_unwind_save_match_data (void);
struct re_registers;
extern struct re_pattern_buffer *compile_pattern (Lisp_Object,
struct re_registers *,
Lisp_Object, bool, bool);
extern ptrdiff_t fast_string_match_internal (Lisp_Object, Lisp_Object,
Lisp_Object);

View file

@ -155,7 +155,8 @@
# define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
/* Strings are 0-indexed, buffers are 1-indexed; we pun on the boolean
result to get the right base index. */
# define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
# define POS_AS_IN_BUFFER(p) \
((p) + (NILP (gl_state.object) || BUFFERP (gl_state.object)))
# define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
# define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
@ -1233,6 +1234,15 @@ static const char *re_error_msgid[] =
# undef MATCH_MAY_ALLOCATE
#endif
/* While regex matching of a single compiled pattern isn't reentrant
(because we compile regexes to bytecode programs, and the bytecode
programs are self-modifying), the regex machinery must nevertheless
be reentrant with respect to _different_ patterns, and we do that
by avoiding global variables and using MATCH_MAY_ALLOCATE. */
#if !defined MATCH_MAY_ALLOCATE && defined emacs
# error "Emacs requires MATCH_MAY_ALLOCATE"
#endif
/* Failure stack declarations and macros; both re_compile_fastmap and
re_match_2 use a failure stack. These have to be macros because of
@ -5895,12 +5905,12 @@ re_match_2_internal (struct re_pattern_buffer *bufp, re_char *string1,
#ifdef emacs
ssize_t offset = PTR_TO_OFFSET (d - 1);
ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
UPDATE_SYNTAX_TABLE_FAST (charpos);
UPDATE_SYNTAX_TABLE (charpos);
#endif
GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
s1 = SYNTAX (c1);
#ifdef emacs
UPDATE_SYNTAX_TABLE_FORWARD_FAST (charpos + 1);
UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
#endif
PREFETCH_NOLIMIT ();
GET_CHAR_AFTER (c2, d, dummy);
@ -5937,7 +5947,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, re_char *string1,
#ifdef emacs
ssize_t offset = PTR_TO_OFFSET (d);
ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
UPDATE_SYNTAX_TABLE_FAST (charpos);
UPDATE_SYNTAX_TABLE (charpos);
#endif
PREFETCH ();
GET_CHAR_AFTER (c2, d, dummy);
@ -5982,7 +5992,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, re_char *string1,
#ifdef emacs
ssize_t offset = PTR_TO_OFFSET (d) - 1;
ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
UPDATE_SYNTAX_TABLE_FAST (charpos);
UPDATE_SYNTAX_TABLE (charpos);
#endif
GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
s1 = SYNTAX (c1);
@ -5997,7 +6007,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, re_char *string1,
PREFETCH_NOLIMIT ();
GET_CHAR_AFTER (c2, d, dummy);
#ifdef emacs
UPDATE_SYNTAX_TABLE_FORWARD_FAST (charpos);
UPDATE_SYNTAX_TABLE_FORWARD (charpos);
#endif
s2 = SYNTAX (c2);
@ -6026,7 +6036,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, re_char *string1,
#ifdef emacs
ssize_t offset = PTR_TO_OFFSET (d);
ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
UPDATE_SYNTAX_TABLE_FAST (charpos);
UPDATE_SYNTAX_TABLE (charpos);
#endif
PREFETCH ();
c2 = RE_STRING_CHAR (d, target_multibyte);
@ -6069,7 +6079,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, re_char *string1,
#ifdef emacs
ssize_t offset = PTR_TO_OFFSET (d) - 1;
ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
UPDATE_SYNTAX_TABLE_FAST (charpos);
UPDATE_SYNTAX_TABLE (charpos);
#endif
GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
s1 = SYNTAX (c1);
@ -6084,7 +6094,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, re_char *string1,
PREFETCH_NOLIMIT ();
c2 = RE_STRING_CHAR (d, target_multibyte);
#ifdef emacs
UPDATE_SYNTAX_TABLE_FORWARD_FAST (charpos + 1);
UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
#endif
s2 = SYNTAX (c2);
@ -6107,7 +6117,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, re_char *string1,
{
ssize_t offset = PTR_TO_OFFSET (d);
ssize_t pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
UPDATE_SYNTAX_TABLE_FAST (pos1);
UPDATE_SYNTAX_TABLE (pos1);
}
#endif
{

View file

@ -181,8 +181,15 @@ typedef unsigned long reg_syntax_t;
string; if it's nil, we are matching text in the current buffer; if
it's t, we are matching text in a C string.
This is defined as a macro in thread.h, which see. */
/* extern Lisp_Object re_match_object; */
This value is effectively another parameter to re_search_2 and
re_match_2. No calls into Lisp or thread switches are allowed
before setting re_match_object and calling into the regex search
and match functions. These functions capture the current value of
re_match_object into gl_state on entry.
TODO: once we get rid of the !emacs case in this code, turn into an
actual function parameter. */
extern Lisp_Object re_match_object;
#endif
/* Roughly the maximum number of failure points on the stack. */

View file

@ -48,6 +48,8 @@ struct regexp_cache
char fastmap[0400];
/* True means regexp was compiled to do full POSIX backtracking. */
bool posix;
/* True means we're inside a buffer match. */
bool busy;
};
/* The instances of that struct. */
@ -93,6 +95,8 @@ static EMACS_INT search_buffer (Lisp_Object, ptrdiff_t, ptrdiff_t,
ptrdiff_t, ptrdiff_t, EMACS_INT, int,
Lisp_Object, Lisp_Object, bool);
Lisp_Object re_match_object;
static _Noreturn void
matcher_overflow (void)
{
@ -110,14 +114,6 @@ freeze_buffer_relocation (void)
#endif
}
/* Undo freeze_buffer_relocation.  When REL_ALLOC is defined, unwind
   the specpdl to one entry below its current index; otherwise do
   nothing.
   NOTE(review): this presumably relies on the entry recorded by
   freeze_buffer_relocation still being the topmost specpdl entry --
   confirm at each call site.  */
static void
thaw_buffer_relocation (void)
{
#ifdef REL_ALLOC
unbind_to (SPECPDL_INDEX () - 1, Qnil);
#endif
}
/* Compile a regexp and signal a Lisp error if anything goes wrong.
PATTERN is the pattern to compile.
CP is the place to put the result.
@ -134,6 +130,7 @@ compile_pattern_1 (struct regexp_cache *cp, Lisp_Object pattern,
const char *whitespace_regexp;
char *val;
eassert (!cp->busy);
cp->regexp = Qnil;
cp->buf.translate = (! NILP (translate) ? translate : make_number (0));
cp->posix = posix;
@ -170,10 +167,11 @@ shrink_regexp_cache (void)
struct regexp_cache *cp;
for (cp = searchbuf_head; cp != 0; cp = cp->next)
{
cp->buf.allocated = cp->buf.used;
cp->buf.buffer = xrealloc (cp->buf.buffer, cp->buf.used);
}
if (!cp->busy)
{
cp->buf.allocated = cp->buf.used;
cp->buf.buffer = xrealloc (cp->buf.buffer, cp->buf.used);
}
}
/* Clear the regexp cache w.r.t. a particular syntax table,
@ -190,10 +188,25 @@ clear_regexp_cache (void)
/* It's tempting to compare with the syntax-table we've actually changed,
but it's not sufficient because char-table inheritance means that
modifying one syntax-table can change others at the same time. */
if (!EQ (searchbufs[i].syntax_table, Qt))
if (!searchbufs[i].busy && !EQ (searchbufs[i].syntax_table, Qt))
searchbufs[i].regexp = Qnil;
}
/* Unwind handler registered by freeze_pattern.  ARG points to the
   struct regexp_cache entry whose `busy' flag is to be cleared.  */
static void
unfreeze_pattern (void *arg)
{
  ((struct regexp_cache *) arg)->busy = false;
}
/* Mark the cache entry SEARCHBUF as busy.  SEARCHBUF must not already
   be busy (checked with eassert).  An unwind handler, unfreeze_pattern,
   is recorded on the specpdl to clear the flag again, so the caller is
   expected to unbind_to a specpdl index taken before this call.  */
static void
freeze_pattern (struct regexp_cache *searchbuf)
{
eassert (!searchbuf->busy);
/* Record the unwind handler before setting the flag.  */
record_unwind_protect_ptr (unfreeze_pattern, searchbuf);
searchbuf->busy = true;
}
/* Compile a regexp if necessary, but first check to see if there's one in
the cache.
PATTERN is the pattern to compile.
@ -205,7 +218,7 @@ clear_regexp_cache (void)
POSIX is true if we want full backtracking (POSIX style) for this pattern.
False means backtrack only enough to get a valid match. */
struct re_pattern_buffer *
static struct regexp_cache *
compile_pattern (Lisp_Object pattern, struct re_registers *regp,
Lisp_Object translate, bool posix, bool multibyte)
{
@ -222,6 +235,7 @@ compile_pattern (Lisp_Object pattern, struct re_registers *regp,
if (NILP (cp->regexp))
goto compile_it;
if (SCHARS (cp->regexp) == SCHARS (pattern)
&& !cp->busy
&& STRING_MULTIBYTE (cp->regexp) == STRING_MULTIBYTE (pattern)
&& !NILP (Fstring_equal (cp->regexp, pattern))
&& EQ (cp->buf.translate, (! NILP (translate) ? translate : make_number (0)))
@ -237,7 +251,10 @@ compile_pattern (Lisp_Object pattern, struct re_registers *regp,
string value. */
if (cp->next == 0)
{
if (cp->busy)
error ("Too much matching reentrancy");
compile_it:
eassert (!cp->busy);
compile_pattern_1 (cp, pattern, translate, posix);
break;
}
@ -258,8 +275,7 @@ compile_pattern (Lisp_Object pattern, struct re_registers *regp,
/* The compiled pattern can be used both for multibyte and unibyte
target. But, we have to tell which the pattern is used for. */
cp->buf.target_multibyte = multibyte;
return &cp->buf;
return cp;
}
@ -270,7 +286,6 @@ looking_at_1 (Lisp_Object string, bool posix)
unsigned char *p1, *p2;
ptrdiff_t s1, s2;
register ptrdiff_t i;
struct re_pattern_buffer *bufp;
if (running_asynch_code)
save_search_regs ();
@ -280,13 +295,17 @@ looking_at_1 (Lisp_Object string, bool posix)
BVAR (current_buffer, case_eqv_table));
CHECK_STRING (string);
bufp = compile_pattern (string,
(NILP (Vinhibit_changing_match_data)
? &search_regs : NULL),
(!NILP (BVAR (current_buffer, case_fold_search))
? BVAR (current_buffer, case_canon_table) : Qnil),
posix,
!NILP (BVAR (current_buffer, enable_multibyte_characters)));
/* Snapshot in case Lisp changes the value. */
bool preserve_match_data = NILP (Vinhibit_changing_match_data);
struct regexp_cache *cache_entry = compile_pattern (
string,
preserve_match_data ? &search_regs : NULL,
(!NILP (BVAR (current_buffer, case_fold_search))
? BVAR (current_buffer, case_canon_table) : Qnil),
posix,
!NILP (BVAR (current_buffer, enable_multibyte_characters)));
/* Do a pending quit right away, to avoid paradoxical behavior */
maybe_quit ();
@ -310,21 +329,20 @@ looking_at_1 (Lisp_Object string, bool posix)
s2 = 0;
}
re_match_object = Qnil;
ptrdiff_t count = SPECPDL_INDEX ();
freeze_buffer_relocation ();
i = re_match_2 (bufp, (char *) p1, s1, (char *) p2, s2,
freeze_pattern (cache_entry);
re_match_object = Qnil;
i = re_match_2 (&cache_entry->buf, (char *) p1, s1, (char *) p2, s2,
PT_BYTE - BEGV_BYTE,
(NILP (Vinhibit_changing_match_data)
? &search_regs : NULL),
preserve_match_data ? &search_regs : NULL,
ZV_BYTE - BEGV_BYTE);
thaw_buffer_relocation ();
if (i == -2)
matcher_overflow ();
val = (i >= 0 ? Qt : Qnil);
if (NILP (Vinhibit_changing_match_data) && i >= 0)
if (preserve_match_data && i >= 0)
{
for (i = 0; i < search_regs.num_regs; i++)
if (search_regs.start[i] >= 0)
@ -338,7 +356,7 @@ looking_at_1 (Lisp_Object string, bool posix)
XSETBUFFER (last_thing_searched, current_buffer);
}
return val;
return unbind_to (count, val);
}
DEFUN ("looking-at", Flooking_at, Slooking_at, 1, 1, 0,
@ -396,15 +414,14 @@ string_match_1 (Lisp_Object regexp, Lisp_Object string, Lisp_Object start,
set_char_table_extras (BVAR (current_buffer, case_canon_table), 2,
BVAR (current_buffer, case_eqv_table));
bufp = compile_pattern (regexp,
(NILP (Vinhibit_changing_match_data)
? &search_regs : NULL),
(!NILP (BVAR (current_buffer, case_fold_search))
? BVAR (current_buffer, case_canon_table) : Qnil),
posix,
STRING_MULTIBYTE (string));
bufp = &compile_pattern (regexp,
(NILP (Vinhibit_changing_match_data)
? &search_regs : NULL),
(!NILP (BVAR (current_buffer, case_fold_search))
? BVAR (current_buffer, case_canon_table) : Qnil),
posix,
STRING_MULTIBYTE (string))->buf;
re_match_object = string;
val = re_search (bufp, SSDATA (string),
SBYTES (string), pos_byte,
SBYTES (string) - pos_byte,
@ -471,10 +488,9 @@ fast_string_match_internal (Lisp_Object regexp, Lisp_Object string,
ptrdiff_t val;
struct re_pattern_buffer *bufp;
bufp = compile_pattern (regexp, 0, table,
0, STRING_MULTIBYTE (string));
bufp = &compile_pattern (regexp, 0, table,
0, STRING_MULTIBYTE (string))->buf;
re_match_object = string;
val = re_search (bufp, SSDATA (string),
SBYTES (string), 0,
SBYTES (string), 0);
@ -494,10 +510,10 @@ fast_c_string_match_ignore_case (Lisp_Object regexp,
struct re_pattern_buffer *bufp;
regexp = string_make_unibyte (regexp);
bufp = &compile_pattern (regexp, 0,
Vascii_canon_table, 0,
0)->buf;
re_match_object = Qt;
bufp = compile_pattern (regexp, 0,
Vascii_canon_table, 0,
0);
val = re_search (bufp, string, len, 0, len, 0);
return val;
}
@ -513,7 +529,6 @@ fast_looking_at (Lisp_Object regexp, ptrdiff_t pos, ptrdiff_t pos_byte,
ptrdiff_t limit, ptrdiff_t limit_byte, Lisp_Object string)
{
bool multibyte;
struct re_pattern_buffer *buf;
unsigned char *p1, *p2;
ptrdiff_t s1, s2;
ptrdiff_t len;
@ -528,7 +543,6 @@ fast_looking_at (Lisp_Object regexp, ptrdiff_t pos, ptrdiff_t pos_byte,
s1 = 0;
p2 = SDATA (string);
s2 = SBYTES (string);
re_match_object = string;
multibyte = STRING_MULTIBYTE (string);
}
else
@ -554,16 +568,19 @@ fast_looking_at (Lisp_Object regexp, ptrdiff_t pos, ptrdiff_t pos_byte,
s1 = ZV_BYTE - BEGV_BYTE;
s2 = 0;
}
re_match_object = Qnil;
multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
}
buf = compile_pattern (regexp, 0, Qnil, 0, multibyte);
struct regexp_cache *cache_entry =
compile_pattern (regexp, 0, Qnil, 0, multibyte);
ptrdiff_t count = SPECPDL_INDEX ();
freeze_buffer_relocation ();
len = re_match_2 (buf, (char *) p1, s1, (char *) p2, s2,
freeze_pattern (cache_entry);
re_match_object = STRINGP (string) ? string : Qnil;
len = re_match_2 (&cache_entry->buf, (char *) p1, s1, (char *) p2, s2,
pos_byte, NULL, limit_byte);
thaw_buffer_relocation ();
unbind_to (count, Qnil);
return len;
}
@ -1150,356 +1167,373 @@ while (0)
(i.e. Vinhibit_changing_match_data is non-nil). */
static struct re_registers search_regs_1;
/* Regexp half of search_buffer.

   Compile STRING with compile_pattern (using translation table TRT
   and the POSIX flag) and run re_search_2 over the visible portion of
   the current buffer, starting at byte position POS_BYTE with a
   search range of LIM_BYTE - POS_BYTE, repeating |N| times.

   While N is negative, each successful match moves POS/POS_BYTE to the
   start of the match; while N is positive, to the end of the match.

   Match registers go to search_regs when Vinhibit_changing_match_data
   is nil at entry (the value is snapshotted once), otherwise to
   search_regs_1.  In the former case the registers are converted from
   byte offsets relative to BEGV_BYTE to character positions and
   last_thing_searched is set to the current buffer.

   Returns POS after the last match when all searches succeed.  If a
   search fails, returns the remaining count as a non-positive number:
   N itself in the N < 0 loop, 0 - N in the N > 0 loop.

   LIM and INVERSE_TRT are accepted but not used in this function.  */
static EMACS_INT
search_buffer_re (Lisp_Object string, ptrdiff_t pos, ptrdiff_t pos_byte,
ptrdiff_t lim, ptrdiff_t lim_byte, EMACS_INT n,
Lisp_Object trt, Lisp_Object inverse_trt, bool posix)
{
unsigned char *p1, *p2;
ptrdiff_t s1, s2;
/* Snapshot in case Lisp changes the value. */
bool preserve_match_data = NILP (Vinhibit_changing_match_data);
struct regexp_cache *cache_entry =
compile_pattern (string,
preserve_match_data ? &search_regs : &search_regs_1,
trt, posix,
!NILP (BVAR (current_buffer, enable_multibyte_characters)));
struct re_pattern_buffer *bufp = &cache_entry->buf;
maybe_quit (); /* Do a pending quit right away,
to avoid paradoxical behavior */
/* Get pointers and sizes of the two strings
that make up the visible portion of the buffer. */
p1 = BEGV_ADDR;
s1 = GPT_BYTE - BEGV_BYTE;
p2 = GAP_END_ADDR;
s2 = ZV_BYTE - GPT_BYTE;
if (s1 < 0)
{
p2 = p1;
s2 = ZV_BYTE - BEGV_BYTE;
s1 = 0;
}
if (s2 < 0)
{
s1 = ZV_BYTE - BEGV_BYTE;
s2 = 0;
}
/* Everything recorded on the specpdl from here on (buffer relocation
freeze, pattern busy flag) is undone by unbind_to (count, ...) on
each return path below. */
ptrdiff_t count = SPECPDL_INDEX ();
freeze_buffer_relocation ();
freeze_pattern (cache_entry);
/* N < 0: each match moves the position to the match start. */
while (n < 0)
{
ptrdiff_t val;
/* Set immediately before every call into the regex engine. */
re_match_object = Qnil;
val = re_search_2 (bufp, (char *) p1, s1, (char *) p2, s2,
pos_byte - BEGV_BYTE, lim_byte - pos_byte,
preserve_match_data ? &search_regs : &search_regs_1,
/* Don't allow match past current point */
pos_byte - BEGV_BYTE);
if (val == -2)
{
matcher_overflow ();
}
if (val >= 0)
{
if (preserve_match_data)
{
pos_byte = search_regs.start[0] + BEGV_BYTE;
/* Convert registers from byte offsets to char positions. */
for (ptrdiff_t i = 0; i < search_regs.num_regs; i++)
if (search_regs.start[i] >= 0)
{
search_regs.start[i]
= BYTE_TO_CHAR (search_regs.start[i] + BEGV_BYTE);
search_regs.end[i]
= BYTE_TO_CHAR (search_regs.end[i] + BEGV_BYTE);
}
XSETBUFFER (last_thing_searched, current_buffer);
/* Set pos to the new position. */
pos = search_regs.start[0];
}
else
{
pos_byte = search_regs_1.start[0] + BEGV_BYTE;
/* Set pos to the new position. */
pos = BYTE_TO_CHAR (search_regs_1.start[0] + BEGV_BYTE);
}
}
else
{
/* Search failed: report the (negative) remaining count. */
unbind_to (count, Qnil);
return (n);
}
n++;
maybe_quit ();
}
/* N > 0: each match moves the position to the match end. */
while (n > 0)
{
ptrdiff_t val;
/* Set immediately before every call into the regex engine. */
re_match_object = Qnil;
val = re_search_2 (bufp, (char *) p1, s1, (char *) p2, s2,
pos_byte - BEGV_BYTE, lim_byte - pos_byte,
preserve_match_data ? &search_regs : &search_regs_1,
lim_byte - BEGV_BYTE);
if (val == -2)
{
matcher_overflow ();
}
if (val >= 0)
{
if (preserve_match_data)
{
pos_byte = search_regs.end[0] + BEGV_BYTE;
/* Convert registers from byte offsets to char positions. */
for (ptrdiff_t i = 0; i < search_regs.num_regs; i++)
if (search_regs.start[i] >= 0)
{
search_regs.start[i]
= BYTE_TO_CHAR (search_regs.start[i] + BEGV_BYTE);
search_regs.end[i]
= BYTE_TO_CHAR (search_regs.end[i] + BEGV_BYTE);
}
XSETBUFFER (last_thing_searched, current_buffer);
pos = search_regs.end[0];
}
else
{
pos_byte = search_regs_1.end[0] + BEGV_BYTE;
pos = BYTE_TO_CHAR (search_regs_1.end[0] + BEGV_BYTE);
}
}
else
{
/* Search failed: report the remaining count, negated. */
unbind_to (count, Qnil);
return (0 - n);
}
n--;
maybe_quit ();
}
unbind_to (count, Qnil);
return (pos);
}
/* Non-regexp half of search_buffer.

   Convert STRING to the multibyteness of the current buffer, copy it
   into a scratch buffer while translating each character through TRT,
   and decide along the way (using INVERSE_TRT) whether boyer_moore may
   be used.  Then run boyer_moore or simple_search with N, the
   positions POS/POS_BYTE and the limits LIM/LIM_BYTE, and return its
   result.

   RE nonzero means STRING is a regexp known to be trivial, so a
   backslash just quotes the following character and is dropped from
   the pattern.

   POSIX is accepted but not used in this function.  */
static EMACS_INT
search_buffer_non_re (Lisp_Object string, ptrdiff_t pos,
ptrdiff_t pos_byte, ptrdiff_t lim, ptrdiff_t lim_byte,
EMACS_INT n, int RE, Lisp_Object trt, Lisp_Object inverse_trt,
bool posix)
{
unsigned char *raw_pattern, *pat;
ptrdiff_t raw_pattern_size;
ptrdiff_t raw_pattern_size_byte;
unsigned char *patbuf;
bool multibyte = !NILP (BVAR (current_buffer, enable_multibyte_characters));
unsigned char *base_pat;
/* Set to positive if we find a non-ASCII char that need
translation. Otherwise set to zero later. */
int char_base = -1;
bool boyer_moore_ok = 1;
USE_SAFE_ALLOCA;
/* MULTIBYTE says whether the text to be searched is multibyte.
We must convert PATTERN to match that, or we will not really
find things right. */
if (multibyte == STRING_MULTIBYTE (string))
{
/* No conversion needed: use the string data in place. */
raw_pattern = SDATA (string);
raw_pattern_size = SCHARS (string);
raw_pattern_size_byte = SBYTES (string);
}
else if (multibyte)
{
/* Unibyte pattern, multibyte buffer. */
raw_pattern_size = SCHARS (string);
raw_pattern_size_byte
= count_size_as_multibyte (SDATA (string),
raw_pattern_size);
raw_pattern = SAFE_ALLOCA (raw_pattern_size_byte + 1);
copy_text (SDATA (string), raw_pattern,
SCHARS (string), 0, 1);
}
else
{
/* Converting multibyte to single-byte.
??? Perhaps this conversion should be done in a special way
by subtracting nonascii-insert-offset from each non-ASCII char,
so that only the multibyte chars which really correspond to
the chosen single-byte character set can possibly match. */
raw_pattern_size = SCHARS (string);
raw_pattern_size_byte = SCHARS (string);
raw_pattern = SAFE_ALLOCA (raw_pattern_size + 1);
copy_text (SDATA (string), raw_pattern,
SBYTES (string), 1, 0);
}
/* Copy and optionally translate the pattern. */
ptrdiff_t len = raw_pattern_size;
ptrdiff_t len_byte = raw_pattern_size_byte;
SAFE_NALLOCA (patbuf, MAX_MULTIBYTE_LENGTH, len);
pat = patbuf;
base_pat = raw_pattern;
if (multibyte)
{
/* Fill patbuf by translated characters in STRING while
checking if we can use boyer-moore search. If TRT is
non-nil, we can use boyer-moore search only if TRT can be
represented by the byte array of 256 elements. For that,
all non-ASCII case-equivalents of all case-sensitive
characters in STRING must belong to the same character
group (two characters belong to the same group iff their
multibyte forms are the same except for the last byte;
i.e. every 64 characters form a group; U+0000..U+003F,
U+0040..U+007F, U+0080..U+00BF, ...). */
while (--len >= 0)
{
unsigned char str_base[MAX_MULTIBYTE_LENGTH], *str;
int c, translated, inverse;
int in_charlen, charlen;
/* If we got here and the RE flag is set, it's because we're
dealing with a regexp known to be trivial, so the backslash
just quotes the next character. */
if (RE && *base_pat == '\\')
{
len--;
raw_pattern_size--;
len_byte--;
base_pat++;
}
c = STRING_CHAR_AND_LENGTH (base_pat, in_charlen);
if (NILP (trt))
{
str = base_pat;
charlen = in_charlen;
}
else
{
/* Translate the character. */
TRANSLATE (translated, trt, c);
charlen = CHAR_STRING (translated, str_base);
str = str_base;
/* Check if C has any other case-equivalents. */
TRANSLATE (inverse, inverse_trt, c);
/* If so, check if we can use boyer-moore. */
if (c != inverse && boyer_moore_ok)
{
/* Check if all equivalents belong to the same
group of characters. Note that the check of C
itself is done by the last iteration. */
int this_char_base = -1;
while (boyer_moore_ok)
{
if (ASCII_CHAR_P (inverse))
{
if (this_char_base > 0)
boyer_moore_ok = 0;
else
this_char_base = 0;
}
else if (CHAR_BYTE8_P (inverse))
/* Boyer-moore search can't handle a
translation of an eight-bit
character. */
boyer_moore_ok = 0;
else if (this_char_base < 0)
{
this_char_base = inverse & ~0x3F;
if (char_base < 0)
char_base = this_char_base;
else if (this_char_base != char_base)
boyer_moore_ok = 0;
}
else if ((inverse & ~0x3F) != this_char_base)
boyer_moore_ok = 0;
/* Stop once the chain of equivalents is back at C. */
if (c == inverse)
break;
TRANSLATE (inverse, inverse_trt, inverse);
}
}
}
/* Store this character into the translated pattern. */
memcpy (pat, str, charlen);
pat += charlen;
base_pat += in_charlen;
len_byte -= in_charlen;
}
/* If char_base is still negative we didn't find any translated
non-ASCII characters. */
if (char_base < 0)
char_base = 0;
}
else
{
/* Unibyte buffer. */
char_base = 0;
while (--len >= 0)
{
int c, translated, inverse;
/* If we got here and the RE flag is set, it's because we're
dealing with a regexp known to be trivial, so the backslash
just quotes the next character. */
if (RE && *base_pat == '\\')
{
len--;
raw_pattern_size--;
base_pat++;
}
c = *base_pat++;
TRANSLATE (translated, trt, c);
*pat++ = translated;
/* Check that none of C's equivalents violates the
assumptions of boyer_moore. */
TRANSLATE (inverse, inverse_trt, c);
while (1)
{
if (inverse >= 0200)
{
boyer_moore_ok = 0;
break;
}
if (c == inverse)
break;
TRANSLATE (inverse, inverse_trt, inverse);
}
}
}
/* The translated pattern now occupies patbuf[0 .. pat - patbuf). */
len_byte = pat - patbuf;
pat = base_pat = patbuf;
EMACS_INT result
= (boyer_moore_ok
? boyer_moore (n, pat, len_byte, trt, inverse_trt,
pos_byte, lim_byte,
char_base)
: simple_search (n, pat, raw_pattern_size, len_byte, trt,
pos, pos_byte, lim, lim_byte));
SAFE_FREE ();
return result;
}
static EMACS_INT
search_buffer (Lisp_Object string, ptrdiff_t pos, ptrdiff_t pos_byte,
ptrdiff_t lim, ptrdiff_t lim_byte, EMACS_INT n,
int RE, Lisp_Object trt, Lisp_Object inverse_trt, bool posix)
{
ptrdiff_t len = SCHARS (string);
ptrdiff_t len_byte = SBYTES (string);
register ptrdiff_t i;
if (running_asynch_code)
save_search_regs ();
/* Searching 0 times means don't move. */
/* Null string is found at starting position. */
if (len == 0 || n == 0)
if (n == 0 || SCHARS (string) == 0)
{
set_search_regs (pos_byte, 0);
return pos;
}
if (RE && !(trivial_regexp_p (string) && NILP (Vsearch_spaces_regexp)))
{
unsigned char *p1, *p2;
ptrdiff_t s1, s2;
struct re_pattern_buffer *bufp;
pos = search_buffer_re (string, pos, pos_byte, lim, lim_byte,
n, trt, inverse_trt, posix);
else
pos = search_buffer_non_re (string, pos, pos_byte, lim, lim_byte,
n, RE, trt, inverse_trt, posix);
bufp = compile_pattern (string,
(NILP (Vinhibit_changing_match_data)
? &search_regs : &search_regs_1),
trt, posix,
!NILP (BVAR (current_buffer, enable_multibyte_characters)));
maybe_quit (); /* Do a pending quit right away,
to avoid paradoxical behavior */
/* Get pointers and sizes of the two strings
that make up the visible portion of the buffer. */
p1 = BEGV_ADDR;
s1 = GPT_BYTE - BEGV_BYTE;
p2 = GAP_END_ADDR;
s2 = ZV_BYTE - GPT_BYTE;
if (s1 < 0)
{
p2 = p1;
s2 = ZV_BYTE - BEGV_BYTE;
s1 = 0;
}
if (s2 < 0)
{
s1 = ZV_BYTE - BEGV_BYTE;
s2 = 0;
}
re_match_object = Qnil;
freeze_buffer_relocation ();
while (n < 0)
{
ptrdiff_t val;
val = re_search_2 (bufp, (char *) p1, s1, (char *) p2, s2,
pos_byte - BEGV_BYTE, lim_byte - pos_byte,
(NILP (Vinhibit_changing_match_data)
? &search_regs : &search_regs_1),
/* Don't allow match past current point */
pos_byte - BEGV_BYTE);
if (val == -2)
{
matcher_overflow ();
}
if (val >= 0)
{
if (NILP (Vinhibit_changing_match_data))
{
pos_byte = search_regs.start[0] + BEGV_BYTE;
for (i = 0; i < search_regs.num_regs; i++)
if (search_regs.start[i] >= 0)
{
search_regs.start[i]
= BYTE_TO_CHAR (search_regs.start[i] + BEGV_BYTE);
search_regs.end[i]
= BYTE_TO_CHAR (search_regs.end[i] + BEGV_BYTE);
}
XSETBUFFER (last_thing_searched, current_buffer);
/* Set pos to the new position. */
pos = search_regs.start[0];
}
else
{
pos_byte = search_regs_1.start[0] + BEGV_BYTE;
/* Set pos to the new position. */
pos = BYTE_TO_CHAR (search_regs_1.start[0] + BEGV_BYTE);
}
}
else
{
thaw_buffer_relocation ();
return (n);
}
n++;
maybe_quit ();
}
while (n > 0)
{
ptrdiff_t val;
val = re_search_2 (bufp, (char *) p1, s1, (char *) p2, s2,
pos_byte - BEGV_BYTE, lim_byte - pos_byte,
(NILP (Vinhibit_changing_match_data)
? &search_regs : &search_regs_1),
lim_byte - BEGV_BYTE);
if (val == -2)
{
matcher_overflow ();
}
if (val >= 0)
{
if (NILP (Vinhibit_changing_match_data))
{
pos_byte = search_regs.end[0] + BEGV_BYTE;
for (i = 0; i < search_regs.num_regs; i++)
if (search_regs.start[i] >= 0)
{
search_regs.start[i]
= BYTE_TO_CHAR (search_regs.start[i] + BEGV_BYTE);
search_regs.end[i]
= BYTE_TO_CHAR (search_regs.end[i] + BEGV_BYTE);
}
XSETBUFFER (last_thing_searched, current_buffer);
pos = search_regs.end[0];
}
else
{
pos_byte = search_regs_1.end[0] + BEGV_BYTE;
pos = BYTE_TO_CHAR (search_regs_1.end[0] + BEGV_BYTE);
}
}
else
{
thaw_buffer_relocation ();
return (0 - n);
}
n--;
maybe_quit ();
}
thaw_buffer_relocation ();
return (pos);
}
else /* non-RE case */
{
unsigned char *raw_pattern, *pat;
ptrdiff_t raw_pattern_size;
ptrdiff_t raw_pattern_size_byte;
unsigned char *patbuf;
bool multibyte = !NILP (BVAR (current_buffer, enable_multibyte_characters));
unsigned char *base_pat;
/* Set to positive if we find a non-ASCII char that need
translation. Otherwise set to zero later. */
int char_base = -1;
bool boyer_moore_ok = 1;
USE_SAFE_ALLOCA;
/* MULTIBYTE says whether the text to be searched is multibyte.
We must convert PATTERN to match that, or we will not really
find things right. */
if (multibyte == STRING_MULTIBYTE (string))
{
raw_pattern = SDATA (string);
raw_pattern_size = SCHARS (string);
raw_pattern_size_byte = SBYTES (string);
}
else if (multibyte)
{
raw_pattern_size = SCHARS (string);
raw_pattern_size_byte
= count_size_as_multibyte (SDATA (string),
raw_pattern_size);
raw_pattern = SAFE_ALLOCA (raw_pattern_size_byte + 1);
copy_text (SDATA (string), raw_pattern,
SCHARS (string), 0, 1);
}
else
{
/* Converting multibyte to single-byte.
??? Perhaps this conversion should be done in a special way
by subtracting nonascii-insert-offset from each non-ASCII char,
so that only the multibyte chars which really correspond to
the chosen single-byte character set can possibly match. */
raw_pattern_size = SCHARS (string);
raw_pattern_size_byte = SCHARS (string);
raw_pattern = SAFE_ALLOCA (raw_pattern_size + 1);
copy_text (SDATA (string), raw_pattern,
SBYTES (string), 1, 0);
}
/* Copy and optionally translate the pattern. */
len = raw_pattern_size;
len_byte = raw_pattern_size_byte;
SAFE_NALLOCA (patbuf, MAX_MULTIBYTE_LENGTH, len);
pat = patbuf;
base_pat = raw_pattern;
if (multibyte)
{
/* Fill patbuf by translated characters in STRING while
checking if we can use boyer-moore search. If TRT is
non-nil, we can use boyer-moore search only if TRT can be
represented by the byte array of 256 elements. For that,
all non-ASCII case-equivalents of all case-sensitive
characters in STRING must belong to the same character
group (two characters belong to the same group iff their
multibyte forms are the same except for the last byte;
i.e. every 64 characters form a group; U+0000..U+003F,
U+0040..U+007F, U+0080..U+00BF, ...). */
while (--len >= 0)
{
unsigned char str_base[MAX_MULTIBYTE_LENGTH], *str;
int c, translated, inverse;
int in_charlen, charlen;
/* If we got here and the RE flag is set, it's because we're
dealing with a regexp known to be trivial, so the backslash
just quotes the next character. */
if (RE && *base_pat == '\\')
{
len--;
raw_pattern_size--;
len_byte--;
base_pat++;
}
c = STRING_CHAR_AND_LENGTH (base_pat, in_charlen);
if (NILP (trt))
{
str = base_pat;
charlen = in_charlen;
}
else
{
/* Translate the character. */
TRANSLATE (translated, trt, c);
charlen = CHAR_STRING (translated, str_base);
str = str_base;
/* Check if C has any other case-equivalents. */
TRANSLATE (inverse, inverse_trt, c);
/* If so, check if we can use boyer-moore. */
if (c != inverse && boyer_moore_ok)
{
/* Check if all equivalents belong to the same
group of characters. Note that the check of C
itself is done by the last iteration. */
int this_char_base = -1;
while (boyer_moore_ok)
{
if (ASCII_CHAR_P (inverse))
{
if (this_char_base > 0)
boyer_moore_ok = 0;
else
this_char_base = 0;
}
else if (CHAR_BYTE8_P (inverse))
/* Boyer-moore search can't handle a
translation of an eight-bit
character. */
boyer_moore_ok = 0;
else if (this_char_base < 0)
{
this_char_base = inverse & ~0x3F;
if (char_base < 0)
char_base = this_char_base;
else if (this_char_base != char_base)
boyer_moore_ok = 0;
}
else if ((inverse & ~0x3F) != this_char_base)
boyer_moore_ok = 0;
if (c == inverse)
break;
TRANSLATE (inverse, inverse_trt, inverse);
}
}
}
/* Store this character into the translated pattern. */
memcpy (pat, str, charlen);
pat += charlen;
base_pat += in_charlen;
len_byte -= in_charlen;
}
/* If char_base is still negative we didn't find any translated
non-ASCII characters. */
if (char_base < 0)
char_base = 0;
}
else
{
/* Unibyte buffer. */
char_base = 0;
while (--len >= 0)
{
int c, translated, inverse;
/* If we got here and the RE flag is set, it's because we're
dealing with a regexp known to be trivial, so the backslash
just quotes the next character. */
if (RE && *base_pat == '\\')
{
len--;
raw_pattern_size--;
base_pat++;
}
c = *base_pat++;
TRANSLATE (translated, trt, c);
*pat++ = translated;
/* Check that none of C's equivalents violates the
assumptions of boyer_moore. */
TRANSLATE (inverse, inverse_trt, c);
while (1)
{
if (inverse >= 0200)
{
boyer_moore_ok = 0;
break;
}
if (c == inverse)
break;
TRANSLATE (inverse, inverse_trt, inverse);
}
}
}
len_byte = pat - patbuf;
pat = base_pat = patbuf;
EMACS_INT result
= (boyer_moore_ok
? boyer_moore (n, pat, len_byte, trt, inverse_trt,
pos_byte, lim_byte,
char_base)
: simple_search (n, pat, raw_pattern_size, len_byte, trt,
pos, pos_byte, lim, lim_byte));
SAFE_FREE ();
return result;
}
return pos;
}
/* Do a simple string search N times for the string PAT,
@ -3353,6 +3387,7 @@ the buffer. If the buffer doesn't have a cache, the value is nil. */)
return val;
}
void
syms_of_search (void)
{
@ -3365,6 +3400,7 @@ syms_of_search (void)
searchbufs[i].buf.fastmap = searchbufs[i].fastmap;
searchbufs[i].regexp = Qnil;
searchbufs[i].f_whitespace_regexp = Qnil;
searchbufs[i].busy = false;
searchbufs[i].syntax_table = Qnil;
staticpro (&searchbufs[i].regexp);
staticpro (&searchbufs[i].f_whitespace_regexp);
@ -3405,6 +3441,9 @@ syms_of_search (void)
saved_last_thing_searched = Qnil;
staticpro (&saved_last_thing_searched);
re_match_object = Qnil;
staticpro (&re_match_object);
DEFVAR_LISP ("search-spaces-regexp", Vsearch_spaces_regexp,
doc: /* Regexp to substitute for bunches of spaces in regexp search.
Some commands use this for user-specified regexps.

View file

@ -186,13 +186,6 @@ UPDATE_SYNTAX_TABLE_FORWARD (ptrdiff_t charpos)
false, gl_state.object);
}
/* Call update_syntax_table for CHARPOS (adjusted by gl_state.offset)
   when parse_sexp_lookup_properties is set and CHARPOS is at or past
   gl_state.e_property; otherwise do nothing.  */
INLINE void
UPDATE_SYNTAX_TABLE_FORWARD_FAST (ptrdiff_t charpos)
{
  /* Nothing to refresh: properties are not consulted, or CHARPOS is
     still below gl_state.e_property.  */
  if (!parse_sexp_lookup_properties || charpos < gl_state.e_property)
    return;

  update_syntax_table (charpos + gl_state.offset, 1, false, gl_state.object);
}
/* Make syntax table state (gl_state) good for CHARPOS, assuming it is
currently good for a position after CHARPOS. */
@ -212,13 +205,6 @@ UPDATE_SYNTAX_TABLE (ptrdiff_t charpos)
UPDATE_SYNTAX_TABLE_FORWARD (charpos);
}
/* Update the syntax table state for CHARPOS by running the backward
   update followed by the _FAST variant of the forward update.  */
INLINE void
UPDATE_SYNTAX_TABLE_FAST (ptrdiff_t charpos)
{
UPDATE_SYNTAX_TABLE_BACKWARD (charpos);
UPDATE_SYNTAX_TABLE_FORWARD_FAST (charpos);
}
/* Set up the buffer-global syntax table. */
INLINE void

View file

@ -137,15 +137,6 @@ struct thread_state
struct re_registers m_saved_search_regs;
#define saved_search_regs (current_thread->m_saved_search_regs)
/* This is the string or buffer in which we
are matching. It is used for looking up syntax properties.
If the value is a Lisp string object, we are matching text in that
string; if it's nil, we are matching text in the current buffer; if
it's t, we are matching text in a C string. */
Lisp_Object m_re_match_object;
#define re_match_object (current_thread->m_re_match_object)
/* This member is different from waiting_for_input.
It is used to communicate to a lisp process-filter/sentinel (via the
function Fwaiting_for_user_input_p) whether Emacs was waiting