regex.c: Consolidate the two analysis functions

We currently have two functions that analyze the bytecode
to try and apply optimizations: `analyze_first` and `mutually_exclusive_p`.
Extract the common code between them into a new function `forall_firstchar`,
and then rewrite the old ones on top of that one.

Along the way, we get slightly better analyses that revert
the recent de-optimizations without re-introducing the
corresponding bugs.

* src/regex-emacs.c (forall_firstchar_1, forall_firstchar): New functions.
(analyze_first_old): Rename from `analyze_first`.
(struct anafirst_data): New struct.
(analyze_first_fastmap, analyze_first_null): New functions.
(analyze_first): Rewrite to use `forall_firstchar` with those two functions.
Take a `bufp` rather than a `multibyte` arg.
(regex_compile, re_compile_fastmap): Adjust calls accordingly.
(struct mutexcl_data): New struct.
(mutually_exclusive_one): New function.
(mutually_exclusive_p): Rewrite to use `forall_firstchar` with that function.
This commit is contained in:
Stefan Monnier 2023-09-29 17:39:10 -04:00
parent 6e44d6e184
commit e61a039843

View file

@ -1179,8 +1179,8 @@ static void insert_op2 (re_opcode_t op, unsigned char *loc,
static bool at_begline_loc_p (re_char *pattern, re_char *p);
static bool at_endline_loc_p (re_char *p, re_char *pend);
static re_char *skip_one_char (re_char *p);
static bool analyze_first (re_char *p, re_char *pend,
char *fastmap, bool multibyte);
static bool analyze_first (struct re_pattern_buffer *bufp,
re_char *p, re_char *pend, char *fastmap);
/* Fetch the next character in the uncompiled pattern, with no
translation. */
@ -1930,7 +1930,7 @@ regex_compile (re_char *pattern, ptrdiff_t size,
ptrdiff_t startoffset = 0;
re_opcode_t ofj =
/* Check if the loop can match the empty string. */
(simple || !analyze_first (laststart, b, NULL, false))
(simple || !analyze_first (bufp, laststart, b, NULL))
? on_failure_jump : on_failure_jump_loop;
eassert (skip_one_char (laststart) <= b);
@ -1987,7 +1987,7 @@ regex_compile (re_char *pattern, ptrdiff_t size,
GET_BUFFER_SPACE (7); /* We might use less. */
if (many_times_ok)
{
bool emptyp = analyze_first (laststart, b, NULL, false);
bool emptyp = analyze_first (bufp, laststart, b, NULL);
/* The non-greedy multiple match looks like
a repeat..until: we only need a conditional jump
@ -2822,7 +2822,229 @@ group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
return false;
}
/* analyze_first.
/* Iterate through all the char-matching operations directly reachable from P.
   This is the inner loop of `forall_firstchar`, which see.
   LOOP_BEG..LOOP_END delimit the current "block" of code (we assume
   the code is made of syntactically nested loops).
   LOOP_END is blindly assumed to be "safe".
   To guarantee termination, at each iteration, either LOOP_BEG should
   get bigger, or it should stay the same and P should get bigger.

   F is called with ARG on each char-matching (or informative
   zero-width) operation that is found.
   Return false as soon as PEND is reached, a call to F fails, or the
   code does not have the expected shape; return true when the walk
   reaches LOOP_END or otherwise ends on a safe operation.  */
static bool
forall_firstchar_1 (re_char *p, re_char *pend,
                    re_char *loop_beg, re_char *loop_end,
                    bool f (const re_char *p, void *arg), void *arg)
{
  eassert (p >= loop_beg);
  eassert (p <= loop_end);

  while (true)
    {
      re_char *newp1, *newp2, *tmp;
      re_char *p_orig = p;

      if (p == pend)
        return false;
      else if (p == loop_end)
        return true;
      else if (p > loop_end)
        {
#if ENABLE_CHECKING
          fprintf (stderr, "FORALL_FIRSTCHAR: Broken assumption1!!\n");
#endif
          return false; /* FIXME: Broken assumption about the code shape.  */
        }
      else
        switch (*p)
          {
          /* Cases which stop the iteration.  */
          case succeed:
          case exactn:
          case charset:
          case charset_not:
          case anychar:
          case syntaxspec:
          case notsyntaxspec:
          case categoryspec:
          case notcategoryspec:
            return f (p, arg);

          /* Cases which may match the empty string.  */
          case at_dot:
          case begbuf:
          case no_op:
          case wordbound:
          case notwordbound:
          case begline:
            p++;
            continue;

          /* Cases which may match the empty string and may
             tell us something about the next char.  */
          case endline:
          case endbuf:
          case wordbeg:
          case wordend:
          case symbeg:
          case symend:
            if (f (p, arg))
              return true;
            p++;
            continue;

          case jump:
          case jump_n:
            newp1 = extract_address (p + 1);
            if (newp1 > p)
              { /* Forward jump, boring.  */
                p = newp1;
                continue;
              }
            /* Backward jump: what it lands on decides whether there are
               one or two destinations to check.  */
            switch (*newp1)
              {
              case on_failure_jump:
              case on_failure_keep_string_jump:
              case on_failure_jump_nastyloop:
              case on_failure_jump_loop:
              case on_failure_jump_smart:
              case succeed_n:
                newp2 = extract_address (newp1 + 1);
                goto do_twoway_jump;
              default:
                newp2 = loop_end; /* "Safe" choice.  */
                goto do_jump;
              }

          case on_failure_jump:
          case on_failure_keep_string_jump:
          case on_failure_jump_nastyloop:
          case on_failure_jump_loop:
          case on_failure_jump_smart:
            newp1 = extract_address (p + 1);
            newp2 = p + 3;
            /* For `+` loops, we often have an `on_failure_jump` that skips
               forward over a subsequent `jump`.  Recognize this pattern
               since that subsequent `jump` is the one that jumps to the
               loop-entry.  */
            newp2 = ((re_opcode_t) *newp2 == jump)
                    ? extract_address (newp2 + 1) : newp2;

          do_twoway_jump:
            /* We have to check that both destinations are safe.
               Arrange for `newp1` to be the smaller of the two.  */
            if (newp1 > newp2)
              (tmp = newp1, newp1 = newp2, newp2 = tmp);

            if (newp2 <= p_orig) /* Both destinations go backward!  */
              {
#if ENABLE_CHECKING
                fprintf (stderr, "FORALL_FIRSTCHAR: Broken assumption2!!\n");
#endif
                return false;
              }

            if (!forall_firstchar_1 (newp2, pend, loop_beg, loop_end, f, arg))
              return false;

          do_jump:
            eassert (newp2 <= loop_end);
            if (newp1 <= p_orig)
              {
                if (newp1 < loop_beg)
                  {
#if ENABLE_CHECKING
                    fprintf (stderr, "FORALL_FIRSTCHAR: Broken assumption3!!\n");
#endif
                    return false;
                  }
                else if (newp1 == loop_beg)
                  /* If we jump backward to the entry point of the current loop
                     it means it's a zero-length cycle through that loop;
                     this cycle itself does not break safety.  */
                  return true;
                else
                  /* We jump backward to a new loop, nested within the current
                     one.  `newp1` is the entry point and `newp2` the exit of
                     that inner loop.  */
                  /* `p` gets smaller, but termination is still ensured because
                     `loop_beg` gets bigger.  */
                  (loop_beg = newp1, loop_end = newp2);
              }
            p = newp1;
            continue;

          case succeed_n:
            newp1 = extract_address (p + 1);
            newp2 = p + 5; /* Skip the two bytes containing the count.  */
            goto do_twoway_jump;

          case set_number_at:
            {
              /* The braces are required: before C23 a label must be
                 followed by a statement, not by a declaration.  */
              int offset = extract_number (p + 1);
              DEBUG_STATEMENT (eassert (extract_number (p + 3)));
              /* If we're setting the counter of an immediately following
                 `succeed_n`, then this next execution of `succeed_n` will do
                 nothing but decrement its counter and "fall through".
                 So we do the fall through here to avoid considering the
                 "on failure" part of the `succeed_n` which should only be
                 considered when coming from the `jump(_n)` at the end of
                 the loop.  */
              p += (offset == 5 && p[5] == succeed_n) ? 10 : 5;
              continue;
            }

          case start_memory:
          case stop_memory:
            p += 2;
            continue;

          /* This could match the empty string, so we may need to continue,
             but in most cases, this can match "anything", so we should
             return `false` unless told otherwise.  */
          case duplicate:
            if (!f (p, arg))
              return false;
            p += 2;
            continue;

          default:
            abort (); /* We have listed all the cases.  */
          }
    }
}
/* Iterate through all the char-matching operations directly reachable from P.
   Return true if P is "safe", meaning that PEND cannot be reached directly
   from P and all calls to F returned true.
   Return false if PEND *may* be directly reachable from P or if one of
   the calls to F returned false.
   PEND can be NULL (and hence never reachable).

   Call `F (POS, ARG)` for every POS directly reachable from P,
   before reaching PEND, where POS is the position of a char-matching
   operation (`exactn`, `charset`, ...).

   For operations that match the empty string (`wordbeg`, ...), if F
   returns true we stop going down that path immediately but if it returns
   false we don't consider it as a failure and we simply look for the
   next char-matching operations on that path.
   For `duplicate`, it is the reverse: a false is an immediate failure
   whereas a true just lets the analysis continue with the rest of the path.

   This function can be used while building the bytecode (in which case
   you should pass NULL for bufp), but if so, P and PEND need to delimit
   a valid block such that there is no jump to a location outside
   of [P...PEND].  */
static bool
forall_firstchar (struct re_pattern_buffer *bufp, re_char *p, re_char *pend,
                  bool f (re_char *p, void *arg), void *arg)
{
  /* A buffer, when given, must already hold compiled code.  */
  eassert (!bufp || bufp->used);
  /* Without PEND the block bounds come from BUFP, so BUFP must be there.
     Test BUFP itself first so that the assertion does not dereference
     NULL in precisely the case it is meant to catch.  */
  eassert (pend || (bufp && bufp->used));

  /* With a buffer, use bounds just outside of it as the outermost "loop";
     without one, the block [P...PEND] is the outermost loop.  */
  return forall_firstchar_1 (p, pend,
                             bufp ? bufp->buffer - 1 : p,
                             bufp ? bufp->buffer + bufp->used + 1 : pend,
                             f, arg);
}
/* analyze_first_old.
If fastmap is non-NULL, go through the pattern and fill fastmap
with all the possible leading chars. If fastmap is NULL, don't
bother filling it up (obviously) and only return whether the
@ -2833,7 +3055,7 @@ group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
or if fastmap was not updated accurately. */
static bool
analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte)
analyze_first_old (re_char *p, re_char *pend, char *fastmap, bool multibyte)
{
int j, k;
int nbits;
@ -3079,7 +3301,7 @@ analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte)
{ /* We have to look down both arms.
We first go down the "straight" path so as to minimize
stack usage when going through alternatives. */
bool r = analyze_first (p, pend, fastmap, multibyte);
bool r = analyze_first_old (p, pend, fastmap, multibyte);
if (r) return r;
p += j;
}
@ -3133,7 +3355,263 @@ analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte)
/* We reached the end without matching anything. */
return true;
} /* analyze_first */
} /* analyze_first_old */
/* State handed by `analyze_first` to the `analyze_first_fastmap` callback.
   `analyze_first` fills it with a positional initializer, so the order
   of the fields matters.  */
struct anafirst_data {
/* Whether the pattern is multibyte; `analyze_first` copies it from the
   pattern buffer (or uses false when there is no buffer).  */
bool multibyte;
/* Table being filled: an entry is set to 1 for each byte that may start
   a match.  */
char *fastmap;
/* Set once every multibyte leading code has been entered into `fastmap`,
   so that this is not done again.  */
bool match_any_multibyte_characters;
};
static bool
analyze_first_fastmap (const re_char *p, void *arg)
{
struct anafirst_data *data = arg;
int j, k;
int nbits;
bool not;
switch (*p)
{
case succeed:
return false;
case duplicate:
/* If the first character has to match a backreference, that means
that the group was empty (since it already matched). Since this
is the only case that interests us here, we can assume that the
backreference must match the empty string and we need to
build the fastmap from the rest of the path. */
return true;
/* Following are the cases which match a character. These end
with 'break'. */
case exactn:
p++;
/* If multibyte is nonzero, the first byte of each
character is an ASCII or a leading code. Otherwise,
each byte is a character. Thus, this works in both
cases. */
data->fastmap[p[1]] = 1;
if (data->multibyte)
{
/* Cover the case of matching a raw char in a
multibyte regexp against unibyte. */
if (CHAR_BYTE8_HEAD_P (p[1]))
data->fastmap[CHAR_TO_BYTE8 (STRING_CHAR (p + 1))] = 1;
}
else
{
/* For the case of matching this unibyte regex
against multibyte, we must set a leading code of
the corresponding multibyte character. */
int c = RE_CHAR_TO_MULTIBYTE (p[1]);
data->fastmap[CHAR_LEADING_CODE (c)] = 1;
}
return true;
case anychar:
/* We could put all the chars except for \n (and maybe \0)
but we don't bother since it is generally not worth it. */
return false;
case charset_not:
{
/* Chars beyond end of bitmap are possible matches. */
for (j = CHARSET_BITMAP_SIZE (p) * BYTEWIDTH;
j < (1 << BYTEWIDTH); j++)
data->fastmap[j] = 1;
}
FALLTHROUGH;
case charset:
not = (re_opcode_t) *(p) == charset_not;
nbits = CHARSET_BITMAP_SIZE (p) * BYTEWIDTH;
p += 2;
for (j = 0; j < nbits; j++)
if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
data->fastmap[j] = 1;
/* To match raw bytes (in the 80..ff range) against multibyte
strings, add their leading bytes to the fastmap. */
for (j = 0x80; j < nbits; j++)
if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
data->fastmap[CHAR_LEADING_CODE (BYTE8_TO_CHAR (j))] = 1;
if (/* Any leading code can possibly start a character
which doesn't match the specified set of characters. */
not
||
/* If we can match a character class, we can match any
multibyte characters. */
(CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
&& CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
{
if (!data->match_any_multibyte_characters)
{
for (j = MIN_MULTIBYTE_LEADING_CODE;
j <= MAX_MULTIBYTE_LEADING_CODE; j++)
data->fastmap[j] = 1;
data->match_any_multibyte_characters = true;
}
}
else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
&& data->match_any_multibyte_characters == false)
{
/* Set fastmap[I] to 1 where I is a leading code of each
multibyte character in the range table. */
int c, count;
unsigned char lc1, lc2;
/* Make P points the range table. '+ 2' is to skip flag
bits for a character class. */
p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
/* Extract the number of ranges in range table into COUNT. */
EXTRACT_NUMBER_AND_INCR (count, p);
for (; count > 0; count--, p += 3)
{
/* Extract the start and end of each range. */
EXTRACT_CHARACTER (c, p);
lc1 = CHAR_LEADING_CODE (c);
p += 3;
EXTRACT_CHARACTER (c, p);
lc2 = CHAR_LEADING_CODE (c);
for (j = lc1; j <= lc2; j++)
data->fastmap[j] = 1;
}
}
return true;
case syntaxspec:
case notsyntaxspec:
/* This match depends on text properties. These end with
aborting optimizations. */
return false;
case categoryspec:
case notcategoryspec:
not = (re_opcode_t)p[0] == notcategoryspec;
p++;
k = *p++;
for (j = (1 << BYTEWIDTH); j >= 0; j--)
if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
data->fastmap[j] = 1;
/* Any leading code can possibly start a character which
has or doesn't has the_malloc_fn specified category. */
if (!data->match_any_multibyte_characters)
{
for (j = MIN_MULTIBYTE_LEADING_CODE;
j <= MAX_MULTIBYTE_LEADING_CODE; j++)
data->fastmap[j] = 1;
data->match_any_multibyte_characters = true;
}
return true;
case endline:
case endbuf:
case wordbeg:
case wordend:
case symbeg:
case symend:
/* This false doesn't mean failure but rather "not succeeded yet". */
return false;
default:
#if ENABLE_CHECKING
abort (); /* We have listed all the cases. */
#endif
return false;
}
}
/* Callback for `forall_firstchar` used by `analyze_first` when there is
   no fastmap to fill and the only question is whether the pattern may
   match the empty string.  ARG is not used.  */
static bool
analyze_first_null (const re_char *p, void *arg)
{
  bool verdict;

  switch (*p)
    {
    /* Zero-width operations: answering false here doesn't mean failure
       but rather "not succeeded yet".  */
    case endline:
    case endbuf:
    case wordbeg:
    case wordend:
    case symbeg:
    case symend:
      verdict = false;
      break;

    /* `succeed` is safe: we can't reach `pend` at all from there.  */
    case succeed:
    /* Either `duplicate` ends up matching a non-empty string, in which
       case we're good, or it matches the empty string, in which case the
       rest of this path still needs checking, and answering true is
       exactly what asks for that.  */
    case duplicate:
    /* Everything below consumes a character.  */
    case exactn:
    case anychar:
    case charset:
    case charset_not:
    case syntaxspec:
    case notsyntaxspec:
    case categoryspec:
    case notcategoryspec:
      verdict = true;
      break;

    default:
#if ENABLE_CHECKING
      abort (); /* We have listed all the cases.  */
#endif
      verdict = false;
      break;
    }

  return verdict;
}
/* analyze_first.
   If fastmap is non-NULL, go through the pattern and fill fastmap
   with all the possible leading chars.  If fastmap is NULL, don't
   bother filling it up (obviously) and only return whether the
   pattern could potentially match the empty string.

   BUFP is the pattern buffer the code belongs to; it is only handed
   down to the analysis once it holds compiled code (`used` is nonzero).
   PEND must not be NULL.

   Return false if p matches at least one char before reaching pend.
   Return true  if p..pend might match the empty string
   or if fastmap was not updated accurately.  */
static bool
analyze_first (struct re_pattern_buffer *bufp,
               re_char *p, re_char *pend, char *fastmap)
{
  eassert (pend);
  struct anafirst_data data = { bufp ? bufp->multibyte : false,
                                fastmap, false };
  /* The initializer above tolerates a NULL BUFP, so test BUFP here as
     well before looking at `used`.  */
  bool safe = forall_firstchar (bufp && bufp->used ? bufp : NULL, p, pend,
                                fastmap ? analyze_first_fastmap
                                        : analyze_first_null,
                                &data);
#if ENABLE_CHECKING
  /* Cross-check against the previous implementation and report any
     difference between the two answers.  */
  bool old = !!analyze_first_old (p, pend, fastmap, data.multibyte);
  if (old && safe)
    {
      fprintf (stderr, "ANALYZE_FIRST: New optimization (fastmap=%d)!\n",
               fastmap ? 1 : 0);
      print_partial_compiled_pattern (stderr, p, pend);
    }
  else if (!old && !safe)
    {
      fprintf (stderr, "ANALYZE_FIRST: Lost an optimization (fastmap=%d)!\n",
               fastmap ? 1 : 0);
      print_partial_compiled_pattern (stderr, p, pend);
    }
#endif
  /* "Safe" means PEND cannot be reached without matching a char, which
     is the opposite of what this function reports.  */
  return !safe;
}
/* Compute a fastmap for the compiled pattern in BUFP.
A fastmap records which of the (1 << BYTEWIDTH) possible
@ -3162,8 +3640,8 @@ re_compile_fastmap (struct re_pattern_buffer *bufp)
/* FIXME: Is the following assignment correct even when ANALYSIS < 0? */
bufp->fastmap_accurate = 1; /* It will be when we're done. */
bufp->can_be_null = analyze_first (bufp->buffer, bufp->buffer + bufp->used,
fastmap, RE_MULTIBYTE_P (bufp));
bufp->can_be_null = analyze_first (bufp, bufp->buffer,
bufp->buffer + bufp->used, fastmap);
} /* re_compile_fastmap */
/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
@ -3962,11 +4440,102 @@ mutually_exclusive_aux (struct re_pattern_buffer *bufp, re_char *p1,
return false;
}
/* State handed by `mutually_exclusive_p` to the `mutually_exclusive_one`
   callback.  It is filled with a positional initializer, so the order
   of the fields matters.  */
struct mutexcl_data {
/* Pattern buffer, passed on to the exactn/charset comparison helpers.  */
struct re_pattern_buffer *bufp;
/* The operation against which each reachable operation is compared.  */
re_char *p1;
};
/* Callback for `forall_firstchar` used by `mutually_exclusive_p`.
   P2 is an operation reachable from the second position, ARG a
   `struct mutexcl_data` holding the pattern buffer and the first
   operation P1.
   Return true if the operation at P2 is known not to match what the
   operation at P1 matches, false if that cannot be established.  */
static bool
mutually_exclusive_one (re_char *p2, void *arg)
{
  struct mutexcl_data *ctx = arg;
  struct re_pattern_buffer *bufp = ctx->bufp;
  re_char *p1 = ctx->p1;
  bool excl;

  switch (*p2)
    {
    case endline:
    case exactn:
      excl = mutually_exclusive_exactn (bufp, p1, p2);
      break;

    case charset:
      if (*p1 == exactn)
        excl = mutually_exclusive_exactn (bufp, p2, p1);
      else
        excl = mutually_exclusive_charset (bufp, p1, p2);
      break;

    case charset_not:
      if (*p1 == exactn)
        excl = mutually_exclusive_exactn (bufp, p2, p1);
      else if (*p1 == charset)
        excl = mutually_exclusive_charset (bufp, p2, p1);
      else
        /* This covers in particular two charset_not: it's very unlikely
           that they don't overlap.  The union of the two sets of excluded
           chars should cover all possible chars, which, as a matter of
           fact, is virtually impossible in multibyte buffers.  */
        excl = false;
      break;

    case anychar:
      excl = false; /* FIXME: exactn \n ?  */
      break;

    /* A syntax or category test only excludes its exact negation.  */
    case syntaxspec:
      excl = (*p1 == notsyntaxspec && p1[1] == p2[1]);
      break;
    case notsyntaxspec:
      excl = (*p1 == syntaxspec && p1[1] == p2[1]);
      break;
    case categoryspec:
      excl = (*p1 == notcategoryspec && p1[1] == p2[1]);
      break;
    case notcategoryspec:
      excl = (*p1 == categoryspec && p1[1] == p2[1]);
      break;

    case endbuf:
    case succeed:
      excl = true;
      break;

    /* Word and symbol boundaries only tell us something when P1 is a
       (not)syntaxspec on the matching syntax class.  */
    case wordbeg:
      excl = (*p1 == notsyntaxspec && p1[1] == Sword);
      break;
    case wordend:
      excl = (*p1 == syntaxspec && p1[1] == Sword);
      break;
    case symbeg:
      excl = (*p1 == notsyntaxspec
              && (p1[1] == Ssymbol || p1[1] == Sword));
      break;
    case symend:
      excl = (*p1 == syntaxspec
              && (p1[1] == Ssymbol || p1[1] == Sword));
      break;

    case duplicate:
      /* At this point, we know nothing about what this can match, sadly.  */
      excl = false;
      break;

    default:
#if ENABLE_CHECKING
      abort (); /* We have listed all the cases.  */
#endif
      excl = false;
      break;
    }

  return excl;
}
/* Return true if the operation at P1 is mutually exclusive with every
   char-matching operation directly reachable from P2 in BUFP, as judged
   by `mutually_exclusive_one`.
   The answer comes from `forall_firstchar`, called with a NULL end
   position.  When ENABLE_CHECKING is set, it is also compared with the
   answer of the previous implementation, `mutually_exclusive_aux`, and
   any difference is reported on stderr.  */
static bool
mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1,
                      re_char *p2)
{
  /* No early return through `mutually_exclusive_aux` here: it would make
     the analysis below unreachable.  */
  struct mutexcl_data data = { bufp, p1 };

  bool excl = forall_firstchar (bufp, p2, NULL, mutually_exclusive_one, &data);
#if ENABLE_CHECKING
  bool old = mutually_exclusive_aux (bufp, p1, p2, bufp->buffer - 1, NULL);
  if (old && !excl)
    {
      fprintf (stderr, "MUTUALLY_EXCLUSIVE_P: Lost an optimization between %d and %d!\n",
               (int)(p1 - bufp->buffer), (int)(p2 - bufp->buffer));
      print_partial_compiled_pattern (stderr, bufp->buffer, bufp->buffer + bufp->used);
    }
  else if (!old && excl)
    {
      fprintf (stderr, "MUTUALLY_EXCLUSIVE_P: New optimization between %d and %d!\n",
               (int)(p1 - bufp->buffer), (int)(p2 - bufp->buffer));
      print_partial_compiled_pattern (stderr, bufp->buffer, bufp->buffer + bufp->used);
    }
#endif
  return excl;
}
/* Matching routines. */