regex.c: Remove the old analyzes functions

After testing and checking that the changes brought by the new
consolidated analysis function (which are all cases of improved
optimizations) are indeed safe, remove the old code.

* src/regex-emacs.c (analyze_first_old): Delete function.
(analyze_first): Don't call it any more.
(skip_noops): Delete function.
(mutually_exclusive_aux): Delete function.
(mutually_exclusive_p): Don't call it any more.
This commit is contained in:
Stefan Monnier 2023-09-29 17:46:20 -04:00
parent e61a039843
commit 168cc0aff0

View file

@ -3044,319 +3044,6 @@ forall_firstchar (struct re_pattern_buffer *bufp, re_char *p, re_char *pend,
f, arg);
}
/* analyze_first_old.
If fastmap is non-NULL, go through the pattern and fill fastmap
with all the possible leading chars. If fastmap is NULL, don't
bother filling it up (obviously) and only return whether the
pattern could potentially match the empty string.
Return false if p..pend matches at least one char.
Return true if p..pend might match the empty string
or if fastmap was not updated accurately. */
static bool
analyze_first_old (re_char *p, re_char *pend, char *fastmap, bool multibyte)
{
int j, k;
int nbits;
bool not;
/* If all elements for base leading-codes in fastmap is set, this
flag is set true. */
bool match_any_multibyte_characters = false;
eassert (p);
/* The loop below works as follows:
- It has a working-list kept in the PATTERN_STACK and which basically
starts by only containing a pointer to the first operation.
- If the opcode we're looking at is a match against some set of
chars, then we add those chars to the fastmap and go on to the
next work element from the worklist (done via 'break').
- If the opcode is a control operator on the other hand, we either
ignore it (if it's meaningless at this point, such as 'start_memory')
or execute it (if it's a jump). If the jump has several destinations
(i.e. 'on_failure_jump'), then we push the other destination onto the
worklist.
We guarantee termination by ignoring backward jumps (more or less),
so that P is monotonically increasing. More to the point, we
never set P (or push) anything '<= p1'. */
while (p < pend)
{
/* P1 is used as a marker of how far back a 'on_failure_jump'
can go without being ignored. It is normally equal to P
(which prevents any backward 'on_failure_jump') except right
after a plain 'jump', to allow patterns such as:
0: jump 10
3..9: <body>
10: on_failure_jump 3
as used for the *? operator. */
re_char *p1 = p;
switch (*p++)
{
case succeed:
return true;
case duplicate:
/* If the first character has to match a backreference, that means
that the group was empty (since it already matched). Since this
is the only case that interests us here, we can assume that the
backreference must match the empty string. */
/* Note: Only true if we started from bufp->buffer (e.g. when
fastmap is non-NULL), but when fastmap is NULL we're only
trying to figure out if the body of a loop can possibly match
the empty string so it's safe (tho sometimes pessimistic)
to presume that `duplicate` can. */
p++;
continue;
/* Following are the cases which match a character. These end
with 'break'. */
case exactn:
if (fastmap)
{
/* If multibyte is nonzero, the first byte of each
character is an ASCII or a leading code. Otherwise,
each byte is a character. Thus, this works in both
cases. */
fastmap[p[1]] = 1;
if (multibyte)
{
/* Cover the case of matching a raw char in a
multibyte regexp against unibyte. */
if (CHAR_BYTE8_HEAD_P (p[1]))
fastmap[CHAR_TO_BYTE8 (STRING_CHAR (p + 1))] = 1;
}
else
{
/* For the case of matching this unibyte regex
against multibyte, we must set a leading code of
the corresponding multibyte character. */
int c = RE_CHAR_TO_MULTIBYTE (p[1]);
fastmap[CHAR_LEADING_CODE (c)] = 1;
}
}
break;
case anychar:
/* We could put all the chars except for \n (and maybe \0)
but we don't bother since it is generally not worth it. */
if (!fastmap) break;
return true;
case charset_not:
if (!fastmap) break;
{
/* Chars beyond end of bitmap are possible matches. */
for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
j < (1 << BYTEWIDTH); j++)
fastmap[j] = 1;
}
FALLTHROUGH;
case charset:
if (!fastmap) break;
not = (re_opcode_t) *(p - 1) == charset_not;
nbits = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
p++;
for (j = 0; j < nbits; j++)
if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
fastmap[j] = 1;
/* To match raw bytes (in the 80..ff range) against multibyte
strings, add their leading bytes to the fastmap. */
for (j = 0x80; j < nbits; j++)
if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
fastmap[CHAR_LEADING_CODE (BYTE8_TO_CHAR (j))] = 1;
if (/* Any leading code can possibly start a character
which doesn't match the specified set of characters. */
not
||
/* If we can match a character class, we can match any
multibyte characters. */
(CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
&& CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
{
if (match_any_multibyte_characters == false)
{
for (j = MIN_MULTIBYTE_LEADING_CODE;
j <= MAX_MULTIBYTE_LEADING_CODE; j++)
fastmap[j] = 1;
match_any_multibyte_characters = true;
}
}
else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
&& match_any_multibyte_characters == false)
{
/* Set fastmap[I] to 1 where I is a leading code of each
multibyte character in the range table. */
int c, count;
unsigned char lc1, lc2;
/* Make P points the range table. '+ 2' is to skip flag
bits for a character class. */
p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
/* Extract the number of ranges in range table into COUNT. */
EXTRACT_NUMBER_AND_INCR (count, p);
for (; count > 0; count--, p += 3)
{
/* Extract the start and end of each range. */
EXTRACT_CHARACTER (c, p);
lc1 = CHAR_LEADING_CODE (c);
p += 3;
EXTRACT_CHARACTER (c, p);
lc2 = CHAR_LEADING_CODE (c);
for (j = lc1; j <= lc2; j++)
fastmap[j] = 1;
}
}
break;
case syntaxspec:
case notsyntaxspec:
if (!fastmap) break;
/* This match depends on text properties. These end with
aborting optimizations. */
return true;
case categoryspec:
case notcategoryspec:
if (!fastmap) break;
not = (re_opcode_t)p[-1] == notcategoryspec;
k = *p++;
for (j = (1 << BYTEWIDTH); j >= 0; j--)
if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
fastmap[j] = 1;
/* Any leading code can possibly start a character which
has or doesn't has the specified category. */
if (match_any_multibyte_characters == false)
{
for (j = MIN_MULTIBYTE_LEADING_CODE;
j <= MAX_MULTIBYTE_LEADING_CODE; j++)
fastmap[j] = 1;
match_any_multibyte_characters = true;
}
break;
/* All cases after this match the empty string. These end with
'continue'. */
case at_dot:
case no_op:
case begline:
case endline:
case begbuf:
case endbuf:
case wordbound:
case notwordbound:
case wordbeg:
case wordend:
case symbeg:
case symend:
continue;
case jump:
EXTRACT_NUMBER_AND_INCR (j, p);
if (j < 0)
/* Backward jumps can only go back to code that we've already
visited. 're_compile' should make sure this is true. */
break;
p += j;
switch (*p)
{
case on_failure_jump:
case on_failure_keep_string_jump:
case on_failure_jump_loop:
case on_failure_jump_nastyloop:
case on_failure_jump_smart:
p++;
break;
default:
continue;
};
/* Keep P1 to allow the 'on_failure_jump' we are jumping to
to jump back to "just after here". */
FALLTHROUGH;
case on_failure_jump:
case on_failure_keep_string_jump:
case on_failure_jump_nastyloop:
case on_failure_jump_loop:
case on_failure_jump_smart:
EXTRACT_NUMBER_AND_INCR (j, p);
if (p + j <= p1)
; /* Backward jump to be ignored. */
else
{ /* We have to look down both arms.
We first go down the "straight" path so as to minimize
stack usage when going through alternatives. */
bool r = analyze_first_old (p, pend, fastmap, multibyte);
if (r) return r;
p += j;
}
continue;
case jump_n:
/* This code simply does not properly handle forward jump_n. */
DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); eassert (j < 0));
p += 4;
/* jump_n can either jump or fall through. The (backward) jump
case has already been handled, so we only need to look at the
fallthrough case. */
continue;
case succeed_n:
/* If N == 0, it should be an on_failure_jump_loop instead.
`j` can be negative because `EXTRACT_NUMBER` extracts a
signed number whereas `succeed_n` treats it as unsigned. */
DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); eassert (j != 0));
p += 4;
/* We only care about one iteration of the loop, so we don't
need to consider the case where this behaves like an
on_failure_jump. */
/* FIXME: Sadly, the above is not true when the loop's body
can match the empty string :-( */
/* continue; */
return true;
case set_number_at:
p += 4;
continue;
case start_memory:
case stop_memory:
p += 1;
continue;
default:
abort (); /* We have listed all the cases. */
} /* switch *p++ */
/* Getting here means we have found the possible starting
characters for one path of the pattern -- and that the empty
string does not match. We need not follow this path further. */
return false;
} /* while p */
/* We reached the end without matching anything. */
return true;
} /* analyze_first_old */
struct anafirst_data {
bool multibyte;
char *fastmap;
@ -3594,21 +3281,6 @@ analyze_first (struct re_pattern_buffer *bufp,
fastmap ? analyze_first_fastmap
: analyze_first_null,
&data);
#if ENABLE_CHECKING
bool old = !!analyze_first_old (p, pend, fastmap, data.multibyte);
if (old && safe)
{
fprintf (stderr, "ANALYZE_FIRST: New optimization (fastmap=%d)!\n",
fastmap ? 1 : 0);
print_partial_compiled_pattern (stderr, p, pend);
}
else if (!old && !safe)
{
fprintf (stderr, "ANALYZE_FIRST: Lost an optimization (fastmap=%d)!\n",
fastmap ? 1 : 0);
print_partial_compiled_pattern (stderr, p, pend);
}
#endif
return !safe;
}
@ -4056,30 +3728,6 @@ skip_one_char (re_char *p)
}
/* Jump over non-matching operations. */
static re_char *
skip_noops (re_char *p, re_char *pend)
{
while (p < pend)
{
switch (*p)
{
case start_memory:
case stop_memory:
p += 2; break;
case no_op:
p += 1; break;
case jump:
p = extract_address (p + 1);
break;
default:
return p;
}
}
eassert (p == pend);
return p;
}
/* Test if C matches charset op. *PP points to the charset or charset_not
opcode. When the function finishes, *PP will be advanced past that opcode.
C is character to test (possibly after translations) and CORIG is original
@ -4247,199 +3895,6 @@ mutually_exclusive_charset (struct re_pattern_buffer *bufp, re_char *p1,
return false;
}
/* True if "p1 matches something" implies "p2 fails". */
/* We're trying to follow all paths reachable from `p2`, but since some
loops can match the empty string, this can loop back to `p2`.
To avoid inf-looping, we take advantage of the fact that
the bytecode we generate is made of syntactically nested loops, more
specifically, every loop has a single entry point and single exit point.
The function takes 2 more arguments (`loop_entry` and `loop_exit`).
`loop_entry` points to the sole entry point of the current loop and
`loop_exit` points to its sole exit point (when non-NULL).
Jumps outside of `loop_entry..exit` should not occur.
The function can assume that `loop_exit` is "mutually exclusive".
The same holds for `loop_entry` except when `p2 == loop_entry`.
To guarantee termination, recursive calls should make sure that either
`loop_entry` is larger, or it's unchanged but `p2` is larger.
FIXME: This is failsafe (can't return true when it shouldn't)
but it could be too conservative if we start generating bytecode
with a different shape, so maybe we should bite the bullet and
replace done_beg/end with an actual list of positions we've
already processed. */
static bool
mutually_exclusive_aux (struct re_pattern_buffer *bufp, re_char *p1,
re_char *p2, re_char *loop_entry, re_char *loop_exit)
{
re_opcode_t op2;
unsigned char *pend = bufp->buffer + bufp->used;
re_char *p2_orig = p2;
eassert (p1 >= bufp->buffer && p1 < pend
&& p2 >= bufp->buffer && p2 <= pend);
if (p2 == loop_exit)
return true; /* Presumably already checked elsewhere. */
eassert (loop_entry && p2 >= loop_entry);
if (p2 < loop_entry || (loop_exit && p2 > loop_exit))
{ /* The assumptions about the shape of the code aren't true :-( */
#ifdef ENABLE_CHECKING
error ("Broken assumption in regex.c:mutually_exclusive_aux");
#endif
return false;
}
/* Skip over open/close-group commands.
If what follows this loop is a ...+ construct,
look at what begins its body, since we will have to
match at least one of that. */
p2 = skip_noops (p2, pend);
/* The same skip can be done for p1, except that this function
is only used in the case where p1 is a simple match operator. */
/* p1 = skip_noops (p1, pend); */
eassert (p1 >= bufp->buffer && p1 < pend
&& p2 >= bufp->buffer && p2 <= pend);
op2 = p2 == pend ? succeed : *p2;
switch (op2)
{
case succeed:
case endbuf:
/* If we're at the end of the pattern, we can change. */
if (skip_one_char (p1))
{
DEBUG_PRINT (" End of pattern: fast loop.\n");
return true;
}
break;
case endline:
case exactn:
return mutually_exclusive_exactn (bufp, p1, p2);
case charset:
{
if ((re_opcode_t) *p1 == exactn)
return mutually_exclusive_exactn (bufp, p2, p1);
else
return mutually_exclusive_charset (bufp, p1, p2);
}
break;
case charset_not:
switch (*p1)
{
case exactn:
return mutually_exclusive_exactn (bufp, p2, p1);
case charset:
return mutually_exclusive_charset (bufp, p2, p1);
case charset_not:
/* When we have two charset_not, it's very unlikely that
they don't overlap. The union of the two sets of excluded
chars should cover all possible chars, which, as a matter of
fact, is virtually impossible in multibyte buffers. */
break;
}
break;
case wordend:
return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
case symend:
return ((re_opcode_t) *p1 == syntaxspec
&& (p1[1] == Ssymbol || p1[1] == Sword));
case notsyntaxspec:
return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);
case wordbeg:
return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
case symbeg:
return ((re_opcode_t) *p1 == notsyntaxspec
&& (p1[1] == Ssymbol || p1[1] == Sword));
case syntaxspec:
return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);
case wordbound:
/* FIXME: This optimization seems correct after the first iteration
of the loop, but not for the very first :-(
IOW we'd need to pull out the first iteration and do:
syntaxspec w
on_failure_keep_string_jump end
loop:
syntaxspec w
goto loop
end:
wordbound
return (((re_opcode_t) *p1 == notsyntaxspec
|| (re_opcode_t) *p1 == syntaxspec)
&& p1[1] == Sword); */
return false;
case categoryspec:
return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
case notcategoryspec:
return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
case on_failure_jump_nastyloop:
case on_failure_jump_smart:
case on_failure_jump_loop:
case on_failure_keep_string_jump:
case on_failure_jump:
{
int mcnt;
p2++;
EXTRACT_NUMBER_AND_INCR (mcnt, p2);
re_char *p2_other = p2 + mcnt, *tmp;
/* For `+` loops, we often have an `on_failure_jump` that skips forward
over a subsequent `jump` for lack of an `on_failure_dont_jump`
kind of thing. Recognize this pattern since that subsequent
`jump` is the one that jumps to the loop-entry. */
if ((re_opcode_t) p2[0] == jump && mcnt == 3)
p2 = extract_address (p2 + 1);
/* We have to check that both destinations are safe.
Arrange for `p2` to be the smaller of the two. */
if (p2 > p2_other)
(tmp = p2, p2 = p2_other, p2_other = tmp);
if (p2_other <= p2_orig /* Both destinations go backward! */
|| !mutually_exclusive_aux (bufp, p1, p2_other,
loop_entry, loop_exit))
return false;
/* Now that we know that `p2_other` is a safe (i.e. mutually-exclusive)
position, let's check `p2`. */
if (p2 == loop_entry)
/* If we jump backward to the entry point of the current loop
it means it's a zero-length cycle through that loop, so
this cycle itself does not break mutual-exclusion. */
return true;
else if (p2 > p2_orig)
/* Boring forward jump. */
return mutually_exclusive_aux (bufp, p1, p2, loop_entry, loop_exit);
else if (loop_entry < p2 && p2 < p2_orig)
/* We jump backward to a new loop, nested within the current one.
`p2` is the entry point and `p2_other` the exit of that inner. */
return mutually_exclusive_aux (bufp, p1, p2, p2, p2_other);
else
return false;
}
default:
;
}
/* Safe default. */
return false;
}
struct mutexcl_data {
struct re_pattern_buffer *bufp;
re_char *p1;
@ -4514,28 +3969,14 @@ mutually_exclusive_one (re_char *p2, void *arg)
}
}
/* True if "p1 matches something" implies "p2 fails". */
static bool
mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1,
re_char *p2)
{
struct mutexcl_data data = { bufp, p1 };
bool new = forall_firstchar (bufp, p2, NULL, mutually_exclusive_one, &data);
#if ENABLE_CHECKING
bool old = mutually_exclusive_aux (bufp, p1, p2, bufp->buffer - 1, NULL);
if (old && !new)
{
fprintf (stderr, "MUTUALLY_EXCLUSIVE_P: Lost an optimization between %d and %d!\n",
(int)(p1 - bufp->buffer), (int)(p2 - bufp->buffer));
print_partial_compiled_pattern (stderr, bufp->buffer, bufp->buffer + bufp->used);
}
else if (!old && new)
{
fprintf (stderr, "MUTUALLY_EXCLUSIVE_P: New optimization between %d and %d!\n",
(int)(p1 - bufp->buffer), (int)(p2 - bufp->buffer));
print_partial_compiled_pattern (stderr, bufp->buffer, bufp->buffer + bufp->used);
}
#endif
return new;
return forall_firstchar (bufp, p2, NULL, mutually_exclusive_one, &data);
}
/* Matching routines. */