regex.c: Remove the old analyzes functions

After testing and checking that the changes brought by the new consolidated analysis function (which are all cases of improved optimizations) are indeed safe, remove the old code. * src/regex-emacs.c (analyze_first_old): Delete function. (analyze_first): Don't call it any more. (skip_noops): Delete function. (mutually_exclusive_aux): Delete function. (mutually_exclusive_p): Don't call it any more.
2023-09-29 17:46:20 -04:00 · 2023-09-29 17:46:20 -04:00 · 168cc0aff0
commit 168cc0aff0
parent e61a039843
1 changed files with 3 additions and 562 deletions
--- a/src/regex-emacs.c
+++ b/src/regex-emacs.c
@ -3044,319 +3044,6 @@ forall_firstchar (struct re_pattern_buffer *bufp, re_char *p, re_char *pend,
                             f, arg);
 }

-/* analyze_first_old.
-   If fastmap is non-NULL, go through the pattern and fill fastmap
-   with all the possible leading chars.  If fastmap is NULL, don't
-   bother filling it up (obviously) and only return whether the
-   pattern could potentially match the empty string.
-
-   Return false if p..pend matches at least one char.
-   Return true  if p..pend might match the empty string
-                or if fastmap was not updated accurately.  */
-
-static bool
-analyze_first_old (re_char *p, re_char *pend, char *fastmap, bool multibyte)
-{
-  int j, k;
-  int nbits;
-  bool not;
-
-  /* If all elements for base leading-codes in fastmap is set, this
-     flag is set true.  */
-  bool match_any_multibyte_characters = false;
-
-  eassert (p);
-
-  /* The loop below works as follows:
-     - It has a working-list kept in the PATTERN_STACK and which basically
-       starts by only containing a pointer to the first operation.
-     - If the opcode we're looking at is a match against some set of
-       chars, then we add those chars to the fastmap and go on to the
-       next work element from the worklist (done via 'break').
-     - If the opcode is a control operator on the other hand, we either
-       ignore it (if it's meaningless at this point, such as 'start_memory')
-       or execute it (if it's a jump).  If the jump has several destinations
-       (i.e. 'on_failure_jump'), then we push the other destination onto the
-       worklist.
-     We guarantee termination by ignoring backward jumps (more or less),
-     so that P is monotonically increasing.  More to the point, we
-     never set P (or push) anything '<= p1'.  */
-
-  while (p < pend)
-    {
-      /* P1 is used as a marker of how far back a 'on_failure_jump'
-	 can go without being ignored.  It is normally equal to P
-	 (which prevents any backward 'on_failure_jump') except right
-	 after a plain 'jump', to allow patterns such as:
-	    0: jump 10
-	    3..9: <body>
-	    10: on_failure_jump 3
-	 as used for the *? operator.  */
-      re_char *p1 = p;
-
-      switch (*p++)
-	{
-	case succeed:
-	  return true;
-
-	case duplicate:
-	  /* If the first character has to match a backreference, that means
-	     that the group was empty (since it already matched).  Since this
-	     is the only case that interests us here, we can assume that the
-	     backreference must match the empty string.  */
-	  /* Note: Only true if we started from bufp->buffer (e.g. when
-	     fastmap is non-NULL), but when fastmap is NULL we're only
-	     trying to figure out if the body of a loop can possibly match
-	     the empty string so it's safe (tho sometimes pessimistic)
-	     to presume that `duplicate` can.  */
-	  p++;
-	  continue;
-
-
-      /* Following are the cases which match a character.  These end
-	 with 'break'.  */
-
-	case exactn:
-	  if (fastmap)
-	    {
-	      /* If multibyte is nonzero, the first byte of each
-		 character is an ASCII or a leading code.  Otherwise,
-		 each byte is a character.  Thus, this works in both
-		 cases. */
-	      fastmap[p[1]] = 1;
-	      if (multibyte)
-		{
-		  /* Cover the case of matching a raw char in a
-		     multibyte regexp against unibyte.	*/
-		  if (CHAR_BYTE8_HEAD_P (p[1]))
-		    fastmap[CHAR_TO_BYTE8 (STRING_CHAR (p + 1))] = 1;
-		}
-	      else
-		{
-		  /* For the case of matching this unibyte regex
-		     against multibyte, we must set a leading code of
-		     the corresponding multibyte character.  */
-		  int c = RE_CHAR_TO_MULTIBYTE (p[1]);
-
-		  fastmap[CHAR_LEADING_CODE (c)] = 1;
-		}
-	    }
-	  break;
-
-
-	case anychar:
-	  /* We could put all the chars except for \n (and maybe \0)
-	     but we don't bother since it is generally not worth it.  */
-	  if (!fastmap) break;
-	  return true;
-
-
-	case charset_not:
-	  if (!fastmap) break;
-	  {
-	    /* Chars beyond end of bitmap are possible matches.  */
-	    for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
-		 j < (1 << BYTEWIDTH); j++)
-	      fastmap[j] = 1;
-	  }
-	  FALLTHROUGH;
-	case charset:
-	  if (!fastmap) break;
-	  not = (re_opcode_t) *(p - 1) == charset_not;
-	  nbits = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
-	  p++;
-	  for (j = 0; j < nbits; j++)
-	    if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
-	      fastmap[j] = 1;
-
-	  /* To match raw bytes (in the 80..ff range) against multibyte
-	     strings, add their leading bytes to the fastmap.  */
-	  for (j = 0x80; j < nbits; j++)
-	    if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
-	      fastmap[CHAR_LEADING_CODE (BYTE8_TO_CHAR (j))] = 1;
-
-	  if (/* Any leading code can possibly start a character
-		 which doesn't match the specified set of characters.  */
-	      not
-	      ||
-	      /* If we can match a character class, we can match any
-		 multibyte characters.  */
-	      (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
-	       && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
-
-	    {
-	      if (match_any_multibyte_characters == false)
-		{
-		  for (j = MIN_MULTIBYTE_LEADING_CODE;
-		       j <= MAX_MULTIBYTE_LEADING_CODE; j++)
-		    fastmap[j] = 1;
-		  match_any_multibyte_characters = true;
-		}
-	    }
-
-	  else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
-		   && match_any_multibyte_characters == false)
-	    {
-	      /* Set fastmap[I] to 1 where I is a leading code of each
-		 multibyte character in the range table. */
-	      int c, count;
-	      unsigned char lc1, lc2;
-
-	      /* Make P points the range table.  '+ 2' is to skip flag
-		 bits for a character class.  */
-	      p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
-
-	      /* Extract the number of ranges in range table into COUNT.  */
-	      EXTRACT_NUMBER_AND_INCR (count, p);
-	      for (; count > 0; count--, p += 3)
-		{
-		  /* Extract the start and end of each range.  */
-		  EXTRACT_CHARACTER (c, p);
-		  lc1 = CHAR_LEADING_CODE (c);
-		  p += 3;
-		  EXTRACT_CHARACTER (c, p);
-		  lc2 = CHAR_LEADING_CODE (c);
-		  for (j = lc1; j <= lc2; j++)
-		    fastmap[j] = 1;
-		}
-	    }
-	  break;
-
-	case syntaxspec:
-	case notsyntaxspec:
-	  if (!fastmap) break;
-	  /* This match depends on text properties.  These end with
-	     aborting optimizations.  */
-	  return true;
-
-	case categoryspec:
-	case notcategoryspec:
-	  if (!fastmap) break;
-	  not = (re_opcode_t)p[-1] == notcategoryspec;
-	  k = *p++;
-	  for (j = (1 << BYTEWIDTH); j >= 0; j--)
-	    if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
-	      fastmap[j] = 1;
-
-	  /* Any leading code can possibly start a character which
-	     has or doesn't has the specified category.  */
-	  if (match_any_multibyte_characters == false)
-	    {
-	      for (j = MIN_MULTIBYTE_LEADING_CODE;
-		   j <= MAX_MULTIBYTE_LEADING_CODE; j++)
-		fastmap[j] = 1;
-	      match_any_multibyte_characters = true;
-	    }
-	  break;
-
-      /* All cases after this match the empty string.  These end with
-	 'continue'.  */
-
-	case at_dot:
-	case no_op:
-	case begline:
-	case endline:
-	case begbuf:
-	case endbuf:
-	case wordbound:
-	case notwordbound:
-	case wordbeg:
-	case wordend:
-	case symbeg:
-	case symend:
-	  continue;
-
-
-	case jump:
-	  EXTRACT_NUMBER_AND_INCR (j, p);
-	  if (j < 0)
-	    /* Backward jumps can only go back to code that we've already
-	       visited.  're_compile' should make sure this is true.  */
-	    break;
-	  p += j;
-	  switch (*p)
-	    {
-	    case on_failure_jump:
-	    case on_failure_keep_string_jump:
-	    case on_failure_jump_loop:
-	    case on_failure_jump_nastyloop:
-	    case on_failure_jump_smart:
-	      p++;
-	      break;
-	    default:
-	      continue;
-	    };
-	  /* Keep P1 to allow the 'on_failure_jump' we are jumping to
-	     to jump back to "just after here".  */
-	  FALLTHROUGH;
-	case on_failure_jump:
-	case on_failure_keep_string_jump:
-	case on_failure_jump_nastyloop:
-	case on_failure_jump_loop:
-	case on_failure_jump_smart:
-	  EXTRACT_NUMBER_AND_INCR (j, p);
-	  if (p + j <= p1)
-	    ; /* Backward jump to be ignored.  */
-	  else
-	    { /* We have to look down both arms.
-		 We first go down the "straight" path so as to minimize
-		 stack usage when going through alternatives.  */
-	      bool r = analyze_first_old (p, pend, fastmap, multibyte);
-	      if (r) return r;
-	      p += j;
-	    }
-	  continue;
-
-
-	case jump_n:
-	  /* This code simply does not properly handle forward jump_n.  */
-	  DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); eassert (j < 0));
-	  p += 4;
-	  /* jump_n can either jump or fall through.  The (backward) jump
-	     case has already been handled, so we only need to look at the
-	     fallthrough case.  */
-	  continue;
-
-	case succeed_n:
-	  /* If N == 0, it should be an on_failure_jump_loop instead.
-	     `j` can be negative because `EXTRACT_NUMBER` extracts a
-	     signed number whereas `succeed_n` treats it as unsigned.  */
-	  DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); eassert (j != 0));
-	  p += 4;
-	  /* We only care about one iteration of the loop, so we don't
-	     need to consider the case where this behaves like an
-	     on_failure_jump.  */
-	  /* FIXME: Sadly, the above is not true when the loop's body
-	     can match the empty string :-(  */
-	  /* continue; */
-	  return true;
-
-	case set_number_at:
-	  p += 4;
-	  continue;
-
-
-	case start_memory:
-	case stop_memory:
-	  p += 1;
-	  continue;
-
-
-	default:
-	  abort (); /* We have listed all the cases.  */
-	} /* switch *p++ */
-
-      /* Getting here means we have found the possible starting
-	 characters for one path of the pattern -- and that the empty
-	 string does not match.  We need not follow this path further.  */
-      return false;
-    } /* while p */
-
-  /* We reached the end without matching anything.  */
-  return true;
-
-} /* analyze_first_old */
-
 struct anafirst_data {
  bool multibyte;
  char *fastmap;
@ -3594,21 +3281,6 @@ analyze_first (struct re_pattern_buffer *bufp,
                                fastmap ? analyze_first_fastmap
                                : analyze_first_null,
                                &data);
-#if ENABLE_CHECKING
-  bool old = !!analyze_first_old (p, pend, fastmap, data.multibyte);
-  if (old && safe)
-    {
-      fprintf (stderr, "ANALYZE_FIRST: New optimization (fastmap=%d)!\n",
-               fastmap ? 1 : 0);
-      print_partial_compiled_pattern (stderr, p, pend);
-    }
-  else if (!old && !safe)
-    {
-      fprintf (stderr, "ANALYZE_FIRST: Lost an optimization (fastmap=%d)!\n",
-               fastmap ? 1 : 0);
-      print_partial_compiled_pattern (stderr, p, pend);
-    }
-#endif
  return !safe;
 }

@ -4056,30 +3728,6 @@ skip_one_char (re_char *p)
 }


-/* Jump over non-matching operations.  */
-static re_char *
-skip_noops (re_char *p, re_char *pend)
-{
-  while (p < pend)
-    {
-      switch (*p)
-	{
-	case start_memory:
-	case stop_memory:
-	  p += 2; break;
-	case no_op:
-	  p += 1; break;
-	case jump:
-	  p = extract_address (p + 1);
-	  break;
-	default:
-	  return p;
-	}
-    }
-  eassert (p == pend);
-  return p;
-}
-
 /* Test if C matches charset op.  *PP points to the charset or charset_not
   opcode.  When the function finishes, *PP will be advanced past that opcode.
   C is character to test (possibly after translations) and CORIG is original
@ -4247,199 +3895,6 @@ mutually_exclusive_charset (struct re_pattern_buffer *bufp, re_char *p1,
  return false;
 }

-/* True if "p1 matches something" implies "p2 fails".  */
-/* We're trying to follow all paths reachable from `p2`, but since some
-   loops can match the empty string, this can loop back to `p2`.
-
-   To avoid inf-looping, we take advantage of the fact that
-   the bytecode we generate is made of syntactically nested loops, more
-   specifically, every loop has a single entry point and single exit point.
-
-   The function takes 2 more arguments (`loop_entry` and `loop_exit`).
-   `loop_entry` points to the sole entry point of the current loop and
-   `loop_exit` points to its sole exit point (when non-NULL).
-
-   Jumps outside of `loop_entry..exit` should not occur.
-   The function can assume that `loop_exit` is "mutually exclusive".
-   The same holds for `loop_entry` except when `p2 == loop_entry`.
-
-   To guarantee termination, recursive calls should make sure that either
-   `loop_entry` is larger, or it's unchanged but `p2` is larger.
-
-   FIXME: This is failsafe (can't return true when it shouldn't)
-   but it could be too conservative if we start generating bytecode
-   with a different shape, so maybe we should bite the bullet and
-   replace done_beg/end with an actual list of positions we've
-   already processed.  */
-static bool
-mutually_exclusive_aux (struct re_pattern_buffer *bufp, re_char *p1,
-		        re_char *p2, re_char *loop_entry, re_char *loop_exit)
-{
-  re_opcode_t op2;
-  unsigned char *pend = bufp->buffer + bufp->used;
-  re_char *p2_orig = p2;
-
-  eassert (p1 >= bufp->buffer && p1 < pend
-	   && p2 >= bufp->buffer && p2 <= pend);
-
-  if (p2 == loop_exit)
-    return true;          /* Presumably already checked elsewhere.  */
-  eassert (loop_entry && p2 >= loop_entry);
-  if (p2 < loop_entry || (loop_exit && p2 > loop_exit))
-    { /* The assumptions about the shape of the code aren't true :-(  */
-#ifdef ENABLE_CHECKING
-      error ("Broken assumption in regex.c:mutually_exclusive_aux");
-#endif
-      return false;
-    }
-
-  /* Skip over open/close-group commands.
-     If what follows this loop is a ...+ construct,
-     look at what begins its body, since we will have to
-     match at least one of that.  */
-  p2 = skip_noops (p2, pend);
-  /* The same skip can be done for p1, except that this function
-     is only used in the case where p1 is a simple match operator.  */
-  /* p1 = skip_noops (p1, pend); */
-
-  eassert (p1 >= bufp->buffer && p1 < pend
-	   && p2 >= bufp->buffer && p2 <= pend);
-
-  op2 = p2 == pend ? succeed : *p2;
-
-  switch (op2)
-    {
-    case succeed:
-    case endbuf:
-      /* If we're at the end of the pattern, we can change.  */
-      if (skip_one_char (p1))
-	{
-	  DEBUG_PRINT ("  End of pattern: fast loop.\n");
-	  return true;
-	}
-      break;
-
-    case endline:
-    case exactn:
-      return mutually_exclusive_exactn (bufp, p1, p2);
-
-    case charset:
-      {
-	if ((re_opcode_t) *p1 == exactn)
-	  return mutually_exclusive_exactn (bufp, p2, p1);
-	else
-	  return mutually_exclusive_charset (bufp, p1, p2);
-      }
-      break;
-
-    case charset_not:
-      switch (*p1)
-	{
-	case exactn:
-	  return mutually_exclusive_exactn (bufp, p2, p1);
-	case charset:
-	  return mutually_exclusive_charset (bufp, p2, p1);
-	case charset_not:
-	  /* When we have two charset_not, it's very unlikely that
-	     they don't overlap.  The union of the two sets of excluded
-	     chars should cover all possible chars, which, as a matter of
-	     fact, is virtually impossible in multibyte buffers.  */
-	  break;
-	}
-      break;
-
-    case wordend:
-      return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
-    case symend:
-      return ((re_opcode_t) *p1 == syntaxspec
-              && (p1[1] == Ssymbol || p1[1] == Sword));
-    case notsyntaxspec:
-      return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);
-
-    case wordbeg:
-      return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
-    case symbeg:
-      return ((re_opcode_t) *p1 == notsyntaxspec
-              && (p1[1] == Ssymbol || p1[1] == Sword));
-    case syntaxspec:
-      return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);
-
-    case wordbound:
-      /* FIXME: This optimization seems correct after the first iteration
-         of the loop, but not for the very first :-(
-         IOW we'd need to pull out the first iteration and do:
-
-            syntaxspec w
-            on_failure_keep_string_jump end
-          loop:
-            syntaxspec w
-            goto loop
-          end:
-            wordbound
-
-      return (((re_opcode_t) *p1 == notsyntaxspec
-               || (re_opcode_t) *p1 == syntaxspec)
-              && p1[1] == Sword);  */
-      return false;
-
-    case categoryspec:
-      return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
-    case notcategoryspec:
-      return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
-
-    case on_failure_jump_nastyloop:
-    case on_failure_jump_smart:
-    case on_failure_jump_loop:
-    case on_failure_keep_string_jump:
-    case on_failure_jump:
-      {
-        int mcnt;
-	p2++;
-	EXTRACT_NUMBER_AND_INCR (mcnt, p2);
-	re_char *p2_other = p2 + mcnt, *tmp;
-	/* For `+` loops, we often have an `on_failure_jump` that skips forward
-	   over a subsequent `jump` for lack of an `on_failure_dont_jump`
-	   kind of thing.  Recognize this pattern since that subsequent
-	   `jump` is the one that jumps to the loop-entry.  */
-	if ((re_opcode_t) p2[0] == jump && mcnt == 3)
-	  p2 = extract_address (p2 + 1);
-
-	/* We have to check that both destinations are safe.
-	   Arrange for `p2` to be the smaller of the two.  */
-	if (p2 > p2_other)
-	  (tmp = p2, p2 = p2_other, p2_other = tmp);
-
-	if (p2_other <= p2_orig /* Both destinations go backward!  */
-	    || !mutually_exclusive_aux (bufp, p1, p2_other,
-	                                loop_entry, loop_exit))
-	  return false;
-
-	/* Now that we know that `p2_other` is a safe (i.e. mutually-exclusive)
-	   position, let's check `p2`.  */
-	if (p2 == loop_entry)
-	  /* If we jump backward to the entry point of the current loop
-	     it means it's a zero-length cycle through that loop, so
-	     this cycle itself does not break mutual-exclusion.  */
-	  return true;
-	else if (p2 > p2_orig)
-	  /* Boring forward jump.  */
-	  return mutually_exclusive_aux (bufp, p1, p2, loop_entry, loop_exit);
-	else if (loop_entry < p2 && p2 < p2_orig)
-	  /* We jump backward to a new loop, nested within the current one.
-	     `p2` is the entry point and `p2_other` the exit of that inner.  */
-	  return mutually_exclusive_aux (bufp, p1, p2, p2, p2_other);
-	else
-	  return false;
-      }
-
-    default:
-      ;
-    }
-
-  /* Safe default.  */
-  return false;
-}
-
 struct mutexcl_data {
  struct re_pattern_buffer *bufp;
  re_char *p1;
@ -4514,28 +3969,14 @@ mutually_exclusive_one (re_char *p2, void *arg)
    }
 }

+/* True if "p1 matches something" implies "p2 fails".  */
+
 static bool
 mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1,
 		      re_char *p2)
 {
  struct mutexcl_data data = { bufp, p1 };
-  bool new = forall_firstchar (bufp, p2, NULL, mutually_exclusive_one, &data);
-#if ENABLE_CHECKING
-  bool old = mutually_exclusive_aux (bufp, p1, p2, bufp->buffer - 1, NULL);
-  if (old && !new)
-    {
-      fprintf (stderr, "MUTUALLY_EXCLUSIVE_P: Lost an optimization between %d and %d!\n",
-               (int)(p1 - bufp->buffer), (int)(p2 - bufp->buffer));
-      print_partial_compiled_pattern (stderr, bufp->buffer, bufp->buffer + bufp->used);
-    }
-  else if (!old && new)
-    {
-      fprintf (stderr, "MUTUALLY_EXCLUSIVE_P: New optimization between %d and %d!\n",
-               (int)(p1 - bufp->buffer), (int)(p2 - bufp->buffer));
-      print_partial_compiled_pattern (stderr, bufp->buffer, bufp->buffer + bufp->used);
-    }
-#endif
-  return new;
+  return forall_firstchar (bufp, p2, NULL, mutually_exclusive_one, &data);
 }

 /* Matching routines.  */