rx: fix `or' ordering by adding argument to regexp-opt
The rx `or' form may reorder its arguments in an unpredictable way, contrary to user expectation, since it sometimes uses `regexp-opt'. Add a NOREORDER option to `regexp-opt' for preventing it from producing a reordered regexp (Bug#34641). * doc/lispref/searching.texi (Regular Expression Functions): * etc/NEWS (Lisp Changes in Emacs 27.1): Describe the new regexp-opt NOREORDER argument. * lisp/emacs-lisp/regexp-opt.el (regexp-opt): Add NOREORDER. Make no attempt at regexp improvement if the set of strings contains a prefix of another string. (regexp-opt--contains-prefix): New. * lisp/emacs-lisp/rx.el (rx-or): Call regexp-opt with NOREORDER. * test/lisp/emacs-lisp/rx-tests.el: Test rx `or' form match order.
This commit is contained in:
parent
dbffbe0881
commit
da758046da
5 changed files with 65 additions and 8 deletions
|
@ -950,7 +950,7 @@ whitespace:
|
|||
@end defun
|
||||
|
||||
@cindex optimize regexp
|
||||
@defun regexp-opt strings &optional paren
|
||||
@defun regexp-opt strings &optional paren noreorder
|
||||
This function returns an efficient regular expression that will match
|
||||
any of the strings in the list @var{strings}. This is useful when you
|
||||
need to make matching or searching as fast as possible---for example,
|
||||
|
@ -985,8 +985,15 @@ if it is necessary to ensure that a postfix operator appended to
|
|||
it will apply to the whole expression.
|
||||
@end table
|
||||
|
||||
The resulting regexp of @code{regexp-opt} is equivalent to but usually
|
||||
more efficient than that of a simplified version:
|
||||
The optional argument @var{noreorder}, if @code{nil} or omitted,
|
||||
allows the returned regexp to match the strings in any order. If
|
||||
non-@code{nil}, the match is guaranteed to be performed in the order
|
||||
given, as if the strings were made into a regexp by joining them with
|
||||
the @samp{\|} operator.
|
||||
|
||||
Up to reordering, the resulting regexp of @code{regexp-opt} is
|
||||
equivalent to but usually more efficient than that of a simplified
|
||||
version:
|
||||
|
||||
@example
|
||||
(defun simplified-regexp-opt (strings &optional paren)
|
||||
|
|
7
etc/NEWS
7
etc/NEWS
|
@ -1642,6 +1642,13 @@ MS-Windows.
|
|||
** New module environment function 'process_input' to process user
|
||||
input while module code is running.
|
||||
|
||||
+++
|
||||
** The function 'regexp-opt' accepts an additional optional argument.
|
||||
By default, the regexp returned by 'regexp-opt' may match the strings
|
||||
in any order. If the new third argument is non-nil, the match is
|
||||
guaranteed to be performed in the order given, as if the strings were
|
||||
made into a regexp by joining them with '\|'.
|
||||
|
||||
|
||||
* Changes in Emacs 27.1 on Non-Free Operating Systems
|
||||
|
||||
|
|
|
@ -84,7 +84,7 @@
|
|||
;;; Code:
|
||||
|
||||
;;;###autoload
|
||||
(defun regexp-opt (strings &optional paren)
|
||||
(defun regexp-opt (strings &optional paren noreorder)
|
||||
"Return a regexp to match a string in the list STRINGS.
|
||||
Each string should be unique in STRINGS and should not contain
|
||||
any regexps, quoted or not. Optional PAREN specifies how the
|
||||
|
@ -111,8 +111,14 @@ nil
|
|||
necessary to ensure that a postfix operator appended to it will
|
||||
apply to the whole expression.
|
||||
|
||||
The resulting regexp is equivalent to but usually more efficient
|
||||
than that of a simplified version:
|
||||
The optional argument NOREORDER, if nil or omitted, allows the
|
||||
returned regexp to match the strings in any order. If non-nil,
|
||||
the match is guaranteed to be performed in the order given, as if
|
||||
the strings were made into a regexp by joining them with the
|
||||
`\\|' operator.
|
||||
|
||||
Up to reordering, the resulting regexp is equivalent to but
|
||||
usually more efficient than that of a simplified version:
|
||||
|
||||
(defun simplified-regexp-opt (strings &optional paren)
|
||||
(let ((parens
|
||||
|
@ -133,7 +139,15 @@ than that of a simplified version:
|
|||
(open (cond ((stringp paren) paren) (paren "\\(")))
|
||||
(sorted-strings (delete-dups
|
||||
(sort (copy-sequence strings) 'string-lessp)))
|
||||
(re (regexp-opt-group sorted-strings (or open t) (not open))))
|
||||
(re
|
||||
;; If NOREORDER is non-nil and the list contains a prefix
|
||||
;; of another string, we give up all attempts at optimisation.
|
||||
;; There is plenty of room for improvement (Bug#34641).
|
||||
(if (and noreorder (regexp-opt--contains-prefix sorted-strings))
|
||||
(concat (or open "\\(?:")
|
||||
(mapconcat #'regexp-quote strings "\\|")
|
||||
"\\)")
|
||||
(regexp-opt-group sorted-strings (or open t) (not open)))))
|
||||
(cond ((eq paren 'words)
|
||||
(concat "\\<" re "\\>"))
|
||||
((eq paren 'symbols)
|
||||
|
@ -313,6 +327,22 @@ CHARS should be a list of characters."
|
|||
(concat "[" dash caret "]"))
|
||||
(concat "[" bracket charset caret dash "]"))))
|
||||
|
||||
|
||||
(defun regexp-opt--contains-prefix (strings)
|
||||
"Whether STRINGS contains a proper prefix of one of its other elements.
|
||||
STRINGS must be a list of sorted strings without duplicates."
|
||||
(let ((s strings))
|
||||
;; In a lexicographically sorted list, a string always immediately
|
||||
;; succeeds one of its prefixes.
|
||||
(while (and (cdr s)
|
||||
(not (string-equal
|
||||
(car s)
|
||||
(substring (cadr s) 0 (min (length (car s))
|
||||
(length (cadr s)))))))
|
||||
(setq s (cdr s)))
|
||||
(cdr s)))
|
||||
|
||||
|
||||
(provide 'regexp-opt)
|
||||
|
||||
;;; regexp-opt.el ends here
|
||||
|
|
|
@ -393,7 +393,7 @@ FORM is of the form `(and FORM1 ...)'."
|
|||
(rx-group-if
|
||||
(if (memq nil (mapcar 'stringp (cdr form)))
|
||||
(mapconcat (lambda (x) (rx-form x '|)) (cdr form) "\\|")
|
||||
(regexp-opt (cdr form)))
|
||||
(regexp-opt (cdr form) nil t))
|
||||
(and (memq rx-parent '(: * t)) rx-parent)))
|
||||
|
||||
|
||||
|
|
|
@ -92,5 +92,18 @@
|
|||
(*? "e") (+? "f") (\?? "g") (?? "h"))))
|
||||
"a*b+c?d?e*?f+?g??h??")))
|
||||
|
||||
(ert-deftest rx-or ()
|
||||
;; Test or-pattern reordering (Bug#34641).
|
||||
(let ((s "abc"))
|
||||
(should (equal (and (string-match (rx (or "abc" "ab" "a")) s)
|
||||
(match-string 0 s))
|
||||
"abc"))
|
||||
(should (equal (and (string-match (rx (or "ab" "abc" "a")) s)
|
||||
(match-string 0 s))
|
||||
"ab"))
|
||||
(should (equal (and (string-match (rx (or "a" "ab" "abc")) s)
|
||||
(match-string 0 s))
|
||||
"a"))))
|
||||
|
||||
(provide 'rx-tests)
|
||||
;; rx-tests.el ends here.
|
||||
|
|
Loading…
Add table
Reference in a new issue