rx: Improve 'or' compositionality (bug#37659)
Perform 'regexp-opt' on nested 'or' forms, and after expansion of user-defined and 'eval' forms. Characters are now turned into strings for wider 'regexp-opt' scope. This preserves the longest-match semantics for string in 'or' forms over composition. * doc/lispref/searching.texi (Rx Constructs): Document. * lisp/emacs-lisp/rx.el (rx--normalise-or-arg) (rx--all-string-or-args): New. (rx--translate-or): Normalise arguments first, and check for strings in subforms. (rx--expand-eval): Extracted from rx--translate-eval. (rx--translate-eval): Call rx--expand-eval. * test/lisp/emacs-lisp/rx-tests.el (rx-or, rx-def-in-or): Add tests. * etc/NEWS: Announce.
This commit is contained in:
parent
6b48aedb6b
commit
49d3cd90bd
4 changed files with 69 additions and 31 deletions
|
@ -1086,8 +1086,9 @@ Corresponding string regexp: @samp{@var{A}@var{B}@dots{}}
|
||||||
@itemx @code{(| @var{rx}@dots{})}
|
@itemx @code{(| @var{rx}@dots{})}
|
||||||
@cindex @code{|} in rx
|
@cindex @code{|} in rx
|
||||||
Match exactly one of the @var{rx}s.
|
Match exactly one of the @var{rx}s.
|
||||||
If all arguments are string literals, the longest possible match
|
If all arguments are strings, characters, or @code{or} forms
|
||||||
will always be used. Otherwise, either the longest match or the
|
so constrained, the longest possible match will always be used.
|
||||||
|
Otherwise, either the longest match or the
|
||||||
first (in left-to-right order) will be used.
|
first (in left-to-right order) will be used.
|
||||||
Without arguments, the expression will not match anything at all.@*
|
Without arguments, the expression will not match anything at all.@*
|
||||||
Corresponding string regexp: @samp{@var{A}\|@var{B}\|@dots{}}.
|
Corresponding string regexp: @samp{@var{A}\|@var{B}\|@dots{}}.
|
||||||
|
|
6
etc/NEWS
6
etc/NEWS
|
@ -2325,6 +2325,12 @@ expressions from simpler parts.
|
||||||
+++
|
+++
|
||||||
*** 'not' argument can now be a character or single-char string.
|
*** 'not' argument can now be a character or single-char string.
|
||||||
|
|
||||||
|
+++
|
||||||
|
*** Nested 'or' forms of strings guarantee a longest match.
|
||||||
|
For example, (or (or "IN" "OUT") (or "INPUT" "OUTPUT")) now matches
|
||||||
|
the whole string "INPUT" if present, not just "IN". Previously, this
|
||||||
|
was only guaranteed inside a single 'or' form of string literals.
|
||||||
|
|
||||||
** Frames
|
** Frames
|
||||||
|
|
||||||
+++
|
+++
|
||||||
|
|
|
@ -254,22 +254,39 @@ Left-fold the list L, starting with X, by the binary function F."
|
||||||
(setq l (cdr l)))
|
(setq l (cdr l)))
|
||||||
x)
|
x)
|
||||||
|
|
||||||
|
(defun rx--normalise-or-arg (form)
|
||||||
|
"Normalise the `or' argument FORM.
|
||||||
|
Characters become strings, user-definitions and `eval' forms are expanded,
|
||||||
|
and `or' forms are normalised recursively."
|
||||||
|
(cond ((characterp form)
|
||||||
|
(char-to-string form))
|
||||||
|
((and (consp form) (memq (car form) '(or |)))
|
||||||
|
(cons (car form) (mapcar #'rx--normalise-or-arg (cdr form))))
|
||||||
|
((and (consp form) (eq (car form) 'eval))
|
||||||
|
(rx--normalise-or-arg (rx--expand-eval (cdr form))))
|
||||||
|
(t
|
||||||
|
(let ((expanded (rx--expand-def form)))
|
||||||
|
(if expanded
|
||||||
|
(rx--normalise-or-arg expanded)
|
||||||
|
form)))))
|
||||||
|
|
||||||
|
(defun rx--all-string-or-args (body)
|
||||||
|
"If BODY only consists of strings or such `or' forms, return all the strings.
|
||||||
|
Otherwise throw `rx--nonstring'."
|
||||||
|
(mapcan (lambda (form)
|
||||||
|
(cond ((stringp form) (list form))
|
||||||
|
((and (consp form) (memq (car form) '(or |)))
|
||||||
|
(rx--all-string-or-args (cdr form)))
|
||||||
|
(t (throw 'rx--nonstring nil))))
|
||||||
|
body))
|
||||||
|
|
||||||
(defun rx--translate-or (body)
|
(defun rx--translate-or (body)
|
||||||
"Translate an or-pattern of zero or more rx items.
|
"Translate an or-pattern of zero or more rx items.
|
||||||
Return (REGEXP . PRECEDENCE)."
|
Return (REGEXP . PRECEDENCE)."
|
||||||
;; FIXME: Possible improvements:
|
;; FIXME: Possible improvements:
|
||||||
;;
|
;;
|
||||||
;; - Turn single characters to strings: (or ?a ?b) -> (or "a" "b"),
|
|
||||||
;; so that they can be candidates for regexp-opt.
|
|
||||||
;;
|
|
||||||
;; - Translate compile-time strings (`eval' forms), again for regexp-opt.
|
|
||||||
;;
|
|
||||||
;; - Flatten sub-patterns first: (or (or A B) (or C D)) -> (or A B C D)
|
;; - Flatten sub-patterns first: (or (or A B) (or C D)) -> (or A B C D)
|
||||||
;; in order to improve effectiveness of regexp-opt.
|
;; Then call regexp-opt on runs of string arguments. Example:
|
||||||
;; This would also help composability.
|
|
||||||
;;
|
|
||||||
;; - Use associativity to run regexp-opt on contiguous subsets of arguments
|
|
||||||
;; if not all of them are strings. Example:
|
|
||||||
;; (or (+ digit) "CHARLIE" "CHAN" (+ blank))
|
;; (or (+ digit) "CHARLIE" "CHAN" (+ blank))
|
||||||
;; -> (or (+ digit) (or "CHARLIE" "CHAN") (+ blank))
|
;; -> (or (+ digit) (or "CHARLIE" "CHAN") (+ blank))
|
||||||
;;
|
;;
|
||||||
|
@ -279,27 +296,26 @@ Return (REGEXP . PRECEDENCE)."
|
||||||
;; so that (or "@" "%" digit (any "A-Z" space) (syntax word))
|
;; so that (or "@" "%" digit (any "A-Z" space) (syntax word))
|
||||||
;; -> (any "@" "%" digit "A-Z" space word)
|
;; -> (any "@" "%" digit "A-Z" space word)
|
||||||
;; -> "[A-Z@%[:digit:][:space:][:word:]]"
|
;; -> "[A-Z@%[:digit:][:space:][:word:]]"
|
||||||
;;
|
|
||||||
;; Problem: If a subpattern is carefully written to be
|
|
||||||
;; optimizable by regexp-opt, how do we prevent the transforms
|
|
||||||
;; above from destroying that property?
|
|
||||||
;; Example: (or "a" (or "abc" "abd" "abe"))
|
|
||||||
(cond
|
(cond
|
||||||
((null body) ; No items: a never-matching regexp.
|
((null body) ; No items: a never-matching regexp.
|
||||||
(rx--empty))
|
(rx--empty))
|
||||||
((null (cdr body)) ; Single item.
|
((null (cdr body)) ; Single item.
|
||||||
(rx--translate (car body)))
|
(rx--translate (car body)))
|
||||||
((rx--every #'stringp body) ; All strings.
|
|
||||||
(cons (list (regexp-opt body nil))
|
|
||||||
t))
|
|
||||||
((rx--every #'rx--charset-p body) ; All charsets.
|
|
||||||
(rx--translate-union nil body))
|
|
||||||
(t
|
(t
|
||||||
(cons (append (car (rx--translate (car body)))
|
(let* ((args (mapcar #'rx--normalise-or-arg body))
|
||||||
(mapcan (lambda (item)
|
(all-strings (catch 'rx--nonstring (rx--all-string-or-args args))))
|
||||||
(cons "\\|" (car (rx--translate item))))
|
(cond
|
||||||
(cdr body)))
|
(all-strings ; Only strings.
|
||||||
nil))))
|
(cons (list (regexp-opt all-strings nil))
|
||||||
|
t))
|
||||||
|
((rx--every #'rx--charset-p args) ; All charsets.
|
||||||
|
(rx--translate-union nil args))
|
||||||
|
(t
|
||||||
|
(cons (append (car (rx--translate (car args)))
|
||||||
|
(mapcan (lambda (item)
|
||||||
|
(cons "\\|" (car (rx--translate item))))
|
||||||
|
(cdr args)))
|
||||||
|
nil)))))))
|
||||||
|
|
||||||
(defun rx--charset-p (form)
|
(defun rx--charset-p (form)
|
||||||
"Whether FORM looks like a charset, only consisting of character intervals
|
"Whether FORM looks like a charset, only consisting of character intervals
|
||||||
|
@ -840,11 +856,15 @@ Return (REGEXP . PRECEDENCE)."
|
||||||
(cons (list (list 'regexp-quote arg)) 'seq))
|
(cons (list (list 'regexp-quote arg)) 'seq))
|
||||||
(t (error "rx `literal' form with non-string argument")))))
|
(t (error "rx `literal' form with non-string argument")))))
|
||||||
|
|
||||||
(defun rx--translate-eval (body)
|
(defun rx--expand-eval (body)
|
||||||
"Translate the `eval' form. Return (REGEXP . PRECEDENCE)."
|
"Expand `eval' arguments. Return a new rx form."
|
||||||
(unless (and body (null (cdr body)))
|
(unless (and body (null (cdr body)))
|
||||||
(error "rx `eval' form takes exactly one argument"))
|
(error "rx `eval' form takes exactly one argument"))
|
||||||
(rx--translate (eval (car body))))
|
(eval (car body)))
|
||||||
|
|
||||||
|
(defun rx--translate-eval (body)
|
||||||
|
"Translate the `eval' form. Return (REGEXP . PRECEDENCE)."
|
||||||
|
(rx--translate (rx--expand-eval body)))
|
||||||
|
|
||||||
(defvar rx--regexp-atomic-regexp nil)
|
(defvar rx--regexp-atomic-regexp nil)
|
||||||
|
|
||||||
|
|
|
@ -42,13 +42,24 @@
|
||||||
(ert-deftest rx-or ()
|
(ert-deftest rx-or ()
|
||||||
(should (equal (rx (or "ab" (| "c" nonl) "de"))
|
(should (equal (rx (or "ab" (| "c" nonl) "de"))
|
||||||
"ab\\|c\\|.\\|de"))
|
"ab\\|c\\|.\\|de"))
|
||||||
(should (equal (rx (or "ab" "abc" "a"))
|
(should (equal (rx (or "ab" "abc" ?a))
|
||||||
"\\(?:a\\(?:bc?\\)?\\)"))
|
"\\(?:a\\(?:bc?\\)?\\)"))
|
||||||
|
(should (equal (rx (or "ab" (| (or "abcd" "abcde")) (or "a" "abc")))
|
||||||
|
"\\(?:a\\(?:b\\(?:c\\(?:de?\\)?\\)?\\)?\\)"))
|
||||||
|
(should (equal (rx (or "a" (eval (string ?a ?b))))
|
||||||
|
"\\(?:ab?\\)"))
|
||||||
(should (equal (rx (| nonl "a") (| "b" blank))
|
(should (equal (rx (| nonl "a") (| "b" blank))
|
||||||
"\\(?:.\\|a\\)\\(?:b\\|[[:blank:]]\\)"))
|
"\\(?:.\\|a\\)\\(?:b\\|[[:blank:]]\\)"))
|
||||||
(should (equal (rx (|))
|
(should (equal (rx (|))
|
||||||
"\\`a\\`")))
|
"\\`a\\`")))
|
||||||
|
|
||||||
|
(ert-deftest rx-def-in-or ()
|
||||||
|
(rx-let ((a b)
|
||||||
|
(b (or "abc" c))
|
||||||
|
(c ?a))
|
||||||
|
(should (equal (rx (or a (| "ab" "abcde") "abcd"))
|
||||||
|
"\\(?:a\\(?:b\\(?:c\\(?:de?\\)?\\)?\\)?\\)"))))
|
||||||
|
|
||||||
(ert-deftest rx-char-any ()
|
(ert-deftest rx-char-any ()
|
||||||
"Test character alternatives with `]' and `-' (Bug#25123)."
|
"Test character alternatives with `]' and `-' (Bug#25123)."
|
||||||
(should (equal
|
(should (equal
|
||||||
|
|
Loading…
Add table
Reference in a new issue