rx: Improve 'or' compositionality (bug#37659)

Perform 'regexp-opt' on nested 'or' forms, and after expansion of
user-defined and 'eval' forms.  Characters are now turned into strings
for wider 'regexp-opt' scope.  This preserves the longest-match
semantics for string in 'or' forms over composition.

* doc/lispref/searching.texi (Rx Constructs): Document.
* lisp/emacs-lisp/rx.el (rx--normalise-or-arg)
(rx--all-string-or-args): New.
(rx--translate-or): Normalise arguments first, and check for strings
in subforms.
(rx--expand-eval): Extracted from rx--translate-eval.
(rx--translate-eval): Call rx--expand-eval.
* test/lisp/emacs-lisp/rx-tests.el (rx-or, rx-def-in-or): Add tests.
* etc/NEWS: Announce.
This commit is contained in:
Mattias Engdegård 2020-02-11 20:04:42 +01:00
parent 6b48aedb6b
commit 49d3cd90bd
4 changed files with 69 additions and 31 deletions

View file

@ -1086,8 +1086,9 @@ Corresponding string regexp: @samp{@var{A}@var{B}@dots{}}
@itemx @code{(| @var{rx}@dots{})} @itemx @code{(| @var{rx}@dots{})}
@cindex @code{|} in rx @cindex @code{|} in rx
Match exactly one of the @var{rx}s. Match exactly one of the @var{rx}s.
If all arguments are string literals, the longest possible match If all arguments are strings, characters, or @code{or} forms
will always be used. Otherwise, either the longest match or the so constrained, the longest possible match will always be used.
Otherwise, either the longest match or the
first (in left-to-right order) will be used. first (in left-to-right order) will be used.
Without arguments, the expression will not match anything at all.@* Without arguments, the expression will not match anything at all.@*
Corresponding string regexp: @samp{@var{A}\|@var{B}\|@dots{}}. Corresponding string regexp: @samp{@var{A}\|@var{B}\|@dots{}}.

View file

@ -2325,6 +2325,12 @@ expressions from simpler parts.
+++ +++
*** 'not' argument can now be a character or single-char string. *** 'not' argument can now be a character or single-char string.
+++
*** Nested 'or' forms of strings guarantee a longest match.
For example, (or (or "IN" "OUT") (or "INPUT" "OUTPUT")) now matches
the whole string "INPUT" if present, not just "IN". Previously, this
was only guaranteed inside a single 'or' form of string literals.
** Frames ** Frames
+++ +++

View file

@ -254,22 +254,39 @@ Left-fold the list L, starting with X, by the binary function F."
(setq l (cdr l))) (setq l (cdr l)))
x) x)
(defun rx--normalise-or-arg (form)
"Normalise the `or' argument FORM.
Characters become strings, user-definitions and `eval' forms are expanded,
and `or' forms are normalised recursively."
(cond ((characterp form)
(char-to-string form))
((and (consp form) (memq (car form) '(or |)))
(cons (car form) (mapcar #'rx--normalise-or-arg (cdr form))))
((and (consp form) (eq (car form) 'eval))
(rx--normalise-or-arg (rx--expand-eval (cdr form))))
(t
(let ((expanded (rx--expand-def form)))
(if expanded
(rx--normalise-or-arg expanded)
form)))))
(defun rx--all-string-or-args (body)
"If BODY only consists of strings or such `or' forms, return all the strings.
Otherwise throw `rx--nonstring'."
(mapcan (lambda (form)
(cond ((stringp form) (list form))
((and (consp form) (memq (car form) '(or |)))
(rx--all-string-or-args (cdr form)))
(t (throw 'rx--nonstring nil))))
body))
(defun rx--translate-or (body) (defun rx--translate-or (body)
"Translate an or-pattern of zero or more rx items. "Translate an or-pattern of zero or more rx items.
Return (REGEXP . PRECEDENCE)." Return (REGEXP . PRECEDENCE)."
;; FIXME: Possible improvements: ;; FIXME: Possible improvements:
;; ;;
;; - Turn single characters to strings: (or ?a ?b) -> (or "a" "b"),
;; so that they can be candidates for regexp-opt.
;;
;; - Translate compile-time strings (`eval' forms), again for regexp-opt.
;;
;; - Flatten sub-patterns first: (or (or A B) (or C D)) -> (or A B C D) ;; - Flatten sub-patterns first: (or (or A B) (or C D)) -> (or A B C D)
;; in order to improve effectiveness of regexp-opt. ;; Then call regexp-opt on runs of string arguments. Example:
;; This would also help composability.
;;
;; - Use associativity to run regexp-opt on contiguous subsets of arguments
;; if not all of them are strings. Example:
;; (or (+ digit) "CHARLIE" "CHAN" (+ blank)) ;; (or (+ digit) "CHARLIE" "CHAN" (+ blank))
;; -> (or (+ digit) (or "CHARLIE" "CHAN") (+ blank)) ;; -> (or (+ digit) (or "CHARLIE" "CHAN") (+ blank))
;; ;;
@ -279,27 +296,26 @@ Return (REGEXP . PRECEDENCE)."
;; so that (or "@" "%" digit (any "A-Z" space) (syntax word)) ;; so that (or "@" "%" digit (any "A-Z" space) (syntax word))
;; -> (any "@" "%" digit "A-Z" space word) ;; -> (any "@" "%" digit "A-Z" space word)
;; -> "[A-Z@%[:digit:][:space:][:word:]]" ;; -> "[A-Z@%[:digit:][:space:][:word:]]"
;;
;; Problem: If a subpattern is carefully written to be
;; optimizable by regexp-opt, how do we prevent the transforms
;; above from destroying that property?
;; Example: (or "a" (or "abc" "abd" "abe"))
(cond (cond
((null body) ; No items: a never-matching regexp. ((null body) ; No items: a never-matching regexp.
(rx--empty)) (rx--empty))
((null (cdr body)) ; Single item. ((null (cdr body)) ; Single item.
(rx--translate (car body))) (rx--translate (car body)))
((rx--every #'stringp body) ; All strings.
(cons (list (regexp-opt body nil))
t))
((rx--every #'rx--charset-p body) ; All charsets.
(rx--translate-union nil body))
(t (t
(cons (append (car (rx--translate (car body))) (let* ((args (mapcar #'rx--normalise-or-arg body))
(mapcan (lambda (item) (all-strings (catch 'rx--nonstring (rx--all-string-or-args args))))
(cons "\\|" (car (rx--translate item)))) (cond
(cdr body))) (all-strings ; Only strings.
nil)))) (cons (list (regexp-opt all-strings nil))
t))
((rx--every #'rx--charset-p args) ; All charsets.
(rx--translate-union nil args))
(t
(cons (append (car (rx--translate (car args)))
(mapcan (lambda (item)
(cons "\\|" (car (rx--translate item))))
(cdr args)))
nil)))))))
(defun rx--charset-p (form) (defun rx--charset-p (form)
"Whether FORM looks like a charset, only consisting of character intervals "Whether FORM looks like a charset, only consisting of character intervals
@ -840,11 +856,15 @@ Return (REGEXP . PRECEDENCE)."
(cons (list (list 'regexp-quote arg)) 'seq)) (cons (list (list 'regexp-quote arg)) 'seq))
(t (error "rx `literal' form with non-string argument"))))) (t (error "rx `literal' form with non-string argument")))))
(defun rx--translate-eval (body) (defun rx--expand-eval (body)
"Translate the `eval' form. Return (REGEXP . PRECEDENCE)." "Expand `eval' arguments. Return a new rx form."
(unless (and body (null (cdr body))) (unless (and body (null (cdr body)))
(error "rx `eval' form takes exactly one argument")) (error "rx `eval' form takes exactly one argument"))
(rx--translate (eval (car body)))) (eval (car body)))
(defun rx--translate-eval (body)
"Translate the `eval' form. Return (REGEXP . PRECEDENCE)."
(rx--translate (rx--expand-eval body)))
(defvar rx--regexp-atomic-regexp nil) (defvar rx--regexp-atomic-regexp nil)

View file

@ -42,13 +42,24 @@
(ert-deftest rx-or () (ert-deftest rx-or ()
(should (equal (rx (or "ab" (| "c" nonl) "de")) (should (equal (rx (or "ab" (| "c" nonl) "de"))
"ab\\|c\\|.\\|de")) "ab\\|c\\|.\\|de"))
(should (equal (rx (or "ab" "abc" "a")) (should (equal (rx (or "ab" "abc" ?a))
"\\(?:a\\(?:bc?\\)?\\)")) "\\(?:a\\(?:bc?\\)?\\)"))
(should (equal (rx (or "ab" (| (or "abcd" "abcde")) (or "a" "abc")))
"\\(?:a\\(?:b\\(?:c\\(?:de?\\)?\\)?\\)?\\)"))
(should (equal (rx (or "a" (eval (string ?a ?b))))
"\\(?:ab?\\)"))
(should (equal (rx (| nonl "a") (| "b" blank)) (should (equal (rx (| nonl "a") (| "b" blank))
"\\(?:.\\|a\\)\\(?:b\\|[[:blank:]]\\)")) "\\(?:.\\|a\\)\\(?:b\\|[[:blank:]]\\)"))
(should (equal (rx (|)) (should (equal (rx (|))
"\\`a\\`"))) "\\`a\\`")))
(ert-deftest rx-def-in-or ()
(rx-let ((a b)
(b (or "abc" c))
(c ?a))
(should (equal (rx (or a (| "ab" "abcde") "abcd"))
"\\(?:a\\(?:b\\(?:c\\(?:de?\\)?\\)?\\)?\\)"))))
(ert-deftest rx-char-any () (ert-deftest rx-char-any ()
"Test character alternatives with `]' and `-' (Bug#25123)." "Test character alternatives with `]' and `-' (Bug#25123)."
(should (equal (should (equal