rx: Improve 'or' compositionality (bug#37659)

Perform 'regexp-opt' on nested 'or' forms, and after expansion of user-defined and 'eval' forms. Characters are now turned into strings for wider 'regexp-opt' scope. This preserves the longest-match semantics for string in 'or' forms over composition. * doc/lispref/searching.texi (Rx Constructs): Document. * lisp/emacs-lisp/rx.el (rx--normalise-or-arg) (rx--all-string-or-args): New. (rx--translate-or): Normalise arguments first, and check for strings in subforms. (rx--expand-eval): Extracted from rx--translate-eval. (rx--translate-eval): Call rx--expand-eval. * test/lisp/emacs-lisp/rx-tests.el (rx-or, rx-def-in-or): Add tests. * etc/NEWS: Announce.
2020-02-11 20:04:42 +01:00 · 2020-02-11 20:04:42 +01:00 · 49d3cd90bd
commit 49d3cd90bd
parent 6b48aedb6b
4 changed files with 69 additions and 31 deletions
--- a/doc/lispref/searching.texi
+++ b/doc/lispref/searching.texi
@ -1086,8 +1086,9 @@ Corresponding string regexp: @samp{@var{A}@var{B}@dots{}}
@itemx @code{(| @var{rx}@dots{})}
@cindex @code{|} in rx
 Match exactly one of the @var{rx}s.
-If all arguments are string literals, the longest possible match
+If all arguments are strings, characters, or @code{or} forms
-will always be used.  Otherwise, either the longest match or the
+so constrained, the longest possible match will always be used.
 Otherwise, either the longest match or the
 first (in left-to-right order) will be used.
 Without arguments, the expression will not match anything at all.@*
 Corresponding string regexp: @samp{@var{A}\|@var{B}\|@dots{}}.
--- a/etc/NEWS
+++ b/etc/NEWS
@ -2325,6 +2325,12 @@ expressions from simpler parts.
 +++
 *** 'not' argument can now be a character or single-char string.
 +++
 *** Nested 'or' forms of strings guarantee a longest match.
 For example, (or (or "IN" "OUT") (or "INPUT" "OUTPUT")) now matches
 the whole string "INPUT" if present, not just "IN".  Previously, this
 was only guaranteed inside a single 'or' form of string literals.
 ** Frames
 +++
--- a/lisp/emacs-lisp/rx.el
+++ b/lisp/emacs-lisp/rx.el
@ -254,22 +254,39 @@ Left-fold the list L, starting with X, by the binary function F."
    (setq l (cdr l)))
  x)
 (defun rx--normalise-or-arg (form)
  "Normalise the `or' argument FORM.
 Characters become strings, user-definitions and `eval' forms are expanded,
 and `or' forms are normalised recursively."
  (cond ((characterp form)
         (char-to-string form))
        ((and (consp form) (memq (car form) '(or |)))
         (cons (car form) (mapcar #'rx--normalise-or-arg (cdr form))))
        ((and (consp form) (eq (car form) 'eval))
         (rx--normalise-or-arg (rx--expand-eval (cdr form))))
        (t
         (let ((expanded (rx--expand-def form)))
           (if expanded
               (rx--normalise-or-arg expanded)
             form)))))
 (defun rx--all-string-or-args (body)
  "If BODY only consists of strings or such `or' forms, return all the strings.
 Otherwise throw `rx--nonstring'."
  (mapcan (lambda (form)
            (cond ((stringp form) (list form))
                  ((and (consp form) (memq (car form) '(or |)))
                   (rx--all-string-or-args (cdr form)))
                  (t (throw 'rx--nonstring nil))))
          body))
 (defun rx--translate-or (body)
  "Translate an or-pattern of zero or more rx items.
 Return (REGEXP . PRECEDENCE)."
  ;; FIXME: Possible improvements:
  ;;
  ;; - Turn single characters to strings: (or ?a ?b) -> (or "a" "b"),
  ;;   so that they can be candidates for regexp-opt.
  ;;
  ;; - Translate compile-time strings (`eval' forms), again for regexp-opt.
  ;;
  ;; - Flatten sub-patterns first: (or (or A B) (or C D)) -> (or A B C D)
-  ;;   in order to improve effectiveness of regexp-opt.
+  ;;   Then call regexp-opt on runs of string arguments. Example:
  ;;   This would also help composability.
  ;;
  ;; - Use associativity to run regexp-opt on contiguous subsets of arguments
  ;;   if not all of them are strings.  Example:
  ;;   (or (+ digit) "CHARLIE" "CHAN" (+ blank))
  ;;   -> (or (+ digit) (or "CHARLIE" "CHAN") (+ blank))
  ;;
@ -279,27 +296,26 @@ Return (REGEXP . PRECEDENCE)."
  ;;   so that (or "@" "%" digit (any "A-Z" space) (syntax word))
  ;;        -> (any "@" "%" digit "A-Z" space word)
  ;;        -> "[A-Z@%[:digit:][:space:][:word:]]"
  ;;
  ;; Problem: If a subpattern is carefully written to be
  ;; optimizable by regexp-opt, how do we prevent the transforms
  ;; above from destroying that property?
  ;; Example: (or "a" (or "abc" "abd" "abe"))
  (cond
   ((null body)                    ; No items: a never-matching regexp.
    (rx--empty))
   ((null (cdr body))              ; Single item.
    (rx--translate (car body)))
   ((rx--every #'stringp body)     ; All strings.
    (cons (list (regexp-opt body nil))
          t))
   ((rx--every #'rx--charset-p body)  ; All charsets.
    (rx--translate-union nil body))
   (t
-    (cons (append (car (rx--translate (car body)))
+    (let* ((args (mapcar #'rx--normalise-or-arg body))
-                  (mapcan (lambda (item)
+           (all-strings (catch 'rx--nonstring (rx--all-string-or-args args))))
-                            (cons "\\|" (car (rx--translate item))))
+      (cond
-                          (cdr body)))
+       (all-strings                       ; Only strings.
-          nil))))
+        (cons (list (regexp-opt all-strings nil))
              t))
       ((rx--every #'rx--charset-p args)  ; All charsets.
        (rx--translate-union nil args))
       (t
        (cons (append (car (rx--translate (car args)))
                      (mapcan (lambda (item)
                                (cons "\\|" (car (rx--translate item))))
                              (cdr args)))
              nil)))))))
 (defun rx--charset-p (form)
  "Whether FORM looks like a charset, only consisting of character intervals
@ -840,11 +856,15 @@ Return (REGEXP . PRECEDENCE)."
           (cons (list (list 'regexp-quote arg)) 'seq))
          (t (error "rx `literal' form with non-string argument")))))
-(defun rx--translate-eval (body)
+(defun rx--expand-eval (body)
-  "Translate the `eval' form.  Return (REGEXP . PRECEDENCE)."
+  "Expand `eval' arguments.  Return a new rx form."
  (unless (and body (null (cdr body)))
    (error "rx `eval' form takes exactly one argument"))
-  (rx--translate (eval (car body))))
+  (eval (car body)))
 (defun rx--translate-eval (body)
  "Translate the `eval' form.  Return (REGEXP . PRECEDENCE)."
  (rx--translate (rx--expand-eval body)))
 (defvar rx--regexp-atomic-regexp nil)
--- a/test/lisp/emacs-lisp/rx-tests.el
+++ b/test/lisp/emacs-lisp/rx-tests.el
@ -42,13 +42,24 @@
 (ert-deftest rx-or ()
  (should (equal (rx (or "ab" (| "c" nonl) "de"))
                 "ab\\|c\\|.\\|de"))
-  (should (equal (rx (or "ab" "abc" "a"))
+  (should (equal (rx (or "ab" "abc" ?a))
                 "\\(?:a\\(?:bc?\\)?\\)"))
  (should (equal (rx (or "ab" (| (or "abcd" "abcde")) (or "a" "abc")))
                 "\\(?:a\\(?:b\\(?:c\\(?:de?\\)?\\)?\\)?\\)"))
  (should (equal (rx (or "a" (eval (string ?a ?b))))
                 "\\(?:ab?\\)"))
  (should (equal (rx (| nonl "a") (| "b" blank))
                 "\\(?:.\\|a\\)\\(?:b\\|[[:blank:]]\\)"))
  (should (equal (rx (|))
                 "\\`a\\`")))
 (ert-deftest rx-def-in-or ()
  (rx-let ((a b)
           (b (or "abc" c))
           (c ?a))
    (should (equal (rx (or a (| "ab" "abcde") "abcd"))
                   "\\(?:a\\(?:b\\(?:c\\(?:de?\\)?\\)?\\)?\\)"))))
 (ert-deftest rx-char-any ()
  "Test character alternatives with `]' and `-' (Bug#25123)."
  (should (equal