rx: Improve 'or' compositionality (bug#37659)

Perform 'regexp-opt' on nested 'or' forms, and after expansion of
user-defined and 'eval' forms.  Characters are now turned into strings
for wider 'regexp-opt' scope.  This preserves the longest-match
semantics for string in 'or' forms over composition.

* doc/lispref/searching.texi (Rx Constructs): Document.
* lisp/emacs-lisp/rx.el (rx--normalise-or-arg)
(rx--all-string-or-args): New.
(rx--translate-or): Normalise arguments first, and check for strings
in subforms.
(rx--expand-eval): Extracted from rx--translate-eval.
(rx--translate-eval): Call rx--expand-eval.
* test/lisp/emacs-lisp/rx-tests.el (rx-or, rx-def-in-or): Add tests.
* etc/NEWS: Announce.
This commit is contained in:
Mattias Engdegård 2020-02-11 20:04:42 +01:00
parent 6b48aedb6b
commit 49d3cd90bd
4 changed files with 69 additions and 31 deletions

View file

@ -254,22 +254,39 @@ Left-fold the list L, starting with X, by the binary function F."
(setq l (cdr l)))
x)
(defun rx--normalise-or-arg (form)
"Normalise the `or' argument FORM.
Characters become strings, user-definitions and `eval' forms are expanded,
and `or' forms are normalised recursively."
(cond ((characterp form)
(char-to-string form))
((and (consp form) (memq (car form) '(or |)))
(cons (car form) (mapcar #'rx--normalise-or-arg (cdr form))))
((and (consp form) (eq (car form) 'eval))
(rx--normalise-or-arg (rx--expand-eval (cdr form))))
(t
(let ((expanded (rx--expand-def form)))
(if expanded
(rx--normalise-or-arg expanded)
form)))))
(defun rx--all-string-or-args (body)
"If BODY only consists of strings or such `or' forms, return all the strings.
Otherwise throw `rx--nonstring'."
(mapcan (lambda (form)
(cond ((stringp form) (list form))
((and (consp form) (memq (car form) '(or |)))
(rx--all-string-or-args (cdr form)))
(t (throw 'rx--nonstring nil))))
body))
(defun rx--translate-or (body)
"Translate an or-pattern of zero or more rx items.
Return (REGEXP . PRECEDENCE)."
;; FIXME: Possible improvements:
;;
;; - Turn single characters to strings: (or ?a ?b) -> (or "a" "b"),
;; so that they can be candidates for regexp-opt.
;;
;; - Translate compile-time strings (`eval' forms), again for regexp-opt.
;;
;; - Flatten sub-patterns first: (or (or A B) (or C D)) -> (or A B C D)
;; in order to improve effectiveness of regexp-opt.
;; This would also help composability.
;;
;; - Use associativity to run regexp-opt on contiguous subsets of arguments
;; if not all of them are strings. Example:
;; Then call regexp-opt on runs of string arguments. Example:
;; (or (+ digit) "CHARLIE" "CHAN" (+ blank))
;; -> (or (+ digit) (or "CHARLIE" "CHAN") (+ blank))
;;
@ -279,27 +296,26 @@ Return (REGEXP . PRECEDENCE)."
;; so that (or "@" "%" digit (any "A-Z" space) (syntax word))
;; -> (any "@" "%" digit "A-Z" space word)
;; -> "[A-Z@%[:digit:][:space:][:word:]]"
;;
;; Problem: If a subpattern is carefully written to be
;; optimizable by regexp-opt, how do we prevent the transforms
;; above from destroying that property?
;; Example: (or "a" (or "abc" "abd" "abe"))
(cond
((null body) ; No items: a never-matching regexp.
(rx--empty))
((null (cdr body)) ; Single item.
(rx--translate (car body)))
((rx--every #'stringp body) ; All strings.
(cons (list (regexp-opt body nil))
t))
((rx--every #'rx--charset-p body) ; All charsets.
(rx--translate-union nil body))
(t
(cons (append (car (rx--translate (car body)))
(mapcan (lambda (item)
(cons "\\|" (car (rx--translate item))))
(cdr body)))
nil))))
(let* ((args (mapcar #'rx--normalise-or-arg body))
(all-strings (catch 'rx--nonstring (rx--all-string-or-args args))))
(cond
(all-strings ; Only strings.
(cons (list (regexp-opt all-strings nil))
t))
((rx--every #'rx--charset-p args) ; All charsets.
(rx--translate-union nil args))
(t
(cons (append (car (rx--translate (car args)))
(mapcan (lambda (item)
(cons "\\|" (car (rx--translate item))))
(cdr args)))
nil)))))))
(defun rx--charset-p (form)
"Whether FORM looks like a charset, only consisting of character intervals
@ -840,11 +856,15 @@ Return (REGEXP . PRECEDENCE)."
(cons (list (list 'regexp-quote arg)) 'seq))
(t (error "rx `literal' form with non-string argument")))))
(defun rx--translate-eval (body)
"Translate the `eval' form. Return (REGEXP . PRECEDENCE)."
(defun rx--expand-eval (body)
"Expand `eval' arguments. Return a new rx form."
(unless (and body (null (cdr body)))
(error "rx `eval' form takes exactly one argument"))
(rx--translate (eval (car body))))
(eval (car body)))
(defun rx--translate-eval (body)
"Translate the `eval' form. Return (REGEXP . PRECEDENCE)."
(rx--translate (rx--expand-eval body)))
(defvar rx--regexp-atomic-regexp nil)