Support (rx (and (regexp EXPR) (literal EXPR))) (Bug#36237)

* lisp/emacs-lisp/rx.el (rx-regexp): Allow non-string forms.
(rx-constituents): Add literal constituent, which is like a plain
STRING form, but allows arbitrary lisp expressions.
(rx-literal): New function.
(rx-compile-to-lisp): New variable.
(rx--subforms): New helper function for handling subforms, including
non-constant case.
(rx-group-if, rx-and, rx-or, rx-=, rx->=, rx-repeat, rx-submatch)
(rx-submatch-n, rx-kleene, rx-atomic-p): Use it to handle non-constant
subforms.
(rx): Document new form, wrap non-constant forms with concat call.
* test/lisp/emacs-lisp/rx-tests.el (rx-tests--match): New macro.
(rx-nonstring-expr, rx-nonstring-expr-non-greedy): New tests.
* etc/NEWS: Announce changes.
This commit is contained in:
Noam Postavsky 2019-06-14 08:43:17 -04:00
parent 29babad728
commit b59ffd2290
3 changed files with 200 additions and 87 deletions

View file

@ -1441,12 +1441,18 @@ when given in a string. Previously, '(any "\x80-\xff")' would match
characters U+0080...U+00FF. Now the expression matches raw bytes in
the 128...255 range, as expected.
---
*** The rx 'or' and 'seq' forms no longer require any arguments.
(or) produces a regexp that never matches anything, while (seq)
matches the empty string, each being an identity for the operation.
This also works for their aliases: '|' for 'or'; ':', 'and' and
'sequence' for 'seq'.
---
*** 'regexp' and new 'literal' accept arbitrary lisp as arguments.
In this case, 'rx' will generate code which produces a regexp string
at run time, instead of a constant string.
** Frames
+++

View file

@ -47,57 +47,58 @@
;; Rx translates a sexp notation for regular expressions into the
;; usual string notation. The translation can be done at compile-time
;; by using the `rx' macro. It can be done at run-time by calling
;; function `rx-to-string'. See the documentation of `rx' for a
;; complete description of the sexp notation.
;; by using the `rx' macro. The `regexp' and `literal' forms accept
;; non-constant expressions, in which case `rx' will translate to a
;; `concat' expression. Translation can be done fully at run time by
;; calling function `rx-to-string'. See the documentation of `rx' for
;; a complete description of the sexp notation.
;;
;; Some examples of string regexps and their sexp counterparts:
;;
;; "^[a-z]*"
;; (rx (and line-start (0+ (in "a-z"))))
;; (rx line-start (0+ (in "a-z")))
;;
;; "\n[^ \t]"
;; (rx (and "\n" (not (any " \t"))))
;; (rx ?\n (not (in " \t")))
;;
;; "\\*\\*\\* EOOH \\*\\*\\*\n"
;; (rx "*** EOOH ***\n")
;;
;; "\\<\\(catch\\|finally\\)\\>[^_]"
;; (rx (and word-start (submatch (or "catch" "finally")) word-end
;; (not (any ?_))))
;; (rx word-start (submatch (or "catch" "finally")) word-end
;; (not (in ?_)))
;;
;; "[ \t\n]*:\\([^:]+\\|$\\)"
;; (rx (and (zero-or-more (in " \t\n")) ":"
;; (submatch (or line-end (one-or-more (not (any ?:)))))))
;; "[ \t\n]*:\\($\\|[^:]+\\)"
;; (rx (* (in " \t\n")) ":"
;; (submatch (or line-end (+ (not (in ?:))))))
;;
;; "^content-transfer-encoding:\\(\n?[\t ]\\)*quoted-printable\\(\n?[\t ]\\)*"
;; (rx (and line-start
;; "content-transfer-encoding:"
;; (+ (? ?\n)) (any " \t")
;; "quoted-printable"
;; (+ (? ?\n)) (any " \t"))
;; "^content-transfer-encoding:\\(?:\n?[\t ]\\)*quoted-printable\\(?:\n?[\t ]\\)*"
;; (rx line-start
;; "content-transfer-encoding:"
;; (* (? ?\n) (in " \t"))
;; "quoted-printable"
;; (* (? ?\n) (in " \t")))
;;
;; (concat "^\\(?:" something-else "\\)")
;; (rx (and line-start (eval something-else))), statically or
;; (rx-to-string '(and line-start ,something-else)), dynamically.
;; (rx line-start (regexp something-else))
;;
;; (regexp-opt '(STRING1 STRING2 ...))
;; (rx (or STRING1 STRING2 ...)), or in other words, `or' automatically
;; calls `regexp-opt' as needed.
;;
;; "^;;\\s-*\n\\|^\n"
;; (rx (or (and line-start ";;" (0+ space) ?\n)
;; (and line-start ?\n)))
;; (rx (or (seq line-start ";;" (0+ space) ?\n)
;; (seq line-start ?\n)))
;;
;; "\\$[I]d: [^ ]+ \\([^ ]+\\) "
;; (rx (and "$Id: "
;; (1+ (not (in " ")))
;; " "
;; (submatch (1+ (not (in " "))))
;; " "))
;; (rx "$Id: "
;; (1+ (not (in " ")))
;; " "
;; (submatch (1+ (not (in " "))))
;; " ")
;;
;; "\\\\\\\\\\[\\w+"
;; (rx (and ?\\ ?\\ ?\[ (1+ word)))
;; (rx "\\\\[" (1+ word))
;;
;; etc.
@ -176,6 +177,7 @@
(not-syntax . (rx-not-syntax 1 1)) ; sregex
(category . (rx-category 1 1 rx-check-category))
(eval . (rx-eval 1 1))
(literal . (rx-literal 1 1 stringp))
(regexp . (rx-regexp 1 1 stringp))
(regex . regexp) ; sregex
(digit . "[[:digit:]]")
@ -302,6 +304,10 @@ regular expression strings.")
"Non-nil means produce greedy regular expressions for `zero-or-one',
`zero-or-more', and `one-or-more'. Dynamically bound.")
(defvar rx--compile-to-lisp nil
"Nil means return a regexp as a string.
Non-nil means we may return a lisp form which produces a
string (used for `rx' macro).")
(defun rx-info (op head)
"Return parsing/code generation info for OP.
@ -344,7 +350,7 @@ a standalone symbol."
(> nargs max-args))
(error "rx form `%s' accepts at most %d args"
(car form) max-args))
(when (not (null type-pred))
(when type-pred
(dolist (sub-form (cdr form))
(unless (funcall type-pred sub-form)
(error "rx form `%s' requires args satisfying `%s'"
@ -360,8 +366,9 @@ is non-nil."
;; for concatenation
((eq group ':)
(if (rx-atomic-p
(if (string-match
"\\(?:[?*+]\\??\\|\\\\{[0-9]*,?[0-9]*\\\\}\\)\\'" regexp)
(if (and (stringp regexp)
(string-match
"\\(?:[?*+]\\??\\|\\\\{[0-9]*,?[0-9]*\\\\}\\)\\'" regexp))
(substring regexp 0 (match-beginning 0))
regexp))
(setq group nil)))
@ -370,9 +377,10 @@ is non-nil."
;; do anyway
((eq group t))
((rx-atomic-p regexp t) (setq group nil)))
(if group
(concat "\\(?:" regexp "\\)")
regexp))
(cond ((and group (stringp regexp))
(concat "\\(?:" regexp "\\)"))
(group `("\\(?:" ,@regexp "\\)"))
(t regexp)))
(defvar rx-parent)
@ -384,7 +392,7 @@ is non-nil."
FORM is of the form `(and FORM1 ...)'."
(rx-check form)
(rx-group-if
(mapconcat (lambda (x) (rx-form x ':)) (cdr form) nil)
(rx--subforms (cdr form) ':)
(and (memq rx-parent '(* t)) rx-parent)))
@ -396,7 +404,7 @@ FORM is of the form `(and FORM1 ...)'."
((null (cdr form)) regexp-unmatchable)
((cl-every #'stringp (cdr form))
(regexp-opt (cdr form) nil t))
(t (mapconcat (lambda (x) (rx-form x '|)) (cdr form) "\\|")))
(t (rx--subforms (cdr form) '| "\\|")))
(and (memq rx-parent '(: * t)) rx-parent)))
@ -669,7 +677,10 @@ If SKIP is non-nil, allow that number of items after the head, i.e.
(unless (and (integerp (nth 1 form))
(> (nth 1 form) 0))
(error "rx `=' requires positive integer first arg"))
(format "%s\\{%d\\}" (rx-form (nth 2 form) '*) (nth 1 form)))
(let ((subform (rx-form (nth 2 form) '*)))
(if (stringp subform)
(format "%s\\{%d\\}" subform (nth 1 form))
`(,@subform ,(format "\\{%d\\}" (nth 1 form))))))
(defun rx->= (form)
@ -679,7 +690,10 @@ If SKIP is non-nil, allow that number of items after the head, i.e.
(unless (and (integerp (nth 1 form))
(> (nth 1 form) 0))
(error "rx `>=' requires positive integer first arg"))
(format "%s\\{%d,\\}" (rx-form (nth 2 form) '*) (nth 1 form)))
(let ((subform (rx-form (nth 2 form) '*)))
(if (stringp subform)
(format "%s\\{%d,\\}" subform (nth 1 form))
`(,@subform ,(format "\\{%d,\\}" (nth 1 form))))))
(defun rx-** (form)
@ -700,7 +714,10 @@ FORM is either `(repeat N FORM1)' or `(repeat N M FORMS...)'."
(unless (and (integerp (nth 1 form))
(> (nth 1 form) 0))
(error "rx `repeat' requires positive integer first arg"))
(format "%s\\{%d\\}" (rx-form (nth 2 form) '*) (nth 1 form)))
(let ((subform (rx-form (nth 2 form) '*)))
(if (stringp subform)
(format "%s\\{%d\\}" subform (nth 1 form))
`(,@subform ,(format "\\{%d\\}" (nth 1 form))))))
((or (not (integerp (nth 2 form)))
(< (nth 2 form) 0)
(not (integerp (nth 1 form)))
@ -708,32 +725,28 @@ FORM is either `(repeat N FORM1)' or `(repeat N M FORMS...)'."
(< (nth 2 form) (nth 1 form)))
(error "rx `repeat' range error"))
(t
(format "%s\\{%d,%d\\}" (rx-form (nth 3 form) '*)
(nth 1 form) (nth 2 form)))))
(let ((subform (rx-form (nth 3 form) '*)))
(if (stringp subform)
(format "%s\\{%d,%d\\}" subform (nth 1 form) (nth 2 form))
`(,@subform ,(format "\\{%d,%d\\}" (nth 1 form) (nth 2 form))))))))
(defun rx-submatch (form)
"Parse and produce code from FORM, which is `(submatch ...)'."
(concat "\\("
(if (= 2 (length form))
;; Only one sub-form.
(rx-form (cadr form))
;; Several sub-forms implicitly concatenated.
(mapconcat (lambda (re) (rx-form re ':)) (cdr form) nil))
"\\)"))
(let ((subforms (rx--subforms (cdr form) ':)))
(if (stringp subforms)
(concat "\\(" subforms "\\)")
`("\\(" ,@subforms "\\)"))))
(defun rx-submatch-n (form)
"Parse and produce code from FORM, which is `(submatch-n N ...)'."
(let ((n (nth 1 form)))
(let ((n (nth 1 form))
(subforms (rx--subforms (cddr form) ':)))
(unless (and (integerp n) (> n 0))
(error "rx `submatch-n' argument must be positive"))
(concat "\\(?" (number-to-string n) ":"
(if (= 3 (length form))
;; Only one sub-form.
(rx-form (nth 2 form))
;; Several sub-forms implicitly concatenated.
(mapconcat (lambda (re) (rx-form re ':)) (cddr form) nil))
"\\)")))
(if (stringp subforms)
(concat "\\(?" (number-to-string n) ":" subforms "\\)")
`("\\(?" ,(number-to-string n) ":" ,@subforms "\\)"))))
(defun rx-backref (form)
"Parse and produce code from FORM, which is `(backref N)'."
@ -761,9 +774,12 @@ is non-nil."
(t "?")))
(op (cond ((memq (car form) '(* *? 0+ zero-or-more)) "*")
((memq (car form) '(+ +? 1+ one-or-more)) "+")
(t "?"))))
(t "?")))
(subform (rx-form (cadr form) '*)))
(rx-group-if
(concat (rx-form (cadr form) '*) op suffix)
(if (stringp subform)
(concat subform op suffix)
`(,@subform ,(concat op suffix)))
(and (memq rx-parent '(t *)) rx-parent))))
@ -791,15 +807,18 @@ regexps that are atomic but end in operators, such as
be detected without much effort. A guarantee of no false
negatives would require a theoretic specification of the set
of all atomic regexps."
(let ((l (length r)))
(cond
((<= l 1))
((= l 2) (= (aref r 0) ?\\))
((= l 3) (string-match "\\`\\(?:\\\\[cCsS_]\\|\\[[^^]\\]\\)" r))
((null lax)
(if (and rx--compile-to-lisp
(not (stringp r)))
nil ;; Runtime value, we must assume non-atomic.
(let ((l (length r)))
(cond
((string-match "\\`\\[\\^?]?\\(?:\\[:[a-z]+:]\\|[^]]\\)*]\\'" r))
((string-match "\\`\\\\(\\(?:[^\\]\\|\\\\[^)]\\)*\\\\)\\'" r)))))))
((<= l 1))
((= l 2) (= (aref r 0) ?\\))
((= l 3) (string-match "\\`\\(?:\\\\[cCsS_]\\|\\[[^^]\\]\\)" r))
((null lax)
(cond
((string-match "\\`\\[\\^?]?\\(?:\\[:[a-z]+:]\\|[^]]\\)*]\\'" r))
((string-match "\\`\\\\(\\(?:[^\\]\\|\\\\[^)]\\)*\\\\)\\'" r))))))))
(defun rx-syntax (form)
@ -855,9 +874,23 @@ If FORM is `(minimal-match FORM1)', non-greedy versions of `*',
(defun rx-regexp (form)
"Parse and produce code from FORM, which is `(regexp STRING)'."
(rx-check form)
(rx-group-if (cadr form) rx-parent))
(cond ((stringp form)
(rx-group-if (cadr form) rx-parent))
(rx--compile-to-lisp
;; Always group non-string forms, since we can't be sure they
;; are atomic.
(rx-group-if (cdr form) t))
(t (rx-check form))))
(defun rx-literal (form)
"Parse and produce code from FORM, which is `(literal STRING-EXP)'."
(cond ((stringp form)
;; This is allowed, but makes little sense, you could just
;; use STRING directly.
(rx-group-if (regexp-quote (cadr form)) rx-parent))
(rx--compile-to-lisp
(rx-group-if `((regexp-quote ,(cadr form))) rx-parent))
(t (rx-check form))))
(defun rx-form (form &optional parent)
"Parse and produce code for regular expression FORM.
@ -888,12 +921,38 @@ shy groups around the result and some more in other functions."
(t
(error "rx syntax error at `%s'" form)))))
(defun rx--subforms (subforms &optional parent separator)
"Produce code for regular expressions SUBFORMS.
SUBFORMS is a list of regular expression sexps.
PARENT controls grouping, as in `rx-form'.
Insert SEPARATOR between the code from each of SUBFORMS."
(if (null (cdr subforms))
;; Zero or one forms, no need for grouping.
(and subforms (rx-form (car subforms)))
(let ((listify (lambda (x)
(if (listp x) (copy-sequence x)
(list x)))))
(setq subforms (mapcar (lambda (x) (rx-form x parent)) subforms))
(cond ((or (not rx--compile-to-lisp)
(cl-every #'stringp subforms))
(mapconcat #'identity subforms separator))
(separator
(nconc (funcall listify (car subforms))
(mapcan (lambda (x)
(cons separator (funcall listify x)))
(cdr subforms))))
(t (mapcan listify subforms))))))
;;;###autoload
(defun rx-to-string (form &optional no-group)
"Parse and produce code for regular expression FORM.
FORM is a regular expression in sexp form.
NO-GROUP non-nil means don't put shy groups around the result."
NO-GROUP non-nil means don't put shy groups around the result.
In contrast to the `rx' macro, subforms `literal' and `regexp'
will not accept non-string arguments, i.e., (literal STRING)
becomes just a more verbose version of STRING."
(rx-group-if (rx-form form) (null no-group)))
@ -903,8 +962,12 @@ NO-GROUP non-nil means don't put shy groups around the result."
REGEXPS is a non-empty sequence of forms of the sort listed below.
Note that `rx' is a Lisp macro; when used in a Lisp program being
compiled, the translation is performed by the compiler.
See `rx-to-string' for how to do such a translation at run-time.
compiled, the translation is performed by the compiler. The
`literal' and `regexp' forms accept subforms that will evaluate
to strings, in addition to constant strings. If REGEXPS include
such forms, then the result is an expression which returns a
regexp string, rather than a regexp string directly. See
`rx-to-string' for performing translation completely at run time.
The following are valid subforms of regular expressions in sexp
notation.
@ -1204,18 +1267,29 @@ enclosed in `(and ...)'.
`(backref N)'
matches what was matched previously by submatch N.
`(literal STRING-EXPR)'
matches STRING-EXPR literally, where STRING-EXPR is any lisp
expression that evaluates to a string.
`(regexp REGEXP-EXPR)'
include REGEXP-EXPR in string notation in the result, where
REGEXP-EXPR is any lisp expression that evaluates to a
string containing a valid regexp.
`(eval FORM)'
evaluate FORM and insert result. If result is a string,
`regexp-quote' it.
`(regexp REGEXP)'
include REGEXP in string notation in the result."
(cond ((null regexps)
(error "No regexp"))
((cdr regexps)
(rx-to-string `(and ,@regexps) t))
(t
(rx-to-string (car regexps) t))))
`regexp-quote' it. Note that FORM is evaluated during
macroexpansion."
(let* ((rx--compile-to-lisp t)
(re (cond ((null regexps)
(error "No regexp"))
((cdr regexps)
(rx-to-string `(and ,@regexps) t))
(t
(rx-to-string (car regexps) t)))))
(if (stringp re)
re
`(concat ,@re))))
(pcase-defmacro rx (&rest regexps)
@ -1277,14 +1351,6 @@ string as argument to `match-string'."
for var in vars
collect `(app (match-string ,i) ,var)))))
;; ;; sregex.el replacement
;; ;;;###autoload (provide 'sregex)
;; ;;;###autoload (autoload 'sregex "rx")
;; (defalias 'sregex 'rx-to-string)
;; ;;;###autoload (autoload 'sregexq "rx" nil nil 'macro)
;; (defalias 'sregexq 'rx)
(provide 'rx)
;;; rx.el ends here

View file

@ -115,5 +115,46 @@
;; Test zero-argument `seq'.
(should (equal (rx (seq)) "")))
(defmacro rx-tests--match (regexp string &optional match)
(macroexp-let2 nil strexp string
`(ert-info ((format "Matching %S to %S" ',regexp ,strexp))
(should (string-match ,regexp ,strexp))
,@(when match
`((should (equal (match-string 0 ,strexp) ,match)))))))
(ert-deftest rx-nonstring-expr ()
(let ((bee "b")
(vowel "[aeiou]"))
(rx-tests--match (rx "a" (literal bee) "c") "abc")
(rx-tests--match (rx "a" (regexp bee) "c") "abc")
(rx-tests--match (rx "a" (or (regexp bee) "xy") "c") "abc")
(rx-tests--match (rx "a" (or "xy" (regexp bee)) "c") "abc")
(should-not (string-match (rx (or (regexp bee) "xy")) ""))
(rx-tests--match (rx "a" (= 3 (regexp bee)) "c") "abbbc")
(rx-tests--match (rx "x" (= 3 (regexp vowel)) "z") "xeoez")
(should-not (string-match (rx "x" (= 3 (regexp vowel)) "z") "xe[]z"))
(rx-tests--match (rx "x" (= 3 (literal vowel)) "z")
"x[aeiou][aeiou][aeiou]z")
(rx-tests--match (rx "x" (repeat 1 (regexp vowel)) "z") "xaz")
(rx-tests--match (rx "x" (repeat 1 2 (regexp vowel)) "z") "xaz")
(rx-tests--match (rx "x" (repeat 1 2 (regexp vowel)) "z") "xauz")
(rx-tests--match (rx "x" (>= 1 (regexp vowel)) "z") "xaiiz")
(rx-tests--match (rx "x" (** 1 2 (regexp vowel)) "z") "xaiz")
(rx-tests--match (rx "x" (group (regexp vowel)) "z") "xaz")
(rx-tests--match (rx "x" (group-n 1 (regexp vowel)) "z") "xaz")
(rx-tests--match (rx "x" (? (regexp vowel)) "z") "xz")))
(ert-deftest rx-nonstring-expr-non-greedy ()
"`rx's greediness can't affect runtime regexp parts."
(let ((ad-min "[ad]*?")
(ad-max "[ad]*")
(ad "[ad]"))
(rx-tests--match (rx "c" (regexp ad-min) "a") "cdaaada" "cda")
(rx-tests--match (rx "c" (regexp ad-max) "a") "cdaaada" "cdaaada")
(rx-tests--match (rx "c" (minimal-match (regexp ad-max)) "a") "cdaaada" "cdaaada")
(rx-tests--match (rx "c" (maximal-match (regexp ad-min)) "a") "cdaaada" "cda")
(rx-tests--match (rx "c" (minimal-match (0+ (regexp ad))) "a") "cdaaada" "cda")
(rx-tests--match (rx "c" (maximal-match (0+ (regexp ad))) "a") "cdaaada" "cdaaada")))
(provide 'rx-tests)
;; rx-tests.el ends here.