Use `or' instead of `union' for charset union in rx

Design change suggested by Stefan Monnier.

* doc/lispref/searching.texi (Rx Constructs):
* etc/NEWS: Document.
* lisp/emacs-lisp/rx.el (rx--translate-or): Detect charset arguments.
(rx--charset-p): New.
(rx--translate-not, rx--charset-intervals, rx--translate-union):
Change from `union' to `or'.
(rx--translate-form, rx--builtin-forms, rx): Remove `union'.
* test/lisp/emacs-lisp/rx-tests.el (rx-union, rx-def-in-union)
(rx-intersection): Rename tests and change `union' to `or' and `|'.
This commit is contained in:
Mattias Engdegård 2019-12-12 23:04:00 +01:00
parent d7efe98951
commit f16766a0eb
4 changed files with 65 additions and 48 deletions

View file

@ -1214,20 +1214,19 @@ Corresponding string regexp: @samp{[@dots{}]}
@item @code{(not @var{charspec})}
@cindex @code{not} in rx
Match a character not included in @var{charspec}. @var{charspec} can
be an @code{any}, @code{not}, @code{union}, @code{intersection},
@code{syntax} or @code{category} form, or a character class.@*
be an @code{any}, @code{not}, @code{or}, @code{intersection},
@code{syntax} or @code{category} form, or a character class.
If @var{charspec} is an @code{or} form, its arguments have the same
restrictions as those of @code{intersection}; see below.@*
Corresponding string regexp: @samp{[^@dots{}]}, @samp{\S@var{code}},
@samp{\C@var{code}}
@item @code{(union @var{charset}@dots{})}
@itemx @code{(intersection @var{charset}@dots{})}
@cindex @code{union} in rx
@item @code{(intersection @var{charset}@dots{})}
@cindex @code{intersection} in rx
Match a character that matches the union or intersection,
respectively, of the @var{charset}s. Each @var{charset} can be an
@code{any} form without character classes, or a @code{union},
@code{intersection} or @code{not} form whose arguments are also
@var{charset}s.
Match a character included in all of the @var{charset}s.
Each @var{charset} can be an @code{any} form without character
classes, or an @code{intersection}, @code{or} or @code{not} form whose
arguments are also @var{charset}s.
@item @code{not-newline}, @code{nonl}
@cindex @code{not-newline} in rx
@ -1591,7 +1590,8 @@ when they are used, not when they are defined.
User-defined forms are allowed wherever arbitrary @code{rx}
expressions are expected; for example, in the body of a
@code{zero-or-one} form, but not inside @code{any} or @code{category}
forms. They are also allowed inside @code{not} forms.
forms. They are also allowed inside @code{not} and
@code{intersection} forms.
@end itemize
@defmac rx-define name [arglist] rx-form

View file

@ -2120,9 +2120,9 @@ These macros add new forms to the rx notation.
Both match any single character; 'anychar' is more descriptive.
+++
*** New 'union' and 'intersection' forms for character sets.
These permit composing character-matching expressions from simpler
parts.
*** New 'intersection' form for character sets.
With 'or' and 'not', it can be used to compose character-matching
expressions from simpler parts.
** Frames

View file

@ -273,10 +273,8 @@ Return (REGEXP . PRECEDENCE)."
;; (or (+ digit) "CHARLIE" "CHAN" (+ blank))
;; -> (or (+ digit) (or "CHARLIE" "CHAN") (+ blank))
;;
;; - Fuse patterns into a single character alternative if they fit.
;; regexp-opt will do that if all are strings, but we want to do that for:
;; * symbols that expand to classes: space, alpha, ...
;; * character alternatives: (any ...)
;; - Optimise single-character alternatives better:
;; * classes: space, alpha, ...
;; * (syntax S), for some S (whitespace, word)
;; so that (or "@" "%" digit (any "A-Z" space) (syntax word))
;; -> (any "@" "%" digit "A-Z" space word)
@ -294,6 +292,8 @@ Return (REGEXP . PRECEDENCE)."
((rx--every #'stringp body) ; All strings.
(cons (list (regexp-opt body nil t))
t))
((rx--every #'rx--charset-p body) ; All charsets.
(rx--translate-union nil body))
(t
(cons (append (car (rx--translate (car body)))
(mapcan (lambda (item)
@ -301,6 +301,19 @@ Return (REGEXP . PRECEDENCE)."
(cdr body)))
nil))))
(defun rx--charset-p (form)
  "Return non-nil if FORM looks like a charset.
A charset consists only of character intervals and set operations:
it is either an `any' form (aliases `in' and `char') none of whose
arguments are symbols, or a `not', `or', `|' or `intersection' form
whose arguments are all charsets themselves.
If `rx--expand-def' returns a non-nil expansion for FORM, then FORM
also counts as a charset when that expansion does."
  (or (and (consp form)
           ;; The list given to `memq' is already quoted, so its elements
           ;; must be bare symbols.  Writing 'in inside it would yield the
           ;; list (quote in), which is never `eq' to the symbol `in', and
           ;; the `in' and `char' aliases would then not be recognised.
           (or (and (memq (car form) '(any in char))
                    ;; Symbol arguments are rejected here; only non-symbol
                    ;; arguments are accepted.
                    (rx--every (lambda (x) (not (symbolp x))) (cdr form)))
               (and (memq (car form) '(not or | intersection))
                    (rx--every #'rx--charset-p (cdr form)))))
      ;; Otherwise try to expand FORM as a definition and check the
      ;; expansion instead.
      (and (or (symbolp form) (consp form))
           (let ((expanded (rx--expand-def form)))
             (and expanded
                  (rx--charset-p expanded))))))
(defun rx--string-to-intervals (str)
"Decode STR as intervals: A-Z becomes (?A . ?Z), and the single
character X becomes (?X . ?X). Return the intervals in a list."
@ -477,7 +490,7 @@ If NEGATED, negate the sense."
(not negated) (rx--complement-intervals intervals) nil)))
;; FIXME: Consider turning `not' into a variadic operator, following SRE:
;; (not A B) = (not (union A B)) = (intersection (not A) (not B)), and
;; (not A B) = (not (or A B)) = (intersection (not A) (not B)), and
;; (not) = anychar.
;; Maybe allow singleton characters as arguments.
@ -498,7 +511,7 @@ If NEGATED, negate the sense (thus making it positive)."
(rx--translate-category (not negated) (cdr arg)))
('not
(rx--translate-not (not negated) (cdr arg)))
('union
((or 'or '|)
(rx--translate-union (not negated) (cdr arg)))
('intersection
(rx--translate-intersection (not negated) (cdr arg))))))
@ -558,7 +571,7 @@ If NEGATED, negate the sense (thus making it positive)."
(defun rx--charset-intervals (charset)
"Return a sorted list of non-adjacent disjoint intervals from CHARSET.
CHARSET is any expression allowed in a character set expression:
either `any' (no classes permitted), or `not', `union' or `intersection'
either `any' (no classes permitted), or `not', `or' or `intersection'
forms whose arguments are charsets."
(pcase charset
(`(,(or 'any 'in 'char) . ,body)
@ -569,8 +582,8 @@ forms whose arguments are charsets."
(cadr parsed)))
(car parsed)))
(`(not ,x) (rx--complement-intervals (rx--charset-intervals x)))
(`(union . ,xs) (rx--charset-union xs))
(`(intersection . ,xs) (rx--charset-intersection xs))
(`(,(or 'or '|) . ,body) (rx--charset-union body))
(`(intersection . ,body) (rx--charset-intersection body))
(_ (let ((expanded (rx--expand-def charset)))
(if expanded
(rx--charset-intervals expanded)
@ -589,7 +602,7 @@ forms whose arguments are charsets."
(mapcar #'rx--charset-intervals charsets)))
(defun rx--translate-union (negated body)
"Translate a (union ...) construct. Return (REGEXP . PRECEDENCE).
"Translate an (or ...) construct of charsets. Return (REGEXP . PRECEDENCE).
If NEGATED, negate the sense."
(rx--intervals-to-alt negated (rx--charset-union body)))
@ -976,7 +989,6 @@ can expand to any number of values."
((or 'any 'in 'char) (rx--translate-any nil body))
('not-char (rx--translate-any t body))
('not (rx--translate-not nil body))
('union (rx--translate-union nil body))
('intersection (rx--translate-intersection nil body))
('repeat (rx--translate-repeat body))
@ -1036,7 +1048,7 @@ can expand to any number of values."
(t (error "Unknown rx form `%s'" op)))))))
(defconst rx--builtin-forms
'(seq sequence : and or | any in char not-char not union intersection
'(seq sequence : and or | any in char not-char not intersection
repeat = >= **
zero-or-more 0+ *
one-or-more 1+ +
@ -1149,11 +1161,10 @@ CHAR Match a literal character.
character, a string, a range as string \"A-Z\" or cons
(?A . ?Z), or a character class (see below). Alias: in, char.
(not CHARSPEC) Match one character not matched by CHARSPEC. CHARSPEC
can be (any ...), (union ...), (intersection ...),
can be (any ...), (or ...), (intersection ...),
(syntax ...), (category ...), or a character class.
(union CHARSET...) Union of CHARSETs.
(intersection CHARSET...) Intersection of CHARSETs.
CHARSET is (any...), (not...), (union...) or (intersection...).
CHARSET is (any...), (not...), (or...) or (intersection...).
not-newline Match any character except a newline. Alias: nonl.
anychar Match any character. Alias: anything.
unmatchable Never match anything at all.

View file

@ -274,33 +274,36 @@
(should (equal (rx (not (not ascii)) (not (not (not (any "a-z")))))
"[[:ascii:]][^a-z]")))
(ert-deftest rx-union ()
(should (equal (rx (union))
(ert-deftest rx-charset-or ()
(should (equal (rx (or))
"\\`a\\`"))
(should (equal (rx (union (any "ba")))
(should (equal (rx (or (any "ba")))
"[ab]"))
(should (equal (rx (union (any "a-f") (any "c-k" ?y) (any ?r "x-z")))
(should (equal (rx (| (any "a-f") (any "c-k" ?y) (any ?r "x-z")))
"[a-krx-z]"))
(should (equal (rx (union (not (any "a-m")) (not (any "f-p"))))
(should (equal (rx (or (not (any "a-m")) (not (any "f-p"))))
"[^f-m]"))
(should (equal (rx (union (any "e-m") (not (any "a-z"))))
(should (equal (rx (| (any "e-m") (not (any "a-z"))))
"[^a-dn-z]"))
(should (equal (rx (union (not (any "g-r")) (not (any "t"))))
(should (equal (rx (or (not (any "g-r")) (not (any "t"))))
"[^z-a]"))
(should (equal (rx (not (union (not (any "g-r")) (not (any "t")))))
(should (equal (rx (not (or (not (any "g-r")) (not (any "t")))))
"\\`a\\`"))
(should (equal (rx (union (union (any "a-f") (any "u-z"))
(any "g-r")))
(should (equal (rx (or (| (any "a-f") (any "u-z"))
(any "g-r")))
"[a-ru-z]"))
(should (equal (rx (union (intersection (any "c-z") (any "a-g"))
(not (any "a-k"))))
(should (equal (rx (or (intersection (any "c-z") (any "a-g"))
(not (any "a-k"))))
"[^abh-k]")))
(ert-deftest rx-def-in-union ()
(ert-deftest rx-def-in-charset-or ()
(rx-let ((a (any "badc"))
(b (union a (any "def"))))
(should (equal(rx (union b (any "q")))
"[a-fq]"))))
(b (| a (any "def"))))
(should (equal (rx (or b (any "q")))
"[a-fq]")))
(rx-let ((diff-| (a b) (not (or (not a) b))))
(should (equal (rx (diff-| (any "a-z") (any "gr")))
"[a-fh-qs-z]"))))
(ert-deftest rx-intersection ()
(should (equal (rx (intersection))
@ -321,15 +324,18 @@
(should (equal (rx (intersection (any "d-u")
(intersection (any "e-z") (any "a-m"))))
"[e-m]"))
(should (equal (rx (intersection (union (any "a-f") (any "f-t"))
(should (equal (rx (intersection (or (any "a-f") (any "f-t"))
(any "e-w")))
"[e-t]")))
(ert-deftest rx-def-in-intersection ()
(rx-let ((a (any "a-g"))
(b (intersection a (any "d-j"))))
(should (equal(rx (intersection b (any "e-k")))
"[e-g]"))))
(should (equal (rx (intersection b (any "e-k")))
"[e-g]")))
(rx-let ((diff-& (a b) (intersection a (not b))))
(should (equal (rx (diff-& (any "a-z") (any "m-p")))
"[a-lq-z]"))))
(ert-deftest rx-group ()
(should (equal (rx (group nonl) (submatch "x")