Use or' instead of
union' for charset union in rx
Design change suggested by Stefan Monnier. * doc/lispref/searching.texi (Rx Constructs): * etc/NEWS: Document. * lisp/emacs-lisp/rx.el (rx--translate-or): Detect charset arguments. (rx--charset-p): New. (rx--translate-not, rx--charset-intervals, rx--translate-union): Change from `union' to `or'. (rx--translate-form, rx--builtin-forms, rx): Remove `union'. * test/lisp/emacs-lisp/rx-tests.el (rx-union, rx-def-in-union) (rx-intersection): Rename tests and change `union' to `or' and `|'.
This commit is contained in:
parent
d7efe98951
commit
f16766a0eb
4 changed files with 65 additions and 48 deletions
|
@ -1214,20 +1214,19 @@ Corresponding string regexp: @samp{[@dots{}]}
|
|||
@item @code{(not @var{charspec})}
|
||||
@cindex @code{not} in rx
|
||||
Match a character not included in @var{charspec}. @var{charspec} can
|
||||
be an @code{any}, @code{not}, @code{union}, @code{intersection},
|
||||
@code{syntax} or @code{category} form, or a character class.@*
|
||||
be an @code{any}, @code{not}, @code{or}, @code{intersection},
|
||||
@code{syntax} or @code{category} form, or a character class.
|
||||
If @var{charspec} is an @code{or} form, its arguments have the same
|
||||
restrictions as those of @code{intersection}; see below.@*
|
||||
Corresponding string regexp: @samp{[^@dots{}]}, @samp{\S@var{code}},
|
||||
@samp{\C@var{code}}
|
||||
|
||||
@item @code{(union @var{charset}@dots{})}
|
||||
@itemx @code{(intersection @var{charset}@dots{})}
|
||||
@cindex @code{union} in rx
|
||||
@item @code{(intersection @var{charset}@dots{})}
|
||||
@cindex @code{intersection} in rx
|
||||
Match a character that matches the union or intersection,
|
||||
respectively, of the @var{charset}s. Each @var{charset} can be an
|
||||
@code{any} form without character classes, or a @code{union},
|
||||
@code{intersection} or @code{not} form whose arguments are also
|
||||
@var{charset}s.
|
||||
Match a character included in all of the @var{charset}s.
|
||||
Each @var{charset} can be an @code{any} form without character
|
||||
classes, or an @code{intersection}, @code{or} or @code{not} form whose
|
||||
arguments are also @var{charset}s.
|
||||
|
||||
@item @code{not-newline}, @code{nonl}
|
||||
@cindex @code{not-newline} in rx
|
||||
|
@ -1591,7 +1590,8 @@ when they are used, not when they are defined.
|
|||
User-defined forms are allowed wherever arbitrary @code{rx}
|
||||
expressions are expected; for example, in the body of a
|
||||
@code{zero-or-one} form, but not inside @code{any} or @code{category}
|
||||
forms. They are also allowed inside @code{not} forms.
|
||||
forms. They are also allowed inside @code{not} and
|
||||
@code{intersection} forms.
|
||||
@end itemize
|
||||
|
||||
@defmac rx-define name [arglist] rx-form
|
||||
|
|
6
etc/NEWS
6
etc/NEWS
|
@ -2120,9 +2120,9 @@ These macros add new forms to the rx notation.
|
|||
Both match any single character; 'anychar' is more descriptive.
|
||||
|
||||
+++
|
||||
*** New 'union' and 'intersection' forms for character sets.
|
||||
These permit composing character-matching expressions from simpler
|
||||
parts.
|
||||
*** New 'intersection' form for character sets.
|
||||
With 'or' and 'not', it can be used to compose character-matching
|
||||
expressions from simpler parts.
|
||||
|
||||
** Frames
|
||||
|
||||
|
|
|
@ -273,10 +273,8 @@ Return (REGEXP . PRECEDENCE)."
|
|||
;; (or (+ digit) "CHARLIE" "CHAN" (+ blank))
|
||||
;; -> (or (+ digit) (or "CHARLIE" "CHAN") (+ blank))
|
||||
;;
|
||||
;; - Fuse patterns into a single character alternative if they fit.
|
||||
;; regexp-opt will do that if all are strings, but we want to do that for:
|
||||
;; * symbols that expand to classes: space, alpha, ...
|
||||
;; * character alternatives: (any ...)
|
||||
;; - Optimise single-character alternatives better:
|
||||
;; * classes: space, alpha, ...
|
||||
;; * (syntax S), for some S (whitespace, word)
|
||||
;; so that (or "@" "%" digit (any "A-Z" space) (syntax word))
|
||||
;; -> (any "@" "%" digit "A-Z" space word)
|
||||
|
@ -294,6 +292,8 @@ Return (REGEXP . PRECEDENCE)."
|
|||
((rx--every #'stringp body) ; All strings.
|
||||
(cons (list (regexp-opt body nil t))
|
||||
t))
|
||||
((rx--every #'rx--charset-p body) ; All charsets.
|
||||
(rx--translate-union nil body))
|
||||
(t
|
||||
(cons (append (car (rx--translate (car body)))
|
||||
(mapcan (lambda (item)
|
||||
|
@ -301,6 +301,19 @@ Return (REGEXP . PRECEDENCE)."
|
|||
(cdr body)))
|
||||
nil))))
|
||||
|
||||
(defun rx--charset-p (form)
|
||||
"Whether FORM looks like a charset, only consisting of character intervals
|
||||
and set operations."
|
||||
(or (and (consp form)
|
||||
(or (and (memq (car form) '(any 'in 'char))
|
||||
(rx--every (lambda (x) (not (symbolp x))) (cdr form)))
|
||||
(and (memq (car form) '(not or | intersection))
|
||||
(rx--every #'rx--charset-p (cdr form)))))
|
||||
(and (or (symbolp form) (consp form))
|
||||
(let ((expanded (rx--expand-def form)))
|
||||
(and expanded
|
||||
(rx--charset-p expanded))))))
|
||||
|
||||
(defun rx--string-to-intervals (str)
|
||||
"Decode STR as intervals: A-Z becomes (?A . ?Z), and the single
|
||||
character X becomes (?X . ?X). Return the intervals in a list."
|
||||
|
@ -477,7 +490,7 @@ If NEGATED, negate the sense."
|
|||
(not negated) (rx--complement-intervals intervals) nil)))
|
||||
|
||||
;; FIXME: Consider turning `not' into a variadic operator, following SRE:
|
||||
;; (not A B) = (not (union A B)) = (intersection (not A) (not B)), and
|
||||
;; (not A B) = (not (or A B)) = (intersection (not A) (not B)), and
|
||||
;; (not) = anychar.
|
||||
;; Maybe allow singleton characters as arguments.
|
||||
|
||||
|
@ -498,7 +511,7 @@ If NEGATED, negate the sense (thus making it positive)."
|
|||
(rx--translate-category (not negated) (cdr arg)))
|
||||
('not
|
||||
(rx--translate-not (not negated) (cdr arg)))
|
||||
('union
|
||||
((or 'or '|)
|
||||
(rx--translate-union (not negated) (cdr arg)))
|
||||
('intersection
|
||||
(rx--translate-intersection (not negated) (cdr arg))))))
|
||||
|
@ -558,7 +571,7 @@ If NEGATED, negate the sense (thus making it positive)."
|
|||
(defun rx--charset-intervals (charset)
|
||||
"Return a sorted list of non-adjacent disjoint intervals from CHARSET.
|
||||
CHARSET is any expression allowed in a character set expression:
|
||||
either `any' (no classes permitted), or `not', `union' or `intersection'
|
||||
either `any' (no classes permitted), or `not', `or' or `intersection'
|
||||
forms whose arguments are charsets."
|
||||
(pcase charset
|
||||
(`(,(or 'any 'in 'char) . ,body)
|
||||
|
@ -569,8 +582,8 @@ forms whose arguments are charsets."
|
|||
(cadr parsed)))
|
||||
(car parsed)))
|
||||
(`(not ,x) (rx--complement-intervals (rx--charset-intervals x)))
|
||||
(`(union . ,xs) (rx--charset-union xs))
|
||||
(`(intersection . ,xs) (rx--charset-intersection xs))
|
||||
(`(,(or 'or '|) . ,body) (rx--charset-union body))
|
||||
(`(intersection . ,body) (rx--charset-intersection body))
|
||||
(_ (let ((expanded (rx--expand-def charset)))
|
||||
(if expanded
|
||||
(rx--charset-intervals expanded)
|
||||
|
@ -589,7 +602,7 @@ forms whose arguments are charsets."
|
|||
(mapcar #'rx--charset-intervals charsets)))
|
||||
|
||||
(defun rx--translate-union (negated body)
|
||||
"Translate a (union ...) construct. Return (REGEXP . PRECEDENCE).
|
||||
"Translate an (or ...) construct of charsets. Return (REGEXP . PRECEDENCE).
|
||||
If NEGATED, negate the sense."
|
||||
(rx--intervals-to-alt negated (rx--charset-union body)))
|
||||
|
||||
|
@ -976,7 +989,6 @@ can expand to any number of values."
|
|||
((or 'any 'in 'char) (rx--translate-any nil body))
|
||||
('not-char (rx--translate-any t body))
|
||||
('not (rx--translate-not nil body))
|
||||
('union (rx--translate-union nil body))
|
||||
('intersection (rx--translate-intersection nil body))
|
||||
|
||||
('repeat (rx--translate-repeat body))
|
||||
|
@ -1036,7 +1048,7 @@ can expand to any number of values."
|
|||
(t (error "Unknown rx form `%s'" op)))))))
|
||||
|
||||
(defconst rx--builtin-forms
|
||||
'(seq sequence : and or | any in char not-char not union intersection
|
||||
'(seq sequence : and or | any in char not-char not intersection
|
||||
repeat = >= **
|
||||
zero-or-more 0+ *
|
||||
one-or-more 1+ +
|
||||
|
@ -1149,11 +1161,10 @@ CHAR Match a literal character.
|
|||
character, a string, a range as string \"A-Z\" or cons
|
||||
(?A . ?Z), or a character class (see below). Alias: in, char.
|
||||
(not CHARSPEC) Match one character not matched by CHARSPEC. CHARSPEC
|
||||
can be (any ...), (union ...), (intersection ...),
|
||||
can be (any ...), (or ...), (intersection ...),
|
||||
(syntax ...), (category ...), or a character class.
|
||||
(union CHARSET...) Union of CHARSETs.
|
||||
(intersection CHARSET...) Intersection of CHARSETs.
|
||||
CHARSET is (any...), (not...), (union...) or (intersection...).
|
||||
CHARSET is (any...), (not...), (or...) or (intersection...).
|
||||
not-newline Match any character except a newline. Alias: nonl.
|
||||
anychar Match any character. Alias: anything.
|
||||
unmatchable Never match anything at all.
|
||||
|
|
|
@ -274,33 +274,36 @@
|
|||
(should (equal (rx (not (not ascii)) (not (not (not (any "a-z")))))
|
||||
"[[:ascii:]][^a-z]")))
|
||||
|
||||
(ert-deftest rx-union ()
|
||||
(should (equal (rx (union))
|
||||
(ert-deftest rx-charset-or ()
|
||||
(should (equal (rx (or))
|
||||
"\\`a\\`"))
|
||||
(should (equal (rx (union (any "ba")))
|
||||
(should (equal (rx (or (any "ba")))
|
||||
"[ab]"))
|
||||
(should (equal (rx (union (any "a-f") (any "c-k" ?y) (any ?r "x-z")))
|
||||
(should (equal (rx (| (any "a-f") (any "c-k" ?y) (any ?r "x-z")))
|
||||
"[a-krx-z]"))
|
||||
(should (equal (rx (union (not (any "a-m")) (not (any "f-p"))))
|
||||
(should (equal (rx (or (not (any "a-m")) (not (any "f-p"))))
|
||||
"[^f-m]"))
|
||||
(should (equal (rx (union (any "e-m") (not (any "a-z"))))
|
||||
(should (equal (rx (| (any "e-m") (not (any "a-z"))))
|
||||
"[^a-dn-z]"))
|
||||
(should (equal (rx (union (not (any "g-r")) (not (any "t"))))
|
||||
(should (equal (rx (or (not (any "g-r")) (not (any "t"))))
|
||||
"[^z-a]"))
|
||||
(should (equal (rx (not (union (not (any "g-r")) (not (any "t")))))
|
||||
(should (equal (rx (not (or (not (any "g-r")) (not (any "t")))))
|
||||
"\\`a\\`"))
|
||||
(should (equal (rx (union (union (any "a-f") (any "u-z"))
|
||||
(any "g-r")))
|
||||
(should (equal (rx (or (| (any "a-f") (any "u-z"))
|
||||
(any "g-r")))
|
||||
"[a-ru-z]"))
|
||||
(should (equal (rx (union (intersection (any "c-z") (any "a-g"))
|
||||
(not (any "a-k"))))
|
||||
(should (equal (rx (or (intersection (any "c-z") (any "a-g"))
|
||||
(not (any "a-k"))))
|
||||
"[^abh-k]")))
|
||||
|
||||
(ert-deftest rx-def-in-union ()
|
||||
(ert-deftest rx-def-in-charset-or ()
|
||||
(rx-let ((a (any "badc"))
|
||||
(b (union a (any "def"))))
|
||||
(should (equal(rx (union b (any "q")))
|
||||
"[a-fq]"))))
|
||||
(b (| a (any "def"))))
|
||||
(should (equal (rx (or b (any "q")))
|
||||
"[a-fq]")))
|
||||
(rx-let ((diff-| (a b) (not (or (not a) b))))
|
||||
(should (equal (rx (diff-| (any "a-z") (any "gr")))
|
||||
"[a-fh-qs-z]"))))
|
||||
|
||||
(ert-deftest rx-intersection ()
|
||||
(should (equal (rx (intersection))
|
||||
|
@ -321,15 +324,18 @@
|
|||
(should (equal (rx (intersection (any "d-u")
|
||||
(intersection (any "e-z") (any "a-m"))))
|
||||
"[e-m]"))
|
||||
(should (equal (rx (intersection (union (any "a-f") (any "f-t"))
|
||||
(should (equal (rx (intersection (or (any "a-f") (any "f-t"))
|
||||
(any "e-w")))
|
||||
"[e-t]")))
|
||||
|
||||
(ert-deftest rx-def-in-intersection ()
|
||||
(rx-let ((a (any "a-g"))
|
||||
(b (intersection a (any "d-j"))))
|
||||
(should (equal(rx (intersection b (any "e-k")))
|
||||
"[e-g]"))))
|
||||
(should (equal (rx (intersection b (any "e-k")))
|
||||
"[e-g]")))
|
||||
(rx-let ((diff-& (a b) (intersection a (not b))))
|
||||
(should (equal (rx (diff-& (any "a-z") (any "m-p")))
|
||||
"[a-lq-z]"))))
|
||||
|
||||
(ert-deftest rx-group ()
|
||||
(should (equal (rx (group nonl) (submatch "x")
|
||||
|
|
Loading…
Add table
Reference in a new issue